Skip to content

Commit

Permalink
A0-4591: Added major sync metric back (although by different name) (#…
Browse files Browse the repository at this point in the history
…1919)

# Description

This PR brings back major syncing metric that was unwillingly removed in
14 release. It copies some code from `polkadot-sdk` (metric definition),
copy is required since we want different name and `MajorSyncingGauge`
constructor is private in `polkadot-sdk`. Other than that implementation
approach is taken from the `polkadot-sdk` - the `AtomicBool` is passed
from its creation down to when it's used in the metric gauge.

## Type of change

Please delete options that are not relevant.

- Bug fix (non-breaking change which fixes an issue)
- New feature (non-breaking change which adds functionality)

# Testing

I run locally `./scripts/run_nodes.sh -v 4`, then immediately run `watch
-n -1 ' curl localhost:9616/metrics | grep major_sync' is one console
and in the other
```
[12:22] marol-Latitude-5521:aleph-node (A0-4591 *%) | tail -f run-nodes-local/node-1.log | grep "major sync"
2025-01-24 12:22:36.317  INFO tokio-runtime-worker aleph-block-sync: Switched to major sync state.
2025-01-24 12:22:46.320  INFO tokio-runtime-worker aleph-block-sync: No longer in major sync state.
```
and I saw correlation when the log appeared, the metric switched its
state accordingly.
  • Loading branch information
Marcin-Radecki authored Jan 27, 2025
1 parent d85f16e commit d1c8a72
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 35 deletions.
9 changes: 0 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions bin/node/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ pub fn new_authority(

let backoff_authoring_blocks = Some(LimitNonfinalized(aleph_config.max_nonfinalized_blocks()));
let prometheus_registry = config.prometheus_registry().cloned();
let (sync_oracle, major_sync) = SyncOracle::new();
let sync_oracle = SyncOracle::new();
let proposer_factory = get_proposer_factory(&service_components, &config);
let slot_duration = sc_consensus_aura::slot_duration(&*service_components.client)?;
let (block_import, block_rx) = RedirectingBlockImport::new(service_components.client.clone());
Expand Down Expand Up @@ -311,7 +311,7 @@ pub fn new_authority(
network_config,
config.protocol_id(),
service_components.client.clone(),
major_sync,
sync_oracle.underlying_atomic(),
service_components.transaction_pool.clone(),
&service_components.task_manager.spawn_handle(),
config
Expand Down
9 changes: 0 additions & 9 deletions finality-aleph/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,13 @@ pallet-aleph-runtime-api = { workspace = true, features = ["std"] }

async-trait = { workspace = true }
array-bytes = { workspace = true }
bytes = { workspace = true }
derive_more = { workspace = true }
env_logger = { workspace = true }
futures = { workspace = true }
futures-timer = { workspace = true }
hash-db = { workspace = true }
hex = { workspace = true }
ip_network = { workspace = true }
log = { workspace = true }
lru = { workspace = true }
parity-scale-codec = { workspace = true, features = ["derive"] }
scale-info = { workspace = true, features = ["derive"] }
parking_lot = { workspace = true }
rand = { workspace = true }
serde = { workspace = true }
Expand All @@ -62,7 +57,6 @@ sc-network-light = { workspace = true }
sc-network-transactions = { workspace = true }
sc-rpc = { workspace = true }
sc-service = { workspace = true }
sc-telemetry = { workspace = true }
sc-transaction-pool = { workspace = true }
sc-transaction-pool-api = { workspace = true }
sc-utils = { workspace = true }
Expand All @@ -74,12 +68,9 @@ sp-consensus = { workspace = true }
sp-consensus-aura = { workspace = true }
sp-consensus-slots = { workspace = true }
sp-core = { workspace = true }
sp-io = { workspace = true }
sp-keystore = { workspace = true }
sp-runtime = { workspace = true }
sp-state-machine = { workspace = true }
sp-timestamp = { workspace = true }
sp-trie = { workspace = true }

[dev-dependencies]
substrate-test-runtime-client = { workspace = true }
Expand Down
2 changes: 1 addition & 1 deletion finality-aleph/src/party/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ mod tests {
let readonly_session_authorities = shared_map.read_only();

let chain_state = Arc::new(MockChainState::new());
let (sync_oracle, _) = SyncOracle::new();
let sync_oracle = SyncOracle::new();
let session_manager = Arc::new(MockNodeSessionManager::new());
let session_info = SessionBoundaryInfo::new(session_period);

Expand Down
4 changes: 2 additions & 2 deletions finality-aleph/src/sync/handler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@ mod tests {
let handler = Handler::new(
database_io,
verifier,
SyncOracle::new().0,
SyncOracle::new(),
SESSION_BOUNDARY_INFO,
)
.expect("mock backend works");
Expand Down Expand Up @@ -1718,7 +1718,7 @@ mod tests {
let mut handler = Handler::new(
database_io,
verifier,
SyncOracle::new().0,
SyncOracle::new(),
SessionBoundaryInfo::new(SessionPeriod(20)),
)
.expect("mock backend works");
Expand Down
48 changes: 45 additions & 3 deletions finality-aleph/src/sync/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
use std::collections::HashMap;
use std::{
collections::HashMap,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
};

use substrate_prometheus_endpoint::{register, Counter, PrometheusError, Registry, U64};
use substrate_prometheus_endpoint::{
register, Counter, MetricSource, Opts, PrometheusError, Registry, SourcedGauge, U64,
};

#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum Event {
Expand Down Expand Up @@ -82,8 +90,39 @@ pub enum Metrics {
Noop,
}

#[derive(Clone)]
pub struct MajorSyncingGauge(Arc<AtomicBool>);

impl MajorSyncingGauge {
fn register(registry: &Registry, value: Arc<AtomicBool>) -> Result<(), PrometheusError> {
register(
SourcedGauge::new(
&Opts::new(
"aleph_sync_is_major_syncing",
"Whether the node is performing a major sync or not.",
),
MajorSyncingGauge(value),
)?,
registry,
)?;

Ok(())
}
}

impl MetricSource for MajorSyncingGauge {
type N = u64;

fn collect(&self, mut set: impl FnMut(&[&str], Self::N)) {
set(&[], self.0.load(Ordering::Relaxed) as u64);
}
}

impl Metrics {
pub fn new(registry: Option<Registry>) -> Result<Self, PrometheusError> {
pub fn new(
is_major_syncing: Arc<AtomicBool>,
registry: Option<Registry>,
) -> Result<Self, PrometheusError> {
let registry = match registry {
Some(registry) => registry,
None => return Ok(Metrics::Noop),
Expand Down Expand Up @@ -114,6 +153,9 @@ impl Metrics {
)?,
);
}

MajorSyncingGauge::register(&registry, is_major_syncing)?;

Ok(Metrics::Prometheus {
event_calls,
event_errors,
Expand Down
3 changes: 2 additions & 1 deletion finality-aleph/src/sync/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,13 @@ where
database_io,
} = io;
let network = VersionWrapper::new(network);
let is_major_syncing = sync_oracle.underlying_atomic();
let handler = Handler::new(database_io, verifier, sync_oracle, session_info)?;
let tasks = TaskQueue::new();
let broadcast_ticker = Ticker::new(TICK_PERIOD, BROADCAST_COOLDOWN);
let chain_extension_ticker = Ticker::new(TICK_PERIOD, CHAIN_EXTENSION_COOLDOWN);
let (block_requests_for_sync, block_requests_from_user) = mpsc::unbounded();
let metrics = Metrics::new(metrics_registry).unwrap_or_else(|e| {
let metrics = Metrics::new(is_major_syncing, metrics_registry).unwrap_or_else(|e| {
warn!(target: LOG_TARGET, "Failed to create metrics: {}.", e);
Metrics::noop()
});
Expand Down
16 changes: 8 additions & 8 deletions finality-aleph/src/sync_oracle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,24 @@ const OFFLINE_THRESHOLD: Duration = Duration::from_secs(6);
const FAR_BEHIND_THRESHOLD: u32 = 15;
const MAJOR_SYNC_THRESHOLD: Duration = Duration::from_secs(10);

pub type MajorSyncIndicator = Arc<AtomicBool>;

/// A sync oracle implementation tracking how recently the node was far behind the highest known justification.
/// It defines being in major sync as being more than 15 blocks behind the highest known justification less than 10 seconds ago.
/// It defines being offline as not getting any update for at least 6 seconds (or never at all).
#[derive(Clone)]
pub struct SyncOracle {
last_far_behind: Arc<Mutex<Instant>>,
last_update: Arc<Mutex<Instant>>,
// TODO: remove when SyncingService is no longer needed
is_major_syncing: Arc<AtomicBool>,
}

impl SyncOracle {
pub fn new() -> (Self, MajorSyncIndicator) {
pub fn new() -> Self {
let is_major_syncing = Arc::new(AtomicBool::new(true));
let oracle = SyncOracle {
SyncOracle {
last_update: Arc::new(Mutex::new(Instant::now() - OFFLINE_THRESHOLD)),
last_far_behind: Arc::new(Mutex::new(Instant::now())),
is_major_syncing: is_major_syncing.clone(),
};
(oracle, is_major_syncing)
}
}

pub fn update_behind(&self, behind: u32) {
Expand All @@ -53,11 +49,15 @@ impl SyncOracle {
.store(is_major_syncing, Ordering::Relaxed);
is_major_syncing
}

pub fn underlying_atomic(&self) -> Arc<AtomicBool> {
self.is_major_syncing.clone()
}
}

impl Default for SyncOracle {
fn default() -> Self {
SyncOracle::new().0
SyncOracle::new()
}
}

Expand Down

0 comments on commit d1c8a72

Please sign in to comment.