From d1c8a72f8908799dec26080fa5e2f1033ac72654 Mon Sep 17 00:00:00 2001 From: Marcin Date: Mon, 27 Jan 2025 08:59:59 +0100 Subject: [PATCH] A0-4591: Added major sync metric back (although by different name) (#1919) # Description This PR brings back major syncing metric that was unwillingly removed in 14 release. It copies some code from `polkadot-sdk` (metric definition), copy is required since we want different name and `MajorSyncingGauge` constructor is private in `polkadot-sdk`. Other than that implementation approach is taken from the `polkadot-sdk` - the `AtomicBool` is passed from its creation down to when it's used in the metric gauge. ## Type of change Please delete options that are not relevant. - Bug fix (non-breaking change which fixes an issue) - New feature (non-breaking change which adds functionality) # Testing I run locally `./scripts/run_nodes.sh -v 4`, then immediately run `watch -n -1 ' curl localhost:9616/metrics | grep major_sync' is one console and in the other ``` [12:22] marol-Latitude-5521:aleph-node (A0-4591 *%) | tail -f run-nodes-local/node-1.log | grep "major sync" 2025-01-24 12:22:36.317 INFO tokio-runtime-worker aleph-block-sync: Switched to major sync state. 2025-01-24 12:22:46.320 INFO tokio-runtime-worker aleph-block-sync: No longer in major sync state. ``` and I saw correlation when the log appeared, the metric switched its state accordingly. --- Cargo.lock | 9 ----- bin/node/src/service.rs | 4 +-- finality-aleph/Cargo.toml | 9 ----- finality-aleph/src/party/mod.rs | 2 +- finality-aleph/src/sync/handler/mod.rs | 4 +-- finality-aleph/src/sync/metrics.rs | 48 ++++++++++++++++++++++++-- finality-aleph/src/sync/service.rs | 3 +- finality-aleph/src/sync_oracle.rs | 16 ++++----- 8 files changed, 60 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b0ddc63..1fad450f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2288,16 +2288,12 @@ dependencies = [ "aleph-bft-rmc 0.14.0", "array-bytes 6.2.3", "async-trait", - "bytes", "derive_more 1.0.0", - "env_logger", "fake-runtime-api", "frame-support", "futures", "futures-timer", - "hash-db", "hex", - "ip_network", "libp2p", "log", "lru", @@ -2321,11 +2317,9 @@ dependencies = [ "sc-network-transactions", "sc-rpc", "sc-service", - "sc-telemetry", "sc-transaction-pool", "sc-transaction-pool-api", "sc-utils", - "scale-info", "serde", "sp-api", "sp-application-crypto", @@ -2334,12 +2328,9 @@ dependencies = [ "sp-consensus-aura", "sp-consensus-slots", "sp-core", - "sp-io", "sp-keystore", "sp-runtime", - "sp-state-machine", "sp-timestamp", - "sp-trie", "static_assertions", "substrate-prometheus-endpoint", "substrate-test-client", diff --git a/bin/node/src/service.rs b/bin/node/src/service.rs index d4715bdf..66844f89 100644 --- a/bin/node/src/service.rs +++ b/bin/node/src/service.rs @@ -258,7 +258,7 @@ pub fn new_authority( let backoff_authoring_blocks = Some(LimitNonfinalized(aleph_config.max_nonfinalized_blocks())); let prometheus_registry = config.prometheus_registry().cloned(); - let (sync_oracle, major_sync) = SyncOracle::new(); + let sync_oracle = SyncOracle::new(); let proposer_factory = get_proposer_factory(&service_components, &config); let slot_duration = sc_consensus_aura::slot_duration(&*service_components.client)?; let (block_import, block_rx) = RedirectingBlockImport::new(service_components.client.clone()); @@ -311,7 +311,7 @@ pub fn new_authority( network_config, config.protocol_id(), service_components.client.clone(), - major_sync, + sync_oracle.underlying_atomic(), service_components.transaction_pool.clone(), &service_components.task_manager.spawn_handle(), config diff --git a/finality-aleph/Cargo.toml b/finality-aleph/Cargo.toml index 54a99898..57f8bd6a 100644 --- a/finality-aleph/Cargo.toml +++ b/finality-aleph/Cargo.toml @@ -27,18 +27,13 @@ pallet-aleph-runtime-api = { workspace = true, features = ["std"] } async-trait = { workspace = true } array-bytes = { workspace = true } -bytes = { workspace = true } derive_more = { workspace = true } -env_logger = { workspace = true } futures = { workspace = true } futures-timer = { workspace = true } -hash-db = { workspace = true } hex = { workspace = true } -ip_network = { workspace = true } log = { workspace = true } lru = { workspace = true } parity-scale-codec = { workspace = true, features = ["derive"] } -scale-info = { workspace = true, features = ["derive"] } parking_lot = { workspace = true } rand = { workspace = true } serde = { workspace = true } @@ -62,7 +57,6 @@ sc-network-light = { workspace = true } sc-network-transactions = { workspace = true } sc-rpc = { workspace = true } sc-service = { workspace = true } -sc-telemetry = { workspace = true } sc-transaction-pool = { workspace = true } sc-transaction-pool-api = { workspace = true } sc-utils = { workspace = true } @@ -74,12 +68,9 @@ sp-consensus = { workspace = true } sp-consensus-aura = { workspace = true } sp-consensus-slots = { workspace = true } sp-core = { workspace = true } -sp-io = { workspace = true } sp-keystore = { workspace = true } sp-runtime = { workspace = true } -sp-state-machine = { workspace = true } sp-timestamp = { workspace = true } -sp-trie = { workspace = true } [dev-dependencies] substrate-test-runtime-client = { workspace = true } diff --git a/finality-aleph/src/party/mod.rs b/finality-aleph/src/party/mod.rs index 9652e815..1dd62132 100644 --- a/finality-aleph/src/party/mod.rs +++ b/finality-aleph/src/party/mod.rs @@ -501,7 +501,7 @@ mod tests { let readonly_session_authorities = shared_map.read_only(); let chain_state = Arc::new(MockChainState::new()); - let (sync_oracle, _) = SyncOracle::new(); + let sync_oracle = SyncOracle::new(); let session_manager = Arc::new(MockNodeSessionManager::new()); let session_info = SessionBoundaryInfo::new(session_period); diff --git a/finality-aleph/src/sync/handler/mod.rs b/finality-aleph/src/sync/handler/mod.rs index a5dda48b..85241b4d 100644 --- a/finality-aleph/src/sync/handler/mod.rs +++ b/finality-aleph/src/sync/handler/mod.rs @@ -765,7 +765,7 @@ mod tests { let handler = Handler::new( database_io, verifier, - SyncOracle::new().0, + SyncOracle::new(), SESSION_BOUNDARY_INFO, ) .expect("mock backend works"); @@ -1718,7 +1718,7 @@ mod tests { let mut handler = Handler::new( database_io, verifier, - SyncOracle::new().0, + SyncOracle::new(), SessionBoundaryInfo::new(SessionPeriod(20)), ) .expect("mock backend works"); diff --git a/finality-aleph/src/sync/metrics.rs b/finality-aleph/src/sync/metrics.rs index 6bbd37fb..77ae1800 100644 --- a/finality-aleph/src/sync/metrics.rs +++ b/finality-aleph/src/sync/metrics.rs @@ -1,6 +1,14 @@ -use std::collections::HashMap; +use std::{ + collections::HashMap, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; -use substrate_prometheus_endpoint::{register, Counter, PrometheusError, Registry, U64}; +use substrate_prometheus_endpoint::{ + register, Counter, MetricSource, Opts, PrometheusError, Registry, SourcedGauge, U64, +}; #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)] pub enum Event { @@ -82,8 +90,39 @@ pub enum Metrics { Noop, } +#[derive(Clone)] +pub struct MajorSyncingGauge(Arc); + +impl MajorSyncingGauge { + fn register(registry: &Registry, value: Arc) -> Result<(), PrometheusError> { + register( + SourcedGauge::new( + &Opts::new( + "aleph_sync_is_major_syncing", + "Whether the node is performing a major sync or not.", + ), + MajorSyncingGauge(value), + )?, + registry, + )?; + + Ok(()) + } +} + +impl MetricSource for MajorSyncingGauge { + type N = u64; + + fn collect(&self, mut set: impl FnMut(&[&str], Self::N)) { + set(&[], self.0.load(Ordering::Relaxed) as u64); + } +} + impl Metrics { - pub fn new(registry: Option) -> Result { + pub fn new( + is_major_syncing: Arc, + registry: Option, + ) -> Result { let registry = match registry { Some(registry) => registry, None => return Ok(Metrics::Noop), @@ -114,6 +153,9 @@ impl Metrics { )?, ); } + + MajorSyncingGauge::register(®istry, is_major_syncing)?; + Ok(Metrics::Prometheus { event_calls, event_errors, diff --git a/finality-aleph/src/sync/service.rs b/finality-aleph/src/sync/service.rs index ecfc24a6..0ccd0507 100644 --- a/finality-aleph/src/sync/service.rs +++ b/finality-aleph/src/sync/service.rs @@ -194,12 +194,13 @@ where database_io, } = io; let network = VersionWrapper::new(network); + let is_major_syncing = sync_oracle.underlying_atomic(); let handler = Handler::new(database_io, verifier, sync_oracle, session_info)?; let tasks = TaskQueue::new(); let broadcast_ticker = Ticker::new(TICK_PERIOD, BROADCAST_COOLDOWN); let chain_extension_ticker = Ticker::new(TICK_PERIOD, CHAIN_EXTENSION_COOLDOWN); let (block_requests_for_sync, block_requests_from_user) = mpsc::unbounded(); - let metrics = Metrics::new(metrics_registry).unwrap_or_else(|e| { + let metrics = Metrics::new(is_major_syncing, metrics_registry).unwrap_or_else(|e| { warn!(target: LOG_TARGET, "Failed to create metrics: {}.", e); Metrics::noop() }); diff --git a/finality-aleph/src/sync_oracle.rs b/finality-aleph/src/sync_oracle.rs index b9bf882a..4394bb2d 100644 --- a/finality-aleph/src/sync_oracle.rs +++ b/finality-aleph/src/sync_oracle.rs @@ -13,8 +13,6 @@ const OFFLINE_THRESHOLD: Duration = Duration::from_secs(6); const FAR_BEHIND_THRESHOLD: u32 = 15; const MAJOR_SYNC_THRESHOLD: Duration = Duration::from_secs(10); -pub type MajorSyncIndicator = Arc; - /// A sync oracle implementation tracking how recently the node was far behind the highest known justification. /// It defines being in major sync as being more than 15 blocks behind the highest known justification less than 10 seconds ago. /// It defines being offline as not getting any update for at least 6 seconds (or never at all). @@ -22,19 +20,17 @@ pub type MajorSyncIndicator = Arc; pub struct SyncOracle { last_far_behind: Arc>, last_update: Arc>, - // TODO: remove when SyncingService is no longer needed is_major_syncing: Arc, } impl SyncOracle { - pub fn new() -> (Self, MajorSyncIndicator) { + pub fn new() -> Self { let is_major_syncing = Arc::new(AtomicBool::new(true)); - let oracle = SyncOracle { + SyncOracle { last_update: Arc::new(Mutex::new(Instant::now() - OFFLINE_THRESHOLD)), last_far_behind: Arc::new(Mutex::new(Instant::now())), is_major_syncing: is_major_syncing.clone(), - }; - (oracle, is_major_syncing) + } } pub fn update_behind(&self, behind: u32) { @@ -53,11 +49,15 @@ impl SyncOracle { .store(is_major_syncing, Ordering::Relaxed); is_major_syncing } + + pub fn underlying_atomic(&self) -> Arc { + self.is_major_syncing.clone() + } } impl Default for SyncOracle { fn default() -> Self { - SyncOracle::new().0 + SyncOracle::new() } }