Skip to content

Commit

Permalink
Aligning timeouts to reflect real-world scenarios (#399)
Browse files Browse the repository at this point in the history
* Changing error messages in case of node disconnection;
* cleaning up unused properties
* removing request_limit and the logic attached to that since we don't actually handle multiple in-flight requests to binary-port
* Removing the possibility to define "infinite" as a valid retry amount in node client connector since it can lead to deadlocks. That allowed removal of RpcServerConfigTarget, NodeClientConfigTarget, ExponentialBackoffConfigTarget and MaxAttemptsTarget since we don't need custom code for deserialization of the config file.
* Added some metrics to track unwanted events (timeouts on connection/sending/receiving data from binary port, detecting response id mismatch)
* Changed buckets definitions in RESPONSE_TIME_MS_BUCKETS constant
* Added MAX_COMPONENT_STARTUP_TIMEOUT_SECS guard in case one of the components hangs on startup
* Making keepalive loop use the standard mechanism of sending messages to gain retries and id-checks
* Aligning message_timeout_secs
  • Loading branch information
zajko authored Jan 29, 2025
1 parent 7a22f99 commit 55151e6
Show file tree
Hide file tree
Showing 14 changed files with 194 additions and 375 deletions.
24 changes: 12 additions & 12 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ address = '0.0.0.0:28101'
max_message_size_bytes = 4_194_304
request_limit = 3
request_buffer_size = 16
message_timeout_secs = 30
client_access_timeout_secs = 2
message_timeout_secs = 10
client_access_timeout_secs = 10

[rpc_server.speculative_exec_server]
enable_server = true
Expand Down
39 changes: 37 additions & 2 deletions metrics/src/rpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ const RESPONSE_SIZE_BUCKETS: &[f64; 8] = &[
5e+2_f64, 1e+3_f64, 2e+3_f64, 5e+3_f64, 5e+4_f64, 5e+5_f64, 5e+6_f64, 5e+7_f64,
];

const RESPONSE_TIME_MS_BUCKETS: &[f64; 8] = &[
1_f64, 5_f64, 10_f64, 30_f64, 50_f64, 100_f64, 200_f64, 300_f64,
const RESPONSE_TIME_MS_BUCKETS: &[f64; 9] = &[
1_f64, 5_f64, 10_f64, 30_f64, 50_f64, 100_f64, 300_f64, 1000_f64, 3000_f64,
];

static ENDPOINT_CALLS: Lazy<IntCounterVec> = Lazy::new(|| {
Expand All @@ -24,6 +24,21 @@ static ENDPOINT_CALLS: Lazy<IntCounterVec> = Lazy::new(|| {
counter
});

/// Counter family for requests that failed due to an internal timeout.
///
/// Each series is distinguished by the single `timer` label.
///
/// # Panics
/// The initializer panics if the metric cannot be created or cannot be
/// registered with `REGISTRY`.
static TIMEOUT_COUNTERS: Lazy<IntCounterVec> = Lazy::new(|| {
    let counter = IntCounterVec::new(
        Opts::new(
            "rpc_server_timeout_counts",
            "Counters for how many of the requests failed due to internal timeout",
        ),
        &["timer"],
    )
    // Name the metric in the panic message so a startup failure is attributable.
    .expect("rpc_server_timeout_counts metric can't be created");
    REGISTRY
        .register(Box::new(counter.clone()))
        .expect("cannot register metric");
    counter
});

static RESPONSE_TIMES_MS: Lazy<HistogramVec> = Lazy::new(|| {
let histogram = HistogramVec::new(
HistogramOpts {
Expand Down Expand Up @@ -56,6 +71,18 @@ static RECONNECT_TIMES_MS: Lazy<Histogram> = Lazy::new(|| {
histogram
});

/// Gauge holding the number of mismatched ID events observed in responses
/// from binary port.
///
/// # Panics
/// The initializer panics if the gauge cannot be created or cannot be
/// registered with `REGISTRY`.
static MISMATCHED_IDS: Lazy<IntGauge> = Lazy::new(|| {
    let mismatched_ids = IntGauge::new(
        "rpc_server_mismatched_ids",
        "Number of mismatched ID events observed in responses from binary port",
    )
    .expect("rpc_server_mismatched_ids metric can't be created");

    // Register a clone; the original handle is what callers increment.
    let registration = REGISTRY.register(Box::new(mismatched_ids.clone()));
    registration.expect("cannot register metric");

    mismatched_ids
});

static DISCONNECT_EVENTS: Lazy<IntGauge> = Lazy::new(|| {
let counter = IntGauge::new(
"rpc_server_disconnects",
Expand Down Expand Up @@ -108,3 +135,11 @@ pub fn register_request_size(method: &str, payload_size: f64) {
.with_label_values(&[method])
.observe(payload_size);
}

/// Increments the timeout counter for the series labelled with `timer_name`.
pub fn register_timeout(timer_name: &str) {
    let labels = [timer_name];
    let series = TIMEOUT_COUNTERS.with_label_values(&labels);
    series.inc();
}

/// Records one mismatched-ID event by calling `inc()` on the `MISMATCHED_IDS` gauge.
pub fn register_mismatched_id() {
MISMATCHED_IDS.inc();
}
6 changes: 2 additions & 4 deletions resources/example_configs/EXAMPLE_NCTL_CONFIG.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@ cors_origin = ""
ip_address = "0.0.0.0"
port = 28102
max_message_size_bytes = 4194304
request_limit = 3
request_buffer_size = 16
message_timeout_secs = 30
client_access_timeout_secs = 2
message_timeout_secs = 10
client_access_timeout_secs = 10
keepalive_timeout_ms = 10_000

[rpc_server.node_client.exponential_backoff]
Expand Down
6 changes: 2 additions & 4 deletions resources/example_configs/EXAMPLE_NCTL_POSTGRES_CONFIG.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@ cors_origin = ""
ip_address = "0.0.0.0"
port = 28102
max_message_size_bytes = 4194304
request_limit = 3
request_buffer_size = 16
message_timeout_secs = 30
client_access_timeout_secs = 2
message_timeout_secs = 10
client_access_timeout_secs = 10
keepalive_timeout_ms = 10_000

[rpc_server.node_client.exponential_backoff]
Expand Down
6 changes: 2 additions & 4 deletions resources/example_configs/EXAMPLE_NODE_CONFIG.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@ cors_origin = ""
ip_address = "3.20.57.210"
port = 7777
max_message_size_bytes = 4194304
request_limit = 10
request_buffer_size = 50
message_timeout_secs = 60
client_access_timeout_secs = 60
message_timeout_secs = 10
client_access_timeout_secs = 10
keepalive_timeout_ms = 10_000

[rpc_server.node_client.exponential_backoff]
Expand Down
8 changes: 2 additions & 6 deletions resources/example_configs/default_debian_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,10 @@ ip_address = '127.0.0.1'
port = 7779
# Maximum size of a message in bytes.
max_message_size_bytes = 4_194_304
# Maximum number of in-flight node requests.
request_limit = 3
# Number of node requests that can be buffered.
request_buffer_size = 16
# Timeout for a node request in seconds.
message_timeout_secs = 30
message_timeout_secs = 10
# Timeout specifying how long to wait for binary port client to be available.
client_access_timeout_secs = 2
client_access_timeout_secs = 10
# The amount of time in milliseconds to wait between sending keepalive requests.
keepalive_timeout_ms = 10_000

Expand Down
4 changes: 2 additions & 2 deletions resources/example_configs/default_rpc_only_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ request_limit = 3
# Number of node requests that can be buffered.
request_buffer_size = 16
# Timeout for a node request in seconds.
message_timeout_secs = 30
message_timeout_secs = 10
# Timeout specifying how long to wait for binary port client to be available.
client_access_timeout_secs = 2
client_access_timeout_secs = 10
# The amount of time in milliseconds to wait between sending keepalive requests.
keepalive_timeout_ms = 10_000

Expand Down
2 changes: 1 addition & 1 deletion resources/example_configs/default_sse_only_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ port = 18888
max_concurrent_requests = 50
max_requests_per_second = 50

[admin_server]
[admin_api_server]
enable_server = true
port = 18887
max_concurrent_requests = 1
Expand Down
Loading

0 comments on commit 55151e6

Please sign in to comment.