From 918742b39ac6e9c53ac3718fac634200caf0e50f Mon Sep 17 00:00:00 2001 From: Aman Khinvasara Date: Tue, 11 Feb 2025 20:06:06 +0000 Subject: [PATCH 1/3] quic: ack trace probes and timeout margin --- src/app/fdctl/config/default.toml | 2 +- src/waltz/quic/fd_quic.c | 5 +++-- src/waltz/quic/fd_quic_ack_tx.c | 4 ++++ src/waltz/quic/tests/fuzz_quic_wire.c | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index d8e55aa3c6..1ac222307f 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -945,7 +945,7 @@ dynamic_port_range = "8900-9000" idle_timeout_millis = 10000 # Max delay for outgoing ACKs. - ack_delay_millis = 50 + ack_delay_millis = 25 # QUIC retry is a feature to combat new connection request # spamming. See rfc9000 8.1.2 for more details. This flag diff --git a/src/waltz/quic/fd_quic.c b/src/waltz/quic/fd_quic.c index b576ca5033..2426579157 100644 --- a/src/waltz/quic/fd_quic.c +++ b/src/waltz/quic/fd_quic.c @@ -491,7 +491,7 @@ fd_quic_init( fd_quic_t * quic ) { } state->svc_delay[ FD_QUIC_SVC_INSTANT ] = 0UL; state->svc_delay[ FD_QUIC_SVC_ACK_TX ] = quic->config.ack_delay; - state->svc_delay[ FD_QUIC_SVC_WAIT ] = quic->config.idle_timeout; + state->svc_delay[ FD_QUIC_SVC_WAIT ] = quic->config.idle_timeout * 10; /* Check TX AIO */ @@ -2706,7 +2706,8 @@ fd_quic_tls_cb_peer_params( void * context, /* set the max_idle_timeout to the min of our and peer max_idle_timeout */ if( peer_tp->max_idle_timeout ) { - conn->idle_timeout = fd_ulong_min( (ulong)(1e6) * peer_tp->max_idle_timeout, conn->idle_timeout ); + /* 10 for margin of safety */ + conn->idle_timeout = 10*fd_ulong_min( (ulong)(1e6) * peer_tp->max_idle_timeout, conn->idle_timeout ); } /* set ack_delay_exponent so we can properly interpret peer's ack_delays diff --git a/src/waltz/quic/fd_quic_ack_tx.c b/src/waltz/quic/fd_quic_ack_tx.c index 0b0610ef74..6cf7802f5b 100644 --- a/src/waltz/quic/fd_quic_ack_tx.c +++ b/src/waltz/quic/fd_quic_ack_tx.c @@ -55,11 +55,13 @@ fd_quic_ack_pkt( fd_quic_ack_gen_t * gen, /* Attempt to allocate another ACK queue entry */ if( gen->head - gen->tail >= FD_QUIC_ACK_QUEUE_CNT ) { + FD_DTRACE_PROBE_3( fd_quic_ack_tx_overflow, enc_level, pkt_number, now ); FD_DEBUG( FD_LOG_DEBUG(( "ACK queue overflow! (excessive reordering)" )); ) return FD_QUIC_ACK_TX_ENOSPC; } /* Start new pending ACK */ + FD_DTRACE_PROBE_3( fd_quic_ack_range_new, enc_level, pkt_number, now ); FD_ACK_DEBUG( FD_LOG_DEBUG(( "gen=%p queue ACK for enc=%u pkt_num=%lu seq=%u", (void *)gen, enc_level, pkt_number, gen->head )); ) fd_quic_ack_t * next_ack = fd_quic_ack_queue_ele( gen, gen->head ); @@ -105,6 +107,7 @@ fd_quic_gen_ack_frames( fd_quic_ack_gen_t * gen, for( ; gen->tail != gen->head; gen->tail++ ) { fd_quic_ack_t * ack = fd_quic_ack_queue_ele( gen, gen->tail ); if( ack->enc_level != enc_level ) { + FD_DTRACE_PROBE_2( fd_quic_ack_tx_enc_level_break, ack->enc_level, enc_level ); FD_ACK_DEBUG( FD_LOG_DEBUG(( "need encryption level %u for ACKs but have %u", ack->enc_level, enc_level )); ) break; } @@ -134,6 +137,7 @@ fd_quic_gen_ack_frames( fd_quic_ack_gen_t * gen, if( gen->head == gen->tail ) { gen->is_elicited = 0; } else { + FD_DTRACE_PROBE_1( fd_quic_ack_not_all_frames_flushed, gen->head - gen->tail ); FD_ACK_DEBUG( FD_LOG_DEBUG(( "Not all ACK frames were flushed" )); ) } diff --git a/src/waltz/quic/tests/fuzz_quic_wire.c b/src/waltz/quic/tests/fuzz_quic_wire.c index 1c964b1e3e..9cfc42b406 100644 --- a/src/waltz/quic/tests/fuzz_quic_wire.c +++ b/src/waltz/quic/tests/fuzz_quic_wire.c @@ -198,7 +198,7 @@ LLVMFuzzerTestOneInput( uchar const * data, /* Simulate conn timeout */ while( state->svc_queue[ FD_QUIC_SVC_WAIT ].head != UINT_MAX ) { - ulong idle_timeout_ts = conn->last_activity + quic->config.idle_timeout + 1UL; + ulong idle_timeout_ts = conn->last_activity + 10*quic->config.idle_timeout + 1UL; fd_quic_conn_t * conn = fd_quic_conn_at_idx( state, state->svc_queue[ FD_QUIC_SVC_WAIT ].head ); /* Idle timeouts should not be scheduled significantly late */ From 1646d3a95e3b3926dd060aead32b56a2762a550c Mon Sep 17 00:00:00 2001 From: Aman Khinvasara Date: Tue, 11 Feb 2025 22:02:24 +0000 Subject: [PATCH 2/3] more probes --- src/waltz/quic/fd_quic.c | 8 +++++++- src/waltz/quic/fd_quic_ack_tx.c | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/waltz/quic/fd_quic.c b/src/waltz/quic/fd_quic.c index 2426579157..87e025b4f7 100644 --- a/src/waltz/quic/fd_quic.c +++ b/src/waltz/quic/fd_quic.c @@ -1755,6 +1755,7 @@ fd_quic_handle_v1_initial( fd_quic_t * quic, } /* update last activity */ + FD_DTRACE_PROBE_2( fd_quic_initial_touch_last_activity, conn->our_conn_id, state->now ); conn->last_activity = state->now; conn->flags &= ~( FD_QUIC_CONN_FLAGS_PING_SENT | FD_QUIC_CONN_FLAGS_PING ); @@ -1906,6 +1907,7 @@ fd_quic_handle_v1_handshake( } /* update last activity */ + FD_DTRACE_PROBE_2( fd_quic_hs_touch_last_activity, conn->our_conn_id, fd_quic_get_state( quic )->now ); conn->last_activity = fd_quic_get_state( quic )->now; conn->flags &= ~( FD_QUIC_CONN_FLAGS_PING_SENT | FD_QUIC_CONN_FLAGS_PING ); @@ -2161,6 +2163,7 @@ fd_quic_handle_v1_one_rtt( fd_quic_t * quic, } /* update last activity */ + FD_DTRACE_PROBE_2( fd_quic_one_rtt_touch_last_activity, conn->our_conn_id, fd_quic_get_state( quic )->now ); conn->last_activity = fd_quic_get_state( quic )->now; /* update expected packet number */ @@ -2169,7 +2172,6 @@ fd_quic_handle_v1_one_rtt( fd_quic_t * quic, return tot_sz; } - /* process v1 quic packets returns number of bytes consumed, or FD_QUIC_PARSE_FAIL upon error */ ulong @@ -2215,6 +2217,8 @@ fd_quic_process_quic_packet_v1( fd_quic_t * quic, if( dcid.sz == FD_QUIC_CONN_ID_SZ ) { conn = fd_quic_conn_query( state->conn_map, fd_ulong_load_8( dcid.conn_id ) ); } + + FD_DTRACE_PROBE_1( fd_quic_process_quic_packet_v1_long_hdr, fd_ulong_load_8(dcid.conn_id) ); fd_quic_conn_id_t scid = fd_quic_conn_id_new( long_hdr->src_conn_id, long_hdr->src_conn_id_len ); uchar long_packet_type = fd_quic_h0_long_packet_type( *cur_ptr ); @@ -2259,6 +2263,7 @@ fd_quic_process_quic_packet_v1( fd_quic_t * quic, /* find connection id */ ulong dst_conn_id = fd_ulong_load_8( cur_ptr+1 ); + FD_DTRACE_PROBE_1( fd_quic_process_quic_packet_v1_short_hdr, dst_conn_id ); conn = fd_quic_conn_query( state->conn_map, dst_conn_id ); if( FD_UNLIKELY( !conn ) ) { FD_DEBUG( FD_LOG_DEBUG(( "one_rtt failed: no connection found" )) ); @@ -2867,6 +2872,7 @@ fd_quic_svc_poll( fd_quic_t * quic, FD_DEBUG( FD_LOG_WARNING(( "%s conn %p conn_idx: %u closing due to idle timeout (%g ms)", conn->server?"SERVER":"CLIENT", (void *)conn, conn->conn_idx, (double)conn->idle_timeout / 1e6 )); ) + FD_DTRACE_PROBE_1( fd_quic_conn_idle_timeout, conn->our_conn_id ); conn->state = FD_QUIC_CONN_STATE_DEAD; quic->metrics.conn_timeout_cnt++; diff --git a/src/waltz/quic/fd_quic_ack_tx.c b/src/waltz/quic/fd_quic_ack_tx.c index 6cf7802f5b..e0038ff9e0 100644 --- a/src/waltz/quic/fd_quic_ack_tx.c +++ b/src/waltz/quic/fd_quic_ack_tx.c @@ -128,6 +128,7 @@ fd_quic_gen_ack_frames( fd_quic_ack_gen_t * gen, FD_DEBUG( FD_LOG_DEBUG(( "insufficient buffer space to send ACK" )); ) break; } + FD_DTRACE_PROBE_2( fd_quic_ack_frame_created, ack_frame.largest_ack, ack_frame.first_ack_range ); payload_ptr += frame_sz; FD_ACK_DEBUG( FD_LOG_DEBUG(( "gen=%p sending ACK enc=%u range=[%lu,%lu) seq=%u sz=%lu", (void *)gen, enc_level, ack->pkt_number.offset_lo, ack->pkt_number.offset_hi, gen->tail, frame_sz )); ) From 82a9521643e198e2ec99f0f684af0f1a75ef508e Mon Sep 17 00:00:00 2001 From: Aman Khinvasara Date: Wed, 12 Feb 2025 19:13:29 +0000 Subject: [PATCH 3/3] more traces --- src/waltz/quic/fd_quic.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/waltz/quic/fd_quic.c b/src/waltz/quic/fd_quic.c index 87e025b4f7..1e97fad05b 100644 --- a/src/waltz/quic/fd_quic.c +++ b/src/waltz/quic/fd_quic.c @@ -1368,6 +1368,7 @@ fd_quic_send_retry( fd_quic_t * quic, uint dst_ip_addr, ushort dst_udp_port ) { + FD_DTRACE_PROBE_3( fd_quic_send_retry, fd_ulong_load_8(odcid->conn_id), new_conn_id, dst_ip_addr ); fd_quic_state_t * state = fd_quic_get_state( quic ); ulong expire_at = state->now + quic->config.retry_ttl; @@ -2218,11 +2219,13 @@ fd_quic_process_quic_packet_v1( fd_quic_t * quic, conn = fd_quic_conn_query( state->conn_map, fd_ulong_load_8( dcid.conn_id ) ); } - FD_DTRACE_PROBE_1( fd_quic_process_quic_packet_v1_long_hdr, fd_ulong_load_8(dcid.conn_id) ); fd_quic_conn_id_t scid = fd_quic_conn_id_new( long_hdr->src_conn_id, long_hdr->src_conn_id_len ); uchar long_packet_type = fd_quic_h0_long_packet_type( *cur_ptr ); + FD_DTRACE_PROBE_3( fd_quic_process_quic_packet_v1_long_hdr, long_packet_type, fd_ulong_load_8(dcid.conn_id), + conn ? conn->our_conn_id : 0 ); + /* encryption level matches that of TLS */ pkt->enc_level = long_packet_type; /* V2 uses an indirect mapping */ @@ -2869,6 +2872,9 @@ fd_quic_svc_poll( fd_quic_t * quic, "... the connection is silently closed and its state is discarded when it remains idle for longer than the minimum of the max_idle_timeout value advertised by both endpoints." */ + if( FD_UNLIKELY( !fd_quic_conn_query( state->conn_map, conn->our_conn_id ) ) ) { + FD_DTRACE_PROBE_1( fd_quic_timeout_missing_conn, conn->our_conn_id ); + } FD_DEBUG( FD_LOG_WARNING(( "%s conn %p conn_idx: %u closing due to idle timeout (%g ms)", conn->server?"SERVER":"CLIENT", (void *)conn, conn->conn_idx, (double)conn->idle_timeout / 1e6 )); ) @@ -3862,6 +3868,8 @@ fd_quic_conn_tx( fd_quic_t * quic, pkt_meta->pkt_number++; } + FD_DTRACE_PROBE_3( fd_quic_conn_tx, conn_id, enc_level, pkt_number ); + if( enc_level == fd_quic_enc_level_appdata_id ) { /* short header must be last in datagram so send in packet immediately */