From 4394adaaebfad02512142c8f43bfb456cdf0bd73 Mon Sep 17 00:00:00 2001 From: Shunpoco Date: Tue, 30 Jan 2024 00:04:45 +0000 Subject: [PATCH 1/2] Add gRPC health check to Dataplane Currently our dataplane server doesn't have a health check, so we can't confirm whether the dataplane daemonset runs properly or not. This PR introduces a gRPC health check to the dataplane server, and adds liveness, readiness, and startup probes for the daemonset. Signed-off-by: Shunsuke Tokunaga Co-authored-by: Shane Utt --- config/dataplane/dataplane.yaml | 22 ++++++++++++++++++++++ dataplane/api-server/Cargo.toml | 1 + dataplane/api-server/src/lib.rs | 3 +++ 3 files changed, 26 insertions(+) diff --git a/config/dataplane/dataplane.yaml b/config/dataplane/dataplane.yaml index 042db122..77aef1d5 100644 --- a/config/dataplane/dataplane.yaml +++ b/config/dataplane/dataplane.yaml @@ -28,3 +28,25 @@ spec: - name: RUST_LOG value: debug imagePullPolicy: IfNotPresent + # The gRPC API has a slow startup time, so this probe helps to provide some + # grace while starting up to avoid unnecessary kills. + # + # TODO: When we complete https://github.com/kubernetes-sigs/blixt/issues/173 + # if we decide that we intend to keep the gRPC API around long term, we should + # take some time to see if we can clean up and improve the start time overall. 
+ startupProbe: + grpc: + port: 9874 + failureThreshold: 30 + periodSeconds: 10 + livenessProbe: + grpc: + port: 9874 + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + grpc: + port: 9874 + initialDelaySeconds: 5 + periodSeconds: 5 + diff --git a/dataplane/api-server/Cargo.toml b/dataplane/api-server/Cargo.toml index 94c5e467..4a5add75 100644 --- a/dataplane/api-server/Cargo.toml +++ b/dataplane/api-server/Cargo.toml @@ -7,6 +7,7 @@ publish = false [dependencies] prost = "0.11.9" tonic = "0.9.2" +tonic-health = "0.9.2" anyhow = "1" log = "0.4" aya = { version = ">=0.11", features=["async_tokio"] } diff --git a/dataplane/api-server/src/lib.rs b/dataplane/api-server/src/lib.rs index c602c605..2fd7dfd1 100644 --- a/dataplane/api-server/src/lib.rs +++ b/dataplane/api-server/src/lib.rs @@ -24,9 +24,12 @@ pub async fn start( gateway_indexes_map: HashMap, tcp_conns_map: HashMap, ) -> Result<(), Error> { + let (_, health_service) = tonic_health::server::health_reporter(); + let server = server::BackendService::new(backends_map, gateway_indexes_map, tcp_conns_map); // TODO: mTLS https://github.com/Kong/blixt/issues/50 Server::builder() + .add_service(health_service) .add_service(BackendsServer::new(server)) .serve(SocketAddrV4::new(addr, port).into()) .await?; From d1351f4a66f08115927719d96a67d9301742dd3b Mon Sep 17 00:00:00 2001 From: Shunpoco Date: Tue, 6 Feb 2024 22:24:52 +0000 Subject: [PATCH 2/2] integration-test: add validation for readiness probe on dataplane This commit adds a validation function for the dataplane. The function first checks whether a dataplane pod has a readiness probe configured. Then it checks all dataplane pods' readiness status. If all pods are ready, it finishes with no error. 
Signed-off-by: Shunsuke Tokunaga Co-authored-by: Shane Utt Co-authored-by: Sanskar Jaiswal --- test/integration/suite_test.go | 51 ++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/test/integration/suite_test.go b/test/integration/suite_test.go index 97173749..c6e47b35 100644 --- a/test/integration/suite_test.go +++ b/test/integration/suite_test.go @@ -36,6 +36,7 @@ import ( "sigs.k8s.io/gateway-api/pkg/client/clientset/versioned" testutils "github.com/kubernetes-sigs/blixt/internal/test/utils" + "github.com/kubernetes-sigs/blixt/pkg/vars" ) var ( @@ -181,6 +182,9 @@ func TestMain(m *testing.M) { fmt.Println("INFO: waiting for Blixt component readiness") exitOnErr(testutils.WaitForBlixtReadiness(ctx, env)) + fmt.Println("INFO: waiting for Dataplane readiness") + exitOnErr(waitForDataplaneReadiness(ctx, env)) + exit := m.Run() exitOnErr(runCleanup(mainCleanupKey)) @@ -309,3 +313,50 @@ func waitForBpfdConfigDelete(ctx context.Context, env environments.Environment) } } } + +func waitForDataplaneReadiness(ctx context.Context, env environments.Environment) error { + for { + select { + case <-ctx.Done(): + if err := ctx.Err(); err != nil { + return fmt.Errorf("context completed while waiting for dataplane readiness, and an error occurred: %w", err) + } + return fmt.Errorf("context completed while waiting for dataplane readiness") + default: + dataplanes, err := env.Cluster().Client().CoreV1().Pods(vars.DefaultNamespace). + List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("component=%s", vars.DefaultDataPlaneComponentLabel)}) + if err != nil { + return fmt.Errorf("failed to fetch dataplane pod list: %w", err) + } + + if len(dataplanes.Items) == 0 { + return fmt.Errorf("no dataplanes found in namespace %s", vars.DefaultNamespace) + } + + // Verify that the first listed pod's dataplane container has a readiness probe configured.
+ for _, container := range dataplanes.Items[0].Spec.Containers { + if container.Name == "dataplane" && container.ReadinessProbe == nil { + return fmt.Errorf("found a dataplane container which doesn't have readiness probe") + } + } + + // Check all pods' readiness + var ready int + for _, pod := range dataplanes.Items { + for _, status := range pod.Status.ContainerStatuses { + if status.Name == "dataplane" && status.Ready { + ready++ + } + } + } + + if ready != len(dataplanes.Items) { + fmt.Printf("%d dataplanes not yet ready\n", len(dataplanes.Items)-ready) + time.Sleep(time.Second) // small rest from hitting the API over and over again + break + } + + return nil + } + } +}