Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataplane: Add gRPC health check #182

Merged
merged 2 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions config/dataplane/dataplane.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,25 @@ spec:
- name: RUST_LOG
value: debug
imagePullPolicy: IfNotPresent
# The gRPC API has a slow startup time, so this probe helps to provide some
# grace while starting up to avoid unnecessary kills.
#
# TODO: When we complete https://github.com/kubernetes-sigs/blixt/issues/173
# if we decide that we intend to keep the gRPC API around long term, we should
# take some time to see if we can clean up and improve the start time overall.
startupProbe:
grpc:
port: 9874
failureThreshold: 30
periodSeconds: 10
livenessProbe:
grpc:
port: 9874
initialDelaySeconds: 5
periodSeconds: 5
readinessProbe:
grpc:
port: 9874
initialDelaySeconds: 5
periodSeconds: 5

1 change: 1 addition & 0 deletions dataplane/api-server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ publish = false
[dependencies]
prost = "0.11.9"
tonic = "0.9.2"
tonic-health = "0.9.2"
anyhow = "1"
log = "0.4"
aya = { version = ">=0.11", features=["async_tokio"] }
Expand Down
3 changes: 3 additions & 0 deletions dataplane/api-server/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,12 @@ pub async fn start(
gateway_indexes_map: HashMap<MapData, BackendKey, u16>,
tcp_conns_map: HashMap<MapData, ClientKey, LoadBalancerMapping>,
) -> Result<(), Error> {
let (_, health_service) = tonic_health::server::health_reporter();

let server = server::BackendService::new(backends_map, gateway_indexes_map, tcp_conns_map);
// TODO: mTLS https://github.com/Kong/blixt/issues/50
Server::builder()
.add_service(health_service)
.add_service(BackendsServer::new(server))
.serve(SocketAddrV4::new(addr, port).into())
.await?;
Expand Down
51 changes: 51 additions & 0 deletions test/integration/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"sigs.k8s.io/gateway-api/pkg/client/clientset/versioned"

testutils "github.com/kubernetes-sigs/blixt/internal/test/utils"
"github.com/kubernetes-sigs/blixt/pkg/vars"
)

var (
Expand Down Expand Up @@ -181,6 +182,9 @@ func TestMain(m *testing.M) {
fmt.Println("INFO: waiting for Blixt component readiness")
exitOnErr(testutils.WaitForBlixtReadiness(ctx, env))

fmt.Println("INFO: waiting for Dataplane readiness")
exitOnErr(waitForDataplaneReadiness(ctx, env))

exit := m.Run()

exitOnErr(runCleanup(mainCleanupKey))
Expand Down Expand Up @@ -309,3 +313,50 @@ func waitForBpfdConfigDelete(ctx context.Context, env environments.Environment)
}
}
}

// waitForDataplaneReadiness polls the pods carrying the dataplane component
// label in vars.DefaultNamespace until every one of them reports a ready
// "dataplane" container, or until ctx is done.
//
// It returns an error when ctx completes before all pods are ready, when
// listing the pods fails, when no dataplane pods are found, or when the
// "dataplane" container of the first listed pod has no readiness probe.
func waitForDataplaneReadiness(ctx context.Context, env environments.Environment) error {
	// Pace the polling so the API server is not hit over and over again.
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		// Bail out before issuing another request if the context is already done.
		if err := ctx.Err(); err != nil {
			return fmt.Errorf("context completed while waiting for dataplane readiness, and an error occurred: %w", err)
		}

		dataplanes, err := env.Cluster().Client().CoreV1().Pods(vars.DefaultNamespace).
			List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("component=%s", vars.DefaultDataPlaneComponentLabel)})
		if err != nil {
			return fmt.Errorf("failed to fetch dataplane pod list: %w", err)
		}

		if len(dataplanes.Items) == 0 {
			return fmt.Errorf("no dataplanes found in namespace %s", vars.DefaultNamespace)
		}

		// Verify that the dataplane container declares a readiness probe.
		// Only the first listed pod's spec is inspected.
		for _, container := range dataplanes.Items[0].Spec.Containers {
			if container.Name == "dataplane" && container.ReadinessProbe == nil {
				return fmt.Errorf("found a dataplane container which doesn't have readiness probe")
			}
		}

		// Count the pods whose "dataplane" container reports ready.
		var ready int
		for _, pod := range dataplanes.Items {
			for _, status := range pod.Status.ContainerStatuses {
				if status.Name == "dataplane" && status.Ready {
					ready++
				}
			}
		}

		if ready == len(dataplanes.Items) {
			return nil
		}

		fmt.Printf("%d dataplanes not yet ready\n", len(dataplanes.Items)-ready)

		// Wait for the next poll, but stop waiting as soon as the context is
		// done instead of sleeping through the cancellation.
		select {
		case <-ctx.Done():
			return fmt.Errorf("context completed while waiting for dataplane readiness, and an error occurred: %w", ctx.Err())
		case <-ticker.C:
		}
	}
}
Loading