Skip to content

Commit 417bebe

Browse files
Merge pull request #160 from srl-labs/feat/better-launcher-logs-on-container-start-fail
feat: some hopefully better logging for when clab fails due to container failing too quickly
2 parents 23bb254 + e9d1b4c commit 417bebe

File tree

7 files changed

+89
-20
lines changed

7 files changed

+89
-20
lines changed

build/launcher.Dockerfile

+2-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ FROM --platform=linux/amd64 debian:bookworm-slim
2626
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
2727

2828
ARG DOCKER_VERSION="5:26.*"
29-
ARG CONTAINERLAB_VERSION="0.55.*"
29+
# pinning back because vxlan tools have an issue in 0.52.0 that we still need to investigate
30+
ARG CONTAINERLAB_VERSION="0.51.3"
3031
ARG NERDCTL_VERSION="1.7.6"
3132

3233
RUN apt-get update && \

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ require (
1515
// pin back to help controller-runtime out
1616
// https://github.com/kubernetes-sigs/controller-runtime/issues/2788
1717
k8s.io/client-go v0.30.2
18-
k8s.io/klog/v2 v2.130.0
18+
k8s.io/klog/v2 v2.130.1
1919
k8s.io/kube-openapi v0.0.0-20240521193020-835d969ad83a
2020
sigs.k8s.io/controller-runtime v0.18.4
2121
sigs.k8s.io/yaml v1.4.0

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,8 @@ k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw=
310310
k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
311311
k8s.io/klog/v2 v2.130.0 h1:5nB3+3HpqKqXJIXNtJdtxcDCfaa9KL8StJgMzGJkUkM=
312312
k8s.io/klog/v2 v2.130.0/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
313+
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
314+
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
313315
k8s.io/kms v0.30.2 h1:VSZILO/tkzrz5Tu2j+yFQZ2Dc5JerQZX2GqhFJbQrfw=
314316
k8s.io/kms v0.30.2/go.mod h1:GrMurD0qk3G4yNgGcsCEmepqf9KyyIrTXYR2lyUOJC4=
315317
k8s.io/kube-openapi v0.0.0-20240521193020-835d969ad83a h1:zD1uj3Jf+mD4zmA7W+goE5TxDkI7OGJjBNBzq5fJtLA=

launcher/clabernetes.go

+23-3
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ func (c *clabernetes) startup() {
113113
c.launch()
114114
c.connectivity()
115115

116+
go c.imageCleanup()
116117
go c.runProbes()
117118
go c.watchContainers()
118119

@@ -198,10 +199,15 @@ func (c *clabernetes) launch() {
198199

199200
err := c.runContainerlab()
200201
if err != nil {
201-
c.logger.Fatalf("failed launching containerlab, err: %s", err)
202+
c.logger.Criticalf(
203+
"failed launching containerlab,"+
204+
" will try to gather crashed container logs then will exit, err: %s", err,
205+
)
206+
207+
c.reportContainerLaunchFail()
202208
}
203209

204-
c.containerIDs, err = getContainerIDs()
210+
c.containerIDs, err = getContainerIDs(false)
205211
if err != nil {
206212
c.logger.Warnf(
207213
"failed determining container ids will continue but will not log container output,"+
@@ -385,7 +391,7 @@ func (c *clabernetes) watchContainers() {
385391
ticker := time.NewTicker(containerCheckInterval)
386392

387393
for range ticker.C {
388-
currentContainerIDs, err := getContainerIDs()
394+
currentContainerIDs, err := getContainerIDs(false)
389395
if err != nil {
390396
c.logger.Warnf(
391397
"failed listing container ids, error: %s",
@@ -406,3 +412,17 @@ func (c *clabernetes) watchContainers() {
406412
}
407413
}
408414
}
415+
416+
// reportContainerLaunchFail dumps the logs of every container known to docker
// and then exits the process with clabernetesconstants.ExitCodeError. launch
// calls it after runContainerlab returns an error, so that output from
// containers that already crashed is surfaced before the launcher goes away.
// This function does not return.
func (c *clabernetes) reportContainerLaunchFail() {
// Passing true makes getContainerIDs add "-a" to "docker ps", so containers
// that have already stopped are included and not only the running ones.
417+
allContainerIDs, err := getContainerIDs(true)
418+
if err != nil {
// NOTE(review): Fatalf presumably logs and terminates the process, in which
// case nothing below runs on this path -- confirm against the logging package.
419+
c.logger.Fatalf(
420+
"failed launching containerlab, then failed gathering all container "+
421+
"ids to report container status. error: %s", err,
422+
)
423+
}
424+
// Container output is written to the node logger rather than c.logger.
425+
printContainerLogs(c.nodeLogger, allContainerIDs)
426+
// Always exit with the error code: this is only reached after a failed launch.
427+
os.Exit(clabernetesconstants.ExitCodeError)
428+
}

launcher/connectivity/vxlan.go

+8-12
Original file line numberDiff line numberDiff line change
@@ -73,20 +73,16 @@ func (m *vxlanManager) resolveVXLANService(vxlanRemote string) (string, error) {
7373
for attempt := 0; attempt < resolveServiceMaxAttempts; attempt++ {
7474
resolvedVxlanRemotes, err = net.LookupIP(vxlanRemote)
7575
if err != nil {
76-
if attempt < resolveServiceMaxAttempts {
77-
m.logger.Warnf(
78-
"failed resolving remote vxlan endpoint but under max attempts will try"+
79-
" again in %s. error: %s",
80-
resolveServiceSleep,
81-
err,
82-
)
83-
84-
time.Sleep(resolveServiceSleep)
76+
m.logger.Warnf(
77+
"failed resolving remote vxlan endpoint but under max attempts will try"+
78+
" again in %s. error: %s",
79+
resolveServiceSleep,
80+
err,
81+
)
8582

86-
continue
87-
}
83+
time.Sleep(resolveServiceSleep)
8884

89-
return "", err
85+
continue
9086
}
9187

9288
break

launcher/docker.go

+34-3
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,16 @@ func startDocker(logger io.Writer) error {
124124
}
125125
}
126126

127-
func getContainerIDs() ([]string, error) {
128-
// return all the container ids running in the pod
129-
psCmd := exec.Command("docker", "ps", "--quiet")
127+
func getContainerIDs(all bool) ([]string, error) {
128+
args := []string{"ps"}
129+
130+
if all {
131+
args = append(args, "-a")
132+
}
133+
134+
args = append(args, "--quiet")
135+
136+
psCmd := exec.Command("docker", args...)
130137

131138
output, err := psCmd.Output()
132139
if err != nil {
@@ -148,6 +155,30 @@ func getContainerIDs() ([]string, error) {
148155
return containerIDs, nil
149156
}
150157

158+
// printContainerLogs runs "docker logs <id>" once for each entry in
// containerIDs, one after another, with both stdout and stderr of the command
// pointed at logger. A failure for one container is reported as a warning on
// the same logger and does not stop the remaining containers from being
// processed. Nothing is returned to the caller.
func printContainerLogs(
159+
logger claberneteslogging.Instance,
160+
containerIDs []string,
161+
) {
162+
for _, containerID := range containerIDs {
163+
args := []string{
164+
"logs",
165+
containerID,
166+
}
167+
168+
cmd := exec.Command("docker", args...) //nolint:gosec
169+
// The logger is assigned directly as the command's output writer, so the
// container output ends up in the logger itself.
170+
cmd.Stdout = logger
171+
cmd.Stderr = logger
172+
// Run blocks until "docker logs" exits; there is no follow flag in args.
173+
err := cmd.Run()
174+
if err != nil {
// Warn and move on to the next container instead of aborting the loop.
175+
logger.Warnf(
176+
"printing node logs for container id %q failed, err: %s", containerID, err,
177+
)
178+
}
179+
}
180+
}
181+
151182
func tailContainerLogs(
152183
logger claberneteslogging.Instance,
153184
nodeLogger io.Writer,

launcher/image.go

+19
Original file line numberDiff line numberDiff line change
@@ -356,3 +356,22 @@ func (c *clabernetes) imageImport() error {
356356

357357
return nil
358358
}
359+
360+
// imageCleanup runs "docker system prune --force" and sends the command's
// stdout and stderr to c.logger. A failing prune is logged as a warning only;
// no error is returned. startup launches this in its own goroutine after
// launch and connectivity have been called.
func (c *clabernetes) imageCleanup() {
361+
c.logger.Debug("running image (docker) cleanup in background...")
362+
// NOTE(review): the variable is named exportCmd although it holds a prune
// command -- looks like a leftover name from copied code; consider renaming.
363+
exportCmd := exec.Command(
364+
"docker",
365+
"system",
366+
"prune",
367+
"--force",
368+
)
369+
370+
exportCmd.Stdout = c.logger
371+
exportCmd.Stderr = c.logger
372+
// Run blocks this goroutine until the prune command exits.
373+
err := exportCmd.Run()
374+
if err != nil {
375+
c.logger.Warnf("failed pruning docker daemon, error: %s", err)
376+
}
377+
}

0 commit comments

Comments
 (0)