Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: restart services failed within okay delay #520

4 changes: 2 additions & 2 deletions docs/how-to/service-dependencies.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ successfully ("active"):
2024-06-28T02:20:03.346Z [pebble] Service "backend" starting: python3 -m http.server 8081
2024-06-28T02:20:03.346Z [pebble] Service "database" starting: python3 -m http.server 3306
2024-06-28T02:20:03.347Z [pebble] Service "frontend" starting: python3 -m http.server 8080
2024-06-28T02:20:03.396Z [pebble] Change 1 task (Start service "database") failed: cannot start service: exited quickly with code 1
2024-06-28T02:20:03.396Z [pebble] Change 1 task (Start service "database") failed: service start attempt: exited quickly with code 1, will restart
2024-06-28T02:20:04.353Z [pebble] Started default services with change 1.
```

Expand Down Expand Up @@ -238,7 +238,7 @@ to:
2024-06-28T02:28:06.569Z [pebble] Started daemon.
2024-06-28T02:28:06.575Z [pebble] POST /v1/services 3.212375ms 202
2024-06-28T02:28:06.578Z [pebble] Service "database" starting: python3 -m http.server 3306
2024-06-28T02:28:06.627Z [pebble] Change 1 task (Start service "database") failed: cannot start service: exited quickly with code 1
2024-06-28T02:28:06.627Z [pebble] Change 1 task (Start service "database") failed: service start attempt: exited quickly with code 1, will restart
2024-06-28T02:28:06.633Z [pebble] GET /v1/changes/1/wait 57.610375ms 200
2024-06-28T02:28:06.633Z [pebble] Started default services with change 1.
```
Expand Down
8 changes: 6 additions & 2 deletions internals/daemon/api_signals_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ services:
test1:
override: replace
command: sleep 10
on-failure: ignore
IronCore864 marked this conversation as resolved.
Show resolved Hide resolved
`)
d := s.daemon(c)
d.overlord.Loop()
Expand Down Expand Up @@ -80,14 +81,17 @@ services:
c.Check(rec.Result().StatusCode, Equals, 200)
c.Check(rsp.Result, DeepEquals, true)

// Ensure it goes into inactive state due to the signal
// Ensure it goes into error state due to the SIGTERM signal.
// The service returns with a non-zero exit code's return code because of SIGTERM,
// and since on-failure is configured as ignore, the state is transitioned into
// exited, corresponding to the error status.
for i := 0; ; i++ {
if i > 50 {
c.Fatalf("timed out waiting for service to go into backoff")
}
services, err := serviceMgr.Services([]string{"test1"})
c.Assert(err, IsNil)
if len(services) == 1 && services[0].Current == servstate.StatusInactive {
if len(services) == 1 && services[0].Current == servstate.StatusError {
break
}
time.Sleep(5 * time.Millisecond)
Expand Down
38 changes: 9 additions & 29 deletions internals/overlord/servstate/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ func (m *ServiceManager) doStart(task *state.Task, tomb *tomb.Tomb) error {
// Start the service and transition to stateStarting.
err = service.start()
if err != nil {
m.removeService(config.Name)
return err
}

Expand All @@ -142,17 +141,18 @@ func (m *ServiceManager) doStart(task *state.Task, tomb *tomb.Tomb) error {
case err := <-service.started:
if err != nil {
addLastLogs(task, service.logs)
m.removeService(config.Name)
return fmt.Errorf("cannot start service: %w", err)
// Do not remove the service so that Pebble will restart it if the action is restart,
// and the logs are still accessible for failed services if the action is ignore.
return fmt.Errorf("service start attempt: %w", err)
IronCore864 marked this conversation as resolved.
Show resolved Hide resolved
}
// Started successfully (ran for small amount of time without exiting).
return nil
case <-tomb.Dying():
// User tried to abort the start, sending SIGKILL to process is about
// the best we can do.
m.removeService(config.Name)
m.servicesLock.Lock()
defer m.servicesLock.Unlock()
service.transition(stateStopped)
err := syscall.Kill(-service.cmd.Process.Pid, syscall.SIGKILL)
if err != nil {
return fmt.Errorf("start aborted, but cannot send SIGKILL to process: %v", err)
Expand Down Expand Up @@ -279,28 +279,6 @@ func (m *ServiceManager) serviceForStop(name string) (service *serviceData, task
}
}

func (m *ServiceManager) removeService(name string) {
m.servicesLock.Lock()
defer m.servicesLock.Unlock()
m.removeServiceInternal(name)
}

// not concurrency-safe, please lock m.servicesLock before calling
func (m *ServiceManager) removeServiceInternal(name string) {
svc, svcExists := m.services[name]
if !svcExists {
return
}
if svc.logs != nil {
err := svc.logs.Close()
if err != nil {
logger.Noticef("Error closing service %q ring buffer: %v", name, err)
}
}

delete(m.services, name)
}

// transition changes the service's state machine to the given state.
func (s *serviceData) transition(state serviceState) {
logger.Debugf("Service %q transitioning to state %q", s.config.Name, state)
Expand Down Expand Up @@ -329,6 +307,7 @@ func (s *serviceData) start() error {
case stateInitial:
err := s.startInternal()
if err != nil {
s.transition(stateStopped)
return err
}
s.transition(stateStarting)
Expand Down Expand Up @@ -439,7 +418,6 @@ func (s *serviceData) startInternal() error {
if outputIterator != nil {
_ = outputIterator.Close()
}
_ = s.logs.Close()
return fmt.Errorf("cannot start service: %w", err)
}
logger.Debugf("Service %q started with PID %d", serviceName, s.cmd.Process.Pid)
Expand Down Expand Up @@ -510,8 +488,10 @@ func (s *serviceData) exited(exitCode int) error {

switch s.state {
case stateStarting:
s.started <- fmt.Errorf("exited quickly with code %d", exitCode)
s.transition(stateExited) // not strictly necessary as doStart will return, but doesn't hurt
// Send error to select waiting in doStart, then fall through to perform action.
action, _ := getAction(s.config, exitCode == 0)
s.started <- fmt.Errorf("exited quickly with code %d, will %s", exitCode, action)
fallthrough

case stateRunning:
logger.Noticef("Service %q stopped unexpectedly with code %d", s.config.Name, exitCode)
Expand Down
10 changes: 0 additions & 10 deletions internals/overlord/servstate/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,6 @@ func (m *ServiceManager) getPlan() *plan.Plan {
return m.plan
}

// Stop implements overlord.StateStopper and stops background functions.
func (m *ServiceManager) Stop() {
// Close all the service ringbuffers
m.servicesLock.Lock()
defer m.servicesLock.Unlock()
for name := range m.services {
m.removeServiceInternal(name)
}
}

// Ensure implements StateManager.Ensure.
func (m *ServiceManager) Ensure() error {
return nil
Expand Down
41 changes: 36 additions & 5 deletions internals/overlord/servstate/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,6 @@ func (s *S) TearDownTest(c *C) {
// service manager.
s.stopRunningServices(c)
}
s.manager.Stop()
}
// General test cleanup
s.BaseTest.TearDownTest(c)
Expand Down Expand Up @@ -633,21 +632,53 @@ services:

func (s *S) TestStartFastExitCommand(c *C) {
s.newServiceManager(c)
s.planAddLayer(c, testPlanLayer)
var layer = `
services:
test4:
override: replace
command: echo -e 'too-fast\nsecond line'
`
s.planAddLayer(c, layer)
s.planChanged(c)

chg := s.startServices(c, [][]string{{"test4"}})

s.st.Lock()
c.Check(chg.Status(), Equals, state.ErrorStatus)
c.Check(chg.Err(), ErrorMatches, `(?s).*\n- Start service "test4" \(cannot start service: exited quickly with code 0\)`)
c.Check(chg.Err(), ErrorMatches, `(?s).*\n- Start service "test4" \(service start attempt: exited quickly with code 0, will restart\)`)
c.Check(chg.Tasks()[0].Log(), HasLen, 2)
c.Check(chg.Tasks()[0].Log()[0], Matches, `(?s).* INFO Most recent service output:\n too-fast\n second line`)
c.Check(chg.Tasks()[0].Log()[1], Matches, `.* ERROR cannot start service: exited quickly with code 0`)
c.Check(chg.Tasks()[0].Log()[1], Matches, `.* ERROR service start attempt: exited quickly with code 0, will restart`)
s.st.Unlock()

svc := s.serviceByName(c, "test4")
c.Assert(svc.Current, Equals, servstate.StatusInactive)
c.Assert(svc.Current, Equals, servstate.StatusBackoff)
}

func (s *S) TestStartFastExitCommandOnFailureIgnore(c *C) {
IronCore864 marked this conversation as resolved.
Show resolved Hide resolved
s.newServiceManager(c)
var layer = `
services:
test1:
override: replace
command: /bin/sh -c "exit 1"
on-failure: ignore
`
s.planAddLayer(c, layer)
s.planChanged(c)

chg := s.startServices(c, [][]string{{"test1"}})

s.st.Lock()
c.Check(chg.Status(), Equals, state.ErrorStatus)
c.Check(chg.Err(), ErrorMatches, `(?s).*\n- Start service "test1" \(service start attempt: exited quickly with code 1, will ignore\)`)
c.Check(chg.Tasks()[0].Log(), HasLen, 2)
c.Check(chg.Tasks()[0].Log()[0], Matches, `(?s).* INFO Most recent service output:\n `)
c.Check(chg.Tasks()[0].Log()[1], Matches, `.* ERROR service start attempt: exited quickly with code 1, will ignore`)
s.st.Unlock()

svc := s.serviceByName(c, "test1")
c.Assert(svc.Current, Equals, servstate.StatusError)
}

func (s *S) TestServices(c *C) {
Expand Down
26 changes: 0 additions & 26 deletions internals/overlord/servstate/state-diagram.dot

This file was deleted.

Loading
Loading