From a29169ebd62b7e58c8941b5d1db4281e3e4703f3 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Mon, 9 Dec 2024 14:12:38 -0500 Subject: [PATCH 01/84] Add stopping state. --- docs/concepts.rst | 1 + python/apsis/cmdline.py | 2 ++ python/apsis/states.py | 8 +++++--- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/concepts.rst b/docs/concepts.rst index cb45f2c6..bd47bc39 100644 --- a/docs/concepts.rst +++ b/docs/concepts.rst @@ -99,6 +99,7 @@ Each run, once created, is in one of these states: - **waiting**: The run is waiting for a condition to be met. - **starting**: The run is starting. - **running**: The run has started and is currently running. +- **stopping**: Apsis is stopping the run. - **success**: The run has completed successfully. - **failure**: The run has completed unsuccesfully. - **error**: Some other problem has occured with the run. This can include a diff --git a/python/apsis/cmdline.py b/python/apsis/cmdline.py index cf9aaf27..f44e0c9f 100644 --- a/python/apsis/cmdline.py +++ b/python/apsis/cmdline.py @@ -45,6 +45,7 @@ def get_console(): "starting" : Style(color="#767676"), "waiting" : Style(color="#626262"), "running" : Style(color="#af8700"), + "stopping" : Style(color="#767676"), "success" : Style(color="#00875f"), "failure" : Style(color="#af0000"), "error" : Style(color="#af00af"), @@ -57,6 +58,7 @@ def get_console(): "starting" : "›", "waiting" : "|", "running" : "»", + "stopping" : "≯", "success" : "+", "failure" : "X", "error" : "!", diff --git a/python/apsis/states.py b/python/apsis/states.py index bfddc875..cd4133df 100644 --- a/python/apsis/states.py +++ b/python/apsis/states.py @@ -10,6 +10,7 @@ class State(enum.Enum): waiting = enum.auto() starting = enum.auto() running = enum.auto() + stopping = enum.auto() success = enum.auto() failure = enum.auto() error = enum.auto() @@ -51,9 +52,10 @@ def to_states(states): State.waiting : {State.new, State.scheduled}, State.starting : {State.scheduled, State.waiting}, 
State.running : {State.starting}, - State.error : {State.new, State.scheduled, State.waiting, State.starting, State.running, State.skipped}, - State.success : {State.running}, - State.failure : {State.running}, + State.stopping : {State.running}, + State.error : {State.new, State.scheduled, State.waiting, State.starting, State.running, State.stopping, State.skipped}, + State.success : {State.running, State.stopping}, + State.failure : {State.running, State.stopping}, State.skipped : {State.new, State.scheduled, State.waiting}, } From d1da248fbaa3d4aac3e2593404551359aab49b05 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 11 Dec 2024 15:40:06 -0500 Subject: [PATCH 02/84] Add stop config for Job. --- python/apsis/jobs.py | 133 +++++++++++++++++++++++++------------------ python/apsis/stop.py | 121 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+), 54 deletions(-) create mode 100644 python/apsis/stop.py diff --git a/python/apsis/jobs.py b/python/apsis/jobs.py index 6864b427..0cdb341b 100644 --- a/python/apsis/jobs.py +++ b/python/apsis/jobs.py @@ -13,6 +13,7 @@ from .lib.py import tupleize, format_ctor from .program import Program, NoOpProgram from .schedule import Schedule +from .stop import Stop log = logging.getLogger(__name__) @@ -22,7 +23,7 @@ class Job: def __init__( self, job_id, params=[], schedules=[], program=NoOpProgram(), - conds=[], actions=[], *, meta={}, ad_hoc=False + conds=[], actions=[], *, stop=None, meta={}, ad_hoc=False ): """ :param schedules: @@ -39,6 +40,7 @@ def __init__( self.program = program self.conds = tupleize(conds) self.actions = actions + self.stop = stop self.meta = meta self.ad_hoc = bool(ad_hoc) @@ -50,6 +52,7 @@ def __repr__(self): program =self.program, conds =self.conds, actions =self.actions, + stop =self.stop, meta =self.meta, ad_hoc =self.ad_hoc, ) @@ -64,70 +67,92 @@ def __eq__(self, other): and other.program == self.program and other.conds == self.conds and other.actions == self.actions + and 
other.stop == self.stop and other.meta == self.meta ) - -#------------------------------------------------------------------------------- - -def jso_to_job(jso, job_id): - with check_schema(jso) as pop: - # FIXME: job_id here at all? - assert pop("job_id", default=job_id) == job_id, f"JSON job_id mismatch {job_id}" - - params = pop("params", default=[]) - params = [params] if isinstance(params, str) else params - - # FIXME: 'schedules' for backward compatibility; remove in a while. - schedules = pop("schedule", default=()) - schedules = ( - [schedules] if isinstance(schedules, dict) - else [] if schedules is None - else schedules + def to_jso(self): + return { + "job_id" : self.job_id, + "params" : list(sorted(self.params)), + "schedule" : [ s.to_jso() for s in self.schedules ], + "program" : self.program.to_jso(), + "condition" : [ c.to_jso() for c in self.conds ], + "action" : [ a.to_jso() for a in self.actions ], + "stop" : None if self.stop is None else self.stop.to_jso(), + "metadata" : self.meta, + "ad_hoc" : self.ad_hoc, + } + + + @classmethod + def from_jso(cls, jso, job_id): + with check_schema(jso) as pop: + assert pop("job_id", default=job_id) == job_id, \ + f"JSON job_id mismatch {job_id}" + + params = pop("params", default=[]) + params = [params] if isinstance(params, str) else params + + schedules = pop("schedule", default=()) + schedules = ( + [schedules] if isinstance(schedules, dict) + else [] if schedules is None + else schedules + ) + schedules = [ Schedule.from_jso(s) for s in schedules ] + + program = pop("program", Program.from_jso) + + conds = pop("condition", to_array, default=[]) + conds = [ Condition.from_jso(c) for c in conds ] + + acts = pop("action", to_array, default=[]) + acts = [ Action.from_jso(a) for a in acts ] + + # Successors are syntactic sugar for actions. 
+ sucs = pop("successors", to_array, default=[]) + acts.extend([ successor_from_jso(s) for s in sucs ]) + + stop = pop("stop", default=None) + stop = None if stop is None else Stop.from_jso(stop) + + metadata = pop("metadata", default={}) + metadata["labels"] = [ + str(l) + for l in tupleize(metadata.get("labels", [])) + ] + + ad_hoc = pop("ad_hoc", bool, default=False) + + return cls( + job_id, params, schedules, program, + conds =conds, + actions =acts, + stop =stop, + meta =metadata, + ad_hoc =ad_hoc, ) - schedules = [ Schedule.from_jso(s) for s in schedules ] - program = pop("program", Program.from_jso) - conds = pop("condition", to_array, default=[]) - conds = [ Condition.from_jso(c) for c in conds ] - acts = pop("action", to_array, default=[]) - acts = [ Action.from_jso(a) for a in acts ] - - # Successors are syntactic sugar for actions. - sucs = pop("successors", to_array, default=[]) - acts.extend([ successor_from_jso(s) for s in sucs ]) - - metadata = pop("metadata", default={}) - metadata["labels"] = [ - str(l) - for l in tupleize(metadata.get("labels", [])) - ] - - ad_hoc = pop("ad_hoc", bool, default=False) +#------------------------------------------------------------------------------- - return Job( - job_id, params, schedules, program, - conds =conds, - actions =acts, - meta =metadata, - ad_hoc =ad_hoc, - ) +def job_to_jso(job): + """ + :deprecated: + Use `Job.to_jso()`. + """ + return job.to_jso() -def job_to_jso(job): - return { - "job_id" : job.job_id, - "params" : list(sorted(job.params)), - "schedule" : [ s.to_jso() for s in job.schedules ], - "program" : job.program.to_jso(), - "condition" : [ c.to_jso() for c in job.conds ], - "action" : [ a.to_jso() for a in job.actions ], - "metadata" : job.meta, - "ad_hoc" : job.ad_hoc, - } +def jso_to_job(jso, job_id): + """ + :deprecated: + Use `job.to_jso()`. 
+ """ + return Job.from_jso(jso, job_id) def load_yaml(file, job_id): diff --git a/python/apsis/stop.py b/python/apsis/stop.py new file mode 100644 index 00000000..a6b8f240 --- /dev/null +++ b/python/apsis/stop.py @@ -0,0 +1,121 @@ +import asyncio +from signal import Signals + +from apsis.actions import Action +from apsis.cond import Condition +from apsis.lib.json import TypedJso, to_array, check_schema +from apsis.lib.sys import to_signal + +#------------------------------------------------------------------------------- + +class StopMethod(TypedJso): + + TYPE_NAMES = TypedJso.TypeNames() + + async def __call__(self, apsis, run): + raise NotImplementedError("StopMethod.__call__") + + + +class StopSignalMethod: + """ + Stops a program by sending a signal. + + Sends `signal`, waits `timeout` seconds, then sends SIGKILL. + """ + + def __init__(self, signal=Signals.SIGTERM, timeout=60): + self.signal = to_signal(signal) + self.timeout = float(timeout) + assert 0 <= self.timeout + + + def __eq__(self, other): + return other.signal == self.signal and other.timeout == self.timeout + + + def __repr__(self): + return format_ctor(self, signal=self.signal, timeout=self.timeout) + + + def __str__(self): + return f"signal {self.signal.name}" + + + def to_jso(self): + return { + **super().to_jso(), + "signal" : self.signal.name, + "timeout" : self.timeout, + } + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso) as pop: + signal = pop("signal", to_signal, Signal.SIGTERM), + timeout = pop("timeout", float, 60), + return cls(signal, timeout) + + + async def __call__(self, apsis, run): + await apsis.send_signal(run, self.signal) + await asyncio.sleep(self.timeout) + if not run.state.finished: + await asyncio.send_signal(run, Signal.SIGKILL) + + + +#------------------------------------------------------------------------------- + +class Stop: + + def __init__(self, method, schedules=[], conds=[], actions=[]): + self.method = method + self.schedules = schedules + 
self.conds = conds + self.actions = actions + + + def __eq__(self, other): + return ( + other.method == self.method + and other.schedules == self.schedules + and other.conds == self.conds + and other.actions == self.actions + ) + + + def __repr__(self): + return format_ctor( + self, + method =self.method, + schedules =self.schedules, + conds =self.conds, + actions =self.actions, + ) + + + def to_jso(self): + return { + "method" : self.method, + "schedules" : [ s.to_jso() for s in self.schedules ], + "conds" : [ c.to_jso() for c in self.conds ], + "actions" : [ a.to_jso() for a in self.actions ], + } + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso) as pop: + method = pop("method", StopMethod.from_jso) + schedules = pop("schedule", to_array, []) + schedules = [ StopSchedule.from_jso(s) for s in schedules ] + conds = pop("conds", to_array, []) + conds = [ Condition.from_jso(c) for c in conds ] + actions = pop("actions", to_array, []) + actions = [ Action.from_jso(a) for a in actions ] + return cls(method, schedules, conds, actions) + + + From a39f3727e80901e3a6d959c908c5fe5b98d0a232 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 12:26:17 -0500 Subject: [PATCH 03/84] Fix exception. --- python/apsis/schedule/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/apsis/schedule/base.py b/python/apsis/schedule/base.py index 886f3036..7ac30300 100644 --- a/python/apsis/schedule/base.py +++ b/python/apsis/schedule/base.py @@ -103,7 +103,7 @@ def __init__(self, *, enabled=True): def __call__(self, start: ora.Time): - raise NotImplementedError + raise NotImplementedError("Schedule.__call__") From 21cb51604b2351c973600b3a15c062180546883a Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 12:26:40 -0500 Subject: [PATCH 04/84] Stop schedules. 
--- python/apsis/stop.py | 178 ++++++++++++++++++++++++++++++++++++++--- test/unit/test_stop.py | 83 +++++++++++++++++++ 2 files changed, 248 insertions(+), 13 deletions(-) create mode 100644 test/unit/test_stop.py diff --git a/python/apsis/stop.py b/python/apsis/stop.py index a6b8f240..71686d88 100644 --- a/python/apsis/stop.py +++ b/python/apsis/stop.py @@ -1,10 +1,14 @@ import asyncio +import ora from signal import Signals from apsis.actions import Action from apsis.cond import Condition from apsis.lib.json import TypedJso, to_array, check_schema +from apsis.lib.parse import parse_duration +from apsis.lib.py import format_ctor from apsis.lib.sys import to_signal +from apsis.states import State, to_state, reachable #------------------------------------------------------------------------------- @@ -66,23 +70,179 @@ async def __call__(self, apsis, run): +#------------------------------------------------------------------------------- + +class StopSchedule(TypedJso): + + TYPE_NAMES = TypedJso.TypeNames() + + # The schedule is called when the run is running. These are the valid + # states after which to schedule a stop. + AFTERS = [ + "schedule", + "waiting", + "starting", + "running", + ] + + @classmethod + def _get_run_time(cls, run, after): + # Return the time for `after`, falling forward as needed. + for a in cls.AFTERS[cls.AFTERS.index(after) :]: + try: + return ora.Time(run.times[a]) + except KeyError: + pass + else: + raise RuntimeError(f"no {after} time for {run}") + + + def __call__(self, run) -> ora.Time: + """ + Returns the stop time of the run. 
+ """ + raise NotImplementedError("StopSchedule.__call__") + + + +class StopDurationSchedule(StopSchedule): + + def __init__(self, duration, *, after="schedule"): + try: + duration = float(duration) + except (TypeError, ValueError): + duration = parse_duration(duration) + after = str(after) + if after not in self.AFTERS: + names = " ".join( s.name for s in self.AFTERS ) + raise ValueError(f"after must be in {names}") + + self.duration = duration + self.after = after + + + def __eq__(self, other): + return ( + other.duration == self.duration + and other.after == self.after + ) + + + def __repr__(self): + return format_ctor(self, self.duration, after=self.after) + + + def __str__(self): + return f"stop after {self.duration} s after {self.after}" + + + def to_jso(self): + return { + **super().to_jso(), + "duration" : self.duration, + "after" : self.after, + } + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso) as pop: + duration = pop("duration") + after = pop("after", str, "schedule") + return cls(duration, after=after) + + + def __call__(self, run): + time = self._get_run_time(run, self.after) + return time + self.duration + + + +class StopDaytimeSchedule(StopSchedule): + """ + Schedules to stop a run on the next occurrence of a daytime. + """ + + # FIXME: Add date_shift and cal_shift, as DailySchedule. + + def __init__(self, daytime, tz, *, after="schedule"): + """ + Schedules to stop the run on the next occurrence of `daytime` in `tz` + after the transition time for the state `after`. 
+ """ + daytime = ora.Daytime(daytime) + tz = ora.TimeZone(tz) + after = str(after) + if after not in self.AFTERS: + names = " ".join( s.name for s in self.AFTERS ) + raise ValueError(f"after must be in {names}") + + self.daytime = daytime + self.tz = tz + self.after = after + + + def __eq__(self, other): + return ( + other.daytime == self.daytime + and other.tz == self.tz + and other.after == self.after + ) + + + def __repr__(self): + return format_ctor(self, self.daytime, self.tz, after=self.after) + + + def __str__(self): + return f"stop at {self.daytime} {self.tz} after {self.after}" + + + def to_jso(self): + return { + **super().to_jso(), + "daytime" : str(self.daytime), + "tz" : str(self.tz), + "after" : self.after, + } + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso) as pop: + daytime = pop("daytime", ora.Daytime) + tz = pop("tz", ora.TimeZone) + after = pop("after", str, "schedule") + return cls(daytime, tz, after=after) + + + def __call__(self, run): + time = self._get_run_time(run, self.after) + # FIXME: Handle invalid date/daytime pairs. 
+ date, daytime = time @ self.tz + return ( + date if daytime < self.daytime else date + 1, + self.daytime + ) @ self.tz + + + +StopSchedule.TYPE_NAMES.set(StopDurationSchedule, "duration") +StopSchedule.TYPE_NAMES.set(StopDaytimeSchedule, "daytime") + #------------------------------------------------------------------------------- class Stop: - def __init__(self, method, schedules=[], conds=[], actions=[]): + def __init__(self, method, schedules=[]): self.method = method self.schedules = schedules - self.conds = conds - self.actions = actions def __eq__(self, other): return ( other.method == self.method and other.schedules == self.schedules - and other.conds == self.conds - and other.actions == self.actions ) @@ -91,8 +251,6 @@ def __repr__(self): self, method =self.method, schedules =self.schedules, - conds =self.conds, - actions =self.actions, ) @@ -100,8 +258,6 @@ def to_jso(self): return { "method" : self.method, "schedules" : [ s.to_jso() for s in self.schedules ], - "conds" : [ c.to_jso() for c in self.conds ], - "actions" : [ a.to_jso() for a in self.actions ], } @@ -111,10 +267,6 @@ def from_jso(cls, jso): method = pop("method", StopMethod.from_jso) schedules = pop("schedule", to_array, []) schedules = [ StopSchedule.from_jso(s) for s in schedules ] - conds = pop("conds", to_array, []) - conds = [ Condition.from_jso(c) for c in conds ] - actions = pop("actions", to_array, []) - actions = [ Action.from_jso(a) for a in actions ] return cls(method, schedules, conds, actions) diff --git a/test/unit/test_stop.py b/test/unit/test_stop.py new file mode 100644 index 00000000..bddf1e23 --- /dev/null +++ b/test/unit/test_stop.py @@ -0,0 +1,83 @@ +import ora + +from apsis.runs import Instance, Run +from apsis.states import State +from apsis.stop import StopSchedule + +#------------------------------------------------------------------------------- + +RUN = Run(Instance("test job", {})) +RUN.state = State.running +RUN.times.update({ + "scheduled" : 
ora.Time("2024-12-12T23:00:00Z"), + "schedule" : ora.Time("2024-12-13T12:00:00Z"), + "waiting" : ora.Time("2024-12-13T12:00:01Z"), + "starting" : ora.Time("2024-12-13T12:10:00Z"), + "running" : ora.Time("2024-12-13T12:10:05Z"), +}) + +# A run missing some of the `times` entries. +AD_HOC_RUN = Run(Instance("test job", {})) +AD_HOC_RUN.state = State.success +AD_HOC_RUN.times.update({ + "starting" : ora.Time("2024-12-13T12:10:23Z"), + "running" : ora.Time("2024-12-13T12:11:05Z"), +}) + + +def test_stop_duration_schedule(): + sched = StopSchedule.from_jso({ + "type": "duration", + "duration": "1h", + }) + assert sched == StopSchedule.from_jso(sched.to_jso()) + assert "3600" in repr(sched) + assert "3600" in str(sched) + jso = sched.to_jso() + assert jso["duration"] == 3600 + assert sched(RUN) == ora.Time("2024-12-13T13:00:00Z") + assert sched(AD_HOC_RUN) == ora.Time("2024-12-13T13:10:23Z") + + sched = StopSchedule.from_jso({ + "type": "duration", + "duration": "1h", + "after": "running", + }) + assert sched == StopSchedule.from_jso(sched.to_jso()) + jso = sched.to_jso() + assert jso["after"] == "running" + assert sched(RUN) == ora.Time("2024-12-13T13:10:05Z") + assert sched(AD_HOC_RUN) == ora.Time("2024-12-13T13:11:05Z") + + +def test_stop_daytime_schedule(): + sched = StopSchedule.from_jso({ + "type": "daytime", + "daytime": "16:00:00", + "tz": "UTC", + }) + assert sched == StopSchedule.from_jso(sched.to_jso()) + assert "UTC" in repr(sched) + assert "16:00:00" in str(sched) + jso = sched.to_jso() + assert jso["daytime"] == "16:00:00" + assert jso["tz"] == "UTC" + assert sched(RUN) == ora.Time("2024-12-13T16:00:00Z") + assert sched(AD_HOC_RUN) == ora.Time("2024-12-13T16:00:00Z") + + sched = StopSchedule.from_jso({ + "type": "daytime", + "daytime": "16:00:00", + "tz": "Asia/Tokyo", + "after": "waiting", + }) + assert sched == StopSchedule.from_jso(sched.to_jso()) + assert "Asia/Tokyo" in repr(sched) + jso = sched.to_jso() + assert jso["tz"] == "Asia/Tokyo" + assert 
jso["after"] == "waiting" + # Next day in Tokyo. + assert sched(RUN) == ora.Time("2024-12-14T07:00:00Z") + assert sched(AD_HOC_RUN) == ora.Time("2024-12-14T07:00:00Z") + + From 8323f997292fb7342f42fadb90ca7cbdc5ea3e68 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 13:29:04 -0500 Subject: [PATCH 05/84] Refactor Schedule JSO logic. --- python/apsis/schedule/base.py | 11 +++++- python/apsis/schedule/daily.py | 8 ++-- python/apsis/schedule/daily_interval.py | 5 +-- python/apsis/schedule/explicit.py | 5 +-- python/apsis/schedule/interval.py | 5 +-- test/unit/schedule/test_schedule.py | 49 ++++++++++++++++++++++++- 6 files changed, 68 insertions(+), 15 deletions(-) diff --git a/python/apsis/schedule/base.py b/python/apsis/schedule/base.py index 7ac30300..eb038cf8 100644 --- a/python/apsis/schedule/base.py +++ b/python/apsis/schedule/base.py @@ -1,7 +1,7 @@ from dataclasses import dataclass import ora -from apsis.lib.json import TypedJso, check_schema +from apsis.lib.json import TypedJso, check_schema, nkey #------------------------------------------------------------------------------- @@ -102,6 +102,15 @@ def __init__(self, *, enabled=True): self.enabled = bool(enabled) + def to_jso(self): + return super().to_jso() | nkey("enabled", self.enabled) + + + @classmethod + def _from_jso(cls, pop): + return dict(enabled=pop("enabled", bool, default=True)) + + def __call__(self, start: ora.Time): raise NotImplementedError("Schedule.__call__") diff --git a/python/apsis/schedule/daily.py b/python/apsis/schedule/daily.py index 423766d8..8c21e3d9 100644 --- a/python/apsis/schedule/daily.py +++ b/python/apsis/schedule/daily.py @@ -102,9 +102,8 @@ def __call__(self, start: ora.Time): def to_jso(self): return { **super().to_jso(), - "enabled" : self.enabled, "tz" : str(self.tz), - "calendar" : repr(self.calendar), # FIXME + "calendar" : self.calendar.name, # FIXME: Not necessarily round-trip. 
"daytime" : [ str(y) for y in self.daytimes ], "date_shift": self.date_shift, "cal_shift" : self.cal_shift, @@ -115,7 +114,7 @@ def to_jso(self): @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: - enabled = pop("enabled", bool, default=True) + kw_args = Schedule._from_jso(pop) args = pop("args", default={}) tz = pop("tz", ora.TimeZone) calendar = get_calendar(pop("calendar", default="all")) @@ -125,7 +124,8 @@ def from_jso(cls, jso): cal_shift = pop("cal_shift", int, default=0) return cls( tz, calendar, daytimes, args, - enabled=enabled, date_shift=date_shift, cal_shift=cal_shift, + date_shift=date_shift, cal_shift=cal_shift, + **kw_args ) diff --git a/python/apsis/schedule/daily_interval.py b/python/apsis/schedule/daily_interval.py index 29bdbaf0..105ff849 100644 --- a/python/apsis/schedule/daily_interval.py +++ b/python/apsis/schedule/daily_interval.py @@ -107,23 +107,22 @@ def to_jso(self): "stop" : self.stop.to_jso(), "interval" : self.interval, "args" : self.args, - "enabled" : self.enabled, } @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: + kw_args = Schedule._from_jso(pop) tz = pop("tz", ora.TimeZone) calendar = get_calendar(pop("calendar", default="all")) start = DaytimeSpec.from_jso(pop("start")) stop = DaytimeSpec.from_jso(pop("stop")) interval = pop("interval", int) args = pop("args", default={}) - enabled = pop("enabled", bool, default=True) return cls( tz, calendar, start, stop, interval, args, - enabled=enabled, + **kw_args ) diff --git a/python/apsis/schedule/explicit.py b/python/apsis/schedule/explicit.py index 97f47c2f..54be2afe 100644 --- a/python/apsis/schedule/explicit.py +++ b/python/apsis/schedule/explicit.py @@ -33,7 +33,6 @@ def __call__(self, start: ora.Time): def to_jso(self): return { **super().to_jso(), - "enabled" : self.enabled, "times" : [ str(t) for t in self.times ], "args" : self.args, } @@ -42,12 +41,12 @@ def to_jso(self): @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: 
- enabled = pop("enabled", bool, default=True) + kw_args = Schedule._from_jso(pop) times = pop("times") times = times if isinstance(times, list) else [times] times = [ ora.Time(t) for t in times ] args = pop("args", default={}) - return cls(times, args, enabled=enabled) + return cls(times, args, **kw_args) diff --git a/python/apsis/schedule/interval.py b/python/apsis/schedule/interval.py index 32d5dbda..76a17f6d 100644 --- a/python/apsis/schedule/interval.py +++ b/python/apsis/schedule/interval.py @@ -67,7 +67,6 @@ def __call__(self, start: ora.Time): def to_jso(self): return { **super().to_jso(), - "enabled" : self.enabled, "interval" : self.interval, "phase" : self.phase, "args" : self.args, @@ -77,12 +76,12 @@ def to_jso(self): @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: - enabled = pop("enabled", bool, default=True) + kw_args = Schedule._from_jso(pop) interval = pop("interval", parse_duration) phase = pop("phase", parse_duration, 0) assert 0 <= phase < interval, "phase not between 0 and interval" args = pop("args", default={}) - return cls(interval, args, enabled=enabled, phase=phase) + return cls(interval, args, phase=phase, **kw_args) diff --git a/test/unit/schedule/test_schedule.py b/test/unit/schedule/test_schedule.py index 12233abd..cbc42b5f 100644 --- a/test/unit/schedule/test_schedule.py +++ b/test/unit/schedule/test_schedule.py @@ -3,7 +3,7 @@ from ora import Date, Time, Daytime, UTC import pytest -from apsis.schedule import DailySchedule, IntervalSchedule +from apsis.schedule import Schedule, DailySchedule, IntervalSchedule #------------------------------------------------------------------------------- @@ -212,3 +212,50 @@ def test_interval_schedule_eq(): assert s0 != s5 +def test_jso_daily(): + sched = Schedule.from_jso({ + "type": "daily", + "tz": "America/New_York", + "calendar": "Mon-Thu", + "daytime": "12:00:00", + "args": {}, + }) + assert isinstance(sched, DailySchedule) + assert sched.enabled + assert 
list(sched.daytimes) == [Daytime(12, 0, 0)] + + jso = sched.to_jso() + assert jso["tz"] == "America/New_York" + assert jso["calendar"] == "Mon-Thu" + assert jso["args"] == {} + + rt = Schedule.from_jso(jso) + assert isinstance(rt, DailySchedule) + assert rt.enabled + assert list(rt.daytimes) == [Daytime(12, 0, 0)] + + +def test_jso_interval(): + sched = Schedule.from_jso({ + "type": "interval", + "interval": "2h", + "args": {"color": "orange"}, + "enabled": False, + }) + assert isinstance(sched, IntervalSchedule) + assert not sched.enabled + assert sched.interval == 7200 + assert sched.args == {"color": "orange"} + + jso = sched.to_jso() + assert jso["interval"] == 7200 + assert jso["args"] == {"color": "orange"} + assert jso["enabled"] is False + + rt = Schedule.from_jso(jso) + assert isinstance(rt, IntervalSchedule) + assert not rt.enabled + assert rt.interval == 7200 + assert rt.args == {"color": "orange"} + + From 90630322a1e99e9c3da1ecdd573fb5b6dc278b88 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 14:03:20 -0500 Subject: [PATCH 06/84] Just one schedule, no conds or actions. 
--- python/apsis/stop.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/python/apsis/stop.py b/python/apsis/stop.py index 71686d88..db3ac7e7 100644 --- a/python/apsis/stop.py +++ b/python/apsis/stop.py @@ -2,8 +2,6 @@ import ora from signal import Signals -from apsis.actions import Action -from apsis.cond import Condition from apsis.lib.json import TypedJso, to_array, check_schema from apsis.lib.parse import parse_duration from apsis.lib.py import format_ctor @@ -234,30 +232,26 @@ def __call__(self, run): class Stop: - def __init__(self, method, schedules=[]): + def __init__(self, method, schedule): self.method = method - self.schedules = schedules + self.schedule = schedule def __eq__(self, other): return ( other.method == self.method - and other.schedules == self.schedules + and other.schedule == self.shedule ) def __repr__(self): - return format_ctor( - self, - method =self.method, - schedules =self.schedules, - ) + return format_ctor(self, self.method, self.schedule) def to_jso(self): return { - "method" : self.method, - "schedules" : [ s.to_jso() for s in self.schedules ], + "method" : self.method.to_jso(), + "schedule" : self.schedule.to_jso(), } @@ -265,9 +259,8 @@ def to_jso(self): def from_jso(cls, jso): with check_schema(jso) as pop: method = pop("method", StopMethod.from_jso) - schedules = pop("schedule", to_array, []) - schedules = [ StopSchedule.from_jso(s) for s in schedules ] - return cls(method, schedules, conds, actions) + schedule = pop("schedule", StopSchedule.from_jso) + return cls(method, schedule) From 7361780559303ac5fb93c5a74277a9cf14a83ed1 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 14:13:25 -0500 Subject: [PATCH 07/84] Attach StopSchedule to Schedule, but make them siblings in JSO. 
--- python/apsis/jobs.py | 43 +++++++++++++++++++++-------------- python/apsis/lib/json.py | 4 ++++ python/apsis/schedule/base.py | 2 ++ 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/python/apsis/jobs.py b/python/apsis/jobs.py index 0cdb341b..d0e29224 100644 --- a/python/apsis/jobs.py +++ b/python/apsis/jobs.py @@ -9,7 +9,7 @@ from .actions.schedule import successor_from_jso from .cond import Condition from .exc import JobError, JobsDirErrors, SchemaError -from .lib.json import to_array, check_schema +from .lib.json import to_array, to_narray, check_schema from .lib.py import tupleize, format_ctor from .program import Program, NoOpProgram from .schedule import Schedule @@ -40,7 +40,6 @@ def __init__( self.program = program self.conds = tupleize(conds) self.actions = actions - self.stop = stop self.meta = meta self.ad_hoc = bool(ad_hoc) @@ -52,7 +51,6 @@ def __repr__(self): program =self.program, conds =self.conds, actions =self.actions, - stop =self.stop, meta =self.meta, ad_hoc =self.ad_hoc, ) @@ -67,20 +65,28 @@ def __eq__(self, other): and other.program == self.program and other.conds == self.conds and other.actions == self.actions - and other.stop == self.stop and other.meta == self.meta ) def to_jso(self): + def schedule_to_jso(schedule): + jso = schedule.to_jso() + return ( + jso if schedule.stop is None + else { + "start" : jso, + "stop" : schedule.stop.to_jso(), + } + ) + return { "job_id" : self.job_id, "params" : list(sorted(self.params)), - "schedule" : [ s.to_jso() for s in self.schedules ], + "schedule" : [ schedule_to_jso(s) for s in self.schedules ], "program" : self.program.to_jso(), "condition" : [ c.to_jso() for c in self.conds ], "action" : [ a.to_jso() for a in self.actions ], - "stop" : None if self.stop is None else self.stop.to_jso(), "metadata" : self.meta, "ad_hoc" : self.ad_hoc, } @@ -88,6 +94,18 @@ def to_jso(self): @classmethod def from_jso(cls, jso, job_id): + def schedule_from_jso(jso): + if set(jso) in ({"start"}, 
{"start", "stop"}): + # Explicit start schedule, and possibly a stop schedule. + schedule = Schedule.from_jso(jso["start"]) + stop = jso.get("stop", False) + if stop is not None: + schedule.stop = StopSchedule.from_jso(stop) + return schedule + else: + # Only a start schedule. + return Schedule.from_jso(jso) + with check_schema(jso) as pop: assert pop("job_id", default=job_id) == job_id, \ f"JSON job_id mismatch {job_id}" @@ -95,13 +113,8 @@ def from_jso(cls, jso, job_id): params = pop("params", default=[]) params = [params] if isinstance(params, str) else params - schedules = pop("schedule", default=()) - schedules = ( - [schedules] if isinstance(schedules, dict) - else [] if schedules is None - else schedules - ) - schedules = [ Schedule.from_jso(s) for s in schedules ] + schedules = pop("schedule", to_narray, default=()) + schedules = [ schedule_from_jso(s) for s in schedules ] program = pop("program", Program.from_jso) @@ -115,9 +128,6 @@ def from_jso(cls, jso, job_id): sucs = pop("successors", to_array, default=[]) acts.extend([ successor_from_jso(s) for s in sucs ]) - stop = pop("stop", default=None) - stop = None if stop is None else Stop.from_jso(stop) - metadata = pop("metadata", default={}) metadata["labels"] = [ str(l) @@ -130,7 +140,6 @@ def from_jso(cls, jso, job_id): job_id, params, schedules, program, conds =conds, actions =acts, - stop =stop, meta =metadata, ad_hoc =ad_hoc, ) diff --git a/python/apsis/lib/json.py b/python/apsis/lib/json.py index c667f559..e3629294 100644 --- a/python/apsis/lib/json.py +++ b/python/apsis/lib/json.py @@ -12,6 +12,10 @@ def to_array(obj): return obj if isinstance(obj, list) else [obj] +def to_narray(obj): + return [] if obj is None else to_array(obj) + + @contextlib.contextmanager def check_schema(jso): """ diff --git a/python/apsis/schedule/base.py b/python/apsis/schedule/base.py index eb038cf8..334516ac 100644 --- a/python/apsis/schedule/base.py +++ b/python/apsis/schedule/base.py @@ -95,11 +95,13 @@ def 
from_jso(cls, jso): #------------------------------------------------------------------------------- class Schedule(TypedJso): + # Note: `stop` is not included in the JSO representation. TYPE_NAMES = TypedJso.TypeNames() def __init__(self, *, enabled=True): self.enabled = bool(enabled) + self.stop = None def to_jso(self): From e8e9afdb44e3d42fd376ed1619578a89dc88bc5d Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 14:40:58 -0500 Subject: [PATCH 08/84] Notes. --- notes/notes.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/notes/notes.md b/notes/notes.md index 968f138c..25b2d410 100644 --- a/notes/notes.md +++ b/notes/notes.md @@ -1,3 +1,41 @@ +# Scheduled stop + +NO: +```yaml +schedule: + start: + type: interval + interval: 1h + + stop: + method: + type: signal + schedule: + type: duration + duration: 30m +``` + +THIS IS CORRECT: +```yaml +program: + start: + type: agent + argv: ["/path/to/my/service", "--foreground"] + stop: + type: signal + signal: SIGTERM + +schedule: + start: + type: interval + interval: 1h + stop: + type: duration + duration: 30m +``` + + + # SQLite Performance test of appending to string fields, in `work/sqlite-concat.py`. From 5130ebf210eb8469185d72c779a4e8a1e5b800db Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 14:41:29 -0500 Subject: [PATCH 09/84] Rename to stop_schedule. 
--- python/apsis/jobs.py | 12 ++++++------ python/apsis/schedule/base.py | 4 ++-- python/apsis/schedule/daily_interval.py | 2 +- python/apsis/stop.py | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/apsis/jobs.py b/python/apsis/jobs.py index d0e29224..cd5386d0 100644 --- a/python/apsis/jobs.py +++ b/python/apsis/jobs.py @@ -13,7 +13,7 @@ from .lib.py import tupleize, format_ctor from .program import Program, NoOpProgram from .schedule import Schedule -from .stop import Stop +from .stop import StopSchedule log = logging.getLogger(__name__) @@ -73,10 +73,10 @@ def to_jso(self): def schedule_to_jso(schedule): jso = schedule.to_jso() return ( - jso if schedule.stop is None + jso if schedule.stop_schedule is None else { "start" : jso, - "stop" : schedule.stop.to_jso(), + "stop" : schedule.stop_schedule.to_jso(), } ) @@ -98,9 +98,9 @@ def schedule_from_jso(jso): if set(jso) in ({"start"}, {"start", "stop"}): # Explicit start schedule, and possibly a stop schedule. schedule = Schedule.from_jso(jso["start"]) - stop = jso.get("stop", False) - if stop is not None: - schedule.stop = StopSchedule.from_jso(stop) + stop_jso = jso.get("stop", None) + if stop_jso is not None: + schedule.stop_schedule = StopSchedule.from_jso(stop_jso) return schedule else: # Only a start schedule. diff --git a/python/apsis/schedule/base.py b/python/apsis/schedule/base.py index 334516ac..5a662afe 100644 --- a/python/apsis/schedule/base.py +++ b/python/apsis/schedule/base.py @@ -95,13 +95,13 @@ def from_jso(cls, jso): #------------------------------------------------------------------------------- class Schedule(TypedJso): - # Note: `stop` is not included in the JSO representation. + # Note: `stop_schedule` is not included in the JSO representation. 
TYPE_NAMES = TypedJso.TypeNames() def __init__(self, *, enabled=True): self.enabled = bool(enabled) - self.stop = None + self.stop_schedule = None def to_jso(self): diff --git a/python/apsis/schedule/daily_interval.py b/python/apsis/schedule/daily_interval.py index 105ff849..9b0a6833 100644 --- a/python/apsis/schedule/daily_interval.py +++ b/python/apsis/schedule/daily_interval.py @@ -118,7 +118,7 @@ def from_jso(cls, jso): calendar = get_calendar(pop("calendar", default="all")) start = DaytimeSpec.from_jso(pop("start")) stop = DaytimeSpec.from_jso(pop("stop")) - interval = pop("interval", int) + interval = pop("interval", parse_duration) args = pop("args", default={}) return cls( tz, calendar, start, stop, interval, args, diff --git a/python/apsis/stop.py b/python/apsis/stop.py index db3ac7e7..44a1fb51 100644 --- a/python/apsis/stop.py +++ b/python/apsis/stop.py @@ -103,7 +103,7 @@ def __call__(self, run) -> ora.Time: -class StopDurationSchedule(StopSchedule): +class DurationStopSchedule(StopSchedule): def __init__(self, duration, *, after="schedule"): try: @@ -156,7 +156,7 @@ def __call__(self, run): -class StopDaytimeSchedule(StopSchedule): +class DaytimeStopSchedule(StopSchedule): """ Schedules to stop a run on the next occurrence of a daytime. """ @@ -225,8 +225,8 @@ def __call__(self, run): -StopSchedule.TYPE_NAMES.set(StopDurationSchedule, "duration") -StopSchedule.TYPE_NAMES.set(StopDaytimeSchedule, "daytime") +StopSchedule.TYPE_NAMES.set(DurationStopSchedule, "duration") +StopSchedule.TYPE_NAMES.set(DaytimeStopSchedule, "daytime") #------------------------------------------------------------------------------- From b93d8aa9789534b34aab433f067025cf131cd220 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 14:43:20 -0500 Subject: [PATCH 10/84] Test. 
--- test/unit/test_job.py | 84 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 test/unit/test_job.py diff --git a/test/unit/test_job.py b/test/unit/test_job.py new file mode 100644 index 00000000..53e84e07 --- /dev/null +++ b/test/unit/test_job.py @@ -0,0 +1,84 @@ +import ora + +from apsis.jobs import Job +from apsis.stop import DurationStopSchedule + +#------------------------------------------------------------------------------- + +def test_schedule_jso_start(): + job_jso = { + "program": {"type": "no-op"}, + "schedule": { + "type" : "daily-interval", + "start" : "09:30:00", + "stop" : "16:00:00", + "tz" : "America/New_York", + "interval" : "30m", + } + } + job = Job.from_jso(job_jso, "test job") + sched, = job.schedules + assert sched.start.daytime == ora.Daytime(9, 30, 0) + assert sched.stop.daytime == ora.Daytime(16, 0, 0) + assert sched.interval == 1800 + assert sched.enabled + assert sched.stop_schedule is None + jso = job.to_jso() + assert isinstance(jso["schedule"], list) + assert len(jso["schedule"]) == 1 + assert jso["schedule"][0]["interval"] == 1800 + + # This produces a job identical to the above. + job_jso = { + "program": {"type": "no-op"}, + "schedule": { + "start": { + "type" : "daily-interval", + "start" : "09:30:00", + "stop" : "16:00:00", + "tz" : "America/New_York", + "interval" : "30m", + }, + } + } + job = Job.from_jso(job_jso, "test job") + sched, = job.schedules + assert sched.interval == 1800 + assert sched.enabled + assert sched.stop_schedule is None + jso = job.to_jso() + assert isinstance(jso["schedule"], list) + assert len(jso["schedule"]) == 1 + assert jso["schedule"][0]["interval"] == 1800 + + +def test_schedule_jso_start_stop(): + # Includes a stop schedule. 
+ job_jso = { + "program": {"type": "no-op"}, + "schedule": { + "start": { + "type" : "daily-interval", + "start" : "09:30:00", + "stop" : "16:00:00", + "tz" : "America/New_York", + "interval" : "30m", + }, + "stop": { + "type": "duration", + "duration": "15m", + }, + } + } + job = Job.from_jso(job_jso, "test job") + sched, = job.schedules + assert sched.interval == 1800 + assert sched.enabled + assert sched.stop_schedule == DurationStopSchedule(900) + jso = job.to_jso() + jso, = jso["schedule"] # one-element list + assert set(jso.keys()) == {"start", "stop"} + assert jso["start"]["interval"] == 1800 + assert jso["stop"]["duration"] == 900 + + From 25b5c57e1f2015d8cdee164520aa60a268961d20 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 14:44:59 -0500 Subject: [PATCH 11/84] Get rid of Stop class. --- python/apsis/stop.py | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/python/apsis/stop.py b/python/apsis/stop.py index 44a1fb51..4e47d2ae 100644 --- a/python/apsis/stop.py +++ b/python/apsis/stop.py @@ -228,39 +228,3 @@ def __call__(self, run): StopSchedule.TYPE_NAMES.set(DurationStopSchedule, "duration") StopSchedule.TYPE_NAMES.set(DaytimeStopSchedule, "daytime") -#------------------------------------------------------------------------------- - -class Stop: - - def __init__(self, method, schedule): - self.method = method - self.schedule = schedule - - - def __eq__(self, other): - return ( - other.method == self.method - and other.schedule == self.shedule - ) - - - def __repr__(self): - return format_ctor(self, self.method, self.schedule) - - - def to_jso(self): - return { - "method" : self.method.to_jso(), - "schedule" : self.schedule.to_jso(), - } - - - @classmethod - def from_jso(cls, jso): - with check_schema(jso) as pop: - method = pop("method", StopMethod.from_jso) - schedule = pop("schedule", StopSchedule.from_jso) - return cls(method, schedule) - - - From 5cc2af9e7c9a6cfc0e2afe094afcabe8d1e9f825 Mon Sep 
17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 15:31:13 -0500 Subject: [PATCH 12/84] Simplify stop schedule. --- python/apsis/jobs.py | 25 +--------- python/apsis/lib/api.py | 13 ++++- python/apsis/schedule/__init__.py | 2 +- python/apsis/schedule/base.py | 25 ++++++++++ python/apsis/stop.py | 82 +++++++------------------------ test/unit/test_stop.py | 44 ++--------------- 6 files changed, 61 insertions(+), 130 deletions(-) diff --git a/python/apsis/jobs.py b/python/apsis/jobs.py index cd5386d0..92382354 100644 --- a/python/apsis/jobs.py +++ b/python/apsis/jobs.py @@ -12,8 +12,7 @@ from .lib.json import to_array, to_narray, check_schema from .lib.py import tupleize, format_ctor from .program import Program, NoOpProgram -from .schedule import Schedule -from .stop import StopSchedule +from .schedule import Schedule, schedule_to_jso, schedule_from_jso log = logging.getLogger(__name__) @@ -70,16 +69,6 @@ def __eq__(self, other): def to_jso(self): - def schedule_to_jso(schedule): - jso = schedule.to_jso() - return ( - jso if schedule.stop_schedule is None - else { - "start" : jso, - "stop" : schedule.stop_schedule.to_jso(), - } - ) - return { "job_id" : self.job_id, "params" : list(sorted(self.params)), @@ -94,18 +83,6 @@ def schedule_to_jso(schedule): @classmethod def from_jso(cls, jso, job_id): - def schedule_from_jso(jso): - if set(jso) in ({"start"}, {"start", "stop"}): - # Explicit start schedule, and possibly a stop schedule. - schedule = Schedule.from_jso(jso["start"]) - stop_jso = jso.get("stop", None) - if stop_jso is not None: - schedule.stop_schedule = StopSchedule.from_jso(stop_jso) - return schedule - else: - # Only a start schedule. 
- return Schedule.from_jso(jso) - with check_schema(jso) as pop: assert pop("job_id", default=job_id) == job_id, \ f"JSON job_id mismatch {job_id}" diff --git a/python/apsis/lib/api.py b/python/apsis/lib/api.py index ec274511..8dcbf5ed 100644 --- a/python/apsis/lib/api.py +++ b/python/apsis/lib/api.py @@ -5,6 +5,7 @@ import zlib from apsis.cond.dependency import Dependency +from apsis.schedule import schedule_to_jso log = logging.getLogger(__name__) @@ -95,10 +96,18 @@ def _to_jsos(objs): def job_to_jso(job): + def sched_to_jso(s): + jso = schedule_to_jso(s) + jso["str"] = ( + str(s) if s.stop_schedule is None + else f"{s}, {s.stop_schedule}" + ) + return jso + return { "job_id" : job.job_id, "params" : list(sorted(job.params)), - "schedule" : [ _to_jso(s) for s in job.schedules ], + "schedule" : [ sched_to_jso(s) for s in job.schedules ], "program" : _to_jso(job.program), "condition" : [ _to_jso(c) for c in job.conds ], "action" : [ _to_jso(a) for a in job.actions ], @@ -123,6 +132,8 @@ def run_to_summary_jso(run): } if run.expected: jso["expected"] = run.expected + if run.stop_time: + jso["stop_time"] = str(run.stop_time) if run.conds is not None: deps = [ diff --git a/python/apsis/schedule/__init__.py b/python/apsis/schedule/__init__.py index 7cd71bb6..dfee52fc 100644 --- a/python/apsis/schedule/__init__.py +++ b/python/apsis/schedule/__init__.py @@ -1,4 +1,4 @@ -from .base import Schedule, DaytimeSpec +from .base import Schedule, DaytimeSpec, schedule_to_jso, schedule_from_jso from .daily import DailySchedule from .daily_interval import DailyIntervalSchedule from .explicit import ExplicitSchedule diff --git a/python/apsis/schedule/base.py b/python/apsis/schedule/base.py index 5a662afe..a19d510c 100644 --- a/python/apsis/schedule/base.py +++ b/python/apsis/schedule/base.py @@ -2,6 +2,7 @@ import ora from apsis.lib.json import TypedJso, check_schema, nkey +from apsis.stop import StopSchedule 
#------------------------------------------------------------------------------- @@ -118,3 +119,27 @@ def __call__(self, start: ora.Time): +def schedule_to_jso(schedule): + jso = schedule.to_jso() + return ( + jso if schedule.stop_schedule is None + else { + "start" : jso, + "stop" : schedule.stop_schedule.to_jso(), + } + ) + + +def schedule_from_jso(jso): + if set(jso) in ({"start"}, {"start", "stop"}): + # Explicit start schedule, and possibly a stop schedule. + schedule = Schedule.from_jso(jso["start"]) + stop_jso = jso.get("stop", None) + if stop_jso is not None: + schedule.stop_schedule = StopSchedule.from_jso(stop_jso) + return schedule + else: + # Only a start schedule. + return Schedule.from_jso(jso) + + diff --git a/python/apsis/stop.py b/python/apsis/stop.py index 4e47d2ae..4eff6a79 100644 --- a/python/apsis/stop.py +++ b/python/apsis/stop.py @@ -74,71 +74,37 @@ class StopSchedule(TypedJso): TYPE_NAMES = TypedJso.TypeNames() - # The schedule is called when the run is running. These are the valid - # states after which to schedule a stop. - AFTERS = [ - "schedule", - "waiting", - "starting", - "running", - ] - - @classmethod - def _get_run_time(cls, run, after): - # Return the time for `after`, falling forward as needed. - for a in cls.AFTERS[cls.AFTERS.index(after) :]: - try: - return ora.Time(run.times[a]) - except KeyError: - pass - else: - raise RuntimeError(f"no {after} time for {run}") - - - def __call__(self, run) -> ora.Time: - """ - Returns the stop time of the run. 
- """ + def __call__(self, schedule_time) -> ora.Time: raise NotImplementedError("StopSchedule.__call__") class DurationStopSchedule(StopSchedule): - def __init__(self, duration, *, after="schedule"): + def __init__(self, duration): try: duration = float(duration) except (TypeError, ValueError): duration = parse_duration(duration) - after = str(after) - if after not in self.AFTERS: - names = " ".join( s.name for s in self.AFTERS ) - raise ValueError(f"after must be in {names}") - self.duration = duration - self.after = after def __eq__(self, other): - return ( - other.duration == self.duration - and other.after == self.after - ) + return other.duration == self.duration def __repr__(self): - return format_ctor(self, self.duration, after=self.after) + return format_ctor(self, self.duration) def __str__(self): - return f"stop after {self.duration} s after {self.after}" + return f"stop {self.duration} s after schedule time" def to_jso(self): return { **super().to_jso(), "duration" : self.duration, - "after" : self.after, } @@ -146,13 +112,11 @@ def to_jso(self): def from_jso(cls, jso): with check_schema(jso) as pop: duration = pop("duration") - after = pop("after", str, "schedule") - return cls(duration, after=after) + return cls(duration) - def __call__(self, run): - time = self._get_run_time(run, self.after) - return time + self.duration + def __call__(self, schedule_time): + return schedule_time + self.duration @@ -163,37 +127,28 @@ class DaytimeStopSchedule(StopSchedule): # FIXME: Add date_shift and cal_shift, as DailySchedule. - def __init__(self, daytime, tz, *, after="schedule"): + def __init__(self, daytime, tz): """ Schedules to stop the run on the next occurrence of `daytime` in `tz` - after the transition time for the state `after`. + after the schedule time. 
""" - daytime = ora.Daytime(daytime) - tz = ora.TimeZone(tz) - after = str(after) - if after not in self.AFTERS: - names = " ".join( s.name for s in self.AFTERS ) - raise ValueError(f"after must be in {names}") - - self.daytime = daytime - self.tz = tz - self.after = after + self.daytime = ora.Daytime(daytime) + self.tz = ora.TimeZone(tz) def __eq__(self, other): return ( other.daytime == self.daytime and other.tz == self.tz - and other.after == self.after ) def __repr__(self): - return format_ctor(self, self.daytime, self.tz, after=self.after) + return format_ctor(self, self.daytime, self.tz) def __str__(self): - return f"stop at {self.daytime} {self.tz} after {self.after}" + return f"stop at {self.daytime} {self.tz} after schedule time" def to_jso(self): @@ -201,7 +156,6 @@ def to_jso(self): **super().to_jso(), "daytime" : str(self.daytime), "tz" : str(self.tz), - "after" : self.after, } @@ -210,14 +164,12 @@ def from_jso(cls, jso): with check_schema(jso) as pop: daytime = pop("daytime", ora.Daytime) tz = pop("tz", ora.TimeZone) - after = pop("after", str, "schedule") - return cls(daytime, tz, after=after) + return cls(daytime, tz) - def __call__(self, run): - time = self._get_run_time(run, self.after) + def __call__(self, schedule_time): # FIXME: Handle invalid date/daytime pairs. 
- date, daytime = time @ self.tz + date, daytime = schedule_time @ self.tz return ( date if daytime < self.daytime else date + 1, self.daytime diff --git a/test/unit/test_stop.py b/test/unit/test_stop.py index bddf1e23..e8af8790 100644 --- a/test/unit/test_stop.py +++ b/test/unit/test_stop.py @@ -1,30 +1,10 @@ import ora -from apsis.runs import Instance, Run from apsis.states import State from apsis.stop import StopSchedule #------------------------------------------------------------------------------- -RUN = Run(Instance("test job", {})) -RUN.state = State.running -RUN.times.update({ - "scheduled" : ora.Time("2024-12-12T23:00:00Z"), - "schedule" : ora.Time("2024-12-13T12:00:00Z"), - "waiting" : ora.Time("2024-12-13T12:00:01Z"), - "starting" : ora.Time("2024-12-13T12:10:00Z"), - "running" : ora.Time("2024-12-13T12:10:05Z"), -}) - -# A run missing some of the `times` entries. -AD_HOC_RUN = Run(Instance("test job", {})) -AD_HOC_RUN.state = State.success -AD_HOC_RUN.times.update({ - "starting" : ora.Time("2024-12-13T12:10:23Z"), - "running" : ora.Time("2024-12-13T12:11:05Z"), -}) - - def test_stop_duration_schedule(): sched = StopSchedule.from_jso({ "type": "duration", @@ -35,19 +15,8 @@ def test_stop_duration_schedule(): assert "3600" in str(sched) jso = sched.to_jso() assert jso["duration"] == 3600 - assert sched(RUN) == ora.Time("2024-12-13T13:00:00Z") - assert sched(AD_HOC_RUN) == ora.Time("2024-12-13T13:10:23Z") - - sched = StopSchedule.from_jso({ - "type": "duration", - "duration": "1h", - "after": "running", - }) - assert sched == StopSchedule.from_jso(sched.to_jso()) - jso = sched.to_jso() - assert jso["after"] == "running" - assert sched(RUN) == ora.Time("2024-12-13T13:10:05Z") - assert sched(AD_HOC_RUN) == ora.Time("2024-12-13T13:11:05Z") + schedule_time = ora.Time("2024-12-13T12:00:00Z") + assert sched(schedule_time) == ora.Time("2024-12-13T13:00:00Z") def test_stop_daytime_schedule(): @@ -62,22 +31,19 @@ def test_stop_daytime_schedule(): jso = 
sched.to_jso() assert jso["daytime"] == "16:00:00" assert jso["tz"] == "UTC" - assert sched(RUN) == ora.Time("2024-12-13T16:00:00Z") - assert sched(AD_HOC_RUN) == ora.Time("2024-12-13T16:00:00Z") + schedule_time = ora.Time("2024-12-13T12:00:00Z") + assert sched(schedule_time) == ora.Time("2024-12-13T16:00:00Z") sched = StopSchedule.from_jso({ "type": "daytime", "daytime": "16:00:00", "tz": "Asia/Tokyo", - "after": "waiting", }) assert sched == StopSchedule.from_jso(sched.to_jso()) assert "Asia/Tokyo" in repr(sched) jso = sched.to_jso() assert jso["tz"] == "Asia/Tokyo" - assert jso["after"] == "waiting" # Next day in Tokyo. - assert sched(RUN) == ora.Time("2024-12-14T07:00:00Z") - assert sched(AD_HOC_RUN) == ora.Time("2024-12-14T07:00:00Z") + assert sched(schedule_time) == ora.Time("2024-12-14T07:00:00Z") From ec8fd4e90bd6602f3ee6093c910551def28516bb Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 15:31:23 -0500 Subject: [PATCH 13/84] We always go through the scheduled state. --- python/apsis/states.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/apsis/states.py b/python/apsis/states.py index cd4133df..f7cbf68c 100644 --- a/python/apsis/states.py +++ b/python/apsis/states.py @@ -49,7 +49,7 @@ def to_states(states): TRANSITIONS = { State.new : set(), State.scheduled : {State.new}, - State.waiting : {State.new, State.scheduled}, + State.waiting : {State.scheduled}, State.starting : {State.scheduled, State.waiting}, State.running : {State.starting}, State.stopping : {State.running}, From 9a4439bfc33a1c03a75514959c24024067ff51c1 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 15:55:15 -0500 Subject: [PATCH 14/84] Set stop time in run.times. 
--- python/apsis/apsis.py | 24 ++++++++++++++++-------- python/apsis/check.py | 4 ++-- python/apsis/lib/api.py | 2 -- python/apsis/scheduler.py | 13 +++++++++---- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index 8a8e4142..ea097555 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -447,7 +447,7 @@ def _propagate_args(self, old_args, inst): # FIXME: Move the API elsewhere. - async def schedule(self, time, inst, *, expected=False): + async def schedule(self, time, inst, *, expected=False, stop_time=None): """ Creates and schedules a new run. @@ -459,7 +459,11 @@ async def schedule(self, time, inst, *, expected=False): :return: The run, either scheduled or error. """ - time = None if time is None else Time(time) + if time is None: + times = {"schedule": now()} + else: + time = Time(time) + times = {"schedule": time} # Create the run and add it to the run store, which assigns it a run ID # and persists it. @@ -471,6 +475,9 @@ async def schedule(self, time, inst, *, expected=False): job = self.jobs.get_job(run.inst.job_id) validate_args(run, job.params) bind(run, job, self.jobs) + # Add the stop time, if any. + if stop_time is not None: + times["stop"] = stop_time # Attach job labels to the run. run.meta["job"] = { "labels": job.meta.get("labels", []), @@ -479,15 +486,16 @@ async def schedule(self, time, inst, *, expected=False): self._run_exc(run, message=str(exc)) return run + # Transition to scheduled. + msg = f"scheduled: {'now' if time is None else time}" + self.run_log.record(run, msg) + self._transition(run, State.scheduled, times=times) + if time is None: - # Transition to scheduled and immediately to wait. - self.run_log.record(run, "scheduled: now") - self._transition(run, State.scheduled, times={"schedule": now()}) + # Transition immediately to wait. self._wait(run) else: # Schedule for the future. 
- self.run_log.record(run, f"scheduled: {time}") - self._transition(run, State.scheduled, times={"schedule": time}) await self.scheduled.schedule(time, run) return run @@ -572,7 +580,7 @@ async def rerun(self, run, *, time=None): """ # Create the new run. log.info(f"rerun: {run.run_id} at {time or 'now'}") - new_run = await self.schedule(time, run.inst) + new_run = await self.schedule(time, run.inst, stop_time=run.stop_time) self.run_log.info(new_run, f"scheduled as rerun of {run.run_id}") return new_run diff --git a/python/apsis/check.py b/python/apsis/check.py index 1bdd658a..f1eb71c1 100644 --- a/python/apsis/check.py +++ b/python/apsis/check.py @@ -119,7 +119,7 @@ def check_job_dependencies_scheduled( # Construct all instances that will be scheduled soon. insts = get_insts_to_schedule(job, sched_start, sched_stop) # Check each of scheduled instance. - for _, inst in insts: + for _, _, inst in insts: run = Run(inst) # Check each dependency. for dep in deps: @@ -131,7 +131,7 @@ def check_job_dependencies_scheduled( dep_job = jobs_dir.get_job(dep.job_id) if not any( i.args == dep.args - for _, i in get_insts_to_schedule( + for _, _, i in get_insts_to_schedule( dep_job, dep_start, dep_stop ) diff --git a/python/apsis/lib/api.py b/python/apsis/lib/api.py index 8dcbf5ed..34a2b9da 100644 --- a/python/apsis/lib/api.py +++ b/python/apsis/lib/api.py @@ -132,8 +132,6 @@ def run_to_summary_jso(run): } if run.expected: jso["expected"] = run.expected - if run.stop_time: - jso["stop_time"] = str(run.stop_time) if run.conds is not None: deps = [ diff --git a/python/apsis/scheduler.py b/python/apsis/scheduler.py index d3a73986..a9666e4f 100644 --- a/python/apsis/scheduler.py +++ b/python/apsis/scheduler.py @@ -15,7 +15,7 @@ def get_insts_to_schedule(job, start, stop): Builds runs to schedule for `job` between `start` and `stop`. :return: - Iterable of (time, inst). + Iterable of (sched_time, stop_time, inst). 
""" for schedule in job.schedules: times = itertools.takewhile(lambda t: t[0] < stop, schedule(start)) @@ -31,7 +31,11 @@ def get_insts_to_schedule(job, start, stop): # Runs instantiated by the scheduler are only expected; the job # schedule may change before the run is started. # FIXME: Store additional args for later expansion. - yield sched_time, Instance(job.job_id, args) + stop_time = ( + None if schedule.stop_schedule is None + else schedule.stop_schedule(sched_time) + ) + yield sched_time, stop_time, Instance(job.job_id, args) class Scheduler: @@ -97,8 +101,9 @@ async def schedule(self, stop): log.debug(f"scheduling runs until {stop}") for job in self.__jobs.get_jobs(): - for time, inst in get_insts_to_schedule(job, self.__stop, stop): - await self.__schedule(time, inst) + items = get_insts_to_schedule(job, self.__stop, stop) + for sched_time, stop_time, inst in items: + await self.__schedule(sched_time, inst, stop_time=stop_time) self.__stop = stop From 10489797f6bbb1956dfc28b4a5612c0a32b6b3c5 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 16:11:26 -0500 Subject: [PATCH 15/84] Update design notes. --- notes/notes.md | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/notes/notes.md b/notes/notes.md index 25b2d410..ad22440c 100644 --- a/notes/notes.md +++ b/notes/notes.md @@ -1,20 +1,5 @@ # Scheduled stop -NO: -```yaml -schedule: - start: - type: interval - interval: 1h - - stop: - method: - type: signal - schedule: - type: duration - duration: 30m -``` - THIS IS CORRECT: ```yaml program: From ba0d364827ee8343f81cc46cd9625ebbdb74419f Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 16:11:37 -0500 Subject: [PATCH 16/84] Todo. 
--- notes/todo.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/notes/todo.md b/notes/todo.md index 408de895..c2ec5b44 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -54,14 +54,22 @@ Transient connection (single run): - [x] move compression out of programs - [x] roll in Procstar agent changes to `/summary` - [ ] clean up API endpoints we don't need anymore -- [ ] roll in, or get rid of, log socket +- [x] roll in, or get rid of, log socket - [x] add live endpoints to `Client` - [x] live updates in CLUI - [x] output - [x] run log - [x] improve and clean up `State` enum +- [ ] scheduled stop + - [ ] add stop time to run view + - [ ] add stop time to runs table + - [ ] add stop method to program + - [ ] refactor `apsis.stop` + - [ ] actually stop the program at the stop time + - [ ] update `_process_updates` to set the state for a stopping job - [ ] if `send_signal` raises, error the run + - [ ] improve `apsis job` output style From 83fdd4b89f6ff235ec8a17ccea9005e3348483c6 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 16:11:45 -0500 Subject: [PATCH 17/84] Fix. --- python/apsis/apsis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index ea097555..f3931696 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -931,8 +931,8 @@ async def reschedule_runs(apsis, job_id): # and the scheduler time. job = apsis.jobs.get_job(job_id) schedule = list(get_insts_to_schedule(job, scheduled_time, scheduler_time)) - for time, inst in schedule: - await apsis.schedule(time, inst, expected=True) + for time, stop_time, inst in schedule: + await apsis.schedule(time, inst, expected=True, stop_time=stop_time) async def reload_jobs(apsis, *, dry_run=False): From cb05b2ac51b9eb85a43c63feddee685f7340cc0c Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 13 Dec 2024 16:11:49 -0500 Subject: [PATCH 18/84] Set up to use stop time. 
--- python/apsis/apsis.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index f3931696..859e5e9f 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -819,6 +819,14 @@ async def _process_updates(apsis, run, updates): assert run.state == State.running + try: + stop_time = run.times["stop"] + except KeyError: + pass + else: + apsis.run_log.record(run, f"stop time: {stop_time}") + # FIXME: Handle stop time. + while run.state == State.running: update = await anext(updates) match update: From de98cfe80d55799d0217dec4b7167ea6aa989f71 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 15 Dec 2024 10:01:07 -0500 Subject: [PATCH 19/84] Add FIXME. --- python/apsis/service/api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/apsis/service/api.py b/python/apsis/service/api.py index 99f32844..1aba5d4f 100644 --- a/python/apsis/service/api.py +++ b/python/apsis/service/api.py @@ -537,6 +537,7 @@ async def websocket_summary(request, ws): @API.route("/runs", methods={"POST"}) async def run_post(request): + # FIXME: Add a way to specify the stop time. query = parse_query(request.query_string) try: count = int(query["count"]) From f45b537327ac08af4c729b174ed2fdb17116ea2f Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Mon, 16 Dec 2024 23:40:53 -0500 Subject: [PATCH 20/84] Provisional stop logic. --- python/apsis/apsis.py | 27 +++++++++++++++++++++++++-- test/manual/procstar/jobs/stop0.yaml | 15 +++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 test/manual/procstar/jobs/stop0.yaml diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index f89572cb..2869bd2b 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -792,6 +792,7 @@ async def _process_updates(apsis, run, updates): """ Processes program `updates` for `run` until the program is finished. 
""" + run_id = run.run_id updates = aiter(updates) try: @@ -822,13 +823,31 @@ async def _process_updates(apsis, run, updates): assert run.state == State.running + # Does this run have a scheduled stop time? try: stop_time = run.times["stop"] + except KeyError: - pass + stop_task = None + else: apsis.run_log.record(run, f"stop time: {stop_time}") - # FIXME: Handle stop time. + + # Start a task to stop the run at the scheduled time. + async def stop(): + sleep = stop_time - now() + log.debug(f"{run_id}: sleeping {sleep} s until stop") + await asyncio.sleep(sleep) + log.debug(f"{run_id}: stopping") + + # FIXME: Generalize to program.stop. + if not run.state.finished: + await apsis.send_signal(run, to_signal("SIGTERM")) + await asyncio.sleep(30) + if not run.state.finished: + await apsis.send_signal(run, to_signal("SIGKILL")) + + stop_task = asyncio.create_task(stop()) while run.state == State.running: update = await anext(updates) @@ -883,6 +902,10 @@ async def _process_updates(apsis, run, updates): assert False, f"unexpected update: {update}" else: + # Cancel the stop task. + if stop_task is not None: + stop_task.cancel() + # Exhaust the async iterator, so that cleanup can run. try: update = await anext(updates) diff --git a/test/manual/procstar/jobs/stop0.yaml b/test/manual/procstar/jobs/stop0.yaml new file mode 100644 index 00000000..f7755be7 --- /dev/null +++ b/test/manual/procstar/jobs/stop0.yaml @@ -0,0 +1,15 @@ +params: [date] + +schedule: + start: + type: daily + tz: America/New_York + daytime: 23:38:00 + stop: + type: duration + duration: 20m + +program: + type: procstar-shell + command: sleep 30 + From 7fbbc1b4b9df794f449a504f022fc21124b30f21 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Mon, 16 Dec 2024 23:41:26 -0500 Subject: [PATCH 21/84] Update .gitignore. 
--- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 218823f6..46941179 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ *.o .pytest_cache Makefile.local -apsis.db +apsis.db* apsis.log archive*.db *-journal From ca714c2d14a61df30cd7060c64803720526a702a Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Mon, 16 Dec 2024 23:47:39 -0500 Subject: [PATCH 22/84] Clean up logic. --- python/apsis/scheduler.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/python/apsis/scheduler.py b/python/apsis/scheduler.py index a9666e4f..e4060c4b 100644 --- a/python/apsis/scheduler.py +++ b/python/apsis/scheduler.py @@ -18,23 +18,20 @@ def get_insts_to_schedule(job, start, stop): Iterable of (sched_time, stop_time, inst). """ for schedule in job.schedules: - times = itertools.takewhile(lambda t: t[0] < stop, schedule(start)) - - for sched_time, args in times: - args = {**args, "schedule_time": sched_time} - args = { - a: str(v) - for a, v in args.items() - if a in job.params - } - if schedule.enabled: - # Runs instantiated by the scheduler are only expected; the job - # schedule may change before the run is started. - # FIXME: Store additional args for later expansion. + if schedule.enabled: + times = itertools.takewhile(lambda t: t[0] < stop, schedule(start)) + for sched_time, args in times: + args = {**args, "schedule_time": sched_time} + args = { + a: str(v) + for a, v in args.items() + if a in job.params + } stop_time = ( None if schedule.stop_schedule is None else schedule.stop_schedule(sched_time) ) + # FIXME: Store additional args for later expansion. yield sched_time, stop_time, Instance(job.job_id, args) From 27f4640e0bbbef597ec67415fcc341ffc89425b3 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Mon, 16 Dec 2024 23:47:45 -0500 Subject: [PATCH 23/84] Change stop time recording in run log. 
--- python/apsis/apsis.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index 2869bd2b..440b78e1 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -489,6 +489,9 @@ async def schedule(self, time, inst, *, expected=False, stop_time=None): # Transition to scheduled. msg = f"scheduled: {'now' if time is None else time}" self.run_log.record(run, msg) + if stop_time is not None: + self.run_log.record(run, f"stop time: {stop_time}") + self._transition(run, State.scheduled, times=times) if time is None: @@ -831,8 +834,6 @@ async def _process_updates(apsis, run, updates): stop_task = None else: - apsis.run_log.record(run, f"stop time: {stop_time}") - # Start a task to stop the run at the scheduled time. async def stop(): sleep = stop_time - now() From 2e0506d9ac015898ab21ae84e3fa85569b4bf197 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 11:46:45 -0500 Subject: [PATCH 24/84] Update design. --- notes/notes.md | 7 +++---- notes/todo.md | 38 +++----------------------------------- 2 files changed, 6 insertions(+), 39 deletions(-) diff --git a/notes/notes.md b/notes/notes.md index ad22440c..94cdddba 100644 --- a/notes/notes.md +++ b/notes/notes.md @@ -3,12 +3,11 @@ THIS IS CORRECT: ```yaml program: - start: - type: agent - argv: ["/path/to/my/service", "--foreground"] + type: agent + argv: ["/path/to/my/service", "--foreground"] stop: - type: signal signal: SIGTERM + kill_after: 1m schedule: start: diff --git a/notes/todo.md b/notes/todo.md index c2ec5b44..7d3f3f3f 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -1,24 +1,3 @@ -# Live updates - -WebSocket API for web UI: -- run updates for all runs (as current) -- run log updates for a single run -- run data updates for a single run -- agent changes: connect, disconnect, timeout -- job changes - -Persistent connection: -- run changes (run summaries only) -- job changes -- Procstar connection changes -- Apsis log (if we keep 
this) - -Transient connection (single run): -- run log updates -- run output data updates -- run metadata updates - - ### The Plan - [x] design internal run publisher protocol @@ -62,28 +41,17 @@ Transient connection (single run): - [x] improve and clean up `State` enum - [ ] scheduled stop - - [ ] add stop time to run view - - [ ] add stop time to runs table - [ ] add stop method to program - - [ ] refactor `apsis.stop` - [ ] actually stop the program at the stop time + - [ ] add stop time to run view + - [ ] add stop time to runs table (NO?) + - [ ] refactor `apsis.stop` - [ ] update `_process_updates` to set the state for a stopping job - [ ] if `send_signal` raises, error the run - [ ] improve `apsis job` output style -# Exantium tasks? - -- internals presentation -- reliable integration tests -- update Python deps, especially Sanic -- update JS deps, especially Vue3 -- database schema cleanup - - move metadata into its own table; remove `meta` from `Apsis._transition()` -- code cleanup, reorg, documentation - - # Cleanup ### Schema From 14ec55d13477dd842a2a0599eb3b07d2f014b9d8 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 11:47:21 -0500 Subject: [PATCH 25/84] Add stop() to Program API. --- python/apsis/program/base.py | 12 ++++++++++++ python/apsis/program/process.py | 7 ++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/python/apsis/program/base.py b/python/apsis/program/base.py index 5666ae2f..8386d9bd 100644 --- a/python/apsis/program/base.py +++ b/python/apsis/program/base.py @@ -219,6 +219,7 @@ async def signal(self, run_id, signum: str): :param signum: Signal name or number. """ + raise NotImplementedError("program signal not implemented") @classmethod @@ -287,6 +288,13 @@ async def connect(self, run_id, run_state, cfg): yield success + async def stop(self): + """ + Instructs the running program to stop. 
+ """ + raise NotImplementedError("program stop not implemented") + + #------------------------------------------------------------------------------- @@ -313,4 +321,8 @@ async def signal(self, run_id, signum: str): pass + async def stop(self): + pass + + diff --git a/python/apsis/program/process.py b/python/apsis/program/process.py index 8ba1a6f2..0fcbfd78 100644 --- a/python/apsis/program/process.py +++ b/python/apsis/program/process.py @@ -96,12 +96,13 @@ async def wait(self, run_id, proc): raise ProgramFailure(message, meta=meta, outputs=outputs) - async def signal(self, run_id, signum: str): - # FIXME - raise NotImplementedError() + # FIXME: Implement signal(). + # FIXME: Implement stop(). +#------------------------------------------------------------------------------- + class ShellCommandProgram(ProcessProgram): def __init__(self, command): From 5e9cca1cd574ffc28e0dd498aa5d6c95af6328c6 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 11:47:52 -0500 Subject: [PATCH 26/84] stop() for no-op program. --- python/apsis/program/noop.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/apsis/program/noop.py b/python/apsis/program/noop.py index 49cda5d9..9f6237fd 100644 --- a/python/apsis/program/noop.py +++ b/python/apsis/program/noop.py @@ -23,6 +23,8 @@ class NoOpProgram(Program): def __init__(self, *, duration=0, success=True): self.__duration = nstr(duration) self.__success = None if success is None else bool(success) + # For signaling stop. + self.__stop_queue = asyncio.Event() def __str__(self): @@ -60,7 +62,13 @@ async def start(self, run_id, cfg): async def wait(self, run_id, run_state): if self.__duration is not None: duration = parse_duration(self.__duration) - await asyncio.sleep(duration) + try: + await asyncio.wait_for(self.__stop_queue.wait(), duration) + except asyncio.TimeoutError: + # OK, duration expired. 
+ pass + else: + raise ProgramError("program stopped") if self.__success is True: return ProgramSuccess() elif self.__success is False: @@ -77,4 +85,8 @@ async def signal(self, run_state, signum): log.info("ignoring signal to no-op program") + async def stop(self): + self.__stop_queue.set() + + From eecf760589b60a8923a7c1ca582a02dc8781556a Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 11:48:13 -0500 Subject: [PATCH 27/84] Use Program.stop() for scheduled stop. --- python/apsis/apsis.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index 440b78e1..2b92f5b5 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -245,7 +245,7 @@ def _start(self, run): self if isinstance(run.program, _InternalProgram) else self.cfg, ) # Start a task to process updates from the program. - run_task = _process_updates(self, run, updates) + run_task = _process_updates(self, run, updates, run.program) self.__run_tasks.add(run.run_id, run_task) @@ -265,7 +265,7 @@ def __reconnect(self, run): self if isinstance(run.program, _InternalProgram) else self.cfg, ) # Start a task to process updates from the program. - run_task = _process_updates(self, run, updates) + run_task = _process_updates(self, run, updates, run.program) self.__run_tasks.add(run.run_id, run_task) @@ -791,7 +791,7 @@ async def _cmpr(output): return dict(zip(outputs.keys(), o)) -async def _process_updates(apsis, run, updates): +async def _process_updates(apsis, run, updates, program): """ Processes program `updates` for `run` until the program is finished. """ @@ -829,24 +829,16 @@ async def _process_updates(apsis, run, updates): # Does this run have a scheduled stop time? try: stop_time = run.times["stop"] - except KeyError: stop_task = None - else: # Start a task to stop the run at the scheduled time. 
async def stop(): - sleep = stop_time - now() - log.debug(f"{run_id}: sleeping {sleep} s until stop") - await asyncio.sleep(sleep) + duration = stop_time - now() + log.debug(f"{run_id}: running for {duration} s until stop") + await asyncio.sleep(duration) log.debug(f"{run_id}: stopping") - - # FIXME: Generalize to program.stop. - if not run.state.finished: - await apsis.send_signal(run, to_signal("SIGTERM")) - await asyncio.sleep(30) - if not run.state.finished: - await apsis.send_signal(run, to_signal("SIGKILL")) + await program.stop() stop_task = asyncio.create_task(stop()) From fde2309b7f6753f6234571b70702c1da984ec1f4 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 11:50:39 -0500 Subject: [PATCH 28/84] Add stop_time to schedule API. --- python/apsis/cli.py | 6 +++- python/apsis/service/api.py | 22 ++++++++++-- python/apsis/service/client.py | 66 ++++++++++++++++++---------------- 3 files changed, 59 insertions(+), 35 deletions(-) diff --git a/python/apsis/cli.py b/python/apsis/cli.py index 23c661e9..a3610d29 100644 --- a/python/apsis/cli.py +++ b/python/apsis/cli.py @@ -272,7 +272,8 @@ def cmd_schedule(client, args): args.job_id, dict(args.args), args.time, - count=args.count, + count =args.count, + stop_time =args.stop_time, ) for run in runs: apsis.cmdline.print_run(run, con) @@ -289,6 +290,9 @@ def parse_arg(arg): cmd.add_argument( "--count", metavar="NUM", type=int, default=1, help="schedule NUM runs [def: 1]") + cmd.add_argument( + "--stop-time", metavar="TIME", default=None, + help="schedule program stop at TIME [time or duration]") cmd.add_argument( "time", metavar="TIME", type=apsis.cmdline.parse_at_time, help="time to run [time, daytime, 'now']") diff --git a/python/apsis/service/api.py b/python/apsis/service/api.py index 1aba5d4f..d06f8366 100644 --- a/python/apsis/service/api.py +++ b/python/apsis/service/api.py @@ -17,6 +17,7 @@ output_metadata_to_jso, run_log_to_jso, output_to_http_message ) import apsis.lib.itr +from 
apsis.lib.parse import parse_duration from apsis.lib.sys import to_signal from apsis.states import to_state from ..jobs import jso_to_job @@ -564,10 +565,25 @@ async def run_post(request): args = jso.get("args", {}) inst = Instance(job_id, args) - time = jso.get("times", {}).get("schedule", "now") - time = None if time == "now" else ora.Time(time) + times = jso.get("times", {}) + time = times.get("schedule", "now") + time = ora.now() if time == "now" else ora.Time(time) - runs = ( apsis.schedule(time, inst) for _ in range(count) ) + stop_time = times.get("stop", None) + if stop_time is not None: + # Either an absolute time or a duration ahead of schedule time. + try: + stop_time = ora.Time(stop_time) + except ValueError: + try: + stop_time = time + parse_duration(stop_time) + except ValueError: + raise ValueError(f"invalid stop time: {stop_time}") + + runs = ( + apsis.schedule(time, inst, stop_time=stop_time) + for _ in range(count) + ) runs = await asyncio.gather(*runs) jso = runs_to_jso(request.app, ora.now(), runs) return response_json(jso) diff --git a/python/apsis/service/client.py b/python/apsis/service/client.py index b9b0bc6d..d4feab58 100644 --- a/python/apsis/service/client.py +++ b/python/apsis/service/client.py @@ -13,6 +13,7 @@ import websockets.client import apsis.service +from apsis.lib.json import nkey #------------------------------------------------------------------------------- @@ -312,44 +313,40 @@ def rerun(self, run_id): return run - def schedule(self, job_id, args, time="now", *, count=None): - """ - Creates and schedules a new run. 
- """ - job_id = str(job_id) - args = { str(k): str(v) for k, v in args.items() } - time = "now" if time == "now" else str(Time(time)) - - data = { - "job_id": job_id, - "args": args, - "times": { - "schedule": time, + def __schedule(self, time, job_spec, *, count=None, stop_time=None): + time = "now" if time == "now" else str(Time(time)) + stop_time = None if stop_time is None else str(stop_time) + params = { + "data": job_spec | { + "times": { + "schedule": time, + } | nkey("stop", stop_time) } } - runs = self.__post("/api/v1/runs", data=data, count=count)["runs"] + runs = self.__post("/api/v1/runs", **params)["runs"] # FIXME: Hacky. return next(iter(runs.values())) if count is None else runs.values() - def __schedule(self, time, job, count): - time = "now" if time == "now" else str(Time(time)) - data = { - "job": job, - "times": { - "schedule": time, + def schedule(self, job_id, args, time="now", **kw_args): + """ + Creates and schedules a new run. + """ + return self.__schedule( + time, + { + "job_id": str(job_id), + "args" : { str(k): str(v) for k, v in args.items() }, }, - } - runs = self.__post("/api/v1/runs", data=data, count=count)["runs"] - # FIXME: Hacky. - return next(iter(runs.values())) if count is None else runs.values() + **kw_args + ) - def schedule_adhoc(self, time, job, *, count=None): - return self.__schedule(time, job, count) + def schedule_adhoc(self, time, job, **kw_args): + return self.__schedule(time, {"job": job}, **kw_args) - def schedule_program(self, time, args, *, count=None): + def schedule_program(self, time, args, **kw_args): """ :param time: The schedule time, or "now" for immediate. @@ -357,18 +354,25 @@ def schedule_program(self, time, args, *, count=None): The argument vector. The first item is the path to the program to run. 
""" - args = [ str(a) for a in args ] - return self.__schedule(time, {"program": args}, count) + return self.__schedule( + time, + {"job": {"program": [ str(a) for a in args ]}}, + **kw_args + ) - def schedule_shell_program(self, time, command, *, count=None): + def schedule_shell_program(self, time, command, **kw_args): """ :param time: The schedule time, or "now" for immediate. :param command: The shell command to run. """ - return self.__schedule(time, {"program": str(command)}, count) + return self.__schedule( + time, + {"job": {"program": str(command)}}, + **kw_args + ) def reload_jobs(self, *, dry_run=False): From b3ad244842426598b6ebd243f5ad8c9b2ea64b81 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 11:51:51 -0500 Subject: [PATCH 29/84] Todo. --- notes/todo.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/notes/todo.md b/notes/todo.md index 7d3f3f3f..22e48471 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -41,8 +41,12 @@ - [x] improve and clean up `State` enum - [ ] scheduled stop - - [ ] add stop method to program - - [ ] actually stop the program at the stop time + - [x] add stop method to program + - [x] actually stop the program at the stop time + - [ ] Procstar stop method + - [ ] ProcessProgram stop method + - [ ] classig agent program stop method + - [ ] "stop" operation - [ ] add stop time to run view - [ ] add stop time to runs table (NO?) - [ ] refactor `apsis.stop` From 09c47c7e609e3fcf84a22fe1b5135b2437057362 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 12:22:22 -0500 Subject: [PATCH 30/84] Todo. 
--- notes/todo.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/notes/todo.md b/notes/todo.md index 22e48471..e6566482 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -43,13 +43,15 @@ - [ ] scheduled stop - [x] add stop method to program - [x] actually stop the program at the stop time + - [ ] go through _stopping_ state - [ ] Procstar stop method - [ ] ProcessProgram stop method + - distinguish between "service"-type programs - [ ] classig agent program stop method - [ ] "stop" operation - [ ] add stop time to run view - [ ] add stop time to runs table (NO?) - - [ ] refactor `apsis.stop` + - [ ] refactor `apsis.stop` module - [ ] update `_process_updates` to set the state for a stopping job - [ ] if `send_signal` raises, error the run From de6e3cfedfc3e186228be3ca57639a14b33373f3 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 15:14:54 -0500 Subject: [PATCH 31/84] Move _process_updates into its own module. --- python/apsis/apsis.py | 155 +----------------------------------- python/apsis/running.py | 171 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+), 154 deletions(-) create mode 100644 python/apsis/running.py diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index 2b92f5b5..c93df3a4 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -16,16 +16,15 @@ from .jobs import Jobs, load_jobs_dir, diff_jobs_dirs from .lib.api import run_to_summary_jso from .lib.asyn import TaskGroup, Publisher, KeyPublisher -from .lib.cmpr import compress_async from .lib.py import more_gc_stats from .lib.sys import to_signal from .output import OutputStore from .program.base import _InternalProgram from .program.base import Output, OutputMetadata -from .program.base import ProgramRunning, ProgramError, ProgramFailure, ProgramSuccess, ProgramUpdate from . 
import runs from .run_log import RunLog from .run_snapshot import snapshot_run +from .running import _process_updates from .runs import Run, RunStore, RunError, MissingArgumentError, ExtraArgumentError from .runs import validate_args, bind from .scheduled import ScheduledRuns @@ -770,158 +769,6 @@ def wait_with_timeout(cond_wait): apsis._start(run) -async def _maybe_compress(outputs, *, compression="br", min_size=16384): - """ - Compresses final outputs, if needed. - """ - async def _cmpr(output): - if output.compression is None and output.metadata.length >= min_size: - # Compress the output. - try: - compressed = await compress_async(output.data, compression) - except RuntimeError as exc: - log.error(f"{exc}; not compressiong") - return output - else: - return Output(output.metadata, compressed, compression) - else: - return output - - o = await asyncio.gather(*( _cmpr(o) for o in outputs.values() )) - return dict(zip(outputs.keys(), o)) - - -async def _process_updates(apsis, run, updates, program): - """ - Processes program `updates` for `run` until the program is finished. - """ - run_id = run.run_id - updates = aiter(updates) - - try: - if run.state == State.starting: - update = await anext(updates) - match update: - case ProgramRunning() as running: - apsis.run_log.record(run, "running") - apsis._transition( - run, State.running, - run_state =running.run_state, - meta ={"program": running.meta}, - times =running.times, - ) - - case ProgramError() as error: - apsis.run_log.info(run, f"error: {error.message}") - apsis._update_output_data(run, error.outputs, persist=True) - apsis._transition( - run, State.error, - meta ={"program": error.meta}, - times =error.times, - ) - return - - case _ as update: - assert False, f"unexpected update: {update}" - - assert run.state == State.running - - # Does this run have a scheduled stop time? 
- try: - stop_time = run.times["stop"] - except KeyError: - stop_task = None - else: - # Start a task to stop the run at the scheduled time. - async def stop(): - duration = stop_time - now() - log.debug(f"{run_id}: running for {duration} s until stop") - await asyncio.sleep(duration) - log.debug(f"{run_id}: stopping") - await program.stop() - - stop_task = asyncio.create_task(stop()) - - while run.state == State.running: - update = await anext(updates) - match update: - case ProgramUpdate() as update: - if update.outputs is not None: - apsis._update_output_data(run, update.outputs, False) - if update.meta is not None: - apsis._update_metadata(run, {"program": update.meta}) - - case ProgramSuccess() as success: - apsis.run_log.record(run, "success") - apsis._update_output_data( - run, - await _maybe_compress(success.outputs), - True - ) - apsis._transition( - run, State.success, - meta ={"program": success.meta}, - times =success.times, - ) - - case ProgramFailure() as failure: - # Program ran and failed. - apsis.run_log.record(run, f"failure: {failure.message}") - apsis._update_output_data( - run, - await _maybe_compress(failure.outputs), - True - ) - apsis._transition( - run, State.failure, - meta ={"program": failure.meta}, - times =failure.times, - ) - - case ProgramError() as error: - apsis.run_log.info(run, f"error: {error.message}") - apsis._update_output_data( - run, - await _maybe_compress(error.outputs), - True - ) - apsis._transition( - run, State.error, - meta ={"program": error.meta}, - times =error.times, - ) - - case _ as update: - assert False, f"unexpected update: {update}" - - else: - # Cancel the stop task. - if stop_task is not None: - stop_task.cancel() - - # Exhaust the async iterator, so that cleanup can run. - try: - update = await anext(updates) - except StopAsyncIteration: - # Expected. 
- pass - else: - assert False, f"unexpected update: {update}" - - except (asyncio.CancelledError, StopAsyncIteration): - # We do not transition the run here. The run can survive an Apsis - # restart and we can connect to it later. - pass - - except Exception: - # Program raised some other exception. - apsis.run_log.exc(run, "error: internal") - tb = traceback.format_exc().encode() - output = Output(OutputMetadata("traceback", length=len(tb)), tb) - apsis._update_output_data(run, {"outputs": output}, True) - apsis._transition(run, State.error) - - def _unschedule_runs(apsis, job_id): """ Deletes all scheduled expected runs of `job_id`. diff --git a/python/apsis/running.py b/python/apsis/running.py new file mode 100644 index 00000000..a343ee18 --- /dev/null +++ b/python/apsis/running.py @@ -0,0 +1,171 @@ +""" +Managing runs in the _running_ state. +""" + +import asyncio +import logging +from ora import now +import traceback + +from apsis.lib.cmpr import compress_async +from apsis.program.base import ( + Output, OutputMetadata, + ProgramRunning, ProgramError, ProgramFailure, ProgramSuccess, ProgramUpdate) +from apsis.states import State + +log = logging.getLogger(__name__) + +#------------------------------------------------------------------------------- + +async def _maybe_compress(outputs, *, compression="br", min_size=16384): + """ + Compresses final outputs, if needed. + """ + async def _cmpr(output): + if output.compression is None and output.metadata.length >= min_size: + # Compress the output. 
+ try: + compressed = await compress_async(output.data, compression) + except RuntimeError as exc: + log.error(f"{exc}; not compressiong") + return output + else: + return Output(output.metadata, compressed, compression) + else: + return output + + o = await asyncio.gather(*( _cmpr(o) for o in outputs.values() )) + return dict(zip(outputs.keys(), o)) + + +async def _process_updates(apsis, run, updates, program): + """ + Processes program `updates` for `run` until the program is finished. + """ + run_id = run.run_id + updates = aiter(updates) + + try: + if run.state == State.starting: + update = await anext(updates) + match update: + case ProgramRunning() as running: + apsis.run_log.record(run, "running") + apsis._transition( + run, State.running, + run_state =running.run_state, + meta ={"program": running.meta}, + times =running.times, + ) + + case ProgramError() as error: + apsis.run_log.info(run, f"error: {error.message}") + apsis._update_output_data(run, error.outputs, persist=True) + apsis._transition( + run, State.error, + meta ={"program": error.meta}, + times =error.times, + ) + return + + case _ as update: + assert False, f"unexpected update: {update}" + + assert run.state == State.running + + # Does this run have a scheduled stop time? + try: + stop_time = run.times["stop"] + except KeyError: + stop_task = None + else: + # Start a task to stop the run at the scheduled time. 
+ async def stop(): + duration = stop_time - now() + log.debug(f"{run_id}: running for {duration} s until stop") + await asyncio.sleep(duration) + log.debug(f"{run_id}: stopping") + await program.stop() + + stop_task = asyncio.create_task(stop()) + + while run.state == State.running: + update = await anext(updates) + match update: + case ProgramUpdate() as update: + if update.outputs is not None: + apsis._update_output_data(run, update.outputs, False) + if update.meta is not None: + apsis._update_metadata(run, {"program": update.meta}) + + case ProgramSuccess() as success: + apsis.run_log.record(run, "success") + apsis._update_output_data( + run, + await _maybe_compress(success.outputs), + True + ) + apsis._transition( + run, State.success, + meta ={"program": success.meta}, + times =success.times, + ) + + case ProgramFailure() as failure: + # Program ran and failed. + apsis.run_log.record(run, f"failure: {failure.message}") + apsis._update_output_data( + run, + await _maybe_compress(failure.outputs), + True + ) + apsis._transition( + run, State.failure, + meta ={"program": failure.meta}, + times =failure.times, + ) + + case ProgramError() as error: + apsis.run_log.info(run, f"error: {error.message}") + apsis._update_output_data( + run, + await _maybe_compress(error.outputs), + True + ) + apsis._transition( + run, State.error, + meta ={"program": error.meta}, + times =error.times, + ) + + case _ as update: + assert False, f"unexpected update: {update}" + + else: + # Cancel the stop task. + if stop_task is not None: + stop_task.cancel() + + # Exhaust the async iterator, so that cleanup can run. + try: + update = await anext(updates) + except StopAsyncIteration: + # Expected. + pass + else: + assert False, f"unexpected update: {update}" + + except (asyncio.CancelledError, StopAsyncIteration): + # We do not transition the run here. The run can survive an Apsis + # restart and we can connect to it later. 
+ pass + + except Exception: + # Program raised some other exception. + apsis.run_log.exc(run, "error: internal") + tb = traceback.format_exc().encode() + output = Output(OutputMetadata("traceback", length=len(tb)), tb) + apsis._update_output_data(run, {"outputs": output}, True) + apsis._transition(run, State.error) + + From a6e57dd0c1ec4d5cba352748404bf684acfe6eb3 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 15:24:30 -0500 Subject: [PATCH 32/84] Fix. --- python/apsis/apsis.py | 4 ++++ python/apsis/service/api.py | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index c93df3a4..4465a201 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -455,9 +455,13 @@ async def schedule(self, time, inst, *, expected=False, stop_time=None): :param time: The schedule time at which to run the run. If `None`, the run is run now, instead of scheduled. + :param stop_time: + If not none, time at which to stop the program. :return: The run, either scheduled or error. 
""" + if time == "now": + time = None if time is None: times = {"schedule": now()} else: diff --git a/python/apsis/service/api.py b/python/apsis/service/api.py index d06f8366..4ca4c371 100644 --- a/python/apsis/service/api.py +++ b/python/apsis/service/api.py @@ -567,7 +567,7 @@ async def run_post(request): times = jso.get("times", {}) time = times.get("schedule", "now") - time = ora.now() if time == "now" else ora.Time(time) + time = "now" if time == "now" else ora.Time(time) stop_time = times.get("stop", None) if stop_time is not None: @@ -576,9 +576,11 @@ async def run_post(request): stop_time = ora.Time(stop_time) except ValueError: try: - stop_time = time + parse_duration(stop_time) + duration = parse_duration(stop_time) except ValueError: raise ValueError(f"invalid stop time: {stop_time}") + else: + stop_time = (ora.now() if time == "now" else time) + duration runs = ( apsis.schedule(time, inst, stop_time=stop_time) From 0082f77b0aec88a32d96caf7b57a240fdff3c5b7 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 15:51:49 -0500 Subject: [PATCH 33/84] Fix. --- python/apsis/program/base.py | 4 ++-- python/apsis/program/noop.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/apsis/program/base.py b/python/apsis/program/base.py index 8386d9bd..b852be10 100644 --- a/python/apsis/program/base.py +++ b/python/apsis/program/base.py @@ -210,13 +210,13 @@ def reconnect(self, run_id, run_state): """ - async def signal(self, run_id, signum: str): + async def signal(self, run_id, run_state, signal): """ Sends a signal to the running program. :param run_id: The run ID; used for logging only. - :param signum: + :param signal: Signal name or number. 
""" raise NotImplementedError("program signal not implemented") diff --git a/python/apsis/program/noop.py b/python/apsis/program/noop.py index 9f6237fd..c384a4aa 100644 --- a/python/apsis/program/noop.py +++ b/python/apsis/program/noop.py @@ -81,7 +81,7 @@ def reconnect(self, run_id, run_state): return asyncio.ensure_future(self.wait(run_id, run_state)) - async def signal(self, run_state, signum): + async def signal(self, run_id, run_state, signal): log.info("ignoring signal to no-op program") From 9f594052b695dfb9347e775493ccce0e0293132b Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 15:51:57 -0500 Subject: [PATCH 34/84] Allow signal while stopping. --- python/apsis/apsis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index 4465a201..3b11eac6 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -597,7 +597,7 @@ async def send_signal(self, run, signal): `run` is not running. """ signal = to_signal(signal) - if run.state != State.running: + if run.state not in (State.running, State.stopping): raise RuntimeError(f"invalid run state for signal: {run.state.name}") assert run.program is not None From 4c07602f0769eb019f3713c9c606b5315d8b1b2f Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 15:52:19 -0500 Subject: [PATCH 35/84] Transition to stopping when stopping. --- python/apsis/running.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/apsis/running.py b/python/apsis/running.py index a343ee18..75712aa0 100644 --- a/python/apsis/running.py +++ b/python/apsis/running.py @@ -81,15 +81,21 @@ async def _process_updates(apsis, run, updates, program): else: # Start a task to stop the run at the scheduled time. async def stop(): + # Wait until the stop time. duration = stop_time - now() log.debug(f"{run_id}: running for {duration} s until stop") await asyncio.sleep(duration) + # Transition to stopping. 
+ apsis.run_log.record(run, "stopping") + apsis._transition(run, State.stopping) + # Ask the run to stop. log.debug(f"{run_id}: stopping") await program.stop() + # The main update loop handles updates in response. stop_task = asyncio.create_task(stop()) - while run.state == State.running: + while run.state in (State.running, State.stopping): update = await anext(updates) match update: case ProgramUpdate() as update: From 4b2a17711de85a1441aa9b8b658e395a419530db Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 15:52:30 -0500 Subject: [PATCH 36/84] Front end changes for stopping state. --- vue/src/components/RunElapsed.vue | 2 +- vue/src/components/State.vue | 4 ++++ vue/src/runs.js | 5 ++++- vue/src/views/RunView.vue | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/vue/src/components/RunElapsed.vue b/vue/src/components/RunElapsed.vue index a3781eea..771c0eec 100644 --- a/vue/src/components/RunElapsed.vue +++ b/vue/src/components/RunElapsed.vue @@ -5,7 +5,7 @@ diff --git a/vue/src/components/State.vue b/vue/src/components/State.vue index 2a3528e1..4f789e15 100644 --- a/vue/src/components/State.vue +++ b/vue/src/components/State.vue @@ -19,6 +19,10 @@ circle(cx="900" cy="900" r="800" fill="#a0b040") path(d="M 650 450 V 1350 L 1350 900 L 650 450" fill="#ffffff") + svg(v-else-if="state === 'stopping'" viewBox="0 0 1800 1800", xmlns="http://www.w3.org/2000/svg" width="18px") + circle(cx="900" cy="900" r="800" fill="#b07040") + path(d="M 450 650 H 1350 L 900 1350 L 450 650" fill="#ffffff") + svg(v-else-if="state === 'error'" viewBox="0 0 1800 1800", xmlns="http://www.w3.org/2000/svg" width="18px") circle(cx="900" cy="900" r="800" fill="#ff0060") path(d="M 900 400 V 1050 M 900 1350 V 1400" stroke="#ffffff" stroke-width="200" stroke-linecap="round" fill="transparent") diff --git a/vue/src/runs.js b/vue/src/runs.js index 008e6390..50ed97ac 100644 --- a/vue/src/runs.js +++ b/vue/src/runs.js @@ -10,6 +10,7 @@ const RUN_STATE_GROUPS = { 
'waiting': 'R', 'starting': 'R', 'running': 'R', + 'stopping': 'R', 'success': 'C', 'failure': 'C', 'error': 'C', @@ -135,6 +136,7 @@ export const STATES = [ 'waiting', 'starting', 'running', + 'stopping', 'success', 'failure', 'skipped', @@ -146,7 +148,8 @@ export const OPERATIONS = { 'scheduled' : ['start', 'skip'], 'waiting' : ['start', 'skip'], 'starting' : [], - 'running' : ['terminate', 'kill'], + 'running' : ['terminate', 'kill', 'stop'], + 'stopping' : ['terminate', 'kill'], 'success' : ['rerun', 'mark failure', 'mark skipped', 'mark error'], 'failure' : ['rerun', 'mark success', 'mark skipped', 'mark error'], 'skipped' : ['rerun', 'mark success', 'mark failure', 'mark error'], diff --git a/vue/src/views/RunView.vue b/vue/src/views/RunView.vue index 20f1db0c..5cfd54b3 100644 --- a/vue/src/views/RunView.vue +++ b/vue/src/views/RunView.vue @@ -176,7 +176,7 @@ export default { hasOutput() { const state = this.runState - return state === 'running' || isComplete(state) + return state === 'running' || state === 'stopping' || isComplete(state) }, metadata() { From 459e52dc5d16c2a518d2dcb95140ef4aad70b59e Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 16:33:59 -0500 Subject: [PATCH 37/84] Fixes. --- python/apsis/program/base.py | 4 +++- python/apsis/running.py | 8 ++++---- python/apsis/service/client.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/apsis/program/base.py b/python/apsis/program/base.py index b852be10..6d45be54 100644 --- a/python/apsis/program/base.py +++ b/python/apsis/program/base.py @@ -266,6 +266,8 @@ async def run(self, run_id, cfg): yield success + # FIXME: Remove `run_id` from API. The program should store this in + # `run_state`, if it needs it. async def connect(self, run_id, run_state, cfg): """ Connects to the running program specified by `run_state`. 
@@ -288,7 +290,7 @@ async def connect(self, run_id, run_state, cfg): yield success - async def stop(self): + async def stop(self, run_state): """ Instructs the running program to stop. """ diff --git a/python/apsis/running.py b/python/apsis/running.py index 75712aa0..aa50ba71 100644 --- a/python/apsis/running.py +++ b/python/apsis/running.py @@ -87,10 +87,9 @@ async def stop(): await asyncio.sleep(duration) # Transition to stopping. apsis.run_log.record(run, "stopping") - apsis._transition(run, State.stopping) + apsis._transition(run, State.stopping, run_state=run.run_state) # Ask the run to stop. - log.debug(f"{run_id}: stopping") - await program.stop() + await program.stop(run.run_state) # The main update loop handles updates in response. stop_task = asyncio.create_task(stop()) @@ -151,6 +150,7 @@ async def stop(): # Cancel the stop task. if stop_task is not None: stop_task.cancel() + await stop_task # Exhaust the async iterator, so that cleanup can run. try: @@ -172,6 +172,6 @@ async def stop(): tb = traceback.format_exc().encode() output = Output(OutputMetadata("traceback", length=len(tb)), tb) apsis._update_output_data(run, {"outputs": output}, True) - apsis._transition(run, State.error) + apsis._transition(run, State.error, force=True) diff --git a/python/apsis/service/client.py b/python/apsis/service/client.py index d4feab58..fbd90a3d 100644 --- a/python/apsis/service/client.py +++ b/python/apsis/service/client.py @@ -322,7 +322,7 @@ def __schedule(self, time, job_spec, *, count=None, stop_time=None): "schedule": time, } | nkey("stop", stop_time) } - } + } | nkey("count", count) runs = self.__post("/api/v1/runs", **params)["runs"] # FIXME: Hacky. return next(iter(runs.values())) if count is None else runs.values() From 6f83ade32319153b72432148032003b53bb177cb Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 16:34:09 -0500 Subject: [PATCH 38/84] Update todo. 
--- notes/notes.md | 2 +- notes/todo.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/notes/notes.md b/notes/notes.md index 94cdddba..05e11ab9 100644 --- a/notes/notes.md +++ b/notes/notes.md @@ -7,7 +7,7 @@ program: argv: ["/path/to/my/service", "--foreground"] stop: signal: SIGTERM - kill_after: 1m + grace_period: 1m schedule: start: diff --git a/notes/todo.md b/notes/todo.md index e6566482..51807479 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -43,11 +43,11 @@ - [ ] scheduled stop - [x] add stop method to program - [x] actually stop the program at the stop time - - [ ] go through _stopping_ state + - [x] go through _stopping_ state - [ ] Procstar stop method - [ ] ProcessProgram stop method - distinguish between "service"-type programs - - [ ] classig agent program stop method + - [ ] classic agent program stop method - [ ] "stop" operation - [ ] add stop time to run view - [ ] add stop time to runs table (NO?) From a159f7266ed7df1dfe56e131c38e95d55898f162 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 16:34:16 -0500 Subject: [PATCH 39/84] Split bound program, and fix stop. 
--- python/apsis/program/noop.py | 82 +++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/python/apsis/program/noop.py b/python/apsis/program/noop.py index c384a4aa..0e6acf71 100644 --- a/python/apsis/program/noop.py +++ b/python/apsis/program/noop.py @@ -13,6 +13,43 @@ #------------------------------------------------------------------------------- class NoOpProgram(Program): + + def __init__(self, *, duration=0, success=True): + self.__duration = nstr(duration) + self.__success = nbool(success) + + + def __str__(self): + return "no-op" + ( + "" if self.__duration is None else f" for {self.__duration} s" + ) + + + def bind(self, args): + duration = or_none(template_expand)(self.__duration, args) + return BoundNoOpProgram(duration=duration, success=self.__success) + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso) as pop: + duration = pop("duration", nstr, None) + success = pop("success", nbool, True) + return cls(duration=duration, success=success) + + + def to_jso(self): + return { + **super().to_jso(), + "duration" : self.__duration, + "success" : self.__success, + } + + + +#------------------------------------------------------------------------------- + +class BoundNoOpProgram(Program): """ A program that does nothing. @@ -20,11 +57,11 @@ class NoOpProgram(Program): the program always succeeds; if false, it fails; if none, it errors. """ + __stop_events = {} + def __init__(self, *, duration=0, success=True): self.__duration = nstr(duration) self.__success = None if success is None else bool(success) - # For signaling stop. 
- self.__stop_queue = asyncio.Event() def __str__(self): @@ -33,11 +70,6 @@ def __str__(self): ) - def bind(self, args): - duration = or_none(template_expand)(self.__duration, args) - return type(self)(duration=duration, success=self.__success) - - @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: @@ -54,39 +86,49 @@ def to_jso(self): } - async def start(self, run_id, cfg): - run_state = {} - return ProgramRunning(run_state), self.wait(run_id, run_state) + async def run(self, run_id, cfg): + run_state = {"run_id": run_id} + yield ProgramRunning(run_state) + async for update in self.wait(run_state): + yield update - async def wait(self, run_id, run_state): + async def wait(self, run_state): if self.__duration is not None: + run_id = run_state["run_id"] duration = parse_duration(self.__duration) + stop_event = self.__stop_events[run_id] = asyncio.Event() try: - await asyncio.wait_for(self.__stop_queue.wait(), duration) + await asyncio.wait_for(stop_event.wait(), duration) except asyncio.TimeoutError: # OK, duration expired. 
pass else: - raise ProgramError("program stopped") + yield ProgramError("program stopped") + return + finally: + assert self.__stop_events.pop(run_id) == stop_event + if self.__success is True: - return ProgramSuccess() + yield ProgramSuccess() elif self.__success is False: - raise ProgramFailure("failed") + yield ProgramFailure("failed") else: - raise ProgramError("error") + yield ProgramError("error") - def reconnect(self, run_id, run_state): - return asyncio.ensure_future(self.wait(run_id, run_state)) + def connect(self, run_id, run_state): + return self.wait(run_state) async def signal(self, run_id, run_state, signal): log.info("ignoring signal to no-op program") - async def stop(self): - self.__stop_queue.set() + async def stop(self, run_state): + run_id = run_state["run_id"] + stop_event = self.__stop_events[run_id] + stop_event.set() From f3810ee57e6879481962c88b2eb8f0d4c9796671 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 17 Dec 2024 17:58:30 -0500 Subject: [PATCH 40/84] Procstar agent program stop implementation. 
--- python/apsis/lib/json.py | 4 + python/apsis/program/procstar/agent.py | 198 +++++++++++++++++++------ 2 files changed, 154 insertions(+), 48 deletions(-) diff --git a/python/apsis/lib/json.py b/python/apsis/lib/json.py index 605a1ea5..04b2a1bc 100644 --- a/python/apsis/lib/json.py +++ b/python/apsis/lib/json.py @@ -122,6 +122,10 @@ def nkey(name, value): return {} if value is None else {name: value} +def ifkey(name, value, default): + return {} if value == default else {name: value} + + #------------------------------------------------------------------------------- class TypedJso: diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index 5a4d4963..81a0cd7d 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -1,15 +1,17 @@ import asyncio +from dataclasses import dataclass import logging import procstar.spec from procstar.agent.exc import NoConnectionError, NoOpenConnectionInGroup, ProcessUnknownError from procstar.agent.proc import FdData, Interval, Result +from signal import Signals import traceback import uuid from apsis.lib import asyn -from apsis.lib.json import check_schema +from apsis.lib.json import check_schema, ifkey from apsis.lib.parse import nparse_duration -from apsis.lib.py import or_none, get_cfg +from apsis.lib.py import or_none, nstr, get_cfg from apsis.lib.sys import to_signal from apsis.procstar import get_agent_server from apsis.program import base @@ -17,6 +19,36 @@ log = logging.getLogger(__name__) +ntemplate_expand = or_none(template_expand) + +#------------------------------------------------------------------------------- + +@dataclass +class Stop: + """ + Specification for how to stop a running agent program. 
+ """ + + signal: Signals = Signals.SIGTERM + grace_period: int = 60 + + def to_jso(self): + cls = type(self) + return ( + ifkey("signal", self.signal, cls.signal) + | ifkey("grace_period", self.grace_period, cls.grace_period) + ) + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso or {}) as pop: + signal = pop("signal", Signals.__getattr__, cls.signal) + grace_period = pop("grace_period", int, cls.grace_period) + return cls(signal, grace_period) + + + #------------------------------------------------------------------------------- SUDO_ARGV_DEFAULT = ["/usr/bin/sudo", "--preserve-env", "--set-home"] @@ -143,10 +175,15 @@ async def _make_outputs(fd_data): class BoundProcstarProgram(base.Program): - def __init__(self, argv, *, group_id, sudo_user=None): + def __init__( + self, argv, *, group_id, + sudo_user =None, + stop =Stop(), + ): self.__argv = [ str(a) for a in argv ] - self.__group_id = str(group_id) - self.__sudo_user = None if sudo_user is None else str(sudo_user) + self.__group_id = str(group_id) + self.__sudo_user = nstr(sudo_user) + self.__stop = stop def __str__(self): @@ -154,10 +191,15 @@ def __str__(self): def to_jso(self): - return super().to_jso() | { - "argv" : self.__argv, - "group_id" : self.__group_id, - } | if_not_none("sudo_user", self.__sudo_user) + return ( + super().to_jso() + | { + "argv" : self.__argv, + "group_id" : self.__group_id, + } + | if_not_none("sudo_user", self.__sudo_user) + | ifkey("stop", self.__stop.to_jso(), {}) + ) @classmethod @@ -166,7 +208,8 @@ def from_jso(cls, jso): argv = pop("argv") group_id = pop("group_id", default=procstar.proto.DEFAULT_GROUP) sudo_user = pop("sudo_user", default=None) - return cls(argv, group_id=group_id, sudo_user=sudo_user) + stop = Stop.from_jso(pop("stop", default={})) + return cls(argv, group_id=group_id, sudo_user=sudo_user, stop=stop) def get_spec(self, cfg, *, run_id): @@ -212,7 +255,7 @@ async def run(self, run_id, cfg): Returns an async iterator of program updates. 
""" - server = get_agent_server() + server = get_agent_server() agent_cfg = get_cfg(cfg, "procstar.agent", {}) run_cfg = get_cfg(agent_cfg, "run", {}) conn_timeout = get_cfg(agent_cfg, "connection.start_timeout", None) @@ -417,13 +460,30 @@ def more_output(): await self.__delete(proc) + async def stop(self, run_state): + stop = self.__stop + + # Send the stop signal. + await self.signal(None, run_state, stop.signal) + + if stop.grace_period is not None: + # Wait for the grace period to expire. + await asyncio.sleep(stop.grace_period) + # Send a kill signal. + try: + await self.signal(None, run_state, Signals.SIGKILL) + except ValueError: + # Proc is gone; that's OK. + pass + + async def signal(self, run_id, run_state, signal): server = get_agent_server() signal = to_signal(signal) - log.info(f"sending signal: {run_id}: {signal}") proc_id = run_state["proc_id"] + log.info(f"sending signal: {proc_id}: {signal}") try: proc = server.processes[proc_id] except KeyError: @@ -434,16 +494,72 @@ async def signal(self, run_id, run_state, signal): #------------------------------------------------------------------------------- -class ProcstarProgram(base.Program): +class _ProcstarProgram(base.Program): + """ + Base class for (unbound) Procstar program types. 
+ """ def __init__( - self, argv, *, + self, *, group_id =procstar.proto.DEFAULT_GROUP, - sudo_user =None + sudo_user =None, + stop =Stop(Stop.signal.name, Stop.grace_period), ): - self.__argv = [ str(a) for a in argv ] + super().__init__() self.__group_id = str(group_id) self.__sudo_user = None if sudo_user is None else str(sudo_user) + self.__stop = stop + + + def _bind(self, argv, args): + ntemplate_expand = or_none(template_expand) + stop = Stop( + to_signal(template_expand(self.__stop.signal, args)), + nparse_duration(ntemplate_expand(self.__stop.grace_period, args)) + ) + return BoundProcstarProgram( + argv, + group_id =ntemplate_expand(self.__group_id, args), + sudo_user =ntemplate_expand(self.__sudo_user, args), + stop =stop, + ) + + + def to_jso(self): + stop = ( + ifkey("signal", self.__stop.signal, Stop.signal.name) + | ifkey("grace_period", self.__stop.grace_period, Stop.grace_period) + ) + return ( + super().to_jso() + | { + "group_id" : self.__group_id, + } + | if_not_none("sudo_user", self.__sudo_user) + | ifkey("stop", stop, {}) + ) + + + @staticmethod + def _from_jso(pop): + with check_schema(pop("stop", default={})) as spop: + signal = spop("signal", str, default=Stop.signal.name) + grace_period = spop("grace_period", default=Stop.grace_period) + return dict( + group_id =pop("group_id", default=procstar.proto.DEFAULT_GROUP), + sudo_user =pop("sudo_user", default=None), + stop =Stop(signal, grace_period), + ) + + + +#------------------------------------------------------------------------------- + +class ProcstarProgram(_ProcstarProgram): + + def __init__(self, argv, **kw_args): + super().__init__(**kw_args) + self.__argv = [ str(a) for a in argv ] def __str__(self): @@ -451,67 +567,53 @@ def __str__(self): def bind(self, args): - argv = tuple( template_expand(a, args) for a in self.__argv ) - group_id = or_none(template_expand)(self.__group_id, args) - sudo_user = or_none(template_expand)(self.__sudo_user, args) - return BoundProcstarProgram( - 
argv, group_id=group_id, sudo_user=sudo_user) + argv = tuple( template_expand(a, args) for a in self.__argv ) + return super()._bind(argv, args) def to_jso(self): - return super().to_jso() | { - "argv" : self.__argv, - "group_id" : self.__group_id, - } | if_not_none("sudo_user", self.__sudo_user) + return super().to_jso() | {"argv" : self.__argv} @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: argv = pop("argv") - group_id = pop("group_id", default=procstar.proto.DEFAULT_GROUP) - sudo_user = pop("sudo_user", default=None) - return cls(argv, group_id=group_id, sudo_user=sudo_user) + kw_args = cls._from_jso(pop) + return cls(argv, **kw_args) #------------------------------------------------------------------------------- -class ProcstarShellProgram(base.Program): +class ProcstarShellProgram(_ProcstarProgram): SHELL = "/usr/bin/bash" - def __init__( - self, command, *, - group_id =procstar.proto.DEFAULT_GROUP, - sudo_user =None, - ): - self.__command = str(command) - self.__group_id = str(group_id) - self.__sudo_user = None if sudo_user is None else str(sudo_user) + def __init__(self, command, **kw_args): + super().__init__(**kw_args) + self.__command = str(command) + + + def __str__(self): + return self.__command def bind(self, args): - argv = [self.SHELL, "-c", template_expand(self.__command, args)] - group_id = or_none(template_expand)(self.__group_id, args) - sudo_user = or_none(template_expand)(self.__sudo_user, args) - return BoundProcstarProgram(argv, group_id=group_id, sudo_user=sudo_user) + argv = [self.SHELL, "-c", template_expand(self.__command, args)] + return super()._bind(argv, args) def to_jso(self): - return super().to_jso() | { - "command" : self.__command, - "group_id" : self.__group_id, - } | if_not_none("sudo_user", self.__sudo_user) + return super().to_jso() | {"command" : self.__command} @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: command = pop("command") - group_id = pop("group_id", 
default=procstar.proto.DEFAULT_GROUP) - sudo_user = pop("sudo_user", default=None) - return cls(command, group_id=group_id, sudo_user=sudo_user) + kw_args = cls._from_jso(pop) + return cls(command, **kw_args) From d191cfdf7657a67fe8e0c1b04d21366649cfadc2 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 18 Dec 2024 10:25:03 -0500 Subject: [PATCH 41/84] Todo. --- notes/notes.md | 9 +++++++++ notes/todo.md | 6 +++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/notes/notes.md b/notes/notes.md index 05e11ab9..5ef95db5 100644 --- a/notes/notes.md +++ b/notes/notes.md @@ -8,6 +8,7 @@ program: stop: signal: SIGTERM grace_period: 1m + self_stop: false schedule: start: @@ -19,6 +20,14 @@ schedule: ``` +| | running | stopping | +|-----------|---------|----------| +| exit ==0 | success | success | +| exit !=0 | failure | failure | +| stop sig | failure | success | +| other sig | failure | failure | + + # SQLite diff --git a/notes/todo.md b/notes/todo.md index 51807479..615482b2 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -44,15 +44,15 @@ - [x] add stop method to program - [x] actually stop the program at the stop time - [x] go through _stopping_ state - - [ ] Procstar stop method + - [x] Procstar stop method + - [ ] distinguish between "service"-type programs in Procstar program - [ ] ProcessProgram stop method - - distinguish between "service"-type programs - - [ ] classic agent program stop method - [ ] "stop" operation - [ ] add stop time to run view - [ ] add stop time to runs table (NO?) - [ ] refactor `apsis.stop` module - [ ] update `_process_updates` to set the state for a stopping job + - [ ] classic agent program stop method (or skip?) - [ ] if `send_signal` raises, error the run - [ ] improve `apsis job` output style From e0d3a6fd11d4bb9ded9bfddad1b14428e7d781f6 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 18 Dec 2024 10:25:39 -0500 Subject: [PATCH 42/84] Stopped run can succeed, WIP doesn't work. 
--- python/apsis/program/procstar/agent.py | 28 +++++++++++++++++++------- python/apsis/running.py | 14 +++++++++---- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index 81a0cd7d..f92309c8 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -292,7 +292,7 @@ async def run(self, run_id, cfg): ) # Hand off to __finish. - async for update in self.__finish(proc, res, run_cfg): + async for update in self.__finish(proc, res, run_cfg, run_state): yield update @@ -326,11 +326,11 @@ async def connect(self, run_id, run_state, cfg): else: log.info(f"reconnected: {proc_id} on conn {conn_id}") # Hand off to __finish. - async for update in self.__finish(proc, None, run_cfg): + async for update in self.__finish(proc, None, run_cfg, run_state): yield update - async def __finish(self, proc, res, run_cfg): + async def __finish(self, proc, res, run_cfg, run_state): """ Handles running `proc` until termination. @@ -419,18 +419,32 @@ def more_output(): outputs = await _make_outputs(fd_data) - if res.status.exit_code == 0: + log.debug( + f"terminated; stopping={run_state.get('stopping', False)} " + f"exit={res.status.exit_code!r} signal={res.status.signal!r}" + ) + if ( + res.status.exit_code == 0 + or ( + # The program is stopping and the process exited from + # the stop signal. + run_state.get("stopping", False) + and res.status.signal is not None + and Signals[res.status.signal] == self.__stop.signal + ) + ): # The process terminated successfully. yield base.ProgramSuccess(meta=meta, outputs=outputs) else: # The process terminated unsuccessfully. 
exit_code = res.status.exit_code signal = res.status.signal - cause = ( + yield base.ProgramFailure( f"exit code {exit_code}" if signal is None - else f"killed by {signal}" + else f"killed by {signal}", + meta=meta, + outputs=outputs ) - yield base.ProgramFailure(cause, meta=meta, outputs=outputs) except asyncio.CancelledError: # Don't clean up the proc; we can reconnect. diff --git a/python/apsis/running.py b/python/apsis/running.py index aa50ba71..4ac632f3 100644 --- a/python/apsis/running.py +++ b/python/apsis/running.py @@ -83,13 +83,19 @@ async def _process_updates(apsis, run, updates, program): async def stop(): # Wait until the stop time. duration = stop_time - now() - log.debug(f"{run_id}: running for {duration} s until stop") + log.debug(f"{run_id}: running for {duration:.3f} s until stop") await asyncio.sleep(duration) + # Ask the run to stop. + try: + await program.stop(run.run_state) + except: + log.info("program.stop() exception", exc_info=True) # Transition to stopping. apsis.run_log.record(run, "stopping") - apsis._transition(run, State.stopping, run_state=run.run_state) - # Ask the run to stop. - await program.stop(run.run_state) + apsis._transition( + run, State.stopping, + run_state=run.run_state | {"stopping": True} + ) # The main update loop handles updates in response. stop_task = asyncio.create_task(stop()) From 2b3a547a60ec4879301145a042fc337f842df60e Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Thu, 19 Dec 2024 17:43:03 -0500 Subject: [PATCH 43/84] Refactor. --- python/apsis/program/procstar/agent.py | 318 +++++++++++++------------ 1 file changed, 161 insertions(+), 157 deletions(-) diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index f92309c8..f6ab4e07 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -240,15 +240,6 @@ def get_spec(self, cfg, *, run_id): ) - async def __delete(self, proc): - try: - # Request deletion. 
- await proc.delete() - except Exception as exc: - # Just log this; from Apsis's standpoint, the proc is long done. - log.error(f"delete {proc.proc_id}: {exc}") - - async def run(self, run_id, cfg): """ Runs the program. @@ -282,6 +273,7 @@ async def run(self, run_id, cfg): yield base.ProgramError(f"procstar: {exc}") else: + # Up and running. run_state = { "conn_id": proc.conn_id, "proc_id": proc_id, @@ -291,8 +283,7 @@ async def run(self, run_id, cfg): meta=_make_metadata(proc_id, res) ) - # Hand off to __finish. - async for update in self.__finish(proc, res, run_cfg, run_state): + async for update in _run(Instance(proc, run_cfg, run_state), res): yield update @@ -324,156 +315,12 @@ async def connect(self, run_id, run_state, cfg): yield base.ProgramError(msg) else: + # Successfully connected. log.info(f"reconnected: {proc_id} on conn {conn_id}") - # Hand off to __finish. - async for update in self.__finish(proc, None, run_cfg, run_state): + async for update in _run(Instance(proc, run_cfg, run_state), None): yield update - async def __finish(self, proc, res, run_cfg, run_state): - """ - Handles running `proc` until termination. - - :param res: - The most recent `Result`, if any. - """ - proc_id = proc.proc_id - tasks = asyn.TaskGroup() - - try: - # Output collected so far. - fd_data = None - - # Start tasks to request periodic updates of results and output. - update_interval = nparse_duration(run_cfg.get("update_interval", None)) - output_interval = nparse_duration(run_cfg.get("output_interval", None)) - - if update_interval is not None: - # Start a task that periodically requests the current result. - tasks.add( - "poll update", - asyn.poll(proc.request_result, update_interval) - ) - - if output_interval is not None: - # Start a task that periodically requests additional output. - def more_output(): - # From the current position to the end. 
- start = 0 if fd_data is None else fd_data.interval.stop - interval = Interval(start, None) - return proc.request_fd_data("stdout", interval=interval) - - tasks.add("poll output", asyn.poll(more_output, output_interval)) - - # Process further updates, until the process terminates. - async for update in proc.updates: - match update: - case FdData(): - fd_data = _combine_fd_data(fd_data, update) - yield base.ProgramUpdate(outputs=await _make_outputs(fd_data)) - - case Result() as res: - meta = _make_metadata(proc_id, res) - - if res.state == "running": - # Intermediate result. - yield base.ProgramUpdate(meta=meta) - else: - # Process terminated. - break - - else: - # Proc was deleted--but we didn't delete it. - assert False, "proc deleted" - - # Stop update tasks. - await tasks.cancel_all() - - # Do we have the complete output? - length = res.fds.stdout.length - if length > 0 and ( - fd_data is None - or fd_data.interval.stop < length - ): - # Request any remaining output. - await proc.request_fd_data( - "stdout", - interval=Interval( - 0 if fd_data is None else fd_data.interval.stop, - None - ) - ) - # Wait for it. - async for update in proc.updates: - match update: - case FdData(): - fd_data = _combine_fd_data(fd_data, update) - # Confirm that we've accumulated all the output as - # specified in the result. - assert fd_data.interval.start == 0 - assert fd_data.interval.stop == res.fds.stdout.length - break - - case _: - log.debug("expected final FdData") - - outputs = await _make_outputs(fd_data) - - log.debug( - f"terminated; stopping={run_state.get('stopping', False)} " - f"exit={res.status.exit_code!r} signal={res.status.signal!r}" - ) - if ( - res.status.exit_code == 0 - or ( - # The program is stopping and the process exited from - # the stop signal. - run_state.get("stopping", False) - and res.status.signal is not None - and Signals[res.status.signal] == self.__stop.signal - ) - ): - # The process terminated successfully. 
- yield base.ProgramSuccess(meta=meta, outputs=outputs) - else: - # The process terminated unsuccessfully. - exit_code = res.status.exit_code - signal = res.status.signal - yield base.ProgramFailure( - f"exit code {exit_code}" if signal is None - else f"killed by {signal}", - meta=meta, - outputs=outputs - ) - - except asyncio.CancelledError: - # Don't clean up the proc; we can reconnect. - proc = None - - except ProcessUnknownError: - # Don't ask to clean it up; it's already gone. - proc = None - - except Exception as exc: - log.error(f"procstar: {traceback.format_exc()}") - - yield base.ProgramError( - f"procstar: {exc}", - meta=( - _make_metadata(proc_id, res) - if proc is not None and res is not None - else {} - ) - ) - - finally: - # Cancel our helper tasks. - await tasks.cancel_all() - if proc is not None: - # Giving up on this proc; ask the agent to delete it. - await self.__delete(proc) - - async def stop(self, run_state): stop = self.__stop @@ -506,6 +353,163 @@ async def signal(self, run_id, run_state, signal): +#------------------------------------------------------------------------------- + +@dataclass +class Instance: + proc: procstar.agent.proc.Process + run_cfg: dict + run_state: dict + + +async def _run(inst, res): + """ + Handles running `inst` until termination. + + :param res: + The most recent `Result`, if any. + """ + proc_id = inst.proc.proc_id + tasks = asyn.TaskGroup() + + try: + # Output collected so far. + fd_data = None + + # Start tasks to request periodic updates of results and output. + update_interval = inst.run_cfg.get("update_interval", None) + update_interval = nparse_duration(update_interval) + output_interval = inst.run_cfg.get("output_interval", None) + output_interval = nparse_duration(output_interval) + + if update_interval is not None: + # Start a task that periodically requests the current result. 
+ tasks.add( + "poll update", + asyn.poll(inst.proc.request_result, update_interval) + ) + + if output_interval is not None: + # Start a task that periodically requests additional output. + def more_output(): + # From the current position to the end. + start = 0 if fd_data is None else fd_data.interval.stop + interval = Interval(start, None) + return inst.proc.request_fd_data("stdout", interval=interval) + + tasks.add("poll output", asyn.poll(more_output, output_interval)) + + # Process further updates, until the process terminates. + async for update in inst.proc.updates: + match update: + case FdData(): + fd_data = _combine_fd_data(fd_data, update) + yield base.ProgramUpdate(outputs=await _make_outputs(fd_data)) + + case Result() as res: + meta = _make_metadata(proc_id, res) + + if res.state == "running": + # Intermediate result. + yield base.ProgramUpdate(meta=meta) + else: + # Process terminated. + break + + else: + # Proc was deleted--but we didn't delete it. + assert False, "proc deleted" + + # Stop update tasks. + await tasks.cancel_all() + + # Do we have the complete output? + length = res.fds.stdout.length + if length > 0 and ( + fd_data is None + or fd_data.interval.stop < length + ): + # Request any remaining output. + await inst.proc.request_fd_data( + "stdout", + interval=Interval( + 0 if fd_data is None else fd_data.interval.stop, + None + ) + ) + # Wait for it. + async for update in inst.proc.updates: + match update: + case FdData(): + fd_data = _combine_fd_data(fd_data, update) + # Confirm that we've accumulated all the output as + # specified in the result. + assert fd_data.interval.start == 0 + assert fd_data.interval.stop == res.fds.stdout.length + break + + case _: + log.debug("expected final FdData") + + outputs = await _make_outputs(fd_data) + + if ( + res.status.exit_code == 0 + # FIXME: Orderly stop condition. + # or ( + # # The program is stopping and the process exited from + # # the stop signal. 
+ # inst.run_state.get("stopping", False) + # and res.status.signal is not None + # and Signals[res.status.signal] == self.__stop.signal + # ) + ): + # The process terminated successfully. + yield base.ProgramSuccess(meta=meta, outputs=outputs) + else: + # The process terminated unsuccessfully. + exit_code = res.status.exit_code + signal = res.status.signal + yield base.ProgramFailure( + f"exit code {exit_code}" if signal is None + else f"killed by {signal}", + meta=meta, + outputs=outputs + ) + + except asyncio.CancelledError: + # Don't clean up the proc; we can reconnect. + inst.proc = None + + except ProcessUnknownError: + # Don't ask to clean it up; it's already gone. + inst.proc = None + + except Exception as exc: + log.error(f"procstar: {traceback.format_exc()}") + + yield base.ProgramError( + f"procstar: {exc}", + meta=( + _make_metadata(proc_id, res) + if inst.proc is not None and res is not None + else {} + ) + ) + + finally: + # Cancel our helper tasks. + await tasks.cancel_all() + if inst.proc is not None: + # Done with this proc; ask the agent to delete it. + try: + # Request deletion. + await inst.proc.delete() + except Exception as exc: + # Just log this; for Apsis, the proc is done. + log.error(f"delete {inst.proc.proc_id}: {exc}") + + #------------------------------------------------------------------------------- class _ProcstarProgram(base.Program): From ebd214fe0aaa28531aaa58a4e63edfa3ebe02a53 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 20 Dec 2024 13:18:12 -0500 Subject: [PATCH 44/84] Program may be missing in response. 
--- python/apsis/cmdline.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/apsis/cmdline.py b/python/apsis/cmdline.py index f44e0c9f..e7863e30 100644 --- a/python/apsis/cmdline.py +++ b/python/apsis/cmdline.py @@ -184,8 +184,10 @@ def header(title): elapsed = get_run_elapsed(now(), run) elapsed = "" if elapsed is None else format_duration(elapsed) - header("Program") - con.print(format_program(run["program"], verbosity=verbosity)) + program = run.get("program", None) + if program: + header("Program") + con.print(format_program(program, verbosity=verbosity)) # Format conds. header("Conditions") From 414b161933fde20101c4ddfee42200911090e1b4 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 20 Dec 2024 13:18:35 -0500 Subject: [PATCH 45/84] Running program object support and for no-op program. --- python/apsis/apsis.py | 11 ++-- python/apsis/program/base.py | 26 +++++++- python/apsis/program/noop.py | 120 +++++++++++++++++++++-------------- python/apsis/running.py | 11 +++- python/apsis/runs.py | 3 + 5 files changed, 115 insertions(+), 56 deletions(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index 3b11eac6..5e85a0fb 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -235,16 +235,17 @@ def _start(self, run): Runs the run's program in a task added to `__run_tasks`. """ + assert run._running_program is None # Start the run by running its program. self.run_log.record(run, "starting") self._transition(run, State.starting) # Call the program. This produces an async iterator of updates. - updates = run.program.run( + run._running_program = run.program.run( run.run_id, self if isinstance(run.program, _InternalProgram) else self.cfg, ) # Start a task to process updates from the program. 
- run_task = _process_updates(self, run, updates, run.program) + run_task = _process_updates(self, run) self.__run_tasks.add(run.run_id, run_task) @@ -254,17 +255,19 @@ def __reconnect(self, run): Finishes running the run's program in a task added to `__run_tasks`. """ + # FIXME: Reconnect starting or stopping programs? assert run.state == State.running assert run.run_state is not None + assert run._running_program is None self.run_log.record(run, "reconnecting") # Connect to the program. This produces an async iterator of updates. - updates = run.program.connect( + run._running_program = run.program.connect( run.run_id, run.run_state, self if isinstance(run.program, _InternalProgram) else self.cfg, ) # Start a task to process updates from the program. - run_task = _process_updates(self, run, updates, run.program) + run_task = _process_updates(self, run) self.__run_tasks.add(run.run_id, run_task) diff --git a/python/apsis/program/base.py b/python/apsis/program/base.py index 6d45be54..627d1d2a 100644 --- a/python/apsis/program/base.py +++ b/python/apsis/program/base.py @@ -177,6 +177,24 @@ def bind(self, args): #------------------------------------------------------------------------------- +class RunningProgram: + + # run_id: str + # updates: async iter + + def __init__(self, run_id): + self.run_id = run_id + + + async def stop(self): + raise NotImplementedError("not implemented: stop()") + + + async def signal(self, signal): + raise NotImplementedError("not implemented: signal()") + + + class Program(TypedJso): """ Program base class. @@ -235,7 +253,8 @@ def from_jso(cls, jso): return TypedJso.from_jso.__func__(cls, jso) - async def run(self, run_id, cfg): + # FIXME: Not async. + async def run(self, run_id, cfg) -> RunningProgram: """ Runs the program. @@ -266,9 +285,10 @@ async def run(self, run_id, cfg): yield success + # FIXME: Not async. # FIXME: Remove `run_id` from API. The program should store this in # `run_state`, if it needs it. 
- async def connect(self, run_id, run_state, cfg): + async def connect(self, run_id, run_state, cfg) -> RunningProgram: """ Connects to the running program specified by `run_state`. @@ -311,7 +331,7 @@ def bind(self, args): pass - async def start(self, run_id, apsis): + def start(self, run_id, apsis): pass diff --git a/python/apsis/program/noop.py b/python/apsis/program/noop.py index 0e6acf71..80e55076 100644 --- a/python/apsis/program/noop.py +++ b/python/apsis/program/noop.py @@ -1,8 +1,11 @@ import asyncio import logging +import ora from .base import ( - Program, ProgramRunning, ProgramSuccess, ProgramFailure, ProgramError) + Program, RunningProgram, + ProgramRunning, ProgramSuccess, ProgramFailure, ProgramError) +from apsis.lib import memo from apsis.lib.json import check_schema from apsis.lib.parse import parse_duration from apsis.lib.py import or_none, nstr, nbool @@ -15,19 +18,19 @@ class NoOpProgram(Program): def __init__(self, *, duration=0, success=True): - self.__duration = nstr(duration) - self.__success = nbool(success) + self.duration = nstr(duration) + self.success = nbool(success) def __str__(self): return "no-op" + ( - "" if self.__duration is None else f" for {self.__duration} s" + "" if self.duration is None else f" for {self.duration} s" ) def bind(self, args): - duration = or_none(template_expand)(self.__duration, args) - return BoundNoOpProgram(duration=duration, success=self.__success) + duration = or_none(template_expand)(self.duration, args) + return BoundNoOpProgram(duration=duration, success=self.success) @classmethod @@ -41,8 +44,8 @@ def from_jso(cls, jso): def to_jso(self): return { **super().to_jso(), - "duration" : self.__duration, - "success" : self.__success, + "duration" : self.duration, + "success" : self.success, } @@ -57,16 +60,14 @@ class BoundNoOpProgram(Program): the program always succeeds; if false, it fails; if none, it errors. 
""" - __stop_events = {} - def __init__(self, *, duration=0, success=True): - self.__duration = nstr(duration) - self.__success = None if success is None else bool(success) + self.duration = nstr(duration) + self.success = None if success is None else bool(success) def __str__(self): return "no-op" + ( - "" if self.__duration is None else f" for {self.__duration} s" + "" if self.duration is None else f" for {self.duration} s" ) @@ -81,54 +82,81 @@ def from_jso(cls, jso): def to_jso(self): return { **super().to_jso(), - "duration" : self.__duration, - "success" : self.__success, + "duration" : self.duration, + "success" : self.success, } - async def run(self, run_id, cfg): - run_state = {"run_id": run_id} - yield ProgramRunning(run_state) - async for update in self.wait(run_state): - yield update + def run(self, run_id, cfg) -> RunningProgram: + return RunningNoopProgram(self, run_id, None) + + + def connect(self, run_id, run_state) -> RunningProgram: + return RunningNoopProgram(self, run_id, run_state) + - async def wait(self, run_state): - if self.__duration is not None: - run_id = run_state["run_id"] - duration = parse_duration(self.__duration) - stop_event = self.__stop_events[run_id] = asyncio.Event() +#------------------------------------------------------------------------------- + +class RunningNoopProgram(RunningProgram): + """ + A running instance of a no-op program. + """ + + # FIXME: Should run_state belong to RunningProgram? + + def __init__(self, program, run_id, run_state): + """ + :param run_state: + Existing run state when connecting to an existing program, else none. + """ + super().__init__(run_id) + self.program = program + self.run_state = run_state + # Signals that the program was stopped. + self.stop_event = asyncio.Event() + + + @memo.property + async def updates(self): + if self.run_state is None: + # New instance. Record start time in the run state, so we know when + # to stop. 
+ start = ora.now() + self.run_state = {"start": str(start)} + yield ProgramRunning(self.run_state) + else: + # Existing instance. + start = ora.Time(self.run_state["start"]) + + if self.program.duration is not None: + duration = parse_duration(self.program.duration) + timeout = start + duration - ora.now() try: - await asyncio.wait_for(stop_event.wait(), duration) + await asyncio.wait_for(self.stop_event.wait(), timeout) except asyncio.TimeoutError: # OK, duration expired. pass - else: - yield ProgramError("program stopped") - return - finally: - assert self.__stop_events.pop(run_id) == stop_event - - if self.__success is True: - yield ProgramSuccess() - elif self.__success is False: - yield ProgramFailure("failed") - else: - yield ProgramError("error") + if self.stop_event.is_set(): + yield ProgramError("program stopped") + match self.program.success: + case True: + yield ProgramSuccess() + case False: + yield ProgramFailure("failed") + case None: + yield ProgramError("error") + case _: + assert False - def connect(self, run_id, run_state): - return self.wait(run_state) + + async def stop(self): + self.stop_event.set() async def signal(self, run_id, run_state, signal): log.info("ignoring signal to no-op program") - async def stop(self, run_state): - run_id = run_state["run_id"] - stop_event = self.__stop_events[run_id] - stop_event.set() - - diff --git a/python/apsis/running.py b/python/apsis/running.py index 4ac632f3..82576eba 100644 --- a/python/apsis/running.py +++ b/python/apsis/running.py @@ -38,12 +38,14 @@ async def _cmpr(output): return dict(zip(outputs.keys(), o)) -async def _process_updates(apsis, run, updates, program): +async def _process_updates(apsis, run): """ Processes program `updates` for `run` until the program is finished. 
""" + assert run._running_program is not None + run_id = run.run_id - updates = aiter(updates) + updates = aiter(run._running_program.updates) try: if run.state == State.starting: @@ -87,7 +89,7 @@ async def stop(): await asyncio.sleep(duration) # Ask the run to stop. try: - await program.stop(run.run_state) + await run._running_program.stop(run) except: log.info("program.stop() exception", exc_info=True) # Transition to stopping. @@ -180,4 +182,7 @@ async def stop(): apsis._update_output_data(run, {"outputs": output}, True) apsis._transition(run, State.error, force=True) + finally: + run._running_program = None + diff --git a/python/apsis/runs.py b/python/apsis/runs.py index ed7d7dc2..386a9a7a 100644 --- a/python/apsis/runs.py +++ b/python/apsis/runs.py @@ -183,6 +183,7 @@ class Run: "run_state", "_summary_jso_cache", "_rowid", + "_running_program", ) def __init__(self, inst, *, expected=False): @@ -213,6 +214,8 @@ def __init__(self, inst, *, expected=False): # Cached summary JSO object. self._summary_jso_cache = None + # Running program instance, in states starting, running, stopping. + self._running_program = None def __hash__(self): From f8188887e2e4ee918a6542774d2fbe92bc914568 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 20 Dec 2024 15:01:19 -0500 Subject: [PATCH 46/84] Reorganize module. --- python/apsis/program/procstar/agent.py | 306 ++++++++++++------------- 1 file changed, 153 insertions(+), 153 deletions(-) diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index f6ab4e07..0fe99a88 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -21,34 +21,6 @@ ntemplate_expand = or_none(template_expand) -#------------------------------------------------------------------------------- - -@dataclass -class Stop: - """ - Specification for how to stop a running agent program. 
- """ - - signal: Signals = Signals.SIGTERM - grace_period: int = 60 - - def to_jso(self): - cls = type(self) - return ( - ifkey("signal", self.signal, cls.signal) - | ifkey("grace_period", self.grace_period, cls.grace_period) - ) - - - @classmethod - def from_jso(cls, jso): - with check_schema(jso or {}) as pop: - signal = pop("signal", Signals.__getattr__, cls.signal) - grace_period = pop("grace_period", int, cls.grace_period) - return cls(signal, grace_period) - - - #------------------------------------------------------------------------------- SUDO_ARGV_DEFAULT = ["/usr/bin/sudo", "--preserve-env", "--set-home"] @@ -171,6 +143,159 @@ async def _make_outputs(fd_data): return base.program_outputs(output, length=length, compression=None) +#------------------------------------------------------------------------------- + +@dataclass +class Stop: + """ + Specification for how to stop a running agent program. + """ + + signal: Signals = Signals.SIGTERM + grace_period: int = 60 + + def to_jso(self): + cls = type(self) + return ( + ifkey("signal", self.signal, cls.signal) + | ifkey("grace_period", self.grace_period, cls.grace_period) + ) + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso or {}) as pop: + signal = pop("signal", Signals.__getattr__, cls.signal) + grace_period = pop("grace_period", int, cls.grace_period) + return cls(signal, grace_period) + + + +#------------------------------------------------------------------------------- + +class _ProcstarProgram(base.Program): + """ + Base class for (unbound) Procstar program types. 
+ """ + + def __init__( + self, *, + group_id =procstar.proto.DEFAULT_GROUP, + sudo_user =None, + stop =Stop(Stop.signal.name, Stop.grace_period), + ): + super().__init__() + self.__group_id = str(group_id) + self.__sudo_user = None if sudo_user is None else str(sudo_user) + self.__stop = stop + + + def _bind(self, argv, args): + ntemplate_expand = or_none(template_expand) + stop = Stop( + to_signal(template_expand(self.__stop.signal, args)), + nparse_duration(ntemplate_expand(self.__stop.grace_period, args)) + ) + return BoundProcstarProgram( + argv, + group_id =ntemplate_expand(self.__group_id, args), + sudo_user =ntemplate_expand(self.__sudo_user, args), + stop =stop, + ) + + + def to_jso(self): + stop = ( + ifkey("signal", self.__stop.signal, Stop.signal.name) + | ifkey("grace_period", self.__stop.grace_period, Stop.grace_period) + ) + return ( + super().to_jso() + | { + "group_id" : self.__group_id, + } + | if_not_none("sudo_user", self.__sudo_user) + | ifkey("stop", stop, {}) + ) + + + @staticmethod + def _from_jso(pop): + with check_schema(pop("stop", default={})) as spop: + signal = spop("signal", str, default=Stop.signal.name) + grace_period = spop("grace_period", default=Stop.grace_period) + return dict( + group_id =pop("group_id", default=procstar.proto.DEFAULT_GROUP), + sudo_user =pop("sudo_user", default=None), + stop =Stop(signal, grace_period), + ) + + + +#------------------------------------------------------------------------------- + +class ProcstarProgram(_ProcstarProgram): + + def __init__(self, argv, **kw_args): + super().__init__(**kw_args) + self.__argv = [ str(a) for a in argv ] + + + def __str__(self): + return join_args(self.__argv) + + + def bind(self, args): + argv = tuple( template_expand(a, args) for a in self.__argv ) + return super()._bind(argv, args) + + + def to_jso(self): + return super().to_jso() | {"argv" : self.__argv} + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso) as pop: + argv = pop("argv") + kw_args = 
cls._from_jso(pop) + return cls(argv, **kw_args) + + + +#------------------------------------------------------------------------------- + +class ProcstarShellProgram(_ProcstarProgram): + + SHELL = "/usr/bin/bash" + + def __init__(self, command, **kw_args): + super().__init__(**kw_args) + self.__command = str(command) + + + def __str__(self): + return self.__command + + + def bind(self, args): + argv = [self.SHELL, "-c", template_expand(self.__command, args)] + return super()._bind(argv, args) + + + def to_jso(self): + return super().to_jso() | {"command" : self.__command} + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso) as pop: + command = pop("command") + kw_args = cls._from_jso(pop) + return cls(command, **kw_args) + + + #------------------------------------------------------------------------------- class BoundProcstarProgram(base.Program): @@ -510,128 +635,3 @@ def more_output(): log.error(f"delete {inst.proc.proc_id}: {exc}") -#------------------------------------------------------------------------------- - -class _ProcstarProgram(base.Program): - """ - Base class for (unbound) Procstar program types. 
- """ - - def __init__( - self, *, - group_id =procstar.proto.DEFAULT_GROUP, - sudo_user =None, - stop =Stop(Stop.signal.name, Stop.grace_period), - ): - super().__init__() - self.__group_id = str(group_id) - self.__sudo_user = None if sudo_user is None else str(sudo_user) - self.__stop = stop - - - def _bind(self, argv, args): - ntemplate_expand = or_none(template_expand) - stop = Stop( - to_signal(template_expand(self.__stop.signal, args)), - nparse_duration(ntemplate_expand(self.__stop.grace_period, args)) - ) - return BoundProcstarProgram( - argv, - group_id =ntemplate_expand(self.__group_id, args), - sudo_user =ntemplate_expand(self.__sudo_user, args), - stop =stop, - ) - - - def to_jso(self): - stop = ( - ifkey("signal", self.__stop.signal, Stop.signal.name) - | ifkey("grace_period", self.__stop.grace_period, Stop.grace_period) - ) - return ( - super().to_jso() - | { - "group_id" : self.__group_id, - } - | if_not_none("sudo_user", self.__sudo_user) - | ifkey("stop", stop, {}) - ) - - - @staticmethod - def _from_jso(pop): - with check_schema(pop("stop", default={})) as spop: - signal = spop("signal", str, default=Stop.signal.name) - grace_period = spop("grace_period", default=Stop.grace_period) - return dict( - group_id =pop("group_id", default=procstar.proto.DEFAULT_GROUP), - sudo_user =pop("sudo_user", default=None), - stop =Stop(signal, grace_period), - ) - - - -#------------------------------------------------------------------------------- - -class ProcstarProgram(_ProcstarProgram): - - def __init__(self, argv, **kw_args): - super().__init__(**kw_args) - self.__argv = [ str(a) for a in argv ] - - - def __str__(self): - return join_args(self.__argv) - - - def bind(self, args): - argv = tuple( template_expand(a, args) for a in self.__argv ) - return super()._bind(argv, args) - - - def to_jso(self): - return super().to_jso() | {"argv" : self.__argv} - - - @classmethod - def from_jso(cls, jso): - with check_schema(jso) as pop: - argv = pop("argv") - kw_args = 
cls._from_jso(pop) - return cls(argv, **kw_args) - - - -#------------------------------------------------------------------------------- - -class ProcstarShellProgram(_ProcstarProgram): - - SHELL = "/usr/bin/bash" - - def __init__(self, command, **kw_args): - super().__init__(**kw_args) - self.__command = str(command) - - - def __str__(self): - return self.__command - - - def bind(self, args): - argv = [self.SHELL, "-c", template_expand(self.__command, args)] - return super()._bind(argv, args) - - - def to_jso(self): - return super().to_jso() | {"command" : self.__command} - - - @classmethod - def from_jso(cls, jso): - with check_schema(jso) as pop: - command = pop("command") - kw_args = cls._from_jso(pop) - return cls(command, **kw_args) - - - From 543998da8f49a5c2c9ba68617a4039a9c9df4de1 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 20 Dec 2024 16:29:08 -0500 Subject: [PATCH 47/84] ProcstarProgramRunning, WIP. --- python/apsis/program/base.py | 22 +- python/apsis/program/procstar/agent.py | 477 ++++++++++++------------- 2 files changed, 256 insertions(+), 243 deletions(-) diff --git a/python/apsis/program/base.py b/python/apsis/program/base.py index 627d1d2a..ed60eedf 100644 --- a/python/apsis/program/base.py +++ b/python/apsis/program/base.py @@ -178,14 +178,30 @@ def bind(self, args): #------------------------------------------------------------------------------- class RunningProgram: + """ + A running instance of a program. + + An instance (of a subclass) represents a program while it is running, i.e. + for a run in the starting, running, and stopping states. - # run_id: str - # updates: async iter + The async iterable `updates` drives the program through the event loop. + Apsis will await this iterator to completion. + """ def __init__(self, run_id): self.run_id = run_id + @property + def updates(self): + """ + A singleton async iterable of program updates. + + Apsis async-iterates this to exhaustion, to drive the program through + the event loop. 
Exhaustion indicates the program is done. + """ + + async def stop(self): raise NotImplementedError("not implemented: stop()") @@ -195,6 +211,8 @@ async def signal(self, signal): +#------------------------------------------------------------------------------- + class Program(TypedJso): """ Program base class. diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index 0fe99a88..dcc7db2e 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -5,16 +5,17 @@ from procstar.agent.exc import NoConnectionError, NoOpenConnectionInGroup, ProcessUnknownError from procstar.agent.proc import FdData, Interval, Result from signal import Signals -import traceback import uuid from apsis.lib import asyn +from apsis.lib import memo from apsis.lib.json import check_schema, ifkey from apsis.lib.parse import nparse_duration from apsis.lib.py import or_none, nstr, get_cfg from apsis.lib.sys import to_signal from apsis.procstar import get_agent_server from apsis.program import base +from apsis.program.base import (ProgramSuccess, ProgramFailure, ProgramError) from apsis.runs import join_args, template_expand log = logging.getLogger(__name__) @@ -31,7 +32,7 @@ def _sudo_wrap(cfg, argv, sudo_user): if sudo_user is None: return argv else: - sudo_argv = get_cfg(cfg, "procstar.agent.sudo.argv", SUDO_ARGV_DEFAULT) + sudo_argv = get_cfg(cfg, "sudo.argv", SUDO_ARGV_DEFAULT) return [ str(a) for a in sudo_argv ] + [ "--non-interactive", "--user", str(sudo_user), @@ -305,25 +306,25 @@ def __init__( sudo_user =None, stop =Stop(), ): - self.__argv = [ str(a) for a in argv ] - self.__group_id = str(group_id) - self.__sudo_user = nstr(sudo_user) - self.__stop = stop + self.argv = [ str(a) for a in argv ] + self.group_id = str(group_id) + self.sudo_user = nstr(sudo_user) + self.stop = stop def __str__(self): - return join_args(self.__argv) + return join_args(self.argv) def to_jso(self): return ( super().to_jso() | { - 
"argv" : self.__argv, - "group_id" : self.__group_id, + "argv" : self.argv, + "group_id" : self.group_id, } - | if_not_none("sudo_user", self.__sudo_user) - | ifkey("stop", self.__stop.to_jso(), {}) + | if_not_none("sudo_user", self.sudo_user) + | ifkey("stop", self.stop.to_jso(), {}) ) @@ -337,15 +338,44 @@ def from_jso(cls, jso): return cls(argv, group_id=group_id, sudo_user=sudo_user, stop=stop) - def get_spec(self, cfg, *, run_id): + def run(self, run_id, cfg): + return RunningProcstarProgram(run_id, self, cfg) + + + def connect(self, run_id, run_state, cfg): + return RunningProcstarProgram(run_id, self, run_state) + + + +#------------------------------------------------------------------------------- + +class RunningProcstarProgram(base.RunningProgram): + + def __init__(self, run_id, program, cfg, run_state=None): + """ + :param res: + The most recent `Result`, if any. + """ + super().__init__(run_id) + self.program = program + self.cfg = get_cfg(cfg, "procstar.agent", {}) + self.run_state = run_state + + self.proc = None + self.stopping = False + + + @property + def _spec(self): """ Returns the procstar proc spec for the program. """ + argv = _sudo_wrap(self.cfg, self.program.argv, self.program.sudo_user) return procstar.spec.Proc( - _sudo_wrap(cfg, self.__argv, self.__sudo_user), + argv, env=procstar.spec.Proc.Env( vars={ - "APSIS_RUN_ID": run_id, + "APSIS_RUN_ID": self.run_id, }, # Inherit the entire environment from procstar, since it probably # includes important configuration. @@ -365,273 +395,238 @@ def get_spec(self, cfg, *, run_id): ) - async def run(self, run_id, cfg): + @memo.property + async def updates(self): """ - Runs the program. - - Returns an async iterator of program updates. + Handles running `inst` until termination. 
""" - server = get_agent_server() - agent_cfg = get_cfg(cfg, "procstar.agent", {}) - run_cfg = get_cfg(agent_cfg, "run", {}) - conn_timeout = get_cfg(agent_cfg, "connection.start_timeout", None) - conn_timeout = nparse_duration(conn_timeout) - - # Generate a proc ID. - proc_id = str(uuid.uuid4()) + run_cfg = get_cfg(self.cfg, "run", {}) + update_interval = run_cfg.get("update_interval", None) + update_interval = nparse_duration(update_interval) + output_interval = run_cfg.get("output_interval", None) + output_interval = nparse_duration(output_interval) - try: + if self.run_state is None: # Start the proc. - proc, res = await server.start( - proc_id =proc_id, - group_id =self.__group_id, - spec =self.get_spec(cfg, run_id=run_id), - conn_timeout=conn_timeout, - ) - except NoOpenConnectionInGroup as exc: - log.warning(str(exc)) - yield base.ProgramError(f"procstar: {exc}") + conn_timeout = get_cfg(self.cfg, "connection.start_timeout", None) + conn_timeout = nparse_duration(conn_timeout) + proc_id = str(uuid.uuid4()) - except Exception as exc: - log.error(f"procstar: {traceback.format_exc()}") - yield base.ProgramError(f"procstar: {exc}") + try: + # Start the proc. + self.proc, res = await get_agent_server().start( + proc_id =proc_id, + group_id =self.program.group_id, + spec =self._spec, + conn_timeout=conn_timeout, + ) + except NoOpenConnectionInGroup as exc: + msg = f"start failed: {proc_id}: {exc}" + log.warning(msg) + yield ProgramError(msg) + return - else: - # Up and running. 
- run_state = { - "conn_id": proc.conn_id, + conn_id = self.proc.conn_id + log.info(f"started: {proc_id} on conn {conn_id}") + + self.run_state = { + "conn_id": conn_id, "proc_id": proc_id, } yield base.ProgramRunning( - run_state, + run_state=self.run_state, meta=_make_metadata(proc_id, res) ) - async for update in _run(Instance(proc, run_cfg, run_state), res): - yield update - - - async def connect(self, run_id, run_state, cfg): - server = get_agent_server() - agent_cfg = get_cfg(cfg, "procstar.agent", {}) - run_cfg = get_cfg(agent_cfg, "run", {}) - - conn_id = run_state["conn_id"] - proc_id = run_state["proc_id"] - - try: - conn_timeout = nparse_duration( - get_cfg(agent_cfg, "connection.reconnect_timeout", None)) + else: + # Reconnect to the proc. + conn_timeout = get_cfg(self.cfg, "connection.reconnect_timeout", None) + conn_timeout = nparse_duration(conn_timeout) + conn_id = self.run_state["conn_id"] + proc_id = self.run_state["proc_id"] log.info(f"reconnecting: {proc_id} on conn {conn_id}") - proc = await server.reconnect( - conn_id =conn_id, - proc_id =proc_id, - conn_timeout=conn_timeout, - ) + + try: + proc = await get_agent_server().reconnect( + conn_id =conn_id, + proc_id =proc_id, + conn_timeout=conn_timeout, + ) + except NoConnectionError as exc: + msg = f"reconnect failed: {proc_id}: {exc}" + log.error(msg) + yield ProgramError(msg) + return # Request a result immediately. await proc.request_result() + res = None - except NoConnectionError as exc: - msg = f"reconnect failed: {proc_id}: {exc}" - log.error(msg) - yield base.ProgramError(msg) - - else: - # Successfully connected. log.info(f"reconnected: {proc_id} on conn {conn_id}") - async for update in _run(Instance(proc, run_cfg, run_state), None): - yield update - - - async def stop(self, run_state): - stop = self.__stop - - # Send the stop signal. - await self.signal(None, run_state, stop.signal) - if stop.grace_period is not None: - # Wait for the grace period to expire. 
- await asyncio.sleep(stop.grace_period) - # Send a kill signal. - try: - await self.signal(None, run_state, Signals.SIGKILL) - except ValueError: - # Proc is gone; that's OK. - pass + # We now have a proc running on the agent. + try: + tasks = asyn.TaskGroup() - async def signal(self, run_id, run_state, signal): - server = get_agent_server() + # Output collected so far. + fd_data = None - signal = to_signal(signal) + # Start tasks to request periodic updates of results and output. - proc_id = run_state["proc_id"] - log.info(f"sending signal: {proc_id}: {signal}") - try: - proc = server.processes[proc_id] - except KeyError: - raise ValueError(f"no process: {proc_id}") - await proc.send_signal(int(signal)) + if update_interval is not None: + # Start a task that periodically requests the current result. + tasks.add( + "poll update", + asyn.poll(self.proc.request_result, update_interval) + ) + if output_interval is not None: + # Start a task that periodically requests additional output. + def more_output(): + # From the current position to the end. + start = 0 if fd_data is None else fd_data.interval.stop + interval = Interval(start, None) + return self.proc.request_fd_data("stdout", interval=interval) + tasks.add("poll output", asyn.poll(more_output, output_interval)) -#------------------------------------------------------------------------------- + # Process further updates, until the process terminates. + async for update in self.proc.updates: + match update: + case FdData(): + fd_data = _combine_fd_data(fd_data, update) + yield base.ProgramUpdate(outputs=await _make_outputs(fd_data)) + + case Result() as res: + meta = _make_metadata(proc_id, res) + + if res.state == "running": + # Intermediate result. + yield base.ProgramUpdate(meta=meta) + else: + # Process terminated. + break + + else: + # Proc was deleted--but we didn't delete it. + assert False, "proc deleted" + + # Stop update tasks. + await tasks.cancel_all() + + # Do we have the complete output? 
+ length = res.fds.stdout.length + if length > 0 and ( + fd_data is None + or fd_data.interval.stop < length + ): + # Request any remaining output. + await self.proc.request_fd_data( + "stdout", + interval=Interval( + 0 if fd_data is None else fd_data.interval.stop, + None + ) + ) + # Wait for it. + async for update in self.proc.updates: + match update: + case FdData(): + fd_data = _combine_fd_data(fd_data, update) + # Confirm that we've accumulated all the output as + # specified in the result. + assert fd_data.interval.start == 0 + assert fd_data.interval.stop == res.fds.stdout.length + break + + case _: + log.debug("expected final FdData") + + outputs = await _make_outputs(fd_data) + + if ( + res.status.exit_code == 0 + or ( + # The program is stopping and the process exited from + # the stop signal. + self.stopping + and res.status.signal is not None + and Signals[res.status.signal] == self.__stop.signal + ) + ): + # The process terminated successfully. + yield ProgramSuccess(meta=meta, outputs=outputs) + else: + # The process terminated unsuccessfully. + exit_code = res.status.exit_code + signal = res.status.signal + yield ProgramFailure( + f"exit code {exit_code}" if signal is None + else f"killed by {signal}", + meta=meta, + outputs=outputs + ) -@dataclass -class Instance: - proc: procstar.agent.proc.Process - run_cfg: dict - run_state: dict + except asyncio.CancelledError: + # Don't clean up the proc; we can reconnect. + self.proc = None + except ProcessUnknownError: + # Don't ask to clean it up; it's already gone. + self.proc = None -async def _run(inst, res): - """ - Handles running `inst` until termination. + except Exception as exc: + log.error("procstar", exc_info=True) + yield ProgramError( + f"procstar: {exc}", + meta={} if res is None else _make_metadata(proc_id, res), + ) - :param res: - The most recent `Result`, if any. - """ - proc_id = inst.proc.proc_id - tasks = asyn.TaskGroup() + finally: + # Cancel our helper tasks. 
+ await tasks.cancel_all() + if self.proc is not None: + # Done with this proc; ask the agent to delete it. + try: + # Request deletion. + await self.proc.delete() + except Exception as exc: + # Just log this; for Apsis, the proc is done. + log.error(f"delete {self.proc.proc_id}: {exc}") + self.proc = None - try: - # Output collected so far. - fd_data = None - # Start tasks to request periodic updates of results and output. - update_interval = inst.run_cfg.get("update_interval", None) - update_interval = nparse_duration(update_interval) - output_interval = inst.run_cfg.get("output_interval", None) - output_interval = nparse_duration(output_interval) - - if update_interval is not None: - # Start a task that periodically requests the current result. - tasks.add( - "poll update", - asyn.poll(inst.proc.request_result, update_interval) - ) + async def stop(self): + if self.proc is None: + log.warning("no more proc to stop") + return - if output_interval is not None: - # Start a task that periodically requests additional output. - def more_output(): - # From the current position to the end. - start = 0 if fd_data is None else fd_data.interval.stop - interval = Interval(start, None) - return inst.proc.request_fd_data("stdout", interval=interval) - - tasks.add("poll output", asyn.poll(more_output, output_interval)) - - # Process further updates, until the process terminates. - async for update in inst.proc.updates: - match update: - case FdData(): - fd_data = _combine_fd_data(fd_data, update) - yield base.ProgramUpdate(outputs=await _make_outputs(fd_data)) - - case Result() as res: - meta = _make_metadata(proc_id, res) - - if res.state == "running": - # Intermediate result. - yield base.ProgramUpdate(meta=meta) - else: - # Process terminated. - break + stop = self.program.stop + self.stopping = True - else: - # Proc was deleted--but we didn't delete it. - assert False, "proc deleted" - - # Stop update tasks. - await tasks.cancel_all() - - # Do we have the complete output? 
- length = res.fds.stdout.length - if length > 0 and ( - fd_data is None - or fd_data.interval.stop < length - ): - # Request any remaining output. - await inst.proc.request_fd_data( - "stdout", - interval=Interval( - 0 if fd_data is None else fd_data.interval.stop, - None - ) - ) - # Wait for it. - async for update in inst.proc.updates: - match update: - case FdData(): - fd_data = _combine_fd_data(fd_data, update) - # Confirm that we've accumulated all the output as - # specified in the result. - assert fd_data.interval.start == 0 - assert fd_data.interval.stop == res.fds.stdout.length - break - - case _: - log.debug("expected final FdData") - - outputs = await _make_outputs(fd_data) - - if ( - res.status.exit_code == 0 - # FIXME: Orderly stop condition. - # or ( - # # The program is stopping and the process exited from - # # the stop signal. - # inst.run_state.get("stopping", False) - # and res.status.signal is not None - # and Signals[res.status.signal] == self.__stop.signal - # ) - ): - # The process terminated successfully. - yield base.ProgramSuccess(meta=meta, outputs=outputs) - else: - # The process terminated unsuccessfully. - exit_code = res.status.exit_code - signal = res.status.signal - yield base.ProgramFailure( - f"exit code {exit_code}" if signal is None - else f"killed by {signal}", - meta=meta, - outputs=outputs - ) + # Send the stop signal. + await self.signal(stop.signal) - except asyncio.CancelledError: - # Don't clean up the proc; we can reconnect. - inst.proc = None + if stop.grace_period is not None: + # Wait for the grace period to expire. + await asyncio.sleep(stop.grace_period) + # Send a kill signal. + try: + await self.signal(Signals.SIGKILL) + except ValueError: + # Proc is gone; that's OK. + pass - except ProcessUnknownError: - # Don't ask to clean it up; it's already gone. 
- inst.proc = None - except Exception as exc: - log.error(f"procstar: {traceback.format_exc()}") + async def signal(self, signal): + if self.proc is None: + log.warning("no more proc to signal") + return - yield base.ProgramError( - f"procstar: {exc}", - meta=( - _make_metadata(proc_id, res) - if inst.proc is not None and res is not None - else {} - ) - ) + await self.proc.send_signal(int(signal)) - finally: - # Cancel our helper tasks. - await tasks.cancel_all() - if inst.proc is not None: - # Done with this proc; ask the agent to delete it. - try: - # Request deletion. - await inst.proc.delete() - except Exception as exc: - # Just log this; for Apsis, the proc is done. - log.error(f"delete {inst.proc.proc_id}: {exc}") From d948b2a3f446f3a87e81ee8a72997be6bd444c6a Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Fri, 20 Dec 2024 18:40:36 -0500 Subject: [PATCH 48/84] Fixes. --- python/apsis/apsis.py | 5 +++-- python/apsis/program/procstar/agent.py | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index 5e85a0fb..696a2147 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -602,11 +602,12 @@ async def send_signal(self, run, signal): signal = to_signal(signal) if run.state not in (State.running, State.stopping): raise RuntimeError(f"invalid run state for signal: {run.state.name}") - assert run.program is not None + if run._running_program is None: + raise RuntimeError("no running program to send signal to") self.run_log.info(run, f"sending {signal.name}") try: - await run.program.signal(run.run_id, run.run_state, signal) + await run._running_program.signal(signal) except Exception: self.run_log.exc(run, f"sending {signal.name} failed") raise RuntimeError(f"sending {signal.name} failed") diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index dcc7db2e..6b8000fb 100644 --- a/python/apsis/program/procstar/agent.py +++ 
b/python/apsis/program/procstar/agent.py @@ -343,7 +343,7 @@ def run(self, run_id, cfg): def connect(self, run_id, run_state, cfg): - return RunningProcstarProgram(run_id, self, run_state) + return RunningProcstarProgram(run_id, self, cfg, run_state) @@ -409,7 +409,7 @@ async def updates(self): if self.run_state is None: # Start the proc. - conn_timeout = get_cfg(self.cfg, "connection.start_timeout", None) + conn_timeout = get_cfg(self.cfg, "connection.start_timeout", 0) conn_timeout = nparse_duration(conn_timeout) proc_id = str(uuid.uuid4()) @@ -449,7 +449,7 @@ async def updates(self): log.info(f"reconnecting: {proc_id} on conn {conn_id}") try: - proc = await get_agent_server().reconnect( + self.proc = await get_agent_server().reconnect( conn_id =conn_id, proc_id =proc_id, conn_timeout=conn_timeout, @@ -461,7 +461,7 @@ async def updates(self): return # Request a result immediately. - await proc.request_result() + await self.proc.request_result() res = None log.info(f"reconnected: {proc_id} on conn {conn_id}") From 226115bd230c6794a03eb973fef1663ace4b18dc Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sat, 21 Dec 2024 09:09:09 -0500 Subject: [PATCH 49/84] Support legacy programs in with RunningProgram. --- python/apsis/program/base.py | 79 +++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/python/apsis/program/base.py b/python/apsis/program/base.py index ed60eedf..d3176dd0 100644 --- a/python/apsis/program/base.py +++ b/python/apsis/program/base.py @@ -1,5 +1,6 @@ from dataclasses import dataclass +from apsis.lib import memo from apsis.lib.api import decompress from apsis.lib.json import TypedJso, check_schema from apsis.lib.parse import parse_duration @@ -177,6 +178,9 @@ def bind(self, args): #------------------------------------------------------------------------------- +# FIXME: Apsis should take run_state from RunningProgram, and serialize it on +# each transition. (??) 
+ class RunningProgram: """ A running instance of a program. @@ -211,6 +215,41 @@ async def signal(self, signal): +class LegacyRunningProgram(RunningProgram): + + def __init__(self, run_id, program, cfg, run_state=None): + super().__init__(run_id) + self.program = program + self.cfg = cfg + self.run_state = run_state + + + @memo.property + async def updates(self): + if self.run_state is None: + # Starting. + try: + running, done = await self.program.start(self.run_id, self.cfg) + except ProgramError as err: + yield err + else: + assert isinstance(running, ProgramRunning) + yield running + + else: + done = self.program.reconnect(self.run_id, self.run_state) + + # Running. + try: + success = await done + except (ProgramError, ProgramFailure) as err: + yield err + else: + assert isinstance(success, ProgramSuccess) + yield success + + + #------------------------------------------------------------------------------- class Program(TypedJso): @@ -271,42 +310,23 @@ def from_jso(cls, jso): return TypedJso.from_jso.__func__(cls, jso) - # FIXME: Not async. - async def run(self, run_id, cfg) -> RunningProgram: + def run(self, run_id, cfg) -> RunningProgram: """ Runs the program. - The default implementation is a facade for `start()`, for backward + The default implementation is a facade for `start()`, for legacy compatibility. Subclasses should override this method. :param run_id: Used for logging only. :return: - Async iterator that yields `Program*` objects. + `RunningProgram` instance. """ - # Starting. - try: - running, done = await self.start(run_id, cfg) - except ProgramError as err: - yield err - else: - assert isinstance(running, ProgramRunning) - yield running - - # Running. - try: - success = await done - except (ProgramError, ProgramFailure) as err: - yield err - else: - assert isinstance(success, ProgramSuccess) - yield success + return LegacyRunningProgram(run_id, self, cfg) - # FIXME: Not async. - # FIXME: Remove `run_id` from API. 
The program should store this in - # `run_state`, if it needs it. - async def connect(self, run_id, run_state, cfg) -> RunningProgram: + # FIXME: Remove `run_id` from API; the running program carries it. + def connect(self, run_id, run_state, cfg) -> RunningProgram: """ Connects to the running program specified by `run_state`. @@ -318,14 +338,7 @@ async def connect(self, run_id, run_state, cfg) -> RunningProgram: :return: Async iterator that yields `Program*` objects. """ - done = self.reconnect(run_id, run_state) - try: - success = await done - except (ProgramError, ProgramFailure) as err: - yield err - else: - assert isinstance(success, ProgramSuccess) - yield success + return LegacyRunningProgram(run_id, self, cfg, run_state) async def stop(self, run_state): From ea33b77886c8b833eb63f3e1d1adb773a031b039 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sat, 4 Jan 2025 23:42:07 -0500 Subject: [PATCH 50/84] Fix. --- python/apsis/program/noop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/apsis/program/noop.py b/python/apsis/program/noop.py index 80e55076..46491b10 100644 --- a/python/apsis/program/noop.py +++ b/python/apsis/program/noop.py @@ -91,7 +91,7 @@ def run(self, run_id, cfg) -> RunningProgram: return RunningNoopProgram(self, run_id, None) - def connect(self, run_id, run_state) -> RunningProgram: + def connect(self, run_id, run_state, cfg) -> RunningProgram: return RunningNoopProgram(self, run_id, run_state) From 24600f553816e1632e163434fffe3097e7f4b1e8 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sat, 4 Jan 2025 23:42:12 -0500 Subject: [PATCH 51/84] RunningProgram for Procstar agent program. 
--- python/apsis/program/agent.py | 222 +++++++++++++++++++++++++++------- 1 file changed, 178 insertions(+), 44 deletions(-) diff --git a/python/apsis/program/agent.py b/python/apsis/program/agent.py index 093376e4..bcf48983 100644 --- a/python/apsis/program/agent.py +++ b/python/apsis/program/agent.py @@ -9,10 +9,11 @@ from .base import ( Program, ProgramRunning, ProgramSuccess, ProgramFailure, ProgramError, - program_outputs, Timeout, + program_outputs, Timeout, RunningProgram, ) from apsis.agent.client import Agent, NoSuchProcessError, HTTP_IMPL from apsis.host_group import expand_host +from apsis.lib import memo from apsis.lib.cmpr import compress_async from apsis.lib.json import check_schema from apsis.lib.py import or_none, nstr @@ -31,38 +32,33 @@ def _get_agent(host, user): class AgentProgram(Program): def __init__(self, argv, *, host=None, user=None, timeout=None): - self.__argv = tuple( str(a) for a in argv ) - self.__host = nstr(host) - self.__user = nstr(user) - self.__timeout = timeout + self.argv = tuple( str(a) for a in argv ) + self.host = nstr(host) + self.user = nstr(user) + self.timeout = timeout def __str__(self): - return join_args(self.__argv) - - - def __get_agent(self, host): - host = None if host is None else socket.getfqdn(host) - return _get_agent(host, self.__user) + return join_args(self.argv) def bind(self, args): - argv = tuple( template_expand(a, args) for a in self.__argv ) - host = or_none(template_expand)(self.__host, args) - user = or_none(template_expand)(self.__user, args) - timeout = None if self.__timeout is None else self.__timeout.bind(args) + argv = tuple( template_expand(a, args) for a in self.argv ) + host = or_none(template_expand)(self.host, args) + user = or_none(template_expand)(self.user, args) + timeout = None if self.timeout is None else self.timeout.bind(args) return type(self)(argv, host=host, user=user, timeout=timeout) def to_jso(self): jso = { **super().to_jso(), - "argv" : list(self.__argv), - "host" : 
self.__host, - "user" : self.__user, + "argv" : list(self.argv), + "host" : self.host, + "user" : self.user, } - if self.__timeout is not None: - jso["timeout"] = self.__timeout.to_jso() + if self.timeout is not None: + jso["timeout"] = self.timeout.to_jso() return jso @@ -76,13 +72,9 @@ def from_jso(cls, jso): return cls(argv, host=host, user=user, timeout=timeout) - def get_host(self, cfg): - return expand_host(self.__host, cfg) - - async def start(self, run_id, cfg): host = self.get_host(cfg) - argv = self.__argv + argv = self.argv loc = "" if host is None else " on " + host cmd = join_args(argv) @@ -145,7 +137,7 @@ async def wait(self, run_id, run_state): host = run_state["host"] proc_id = run_state["proc_id"] agent = self.__get_agent(host) - if self.__timeout is not None: + if self.timeout is not None: try: start = ora.Time(run_state["start"]) except KeyError: @@ -177,14 +169,14 @@ async def null_ctx(): async with client_ctx as client: while True: - if self.__timeout is not None: + if self.timeout is not None: elapsed = ora.now() - start - if self.__timeout.duration < elapsed: + if self.timeout.duration < elapsed: msg = f"timeout after {elapsed:.0f} s" log.info(f"{run_id}: {msg}") explanation = f" ({msg})" # FIXME: Note timeout in run log. - await self.signal(run_id, run_state, self.__timeout.signal) + await self.signal(run_id, run_state, self.timeout.signal) await asyncio.sleep(POLL_INTERVAL) log.debug(f"polling proc: {run_id}: {proc_id} @ {host}") @@ -230,17 +222,6 @@ def reconnect(self, run_id, run_state): return asyncio.ensure_future(self.wait(run_id, run_state)) - async def signal(self, run_id, run_state, signal): - """ - :type signal: - Signal name or number. 
- """ - log.info(f"sending signal: {run_id}: {signal}") - proc_id = run_state["proc_id"] - agent = self.__get_agent(run_state["host"]) - await agent.signal(proc_id, signal) - - class AgentShellProgram(AgentProgram): @@ -248,11 +229,11 @@ def __init__(self, command, **kw_args): command = str(command) argv = ["/bin/bash", "-c", command] super().__init__(argv, **kw_args) - self.__command = command + self.command = command def bind(self, args): - command = template_expand(self.__command, args) + command = template_expand(self.command, args) host = or_none(template_expand)(self._AgentProgram__host, args) user = or_none(template_expand)(self._AgentProgram__user, args) timeout = self._AgentProgram__timeout @@ -261,14 +242,14 @@ def bind(self, args): def __str__(self): - return self.__command + return self.command def to_jso(self): # A bit hacky. Take the base-class JSO and replace argv with command. jso = super().to_jso() del jso["argv"] - jso["command"] = self.__command + jso["command"] = self.command return jso @@ -283,3 +264,156 @@ def from_jso(cls, jso): +#------------------------------------------------------------------------------- + +class RunningAgentProgram(RunningProgram): + + def __init__(self, run_id, program, cfg, run_state=None): + super().__init__(run_id) + self.program = program + self.cfg = cfg + self.run_state = run_state + + + def __get_agent(self, host): + host = None if host is None else socket.getfqdn(host) + return _get_agent(host, self.user) + + + @memo.property + def updates(self): + if self.run_state is None: + host = expand_host(self.program.host, self.cfg) + argv = self.program.argv + + loc = "" if host is None else " on " + host + cmd = join_args(argv) + log.debug(f"starting program{loc}: {cmd}") + + env = { + "inherit": True, + "vars": { + "APSIS_RUN_ID": self.run_id, + }, + } + + meta = { + "apsis_hostname" : socket.gethostname(), + "apsis_username" : get_username(), + } + + agent = self.__get_agent(host) + proc = await 
agent.start_process(argv, env=env, restart=True) + state = proc["state"] + if state == "run": + log.debug(f"program running: {self.run_id} as {proc['proc_id']}") + + start = ora.now() + proc_id = proc["proc_id"] + self.run_state = { + "host" : host, + "proc_id" : proc_id, + "pid" : proc["pid"], + "start" : str(start), + } + meta.update(self.run_state) + # FIXME: Propagate times from agent. + yield ProgramRunning(self.run_state, meta=meta) + + elif state == "err": + message = proc.get("exception", "program error") + log.info(f"program error: {self.run_id}: {message}") + # Clean up the process from the agent. + await agent.del_process(proc["proc_id"]) + + yield ProgramError(message) + return + + else: + assert False, f"unknown state: {state}" + + else: + proc = ... + proc_id = self.run_state["proc_id"] + start = ora.Time(self.run_state["start"]) + + #------------ + + explanation = "" + + # FIXME: This is so embarrassing. + POLL_INTERVAL = 1 + + TIMEOUT = 60 + client_ctx = httpx.AsyncClient( + verify=False, + timeout=httpx.Timeout(TIMEOUT), + limits=httpx.Limits( + max_keepalive_connections=1, + keepalive_expiry=TIMEOUT, + ), + ) + + async with client_ctx as client: + while True: + if self.timeout is not None: + elapsed = ora.now() - start + if self.timeout.duration < elapsed: + msg = f"timeout after {elapsed:.0f} s" + log.info(f"{self.run_id}: {msg}") + explanation = f" ({msg})" + # FIXME: Note timeout in run log. + await self.signal(self.run_id, self.run_state, self.timeout.signal) + await asyncio.sleep(POLL_INTERVAL) + + log.debug(f"polling proc: {self.run_id}: {proc_id} @ {host}") + try: + proc = await agent.get_process(proc_id, restart=True, client=client) + except NoSuchProcessError: + # Agent doesn't know about this process anymore. 
+ raise ProgramError(f"program lost: {self.run_id}") + if proc["state"] == "run": + await asyncio.sleep(POLL_INTERVAL) + else: + break + + status = proc["status"] + output, length, compression = await agent.get_process_output(proc_id, client=client) + + if compression is None and len(output) > 16384: + # Compress the output. + try: + output, compression = await compress_async(output, "br"), "br" + except RuntimeError as exc: + log.error(f"{exc}; not compressing") + + outputs = program_outputs( + output, length=length, compression=compression) + log.debug(f"got output: {length} bytes, {compression or 'uncompressed'}") + + try: + if status == 0: + return ProgramSuccess(meta=proc, outputs=outputs) + + else: + message = f"program failed: status {status}{explanation}" + raise ProgramFailure(message, meta=proc, outputs=outputs) + + finally: + # Clean up the process from the agent. + await agent.del_process(proc_id, client=client) + + async def signal(self, signal): + """ + :type signal: + Signal name or number. + """ + if self.run_state is None: + raise RuntimeError("can't signal; not running yet") + + log.info(f"sending signal: {self.run_id}: {signal}") + proc_id = self.run_state["proc_id"] + agent = self.__get_agent(self.run_state["host"]) + await agent.signal(proc_id, signal) + + From b9cb61747366c0159fd9a4c08883c06d5300a5dc Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 00:10:06 -0500 Subject: [PATCH 52/84] Fix running program for classic agent. 
--- python/apsis/program/agent.py | 179 ++++------------------------------ 1 file changed, 21 insertions(+), 158 deletions(-) diff --git a/python/apsis/program/agent.py b/python/apsis/program/agent.py index bcf48983..47ba21ea 100644 --- a/python/apsis/program/agent.py +++ b/python/apsis/program/agent.py @@ -72,154 +72,12 @@ def from_jso(cls, jso): return cls(argv, host=host, user=user, timeout=timeout) - async def start(self, run_id, cfg): - host = self.get_host(cfg) - argv = self.argv - - loc = "" if host is None else " on " + host - cmd = join_args(argv) - log.debug(f"starting program{loc}: {cmd}") - - env = { - "inherit": True, - "vars": { - "APSIS_RUN_ID": run_id, - # FIXME: Other things? - }, - } - - meta = { - "apsis_hostname" : socket.gethostname(), - "apsis_username" : get_username(), - } - - try: - agent = self.__get_agent(host) - proc = await agent.start_process(argv, env=env, restart=True) - - except Exception as exc: - log.error("failed to start process", exc_info=True) - output = traceback.format_exc().encode() - # FIXME: Use a different "traceback" output, once the UI can - # understand it. - raise ProgramError( - message=str(exc), outputs=program_outputs(output)) - - state = proc["state"] - if state == "run": - log.debug(f"program running: {run_id} as {proc['proc_id']}") - - run_state = { - "host" : host, - "proc_id" : proc["proc_id"], - "pid" : proc["pid"], - } - meta.update(run_state) - # FIXME: Propagate times from agent. - # FIXME: Do this asynchronously from the agent instead. - done = self.wait(run_id, run_state) - run_state["start"] = str(ora.now()) - return ProgramRunning(run_state, meta=meta), done - - elif state == "err": - message = proc.get("exception", "program error") - log.info(f"program error: {run_id}: {message}") - # Clean up the process from the agent. 
- await agent.del_process(proc["proc_id"]) - - raise ProgramError(message) - - else: - assert False, f"unknown state: {state}" - - - async def wait(self, run_id, run_state): - host = run_state["host"] - proc_id = run_state["proc_id"] - agent = self.__get_agent(host) - if self.timeout is not None: - try: - start = ora.Time(run_state["start"]) - except KeyError: - # Backward compatibility: no start in run state. - # FIXME: Clean this up after transition. - start = ora.now() - - explanation = "" - - # FIXME: This is so embarrassing. - POLL_INTERVAL = 1 - - if HTTP_IMPL == "httpx": - TIMEOUT = 60 - client_ctx = httpx.AsyncClient( - verify=False, - timeout=httpx.Timeout(TIMEOUT), - limits=httpx.Limits( - max_keepalive_connections=1, - keepalive_expiry=TIMEOUT, - ), - ) - else: - @contextlib.asynccontextmanager - async def null_ctx(): - yield None - - client_ctx = null_ctx() - - async with client_ctx as client: - while True: - if self.timeout is not None: - elapsed = ora.now() - start - if self.timeout.duration < elapsed: - msg = f"timeout after {elapsed:.0f} s" - log.info(f"{run_id}: {msg}") - explanation = f" ({msg})" - # FIXME: Note timeout in run log. - await self.signal(run_id, run_state, self.timeout.signal) - await asyncio.sleep(POLL_INTERVAL) - - log.debug(f"polling proc: {run_id}: {proc_id} @ {host}") - try: - proc = await agent.get_process(proc_id, restart=True, client=client) - except NoSuchProcessError: - # Agent doesn't know about this process anymore. - raise ProgramError(f"program lost: {run_id}") - if proc["state"] == "run": - await asyncio.sleep(POLL_INTERVAL) - else: - break + def run(self, run_id, cfg): + return RunningAgentProgram(run_id, self, cfg) - status = proc["status"] - output, length, compression = await agent.get_process_output(proc_id, client=client) - if compression is None and len(output) > 16384: - # Compress the output. 
- try: - output, compression = await compress_async(output, "br"), "br" - except RuntimeError as exc: - log.error(f"{exc}; not compressing") - - outputs = program_outputs( - output, length=length, compression=compression) - log.debug(f"got output: {length} bytes, {compression or 'uncompressed'}") - - try: - if status == 0: - return ProgramSuccess(meta=proc, outputs=outputs) - - else: - message = f"program failed: status {status}{explanation}" - raise ProgramFailure(message, meta=proc, outputs=outputs) - - finally: - # Clean up the process from the agent. - await agent.del_process(proc_id, client=client) - - - def reconnect(self, run_id, run_state): - log.debug(f"reconnect: {run_id}") - return asyncio.ensure_future(self.wait(run_id, run_state)) + def connect(self, run_id, run_state, cfg): + return RunningAgentProgram(run_id, self, cfg, run_state) @@ -234,9 +92,9 @@ def __init__(self, command, **kw_args): def bind(self, args): command = template_expand(self.command, args) - host = or_none(template_expand)(self._AgentProgram__host, args) - user = or_none(template_expand)(self._AgentProgram__user, args) - timeout = self._AgentProgram__timeout + host = or_none(template_expand)(self.host, args) + user = or_none(template_expand)(self.user, args) + timeout = self.timeout timeout = None if timeout is None else timeout.bind(args) return type(self)(command, host=host, user=user, timeout=timeout) @@ -277,12 +135,13 @@ def __init__(self, run_id, program, cfg, run_state=None): def __get_agent(self, host): host = None if host is None else socket.getfqdn(host) - return _get_agent(host, self.user) + return _get_agent(host, self.program.user) @memo.property - def updates(self): + async def updates(self): if self.run_state is None: + # Start the proc. host = expand_host(self.program.host, self.cfg) argv = self.program.argv @@ -333,9 +192,11 @@ def updates(self): assert False, f"unknown state: {state}" else: - proc = ... + # Poll an existing proc. 
proc_id = self.run_state["proc_id"] - start = ora.Time(self.run_state["start"]) + host = self.run_state["host"] + start = ora.Time(self.run_state["start"]) + agent = self.__get_agent(host) #------------ @@ -356,14 +217,14 @@ def updates(self): async with client_ctx as client: while True: - if self.timeout is not None: + if self.program.timeout is not None: elapsed = ora.now() - start - if self.timeout.duration < elapsed: + if self.program.timeout.duration < elapsed: msg = f"timeout after {elapsed:.0f} s" log.info(f"{self.run_id}: {msg}") explanation = f" ({msg})" # FIXME: Note timeout in run log. - await self.signal(self.run_id, self.run_state, self.timeout.signal) + await self.signal(self.program.timeout.signal) await asyncio.sleep(POLL_INTERVAL) log.debug(f"polling proc: {self.run_id}: {proc_id} @ {host}") @@ -393,16 +254,17 @@ def updates(self): try: if status == 0: - return ProgramSuccess(meta=proc, outputs=outputs) + yield ProgramSuccess(meta=proc, outputs=outputs) else: message = f"program failed: status {status}{explanation}" - raise ProgramFailure(message, meta=proc, outputs=outputs) + yield ProgramFailure(message, meta=proc, outputs=outputs) finally: # Clean up the process from the agent. await agent.del_process(proc_id, client=client) + async def signal(self, signal): """ :type signal: @@ -417,3 +279,4 @@ async def signal(self, signal): await agent.signal(proc_id, signal) + From f6c577e7c47934d8118134b4d7918011e826860d Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 00:15:27 -0500 Subject: [PATCH 53/84] Clean up. 
--- python/apsis/jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/apsis/jobs.py b/python/apsis/jobs.py index 92382354..a5d4e942 100644 --- a/python/apsis/jobs.py +++ b/python/apsis/jobs.py @@ -12,7 +12,7 @@ from .lib.json import to_array, to_narray, check_schema from .lib.py import tupleize, format_ctor from .program import Program, NoOpProgram -from .schedule import Schedule, schedule_to_jso, schedule_from_jso +from .schedule import schedule_to_jso, schedule_from_jso log = logging.getLogger(__name__) @@ -22,7 +22,7 @@ class Job: def __init__( self, job_id, params=[], schedules=[], program=NoOpProgram(), - conds=[], actions=[], *, stop=None, meta={}, ad_hoc=False + conds=[], actions=[], *, meta={}, ad_hoc=False ): """ :param schedules: From 2ac12f61284901997090d03c8ea6ba87afb7a716 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 00:15:32 -0500 Subject: [PATCH 54/84] Todo. --- notes/todo.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notes/todo.md b/notes/todo.md index 615482b2..6e49bfb2 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -45,14 +45,14 @@ - [x] actually stop the program at the stop time - [x] go through _stopping_ state - [x] Procstar stop method - - [ ] distinguish between "service"-type programs in Procstar program - - [ ] ProcessProgram stop method + - [ ] classic agent program stop method + - [ ] ProcessProgram: RunningProgram and stop method - [ ] "stop" operation + - [ ] distinguish between "service"-type programs in Procstar program - [ ] add stop time to run view - [ ] add stop time to runs table (NO?) - [ ] refactor `apsis.stop` module - [ ] update `_process_updates` to set the state for a stopping job - - [ ] classic agent program stop method (or skip?) 
- [ ] if `send_signal` raises, error the run - [ ] improve `apsis job` output style From 3400fd6d1d8c57b0614a15f47069a495aea5507f Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 00:16:40 -0500 Subject: [PATCH 55/84] Clean up no longer used. --- python/apsis/stop.py | 66 +------------------------------------------- 1 file changed, 1 insertion(+), 65 deletions(-) diff --git a/python/apsis/stop.py b/python/apsis/stop.py index 4eff6a79..40f2cbf6 100644 --- a/python/apsis/stop.py +++ b/python/apsis/stop.py @@ -1,72 +1,8 @@ -import asyncio import ora -from signal import Signals -from apsis.lib.json import TypedJso, to_array, check_schema +from apsis.lib.json import TypedJso, check_schema from apsis.lib.parse import parse_duration from apsis.lib.py import format_ctor -from apsis.lib.sys import to_signal -from apsis.states import State, to_state, reachable - -#------------------------------------------------------------------------------- - -class StopMethod(TypedJso): - - TYPE_NAMES = TypedJso.TypeNames() - - async def __call__(self, apsis, run): - raise NotImplementedError("StopMethod.__call__") - - - -class StopSignalMethod: - """ - Stops a program by sending a signal. - - Sends `signal`, waits `timeout` seconds, then sends SIGKILL. 
- """ - - def __init__(self, signal=Signals.SIGTERM, timeout=60): - self.signal = to_signal(signal) - self.timeout = float(timeout) - assert 0 <= self.timeout - - - def __eq__(self, other): - return other.signal == self.signal and other.timeout == self.timeout - - - def __repr__(self): - return format_ctor(self, signal=self.signal, timeout=self.timeout) - - - def __str__(self): - return f"signal {self.signal.name}" - - - def to_jso(self): - return { - **super().to_jso(), - "signal" : self.signal.name, - "timeout" : self.timeout, - } - - - @classmethod - def from_jso(cls, jso): - with check_schema(jso) as pop: - signal = pop("signal", to_signal, Signal.SIGTERM), - timeout = pop("timeout", float, 60), - return cls(signal, timeout) - - - async def __call__(self, apsis, run): - await apsis.send_signal(run, self.signal) - await asyncio.sleep(self.timeout) - if not run.state.finished: - await asyncio.send_signal(run, Signal.SIGKILL) - - #------------------------------------------------------------------------------- From 4299725373ca27b2b7ed247229d8035d0025ebd8 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 00:53:50 -0500 Subject: [PATCH 56/84] Fix stopping transition. --- python/apsis/apsis.py | 2 +- python/apsis/program/procstar/agent.py | 4 ++-- python/apsis/running.py | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index 696a2147..f703e536 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -346,7 +346,7 @@ def _update_metadata(self, run, meta): """ Updates run metadata, without transitioning. 
""" - assert run.state in {State.starting, State.running} + assert run.state in {State.starting, State.running, State.stopping} if meta is None or len(meta) == 0: return diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index 6b8000fb..5b99a2eb 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -435,8 +435,8 @@ async def updates(self): "proc_id": proc_id, } yield base.ProgramRunning( - run_state=self.run_state, - meta=_make_metadata(proc_id, res) + run_state =self.run_state, + meta =_make_metadata(proc_id, res) ) else: diff --git a/python/apsis/running.py b/python/apsis/running.py index 82576eba..c1822f89 100644 --- a/python/apsis/running.py +++ b/python/apsis/running.py @@ -87,17 +87,17 @@ async def stop(): duration = stop_time - now() log.debug(f"{run_id}: running for {duration:.3f} s until stop") await asyncio.sleep(duration) - # Ask the run to stop. - try: - await run._running_program.stop(run) - except: - log.info("program.stop() exception", exc_info=True) # Transition to stopping. apsis.run_log.record(run, "stopping") apsis._transition( run, State.stopping, run_state=run.run_state | {"stopping": True} ) + # Ask the run to stop. + try: + await run._running_program.stop() + except: + log.info("program.stop() exception", exc_info=True) # The main update loop handles updates in response. stop_task = asyncio.create_task(stop()) From 08f7968d6c2c704064cf0427e49739ef0cf2f229 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 00:54:19 -0500 Subject: [PATCH 57/84] Record stop signals sent in meta. 
--- python/apsis/program/procstar/agent.py | 49 +++++++++++++++----------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index 5b99a2eb..acd190c5 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -357,12 +357,13 @@ def __init__(self, run_id, program, cfg, run_state=None): The most recent `Result`, if any. """ super().__init__(run_id) - self.program = program - self.cfg = get_cfg(cfg, "procstar.agent", {}) - self.run_state = run_state + self.program = program + self.cfg = get_cfg(cfg, "procstar.agent", {}) + self.run_state = run_state - self.proc = None - self.stopping = False + self.proc = None + self.stopping = False + self.stop_signals = [] @property @@ -546,19 +547,18 @@ def more_output(): log.debug("expected final FdData") outputs = await _make_outputs(fd_data) + meta["stop"] = {"signals": [ s.name for s in self.stop_signals ]} - if ( - res.status.exit_code == 0 - or ( - # The program is stopping and the process exited from - # the stop signal. - self.stopping - and res.status.signal is not None - and Signals[res.status.signal] == self.__stop.signal - ) - ): + if res.status.exit_code == 0: # The process terminated successfully. yield ProgramSuccess(meta=meta, outputs=outputs) + elif ( + self.stopping + and res.status.signal is not None + and Signals[res.status.signal] == self.program.stop.signal + ): + # The process stopped with the expected signal. + yield ProgramSuccess(meta=meta, outputs=outputs) else: # The process terminated unsuccessfully. exit_code = res.status.exit_code @@ -608,17 +608,24 @@ async def stop(self): self.stopping = True # Send the stop signal. + self.stop_signals.append(stop.signal) await self.signal(stop.signal) if stop.grace_period is not None: - # Wait for the grace period to expire. - await asyncio.sleep(stop.grace_period) - # Send a kill signal. 
try: - await self.signal(Signals.SIGKILL) - except ValueError: - # Proc is gone; that's OK. + # Wait for the grace period to expire. + await asyncio.sleep(stop.grace_period) + except asyncio.CancelledError: + # That's what we were hoping for. pass + else: + # Send a kill signal. + try: + self.stop_signals.append(Signals.SIGKILL) + await self.signal(Signals.SIGKILL) + except ValueError: + # Proc is gone; that's OK. + pass async def signal(self, signal): From a31362fa0039cb06c745b5dc966b11f35714aad2 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 00:54:29 -0500 Subject: [PATCH 58/84] Clean up. --- python/apsis/program/base.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/apsis/program/base.py b/python/apsis/program/base.py index d3176dd0..3ba6c32b 100644 --- a/python/apsis/program/base.py +++ b/python/apsis/program/base.py @@ -341,13 +341,6 @@ def connect(self, run_id, run_state, cfg) -> RunningProgram: return LegacyRunningProgram(run_id, self, cfg, run_state) - async def stop(self, run_state): - """ - Instructs the running program to stop. - """ - raise NotImplementedError("program stop not implemented") - - #------------------------------------------------------------------------------- From 598ba04d40997ffb144010240bdafcba011675e0 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 00:54:33 -0500 Subject: [PATCH 59/84] Show stop time in run view. --- vue/src/views/RunView.vue | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vue/src/views/RunView.vue b/vue/src/views/RunView.vue index 5cfd54b3..157bc960 100644 --- a/vue/src/views/RunView.vue +++ b/vue/src/views/RunView.vue @@ -58,6 +58,10 @@ div td RunElapsed(:run="run") + tr(v-if="run.times.stop") + th scheduled stop + td: Timestamp(:time="run.times.stop") + Frame(title="Run Log") div table.fields.run-log From 691f4ff0bb04b3385511324cadcc776eb0a1d3a6 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 20:25:53 -0500 Subject: [PATCH 60/84] Test job. 
--- test/manual/procstar/ignore-term.py | 20 ++++++++++++++++++++ test/manual/procstar/jobs/ignore stop.yaml | 8 ++++++++ 2 files changed, 28 insertions(+) create mode 100755 test/manual/procstar/ignore-term.py create mode 100644 test/manual/procstar/jobs/ignore stop.yaml diff --git a/test/manual/procstar/ignore-term.py b/test/manual/procstar/ignore-term.py new file mode 100755 index 00000000..71c3fd6a --- /dev/null +++ b/test/manual/procstar/ignore-term.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python + +from argparse import ArgumentParser +import signal +import time + +parser = ArgumentParser() +parser.add_argument("sleep", metavar="SECS", type=float) +args = parser.parse_args() + +def sigterm(signum, frame): + sig = signal.Signals(signum) + print(f"ignoring {sig.name}") + +signal.signal(signal.Signals.SIGTERM, sigterm) + +print(f"sleeping for {args.sleep} sec") +time.sleep(args.sleep) +print("done") + diff --git a/test/manual/procstar/jobs/ignore stop.yaml b/test/manual/procstar/jobs/ignore stop.yaml new file mode 100644 index 00000000..c91db41f --- /dev/null +++ b/test/manual/procstar/jobs/ignore stop.yaml @@ -0,0 +1,8 @@ +params: [time] + +program: + type: procstar + argv: + - /home/alex/dev/apsis/test/manual/procstar/ignore-term.py + - "{{ time }}" + From be75ee078370e7b5b2c346571029dca2242c01f4 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Sun, 5 Jan 2025 20:26:02 -0500 Subject: [PATCH 61/84] Add stop time to runs table. 
--- notes/todo.md | 8 +++++--- vue/src/components/RunsList.vue | 28 ++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/notes/todo.md b/notes/todo.md index 6e49bfb2..da7c5aa2 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -45,14 +45,16 @@ - [x] actually stop the program at the stop time - [x] go through _stopping_ state - [x] Procstar stop method + - [x] add stop time to runs table - [ ] classic agent program stop method - [ ] ProcessProgram: RunningProgram and stop method - [ ] "stop" operation - - [ ] distinguish between "service"-type programs in Procstar program - - [ ] add stop time to run view - - [ ] add stop time to runs table (NO?) + - [x] add stop time to run view - [ ] refactor `apsis.stop` module - [ ] update `_process_updates` to set the state for a stopping job + - clarify the rules for `run_state` updates + - [ ] (?) distinguish between "service"-type programs in Procstar program + - [ ] (?) "schedule stop" operation? - [ ] if `send_signal` raises, error the run - [ ] improve `apsis job` output style diff --git a/vue/src/components/RunsList.vue b/vue/src/components/RunsList.vue index a6b051f6..9bdbe8ee 100644 --- a/vue/src/components/RunsList.vue +++ b/vue/src/components/RunsList.vue @@ -139,6 +139,7 @@ div col(style="width: 10rem") col(style="width: 10rem") col(style="width: 6rem") + col(style="width: 10rem") col(style="width: 4rem") thead @@ -153,6 +154,7 @@ div th.col-schedule-time Schedule th.col-start-time Start th.col-elapsed Elapsed + th.col-stop-time Scheduled Stop th.col-operations Operations tbody @@ -209,6 +211,8 @@ div Timestamp(v-if="run.times.running" :time="run.times.running") td.col-elapsed RunElapsed(:run="run") + td.col-stop-time + Timestamp(v-if="run.times.stop" :time="run.times.stop" :class="{ overdue: isOverdue(run) }") td.col-operations HamburgerMenu(v-if="OPERATIONS[run.state].length > 0") OperationButton( @@ -257,11 +261,11 @@ const COUNTS = [20, 50, 100, 200, 500, 1000] /** * 
Constructs a predicate fn for matching runs with `args`. - * + * * `args` is an array of "param=value" or "param" strings. The former requires * the given param have the cooresponding arg value. The latter requires that * it have any arg value at all. The result is the conjunction of these. - * + * * For example, the `args` array ["fruit=mango", "color"] produced a predicate * that matches all runs that both have a "fruit" param with arg value "mango", * and also have any value at all ofr the "color" param. @@ -439,7 +443,7 @@ export default { let runs = groups.groups // FIXME let now = (new Date()).toISOString() - + // Determine the time to center around. const time = this.time === 'now' ? now : this.time // Find the index corresponding to the center time. @@ -527,7 +531,7 @@ export default { } }, }, - + watch: { // If parent updates the query prop, update our state correspondingly. query: { @@ -597,6 +601,14 @@ export default { else return '' }, + + isOverdue(run) { + return run.times.stop + && ( run.state === 'starting' + || run.state === 'running' + || run.state === 'stopping') + && new Date(run.times.stop) < new Date() + }, }, } @@ -626,7 +638,7 @@ export default { white-space: nowrap; line-height: 28px; - + > div { display: grid; height: 30px; @@ -721,12 +733,16 @@ table.runlist { vertical-align: bottom; } - .col-schedule-time, .col-start-time { + .col-schedule-time, .col-start-time, .col-stop-time { font-size: 90%; color: $global-light-color; text-align: right; } + .col-stop-time .overdue { + color: #b07040; + } + .col-group { text-align: right; white-space: nowrap; From fcebe46e8cd48c59d1e1a5f34b637700ef278281 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Mon, 6 Jan 2025 13:36:33 -0500 Subject: [PATCH 62/84] Procstar stop tests. 
--- python/apsis/jobs.py | 10 ++++++ test/int/instance.py | 5 ++- test/int/procstar/ignore-term | 20 ++++++++++++ test/int/procstar/test_stop.py | 60 ++++++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 1 deletion(-) create mode 100755 test/int/procstar/ignore-term create mode 100644 test/int/procstar/test_stop.py diff --git a/python/apsis/jobs.py b/python/apsis/jobs.py index a5d4e942..f277414f 100644 --- a/python/apsis/jobs.py +++ b/python/apsis/jobs.py @@ -151,6 +151,10 @@ def load_yaml_file(path, job_id): return load_yaml(file, job_id) +def dump_yaml(file, job): + yaml.dump(job_to_jso(job), file) + + def list_yaml_files(dir_path): dir_path = Path(dir_path) for dir, _, names in os.walk(dir_path): @@ -267,6 +271,12 @@ def load_jobs_dir(path): return jobs_dir +def dump_job(jobs_dir_path, job): + path = (jobs_dir_path / job.job_id).with_suffix(".yaml") + with path.open("w") as file: + dump_yaml(file, job) + + #------------------------------------------------------------------------------- # FIXME: This feels so awkward. Is there a better design? diff --git a/test/int/instance.py b/test/int/instance.py index 9b99545d..a298087e 100644 --- a/test/int/instance.py +++ b/test/int/instance.py @@ -196,7 +196,10 @@ def wait_run( run_id, *, timeout=60, - wait_states=("new", "scheduled", "waiting", "starting", "running"), + wait_states=( + "new", "scheduled", "waiting", + "starting", "running", "stopping" + ), ): """ Polls for a run to no longer be running. 
diff --git a/test/int/procstar/ignore-term b/test/int/procstar/ignore-term new file mode 100755 index 00000000..71c3fd6a --- /dev/null +++ b/test/int/procstar/ignore-term @@ -0,0 +1,20 @@ +#!/usr/bin/env python + +from argparse import ArgumentParser +import signal +import time + +parser = ArgumentParser() +parser.add_argument("sleep", metavar="SECS", type=float) +args = parser.parse_args() + +def sigterm(signum, frame): + sig = signal.Signals(signum) + print(f"ignoring {sig.name}") + +signal.signal(signal.Signals.SIGTERM, sigterm) + +print(f"sleeping for {args.sleep} sec") +time.sleep(args.sleep) +print("done") + diff --git a/test/int/procstar/test_stop.py b/test/int/procstar/test_stop.py new file mode 100644 index 00000000..dea2010a --- /dev/null +++ b/test/int/procstar/test_stop.py @@ -0,0 +1,60 @@ +from pathlib import Path +import time + +from procstar_instance import ApsisService +from apsis.jobs import jso_to_job, dump_job + +#------------------------------------------------------------------------------- + +IGNORE_TERM_PATH = Path(__file__).parent / "ignore-term" + +JOB_ID = "ignore term" +JOB = jso_to_job({ + "params": ["time"], + "program": { + "type": "procstar", + "argv": [IGNORE_TERM_PATH, "{{ time }}"], + "stop": { + "grace_period": 2, + }, + } +}, JOB_ID) + +def test_dont_stop(): + svc = ApsisService() + dump_job(svc.jobs_dir, JOB) + with svc, svc.agent(): + # Schedule a 1 sec run but tell Apsis to stop it after 3 sec. + run_id = svc.client.schedule(JOB_ID, {"time": "1"}, stop_time="+3s")["run_id"] + res = svc.wait_run(run_id) + + assert res["state"] == "success" + meta = res["meta"]["program"] + assert meta["status"]["exit_code"] == 0 + assert meta["stop"]["signals"] == [] + assert meta["times"]["elapsed"] < 2 + + +def test_kill(): + svc = ApsisService() + dump_job(svc.jobs_dir, JOB) + with svc, svc.agent(): + # Schedule a 5 sec run but tell Apsis to stop it after 1 sec. 
The + # process ignores SIGTERM so Apsis will send SIGQUIT after the grace + # period. + run_id = svc.client.schedule(JOB_ID, {"time": "5"}, stop_time="+1s")["run_id"] + + time.sleep(1.5) + res = svc.client.get_run(run_id) + assert res["state"] == "stopping" + meta = res["meta"]["program"] + + res = svc.wait_run(run_id) + + assert res["state"] == "failure" + meta = res["meta"]["program"] + assert meta["status"]["signal"] == "SIGKILL" + assert meta["stop"]["signals"] == ["SIGTERM", "SIGKILL"] + assert meta["times"]["elapsed"] > 2.8 + + From be220a9dab950958227d3cf67279c64b3bc3553d Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Mon, 6 Jan 2025 13:49:25 -0500 Subject: [PATCH 63/84] Another test. --- test/int/procstar/ignore-term | 7 +++-- test/int/procstar/test_stop.py | 51 ++++++++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/test/int/procstar/ignore-term b/test/int/procstar/ignore-term index 71c3fd6a..3502cfc5 100755 --- a/test/int/procstar/ignore-term +++ b/test/int/procstar/ignore-term @@ -3,6 +3,7 @@ from argparse import ArgumentParser import signal import time +import sys parser = ArgumentParser() parser.add_argument("sleep", metavar="SECS", type=float) @@ -10,11 +11,11 @@ args = parser.parse_args() def sigterm(signum, frame): sig = signal.Signals(signum) - print(f"ignoring {sig.name}") + print(f"ignoring {sig.name}", file=sys.stderr) signal.signal(signal.Signals.SIGTERM, sigterm) -print(f"sleeping for {args.sleep} sec") +print(f"sleeping for {args.sleep} sec", file=sys.stderr) time.sleep(args.sleep) -print("done") +print("done", file=sys.stderr) diff --git a/test/int/procstar/test_stop.py b/test/int/procstar/test_stop.py index dea2010a..4917e65d 100644 --- a/test/int/procstar/test_stop.py +++ b/test/int/procstar/test_stop.py @@ -6,10 +6,16 @@ #------------------------------------------------------------------------------- -IGNORE_TERM_PATH = Path(__file__).parent / "ignore-term" +SLEEP_JOB = jso_to_job({ + 
"params": ["time"], + "program": { + "type": "procstar", + "argv": ["/usr/bin/sleep", "{{ time }}"], + } +}, "sleep") -JOB_ID = "ignore term" -JOB = jso_to_job({ +IGNORE_TERM_PATH = Path(__file__).parent / "ignore-term" +IGNORE_TERM_JOB = jso_to_job({ "params": ["time"], "program": { "type": "procstar", @@ -18,14 +24,36 @@ "grace_period": 2, }, } -}, JOB_ID) +}, "ignore term") + +def test_stop(): + svc = ApsisService() + dump_job(svc.jobs_dir, SLEEP_JOB) + with svc, svc.agent(): + # Schedule a 3 sec job but tell Apsis to stop it after 1 sec. + run_id = svc.client.schedule( + SLEEP_JOB.job_id, {"time": "3"}, + stop_time="+1s", + )["run_id"] + res = svc.wait_run(run_id) + + # The run was successfully stopped by Apsis, by sending it SIGTERM. + assert res["state"] == "success" + meta = res["meta"]["program"] + assert meta["status"]["signal"] == "SIGTERM" + assert meta["stop"]["signals"] == ["SIGTERM"] + assert meta["times"]["elapsed"] < 2 + def test_dont_stop(): svc = ApsisService() - dump_job(svc.jobs_dir, JOB) + dump_job(svc.jobs_dir, IGNORE_TERM_JOB) with svc, svc.agent(): # Schedule a 1 sec run but tell Apsis to stop it after 3 sec. - run_id = svc.client.schedule(JOB_ID, {"time": "1"}, stop_time="+3s")["run_id"] + run_id = svc.client.schedule( + IGNORE_TERM_JOB.job_id, {"time": "1"}, + stop_time="+3s" + )["run_id"] res = svc.wait_run(run_id) assert res["state"] == "success" @@ -37,12 +65,15 @@ def test_dont_stop(): def test_kill(): svc = ApsisService() - dump_job(svc.jobs_dir, JOB) + dump_job(svc.jobs_dir, IGNORE_TERM_JOB) with svc, svc.agent(): # Schedule a 5 sec run but tell Apsis to stop it after 1 sec. The # process ignores SIGTERM so Apsis will send SIGQUIT after the grace # period. 
- run_id = svc.client.schedule(JOB_ID, {"time": "5"}, stop_time="+1s")["run_id"] + run_id = svc.client.schedule( + IGNORE_TERM_JOB.job_id, {"time": "5"}, + stop_time="+1s" + )["run_id"] time.sleep(1.5) res = svc.client.get_run(run_id) @@ -57,4 +88,8 @@ def test_kill(): assert meta["stop"]["signals"] == ["SIGTERM", "SIGKILL"] assert meta["times"]["elapsed"] > 2.8 + output = svc.client.get_output(run_id, "output").decode() + assert "ignoring SIGTERM" in output + assert "done" not in output + From 278a5593279d9f6cb07ef608e4b491dab7b4983f Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 7 Jan 2025 17:35:50 -0500 Subject: [PATCH 64/84] Update unit tests. --- test/unit/test_agent_program.py | 8 +++++--- test/unit/test_program_noop.py | 7 ++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/test/unit/test_agent_program.py b/test/unit/test_agent_program.py index a46b571b..11c97d8a 100644 --- a/test/unit/test_agent_program.py +++ b/test/unit/test_agent_program.py @@ -7,11 +7,13 @@ #------------------------------------------------------------------------------- async def _run(): - prog = apsis.program.AgentProgram(["/bin/sleep", "1"]) + prog = apsis.program.AgentProgram(["/bin/sleep", "1"]).bind({}) # Start the program. - running, coro = await prog.start("testrun", cfg={}) + running = prog.run("testrun", cfg={}) # Wait for it to finish. 
- return await coro + async for update in running.updates: + pass + return update @pytest.mark.asyncio diff --git a/test/unit/test_program_noop.py b/test/unit/test_program_noop.py index fc5d4cf6..61f45899 100644 --- a/test/unit/test_program_noop.py +++ b/test/unit/test_program_noop.py @@ -12,10 +12,11 @@ async def test_duration(): "duration": "0.75", } - prog = apsis.program.Program.from_jso(JSO) + prog = apsis.program.Program.from_jso(JSO).bind({}) start = time.monotonic() - running, coro = await prog.start("testrun", cfg={}) - _ = await coro + running = prog.run("testrun", cfg={}) + async for _ in running.updates: + pass elapsed = time.monotonic() - start assert elapsed > 0.7 From ac644e5743f2ab27c0842a3f4b3fbbb90e9db9a8 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 7 Jan 2025 18:40:34 -0500 Subject: [PATCH 65/84] Unit tests for ProcessProgram. --- test/unit/test_process_program.py | 42 +++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 test/unit/test_process_program.py diff --git a/test/unit/test_process_program.py b/test/unit/test_process_program.py new file mode 100644 index 00000000..8989aaf4 --- /dev/null +++ b/test/unit/test_process_program.py @@ -0,0 +1,42 @@ +import pytest + +from apsis.program import Program, ProgramSuccess + +#------------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_process_program(): + program = Program.from_jso({ + "type": "apsis.program.process.ProcessProgram", + "argv": ["/usr/bin/echo", "Hello, {{ name }}!"], + }) + program = program.bind({"name": "world"}) + + running = program.run("testrun", cfg={}) + async for update in running.updates: + pass + + assert isinstance(update, ProgramSuccess) + assert update.meta["return_code"] == 0 + output = update.outputs["output"] + assert output.data == b"Hello, world!\n" + + +@pytest.mark.asyncio +async def test_shell_command_program(): + program = Program.from_jso({ + "type": 
"apsis.program.process.ShellCommandProgram", + "command": "echo 'Hello, {{ name }}!'", + }) + program = program.bind({"name": "world"}) + + running = program.run("testrun", cfg={}) + async for update in running.updates: + pass + + assert isinstance(update, ProgramSuccess) + assert update.meta["return_code"] == 0 + output = update.outputs["output"] + assert output.data == b"Hello, world!\n" + + From b36bd13982428484fa9e28066d4991abecbd8269 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Tue, 7 Jan 2025 18:40:47 -0500 Subject: [PATCH 66/84] Add BoundProcessProgram. --- python/apsis/program/process.py | 100 +++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 35 deletions(-) diff --git a/python/apsis/program/process.py b/python/apsis/program/process.py index 0fcbfd78..f80a7d4d 100644 --- a/python/apsis/program/process.py +++ b/python/apsis/program/process.py @@ -9,6 +9,7 @@ Program, ProgramRunning, ProgramSuccess, ProgramFailure, ProgramError, program_outputs ) +from apsis.lib.json import check_schema from apsis.lib.sys import get_username from apsis.runs import template_expand, join_args @@ -30,7 +31,7 @@ def __str__(self): def bind(self, args): argv = tuple( template_expand(a, args) for a in self.__argv ) - return type(self)(argv) + return BoundProcessProgram(argv) def to_jso(self): @@ -42,7 +43,69 @@ def to_jso(self): @classmethod def from_jso(cls, jso): - return cls(jso["argv"]) + with check_schema(jso) as pop: + argv = pop("argv") + return cls(argv) + + + +#------------------------------------------------------------------------------- + +class ShellCommandProgram(Program): + + def __init__(self, command): + self.__command = str(command) + + + def bind(self, args): + command = template_expand(self.__command, args) + argv = ["/bin/bash", "-c", command] + return BoundProcessProgram(argv) + + + def __str__(self): + return self.__command + + + def to_jso(self): + return { + **super().to_jso(), + "command": self.__command, + } + + + @classmethod + def 
from_jso(cls, jso): + with check_schema(jso) as pop: + command = pop("command", str) + return cls(command) + + + +#------------------------------------------------------------------------------- + +class BoundProcessProgram(Program): + + def __init__(self, argv): + self.__argv = tuple( str(a) for a in argv ) + + + def __str__(self): + return join_args(self.__argv) + + + def to_jso(self): + return { + **super().to_jso(), + "argv": self.argv, + } + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso) as pop: + argv = pop("argv") + return cls(argv) async def start(self, run_id, cfg): @@ -101,36 +164,3 @@ async def wait(self, run_id, proc): -#------------------------------------------------------------------------------- - -class ShellCommandProgram(ProcessProgram): - - def __init__(self, command): - command = str(command) - argv = ["/bin/bash", "-c", command] - super().__init__(argv) - self.__command = command - - - def bind(self, args): - command = template_expand(self.__command, args) - return type(self)(command) - - - def __str__(self): - return self.__command - - - def to_jso(self): - return { - **Program.to_jso(self), - "command" : self.__command, - } - - - @classmethod - def from_jso(cls, jso): - return cls(jso["command"]) - - - From 88bb5404d85b8b0e7e38052eae3e85ed0b373d7b Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 09:31:47 -0500 Subject: [PATCH 67/84] RunningProgram for process programs; factor out common Stop. 
--- python/apsis/program/process.py | 156 +++++++++++++++++++------ python/apsis/program/procstar/agent.py | 50 +------- 2 files changed, 128 insertions(+), 78 deletions(-) diff --git a/python/apsis/program/process.py b/python/apsis/program/process.py index f80a7d4d..094f81c8 100644 --- a/python/apsis/program/process.py +++ b/python/apsis/program/process.py @@ -1,28 +1,82 @@ import asyncio +from dataclasses import dataclass import logging import os from pathlib import Path import pwd +from signal import Signals import socket from .base import ( - Program, ProgramRunning, ProgramSuccess, ProgramFailure, ProgramError, + Program, RunningProgram, + ProgramRunning, ProgramSuccess, ProgramFailure, ProgramError, program_outputs ) -from apsis.lib.json import check_schema -from apsis.lib.sys import get_username +from apsis.lib import memo +from apsis.lib.json import check_schema, ifkey +from apsis.lib.parse import nparse_duration +from apsis.lib.py import or_none +from apsis.lib.sys import get_username, to_signal from apsis.runs import template_expand, join_args log = logging.getLogger(__name__) TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%kZ" +ntemplate_expand = or_none(template_expand) + +#------------------------------------------------------------------------------- + +@dataclass +class Stop: + """ + Specification for how to stop a running agent program. 
+ """ + + signal: Signals = Signals.SIGTERM + grace_period: int = 60 + + def to_jso(self): + cls = type(self) + return ( + ifkey("signal", self.signal, cls.signal) + | ifkey("grace_period", self.grace_period, cls.grace_period) + ) + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso or {}) as pop: + signal = pop("signal", Signals.__getattr__, cls.signal) + grace_period = pop("grace_period", int, cls.grace_period) + return cls(signal, grace_period) + + + @classmethod + def from_jso_str(cls, jso): + with check_schema(jso or {}) as pop: + signal = pop("signal", str, default=cls.signal.name) + grace_period = pop("grace_period", default=cls.grace_period) + return cls(signal, grace_period) + + + def bind(self, args): + return type(self)( + to_signal(template_expand(self.signal, args)), + nparse_duration(ntemplate_expand(self.grace_period, args)) + ) + + + +Stop.DEFAULT = Stop() + #------------------------------------------------------------------------------- class ProcessProgram(Program): - def __init__(self, argv): + def __init__(self, argv, *, stop=Stop.DEFAULT): self.__argv = tuple( str(a) for a in argv ) + self.__stop = stop def __str__(self): @@ -31,21 +85,23 @@ def __str__(self): def bind(self, args): argv = tuple( template_expand(a, args) for a in self.__argv ) - return BoundProcessProgram(argv) + stop = self.__stop.bind(args) + return BoundProcessProgram(argv, stop=stop) def to_jso(self): return { **super().to_jso(), "argv" : list(self.__argv), - } + } | ifkey("stop", self.__stop.to_jso(), {}) @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: argv = pop("argv") - return cls(argv) + stop = Stop.from_jso_str(pop("stop", default={})) + return cls(argv, stop=stop) @@ -53,14 +109,16 @@ def from_jso(cls, jso): class ShellCommandProgram(Program): - def __init__(self, command): + def __init__(self, command, *, stop=Stop.DEFAULT): self.__command = str(command) + self.__stop = stop def bind(self, args): command = 
template_expand(self.__command, args) - argv = ["/bin/bash", "-c", command] - return BoundProcessProgram(argv) + argv = ["/bin/bash", "-c", command] + stop = self.__stop.bind(args) + return BoundProcessProgram(argv, stop=stop) def __str__(self): @@ -71,14 +129,15 @@ def to_jso(self): return { **super().to_jso(), "command": self.__command, - } + } | ifkey("stop", self.__stop.to_jso(), {}) @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: command = pop("command", str) - return cls(command) + stop = Stop.from_jso_str(pop("stop", default={})) + return cls(command, stop=stop) @@ -86,30 +145,52 @@ def from_jso(cls, jso): class BoundProcessProgram(Program): - def __init__(self, argv): - self.__argv = tuple( str(a) for a in argv ) + def __init__(self, argv, *, stop=Stop.DEFAULT): + self.argv = tuple( str(a) for a in argv ) + self.stop = stop def __str__(self): - return join_args(self.__argv) + return join_args(self.argv) def to_jso(self): return { **super().to_jso(), "argv": self.argv, - } + } | ifkey("stop", self.stop.to_jso(), {}) @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: argv = pop("argv") - return cls(argv) + stop = Stop.from_jso(pop("stop", default={})) + return cls(argv, stop=stop) + + + def run(self, run_id, cfg) -> RunningProgram: + return RunningProcessProgram(self, run_id) + + + + +#------------------------------------------------------------------------------- + +class RunningProcessProgram(RunningProgram): + + # FIXME: Configure? 
+ grace_period = 60 + def __init__(self, program, run_id): + super().__init__(run_id) + self.program = program + self.process = None - async def start(self, run_id, cfg): - argv = self.__argv + + @memo.property + async def updates(self): + argv = self.program.argv log.info(f"starting program: {join_args(argv)}") meta = { @@ -120,11 +201,11 @@ async def start(self, run_id, cfg): try: with open("/dev/null") as stdin: - proc = await asyncio.create_subprocess_exec( + self.proc = await asyncio.create_subprocess_exec( *argv, executable =Path(argv[0]), stdin =stdin, - # Merge stderr with stdin. FIXME: Do better. + # Merge stderr with stdin. stdout =asyncio.subprocess.PIPE, stderr =asyncio.subprocess.STDOUT, ) @@ -133,15 +214,12 @@ async def start(self, run_id, cfg): # Error starting. raise ProgramError(str(exc), meta=meta) - else: - # Started successfully. - done = self.wait(run_id, proc) - return ProgramRunning({"pid": proc.pid}, meta=meta), done - + # Started successfully. + yield ProgramRunning({"pid": self.proc.pid}, meta=meta) - async def wait(self, run_id, proc): - stdout, stderr = await proc.communicate() - return_code = proc.returncode + stdout, stderr = await self.proc.communicate() + return_code = self.proc.returncode + self.proc = None log.info(f"complete with return code {return_code}") assert stderr is None assert return_code is not None @@ -152,15 +230,27 @@ async def wait(self, run_id, proc): outputs = program_outputs(stdout) if return_code == 0: - return ProgramSuccess(meta=meta, outputs=outputs) + yield ProgramSuccess(meta=meta, outputs=outputs) else: message = f"program failed: return code {return_code}" - raise ProgramFailure(message, meta=meta, outputs=outputs) + yield ProgramFailure(message, meta=meta, outputs=outputs) + + + def connect(self, run_id, run_state, cfg) -> RunningProgram: + pid = run_state["pid"] + raise NotImplementedError(f"can't reconnect to running proc {pid}") + + + async def signal(self, signal): + assert self.process is not None + 
self.process.send_signal(signal) - # FIXME: Implement signal(). - # FIXME: Implement stop(). + async def stop(self): + assert self.process is not None + self.process.send_signal(self.program.stop.signal) + # FIXME diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index acd190c5..d2b8ecfd 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -1,5 +1,4 @@ import asyncio -from dataclasses import dataclass import logging import procstar.spec from procstar.agent.exc import NoConnectionError, NoOpenConnectionInGroup, ProcessUnknownError @@ -16,6 +15,7 @@ from apsis.procstar import get_agent_server from apsis.program import base from apsis.program.base import (ProgramSuccess, ProgramFailure, ProgramError) +from apsis.program.process import Stop from apsis.runs import join_args, template_expand log = logging.getLogger(__name__) @@ -144,34 +144,6 @@ async def _make_outputs(fd_data): return base.program_outputs(output, length=length, compression=None) -#------------------------------------------------------------------------------- - -@dataclass -class Stop: - """ - Specification for how to stop a running agent program. 
- """ - - signal: Signals = Signals.SIGTERM - grace_period: int = 60 - - def to_jso(self): - cls = type(self) - return ( - ifkey("signal", self.signal, cls.signal) - | ifkey("grace_period", self.grace_period, cls.grace_period) - ) - - - @classmethod - def from_jso(cls, jso): - with check_schema(jso or {}) as pop: - signal = pop("signal", Signals.__getattr__, cls.signal) - grace_period = pop("grace_period", int, cls.grace_period) - return cls(signal, grace_period) - - - #------------------------------------------------------------------------------- class _ProcstarProgram(base.Program): @@ -183,7 +155,7 @@ def __init__( self, *, group_id =procstar.proto.DEFAULT_GROUP, sudo_user =None, - stop =Stop(Stop.signal.name, Stop.grace_period), + stop =Stop.DEFAULT, ): super().__init__() self.__group_id = str(group_id) @@ -192,43 +164,31 @@ def __init__( def _bind(self, argv, args): - ntemplate_expand = or_none(template_expand) - stop = Stop( - to_signal(template_expand(self.__stop.signal, args)), - nparse_duration(ntemplate_expand(self.__stop.grace_period, args)) - ) return BoundProcstarProgram( argv, group_id =ntemplate_expand(self.__group_id, args), sudo_user =ntemplate_expand(self.__sudo_user, args), - stop =stop, + stop =self.__stop.bind(args), ) def to_jso(self): - stop = ( - ifkey("signal", self.__stop.signal, Stop.signal.name) - | ifkey("grace_period", self.__stop.grace_period, Stop.grace_period) - ) return ( super().to_jso() | { "group_id" : self.__group_id, } | if_not_none("sudo_user", self.__sudo_user) - | ifkey("stop", stop, {}) + | ifkey("stop", self.__stop.to_jso(), {}) ) @staticmethod def _from_jso(pop): - with check_schema(pop("stop", default={})) as spop: - signal = spop("signal", str, default=Stop.signal.name) - grace_period = spop("grace_period", default=Stop.grace_period) return dict( group_id =pop("group_id", default=procstar.proto.DEFAULT_GROUP), sudo_user =pop("sudo_user", default=None), - stop =Stop(signal, grace_period), + stop 
=Stop.from_jso_str(pop("stop", default={})), ) From e96257be31c892183f310566a7d00813bb866f0f Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 09:53:40 -0500 Subject: [PATCH 68/84] Round-trip JSO tests for programs. --- python/apsis/program/process.py | 36 ++++++++-------- python/apsis/program/procstar/agent.py | 34 +++++++-------- test/unit/test_process_program.py | 43 +++++++++++++++++++ test/unit/test_procstar_program.py | 57 ++++++++++++++++++++++++++ 4 files changed, 137 insertions(+), 33 deletions(-) create mode 100644 test/unit/test_procstar_program.py diff --git a/python/apsis/program/process.py b/python/apsis/program/process.py index 094f81c8..db7047d0 100644 --- a/python/apsis/program/process.py +++ b/python/apsis/program/process.py @@ -30,7 +30,11 @@ @dataclass class Stop: """ - Specification for how to stop a running agent program. + Specification for how to stop a running process. + + 1. Send `signal` to the process. + 2. Wait up to `grace_period` sec. + 3. If the process has not terminated, send SIGKILL. 
""" signal: Signals = Signals.SIGTERM @@ -47,7 +51,7 @@ def to_jso(self): @classmethod def from_jso(cls, jso): with check_schema(jso or {}) as pop: - signal = pop("signal", Signals.__getattr__, cls.signal) + signal = pop("signal", to_signal, cls.signal) grace_period = pop("grace_period", int, cls.grace_period) return cls(signal, grace_period) @@ -75,25 +79,25 @@ def bind(self, args): class ProcessProgram(Program): def __init__(self, argv, *, stop=Stop.DEFAULT): - self.__argv = tuple( str(a) for a in argv ) - self.__stop = stop + self.argv = tuple( str(a) for a in argv ) + self.stop = stop def __str__(self): - return join_args(self.__argv) + return join_args(self.argv) def bind(self, args): - argv = tuple( template_expand(a, args) for a in self.__argv ) - stop = self.__stop.bind(args) + argv = tuple( template_expand(a, args) for a in self.argv ) + stop = self.stop.bind(args) return BoundProcessProgram(argv, stop=stop) def to_jso(self): return { **super().to_jso(), - "argv" : list(self.__argv), - } | ifkey("stop", self.__stop.to_jso(), {}) + "argv" : list(self.argv), + } | ifkey("stop", self.stop.to_jso(), {}) @classmethod @@ -110,26 +114,26 @@ def from_jso(cls, jso): class ShellCommandProgram(Program): def __init__(self, command, *, stop=Stop.DEFAULT): - self.__command = str(command) - self.__stop = stop + self.command = str(command) + self.stop = stop def bind(self, args): - command = template_expand(self.__command, args) + command = template_expand(self.command, args) argv = ["/bin/bash", "-c", command] - stop = self.__stop.bind(args) + stop = self.stop.bind(args) return BoundProcessProgram(argv, stop=stop) def __str__(self): - return self.__command + return self.command def to_jso(self): return { **super().to_jso(), - "command": self.__command, - } | ifkey("stop", self.__stop.to_jso(), {}) + "command": self.command, + } | ifkey("stop", self.stop.to_jso(), {}) @classmethod diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py 
index d2b8ecfd..083b9fda 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -158,17 +158,17 @@ def __init__( stop =Stop.DEFAULT, ): super().__init__() - self.__group_id = str(group_id) - self.__sudo_user = None if sudo_user is None else str(sudo_user) - self.__stop = stop + self.group_id = str(group_id) + self.sudo_user = None if sudo_user is None else str(sudo_user) + self.stop = stop def _bind(self, argv, args): return BoundProcstarProgram( argv, - group_id =ntemplate_expand(self.__group_id, args), - sudo_user =ntemplate_expand(self.__sudo_user, args), - stop =self.__stop.bind(args), + group_id =ntemplate_expand(self.group_id, args), + sudo_user =ntemplate_expand(self.sudo_user, args), + stop =self.stop.bind(args), ) @@ -176,10 +176,10 @@ def to_jso(self): return ( super().to_jso() | { - "group_id" : self.__group_id, + "group_id" : self.group_id, } - | if_not_none("sudo_user", self.__sudo_user) - | ifkey("stop", self.__stop.to_jso(), {}) + | if_not_none("sudo_user", self.sudo_user) + | ifkey("stop", self.stop.to_jso(), {}) ) @@ -199,20 +199,20 @@ class ProcstarProgram(_ProcstarProgram): def __init__(self, argv, **kw_args): super().__init__(**kw_args) - self.__argv = [ str(a) for a in argv ] + self.argv = [ str(a) for a in argv ] def __str__(self): - return join_args(self.__argv) + return join_args(self.argv) def bind(self, args): - argv = tuple( template_expand(a, args) for a in self.__argv ) + argv = tuple( template_expand(a, args) for a in self.argv ) return super()._bind(argv, args) def to_jso(self): - return super().to_jso() | {"argv" : self.__argv} + return super().to_jso() | {"argv" : self.argv} @classmethod @@ -232,20 +232,20 @@ class ProcstarShellProgram(_ProcstarProgram): def __init__(self, command, **kw_args): super().__init__(**kw_args) - self.__command = str(command) + self.command = str(command) def __str__(self): - return self.__command + return self.command def bind(self, args): - argv = [self.SHELL, 
"-c", template_expand(self.__command, args)] + argv = [self.SHELL, "-c", template_expand(self.command, args)] return super()._bind(argv, args) def to_jso(self): - return super().to_jso() | {"command" : self.__command} + return super().to_jso() | {"command" : self.command} @classmethod diff --git a/test/unit/test_process_program.py b/test/unit/test_process_program.py index 8989aaf4..2f5c1dbb 100644 --- a/test/unit/test_process_program.py +++ b/test/unit/test_process_program.py @@ -1,4 +1,5 @@ import pytest +from signal import Signals from apsis.program import Program, ProgramSuccess @@ -40,3 +41,45 @@ async def test_shell_command_program(): assert output.data == b"Hello, world!\n" +def test_process_program_jso(): + program = Program.from_jso({ + "type": "apsis.program.process.ProcessProgram", + "argv": ["/usr/bin/echo", "Hello, {{ name }}!"], + "stop": {"grace_period": 30}, + }) + + # JSO round trip. + program = Program.from_jso(program.to_jso()) + assert list(program.argv) == ["/usr/bin/echo", "Hello, {{ name }}!"] + assert program.stop.signal == "SIGTERM" # default + assert program.stop.grace_period == 30 + + # Bind and do it again. + program = program.bind({"name": "Bob"}) + program = Program.from_jso(program.to_jso()) + assert list(program.argv) == ["/usr/bin/echo", "Hello, Bob!"] + assert program.stop.signal == Signals.SIGTERM # default + assert program.stop.grace_period == 30 + + +def test_shell_command_program_jso(): + program = Program.from_jso({ + "type": "apsis.program.process.ShellCommandProgram", + "command": "echo 'Hello, {{ name }}!'", + "stop": {"grace_period": 30}, + }) + + # JSO round trip. + program = Program.from_jso(program.to_jso()) + assert program.command == "echo 'Hello, {{ name }}!'" + assert program.stop.signal == "SIGTERM" # default + assert program.stop.grace_period == 30 + + # Bind and do it again. 
+ program = program.bind({"name": "Bob"}) + program = Program.from_jso(program.to_jso()) + assert "echo 'Hello, Bob!'" in program.argv[2] + assert program.stop.signal == Signals.SIGTERM # default + assert program.stop.grace_period == 30 + + diff --git a/test/unit/test_procstar_program.py b/test/unit/test_procstar_program.py new file mode 100644 index 00000000..86f6d768 --- /dev/null +++ b/test/unit/test_procstar_program.py @@ -0,0 +1,57 @@ +from signal import Signals + +from apsis.program import Program + +#------------------------------------------------------------------------------- + +def test_process_program_jso(): + program = Program.from_jso({ + "type" : "apsis.program.procstar.agent.ProcstarProgram", + "argv" : ["/usr/bin/echo", "Hello, {{ name }}!"], + "stop" : {"signal": "SIGUSR1"}, + "group_id" : "prod", + }) + + # JSO round trip. + program = Program.from_jso(program.to_jso()) + assert list(program.argv) == ["/usr/bin/echo", "Hello, {{ name }}!"] + assert program.group_id == "prod" + assert program.sudo_user is None + assert program.stop.signal == "SIGUSR1" + assert program.stop.grace_period == 60 + + # Bind and do it again. + program = program.bind({"name": "Bob"}) + program = Program.from_jso(program.to_jso()) + assert list(program.argv) == ["/usr/bin/echo", "Hello, Bob!"] + assert program.group_id == "prod" + assert program.sudo_user is None + assert program.stop.signal == Signals.SIGUSR1 + assert program.stop.grace_period == 60 + + +def test_shell_command_program_jso(): + program = Program.from_jso({ + "type" : "apsis.program.procstar.agent.ProcstarShellProgram", + "command" : "echo 'Hello, {{ name }}!'", + "sudo_user" : "produser", + }) + + # JSO round trip. + program = Program.from_jso(program.to_jso()) + assert program.command == "echo 'Hello, {{ name }}!'" + assert program.group_id == "default" + assert program.sudo_user == "produser" + assert program.stop.signal == "SIGTERM" + assert program.stop.grace_period == 60 + + # Bind and do it again. 
+ program = program.bind({"name": "Bob"}) + program = Program.from_jso(program.to_jso()) + assert "echo 'Hello, Bob!'" in program.argv[2] + assert program.group_id == "default" + assert program.sudo_user == "produser" + assert program.stop.signal == Signals.SIGTERM + assert program.stop.grace_period == 60 + + From 0677fd6f9fefedb8d349656fc5b1a40be1326228 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 09:55:59 -0500 Subject: [PATCH 69/84] Todo. --- notes/todo.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notes/todo.md b/notes/todo.md index da7c5aa2..1966f52f 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -47,7 +47,7 @@ - [x] Procstar stop method - [x] add stop time to runs table - [ ] classic agent program stop method - - [ ] ProcessProgram: RunningProgram and stop method + - [x] ProcessProgram: RunningProgram and stop method - [ ] "stop" operation - [x] add stop time to run view - [ ] refactor `apsis.stop` module From 4af21a88ac44584540cfa5bd4185426feafb192d Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 10:11:07 -0500 Subject: [PATCH 70/84] Clean up imports. 
--- python/apsis/program/agent.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/apsis/program/agent.py b/python/apsis/program/agent.py index 47ba21ea..ca762971 100644 --- a/python/apsis/program/agent.py +++ b/python/apsis/program/agent.py @@ -1,17 +1,15 @@ import asyncio -import contextlib import functools import httpx import logging import ora import socket -import traceback from .base import ( Program, ProgramRunning, ProgramSuccess, ProgramFailure, ProgramError, program_outputs, Timeout, RunningProgram, ) -from apsis.agent.client import Agent, NoSuchProcessError, HTTP_IMPL +from apsis.agent.client import Agent, NoSuchProcessError from apsis.host_group import expand_host from apsis.lib import memo from apsis.lib.cmpr import compress_async From 27af1324d8e9437f05bbb609a735a93ad6008792 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 15:07:25 -0500 Subject: [PATCH 71/84] Clean up import. --- test/unit/test_stop.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/unit/test_stop.py b/test/unit/test_stop.py index e8af8790..d8d4ab58 100644 --- a/test/unit/test_stop.py +++ b/test/unit/test_stop.py @@ -1,6 +1,5 @@ import ora -from apsis.states import State from apsis.stop import StopSchedule #------------------------------------------------------------------------------- From e388e882db99c719b9dab9a330e6993730afcb05 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 15:08:15 -0500 Subject: [PATCH 72/84] Stop run in API. 
--- python/apsis/apsis.py | 26 ++++++++++++++++++++++++++ python/apsis/running.py | 12 +----------- python/apsis/service/api.py | 15 +++++++++++++++ python/apsis/service/client.py | 4 ++++ test/int/procstar/test_stop.py | 22 ++++++++++++++++++++++ 5 files changed, 68 insertions(+), 11 deletions(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index f703e536..bb715b28 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -68,6 +68,8 @@ def __init__(self, cfg, jobs, db): self.__wait_tasks = TaskGroup(log) # One task for each starting/running run. self.__run_tasks = TaskGroup(log) + # One task for each stopping run. + self.__stopping_tasks = TaskGroup(log) # One task for each running action. self.__action_tasks = TaskGroup(log) @@ -594,6 +596,28 @@ async def rerun(self, run, *, time=None): return new_run + async def stop_run(self, run): + if run.state == "stopping": + log.info(f"run already stopping: {run.run_id}") + return + + # Transition to stopping. + self.run_log.record(run, "stopping") + self._transition( + run, State.stopping, + run_state=run.run_state | {"stopping": True} + ) + + # Ask the run to stop. 
+ async def stop(): + try: + await run._running_program.stop() + except: + log.info("program.stop() exception", exc_info=True) + + self.__stopping_tasks.add(run.run_id, stop()) + + async def send_signal(self, run, signal): """ :raise RuntimeError: @@ -619,6 +643,7 @@ async def shut_down(self): await self.__action_tasks.cancel_all() await self.__wait_tasks.cancel_all() await self.__run_tasks.cancel_all() + await self.__stopping_tasks.cancel_all() await self.__tasks.cancel_all() log.info("Apsis shut down") @@ -656,6 +681,7 @@ def get_stats(self): "tasks": { "num_waiting" : len(self.__wait_tasks), "num_running" : len(self.__run_tasks), + "num_stopping" : len(self.__stopping_tasks), "num_action" : len(self.__action_tasks), }, "len_runlogdb_cache" : len(self.__db.run_log_db._RunLogDB__cache), diff --git a/python/apsis/running.py b/python/apsis/running.py index c1822f89..fc290fb5 100644 --- a/python/apsis/running.py +++ b/python/apsis/running.py @@ -87,17 +87,7 @@ async def stop(): duration = stop_time - now() log.debug(f"{run_id}: running for {duration:.3f} s until stop") await asyncio.sleep(duration) - # Transition to stopping. - apsis.run_log.record(run, "stopping") - apsis._transition( - run, State.stopping, - run_state=run.run_state | {"stopping": True} - ) - # Ask the run to stop. - try: - await run._running_program.stop() - except: - log.info("program.stop() exception", exc_info=True) + await apsis.stop_run(run) # The main update loop handles updates in response. stop_task = asyncio.create_task(stop()) diff --git a/python/apsis/service/api.py b/python/apsis/service/api.py index 4ca4c371..37e3c8f4 100644 --- a/python/apsis/service/api.py +++ b/python/apsis/service/api.py @@ -368,6 +368,21 @@ async def run_rerun(request, run_id): return response_json(jso) +# PUT is probably right, but run actions currently are POST only. 
+@API.route("/runs//stop", methods={"PUT", "POST"}) +async def run_stop(request, run_id): + apsis = request.app.apsis + _, run = apsis.run_store.get(run_id) + + try: + await apsis.stop_run(run) + except RuntimeError as exc: + return error(str(exc), 400) + else: + jso = runs_to_jso(request.app, ora.now(), [run]) + return response_json(jso) + + # PUT is probably right, but run actions currently are POST only. @API.route("/runs//signal/", methods={"PUT", "POST"}) async def run_signal(request, run_id, signal): diff --git a/python/apsis/service/client.py b/python/apsis/service/client.py index fbd90a3d..d3697a45 100644 --- a/python/apsis/service/client.py +++ b/python/apsis/service/client.py @@ -375,6 +375,10 @@ def schedule_shell_program(self, time, command, **kw_args): ) + def stop_run(self, run_id): + return self.__put("/api/v1/runs", run_id, "stop")["runs"][run_id] + + def reload_jobs(self, *, dry_run=False): return self.__post("/api/control/reload_jobs", data={}, dry_run=dry_run) diff --git a/test/int/procstar/test_stop.py b/test/int/procstar/test_stop.py index 4917e65d..9b64830c 100644 --- a/test/int/procstar/test_stop.py +++ b/test/int/procstar/test_stop.py @@ -45,6 +45,28 @@ def test_stop(): assert meta["times"]["elapsed"] < 2 +def test_stop_api(): + svc = ApsisService() + dump_job(svc.jobs_dir, SLEEP_JOB) + with svc, svc.agent(): + # Schedule a 3 sec job but tell Apsis to stop it after 1 sec. + run_id = svc.client.schedule(SLEEP_JOB.job_id, {"time": "3"})["run_id"] + res = svc.wait_run( + run_id, wait_states=("new", "scheduled", "waiting", "starting")) + + time.sleep(0.5) + res = svc.client.stop_run(run_id) + assert res["state"] == "stopping" + + res = svc.wait_run(run_id) + # The run was successfully stopped by Apsis, by sending it SIGTERM. 
+ assert res["state"] == "success" + meta = res["meta"]["program"] + assert meta["status"]["signal"] == "SIGTERM" + assert meta["stop"]["signals"] == ["SIGTERM"] + assert meta["times"]["elapsed"] < 2 + + def test_dont_stop(): svc = ApsisService() dump_job(svc.jobs_dir, IGNORE_TERM_JOB) From 5ea1c23ddfb4cd4668b638f107ff60ece49e85a8 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 15:08:52 -0500 Subject: [PATCH 73/84] Proper stop behavior in process program. --- python/apsis/program/agent.py | 208 ++++++++++++++++++++----- python/apsis/program/process.py | 89 +++++++---- python/apsis/program/procstar/agent.py | 11 +- 3 files changed, 235 insertions(+), 73 deletions(-) diff --git a/python/apsis/program/agent.py b/python/apsis/program/agent.py index ca762971..256eebe1 100644 --- a/python/apsis/program/agent.py +++ b/python/apsis/program/agent.py @@ -3,17 +3,20 @@ import httpx import logging import ora +import os +from signal import Signals import socket from .base import ( Program, ProgramRunning, ProgramSuccess, ProgramFailure, ProgramError, program_outputs, Timeout, RunningProgram, ) +from .process import Stop, BoundStop from apsis.agent.client import Agent, NoSuchProcessError from apsis.host_group import expand_host from apsis.lib import memo from apsis.lib.cmpr import compress_async -from apsis.lib.json import check_schema +from apsis.lib.json import check_schema, ifkey from apsis.lib.py import or_none, nstr from apsis.lib.sys import get_username from apsis.runs import template_expand, join_args @@ -29,32 +32,35 @@ def _get_agent(host, user): class AgentProgram(Program): - def __init__(self, argv, *, host=None, user=None, timeout=None): - self.argv = tuple( str(a) for a in argv ) - self.host = nstr(host) - self.user = nstr(user) - self.timeout = timeout + def __init__( + self, + argv, *, + host =None, + user =None, + timeout =None, + stop =Stop(), + ): + self.argv = tuple( str(a) for a in argv ) + self.host = nstr(host) + self.user = nstr(user) + 
self.timeout = timeout + self.stop = stop def __str__(self): return join_args(self.argv) - def bind(self, args): - argv = tuple( template_expand(a, args) for a in self.argv ) - host = or_none(template_expand)(self.host, args) - user = or_none(template_expand)(self.user, args) - timeout = None if self.timeout is None else self.timeout.bind(args) - return type(self)(argv, host=host, user=user, timeout=timeout) - - def to_jso(self): - jso = { - **super().to_jso(), - "argv" : list(self.argv), - "host" : self.host, - "user" : self.user, - } + jso = ( + { + **super().to_jso(), + "argv" : list(self.argv), + } + | ifkey("host", self.host, None) + | ifkey("user", self.user, None) + | ifkey("stop", self.stop.to_jso(), {}) + ) if self.timeout is not None: jso["timeout"] = self.timeout.to_jso() return jso @@ -67,56 +73,142 @@ def from_jso(cls, jso): host = pop("host", nstr, None) user = pop("user", nstr, None) timeout = pop("timeout", Timeout.from_jso, None) - return cls(argv, host=host, user=user, timeout=timeout) + stop = pop("stop", Stop.from_jso, default=Stop()) + return cls(argv, host=host, user=user, timeout=timeout, stop=stop) - def run(self, run_id, cfg): - return RunningAgentProgram(run_id, self, cfg) + def bind(self, args): + argv = tuple( template_expand(a, args) for a in self.argv ) + host = or_none(template_expand)(self.host, args) + user = or_none(template_expand)(self.user, args) + timeout = None if self.timeout is None else self.timeout.bind(args) + stop = self.stop.bind(args) + return BoundAgentProgram( + argv, + host =host, + user =user, + timeout =timeout, + stop =stop, + ) - def connect(self, run_id, run_state, cfg): - return RunningAgentProgram(run_id, self, cfg, run_state) +class AgentShellProgram(Program): + + def __init__( + self, + command, *, + host =None, + user =None, + timeout =None, + stop =Stop(), + ): + self.command = str(command) + self.host = nstr(host) + self.user = nstr(user) + self.timeout = timeout + self.stop = stop -class 
AgentShellProgram(AgentProgram): + def __str__(self): + return self.command + - def __init__(self, command, **kw_args): - command = str(command) - argv = ["/bin/bash", "-c", command] - super().__init__(argv, **kw_args) - self.command = command + def to_jso(self): + jso = ( + { + **super().to_jso(), + "command" : self.command, + } + | ifkey("host", self.host, None) + | ifkey("user", self.user, None) + | ifkey("stop", self.stop.to_jso(), {}) + ) + if self.timeout is not None: + jso["timeout"] = self.timeout.to_jso() + return jso + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso) as pop: + command = pop("command", str) + host = pop("host", nstr, None) + user = pop("user", nstr, None) + timeout = pop("timeout", Timeout.from_jso, None) + stop = pop("stop", Stop.from_jso, Stop()) + return cls(command, host=host, user=user, timeout=timeout, stop=stop) def bind(self, args): command = template_expand(self.command, args) host = or_none(template_expand)(self.host, args) user = or_none(template_expand)(self.user, args) - timeout = self.timeout - timeout = None if timeout is None else timeout.bind(args) - return type(self)(command, host=host, user=user, timeout=timeout) + timeout = None if self.timeout is None else self.timeout.bind(args) + stop = self.stop.bind(args) + argv = ["/bin/bash", "-c", command] + return BoundAgentProgram( + argv, + host =host, + user =user, + timeout =timeout, + stop =stop, + ) + + + +#------------------------------------------------------------------------------- + +class BoundAgentProgram(Program): + + def __init__( + self, + argv, *, + host =None, + user =None, + timeout =None, + stop =BoundStop(), + ): + self.argv = tuple( str(a) for a in argv ) + self.host = nstr(host) + self.user = nstr(user) + self.timeout = timeout + self.stop = stop def __str__(self): - return self.command + return join_args(self.argv) def to_jso(self): - # A bit hacky. Take the base-class JSO and replace argv with command. 
- jso = super().to_jso() - del jso["argv"] - jso["command"] = self.command + jso = { + **super().to_jso(), + "argv" : list(self.argv), + "host" : self.host, + "user" : self.user, + } | ifkey("stop", self.stop.to_jso(), {}) + if self.timeout is not None: + jso["timeout"] = self.timeout.to_jso() return jso @classmethod def from_jso(cls, jso): with check_schema(jso) as pop: - command = pop("command", str) + argv = pop("argv") host = pop("host", nstr, None) user = pop("user", nstr, None) timeout = pop("timeout", Timeout.from_jso, None) - return cls(command, host=host, user=user, timeout=timeout) + stop = pop("stop", BoundStop.from_jso, BoundStop()) + return cls(argv, host=host, user=user, timeout=timeout, stop=stop) + + + def run(self, run_id, cfg): + return RunningAgentProgram(run_id, self, cfg) + + + def connect(self, run_id, run_state, cfg): + return RunningAgentProgram(run_id, self, cfg, run_state) @@ -130,6 +222,8 @@ def __init__(self, run_id, program, cfg, run_state=None): self.cfg = cfg self.run_state = run_state + self.stopping = False + def __get_agent(self, host): host = None if host is None else socket.getfqdn(host) @@ -254,6 +348,15 @@ async def updates(self): if status == 0: yield ProgramSuccess(meta=proc, outputs=outputs) + elif ( + self.stopping + and os.WIFSIGNALED(status) + and Signals(os.WTERMSIG(status)) == self.program.stop.signal + ): + # Program stopped as expected. + log.info("EXPECTED SIGTERM WHEN STOPPING") + yield ProgramSuccess(meta=proc, outputs=outputs) + else: message = f"program failed: status {status}{explanation}" yield ProgramFailure(message, meta=proc, outputs=outputs) @@ -277,4 +380,27 @@ async def signal(self, signal): await agent.signal(proc_id, signal) + async def stop(self): + stop = self.program.stop + self.stopping = True + + # Send the stop signal. + await self.signal(stop.signal) + + if stop.grace_period is not None: + try: + # Wait for the grace period to expire. 
+ await asyncio.sleep(stop.grace_period) + except asyncio.CancelledError: + # That's what we were hoping for. + pass + else: + # Send a kill signal. + try: + self.stop_signals.append(Signals.SIGKILL) + await self.signal(Signals.SIGKILL) + except ValueError: + # Proc is gone; that's OK. + pass + diff --git a/python/apsis/program/process.py b/python/apsis/program/process.py index db7047d0..2c5a8885 100644 --- a/python/apsis/program/process.py +++ b/python/apsis/program/process.py @@ -37,8 +37,8 @@ class Stop: 3. If the process has not terminated, send SIGKILL. """ - signal: Signals = Signals.SIGTERM - grace_period: int = 60 + signal: str = "SIGTERM" + grace_period: str = "60" def to_jso(self): cls = type(self) @@ -51,34 +51,47 @@ def to_jso(self): @classmethod def from_jso(cls, jso): with check_schema(jso or {}) as pop: - signal = pop("signal", to_signal, cls.signal) - grace_period = pop("grace_period", int, cls.grace_period) - return cls(signal, grace_period) - - - @classmethod - def from_jso_str(cls, jso): - with check_schema(jso or {}) as pop: - signal = pop("signal", str, default=cls.signal.name) + signal = pop("signal", str, default=cls.signal) grace_period = pop("grace_period", default=cls.grace_period) return cls(signal, grace_period) def bind(self, args): - return type(self)( + return BoundStop( to_signal(template_expand(self.signal, args)), nparse_duration(ntemplate_expand(self.grace_period, args)) ) -Stop.DEFAULT = Stop() +@dataclass +class BoundStop: + + signal: Signals = Signals.SIGTERM + grace_period: float = 60 + + def to_jso(self): + cls = type(self) + return ( + ifkey("signal", self.signal, cls.signal) + | ifkey("grace_period", self.grace_period, cls.grace_period) + ) + + + @classmethod + def from_jso(cls, jso): + with check_schema(jso or {}) as pop: + signal = pop("signal", to_signal, cls.signal) + grace_period = pop("grace_period", int, cls.grace_period) + return cls(signal, grace_period) + + 
#------------------------------------------------------------------------------- class ProcessProgram(Program): - def __init__(self, argv, *, stop=Stop.DEFAULT): + def __init__(self, argv, *, stop=Stop()): self.argv = tuple( str(a) for a in argv ) self.stop = stop @@ -104,7 +117,7 @@ def to_jso(self): def from_jso(cls, jso): with check_schema(jso) as pop: argv = pop("argv") - stop = Stop.from_jso_str(pop("stop", default={})) + stop = pop("stop", Stop.from_jso, Stop()) return cls(argv, stop=stop) @@ -113,9 +126,9 @@ def from_jso(cls, jso): class ShellCommandProgram(Program): - def __init__(self, command, *, stop=Stop.DEFAULT): - self.command = str(command) - self.stop = stop + def __init__(self, command, *, stop=Stop()): + self.command = str(command) + self.stop = stop def bind(self, args): @@ -140,7 +153,7 @@ def to_jso(self): def from_jso(cls, jso): with check_schema(jso) as pop: command = pop("command", str) - stop = Stop.from_jso_str(pop("stop", default={})) + stop = pop("stop", Stop.from_jso, default=Stop()) return cls(command, stop=stop) @@ -149,7 +162,7 @@ def from_jso(cls, jso): class BoundProcessProgram(Program): - def __init__(self, argv, *, stop=Stop.DEFAULT): + def __init__(self, argv, *, stop=BoundStop()): self.argv = tuple( str(a) for a in argv ) self.stop = stop @@ -169,7 +182,7 @@ def to_jso(self): def from_jso(cls, jso): with check_schema(jso) as pop: argv = pop("argv") - stop = Stop.from_jso(pop("stop", default={})) + stop = pop("stop", BoundStop.from_jso, BoundStop.DEFAULT) return cls(argv, stop=stop) @@ -188,8 +201,9 @@ class RunningProcessProgram(RunningProgram): def __init__(self, program, run_id): super().__init__(run_id) - self.program = program - self.process = None + self.program = program + self.process = None + self.stopping = False @memo.property @@ -236,6 +250,14 @@ async def updates(self): if return_code == 0: yield ProgramSuccess(meta=meta, outputs=outputs) + elif ( + self.stopping + and self.returncode < 0 + and 
Signals(self.returncode) == self.program.stop.signal + ): + # Program stopped as expected. + yield ProgramFailure(meta=meta, outputs=outputs) + else: message = f"program failed: return code {return_code}" yield ProgramFailure(message, meta=meta, outputs=outputs) @@ -252,9 +274,24 @@ async def signal(self, signal): async def stop(self): - assert self.process is not None - self.process.send_signal(self.program.stop.signal) - # FIXME + self.stopping = True + + stop = self.program.stop + self.process.send_signal(stop.signal) + if stop.grace_period is not None: + try: + # Wait for the grace period to expire. + await asyncio.sleep(stop.grace_period) + except asyncio.CancelledError: + # That's what we were hoping for. + pass + else: + # Send a kill signal. + try: + await self.signal(Signals.SIGKILL) + except ValueError: + # Proc is gone; that's OK. + pass diff --git a/python/apsis/program/procstar/agent.py b/python/apsis/program/procstar/agent.py index 083b9fda..86b6a320 100644 --- a/python/apsis/program/procstar/agent.py +++ b/python/apsis/program/procstar/agent.py @@ -11,11 +11,10 @@ from apsis.lib.json import check_schema, ifkey from apsis.lib.parse import nparse_duration from apsis.lib.py import or_none, nstr, get_cfg -from apsis.lib.sys import to_signal from apsis.procstar import get_agent_server from apsis.program import base from apsis.program.base import (ProgramSuccess, ProgramFailure, ProgramError) -from apsis.program.process import Stop +from apsis.program.process import Stop, BoundStop from apsis.runs import join_args, template_expand log = logging.getLogger(__name__) @@ -155,7 +154,7 @@ def __init__( self, *, group_id =procstar.proto.DEFAULT_GROUP, sudo_user =None, - stop =Stop.DEFAULT, + stop =Stop(), ): super().__init__() self.group_id = str(group_id) @@ -188,7 +187,7 @@ def _from_jso(pop): return dict( group_id =pop("group_id", default=procstar.proto.DEFAULT_GROUP), sudo_user =pop("sudo_user", default=None), - stop =Stop.from_jso_str(pop("stop", 
default={})), + stop =pop("stop", Stop.from_jso, Stop()), ) @@ -264,7 +263,7 @@ class BoundProcstarProgram(base.Program): def __init__( self, argv, *, group_id, sudo_user =None, - stop =Stop(), + stop =BoundStop(), ): self.argv = [ str(a) for a in argv ] self.group_id = str(group_id) @@ -294,7 +293,7 @@ def from_jso(cls, jso): argv = pop("argv") group_id = pop("group_id", default=procstar.proto.DEFAULT_GROUP) sudo_user = pop("sudo_user", default=None) - stop = Stop.from_jso(pop("stop", default={})) + stop = pop("stop", BoundStop.from_jso, BoundStop()) return cls(argv, group_id=group_id, sudo_user=sudo_user, stop=stop) From 525663173d966874342d4273dc0e5ab6ac1beebb Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 15:09:09 -0500 Subject: [PATCH 74/84] Agent stop tests. --- test/int/basic/test_agent_stop.py | 50 +++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 test/int/basic/test_agent_stop.py diff --git a/test/int/basic/test_agent_stop.py b/test/int/basic/test_agent_stop.py new file mode 100644 index 00000000..6af76e0c --- /dev/null +++ b/test/int/basic/test_agent_stop.py @@ -0,0 +1,50 @@ +from pathlib import Path +import time + +from instance import ApsisService + +JOB_DIR = Path(__file__).parent / "jobs" + +#------------------------------------------------------------------------------- + +def test_stop_basic(): + with ApsisService(job_dir=JOB_DIR) as inst: + run_id = inst.client.schedule( + "sleep", {"time": "5"}, + stop_time="+1s" + )["run_id"] + res = inst.wait_run( + run_id, wait_states=("new", "scheduled", "waiting", "starting")) + assert res["state"] == "running" + + res = inst.wait_run(run_id) + assert res["state"] == "success" + assert "stopping" in res["times"] + assert res["meta"]["elapsed"] < 2 + + +def test_stop_api(): + with ApsisService(job_dir=JOB_DIR) as inst: + run_id = inst.client.schedule("sleep", {"time": "5"},)["run_id"] + res = inst.wait_run( + run_id, wait_states=("new", "scheduled", 
"waiting", "starting")) + assert res["state"] == "running" + + time.sleep(0.5) + res = inst.client.get_run(run_id) + assert res["state"] == "running" + res = inst.client.stop_run(run_id) + assert res["state"] == "stopping" + + res = inst.wait_run(run_id) + assert res["state"] == "success" + assert "stopping" in res["times"] + assert res["meta"]["elapsed"] < 2 + + +if __name__ == "__main__": + import logging + logging.basicConfig(level=logging.DEBUG) + test_stop_api() + + From 7a5f58e38f0bee9d5d0d838968b72708d37620c4 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 15:11:09 -0500 Subject: [PATCH 75/84] Fix. --- python/apsis/program/process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/apsis/program/process.py b/python/apsis/program/process.py index 2c5a8885..930cf9a2 100644 --- a/python/apsis/program/process.py +++ b/python/apsis/program/process.py @@ -182,7 +182,7 @@ def to_jso(self): def from_jso(cls, jso): with check_schema(jso) as pop: argv = pop("argv") - stop = pop("stop", BoundStop.from_jso, BoundStop.DEFAULT) + stop = pop("stop", BoundStop.from_jso, BoundStop()) return cls(argv, stop=stop) From db8c462232ad28f98dd9eb0f5ba871587f97a35c Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 15:11:14 -0500 Subject: [PATCH 76/84] Fix. --- test/unit/test_procstar_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/test_procstar_program.py b/test/unit/test_procstar_program.py index 86f6d768..5d12ca77 100644 --- a/test/unit/test_procstar_program.py +++ b/test/unit/test_procstar_program.py @@ -18,7 +18,7 @@ def test_process_program_jso(): assert program.group_id == "prod" assert program.sudo_user is None assert program.stop.signal == "SIGUSR1" - assert program.stop.grace_period == 60 + assert program.stop.grace_period == "60" # Bind and do it again. 
program = program.bind({"name": "Bob"}) @@ -43,7 +43,7 @@ def test_shell_command_program_jso(): assert program.group_id == "default" assert program.sudo_user == "produser" assert program.stop.signal == "SIGTERM" - assert program.stop.grace_period == 60 + assert program.stop.grace_period == "60" # Bind and do it again. program = program.bind({"name": "Bob"}) From 6533a06c4c45ca5a63636fec57703b57ef274a17 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 15:14:16 -0500 Subject: [PATCH 77/84] Todo. --- notes/todo.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/notes/todo.md b/notes/todo.md index 1966f52f..3ad3885e 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -46,8 +46,9 @@ - [x] go through _stopping_ state - [x] Procstar stop method - [x] add stop time to runs table - - [ ] classic agent program stop method + - [x] classic agent program stop method - [x] ProcessProgram: RunningProgram and stop method + - [x] stop run API endpoint - [ ] "stop" operation - [x] add stop time to run view - [ ] refactor `apsis.stop` module From 1e50678bbf00a5a1fbccaf442f64659523b61268 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 16:37:11 -0500 Subject: [PATCH 78/84] Stop operation in web UI. 
--- vue/src/api.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vue/src/api.js b/vue/src/api.js index ec1fb308..967b0e08 100644 --- a/vue/src/api.js +++ b/vue/src/api.js @@ -56,6 +56,10 @@ export function getSummaryUrl(init) { return url } +export function getStopUrl(run_id) { + return getUrl('runs', run_id, 'stop') +} + export function getSignalUrl(run_id, signame) { return getUrl('runs', run_id, 'signal', signame) } @@ -87,6 +91,7 @@ export function getUrlForOperation(operation, run_id) { case 'skip': return getSkipUrl(run_id) case 'start': return getStartUrl(run_id) case 'rerun': return getRerunUrl(run_id) + case 'stop': return getStopUrl(run_id) case 'terminate': return getSignalUrl(run_id, 'SIGTERM') case 'kill': return getSignalUrl(run_id, 'SIGKILL') default: From 23b7bae19ed094c94182ca136bc4102529994dde Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 16:40:28 -0500 Subject: [PATCH 79/84] Stop operation in CLI. --- python/apsis/cli.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/apsis/cli.py b/python/apsis/cli.py index a3610d29..1df17b8f 100644 --- a/python/apsis/cli.py +++ b/python/apsis/cli.py @@ -332,6 +332,19 @@ def cmd_start(client, args): cmd.add_argument( "run_id", metavar="RUN-ID ...", nargs="+") + #--- command: stop --------------------------------------------------------- + + def cmd_stop(client, args): + for run_id in args.run_id: + client.stop_run(run_id) + + + cmd = parser.add_command( + "stop", cmd_stop, + description="Requests orderly stop of a running run.") + cmd.add_argument( + "run_id", metavar="RUN-ID ...", nargs="+") + #--- command: watch ---------------------------------------------- def cmd_watch(client, args): From 664bd4f335bee784ed42d5cc8e4431d8d30dafd7 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 8 Jan 2025 16:40:46 -0500 Subject: [PATCH 80/84] Todo. 
--- notes/todo.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notes/todo.md b/notes/todo.md index 3ad3885e..dc4b275c 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -49,7 +49,7 @@ - [x] classic agent program stop method - [x] ProcessProgram: RunningProgram and stop method - [x] stop run API endpoint - - [ ] "stop" operation + - [x] "stop" operation - [x] add stop time to run view - [ ] refactor `apsis.stop` module - [ ] update `_process_updates` to set the state for a stopping job From e0fb7e4d13d5d758f0d663752e51f96545676981 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Thu, 9 Jan 2025 11:47:58 -0500 Subject: [PATCH 81/84] Fix states. --- python/apsis/apsis.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index bb715b28..326de6d6 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -597,10 +597,14 @@ async def rerun(self, run, *, time=None): async def stop_run(self, run): - if run.state == "stopping": + if run.state == State.stopping: log.info(f"run already stopping: {run.run_id}") return + if run.state != State.running: + raise RuntimeError( + f"can't stop run {run.run_id}: run is {run.state.name}") + # Transition to stopping. self.run_log.record(run, "stopping") self._transition( From e9429b30c239476dea8c98ba377b9ea650442563 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Thu, 9 Jan 2025 11:48:07 -0500 Subject: [PATCH 82/84] Program stop docs. 
--- docs/concepts.rst | 10 +++++++++ docs/programs.rst | 47 ++++++++++++++++++++++++++++++++++++++++++ docs/schedules.rst | 51 ++++++++++++++++++++++++++++++++++++++++++++++ notes/todo.md | 3 +-- 4 files changed, 109 insertions(+), 2 deletions(-) diff --git a/docs/concepts.rst b/docs/concepts.rst index bd47bc39..5d42b9af 100644 --- a/docs/concepts.rst +++ b/docs/concepts.rst @@ -150,6 +150,16 @@ You can apply the following operations, to induce transitions explicitly: - You can *skip* a **scheduled** or **waiting** run. Apsis no longer waits for its schedule time or conditions, and transitions it to **skipped**. +- You can *stop* a **running** run. Apsis requests that the run shut down in an + orderly manner. How this works depends on the run's program. For a program + that runs a (local or remote) UNIX process, this entails sending a termination + signal (usually SIGTERM), then waiting for a grace period and then sending + SIGKILL if the process has not terminated. While Apsis is waiting for the run + to terminate, it is in the **stopping** state. + + You can also schedule Apsis to stop a run automatically; see + :ref:`stop-schedules`. + - You can *mark* a finished run (**success**, **failure**, **skipped**, or **error**) to a different finished state. diff --git a/docs/programs.rst b/docs/programs.rst index 82ed7197..c2ffeb30 100644 --- a/docs/programs.rst +++ b/docs/programs.rst @@ -163,6 +163,53 @@ the user running the Procstar agent to run the command as the sudo user, without any explicit password. +.. _program-stop: + +Program Stop +------------ + +Many program types provide a stop method, by which Apsis can request an orderly +shutdown of the program before it terminates on its own. Keep in mind, + +- Not all program types provide a program stop. +- The program may not stop immediately. +- The program stop may fail.
+ +Apsis requests a program to stop if the program's run is configured with a stop +schedule, or in response to an explicit stop operation invoked by the user. + +Before Apsis requests a program to stop, it transitions the run to the +*stopping* state. If the program terminates correctly in response to the stop +request, Apsis transitions the run to *success*; if the program terminates in an +unexpected way, *failure*. + +The program types above that create a UNIX process (`program`, `shell`, +`procstar`, `procstar-shell`) all implement program stop similarly. In response +to a program stop request, + +1. Apsis immediately sends the process a signal, by default `SIGTERM`. +2. Apsis waits for the process to terminate, up to a configured grace period, by + default 60 seconds. +3. If the process has not terminated, Apsis sends it `SIGKILL`. + +To configure the program stop, use the `stop` key. For example, Apsis will +request this program to stop by sending `SIGUSR2` instead of `SIGTERM`, and will +only wait 15 seconds before sending `SIGKILL`. + +.. code:: yaml + + program: + type: procstar + group_id: default + argv: ["/usr/bin/echo", "Hello, world!"] + stop: + signal: SIGUSR2 + grace_period: 15s + +If the program terminates with an exit status that indicates the process ended +from `SIGUSR2`, Apsis considers the run to have succeeded. + + Internal Programs ----------------- diff --git a/docs/schedules.rst b/docs/schedules.rst index 265b9b1a..1b489f68 100644 --- a/docs/schedules.rst +++ b/docs/schedules.rst @@ -210,3 +210,54 @@ transition, the schedule will include no times on this date at all. For example, a daily schedule with a start time between 2:00:00 and 3:00:00 and a U.S. time zone will contain no times on the dates in the spring when DST begins. + +.. _stop-schedules: + +Stop schedules +-------------- + +You can also configure a job so that a run will stop at a certain time. When +this time elapses, Apsis stops the run, assuming it is running.
The ways you +schedule a run to stop are different from the schedule types above, since the +stop time is generally related to the schedule time. + +To configure a stop schedule in addition to the normal start schedule, place the +latter into a `start:` subkey, and use `stop:` to specify the stop schedule. The +available stop schedule types are `duration` and `daytime`. For example, + +.. code:: yaml + + schedule: + start: + type: daily + daytime: 10:30:00 + tz: Europe/Berlin + stop: + type: duration + duration: 30m + +This instructs Apsis to stop the run 30 minutes after the schedule time, +namely at 11:00 Europe/Berlin. + +The `daytime` stop schedule type instructs Apsis to stop the run at the next +occurrence of a specific daytime after the schedule time. The following example +has the same effect as the previous: + +.. code:: yaml + + schedule: + start: + type: daily + daytime: 10:30:00 + tz: Europe/Berlin + stop: + type: daytime + daytime: 11:00:00 + tz: Europe/Berlin + +You can use any of the schedule types in the previous section for the `start` +schedule. + +When the stop time elapses, Apsis stops the run in accordance with the program's +stop method. This is discussed in :ref:`program-stop`. + diff --git a/notes/todo.md b/notes/todo.md index dc4b275c..589f1a4f 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -51,11 +51,10 @@ - [x] stop run API endpoint - [x] "stop" operation - [x] add stop time to run view + - [x] docs - [ ] refactor `apsis.stop` module - [ ] update `_process_updates` to set the state for a stopping job - clarify the rules for `run_state` updates - - [ ] (?) distinguish between "service"-type programs in Procstar program - - [ ] (?) "schedule stop" operation? - [ ] if `send_signal` raises, error the run - [ ] improve `apsis job` output style From 96363fa0939776b344f2874fafc8edfa8746dc6c Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 15 Jan 2025 14:46:48 -0500 Subject: [PATCH 83/84] Refactor.
--- notes/todo.md | 2 +- python/apsis/schedule/base.py | 2 +- python/apsis/{ => schedule}/stop.py | 0 test/unit/{ => schedule}/test_stop.py | 2 +- test/unit/test_job.py | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename python/apsis/{ => schedule}/stop.py (100%) rename test/unit/{ => schedule}/test_stop.py (96%) diff --git a/notes/todo.md b/notes/todo.md index 589f1a4f..07d18a5f 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -52,7 +52,7 @@ - [x] "stop" operation - [x] add stop time to run view - [x] docs - - [ ] refactor `apsis.stop` module + - [x] refactor `apsis.stop` module - [ ] update `_process_updates` to set the state for a stopping job - clarify the rules for `run_state` updates - [ ] if `send_signal` raises, error the run diff --git a/python/apsis/schedule/base.py b/python/apsis/schedule/base.py index a19d510c..ed6861db 100644 --- a/python/apsis/schedule/base.py +++ b/python/apsis/schedule/base.py @@ -2,7 +2,7 @@ import ora from apsis.lib.json import TypedJso, check_schema, nkey -from apsis.stop import StopSchedule +from .stop import StopSchedule #------------------------------------------------------------------------------- diff --git a/python/apsis/stop.py b/python/apsis/schedule/stop.py similarity index 100% rename from python/apsis/stop.py rename to python/apsis/schedule/stop.py diff --git a/test/unit/test_stop.py b/test/unit/schedule/test_stop.py similarity index 96% rename from test/unit/test_stop.py rename to test/unit/schedule/test_stop.py index d8d4ab58..fa058b06 100644 --- a/test/unit/test_stop.py +++ b/test/unit/schedule/test_stop.py @@ -1,6 +1,6 @@ import ora -from apsis.stop import StopSchedule +from apsis.schedule.stop import StopSchedule #------------------------------------------------------------------------------- diff --git a/test/unit/test_job.py b/test/unit/test_job.py index 53e84e07..7811a20e 100644 --- a/test/unit/test_job.py +++ b/test/unit/test_job.py @@ -1,7 +1,7 @@ import ora from apsis.jobs import Job -from 
apsis.stop import DurationStopSchedule +from apsis.schedule.stop import DurationStopSchedule #------------------------------------------------------------------------------- From d7275fc541314806aeb2adb895bd7693c17c8571 Mon Sep 17 00:00:00 2001 From: Alex Samuel Date: Wed, 15 Jan 2025 14:53:20 -0500 Subject: [PATCH 84/84] Docstring. --- notes/todo.md | 4 +--- python/apsis/apsis.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/notes/todo.md b/notes/todo.md index 07d18a5f..40e3ca32 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -40,7 +40,7 @@ - [x] run log - [x] improve and clean up `State` enum -- [ ] scheduled stop +- [x] scheduled stop - [x] add stop method to program - [x] actually stop the program at the stop time - [x] go through _stopping_ state @@ -53,8 +53,6 @@ - [x] add stop time to run view - [x] docs - [x] refactor `apsis.stop` module - - [ ] update `_process_updates` to set the state for a stopping job - - clarify the rules for `run_state` updates - [ ] if `send_signal` raises, error the run - [ ] improve `apsis job` output style diff --git a/python/apsis/apsis.py b/python/apsis/apsis.py index 326de6d6..d73dc43e 100644 --- a/python/apsis/apsis.py +++ b/python/apsis/apsis.py @@ -597,6 +597,16 @@ async def rerun(self, run, *, time=None): async def stop_run(self, run): + """ + Transitions `run` to stopping, and requests its program to stop in a + new task. + + The `run` must be either running, or else stopping in which case + this is a no-op. + + Adds `{"stopping": True}` to the run's run state when it transitions to + stopping. + """ if run.state == State.stopping: log.info(f"run already stopping: {run.run_id}") return