From 9f7d3425fa01000c00c380f225b2a8c5bb2a5509 Mon Sep 17 00:00:00 2001 From: Delan Azabani Date: Thu, 2 Jan 2025 16:09:44 +0800 Subject: [PATCH] monitor: initial support for fully automated image rebuilds (#6) --- monitor/Cargo.lock | 149 ++++++++++++++++++++++++++++++++++++++++++++ monitor/Cargo.toml | 2 + monitor/src/main.rs | 110 +++++++++++++++++++++++++++++--- 3 files changed, 254 insertions(+), 7 deletions(-) diff --git a/monitor/Cargo.lock b/monitor/Cargo.lock index b46a122..980e21d 100644 --- a/monitor/Cargo.lock +++ b/monitor/Cargo.lock @@ -26,6 +26,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "askama" version = "0.12.1" @@ -131,6 +146,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + [[package]] name = "byteorder" version = "1.5.0" @@ -155,6 +176,20 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-targets", +] + [[package]] name = "color-eyre" version = "0.5.11" @@ -182,6 +217,12 @@ dependencies = [ "tracing-error", ] +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + [[package]] name = "cpufeatures" version = "0.2.13" @@ -486,6 +527,29 @@ dependencies = [ "want", ] +[[package]] +name = "iana-time-zone" +version = "0.1.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "idna" version = "0.5.0" @@ -536,6 +600,16 @@ dependencies = [ "color-eyre", ] +[[package]] +name = "js-sys" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -643,6 +717,7 @@ version = "0.1.0" dependencies = [ "askama", "askama_warp", + "chrono", "crossbeam-channel", "dotenv", "http 0.2.12", @@ -651,6 +726,7 @@ dependencies = [ "mktemp", "serde", "serde_json", + "subprocess", "tokio", "toml", "tracing", @@ -1039,6 +1115,16 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "subprocess" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2e86926081dda636c546d8c5e641661049d7562a68f5488be4a1f7f66f6086" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "syn" version = "2.0.72" @@ -1415,6 +1501,60 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasm-bindgen" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" + [[package]] name = "winapi" version = "0.3.9" @@ -1437,6 +1577,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.52.0" diff --git a/monitor/Cargo.toml b/monitor/Cargo.toml index 097149d..8d82677 100644 --- a/monitor/Cargo.toml +++ b/monitor/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" [dependencies] askama = { version = "0.12.1", features = ["with-warp"] } askama_warp = "0.13.0" +chrono = "0.4.39" crossbeam-channel = "0.5.13" dotenv = "0.15.0" http = "0.2" @@ -14,6 +15,7 @@ jane-eyre = "0.3.0" mktemp = "0.5.1" serde = { version = "1.0.204", features = ["derive"] } serde_json = "1.0.120" +subprocess = "0.2.9" tokio = { version = "1.40.0", features = ["full"] } toml = "0.8.15" tracing = "0.1.40" diff --git a/monitor/src/main.rs b/monitor/src/main.rs index a56d56e..0a601d6 100644 --- a/monitor/src/main.rs +++ b/monitor/src/main.rs @@ -9,25 +9,29 @@ mod settings; mod shell; mod zfs; +use core::str; use std::{ collections::BTreeMap, fs::File, - io::Read, + io::{ErrorKind, Read}, + mem::take, net::IpAddr, path::Path, process::exit, sync::{LazyLock, RwLock}, - thread::{self}, + thread::{self, JoinHandle}, time::{Duration, UNIX_EPOCH}, }; use askama::Template; +use chrono::{SecondsFormat, Utc}; use crossbeam_channel::{Receiver, Sender}; use dotenv::dotenv; use http::StatusCode; -use jane_eyre::eyre::{self, eyre, Context}; +use jane_eyre::eyre::{self, bail, eyre, Context, OptionExt}; use mktemp::Temp; use serde_json::json; +use subprocess::{CommunicateError, Exec, Redirection}; use tracing::{error, info, trace, warn}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter}; use warp::{ @@ -372,8 +376,9 @@ fn monitor_thread() -> eyre::Result<()> { IdGen::new_empty() }); - let /* mut */ profiles = TOML.initial_profiles(); + let mut profiles = TOML.initial_profiles(); let mut registrations_cache = Cache::default(); + let mut image_rebuilds = BTreeMap::default(); loop { let registrations = registrations_cache.get(|| list_registered_runners_for_host())?; @@ -407,13 +412,18 @@ fn monitor_thread() -> eyre::Result<()> { }, ) in profile_runner_counts.iter() { - info!("profile {key}: image age {image_age:?}, {healthy}/{target} healthy runners ({idle} idle, {reserved} reserved, {busy} busy, {started_or_crashed} started or crashed, {excess_idle} excess idle, {wanted} wanted)"); + let profile = profiles.get(key).ok_or_eyre("Failed to get profile")?; + info!("profile {key}: {healthy}/{target} healthy runners ({idle} idle, {reserved} reserved, {busy} busy, {started_or_crashed} started or crashed, {excess_idle} excess idle, {wanted} wanted), image {}/{}@{} age {image_age:?}", DOTENV.zfs_clone_prefix, profile.base_vm_name, profile.base_image_snapshot); } for (_id, runner) in runners.iter() { runner.log_info(); } - // Determine whether any profiles need their images rebuilt. + // Kick off rebuilds for any profiles whose images are too old. + struct Rebuild { + thread: JoinHandle>, + snapshot_name: String, + } for (key, profile) in profiles.iter() { let needs_rebuild = profile.image_needs_rebuild(); if needs_rebuild.unwrap_or(true) { @@ -428,15 +438,52 @@ fn monitor_thread() -> eyre::Result<()> { key, runner_count, "profile image needs rebuild; waiting for runners" ); + } else if image_rebuilds.contains_key(key) { + info!( + key, + runner_count, "profile image needs rebuild; image rebuild still running" + ); } else { info!( key, - runner_count, "profile image needs rebuild; TODO start image rebuild" + runner_count, "profile image needs rebuild; starting image rebuild now" ); + let build_script_path = + Path::new(&profile.configuration_name).join("build-image.sh"); + let snapshot_name = Utc::now().to_rfc3339_opts(SecondsFormat::Nanos, true); + let cloned_snapshot_name = snapshot_name.clone(); + let rebuild = Rebuild { + thread: thread::spawn(move || { + image_rebuild_thread(build_script_path, &cloned_snapshot_name) + }), + snapshot_name: snapshot_name.clone(), + }; + image_rebuilds.insert(key.to_owned(), rebuild); } } } + // Reap image rebuild threads, updating the profile on success. + let mut remaining_image_rebuilds = BTreeMap::default(); + for (profile_key, rebuild) in take(&mut image_rebuilds) { + if rebuild.thread.is_finished() { + match rebuild.thread.join() { + Ok(Ok(())) => { + info!(profile_key, "Image rebuild thread exited"); + let profile = profiles + .get_mut(&profile_key) + .ok_or_eyre("Failed to get profile")?; + profile.base_image_snapshot = rebuild.snapshot_name; + } + Ok(Err(report)) => error!(profile_key, %report, "Image rebuild thread error"), + Err(panic) => error!(profile_key, ?panic, "Image rebuild thread panic"), + }; + } else { + remaining_image_rebuilds.insert(profile_key, rebuild); + } + } + image_rebuilds.extend(remaining_image_rebuilds); + let mut unregister_and_destroy = |id, runner: &Runner| { if runner.registration().is_some() { if let Err(error) = runners.unregister_runner(id) { @@ -578,3 +625,52 @@ fn monitor_thread() -> eyre::Result<()> { } } } + +#[tracing::instrument(skip(build_script_path), fields(build_script_path = ?build_script_path.as_ref()))] +fn image_rebuild_thread( + build_script_path: impl AsRef, + snapshot_name: &str, +) -> eyre::Result<()> { + let mut child = Exec::cmd(build_script_path.as_ref()) + .cwd("..") + .arg(snapshot_name) + .stdout(Redirection::Pipe) + .stderr(Redirection::Merge) + .popen()?; + let mut communicator = child + .communicate_start(None) + .limit_time(Duration::from_secs(1)); + let exit_status = loop { + match communicator.read() { + Err(error) if error.kind() != ErrorKind::TimedOut => { + warn!(?error, "Error reading from child process"); + break child.wait()?; + } + // Err(empty) or Err(non-empty) means we timed out, and there may be more output in future. + // Ok(non-empty) means we got some output. Hopefully this avoids giving us partial lines? + // Ok(empty) means there is definitely no more output. + ref result @ (Ok(ref capture) | Err(CommunicateError { ref capture, .. })) => { + let (Some(stdout), None) = capture else { + unreachable!("Guaranteed by child definition") + }; + if result.is_ok() && stdout.is_empty() { + // There is definitely no more output + break child.wait()?; + } else if !stdout.is_empty() { + for line in stdout.split(|&b| b == b'\n') { + let line = str::from_utf8(line).map_err(|_| line); + match line { + Ok(string) => info!(line = %string), + Err(bytes) => info!(?bytes), + } + } + } + } + } + }; + if !exit_status.success() { + bail!("Command exited with status {:?}", exit_status); + } + + Ok(()) +}