Skip to main content

kanade_shared/
boot_sentinel.rs

1//! Boot sentinel: auto-rollback to a last-known-good binary when a
2//! freshly-swapped binary crash-loops on startup (#582).
3//!
4//! Both `kanade-backend` and `kanade-agent` are **self-replacing**
5//! Windows services: an update overwrites the running exe and the
6//! Service Control Manager restarts it. If the new binary crashes
7//! during early boot (exactly what the #573 JetStream regression did
8//! to the backend on 2026-06-11), nothing rolls it back — the SCM
9//! just restarts the same broken exe forever.
10//!
//! This module gates each boot. The swap step
//! [`BootSentinel::arm_for_swap`] writes a sentinel and snapshots the
//! outgoing (known-good) binary to `<exe>.last-good`. The sentinel and
//! quarantine files live in the shared `data_dir` but are namespaced by
//! the exe's role (`.boot-sentinel-<role>.json`), so a backend and an
//! agent co-located on the same host keep independent boot state
//! instead of clobbering a single shared file. Every boot calls
//! [`BootSentinel::check_on_boot`] as the very first thing in `main()`
//! — before NATS, the DB, or any bootstrap that can fail — which
//! increments a persisted attempt counter and, once it crosses the
//! crash-loop threshold, restores `.last-good` over the live exe and
//! **quarantines** the failed version so the autonomous self-update
//! path won't immediately re-deploy it (which would loop
//! rollout↔rollback forever). [`BootSentinel::confirm_healthy`],
//! called once the process is genuinely up, promotes the running exe
//! to the new last-good and clears the sentinel.
26//!
27//! The attempt counter is persisted BEFORE the crashy code runs, so a
28//! hard crash still advances it: boot 1..N each bump the counter, and
29//! the boot that crosses the threshold rolls back, after which the SCM
30//! restarts into `.last-good`.
31//!
32//! ## Windows exe lock
33//!
34//! A running exe is locked on Windows (no overwrite), but a *rename*
35//! of the running exe IS allowed. So the rollback renames the live exe
36//! aside (`<exe>.rollback-bak`) and copies `.last-good` into place,
37//! then the caller exits so the SCM relaunches the restored binary.
38//! The same rename-then-replace works on Unix and in unit tests (where
39//! the "exe" is just a temp file), so the logic is testable everywhere.
40
41use std::fs;
42use std::io;
43use std::path::{Path, PathBuf};
44
45use serde::{Deserialize, Serialize};
46use tracing::{error, info, warn};
47
/// Prefix of the per-role sentinel file in the data dir. The full name
/// is `<prefix>-<role>.json`, so a co-located backend and agent that
/// share one `data_dir` each get their own pending-swap marker.
const SENTINEL_PREFIX: &str = ".boot-sentinel";

/// Prefix of the per-role quarantine file in the data dir, namespaced
/// the same way as the sentinel (`<prefix>-<role>.json`) so one role's
/// quarantine list never shows up as the other's.
const QUARANTINE_PREFIX: &str = ".boot-quarantine";

/// Suffix appended to the exe path to name the last-known-good
/// snapshot (`<exe>.last-good`). It sits next to each exe, so it is
/// per-role without any extra namespacing.
const LAST_GOOD_SUFFIX: &str = "last-good";

/// Suffix appended to the exe path to name the file the failed binary
/// is renamed to during a rollback (`<exe>.rollback-bak`).
const ROLLBACK_BAK_SUFFIX: &str = "rollback-bak";
58
/// Derives the role namespace used in the sentinel/quarantine file
/// names from the exe's file stem (`kanade-backend.exe` →
/// `kanade-backend`).
///
/// Call sites are expected to pass the role's canonical exe path, so
/// the namespace stays the same across arm / boot / confirm for a
/// given role.
fn role_ns(exe: &Path) -> String {
    match exe.file_stem() {
        // Lossy conversion on purpose: a non-UTF-8 stem still yields a
        // string that differs per role. A strict `to_str()` would give
        // `None` for such paths and funnel every role into the shared
        // fallback below, bringing back the file collision that the
        // namespacing exists to prevent.
        Some(stem) => stem.to_string_lossy().into_owned(),
        // Only reachable when the path has no file stem at all, i.e.
        // there is no real exe name to tell roles apart by.
        None => String::from("kanade"),
    }
}
77
/// Default crash-loop threshold for `check_on_boot`.
///
/// The gate lets a boot through while `attempts <= max`, so attempts
/// `1..=N` proceed and attempt `N + 1` is the one that rolls back.
/// With the default of 3, a freshly-swapped binary gets three chances
/// to confirm healthy and is rolled back on its fourth boot — enough
/// slack for a one-off transient (slow disk, flaky first NATS connect)
/// without hiding a binary that is genuinely broken.
pub const DEFAULT_MAX_ATTEMPTS: u32 = 3;
85
86#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
87struct Sentinel {
88    /// The version that was swapped in and is awaiting confirmation.
89    version: String,
90    /// Boot attempts so far for that version (incremented before the
91    /// boot can crash).
92    attempts: u32,
93}
94
95#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)]
96struct Quarantine {
97    /// Versions that crash-looped on boot and were rolled back. The
98    /// self-update path refuses to swap to any version listed here.
99    versions: Vec<String>,
100}
101
/// What [`BootSentinel::check_on_boot`] decided. On `RolledBack` the
/// caller MUST exit (non-zero) so the service manager relaunches the
/// restored last-good binary.
///
/// Marked `#[must_use]` because dropping the value means a rollback
/// could go unnoticed and the process would keep running the binary
/// that was just replaced on disk.
#[must_use = "on `RolledBack` the caller must exit so the service manager relaunches the restored binary"]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BootDecision {
    /// No pending swap, or the swap is still within its attempt
    /// budget — continue booting normally.
    Proceed,
    /// The swapped-in binary crash-looped; `.last-good` has been
    /// restored over the live exe. Exit now and let the SCM relaunch.
    /// `from` is the version that was rolled back (and quarantined).
    RolledBack { from: String },
}
114
/// Per-role boot guard. Construct once at the top of `main()`.
///
/// Holds only paths and the running binary's version string; all
/// persistent state lives in the files those paths point at.
#[derive(Debug, Clone)]
pub struct BootSentinel {
    /// JSON file recording the unconfirmed swap and its boot-attempt
    /// count (`<data_dir>/.boot-sentinel-<role>.json`).
    sentinel_path: PathBuf,
    /// JSON file listing versions that were rolled back
    /// (`<data_dir>/.boot-quarantine-<role>.json`).
    quarantine_path: PathBuf,
    /// Path of the live service binary.
    exe: PathBuf,
    /// Path of the last-known-good snapshot (`<exe>.last-good`).
    last_good: PathBuf,
    /// Version string of the binary that constructed this guard.
    version: String,
}
123
124impl BootSentinel {
125    /// `data_dir` holds the sentinel/quarantine state; `exe` is the
126    /// live binary path (`std::env::current_exe()` in production);
127    /// `version` is this binary's own version string.
128    pub fn new(data_dir: &Path, exe: PathBuf, version: impl Into<String>) -> Self {
129        let last_good = sibling(&exe, LAST_GOOD_SUFFIX);
130        let role = role_ns(&exe);
131        Self {
132            sentinel_path: data_dir.join(format!("{SENTINEL_PREFIX}-{role}.json")),
133            quarantine_path: data_dir.join(format!("{QUARANTINE_PREFIX}-{role}.json")),
134            exe,
135            last_good,
136            version: version.into(),
137        }
138    }
139
140    /// Call FIRST in `main()`, before anything that can crash.
141    ///
142    /// - No sentinel → `Proceed`.
143    /// - Sentinel for a different version (we already rolled back, or
144    ///   last-good is now live) → clear it, `Proceed`.
145    /// - Sentinel for THIS version → bump attempts; attempts
146    ///   `1..=max_attempts` `Proceed`, and the first that EXCEEDS
147    ///   `max_attempts` rolls back to `.last-good` + quarantines the
148    ///   bad version and returns `RolledBack`.
149    pub fn check_on_boot(&self, max_attempts: u32) -> BootDecision {
150        let Some(mut sentinel) = self.read_sentinel() else {
151            return BootDecision::Proceed;
152        };
153        if sentinel.version != self.version {
154            // A different binary is running than the sentinel expected
155            // — the swap already resolved (rollback or a later update).
156            // Stale marker; drop it and boot normally.
157            let _ = fs::remove_file(&self.sentinel_path);
158            return BootDecision::Proceed;
159        }
160
161        sentinel.attempts += 1;
162        info!(
163            version = %self.version,
164            attempts = sentinel.attempts,
165            max = max_attempts,
166            "boot sentinel: unconfirmed swap, recording boot attempt",
167        );
168        // Persist the bumped count BEFORE returning so a crash later
169        // this boot still advances the counter.
170        self.write_sentinel(&sentinel);
171
172        if sentinel.attempts <= max_attempts {
173            return BootDecision::Proceed;
174        }
175
176        // Crash-loop confirmed → roll back.
177        match self.rollback() {
178            Ok(true) => {
179                self.quarantine(&self.version);
180                let _ = fs::remove_file(&self.sentinel_path);
181                error!(
182                    version = %self.version,
183                    attempts = sentinel.attempts,
184                    "boot sentinel: crash-loop — rolled back to last-good and quarantined this version",
185                );
186                BootDecision::RolledBack {
187                    from: self.version.clone(),
188                }
189            }
190            Ok(false) => {
191                // No last-good to roll back to (first install). We
192                // can't restore a binary, but still quarantine the bad
193                // version so that IF a good binary ever comes up it
194                // won't re-deploy this one — and so the self-update
195                // path's refusal is consistent. We keep Proceeding
196                // (nothing better to do than let it keep trying).
197                self.quarantine(&self.version);
198                error!(
199                    version = %self.version,
200                    "boot sentinel: crash-loop but no last-good binary to roll back to; \
201                     quarantined the version and continuing (no rollback target)",
202                );
203                BootDecision::Proceed
204            }
205            Err(e) => {
206                error!(error = %e, "boot sentinel: rollback failed; continuing without it");
207                BootDecision::Proceed
208            }
209        }
210    }
211
212    /// Call once the process is confirmed healthy (backend: serving;
213    /// agent: NATS connected + first heartbeat). Promotes the live exe
214    /// to `.last-good` and clears the sentinel, so this version becomes
215    /// the rollback target for the next swap.
216    pub fn confirm_healthy(&self) -> io::Result<()> {
217        // Only promote/clear when there's an unconfirmed swap for THIS
218        // version — a plain restart of an already-good binary has no
219        // sentinel and shouldn't churn last-good.
220        let pending = matches!(self.read_sentinel(), Some(s) if s.version == self.version);
221        if pending {
222            if let Err(e) = fs::copy(&self.exe, &self.last_good) {
223                warn!(error = %e, "boot sentinel: could not promote exe to last-good");
224            } else {
225                info!(version = %self.version, "boot sentinel: confirmed healthy, promoted to last-good");
226            }
227            let _ = fs::remove_file(&self.sentinel_path);
228        } else if !self.last_good.exists() {
229            // First-ever healthy boot with no swap in flight: seed
230            // last-good so a future bad swap has something to fall
231            // back to.
232            if let Err(e) = fs::copy(&self.exe, &self.last_good) {
233                warn!(error = %e, "boot sentinel: could not seed last-good");
234            } else {
235                info!(version = %self.version, "boot sentinel: seeded initial last-good");
236            }
237        }
238        Ok(())
239    }
240
241    /// Call at swap time (deploy / self-update), before restarting into
242    /// the new binary. Snapshots the CURRENT (outgoing, known-good) exe
243    /// to `.last-good` and writes a fresh sentinel for `new_version` so
244    /// the next boot is gated.
245    ///
246    /// `current_exe` is the still-running good binary (copy it now,
247    /// before it's overwritten by the swap).
248    pub fn arm_for_swap(&self, current_exe: &Path, new_version: &str) -> io::Result<()> {
249        // The outgoing binary booted fine (it's running), so it's the
250        // rollback target.
251        fs::copy(current_exe, &self.last_good)?;
252        self.write_sentinel(&Sentinel {
253            version: new_version.to_string(),
254            attempts: 0,
255        });
256        info!(
257            new_version,
258            "boot sentinel: armed for swap (last-good snapshotted)"
259        );
260        Ok(())
261    }
262
263    /// True if `version` was rolled back after a failed boot. The
264    /// self-update path consults this before swapping so a bad rollout
265    /// target isn't re-attempted in a loop.
266    pub fn is_quarantined(&self, version: &str) -> bool {
267        self.read_quarantine().versions.iter().any(|v| v == version)
268    }
269
270    /// Every quarantined version (#582 Phase 2). The agent reports
271    /// these in its heartbeat so the SPA rollout view can flag which
272    /// PCs failed to adopt a target.
273    pub fn quarantined_versions(&self) -> Vec<String> {
274        self.read_quarantine().versions
275    }
276
277    /// Drop `version` from quarantine (operator re-published a fixed
278    /// binary under the same version string).
279    pub fn clear_quarantine(&self, version: &str) -> io::Result<()> {
280        let mut q = self.read_quarantine();
281        let before = q.versions.len();
282        q.versions.retain(|v| v != version);
283        if q.versions.len() != before {
284            self.write_quarantine(&q);
285        }
286        Ok(())
287    }
288
289    // ── internals ────────────────────────────────────────────────
290
291    /// Rename the live exe aside and copy `.last-good` into its place.
292    /// Returns `Ok(false)` when there's no last-good to restore.
293    fn rollback(&self) -> io::Result<bool> {
294        if !self.last_good.exists() {
295            return Ok(false);
296        }
297        let bak = sibling(&self.exe, ROLLBACK_BAK_SUFFIX);
298        // Best-effort: a leftover .rollback-bak from a prior cycle
299        // would block the rename.
300        let _ = fs::remove_file(&bak);
301        // Rename of a running/locked exe is permitted on Windows; the
302        // copy then lands a fresh file at the exe path.
303        fs::rename(&self.exe, &bak)?;
304        if let Err(e) = fs::copy(&self.last_good, &self.exe) {
305            // The exe path is now EMPTY (renamed to .rollback-bak,
306            // copy failed). Put the original back so the next SCM
307            // restart isn't left with no binary at all — mirroring the
308            // compensating rollback in self_update::swap_and_restart.
309            match fs::rename(&bak, &self.exe) {
310                Ok(()) => warn!(
311                    error = %e,
312                    "boot sentinel: last-good copy failed; restored the original exe in place",
313                ),
314                Err(restore_err) => error!(
315                    error = %e,
316                    restore_error = %restore_err,
317                    exe = ?self.exe,
318                    backup = ?bak,
319                    "boot sentinel: last-good copy failed AND restore failed — service binary path \
320                     is EMPTY; manual repair required (rename the .rollback-bak file back)",
321                ),
322            }
323            return Err(e);
324        }
325        Ok(true)
326    }
327
328    fn quarantine(&self, version: &str) {
329        let mut q = self.read_quarantine();
330        if !q.versions.iter().any(|v| v == version) {
331            q.versions.push(version.to_string());
332            self.write_quarantine(&q);
333        }
334    }
335
336    fn read_sentinel(&self) -> Option<Sentinel> {
337        let bytes = fs::read(&self.sentinel_path).ok()?;
338        match serde_json::from_slice(&bytes) {
339            Ok(s) => Some(s),
340            Err(e) => {
341                warn!(error = %e, "boot sentinel: corrupt sentinel, ignoring");
342                let _ = fs::remove_file(&self.sentinel_path);
343                None
344            }
345        }
346    }
347
348    fn write_sentinel(&self, s: &Sentinel) {
349        match serde_json::to_vec(s) {
350            Ok(bytes) => {
351                if let Err(e) = atomic_write(&self.sentinel_path, &bytes) {
352                    warn!(error = %e, "boot sentinel: write sentinel failed");
353                }
354            }
355            Err(e) => warn!(error = %e, "boot sentinel: encode sentinel failed"),
356        }
357    }
358
359    fn read_quarantine(&self) -> Quarantine {
360        fs::read(&self.quarantine_path)
361            .ok()
362            .and_then(|b| serde_json::from_slice(&b).ok())
363            .unwrap_or_default()
364    }
365
366    fn write_quarantine(&self, q: &Quarantine) {
367        match serde_json::to_vec(q) {
368            Ok(bytes) => {
369                if let Err(e) = atomic_write(&self.quarantine_path, &bytes) {
370                    warn!(error = %e, "boot sentinel: write quarantine failed");
371                }
372            }
373            Err(e) => warn!(error = %e, "boot sentinel: encode quarantine failed"),
374        }
375    }
376}
377
/// Returns `path` with `.<suffix>` appended to the whole path string
/// (e.g. `kanade-agent.exe` → `kanade-agent.exe.last-good`). The
/// existing extension is kept, not replaced.
fn sibling(path: &Path, suffix: &str) -> PathBuf {
    // Work on the OS string so non-UTF-8 paths pass through untouched.
    let mut name = path.as_os_str().to_owned();
    name.push(".");
    name.push(suffix);
    name.into()
}
385
386/// Write via a temp file + rename so a crash mid-write never leaves a
387/// torn sentinel/quarantine the next boot would misread. Creates the
388/// parent dir first — on a clean install the data dir may not exist
389/// yet when the first swap arms the sentinel.
390fn atomic_write(path: &Path, bytes: &[u8]) -> io::Result<()> {
391    if let Some(parent) = path.parent() {
392        fs::create_dir_all(parent)?;
393    }
394    let tmp = sibling(path, "tmp");
395    fs::write(&tmp, bytes)?;
396    fs::rename(&tmp, path)
397}
398
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Build a sentinel over a temp dir with a fake "exe" containing
    /// `body`, at `version`. The `TempDir` is returned so it outlives
    /// the guard.
    fn fixture(version: &str, body: &str) -> (TempDir, BootSentinel) {
        let dir = TempDir::new().unwrap();
        let exe = dir.path().join("kanade-agent.exe");
        fs::write(&exe, body).unwrap();
        let s = BootSentinel::new(dir.path(), exe, version);
        (dir, s)
    }

    fn read(p: &Path) -> String {
        fs::read_to_string(p).unwrap()
    }

    #[test]
    fn no_sentinel_proceeds() {
        let (_d, s) = fixture("1.0.0", "v1");
        assert_eq!(s.check_on_boot(3), BootDecision::Proceed);
    }

    #[test]
    fn arm_snapshots_last_good_and_writes_sentinel() {
        let (_d, s) = fixture("1.0.0", "v1-good");
        // Pretend a new binary will be swapped in; arm with the
        // current (good) exe.
        s.arm_for_swap(&s.exe, "2.0.0").unwrap();
        assert_eq!(read(&s.last_good), "v1-good");
        assert!(s.sentinel_path.exists());
    }

    #[test]
    fn healthy_swap_confirms_and_promotes() {
        let (_d, s) = fixture("1.0.0", "v1-good");
        s.arm_for_swap(&s.exe, "2.0.0").unwrap();
        // Now the 2.0.0 binary boots. Simulate by writing new exe body
        // + a 2.0.0 sentinel-aware guard.
        fs::write(&s.exe, "v2").unwrap();
        let s2 = BootSentinel::new(s.sentinel_path.parent().unwrap(), s.exe.clone(), "2.0.0");
        assert_eq!(s2.check_on_boot(3), BootDecision::Proceed);
        s2.confirm_healthy().unwrap();
        // last-good is now v2; sentinel cleared.
        assert_eq!(read(&s2.last_good), "v2");
        assert!(!s2.sentinel_path.exists());
        assert!(!s2.is_quarantined("2.0.0"));
    }

    #[test]
    fn crash_loop_rolls_back_and_quarantines() {
        let (_d, s) = fixture("1.0.0", "v1-good");
        s.arm_for_swap(&s.exe, "2.0.0").unwrap();
        // The 2.0.0 binary is now live and crash-loops.
        fs::write(&s.exe, "v2-broken").unwrap();
        let bad = BootSentinel::new(s.sentinel_path.parent().unwrap(), s.exe.clone(), "2.0.0");

        // attempts 1..3 proceed (each boot would then crash).
        assert_eq!(bad.check_on_boot(3), BootDecision::Proceed); // 1
        assert_eq!(bad.check_on_boot(3), BootDecision::Proceed); // 2
        assert_eq!(bad.check_on_boot(3), BootDecision::Proceed); // 3
        // 4th attempt crosses the threshold → rollback.
        assert_eq!(
            bad.check_on_boot(3),
            BootDecision::RolledBack {
                from: "2.0.0".into()
            }
        );
        // Live exe restored to the good binary; bad version quarantined;
        // sentinel cleared.
        assert_eq!(read(&bad.exe), "v1-good");
        assert!(bad.is_quarantined("2.0.0"));
        assert!(!bad.sentinel_path.exists());
    }

    #[test]
    fn rollback_without_last_good_proceeds_but_quarantines() {
        // No arm/last-good: a sentinel exists but nothing to restore.
        let (_d, s) = fixture("2.0.0", "v2-broken");
        s.write_sentinel(&Sentinel {
            version: "2.0.0".into(),
            attempts: 5,
        });
        // attempts already past max; rollback finds no last-good, so we
        // Proceed (can't restore) but still quarantine so a future good
        // binary won't re-deploy this one.
        assert_eq!(s.check_on_boot(3), BootDecision::Proceed);
        assert!(s.is_quarantined("2.0.0"));
    }

    #[test]
    fn stale_sentinel_for_other_version_is_cleared() {
        let (_d, s) = fixture("1.0.0", "v1");
        // A sentinel left for a version we are NOT (e.g. we are the
        // rolled-back last-good).
        s.write_sentinel(&Sentinel {
            version: "2.0.0".into(),
            attempts: 9,
        });
        assert_eq!(s.check_on_boot(3), BootDecision::Proceed);
        assert!(!s.sentinel_path.exists());
    }

    #[test]
    fn corrupt_sentinel_is_discarded() {
        // A sentinel that doesn't parse must not gate the boot, and it
        // is removed so it isn't re-read (and re-logged) on every boot.
        let (_d, s) = fixture("1.0.0", "v1");
        fs::write(&s.sentinel_path, "not json").unwrap();
        assert_eq!(s.check_on_boot(3), BootDecision::Proceed);
        assert!(!s.sentinel_path.exists());
    }

    #[test]
    fn confirm_healthy_seeds_last_good_when_absent() {
        // First-ever healthy boot, no swap in flight: last-good is
        // seeded from the running exe and no sentinel appears.
        let (_d, s) = fixture("1.0.0", "v1");
        assert!(!s.last_good.exists());
        s.confirm_healthy().unwrap();
        assert_eq!(read(&s.last_good), "v1");
        assert!(!s.sentinel_path.exists());
    }

    #[test]
    fn confirm_healthy_without_pending_swap_keeps_last_good() {
        // A plain restart of an already-good binary must not churn an
        // existing last-good snapshot.
        let (_d, s) = fixture("1.0.0", "v1");
        fs::write(&s.last_good, "older-good").unwrap();
        s.confirm_healthy().unwrap();
        assert_eq!(read(&s.last_good), "older-good");
    }

    #[test]
    fn quarantine_clear_roundtrip() {
        let (_d, s) = fixture("1.0.0", "v1");
        s.quarantine("2.0.0");
        s.quarantine("2.0.1");
        assert!(s.is_quarantined("2.0.0"));
        assert!(s.is_quarantined("2.0.1"));
        s.clear_quarantine("2.0.0").unwrap();
        assert!(!s.is_quarantined("2.0.0"));
        assert!(s.is_quarantined("2.0.1"));
        assert_eq!(s.quarantined_versions(), vec!["2.0.1".to_string()]);
    }

    #[test]
    fn sentinel_and_quarantine_are_namespaced_per_role() {
        // A backend and an agent share one data_dir. Their sentinel and
        // quarantine files must NOT collide — otherwise one role's
        // confirm_healthy clears the other's pending sentinel, and one
        // role's quarantine masquerades as the other's.
        let dir = TempDir::new().unwrap();
        let be = BootSentinel::new(dir.path(), dir.path().join("kanade-backend.exe"), "1.0.0");
        let ag = BootSentinel::new(dir.path(), dir.path().join("kanade-agent.exe"), "1.0.0");
        assert_ne!(be.sentinel_path, ag.sentinel_path);
        assert_ne!(be.quarantine_path, ag.quarantine_path);

        // Backend arms + quarantines; none of it is visible to the agent.
        fs::write(&be.exe, "be").unwrap();
        be.arm_for_swap(&be.exe, "2.0.0").unwrap();
        be.quarantine("9.9.9");
        assert!(be.is_quarantined("9.9.9"));
        assert!(!ag.is_quarantined("9.9.9"));
        // The agent boots with no sentinel of its own — the backend's
        // pending swap must not make the agent count a phantom attempt.
        assert_eq!(ag.check_on_boot(3), BootDecision::Proceed);
        assert!(be.sentinel_path.exists()); // backend's sentinel survived
    }

    #[test]
    fn attempt_counter_persists_across_checks() {
        // Each check_on_boot simulates a separate boot of the same
        // crashing binary; the counter must accumulate via the file.
        let (_d, s) = fixture("1.0.0", "good");
        s.arm_for_swap(&s.exe, "2.0.0").unwrap();
        fs::write(&s.exe, "broken").unwrap();
        let dir = s.sentinel_path.parent().unwrap().to_path_buf();
        let mk = || BootSentinel::new(&dir, s.exe.clone(), "2.0.0");
        assert_eq!(mk().check_on_boot(2), BootDecision::Proceed); // 1
        assert_eq!(mk().check_on_boot(2), BootDecision::Proceed); // 2
        assert!(matches!(
            mk().check_on_boot(2),
            BootDecision::RolledBack { .. }
        )); // 3 > 2
    }

    #[test]
    fn sibling_appends_suffix_to_full_name() {
        assert_eq!(
            sibling(Path::new("kanade-agent.exe"), LAST_GOOD_SUFFIX),
            PathBuf::from("kanade-agent.exe.last-good")
        );
    }

    #[test]
    fn role_ns_uses_stem_and_falls_back_without_one() {
        assert_eq!(role_ns(Path::new("kanade-backend.exe")), "kanade-backend");
        assert_eq!(role_ns(Path::new("")), "kanade");
    }
}