Skip to main content

kanade_shared/
boot_sentinel.rs

1//! Boot sentinel: auto-rollback to a last-known-good binary when a
2//! freshly-swapped binary crash-loops on startup (#582).
3//!
4//! Both `kanade-backend` and `kanade-agent` are **self-replacing**
5//! Windows services: an update overwrites the running exe and the
6//! Service Control Manager restarts it. If the new binary crashes
7//! during early boot (exactly what the #573 JetStream regression did
8//! to the backend on 2026-06-11), nothing rolls it back — the SCM
9//! just restarts the same broken exe forever.
10//!
//! This module gates each boot. The swap step
//! [`BootSentinel::arm_for_swap`] writes a sentinel and snapshots the
//! outgoing (known-good) binary to `<exe>.last-good`. Every boot calls
//! [`BootSentinel::check_on_boot`] as the very first thing in `main()` —
//! before NATS, the DB, or any bootstrap that can fail — which increments
//! a persisted attempt counter and, once it crosses the crash-loop
//! threshold, restores `.last-good` over the live exe and **quarantines**
//! the failed version so the autonomous self-update path won't
//! immediately re-deploy it (which would loop rollout↔rollback forever).
//! [`BootSentinel::confirm_healthy`], called once the process is
//! genuinely up, promotes the running exe to the new last-good and clears
//! the sentinel.
22//!
23//! The attempt counter is persisted BEFORE the crashy code runs, so a
24//! hard crash still advances it: boot 1..N each bump the counter, and
25//! the boot that crosses the threshold rolls back, after which the SCM
26//! restarts into `.last-good`.
27//!
28//! ## Windows exe lock
29//!
30//! A running exe is locked on Windows (no overwrite), but a *rename*
31//! of the running exe IS allowed. So the rollback renames the live exe
32//! aside (`<exe>.rollback-bak`) and copies `.last-good` into place,
33//! then the caller exits so the SCM relaunches the restored binary.
34//! The same rename-then-replace works on Unix and in unit tests (where
35//! the "exe" is just a temp file), so the logic is testable everywhere.
36
37use std::fs;
38use std::io;
39use std::path::{Path, PathBuf};
40
41use serde::{Deserialize, Serialize};
42use tracing::{error, info, warn};
43
/// File name of the pending-swap sentinel; joined onto the data dir.
const SENTINEL_FILE: &str = ".boot-sentinel.json";
/// File name of the quarantine list; joined onto the data dir.
const QUARANTINE_FILE: &str = ".boot-quarantine.json";
/// Suffix appended to the exe path to name the last-known-good snapshot
/// (`<exe>.last-good`).
const LAST_GOOD_SUFFIX: &str = "last-good";
/// Suffix appended to the exe path to name the file the live exe is renamed
/// to during a rollback (`<exe>.rollback-bak`).
const ROLLBACK_BAK_SUFFIX: &str = "rollback-bak";

/// Crash-loop threshold. Boot attempts `1..=N` proceed; attempt
/// `N+1` triggers the rollback (the check is `attempts <= max`). So
/// the default 3 gives a freshly-swapped binary three chances to
/// confirm healthy and rolls back on the fourth boot — enough to ride
/// out a one-off transient (slow disk, flaky first NATS connect)
/// without masking a genuinely broken binary.
pub const DEFAULT_MAX_ATTEMPTS: u32 = 3;
57
/// Record of a swap that has not been confirmed healthy yet. Serialized as
/// JSON into the sentinel file and re-read on every boot.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct Sentinel {
    /// The version that was swapped in and is awaiting confirmation.
    version: String,
    /// Boot attempts so far for that version (incremented before the
    /// boot can crash).
    attempts: u32,
}
66
/// List of versions that must not be deployed again. Serialized as JSON into
/// the quarantine file; a missing or unreadable file reads back as the empty
/// default.
#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)]
struct Quarantine {
    /// Versions that crash-looped on boot and were rolled back. The
    /// self-update path refuses to swap to any version listed here.
    versions: Vec<String>,
}
73
/// What [`BootSentinel::check_on_boot`] decided. On `RolledBack` the caller
/// MUST exit (non-zero) so the service manager relaunches the restored
/// last-good binary.
///
/// Marked `#[must_use]` because silently dropping the decision means a
/// rolled-back process keeps running the broken binary's code instead of
/// handing over to the restored one.
#[must_use = "on `RolledBack` the process must exit so the restored binary is relaunched"]
#[derive(Debug, PartialEq, Eq)]
pub enum BootDecision {
    /// No pending swap, or the swap is still within its attempt
    /// budget — continue booting normally.
    Proceed,
    /// The swapped-in binary crash-looped; `.last-good` has been
    /// restored over the live exe. Exit now and let the SCM relaunch.
    /// `from` is the version that was rolled back.
    RolledBack { from: String },
}
86
/// Per-role boot guard. Construct once at the top of `main()`.
pub struct BootSentinel {
    /// Location of the pending-swap sentinel file inside the data dir.
    sentinel_path: PathBuf,
    /// Location of the quarantine list file inside the data dir.
    quarantine_path: PathBuf,
    /// Path of the live binary this guard protects.
    exe: PathBuf,
    /// Path of the last-known-good snapshot: the exe path with a
    /// `.last-good` suffix appended.
    last_good: PathBuf,
    /// Version string of the binary that constructed this guard.
    version: String,
}
95
96impl BootSentinel {
97    /// `data_dir` holds the sentinel/quarantine state; `exe` is the
98    /// live binary path (`std::env::current_exe()` in production);
99    /// `version` is this binary's own version string.
100    pub fn new(data_dir: &Path, exe: PathBuf, version: impl Into<String>) -> Self {
101        let last_good = sibling(&exe, LAST_GOOD_SUFFIX);
102        Self {
103            sentinel_path: data_dir.join(SENTINEL_FILE),
104            quarantine_path: data_dir.join(QUARANTINE_FILE),
105            exe,
106            last_good,
107            version: version.into(),
108        }
109    }
110
111    /// Call FIRST in `main()`, before anything that can crash.
112    ///
113    /// - No sentinel → `Proceed`.
114    /// - Sentinel for a different version (we already rolled back, or
115    ///   last-good is now live) → clear it, `Proceed`.
116    /// - Sentinel for THIS version → bump attempts; attempts
117    ///   `1..=max_attempts` `Proceed`, and the first that EXCEEDS
118    ///   `max_attempts` rolls back to `.last-good` + quarantines the
119    ///   bad version and returns `RolledBack`.
120    pub fn check_on_boot(&self, max_attempts: u32) -> BootDecision {
121        let Some(mut sentinel) = self.read_sentinel() else {
122            return BootDecision::Proceed;
123        };
124        if sentinel.version != self.version {
125            // A different binary is running than the sentinel expected
126            // — the swap already resolved (rollback or a later update).
127            // Stale marker; drop it and boot normally.
128            let _ = fs::remove_file(&self.sentinel_path);
129            return BootDecision::Proceed;
130        }
131
132        sentinel.attempts += 1;
133        info!(
134            version = %self.version,
135            attempts = sentinel.attempts,
136            max = max_attempts,
137            "boot sentinel: unconfirmed swap, recording boot attempt",
138        );
139        // Persist the bumped count BEFORE returning so a crash later
140        // this boot still advances the counter.
141        self.write_sentinel(&sentinel);
142
143        if sentinel.attempts <= max_attempts {
144            return BootDecision::Proceed;
145        }
146
147        // Crash-loop confirmed → roll back.
148        match self.rollback() {
149            Ok(true) => {
150                self.quarantine(&self.version);
151                let _ = fs::remove_file(&self.sentinel_path);
152                error!(
153                    version = %self.version,
154                    attempts = sentinel.attempts,
155                    "boot sentinel: crash-loop — rolled back to last-good and quarantined this version",
156                );
157                BootDecision::RolledBack {
158                    from: self.version.clone(),
159                }
160            }
161            Ok(false) => {
162                // No last-good to roll back to (first install). We
163                // can't restore a binary, but still quarantine the bad
164                // version so that IF a good binary ever comes up it
165                // won't re-deploy this one — and so the self-update
166                // path's refusal is consistent. We keep Proceeding
167                // (nothing better to do than let it keep trying).
168                self.quarantine(&self.version);
169                error!(
170                    version = %self.version,
171                    "boot sentinel: crash-loop but no last-good binary to roll back to; \
172                     quarantined the version and continuing (no rollback target)",
173                );
174                BootDecision::Proceed
175            }
176            Err(e) => {
177                error!(error = %e, "boot sentinel: rollback failed; continuing without it");
178                BootDecision::Proceed
179            }
180        }
181    }
182
183    /// Call once the process is confirmed healthy (backend: serving;
184    /// agent: NATS connected + first heartbeat). Promotes the live exe
185    /// to `.last-good` and clears the sentinel, so this version becomes
186    /// the rollback target for the next swap.
187    pub fn confirm_healthy(&self) -> io::Result<()> {
188        // Only promote/clear when there's an unconfirmed swap for THIS
189        // version — a plain restart of an already-good binary has no
190        // sentinel and shouldn't churn last-good.
191        let pending = matches!(self.read_sentinel(), Some(s) if s.version == self.version);
192        if pending {
193            if let Err(e) = fs::copy(&self.exe, &self.last_good) {
194                warn!(error = %e, "boot sentinel: could not promote exe to last-good");
195            } else {
196                info!(version = %self.version, "boot sentinel: confirmed healthy, promoted to last-good");
197            }
198            let _ = fs::remove_file(&self.sentinel_path);
199        } else if !self.last_good.exists() {
200            // First-ever healthy boot with no swap in flight: seed
201            // last-good so a future bad swap has something to fall
202            // back to.
203            if let Err(e) = fs::copy(&self.exe, &self.last_good) {
204                warn!(error = %e, "boot sentinel: could not seed last-good");
205            } else {
206                info!(version = %self.version, "boot sentinel: seeded initial last-good");
207            }
208        }
209        Ok(())
210    }
211
212    /// Call at swap time (deploy / self-update), before restarting into
213    /// the new binary. Snapshots the CURRENT (outgoing, known-good) exe
214    /// to `.last-good` and writes a fresh sentinel for `new_version` so
215    /// the next boot is gated.
216    ///
217    /// `current_exe` is the still-running good binary (copy it now,
218    /// before it's overwritten by the swap).
219    pub fn arm_for_swap(&self, current_exe: &Path, new_version: &str) -> io::Result<()> {
220        // The outgoing binary booted fine (it's running), so it's the
221        // rollback target.
222        fs::copy(current_exe, &self.last_good)?;
223        self.write_sentinel(&Sentinel {
224            version: new_version.to_string(),
225            attempts: 0,
226        });
227        info!(
228            new_version,
229            "boot sentinel: armed for swap (last-good snapshotted)"
230        );
231        Ok(())
232    }
233
234    /// True if `version` was rolled back after a failed boot. The
235    /// self-update path consults this before swapping so a bad rollout
236    /// target isn't re-attempted in a loop.
237    pub fn is_quarantined(&self, version: &str) -> bool {
238        self.read_quarantine().versions.iter().any(|v| v == version)
239    }
240
241    /// Every quarantined version (#582 Phase 2). The agent reports
242    /// these in its heartbeat so the SPA rollout view can flag which
243    /// PCs failed to adopt a target.
244    pub fn quarantined_versions(&self) -> Vec<String> {
245        self.read_quarantine().versions
246    }
247
248    /// Drop `version` from quarantine (operator re-published a fixed
249    /// binary under the same version string).
250    pub fn clear_quarantine(&self, version: &str) -> io::Result<()> {
251        let mut q = self.read_quarantine();
252        let before = q.versions.len();
253        q.versions.retain(|v| v != version);
254        if q.versions.len() != before {
255            self.write_quarantine(&q);
256        }
257        Ok(())
258    }
259
260    // ── internals ────────────────────────────────────────────────
261
262    /// Rename the live exe aside and copy `.last-good` into its place.
263    /// Returns `Ok(false)` when there's no last-good to restore.
264    fn rollback(&self) -> io::Result<bool> {
265        if !self.last_good.exists() {
266            return Ok(false);
267        }
268        let bak = sibling(&self.exe, ROLLBACK_BAK_SUFFIX);
269        // Best-effort: a leftover .rollback-bak from a prior cycle
270        // would block the rename.
271        let _ = fs::remove_file(&bak);
272        // Rename of a running/locked exe is permitted on Windows; the
273        // copy then lands a fresh file at the exe path.
274        fs::rename(&self.exe, &bak)?;
275        if let Err(e) = fs::copy(&self.last_good, &self.exe) {
276            // The exe path is now EMPTY (renamed to .rollback-bak,
277            // copy failed). Put the original back so the next SCM
278            // restart isn't left with no binary at all — mirroring the
279            // compensating rollback in self_update::swap_and_restart.
280            match fs::rename(&bak, &self.exe) {
281                Ok(()) => warn!(
282                    error = %e,
283                    "boot sentinel: last-good copy failed; restored the original exe in place",
284                ),
285                Err(restore_err) => error!(
286                    error = %e,
287                    restore_error = %restore_err,
288                    exe = ?self.exe,
289                    backup = ?bak,
290                    "boot sentinel: last-good copy failed AND restore failed — service binary path \
291                     is EMPTY; manual repair required (rename the .rollback-bak file back)",
292                ),
293            }
294            return Err(e);
295        }
296        Ok(true)
297    }
298
299    fn quarantine(&self, version: &str) {
300        let mut q = self.read_quarantine();
301        if !q.versions.iter().any(|v| v == version) {
302            q.versions.push(version.to_string());
303            self.write_quarantine(&q);
304        }
305    }
306
307    fn read_sentinel(&self) -> Option<Sentinel> {
308        let bytes = fs::read(&self.sentinel_path).ok()?;
309        match serde_json::from_slice(&bytes) {
310            Ok(s) => Some(s),
311            Err(e) => {
312                warn!(error = %e, "boot sentinel: corrupt sentinel, ignoring");
313                let _ = fs::remove_file(&self.sentinel_path);
314                None
315            }
316        }
317    }
318
319    fn write_sentinel(&self, s: &Sentinel) {
320        match serde_json::to_vec(s) {
321            Ok(bytes) => {
322                if let Err(e) = atomic_write(&self.sentinel_path, &bytes) {
323                    warn!(error = %e, "boot sentinel: write sentinel failed");
324                }
325            }
326            Err(e) => warn!(error = %e, "boot sentinel: encode sentinel failed"),
327        }
328    }
329
330    fn read_quarantine(&self) -> Quarantine {
331        fs::read(&self.quarantine_path)
332            .ok()
333            .and_then(|b| serde_json::from_slice(&b).ok())
334            .unwrap_or_default()
335    }
336
337    fn write_quarantine(&self, q: &Quarantine) {
338        match serde_json::to_vec(q) {
339            Ok(bytes) => {
340                if let Err(e) = atomic_write(&self.quarantine_path, &bytes) {
341                    warn!(error = %e, "boot sentinel: write quarantine failed");
342                }
343            }
344            Err(e) => warn!(error = %e, "boot sentinel: encode quarantine failed"),
345        }
346    }
347}
348
/// Appends `.<suffix>` to the whole of `path`, keeping any existing
/// extension (e.g. `kanade-agent.exe` → `kanade-agent.exe.last-good`).
fn sibling(path: &Path, suffix: &str) -> PathBuf {
    let mut joined = path.as_os_str().to_owned();
    for piece in [".", suffix] {
        joined.push(piece);
    }
    joined.into()
}
356
357/// Write via a temp file + rename so a crash mid-write never leaves a
358/// torn sentinel/quarantine the next boot would misread. Creates the
359/// parent dir first — on a clean install the data dir may not exist
360/// yet when the first swap arms the sentinel.
361fn atomic_write(path: &Path, bytes: &[u8]) -> io::Result<()> {
362    if let Some(parent) = path.parent() {
363        fs::create_dir_all(parent)?;
364    }
365    let tmp = sibling(path, "tmp");
366    fs::write(&tmp, bytes)?;
367    fs::rename(&tmp, path)
368}
369
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates a temp dir containing a fake "exe" whose contents are
    /// `body`, plus a guard for it that identifies as `version`. The
    /// `TempDir` is returned so the directory outlives the guard.
    fn fixture(version: &str, body: &str) -> (TempDir, BootSentinel) {
        let tmp = TempDir::new().unwrap();
        let exe_path = tmp.path().join("kanade-agent.exe");
        fs::write(&exe_path, body).unwrap();
        let guard = BootSentinel::new(tmp.path(), exe_path, version);
        (tmp, guard)
    }

    /// A new guard over the same data dir and exe as `base`, identifying
    /// as `version` — i.e. what a fresh boot of that version would build.
    fn reboot_as(base: &BootSentinel, version: &str) -> BootSentinel {
        let data_dir = base.sentinel_path.parent().unwrap();
        BootSentinel::new(data_dir, base.exe.clone(), version)
    }

    /// File contents as a string; panics if unreadable.
    fn contents_of(p: &Path) -> String {
        fs::read_to_string(p).unwrap()
    }

    #[test]
    fn no_sentinel_proceeds() {
        let (_tmp, guard) = fixture("1.0.0", "v1");
        let decision = guard.check_on_boot(3);
        assert_eq!(decision, BootDecision::Proceed);
    }

    #[test]
    fn arm_snapshots_last_good_and_writes_sentinel() {
        let (_tmp, good) = fixture("1.0.0", "v1-good");
        // A new binary is about to be swapped in: arm using the current
        // (good) exe as the snapshot source.
        good.arm_for_swap(&good.exe, "2.0.0").unwrap();
        assert_eq!(contents_of(&good.last_good), "v1-good");
        assert!(good.sentinel_path.exists());
    }

    #[test]
    fn healthy_swap_confirms_and_promotes() {
        let (_tmp, good) = fixture("1.0.0", "v1-good");
        good.arm_for_swap(&good.exe, "2.0.0").unwrap();
        // The 2.0.0 binary lands on disk and boots with its own guard.
        fs::write(&good.exe, "v2").unwrap();
        let fresh = reboot_as(&good, "2.0.0");
        assert_eq!(fresh.check_on_boot(3), BootDecision::Proceed);
        fresh.confirm_healthy().unwrap();
        // v2 is the new rollback target and nothing is pending any more.
        assert_eq!(contents_of(&fresh.last_good), "v2");
        assert!(!fresh.sentinel_path.exists());
        assert!(!fresh.is_quarantined("2.0.0"));
    }

    #[test]
    fn crash_loop_rolls_back_and_quarantines() {
        let (_tmp, good) = fixture("1.0.0", "v1-good");
        good.arm_for_swap(&good.exe, "2.0.0").unwrap();
        // 2.0.0 goes live and crash-loops.
        fs::write(&good.exe, "v2-broken").unwrap();
        let bad = reboot_as(&good, "2.0.0");

        // Boots 1, 2 and 3 stay within budget (each would then crash).
        for _boot in 1..=3 {
            assert_eq!(bad.check_on_boot(3), BootDecision::Proceed);
        }
        // The 4th boot exceeds the threshold and triggers the rollback.
        let fourth = bad.check_on_boot(3);
        assert_eq!(
            fourth,
            BootDecision::RolledBack {
                from: "2.0.0".into()
            }
        );
        // Good binary back in place, bad version quarantined, sentinel gone.
        assert_eq!(contents_of(&bad.exe), "v1-good");
        assert!(bad.is_quarantined("2.0.0"));
        assert!(!bad.sentinel_path.exists());
    }

    #[test]
    fn rollback_without_last_good_proceeds_but_quarantines() {
        // Never armed, so there is no last-good: only a sentinel exists.
        let (_tmp, guard) = fixture("2.0.0", "v2-broken");
        let exhausted = Sentinel {
            version: "2.0.0".into(),
            attempts: 5,
        };
        guard.write_sentinel(&exhausted);
        // Already past the budget, but with nothing to restore the boot
        // proceeds; the version is quarantined regardless so a future
        // good binary won't re-deploy it.
        assert_eq!(guard.check_on_boot(3), BootDecision::Proceed);
        assert!(guard.is_quarantined("2.0.0"));
    }

    #[test]
    fn stale_sentinel_for_other_version_is_cleared() {
        let (_tmp, guard) = fixture("1.0.0", "v1");
        // The sentinel names a version this binary is NOT (e.g. we are
        // the rolled-back last-good).
        let stale = Sentinel {
            version: "2.0.0".into(),
            attempts: 9,
        };
        guard.write_sentinel(&stale);
        assert_eq!(guard.check_on_boot(3), BootDecision::Proceed);
        assert!(!guard.sentinel_path.exists());
    }

    #[test]
    fn quarantine_clear_roundtrip() {
        let (_tmp, guard) = fixture("1.0.0", "v1");
        for bad_version in ["2.0.0", "2.0.1"] {
            guard.quarantine(bad_version);
        }
        assert!(guard.is_quarantined("2.0.0"));
        assert!(guard.is_quarantined("2.0.1"));
        guard.clear_quarantine("2.0.0").unwrap();
        assert!(!guard.is_quarantined("2.0.0"));
        assert!(guard.is_quarantined("2.0.1"));
    }

    #[test]
    fn attempt_counter_persists_across_checks() {
        // Every check uses a brand-new guard, like a separate boot of the
        // same crashing binary, so the count can only carry over via the
        // sentinel file.
        let (_tmp, good) = fixture("1.0.0", "good");
        good.arm_for_swap(&good.exe, "2.0.0").unwrap();
        fs::write(&good.exe, "broken").unwrap();

        let first = reboot_as(&good, "2.0.0").check_on_boot(2);
        assert_eq!(first, BootDecision::Proceed); // 1
        let second = reboot_as(&good, "2.0.0").check_on_boot(2);
        assert_eq!(second, BootDecision::Proceed); // 2
        let third = reboot_as(&good, "2.0.0").check_on_boot(2);
        assert!(matches!(third, BootDecision::RolledBack { .. })); // 3 > 2
    }
}
503}