kanade_shared/boot_sentinel.rs
1//! Boot sentinel: auto-rollback to a last-known-good binary when a
2//! freshly-swapped binary crash-loops on startup (#582).
3//!
4//! Both `kanade-backend` and `kanade-agent` are **self-replacing**
5//! Windows services: an update overwrites the running exe and the
6//! Service Control Manager restarts it. If the new binary crashes
7//! during early boot (exactly what the #573 JetStream regression did
8//! to the backend on 2026-06-11), nothing rolls it back — the SCM
9//! just restarts the same broken exe forever.
10//!
//! This module gates each boot. The swap step
//! [`BootSentinel::arm_for_swap`] snapshots the outgoing (known-good)
//! binary to `<exe>.last-good` and writes a sentinel. The sentinel and
//! quarantine files live in the shared `data_dir` but are namespaced by
//! the exe's role (`.boot-sentinel-<role>.json`), so a backend and an
//! agent co-located on the same host keep independent boot state instead
//! of clobbering a single shared file. Every boot calls
//! [`BootSentinel::check_on_boot`] as the very first thing in `main()` —
//! before NATS, the DB, or any bootstrap that can fail — which, while a
//! swap is still unconfirmed, increments a persisted attempt counter and,
//! once it crosses the crash-loop threshold, restores `.last-good` over
//! the live exe and **quarantines** the failed version so the autonomous
//! self-update path won't immediately re-deploy it (which would loop
//! rollout↔rollback forever). [`BootSentinel::confirm_healthy`], called
//! once the process is genuinely up, promotes the running exe to the new
//! last-good and clears the sentinel.
26//!
27//! The attempt counter is persisted BEFORE the crashy code runs, so a
28//! hard crash still advances it: boot 1..N each bump the counter, and
29//! the boot that crosses the threshold rolls back, after which the SCM
30//! restarts into `.last-good`.
31//!
32//! ## Windows exe lock
33//!
34//! A running exe is locked on Windows (no overwrite), but a *rename*
35//! of the running exe IS allowed. So the rollback renames the live exe
36//! aside (`<exe>.rollback-bak`) and copies `.last-good` into place,
37//! then the caller exits so the SCM relaunches the restored binary.
38//! The same rename-then-replace works on Unix and in unit tests (where
39//! the "exe" is just a temp file), so the logic is testable everywhere.
40
41use std::fs;
42use std::io;
43use std::path::{Path, PathBuf};
44
45use serde::{Deserialize, Serialize};
46use tracing::{error, info, warn};
47
/// Filename prefix of the pending-swap sentinel inside the data dir.
/// `BootSentinel::new` appends the exe's role (`-kanade-backend` /
/// `-kanade-agent`) and `.json`, so a co-located backend + agent — which
/// share one `data_dir` — keep SEPARATE boot state and can't clobber each
/// other's pending sentinel.
const SENTINEL_PREFIX: &str = ".boot-sentinel";
/// Filename prefix of the quarantine list; role-suffixed exactly like the
/// sentinel so each role owns its own list.
const QUARANTINE_PREFIX: &str = ".boot-quarantine";
/// Suffix of the rollback target (`<exe>.last-good`). Already per-role:
/// the file sits next to each exe rather than in the data dir.
const LAST_GOOD_SUFFIX: &str = "last-good";
/// Suffix the live exe is renamed to when a rollback moves it aside
/// (`<exe>.rollback-bak`).
const ROLLBACK_BAK_SUFFIX: &str = "rollback-bak";
58
/// Derives the role namespace used in the sentinel/quarantine filenames
/// from the exe's file stem (`kanade-backend.exe` → `kanade-backend`).
///
/// Every call site is expected to pass the role's canonical exe, so the
/// namespace stays stable across arm / boot / confirm for a given role.
/// Returns the fallback `"kanade"` only when the path has no file stem.
fn role_ns(exe: &Path) -> String {
    match exe.file_stem() {
        // Lossy conversion on purpose: a non-UTF-8 exe path must still
        // keep backend and agent DISTINCT. A strict `to_str()` would
        // yield `None` for such a path and collapse both roles onto the
        // fallback below, re-introducing the collision the namespacing
        // exists to prevent. Lossy conversion keeps the differing stems.
        Some(stem) => stem.to_string_lossy().into_owned(),
        // No file stem at all: no real exe, so no role to separate.
        None => String::from("kanade"),
    }
}
77
/// Default crash-loop threshold for `BootSentinel::check_on_boot`.
///
/// The boot check proceeds while `attempts <= max`, so attempts `1..=N`
/// boot normally and attempt `N+1` triggers the rollback. With the
/// default of 3, a freshly-swapped binary gets three chances to confirm
/// healthy and is rolled back on the fourth boot — enough slack to ride
/// out a one-off transient (slow disk, flaky first NATS connect) without
/// masking a genuinely broken binary.
pub const DEFAULT_MAX_ATTEMPTS: u32 = 3;
85
86#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
87struct Sentinel {
88 /// The version that was swapped in and is awaiting confirmation.
89 version: String,
90 /// Boot attempts so far for that version (incremented before the
91 /// boot can crash).
92 attempts: u32,
93}
94
95#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)]
96struct Quarantine {
97 /// Versions that crash-looped on boot and were rolled back. The
98 /// self-update path refuses to swap to any version listed here.
99 versions: Vec<String>,
100}
101
/// Outcome of [`BootSentinel::check_on_boot`].
///
/// On [`BootDecision::RolledBack`] the caller MUST exit (non-zero) so the
/// service manager relaunches the restored last-good binary. The type is
/// `#[must_use]` so that silently dropping the decision is flagged at
/// compile time.
#[must_use = "on `RolledBack` the caller must exit so the service manager relaunches last-good"]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BootDecision {
    /// No pending swap, or the swap is still within its attempt
    /// budget — continue booting normally.
    Proceed,
    /// The swapped-in binary crash-looped; `.last-good` has been
    /// restored over the live exe. Exit now and let the SCM relaunch.
    RolledBack {
        /// The version that crash-looped and was rolled back from.
        from: String,
    },
}
114
/// Per-role boot guard. Construct once at the top of `main()`.
///
/// Holds only paths and the running version; all state lives in the
/// files those paths point at.
#[derive(Debug)]
pub struct BootSentinel {
    /// Role-namespaced sentinel file inside the data dir.
    sentinel_path: PathBuf,
    /// Role-namespaced quarantine list inside the data dir.
    quarantine_path: PathBuf,
    /// The live binary this guard protects.
    exe: PathBuf,
    /// Rollback target stored beside the exe (`<exe>.last-good`).
    last_good: PathBuf,
    /// Version string of the binary that constructed this guard.
    version: String,
}
123
124impl BootSentinel {
125 /// `data_dir` holds the sentinel/quarantine state; `exe` is the
126 /// live binary path (`std::env::current_exe()` in production);
127 /// `version` is this binary's own version string.
128 pub fn new(data_dir: &Path, exe: PathBuf, version: impl Into<String>) -> Self {
129 let last_good = sibling(&exe, LAST_GOOD_SUFFIX);
130 let role = role_ns(&exe);
131 Self {
132 sentinel_path: data_dir.join(format!("{SENTINEL_PREFIX}-{role}.json")),
133 quarantine_path: data_dir.join(format!("{QUARANTINE_PREFIX}-{role}.json")),
134 exe,
135 last_good,
136 version: version.into(),
137 }
138 }
139
140 /// Call FIRST in `main()`, before anything that can crash.
141 ///
142 /// - No sentinel → `Proceed`.
143 /// - Sentinel for a different version (we already rolled back, or
144 /// last-good is now live) → clear it, `Proceed`.
145 /// - Sentinel for THIS version → bump attempts; attempts
146 /// `1..=max_attempts` `Proceed`, and the first that EXCEEDS
147 /// `max_attempts` rolls back to `.last-good` + quarantines the
148 /// bad version and returns `RolledBack`.
149 pub fn check_on_boot(&self, max_attempts: u32) -> BootDecision {
150 let Some(mut sentinel) = self.read_sentinel() else {
151 return BootDecision::Proceed;
152 };
153 if sentinel.version != self.version {
154 // A different binary is running than the sentinel expected
155 // — the swap already resolved (rollback or a later update).
156 // Stale marker; drop it and boot normally.
157 let _ = fs::remove_file(&self.sentinel_path);
158 return BootDecision::Proceed;
159 }
160
161 sentinel.attempts += 1;
162 info!(
163 version = %self.version,
164 attempts = sentinel.attempts,
165 max = max_attempts,
166 "boot sentinel: unconfirmed swap, recording boot attempt",
167 );
168 // Persist the bumped count BEFORE returning so a crash later
169 // this boot still advances the counter.
170 self.write_sentinel(&sentinel);
171
172 if sentinel.attempts <= max_attempts {
173 return BootDecision::Proceed;
174 }
175
176 // Crash-loop confirmed → roll back.
177 match self.rollback() {
178 Ok(true) => {
179 self.quarantine(&self.version);
180 let _ = fs::remove_file(&self.sentinel_path);
181 error!(
182 version = %self.version,
183 attempts = sentinel.attempts,
184 "boot sentinel: crash-loop — rolled back to last-good and quarantined this version",
185 );
186 BootDecision::RolledBack {
187 from: self.version.clone(),
188 }
189 }
190 Ok(false) => {
191 // No last-good to roll back to (first install). We
192 // can't restore a binary, but still quarantine the bad
193 // version so that IF a good binary ever comes up it
194 // won't re-deploy this one — and so the self-update
195 // path's refusal is consistent. We keep Proceeding
196 // (nothing better to do than let it keep trying).
197 self.quarantine(&self.version);
198 error!(
199 version = %self.version,
200 "boot sentinel: crash-loop but no last-good binary to roll back to; \
201 quarantined the version and continuing (no rollback target)",
202 );
203 BootDecision::Proceed
204 }
205 Err(e) => {
206 error!(error = %e, "boot sentinel: rollback failed; continuing without it");
207 BootDecision::Proceed
208 }
209 }
210 }
211
212 /// Call once the process is confirmed healthy (backend: serving;
213 /// agent: NATS connected + first heartbeat). Promotes the live exe
214 /// to `.last-good` and clears the sentinel, so this version becomes
215 /// the rollback target for the next swap.
216 pub fn confirm_healthy(&self) -> io::Result<()> {
217 // Only promote/clear when there's an unconfirmed swap for THIS
218 // version — a plain restart of an already-good binary has no
219 // sentinel and shouldn't churn last-good.
220 let pending = matches!(self.read_sentinel(), Some(s) if s.version == self.version);
221 if pending {
222 if let Err(e) = fs::copy(&self.exe, &self.last_good) {
223 warn!(error = %e, "boot sentinel: could not promote exe to last-good");
224 } else {
225 info!(version = %self.version, "boot sentinel: confirmed healthy, promoted to last-good");
226 }
227 let _ = fs::remove_file(&self.sentinel_path);
228 } else if !self.last_good.exists() {
229 // First-ever healthy boot with no swap in flight: seed
230 // last-good so a future bad swap has something to fall
231 // back to.
232 if let Err(e) = fs::copy(&self.exe, &self.last_good) {
233 warn!(error = %e, "boot sentinel: could not seed last-good");
234 } else {
235 info!(version = %self.version, "boot sentinel: seeded initial last-good");
236 }
237 }
238 Ok(())
239 }
240
241 /// Call at swap time (deploy / self-update), before restarting into
242 /// the new binary. Snapshots the CURRENT (outgoing, known-good) exe
243 /// to `.last-good` and writes a fresh sentinel for `new_version` so
244 /// the next boot is gated.
245 ///
246 /// `current_exe` is the still-running good binary (copy it now,
247 /// before it's overwritten by the swap).
248 pub fn arm_for_swap(&self, current_exe: &Path, new_version: &str) -> io::Result<()> {
249 // The outgoing binary booted fine (it's running), so it's the
250 // rollback target.
251 fs::copy(current_exe, &self.last_good)?;
252 self.write_sentinel(&Sentinel {
253 version: new_version.to_string(),
254 attempts: 0,
255 });
256 info!(
257 new_version,
258 "boot sentinel: armed for swap (last-good snapshotted)"
259 );
260 Ok(())
261 }
262
263 /// True if `version` was rolled back after a failed boot. The
264 /// self-update path consults this before swapping so a bad rollout
265 /// target isn't re-attempted in a loop.
266 pub fn is_quarantined(&self, version: &str) -> bool {
267 self.read_quarantine().versions.iter().any(|v| v == version)
268 }
269
270 /// Every quarantined version (#582 Phase 2). The agent reports
271 /// these in its heartbeat so the SPA rollout view can flag which
272 /// PCs failed to adopt a target.
273 pub fn quarantined_versions(&self) -> Vec<String> {
274 self.read_quarantine().versions
275 }
276
277 /// Drop `version` from quarantine (operator re-published a fixed
278 /// binary under the same version string).
279 pub fn clear_quarantine(&self, version: &str) -> io::Result<()> {
280 let mut q = self.read_quarantine();
281 let before = q.versions.len();
282 q.versions.retain(|v| v != version);
283 if q.versions.len() != before {
284 self.write_quarantine(&q);
285 }
286 Ok(())
287 }
288
289 // ── internals ────────────────────────────────────────────────
290
291 /// Rename the live exe aside and copy `.last-good` into its place.
292 /// Returns `Ok(false)` when there's no last-good to restore.
293 fn rollback(&self) -> io::Result<bool> {
294 if !self.last_good.exists() {
295 return Ok(false);
296 }
297 let bak = sibling(&self.exe, ROLLBACK_BAK_SUFFIX);
298 // Best-effort: a leftover .rollback-bak from a prior cycle
299 // would block the rename.
300 let _ = fs::remove_file(&bak);
301 // Rename of a running/locked exe is permitted on Windows; the
302 // copy then lands a fresh file at the exe path.
303 fs::rename(&self.exe, &bak)?;
304 if let Err(e) = fs::copy(&self.last_good, &self.exe) {
305 // The exe path is now EMPTY (renamed to .rollback-bak,
306 // copy failed). Put the original back so the next SCM
307 // restart isn't left with no binary at all — mirroring the
308 // compensating rollback in self_update::swap_and_restart.
309 match fs::rename(&bak, &self.exe) {
310 Ok(()) => warn!(
311 error = %e,
312 "boot sentinel: last-good copy failed; restored the original exe in place",
313 ),
314 Err(restore_err) => error!(
315 error = %e,
316 restore_error = %restore_err,
317 exe = ?self.exe,
318 backup = ?bak,
319 "boot sentinel: last-good copy failed AND restore failed — service binary path \
320 is EMPTY; manual repair required (rename the .rollback-bak file back)",
321 ),
322 }
323 return Err(e);
324 }
325 Ok(true)
326 }
327
328 fn quarantine(&self, version: &str) {
329 let mut q = self.read_quarantine();
330 if !q.versions.iter().any(|v| v == version) {
331 q.versions.push(version.to_string());
332 self.write_quarantine(&q);
333 }
334 }
335
336 fn read_sentinel(&self) -> Option<Sentinel> {
337 let bytes = fs::read(&self.sentinel_path).ok()?;
338 match serde_json::from_slice(&bytes) {
339 Ok(s) => Some(s),
340 Err(e) => {
341 warn!(error = %e, "boot sentinel: corrupt sentinel, ignoring");
342 let _ = fs::remove_file(&self.sentinel_path);
343 None
344 }
345 }
346 }
347
348 fn write_sentinel(&self, s: &Sentinel) {
349 match serde_json::to_vec(s) {
350 Ok(bytes) => {
351 if let Err(e) = atomic_write(&self.sentinel_path, &bytes) {
352 warn!(error = %e, "boot sentinel: write sentinel failed");
353 }
354 }
355 Err(e) => warn!(error = %e, "boot sentinel: encode sentinel failed"),
356 }
357 }
358
359 fn read_quarantine(&self) -> Quarantine {
360 fs::read(&self.quarantine_path)
361 .ok()
362 .and_then(|b| serde_json::from_slice(&b).ok())
363 .unwrap_or_default()
364 }
365
366 fn write_quarantine(&self, q: &Quarantine) {
367 match serde_json::to_vec(q) {
368 Ok(bytes) => {
369 if let Err(e) = atomic_write(&self.quarantine_path, &bytes) {
370 warn!(error = %e, "boot sentinel: write quarantine failed");
371 }
372 }
373 Err(e) => warn!(error = %e, "boot sentinel: encode quarantine failed"),
374 }
375 }
376}
377
/// Appends `.<suffix>` to the full path, keeping the existing extension
/// (e.g. `kanade-agent.exe` → `kanade-agent.exe.last-good`).
fn sibling(path: &Path, suffix: &str) -> PathBuf {
    let mut joined = path.as_os_str().to_owned();
    joined.push(format!(".{suffix}"));
    joined.into()
}
385
386/// Write via a temp file + rename so a crash mid-write never leaves a
387/// torn sentinel/quarantine the next boot would misread. Creates the
388/// parent dir first — on a clean install the data dir may not exist
389/// yet when the first swap arms the sentinel.
390fn atomic_write(path: &Path, bytes: &[u8]) -> io::Result<()> {
391 if let Some(parent) = path.parent() {
392 fs::create_dir_all(parent)?;
393 }
394 let tmp = sibling(path, "tmp");
395 fs::write(&tmp, bytes)?;
396 fs::rename(&tmp, path)
397}
398
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Build a sentinel over a temp dir with a fake "exe" containing
    /// `body`, at `version`. The temp dir doubles as the data dir.
    fn fixture(version: &str, body: &str) -> (TempDir, BootSentinel) {
        let dir = TempDir::new().unwrap();
        let exe = dir.path().join("kanade-agent.exe");
        fs::write(&exe, body).unwrap();
        let s = BootSentinel::new(dir.path(), exe, version);
        (dir, s)
    }

    fn read(p: &Path) -> String {
        fs::read_to_string(p).unwrap()
    }

    #[test]
    fn no_sentinel_proceeds() {
        let (_d, s) = fixture("1.0.0", "v1");
        assert_eq!(s.check_on_boot(3), BootDecision::Proceed);
    }

    #[test]
    fn arm_snapshots_last_good_and_writes_sentinel() {
        let (_d, s) = fixture("1.0.0", "v1-good");
        // Pretend a new binary will be swapped in; arm with the
        // current (good) exe.
        s.arm_for_swap(&s.exe, "2.0.0").unwrap();
        assert_eq!(read(&s.last_good), "v1-good");
        assert!(s.sentinel_path.exists());
    }

    #[test]
    fn healthy_swap_confirms_and_promotes() {
        let (dir, s) = fixture("1.0.0", "v1-good");
        s.arm_for_swap(&s.exe, "2.0.0").unwrap();
        // Now the 2.0.0 binary boots. Simulate by writing a new exe
        // body + constructing a guard that reports version 2.0.0.
        fs::write(&s.exe, "v2").unwrap();
        let s2 = BootSentinel::new(dir.path(), s.exe.clone(), "2.0.0");
        assert_eq!(s2.check_on_boot(3), BootDecision::Proceed);
        s2.confirm_healthy().unwrap();
        // last-good is now v2; sentinel cleared.
        assert_eq!(read(&s2.last_good), "v2");
        assert!(!s2.sentinel_path.exists());
        assert!(!s2.is_quarantined("2.0.0"));
    }

    #[test]
    fn crash_loop_rolls_back_and_quarantines() {
        let (dir, s) = fixture("1.0.0", "v1-good");
        s.arm_for_swap(&s.exe, "2.0.0").unwrap();
        // The 2.0.0 binary is now live and crash-loops.
        fs::write(&s.exe, "v2-broken").unwrap();
        let bad = BootSentinel::new(dir.path(), s.exe.clone(), "2.0.0");

        // attempts 1..3 proceed (each boot would then crash).
        assert_eq!(bad.check_on_boot(3), BootDecision::Proceed); // 1
        assert_eq!(bad.check_on_boot(3), BootDecision::Proceed); // 2
        assert_eq!(bad.check_on_boot(3), BootDecision::Proceed); // 3
        // 4th attempt crosses the threshold → rollback.
        assert_eq!(
            bad.check_on_boot(3),
            BootDecision::RolledBack {
                from: "2.0.0".into()
            }
        );
        // Live exe restored to the good binary; bad version quarantined;
        // sentinel cleared.
        assert_eq!(read(&bad.exe), "v1-good");
        assert!(bad.is_quarantined("2.0.0"));
        assert!(!bad.sentinel_path.exists());
    }

    #[test]
    fn rollback_without_last_good_proceeds_but_quarantines() {
        // No arm/last-good: a sentinel exists but nothing to restore.
        let (_d, s) = fixture("2.0.0", "v2-broken");
        s.write_sentinel(&Sentinel {
            version: "2.0.0".into(),
            attempts: 5,
        });
        // attempts already past max; rollback finds no last-good, so we
        // Proceed (can't restore) but still quarantine so a future good
        // binary won't re-deploy this one.
        assert_eq!(s.check_on_boot(3), BootDecision::Proceed);
        assert!(s.is_quarantined("2.0.0"));
        // The live exe was left alone.
        assert_eq!(read(&s.exe), "v2-broken");
    }

    #[test]
    fn stale_sentinel_for_other_version_is_cleared() {
        let (_d, s) = fixture("1.0.0", "v1");
        // A sentinel left for a version we are NOT (e.g. we are the
        // rolled-back last-good).
        s.write_sentinel(&Sentinel {
            version: "2.0.0".into(),
            attempts: 9,
        });
        assert_eq!(s.check_on_boot(3), BootDecision::Proceed);
        assert!(!s.sentinel_path.exists());
    }

    #[test]
    fn corrupt_sentinel_is_discarded_and_boot_proceeds() {
        let (_d, s) = fixture("1.0.0", "v1");
        // A sentinel file that is not valid JSON must neither block the
        // boot nor linger for the next one.
        fs::write(&s.sentinel_path, b"{not json").unwrap();
        assert_eq!(s.check_on_boot(3), BootDecision::Proceed);
        assert!(!s.sentinel_path.exists());
        assert!(!s.is_quarantined("1.0.0"));
    }

    #[test]
    fn confirm_healthy_seeds_last_good_on_first_boot() {
        // No swap in flight and no last-good yet: the first healthy
        // boot seeds the rollback target from the live exe.
        let (_d, s) = fixture("1.0.0", "v1");
        assert!(!s.last_good.exists());
        s.confirm_healthy().unwrap();
        assert_eq!(read(&s.last_good), "v1");
        assert!(!s.sentinel_path.exists());
    }

    #[test]
    fn confirm_healthy_without_pending_swap_keeps_last_good() {
        // A plain restart of an already-good binary must not churn an
        // existing last-good.
        let (_d, s) = fixture("1.0.0", "v1");
        fs::write(&s.last_good, "older-good").unwrap();
        s.confirm_healthy().unwrap();
        assert_eq!(read(&s.last_good), "older-good");
    }

    #[test]
    fn quarantine_clear_roundtrip() {
        let (_d, s) = fixture("1.0.0", "v1");
        s.quarantine("2.0.0");
        s.quarantine("2.0.1");
        assert!(s.is_quarantined("2.0.0"));
        assert!(s.is_quarantined("2.0.1"));
        s.clear_quarantine("2.0.0").unwrap();
        assert!(!s.is_quarantined("2.0.0"));
        assert!(s.is_quarantined("2.0.1"));
        assert_eq!(s.quarantined_versions(), vec!["2.0.1".to_string()]);
        // Clearing a version that isn't listed is a no-op.
        s.clear_quarantine("9.9.9").unwrap();
        assert_eq!(s.quarantined_versions(), vec!["2.0.1".to_string()]);
    }

    #[test]
    fn sentinel_and_quarantine_are_namespaced_per_role() {
        // A backend and an agent share one data_dir. Their sentinel and
        // quarantine files must NOT collide — otherwise one role's
        // confirm_healthy clears the other's pending sentinel, and one
        // role's quarantine masquerades as the other's.
        let dir = TempDir::new().unwrap();
        let be = BootSentinel::new(dir.path(), dir.path().join("kanade-backend.exe"), "1.0.0");
        let ag = BootSentinel::new(dir.path(), dir.path().join("kanade-agent.exe"), "1.0.0");
        assert_ne!(be.sentinel_path, ag.sentinel_path);
        assert_ne!(be.quarantine_path, ag.quarantine_path);

        // Backend arms + quarantines; none of it is visible to the agent.
        fs::write(&be.exe, "be").unwrap();
        be.arm_for_swap(&be.exe, "2.0.0").unwrap();
        be.quarantine("9.9.9");
        assert!(be.is_quarantined("9.9.9"));
        assert!(!ag.is_quarantined("9.9.9"));
        // The agent boots with no sentinel of its own — the backend's
        // pending swap must not make the agent count a phantom attempt.
        assert_eq!(ag.check_on_boot(3), BootDecision::Proceed);
        assert!(be.sentinel_path.exists()); // backend's sentinel survived
    }

    #[test]
    fn attempt_counter_persists_across_checks() {
        // Each check_on_boot simulates a separate boot of the same
        // crashing binary; the counter must accumulate via the file.
        let (dir, s) = fixture("1.0.0", "good");
        s.arm_for_swap(&s.exe, "2.0.0").unwrap();
        fs::write(&s.exe, "broken").unwrap();
        let mk = || BootSentinel::new(dir.path(), s.exe.clone(), "2.0.0");
        assert_eq!(mk().check_on_boot(2), BootDecision::Proceed); // 1
        assert_eq!(mk().check_on_boot(2), BootDecision::Proceed); // 2
        assert!(matches!(
            mk().check_on_boot(2),
            BootDecision::RolledBack { .. }
        )); // 3 > 2
    }

    #[test]
    fn role_ns_uses_file_stem_with_fallback() {
        assert_eq!(role_ns(Path::new("svc/kanade-backend.exe")), "kanade-backend");
        assert_eq!(role_ns(Path::new("kanade-agent")), "kanade-agent");
        // No file stem at all → shared fallback namespace.
        assert_eq!(role_ns(Path::new("")), "kanade");
    }

    #[test]
    fn sibling_appends_suffix_after_full_name() {
        assert_eq!(
            sibling(Path::new("kanade-agent.exe"), LAST_GOOD_SUFFIX),
            PathBuf::from("kanade-agent.exe.last-good")
        );
        assert_eq!(
            sibling(Path::new("kanade-agent.exe"), ROLLBACK_BAK_SUFFIX),
            PathBuf::from("kanade-agent.exe.rollback-bak")
        );
    }
}