kanade_shared/boot_sentinel.rs
1//! Boot sentinel: auto-rollback to a last-known-good binary when a
2//! freshly-swapped binary crash-loops on startup (#582).
3//!
4//! Both `kanade-backend` and `kanade-agent` are **self-replacing**
5//! Windows services: an update overwrites the running exe and the
6//! Service Control Manager restarts it. If the new binary crashes
7//! during early boot (exactly what the #573 JetStream regression did
8//! to the backend on 2026-06-11), nothing rolls it back — the SCM
9//! just restarts the same broken exe forever.
10//!
//! This module gates each boot. The swap step
//! [`BootSentinel::arm_for_swap`] writes a sentinel and snapshots the
//! outgoing (known-good) binary to `<exe>.last-good`. Every boot calls
//! [`BootSentinel::check_on_boot`] as the very first thing in `main()`
//! — before NATS, the DB, or any bootstrap that can fail — which
//! increments a persisted attempt counter and, once it crosses the
//! crash-loop threshold, restores `.last-good` over the live exe and
//! **quarantines** the failed version so the autonomous self-update
//! path won't immediately re-deploy it (which would loop
//! rollout↔rollback forever). [`BootSentinel::confirm_healthy`],
//! called once the process is genuinely up, promotes the running exe
//! to the new last-good and clears the sentinel.
22//!
23//! The attempt counter is persisted BEFORE the crashy code runs, so a
24//! hard crash still advances it: boot 1..N each bump the counter, and
25//! the boot that crosses the threshold rolls back, after which the SCM
26//! restarts into `.last-good`.
27//!
28//! ## Windows exe lock
29//!
30//! A running exe is locked on Windows (no overwrite), but a *rename*
31//! of the running exe IS allowed. So the rollback renames the live exe
32//! aside (`<exe>.rollback-bak`) and copies `.last-good` into place,
33//! then the caller exits so the SCM relaunches the restored binary.
34//! The same rename-then-replace works on Unix and in unit tests (where
35//! the "exe" is just a temp file), so the logic is testable everywhere.
36
37use std::fs;
38use std::io;
39use std::path::{Path, PathBuf};
40
41use serde::{Deserialize, Serialize};
42use tracing::{error, info, warn};
43
/// Name of the pending-swap marker file, stored inside the data dir.
const SENTINEL_FILE: &str = ".boot-sentinel.json";
/// Name of the quarantined-versions list, stored inside the data dir.
const QUARANTINE_FILE: &str = ".boot-quarantine.json";
/// Suffix appended to the exe path to name the last-known-good snapshot
/// (`<exe>.last-good`).
const LAST_GOOD_SUFFIX: &str = "last-good";
/// Suffix appended to the exe path to name the copy the failed binary is
/// renamed to during a rollback (`<exe>.rollback-bak`).
const ROLLBACK_BAK_SUFFIX: &str = "rollback-bak";
49
/// Default crash-loop budget for an unconfirmed swap.
///
/// The boot check lets an attempt through while `attempts <= max`, so
/// with a budget of `N` the boots numbered `1..=N` proceed and boot
/// `N + 1` performs the rollback. The default of 3 therefore grants a
/// freshly-swapped binary three tries to confirm healthy before the
/// fourth boot rolls it back: enough slack for a one-off transient
/// (slow disk, flaky first NATS connect) while still catching a binary
/// that is genuinely broken.
pub const DEFAULT_MAX_ATTEMPTS: u32 = 3;
57
58#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
59struct Sentinel {
60 /// The version that was swapped in and is awaiting confirmation.
61 version: String,
62 /// Boot attempts so far for that version (incremented before the
63 /// boot can crash).
64 attempts: u32,
65}
66
67#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)]
68struct Quarantine {
69 /// Versions that crash-looped on boot and were rolled back. The
70 /// self-update path refuses to swap to any version listed here.
71 versions: Vec<String>,
72}
73
/// Outcome of [`BootSentinel::check_on_boot`].
///
/// The value must be inspected: on [`BootDecision::RolledBack`] the
/// caller MUST exit (non-zero) so the service manager relaunches the
/// restored last-good binary. The type is `#[must_use]` so that
/// silently dropping the decision produces a compiler warning.
#[must_use = "on `RolledBack` the process must exit so the service manager relaunches last-good"]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BootDecision {
    /// No pending swap, or the swap is still within its attempt
    /// budget — continue booting normally.
    Proceed,
    /// The swapped-in binary crash-looped; `.last-good` has been
    /// restored over the live exe. Exit now and let the SCM relaunch.
    RolledBack {
        /// Version string of the binary that was rolled back (and
        /// quarantined).
        from: String,
    },
}
86
/// Per-role boot guard. Construct once at the top of `main()`.
///
/// Holds only paths and the running binary's version string; all state
/// lives in the files those paths point at, so the guard is cheap to
/// clone and safe to print in diagnostics.
#[derive(Debug, Clone)]
pub struct BootSentinel {
    /// Location of the pending-swap sentinel file (inside the data dir).
    sentinel_path: PathBuf,
    /// Location of the quarantined-versions file (inside the data dir).
    quarantine_path: PathBuf,
    /// Path of the live service binary.
    exe: PathBuf,
    /// Path of the last-known-good snapshot, `<exe>.last-good`.
    last_good: PathBuf,
    /// Version string of the binary this guard is running inside.
    version: String,
}
95
96impl BootSentinel {
97 /// `data_dir` holds the sentinel/quarantine state; `exe` is the
98 /// live binary path (`std::env::current_exe()` in production);
99 /// `version` is this binary's own version string.
100 pub fn new(data_dir: &Path, exe: PathBuf, version: impl Into<String>) -> Self {
101 let last_good = sibling(&exe, LAST_GOOD_SUFFIX);
102 Self {
103 sentinel_path: data_dir.join(SENTINEL_FILE),
104 quarantine_path: data_dir.join(QUARANTINE_FILE),
105 exe,
106 last_good,
107 version: version.into(),
108 }
109 }
110
111 /// Call FIRST in `main()`, before anything that can crash.
112 ///
113 /// - No sentinel → `Proceed`.
114 /// - Sentinel for a different version (we already rolled back, or
115 /// last-good is now live) → clear it, `Proceed`.
116 /// - Sentinel for THIS version → bump attempts; attempts
117 /// `1..=max_attempts` `Proceed`, and the first that EXCEEDS
118 /// `max_attempts` rolls back to `.last-good` + quarantines the
119 /// bad version and returns `RolledBack`.
120 pub fn check_on_boot(&self, max_attempts: u32) -> BootDecision {
121 let Some(mut sentinel) = self.read_sentinel() else {
122 return BootDecision::Proceed;
123 };
124 if sentinel.version != self.version {
125 // A different binary is running than the sentinel expected
126 // — the swap already resolved (rollback or a later update).
127 // Stale marker; drop it and boot normally.
128 let _ = fs::remove_file(&self.sentinel_path);
129 return BootDecision::Proceed;
130 }
131
132 sentinel.attempts += 1;
133 info!(
134 version = %self.version,
135 attempts = sentinel.attempts,
136 max = max_attempts,
137 "boot sentinel: unconfirmed swap, recording boot attempt",
138 );
139 // Persist the bumped count BEFORE returning so a crash later
140 // this boot still advances the counter.
141 self.write_sentinel(&sentinel);
142
143 if sentinel.attempts <= max_attempts {
144 return BootDecision::Proceed;
145 }
146
147 // Crash-loop confirmed → roll back.
148 match self.rollback() {
149 Ok(true) => {
150 self.quarantine(&self.version);
151 let _ = fs::remove_file(&self.sentinel_path);
152 error!(
153 version = %self.version,
154 attempts = sentinel.attempts,
155 "boot sentinel: crash-loop — rolled back to last-good and quarantined this version",
156 );
157 BootDecision::RolledBack {
158 from: self.version.clone(),
159 }
160 }
161 Ok(false) => {
162 // No last-good to roll back to (first install). We
163 // can't restore a binary, but still quarantine the bad
164 // version so that IF a good binary ever comes up it
165 // won't re-deploy this one — and so the self-update
166 // path's refusal is consistent. We keep Proceeding
167 // (nothing better to do than let it keep trying).
168 self.quarantine(&self.version);
169 error!(
170 version = %self.version,
171 "boot sentinel: crash-loop but no last-good binary to roll back to; \
172 quarantined the version and continuing (no rollback target)",
173 );
174 BootDecision::Proceed
175 }
176 Err(e) => {
177 error!(error = %e, "boot sentinel: rollback failed; continuing without it");
178 BootDecision::Proceed
179 }
180 }
181 }
182
183 /// Call once the process is confirmed healthy (backend: serving;
184 /// agent: NATS connected + first heartbeat). Promotes the live exe
185 /// to `.last-good` and clears the sentinel, so this version becomes
186 /// the rollback target for the next swap.
187 pub fn confirm_healthy(&self) -> io::Result<()> {
188 // Only promote/clear when there's an unconfirmed swap for THIS
189 // version — a plain restart of an already-good binary has no
190 // sentinel and shouldn't churn last-good.
191 let pending = matches!(self.read_sentinel(), Some(s) if s.version == self.version);
192 if pending {
193 if let Err(e) = fs::copy(&self.exe, &self.last_good) {
194 warn!(error = %e, "boot sentinel: could not promote exe to last-good");
195 } else {
196 info!(version = %self.version, "boot sentinel: confirmed healthy, promoted to last-good");
197 }
198 let _ = fs::remove_file(&self.sentinel_path);
199 } else if !self.last_good.exists() {
200 // First-ever healthy boot with no swap in flight: seed
201 // last-good so a future bad swap has something to fall
202 // back to.
203 if let Err(e) = fs::copy(&self.exe, &self.last_good) {
204 warn!(error = %e, "boot sentinel: could not seed last-good");
205 } else {
206 info!(version = %self.version, "boot sentinel: seeded initial last-good");
207 }
208 }
209 Ok(())
210 }
211
212 /// Call at swap time (deploy / self-update), before restarting into
213 /// the new binary. Snapshots the CURRENT (outgoing, known-good) exe
214 /// to `.last-good` and writes a fresh sentinel for `new_version` so
215 /// the next boot is gated.
216 ///
217 /// `current_exe` is the still-running good binary (copy it now,
218 /// before it's overwritten by the swap).
219 pub fn arm_for_swap(&self, current_exe: &Path, new_version: &str) -> io::Result<()> {
220 // The outgoing binary booted fine (it's running), so it's the
221 // rollback target.
222 fs::copy(current_exe, &self.last_good)?;
223 self.write_sentinel(&Sentinel {
224 version: new_version.to_string(),
225 attempts: 0,
226 });
227 info!(
228 new_version,
229 "boot sentinel: armed for swap (last-good snapshotted)"
230 );
231 Ok(())
232 }
233
234 /// True if `version` was rolled back after a failed boot. The
235 /// self-update path consults this before swapping so a bad rollout
236 /// target isn't re-attempted in a loop.
237 pub fn is_quarantined(&self, version: &str) -> bool {
238 self.read_quarantine().versions.iter().any(|v| v == version)
239 }
240
241 /// Every quarantined version (#582 Phase 2). The agent reports
242 /// these in its heartbeat so the SPA rollout view can flag which
243 /// PCs failed to adopt a target.
244 pub fn quarantined_versions(&self) -> Vec<String> {
245 self.read_quarantine().versions
246 }
247
248 /// Drop `version` from quarantine (operator re-published a fixed
249 /// binary under the same version string).
250 pub fn clear_quarantine(&self, version: &str) -> io::Result<()> {
251 let mut q = self.read_quarantine();
252 let before = q.versions.len();
253 q.versions.retain(|v| v != version);
254 if q.versions.len() != before {
255 self.write_quarantine(&q);
256 }
257 Ok(())
258 }
259
260 // ── internals ────────────────────────────────────────────────
261
262 /// Rename the live exe aside and copy `.last-good` into its place.
263 /// Returns `Ok(false)` when there's no last-good to restore.
264 fn rollback(&self) -> io::Result<bool> {
265 if !self.last_good.exists() {
266 return Ok(false);
267 }
268 let bak = sibling(&self.exe, ROLLBACK_BAK_SUFFIX);
269 // Best-effort: a leftover .rollback-bak from a prior cycle
270 // would block the rename.
271 let _ = fs::remove_file(&bak);
272 // Rename of a running/locked exe is permitted on Windows; the
273 // copy then lands a fresh file at the exe path.
274 fs::rename(&self.exe, &bak)?;
275 if let Err(e) = fs::copy(&self.last_good, &self.exe) {
276 // The exe path is now EMPTY (renamed to .rollback-bak,
277 // copy failed). Put the original back so the next SCM
278 // restart isn't left with no binary at all — mirroring the
279 // compensating rollback in self_update::swap_and_restart.
280 match fs::rename(&bak, &self.exe) {
281 Ok(()) => warn!(
282 error = %e,
283 "boot sentinel: last-good copy failed; restored the original exe in place",
284 ),
285 Err(restore_err) => error!(
286 error = %e,
287 restore_error = %restore_err,
288 exe = ?self.exe,
289 backup = ?bak,
290 "boot sentinel: last-good copy failed AND restore failed — service binary path \
291 is EMPTY; manual repair required (rename the .rollback-bak file back)",
292 ),
293 }
294 return Err(e);
295 }
296 Ok(true)
297 }
298
299 fn quarantine(&self, version: &str) {
300 let mut q = self.read_quarantine();
301 if !q.versions.iter().any(|v| v == version) {
302 q.versions.push(version.to_string());
303 self.write_quarantine(&q);
304 }
305 }
306
307 fn read_sentinel(&self) -> Option<Sentinel> {
308 let bytes = fs::read(&self.sentinel_path).ok()?;
309 match serde_json::from_slice(&bytes) {
310 Ok(s) => Some(s),
311 Err(e) => {
312 warn!(error = %e, "boot sentinel: corrupt sentinel, ignoring");
313 let _ = fs::remove_file(&self.sentinel_path);
314 None
315 }
316 }
317 }
318
319 fn write_sentinel(&self, s: &Sentinel) {
320 match serde_json::to_vec(s) {
321 Ok(bytes) => {
322 if let Err(e) = atomic_write(&self.sentinel_path, &bytes) {
323 warn!(error = %e, "boot sentinel: write sentinel failed");
324 }
325 }
326 Err(e) => warn!(error = %e, "boot sentinel: encode sentinel failed"),
327 }
328 }
329
330 fn read_quarantine(&self) -> Quarantine {
331 fs::read(&self.quarantine_path)
332 .ok()
333 .and_then(|b| serde_json::from_slice(&b).ok())
334 .unwrap_or_default()
335 }
336
337 fn write_quarantine(&self, q: &Quarantine) {
338 match serde_json::to_vec(q) {
339 Ok(bytes) => {
340 if let Err(e) = atomic_write(&self.quarantine_path, &bytes) {
341 warn!(error = %e, "boot sentinel: write quarantine failed");
342 }
343 }
344 Err(e) => warn!(error = %e, "boot sentinel: encode quarantine failed"),
345 }
346 }
347}
348
/// Returns `path` with `.` + `suffix` appended to its final component,
/// i.e. `<path>.<suffix>` (`kanade-agent.exe` becomes
/// `kanade-agent.exe.last-good`). The existing extension is kept, not
/// replaced, and the path is handled as an `OsString` so non-UTF-8
/// paths pass through unchanged.
fn sibling(path: &Path, suffix: &str) -> PathBuf {
    let mut joined = path.to_path_buf().into_os_string();
    for piece in [".", suffix] {
        joined.push(piece);
    }
    joined.into()
}
356
357/// Write via a temp file + rename so a crash mid-write never leaves a
358/// torn sentinel/quarantine the next boot would misread. Creates the
359/// parent dir first — on a clean install the data dir may not exist
360/// yet when the first swap arms the sentinel.
361fn atomic_write(path: &Path, bytes: &[u8]) -> io::Result<()> {
362 if let Some(parent) = path.parent() {
363 fs::create_dir_all(parent)?;
364 }
365 let tmp = sibling(path, "tmp");
366 fs::write(&tmp, bytes)?;
367 fs::rename(&tmp, path)
368}
369
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// File name of the fake service binary each test drops into its
    /// sandbox directory.
    const FAKE_EXE: &str = "kanade-agent.exe";

    /// Creates a sandbox dir holding a fake "exe" whose contents are
    /// `body`, and a guard for `version` that uses the sandbox as both
    /// data dir and exe dir. The `TempDir` must be kept alive by the
    /// caller for as long as the guard is used.
    fn fixture(version: &str, body: &str) -> (TempDir, BootSentinel) {
        let sandbox = TempDir::new().unwrap();
        let exe_path = sandbox.path().join(FAKE_EXE);
        fs::write(&exe_path, body).unwrap();
        let guard = BootSentinel::new(sandbox.path(), exe_path, version);
        (sandbox, guard)
    }

    /// Simulates the swap itself: overwrites the fake exe with `body`
    /// and returns the guard the new binary (`version`) would build
    /// over the same data dir and exe path as `previous`.
    fn swapped_in(previous: &BootSentinel, version: &str, body: &str) -> BootSentinel {
        fs::write(&previous.exe, body).unwrap();
        let data_dir = previous.sentinel_path.parent().unwrap();
        BootSentinel::new(data_dir, previous.exe.clone(), version)
    }

    /// Whole-file contents as a string.
    fn contents(file: &Path) -> String {
        fs::read_to_string(file).unwrap()
    }

    #[test]
    fn no_sentinel_proceeds() {
        let (_sandbox, guard) = fixture("1.0.0", "v1");
        assert_eq!(guard.check_on_boot(3), BootDecision::Proceed);
    }

    #[test]
    fn arm_snapshots_last_good_and_writes_sentinel() {
        let (_sandbox, guard) = fixture("1.0.0", "v1-good");
        // Arm with the current (good) exe, as the swap step would just
        // before putting a new binary in place.
        guard.arm_for_swap(&guard.exe, "2.0.0").unwrap();
        assert_eq!(contents(&guard.last_good), "v1-good");
        assert!(guard.sentinel_path.exists());
    }

    #[test]
    fn healthy_swap_confirms_and_promotes() {
        let (_sandbox, outgoing) = fixture("1.0.0", "v1-good");
        outgoing.arm_for_swap(&outgoing.exe, "2.0.0").unwrap();
        // The 2.0.0 binary boots and reaches its healthy point.
        let incoming = swapped_in(&outgoing, "2.0.0", "v2");
        assert_eq!(incoming.check_on_boot(3), BootDecision::Proceed);
        incoming.confirm_healthy().unwrap();
        // The new binary is now the rollback target, nothing is
        // pending, and nothing was quarantined.
        assert_eq!(contents(&incoming.last_good), "v2");
        assert!(!incoming.sentinel_path.exists());
        assert!(!incoming.is_quarantined("2.0.0"));
    }

    #[test]
    fn crash_loop_rolls_back_and_quarantines() {
        let (_sandbox, outgoing) = fixture("1.0.0", "v1-good");
        outgoing.arm_for_swap(&outgoing.exe, "2.0.0").unwrap();
        // The 2.0.0 binary is live and never confirms healthy.
        let broken = swapped_in(&outgoing, "2.0.0", "v2-broken");

        // Boots 1, 2 and 3 are inside the budget (each would then crash).
        for _boot in 1..=3 {
            assert_eq!(broken.check_on_boot(3), BootDecision::Proceed);
        }
        // Boot 4 exceeds the budget and triggers the rollback.
        let expected = BootDecision::RolledBack {
            from: "2.0.0".into(),
        };
        assert_eq!(broken.check_on_boot(3), expected);

        // The good binary is back at the exe path, the bad version is
        // quarantined, and the sentinel is gone.
        assert_eq!(contents(&broken.exe), "v1-good");
        assert!(broken.is_quarantined("2.0.0"));
        assert!(!broken.sentinel_path.exists());
    }

    #[test]
    fn rollback_without_last_good_proceeds_but_quarantines() {
        // A sentinel already past the budget, but no arm step ever ran,
        // so there is no last-good snapshot to restore.
        let (_sandbox, guard) = fixture("2.0.0", "v2-broken");
        let exhausted = Sentinel {
            version: "2.0.0".into(),
            attempts: 5,
        };
        guard.write_sentinel(&exhausted);
        // Nothing can be restored, so the boot proceeds, but the
        // version is still quarantined so a future good binary won't
        // re-deploy it.
        assert_eq!(guard.check_on_boot(3), BootDecision::Proceed);
        assert!(guard.is_quarantined("2.0.0"));
    }

    #[test]
    fn stale_sentinel_for_other_version_is_cleared() {
        let (_sandbox, guard) = fixture("1.0.0", "v1");
        // The marker names a version this binary is NOT (e.g. this is
        // the restored last-good running after a rollback).
        let leftover = Sentinel {
            version: "2.0.0".into(),
            attempts: 9,
        };
        guard.write_sentinel(&leftover);
        assert_eq!(guard.check_on_boot(3), BootDecision::Proceed);
        assert!(!guard.sentinel_path.exists());
    }

    #[test]
    fn quarantine_clear_roundtrip() {
        let (_sandbox, guard) = fixture("1.0.0", "v1");
        for bad in ["2.0.0", "2.0.1"] {
            guard.quarantine(bad);
        }
        assert!(guard.is_quarantined("2.0.0"));
        assert!(guard.is_quarantined("2.0.1"));
        // Clearing one entry must leave the other in place.
        guard.clear_quarantine("2.0.0").unwrap();
        assert!(!guard.is_quarantined("2.0.0"));
        assert!(guard.is_quarantined("2.0.1"));
    }

    #[test]
    fn attempt_counter_persists_across_checks() {
        // Every boot builds a brand-new guard, so the only way the
        // count can accumulate is through the sentinel file.
        let (_sandbox, outgoing) = fixture("1.0.0", "good");
        outgoing.arm_for_swap(&outgoing.exe, "2.0.0").unwrap();
        fs::write(&outgoing.exe, "broken").unwrap();
        let data_dir = outgoing.sentinel_path.parent().unwrap().to_path_buf();
        let reboot =
            || BootSentinel::new(&data_dir, outgoing.exe.clone(), "2.0.0").check_on_boot(2);

        assert_eq!(reboot(), BootDecision::Proceed); // attempt 1
        assert_eq!(reboot(), BootDecision::Proceed); // attempt 2
        // attempt 3 exceeds the budget of 2
        assert!(matches!(reboot(), BootDecision::RolledBack { .. }));
    }
}