use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use serde::{Deserialize, Serialize};
use tracing::{error, info, warn};
/// File-name prefix of the sentinel state file; the full name is
/// `<prefix>-<role>.json` inside the data directory.
const SENTINEL_PREFIX: &str = ".boot-sentinel";
/// File-name prefix of the quarantine list file; the full name is
/// `<prefix>-<role>.json` inside the data directory.
const QUARANTINE_PREFIX: &str = ".boot-quarantine";
/// Suffix appended (after a `.`) to the executable path to name the
/// last-known-good copy used as the rollback source.
const LAST_GOOD_SUFFIX: &str = "last-good";
/// Suffix appended (after a `.`) to the executable path to name the backup the
/// current executable is renamed to while a rollback is in progress.
const ROLLBACK_BAK_SUFFIX: &str = "rollback-bak";
/// Derives the per-role namespace used in state-file names from the
/// executable path.
///
/// Returns the file stem of `exe` (lossily converted to UTF-8), or `"kanade"`
/// when the path has no file stem.
fn role_ns(exe: &Path) -> String {
    match exe.file_stem() {
        Some(stem) => stem.to_string_lossy().into_owned(),
        None => String::from("kanade"),
    }
}
/// Default number of unconfirmed boot attempts tolerated before rolling back.
///
/// NOTE(review): not referenced in the code visible here; presumably callers
/// pass it as `max_attempts` to `BootSentinel::check_on_boot` — confirm.
pub const DEFAULT_MAX_ATTEMPTS: u32 = 3;
/// JSON-serialized record of a swap that has not yet been confirmed healthy.
///
/// Written with `attempts: 0` when a swap is armed and rewritten with an
/// incremented counter on every boot check of the same version.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct Sentinel {
/// Version that was swapped in and is still under trial.
version: String,
/// Number of boot checks recorded for `version` since the swap was armed.
attempts: u32,
}
/// JSON-serialized list of versions that were quarantined after a crash loop.
///
/// The `Default` value (an empty list) is what readers fall back to when the
/// file is missing or cannot be parsed.
#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)]
struct Quarantine {
/// Quarantined version strings, in insertion order and without duplicates
/// when maintained through `BootSentinel::quarantine`.
versions: Vec<String>,
}
/// Outcome of `BootSentinel::check_on_boot`.
#[derive(Debug, PartialEq, Eq)]
pub enum BootDecision {
/// No rollback happened; the current executable keeps running.
Proceed,
/// The executable on disk was replaced with the last-good copy.
/// `from` is the version that was rolled back (the one that crash-looped).
RolledBack { from: String },
}
/// Crash-loop guard for a single executable.
///
/// Tracks boots of a freshly swapped-in version through a sentinel file, rolls
/// the executable back to a last-good copy when too many boots go unconfirmed,
/// and remembers the offending versions in a quarantine list.
pub struct BootSentinel {
/// JSON file holding the pending `Sentinel` (version under trial + attempts).
sentinel_path: PathBuf,
/// JSON file holding the `Quarantine` list.
quarantine_path: PathBuf,
/// Path of the executable being guarded.
exe: PathBuf,
/// Path of the last-known-good copy of `exe` (`<exe>.last-good`).
last_good: PathBuf,
/// Version string of the running build, compared against the sentinel.
version: String,
}
impl BootSentinel {
pub fn new(data_dir: &Path, exe: PathBuf, version: impl Into<String>) -> Self {
let last_good = sibling(&exe, LAST_GOOD_SUFFIX);
let role = role_ns(&exe);
Self {
sentinel_path: data_dir.join(format!("{SENTINEL_PREFIX}-{role}.json")),
quarantine_path: data_dir.join(format!("{QUARANTINE_PREFIX}-{role}.json")),
exe,
last_good,
version: version.into(),
}
}
pub fn check_on_boot(&self, max_attempts: u32) -> BootDecision {
let Some(mut sentinel) = self.read_sentinel() else {
return BootDecision::Proceed;
};
if sentinel.version != self.version {
let _ = fs::remove_file(&self.sentinel_path);
return BootDecision::Proceed;
}
sentinel.attempts += 1;
info!(
version = %self.version,
attempts = sentinel.attempts,
max = max_attempts,
"boot sentinel: unconfirmed swap, recording boot attempt",
);
self.write_sentinel(&sentinel);
if sentinel.attempts <= max_attempts {
return BootDecision::Proceed;
}
match self.rollback() {
Ok(true) => {
self.quarantine(&self.version);
let _ = fs::remove_file(&self.sentinel_path);
error!(
version = %self.version,
attempts = sentinel.attempts,
"boot sentinel: crash-loop — rolled back to last-good and quarantined this version",
);
BootDecision::RolledBack {
from: self.version.clone(),
}
}
Ok(false) => {
self.quarantine(&self.version);
error!(
version = %self.version,
"boot sentinel: crash-loop but no last-good binary to roll back to; \
quarantined the version and continuing (no rollback target)",
);
BootDecision::Proceed
}
Err(e) => {
error!(error = %e, "boot sentinel: rollback failed; continuing without it");
BootDecision::Proceed
}
}
}
pub fn confirm_healthy(&self) -> io::Result<()> {
let pending = matches!(self.read_sentinel(), Some(s) if s.version == self.version);
if pending {
if let Err(e) = fs::copy(&self.exe, &self.last_good) {
warn!(error = %e, "boot sentinel: could not promote exe to last-good");
} else {
info!(version = %self.version, "boot sentinel: confirmed healthy, promoted to last-good");
}
let _ = fs::remove_file(&self.sentinel_path);
} else if !self.last_good.exists() {
if let Err(e) = fs::copy(&self.exe, &self.last_good) {
warn!(error = %e, "boot sentinel: could not seed last-good");
} else {
info!(version = %self.version, "boot sentinel: seeded initial last-good");
}
}
Ok(())
}
pub fn arm_for_swap(&self, current_exe: &Path, new_version: &str) -> io::Result<()> {
fs::copy(current_exe, &self.last_good)?;
self.write_sentinel(&Sentinel {
version: new_version.to_string(),
attempts: 0,
});
info!(
new_version,
"boot sentinel: armed for swap (last-good snapshotted)"
);
Ok(())
}
pub fn is_quarantined(&self, version: &str) -> bool {
self.read_quarantine().versions.iter().any(|v| v == version)
}
pub fn quarantined_versions(&self) -> Vec<String> {
self.read_quarantine().versions
}
pub fn clear_quarantine(&self, version: &str) -> io::Result<()> {
let mut q = self.read_quarantine();
let before = q.versions.len();
q.versions.retain(|v| v != version);
if q.versions.len() != before {
self.write_quarantine(&q);
}
Ok(())
}
fn rollback(&self) -> io::Result<bool> {
if !self.last_good.exists() {
return Ok(false);
}
let bak = sibling(&self.exe, ROLLBACK_BAK_SUFFIX);
let _ = fs::remove_file(&bak);
fs::rename(&self.exe, &bak)?;
if let Err(e) = fs::copy(&self.last_good, &self.exe) {
match fs::rename(&bak, &self.exe) {
Ok(()) => warn!(
error = %e,
"boot sentinel: last-good copy failed; restored the original exe in place",
),
Err(restore_err) => error!(
error = %e,
restore_error = %restore_err,
exe = ?self.exe,
backup = ?bak,
"boot sentinel: last-good copy failed AND restore failed — service binary path \
is EMPTY; manual repair required (rename the .rollback-bak file back)",
),
}
return Err(e);
}
Ok(true)
}
fn quarantine(&self, version: &str) {
let mut q = self.read_quarantine();
if !q.versions.iter().any(|v| v == version) {
q.versions.push(version.to_string());
self.write_quarantine(&q);
}
}
fn read_sentinel(&self) -> Option<Sentinel> {
let bytes = fs::read(&self.sentinel_path).ok()?;
match serde_json::from_slice(&bytes) {
Ok(s) => Some(s),
Err(e) => {
warn!(error = %e, "boot sentinel: corrupt sentinel, ignoring");
let _ = fs::remove_file(&self.sentinel_path);
None
}
}
}
fn write_sentinel(&self, s: &Sentinel) {
match serde_json::to_vec(s) {
Ok(bytes) => {
if let Err(e) = atomic_write(&self.sentinel_path, &bytes) {
warn!(error = %e, "boot sentinel: write sentinel failed");
}
}
Err(e) => warn!(error = %e, "boot sentinel: encode sentinel failed"),
}
}
fn read_quarantine(&self) -> Quarantine {
fs::read(&self.quarantine_path)
.ok()
.and_then(|b| serde_json::from_slice(&b).ok())
.unwrap_or_default()
}
fn write_quarantine(&self, q: &Quarantine) {
match serde_json::to_vec(q) {
Ok(bytes) => {
if let Err(e) = atomic_write(&self.quarantine_path, &bytes) {
warn!(error = %e, "boot sentinel: write quarantine failed");
}
}
Err(e) => warn!(error = %e, "boot sentinel: encode quarantine failed"),
}
}
}
/// Returns `path` with `.<suffix>` appended to its full textual form.
///
/// Unlike `Path::with_extension`, the existing extension is kept:
/// `app.exe` + `"tmp"` becomes `app.exe.tmp`.
fn sibling(path: &Path, suffix: &str) -> PathBuf {
    let mut raw = path.to_path_buf().into_os_string();
    raw.push(format!(".{suffix}"));
    raw.into()
}
/// Writes `bytes` to `path` by staging them in `<path>.tmp` and renaming the
/// staged file over the target.
///
/// Missing parent directories are created first. The staged file is flushed to
/// stable storage (`sync_all`) before the rename, so the target is not replaced
/// by a file whose contents have not reached the disk yet. The parent directory
/// itself is not synced. If staging or the rename fails, the temporary file is
/// removed (best effort) and the original error is returned.
fn atomic_write(path: &Path, bytes: &[u8]) -> io::Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }
    let tmp = sibling(path, "tmp");
    let staged = (|| {
        let mut file = fs::File::create(&tmp)?;
        io::Write::write_all(&mut file, bytes)?;
        file.sync_all()
    })();
    let outcome = staged.and_then(|()| fs::rename(&tmp, path));
    if outcome.is_err() {
        // Do not leave a stray temp file behind when the write did not land.
        let _ = fs::remove_file(&tmp);
    }
    outcome
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates a temp dir containing a fake `kanade-agent.exe` whose content is
    /// `body`, plus a `BootSentinel` for `version` that keeps its state there.
    /// The `TempDir` is returned so the directory outlives the test body.
    fn fixture(version: &str, body: &str) -> (TempDir, BootSentinel) {
        let tmp = TempDir::new().unwrap();
        let exe_path = tmp.path().join("kanade-agent.exe");
        fs::write(&exe_path, body).unwrap();
        let guard = BootSentinel::new(tmp.path(), exe_path, version);
        (tmp, guard)
    }

    /// Reads a file as UTF-8 text, panicking on any failure.
    fn read(p: &Path) -> String {
        String::from_utf8(fs::read(p).unwrap()).unwrap()
    }

    #[test]
    fn no_sentinel_proceeds() {
        let (_tmp, guard) = fixture("1.0.0", "v1");
        assert_eq!(guard.check_on_boot(3), BootDecision::Proceed);
    }

    #[test]
    fn arm_snapshots_last_good_and_writes_sentinel() {
        let (_tmp, guard) = fixture("1.0.0", "v1-good");
        guard.arm_for_swap(&guard.exe, "2.0.0").unwrap();
        assert_eq!(read(&guard.last_good), "v1-good");
        assert!(guard.sentinel_path.exists());
    }

    #[test]
    fn healthy_swap_confirms_and_promotes() {
        let (tmp, old) = fixture("1.0.0", "v1-good");
        old.arm_for_swap(&old.exe, "2.0.0").unwrap();
        // Simulate the swap: the executable now holds the new build.
        fs::write(&old.exe, "v2").unwrap();

        let new = BootSentinel::new(tmp.path(), old.exe.clone(), "2.0.0");
        assert_eq!(new.check_on_boot(3), BootDecision::Proceed);
        new.confirm_healthy().unwrap();

        assert_eq!(read(&new.last_good), "v2");
        assert!(!new.sentinel_path.exists());
        assert!(!new.is_quarantined("2.0.0"));
    }

    #[test]
    fn crash_loop_rolls_back_and_quarantines() {
        let (tmp, old) = fixture("1.0.0", "v1-good");
        old.arm_for_swap(&old.exe, "2.0.0").unwrap();
        fs::write(&old.exe, "v2-broken").unwrap();

        let bad = BootSentinel::new(tmp.path(), old.exe.clone(), "2.0.0");
        // The first three unconfirmed boots are tolerated...
        for _ in 0..3 {
            assert_eq!(bad.check_on_boot(3), BootDecision::Proceed);
        }
        // ...and the fourth one triggers the rollback.
        let expected = BootDecision::RolledBack {
            from: "2.0.0".into(),
        };
        assert_eq!(bad.check_on_boot(3), expected);

        assert_eq!(read(&bad.exe), "v1-good");
        assert!(bad.is_quarantined("2.0.0"));
        assert!(!bad.sentinel_path.exists());
    }

    #[test]
    fn rollback_without_last_good_proceeds_but_quarantines() {
        let (_tmp, guard) = fixture("2.0.0", "v2-broken");
        let exhausted = Sentinel {
            version: "2.0.0".into(),
            attempts: 5,
        };
        guard.write_sentinel(&exhausted);
        assert_eq!(guard.check_on_boot(3), BootDecision::Proceed);
        assert!(guard.is_quarantined("2.0.0"));
    }

    #[test]
    fn stale_sentinel_for_other_version_is_cleared() {
        let (_tmp, guard) = fixture("1.0.0", "v1");
        let stale = Sentinel {
            version: "2.0.0".into(),
            attempts: 9,
        };
        guard.write_sentinel(&stale);
        assert_eq!(guard.check_on_boot(3), BootDecision::Proceed);
        assert!(!guard.sentinel_path.exists());
    }

    #[test]
    fn quarantine_clear_roundtrip() {
        let (_tmp, guard) = fixture("1.0.0", "v1");
        for version in ["2.0.0", "2.0.1"] {
            guard.quarantine(version);
        }
        assert!(guard.is_quarantined("2.0.0"));
        assert!(guard.is_quarantined("2.0.1"));

        guard.clear_quarantine("2.0.0").unwrap();
        assert!(!guard.is_quarantined("2.0.0"));
        assert!(guard.is_quarantined("2.0.1"));
    }

    #[test]
    fn sentinel_and_quarantine_are_namespaced_per_role() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        let backend = BootSentinel::new(root, root.join("kanade-backend.exe"), "1.0.0");
        let agent = BootSentinel::new(root, root.join("kanade-agent.exe"), "1.0.0");
        assert_ne!(backend.sentinel_path, agent.sentinel_path);
        assert_ne!(backend.quarantine_path, agent.quarantine_path);

        // State written for the backend must not leak into the agent's view.
        fs::write(&backend.exe, "be").unwrap();
        backend.arm_for_swap(&backend.exe, "2.0.0").unwrap();
        backend.quarantine("9.9.9");
        assert!(backend.is_quarantined("9.9.9"));
        assert!(!agent.is_quarantined("9.9.9"));
        assert_eq!(agent.check_on_boot(3), BootDecision::Proceed);
        assert!(backend.sentinel_path.exists());
    }

    #[test]
    fn attempt_counter_persists_across_checks() {
        let (tmp, old) = fixture("1.0.0", "good");
        old.arm_for_swap(&old.exe, "2.0.0").unwrap();
        fs::write(&old.exe, "broken").unwrap();

        // A fresh instance per boot proves the counter lives on disk, not in memory.
        let boot = || BootSentinel::new(tmp.path(), old.exe.clone(), "2.0.0").check_on_boot(2);
        for _ in 0..2 {
            assert_eq!(boot(), BootDecision::Proceed);
        }
        assert!(matches!(boot(), BootDecision::RolledBack { .. }));
    }
}