use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::thread;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use netsky_core::consts::{
AGENT0_CRASHLOOP_THRESHOLD_DEFAULT, AGENT0_CRASHLOOP_WINDOW_S_DEFAULT,
AGENT0_HANG_REPAGE_S_DEFAULT, AGENT0_HANG_S_DEFAULT, AGENT0_NAME, AGENTINFINITY_NAME,
AGENTINIT_THRESHOLD_DEFAULT, AGENTINIT_WINDOW_S_DEFAULT, CRASH_HANDOFF_FILENAME_PREFIX,
CRASH_HANDOFF_FILENAME_SUFFIX, DISK_MIN_MB_DEFAULT, ENV_AGENT0_CRASHLOOP_THRESHOLD,
ENV_AGENT0_CRASHLOOP_WINDOW_S, ENV_AGENT0_HANG_REPAGE_S, ENV_AGENT0_HANG_S,
ENV_AGENTINIT_THRESHOLD, ENV_AGENTINIT_WINDOW_S, ENV_DISK_MIN_MB, ENV_HANG_DETECT,
ENV_RESTART_ARCHIVE_TTL_S, NETSKY_BIN, RESTART_ARCHIVE_TTL_S_DEFAULT,
RESTART_DETACHED_LOG_FILENAME, RESTART_INFLIGHT_FILE, RESTART_INFLIGHT_STALE_S,
RESTART_PROCESSING_ARCHIVE_FILENAME_PREFIX, RESTART_PROCESSING_ARCHIVE_FILENAME_SUFFIX,
RESTART_PROCESSING_FILE, RESTART_PROCESSING_STALE_S, RESTART_REQUEST_FILE,
TICK_INTERVAL_CONFIG, TICK_LAST_MARKER, TICK_MIN_INTERVAL_S, TICKER_LOG_PATH, TICKER_SESSION,
WATCHDOG_LOCK_DIR, WATCHDOG_LOCK_PID_FILE, WATCHDOG_LOCK_STALE_S, WATCHDOG_RESTART_CLONE_COUNT,
WATCHDOG_RESTART_VERIFY_WINDOW_S, WATCHDOG_TICK_GAP_ESCALATE_S, WATCHDOG_TICK_GAP_WARN_S,
WATCHDOG_TICKER_MISSING_THRESHOLD,
};
use netsky_core::paths::{
agent0_crashloop_marker, agent0_hang_marker, agent0_hang_paged_marker, agent0_pane_hash_file,
agent0_quiet_sentinel_prefix, agent0_restart_attempts_file, agentinit_escalation_marker,
agentinit_failures_file, crash_handoffs_dir, ensure_state_dir, restart_archive_dir,
restart_detached_log_path, restart_status_dir, state_dir, watchdog_event_log_path,
};
use netsky_sh::tmux;
use sha2::{Digest, Sha256};
// Markdown template for crash-handoff files; the `{{ ts }}` placeholder is
// substituted with the crash timestamp by `write_crash_handoff_core`.
const CRASH_HANDOFF_TEMPLATE: &str = include_str!("../../prompts/crash-handoff.md");
/// Parsed view of the newest restart-status JSON file.
#[derive(Debug, Clone)]
struct RestartStatusSnapshot {
    // Path of the JSON file this snapshot was read from (used in messages).
    path: PathBuf,
    // Value of the "phase" field, e.g. "up-detected" or "errored".
    phase: String,
    // Optional "error" field from the JSON, when present.
    error: Option<String>,
}
/// Outcome of checking whether a now-dead detached restart actually revived agent0.
#[derive(Debug, PartialEq, Eq)]
enum RestartVerification {
    // Still inside the verification window; keep waiting.
    Pending,
    // agent0 is live and at least one post-revive tick interval elapsed.
    Verified,
    // Verification window elapsed without a verified live agent0; reason attached.
    Failed(String),
}
/// One full watchdog pass. Order is load-bearing:
/// 1. acquire the singleton lock (skip the pass if another instance holds it);
/// 2. run best-effort preflight sweeps (gap warning, disk, stale `.processing`,
///    quiet-sentinel reaping, one-shot legacy migrations, archive TTL sweep);
/// 3. bail out while a detached restart is in flight or the failed-revive
///    guard is armed;
/// 4. revive the agentinfinity / ticker sessions when missing;
/// 5. either service a claimed restart request, run the healthy-agent0 path,
///    or kick off crash recovery for a missing agent0.
pub fn tick() -> netsky_core::Result<()> {
    let ts = chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string();
    if !acquire_lock(&ts) {
        return Ok(());
    }
    // RAII: releases the lock dir on every exit path, including `?` errors.
    let _guard = LockGuard;
    // Preflights are best-effort hygiene; none of them aborts the tick.
    watchdog_tick_gap_preflight(&ts);
    disk_preflight(&ts);
    stale_processing_preflight(&ts);
    quiet_sentinel_reap_preflight(&ts);
    migrate_legacy_crash_handoffs_once(&ts);
    migrate_legacy_restart_archive_once(&ts);
    restart_archive_sweep_preflight(&ts);
    // A detached restart already owns recovery; do not start a second one.
    if restart_in_flight(&ts) {
        return Ok(());
    }
    // A failed revive was already escalated; wait for the owner instead of looping.
    if failed_revive_guard(&ts) {
        return Ok(());
    }
    if !tmux::session_is_alive(AGENTINFINITY_NAME) {
        return handle_agentinfinity_missing(&ts);
    }
    if !tmux::session_is_alive(TICKER_SESSION) {
        // Respawn the ticker only after N consecutive missing observations,
        // to tolerate transient tmux flakiness.
        let miss_count = crate::cmd::tick::ticker_missing_count().saturating_add(1);
        let _ = crate::cmd::tick::ticker_missing_record(miss_count);
        if miss_count >= WATCHDOG_TICKER_MISSING_THRESHOLD {
            log(&format!(
                "[watchdog-tick {ts}] ticker missing {miss_count} ticks; self-healing via netsky tick ticker-start"
            ));
            let event = format!(
                "[watchdog-event {ts}] ticker self-healed after {miss_count} consecutive miss(es)"
            );
            log(&event);
            append_watchdog_event(&event);
            match crate::cmd::tick::ticker_start() {
                Ok(()) => log(&format!("[watchdog-tick {ts}] ticker respawned")),
                Err(e) => log(&format!("[watchdog-tick {ts}] ticker respawn failed: {e}")),
            }
        } else {
            log(&format!(
                "[watchdog-tick {ts}] ticker session missing; miss {miss_count}/{WATCHDOG_TICKER_MISSING_THRESHOLD} before respawn"
            ));
            let event = format!(
                "[watchdog-event {ts}] ticker missing observed ({miss_count}/{WATCHDOG_TICKER_MISSING_THRESHOLD})"
            );
            log(&event);
            append_watchdog_event(&event);
        }
    }
    // An explicit restart request takes precedence over agent0 health checks.
    if let Some(handoff_path) = claim_restart_request(&ts) {
        spawn_restart_detached(&ts, &handoff_path, "planned");
        return Ok(());
    }
    if tmux::session_is_alive(AGENT0_NAME) {
        agent0_healthy_path(&ts)?;
        return Ok(());
    }
    // agent0 is down: count the attempt (crashloop bookkeeping), write a
    // handoff note for the revived agent, and restart in a detached process.
    log(&format!(
        "[watchdog-tick {ts}] agent0 missing; crash-recovery restart"
    ));
    crashloop_detection(&ts);
    let handoff = write_crash_handoff(&ts)?;
    spawn_restart_detached(&ts, &handoff, "crash");
    Ok(())
}
/// Launch `netsky restart` as a detached child so recovery survives this
/// watchdog process exiting, then record the child pid in the inflight
/// marker. `kind` ("planned" or "crash") is stored in the marker for later
/// diagnostics. Failure to open the log (and /dev/null) or to clone its fd
/// aborts the spawn; failure to write the marker is logged but the child
/// keeps running.
fn spawn_restart_detached(ts: &str, handoff: &Path, kind: &str) {
    use std::process::Stdio;
    let log_path = restart_detached_log_path();
    let _ = fs::create_dir_all(restart_archive_dir());
    // Child stdout/stderr are appended to the durable detached-restart log.
    let log_file = fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(&log_path);
    let log_file = match log_file {
        Ok(f) => f,
        Err(e) => {
            log(&format!(
                "[watchdog-tick {ts}] restart detach: log open failed ({e}); proceeding without log redirect"
            ));
            // Fallback: discard child output rather than inheriting our fds.
            match fs::File::create("/dev/null") {
                Ok(f) => f,
                Err(_) => {
                    log(&format!(
                        "[watchdog-tick {ts}] restart detach: /dev/null open failed; aborting spawn"
                    ));
                    return;
                }
            }
        }
    };
    // stdout and stderr need independent fds onto the same log target.
    let log_file2 = match log_file.try_clone() {
        Ok(f) => f,
        Err(e) => {
            log(&format!(
                "[watchdog-tick {ts}] restart detach: log fd clone failed ({e}); aborting spawn"
            ));
            return;
        }
    };
    let handoff_arg = handoff.to_string_lossy().into_owned();
    let spawn_result = Command::new(NETSKY_BIN)
        .args([
            "restart",
            &WATCHDOG_RESTART_CLONE_COUNT.to_string(),
            "--handoff",
            &handoff_arg,
        ])
        .stdin(Stdio::null())
        .stdout(Stdio::from(log_file))
        .stderr(Stdio::from(log_file2))
        .spawn();
    match spawn_result {
        Ok(child) => {
            let pid = child.id();
            // Intentionally do not wait: the child is detached and tracked
            // via the inflight marker instead of the process handle.
            drop(child);
            // Marker format: pid, spawn timestamp, kind — one per line.
            let body = format!("{pid}\n{ts}\n{kind}\n");
            if let Err(e) = fs::write(RESTART_INFLIGHT_FILE, body) {
                log(&format!(
                    "[watchdog-tick {ts}] restart detach: inflight marker write failed ({e}); pid={pid} still running"
                ));
            }
            log(&format!(
                "[watchdog-tick {ts}] restart detached ({kind}) pid={pid}; lock released"
            ));
        }
        Err(e) => {
            log(&format!(
                "[watchdog-tick {ts}] restart detach spawn failed ({kind}): {e}"
            ));
        }
    }
}
/// True when a detached restart process currently owns recovery.
/// Thin production wrapper around `restart_in_flight_at`, binding the real
/// marker path, staleness window, and process-liveness probe.
fn restart_in_flight(ts: &str) -> bool {
    let marker = Path::new(RESTART_INFLIGHT_FILE);
    restart_in_flight_at(marker, RESTART_INFLIGHT_STALE_S, ts, process_alive)
}
/// Core inflight check, parameterized for tests (`pid_alive` injectable).
/// Returns true when the tick body must be skipped because a restart still
/// owns recovery. Marker handling:
/// - stale marker (older than `stale_s`): removed, not inflight;
/// - recorded pid alive: inflight;
/// - pid dead: consult the latest restart-status and agent0 liveness — keep
///   waiting inside the verification window, remove the marker on verified
///   success, or remove it and record a failed revive (still blocks ticks);
/// - malformed marker: removed, not inflight.
fn restart_in_flight_at(marker: &Path, stale_s: u64, ts: &str, pid_alive: fn(i32) -> bool) -> bool {
    if !marker.exists() {
        return false;
    }
    if let Some(age) = age_seconds(marker)
        && age > stale_s
    {
        let _ = fs::remove_file(marker);
        log(&format!(
            "[watchdog-tick {ts}] stale restart-inflight marker removed (age {age}s > {stale_s}s)"
        ));
        return false;
    }
    // First line of the marker file is the detached restart's pid.
    let pid = fs::read_to_string(marker)
        .ok()
        .and_then(|s| s.lines().next().and_then(|l| l.trim().parse::<i32>().ok()));
    match pid {
        Some(pid) if pid_alive(pid) => {
            log(&format!(
                "[watchdog-tick {ts}] restart in flight (pid={pid}); skipping tick body"
            ));
            true
        }
        Some(pid) => {
            // Restart process died; decide success vs failure from the latest
            // restart-status file plus whether agent0 actually came up.
            let age = age_seconds(marker).unwrap_or(0);
            let status = latest_restart_status(&restart_status_dir());
            match restart_verification_state(
                age,
                tmux::session_is_alive(AGENT0_NAME),
                status.as_ref(),
            ) {
                RestartVerification::Pending => {
                    log(&format!(
                        "[watchdog-tick {ts}] restart-inflight pid={pid} dead; waiting for revive verification ({age}s/{WATCHDOG_RESTART_VERIFY_WINDOW_S}s)"
                    ));
                    true
                }
                RestartVerification::Verified => {
                    let _ = fs::remove_file(marker);
                    log(&format!(
                        "[watchdog-tick {ts}] restart verified: agent0 is live and a post-revive tick elapsed; marker removed"
                    ));
                    false
                }
                RestartVerification::Failed(reason) => {
                    let _ = fs::remove_file(marker);
                    record_failed_revive(ts, pid, age, &reason, status.as_ref());
                    // Still true: the failed-revive marker now blocks the tick body.
                    true
                }
            }
        }
        None => {
            let _ = fs::remove_file(marker);
            log(&format!(
                "[watchdog-tick {ts}] restart-inflight marker malformed; removed"
            ));
            false
        }
    }
}
/// Block the tick body while a failed-revive/crashloop marker exists and
/// agent0 is still down. A live agent0 disarms the guard (the healthy path
/// clears the marker). Returns true when the tick should be skipped.
fn failed_revive_guard(ts: &str) -> bool {
    let marker = agent0_crashloop_marker();
    // No marker, or agent0 came back on its own: guard disarmed.
    // (`||` short-circuits so tmux is only probed when the marker exists.)
    if !marker.exists() || tmux::session_is_alive(AGENT0_NAME) {
        return false;
    }
    log(&format!(
        "[watchdog-tick {ts}] failed-revive guard active; agent0 still missing (see {})",
        marker.display()
    ));
    true
}
/// Decide whether a dead detached restart ultimately succeeded.
/// - No status file: pending until the verification window elapses, then failed.
/// - Phase "up-detected" with agent0 alive: verified once at least one tick
///   interval has passed since the marker was written; pending before that.
/// - Anything else: pending inside the window, failed (with status detail)
///   once the window elapses.
fn restart_verification_state(
    marker_age_s: u64,
    agent0_alive: bool,
    status: Option<&RestartStatusSnapshot>,
) -> RestartVerification {
    let window_elapsed = marker_age_s >= WATCHDOG_RESTART_VERIFY_WINDOW_S;
    match status {
        None if window_elapsed => RestartVerification::Failed(
            "no restart-status file reported a live agent0".to_string(),
        ),
        None => RestartVerification::Pending,
        Some(s) if s.phase == "up-detected" && agent0_alive => {
            // Require one post-revive tick interval before declaring victory.
            if marker_age_s >= TICK_MIN_INTERVAL_S {
                RestartVerification::Verified
            } else {
                RestartVerification::Pending
            }
        }
        Some(s) if window_elapsed => RestartVerification::Failed(format!(
            "latest restart-status phase={} error={}",
            s.phase,
            s.error.as_deref().unwrap_or("(none)")
        )),
        Some(_) => RestartVerification::Pending,
    }
}
/// Find the most recently modified `*.json` restart-status file under `dir`
/// and parse its "phase" (required) and "error" (optional) fields.
/// Returns None when the dir is unreadable, empty of JSON files, or the
/// newest file fails to read/parse.
fn latest_restart_status(dir: &Path) -> Option<RestartStatusSnapshot> {
    let mut newest: Option<(SystemTime, PathBuf)> = None;
    for entry in fs::read_dir(dir).ok()?.flatten() {
        let path = entry.path();
        if path.extension().and_then(|s| s.to_str()) != Some("json") {
            continue;
        }
        // FIX: skip entries whose metadata cannot be read instead of aborting
        // the whole scan with `?` — one unreadable entry previously made the
        // function return None even when other valid status files existed
        // (now consistent with `latest_restart_error`).
        let Ok(md) = entry.metadata() else { continue };
        let mtime = md.modified().unwrap_or(UNIX_EPOCH);
        if newest.as_ref().is_none_or(|(prev, _)| mtime > *prev) {
            newest = Some((mtime, path));
        }
    }
    let (_, path) = newest?;
    let text = fs::read_to_string(&path).ok()?;
    let v: serde_json::Value = serde_json::from_str(&text).ok()?;
    let phase = v.get("phase").and_then(|x| x.as_str())?.to_string();
    let error = v
        .get("error")
        .and_then(|x| x.as_str())
        .map(|s| s.to_string());
    Some(RestartStatusSnapshot { path, phase, error })
}
/// Durably record a failed revive: write the crashloop marker (which arms
/// `failed_revive_guard`), append a watchdog event, and page the owner.
/// All writes are best-effort; the escalation result is logged either way.
fn record_failed_revive(
    ts: &str,
    pid: i32,
    age_s: u64,
    reason: &str,
    status: Option<&RestartStatusSnapshot>,
) {
    let marker = agent0_crashloop_marker();
    let status_detail = status
        .map(|s| format!("latest restart-status: {} ({})", s.phase, s.path.display()))
        .unwrap_or_else(|| "latest restart-status: none".to_string());
    // Human-readable marker body for the owner / revived agent to inspect.
    let body = format!(
        "agent0 failed-revive: restart pid {pid} stayed dead for {age_s}s without a verified live tmux session.\n\
        reason: {reason}.\n\
        {status_detail}.\n\
        written at {}.\n",
        chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ")
    );
    let _ = fs::write(&marker, body);
    let event =
        format!("[watchdog-event {ts}] failed-revive: pid={pid} age={age_s}s reason={reason}");
    log(&event);
    append_watchdog_event(&event);
    // Page the owner; `body` is shadowed here for the escalation payload.
    let subject = format!("agent0 failed-revive after {age_s}s");
    let body = format!(
        "restart pid {pid} died without a verified live agent0 tmux session within {window}s. \
        {reason}. see {marker} and {event_log}.",
        window = WATCHDOG_RESTART_VERIFY_WINDOW_S,
        marker = marker.display(),
        event_log = watchdog_event_log_path().display(),
    );
    match crate::cmd::escalate::run(&subject, Some(&body)) {
        Ok(()) => log(&format!(
            "[watchdog-tick {ts}] failed-revive escalation page sent"
        )),
        Err(e) => log(&format!(
            "[watchdog-tick {ts}] failed-revive escalate FAILED ({e})"
        )),
    }
}
/// Detect a stalled ticker by the age of the watchdog log file: past the
/// warn threshold, log + append a durable event; past the escalate
/// threshold, additionally page the owner. A missing/unreadable log file
/// is silently ignored.
fn watchdog_tick_gap_preflight(ts: &str) {
    let age = match age_seconds(Path::new(TICKER_LOG_PATH)) {
        Some(age) => age,
        None => return,
    };
    if age < WATCHDOG_TICK_GAP_WARN_S {
        return;
    }
    let line = format!(
        "[watchdog-event {ts}] ticker stopped: watchdog log gap {age}s (warn>{WATCHDOG_TICK_GAP_WARN_S}s)"
    );
    log(&line);
    append_watchdog_event(&line);
    if age > WATCHDOG_TICK_GAP_ESCALATE_S {
        let subject = format!("agent0 watchdog gap {age}s");
        let body = format!(
            "watchdog log has been stale for {age}s (warn {warn}s, escalate {esc}s). \
            ticker stopped or launchd stalled. see {} for the durable event.",
            watchdog_event_log_path().display(),
            warn = WATCHDOG_TICK_GAP_WARN_S,
            esc = WATCHDOG_TICK_GAP_ESCALATE_S,
        );
        match crate::cmd::escalate::run(&subject, Some(&body)) {
            Ok(()) => log(&format!(
                "[watchdog-tick {ts}] watchdog gap escalation page sent ({age}s)"
            )),
            Err(e) => log(&format!(
                "[watchdog-tick {ts}] watchdog gap escalate FAILED ({e})"
            )),
        }
    }
}
/// Append one line to the durable watchdog event log at its default path.
fn append_watchdog_event(line: &str) {
    let path = watchdog_event_log_path();
    append_watchdog_event_at(&path, line);
}
/// Best-effort append of `line` (newline-terminated) to the file at `path`,
/// creating parent directories and the file as needed. I/O errors are
/// deliberately swallowed: event logging must never break a tick.
fn append_watchdog_event_at(path: &Path, line: &str) {
    if let Some(dir) = path.parent() {
        let _ = fs::create_dir_all(dir);
    }
    let opened = fs::OpenOptions::new().create(true).append(true).open(path);
    if let Ok(mut file) = opened {
        let _ = writeln!(file, "{line}");
    }
}
/// RAII guard for the watchdog lock directory: dropping it removes the pid
/// file and then the lock dir itself, releasing the singleton lock on every
/// exit path from `tick` (errors ignored — best-effort cleanup).
struct LockGuard;
impl Drop for LockGuard {
    fn drop(&mut self) {
        let lock_dir = Path::new(WATCHDOG_LOCK_DIR);
        let pid_file = lock_dir.join(WATCHDOG_LOCK_PID_FILE);
        let _ = fs::remove_file(pid_file);
        let _ = fs::remove_dir(lock_dir);
    }
}
/// Take the watchdog singleton lock. A pre-existing lock is first
/// force-released when `should_force_release` deems its holder dead or the
/// dir stale. On success the current pid is recorded (best-effort) so a
/// later instance can detect a dead holder. Returns false (and logs) when
/// another live instance holds the lock.
fn acquire_lock(ts: &str) -> bool {
    if Path::new(WATCHDOG_LOCK_DIR).exists() && should_force_release(ts) {
        let lock = Path::new(WATCHDOG_LOCK_DIR);
        let _ = fs::remove_file(lock.join(WATCHDOG_LOCK_PID_FILE));
        let _ = fs::remove_dir(lock);
    }
    // `create_dir` is the atomic mutual-exclusion primitive: it fails when
    // the directory already exists.
    match fs::create_dir(WATCHDOG_LOCK_DIR) {
        Ok(()) => {
            let _ = fs::write(
                Path::new(WATCHDOG_LOCK_DIR).join(WATCHDOG_LOCK_PID_FILE),
                std::process::id().to_string(),
            );
            true
        }
        Err(_) => {
            log(&format!(
                "[watchdog-tick {ts}] tick skipped: another instance holds {WATCHDOG_LOCK_DIR}"
            ));
            false
        }
    }
}
/// Decide whether an existing lock dir may be reclaimed: yes when the
/// recorded pid is dead, or when no pid could be read and the dir is older
/// than the stale threshold. A live recorded pid always keeps the lock.
fn should_force_release(ts: &str) -> bool {
    let lock = Path::new(WATCHDOG_LOCK_DIR);
    let recorded_pid = fs::read_to_string(lock.join(WATCHDOG_LOCK_PID_FILE))
        .ok()
        .and_then(|s| s.trim().parse::<i32>().ok());
    match recorded_pid {
        Some(pid) if process_alive(pid) => false,
        Some(pid) => {
            log(&format!(
                "[watchdog-tick {ts}] lock held by dead pid {pid}; reclaiming"
            ));
            true
        }
        None => {
            // Legacy lock with no readable pid file: fall back to dir age.
            let age = age_seconds(lock).unwrap_or(0);
            if age > WATCHDOG_LOCK_STALE_S {
                log(&format!(
                    "[watchdog-tick {ts}] stale legacy lock removed (age {age}s, no pid file)"
                ));
                true
            } else {
                false
            }
        }
    }
}
/// Probe whether `pid` is a running process via `kill -0` (signal 0 performs
/// only the existence/permission check, sending nothing). Non-positive pids
/// are rejected up front so we never signal process groups or "any process".
fn process_alive(pid: i32) -> bool {
    if pid <= 0 {
        return false;
    }
    let probe = Command::new("kill")
        .args(["-0", &pid.to_string()])
        .stdout(std::process::Stdio::null())
        .stderr(std::process::Stdio::null())
        .status();
    matches!(probe, Ok(status) if status.success())
}
fn disk_preflight(ts: &str) {
let min_mb = std::env::var(ENV_DISK_MIN_MB)
.ok()
.and_then(|v| v.parse::<u64>().ok())
.unwrap_or(DISK_MIN_MB_DEFAULT);
let avail_kb = match Command::new("df").args(["-Pk", "/tmp"]).output() {
Ok(o) if o.status.success() => {
let out = String::from_utf8_lossy(&o.stdout);
out.lines()
.nth(1)
.and_then(|line| line.split_whitespace().nth(3))
.and_then(|s| s.parse::<u64>().ok())
}
_ => None,
};
if let Some(kb) = avail_kb {
let mb = kb / 1024;
if mb < min_mb {
log(&format!(
"[watchdog-tick {ts}] WARNING: /tmp free space {mb}MB < {min_mb}MB threshold"
));
}
}
}
/// Production wrapper: reap expired quiet-sentinel files in the real state dir.
fn quiet_sentinel_reap_preflight(ts: &str) {
    let dir = state_dir();
    quiet_sentinel_reap_at(&dir, unix_now(), Some(ts));
}
/// Delete every expired/malformed quiet-sentinel file under `dir`, returning
/// how many were removed. Individual failures are reported on stderr (when a
/// timestamp is supplied) but never stop the sweep.
fn quiet_sentinel_reap_at(dir: &Path, now_s: u64, ts: Option<&str>) -> usize {
    let mut removed = 0usize;
    for stale in &quiet_sentinel_status(dir, now_s).stale_paths {
        if let Err(e) = fs::remove_file(stale) {
            if let Some(ts) = ts {
                eprintln!(
                    "[watchdog-tick {ts}] quiet-sentinel reap failed for {}: {e}",
                    stale.display()
                );
            }
        } else {
            removed += 1;
        }
    }
    removed
}
/// Archive a `.processing` restart file that has sat untouched past the
/// staleness threshold (the claiming restart presumably died). Moves it into
/// the restart archive dir with a timestamped name, falling back to
/// copy+delete when rename fails; on any failure the file is left in place
/// for the next pass.
fn stale_processing_preflight(ts: &str) {
    let proc = Path::new(RESTART_PROCESSING_FILE);
    if !proc.exists() {
        return;
    }
    let age = match age_seconds(proc) {
        Some(a) => a,
        None => return,
    };
    if age < RESTART_PROCESSING_STALE_S {
        return;
    }
    let stamp = chrono::Utc::now().format("%Y%m%dT%H%M%SZ");
    let archive_dir = restart_archive_dir();
    if let Err(e) = fs::create_dir_all(&archive_dir) {
        log(&format!(
            "[watchdog-tick {ts}] stale .processing archive dir create failed ({e}); leaving in place"
        ));
        return;
    }
    let archive = archive_dir.join(format!(
        "{RESTART_PROCESSING_ARCHIVE_FILENAME_PREFIX}{stamp}{RESTART_PROCESSING_ARCHIVE_FILENAME_SUFFIX}"
    ));
    // Rename first; copy+remove covers the cross-filesystem case.
    let moved = fs::rename(proc, &archive).or_else(|_| {
        fs::copy(proc, &archive)
            .and_then(|_| fs::remove_file(proc))
            .map(|_| ())
    });
    match moved {
        Ok(()) => log(&format!(
            "[watchdog-tick {ts}] stale .processing (age {age}s) archived to {}",
            archive.display()
        )),
        Err(_) => log(&format!(
            "[watchdog-tick {ts}] stale .processing (age {age}s) archive failed; leaving in place"
        )),
    }
}
/// Respawn the agentinfinity session, wait up to ~10s for tmux to register
/// it, then run agentinit inside it. An agentinit failure is recorded for
/// windowed escalation and propagated as an error.
fn handle_agentinfinity_missing(ts: &str) -> netsky_core::Result<()> {
    log(&format!(
        "[watchdog-tick {ts}] agentinfinity missing; respawning"
    ));
    crate::cmd::agentinfinity::run()?;
    // Poll once per second, up to 10 tries, for the session to appear;
    // bail out on the final miss.
    for waited in 0..10 {
        if tmux::session_is_alive(AGENTINFINITY_NAME) {
            break;
        }
        if waited == 9 {
            log(&format!(
                "[watchdog-tick {ts}] agentinfinity respawn failed: tmux session did not register within 10s"
            ));
            netsky_core::bail!("agentinfinity did not register");
        }
        thread::sleep(Duration::from_secs(1));
    }
    match crate::cmd::agentinit::run(AGENTINFINITY_NAME) {
        Ok(()) => {
            log(&format!(
                "[watchdog-tick {ts}] agentinfinity respawned: agentinit ok"
            ));
            Ok(())
        }
        Err(e) => {
            log(&format!(
                "[watchdog-tick {ts}] agentinfinity respawned: agentinit failed ({e})"
            ));
            record_agentinit_failure(ts);
            netsky_core::bail!("agentinit failed: {e}")
        }
    }
}
/// Record one agentinit failure in the windowed counter and log the outcome:
/// an escalation (threshold reached within the window), silence (below
/// threshold), or a degraded-defense warning when either write fails.
/// Window/threshold are env-overridable.
fn record_agentinit_failure(ts: &str) {
    if let Err(e) = ensure_state_dir() {
        log(&format!(
            "[watchdog-tick {ts}] agentinit failure recorder: ensure_state_dir failed: {e}"
        ));
        return;
    }
    let window = env_or(ENV_AGENTINIT_WINDOW_S, AGENTINIT_WINDOW_S_DEFAULT);
    let threshold = env_or(ENV_AGENTINIT_THRESHOLD, AGENTINIT_THRESHOLD_DEFAULT);
    let outcome = record_agentinit_failure_core(
        &agentinit_failures_file(),
        &agentinit_escalation_marker(),
        unix_now(),
        window,
        threshold,
    );
    match outcome {
        Ok(FailureOutcome { count, escalated }) if escalated => log(&format!(
            "[watchdog-tick {ts}] agentinit ESCALATION: {count} failures within {window}s — wrote {}",
            agentinit_escalation_marker().display()
        )),
        // Below threshold (or marker already present): nothing to report.
        Ok(_) => {}
        Err(FailureRecordError::Counter(e)) => log(&format!(
            "[watchdog-tick {ts}] agentinit failure counter write FAILED ({e}); escalation defense-in-depth degraded"
        )),
        Err(FailureRecordError::Escalation(e)) => log(&format!(
            "[watchdog-tick {ts}] agentinit escalation marker write FAILED ({e}); owner will not see the alert on next /up"
        )),
    }
}
/// Result of recording one agentinit failure.
#[derive(Debug)]
struct FailureOutcome {
    // Failures within the sliding window, including the one just recorded.
    count: u64,
    // True when this call wrote the escalation marker.
    escalated: bool,
}
/// Which write failed while recording an agentinit failure.
#[derive(Debug)]
enum FailureRecordError {
    // Writing the failure-timestamp counter file failed.
    Counter(std::io::Error),
    // Writing the escalation marker failed.
    Escalation(std::io::Error),
}
/// Append `now_s` to the failure-timestamp file (one unix second per line),
/// prune entries older than `window_s`, and — when the in-window count
/// reaches `threshold` and no escalation marker exists yet — write the
/// escalation marker. Returns the in-window count and whether an escalation
/// was written; a missing/unreadable counter file counts as empty history.
fn record_agentinit_failure_core(
    failures_file: &Path,
    escalation_marker: &Path,
    now_s: u64,
    window_s: u64,
    threshold: u64,
) -> std::result::Result<FailureOutcome, FailureRecordError> {
    let history = fs::read_to_string(failures_file).unwrap_or_default();
    let cutoff = now_s.saturating_sub(window_s);
    let mut times: Vec<u64> = history
        .lines()
        .filter_map(|l| l.trim().parse::<u64>().ok())
        .collect();
    times.push(now_s);
    times.retain(|&t| t >= cutoff);
    // Persist the pruned window (always non-empty: contains now_s).
    let serialized: String = times.iter().map(|t| format!("{t}\n")).collect();
    fs::write(failures_file, serialized).map_err(FailureRecordError::Counter)?;
    let count = times.len() as u64;
    let should_escalate = count >= threshold && !escalation_marker.exists();
    if should_escalate {
        let listing = times
            .iter()
            .map(u64::to_string)
            .collect::<Vec<_>>()
            .join("\n");
        let contents = format!(
            "agentinit failed {count} times within {window_s}s window.\n\
            escalation written at {}.\n\
            window contents (unix ts per failure):\n{listing}\n",
            chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ"),
        );
        fs::write(escalation_marker, contents).map_err(FailureRecordError::Escalation)?;
    }
    Ok(FailureOutcome {
        count,
        escalated: should_escalate,
    })
}
/// Atomically claim a pending restart request by renaming request →
/// processing. Returns the processing path on success; None when there is
/// no request file, it is empty, or the rename fails (logged).
fn claim_restart_request(ts: &str) -> Option<PathBuf> {
    let request = Path::new(RESTART_REQUEST_FILE);
    // A missing or zero-length request file is not a real request.
    if fs::metadata(request).ok()?.len() == 0 {
        return None;
    }
    let processing = Path::new(RESTART_PROCESSING_FILE);
    if let Err(e) = fs::rename(request, processing) {
        log(&format!(
            "[watchdog-tick {ts}] restart-request mv failed: {e}"
        ));
        return None;
    }
    Some(processing.to_path_buf())
}
/// Maintenance while agent0 is alive: clear crashloop state, optionally run
/// hang detection (ENV_HANG_DETECT != "0"), then drop a periodic tick
/// request when the configured interval has elapsed. A missing or invalid
/// interval config disables ticks for this pass.
fn agent0_healthy_path(ts: &str) -> netsky_core::Result<()> {
    crashloop_clear_on_healthy(ts);
    // Hang detection defaults on; only the literal "0" disables it.
    let hang_detect = std::env::var(ENV_HANG_DETECT)
        .ok()
        .map(|v| v != "0")
        .unwrap_or(true);
    if hang_detect {
        hang_detection(ts);
    }
    let cfg = Path::new(TICK_INTERVAL_CONFIG);
    if !cfg.exists() {
        log(&format!(
            "[watchdog-tick {ts}] agent0 healthy; ticks disabled (no config)"
        ));
        return Ok(());
    }
    // Interval must parse and be at least the floor; otherwise ignore it.
    let interval = match fs::read_to_string(cfg)
        .ok()
        .and_then(|s| s.trim().parse::<u64>().ok())
    {
        Some(s) if s >= TICK_MIN_INTERVAL_S => s,
        _ => {
            log(&format!(
                "[watchdog-tick {ts}] agent0 healthy; tick config invalid (ignoring)"
            ));
            return Ok(());
        }
    };
    let now_s = unix_now();
    // Reconstruct last-tick time from the marker's age; a missing marker
    // reads as 0 ("long ago"), forcing an immediate tick request.
    let last_s = age_seconds(Path::new(TICK_LAST_MARKER))
        .map(|age| now_s.saturating_sub(age))
        .unwrap_or(0);
    if now_s.saturating_sub(last_s) >= interval {
        match crate::cmd::tick::request() {
            Ok(()) => {
                // Advance the marker only after a successful request so a
                // failed request is retried on the next pass.
                touch(Path::new(TICK_LAST_MARKER));
                log(&format!(
                    "[watchdog-tick {ts}] agent0 healthy; tick request dropped (interval {interval}s)"
                ));
            }
            Err(e) => log(&format!(
                "[watchdog-tick {ts}] agent0 healthy; tick request FAILED ({e}); marker not advanced"
            )),
        }
    } else {
        let elapsed = now_s.saturating_sub(last_s);
        log(&format!(
            "[watchdog-tick {ts}] agent0 healthy; tick fresh ({elapsed}s of {interval}s)"
        ));
    }
    Ok(())
}
/// Count this restart attempt in the sliding window and, when the threshold
/// is hit for the first time (marker written by the core helper), log a
/// crashloop-suspected event and page the owner with the latest
/// restart-status error for context. Counter/marker write failures degrade
/// to a logged warning.
fn crashloop_detection(ts: &str) {
    if ensure_state_dir().is_err() {
        return;
    }
    let window = env_or(
        ENV_AGENT0_CRASHLOOP_WINDOW_S,
        AGENT0_CRASHLOOP_WINDOW_S_DEFAULT,
    );
    let threshold = env_or(
        ENV_AGENT0_CRASHLOOP_THRESHOLD,
        AGENT0_CRASHLOOP_THRESHOLD_DEFAULT,
    );
    let attempts = agent0_restart_attempts_file();
    let marker = agent0_crashloop_marker();
    let now_s = unix_now();
    let last_error = latest_restart_error(&restart_status_dir());
    let outcome = record_crash_attempt_core(
        &attempts,
        &marker,
        now_s,
        window,
        threshold,
        last_error.as_deref(),
    );
    let (count, escalated) = match outcome {
        Ok(o) => (o.count, o.escalated),
        Err(e) => {
            log(&format!(
                "[watchdog-tick {ts}] crashloop counter write FAILED ({e}); escalation defense-in-depth degraded"
            ));
            return;
        }
    };
    log(&format!(
        "[watchdog-tick {ts}] crashloop counter: {count} attempts in window ({window}s)"
    ));
    if !escalated {
        return;
    }
    log(&format!(
        "[watchdog-tick {ts}] CRASHLOOP-SUSPECTED: {count} restart attempts within {window}s — wrote {}",
        marker.display()
    ));
    let last = last_error
        .as_deref()
        .unwrap_or("no restart-status file found (detached restart died before writing status)");
    let subject = format!("agent0 crashloop-suspected ({count} restart attempts)");
    let body = format!(
        "watchdog has seen {count} agent0 restart attempts within {window}s \
        with no successful recovery. last restart-status error: {last}. \
        marker at {marker}. run `netsky doctor` for full state.",
        marker = marker.display()
    );
    match crate::cmd::escalate::run(&subject, Some(&body)) {
        Ok(()) => log(&format!(
            "[watchdog-tick {ts}] crashloop escalation page sent (floor)"
        )),
        Err(e) => log(&format!(
            "[watchdog-tick {ts}] escalate FAILED ({e}); crashloop page not delivered"
        )),
    }
}
/// Remove crashloop bookkeeping (attempt counter, then suspected marker)
/// once agent0 is observed healthy, logging only when something was
/// actually present to clear.
fn crashloop_clear_on_healthy(ts: &str) {
    let mut cleared = false;
    for stale in [agent0_restart_attempts_file(), agent0_crashloop_marker()] {
        if stale.exists() {
            let _ = fs::remove_file(&stale);
            cleared = true;
        }
    }
    if cleared {
        log(&format!(
            "[watchdog-tick {ts}] agent0 recovered; cleared crashloop state"
        ));
    }
}
/// Result of recording one agent0 restart attempt.
#[derive(Debug)]
struct CrashloopOutcome {
    // Attempts within the sliding window, including the one just recorded.
    count: u64,
    // True when this call wrote the crashloop-suspected marker.
    escalated: bool,
}
/// Record one agent0 restart attempt at `now_s` in the attempts file (one
/// unix second per line), prune attempts older than `window_s`, and — when
/// the in-window count reaches `threshold` and no crashloop marker exists
/// yet — write the marker including the `last_error` context. Both files are
/// written atomically. Returns the in-window count and escalation flag.
fn record_crash_attempt_core(
    attempts_file: &Path,
    marker: &Path,
    now_s: u64,
    window_s: u64,
    threshold: u64,
    last_error: Option<&str>,
) -> std::io::Result<CrashloopOutcome> {
    let history = fs::read_to_string(attempts_file).unwrap_or_default();
    let cutoff = now_s.saturating_sub(window_s);
    let mut times: Vec<u64> = history
        .lines()
        .filter_map(|l| l.trim().parse::<u64>().ok())
        .collect();
    times.push(now_s);
    times.retain(|&t| t >= cutoff);
    // Persist the pruned window (always non-empty: contains now_s).
    let serialized: String = times.iter().map(|t| format!("{t}\n")).collect();
    atomic_write_string(attempts_file, &serialized)?;
    let count = times.len() as u64;
    let escalate = count >= threshold && !marker.exists();
    if escalate {
        let listing = times
            .iter()
            .map(u64::to_string)
            .collect::<Vec<_>>()
            .join("\n");
        let contents = format!(
            "agent0 crashloop-suspected: {count} restart attempts within {window_s}s.\n\
            marker written at {}.\n\
            last restart-status error: {}.\n\
            window contents (unix ts per attempt):\n{listing}\n",
            chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ"),
            last_error.unwrap_or("(no restart-status file found)"),
        );
        atomic_write_string(marker, &contents)?;
    }
    Ok(CrashloopOutcome {
        count,
        escalated: escalate,
    })
}
/// Write `content` to `path` atomically: write a sibling temp file, then
/// `rename` it into place, so readers never observe a partially-written file.
///
/// The temp file lives in the same directory as `path` (rename is only
/// atomic within one filesystem) and is named `<filename>.tmp.<pid>` so
/// writes of different targets (and different processes) use distinct paths.
///
/// # Errors
/// Returns `InvalidInput` when `path` has no valid UTF-8 filename, or any
/// I/O error from the write/rename.
fn atomic_write_string(path: &Path, content: &str) -> std::io::Result<()> {
    let filename = path.file_name().and_then(|n| n.to_str()).ok_or_else(|| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            format!(
                "atomic_write_string: path has no valid filename: {}",
                path.display()
            ),
        )
    })?;
    // BUG FIX: the temp name previously hard-coded a "(unknown)" literal and
    // never used `filename`, so concurrent atomic writes to DIFFERENT files
    // in the same directory shared one temp path and could corrupt each other.
    let tmp_name = format!("{filename}.tmp.{}", std::process::id());
    let parent = path.parent().unwrap_or_else(|| Path::new("."));
    let tmp = parent.join(&tmp_name);
    fs::write(&tmp, content)?;
    fs::rename(&tmp, path)
}
/// Scan `dir` for the most recently modified restart-status `*.json` and
/// return its "error" string, but only when its phase is "errored".
/// Any read/parse problem — or a non-errored phase — yields None.
fn latest_restart_error(dir: &Path) -> Option<String> {
    let mut newest: Option<(SystemTime, PathBuf)> = None;
    for entry in fs::read_dir(dir).ok()?.flatten() {
        let path = entry.path();
        if path.extension().and_then(|s| s.to_str()) != Some("json") {
            continue;
        }
        // Entries with unreadable metadata are skipped, not fatal.
        let Ok(md) = entry.metadata() else { continue };
        let modified = md.modified().unwrap_or(UNIX_EPOCH);
        if newest.as_ref().is_none_or(|(prev, _)| modified > *prev) {
            newest = Some((modified, path));
        }
    }
    let (_, winner) = newest?;
    let text = fs::read_to_string(&winner).ok()?;
    let parsed: serde_json::Value = serde_json::from_str(&text).ok()?;
    // Only an "errored" status carries a meaningful error message.
    if parsed.get("phase").and_then(|p| p.as_str())? != "errored" {
        return None;
    }
    parsed
        .get("error")
        .and_then(|e| e.as_str())
        .map(str::to_owned)
}
/// B3 hang detection: hash the agent0 tmux pane and compare with the last
/// observation. If the pane stays byte-identical for at least the hang
/// threshold, write a hang-suspected marker and (rate-limited via
/// `maybe_page`) page the owner — unless an active quiet sentinel suppresses
/// it. Any pane progress records the new hash and clears marker state.
fn hang_detection(ts: &str) {
    let _ = ensure_state_dir();
    // If the pane cannot be captured at all, skip silently this tick.
    let current_hash = match tmux::capture_pane(AGENT0_NAME, None) {
        Ok(pane) => sha256_hex(&pane),
        Err(_) => return,
    };
    let now_s = unix_now();
    let hash_file = agent0_pane_hash_file();
    // Previous observation: line 1 = hash, line 2 = unix ts when first seen.
    let prev = fs::read_to_string(&hash_file).ok().and_then(|s| {
        let mut lines = s.lines();
        let h = lines.next()?.to_string();
        let t = lines.next()?.parse::<u64>().ok()?;
        Some((h, t))
    });
    let hang_threshold = env_or(ENV_AGENT0_HANG_S, AGENT0_HANG_S_DEFAULT);
    let repage_threshold = env_or(ENV_AGENT0_HANG_REPAGE_S, AGENT0_HANG_REPAGE_S_DEFAULT);
    match prev {
        // Pane unchanged: the stored timestamp keeps marking when this hash
        // was FIRST seen, so stable_age grows across ticks.
        Some((prev_hash, prev_ts)) if prev_hash == current_hash => {
            let stable_age = now_s.saturating_sub(prev_ts);
            let marker = agent0_hang_marker();
            if stable_age >= hang_threshold && !marker.exists() {
                // An owner-set quiet sentinel suppresses hang alerts until expiry.
                let status = quiet_sentinel_status(&state_dir(), now_s);
                if let Some(expiry) = status.max_future_epoch {
                    let remaining = expiry.saturating_sub(now_s);
                    log(&format!(
                        "[watchdog-tick {ts}] hang-suspected suppressed by quiet sentinel; {remaining}s remaining (expiry epoch {expiry})"
                    ));
                    return;
                }
                let body = format!(
                    "agent0 pane hash stable for {stable_age}s (threshold {hang_threshold}s).\n\
                    tmux session is alive but no pane output progress observed.\n\
                    may be genuinely thinking, or may be hung. owner decision: investigate.\n\
                    written by watchdog-tick at {}.\n",
                    chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ"),
                );
                let _ = fs::write(&marker, body);
                log(&format!(
                    "[watchdog-tick {ts}] B3 HANG-SUSPECTED: agent0 pane stable {stable_age}s — wrote {}",
                    marker.display()
                ));
                maybe_page(ts, stable_age, repage_threshold);
            }
        }
        // Pane changed: record the new observation and clear hang state.
        Some(_) => {
            // FIX: this call (and the one below) were mojibake-corrupted in
            // the previous source ("¤t_hash" for "&current_hash") and did
            // not compile; restored the intended argument.
            write_hash_state(&hash_file, &current_hash, now_s);
            let marker = agent0_hang_marker();
            let paged = agent0_hang_paged_marker();
            if marker.exists() {
                let _ = fs::remove_file(&marker);
                let _ = fs::remove_file(&paged);
                log(&format!(
                    "[watchdog-tick {ts}] agent0 pane progressed; cleared hang-suspected + paged markers"
                ));
            }
        }
        // First observation ever (or unreadable state file): seed the state.
        None => {
            write_hash_state(&hash_file, &current_hash, now_s);
        }
    }
}
/// Aggregated view of quiet-sentinel files in the state dir.
#[derive(Debug, Default)]
struct QuietSentinelStatus {
    // Latest still-future expiry epoch among sentinels, if any
    // (i.e. suppression is active until this time).
    max_future_epoch: Option<u64>,
    // Sentinels that are expired or malformed and should be deleted.
    stale_paths: Vec<PathBuf>,
}
/// Classify quiet-sentinel files under `dir`. Sentinel names are
/// `<prefix><unix-epoch>`: suffixes parsing to a future epoch extend the
/// suppression window (the maximum wins); past epochs and unparseable
/// suffixes are reported as stale for reaping. An unreadable dir yields the
/// empty default.
fn quiet_sentinel_status(dir: &Path, now_s: u64) -> QuietSentinelStatus {
    let prefix = agent0_quiet_sentinel_prefix();
    let mut status = QuietSentinelStatus::default();
    let Ok(entries) = fs::read_dir(dir) else {
        return status;
    };
    for entry in entries.flatten() {
        let path = entry.path();
        let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
            continue;
        };
        let Some(suffix) = name.strip_prefix(prefix) else {
            continue;
        };
        match suffix.parse::<u64>() {
            Ok(epoch) if epoch > now_s => {
                let best = status.max_future_epoch.map_or(epoch, |e| e.max(epoch));
                status.max_future_epoch = Some(best);
            }
            _ => status.stale_paths.push(path),
        }
    }
    status
}
/// Page the owner about a suspected hang, rate-limited by the paged marker:
/// page when no marker exists, or when it is at least `repage_threshold`
/// seconds old (an unreadable age counts as "old enough"). The marker is
/// touched before sending so the re-page clock restarts either way.
fn maybe_page(ts: &str, stable_age: u64, repage_threshold: u64) {
    // Skip only when a marker exists AND is demonstrably fresh.
    let marker_fresh = {
        let paged = agent0_hang_paged_marker();
        paged.exists()
            && age_seconds(&paged)
                .map(|age| age < repage_threshold)
                .unwrap_or(false)
    };
    if marker_fresh {
        return;
    }
    touch(&agent0_hang_paged_marker());
    let subject = format!("agent0 hang-suspected, pane stable {stable_age}s");
    let outcome = crate::cmd::escalate::run(
        &subject,
        Some(
            "tmux session alive but no output progress. run `netsky doctor` for state. \
agentinfinity will follow up with diagnosis on next wake.",
        ),
    );
    match outcome {
        Ok(()) => log(&format!(
            "[watchdog-tick {ts}] escalation page sent (floor)"
        )),
        Err(e) => log(&format!(
            "[watchdog-tick {ts}] escalate FAILED ({e}); page not delivered"
        )),
    }
}
/// Persist a pane-hash observation as two lines: the hex hash, then the
/// unix timestamp when it was recorded. Best-effort write; errors ignored.
fn write_hash_state(path: &Path, hash: &str, ts_s: u64) {
    let _ = fs::write(path, format!("{hash}\n{ts_s}\n"));
}
/// Write a crash-handoff file for this process into the real handoffs dir.
fn write_crash_handoff(ts: &str) -> netsky_core::Result<PathBuf> {
    let dir = crash_handoffs_dir();
    write_crash_handoff_core(&dir, std::process::id(), ts)
}
/// Render the crash-handoff template (substituting `{{ ts }}`) into
/// `<dir>/<prefix><pid><suffix>`, creating `dir` first. Returns the path of
/// the written file.
fn write_crash_handoff_core(dir: &Path, pid: u32, ts: &str) -> netsky_core::Result<PathBuf> {
    fs::create_dir_all(dir)?;
    let filename = format!("{CRASH_HANDOFF_FILENAME_PREFIX}{pid}{CRASH_HANDOFF_FILENAME_SUFFIX}");
    let path = dir.join(filename);
    let rendered = CRASH_HANDOFF_TEMPLATE.replace("{{ ts }}", ts);
    fs::write(&path, rendered)?;
    Ok(path)
}
/// One-shot (per process) migration of legacy crash-handoff files from
/// $TMPDIR into the state dir; logs the moved count, or the first error.
fn migrate_legacy_crash_handoffs_once(ts: &str) {
    static ONCE: std::sync::OnceLock<()> = std::sync::OnceLock::new();
    ONCE.get_or_init(|| {
        let result =
            migrate_legacy_crash_handoffs_core(&std::env::temp_dir(), &crash_handoffs_dir());
        match result {
            Ok(0) => {}
            Ok(n) => log(&format!(
                "[watchdog-tick {ts}] migrated {n} legacy crash-handoff file(s) from $TMPDIR to state-dir"
            )),
            Err(e) => log(&format!(
                "[watchdog-tick {ts}] legacy crash-handoff migration error: {e}"
            )),
        }
    });
}
/// Move every non-directory entry in `src` whose name starts with the
/// crash-handoff prefix into `dest` (rename, falling back to copy+delete
/// for cross-filesystem moves). Returns the moved count; errs only when at
/// least one candidate existed and none could be moved (first error wins).
fn migrate_legacy_crash_handoffs_core(src: &Path, dest: &Path) -> std::io::Result<u64> {
    let Ok(entries) = fs::read_dir(src) else {
        return Ok(0);
    };
    let mut moved: u64 = 0;
    let mut first_err: Option<std::io::Error> = None;
    for entry in entries.flatten() {
        let path = entry.path();
        if path.is_dir() {
            continue;
        }
        let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
            continue;
        };
        if !name.starts_with(CRASH_HANDOFF_FILENAME_PREFIX) {
            continue;
        }
        if let Err(e) = fs::create_dir_all(dest) {
            first_err.get_or_insert(e);
            continue;
        }
        let target = dest.join(name);
        // Prefer an atomic rename; copy+remove handles cross-device moves.
        let outcome = fs::rename(&path, &target)
            .or_else(|_| fs::copy(&path, &target).and_then(|_| fs::remove_file(&path)));
        match outcome {
            Ok(()) => moved += 1,
            Err(e) => {
                first_err.get_or_insert(e);
            }
        }
    }
    if moved == 0 {
        if let Some(e) = first_err {
            return Err(e);
        }
    }
    Ok(moved)
}
/// One-shot (per process) migration of legacy restart-archive files from
/// /tmp into the state-dir archive; logs the moved count, or the first error.
fn migrate_legacy_restart_archive_once(ts: &str) {
    static ONCE: std::sync::OnceLock<()> = std::sync::OnceLock::new();
    ONCE.get_or_init(|| {
        let result =
            migrate_legacy_restart_archive_core(Path::new("/tmp"), &restart_archive_dir());
        match result {
            Ok(0) => {}
            Ok(n) => log(&format!(
                "[watchdog-tick {ts}] migrated {n} legacy restart-archive file(s) from /tmp to state-dir"
            )),
            Err(e) => log(&format!(
                "[watchdog-tick {ts}] legacy restart-archive migration error: {e}"
            )),
        }
    });
}
/// Move every non-directory entry in `src` matching
/// `is_legacy_restart_archive_name` into `dest` (rename, falling back to
/// copy+delete for cross-filesystem moves). Returns the moved count; errs
/// only when at least one candidate existed and none could be moved
/// (first error wins).
fn migrate_legacy_restart_archive_core(src: &Path, dest: &Path) -> std::io::Result<u64> {
    let Ok(entries) = fs::read_dir(src) else {
        return Ok(0);
    };
    let mut moved: u64 = 0;
    let mut first_err: Option<std::io::Error> = None;
    for entry in entries.flatten() {
        let path = entry.path();
        if path.is_dir() {
            continue;
        }
        let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
            continue;
        };
        if !is_legacy_restart_archive_name(name) {
            continue;
        }
        if let Err(e) = fs::create_dir_all(dest) {
            first_err.get_or_insert(e);
            continue;
        }
        let target = dest.join(name);
        // Prefer an atomic rename; copy+remove handles cross-device moves.
        let outcome = fs::rename(&path, &target)
            .or_else(|_| fs::copy(&path, &target).and_then(|_| fs::remove_file(&path)));
        match outcome {
            Ok(()) => moved += 1,
            Err(e) => {
                first_err.get_or_insert(e);
            }
        }
    }
    if moved == 0 {
        if let Some(e) = first_err {
            return Err(e);
        }
    }
    Ok(moved)
}
/// A legacy /tmp restart-archive artifact is either the detached-restart log
/// itself or an archived stale `.processing` file (prefix + suffix match).
fn is_legacy_restart_archive_name(name: &str) -> bool {
    name == RESTART_DETACHED_LOG_FILENAME
        || (name.starts_with(RESTART_PROCESSING_ARCHIVE_FILENAME_PREFIX)
            && name.ends_with(RESTART_PROCESSING_ARCHIVE_FILENAME_SUFFIX))
}
/// Reap expired restart-archive files using the configured (or default) TTL.
fn restart_archive_sweep_preflight(_ts: &str) {
    let ttl_s = env_or(ENV_RESTART_ARCHIVE_TTL_S, RESTART_ARCHIVE_TTL_S_DEFAULT);
    restart_archive_sweep_at(&restart_archive_dir(), unix_now(), ttl_s);
}
/// Remove files under `dir` whose mtime is older than `ttl_s` seconds
/// relative to `now_s`, returning how many were deleted.
///
/// Skips subdirectories, the live detached restart log (never swept,
/// regardless of age), and entries whose metadata cannot be read.
/// A missing or unreadable `dir` yields 0.
fn restart_archive_sweep_at(dir: &Path, now_s: u64, ttl_s: u64) -> u64 {
    let entries = match fs::read_dir(dir) {
        Ok(e) => e,
        Err(_) => return 0,
    };
    let mut removed: u64 = 0;
    for entry in entries.flatten() {
        let path = entry.path();
        if path.is_dir() {
            continue;
        }
        let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
            continue;
        };
        if name == RESTART_DETACHED_LOG_FILENAME {
            continue;
        }
        // Unreadable metadata/mtime -> skip; a pre-epoch or unrepresentable
        // mtime collapses to 0, which makes the file maximally old.
        let mtime_s = entry
            .metadata()
            .ok()
            .and_then(|md| md.modified().ok())
            .map(|t| t.duration_since(UNIX_EPOCH).map(|d| d.as_secs()).unwrap_or(0));
        let Some(mtime_s) = mtime_s else { continue };
        if now_s.saturating_sub(mtime_s) > ttl_s && fs::remove_file(&path).is_ok() {
            removed += 1;
        }
    }
    removed
}
/// Age of `path` in whole seconds, derived from its mtime.
///
/// Returns `None` when the file is missing, its metadata/mtime is
/// unreadable, or the mtime lies in the future relative to now.
fn age_seconds(path: &Path) -> Option<u64> {
    let modified = fs::metadata(path).ok()?.modified().ok()?;
    SystemTime::now()
        .duration_since(modified)
        .ok()
        .map(|d| d.as_secs())
}
/// Current wall-clock time as seconds since the Unix epoch; a clock set
/// before the epoch collapses to 0 instead of panicking.
fn unix_now() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
/// Read `key` from the environment as a `u64`, falling back to `default`
/// when the variable is unset, non-UTF-8, or fails to parse.
fn env_or(key: &str, default: u64) -> u64 {
    match std::env::var(key) {
        Ok(raw) => raw.parse::<u64>().unwrap_or(default),
        Err(_) => default,
    }
}
/// Create `path` as an empty file, truncating any existing content.
///
/// NOTE: despite the name this is not POSIX `touch` — an existing file is
/// truncated to zero bytes rather than merely having its timestamp bumped
/// (truncating on open also refreshes the mtime, which is what the marker
/// machinery relies on). Errors are deliberately ignored: marker files are
/// best-effort.
fn touch(path: &Path) {
    // create+write+truncate alone produces the empty file; the previous
    // `write_all(b"")` wrote zero bytes and was a no-op, so it is dropped.
    let _ = fs::OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open(path);
}
/// Lowercase hex encoding of the SHA-256 digest of `s`.
fn sha256_hex(s: &str) -> String {
    // One-shot digest; equivalent to new() + update() + finalize().
    format!("{:x}", Sha256::digest(s.as_bytes()))
}
/// Emit one watchdog log line to stdout.
fn log(msg: &str) {
    println!("{}", msg);
}
#[cfg(test)]
mod tests {
    //! Unit tests for the watchdog helpers: crash-handoff migration,
    //! agentinit/crashloop escalation counters, restart in-flight markers,
    //! quiet-mode sentinels, restart-archive migration/sweeping, and the
    //! atomic-write primitive. Tests that need old mtimes shell out to
    //! `touch -t` rather than mocking the filesystem.
    use super::*;
    use tempfile::tempdir;

    #[test]
    fn process_alive_reports_self_alive() {
        let pid = std::process::id() as i32;
        assert!(process_alive(pid));
    }

    #[test]
    fn write_crash_handoff_writes_under_state_dir() {
        let dir = tempdir().unwrap();
        let out = write_crash_handoff_core(dir.path(), 4242, "2026-01-01T00:00:00Z").unwrap();
        assert!(out.starts_with(dir.path()), "must land under state dir");
        let fname = out.file_name().unwrap().to_str().unwrap();
        assert!(
            fname.starts_with(CRASH_HANDOFF_FILENAME_PREFIX)
                && fname.ends_with(CRASH_HANDOFF_FILENAME_SUFFIX)
                && fname.contains("4242"),
            "unexpected filename {fname}"
        );
        let body = std::fs::read_to_string(&out).unwrap();
        assert!(body.contains("2026-01-01T00:00:00Z"));
    }

    #[test]
    fn migrate_legacy_crash_handoffs_moves_matching_files() {
        let dir = tempdir().unwrap();
        let src = dir.path().join("src");
        let dest = dir.path().join("dest");
        std::fs::create_dir_all(&src).unwrap();
        let a = src.join(format!(
            "{CRASH_HANDOFF_FILENAME_PREFIX}111{CRASH_HANDOFF_FILENAME_SUFFIX}"
        ));
        let b = src.join(format!(
            "{CRASH_HANDOFF_FILENAME_PREFIX}222{CRASH_HANDOFF_FILENAME_SUFFIX}"
        ));
        let unrelated = src.join("other.txt");
        std::fs::write(&a, "a").unwrap();
        std::fs::write(&b, "b").unwrap();
        std::fs::write(&unrelated, "keep").unwrap();
        let n = migrate_legacy_crash_handoffs_core(&src, &dest).unwrap();
        assert_eq!(n, 2);
        assert!(!a.exists() && !b.exists(), "sources should be gone");
        assert!(dest.join(a.file_name().unwrap()).exists());
        assert!(dest.join(b.file_name().unwrap()).exists());
        assert!(unrelated.exists(), "unrelated file must be untouched");
        // Second pass must be a no-op.
        let n2 = migrate_legacy_crash_handoffs_core(&src, &dest).unwrap();
        assert_eq!(n2, 0);
    }

    #[test]
    fn migrate_legacy_crash_handoffs_tolerates_missing_src() {
        let dir = tempdir().unwrap();
        let src = dir.path().join("does-not-exist");
        let dest = dir.path().join("dest");
        let n = migrate_legacy_crash_handoffs_core(&src, &dest).unwrap();
        assert_eq!(n, 0);
    }

    #[test]
    fn migrate_legacy_crash_handoffs_catches_bash_mktemp_names() {
        let dir = tempdir().unwrap();
        let src = dir.path().join("src");
        let dest = dir.path().join("dest");
        std::fs::create_dir_all(&src).unwrap();
        // mktemp-style suffix from the old bash implementation.
        let legacy = src.join(format!("{CRASH_HANDOFF_FILENAME_PREFIX}XXXXXX.5wOTF6HZQG"));
        std::fs::write(&legacy, "legacy").unwrap();
        let n = migrate_legacy_crash_handoffs_core(&src, &dest).unwrap();
        assert_eq!(n, 1);
        assert!(!legacy.exists());
        assert!(dest.join(legacy.file_name().unwrap()).exists());
    }

    #[test]
    fn process_alive_rejects_nonsense_pids() {
        assert!(!process_alive(0));
        assert!(!process_alive(-1));
        // i32::MAX exceeds any kernel pid_max, so it can never name a live
        // process. The previous 999_999 could collide with a real pid on
        // systems configured with a large pid_max, making the test flaky.
        assert!(!process_alive(i32::MAX));
    }

    #[test]
    fn agentinit_single_failure_does_not_escalate() {
        let dir = tempdir().unwrap();
        let fails = dir.path().join("fails");
        let mark = dir.path().join("marker");
        let out = record_agentinit_failure_core(&fails, &mark, 1000, 600, 3).unwrap();
        assert_eq!(out.count, 1);
        assert!(!out.escalated);
        assert!(!mark.exists());
        assert_eq!(std::fs::read_to_string(&fails).unwrap(), "1000\n");
    }

    #[test]
    fn agentinit_three_failures_within_window_escalate() {
        let dir = tempdir().unwrap();
        let fails = dir.path().join("fails");
        let mark = dir.path().join("marker");
        let _ = record_agentinit_failure_core(&fails, &mark, 1000, 600, 3).unwrap();
        let _ = record_agentinit_failure_core(&fails, &mark, 1100, 600, 3).unwrap();
        let out = record_agentinit_failure_core(&fails, &mark, 1200, 600, 3).unwrap();
        assert_eq!(out.count, 3);
        assert!(out.escalated);
        assert!(mark.exists());
        let marker_body = std::fs::read_to_string(&mark).unwrap();
        assert!(marker_body.contains("agentinit failed 3 times"));
        assert!(marker_body.contains("1000"));
        assert!(marker_body.contains("1200"));
    }

    #[test]
    fn agentinit_stale_failures_are_pruned_from_window() {
        let dir = tempdir().unwrap();
        let fails = dir.path().join("fails");
        let mark = dir.path().join("marker");
        // Pre-seed failures far outside the 600s window ending at t=2200.
        std::fs::write(&fails, "100\n200\n").unwrap();
        let out = record_agentinit_failure_core(&fails, &mark, 2200, 600, 3).unwrap();
        assert_eq!(out.count, 1);
        assert!(!out.escalated);
        assert_eq!(std::fs::read_to_string(&fails).unwrap(), "2200\n");
    }

    #[test]
    fn agentinit_escalation_is_idempotent_once_set() {
        let dir = tempdir().unwrap();
        let fails = dir.path().join("fails");
        let mark = dir.path().join("marker");
        std::fs::write(&mark, "pre-existing").unwrap();
        let _ = record_agentinit_failure_core(&fails, &mark, 1000, 600, 3).unwrap();
        let _ = record_agentinit_failure_core(&fails, &mark, 1100, 600, 3).unwrap();
        let out = record_agentinit_failure_core(&fails, &mark, 1200, 600, 3).unwrap();
        assert_eq!(out.count, 3);
        assert!(!out.escalated, "should not re-escalate if marker exists");
        assert_eq!(std::fs::read_to_string(&mark).unwrap(), "pre-existing");
    }

    #[test]
    fn agentinit_threshold_override_honored() {
        let dir = tempdir().unwrap();
        let fails = dir.path().join("fails");
        let mark = dir.path().join("marker");
        let _ = record_agentinit_failure_core(&fails, &mark, 1000, 600, 2).unwrap();
        let out = record_agentinit_failure_core(&fails, &mark, 1100, 600, 2).unwrap();
        assert_eq!(out.count, 2);
        assert!(out.escalated);
    }

    // Liveness-probe stubs for restart_in_flight_at.
    fn always_alive(_pid: i32) -> bool {
        true
    }
    fn always_dead(_pid: i32) -> bool {
        false
    }

    #[test]
    fn restart_in_flight_false_when_marker_missing() {
        let dir = tempdir().unwrap();
        let marker = dir.path().join("inflight");
        assert!(!restart_in_flight_at(&marker, 1800, "test", always_alive));
    }

    #[test]
    fn restart_in_flight_true_for_live_pid_and_fresh_marker() {
        let dir = tempdir().unwrap();
        let marker = dir.path().join("inflight");
        std::fs::write(&marker, "12345\n2026-01-01T00:00:00Z\nplanned\n").unwrap();
        assert!(restart_in_flight_at(&marker, 1800, "test", always_alive));
        assert!(marker.exists());
    }

    #[test]
    fn restart_in_flight_keeps_dead_pid_pending_until_verified() {
        let dir = tempdir().unwrap();
        let marker = dir.path().join("inflight");
        std::fs::write(&marker, "12345\n2026-01-01T00:00:00Z\nplanned\n").unwrap();
        assert!(restart_in_flight_at(&marker, 1800, "test", always_dead));
        assert!(
            marker.exists(),
            "dead-pid marker should remain until revive verification passes"
        );
    }

    #[test]
    fn restart_verification_pending_before_window() {
        let status = RestartStatusSnapshot {
            path: PathBuf::from("/tmp/status.json"),
            phase: "spawned".to_string(),
            error: None,
        };
        assert_eq!(
            restart_verification_state(59, false, Some(&status)),
            RestartVerification::Pending
        );
    }

    #[test]
    fn restart_verification_verified_after_up_detected_tick_elapsed() {
        let status = RestartStatusSnapshot {
            path: PathBuf::from("/tmp/status.json"),
            phase: "up-detected".to_string(),
            error: None,
        };
        assert_eq!(
            restart_verification_state(TICK_MIN_INTERVAL_S, true, Some(&status)),
            RestartVerification::Verified
        );
    }

    #[test]
    fn restart_verification_fails_after_window_without_liveness() {
        assert_eq!(
            restart_verification_state(WATCHDOG_RESTART_VERIFY_WINDOW_S, false, None),
            RestartVerification::Failed(
                "no restart-status file reported a live agent0".to_string()
            )
        );
    }

    #[test]
    fn restart_in_flight_cleans_malformed_marker() {
        let dir = tempdir().unwrap();
        let marker = dir.path().join("inflight");
        std::fs::write(&marker, "not-a-pid-at-all").unwrap();
        assert!(!restart_in_flight_at(&marker, 1800, "test", always_alive));
        assert!(
            !marker.exists(),
            "malformed marker should be cleaned up so the next tick proceeds"
        );
    }

    #[test]
    fn restart_in_flight_cleans_stale_marker_regardless_of_pid_state() {
        let dir = tempdir().unwrap();
        let marker = dir.path().join("inflight");
        std::fs::write(&marker, "12345\n").unwrap();
        // Backdate well past the 1800s staleness threshold.
        let backdated = "202504010000";
        std::process::Command::new("touch")
            .args(["-t", backdated, marker.to_str().unwrap()])
            .status()
            .unwrap();
        assert!(!restart_in_flight_at(&marker, 1800, "test", always_alive));
        assert!(
            !marker.exists(),
            "stale marker should be removed regardless of liveness probe"
        );
    }

    // Quiet-mode sentinel filename prefix, shared by the sentinel tests.
    fn prefix() -> &'static str {
        netsky_core::paths::agent0_quiet_sentinel_prefix()
    }

    #[test]
    fn quiet_sentinel_absent_means_detect() {
        let dir = tempdir().unwrap();
        let status = quiet_sentinel_status(dir.path(), 1_000_000);
        assert!(status.max_future_epoch.is_none());
        assert!(status.stale_paths.is_empty());
    }

    #[test]
    fn quiet_sentinel_future_epoch_silences_detection() {
        let dir = tempdir().unwrap();
        let now = 1_000_000;
        let expiry = now + 300;
        let path = dir.path().join(format!("{}{expiry}", prefix()));
        std::fs::write(&path, "until <iso> (300s); reason: test\n").unwrap();
        let status = quiet_sentinel_status(dir.path(), now);
        assert_eq!(status.max_future_epoch, Some(expiry));
        assert!(
            status.stale_paths.is_empty(),
            "future sentinel must not be reaped"
        );
    }

    #[test]
    fn quiet_sentinel_past_epoch_is_stale() {
        let dir = tempdir().unwrap();
        let now = 1_000_000;
        let expiry = now - 300;
        let path = dir.path().join(format!("{}{expiry}", prefix()));
        std::fs::write(&path, "").unwrap();
        let status = quiet_sentinel_status(dir.path(), now);
        assert!(status.max_future_epoch.is_none());
        assert_eq!(status.stale_paths, vec![path]);
    }

    #[test]
    fn quiet_sentinel_picks_max_future_and_reaps_past() {
        let dir = tempdir().unwrap();
        let now = 1_000_000;
        let stale_epoch = now - 100;
        let near = now + 60;
        let far = now + 3600;
        let stale_path = dir.path().join(format!("{}{stale_epoch}", prefix()));
        let near_path = dir.path().join(format!("{}{near}", prefix()));
        let far_path = dir.path().join(format!("{}{far}", prefix()));
        std::fs::write(&stale_path, "").unwrap();
        std::fs::write(&near_path, "").unwrap();
        std::fs::write(&far_path, "").unwrap();
        let status = quiet_sentinel_status(dir.path(), now);
        assert_eq!(
            status.max_future_epoch,
            Some(far),
            "far future must win over near future"
        );
        assert_eq!(status.stale_paths, vec![stale_path]);
    }

    #[test]
    fn quiet_sentinel_malformed_suffix_is_stale() {
        let dir = tempdir().unwrap();
        let path = dir.path().join(format!("{}not-a-number", prefix()));
        std::fs::write(&path, "").unwrap();
        let status = quiet_sentinel_status(dir.path(), 1_000_000);
        assert!(status.max_future_epoch.is_none());
        assert_eq!(status.stale_paths, vec![path]);
    }

    #[test]
    fn quiet_sentinel_ignores_unrelated_files() {
        let dir = tempdir().unwrap();
        // Real sibling marker files that share the state dir.
        let siblings = [
            "agent0-pane-hash",
            "agent0-hang-suspected",
            "agent0-hang-paged",
            "agentinfinity-ready",
        ];
        for s in &siblings {
            std::fs::write(dir.path().join(s), "x").unwrap();
        }
        let status = quiet_sentinel_status(dir.path(), 1_000_000);
        assert!(status.max_future_epoch.is_none());
        assert!(
            status.stale_paths.is_empty(),
            "sibling files must not register as sentinels"
        );
        for s in &siblings {
            assert!(dir.path().join(s).exists(), "unrelated file was lost");
        }
    }

    #[test]
    fn quiet_sentinel_missing_dir_returns_empty() {
        let dir = tempdir().unwrap();
        let status = quiet_sentinel_status(&dir.path().join("does-not-exist"), 1_000_000);
        assert!(status.max_future_epoch.is_none());
        assert!(status.stale_paths.is_empty());
    }

    #[test]
    fn quiet_reap_preflight_removes_stale_and_preserves_future() {
        let dir = tempdir().unwrap();
        let now = 1_000_000;
        let stale = dir.path().join(format!("{}{}", prefix(), now - 10));
        let future = dir.path().join(format!("{}{}", prefix(), now + 3600));
        std::fs::write(&stale, "").unwrap();
        std::fs::write(&future, "").unwrap();
        let reaped = quiet_sentinel_reap_at(dir.path(), now, None);
        assert_eq!(reaped, 1, "exactly one stale sentinel reaped");
        assert!(!stale.exists(), "past-epoch sentinel must be removed");
        assert!(future.exists(), "future-epoch sentinel must survive");
    }

    #[test]
    fn quiet_reap_preflight_is_noop_when_no_stale() {
        let dir = tempdir().unwrap();
        let now = 1_000_000;
        let future = dir.path().join(format!("{}{}", prefix(), now + 60));
        std::fs::write(&future, "").unwrap();
        let reaped = quiet_sentinel_reap_at(dir.path(), now, None);
        assert_eq!(reaped, 0);
        assert!(future.exists());
    }

    #[test]
    fn crashloop_single_attempt_does_not_escalate() {
        let dir = tempdir().unwrap();
        let attempts = dir.path().join("attempts");
        let marker = dir.path().join("marker");
        let out = record_crash_attempt_core(&attempts, &marker, 1000, 600, 3, None).unwrap();
        assert_eq!(out.count, 1);
        assert!(!out.escalated);
        assert!(!marker.exists());
        assert_eq!(std::fs::read_to_string(&attempts).unwrap(), "1000\n");
    }

    #[test]
    fn crashloop_escalates_on_third_attempt() {
        let dir = tempdir().unwrap();
        let attempts = dir.path().join("attempts");
        let marker = dir.path().join("marker");
        let _ = record_crash_attempt_core(&attempts, &marker, 1000, 600, 3, None).unwrap();
        let _ = record_crash_attempt_core(&attempts, &marker, 1100, 600, 3, None).unwrap();
        let out = record_crash_attempt_core(
            &attempts,
            &marker,
            1200,
            600,
            3,
            Some("tmux: command too long"),
        )
        .unwrap();
        assert_eq!(out.count, 3);
        assert!(out.escalated);
        assert!(marker.exists());
        let body = std::fs::read_to_string(&marker).unwrap();
        assert!(body.contains("3 restart attempts"));
        assert!(body.contains("tmux: command too long"));
        assert!(body.contains("1000"));
        assert!(body.contains("1200"));
    }

    #[test]
    fn crashloop_attempts_past_epoch_pruned() {
        let dir = tempdir().unwrap();
        let attempts = dir.path().join("attempts");
        let marker = dir.path().join("marker");
        std::fs::write(&attempts, "100\n200\n").unwrap();
        let out = record_crash_attempt_core(&attempts, &marker, 2200, 600, 3, None).unwrap();
        assert_eq!(out.count, 1);
        assert!(!out.escalated);
        assert_eq!(std::fs::read_to_string(&attempts).unwrap(), "2200\n");
    }

    #[test]
    fn crashloop_marker_reaped_on_repair() {
        let dir = tempdir().unwrap();
        let attempts = dir.path().join("attempts");
        let marker = dir.path().join("marker");
        std::fs::write(&attempts, "1000\n1100\n1200\n").unwrap();
        std::fs::write(&marker, "crashloop-suspected").unwrap();
        assert!(attempts.exists() && marker.exists());
        // Simulates the repair path clearing both state files.
        std::fs::remove_file(&attempts).unwrap();
        std::fs::remove_file(&marker).unwrap();
        assert!(!attempts.exists() && !marker.exists());
    }

    #[test]
    fn crashloop_escalation_idempotent_once_marker_set() {
        let dir = tempdir().unwrap();
        let attempts = dir.path().join("attempts");
        let marker = dir.path().join("marker");
        std::fs::write(&marker, "pre-existing body").unwrap();
        let _ = record_crash_attempt_core(&attempts, &marker, 1000, 600, 3, None).unwrap();
        let _ = record_crash_attempt_core(&attempts, &marker, 1100, 600, 3, None).unwrap();
        let out = record_crash_attempt_core(&attempts, &marker, 1200, 600, 3, None).unwrap();
        assert_eq!(out.count, 3);
        assert!(!out.escalated, "should not re-escalate if marker exists");
        assert_eq!(
            std::fs::read_to_string(&marker).unwrap(),
            "pre-existing body"
        );
    }

    #[test]
    fn crashloop_threshold_override_honored() {
        let dir = tempdir().unwrap();
        let attempts = dir.path().join("attempts");
        let marker = dir.path().join("marker");
        let _ = record_crash_attempt_core(&attempts, &marker, 1000, 600, 2, None).unwrap();
        let out = record_crash_attempt_core(&attempts, &marker, 1100, 600, 2, None).unwrap();
        assert_eq!(out.count, 2);
        assert!(out.escalated);
    }

    #[test]
    fn latest_restart_error_returns_none_for_empty_dir() {
        let dir = tempdir().unwrap();
        assert!(latest_restart_error(dir.path()).is_none());
    }

    #[test]
    fn latest_restart_error_returns_none_for_success_status() {
        let dir = tempdir().unwrap();
        let p = dir.path().join("a.json");
        std::fs::write(
            &p,
            r#"{"pid":1,"phase":"up-detected","exit_code":0,"error":null,"started_at":"2026-04-15T16:00:00Z","updated_at":"2026-04-15T16:00:05Z"}"#,
        )
        .unwrap();
        assert!(latest_restart_error(dir.path()).is_none());
    }

    #[test]
    fn latest_restart_error_extracts_errored_message() {
        let dir = tempdir().unwrap();
        let p = dir.path().join("a.json");
        std::fs::write(
            &p,
            r#"{"pid":1,"phase":"errored","exit_code":1,"error":"tmux refused: command too long","started_at":"2026-04-15T16:00:00Z","updated_at":"2026-04-15T16:00:05Z"}"#,
        )
        .unwrap();
        assert_eq!(
            latest_restart_error(dir.path()).as_deref(),
            Some("tmux refused: command too long")
        );
    }

    #[test]
    fn latest_restart_error_picks_most_recent_by_mtime() {
        let dir = tempdir().unwrap();
        let old_path = dir.path().join("old.json");
        let new_path = dir.path().join("new.json");
        std::fs::write(
            &old_path,
            r#"{"pid":1,"phase":"errored","exit_code":1,"error":"old error","started_at":"2026-04-15T16:00:00Z","updated_at":"2026-04-15T16:00:05Z"}"#,
        )
        .unwrap();
        std::fs::write(
            &new_path,
            r#"{"pid":2,"phase":"errored","exit_code":1,"error":"new error","started_at":"2026-04-15T16:05:00Z","updated_at":"2026-04-15T16:05:05Z"}"#,
        )
        .unwrap();
        std::process::Command::new("touch")
            .args(["-t", "202504010000", old_path.to_str().unwrap()])
            .status()
            .unwrap();
        assert_eq!(
            latest_restart_error(dir.path()).as_deref(),
            Some("new error")
        );
    }

    #[test]
    fn latest_restart_error_ignores_non_json() {
        let dir = tempdir().unwrap();
        std::fs::write(dir.path().join("scratch.txt"), "x").unwrap();
        std::fs::write(dir.path().join("a.json.tmp"), "x").unwrap();
        assert!(latest_restart_error(dir.path()).is_none());
    }

    #[test]
    fn watchdog_event_log_for_uses_daily_filename() {
        let p = netsky_core::paths::watchdog_event_log_for("2026-04-15");
        assert!(p.ends_with("netsky-io-watchdog.2026-04-15.log"));
    }

    #[test]
    fn append_watchdog_event_at_writes_line() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("netsky-io-watchdog.2026-04-15.log");
        append_watchdog_event_at(&path, "[watchdog-event test] ticker stopped");
        let body = std::fs::read_to_string(&path).unwrap();
        assert!(body.contains("ticker stopped"));
    }

    // Set `path`'s mtime to April 1 of (2026 - years_back) via `touch -t`.
    fn backdate(path: &Path, years_back: u32) {
        let y = 2026u32.saturating_sub(years_back);
        let stamp = format!("{y}04010000");
        std::process::Command::new("touch")
            .args(["-t", &stamp, path.to_str().unwrap()])
            .status()
            .unwrap();
    }

    #[test]
    fn migrate_legacy_restart_archive_moves_detached_log_and_processing() {
        let src = tempdir().unwrap();
        let dest = tempdir().unwrap();
        let det = src.path().join(RESTART_DETACHED_LOG_FILENAME);
        let proc_a = src
            .path()
            .join(format!("{RESTART_PROCESSING_ARCHIVE_FILENAME_PREFIX}20260414T203646Z{RESTART_PROCESSING_ARCHIVE_FILENAME_SUFFIX}"));
        let noise = src.path().join("unrelated.txt");
        std::fs::write(&det, "log\n").unwrap();
        std::fs::write(&proc_a, "proc\n").unwrap();
        std::fs::write(&noise, "noise\n").unwrap();
        let n = migrate_legacy_restart_archive_core(src.path(), dest.path()).unwrap();
        assert_eq!(n, 2);
        assert!(
            !det.exists() && !proc_a.exists(),
            "matching files must have been moved out of src"
        );
        assert!(dest.path().join(RESTART_DETACHED_LOG_FILENAME).exists());
        assert!(noise.exists(), "unrelated files must not move");
        let n2 = migrate_legacy_restart_archive_core(src.path(), dest.path()).unwrap();
        assert_eq!(n2, 0);
    }

    #[test]
    fn migrate_legacy_restart_archive_tolerates_missing_src() {
        let tmp = tempdir().unwrap();
        let src = tmp.path().join("does-not-exist");
        let dest = tmp.path().join("dest");
        let n = migrate_legacy_restart_archive_core(&src, &dest).unwrap();
        assert_eq!(n, 0);
    }

    #[test]
    fn restart_archive_sweep_removes_old_files_and_preserves_fresh() {
        let dir = tempdir().unwrap();
        let old = dir
            .path()
            .join("netsky-restart-processing.20260101T000000Z.archived");
        let fresh = dir
            .path()
            .join("netsky-restart-processing.20260415T000000Z.archived");
        std::fs::write(&old, "old").unwrap();
        std::fs::write(&fresh, "fresh").unwrap();
        backdate(&old, 2);
        let now_s = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap();
        let ttl = 30 * 24 * 60 * 60;
        let reaped = restart_archive_sweep_at(dir.path(), now_s, ttl);
        assert_eq!(reaped, 1);
        assert!(!old.exists(), "old archive must have been swept");
        assert!(fresh.exists(), "fresh archive must survive");
    }

    #[test]
    fn restart_archive_sweep_skips_active_detached_log() {
        let dir = tempdir().unwrap();
        let log = dir.path().join(RESTART_DETACHED_LOG_FILENAME);
        std::fs::write(&log, "live").unwrap();
        backdate(&log, 2);
        let now_s = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap();
        let reaped = restart_archive_sweep_at(dir.path(), now_s, 30 * 24 * 60 * 60);
        assert_eq!(reaped, 0);
        assert!(
            log.exists(),
            "active detached log must never be swept regardless of age"
        );
    }

    #[test]
    fn restart_archive_sweep_missing_dir_returns_zero() {
        let tmp = tempdir().unwrap();
        let missing = tmp.path().join("not-there");
        assert_eq!(restart_archive_sweep_at(&missing, 1_000_000, 60), 0);
    }

    #[test]
    fn is_legacy_restart_archive_name_matches_expected() {
        assert!(is_legacy_restart_archive_name(
            RESTART_DETACHED_LOG_FILENAME
        ));
        assert!(is_legacy_restart_archive_name(
            "netsky-restart-processing.20260414T203646Z.archived"
        ));
        assert!(!is_legacy_restart_archive_name("netsky-watchdog.out.log"));
        assert!(!is_legacy_restart_archive_name(
            "netsky-restart-processing.txt"
        ));
    }

    #[test]
    fn atomic_write_string_creates_target_on_happy_path() {
        let dir = tempdir().unwrap();
        let target = dir.path().join("attempts");
        atomic_write_string(&target, "1000\n1100\n").unwrap();
        assert_eq!(std::fs::read_to_string(&target).unwrap(), "1000\n1100\n");
    }

    #[test]
    fn atomic_write_string_overwrites_atomically_with_no_leftover_tmp() {
        let dir = tempdir().unwrap();
        let target = dir.path().join("attempts");
        std::fs::write(&target, "old\n").unwrap();
        atomic_write_string(&target, "new\n").unwrap();
        assert_eq!(std::fs::read_to_string(&target).unwrap(), "new\n");
        let pid = std::process::id();
        let tmp = dir.path().join(format!("attempts.tmp.{pid}"));
        assert!(
            !tmp.exists(),
            "tmp file from successful write must not linger"
        );
    }

    #[test]
    fn crashloop_counter_survives_dangling_tmp_from_prior_kill() {
        let dir = tempdir().unwrap();
        let attempts = dir.path().join("attempts");
        let marker = dir.path().join("marker");
        std::fs::write(&attempts, "1000\n1100\n").unwrap();
        // A tmp file abandoned by a killed writer must not corrupt state.
        std::fs::write(
            dir.path().join("attempts.tmp.99999"),
            "CORRUPT WRITE IN PROGRESS",
        )
        .unwrap();
        let out = record_crash_attempt_core(&attempts, &marker, 1200, 3600, 3, None).unwrap();
        assert_eq!(out.count, 3);
        assert!(out.escalated);
        let committed = std::fs::read_to_string(&attempts).unwrap();
        assert!(!committed.contains("CORRUPT"));
        assert!(committed.contains("1000"));
        assert!(committed.contains("1200"));
    }
}