// netsky 0.1.4
//
// netsky CLI: the viable system launcher and subcommand dispatcher
// Documentation
//! `netsky drill {1|2}` — acceptance tests for the restart paths.
//!
//! - drill 1: planned restart. Stage a handoff file at
//!   `/tmp/netsky-restart-request.txt`, kill agent0, wait for the
//!   watchdog to claim the request, verify archive + /up.
//! - drill 2: crash recovery. Kill agent0 with NO handoff staged; watchdog
//!   synthesizes a crash-recovery handoff; verify archive carries the
//!   expected sigil.
//!
//! DESTRUCTIVE — actually kills + respawns agent0. Requires agent0 +
//! agentinfinity both alive at start.

use std::fs;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::{Duration, SystemTime, UNIX_EPOCH};

use netsky_core::consts::{AGENT0_NAME, AGENTINFINITY_NAME, RESTART_REQUEST_FILE};
use netsky_core::paths::{handoff_archive_dir, home};
use netsky_sh::tmux;

/// Location of the drill log, relative to `home()`. Despite the `SUBDIR`
/// name this is the log *file* itself: `Drill::new` creates its parent
/// directory and `Drill::log` appends lines to it.
const DRILL_LOG_SUBDIR: &str = "Library/Logs/netsky-drills.log";
/// Seconds `Drill::wait_for_session` allows for the killed session to respawn.
const WAIT_SECS: u64 = 150;
/// Seconds the drills allow for the handoff envelope to appear in the
/// archive and, separately, for the `/up` report line to show in the pane.
const UP_WAIT_SECS: u64 = 720;
/// Text drill 2 searches for inside archived envelopes to recognise a
/// watchdog-synthesized crash-recovery handoff.
const CRASH_SIGIL: &str = "crash-recovery restart by agentinfinity watchdog";

/// Entry point for `netsky drill {1|2}`: dispatches to the drill numbered `n`.
///
/// # Errors
/// Propagates whatever the selected drill returns, and bails with an
/// "unknown drill" error for any `n` other than 1 or 2.
pub fn run(n: u8) -> netsky_core::Result<()> {
    if n == 1 {
        return drill_one();
    }
    if n == 2 {
        return drill_two();
    }
    netsky_core::bail!("unknown drill {n} (supported: 1, 2)")
}

/// Drill 1 — planned restart.
///
/// Stages a handoff request carrying a unique marker, kills the agent0 tmux
/// session, then verifies in order: the session comes back, the request file
/// is gone, an archived envelope containing the marker appears, and the
/// agent0 pane shows an `/up` report line.
///
/// # Errors
/// Returns an error if harness creation, a pre-check, a setup step, or any
/// verification step fails. Once the harness exists, every failure is also
/// recorded in the drill log as `FAIL` followed by `RESULT: red`.
fn drill_one() -> netsky_core::Result<()> {
    let d = Drill::new("drill-1")?;
    d.pre_checks()?;
    // Envelopes modified before this instant are ignored when searching the
    // archive, so leftovers from earlier runs cannot satisfy this drill.
    let pre_cutoff = unix_now();

    // Take a single timestamp so the marker and the body agree with each other.
    let now = chrono::Utc::now();
    let marker = format!(
        "netsky-drill-1 marker {}-{}",
        now.format("%Y%m%dT%H%M%SZ"),
        std::process::id()
    );
    let body = format!(
        "planned restart by netsky-drill-1 at {}.\n\n{marker}\n\nthis is an acceptance-test handoff. \
         the new agent0 should /up and report normally. no further action expected.\n",
        now.format("%Y-%m-%dT%H:%M:%SZ")
    );

    // Setup failures must leave a verdict in the drill log too; otherwise the
    // log shows a `start` line with no result. The original error is returned.
    if let Err(e) = fs::write(RESTART_REQUEST_FILE, &body) {
        d.log(&format!(
            "FAIL: could not stage handoff at {RESTART_REQUEST_FILE}: {e}"
        ));
        d.log("RESULT: red");
        return Err(e.into());
    }
    d.log(&format!(
        "staged handoff at {RESTART_REQUEST_FILE} (marker: {marker})"
    ));

    if let Err(e) = tmux::kill_session(AGENT0_NAME) {
        d.log("FAIL: could not kill agent0 tmux session");
        d.log("RESULT: red");
        return Err(e.into());
    }
    d.log(&format!(
        "killed agent0 tmux session; waiting up to {WAIT_SECS}s for respawn"
    ));

    d.wait_for_session(AGENT0_NAME)?;
    // The request file should have been consumed by the time agent0 is back.
    if Path::new(RESTART_REQUEST_FILE).exists() {
        return d.fail("handoff request still present after respawn; watchdog did not claim");
    }
    d.wait_for_envelope_with(&marker, pre_cutoff, UP_WAIT_SECS)?;
    d.wait_for_up_marker(AGENT0_NAME, UP_WAIT_SECS)?;
    d.pass()
}

/// Drill 2 — crash recovery.
///
/// Kills the agent0 tmux session with no handoff request staged, then
/// verifies in order: the session comes back, an archived envelope containing
/// `CRASH_SIGIL` appears, and the agent0 pane shows an `/up` report line.
/// Refuses to run if a restart request file already exists.
///
/// # Errors
/// Returns an error if harness creation, a pre-check, the kill, or any
/// verification step fails. Once the harness exists, every failure is also
/// recorded in the drill log as `FAIL` followed by `RESULT: red`.
fn drill_two() -> netsky_core::Result<()> {
    let d = Drill::new("drill-2")?;
    d.pre_checks()?;
    if Path::new(RESTART_REQUEST_FILE).exists() {
        return d.fail(&format!(
            "stale {RESTART_REQUEST_FILE} present; refusing to trample (drill-2 must not have a handoff request)"
        ));
    }
    // Envelopes modified before this instant are ignored when searching the
    // archive, so leftovers from earlier runs cannot satisfy this drill.
    let pre_cutoff = unix_now();

    // A failed kill must leave a verdict in the drill log too; otherwise the
    // log shows a `start` line with no result. The original error is returned.
    if let Err(e) = tmux::kill_session(AGENT0_NAME) {
        d.log("FAIL: could not kill agent0 tmux session");
        d.log("RESULT: red");
        return Err(e.into());
    }
    d.log(&format!(
        "killed agent0 tmux session; waiting up to {WAIT_SECS}s for respawn"
    ));

    d.wait_for_session(AGENT0_NAME)?;
    d.wait_for_envelope_with(CRASH_SIGIL, pre_cutoff, UP_WAIT_SECS)?;
    d.wait_for_up_marker(AGENT0_NAME, UP_WAIT_SECS)?;
    d.pass()
}

// ---- drill harness --------------------------------------------------------

/// Shared drill harness: the drill's name (used as the tag on every log
/// line) and the path of the log file each step is appended to.
struct Drill {
    name: &'static str,
    log_path: PathBuf,
}

impl Drill {
    /// Pause between successive probes in the `wait_for_*` methods.
    const POLL_INTERVAL: Duration = Duration::from_secs(2);

    /// Builds the harness, creating the log file's parent directory when
    /// needed, and logs a `start` line.
    ///
    /// # Errors
    /// Returns an error if the log directory cannot be created.
    fn new(name: &'static str) -> netsky_core::Result<Self> {
        let log_path = home().join(DRILL_LOG_SUBDIR);
        if let Some(parent) = log_path.parent() {
            fs::create_dir_all(parent)?;
        }
        let d = Drill { name, log_path };
        d.log("start");
        Ok(d)
    }

    /// Confirms that both the agent0 and agentinfinity tmux sessions are
    /// alive before anything destructive happens.
    ///
    /// # Errors
    /// Fails the drill (logging red) if either session is missing.
    fn pre_checks(&self) -> netsky_core::Result<()> {
        if !tmux::session_is_alive(AGENT0_NAME) {
            return self.fail("agent0 tmux session not present at drill start");
        }
        if !tmux::session_is_alive(AGENTINFINITY_NAME) {
            return self.fail("agentinfinity tmux session not present at drill start");
        }
        Ok(())
    }

    /// Calls `probe` every [`Self::POLL_INTERVAL`] until it yields `Some` or
    /// `timeout_s` seconds have elapsed, returning the first hit if any.
    ///
    /// The budget is measured with the monotonic `Instant` clock rather than
    /// wall-clock time, so a system clock adjustment in the middle of a drill
    /// can neither cut the wait short nor extend it.
    fn poll_until<T>(timeout_s: u64, mut probe: impl FnMut() -> Option<T>) -> Option<T> {
        let budget = Duration::from_secs(timeout_s);
        let started = std::time::Instant::now();
        while started.elapsed() < budget {
            if let Some(hit) = probe() {
                return Some(hit);
            }
            thread::sleep(Self::POLL_INTERVAL);
        }
        None
    }

    /// True when `line` contains `"session "` immediately followed by an
    /// ASCII digit. Only the first `"session "` on the line is examined.
    fn is_up_report_line(line: &str) -> bool {
        line.split_once("session ")
            .is_some_and(|(_, rest)| rest.chars().next().is_some_and(|c| c.is_ascii_digit()))
    }

    /// Waits up to `WAIT_SECS` for the tmux session `sess` to be alive.
    ///
    /// # Errors
    /// Fails the drill (logging red) if the session does not appear in time.
    fn wait_for_session(&self, sess: &str) -> netsky_core::Result<()> {
        let alive = Self::poll_until(WAIT_SECS, || tmux::session_is_alive(sess).then_some(()));
        if alive.is_none() {
            return self.fail(&format!("{sess} did not respawn within {WAIT_SECS}s"));
        }
        self.log(&format!("{sess} tmux session observed alive"));
        Ok(())
    }

    /// Waits up to `timeout_s` seconds for the handoff archive to contain a
    /// `.json` envelope whose body includes `needle` and whose modification
    /// time (in whole Unix seconds) is at least `pre_cutoff`.
    ///
    /// # Errors
    /// Returns an error if the archive directory cannot be created, and fails
    /// the drill (logging red) if no matching envelope appears in time.
    fn wait_for_envelope_with(
        &self,
        needle: &str,
        pre_cutoff: u64,
        timeout_s: u64,
    ) -> netsky_core::Result<()> {
        let archive = handoff_archive_dir();
        // Make sure the directory exists so polling does not depend on the
        // watchdog having created it first.
        fs::create_dir_all(&archive)?;

        match Self::poll_until(timeout_s, || find_envelope(&archive, needle, pre_cutoff)) {
            Some(path) => {
                // Prefer the bare file name; fall back to the whole path
                // instead of panicking if there is no final component.
                let shown = path.file_name().map_or_else(
                    || path.display().to_string(),
                    |n| n.to_string_lossy().into_owned(),
                );
                self.log(&format!("handoff envelope delivered: {shown}"));
                Ok(())
            }
            None => self.fail(&format!(
                "handoff envelope with marker not found in {} within {timeout_s}s",
                archive.display()
            )),
        }
    }

    /// Waits up to `timeout_s` seconds for the pane of `sess` to show a line
    /// accepted by [`Self::is_up_report_line`]. Pane-capture errors are
    /// treated as "not yet" and retried.
    ///
    /// # Errors
    /// Fails the drill (logging red) if no such line is seen in time.
    fn wait_for_up_marker(&self, sess: &str, timeout_s: u64) -> netsky_core::Result<()> {
        let seen = Self::poll_until(timeout_s, || {
            tmux::capture_pane(sess, None)
                .ok()
                .filter(|pane| pane.lines().any(|l| Self::is_up_report_line(l)))
                .map(|_| ())
        });
        if seen.is_none() {
            return self.fail(&format!(
                "/up report line not seen in {sess} pane within {timeout_s}s"
            ));
        }
        self.log(&format!("/up report line observed in {sess} pane"));
        Ok(())
    }

    /// Prints a UTC-timestamped line tagged with the drill name to stdout and
    /// appends the same line to the drill log file.
    ///
    /// The file write is best-effort: failures to open or write the log are
    /// deliberately ignored so logging can never abort a drill.
    fn log(&self, msg: &str) {
        let line = format!(
            "[{}] {} {msg}",
            chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ"),
            self.name
        );
        println!("{line}");
        let _ = fs::OpenOptions::new()
            .create(true)
            .append(true)
            .open(&self.log_path)
            .and_then(|mut f| {
                use std::io::Write;
                writeln!(f, "{line}")
            });
    }

    /// Records a green result and returns `Ok(())`.
    fn pass(&self) -> netsky_core::Result<()> {
        self.log("RESULT: green");
        Ok(())
    }

    /// Records `FAIL: {msg}` and a red result, then bails with `msg`.
    ///
    /// The `Result` return type lets callers write `return self.fail(..)` or
    /// `self.fail(..)?`; this function never completes successfully.
    fn fail(&self, msg: &str) -> netsky_core::Result<()> {
        self.log(&format!("FAIL: {msg}"));
        self.log("RESULT: red");
        netsky_core::bail!("{msg}")
    }
}

/// Scans `dir` for the first `.json` file whose modification time (whole
/// seconds since the Unix epoch) is at least `pre_cutoff` and whose UTF-8
/// contents contain `needle`.
///
/// Returns `None` when `dir` cannot be read or nothing matches. Entries that
/// cannot be listed or read as text are skipped, and a file whose
/// modification time is unavailable is treated as having mtime 0.
fn find_envelope(dir: &Path, needle: &str, pre_cutoff: u64) -> Option<PathBuf> {
    let modified_secs = |entry: &fs::DirEntry| -> u64 {
        entry
            .metadata()
            .and_then(|meta| meta.modified())
            .ok()
            .and_then(|stamp| stamp.duration_since(UNIX_EPOCH).ok())
            .map_or(0, |since_epoch| since_epoch.as_secs())
    };

    fs::read_dir(dir)
        .ok()?
        .flatten()
        .filter_map(|entry| {
            let candidate = entry.path();
            let is_json = candidate.extension().and_then(|ext| ext.to_str()) == Some("json");
            // Extension is tested first so metadata is only fetched for JSON files.
            (is_json && modified_secs(&entry) >= pre_cutoff).then_some(candidate)
        })
        .find(|candidate| {
            fs::read_to_string(candidate).is_ok_and(|text| text.contains(needle))
        })
}

/// Current wall-clock time as whole seconds since the Unix epoch, or 0 if
/// the system clock reports a time before the epoch.
fn unix_now() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}