trusty-memory 0.1.45

Machine-wide, blazingly fast AI memory service
//! Daemon address discovery for `status`, `doctor`, and `ensure_daemon`.
//!
//! Why: Issue #50 — `trusty-memory status` and `trusty-memory doctor` were
//! falsely reporting the daemon as "not running" when the discovery file at
//! `~/Library/Application Support/trusty-memory/http_addr` was missing or
//! stale (e.g. launchd-managed daemons with a different `HOME`, or a daemon
//! that crashed before writing the file). The fix is to probe multiple
//! sources in priority order rather than relying solely on the addr file.
//! What: The main helper, `probe_daemon`, returns the first live
//! `SocketAddr` it finds. Sources, in order: the env var
//! `TRUSTY_MEMORY_HTTP_PORT`, the shared `trusty_common` discovery file, then
//! a small range of well-known candidate ports (3031..=3050). The env var and
//! discovery file addresses only need to answer a TCP connect; candidate
//! ports must additionally answer `GET /health` with `HTTP/1.1 200`, because
//! an unrelated service may be listening on one of them. The candidate
//! range matches the CLI default (3031) plus the auto-walk window (+20) used
//! by `bind_with_auto_port`, so any port a `trusty-memory serve` could have
//! ended up on is covered.
//! Test: `probe_candidate_ports_returns_none_when_nothing_listens` and
//! `parse_addr_handles_bare_port` cover the pure helpers; end-to-end probe
//! behavior is exercised manually via `trusty-memory doctor`.

use std::io::{Read, Write};
use std::net::{SocketAddr, TcpStream};
use std::time::Duration;

/// Env var override for the daemon's HTTP port. Set to a bare port (e.g.
/// `3038`) or a full `ip:port` socket address (e.g. `127.0.0.1:3038`). Useful
/// for users who launch the daemon via launchd / systemd with a pinned port
/// and want `doctor` / `status` to find it deterministically regardless of
/// the discovery file state.
///
/// The value is trimmed and parsed by `parse_addr`: a bare port is assumed to
/// live on `127.0.0.1`, and the full form must use an IP literal (a hostname
/// such as `localhost:3038` does not parse). A value that does not parse, or
/// whose address does not answer a TCP connect, is ignored and `probe_daemon`
/// moves on to the next source.
pub const HTTP_PORT_ENV: &str = "TRUSTY_MEMORY_HTTP_PORT";

/// First candidate port probed (inclusive) when neither the env var nor the
/// discovery file yields a live address. `probe_candidate_ports` walks upward
/// from here on `127.0.0.1`.
const CANDIDATE_PORT_START: u16 = 3031;

/// Last candidate port probed (inclusive). Matches the +20 auto-walk window
/// used by `trusty_common::bind_with_auto_port` when the default port is
/// taken, so any port a normally-configured daemon could have landed on is
/// covered. Keep this in sync with `Serve.http` / `Start.http` defaults.
///
/// NOTE(review): `3031..=3050` spans exactly 20 ports. If the auto-walk tries
/// `start + 20` inclusive, port 3051 would not be covered — confirm against
/// `bind_with_auto_port`.
const CANDIDATE_PORT_END: u16 = 3050;

/// Default timeout for liveness probes. Short enough to keep
/// `status` / `doctor` snappy, long enough to tolerate a busy localhost.
///
/// Used as the TCP connect timeout by `tcp_alive` and `http_health_ok`, and
/// also as the socket read and write timeout for the `/health` exchange in
/// `http_health_ok`.
const PROBE_TIMEOUT: Duration = Duration::from_millis(300);

/// Result of probing for the running daemon.
///
/// Why: Callers need to distinguish *which* source succeeded — the addr-file
/// path is preferred for status output ("HTTP: http://X"), while a
/// candidate-port hit should warn the user that the discovery file is stale.
/// What: Carries the live `SocketAddr` plus a tag describing its origin.
/// Values are only built by `probe_daemon`, after the address passed that
/// source's liveness check.
/// Test: Constructed in `probe_daemon` and consumed by `doctor::handle` /
/// status output; covered by manual smoke.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DaemonAddr {
    /// The live socket address that responded to a TCP connect (and, for
    /// `AddrSource::CandidatePort`, also answered `/health` with a 200).
    pub addr: SocketAddr,
    /// Where we found it. Used by callers to render helpful diagnostics.
    pub source: AddrSource,
}

/// Origin of a discovered daemon address.
///
/// Why: `doctor` should emit a warning when the discovery file is missing or
/// points at a dead port but a candidate-port probe still found the daemon —
/// users need to know the file is out of sync.
/// What: Three variants matching the three probe sources, listed in the
/// priority order `probe_daemon` tries them.
/// Test: Pattern-matched in `doctor::check_daemon_running`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AddrSource {
    /// Read from the `TRUSTY_MEMORY_HTTP_PORT` env var. Verified with a TCP
    /// connect only.
    EnvVar,
    /// Read from the shared `trusty_common` discovery file. Verified with a
    /// TCP connect only.
    DiscoveryFile,
    /// Discovered by walking the candidate port range (fallback). Verified
    /// with a TCP connect *and* an HTTP `/health` check.
    CandidatePort,
}

/// Result of a process-level (non-HTTP) liveness probe.
///
/// Why: `--no-http` daemons (Claude Code stdio path) have no listener to
/// connect to but are still very much alive. `status` / `doctor` need to
/// distinguish that case from "no daemon at all". The PID file is the
/// authoritative marker for "a daemon process exists" regardless of
/// transport.
/// What: Carries the daemon's PID. The probe asserts the process is
/// signalable (i.e. exists and we have permission to signal it) via
/// `kill -0` semantics. The value is a snapshot: the process may exit at any
/// point after the probe returns.
/// Test: `pid_alive_returns_false_for_unused_pid` covers the negative case.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DaemonProcess {
    /// The PID read from the PID file, confirmed alive at probe time.
    pub pid: u32,
}

/// Look up the daemon via its PID file and confirm the process is alive.
///
/// Why: `--no-http` daemons expose no TCP listener, so the HTTP probes in
/// `probe_daemon` cannot see them. The PID file gives a liveness signal that
/// does not depend on the transport.
/// What: Fetches the recorded PID through `cli::stop::read_pid_file`. When no
/// PID is available the result is `None`. Otherwise the PID is checked with
/// `pid_alive`; only a PID that passes is wrapped in `DaemonProcess`.
/// Test: Indirectly via `status` integration when running `serve --no-http`.
pub fn probe_pid_file() -> Option<DaemonProcess> {
    crate::cli::stop::read_pid_file()
        // A stale PID file (process gone) must not count as a running daemon.
        .filter(|&recorded| pid_alive(recorded))
        .map(|pid| DaemonProcess { pid })
}

/// Best-effort "is this PID currently signalable?" check.
///
/// Why: We want to know if the daemon process from the PID file is still
/// running without killing it. Sending signal 0 to a PID is the canonical
/// POSIX liveness probe — exit code 0 means "exists and signalable".
/// What: On Unix runs `kill -0 <pid>` (the `kill` binary is resolved via
/// `PATH`) through `std::process::Command` and reports success based on the
/// exit status. Avoids a new `libc`/`nix` dependency. On non-Unix platforms
/// returns `false` so callers degrade gracefully to the HTTP-only discovery
/// path.
///
/// Values that cannot name a single process are rejected up front and
/// reported as not alive:
/// - `0` — `kill` interprets PID 0 as "every process in the caller's process
///   group", so `kill -0 0` succeeds regardless of any daemon. A corrupt or
///   zeroed PID file would otherwise be reported as a running daemon.
/// - anything above `i32::MAX` — not representable as a positive `pid_t`;
///   depending on how `kill` parses it, it may wrap to a negative
///   (group/broadcast) target.
///
/// Returns `false` as well when `kill` cannot be spawned at all.
/// Test: `pid_alive_returns_false_for_unused_pid` asserts a definitely-unused
/// PID is reported dead.
pub fn pid_alive(pid: u32) -> bool {
    // Only strictly positive values that fit a signed 32-bit pid_t identify
    // exactly one process; everything else has group/broadcast semantics.
    if pid == 0 || i32::try_from(pid).is_err() {
        return false;
    }

    #[cfg(unix)]
    {
        // `kill -0 <pid>` exits 0 iff the target process exists and we can
        // signal it. We swallow stdout/stderr because callers only care
        // about the boolean outcome.
        std::process::Command::new("kill")
            .arg("-0")
            .arg(pid.to_string())
            .stdout(std::process::Stdio::null())
            .stderr(std::process::Stdio::null())
            .status()
            .map(|s| s.success())
            .unwrap_or(false)
    }
    #[cfg(not(unix))]
    {
        false
    }
}

/// Probe every known source for a live trusty-memory daemon address.
///
/// Why: Issue #50. Previously, `doctor` and `status` relied solely on the
/// `http_addr` discovery file; when the file was absent (e.g. launchd
/// daemon writing to a different `HOME`) the commands reported "not
/// running" despite a healthy daemon on a known port. Probing multiple
/// sources eliminates the false negative.
/// What: Returns `Some(DaemonAddr)` for the first source whose address
/// answers a TCP connect within `PROBE_TIMEOUT`. Sources, in priority
/// order: `TRUSTY_MEMORY_HTTP_PORT` env var → discovery file → candidate
/// ports `3031..=3050` on `127.0.0.1`. Returns `None` if nothing answers.
/// Test: Manual via `trusty-memory doctor` against a daemon on 3038
/// without a discovery file; the helper-level tests cover the pure parts.
pub fn probe_daemon() -> Option<DaemonAddr> {
    if let Ok(raw) = std::env::var(HTTP_PORT_ENV) {
        if let Some(addr) = parse_addr(raw.trim()) {
            if tcp_alive(&addr) {
                return Some(DaemonAddr {
                    addr,
                    source: AddrSource::EnvVar,
                });
            }
        }
    }

    if let Ok(Some(raw)) = trusty_common::read_daemon_addr("trusty-memory") {
        let trimmed = raw.trim();
        if !trimmed.is_empty() {
            if let Some(addr) = parse_addr(trimmed) {
                if tcp_alive(&addr) {
                    return Some(DaemonAddr {
                        addr,
                        source: AddrSource::DiscoveryFile,
                    });
                }
            }
        }
    }

    probe_candidate_ports().map(|addr| DaemonAddr {
        addr,
        source: AddrSource::CandidatePort,
    })
}

/// Scan the candidate port range on `127.0.0.1` for a verified daemon.
///
/// Why: Last-resort fallback when neither the env var nor the discovery file
/// produced a live address (the launchd-with-different-HOME case from issue
/// #50). A successful TCP connect alone is not trusted here: macOS reserves
/// TCP 3031 for the `eppc` (Remote AppleEvents) service, which accepts
/// connects and then resets, and that used to make `trusty-memory status`
/// claim a daemon on 3031 even when the real daemon ran with `--no-http`.
/// What: Visits `CANDIDATE_PORT_START..=CANDIDATE_PORT_END` in ascending
/// order. A port qualifies only if it passes `tcp_alive` and then
/// `http_health_ok` (a `GET /health` answered with `HTTP/1.1 200`). The first
/// qualifying address is returned; later ports are not probed. Returns `None`
/// when no port qualifies.
/// Test: `probe_candidate_ports_returns_none_when_nothing_listens` is a
/// no-panic smoke test; the eppc false-positive case is exercised by manual
/// smoke (run `trusty-memory serve --no-http` and confirm
/// `trusty-memory status` reports "daemon: not running").
fn probe_candidate_ports() -> Option<SocketAddr> {
    (CANDIDATE_PORT_START..=CANDIDATE_PORT_END)
        .map(|port| SocketAddr::from(([127, 0, 0, 1], port)))
        // Cheap connect check first; only speak HTTP to ports that accept.
        .find(|candidate| tcp_alive(candidate) && http_health_ok(candidate))
}

/// Verify that the endpoint at `addr` is a trusty-memory HTTP daemon by
/// hitting `/health` and checking for `HTTP/1.1 200`.
///
/// Why: A bare TCP connect is insufficient for liveness — macOS's built-in
/// `eppc` service binds TCP 3031 by default and accepts connects (then
/// resets), which produces a false positive when the candidate-port probe
/// runs on a Mac with no trusty-memory HTTP listener. We must speak HTTP to
/// confirm the listener is ours. The implementation stays dependency-free
/// (std `TcpStream`) so it can run from synchronous callers; a short
/// hand-rolled GET request is sufficient.
/// What: Opens a TCP connection (connect, read and write are each bounded by
/// `PROBE_TIMEOUT`), writes a minimal HTTP/1.1 `GET /health` request with
/// `Connection: close`, then reads until the status-line prefix is available
/// or the peer closes. Returns true iff the response starts with
/// `HTTP/1.1 200`. A single `read` may legally return fewer bytes than the
/// prefix, so the read is repeated until enough bytes have arrived instead of
/// judging the first fragment. Any I/O error, timeout, early close, or
/// non-200 status returns false.
/// Test: Implicitly covered by `probe_candidate_ports_returns_none_when_nothing_listens`;
/// manual smoke with `trusty-memory serve --no-http` confirms port 3031
/// (macOS eppc) no longer triggers a false positive.
fn http_health_ok(addr: &SocketAddr) -> bool {
    /// Status-line prefix a healthy daemon must answer with.
    const EXPECTED_STATUS: &[u8] = b"HTTP/1.1 200";

    let Ok(mut stream) = TcpStream::connect_timeout(addr, PROBE_TIMEOUT) else {
        return false;
    };
    if stream.set_read_timeout(Some(PROBE_TIMEOUT)).is_err()
        || stream.set_write_timeout(Some(PROBE_TIMEOUT)).is_err()
    {
        return false;
    }
    let req = format!("GET /health HTTP/1.1\r\nHost: {addr}\r\nConnection: close\r\n\r\n");
    if stream.write_all(req.as_bytes()).is_err() {
        return false;
    }

    let mut buf = [0u8; 64];
    let mut filled = 0;
    // Accumulate until the whole prefix can be compared. At most
    // `EXPECTED_STATUS.len()` reads can happen, each bounded by the read
    // timeout, so the loop always terminates.
    while filled < EXPECTED_STATUS.len() {
        match stream.read(&mut buf[filled..]) {
            // Peer closed before sending a full status line.
            Ok(0) => break,
            Ok(n) => filled += n,
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(_) => return false,
        }
    }
    buf[..filled].starts_with(EXPECTED_STATUS)
}

/// Report whether anything accepts a TCP connection at `addr`.
///
/// Why: The discovery file can outlive a crashed daemon, so actual
/// reachability — not the file's contents — decides whether the daemon is up.
/// What: Attempts `TcpStream::connect_timeout` with `PROBE_TIMEOUT`. A
/// successful connect yields `true` (the connection is dropped right away);
/// any error, including a timeout, yields `false`. This does not prove the
/// listener is trusty-memory — see `http_health_ok` for that.
/// Test: `tcp_alive_returns_false_for_unbound_port`.
pub fn tcp_alive(addr: &SocketAddr) -> bool {
    match TcpStream::connect_timeout(addr, PROBE_TIMEOUT) {
        Ok(_stream) => true,
        Err(_) => false,
    }
}

/// Turn a user-supplied string into a socket address.
///
/// Why: `TRUSTY_MEMORY_HTTP_PORT` may hold either a full socket address or
/// just a port, so `3038` works as shorthand for `127.0.0.1:3038`.
/// What: Accepts anything `SocketAddr`'s `FromStr` accepts (an IP literal
/// plus port, e.g. `127.0.0.1:3038` or `[::1]:3038`). Otherwise the input
/// must be a valid `u16`, which is paired with the IPv4 loopback address.
/// Returns `None` for everything else, including the empty string. The input
/// is not trimmed here; callers trim first.
/// Test: `parse_addr_handles_bare_port` and `parse_addr_handles_full_addr`.
fn parse_addr(s: &str) -> Option<SocketAddr> {
    match s.parse::<SocketAddr>() {
        Ok(full) => Some(full),
        Err(_) => {
            // Not a full address — fall back to the bare-port form.
            let port: u16 = s.parse().ok()?;
            Some(SocketAddr::from((std::net::Ipv4Addr::LOCALHOST, port)))
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `127.0.0.1:<port>` — the address form the probes and `parse_addr`
    /// produce for loopback.
    fn loopback(port: u16) -> SocketAddr {
        SocketAddr::from(([127, 0, 0, 1], port))
    }

    #[test]
    fn parse_addr_handles_bare_port() {
        // A bare port is shorthand for that port on IPv4 loopback.
        let parsed = parse_addr("3038").expect("bare port should parse");
        assert_eq!(parsed, loopback(3038));
    }

    #[test]
    fn parse_addr_handles_full_addr() {
        let parsed = parse_addr("127.0.0.1:3038").expect("full addr should parse");
        assert_eq!(3038, parsed.port());
    }

    #[test]
    fn parse_addr_rejects_garbage() {
        for junk in ["not an address", ""] {
            assert!(parse_addr(junk).is_none());
        }
    }

    #[test]
    fn probe_candidate_ports_returns_none_when_nothing_listens() {
        // Smoke test only: the CI host may or may not have something bound
        // in 3031..=3050, so either a live address or `None` is acceptable.
        // What is checked is that the scan completes without panicking; the
        // pure logic is pinned by the parse tests above.
        let _outcome = probe_candidate_ports();
    }

    #[test]
    #[cfg(unix)]
    fn pid_alive_returns_false_for_unused_pid() {
        // 0x7FFF_FFFF is far beyond any real pid_max, so no such process can
        // exist and `kill -0` must fail for it.
        let impossible_pid = 0x7FFF_FFFF;
        assert!(!pid_alive(impossible_pid));
    }

    #[test]
    #[cfg(unix)]
    fn pid_alive_returns_true_for_current_process() {
        // The test process itself is necessarily alive and signalable.
        let own_pid = std::process::id();
        assert!(pid_alive(own_pid));
    }

    #[test]
    fn tcp_alive_returns_false_for_unbound_port() {
        // Port 1 is reserved and not bound by user processes. Whether the OS
        // refuses the connect immediately or lets it time out, the probe
        // must report `false`.
        assert!(!tcp_alive(&loopback(1)));
    }
}