mati_core/mcp/metadata.rs
1//! Daemon metadata — PID file, session UUID, and Unix permission hardening.
2//!
3//! The on-disk file is `~/.mati/<slug>/mati.pid`. Its internal representation
4//! is [`DaemonMetadata`], which carries the daemon PID and a session UUID.
5//!
6//! ## Atomic publication
7//!
8//! Metadata is published atomically: write to `mati.pid.tmp`, set mode 0600,
9//! then rename over `mati.pid`. This eliminates the window where a reader sees
10//! a partially-written file.
11//!
12//! ## Permission model (Unix-only)
13//!
14//! - Runtime dir (`~/.mati/<slug>/`): mode 0700
15//! - Metadata file (`mati.pid`): mode 0600
16//! - Socket file (`mati.sock`): mode 0600 (set after bind)
17//!
18//! ## Stale-socket cleanup
19//!
20//! On startup, the daemon checks for an existing socket+metadata. If the
21//! recorded PID is dead, the files are removed. If the PID is alive, startup
22//! is refused. The socket is never blindly unlinked.
23
24use std::path::Path;
25
26use anyhow::{Context, Result};
27use serde::{Deserialize, Serialize};
28use uuid::Uuid;
29
30/// Owner identity — who created this daemon socket.
31///
32/// Used by `mati daemon stop` to refuse killing an MCP server session,
33/// and by proxy mode to determine whether to connect.
34#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
35#[serde(rename_all = "snake_case")]
36pub enum DaemonOwner {
37 /// Started via `mati daemon start`.
38 Daemon,
39 /// Started via `mati serve` (MCP stdio server with embedded socket).
40 Mcp,
41}
42
43impl std::fmt::Display for DaemonOwner {
44 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45 match self {
46 Self::Daemon => write!(f, "daemon"),
47 Self::Mcp => write!(f, "mcp"),
48 }
49 }
50}
51
52/// On-disk daemon metadata. Persisted as `mati.pid`, read by the CLI proxy
53/// and hook scripts to route through the daemon socket.
54///
55/// The session UUID is a session marker for audit/provenance — NOT an
56/// authentication token. Peer identity is established via Unix peer
57/// credentials (`peer_cred()`).
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct DaemonMetadata {
60 /// PID of the daemon process.
61 pub pid: u32,
62 /// Session UUID — included in every IPC request for audit correlation.
63 /// Generated fresh on each daemon startup.
64 pub session: Uuid,
65 /// Who started this daemon (daemon vs mcp server).
66 pub owner: DaemonOwner,
67}
68
69impl DaemonMetadata {
70 /// Create metadata for the current process.
71 pub fn new(owner: DaemonOwner) -> Self {
72 Self {
73 pid: std::process::id(),
74 session: Uuid::new_v4(),
75 owner,
76 }
77 }
78}
79
80// ── File paths ──────────────────────────────────────────────────────────────
81
/// File name of the published daemon metadata inside the runtime dir.
const METADATA_FILENAME: &str = "mati.pid";
/// File name of the staging file that gets renamed over the metadata file.
const METADATA_TMP_FILENAME: &str = "mati.pid.tmp";
/// File name of the daemon's Unix socket inside the runtime dir.
const SOCKET_FILENAME: &str = "mati.sock";

/// Location of the metadata file (`mati.pid`) under the mati root `root`.
///
/// Crate-internal: `mcp::server` uses it for rollback-on-bind-fail in the
/// daemon-socket task. Code outside the crate should go through
/// `read_metadata` / `publish_metadata` instead of building paths itself.
pub(crate) fn metadata_path(root: &Path) -> std::path::PathBuf {
    let mut location = root.to_path_buf();
    location.push(METADATA_FILENAME);
    location
}

/// Location of the socket file (`mati.sock`) under the mati root `root`.
pub fn socket_path(root: &Path) -> std::path::PathBuf {
    let mut location = root.to_path_buf();
    location.push(SOCKET_FILENAME);
    location
}
99
100// ── Permission hardening (Unix-only) ────────────────────────────────────────
101
102/// Ensure the runtime directory exists with mode 0700.
103///
104/// Creates `~/.mati/<slug>/` if absent. Always re-applies 0700 in case a
105/// previous run or manual change left weaker permissions.
106pub fn ensure_runtime_dir(root: &Path) -> Result<()> {
107 std::fs::create_dir_all(root)
108 .with_context(|| format!("cannot create runtime dir at {}", root.display()))?;
109 set_mode(root, 0o700).with_context(|| format!("cannot set mode 0700 on {}", root.display()))?;
110 Ok(())
111}
112
113/// Set mode 0600 on the socket file after `UnixListener::bind()`.
114///
115/// `bind()` creates the socket with permissions derived from the process umask.
116/// This call tightens them to owner-only regardless of umask.
117pub fn harden_socket(sock_path: &Path) -> Result<()> {
118 set_mode(sock_path, 0o600)
119 .with_context(|| format!("cannot set mode 0600 on {}", sock_path.display()))
120}
121
122/// Set Unix file mode. No-op on non-Unix (compile-gated).
123#[cfg(unix)]
124fn set_mode(path: &Path, mode: u32) -> Result<()> {
125 use std::os::unix::fs::PermissionsExt;
126 let perms = std::fs::Permissions::from_mode(mode);
127 std::fs::set_permissions(path, perms)?;
128 Ok(())
129}
130
131#[cfg(not(unix))]
132fn set_mode(_path: &Path, _mode: u32) -> Result<()> {
133 Ok(())
134}
135
136// ── Atomic metadata publication ─────────────────────────────────────────────
137
138/// Atomically publish daemon metadata to `mati.pid`.
139///
140/// Writes to `mati.pid.tmp` with mode 0600, then renames over `mati.pid`.
141/// The rename is atomic on Unix when both paths are on the same filesystem
142/// (always true within `~/.mati/<slug>/`).
143pub fn publish_metadata(root: &Path, metadata: &DaemonMetadata) -> Result<()> {
144 let tmp_path = root.join(METADATA_TMP_FILENAME);
145 let final_path = metadata_path(root);
146
147 let json = serde_json::to_string(metadata).context("failed to serialize daemon metadata")?;
148
149 std::fs::write(&tmp_path, json.as_bytes())
150 .with_context(|| format!("failed to write {}", tmp_path.display()))?;
151
152 // Set permissions BEFORE rename so the file is never visible with wrong mode.
153 set_mode(&tmp_path, 0o600)?;
154
155 std::fs::rename(&tmp_path, &final_path).with_context(|| {
156 format!(
157 "failed to rename {} → {}",
158 tmp_path.display(),
159 final_path.display()
160 )
161 })?;
162
163 Ok(())
164}
165
166// ── Metadata reading ────────────────────────────────────────────────────────
167
168/// Read daemon metadata from `mati.pid`.
169///
170/// Returns `None` if the file does not exist or cannot be parsed.
171/// Supports the v2 JSON format `{"pid":N,"session":"uuid","owner":"daemon"}`.
172/// Falls back to the legacy v1 formats for backward compatibility during
173/// the migration window.
174pub fn read_metadata(root: &Path) -> Option<DaemonMetadata> {
175 let content = std::fs::read_to_string(metadata_path(root)).ok()?;
176 let trimmed = content.trim();
177
178 // Try v2 format first (full DaemonMetadata).
179 if let Ok(meta) = serde_json::from_str::<DaemonMetadata>(trimmed) {
180 return Some(meta);
181 }
182
183 // Legacy plain PID format: "1234" — try before generic JSON parse
184 // so a bare number is not consumed by serde_json::Value.
185 if let Ok(pid) = trimmed.parse::<u32>() {
186 return Some(DaemonMetadata {
187 pid,
188 session: Uuid::nil(),
189 owner: DaemonOwner::Daemon,
190 });
191 }
192
193 // Legacy v1 JSON: {"pid":N,"owner":"daemon"|"mcp"} — no session field.
194 if let Ok(val) = serde_json::from_str::<serde_json::Value>(trimmed) {
195 let pid = val.get("pid").and_then(|v| v.as_u64())? as u32;
196 let owner_str = val
197 .get("owner")
198 .and_then(|v| v.as_str())
199 .unwrap_or("daemon");
200 let owner = match owner_str {
201 "mcp" => DaemonOwner::Mcp,
202 _ => DaemonOwner::Daemon,
203 };
204 return Some(DaemonMetadata {
205 pid,
206 // Legacy metadata has no session — generate one so callers always
207 // have a UUID. The daemon will reject requests with this UUID
208 // (SessionMismatch), forcing the proxy to re-read after daemon restart.
209 session: Uuid::nil(),
210 owner,
211 });
212 }
213
214 None
215}
216
217// ── PID liveness ────────────────────────────────────────────────────────────
218
219/// Check whether a PID is still alive.
220///
221/// Uses `kill(pid, 0)` which checks existence without sending a signal.
222/// Returns true if the process exists (even if owned by another user — EPERM).
223#[cfg(unix)]
224pub fn is_pid_alive(pid: u32) -> bool {
225 // SAFETY: kill(pid, 0) is a standard POSIX liveness check. It sends no
226 // signal — it only tests whether the PID exists and is reachable.
227 let ret = unsafe { libc::kill(pid as libc::pid_t, 0) };
228 if ret == 0 {
229 return true;
230 }
231 // EPERM means the process exists but belongs to another user — still alive.
232 std::io::Error::last_os_error().raw_os_error() == Some(libc::EPERM)
233}
234
235#[cfg(not(unix))]
236pub fn is_pid_alive(_pid: u32) -> bool {
237 true // Conservative: assume alive on non-Unix
238}
239
240/// Returns the effective UID of the current process.
241///
242/// Used by the peer credential check to compare against connecting peers.
243#[cfg(unix)]
244pub fn current_euid() -> u32 {
245 // SAFETY: geteuid() is a pure read with no side effects.
246 unsafe { libc::geteuid() }
247}
248
249#[cfg(not(unix))]
250pub fn current_euid() -> u32 {
251 0
252}
253
/// Human-readable name of the calling thread's QoS class.
///
/// Recorded in `serve_start` lifecycle events so that a silent failure of
/// `pthread_set_qos_class_self_np` shows up in `mati doctor` output before
/// the kernel-panic symptoms recur. Values not in the table map to
/// `"unknown"`.
#[cfg(target_os = "macos")]
pub fn current_qos_class_str() -> &'static str {
    extern "C" {
        fn qos_class_self() -> libc::c_uint;
    }
    /// Raw QoS class value paired with its label.
    const LABELS: [(libc::c_uint, &str); 5] = [
        (0x21, "user_interactive"),
        (0x19, "user_initiated"),
        (0x15, "default"),
        (0x11, "utility"),
        (0x09, "background"),
    ];
    // SAFETY: qos_class_self() is a pure read; it queries the current thread's
    // QoS class from the kernel without any side effects.
    let raw = unsafe { qos_class_self() };
    LABELS
        .iter()
        .find(|(code, _)| *code == raw)
        .map_or("unknown", |(_, label)| *label)
}

/// Stand-in for platforms other than macOS: always reports `"n/a"`.
#[cfg(not(target_os = "macos"))]
pub fn current_qos_class_str() -> &'static str {
    "n/a"
}
280
281// ── SIGTERM / SIGKILL escalation ────────────────────────────────────────────
282
/// Polling budget for `is_pid_alive` after SIGKILL has been sent; once it is
/// exhausted the process is declared [`KillOutcome::Stuck`].
///
/// How the value evolved, and why:
///
/// - **500ms** (pre-γ): enough for lightly-loaded shutdowns, where the kernel
///   reaped the process within the first tens of milliseconds.
/// - **2s** (γ-C7 followup `04ef6e2`): covered a slow-shutdown daemon that
///   exited at ~600ms.
/// - **5s** (γ-C7-followup-2, current): smoke evidence from
///   `mati_step_27_stop.out` together with the `serve_shutdown signal_sigterm`
///   lifecycle event shows the daemon's `store.close()` path can be
///   mid-fsync when SIGKILL arrives. The kernel MUST let the uninterruptible
///   fsync finish before it fully tears the process down, and `kill(pid, 0)`
///   keeps reporting alive in the meantime. Under smoke load (tantivy index
///   commit + dual SurrealKV WAL fsync) that teardown can legitimately take
///   2-4 seconds. 5s leaves headroom without unbounded patience — a
///   genuinely-wedged daemon still surfaces as Stuck within 25s total
///   (20s SIGTERM + 5s SIGKILL).
const SIGKILL_REAP_WINDOW: std::time::Duration = std::time::Duration::from_millis(5_000);
303
304/// Outcome of [`kill_and_wait`]. Carries elapsed wall time so callers can
305/// report or log exactly how the kill resolved.
306#[derive(Debug)]
307pub enum KillOutcome {
308 /// Process exited within the SIGTERM budget.
309 ExitedClean(std::time::Duration),
310 /// SIGTERM was ignored or absorbed; SIGKILL succeeded.
311 KilledHard(std::time::Duration),
312 /// Process is still alive after SIGKILL — manual intervention required.
313 /// Carries a [`StuckDiagnostic`] so callers can surface the actual
314 /// process state at the moment we gave up. γ smoke surfaced cases
315 /// where the daemon was effectively gone (lock released, next CLI
316 /// command worked) but our `kill(0)` poll kept reporting alive; the
317 /// diagnostic snapshot lets us distinguish kill(0)-lying-after-SIGKILL,
318 /// zombie state, PID reuse (different process at that PID now), and
319 /// genuinely-still-alive cases on the next failure.
320 Stuck(StuckDiagnostic),
321}
322
323/// Diagnostic data captured at the moment [`KillOutcome::Stuck`] is
324/// returned. Includes timing for each phase and a `ps`-driven snapshot
325/// of the process state at both the start of the kill and the giving-up
326/// point — enabling root-cause analysis without re-running the failure.
327#[derive(Debug, Clone)]
328pub struct StuckDiagnostic {
329 pub pid: u32,
330 /// Elapsed wall time from [`kill_and_wait`] / [`kill_directly`] entry.
331 pub total_elapsed_ms: u64,
332 /// Time spent in the SIGTERM phase. `None` if [`kill_directly`] was
333 /// used (no SIGTERM phase).
334 pub sigterm_elapsed_ms: Option<u64>,
335 /// Time spent polling after SIGKILL.
336 pub sigkill_elapsed_ms: u64,
337 /// Process state when the kill started (via `ps -o ...`).
338 pub initial_snapshot: PidSnapshot,
339 /// Process state when we gave up (via `ps -o ...`).
340 pub final_snapshot: PidSnapshot,
341}
342
/// What `ps -o` reported about a PID at one instant. [`StuckDiagnostic`]
/// stores two of these to explain why `kill_and_wait` gave up.
///
/// On the failure path, `kill(pid, 0)`'s "still alive" answer is
/// cross-checked against these indicators:
///
/// - **`lstart` differs between the initial and final snapshot** → the PID
///   was reused: the kernel reaped the old process and handed the PID to a
///   new spawn.
/// - **`state` is `Z`** → the process is a zombie waiting for its parent to
///   reap it. `kill(0)` succeeds because the proc entry exists, but the
///   process holds no resources.
/// - **every field is `None`** → `ps` says the PID is gone while `kill(0)`
///   still says alive: macOS kernel proc-table lag (the proc structure is not
///   fully torn down although the process has exited).
/// - **same `lstart`, ordinary `state`** → the process really is still alive.
///   This is the true Stuck case — daemon shutdown is wedged.
#[derive(Clone, Debug, Default)]
pub struct PidSnapshot {
    /// Start time from `ps -o lstart=`; `None` when ps cannot find the PID.
    pub lstart: Option<String>,
    /// Process state: 'R' running, 'S' sleeping, 'Z' zombie, etc.
    pub state: Option<String>,
    /// Command name from `ps -o comm=`.
    pub comm: Option<String>,
}

impl PidSnapshot {
    /// Format the snapshot as one compact line for lifecycle events and
    /// stderr.
    ///
    /// Yields `ps:gone` when every field is `None`; otherwise each field is
    /// printed in quoted (`{:?}`) form, with `?` standing in for a missing one.
    pub fn render(&self) -> String {
        /// The field's text, or `?` when ps did not report it.
        fn shown(value: &Option<String>) -> &str {
            value.as_deref().unwrap_or("?")
        }

        if self.lstart.is_none() && self.state.is_none() && self.comm.is_none() {
            return String::from("ps:gone");
        }
        format!(
            "lstart={:?} state={:?} comm={:?}",
            shown(&self.lstart),
            shown(&self.state),
            shown(&self.comm)
        )
    }
}
386
/// Ask `ps` for a single field of `pid` (`ps -o <field>= -p <pid>`).
///
/// Returns the trimmed output, or `None` when `ps` cannot be run, exits
/// unsuccessfully (e.g. the PID is gone), or prints nothing but whitespace.
fn ps_field(pid: u32, field: &str) -> Option<String> {
    let selector = format!("{field}=");
    let mut command = std::process::Command::new("ps");
    command
        .arg("-o")
        .arg(&selector)
        .arg("-p")
        .arg(pid.to_string());

    let result = command.output().ok()?;
    if !result.status.success() {
        return None;
    }

    let text = String::from_utf8_lossy(&result.stdout);
    let value = text.trim();
    (!value.is_empty()).then(|| value.to_string())
}
405
406/// Capture a `PidSnapshot` via three `ps` calls (lstart, state, comm).
407/// Each call is ~10ms on macOS; total ~30ms. Only invoked on the Stuck
408/// path so the cost doesn't touch the hot path.
409pub fn snapshot_pid(pid: u32) -> PidSnapshot {
410 PidSnapshot {
411 lstart: ps_field(pid, "lstart"),
412 state: ps_field(pid, "state"),
413 comm: ps_field(pid, "comm"),
414 }
415}
416
417/// Send SIGTERM to `pid`. Returns `true` on success or when the kernel
418/// reports the process is already gone (`ESRCH`). `kill(2)` returning
419/// any other error counts as failure — caller surfaces it to the user.
420#[cfg(unix)]
421fn send_sigterm(pid: u32) -> bool {
422 // SAFETY: `kill(pid, SIGTERM)` is a standard POSIX system call. The
423 // worst case is an ESRCH return — we treat that as success because
424 // the contract is "stop this process" and a nonexistent process is
425 // already stopped.
426 let ret = unsafe { libc::kill(pid as libc::pid_t, libc::SIGTERM) };
427 if ret == 0 {
428 return true;
429 }
430 let errno = std::io::Error::last_os_error().raw_os_error();
431 matches!(errno, Some(libc::ESRCH))
432}
433
434#[cfg(not(unix))]
435fn send_sigterm(_pid: u32) -> bool {
436 false
437}
438
439/// Send SIGKILL to `pid` and poll for exit. γ-C6: used by
440/// `mati daemon stop --force` to bypass the SIGTERM grace period and
441/// terminate the daemon immediately. The reaping window matches the
442/// SIGKILL escalation phase of [`kill_and_wait`] — see
443/// `SIGKILL_REAP_WINDOW` for the rationale.
444pub async fn kill_directly(pid: u32) -> KillOutcome {
445 let started = std::time::Instant::now();
446 let initial_snapshot = snapshot_pid(pid);
447 #[cfg(unix)]
448 {
449 // SAFETY: SIGKILL is non-catchable; the process either exits or
450 // we surface Stuck. `kill(2)` is a standard system call.
451 let ret = unsafe { libc::kill(pid as libc::pid_t, libc::SIGKILL) };
452 if ret != 0 {
453 let errno = std::io::Error::last_os_error().raw_os_error();
454 if !matches!(errno, Some(libc::ESRCH)) {
455 tracing::warn!(pid, ?errno, "kill_directly: SIGKILL rejected by kernel");
456 let elapsed_ms = started.elapsed().as_millis() as u64;
457 return KillOutcome::Stuck(StuckDiagnostic {
458 pid,
459 total_elapsed_ms: elapsed_ms,
460 sigterm_elapsed_ms: None,
461 sigkill_elapsed_ms: elapsed_ms,
462 initial_snapshot,
463 final_snapshot: snapshot_pid(pid),
464 });
465 }
466 // ESRCH — already gone, treat as success.
467 return KillOutcome::KilledHard(started.elapsed());
468 }
469 }
470
471 let sigkill_start = std::time::Instant::now();
472 if poll_until_exit(pid, SIGKILL_REAP_WINDOW, started).await {
473 return KillOutcome::KilledHard(started.elapsed());
474 }
475 let sigkill_elapsed_ms = sigkill_start.elapsed().as_millis() as u64;
476 KillOutcome::Stuck(StuckDiagnostic {
477 pid,
478 total_elapsed_ms: started.elapsed().as_millis() as u64,
479 sigterm_elapsed_ms: None,
480 sigkill_elapsed_ms,
481 initial_snapshot,
482 final_snapshot: snapshot_pid(pid),
483 })
484}
485
486/// Send SIGTERM to `pid`, wait up to `timeout` for the process to exit, and
487/// escalate to SIGKILL with `SIGKILL_REAP_WINDOW` of reaping budget if it
488/// does not.
489///
490/// Used by both `mati daemon stop` and the unresponsive-recovery branch of
491/// `ensure_daemon` so the synchronous-exit guarantee is identical across
492/// both paths. Pre-condition: caller has authorized the kill (`--force`
493/// gate, ownership check) and knows the PID is alive.
494pub async fn kill_and_wait(pid: u32, timeout: std::time::Duration) -> KillOutcome {
495 let started = std::time::Instant::now();
496 let initial_snapshot = snapshot_pid(pid);
497
498 if !send_sigterm(pid) {
499 tracing::warn!(pid, "kill_and_wait: SIGTERM rejected by kernel");
500 let elapsed_ms = started.elapsed().as_millis() as u64;
501 return KillOutcome::Stuck(StuckDiagnostic {
502 pid,
503 total_elapsed_ms: elapsed_ms,
504 sigterm_elapsed_ms: Some(elapsed_ms),
505 sigkill_elapsed_ms: 0,
506 initial_snapshot,
507 final_snapshot: snapshot_pid(pid),
508 });
509 }
510
511 let sigterm_start = std::time::Instant::now();
512 if poll_until_exit(pid, timeout, started).await {
513 return KillOutcome::ExitedClean(started.elapsed());
514 }
515 let sigterm_elapsed_ms = sigterm_start.elapsed().as_millis() as u64;
516
517 tracing::warn!(
518 pid,
519 timeout_secs = timeout.as_secs(),
520 "process did not exit within SIGTERM budget — sending SIGKILL"
521 );
522 #[cfg(unix)]
523 {
524 // SAFETY: SIGKILL is non-catchable; the process either exits or
525 // we surface Stuck. `kill(2)` is a standard system call.
526 let _ = unsafe { libc::kill(pid as libc::pid_t, libc::SIGKILL) };
527 }
528
529 let sigkill_start = std::time::Instant::now();
530 if poll_until_exit(pid, SIGKILL_REAP_WINDOW, sigkill_start).await {
531 return KillOutcome::KilledHard(started.elapsed());
532 }
533 let sigkill_elapsed_ms = sigkill_start.elapsed().as_millis() as u64;
534
535 KillOutcome::Stuck(StuckDiagnostic {
536 pid,
537 total_elapsed_ms: started.elapsed().as_millis() as u64,
538 sigterm_elapsed_ms: Some(sigterm_elapsed_ms),
539 sigkill_elapsed_ms,
540 initial_snapshot,
541 final_snapshot: snapshot_pid(pid),
542 })
543}
544
545/// Poll [`is_pid_alive`] until the PID is **effectively gone** or `budget`
546/// elapses (from `started`). Returns `true` if the process is gone or in a
547/// zombie state, `false` if it's still genuinely alive when the budget runs
548/// out.
549///
550/// ## Why zombie detection is needed
551///
552/// γ-C7 followup smoke surfaced a real zombie scenario: when the daemon's
553/// parent process is a `mati serve` proxy (post-γ-C4 architecture), and
554/// the proxy doesn't call `waitpid()` on its children, the daemon process
555/// exits cleanly under SIGKILL but its proc-table entry stays as a zombie
556/// (`<defunct>`, state `'Z'`) until the proxy is killed or exits.
557///
558/// `kill(pid, 0)` continues returning success for zombies — the kernel
559/// considers the proc entry "alive" until reaped. So a pure `kill(0)`
560/// poll loop hangs until the budget expires, returning a false `Stuck`
561/// even though the zombie holds no FDs, no locks, no resources.
562///
563/// This was empirically captured by the `StuckDiagnostic` instrumentation
564/// added in commit `dd5f5a0`: the final snapshot showed
565/// `state="Z" comm="<defunct>"` — the smoking gun.
566///
567/// ## How the zombie check works
568///
569/// Every `ZOMBIE_CHECK_INTERVAL` poll iterations, when `kill(0)` reports
570/// alive, also run `ps -o state= -p <pid>`. If the state starts with `Z`,
571/// the process is a zombie — functionally dead (locks released by the
572/// kernel at exit, no further user-code execution) — and we return `true`.
573///
574/// `ps_field` spawns a subprocess (~10ms), so it's amortized across
575/// every 5 polls (250ms) to keep the per-iteration overhead low. Zombie
576/// state is monotonic — once entered, it never reverts — so sub-second
577/// detection is unnecessary.
578async fn poll_until_exit(
579 pid: u32,
580 budget: std::time::Duration,
581 started: std::time::Instant,
582) -> bool {
583 const POLL_INTERVAL: std::time::Duration = std::time::Duration::from_millis(50);
584 const ZOMBIE_CHECK_INTERVAL: u32 = 5;
585 let deadline = started + budget;
586 let mut iter: u32 = 0;
587 while std::time::Instant::now() < deadline {
588 if !is_pid_alive(pid) {
589 return true;
590 }
591 // Cheap zombie check on a sub-rate to avoid spawning ps on every
592 // 50ms tick. State only ever transitions toward `Z` from running
593 // states (R/S/T), never back.
594 if iter % ZOMBIE_CHECK_INTERVAL == 0 {
595 if let Some(state) = ps_field(pid, "state") {
596 if state.starts_with('Z') {
597 return true;
598 }
599 }
600 }
601 iter = iter.wrapping_add(1);
602 tokio::time::sleep(POLL_INTERVAL).await;
603 }
604 false
605}
606
607// ── Peer credentials ────────────────────────────────────────────────────────
608
/// Identity of the process on the other end of a Unix socket connection.
///
/// Travels with the request through the pipeline into handlers and the audit
/// record.
#[derive(Clone, Debug)]
pub struct PeerContext {
    /// Effective UID of the connecting process.
    pub uid: u32,
    /// PID of the connecting process. Available on Linux and macOS; `None`
    /// on platforms where `peer_cred()` does not expose it.
    pub pid: Option<u32>,
}
619
620/// Verify that a connecting peer has the same effective UID as the daemon.
621///
622/// Returns `Some(PeerContext)` on success, `None` on mismatch or failure.
623/// On `None`, the caller MUST drop the connection and continue the accept
624/// loop — never crash.
625///
626/// This enforces the Unix-socket UID boundary: only processes running as
627/// the same user can talk to the daemon.
628pub fn check_peer_cred(stream: &tokio::net::UnixStream, daemon_euid: u32) -> Option<PeerContext> {
629 match stream.peer_cred() {
630 Ok(cred) => {
631 let peer_uid = cred.uid();
632 if peer_uid != daemon_euid {
633 tracing::warn!(
634 peer_uid,
635 daemon_uid = daemon_euid,
636 "peer UID mismatch — dropping connection"
637 );
638 return None;
639 }
640 let peer_pid = cred.pid().map(|p| p as u32);
641 tracing::trace!(peer_uid, ?peer_pid, "peer credential check passed");
642 Some(PeerContext {
643 uid: peer_uid,
644 pid: peer_pid,
645 })
646 }
647 Err(e) => {
648 tracing::warn!(error = %e, "peer_cred() failed — dropping connection");
649 None
650 }
651 }
652}
653
654// ── Stale-socket cleanup ────────────────────────────────────────────────────
655
656/// Outcome of a stale-socket check.
657#[derive(Debug, PartialEq, Eq)]
658pub enum StaleCheckResult {
659 /// No metadata or socket — safe to proceed with startup.
660 Clean,
661 /// Metadata references a dead PID — stale files cleaned up, safe to proceed.
662 StaleRemoved,
663 /// Metadata references a live PID — daemon is running, refuse startup.
664 LiveDaemon {
665 pid: u32,
666 owner: DaemonOwner,
667 session: Uuid,
668 },
669 /// Metadata is absent but socket file exists — ambiguous state.
670 /// Caller should probe the socket before deciding.
671 OrphanSocket,
672}
673
674/// Check for stale daemon state and clean up if safe.
675///
676/// This implements the safe stale-socket protocol:
677/// 1. Read metadata if present
678/// 2. Test PID liveness
679/// 3. If live daemon exists, return `LiveDaemon` (refuse startup)
680/// 4. Only remove stale socket+metadata when PID is dead
681///
682/// The socket is NEVER blindly unlinked.
683pub fn check_and_cleanup_stale(root: &Path) -> StaleCheckResult {
684 let meta_path = metadata_path(root);
685 let sock_path = socket_path(root);
686
687 let has_metadata = meta_path.exists();
688 let has_socket = sock_path.exists();
689
690 if !has_metadata && !has_socket {
691 return StaleCheckResult::Clean;
692 }
693
694 // Socket exists but no metadata — ambiguous. Caller must probe.
695 if !has_metadata && has_socket {
696 return StaleCheckResult::OrphanSocket;
697 }
698
699 // Metadata exists — parse and check PID liveness.
700 let metadata = match read_metadata(root) {
701 Some(m) => m,
702 None => {
703 // Metadata file exists but is corrupt/unreadable.
704 // Treat as stale: remove both files.
705 tracing::warn!("daemon metadata corrupt — removing stale files");
706 let _ = std::fs::remove_file(&meta_path);
707 let _ = std::fs::remove_file(&sock_path);
708 return StaleCheckResult::StaleRemoved;
709 }
710 };
711
712 if is_pid_alive(metadata.pid) {
713 return StaleCheckResult::LiveDaemon {
714 pid: metadata.pid,
715 owner: metadata.owner,
716 session: metadata.session,
717 };
718 }
719
720 // PID is dead — clean up stale files.
721 tracing::info!(
722 pid = metadata.pid,
723 owner = %metadata.owner,
724 "removing stale daemon files (PID dead)"
725 );
726 let _ = std::fs::remove_file(&sock_path);
727 let _ = std::fs::remove_file(&meta_path);
728 // Also remove the starting sentinel if present.
729 let _ = std::fs::remove_file(root.join("mati.starting"));
730
731 StaleCheckResult::StaleRemoved
732}
733
734// ── Lifecycle log ───────────────────────────────────────────────────────────
735
/// File name of the lifecycle log inside the runtime dir.
const LIFECYCLE_FILENAME: &str = "lifecycle.log";

/// Maximum number of lines retained in `lifecycle.log`. Trimmed at
/// `install_panic_hook` time (single-writer window: we hold the kernel
/// flock, so no concurrent daemon can race the rotation). At ~150 bytes
/// per line, 10k lines ≈ 1.5 MB — enough to retain a year of normal
/// lifecycle events while bounding growth in pathological respawn loops.
const MAX_LIFECYCLE_LINES: usize = 10_000;

/// Hard ceiling on the byte size of `lifecycle.log` we will read into
/// memory at startup. The legitimate cap (10k lines × ~150 B ≈ 1.5 MB)
/// fits comfortably inside this; the ceiling exists only to prevent
/// startup OOM if an external process or buggy actor wrote pathological
/// content into the log (e.g. a 4 GB file of garbage). Above this size,
/// the trim path nukes the file rather than reading it. Lifecycle events
/// are best-effort observability — losing them on extreme corruption is
/// strictly preferable to refusing to start the daemon (P9: graceful
/// degradation, never block Claude on a mati outage).
const LIFECYCLE_TRIM_MAX_READ_BYTES: u64 = 64 * 1024 * 1024;

/// Best-effort one-time trim of `lifecycle.log` to its last `max_lines` lines.
///
/// Uses tmp+rename for atomic replacement so a crash during rotation
/// leaves either the old log or the new log on disk, never a partial
/// truncation. Errors are silently ignored — log rotation must never
/// block startup. If the rename fails, the temporary file is removed again
/// (best-effort) so it does not accumulate.
///
/// The log is processed as raw bytes split on `\n`, so a stray non-UTF-8
/// byte (torn write, external corruption) cannot disable trimming and let
/// the file grow without bound. Kept lines are copied through byte-for-byte;
/// a final line without a terminator gets a trailing `\n`.
///
/// Hard size guard: if the on-disk file exceeds
/// `LIFECYCLE_TRIM_MAX_READ_BYTES`, the file is truncated to empty
/// without being read. This protects startup from OOM on a pathological
/// log (P9). The legitimate cap is ~1.5 MB so the threshold is not hit
/// under any normal operation.
fn trim_lifecycle_log(root: &Path, max_lines: usize) {
    let path = root.join(LIFECYCLE_FILENAME);

    // Size guard: refuse to read pathological files into memory. Truncate
    // to empty and continue. Best-effort — if `metadata` or `write` fails,
    // we just return; startup must not block on log rotation.
    if let Ok(meta) = std::fs::metadata(&path) {
        if meta.is_file() && meta.len() > LIFECYCLE_TRIM_MAX_READ_BYTES {
            let _ = std::fs::write(&path, b"");
            return;
        }
    }

    let content = match std::fs::read(&path) {
        Ok(bytes) => bytes,
        Err(_) => return, // log doesn't exist yet, or can't be read
    };

    // One line per `\n`, plus one for a non-empty unterminated tail. A
    // trailing newline does not open an extra (empty) line.
    let newline_count = content.iter().filter(|&&b| b == b'\n').count();
    let has_unterminated_tail = content.last().map_or(false, |&b| b != b'\n');
    let line_count = newline_count + usize::from(has_unterminated_tail);
    if line_count <= max_lines {
        return;
    }

    // Drop the first `skip` lines: the kept region starts right after the
    // `skip`-th newline (or at EOF when everything is dropped).
    let skip = line_count - max_lines;
    let keep_from = content
        .iter()
        .enumerate()
        .filter(|&(_, &b)| b == b'\n')
        .map(|(idx, _)| idx + 1)
        .nth(skip - 1)
        .unwrap_or(content.len());
    let mut kept = content[keep_from..].to_vec();
    if kept.last().map_or(false, |&b| b != b'\n') {
        kept.push(b'\n');
    }

    // Atomic replace.
    let tmp = path.with_extension("log.tmp");
    if std::fs::write(&tmp, &kept).is_err() {
        let _ = std::fs::remove_file(&tmp);
        return;
    }
    if std::fs::rename(&tmp, &path).is_err() {
        let _ = std::fs::remove_file(&tmp);
    }
}
799
/// Upper bound, in bytes, on one `lifecycle.log` line.
///
/// The bound is chosen to stay under `PIPE_BUF` (4096 bytes on Linux, ≥512
/// on every conformant system) so that a whole line goes out as one small
/// `write(2)` on an `O_APPEND` file. Larger writes from two concurrent
/// appenders can interleave mid-line, producing torn records that confuse
/// `lines()` consumers and the trim path.
///
/// NOTE(review): POSIX states the `PIPE_BUF` non-interleaving guarantee for
/// pipes and FIFOs; for regular files it is used here as a conservative
/// size bound rather than a formal guarantee — confirm against the target
/// filesystems.
///
/// Several writers can hit this file at once: any running daemon instance,
/// the panic hook firing in a background thread, and sibling-process startup
/// logging during stale cleanup. A pathological panic payload (large
/// `Debug`-formatted struct, JSON dump of a serde error) can easily exceed
/// 4 KB and tear the log.
///
/// 3900 bytes leaves room for the `{ts}\t{pid}\t{event}\t` prefix (well
/// under 100 bytes in practice) and the trailing `\n`, while staying safely
/// below PIPE_BUF.
const LIFECYCLE_MAX_LINE_BYTES: usize = 3_900;
818
819/// Append a single event to `~/.mati/<slug>/lifecycle.log`.
820///
821/// Format: `unix_ts<TAB>pid<TAB>event<TAB>detail<NL>`. Newlines and tabs in
822/// `detail` are replaced with spaces so each event remains exactly one line.
823/// Lines exceeding `LIFECYCLE_MAX_LINE_BYTES` are truncated at a UTF-8 char
824/// boundary so concurrent appenders never produce torn records.
825///
826/// Best-effort — every failure path is silenced. Lifecycle logging must
827/// never block startup, shutdown, or panic paths.
828pub fn record_lifecycle_event(root: &Path, event: &str, detail: &str) {
829 use std::io::Write;
830 let path = root.join(LIFECYCLE_FILENAME);
831 let ts = std::time::SystemTime::now()
832 .duration_since(std::time::UNIX_EPOCH)
833 .unwrap_or_default()
834 .as_secs();
835 let pid = std::process::id();
836 let safe_detail: String = detail
837 .chars()
838 .map(|c| match c {
839 '\t' | '\n' | '\r' => ' ',
840 c => c,
841 })
842 .collect();
843 let mut line = format!("{ts}\t{pid}\t{event}\t{safe_detail}\n");
844 if line.len() > LIFECYCLE_MAX_LINE_BYTES {
845 // Reserve one byte for the trailing '\n' we re-add below. Walk back
846 // to the nearest UTF-8 char boundary so we never split a multibyte
847 // character — a torn UTF-8 sequence would corrupt `read_to_string`
848 // consumers. UTF-8 chars are ≤4 bytes, so this loop runs at most
849 // 3 iterations. Equivalent to `floor_char_boundary` (stable in
850 // 1.91) but works on the project's MSRV (1.82).
851 let mut cut = LIFECYCLE_MAX_LINE_BYTES - 1;
852 while cut > 0 && !line.is_char_boundary(cut) {
853 cut -= 1;
854 }
855 line.truncate(cut);
856 line.push('\n');
857 }
858 // Use the pre-opened fd when it matches this exact log path — avoids
859 // open(2) in the panic hook where VFS stalls are possible under memory
860 // pressure on macOS. Fall back to open-by-path for any other root
861 // (including test callers with arbitrary temp dirs).
862 //
863 // No mutex is needed: `O_APPEND` + line ≤ PIPE_BUF makes `write(2)`
864 // atomic at the kernel level, so concurrent emitters can share the fd
865 // without user-space locking. `<&File as Write>::write_all` lets us emit
866 // through a shared reference.
867 let used_preopen = if let Some(pre) = LIFECYCLE_LOG_FILE.get() {
868 if pre.path == path {
869 let _ = (&pre.file).write_all(line.as_bytes());
870 true
871 } else {
872 false
873 }
874 } else {
875 false
876 };
877
878 if !used_preopen {
879 if let Ok(mut f) = std::fs::OpenOptions::new()
880 .create(true)
881 .append(true)
882 .open(&path)
883 {
884 let _ = f.write_all(line.as_bytes());
885 }
886 }
887}
888
889// ── Panic hook ──────────────────────────────────────────────────────────────
890
/// Cached daemon root used by the panic hook to clean up sock + pid files.
/// Set by [`install_panic_hook`]; never overwritten.
///
/// The hook closure reads this cell on every panic and does nothing beyond
/// delegating to the previous hook when it is unset. Because a `OnceLock`
/// can be set only once, it also serves as the "already installed" guard in
/// `install_panic_hook`.
static PANIC_HOOK_ROOT: std::sync::OnceLock<std::path::PathBuf> = std::sync::OnceLock::new();
894
/// Pre-opened lifecycle log file handle, paired with its canonical path
/// and a pre-formatted pid prefix.
///
/// Opened at `install_panic_hook` time so the panic hook can call `write(2)`
/// directly instead of `open(2)`. On macOS under memory pressure, `open(2)`
/// can stall waiting for VFS resources; a pre-opened fd avoids that window.
///
/// `record_lifecycle_event` uses this handle only when the requested path
/// matches, so test callers with arbitrary temp dirs always open by path.
///
/// **No `Mutex` around the file.** The fd is opened with `O_APPEND` and every
/// emitted line is capped below `PIPE_BUF` (`LIFECYCLE_MAX_LINE_BYTES = 3900`),
/// so the kernel guarantees `write(2)` calls are atomic w.r.t. concurrent
/// appenders — both intra-process and cross-process. We use
/// `<&File as std::io::Write>::write_all` to emit through a shared reference.
/// Dropping the user-space mutex also removes a deadlock hazard on the panic
/// path (a thread holding the mutex while panicking would self-deadlock when
/// the hook tried to relock it).
///
/// `pid_prefix` is the bytes of `"<pid>\t"` formatted once at install time so
/// the no-alloc panic path can copy it into a stack buffer without calling
/// `format!`.
struct PreOpenedLog {
    /// Full path of the log file this handle was opened for
    /// (`root.join(LIFECYCLE_FILENAME)` at install time). Writers compare
    /// against it to decide whether the handle may be used.
    path: std::path::PathBuf,
    /// Append-mode handle to the log, created if missing at install time.
    file: std::fs::File,
    /// ASCII decimal pid followed by a single tab.
    pid_prefix: Vec<u8>,
}
922
/// Process-wide slot for the pre-opened lifecycle log. Filled at most once by
/// `install_panic_hook`, and only when opening the log succeeded; stays empty
/// otherwise, in which case writers open the log by path.
static LIFECYCLE_LOG_FILE: std::sync::OnceLock<PreOpenedLog> = std::sync::OnceLock::new();
924
925/// Test/diagnostic helper: returns `true` if `install_panic_hook` has run and
926/// successfully pre-opened the lifecycle log fd. Integration tests use this
927/// to assert the panic hook is wired up; it is `#[doc(hidden)]` to discourage
928/// production callers from depending on the pre-open state.
929#[doc(hidden)]
930pub fn is_lifecycle_log_preopened() -> bool {
931 LIFECYCLE_LOG_FILE.get().is_some()
932}
933
934// ── No-alloc panic write path ───────────────────────────────────────────────
935//
936// The panic hook may run with a corrupted allocator (e.g., panic-on-OOM,
937// allocator state poisoned by the bug being reported). Heap allocations on
938// the panic path can hang or abort the runtime before the lifecycle event is
939// recorded. The functions below let the hook emit a lifecycle line with zero
940// heap allocations: timestamp formatted into a stack buffer via
941// `u64_to_decimal_bytes`, pid pre-formatted at install time, detail strings
942// sanitized in place, and the line written directly through the pre-opened
943// fd via `<&File as Write>::write_all`.
944//
945// This is best-effort. If `LIFECYCLE_LOG_FILE` is unset (install_panic_hook
946// never ran, or the open(2) at install time failed), the no-alloc writer
947// returns false and the caller falls back to the heap path.
948
/// Render `n` as ASCII decimal digits at the front of `out` and return how
/// many bytes were written. Uses only stack storage.
///
/// A `u64` needs at most 20 digits, so callers should pass a buffer of at
/// least 20 bytes. When `out` is shorter than the number, only the leading
/// (most significant) digits that fit are written.
fn u64_to_decimal_bytes(mut n: u64, out: &mut [u8]) -> usize {
    // Fill a scratch buffer from the right so the digits end up in reading
    // order without a separate reversal pass.
    let mut scratch = [0u8; 20];
    let mut start = scratch.len();
    loop {
        start -= 1;
        scratch[start] = b'0' + (n % 10) as u8;
        n /= 10;
        if n == 0 {
            break;
        }
    }
    let digits = &scratch[start..];
    let written = digits.len().min(out.len());
    out[..written].copy_from_slice(&digits[..written]);
    written
}
974
975/// Build a lifecycle log line into `out` with no heap allocations. Returns
976/// the number of bytes written (always ≤ `LIFECYCLE_MAX_LINE_BYTES`).
977///
978/// Mirrors the heap path's format: `{ts}\t{pid}\t{event}\t{detail}\n`, where
979/// `detail` is `detail_parts` joined by single spaces. Bytes from
980/// `detail_parts` matching `\t \n \r` are replaced with space (same
981/// sanitization as the heap path's `safe_detail`).
982///
983/// Truncation rules match the heap path: fill the buffer up to
984/// `LIFECYCLE_MAX_LINE_BYTES - 1`, walk back to the most recent UTF-8 char
985/// boundary if a truncation would split a multibyte character, then append
986/// the trailing `\n`. Each `&str` part is itself valid UTF-8, so we use
987/// `str::is_char_boundary` per-part rather than scanning the whole buffer.
988fn write_lifecycle_line(
989 out: &mut [u8; LIFECYCLE_MAX_LINE_BYTES],
990 ts: u64,
991 pid_prefix: &[u8],
992 event: &str,
993 detail_parts: &[&str],
994) -> usize {
995 // Reserve the final byte for the trailing newline.
996 let cap = LIFECYCLE_MAX_LINE_BYTES - 1;
997 let mut pos: usize = 0;
998
999 // Copy raw bytes (no sanitization) up to `cap`.
1000 fn push_raw(out: &mut [u8], pos: &mut usize, src: &[u8], cap: usize) {
1001 let remaining = cap.saturating_sub(*pos);
1002 let n = src.len().min(remaining);
1003 out[*pos..*pos + n].copy_from_slice(&src[..n]);
1004 *pos += n;
1005 }
1006
1007 // ts (decimal ASCII, stack-only).
1008 let mut ts_buf = [0u8; 20];
1009 let ts_len = u64_to_decimal_bytes(ts, &mut ts_buf);
1010 push_raw(out, &mut pos, &ts_buf[..ts_len], cap);
1011 push_raw(out, &mut pos, b"\t", cap);
1012
1013 // pid prefix (already includes trailing tab).
1014 push_raw(out, &mut pos, pid_prefix, cap);
1015
1016 // event tag — never sanitized (matches heap path, where the format-string
1017 // separators are real \t and only `safe_detail` is mapped).
1018 push_raw(out, &mut pos, event.as_bytes(), cap);
1019 push_raw(out, &mut pos, b"\t", cap);
1020
1021 // detail_parts joined by single space, sanitized byte-by-byte. We
1022 // sanitize per-part because (a) the join separator is already a space
1023 // and (b) `\t \n \r` are 1-byte ASCII so a byte-level swap preserves
1024 // UTF-8 validity.
1025 for (i, part) in detail_parts.iter().enumerate() {
1026 if i > 0 {
1027 push_raw(out, &mut pos, b" ", cap);
1028 }
1029 let bytes = part.as_bytes();
1030 let remaining = cap.saturating_sub(pos);
1031 let mut take = bytes.len().min(remaining);
1032 // If we'd split a multibyte char, walk back to the previous boundary.
1033 // `bytes` is the byte view of a `&str`, so we can use the str API.
1034 if take < bytes.len() {
1035 while take > 0 && !part.is_char_boundary(take) {
1036 take -= 1;
1037 }
1038 }
1039 for j in 0..take {
1040 out[pos + j] = match bytes[j] {
1041 b'\t' | b'\n' | b'\r' => b' ',
1042 b => b,
1043 };
1044 }
1045 pos += take;
1046 }
1047
1048 // Trailing newline — always fits because `cap = LIFECYCLE_MAX_LINE_BYTES - 1`.
1049 out[pos] = b'\n';
1050 pos + 1
1051}
1052
1053/// No-alloc lifecycle writer used by the panic hook. Returns `false` if the
1054/// pre-opened fd is unavailable, or if the requested `root` does not match
1055/// the root the panic hook was installed for, so the caller can fall back
1056/// to the heap path.
1057///
1058/// Allocation budget: zero. The line is built into a `[u8; LIFECYCLE_MAX_LINE_BYTES]`
1059/// stack buffer; emission is `(&File).write_all(...)`, a single `write(2)`
1060/// for the small (< PIPE_BUF) line. The path-equality gate uses
1061/// `Path::parent()` (returns `&Path`, no heap) and `PartialEq` on `Path`
1062/// (component iteration, no heap), mirroring the heap writer's `pre.path == path`
1063/// check without the `root.join(LIFECYCLE_FILENAME)` allocation.
1064fn record_lifecycle_event_no_alloc(root: &Path, event: &str, detail_parts: &[&str]) -> bool {
1065 use std::io::Write;
1066 let Some(pre) = LIFECYCLE_LOG_FILE.get() else {
1067 return false;
1068 };
1069 // Discriminate by root so test/dev callers with arbitrary temp dirs
1070 // route through the heap fallback. `pre.path` was constructed as
1071 // `root.join(LIFECYCLE_FILENAME)`, so its parent is exactly the root
1072 // that was registered at install time.
1073 if pre.path.parent() != Some(root) {
1074 return false;
1075 }
1076 let ts = std::time::SystemTime::now()
1077 .duration_since(std::time::UNIX_EPOCH)
1078 .unwrap_or_default()
1079 .as_secs();
1080 let mut buf = [0u8; LIFECYCLE_MAX_LINE_BYTES];
1081 let n = write_lifecycle_line(&mut buf, ts, &pre.pid_prefix, event, detail_parts);
1082 (&pre.file).write_all(&buf[..n]).is_ok()
1083}
1084
1085/// Idempotent cleanup the panic hook performs on every panic.
1086///
1087/// Removes daemon sock + pid files (kernel auto-releases the SurrealKV flock,
1088/// so file unlink is enough for sibling-process recovery) and appends a
1089/// `panic` lifecycle event with location + payload. Best-effort throughout:
1090/// every fs operation swallows its error so the panic still surfaces.
1091///
1092/// **Lifecycle event is written via the no-alloc path when possible** — the
1093/// hook may run with a corrupted allocator, so we avoid `format!` /
1094/// `PathBuf::join` / `chars().collect()` on the panic path. If the pre-opened
1095/// fd is unavailable (install_panic_hook never ran or its open(2) failed),
1096/// we fall back to the heap path so the event still lands on disk.
1097///
1098/// Crate-internal: the only callers are this module's `install_panic_hook`
1099/// and its `#[cfg(test)]` block. Same-module tests have access to private
1100/// items, so this does not need to be `pub` for testability.
1101pub(crate) fn run_panic_cleanup(root: &Path, location: &str, payload: &str) {
1102 let _ = std::fs::remove_file(socket_path(root));
1103 let _ = std::fs::remove_file(metadata_path(root));
1104 if !record_lifecycle_event_no_alloc(root, "panic", &[location, payload]) {
1105 record_lifecycle_event(root, "panic", &format!("{location} {payload}"));
1106 }
1107}
1108
1109/// Install a global panic hook that runs `run_panic_cleanup` before
1110/// delegating to the default hook.
1111///
1112/// Idempotent — only the first call's `root` is honored (subsequent calls are
1113/// no-ops). Safe to call from any startup path.
1114///
1115/// The hook runs on the panicking thread before unwinding, so it fires for
1116/// every panic in every tokio worker (tokio's spawn-boundary `catch_unwind`
1117/// invokes the hook before catching).
1118pub fn install_panic_hook(root: std::path::PathBuf) {
1119 if PANIC_HOOK_ROOT.set(root.clone()).is_err() {
1120 return;
1121 }
1122 // One-time lifecycle.log rotation. Single-writer window: we just
1123 // acquired the kernel flock to start serving, so no concurrent daemon
1124 // is rotating in parallel.
1125 trim_lifecycle_log(&root, MAX_LIFECYCLE_LINES);
1126
1127 // Pre-open the lifecycle log so the panic hook only calls write(2), not
1128 // open(2). On macOS under memory pressure, open(2) can stall in the VFS
1129 // layer; holding the fd from startup removes that stall from the panic path.
1130 //
1131 // Also pre-format the "<pid>\t" prefix bytes here so the no-alloc panic
1132 // writer can copy them into a stack buffer without calling `format!`.
1133 // pid is process-global and stable, so caching it once is sound.
1134 let log_path = root.join(LIFECYCLE_FILENAME);
1135 if let Ok(f) = std::fs::OpenOptions::new()
1136 .create(true)
1137 .append(true)
1138 .open(&log_path)
1139 {
1140 let pid = std::process::id();
1141 let mut pid_buf = [0u8; 20];
1142 let pid_len = u64_to_decimal_bytes(pid as u64, &mut pid_buf);
1143 let mut pid_prefix = Vec::with_capacity(pid_len + 1);
1144 pid_prefix.extend_from_slice(&pid_buf[..pid_len]);
1145 pid_prefix.push(b'\t');
1146 let _ = LIFECYCLE_LOG_FILE.set(PreOpenedLog {
1147 path: log_path,
1148 file: f,
1149 pid_prefix,
1150 });
1151 }
1152
1153 let default_hook = std::panic::take_hook();
1154 std::panic::set_hook(Box::new(move |info| {
1155 if let Some(root) = PANIC_HOOK_ROOT.get() {
1156 let location = info
1157 .location()
1158 .map(|l| format!("{}:{}", l.file(), l.line()))
1159 .unwrap_or_else(|| "<unknown>".to_string());
1160 let payload = info
1161 .payload()
1162 .downcast_ref::<&str>()
1163 .copied()
1164 .or_else(|| info.payload().downcast_ref::<String>().map(String::as_str))
1165 .unwrap_or("<non-string panic>");
1166 run_panic_cleanup(root, &location, payload);
1167 }
1168 default_hook(info);
1169 }));
1170}
1171
1172// ── Tests ───────────────────────────────────────────────────────────────────
1173
1174#[cfg(test)]
1175mod tests {
1176 use super::*;
1177
1178 #[test]
1179 fn metadata_roundtrip() {
1180 let meta = DaemonMetadata::new(DaemonOwner::Daemon);
1181 let json = serde_json::to_string(&meta).unwrap();
1182 let back: DaemonMetadata = serde_json::from_str(&json).unwrap();
1183 assert_eq!(back.pid, meta.pid);
1184 assert_eq!(back.session, meta.session);
1185 assert_eq!(back.owner, DaemonOwner::Daemon);
1186 }
1187
1188 #[test]
1189 fn metadata_mcp_owner_roundtrip() {
1190 let meta = DaemonMetadata {
1191 pid: 42,
1192 session: Uuid::new_v4(),
1193 owner: DaemonOwner::Mcp,
1194 };
1195 let json = serde_json::to_string(&meta).unwrap();
1196 let back: DaemonMetadata = serde_json::from_str(&json).unwrap();
1197 assert_eq!(back.owner, DaemonOwner::Mcp);
1198 }
1199
1200 #[test]
1201 fn read_metadata_v2_format() {
1202 let dir = tempfile::tempdir().unwrap();
1203 let session = Uuid::new_v4();
1204 let meta = DaemonMetadata {
1205 pid: 1234,
1206 session,
1207 owner: DaemonOwner::Daemon,
1208 };
1209 publish_metadata(dir.path(), &meta).unwrap();
1210
1211 let read = read_metadata(dir.path()).unwrap();
1212 assert_eq!(read.pid, 1234);
1213 assert_eq!(read.session, session);
1214 assert_eq!(read.owner, DaemonOwner::Daemon);
1215 }
1216
1217 #[test]
1218 fn read_metadata_legacy_v1_json() {
1219 let dir = tempfile::tempdir().unwrap();
1220 std::fs::write(dir.path().join("mati.pid"), r#"{"pid":5678,"owner":"mcp"}"#).unwrap();
1221
1222 let read = read_metadata(dir.path()).unwrap();
1223 assert_eq!(read.pid, 5678);
1224 assert_eq!(read.owner, DaemonOwner::Mcp);
1225 // Legacy format has no session — should get nil UUID.
1226 assert!(read.session.is_nil());
1227 }
1228
1229 #[test]
1230 fn read_metadata_legacy_plain_pid() {
1231 let dir = tempfile::tempdir().unwrap();
1232 std::fs::write(dir.path().join("mati.pid"), "9999\n").unwrap();
1233
1234 let read = read_metadata(dir.path()).unwrap();
1235 assert_eq!(read.pid, 9999);
1236 assert_eq!(read.owner, DaemonOwner::Daemon);
1237 assert!(read.session.is_nil());
1238 }
1239
1240 #[test]
1241 fn read_metadata_missing_returns_none() {
1242 let dir = tempfile::tempdir().unwrap();
1243 assert!(read_metadata(dir.path()).is_none());
1244 }
1245
1246 #[test]
1247 fn read_metadata_corrupt_returns_none() {
1248 let dir = tempfile::tempdir().unwrap();
1249 std::fs::write(dir.path().join("mati.pid"), "not json at all ~~~").unwrap();
1250 assert!(read_metadata(dir.path()).is_none());
1251 }
1252
1253 #[cfg(unix)]
1254 #[test]
1255 fn publish_metadata_sets_mode_0600() {
1256 use std::os::unix::fs::PermissionsExt;
1257 let dir = tempfile::tempdir().unwrap();
1258 let meta = DaemonMetadata::new(DaemonOwner::Daemon);
1259 publish_metadata(dir.path(), &meta).unwrap();
1260
1261 let perms = std::fs::metadata(dir.path().join("mati.pid"))
1262 .unwrap()
1263 .permissions();
1264 assert_eq!(
1265 perms.mode() & 0o777,
1266 0o600,
1267 "metadata file should be mode 0600"
1268 );
1269 }
1270
1271 #[cfg(unix)]
1272 #[test]
1273 fn publish_metadata_is_atomic() {
1274 let dir = tempfile::tempdir().unwrap();
1275
1276 // Write initial metadata.
1277 let meta1 = DaemonMetadata {
1278 pid: 1,
1279 session: Uuid::new_v4(),
1280 owner: DaemonOwner::Daemon,
1281 };
1282 publish_metadata(dir.path(), &meta1).unwrap();
1283
1284 // Overwrite atomically.
1285 let meta2 = DaemonMetadata {
1286 pid: 2,
1287 session: Uuid::new_v4(),
1288 owner: DaemonOwner::Mcp,
1289 };
1290 publish_metadata(dir.path(), &meta2).unwrap();
1291
1292 // Read should see meta2, not a partial mix.
1293 let read = read_metadata(dir.path()).unwrap();
1294 assert_eq!(read.pid, 2);
1295 assert_eq!(read.owner, DaemonOwner::Mcp);
1296
1297 // Temp file should not be left behind.
1298 assert!(!dir.path().join("mati.pid.tmp").exists());
1299 }
1300
1301 #[cfg(unix)]
1302 #[test]
1303 fn ensure_runtime_dir_sets_mode_0700() {
1304 use std::os::unix::fs::PermissionsExt;
1305 let dir = tempfile::tempdir().unwrap();
1306 let root = dir.path().join("test_root");
1307
1308 ensure_runtime_dir(&root).unwrap();
1309
1310 let perms = std::fs::metadata(&root).unwrap().permissions();
1311 assert_eq!(
1312 perms.mode() & 0o777,
1313 0o700,
1314 "runtime dir should be mode 0700"
1315 );
1316 }
1317
1318 #[test]
1319 fn is_pid_alive_for_current_process() {
1320 assert!(is_pid_alive(std::process::id()));
1321 }
1322
1323 #[test]
1324 fn is_pid_alive_for_dead_pid() {
1325 assert!(!is_pid_alive(4_000_000));
1326 }
1327
1328 #[test]
1329 fn stale_check_clean_when_no_files() {
1330 let dir = tempfile::tempdir().unwrap();
1331 assert_eq!(check_and_cleanup_stale(dir.path()), StaleCheckResult::Clean);
1332 }
1333
1334 #[test]
1335 fn stale_check_removes_dead_pid() {
1336 let dir = tempfile::tempdir().unwrap();
1337 let meta = DaemonMetadata {
1338 pid: 4_000_000, // almost certainly dead
1339 session: Uuid::new_v4(),
1340 owner: DaemonOwner::Daemon,
1341 };
1342 publish_metadata(dir.path(), &meta).unwrap();
1343 std::fs::write(dir.path().join("mati.sock"), "").unwrap();
1344
1345 let result = check_and_cleanup_stale(dir.path());
1346 assert_eq!(result, StaleCheckResult::StaleRemoved);
1347 assert!(!dir.path().join("mati.pid").exists());
1348 assert!(!dir.path().join("mati.sock").exists());
1349 }
1350
1351 #[test]
1352 fn stale_check_live_daemon_detected() {
1353 let dir = tempfile::tempdir().unwrap();
1354 let meta = DaemonMetadata {
1355 pid: std::process::id(), // our own PID — alive
1356 session: Uuid::new_v4(),
1357 owner: DaemonOwner::Daemon,
1358 };
1359 publish_metadata(dir.path(), &meta).unwrap();
1360
1361 match check_and_cleanup_stale(dir.path()) {
1362 StaleCheckResult::LiveDaemon { pid, .. } => {
1363 assert_eq!(pid, std::process::id());
1364 }
1365 other => panic!("expected LiveDaemon, got {:?}", other),
1366 }
1367 }
1368
1369 #[test]
1370 fn stale_check_orphan_socket() {
1371 let dir = tempfile::tempdir().unwrap();
1372 // Socket exists but no metadata file.
1373 std::fs::write(dir.path().join("mati.sock"), "").unwrap();
1374
1375 assert_eq!(
1376 check_and_cleanup_stale(dir.path()),
1377 StaleCheckResult::OrphanSocket
1378 );
1379 }
1380
1381 #[test]
1382 fn stale_check_corrupt_metadata_cleaned_up() {
1383 let dir = tempfile::tempdir().unwrap();
1384 std::fs::write(dir.path().join("mati.pid"), "garbage!!!").unwrap();
1385 std::fs::write(dir.path().join("mati.sock"), "").unwrap();
1386
1387 let result = check_and_cleanup_stale(dir.path());
1388 assert_eq!(result, StaleCheckResult::StaleRemoved);
1389 assert!(!dir.path().join("mati.pid").exists());
1390 assert!(!dir.path().join("mati.sock").exists());
1391 }
1392
1393 // ── Peer credential tests ───────────────────────────────────────────
1394
1395 /// Test peer credential check with a real Unix socket pair.
1396 /// Both endpoints run as the same user (test process), so the UID matches.
1397 #[cfg(unix)]
1398 #[tokio::test]
1399 async fn peer_cred_accepts_same_uid() {
1400 let dir = tempfile::tempdir().unwrap();
1401 let sock_path = dir.path().join("test.sock");
1402
1403 let listener = tokio::net::UnixListener::bind(&sock_path).unwrap();
1404 let connect_fut = tokio::net::UnixStream::connect(&sock_path);
1405 let accept_fut = listener.accept();
1406
1407 let (client_result, accept_result) = tokio::join!(connect_fut, accept_fut);
1408 let _client = client_result.unwrap();
1409 let (server_stream, _) = accept_result.unwrap();
1410
1411 let daemon_euid = current_euid();
1412 let peer = check_peer_cred(&server_stream, daemon_euid);
1413 assert!(
1414 peer.is_some(),
1415 "same-user connection should pass peer check"
1416 );
1417
1418 let ctx = peer.unwrap();
1419 assert_eq!(ctx.uid, daemon_euid);
1420 // PID should be available on macOS and Linux.
1421 assert!(ctx.pid.is_some(), "peer PID should be available");
1422 }
1423
1424 /// Test that a UID mismatch is correctly rejected.
1425 /// We simulate this by passing a fake daemon_euid that doesn't match.
1426 #[cfg(unix)]
1427 #[tokio::test]
1428 async fn peer_cred_rejects_uid_mismatch() {
1429 let dir = tempfile::tempdir().unwrap();
1430 let sock_path = dir.path().join("test_mismatch.sock");
1431
1432 let listener = tokio::net::UnixListener::bind(&sock_path).unwrap();
1433 let connect_fut = tokio::net::UnixStream::connect(&sock_path);
1434 let accept_fut = listener.accept();
1435
1436 let (client_result, accept_result) = tokio::join!(connect_fut, accept_fut);
1437 let _client = client_result.unwrap();
1438 let (server_stream, _) = accept_result.unwrap();
1439
1440 // Use a fake daemon_euid that won't match the test process.
1441 let fake_euid = current_euid().wrapping_add(1);
1442 let peer = check_peer_cred(&server_stream, fake_euid);
1443 assert!(peer.is_none(), "mismatched UID should be rejected");
1444 }
1445
1446 #[test]
1447 fn lifecycle_log_appends_one_line_per_event() {
1448 let dir = tempfile::tempdir().unwrap();
1449 record_lifecycle_event(dir.path(), "start", "owner=mcp");
1450 record_lifecycle_event(dir.path(), "shutdown", "reason=signal");
1451 let contents = std::fs::read_to_string(dir.path().join("lifecycle.log")).unwrap();
1452 let lines: Vec<&str> = contents.lines().collect();
1453 assert_eq!(lines.len(), 2, "exactly two events recorded");
1454 for line in &lines {
1455 // ts<TAB>pid<TAB>event<TAB>detail
1456 let cols: Vec<&str> = line.split('\t').collect();
1457 assert_eq!(cols.len(), 4, "each line has 4 tab-separated fields");
1458 // ts and pid must be valid integers.
1459 assert!(cols[0].parse::<u64>().is_ok());
1460 assert!(cols[1].parse::<u32>().is_ok());
1461 }
1462 assert!(lines[0].contains("\tstart\towner=mcp"));
1463 assert!(lines[1].contains("\tshutdown\treason=signal"));
1464 }
1465
1466 #[test]
1467 fn lifecycle_log_strips_newlines_and_tabs_in_detail() {
1468 let dir = tempfile::tempdir().unwrap();
1469 record_lifecycle_event(dir.path(), "panic", "line1\nline2\twith tab\rcr");
1470 let contents = std::fs::read_to_string(dir.path().join("lifecycle.log")).unwrap();
1471 // Exactly one newline (the trailing one) — so exactly one logical line.
1472 assert_eq!(contents.matches('\n').count(), 1);
1473 assert!(contents.contains("line1 line2 with tab cr"));
1474 }
1475
1476 #[test]
1477 fn lifecycle_log_silently_succeeds_when_dir_missing() {
1478 // Should not panic when target directory does not exist — best-effort.
1479 let dir = tempfile::tempdir().unwrap();
1480 let bogus = dir.path().join("nonexistent-subdir");
1481 record_lifecycle_event(&bogus, "start", "x");
1482 assert!(!bogus.join("lifecycle.log").exists());
1483 }
1484
1485 /// Concurrent appenders interleave bytes mid-line above PIPE_BUF. A
1486 /// pathological panic payload (large Debug-formatted struct, JSON dump
1487 /// from a serde error) can easily exceed 4 KB. We cap the on-disk line
1488 /// well below PIPE_BUF so POSIX append atomicity holds. The line still
1489 /// ends with `\n` so `lines()` consumers and the trim path see a clean
1490 /// record, and the truncation point sits on a UTF-8 char boundary so a
1491 /// multibyte character is never split mid-encoding.
1492 #[test]
1493 fn lifecycle_log_caps_line_below_pipe_buf() {
1494 let dir = tempfile::tempdir().unwrap();
1495 // 10 KB of `é` (2-byte UTF-8) — exercises both the size cap AND the
1496 // char-boundary requirement. A naive byte-truncate would land mid-
1497 // multibyte and produce invalid UTF-8 on disk.
1498 let huge_detail: String = "é".repeat(5_000); // 10_000 bytes
1499 record_lifecycle_event(dir.path(), "panic", &huge_detail);
1500
1501 let log = std::fs::read_to_string(dir.path().join("lifecycle.log")).unwrap();
1502 assert!(
1503 log.len() <= LIFECYCLE_MAX_LINE_BYTES,
1504 "line on disk ({} bytes) must not exceed cap ({})",
1505 log.len(),
1506 LIFECYCLE_MAX_LINE_BYTES
1507 );
1508 assert!(
1509 log.ends_with('\n'),
1510 "truncated line must still end with newline so lines() yields one record"
1511 );
1512 assert!(
1513 log.contains("\tpanic\t"),
1514 "event tag must survive truncation (it sits in the prefix)"
1515 );
1516 // `read_to_string` itself would have errored if truncation split a
1517 // UTF-8 char, but assert explicitly so the failure mode is named.
1518 assert!(
1519 log.is_char_boundary(log.len()),
1520 "truncation must land on UTF-8 char boundary"
1521 );
1522 }
1523
1524 #[test]
1525 fn run_panic_cleanup_removes_sock_pid_and_appends_lifecycle_event() {
1526 let dir = tempfile::tempdir().unwrap();
1527 // Pre-create the daemon files the panic hook is supposed to remove.
1528 std::fs::write(dir.path().join("mati.sock"), "").unwrap();
1529 std::fs::write(dir.path().join("mati.pid"), r#"{"pid":42}"#).unwrap();
1530
1531 run_panic_cleanup(dir.path(), "src/example.rs:99", "boom");
1532
1533 // Files removed.
1534 assert!(
1535 !dir.path().join("mati.sock").exists(),
1536 "panic hook must remove mati.sock so sibling daemons can rebind"
1537 );
1538 assert!(
1539 !dir.path().join("mati.pid").exists(),
1540 "panic hook must remove mati.pid so sibling stale-checks see no live daemon"
1541 );
1542 // Lifecycle event recorded with location + payload preserved.
1543 let log = std::fs::read_to_string(dir.path().join("lifecycle.log")).unwrap();
1544 assert!(log.contains("\tpanic\t"), "event tagged 'panic'");
1545 assert!(log.contains("src/example.rs:99"), "location preserved");
1546 assert!(log.contains("boom"), "payload preserved");
1547 }
1548
1549 #[test]
1550 fn run_panic_cleanup_is_safe_when_files_already_absent() {
1551 // The panic hook may run after another path has already cleaned up
1552 // (e.g., explicit shutdown ran first, then a panic during exit).
1553 // Cleanup must be idempotent — no crash, no error.
1554 let dir = tempfile::tempdir().unwrap();
1555 run_panic_cleanup(dir.path(), "src/x.rs:1", "noop");
1556 // Lifecycle log should still be written even when no files needed removal.
1557 assert!(dir.path().join("lifecycle.log").exists());
1558 }
1559
1560 #[test]
1561 fn trim_lifecycle_log_keeps_last_n_lines() {
1562 let dir = tempfile::tempdir().unwrap();
1563 let path = dir.path().join(LIFECYCLE_FILENAME);
1564 // Write 100 events, trim to last 10.
1565 let body: String = (0..100)
1566 .map(|i| format!("{i}\t{i}\tevent{i}\tdetail{i}\n"))
1567 .collect();
1568 std::fs::write(&path, body).unwrap();
1569
1570 trim_lifecycle_log(dir.path(), 10);
1571
1572 let after = std::fs::read_to_string(&path).unwrap();
1573 let lines: Vec<&str> = after.lines().collect();
1574 assert_eq!(lines.len(), 10, "trimmed log should have exactly N lines");
1575 // Kept the last 10: events 90..=99.
1576 assert!(
1577 lines[0].contains("\tevent90\t"),
1578 "first kept line: {}",
1579 lines[0]
1580 );
1581 assert!(
1582 lines[9].contains("\tevent99\t"),
1583 "last kept line: {}",
1584 lines[9]
1585 );
1586 // No leftover .tmp.
1587 assert!(!path.with_extension("log.tmp").exists());
1588 }
1589
1590 #[test]
1591 fn trim_lifecycle_log_noop_when_under_cap() {
1592 let dir = tempfile::tempdir().unwrap();
1593 let path = dir.path().join(LIFECYCLE_FILENAME);
1594 let body = "0\t0\tstart\tdetail\n1\t0\tstop\tclean\n";
1595 std::fs::write(&path, body).unwrap();
1596 let before = std::fs::read(&path).unwrap();
1597
1598 trim_lifecycle_log(dir.path(), 10);
1599
1600 let after = std::fs::read(&path).unwrap();
1601 assert_eq!(before, after, "trim must be a no-op when under cap");
1602 }
1603
1604 /// Regression: pass-21 checkpoint B. If a hostile or buggy actor wrote
1605 /// a multi-gigabyte `lifecycle.log` (or filled the file with binary
1606 /// garbage that happens to be huge), the previous trim path would
1607 /// `read_to_string` the entire file at daemon startup and OOM the
1608 /// process. Startup must never block or OOM on a corrupt log
1609 /// (P9: graceful degradation). The size guard truncates pathological
1610 /// files to empty and continues, sacrificing the (already corrupt)
1611 /// observability in favor of a successful daemon start.
#[test]
fn trim_lifecycle_log_truncates_pathologically_huge_file() {
    use std::io::{Seek, SeekFrom, Write};

    let tmp = tempfile::tempdir().unwrap();
    let log_path = tmp.path().join(LIFECYCLE_FILENAME);

    // Build an over-cap log cheaply: seek beyond the read cap and write a
    // single byte. The reported length ends up above the threshold without
    // the intervening bytes ever being written (a sparse file where the
    // filesystem supports it, a genuinely large file otherwise — the
    // assertions below hold in both cases). seek+write is used rather than
    // set_len because it is the more portable way to extend a file.
    let mut oversized = std::fs::File::create(&log_path).unwrap();
    oversized
        .seek(SeekFrom::Start(LIFECYCLE_TRIM_MAX_READ_BYTES + 1))
        .unwrap();
    oversized.write_all(b"x").unwrap();
    // Close the handle before inspecting or trimming the file.
    drop(oversized);

    let size_before = std::fs::metadata(&log_path).unwrap().len();
    assert!(
        size_before > LIFECYCLE_TRIM_MAX_READ_BYTES,
        "test setup: file must exceed the read cap"
    );

    // Trimming has to return normally (no panic, no OOM) and leave behind
    // an empty file: the oversized log is treated as pathological.
    trim_lifecycle_log(tmp.path(), 10);

    let after = std::fs::metadata(&log_path).unwrap();
    assert!(
        after.is_file(),
        "lifecycle.log should still exist after pathological trim"
    );
    assert_eq!(
        after.len(),
        0,
        "pathologically large lifecycle.log must be truncated to empty so startup does not OOM"
    );

    // The truncation path is not expected to go through tmp+rename, so no
    // sibling .tmp file may be left behind.
    let stray_tmp = log_path.with_extension("log.tmp");
    assert!(!stray_tmp.exists());
}
1658
/// Regression check for the ordinary trim path: a log that is well below
/// the read cap must be trimmed to its most recent entries, not wiped by
/// the size guard.
#[test]
fn trim_lifecycle_log_size_guard_does_not_fire_under_cap() {
    let tmp = tempfile::tempdir().unwrap();
    let log_path = tmp.path().join(LIFECYCLE_FILENAME);

    // One hundred short tab-separated events — roughly 2 KB in total,
    // nowhere near the size-guard threshold.
    let mut contents = String::new();
    for i in 0..100 {
        contents.push_str(&format!("{i}\t{i}\tevent{i}\tdetail{i}\n"));
    }
    std::fs::write(&log_path, &contents).unwrap();

    trim_lifecycle_log(tmp.path(), 10);

    // The file must still hold data: exactly the last ten events, oldest
    // first.
    let trimmed = std::fs::read_to_string(&log_path).unwrap();
    let kept: Vec<&str> = trimmed.lines().collect();
    assert_eq!(
        kept.len(),
        10,
        "normal trim path must run for sub-cap files"
    );
    let (oldest_kept, newest_kept) = (kept[0], kept[9]);
    assert!(oldest_kept.contains("event90"));
    assert!(newest_kept.contains("event99"));
}
1686
/// Trimming a directory that has no lifecycle log must be a quiet no-op:
/// it must not panic and must not create the file as a side effect.
#[test]
fn trim_lifecycle_log_silently_succeeds_on_missing_log() {
    let tmp = tempfile::tempdir().unwrap();
    let would_be_log = tmp.path().join(LIFECYCLE_FILENAME);

    // Nothing has been written to the directory yet.
    trim_lifecycle_log(tmp.path(), 10);

    assert!(!would_be_log.exists());
}
1694
/// Calling `install_panic_hook` more than once must not crash.
///
/// The contract exercised here is only "a second call is harmless".
/// Checking that the FIRST root stays in effect would need inspection of
/// process-global state, which this test does not attempt.
#[test]
fn install_panic_hook_is_idempotent() {
    let tmp = tempfile::tempdir().unwrap();
    let first_root = tmp.path().to_path_buf();
    let second_root = tmp.path().join("a-different-root");

    install_panic_hook(first_root);
    install_panic_hook(second_root);
    // Reaching this point without a panic is the whole assertion.
}
1706
/// The digits written by `u64_to_decimal_bytes` must equal the standard
/// decimal rendering of the same value for boundary inputs (zero, digit
/// count transitions, `u64::MAX`). A mismatch would silently corrupt the
/// timestamp in the panic-path lifecycle entry.
#[test]
fn u64_to_decimal_bytes_matches_format() {
    const SAMPLES: [u64; 10] = [
        0,
        1,
        9,
        10,
        99,
        100,
        12345,
        1_700_000_000,
        u64::MAX / 2,
        u64::MAX,
    ];

    for n in SAMPLES {
        // 20 bytes is enough for the longest sample, u64::MAX (20 digits).
        let mut digits = [0u8; 20];
        let written = u64_to_decimal_bytes(n, &mut digits);
        let rendered = std::str::from_utf8(&digits[..written]).unwrap();
        assert_eq!(rendered, n.to_string(), "decimal mismatch for {n}");
    }
}
1733
/// Parity guard for Fix 3: the allocation-free formatter used on the panic
/// path (`write_lifecycle_line`) has to emit exactly the same bytes as the
/// heap path's `format!("{ts}\t{pid}\t{event}\t{safe_detail}\n")`.
///
/// Consumers of the log (`mati doctor`'s `read_lifecycle_tail`, the
/// integration tests' line parsers) would otherwise silently see
/// panic-path entries differently from normal-path entries.
#[test]
fn no_alloc_panic_format_matches_heap_format() {
    /// Mirror of the heap path's formatting and truncation from
    /// `record_lifecycle_event`, reproduced here so the bytes can be
    /// compared directly.
    fn heap_format(ts: u64, pid_prefix: &str, event: &str, detail: &str) -> String {
        // Tabs and line breaks would break the one-line TSV layout, so
        // each one becomes a single space.
        let mut safe_detail = String::with_capacity(detail.len());
        for c in detail.chars() {
            safe_detail.push(if matches!(c, '\t' | '\n' | '\r') { ' ' } else { c });
        }

        let mut line = format!("{ts}\t{pid_prefix}{event}\t{safe_detail}\n");
        if line.len() > LIFECYCLE_MAX_LINE_BYTES {
            // Cut at the last char boundary that still leaves one byte
            // for the terminating newline.
            let cut = (0..LIFECYCLE_MAX_LINE_BYTES)
                .rev()
                .find(|&idx| line.is_char_boundary(idx))
                .unwrap_or(0);
            line.truncate(cut);
            line.push('\n');
        }
        line
    }

    /// Runs the no-alloc formatter into a stack buffer and returns the
    /// bytes it produced as an owned string.
    fn no_alloc_format(ts: u64, pid_prefix: &str, event: &str, parts: &[&str]) -> String {
        let mut buf = [0u8; LIFECYCLE_MAX_LINE_BYTES];
        let written = write_lifecycle_line(&mut buf, ts, pid_prefix.as_bytes(), event, parts);
        std::str::from_utf8(&buf[..written]).unwrap().to_owned()
    }

    // Inputs are pinned so the comparison is deterministic; the real
    // writer would take the timestamp from the wall clock.
    let ts: u64 = 1_700_000_000;
    let pid: u32 = 42;
    let pid_prefix = format!("{pid}\t");

    // Detail fragments handed to the no-alloc writer. The heap path gets
    // the same fragments joined with a single space.
    let typical = ["src/mcp/server.rs:128", "boom!"];
    let with_control_chars = ["src/x.rs:1", "line1\nline2\twith tab\rcr"];
    let empty_detail = [""];

    // (event, detail fragments, failure message)
    let cases: [(&str, &[&str], &str); 3] = [
        // A typical panic: source location plus payload.
        (
            "panic",
            typical.as_slice(),
            "panic-path format must match heap path for typical input",
        ),
        // Payload containing \t \n \r — both paths must sanitize alike.
        (
            "panic",
            with_control_chars.as_slice(),
            "panic-path format must match heap path with embedded control chars",
        ),
        // No detail at all (e.g. a `start` event): "" on the heap path,
        // a single empty fragment on the no-alloc path.
        (
            "start",
            empty_detail.as_slice(),
            "panic-path format must match heap path with empty detail",
        ),
    ];

    for (event, parts, why) in cases {
        let detail = parts.join(" ");
        let expected = heap_format(ts, &pid_prefix, event, &detail);
        let actual = no_alloc_format(ts, &pid_prefix, event, parts);
        assert_eq!(actual, expected, "{why}");
    }
}
1823
/// When the requested root is not the root of the preopened log fd,
/// `record_lifecycle_event_no_alloc` must report `false` — neither
/// panicking nor pretending to succeed. That return value is what tells
/// `run_panic_cleanup` to fall back to the heap path. The matching-root
/// (`Some`) branch is covered by `tests/panic_hook_preopen.rs`, which
/// owns its own process.
#[test]
fn record_lifecycle_event_no_alloc_returns_false_for_unknown_root() {
    // A fresh temp dir cannot be the registered root, whether or not a
    // sibling test in this binary has already set LIFECYCLE_LOG_FILE, so
    // the path-equality gate has to reject it.
    let unregistered_root = tempfile::tempdir().unwrap();

    let recorded =
        record_lifecycle_event_no_alloc(unregistered_root.path(), "smoke", &["from-tests"]);

    assert!(!recorded);
}
1842
/// `PeerContext::pid` is optional: a context can be built both without a
/// PID and with one, and the field reads back as constructed.
#[test]
fn peer_context_pid_is_optional() {
    let without_pid = PeerContext { uid: 501, pid: None };
    assert!(without_pid.pid.is_none());

    let with_pid = PeerContext {
        uid: 501,
        pid: Some(1234),
    };
    assert_eq!(with_pid.pid, Some(1234));
}
1857}