varta-watch 0.2.0

//! Single-threaded observer: bind one or more transport listeners, decode
//! incoming VLP frames, surface beats / stalls / decode errors via [`Event`].
//!
//! The observer never spawns threads, never allocates after setup,
//! and surfaces at most one [`Event`] per call to [`Observer::poll`]. The
//! caller drives the loop — see `main.rs` for the daemon entrypoint.
//!
//! Multiple listeners (e.g. UDS + UDP) are polled round-robin. Each call to
//! [`Observer::poll`] tries every listener once; the first non-`WouldBlock`
//! event is returned but all remaining listeners are still tried, so a
//! busy listener cannot starve co-located listeners. At the end of every
//! call — whether or not an event was found — the stall queue is refreshed
//! (once previously queued stalls have been consumed). Stall events are never
//! returned by `poll` itself; fetch them via [`Observer::poll_pending`].

use std::io;
use std::path::Path;
use std::time::Duration;

use varta_vlp::{DecodeError, Frame, Status};

use crate::clock::{Clock, ClockSource};
use crate::listener::{BeatListener, PreThreadAttestation, UdsListener};
use crate::peer_cred::{BeatOrigin, RecvResult};
use crate::tracker::{EvictionPolicy, Tracker, Update};

/// Reason a beat was dropped by the rate limiter.
///
/// The explicit discriminants are used as indices into the per-reason drop
/// counter array (`Observer::rate_limited_total`), so they must stay dense
/// and below [`RATE_LIMIT_N`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum RateLimitReason {
    /// Dropped because the beat arrived sooner than the configured per-pid
    /// minimum inter-beat interval.
    PerPid = 0,
    /// Dropped because the observer-wide token bucket was exhausted.
    Global = 1,
}

/// Number of [`RateLimitReason`] variants; sizes the per-reason drop counter
/// array, which is indexed by `RateLimitReason as usize`.
pub(crate) const RATE_LIMIT_N: usize = 2;

/// Forward-jump sentinel: an advance between two consecutive clock readings
/// that exceeds this threshold is counted as an anomalous forward jump
/// (sleep/wake, VM live migration, hypervisor pause). 5 s is far above
/// worst-case poll-tick latency on a loaded host and far below any plausible
/// sleep or migration interval.
///
/// Expressed in nanoseconds; the very first reading (when no previous reading
/// has been recorded) is never counted as a jump.
const CLOCK_JUMP_FORWARD_THRESHOLD_NS: u64 = 5_000_000_000;

/// Re-read `/proc/sys/kernel/pid_max` at most every 60 s. Bounded so that an
/// operator-driven `sysctl -w kernel.pid_max=...` change is picked up without
/// daemon restart; coarse enough that the `/proc` read never appears on any
/// latency profile (the refresh runs in the maintenance phase, not on the
/// poll hot path). Hardcoded — no CLI knob, matching the self-watchdog
/// cadence convention.
///
/// Expressed in nanoseconds so it can be compared directly against the
/// observer's clock readings and `Observer::last_pid_max_refresh_ns`.
const PID_MAX_REFRESH_INTERVAL_NS: u64 = 60_000_000_000;

/// Global per-observer token bucket — one shared across all senders.
///
/// Guards against per-pid rotation attacks where an attacker cycles through
/// fake pids to keep every per-pid bucket empty.
///
/// Disabled when `capacity_milli == 0`.  All arithmetic is integer-only
/// (milli-tokens) to stay allocation-free on the hot path.
pub(crate) struct GlobalRateLimit {
    /// Current token count in milli-tokens (1000 milli-tokens = 1 frame allowed).
    tokens_milli: u64,
    /// Maximum token count (= burst × 1000).
    capacity_milli: u64,
    /// Refill rate in milli-tokens per second (= `rate_per_sec` × 1000).
    /// Divided by `refill_denominator` it yields milli-tokens per nanosecond.
    refill_numerator: u64,
    /// Denominator for refill: 1_000_000_000 (ns per sec). `1` for a
    /// disabled bucket, where it is never used.
    refill_denominator: u64,
    /// Nanosecond timestamp up to which elapsed time has already been
    /// converted into tokens. Time that was too short to yield a whole
    /// milli-token is *not* skipped over, so it keeps accumulating.
    last_refill_ns: u64,
}

impl GlobalRateLimit {
    /// Construct a new token bucket.  `rate_per_sec = 0` or `burst = 0`
    /// produces a disabled bucket (always allows).
    ///
    /// An enabled bucket starts full (`burst` frames available) and refills
    /// at `rate_per_sec` frames per second.
    pub(crate) fn new(rate_per_sec: u32, burst: u32) -> Self {
        if rate_per_sec == 0 || burst == 0 {
            return GlobalRateLimit {
                tokens_milli: 0,
                capacity_milli: 0,
                refill_numerator: 0,
                refill_denominator: 1,
                last_refill_ns: 0,
            };
        }
        let capacity_milli = u64::from(burst).saturating_mul(1_000);
        GlobalRateLimit {
            tokens_milli: capacity_milli,
            capacity_milli,
            // One frame costs 1000 milli-tokens, so `rate_per_sec` frames per
            // second is `rate_per_sec * 1000` milli-tokens per second.
            refill_numerator: u64::from(rate_per_sec).saturating_mul(1_000),
            refill_denominator: 1_000_000_000,
            last_refill_ns: 0,
        }
    }

    /// Disabled when capacity is 0 — all frames pass.
    #[inline]
    pub(crate) fn is_disabled(&self) -> bool {
        self.capacity_milli == 0
    }

    /// Try to consume one token.  Returns `true` if the frame is allowed,
    /// `false` if the global bucket is exhausted.
    ///
    /// `now_ns` is the caller's monotonic timestamp in nanoseconds; a value
    /// earlier than the previous refill is treated as "no time elapsed".
    #[inline]
    pub(crate) fn try_consume(&mut self, now_ns: u64) -> bool {
        if self.is_disabled() {
            return true;
        }
        // Lazy refill: add tokens proportional to elapsed time since last refill.
        self.refill(now_ns);
        // Consume 1000 milli-tokens (= 1 frame).
        if self.tokens_milli >= 1_000 {
            self.tokens_milli -= 1_000;
            true
        } else {
            false
        }
    }

    /// Credit the milli-tokens earned between `last_refill_ns` and `now_ns`.
    ///
    /// Only whole milli-tokens are credited. `last_refill_ns` is advanced by
    /// the time those tokens account for, so the fractional remainder is
    /// carried into the next call instead of being discarded — otherwise a
    /// steady stream of calls spaced closer than one milli-token apart would
    /// never refill the bucket.
    fn refill(&mut self, now_ns: u64) {
        let elapsed_ns = now_ns.saturating_sub(self.last_refill_ns);
        let rate = u128::from(self.refill_numerator);
        let ns_per_sec = u128::from(self.refill_denominator);
        // u128 intermediates: `elapsed_ns * rate` can exceed u64 after long
        // idle periods at high rates.
        let added = (u128::from(elapsed_ns) * rate)
            .checked_div(ns_per_sec)
            .unwrap_or(0);
        if added == 0 {
            // Not enough time for a whole milli-token yet; keep accumulating.
            return;
        }
        let refilled = u128::from(self.tokens_milli) + added;
        if refilled >= u128::from(self.capacity_milli) {
            // Bucket is full: surplus time is not banked.
            self.tokens_milli = self.capacity_milli;
            self.last_refill_ns = now_ns;
            return;
        }
        // `refilled < capacity_milli <= u64::MAX`, so the cast is lossless.
        self.tokens_milli = refilled as u64;
        // Time needed to earn `added` milli-tokens, rounded up so the bucket
        // never refills faster than configured. `added > 0` implies
        // `rate > 0`; the result never exceeds `elapsed_ns`.
        let consumed_ns = ((added * ns_per_sec + rate - 1) / rate).min(u128::from(elapsed_ns));
        self.last_refill_ns = self.last_refill_ns.saturating_add(consumed_ns as u64);
    }
}

/// Event surfaced by the [`Observer`].
///
/// Each call to [`Observer::poll`] returns at most one event, and never a
/// [`Event::Stall`]; stall events are handed out by
/// [`Observer::poll_pending`]. Unknown-pid overflow and out-of-order beats
/// are silently dropped at this layer; the bench / metrics sessions can layer
/// counters on top without changing this enum.
///
/// Every variant carries an observer-local timestamp (ns since [`Observer`]
/// start) — as the `observer_ns` field, or as the trailing `u64` of the tuple
/// variants.
#[derive(Debug)]
pub enum Event {
    /// A well-formed beat was accepted for a tracked pid.
    Beat {
        /// OS process id of the emitting agent.
        pid: u32,
        /// Decoded health status of the beat.
        status: Status,
        /// Application-defined payload carried by the beat.
        payload: u32,
        /// Monotonic nonce of the beat.
        nonce: u64,
        /// Transport-class classification of the beat (see [`BeatOrigin`]).
        /// Recovery commands consult this to refuse firing on non-kernel-attested origins.
        origin: BeatOrigin,
        /// Kernel-attested PID-namespace inode of the sender (Linux only).
        /// `None` for non-Linux platforms, UDP transports, or when the peer's
        /// `/proc/<pid>/ns/pid` was unreadable.
        pid_ns_inode: Option<u64>,
        /// Observer-local timestamp (ns since [`Observer`] start) when this
        /// event was produced.
        observer_ns: u64,
    },
    /// A tracked pid has not beaten within the configured threshold and the
    /// observer has not yet surfaced a stall event for this silence run.
    Stall {
        /// OS process id of the silent agent.
        pid: u32,
        /// Last nonce observed for this pid.
        last_nonce: u64,
        /// Observer-local timestamp (ns since [`Observer`] start) of the
        /// last accepted beat for this pid.
        last_ns: u64,
        /// Transport origin pinned by the slot's first beat. Recovery
        /// refuses to spawn for `NetworkUnverified` unless the operator has
        /// explicitly opted in via
        /// `--i-accept-recovery-on-unauthenticated-transport`.
        origin: BeatOrigin,
        /// PID-namespace inode pinned by the slot's first beat (Linux only).
        /// Used by main.rs to construct the recovery `StallSource`: a
        /// `Some(_)` value that differs from the observer's namespace inode
        /// indicates a cross-namespace agent and gates recovery refusal.
        pid_ns_inode: Option<u64>,
        /// Observer-local timestamp (ns since [`Observer`] start) when this
        /// stall event was produced.
        observer_ns: u64,
    },
    /// A 32-byte payload arrived but failed VLP decoding.
    ///
    /// Fields: the decode error, then the observer-local timestamp (ns since
    /// [`Observer`] start) taken when the datagram was received.
    Decode(DecodeError, u64),
    /// Frame decoded but the `frame.pid` does not match the kernel-verified
    /// peer PID of the sender. The claimed pid is preserved so exporters can
    /// record what the frame *claimed* to be.
    AuthFailure {
        /// The pid the frame on the wire claimed to be.
        claimed_pid: u32,
        /// Observer-local timestamp (ns since [`Observer`] start) when this
        /// event was produced.
        observer_ns: u64,
    },
    /// A beat arrived for an already-tracked pid, but its transport origin
    /// disagreed with the origin pinned by the slot's first beat. The slot
    /// was not mutated; the beat was dropped. First-origin-wins prevents an
    /// attacker on an untrusted transport from "tainting" a slot that
    /// legitimately belongs to a kernel-attested agent.
    OriginConflict {
        /// The pid claimed by the dropped beat (same as the existing slot's pid).
        claimed_pid: u32,
        /// Transport origin observed on this datagram.
        observed_origin: BeatOrigin,
        /// Origin pinned by the slot (the one that "won" the conflict).
        slot_origin: BeatOrigin,
        /// Observer-local timestamp (ns since [`Observer`] start) when this
        /// event was produced.
        observer_ns: u64,
    },
    /// A kernel-attested beat arrived whose peer PID-namespace inode differs
    /// from the observer's namespace (Linux only). Recovery for the
    /// associated pid cannot safely fire because the pid is in a different
    /// namespace — `kill(2)` and `systemctl` would target the wrong process.
    /// The beat was dropped at receive; the tracker was not modified.
    ///
    /// [`Observer::poll`] also emits this variant when the tracker itself
    /// reports `Update::NamespaceConflict` for a beat; see the note on
    /// `observer_ns_inode` for how the fields are populated in that case.
    NamespaceConflict {
        /// The pid claimed by the dropped beat.
        claimed_pid: u32,
        /// PID-namespace inode of the sender (Linux only; `None` when
        /// `/proc/<peer_pid>/ns/pid` was unreadable).
        observed_ns_inode: Option<u64>,
        /// The observer's own PID-namespace inode (cached at startup; `None`
        /// when `/proc/self/ns/pid` is unreadable, which usually means the
        /// platform isn't Linux).
        ///
        /// NOTE: when the conflict is reported by the tracker
        /// (`Update::NamespaceConflict`), `poll` fills this field with the
        /// inode the tracker holds for the pid's slot
        /// (`Tracker::pid_ns_inode_of`) instead of the observer's own inode.
        observer_ns_inode: Option<u64>,
        /// Observer-local timestamp (ns since [`Observer`] start) when this
        /// event was produced.
        observer_ns: u64,
    },
    /// Receiving from a listener failed with an error other than
    /// `WouldBlock` / `TimedOut`.
    ///
    /// Fields: the I/O error, then the observer-local timestamp (ns since
    /// [`Observer`] start) when the event was produced.
    Io(io::Error, u64),
    /// Ancillary data truncated by the kernel (`MSG_CTRUNC` on Linux).
    /// Indicates the kernel's ancillary-data buffer was too small for the
    /// per-message metadata — a kernel-level buffer sizing issue.
    ///
    /// Fields: the error reported by the listener, then the observer-local
    /// timestamp (ns since [`Observer`] start) when the event was produced.
    CtrlTruncated(io::Error, u64),
}

/// Observer bound to one or more transport listeners.
///
/// The observer owns all listeners; cleanup (e.g. socket file unlink) happens
/// when the [`Observer`] is dropped.
pub struct Observer {
    /// Transport listeners, in the order they were added via
    /// [`Observer::add_listener`]. Polled round-robin by [`Observer::poll`].
    listeners: Vec<Box<dyn BeatListener>>,
    /// Per-pid beat state: accepted beats are recorded here and stalled
    /// slots are drained from here.
    tracker: Tracker,
    /// Stall threshold in nanoseconds, derived from the `threshold`
    /// [`Duration`] passed at construction (saturating at `u64::MAX`).
    threshold_ns: u64,
    /// Kernel clock backing [`Observer::now_ns`].
    clock: Clock,
    /// Queued stall events awaiting [`Observer::poll_pending`]. Entries are
    /// `take`n (left as `None`) as they are handed out. Pre-sized to the
    /// tracker capacity at construction.
    stall_queue: Vec<Option<Event>>,
    /// Index of the next `stall_queue` entry [`Observer::poll_pending`] will
    /// return; equal to `stall_queue.len()` when nothing is pending.
    stall_cursor: usize,
    /// Next index to start polling from for fair round-robin across listeners.
    next_listener_start: usize,
    /// Minimum inter-beat interval applied per pid, in nanoseconds.
    /// `None` means no rate limiting (the default).
    rate_limit_interval_ns: Option<u64>,
    /// Beats dropped by the per-pid and global rate limiters since the last drain.
    /// Index 0 = per-pid (`RateLimitReason::PerPid`), 1 = global (`RateLimitReason::Global`).
    rate_limited_total: [u64; RATE_LIMIT_N],
    /// Global per-observer token bucket for defeating per-pid rotation attacks.
    global_rl: GlobalRateLimit,
    /// Monotonicity guard — last `now_ns()` value, clamped forward-only to
    /// survive TSC drift and VM live migration.
    last_now_ns: u64,
    /// Count of times the underlying monotonic clock returned a value
    /// strictly less than `last_now_ns` and the clamp absorbed the
    /// regression. Surfaced as `varta_observer_clock_regression_total` so
    /// operators can alert on TSC drift / VM-live-migration events that
    /// would otherwise be invisible. Drained via
    /// [`Observer::drain_clock_regressions`].
    clock_regressions: u64,
    /// Count of times consecutive `now_ns()` readings advanced by more than
    /// [`CLOCK_JUMP_FORWARD_THRESHOLD_NS`] in a single poll tick. This
    /// captures sleep/wake on `monotonic-raw`/`boottime`, VM live migration,
    /// and hypervisor pauses that are invisible to the regression counter.
    /// Surfaced as `varta_observer_clock_jump_forward_total`. Drained via
    /// [`Observer::drain_clock_jumps_forward`].
    clock_jumps_forward: u64,
    /// When true, beats from agents whose kernel-attested PID namespace
    /// differs from the observer's are admitted into the tracker (and may
    /// later be passed to recovery). Set by `--allow-cross-namespace-agents`.
    /// Default `false` — beats from cross-namespace agents are dropped at
    /// ingress and counted via [`Observer::drain_cross_namespace_drops`].
    allow_cross_namespace: bool,
    /// Count of beats dropped at ingress because the kernel-attested peer's
    /// PID namespace inode differs from the observer's. Linux-only signal;
    /// 0 on other platforms.
    cross_namespace_drops: u64,
    /// Maximum PID accepted on the wire — cached from
    /// `/proc/sys/kernel/pid_max` on Linux at observer startup. On non-Linux
    /// targets and when `/proc` is unreadable, this is `u32::MAX` (gate
    /// effectively disabled). See [`crate::pid_max::read_pid_max`].
    pid_max: u32,
    /// Count of beats dropped at ingress because `frame.pid > pid_max`.
    /// Surfaced as `varta_frame_rejected_pid_above_max_total`.
    pid_above_max_drops: u64,
    /// Monotonic-clock timestamp (ns) of the most recent `pid_max` refresh
    /// from `/proc/sys/kernel/pid_max`. `0` until the first periodic refresh
    /// fires from [`Observer::maybe_refresh_pid_max`]; the value cached at
    /// `Observer::new` covers the startup window until then. Compared against
    /// `self.now_ns()` with [`PID_MAX_REFRESH_INTERVAL_NS`].
    last_pid_max_refresh_ns: u64,
    /// Effective `SO_RCVBUF` size granted by the kernel for the observer UDS,
    /// in bytes.  `0` if `--uds-rcvbuf-bytes 0` was used or tuning failed.
    /// Set by [`Observer::bind`] from the [`UdsListener::rcvbuf_bytes`] accessor.
    pub uds_rcvbuf_bytes: u32,
}

impl Observer {
    /// Create an empty observer with no listeners. Use
    /// [`Observer::add_listener`] to attach transports, or call
    /// [`Observer::bind`] for the common single-UDS case.
    ///
    /// `threshold` is the silence interval after which a tracked pid is
    /// considered stalled; it is stored in nanoseconds, saturating at
    /// `u64::MAX`.
    ///
    /// `tracker_capacity` sets the maximum number of distinct agent pids
    /// tracked concurrently. Beats for new pids beyond this limit are
    /// dropped with [`Update::CapacityExceeded`] (the counter is surfaced
    /// via `varta_tracker_capacity_exceeded_total`). The stall queue is
    /// pre-sized to the same capacity.
    ///
    /// `eviction_policy` controls which slot to reclaim when the tracker
    /// is full and a new pid arrives ([`EvictionPolicy::Strict`] only
    /// evicts confirmed-stalled agents; [`EvictionPolicy::Balanced`] also
    /// evicts the oldest active slot to prevent capacity exhaustion).
    /// `eviction_scan_window` is forwarded to the tracker alongside it.
    ///
    /// `max_beat_rate` is an optional per-pid rate limit in beats per
    /// second.  When set, beats arriving faster than this rate from the
    /// same pid are dropped and counted via [`Observer::drain_rate_limited`].
    /// `None` (the default) or `Some(0)` disables rate limiting. Rates above
    /// 1 GHz are clamped to a 1 ns minimum interval.
    ///
    /// `global_beat_rate` / `global_beat_burst` configure the observer-wide
    /// token bucket ([`GlobalRateLimit`]); a value of `0` for either
    /// disables it.
    ///
    /// `clock_source` selects the kernel clock used for all observer
    /// timestamps.
    ///
    /// # Errors
    ///
    /// Returns an error if the requested clock source cannot be initialised.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        threshold: Duration,
        tracker_capacity: usize,
        eviction_policy: EvictionPolicy,
        eviction_scan_window: usize,
        max_beat_rate: Option<u32>,
        global_beat_rate: u32,
        global_beat_burst: u32,
        clock_source: ClockSource,
    ) -> io::Result<Self> {
        let threshold_ns = threshold.as_nanos().min(u64::MAX as u128) as u64;
        // Convert beats/sec to a minimum inter-beat interval in nanoseconds.
        // Integer division truncates to 0 for rates above 1 GHz, and a 0 ns
        // interval can never reject a beat, so the limiter would be silently
        // disabled. Clamp to 1 ns instead.
        let rate_limit_interval_ns = max_beat_rate
            .filter(|&rps| rps != 0)
            .map(|rps| (1_000_000_000u64 / u64::from(rps)).max(1));
        let clock = Clock::new(clock_source).map_err(io::Error::from)?;
        Ok(Observer {
            listeners: Vec::new(),
            tracker: Tracker::new(tracker_capacity, eviction_policy, eviction_scan_window),
            threshold_ns,
            clock,
            stall_queue: Vec::with_capacity(tracker_capacity),
            stall_cursor: 0,
            next_listener_start: 0,
            rate_limit_interval_ns,
            rate_limited_total: [0; RATE_LIMIT_N],
            global_rl: GlobalRateLimit::new(global_beat_rate, global_beat_burst),
            last_now_ns: 0,
            clock_regressions: 0,
            clock_jumps_forward: 0,
            allow_cross_namespace: false,
            cross_namespace_drops: 0,
            pid_max: crate::pid_max::read_pid_max(),
            pid_above_max_drops: 0,
            last_pid_max_refresh_ns: 0,
            uds_rcvbuf_bytes: 0,
        })
    }

    /// Builder-style toggle for the cross-namespace ingress gate.
    ///
    /// With `allow == true`, beats whose kernel-attested PID namespace differs
    /// from the observer's own are admitted instead of dropped. The default
    /// is `false`. Wired from the `--allow-cross-namespace-agents` CLI flag.
    pub fn with_allow_cross_namespace(self, allow: bool) -> Self {
        let mut observer = self;
        observer.allow_cross_namespace = allow;
        observer
    }

    /// Build an observer via [`Observer::new`] and attach `listener` as its
    /// only transport.
    ///
    /// All other arguments are forwarded to [`Observer::new`] unchanged; any
    /// error from it is returned as-is and the listener is dropped.
    #[allow(clippy::too_many_arguments)]
    pub fn from_listener<L: BeatListener + 'static>(
        listener: L,
        threshold: Duration,
        tracker_capacity: usize,
        eviction_policy: EvictionPolicy,
        eviction_scan_window: usize,
        max_beat_rate: Option<u32>,
        global_beat_rate: u32,
        global_beat_burst: u32,
        clock_source: ClockSource,
    ) -> io::Result<Self> {
        Self::new(
            threshold,
            tracker_capacity,
            eviction_policy,
            eviction_scan_window,
            max_beat_rate,
            global_beat_rate,
            global_beat_burst,
            clock_source,
        )
        .map(|mut observer| {
            observer.add_listener(Box::new(listener));
            observer
        })
    }

    /// Convenience constructor for the single-UDS case: bind a Unix datagram
    /// socket at `path` and wrap it in an [`Observer`].
    ///
    /// The socket-level arguments (`socket_mode`, `read_timeout`,
    /// `uds_rcvbuf_bytes`, `pre_thread`) go to [`UdsListener::bind`]; the
    /// rest go to [`Observer::from_listener`]. The receive-buffer size
    /// reported by the listener is recorded in
    /// [`Observer::uds_rcvbuf_bytes`].
    ///
    /// Kept for backward compatibility. For multi-transport setups, use
    /// [`Observer::new`] followed by [`Observer::add_listener`].
    #[allow(clippy::too_many_arguments)]
    pub fn bind(
        path: impl AsRef<Path>,
        threshold: Duration,
        socket_mode: u32,
        read_timeout: Duration,
        uds_rcvbuf_bytes: u32,
        tracker_capacity: usize,
        eviction_policy: EvictionPolicy,
        eviction_scan_window: usize,
        max_beat_rate: Option<u32>,
        global_beat_rate: u32,
        global_beat_burst: u32,
        clock_source: ClockSource,
        pre_thread: &PreThreadAttestation,
    ) -> io::Result<Self> {
        let uds = UdsListener::bind(
            path,
            socket_mode,
            read_timeout,
            uds_rcvbuf_bytes,
            pre_thread,
        )?;
        // Read the size before the listener is moved into the observer.
        let granted_rcvbuf = uds.rcvbuf_bytes();
        Self::from_listener(
            uds,
            threshold,
            tracker_capacity,
            eviction_policy,
            eviction_scan_window,
            max_beat_rate,
            global_beat_rate,
            global_beat_burst,
            clock_source,
        )
        .map(|mut observer| {
            observer.uds_rcvbuf_bytes = granted_rcvbuf;
            observer
        })
    }

    /// Add a listener to the observer. The listener is appended after any
    /// existing listeners and polled in round-robin order alongside them by
    /// [`Observer::poll`].
    pub fn add_listener(&mut self, listener: Box<dyn BeatListener>) {
        self.listeners.push(listener);
    }

    /// Poll every listener once, starting at the round-robin cursor
    /// (`next_listener_start`), and return the first [`Event`] produced.
    ///
    /// Each listener is tried exactly once per call, even after an event has
    /// already been selected. Datagrams received after that point still
    /// update the tracker and the drop counters, but any event they would
    /// have produced (including I/O errors) is discarded, because only the
    /// first event is returned.
    ///
    /// Fairness: while no event has been selected yet, every listener that
    /// yields a datagram or an error moves the cursor to the listener after
    /// it, so the next call starts past it and a busy listener cannot
    /// starve the others.
    ///
    /// The following are dropped without producing an event: short reads,
    /// frames whose pid exceeds `pid_max`, frames rejected by the global or
    /// per-pid rate limiter, and beats the tracker reports as out-of-order
    /// or over capacity. With no listeners attached the receive loop is
    /// skipped and `None` is returned.
    ///
    /// **Latency bound:** worst-case per-call work is
    /// `N_listeners × per-listener-recv-cost + eviction_scan_window`.
    /// Under the canonical stress profile (3 listeners, 4096 tracker
    /// capacity, 256-slot eviction window) the p99 iteration time is
    /// ≤ 5 ms — see `book/src/architecture/observer-liveness.md` and the
    /// `tick-distribution` bench (`cargo run -p varta-bench --release --
    /// tick-distribution`) which asserts this bound under sustained load.
    ///
    /// This method never returns [`Event::Stall`] — queued stall events must
    /// be retrieved via [`Observer::poll_pending`]. The stall queue is
    /// refreshed at the end of every call.
    pub fn poll(&mut self) -> Option<Event> {
        let len = self.listeners.len();
        let start = self.next_listener_start;
        // Only the first event found in this call is kept and returned.
        let mut first_event: Option<Event> = None;
        let mut round = 0;
        while round < len {
            let i = (start + round) % len;
            // Incremented up front so every `continue` below advances the loop.
            round += 1;
            match self.listeners[i].recv() {
                RecvResult::Authenticated {
                    peer_pid,
                    peer_uid: _,
                    peer_pid_ns_inode,
                    origin,
                    data,
                } => {
                    let now_ns = self.now_ns();
                    // Advance the fairness cursor past this listener unless an
                    // earlier listener already produced this call's event.
                    if first_event.is_none() {
                        self.next_listener_start = (i + 1) % len;
                    }
                    match Frame::decode(&data) {
                        Ok(frame) => {
                            // Observer-side PID range gate. VLP rejects 0/1
                            // as wire-format `BadPid`; here we additionally
                            // reject any pid above the kernel's configured
                            // `pid_max` (Linux) — no live process can hold
                            // that id, so the frame is either corrupted or
                            // forged. Non-Linux: `pid_max == u32::MAX`,
                            // gate is a no-op.
                            if frame.pid > self.pid_max {
                                self.pid_above_max_drops =
                                    self.pid_above_max_drops.saturating_add(1);
                                continue;
                            }
                            // Per-datagram PID verification — works on Linux
                            // (SCM_CREDENTIALS via SO_PASSCRED) and macOS
                            // (LOCAL_PEERTOKEN via getsockopt). For transports
                            // without kernel credential support, peer_pid is 0
                            // and this check is a no-op.
                            if peer_pid != 0 && frame.pid != peer_pid {
                                if first_event.is_none() {
                                    first_event = Some(Event::AuthFailure {
                                        claimed_pid: frame.pid,
                                        observer_ns: now_ns,
                                    });
                                }
                                continue;
                            }
                            // Global token bucket: drop BEFORE namespace /
                            // per-pid classification so a rotation attack
                            // cannot exhaust classification work.
                            if !self.global_rl.try_consume(now_ns) {
                                self.rate_limited_total[RateLimitReason::Global as usize] =
                                    self.rate_limited_total[RateLimitReason::Global as usize]
                                        .saturating_add(1);
                                continue;
                            }
                            // Cross-namespace gate (Linux only). When the
                            // kernel-attested peer's PID namespace inode
                            // differs from the observer's, the frame.pid
                            // cannot safely be used to target recovery
                            // commands. The check is a no-op on non-Linux
                            // (both inodes are `None`), for UDP transports
                            // (peer inode is `None`), and when the operator
                            // has opted in via --allow-cross-namespace-agents.
                            let observer_ns_inode =
                                crate::peer_cred::observer_pid_namespace_inode();
                            let cross_ns = matches!(
                                (observer_ns_inode, peer_pid_ns_inode),
                                (Some(a), Some(b)) if a != b
                            );
                            if cross_ns && !self.allow_cross_namespace {
                                self.cross_namespace_drops =
                                    self.cross_namespace_drops.saturating_add(1);
                                if first_event.is_none() {
                                    first_event = Some(Event::NamespaceConflict {
                                        claimed_pid: frame.pid,
                                        observed_ns_inode: peer_pid_ns_inode,
                                        observer_ns_inode,
                                        observer_ns: now_ns,
                                    });
                                }
                                continue;
                            }
                            // Per-pid rate limiting: if a minimum inter-beat
                            // interval is configured, skip frames that arrive
                            // too soon from the same pid. Pids the tracker has
                            // no timestamp for are never limited here. Note the
                            // global token above has already been spent.
                            if let Some(interval_ns) = self.rate_limit_interval_ns {
                                if let Some(last_ns) = self.tracker.last_ns_of(frame.pid) {
                                    if now_ns.saturating_sub(last_ns) < interval_ns {
                                        self.rate_limited_total[RateLimitReason::PerPid as usize] =
                                            self.rate_limited_total
                                                [RateLimitReason::PerPid as usize]
                                                .saturating_add(1);
                                        continue;
                                    }
                                }
                            }
                            // Capture the slot's pre-record pinned origin (if
                            // any) so an OriginConflict event can report what
                            // the slot was pinned to without an extra lookup
                            // afterwards.
                            let slot_origin_before = self.tracker.origin_of(frame.pid);
                            match self.tracker.record(
                                &frame,
                                now_ns,
                                self.threshold_ns,
                                origin,
                                peer_pid_ns_inode,
                            ) {
                                Update::Inserted | Update::Refreshed => {
                                    if first_event.is_none() {
                                        first_event = Some(Event::Beat {
                                            pid: frame.pid,
                                            status: frame.status,
                                            payload: frame.payload,
                                            nonce: frame.nonce,
                                            origin,
                                            pid_ns_inode: peer_pid_ns_inode,
                                            observer_ns: now_ns,
                                        });
                                    }
                                }
                                Update::OriginConflict => {
                                    if first_event.is_none() {
                                        first_event = Some(Event::OriginConflict {
                                            claimed_pid: frame.pid,
                                            observed_origin: origin,
                                            // Falls back to the observed origin
                                            // if no pinned origin was found
                                            // before the record call.
                                            slot_origin: slot_origin_before.unwrap_or(origin),
                                            observer_ns: now_ns,
                                        });
                                    }
                                }
                                Update::NamespaceConflict => {
                                    if first_event.is_none() {
                                        first_event = Some(Event::NamespaceConflict {
                                            claimed_pid: frame.pid,
                                            observed_ns_inode: peer_pid_ns_inode,
                                            // Here the field carries the inode
                                            // the tracker holds for this pid,
                                            // not the observer's own inode.
                                            observer_ns_inode: self
                                                .tracker
                                                .pid_ns_inode_of(frame.pid)
                                                .flatten(),
                                            observer_ns: now_ns,
                                        });
                                    }
                                }
                                // Silently dropped at this layer.
                                Update::OutOfOrder | Update::CapacityExceeded => {}
                            }
                        }
                        Err(e) => {
                            if first_event.is_none() {
                                first_event = Some(Event::Decode(e, now_ns));
                            }
                        }
                    }
                }
                RecvResult::WouldBlock => continue,
                RecvResult::ShortRead => continue,
                RecvResult::CtrlTruncated(e) => {
                    // Reported only if it is the first event of this call;
                    // otherwise the error is discarded.
                    if first_event.is_none() {
                        self.next_listener_start = (i + 1) % len;
                        first_event = Some(Event::CtrlTruncated(e, self.now_ns()));
                    }
                }
                RecvResult::IoError(e) => {
                    // Reported only if it is the first event of this call;
                    // otherwise the error is discarded.
                    if first_event.is_none() {
                        self.next_listener_start = (i + 1) % len;
                        first_event = Some(Event::Io(e, self.now_ns()));
                    }
                }
            }
        }
        // Refresh the stall queue on every call, event or not.
        self.drain_stalls();
        first_event
    }

    /// Hand out the next queued [`Event::Stall`] and advance the queue
    /// cursor. Returns `None` once the cursor has reached the end of the
    /// queue; in that case the cursor is left untouched.
    pub fn poll_pending(&mut self) -> Option<Event> {
        let slot = self.stall_queue.get_mut(self.stall_cursor)?;
        let next_stall = slot.take();
        self.stall_cursor += 1;
        next_stall
    }

    /// `true` while [`Observer::poll_pending`] still has queued
    /// [`Event::Stall`] entries to hand out, i.e. the cursor has not yet
    /// reached the end of the stall queue.
    pub fn has_pending_stalls(&self) -> bool {
        self.stall_queue.len() > self.stall_cursor
    }

    /// Current observer-local time in nanoseconds (ns since [`Observer`]
    /// start), guaranteed never to decrease between calls.
    ///
    /// The raw reading comes from the configured clock and is passed through
    /// the forward-only clamp. On some platforms (VMs with TSC drift,
    /// live-migration pause-and-resume) the underlying clock can appear to
    /// step backwards; without the clamp, a forward jump following such an
    /// excursion can cause false stall detections. Regressions and large
    /// forward jumps are counted as a side effect.
    ///
    /// The kernel clock backing this reading is selected via
    /// [`crate::clock::ClockSource`] (`--clock-source` CLI flag); see
    /// `book/src/architecture/safety-profiles.md` for the SRE vs. medical
    /// deployment matrix.
    pub fn now_ns(&mut self) -> u64 {
        let reading = self.clock.now_ns();
        self.apply_raw_clock(reading)
    }

    /// Fold a raw clock reading into the forward-only clamp and return the
    /// clamped value.
    ///
    /// A reading below the previous clamped value bumps the regression
    /// counter. A reading more than [`CLOCK_JUMP_FORWARD_THRESHOLD_NS`] ahead
    /// of it bumps the forward-jump counter, except when no previous reading
    /// has been recorded yet (`last_now_ns == 0`).
    fn apply_raw_clock(&mut self, raw: u64) -> u64 {
        let previous = self.last_now_ns;
        match raw.checked_sub(previous) {
            // `raw < previous`: the clock stepped backwards.
            None => {
                self.clock_regressions = self.clock_regressions.saturating_add(1);
            }
            Some(advance) if previous > 0 && advance > CLOCK_JUMP_FORWARD_THRESHOLD_NS => {
                self.clock_jumps_forward = self.clock_jumps_forward.saturating_add(1);
            }
            Some(_) => {}
        }
        self.last_now_ns = previous.max(raw);
        self.last_now_ns
    }

    /// Test-only hook: push a synthetic raw clock value through the same
    /// clamp-and-count path as a real reading, without touching
    /// `self.clock`. Lets tests stage forward jumps and regressions without
    /// waiting for real time to pass.
    #[cfg(test)]
    pub(crate) fn apply_raw_clock_test(&mut self, raw: u64) -> u64 {
        Self::apply_raw_clock(self, raw)
    }

    /// Return the clock-regression count accumulated since the last drain
    /// and reset it to zero.
    ///
    /// A regression is recorded each time a clock reading came back
    /// strictly lower than the previously observed one and the forward
    /// clamp absorbed it. Non-zero values surface TSC drift, VM live
    /// migration, or other anomalous clock behavior that would otherwise be
    /// invisible. Surfaced as `varta_observer_clock_regression_total`.
    pub fn drain_clock_regressions(&mut self) -> u64 {
        std::mem::take(&mut self.clock_regressions)
    }

    /// Return the forward-jump count accumulated since the last drain and
    /// reset it to zero.
    ///
    /// A jump is recorded each time the clock advanced by more than
    /// [`CLOCK_JUMP_FORWARD_THRESHOLD_NS`] between two consecutive readings.
    /// Non-zero values indicate sleep/wake on `monotonic-raw`/`boottime`,
    /// VM live migration, or a hypervisor pause. Surfaced as
    /// `varta_observer_clock_jump_forward_total`.
    pub fn drain_clock_jumps_forward(&mut self) -> u64 {
        std::mem::replace(&mut self.clock_jumps_forward, 0)
    }

    /// Report which kernel clock backs this observer's stall accounting, as
    /// reported by the observer's clock.
    pub fn clock_source(&self) -> ClockSource {
        let clock = &self.clock;
        clock.source()
    }

    /// Refill the stall queue from the tracker, but only once every entry of
    /// the previous batch has been handed out.
    ///
    /// While the cursor is still inside the queue this is a no-op, so
    /// pending stalls are never overwritten. Otherwise the queue and cursor
    /// are reset and every slot the tracker reports as stalled (given the
    /// current time and `threshold_ns`) is queued as an [`Event::Stall`]
    /// stamped with the same `observer_ns`.
    fn drain_stalls(&mut self) {
        let batch_consumed = self.stall_cursor >= self.stall_queue.len();
        if !batch_consumed {
            return;
        }

        let observer_ns = self.now_ns();
        self.stall_cursor = 0;
        self.stall_queue.clear();

        // Borrow the queue on its own so the closure does not need `self`
        // while the tracker is mutably borrowed for the drain.
        let queue = &mut self.stall_queue;
        self.tracker.drain_stalled_slots(
            observer_ns,
            self.threshold_ns,
            |pid, last_nonce, last_ns, origin, pid_ns_inode| {
                let stall = Event::Stall {
                    pid,
                    last_nonce,
                    last_ns,
                    origin,
                    pid_ns_inode,
                    observer_ns,
                };
                queue.push(Some(stall));
            },
        );
    }

    /// Take the tracker's eviction count and reset it.
    ///
    /// Thin pass-through to the tracker's `take_evictions`.
    pub fn drain_evictions(&mut self) -> u64 {
        let tracker = &mut self.tracker;
        tracker.take_evictions()
    }

    /// Take the pid of the most recently evicted slot, if the tracker has
    /// one recorded.
    ///
    /// Thin pass-through to the tracker's `take_evicted_pid`.
    pub fn drain_evicted_pid(&mut self) -> Option<u32> {
        let tracker = &mut self.tracker;
        tracker.take_evicted_pid()
    }

    /// Take the tracker's capacity-exceeded count and reset it.
    ///
    /// Thin pass-through to the tracker's `take_capacity_exceeded`.
    pub fn drain_capacity_exceeded(&mut self) -> u64 {
        let tracker = &mut self.tracker;
        tracker.take_capacity_exceeded()
    }

    /// Take the tracker's nonce-wrap count and reset it.
    ///
    /// Thin pass-through to the tracker's `take_nonce_wraps`.
    pub fn drain_nonce_wraps(&mut self) -> u64 {
        let tracker = &mut self.tracker;
        tracker.take_nonce_wraps()
    }

    /// Take and reset the number of bounded eviction scans that ran the
    /// full [`crate::tracker::EVICTION_SCAN_WINDOW`] without finding a
    /// victim.
    ///
    /// A non-zero value proves the per-frame work cap engaged — the tracker
    /// was full and an attacker would otherwise have forced O(n) work per
    /// arriving frame.
    pub fn drain_eviction_scan_truncated(&mut self) -> u64 {
        let tracker = &mut self.tracker;
        tracker.take_eviction_scan_truncated()
    }

    /// Take and reset the tracker's origin-conflict count: beats dropped
    /// because their transport origin disagreed with the slot's pinned
    /// origin (first-origin-wins).
    ///
    /// Surfaced as `varta_origin_conflict_total` in the Prometheus exporter.
    pub fn drain_origin_conflicts(&mut self) -> u64 {
        let tracker = &mut self.tracker;
        tracker.take_origin_conflicts()
    }

    /// Return and zero the number of beats dropped at ingress because the
    /// peer's PID-namespace inode differs from the observer's.
    ///
    /// Surfaced as `varta_frame_namespace_mismatch_total` in the Prometheus
    /// exporter.
    pub fn drain_cross_namespace_drops(&mut self) -> u64 {
        std::mem::take(&mut self.cross_namespace_drops)
    }

    /// Return and zero the number of beats dropped at ingress because
    /// `frame.pid` exceeded the kernel's configured `pid_max`.
    ///
    /// Surfaced as `varta_frame_rejected_pid_above_max_total` in the
    /// Prometheus exporter. Linux-only signal; 0 on platforms where the gate
    /// defaults to `u32::MAX`.
    pub fn drain_pid_above_max_drops(&mut self) -> u64 {
        std::mem::take(&mut self.pid_above_max_drops)
    }

    /// Observer's cached `pid_max`.
    ///
    /// Returns the stored field as-is; this accessor never re-reads the
    /// kernel value itself (see `maybe_refresh_pid_max` for the refresh
    /// path). Linux-only meaningful value; otherwise `u32::MAX`. Exposed for
    /// tests and for the Prometheus exporter's gauge.
    pub fn pid_max(&self) -> u32 {
        self.pid_max
    }

    /// Refresh the cached `pid_max` from `/proc/sys/kernel/pid_max` when at
    /// least [`PID_MAX_REFRESH_INTERVAL_NS`] has passed since the previous
    /// refresh; otherwise do nothing beyond one clock read and a `u64`
    /// compare.
    ///
    /// Meant for the daemon's maintenance phase — *not* `poll()` — so the
    /// I/O hot path stays untouched. A runtime
    /// `sysctl -w kernel.pid_max=...` change is therefore picked up within
    /// one interval. On non-Linux targets [`crate::pid_max::read_pid_max`]
    /// returns `u32::MAX`, so the gate stays effectively disabled and the
    /// cached value never changes.
    ///
    /// Returns `true` when a refresh ran during this call (whether or not
    /// the value changed) and `false` when the interval gate skipped it.
    pub fn maybe_refresh_pid_max(&mut self) -> bool {
        let now_ns = self.now_ns();
        let since_last_refresh = now_ns.saturating_sub(self.last_pid_max_refresh_ns);
        let refresh_due = since_last_refresh >= PID_MAX_REFRESH_INTERVAL_NS;
        if refresh_due {
            self.pid_max = crate::pid_max::read_pid_max();
            self.last_pid_max_refresh_ns = now_ns;
        }
        refresh_due
    }

    /// Take and reset the tracker's namespace-conflict count: beats dropped
    /// because the beat's namespace inode disagreed with the slot's pinned
    /// namespace inode (first-namespace-wins).
    ///
    /// Surfaced as `varta_tracker_namespace_conflict_total`.
    pub fn drain_namespace_conflicts(&mut self) -> u64 {
        let tracker = &mut self.tracker;
        tracker.take_namespace_conflicts()
    }

    /// Observer's own PID-namespace inode (Linux only; cached). Used by
    /// `main.rs` to construct recovery `StallSource` values that include
    /// the observer's namespace for the audit record.
    ///
    /// No per-instance state is consulted: `self` is unused and the value
    /// comes straight from `crate::peer_cred::observer_pid_namespace_inode`.
    /// `None` means that helper had no inode to report.
    pub fn observer_pid_namespace_inode(&self) -> Option<u64> {
        crate::peer_cred::observer_pid_namespace_inode()
    }

    /// Take and reset the tracker's invariant-violation count.
    ///
    /// A non-zero value means a defensive fall-through in the hot path
    /// triggered (e.g. a stale `PidIndex` entry pointed at an out-of-range
    /// slot). Exposed as `varta_tracker_invariant_violations_total`.
    pub fn drain_invariant_violations(&mut self) -> u64 {
        let tracker = &mut self.tracker;
        tracker.take_invariant_violations()
    }

    /// Take and reset the `PidIndex` probe-exhaustion count: the number of
    /// pid lookups that ran the full `MAX_PROBE` budget without finding a
    /// match.
    ///
    /// Surfaced as `varta_tracker_pid_index_probe_exhausted_total`.
    pub fn drain_pid_index_probe_exhausted(&mut self) -> u64 {
        let tracker = &mut self.tracker;
        tracker.take_probe_exhausted()
    }

    /// Return and zero the count of beats dropped by the per-pid rate
    /// limit, i.e. the [`RateLimitReason::PerPid`] slot of the rate-limit
    /// counters.
    pub fn drain_per_pid_rate_limited(&mut self) -> u64 {
        let slot = RateLimitReason::PerPid as usize;
        std::mem::take(&mut self.rate_limited_total[slot])
    }

    /// Return and zero the count of beats dropped by the global rate limit,
    /// i.e. the [`RateLimitReason::Global`] slot of the rate-limit counters.
    pub fn drain_global_rate_limited(&mut self) -> u64 {
        let slot = RateLimitReason::Global as usize;
        std::mem::take(&mut self.rate_limited_total[slot])
    }

    /// Effective `SO_RCVBUF` size granted by the kernel for the observer UDS.
    ///
    /// Returns the value stored on the observer; this accessor does not
    /// query the socket itself.
    pub fn uds_rcvbuf_bytes(&self) -> u32 {
        self.uds_rcvbuf_bytes
    }

    /// Sum the AEAD decryption-failure counts of every listener, draining
    /// (and thereby resetting) each listener's counter along the way.
    pub fn drain_decrypt_failures(&mut self) -> u64 {
        let mut total = 0u64;
        for listener in self.listeners.iter_mut() {
            total += listener.drain_decrypt_failures();
        }
        total
    }

    /// Sum the truncated-datagram counts of every listener, draining (and
    /// thereby resetting) each listener's counter along the way.
    pub fn drain_truncated(&mut self) -> u64 {
        let mut total = 0u64;
        for listener in self.listeners.iter_mut() {
            total += listener.drain_truncated();
        }
        total
    }

    /// Sum the sender-state-full counts of every listener, draining (and
    /// thereby resetting) each listener's counter along the way.
    pub fn drain_sender_state_full(&mut self) -> u64 {
        self.listeners
            .iter_mut()
            .fold(0u64, |total, listener| total + listener.drain_sender_state_full())
    }

    /// Sum the AEAD-decryption-attempt counts of every listener, draining
    /// (and thereby resetting) each listener's counter along the way.
    ///
    /// In steady state the total equals
    /// `frames_received * (keys.len() + master_key_configured as u64)` for
    /// the secure-UDP listener — every loaded key is tried per frame to
    /// remove the key-rotation timing side-channel.
    pub fn drain_aead_attempts(&mut self) -> u64 {
        self.listeners
            .iter_mut()
            .fold(0u64, |total, listener| total + listener.drain_aead_attempts())
    }

    /// Drain and reset the parent-directory fsync failure counter for UDS
    /// bind. Non-zero only when the OS returned an error from `fsync(2)` on
    /// the socket's parent directory during startup. Surfaced as
    /// `varta_socket_bind_dir_fsync_failed_total`.
    ///
    /// This is an associated function (no `self`): it forwards to
    /// `crate::listener::drain_bind_dir_fsync_failures`, so the counter is
    /// not tied to a particular `Observer` instance.
    pub fn drain_bind_dir_fsync_failures() -> u64 {
        crate::listener::drain_bind_dir_fsync_failures()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tracker::DEFAULT_EVICTION_SCAN_WINDOW;
    use std::path::PathBuf;
    use std::sync::atomic::{AtomicU64, Ordering};

    /// Process-wide sequence number that keeps socket paths distinct between
    /// tests running in the same process.
    static TEST_COUNTER: AtomicU64 = AtomicU64::new(0);

    /// Build a fresh socket path under the system temp directory, named
    /// after the current process id and the next sequence number. Any file
    /// already sitting at that path is removed on a best-effort basis, so
    /// the caller starts from a clean slate.
    fn unique_sock_path() -> PathBuf {
        let seq = TEST_COUNTER.fetch_add(1, Ordering::Relaxed);
        let file_name = format!("varta-observer-drop-{}-{}.sock", std::process::id(), seq);
        let candidate = std::env::temp_dir().join(file_name);
        // A missing file is the expected case; ignore the error.
        let _ = std::fs::remove_file(&candidate);
        candidate
    }

    /// Binding creates the socket file and dropping the observer removes it.
    #[test]
    #[allow(unsafe_code)]
    fn drop_unlinks_bound_socket() {
        // SAFETY: the unit-test runner may already be multi-threaded, but
        // the umask window is benign here because no concurrent thread
        // creates files at our unique temp path.
        let attestation = unsafe { PreThreadAttestation::new_unchecked() };
        let sock_path = unique_sock_path();

        let bind_result = Observer::bind(
            &sock_path,
            Duration::from_secs(1),
            0o600,
            Duration::from_millis(100),
            0,
            64,
            EvictionPolicy::Strict,
            DEFAULT_EVICTION_SCAN_WINDOW,
            None,
            0,
            0,
            ClockSource::Monotonic,
            &attestation,
        );
        let observer = bind_result.expect("bind should succeed on a clean temp path");
        assert!(sock_path.exists(), "socket file must exist after bind");

        // Dropping the observer is what must unlink the socket file.
        drop(observer);
        assert!(
            !sock_path.exists(),
            "socket file must be removed after observer drop"
        );
    }

    #[test]
    fn maybe_refresh_pid_max_respects_interval() {
        // Drive the cadence gate without exercising the /proc read itself —
        // the value `read_pid_max` returns is host-dependent (kernel default
        // 4_194_304 on Linux, u32::MAX elsewhere); we assert the gate's
        // *timing* contract, not the value.
        //
        // The observer's monotonic clock is anchored to `Observer::new` (see
        // `Clock::new`), so `now_ns()` starts near zero and only crosses
        // PID_MAX_REFRESH_INTERVAL_NS after ~60 s of real uptime. The test
        // advances the observer's `last_now_ns` directly via the forward
        // clamp to simulate elapsed time without sleeping.
        let mut obs = Observer::new(
            Duration::from_secs(1),
            64,
            EvictionPolicy::Strict,
            DEFAULT_EVICTION_SCAN_WINDOW,
            None,
            0,
            0,
            ClockSource::Monotonic,
        )
        .expect("Observer::new should succeed");

        let initial = obs.pid_max();
        assert_eq!(
            obs.last_pid_max_refresh_ns, 0,
            "fresh Observer has not yet run a periodic refresh"
        );

        // Immediately after construction the observer clock is still inside
        // the startup window. `now_ns() - 0 < INTERVAL`, so the gate skips:
        // `Observer::new` has already read pid_max, no need to re-read yet.
        let refreshed_at_startup = obs.maybe_refresh_pid_max();
        assert!(
            !refreshed_at_startup,
            "first call within startup window must skip (Observer::new already read pid_max)"
        );
        assert_eq!(
            obs.last_pid_max_refresh_ns, 0,
            "skip must leave the timestamp untouched"
        );

        // Simulate >60 s of observer uptime by pushing the forward-clamped
        // monotonic anchor past the interval. The next `now_ns()` reading
        // will be clamped to at least this value.
        obs.last_now_ns = PID_MAX_REFRESH_INTERVAL_NS + 1_000_000_000;
        // From here on, each `now_ns()` reading whose real raw clock is
        // still below the pushed anchor is counted as a clock regression.
        // Those counts are drained at the end of the test.
        let refreshed_after_interval = obs.maybe_refresh_pid_max();
        assert!(
            refreshed_after_interval,
            "refresh must fire once the interval has elapsed since startup"
        );
        let first_ts = obs.last_pid_max_refresh_ns;
        assert!(
            first_ts >= PID_MAX_REFRESH_INTERVAL_NS,
            "post-interval refresh stamps a fresh timestamp >= INTERVAL"
        );
        assert_eq!(
            obs.pid_max(),
            initial,
            "refresh re-reads the same host value within a single test process"
        );

        // Immediate follow-up: the gate must close again until another full
        // interval elapses.
        let refreshed_again = obs.maybe_refresh_pid_max();
        assert!(
            !refreshed_again,
            "second call within new interval must skip"
        );
        assert_eq!(
            obs.last_pid_max_refresh_ns, first_ts,
            "skip must leave the new timestamp untouched"
        );

        // Rewind the recorded timestamp by more than the interval and confirm
        // the gate opens again.
        obs.last_pid_max_refresh_ns = first_ts.saturating_sub(PID_MAX_REFRESH_INTERVAL_NS + 1);
        let refreshed_after_rewind = obs.maybe_refresh_pid_max();
        assert!(
            refreshed_after_rewind,
            "refresh must fire after rewinding the recorded timestamp"
        );
        assert!(
            obs.last_pid_max_refresh_ns >= first_ts,
            "rewind-driven refresh records a fresh timestamp"
        );

        // Pushing `last_now_ns` past the real raw clock made the later
        // `now_ns()` readings (one per `maybe_refresh_pid_max` call) count as
        // clock regressions. The counter is a field of this `obs` instance,
        // so draining it here is hygiene only; the value is deliberately not
        // asserted.
        let _ = obs.drain_clock_regressions();
    }

    /// A raw reading below the recorded time is clamped and counted; the
    /// counter accumulates until drained.
    #[test]
    fn clock_regression_counter_increments_on_backward_clock() {
        // Stand-in for "the clock previously reported a value far in the
        // future" (e.g. before a VM live migration that rewound the TSC).
        // Any real reading taken afterwards is strictly lower.
        let far_future_ns = u64::MAX / 2;

        let mut observer = Observer::new(
            Duration::from_secs(1),
            64,
            EvictionPolicy::Strict,
            DEFAULT_EVICTION_SCAN_WINDOW,
            None,
            0,
            0,
            ClockSource::Monotonic,
        )
        .expect("Observer::new should succeed");

        // Baseline: the first reading seeds `last_now_ns` through the
        // forward clamp and cannot be a regression.
        let _ = observer.now_ns();
        let after_baseline = observer.drain_clock_regressions();
        assert_eq!(after_baseline, 0, "no regressions after the first reading");

        // One backward excursion: the clamp must hold the larger value and
        // the counter must tick exactly once.
        observer.last_now_ns = far_future_ns;
        let clamped = observer.now_ns();
        assert_eq!(
            clamped, far_future_ns,
            "forward clamp preserves the larger value"
        );
        let after_first_excursion = observer.drain_clock_regressions();
        assert_eq!(after_first_excursion, 1, "exactly one regression observed");

        // Draining resets the counter, so an immediate second drain is zero.
        let after_drain = observer.drain_clock_regressions();
        assert_eq!(after_drain, 0, "drain must reset the counter");

        // Two further excursions without a drain in between accumulate.
        for _ in 0..2 {
            observer.last_now_ns = far_future_ns;
            let _ = observer.now_ns();
        }
        let after_two_excursions = observer.drain_clock_regressions();
        assert_eq!(
            after_two_excursions, 2,
            "counter is saturating-add cumulative until drained"
        );
    }

    /// Advances above the forward-jump threshold are counted; smaller
    /// advances and the bootstrap reading are not.
    #[test]
    fn clock_jump_forward_counter_increments_on_large_advance() {
        // Synthetic raw readings, fed through `apply_raw_clock_test` so the
        // test never has to wait for real time to pass.
        let prime_ns: u64 = 1_000_000; // 1 ms from baseline
        let after_big_jump_ns: u64 = 11_000_000_000; // +10 s jump
        let after_small_step_ns: u64 = 13_000_000_000; // +2 s, below the 5 s sentinel
        let bootstrap_reading_ns: u64 = 10_000_000_000; // 10 s from zero

        let mut observer = Observer::new(
            Duration::from_secs(1),
            64,
            EvictionPolicy::Strict,
            DEFAULT_EVICTION_SCAN_WINDOW,
            None,
            0,
            0,
            ClockSource::Monotonic,
        )
        .expect("Observer::new should succeed");

        // Prime with a small non-zero reading, then jump far past the
        // threshold in a single step.
        let _ = observer.apply_raw_clock_test(prime_ns);
        let _ = observer.apply_raw_clock_test(after_big_jump_ns);
        let jumps_after_big_jump = observer.drain_clock_jumps_forward();
        assert_eq!(
            jumps_after_big_jump, 1,
            "forward jump exceeding threshold must increment the counter"
        );
        let regressions_after_big_jump = observer.drain_clock_regressions();
        assert_eq!(
            regressions_after_big_jump, 0,
            "a forward jump must not also count as a regression"
        );

        // Draining resets the counter, so an immediate second drain is zero.
        let jumps_after_drain = observer.drain_clock_jumps_forward();
        assert_eq!(
            jumps_after_drain, 0,
            "drain must reset the forward-jump counter"
        );

        // A sub-threshold advance must not be counted.
        let _ = observer.apply_raw_clock_test(after_small_step_ns);
        let jumps_after_small_step = observer.drain_clock_jumps_forward();
        assert_eq!(
            jumps_after_small_step, 0,
            "advance below threshold must not be counted as a jump"
        );

        // Bootstrap case: with `last_now_ns == 0` even a large first reading
        // is start-up, not a jump.
        observer.last_now_ns = 0;
        let _ = observer.apply_raw_clock_test(bootstrap_reading_ns);
        let jumps_after_bootstrap = observer.drain_clock_jumps_forward();
        assert_eq!(
            jumps_after_bootstrap, 0,
            "initial read from last_now_ns==0 must not count as a forward jump"
        );
    }
}