Skip to main content

varta_watch/config/
types.rs

1use std::net::SocketAddr;
2use std::path::PathBuf;
3use std::time::Duration;
4
5use crate::clock::ClockSource;
6use crate::signal_install::SignalHandlerMode;
7use crate::tracker::EvictionPolicy;
8
/// Per-pid debounce window, in milliseconds, that takes effect when
/// `--recovery-exec` is given but `--recovery-debounce-ms` is not.
pub const DEFAULT_RECOVERY_DEBOUNCE_MS: u64 = 1_000;
12
/// File mode given to the UDS after bind when the operator does not pick
/// one: octal 0600, i.e. read and write for the owner only.  This limits
/// the blast radius — only the owning UID is able to talk to the
/// observer socket.
pub const DEFAULT_SOCKET_MODE: u32 = 0o600;
17
/// UDS read timeout, in milliseconds, used when none is configured.  The
/// cap keeps a stalled peer from holding the observer poll loop
/// indefinitely.
pub const DEFAULT_READ_TIMEOUT_MS: u64 = 100;
21
/// Smallest value `--threshold-ms` may take.  With a threshold of 0 ms
/// every agent would count as stalled all the time, so recovery commands
/// would fire on every poll cycle.
pub const MIN_THRESHOLD_MS: u64 = 10;
26
/// Refill rate, in connections per second and per source IP, of the token
/// bucket guarding the Prometheus `/metrics` endpoint.  It sits
/// comfortably above the 1-per-15-second cadence of typical Prometheus
/// scrapers, yet is low enough that a hostile actor on the same network
/// cannot exhaust file descriptors or saturate the observer's poll loop
/// with a flood of opens.
pub const DEFAULT_PROM_RATE_LIMIT_PER_SEC: u32 = 5;

/// Burst capacity of the per-source-IP token bucket.  A short cluster of
/// legitimate scrapes (e.g. a dashboard refresh) is tolerated, while a
/// sustained flood is still shut down within a few seconds.
pub const DEFAULT_PROM_RATE_LIMIT_BURST: u32 = 10;
38
/// Per-pid ceiling on the beat rate, in beats per second, applied when
/// the operator does not override it.  It is on by default so that a
/// baseline DoS ceiling always exists; `--max-beat-rate 0` switches it
/// off.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_MAX_BEAT_RATE: u32 = 100;

/// Aggregate ceiling, in beats per second, over all senders combined.
/// This hard cap is what defeats per-pid rotation attacks;
/// `--global-beat-rate 0` switches it off.  The value is sized for
/// 50 concurrent agents × 100 bps.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_GLOBAL_BEAT_RATE: u32 = 50 * 100;

/// Capacity of the global token bucket (its burst allowance).  It is
/// 2× the refill rate, so that 50 agents can co-restart within a 1 s
/// window.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_GLOBAL_BEAT_BURST: u32 = 10_000;
56
/// Default receive-buffer size requested via `SO_RCVBUF` on the observer
/// UDS.  1 MiB ≈ 32 768 × 32 B frames ≈ 6 s of full-burst headroom at the
/// default global rate.
///
/// On Linux the request is first clamped to `net.core.rmem_max`
/// (~208 KiB stock) and the clamped value is then doubled to leave room
/// for kernel bookkeeping, so the granted size usually differs from the
/// request; the gauge surfaces the actual granted value.  Set
/// `--uds-rcvbuf-bytes 0` to leave the kernel default.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_UDS_RCVBUF_BYTES: u32 = 1024 * 1024;
64
/// Wall-clock budget, in milliseconds, that [`crate::recovery::Recovery`]
/// spends blocked in its [`Drop`] impl while it waits for outstanding
/// recovery children to exit following a `kill(2)`.  Five seconds keeps
/// the constant that v0.1 had hard-coded.  systemd `TimeoutStopSec` has
/// to be no smaller than this value plus a small reap margin.
pub const DEFAULT_SHUTDOWN_GRACE_MS: u64 = 5 * 1_000;

/// Smallest value `--shutdown-grace-ms` may take.  With less than this
/// the shutdown poll loop cannot finish even a single
/// [`std::process::Child::try_wait`] round under load, and every
/// outstanding child would be orphaned to PID 1.
pub const MIN_SHUTDOWN_GRACE_MS: u64 = 100;
76
/// Per-child cap on captured output (stdout and stderr combined) used
/// when `--recovery-capture-stdio` is on.  4 KiB holds a typical
/// systemctl/journalctl output snippet without risking pipe-buffer
/// pressure from a chatty recovery command.
pub const DEFAULT_RECOVERY_CAPTURE_BYTES: u32 = 4 * 1024;

/// Largest value `--recovery-capture-bytes` may take.  Anything bigger
/// risks keeping too much child output in observer memory and makes the
/// non-blocking pipe drain expensive on every tick.
pub const MAX_RECOVERY_CAPTURE_BYTES: u32 = 1024 * 1024;
87
/// Smallest value `--iteration-budget-ms` may take.  A lower budget
/// would sit inside the noise floor of the work itself — by design
/// `serve_pending` alone may take up to ~200 ms — so every iteration
/// would be reported as an overrun and the metric would be useless.
// NOTE(review): the ~200 ms figure quoted above is larger than this 50 ms
// floor; confirm whether the floor or the rationale is the intended one.
pub const MIN_ITERATION_BUDGET_MS: u64 = 50;

/// Largest value `--iteration-budget-ms` may take.  Beyond it the soft
/// budget could no longer fire before `--self-watchdog-secs` aborts the
/// daemon, so the metric would stop being a useful early signal.
pub const MAX_ITERATION_BUDGET_MS: u64 = 60 * 1_000;
98
/// Smallest value `--scrape-budget-ms` may take.  A lower budget would
/// overlap the structural cap of `serve_pending` itself (100 ms serve +
/// 100 ms drain = 200 ms worst case) and therefore fire spuriously.  The
/// bounds follow the same reasoning as those of `--iteration-budget-ms`.
// NOTE(review): the 200 ms worst case quoted above is larger than this
// 50 ms floor; confirm whether the floor or the rationale is intended.
pub const MIN_SCRAPE_BUDGET_MS: u64 = 50;

/// Largest value `--scrape-budget-ms` may take.  Beyond it the scrape
/// budget could no longer fire before `--self-watchdog-secs` aborts the
/// daemon, so the metric would stop being a useful signal.
pub const MAX_SCRAPE_BUDGET_MS: u64 = 60 * 1_000;
109
/// Value `--audit-fsync-budget-ms` takes when not given.  When one
/// `fdatasync(2)` on the audit file runs longer than this, the records
/// still pending in the current drain are only written to the BufWriter
/// and the sync is postponed to the next maintenance tick.  On a slow
/// disk this bounds the worst-case poll stall to one fsync per tick.
///
/// Only the argv parser refers to this constant; a compile-time-config
/// build takes its default straight from `build.rs`.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_AUDIT_FSYNC_BUDGET_MS: u32 = 50;

/// Value `--audit-sync-interval-ms` takes when not given.  `0` turns the
/// time-based cadence off, leaving durability to the record-count
/// cadence of `--recovery-audit-sync-every` alone — the IEC 62304
/// Class C default semantics.  Operators who relax the record cadence
/// use this setting to pin a worst-case sync interval.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_AUDIT_SYNC_INTERVAL_MS: u32 = 0;

/// Value `--audit-rotation-budget-ms` takes when not given.  Rotation
/// (rename × 5 + reopen + header + boot record + fsync) runs as a state
/// machine; when a single tick goes over this budget the state is kept
/// and picked up again on the next tick.  A wedged filesystem therefore
/// cannot block the poll loop during rotation.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_AUDIT_ROTATION_BUDGET_MS: u32 = 50;
136
137/// Parsed daemon configuration.
138#[derive(Clone, Debug)]
139pub struct Config {
140    /// Filesystem path the observer's UDS will be bound at.
141    pub socket: PathBuf,
142    /// Per-pid silence window before the observer surfaces `Event::Stall`.
143    pub threshold: Duration,
144    /// Optional exec command line invoked on each unique stall. `{pid}` in
145    /// any argument is replaced with the numeric PID. No shell is spawned.
146    pub recovery_exec_cmd: Option<String>,
147    /// Optional path to a file containing the `--recovery-exec` command line.
148    /// The file must be owned by the observer's UID and have mode 0600 or
149    /// stricter. Mutually exclusive with `recovery_exec_cmd`.
150    pub recovery_exec_file: Option<PathBuf>,
151    /// Per-pid debounce window for recovery invocations.
152    pub recovery_debounce: Duration,
153    /// Environment variables passed to recovery child processes. Each entry
154    /// is in `KEY=VALUE` format. Applied on top of the base env chosen by
155    /// [`Self::recovery_inherit_env`]: default-secure (cleared,
156    /// `PATH=/usr/bin:/bin` only) → these become an explicit allowlist;
157    /// inherit-mode → these override the inherited values for the named keys.
158    pub recovery_env: Vec<String>,
159    /// Opt in to inheriting the observer's full environment for recovery
160    /// child processes. Default `false` (secure) — child env is cleared to
161    /// `PATH=/usr/bin:/bin` plus any explicit `recovery_env` entries.
162    /// Set via `--recovery-inherit-env`. See
163    /// `book/src/architecture/recovery.md` for the rationale and migration
164    /// guide.
165    pub recovery_inherit_env: bool,
166    /// Optional path the file exporter appends one event-line per record to.
167    pub file_export: Option<PathBuf>,
168    /// Optional byte limit for the file export. When exceeded, the current
169    /// file is rotated (up to 5 generations) and a new one is opened.
170    pub export_file_max_bytes: Option<u64>,
171    /// Records between forced `fdatasync(2)` calls on the file exporter.
172    /// `0` (default) preserves the v0.1 behavior — flush only on clean
173    /// shutdown and during rotation. Non-zero values trade IO for
174    /// crash-time durability; `1` matches the recovery audit log's
175    /// per-record durability guarantee. Set via
176    /// `--export-file-sync-every <N>`.
177    pub export_file_sync_every: u32,
178    /// Optional listening address for the Prometheus exporter.
179    pub prom_addr: Option<SocketAddr>,
180    /// Path to a file containing the 32-byte (64-hex-character) bearer token
181    /// for the Prometheus `/metrics` endpoint.  Required whenever
182    /// [`Self::prom_addr`] is set: `/metrics` has no anonymous access.  The
183    /// file must be a regular file (no symlinks), owned by the observer's
184    /// UID, mode `0o600` or stricter — see [`super::validate::validate_secret_file`].
185    pub prom_token_file: Option<PathBuf>,
186    /// Optional deadline after which the daemon shuts itself down. Used by
187    /// integration tests to bound run time without relying on signals.
188    pub shutdown_after: Option<Duration>,
189    /// Maximum wall-clock time [`crate::recovery::Recovery::drop`] blocks
190    /// waiting for outstanding recovery children after issuing `kill(2)`.
191    /// Defaults to [`DEFAULT_SHUTDOWN_GRACE_MS`]; minimum
192    /// [`MIN_SHUTDOWN_GRACE_MS`].  systemd `TimeoutStopSec` must be at
193    /// least this value plus a small reap margin (~2 s).
194    pub shutdown_grace: Duration,
195    /// Optional kill-after deadline for outstanding recovery children.
196    /// `None` (the default) preserves v0.1.0 semantics: children are
197    /// reaped on completion but never killed. Set via
198    /// `--recovery-timeout-ms`.
199    pub recovery_timeout: Option<Duration>,
200    /// UDS file mode applied after bind (octal, e.g. `0o600`).
201    /// Defaults to [`DEFAULT_SOCKET_MODE`].
202    pub socket_mode: u32,
203    /// UDS read timeout for the bound socket. Defaults to
204    /// [`DEFAULT_READ_TIMEOUT_MS`] milliseconds.
205    pub read_timeout: Duration,
206    /// Maximum number of distinct agent pids tracked concurrently.
207    /// Defaults to [`crate::tracker::DEFAULT_CAPACITY`] (256). Beats for
208    /// new pids beyond this limit are dropped.
209    pub tracker_capacity: usize,
210    /// Eviction policy applied when the tracker is at capacity and a
211    /// new pid arrives. Defaults to [`EvictionPolicy::Strict`].
212    pub tracker_eviction_policy: EvictionPolicy,
213    /// Maximum slots scanned per eviction attempt.
214    /// Defaults to [`DEFAULT_EVICTION_SCAN_WINDOW`].
215    pub eviction_scan_window: usize,
216    /// Optional UDP port for network-based observers. When set, the observer
217    /// also binds a UDP listener alongside the UDS socket.
218    pub udp_port: Option<u16>,
219    /// IP address to bind the UDP listener on. Defaults to `0.0.0.0` when
220    /// `--udp-port` is set. Ignored when `--udp-port` is not set.
221    pub udp_bind_addr: Option<std::net::IpAddr>,
222    /// Path to a file containing a 64-character hex key for secure UDP
223    /// (requires `--features secure-udp`).
224    pub secure_key_file: Option<PathBuf>,
225    /// Path to a file with one hex key per line for zero-downtime key
226    /// rotation (requires `--features secure-udp`).
227    pub accepted_key_file: Option<PathBuf>,
228    /// Path to a file containing a 64-character hex master key for
229    /// per-agent key derivation (requires `--features secure-udp`).
230    /// The observer derives agent-specific keys from the PID in each
231    /// frame's `iv_random` prefix.
232    pub master_key_file: Option<PathBuf>,
233    /// Optional per-pid maximum beat rate in beats per second.
234    /// `None` disables per-pid limiting (pass `--max-beat-rate 0`).
235    /// Defaults to `Some(DEFAULT_MAX_BEAT_RATE)` — beats arriving faster
236    /// than this rate from the same pid are dropped and counted via
237    /// `varta_rate_limited_total{reason="per_pid"}`.
238    pub max_beat_rate: Option<u32>,
239    /// Global beat rate cap across all senders combined, in beats per
240    /// second.  Provides a ceiling that defeats per-pid rotation attacks.
241    /// `0` disables (`--global-beat-rate 0`).  Defaults to
242    /// [`DEFAULT_GLOBAL_BEAT_RATE`].
243    pub global_beat_rate: u32,
244    /// Global token-bucket burst capacity.  Defaults to
245    /// [`DEFAULT_GLOBAL_BEAT_BURST`].  `0` along with `global_beat_rate`
246    /// effectively disables the global bucket.
247    pub global_beat_burst: u32,
248    /// Requested `SO_RCVBUF` size in bytes for the observer UDS.  `0`
249    /// leaves the kernel default unchanged.  Defaults to
250    /// [`DEFAULT_UDS_RCVBUF_BYTES`].  The actual granted size (which Linux
251    /// clamps to `net.core.rmem_max`) is surfaced as
252    /// `varta_observer_uds_rcvbuf_bytes`.
253    pub uds_rcvbuf_bytes: u32,
254    /// Optional path for a heartbeat file. When set, the observer
255    /// writes a timestamp + loop-counter line on every poll iteration,
256    /// allowing external watchdogs to detect observer stalls.
257    pub heartbeat_file: Option<PathBuf>,
258    /// If `Some`, a background watchdog thread is spawned that calls
259    /// `process::abort()` if the poll loop has not ticked for longer than
260    /// this duration.  Catches hung poll loops that signal-based supervisors
261    /// cannot detect.  Set by `--self-watchdog-secs`.
262    pub self_watchdog: Option<Duration>,
263    /// If `Some`, the path to a hardware watchdog device (e.g.
264    /// `/dev/watchdog`) that is opened at startup and kicked once per poll
265    /// iteration.  On clean shutdown the magic-close byte `'V'` is written to
266    /// disarm the watchdog.  Set by `--hw-watchdog`.
267    pub hw_watchdog: Option<PathBuf>,
268    /// Per-source-IP refill rate (connections per second) for the
269    /// Prometheus `/metrics` endpoint.  Defaults to
270    /// [`DEFAULT_PROM_RATE_LIMIT_PER_SEC`].
271    pub prom_rate_limit_per_sec: u32,
272    /// Per-source-IP burst (token-bucket capacity) for the Prometheus
273    /// `/metrics` endpoint.  Defaults to [`DEFAULT_PROM_RATE_LIMIT_BURST`].
274    pub prom_rate_limit_burst: u32,
275    /// Operator opt-in required to bind a plaintext UDP listener.  When
276    /// `--udp-port` is set and no AEAD keys are configured, startup
277    /// refuses to proceed unless this is `true`.  The build must also
278    /// include `--features unsafe-plaintext-udp` for the plaintext path
279    /// to exist at all.  Set by `--i-accept-plaintext-udp`.
280    pub i_accept_plaintext_udp: bool,
281    /// Operator opt-in to combine the **secure-UDP** listener with a recovery
282    /// command.  Secure UDP authenticates wire bytes but cannot attest the
283    /// sending process — a holder of a shared PSK or a derived per-agent key
284    /// can forge a beat for any pid.  Without this flag, startup refuses to
285    /// proceed when both `--udp-port` (with key files) and a recovery template
286    /// are set.  With this flag the runtime origin gate stamps beats from this
287    /// listener [`BeatOrigin::OperatorAttestedTransport`] so recovery fires.
288    /// Set by `--secure-udp-i-accept-recovery-on-unauthenticated-transport`.
289    pub i_accept_recovery_on_secure_udp: bool,
290    /// Operator opt-in to combine the **plaintext-UDP** listener with a
291    /// recovery command.  Plaintext UDP has no authentication whatsoever —
292    /// any host that can reach the observer port can forge any frame.  Without
293    /// this flag, startup refuses to proceed when both `--udp-port` (without
294    /// key files) and a recovery template are set.  With this flag the runtime
295    /// origin gate stamps beats from this listener
296    /// [`BeatOrigin::OperatorAttestedTransport`] so recovery fires.
297    /// Set by `--plaintext-udp-i-accept-recovery-on-unauthenticated-transport`.
298    pub i_accept_recovery_on_plaintext_udp: bool,
299    /// Operator opt-in to bind the **secure-UDP** listener to a non-loopback
300    /// address (H4).  The per-sender replay protection retains state for up
301    /// to 1024 source addresses plus a 1-deep eviction shadow — an attacker
302    /// who can spoof ≥1025 UDP source addresses (trivial on a routed network)
303    /// can rotate the shadow and replay a captured frame against a target
304    /// sender.  Loopback is safe (only same-host processes can forge loopback
305    /// source addresses, which requires `CAP_NET_RAW`); any reachable network
306    /// must be explicitly acknowledged.  Without this flag, startup refuses
307    /// to proceed when `--udp-bind-addr` resolves to a non-loopback address
308    /// and secure-UDP keys are configured.  Set by
309    /// `--i-accept-secure-udp-non-loopback`.
310    pub i_accept_secure_udp_non_loopback: bool,
311    /// Permit beats — and, by extension, recovery commands — for agents
312    /// whose kernel-attested PID namespace differs from the observer's.
313    /// Use only when agents intentionally share the host namespace
314    /// (`--pid=host` containers) or an out-of-band translator is in place.
315    /// Set by `--allow-cross-namespace-agents`. Default `false` — beats from
316    /// cross-namespace agents are dropped at receive (counted via
317    /// `varta_frame_namespace_mismatch_total`), and any stalls that did
318    /// progress before opt-in refuse recovery (counted via
319    /// `varta_recovery_refused_total{reason="cross_namespace_agent"}`).
320    pub allow_cross_namespace_agents: bool,
321    /// Treat a cross-namespace agent as a fatal startup error instead of the
322    /// default refuse-recovery behaviour. Set by `--strict-namespace-check`.
323    /// Useful in environments where the operator wants the daemon to fail
324    /// loudly rather than silently log audit refusals. Default `false`.
325    pub strict_namespace_check: bool,
326    /// Optional path the recovery audit TSV is appended to. When set, every
327    /// recovery spawn and completion is recorded with wall-clock timestamp,
328    /// agent pid, child pid, mode, outcome, exit code, and duration. See
329    /// [`crate::audit::RecoveryAuditLog`] for the schema.
330    pub recovery_audit_file: Option<PathBuf>,
331    /// Optional byte cap for the recovery audit file. When exceeded, the
332    /// file rotates through up to 5 generations (PATH → PATH.1 → … →
333    /// PATH.5). Without a cap the file grows unbounded.
334    pub recovery_audit_max_bytes: Option<u64>,
335    /// How many records to write between forced `fdatasync(2)` calls on
336    /// the audit file. Default `1` (sync every record) — the only
337    /// IEC 62304 Class C-conforming value. Higher values trade a small
338    /// risk of losing up to N-1 records on power cut for a lower per-
339    /// record cost. Values >1 trigger a startup warning. `0` is rejected
340    /// at parse time.
341    pub recovery_audit_sync_every: u32,
342    /// Whether to capture child stdout/stderr non-blockingly for the audit
343    /// record. Default off — pipes are inherited from the observer. Opt-in
344    /// avoids deadlock risk for operators who alias chatty recovery
345    /// commands (e.g. `journalctl -xeu agent.service`).
346    pub recovery_capture_stdio: bool,
347    /// Total byte cap (stdout + stderr combined, per child) when
348    /// `recovery_capture_stdio` is enabled. Defaults to
349    /// [`DEFAULT_RECOVERY_CAPTURE_BYTES`]. Values larger than
350    /// [`MAX_RECOVERY_CAPTURE_BYTES`] are rejected at parse time.
351    pub recovery_capture_bytes: u32,
352    /// Soft per-iteration budget for the observer poll loop.  Iterations
353    /// exceeding this increment
354    /// `varta_observer_iteration_budget_exceeded_total` and are visible in
355    /// the `varta_observer_iteration_seconds` histogram.  Advisory only —
356    /// hard wedges are caught by `--self-watchdog-secs`.  Set by
357    /// `--iteration-budget-ms`; defaults to
358    /// [`crate::exporter::DEFAULT_ITERATION_BUDGET`].
359    pub iteration_budget: Duration,
360    /// Soft per-call budget for `PromExporter::serve_pending`.  Calls
361    /// exceeding this increment
362    /// `varta_observer_scrape_budget_exceeded_total` and are visible in
363    /// the `varta_observer_serve_pending_seconds` histogram.  Lets
364    /// operators alert on scrape-storm pressure separately from beat-path
365    /// slowness.  Set by `--scrape-budget-ms`; defaults to
366    /// [`crate::exporter::DEFAULT_SCRAPE_BUDGET`].
367    pub scrape_budget: Duration,
368    /// Soft per-call budget for a single `fdatasync(2)` on the audit
369    /// log.  If one fsync exceeds this, the remaining records in the
370    /// current drain are written-to-BufWriter only and the fsync is
371    /// deferred to the next maintenance tick — bounds the worst-case
372    /// poll stall on a slow disk to one fsync per tick.  Increments
373    /// `varta_audit_fsync_budget_exceeded_total` on overrun.  Set by
374    /// `--audit-fsync-budget-ms`; defaults to
375    /// [`DEFAULT_AUDIT_FSYNC_BUDGET_MS`].  `0` is rejected.
376    pub audit_fsync_budget_ms: u32,
377    /// Time-based fdatasync cadence in addition to the record-count
378    /// cadence from `--recovery-audit-sync-every`.  `0` (default)
379    /// disables the time-based cadence; with a non-zero value, the
380    /// drain force-syncs after this many ms have elapsed since the
381    /// last sync even when the per-record threshold has not yet
382    /// been crossed.  Operators on safety-critical profiles keep
383    /// `--recovery-audit-sync-every=1` and ignore this flag; deployments
384    /// that relax the record cadence pin a worst-case sync interval
385    /// here.  Set by `--audit-sync-interval-ms`; defaults to
386    /// [`DEFAULT_AUDIT_SYNC_INTERVAL_MS`].
387    pub audit_sync_interval_ms: u32,
388    /// Per-tick wall-clock budget for the audit-log rotation state
389    /// machine.  Rotation (rename × 5 + reopen + header + boot record +
390    /// fsync) advances incrementally; if a tick exceeds this budget the
391    /// state is preserved and the next tick resumes.  Increments
392    /// `varta_audit_rotation_budget_exceeded_total` on overrun.  Set by
393    /// `--audit-rotation-budget-ms`; defaults to
394    /// [`DEFAULT_AUDIT_ROTATION_BUDGET_MS`].  `0` is rejected.
395    pub audit_rotation_budget_ms: u32,
396    /// [test-hooks only] Sleep for this many milliseconds on the first poll
397    /// iteration, simulating a wedged loop.  Used by the self-watchdog
398    /// integration test (`tests/self_watchdog.rs`) to exercise the abort path
399    /// without relying on SIGSTOP (which freezes the watchdog thread too).
400    /// Present only when compiled with `--features test-hooks`.
401    #[cfg(feature = "test-hooks")]
402    pub inject_wedge_ms: Option<u64>,
403    /// Kernel clock that backs stall-threshold accounting (H7).
404    ///
405    /// - `Monotonic` (default): `CLOCK_MONOTONIC` — pauses on system
406    ///   suspend. Correct for SRE / cloud deployments.
407    /// - `Boottime` (Linux only): `CLOCK_BOOTTIME` — advances during
408    ///   suspend. Correct for embedded clinical devices that aggressively
409    ///   sleep (insulin pumps, holter monitors).
410    ///
411    /// See `book/src/architecture/safety-profiles.md` for the deployment
412    /// matrix. Set by `--clock-source <monotonic|boottime>`.
413    pub clock_source: ClockSource,
414    /// Signal-handler installation path on Linux.
415    ///
416    /// - `Direct` (default): direct `rt_sigaction(2)` syscall — owns the
417    ///   kernel ABI end-to-end, including the x86_64 signal-return trampoline.
418    ///   A readback + live SIGUSR1 smoke test run at startup.
419    /// - `Libc`: libc `sigaction(3)` wrapper — libc's `__restore_rt` is used.
420    ///   Opt-in for kernels not yet certified against the direct path.
421    ///
422    /// On macOS, FreeBSD, and other Unix, the mode is noted in startup
423    /// logs but has no operational effect (libc / POSIX is the only option).
424    /// Set by `--signal-handler-mode <direct|libc>`.
425    pub signal_handler_mode: SignalHandlerMode,
426}
427
428/// Failure modes for [`Config::from_args`].
429#[derive(Debug)]
430pub enum ConfigError {
431    /// A flag that requires a value was passed without one.
432    MissingValue(&'static str),
433    /// A required flag (e.g. `--socket`, `--threshold-ms`) was omitted.
434    MissingRequired(&'static str),
435    /// An unknown flag token was encountered.
436    UnknownFlag(String),
437    /// A numeric flag carried a value that would not parse as `u64`.
438    BadInteger {
439        /// The flag whose value failed to parse.
440        flag: &'static str,
441        /// The raw string that did not parse.
442        raw: String,
443    },
444    /// A value on `--socket-mode` could not be parsed as octal.
445    BadSocketMode(String),
446    /// `--prom-addr` value did not parse as `IP:PORT`.
447    BadAddr(String),
448    /// A value for a string-enum flag was not one of the accepted choices.
449    BadValue {
450        /// The flag whose value was rejected.
451        flag: &'static str,
452        /// The raw string that was provided.
453        raw: String,
454    },
455    /// The user passed `--help` / `-h`. Not a true error; `main` prints
456    /// [`Config::HELP`] and exits 0.
457    HelpRequested,
458    /// `--threshold-ms` value is below [`MIN_THRESHOLD_MS`].
459    ThresholdTooLow {
460        /// The value that was provided.
461        value: u64,
462        /// The minimum allowed value.
463        min: u64,
464    },
465    /// Two or more mutually exclusive recovery flags were specified.
466    MutuallyExclusive {
467        /// The pair of conflicting flags (e.g. `("--recovery-exec", "--recovery-exec-file")`).
468        a: &'static str,
469        /// Second conflicting flag.
470        b: &'static str,
471    },
472    /// A flag that has been removed for security reasons was passed.  The
473    /// `replacement` field carries an inline migration hint so operators
474    /// see the fix in the same line as the error.
475    RemovedFlag {
476        /// The removed flag token (e.g. `"--key-env"`).
477        flag: &'static str,
478        /// Human-readable migration hint (e.g.
479        /// `"--key-file (mode 0600, owned by the observer UID)"`).
480        replacement: &'static str,
481    },
482    /// `--prom-addr` was set but `--prom-token-file` was not.  /metrics
483    /// has no anonymous access; the observer refuses to start rather than
484    /// expose agent topology to anyone who can reach the bound port.
485    PromAddrRequiresToken,
486    /// `--recovery-capture-bytes` was set above
487    /// [`MAX_RECOVERY_CAPTURE_BYTES`]. Capturing more output than that
488    /// risks holding too much child stdout/stderr in observer memory.
489    RecoveryCaptureBytesTooLarge {
490        /// The value that was provided.
491        value: u32,
492        /// The maximum allowed value.
493        max: u32,
494    },
495    /// `--recovery-capture-stdio` was passed without any recovery command
496    /// configured (`--recovery-exec` / `--recovery-exec-file`). Capture is
497    /// meaningless without something to capture from.
498    RecoveryCaptureRequiresRecovery,
499    /// `--shutdown-grace-ms` was below [`MIN_SHUTDOWN_GRACE_MS`].
500    ShutdownGraceTooLow {
501        /// The value provided on the CLI.
502        value: u64,
503        /// The minimum allowed value.
504        min: u64,
505    },
506    /// Shell-mode recovery flags were passed (removed feature).  Use
507    /// `--recovery-exec` instead.
508    ShellRecoveryNotCompiledIn,
509    /// A recovery command (`--recovery-exec` / `--recovery-exec-file`) was
510    /// configured at the
511    /// same time as a UDP listener (`--udp-port`), without the matching
512    /// per-listener operator acknowledgement.  UDP transports cannot attest
513    /// the sending process — an attacker holding the AEAD key (or a derived
514    /// per-agent key) can forge a beat claiming any pid, then stop sending to
515    /// trigger the recovery command against the chosen pid.  Pass
516    /// `--secure-udp-i-accept-recovery-on-unauthenticated-transport` (for
517    /// secure UDP) or
518    /// `--plaintext-udp-i-accept-recovery-on-unauthenticated-transport` (for
519    /// plaintext UDP) to proceed.
520    RecoveryRequiresAuthenticatedTransport {
521        /// The `IP:PORT` of the UDP listener that would have been bound.
522        udp_addr: String,
523    },
524    /// A secure-UDP listener was configured with a non-loopback
525    /// `--udp-bind-addr`, but `--i-accept-secure-udp-non-loopback` was not
526    /// passed (H4).  The 1-deep replay shadow after capacity-forced eviction
527    /// is acceptable for closed local networks (loopback) but inadequate for
528    /// any reachable network — any spoofable-source attacker with ≥1025
529    /// distinct UDP source addresses can rotate the shadow and replay one
530    /// captured frame per target.
531    SecureUdpRequiresLoopbackBind {
532        /// The `IP:PORT` of the UDP listener that would have been bound.
533        udp_addr: String,
534    },
535    /// `--iteration-budget-ms` was outside the accepted range
536    /// (`[MIN_ITERATION_BUDGET_MS, MAX_ITERATION_BUDGET_MS]`).
537    IterationBudgetOutOfRange {
538        /// The value provided.
539        value: u64,
540        /// The minimum allowed value.
541        min: u64,
542        /// The maximum allowed value.
543        max: u64,
544    },
545    /// `--scrape-budget-ms` was outside the accepted range
546    /// (`[MIN_SCRAPE_BUDGET_MS, MAX_SCRAPE_BUDGET_MS]`).
547    ScrapeBudgetOutOfRange {
548        /// The value provided.
549        value: u64,
550        /// The minimum allowed value.
551        min: u64,
552        /// The maximum allowed value.
553        max: u64,
554    },
555    /// `--eviction-scan-window` was outside the accepted range
556    /// (`[MIN_EVICTION_SCAN_WINDOW, MAX_EVICTION_SCAN_WINDOW]`).
557    EvictionScanWindowOutOfRange {
558        /// The value provided.
559        value: usize,
560        /// The minimum allowed value.
561        min: usize,
562        /// The maximum allowed value.
563        max: usize,
564    },
565    /// `--clock-source boottime` was requested but the host kernel has no
566    /// equivalent of Linux's `CLOCK_BOOTTIME`. Currently fires on every
567    /// non-Linux target (macOS, *BSD).
568    ClockSourceUnsupported {
569        /// The source the operator requested.
570        source: ClockSource,
571        /// `std::env::consts::OS` for the build target.
572        platform: &'static str,
573    },
574    /// The binary was built with `--features compile-time-config` but the
575    /// operator supplied one or more argv tokens.  Class-A safety-critical
576    /// builds intentionally accept zero argv; the configuration is baked
577    /// into the binary by `build.rs` at compile time.
578    CompileTimeArgvForbidden,
579    /// `Config::compile_time()` produced a value that fails cross-field
580    /// validation at startup (e.g. recovery requires kernel-attested
581    /// transport but the compile-time blob enabled both UDP and recovery
582    /// without the acknowledgement flag).  Carries the same diagnostic
583    /// text the corresponding `from_args` error would produce.
584    CompileTimeConfigInvalid {
585        /// Static description of which invariant was violated.
586        reason: &'static str,
587    },
588}
589
590// The `ConfigError` Display impl has two cfg-gated personalities:
591//
592// 1. Default (SRE) builds: rich messages that name the flag the operator
593//    must supply or correct.  These strings carry literal flag names like
594//    `--socket` and `--prom-addr` and are linked unconditionally.
595//
596// 2. Class-A (`compile-time-config`) builds: terse, neutral phrasings that
597//    never mention argv flag names.  Most variants are dead code anyway —
598//    they are produced only by `Config::from_args`, which is excluded from
599//    compilation when the feature is on — but the Display impl must still
600//    cover every variant, and any literal flag string in the impl ends up
601//    in the binary (cerebrum 2026-05-12: `pub const &str` is always linked
602//    regardless of `#[cfg]` on the code paths that consume it).
603//
604// The Class-A wording uses `config key` instead of `--flag-name` and refers
605// the operator to `book/src/architecture/compile-time-config.md` for any
606// remediation.  The two impls are mutually exclusive at the `#[cfg]` layer.
607
608#[cfg(not(feature = "compile-time-config"))]
609impl core::fmt::Display for ConfigError {
610    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
611        match self {
612            ConfigError::MissingValue(flag) => write!(f, "{flag} requires a value"),
613            ConfigError::MissingRequired(flag) => write!(f, "missing required flag {flag}"),
614            ConfigError::UnknownFlag(s) => write!(f, "unknown flag {s}"),
615            ConfigError::BadInteger { flag, raw } => {
616                write!(f, "{flag}: not a valid unsigned integer: {raw:?}")
617            }
618            ConfigError::BadSocketMode(raw) => {
619                write!(
620                    f,
621                    "--socket-mode: expected octal digits (e.g. 600, 0600, or 0o600), got: {raw:?}"
622                )
623            }
624            ConfigError::BadAddr(raw) => {
625                write!(f, "--prom-addr: not a valid socket address: {raw:?}")
626            }
627            ConfigError::BadValue { flag, raw } => {
628                write!(f, "{flag}: invalid value {raw:?}",)
629            }
630            ConfigError::HelpRequested => f.write_str("--help"),
631            ConfigError::ThresholdTooLow { value, min } => {
632                write!(
633                    f,
634                    "--threshold-ms: {value} is below the minimum allowed value ({min} ms)"
635                )
636            }
637            ConfigError::MutuallyExclusive { a, b } => {
638                write!(f, "{a} and {b} are mutually exclusive")
639            }
640            ConfigError::RemovedFlag { flag, replacement } => write!(
641                f,
642                "{flag} has been removed for security reasons; use {replacement}"
643            ),
644            ConfigError::PromAddrRequiresToken => f.write_str(
645                "--prom-addr requires --prom-token-file. /metrics has no anonymous access; \
646                 generate a token with `openssl rand -hex 32 > /etc/varta/prom.token && \
647                 chmod 600 /etc/varta/prom.token`.",
648            ),
649            ConfigError::ShutdownGraceTooLow { value, min } => write!(
650                f,
651                "--shutdown-grace-ms: {value} is below the minimum allowed value ({min} ms)"
652            ),
653            ConfigError::RecoveryCaptureBytesTooLarge { value, max } => write!(
654                f,
655                "--recovery-capture-bytes: {value} exceeds the maximum allowed value ({max} bytes)"
656            ),
657            ConfigError::RecoveryCaptureRequiresRecovery => f.write_str(
658                "--recovery-capture-stdio requires --recovery-exec or --recovery-exec-file",
659            ),
660            ConfigError::ShellRecoveryNotCompiledIn => f.write_str(
661                "shell-mode recovery has been permanently removed; use --recovery-exec instead",
662            ),
663            ConfigError::RecoveryRequiresAuthenticatedTransport { udp_addr } => write!(
664                f,
665                "recovery command is configured alongside a UDP listener on {udp_addr}. \
666                 UDP transports cannot attest the sending process — a holder of the AEAD key \
667                 (or a per-agent key derived from a leaked master key) can forge a beat \
668                 claiming any pid, then stop sending to trigger recovery against the chosen pid. \
669                 Either remove the recovery command, switch to a UDS-only deployment, or pass \
670                 --secure-udp-i-accept-recovery-on-unauthenticated-transport (for secure UDP) \
671                 or --plaintext-udp-i-accept-recovery-on-unauthenticated-transport (for plaintext \
672                 UDP) to explicitly accept this risk on a per-listener basis."
673            ),
674            ConfigError::SecureUdpRequiresLoopbackBind { udp_addr } => write!(
675                f,
676                "secure-UDP listener configured with non-loopback --udp-bind-addr ({udp_addr}). \
677                 The per-sender replay-state map holds up to 1024 senders plus a 1-deep \
678                 eviction shadow; an attacker who can spoof ≥1025 UDP source addresses can \
679                 rotate the shadow and replay a captured frame against a target sender. \
680                 Either bind to a loopback address (default 127.0.0.1) or pass \
681                 --i-accept-secure-udp-non-loopback to explicitly accept this risk. \
682                 See book/src/architecture/vlp-transports.md for the threat-boundary derivation."
683            ),
684            ConfigError::IterationBudgetOutOfRange { value, min, max } => write!(
685                f,
686                "--iteration-budget-ms: {value} is outside the accepted range [{min}, {max}] ms"
687            ),
688            ConfigError::ScrapeBudgetOutOfRange { value, min, max } => write!(
689                f,
690                "--scrape-budget-ms: {value} is outside the accepted range [{min}, {max}] ms"
691            ),
692            ConfigError::EvictionScanWindowOutOfRange { value, min, max } => write!(
693                f,
694                "--eviction-scan-window: {value} is outside the accepted range [{min}, {max}]"
695            ),
696            ConfigError::ClockSourceUnsupported { source, platform } => {
697                let hint = match source {
698                    crate::clock::ClockSource::Boottime => {
699                        "`boottime` semantics (advance through suspend) require Linux's \
700                         CLOCK_BOOTTIME. On macOS / iOS use `--clock-source monotonic-raw` \
701                         (mach_continuous_time) for the same semantics; BSD has no equivalent \
702                         kernel clock."
703                    }
704                    crate::clock::ClockSource::MonotonicRaw => {
705                        "`monotonic-raw` is macOS / iOS only (CLOCK_MONOTONIC_RAW = \
706                         mach_continuous_time). On Linux use `--clock-source boottime` \
707                         (CLOCK_BOOTTIME) for advance-through-suspend semantics; BSD \
708                         has no equivalent kernel clock."
709                    }
710                    crate::clock::ClockSource::Monotonic => "",
711                };
712                write!(
713                    f,
714                    "--clock-source {source} is not supported on `{platform}`. {hint} \
715                     Otherwise use `--clock-source monotonic` (the default)."
716                )
717            }
718            ConfigError::CompileTimeArgvForbidden => f.write_str(
719                "this binary was configured at compile time \
720                 (--features compile-time-config); refusing to accept argv. \
721                 See book/src/architecture/compile-time-config.md for the \
722                 supported configuration mechanism.",
723            ),
724            ConfigError::CompileTimeConfigInvalid { reason } => write!(
725                f,
726                "compile-time config violates a cross-field invariant: {reason}"
727            ),
728        }
729    }
730}
731
/// Rendering of [`ConfigError`] for Class-A (`compile-time-config`) builds.
///
/// Messages are deliberately terse and never spell out an argv flag name, so
/// no `--flag` literal is introduced into the binary by this impl. Remediation
/// is delegated to the compile-time-config chapter of the book.
#[cfg(feature = "compile-time-config")]
impl core::fmt::Display for ConfigError {
    /// Writes the diagnostic for `self` into `f`.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by the underlying formatter.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Generic remediation pointer for every flag-relevant variant.
        // Argv-only variants are unreachable in Class-A builds (their
        // producer, `Config::from_args`, is excluded from compilation),
        // but the Display impl must still cover them.  Neutral wording
        // keeps argv flag names out of the binary's `strings` output.
        const REF: &str = "see book/src/architecture/compile-time-config.md";
        match self {
            // Variants grouped here share one neutral message; none of their
            // payloads are rendered.
            ConfigError::MissingValue(_)
            | ConfigError::MissingRequired(_)
            | ConfigError::UnknownFlag(_)
            | ConfigError::BadInteger { .. }
            | ConfigError::BadSocketMode(_)
            | ConfigError::BadAddr(_)
            | ConfigError::BadValue { .. }
            | ConfigError::HelpRequested
            | ConfigError::MutuallyExclusive { .. }
            | ConfigError::RemovedFlag { .. }
            | ConfigError::PromAddrRequiresToken
            | ConfigError::ShutdownGraceTooLow { .. }
            | ConfigError::RecoveryCaptureBytesTooLarge { .. }
            | ConfigError::RecoveryCaptureRequiresRecovery
            | ConfigError::RecoveryRequiresAuthenticatedTransport { .. }
            | ConfigError::SecureUdpRequiresLoopbackBind { .. }
            | ConfigError::IterationBudgetOutOfRange { .. }
            | ConfigError::ScrapeBudgetOutOfRange { .. }
            | ConfigError::EvictionScanWindowOutOfRange { .. } => {
                write!(f, "configuration error (argv path unreachable; {REF})")
            }
            ConfigError::ThresholdTooLow { value, min } => {
                write!(f, "threshold below minimum: {value} ms < {min} ms ({REF})")
            }
            ConfigError::ShellRecoveryNotCompiledIn => write!(
                f,
                "shell-mode recovery has been permanently removed ({REF})"
            ),
            // Platform-neutral wording: the rejected source may be a
            // Linux-only or a macOS-only clock, so the message must not
            // claim that only non-Linux hosts are affected.
            ConfigError::ClockSourceUnsupported { platform, .. } => write!(
                f,
                "configured clock source is not supported on `{platform}`; \
                 the monotonic source is supported on every platform ({REF})"
            ),
            ConfigError::CompileTimeArgvForbidden => f.write_str(
                "this binary was configured at compile time; \
                 refusing to accept command-line arguments",
            ),
            ConfigError::CompileTimeConfigInvalid { reason } => write!(
                f,
                "compile-time config violates a cross-field invariant: {reason}"
            ),
        }
    }
}
786
787impl std::error::Error for ConfigError {}