varta_watch/config/types.rs
1use std::net::SocketAddr;
2use std::path::PathBuf;
3use std::time::Duration;
4
5use crate::clock::ClockSource;
6use crate::signal_install::SignalHandlerMode;
7use crate::tracker::EvictionPolicy;
8
/// Default per-pid debounce window, in milliseconds, applied when
/// `--recovery-exec` is set without an explicit `--recovery-debounce-ms`.
pub const DEFAULT_RECOVERY_DEBOUNCE_MS: u64 = 1_000;
12
/// Default UDS file permissions applied after bind (octal 0600 — owner-only
/// read and write). Tightens the blast radius so only the owning UID can
/// speak to the observer socket.
pub const DEFAULT_SOCKET_MODE: u32 = 0o600;
17
/// Default UDS read timeout in milliseconds. Capped so a stalled peer
/// cannot hold the observer poll loop indefinitely.
pub const DEFAULT_READ_TIMEOUT_MS: u64 = 100;
21
/// Minimum allowed value for `--threshold-ms`, in milliseconds. A threshold
/// of 0 ms would cause every agent to be perpetually stalled, triggering
/// recovery commands on every poll cycle.
pub const MIN_THRESHOLD_MS: u64 = 10;
26
/// Default per-source-IP refill rate (connections per second) for the
/// Prometheus `/metrics` endpoint token bucket. Comfortably above the
/// 1-per-15-second cadence used by typical Prometheus scrapers; low enough
/// that a hostile actor on the same network cannot exhaust file descriptors
/// or saturate the observer's poll loop with a flood of opens.
pub const DEFAULT_PROM_RATE_LIMIT_PER_SEC: u32 = 5;

/// Default burst capacity for the per-source-IP token bucket. Tolerates a
/// short cluster of legitimate scrapes (e.g. dashboard refresh) while still
/// shutting down a sustained flood within a few seconds.
pub const DEFAULT_PROM_RATE_LIMIT_BURST: u32 = 10;
38
/// Default per-pid maximum beat rate in beats per second.
/// Enabled by default to provide a baseline DoS ceiling.
/// Set `--max-beat-rate 0` to disable.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_MAX_BEAT_RATE: u32 = 100;
44
/// Default global beat rate cap across all senders combined, in beats per
/// second. Provides a hard ceiling that defeats per-pid rotation attacks.
/// Set `--global-beat-rate 0` to disable. Sized for 50 concurrent agents
/// × 100 bps.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_GLOBAL_BEAT_RATE: u32 = 5_000;

/// Default global burst capacity (token-bucket capacity). 2× the refill
/// rate so 50 agents can co-restart within a 1 s window.
// Expressed in terms of the refill rate so the documented 2× relationship
// cannot drift if the rate is retuned; the value is still 10 000.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_GLOBAL_BEAT_BURST: u32 = 2 * DEFAULT_GLOBAL_BEAT_RATE;
56
/// Default receive-buffer size requested via `SO_RCVBUF` on the observer
/// UDS. 1 MiB ≈ 32 768 × 32 B frames ≈ 6 s of full-burst headroom at the
/// default global rate. Linux clamps the request to `net.core.rmem_max`
/// (~208 KiB stock) and then doubles it to account for bookkeeping
/// overhead; the gauge surfaces the actual granted value. Set
/// `--uds-rcvbuf-bytes 0` to leave the kernel default.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_UDS_RCVBUF_BYTES: u32 = 1024 * 1024;
64
/// Default wall-clock budget (in milliseconds) [`crate::recovery::Recovery`]
/// blocks in its [`Drop`] impl waiting for outstanding recovery children to
/// exit after a `kill(2)`. Five seconds preserves the v0.1 hard-coded
/// constant. systemd `TimeoutStopSec` must be at least this value plus a
/// small reap margin.
pub const DEFAULT_SHUTDOWN_GRACE_MS: u64 = 5_000;

/// Minimum accepted value for `--shutdown-grace-ms`. Below this the
/// shutdown poll loop cannot complete even one [`std::process::Child::try_wait`]
/// round under load, which would orphan every outstanding child to PID 1.
pub const MIN_SHUTDOWN_GRACE_MS: u64 = 100;
76
/// Default per-child cap for combined stdout+stderr capture when
/// `--recovery-capture-stdio` is enabled. 4 KiB is enough to fit a typical
/// systemctl/journalctl output snippet without risking pipe-buffer pressure
/// on a chatty recovery command.
pub const DEFAULT_RECOVERY_CAPTURE_BYTES: u32 = 4 * 1024;

/// Maximum value accepted by `--recovery-capture-bytes` (1 MiB). Values
/// above this risk holding too much child output in observer memory and
/// making the non-blocking pipe drain expensive per tick.
pub const MAX_RECOVERY_CAPTURE_BYTES: u32 = 1024 * 1024;
87
// NOTE(review): the 50 ms floor is lower than the ~200 ms `serve_pending`
// worst case cited in the doc below — confirm the floor is intentional.
/// Minimum accepted value for `--iteration-budget-ms`. Below this the
/// budget overlaps the noise floor of the work itself — `serve_pending`
/// alone can spend up to ~200 ms by design — and every iteration would be
/// flagged as an overrun, making the metric useless.
pub const MIN_ITERATION_BUDGET_MS: u64 = 50;

/// Maximum accepted value for `--iteration-budget-ms`. Above this the
/// soft budget can no longer fire before `--self-watchdog-secs` would
/// abort the daemon, so the metric ceases to be a useful early signal.
pub const MAX_ITERATION_BUDGET_MS: u64 = 60_000;
98
// NOTE(review): the 50 ms floor is lower than the 200 ms structural cap
// cited in the doc below — confirm the floor is intentional.
/// Minimum accepted value for `--scrape-budget-ms`. Below this the budget
/// overlaps the structural cap of `serve_pending` itself (100 ms serve +
/// 100 ms drain = 200 ms worst case), so it would fire spuriously. Bounds
/// chosen on the same logic as `--iteration-budget-ms`.
pub const MIN_SCRAPE_BUDGET_MS: u64 = 50;

/// Maximum accepted value for `--scrape-budget-ms`. Above this the
/// scrape budget can no longer fire before `--self-watchdog-secs` would
/// abort the daemon, so the metric ceases to be a useful signal.
pub const MAX_SCRAPE_BUDGET_MS: u64 = 60_000;
109
/// Default value for `--audit-fsync-budget-ms`. If a single
/// `fdatasync(2)` on the audit file exceeds this, the remaining records
/// in the current drain are written-to-BufWriter only and the sync is
/// deferred to the next maintenance tick. Bounds the worst-case poll
/// stall on a slow disk to one fsync per tick.
///
/// Referenced only by the argv parser; the compile-time-config build
/// reads its default directly from `build.rs`.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_AUDIT_FSYNC_BUDGET_MS: u32 = 50;
120
/// Default value for `--audit-sync-interval-ms`. `0` disables the
/// time-based cadence; durability falls back to the record-count cadence
/// set by `--recovery-audit-sync-every` alone — the IEC 62304 Class C
/// default semantics. Operators who relax the record cadence pin a
/// worst-case sync interval here.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_AUDIT_SYNC_INTERVAL_MS: u32 = 0;
128
/// Default value for `--audit-rotation-budget-ms`. Rotation
/// (rename × 5 + reopen + header + boot record + fsync) executes as a
/// state machine; if a single tick exceeds this budget the state is
/// preserved and resumed on the next tick. Keeps a wedged filesystem
/// from blocking the poll loop during rotation.
#[cfg(not(feature = "compile-time-config"))]
pub const DEFAULT_AUDIT_ROTATION_BUDGET_MS: u32 = 50;
136
137/// Parsed daemon configuration.
138#[derive(Clone, Debug)]
139pub struct Config {
140 /// Filesystem path the observer's UDS will be bound at.
141 pub socket: PathBuf,
142 /// Per-pid silence window before the observer surfaces `Event::Stall`.
143 pub threshold: Duration,
144 /// Optional exec command line invoked on each unique stall. `{pid}` in
145 /// any argument is replaced with the numeric PID. No shell is spawned.
146 pub recovery_exec_cmd: Option<String>,
147 /// Optional path to a file containing the `--recovery-exec` command line.
148 /// The file must be owned by the observer's UID and have mode 0600 or
149 /// stricter. Mutually exclusive with `recovery_exec_cmd`.
150 pub recovery_exec_file: Option<PathBuf>,
151 /// Per-pid debounce window for recovery invocations.
152 pub recovery_debounce: Duration,
153 /// Environment variables passed to recovery child processes. Each entry
154 /// is in `KEY=VALUE` format. Applied on top of the base env chosen by
155 /// [`Self::recovery_inherit_env`]: default-secure (cleared,
156 /// `PATH=/usr/bin:/bin` only) → these become an explicit allowlist;
157 /// inherit-mode → these override the inherited values for the named keys.
158 pub recovery_env: Vec<String>,
159 /// Opt in to inheriting the observer's full environment for recovery
160 /// child processes. Default `false` (secure) — child env is cleared to
161 /// `PATH=/usr/bin:/bin` plus any explicit `recovery_env` entries.
162 /// Set via `--recovery-inherit-env`. See
163 /// `book/src/architecture/recovery.md` for the rationale and migration
164 /// guide.
165 pub recovery_inherit_env: bool,
166 /// Optional path the file exporter appends one event-line per record to.
167 pub file_export: Option<PathBuf>,
168 /// Optional byte limit for the file export. When exceeded, the current
169 /// file is rotated (up to 5 generations) and a new one is opened.
170 pub export_file_max_bytes: Option<u64>,
171 /// Records between forced `fdatasync(2)` calls on the file exporter.
172 /// `0` (default) preserves the v0.1 behavior — flush only on clean
173 /// shutdown and during rotation. Non-zero values trade IO for
174 /// crash-time durability; `1` matches the recovery audit log's
175 /// per-record durability guarantee. Set via
176 /// `--export-file-sync-every <N>`.
177 pub export_file_sync_every: u32,
178 /// Optional listening address for the Prometheus exporter.
179 pub prom_addr: Option<SocketAddr>,
180 /// Path to a file containing the 32-byte (64-hex-character) bearer token
181 /// for the Prometheus `/metrics` endpoint. Required whenever
182 /// [`Self::prom_addr`] is set: `/metrics` has no anonymous access. The
183 /// file must be a regular file (no symlinks), owned by the observer's
184 /// UID, mode `0o600` or stricter — see [`super::validate::validate_secret_file`].
185 pub prom_token_file: Option<PathBuf>,
186 /// Optional deadline after which the daemon shuts itself down. Used by
187 /// integration tests to bound run time without relying on signals.
188 pub shutdown_after: Option<Duration>,
189 /// Maximum wall-clock time [`crate::recovery::Recovery::drop`] blocks
190 /// waiting for outstanding recovery children after issuing `kill(2)`.
191 /// Defaults to [`DEFAULT_SHUTDOWN_GRACE_MS`]; minimum
192 /// [`MIN_SHUTDOWN_GRACE_MS`]. systemd `TimeoutStopSec` must be at
193 /// least this value plus a small reap margin (~2 s).
194 pub shutdown_grace: Duration,
195 /// Optional kill-after deadline for outstanding recovery children.
196 /// `None` (the default) preserves v0.1.0 semantics: children are
197 /// reaped on completion but never killed. Set via
198 /// `--recovery-timeout-ms`.
199 pub recovery_timeout: Option<Duration>,
200 /// UDS file mode applied after bind (octal, e.g. `0o600`).
201 /// Defaults to [`DEFAULT_SOCKET_MODE`].
202 pub socket_mode: u32,
203 /// UDS read timeout for the bound socket. Defaults to
204 /// [`DEFAULT_READ_TIMEOUT_MS`] milliseconds.
205 pub read_timeout: Duration,
206 /// Maximum number of distinct agent pids tracked concurrently.
207 /// Defaults to [`crate::tracker::DEFAULT_CAPACITY`] (256). Beats for
208 /// new pids beyond this limit are dropped.
209 pub tracker_capacity: usize,
210 /// Eviction policy applied when the tracker is at capacity and a
211 /// new pid arrives. Defaults to [`EvictionPolicy::Strict`].
212 pub tracker_eviction_policy: EvictionPolicy,
213 /// Maximum slots scanned per eviction attempt.
214 /// Defaults to [`DEFAULT_EVICTION_SCAN_WINDOW`].
215 pub eviction_scan_window: usize,
216 /// Optional UDP port for network-based observers. When set, the observer
217 /// also binds a UDP listener alongside the UDS socket.
218 pub udp_port: Option<u16>,
219 /// IP address to bind the UDP listener on. Defaults to `0.0.0.0` when
220 /// `--udp-port` is set. Ignored when `--udp-port` is not set.
221 pub udp_bind_addr: Option<std::net::IpAddr>,
222 /// Path to a file containing a 64-character hex key for secure UDP
223 /// (requires `--features secure-udp`).
224 pub secure_key_file: Option<PathBuf>,
225 /// Path to a file with one hex key per line for zero-downtime key
226 /// rotation (requires `--features secure-udp`).
227 pub accepted_key_file: Option<PathBuf>,
228 /// Path to a file containing a 64-character hex master key for
229 /// per-agent key derivation (requires `--features secure-udp`).
230 /// The observer derives agent-specific keys from the PID in each
231 /// frame's `iv_random` prefix.
232 pub master_key_file: Option<PathBuf>,
233 /// Optional per-pid maximum beat rate in beats per second.
234 /// `None` disables per-pid limiting (pass `--max-beat-rate 0`).
235 /// Defaults to `Some(DEFAULT_MAX_BEAT_RATE)` — beats arriving faster
236 /// than this rate from the same pid are dropped and counted via
237 /// `varta_rate_limited_total{reason="per_pid"}`.
238 pub max_beat_rate: Option<u32>,
239 /// Global beat rate cap across all senders combined, in beats per
240 /// second. Provides a ceiling that defeats per-pid rotation attacks.
241 /// `0` disables (`--global-beat-rate 0`). Defaults to
242 /// [`DEFAULT_GLOBAL_BEAT_RATE`].
243 pub global_beat_rate: u32,
244 /// Global token-bucket burst capacity. Defaults to
245 /// [`DEFAULT_GLOBAL_BEAT_BURST`]. `0` along with `global_beat_rate`
246 /// effectively disables the global bucket.
247 pub global_beat_burst: u32,
248 /// Requested `SO_RCVBUF` size in bytes for the observer UDS. `0`
249 /// leaves the kernel default unchanged. Defaults to
250 /// [`DEFAULT_UDS_RCVBUF_BYTES`]. The actual granted size (which Linux
251 /// clamps to `net.core.rmem_max`) is surfaced as
252 /// `varta_observer_uds_rcvbuf_bytes`.
253 pub uds_rcvbuf_bytes: u32,
254 /// Optional path for a heartbeat file. When set, the observer
255 /// writes a timestamp + loop-counter line on every poll iteration,
256 /// allowing external watchdogs to detect observer stalls.
257 pub heartbeat_file: Option<PathBuf>,
258 /// If `Some`, a background watchdog thread is spawned that calls
259 /// `process::abort()` if the poll loop has not ticked for longer than
260 /// this duration. Catches hung poll loops that signal-based supervisors
261 /// cannot detect. Set by `--self-watchdog-secs`.
262 pub self_watchdog: Option<Duration>,
263 /// If `Some`, the path to a hardware watchdog device (e.g.
264 /// `/dev/watchdog`) that is opened at startup and kicked once per poll
265 /// iteration. On clean shutdown the magic-close byte `'V'` is written to
266 /// disarm the watchdog. Set by `--hw-watchdog`.
267 pub hw_watchdog: Option<PathBuf>,
268 /// Per-source-IP refill rate (connections per second) for the
269 /// Prometheus `/metrics` endpoint. Defaults to
270 /// [`DEFAULT_PROM_RATE_LIMIT_PER_SEC`].
271 pub prom_rate_limit_per_sec: u32,
272 /// Per-source-IP burst (token-bucket capacity) for the Prometheus
273 /// `/metrics` endpoint. Defaults to [`DEFAULT_PROM_RATE_LIMIT_BURST`].
274 pub prom_rate_limit_burst: u32,
275 /// Operator opt-in required to bind a plaintext UDP listener. When
276 /// `--udp-port` is set and no AEAD keys are configured, startup
277 /// refuses to proceed unless this is `true`. The build must also
278 /// include `--features unsafe-plaintext-udp` for the plaintext path
279 /// to exist at all. Set by `--i-accept-plaintext-udp`.
280 pub i_accept_plaintext_udp: bool,
281 /// Operator opt-in to combine the **secure-UDP** listener with a recovery
282 /// command. Secure UDP authenticates wire bytes but cannot attest the
283 /// sending process — a holder of a shared PSK or a derived per-agent key
284 /// can forge a beat for any pid. Without this flag, startup refuses to
285 /// proceed when both `--udp-port` (with key files) and a recovery template
286 /// are set. With this flag the runtime origin gate stamps beats from this
287 /// listener [`BeatOrigin::OperatorAttestedTransport`] so recovery fires.
288 /// Set by `--secure-udp-i-accept-recovery-on-unauthenticated-transport`.
289 pub i_accept_recovery_on_secure_udp: bool,
290 /// Operator opt-in to combine the **plaintext-UDP** listener with a
291 /// recovery command. Plaintext UDP has no authentication whatsoever —
292 /// any host that can reach the observer port can forge any frame. Without
293 /// this flag, startup refuses to proceed when both `--udp-port` (without
294 /// key files) and a recovery template are set. With this flag the runtime
295 /// origin gate stamps beats from this listener
296 /// [`BeatOrigin::OperatorAttestedTransport`] so recovery fires.
297 /// Set by `--plaintext-udp-i-accept-recovery-on-unauthenticated-transport`.
298 pub i_accept_recovery_on_plaintext_udp: bool,
299 /// Operator opt-in to bind the **secure-UDP** listener to a non-loopback
300 /// address (H4). The per-sender replay protection retains state for up
301 /// to 1024 source addresses plus a 1-deep eviction shadow — an attacker
302 /// who can spoof ≥1025 UDP source addresses (trivial on a routed network)
303 /// can rotate the shadow and replay a captured frame against a target
304 /// sender. Loopback is safe (only same-host processes can forge loopback
305 /// source addresses, which requires `CAP_NET_RAW`); any reachable network
306 /// must be explicitly acknowledged. Without this flag, startup refuses
307 /// to proceed when `--udp-bind-addr` resolves to a non-loopback address
308 /// and secure-UDP keys are configured. Set by
309 /// `--i-accept-secure-udp-non-loopback`.
310 pub i_accept_secure_udp_non_loopback: bool,
311 /// Permit beats — and, by extension, recovery commands — for agents
312 /// whose kernel-attested PID namespace differs from the observer's.
313 /// Use only when agents intentionally share the host namespace
314 /// (`--pid=host` containers) or an out-of-band translator is in place.
315 /// Set by `--allow-cross-namespace-agents`. Default `false` — beats from
316 /// cross-namespace agents are dropped at receive (counted via
317 /// `varta_frame_namespace_mismatch_total`), and any stalls that did
318 /// progress before opt-in refuse recovery (counted via
319 /// `varta_recovery_refused_total{reason="cross_namespace_agent"}`).
320 pub allow_cross_namespace_agents: bool,
321 /// Treat a cross-namespace agent as a fatal startup error instead of the
322 /// default refuse-recovery behaviour. Set by `--strict-namespace-check`.
323 /// Useful in environments where the operator wants the daemon to fail
324 /// loudly rather than silently log audit refusals. Default `false`.
325 pub strict_namespace_check: bool,
326 /// Optional path the recovery audit TSV is appended to. When set, every
327 /// recovery spawn and completion is recorded with wall-clock timestamp,
328 /// agent pid, child pid, mode, outcome, exit code, and duration. See
329 /// [`crate::audit::RecoveryAuditLog`] for the schema.
330 pub recovery_audit_file: Option<PathBuf>,
331 /// Optional byte cap for the recovery audit file. When exceeded, the
332 /// file rotates through up to 5 generations (PATH → PATH.1 → … →
333 /// PATH.5). Without a cap the file grows unbounded.
334 pub recovery_audit_max_bytes: Option<u64>,
335 /// How many records to write between forced `fdatasync(2)` calls on
336 /// the audit file. Default `1` (sync every record) — the only
337 /// IEC 62304 Class C-conforming value. Higher values trade a small
338 /// risk of losing up to N-1 records on power cut for a lower per-
339 /// record cost. Values >1 trigger a startup warning. `0` is rejected
340 /// at parse time.
341 pub recovery_audit_sync_every: u32,
342 /// Whether to capture child stdout/stderr non-blockingly for the audit
343 /// record. Default off — pipes are inherited from the observer. Opt-in
344 /// avoids deadlock risk for operators who alias chatty recovery
345 /// commands (e.g. `journalctl -xeu agent.service`).
346 pub recovery_capture_stdio: bool,
347 /// Total byte cap (stdout + stderr combined, per child) when
348 /// `recovery_capture_stdio` is enabled. Defaults to
349 /// [`DEFAULT_RECOVERY_CAPTURE_BYTES`]. Values larger than
350 /// [`MAX_RECOVERY_CAPTURE_BYTES`] are rejected at parse time.
351 pub recovery_capture_bytes: u32,
352 /// Soft per-iteration budget for the observer poll loop. Iterations
353 /// exceeding this increment
354 /// `varta_observer_iteration_budget_exceeded_total` and are visible in
355 /// the `varta_observer_iteration_seconds` histogram. Advisory only —
356 /// hard wedges are caught by `--self-watchdog-secs`. Set by
357 /// `--iteration-budget-ms`; defaults to
358 /// [`crate::exporter::DEFAULT_ITERATION_BUDGET`].
359 pub iteration_budget: Duration,
360 /// Soft per-call budget for `PromExporter::serve_pending`. Calls
361 /// exceeding this increment
362 /// `varta_observer_scrape_budget_exceeded_total` and are visible in
363 /// the `varta_observer_serve_pending_seconds` histogram. Lets
364 /// operators alert on scrape-storm pressure separately from beat-path
365 /// slowness. Set by `--scrape-budget-ms`; defaults to
366 /// [`crate::exporter::DEFAULT_SCRAPE_BUDGET`].
367 pub scrape_budget: Duration,
368 /// Soft per-call budget for a single `fdatasync(2)` on the audit
369 /// log. If one fsync exceeds this, the remaining records in the
370 /// current drain are written-to-BufWriter only and the fsync is
371 /// deferred to the next maintenance tick — bounds the worst-case
372 /// poll stall on a slow disk to one fsync per tick. Increments
373 /// `varta_audit_fsync_budget_exceeded_total` on overrun. Set by
374 /// `--audit-fsync-budget-ms`; defaults to
375 /// [`DEFAULT_AUDIT_FSYNC_BUDGET_MS`]. `0` is rejected.
376 pub audit_fsync_budget_ms: u32,
377 /// Time-based fdatasync cadence in addition to the record-count
378 /// cadence from `--recovery-audit-sync-every`. `0` (default)
379 /// disables the time-based cadence; with a non-zero value, the
380 /// drain force-syncs after this many ms have elapsed since the
381 /// last sync even when the per-record threshold has not yet
382 /// been crossed. Operators on safety-critical profiles keep
383 /// `--recovery-audit-sync-every=1` and ignore this flag; deployments
384 /// that relax the record cadence pin a worst-case sync interval
385 /// here. Set by `--audit-sync-interval-ms`; defaults to
386 /// [`DEFAULT_AUDIT_SYNC_INTERVAL_MS`].
387 pub audit_sync_interval_ms: u32,
388 /// Per-tick wall-clock budget for the audit-log rotation state
389 /// machine. Rotation (rename × 5 + reopen + header + boot record +
390 /// fsync) advances incrementally; if a tick exceeds this budget the
391 /// state is preserved and the next tick resumes. Increments
392 /// `varta_audit_rotation_budget_exceeded_total` on overrun. Set by
393 /// `--audit-rotation-budget-ms`; defaults to
394 /// [`DEFAULT_AUDIT_ROTATION_BUDGET_MS`]. `0` is rejected.
395 pub audit_rotation_budget_ms: u32,
396 /// [test-hooks only] Sleep for this many milliseconds on the first poll
397 /// iteration, simulating a wedged loop. Used by the self-watchdog
398 /// integration test (`tests/self_watchdog.rs`) to exercise the abort path
399 /// without relying on SIGSTOP (which freezes the watchdog thread too).
400 /// Present only when compiled with `--features test-hooks`.
401 #[cfg(feature = "test-hooks")]
402 pub inject_wedge_ms: Option<u64>,
403 /// Kernel clock that backs stall-threshold accounting (H7).
404 ///
405 /// - `Monotonic` (default): `CLOCK_MONOTONIC` — pauses on system
406 /// suspend. Correct for SRE / cloud deployments.
407 /// - `Boottime` (Linux only): `CLOCK_BOOTTIME` — advances during
408 /// suspend. Correct for embedded clinical devices that aggressively
409 /// sleep (insulin pumps, holter monitors).
410 ///
411 /// See `book/src/architecture/safety-profiles.md` for the deployment
412 /// matrix. Set by `--clock-source <monotonic|boottime>`.
413 pub clock_source: ClockSource,
414 /// Signal-handler installation path on Linux.
415 ///
416 /// - `Direct` (default): direct `rt_sigaction(2)` syscall — owns the
417 /// kernel ABI end-to-end, including the x86_64 signal-return trampoline.
418 /// A readback + live SIGUSR1 smoke test run at startup.
419 /// - `Libc`: libc `sigaction(3)` wrapper — libc's `__restore_rt` is used.
420 /// Opt-in for kernels not yet certified against the direct path.
421 ///
422 /// On macOS, FreeBSD, and other Unix, the mode is noted in startup
423 /// logs but has no operational effect (libc / POSIX is the only option).
424 /// Set by `--signal-handler-mode <direct|libc>`.
425 pub signal_handler_mode: SignalHandlerMode,
426}
427
428/// Failure modes for [`Config::from_args`].
429#[derive(Debug)]
430pub enum ConfigError {
431 /// A flag that requires a value was passed without one.
432 MissingValue(&'static str),
433 /// A required flag (e.g. `--socket`, `--threshold-ms`) was omitted.
434 MissingRequired(&'static str),
435 /// An unknown flag token was encountered.
436 UnknownFlag(String),
437 /// A numeric flag carried a value that would not parse as `u64`.
438 BadInteger {
439 /// The flag whose value failed to parse.
440 flag: &'static str,
441 /// The raw string that did not parse.
442 raw: String,
443 },
444 /// A value on `--socket-mode` could not be parsed as octal.
445 BadSocketMode(String),
446 /// `--prom-addr` value did not parse as `IP:PORT`.
447 BadAddr(String),
448 /// A value for a string-enum flag was not one of the accepted choices.
449 BadValue {
450 /// The flag whose value was rejected.
451 flag: &'static str,
452 /// The raw string that was provided.
453 raw: String,
454 },
455 /// The user passed `--help` / `-h`. Not a true error; `main` prints
456 /// [`Config::HELP`] and exits 0.
457 HelpRequested,
458 /// `--threshold-ms` value is below [`MIN_THRESHOLD_MS`].
459 ThresholdTooLow {
460 /// The value that was provided.
461 value: u64,
462 /// The minimum allowed value.
463 min: u64,
464 },
465 /// Two or more mutually exclusive recovery flags were specified.
466 MutuallyExclusive {
467 /// The pair of conflicting flags (e.g. `("--recovery-exec", "--recovery-exec-file")`).
468 a: &'static str,
469 /// Second conflicting flag.
470 b: &'static str,
471 },
472 /// A flag that has been removed for security reasons was passed. The
473 /// `replacement` field carries an inline migration hint so operators
474 /// see the fix in the same line as the error.
475 RemovedFlag {
476 /// The removed flag token (e.g. `"--key-env"`).
477 flag: &'static str,
478 /// Human-readable migration hint (e.g.
479 /// `"--key-file (mode 0600, owned by the observer UID)"`).
480 replacement: &'static str,
481 },
482 /// `--prom-addr` was set but `--prom-token-file` was not. /metrics
483 /// has no anonymous access; the observer refuses to start rather than
484 /// expose agent topology to anyone who can reach the bound port.
485 PromAddrRequiresToken,
486 /// `--recovery-capture-bytes` was set above
487 /// [`MAX_RECOVERY_CAPTURE_BYTES`]. Capturing more output than that
488 /// risks holding too much child stdout/stderr in observer memory.
489 RecoveryCaptureBytesTooLarge {
490 /// The value that was provided.
491 value: u32,
492 /// The maximum allowed value.
493 max: u32,
494 },
495 /// `--recovery-capture-stdio` was passed without any recovery command
496 /// configured (`--recovery-exec` / `--recovery-exec-file`). Capture is
497 /// meaningless without something to capture from.
498 RecoveryCaptureRequiresRecovery,
499 /// `--shutdown-grace-ms` was below [`MIN_SHUTDOWN_GRACE_MS`].
500 ShutdownGraceTooLow {
501 /// The value provided on the CLI.
502 value: u64,
503 /// The minimum allowed value.
504 min: u64,
505 },
506 /// Shell-mode recovery flags were passed (removed feature). Use
507 /// `--recovery-exec` instead.
508 ShellRecoveryNotCompiledIn,
509 /// A recovery command (`--recovery-exec` / `--recovery-exec-file`) was
510 /// configured at the
511 /// same time as a UDP listener (`--udp-port`), without the matching
512 /// per-listener operator acknowledgement. UDP transports cannot attest
513 /// the sending process — an attacker holding the AEAD key (or a derived
514 /// per-agent key) can forge a beat claiming any pid, then stop sending to
515 /// trigger the recovery command against the chosen pid. Pass
516 /// `--secure-udp-i-accept-recovery-on-unauthenticated-transport` (for
517 /// secure UDP) or
518 /// `--plaintext-udp-i-accept-recovery-on-unauthenticated-transport` (for
519 /// plaintext UDP) to proceed.
520 RecoveryRequiresAuthenticatedTransport {
521 /// The `IP:PORT` of the UDP listener that would have been bound.
522 udp_addr: String,
523 },
524 /// A secure-UDP listener was configured with a non-loopback
525 /// `--udp-bind-addr`, but `--i-accept-secure-udp-non-loopback` was not
526 /// passed (H4). The 1-deep replay shadow after capacity-forced eviction
527 /// is acceptable for closed local networks (loopback) but inadequate for
528 /// any reachable network — any spoofable-source attacker with ≥1025
529 /// distinct UDP source addresses can rotate the shadow and replay one
530 /// captured frame per target.
531 SecureUdpRequiresLoopbackBind {
532 /// The `IP:PORT` of the UDP listener that would have been bound.
533 udp_addr: String,
534 },
535 /// `--iteration-budget-ms` was outside the accepted range
536 /// (`[MIN_ITERATION_BUDGET_MS, MAX_ITERATION_BUDGET_MS]`).
537 IterationBudgetOutOfRange {
538 /// The value provided.
539 value: u64,
540 /// The minimum allowed value.
541 min: u64,
542 /// The maximum allowed value.
543 max: u64,
544 },
545 /// `--scrape-budget-ms` was outside the accepted range
546 /// (`[MIN_SCRAPE_BUDGET_MS, MAX_SCRAPE_BUDGET_MS]`).
547 ScrapeBudgetOutOfRange {
548 /// The value provided.
549 value: u64,
550 /// The minimum allowed value.
551 min: u64,
552 /// The maximum allowed value.
553 max: u64,
554 },
555 /// `--eviction-scan-window` was outside the accepted range
556 /// (`[MIN_EVICTION_SCAN_WINDOW, MAX_EVICTION_SCAN_WINDOW]`).
557 EvictionScanWindowOutOfRange {
558 /// The value provided.
559 value: usize,
560 /// The minimum allowed value.
561 min: usize,
562 /// The maximum allowed value.
563 max: usize,
564 },
565 /// `--clock-source boottime` was requested but the host kernel has no
566 /// equivalent of Linux's `CLOCK_BOOTTIME`. Currently fires on every
567 /// non-Linux target (macOS, *BSD).
568 ClockSourceUnsupported {
569 /// The source the operator requested.
570 source: ClockSource,
571 /// `std::env::consts::OS` for the build target.
572 platform: &'static str,
573 },
574 /// The binary was built with `--features compile-time-config` but the
575 /// operator supplied one or more argv tokens. Class-A safety-critical
576 /// builds intentionally accept zero argv; the configuration is baked
577 /// into the binary by `build.rs` at compile time.
578 CompileTimeArgvForbidden,
579 /// `Config::compile_time()` produced a value that fails cross-field
580 /// validation at startup (e.g. recovery requires kernel-attested
581 /// transport but the compile-time blob enabled both UDP and recovery
582 /// without the acknowledgement flag). Carries the same diagnostic
583 /// text the corresponding `from_args` error would produce.
584 CompileTimeConfigInvalid {
585 /// Static description of which invariant was violated.
586 reason: &'static str,
587 },
588}
589
590// The `ConfigError` Display impl has two cfg-gated personalities:
591//
592// 1. Default (SRE) builds: rich messages that name the flag the operator
593// must supply or correct. These strings carry literal flag names like
594// `--socket` and `--prom-addr` and are linked unconditionally.
595//
596// 2. Class-A (`compile-time-config`) builds: terse, neutral phrasings that
597// never mention argv flag names. Most variants are dead code anyway —
598// they are produced only by `Config::from_args`, which is excluded from
599// compilation when the feature is on — but the Display impl must still
600// cover every variant, and any literal flag string in the impl ends up
601// in the binary (cerebrum 2026-05-12: `pub const &str` is always linked
602// regardless of `#[cfg]` on the code paths that consume it).
603//
604// The Class-A wording uses `config key` instead of `--flag-name` and refers
605// the operator to `book/src/architecture/compile-time-config.md` for any
606// remediation. The two impls are mutually exclusive at the `#[cfg]` layer.
607
608#[cfg(not(feature = "compile-time-config"))]
609impl core::fmt::Display for ConfigError {
610 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
611 match self {
612 ConfigError::MissingValue(flag) => write!(f, "{flag} requires a value"),
613 ConfigError::MissingRequired(flag) => write!(f, "missing required flag {flag}"),
614 ConfigError::UnknownFlag(s) => write!(f, "unknown flag {s}"),
615 ConfigError::BadInteger { flag, raw } => {
616 write!(f, "{flag}: not a valid unsigned integer: {raw:?}")
617 }
618 ConfigError::BadSocketMode(raw) => {
619 write!(
620 f,
621 "--socket-mode: expected octal digits (e.g. 600, 0600, or 0o600), got: {raw:?}"
622 )
623 }
624 ConfigError::BadAddr(raw) => {
625 write!(f, "--prom-addr: not a valid socket address: {raw:?}")
626 }
627 ConfigError::BadValue { flag, raw } => {
628 write!(f, "{flag}: invalid value {raw:?}",)
629 }
630 ConfigError::HelpRequested => f.write_str("--help"),
631 ConfigError::ThresholdTooLow { value, min } => {
632 write!(
633 f,
634 "--threshold-ms: {value} is below the minimum allowed value ({min} ms)"
635 )
636 }
637 ConfigError::MutuallyExclusive { a, b } => {
638 write!(f, "{a} and {b} are mutually exclusive")
639 }
640 ConfigError::RemovedFlag { flag, replacement } => write!(
641 f,
642 "{flag} has been removed for security reasons; use {replacement}"
643 ),
644 ConfigError::PromAddrRequiresToken => f.write_str(
645 "--prom-addr requires --prom-token-file. /metrics has no anonymous access; \
646 generate a token with `openssl rand -hex 32 > /etc/varta/prom.token && \
647 chmod 600 /etc/varta/prom.token`.",
648 ),
649 ConfigError::ShutdownGraceTooLow { value, min } => write!(
650 f,
651 "--shutdown-grace-ms: {value} is below the minimum allowed value ({min} ms)"
652 ),
653 ConfigError::RecoveryCaptureBytesTooLarge { value, max } => write!(
654 f,
655 "--recovery-capture-bytes: {value} exceeds the maximum allowed value ({max} bytes)"
656 ),
657 ConfigError::RecoveryCaptureRequiresRecovery => f.write_str(
658 "--recovery-capture-stdio requires --recovery-exec or --recovery-exec-file",
659 ),
660 ConfigError::ShellRecoveryNotCompiledIn => f.write_str(
661 "shell-mode recovery has been permanently removed; use --recovery-exec instead",
662 ),
663 ConfigError::RecoveryRequiresAuthenticatedTransport { udp_addr } => write!(
664 f,
665 "recovery command is configured alongside a UDP listener on {udp_addr}. \
666 UDP transports cannot attest the sending process — a holder of the AEAD key \
667 (or a per-agent key derived from a leaked master key) can forge a beat \
668 claiming any pid, then stop sending to trigger recovery against the chosen pid. \
669 Either remove the recovery command, switch to a UDS-only deployment, or pass \
670 --secure-udp-i-accept-recovery-on-unauthenticated-transport (for secure UDP) \
671 or --plaintext-udp-i-accept-recovery-on-unauthenticated-transport (for plaintext \
672 UDP) to explicitly accept this risk on a per-listener basis."
673 ),
674 ConfigError::SecureUdpRequiresLoopbackBind { udp_addr } => write!(
675 f,
676 "secure-UDP listener configured with non-loopback --udp-bind-addr ({udp_addr}). \
677 The per-sender replay-state map holds up to 1024 senders plus a 1-deep \
678 eviction shadow; an attacker who can spoof ≥1025 UDP source addresses can \
679 rotate the shadow and replay a captured frame against a target sender. \
680 Either bind to a loopback address (default 127.0.0.1) or pass \
681 --i-accept-secure-udp-non-loopback to explicitly accept this risk. \
682 See book/src/architecture/vlp-transports.md for the threat-boundary derivation."
683 ),
684 ConfigError::IterationBudgetOutOfRange { value, min, max } => write!(
685 f,
686 "--iteration-budget-ms: {value} is outside the accepted range [{min}, {max}] ms"
687 ),
688 ConfigError::ScrapeBudgetOutOfRange { value, min, max } => write!(
689 f,
690 "--scrape-budget-ms: {value} is outside the accepted range [{min}, {max}] ms"
691 ),
692 ConfigError::EvictionScanWindowOutOfRange { value, min, max } => write!(
693 f,
694 "--eviction-scan-window: {value} is outside the accepted range [{min}, {max}]"
695 ),
696 ConfigError::ClockSourceUnsupported { source, platform } => {
697 let hint = match source {
698 crate::clock::ClockSource::Boottime => {
699 "`boottime` semantics (advance through suspend) require Linux's \
700 CLOCK_BOOTTIME. On macOS / iOS use `--clock-source monotonic-raw` \
701 (mach_continuous_time) for the same semantics; BSD has no equivalent \
702 kernel clock."
703 }
704 crate::clock::ClockSource::MonotonicRaw => {
705 "`monotonic-raw` is macOS / iOS only (CLOCK_MONOTONIC_RAW = \
706 mach_continuous_time). On Linux use `--clock-source boottime` \
707 (CLOCK_BOOTTIME) for advance-through-suspend semantics; BSD \
708 has no equivalent kernel clock."
709 }
710 crate::clock::ClockSource::Monotonic => "",
711 };
712 write!(
713 f,
714 "--clock-source {source} is not supported on `{platform}`. {hint} \
715 Otherwise use `--clock-source monotonic` (the default)."
716 )
717 }
718 ConfigError::CompileTimeArgvForbidden => f.write_str(
719 "this binary was configured at compile time \
720 (--features compile-time-config); refusing to accept argv. \
721 See book/src/architecture/compile-time-config.md for the \
722 supported configuration mechanism.",
723 ),
724 ConfigError::CompileTimeConfigInvalid { reason } => write!(
725 f,
726 "compile-time config violates a cross-field invariant: {reason}"
727 ),
728 }
729 }
730}
731
// Class-A (`compile-time-config`) build rendering of `ConfigError`.
//
// Messages are deliberately terse and never spell out an argv flag name, so
// no flag literal is introduced into the binary by this impl.
#[cfg(feature = "compile-time-config")]
impl core::fmt::Display for ConfigError {
    /// Writes the neutral, flag-free diagnostic for this error into `f`.
    ///
    /// Returns whatever error the underlying formatter reports; the
    /// rendering itself never fails.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Generic remediation pointer for every flag-relevant variant.
        // Argv-only variants are unreachable in Class-A builds (their
        // producer, `Config::from_args`, is excluded from compilation),
        // but the Display impl must still cover them. Neutral wording
        // keeps argv flag names out of the binary's `strings` output.
        const REF: &str = "see book/src/architecture/compile-time-config.md";
        match self {
            // Listed explicitly (no `_` wildcard) so that adding a variant to
            // `ConfigError` forces a decision about its Class-A wording.
            ConfigError::MissingValue(_)
            | ConfigError::MissingRequired(_)
            | ConfigError::UnknownFlag(_)
            | ConfigError::BadInteger { .. }
            | ConfigError::BadSocketMode(_)
            | ConfigError::BadAddr(_)
            | ConfigError::BadValue { .. }
            | ConfigError::HelpRequested
            | ConfigError::MutuallyExclusive { .. }
            | ConfigError::RemovedFlag { .. }
            | ConfigError::PromAddrRequiresToken
            | ConfigError::ShutdownGraceTooLow { .. }
            | ConfigError::RecoveryCaptureBytesTooLarge { .. }
            | ConfigError::RecoveryCaptureRequiresRecovery
            | ConfigError::RecoveryRequiresAuthenticatedTransport { .. }
            | ConfigError::SecureUdpRequiresLoopbackBind { .. }
            | ConfigError::IterationBudgetOutOfRange { .. }
            | ConfigError::ScrapeBudgetOutOfRange { .. }
            | ConfigError::EvictionScanWindowOutOfRange { .. } => {
                write!(f, "configuration error (argv path unreachable; {REF})")
            }
            ConfigError::ThresholdTooLow { value, min } => {
                write!(f, "threshold below minimum: {value} ms < {min} ms ({REF})")
            }
            ConfigError::ShellRecoveryNotCompiledIn => write!(
                f,
                "shell-mode recovery has been permanently removed ({REF})"
            ),
            // Platform-neutral wording: the variant can be raised on any
            // platform (the default-build impl shows a source that is
            // rejected on Linux and one that is accepted on macOS), so the
            // text must not claim that Linux alone offers non-monotonic
            // sources.
            ConfigError::ClockSourceUnsupported { platform, .. } => write!(
                f,
                "configured clock source is not supported on `{platform}`; \
                 the monotonic source is supported on every platform ({REF})"
            ),
            ConfigError::CompileTimeArgvForbidden => f.write_str(
                "this binary was configured at compile time; \
                 refusing to accept command-line arguments",
            ),
            ConfigError::CompileTimeConfigInvalid { reason } => write!(
                f,
                "compile-time config violates a cross-field invariant: {reason}"
            ),
        }
    }
}
786
// Marks `ConfigError` as a standard error type so it can be boxed as
// `dyn std::error::Error` and propagated with `?`. The body is empty, so
// the trait's default methods apply and `source()` reports no underlying
// cause.
impl std::error::Error for ConfigError {}