varta_vlp/lib.rs
1#![cfg_attr(not(feature = "std"), no_std)]
2#![deny(missing_docs, unsafe_op_in_unsafe_fn, rust_2018_idioms)]
3#![forbid(clippy::dbg_macro, clippy::print_stdout)]
4
5//! Varta Lifeline Protocol — 32-byte fixed-layout health frame.
6//!
7//! This crate is the protocol root for Varta v0.1.0. It defines the on-wire
8//! [`Frame`] representation that agents emit and observers decode, the
9//! [`Status`] enum that classifies an agent's last reported health, and the
10//! [`DecodeError`] returned when validation fails. Every helper operates on
11//! fixed-size byte arrays so the steady-state path on either side of the
12//! socket is heap-clean.
13//!
14//! The crate compiles as `#![no_std]` by default and pulls in zero allocator
15//! usage; the optional `std` feature enables `Key::from_file` and related
16//! `std::path::Path`-typed conveniences.
17//!
18//! See `book/src/architecture/vlp-frame.md` for the byte map and design notes.
19
20// Unit tests live inside the lib crate and use `format!` / `assert_eq!` against
21// dynamic strings; pull `std` in for the test harness only. This does not
22// affect the production library's `#![no_std]` posture in any build mode.
23#[cfg(test)]
24extern crate std;
25
26#[cfg(feature = "crypto")]
27pub mod crypto;
28
29pub mod crc32c;
30pub mod util;
31pub use util::{ct_eq, decode_hex_32, HexDecodeError};
32
33// Symbolic-verification harnesses live in their own module gated
34// `#[cfg(kani)]` so they compile only under `cargo kani`. The Kani crate
35// is injected by the verifier and never appears in [`Cargo.toml`]; the
36// zero-registry-dependency invariant for varta-vlp is preserved.
37//
38// See `book/src/architecture/verification.md`.
39#[cfg(kani)]
40pub mod proofs;
41
/// Two-byte marker that opens every VLP frame: the ASCII letters `"VA"`
/// (`0x56 0x41`). Deliberately human-readable so that a stray byte stream
/// is easy to recognise in a hex dump.
pub const MAGIC: [u8; 2] = *b"VA";

/// Version byte of the Varta Lifeline Protocol spoken by this crate. v0.2
/// added the CRC-32C integrity trailer at bytes 28..32 and narrowed
/// `payload` from `u64` to `u32` to make room for it. Frames stamped v0.1
/// are rejected with [`DecodeError::BadVersion`].
pub const VERSION: u8 = 0x02;
50
// Compile-time guard: the VLP frame layout is little-endian by specification
// (see book/src/architecture/vlp-frame.md). The crate therefore refuses to
// build for any big-endian target rather than risk silently producing broken
// frames. Note that `target_endian` describes the compilation *target*, not
// the machine the compiler runs on.
#[cfg(not(target_endian = "little"))]
compile_error!(
    "VLP frame protocol requires little-endian host (see book/src/architecture/vlp-frame.md)"
);
58
/// Sentinel nonce value (`u64::MAX`) reserved for terminal panic frames.
///
/// Emitted only by `varta_client::panic::install*` panic hooks, paired with
/// [`Status::Critical`]. Regular beats from `varta_client::Varta::beat`
/// increment monotonically from 1 and wrap to 0 on exhaustion (the wrap
/// boundary in the client is `NONCE_TERMINAL - 1 → 0`), so the regular-beat
/// nonce stream structurally never collides with this sentinel.
///
/// [`Frame::decode`] enforces `NONCE_TERMINAL ⇒ Status::Critical` and
/// returns [`DecodeError::BadNonce`] when the pairing is violated. The
/// converse is *not* enforced — operators may emit `Status::Critical` for
/// non-panic alerts at any nonce. Downstream consumers that need to
/// distinguish "panic terminal" from "operational critical" must inspect
/// both `status` *and* `nonce`.
///
/// See `book/src/architecture/vlp-frame.md` ("Nonce semantics") for the
/// full protocol-level rationale.
pub const NONCE_TERMINAL: u64 = u64::MAX;
76
/// Health status reported by an agent in a single VLP frame.
///
/// The discriminants are explicit because they form part of the on-wire
/// contract: agents serialise `Status as u8` and observers reconstruct via
/// [`Status::try_from_u8`].
///
/// This enum is exhaustive. Adding a variant is a workspace-wide compile-error
/// change. The wire format (version-pinned by [`VERSION`]) guarantees that no
/// in-memory `Status` value exists outside this list; unknown bytes are rejected
/// by [`Status::try_from_u8`] as [`DecodeError::BadStatus`].
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub enum Status {
    /// The agent is healthy and making progress.
    Ok = 0,
    /// The agent is making progress but reporting elevated trouble (e.g.
    /// retrying, throttled).
    Degraded = 1,
    /// The agent is in a critical condition. The panic hook in
    /// `varta-client` emits this, paired with [`NONCE_TERMINAL`], immediately
    /// before unwinding; operators may also emit it for non-panic alerts, so
    /// `Critical` alone does not imply a panic.
    Critical = 2,
    /// The agent appears stuck. Synthesised by `varta-watch` when no beat has
    /// arrived within the configured threshold. It is observer-only:
    /// [`Frame::decode`] rejects it on the wire with
    /// [`DecodeError::StallOnWire`].
    Stall = 3,
}
102
103impl Status {
104 /// Decode a status byte from the wire format. Returns
105 /// [`DecodeError::BadStatus`] carrying the offending byte if the value is
106 /// not a known variant.
107 pub fn try_from_u8(byte: u8) -> Result<Self, DecodeError> {
108 match byte {
109 0 => Ok(Status::Ok),
110 1 => Ok(Status::Degraded),
111 2 => Ok(Status::Critical),
112 3 => Ok(Status::Stall),
113 other => Err(DecodeError::BadStatus(other)),
114 }
115 }
116}
117
/// On-wire health frame — exactly 32 bytes, 8-byte aligned, little-endian
/// integer fields. The struct is `repr(C)` so its layout is ABI-stable across
/// compilations and trivially verifiable by inspection.
///
/// Build frames with [`Frame::new`]: the struct is `#[non_exhaustive]`, so
/// code outside this crate cannot construct it with a struct literal. The
/// fields themselves are public and may be read or adjusted afterwards. Call
/// [`Frame::encode`] to write a frame to a socket buffer or
/// [`Frame::decode`] to read one. There is no `Default`; agents always
/// supply a real `pid`, `nonce` and timestamp.
#[non_exhaustive]
#[repr(C, align(8))]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Frame {
    /// Magic prefix, always equal to [`MAGIC`].
    pub magic: [u8; 2],
    /// Protocol version, always equal to [`VERSION`] on emit.
    pub version: u8,
    /// Health status reported by the agent. Encoded on the wire as a
    /// single byte at offset 3 ([`Status`] discriminants are `#[repr(u8)]`).
    pub status: Status,
    /// OS process id of the emitting agent.
    pub pid: u32,
    /// Monotonic timestamp chosen by the emitter (typically nanoseconds since
    /// some agent-local epoch). Observers do not interpret it; they only
    /// compare consecutive timestamps for the same pid.
    pub timestamp: u64,
    /// Monotonically increasing beat counter, starting at 1 on the first
    /// beat after `Varta::connect`. The panic hook pins this to
    /// [`NONCE_TERMINAL`] to mark a final critical frame.
    ///
    /// Regular beats wrap at `NONCE_TERMINAL - 1 → 0` on exhaustion, so the
    /// regular-beat nonce stream **structurally never collides** with
    /// [`NONCE_TERMINAL`]. A wire frame with `nonce == NONCE_TERMINAL` is, by
    /// construction, a panic frame (and [`Frame::decode`] enforces it must
    /// also carry [`Status::Critical`]).
    pub nonce: u64,
    /// Free-form 4-byte payload — application-defined health context (queue
    /// depth, error code, etc.). Carried opaquely by the protocol. Shrunk
    /// from `u64` to `u32` in VLP v0.2 to fit the CRC-32C trailer in 32
    /// bytes; the on-wire CRC occupies bytes 28..32 and is not surfaced as
    /// a struct field — see [`Frame::encode`] / [`Frame::decode`]. In memory
    /// those last four bytes of the struct are trailing padding.
    pub payload: u32,
}
160
// Compile-time layout checks. The in-memory `Frame` must remain exactly
// 32 bytes with 8-byte alignment, and every field must sit at the same
// offset that `Frame::encode` / `Frame::decode` use for it on the wire.
// Any edit to the struct that shifts a field fails the build here.
const _: () = assert!(core::mem::size_of::<Frame>() == 32);
const _: () = assert!(core::mem::align_of::<Frame>() == 8);
const _: () = assert!(core::mem::offset_of!(Frame, magic) == 0);
const _: () = assert!(core::mem::offset_of!(Frame, version) == 2);
const _: () = assert!(core::mem::offset_of!(Frame, status) == 3);
const _: () = assert!(core::mem::offset_of!(Frame, pid) == 4);
const _: () = assert!(core::mem::offset_of!(Frame, timestamp) == 8);
const _: () = assert!(core::mem::offset_of!(Frame, nonce) == 16);
const _: () = assert!(core::mem::offset_of!(Frame, payload) == 24);
170
171impl Frame {
172 /// Construct a new frame with the canonical [`MAGIC`] prefix and
173 /// [`VERSION`] byte already populated. All other fields are
174 /// caller-supplied.
175 pub const fn new(status: Status, pid: u32, timestamp: u64, nonce: u64, payload: u32) -> Frame {
176 Frame {
177 magic: MAGIC,
178 version: VERSION,
179 status,
180 pid,
181 timestamp,
182 nonce,
183 payload,
184 }
185 }
186
187 /// Serialise this frame into a 32-byte buffer in canonical
188 /// little-endian layout. The output buffer is overwritten in place; this
189 /// method allocates nothing.
190 ///
191 /// Bytes 28..32 are stamped with a CRC-32C computed over bytes 0..28 —
192 /// see [`crate::crc32c`]. The CRC is a wire-format artifact, not a
193 /// struct field; callers must never mutate the buffer between `encode`
194 /// and the on-wire write or the receiver will reject the frame as
195 /// [`DecodeError::BadCrc`].
196 pub fn encode(&self, out: &mut [u8; 32]) {
197 out[0..2].copy_from_slice(&self.magic);
198 out[2] = self.version;
199 out[3] = self.status as u8;
200 out[4..8].copy_from_slice(&self.pid.to_le_bytes());
201 out[8..16].copy_from_slice(&self.timestamp.to_le_bytes());
202 out[16..24].copy_from_slice(&self.nonce.to_le_bytes());
203 out[24..28].copy_from_slice(&self.payload.to_le_bytes());
204 let crc = crc32c::compute(&out[0..28]);
205 out[28..32].copy_from_slice(&crc.to_le_bytes());
206 }
207
208 /// Decode a 32-byte buffer back into a [`Frame`], validating magic,
209 /// version, CRC, status, and field ranges in that order. Returns
210 /// [`DecodeError`] on the first failed check.
211 ///
212 /// Order rationale: `magic` + `version` come first so random bytes
213 /// from a wrong-protocol sender surface as
214 /// [`DecodeError::BadMagic`] / [`DecodeError::BadVersion`] (the
215 /// "this isn't even VLP" diagnostic). The CRC then gates every
216 /// field-range check — a single-bit-flipped status byte must surface
217 /// as [`DecodeError::BadCrc`], not as a valid frame with the wrong
218 /// meaning.
219 ///
220 /// Field-range rules enforced after the CRC passes:
221 /// * `status == Status::Stall` is rejected — `Stall` is observer-synthesized
222 /// by `varta-watch` when a pid goes silent past its threshold; no
223 /// legitimate agent emits it on the wire. Accepting a spoofed `Stall`
224 /// frame would let a hostile sender pollute observer telemetry from
225 /// any pid.
226 /// * `pid ∈ {0, 1}` is rejected — pid 0 is the kernel/scheduler and
227 /// pid 1 is init/systemd; no legitimate agent runs at either, and
228 /// accepting them lets a hostile sender spoof "init has stalled" to
229 /// the recovery path.
230 /// * `timestamp == u64::MAX` is rejected — `varta_client::Varta::beat`
231 /// saturates at this value with `.min(u64::MAX as u128) as u64`, and
232 /// reaching it through real elapsed time (~584 years) is impossible.
233 /// The sentinel is reserved.
234 ///
235 /// *Asymmetry note*: a hypothetical agent whose monotonic clock
236 /// saturates still observes `BeatOutcome::Sent` from `send(2)` (the
237 /// kernel sees a well-formed 32-byte datagram), while the observer
238 /// drops the frame as `DecodeError::BadTimestamp`. The divergence is
239 /// physically unreachable on a single `Varta::connect` handle and is
240 /// documented for completeness only.
241 /// * `nonce == NONCE_TERMINAL` is allowed only when paired with
242 /// `Status::Critical`; the sentinel is the panic-hook's terminal
243 /// marker and is never emitted on the regular beat path.
244 pub fn decode(bytes: &[u8; 32]) -> Result<Frame, DecodeError> {
245 let magic = [bytes[0], bytes[1]];
246 if magic != MAGIC {
247 return Err(DecodeError::BadMagic);
248 }
249 let version = bytes[2];
250 if version != VERSION {
251 return Err(DecodeError::BadVersion);
252 }
253
254 // CRC trailer at bytes 28..32 covers bytes 0..28. Verified after
255 // magic/version (so wrong-protocol bytes surface as BadMagic, not
256 // BadCrc) and before any field-range check (so corruption cannot
257 // produce a "well-formed" frame with the wrong meaning).
258 let stored_crc = u32::from_le_bytes([bytes[28], bytes[29], bytes[30], bytes[31]]);
259 let computed_crc = crc32c::compute(&bytes[0..28]);
260 if stored_crc != computed_crc {
261 return Err(DecodeError::BadCrc {
262 expected: computed_crc,
263 actual: stored_crc,
264 });
265 }
266
267 let status = Status::try_from_u8(bytes[3])?;
268 if status == Status::Stall {
269 return Err(DecodeError::StallOnWire);
270 }
271
272 // Each integer field is decoded via explicit array indexing — the
273 // compiler statically proves every index is in-bounds against the
274 // `&[u8; 32]` reference, so there are no runtime panics.
275 let pid = u32::from_le_bytes([bytes[4], bytes[5], bytes[6], bytes[7]]);
276 let timestamp = u64::from_le_bytes([
277 bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15],
278 ]);
279 let nonce = u64::from_le_bytes([
280 bytes[16], bytes[17], bytes[18], bytes[19], bytes[20], bytes[21], bytes[22], bytes[23],
281 ]);
282 let payload = u32::from_le_bytes([bytes[24], bytes[25], bytes[26], bytes[27]]);
283
284 if pid == 0 || pid == 1 {
285 return Err(DecodeError::BadPid(pid));
286 }
287 if timestamp == u64::MAX {
288 return Err(DecodeError::BadTimestamp(timestamp));
289 }
290 if nonce == NONCE_TERMINAL && status != Status::Critical {
291 return Err(DecodeError::BadNonce { nonce, status });
292 }
293
294 Ok(Frame {
295 magic,
296 version,
297 status,
298 pid,
299 timestamp,
300 nonce,
301 payload,
302 })
303 }
304}
305
/// Error returned by [`Frame::decode`] and [`Status::try_from_u8`].
///
/// The variants form an exhaustive list of validation failures the protocol
/// can detect statically; everything else (timestamp drift, nonce regression)
/// is policy enforced higher in the stack.
///
/// This enum is exhaustive. Adding a variant is a workspace-wide compile-error
/// change that requires updating every match site explicitly.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DecodeError {
    /// First two bytes did not equal [`MAGIC`].
    BadMagic,
    /// Version byte did not equal [`VERSION`].
    BadVersion,
    /// CRC-32C trailer at bytes 28..32 did not match the value computed
    /// over bytes 0..28. Indicates wire corruption (cosmic ray / NIC
    /// firmware / non-ECC RAM bit flip) on the UDS transport, or
    /// in-process memory corruption between `crypto::open` and
    /// `Frame::decode` on the secure-UDP transport. (`crypto` is named in
    /// plain code rather than as an intra-doc link because the module only
    /// exists when the `crypto` feature is enabled.) AEAD tag failures stay
    /// in the transport layer (`crypto::AuthError`) and never surface as
    /// `BadCrc`.
    BadCrc {
        /// CRC-32C recomputed over bytes 0..28 of the received frame.
        expected: u32,
        /// CRC-32C value carried in bytes 28..32 of the received frame.
        actual: u32,
    },
    /// Status byte did not match any known [`Status`] variant. The inner
    /// value is the offending byte, surfaced for observer-side diagnostics.
    BadStatus(u8),
    /// Observer-only status `Status::Stall` observed on the wire. `Stall`
    /// is synthesized by `varta-watch` when a pid goes silent past its
    /// threshold; agents emit only `Ok`, `Degraded`, or `Critical`. A
    /// spoofed `Stall` frame would inject false liveness telemetry from
    /// any pid, so the decoder rejects it at the single chokepoint.
    StallOnWire,
    /// Reserved pid: `0` (kernel/scheduler) or `1` (init/systemd). No
    /// legitimate agent runs at either pid; rejecting closes the "spoof
    /// init has stalled" recovery-trigger attack on UDP listeners. The
    /// inner value is the rejected pid.
    BadPid(u32),
    /// Reserved timestamp sentinel `u64::MAX` — the saturation value from
    /// `varta_client::Varta::beat`'s `.min(u64::MAX as u128) as u64`.
    /// Reaching it through real elapsed nanoseconds is not physically
    /// possible. The inner value is the rejected timestamp.
    BadTimestamp(u64),
    /// Protocol invariant violation: `nonce == NONCE_TERMINAL` is reserved
    /// for the panic hook's terminal frame and MUST be paired with
    /// [`Status::Critical`]. Carries both the sentinel `nonce` and the
    /// violating `status` for diagnostics.
    BadNonce {
        /// The terminal-sentinel nonce value observed on the wire.
        nonce: u64,
        /// The decoded status that was paired with the sentinel nonce.
        status: Status,
    },
}
361
362impl core::fmt::Display for DecodeError {
363 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
364 match self {
365 DecodeError::BadMagic => f.write_str("varta-vlp: bad magic prefix"),
366 DecodeError::BadVersion => f.write_str("varta-vlp: bad version byte"),
367 DecodeError::BadCrc { expected, actual } => {
368 write!(
369 f,
370 "varta-vlp: bad CRC-32C trailer (expected {expected:#010x}, actual {actual:#010x})"
371 )
372 }
373 DecodeError::BadStatus(byte) => {
374 write!(f, "varta-vlp: bad status byte {byte:#04x}")
375 }
376 DecodeError::StallOnWire => {
377 f.write_str("varta-vlp: Status::Stall is observer-only and forbidden on the wire")
378 }
379 DecodeError::BadPid(pid) => {
380 write!(f, "varta-vlp: reserved pid {pid}")
381 }
382 DecodeError::BadTimestamp(ts) => {
383 write!(f, "varta-vlp: reserved timestamp sentinel {ts:#x}")
384 }
385 DecodeError::BadNonce { nonce, status } => {
386 write!(
387 f,
388 "varta-vlp: terminal nonce {nonce:#x} requires Status::Critical, got {status:?}"
389 )
390 }
391 }
392 }
393}
394
// Implemented against `core::error::Error` (not `std::error::Error`) so the
// impl is also available when the crate is built as `no_std`. The body is
// empty, so the trait's default methods apply.
impl core::error::Error for DecodeError {}