Skip to main content

varta_vlp/
lib.rs

1#![cfg_attr(not(feature = "std"), no_std)]
2#![deny(missing_docs, unsafe_op_in_unsafe_fn, rust_2018_idioms)]
3#![forbid(clippy::dbg_macro, clippy::print_stdout)]
4
5//! Varta Lifeline Protocol — 32-byte fixed-layout health frame.
6//!
7//! This crate is the protocol root for Varta v0.1.0. It defines the on-wire
8//! [`Frame`] representation that agents emit and observers decode, the
9//! [`Status`] enum that classifies an agent's last reported health, and the
10//! [`DecodeError`] returned when validation fails. Every helper operates on
11//! fixed-size byte arrays so the steady-state path on either side of the
12//! socket is heap-clean.
13//!
14//! The crate compiles as `#![no_std]` by default and pulls in zero allocator
15//! usage; the optional `std` feature enables `Key::from_file` and related
16//! `std::path::Path`-typed conveniences.
17//!
18//! See `book/src/architecture/vlp-frame.md` for the byte map and design notes.
19
20// Unit tests live inside the lib crate and use `format!` / `assert_eq!` against
21// dynamic strings; pull `std` in for the test harness only. This does not
22// affect the production library's `#![no_std]` posture in any build mode.
23#[cfg(test)]
24extern crate std;
25
26#[cfg(feature = "crypto")]
27pub mod crypto;
28
29pub mod crc32c;
30pub mod util;
31pub use util::{ct_eq, decode_hex_32, HexDecodeError};
32
33// Symbolic-verification harnesses live in their own module gated
34// `#[cfg(kani)]` so they compile only under `cargo kani`.  The Kani crate
35// is injected by the verifier and never appears in [`Cargo.toml`]; the
36// zero-registry-dependency invariant for varta-vlp is preserved.
37//
38// See `book/src/architecture/verification.md`.
39#[cfg(kani)]
40pub mod proofs;
41
/// Two-byte magic prefix that opens every VLP frame.
///
/// The bytes spell ASCII `"VA"` (`0x56 0x41`), chosen to be readable in hex
/// dumps so that a stray VLP byte stream is easy to identify.
pub const MAGIC: [u8; 2] = *b"VA";
45
/// Version byte of the Varta Lifeline Protocol spoken by this crate.
///
/// Protocol v0.2 added the CRC-32C integrity trailer at bytes 28..32 and, to
/// make room for it, narrowed `payload` from `u64` to `u32`. A frame carrying
/// any other version byte — v0.1 included — is rejected by [`Frame::decode`]
/// with [`DecodeError::BadVersion`].
pub const VERSION: u8 = 2;
50
// Compile-time guard: the VLP frame layout is little-endian by specification
// (see book/src/architecture/vlp-frame.md). `target_endian` describes the
// compilation *target*, not the machine running the build, so this rejects
// every big-endian target — including cross-builds from a little-endian
// machine — rather than risking frames with a broken byte order.
#[cfg(not(target_endian = "little"))]
compile_error!(
    "VLP frame protocol requires a little-endian target (see book/src/architecture/vlp-frame.md)"
);
58
/// Nonce sentinel reserved for terminal panic frames (all 64 bits set, i.e.
/// `u64::MAX`).
///
/// Only the `varta_client::panic::install*` panic hooks emit it, always
/// together with [`Status::Critical`]. Regular beats from
/// `varta_client::Varta::beat` count up from 1 and wrap to 0 on exhaustion —
/// the client's wrap boundary is `NONCE_TERMINAL - 1 → 0` — so the
/// regular-beat nonce stream structurally never reaches this value.
///
/// [`Frame::decode`] enforces one direction of the pairing: a frame whose
/// nonce is `NONCE_TERMINAL` must carry [`Status::Critical`], otherwise it is
/// rejected with [`DecodeError::BadNonce`]. The converse is *not* enforced —
/// `Status::Critical` is accepted at any nonce, so operators may use it for
/// non-panic alerts. Consumers that need to tell "panic terminal" apart from
/// "operational critical" must therefore inspect both `status` *and* `nonce`.
///
/// See `book/src/architecture/vlp-frame.md` ("Nonce semantics") for the
/// full protocol-level rationale.
pub const NONCE_TERMINAL: u64 = 0xFFFF_FFFF_FFFF_FFFF;
76
/// Health classification carried in byte 3 of a VLP frame.
///
/// Each variant has an explicit discriminant because the numeric value *is*
/// the wire encoding: agents serialise `Status as u8`, and observers map the
/// byte back with [`Status::try_from_u8`].
///
/// The enum is deliberately exhaustive, so adding a variant is a
/// workspace-wide compile-error change. The wire format is version-pinned by
/// [`VERSION`], and [`Status::try_from_u8`] rejects every unknown byte as
/// [`DecodeError::BadStatus`], so no in-memory `Status` exists outside this
/// list.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Status {
    /// Agent is healthy and making progress.
    Ok = 0,
    /// Agent is still making progress but reports elevated trouble, such as
    /// retrying or being throttled.
    Degraded = 1,
    /// Agent is about to die. The `varta-client` panic hook emits this
    /// immediately before unwinding. It is also the only status that
    /// [`Frame::decode`] accepts together with [`NONCE_TERMINAL`].
    Critical = 2,
    /// Agent appears stuck. Produced by `varta-watch` when no beat has
    /// arrived within the configured threshold; [`Frame::decode`] refuses it
    /// on the wire with [`DecodeError::StallOnWire`].
    Stall = 3,
}
102
103impl Status {
104    /// Decode a status byte from the wire format. Returns
105    /// [`DecodeError::BadStatus`] carrying the offending byte if the value is
106    /// not a known variant.
107    pub fn try_from_u8(byte: u8) -> Result<Self, DecodeError> {
108        match byte {
109            0 => Ok(Status::Ok),
110            1 => Ok(Status::Degraded),
111            2 => Ok(Status::Critical),
112            3 => Ok(Status::Stall),
113            other => Err(DecodeError::BadStatus(other)),
114        }
115    }
116}
117
118/// On-wire health frame — exactly 32 bytes, 8-byte aligned, little-endian
119/// integer fields. The struct is `repr(C)` so its layout is ABI-stable across
120/// compilations and trivially verifiable by inspection.
121///
122/// Construct frames directly via the public fields, then call
123/// [`Frame::encode`] to write to a socket buffer or [`Frame::decode`] to read
124/// one. There is no `Default`; agents always supply a real `pid`, `nonce` and
125/// timestamp.
126#[non_exhaustive]
127#[repr(C, align(8))]
128#[derive(Clone, Copy, Debug, Eq, PartialEq)]
129pub struct Frame {
130    /// Magic prefix, always equal to [`MAGIC`].
131    pub magic: [u8; 2],
132    /// Protocol version, always equal to [`VERSION`] on emit.
133    pub version: u8,
134    /// Health status reported by the agent. Encoded on the wire as a
135    /// single byte at offset 3 ([`Status`] discriminants are `#[repr(u8)]`).
136    pub status: Status,
137    /// OS process id of the emitting agent.
138    pub pid: u32,
139    /// Monotonic timestamp chosen by the emitter (typically nanoseconds since
140    /// some agent-local epoch). Observers do not interpret it; they only
141    /// compare consecutive timestamps for the same pid.
142    pub timestamp: u64,
143    /// Strictly increasing counter, starting at 1 on the first beat after
144    /// `Varta::connect`. The panic hook pins this to [`NONCE_TERMINAL`] to
145    /// mark a final critical frame.
146    ///
147    /// Regular beats wrap at `NONCE_TERMINAL - 1 → 0` on exhaustion, so the
148    /// regular-beat nonce stream **structurally never collides** with
149    /// [`NONCE_TERMINAL`]. A wire frame with `nonce == NONCE_TERMINAL` is, by
150    /// construction, a panic frame (and [`Frame::decode`] enforces it must
151    /// also carry [`Status::Critical`]).
152    pub nonce: u64,
153    /// Free-form 4-byte payload — application-defined health context (queue
154    /// depth, error code, etc.). Carried opaquely by the protocol. Shrunk
155    /// from `u64` to `u32` in VLP v0.2 to fit the CRC-32C trailer in 32
156    /// bytes; the on-wire CRC occupies bytes 28..32 and is not surfaced as
157    /// a struct field — see [`Frame::encode`] / [`Frame::decode`].
158    pub payload: u32,
159}
160
161const _: () = assert!(core::mem::size_of::<Frame>() == 32);
162const _: () = assert!(core::mem::align_of::<Frame>() == 8);
163const _: () = assert!(core::mem::offset_of!(Frame, magic) == 0);
164const _: () = assert!(core::mem::offset_of!(Frame, version) == 2);
165const _: () = assert!(core::mem::offset_of!(Frame, status) == 3);
166const _: () = assert!(core::mem::offset_of!(Frame, pid) == 4);
167const _: () = assert!(core::mem::offset_of!(Frame, timestamp) == 8);
168const _: () = assert!(core::mem::offset_of!(Frame, nonce) == 16);
169const _: () = assert!(core::mem::offset_of!(Frame, payload) == 24);
170
171impl Frame {
172    /// Construct a new frame with the canonical [`MAGIC`] prefix and
173    /// [`VERSION`] byte already populated. All other fields are
174    /// caller-supplied.
175    pub const fn new(status: Status, pid: u32, timestamp: u64, nonce: u64, payload: u32) -> Frame {
176        Frame {
177            magic: MAGIC,
178            version: VERSION,
179            status,
180            pid,
181            timestamp,
182            nonce,
183            payload,
184        }
185    }
186
187    /// Serialise this frame into a 32-byte buffer in canonical
188    /// little-endian layout. The output buffer is overwritten in place; this
189    /// method allocates nothing.
190    ///
191    /// Bytes 28..32 are stamped with a CRC-32C computed over bytes 0..28 —
192    /// see [`crate::crc32c`]. The CRC is a wire-format artifact, not a
193    /// struct field; callers must never mutate the buffer between `encode`
194    /// and the on-wire write or the receiver will reject the frame as
195    /// [`DecodeError::BadCrc`].
196    pub fn encode(&self, out: &mut [u8; 32]) {
197        out[0..2].copy_from_slice(&self.magic);
198        out[2] = self.version;
199        out[3] = self.status as u8;
200        out[4..8].copy_from_slice(&self.pid.to_le_bytes());
201        out[8..16].copy_from_slice(&self.timestamp.to_le_bytes());
202        out[16..24].copy_from_slice(&self.nonce.to_le_bytes());
203        out[24..28].copy_from_slice(&self.payload.to_le_bytes());
204        let crc = crc32c::compute(&out[0..28]);
205        out[28..32].copy_from_slice(&crc.to_le_bytes());
206    }
207
208    /// Decode a 32-byte buffer back into a [`Frame`], validating magic,
209    /// version, CRC, status, and field ranges in that order. Returns
210    /// [`DecodeError`] on the first failed check.
211    ///
212    /// Order rationale: `magic` + `version` come first so random bytes
213    /// from a wrong-protocol sender surface as
214    /// [`DecodeError::BadMagic`] / [`DecodeError::BadVersion`] (the
215    /// "this isn't even VLP" diagnostic). The CRC then gates every
216    /// field-range check — a single-bit-flipped status byte must surface
217    /// as [`DecodeError::BadCrc`], not as a valid frame with the wrong
218    /// meaning.
219    ///
220    /// Field-range rules enforced after the CRC passes:
221    /// * `status == Status::Stall` is rejected — `Stall` is observer-synthesized
222    ///   by `varta-watch` when a pid goes silent past its threshold; no
223    ///   legitimate agent emits it on the wire. Accepting a spoofed `Stall`
224    ///   frame would let a hostile sender pollute observer telemetry from
225    ///   any pid.
226    /// * `pid ∈ {0, 1}` is rejected — pid 0 is the kernel/scheduler and
227    ///   pid 1 is init/systemd; no legitimate agent runs at either, and
228    ///   accepting them lets a hostile sender spoof "init has stalled" to
229    ///   the recovery path.
230    /// * `timestamp == u64::MAX` is rejected — `varta_client::Varta::beat`
231    ///   saturates at this value with `.min(u64::MAX as u128) as u64`, and
232    ///   reaching it through real elapsed time (~584 years) is impossible.
233    ///   The sentinel is reserved.
234    ///
235    ///   *Asymmetry note*: a hypothetical agent whose monotonic clock
236    ///   saturates still observes `BeatOutcome::Sent` from `send(2)` (the
237    ///   kernel sees a well-formed 32-byte datagram), while the observer
238    ///   drops the frame as `DecodeError::BadTimestamp`. The divergence is
239    ///   physically unreachable on a single `Varta::connect` handle and is
240    ///   documented for completeness only.
241    /// * `nonce == NONCE_TERMINAL` is allowed only when paired with
242    ///   `Status::Critical`; the sentinel is the panic-hook's terminal
243    ///   marker and is never emitted on the regular beat path.
244    pub fn decode(bytes: &[u8; 32]) -> Result<Frame, DecodeError> {
245        let magic = [bytes[0], bytes[1]];
246        if magic != MAGIC {
247            return Err(DecodeError::BadMagic);
248        }
249        let version = bytes[2];
250        if version != VERSION {
251            return Err(DecodeError::BadVersion);
252        }
253
254        // CRC trailer at bytes 28..32 covers bytes 0..28. Verified after
255        // magic/version (so wrong-protocol bytes surface as BadMagic, not
256        // BadCrc) and before any field-range check (so corruption cannot
257        // produce a "well-formed" frame with the wrong meaning).
258        let stored_crc = u32::from_le_bytes([bytes[28], bytes[29], bytes[30], bytes[31]]);
259        let computed_crc = crc32c::compute(&bytes[0..28]);
260        if stored_crc != computed_crc {
261            return Err(DecodeError::BadCrc {
262                expected: computed_crc,
263                actual: stored_crc,
264            });
265        }
266
267        let status = Status::try_from_u8(bytes[3])?;
268        if status == Status::Stall {
269            return Err(DecodeError::StallOnWire);
270        }
271
272        // Each integer field is decoded via explicit array indexing — the
273        // compiler statically proves every index is in-bounds against the
274        // `&[u8; 32]` reference, so there are no runtime panics.
275        let pid = u32::from_le_bytes([bytes[4], bytes[5], bytes[6], bytes[7]]);
276        let timestamp = u64::from_le_bytes([
277            bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15],
278        ]);
279        let nonce = u64::from_le_bytes([
280            bytes[16], bytes[17], bytes[18], bytes[19], bytes[20], bytes[21], bytes[22], bytes[23],
281        ]);
282        let payload = u32::from_le_bytes([bytes[24], bytes[25], bytes[26], bytes[27]]);
283
284        if pid == 0 || pid == 1 {
285            return Err(DecodeError::BadPid(pid));
286        }
287        if timestamp == u64::MAX {
288            return Err(DecodeError::BadTimestamp(timestamp));
289        }
290        if nonce == NONCE_TERMINAL && status != Status::Critical {
291            return Err(DecodeError::BadNonce { nonce, status });
292        }
293
294        Ok(Frame {
295            magic,
296            version,
297            status,
298            pid,
299            timestamp,
300            nonce,
301            payload,
302        })
303    }
304}
305
306/// Error returned by [`Frame::decode`] and [`Status::try_from_u8`].
307///
308/// The variants form an exhaustive list of validation failures the protocol
309/// can detect statically; everything else (timestamp drift, nonce regression)
310/// is policy enforced higher in the stack.
311///
312/// This enum is exhaustive. Adding a variant is a workspace-wide compile-error
313/// change that requires updating every match site explicitly.
314#[derive(Clone, Copy, Debug, Eq, PartialEq)]
315pub enum DecodeError {
316    /// First two bytes did not equal [`MAGIC`].
317    BadMagic,
318    /// Version byte did not equal [`VERSION`].
319    BadVersion,
320    /// CRC-32C trailer at bytes 28..32 did not match the value computed
321    /// over bytes 0..28. Indicates wire corruption (cosmic ray / NIC
322    /// firmware / non-ECC RAM bit flip) on the UDS transport, or
323    /// in-process memory corruption between
324    /// [`crate::crypto::open`](crypto::open) and `Frame::decode` on the
325    /// secure-UDP transport. AEAD tag failures stay in the transport
326    /// layer (`crypto::AuthError`) and never surface as `BadCrc`.
327    BadCrc {
328        /// CRC-32C recomputed over bytes 0..28 of the received frame.
329        expected: u32,
330        /// CRC-32C value carried in bytes 28..32 of the received frame.
331        actual: u32,
332    },
333    /// Status byte did not match any known [`Status`] variant. The inner
334    /// value is the offending byte, surfaced for observer-side diagnostics.
335    BadStatus(u8),
336    /// Observer-only status `Status::Stall` observed on the wire. `Stall`
337    /// is synthesized by `varta-watch` when a pid goes silent past its
338    /// threshold; agents emit only `Ok`, `Degraded`, or `Critical`. A
339    /// spoofed `Stall` frame would inject false liveness telemetry from
340    /// any pid, so the decoder rejects it at the single chokepoint.
341    StallOnWire,
342    /// Reserved pid: `0` (kernel/scheduler) or `1` (init/systemd). No
343    /// legitimate agent runs at either pid; rejecting closes the "spoof
344    /// init has stalled" recovery-trigger attack on UDP listeners.
345    BadPid(u32),
346    /// Reserved timestamp sentinel `u64::MAX` — the saturation value from
347    /// `varta_client::Varta::beat`'s `.min(u64::MAX as u128) as u64`.
348    /// Reaching it through real elapsed nanoseconds is not physically
349    /// possible.
350    BadTimestamp(u64),
351    /// Protocol invariant violation: `nonce == NONCE_TERMINAL` is reserved
352    /// for the panic hook's terminal frame and MUST be paired with
353    /// [`Status::Critical`]. Carries the violating `status` for diagnostics.
354    BadNonce {
355        /// The terminal-sentinel nonce value observed on the wire.
356        nonce: u64,
357        /// The status byte that was paired with the sentinel nonce.
358        status: Status,
359    },
360}
361
362impl core::fmt::Display for DecodeError {
363    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
364        match self {
365            DecodeError::BadMagic => f.write_str("varta-vlp: bad magic prefix"),
366            DecodeError::BadVersion => f.write_str("varta-vlp: bad version byte"),
367            DecodeError::BadCrc { expected, actual } => {
368                write!(
369                    f,
370                    "varta-vlp: bad CRC-32C trailer (expected {expected:#010x}, actual {actual:#010x})"
371                )
372            }
373            DecodeError::BadStatus(byte) => {
374                write!(f, "varta-vlp: bad status byte {byte:#04x}")
375            }
376            DecodeError::StallOnWire => {
377                f.write_str("varta-vlp: Status::Stall is observer-only and forbidden on the wire")
378            }
379            DecodeError::BadPid(pid) => {
380                write!(f, "varta-vlp: reserved pid {pid}")
381            }
382            DecodeError::BadTimestamp(ts) => {
383                write!(f, "varta-vlp: reserved timestamp sentinel {ts:#x}")
384            }
385            DecodeError::BadNonce { nonce, status } => {
386                write!(
387                    f,
388                    "varta-vlp: terminal nonce {nonce:#x} requires Status::Critical, got {status:?}"
389                )
390            }
391        }
392    }
393}
394
395impl core::error::Error for DecodeError {}