Skip to main content

objects/
fault_inject.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Deterministic fault-injection points for crash-recovery tests.
3//!
4//! W2b shipped the rollback machinery — atomic mapping persistence,
5//! mirror Drop guard, HEAD/index restore on failure — but until we
6//! actually crash the process between the load-bearing writes, the
7//! rollback paths only have unit-test coverage of the *helpers*, not
8//! of the *recovery contract itself*. The integration story
9//! ("crashing here doesn't corrupt the bridge mapping") was unverified.
10//!
11//! This module exposes a single `maybe_panic_at(name)` checkpoint
12//! that production code threads at the points where a crash would
13//! exercise a recovery path. Tests opt in by setting the
14//! `HEDDLE_FAULT_INJECT` environment variable to a comma-separated
15//! list of checkpoint names — e.g.
16//! `HEDDLE_FAULT_INJECT=mapping_after_tmp_before_commit` — and the
17//! next process to hit that checkpoint panics with a stable message.
18//!
19//! The next CLI invocation (a separate process, no inherited env)
20//! must recover cleanly. That's the contract under test.
21//!
22//! ## Why an env var instead of a build-time `#[cfg(test)]` gate
23//!
24//! The crash points sit in `objects` and `cli` paths that get spawned
25//! as separate child processes during integration tests. A child
26//! process can't see the parent test's `cfg(test)` flag, but it does
27//! inherit env vars by default. An env var lets the parent test set
28//! the crash point, spawn the child, observe the child crash, then
29//! spawn a fresh child (without the env var) and verify recovery.
30//!
31//! ## Performance
32//!
33//! `maybe_panic_at` is a single env lookup + string split + linear
34//! search. The env var is read once on first call and cached. With no
35//! `HEDDLE_FAULT_INJECT` set (the production default), the cached
36//! `None` short-circuits in well under a microsecond.
37
38use std::sync::OnceLock;
39
/// Process-wide cache of the parsed `HEDDLE_FAULT_INJECT` value.
///
/// * `None`: the variable could not be read as a Unicode string when it
///   was first consulted (in practice: it was not set).
/// * `Some(names)`: the variable was set; `names` holds every trimmed,
///   non-empty entry of the comma-separated list. The list is empty when
///   the value was empty or consisted only of commas and whitespace, which
///   means no checkpoint is active.
static FAULT_POINTS: OnceLock<Option<Vec<String>>> = OnceLock::new();

/// Return the cached checkpoint list, parsing the environment on first use.
///
/// The environment is consulted exactly once per process; later changes to
/// `HEDDLE_FAULT_INJECT` are not observed.
fn active_points() -> &'static Option<Vec<String>> {
    FAULT_POINTS.get_or_init(|| {
        // An unreadable/unset variable short-circuits to `None`.
        let raw = std::env::var("HEDDLE_FAULT_INJECT").ok()?;

        let mut names = Vec::new();
        for piece in raw.split(',') {
            let piece = piece.trim();
            // Tolerate stray commas and padding: `"a, ,b,"` yields `a`, `b`.
            if !piece.is_empty() {
                names.push(piece.to_owned());
            }
        }
        Some(names)
    })
}
56
57/// Crash the current process if `name` is listed in `HEDDLE_FAULT_INJECT`.
58///
59/// Production callers thread this at points where a crash would
60/// exercise a recovery path. Tests set the env var on a child
61/// process to deterministically trigger the crash, then verify the
62/// next clean process recovers.
63///
64/// The panic message includes the checkpoint name so test logs can
65/// distinguish an intentional fault from a real bug.
66pub fn maybe_panic_at(name: &str) {
67    if let Some(points) = active_points().as_ref()
68        && points.iter().any(|active| active == name)
69    {
70        panic!("HEDDLE_FAULT_INJECT: crashing at checkpoint `{name}` (intentional)");
71    }
72}
73
/// Test-only compatibility shim. Intentionally does nothing.
///
/// The parsed `HEDDLE_FAULT_INJECT` value lives in the `FAULT_POINTS`
/// `OnceLock` static. A `OnceLock` cannot be cleared through a shared
/// reference, so once the variable has been read a process can neither
/// re-arm nor disarm its checkpoints, and this function cannot change that.
///
/// An earlier version of this helper allocated and leaked a fresh
/// `OnceLock` on every call and stored the pointer in a private static that
/// nothing ever read. That had no effect on `FAULT_POINTS` but made it look
/// as though a reset had happened. The dead allocation has been removed; the
/// function is kept only so existing call sites continue to compile.
///
/// Tests that need a different `HEDDLE_FAULT_INJECT` value should spawn a
/// child process, or live in their own integration-test binary, so that
/// they start with an unpopulated cache.
#[cfg(test)]
pub fn reset_for_test() {
    // Deliberately empty: the cache is write-once for the life of the
    // process. See the doc comment for how to test with a fresh cache.
}
93
#[cfg(test)]
mod tests {
    use super::maybe_panic_at;

    /// With `HEDDLE_FAULT_INJECT` absent, every checkpoint must be inert.
    #[test]
    fn no_env_var_is_a_silent_noop() {
        // This check is only meaningful when the variable is unset, which
        // is the production default. If a test runner exports it globally
        // we skip the assertion rather than fail; surfacing that
        // configuration is the runner's responsibility.
        let var_is_unset = std::env::var("HEDDLE_FAULT_INJECT").is_err();
        if var_is_unset {
            // Must return normally: no checkpoint can be active.
            maybe_panic_at("anything");
        }
    }

    // NOTE: a sibling test, `env_var_with_matching_name_panics`, used to
    // live here and was flaky in parallel runs. The flake was structural:
    // `FAULT_POINTS` is a `OnceLock`, so whichever test reaches
    // `active_points()` first decides the cached value for the whole
    // process. If `no_env_var_is_a_silent_noop` ran first it cached `None`,
    // and the panic test could never re-arm the checkpoint.
    //
    // The fix was to move the panic test into its own integration-test
    // binary (`tests/fault_inject_panic.rs`). Each integration-test file
    // runs as its own process with its own `OnceLock` state, so that test
    // always starts from a fresh cache.
}