objects/fault_inject.rs
1// SPDX-License-Identifier: Apache-2.0
2//! Deterministic fault-injection points for crash-recovery tests.
3//!
4//! W2b shipped the rollback machinery — atomic mapping persistence,
5//! mirror Drop guard, HEAD/index restore on failure — but until we
6//! actually crash the process between the load-bearing writes, the
7//! rollback paths only have unit-test coverage of the *helpers*, not
8//! of the *recovery contract itself*. The integration story
9//! ("crashing here doesn't corrupt the bridge mapping") was unverified.
10//!
11//! This module exposes a single `maybe_panic_at(name)` checkpoint
12//! that production code threads at the points where a crash would
13//! exercise a recovery path. Tests opt in by setting the
14//! `HEDDLE_FAULT_INJECT` environment variable to a comma-separated
15//! list of checkpoint names — e.g.
16//! `HEDDLE_FAULT_INJECT=mapping_after_tmp_before_commit` — and the
17//! next process to hit that checkpoint panics with a stable message.
18//!
19//! The next CLI invocation (a separate process, no inherited env)
20//! must recover cleanly. That's the contract under test.
21//!
22//! ## Why an env var instead of a build-time `#[cfg(test)]` gate
23//!
24//! The crash points sit in `objects` and `cli` paths that get spawned
25//! as separate child processes during integration tests. A child
26//! process can't see the parent test's `cfg(test)` flag, but it does
27//! inherit env vars by default. An env var lets the parent test set
28//! the crash point, spawn the child, observe the child crash, then
29//! spawn a fresh child (without the env var) and verify recovery.
30//!
31//! ## Performance
32//!
33//! `maybe_panic_at` is a single env lookup + string split + linear
34//! search. The env var is read once on first call and cached. With no
35//! `HEDDLE_FAULT_INJECT` set (the production default), the cached
36//! `None` short-circuits in well under a microsecond.
37
38use std::sync::OnceLock;
39
/// Process-wide cache of the parsed `HEDDLE_FAULT_INJECT` value.
///
/// `None` means the variable could not be read (unset, or not valid
/// Unicode); `Some(vec![])` means it was set but named no checkpoints
/// (empty string, or only commas/whitespace). Either way no
/// checkpoint is active.
static FAULT_POINTS: OnceLock<Option<Vec<String>>> = OnceLock::new();

/// Test builds only: the cache that `active_points` currently consults.
///
/// It starts out pointing at [`FAULT_POINTS`]. `reset_for_test` swaps
/// in a freshly leaked, empty `OnceLock`, which forces the next lookup
/// to re-read the environment. Non-test builds never compile this
/// indirection and read [`FAULT_POINTS`] directly.
#[cfg(test)]
static ACTIVE_CACHE: std::sync::RwLock<&'static OnceLock<Option<Vec<String>>>> =
    std::sync::RwLock::new(&FAULT_POINTS);

/// Splits a raw `HEDDLE_FAULT_INJECT` value into checkpoint names:
/// comma-separated, surrounding whitespace trimmed, empty entries dropped.
fn parse_fault_points(raw: &str) -> Vec<String> {
    raw.split(',')
        .map(str::trim)
        .filter(|entry| !entry.is_empty())
        .map(str::to_owned)
        .collect()
}

/// Returns the cache backing `active_points` (non-test builds: always
/// the single process-wide static).
#[cfg(not(test))]
fn current_cache() -> &'static OnceLock<Option<Vec<String>>> {
    &FAULT_POINTS
}

/// Returns the cache backing `active_points` (test builds: whichever
/// cache the most recent `reset_for_test` installed).
#[cfg(test)]
fn current_cache() -> &'static OnceLock<Option<Vec<String>>> {
    // The guard is never held across a panic, but tolerate poisoning
    // anyway: the slot only stores a shared reference, so a poisoned
    // value is still perfectly usable.
    let slot = ACTIVE_CACHE
        .read()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    *slot
}

/// Returns the parsed checkpoint list, reading `HEDDLE_FAULT_INJECT`
/// on the first call and serving the cached result afterwards.
fn active_points() -> &'static Option<Vec<String>> {
    current_cache().get_or_init(|| {
        std::env::var("HEDDLE_FAULT_INJECT")
            .ok()
            .map(|raw| parse_fault_points(&raw))
    })
}

/// Panic if `name` is listed in `HEDDLE_FAULT_INJECT`.
///
/// Production callers thread this at points where a crash would
/// exercise a recovery path. Tests set the env var on a child
/// process to deterministically trigger the crash, then verify the
/// next clean process recovers.
///
/// # Panics
///
/// Panics when `name` exactly matches one of the cached checkpoint
/// names. The message includes the checkpoint name so test logs can
/// distinguish an intentional fault from a real bug.
pub fn maybe_panic_at(name: &str) {
    let armed = active_points()
        .as_deref()
        .is_some_and(|points| points.iter().any(|active| active == name));
    if armed {
        panic!("HEDDLE_FAULT_INJECT: crashing at checkpoint `{name}` (intentional)");
    }
}

/// Test-only helper: discard the cached env-var read so the next
/// `maybe_panic_at` call re-parses `HEDDLE_FAULT_INJECT`.
///
/// The previous cache is intentionally leaked rather than freed:
/// `active_points` hands out `'static` references into it, so those
/// must stay valid. The leak is one small allocation per reset, which
/// is acceptable for a test binary.
///
/// The environment is process-global, so tests that change the
/// variable and then call this helper must not run concurrently with
/// other tests that depend on its value.
#[cfg(test)]
pub fn reset_for_test() {
    let fresh: &'static OnceLock<Option<Vec<String>>> = Box::leak(Box::new(OnceLock::new()));
    let mut slot = ACTIVE_CACHE
        .write()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    *slot = fresh;
}
93
#[cfg(test)]
mod tests {
    use super::*;

    /// With `HEDDLE_FAULT_INJECT` unset, a checkpoint must return
    /// normally instead of panicking.
    #[test]
    fn no_env_var_is_a_silent_noop() {
        // Precondition: the check below is only meaningful when the
        // variable is unset, which is the production default. If the
        // test runner exports it globally the behaviour legitimately
        // differs, and surfacing that is the runner's job, so the
        // test bails out rather than failing.
        let var_is_set = std::env::var("HEDDLE_FAULT_INJECT").is_ok();
        if !var_is_set {
            // No checkpoints are active, so this must not panic.
            maybe_panic_at("anything");
        }
    }

    // History: a sibling test, `env_var_with_matching_name_panics`,
    // used to live in this module and was flaky under parallel test
    // execution. The cause was structural: `FAULT_POINTS` is a
    // `OnceLock`, so the first test to call `active_points()` decides
    // the cached value for the whole process. When
    // `no_env_var_is_a_silent_noop` got there first it cached `None`,
    // and the panic test could no longer arm its checkpoint.
    //
    // That test now lives in its own integration-test binary
    // (`tests/fault_inject_panic.rs`). Each integration-test file runs
    // as a separate process with its own `OnceLock` state, so it
    // always starts from an empty cache.
}