objects/fault_inject.rs
1// SPDX-License-Identifier: Apache-2.0
2//! Deterministic fault-injection points for crash-recovery tests.
3//!
4//! W2b shipped the rollback machinery — atomic mapping persistence,
5//! mirror Drop guard, HEAD/index restore on failure — but until we
6//! actually crash the process between the load-bearing writes, the
7//! rollback paths only have unit-test coverage of the *helpers*, not
8//! of the *recovery contract itself*. The integration story
9//! ("crashing here doesn't corrupt the bridge mapping") was unverified.
10//!
//! This module exposes two checkpoints that production code threads at
//! the points where a failure would exercise a recovery path:
//! `maybe_panic_at(name)` crashes the process and `maybe_fail_at(name)`
//! returns an `io::Error`. Tests opt in by setting the
//! `HEDDLE_FAULT_INJECT` environment variable to a comma-separated
//! list of checkpoint names — e.g.
//! `HEDDLE_FAULT_INJECT=mapping_after_tmp_before_commit` — and the
//! next process to hit a listed checkpoint panics (or fails) with a
//! stable message.
18//!
19//! The next CLI invocation (a separate process, no inherited env)
20//! must recover cleanly. That's the contract under test.
21//!
22//! ## Why an env var instead of a build-time `#[cfg(test)]` gate
23//!
24//! The crash points sit in `objects` and `cli` paths that get spawned
25//! as separate child processes during integration tests. A child
26//! process can't see the parent test's `cfg(test)` flag, but it does
27//! inherit env vars by default. An env var lets the parent test set
28//! the crash point, spawn the child, observe the child crash, then
29//! spawn a fresh child (without the env var) and verify recovery.
30//!
31//! ## Performance
32//!
33//! `maybe_panic_at` is a single env lookup + string split + linear
34//! search. The env var is read once on first call and cached. With no
35//! `HEDDLE_FAULT_INJECT` set (the production default), the cached
36//! `None` short-circuits in well under a microsecond.
37
38use std::sync::OnceLock;
39
/// Process-wide cache of the parsed `HEDDLE_FAULT_INJECT` value.
///
/// Holds `None` when `std::env::var` could not read the variable and
/// `Some(names)` otherwise; `names` is empty when the value contained
/// no non-blank entries, which leaves every checkpoint inactive.
static FAULT_POINTS: OnceLock<Option<Vec<String>>> = OnceLock::new();

/// Returns the cached checkpoint list, parsing `HEDDLE_FAULT_INJECT`
/// the first time it is called.
///
/// The raw value is split on commas; each entry is trimmed and blank
/// entries are dropped. Later calls return the same cached value even
/// if the environment variable has changed since.
fn active_points() -> &'static Option<Vec<String>> {
    FAULT_POINTS.get_or_init(|| match std::env::var("HEDDLE_FAULT_INJECT") {
        Ok(raw) => {
            let mut names = Vec::new();
            for entry in raw.split(',') {
                let entry = entry.trim();
                if !entry.is_empty() {
                    names.push(entry.to_owned());
                }
            }
            Some(names)
        }
        Err(_) => None,
    })
}
56
57/// Crash the current process if `name` is listed in `HEDDLE_FAULT_INJECT`.
58///
59/// Production callers thread this at points where a crash would
60/// exercise a recovery path. Tests set the env var on a child
61/// process to deterministically trigger the crash, then verify the
62/// next clean process recovers.
63///
64/// The panic message includes the checkpoint name so test logs can
65/// distinguish an intentional fault from a real bug.
66pub fn maybe_panic_at(name: &str) {
67 if let Some(points) = active_points().as_ref()
68 && points.iter().any(|active| active == name)
69 {
70 panic!("HEDDLE_FAULT_INJECT: crashing at checkpoint `{name}` (intentional)");
71 }
72}
73
74/// Like [`maybe_panic_at`], but returns an `io::Error` instead of
75/// panicking — for exercising *in-process* error-recovery paths (a
76/// graceful failure that drives a rollback) rather than crash recovery.
77///
78/// Production callers thread this where a returned error must unwind a
79/// partially-applied operation; tests opt in by listing the checkpoint
80/// name in `HEDDLE_FAULT_INJECT` and assert the rollback left no
81/// partial state. With the env var unset the cached `None`
82/// short-circuits, exactly like [`maybe_panic_at`].
83pub fn maybe_fail_at(name: &str) -> std::io::Result<()> {
84 if let Some(points) = active_points().as_ref()
85 && points.iter().any(|active| active == name)
86 {
87 return Err(std::io::Error::other(format!(
88 "HEDDLE_FAULT_INJECT: failing at checkpoint `{name}` (intentional)"
89 )));
90 }
91 Ok(())
92}
93
/// Test-only placeholder kept so existing callers keep compiling.
/// Calling it does **not** clear the cached `HEDDLE_FAULT_INJECT`
/// parse.
///
/// The cache lives in the `FAULT_POINTS` static, a `OnceLock`, and a
/// `OnceLock` cannot be emptied through a shared reference, so a
/// running process cannot re-read the environment variable. Tests that
/// need to flip `HEDDLE_FAULT_INJECT` more than once should spawn a
/// child process per configuration instead of relying on this helper.
///
/// Not for production use — the one-time cache is what keeps the
/// production hot path cheap.
#[cfg(test)]
pub fn reset_for_test() {
    // Intentionally empty. The previous body allocated and leaked a
    // fresh `OnceLock` on every call and parked it in a function-local
    // slot that nothing ever read, so it cost memory without resetting
    // anything.
}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// With `HEDDLE_FAULT_INJECT` absent, a checkpoint returns normally.
    #[test]
    fn no_env_var_is_a_silent_noop() {
        // The assertion is only meaningful when the variable is unset,
        // which is the production default. If a test runner exports it
        // globally the check is skipped; surfacing that configuration
        // is the runner's responsibility.
        let var_is_unset = std::env::var("HEDDLE_FAULT_INJECT").is_err();
        if var_is_unset {
            // Must not panic: with no env var, no checkpoint is active.
            maybe_panic_at("anything");
        }
    }

    // NOTE: a sibling test, `env_var_with_matching_name_panics`, used
    // to live here and was flaky under parallel test runs. The cause
    // was structural: `FAULT_POINTS` is a `OnceLock`, so the first test
    // to call `active_points()` decides the cached value. When
    // `no_env_var_is_a_silent_noop` ran first it cached `None`, and
    // the panic test could never re-arm the checkpoint.
    //
    // The panic test now lives in its own integration-test binary
    // (`tests/fault_inject_panic.rs`). Each integration test file gets
    // its own process and its own OnceLock state, so that test always
    // starts from a fresh cache.
}