obj-core 1.1.0

Storage engine internals for the obj embedded document database (pager, WAL, B-tree, codec, catalog).
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
//! Deterministic, seed-controlled fault-injection harness.
//!
//! `FaultyFileHandle` is a test-only wrapper around [`FileHandle`] that
//! injects the kinds of failure modes real hardware exhibits — torn
//! writes, dropped fsyncs, short reads, bit-flips, sudden crashes — at
//! deterministic, seed-controlled points. The 10k crash-cycle test in
//! issue #18 drives this to exercise the WAL's recovery contract.
//!
//! Determinism is the load-bearing property: the same `FaultPlan` seed,
//! applied to the same sequence of operations, produces the same fault
//! sequence on every machine, every Rust toolchain, every CI run. The
//! PRNG is `rand_chacha::ChaCha8Rng`, chosen because its output is
//! specified at the algorithm level (not at the implementation level)
//! — so a future bump of `rand_chacha` cannot silently change the
//! seeds-to-faults mapping.
//!
//! # `unsafe` policy
//!
//! This module inherits `platform`'s `#![forbid(unsafe_code)]`. The
//! faults it injects are byte-level manipulations of the underlying
//! file via the existing [`FileHandle`] safe API; no syscalls of
//! `FaultyFileHandle`'s own.
//!
//! # Power-of-ten posture
//!
//! - **Rule 5.** Every fault probability is bounded in `[0.0, 1.0]`;
//!   out-of-range inputs are clamped at construction.
//! - **Rule 7.** No `unwrap`/`expect` in code paths exercised by the
//!   harness — see test code in modules that *use* the harness, where
//!   `expect` is conventional.
//! - **Rule 9.** `FaultyFileHandle` is a concrete type. No dynamic
//!   dispatch is introduced into production code by this module; the
//!   shared [`FileBackend`] trait is consumed by tests only.

use std::cell::RefCell;
use std::path::Path;

use rand::{Rng, RngCore, SeedableRng};
use rand_chacha::ChaCha8Rng;

use crate::error::{Error, Result};
use crate::platform::{FileBackend, FileHandle, SyncMode};

/// Panic message stamped onto deliberate-crash panics. The 10k cycle
/// test driver matches on this string at the [`catch_unwind`][cu]
/// boundary to distinguish injected crashes from genuine bugs.
///
/// The harness panics with a message of the form
/// `"{FAULT_CRASH_MARKER}: {kind}"`, where `kind` names the operation
/// that tripped the crash. Match on the marker as a substring of the
/// panic payload rather than comparing the whole payload for equality.
///
/// [cu]: std::panic::catch_unwind
pub const FAULT_CRASH_MARKER: &str = "obj-core::fault::deliberate-crash";

/// Describes which faults a [`FaultyFileHandle`] injects and how often.
///
/// Each `*_prob` field is an `f64` probability intended to lie in
/// `[0.0, 1.0]`. [`FaultPlan::new`] enforces that range by clamping
/// its arguments; because the fields are public, a value assigned
/// directly to a field is stored as-is.
#[derive(Debug, Clone, Copy)]
pub struct FaultPlan {
    /// Chance that a `write_all_at` call persists only a prefix of the
    /// buffer it was given while still reporting `Ok`.
    pub torn_write_prob: f64,
    /// Chance that a `sync_data` / `sync_all` call is skipped while
    /// still reporting `Ok` — the shape of an fsync whose data would
    /// be lost across a power failure.
    ///
    /// # Coverage limitation (issue #53)
    ///
    /// Skipping the syscall models the *dropped fsync* itself, but it
    /// CANNOT model the **data loss** that would follow. Bytes passed
    /// to an earlier [`FileBackend::write_all_at`] have already gone
    /// through the real [`FileHandle`] into the kernel page cache, and
    /// omitting the later fsync does not evict them. A subsequent read
    /// — from the *same process*, against the *same live kernel* —
    /// therefore still sees the unsynced bytes. The sequence "fsync
    /// dropped, then power loss discards the page cache" cannot be
    /// reached from an in-process harness; it takes a real reboot, a
    /// forced cache drop, or a fault injector on a separate machine to
    /// evict cached-but-unsynced pages.
    ///
    /// What the harness does exercise is the *control flow* of the
    /// dropped-fsync path (the pager issues no real sync and recovery
    /// must still cope); what it does NOT exercise is the *durability*
    /// consequence. Power-loss durability is covered at a coarser
    /// grain by the crash-cycle process-kill model in
    /// `obj-core/tests/crash_cycles.rs`, which treats an injected
    /// panic as a crash between two consistent commit points and
    /// asserts the reopen invariant. The note on
    /// [`FaultyFileHandle::sync_data`] and the
    /// `dropped_fsync_on_checkpointed_main_file_recovers_via_wal_salt_match`
    /// test spell out exactly what is and is not verified.
    pub dropped_fsync_prob: f64,
    /// Chance that a `read_exact_at` call fails as a short read. A
    /// real file system would report this as a [`std::io::Error`] of
    /// kind `UnexpectedEof`; the harness reports `Error::Io` carrying
    /// that same kind.
    pub short_read_prob: f64,
    /// Chance that one bit of the buffer being written is inverted
    /// before the bytes reach the disk.
    pub bit_flip_prob: f64,
    /// When non-zero, the harness deliberately panics on the Nth
    /// `write_all_at` / `sync_data` / `sync_all` operation. A value of
    /// `0` turns the trigger off.
    pub crash_after_ops: u64,
    /// Seed of the deterministic PRNG. Every fault decision the plan
    /// makes is uniquely determined by this seed.
    pub seed: u64,
}

impl Default for FaultPlan {
    /// The default plan is the fault-free plan with seed `0`.
    fn default() -> Self {
        FaultPlan::noop(0)
    }
}

impl FaultPlan {
    /// Build a plan with every fault switched off. Handy as a control:
    /// a harness driven by this plan should be indistinguishable from
    /// the file it wraps.
    #[must_use]
    pub const fn noop(seed: u64) -> Self {
        Self {
            seed,
            crash_after_ops: 0,
            bit_flip_prob: 0.0,
            short_read_prob: 0.0,
            dropped_fsync_prob: 0.0,
            torn_write_prob: 0.0,
        }
    }

    /// Squash an arbitrary `f64` into `[0.0, 1.0]`; NaN maps to `0.0`.
    fn clamp01(v: f64) -> f64 {
        match v {
            not_a_number if not_a_number.is_nan() => 0.0,
            ordinary => ordinary.clamp(0.0, 1.0),
        }
    }

    /// Build a plan from caller-supplied settings. Each probability is
    /// clamped into `[0.0, 1.0]`; `seed` and `crash_after_ops` are
    /// stored unchanged.
    #[must_use]
    pub fn new(
        seed: u64,
        torn_write_prob: f64,
        dropped_fsync_prob: f64,
        short_read_prob: f64,
        bit_flip_prob: f64,
        crash_after_ops: u64,
    ) -> Self {
        // Start from the fault-free plan and switch on what was asked for.
        let mut plan = Self::noop(seed);
        plan.torn_write_prob = Self::clamp01(torn_write_prob);
        plan.dropped_fsync_prob = Self::clamp01(dropped_fsync_prob);
        plan.short_read_prob = Self::clamp01(short_read_prob);
        plan.bit_flip_prob = Self::clamp01(bit_flip_prob);
        plan.crash_after_ops = crash_after_ops;
        plan
    }
}

/// Fault-injecting wrapper around [`FileHandle`].
///
/// The PRNG and operation counter live behind a `RefCell` so the
/// outer API stays `&self` (matching [`FileHandle`]); multi-threaded
/// access is not in scope at M3.
pub struct FaultyFileHandle {
    /// The real file; every operation that is not faulted is forwarded
    /// to it.
    inner: FileHandle,
    /// Fault configuration captured when the wrapper was constructed.
    plan: FaultPlan,
    /// Mutable bookkeeping (PRNG and operation counter), borrowed
    /// mutably only for the duration of each draw or count.
    state: RefCell<FaultState>,
}

/// Mutable state a [`FaultyFileHandle`] keeps behind its `RefCell`.
struct FaultState {
    /// Deterministic PRNG that drives every fault decision; seeded
    /// from `FaultPlan::seed` when the handle is constructed.
    rng: ChaCha8Rng,
    /// Count of `write_all_at` / `sync_data` / `sync_all` calls seen so
    /// far; compared against `FaultPlan::crash_after_ops`.
    op_count: u64,
}

impl std::fmt::Debug for FaultyFileHandle {
    /// Prints the wrapped handle and the plan. The PRNG / counter
    /// state is deliberately left out, which the trailing `..` in the
    /// output signals.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("FaultyFileHandle");
        builder.field("inner", &self.inner);
        builder.field("plan", &self.plan);
        builder.finish_non_exhaustive()
    }
}

impl FaultyFileHandle {
    /// Wrap an existing [`FileHandle`] with the given `plan`. The
    /// PRNG is reseeded from `plan.seed`; the operation counter
    /// starts at zero.
    #[must_use]
    pub fn new(inner: FileHandle, plan: FaultPlan) -> Self {
        let rng = ChaCha8Rng::seed_from_u64(plan.seed);
        Self {
            inner,
            plan,
            state: RefCell::new(FaultState { rng, op_count: 0 }),
        }
    }

    /// Convenience: open a file at `path` and wrap it.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Io`] on syscall failure.
    pub fn open_or_create<P: AsRef<Path>>(path: P, plan: FaultPlan) -> Result<Self> {
        let inner = FileHandle::open_or_create(path)?;
        Ok(Self::new(inner, plan))
    }

    /// Borrow the wrapped [`FileHandle`]. Useful for tests that need
    /// to check the on-disk state past the harness.
    #[must_use]
    pub fn inner(&self) -> &FileHandle {
        &self.inner
    }

    /// Advance the operation counter and, if it reaches
    /// `plan.crash_after_ops`, panic with [`FAULT_CRASH_MARKER`].
    ///
    /// `kind` names the calling operation and is appended to the panic
    /// message after the marker. A `crash_after_ops` of `0` disables
    /// the trigger; the counter is still advanced.
    ///
    /// # Panics
    ///
    /// Panics deliberately when the counter equals a non-zero
    /// `plan.crash_after_ops`.
    fn maybe_crash(&self, kind: &str) {
        let crash_at = self.plan.crash_after_ops;
        let mut state = self.state.borrow_mut();
        state.op_count = state.op_count.saturating_add(1);
        if crash_at != 0 && state.op_count == crash_at {
            // Release the borrow before panicking. `RefCell` has no
            // poisoning and unwinding would drop the guard anyway;
            // dropping it first guarantees that nothing running while
            // the panic is in flight (e.g. a panic hook) can find the
            // state still mutably borrowed.
            drop(state);
            panic!("{FAULT_CRASH_MARKER}: {kind}");
        }
    }

    /// Decide whether a fault with probability `prob` fires.
    ///
    /// Degenerate probabilities are resolved WITHOUT touching the
    /// PRNG: anything `<= 0.0` (and NaN) never fires, anything
    /// `>= 1.0` always fires. Only a probability strictly inside
    /// `(0.0, 1.0)` consumes one `f64` draw.
    fn roll(&self, prob: f64) -> bool {
        // `FaultPlan`'s fields are public, so a NaN can reach this
        // point without passing through `FaultPlan::new`'s clamp. NaN
        // fails both range comparisons below; without this guard it
        // would burn a PRNG draw and shift every later fault decision
        // relative to the equivalent `0.0` plan. Treat it exactly like
        // `0.0`, matching what the clamp would have produced.
        if prob.is_nan() || prob <= 0.0 {
            return false;
        }
        if prob >= 1.0 {
            return true;
        }
        let mut state = self.state.borrow_mut();
        state.rng.random::<f64>() < prob
    }

    /// Pick how many leading bytes of a `len`-byte buffer a torn write
    /// keeps. The result is always strictly less than `len` (so the
    /// write is genuinely torn) and may be `0`.
    ///
    /// For `len <= 1` the answer is `0` and no PRNG draw is consumed;
    /// otherwise exactly one `u64` is drawn.
    fn rand_split(&self, len: usize) -> usize {
        if len <= 1 {
            return 0;
        }
        let mut state = self.state.borrow_mut();
        let r: u64 = state.rng.next_u64();
        let len_u64 = u64::try_from(len).unwrap_or(u64::MAX);
        let kept = r % len_u64;
        // `kept < len`, so the conversion back only fails if `usize`
        // is narrower than the value; fall back to the largest valid
        // prefix rather than panicking.
        usize::try_from(kept).unwrap_or(len - 1)
    }
}

impl FileBackend for FaultyFileHandle {
    fn len(&self) -> Result<u64> {
        // Length queries are not fault-injected — they are a property
        // of the file system, not of write durability.
        self.inner.len()
    }

    fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()> {
        if self.roll(self.plan.short_read_prob) {
            let kind = std::io::ErrorKind::UnexpectedEof;
            return Err(Error::Io(std::io::Error::new(
                kind,
                "fault-injected short read",
            )));
        }
        self.inner.read_exact_at(buf, offset)
    }

    fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<()> {
        self.maybe_crash("write_all_at");
        // Choose torn write OR bit flip; not both. Order: torn check
        // first because torn writes are the more disruptive of the
        // two for the WAL.
        if self.roll(self.plan.torn_write_prob) {
            let kept = self.rand_split(buf.len());
            if kept == 0 {
                // Nothing actually written; surface success anyway —
                // the OS would have accepted the syscall and lost
                // the data.
                return Ok(());
            }
            return self.inner.write_all_at(&buf[..kept], offset);
        }
        if self.roll(self.plan.bit_flip_prob) && !buf.is_empty() {
            let mut buf_copy = buf.to_vec();
            // Pick a deterministic byte + bit index.
            let byte_idx: usize = {
                let mut state = self.state.borrow_mut();
                let r = state.rng.next_u64();
                let len_u64 = u64::try_from(buf_copy.len()).unwrap_or(u64::MAX);
                usize::try_from(r % len_u64).unwrap_or(0)
            };
            let bit_idx: u8 = {
                let mut state = self.state.borrow_mut();
                u8::try_from(state.rng.next_u32() & 0x7).unwrap_or(0)
            };
            buf_copy[byte_idx] ^= 1u8 << bit_idx;
            return self.inner.write_all_at(&buf_copy, offset);
        }
        self.inner.write_all_at(buf, offset)
    }

    fn set_len(&self, new_len: u64) -> Result<()> {
        self.inner.set_len(new_len)
    }

    fn sync_data(&self, mode: SyncMode) -> Result<()> {
        self.maybe_crash("sync_data");
        if self.roll(self.plan.dropped_fsync_prob) {
            // Pretend the fsync succeeded. NOTE (issue #53): the data
            // already reached the kernel page cache via `write_all_at`,
            // and this in-process harness cannot evict it — so a
            // same-process read still sees the "lost" bytes. The
            // durability *consequence* of a dropped fsync is therefore
            // NOT exercised here; only the control-flow (no real sync
            // issued) is. See `FaultPlan::dropped_fsync_prob` for the
            // full limitation and the crash-cycle process-kill model
            // (`tests/crash_cycles.rs`) that covers power-loss
            // durability at a coarser, commit-boundary granularity.
            return Ok(());
        }
        self.inner.sync_data(mode)
    }

    fn sync_all(&self) -> Result<()> {
        self.maybe_crash("sync_all");
        if self.roll(self.plan.dropped_fsync_prob) {
            return Ok(());
        }
        self.inner.sync_all()
    }
}

/// Namespace for helpers used by the `tests` submodule below. It sits
/// at module level to keep the path short, and is private so that no
/// external crate can come to rely on the panic-message format.
#[cfg(test)]
struct FaultBackendTestSupport;

#[cfg(test)]
impl FaultBackendTestSupport {
    /// Recover the human-readable message from a caught panic payload.
    ///
    /// Handles the two payload types seen here — an owned `String` and
    /// a `&'static str` — and falls back to a fixed placeholder for
    /// anything else.
    #[allow(clippy::borrowed_box)] // The std panic API hands us &Box<dyn Any+Send>; we don't own it.
    fn extract_panic_message(payload: &Box<dyn std::any::Any + Send>) -> String {
        payload
            .downcast_ref::<String>()
            .cloned()
            .or_else(|| {
                payload
                    .downcast_ref::<&'static str>()
                    .map(|literal| (*literal).to_string())
            })
            .unwrap_or_else(|| "<non-string panic payload>".to_string())
    }
}

#[cfg(test)]
mod tests {
    use super::{FaultBackendTestSupport, FaultPlan, FaultyFileHandle, FAULT_CRASH_MARKER};
    use crate::platform::{FileBackend, SyncMode};
    use tempfile::TempDir;

    /// Open (creating if necessary) `name` inside `dir`, wrapped in a
    /// fault harness driven by `plan`.
    fn make(dir: &TempDir, name: &str, plan: FaultPlan) -> FaultyFileHandle {
        let path = dir.path().join(name);
        FaultyFileHandle::open_or_create(&path, plan).expect("open faulty")
    }

    /// With every fault disabled the harness must behave exactly like
    /// the file it wraps.
    #[test]
    fn noop_plan_is_transparent() {
        let dir = TempDir::new().expect("tempdir");
        let h = make(&dir, "noop.bin", FaultPlan::noop(0));
        h.set_len(4096).expect("set_len");
        h.write_all_at(&[0xAAu8; 4096], 0).expect("write");
        let mut out = [0u8; 4096];
        h.read_exact_at(&mut out, 0).expect("read");
        // Check the whole page, not just its first byte: a transparent
        // harness must neither tear nor flip anything.
        assert!(
            out.iter().all(|&b| b == 0xAA),
            "noop plan must round-trip every byte unchanged",
        );
        h.sync_data(SyncMode::Full).expect("sync");
    }

    /// A torn write reports success but persists only a strict prefix
    /// of the buffer, leaving everything after the prefix untouched.
    #[test]
    fn torn_write_truncates_on_disk() {
        let dir = TempDir::new().expect("tempdir");
        let plan = FaultPlan::new(123, 1.0, 0.0, 0.0, 0.0, 0);
        let h = make(&dir, "torn.bin", plan);
        h.set_len(4096).expect("set_len");
        // Pre-fill with 0xFF so we can detect a partial write.
        let inner_path = dir.path().join("torn.bin");
        std::fs::write(&inner_path, vec![0xFFu8; 4096]).expect("prefill");
        let buf = vec![0x00u8; 4096];
        h.write_all_at(&buf, 0).expect("torn write returns Ok");
        let on_disk = std::fs::read(&inner_path).expect("readback");
        assert_eq!(on_disk.len(), 4096, "torn write must not resize the file");
        let zeros = on_disk.iter().take_while(|&&b| b == 0).count();
        assert!(
            zeros < 4096,
            "torn write must NOT write the whole buffer; got {zeros} zero bytes",
        );
        // The fault is a *prefix* write: nothing beyond the torn point
        // may have been modified.
        assert!(
            on_disk[zeros..].iter().all(|&b| b == 0xFF),
            "bytes past the torn prefix must keep their pre-fill value",
        );
    }

    /// The determinism contract: two handles driven by the same plan
    /// (same seed) through the same operations must end up with
    /// byte-identical files.
    #[test]
    fn same_seed_yields_same_fault_sequence() {
        let dir = TempDir::new().expect("tempdir");
        let plan = FaultPlan::new(2024, 1.0, 0.0, 0.0, 0.0, 0);
        let writes: [(u64, u8); 3] = [(0, 0x11), (1024, 0x22), (2048, 0x33)];

        let mut images: Vec<Vec<u8>> = Vec::new();
        for name in ["det-a.bin", "det-b.bin"] {
            let h = make(&dir, name, plan);
            h.set_len(4096).expect("set_len");
            // Ground-truth pre-fill, bypassing the harness.
            h.inner()
                .write_all_at(&[0xFFu8; 4096], 0)
                .expect("prefill");
            for &(offset, fill) in &writes {
                h.write_all_at(&[fill; 1024], offset)
                    .expect("torn write returns Ok");
            }
            images.push(std::fs::read(dir.path().join(name)).expect("readback"));
        }

        assert_eq!(
            images[0], images[1],
            "identical seed and operations must produce identical on-disk bytes",
        );
    }

    /// A dropped fsync is invisible to the caller: both sync flavours
    /// still report success.
    #[test]
    fn dropped_fsync_is_silent() {
        let dir = TempDir::new().expect("tempdir");
        let plan = FaultPlan::new(7, 0.0, 1.0, 0.0, 0.0, 0);
        let h = make(&dir, "df.bin", plan);
        // No file content needed; sync_data on an empty file is the
        // simplest path to exercise the fault.
        h.sync_data(SyncMode::Full)
            .expect("dropped fsync surfaces Ok");
        h.sync_all().expect("dropped sync_all surfaces Ok");
    }

    /// An injected short read surfaces as `Error::Io` with kind
    /// `UnexpectedEof`, even though the file holds enough data.
    #[test]
    fn short_read_returns_unexpected_eof() {
        let dir = TempDir::new().expect("tempdir");
        let plan = FaultPlan::new(9, 0.0, 0.0, 1.0, 0.0, 0);
        let h = make(&dir, "sr.bin", plan);
        h.set_len(4096).expect("set_len");
        h.inner()
            .write_all_at(&[0x55u8; 4096], 0)
            .expect("ground truth write");
        let mut out = [0u8; 4096];
        let err = h.read_exact_at(&mut out, 0).expect_err("short read");
        // Surfaces as Error::Io with UnexpectedEof kind.
        let crate::error::Error::Io(io) = err else {
            panic!("expected Error::Io");
        };
        assert_eq!(io.kind(), std::io::ErrorKind::UnexpectedEof);
    }

    /// With `crash_after_ops == 1` the very first counted operation
    /// panics, and the payload carries the crash marker.
    #[test]
    fn crash_after_ops_panics_with_marker() {
        let dir = TempDir::new().expect("tempdir");
        let plan = FaultPlan::new(0, 0.0, 0.0, 0.0, 0.0, 1);
        let path = dir.path().join("crash.bin");
        let h = FaultyFileHandle::open_or_create(&path, plan).expect("open");
        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            // First write call triggers the deliberate panic.
            let _ = h.write_all_at(&[0u8; 16], 0);
        }));
        let panic_payload = result.expect_err("deliberate panic");
        let msg = FaultBackendTestSupport::extract_panic_message(&panic_payload);
        assert!(
            msg.contains(FAULT_CRASH_MARKER),
            "panic must carry the crash marker; got {msg}",
        );
    }

    /// A bit-flip fault corrupts exactly one bit of the written data.
    #[test]
    fn bit_flip_changes_one_byte() {
        let dir = TempDir::new().expect("tempdir");
        let plan = FaultPlan::new(42, 0.0, 0.0, 0.0, 1.0, 0);
        let path = dir.path().join("bf.bin");
        let h = FaultyFileHandle::open_or_create(&path, plan).expect("open");
        h.set_len(4096).expect("set_len");
        let buf = vec![0u8; 256];
        h.write_all_at(&buf, 0).expect("write");
        let on_disk = std::fs::read(&path).expect("readback");
        // Find the differing byte. It must be a single bit flipped.
        let diffs: Vec<(usize, u8)> = on_disk
            .iter()
            .take(256)
            .enumerate()
            .filter(|(_, &b)| b != 0)
            .map(|(i, &b)| (i, b))
            .collect();
        assert_eq!(diffs.len(), 1, "exactly one byte must differ");
        let (_, b) = diffs[0];
        assert_eq!(b.count_ones(), 1, "exactly one bit must be flipped");
    }
}