1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// SPDX-License-Identifier: Apache-2.0
// Copyright (c) 2026-present, Structured World Foundation
//! Tree-wide queue of SSTs whose blocks were ECC-corrected on a read that was
//! confirmed *persistent* (the on-disk bytes are still faulty after a
//! cache-bypassing re-read).
//!
//! A block-read that finds a checksum mismatch but recovers the payload from
//! its Page-ECC parity returns the *correct* bytes to the caller, yet leaves
//! the faulty bytes on disk. Without intervention the same latent fault is
//! re-corrected on every future read, and a second bit-flip in the same block
//! could push it past the parity's correction budget into unrecoverable
//! territory. Recording the SST here lets the compaction picker rewrite it
//! clean (a new file + atomic manifest swap) instead.
//!
//! # Why a tree-wide set and not a per-table flag
//!
//! Tables only know their own id + [`crate::fs::Fs`]; they have no
//! back-reference to the owning tree. Each table carries an
//! [`Arc<HealHints>`] (installed once after it joins a tree, [`None`] before
//! that) so the read path can record a hint in O(log n) (a `BTreeSet` insert,
//! n = pending SSTs, only on the cold corrected path), and the compaction picker
//! drains the whole set in one place. The set dedups by
//! [`GlobalTableId`](crate::GlobalTableId): many corrected block-reads against
//! the same SST enqueue it once.
use crate::GlobalTableId;
use alloc::collections::BTreeSet;
use alloc::sync::Arc;
#[cfg(test)]
use alloc::vec::Vec;
use core::sync::atomic::{AtomicBool, Ordering};
use spin::Mutex;
/// Shared set of SSTs awaiting priority recompaction after a confirmed
/// persistent ECC correction.
///
/// Cheap to clone (it is always held behind an [`Arc`]). The inner mutex is
/// only ever contended on the cold corrected-read path and the periodic
/// compaction drain, never on a clean read.
// `spin::Mutex` rather than `std::sync::Mutex` keeps this module's std
// footprint at zero — it is `no_std` + `alloc` by construction, matching
// `crate::deletion_pause`. The set is never touched on a clean read, so
// spin contention is irrelevant in practice.
#[derive(Default)]
pub struct HealHints {
/// SSTs queued for healing recompaction, deduped by id.
pending: Mutex<BTreeSet<GlobalTableId>>,
/// Mirrors [`RuntimeConfig::auto_heal`](crate::runtime_config::RuntimeConfig::auto_heal).
/// When `false`, the read path skips the persistence re-read and records
/// nothing — correction-on-read still happens, only the rewrite scheduling
/// is suppressed. The owning tree keeps this in sync with its runtime
/// config. Default `false` (scheduling is opt-in).
enabled: AtomicBool,
}
impl HealHints {
/// Creates a fresh, empty shared hint set with scheduling set to `enabled`
/// (mirrors the tree's initial `auto_heal`).
#[must_use]
pub fn new_shared(enabled: bool) -> Arc<Self> {
let hints = Self::default();
hints.set_enabled(enabled);
Arc::new(hints)
}
/// Returns `true` when rewrite scheduling is enabled (mirrors
/// [`RuntimeConfig::auto_heal`](crate::runtime_config::RuntimeConfig::auto_heal)).
#[must_use]
pub fn is_enabled(&self) -> bool {
self.enabled.load(Ordering::Relaxed)
}
/// Enables or disables rewrite scheduling. The owning tree calls this on open
/// and on every runtime-config update so the read-path gate tracks
/// `auto_heal`.
pub fn set_enabled(&self, enabled: bool) {
self.enabled.store(enabled, Ordering::Relaxed);
}
/// Records `id` as needing a healing recompaction.
///
/// Returns `true` when the id was newly inserted, `false` when it was
/// already queued — callers use this to count each SST once.
pub fn record(&self, id: GlobalTableId) -> bool {
self.pending.lock().insert(id)
}
/// Claims one queued id, removing and returning it (`None` when empty).
///
/// The heal compaction strategy pops one SST per pass; losing a popped id to
/// a failed compaction is self-correcting (the next read re-records it).
#[must_use]
pub fn pop(&self) -> Option<GlobalTableId> {
self.pending.lock().pop_first()
}
/// Returns `true` when no SST is currently queued for healing.
///
/// Lets a scheduler skip running the heal strategy when there is nothing to
/// do.
#[must_use]
pub fn is_empty(&self) -> bool {
self.pending.lock().is_empty()
}
/// Returns a snapshot of the queued ids (test-only helper).
#[cfg(test)]
#[must_use]
pub fn snapshot(&self) -> Vec<GlobalTableId> {
self.pending.lock().iter().copied().collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn id(tree: u64, table: u64) -> GlobalTableId {
GlobalTableId::from((tree, table))
}
#[test]
fn record_dedups_same_id_returns_false_on_repeat() {
let hints = HealHints::default();
assert!(hints.record(id(1, 7)));
assert!(!hints.record(id(1, 7)));
assert_eq!(hints.snapshot(), vec![id(1, 7)]);
}
#[test]
fn record_collects_distinct_ids() {
let hints = HealHints::default();
hints.record(id(1, 1));
hints.record(id(1, 2));
hints.record(id(1, 1));
let snapshot = hints.snapshot();
assert_eq!(snapshot, vec![id(1, 1), id(1, 2)]);
}
}