1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
//! Configuration bridge between shodh-redb and bf-tree.
use std::path::{Path, PathBuf};
use std::sync::Arc;
use crate::bf_tree::{Config, StorageBackend, WalConfig};
use super::BfTreeError;
use super::verification::VerifyMode;
/// Tunable settings for the Bf-Tree storage engine.
///
/// A shodh-redb-facing wrapper over `crate::bf_tree::Config` that carries
/// shodh-redb-specific defaults and validation.
pub struct BfTreeConfig {
    /// Location of the data file. `:memory:` selects in-memory mode.
    pub file_path: PathBuf,

    /// Circular buffer size, in bytes. Has to be a power of two.
    /// Defaults to 32 MiB; embedded targets should consider 4-8 MiB.
    pub circular_buffer_size: usize,

    /// Upper bound on key length in bytes, table prefix overhead included.
    /// Defaults to 256 and may not exceed 2020 (bf-tree hard limit).
    /// Note: the table namespace encoding contributes
    /// `2 + table_name.len()` bytes to every key.
    pub max_key_len: usize,

    /// Smallest allowed record (key + value), in bytes. Defaults to 4.
    pub min_record_size: usize,

    /// Largest allowed record (key + value), in bytes. Defaults to 1568.
    pub max_record_size: usize,

    /// Leaf page size, in bytes. Defaults to 4096 and must be a multiple of
    /// 4096 in disk mode.
    pub leaf_page_size: usize,

    /// Whether the write-ahead log is used for durability. Defaults to true.
    pub enable_wal: bool,

    /// Interval between WAL flushes, in milliseconds. Defaults to 1.
    pub wal_flush_interval_ms: u64,

    /// Which storage backend to use.
    pub backend: BfTreeBackend,

    /// How per-entry checksums are verified.
    ///
    /// With `Full` or `Sampled`, each value is wrapped with a 4-byte FNV-1a
    /// checksum when written and checked when read. The cost is 4 extra
    /// bytes per value plus a small amount of CPU per read.
    /// Defaults to `None`.
    pub verify_mode: VerifyMode,

    /// How many commits elapse between automatic snapshots. Defaults to 100.
    ///
    /// A snapshot is expensive (full circular buffer drain + fsync), but it
    /// bounds the WAL replay length and so speeds up crash recovery. In
    /// between snapshots the WAL provides durability (`append_and_wait`
    /// blocks until fsync). A value of 0 turns automatic snapshots off
    /// entirely: either call `BfTreeDatabase::snapshot()` manually or accept
    /// longer recovery.
    pub snapshot_interval: u64,

    /// Durability mode that decides when WAL data is fsynced.
    /// Defaults to `Sync`.
    pub durability: DurabilityMode,
}
/// Selects the point at which WAL data is fsynced to disk.
///
/// This is the durability-versus-performance knob for committed data.
/// Every mode still writes WAL entries for crash recovery; the modes differ
/// only in when those entries reach stable storage via fsync.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DurabilityMode {
    /// The WAL is fsynced on every commit, so a crash loses no data.
    /// (Default)
    Sync,

    /// WAL writes land in the OS page cache and the WAL background thread
    /// fsyncs every `wal_flush_interval_ms`. A crash may lose up to one
    /// flush interval of committed data. Matches `RocksDB`'s default
    /// behavior.
    Periodic,

    /// fsync is never issued. Intended for benchmarks and ephemeral
    /// workloads only: a crash loses all committed data.
    NoSync,
}
/// Storage backend for Bf-Tree.
///
/// Variants carry no data, so the enum derives `PartialEq`/`Eq` (like
/// `DurabilityMode`) and callers can compare backends directly with `==`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BfTreeBackend {
    /// In-memory storage (no persistence).
    Memory,
    /// Standard file I/O (cross-platform).
    Std,
    /// Linux `io_uring` in blocking mode (Linux only, better throughput).
    #[cfg(target_os = "linux")]
    IoUringBlocking,
    /// Linux `io_uring` in polling mode (Linux only, lowest latency).
    #[cfg(target_os = "linux")]
    IoUringPolling,
}
impl Default for BfTreeConfig {
    /// Baseline configuration: in-memory backend at the `:memory:` path with
    /// a 32 MiB circular buffer, WAL enabled with a 1 ms flush interval, no
    /// checksum verification, a snapshot every 100 commits and `Sync`
    /// durability.
    fn default() -> Self {
        const MIB: usize = 1024 * 1024;

        Self {
            // Where the data lives.
            file_path: PathBuf::from(":memory:"),
            backend: BfTreeBackend::Memory,

            // Buffer, key, record and page sizing.
            circular_buffer_size: 32 * MIB,
            max_key_len: 256,
            min_record_size: 4,
            max_record_size: 1568,
            leaf_page_size: 4096,

            // Durability and integrity.
            enable_wal: true,
            wal_flush_interval_ms: 1,
            durability: DurabilityMode::Sync,
            snapshot_interval: 100,
            verify_mode: VerifyMode::None,
        }
    }
}
impl BfTreeBackend {
/// Returns `true` if this backend persists data to a file.
fn is_file_backed(&self) -> bool {
match self {
BfTreeBackend::Memory => false,
BfTreeBackend::Std => true,
#[cfg(target_os = "linux")]
BfTreeBackend::IoUringBlocking => true,
#[cfg(target_os = "linux")]
BfTreeBackend::IoUringPolling => true,
}
}
}
impl BfTreeConfig {
    /// Create a configuration for file-backed storage.
    ///
    /// Uses the `Std` backend with `path` as the data file; every other
    /// setting comes from `Self::default()`.
    ///
    /// `buffer_size_mib` is the circular buffer size in MiB. The conversion
    /// to bytes saturates at `usize::MAX` instead of overflowing, which would
    /// otherwise panic in debug builds and silently wrap to a tiny (or zero)
    /// buffer in release builds -- easy to hit on 32-bit targets.
    pub fn new_file(path: impl AsRef<Path>, buffer_size_mib: usize) -> Self {
        Self {
            file_path: path.as_ref().to_path_buf(),
            circular_buffer_size: buffer_size_mib.saturating_mul(1024 * 1024),
            backend: BfTreeBackend::Std,
            ..Self::default()
        }
    }

    /// Create a configuration for in-memory storage.
    ///
    /// Uses the `Memory` backend with the `:memory:` path and the WAL
    /// disabled; every other setting comes from `Self::default()`.
    ///
    /// `buffer_size_mib` is the circular buffer size in MiB, converted to
    /// bytes with saturating arithmetic (see `new_file`).
    pub fn new_memory(buffer_size_mib: usize) -> Self {
        Self {
            file_path: PathBuf::from(":memory:"),
            circular_buffer_size: buffer_size_mib.saturating_mul(1024 * 1024),
            enable_wal: false,
            backend: BfTreeBackend::Memory,
            ..Self::default()
        }
    }

    /// Create a configuration optimized for embedded targets.
    /// Uses smaller buffer (4 MiB) and conservative record sizes.
    pub fn new_embedded(path: impl AsRef<Path>) -> Self {
        Self {
            file_path: path.as_ref().to_path_buf(),
            circular_buffer_size: 4 * 1024 * 1024, // 4 MiB
            max_key_len: 16,
            min_record_size: 4,
            max_record_size: 512,
            leaf_page_size: 4096,
            enable_wal: true,
            wal_flush_interval_ms: 10, // less frequent flush for embedded
            backend: BfTreeBackend::Std,
            verify_mode: VerifyMode::None,
            snapshot_interval: 100,
            durability: DurabilityMode::Sync,
        }
    }

    /// Convert to bf-tree's native `Config`.
    ///
    /// When the WAL is enabled, its file is placed in the data file's
    /// directory and named `<stem>.wal`, where `<stem>` is the data file name
    /// without its final extension (`data` is substituted when the stem is
    /// not valid UTF-8). A `:memory:` data path maps to a `:memory:` WAL
    /// path.
    ///
    /// # Errors
    ///
    /// Returns `Err(BfTreeError::InvalidConfig)` if WAL is disabled on a
    /// file-backed backend. Without WAL, committed data is only durable after
    /// an explicit `snapshot()` call; any crash between `commit()` and
    /// `snapshot()` silently loses data. This configuration is rejected to
    /// prevent silent data loss.
    ///
    /// Any error produced by `Config::validate` is propagated as well.
    pub(crate) fn into_bf_config(self) -> Result<Config, BfTreeError> {
        // Reject the one combination that loses committed data silently.
        if self.backend.is_file_backed() && !self.enable_wal {
            return Err(BfTreeError::InvalidConfig(String::from(
                "WAL must be enabled for file-backed storage backends; \
                 disabling WAL on a file backend causes silent data loss on crash. \
                 Use enable_wal: true, or switch to BfTreeBackend::Memory for \
                 non-durable in-memory usage",
            )));
        }

        let mut config = Config::new(&self.file_path, self.circular_buffer_size);

        // Map the shodh-redb backend selection onto bf-tree's own enum.
        let storage_backend = match self.backend {
            BfTreeBackend::Memory => StorageBackend::Memory,
            BfTreeBackend::Std => StorageBackend::Std,
            #[cfg(target_os = "linux")]
            BfTreeBackend::IoUringBlocking => StorageBackend::IoUringBlocking,
            #[cfg(target_os = "linux")]
            BfTreeBackend::IoUringPolling => StorageBackend::IoUringPolling,
        };
        config.storage_backend(storage_backend);
        config.cb_max_key_len(self.max_key_len);
        config.cb_min_record_size(self.min_record_size);
        config.cb_max_record_size(self.max_record_size);
        config.leaf_page_size(self.leaf_page_size);

        if self.enable_wal {
            let wal_path = if self.file_path.to_str() == Some(":memory:") {
                PathBuf::from(":memory:")
            } else {
                // Keep the WAL beside the data file: same directory, same
                // stem, `.wal` extension. The derivation is intentionally
                // unchanged so existing on-disk WAL files are still found.
                let parent = self.file_path.parent().unwrap_or(Path::new("."));
                let stem = self
                    .file_path
                    .file_stem()
                    .unwrap_or_default()
                    .to_str()
                    .unwrap_or("data");
                parent.join(format!("{stem}.wal"))
            };
            let mut wal_config = WalConfig::new(wal_path);
            wal_config.flush_interval(std::time::Duration::from_millis(self.wal_flush_interval_ms));
            config.enable_write_ahead_log(Arc::new(wal_config));
        }

        config.validate()?;
        Ok(config)
    }
}