1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
use std::{
ops::Deref,
path::{Path, PathBuf},
sync::Arc,
};
use crate::{
io,
utils::observe::{NoopObserver, Observer},
};
use super::OpCode;
/// Returns the directory whose entry list must be synced after `path` was created.
///
/// A bare relative name such as `"foo"` has an empty parent component; that case is mapped to
/// `"."` so the result is always a path that can be opened. Returns `None` when `path` has no
/// parent at all (a filesystem root or the empty path).
fn dir_parent_for_sync(path: &Path) -> Option<&Path> {
    path.parent().map(|parent| {
        if parent.as_os_str().is_empty() {
            // An empty parent means "the current directory".
            Path::new(".")
        } else {
            parent
        }
    })
}
/// Creates `path` together with any missing ancestors, then syncs the parent directory of
/// every directory that was newly created.
///
/// Does nothing when `path` already exists.
///
/// # Errors
///
/// Propagates the first error from `std::fs::create_dir_all` or from `io::sync_dir`.
fn create_dir_all_and_sync(path: &Path) -> std::io::Result<()> {
    if path.exists() {
        return Ok(());
    }

    // Record which components are missing *before* creating anything: walk from `path`
    // towards the root and stop at the first ancestor that already exists.
    let missing: Vec<&Path> = path.ancestors().take_while(|dir| !dir.exists()).collect();

    std::fs::create_dir_all(path)?;

    // Sync from the outermost new directory inwards, i.e. in creation order.
    for dir in missing.into_iter().rev() {
        if let Some(parent) = dir_parent_for_sync(dir) {
            io::sync_dir(parent)?;
        }
    }
    Ok(())
}
/// Configuration options for the Mace storage engine.
///
/// Create a value with [`Options::new`], adjust the public fields as needed, then call
/// [`Options::validate`] to normalize it and create the on-disk directories.
#[derive(Clone)]
pub struct Options {
/// Force-sync data to disk for every wal/data write.
///
/// The default value is `true` (use fsync or else use fdatasync). Turning it off may result in
/// data loss, while turning it on may reduce performance.
pub sync_on_write: bool,
/// Writer group count. Default is [`Self::CONCURRENT_WRITE`] and it must be in the range `[1, 128]`
///
/// [`Options::validate`] clamps the value to `[1, Self::MAX_CONCURRENT_WRITE]` and then rounds
/// it up to the next power of two.
///
/// **Once set, it cannot be modified**
pub concurrent_write: u8,
/// Garbage collection cycle interval (milliseconds). Default is `60_000` (1 minute).
pub gc_timeout: u64,
/// Proactive page-checkpoint trigger interval (milliseconds). Default is `60_000` (1 minute).
///
/// When a bucket has pending dirty pages but no foreground write reaches checkpoint thresholds,
/// the evictor triggers checkpoint near this interval to prevent WAL checkpoint stalling.
///
/// Set to 0 to disable proactive triggering.
pub checkpoint_nudge_ms: u64,
/// Perform compaction when the garbage ratio exceeds this value, in the range `[0, 100]`.
/// Default is `20`.
pub data_garbage_ratio: u32,
/// If true, compact immediately when [`Self::data_garbage_ratio`] is reached. Default is `true`.
pub gc_eager: bool,
/// Size limit of a blob file. Default is [`Self::BLOB_FILE_SIZE`]
///
/// A value of 0 is replaced by the default in [`Options::validate`].
pub blob_file_size: usize,
/// Trigger blob GC when the garbage ratio exceeds this value, in the range `[0, 100]`.
/// Default is `50`.
pub blob_garbage_ratio: usize,
/// At each blob GC cycle, pick the lowest-utilization [`Self::blob_gc_ratio`]% of blob files as candidates.
/// Default is `25`.
pub blob_gc_ratio: usize,
/// Whether this is temporary storage. Default is `false`.
///
/// If true, `db_root` is removed (best effort) when the [`ParsedOptions`] returned by
/// [`Options::validate`] is dropped.
pub tmp_store: bool,
/// Directory where database files are stored.
pub(crate) db_root: PathBuf,
/// Directory where log files are stored.
///
/// The default value is `db_root/log`: [`Options::new`] stores `db_root` in this field, and the
/// [`Options::log_root`] accessor resolves to `db_root/log` for as long as the field equals
/// `db_root`.
pub log_root: PathBuf,
/// Shared logical-address cache capacity in bytes. Default is [`Self::LRU_CAPACITY`]
///
/// This cache keeps file-loaded blob values and auxiliary history/sibling pages.
/// Resident tree pages and dirty pool pages are accounted elsewhere and are not inserted here.
/// Trimming is best-effort and happens in small rounds, so short-term overshoot is possible.
///
/// Different subsystems may transiently hold refs to the same allocation.
///
/// A value of 0 is replaced by the default in [`Options::validate`].
pub lru_capacity: usize,
/// Bitmap-cache entry count for data and blob stats. Default is [`Self::STAT_MASK_CACHE_CNT`]
///
/// A value of 0 is replaced by the default in [`Options::validate`].
pub stat_mask_cache_count: usize,
/// Maximum number of open data-file handles cached concurrently, used for loading data pages.
/// Default is `128`.
pub data_handle_cache_capacity: usize,
/// Maximum number of open blob-file handles cached concurrently, used for loading blob pages.
/// Default is `128`.
pub blob_handle_cache_capacity: usize,
/// Size limit of a data file. Minimum is [`Self::DATA_FILE_SIZE`]
///
/// NOTE(review): [`Options::validate`] only replaces 0 with [`Self::DATA_FILE_SIZE`] (which is
/// also the default); no minimum is enforced there. Confirm whether smaller non-zero values
/// are rejected elsewhere.
pub data_file_size: usize,
/// WAL ring buffer size. Must be greater than the page size and a power of two.
/// Default is [`Self::WAL_BUF_SZ`]
///
/// NOTE(review): [`Options::validate`] does not check this constraint; presumably it is
/// enforced by the WAL code. Confirm.
pub wal_buffer_size: usize,
/// Number of checkpoints a transaction can span (i.e., transaction length limit).
/// Default is `1_000_000`.
///
/// If a transaction exceeds this limit, it is forcibly aborted.
pub max_ckpt_per_txn: usize,
/// WAL file size limit that triggers switching to a new WAL file, up to 2GB.
/// Default is [`Self::WAL_FILE_SZ`]
pub wal_file_size: u32,
/// If true, remove unused stable WAL files (never used in recovery).
///
/// NOTE(review): the field name suggests that `true` *keeps* stable WAL files while the
/// sentence above says they are removed. Confirm the intended meaning against the WAL
/// cleanup code.
///
/// Default is `false`.
pub keep_stable_wal_file: bool,
/// If true, corrupted WAL is truncated during recovery; otherwise recovery panics.
///
/// Default is true.
pub truncate_corrupted_wal: bool,
/// Observability callback. Default is no-op.
pub observer: Arc<dyn Observer>,
}
/// Per-bucket tuning knobs.
///
/// Build a value with [`BucketOptions::new`] (or `Default`), adjust the public fields, then run
/// it through [`BucketOptions::validate`] to bring every field into its supported range.
/// The struct is `#[repr(C)]`, so the field order below is part of its layout.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(C)]
pub struct BucketOptions {
    /// Per-bucket target resident bytes for mapped B+Tree pages.
    pub cache_capacity: usize,
    /// Percentage of items evicted per round. Range is `[10, 80]`, default is `20%`
    pub cache_evict_pct: usize,
    /// Per-bucket pool target bytes. Default is [`Self::POOL_CAP`]
    pub pool_capacity: usize,
    /// Maximum bytes a single checkpoint round should emit. Default is [`Self::CHECKPOINT_SIZE`]
    pub checkpoint_size: usize,
    /// For branch nodes, keys and indexes are always inlined. For leaf nodes, values smaller than
    /// [`Self::MIN_INLINE_SIZE`] are always inlined, and values larger than [`Self::MAX_INLINE_SIZE`]
    /// are stored as blobs. Default is [`Self::MIN_INLINE_SIZE`]
    pub inline_size: usize,
    /// Maximum number of elements in an SST (B+Tree node). Default is [`Self::MAX_SPLIT_ELEMS`]
    pub split_elems: u16,
    /// Threshold for consolidating delta chains. Range is `[16, split_elems / 2]`
    pub consolidate_threshold: u16,
    /// Enable foreground write backpressure. Default is `true`
    pub enable_backpressure: bool,
    /// Explicit padding bytes; [`BucketOptions::new`] zeroes them.
    pub _padding: [u8; 3],
}

impl Default for BucketOptions {
    /// Same as [`BucketOptions::new`].
    fn default() -> Self {
        BucketOptions::new()
    }
}

impl BucketOptions {
    /// Lower bound applied to `split_elems` by [`Self::validate`].
    pub const MIN_SPLIT_ELEMS: u16 = 64;
    /// Upper bound (and default) for `split_elems`.
    pub const MAX_SPLIT_ELEMS: u16 = 512;
    /// Default `cache_capacity`.
    pub const CACHE_CAP: usize = 1 << 30; // 1GB
    /// Default `pool_capacity`.
    pub const POOL_CAP: usize = 1 << 30; // 1GB
    /// Default `checkpoint_size`.
    pub const CHECKPOINT_SIZE: usize = 256 << 20; // 256MB
    /// Lower bound (and default) for `inline_size`.
    pub const MIN_INLINE_SIZE: usize = 4096;
    /// Upper bound for `inline_size`.
    pub const MAX_INLINE_SIZE: usize = 16384;

    /// Returns the default per-bucket configuration.
    pub fn new() -> Self {
        Self {
            // Memory budgets.
            cache_capacity: Self::CACHE_CAP,
            pool_capacity: Self::POOL_CAP,
            checkpoint_size: Self::CHECKPOINT_SIZE,
            cache_evict_pct: 20,
            // Tree shape.
            inline_size: Self::MIN_INLINE_SIZE,
            split_elems: Self::MAX_SPLIT_ELEMS,
            consolidate_threshold: Self::MAX_SPLIT_ELEMS / 2,
            // Flags and padding.
            enable_backpressure: true,
            _padding: [0u8; 3],
        }
    }

    /// Normalizes the options and returns the adjusted copy.
    ///
    /// * A zero `checkpoint_size`, `pool_capacity` or `cache_capacity` is replaced by its default.
    /// * `checkpoint_size` is capped at `pool_capacity`.
    /// * `cache_evict_pct`, `split_elems`, `consolidate_threshold` and `inline_size` are clamped
    ///   to their supported ranges.
    pub fn validate(mut self) -> BucketOptions {
        // Zero is treated as "unset" for the byte budgets.
        let non_zero_or = |value: usize, fallback: usize| if value == 0 { fallback } else { value };

        self.cache_capacity = non_zero_or(self.cache_capacity, Self::CACHE_CAP);
        self.pool_capacity = non_zero_or(self.pool_capacity, Self::POOL_CAP);
        // The cap uses the already-normalized pool capacity.
        self.checkpoint_size =
            non_zero_or(self.checkpoint_size, Self::CHECKPOINT_SIZE).min(self.pool_capacity);

        self.cache_evict_pct = self.cache_evict_pct.max(10).min(80);
        self.inline_size = self
            .inline_size
            .max(Self::MIN_INLINE_SIZE)
            .min(Self::MAX_INLINE_SIZE);

        // `split_elems` must be settled first: the threshold's upper bound depends on it.
        let elems = self
            .split_elems
            .max(Self::MIN_SPLIT_ELEMS)
            .min(Self::MAX_SPLIT_ELEMS);
        let threshold_cap = elems / 2;
        self.split_elems = elems;
        self.consolidate_threshold = self.consolidate_threshold.clamp(16, threshold_cap);

        self
    }

    /// Returns `split_elems / 4` as a `usize`.
    pub(crate) fn max_delta_len(&self) -> usize {
        usize::from(self.split_elems) / 4
    }
}
/// Defaults, construction and validation.
impl Options {
    /// Default writer group count.
    pub const CONCURRENT_WRITE: u8 = 16;
    /// Largest writer group count accepted by [`Self::validate`].
    pub const MAX_CONCURRENT_WRITE: u8 = 128;
    /// Default data file size limit.
    pub const DATA_FILE_SIZE: usize = 64 << 20; // 64MB
    /// Default blob file size limit.
    pub const BLOB_FILE_SIZE: usize = 256 << 20; // 256MB
    /// Default shared cache capacity.
    pub const LRU_CAPACITY: usize = 256 << 20; // 256MB
    // Assuming a MemData/BlobStat is 32 KB, 16,384 stats use ~512 MB of memory, which is reasonable.
    /// Default bitmap-cache entry count.
    pub const STAT_MASK_CACHE_CNT: usize = 16384;
    /// Default WAL ring buffer size.
    pub const WAL_BUF_SZ: usize = 16 << 20; // 16MB
    /// Default WAL file size limit.
    pub const WAL_FILE_SZ: usize = 64 << 20; // 64MB
    pub(crate) const MAX_KEY_SIZE: usize = 64 << 10;
    pub(crate) const MAX_KV_SIZE: usize = 1 << 30; // 1GB

    /// Builds an `Options` value populated with defaults, rooted at `db_root`.
    ///
    /// `log_root` starts out equal to `db_root`.
    pub fn new<P: AsRef<Path>>(db_root: P) -> Self {
        let root = db_root.as_ref().to_path_buf();
        Self {
            // Locations.
            db_root: root.clone(),
            log_root: root,
            tmp_store: false,
            // Durability and write concurrency.
            sync_on_write: true,
            concurrent_write: Self::CONCURRENT_WRITE,
            // Garbage collection and checkpointing.
            gc_timeout: 60 * 1000,          // 1min
            checkpoint_nudge_ms: 60 * 1000, // 1min
            data_garbage_ratio: 20,         // 20%
            gc_eager: true,
            blob_garbage_ratio: 50, // 50%
            blob_gc_ratio: 25,      // 25%
            max_ckpt_per_txn: 1_000_000, // 1 million
            // File sizes.
            data_file_size: Self::DATA_FILE_SIZE,
            blob_file_size: Self::BLOB_FILE_SIZE,
            // Caches.
            lru_capacity: Self::LRU_CAPACITY,
            stat_mask_cache_count: Self::STAT_MASK_CACHE_CNT,
            data_handle_cache_capacity: 128,
            blob_handle_cache_capacity: 128,
            // WAL.
            wal_buffer_size: Self::WAL_BUF_SZ,
            wal_file_size: Self::WAL_FILE_SZ as u32,
            keep_stable_wal_file: false,
            truncate_corrupted_wal: true,
            // Observability.
            observer: Arc::new(NoopObserver),
        }
    }

    /// Normalizes the options, creates the directory layout and wraps the result in
    /// [`ParsedOptions`].
    ///
    /// `concurrent_write` is clamped to `[1, MAX_CONCURRENT_WRITE]` and rounded up to a power of
    /// two. A zero `stat_mask_cache_count`, `lru_capacity`, `data_file_size` or `blob_file_size`
    /// is replaced by its default.
    ///
    /// # Errors
    ///
    /// Returns [`OpCode::IoError`] when [`Self::create_dir`] fails; the underlying error is
    /// printed to stderr.
    pub fn validate(mut self) -> Result<ParsedOptions, OpCode> {
        let groups = self.concurrent_write.clamp(1, Self::MAX_CONCURRENT_WRITE);
        self.concurrent_write = groups.next_power_of_two();

        // Zero means "unset" for these sizes: substitute the built-in default.
        let zero_means_default = [
            (&mut self.stat_mask_cache_count, Self::STAT_MASK_CACHE_CNT),
            (&mut self.lru_capacity, Self::LRU_CAPACITY),
            (&mut self.data_file_size, Self::DATA_FILE_SIZE),
            (&mut self.blob_file_size, Self::BLOB_FILE_SIZE),
        ];
        for (field, fallback) in zero_means_default {
            if *field == 0 {
                *field = fallback;
            }
        }

        match self.create_dir() {
            Ok(()) => Ok(ParsedOptions { inner: self }),
            Err(e) => {
                eprintln!("create dir fail {e:?}");
                Err(OpCode::IoError)
            }
        }
    }

    /// Creates the database, data and log directories if they do not exist yet.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error reported by `create_dir_all_and_sync`.
    pub fn create_dir(&self) -> std::io::Result<()> {
        // All three paths are resolved up front, then handled in this order.
        for dir in [self.db_root(), self.data_root(), self.log_root()] {
            if !dir.exists() {
                create_dir_all_and_sync(&dir)?;
            }
        }
        Ok(())
    }
}
/// An [`Options`] value that went through [`Options::validate`].
///
/// It dereferences to the wrapped [`Options`], so fields and helper methods can be used
/// directly on it.
pub struct ParsedOptions {
    /// The validated options.
    pub(crate) inner: Options,
}

impl Deref for ParsedOptions {
    type Target = Options;

    /// Borrows the wrapped options.
    fn deref(&self) -> &Options {
        let Self { inner } = self;
        inner
    }
}
/// File naming scheme and path helpers.
impl Options {
    /// Separator between the parts of a generated file name.
    pub const SEP: &'static str = "_";
    /// File name prefix of data files.
    pub const DATA_PREFIX: &'static str = "data";
    /// File name prefix of blob files.
    pub const BLOB_PREFIX: &'static str = "blob";
    /// File name prefix of WAL files.
    pub const WAL_PREFIX: &'static str = "wal";
    /// File name prefix of WAL backup files.
    pub const WAL_STABLE: &'static str = "stable-wal";
    /// File name of the manifest.
    pub const MANIFEST: &'static str = "manifest";

    /// Returns `<db_root>/data`.
    pub fn data_root(&self) -> PathBuf {
        let mut root = self.db_root();
        root.push("data");
        root
    }

    /// Returns the path of data file `id`: `<data_root>/data_<id>`.
    pub fn data_file(&self, id: u64) -> PathBuf {
        let name = format!(
            "{prefix}{sep}{id}",
            prefix = Self::DATA_PREFIX,
            sep = Self::SEP
        );
        self.data_root().join(name)
    }

    /// Returns the path of blob file `id`: `<data_root>/blob_<id>`.
    pub fn blob_file(&self, id: u64) -> PathBuf {
        let name = format!(
            "{prefix}{sep}{id}",
            prefix = Self::BLOB_PREFIX,
            sep = Self::SEP
        );
        self.data_root().join(name)
    }

    /// Returns the effective log directory.
    ///
    /// While the `log_root` field still equals `db_root` this is `<db_root>/log`; otherwise it
    /// is the configured `log_root`.
    pub fn log_root(&self) -> PathBuf {
        if self.log_root != self.db_root {
            return self.log_root.clone();
        }
        self.db_root.join("log")
    }

    /// Returns a copy of the database root directory.
    pub fn db_root(&self) -> PathBuf {
        self.db_root.to_path_buf()
    }

    /// Returns the path of a WAL file: `<log_root>/wal_<group_id>_<seq>`.
    pub fn wal_file(&self, group_id: u8, seq: u64) -> PathBuf {
        let name = format!(
            "{prefix}{sep}{group_id}{sep}{seq}",
            prefix = Self::WAL_PREFIX,
            sep = Self::SEP
        );
        self.log_root().join(name)
    }

    /// Returns the path of a WAL backup file: `<log_root>/stable-wal_<group_id>_<seq>`.
    pub fn wal_backup(&self, group_id: u8, seq: u64) -> PathBuf {
        let name = format!(
            "{prefix}{sep}{group_id}{sep}{seq}",
            prefix = Self::WAL_STABLE,
            sep = Self::SEP
        );
        self.log_root().join(name)
    }

    /// Returns the manifest path: `<log_root>/manifest`.
    pub fn manifest(&self) -> PathBuf {
        let mut path = self.log_root();
        path.push(Self::MANIFEST);
        path
    }

    /// Syncs the data directory.
    ///
    /// # Panics
    ///
    /// Panics if `io::sync_dir` reports an error.
    pub(crate) fn sync_data_dir(&self) {
        if let Err(e) = io::sync_dir(self.data_root()) {
            panic!("can't fail, {:?}", e);
        }
    }

    /// Syncs the log directory.
    ///
    /// # Panics
    ///
    /// Panics if `io::sync_dir` reports an error.
    pub(crate) fn sync_log_dir(&self) {
        if let Err(e) = io::sync_dir(self.log_root()) {
            panic!("can't fail, {:?}", e);
        }
    }
}
impl Drop for ParsedOptions {
    /// Deletes `db_root` when the options describe a temporary store.
    fn drop(&mut self) {
        let opts = &self.inner;
        if !opts.tmp_store {
            return;
        }
        log::info!("remove db_root {:?}", opts.db_root);
        // Best-effort cleanup: a failed removal is deliberately ignored.
        let _ = std::fs::remove_dir_all(&opts.db_root);
    }
}