// bf_tree/config.rs
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

use std::{
    fs,
    path::{Path, PathBuf},
    sync::{
        atomic::{AtomicUsize, Ordering},
        Arc,
    },
    time::Duration,
};

use crate::{
    error::ConfigError,
    nodes::{
        leaf_node::LeafKVMeta, LeafNode, CACHE_LINE_SIZE, DISK_PAGE_SIZE, MAX_KEY_LEN,
        MAX_LEAF_PAGE_SIZE,
    },
};
use serde::Deserialize;
// Promotion probability (percent) used when built with debug assertions.
const DEFAULT_PROMOTION_RATE_DEBUG: usize = 50;
// Promotion probability (percent) used in release builds.
// NOTE(review): "RLEASE" is a typo for "RELEASE"; renaming it requires
// touching every usage, so it is left as-is here.
const DEFAULT_PROMOTION_RATE_RLEASE: usize = 30;
const DEFAULT_MAX_MINI_PAGE_SIZE: usize = 2048; // Deprecated
// Fraction of the circular buffer managed as the copy-on-access region
// (0.0 = FIFO behavior, 1.0 = LRU behavior).
const DEFAULT_COPY_ON_ACCESS_RATIO: f64 = 0.1;
// Circular buffer capacity in bytes (32 MiB); `validate` requires a power of two.
const DEFAULT_CIRCULAR_BUFFER_SIZE: usize = 1024 * 1024 * 32;
// Minimum record size (key + value, in bytes) for the circular buffer.
const DEFAULT_MIN_RECORD_SIZE: usize = 4;
// Maximum record size (key + value, in bytes) for the circular buffer.
const DEFAULT_MAX_RECORD_SIZE: usize = 1952;
// Leaf page size in bytes.
const DEFAULT_LEAF_PAGE_SIZE: usize = 4096;
// Maximum key length in bytes; the fence-key limit is kept at twice this value.
const DEFAULT_MAX_KEY_LEN: usize = 16;
/// Bf-tree configuration for advanced usage.
/// Bf-tree is designed to work well on various workloads that you don't have to change the default configuration.
/// This configuration is more for advanced users who want to understand the different components of the system, rather than performance tuning.
#[derive(Debug)]
pub struct Config {
    /// prob% chance that a record read from a leaf page is promoted to a mini page.
    pub(crate) read_promotion_rate: AtomicUsize,
    /// prob% chance that a page is promoted to the buffer during scan operations.
    pub(crate) scan_promotion_rate: AtomicUsize,
    /// Where/how leaf pages are stored (memory, std file I/O, io_uring, ...).
    pub(crate) storage_backend: StorageBackend,
    /// Circular buffer capacity in bytes; must be a power of two (see `validate`).
    pub(crate) cb_size_byte: usize,
    /// Minimum record size (key + value, in bytes) for the circular buffer.
    pub(crate) cb_min_record_size: usize,
    /// Maximum record size (key + value, in bytes) for the circular buffer.
    pub(crate) cb_max_record_size: usize,
    /// Leaf page size in bytes.
    pub(crate) leaf_page_size: usize,
    /// Maximum key length in bytes.
    pub(crate) cb_max_key_len: usize,
    /// Maximum fence-key length; kept at 2 * cb_max_key_len by the setters.
    pub(crate) max_fence_len: usize,
    /// Copy-on-access region ratio: 0.0 behaves like a FIFO, 1.0 like an LRU.
    pub(crate) cb_copy_on_access_ratio: f64,
    /// Cache hot records in mini pages (true) or whole base pages (false).
    pub(crate) read_record_cache: bool,
    /// Path of the index file backing the tree.
    pub(crate) file_path: PathBuf,
    /// Maximum mini page size before it grows to a full page.
    pub(crate) max_mini_page_size: usize,
    /// Use binary search (true) or linear search (false) inside mini pages.
    pub(crate) mini_page_binary_search: bool,
    /// Optional write-ahead-log configuration; `None` disables the WAL.
    pub(crate) write_ahead_log: Option<Arc<WalConfig>>,
    /// Whether writes load the full page -- TODO(review) confirm exact semantics.
    pub(crate) write_load_full_page: bool,
    /// Run purely in memory with no disk backing (`:cache:` mode).
    pub(crate) cache_only: bool,
}
55impl Clone for Config {
56    fn clone(&self) -> Self {
57        Self {
58            read_promotion_rate: AtomicUsize::new(self.read_promotion_rate.load(Ordering::Relaxed)),
59            scan_promotion_rate: AtomicUsize::new(self.scan_promotion_rate.load(Ordering::Relaxed)),
60            storage_backend: self.storage_backend.clone(),
61            cb_size_byte: self.cb_size_byte,
62            cb_min_record_size: self.cb_min_record_size,
63            cb_max_record_size: self.cb_max_record_size,
64            leaf_page_size: self.leaf_page_size,
65            cb_max_key_len: self.cb_max_key_len,
66            max_fence_len: self.max_fence_len,
67            cb_copy_on_access_ratio: self.cb_copy_on_access_ratio,
68            read_record_cache: self.read_record_cache,
69            file_path: self.file_path.clone(),
70            max_mini_page_size: self.max_mini_page_size,
71            mini_page_binary_search: self.mini_page_binary_search,
72            write_ahead_log: self.write_ahead_log.clone(),
73            write_load_full_page: self.write_load_full_page,
74            cache_only: self.cache_only,
75        }
76    }
77}
/// Where/how to store the leaf pages?
#[derive(Debug, Default, Clone, Eq, PartialEq)]
pub enum StorageBackend {
    /// Keep everything in memory; no file-backed storage.
    Memory,
    /// File storage via the standard library (the default).
    #[default]
    Std,
    /// NOTE(review): presumably std file I/O with direct (unbuffered)
    /// access -- confirm against the backend implementation.
    #[cfg(target_os = "linux")]
    StdDirect,
    /// io_uring backend, polling mode (Linux only).
    #[cfg(target_os = "linux")]
    IoUringPolling,
    /// io_uring backend, blocking mode (Linux only).
    #[cfg(target_os = "linux")]
    IoUringBlocking,
    /// SPDK backend (Linux only, requires the "spdk" feature).
    #[cfg(all(target_os = "linux", feature = "spdk"))]
    Spdk,
}
/// Schema of the TOML configuration file accepted by
/// `Config::new_with_config_file`; every field must be present.
#[derive(Debug, Deserialize)]
pub struct ConfigFile {
    /// Circular buffer capacity in bytes.
    pub(crate) cb_size_byte: usize,
    /// Minimum record size (key + value, in bytes).
    pub(crate) cb_min_record_size: usize,
    /// Maximum record size (key + value, in bytes).
    pub(crate) cb_max_record_size: usize,
    /// Maximum key length in bytes.
    pub(crate) cb_max_key_len: usize,
    /// Leaf page size in bytes.
    pub(crate) leaf_page_size: usize,
    /// Path of the index file.
    pub(crate) index_file_path: String,
    /// "disk" selects the platform default backend; any other value means in-memory.
    pub(crate) backend_storage: String,
    /// prob% chance of promoting a read record to a mini page.
    pub(crate) read_promotion_rate: usize,
    /// Whether writes load the full page.
    pub(crate) write_load_full_page: bool,
    /// Run purely in memory with no disk backing.
    pub(crate) cache_only: bool,
}
109/// Default BfTree configuration
110///
111impl Default for Config {
112    fn default() -> Self {
113        let read_promotion_rate = if cfg!(debug_assertions) {
114            DEFAULT_PROMOTION_RATE_DEBUG
115        } else {
116            DEFAULT_PROMOTION_RATE_RLEASE
117        };
118        let scan_promotion_rate = if cfg!(debug_assertions) {
119            DEFAULT_PROMOTION_RATE_DEBUG
120        } else {
121            DEFAULT_PROMOTION_RATE_RLEASE
122        };
123
124        Self {
125            read_promotion_rate: AtomicUsize::new(read_promotion_rate),
126            scan_promotion_rate: AtomicUsize::new(scan_promotion_rate),
127            cb_size_byte: DEFAULT_CIRCULAR_BUFFER_SIZE,
128            cb_min_record_size: DEFAULT_MIN_RECORD_SIZE,
129            cb_max_record_size: DEFAULT_MAX_RECORD_SIZE,
130            leaf_page_size: DEFAULT_LEAF_PAGE_SIZE,
131            cb_max_key_len: DEFAULT_MAX_KEY_LEN,
132            max_fence_len: DEFAULT_MAX_KEY_LEN * 2,
133            cb_copy_on_access_ratio: DEFAULT_COPY_ON_ACCESS_RATIO,
134            file_path: PathBuf::new(),
135            read_record_cache: true,
136            max_mini_page_size: DEFAULT_MAX_MINI_PAGE_SIZE,
137            mini_page_binary_search: true,
138            storage_backend: StorageBackend::Memory,
139            write_ahead_log: None,
140            write_load_full_page: true,
141            cache_only: false,
142        }
143    }
144}
145impl Config {
146    pub fn new(file_path: impl AsRef<Path>, circular_buffer_size: usize) -> Self {
147        let mut config = Self::default();
148        let mut cache_only = false;
149        let storage_backend = if file_path.as_ref().to_str().unwrap().starts_with(":memory:") {
150            StorageBackend::Memory
151        } else if file_path.as_ref().to_str().unwrap().starts_with(":cache:") {
152            cache_only = true;
153            StorageBackend::Memory
154        } else {
155            StorageBackend::default()
156        };
157
158        config
159            .storage_backend(storage_backend)
160            .cache_only(cache_only)
161            .cb_size_byte(circular_buffer_size)
162            .file_path(file_path);
163
164        config
165    }
166
167    /// Constructor of Config based on a config TOML file
168    /// The config file must have all fields defined ConfigFile
169    pub fn new_with_config_file<P: AsRef<Path>>(config_file_path: P) -> Self {
170        let config_file_str =
171            fs::read_to_string(config_file_path).expect("couldn't read config file");
172        let config_file: ConfigFile =
173            toml::from_str(&config_file_str).expect("Fail to parse config file");
174        let scan_promotion_rate = if cfg!(debug_assertions) {
175            DEFAULT_PROMOTION_RATE_DEBUG
176        } else {
177            DEFAULT_PROMOTION_RATE_RLEASE
178        };
179        let mut storage = StorageBackend::Memory;
180        if config_file.backend_storage == "disk" {
181            storage = StorageBackend::default();
182        }
183
184        // Return the config
185        Self {
186            read_promotion_rate: AtomicUsize::new(config_file.read_promotion_rate),
187            scan_promotion_rate: AtomicUsize::new(scan_promotion_rate),
188            cb_size_byte: config_file.cb_size_byte,
189            cb_min_record_size: config_file.cb_min_record_size,
190            cb_max_record_size: config_file.cb_max_record_size,
191            leaf_page_size: config_file.leaf_page_size,
192            cb_max_key_len: config_file.cb_max_key_len,
193            max_fence_len: config_file.cb_max_key_len * 2,
194            cb_copy_on_access_ratio: DEFAULT_COPY_ON_ACCESS_RATIO,
195            file_path: PathBuf::from(config_file.index_file_path),
196            read_record_cache: true,
197            max_mini_page_size: DEFAULT_MAX_MINI_PAGE_SIZE,
198            mini_page_binary_search: true,
199            storage_backend: storage,
200            write_ahead_log: None,
201            write_load_full_page: config_file.write_load_full_page,
202            cache_only: config_file.cache_only,
203        }
204    }
205
206    /// Default: Std
207    ///
208    /// Use std::fs::file to store/access disk data.
209    /// For better performance, consider platform specific backends like: IoUringBlocking.
210    pub fn storage_backend(&mut self, backend: StorageBackend) -> &mut Self {
211        self.storage_backend = backend;
212        self
213    }
214
215    /// Default: 30
216    ///
217    /// prob% of chance that a **page** will be promoted to buffer during scan operations.
218    pub fn scan_promotion_rate(&mut self, prob: usize) -> &mut Self {
219        self.scan_promotion_rate.store(prob, Ordering::Relaxed);
220        self
221    }
222
223    /// Default: true
224    ///
225    /// By default bf-tree will cache the hot records in mini page.
226    /// Setting this to false will try to cache the entire base page, which is less efficient.
227    pub fn read_record_cache(&mut self, read_full_page_cache: bool) -> &mut Self {
228        self.read_record_cache = read_full_page_cache;
229        self
230    }
231
232    /// Default: 2048
233    ///
234    /// The maximum mini page size before it grows to a full page.
235    pub fn max_mini_page_size(&mut self, size: usize) -> &mut Self {
236        self.max_mini_page_size = size;
237        self
238    }
239
240    /// Default: true
241    ///
242    /// If set to false, the mini page will use linear search instead of binary search.
243    pub fn mini_page_binary_search(&mut self, binary_search: bool) -> &mut Self {
244        self.mini_page_binary_search = binary_search;
245        self
246    }
247
248    /// Default: 30
249    ///
250    /// prob% of chance that a record will be promoted from leaf page to mini page.
251    pub fn read_promotion_rate(&mut self, prob: usize) -> &mut Self {
252        self.read_promotion_rate.store(prob, Ordering::Relaxed);
253        self
254    }
255
256    /// Default: 0.1
257    ///
258    /// The ratio of copy-on-access region for circular buffer.
259    /// - 0.0 means the circular buffer is a FIFO.
260    /// - 1.0 means the circular buffer is a LRU.
261    ///
262    /// You don't want to change this unless you know what you are doing.
263    pub fn cb_copy_on_access_ratio(&mut self, ratio: f64) -> &mut Self {
264        self.cb_copy_on_access_ratio = ratio;
265        self
266    }
267
268    /// Default: false
269    ///
270    /// Whether to enable write ahead log, for persistency and recovery.
271    pub fn enable_write_ahead_log(&mut self, wal_config: Arc<WalConfig>) -> &mut Self {
272        // let write_ahead_log_path = self.file_path.parent().unwrap().join("wal.log");
273        self.write_ahead_log = Some(wal_config);
274        self
275    }
276
277    /// Default: false
278    ///
279    /// Similar to `enable_write_ahead_log`, but with default WAL configuration.
280    /// The path to write the write ahead log.
281    /// Advanced users may want to change the WAL to point to a different location
282    /// to leverage different storage patterns
283    /// (WAL is always sequence write and requires durability).
284    pub fn enable_write_ahead_log_default(&mut self) -> &mut Self {
285        let wal_config = WalConfig::new(self.file_path.parent().unwrap().join("wal.log"));
286        self.write_ahead_log = Some(Arc::new(wal_config));
287        self
288    }
289
290    /// Default: false
291    pub fn cache_only(&mut self, cache_only: bool) -> &mut Self {
292        self.cache_only = cache_only;
293        self
294    }
295
296    /// Default: 32 * 1024 * 1024
297    pub fn cb_size_byte(&mut self, cb_size_byte: usize) -> &mut Self {
298        self.cb_size_byte = cb_size_byte;
299        self
300    }
301
302    pub fn file_path<P: AsRef<Path>>(&mut self, file_path: P) -> &mut Self {
303        self.file_path = file_path.as_ref().to_path_buf();
304        self
305    }
306
307    pub fn cb_max_key_len(&mut self, max_key_len: usize) -> &mut Self {
308        self.cb_max_key_len = max_key_len;
309        self.max_fence_len = max_key_len * 2;
310        self
311    }
312
313    pub fn cb_min_record_size(&mut self, min_record_size: usize) -> &mut Self {
314        self.cb_min_record_size = min_record_size;
315        self
316    }
317
318    pub fn cb_max_record_size(&mut self, max_record_size: usize) -> &mut Self {
319        self.cb_max_record_size = max_record_size;
320        self
321    }
322
323    /// Returns the current max record size
324    pub fn get_cb_max_record_size(&self) -> usize {
325        self.cb_max_record_size
326    }
327
328    pub fn get_cb_size_byte(&self) -> usize {
329        self.cb_size_byte
330    }
331
332    pub fn leaf_page_size(&mut self, leaf_page_size: usize) -> &mut Self {
333        self.leaf_page_size = leaf_page_size;
334        self
335    }
336
337    /// Returns the current leaf page size
338    pub fn get_leaf_page_size(&self) -> usize {
339        self.leaf_page_size
340    }
341
342    /// Returns `true` if the storage backend is in-memory (no file-backed storage).
343    pub fn is_memory_backend(&self) -> bool {
344        self.storage_backend == StorageBackend::Memory
345    }
346
347    /// Validate the configuration and report any invalid parameter, if found.
348    pub fn validate(&self) -> Result<(), ConfigError> {
349        // Sanity check of the input parameters
350        if self.cb_min_record_size <= 1 {
351            return Err(ConfigError::MinimumRecordSize(
352                "cb_min_record_size (key + value in bytes) needs to be at least 2 bytes"
353                    .to_string(),
354            ));
355        }
356
357        if self.cb_min_record_size > self.cb_max_record_size {
358            return Err(ConfigError::MaximumRecordSize("cb_min_record_size (key + value in bytes) cannot be greater than cb_max_record_size".to_string()));
359        }
360
361        if self.max_fence_len == 0 {
362            return Err(ConfigError::MaxKeyLen(
363                "cb_max_key_len cannot be zero".to_string(),
364            ));
365        }
366
367        if self.max_fence_len / 2 > self.cb_max_record_size {
368            return Err(ConfigError::MaxKeyLen(
369                "cb_max_key_len cannot be greater than cb_max_record_size".to_string(),
370            ));
371        }
372
373        if self.leaf_page_size > MAX_LEAF_PAGE_SIZE {
374            return Err(ConfigError::LeafPageSize(format!(
375                "leaf_page_size cannot be larger than {}",
376                MAX_LEAF_PAGE_SIZE
377            )));
378        }
379
380        if self.max_fence_len / 2 > MAX_KEY_LEN {
381            return Err(ConfigError::MaxKeyLen(format!(
382                "cb_max_key_len cannot be larger than {}",
383                MAX_KEY_LEN
384            )));
385        }
386
387        if !self.cb_size_byte.is_power_of_two() {
388            return Err(ConfigError::CircularBufferSize(
389                "cb_size_byte should be a power of two".to_string(),
390            ));
391        }
392
393        if self.leaf_page_size / self.cb_min_record_size > 4096 {
394            return Err(ConfigError::MinimumRecordSize(
395                "leaf_page_size/min_record_size cannot exceed 2^12.".to_string(),
396            ));
397        }
398
399        // Page alignment checks
400        if !self.cache_only && !self.leaf_page_size.is_multiple_of(DISK_PAGE_SIZE) {
401            return Err(ConfigError::LeafPageSize(format!(
402                "In non cache-only mode leaf_page_size should be multiple of {}",
403                DISK_PAGE_SIZE
404            )));
405        } else if self.cache_only && !self.leaf_page_size.is_multiple_of(CACHE_LINE_SIZE) {
406            return Err(ConfigError::LeafPageSize(format!(
407                "In cache-only mode leaf_page_size should be multiple of {}",
408                CACHE_LINE_SIZE
409            )));
410        }
411
412        // Mini-page merge/split operation safety guarantee checks
413        let max_record_size_with_meta = self.cb_max_record_size + std::mem::size_of::<LeafKVMeta>();
414        let mut max_mini_page_size: usize;
415
416        if self.cache_only {
417            if self.leaf_page_size < 2 * max_record_size_with_meta + std::mem::size_of::<LeafNode>()
418            {
419                return Err(ConfigError::MaximumRecordSize(format!(
420                    "In cache-only mode, given the leaf_page_size the corresponding cb_max_record_size should be <= {}",
421                    (self.leaf_page_size - std::mem::size_of::<LeafNode>()) / 2
422                        - std::mem::size_of::<LeafKVMeta>()
423                )));
424            }
425        } else {
426            if max_record_size_with_meta
427                > self.leaf_page_size - self.max_fence_len - 2 * std::mem::size_of::<LeafKVMeta>()
428            {
429                return Err(ConfigError::MaximumRecordSize(format!(
430                    "In non cache-only mode, given the leaf_page_size the corresponding cb_max_record_size should be <= {}",
431                    self.leaf_page_size
432                        - self.max_fence_len
433                        - 2 * std::mem::size_of::<LeafKVMeta>()
434                )));
435            }
436            max_mini_page_size = self.leaf_page_size
437                - max_record_size_with_meta
438                - self.max_fence_len
439                - 2 * std::mem::size_of::<LeafKVMeta>();
440            max_mini_page_size = (max_mini_page_size / CACHE_LINE_SIZE) * CACHE_LINE_SIZE;
441
442            if max_mini_page_size < max_record_size_with_meta + std::mem::size_of::<LeafNode>() {
443                return Err(ConfigError::MaximumRecordSize(format!(
444                    "In non cache-only mode, given the leaf_page_size the corresponding cb_max_record_size should be <= {}",
445                    max_mini_page_size
446                        - std::mem::size_of::<LeafNode>()
447                        - std::mem::size_of::<LeafKVMeta>()
448                )));
449            }
450        }
451
452        // Circular buffer size checks
453        // In cache-only mode, the circular buffer size >= 4 * leaf_page_size
454        // This is because during page split, two full sized mini-page need to be in memory
455        // Note that 2 * leaf_page_size only guarantees one full sized mini-page due to AllocMeta
456        //
457        // In non cache-only mode, the circular buffer size >= 2 * leaf_page_size
458        // As at most one full sized leaf page is required
459        if self.cache_only {
460            if self.cb_size_byte < 4 * self.leaf_page_size {
461                return Err(ConfigError::CircularBufferSize(
462                    "In cache-only mode, cb_size_byte should be at least 4 times of leaf_page_size"
463                        .to_string(),
464                ));
465            }
466        } else if self.cb_size_byte < 2 * self.leaf_page_size {
467            return Err(ConfigError::CircularBufferSize(
468                "In non cache-only mode, cb_size_byte should be at least 2 times of leaf_page_size"
469                    .to_string(),
470            ));
471        }
472
473        Ok(())
474    }
475}
/// Configuration of the write-ahead log (WAL).
#[derive(Clone, Debug)]

pub struct WalConfig {
    /// Path of the WAL file.
    pub(crate) file_path: PathBuf,
    /// How often the WAL is flushed to storage.
    pub(crate) flush_interval: Duration,
    /// Size of a WAL segment in bytes.
    pub(crate) segment_size: usize,
    /// Storage backend used for the WAL file.
    pub(crate) storage_backend: StorageBackend,
}
impl WalConfig {
    /// Create a WAL configuration that writes to `file_path`, with a 1ms
    /// flush interval, 1 GiB segments, and the Std storage backend.
    ///
    /// Advanced users may want to point the WAL at a different location
    /// to leverage different storage patterns
    /// (WAL is always sequential write and requires durability).
    pub fn new(file_path: impl AsRef<Path>) -> Self {
        Self {
            file_path: file_path.as_ref().to_path_buf(),
            flush_interval: Duration::from_millis(1),
            segment_size: 1024 * 1024 * 1024,
            storage_backend: StorageBackend::Std,
        }
    }

    /// Default: 1ms
    pub fn flush_interval(&mut self, interval: Duration) -> &mut Self {
        self.flush_interval = interval;
        self
    }

    /// Default: 1 GiB (1024 * 1024 * 1024 bytes, as set by `new`;
    /// the previous "1MB" doc comment contradicted the constructor).
    pub fn segment_size(&mut self, size: usize) -> &mut Self {
        self.segment_size = size;
        self
    }

    /// Default: Std
    ///
    /// Change the storage backend for potentially better performance, e.g., IoUring on Linux.
    pub fn storage_backend(&mut self, backend: StorageBackend) -> &mut Self {
        self.storage_backend = backend;
        self
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    const SAMPLE_CONFIG_FILE: &str = "src/sample_config.toml";

    /// Loading the sample TOML file populates the corresponding fields.
    #[test]
    fn test_new_with_config_file() {
        let config = Config::new_with_config_file(SAMPLE_CONFIG_FILE);

        assert_eq!(config.cb_size_byte, 8192);
        assert_eq!(config.read_promotion_rate.load(Ordering::Relaxed), 100);
        // Plain `assert!` for booleans (clippy::bool_assert_comparison).
        assert!(config.write_load_full_page);
        assert_eq!(config.file_path, PathBuf::from("c/d/E"));
        assert!(!config.cache_only);
    }

    /// `leaf_page_size` setter round-trips through the getter.
    #[test]
    fn test_leaf_page_size_getter_setter() {
        let mut config = Config::default();

        // Check default value
        assert_eq!(config.get_leaf_page_size(), DEFAULT_LEAF_PAGE_SIZE);

        // Set new values and verify each one
        for size in [8192, 16384] {
            config.leaf_page_size(size);
            assert_eq!(config.get_leaf_page_size(), size);
        }
    }

    /// `cb_max_record_size` setter round-trips through the getter.
    #[test]
    fn test_cb_max_record_size_getter_setter() {
        let mut config = Config::default();

        // Check default value
        assert_eq!(config.get_cb_max_record_size(), DEFAULT_MAX_RECORD_SIZE);

        // Set new values and verify each one
        for size in [4096, 8192] {
            config.cb_max_record_size(size);
            assert_eq!(config.get_cb_max_record_size(), size);
        }
    }
}