bf_tree/
config.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT license.
3
4use std::{
5    fs,
6    path::{Path, PathBuf},
7    sync::{atomic::Ordering, Arc},
8    time::Duration,
9};
10
11use crate::{
12    error::ConfigError,
13    nodes::{
14        leaf_node::LeafKVMeta, LeafNode, CACHE_LINE_SIZE, DISK_PAGE_SIZE, MAX_KEY_LEN,
15        MAX_LEAF_PAGE_SIZE,
16    },
17};
18use serde::Deserialize;
19use std::sync::atomic::AtomicUsize;
20
21const DEFAULT_PROMOTION_RATE_DEBUG: usize = 50;
22const DEFAULT_PROMOTION_RATE_RLEASE: usize = 30;
23const DEFAULT_MAX_MINI_PAGE_SIZE: usize = 2048; // Deprecated
24const DEFAULT_COPY_ON_ACCESS_RATIO: f64 = 0.1;
25const DEFAULT_CIRCULAR_BUFFER_SIZE: usize = 1024 * 1024 * 32;
26const DEFAULT_MIN_RECORD_SIZE: usize = 4;
27const DEFAULT_MAX_RECORD_SIZE: usize = 1952;
28const DEFAULT_LEAF_PAGE_SIZE: usize = 4096;
29const DEFAULT_MAX_KEY_LEN: usize = 16;
30
31/// Bf-tree configuration for advanced usage.
32/// Bf-tree is designed to work well on various workloads that you don't have to change the default configuration.
33/// This configuration is more for advanced users who want to understand the different components of the system, rather than performance tuning.
34#[derive(Debug)]
35pub struct Config {
36    pub(crate) read_promotion_rate: AtomicUsize,
37    pub(crate) scan_promotion_rate: AtomicUsize,
38    pub(crate) storage_backend: StorageBackend,
39    pub(crate) cb_size_byte: usize,
40    pub(crate) cb_min_record_size: usize,
41    pub(crate) cb_max_record_size: usize,
42    pub(crate) leaf_page_size: usize,
43    pub(crate) cb_max_key_len: usize,
44    pub(crate) max_fence_len: usize,
45    pub(crate) cb_copy_on_access_ratio: f64,
46    pub(crate) read_record_cache: bool,
47    pub(crate) file_path: PathBuf,
48    pub(crate) max_mini_page_size: usize,
49    pub(crate) mini_page_binary_search: bool,
50    pub(crate) write_ahead_log: Option<Arc<WalConfig>>,
51    pub(crate) write_load_full_page: bool,
52    pub(crate) cache_only: bool,
53}
54
55impl Clone for Config {
56    fn clone(&self) -> Self {
57        Self {
58            read_promotion_rate: AtomicUsize::new(self.read_promotion_rate.load(Ordering::Relaxed)),
59            scan_promotion_rate: AtomicUsize::new(self.scan_promotion_rate.load(Ordering::Relaxed)),
60            storage_backend: self.storage_backend.clone(),
61            cb_size_byte: self.cb_size_byte,
62            cb_min_record_size: self.cb_min_record_size,
63            cb_max_record_size: self.cb_max_record_size,
64            leaf_page_size: self.leaf_page_size,
65            cb_max_key_len: self.cb_max_key_len,
66            max_fence_len: self.max_fence_len,
67            cb_copy_on_access_ratio: self.cb_copy_on_access_ratio,
68            read_record_cache: self.read_record_cache,
69            file_path: self.file_path.clone(),
70            max_mini_page_size: self.max_mini_page_size,
71            mini_page_binary_search: self.mini_page_binary_search,
72            write_ahead_log: self.write_ahead_log.clone(),
73            write_load_full_page: self.write_load_full_page,
74            cache_only: self.cache_only,
75        }
76    }
77}
78
79/// Where/how to store the leaf pages?
80#[derive(Debug, Default, Clone, Eq, PartialEq)]
81pub enum StorageBackend {
82    Memory,
83    #[default]
84    Std,
85    #[cfg(target_os = "linux")]
86    StdDirect,
87    #[cfg(target_os = "linux")]
88    IoUringPolling,
89    #[cfg(target_os = "linux")]
90    IoUringBlocking,
91    #[cfg(all(target_os = "linux", feature = "spdk"))]
92    Spdk,
93}
94
95#[derive(Debug, Deserialize)]
96pub struct ConfigFile {
97    pub(crate) cb_size_byte: usize,
98    pub(crate) cb_min_record_size: usize,
99    pub(crate) cb_max_record_size: usize,
100    pub(crate) cb_max_key_len: usize,
101    pub(crate) leaf_page_size: usize,
102    pub(crate) index_file_path: String,
103    pub(crate) backend_storage: String,
104    pub(crate) read_promotion_rate: usize,
105    pub(crate) write_load_full_page: bool,
106    pub(crate) cache_only: bool,
107}
108
109/// Default BfTree configuration
110///
111impl Default for Config {
112    fn default() -> Self {
113        let read_promotion_rate = if cfg!(debug_assertions) {
114            DEFAULT_PROMOTION_RATE_DEBUG
115        } else {
116            DEFAULT_PROMOTION_RATE_RLEASE
117        };
118        let scan_promotion_rate = if cfg!(debug_assertions) {
119            DEFAULT_PROMOTION_RATE_DEBUG
120        } else {
121            DEFAULT_PROMOTION_RATE_RLEASE
122        };
123
124        Self {
125            read_promotion_rate: AtomicUsize::new(read_promotion_rate),
126            scan_promotion_rate: AtomicUsize::new(scan_promotion_rate),
127            cb_size_byte: DEFAULT_CIRCULAR_BUFFER_SIZE,
128            cb_min_record_size: DEFAULT_MIN_RECORD_SIZE,
129            cb_max_record_size: DEFAULT_MAX_RECORD_SIZE,
130            leaf_page_size: DEFAULT_LEAF_PAGE_SIZE,
131            cb_max_key_len: DEFAULT_MAX_KEY_LEN,
132            max_fence_len: DEFAULT_MAX_KEY_LEN * 2,
133            cb_copy_on_access_ratio: DEFAULT_COPY_ON_ACCESS_RATIO,
134            file_path: PathBuf::new(),
135            read_record_cache: true,
136            max_mini_page_size: DEFAULT_MAX_MINI_PAGE_SIZE,
137            mini_page_binary_search: true,
138            storage_backend: StorageBackend::Memory,
139            write_ahead_log: None,
140            write_load_full_page: true,
141            cache_only: false,
142        }
143    }
144}
145impl Config {
146    pub fn new(file_path: impl AsRef<Path>, circular_buffer_size: usize) -> Self {
147        let mut config = Self::default();
148        let mut cache_only = false;
149        let storage_backend = if file_path.as_ref().to_str().unwrap().starts_with(":memory:") {
150            StorageBackend::Memory
151        } else if file_path.as_ref().to_str().unwrap().starts_with(":cache:") {
152            cache_only = true;
153            StorageBackend::Memory
154        } else {
155            StorageBackend::default()
156        };
157
158        config
159            .storage_backend(storage_backend)
160            .cache_only(cache_only)
161            .cb_size_byte(circular_buffer_size)
162            .file_path(file_path);
163
164        config
165    }
166
167    /// Constructor of Config based on a config TOML file
168    /// The config file must have all fields defined ConfigFile
169    pub fn new_with_config_file<P: AsRef<Path>>(config_file_path: P) -> Self {
170        let config_file_str =
171            fs::read_to_string(config_file_path).expect("couldn't read config file");
172        let config_file: ConfigFile =
173            toml::from_str(&config_file_str).expect("Fail to parse config file");
174        let scan_promotion_rate = if cfg!(debug_assertions) {
175            DEFAULT_PROMOTION_RATE_DEBUG
176        } else {
177            DEFAULT_PROMOTION_RATE_RLEASE
178        };
179        let mut storage = StorageBackend::Memory;
180        if config_file.backend_storage == "disk" {
181            storage = StorageBackend::default();
182        }
183
184        // Return the config
185        Self {
186            read_promotion_rate: AtomicUsize::new(config_file.read_promotion_rate),
187            scan_promotion_rate: AtomicUsize::new(scan_promotion_rate),
188            cb_size_byte: config_file.cb_size_byte,
189            cb_min_record_size: config_file.cb_min_record_size,
190            cb_max_record_size: config_file.cb_max_record_size,
191            leaf_page_size: config_file.leaf_page_size,
192            cb_max_key_len: config_file.cb_max_key_len,
193            max_fence_len: config_file.cb_max_key_len * 2,
194            cb_copy_on_access_ratio: DEFAULT_COPY_ON_ACCESS_RATIO,
195            file_path: PathBuf::from(config_file.index_file_path),
196            read_record_cache: true,
197            max_mini_page_size: DEFAULT_MAX_MINI_PAGE_SIZE,
198            mini_page_binary_search: true,
199            storage_backend: storage,
200            write_ahead_log: None,
201            write_load_full_page: config_file.write_load_full_page,
202            cache_only: config_file.cache_only,
203        }
204    }
205
206    /// Default: Std
207    ///
208    /// Use std::fs::file to store/access disk data.
209    /// For better performance, consider platform specific backends like: IoUringBlocking.
210    pub fn storage_backend(&mut self, backend: StorageBackend) -> &mut Self {
211        self.storage_backend = backend;
212        self
213    }
214
215    /// Default: 30
216    ///
217    /// prob% of chance that a **page** will be promoted to buffer during scan operations.
218    pub fn scan_promotion_rate(&mut self, prob: usize) -> &mut Self {
219        self.scan_promotion_rate.store(prob, Ordering::Relaxed);
220        self
221    }
222
223    /// Default: true
224    ///
225    /// By default bf-tree will cache the hot records in mini page.
226    /// Setting this to false will try to cache the entire base page, which is less efficient.
227    pub fn read_record_cache(&mut self, read_full_page_cache: bool) -> &mut Self {
228        self.read_record_cache = read_full_page_cache;
229        self
230    }
231
232    /// Default: 2048
233    ///
234    /// The maximum mini page size before it grows to a full page.
235    pub fn max_mini_page_size(&mut self, size: usize) -> &mut Self {
236        self.max_mini_page_size = size;
237        self
238    }
239
240    /// Default: true
241    ///
242    /// If set to false, the mini page will use linear search instead of binary search.
243    pub fn mini_page_binary_search(&mut self, binary_search: bool) -> &mut Self {
244        self.mini_page_binary_search = binary_search;
245        self
246    }
247
248    /// Default: 30
249    ///
250    /// prob% of chance that a record will be promoted from leaf page to mini page.
251    pub fn read_promotion_rate(&mut self, prob: usize) -> &mut Self {
252        self.read_promotion_rate.store(prob, Ordering::Relaxed);
253        self
254    }
255
256    /// Default: 0.1
257    ///
258    /// The ratio of copy-on-access region for circular buffer.
259    /// - 0.0 means the circular buffer is a FIFO.
260    /// - 1.0 means the circular buffer is a LRU.
261    ///
262    /// You don't want to change this unless you know what you are doing.
263    pub fn cb_copy_on_access_ratio(&mut self, ratio: f64) -> &mut Self {
264        self.cb_copy_on_access_ratio = ratio;
265        self
266    }
267
268    /// Default: false
269    ///
270    /// Whether to enable write ahead log, for persistency and recovery.
271    pub fn enable_write_ahead_log(&mut self, wal_config: Arc<WalConfig>) -> &mut Self {
272        // let write_ahead_log_path = self.file_path.parent().unwrap().join("wal.log");
273        self.write_ahead_log = Some(wal_config);
274        self
275    }
276
277    /// Default: false
278    ///
279    /// Similar to `enable_write_ahead_log`, but with default WAL configuration.
280    /// The path to write the write ahead log.
281    /// Advanced users may want to change the WAL to point to a different location
282    /// to leverage different storage patterns
283    /// (WAL is always sequence write and requires durability).
284    pub fn enable_write_ahead_log_default(&mut self) -> &mut Self {
285        let wal_config = WalConfig::new(self.file_path.parent().unwrap().join("wal.log"));
286        self.write_ahead_log = Some(Arc::new(wal_config));
287        self
288    }
289
290    /// Default: false
291    pub fn cache_only(&mut self, cache_only: bool) -> &mut Self {
292        self.cache_only = cache_only;
293        self
294    }
295
296    /// Default: 32 * 1024 * 1024
297    pub fn cb_size_byte(&mut self, cb_size_byte: usize) -> &mut Self {
298        self.cb_size_byte = cb_size_byte;
299        self
300    }
301
302    pub fn file_path<P: AsRef<Path>>(&mut self, file_path: P) -> &mut Self {
303        self.file_path = file_path.as_ref().to_path_buf();
304        self
305    }
306
307    pub fn cb_max_key_len(&mut self, max_key_len: usize) -> &mut Self {
308        self.cb_max_key_len = max_key_len;
309        self.max_fence_len = max_key_len * 2;
310        self
311    }
312
313    pub fn cb_min_record_size(&mut self, min_record_size: usize) -> &mut Self {
314        self.cb_min_record_size = min_record_size;
315        self
316    }
317
318    pub fn cb_max_record_size(&mut self, max_record_size: usize) -> &mut Self {
319        self.cb_max_record_size = max_record_size;
320        self
321    }
322
323    /// Returns the current max record size
324    pub fn get_cb_max_record_size(&self) -> usize {
325        self.cb_max_record_size
326    }
327
328    pub fn leaf_page_size(&mut self, leaf_page_size: usize) -> &mut Self {
329        self.leaf_page_size = leaf_page_size;
330        self
331    }
332
333    /// Returns the current leaf page size
334    pub fn get_leaf_page_size(&self) -> usize {
335        self.leaf_page_size
336    }
337
338    /// Validate the configuration and report any invalid parameter, if found.
339    pub fn validate(&self) -> Result<(), ConfigError> {
340        // Sanity check of the input parameters
341        if self.cb_min_record_size <= 1 {
342            return Err(ConfigError::MinimumRecordSize(
343                "cb_min_record_size (key + value in bytes) needs to be at least 2 bytes"
344                    .to_string(),
345            ));
346        }
347
348        if self.cb_min_record_size > self.cb_max_record_size {
349            return Err(ConfigError::MaximumRecordSize("cb_min_record_size (key + value in bytes) cannot be greater than cb_max_record_size".to_string()));
350        }
351
352        if self.max_fence_len == 0 {
353            return Err(ConfigError::MaxKeyLen(
354                "cb_max_key_len cannot be zero".to_string(),
355            ));
356        }
357
358        if self.max_fence_len / 2 > self.cb_max_record_size {
359            return Err(ConfigError::MaxKeyLen(
360                "cb_max_key_len cannot be greater than cb_max_record_size".to_string(),
361            ));
362        }
363
364        if self.leaf_page_size > MAX_LEAF_PAGE_SIZE {
365            return Err(ConfigError::LeafPageSize(format!(
366                "leaf_page_size cannot be larger than {}",
367                MAX_LEAF_PAGE_SIZE
368            )));
369        }
370
371        if self.max_fence_len / 2 > MAX_KEY_LEN {
372            return Err(ConfigError::MaxKeyLen(format!(
373                "cb_max_key_len cannot be larger than {}",
374                MAX_KEY_LEN
375            )));
376        }
377
378        if !self.cb_size_byte.is_power_of_two() {
379            return Err(ConfigError::CircularBufferSize(
380                "cb_size_byte should be a power of two".to_string(),
381            ));
382        }
383
384        if self.leaf_page_size / self.cb_min_record_size > 4096 {
385            return Err(ConfigError::MinimumRecordSize(
386                "leaf_page_size/min_record_size cannot exceed 2^12.".to_string(),
387            ));
388        }
389
390        // Page alignment checks
391        if !self.cache_only && !self.leaf_page_size.is_multiple_of(DISK_PAGE_SIZE) {
392            return Err(ConfigError::LeafPageSize(format!(
393                "In non cache-only mode leaf_page_size should be multiple of {}",
394                DISK_PAGE_SIZE
395            )));
396        } else if self.cache_only && !self.leaf_page_size.is_multiple_of(CACHE_LINE_SIZE) {
397            return Err(ConfigError::LeafPageSize(format!(
398                "In cache-only mode leaf_page_size should be multiple of {}",
399                CACHE_LINE_SIZE
400            )));
401        }
402
403        // Mini-page merge/split operation safety guarantee checks
404        let max_record_size_with_meta = self.cb_max_record_size + std::mem::size_of::<LeafKVMeta>();
405        let mut max_mini_page_size: usize;
406
407        if self.cache_only {
408            if self.leaf_page_size < 2 * max_record_size_with_meta + std::mem::size_of::<LeafNode>()
409            {
410                return Err(ConfigError::MaximumRecordSize(format!(
411                    "In cache-only mode, given the leaf_page_size the corresponding cb_max_record_size should be <= {}",
412                    (self.leaf_page_size - std::mem::size_of::<LeafNode>()) / 2
413                        - std::mem::size_of::<LeafKVMeta>()
414                )));
415            }
416        } else {
417            if max_record_size_with_meta
418                > self.leaf_page_size - self.max_fence_len - 2 * std::mem::size_of::<LeafKVMeta>()
419            {
420                return Err(ConfigError::MaximumRecordSize(format!(
421                    "In non cache-only mode, given the leaf_page_size the corresponding cb_max_record_size should be <= {}",
422                    self.leaf_page_size
423                        - self.max_fence_len
424                        - 2 * std::mem::size_of::<LeafKVMeta>()
425                )));
426            }
427            max_mini_page_size = self.leaf_page_size
428                - max_record_size_with_meta
429                - self.max_fence_len
430                - 2 * std::mem::size_of::<LeafKVMeta>();
431            max_mini_page_size = (max_mini_page_size / CACHE_LINE_SIZE) * CACHE_LINE_SIZE;
432
433            if max_mini_page_size < max_record_size_with_meta + std::mem::size_of::<LeafNode>() {
434                return Err(ConfigError::MaximumRecordSize(format!(
435                    "In non cache-only mode, given the leaf_page_size the corresponding cb_max_record_size should be <= {}",
436                    max_mini_page_size
437                        - std::mem::size_of::<LeafNode>()
438                        - std::mem::size_of::<LeafKVMeta>()
439                )));
440            }
441        }
442
443        // Circular buffer size checks
444        // In cache-only mode, the circular buffer size >= 4 * leaf_page_size
445        // This is because during page split, two full sized mini-page need to be in memory
446        // Note that 2 * leaf_page_size only guarantees one full sized mini-page due to AllocMeta
447        //
448        // In non cache-only mode, the circular buffer size >= 2 * leaf_page_size
449        // As at most one full sized leaf page is required
450        if self.cache_only {
451            if self.cb_size_byte < 4 * self.leaf_page_size {
452                return Err(ConfigError::CircularBufferSize(
453                    "In cache-only mode, cb_size_byte should be at least 4 times of leaf_page_size"
454                        .to_string(),
455                ));
456            }
457        } else if self.cb_size_byte < 2 * self.leaf_page_size {
458            return Err(ConfigError::CircularBufferSize(
459                "In non cache-only mode, cb_size_byte should be at least 2 times of leaf_page_size"
460                    .to_string(),
461            ));
462        }
463
464        Ok(())
465    }
466}
467
468#[derive(Clone, Debug)]
469
470pub struct WalConfig {
471    pub(crate) file_path: PathBuf,
472    pub(crate) flush_interval: Duration,
473    pub(crate) segment_size: usize,
474    pub(crate) storage_backend: StorageBackend,
475}
476
477impl WalConfig {
478    /// Default: same directory as the file_path
479    ///
480    /// The path to write the write ahead log.
481    /// Advanced users may want to change the WAL to point to a different location
482    /// to leverage different storage patterns
483    /// (WAL is always sequence write and requires durability).
484    pub fn new(file_path: impl AsRef<Path>) -> Self {
485        Self {
486            file_path: file_path.as_ref().to_path_buf(),
487            flush_interval: Duration::from_millis(1),
488            segment_size: 1024 * 1024 * 1024,
489            storage_backend: StorageBackend::Std,
490        }
491    }
492
493    /// Default: 1ms
494    pub fn flush_interval(&mut self, interval: Duration) -> &mut Self {
495        self.flush_interval = interval;
496        self
497    }
498
499    /// Default: 1MB
500    pub fn segment_size(&mut self, size: usize) -> &mut Self {
501        self.segment_size = size;
502        self
503    }
504
505    /// Default: Std
506    ///
507    /// Change the storage backend for potentially better performance, e.g., IoUring on Linux.
508    pub fn storage_backend(&mut self, backend: StorageBackend) -> &mut Self {
509        self.storage_backend = backend;
510        self
511    }
512}
513
514#[cfg(test)]
515mod tests {
516    use super::*;
517
518    const SAMPLE_CONFIG_FILE: &str = "src/sample_config.toml";
519    #[test]
520    fn test_new_with_config_file() {
521        let config = Config::new_with_config_file(SAMPLE_CONFIG_FILE);
522
523        assert_eq!(config.cb_size_byte, 8192);
524        assert_eq!(config.read_promotion_rate.load(Ordering::Relaxed), 100);
525        assert_eq!(config.write_load_full_page, true);
526        assert_eq!(config.file_path, PathBuf::from("c/d/E"));
527        assert_eq!(config.cache_only, false);
528    }
529
530    #[test]
531    fn test_leaf_page_size_getter_setter() {
532        let mut config = Config::default();
533
534        // Check default value
535        assert_eq!(config.get_leaf_page_size(), DEFAULT_LEAF_PAGE_SIZE);
536
537        // Set a new value and verify it
538        let new_leaf_page_size = 8192;
539        config.leaf_page_size(new_leaf_page_size);
540        assert_eq!(config.get_leaf_page_size(), new_leaf_page_size);
541
542        // Set another value and verify
543        let another_leaf_page_size = 16384;
544        config.leaf_page_size(another_leaf_page_size);
545        assert_eq!(config.get_leaf_page_size(), another_leaf_page_size);
546    }
547
548    #[test]
549    fn test_cb_max_record_size_getter_setter() {
550        let mut config = Config::default();
551
552        // Check default value
553        assert_eq!(config.get_cb_max_record_size(), DEFAULT_MAX_RECORD_SIZE);
554
555        // Set a new value and verify it
556        let new_max_record_size = 4096;
557        config.cb_max_record_size(new_max_record_size);
558        assert_eq!(config.get_cb_max_record_size(), new_max_record_size);
559
560        // Set another value and verify
561        let another_max_record_size = 8192;
562        config.cb_max_record_size(another_max_record_size);
563        assert_eq!(config.get_cb_max_record_size(), another_max_record_size);
564    }
565}