
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

use std::{
    fs,
    path::{Path, PathBuf},
    sync::{
        atomic::{AtomicUsize, Ordering},
        Arc,
    },
    time::Duration,
};

use serde::Deserialize;

use crate::{
    error::ConfigError,
    nodes::{
        leaf_node::LeafKVMeta, LeafNode, CACHE_LINE_SIZE, DISK_PAGE_SIZE, MAX_KEY_LEN,
        MAX_LEAF_PAGE_SIZE,
    },
};

// Promotion-rate defaults differ per build profile (see `Config::default`):
// debug builds use 50%, release builds 30%.
const DEFAULT_PROMOTION_RATE_DEBUG: usize = 50;
// NOTE(review): "RLEASE" is a typo for "RELEASE"; the name is kept because it
// is referenced elsewhere in this file.
const DEFAULT_PROMOTION_RATE_RLEASE: usize = 30;
const DEFAULT_MAX_MINI_PAGE_SIZE: usize = 2048; // Deprecated
// Fraction of the circular buffer used as the copy-on-access region
// (0.0 = FIFO, 1.0 = LRU; see `Config::cb_copy_on_access_ratio`).
const DEFAULT_COPY_ON_ACCESS_RATIO: f64 = 0.1;
// 32 MiB circular buffer; must remain a power of two (see `Config::validate`).
const DEFAULT_CIRCULAR_BUFFER_SIZE: usize = 1024 * 1024 * 32;
// Record size bounds (key + value, in bytes) for the circular buffer.
const DEFAULT_MIN_RECORD_SIZE: usize = 4;
const DEFAULT_MAX_RECORD_SIZE: usize = 1952;
const DEFAULT_LEAF_PAGE_SIZE: usize = 4096;
const DEFAULT_MAX_KEY_LEN: usize = 16;

/// Bf-tree configuration for advanced usage.
/// Bf-tree is designed to work well on various workloads that you don't have to change the default configuration.
/// This configuration is more for advanced users who want to understand the different components of the system, rather than performance tuning.
#[derive(Debug)]
pub struct Config {
    // Probability (in percent) that a read promotes a record from a leaf
    // page to a mini page; atomic so it can be adjusted at runtime.
    pub(crate) read_promotion_rate: AtomicUsize,
    // Probability (in percent) that a scan promotes a page to the buffer.
    pub(crate) scan_promotion_rate: AtomicUsize,
    // Where/how leaf pages are stored (memory, std file, io_uring, ...).
    pub(crate) storage_backend: StorageBackend,
    // Circular-buffer (cache) size in bytes; validated to be a power of two.
    pub(crate) cb_size_byte: usize,
    // Minimum record size (key + value, bytes) accepted by the circular buffer.
    pub(crate) cb_min_record_size: usize,
    // Maximum record size (key + value, bytes) accepted by the circular buffer.
    pub(crate) cb_max_record_size: usize,
    // Leaf page size in bytes (alignment constraints enforced in `validate`).
    pub(crate) leaf_page_size: usize,
    // Maximum key length in bytes.
    pub(crate) cb_max_key_len: usize,
    // Maximum fence-key length; the setters keep this at 2 * cb_max_key_len.
    pub(crate) max_fence_len: usize,
    // Fraction of the circular buffer that is copy-on-access
    // (0.0 = FIFO, 1.0 = LRU).
    pub(crate) cb_copy_on_access_ratio: f64,
    // true: cache hot records in mini pages; false: cache entire base pages.
    pub(crate) read_record_cache: bool,
    // Path of the index file on disk (empty for in-memory modes).
    pub(crate) file_path: PathBuf,
    // Maximum mini-page size before it grows to a full page.
    // NOTE(review): the backing default constant is marked "Deprecated".
    pub(crate) max_mini_page_size: usize,
    // true: binary search within mini pages; false: linear search.
    pub(crate) mini_page_binary_search: bool,
    // Write-ahead-log configuration; `None` disables the WAL.
    pub(crate) write_ahead_log: Option<Arc<WalConfig>>,
    // NOTE(review): semantics not visible in this file — presumably whether
    // the write path loads the full base page; confirm against callers.
    pub(crate) write_load_full_page: bool,
    // true: data lives only in the cache, no disk pages (":cache:" mode).
    pub(crate) cache_only: bool,
}

55impl Clone for Config {
56    fn clone(&self) -> Self {
57        Self {
58            read_promotion_rate: AtomicUsize::new(self.read_promotion_rate.load(Ordering::Relaxed)),
59            scan_promotion_rate: AtomicUsize::new(self.scan_promotion_rate.load(Ordering::Relaxed)),
60            storage_backend: self.storage_backend.clone(),
61            cb_size_byte: self.cb_size_byte,
62            cb_min_record_size: self.cb_min_record_size,
63            cb_max_record_size: self.cb_max_record_size,
64            leaf_page_size: self.leaf_page_size,
65            cb_max_key_len: self.cb_max_key_len,
66            max_fence_len: self.max_fence_len,
67            cb_copy_on_access_ratio: self.cb_copy_on_access_ratio,
68            read_record_cache: self.read_record_cache,
69            file_path: self.file_path.clone(),
70            max_mini_page_size: self.max_mini_page_size,
71            mini_page_binary_search: self.mini_page_binary_search,
72            write_ahead_log: self.write_ahead_log.clone(),
73            write_load_full_page: self.write_load_full_page,
74            cache_only: self.cache_only,
75        }
76    }
77}
78
/// Where/how to store the leaf pages?
#[derive(Debug, Default, Clone, Eq, PartialEq)]
pub enum StorageBackend {
    /// Keep pages in memory only (selected by the ":memory:"/":cache:" path prefixes).
    Memory,
    /// Store pages via `std::fs` — the portable default.
    #[default]
    Std,
    /// Linux-only variant of the std backend; presumably direct (unbuffered)
    /// I/O — confirm against the backend implementation.
    #[cfg(target_os = "linux")]
    StdDirect,
    /// io_uring backend, polling mode (Linux only).
    #[cfg(target_os = "linux")]
    IoUringPolling,
    /// io_uring backend, blocking mode (Linux only).
    #[cfg(target_os = "linux")]
    IoUringBlocking,
    /// SPDK backend; requires Linux and the "spdk" cargo feature.
    #[cfg(all(target_os = "linux", feature = "spdk"))]
    Spdk,
}

/// TOML-deserializable counterpart of [`Config`].
/// Every field is required in the file (see `Config::new_with_config_file`).
#[derive(Debug, Deserialize)]
pub struct ConfigFile {
    // Circular-buffer size in bytes.
    pub(crate) cb_size_byte: usize,
    // Minimum record size (key + value, bytes).
    pub(crate) cb_min_record_size: usize,
    // Maximum record size (key + value, bytes).
    pub(crate) cb_max_record_size: usize,
    // Maximum key length in bytes.
    pub(crate) cb_max_key_len: usize,
    // Leaf page size in bytes.
    pub(crate) leaf_page_size: usize,
    // Path of the index file.
    pub(crate) index_file_path: String,
    // "disk" selects the default disk backend; any other value means Memory.
    pub(crate) backend_storage: String,
    // Read-promotion probability in percent.
    pub(crate) read_promotion_rate: usize,
    pub(crate) write_load_full_page: bool,
    pub(crate) cache_only: bool,
}

109/// Default BfTree configuration
110///
111impl Default for Config {
112    fn default() -> Self {
113        let read_promotion_rate = if cfg!(debug_assertions) {
114            DEFAULT_PROMOTION_RATE_DEBUG
115        } else {
116            DEFAULT_PROMOTION_RATE_RLEASE
117        };
118        let scan_promotion_rate = if cfg!(debug_assertions) {
119            DEFAULT_PROMOTION_RATE_DEBUG
120        } else {
121            DEFAULT_PROMOTION_RATE_RLEASE
122        };
123
124        Self {
125            read_promotion_rate: AtomicUsize::new(read_promotion_rate),
126            scan_promotion_rate: AtomicUsize::new(scan_promotion_rate),
127            cb_size_byte: DEFAULT_CIRCULAR_BUFFER_SIZE,
128            cb_min_record_size: DEFAULT_MIN_RECORD_SIZE,
129            cb_max_record_size: DEFAULT_MAX_RECORD_SIZE,
130            leaf_page_size: DEFAULT_LEAF_PAGE_SIZE,
131            cb_max_key_len: DEFAULT_MAX_KEY_LEN,
132            max_fence_len: DEFAULT_MAX_KEY_LEN * 2,
133            cb_copy_on_access_ratio: DEFAULT_COPY_ON_ACCESS_RATIO,
134            file_path: PathBuf::new(),
135            read_record_cache: true,
136            max_mini_page_size: DEFAULT_MAX_MINI_PAGE_SIZE,
137            mini_page_binary_search: true,
138            storage_backend: StorageBackend::Memory,
139            write_ahead_log: None,
140            write_load_full_page: true,
141            cache_only: false,
142        }
143    }
144}
145impl Config {
146    pub fn new(file_path: impl AsRef<Path>, circular_buffer_size: usize) -> Self {
147        let mut config = Self::default();
148        let mut cache_only = false;
149        let storage_backend = if file_path.as_ref().to_str().unwrap().starts_with(":memory:") {
150            StorageBackend::Memory
151        } else if file_path.as_ref().to_str().unwrap().starts_with(":cache:") {
152            cache_only = true;
153            StorageBackend::Memory
154        } else {
155            StorageBackend::default()
156        };
157
158        config
159            .storage_backend(storage_backend)
160            .cache_only(cache_only)
161            .cb_size_byte(circular_buffer_size)
162            .file_path(file_path);
163
164        config
165    }
166
167    /// Constructor of Config based on a config TOML file
168    /// The config file must have all fields defined ConfigFile
169    pub fn new_with_config_file<P: AsRef<Path>>(config_file_path: P) -> Self {
170        let config_file_str =
171            fs::read_to_string(config_file_path).expect("couldn't read config file");
172        let config_file: ConfigFile =
173            toml::from_str(&config_file_str).expect("Fail to parse config file");
174        let scan_promotion_rate = if cfg!(debug_assertions) {
175            DEFAULT_PROMOTION_RATE_DEBUG
176        } else {
177            DEFAULT_PROMOTION_RATE_RLEASE
178        };
179        let mut storage = StorageBackend::Memory;
180        if config_file.backend_storage == "disk" {
181            storage = StorageBackend::default();
182        }
183
184        // Return the config
185        Self {
186            read_promotion_rate: AtomicUsize::new(config_file.read_promotion_rate),
187            scan_promotion_rate: AtomicUsize::new(scan_promotion_rate),
188            cb_size_byte: config_file.cb_size_byte,
189            cb_min_record_size: config_file.cb_min_record_size,
190            cb_max_record_size: config_file.cb_max_record_size,
191            leaf_page_size: config_file.leaf_page_size,
192            cb_max_key_len: config_file.cb_max_key_len,
193            max_fence_len: config_file.cb_max_key_len * 2,
194            cb_copy_on_access_ratio: DEFAULT_COPY_ON_ACCESS_RATIO,
195            file_path: PathBuf::from(config_file.index_file_path),
196            read_record_cache: true,
197            max_mini_page_size: DEFAULT_MAX_MINI_PAGE_SIZE,
198            mini_page_binary_search: true,
199            storage_backend: storage,
200            write_ahead_log: None,
201            write_load_full_page: config_file.write_load_full_page,
202            cache_only: config_file.cache_only,
203        }
204    }
205
206    /// Default: Std
207    ///
208    /// Use std::fs::file to store/access disk data.
209    /// For better performance, consider platform specific backends like: IoUringBlocking.
210    pub fn storage_backend(&mut self, backend: StorageBackend) -> &mut Self {
211        self.storage_backend = backend;
212        self
213    }
214
215    /// Default: 30
216    ///
217    /// prob% of chance that a **page** will be promoted to buffer during scan operations.
218    pub fn scan_promotion_rate(&mut self, prob: usize) -> &mut Self {
219        self.scan_promotion_rate.store(prob, Ordering::Relaxed);
220        self
221    }
222
223    /// Default: true
224    ///
225    /// By default bf-tree will cache the hot records in mini page.
226    /// Setting this to false will try to cache the entire base page, which is less efficient.
227    pub fn read_record_cache(&mut self, read_full_page_cache: bool) -> &mut Self {
228        self.read_record_cache = read_full_page_cache;
229        self
230    }
231
232    /// Default: 2048
233    ///
234    /// The maximum mini page size before it grows to a full page.
235    pub fn max_mini_page_size(&mut self, size: usize) -> &mut Self {
236        self.max_mini_page_size = size;
237        self
238    }
239
240    /// Default: true
241    ///
242    /// If set to false, the mini page will use linear search instead of binary search.
243    pub fn mini_page_binary_search(&mut self, binary_search: bool) -> &mut Self {
244        self.mini_page_binary_search = binary_search;
245        self
246    }
247
248    /// Default: 30
249    ///
250    /// prob% of chance that a record will be promoted from leaf page to mini page.
251    pub fn read_promotion_rate(&mut self, prob: usize) -> &mut Self {
252        self.read_promotion_rate.store(prob, Ordering::Relaxed);
253        self
254    }
255
256    /// Default: 0.1
257    ///
258    /// The ratio of copy-on-access region for circular buffer.
259    /// - 0.0 means the circular buffer is a FIFO.
260    /// - 1.0 means the circular buffer is a LRU.
261    ///
262    /// You don't want to change this unless you know what you are doing.
263    pub fn cb_copy_on_access_ratio(&mut self, ratio: f64) -> &mut Self {
264        self.cb_copy_on_access_ratio = ratio;
265        self
266    }
267
268    /// Default: false
269    ///
270    /// Whether to enable write ahead log, for persistency and recovery.
271    pub fn enable_write_ahead_log(&mut self, wal_config: Arc<WalConfig>) -> &mut Self {
272        // let write_ahead_log_path = self.file_path.parent().unwrap().join("wal.log");
273        self.write_ahead_log = Some(wal_config);
274        self
275    }
276
277    /// Default: false
278    ///
279    /// Similar to `enable_write_ahead_log`, but with default WAL configuration.
280    /// The path to write the write ahead log.
281    /// Advanced users may want to change the WAL to point to a different location
282    /// to leverage different storage patterns
283    /// (WAL is always sequence write and requires durability).
284    pub fn enable_write_ahead_log_default(&mut self) -> &mut Self {
285        let wal_config = WalConfig::new(self.file_path.parent().unwrap().join("wal.log"));
286        self.write_ahead_log = Some(Arc::new(wal_config));
287        self
288    }
289
290    /// Default: false
291    pub fn cache_only(&mut self, cache_only: bool) -> &mut Self {
292        self.cache_only = cache_only;
293        self
294    }
295
296    /// Default: 32 * 1024 * 1024
297    pub fn cb_size_byte(&mut self, cb_size_byte: usize) -> &mut Self {
298        self.cb_size_byte = cb_size_byte;
299        self
300    }
301
302    pub fn file_path<P: AsRef<Path>>(&mut self, file_path: P) -> &mut Self {
303        self.file_path = file_path.as_ref().to_path_buf();
304        self
305    }
306
307    pub fn cb_max_key_len(&mut self, max_key_len: usize) -> &mut Self {
308        self.cb_max_key_len = max_key_len;
309        self.max_fence_len = max_key_len * 2;
310        self
311    }
312
313    pub fn cb_min_record_size(&mut self, min_record_size: usize) -> &mut Self {
314        self.cb_min_record_size = min_record_size;
315        self
316    }
317
318    pub fn cb_max_record_size(&mut self, max_record_size: usize) -> &mut Self {
319        self.cb_max_record_size = max_record_size;
320        self
321    }
322
323    /// Returns the current max record size
324    pub fn get_cb_max_record_size(&self) -> usize {
325        self.cb_max_record_size
326    }
327
328    pub fn get_cb_size_byte(&self) -> usize {
329        self.cb_size_byte
330    }
331
332    pub fn leaf_page_size(&mut self, leaf_page_size: usize) -> &mut Self {
333        self.leaf_page_size = leaf_page_size;
334        self
335    }
336
337    /// Returns the current leaf page size
338    pub fn get_leaf_page_size(&self) -> usize {
339        self.leaf_page_size
340    }
341
342    /// Validate the configuration and report any invalid parameter, if found.
343    pub fn validate(&self) -> Result<(), ConfigError> {
344        // Sanity check of the input parameters
345        if self.cb_min_record_size <= 1 {
346            return Err(ConfigError::MinimumRecordSize(
347                "cb_min_record_size (key + value in bytes) needs to be at least 2 bytes"
348                    .to_string(),
349            ));
350        }
351
352        if self.cb_min_record_size > self.cb_max_record_size {
353            return Err(ConfigError::MaximumRecordSize("cb_min_record_size (key + value in bytes) cannot be greater than cb_max_record_size".to_string()));
354        }
355
356        if self.max_fence_len == 0 {
357            return Err(ConfigError::MaxKeyLen(
358                "cb_max_key_len cannot be zero".to_string(),
359            ));
360        }
361
362        if self.max_fence_len / 2 > self.cb_max_record_size {
363            return Err(ConfigError::MaxKeyLen(
364                "cb_max_key_len cannot be greater than cb_max_record_size".to_string(),
365            ));
366        }
367
368        if self.leaf_page_size > MAX_LEAF_PAGE_SIZE {
369            return Err(ConfigError::LeafPageSize(format!(
370                "leaf_page_size cannot be larger than {}",
371                MAX_LEAF_PAGE_SIZE
372            )));
373        }
374
375        if self.max_fence_len / 2 > MAX_KEY_LEN {
376            return Err(ConfigError::MaxKeyLen(format!(
377                "cb_max_key_len cannot be larger than {}",
378                MAX_KEY_LEN
379            )));
380        }
381
382        if !self.cb_size_byte.is_power_of_two() {
383            return Err(ConfigError::CircularBufferSize(
384                "cb_size_byte should be a power of two".to_string(),
385            ));
386        }
387
388        if self.leaf_page_size / self.cb_min_record_size > 4096 {
389            return Err(ConfigError::MinimumRecordSize(
390                "leaf_page_size/min_record_size cannot exceed 2^12.".to_string(),
391            ));
392        }
393
394        // Page alignment checks
395        if !self.cache_only && !self.leaf_page_size.is_multiple_of(DISK_PAGE_SIZE) {
396            return Err(ConfigError::LeafPageSize(format!(
397                "In non cache-only mode leaf_page_size should be multiple of {}",
398                DISK_PAGE_SIZE
399            )));
400        } else if self.cache_only && !self.leaf_page_size.is_multiple_of(CACHE_LINE_SIZE) {
401            return Err(ConfigError::LeafPageSize(format!(
402                "In cache-only mode leaf_page_size should be multiple of {}",
403                CACHE_LINE_SIZE
404            )));
405        }
406
407        // Mini-page merge/split operation safety guarantee checks
408        let max_record_size_with_meta = self.cb_max_record_size + std::mem::size_of::<LeafKVMeta>();
409        let mut max_mini_page_size: usize;
410
411        if self.cache_only {
412            if self.leaf_page_size < 2 * max_record_size_with_meta + std::mem::size_of::<LeafNode>()
413            {
414                return Err(ConfigError::MaximumRecordSize(format!(
415                    "In cache-only mode, given the leaf_page_size the corresponding cb_max_record_size should be <= {}",
416                    (self.leaf_page_size - std::mem::size_of::<LeafNode>()) / 2
417                        - std::mem::size_of::<LeafKVMeta>()
418                )));
419            }
420        } else {
421            if max_record_size_with_meta
422                > self.leaf_page_size - self.max_fence_len - 2 * std::mem::size_of::<LeafKVMeta>()
423            {
424                return Err(ConfigError::MaximumRecordSize(format!(
425                    "In non cache-only mode, given the leaf_page_size the corresponding cb_max_record_size should be <= {}",
426                    self.leaf_page_size
427                        - self.max_fence_len
428                        - 2 * std::mem::size_of::<LeafKVMeta>()
429                )));
430            }
431            max_mini_page_size = self.leaf_page_size
432                - max_record_size_with_meta
433                - self.max_fence_len
434                - 2 * std::mem::size_of::<LeafKVMeta>();
435            max_mini_page_size = (max_mini_page_size / CACHE_LINE_SIZE) * CACHE_LINE_SIZE;
436
437            if max_mini_page_size < max_record_size_with_meta + std::mem::size_of::<LeafNode>() {
438                return Err(ConfigError::MaximumRecordSize(format!(
439                    "In non cache-only mode, given the leaf_page_size the corresponding cb_max_record_size should be <= {}",
440                    max_mini_page_size
441                        - std::mem::size_of::<LeafNode>()
442                        - std::mem::size_of::<LeafKVMeta>()
443                )));
444            }
445        }
446
447        // Circular buffer size checks
448        // In cache-only mode, the circular buffer size >= 4 * leaf_page_size
449        // This is because during page split, two full sized mini-page need to be in memory
450        // Note that 2 * leaf_page_size only guarantees one full sized mini-page due to AllocMeta
451        //
452        // In non cache-only mode, the circular buffer size >= 2 * leaf_page_size
453        // As at most one full sized leaf page is required
454        if self.cache_only {
455            if self.cb_size_byte < 4 * self.leaf_page_size {
456                return Err(ConfigError::CircularBufferSize(
457                    "In cache-only mode, cb_size_byte should be at least 4 times of leaf_page_size"
458                        .to_string(),
459                ));
460            }
461        } else if self.cb_size_byte < 2 * self.leaf_page_size {
462            return Err(ConfigError::CircularBufferSize(
463                "In non cache-only mode, cb_size_byte should be at least 2 times of leaf_page_size"
464                    .to_string(),
465            ));
466        }
467
468        Ok(())
469    }
470}
471
/// Configuration of the write-ahead log (WAL).
#[derive(Clone, Debug)]

pub struct WalConfig {
    // Path of the WAL file on disk.
    pub(crate) file_path: PathBuf,
    // Interval between WAL flushes (default 1ms; see `new`).
    pub(crate) flush_interval: Duration,
    // Size in bytes of one WAL segment (default 1 GiB; see `new`).
    pub(crate) segment_size: usize,
    // Storage backend used for WAL writes (default `Std`; see `new`).
    pub(crate) storage_backend: StorageBackend,
}

481impl WalConfig {
482    /// Default: same directory as the file_path
483    ///
484    /// The path to write the write ahead log.
485    /// Advanced users may want to change the WAL to point to a different location
486    /// to leverage different storage patterns
487    /// (WAL is always sequence write and requires durability).
488    pub fn new(file_path: impl AsRef<Path>) -> Self {
489        Self {
490            file_path: file_path.as_ref().to_path_buf(),
491            flush_interval: Duration::from_millis(1),
492            segment_size: 1024 * 1024 * 1024,
493            storage_backend: StorageBackend::Std,
494        }
495    }
496
497    /// Default: 1ms
498    pub fn flush_interval(&mut self, interval: Duration) -> &mut Self {
499        self.flush_interval = interval;
500        self
501    }
502
503    /// Default: 1MB
504    pub fn segment_size(&mut self, size: usize) -> &mut Self {
505        self.segment_size = size;
506        self
507    }
508
509    /// Default: Std
510    ///
511    /// Change the storage backend for potentially better performance, e.g., IoUring on Linux.
512    pub fn storage_backend(&mut self, backend: StorageBackend) -> &mut Self {
513        self.storage_backend = backend;
514        self
515    }
516}
517
518#[cfg(test)]
519mod tests {
520    use super::*;
522    const SAMPLE_CONFIG_FILE: &str = "src/sample_config.toml";
523    #[test]
524    fn test_new_with_config_file() {
525        let config = Config::new_with_config_file(SAMPLE_CONFIG_FILE);
526
527        assert_eq!(config.cb_size_byte, 8192);
528        assert_eq!(config.read_promotion_rate.load(Ordering::Relaxed), 100);
529        assert_eq!(config.write_load_full_page, true);
530        assert_eq!(config.file_path, PathBuf::from("c/d/E"));
531        assert_eq!(config.cache_only, false);
532    }
534    #[test]
535    fn test_leaf_page_size_getter_setter() {
536        let mut config = Config::default();
537
538        // Check default value
539        assert_eq!(config.get_leaf_page_size(), DEFAULT_LEAF_PAGE_SIZE);
540
541        // Set a new value and verify it
542        let new_leaf_page_size = 8192;
543        config.leaf_page_size(new_leaf_page_size);
544        assert_eq!(config.get_leaf_page_size(), new_leaf_page_size);
545
546        // Set another value and verify
547        let another_leaf_page_size = 16384;
548        config.leaf_page_size(another_leaf_page_size);
549        assert_eq!(config.get_leaf_page_size(), another_leaf_page_size);
550    }
552    #[test]
553    fn test_cb_max_record_size_getter_setter() {
554        let mut config = Config::default();
555
556        // Check default value
557        assert_eq!(config.get_cb_max_record_size(), DEFAULT_MAX_RECORD_SIZE);
558
559        // Set a new value and verify it
560        let new_max_record_size = 4096;
561        config.cb_max_record_size(new_max_record_size);
562        assert_eq!(config.get_cb_max_record_size(), new_max_record_size);
563
564        // Set another value and verify
565        let another_max_record_size = 8192;
566        config.cb_max_record_size(another_max_record_size);
567        assert_eq!(config.get_cb_max_record_size(), another_max_record_size);
568    }
569}