lsm_tree/abstract.rs
1// Copyright (c) 2024-present, fjall-rs
2// This source code is licensed under both the Apache 2.0 and MIT License
3// (found in the LICENSE-* files in the repository)
4
5use crate::{
6 blob_tree::FragmentationMap, compaction::CompactionStrategy, config::TreeType,
7 iter_guard::IterGuardImpl, table::Table, tree::inner::MemtableId, version::Version,
8 vlog::BlobFile, AnyTree, BlobTree, Config, Guard, InternalValue, KvPair, Memtable, SeqNo,
9 SequenceNumberCounter, TableId, Tree, TreeId, UserKey, UserValue,
10};
11use enum_dispatch::enum_dispatch;
12use std::{ops::RangeBounds, sync::Arc};
13
14pub type RangeItem = crate::Result<KvPair>;
15
16/// Generic Tree API
17#[expect(clippy::module_name_repetitions)]
18#[enum_dispatch]
19pub trait AbstractTree {
20 #[doc(hidden)]
21 fn next_table_id(&self) -> TableId;
22
23 #[doc(hidden)]
24 fn id(&self) -> TreeId;
25
26 #[doc(hidden)]
27 fn get_internal_entry(&self, key: &[u8], seqno: SeqNo) -> crate::Result<Option<InternalValue>>;
28
29 #[doc(hidden)]
30 fn current_version(&self) -> Version;
31
32 /// Synchronously flushes the active memtable to a table.
33 ///
34 /// The function may not return a result, if, during concurrent workloads, the memtable
35 /// ends up being empty before the flush is set up.
36 ///
37 /// The result will contain the [`Table`].
38 ///
39 /// # Errors
40 ///
41 /// Will return `Err` if an IO error occurs.
42 #[doc(hidden)]
43 fn flush_active_memtable(&self, seqno_threshold: SeqNo) -> crate::Result<Option<Table>>;
44
45 /// Returns an iterator that scans through the entire tree.
46 ///
47 /// Avoid using this function, or limit it as otherwise it may scan a lot of items.
48 fn iter(
49 &self,
50 seqno: SeqNo,
51 index: Option<Arc<Memtable>>,
52 ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl<'_>> + '_> {
53 self.range::<&[u8], _>(.., seqno, index)
54 }
55
56 /// Returns an iterator over a prefixed set of items.
57 ///
58 /// Avoid using an empty prefix as it may scan a lot of items (unless limited).
59 fn prefix<K: AsRef<[u8]>>(
60 &self,
61 prefix: K,
62 seqno: SeqNo,
63 index: Option<Arc<Memtable>>,
64 ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl<'_>> + '_>;
65
66 /// Returns an iterator over a range of items.
67 ///
68 /// Avoid using full or unbounded ranges as they may scan a lot of items (unless limited).
69 fn range<K: AsRef<[u8]>, R: RangeBounds<K>>(
70 &self,
71 range: R,
72 seqno: SeqNo,
73 index: Option<Arc<Memtable>>,
74 ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl<'_>> + '_>;
75
76 /// Ingests a sorted stream of key-value pairs into the tree.
77 ///
78 /// Can only be called on a new fresh, empty tree.
79 ///
80 /// # Errors
81 ///
82 /// Will return `Err` if an IO error occurs.
83 ///
84 /// # Panics
85 ///
86 /// Panics if the tree is **not** initially empty.
87 ///
88 /// Will panic if the input iterator is not sorted in ascending order.
89 #[doc(hidden)]
90 fn ingest(
91 &self,
92 iter: impl Iterator<Item = (UserKey, UserValue)>,
93 seqno_generator: &SequenceNumberCounter,
94 visible_seqno: &SequenceNumberCounter,
95 ) -> crate::Result<()>;
96
97 /// Returns the approximate number of tombstones in the tree.
98 fn tombstone_count(&self) -> u64;
99
100 /// Returns the approximate number of weak tombstones (single deletes) in the tree.
101 fn weak_tombstone_count(&self) -> u64;
102
103 /// Returns the approximate number of values reclaimable once weak tombstones can be GC'd.
104 fn weak_tombstone_reclaimable_count(&self) -> u64;
105
106 // TODO: clear() with Nuke compaction strategy (write lock) -> drop_range(..)
107
108 /// Drops tables that are fully contained in a given range.
109 ///
110 /// Accepts any `RangeBounds`, including unbounded or exclusive endpoints.
111 /// If the normalized lower bound is greater than the upper bound, the
112 /// method returns without performing any work.
113 ///
114 /// # Errors
115 ///
116 /// Will return `Err` only if an IO error occurs during compaction.
117 fn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(&self, range: R) -> crate::Result<()>;
118
119 /// Performs major compaction, blocking the caller until it's done.
120 ///
121 /// # Errors
122 ///
123 /// Will return `Err` if an IO error occurs.
124 fn major_compact(&self, target_size: u64, seqno_threshold: SeqNo) -> crate::Result<()>;
125
126 /// Returns the disk space used by stale blobs.
127 fn stale_blob_bytes(&self) -> u64 {
128 0
129 }
130
131 /// Gets the space usage of all filters in the tree.
132 ///
133 /// May not correspond to the actual memory size because filter blocks may be paged out.
134 fn filter_size(&self) -> usize;
135
136 /// Gets the memory usage of all pinned filters in the tree.
137 fn pinned_filter_size(&self) -> usize;
138
139 /// Gets the memory usage of all pinned index blocks in the tree.
140 fn pinned_block_index_size(&self) -> usize;
141
142 /// Gets the length of the version free list.
143 fn version_free_list_len(&self) -> usize;
144
145 /// Returns the metrics structure.
146 #[cfg(feature = "metrics")]
147 fn metrics(&self) -> &Arc<crate::Metrics>;
148
149 /// Synchronously flushes a memtable to a table.
150 ///
151 /// This method will not make the table immediately available,
152 /// use [`AbstractTree::register_tables`] for that.
153 ///
154 /// # Errors
155 ///
156 /// Will return `Err` if an IO error occurs.
157 #[warn(clippy::type_complexity)]
158 fn flush_memtable(
159 &self,
160 table_id: TableId, // TODO: remove?
161 memtable: &Arc<Memtable>,
162 seqno_threshold: SeqNo,
163 ) -> crate::Result<Option<(Table, Option<BlobFile>)>>;
164
165 /// Atomically registers flushed tables into the tree, removing their associated sealed memtables.
166 ///
167 /// # Errors
168 ///
169 /// Will return `Err` if an IO error occurs.
170 fn register_tables(
171 &self,
172 tables: &[Table],
173 blob_files: Option<&[BlobFile]>,
174 frag_map: Option<FragmentationMap>,
175 seqno_threshold: SeqNo,
176 ) -> crate::Result<()>;
177
178 /// Clears the active memtable atomically.
179 fn clear_active_memtable(&self);
180
181 /// Sets the active memtable.
182 ///
183 /// May be used to restore the LSM-tree's in-memory state from a write-ahead log
184 /// after tree recovery.
185 fn set_active_memtable(&self, memtable: Memtable);
186
187 /// Returns the number of sealed memtables.
188 fn sealed_memtable_count(&self) -> usize;
189
190 /// Adds a sealed memtables.
191 ///
192 /// May be used to restore the LSM-tree's in-memory state from some journals.
193 fn add_sealed_memtable(&self, id: MemtableId, memtable: Arc<Memtable>);
194
195 /// Performs compaction on the tree's levels, blocking the caller until it's done.
196 ///
197 /// # Errors
198 ///
199 /// Will return `Err` if an IO error occurs.
200 fn compact(
201 &self,
202 strategy: Arc<dyn CompactionStrategy>,
203 seqno_threshold: SeqNo,
204 ) -> crate::Result<()>;
205
206 /// Returns the next table's ID.
207 fn get_next_table_id(&self) -> TableId;
208
209 /// Returns the tree config.
210 fn tree_config(&self) -> &Config;
211
212 /// Returns the highest sequence number.
213 fn get_highest_seqno(&self) -> Option<SeqNo> {
214 let memtable_seqno = self.get_highest_memtable_seqno();
215 let table_seqno = self.get_highest_persisted_seqno();
216 memtable_seqno.max(table_seqno)
217 }
218
219 /// Returns the approximate size of the active memtable in bytes.
220 ///
221 /// May be used to flush the memtable if it grows too large.
222 fn active_memtable_size(&self) -> u64;
223
224 /// Returns the tree type.
225 fn tree_type(&self) -> TreeType;
226
227 /// Seals the active memtable, and returns a reference to it.
228 fn rotate_memtable(&self) -> Option<(MemtableId, Arc<Memtable>)>;
229
230 /// Returns the number of tables currently in the tree.
231 fn table_count(&self) -> usize;
232
233 /// Returns the number of tables in `levels[idx]`.
234 ///
235 /// Returns `None` if the level does not exist (if idx >= 7).
236 fn level_table_count(&self, idx: usize) -> Option<usize>;
237
238 /// Returns the number of disjoint runs in L0.
239 ///
240 /// Can be used to determine whether to write stall.
241 fn l0_run_count(&self) -> usize;
242
243 /// Returns the number of blob files currently in the tree.
244 fn blob_file_count(&self) -> usize {
245 0
246 }
247
248 /// Approximates the number of items in the tree.
249 fn approximate_len(&self) -> usize;
250
251 /// Returns the disk space usage.
252 fn disk_space(&self) -> u64;
253
254 /// Returns the highest sequence number of the active memtable.
255 fn get_highest_memtable_seqno(&self) -> Option<SeqNo>;
256
257 /// Returns the highest sequence number that is flushed to disk.
258 fn get_highest_persisted_seqno(&self) -> Option<SeqNo>;
259
260 /// Scans the entire tree, returning the number of items.
261 ///
262 /// ###### Caution
263 ///
264 /// This operation scans the entire tree: O(n) complexity!
265 ///
266 /// Never, under any circumstances, use .`len()` == 0 to check
267 /// if the tree is empty, use [`Tree::is_empty`] instead.
268 ///
269 /// # Examples
270 ///
271 /// ```
272 /// # use lsm_tree::Error as TreeError;
273 /// use lsm_tree::{AbstractTree, Config, Tree};
274 ///
275 /// let folder = tempfile::tempdir()?;
276 /// let tree = Config::new(folder).open()?;
277 ///
278 /// assert_eq!(tree.len(0, None)?, 0);
279 /// tree.insert("1", "abc", 0);
280 /// tree.insert("3", "abc", 1);
281 /// tree.insert("5", "abc", 2);
282 /// assert_eq!(tree.len(3, None)?, 3);
283 /// #
284 /// # Ok::<(), TreeError>(())
285 /// ```
286 ///
287 /// # Errors
288 ///
289 /// Will return `Err` if an IO error occurs.
290 fn len(&self, seqno: SeqNo, index: Option<Arc<Memtable>>) -> crate::Result<usize> {
291 let mut count = 0;
292
293 for item in self.iter(seqno, index) {
294 let _ = item.key()?;
295 count += 1;
296 }
297
298 Ok(count)
299 }
300
301 /// Returns `true` if the tree is empty.
302 ///
303 /// This operation has O(log N) complexity.
304 ///
305 /// # Examples
306 ///
307 /// ```
308 /// # let folder = tempfile::tempdir()?;
309 /// use lsm_tree::{AbstractTree, Config, Tree};
310 ///
311 /// let tree = Config::new(folder).open()?;
312 /// assert!(tree.is_empty(0, None)?);
313 ///
314 /// tree.insert("a", "abc", 0);
315 /// assert!(!tree.is_empty(1, None)?);
316 /// #
317 /// # Ok::<(), lsm_tree::Error>(())
318 /// ```
319 ///
320 /// # Errors
321 ///
322 /// Will return `Err` if an IO error occurs.
323 fn is_empty(&self, seqno: SeqNo, index: Option<Arc<Memtable>>) -> crate::Result<bool> {
324 self.first_key_value(seqno, index).map(|x| x.is_none())
325 }
326
327 /// Returns the first key-value pair in the tree.
328 /// The key in this pair is the minimum key in the tree.
329 ///
330 /// # Examples
331 ///
332 /// ```
333 /// # use lsm_tree::Error as TreeError;
334 /// # use lsm_tree::{AbstractTree, Config, Tree};
335 /// #
336 /// # let folder = tempfile::tempdir()?;
337 /// let tree = Config::new(folder).open()?;
338 ///
339 /// tree.insert("1", "abc", 0);
340 /// tree.insert("3", "abc", 1);
341 /// tree.insert("5", "abc", 2);
342 ///
343 /// let (key, _) = tree.first_key_value(3, None)?.expect("item should exist");
344 /// assert_eq!(&*key, "1".as_bytes());
345 /// #
346 /// # Ok::<(), TreeError>(())
347 /// ```
348 ///
349 /// # Errors
350 ///
351 /// Will return `Err` if an IO error occurs.
352 fn first_key_value(
353 &self,
354 seqno: SeqNo,
355 index: Option<Arc<Memtable>>,
356 ) -> crate::Result<Option<KvPair>> {
357 self.iter(seqno, index)
358 .next()
359 .map(Guard::into_inner)
360 .transpose()
361 }
362
363 /// Returns the last key-value pair in the tree.
364 /// The key in this pair is the maximum key in the tree.
365 ///
366 /// # Examples
367 ///
368 /// ```
369 /// # use lsm_tree::Error as TreeError;
370 /// # use lsm_tree::{AbstractTree, Config, Tree};
371 /// #
372 /// # let folder = tempfile::tempdir()?;
373 /// # let tree = Config::new(folder).open()?;
374 /// #
375 /// tree.insert("1", "abc", 0);
376 /// tree.insert("3", "abc", 1);
377 /// tree.insert("5", "abc", 2);
378 ///
379 /// let (key, _) = tree.last_key_value(3, None)?.expect("item should exist");
380 /// assert_eq!(&*key, "5".as_bytes());
381 /// #
382 /// # Ok::<(), TreeError>(())
383 /// ```
384 ///
385 /// # Errors
386 ///
387 /// Will return `Err` if an IO error occurs.
388 fn last_key_value(
389 &self,
390 seqno: SeqNo,
391 index: Option<Arc<Memtable>>,
392 ) -> crate::Result<Option<KvPair>> {
393 self.iter(seqno, index)
394 .next_back()
395 .map(Guard::into_inner)
396 .transpose()
397 }
398
399 /// Returns the size of a value if it exists.
400 ///
401 /// # Examples
402 ///
403 /// ```
404 /// # let folder = tempfile::tempdir()?;
405 /// use lsm_tree::{AbstractTree, Config, Tree};
406 ///
407 /// let tree = Config::new(folder).open()?;
408 /// tree.insert("a", "my_value", 0);
409 ///
410 /// let size = tree.size_of("a", 1)?.unwrap_or_default();
411 /// assert_eq!("my_value".len() as u32, size);
412 ///
413 /// let size = tree.size_of("b", 1)?.unwrap_or_default();
414 /// assert_eq!(0, size);
415 /// #
416 /// # Ok::<(), lsm_tree::Error>(())
417 /// ```
418 ///
419 /// # Errors
420 ///
421 /// Will return `Err` if an IO error occurs.
422 fn size_of<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<Option<u32>>;
423
424 /// Retrieves an item from the tree.
425 ///
426 /// # Examples
427 ///
428 /// ```
429 /// # let folder = tempfile::tempdir()?;
430 /// use lsm_tree::{AbstractTree, Config, Tree};
431 ///
432 /// let tree = Config::new(folder).open()?;
433 /// tree.insert("a", "my_value", 0);
434 ///
435 /// let item = tree.get("a", 1)?;
436 /// assert_eq!(Some("my_value".as_bytes().into()), item);
437 /// #
438 /// # Ok::<(), lsm_tree::Error>(())
439 /// ```
440 ///
441 /// # Errors
442 ///
443 /// Will return `Err` if an IO error occurs.
444 fn get<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<Option<UserValue>>;
445
446 /// Returns `true` if the tree contains the specified key.
447 ///
448 /// # Examples
449 ///
450 /// ```
451 /// # let folder = tempfile::tempdir()?;
452 /// # use lsm_tree::{AbstractTree, Config, Tree};
453 /// #
454 /// let tree = Config::new(folder).open()?;
455 /// assert!(!tree.contains_key("a", 0)?);
456 ///
457 /// tree.insert("a", "abc", 0);
458 /// assert!(tree.contains_key("a", 1)?);
459 /// #
460 /// # Ok::<(), lsm_tree::Error>(())
461 /// ```
462 ///
463 /// # Errors
464 ///
465 /// Will return `Err` if an IO error occurs.
466 fn contains_key<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<bool> {
467 self.get(key, seqno).map(|x| x.is_some())
468 }
469
470 /// Inserts a key-value pair into the tree.
471 ///
472 /// If the key already exists, the item will be overwritten.
473 ///
474 /// Returns the added item's size and new size of the memtable.
475 ///
476 /// # Examples
477 ///
478 /// ```
479 /// # let folder = tempfile::tempdir()?;
480 /// use lsm_tree::{AbstractTree, Config, Tree};
481 ///
482 /// let tree = Config::new(folder).open()?;
483 /// tree.insert("a", "abc", 0);
484 /// #
485 /// # Ok::<(), lsm_tree::Error>(())
486 /// ```
487 ///
488 /// # Errors
489 ///
490 /// Will return `Err` if an IO error occurs.
491 fn insert<K: Into<UserKey>, V: Into<UserValue>>(
492 &self,
493 key: K,
494 value: V,
495 seqno: SeqNo,
496 ) -> (u64, u64);
497
498 /// Removes an item from the tree.
499 ///
500 /// Returns the added item's size and new size of the memtable.
501 ///
502 /// # Examples
503 ///
504 /// ```
505 /// # let folder = tempfile::tempdir()?;
506 /// # use lsm_tree::{AbstractTree, Config, Tree};
507 /// #
508 /// # let tree = Config::new(folder).open()?;
509 /// tree.insert("a", "abc", 0);
510 ///
511 /// let item = tree.get("a", 1)?.expect("should have item");
512 /// assert_eq!("abc".as_bytes(), &*item);
513 ///
514 /// tree.remove("a", 1);
515 ///
516 /// let item = tree.get("a", 2)?;
517 /// assert_eq!(None, item);
518 /// #
519 /// # Ok::<(), lsm_tree::Error>(())
520 /// ```
521 ///
522 /// # Errors
523 ///
524 /// Will return `Err` if an IO error occurs.
525 fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64);
526
527 /// Removes an item from the tree.
528 ///
529 /// The tombstone marker of this delete operation will vanish when it
530 /// collides with its corresponding insertion.
531 /// This may cause older versions of the value to be resurrected, so it should
532 /// only be used and preferred in scenarios where a key is only ever written once.
533 ///
534 /// Returns the added item's size and new size of the memtable.
535 ///
536 /// # Examples
537 ///
538 /// ```
539 /// # let folder = tempfile::tempdir()?;
540 /// # use lsm_tree::{AbstractTree, Config, Tree};
541 /// #
542 /// # let tree = Config::new(folder).open()?;
543 /// tree.insert("a", "abc", 0);
544 ///
545 /// let item = tree.get("a", 1)?.expect("should have item");
546 /// assert_eq!("abc".as_bytes(), &*item);
547 ///
548 /// tree.remove_weak("a", 1);
549 ///
550 /// let item = tree.get("a", 2)?;
551 /// assert_eq!(None, item);
552 /// #
553 /// # Ok::<(), lsm_tree::Error>(())
554 /// ```
555 ///
556 /// # Errors
557 ///
558 /// Will return `Err` if an IO error occurs.
559 #[doc(hidden)]
560 fn remove_weak<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64);
561}