Trait AbstractTree

Source

pub trait AbstractTree: Sealed {
Show 72 methods    // Required methods
    fn table_file_cache_size(&self) -> usize;
    fn create_checkpoint(&self, target_path: &Path) -> Result<CheckpointInfo>;
    fn prefix<K: AsRef<[u8]>>(
        &self,
        prefix: K,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>;
    fn range<K: AsRef<[u8]>, R: RangeBounds<K>>(
        &self,
        range: R,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>;
    fn range_seekable<K: AsRef<[u8]>, R: RangeBounds<K>>(
        &self,
        range: R,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Box<dyn SeekableGuardIter + 'static>;
    fn batch_range_scan<K: AsRef<[u8]>, R: RangeBounds<K> + 'static, I: IntoIterator<Item = R>>(
        &self,
        intervals: I,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Box<dyn Iterator<Item = IterGuardImpl> + Send + 'static>
       where I::IntoIter: Send + 'static;
    fn tombstone_count(&self) -> u64;
    fn weak_tombstone_count(&self) -> u64;
    fn weak_tombstone_reclaimable_count(&self) -> u64;
    fn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(
        &self,
        range: R,
    ) -> Result<()>;
    fn clear(&self) -> Result<()>;
    fn major_compact(
        &self,
        target_size: u64,
        seqno_threshold: SeqNo,
    ) -> Result<CompactionResult>;
    fn filter_size(&self) -> u64;
    fn pinned_filter_size(&self) -> usize;
    fn pinned_block_index_size(&self) -> usize;
    fn version_free_list_len(&self) -> usize;
    fn metrics(&self) -> &Arc<Metrics> ⓘ;
    fn cache_stats(&self) -> CacheStats;
    fn get_flush_lock(&self) -> MutexGuard<'_, ()>;
    fn register_tables(
        &self,
        tables: &[Table],
        blob_files: Option<&[BlobFile]>,
        frag_map: Option<FragmentationMap>,
        sealed_memtables_to_delete: &[MemtableId],
        gc_watermark: SeqNo,
    ) -> Result<()>;
    fn clear_active_memtable(&self);
    fn sealed_memtable_count(&self) -> usize;
    fn compact(
        &self,
        strategy: Arc<dyn CompactionStrategy>,
        seqno_threshold: SeqNo,
    ) -> Result<CompactionResult>;
    fn get_next_table_id(&self) -> TableId;
    fn tree_config(&self) -> &Config;
    fn active_memtable(&self) -> Arc<Memtable> ⓘ;
    fn rotate_memtable(&self) -> Option<Arc<Memtable>>;
    fn table_count(&self) -> usize;
    fn level_table_count(&self, idx: usize) -> Option<usize>;
    fn l0_run_count(&self) -> usize;
    fn blob_file_count(&self) -> usize;
    fn approximate_len(&self) -> usize;
    fn disk_space(&self) -> u64;
    fn approximate_range_stats<K: AsRef<[u8]>, R: RangeBounds<K>>(
        &self,
        range: R,
        seqno: SeqNo,
    ) -> Result<ApproximateRangeStats>;
    fn approximate_range_cardinality<K: AsRef<[u8]>, R: RangeBounds<K>>(
        &self,
        range: R,
        seqno: SeqNo,
    ) -> Result<RangeCardinality>;
    fn get_highest_memtable_seqno(&self) -> Option<SeqNo>;
    fn get_highest_persisted_seqno(&self) -> Option<SeqNo>;
    fn size_of<K: AsRef<[u8]>>(
        &self,
        key: K,
        seqno: SeqNo,
    ) -> Result<Option<u32>>;
    fn get<K: AsRef<[u8]>>(
        &self,
        key: K,
        seqno: SeqNo,
    ) -> Result<Option<UserValue>>;
    fn apply_batch(&self, batch: WriteBatch, seqno: SeqNo) -> Result<(u64, u64)>;
    fn insert<K: Into<UserKey>, V: Into<UserValue>>(
        &self,
        key: K,
        value: V,
        seqno: SeqNo,
    ) -> (u64, u64);
    fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64);
    fn merge<K: Into<UserKey>, V: Into<UserValue>>(
        &self,
        key: K,
        operand: V,
        seqno: SeqNo,
    ) -> (u64, u64);
    fn remove_range<K: Into<UserKey>>(
        &self,
        start: K,
        end: K,
        seqno: SeqNo,
    ) -> u64;

    // Provided methods
    fn storage_stats(&self) -> Result<StorageStats> { ... }
    fn level_segment_stats(&self) -> Result<Vec<LevelStats>> { ... }
    fn compaction_debt(&self, strategy: &dyn CompactionStrategy) -> u64 { ... }
    fn write_backpressure(
        &self,
        _strategy: &dyn CompactionStrategy,
    ) -> Backpressure { ... }
    fn write_admission(&self) -> Result<()> { ... }
    fn is_read_only(&self) -> bool { ... }
    fn verify_checksum(&self) -> BlockVerifyReport
       where Self: Sized { ... }
    fn verify_checksum_with(&self, options: &VerifyOptions) -> BlockVerifyReport
       where Self: Sized { ... }
    fn flush(
        &self,
        _lock: &MutexGuard<'_, ()>,
        seqno_threshold: SeqNo,
    ) -> Result<Option<u64>> { ... }
    fn iter(
        &self,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static> { ... }
    fn stale_blob_bytes(&self) -> u64 { ... }
    fn flush_to_tables(
        &self,
        stream: impl Iterator<Item = Result<InternalValue>>,
    ) -> Result<Option<(Vec<Table>, Option<Vec<BlobFile>>)>> { ... }
    fn get_highest_seqno(&self) -> Option<SeqNo> { ... }
    fn tree_type(&self) -> TreeType { ... }
    fn len(
        &self,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Result<usize> { ... }
    fn is_empty(
        &self,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Result<bool> { ... }
    fn first_key_value(
        &self,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Option<IterGuardImpl> { ... }
    fn last_key_value(
        &self,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Option<IterGuardImpl> { ... }
    fn get_pinned<K: AsRef<[u8]>>(
        &self,
        key: K,
        seqno: SeqNo,
    ) -> Result<Option<PinnableSlice>> { ... }
    fn contains_key<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<bool> { ... }
    fn contains_prefix<K: AsRef<[u8]>>(
        &self,
        prefix: K,
        seqno: SeqNo,
        index: Option<(Arc<Memtable>, SeqNo)>,
    ) -> Result<bool> { ... }
    fn multi_get<K: AsRef<[u8]>>(
        &self,
        keys: impl IntoIterator<Item = K>,
        seqno: SeqNo,
    ) -> Result<Vec<Option<UserValue>>> { ... }
    fn try_insert<K: Into<UserKey>, V: Into<UserValue>>(
        &self,
        key: K,
        value: V,
        seqno: SeqNo,
    ) -> Result<(u64, u64)> { ... }
    fn try_merge<K: Into<UserKey>, V: Into<UserValue>>(
        &self,
        key: K,
        operand: V,
        seqno: SeqNo,
    ) -> Result<(u64, u64)> { ... }
    fn try_remove<K: Into<UserKey>>(
        &self,
        key: K,
        seqno: SeqNo,
    ) -> Result<(u64, u64)> { ... }
    fn try_remove_weak<K: Into<UserKey>>(
        &self,
        key: K,
        seqno: SeqNo,
    ) -> Result<(u64, u64)> { ... }
    fn try_remove_range<K: Into<UserKey>>(
        &self,
        start: K,
        end: K,
        seqno: SeqNo,
    ) -> Result<u64> { ... }
    fn remove_prefix<K: AsRef<[u8]>>(&self, prefix: K, seqno: SeqNo) -> u64 { ... }
}

Expand description

Generic Tree API

Required Methods§

Source

fn table_file_cache_size(&self) -> usize

Returns the number of cached table file descriptors.

Source

fn create_checkpoint(&self, target_path: &Path) -> Result<CheckpointInfo>

Creates a hard-linked checkpoint of the tree’s on-disk state in target_path for point-in-time recovery (PITR) backup.

The checkpoint is a fully functional tree that can be opened independently via Config::open. For the common single-filesystem case all SST files (and blob files, for BlobTree) are hard-linked rather than copied, so the operation is O(1) per file and consumes zero additional disk space until the original files are compacted away — at which point the inode is kept alive by the checkpoint link.

§Cross-filesystem / cross-backend fall-back

When a source file lives on a different filesystem than the checkpoint target — e.g. an SST routed to a hot tier via level_routes on a separate volume, or a backup directory on a foreign mount — the hard link cannot be created (Unix EXDEV). In that case the checkpoint silently falls back to a streamed byte copy, which:

takes time linear in the file size instead of O(1), and
consumes disk space equal to the copied bytes on the target volume (no inode sharing across filesystems).

Each fall-back call emits one log::debug line (deliberately not warn: a misconfigured tier could trigger this path once per SST and per blob — thousands of times per snapshot — and per-file warnings would drown real signal). Operators wanting hard-visibility of unexpected full copies should enable debug logging on the fs module or watch the CheckpointInfo.total_bytes figure (≫ inode link cost means the fallback fired). The same debug policy applies when source and target use entirely different Fs backends (e.g. MemFs → StdFs in tests).

§Concurrency

While the checkpoint is being built, compaction continues normally but the physical removal of obsolete files is deferred until the checkpoint hard-links are in place. This is implemented by an internal reference-counted deletion gate; callers do not have to pause compaction themselves.

§Errors

Returns an error if:

the active memtable could not be flushed,
target_path already exists (to prevent accidental overwrites),
a hard link / copy fall-back could not be created, or
the manifest / version pointer files could not be replicated.

On error any partial checkpoint files are removed automatically (best-effort) so callers can safely retry against the same path.

Source

fn prefix<K: AsRef<[u8]>>( &self, prefix: K, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>

Returns an iterator over a prefixed set of items.

Avoid using an empty prefix as it may scan a lot of items (unless limited).

Source

fn range<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>

Returns an iterator over a range of items.

Avoid using full or unbounded ranges as they may scan a lot of items (unless limited).

Source

fn range_seekable<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn SeekableGuardIter + 'static>

Returns a range iterator that can reposition (seek) in place without reopening its per-SST readers.

Unlike range’s boxed DoubleEndedIterator, the returned [SeekableGuardIter] additionally exposes seek_to / seek_to_for_prev, so a consumer can jump a live iterator to any key (RocksDB Seek / SeekForPrev) — enabling data-dependent scans (joins, skip-scan) without reopening per-SST readers per jump.

Source

fn batch_range_scan<K: AsRef<[u8]>, R: RangeBounds<K> + 'static, I: IntoIterator<Item = R>>( &self, intervals: I, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn Iterator<Item = IterGuardImpl> + Send + 'static>
where I::IntoIter: Send + 'static,

Scans a sequence of disjoint, ascending key sub-intervals, reusing one set of per-SST readers across all of them.

The per-SST setup is paid once (when the underlying seekable iterator is opened); each interval is served by repositioning that iterator. This amortizes the setup across N intervals instead of paying it per interval, so multi-interval scan throughput scales with the total rows returned rather than the interval count.

The interval source is pulled lazily, so intervals may be produced on demand (e.g. computed from rows already returned).

Source

fn tombstone_count(&self) -> u64

Returns the approximate number of tombstones in the tree.

Source

fn weak_tombstone_count(&self) -> u64

Returns the approximate number of weak tombstones (single deletes) in the tree.

Source

fn weak_tombstone_reclaimable_count(&self) -> u64

Returns the approximate number of values reclaimable once weak tombstones can be GC’d.

Source

fn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(&self, range: R) -> Result<()>

Drops tables that are fully contained in a given range.

Accepts any RangeBounds, including unbounded or exclusive endpoints. If the normalized lower bound is greater than the upper bound, the method returns without performing any work.

§Errors

Will return Err only if an IO error occurs.

Source

fn clear(&self) -> Result<()>

Drops all tables and clears all memtables atomically.

§Errors

Will return Err only if an IO error occurs.

Source

fn major_compact( &self, target_size: u64, seqno_threshold: SeqNo, ) -> Result<CompactionResult>

Performs major compaction, blocking the caller until it’s done.

Returns a crate::compaction::CompactionResult describing what action was taken.

§Garbage-collection / merge-fold watermark (`seqno_threshold`)

seqno_threshold is the MVCC garbage-collection watermark: the engine may collapse history that no snapshot reading at a seqno < seqno_threshold can still observe. Concretely, only entries whose seqno is < seqno_threshold are eligible for:

dropping shadowed versions / GC-ing tombstones, and
folding merge operands via the crate::MergeOperator: a key written only through Self::merge (no base value) accumulates one operand per call, and reads re-apply the whole chain (O(operands) per read) until compaction folds it. Folding a chain into a single value is only MVCC-safe when no live snapshot reads between the operands, which is exactly what seqno_threshold certifies.

The engine does not track snapshots (unlike a RocksDB-style snapshot list); the caller owns snapshot lifecycle and must supply this watermark:

To fold/GC everything (no active snapshots), pass a value above every live seqno (e.g. the next value from the crate::SequenceNumberCounter).
With outstanding snapshots, pass the oldest snapshot’s seqno so their reads stay correct.
seqno_threshold == 0 certifies nothing as collapsible, so no folding or GC happens — major_compact(target, 0) only restructures tables and leaves a merge-only key’s full operand chain intact.

§Errors

Will return Err if an IO error occurs.

Source

fn filter_size(&self) -> u64

Gets the space usage of all filters in the tree.

May not correspond to the actual memory size because filter blocks may be paged out.

Source

fn pinned_filter_size(&self) -> usize

Gets the memory usage of all pinned filters in the tree.

Source

fn pinned_block_index_size(&self) -> usize

Gets the memory usage of all pinned index blocks in the tree.

Source

fn version_free_list_len(&self) -> usize

Gets the length of the version free list.

Source

fn metrics(&self) -> &Arc<Metrics> ⓘ

Returns the metrics structure.

Source

fn cache_stats(&self) -> CacheStats

A point-in-time CacheStats snapshot of block-cache effectiveness (cumulative hit / miss counts and rate) and occupancy (current size against capacity).

The stable, owned observability view over the block cache, so a consumer can read cache health without holding the mutable metrics handle. Counts are cumulative since process start; derive a rate over an interval from the delta between two polls.

Source

fn get_flush_lock(&self) -> MutexGuard<'_, ()>

Acquires the flush lock which is required to call Tree::flush.

Source

fn register_tables( &self, tables: &[Table], blob_files: Option<&[BlobFile]>, frag_map: Option<FragmentationMap>, sealed_memtables_to_delete: &[MemtableId], gc_watermark: SeqNo, ) -> Result<()>

Atomically registers flushed tables into the tree, removing their associated sealed memtables.

§Errors

Will return Err if an IO error occurs.

Source

fn clear_active_memtable(&self)

Clears the active memtable atomically.

Source

fn sealed_memtable_count(&self) -> usize

Returns the number of sealed memtables.

Source

fn compact( &self, strategy: Arc<dyn CompactionStrategy>, seqno_threshold: SeqNo, ) -> Result<CompactionResult>

Performs compaction on the tree’s levels, blocking the caller until it’s done.

Returns a crate::compaction::CompactionResult describing what action was taken.

§Errors

Will return Err if an IO error occurs.

Source

fn get_next_table_id(&self) -> TableId

Returns the next table’s ID.

Source

fn tree_config(&self) -> &Config

Returns the tree config.

Source

fn active_memtable(&self) -> Arc<Memtable> ⓘ

Returns the active memtable.

Source

fn rotate_memtable(&self) -> Option<Arc<Memtable>>

Seals the active memtable.

Source

fn table_count(&self) -> usize

Returns the number of tables currently in the tree.

Source

fn level_table_count(&self, idx: usize) -> Option<usize>

Returns the number of tables in levels[idx].

Returns None if the level does not exist (if idx >= 7).

Source

fn l0_run_count(&self) -> usize

Returns the number of disjoint runs in L0.

Can be used to determine whether to write stall.

Source

fn blob_file_count(&self) -> usize

Returns the number of blob files currently in the tree.

Source

fn approximate_len(&self) -> usize

Approximates the number of items in the tree.

Source

fn disk_space(&self) -> u64

Returns the disk space usage.

Source

fn approximate_range_stats<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, ) -> Result<ApproximateRangeStats>

Estimates the on-disk bytes and entry count contained in range at seqno, WITHOUT reading any data block.

The estimate interpolates each overlapping SST’s data-block offsets at the range boundaries (block granularity) and adds the active + sealed memtables’ in-range share. For a KV-separated tree the per-SST blob bytes are apportioned by the same in-range fraction (blob files are not key-indexed, so a finer estimate is impossible without reading data). Intended for query planning (split-point selection, cost-based join ordering), not exact accounting; accuracy is typically within ~10-15% on roughly-uniform data.

§Examples

use lsm_tree::{AbstractTree, Config};

let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
for i in 0..100u32 {
    tree.insert(format!("k{i:04}"), "value", 0);
}
tree.flush_active_memtable(0)?;

// The full range covers every entry exactly once it is all flushed —
// estimated without reading a single data block.
let all = tree.approximate_range_stats::<&str, _>(.., 1)?;
assert_eq!(all.key_count, tree.approximate_len() as u64);
assert!(all.bytes > 0);

// A range past every key is empty.
let none = tree.approximate_range_stats("zzzz".."zzzzz", 1)?;
assert_eq!(none.key_count, 0);
assert_eq!(none.bytes, 0);

§Errors

Returns an error if a block index or table metadata read fails.

Source

fn approximate_range_cardinality<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, ) -> Result<RangeCardinality>

Estimates the row cardinality and selectivity of range at seqno, WITHOUT reading any data block.

Uses the per-data-block zone map (per-block row counts + key ranges) for a block-granularity row count: every data block whose key range overlaps the query contributes its recorded row count, plus the active + sealed memtables’ in-range counts. When a table has no zone map the row count falls back to the byte-fraction estimate of approximate_range_stats. Selectivity is rows / total_rows, monotonic in predicate tightness, for cost-based planning (join ordering, scan-vs-seek). Not exact accounting.

§Examples

use lsm_tree::{AbstractTree, Config};

let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
for i in 0..100u32 {
    tree.insert(format!("k{i:04}"), "v", 0);
}
tree.flush_active_memtable(0)?;

// The full range covers every row; selectivity is 1.0.
let all = tree.approximate_range_cardinality::<&str, _>(.., 1)?;
assert_eq!(all.rows, tree.approximate_len() as u64);
assert!((all.selectivity - 1.0).abs() < 1e-9);

// A tighter range selects fewer rows.
let part = tree.approximate_range_cardinality("k0000".."k0050", 1)?;
assert!(part.selectivity <= all.selectivity);

§Errors

Returns an error if a block index, zone map, or table metadata read fails.

Source

fn get_highest_memtable_seqno(&self) -> Option<SeqNo>

Returns the highest sequence number of the active memtable.

Source

fn get_highest_persisted_seqno(&self) -> Option<SeqNo>

Returns the highest sequence number that is flushed to disk.

Source

fn size_of<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<Option<u32>>

Returns the size of a value if it exists.

§Examples

use lsm_tree::{AbstractTree, Config, Tree};

let tree = Config::new(folder, Default::default(), Default::default()).open()?;
tree.insert("a", "my_value", 0);

let size = tree.size_of("a", 1)?.unwrap_or_default();
assert_eq!("my_value".len() as u32, size);

let size = tree.size_of("b", 1)?.unwrap_or_default();
assert_eq!(0, size);

§Errors

Will return Err if an IO error occurs.

Source

fn get<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<Option<UserValue>>

Retrieves an item from the tree.

§Examples

use lsm_tree::{AbstractTree, Config, Tree};

let tree = Config::new(folder, Default::default(), Default::default()).open()?;
tree.insert("a", "my_value", 0);

let item = tree.get("a", 1)?;
assert_eq!(Some("my_value".as_bytes().into()), item);

§Errors

Will return Err if an IO error occurs.

Source

fn apply_batch(&self, batch: WriteBatch, seqno: SeqNo) -> Result<(u64, u64)>

Applies a WriteBatch with the given sequence number.

All entries share a single seqno. This is more efficient than individual writes because the version-history lock and memtable size accounting are performed only once for the entire batch.

Visibility: entries become individually visible to concurrent readers as they are inserted. For atomic batch visibility, the caller must publish seqno (via visible_seqno.fetch_max(seqno + 1)) only after this method returns.

Returns the total bytes added and new size of the memtable.

§Examples

use lsm_tree::{AbstractTree, Config, Tree, WriteBatch};

let tree = Config::new(&folder, Default::default(), Default::default()).open()?;

let mut batch = WriteBatch::new();
batch.insert("key1", "value1");
batch.insert("key2", "value2");
batch.remove("key3");

let (bytes_added, memtable_size) = tree.apply_batch(batch, 0)?;
assert!(bytes_added > 0);

§Errors

Returns Error::MixedOperationBatch if the batch contains mixed operation types for the same user key.

Source

fn insert<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, value: V, seqno: SeqNo, ) -> (u64, u64)

Inserts a key-value pair into the tree.

If the key already exists, the item will be overwritten.

Returns the added item’s size and new size of the memtable.

§Examples

use lsm_tree::{AbstractTree, Config, Tree};

let tree = Config::new(folder, Default::default(), Default::default()).open()?;
tree.insert("a", "abc", 0);

§Errors

Will return Err if an IO error occurs.

Source

fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64)

Removes an item from the tree.

Returns the added item’s size and new size of the memtable.

§Examples

tree.insert("a", "abc", 0);

let item = tree.get("a", 1)?.expect("should have item");
assert_eq!("abc".as_bytes(), &*item);

tree.remove("a", 1);

let item = tree.get("a", 2)?;
assert_eq!(None, item);

§Errors

Will return Err if an IO error occurs.

Source

fn merge<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, operand: V, seqno: SeqNo, ) -> (u64, u64)

Writes a merge operand for a key.

The operand is stored as a partial update that will be combined with other operands and/or a base value via the configured crate::MergeOperator during reads and compaction.

Returns the added item’s size and new size of the memtable.

§Examples

tree.merge("counter", 1_i64.to_le_bytes(), 0);

Source

fn remove_range<K: Into<UserKey>>(&self, start: K, end: K, seqno: SeqNo) -> u64

Deletes all keys in the range [start, end) by inserting a range tombstone.

This is much more efficient than deleting keys individually when removing a contiguous range of keys.

Returns the approximate size added to the memtable. Returns 0 if start >= end (invalid interval is silently ignored).

This is a required method on the crate’s sealed tree types.

Provided Methods§

Source

fn storage_stats(&self) -> Result<StorageStats>

Returns a read-only snapshot of the tree’s on-disk storage footprint: total used bytes, entry count, the average shape of a stored entry (average key / value bytes), and an estimate of how many more average-shaped entries fit in a byte budget (see StorageStats::estimated_remaining_entries).

Computed from the live version’s table + blob metadata plus one size-stat per live file; it never reads a data block. The default implementation reports StorageStatus::Healthy; the standard tree overrides it to report StorageStatus::CompactionInProgress while a compaction runs.

§Examples

use lsm_tree::{AbstractTree, Config};

let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;

for i in 0..100u64 {
    tree.insert(format!("key{i:04}"), "value", i);
}
tree.flush_active_memtable(0)?;

let stats = tree.storage_stats()?;
assert_eq!(stats.item_count, 100);
// Roughly how many more average-shaped entries fit in another 1 MiB.
let _headroom = stats.estimated_remaining_entries(1024 * 1024);

§Errors

Returns an error if a live file’s size cannot be stat-ed.

Source

fn level_segment_stats(&self) -> Result<Vec<LevelStats>>

Per-LSM-level and per-segment size + entry-count stats, for tiering and erasure-coding placement decisions (which level / segment is large enough to demote, EC-encode, or migrate).

Cheap: derived from the live version’s metadata plus one file-size stat per segment (no data-block scan). The per-level totals reconcile with storage_stats: summed across levels they equal the SST portion of StorageStats::used_bytes and StorageStats::item_count.

§Examples

use lsm_tree::{AbstractTree, Config};

let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
for i in 0..100u32 {
    tree.insert(format!("k{i:04}"), "v", 0);
}
tree.flush_active_memtable(0)?;

let levels = tree.level_segment_stats()?;
let total: u64 = levels.iter().map(|l| l.item_count).sum();
assert_eq!(total, tree.storage_stats()?.item_count);

§Errors

Returns an error if a segment’s file size cannot be stat-ed.

Source

fn compaction_debt(&self, strategy: &dyn CompactionStrategy) -> u64

Estimated bytes pending compaction under strategy: on-disk data above its level’s target that must eventually be rewritten downward (a RocksDB estimate-pending-compaction-bytes analog), a compaction-debt signal for a scheduler / tiering consumer.

The strategy is supplied by the caller because the engine does not own a configured compaction strategy (it is injected per compaction run); a &dyn keeps this object-safe. Returns 0 for strategies without a size-target notion of debt (FIFO, drop-range), or when the tree is at or below its target shape. See CompactionStrategy::pending_compaction_bytes.

Source

fn write_backpressure(&self, _strategy: &dyn CompactionStrategy) -> Backpressure

Computed write-backpressure verdict from the live L0 table count and the strategy’s pending-compaction bytes, against the configured RuntimeConfig::backpressure thresholds.

Advisory, mirroring write_admission: the caller consults it and throttles in its own write loop (sleep suggested_delay at Slowdown, pause at Stop). The engine never blocks on it, because it does not own the compaction that drains the debt — an internal stall could deadlock the very thread that would compact.

The strategy is supplied by the caller for the same reason as compaction_debt. Returns Backpressure::None by default (no thresholds configured).

Source

fn write_admission(&self) -> Result<()>

Storage admission gate: Ok(()) if a write may proceed, or Error::StorageFull if the tree is over budget and should be treated as read-only.

Opt-in: returns Ok(()) unless storage_admission_check is enabled. The predicate is computed (not latched), so once space is freed — the budget raised, a compaction reclaiming space, or disk freed — the next call admits again with no restart.

Intended as a cheap pre-check the caller consults before applying a write batch. The footprint is cached per version, so the check is a constant-time read on the common path and only re-measures when a new version is installed (flush / compaction). Internal flush / compaction are never gated, so the engine can always reclaim.

§Errors

Error::StorageFull when the live footprint plus reserved headroom exceeds the effective budget.

Source

fn is_read_only(&self) -> bool

true when the admission gate is currently closed (see write_admission). Convenience for callers that want a boolean rather than a Result. Always false unless admission control is enabled and the tree is over budget.

Only Error::StorageFull counts as read-only: an unrelated admission error (e.g. an I/O failure while measuring the footprint) is NOT an out-of-space condition and must not be reported as one.

Source

fn verify_checksum(&self) -> BlockVerifyReport
where Self: Sized,

Proactively verifies every block’s XXH3 checksum across every SST in the tree’s current version — a scrubber for catching bit rot before it surfaces as a user-visible read failure (cron / scrub jobs).

Reports at block granularity and never aborts early. The returned BlockVerifyReport records block-corruption findings with (file, offset), while file-level errors (e.g. BlockVerifyError::SstFileUnreadable) carry the file only (no offset). It does not surface per-entry indices or ECC-correction counts (when ECC-at-rest is enabled a within-budget corrupt block may still be healed on read as a side effect of the scan, but the report does not tally corrections).

Filesystems with native per-block integrity (ZFS, Btrfs, ReFS, S3 — see Fs::capabilities) already detect corruption on read; this scrub is the portable check for the rest.

Use Self::verify_checksum_with for parallelism / throttle control.

Source

fn verify_checksum_with(&self, options: &VerifyOptions) -> BlockVerifyReport
where Self: Sized,

Like Self::verify_checksum but with configurable parallelism and I/O throttle (see VerifyOptions).

Source

fn flush( &self, _lock: &MutexGuard<'_, ()>, seqno_threshold: SeqNo, ) -> Result<Option<u64>>

Synchronously flushes pending sealed memtables to tables.

Returns the sum of flushed memtable sizes that were flushed.

The function may not return a result, if nothing was flushed.

§Errors

Returns Err on an I/O error, or on a memtable residence-verification failure under KvChecksumComputePoint::AtInsert: a sealed memtable whose insert-time per-KV digests do not verify before flush surfaces crate::Error::MemtableKvChecksumMismatch, crate::Error::MemtableKvChecksumCorruptAlgorithm, or crate::Error::InvalidTag (corrupt value_type).

Source

fn iter( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>

Returns an iterator that scans through the entire tree.

Avoid using this function, or limit it as otherwise it may scan a lot of items.

Source

fn stale_blob_bytes(&self) -> u64

Returns the disk space used by stale blobs.

Source

fn flush_to_tables( &self, stream: impl Iterator<Item = Result<InternalValue>>, ) -> Result<Option<(Vec<Table>, Option<Vec<BlobFile>>)>>

Synchronously flushes a memtable to a table.

This method will not make the table immediately available, use AbstractTree::register_tables for that.

§Errors

Will return Err if an IO error occurs.

Source

fn get_highest_seqno(&self) -> Option<SeqNo>

Returns the highest sequence number.

Source

fn tree_type(&self) -> TreeType

Returns the tree type.

Source

fn len( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Result<usize>

Scans the entire tree, returning the number of items.

§Caution

This operation scans the entire tree: O(n) complexity!

Never, under any circumstances, use .len() == 0 to check if the tree is empty, use Tree::is_empty instead.

§Examples

use lsm_tree::{AbstractTree, Config, Tree};

let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;

assert_eq!(tree.len(0, None)?, 0);
tree.insert("1", "abc", 0);
tree.insert("3", "abc", 1);
tree.insert("5", "abc", 2);
assert_eq!(tree.len(3, None)?, 3);

§Errors

Will return Err if an IO error occurs.

Source

fn is_empty( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Result<bool>

Returns true if the tree is empty.

This operation has O(log N) complexity.

§Examples

use lsm_tree::{AbstractTree, Config, Tree};

let tree = Config::new(folder, Default::default(), Default::default()).open()?;
assert!(tree.is_empty(0, None)?);

tree.insert("a", "abc", 0);
assert!(!tree.is_empty(1, None)?);

§Errors

Will return Err if an IO error occurs.

Source

fn first_key_value( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Option<IterGuardImpl>

Returns the first key-value pair in the tree. The key in this pair is the minimum key in the tree.

§Examples

let tree = Config::new(folder, Default::default(), Default::default()).open()?;

tree.insert("1", "abc", 0);
tree.insert("3", "abc", 1);
tree.insert("5", "abc", 2);

let key = tree.first_key_value(3, None).expect("item should exist").key()?;
assert_eq!(&*key, "1".as_bytes());

§Errors

Will return Err if an IO error occurs.

Source

fn last_key_value( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Option<IterGuardImpl>

Returns the last key-value pair in the tree. The key in this pair is the maximum key in the tree.

§Examples

tree.insert("1", "abc", 0);
tree.insert("3", "abc", 1);
tree.insert("5", "abc", 2);

let key = tree.last_key_value(3, None).expect("item should exist").key()?;
assert_eq!(&*key, "5".as_bytes());

§Errors

Will return Err if an IO error occurs.

Source

fn get_pinned<K: AsRef<[u8]>>( &self, key: K, seqno: SeqNo, ) -> Result<Option<PinnableSlice>>

Retrieves an item from the tree as a PinnableSlice.

When the value is backed by an on-disk data block, implementations may return PinnableSlice::Pinned holding a reference to that block’s decompressed buffer (avoiding a data copy). Memtable and blob-resolved values use PinnableSlice::Owned. The default implementation always returns Owned; only Tree overrides with the pinned path.

The existing AbstractTree::get method is unaffected.

§Examples

use lsm_tree::{AbstractTree, Config, Tree};

let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
tree.insert("a", "my_value", 0);

let item = tree.get_pinned("a", 1)?;
assert_eq!(item.as_ref().map(|v| v.as_ref()), Some("my_value".as_bytes()));

§Errors

Will return Err if an IO error occurs.

Source

fn contains_key<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<bool>

Returns true if the tree contains the specified key.

§Examples

let tree = Config::new(folder, Default::default(), Default::default()).open()?;
assert!(!tree.contains_key("a", 0)?);

tree.insert("a", "abc", 0);
assert!(tree.contains_key("a", 1)?);

§Errors

Will return Err if an IO error occurs.

Source

fn contains_prefix<K: AsRef<[u8]>>( &self, prefix: K, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Result<bool>

Returns true if the tree contains any key with the given prefix.

This is a convenience method that checks whether the corresponding prefix iterator yields at least one item, while surfacing any IO errors via the Result return type. Implementations may override this method to provide a more efficient prefix-existence check.

§Examples

use lsm_tree::{AbstractTree, Config, Tree};

let tree = Config::new(folder, Default::default(), Default::default()).open()?;
assert!(!tree.contains_prefix("abc", 0, None)?);

tree.insert("abc:1", "value", 0);
assert!(tree.contains_prefix("abc", 1, None)?);
assert!(!tree.contains_prefix("xyz", 1, None)?);

§Errors

Will return Err if an IO error occurs.

Source

fn multi_get<K: AsRef<[u8]>>( &self, keys: impl IntoIterator<Item = K>, seqno: SeqNo, ) -> Result<Vec<Option<UserValue>>>

Reads multiple keys from the tree.

Implementations may choose to perform all lookups against a single version snapshot and acquire the version lock only once, which can be more efficient than calling AbstractTree::get in a loop. The default trait implementation, however, is a convenience wrapper that simply calls AbstractTree::get for each key and therefore does not guarantee a single-snapshot or single-lock acquisition. Optimized implementations (such as Tree and BlobTree) provide the single-snapshot/one-lock behavior.

§Examples

use lsm_tree::{AbstractTree, Config, Tree};

let tree = Config::new(folder, Default::default(), Default::default()).open()?;
tree.insert("a", "value_a", 0);
tree.insert("b", "value_b", 1);

let results = tree.multi_get(["a", "b", "c"], 2)?;
assert_eq!(results[0], Some("value_a".as_bytes().into()));
assert_eq!(results[1], Some("value_b".as_bytes().into()));
assert_eq!(results[2], None);

§Errors

Will return Err if an IO error occurs.

Source

fn try_insert<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, value: V, seqno: SeqNo, ) -> Result<(u64, u64)>

Admission-gated insert: consults write_admission first and declines with Error::StorageFull when the tree is over budget, otherwise inserts and returns the same (added_bytes, memtable_size) tuple. This is the write entry a space-aware caller uses so over-budget writes are refused up front rather than failing a flush later; bare insert stays infallible for callers that do not opt into admission control.

§Errors

Error::StorageFull when the admission gate is closed.

Source

fn try_merge<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, operand: V, seqno: SeqNo, ) -> Result<(u64, u64)>

Admission-gated merge. See try_insert.

§Errors

Error::StorageFull when the admission gate is closed.

Source

fn try_remove<K: Into<UserKey>>( &self, key: K, seqno: SeqNo, ) -> Result<(u64, u64)>

Admission-gated remove. See try_insert.

Note a tombstone is itself a write that consumes space; an over-budget tree refuses it too. Reclaim space via compaction (never gated) rather than relying on deletes when already read-only.

§Errors

Error::StorageFull when the admission gate is closed.

Source

fn try_remove_weak<K: Into<UserKey>>( &self, key: K, seqno: SeqNo, ) -> Result<(u64, u64)>

Admission-gated remove_weak. See try_insert.

§Errors

Error::StorageFull when the admission gate is closed.

Source

fn try_remove_range<K: Into<UserKey>>( &self, start: K, end: K, seqno: SeqNo, ) -> Result<u64>

Admission-gated remove_range. See try_insert.

§Errors

Error::StorageFull when the admission gate is closed.

Source

fn remove_prefix<K: AsRef<[u8]>>(&self, prefix: K, seqno: SeqNo) -> u64

Deletes all keys with the given prefix by inserting a range tombstone.

This is sugar over AbstractTree::remove_range using prefix bounds.

Returns the approximate size added to the memtable. Returns 0 for empty prefixes or all-0xFF prefixes (cannot form valid half-open range).

Dyn Compatibility§

This trait is not dyn compatible.

In older versions of Rust, dyn compatibility was called "object safety".

Implementors§

Source §

impl AbstractTree for AnyTree

Source §

impl AbstractTree for BlobTree

Source §

AbstractTree

Trait AbstractTree Copy item path

Required Methods§

fn table_file_cache_size(&self) -> usize

fn create_checkpoint(&self, target_path: &Path) -> Result<CheckpointInfo>

§Cross-filesystem / cross-backend fall-back

§Concurrency

§Errors

fn prefix<K: AsRef<[u8]>>( &self, prefix: K, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>

fn range<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>

fn range_seekable<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn SeekableGuardIter + 'static>

fn batch_range_scan<K: AsRef<[u8]>, R: RangeBounds<K> + 'static, I: IntoIterator<Item = R>>( &self, intervals: I, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn Iterator<Item = IterGuardImpl> + Send + 'static>where I::IntoIter: Send + 'static,

fn tombstone_count(&self) -> u64

fn weak_tombstone_count(&self) -> u64

fn weak_tombstone_reclaimable_count(&self) -> u64

fn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(&self, range: R) -> Result<()>

§Errors

fn clear(&self) -> Result<()>

§Errors

fn major_compact( &self, target_size: u64, seqno_threshold: SeqNo, ) -> Result<CompactionResult>

§Garbage-collection / merge-fold watermark (seqno_threshold)

§Errors

fn filter_size(&self) -> u64

fn pinned_filter_size(&self) -> usize

fn pinned_block_index_size(&self) -> usize

fn version_free_list_len(&self) -> usize

fn metrics(&self) -> &Arc<Metrics> ⓘ

fn cache_stats(&self) -> CacheStats

fn get_flush_lock(&self) -> MutexGuard<'_, ()>

fn register_tables( &self, tables: &[Table], blob_files: Option<&[BlobFile]>, frag_map: Option<FragmentationMap>, sealed_memtables_to_delete: &[MemtableId], gc_watermark: SeqNo, ) -> Result<()>

§Errors

fn clear_active_memtable(&self)

fn sealed_memtable_count(&self) -> usize

fn compact( &self, strategy: Arc<dyn CompactionStrategy>, seqno_threshold: SeqNo, ) -> Result<CompactionResult>

§Errors

fn get_next_table_id(&self) -> TableId

fn tree_config(&self) -> &Config

fn active_memtable(&self) -> Arc<Memtable> ⓘ

fn rotate_memtable(&self) -> Option<Arc<Memtable>>

fn table_count(&self) -> usize

fn level_table_count(&self, idx: usize) -> Option<usize>

fn l0_run_count(&self) -> usize

fn blob_file_count(&self) -> usize

fn approximate_len(&self) -> usize

fn disk_space(&self) -> u64

fn approximate_range_stats<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, ) -> Result<ApproximateRangeStats>

§Examples

§Errors

fn approximate_range_cardinality<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, ) -> Result<RangeCardinality>

§Examples

§Errors

fn get_highest_memtable_seqno(&self) -> Option<SeqNo>

fn get_highest_persisted_seqno(&self) -> Option<SeqNo>

fn size_of<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<Option<u32>>

§Examples

§Errors

fn get<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<Option<UserValue>>

§Examples

§Errors

fn apply_batch(&self, batch: WriteBatch, seqno: SeqNo) -> Result<(u64, u64)>

§Examples

§Errors

fn insert<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, value: V, seqno: SeqNo, ) -> (u64, u64)

§Examples

§Errors

fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64)

§Examples

§Errors

fn merge<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, operand: V, seqno: SeqNo, ) -> (u64, u64)

§Examples

fn remove_range<K: Into<UserKey>>(&self, start: K, end: K, seqno: SeqNo) -> u64

Provided Methods§

fn storage_stats(&self) -> Result<StorageStats>

§Examples

§Errors

fn level_segment_stats(&self) -> Result<Vec<LevelStats>>

§Examples

§Errors

fn compaction_debt(&self, strategy: &dyn CompactionStrategy) -> u64

fn write_backpressure(&self, _strategy: &dyn CompactionStrategy) -> Backpressure

Trait AbstractTree

fn batch_range_scan<K: AsRef<[u8]>, R: RangeBounds<K> + 'static, I: IntoIterator<Item = R>>( &self, intervals: I, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn Iterator<Item = IterGuardImpl> + Send + 'static>
where I::IntoIter: Send + 'static,

§Garbage-collection / merge-fold watermark (`seqno_threshold`)

fn verify_checksum(&self) -> BlockVerifyReport
where Self: Sized,

fn verify_checksum_with(&self, options: &VerifyOptions) -> BlockVerifyReport
where Self: Sized,