pub trait AbstractTree: Sealed {
Show 72 methods
// Required methods
fn table_file_cache_size(&self) -> usize;
fn create_checkpoint(&self, target_path: &Path) -> Result<CheckpointInfo>;
fn prefix<K: AsRef<[u8]>>(
&self,
prefix: K,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>;
fn range<K: AsRef<[u8]>, R: RangeBounds<K>>(
&self,
range: R,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>;
fn range_seekable<K: AsRef<[u8]>, R: RangeBounds<K>>(
&self,
range: R,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn SeekableGuardIter + 'static>;
fn batch_range_scan<K: AsRef<[u8]>, R: RangeBounds<K> + 'static, I: IntoIterator<Item = R>>(
&self,
intervals: I,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn Iterator<Item = IterGuardImpl> + Send + 'static>
where I::IntoIter: Send + 'static;
fn tombstone_count(&self) -> u64;
fn weak_tombstone_count(&self) -> u64;
fn weak_tombstone_reclaimable_count(&self) -> u64;
fn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(
&self,
range: R,
) -> Result<()>;
fn clear(&self) -> Result<()>;
fn major_compact(
&self,
target_size: u64,
seqno_threshold: SeqNo,
) -> Result<CompactionResult>;
fn filter_size(&self) -> u64;
fn pinned_filter_size(&self) -> usize;
fn pinned_block_index_size(&self) -> usize;
fn version_free_list_len(&self) -> usize;
fn metrics(&self) -> &Arc<Metrics> ⓘ;
fn cache_stats(&self) -> CacheStats;
fn get_flush_lock(&self) -> MutexGuard<'_, ()>;
fn register_tables(
&self,
tables: &[Table],
blob_files: Option<&[BlobFile]>,
frag_map: Option<FragmentationMap>,
sealed_memtables_to_delete: &[MemtableId],
gc_watermark: SeqNo,
) -> Result<()>;
fn clear_active_memtable(&self);
fn sealed_memtable_count(&self) -> usize;
fn compact(
&self,
strategy: Arc<dyn CompactionStrategy>,
seqno_threshold: SeqNo,
) -> Result<CompactionResult>;
fn get_next_table_id(&self) -> TableId;
fn tree_config(&self) -> &Config;
fn active_memtable(&self) -> Arc<Memtable> ⓘ;
fn rotate_memtable(&self) -> Option<Arc<Memtable>>;
fn table_count(&self) -> usize;
fn level_table_count(&self, idx: usize) -> Option<usize>;
fn l0_run_count(&self) -> usize;
fn blob_file_count(&self) -> usize;
fn approximate_len(&self) -> usize;
fn disk_space(&self) -> u64;
fn approximate_range_stats<K: AsRef<[u8]>, R: RangeBounds<K>>(
&self,
range: R,
seqno: SeqNo,
) -> Result<ApproximateRangeStats>;
fn approximate_range_cardinality<K: AsRef<[u8]>, R: RangeBounds<K>>(
&self,
range: R,
seqno: SeqNo,
) -> Result<RangeCardinality>;
fn get_highest_memtable_seqno(&self) -> Option<SeqNo>;
fn get_highest_persisted_seqno(&self) -> Option<SeqNo>;
fn size_of<K: AsRef<[u8]>>(
&self,
key: K,
seqno: SeqNo,
) -> Result<Option<u32>>;
fn get<K: AsRef<[u8]>>(
&self,
key: K,
seqno: SeqNo,
) -> Result<Option<UserValue>>;
fn apply_batch(&self, batch: WriteBatch, seqno: SeqNo) -> Result<(u64, u64)>;
fn insert<K: Into<UserKey>, V: Into<UserValue>>(
&self,
key: K,
value: V,
seqno: SeqNo,
) -> (u64, u64);
fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64);
fn merge<K: Into<UserKey>, V: Into<UserValue>>(
&self,
key: K,
operand: V,
seqno: SeqNo,
) -> (u64, u64);
fn remove_range<K: Into<UserKey>>(
&self,
start: K,
end: K,
seqno: SeqNo,
) -> u64;
// Provided methods
fn storage_stats(&self) -> Result<StorageStats> { ... }
fn level_segment_stats(&self) -> Result<Vec<LevelStats>> { ... }
fn compaction_debt(&self, strategy: &dyn CompactionStrategy) -> u64 { ... }
fn write_backpressure(
&self,
_strategy: &dyn CompactionStrategy,
) -> Backpressure { ... }
fn write_admission(&self) -> Result<()> { ... }
fn is_read_only(&self) -> bool { ... }
fn verify_checksum(&self) -> BlockVerifyReport
where Self: Sized { ... }
fn verify_checksum_with(&self, options: &VerifyOptions) -> BlockVerifyReport
where Self: Sized { ... }
fn flush(
&self,
_lock: &MutexGuard<'_, ()>,
seqno_threshold: SeqNo,
) -> Result<Option<u64>> { ... }
fn iter(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static> { ... }
fn stale_blob_bytes(&self) -> u64 { ... }
fn flush_to_tables(
&self,
stream: impl Iterator<Item = Result<InternalValue>>,
) -> Result<Option<(Vec<Table>, Option<Vec<BlobFile>>)>> { ... }
fn get_highest_seqno(&self) -> Option<SeqNo> { ... }
fn tree_type(&self) -> TreeType { ... }
fn len(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Result<usize> { ... }
fn is_empty(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Result<bool> { ... }
fn first_key_value(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Option<IterGuardImpl> { ... }
fn last_key_value(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Option<IterGuardImpl> { ... }
fn get_pinned<K: AsRef<[u8]>>(
&self,
key: K,
seqno: SeqNo,
) -> Result<Option<PinnableSlice>> { ... }
fn contains_key<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<bool> { ... }
fn contains_prefix<K: AsRef<[u8]>>(
&self,
prefix: K,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Result<bool> { ... }
fn multi_get<K: AsRef<[u8]>>(
&self,
keys: impl IntoIterator<Item = K>,
seqno: SeqNo,
) -> Result<Vec<Option<UserValue>>> { ... }
fn try_insert<K: Into<UserKey>, V: Into<UserValue>>(
&self,
key: K,
value: V,
seqno: SeqNo,
) -> Result<(u64, u64)> { ... }
fn try_merge<K: Into<UserKey>, V: Into<UserValue>>(
&self,
key: K,
operand: V,
seqno: SeqNo,
) -> Result<(u64, u64)> { ... }
fn try_remove<K: Into<UserKey>>(
&self,
key: K,
seqno: SeqNo,
) -> Result<(u64, u64)> { ... }
fn try_remove_weak<K: Into<UserKey>>(
&self,
key: K,
seqno: SeqNo,
) -> Result<(u64, u64)> { ... }
fn try_remove_range<K: Into<UserKey>>(
&self,
start: K,
end: K,
seqno: SeqNo,
) -> Result<u64> { ... }
fn remove_prefix<K: AsRef<[u8]>>(&self, prefix: K, seqno: SeqNo) -> u64 { ... }
}Expand description
Generic Tree API
Required Methods§
Sourcefn table_file_cache_size(&self) -> usize
fn table_file_cache_size(&self) -> usize
Returns the number of cached table file descriptors.
Sourcefn create_checkpoint(&self, target_path: &Path) -> Result<CheckpointInfo>
fn create_checkpoint(&self, target_path: &Path) -> Result<CheckpointInfo>
Creates a hard-linked checkpoint of the tree’s on-disk state in
target_path for point-in-time recovery (PITR) backup.
The checkpoint is a fully functional tree that can be opened
independently via Config::open. For the
common single-filesystem case all SST files (and blob files, for
BlobTree) are hard-linked rather than copied, so the operation
is O(1) per file and consumes zero additional disk space until the
original files are compacted away — at which point the inode is
kept alive by the checkpoint link.
§Cross-filesystem / cross-backend fall-back
When a source file lives on a different filesystem than the
checkpoint target — e.g. an SST routed to a hot tier via
level_routes on a separate volume,
or a backup directory on a foreign mount — the hard link cannot
be created (Unix EXDEV). In that case the checkpoint silently
falls back to a streamed byte copy, which:
- takes time linear in the file size instead of O(1), and
- consumes disk space equal to the copied bytes on the target volume (no inode sharing across filesystems).
Each fall-back call emits one log::debug line (deliberately not
warn: a misconfigured tier could trigger this path once per SST
and per blob — thousands of times per snapshot — and per-file
warnings would drown real signal). Operators wanting hard-visibility
of unexpected full copies should enable debug logging on the fs
module or watch the CheckpointInfo.total_bytes figure (≫ inode
link cost means the fallback fired). The same debug policy applies
when source and target use entirely different Fs
backends (e.g. MemFs → StdFs
in tests).
§Concurrency
While the checkpoint is being built, compaction continues normally but the physical removal of obsolete files is deferred until the checkpoint hard-links are in place. This is implemented by an internal reference-counted deletion gate; callers do not have to pause compaction themselves.
§Errors
Returns an error if:
- the active memtable could not be flushed,
target_pathalready exists (to prevent accidental overwrites),- a hard link / copy fall-back could not be created, or
- the manifest / version pointer files could not be replicated.
On error any partial checkpoint files are removed automatically (best-effort) so callers can safely retry against the same path.
Sourcefn prefix<K: AsRef<[u8]>>(
&self,
prefix: K,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>
fn prefix<K: AsRef<[u8]>>( &self, prefix: K, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>
Returns an iterator over a prefixed set of items.
Avoid using an empty prefix as it may scan a lot of items (unless limited).
Sourcefn range<K: AsRef<[u8]>, R: RangeBounds<K>>(
&self,
range: R,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>
fn range<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>
Returns an iterator over a range of items.
Avoid using full or unbounded ranges as they may scan a lot of items (unless limited).
Sourcefn range_seekable<K: AsRef<[u8]>, R: RangeBounds<K>>(
&self,
range: R,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn SeekableGuardIter + 'static>
fn range_seekable<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn SeekableGuardIter + 'static>
Returns a range iterator that can reposition (seek) in place without reopening its per-SST readers.
Unlike range’s boxed DoubleEndedIterator, the returned
[SeekableGuardIter] additionally exposes seek_to / seek_to_for_prev,
so a consumer can jump a live iterator to any key (RocksDB Seek /
SeekForPrev) — enabling data-dependent scans (joins, skip-scan) without
reopening per-SST readers per jump.
Sourcefn batch_range_scan<K: AsRef<[u8]>, R: RangeBounds<K> + 'static, I: IntoIterator<Item = R>>(
&self,
intervals: I,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn Iterator<Item = IterGuardImpl> + Send + 'static>
fn batch_range_scan<K: AsRef<[u8]>, R: RangeBounds<K> + 'static, I: IntoIterator<Item = R>>( &self, intervals: I, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn Iterator<Item = IterGuardImpl> + Send + 'static>
Scans a sequence of disjoint, ascending key sub-intervals, reusing one set of per-SST readers across all of them.
The per-SST setup is paid once (when the underlying seekable iterator is
opened); each interval is served by repositioning that iterator. This
amortizes the setup across N intervals instead of paying it per
interval, so multi-interval scan throughput scales with the total rows
returned rather than the interval count.
The interval source is pulled lazily, so intervals may be produced on demand (e.g. computed from rows already returned).
Sourcefn tombstone_count(&self) -> u64
fn tombstone_count(&self) -> u64
Returns the approximate number of tombstones in the tree.
Sourcefn weak_tombstone_count(&self) -> u64
fn weak_tombstone_count(&self) -> u64
Returns the approximate number of weak tombstones (single deletes) in the tree.
Sourcefn weak_tombstone_reclaimable_count(&self) -> u64
fn weak_tombstone_reclaimable_count(&self) -> u64
Returns the approximate number of values reclaimable once weak tombstones can be GC’d.
Sourcefn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(&self, range: R) -> Result<()>
fn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(&self, range: R) -> Result<()>
Drops tables that are fully contained in a given range.
Accepts any RangeBounds, including unbounded or exclusive endpoints.
If the normalized lower bound is greater than the upper bound, the
method returns without performing any work.
§Errors
Will return Err only if an IO error occurs.
Sourcefn clear(&self) -> Result<()>
fn clear(&self) -> Result<()>
Drops all tables and clears all memtables atomically.
§Errors
Will return Err only if an IO error occurs.
Sourcefn major_compact(
&self,
target_size: u64,
seqno_threshold: SeqNo,
) -> Result<CompactionResult>
fn major_compact( &self, target_size: u64, seqno_threshold: SeqNo, ) -> Result<CompactionResult>
Performs major compaction, blocking the caller until it’s done.
Returns a crate::compaction::CompactionResult describing what action was taken.
§Garbage-collection / merge-fold watermark (seqno_threshold)
seqno_threshold is the MVCC garbage-collection watermark: the engine may
collapse history that no snapshot reading at a seqno < seqno_threshold
can still observe. Concretely, only entries whose seqno is < seqno_threshold
are eligible for:
- dropping shadowed versions / GC-ing tombstones, and
- folding merge operands via the
crate::MergeOperator: a key written only throughSelf::merge(no base value) accumulates one operand per call, and reads re-apply the whole chain (O(operands)per read) until compaction folds it. Folding a chain into a single value is only MVCC-safe when no live snapshot reads between the operands, which is exactly whatseqno_thresholdcertifies.
The engine does not track snapshots (unlike a RocksDB-style
snapshot list); the caller owns snapshot lifecycle and must supply this
watermark:
- To fold/GC everything (no active snapshots), pass a value above every
live seqno (e.g. the next value from the
crate::SequenceNumberCounter). - With outstanding snapshots, pass the oldest snapshot’s seqno so their reads stay correct.
seqno_threshold == 0certifies nothing as collapsible, so no folding or GC happens —major_compact(target, 0)only restructures tables and leaves a merge-only key’s full operand chain intact.
§Errors
Will return Err if an IO error occurs.
Sourcefn filter_size(&self) -> u64
fn filter_size(&self) -> u64
Gets the space usage of all filters in the tree.
May not correspond to the actual memory size because filter blocks may be paged out.
Sourcefn pinned_filter_size(&self) -> usize
fn pinned_filter_size(&self) -> usize
Gets the memory usage of all pinned filters in the tree.
Sourcefn pinned_block_index_size(&self) -> usize
fn pinned_block_index_size(&self) -> usize
Gets the memory usage of all pinned index blocks in the tree.
Sourcefn version_free_list_len(&self) -> usize
fn version_free_list_len(&self) -> usize
Gets the length of the version free list.
Sourcefn cache_stats(&self) -> CacheStats
fn cache_stats(&self) -> CacheStats
A point-in-time CacheStats snapshot of block-cache
effectiveness (cumulative hit / miss counts and rate) and occupancy
(current size against capacity).
The stable, owned observability view over the block cache, so a consumer
can read cache health without holding the mutable
metrics handle. Counts are cumulative since process
start; derive a rate over an interval from the delta between two polls.
Sourcefn get_flush_lock(&self) -> MutexGuard<'_, ()>
fn get_flush_lock(&self) -> MutexGuard<'_, ()>
Acquires the flush lock which is required to call Tree::flush.
Sourcefn register_tables(
&self,
tables: &[Table],
blob_files: Option<&[BlobFile]>,
frag_map: Option<FragmentationMap>,
sealed_memtables_to_delete: &[MemtableId],
gc_watermark: SeqNo,
) -> Result<()>
fn register_tables( &self, tables: &[Table], blob_files: Option<&[BlobFile]>, frag_map: Option<FragmentationMap>, sealed_memtables_to_delete: &[MemtableId], gc_watermark: SeqNo, ) -> Result<()>
Atomically registers flushed tables into the tree, removing their associated sealed memtables.
§Errors
Will return Err if an IO error occurs.
Sourcefn clear_active_memtable(&self)
fn clear_active_memtable(&self)
Clears the active memtable atomically.
Sourcefn sealed_memtable_count(&self) -> usize
fn sealed_memtable_count(&self) -> usize
Returns the number of sealed memtables.
Sourcefn compact(
&self,
strategy: Arc<dyn CompactionStrategy>,
seqno_threshold: SeqNo,
) -> Result<CompactionResult>
fn compact( &self, strategy: Arc<dyn CompactionStrategy>, seqno_threshold: SeqNo, ) -> Result<CompactionResult>
Performs compaction on the tree’s levels, blocking the caller until it’s done.
Returns a crate::compaction::CompactionResult describing what action was taken.
§Errors
Will return Err if an IO error occurs.
Sourcefn get_next_table_id(&self) -> TableId
fn get_next_table_id(&self) -> TableId
Returns the next table’s ID.
Sourcefn tree_config(&self) -> &Config
fn tree_config(&self) -> &Config
Returns the tree config.
Sourcefn active_memtable(&self) -> Arc<Memtable> ⓘ
fn active_memtable(&self) -> Arc<Memtable> ⓘ
Returns the active memtable.
Sourcefn rotate_memtable(&self) -> Option<Arc<Memtable>>
fn rotate_memtable(&self) -> Option<Arc<Memtable>>
Seals the active memtable.
Sourcefn table_count(&self) -> usize
fn table_count(&self) -> usize
Returns the number of tables currently in the tree.
Sourcefn level_table_count(&self, idx: usize) -> Option<usize>
fn level_table_count(&self, idx: usize) -> Option<usize>
Returns the number of tables in levels[idx].
Returns None if the level does not exist (if idx >= 7).
Sourcefn l0_run_count(&self) -> usize
fn l0_run_count(&self) -> usize
Returns the number of disjoint runs in L0.
Can be used to determine whether to write stall.
Sourcefn blob_file_count(&self) -> usize
fn blob_file_count(&self) -> usize
Returns the number of blob files currently in the tree.
Sourcefn approximate_len(&self) -> usize
fn approximate_len(&self) -> usize
Approximates the number of items in the tree.
Sourcefn disk_space(&self) -> u64
fn disk_space(&self) -> u64
Returns the disk space usage.
Sourcefn approximate_range_stats<K: AsRef<[u8]>, R: RangeBounds<K>>(
&self,
range: R,
seqno: SeqNo,
) -> Result<ApproximateRangeStats>
fn approximate_range_stats<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, ) -> Result<ApproximateRangeStats>
Estimates the on-disk bytes and entry count contained in range at
seqno, WITHOUT reading any data block.
The estimate interpolates each overlapping SST’s data-block offsets at the range boundaries (block granularity) and adds the active + sealed memtables’ in-range share. For a KV-separated tree the per-SST blob bytes are apportioned by the same in-range fraction (blob files are not key-indexed, so a finer estimate is impossible without reading data). Intended for query planning (split-point selection, cost-based join ordering), not exact accounting; accuracy is typically within ~10-15% on roughly-uniform data.
§Examples
use lsm_tree::{AbstractTree, Config};
let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
for i in 0..100u32 {
tree.insert(format!("k{i:04}"), "value", 0);
}
tree.flush_active_memtable(0)?;
// The full range covers every entry exactly once it is all flushed —
// estimated without reading a single data block.
let all = tree.approximate_range_stats::<&str, _>(.., 1)?;
assert_eq!(all.key_count, tree.approximate_len() as u64);
assert!(all.bytes > 0);
// A range past every key is empty.
let none = tree.approximate_range_stats("zzzz".."zzzzz", 1)?;
assert_eq!(none.key_count, 0);
assert_eq!(none.bytes, 0);§Errors
Returns an error if a block index or table metadata read fails.
Sourcefn approximate_range_cardinality<K: AsRef<[u8]>, R: RangeBounds<K>>(
&self,
range: R,
seqno: SeqNo,
) -> Result<RangeCardinality>
fn approximate_range_cardinality<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, ) -> Result<RangeCardinality>
Estimates the row cardinality and selectivity of range at seqno,
WITHOUT reading any data block.
Uses the per-data-block zone map (per-block row counts + key ranges) for a
block-granularity row count: every data block whose key range overlaps the
query contributes its recorded row count, plus the active + sealed
memtables’ in-range counts. When a table has no zone map the row count
falls back to the byte-fraction estimate of approximate_range_stats.
Selectivity is rows / total_rows, monotonic in predicate tightness, for
cost-based planning (join ordering, scan-vs-seek). Not exact accounting.
§Examples
use lsm_tree::{AbstractTree, Config};
let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
for i in 0..100u32 {
tree.insert(format!("k{i:04}"), "v", 0);
}
tree.flush_active_memtable(0)?;
// The full range covers every row; selectivity is 1.0.
let all = tree.approximate_range_cardinality::<&str, _>(.., 1)?;
assert_eq!(all.rows, tree.approximate_len() as u64);
assert!((all.selectivity - 1.0).abs() < 1e-9);
// A tighter range selects fewer rows.
let part = tree.approximate_range_cardinality("k0000".."k0050", 1)?;
assert!(part.selectivity <= all.selectivity);§Errors
Returns an error if a block index, zone map, or table metadata read fails.
Sourcefn get_highest_memtable_seqno(&self) -> Option<SeqNo>
fn get_highest_memtable_seqno(&self) -> Option<SeqNo>
Returns the highest sequence number of the active memtable.
Sourcefn get_highest_persisted_seqno(&self) -> Option<SeqNo>
fn get_highest_persisted_seqno(&self) -> Option<SeqNo>
Returns the highest sequence number that is flushed to disk.
Sourcefn size_of<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<Option<u32>>
fn size_of<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<Option<u32>>
Returns the size of a value if it exists.
§Examples
use lsm_tree::{AbstractTree, Config, Tree};
let tree = Config::new(folder, Default::default(), Default::default()).open()?;
tree.insert("a", "my_value", 0);
let size = tree.size_of("a", 1)?.unwrap_or_default();
assert_eq!("my_value".len() as u32, size);
let size = tree.size_of("b", 1)?.unwrap_or_default();
assert_eq!(0, size);§Errors
Will return Err if an IO error occurs.
Sourcefn get<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<Option<UserValue>>
fn get<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> Result<Option<UserValue>>
Retrieves an item from the tree.
§Examples
use lsm_tree::{AbstractTree, Config, Tree};
let tree = Config::new(folder, Default::default(), Default::default()).open()?;
tree.insert("a", "my_value", 0);
let item = tree.get("a", 1)?;
assert_eq!(Some("my_value".as_bytes().into()), item);§Errors
Will return Err if an IO error occurs.
Sourcefn apply_batch(&self, batch: WriteBatch, seqno: SeqNo) -> Result<(u64, u64)>
fn apply_batch(&self, batch: WriteBatch, seqno: SeqNo) -> Result<(u64, u64)>
Applies a WriteBatch with the given sequence number.
All entries share a single seqno. This is more efficient than individual writes because the version-history lock and memtable size accounting are performed only once for the entire batch.
Visibility: entries become individually visible to concurrent readers
as they are inserted. For atomic batch visibility, the caller must
publish seqno (via visible_seqno.fetch_max(seqno + 1)) only
after this method returns.
Returns the total bytes added and new size of the memtable.
§Examples
use lsm_tree::{AbstractTree, Config, Tree, WriteBatch};
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
let mut batch = WriteBatch::new();
batch.insert("key1", "value1");
batch.insert("key2", "value2");
batch.remove("key3");
let (bytes_added, memtable_size) = tree.apply_batch(batch, 0)?;
assert!(bytes_added > 0);§Errors
Returns Error::MixedOperationBatch
if the batch contains mixed operation types for the same user key.
Sourcefn insert<K: Into<UserKey>, V: Into<UserValue>>(
&self,
key: K,
value: V,
seqno: SeqNo,
) -> (u64, u64)
fn insert<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, value: V, seqno: SeqNo, ) -> (u64, u64)
Inserts a key-value pair into the tree.
If the key already exists, the item will be overwritten.
Returns the added item’s size and new size of the memtable.
§Examples
use lsm_tree::{AbstractTree, Config, Tree};
let tree = Config::new(folder, Default::default(), Default::default()).open()?;
tree.insert("a", "abc", 0);§Errors
Will return Err if an IO error occurs.
Sourcefn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64)
fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64)
Removes an item from the tree.
Returns the added item’s size and new size of the memtable.
§Examples
tree.insert("a", "abc", 0);
let item = tree.get("a", 1)?.expect("should have item");
assert_eq!("abc".as_bytes(), &*item);
tree.remove("a", 1);
let item = tree.get("a", 2)?;
assert_eq!(None, item);§Errors
Will return Err if an IO error occurs.
Sourcefn merge<K: Into<UserKey>, V: Into<UserValue>>(
&self,
key: K,
operand: V,
seqno: SeqNo,
) -> (u64, u64)
fn merge<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, operand: V, seqno: SeqNo, ) -> (u64, u64)
Writes a merge operand for a key.
The operand is stored as a partial update that will be combined with
other operands and/or a base value via the configured crate::MergeOperator
during reads and compaction.
Returns the added item’s size and new size of the memtable.
§Examples
tree.merge("counter", 1_i64.to_le_bytes(), 0);Sourcefn remove_range<K: Into<UserKey>>(&self, start: K, end: K, seqno: SeqNo) -> u64
fn remove_range<K: Into<UserKey>>(&self, start: K, end: K, seqno: SeqNo) -> u64
Deletes all keys in the range [start, end) by inserting a range tombstone.
This is much more efficient than deleting keys individually when removing a contiguous range of keys.
Returns the approximate size added to the memtable.
Returns 0 if start >= end (invalid interval is silently ignored).
This is a required method on the crate’s sealed tree types.
Provided Methods§
Sourcefn storage_stats(&self) -> Result<StorageStats>
fn storage_stats(&self) -> Result<StorageStats>
Returns a read-only snapshot of the tree’s on-disk storage footprint:
total used bytes, entry count, the average shape of a stored entry
(average key / value bytes), and an estimate of how many more
average-shaped entries fit in a byte budget (see
StorageStats::estimated_remaining_entries).
Computed from the live version’s table + blob metadata plus one
size-stat per live file; it never reads a data block. The default
implementation reports StorageStatus::Healthy;
the standard tree overrides it to report
StorageStatus::CompactionInProgress while a
compaction runs.
§Examples
use lsm_tree::{AbstractTree, Config};
let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
for i in 0..100u64 {
tree.insert(format!("key{i:04}"), "value", i);
}
tree.flush_active_memtable(0)?;
let stats = tree.storage_stats()?;
assert_eq!(stats.item_count, 100);
// Roughly how many more average-shaped entries fit in another 1 MiB.
let _headroom = stats.estimated_remaining_entries(1024 * 1024);§Errors
Returns an error if a live file’s size cannot be stat-ed.
Sourcefn level_segment_stats(&self) -> Result<Vec<LevelStats>>
fn level_segment_stats(&self) -> Result<Vec<LevelStats>>
Per-LSM-level and per-segment size + entry-count stats, for tiering and erasure-coding placement decisions (which level / segment is large enough to demote, EC-encode, or migrate).
Cheap: derived from the live version’s metadata plus one file-size stat
per segment (no data-block scan). The per-level totals reconcile with
storage_stats: summed across levels they equal
the SST portion of StorageStats::used_bytes
and StorageStats::item_count.
§Examples
use lsm_tree::{AbstractTree, Config};
let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
for i in 0..100u32 {
tree.insert(format!("k{i:04}"), "v", 0);
}
tree.flush_active_memtable(0)?;
let levels = tree.level_segment_stats()?;
let total: u64 = levels.iter().map(|l| l.item_count).sum();
assert_eq!(total, tree.storage_stats()?.item_count);§Errors
Returns an error if a segment’s file size cannot be stat-ed.
Sourcefn compaction_debt(&self, strategy: &dyn CompactionStrategy) -> u64
fn compaction_debt(&self, strategy: &dyn CompactionStrategy) -> u64
Estimated bytes pending compaction under strategy: on-disk data above
its level’s target that must eventually be rewritten downward (a RocksDB
estimate-pending-compaction-bytes analog), a compaction-debt signal for a
scheduler / tiering consumer.
The strategy is supplied by the caller because the engine does not own a
configured compaction strategy (it is injected per compaction run); a
&dyn keeps this object-safe. Returns 0 for strategies without a
size-target notion of debt (FIFO, drop-range), or when the tree is at or
below its target shape. See
CompactionStrategy::pending_compaction_bytes.
Sourcefn write_backpressure(&self, _strategy: &dyn CompactionStrategy) -> Backpressure
fn write_backpressure(&self, _strategy: &dyn CompactionStrategy) -> Backpressure
Computed write-backpressure verdict from the live L0 table count and the
strategy’s pending-compaction bytes, against the configured
RuntimeConfig::backpressure
thresholds.
Advisory, mirroring write_admission: the caller
consults it and throttles in its own write loop (sleep suggested_delay
at Slowdown, pause at
Stop). The engine never blocks on it,
because it does not own the compaction that drains the debt — an internal
stall could deadlock the very thread that would compact.
The strategy is supplied by the caller for the same reason as
compaction_debt. Returns
Backpressure::None by default (no
thresholds configured).
Sourcefn write_admission(&self) -> Result<()>
fn write_admission(&self) -> Result<()>
Storage admission gate: Ok(()) if a write may proceed, or
Error::StorageFull if the tree is
over budget and should be treated as read-only.
Opt-in: returns Ok(()) unless
storage_admission_check
is enabled. The predicate is computed (not latched), so once space is
freed — the budget raised, a compaction reclaiming space, or disk freed —
the next call admits again with no restart.
Intended as a cheap pre-check the caller consults before applying a write batch. The footprint is cached per version, so the check is a constant-time read on the common path and only re-measures when a new version is installed (flush / compaction). Internal flush / compaction are never gated, so the engine can always reclaim.
§Errors
Error::StorageFull when the live footprint
plus reserved headroom exceeds the effective budget.
Sourcefn is_read_only(&self) -> bool
fn is_read_only(&self) -> bool
true when the admission gate is currently closed (see
write_admission). Convenience for callers that
want a boolean rather than a Result. Always false unless admission
control is enabled and the tree is over budget.
Only Error::StorageFull counts as
read-only: an unrelated admission error (e.g. an I/O failure while
measuring the footprint) is NOT an out-of-space condition and must not be
reported as one.
Sourcefn verify_checksum(&self) -> BlockVerifyReportwhere
Self: Sized,
fn verify_checksum(&self) -> BlockVerifyReportwhere
Self: Sized,
Proactively verifies every block’s XXH3 checksum across every SST in the tree’s current version — a scrubber for catching bit rot before it surfaces as a user-visible read failure (cron / scrub jobs).
Reports at block granularity and never aborts early. The returned
BlockVerifyReport records
block-corruption findings with (file, offset), while file-level errors
(e.g. BlockVerifyError::SstFileUnreadable)
carry the file only (no offset). It does not surface per-entry indices or
ECC-correction counts (when ECC-at-rest is enabled a within-budget corrupt
block may still be healed on read as a side effect of the scan, but the
report does not tally corrections).
Filesystems with native per-block integrity (ZFS, Btrfs, ReFS, S3 —
see Fs::capabilities) already detect
corruption on read; this scrub is the portable check for the rest.
Use Self::verify_checksum_with for parallelism / throttle control.
Sourcefn verify_checksum_with(&self, options: &VerifyOptions) -> BlockVerifyReportwhere
Self: Sized,
fn verify_checksum_with(&self, options: &VerifyOptions) -> BlockVerifyReportwhere
Self: Sized,
Like Self::verify_checksum but with configurable parallelism and
I/O throttle (see VerifyOptions).
Sourcefn flush(
&self,
_lock: &MutexGuard<'_, ()>,
seqno_threshold: SeqNo,
) -> Result<Option<u64>>
fn flush( &self, _lock: &MutexGuard<'_, ()>, seqno_threshold: SeqNo, ) -> Result<Option<u64>>
Synchronously flushes pending sealed memtables to tables.
Returns the sum of flushed memtable sizes that were flushed.
The function may not return a result, if nothing was flushed.
§Errors
Returns Err on an I/O error, or on a memtable residence-verification
failure under KvChecksumComputePoint::AtInsert:
a sealed memtable whose insert-time per-KV digests do not verify before
flush surfaces crate::Error::MemtableKvChecksumMismatch,
crate::Error::MemtableKvChecksumCorruptAlgorithm, or
crate::Error::InvalidTag (corrupt value_type).
Sourcefn iter(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>
fn iter( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>
Returns an iterator that scans through the entire tree.
Avoid using this function, or limit it as otherwise it may scan a lot of items.
Sourcefn stale_blob_bytes(&self) -> u64
fn stale_blob_bytes(&self) -> u64
Returns the disk space used by stale blobs.
Sourcefn flush_to_tables(
&self,
stream: impl Iterator<Item = Result<InternalValue>>,
) -> Result<Option<(Vec<Table>, Option<Vec<BlobFile>>)>>
fn flush_to_tables( &self, stream: impl Iterator<Item = Result<InternalValue>>, ) -> Result<Option<(Vec<Table>, Option<Vec<BlobFile>>)>>
Synchronously flushes a memtable to a table.
This method will not make the table immediately available,
use AbstractTree::register_tables for that.
§Errors
Will return Err if an IO error occurs.
Sourcefn get_highest_seqno(&self) -> Option<SeqNo>
fn get_highest_seqno(&self) -> Option<SeqNo>
Returns the highest sequence number.
Sourcefn len(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Result<usize>
fn len( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Result<usize>
Scans the entire tree, returning the number of items.
§Caution
This operation scans the entire tree: O(n) complexity!
Never, under any circumstances, use .len() == 0 to check
if the tree is empty, use Tree::is_empty instead.
§Examples
use lsm_tree::{AbstractTree, Config, Tree};
let folder = tempfile::tempdir()?;
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
assert_eq!(tree.len(0, None)?, 0);
tree.insert("1", "abc", 0);
tree.insert("3", "abc", 1);
tree.insert("5", "abc", 2);
assert_eq!(tree.len(3, None)?, 3);§Errors
Will return Err if an IO error occurs.
Sourcefn is_empty(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Result<bool>
fn is_empty( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Result<bool>
Returns true if the tree is empty.
This operation has O(log N) complexity.
§Examples
use lsm_tree::{AbstractTree, Config, Tree};
let tree = Config::new(folder, Default::default(), Default::default()).open()?;
assert!(tree.is_empty(0, None)?);
tree.insert("a", "abc", 0);
assert!(!tree.is_empty(1, None)?);§Errors
Will return Err if an IO error occurs.
Sourcefn first_key_value(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Option<IterGuardImpl>
fn first_key_value( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Option<IterGuardImpl>
Returns the first key-value pair in the tree. The key in this pair is the minimum key in the tree.
§Examples
let tree = Config::new(folder, Default::default(), Default::default()).open()?;
tree.insert("1", "abc", 0);
tree.insert("3", "abc", 1);
tree.insert("5", "abc", 2);
let key = tree.first_key_value(3, None).expect("item should exist").key()?;
assert_eq!(&*key, "1".as_bytes());§Errors
Will return Err if an IO error occurs.
Sourcefn last_key_value(
&self,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Option<IterGuardImpl>
fn last_key_value( &self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Option<IterGuardImpl>
Returns the last key-value pair in the tree. The key in this pair is the maximum key in the tree.
§Examples
tree.insert("1", "abc", 0);
tree.insert("3", "abc", 1);
tree.insert("5", "abc", 2);
let key = tree.last_key_value(3, None).expect("item should exist").key()?;
assert_eq!(&*key, "5".as_bytes());§Errors
Will return Err if an IO error occurs.
Sourcefn get_pinned<K: AsRef<[u8]>>(
&self,
key: K,
seqno: SeqNo,
) -> Result<Option<PinnableSlice>>
fn get_pinned<K: AsRef<[u8]>>( &self, key: K, seqno: SeqNo, ) -> Result<Option<PinnableSlice>>
Retrieves an item from the tree as a PinnableSlice.
When the value is backed by an on-disk data block, implementations
may return PinnableSlice::Pinned holding a reference to that block’s
decompressed buffer (avoiding a data copy). Memtable and blob-resolved
values use PinnableSlice::Owned. The default implementation always
returns Owned; only Tree overrides with the pinned path.
The existing AbstractTree::get method is unaffected.
§Examples
use lsm_tree::{AbstractTree, Config, Tree};
let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
tree.insert("a", "my_value", 0);
let item = tree.get_pinned("a", 1)?;
assert_eq!(item.as_ref().map(|v| v.as_ref()), Some("my_value".as_bytes()));§Errors
Will return Err if an IO error occurs.
Sourcefn contains_prefix<K: AsRef<[u8]>>(
&self,
prefix: K,
seqno: SeqNo,
index: Option<(Arc<Memtable>, SeqNo)>,
) -> Result<bool>
fn contains_prefix<K: AsRef<[u8]>>( &self, prefix: K, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>, ) -> Result<bool>
Returns true if the tree contains any key with the given prefix.
This is a convenience method that checks whether the corresponding
prefix iterator yields at least one item, while surfacing any IO
errors via the Result return type. Implementations may override
this method to provide a more efficient prefix-existence check.
§Examples
use lsm_tree::{AbstractTree, Config, Tree};
let tree = Config::new(folder, Default::default(), Default::default()).open()?;
assert!(!tree.contains_prefix("abc", 0, None)?);
tree.insert("abc:1", "value", 0);
assert!(tree.contains_prefix("abc", 1, None)?);
assert!(!tree.contains_prefix("xyz", 1, None)?);§Errors
Will return Err if an IO error occurs.
Sourcefn multi_get<K: AsRef<[u8]>>(
&self,
keys: impl IntoIterator<Item = K>,
seqno: SeqNo,
) -> Result<Vec<Option<UserValue>>>
fn multi_get<K: AsRef<[u8]>>( &self, keys: impl IntoIterator<Item = K>, seqno: SeqNo, ) -> Result<Vec<Option<UserValue>>>
Reads multiple keys from the tree.
Implementations may choose to perform all lookups against a single
version snapshot and acquire the version lock only once, which can be
more efficient than calling AbstractTree::get in a loop. The
default trait implementation, however, is a convenience wrapper that
simply calls AbstractTree::get for each key and therefore does not
guarantee a single-snapshot or single-lock acquisition. Optimized
implementations (such as Tree and BlobTree) provide the
single-snapshot/one-lock behavior.
§Examples
use lsm_tree::{AbstractTree, Config, Tree};
let tree = Config::new(folder, Default::default(), Default::default()).open()?;
tree.insert("a", "value_a", 0);
tree.insert("b", "value_b", 1);
let results = tree.multi_get(["a", "b", "c"], 2)?;
assert_eq!(results[0], Some("value_a".as_bytes().into()));
assert_eq!(results[1], Some("value_b".as_bytes().into()));
assert_eq!(results[2], None);§Errors
Will return Err if an IO error occurs.
Sourcefn try_insert<K: Into<UserKey>, V: Into<UserValue>>(
&self,
key: K,
value: V,
seqno: SeqNo,
) -> Result<(u64, u64)>
fn try_insert<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, value: V, seqno: SeqNo, ) -> Result<(u64, u64)>
Admission-gated insert: consults
write_admission first and declines with
Error::StorageFull when the tree is over
budget, otherwise inserts and returns the same (added_bytes, memtable_size) tuple. This is the write entry a space-aware caller uses
so over-budget writes are refused up front rather than failing a flush
later; bare insert stays infallible for callers that
do not opt into admission control.
§Errors
Error::StorageFull when the admission gate
is closed.
Sourcefn try_merge<K: Into<UserKey>, V: Into<UserValue>>(
&self,
key: K,
operand: V,
seqno: SeqNo,
) -> Result<(u64, u64)>
fn try_merge<K: Into<UserKey>, V: Into<UserValue>>( &self, key: K, operand: V, seqno: SeqNo, ) -> Result<(u64, u64)>
Admission-gated merge. See
try_insert.
§Errors
Error::StorageFull when the admission gate
is closed.
Sourcefn try_remove<K: Into<UserKey>>(
&self,
key: K,
seqno: SeqNo,
) -> Result<(u64, u64)>
fn try_remove<K: Into<UserKey>>( &self, key: K, seqno: SeqNo, ) -> Result<(u64, u64)>
Admission-gated remove. See
try_insert.
Note a tombstone is itself a write that consumes space; an over-budget tree refuses it too. Reclaim space via compaction (never gated) rather than relying on deletes when already read-only.
§Errors
Error::StorageFull when the admission gate
is closed.
Sourcefn try_remove_weak<K: Into<UserKey>>(
&self,
key: K,
seqno: SeqNo,
) -> Result<(u64, u64)>
fn try_remove_weak<K: Into<UserKey>>( &self, key: K, seqno: SeqNo, ) -> Result<(u64, u64)>
Admission-gated remove_weak. See
try_insert.
§Errors
Error::StorageFull when the admission gate
is closed.
Sourcefn try_remove_range<K: Into<UserKey>>(
&self,
start: K,
end: K,
seqno: SeqNo,
) -> Result<u64>
fn try_remove_range<K: Into<UserKey>>( &self, start: K, end: K, seqno: SeqNo, ) -> Result<u64>
Admission-gated remove_range. See
try_insert.
§Errors
Error::StorageFull when the admission gate
is closed.
Sourcefn remove_prefix<K: AsRef<[u8]>>(&self, prefix: K, seqno: SeqNo) -> u64
fn remove_prefix<K: AsRef<[u8]>>(&self, prefix: K, seqno: SeqNo) -> u64
Deletes all keys with the given prefix by inserting a range tombstone.
This is sugar over AbstractTree::remove_range using prefix bounds.
Returns the approximate size added to the memtable.
Returns 0 for empty prefixes or all-0xFF prefixes (cannot form valid half-open range).
Dyn Compatibility§
This trait is not dyn compatible.
In older versions of Rust, dyn compatibility was called "object safety".