lsm_tree/abstract_tree.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2024-present, fjall-rs
3// Copyright (c) 2026-present, Structured World Foundation
4
5use crate::tree::inner::{FlushGuard, VersionsWriteGuard};
6use crate::{
7 AnyTree, BlobTree, Config, Guard, InternalValue, KvPair, Memtable, SeqNo, TableId, Tree,
8 UserKey, UserValue,
9 iter_guard::{IterGuardImpl, SeekableGuardIter},
10 table::Table,
11 version::Version,
12 vlog::BlobFile,
13};
14use alloc::sync::Arc;
15#[cfg(not(feature = "std"))]
16use alloc::{boxed::Box, vec::Vec};
17use core::ops::RangeBounds;
18
19pub type RangeItem = crate::Result<KvPair>;
20
21type FlushToTablesResult = (Vec<Table>, Option<Vec<BlobFile>>);
22
23/// Summary of a checkpoint produced by
24/// [`AbstractTree::create_checkpoint`].
25///
26/// All byte counts are *logical* file sizes — hard links share the
27/// underlying inode storage, so a checkpoint's marginal disk usage is
28/// typically zero until the original files are compacted away.
29#[derive(Debug, Clone, Copy)]
30pub struct CheckpointInfo {
31 /// Number of SST files captured.
32 pub sst_files: usize,
33 /// Number of blob (value-log) files captured. Always `0` for a
34 /// standard [`Tree`].
35 pub blob_files: usize,
36 /// Sum of the logical file sizes of every captured SST + blob.
37 pub total_bytes: u64,
38 /// The version ID embedded in the checkpoint's `current` pointer.
39 pub version_id: u64,
40 /// Lower-bound visible-seqno watermark for the snapshot.
41 ///
42 /// Captured from the tree's `visible_seqno` generator BEFORE
43 /// [`AbstractTree::current_version`]. Following the standard
44 /// "lowest-excluded" watermark convention, `info.seqno = N` means
45 /// every record with `seqno < N` was committed at sample time and
46 /// is therefore guaranteed to be present in the snapshot. Records
47 /// with `seqno == N` may or may not be included (writers can hold
48 /// a record in the memtable for an instant before publishing the
49 /// next watermark); records with `seqno > N` may also be present
50 /// (writers can advance the counter between sample and version
51 /// snapshot, and those keys still land in the captured memtable).
52 ///
53 /// PITR consumers MUST use `seqno < info.seqno` as the inclusion
54 /// gate. Using `<=` (treating this as a max-included ceiling)
55 /// could move a recovery cutoff past data still needed from WAL
56 /// or replication; the field is a strict lower-exclusive watermark,
57 /// not a max-included ceiling.
58 pub seqno: SeqNo,
59}
60
61// Sealed on purpose: this trait is still public as a consumer-side bound
62// (`&impl AbstractTree`), but external implementations are no longer part of
63// the supported extension surface. Internal flush/version hooks keep evolving
64// with crate-owned tree types and must not create downstream semver traps.
65//
66// `sealed` stays `pub` only so sibling modules in this crate can write
67// `crate::abstract_tree::sealed::Sealed` in their impls. The parent module
68// `abstract_tree` is not publicly exported from the crate root, so downstream
69// crates still cannot name or implement this trait.
70pub mod sealed {
71 pub trait Sealed {}
72}
73
74/// Generic Tree API
75#[enum_dispatch::enum_dispatch]
76pub trait AbstractTree: sealed::Sealed {
77 /// Debug method for tracing the MVCC history of a key.
78 #[doc(hidden)]
79 fn print_trace(&self, key: &[u8]) -> crate::Result<()>;
80
81 /// Returns the number of cached table file descriptors.
82 fn table_file_cache_size(&self) -> usize;
83
84 // TODO: remove
85 #[doc(hidden)]
86 fn version_memtable_size_sum(&self) -> u64 {
87 self.get_version_history_lock().memtable_size_sum()
88 }
89
90 #[doc(hidden)]
91 fn next_table_id(&self) -> TableId;
92
93 #[doc(hidden)]
94 fn id(&self) -> crate::TreeId;
95
96 /// Like [`AbstractTree::get`], but returns the actual internal entry, not just the user value.
97 ///
98 /// Used in tests.
99 #[doc(hidden)]
100 fn get_internal_entry(&self, key: &[u8], seqno: SeqNo) -> crate::Result<Option<InternalValue>>;
101
102 #[doc(hidden)]
103 fn current_version(&self) -> Version;
104
105 /// Returns a read-only snapshot of the tree's on-disk storage footprint:
106 /// total used bytes, entry count, the average shape of a stored entry
107 /// (average key / value bytes), and an estimate of how many more
108 /// average-shaped entries fit in a byte budget (see
109 /// [`StorageStats::estimated_remaining_entries`](crate::StorageStats::estimated_remaining_entries)).
110 ///
111 /// Computed from the live version's table + blob metadata plus one
112 /// size-stat per live file; it never reads a data block. The default
113 /// implementation reports [`StorageStatus::Healthy`](crate::StorageStatus::Healthy);
114 /// the standard tree overrides it to report
115 /// [`StorageStatus::CompactionInProgress`](crate::StorageStatus::CompactionInProgress) while a
116 /// compaction runs.
117 ///
118 /// # Examples
119 ///
120 /// ```
121 /// # use lsm_tree::Error as TreeError;
122 /// use lsm_tree::{AbstractTree, Config};
123 ///
124 /// let folder = tempfile::tempdir()?;
125 /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
126 ///
127 /// for i in 0..100u64 {
128 /// tree.insert(format!("key{i:04}"), "value", i);
129 /// }
130 /// tree.flush_active_memtable(0)?;
131 ///
132 /// let stats = tree.storage_stats()?;
133 /// assert_eq!(stats.item_count, 100);
134 /// // Roughly how many more average-shaped entries fit in another 1 MiB.
135 /// let _headroom = stats.estimated_remaining_entries(1024 * 1024);
136 /// #
137 /// # Ok::<(), TreeError>(())
138 /// ```
139 ///
140 /// # Errors
141 ///
142 /// Returns an error if a live file's size cannot be stat-ed.
143 fn storage_stats(&self) -> crate::Result<crate::StorageStats> {
144 crate::storage_stats::compute_storage_stats(&self.current_version(), false, true)
145 }
146
147 /// Per-LSM-level and per-segment size + entry-count stats, for tiering and
148 /// erasure-coding placement decisions (which level / segment is large enough
149 /// to demote, EC-encode, or migrate).
150 ///
151 /// Cheap: derived from the live version's metadata plus one file-size stat
152 /// per segment (no data-block scan). The per-level totals reconcile with
153 /// [`storage_stats`](Self::storage_stats): summed across levels they equal
154 /// the SST portion of [`StorageStats::used_bytes`](crate::StorageStats::used_bytes)
155 /// and [`StorageStats::item_count`](crate::StorageStats::item_count).
156 ///
157 /// # Examples
158 ///
159 /// ```
160 /// # use lsm_tree::Error as TreeError;
161 /// use lsm_tree::{AbstractTree, Config};
162 ///
163 /// let folder = tempfile::tempdir()?;
164 /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
165 /// for i in 0..100u32 {
166 /// tree.insert(format!("k{i:04}"), "v", 0);
167 /// }
168 /// tree.flush_active_memtable(0)?;
169 ///
170 /// let levels = tree.level_segment_stats()?;
171 /// let total: u64 = levels.iter().map(|l| l.item_count).sum();
172 /// assert_eq!(total, tree.storage_stats()?.item_count);
173 /// #
174 /// # Ok::<(), TreeError>(())
175 /// ```
176 ///
177 /// # Errors
178 ///
179 /// Returns an error if a segment's file size cannot be stat-ed.
180 fn level_segment_stats(&self) -> crate::Result<Vec<crate::LevelStats>> {
181 crate::storage_stats::compute_level_segment_stats(&self.current_version())
182 }
183
184 /// Estimated bytes pending compaction under `strategy`: on-disk data above
185 /// its level's target that must eventually be rewritten downward (a `RocksDB`
186 /// `estimate-pending-compaction-bytes` analog), a compaction-debt signal for a
187 /// scheduler / tiering consumer.
188 ///
189 /// The strategy is supplied by the caller because the engine does not own a
190 /// configured compaction strategy (it is injected per compaction run); a
191 /// `&dyn` keeps this object-safe. Returns `0` for strategies without a
192 /// size-target notion of debt (FIFO, drop-range), or when the tree is at or
193 /// below its target shape. See
194 /// [`CompactionStrategy::pending_compaction_bytes`](crate::compaction::CompactionStrategy::pending_compaction_bytes).
195 fn compaction_debt(&self, strategy: &dyn crate::compaction::CompactionStrategy) -> u64 {
196 strategy.pending_compaction_bytes(&self.current_version())
197 }
198
199 /// Computed write-backpressure verdict from the live L0 table count and the
200 /// strategy's pending-compaction bytes, against the configured
201 /// [`RuntimeConfig::backpressure`](crate::runtime_config::RuntimeConfig)
202 /// thresholds.
203 ///
204 /// Advisory, mirroring [`write_admission`](Self::write_admission): the caller
205 /// consults it and throttles in its own write loop (sleep `suggested_delay`
206 /// at [`Slowdown`](crate::Backpressure::Slowdown), pause at
207 /// [`Stop`](crate::Backpressure::Stop)). The engine never blocks on it,
208 /// because it does not own the compaction that drains the debt — an internal
209 /// stall could deadlock the very thread that would compact.
210 ///
211 /// The strategy is supplied by the caller for the same reason as
212 /// [`compaction_debt`](Self::compaction_debt). Returns
213 /// [`Backpressure::None`](crate::Backpressure::None) by default (no
214 /// thresholds configured).
215 fn write_backpressure(
216 &self,
217 _strategy: &dyn crate::compaction::CompactionStrategy,
218 ) -> crate::Backpressure {
219 crate::Backpressure::None
220 }
221
222 /// Storage admission gate: `Ok(())` if a write may proceed, or
223 /// [`Error::StorageFull`](crate::Error::StorageFull) if the tree is
224 /// over budget and should be treated as read-only.
225 ///
226 /// Opt-in: returns `Ok(())` unless
227 /// [`storage_admission_check`](crate::runtime_config::RuntimeConfig::storage_admission_check)
228 /// is enabled. The predicate is computed (not latched), so once space is
229 /// freed — the budget raised, a compaction reclaiming space, or disk freed —
230 /// the next call admits again with no restart.
231 ///
232 /// Intended as a cheap pre-check the caller consults before applying a
233 /// write batch. The footprint is cached per version, so the check is a
234 /// constant-time read on the common path and only re-measures when a new
235 /// version is installed (flush / compaction). Internal flush / compaction
236 /// are never gated, so the engine can always reclaim.
237 ///
238 /// # Errors
239 ///
240 /// [`Error::StorageFull`](crate::Error::StorageFull) when the live footprint
241 /// plus reserved headroom exceeds the effective budget.
242 fn write_admission(&self) -> crate::Result<()> {
243 Ok(())
244 }
245
246 /// `true` when the admission gate is currently closed (see
247 /// [`write_admission`](Self::write_admission)). Convenience for callers that
248 /// want a boolean rather than a `Result`. Always `false` unless admission
249 /// control is enabled and the tree is over budget.
250 ///
251 /// Only [`Error::StorageFull`](crate::Error::StorageFull) counts as
252 /// read-only: an unrelated admission error (e.g. an I/O failure while
253 /// measuring the footprint) is NOT an out-of-space condition and must not be
254 /// reported as one.
255 fn is_read_only(&self) -> bool {
256 matches!(
257 self.write_admission(),
258 Err(crate::Error::StorageFull { .. })
259 )
260 }
261
262 /// Proactively verifies every block's XXH3 checksum across every SST in
263 /// the tree's current version — a scrubber for catching bit rot before it
264 /// surfaces as a user-visible read failure (cron / scrub jobs).
265 ///
266 /// Reports at block granularity and never aborts early. The returned
267 /// [`BlockVerifyReport`](crate::verify::BlockVerifyReport) records
268 /// block-corruption findings with `(file, offset)`, while file-level errors
269 /// (e.g. [`BlockVerifyError::SstFileUnreadable`](crate::verify::BlockVerifyError::SstFileUnreadable))
270 /// carry the file only (no offset). It does not surface per-entry indices or
271 /// ECC-correction counts (when ECC-at-rest is enabled a within-budget corrupt
272 /// block may still be healed on read as a side effect of the scan, but the
273 /// report does not tally corrections).
274 ///
275 /// Filesystems with native per-block integrity (ZFS, Btrfs, `ReFS`, S3 —
276 /// see [`Fs::capabilities`](crate::fs::Fs::capabilities)) already detect
277 /// corruption on read; this scrub is the portable check for the rest.
278 ///
279 /// Use [`Self::verify_checksum_with`] for parallelism / throttle control.
280 #[cfg(feature = "std")]
281 fn verify_checksum(&self) -> crate::verify::BlockVerifyReport
282 where
283 Self: Sized,
284 {
285 crate::verify::verify_block_checksums(self)
286 }
287
288 /// Like [`Self::verify_checksum`] but with configurable parallelism and
289 /// I/O throttle (see [`VerifyOptions`](crate::verify::VerifyOptions)).
290 #[cfg(feature = "std")]
291 fn verify_checksum_with(
292 &self,
293 options: &crate::verify::VerifyOptions,
294 ) -> crate::verify::BlockVerifyReport
295 where
296 Self: Sized,
297 {
298 crate::verify::verify_block_checksums_with(self, options)
299 }
300
301 #[doc(hidden)]
302 fn get_version_history_lock(&self) -> VersionsWriteGuard<'_>;
303
304 /// Creates a hard-linked checkpoint of the tree's on-disk state in
305 /// `target_path` for point-in-time recovery (PITR) backup.
306 ///
307 /// The checkpoint is a fully functional tree that can be opened
308 /// independently via [`Config::open`](crate::Config::open). For the
309 /// common single-filesystem case all SST files (and blob files, for
310 /// [`BlobTree`]) are hard-linked rather than copied, so the operation
311 /// is O(1) per file and consumes zero additional disk space until the
312 /// original files are compacted away — at which point the inode is
313 /// kept alive by the checkpoint link.
314 ///
315 /// # Cross-filesystem / cross-backend fall-back
316 ///
317 /// When a source file lives on a different filesystem than the
318 /// checkpoint target — e.g. an SST routed to a hot tier via
319 /// [`level_routes`](crate::Config::level_routes) on a separate volume,
320 /// or a backup directory on a foreign mount — the hard link cannot
321 /// be created (Unix `EXDEV`). In that case the checkpoint silently
322 /// falls back to a streamed byte copy, which:
323 ///
324 /// - takes time linear in the file size instead of O(1), and
325 /// - consumes disk space equal to the copied bytes on the target
326 /// volume (no inode sharing across filesystems).
327 ///
328 /// Each fall-back call emits one [`log::debug`] line (deliberately not
329 /// `warn`: a misconfigured tier could trigger this path once per SST
330 /// and per blob — thousands of times per snapshot — and per-file
331 /// warnings would drown real signal). Operators wanting hard-visibility
332 /// of unexpected full copies should enable debug logging on the `fs`
333 /// module or watch the `CheckpointInfo.total_bytes` figure (≫ inode
334 /// link cost means the fallback fired). The same `debug` policy applies
335 /// when source and target use entirely different [`Fs`](crate::fs::Fs)
336 /// backends (e.g. [`MemFs`](crate::fs::MemFs) → [`StdFs`](crate::fs::StdFs)
337 /// in tests).
338 ///
339 /// # Concurrency
340 ///
341 /// While the checkpoint is being built, compaction continues normally
342 /// but the physical removal of obsolete files is deferred until the
343 /// checkpoint hard-links are in place. This is implemented by an
344 /// internal reference-counted deletion gate; callers do not have to
345 /// pause compaction themselves.
346 ///
347 /// # Errors
348 ///
349 /// Returns an error if:
350 /// - the active memtable could not be flushed,
351 /// - `target_path` already exists (to prevent accidental overwrites),
352 /// - a hard link / copy fall-back could not be created, or
353 /// - the manifest / version pointer files could not be replicated.
354 ///
355 /// On error any partial checkpoint files are removed automatically
356 /// (best-effort) so callers can safely retry against the same path.
357 // std-only: checkpoint creation hard-links / copies files via std::fs.
358 #[cfg(feature = "std")]
359 fn create_checkpoint(&self, target_path: &crate::path::Path) -> crate::Result<CheckpointInfo>;
360
361 /// Seals the active memtable and flushes to table(s).
362 ///
363 /// If there are already other sealed memtables lined up, those will be flushed as well.
364 ///
365 /// Only used in tests.
366 #[doc(hidden)]
367 fn flush_active_memtable(&self, eviction_seqno: SeqNo) -> crate::Result<()> {
368 let lock = self.get_flush_lock();
369 self.rotate_memtable();
370 self.flush(&lock, eviction_seqno)?;
371 Ok(())
372 }
373
374 /// Synchronously flushes pending sealed memtables to tables.
375 ///
376 /// Returns the sum of flushed memtable sizes that were flushed.
377 ///
378 /// The function may not return a result, if nothing was flushed.
379 ///
380 /// # Errors
381 ///
382 /// Returns `Err` on an I/O error, or on a memtable residence-verification
383 /// failure under [`KvChecksumComputePoint::AtInsert`](crate::runtime_config::KvChecksumComputePoint::AtInsert):
384 /// a sealed memtable whose insert-time per-KV digests do not verify before
385 /// flush surfaces [`crate::Error::MemtableKvChecksumMismatch`],
386 /// [`crate::Error::MemtableKvChecksumCorruptAlgorithm`], or
387 /// [`crate::Error::InvalidTag`] (corrupt `value_type`).
388 fn flush(&self, _lock: &FlushGuard<'_>, seqno_threshold: SeqNo) -> crate::Result<Option<u64>> {
389 use crate::{
390 compaction::stream::CompactionStream, merge::Merger, range_tombstone::RangeTombstone,
391 };
392
393 let version_history = self.get_version_history_lock();
394 let latest = version_history.latest_version();
395
396 if latest.sealed_memtables.len() == 0 {
397 return Ok(None);
398 }
399
400 let sealed_ids = latest
401 .sealed_memtables
402 .iter()
403 .map(|mt| mt.id)
404 .collect::<Vec<_>>();
405
406 let flushed_size = latest.sealed_memtables.iter().map(|mt| mt.size()).sum();
407
408 // AtInsert residence check: verify each sealed memtable's insert-time
409 // per-KV digests against a recompute over the entries' current bytes
410 // before writing them out. A divergence means an entry was corrupted
411 // (a RAM bit-flip) while it sat in the memtable. Memtables with no
412 // insert digests (the default) return immediately without walking.
413 //
414 // This is a SEPARATE pass over the as-inserted memtable entries, not
415 // fused into the writer's per-KV footer encode, and deliberately so:
416 // the footer digest is computed over POST-merge / post-seqno-filter
417 // bytes (the CompactionStream below applies the merge operator), so a
418 // merge operator's combined value differs from any single inserted
419 // value. Comparing the carried insert digest against the footer digest
420 // would false-positive on every legitimate merge. Residence corruption
421 // is a property of what was inserted (pre-merge); it must be checked
422 // here, against the raw memtable entries.
423 for mt in latest.sealed_memtables.iter() {
424 mt.verify_kv_residence()?;
425 }
426
427 // Collect range tombstones from sealed memtables
428 let mut range_tombstones: Vec<RangeTombstone> = Vec::new();
429 for mt in latest.sealed_memtables.iter() {
430 range_tombstones.extend(mt.range_tombstones_sorted());
431 }
432 range_tombstones
433 .sort_by(|a, b| a.cmp_with_comparator(b, self.tree_config().comparator.as_ref()));
434 range_tombstones.dedup();
435
436 let merger = Merger::new(
437 latest
438 .sealed_memtables
439 .iter()
440 .map(|mt| mt.iter().map(Ok))
441 .collect::<Vec<_>>(),
442 self.tree_config().comparator.clone(),
443 );
444 // RT suppression is not needed here: flush writes both entries and RTs
445 // to the output tables. Suppression happens at read time, not write time.
446 let stream = CompactionStream::new(merger, seqno_threshold)
447 .with_merge_operator(self.tree_config().merge_operator.clone());
448
449 drop(version_history);
450
451 // Clone needed: flush_to_tables_with_rt consumes the Vec, but on the
452 // RT-only path (no KV data, tables.is_empty()) we re-insert RTs into the
453 // active memtable. Flush is infrequent and RT count is small.
454 if let Some((tables, blob_files)) =
455 self.flush_to_tables_with_rt(stream, range_tombstones.clone())?
456 {
457 // If no tables were produced (RT-only memtable), re-insert RTs
458 // into active memtable so they aren't lost
459 if tables.is_empty() && !range_tombstones.is_empty() {
460 let active = self.active_memtable();
461 for rt in &range_tombstones {
462 let _ =
463 active.insert_range_tombstone(rt.start.clone(), rt.end.clone(), rt.seqno);
464 }
465 }
466
467 self.register_tables(
468 &tables,
469 blob_files.as_deref(),
470 None,
471 &sealed_ids,
472 seqno_threshold,
473 )?;
474 }
475
476 Ok(Some(flushed_size))
477 }
478
479 /// Returns an iterator that scans through the entire tree.
480 ///
481 /// Avoid using this function, or limit it as otherwise it may scan a lot of items.
482 fn iter(
483 &self,
484 seqno: SeqNo,
485 index: Option<(Arc<Memtable>, SeqNo)>,
486 ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static> {
487 self.range::<&[u8], _>(.., seqno, index)
488 }
489
490 /// Returns an iterator over a prefixed set of items.
491 ///
492 /// Avoid using an empty prefix as it may scan a lot of items (unless limited).
493 fn prefix<K: AsRef<[u8]>>(
494 &self,
495 prefix: K,
496 seqno: SeqNo,
497 index: Option<(Arc<Memtable>, SeqNo)>,
498 ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>;
499
500 /// Returns an iterator over a range of items.
501 ///
502 /// Avoid using full or unbounded ranges as they may scan a lot of items (unless limited).
503 fn range<K: AsRef<[u8]>, R: RangeBounds<K>>(
504 &self,
505 range: R,
506 seqno: SeqNo,
507 index: Option<(Arc<Memtable>, SeqNo)>,
508 ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>;
509
510 /// Returns a range iterator that can reposition (seek) in place without
511 /// reopening its per-SST readers.
512 ///
513 /// Unlike [`range`](Self::range)'s boxed `DoubleEndedIterator`, the returned
514 /// [`SeekableGuardIter`] additionally exposes `seek_to` / `seek_to_for_prev`,
515 /// so a consumer can jump a live iterator to any key (`RocksDB` `Seek` /
516 /// `SeekForPrev`) — enabling data-dependent scans (joins, skip-scan) without
517 /// reopening per-SST readers per jump.
518 fn range_seekable<K: AsRef<[u8]>, R: RangeBounds<K>>(
519 &self,
520 range: R,
521 seqno: SeqNo,
522 index: Option<(Arc<Memtable>, SeqNo)>,
523 ) -> Box<dyn SeekableGuardIter + 'static>;
524
525 /// Scans a sequence of disjoint, ascending key sub-intervals, reusing one set
526 /// of per-SST readers across all of them.
527 ///
528 /// The per-SST setup is paid once (when the underlying seekable iterator is
529 /// opened); each interval is served by repositioning that iterator. This
530 /// amortizes the setup across `N` intervals instead of paying it per
531 /// interval, so multi-interval scan throughput scales with the total rows
532 /// returned rather than the interval count.
533 ///
534 /// The interval source is pulled lazily, so intervals may be produced on
535 /// demand (e.g. computed from rows already returned).
536 fn batch_range_scan<K: AsRef<[u8]>, R: RangeBounds<K> + 'static, I: IntoIterator<Item = R>>(
537 &self,
538 intervals: I,
539 seqno: SeqNo,
540 index: Option<(Arc<Memtable>, SeqNo)>,
541 ) -> Box<dyn Iterator<Item = IterGuardImpl> + Send + 'static>
542 where
543 I::IntoIter: Send + 'static;
544
545 /// Returns the approximate number of tombstones in the tree.
546 fn tombstone_count(&self) -> u64;
547
548 /// Returns the approximate number of weak tombstones (single deletes) in the tree.
549 fn weak_tombstone_count(&self) -> u64;
550
551 /// Returns the approximate number of values reclaimable once weak tombstones can be GC'd.
552 fn weak_tombstone_reclaimable_count(&self) -> u64;
553
554 /// Drops tables that are fully contained in a given range.
555 ///
556 /// Accepts any `RangeBounds`, including unbounded or exclusive endpoints.
557 /// If the normalized lower bound is greater than the upper bound, the
558 /// method returns without performing any work.
559 ///
560 /// # Errors
561 ///
562 /// Will return `Err` only if an IO error occurs.
563 fn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(&self, range: R) -> crate::Result<()>;
564
565 /// Drops all tables and clears all memtables atomically.
566 ///
567 /// # Errors
568 ///
569 /// Will return `Err` only if an IO error occurs.
570 fn clear(&self) -> crate::Result<()>;
571
572 /// Performs major compaction, blocking the caller until it's done.
573 ///
574 /// Returns a [`crate::compaction::CompactionResult`] describing what action was taken.
575 ///
576 /// # Garbage-collection / merge-fold watermark (`seqno_threshold`)
577 ///
578 /// `seqno_threshold` is the MVCC garbage-collection watermark: the engine may
579 /// collapse history that no snapshot reading at a seqno `< seqno_threshold`
580 /// can still observe. Concretely, only entries whose seqno is `< seqno_threshold`
581 /// are eligible for:
582 ///
583 /// - dropping shadowed versions / GC-ing tombstones, and
584 /// - **folding merge operands** via the [`crate::MergeOperator`]: a key written
585 /// only through [`Self::merge`] (no base value) accumulates one operand per
586 /// call, and reads re-apply the whole chain (`O(operands)` per read) until
587 /// compaction folds it. Folding a chain into a single value is only
588 /// MVCC-safe when no live snapshot reads *between* the operands, which is
589 /// exactly what `seqno_threshold` certifies.
590 ///
591 /// The engine does **not** track snapshots (unlike a `RocksDB`-style
592 /// snapshot list); the caller owns snapshot lifecycle and must supply this
593 /// watermark:
594 ///
595 /// - To fold/GC everything (no active snapshots), pass a value **above every
596 /// live seqno** (e.g. the next value from the [`crate::SequenceNumberCounter`]).
597 /// - With outstanding snapshots, pass the **oldest** snapshot's seqno so their
598 /// reads stay correct.
599 /// - `seqno_threshold == 0` certifies nothing as collapsible, so **no folding
600 /// or GC happens** — `major_compact(target, 0)` only restructures tables and
601 /// leaves a merge-only key's full operand chain intact.
602 ///
603 /// # Errors
604 ///
605 /// Will return `Err` if an IO error occurs.
606 fn major_compact(
607 &self,
608 target_size: u64,
609 seqno_threshold: SeqNo,
610 ) -> crate::Result<crate::compaction::CompactionResult>;
611
612 /// Returns the disk space used by stale blobs.
613 fn stale_blob_bytes(&self) -> u64 {
614 0
615 }
616
617 /// Gets the space usage of all filters in the tree.
618 ///
619 /// May not correspond to the actual memory size because filter blocks may be paged out.
620 fn filter_size(&self) -> u64;
621
622 /// Gets the memory usage of all pinned filters in the tree.
623 fn pinned_filter_size(&self) -> usize;
624
625 /// Gets the memory usage of all pinned index blocks in the tree.
626 fn pinned_block_index_size(&self) -> usize;
627
628 /// Gets the length of the version free list.
629 fn version_free_list_len(&self) -> usize;
630
631 /// Returns the metrics structure.
632 #[cfg(feature = "metrics")]
633 fn metrics(&self) -> &Arc<crate::Metrics>;
634
635 /// A point-in-time [`CacheStats`](crate::CacheStats) snapshot of block-cache
636 /// effectiveness (cumulative hit / miss counts and rate) and occupancy
637 /// (current size against capacity).
638 ///
639 /// The stable, owned observability view over the block cache, so a consumer
640 /// can read cache health without holding the mutable
641 /// [`metrics`](Self::metrics) handle. Counts are cumulative since process
642 /// start; derive a rate over an interval from the delta between two polls.
643 #[cfg(feature = "metrics")]
644 fn cache_stats(&self) -> crate::CacheStats;
645
646 /// Acquires the flush lock which is required to call [`Tree::flush`].
647 fn get_flush_lock(&self) -> FlushGuard<'_>;
648
649 /// Synchronously flushes a memtable to a table.
650 ///
651 /// This method will not make the table immediately available,
652 /// use [`AbstractTree::register_tables`] for that.
653 ///
654 /// # Errors
655 ///
656 /// Will return `Err` if an IO error occurs.
657 fn flush_to_tables(
658 &self,
659 stream: impl Iterator<Item = crate::Result<InternalValue>>,
660 ) -> crate::Result<Option<FlushToTablesResult>> {
661 self.flush_to_tables_with_rt(stream, Vec::new())
662 }
663
664 /// Like [`AbstractTree::flush_to_tables`], but also writes range tombstones.
665 ///
666 /// This is an internal extension hook on the crate's sealed tree types and
667 /// is hidden from generated documentation.
668 ///
669 /// # Errors
670 ///
671 /// Will return `Err` if an IO error occurs.
672 #[doc(hidden)]
673 fn flush_to_tables_with_rt(
674 &self,
675 stream: impl Iterator<Item = crate::Result<InternalValue>>,
676 range_tombstones: Vec<crate::range_tombstone::RangeTombstone>,
677 ) -> crate::Result<Option<FlushToTablesResult>>;
678
679 /// Atomically registers flushed tables into the tree, removing their associated sealed memtables.
680 ///
681 /// # Errors
682 ///
683 /// Will return `Err` if an IO error occurs.
684 fn register_tables(
685 &self,
686 tables: &[Table],
687 blob_files: Option<&[BlobFile]>,
688 frag_map: Option<crate::blob_tree::FragmentationMap>,
689 sealed_memtables_to_delete: &[crate::tree::inner::MemtableId],
690 gc_watermark: SeqNo,
691 ) -> crate::Result<()>;
692
693 /// Clears the active memtable atomically.
694 fn clear_active_memtable(&self);
695
696 /// Returns the number of sealed memtables.
697 fn sealed_memtable_count(&self) -> usize;
698
699 /// Performs compaction on the tree's levels, blocking the caller until it's done.
700 ///
701 /// Returns a [`crate::compaction::CompactionResult`] describing what action was taken.
702 ///
703 /// # Errors
704 ///
705 /// Will return `Err` if an IO error occurs.
706 fn compact(
707 &self,
708 strategy: Arc<dyn crate::compaction::CompactionStrategy>,
709 seqno_threshold: SeqNo,
710 ) -> crate::Result<crate::compaction::CompactionResult>;
711
712 /// Returns the next table's ID.
713 fn get_next_table_id(&self) -> TableId;
714
715 /// Returns the tree config.
716 fn tree_config(&self) -> &Config;
717
718 /// Returns the highest sequence number.
719 fn get_highest_seqno(&self) -> Option<SeqNo> {
720 let memtable_seqno = self.get_highest_memtable_seqno();
721 let table_seqno = self.get_highest_persisted_seqno();
722 memtable_seqno.max(table_seqno)
723 }
724
725 /// Returns the active memtable.
726 fn active_memtable(&self) -> Arc<Memtable>;
727
728 /// Returns the tree type.
729 fn tree_type(&self) -> crate::TreeType {
730 if self.tree_config().kv_separation_opts.is_some() {
731 crate::TreeType::Blob
732 } else {
733 crate::TreeType::Standard
734 }
735 }
736
737 /// Seals the active memtable.
738 fn rotate_memtable(&self) -> Option<Arc<Memtable>>;
739
740 /// Returns the number of tables currently in the tree.
741 fn table_count(&self) -> usize;
742
743 /// Returns the number of tables in `levels[idx]`.
744 ///
745 /// Returns `None` if the level does not exist (if idx >= 7).
746 fn level_table_count(&self, idx: usize) -> Option<usize>;
747
748 /// Returns the number of disjoint runs in L0.
749 ///
750 /// Can be used to determine whether to write stall.
751 fn l0_run_count(&self) -> usize;
752
753 /// Returns the number of blob files currently in the tree.
754 fn blob_file_count(&self) -> usize;
755
756 /// Approximates the number of items in the tree.
757 fn approximate_len(&self) -> usize;
758
759 /// Returns the disk space usage.
760 fn disk_space(&self) -> u64;
761
762 /// Estimates the on-disk bytes and entry count contained in `range` at
763 /// `seqno`, WITHOUT reading any data block.
764 ///
765 /// The estimate interpolates each overlapping SST's data-block offsets at
766 /// the range boundaries (block granularity) and adds the active + sealed
767 /// memtables' in-range share. For a KV-separated tree the per-SST blob
768 /// bytes are apportioned by the same in-range fraction (blob files are not
769 /// key-indexed, so a finer estimate is impossible without reading data).
770 /// Intended for query planning (split-point selection, cost-based join
771 /// ordering), not exact accounting; accuracy is typically within ~10-15%
772 /// on roughly-uniform data.
773 ///
774 /// # Examples
775 ///
776 /// ```
777 /// # use lsm_tree::Error as TreeError;
778 /// use lsm_tree::{AbstractTree, Config};
779 ///
780 /// let folder = tempfile::tempdir()?;
781 /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
782 /// for i in 0..100u32 {
783 /// tree.insert(format!("k{i:04}"), "value", 0);
784 /// }
785 /// tree.flush_active_memtable(0)?;
786 ///
787 /// // The full range covers every entry exactly once it is all flushed —
788 /// // estimated without reading a single data block.
789 /// let all = tree.approximate_range_stats::<&str, _>(.., 1)?;
790 /// assert_eq!(all.key_count, tree.approximate_len() as u64);
791 /// assert!(all.bytes > 0);
792 ///
793 /// // A range past every key is empty.
794 /// let none = tree.approximate_range_stats("zzzz".."zzzzz", 1)?;
795 /// assert_eq!(none.key_count, 0);
796 /// assert_eq!(none.bytes, 0);
797 /// #
798 /// # Ok::<(), TreeError>(())
799 /// ```
800 ///
801 /// # Errors
802 ///
803 /// Returns an error if a block index or table metadata read fails.
804 fn approximate_range_stats<K: AsRef<[u8]>, R: RangeBounds<K>>(
805 &self,
806 range: R,
807 seqno: SeqNo,
808 ) -> crate::Result<crate::ApproximateRangeStats>;
809
810 /// Estimates the row cardinality and selectivity of `range` at `seqno`,
811 /// WITHOUT reading any data block.
812 ///
813 /// Uses the per-data-block zone map (per-block row counts + key ranges) for a
814 /// block-granularity row count: every data block whose key range overlaps the
815 /// query contributes its recorded row count, plus the active + sealed
816 /// memtables' in-range counts. When a table has no zone map the row count
817 /// falls back to the byte-fraction estimate of [`approximate_range_stats`].
818 /// Selectivity is `rows / total_rows`, monotonic in predicate tightness, for
819 /// cost-based planning (join ordering, scan-vs-seek). Not exact accounting.
820 ///
821 /// [`approximate_range_stats`]: AbstractTree::approximate_range_stats
822 ///
823 /// # Examples
824 ///
825 /// ```
826 /// # use lsm_tree::Error as TreeError;
827 /// use lsm_tree::{AbstractTree, Config};
828 ///
829 /// let folder = tempfile::tempdir()?;
830 /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
831 /// for i in 0..100u32 {
832 /// tree.insert(format!("k{i:04}"), "v", 0);
833 /// }
834 /// tree.flush_active_memtable(0)?;
835 ///
836 /// // The full range covers every row; selectivity is 1.0.
837 /// let all = tree.approximate_range_cardinality::<&str, _>(.., 1)?;
838 /// assert_eq!(all.rows, tree.approximate_len() as u64);
839 /// assert!((all.selectivity - 1.0).abs() < 1e-9);
840 ///
841 /// // A tighter range selects fewer rows.
842 /// let part = tree.approximate_range_cardinality("k0000".."k0050", 1)?;
843 /// assert!(part.selectivity <= all.selectivity);
844 /// #
845 /// # Ok::<(), TreeError>(())
846 /// ```
847 ///
848 /// # Errors
849 ///
850 /// Returns an error if a block index, zone map, or table metadata read fails.
851 fn approximate_range_cardinality<K: AsRef<[u8]>, R: RangeBounds<K>>(
852 &self,
853 range: R,
854 seqno: SeqNo,
855 ) -> crate::Result<crate::RangeCardinality>;
856
857 /// Returns the highest sequence number of the active memtable.
858 fn get_highest_memtable_seqno(&self) -> Option<SeqNo>;
859
860 /// Returns the highest sequence number that is flushed to disk.
861 fn get_highest_persisted_seqno(&self) -> Option<SeqNo>;
862
863 /// Scans the entire tree, returning the number of items.
864 ///
865 /// ###### Caution
866 ///
867 /// This operation scans the entire tree: O(n) complexity!
868 ///
869 /// Never, under any circumstances, use .`len()` == 0 to check
870 /// if the tree is empty, use [`Tree::is_empty`] instead.
871 ///
872 /// # Examples
873 ///
874 /// ```
875 /// # use lsm_tree::Error as TreeError;
876 /// use lsm_tree::{AbstractTree, Config, Tree};
877 ///
878 /// let folder = tempfile::tempdir()?;
879 /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
880 ///
881 /// assert_eq!(tree.len(0, None)?, 0);
882 /// tree.insert("1", "abc", 0);
883 /// tree.insert("3", "abc", 1);
884 /// tree.insert("5", "abc", 2);
885 /// assert_eq!(tree.len(3, None)?, 3);
886 /// #
887 /// # Ok::<(), TreeError>(())
888 /// ```
889 ///
890 /// # Errors
891 ///
892 /// Will return `Err` if an IO error occurs.
893 fn len(&self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>) -> crate::Result<usize> {
894 let mut count = 0;
895
896 for item in self.iter(seqno, index) {
897 let _ = item.key()?;
898 count += 1;
899 }
900
901 Ok(count)
902 }
903
904 /// Returns `true` if the tree is empty.
905 ///
906 /// This operation has O(log N) complexity.
907 ///
908 /// # Examples
909 ///
910 /// ```
911 /// # let folder = tempfile::tempdir()?;
912 /// use lsm_tree::{AbstractTree, Config, Tree};
913 ///
914 /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
915 /// assert!(tree.is_empty(0, None)?);
916 ///
917 /// tree.insert("a", "abc", 0);
918 /// assert!(!tree.is_empty(1, None)?);
919 /// #
920 /// # Ok::<(), lsm_tree::Error>(())
921 /// ```
922 ///
923 /// # Errors
924 ///
925 /// Will return `Err` if an IO error occurs.
926 fn is_empty(&self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>) -> crate::Result<bool> {
927 Ok(self
928 .first_key_value(seqno, index)
929 .map(crate::Guard::key)
930 .transpose()?
931 .is_none())
932 }
933
934 /// Returns the first key-value pair in the tree.
935 /// The key in this pair is the minimum key in the tree.
936 ///
937 /// # Examples
938 ///
939 /// ```
940 /// # use lsm_tree::Error as TreeError;
941 /// # use lsm_tree::{AbstractTree, Config, Tree, Guard};
942 /// #
943 /// # let folder = tempfile::tempdir()?;
944 /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
945 ///
946 /// tree.insert("1", "abc", 0);
947 /// tree.insert("3", "abc", 1);
948 /// tree.insert("5", "abc", 2);
949 ///
950 /// let key = tree.first_key_value(3, None).expect("item should exist").key()?;
951 /// assert_eq!(&*key, "1".as_bytes());
952 /// #
953 /// # Ok::<(), TreeError>(())
954 /// ```
955 ///
956 /// # Errors
957 ///
958 /// Will return `Err` if an IO error occurs.
959 fn first_key_value(
960 &self,
961 seqno: SeqNo,
962 index: Option<(Arc<Memtable>, SeqNo)>,
963 ) -> Option<IterGuardImpl> {
964 self.iter(seqno, index).next()
965 }
966
967 /// Returns the last key-value pair in the tree.
968 /// The key in this pair is the maximum key in the tree.
969 ///
970 /// # Examples
971 ///
972 /// ```
973 /// # use lsm_tree::Error as TreeError;
974 /// # use lsm_tree::{AbstractTree, Config, Tree, Guard};
975 /// #
976 /// # let folder = tempfile::tempdir()?;
977 /// # let tree = Config::new(folder, Default::default(), Default::default()).open()?;
978 /// #
979 /// tree.insert("1", "abc", 0);
980 /// tree.insert("3", "abc", 1);
981 /// tree.insert("5", "abc", 2);
982 ///
983 /// let key = tree.last_key_value(3, None).expect("item should exist").key()?;
984 /// assert_eq!(&*key, "5".as_bytes());
985 /// #
986 /// # Ok::<(), TreeError>(())
987 /// ```
988 ///
989 /// # Errors
990 ///
991 /// Will return `Err` if an IO error occurs.
992 fn last_key_value(
993 &self,
994 seqno: SeqNo,
995 index: Option<(Arc<Memtable>, SeqNo)>,
996 ) -> Option<IterGuardImpl> {
997 self.iter(seqno, index).next_back()
998 }
999
1000 /// Returns the size of a value if it exists.
1001 ///
1002 /// # Examples
1003 ///
1004 /// ```
1005 /// # let folder = tempfile::tempdir()?;
1006 /// use lsm_tree::{AbstractTree, Config, Tree};
1007 ///
1008 /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1009 /// tree.insert("a", "my_value", 0);
1010 ///
1011 /// let size = tree.size_of("a", 1)?.unwrap_or_default();
1012 /// assert_eq!("my_value".len() as u32, size);
1013 ///
1014 /// let size = tree.size_of("b", 1)?.unwrap_or_default();
1015 /// assert_eq!(0, size);
1016 /// #
1017 /// # Ok::<(), lsm_tree::Error>(())
1018 /// ```
1019 ///
1020 /// # Errors
1021 ///
1022 /// Will return `Err` if an IO error occurs.
1023 fn size_of<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<Option<u32>>;
1024
1025 /// Retrieves an item from the tree.
1026 ///
1027 /// # Examples
1028 ///
1029 /// ```
1030 /// # let folder = tempfile::tempdir()?;
1031 /// use lsm_tree::{AbstractTree, Config, Tree};
1032 ///
1033 /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1034 /// tree.insert("a", "my_value", 0);
1035 ///
1036 /// let item = tree.get("a", 1)?;
1037 /// assert_eq!(Some("my_value".as_bytes().into()), item);
1038 /// #
1039 /// # Ok::<(), lsm_tree::Error>(())
1040 /// ```
1041 ///
1042 /// # Errors
1043 ///
1044 /// Will return `Err` if an IO error occurs.
1045 fn get<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<Option<UserValue>>;
1046
1047 /// Retrieves an item from the tree as a [`PinnableSlice`](crate::PinnableSlice).
1048 ///
1049 /// When the value is backed by an on-disk data block, implementations
1050 /// may return [`PinnableSlice::Pinned`](crate::PinnableSlice::Pinned) holding a reference to that block's
1051 /// decompressed buffer (avoiding a data copy). Memtable and blob-resolved
1052 /// values use [`PinnableSlice::Owned`](crate::PinnableSlice::Owned). The default implementation always
1053 /// returns `Owned`; only [`Tree`] overrides with the pinned path.
1054 ///
1055 /// The existing [`AbstractTree::get`] method is unaffected.
1056 ///
1057 /// # Examples
1058 ///
1059 /// ```
1060 /// # let folder = tempfile::tempdir()?;
1061 /// use lsm_tree::{AbstractTree, Config, Tree};
1062 ///
1063 /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
1064 /// tree.insert("a", "my_value", 0);
1065 ///
1066 /// let item = tree.get_pinned("a", 1)?;
1067 /// assert_eq!(item.as_ref().map(|v| v.as_ref()), Some("my_value".as_bytes()));
1068 /// #
1069 /// # Ok::<(), lsm_tree::Error>(())
1070 /// ```
1071 ///
1072 /// # Errors
1073 ///
1074 /// Will return `Err` if an IO error occurs.
1075 fn get_pinned<K: AsRef<[u8]>>(
1076 &self,
1077 key: K,
1078 seqno: SeqNo,
1079 ) -> crate::Result<Option<crate::PinnableSlice>> {
1080 // Default: delegate to get() and wrap as Owned
1081 self.get(key, seqno)
1082 .map(|opt| opt.map(crate::PinnableSlice::owned))
1083 }
1084
1085 /// Returns `true` if the tree contains the specified key.
1086 ///
1087 /// # Examples
1088 ///
1089 /// ```
1090 /// # let folder = tempfile::tempdir()?;
1091 /// # use lsm_tree::{AbstractTree, Config, Tree};
1092 /// #
1093 /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1094 /// assert!(!tree.contains_key("a", 0)?);
1095 ///
1096 /// tree.insert("a", "abc", 0);
1097 /// assert!(tree.contains_key("a", 1)?);
1098 /// #
1099 /// # Ok::<(), lsm_tree::Error>(())
1100 /// ```
1101 ///
1102 /// # Errors
1103 ///
1104 /// Will return `Err` if an IO error occurs.
1105 fn contains_key<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<bool> {
1106 self.get(key, seqno).map(|x| x.is_some())
1107 }
1108
1109 /// Returns `true` if the tree contains any key with the given prefix.
1110 ///
1111 /// This is a convenience method that checks whether the corresponding
1112 /// prefix iterator yields at least one item, while surfacing any IO
1113 /// errors via the `Result` return type. Implementations may override
1114 /// this method to provide a more efficient prefix-existence check.
1115 ///
1116 /// # Examples
1117 ///
1118 /// ```
1119 /// # let folder = tempfile::tempdir()?;
1120 /// use lsm_tree::{AbstractTree, Config, Tree};
1121 ///
1122 /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1123 /// assert!(!tree.contains_prefix("abc", 0, None)?);
1124 ///
1125 /// tree.insert("abc:1", "value", 0);
1126 /// assert!(tree.contains_prefix("abc", 1, None)?);
1127 /// assert!(!tree.contains_prefix("xyz", 1, None)?);
1128 /// #
1129 /// # Ok::<(), lsm_tree::Error>(())
1130 /// ```
1131 ///
1132 /// # Errors
1133 ///
1134 /// Will return `Err` if an IO error occurs.
1135 fn contains_prefix<K: AsRef<[u8]>>(
1136 &self,
1137 prefix: K,
1138 seqno: SeqNo,
1139 index: Option<(Arc<Memtable>, SeqNo)>,
1140 ) -> crate::Result<bool> {
1141 match self.prefix(prefix, seqno, index).next() {
1142 Some(guard) => guard.key().map(|_| true),
1143 None => Ok(false),
1144 }
1145 }
1146
1147 /// Reads multiple keys from the tree.
1148 ///
1149 /// Implementations may choose to perform all lookups against a single
1150 /// version snapshot and acquire the version lock only once, which can be
1151 /// more efficient than calling [`AbstractTree::get`] in a loop. The
1152 /// default trait implementation, however, is a convenience wrapper that
1153 /// simply calls [`AbstractTree::get`] for each key and therefore does not
1154 /// guarantee a single-snapshot or single-lock acquisition. Optimized
1155 /// implementations (such as [`Tree`] and [`BlobTree`]) provide the
1156 /// single-snapshot/one-lock behavior.
1157 ///
1158 /// # Examples
1159 ///
1160 /// ```
1161 /// # let folder = tempfile::tempdir()?;
1162 /// use lsm_tree::{AbstractTree, Config, Tree};
1163 ///
1164 /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1165 /// tree.insert("a", "value_a", 0);
1166 /// tree.insert("b", "value_b", 1);
1167 ///
1168 /// let results = tree.multi_get(["a", "b", "c"], 2)?;
1169 /// assert_eq!(results[0], Some("value_a".as_bytes().into()));
1170 /// assert_eq!(results[1], Some("value_b".as_bytes().into()));
1171 /// assert_eq!(results[2], None);
1172 /// #
1173 /// # Ok::<(), lsm_tree::Error>(())
1174 /// ```
1175 ///
1176 /// # Errors
1177 ///
1178 /// Will return `Err` if an IO error occurs.
1179 fn multi_get<K: AsRef<[u8]>>(
1180 &self,
1181 keys: impl IntoIterator<Item = K>,
1182 seqno: SeqNo,
1183 ) -> crate::Result<Vec<Option<UserValue>>> {
1184 keys.into_iter().map(|key| self.get(key, seqno)).collect()
1185 }
1186
1187 /// Applies a [`WriteBatch`](crate::WriteBatch) with the given sequence number.
1188 ///
1189 /// All entries share a single seqno. This is more efficient than individual
1190 /// writes because the version-history lock and memtable size accounting
1191 /// are performed only once for the entire batch.
1192 ///
1193 /// **Visibility:** entries become individually visible to concurrent readers
1194 /// as they are inserted. For atomic batch visibility, the caller must
1195 /// publish `seqno` (via `visible_seqno.fetch_max(seqno + 1)`) only
1196 /// **after** this method returns.
1197 ///
1198 /// Returns the total bytes added and new size of the memtable.
1199 ///
1200 /// # Examples
1201 ///
1202 /// ```
1203 /// # let folder = tempfile::tempdir()?;
1204 /// use lsm_tree::{AbstractTree, Config, Tree, WriteBatch};
1205 ///
1206 /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
1207 ///
1208 /// let mut batch = WriteBatch::new();
1209 /// batch.insert("key1", "value1");
1210 /// batch.insert("key2", "value2");
1211 /// batch.remove("key3");
1212 ///
1213 /// let (bytes_added, memtable_size) = tree.apply_batch(batch, 0)?;
1214 /// assert!(bytes_added > 0);
1215 /// #
1216 /// # Ok::<(), lsm_tree::Error>(())
1217 /// ```
1218 ///
1219 /// # Errors
1220 ///
1221 /// Returns [`Error::MixedOperationBatch`](crate::Error::MixedOperationBatch)
1222 /// if the batch contains mixed operation types for the same user key.
1223 fn apply_batch(&self, batch: crate::WriteBatch, seqno: SeqNo) -> crate::Result<(u64, u64)>;
1224
1225 /// Inserts a key-value pair into the tree.
1226 ///
1227 /// If the key already exists, the item will be overwritten.
1228 ///
1229 /// Returns the added item's size and new size of the memtable.
1230 ///
1231 /// # Examples
1232 ///
1233 /// ```
1234 /// # let folder = tempfile::tempdir()?;
1235 /// use lsm_tree::{AbstractTree, Config, Tree};
1236 ///
1237 /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1238 /// tree.insert("a", "abc", 0);
1239 /// #
1240 /// # Ok::<(), lsm_tree::Error>(())
1241 /// ```
1242 ///
1243 /// # Errors
1244 ///
1245 /// Will return `Err` if an IO error occurs.
1246 fn insert<K: Into<UserKey>, V: Into<UserValue>>(
1247 &self,
1248 key: K,
1249 value: V,
1250 seqno: SeqNo,
1251 ) -> (u64, u64);
1252
1253 /// Admission-gated [`insert`](Self::insert): consults
1254 /// [`write_admission`](Self::write_admission) first and declines with
1255 /// [`Error::StorageFull`](crate::Error::StorageFull) when the tree is over
1256 /// budget, otherwise inserts and returns the same `(added_bytes,
1257 /// memtable_size)` tuple. This is the write entry a space-aware caller uses
1258 /// so over-budget writes are refused up front rather than failing a flush
1259 /// later; bare [`insert`](Self::insert) stays infallible for callers that
1260 /// do not opt into admission control.
1261 ///
1262 /// # Errors
1263 ///
1264 /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1265 /// is closed.
1266 fn try_insert<K: Into<UserKey>, V: Into<UserValue>>(
1267 &self,
1268 key: K,
1269 value: V,
1270 seqno: SeqNo,
1271 ) -> crate::Result<(u64, u64)> {
1272 self.write_admission()?;
1273 Ok(self.insert(key, value, seqno))
1274 }
1275
1276 /// Admission-gated [`merge`](Self::merge). See
1277 /// [`try_insert`](Self::try_insert).
1278 ///
1279 /// # Errors
1280 ///
1281 /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1282 /// is closed.
1283 fn try_merge<K: Into<UserKey>, V: Into<UserValue>>(
1284 &self,
1285 key: K,
1286 operand: V,
1287 seqno: SeqNo,
1288 ) -> crate::Result<(u64, u64)> {
1289 self.write_admission()?;
1290 Ok(self.merge(key, operand, seqno))
1291 }
1292
1293 /// Admission-gated [`remove`](Self::remove). See
1294 /// [`try_insert`](Self::try_insert).
1295 ///
1296 /// Note a tombstone is itself a write that consumes space; an over-budget
1297 /// tree refuses it too. Reclaim space via compaction (never gated) rather
1298 /// than relying on deletes when already read-only.
1299 ///
1300 /// # Errors
1301 ///
1302 /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1303 /// is closed.
1304 fn try_remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> crate::Result<(u64, u64)> {
1305 self.write_admission()?;
1306 Ok(self.remove(key, seqno))
1307 }
1308
1309 /// Admission-gated [`remove_weak`](Self::remove_weak). See
1310 /// [`try_insert`](Self::try_insert).
1311 ///
1312 /// # Errors
1313 ///
1314 /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1315 /// is closed.
1316 fn try_remove_weak<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> crate::Result<(u64, u64)> {
1317 self.write_admission()?;
1318 Ok(self.remove_weak(key, seqno))
1319 }
1320
1321 /// Admission-gated [`remove_range`](Self::remove_range). See
1322 /// [`try_insert`](Self::try_insert).
1323 ///
1324 /// # Errors
1325 ///
1326 /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1327 /// is closed.
1328 fn try_remove_range<K: Into<UserKey>>(
1329 &self,
1330 start: K,
1331 end: K,
1332 seqno: SeqNo,
1333 ) -> crate::Result<u64> {
1334 self.write_admission()?;
1335 Ok(self.remove_range(start, end, seqno))
1336 }
1337
1338 /// Removes an item from the tree.
1339 ///
1340 /// Returns the added item's size and new size of the memtable.
1341 ///
1342 /// # Examples
1343 ///
1344 /// ```
1345 /// # let folder = tempfile::tempdir()?;
1346 /// # use lsm_tree::{AbstractTree, Config, Tree};
1347 /// #
1348 /// # let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1349 /// tree.insert("a", "abc", 0);
1350 ///
1351 /// let item = tree.get("a", 1)?.expect("should have item");
1352 /// assert_eq!("abc".as_bytes(), &*item);
1353 ///
1354 /// tree.remove("a", 1);
1355 ///
1356 /// let item = tree.get("a", 2)?;
1357 /// assert_eq!(None, item);
1358 /// #
1359 /// # Ok::<(), lsm_tree::Error>(())
1360 /// ```
1361 ///
1362 /// # Errors
1363 ///
1364 /// Will return `Err` if an IO error occurs.
1365 fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64);
1366
1367 /// Writes a merge operand for a key.
1368 ///
1369 /// The operand is stored as a partial update that will be combined with
1370 /// other operands and/or a base value via the configured [`crate::MergeOperator`]
1371 /// during reads and compaction.
1372 ///
1373 /// Returns the added item's size and new size of the memtable.
1374 ///
1375 /// # Examples
1376 ///
1377 /// ```
1378 /// # let folder = tempfile::tempdir()?;
1379 /// # use lsm_tree::{AbstractTree, Config, MergeOperator, UserValue};
1380 /// # use std::sync::Arc;
1381 /// # struct SumMerge;
1382 /// # impl MergeOperator for SumMerge {
1383 /// # fn merge(&self, _key: &[u8], base: Option<&[u8]>, operands: &[&[u8]]) -> lsm_tree::Result<UserValue> {
1384 /// # let mut sum: i64 = base.map_or(0, |b| i64::from_le_bytes(b.try_into().unwrap()));
1385 /// # for op in operands { sum += i64::from_le_bytes((*op).try_into().unwrap()); }
1386 /// # Ok(sum.to_le_bytes().to_vec().into())
1387 /// # }
1388 /// # }
1389 /// # let tree = Config::new(folder, Default::default(), Default::default())
1390 /// # .with_merge_operator(Some(Arc::new(SumMerge)))
1391 /// # .open()?;
1392 /// tree.merge("counter", 1_i64.to_le_bytes(), 0);
1393 /// # Ok::<(), lsm_tree::Error>(())
1394 /// ```
1395 fn merge<K: Into<UserKey>, V: Into<UserValue>>(
1396 &self,
1397 key: K,
1398 operand: V,
1399 seqno: SeqNo,
1400 ) -> (u64, u64);
1401
1402 /// Removes an item from the tree.
1403 ///
1404 /// The tombstone marker of this delete operation will vanish when it
1405 /// collides with its corresponding insertion.
1406 /// This may cause older versions of the value to be resurrected, so it should
1407 /// only be used and preferred in scenarios where a key is only ever written once.
1408 ///
1409 /// Returns the added item's size and new size of the memtable.
1410 ///
1411 /// # Examples
1412 ///
1413 /// ```
1414 /// # let folder = tempfile::tempdir()?;
1415 /// # use lsm_tree::{AbstractTree, Config, Tree};
1416 /// #
1417 /// # let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1418 /// tree.insert("a", "abc", 0);
1419 ///
1420 /// let item = tree.get("a", 1)?.expect("should have item");
1421 /// assert_eq!("abc".as_bytes(), &*item);
1422 ///
1423 /// tree.remove_weak("a", 1);
1424 ///
1425 /// let item = tree.get("a", 2)?;
1426 /// assert_eq!(None, item);
1427 /// #
1428 /// # Ok::<(), lsm_tree::Error>(())
1429 /// ```
1430 ///
1431 /// # Errors
1432 ///
1433 /// Will return `Err` if an IO error occurs.
1434 #[doc(hidden)]
1435 fn remove_weak<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64);
1436
1437 /// Deletes all keys in the range `[start, end)` by inserting a range tombstone.
1438 ///
1439 /// This is much more efficient than deleting keys individually when
1440 /// removing a contiguous range of keys.
1441 ///
1442 /// Returns the approximate size added to the memtable.
1443 /// Returns 0 if `start >= end` (invalid interval is silently ignored).
1444 ///
1445 /// This is a required method on the crate's sealed tree types.
1446 fn remove_range<K: Into<UserKey>>(&self, start: K, end: K, seqno: SeqNo) -> u64;
1447
1448 /// Deletes all keys with the given prefix by inserting a range tombstone.
1449 ///
1450 /// This is sugar over [`AbstractTree::remove_range`] using prefix bounds.
1451 ///
1452 /// Returns the approximate size added to the memtable.
1453 /// Returns 0 for empty prefixes or all-`0xFF` prefixes (cannot form valid half-open range).
1454 fn remove_prefix<K: AsRef<[u8]>>(&self, prefix: K, seqno: SeqNo) -> u64 {
1455 use crate::range::prefix_to_range;
1456 use core::ops::Bound;
1457
1458 let (lo, hi) = prefix_to_range(prefix.as_ref());
1459
1460 let Bound::Included(start) = lo else { return 0 };
1461
1462 // Bound::Unbounded means the prefix is all 0xFF — no representable
1463 // exclusive upper bound exists, so we cannot form a valid range tombstone.
1464 let Bound::Excluded(end) = hi else { return 0 };
1465
1466 self.remove_range(start, end, seqno)
1467 }
1468}