Skip to main content

lsm_tree/
abstract_tree.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2024-present, fjall-rs
3// Copyright (c) 2026-present, Structured World Foundation
4
5use crate::tree::inner::{FlushGuard, VersionsWriteGuard};
6use crate::{
7    AnyTree, BlobTree, Config, Guard, InternalValue, KvPair, Memtable, SeqNo, TableId, Tree,
8    UserKey, UserValue,
9    iter_guard::{IterGuardImpl, SeekableGuardIter},
10    table::Table,
11    version::Version,
12    vlog::BlobFile,
13};
14use alloc::sync::Arc;
15#[cfg(not(feature = "std"))]
16use alloc::{boxed::Box, vec::Vec};
17use core::ops::RangeBounds;
18
19pub type RangeItem = crate::Result<KvPair>;
20
21type FlushToTablesResult = (Vec<Table>, Option<Vec<BlobFile>>);
22
23/// Summary of a checkpoint produced by
24/// [`AbstractTree::create_checkpoint`].
25///
26/// All byte counts are *logical* file sizes — hard links share the
27/// underlying inode storage, so a checkpoint's marginal disk usage is
28/// typically zero until the original files are compacted away.
29#[derive(Debug, Clone, Copy)]
30pub struct CheckpointInfo {
31    /// Number of SST files captured.
32    pub sst_files: usize,
33    /// Number of blob (value-log) files captured. Always `0` for a
34    /// standard [`Tree`].
35    pub blob_files: usize,
36    /// Sum of the logical file sizes of every captured SST + blob.
37    pub total_bytes: u64,
38    /// The version ID embedded in the checkpoint's `current` pointer.
39    pub version_id: u64,
40    /// Lower-bound visible-seqno watermark for the snapshot.
41    ///
42    /// Captured from the tree's `visible_seqno` generator BEFORE
43    /// [`AbstractTree::current_version`]. Following the standard
44    /// "lowest-excluded" watermark convention, `info.seqno = N` means
45    /// every record with `seqno < N` was committed at sample time and
46    /// is therefore guaranteed to be present in the snapshot. Records
47    /// with `seqno == N` may or may not be included (writers can hold
48    /// a record in the memtable for an instant before publishing the
49    /// next watermark); records with `seqno > N` may also be present
50    /// (writers can advance the counter between sample and version
51    /// snapshot, and those keys still land in the captured memtable).
52    ///
53    /// PITR consumers MUST use `seqno < info.seqno` as the inclusion
54    /// gate. Using `<=` (treating this as a max-included ceiling)
55    /// could move a recovery cutoff past data still needed from WAL
56    /// or replication; the field is a strict lower-exclusive watermark,
57    /// not a max-included ceiling.
58    pub seqno: SeqNo,
59}
60
61// Sealed on purpose: this trait is still public as a consumer-side bound
62// (`&impl AbstractTree`), but external implementations are no longer part of
63// the supported extension surface. Internal flush/version hooks keep evolving
64// with crate-owned tree types and must not create downstream semver traps.
65//
66// `sealed` stays `pub` only so sibling modules in this crate can write
67// `crate::abstract_tree::sealed::Sealed` in their impls. The parent module
68// `abstract_tree` is not publicly exported from the crate root, so downstream
69// crates still cannot name or implement this trait.
70pub mod sealed {
71    pub trait Sealed {}
72}
73
74/// Generic Tree API
75#[enum_dispatch::enum_dispatch]
76pub trait AbstractTree: sealed::Sealed {
77    /// Debug method for tracing the MVCC history of a key.
78    #[doc(hidden)]
79    fn print_trace(&self, key: &[u8]) -> crate::Result<()>;
80
81    /// Returns the number of cached table file descriptors.
82    fn table_file_cache_size(&self) -> usize;
83
84    // TODO: remove
85    #[doc(hidden)]
86    fn version_memtable_size_sum(&self) -> u64 {
87        self.get_version_history_lock().memtable_size_sum()
88    }
89
90    #[doc(hidden)]
91    fn next_table_id(&self) -> TableId;
92
93    #[doc(hidden)]
94    fn id(&self) -> crate::TreeId;
95
96    /// Like [`AbstractTree::get`], but returns the actual internal entry, not just the user value.
97    ///
98    /// Used in tests.
99    #[doc(hidden)]
100    fn get_internal_entry(&self, key: &[u8], seqno: SeqNo) -> crate::Result<Option<InternalValue>>;
101
102    #[doc(hidden)]
103    fn current_version(&self) -> Version;
104
105    /// Returns a read-only snapshot of the tree's on-disk storage footprint:
106    /// total used bytes, entry count, the average shape of a stored entry
107    /// (average key / value bytes), and an estimate of how many more
108    /// average-shaped entries fit in a byte budget (see
109    /// [`StorageStats::estimated_remaining_entries`](crate::StorageStats::estimated_remaining_entries)).
110    ///
111    /// Computed from the live version's table + blob metadata plus one
112    /// size-stat per live file; it never reads a data block. The default
113    /// implementation reports [`StorageStatus::Healthy`](crate::StorageStatus::Healthy);
114    /// the standard tree overrides it to report
115    /// [`StorageStatus::CompactionInProgress`](crate::StorageStatus::CompactionInProgress) while a
116    /// compaction runs.
117    ///
118    /// # Examples
119    ///
120    /// ```
121    /// # use lsm_tree::Error as TreeError;
122    /// use lsm_tree::{AbstractTree, Config};
123    ///
124    /// let folder = tempfile::tempdir()?;
125    /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
126    ///
127    /// for i in 0..100u64 {
128    ///     tree.insert(format!("key{i:04}"), "value", i);
129    /// }
130    /// tree.flush_active_memtable(0)?;
131    ///
132    /// let stats = tree.storage_stats()?;
133    /// assert_eq!(stats.item_count, 100);
134    /// // Roughly how many more average-shaped entries fit in another 1 MiB.
135    /// let _headroom = stats.estimated_remaining_entries(1024 * 1024);
136    /// #
137    /// # Ok::<(), TreeError>(())
138    /// ```
139    ///
140    /// # Errors
141    ///
142    /// Returns an error if a live file's size cannot be stat-ed.
143    fn storage_stats(&self) -> crate::Result<crate::StorageStats> {
144        crate::storage_stats::compute_storage_stats(&self.current_version(), false, true)
145    }
146
147    /// Per-LSM-level and per-segment size + entry-count stats, for tiering and
148    /// erasure-coding placement decisions (which level / segment is large enough
149    /// to demote, EC-encode, or migrate).
150    ///
151    /// Cheap: derived from the live version's metadata plus one file-size stat
152    /// per segment (no data-block scan). The per-level totals reconcile with
153    /// [`storage_stats`](Self::storage_stats): summed across levels they equal
154    /// the SST portion of [`StorageStats::used_bytes`](crate::StorageStats::used_bytes)
155    /// and [`StorageStats::item_count`](crate::StorageStats::item_count).
156    ///
157    /// # Examples
158    ///
159    /// ```
160    /// # use lsm_tree::Error as TreeError;
161    /// use lsm_tree::{AbstractTree, Config};
162    ///
163    /// let folder = tempfile::tempdir()?;
164    /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
165    /// for i in 0..100u32 {
166    ///     tree.insert(format!("k{i:04}"), "v", 0);
167    /// }
168    /// tree.flush_active_memtable(0)?;
169    ///
170    /// let levels = tree.level_segment_stats()?;
171    /// let total: u64 = levels.iter().map(|l| l.item_count).sum();
172    /// assert_eq!(total, tree.storage_stats()?.item_count);
173    /// #
174    /// # Ok::<(), TreeError>(())
175    /// ```
176    ///
177    /// # Errors
178    ///
179    /// Returns an error if a segment's file size cannot be stat-ed.
180    fn level_segment_stats(&self) -> crate::Result<Vec<crate::LevelStats>> {
181        crate::storage_stats::compute_level_segment_stats(&self.current_version())
182    }
183
184    /// Estimated bytes pending compaction under `strategy`: on-disk data above
185    /// its level's target that must eventually be rewritten downward (a `RocksDB`
186    /// `estimate-pending-compaction-bytes` analog), a compaction-debt signal for a
187    /// scheduler / tiering consumer.
188    ///
189    /// The strategy is supplied by the caller because the engine does not own a
190    /// configured compaction strategy (it is injected per compaction run); a
191    /// `&dyn` keeps this object-safe. Returns `0` for strategies without a
192    /// size-target notion of debt (FIFO, drop-range), or when the tree is at or
193    /// below its target shape. See
194    /// [`CompactionStrategy::pending_compaction_bytes`](crate::compaction::CompactionStrategy::pending_compaction_bytes).
195    fn compaction_debt(&self, strategy: &dyn crate::compaction::CompactionStrategy) -> u64 {
196        strategy.pending_compaction_bytes(&self.current_version())
197    }
198
199    /// Computed write-backpressure verdict from the live L0 table count and the
200    /// strategy's pending-compaction bytes, against the configured
201    /// [`RuntimeConfig::backpressure`](crate::runtime_config::RuntimeConfig)
202    /// thresholds.
203    ///
204    /// Advisory, mirroring [`write_admission`](Self::write_admission): the caller
205    /// consults it and throttles in its own write loop (sleep `suggested_delay`
206    /// at [`Slowdown`](crate::Backpressure::Slowdown), pause at
207    /// [`Stop`](crate::Backpressure::Stop)). The engine never blocks on it,
208    /// because it does not own the compaction that drains the debt — an internal
209    /// stall could deadlock the very thread that would compact.
210    ///
211    /// The strategy is supplied by the caller for the same reason as
212    /// [`compaction_debt`](Self::compaction_debt). Returns
213    /// [`Backpressure::None`](crate::Backpressure::None) by default (no
214    /// thresholds configured).
215    fn write_backpressure(
216        &self,
217        _strategy: &dyn crate::compaction::CompactionStrategy,
218    ) -> crate::Backpressure {
219        crate::Backpressure::None
220    }
221
222    /// Storage admission gate: `Ok(())` if a write may proceed, or
223    /// [`Error::StorageFull`](crate::Error::StorageFull) if the tree is
224    /// over budget and should be treated as read-only.
225    ///
226    /// Opt-in: returns `Ok(())` unless
227    /// [`storage_admission_check`](crate::runtime_config::RuntimeConfig::storage_admission_check)
228    /// is enabled. The predicate is computed (not latched), so once space is
229    /// freed — the budget raised, a compaction reclaiming space, or disk freed —
230    /// the next call admits again with no restart.
231    ///
232    /// Intended as a cheap pre-check the caller consults before applying a
233    /// write batch. The footprint is cached per version, so the check is a
234    /// constant-time read on the common path and only re-measures when a new
235    /// version is installed (flush / compaction). Internal flush / compaction
236    /// are never gated, so the engine can always reclaim.
237    ///
238    /// # Errors
239    ///
240    /// [`Error::StorageFull`](crate::Error::StorageFull) when the live footprint
241    /// plus reserved headroom exceeds the effective budget.
242    fn write_admission(&self) -> crate::Result<()> {
243        Ok(())
244    }
245
246    /// `true` when the admission gate is currently closed (see
247    /// [`write_admission`](Self::write_admission)). Convenience for callers that
248    /// want a boolean rather than a `Result`. Always `false` unless admission
249    /// control is enabled and the tree is over budget.
250    ///
251    /// Only [`Error::StorageFull`](crate::Error::StorageFull) counts as
252    /// read-only: an unrelated admission error (e.g. an I/O failure while
253    /// measuring the footprint) is NOT an out-of-space condition and must not be
254    /// reported as one.
255    fn is_read_only(&self) -> bool {
256        matches!(
257            self.write_admission(),
258            Err(crate::Error::StorageFull { .. })
259        )
260    }
261
262    /// Proactively verifies every block's XXH3 checksum across every SST in
263    /// the tree's current version — a scrubber for catching bit rot before it
264    /// surfaces as a user-visible read failure (cron / scrub jobs).
265    ///
266    /// Reports at block granularity and never aborts early. The returned
267    /// [`BlockVerifyReport`](crate::verify::BlockVerifyReport) records
268    /// block-corruption findings with `(file, offset)`, while file-level errors
269    /// (e.g. [`BlockVerifyError::SstFileUnreadable`](crate::verify::BlockVerifyError::SstFileUnreadable))
270    /// carry the file only (no offset). It does not surface per-entry indices or
271    /// ECC-correction counts (when ECC-at-rest is enabled a within-budget corrupt
272    /// block may still be healed on read as a side effect of the scan, but the
273    /// report does not tally corrections).
274    ///
275    /// Filesystems with native per-block integrity (ZFS, Btrfs, `ReFS`, S3 —
276    /// see [`Fs::capabilities`](crate::fs::Fs::capabilities)) already detect
277    /// corruption on read; this scrub is the portable check for the rest.
278    ///
279    /// Use [`Self::verify_checksum_with`] for parallelism / throttle control.
280    #[cfg(feature = "std")]
281    fn verify_checksum(&self) -> crate::verify::BlockVerifyReport
282    where
283        Self: Sized,
284    {
285        crate::verify::verify_block_checksums(self)
286    }
287
288    /// Like [`Self::verify_checksum`] but with configurable parallelism and
289    /// I/O throttle (see [`VerifyOptions`](crate::verify::VerifyOptions)).
290    #[cfg(feature = "std")]
291    fn verify_checksum_with(
292        &self,
293        options: &crate::verify::VerifyOptions,
294    ) -> crate::verify::BlockVerifyReport
295    where
296        Self: Sized,
297    {
298        crate::verify::verify_block_checksums_with(self, options)
299    }
300
301    #[doc(hidden)]
302    fn get_version_history_lock(&self) -> VersionsWriteGuard<'_>;
303
304    /// Creates a hard-linked checkpoint of the tree's on-disk state in
305    /// `target_path` for point-in-time recovery (PITR) backup.
306    ///
307    /// The checkpoint is a fully functional tree that can be opened
308    /// independently via [`Config::open`](crate::Config::open). For the
309    /// common single-filesystem case all SST files (and blob files, for
310    /// [`BlobTree`]) are hard-linked rather than copied, so the operation
311    /// is O(1) per file and consumes zero additional disk space until the
312    /// original files are compacted away — at which point the inode is
313    /// kept alive by the checkpoint link.
314    ///
315    /// # Cross-filesystem / cross-backend fall-back
316    ///
317    /// When a source file lives on a different filesystem than the
318    /// checkpoint target — e.g. an SST routed to a hot tier via
319    /// [`level_routes`](crate::Config::level_routes) on a separate volume,
320    /// or a backup directory on a foreign mount — the hard link cannot
321    /// be created (Unix `EXDEV`). In that case the checkpoint silently
322    /// falls back to a streamed byte copy, which:
323    ///
324    /// - takes time linear in the file size instead of O(1), and
325    /// - consumes disk space equal to the copied bytes on the target
326    ///   volume (no inode sharing across filesystems).
327    ///
328    /// Each fall-back call emits one [`log::debug`] line (deliberately not
329    /// `warn`: a misconfigured tier could trigger this path once per SST
330    /// and per blob — thousands of times per snapshot — and per-file
331    /// warnings would drown real signal). Operators wanting hard-visibility
332    /// of unexpected full copies should enable debug logging on the `fs`
333    /// module or watch the `CheckpointInfo.total_bytes` figure (≫ inode
334    /// link cost means the fallback fired). The same `debug` policy applies
335    /// when source and target use entirely different [`Fs`](crate::fs::Fs)
336    /// backends (e.g. [`MemFs`](crate::fs::MemFs) → [`StdFs`](crate::fs::StdFs)
337    /// in tests).
338    ///
339    /// # Concurrency
340    ///
341    /// While the checkpoint is being built, compaction continues normally
342    /// but the physical removal of obsolete files is deferred until the
343    /// checkpoint hard-links are in place. This is implemented by an
344    /// internal reference-counted deletion gate; callers do not have to
345    /// pause compaction themselves.
346    ///
347    /// # Errors
348    ///
349    /// Returns an error if:
350    /// - the active memtable could not be flushed,
351    /// - `target_path` already exists (to prevent accidental overwrites),
352    /// - a hard link / copy fall-back could not be created, or
353    /// - the manifest / version pointer files could not be replicated.
354    ///
355    /// On error any partial checkpoint files are removed automatically
356    /// (best-effort) so callers can safely retry against the same path.
357    // std-only: checkpoint creation hard-links / copies files via std::fs.
358    #[cfg(feature = "std")]
359    fn create_checkpoint(&self, target_path: &crate::path::Path) -> crate::Result<CheckpointInfo>;
360
361    /// Seals the active memtable and flushes to table(s).
362    ///
363    /// If there are already other sealed memtables lined up, those will be flushed as well.
364    ///
365    /// Only used in tests.
366    #[doc(hidden)]
367    fn flush_active_memtable(&self, eviction_seqno: SeqNo) -> crate::Result<()> {
368        let lock = self.get_flush_lock();
369        self.rotate_memtable();
370        self.flush(&lock, eviction_seqno)?;
371        Ok(())
372    }
373
374    /// Synchronously flushes pending sealed memtables to tables.
375    ///
376    /// Returns the sum of flushed memtable sizes that were flushed.
377    ///
378    /// The function may not return a result, if nothing was flushed.
379    ///
380    /// # Errors
381    ///
382    /// Returns `Err` on an I/O error, or on a memtable residence-verification
383    /// failure under [`KvChecksumComputePoint::AtInsert`](crate::runtime_config::KvChecksumComputePoint::AtInsert):
384    /// a sealed memtable whose insert-time per-KV digests do not verify before
385    /// flush surfaces [`crate::Error::MemtableKvChecksumMismatch`],
386    /// [`crate::Error::MemtableKvChecksumCorruptAlgorithm`], or
387    /// [`crate::Error::InvalidTag`] (corrupt `value_type`).
388    fn flush(&self, _lock: &FlushGuard<'_>, seqno_threshold: SeqNo) -> crate::Result<Option<u64>> {
389        use crate::{
390            compaction::stream::CompactionStream, merge::Merger, range_tombstone::RangeTombstone,
391        };
392
393        let version_history = self.get_version_history_lock();
394        let latest = version_history.latest_version();
395
396        if latest.sealed_memtables.len() == 0 {
397            return Ok(None);
398        }
399
400        let sealed_ids = latest
401            .sealed_memtables
402            .iter()
403            .map(|mt| mt.id)
404            .collect::<Vec<_>>();
405
406        let flushed_size = latest.sealed_memtables.iter().map(|mt| mt.size()).sum();
407
408        // AtInsert residence check: verify each sealed memtable's insert-time
409        // per-KV digests against a recompute over the entries' current bytes
410        // before writing them out. A divergence means an entry was corrupted
411        // (a RAM bit-flip) while it sat in the memtable. Memtables with no
412        // insert digests (the default) return immediately without walking.
413        //
414        // This is a SEPARATE pass over the as-inserted memtable entries, not
415        // fused into the writer's per-KV footer encode, and deliberately so:
416        // the footer digest is computed over POST-merge / post-seqno-filter
417        // bytes (the CompactionStream below applies the merge operator), so a
418        // merge operator's combined value differs from any single inserted
419        // value. Comparing the carried insert digest against the footer digest
420        // would false-positive on every legitimate merge. Residence corruption
421        // is a property of what was inserted (pre-merge); it must be checked
422        // here, against the raw memtable entries.
423        for mt in latest.sealed_memtables.iter() {
424            mt.verify_kv_residence()?;
425        }
426
427        // Collect range tombstones from sealed memtables
428        let mut range_tombstones: Vec<RangeTombstone> = Vec::new();
429        for mt in latest.sealed_memtables.iter() {
430            range_tombstones.extend(mt.range_tombstones_sorted());
431        }
432        range_tombstones
433            .sort_by(|a, b| a.cmp_with_comparator(b, self.tree_config().comparator.as_ref()));
434        range_tombstones.dedup();
435
436        let merger = Merger::new(
437            latest
438                .sealed_memtables
439                .iter()
440                .map(|mt| mt.iter().map(Ok))
441                .collect::<Vec<_>>(),
442            self.tree_config().comparator.clone(),
443        );
444        // RT suppression is not needed here: flush writes both entries and RTs
445        // to the output tables. Suppression happens at read time, not write time.
446        let stream = CompactionStream::new(merger, seqno_threshold)
447            .with_merge_operator(self.tree_config().merge_operator.clone());
448
449        drop(version_history);
450
451        // Clone needed: flush_to_tables_with_rt consumes the Vec, but on the
452        // RT-only path (no KV data, tables.is_empty()) we re-insert RTs into the
453        // active memtable. Flush is infrequent and RT count is small.
454        if let Some((tables, blob_files)) =
455            self.flush_to_tables_with_rt(stream, range_tombstones.clone())?
456        {
457            // If no tables were produced (RT-only memtable), re-insert RTs
458            // into active memtable so they aren't lost
459            if tables.is_empty() && !range_tombstones.is_empty() {
460                let active = self.active_memtable();
461                for rt in &range_tombstones {
462                    let _ =
463                        active.insert_range_tombstone(rt.start.clone(), rt.end.clone(), rt.seqno);
464                }
465            }
466
467            self.register_tables(
468                &tables,
469                blob_files.as_deref(),
470                None,
471                &sealed_ids,
472                seqno_threshold,
473            )?;
474        }
475
476        Ok(Some(flushed_size))
477    }
478
479    /// Returns an iterator that scans through the entire tree.
480    ///
481    /// Avoid using this function, or limit it as otherwise it may scan a lot of items.
482    fn iter(
483        &self,
484        seqno: SeqNo,
485        index: Option<(Arc<Memtable>, SeqNo)>,
486    ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static> {
487        self.range::<&[u8], _>(.., seqno, index)
488    }
489
490    /// Returns an iterator over a prefixed set of items.
491    ///
492    /// Avoid using an empty prefix as it may scan a lot of items (unless limited).
493    fn prefix<K: AsRef<[u8]>>(
494        &self,
495        prefix: K,
496        seqno: SeqNo,
497        index: Option<(Arc<Memtable>, SeqNo)>,
498    ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>;
499
500    /// Returns an iterator over a range of items.
501    ///
502    /// Avoid using full or unbounded ranges as they may scan a lot of items (unless limited).
503    fn range<K: AsRef<[u8]>, R: RangeBounds<K>>(
504        &self,
505        range: R,
506        seqno: SeqNo,
507        index: Option<(Arc<Memtable>, SeqNo)>,
508    ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl> + Send + 'static>;
509
510    /// Returns a range iterator that can reposition (seek) in place without
511    /// reopening its per-SST readers.
512    ///
513    /// Unlike [`range`](Self::range)'s boxed `DoubleEndedIterator`, the returned
514    /// [`SeekableGuardIter`] additionally exposes `seek_to` / `seek_to_for_prev`,
515    /// so a consumer can jump a live iterator to any key (`RocksDB` `Seek` /
516    /// `SeekForPrev`) — enabling data-dependent scans (joins, skip-scan) without
517    /// reopening per-SST readers per jump.
518    fn range_seekable<K: AsRef<[u8]>, R: RangeBounds<K>>(
519        &self,
520        range: R,
521        seqno: SeqNo,
522        index: Option<(Arc<Memtable>, SeqNo)>,
523    ) -> Box<dyn SeekableGuardIter + 'static>;
524
525    /// Scans a sequence of disjoint, ascending key sub-intervals, reusing one set
526    /// of per-SST readers across all of them.
527    ///
528    /// The per-SST setup is paid once (when the underlying seekable iterator is
529    /// opened); each interval is served by repositioning that iterator. This
530    /// amortizes the setup across `N` intervals instead of paying it per
531    /// interval, so multi-interval scan throughput scales with the total rows
532    /// returned rather than the interval count.
533    ///
534    /// The interval source is pulled lazily, so intervals may be produced on
535    /// demand (e.g. computed from rows already returned).
536    fn batch_range_scan<K: AsRef<[u8]>, R: RangeBounds<K> + 'static, I: IntoIterator<Item = R>>(
537        &self,
538        intervals: I,
539        seqno: SeqNo,
540        index: Option<(Arc<Memtable>, SeqNo)>,
541    ) -> Box<dyn Iterator<Item = IterGuardImpl> + Send + 'static>
542    where
543        I::IntoIter: Send + 'static;
544
545    /// Returns the approximate number of tombstones in the tree.
546    fn tombstone_count(&self) -> u64;
547
548    /// Returns the approximate number of weak tombstones (single deletes) in the tree.
549    fn weak_tombstone_count(&self) -> u64;
550
551    /// Returns the approximate number of values reclaimable once weak tombstones can be GC'd.
552    fn weak_tombstone_reclaimable_count(&self) -> u64;
553
554    /// Drops tables that are fully contained in a given range.
555    ///
556    /// Accepts any `RangeBounds`, including unbounded or exclusive endpoints.
557    /// If the normalized lower bound is greater than the upper bound, the
558    /// method returns without performing any work.
559    ///
560    /// # Errors
561    ///
562    /// Will return `Err` only if an IO error occurs.
563    fn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(&self, range: R) -> crate::Result<()>;
564
565    /// Drops all tables and clears all memtables atomically.
566    ///
567    /// # Errors
568    ///
569    /// Will return `Err` only if an IO error occurs.
570    fn clear(&self) -> crate::Result<()>;
571
572    /// Performs major compaction, blocking the caller until it's done.
573    ///
574    /// Returns a [`crate::compaction::CompactionResult`] describing what action was taken.
575    ///
576    /// # Garbage-collection / merge-fold watermark (`seqno_threshold`)
577    ///
578    /// `seqno_threshold` is the MVCC garbage-collection watermark: the engine may
579    /// collapse history that no snapshot reading at a seqno `< seqno_threshold`
580    /// can still observe. Concretely, only entries whose seqno is `< seqno_threshold`
581    /// are eligible for:
582    ///
583    /// - dropping shadowed versions / GC-ing tombstones, and
584    /// - **folding merge operands** via the [`crate::MergeOperator`]: a key written
585    ///   only through [`Self::merge`] (no base value) accumulates one operand per
586    ///   call, and reads re-apply the whole chain (`O(operands)` per read) until
587    ///   compaction folds it. Folding a chain into a single value is only
588    ///   MVCC-safe when no live snapshot reads *between* the operands, which is
589    ///   exactly what `seqno_threshold` certifies.
590    ///
591    /// The engine does **not** track snapshots (unlike a `RocksDB`-style
592    /// snapshot list); the caller owns snapshot lifecycle and must supply this
593    /// watermark:
594    ///
595    /// - To fold/GC everything (no active snapshots), pass a value **above every
596    ///   live seqno** (e.g. the next value from the [`crate::SequenceNumberCounter`]).
597    /// - With outstanding snapshots, pass the **oldest** snapshot's seqno so their
598    ///   reads stay correct.
599    /// - `seqno_threshold == 0` certifies nothing as collapsible, so **no folding
600    ///   or GC happens** — `major_compact(target, 0)` only restructures tables and
601    ///   leaves a merge-only key's full operand chain intact.
602    ///
603    /// # Errors
604    ///
605    /// Will return `Err` if an IO error occurs.
606    fn major_compact(
607        &self,
608        target_size: u64,
609        seqno_threshold: SeqNo,
610    ) -> crate::Result<crate::compaction::CompactionResult>;
611
612    /// Returns the disk space used by stale blobs.
613    fn stale_blob_bytes(&self) -> u64 {
614        0
615    }
616
617    /// Gets the space usage of all filters in the tree.
618    ///
619    /// May not correspond to the actual memory size because filter blocks may be paged out.
620    fn filter_size(&self) -> u64;
621
622    /// Gets the memory usage of all pinned filters in the tree.
623    fn pinned_filter_size(&self) -> usize;
624
625    /// Gets the memory usage of all pinned index blocks in the tree.
626    fn pinned_block_index_size(&self) -> usize;
627
628    /// Gets the length of the version free list.
629    fn version_free_list_len(&self) -> usize;
630
631    /// Returns the metrics structure.
632    #[cfg(feature = "metrics")]
633    fn metrics(&self) -> &Arc<crate::Metrics>;
634
635    /// A point-in-time [`CacheStats`](crate::CacheStats) snapshot of block-cache
636    /// effectiveness (cumulative hit / miss counts and rate) and occupancy
637    /// (current size against capacity).
638    ///
639    /// The stable, owned observability view over the block cache, so a consumer
640    /// can read cache health without holding the mutable
641    /// [`metrics`](Self::metrics) handle. Counts are cumulative since process
642    /// start; derive a rate over an interval from the delta between two polls.
643    #[cfg(feature = "metrics")]
644    fn cache_stats(&self) -> crate::CacheStats;
645
646    /// Acquires the flush lock which is required to call [`Tree::flush`].
647    fn get_flush_lock(&self) -> FlushGuard<'_>;
648
649    /// Synchronously flushes a memtable to a table.
650    ///
651    /// This method will not make the table immediately available,
652    /// use [`AbstractTree::register_tables`] for that.
653    ///
654    /// # Errors
655    ///
656    /// Will return `Err` if an IO error occurs.
657    fn flush_to_tables(
658        &self,
659        stream: impl Iterator<Item = crate::Result<InternalValue>>,
660    ) -> crate::Result<Option<FlushToTablesResult>> {
661        self.flush_to_tables_with_rt(stream, Vec::new())
662    }
663
664    /// Like [`AbstractTree::flush_to_tables`], but also writes range tombstones.
665    ///
666    /// This is an internal extension hook on the crate's sealed tree types and
667    /// is hidden from generated documentation.
668    ///
669    /// # Errors
670    ///
671    /// Will return `Err` if an IO error occurs.
672    #[doc(hidden)]
673    fn flush_to_tables_with_rt(
674        &self,
675        stream: impl Iterator<Item = crate::Result<InternalValue>>,
676        range_tombstones: Vec<crate::range_tombstone::RangeTombstone>,
677    ) -> crate::Result<Option<FlushToTablesResult>>;
678
679    /// Atomically registers flushed tables into the tree, removing their associated sealed memtables.
680    ///
681    /// # Errors
682    ///
683    /// Will return `Err` if an IO error occurs.
684    fn register_tables(
685        &self,
686        tables: &[Table],
687        blob_files: Option<&[BlobFile]>,
688        frag_map: Option<crate::blob_tree::FragmentationMap>,
689        sealed_memtables_to_delete: &[crate::tree::inner::MemtableId],
690        gc_watermark: SeqNo,
691    ) -> crate::Result<()>;
692
693    /// Clears the active memtable atomically.
694    fn clear_active_memtable(&self);
695
696    /// Returns the number of sealed memtables.
697    fn sealed_memtable_count(&self) -> usize;
698
699    /// Performs compaction on the tree's levels, blocking the caller until it's done.
700    ///
701    /// Returns a [`crate::compaction::CompactionResult`] describing what action was taken.
702    ///
703    /// # Errors
704    ///
705    /// Will return `Err` if an IO error occurs.
706    fn compact(
707        &self,
708        strategy: Arc<dyn crate::compaction::CompactionStrategy>,
709        seqno_threshold: SeqNo,
710    ) -> crate::Result<crate::compaction::CompactionResult>;
711
712    /// Returns the next table's ID.
713    fn get_next_table_id(&self) -> TableId;
714
715    /// Returns the tree config.
716    fn tree_config(&self) -> &Config;
717
718    /// Returns the highest sequence number.
719    fn get_highest_seqno(&self) -> Option<SeqNo> {
720        let memtable_seqno = self.get_highest_memtable_seqno();
721        let table_seqno = self.get_highest_persisted_seqno();
722        memtable_seqno.max(table_seqno)
723    }
724
725    /// Returns the active memtable.
726    fn active_memtable(&self) -> Arc<Memtable>;
727
728    /// Returns the tree type.
729    fn tree_type(&self) -> crate::TreeType {
730        if self.tree_config().kv_separation_opts.is_some() {
731            crate::TreeType::Blob
732        } else {
733            crate::TreeType::Standard
734        }
735    }
736
737    /// Seals the active memtable.
738    fn rotate_memtable(&self) -> Option<Arc<Memtable>>;
739
740    /// Returns the number of tables currently in the tree.
741    fn table_count(&self) -> usize;
742
743    /// Returns the number of tables in `levels[idx]`.
744    ///
745    /// Returns `None` if the level does not exist (if idx >= 7).
746    fn level_table_count(&self, idx: usize) -> Option<usize>;
747
748    /// Returns the number of disjoint runs in L0.
749    ///
750    /// Can be used to determine whether to write stall.
751    fn l0_run_count(&self) -> usize;
752
753    /// Returns the number of blob files currently in the tree.
754    fn blob_file_count(&self) -> usize;
755
756    /// Approximates the number of items in the tree.
757    fn approximate_len(&self) -> usize;
758
759    /// Returns the disk space usage.
760    fn disk_space(&self) -> u64;
761
762    /// Estimates the on-disk bytes and entry count contained in `range` at
763    /// `seqno`, WITHOUT reading any data block.
764    ///
765    /// The estimate interpolates each overlapping SST's data-block offsets at
766    /// the range boundaries (block granularity) and adds the active + sealed
767    /// memtables' in-range share. For a KV-separated tree the per-SST blob
768    /// bytes are apportioned by the same in-range fraction (blob files are not
769    /// key-indexed, so a finer estimate is impossible without reading data).
770    /// Intended for query planning (split-point selection, cost-based join
771    /// ordering), not exact accounting; accuracy is typically within ~10-15%
772    /// on roughly-uniform data.
773    ///
774    /// # Examples
775    ///
776    /// ```
777    /// # use lsm_tree::Error as TreeError;
778    /// use lsm_tree::{AbstractTree, Config};
779    ///
780    /// let folder = tempfile::tempdir()?;
781    /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
782    /// for i in 0..100u32 {
783    ///     tree.insert(format!("k{i:04}"), "value", 0);
784    /// }
785    /// tree.flush_active_memtable(0)?;
786    ///
787    /// // The full range covers every entry exactly once it is all flushed —
788    /// // estimated without reading a single data block.
789    /// let all = tree.approximate_range_stats::<&str, _>(.., 1)?;
790    /// assert_eq!(all.key_count, tree.approximate_len() as u64);
791    /// assert!(all.bytes > 0);
792    ///
793    /// // A range past every key is empty.
794    /// let none = tree.approximate_range_stats("zzzz".."zzzzz", 1)?;
795    /// assert_eq!(none.key_count, 0);
796    /// assert_eq!(none.bytes, 0);
797    /// #
798    /// # Ok::<(), TreeError>(())
799    /// ```
800    ///
801    /// # Errors
802    ///
803    /// Returns an error if a block index or table metadata read fails.
804    fn approximate_range_stats<K: AsRef<[u8]>, R: RangeBounds<K>>(
805        &self,
806        range: R,
807        seqno: SeqNo,
808    ) -> crate::Result<crate::ApproximateRangeStats>;
809
810    /// Estimates the row cardinality and selectivity of `range` at `seqno`,
811    /// WITHOUT reading any data block.
812    ///
813    /// Uses the per-data-block zone map (per-block row counts + key ranges) for a
814    /// block-granularity row count: every data block whose key range overlaps the
815    /// query contributes its recorded row count, plus the active + sealed
816    /// memtables' in-range counts. When a table has no zone map the row count
817    /// falls back to the byte-fraction estimate of [`approximate_range_stats`].
818    /// Selectivity is `rows / total_rows`, monotonic in predicate tightness, for
819    /// cost-based planning (join ordering, scan-vs-seek). Not exact accounting.
820    ///
821    /// [`approximate_range_stats`]: AbstractTree::approximate_range_stats
822    ///
823    /// # Examples
824    ///
825    /// ```
826    /// # use lsm_tree::Error as TreeError;
827    /// use lsm_tree::{AbstractTree, Config};
828    ///
829    /// let folder = tempfile::tempdir()?;
830    /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
831    /// for i in 0..100u32 {
832    ///     tree.insert(format!("k{i:04}"), "v", 0);
833    /// }
834    /// tree.flush_active_memtable(0)?;
835    ///
836    /// // The full range covers every row; selectivity is 1.0.
837    /// let all = tree.approximate_range_cardinality::<&str, _>(.., 1)?;
838    /// assert_eq!(all.rows, tree.approximate_len() as u64);
839    /// assert!((all.selectivity - 1.0).abs() < 1e-9);
840    ///
841    /// // A tighter range selects fewer rows.
842    /// let part = tree.approximate_range_cardinality("k0000".."k0050", 1)?;
843    /// assert!(part.selectivity <= all.selectivity);
844    /// #
845    /// # Ok::<(), TreeError>(())
846    /// ```
847    ///
848    /// # Errors
849    ///
850    /// Returns an error if a block index, zone map, or table metadata read fails.
851    fn approximate_range_cardinality<K: AsRef<[u8]>, R: RangeBounds<K>>(
852        &self,
853        range: R,
854        seqno: SeqNo,
855    ) -> crate::Result<crate::RangeCardinality>;
856
857    /// Returns the highest sequence number of the active memtable.
858    fn get_highest_memtable_seqno(&self) -> Option<SeqNo>;
859
860    /// Returns the highest sequence number that is flushed to disk.
861    fn get_highest_persisted_seqno(&self) -> Option<SeqNo>;
862
863    /// Scans the entire tree, returning the number of items.
864    ///
865    /// ###### Caution
866    ///
867    /// This operation scans the entire tree: O(n) complexity!
868    ///
869    /// Never, under any circumstances, use .`len()` == 0 to check
870    /// if the tree is empty, use [`Tree::is_empty`] instead.
871    ///
872    /// # Examples
873    ///
874    /// ```
875    /// # use lsm_tree::Error as TreeError;
876    /// use lsm_tree::{AbstractTree, Config, Tree};
877    ///
878    /// let folder = tempfile::tempdir()?;
879    /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
880    ///
881    /// assert_eq!(tree.len(0, None)?, 0);
882    /// tree.insert("1", "abc", 0);
883    /// tree.insert("3", "abc", 1);
884    /// tree.insert("5", "abc", 2);
885    /// assert_eq!(tree.len(3, None)?, 3);
886    /// #
887    /// # Ok::<(), TreeError>(())
888    /// ```
889    ///
890    /// # Errors
891    ///
892    /// Will return `Err` if an IO error occurs.
893    fn len(&self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>) -> crate::Result<usize> {
894        let mut count = 0;
895
896        for item in self.iter(seqno, index) {
897            let _ = item.key()?;
898            count += 1;
899        }
900
901        Ok(count)
902    }
903
904    /// Returns `true` if the tree is empty.
905    ///
906    /// This operation has O(log N) complexity.
907    ///
908    /// # Examples
909    ///
910    /// ```
911    /// # let folder = tempfile::tempdir()?;
912    /// use lsm_tree::{AbstractTree, Config, Tree};
913    ///
914    /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
915    /// assert!(tree.is_empty(0, None)?);
916    ///
917    /// tree.insert("a", "abc", 0);
918    /// assert!(!tree.is_empty(1, None)?);
919    /// #
920    /// # Ok::<(), lsm_tree::Error>(())
921    /// ```
922    ///
923    /// # Errors
924    ///
925    /// Will return `Err` if an IO error occurs.
926    fn is_empty(&self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>) -> crate::Result<bool> {
927        Ok(self
928            .first_key_value(seqno, index)
929            .map(crate::Guard::key)
930            .transpose()?
931            .is_none())
932    }
933
934    /// Returns the first key-value pair in the tree.
935    /// The key in this pair is the minimum key in the tree.
936    ///
937    /// # Examples
938    ///
939    /// ```
940    /// # use lsm_tree::Error as TreeError;
941    /// # use lsm_tree::{AbstractTree, Config, Tree, Guard};
942    /// #
943    /// # let folder = tempfile::tempdir()?;
944    /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
945    ///
946    /// tree.insert("1", "abc", 0);
947    /// tree.insert("3", "abc", 1);
948    /// tree.insert("5", "abc", 2);
949    ///
950    /// let key = tree.first_key_value(3, None).expect("item should exist").key()?;
951    /// assert_eq!(&*key, "1".as_bytes());
952    /// #
953    /// # Ok::<(), TreeError>(())
954    /// ```
955    ///
956    /// # Errors
957    ///
958    /// Will return `Err` if an IO error occurs.
959    fn first_key_value(
960        &self,
961        seqno: SeqNo,
962        index: Option<(Arc<Memtable>, SeqNo)>,
963    ) -> Option<IterGuardImpl> {
964        self.iter(seqno, index).next()
965    }
966
967    /// Returns the last key-value pair in the tree.
968    /// The key in this pair is the maximum key in the tree.
969    ///
970    /// # Examples
971    ///
972    /// ```
973    /// # use lsm_tree::Error as TreeError;
974    /// # use lsm_tree::{AbstractTree, Config, Tree, Guard};
975    /// #
976    /// # let folder = tempfile::tempdir()?;
977    /// # let tree = Config::new(folder, Default::default(), Default::default()).open()?;
978    /// #
979    /// tree.insert("1", "abc", 0);
980    /// tree.insert("3", "abc", 1);
981    /// tree.insert("5", "abc", 2);
982    ///
983    /// let key = tree.last_key_value(3, None).expect("item should exist").key()?;
984    /// assert_eq!(&*key, "5".as_bytes());
985    /// #
986    /// # Ok::<(), TreeError>(())
987    /// ```
988    ///
989    /// # Errors
990    ///
991    /// Will return `Err` if an IO error occurs.
992    fn last_key_value(
993        &self,
994        seqno: SeqNo,
995        index: Option<(Arc<Memtable>, SeqNo)>,
996    ) -> Option<IterGuardImpl> {
997        self.iter(seqno, index).next_back()
998    }
999
1000    /// Returns the size of a value if it exists.
1001    ///
1002    /// # Examples
1003    ///
1004    /// ```
1005    /// # let folder = tempfile::tempdir()?;
1006    /// use lsm_tree::{AbstractTree, Config, Tree};
1007    ///
1008    /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1009    /// tree.insert("a", "my_value", 0);
1010    ///
1011    /// let size = tree.size_of("a", 1)?.unwrap_or_default();
1012    /// assert_eq!("my_value".len() as u32, size);
1013    ///
1014    /// let size = tree.size_of("b", 1)?.unwrap_or_default();
1015    /// assert_eq!(0, size);
1016    /// #
1017    /// # Ok::<(), lsm_tree::Error>(())
1018    /// ```
1019    ///
1020    /// # Errors
1021    ///
1022    /// Will return `Err` if an IO error occurs.
1023    fn size_of<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<Option<u32>>;
1024
1025    /// Retrieves an item from the tree.
1026    ///
1027    /// # Examples
1028    ///
1029    /// ```
1030    /// # let folder = tempfile::tempdir()?;
1031    /// use lsm_tree::{AbstractTree, Config, Tree};
1032    ///
1033    /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1034    /// tree.insert("a", "my_value", 0);
1035    ///
1036    /// let item = tree.get("a", 1)?;
1037    /// assert_eq!(Some("my_value".as_bytes().into()), item);
1038    /// #
1039    /// # Ok::<(), lsm_tree::Error>(())
1040    /// ```
1041    ///
1042    /// # Errors
1043    ///
1044    /// Will return `Err` if an IO error occurs.
1045    fn get<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<Option<UserValue>>;
1046
1047    /// Retrieves an item from the tree as a [`PinnableSlice`](crate::PinnableSlice).
1048    ///
1049    /// When the value is backed by an on-disk data block, implementations
1050    /// may return [`PinnableSlice::Pinned`](crate::PinnableSlice::Pinned) holding a reference to that block's
1051    /// decompressed buffer (avoiding a data copy). Memtable and blob-resolved
1052    /// values use [`PinnableSlice::Owned`](crate::PinnableSlice::Owned). The default implementation always
1053    /// returns `Owned`; only [`Tree`] overrides with the pinned path.
1054    ///
1055    /// The existing [`AbstractTree::get`] method is unaffected.
1056    ///
1057    /// # Examples
1058    ///
1059    /// ```
1060    /// # let folder = tempfile::tempdir()?;
1061    /// use lsm_tree::{AbstractTree, Config, Tree};
1062    ///
1063    /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
1064    /// tree.insert("a", "my_value", 0);
1065    ///
1066    /// let item = tree.get_pinned("a", 1)?;
1067    /// assert_eq!(item.as_ref().map(|v| v.as_ref()), Some("my_value".as_bytes()));
1068    /// #
1069    /// # Ok::<(), lsm_tree::Error>(())
1070    /// ```
1071    ///
1072    /// # Errors
1073    ///
1074    /// Will return `Err` if an IO error occurs.
1075    fn get_pinned<K: AsRef<[u8]>>(
1076        &self,
1077        key: K,
1078        seqno: SeqNo,
1079    ) -> crate::Result<Option<crate::PinnableSlice>> {
1080        // Default: delegate to get() and wrap as Owned
1081        self.get(key, seqno)
1082            .map(|opt| opt.map(crate::PinnableSlice::owned))
1083    }
1084
1085    /// Returns `true` if the tree contains the specified key.
1086    ///
1087    /// # Examples
1088    ///
1089    /// ```
1090    /// # let folder = tempfile::tempdir()?;
1091    /// # use lsm_tree::{AbstractTree, Config, Tree};
1092    /// #
1093    /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1094    /// assert!(!tree.contains_key("a", 0)?);
1095    ///
1096    /// tree.insert("a", "abc", 0);
1097    /// assert!(tree.contains_key("a", 1)?);
1098    /// #
1099    /// # Ok::<(), lsm_tree::Error>(())
1100    /// ```
1101    ///
1102    /// # Errors
1103    ///
1104    /// Will return `Err` if an IO error occurs.
1105    fn contains_key<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<bool> {
1106        self.get(key, seqno).map(|x| x.is_some())
1107    }
1108
1109    /// Returns `true` if the tree contains any key with the given prefix.
1110    ///
1111    /// This is a convenience method that checks whether the corresponding
1112    /// prefix iterator yields at least one item, while surfacing any IO
1113    /// errors via the `Result` return type. Implementations may override
1114    /// this method to provide a more efficient prefix-existence check.
1115    ///
1116    /// # Examples
1117    ///
1118    /// ```
1119    /// # let folder = tempfile::tempdir()?;
1120    /// use lsm_tree::{AbstractTree, Config, Tree};
1121    ///
1122    /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1123    /// assert!(!tree.contains_prefix("abc", 0, None)?);
1124    ///
1125    /// tree.insert("abc:1", "value", 0);
1126    /// assert!(tree.contains_prefix("abc", 1, None)?);
1127    /// assert!(!tree.contains_prefix("xyz", 1, None)?);
1128    /// #
1129    /// # Ok::<(), lsm_tree::Error>(())
1130    /// ```
1131    ///
1132    /// # Errors
1133    ///
1134    /// Will return `Err` if an IO error occurs.
1135    fn contains_prefix<K: AsRef<[u8]>>(
1136        &self,
1137        prefix: K,
1138        seqno: SeqNo,
1139        index: Option<(Arc<Memtable>, SeqNo)>,
1140    ) -> crate::Result<bool> {
1141        match self.prefix(prefix, seqno, index).next() {
1142            Some(guard) => guard.key().map(|_| true),
1143            None => Ok(false),
1144        }
1145    }
1146
1147    /// Reads multiple keys from the tree.
1148    ///
1149    /// Implementations may choose to perform all lookups against a single
1150    /// version snapshot and acquire the version lock only once, which can be
1151    /// more efficient than calling [`AbstractTree::get`] in a loop. The
1152    /// default trait implementation, however, is a convenience wrapper that
1153    /// simply calls [`AbstractTree::get`] for each key and therefore does not
1154    /// guarantee a single-snapshot or single-lock acquisition. Optimized
1155    /// implementations (such as [`Tree`] and [`BlobTree`]) provide the
1156    /// single-snapshot/one-lock behavior.
1157    ///
1158    /// # Examples
1159    ///
1160    /// ```
1161    /// # let folder = tempfile::tempdir()?;
1162    /// use lsm_tree::{AbstractTree, Config, Tree};
1163    ///
1164    /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1165    /// tree.insert("a", "value_a", 0);
1166    /// tree.insert("b", "value_b", 1);
1167    ///
1168    /// let results = tree.multi_get(["a", "b", "c"], 2)?;
1169    /// assert_eq!(results[0], Some("value_a".as_bytes().into()));
1170    /// assert_eq!(results[1], Some("value_b".as_bytes().into()));
1171    /// assert_eq!(results[2], None);
1172    /// #
1173    /// # Ok::<(), lsm_tree::Error>(())
1174    /// ```
1175    ///
1176    /// # Errors
1177    ///
1178    /// Will return `Err` if an IO error occurs.
1179    fn multi_get<K: AsRef<[u8]>>(
1180        &self,
1181        keys: impl IntoIterator<Item = K>,
1182        seqno: SeqNo,
1183    ) -> crate::Result<Vec<Option<UserValue>>> {
1184        keys.into_iter().map(|key| self.get(key, seqno)).collect()
1185    }
1186
1187    /// Applies a [`WriteBatch`](crate::WriteBatch) with the given sequence number.
1188    ///
1189    /// All entries share a single seqno. This is more efficient than individual
1190    /// writes because the version-history lock and memtable size accounting
1191    /// are performed only once for the entire batch.
1192    ///
1193    /// **Visibility:** entries become individually visible to concurrent readers
1194    /// as they are inserted. For atomic batch visibility, the caller must
1195    /// publish `seqno` (via `visible_seqno.fetch_max(seqno + 1)`) only
1196    /// **after** this method returns.
1197    ///
1198    /// Returns the total bytes added and new size of the memtable.
1199    ///
1200    /// # Examples
1201    ///
1202    /// ```
1203    /// # let folder = tempfile::tempdir()?;
1204    /// use lsm_tree::{AbstractTree, Config, Tree, WriteBatch};
1205    ///
1206    /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
1207    ///
1208    /// let mut batch = WriteBatch::new();
1209    /// batch.insert("key1", "value1");
1210    /// batch.insert("key2", "value2");
1211    /// batch.remove("key3");
1212    ///
1213    /// let (bytes_added, memtable_size) = tree.apply_batch(batch, 0)?;
1214    /// assert!(bytes_added > 0);
1215    /// #
1216    /// # Ok::<(), lsm_tree::Error>(())
1217    /// ```
1218    ///
1219    /// # Errors
1220    ///
1221    /// Returns [`Error::MixedOperationBatch`](crate::Error::MixedOperationBatch)
1222    /// if the batch contains mixed operation types for the same user key.
1223    fn apply_batch(&self, batch: crate::WriteBatch, seqno: SeqNo) -> crate::Result<(u64, u64)>;
1224
1225    /// Inserts a key-value pair into the tree.
1226    ///
1227    /// If the key already exists, the item will be overwritten.
1228    ///
1229    /// Returns the added item's size and new size of the memtable.
1230    ///
1231    /// # Examples
1232    ///
1233    /// ```
1234    /// # let folder = tempfile::tempdir()?;
1235    /// use lsm_tree::{AbstractTree, Config, Tree};
1236    ///
1237    /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1238    /// tree.insert("a", "abc", 0);
1239    /// #
1240    /// # Ok::<(), lsm_tree::Error>(())
1241    /// ```
1242    ///
1243    /// # Errors
1244    ///
1245    /// Will return `Err` if an IO error occurs.
1246    fn insert<K: Into<UserKey>, V: Into<UserValue>>(
1247        &self,
1248        key: K,
1249        value: V,
1250        seqno: SeqNo,
1251    ) -> (u64, u64);
1252
1253    /// Admission-gated [`insert`](Self::insert): consults
1254    /// [`write_admission`](Self::write_admission) first and declines with
1255    /// [`Error::StorageFull`](crate::Error::StorageFull) when the tree is over
1256    /// budget, otherwise inserts and returns the same `(added_bytes,
1257    /// memtable_size)` tuple. This is the write entry a space-aware caller uses
1258    /// so over-budget writes are refused up front rather than failing a flush
1259    /// later; bare [`insert`](Self::insert) stays infallible for callers that
1260    /// do not opt into admission control.
1261    ///
1262    /// # Errors
1263    ///
1264    /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1265    /// is closed.
1266    fn try_insert<K: Into<UserKey>, V: Into<UserValue>>(
1267        &self,
1268        key: K,
1269        value: V,
1270        seqno: SeqNo,
1271    ) -> crate::Result<(u64, u64)> {
1272        self.write_admission()?;
1273        Ok(self.insert(key, value, seqno))
1274    }
1275
1276    /// Admission-gated [`merge`](Self::merge). See
1277    /// [`try_insert`](Self::try_insert).
1278    ///
1279    /// # Errors
1280    ///
1281    /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1282    /// is closed.
1283    fn try_merge<K: Into<UserKey>, V: Into<UserValue>>(
1284        &self,
1285        key: K,
1286        operand: V,
1287        seqno: SeqNo,
1288    ) -> crate::Result<(u64, u64)> {
1289        self.write_admission()?;
1290        Ok(self.merge(key, operand, seqno))
1291    }
1292
1293    /// Admission-gated [`remove`](Self::remove). See
1294    /// [`try_insert`](Self::try_insert).
1295    ///
1296    /// Note a tombstone is itself a write that consumes space; an over-budget
1297    /// tree refuses it too. Reclaim space via compaction (never gated) rather
1298    /// than relying on deletes when already read-only.
1299    ///
1300    /// # Errors
1301    ///
1302    /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1303    /// is closed.
1304    fn try_remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> crate::Result<(u64, u64)> {
1305        self.write_admission()?;
1306        Ok(self.remove(key, seqno))
1307    }
1308
1309    /// Admission-gated [`remove_weak`](Self::remove_weak). See
1310    /// [`try_insert`](Self::try_insert).
1311    ///
1312    /// # Errors
1313    ///
1314    /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1315    /// is closed.
1316    fn try_remove_weak<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> crate::Result<(u64, u64)> {
1317        self.write_admission()?;
1318        Ok(self.remove_weak(key, seqno))
1319    }
1320
1321    /// Admission-gated [`remove_range`](Self::remove_range). See
1322    /// [`try_insert`](Self::try_insert).
1323    ///
1324    /// # Errors
1325    ///
1326    /// [`Error::StorageFull`](crate::Error::StorageFull) when the admission gate
1327    /// is closed.
1328    fn try_remove_range<K: Into<UserKey>>(
1329        &self,
1330        start: K,
1331        end: K,
1332        seqno: SeqNo,
1333    ) -> crate::Result<u64> {
1334        self.write_admission()?;
1335        Ok(self.remove_range(start, end, seqno))
1336    }
1337
1338    /// Removes an item from the tree.
1339    ///
1340    /// Returns the added item's size and new size of the memtable.
1341    ///
1342    /// # Examples
1343    ///
1344    /// ```
1345    /// # let folder = tempfile::tempdir()?;
1346    /// # use lsm_tree::{AbstractTree, Config, Tree};
1347    /// #
1348    /// # let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1349    /// tree.insert("a", "abc", 0);
1350    ///
1351    /// let item = tree.get("a", 1)?.expect("should have item");
1352    /// assert_eq!("abc".as_bytes(), &*item);
1353    ///
1354    /// tree.remove("a", 1);
1355    ///
1356    /// let item = tree.get("a", 2)?;
1357    /// assert_eq!(None, item);
1358    /// #
1359    /// # Ok::<(), lsm_tree::Error>(())
1360    /// ```
1361    ///
1362    /// # Errors
1363    ///
1364    /// Will return `Err` if an IO error occurs.
1365    fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64);
1366
1367    /// Writes a merge operand for a key.
1368    ///
1369    /// The operand is stored as a partial update that will be combined with
1370    /// other operands and/or a base value via the configured [`crate::MergeOperator`]
1371    /// during reads and compaction.
1372    ///
1373    /// Returns the added item's size and new size of the memtable.
1374    ///
1375    /// # Examples
1376    ///
1377    /// ```
1378    /// # let folder = tempfile::tempdir()?;
1379    /// # use lsm_tree::{AbstractTree, Config, MergeOperator, UserValue};
1380    /// # use std::sync::Arc;
1381    /// # struct SumMerge;
1382    /// # impl MergeOperator for SumMerge {
1383    /// #     fn merge(&self, _key: &[u8], base: Option<&[u8]>, operands: &[&[u8]]) -> lsm_tree::Result<UserValue> {
1384    /// #         let mut sum: i64 = base.map_or(0, |b| i64::from_le_bytes(b.try_into().unwrap()));
1385    /// #         for op in operands { sum += i64::from_le_bytes((*op).try_into().unwrap()); }
1386    /// #         Ok(sum.to_le_bytes().to_vec().into())
1387    /// #     }
1388    /// # }
1389    /// # let tree = Config::new(folder, Default::default(), Default::default())
1390    /// #     .with_merge_operator(Some(Arc::new(SumMerge)))
1391    /// #     .open()?;
1392    /// tree.merge("counter", 1_i64.to_le_bytes(), 0);
1393    /// # Ok::<(), lsm_tree::Error>(())
1394    /// ```
1395    fn merge<K: Into<UserKey>, V: Into<UserValue>>(
1396        &self,
1397        key: K,
1398        operand: V,
1399        seqno: SeqNo,
1400    ) -> (u64, u64);
1401
1402    /// Removes an item from the tree.
1403    ///
1404    /// The tombstone marker of this delete operation will vanish when it
1405    /// collides with its corresponding insertion.
1406    /// This may cause older versions of the value to be resurrected, so it should
1407    /// only be used and preferred in scenarios where a key is only ever written once.
1408    ///
1409    /// Returns the added item's size and new size of the memtable.
1410    ///
1411    /// # Examples
1412    ///
1413    /// ```
1414    /// # let folder = tempfile::tempdir()?;
1415    /// # use lsm_tree::{AbstractTree, Config, Tree};
1416    /// #
1417    /// # let tree = Config::new(folder, Default::default(), Default::default()).open()?;
1418    /// tree.insert("a", "abc", 0);
1419    ///
1420    /// let item = tree.get("a", 1)?.expect("should have item");
1421    /// assert_eq!("abc".as_bytes(), &*item);
1422    ///
1423    /// tree.remove_weak("a", 1);
1424    ///
1425    /// let item = tree.get("a", 2)?;
1426    /// assert_eq!(None, item);
1427    /// #
1428    /// # Ok::<(), lsm_tree::Error>(())
1429    /// ```
1430    ///
1431    /// # Errors
1432    ///
1433    /// Will return `Err` if an IO error occurs.
1434    #[doc(hidden)]
1435    fn remove_weak<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64);
1436
1437    /// Deletes all keys in the range `[start, end)` by inserting a range tombstone.
1438    ///
1439    /// This is much more efficient than deleting keys individually when
1440    /// removing a contiguous range of keys.
1441    ///
1442    /// Returns the approximate size added to the memtable.
1443    /// Returns 0 if `start >= end` (invalid interval is silently ignored).
1444    ///
1445    /// This is a required method on the crate's sealed tree types.
1446    fn remove_range<K: Into<UserKey>>(&self, start: K, end: K, seqno: SeqNo) -> u64;
1447
1448    /// Deletes all keys with the given prefix by inserting a range tombstone.
1449    ///
1450    /// This is sugar over [`AbstractTree::remove_range`] using prefix bounds.
1451    ///
1452    /// Returns the approximate size added to the memtable.
1453    /// Returns 0 for empty prefixes or all-`0xFF` prefixes (cannot form valid half-open range).
1454    fn remove_prefix<K: AsRef<[u8]>>(&self, prefix: K, seqno: SeqNo) -> u64 {
1455        use crate::range::prefix_to_range;
1456        use core::ops::Bound;
1457
1458        let (lo, hi) = prefix_to_range(prefix.as_ref());
1459
1460        let Bound::Included(start) = lo else { return 0 };
1461
1462        // Bound::Unbounded means the prefix is all 0xFF — no representable
1463        // exclusive upper bound exists, so we cannot form a valid range tombstone.
1464        let Bound::Excluded(end) = hi else { return 0 };
1465
1466        self.remove_range(start, end, seqno)
1467    }
1468}