coordinode-lsm-tree 5.7.0

Embedded LSM-tree storage engine: BuRR filters, zstd dictionary compression, MVCC, range tombstones, merge operators, K/V separation, AES-256-GCM at rest.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
// SPDX-License-Identifier: Apache-2.0
// Copyright (c) 2026-present, Structured World Foundation

//! Read-only storage introspection: how much is stored, the average shape of a
//! stored entry, and an estimate of how many more entries fit in a byte budget.
//!
//! Computed from the live version's table + blob-file metadata plus one
//! size-stat per live file (the same accounting `Tree::create_checkpoint`
//! uses), so it never touches the data blocks. See
//! [`crate::AbstractTree::storage_stats`].

use crate::version::Version;
#[cfg(not(feature = "std"))]
use alloc::vec::Vec;

/// Coarse storage state of a tree.
///
/// With storage admission gating off (no configured quota and a backend that
/// cannot report free space) a tree reports [`Self::Healthy`] or, mid-run,
/// [`Self::CompactionInProgress`]. Once gating is active (bounded capacity), an
/// idle tree instead reports compaction availability:
/// [`Self::FullCompactionAvailable`] when a full compaction has working room,
/// [`Self::TightCompactionAvailable`] when only the opt-in tight-space mode
/// would fit, and [`Self::ReadOnlyOutOfSpace`] when the write gate is closed
/// (this takes precedence over a concurrent compaction).
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
#[non_exhaustive]
pub enum StorageStatus {
    /// Normal operation: writes and a full compaction are available.
    Healthy,
    /// Enough free space for a normal (full) compaction.
    FullCompactionAvailable,
    /// Not enough space for a full compaction, but the opt-in tight-space
    /// (incremental-reclaim) compaction mode can still run.
    TightCompactionAvailable,
    /// Out of space: the tree is read-only until space is freed or the quota
    /// is raised.
    ReadOnlyOutOfSpace,
    /// A compaction is currently running.
    CompactionInProgress,
}

/// A point-in-time snapshot of a tree's on-disk storage footprint and the
/// average shape of a stored entry.
///
/// All byte figures are on-disk (post-compression, including any per-block
/// overhead and blob files). Averages are over every stored entry version, so
/// they pair with [`Self::item_count`].
#[must_use]
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct StorageStats {
    /// Total on-disk bytes of all live SSTs plus blob files: how much is
    /// **occupied**. Pairs with [`Self::capacity_bytes`] / [`Self::available_bytes`]
    /// for an "X of Y used" view in a single call.
    pub used_bytes: u64,

    /// Total bytes the tree may occupy: the tighter of a configured byte quota
    /// (`storage_limit_bytes`) and the physical disk headroom (free space plus
    /// what is already used), across every volume the tree writes to. `None`
    /// when unbounded: no quota set AND the backend cannot report free space.
    pub capacity_bytes: Option<u64>,

    /// Free room left before the tree turns read-only: `capacity_bytes - used_bytes`
    /// (saturating). `None` exactly when [`Self::capacity_bytes`] is `None`
    /// (unbounded).
    pub available_bytes: Option<u64>,

    /// Whether a compaction can still run given the remaining free space (it
    /// needs working room to write merged output). `true` when unbounded or
    /// when at least [`Self::tight_compaction_bytes`] of free space remains;
    /// `false` when the disk is too full for a compaction to make progress. The
    /// finer full-vs-tight distinction is carried by [`Self::status`].
    pub compaction_possible: bool,

    /// Estimated free space (bytes) a FULL compaction needs for its transient
    /// output while the inputs still exist: the largest level's on-disk size
    /// (an upper bound on a single merge's input set). A full compaction has
    /// room when [`Self::available_bytes`] `>=` this. Pair with `used_bytes` /
    /// `capacity_bytes` to draw a capacity gauge: `used` → `used + tight_compaction_bytes`
    /// → `used + full_compaction_bytes` → `capacity`.
    pub full_compaction_bytes: u64,

    /// Estimated free space (bytes) a minimal (tight) space-reclaiming
    /// compaction needs to make forward progress: the reserved working floor.
    /// Tight compaction has room when [`Self::available_bytes`] `>=` this.
    pub tight_compaction_bytes: u64,

    /// Number of live entries (all versions) across all live SSTs.
    pub item_count: u64,

    /// Number of live SSTs.
    pub table_count: u64,

    /// Average on-disk bytes per entry (`used_bytes / item_count`), or `0` when
    /// the tree is empty. This is the figure
    /// [`Self::estimated_remaining_entries`] divides a budget by.
    pub avg_entry_on_disk_bytes: u64,

    /// Average user-key byte length per entry, or `None` if any live table was
    /// written before per-table key/value byte sums were recorded (the average
    /// key/value split is only exact when every table carries the figures).
    pub avg_key_bytes: Option<u64>,

    /// Average value byte length per entry, or `None` under the same condition
    /// as [`Self::avg_key_bytes`].
    pub avg_value_bytes: Option<u64>,

    /// Estimated bytes a full compaction could reclaim, from the
    /// weak-tombstone-reclaimable entry count times the average on-disk entry
    /// size. An estimate, not an exact figure.
    pub reclaimable_bytes_estimate: u64,

    /// Coarse storage state.
    pub status: StorageStatus,
}

/// Approximate size of a key range, estimated from SST block-index offsets and
/// the active memtable WITHOUT reading any data block. Returned by
/// [`crate::AbstractTree::approximate_range_stats`].
///
/// Both figures are estimates from the same in-range fraction per source: each
/// overlapping SST's data-block offsets are interpolated at the range
/// boundaries (block granularity) and that fraction is applied to the SST's
/// byte span and its entry count, while each memtable contributes its in-range
/// skiplist count and the matching share of its size. Accuracy is typically
/// within ~10-15% on roughly-uniform data; it is intended for query planning
/// (split-point selection, cost-based join ordering), not exact accounting.
#[must_use]
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq)]
pub struct ApproximateRangeStats {
    /// Estimated on-disk bytes occupied by the range across all overlapping
    /// SSTs (key + pointer + apportioned blob bytes) plus the active and sealed
    /// memtables' in-range share. `0` for an empty range.
    pub bytes: u64,

    /// Estimated number of entry versions in the range: the sum, over each
    /// overlapping SST, of `item_count × in-range fraction`, plus each
    /// memtable's in-range skiplist count. `0` for an empty range.
    pub key_count: u64,
}

/// Size and entry count of one stored segment (SST), for per-segment tiering and
/// erasure-coding placement decisions.
#[must_use]
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct SegmentStats {
    /// Identifier of the segment's SST within its tree.
    pub table_id: crate::TableId,
    /// LSM level the segment lives in (`0` is the newest / smallest level).
    pub level: usize,
    /// Physical on-disk bytes of the segment's SST file.
    pub used_bytes: u64,
    /// Number of entry versions stored in the segment.
    pub item_count: u64,
    /// Cumulative point reads that consulted this segment's data since it was
    /// created: only reads that pass the segment's seqno-range and bloom gates
    /// count (a bloom miss is not counted), so this tracks data hotness rather
    /// than raw probe frequency. A monotonic counter, not a rate: derive a
    /// read-rate / EMA from the delta between successive polls. `0` when never
    /// read.
    pub reads: u64,
    /// Unix seconds of the segment's most recent data-consulting read, or `0` if
    /// never read (or on a no-std build, which keeps no clock).
    pub last_access_secs: u64,
}

/// Per-LSM-level size + entry aggregates with the contributing segments, for
/// tiering and erasure-coding placement (which level / segment is large enough
/// to demote, EC-encode, or migrate).
///
/// Cheap to read: derived from version metadata plus one file-size stat per
/// segment, never a data-block scan. The per-level totals reconcile with the
/// tree-level [`StorageStats`]: summed across levels they equal the SST portion
/// of [`StorageStats::used_bytes`] and [`StorageStats::item_count`] (blob files
/// are tracked separately).
#[must_use]
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LevelStats {
    /// LSM level index (`0` is the newest / smallest level).
    pub level: usize,
    /// Number of segments (SSTs) in the level.
    pub segment_count: usize,
    /// Physical on-disk bytes summed across the level's segments.
    pub used_bytes: u64,
    /// Entry versions summed across the level's segments.
    pub item_count: u64,
    /// Cumulative point-read probes summed across the level's segments.
    pub reads: u64,
    /// Most recent point-read probe across the level's segments, in unix
    /// seconds, or `0` if none was ever read.
    pub last_access_secs: u64,
    /// Per-segment breakdown, in level (run / table) order.
    pub segments: Vec<SegmentStats>,
}

/// Approximate cardinality and selectivity of a key range, for cost-based query
/// planning (join ordering, scan-vs-seek).
///
/// Both figures derive from the per-data-block zone map (per-block row counts +
/// key ranges) when present, falling back to the byte-fraction estimate of
/// [`ApproximateRangeStats`] otherwise. They are estimates at block granularity,
/// never exact.
#[must_use]
#[derive(Copy, Clone, Debug, Default, PartialEq)]
pub struct RangeCardinality {
    /// Estimated number of rows (entry versions) the range covers: the sum of
    /// the per-block row counts of every data block whose key range overlaps the
    /// query range, plus each memtable's in-range count. `0` for an empty range.
    pub rows: u64,

    /// Estimated fraction of the tree's rows the range selects, in `0.0..=1.0`:
    /// `rows / total_rows`. Monotonic in predicate tightness (a narrower range
    /// never yields a larger selectivity). `0.0` when the tree is empty.
    pub selectivity: f64,
}

/// Grouped, object-safe read-only storage-statistics surface.
///
/// A coherent view over a tree's non-query statistics: on-disk footprint
/// ([`storage_stats`](Self::storage_stats)), per-level / per-segment sizing
/// ([`level_segment_stats`](Self::level_segment_stats)), compaction debt
/// ([`compaction_debt`](Self::compaction_debt)), and block-cache health
/// (`cache_stats`, behind the `metrics` feature). A planner / tiering / capacity consumer
/// bounds on `T: StorageStatistics` (or `&dyn StorageStatistics`) and a test can
/// supply a mock. Every [`AbstractTree`](crate::AbstractTree) implements it via a
/// blanket impl (`impl<T: AbstractTree + ?Sized> StorageStatistics for T`).
///
/// The per-query range estimators
/// ([`approximate_range_stats`](crate::AbstractTree::approximate_range_stats),
/// [`approximate_range_cardinality`](crate::AbstractTree::approximate_range_cardinality))
/// are generic over the range type and so not object-safe; they stay on
/// [`AbstractTree`](crate::AbstractTree) rather than joining this trait.
pub trait StorageStatistics {
    /// On-disk footprint and average entry shape: used / capacity / available
    /// bytes, item & table counts, average entry size, reclaimable-bytes
    /// estimate, and a coarse [`StorageStatus`]. See
    /// [`StorageStats::estimated_remaining_entries`] for a budget projection.
    ///
    /// # Examples
    ///
    /// ```
    /// # use lsm_tree::Error as TreeError;
    /// use lsm_tree::{AbstractTree, Config, StorageStatistics};
    ///
    /// let folder = tempfile::tempdir()?;
    /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
    /// for i in 0..100u32 {
    ///     tree.insert(format!("k{i:04}"), "v", 0);
    /// }
    /// tree.flush_active_memtable(0)?;
    ///
    /// // Both traits are in scope, so disambiguate the shared method name.
    /// let stats = StorageStatistics::storage_stats(&tree)?;
    /// assert_eq!(stats.item_count, 100);
    /// // Roughly how many more average-shaped entries fit in another 1 MiB.
    /// let _headroom = stats.estimated_remaining_entries(1024 * 1024);
    /// #
    /// # Ok::<(), TreeError>(())
    /// ```
    ///
    /// # Errors
    ///
    /// Returns an error if a live file's size cannot be stat-ed.
    fn storage_stats(&self) -> crate::Result<StorageStats>;

    /// Per-LSM-level and per-segment size + entry-count stats, for tiering and
    /// erasure-coding placement decisions (which level / segment is large enough
    /// to demote, EC-encode, or migrate).
    ///
    /// Cheap: derived from the live version's metadata plus one file-size stat
    /// per segment (no data-block scan). The per-level totals reconcile with
    /// [`storage_stats`](Self::storage_stats): summed across levels they equal
    /// the SST portion of [`StorageStats::used_bytes`] and
    /// [`StorageStats::item_count`].
    ///
    /// # Examples
    ///
    /// ```
    /// # use lsm_tree::Error as TreeError;
    /// use lsm_tree::{AbstractTree, Config, StorageStatistics};
    ///
    /// let folder = tempfile::tempdir()?;
    /// let tree = Config::new(&folder, Default::default(), Default::default()).open()?;
    /// for i in 0..100u32 {
    ///     tree.insert(format!("k{i:04}"), "v", 0);
    /// }
    /// tree.flush_active_memtable(0)?;
    ///
    /// // Both traits are in scope, so disambiguate the shared method names.
    /// let levels = StorageStatistics::level_segment_stats(&tree)?;
    /// let total: u64 = levels.iter().map(|l| l.item_count).sum();
    /// assert_eq!(total, StorageStatistics::storage_stats(&tree)?.item_count);
    /// #
    /// # Ok::<(), TreeError>(())
    /// ```
    ///
    /// # Errors
    ///
    /// Returns an error if a segment's file size cannot be stat-ed.
    fn level_segment_stats(&self) -> crate::Result<Vec<LevelStats>>;

    /// Estimated bytes pending compaction under `strategy`: on-disk data above
    /// its level's target that must eventually be rewritten downward (a `RocksDB`
    /// `estimate-pending-compaction-bytes` analog), a compaction-debt signal for a
    /// scheduler / tiering consumer.
    ///
    /// The strategy is a caller argument because the engine does not own a
    /// configured compaction strategy (it is injected per compaction run); a
    /// `&dyn` keeps this object-safe. Returns `0` for strategies without a
    /// size-target notion of debt (FIFO, drop-range), or when the tree is at or
    /// below its target shape. See
    /// [`CompactionStrategy::pending_compaction_bytes`](crate::compaction::CompactionStrategy::pending_compaction_bytes).
    fn compaction_debt(&self, strategy: &dyn crate::compaction::CompactionStrategy) -> u64;

    /// A point-in-time [`CacheStats`](crate::CacheStats) snapshot of block-cache
    /// effectiveness (cumulative hit / miss counts and rate) and occupancy
    /// (current size against capacity).
    ///
    /// The stable, owned observability view over the block cache, so a consumer
    /// reads cache health without holding the mutable
    /// [`metrics`](crate::AbstractTree::metrics) handle. Counts are cumulative
    /// since process start; derive a rate over an interval from the delta between
    /// two polls.
    #[cfg(feature = "metrics")]
    fn cache_stats(&self) -> crate::CacheStats;
}

/// Every [`AbstractTree`](crate::AbstractTree) is a [`StorageStatistics`] by
/// delegating to its own inherent stats methods, so a `Tree` / `BlobTree` can be
/// used directly as `&dyn StorageStatistics`. The logic lives once on
/// `AbstractTree`; this is a thin object-safe re-exposure for the grouped /
/// mockable surface (a test mock implements `StorageStatistics` directly without
/// being an `AbstractTree`). When both traits are in scope, disambiguate a bare
/// `tree.storage_stats()` with `StorageStatistics::storage_stats(&tree)`.
impl<T: crate::AbstractTree + ?Sized> StorageStatistics for T {
    fn storage_stats(&self) -> crate::Result<StorageStats> {
        crate::AbstractTree::storage_stats(self)
    }

    fn level_segment_stats(&self) -> crate::Result<Vec<LevelStats>> {
        crate::AbstractTree::level_segment_stats(self)
    }

    fn compaction_debt(&self, strategy: &dyn crate::compaction::CompactionStrategy) -> u64 {
        crate::AbstractTree::compaction_debt(self, strategy)
    }

    #[cfg(feature = "metrics")]
    fn cache_stats(&self) -> crate::CacheStats {
        crate::AbstractTree::cache_stats(self)
    }
}

impl StorageStats {
    /// Approximately how many more average-shaped entries fit in `budget_bytes`,
    /// using [`Self::avg_entry_on_disk_bytes`].
    ///
    /// Returns `0` when the average entry size is unknown (an empty tree), since
    /// there is no basis for the estimate.
    #[must_use]
    pub fn estimated_remaining_entries(&self, budget_bytes: u64) -> u64 {
        if self.avg_entry_on_disk_bytes == 0 {
            0
        } else {
            budget_bytes / self.avg_entry_on_disk_bytes
        }
    }
}

/// Sums the true physical on-disk size of every live table and blob file in
/// `version` (one metadata stat per file).
///
/// This is the same physical basis [`compute_storage_stats`] reports as
/// `used_bytes` and that `Tree::create_checkpoint` totals, so the storage
/// admission gate agrees with both. It deliberately does NOT use
/// `Metadata::file_size` (undercounts by the meta block / footer) or
/// `disk_space()` (metadata `Level::size`, which also omits blob files).
///
/// # Errors
///
/// Returns an error if a live table or blob file's size cannot be stat-ed.
pub(crate) fn compute_used_bytes(version: &Version) -> crate::Result<u64> {
    // Sum of on-disk file sizes, bounded by the filesystem capacity → cannot
    // overflow u64; plain arithmetic.
    let mut used_bytes = 0u64;
    for table in version.iter_tables() {
        used_bytes += table.fs.metadata(&table.path)?.len;
    }
    for blob in version.blob_files.iter() {
        used_bytes += blob.0.fs.metadata(&blob.0.path)?.len;
    }
    Ok(used_bytes)
}

/// The transient-output bound a full compaction's space check uses: the largest
/// level's on-disk size (the `full_compaction_bytes` gauge figure), an upper
/// bound on a single merge's input set. `0` for an empty tree.
///
/// This is the DEMAND. The destination VOLUME is a separate concern: a full
/// compaction writes its output to the last configured level
/// (`level_count - 1`), not to whichever level is currently largest, so callers
/// pass the last level as the destination to the per-volume space check (the two
/// differ only under tiered routing, where they can be different filesystems).
pub(crate) fn full_compaction_demand_bytes(version: &Version) -> u64 {
    version
        .iter_levels()
        .map(crate::version::Level::size)
        .max()
        .unwrap_or(0)
}

/// Computes [`StorageStats`] from a live version's table + blob-file metadata.
///
/// `is_compacting` selects [`StorageStatus::CompactionInProgress`] vs
/// [`StorageStatus::Healthy`]; the caller supplies it because compaction state
/// is engine-internal.
///
/// `value_bytes_are_user_values` must be `false` for a KV-separated
/// (`BlobTree`) tree: there the SST records a small indirection pointer per
/// large value, not the user value, so the per-table value-byte sum measures
/// pointers and the value average would misreport. When `false`,
/// [`StorageStats::avg_value_bytes`] is forced to `None`. Key bytes are never
/// separated, so [`StorageStats::avg_key_bytes`] stays exact either way.
///
/// `used_bytes` is the true on-disk file size of every live table and blob
/// file (one metadata stat per file), not the writer's `Metadata::file_size`
/// or `crate::version::Version::blob_files`' compressed-payload sum: those
/// undercount the physical file by the meta block / footer / blob trailer.
/// Statting matches the figure `Tree::create_checkpoint` reports, so the two
/// agree on disk reality.
///
/// # Errors
///
/// Returns an error if a live table or blob file's size cannot be stat-ed.
pub(crate) fn compute_storage_stats(
    version: &Version,
    is_compacting: bool,
    value_bytes_are_user_values: bool,
) -> crate::Result<StorageStats> {
    let mut used_bytes = 0u64;
    let mut item_count = 0u64;
    let mut table_count = 0u64;
    let mut reclaimable_entries = 0u64;
    let mut sum_key = 0u64;
    let mut sum_value = 0u64;
    // The key/value split is only exact when EVERY live table records the byte
    // sums; a single legacy table without them makes the average unrepresentable.
    let mut all_have_shape = true;

    // Every running total below is a sum of on-disk byte sizes or live item
    // counts; both are bounded by the filesystem capacity / the live entry count
    // and cannot overflow u64, so plain arithmetic is correct (a debug-overflow
    // would itself signal a corrupt metadata read).
    for table in version.iter_tables() {
        let m = &table.metadata;
        // Physical file size, NOT m.file_size (which undercounts — see above).
        let on_disk = table.fs.metadata(&table.path)?.len;
        used_bytes += on_disk;
        item_count += m.item_count;
        table_count += 1;
        reclaimable_entries += m.weak_tombstone_reclaimable;
        match (m.sum_user_key_bytes, m.sum_value_bytes) {
            (Some(k), Some(v)) => {
                sum_key += k;
                sum_value += v;
            }
            _ => all_have_shape = false,
        }
    }

    // Physical blob-file size (metadata + trailer included), NOT
    // BlobFileList::on_disk_size() which sums only the compressed payload.
    for blob in version.blob_files.iter() {
        used_bytes += blob.0.fs.metadata(&blob.0.path)?.len;
    }

    let avg_entry_on_disk_bytes = if item_count == 0 {
        0
    } else {
        used_bytes / item_count
    };

    let have_shape = all_have_shape && item_count > 0;
    let avg_key_bytes = have_shape.then(|| sum_key / item_count);
    // Value bytes are only meaningful when not KV-separated (see param doc).
    let avg_value_bytes =
        (have_shape && value_bytes_are_user_values).then(|| sum_value / item_count);

    // reclaimable_entries ≤ item_count and avg_entry_on_disk_bytes = used / item_count,
    // so the product is ≤ used_bytes (bounded by disk capacity): plain multiply.
    let reclaimable_bytes_estimate = reclaimable_entries * avg_entry_on_disk_bytes;

    // A full compaction's transient output is bounded by its input set; the
    // largest single merge is bounded by the largest level's on-disk size, so
    // that is the free space a full compaction needs.
    let full_compaction_bytes = full_compaction_demand_bytes(version);
    // A minimal (tight) space-reclaiming merge needs only the reserved working
    // floor to make forward progress.
    let tight_compaction_bytes = crate::tree::MIN_RESERVED_HEADROOM;

    let status = if is_compacting {
        StorageStatus::CompactionInProgress
    } else {
        StorageStatus::Healthy
    };

    Ok(StorageStats {
        used_bytes,
        // Capacity is disk-aware (quota + free-space probe) and lives at the
        // tree layer; this version-only computation leaves it unbounded. The
        // caller (`Tree::storage_stats`) fills the real figures.
        capacity_bytes: None,
        available_bytes: None,
        compaction_possible: true,
        full_compaction_bytes,
        tight_compaction_bytes,
        item_count,
        table_count,
        avg_entry_on_disk_bytes,
        avg_key_bytes,
        avg_value_bytes,
        reclaimable_bytes_estimate,
        status,
    })
}

/// Computes per-LSM-level and per-segment size + entry stats from a version.
///
/// Cost is O(levels x segments) plus one file-size stat per segment (the same
/// stat [`compute_storage_stats`] already performs); it never reads a data block.
///
/// # Errors
///
/// Returns an error if a segment's file size cannot be stat-ed.
pub(crate) fn compute_level_segment_stats(version: &Version) -> crate::Result<Vec<LevelStats>> {
    use core::sync::atomic::Ordering::Relaxed;
    let mut levels = Vec::with_capacity(version.level_count());
    for (level, run_group) in version.iter_levels().enumerate() {
        let mut segments = Vec::new();
        let mut used_bytes = 0u64;
        let mut item_count = 0u64;
        let mut reads = 0u64;
        let mut last_access_secs = 0u64;
        for run in run_group.iter() {
            for table in run.iter() {
                // Physical file size, NOT m.file_size (which undercounts), to
                // reconcile with the tree-level `used_bytes`.
                let on_disk = table.fs.metadata(&table.path)?.len;
                let items = table.metadata.item_count;
                let seg_reads = table.read_count.load(Relaxed);
                let seg_access = table.last_access_secs.load(Relaxed);
                used_bytes += on_disk;
                item_count += items;
                reads = reads.saturating_add(seg_reads);
                last_access_secs = last_access_secs.max(seg_access);
                segments.push(SegmentStats {
                    table_id: table.metadata.id,
                    level,
                    used_bytes: on_disk,
                    item_count: items,
                    reads: seg_reads,
                    last_access_secs: seg_access,
                });
            }
        }
        levels.push(LevelStats {
            level,
            segment_count: segments.len(),
            used_bytes,
            item_count,
            reads,
            last_access_secs,
            segments,
        });
    }
    Ok(levels)
}

#[cfg(test)]
mod tests;