Skip to main content

lsm_tree/
metrics.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2024-present, fjall-rs
3// Copyright (c) 2026-present, Structured World Foundation
4
5use core::sync::atomic::Ordering::Relaxed;
6use core::sync::atomic::{AtomicU64, AtomicUsize};
7
/// Runtime metrics
///
/// Every counter is held in memory only and is never stored durably, so all
/// values start over after a restart or crash.
#[derive(Default, Debug)]
pub struct Metrics {
    /// Count of table file opens that went through `fopen()`
    pub(crate) table_file_opened_uncached: AtomicUsize,

    /// Count of table file opens that were served by the descriptor cache
    pub(crate) table_file_opened_cached: AtomicUsize,

    /// Count of index blocks that had to be read from disk
    pub(crate) index_block_load_io: AtomicUsize,

    /// Count of filter blocks that had to be read from disk
    pub(crate) filter_block_load_io: AtomicUsize,

    /// Count of data blocks that had to be read from disk
    pub(crate) data_block_load_io: AtomicUsize,

    /// Count of index blocks that were served by the block cache
    pub(crate) index_block_load_cached: AtomicUsize,

    /// Count of filter blocks that were served by the block cache
    pub(crate) filter_block_load_cached: AtomicUsize,

    /// Count of data blocks that were served by the block cache
    pub(crate) data_block_load_cached: AtomicUsize,

    /// Count of range tombstone blocks that had to be read from disk
    pub(crate) range_tombstone_block_load_io: AtomicUsize,

    /// Count of range tombstone blocks that were served by the block cache
    pub(crate) range_tombstone_block_load_cached: AtomicUsize,

    /// Count of filter queries that were performed
    pub(crate) filter_queries: AtomicUsize,

    /// Count of IOs that the filter made unnecessary
    pub(crate) io_skipped_by_filter: AtomicUsize,

    /// Count of segments skipped during prefix scans via
    /// [`Tree::create_prefix`](crate::Tree::create_prefix) because the per-table prefix bloom
    /// filter returned `Ok(false)`. Both the single-table and the multi-table
    /// run paths of `TreeIter::create_range` contribute to this counter.
    ///
    /// Note: `BlobTree` prefix scans do not currently record this metric.
    pub(crate) prefix_bloom_skips: AtomicUsize,

    /// Data block bytes that were requested from OS or disk
    pub(crate) data_block_io_requested: AtomicU64,

    /// Index block bytes that were requested from OS or disk
    pub(crate) index_block_io_requested: AtomicU64,

    /// Filter block bytes that were requested from OS or disk
    pub(crate) filter_block_io_requested: AtomicU64,

    /// Range tombstone block bytes that were requested from OS or disk
    pub(crate) range_tombstone_block_io_requested: AtomicU64,

    /// Count of SSTs flagged for a healing recompaction after a read recovered
    /// a block from Page-ECC parity and confirmed the fault persistent. Only
    /// counted when `auto_heal` is enabled, and each SST is counted once per
    /// pending schedule.
    pub(crate) ecc_auto_heal_scheduled: AtomicUsize,

    /// On-read blocks healed by the SEC-DED single-bit fast path (one corrected
    /// bit flip). Every primary read that observes the recovery is counted
    /// (point/range loads, partial-decode, patrol scrub); the
    /// persistence-confirming re-read does NOT count again. A non-zero, growing
    /// value is a scrapeable latent-bit-rot signal.
    pub(crate) ecc_secded_corrected: AtomicUsize,

    /// On-read blocks recovered from Reed-Solomon shard parity (the general
    /// multi-byte path). Counted with the same discipline as
    /// [`Self::ecc_secded_corrected`]; the two counters are disjoint by
    /// recovery mechanism and together give the total on-read ECC recoveries.
    pub(crate) ecc_shard_recovered: AtomicUsize,
}
88
/// An owned, point-in-time view of block-cache effectiveness and occupancy.
///
/// Built from [`Metrics`] (its cumulative hit / miss counters) together with
/// the live block cache's current size and capacity. An observability consumer
/// therefore receives a stable value rather than reaching into the mutable
/// `&Arc<Metrics>`. The counts are cumulative since process start and, like
/// everything in [`Metrics`], reset on restart; to obtain a rate over an
/// interval, take the delta between two polls.
// `PartialEq` is deliberately not derived: `hit_rate` is an `f64`, so equality
// would inherit float comparison semantics. Compare the integer fields
// explicitly instead.
#[must_use]
#[derive(Debug, Clone, Copy)]
pub struct CacheStats {
    /// Cumulative number of block reads answered by the block cache (all block types).
    pub hits: u64,
    /// Cumulative number of block reads that missed the cache and hit disk (all block types).
    pub misses: u64,
    /// Hit rate in `0.0..=1.0`, i.e. `hits / (hits + misses)`. It is `1.0`
    /// while no block has been loaded yet (nothing has missed).
    pub hit_rate: f64,
    /// Weighted bytes currently resident in the block cache.
    pub size_bytes: u64,
    /// Configured upper bound, in bytes, on what the block cache may hold.
    pub capacity_bytes: u64,
}
113
114#[expect(
115    clippy::cast_precision_loss,
116    reason = "metrics can accept precision loss"
117)]
118impl Metrics {
119    /// Builds a [`CacheStats`] snapshot from the cumulative cache counters and
120    /// the caller-supplied live cache `size_bytes` / `capacity_bytes` (the block
121    /// cache owns its occupancy, [`Metrics`] owns the hit / miss tallies).
122    pub fn cache_stats(&self, size_bytes: u64, capacity_bytes: u64) -> CacheStats {
123        // Read the counters once so hits / misses / hit_rate are a single
124        // consistent snapshot (block_cache_hit_rate would re-read the atomics).
125        let hits = self.block_load_cached_count() as u64;
126        let misses = self.block_load_io_count() as u64;
127        let total = hits + misses;
128        let hit_rate = if total == 0 {
129            1.0
130        } else {
131            hits as f64 / total as f64
132        };
133        CacheStats {
134            hits,
135            misses,
136            hit_rate,
137            size_bytes,
138            capacity_bytes,
139        }
140    }
141
142    /// Returns the cache hit rate for file descriptors in percent (0.0 - 1.0).
143    pub fn table_file_cache_hit_rate(&self) -> f64 {
144        let uncached = self.table_file_opened_uncached.load(Relaxed) as f64;
145        let cached = self.table_file_opened_cached.load(Relaxed) as f64;
146
147        if cached + uncached == 0.0 {
148            1.0
149        } else {
150            cached / (cached + uncached)
151        }
152    }
153
154    /// Number of I/O data block bytes transferred from disk or OS page cache.
155    pub fn data_block_io(&self) -> u64 {
156        self.data_block_io_requested.load(Relaxed)
157    }
158
159    /// Number of I/O index block bytes transferred from disk or OS page cache.
160    pub fn index_block_io(&self) -> u64 {
161        self.index_block_io_requested.load(Relaxed)
162    }
163
164    /// Number of I/O filter block bytes transferred from disk or OS page cache.
165    pub fn filter_block_io(&self) -> u64 {
166        self.filter_block_io_requested.load(Relaxed)
167    }
168
169    /// Number of I/O range tombstone block bytes transferred from disk or OS page cache.
170    pub fn range_tombstone_block_io(&self) -> u64 {
171        self.range_tombstone_block_io_requested.load(Relaxed)
172    }
173
174    /// Number of I/O block bytes transferred from disk or OS page cache.
175    pub fn block_io(&self) -> u64 {
176        self.data_block_io_requested.load(Relaxed)
177            + self.index_block_io_requested.load(Relaxed)
178            + self.filter_block_io_requested.load(Relaxed)
179            + self.range_tombstone_block_io_requested.load(Relaxed)
180    }
181
182    /// Number of data blocks that were accessed.
183    pub fn data_block_load_count(&self) -> usize {
184        self.data_block_load_cached.load(Relaxed) + self.data_block_load_io.load(Relaxed)
185    }
186
187    /// Number of index blocks that were accessed.
188    pub fn index_block_load_count(&self) -> usize {
189        self.index_block_load_cached.load(Relaxed) + self.index_block_load_io.load(Relaxed)
190    }
191
192    /// Number of filter blocks that were accessed.
193    pub fn filter_block_load_count(&self) -> usize {
194        self.filter_block_load_cached.load(Relaxed) + self.filter_block_load_io.load(Relaxed)
195    }
196
197    /// Number of range tombstone blocks that were accessed.
198    pub fn range_tombstone_block_load_count(&self) -> usize {
199        self.range_tombstone_block_load_cached.load(Relaxed)
200            + self.range_tombstone_block_load_io.load(Relaxed)
201    }
202
203    /// Number of SSTs scheduled for a healing recompaction after a persistent
204    /// ECC correction on read (`auto_heal` enabled).
205    pub fn ecc_auto_heal_scheduled_count(&self) -> usize {
206        self.ecc_auto_heal_scheduled.load(Relaxed)
207    }
208
209    /// On-read blocks healed by the SEC-DED single-bit fast path.
210    pub fn ecc_secded_corrected_count(&self) -> usize {
211        self.ecc_secded_corrected.load(Relaxed)
212    }
213
214    /// On-read blocks recovered from Reed-Solomon shard parity.
215    pub fn ecc_shard_recovered_count(&self) -> usize {
216        self.ecc_shard_recovered.load(Relaxed)
217    }
218
219    /// Total on-read ECC recoveries across both mechanisms (SEC-DED + RS shard).
220    /// A scrapeable latent-bit-rot signal: growth here means the medium is
221    /// returning faulty bytes that parity is silently repairing.
222    pub fn ecc_recovered_count(&self) -> usize {
223        self.ecc_secded_corrected_count() + self.ecc_shard_recovered_count()
224    }
225
226    /// Records one on-read ECC recovery, attributing it to the mechanism that
227    /// did the repair. Called from the primary read paths (`load_block`, the
228    /// partial-decode path, patrol scrub); the persistence-confirming re-read
229    /// must NOT call this, to avoid double-counting a single fault.
230    pub(crate) fn record_ecc_recovery(&self, kind: crate::table::block::EccRecoveryKind) {
231        use crate::table::block::EccRecoveryKind;
232        match kind {
233            EccRecoveryKind::Secded => &self.ecc_secded_corrected,
234            EccRecoveryKind::Shard => &self.ecc_shard_recovered,
235        }
236        .fetch_add(1, Relaxed);
237    }
238
239    /// Number of blocks that were loaded from disk or OS page cache.
240    pub fn block_load_io_count(&self) -> usize {
241        self.data_block_load_io.load(Relaxed)
242            + self.index_block_load_io.load(Relaxed)
243            + self.filter_block_load_io.load(Relaxed)
244            + self.range_tombstone_block_load_io.load(Relaxed)
245    }
246
247    /// Number of data blocks that were served from block cache.
248    pub fn data_block_load_cached_count(&self) -> usize {
249        self.data_block_load_cached.load(Relaxed)
250    }
251
252    /// Number of index blocks that were served from block cache.
253    pub fn index_block_load_cached_count(&self) -> usize {
254        self.index_block_load_cached.load(Relaxed)
255    }
256
257    /// Number of filter blocks that were served from block cache.
258    pub fn filter_block_load_cached_count(&self) -> usize {
259        self.filter_block_load_cached.load(Relaxed)
260    }
261
262    /// Number of range tombstone blocks that were served from block cache.
263    pub fn range_tombstone_block_load_cached_count(&self) -> usize {
264        self.range_tombstone_block_load_cached.load(Relaxed)
265    }
266
267    /// Number of blocks that were served from block cache.
268    pub fn block_load_cached_count(&self) -> usize {
269        self.data_block_load_cached.load(Relaxed)
270            + self.index_block_load_cached.load(Relaxed)
271            + self.filter_block_load_cached.load(Relaxed)
272            + self.range_tombstone_block_load_cached.load(Relaxed)
273    }
274
275    /// Number of blocks that were accessed.
276    pub fn block_loads(&self) -> usize {
277        self.block_load_io_count() + self.block_load_cached_count()
278    }
279
280    /// Data block cache efficiency in percent (0.0 - 1.0).
281    pub fn data_block_cache_hit_rate(&self) -> f64 {
282        let queries = self.data_block_load_count() as f64;
283        let hits = self.data_block_load_cached_count() as f64;
284
285        if queries == 0.0 { 1.0 } else { hits / queries }
286    }
287
288    /// Filter block cache efficiency in percent (0.0 - 1.0).
289    pub fn filter_block_cache_hit_rate(&self) -> f64 {
290        let queries = self.filter_block_load_count() as f64;
291        let hits = self.filter_block_load_cached_count() as f64;
292
293        if queries == 0.0 { 1.0 } else { hits / queries }
294    }
295
296    /// Index block cache efficiency in percent (0.0 - 1.0).
297    pub fn index_block_cache_hit_rate(&self) -> f64 {
298        let queries = self.index_block_load_count() as f64;
299        let hits = self.index_block_load_cached_count() as f64;
300
301        if queries == 0.0 { 1.0 } else { hits / queries }
302    }
303
304    /// Range tombstone block cache efficiency in percent (0.0 - 1.0).
305    pub fn range_tombstone_block_cache_hit_rate(&self) -> f64 {
306        let queries = self.range_tombstone_block_load_count() as f64;
307        let hits = self.range_tombstone_block_load_cached_count() as f64;
308
309        if queries == 0.0 { 1.0 } else { hits / queries }
310    }
311
312    /// Block cache efficiency in percent (0.0 - 1.0).
313    pub fn block_cache_hit_rate(&self) -> f64 {
314        let queries = self.block_loads() as f64;
315        let hits = self.block_load_cached_count() as f64;
316
317        if queries == 0.0 { 1.0 } else { hits / queries }
318    }
319
320    /// Filter efficiency in percent (0.0 - 1.0).
321    ///
322    /// Represents the ratio of I/O operations avoided due to filter.
323    pub fn filter_efficiency(&self) -> f64 {
324        let queries = self.filter_queries.load(Relaxed) as f64;
325        let io_skipped = self.io_skipped_by_filter.load(Relaxed) as f64;
326
327        if queries == 0.0 {
328            1.0
329        } else {
330            io_skipped / queries
331        }
332    }
333
334    /// Number of filter queries performed.
335    pub fn filter_queries(&self) -> usize {
336        self.filter_queries.load(Relaxed)
337    }
338
339    /// Number of I/O operations skipped by filter.
340    pub fn io_skipped_by_filter(&self) -> usize {
341        self.io_skipped_by_filter.load(Relaxed)
342    }
343
344    /// Number of segments skipped during [`Tree::create_prefix`](crate::Tree::create_prefix) scans
345    /// by prefix bloom filters (single-table and multi-table run paths).
346    ///
347    /// Note: `BlobTree` prefix scans do not currently record this metric.
348    pub fn prefix_bloom_skips(&self) -> usize {
349        self.prefix_bloom_skips.load(Relaxed)
350    }
351}
352
353#[cfg(test)]
354mod tests;