lsm_tree/metrics.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2024-present, fjall-rs
3// Copyright (c) 2026-present, Structured World Foundation
4
5use core::sync::atomic::Ordering::Relaxed;
6use core::sync::atomic::{AtomicU64, AtomicUsize};
7
/// Runtime metrics
///
/// Metrics are not stored durably, so they reset after a restart/crash.
///
/// Every field is an atomic counter, so it can be bumped through a shared
/// reference; the accessor methods on this type read the counters with
/// `Relaxed` ordering.
#[derive(Debug, Default)]
pub struct Metrics {
    /// Number of times a table file was opened using `fopen()`
    pub(crate) table_file_opened_uncached: AtomicUsize,

    /// Number of times a table file was retrieved from descriptor cache
    pub(crate) table_file_opened_cached: AtomicUsize,

    /// Number of index blocks that were actually read from disk
    pub(crate) index_block_load_io: AtomicUsize,

    /// Number of filter blocks that were actually read from disk
    pub(crate) filter_block_load_io: AtomicUsize,

    /// Number of data blocks that were actually read from disk
    pub(crate) data_block_load_io: AtomicUsize,

    /// Number of index blocks that were read from block cache
    pub(crate) index_block_load_cached: AtomicUsize,

    /// Number of filter blocks that were read from block cache
    pub(crate) filter_block_load_cached: AtomicUsize,

    /// Number of data blocks that were read from block cache
    pub(crate) data_block_load_cached: AtomicUsize,

    /// Number of range tombstone blocks that were actually read from disk
    pub(crate) range_tombstone_block_load_io: AtomicUsize,

    /// Number of range tombstone blocks that were read from block cache
    pub(crate) range_tombstone_block_load_cached: AtomicUsize,

    /// Number of filter queries that were performed
    pub(crate) filter_queries: AtomicUsize,

    /// Number of IOs that were skipped due to filter
    pub(crate) io_skipped_by_filter: AtomicUsize,

    /// Number of segments skipped during prefix scans via
    /// [`Tree::create_prefix`](crate::Tree::create_prefix) where the per-table prefix bloom filter
    /// returned `Ok(false)`. Counted in both single-table and
    /// multi-table run paths of `TreeIter::create_range`.
    ///
    /// Note: `BlobTree` prefix scans do not currently record this metric.
    pub(crate) prefix_bloom_skips: AtomicUsize,

    /// Number of data block bytes that were requested from OS or disk
    pub(crate) data_block_io_requested: AtomicU64,

    /// Number of index block bytes that were requested from OS or disk
    pub(crate) index_block_io_requested: AtomicU64,

    /// Number of filter block bytes that were requested from OS or disk
    pub(crate) filter_block_io_requested: AtomicU64,

    /// Number of range tombstone block bytes that were requested from OS or disk
    pub(crate) range_tombstone_block_io_requested: AtomicU64,

    /// Number of SSTs flagged for a healing recompaction after a read recovered
    /// a block from Page-ECC parity and confirmed the fault persistent (counted
    /// only when `auto_heal` is enabled). Each SST is counted once per pending
    /// schedule.
    pub(crate) ecc_auto_heal_scheduled: AtomicUsize,

    /// On-read blocks healed by the SEC-DED single-bit fast path (one corrected
    /// bit flip). Counted on every primary read that observes the recovery
    /// (point/range loads, partial-decode, patrol scrub); the persistence
    /// confirming re-read does NOT re-count. A non-zero, growing value is a
    /// scrapeable latent-bit-rot signal.
    pub(crate) ecc_secded_corrected: AtomicUsize,

    /// On-read blocks recovered from Reed-Solomon shard parity (the general
    /// multi-byte path). Same counting discipline as
    /// [`Self::ecc_secded_corrected`]; the two are disjoint by recovery
    /// mechanism and sum to the total on-read ECC recoveries.
    pub(crate) ecc_shard_recovered: AtomicUsize,
}
88
/// Owned snapshot of block-cache effectiveness and occupancy at one moment.
///
/// The hit / miss tallies come from the cumulative counters in [`Metrics`];
/// the size and capacity come from the live block cache. Handing out this
/// plain value gives an observability consumer something stable to hold,
/// rather than having it reach into the mutable `&Arc<Metrics>`.
///
/// The counts are cumulative since process start and, like everything in
/// [`Metrics`], reset on restart. To obtain a rate over an interval, take the
/// delta between two polls.
// `PartialEq` is deliberately not derived: `hit_rate` is an `f64`, so equality
// would inherit float comparison semantics. Compare the integer fields
// explicitly instead.
#[derive(Clone, Copy, Debug)]
#[must_use]
pub struct CacheStats {
    /// Cumulative number of block reads, across all block types, that were
    /// served from the block cache.
    pub hits: u64,

    /// Cumulative number of block reads, across all block types, that missed
    /// the cache and went to disk.
    pub misses: u64,

    /// `hits / (hits + misses)`, a value in `0.0..=1.0`. Reported as `1.0`
    /// while no block has been loaded yet (nothing has missed).
    pub hit_rate: f64,

    /// Weighted bytes currently resident in the block cache.
    pub size_bytes: u64,

    /// Configured upper bound, in bytes, on what the block cache may hold.
    pub capacity_bytes: u64,
}
113
114#[expect(
115 clippy::cast_precision_loss,
116 reason = "metrics can accept precision loss"
117)]
118impl Metrics {
119 /// Builds a [`CacheStats`] snapshot from the cumulative cache counters and
120 /// the caller-supplied live cache `size_bytes` / `capacity_bytes` (the block
121 /// cache owns its occupancy, [`Metrics`] owns the hit / miss tallies).
122 pub fn cache_stats(&self, size_bytes: u64, capacity_bytes: u64) -> CacheStats {
123 // Read the counters once so hits / misses / hit_rate are a single
124 // consistent snapshot (block_cache_hit_rate would re-read the atomics).
125 let hits = self.block_load_cached_count() as u64;
126 let misses = self.block_load_io_count() as u64;
127 let total = hits + misses;
128 let hit_rate = if total == 0 {
129 1.0
130 } else {
131 hits as f64 / total as f64
132 };
133 CacheStats {
134 hits,
135 misses,
136 hit_rate,
137 size_bytes,
138 capacity_bytes,
139 }
140 }
141
142 /// Returns the cache hit rate for file descriptors in percent (0.0 - 1.0).
143 pub fn table_file_cache_hit_rate(&self) -> f64 {
144 let uncached = self.table_file_opened_uncached.load(Relaxed) as f64;
145 let cached = self.table_file_opened_cached.load(Relaxed) as f64;
146
147 if cached + uncached == 0.0 {
148 1.0
149 } else {
150 cached / (cached + uncached)
151 }
152 }
153
154 /// Number of I/O data block bytes transferred from disk or OS page cache.
155 pub fn data_block_io(&self) -> u64 {
156 self.data_block_io_requested.load(Relaxed)
157 }
158
159 /// Number of I/O index block bytes transferred from disk or OS page cache.
160 pub fn index_block_io(&self) -> u64 {
161 self.index_block_io_requested.load(Relaxed)
162 }
163
164 /// Number of I/O filter block bytes transferred from disk or OS page cache.
165 pub fn filter_block_io(&self) -> u64 {
166 self.filter_block_io_requested.load(Relaxed)
167 }
168
169 /// Number of I/O range tombstone block bytes transferred from disk or OS page cache.
170 pub fn range_tombstone_block_io(&self) -> u64 {
171 self.range_tombstone_block_io_requested.load(Relaxed)
172 }
173
174 /// Number of I/O block bytes transferred from disk or OS page cache.
175 pub fn block_io(&self) -> u64 {
176 self.data_block_io_requested.load(Relaxed)
177 + self.index_block_io_requested.load(Relaxed)
178 + self.filter_block_io_requested.load(Relaxed)
179 + self.range_tombstone_block_io_requested.load(Relaxed)
180 }
181
182 /// Number of data blocks that were accessed.
183 pub fn data_block_load_count(&self) -> usize {
184 self.data_block_load_cached.load(Relaxed) + self.data_block_load_io.load(Relaxed)
185 }
186
187 /// Number of index blocks that were accessed.
188 pub fn index_block_load_count(&self) -> usize {
189 self.index_block_load_cached.load(Relaxed) + self.index_block_load_io.load(Relaxed)
190 }
191
192 /// Number of filter blocks that were accessed.
193 pub fn filter_block_load_count(&self) -> usize {
194 self.filter_block_load_cached.load(Relaxed) + self.filter_block_load_io.load(Relaxed)
195 }
196
197 /// Number of range tombstone blocks that were accessed.
198 pub fn range_tombstone_block_load_count(&self) -> usize {
199 self.range_tombstone_block_load_cached.load(Relaxed)
200 + self.range_tombstone_block_load_io.load(Relaxed)
201 }
202
203 /// Number of SSTs scheduled for a healing recompaction after a persistent
204 /// ECC correction on read (`auto_heal` enabled).
205 pub fn ecc_auto_heal_scheduled_count(&self) -> usize {
206 self.ecc_auto_heal_scheduled.load(Relaxed)
207 }
208
209 /// On-read blocks healed by the SEC-DED single-bit fast path.
210 pub fn ecc_secded_corrected_count(&self) -> usize {
211 self.ecc_secded_corrected.load(Relaxed)
212 }
213
214 /// On-read blocks recovered from Reed-Solomon shard parity.
215 pub fn ecc_shard_recovered_count(&self) -> usize {
216 self.ecc_shard_recovered.load(Relaxed)
217 }
218
219 /// Total on-read ECC recoveries across both mechanisms (SEC-DED + RS shard).
220 /// A scrapeable latent-bit-rot signal: growth here means the medium is
221 /// returning faulty bytes that parity is silently repairing.
222 pub fn ecc_recovered_count(&self) -> usize {
223 self.ecc_secded_corrected_count() + self.ecc_shard_recovered_count()
224 }
225
226 /// Records one on-read ECC recovery, attributing it to the mechanism that
227 /// did the repair. Called from the primary read paths (`load_block`, the
228 /// partial-decode path, patrol scrub); the persistence-confirming re-read
229 /// must NOT call this, to avoid double-counting a single fault.
230 pub(crate) fn record_ecc_recovery(&self, kind: crate::table::block::EccRecoveryKind) {
231 use crate::table::block::EccRecoveryKind;
232 match kind {
233 EccRecoveryKind::Secded => &self.ecc_secded_corrected,
234 EccRecoveryKind::Shard => &self.ecc_shard_recovered,
235 }
236 .fetch_add(1, Relaxed);
237 }
238
239 /// Number of blocks that were loaded from disk or OS page cache.
240 pub fn block_load_io_count(&self) -> usize {
241 self.data_block_load_io.load(Relaxed)
242 + self.index_block_load_io.load(Relaxed)
243 + self.filter_block_load_io.load(Relaxed)
244 + self.range_tombstone_block_load_io.load(Relaxed)
245 }
246
247 /// Number of data blocks that were served from block cache.
248 pub fn data_block_load_cached_count(&self) -> usize {
249 self.data_block_load_cached.load(Relaxed)
250 }
251
252 /// Number of index blocks that were served from block cache.
253 pub fn index_block_load_cached_count(&self) -> usize {
254 self.index_block_load_cached.load(Relaxed)
255 }
256
257 /// Number of filter blocks that were served from block cache.
258 pub fn filter_block_load_cached_count(&self) -> usize {
259 self.filter_block_load_cached.load(Relaxed)
260 }
261
262 /// Number of range tombstone blocks that were served from block cache.
263 pub fn range_tombstone_block_load_cached_count(&self) -> usize {
264 self.range_tombstone_block_load_cached.load(Relaxed)
265 }
266
267 /// Number of blocks that were served from block cache.
268 pub fn block_load_cached_count(&self) -> usize {
269 self.data_block_load_cached.load(Relaxed)
270 + self.index_block_load_cached.load(Relaxed)
271 + self.filter_block_load_cached.load(Relaxed)
272 + self.range_tombstone_block_load_cached.load(Relaxed)
273 }
274
275 /// Number of blocks that were accessed.
276 pub fn block_loads(&self) -> usize {
277 self.block_load_io_count() + self.block_load_cached_count()
278 }
279
280 /// Data block cache efficiency in percent (0.0 - 1.0).
281 pub fn data_block_cache_hit_rate(&self) -> f64 {
282 let queries = self.data_block_load_count() as f64;
283 let hits = self.data_block_load_cached_count() as f64;
284
285 if queries == 0.0 { 1.0 } else { hits / queries }
286 }
287
288 /// Filter block cache efficiency in percent (0.0 - 1.0).
289 pub fn filter_block_cache_hit_rate(&self) -> f64 {
290 let queries = self.filter_block_load_count() as f64;
291 let hits = self.filter_block_load_cached_count() as f64;
292
293 if queries == 0.0 { 1.0 } else { hits / queries }
294 }
295
296 /// Index block cache efficiency in percent (0.0 - 1.0).
297 pub fn index_block_cache_hit_rate(&self) -> f64 {
298 let queries = self.index_block_load_count() as f64;
299 let hits = self.index_block_load_cached_count() as f64;
300
301 if queries == 0.0 { 1.0 } else { hits / queries }
302 }
303
304 /// Range tombstone block cache efficiency in percent (0.0 - 1.0).
305 pub fn range_tombstone_block_cache_hit_rate(&self) -> f64 {
306 let queries = self.range_tombstone_block_load_count() as f64;
307 let hits = self.range_tombstone_block_load_cached_count() as f64;
308
309 if queries == 0.0 { 1.0 } else { hits / queries }
310 }
311
312 /// Block cache efficiency in percent (0.0 - 1.0).
313 pub fn block_cache_hit_rate(&self) -> f64 {
314 let queries = self.block_loads() as f64;
315 let hits = self.block_load_cached_count() as f64;
316
317 if queries == 0.0 { 1.0 } else { hits / queries }
318 }
319
320 /// Filter efficiency in percent (0.0 - 1.0).
321 ///
322 /// Represents the ratio of I/O operations avoided due to filter.
323 pub fn filter_efficiency(&self) -> f64 {
324 let queries = self.filter_queries.load(Relaxed) as f64;
325 let io_skipped = self.io_skipped_by_filter.load(Relaxed) as f64;
326
327 if queries == 0.0 {
328 1.0
329 } else {
330 io_skipped / queries
331 }
332 }
333
334 /// Number of filter queries performed.
335 pub fn filter_queries(&self) -> usize {
336 self.filter_queries.load(Relaxed)
337 }
338
339 /// Number of I/O operations skipped by filter.
340 pub fn io_skipped_by_filter(&self) -> usize {
341 self.io_skipped_by_filter.load(Relaxed)
342 }
343
344 /// Number of segments skipped during [`Tree::create_prefix`](crate::Tree::create_prefix) scans
345 /// by prefix bloom filters (single-table and multi-table run paths).
346 ///
347 /// Note: `BlobTree` prefix scans do not currently record this metric.
348 pub fn prefix_bloom_skips(&self) -> usize {
349 self.prefix_bloom_skips.load(Relaxed)
350 }
351}
352
353#[cfg(test)]
354mod tests;