velesdb_core/storage/
metrics.rs

1//! Storage operation metrics for monitoring and debugging.
2//!
3//! Provides latency tracking for critical storage operations:
4//! - `ensure_capacity`: mmap resize operations (P0 - critical for P99 latency)
5//!
6//! # P0 Audit Recommendation
7//!
8//! The `ensure_capacity` operation can cause "stop-the-world" pauses during
9//! large resizes (e.g., 2GB → 4GB). Monitoring P99 latency is essential.
10//!
11//! # PERF-001: Lock-Free Implementation
12//!
13//! Uses `LockFreeHistogram` for wait-free latency recording in the hot path.
14//! No mutex contention even under high concurrency.
15
16use super::histogram::LockFreeHistogram;
17use std::sync::atomic::{AtomicU64, Ordering};
18use std::time::{Duration, Instant};
19
/// Storage operation metrics collector.
///
/// Holds two atomic counters describing resize activity plus a latency
/// histogram fed by `record_ensure_capacity`. Every method on this type takes
/// `&self`; the counters are updated with relaxed atomic operations.
///
/// NOTE(review): the lock-free / wait-free guarantees mentioned in the module
/// docs come from `LockFreeHistogram` (defined in `super::histogram`) and
/// cannot be confirmed from this file alone.
#[derive(Debug)]
pub struct StorageMetrics {
    /// Number of recorded operations that reported an actual resize
    resize_count: AtomicU64,
    /// Running sum of the `bytes_resized` values reported by those resizes
    total_bytes_resized: AtomicU64,
    /// PERF-001: Lock-free histogram receiving one latency sample (in
    /// microseconds) per recorded `ensure_capacity` call
    latency_histogram: LockFreeHistogram,
}
33
34impl Default for StorageMetrics {
35    fn default() -> Self {
36        Self::new()
37    }
38}
39
40impl StorageMetrics {
41    /// Creates a new metrics collector.
42    #[must_use]
43    pub fn new() -> Self {
44        Self {
45            resize_count: AtomicU64::new(0),
46            total_bytes_resized: AtomicU64::new(0),
47            latency_histogram: LockFreeHistogram::new(),
48        }
49    }
50
51    /// Records an `ensure_capacity` operation. Wait-free operation.
52    ///
53    /// # Arguments
54    ///
55    /// * `latency` - Duration of the operation
56    /// * `did_resize` - Whether an actual resize occurred
57    /// * `bytes_resized` - Number of bytes added (0 if no resize)
58    #[inline]
59    pub fn record_ensure_capacity(&self, latency: Duration, did_resize: bool, bytes_resized: u64) {
60        // PERF-001: Wait-free latency recording
61        #[allow(clippy::cast_possible_truncation)]
62        let micros = latency.as_micros().min(u128::from(u64::MAX)) as u64;
63        self.latency_histogram.record(micros);
64
65        if did_resize {
66            self.resize_count.fetch_add(1, Ordering::Relaxed);
67            self.total_bytes_resized
68                .fetch_add(bytes_resized, Ordering::Relaxed);
69        }
70    }
71
72    /// Returns the total number of `ensure_capacity` calls.
73    #[must_use]
74    pub fn ensure_capacity_count(&self) -> u64 {
75        self.latency_histogram.count()
76    }
77
78    /// Returns true if no metrics have been recorded yet.
79    #[must_use]
80    pub fn is_empty(&self) -> bool {
81        self.latency_histogram.is_empty()
82    }
83
84    /// Returns the number of actual resize operations.
85    #[must_use]
86    pub fn resize_count(&self) -> u64 {
87        self.resize_count.load(Ordering::Relaxed)
88    }
89
90    /// Returns the total bytes resized.
91    #[must_use]
92    pub fn total_bytes_resized(&self) -> u64 {
93        self.total_bytes_resized.load(Ordering::Relaxed)
94    }
95
96    /// Returns latency statistics for `ensure_capacity` operations.
97    #[must_use]
98    pub fn ensure_capacity_latency_stats(&self) -> LatencyStats {
99        LatencyStats {
100            count: self.latency_histogram.count(),
101            min_us: self.latency_histogram.min(),
102            max_us: self.latency_histogram.max(),
103            mean_us: self.latency_histogram.mean(),
104            p50_us: self.latency_histogram.percentile(50),
105            p95_us: self.latency_histogram.percentile(95),
106            p99_us: self.latency_histogram.percentile(99),
107        }
108    }
109
110    /// Resets all metrics to zero.
111    pub fn reset(&self) {
112        self.resize_count.store(0, Ordering::Relaxed);
113        self.total_bytes_resized.store(0, Ordering::Relaxed);
114        self.latency_histogram.reset();
115    }
116}
117
/// Summary of a latency distribution, including percentiles.
///
/// All latency fields are expressed in microseconds.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct LatencyStats {
    /// How many samples the statistics cover
    pub count: u64,
    /// Smallest latency, in microseconds
    pub min_us: u64,
    /// Largest latency, in microseconds
    pub max_us: u64,
    /// Mean latency, in microseconds
    pub mean_us: u64,
    /// Median (50th percentile), in microseconds
    pub p50_us: u64,
    /// 95th percentile, in microseconds
    pub p95_us: u64,
    /// 99th percentile, in microseconds
    pub p99_us: u64,
}

impl LatencyStats {
    /// Converts a raw microsecond value into a [`Duration`].
    fn from_us(micros: u64) -> Duration {
        Duration::from_micros(micros)
    }

    /// Returns `p99_us` as a [`Duration`].
    #[must_use]
    pub fn p99(&self) -> Duration {
        Self::from_us(self.p99_us)
    }

    /// Returns `p95_us` as a [`Duration`].
    #[must_use]
    pub fn p95(&self) -> Duration {
        Self::from_us(self.p95_us)
    }

    /// Returns `p50_us` (the median) as a [`Duration`].
    #[must_use]
    pub fn p50(&self) -> Duration {
        Self::from_us(self.p50_us)
    }

    /// Returns `mean_us` as a [`Duration`].
    #[must_use]
    pub fn mean(&self) -> Duration {
        Self::from_us(self.mean_us)
    }

    /// Returns true if the P99 latency is strictly greater than `threshold`.
    ///
    /// # Arguments
    ///
    /// * `threshold` - Maximum acceptable P99 latency; a P99 equal to it does
    ///   not count as exceeding
    #[must_use]
    pub fn p99_exceeds(&self, threshold: Duration) -> bool {
        threshold < self.p99()
    }
}
172
/// RAII guard for timing operations.
///
/// The clock starts in [`TimingGuard::new`]. When the guard is dropped, the
/// callback is invoked exactly once with the time elapsed since creation.
///
/// Bind the guard to a named variable (`let _guard = TimingGuard::new(..)`).
/// A guard that is not bound, or is bound to the `_` pattern, is dropped at the
/// end of the statement and therefore reports a near-zero duration.
#[must_use = "the callback runs when the guard is dropped; bind the guard to a named variable"]
pub struct TimingGuard<'a, F>
where
    F: FnOnce(Duration),
{
    /// Instant at which the guard was created
    start: Instant,
    /// Pending callback; wrapped in `Option` so `drop` can move the `FnOnce`
    /// out through `&mut self`
    callback: Option<F>,
    /// Uses the lifetime parameter `'a`, which no other field references
    _marker: std::marker::PhantomData<&'a ()>,
}

impl<F> TimingGuard<'_, F>
where
    F: FnOnce(Duration),
{
    /// Creates a new timing guard that will call the callback with elapsed time on drop.
    ///
    /// # Arguments
    ///
    /// * `callback` - Receives the elapsed [`Duration`] when the guard is dropped
    pub fn new(callback: F) -> Self {
        Self {
            start: Instant::now(),
            callback: Some(callback),
            _marker: std::marker::PhantomData,
        }
    }

    /// Returns the elapsed time since creation without consuming the guard.
    #[must_use]
    pub fn elapsed(&self) -> Duration {
        self.start.elapsed()
    }
}

// Manual impl: `F` is usually a closure and therefore not `Debug`, so the
// callback is omitted from the output.
impl<F> std::fmt::Debug for TimingGuard<'_, F>
where
    F: FnOnce(Duration),
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("TimingGuard")
            .field("start", &self.start)
            .finish_non_exhaustive()
    }
}

impl<F> Drop for TimingGuard<'_, F>
where
    F: FnOnce(Duration),
{
    /// Invokes the callback with the time elapsed since the guard was created.
    fn drop(&mut self) {
        // `take` moves the `FnOnce` out of the guard so it can be called by value.
        if let Some(cb) = self.callback.take() {
            cb(self.start.elapsed());
        }
    }
}
214}