rustkernel_core/
slo.rs

//! Service Level Objective (SLO) validation.
//!
//! This module validates measured kernel throughput and latency against
//! their SLO targets at runtime, and provides per-kernel metrics collection
//! to feed those checks.
5
6use crate::kernel::KernelMetadata;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::sync::{Arc, RwLock};
10use std::time::{Duration, Instant};
11
12/// SLO validation result.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub enum SLOResult {
15    /// SLO is met.
16    Met {
17        /// Actual value.
18        actual: f64,
19        /// Target value.
20        target: f64,
21        /// Headroom percentage.
22        headroom_pct: f64,
23    },
24    /// SLO is at risk (within 10% of target).
25    AtRisk {
26        /// Actual value.
27        actual: f64,
28        /// Target value.
29        target: f64,
30        /// Percentage of target used.
31        usage_pct: f64,
32    },
33    /// SLO is violated.
34    Violated {
35        /// Actual value.
36        actual: f64,
37        /// Target value.
38        target: f64,
39        /// Percentage over target.
40        overage_pct: f64,
41    },
42}
43
44impl SLOResult {
45    /// Returns true if the SLO is met.
46    #[must_use]
47    pub fn is_met(&self) -> bool {
48        matches!(self, SLOResult::Met { .. })
49    }
50
51    /// Returns true if the SLO is at risk.
52    #[must_use]
53    pub fn is_at_risk(&self) -> bool {
54        matches!(self, SLOResult::AtRisk { .. })
55    }
56
57    /// Returns true if the SLO is violated.
58    #[must_use]
59    pub fn is_violated(&self) -> bool {
60        matches!(self, SLOResult::Violated { .. })
61    }
62}
63
64/// SLO validator for kernel performance.
65#[derive(Debug, Default)]
66pub struct SLOValidator {
67    /// Kernel-specific overrides.
68    overrides: HashMap<String, SLOOverride>,
69    /// Whether to enable strict mode (fail on violation).
70    strict_mode: bool,
71}
72
/// Per-kernel replacement for the SLO targets taken from kernel metadata.
#[derive(Debug, Clone)]
pub struct SLOOverride {
    /// Throughput target in operations per second; `None` keeps the
    /// metadata value.
    pub throughput: Option<u64>,
    /// Latency target in microseconds; `None` keeps the metadata value.
    pub latency_us: Option<f64>,
    /// Width of the at-risk band, as a percentage of the target (default 10%).
    pub tolerance_pct: f64,
}

impl Default for SLOOverride {
    /// Builds an override that replaces no target and uses a 10% tolerance.
    fn default() -> Self {
        let (throughput, latency_us) = (None, None);
        SLOOverride {
            throughput,
            latency_us,
            tolerance_pct: 10.0,
        }
    }
}
93
94impl SLOValidator {
95    /// Create a new SLO validator.
96    #[must_use]
97    pub fn new() -> Self {
98        Self::default()
99    }
100
101    /// Enable strict mode (fail on any violation).
102    #[must_use]
103    pub fn with_strict_mode(mut self) -> Self {
104        self.strict_mode = true;
105        self
106    }
107
108    /// Add an SLO override for a specific kernel.
109    pub fn with_override(mut self, kernel_id: impl Into<String>, override_: SLOOverride) -> Self {
110        self.overrides.insert(kernel_id.into(), override_);
111        self
112    }
113
114    /// Validate throughput against target.
115    #[must_use]
116    pub fn validate_throughput(
117        &self,
118        metadata: &KernelMetadata,
119        actual_ops_per_sec: u64,
120    ) -> SLOResult {
121        let target = self
122            .overrides
123            .get(&metadata.id)
124            .and_then(|o| o.throughput)
125            .unwrap_or(metadata.expected_throughput);
126
127        let tolerance_pct = self
128            .overrides
129            .get(&metadata.id)
130            .map(|o| o.tolerance_pct)
131            .unwrap_or(10.0);
132
133        let actual = actual_ops_per_sec as f64;
134        let target_f64 = target as f64;
135
136        // For throughput, we want actual >= target
137        if actual >= target_f64 {
138            let headroom = ((actual - target_f64) / target_f64) * 100.0;
139            SLOResult::Met {
140                actual,
141                target: target_f64,
142                headroom_pct: headroom,
143            }
144        } else {
145            let usage = (actual / target_f64) * 100.0;
146            if usage >= (100.0 - tolerance_pct) {
147                SLOResult::AtRisk {
148                    actual,
149                    target: target_f64,
150                    usage_pct: usage,
151                }
152            } else {
153                let overage = ((target_f64 - actual) / target_f64) * 100.0;
154                SLOResult::Violated {
155                    actual,
156                    target: target_f64,
157                    overage_pct: overage,
158                }
159            }
160        }
161    }
162
163    /// Validate latency against target.
164    #[must_use]
165    pub fn validate_latency(&self, metadata: &KernelMetadata, actual_latency_us: f64) -> SLOResult {
166        let target = self
167            .overrides
168            .get(&metadata.id)
169            .and_then(|o| o.latency_us)
170            .unwrap_or(metadata.target_latency_us);
171
172        let tolerance_pct = self
173            .overrides
174            .get(&metadata.id)
175            .map(|o| o.tolerance_pct)
176            .unwrap_or(10.0);
177
178        // For latency, we want actual <= target
179        if actual_latency_us <= target {
180            let headroom = ((target - actual_latency_us) / target) * 100.0;
181            SLOResult::Met {
182                actual: actual_latency_us,
183                target,
184                headroom_pct: headroom,
185            }
186        } else {
187            let usage = (actual_latency_us / target) * 100.0;
188            if usage <= (100.0 + tolerance_pct) {
189                SLOResult::AtRisk {
190                    actual: actual_latency_us,
191                    target,
192                    usage_pct: usage,
193                }
194            } else {
195                let overage = ((actual_latency_us - target) / target) * 100.0;
196                SLOResult::Violated {
197                    actual: actual_latency_us,
198                    target,
199                    overage_pct: overage,
200                }
201            }
202        }
203    }
204
205    /// Check if strict mode is enabled.
206    #[must_use]
207    pub fn is_strict(&self) -> bool {
208        self.strict_mode
209    }
210}
211
/// Performance metrics for a kernel.
///
/// [`KernelMetrics::record`] maintains the operation count and the latency
/// statistics. It does not update `total_time`, so
/// [`KernelMetrics::throughput`] stays at `0.0` until a caller sets that
/// field.
#[derive(Debug, Clone, Default)]
pub struct KernelMetrics {
    /// Total operations completed.
    pub operations: u64,
    /// Total processing time (not maintained by `record`).
    pub total_time: Duration,
    /// Minimum latency observed, or `None` before the first sample.
    pub min_latency: Option<Duration>,
    /// Maximum latency observed, or `None` before the first sample.
    pub max_latency: Option<Duration>,
    /// Sum of latencies for average calculation.
    pub latency_sum: Duration,
    /// Number of latency samples.
    pub latency_count: u64,
}

impl KernelMetrics {
    /// Create new, empty metrics.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Record one operation that took `latency`.
    ///
    /// Increments the operation and sample counters, adds `latency` to the
    /// running sum, and widens the observed minimum/maximum if needed.
    pub fn record(&mut self, latency: Duration) {
        self.operations += 1;
        self.latency_count += 1;
        self.latency_sum += latency;

        // The first sample seeds both extremes; later samples only widen them.
        self.min_latency = Some(self.min_latency.map_or(latency, |lo| lo.min(latency)));
        self.max_latency = Some(self.max_latency.map_or(latency, |hi| hi.max(latency)));
    }

    /// Calculate average latency.
    ///
    /// Returns `None` when no samples have been recorded. The result is
    /// truncated to whole nanoseconds.
    #[must_use]
    pub fn avg_latency(&self) -> Option<Duration> {
        const NANOS_PER_SEC: u128 = 1_000_000_000;

        if self.latency_count == 0 {
            return None;
        }

        // Divide in u128 nanoseconds so the u64 sample count is never
        // truncated. Casting it to u32 would give a wrong divisor above
        // u32::MAX samples, and a zero divisor at exact multiples of 2^32.
        let avg_nanos = self.latency_sum.as_nanos() / u128::from(self.latency_count);

        // The average cannot exceed the sum, so its whole seconds fit in u64
        // and the sub-second remainder is below 10^9.
        let secs = (avg_nanos / NANOS_PER_SEC) as u64;
        let subsec_nanos = (avg_nanos % NANOS_PER_SEC) as u32;
        Some(Duration::new(secs, subsec_nanos))
    }

    /// Calculate throughput in operations per second.
    ///
    /// Returns `0.0` while `total_time` is zero.
    #[must_use]
    pub fn throughput(&self) -> f64 {
        if self.total_time.is_zero() {
            0.0
        } else {
            self.operations as f64 / self.total_time.as_secs_f64()
        }
    }

    /// Reset all counters and statistics to their defaults.
    pub fn reset(&mut self) {
        *self = Self::default();
    }
}
280
281/// Metrics collector for all kernels.
282#[derive(Debug, Clone)]
283pub struct MetricsCollector {
284    metrics: Arc<RwLock<HashMap<String, KernelMetrics>>>,
285}
286
287impl MetricsCollector {
288    /// Create a new metrics collector.
289    #[must_use]
290    pub fn new() -> Self {
291        Self {
292            metrics: Arc::new(RwLock::new(HashMap::new())),
293        }
294    }
295
296    /// Record an operation for a kernel.
297    pub fn record(&self, kernel_id: &str, latency: Duration) {
298        let mut metrics = self.metrics.write().unwrap();
299        metrics
300            .entry(kernel_id.to_string())
301            .or_default()
302            .record(latency);
303    }
304
305    /// Get metrics for a kernel.
306    #[must_use]
307    pub fn get(&self, kernel_id: &str) -> Option<KernelMetrics> {
308        let metrics = self.metrics.read().unwrap();
309        metrics.get(kernel_id).cloned()
310    }
311
312    /// Get all metrics.
313    #[must_use]
314    pub fn all(&self) -> HashMap<String, KernelMetrics> {
315        self.metrics.read().unwrap().clone()
316    }
317
318    /// Reset metrics for a kernel.
319    pub fn reset(&self, kernel_id: &str) {
320        let mut metrics = self.metrics.write().unwrap();
321        if let Some(m) = metrics.get_mut(kernel_id) {
322            m.reset();
323        }
324    }
325
326    /// Reset all metrics.
327    pub fn reset_all(&self) {
328        let mut metrics = self.metrics.write().unwrap();
329        metrics.clear();
330    }
331}
332
333impl Default for MetricsCollector {
334    fn default() -> Self {
335        Self::new()
336    }
337}
338
339/// RAII guard for timing operations.
340pub struct TimingGuard<'a> {
341    collector: &'a MetricsCollector,
342    kernel_id: String,
343    start: Instant,
344}
345
346impl<'a> TimingGuard<'a> {
347    /// Create a new timing guard.
348    #[must_use]
349    pub fn new(collector: &'a MetricsCollector, kernel_id: impl Into<String>) -> Self {
350        Self {
351            collector,
352            kernel_id: kernel_id.into(),
353            start: Instant::now(),
354        }
355    }
356}
357
358impl<'a> Drop for TimingGuard<'a> {
359    fn drop(&mut self) {
360        let latency = self.start.elapsed();
361        self.collector.record(&self.kernel_id, latency);
362    }
363}
364
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::Domain;
    use crate::kernel::KernelMetadata;

    /// Fixture built with `with_throughput(100_000)` and `with_latency_us(1.0)`.
    fn test_metadata() -> KernelMetadata {
        let base = KernelMetadata::ring("test-kernel", Domain::Core);
        base.with_throughput(100_000).with_latency_us(1.0)
    }

    #[test]
    fn test_throughput_met() {
        let metadata = test_metadata();
        let outcome = SLOValidator::new().validate_throughput(&metadata, 120_000);
        assert!(outcome.is_met());

        // 120K against a 100K target is 20% headroom.
        if let SLOResult::Met { headroom_pct, .. } = outcome {
            assert!((headroom_pct - 20.0).abs() < 0.1);
        }
    }

    #[test]
    fn test_throughput_at_risk() {
        let metadata = test_metadata();
        let outcome = SLOValidator::new().validate_throughput(&metadata, 95_000);
        assert!(outcome.is_at_risk());
    }

    #[test]
    fn test_throughput_violated() {
        let metadata = test_metadata();
        let outcome = SLOValidator::new().validate_throughput(&metadata, 50_000);
        assert!(outcome.is_violated());
    }

    #[test]
    fn test_latency_met() {
        let metadata = test_metadata();
        let outcome = SLOValidator::new().validate_latency(&metadata, 0.5);
        assert!(outcome.is_met());
    }

    #[test]
    fn test_latency_at_risk() {
        let metadata = test_metadata();
        let outcome = SLOValidator::new().validate_latency(&metadata, 1.05);
        assert!(outcome.is_at_risk());
    }

    #[test]
    fn test_latency_violated() {
        let metadata = test_metadata();
        let outcome = SLOValidator::new().validate_latency(&metadata, 2.0);
        assert!(outcome.is_violated());
    }

    #[test]
    fn test_metrics_recording() {
        let collector = MetricsCollector::new();
        for micros in &[100u64, 200, 150] {
            collector.record("test", Duration::from_micros(*micros));
        }

        let snapshot = collector.get("test").unwrap();
        assert_eq!(snapshot.operations, 3);
        assert_eq!(snapshot.min_latency, Some(Duration::from_micros(100)));
        assert_eq!(snapshot.max_latency, Some(Duration::from_micros(200)));
        assert_eq!(snapshot.avg_latency(), Some(Duration::from_micros(150)));
    }

    #[test]
    fn test_slo_override() {
        let relaxed = SLOOverride {
            throughput: Some(50_000),
            tolerance_pct: 5.0,
            ..SLOOverride::default()
        };
        let validator = SLOValidator::new().with_override("test-kernel", relaxed);
        let metadata = test_metadata();

        // With override, 60K should be met (target is now 50K)
        let outcome = validator.validate_throughput(&metadata, 60_000);
        assert!(outcome.is_met());
    }
}
465        assert!(result.is_met());
466    }
467}