avila_telemetry/
observability.rs

1//! Observability module integrating NASA and Google Cloud best practices
2
3use crate::{Result, TimeSeries};
4use chrono::{DateTime, Utc};
5use std::collections::HashMap;
6use std::time::{Duration, Instant};
7
8/// Google's Four Golden Signals
9#[derive(Debug, Clone)]
10pub struct GoldenSignals {
11    pub latency: LatencyMetrics,
12    pub traffic: TrafficMetrics,
13    pub errors: ErrorMetrics,
14    pub saturation: SaturationMetrics,
15}
16
/// Latency distribution summarised as four percentile durations.
#[derive(Debug, Clone)]
pub struct LatencyMetrics {
    /// 50th percentile (median) latency.
    pub p50: Duration,
    /// 95th percentile latency.
    pub p95: Duration,
    /// 99th percentile latency.
    pub p99: Duration,
    /// 99.9th percentile latency.
    pub p99_9: Duration,
}
24
/// Traffic signal: how much demand the service is receiving.
#[derive(Debug, Clone)]
pub struct TrafficMetrics {
    /// Request throughput, in requests per second.
    pub requests_per_second: f64,
    /// Data throughput, in bytes per second.
    pub bytes_per_second: f64,
    /// Number of connections currently open.
    pub active_connections: usize,
}
31
/// Error signal: how often requests fail.
#[derive(Debug, Clone)]
pub struct ErrorMetrics {
    /// Rate of failing requests.
    // NOTE(review): scale (fraction vs. percent) is not fixed by this file - confirm with producers.
    pub error_rate: f64,
    /// Error budget associated with this reading.
    pub error_budget: f64,
    /// Absolute number of errors observed.
    pub total_errors: usize,
}
38
/// Saturation signal: how full the service's resources are.
// NOTE(review): the scale of these values (0-1 vs. 0-100) is not fixed by this file - confirm.
#[derive(Debug, Clone)]
pub struct SaturationMetrics {
    /// CPU usage.
    pub cpu_usage: f64,
    /// Memory usage.
    pub memory_usage: f64,
    /// Disk usage.
    pub disk_usage: f64,
    /// Network usage.
    pub network_usage: f64,
}
46
47/// NASA Data Quality Assessment
48#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
49pub struct DataQualityAssessment {
50    /// Accuracy score (0-1)
51    pub accuracy: f64,
52    /// Completeness score (0-1)
53    pub completeness: f64,
54    /// Consistency score (0-1)
55    pub consistency: f64,
56    /// Timeliness in milliseconds
57    pub timeliness_ms: u64,
58    /// Validity score (0-1)
59    pub validity: f64,
60    /// Overall quality score
61    pub overall_score: f64,
62}
63
64impl DataQualityAssessment {
65    /// Check if data meets NASA standards (>= 95% overall)
66    pub fn meets_nasa_standards(&self) -> bool {
67        self.overall_score >= 0.95
68    }
69
70    /// Calculate overall quality score
71    pub fn calculate_overall(&mut self) {
72        self.overall_score = self.accuracy * 0.3
73            + self.completeness * 0.25
74            + self.consistency * 0.25
75            + self.validity * 0.2;
76    }
77}
78
79/// NASA-style Quality Control
80#[derive(Debug, Clone)]
81pub struct NASAQualityControl {
82    pub ucl: f64, // Upper Control Limit
83    pub lcl: f64, // Lower Control Limit
84    pub target: f64,
85    pub sigma: usize, // 3-sigma or 6-sigma
86}
87
88impl NASAQualityControl {
89    pub fn new(target: f64, std_dev: f64, sigma: usize) -> Self {
90        let sigma_multiplier = sigma as f64;
91        Self {
92            ucl: target + sigma_multiplier * std_dev,
93            lcl: target - sigma_multiplier * std_dev,
94            target,
95            sigma,
96        }
97    }
98
99    /// Check if value is within control limits
100    pub fn is_in_control(&self, value: f64) -> bool {
101        value >= self.lcl && value <= self.ucl
102    }
103
104    /// Apply Western Electric Rules
105    pub fn apply_western_electric_rules(&self, ts: &TimeSeries) -> Vec<QualityViolation> {
106        let mut violations = Vec::new();
107
108        // Rule 1: One point beyond 3-sigma
109        for (i, &value) in ts.values.iter().enumerate() {
110            if !self.is_in_control(value) {
111                violations.push(QualityViolation {
112                    index: i,
113                    value,
114                    rule: WesternElectricRule::BeyondLimits,
115                    severity: ViolationSeverity::Critical,
116                });
117            }
118        }
119
120        // Rule 2: Two out of three consecutive points beyond 2-sigma
121        // Rule 3: Four out of five consecutive points beyond 1-sigma
122        // ... (implement remaining rules)
123
124        violations
125    }
126}
127
128#[derive(Debug, Clone)]
129pub struct QualityViolation {
130    pub index: usize,
131    pub value: f64,
132    pub rule: WesternElectricRule,
133    pub severity: ViolationSeverity,
134}
135
/// Identifies which Western Electric control-chart rule a violation broke.
#[derive(Debug, Clone, Copy)]
pub enum WesternElectricRule {
    /// A point lies outside the control limits.
    BeyondLimits,
    /// Two out of three consecutive points beyond 2-sigma.
    TwoOfThree,
    /// Four out of five consecutive points beyond 1-sigma.
    FourOfFive,
    /// Eight consecutive points form a run.
    EightConsecutive,
}
143
/// Severity attached to a quality violation.
#[derive(Debug, Clone, Copy)]
pub enum ViolationSeverity {
    /// Worth watching.
    Warning,
    /// Needs attention.
    Critical,
    /// Most severe level.
    Emergency,
}
150
151/// Data Quality metrics (NASA standards)
152#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
153pub struct DataQualityMetrics {
154    pub glitches: usize,
155    pub gaps: usize,
156    pub score: f64, // 0.0 - 1.0
157}
158
159/// Service Level Objective (Google SRE)
160#[derive(Debug, Clone)]
161pub struct ServiceLevelObjective {
162    pub name: String,
163    pub target: f64, // e.g., 0.999 for 99.9%
164    pub window: Duration,
165    pub error_budget: ErrorBudget,
166}
167
/// Error budget bookkeeping for a service level objective.
///
/// The fields are plain data; nothing in this type keeps `consumed` and
/// `remaining` in sync, so callers update them together.
#[derive(Debug, Clone)]
pub struct ErrorBudget {
    /// Budget granted in total.
    pub total: f64,
    /// Budget spent so far.
    pub consumed: f64,
    /// Budget still available; may go negative when overdrawn.
    pub remaining: f64,
    /// Spend rate, interpreted by [`ErrorBudget::time_to_exhaustion`] as
    /// budget units per second.
    pub burn_rate: f64,
}

impl ErrorBudget {
    /// Creates an untouched budget: nothing consumed, everything remaining,
    /// and a burn rate of zero.
    pub fn new(total: f64) -> Self {
        Self {
            total,
            consumed: 0.0,
            remaining: total,
            burn_rate: 0.0,
        }
    }

    /// Returns `true` once no budget remains (`remaining <= 0`).
    pub fn is_exhausted(&self) -> bool {
        self.remaining <= 0.0
    }

    /// Estimates how long the remaining budget lasts at the current burn rate.
    ///
    /// * `None` when the burn rate is zero, negative or `NaN` (the budget is
    ///   not being consumed), or when the estimate itself is `NaN`.
    /// * `Some(Duration::ZERO)` when the budget is already exhausted or
    ///   overdrawn.
    /// * `Some(Duration::MAX)` when the estimate is too large for a
    ///   `Duration`.
    ///
    /// This method never panics. `Duration::from_secs_f64` panics on
    /// negative, non-finite or overflowing input, so those cases are handled
    /// before the conversion.
    pub fn time_to_exhaustion(&self) -> Option<Duration> {
        if self.burn_rate.is_nan() || self.burn_rate <= 0.0 {
            return None;
        }
        if self.remaining <= 0.0 {
            return Some(Duration::ZERO);
        }

        let seconds = self.remaining / self.burn_rate;
        if seconds.is_nan() {
            return None;
        }
        // `u64::MAX as f64` rounds up to 2^64, the first value that no longer
        // fits in a `Duration`'s whole-second counter.
        if seconds >= u64::MAX as f64 {
            return Some(Duration::MAX);
        }
        Some(Duration::from_secs_f64(seconds))
    }
}
198
199/// Structured log entry (Google Cloud Logging format)
200#[derive(Debug, Clone)]
201pub struct StructuredLog {
202    pub timestamp: DateTime<Utc>,
203    pub severity: LogSeverity,
204    pub message: String,
205    pub labels: HashMap<String, String>,
206    pub trace_id: Option<String>,
207    pub span_id: Option<String>,
208}
209
210#[derive(Debug, Clone, Copy)]
211pub enum LogSeverity {
212    Debug,
213    Info,
214    Notice,
215    Warning,
216    Error,
217    Critical,
218    Alert,
219    Emergency,
220}
221
222impl StructuredLog {
223    pub fn new(severity: LogSeverity, message: impl Into<String>) -> Self {
224        Self {
225            timestamp: Utc::now(),
226            severity,
227            message: message.into(),
228            labels: HashMap::new(),
229            trace_id: None,
230            span_id: None,
231        }
232    }
233
234    pub fn with_label(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
235        self.labels.insert(key.into(), value.into());
236        self
237    }
238
239    pub fn with_trace(mut self, trace_id: String, span_id: String) -> Self {
240        self.trace_id = Some(trace_id);
241        self.span_id = Some(span_id);
242        self
243    }
244}
245
/// Alert levels (NASA-style)
///
/// Four colour-coded levels.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AlertLevel {
    /// Normal operations.
    Green,
    /// Caution.
    Yellow,
    /// Warning - immediate attention.
    Red,
    /// Emergency - system critical.
    Black,
}
254
255#[derive(Debug, Clone)]
256pub struct Alert {
257    pub timestamp: DateTime<Utc>,
258    pub level: AlertLevel,
259    pub message: String,
260    pub metric: String,
261    pub value: f64,
262    pub threshold: f64,
263    pub recommended_action: String,
264}
265
266/// Performance metrics tracker
267pub struct PerformanceTracker {
268    _start_time: Instant,
269    measurements: Vec<Duration>,
270}
271
272impl PerformanceTracker {
273    pub fn new() -> Self {
274        Self {
275            _start_time: Instant::now(),
276            measurements: Vec::new(),
277        }
278    }
279
280    pub fn measure<F, T>(&mut self, f: F) -> Result<T>
281    where
282        F: FnOnce() -> Result<T>,
283    {
284        let start = Instant::now();
285        let result = f();
286        let duration = start.elapsed();
287        self.measurements.push(duration);
288        result
289    }
290
291    pub fn calculate_percentiles(&self) -> LatencyMetrics {
292        let mut sorted = self.measurements.clone();
293        sorted.sort();
294
295        let len = sorted.len();
296        if len == 0 {
297            return LatencyMetrics {
298                p50: Duration::ZERO,
299                p95: Duration::ZERO,
300                p99: Duration::ZERO,
301                p99_9: Duration::ZERO,
302            };
303        }
304
305        LatencyMetrics {
306            p50: sorted[len * 50 / 100],
307            p95: sorted[len * 95 / 100],
308            p99: sorted[len * 99 / 100],
309            p99_9: sorted[(len * 999 / 1000).min(len - 1)],
310        }
311    }
312}
313
314impl Default for PerformanceTracker {
315    fn default() -> Self {
316        Self::new()
317    }
318}
319
320/// Trait for data quality assessment
321pub trait DataQuality {
322    fn assess_quality(&self) -> DataQualityAssessment;
323}
324
325impl DataQuality for TimeSeries {
326    fn assess_quality(&self) -> DataQualityAssessment {
327        let stats = self.statistics();
328
329        // Accuracy: based on std deviation (lower is better)
330        let accuracy = if stats.std_dev > 0.0 {
331            (1.0 / (1.0 + stats.std_dev / stats.mean.abs().max(1.0))).min(1.0)
332        } else {
333            1.0
334        };
335
336        // Completeness: no NaN or infinite values
337        let valid_count = self.values.iter().filter(|v| v.is_finite()).count();
338        let completeness = valid_count as f64 / self.values.len() as f64;
339
340        // Consistency: check for sudden jumps
341        let diffs = self.diff();
342        let mean_diff = diffs.iter().sum::<f64>() / diffs.len() as f64;
343        let sudden_jumps = diffs
344            .iter()
345            .filter(|&&d| (d - mean_diff).abs() > 3.0 * stats.std_dev)
346            .count();
347        let consistency = 1.0 - (sudden_jumps as f64 / diffs.len() as f64);
348
349        // Validity: values within reasonable range
350        let validity = if stats.min >= 0.0 && stats.max < f64::MAX {
351            1.0
352        } else {
353            0.5
354        };
355
356        let mut dqa = DataQualityAssessment {
357            accuracy,
358            completeness,
359            consistency,
360            validity,
361            timeliness_ms: 0, // Would be set from actual latency measurement
362            overall_score: 0.0,
363        };
364
365        dqa.calculate_overall();
366        dqa
367    }
368}
369
#[cfg(test)]
mod tests {
    use super::*;

    /// 3-sigma limits around 10 with a standard deviation of 2 span 4..=16.
    #[test]
    fn test_quality_control() {
        let limits = NASAQualityControl::new(10.0, 2.0, 3);

        let on_target = limits.is_in_control(10.0);
        let one_sigma_up = limits.is_in_control(12.0);
        let far_outside = limits.is_in_control(20.0);

        assert!(on_target);
        assert!(one_sigma_up);
        assert!(!far_outside);
    }

    /// A fresh budget is not exhausted; it is once nothing remains.
    #[test]
    fn test_error_budget() {
        let mut allowance = ErrorBudget::new(0.001);
        assert!(!allowance.is_exhausted());

        // Spend the whole budget.
        allowance.consumed = 0.001;
        allowance.remaining = 0.0;
        assert!(allowance.is_exhausted());
    }

    /// A clean, fully finite series gets positive scores and full completeness.
    #[test]
    fn test_data_quality() {
        let series = TimeSeries::new(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let report = series.assess_quality();

        assert!(report.accuracy > 0.0);
        assert_eq!(report.completeness, 1.0);
        assert!(report.overall_score > 0.0);
    }
}