1#![forbid(unsafe_code)]
2
3use std::collections::HashMap;
29
/// Category of a measured metric.
///
/// The category matters for safe-mode evaluation: only `ErrorRate` metrics are
/// compared against `SloSchema::safe_mode_error_rate`.
///
/// The enum is a plain tag with no payload, so it is `Copy` (no `.clone()`
/// needed to pass it around) and `Hash` (usable as a map or set key).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MetricType {
    /// Written `latency` in the schema text. Also the type assumed for a
    /// metric that does not declare a `metric_type`.
    Latency,
    /// Written `memory` in the schema text.
    Memory,
    /// Written `error_rate` in the schema text.
    ErrorRate,
}
44
/// Per-metric SLO limits, stored in `SloSchema::metrics` under the metric's name.
#[derive(Debug, Clone)]
pub struct MetricSlo {
    /// Category of the metric; the parser starts every metric as
    /// `MetricType::Latency` until a `metric_type:` line overrides it.
    pub metric_type: MetricType,
    /// Optional absolute ceiling: `check_breach` reports `AbsoluteBreach` when
    /// the current value is strictly greater than this.
    pub max_value: Option<f64>,
    /// Optional ceiling on `current / baseline`: `check_breach` reports
    /// `Breach` when the ratio is strictly greater than this.
    pub max_ratio: Option<f64>,
    /// When `true`, a single breach of this metric is enough for
    /// `check_safe_mode` to return `Triggered`.
    pub safe_mode_trigger: bool,
}
57
/// Complete SLO configuration: global thresholds plus per-metric limits.
#[derive(Debug, Clone)]
pub struct SloSchema {
    /// Relative increase over baseline (`current / baseline - 1`) above which
    /// `check_breach` reports `Breach`. Parsed values must lie in `0.0..=1.0`;
    /// the default is `0.10`.
    pub regression_threshold: f64,
    /// Relative increase above which `check_breach` reports `Noise`. The parser
    /// rejects a schema whose `noise_tolerance` is not strictly below
    /// `regression_threshold`; the default is `0.05`.
    pub noise_tolerance: f64,
    /// Per-metric limits keyed by metric name.
    pub metrics: HashMap<String, MetricSlo>,
    /// Number of `Breach`/`AbsoluteBreach` results in one batch at which
    /// `check_safe_mode` triggers. The parser requires a value above zero; the
    /// default is `3`.
    pub safe_mode_breach_count: usize,
    /// Current value above which any `ErrorRate` metric triggers safe mode.
    /// Parsed values must lie in `0.0..=1.0`; the default is `0.10`.
    pub safe_mode_error_rate: f64,
}
72
/// Problems reported while reading an SLO schema.
#[derive(Debug, Clone, PartialEq)]
pub enum SloSchemaError {
    /// A number parsed correctly but is outside the accepted range: a
    /// threshold outside `0.0..=1.0`, a `safe_mode_breach_count` of zero, or a
    /// `noise_tolerance` that is not below `regression_threshold`.
    InvalidThreshold { field: String, value: f64 },
    /// A required field is absent.
    /// NOTE(review): nothing in this file constructs this variant — confirm
    /// whether another module produces it.
    MissingField(String),
    /// The text after a key could not be parsed; `reason` carries the
    /// underlying parse error or a description of what was expected.
    ParseError { field: String, reason: String },
    /// A `metric_type:` value other than `latency`, `memory` or `error_rate`.
    UnknownMetricType(String),
    /// The same metric name was declared more than once.
    DuplicateMetric(String),
    /// The text is structurally unacceptable; the parser in this file uses it
    /// for lines that contain a tab.
    MalformedStructure(String),
}
89
90impl std::fmt::Display for SloSchemaError {
91 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
92 match self {
93 Self::InvalidThreshold { field, value } => {
94 write!(f, "invalid threshold for '{field}': {value}")
95 }
96 Self::MissingField(field) => write!(f, "missing required field: '{field}'"),
97 Self::ParseError { field, reason } => {
98 write!(f, "parse error for '{field}': {reason}")
99 }
100 Self::UnknownMetricType(t) => write!(f, "unknown metric type: '{t}'"),
101 Self::DuplicateMetric(name) => write!(f, "duplicate metric: '{name}'"),
102 Self::MalformedStructure(msg) => write!(f, "malformed structure: {msg}"),
103 }
104 }
105}
106
// Marker impl: the trait's default methods are used unchanged, so these errors
// expose no underlying `source()`.
impl std::error::Error for SloSchemaError {}
108
/// Outcome of comparing one observation against the schema (see `check_breach`).
#[derive(Debug, Clone)]
pub struct BreachResult {
    /// Name the observation was reported under.
    pub metric_name: String,
    /// Type taken from the metric's schema entry; `Latency` when the metric is
    /// not listed in the schema.
    pub metric_type: MetricType,
    /// Reference value the observation is compared with.
    pub baseline: f64,
    /// Observed value.
    pub current: f64,
    /// `current / baseline`, or `1.0` when the baseline is not greater than zero.
    pub ratio: f64,
    /// Classification of the change.
    pub severity: BreachSeverity,
    /// Copied from the metric's schema entry; `false` when the metric is not
    /// listed in the schema.
    pub safe_mode_trigger: bool,
}
120
/// How far an observation has moved relative to its limits.
///
/// The enum is a plain tag with no payload, so it is `Copy` (results can hand
/// it around without `.clone()`) and `Hash` (usable as a map or set key, e.g.
/// for counting results per severity).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BreachSeverity {
    /// The relative change does not exceed the schema's noise tolerance.
    None,
    /// The relative change exceeds the noise tolerance but not the regression
    /// threshold.
    Noise,
    /// The relative change exceeds the regression threshold, or the ratio
    /// exceeds the metric's `max_ratio`.
    Breach,
    /// The current value exceeds the metric's absolute `max_value`.
    AbsoluteBreach,
}
133
/// Verdict of `check_safe_mode` for one batch of breach results.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SafeModeDecision {
    /// No safe-mode rule fired.
    Normal,
    /// A safe-mode rule fired; the string describes which one and why.
    Triggered(String),
}
142
143impl Default for SloSchema {
148 fn default() -> Self {
149 Self {
150 regression_threshold: 0.10,
151 noise_tolerance: 0.05,
152 metrics: HashMap::new(),
153 safe_mode_breach_count: 3,
154 safe_mode_error_rate: 0.10,
155 }
156 }
157}
158
159pub fn parse_slo_yaml(yaml: &str) -> Result<SloSchema, Vec<SloSchemaError>> {
167 let mut schema = SloSchema::default();
168 let mut errors = Vec::new();
169 let mut in_metrics = false;
170 let mut current_metric: Option<String> = None;
171 let mut current_slo = MetricSlo {
172 metric_type: MetricType::Latency,
173 max_value: None,
174 max_ratio: None,
175 safe_mode_trigger: false,
176 };
177 let mut seen_metrics = std::collections::HashSet::new();
178
179 for (line_num, line) in yaml.lines().enumerate() {
180 let trimmed = line.trim();
181 if trimmed.is_empty() || trimmed.starts_with('#') {
182 continue;
183 }
184
185 if trimmed.contains('\t') {
186 errors.push(SloSchemaError::MalformedStructure(format!(
187 "line {}: tabs not allowed, use spaces",
188 line_num + 1
189 )));
190 continue;
191 }
192
193 if let Some(value) = trimmed.strip_prefix("regression_threshold:") {
194 parse_threshold(
195 value.trim(),
196 "regression_threshold",
197 &mut schema.regression_threshold,
198 &mut errors,
199 );
200 } else if let Some(value) = trimmed.strip_prefix("noise_tolerance:") {
201 parse_threshold(
202 value.trim(),
203 "noise_tolerance",
204 &mut schema.noise_tolerance,
205 &mut errors,
206 );
207 } else if let Some(value) = trimmed.strip_prefix("safe_mode_breach_count:") {
208 match value.trim().parse::<usize>() {
209 Ok(v) if v > 0 => schema.safe_mode_breach_count = v,
210 Ok(_) => errors.push(SloSchemaError::InvalidThreshold {
211 field: "safe_mode_breach_count".into(),
212 value: 0.0,
213 }),
214 Err(e) => errors.push(SloSchemaError::ParseError {
215 field: "safe_mode_breach_count".into(),
216 reason: e.to_string(),
217 }),
218 }
219 } else if let Some(value) = trimmed.strip_prefix("safe_mode_error_rate:") {
220 parse_threshold(
221 value.trim(),
222 "safe_mode_error_rate",
223 &mut schema.safe_mode_error_rate,
224 &mut errors,
225 );
226 } else if trimmed == "metrics:" {
227 in_metrics = true;
228 } else if in_metrics
229 && trimmed.ends_with(':')
230 && !trimmed.starts_with("max_")
231 && !trimmed.starts_with("metric_type:")
232 && !trimmed.starts_with("safe_mode")
233 {
234 if let Some(ref name) = current_metric {
236 schema.metrics.insert(name.clone(), current_slo.clone());
237 }
238 let metric_name = trimmed.trim_end_matches(':').to_string();
239 if !seen_metrics.insert(metric_name.clone()) {
240 errors.push(SloSchemaError::DuplicateMetric(metric_name.clone()));
241 }
242 current_metric = Some(metric_name);
243 current_slo = MetricSlo {
244 metric_type: MetricType::Latency,
245 max_value: None,
246 max_ratio: None,
247 safe_mode_trigger: false,
248 };
249 } else if let Some(value) = trimmed.strip_prefix("metric_type:") {
250 match value.trim() {
251 "latency" => current_slo.metric_type = MetricType::Latency,
252 "memory" => current_slo.metric_type = MetricType::Memory,
253 "error_rate" => current_slo.metric_type = MetricType::ErrorRate,
254 other => errors.push(SloSchemaError::UnknownMetricType(other.to_string())),
255 }
256 } else if let Some(value) = trimmed.strip_prefix("max_value:") {
257 match value.trim().parse::<f64>() {
258 Ok(v) => current_slo.max_value = Some(v),
259 Err(e) => errors.push(SloSchemaError::ParseError {
260 field: "max_value".into(),
261 reason: e.to_string(),
262 }),
263 }
264 } else if let Some(value) = trimmed.strip_prefix("max_ratio:") {
265 match value.trim().parse::<f64>() {
266 Ok(v) => current_slo.max_ratio = Some(v),
267 Err(e) => errors.push(SloSchemaError::ParseError {
268 field: "max_ratio".into(),
269 reason: e.to_string(),
270 }),
271 }
272 } else if let Some(value) = trimmed.strip_prefix("safe_mode_trigger:") {
273 match value.trim() {
274 "true" => current_slo.safe_mode_trigger = true,
275 "false" => current_slo.safe_mode_trigger = false,
276 other => errors.push(SloSchemaError::ParseError {
277 field: "safe_mode_trigger".into(),
278 reason: format!("expected 'true' or 'false', got '{other}'"),
279 }),
280 }
281 }
282 }
283
284 if let Some(ref name) = current_metric {
286 schema.metrics.insert(name.clone(), current_slo);
287 }
288
289 if schema.noise_tolerance >= schema.regression_threshold {
291 errors.push(SloSchemaError::InvalidThreshold {
292 field: "noise_tolerance".into(),
293 value: schema.noise_tolerance,
294 });
295 }
296
297 if errors.is_empty() {
298 Ok(schema)
299 } else {
300 Err(errors)
301 }
302}
303
304fn parse_threshold(value: &str, field: &str, target: &mut f64, errors: &mut Vec<SloSchemaError>) {
305 match value.parse::<f64>() {
306 Ok(v) if (0.0..=1.0).contains(&v) => *target = v,
307 Ok(v) => errors.push(SloSchemaError::InvalidThreshold {
308 field: field.into(),
309 value: v,
310 }),
311 Err(e) => errors.push(SloSchemaError::ParseError {
312 field: field.into(),
313 reason: e.to_string(),
314 }),
315 }
316}
317
318pub fn check_breach(
324 metric_name: &str,
325 baseline: f64,
326 current: f64,
327 schema: &SloSchema,
328) -> BreachResult {
329 let ratio = if baseline > 0.0 {
330 current / baseline
331 } else {
332 1.0
333 };
334
335 let metric_slo = schema.metrics.get(metric_name);
336 let metric_type = metric_slo
337 .map(|s| s.metric_type.clone())
338 .unwrap_or(MetricType::Latency);
339 let safe_mode_trigger = metric_slo.map(|s| s.safe_mode_trigger).unwrap_or(false);
340
341 if let Some(slo) = metric_slo {
343 if let Some(max_val) = slo.max_value
344 && current > max_val
345 {
346 return BreachResult {
347 metric_name: metric_name.to_string(),
348 metric_type,
349 baseline,
350 current,
351 ratio,
352 severity: BreachSeverity::AbsoluteBreach,
353 safe_mode_trigger,
354 };
355 }
356 if let Some(max_ratio) = slo.max_ratio
357 && ratio > max_ratio
358 {
359 return BreachResult {
360 metric_name: metric_name.to_string(),
361 metric_type,
362 baseline,
363 current,
364 ratio,
365 severity: BreachSeverity::Breach,
366 safe_mode_trigger,
367 };
368 }
369 }
370
371 let change_pct = ratio - 1.0;
373 let severity = if change_pct > schema.regression_threshold {
374 BreachSeverity::Breach
375 } else if change_pct > schema.noise_tolerance {
376 BreachSeverity::Noise
377 } else {
378 BreachSeverity::None
379 };
380
381 BreachResult {
382 metric_name: metric_name.to_string(),
383 metric_type,
384 baseline,
385 current,
386 ratio,
387 severity,
388 safe_mode_trigger,
389 }
390}
391
392pub fn check_safe_mode(breaches: &[BreachResult], schema: &SloSchema) -> SafeModeDecision {
394 for b in breaches {
396 if b.safe_mode_trigger
397 && (b.severity == BreachSeverity::Breach
398 || b.severity == BreachSeverity::AbsoluteBreach)
399 {
400 return SafeModeDecision::Triggered(format!(
401 "metric '{}' breached with safe_mode_trigger=true (ratio={:.3})",
402 b.metric_name, b.ratio
403 ));
404 }
405 }
406
407 for b in breaches {
409 if b.metric_type == MetricType::ErrorRate && b.current > schema.safe_mode_error_rate {
410 return SafeModeDecision::Triggered(format!(
411 "error rate '{}' at {:.3} exceeds safe_mode_error_rate {:.3}",
412 b.metric_name, b.current, schema.safe_mode_error_rate
413 ));
414 }
415 }
416
417 let breach_count = breaches
419 .iter()
420 .filter(|b| {
421 b.severity == BreachSeverity::Breach || b.severity == BreachSeverity::AbsoluteBreach
422 })
423 .count();
424
425 if breach_count >= schema.safe_mode_breach_count {
426 return SafeModeDecision::Triggered(format!(
427 "{breach_count} simultaneous breaches (threshold: {})",
428 schema.safe_mode_breach_count
429 ));
430 }
431
432 SafeModeDecision::Normal
433}
434
435pub fn emit_slo_check(breach: &BreachResult, safe_mode: &SafeModeDecision) {
446 let span = tracing::info_span!(
447 "slo.check",
448 metric_name = breach.metric_name.as_str(),
449 metric_type = ?breach.metric_type,
450 baseline = breach.baseline,
451 current = breach.current,
452 ratio = breach.ratio,
453 severity = ?breach.severity,
454 );
455 let _guard = span.enter();
456
457 match safe_mode {
458 SafeModeDecision::Triggered(reason) => {
459 tracing::error!(
460 metric = breach.metric_name.as_str(),
461 ratio = breach.ratio,
462 reason = reason.as_str(),
463 "safe-mode triggered"
464 );
465 }
466 SafeModeDecision::Normal => match breach.severity {
467 BreachSeverity::Breach | BreachSeverity::AbsoluteBreach => {
468 tracing::warn!(
469 metric = breach.metric_name.as_str(),
470 baseline = breach.baseline,
471 current = breach.current,
472 ratio = breach.ratio,
473 severity = ?breach.severity,
474 "SLO breach detected"
475 );
476 }
477 BreachSeverity::Noise => {
478 tracing::debug!(
479 metric = breach.metric_name.as_str(),
480 ratio = breach.ratio,
481 "noise-level change within tolerance"
482 );
483 }
484 BreachSeverity::None => {
485 tracing::trace!(
486 metric = breach.metric_name.as_str(),
487 ratio = breach.ratio,
488 "metric within SLO"
489 );
490 }
491 },
492 }
493}
494
495pub fn run_slo_check(
499 schema: &SloSchema,
500 observations: &[(&str, f64, f64)], ) -> (Vec<BreachResult>, SafeModeDecision) {
502 let breaches: Vec<BreachResult> = observations
503 .iter()
504 .map(|(name, baseline, current)| check_breach(name, *baseline, *current, schema))
505 .collect();
506
507 let safe_mode = check_safe_mode(&breaches, schema);
508
509 for b in &breaches {
511 emit_slo_check(b, &safe_mode);
512 }
513
514 (breaches, safe_mode)
515}
516
#[cfg(test)]
mod tests {
    use super::*;

    /// Default schema plus a single latency metric named `p99` with the given
    /// limits and `safe_mode_trigger` off.
    fn p99_schema(max_value: f64, max_ratio: f64) -> SloSchema {
        let mut metrics = HashMap::new();
        metrics.insert(
            "p99".to_string(),
            MetricSlo {
                metric_type: MetricType::Latency,
                max_value: Some(max_value),
                max_ratio: Some(max_ratio),
                safe_mode_trigger: false,
            },
        );
        SloSchema {
            metrics,
            ..SloSchema::default()
        }
    }

    /// Hand-built result for exercising `check_safe_mode` directly.
    fn observed(
        metric_name: &str,
        metric_type: MetricType,
        baseline: f64,
        current: f64,
        ratio: f64,
        severity: BreachSeverity,
        safe_mode_trigger: bool,
    ) -> BreachResult {
        BreachResult {
            metric_name: metric_name.to_string(),
            metric_type,
            baseline,
            current,
            ratio,
            severity,
            safe_mode_trigger,
        }
    }

    /// Parses `yaml`, expecting failure, and returns the collected errors.
    fn parse_errors(yaml: &str) -> Vec<SloSchemaError> {
        parse_slo_yaml(yaml).unwrap_err()
    }

    #[test]
    fn parse_minimal_valid_yaml() {
        let yaml = r#"
regression_threshold: 0.10
noise_tolerance: 0.05
metrics:
 render_p99:
 metric_type: latency
 max_value: 4000.0
 max_ratio: 1.25
 safe_mode_trigger: true
"#;
        let schema = parse_slo_yaml(yaml).expect("should parse");
        assert_eq!(schema.metrics.len(), 1);
        let render = schema.metrics.get("render_p99").unwrap();
        assert_eq!(render.metric_type, MetricType::Latency);
        assert_eq!(render.max_value, Some(4000.0));
        assert!(render.safe_mode_trigger);
    }

    #[test]
    fn parse_empty_uses_defaults() {
        let schema = parse_slo_yaml("").expect("empty should use defaults");
        assert!((schema.regression_threshold - 0.10).abs() < f64::EPSILON);
        assert!((schema.noise_tolerance - 0.05).abs() < f64::EPSILON);
        assert_eq!(schema.safe_mode_breach_count, 3);
        assert!(schema.metrics.is_empty());
    }

    #[test]
    fn reject_invalid_threshold() {
        let errors = parse_errors("regression_threshold: 1.5\nnoise_tolerance: 0.05\n");
        let reported = errors.iter().any(|e| {
            matches!(
                e,
                SloSchemaError::InvalidThreshold { field, .. } if field == "regression_threshold"
            )
        });
        assert!(reported);
    }

    #[test]
    fn reject_noise_gte_regression() {
        let errors = parse_errors("regression_threshold: 0.05\nnoise_tolerance: 0.10\n");
        let reported = errors.iter().any(|e| {
            matches!(
                e,
                SloSchemaError::InvalidThreshold { field, .. } if field == "noise_tolerance"
            )
        });
        assert!(reported);
    }

    #[test]
    fn reject_unknown_metric_type() {
        let errors = parse_errors(
            "regression_threshold: 0.10\nnoise_tolerance: 0.05\nmetrics:\n m:\n metric_type: throughput\n",
        );
        let reported = errors
            .iter()
            .any(|e| matches!(e, SloSchemaError::UnknownMetricType(t) if t == "throughput"));
        assert!(reported);
    }

    #[test]
    fn reject_duplicate_metric() {
        let errors = parse_errors(
            "regression_threshold: 0.10\nnoise_tolerance: 0.05\nmetrics:\n m:\n metric_type: latency\n m:\n metric_type: latency\n",
        );
        let reported = errors
            .iter()
            .any(|e| matches!(e, SloSchemaError::DuplicateMetric(_)));
        assert!(reported);
    }

    #[test]
    fn breach_absolute_threshold() {
        let schema = p99_schema(500.0, 1.15);
        let result = check_breach("p99", 400.0, 520.0, &schema);
        assert_eq!(result.severity, BreachSeverity::AbsoluteBreach);
    }

    #[test]
    fn breach_ratio_threshold() {
        let schema = p99_schema(1000.0, 1.10);
        let result = check_breach("p99", 400.0, 480.0, &schema);
        assert_eq!(result.severity, BreachSeverity::Breach);
    }

    #[test]
    fn within_slo_no_breach() {
        let schema = p99_schema(500.0, 1.15);
        let result = check_breach("p99", 400.0, 404.0, &schema);
        assert_eq!(result.severity, BreachSeverity::None);
    }

    #[test]
    fn safe_mode_triggered_by_flag() {
        let schema = SloSchema::default();
        let breaches = [observed(
            "critical",
            MetricType::Latency,
            200.0,
            600.0,
            3.0,
            BreachSeverity::Breach,
            true,
        )];
        let decision = check_safe_mode(&breaches, &schema);
        assert!(matches!(decision, SafeModeDecision::Triggered(_)));
    }

    #[test]
    fn safe_mode_triggered_by_error_rate() {
        let schema = SloSchema {
            safe_mode_error_rate: 0.10,
            ..SloSchema::default()
        };
        let breaches = [observed(
            "errors",
            MetricType::ErrorRate,
            0.02,
            0.15,
            7.5,
            BreachSeverity::Breach,
            false,
        )];
        let decision = check_safe_mode(&breaches, &schema);
        assert!(matches!(decision, SafeModeDecision::Triggered(ref r) if r.contains("error rate")));
    }

    #[test]
    fn safe_mode_triggered_by_breach_count() {
        let schema = SloSchema {
            safe_mode_breach_count: 2,
            ..SloSchema::default()
        };
        let breaches = [
            observed(
                "a",
                MetricType::Latency,
                100.0,
                200.0,
                2.0,
                BreachSeverity::Breach,
                false,
            ),
            observed(
                "b",
                MetricType::Memory,
                1000.0,
                3000.0,
                3.0,
                BreachSeverity::AbsoluteBreach,
                false,
            ),
        ];
        let decision = check_safe_mode(&breaches, &schema);
        assert!(
            matches!(decision, SafeModeDecision::Triggered(ref r) if r.contains("simultaneous"))
        );
    }

    #[test]
    fn safe_mode_not_triggered_below_thresholds() {
        let schema = SloSchema::default();
        let breaches = [observed(
            "ok",
            MetricType::Latency,
            100.0,
            115.0,
            1.15,
            BreachSeverity::Breach,
            false,
        )];
        let decision = check_safe_mode(&breaches, &schema);
        assert_eq!(decision, SafeModeDecision::Normal);
    }

    #[test]
    fn zero_baseline_no_panic() {
        let schema = SloSchema::default();
        let result = check_breach("zero", 0.0, 5.0, &schema);
        assert!((result.ratio - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn improvement_not_flagged() {
        let schema = SloSchema::default();
        let result = check_breach("improving", 200.0, 150.0, &schema);
        assert_eq!(result.severity, BreachSeverity::None);
    }

    #[test]
    fn run_slo_check_batch_normal() {
        let schema = p99_schema(500.0, 1.15);
        let observations = [("p99", 400.0, 404.0)];
        let (breaches, decision) = run_slo_check(&schema, &observations);
        assert_eq!(breaches.len(), 1);
        assert_eq!(decision, SafeModeDecision::Normal);
    }

    #[test]
    fn schema_error_display() {
        let msg = SloSchemaError::InvalidThreshold {
            field: "regression_threshold".into(),
            value: 1.5,
        }
        .to_string();
        assert!(msg.contains("regression_threshold"));
        assert!(msg.contains("1.5"));
    }

    #[test]
    fn parse_all_three_metric_types() {
        let yaml = r#"
regression_threshold: 0.10
noise_tolerance: 0.05
metrics:
 lat:
 metric_type: latency
 max_value: 100.0
 mem:
 metric_type: memory
 max_value: 1000.0
 err:
 metric_type: error_rate
 max_value: 0.01
"#;
        let schema = parse_slo_yaml(yaml).unwrap();
        let expectations = [
            ("lat", MetricType::Latency),
            ("mem", MetricType::Memory),
            ("err", MetricType::ErrorRate),
        ];
        for (name, expected) in expectations {
            assert_eq!(schema.metrics.get(name).unwrap().metric_type, expected);
        }
    }

    #[test]
    fn comments_and_blanks_ignored() {
        let yaml = "# comment\nregression_threshold: 0.12\n\n# another\nnoise_tolerance: 0.03\n";
        let schema = parse_slo_yaml(yaml).unwrap();
        assert!((schema.regression_threshold - 0.12).abs() < f64::EPSILON);
    }
}