sentinel_proxy/proxy/
fallback_metrics.rs

//! Fallback routing metrics for observability.
//!
//! Provides Prometheus counters for:
//! - Fallback attempts, labelled by route, source upstream, target upstream, and reason
//! - Successful responses served after a fallback
//! - Exhausted fallback events (every fallback upstream was tried)
//! - Model mappings applied during fallback
7
8use anyhow::{Context, Result};
9use once_cell::sync::OnceCell;
10use prometheus::{register_int_counter_vec, IntCounterVec};
11use std::sync::Arc;
12
13use super::context::FallbackReason;
14
15/// Global fallback metrics instance.
16static FALLBACK_METRICS: OnceCell<Arc<FallbackMetrics>> = OnceCell::new();
17
18/// Get or initialize the global fallback metrics.
19pub fn get_fallback_metrics() -> Option<Arc<FallbackMetrics>> {
20    FALLBACK_METRICS.get().cloned()
21}
22
23/// Initialize the global fallback metrics.
24/// Returns Ok if already initialized or initialization succeeds.
25pub fn init_fallback_metrics() -> Result<Arc<FallbackMetrics>> {
26    if let Some(metrics) = FALLBACK_METRICS.get() {
27        return Ok(metrics.clone());
28    }
29
30    let metrics = Arc::new(FallbackMetrics::new()?);
31    let _ = FALLBACK_METRICS.set(metrics.clone());
32    Ok(metrics)
33}
34
35/// Fallback routing metrics collector.
36///
37/// Tracks fallback attempts, successes, and exhaustion events for
38/// observability and alerting.
39pub struct FallbackMetrics {
40    /// Total fallback attempts
41    /// Labels: route, from_upstream, to_upstream, reason
42    fallback_attempts: IntCounterVec,
43
44    /// Successful responses after fallback
45    /// Labels: route, upstream
46    fallback_success: IntCounterVec,
47
48    /// All fallbacks exhausted (no more upstreams to try)
49    /// Labels: route
50    fallback_exhausted: IntCounterVec,
51
52    /// Model mapping applied during fallback
53    /// Labels: route, original_model, mapped_model
54    model_mapping_applied: IntCounterVec,
55}
56
57impl FallbackMetrics {
58    /// Create new fallback metrics and register with Prometheus.
59    pub fn new() -> Result<Self> {
60        let fallback_attempts = register_int_counter_vec!(
61            "sentinel_fallback_attempts_total",
62            "Total number of fallback routing attempts",
63            &["route", "from_upstream", "to_upstream", "reason"]
64        )
65        .context("Failed to register fallback_attempts metric")?;
66
67        let fallback_success = register_int_counter_vec!(
68            "sentinel_fallback_success_total",
69            "Successful responses after fallback routing",
70            &["route", "upstream"]
71        )
72        .context("Failed to register fallback_success metric")?;
73
74        let fallback_exhausted = register_int_counter_vec!(
75            "sentinel_fallback_exhausted_total",
76            "Number of requests where all fallback upstreams were exhausted",
77            &["route"]
78        )
79        .context("Failed to register fallback_exhausted metric")?;
80
81        let model_mapping_applied = register_int_counter_vec!(
82            "sentinel_fallback_model_mapping_total",
83            "Number of times model mapping was applied during fallback",
84            &["route", "original_model", "mapped_model"]
85        )
86        .context("Failed to register fallback_model_mapping metric")?;
87
88        Ok(Self {
89            fallback_attempts,
90            fallback_success,
91            fallback_exhausted,
92            model_mapping_applied,
93        })
94    }
95
96    /// Record a fallback attempt.
97    ///
98    /// Called when fallback routing is triggered from one upstream to another.
99    pub fn record_fallback_attempt(
100        &self,
101        route: &str,
102        from_upstream: &str,
103        to_upstream: &str,
104        reason: &FallbackReason,
105    ) {
106        let reason_str = Self::reason_label(reason);
107        self.fallback_attempts
108            .with_label_values(&[route, from_upstream, to_upstream, reason_str])
109            .inc();
110    }
111
112    /// Record a successful response after fallback.
113    ///
114    /// Called when a request succeeds after being routed to a fallback upstream.
115    pub fn record_fallback_success(&self, route: &str, upstream: &str) {
116        self.fallback_success
117            .with_label_values(&[route, upstream])
118            .inc();
119    }
120
121    /// Record that all fallback upstreams were exhausted.
122    ///
123    /// Called when no more fallback upstreams are available and the request fails.
124    pub fn record_fallback_exhausted(&self, route: &str) {
125        self.fallback_exhausted.with_label_values(&[route]).inc();
126    }
127
128    /// Record model mapping applied during fallback.
129    pub fn record_model_mapping(&self, route: &str, original_model: &str, mapped_model: &str) {
130        self.model_mapping_applied
131            .with_label_values(&[route, original_model, mapped_model])
132            .inc();
133    }
134
135    /// Convert FallbackReason to a label string.
136    fn reason_label(reason: &FallbackReason) -> &'static str {
137        match reason {
138            FallbackReason::HealthCheckFailed => "health_check_failed",
139            FallbackReason::BudgetExhausted => "budget_exhausted",
140            FallbackReason::LatencyThreshold { .. } => "latency_threshold",
141            FallbackReason::ErrorCode(_) => "error_code",
142            FallbackReason::ConnectionError(_) => "connection_error",
143        }
144    }
145}
146
#[cfg(test)]
mod tests {
    use super::*;

    /// Each `FallbackReason` variant maps to its expected label string.
    #[test]
    fn test_reason_label() {
        let cases = [
            (FallbackReason::HealthCheckFailed, "health_check_failed"),
            (FallbackReason::BudgetExhausted, "budget_exhausted"),
            (
                FallbackReason::LatencyThreshold {
                    observed_ms: 5000,
                    threshold_ms: 3000,
                },
                "latency_threshold",
            ),
            (FallbackReason::ErrorCode(503), "error_code"),
            (
                FallbackReason::ConnectionError("timeout".to_string()),
                "connection_error",
            ),
        ];

        for (reason, expected) in &cases {
            assert_eq!(FallbackMetrics::reason_label(reason), *expected);
        }
    }
}