use anyhow::{Context, Result};
use once_cell::sync::OnceCell;
use prometheus::{register_int_counter_vec, IntCounterVec};
use std::sync::Arc;
use super::context::FallbackReason;
/// Process-wide cell holding the shared [`FallbackMetrics`] instance.
///
/// The cell starts out empty; it is populated by [`init_fallback_metrics`] and read by
/// [`get_fallback_metrics`].
static FALLBACK_METRICS: OnceCell<Arc<FallbackMetrics>> = OnceCell::new();
/// Returns a handle to the shared fallback metrics, if they have been stored.
///
/// Yields `None` while the global cell is still empty. The returned `Arc` is a new
/// reference to the stored instance, not a copy of the counters.
pub fn get_fallback_metrics() -> Option<Arc<FallbackMetrics>> {
    FALLBACK_METRICS.get().map(Arc::clone)
}
/// Initializes the process-wide fallback metrics, or returns the instance that is
/// already stored.
///
/// The first successful call builds a [`FallbackMetrics`] and stores it in the global
/// cell; every later call returns a new `Arc` handle to that same instance.
///
/// # Errors
///
/// Returns the error from [`FallbackMetrics::new`] if constructing the metrics fails.
/// In that case the global cell is left empty.
pub fn init_fallback_metrics() -> Result<Arc<FallbackMetrics>> {
    // `get_or_try_init` lets only one initializer run, so concurrent first callers cannot
    // both invoke `FallbackMetrics::new()` (the second registration of the same metric
    // names would be rejected as a duplicate), and every caller is handed the instance
    // that is actually stored in the cell.
    FALLBACK_METRICS
        .get_or_try_init(|| FallbackMetrics::new().map(Arc::new))
        .map(Arc::clone)
}
/// Prometheus counters describing fallback routing activity.
///
/// Every field is a labelled counter vector that is created and registered in
/// [`FallbackMetrics::new`].
pub struct FallbackMetrics {
/// `zentinel_fallback_attempts_total`, labelled by `route`, `from_upstream`,
/// `to_upstream` and `reason`.
fallback_attempts: IntCounterVec,
/// `zentinel_fallback_success_total`, labelled by `route` and `upstream`.
fallback_success: IntCounterVec,
/// `zentinel_fallback_exhausted_total`, labelled by `route`.
fallback_exhausted: IntCounterVec,
/// `zentinel_fallback_model_mapping_total`, labelled by `route`, `original_model`
/// and `mapped_model`.
model_mapping_applied: IntCounterVec,
}
impl FallbackMetrics {
    /// Creates the four fallback counter vectors, registering each one through
    /// `register_int_counter_vec!`.
    ///
    /// # Errors
    ///
    /// Returns the registration error of the first counter that fails to register,
    /// wrapped with a context message naming that metric.
    pub fn new() -> Result<Self> {
        // Field initializers are evaluated top to bottom, so the counters are registered
        // in declaration order and the first failure returns immediately.
        Ok(Self {
            fallback_attempts: register_int_counter_vec!(
                "zentinel_fallback_attempts_total",
                "Total number of fallback routing attempts",
                &["route", "from_upstream", "to_upstream", "reason"]
            )
            .context("Failed to register fallback_attempts metric")?,
            fallback_success: register_int_counter_vec!(
                "zentinel_fallback_success_total",
                "Successful responses after fallback routing",
                &["route", "upstream"]
            )
            .context("Failed to register fallback_success metric")?,
            fallback_exhausted: register_int_counter_vec!(
                "zentinel_fallback_exhausted_total",
                "Number of requests where all fallback upstreams were exhausted",
                &["route"]
            )
            .context("Failed to register fallback_exhausted metric")?,
            model_mapping_applied: register_int_counter_vec!(
                "zentinel_fallback_model_mapping_total",
                "Number of times model mapping was applied during fallback",
                &["route", "original_model", "mapped_model"]
            )
            .context("Failed to register fallback_model_mapping metric")?,
        })
    }

    /// Counts one fallback attempt on `route` from `from_upstream` to `to_upstream`.
    ///
    /// The `reason` label is the fixed string that `reason_label` yields for `reason`.
    pub fn record_fallback_attempt(
        &self,
        route: &str,
        from_upstream: &str,
        to_upstream: &str,
        reason: &FallbackReason,
    ) {
        let counter = self.fallback_attempts.with_label_values(&[
            route,
            from_upstream,
            to_upstream,
            Self::reason_label(reason),
        ]);
        counter.inc();
    }

    /// Counts one successful response on `route` that was served by `upstream` after
    /// fallback routing.
    pub fn record_fallback_success(&self, route: &str, upstream: &str) {
        let counter = self.fallback_success.with_label_values(&[route, upstream]);
        counter.inc();
    }

    /// Counts one request on `route` for which all fallback upstreams were exhausted.
    pub fn record_fallback_exhausted(&self, route: &str) {
        let counter = self.fallback_exhausted.with_label_values(&[route]);
        counter.inc();
    }

    /// Counts one application of a model mapping (`original_model` to `mapped_model`)
    /// on `route`.
    pub fn record_model_mapping(&self, route: &str, original_model: &str, mapped_model: &str) {
        let counter = self
            .model_mapping_applied
            .with_label_values(&[route, original_model, mapped_model]);
        counter.inc();
    }

    /// Maps a fallback reason to the static string used as the `reason` label value.
    ///
    /// Any payload carried by a variant is ignored; only the variant itself selects
    /// the label.
    fn reason_label(reason: &FallbackReason) -> &'static str {
        // Deliberately no catch-all arm: every variant is listed explicitly.
        match reason {
            FallbackReason::LatencyThreshold { .. } => "latency_threshold",
            FallbackReason::ErrorCode(_) => "error_code",
            FallbackReason::ConnectionError(_) => "connection_error",
            FallbackReason::HealthCheckFailed => "health_check_failed",
            FallbackReason::BudgetExhausted => "budget_exhausted",
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that each `FallbackReason` variant maps to its expected `reason` label.
    #[test]
    fn test_reason_label() {
        // One (input, expected label) pair per variant.
        let cases = [
            (FallbackReason::HealthCheckFailed, "health_check_failed"),
            (FallbackReason::BudgetExhausted, "budget_exhausted"),
            (
                FallbackReason::LatencyThreshold {
                    observed_ms: 5000,
                    threshold_ms: 3000,
                },
                "latency_threshold",
            ),
            (FallbackReason::ErrorCode(503), "error_code"),
            (
                FallbackReason::ConnectionError("timeout".to_string()),
                "connection_error",
            ),
        ];

        for (reason, expected) in &cases {
            assert_eq!(FallbackMetrics::reason_label(reason), *expected);
        }
    }
}