use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use axum::Router;
use axum::extract::State;
use axum::routing::get;
use prometheus::{
Counter, CounterVec, Encoder, Gauge, HistogramOpts, HistogramVec, Opts, Registry, TextEncoder,
};
use crate::report::Report;
/// Exemplar payload remembered for a metric line.
///
/// The stored trace id is kept verbatim; it is passed through
/// `sanitize_exemplar_value` only when the exposition text is rendered.
#[derive(Debug, Clone)]
struct ExemplarData {
/// Trace identifier cloned from the finding that produced this exemplar.
trace_id: String,
}
/// Reduces `value` to a form that is safe to embed inside an exemplar label.
///
/// Only ASCII letters, ASCII digits, `-` and `_` survive; every other
/// character is dropped (not replaced). At most 64 surviving characters are
/// returned, so quotes, braces, spaces and newlines can never reach the
/// exposition text.
fn sanitize_exemplar_value(value: &str) -> String {
    const MAX_LEN: usize = 64;

    let mut cleaned = String::new();
    for ch in value.chars() {
        // Every kept character is ASCII, so byte length equals character count.
        if cleaned.len() == MAX_LEN {
            break;
        }
        if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') {
            cleaned.push(ch);
        }
    }
    cleaned
}
/// Metric handles and exemplar state exposed on the `/metrics` endpoint.
///
/// All metrics are created and registered in `MetricsState::new`. The two
/// exemplar fields sit behind `Arc<RwLock<..>>`, so clones of this struct
/// share the same exemplar storage.
#[derive(Clone)]
pub struct MetricsState {
/// Registry that every metric below is registered in; `render` gathers from it.
registry: Registry,
/// Total findings detected, labelled by `type` and `severity`.
pub findings_total: CounterVec,
/// Cumulative I/O waste ratio since daemon start.
pub io_waste_ratio: Gauge,
/// Total traces analyzed since start.
pub traces_analyzed_total: Counter,
/// Total events processed since start.
pub events_processed_total: Counter,
/// Currently active traces in the sliding window.
pub active_traces: Gauge,
/// Cumulative total I/O ops processed.
pub total_io_ops: Counter,
/// Cumulative avoidable I/O ops detected.
pub avoidable_io_ops: Counter,
/// Cumulative I/O ops attributed to each service, labelled by `service`.
pub service_io_ops_total: CounterVec,
/// Age in seconds since the last successful Scaphandre scrape.
pub scaphandre_last_scrape_age_seconds: Gauge,
/// Age in seconds since the last successful cloud energy scrape.
pub cloud_energy_last_scrape_age_seconds: Gauge,
/// Duration of spans exceeding the slow threshold, labelled by `type`.
pub slow_duration_seconds: HistogramVec,
/// Exemplar per `(finding type, severity)` key. `record_exemplars` overwrites
/// an existing entry, so the most recently recorded finding for a key wins.
worst_finding_trace: Arc<RwLock<HashMap<(&'static str, &'static str), ExemplarData>>>,
/// Exemplar for the waste-ratio gauge: the avoidable-I/O finding with the
/// highest pattern occurrence count from the last batch whose waste ratio
/// was positive (see `record_exemplars`).
worst_waste_trace: Arc<RwLock<Option<ExemplarData>>>,
}
impl MetricsState {
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn new() -> Self {
let registry = Registry::new();
let findings_total = CounterVec::new(
Opts::new(
"perf_sentinel_findings_total",
"Total findings detected by type and severity",
),
&["type", "severity"],
)
.expect("metric creation should not fail");
let io_waste_ratio = Gauge::new(
"perf_sentinel_io_waste_ratio",
"Cumulative I/O waste ratio since daemon start",
)
.expect("metric creation should not fail");
let traces_analyzed_total = Counter::new(
"perf_sentinel_traces_analyzed_total",
"Total traces analyzed since start",
)
.expect("metric creation should not fail");
let events_processed_total = Counter::new(
"perf_sentinel_events_processed_total",
"Total events processed since start",
)
.expect("metric creation should not fail");
let active_traces = Gauge::new(
"perf_sentinel_active_traces",
"Currently active traces in the sliding window",
)
.expect("metric creation should not fail");
let total_io_ops = Counter::new(
"perf_sentinel_total_io_ops",
"Cumulative total I/O ops processed",
)
.expect("metric creation should not fail");
let avoidable_io_ops = Counter::new(
"perf_sentinel_avoidable_io_ops",
"Cumulative avoidable I/O ops detected",
)
.expect("metric creation should not fail");
let service_io_ops_total = CounterVec::new(
Opts::new(
"perf_sentinel_service_io_ops_total",
"Cumulative I/O ops attributed to each service",
),
&["service"],
)
.expect("metric creation should not fail");
let scaphandre_last_scrape_age_seconds = Gauge::new(
"perf_sentinel_scaphandre_last_scrape_age_seconds",
"Age in seconds since the last successful Scaphandre scrape",
)
.expect("metric creation should not fail");
registry
.register(Box::new(findings_total.clone()))
.expect("registration should not fail");
registry
.register(Box::new(io_waste_ratio.clone()))
.expect("registration should not fail");
registry
.register(Box::new(traces_analyzed_total.clone()))
.expect("registration should not fail");
registry
.register(Box::new(events_processed_total.clone()))
.expect("registration should not fail");
registry
.register(Box::new(active_traces.clone()))
.expect("registration should not fail");
registry
.register(Box::new(total_io_ops.clone()))
.expect("registration should not fail");
registry
.register(Box::new(avoidable_io_ops.clone()))
.expect("registration should not fail");
registry
.register(Box::new(service_io_ops_total.clone()))
.expect("registration should not fail");
registry
.register(Box::new(scaphandre_last_scrape_age_seconds.clone()))
.expect("registration should not fail");
let slow_duration_seconds = HistogramVec::new(
HistogramOpts::new(
"perf_sentinel_slow_duration_seconds",
"Duration of spans exceeding the slow threshold",
)
.buckets(vec![
0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 3.0, 5.0, 10.0, 30.0,
]),
&["type"],
)
.expect("metric creation should not fail");
registry
.register(Box::new(slow_duration_seconds.clone()))
.expect("registration should not fail");
let cloud_energy_last_scrape_age_seconds = Gauge::new(
"perf_sentinel_cloud_energy_last_scrape_age_seconds",
"Age in seconds since the last successful cloud energy scrape",
)
.expect("metric creation should not fail");
registry
.register(Box::new(cloud_energy_last_scrape_age_seconds.clone()))
.expect("registration should not fail");
Self {
registry,
findings_total,
io_waste_ratio,
traces_analyzed_total,
events_processed_total,
active_traces,
total_io_ops,
avoidable_io_ops,
service_io_ops_total,
scaphandre_last_scrape_age_seconds,
cloud_energy_last_scrape_age_seconds,
slow_duration_seconds,
worst_finding_trace: Arc::new(RwLock::new(HashMap::new())),
worst_waste_trace: Arc::new(RwLock::new(None)),
}
}
/// Returns the current per-service I/O op counts as whole numbers.
///
/// Each sample of `service_io_ops_total` is keyed by the value of its
/// `service` label. The floating-point counter value is clamped to the
/// `u64` range (non-positive values become `0`) and truncated. Samples
/// without a `service` label are skipped.
#[must_use]
pub fn snapshot_service_io_ops(&self) -> HashMap<String, u64> {
    use prometheus::core::Collector;

    let families = Collector::collect(&self.service_io_ops_total);
    let mut snapshot = HashMap::new();

    for sample in families.iter().flat_map(|family| family.get_metric()) {
        let raw = sample.get_counter().value();
        // Explicit clamping keeps the float-to-integer conversion well defined
        // at both ends of the range.
        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
        let ops = if raw <= 0.0 {
            0u64
        } else if raw >= u64::MAX as f64 {
            u64::MAX
        } else {
            raw as u64
        };

        let service_label = sample
            .get_label()
            .iter()
            .find(|label| label.name() == "service");
        if let Some(label) = service_label {
            snapshot.insert(label.value().to_string(), ops);
        }
    }

    snapshot
}
/// Folds one analysis report into the cumulative metrics.
///
/// Adds the report's trace, event and I/O counts to their counters,
/// recomputes the cumulative waste ratio (left untouched while no I/O op
/// has been seen), bumps `findings_total` once per finding, and finally
/// refreshes the exemplars via `record_exemplars`.
pub fn record_batch(&self, report: &Report) {
    let analysis = &report.analysis;
    let green = &report.green_summary;

    self.traces_analyzed_total
        .inc_by(analysis.traces_analyzed as f64);
    self.events_processed_total
        .inc_by(analysis.events_processed as f64);
    self.total_io_ops.inc_by(green.total_io_ops as f64);
    self.avoidable_io_ops.inc_by(green.avoidable_io_ops as f64);

    // The ratio is derived from the cumulative counters, not from this batch
    // alone; skip it while the denominator is still zero.
    let io_total = self.total_io_ops.get();
    if io_total > 0.0 {
        let ratio = self.avoidable_io_ops.get() / io_total;
        self.io_waste_ratio.set(ratio);
    }

    report.findings.iter().for_each(|finding| {
        let labels = [finding.finding_type.as_str(), finding.severity.as_str()];
        self.findings_total.with_label_values(&labels).inc();
    });

    self.record_exemplars(&report.findings, green);
}
/// Updates the stored exemplars from one batch of findings.
///
/// * Per `(finding type, severity)` key, the trace id of the **last**
///   finding in `findings` with that key replaces whatever was stored.
/// * When `green_summary.io_waste_ratio` is positive, the avoidable-I/O
///   finding with the highest `pattern.occurrences` becomes the exemplar
///   for the waste-ratio gauge. If there is no such finding, the previous
///   exemplar is kept.
///
/// A poisoned lock is recovered rather than propagated.
pub fn record_exemplars(
    &self,
    findings: &[crate::detect::Finding],
    green_summary: &crate::report::GreenSummary,
) {
    // Collecting into a map keeps the last entry for duplicate keys.
    let latest_by_key: HashMap<(&'static str, &'static str), ExemplarData> = findings
        .iter()
        .map(|finding| {
            let key = (finding.finding_type.as_str(), finding.severity.as_str());
            let exemplar = ExemplarData {
                trace_id: finding.trace_id.clone(),
            };
            (key, exemplar)
        })
        .collect();

    if !latest_by_key.is_empty() {
        self.worst_finding_trace
            .write()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .extend(latest_by_key);
    }

    if green_summary.io_waste_ratio > 0.0 {
        let heaviest = findings
            .iter()
            .filter(|finding| finding.finding_type.is_avoidable_io())
            .max_by_key(|finding| finding.pattern.occurrences);

        if let Some(finding) = heaviest {
            let mut slot = self
                .worst_waste_trace
                .write()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            *slot = Some(ExemplarData {
                trace_id: finding.trace_id.clone(),
            });
        }
    }
}
/// Reports whether any exemplar (per-finding or waste-ratio) is stored.
///
/// A poisoned lock is recovered rather than propagated.
#[must_use]
pub fn has_exemplars(&self) -> bool {
    let per_finding = self
        .worst_finding_trace
        .read()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    if per_finding.is_empty() {
        // Nothing per finding: the waste-ratio exemplar decides.
        self.worst_waste_trace
            .read()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .is_some()
    } else {
        true
    }
}
/// Renders all registered metrics as exposition text, with exemplars added.
///
/// If encoding fails or the encoder produces invalid UTF-8, the single
/// comment line `# error encoding metrics` is returned instead.
#[must_use]
pub fn render(&self) -> String {
    const ENCODE_ERROR: &str = "# error encoding metrics\n";

    let encoder = TextEncoder::new();
    let families = self.registry.gather();
    let mut bytes = Vec::new();

    if encoder.encode(&families, &mut bytes).is_err() {
        return ENCODE_ERROR.to_string();
    }

    match String::from_utf8(bytes) {
        Ok(text) => self.inject_exemplars(text),
        Err(_) => ENCODE_ERROR.to_string(),
    }
}
/// Appends exemplars to the matching sample lines of `base`.
///
/// When no exemplar is stored, `base` is returned untouched (plain
/// Prometheus text). Otherwise the result is what `content_type` advertises
/// as OpenMetrics, so it has to follow that format's rules:
///
/// * an exemplar is ` # {labels} value`; the value is mandatory, so
///   one is always written after the label set;
/// * the exposition must end with a `# EOF` line.
///
/// Exemplar values:
/// * `perf_sentinel_findings_total{..}` lines get `1`, because
///   `record_batch` increments that counter by exactly one per finding;
/// * the `perf_sentinel_io_waste_ratio` line reuses its own sample value.
///
/// Trace ids are passed through `sanitize_exemplar_value` before being
/// written. A poisoned lock is recovered rather than propagated.
fn inject_exemplars(&self, base: String) -> String {
    use std::fmt::Write;

    const FINDINGS_PREFIX: &str = "perf_sentinel_findings_total{";
    const WASTE_PREFIX: &str = "perf_sentinel_io_waste_ratio ";

    let finding_map = self
        .worst_finding_trace
        .read()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    let waste_exemplar = self
        .worst_waste_trace
        .read()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    if finding_map.is_empty() && waste_exemplar.is_none() {
        return base;
    }

    let mut output = String::with_capacity(base.len() + 256);
    for line in base.lines() {
        output.push_str(line);

        if line.starts_with(FINDINGS_PREFIX) {
            if let Some(exemplar) = extract_finding_exemplar(line, &finding_map) {
                let sanitized = sanitize_exemplar_value(&exemplar.trace_id);
                // Writing to a `String` cannot fail.
                let _ = write!(output, " # {{trace_id=\"{sanitized}\"}} 1");
            }
        } else if let Some(sample) = line.strip_prefix(WASTE_PREFIX) {
            // The first token after the metric name is the sample value; a
            // trailing timestamp, if present, is not part of it.
            let sample_value = sample.split(' ').next().filter(|token| !token.is_empty());
            if let (Some(exemplar), Some(value)) = (waste_exemplar.as_ref(), sample_value) {
                let sanitized = sanitize_exemplar_value(&exemplar.trace_id);
                let _ = write!(output, " # {{trace_id=\"{sanitized}\"}} {value}");
            }
        }

        output.push('\n');
    }

    // Mandatory OpenMetrics terminator; nothing may follow it.
    output.push_str("# EOF\n");
    output
}
/// Returns the `Content-Type` matching what `render` currently produces.
///
/// The OpenMetrics media type is used as soon as any exemplar is stored;
/// otherwise the classic Prometheus text media type is returned.
#[must_use]
pub fn content_type(&self) -> &'static str {
    const OPENMETRICS: &str = "application/openmetrics-text; version=1.0.0; charset=utf-8";
    const PROMETHEUS_TEXT: &str = "text/plain; version=0.0.4; charset=utf-8";

    if !self.has_exemplars() {
        return PROMETHEUS_TEXT;
    }
    OPENMETRICS
}
}
/// Looks up the exemplar for a `perf_sentinel_findings_total{..}` sample line.
///
/// The label set is taken from between the first `{` and the first `}` that
/// follows it, split on `,`, and scanned for `type="…"` and `severity="…"`.
/// Returns `None` when the braces or either label are missing, or when `map`
/// holds no entry for that `(type, severity)` pair.
///
/// The closing brace is searched for only after the opening one, so a line
/// whose first `}` precedes its first `{` yields `None` instead of panicking
/// on an inverted slice range.
///
/// Label values containing `,` or `}` are not supported by this parser.
fn extract_finding_exemplar<'a>(
    line: &str,
    map: &'a HashMap<(&'static str, &'static str), ExemplarData>,
) -> Option<&'a ExemplarData> {
    let (_, after_open) = line.split_once('{')?;
    let (labels_str, _) = after_open.split_once('}')?;

    let mut finding_type = None;
    let mut severity = None;
    for part in labels_str.split(',').map(str::trim) {
        if let Some(val) = part.strip_prefix("type=\"") {
            finding_type = val.strip_suffix('"');
        } else if let Some(val) = part.strip_prefix("severity=\"") {
            severity = val.strip_suffix('"');
        }
    }
    let ft = finding_type?;
    let sev = severity?;

    // The keys are `&'static str` pairs while `ft`/`sev` borrow from `line`,
    // so compare entry by entry instead of building a lookup key.
    map.iter()
        .find_map(|(&(kind, level), exemplar)| (kind == ft && level == sev).then_some(exemplar))
}
/// `Default` simply delegates to `MetricsState::new`, so a default value is a
/// fully registered metric set with empty exemplar stores.
impl Default for MetricsState {
fn default() -> Self {
Self::new()
}
}
/// Builds a router that serves the rendered metrics on `GET /metrics`.
///
/// The response carries the `Content-Type` reported by
/// `MetricsState::content_type` and the body produced by
/// `MetricsState::render`.
///
/// NOTE(review): the content type and the body are computed by two separate
/// calls, each taking the exemplar locks on its own. A batch recorded between
/// them could pair a header with a body of the other format; confirm whether
/// that window matters for the scrapers in use.
pub fn metrics_route(state: Arc<MetricsState>) -> Router {
    /// Scrape handler: header first, then body, both from the shared state.
    async fn handle_metrics(
        State(metrics): State<Arc<MetricsState>>,
    ) -> ([(axum::http::header::HeaderName, &'static str); 1], String) {
        let header = (axum::http::header::CONTENT_TYPE, metrics.content_type());
        let body = metrics.render();
        ([header], body)
    }

    let router = Router::new().route("/metrics", get(handle_metrics));
    router.with_state(state)
}
#[cfg(test)]
mod tests {
    //! Unit tests for metric registration, batch recording, exemplar handling
    //! and the `/metrics` route.

    use super::*;
    use crate::detect::{Confidence, Finding, FindingType, GreenImpact, Pattern, Severity};
    use crate::report::{Analysis, GreenSummary, QualityGate, Report};

    /// Builds a report with fixed analysis counts, 10 total I/O ops and an
    /// avoidable count derived from `waste_ratio`.
    #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
    fn make_test_report(findings: Vec<Finding>, waste_ratio: f64) -> Report {
        let total = 10;
        let avoidable = (total as f64 * waste_ratio) as usize;
        Report {
            analysis: Analysis {
                duration_ms: 1,
                events_processed: 100,
                traces_analyzed: 2,
            },
            findings,
            green_summary: GreenSummary {
                total_io_ops: total,
                avoidable_io_ops: avoidable,
                io_waste_ratio: waste_ratio,
                io_waste_ratio_band: crate::report::interpret::InterpretationLevel::for_waste_ratio(
                    waste_ratio,
                ),
                top_offenders: vec![],
                co2: None,
                regions: vec![],
                transport_gco2: None,
            },
            quality_gate: QualityGate {
                passed: true,
                rules: vec![],
            },
            per_endpoint_io_ops: vec![],
        }
    }

    /// Builds a finding with the given type, severity, trace id and pattern
    /// occurrence count; every other field uses a fixed placeholder.
    fn make_finding(
        finding_type: FindingType,
        severity: Severity,
        trace_id: &str,
        occurrences: usize,
    ) -> Finding {
        Finding {
            finding_type,
            severity,
            trace_id: trace_id.to_string(),
            service: "order-svc".to_string(),
            source_endpoint: "POST /api/orders/42/submit".to_string(),
            pattern: Pattern {
                template: "SELECT * FROM t WHERE id = ?".to_string(),
                occurrences,
                window_ms: 200,
                distinct_params: occurrences,
            },
            suggestion: "batch".to_string(),
            first_timestamp: "2025-07-10T14:32:01.000Z".to_string(),
            last_timestamp: "2025-07-10T14:32:01.250Z".to_string(),
            green_impact: Some(GreenImpact {
                estimated_extra_io_ops: occurrences.saturating_sub(1),
                io_intensity_score: 6.0,
                io_intensity_band: crate::report::interpret::InterpretationLevel::for_iis(6.0),
            }),
            confidence: Confidence::default(),
            code_location: None,
            suggested_fix: None,
        }
    }

    #[test]
    fn default_creates_same_as_new() {
        let state = MetricsState::default();
        state.events_processed_total.inc();
        let output = state.render();
        assert!(output.contains("perf_sentinel_events_processed_total"));
    }

    #[tokio::test]
    async fn metrics_route_returns_prometheus_output() {
        use axum::body::Body;
        use axum::http::{Request, StatusCode};
        use http_body_util::BodyExt;
        use tower::ServiceExt;

        let state = Arc::new(MetricsState::new());
        state.traces_analyzed_total.inc_by(42.0);
        state.io_waste_ratio.set(0.25);
        state
            .findings_total
            .with_label_values(&["n_plus_one_sql", "warning"])
            .inc();

        let router = metrics_route(state);
        let request = Request::builder()
            .uri("/metrics")
            .body(Body::empty())
            .unwrap();
        let response = router.oneshot(request).await.unwrap();
        assert_eq!(response.status(), StatusCode::OK);

        let content_type = response
            .headers()
            .get("content-type")
            .expect("should have content-type header")
            .to_str()
            .unwrap();
        assert!(
            content_type.contains("text/plain"),
            "Content-Type should be text/plain, got: {content_type}"
        );

        let body = response.into_body().collect().await.unwrap().to_bytes();
        let body_str = String::from_utf8(body.to_vec()).unwrap();
        assert!(
            body_str.contains("perf_sentinel_traces_analyzed_total 42"),
            "should contain traces count, got: {body_str}"
        );
        assert!(
            body_str.contains("perf_sentinel_io_waste_ratio 0.25"),
            "should contain waste ratio"
        );
        assert!(
            body_str.contains("n_plus_one_sql"),
            "should contain finding type label"
        );
    }

    #[test]
    fn metrics_state_creates_successfully() {
        let state = MetricsState::new();
        // Touch one label combination so the counter vector is rendered at all.
        state
            .findings_total
            .with_label_values(&["test", "test"])
            .inc_by(0.0);
        let output = state.render();
        assert!(
            output.contains("perf_sentinel_findings_total"),
            "output: {output}"
        );
        assert!(output.contains("perf_sentinel_io_waste_ratio"));
        assert!(output.contains("perf_sentinel_traces_analyzed_total"));
        assert!(output.contains("perf_sentinel_events_processed_total"));
        assert!(output.contains("perf_sentinel_active_traces"));
    }

    #[test]
    fn increment_findings_counter() {
        let state = MetricsState::new();
        state
            .findings_total
            .with_label_values(&["n_plus_one_sql", "critical"])
            .inc();
        state
            .findings_total
            .with_label_values(&["n_plus_one_sql", "critical"])
            .inc();
        let output = state.render();
        assert!(output.contains(r#"type="n_plus_one_sql""#));
        assert!(output.contains(r#"severity="critical""#));
    }

    #[test]
    fn set_gauge_values() {
        let state = MetricsState::new();
        state.io_waste_ratio.set(0.42);
        state.active_traces.set(5.0);
        let output = state.render();
        // Assert on whole sample lines so an unrelated "0.42" cannot satisfy
        // the test, and so both gauges are actually checked.
        assert!(
            output.contains("perf_sentinel_io_waste_ratio 0.42"),
            "waste ratio gauge should be rendered, got: {output}"
        );
        assert!(
            output.contains("perf_sentinel_active_traces 5"),
            "active traces gauge should be rendered, got: {output}"
        );
    }

    #[test]
    fn increment_counters() {
        let state = MetricsState::new();
        state.traces_analyzed_total.inc_by(10.0);
        state.events_processed_total.inc_by(100.0);
        let output = state.render();
        // Assert on whole sample lines so both counters are checked by name.
        assert!(
            output.contains("perf_sentinel_traces_analyzed_total 10"),
            "traces counter should be rendered, got: {output}"
        );
        assert!(
            output.contains("perf_sentinel_events_processed_total 100"),
            "events counter should be rendered, got: {output}"
        );
    }

    #[test]
    fn record_batch_tracks_worst_finding_trace() {
        let state = MetricsState::new();
        let report = make_test_report(
            vec![make_finding(
                FindingType::NPlusOneSql,
                Severity::Critical,
                "trace-abc",
                10,
            )],
            0.5,
        );
        state.record_batch(&report);
        let map = state.worst_finding_trace.read().unwrap();
        assert_eq!(
            map.get(&("n_plus_one_sql", "critical")).unwrap().trace_id,
            "trace-abc"
        );
    }

    #[test]
    fn record_batch_tracks_worst_waste_trace() {
        let state = MetricsState::new();
        let report = make_test_report(
            vec![make_finding(
                FindingType::NPlusOneSql,
                Severity::Warning,
                "trace-waste",
                8,
            )],
            0.4,
        );
        state.record_batch(&report);
        let waste = state.worst_waste_trace.read().unwrap();
        assert_eq!(waste.as_ref().unwrap().trace_id, "trace-waste");
    }

    #[test]
    fn render_includes_exemplar_annotation() {
        let state = MetricsState::new();
        let report = make_test_report(
            vec![make_finding(
                FindingType::NPlusOneSql,
                Severity::Warning,
                "trace-exemplar",
                6,
            )],
            0.3,
        );
        state.record_batch(&report);
        let output = state.render();
        assert!(
            output.contains(r#"# {trace_id="trace-exemplar"}"#),
            "should contain exemplar annotation, got: {output}"
        );
    }

    #[test]
    fn render_no_exemplar_when_no_data() {
        let state = MetricsState::new();
        state.traces_analyzed_total.inc();
        state
            .findings_total
            .with_label_values(&["n_plus_one_sql", "warning"])
            .inc();
        let output = state.render();
        assert!(
            !output.contains("# {trace_id="),
            "should not contain exemplar when no record_batch called"
        );
    }

    #[test]
    fn exemplar_on_io_waste_ratio() {
        let state = MetricsState::new();
        let report = make_test_report(
            vec![make_finding(
                FindingType::RedundantSql,
                Severity::Warning,
                "trace-waste-ratio",
                4,
            )],
            0.5,
        );
        state.record_batch(&report);
        let output = state.render();
        // Require the sample line to exist; otherwise a missing line would let
        // the test pass without checking anything.
        let waste_line = output
            .lines()
            .find(|line| line.starts_with("perf_sentinel_io_waste_ratio "))
            .expect("waste ratio sample line should be rendered");
        assert!(
            waste_line.contains(r#"# {trace_id="trace-waste-ratio"}"#),
            "waste ratio line should have exemplar: {waste_line}"
        );
    }

    #[test]
    fn content_type_is_openmetrics_with_exemplars() {
        let state = MetricsState::new();
        assert_eq!(
            state.content_type(),
            "text/plain; version=0.0.4; charset=utf-8"
        );
        let report = make_test_report(
            vec![make_finding(
                FindingType::NPlusOneSql,
                Severity::Warning,
                "trace-1",
                5,
            )],
            0.0,
        );
        state.record_batch(&report);
        assert_eq!(
            state.content_type(),
            "application/openmetrics-text; version=1.0.0; charset=utf-8"
        );
    }

    #[test]
    fn multiple_batches_update_exemplars() {
        let state = MetricsState::new();
        let report1 = make_test_report(
            vec![make_finding(
                FindingType::NPlusOneSql,
                Severity::Warning,
                "trace-old",
                5,
            )],
            0.3,
        );
        state.record_batch(&report1);
        let report2 = make_test_report(
            vec![make_finding(
                FindingType::NPlusOneSql,
                Severity::Warning,
                "trace-new",
                10,
            )],
            0.5,
        );
        state.record_batch(&report2);
        let map = state.worst_finding_trace.read().unwrap();
        assert_eq!(
            map.get(&("n_plus_one_sql", "warning")).unwrap().trace_id,
            "trace-new",
            "should update to latest batch's worst finding"
        );
    }

    #[tokio::test]
    async fn metrics_route_returns_openmetrics_with_exemplars() {
        use axum::body::Body;
        use axum::http::{Request, StatusCode};
        use http_body_util::BodyExt;
        use tower::ServiceExt;

        let state = Arc::new(MetricsState::new());
        let report = make_test_report(
            vec![make_finding(
                FindingType::NPlusOneSql,
                Severity::Warning,
                "trace-route-test",
                5,
            )],
            0.0,
        );
        state.record_batch(&report);

        let router = metrics_route(state);
        let request = Request::builder()
            .uri("/metrics")
            .body(Body::empty())
            .unwrap();
        let response = router.oneshot(request).await.unwrap();
        assert_eq!(response.status(), StatusCode::OK);

        let content_type = response
            .headers()
            .get("content-type")
            .unwrap()
            .to_str()
            .unwrap();
        assert!(
            content_type.contains("openmetrics"),
            "should use OpenMetrics content type: {content_type}"
        );

        let body = response.into_body().collect().await.unwrap().to_bytes();
        let body_str = String::from_utf8(body.to_vec()).unwrap();
        assert!(
            body_str.contains(r#"trace_id="trace-route-test""#),
            "should contain exemplar trace_id"
        );
    }

    #[test]
    fn sanitize_exemplar_value_strips_dangerous_chars() {
        assert_eq!(sanitize_exemplar_value("abc-123_def"), "abc-123_def");
        assert_eq!(
            sanitize_exemplar_value("evil\"} 999\nfake_metric"),
            "evil999fake_metric"
        );
        assert_eq!(sanitize_exemplar_value(""), "");
        let long = "a".repeat(100);
        assert_eq!(sanitize_exemplar_value(&long).len(), 64);
    }

    #[test]
    fn exemplar_with_malicious_trace_id_is_sanitized() {
        let state = MetricsState::new();
        let report = make_test_report(
            vec![make_finding(
                FindingType::NPlusOneSql,
                Severity::Warning,
                "evil\"} 999\nmy_fake_metric",
                5,
            )],
            0.0,
        );
        state.record_batch(&report);
        let output = state.render();
        assert!(
            !output.contains("evil\""),
            "malicious trace_id should be sanitized"
        );
        assert!(output.contains("evil999my_fake_metric"));
    }

    #[test]
    fn prometheus_output_format_matches_expected_prefixes() {
        // `inject_exemplars` matches lines by these exact prefixes, so pin them.
        let state = MetricsState::new();
        state
            .findings_total
            .with_label_values(&["n_plus_one_sql", "warning"])
            .inc();
        state.io_waste_ratio.set(0.5);
        let output = state.render();
        let has_findings_prefix = output
            .lines()
            .any(|l| l.starts_with("perf_sentinel_findings_total{"));
        assert!(
            has_findings_prefix,
            "prometheus output must contain lines starting with 'perf_sentinel_findings_total{{': {output}"
        );
        let has_waste_prefix = output
            .lines()
            .any(|l| l.starts_with("perf_sentinel_io_waste_ratio "));
        assert!(
            has_waste_prefix,
            "prometheus output must contain lines starting with 'perf_sentinel_io_waste_ratio ': {output}"
        );
    }
}