Skip to main content

envoy/
metrics.rs

1//! Prometheus metrics for envoy.
2//!
3//! Uses the `metrics` facade with `metrics-exporter-prometheus` for scraping.
4//! Exposes a `/metrics` endpoint for Prometheus to scrape.
5
6use std::sync::LazyLock;
7use std::time::Instant;
8
9use axum::body::Body;
10use axum::extract::Request;
11use axum::middleware::Next;
12use axum::response::{IntoResponse, Response};
13use metrics::{counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram};
14use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
15
/// Upper bounds, in milliseconds, of the request-duration histogram buckets.
///
/// Nine boundaries spanning 0.5 ms to 5 s:
/// 0.5, 1, 5, 10, 50, 100, 500, 1000 and 5000.
const REQUEST_DURATION_BUCKETS: &[f64] = &[
    0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0,
];
20
21/// Global Prometheus handle. Initialized once at startup.
22static HANDLE: LazyLock<PrometheusHandle> = LazyLock::new(|| {
23    let builder = PrometheusBuilder::new()
24        .set_buckets_for_metric(
25            Matcher::Full("envoy_request_duration_ms".to_string()),
26            REQUEST_DURATION_BUCKETS,
27        )
28        .expect("failed to set histogram buckets");
29    let handle = builder
30        .install_recorder()
31        .expect("failed to install Prometheus recorder");
32    describe_metrics();
33    handle
34});
35
36/// Describe all metrics so they appear in `/metrics` output with documentation.
37fn describe_metrics() {
38    describe_counter!(
39        "envoy_requests_total",
40        "Total HTTP requests processed, labeled by operation and status class"
41    );
42    describe_histogram!(
43        "envoy_request_duration_ms",
44        "Request latency in milliseconds, labeled by operation"
45    );
46    describe_gauge!("envoy_agents_online", "Number of currently active agents");
47    describe_gauge!(
48        "envoy_messages_pending",
49        "Number of undelivered messages in the store"
50    );
51    describe_gauge!(
52        "envoy_ws_connections",
53        "Number of active WebSocket connections"
54    );
55}
56
57/// Initialize the Prometheus metrics recorder and return the handle.
58///
59/// Call once at server startup. The `LazyLock` ensures this is safe to call
60/// multiple times — only the first call does real work.
61pub fn init() -> &'static PrometheusHandle {
62    &HANDLE
63}
64
65/// Render the Prometheus metrics text output.
66pub fn render() -> String {
67    HANDLE.render()
68}
69
70/// AXUM handler for `GET /metrics`.
71pub async fn metrics_endpoint() -> impl IntoResponse {
72    let body = render();
73    (
74        axum::http::StatusCode::OK,
75        [(
76            axum::http::header::CONTENT_TYPE,
77            "text/plain; version=0.0.4; charset=utf-8",
78        )],
79        body,
80    )
81}
82
83/// Middleware that records request count and latency for every HTTP request.
84pub async fn metrics_middleware(request: Request<Body>, next: Next) -> Response {
85    let start = Instant::now();
86    let method = request.method().clone();
87    let path = normalize_path(request.uri().path());
88
89    let response = next.run(request).await;
90
91    let latency_ms = start.elapsed().as_secs_f64() * 1000.0;
92    let status = response.status().as_u16();
93
94    record_request(&method, &path, status);
95    record_latency(&path, latency_ms);
96
97    response
98}
99
100/// Normalize a path into a stable label for metrics.
101///
102/// Collapses path segments that look like IDs (numeric or `id\d+` patterns)
103/// into a single `:id` placeholder to avoid cardinality explosion.
104fn normalize_path(path: &str) -> String {
105    let segments: Vec<&str> = path.split('/').collect();
106    let mut normalized = Vec::with_capacity(segments.len());
107    for seg in &segments {
108        if seg.is_empty() {
109            continue;
110        }
111        if is_id_segment(seg) {
112            normalized.push(":id");
113        } else {
114            normalized.push(seg);
115        }
116    }
117    if normalized.is_empty() {
118        "/".to_string()
119    } else {
120        format!("/{}", normalized.join("/"))
121    }
122}
123
/// Check whether a path segment looks like an identifier rather than a static
/// route name.
///
/// A segment is treated as an ID when it is one of:
///
/// * one or more ASCII digits, of any length (`42`, `0`);
/// * `id` immediately followed by an ASCII digit (`id1118`, `id1.2`);
/// * a UUID in 8-4-4-4-12 form: 36 bytes with `-` at offsets 8, 13, 18 and 23
///   and ASCII hex digits everywhere else.
///
/// Words that merely begin with `id` (`identity`, `ideas`) are not IDs, and
/// neither is the empty string.
fn is_id_segment(seg: &str) -> bool {
    let bytes = seg.as_bytes();

    // Digits only. Checking characters instead of parsing into an integer
    // keeps IDs that overflow a fixed-width integer recognizable.
    if !bytes.is_empty() && bytes.iter().all(u8::is_ascii_digit) {
        return true;
    }

    // `id` + digit. Requiring the digit is what keeps ordinary words that
    // start with "id" from being collapsed into the placeholder.
    if let Some(rest) = seg.strip_prefix("id") {
        if rest.starts_with(|c: char| c.is_ascii_digit()) {
            return true;
        }
    }

    // UUID: dashes exactly at the four separator offsets, hex digits elsewhere.
    bytes.len() == 36
        && bytes.iter().enumerate().all(|(offset, byte)| match offset {
            8 | 13 | 18 | 23 => *byte == b'-',
            _ => byte.is_ascii_hexdigit(),
        })
}
140
141/// Record a request counter.
142fn record_request(method: &axum::http::Method, path: &str, status: u16) {
143    let status_class = match status {
144        200..=299 => "2xx",
145        300..=399 => "3xx",
146        400..=499 => "4xx",
147        500..=599 => "5xx",
148        _ => "other",
149    };
150    counter!(
151        "envoy_requests_total",
152        "method" => method.to_string(),
153        "path" => path.to_string(),
154        "status" => status_class.to_string()
155    )
156    .increment(1);
157}
158
159/// Record request latency.
160fn record_latency(path: &str, latency_ms: f64) {
161    histogram!("envoy_request_duration_ms", "path" => path.to_string()).record(latency_ms);
162}
163
164/// Update the agents-online gauge. Call after agent register/disconnect/retire.
165pub fn set_agents_online(count: usize) {
166    gauge!("envoy_agents_online").set(count as f64);
167}
168
169/// Update the pending-messages gauge. Call after send/poll/ack.
170pub fn set_messages_pending(count: usize) {
171    gauge!("envoy_messages_pending").set(count as f64);
172}
173
174/// Update the WebSocket connections gauge.
175pub fn set_ws_connections(count: usize) {
176    gauge!("envoy_ws_connections").set(count as f64);
177}
178
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that each `(input, expected)` pair holds for `normalize_path`.
    fn check_paths(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(normalize_path(input), expected, "input path: {input}");
        }
    }

    /// Assert that `is_id_segment` returns `expected` for every segment.
    fn check_segments(expected: bool, segments: &[&str]) {
        for &segment in segments {
            assert_eq!(is_id_segment(segment), expected, "segment: {segment}");
        }
    }

    #[test]
    fn test_normalize_path_root() {
        check_paths(&[("/", "/")]);
    }

    #[test]
    fn test_normalize_path_static() {
        check_paths(&[
            ("/agents", "/agents"),
            ("/health", "/health"),
            ("/atheneum/events", "/atheneum/events"),
        ]);
    }

    #[test]
    fn test_normalize_path_numeric_id() {
        check_paths(&[
            ("/agents/42", "/agents/:id"),
            ("/messages/123", "/messages/:id"),
        ]);
    }

    #[test]
    fn test_normalize_path_named_id() {
        check_paths(&[
            ("/agents/id1118", "/agents/:id"),
            ("/agents/id1.2", "/agents/:id"),
        ]);
    }

    #[test]
    fn test_normalize_path_mixed() {
        check_paths(&[(
            "/agents/id5/messages/42/ack",
            "/agents/:id/messages/:id/ack",
        )]);
    }

    #[test]
    fn test_is_id_segment() {
        check_segments(true, &["42", "0", "id1118", "id1"]);
        check_segments(false, &["agents", "health", "ack"]);
    }

    #[test]
    fn test_is_id_segment_uuid() {
        check_segments(
            true,
            &[
                "338b8adc-6c08-4664-af1d-69300e7c576a",
                "550e8400-e29b-41d4-a716-446655440000",
            ],
        );
        check_segments(false, &["not-a-uuid", "short"]);
    }

    #[test]
    fn test_normalize_path_uuid() {
        check_paths(&[
            (
                "/atheneum/sessions/338b8adc-6c08-4664-af1d-69300e7c576a",
                "/atheneum/sessions/:id",
            ),
            (
                "/atheneum/sessions/338b8adc-6c08-4664-af1d-69300e7c576a/handover",
                "/atheneum/sessions/:id/handover",
            ),
        ]);
    }
}