Skip to main content

agentkernel/
metrics.rs

1//! Prometheus metrics for the agentkernel HTTP API and sandbox lifecycle.
2//!
3//! All metrics use a dedicated Registry (not the global default) to avoid
4//! collisions with any dependency that might also use the prometheus crate.
5
6use prometheus::{
7    HistogramOpts, HistogramVec, IntCounterVec, IntGauge, IntGaugeVec, Registry, TextEncoder,
8};
9use std::sync::LazyLock;
10
11static REGISTRY: LazyLock<Registry> = LazyLock::new(Registry::new);
12
13// ---- HTTP request metrics ----
14
15static HTTP_REQUESTS_TOTAL: LazyLock<IntCounterVec> = LazyLock::new(|| {
16    let c = IntCounterVec::new(
17        prometheus::opts!("agentkernel_http_requests_total", "Total HTTP API requests"),
18        &["method", "path", "status"],
19    )
20    .expect("metric can be created");
21    REGISTRY
22        .register(Box::new(c.clone()))
23        .expect("metric can be registered");
24    c
25});
26
27static HTTP_REQUEST_DURATION_SECONDS: LazyLock<HistogramVec> = LazyLock::new(|| {
28    let h = HistogramVec::new(
29        HistogramOpts::new(
30            "agentkernel_http_request_duration_seconds",
31            "HTTP request latency in seconds",
32        )
33        .buckets(vec![
34            0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
35        ]),
36        &["method", "path"],
37    )
38    .expect("metric can be created");
39    REGISTRY
40        .register(Box::new(h.clone()))
41        .expect("metric can be registered");
42    h
43});
44
45// ---- Sandbox lifecycle metrics ----
46
47static SANDBOX_LIFECYCLE_TOTAL: LazyLock<IntCounterVec> = LazyLock::new(|| {
48    let c = IntCounterVec::new(
49        prometheus::opts!(
50            "agentkernel_sandbox_lifecycle_total",
51            "Sandbox lifecycle events"
52        ),
53        &["action", "backend"],
54    )
55    .expect("metric can be created");
56    REGISTRY
57        .register(Box::new(c.clone()))
58        .expect("metric can be registered");
59    c
60});
61
62static SANDBOX_LIFECYCLE_DURATION_SECONDS: LazyLock<HistogramVec> = LazyLock::new(|| {
63    let h = HistogramVec::new(
64        HistogramOpts::new(
65            "agentkernel_sandbox_lifecycle_duration_seconds",
66            "Sandbox lifecycle operation latency in seconds",
67        )
68        .buckets(vec![
69            0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0,
70        ]),
71        &["action", "backend"],
72    )
73    .expect("metric can be created");
74    REGISTRY
75        .register(Box::new(h.clone()))
76        .expect("metric can be registered");
77    h
78});
79
80static SANDBOXES_ACTIVE: LazyLock<IntGauge> = LazyLock::new(|| {
81    let g = IntGauge::new(
82        "agentkernel_sandboxes_active",
83        "Number of currently known sandboxes",
84    )
85    .expect("metric can be created");
86    REGISTRY
87        .register(Box::new(g.clone()))
88        .expect("metric can be registered");
89    g
90});
91
92// ---- Command execution metrics ----
93
94static COMMANDS_TOTAL: LazyLock<IntCounterVec> = LazyLock::new(|| {
95    let c = IntCounterVec::new(
96        prometheus::opts!("agentkernel_commands_total", "Total commands executed"),
97        &["backend"],
98    )
99    .expect("metric can be created");
100    REGISTRY
101        .register(Box::new(c.clone()))
102        .expect("metric can be registered");
103    c
104});
105
106static COMMAND_DURATION_SECONDS: LazyLock<HistogramVec> = LazyLock::new(|| {
107    let h = HistogramVec::new(
108        HistogramOpts::new(
109            "agentkernel_command_duration_seconds",
110            "Command execution latency in seconds",
111        )
112        .buckets(vec![
113            0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0,
114        ]),
115        &["backend"],
116    )
117    .expect("metric can be created");
118    REGISTRY
119        .register(Box::new(h.clone()))
120        .expect("metric can be registered");
121    h
122});
123
124// ---- LLM request metrics ----
125
126static LLM_REQUESTS_TOTAL: LazyLock<IntCounterVec> = LazyLock::new(|| {
127    let c = IntCounterVec::new(
128        prometheus::opts!(
129            "agentkernel_llm_requests_total",
130            "Total LLM API requests intercepted"
131        ),
132        &["provider", "model"],
133    )
134    .expect("metric can be created");
135    REGISTRY
136        .register(Box::new(c.clone()))
137        .expect("metric can be registered");
138    c
139});
140
141static LLM_TOKENS_TOTAL: LazyLock<IntCounterVec> = LazyLock::new(|| {
142    let c = IntCounterVec::new(
143        prometheus::opts!("agentkernel_llm_tokens_total", "Total LLM tokens consumed"),
144        &["provider", "direction"],
145    )
146    .expect("metric can be created");
147    REGISTRY
148        .register(Box::new(c.clone()))
149        .expect("metric can be registered");
150    c
151});
152
153// ---- Build info ----
154
155static BUILD_INFO: LazyLock<IntGaugeVec> = LazyLock::new(|| {
156    let g = IntGaugeVec::new(
157        prometheus::opts!("agentkernel_build_info", "Build metadata"),
158        &["version"],
159    )
160    .expect("metric can be created");
161    REGISTRY
162        .register(Box::new(g.clone()))
163        .expect("metric can be registered");
164    g.with_label_values(&[env!("CARGO_PKG_VERSION")]).set(1);
165    g
166});
167
168// ==== Public instrumentation API ====
169
170/// Record an HTTP request (counter + histogram).
171pub fn record_http_request(method: &str, path: &str, status: u16, duration_secs: f64) {
172    let normalized = normalize_path(path);
173    let status_str = status.to_string();
174    HTTP_REQUESTS_TOTAL
175        .with_label_values(&[method, &normalized, &status_str])
176        .inc();
177    HTTP_REQUEST_DURATION_SECONDS
178        .with_label_values(&[method, &normalized])
179        .observe(duration_secs);
180}
181
182/// Record a sandbox lifecycle operation (counter + histogram).
183pub fn record_sandbox_lifecycle(action: &str, backend: &str, duration_secs: f64) {
184    SANDBOX_LIFECYCLE_TOTAL
185        .with_label_values(&[action, backend])
186        .inc();
187    SANDBOX_LIFECYCLE_DURATION_SECONDS
188        .with_label_values(&[action, backend])
189        .observe(duration_secs);
190}
191
192pub fn inc_active_sandboxes() {
193    SANDBOXES_ACTIVE.inc();
194}
195
196pub fn dec_active_sandboxes() {
197    SANDBOXES_ACTIVE.dec();
198}
199
200pub fn set_active_sandboxes(count: i64) {
201    SANDBOXES_ACTIVE.set(count);
202}
203
204/// Record an LLM API request (counter + token counters).
205pub fn record_llm_request(provider: &str, model: &str, input_tokens: u64, output_tokens: u64) {
206    LLM_REQUESTS_TOTAL
207        .with_label_values(&[provider, model])
208        .inc();
209    if input_tokens > 0 {
210        LLM_TOKENS_TOTAL
211            .with_label_values(&[provider, "input"])
212            .inc_by(input_tokens);
213    }
214    if output_tokens > 0 {
215        LLM_TOKENS_TOTAL
216            .with_label_values(&[provider, "output"])
217            .inc_by(output_tokens);
218    }
219}
220
221/// Record a command execution (counter + histogram).
222pub fn record_command(backend: &str, duration_secs: f64) {
223    COMMANDS_TOTAL.with_label_values(&[backend]).inc();
224    COMMAND_DURATION_SECONDS
225        .with_label_values(&[backend])
226        .observe(duration_secs);
227}
228
229/// Encode all registered metrics into Prometheus text exposition format.
230pub fn gather() -> String {
231    // Ensure build_info is initialized
232    let _ = &*BUILD_INFO;
233    let encoder = TextEncoder::new();
234    let families = REGISTRY.gather();
235    encoder.encode_to_string(&families).unwrap_or_default()
236}
237
/// Normalize dynamic path segments to prevent label cardinality explosion.
///
/// Replaces sandbox/snapshot/secret names with `:name` so that metrics
/// group by route pattern, not by individual resource.
///
/// Rules, applied to the non-empty `/`-separated segments in order:
/// - the segment following a literal `sandboxes`, `snapshots`, `secrets`,
///   `detached`, `hooks` or `usage` becomes `:name`;
/// - the segment following a literal `pages` becomes `:page`;
/// - every other segment is kept as is.
///
/// A segment that was itself replaced by a placeholder is never treated as a
/// keyword, so a resource that happens to be named like one (for example a
/// sandbox called `pages`) does not corrupt the segment after it.
/// NOTE(review): this assumes no route has two consecutive dynamic segments
/// after a keyword — confirm against the router.
///
/// Empty segments (leading, trailing or doubled slashes) are dropped; an
/// input without any segment yields `/`.
fn normalize_path(path: &str) -> String {
    let mut normalized = String::with_capacity(path.len() + 1);
    // The previous segment, but only if it was emitted literally. It is `None`
    // at the start and right after a placeholder has been emitted.
    let mut prev_literal: Option<&str> = None;

    for seg in path.split('/').filter(|s| !s.is_empty()) {
        let placeholder = match prev_literal {
            Some("sandboxes" | "snapshots" | "secrets" | "detached" | "hooks" | "usage") => {
                Some(":name")
            }
            Some("pages") => Some(":page"),
            _ => None,
        };

        normalized.push('/');
        match placeholder {
            Some(token) => {
                normalized.push_str(token);
                prev_literal = None;
            }
            None => {
                normalized.push_str(seg);
                prev_literal = Some(seg);
            }
        }
    }

    if normalized.is_empty() {
        normalized.push('/');
    }
    normalized
}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every `(input, expected)` pair normalizes as expected.
    fn assert_normalized(cases: &[(&str, &str)]) {
        for (input, expected) in cases {
            assert_eq!(normalize_path(input), *expected, "input: {input}");
        }
    }

    #[test]
    fn test_normalize_static_paths() {
        assert_normalized(&[
            ("/health", "/health"),
            ("/sandboxes", "/sandboxes"),
            ("/run", "/run"),
            ("/run/stream", "/run/stream"),
        ]);
    }

    #[test]
    fn test_normalize_dynamic_paths() {
        assert_normalized(&[
            ("/sandboxes/my-box", "/sandboxes/:name"),
            ("/sandboxes/my-box/exec", "/sandboxes/:name/exec"),
            ("/snapshots/snap-1", "/snapshots/:name"),
            ("/secrets/my-key", "/secrets/:name"),
        ]);
    }

    #[test]
    fn test_normalize_browser_paths() {
        assert_normalized(&[(
            "/sandboxes/x/browser/pages/p1/click",
            "/sandboxes/:name/browser/pages/:page/click",
        )]);
    }

    #[test]
    fn test_normalize_detached_paths() {
        assert_normalized(&[(
            "/sandboxes/x/exec/detached/cmd-1",
            "/sandboxes/:name/exec/detached/:name",
        )]);
    }

    #[test]
    fn test_gather_produces_output() {
        // `gather` forces BUILD_INFO, so its family must be present.
        assert!(gather().contains("agentkernel_build_info"));
    }

    #[test]
    fn test_record_http_request() {
        record_http_request("GET", "/sandboxes", 200, 0.05);
        assert!(gather().contains("agentkernel_http_requests_total"));
    }
}