Skip to main content

edgeguard/
metrics.rs

1//! Prometheus metrics, hand-rolled.
2//!
3//! A full metrics library (`prometheus`, `metrics`) would be a heavy dependency for the
4//! handful of series EdgeGuard exposes, so — in the same spirit as `parse_host_port` being a
5//! small URL parser rather than a full one — this is a minimal text-exposition renderer over
6//! a few atomics. It emits the Prometheus text format (v0.0.4) at `/__edgeguard/metrics`.
7//!
8//! The registry lives in [`crate::proxy::AppState`] *outside* the hot-swappable runtime, so
9//! counters survive a config hot-reload instead of resetting to zero.
10
11use std::sync::atomic::{AtomicU64, Ordering};
12use std::time::Duration;
13
/// Closed set of values for the `outcome` label on `edgeguard_requests_total`.
///
/// The names match the `outcome` field already written to the JSON access log in
/// [`crate::proxy`], so every metric series corresponds 1:1 to a kind of log line.
///
/// Counters are stored by index into this table, so the order is significant. The final
/// entry, `other`, is the catch-all: an outcome that is not listed here is counted there
/// instead of being silently dropped.
const OUTCOMES: &[&str] = &[
    "ok",
    "rate_limited",
    "limiter_error",
    "unauthorized",
    "forbidden",
    "method_not_allowed",
    "not_found",
    "payload_too_large",
    "header_too_large",
    "bad_gateway",
    "upstream_error",
    "upstream_timeout",
    "upstream_body_too_large",
    "upstream_body_error",
    // Catch-all. Must stay last: `Metrics::record_request` falls back to the final slot.
    "other",
];
34
/// Closed set of values for the `scope` label on `edgeguard_ratelimit_hits_total`: the
/// limiter that rejected the request. Counters are stored by index into this table.
const RL_SCOPES: &[&str] = &["ip", "route", "key"];
37
/// Closed set of values for the `rule` label on `edgeguard_waf_hits_total`: the ruleset
/// class that matched. Every custom `[[waf.rules]]` entry is reported as `custom`; the id of
/// the specific rule is carried by the log line, not by the metric. Counters are stored by
/// index into this table.
const WAF_RULES: &[&str] = &["sqli", "xss", "path_traversal", "custom"];
41
/// Inclusive upper bounds, in seconds, of the request-duration histogram buckets.
///
/// Listed in ascending order. The `+Inf` bucket is not part of this table; it is implicit
/// and is emitted from the total observation count when the histogram is rendered.
const LATENCY_BUCKETS: &[f64] = &[
    0.005, 0.01, 0.025, 0.05, // 5 ms to 50 ms
    0.1, 0.25, 0.5, // 100 ms to 500 ms
    1.0, 2.5, 5.0, 10.0, // 1 s to 10 s
];
46
/// Process-wide metric registry.
///
/// All state lives in [`AtomicU64`]s, so every method takes `&self`. The methods use relaxed
/// ordering: these are monotonic counters and observations, where the exact ordering between
/// threads does not matter.
///
/// The label-keyed families are vectors that run parallel to their label tables: the
/// counter at index `i` belongs to the label at index `i`.
///
/// `Debug` is derived so the registry can be embedded in other `Debug` types and inspected
/// in logs or test failures; atomics print their current value.
#[derive(Debug)]
pub struct Metrics {
    /// One counter per [`OUTCOMES`] entry (parallel index).
    requests: Vec<AtomicU64>,
    /// One counter per [`RL_SCOPES`] entry (parallel index).
    ratelimit_hits: Vec<AtomicU64>,
    /// One counter per [`WAF_RULES`] entry (parallel index).
    waf_hits: Vec<AtomicU64>,
    /// Cumulative histogram buckets (parallel to [`LATENCY_BUCKETS`]): `bucket[i]` counts
    /// observations with value <= `LATENCY_BUCKETS[i]`.
    latency_buckets: Vec<AtomicU64>,
    /// Sum of all observed latencies, in whole microseconds.
    latency_sum_micros: AtomicU64,
    /// Number of latency observations; also the value of the implicit `+Inf` bucket.
    latency_count: AtomicU64,
    /// Number of CSP violation reports received.
    csp_reports: AtomicU64,
}
63
64impl Default for Metrics {
65    fn default() -> Self {
66        Metrics {
67            requests: OUTCOMES.iter().map(|_| AtomicU64::new(0)).collect(),
68            ratelimit_hits: RL_SCOPES.iter().map(|_| AtomicU64::new(0)).collect(),
69            waf_hits: WAF_RULES.iter().map(|_| AtomicU64::new(0)).collect(),
70            latency_buckets: LATENCY_BUCKETS.iter().map(|_| AtomicU64::new(0)).collect(),
71            latency_sum_micros: AtomicU64::new(0),
72            latency_count: AtomicU64::new(0),
73            csp_reports: AtomicU64::new(0),
74        }
75    }
76}
77
78impl Metrics {
79    pub fn new() -> Self {
80        Self::default()
81    }
82
83    /// Count one finished request under its `outcome` label.
84    pub fn record_request(&self, outcome: &str) {
85        let idx = OUTCOMES
86            .iter()
87            .position(|o| *o == outcome)
88            .unwrap_or(OUTCOMES.len() - 1); // -> "other"
89        self.requests[idx].fetch_add(1, Ordering::Relaxed);
90    }
91
92    /// Observe a request's end-to-end latency into the histogram.
93    pub fn observe_latency(&self, elapsed: Duration) {
94        let secs = elapsed.as_secs_f64();
95        for (i, bound) in LATENCY_BUCKETS.iter().enumerate() {
96            if secs <= *bound {
97                self.latency_buckets[i].fetch_add(1, Ordering::Relaxed);
98            }
99        }
100        self.latency_sum_micros
101            .fetch_add(elapsed.as_micros() as u64, Ordering::Relaxed);
102        self.latency_count.fetch_add(1, Ordering::Relaxed);
103    }
104
105    /// Count a rate-limit rejection by which limiter scope tripped (`ip`/`route`/`key`).
106    pub fn record_ratelimit_hit(&self, scope: &str) {
107        if let Some(idx) = RL_SCOPES.iter().position(|s| *s == scope) {
108            self.ratelimit_hits[idx].fetch_add(1, Ordering::Relaxed);
109        }
110    }
111
112    /// Count one WAF rule match by rule class (`sqli`/`xss`/`path_traversal`/`custom`).
113    /// Recorded for both report-only and blocking modes — so a report-first rollout is
114    /// visible — while a *blocked* request is additionally counted under the `forbidden`
115    /// request outcome.
116    pub fn record_waf_hit(&self, class: &str) {
117        if let Some(idx) = WAF_RULES.iter().position(|c| *c == class) {
118            self.waf_hits[idx].fetch_add(1, Ordering::Relaxed);
119        }
120    }
121
122    /// Count one received CSP violation report.
123    pub fn record_csp_report(&self) {
124        self.csp_reports.fetch_add(1, Ordering::Relaxed);
125    }
126
127    /// Render the Prometheus text exposition (format version 0.0.4).
128    pub fn render(&self) -> String {
129        let mut out = String::with_capacity(1024);
130
131        out.push_str("# HELP edgeguard_requests_total Total proxied requests by outcome.\n");
132        out.push_str("# TYPE edgeguard_requests_total counter\n");
133        for (i, label) in OUTCOMES.iter().enumerate() {
134            let v = self.requests[i].load(Ordering::Relaxed);
135            out.push_str(&format!(
136                "edgeguard_requests_total{{outcome=\"{label}\"}} {v}\n"
137            ));
138        }
139
140        out.push_str(
141            "# HELP edgeguard_ratelimit_hits_total Requests rejected by a rate limiter, by scope.\n",
142        );
143        out.push_str("# TYPE edgeguard_ratelimit_hits_total counter\n");
144        for (i, label) in RL_SCOPES.iter().enumerate() {
145            let v = self.ratelimit_hits[i].load(Ordering::Relaxed);
146            out.push_str(&format!(
147                "edgeguard_ratelimit_hits_total{{scope=\"{label}\"}} {v}\n"
148            ));
149        }
150
151        out.push_str(
152            "# HELP edgeguard_waf_hits_total WAF rule matches by class (report-only + blocked).\n",
153        );
154        out.push_str("# TYPE edgeguard_waf_hits_total counter\n");
155        for (i, label) in WAF_RULES.iter().enumerate() {
156            let v = self.waf_hits[i].load(Ordering::Relaxed);
157            out.push_str(&format!(
158                "edgeguard_waf_hits_total{{rule=\"{label}\"}} {v}\n"
159            ));
160        }
161
162        out.push_str("# HELP edgeguard_csp_reports_total CSP violation reports received.\n");
163        out.push_str("# TYPE edgeguard_csp_reports_total counter\n");
164        out.push_str(&format!(
165            "edgeguard_csp_reports_total {}\n",
166            self.csp_reports.load(Ordering::Relaxed)
167        ));
168
169        out.push_str(
170            "# HELP edgeguard_request_duration_seconds Request handling latency in seconds.\n",
171        );
172        out.push_str("# TYPE edgeguard_request_duration_seconds histogram\n");
173        for (i, bound) in LATENCY_BUCKETS.iter().enumerate() {
174            let v = self.latency_buckets[i].load(Ordering::Relaxed);
175            out.push_str(&format!(
176                "edgeguard_request_duration_seconds_bucket{{le=\"{bound}\"}} {v}\n"
177            ));
178        }
179        let count = self.latency_count.load(Ordering::Relaxed);
180        // The `+Inf` bucket equals the total observation count by definition.
181        out.push_str(&format!(
182            "edgeguard_request_duration_seconds_bucket{{le=\"+Inf\"}} {count}\n"
183        ));
184        let sum_secs = self.latency_sum_micros.load(Ordering::Relaxed) as f64 / 1_000_000.0;
185        out.push_str(&format!(
186            "edgeguard_request_duration_seconds_sum {sum_secs}\n"
187        ));
188        out.push_str(&format!(
189            "edgeguard_request_duration_seconds_count {count}\n"
190        ));
191
192        out
193    }
194}
195
#[cfg(test)]
mod tests {
    use super::*;

    /// Fail, printing the whole exposition, unless `needle` occurs in `text`.
    #[track_caller]
    fn assert_rendered(text: &str, needle: &str) {
        assert!(text.contains(needle), "{text}");
    }

    #[test]
    fn records_and_renders_request_outcomes() {
        let m = Metrics::new();
        for outcome in ["ok", "ok", "rate_limited"] {
            m.record_request(outcome);
        }
        // An outcome outside the known set must land in `other`, never in `ok`.
        m.record_request("totally_unknown");

        let text = m.render();
        assert_rendered(&text, "edgeguard_requests_total{outcome=\"ok\"} 2");
        assert_rendered(&text, "edgeguard_requests_total{outcome=\"rate_limited\"} 1");
        assert_rendered(&text, "edgeguard_requests_total{outcome=\"other\"} 1");
    }

    #[test]
    fn latency_histogram_is_cumulative() {
        let m = Metrics::new();
        m.observe_latency(Duration::from_millis(3)); // fits the 0.005 bucket and above
        m.observe_latency(Duration::from_millis(40)); // fits the 0.05 bucket and above

        let text = m.render();
        // Only the 3 ms sample is <= 0.005; both samples are <= 0.05 and everything larger.
        assert_rendered(
            &text,
            "edgeguard_request_duration_seconds_bucket{le=\"0.005\"} 1",
        );
        assert_rendered(
            &text,
            "edgeguard_request_duration_seconds_bucket{le=\"0.05\"} 2",
        );
        assert_rendered(
            &text,
            "edgeguard_request_duration_seconds_bucket{le=\"+Inf\"} 2",
        );
        assert_rendered(&text, "edgeguard_request_duration_seconds_count 2");
    }

    #[test]
    fn ratelimit_and_csp_counters() {
        let m = Metrics::new();
        for scope in ["ip", "route", "route"] {
            m.record_ratelimit_hit(scope);
        }
        m.record_csp_report();

        let text = m.render();
        assert_rendered(&text, "edgeguard_ratelimit_hits_total{scope=\"ip\"} 1");
        assert_rendered(&text, "edgeguard_ratelimit_hits_total{scope=\"route\"} 2");
        assert_rendered(&text, "edgeguard_csp_reports_total 1");
    }

    #[test]
    fn waf_hit_counters_by_class() {
        let m = Metrics::new();
        for class in ["sqli", "sqli", "custom"] {
            m.record_waf_hit(class);
        }
        // A class outside the known set is dropped rather than miscounted.
        m.record_waf_hit("totally_unknown");

        let text = m.render();
        assert_rendered(&text, "edgeguard_waf_hits_total{rule=\"sqli\"} 2");
        assert_rendered(&text, "edgeguard_waf_hits_total{rule=\"custom\"} 1");
        // A class that never fired is still exported, with a value of 0.
        assert_rendered(&text, "edgeguard_waf_hits_total{rule=\"xss\"} 0");
    }
}