osproxy_observe/stats.rs
1//! The always-on operational metrics snapshot, the one observability surface
2//! that works in **every** environment, including production where the
3//! `/debug/*` introspection tools are off.
4//!
//! An external agent (or a Prometheus-style scraper) polls each proxy's snapshot
//! to see what it is doing: how much traffic it served, how it fared, and whether
//! its upstream pools are amortizing handshakes. The readout is deliberately
//! **per instance**: building a fleet-wide rollup is the job of the external
//! metrics/log aggregator the deployment already runs, not of the proxy. The
//! proxy's only obligation is to expose a clean, shape-only source to scrape.
11//!
12//! **Shape-only by construction** (`docs/05`): counts, rates, and cluster *ids*,
13//! never tenant values, document bodies, query literals, or principals. A counter
14//! cannot become a value-leak channel, so the snapshot is safe to expose
15//! unauthenticated and to ship anywhere.
16
17use std::sync::atomic::{AtomicU64, Ordering};
18
19use serde::{Deserialize, Serialize};
20
/// Lock-free request counters that a proxy bumps while serving data-plane
/// traffic.
///
/// Recording one request costs two relaxed atomic adds (the total plus one
/// outcome bucket), so it is cheap enough to do on every request. Nothing is
/// aggregated here; the values are only read out when a snapshot is requested.
// The shared `requests_` prefix is deliberate: it mirrors the flat,
// scraper-friendly metric names, so the clippy lint is silenced on purpose.
#[allow(clippy::struct_field_names)]
#[derive(Default, Debug)]
pub struct Metrics {
    /// Every recorded request, regardless of outcome.
    requests_total: AtomicU64,
    /// Requests recorded as successful.
    requests_ok: AtomicU64,
    /// Requests recorded as failed.
    requests_error: AtomicU64,
}
33
34impl Metrics {
35 /// A fresh zeroed collector.
36 #[must_use]
37 pub fn new() -> Self {
38 Self::default()
39 }
40
41 /// Records one completed data-plane request and whether it succeeded (a 2xx
42 /// response). Introspection requests (`/debug/*`, `/metrics`) are not counted,
43 /// this measures the proxy's actual proxying.
44 pub fn record(&self, ok: bool) {
45 self.requests_total.fetch_add(1, Ordering::Relaxed);
46 let bucket = if ok {
47 &self.requests_ok
48 } else {
49 &self.requests_error
50 };
51 bucket.fetch_add(1, Ordering::Relaxed);
52 }
53
54 /// Builds a serializable snapshot from the current counters and the supplied
55 /// per-cluster pool readout (gathered by the caller, which owns the sink).
56 #[must_use]
57 pub fn snapshot(&self, pools: Vec<PoolSnapshot>) -> StatsSnapshot {
58 StatsSnapshot {
59 requests_total: self.requests_total.load(Ordering::Relaxed),
60 requests_ok: self.requests_ok.load(Ordering::Relaxed),
61 requests_error: self.requests_error.load(Ordering::Relaxed),
62 pools,
63 }
64 }
65}
66
67/// One upstream cluster's connection-reuse counters, the signal that the pool is
68/// amortizing TLS/TCP handshakes (`opened` far below `dispatched`). `cluster` is
69/// an infrastructure id, not tenant data.
70#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
71pub struct PoolSnapshot {
72 /// The cluster id (infrastructure identifier).
73 pub cluster: String,
74 /// Connections the pool opened to the cluster (cold handshakes).
75 pub opened: u64,
76 /// Requests dispatched to the cluster (cold + reused).
77 pub dispatched: u64,
78 /// Requests that rode a reused pooled connection (`dispatched - opened`).
79 pub reused: u64,
80}
81
82/// A single proxy instance's operational snapshot. Per-instance by definition;
83/// the fleet rollup is the external aggregator's job. Safe to serve
84/// unauthenticated, it is shape-only.
85// `requests_*` is the intended flat metric naming, not field-name repetition.
86#[allow(clippy::struct_field_names)]
87#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
88pub struct StatsSnapshot {
89 /// Data-plane requests served since start.
90 pub requests_total: u64,
91 /// Of those, how many responded 2xx.
92 pub requests_ok: u64,
93 /// Of those, how many responded with an error status.
94 pub requests_error: u64,
95 /// Per-cluster upstream pool reuse counters.
96 pub pools: Vec<PoolSnapshot>,
97}
98
99impl StatsSnapshot {
100 /// The snapshot as compact JSON, what a scrape returns and an agent parses.
101 /// Serialization of plain counters cannot fail; an error collapses to an
102 /// explicit error object rather than a panic.
103 #[must_use]
104 pub fn to_json(&self) -> String {
105 serde_json::to_string(self)
106 .unwrap_or_else(|e| format!("{{\"error\":\"stats serialize failed: {e}\"}}"))
107 }
108}
109
110#[cfg(test)]
111#[path = "stats_tests.rs"]
112mod tests;