Skip to main content

osproxy_observe/
stats.rs

1//! The always-on operational metrics snapshot, the one observability surface
2//! that works in **every** environment, including production where the
3//! `/debug/*` introspection tools are off.
4//!
5//! An external agent (or a Prometheus-style scraper) polls each proxy's snapshot
6//! to see what it is doing: how much traffic it served, how it fared, and whether
7//! its upstream pools are amortizing handshakes. The readout is deliberately
//! **per instance**: building a fleet-wide rollup is the job of the external
9//! metrics/log aggregator the deployment already runs, not of the proxy. The
10//! proxy's only obligation is to expose a clean, shape-only source to scrape.
11//!
12//! **Shape-only by construction** (`docs/05`): counts, rates, and cluster *ids*,
13//! never tenant values, document bodies, query literals, or principals. A counter
14//! cannot become a value-leak channel, so the snapshot is safe to expose
15//! unauthenticated and to ship anywhere.
16
17use std::sync::atomic::{AtomicU64, Ordering};
18
19use serde::{Deserialize, Serialize};
20
/// Live, lock-free counters a proxy increments as it serves data-plane requests.
///
/// Cheap enough to update on every request: [`Metrics::record`] performs two
/// atomic `fetch_add`s (the total, plus exactly one of the ok/error buckets).
/// The snapshot is taken on demand when an agent scrapes.
// The `requests_*` prefix is the intended metric naming (flat, scraper-friendly),
// not accidental field-name repetition.
#[allow(clippy::struct_field_names)]
#[derive(Debug, Default)]
pub struct Metrics {
    /// Every request passed to `record`, regardless of outcome.
    requests_total: AtomicU64,
    /// Requests recorded with `ok == true`.
    requests_ok: AtomicU64,
    /// Requests recorded with `ok == false`.
    requests_error: AtomicU64,
}
33
34impl Metrics {
35    /// A fresh zeroed collector.
36    #[must_use]
37    pub fn new() -> Self {
38        Self::default()
39    }
40
41    /// Records one completed data-plane request and whether it succeeded (a 2xx
42    /// response). Introspection requests (`/debug/*`, `/metrics`) are not counted,
43    /// this measures the proxy's actual proxying.
44    pub fn record(&self, ok: bool) {
45        self.requests_total.fetch_add(1, Ordering::Relaxed);
46        let bucket = if ok {
47            &self.requests_ok
48        } else {
49            &self.requests_error
50        };
51        bucket.fetch_add(1, Ordering::Relaxed);
52    }
53
54    /// Builds a serializable snapshot from the current counters and the supplied
55    /// per-cluster pool readout (gathered by the caller, which owns the sink).
56    #[must_use]
57    pub fn snapshot(&self, pools: Vec<PoolSnapshot>) -> StatsSnapshot {
58        StatsSnapshot {
59            requests_total: self.requests_total.load(Ordering::Relaxed),
60            requests_ok: self.requests_ok.load(Ordering::Relaxed),
61            requests_error: self.requests_error.load(Ordering::Relaxed),
62            pools,
63        }
64    }
65}
66
/// One upstream cluster's connection-reuse counters: the signal that the pool is
/// amortizing TLS/TCP handshakes (`opened` far below `dispatched`). `cluster` is
/// an infrastructure id, not tenant data.
///
/// This is a plain data carrier. Nothing in this type checks that the three
/// counters agree with one another; whoever builds the value supplies all of
/// them.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PoolSnapshot {
    /// The cluster id (infrastructure identifier).
    pub cluster: String,
    /// Connections the pool opened to the cluster (cold handshakes).
    pub opened: u64,
    /// Requests dispatched to the cluster (cold + reused).
    pub dispatched: u64,
    /// Requests that rode a reused pooled connection. Expected to equal
    /// `dispatched - opened`, but stored as given rather than derived here.
    pub reused: u64,
}
81
/// A single proxy instance's operational snapshot. Per-instance by definition;
/// the fleet rollup is the external aggregator's job. Safe to serve
/// unauthenticated because it is shape-only: counters plus cluster ids.
///
/// The struct carries no serde renames, so the field names below are the keys
/// of the serialized form.
// `requests_*` is the intended flat metric naming, not field-name repetition.
#[allow(clippy::struct_field_names)]
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct StatsSnapshot {
    /// Data-plane requests served since start.
    pub requests_total: u64,
    /// Of those, how many responded 2xx.
    pub requests_ok: u64,
    /// Of those, how many did not respond 2xx (recorded as errors).
    pub requests_error: u64,
    /// Per-cluster upstream pool reuse counters, as supplied by the caller of
    /// `Metrics::snapshot`.
    pub pools: Vec<PoolSnapshot>,
}
98
99impl StatsSnapshot {
100    /// The snapshot as compact JSON, what a scrape returns and an agent parses.
101    /// Serialization of plain counters cannot fail; an error collapses to an
102    /// explicit error object rather than a panic.
103    #[must_use]
104    pub fn to_json(&self) -> String {
105        serde_json::to_string(self)
106            .unwrap_or_else(|e| format!("{{\"error\":\"stats serialize failed: {e}\"}}"))
107    }
108}
109
// Unit tests for this module live in the sibling `stats_tests.rs` file and are
// compiled only for test builds.
#[cfg(test)]
#[path = "stats_tests.rs"]
mod tests;