Skip to main content

rlx_runtime/
telemetry.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Telemetry primitives (plan #65).
17//!
18//! Borrowed from MAX's `serve/telemetry/` module shape (counters,
19//! histograms, stopwatches) but stripped to a pure data layer.
20//! Today this is a single in-process `MetricsRegistry`; a future
21//! serving crate can route the registry to a sidecar process /
22//! Prometheus / OTel exporter without changing the call sites.
23//!
24//! Why now (without a serving crate to consume it)?
25//!   - Lets the autotuner / fusion passes record decisions while
26//!     they're being made, viewable later via
27//!     [`MetricsRegistry::snapshot`].
28//!   - Establishes the "metrics live in their own type, not
29//!     scattered through hot paths" pattern before we accumulate
30//!     a hundred ad-hoc counters.
31
32use std::collections::BTreeMap;
33use std::sync::{
34    Mutex, OnceLock,
35    atomic::{AtomicU64, Ordering},
36};
37
/// Thread-safe 64-bit event counter.
///
/// Every operation is a single relaxed atomic access on the inner
/// `AtomicU64`, so a shared `&Counter` can be bumped from any thread
/// without taking a lock. The value only goes up unless [`Counter::reset`]
/// is called.
#[derive(Debug, Default)]
pub struct Counter {
    value: AtomicU64,
}

impl Counter {
    /// Creates a counter starting at zero.
    ///
    /// `const`, so a `Counter` can be the initializer of a `static`.
    pub const fn new() -> Self {
        Self {
            value: AtomicU64::new(0),
        }
    }

    // The accessors below are the per-event hot paths (plan #76). They
    // are forced inline so a call site compiles down to the atomic
    // instruction itself rather than a call through the `Counter` type.

    /// Adds one to the counter.
    #[inline(always)]
    pub fn inc(&self) {
        self.add(1);
    }

    /// Adds `delta` to the counter.
    #[inline(always)]
    pub fn add(&self, delta: u64) {
        self.value.fetch_add(delta, Ordering::Relaxed);
    }

    /// Returns the current value.
    #[inline(always)]
    pub fn get(&self) -> u64 {
        self.value.load(Ordering::Relaxed)
    }

    /// Sets the counter back to zero.
    pub fn reset(&self) {
        self.value.store(0, Ordering::Relaxed);
    }
}
70
/// Lock-free histogram with 16 fixed power-of-two buckets.
///
/// A recorded value `v` is counted in bucket `floor(log2(v))`: bucket
/// `i` holds values in `[2^i, 2^(i+1))`. Two edges differ from that
/// rule: `0` shares bucket 0 with `1`, and every value at or above
/// `2^15` is clamped into the last bucket (index 15). Alongside the
/// buckets the histogram tracks the running sum and the number of
/// recorded values.
#[derive(Debug)]
pub struct Histogram {
    buckets: [AtomicU64; 16],
    sum: AtomicU64,
    count: AtomicU64,
}

impl Default for Histogram {
    /// Same as [`Histogram::new`]: an empty histogram.
    fn default() -> Self {
        Self::new()
    }
}

impl Histogram {
    /// Creates an empty histogram.
    ///
    /// `const`, so a `Histogram` can be the initializer of a `static`.
    pub const fn new() -> Self {
        // `AtomicU64` is not `Copy`, so the array is built by repeating a
        // `const` item. Each use of a `const` produces a fresh value, so
        // all 16 slots (and `sum` / `count`) are independent atomics.
        #[allow(clippy::declare_interior_mutable_const)]
        const ZERO: AtomicU64 = AtomicU64::new(0);
        Self {
            buckets: [ZERO; 16],
            sum: ZERO,
            count: ZERO,
        }
    }

    /// Records one observation of `value`.
    ///
    /// Bumps the matching bucket, then adds `value` to the running sum
    /// and one to the observation count. All three updates are relaxed
    /// atomic adds.
    pub fn record(&self, value: u64) {
        let slot = match value.checked_ilog2() {
            // Clamp so anything >= 2^15 lands in the final bucket.
            Some(log2) => (log2 as usize).min(15),
            // `checked_ilog2` is `None` only for 0; file it under bucket 0.
            None => 0,
        };
        self.buckets[slot].fetch_add(1, Ordering::Relaxed);
        self.sum.fetch_add(value, Ordering::Relaxed);
        self.count.fetch_add(1, Ordering::Relaxed);
    }

    /// Number of values recorded so far.
    pub fn count(&self) -> u64 {
        self.count.load(Ordering::Relaxed)
    }

    /// Sum of all values recorded so far.
    pub fn sum(&self) -> u64 {
        self.sum.load(Ordering::Relaxed)
    }

    /// Arithmetic mean of the recorded values, or `None` when nothing
    /// has been recorded yet.
    pub fn mean(&self) -> Option<f64> {
        match self.count() {
            0 => None,
            n => Some(self.sum() as f64 / n as f64),
        }
    }

    /// Copies the current per-bucket counts out as plain integers,
    /// reading the buckets in index order.
    pub fn bucket_counts(&self) -> [u64; 16] {
        std::array::from_fn(|i| self.buckets[i].load(Ordering::Relaxed))
    }
}
130
131/// Global registry of named counters and histograms. Indexed by
132/// static string keys; lookups are O(log N) on a BTreeMap. Lock
133/// is only held during register/lookup, not during increment.
134pub struct MetricsRegistry {
135    counters: Mutex<BTreeMap<&'static str, &'static Counter>>,
136    histograms: Mutex<BTreeMap<&'static str, &'static Histogram>>,
137}
138
139impl MetricsRegistry {
140    /// Process-wide registry. First access lazily initializes.
141    pub fn global() -> &'static Self {
142        static R: OnceLock<MetricsRegistry> = OnceLock::new();
143        R.get_or_init(|| MetricsRegistry {
144            counters: Mutex::new(BTreeMap::new()),
145            histograms: Mutex::new(BTreeMap::new()),
146        })
147    }
148
149    /// Register a `'static` counter so it shows up in
150    /// `snapshot()`. Idempotent — re-registering with the same
151    /// pointer is a no-op.
152    pub fn register_counter(&self, name: &'static str, c: &'static Counter) {
153        self.counters
154            .lock()
155            .expect("registry poisoned")
156            .insert(name, c);
157    }
158
159    pub fn register_histogram(&self, name: &'static str, h: &'static Histogram) {
160        self.histograms
161            .lock()
162            .expect("registry poisoned")
163            .insert(name, h);
164    }
165
166    /// Snapshot all metrics into a serializable map. Useful for
167    /// dumping to a log / exporting to Prometheus.
168    pub fn snapshot(&self) -> Snapshot {
169        let counters = self
170            .counters
171            .lock()
172            .unwrap()
173            .iter()
174            .map(|(&n, c)| (n.to_string(), c.get()))
175            .collect();
176        let histograms = self
177            .histograms
178            .lock()
179            .unwrap()
180            .iter()
181            .map(|(&n, h)| {
182                (
183                    n.to_string(),
184                    HistogramSnapshot {
185                        count: h.count(),
186                        sum: h.sum(),
187                        buckets: h.bucket_counts(),
188                    },
189                )
190            })
191            .collect();
192        Snapshot {
193            counters,
194            histograms,
195        }
196    }
197}
198
/// Plain-data copy of one histogram's state at the moment a snapshot
/// was taken.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HistogramSnapshot {
    /// Number of values that had been recorded.
    pub count: u64,
    /// Sum of the recorded values.
    pub sum: u64,
    /// Per-bucket observation counts, in bucket-index order.
    pub buckets: [u64; 16],
}

/// Owned, name-ordered copy of every registered metric.
///
/// Derives `Clone` / `PartialEq` / `Eq` so snapshots can be kept around
/// and compared against each other (for example before/after a run).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Snapshot {
    /// Counter values keyed by registered name.
    pub counters: BTreeMap<String, u64>,
    /// Histogram snapshots keyed by registered name.
    pub histograms: BTreeMap<String, HistogramSnapshot>,
}
211
#[cfg(test)]
mod tests {
    use super::*;

    /// `inc` / `add` accumulate and `reset` returns the counter to zero.
    #[test]
    fn counter_basic() {
        let c = Counter::new();
        assert_eq!(c.get(), 0);
        c.inc();
        c.inc();
        c.add(10);
        assert_eq!(c.get(), 12);
        c.reset();
        assert_eq!(c.get(), 0);
    }

    /// Each value lands in bucket `floor(log2(v))`, with 0 sharing
    /// bucket 0. Asserting per-bucket counts (not just their total)
    /// catches a value being filed in the wrong bucket.
    #[test]
    fn histogram_records_in_bucket() {
        let h = Histogram::new();
        h.record(0); // bucket 0 (ilog2(0).unwrap_or(0) = 0)
        h.record(1); // bucket 0
        h.record(7); // bucket 2 (ilog2(7) = 2)
        h.record(1024); // bucket 10
        let b = h.bucket_counts();
        assert_eq!(h.count(), 4);
        assert_eq!(h.sum(), 1032);
        assert_eq!(b[0], 2);
        assert_eq!(b[2], 1);
        assert_eq!(b[10], 1);
        assert_eq!(b.iter().sum::<u64>(), 4);
    }

    /// Values at or above 2^15 are clamped into the last bucket.
    #[test]
    fn histogram_clamps_to_last_bucket() {
        let h = Histogram::new();
        h.record((1 << 15) - 1); // bucket 14
        h.record(1 << 15); // bucket 15
        h.record(1 << 40); // clamped to bucket 15
        let b = h.bucket_counts();
        assert_eq!(b[14], 1);
        assert_eq!(b[15], 2);
    }

    /// `mean` is `None` until something is recorded.
    #[test]
    fn histogram_mean() {
        let h = Histogram::new();
        assert_eq!(h.mean(), None);
        h.record(2);
        h.record(4);
        assert_eq!(h.mean(), Some(3.0));
    }

    /// Registered metrics appear in the snapshot with their live values.
    #[test]
    fn registry_round_trip() {
        // Function-local statics: only this test touches them, so the
        // exact values below are deterministic even though the registry
        // itself is process-global.
        static C: Counter = Counter::new();
        static H: Histogram = Histogram::new();
        let r = MetricsRegistry::global();
        r.register_counter("test_count", &C);
        r.register_histogram("test_hist", &H);
        C.inc();
        H.record(42);
        let snap = r.snapshot();
        assert_eq!(snap.counters.get("test_count"), Some(&1));
        let hist = snap
            .histograms
            .get("test_hist")
            .expect("histogram was registered above");
        assert_eq!(hist.count, 1);
        assert_eq!(hist.sum, 42);
        assert_eq!(hist.buckets[5], 1); // ilog2(42) = 5
    }
}