Skip to main content

hyperi_rustlib/metrics/
process.rs

1// Project:   hyperi-rustlib
2// File:      src/metrics/process.rs
3// Purpose:   Process-level metrics collection
4// Language:  Rust
5//
6// License:   BUSL-1.1
7// Copyright: (c) 2026 HYPERI PTY LIMITED
8
9//! Process-level metrics collection.
10
11use std::sync::Arc;
12use std::time::{SystemTime, UNIX_EPOCH};
13
14use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System};
15
16/// Process metrics collector.
17#[derive(Debug, Clone)]
18pub struct ProcessMetrics {
19    namespace: String,
20    system: Arc<std::sync::Mutex<System>>,
21    pid: sysinfo::Pid,
22    start_time: f64,
23}
24
25impl ProcessMetrics {
26    /// Create a new process metrics collector.
27    #[must_use]
28    pub fn new(namespace: &str) -> Self {
29        let pid = sysinfo::Pid::from_u32(std::process::id());
30        let system = System::new_with_specifics(
31            RefreshKind::nothing().with_processes(ProcessRefreshKind::everything()),
32        );
33
34        let start_time = SystemTime::now()
35            .duration_since(UNIX_EPOCH)
36            .map_or(0.0, |d| d.as_secs_f64());
37
38        let this = Self {
39            namespace: namespace.to_string(),
40            system: Arc::new(std::sync::Mutex::new(system)),
41            pid,
42            start_time,
43        };
44
45        // Register metric descriptions
46        this.register_metrics();
47        this
48    }
49
50    /// Register metric descriptions.
51    fn register_metrics(&self) {
52        let ns = &self.namespace;
53
54        metrics::describe_counter!(
55            format!("{ns}_process_cpu_seconds_total"),
56            metrics::Unit::Seconds,
57            "Total user + system CPU time consumed, in seconds (cumulative counter)".to_string()
58        );
59        metrics::describe_gauge!(
60            format!("{ns}_process_resident_memory_bytes"),
61            "Resident memory size in bytes".to_string()
62        );
63        metrics::describe_gauge!(
64            format!("{ns}_process_virtual_memory_bytes"),
65            "Virtual memory size in bytes".to_string()
66        );
67        metrics::describe_gauge!(
68            format!("{ns}_process_open_fds"),
69            "Number of open file descriptors".to_string()
70        );
71        metrics::describe_gauge!(
72            format!("{ns}_process_start_time_seconds"),
73            "Start time of the process since unix epoch in seconds".to_string()
74        );
75    }
76
77    /// Update process metrics.
78    pub fn update(&self) {
79        // Recover from a poisoned lock rather than panicking: a panic in a
80        // prior update must not turn metrics collection into a repeat-panic.
81        // Observability degrades, it does not crash.
82        let mut system = self.system.lock().unwrap_or_else(|e| e.into_inner());
83        system.refresh_processes_specifics(
84            ProcessesToUpdate::Some(&[self.pid]),
85            true,
86            ProcessRefreshKind::everything(),
87        );
88
89        if let Some(process) = system.process(self.pid) {
90            let ns = &self.namespace;
91
92            // Cumulative CPU seconds as a proper monotonic COUNTER (Prometheus
93            // convention). sysinfo's cpu_usage() is an instantaneous percentage,
94            // NOT cumulative time -- on Linux read utime+stime from
95            // /proc/self/stat instead. Utilisation is derived downstream as
96            // rate(this) / container_cpu_limit_cores (in the pressure CEL / a
97            // recording rule), not baked in here.
98            #[cfg(target_os = "linux")]
99            if let Some(cpu_secs) = cumulative_cpu_seconds() {
100                // Monotonic, non-negative cumulative seconds -> whole-second
101                // truncation into the u64 counter is intentional.
102                #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
103                let secs = cpu_secs as u64;
104                metrics::counter!(format!("{ns}_process_cpu_seconds_total")).absolute(secs);
105            }
106
107            // Memory
108            let rss = process.memory();
109            let virtual_mem = process.virtual_memory();
110            metrics::gauge!(format!("{ns}_process_resident_memory_bytes")).set(rss as f64);
111            metrics::gauge!(format!("{ns}_process_virtual_memory_bytes")).set(virtual_mem as f64);
112
113            // File descriptors (Linux-specific)
114            #[cfg(target_os = "linux")]
115            {
116                if let Ok(fds) = count_open_fds() {
117                    metrics::gauge!(format!("{ns}_process_open_fds")).set(fds as f64);
118                }
119            }
120
121            // Start time
122            metrics::gauge!(format!("{ns}_process_start_time_seconds")).set(self.start_time);
123        }
124    }
125}
126
/// Number of entries in this process's `/proc/<pid>/fd` directory (Linux only).
///
/// # Errors
///
/// Returns the underlying I/O error if the directory cannot be opened.
#[cfg(target_os = "linux")]
fn count_open_fds() -> std::io::Result<usize> {
    let listing = std::fs::read_dir(format!("/proc/{}/fd", std::process::id()))?;
    Ok(listing.count())
}
133
/// Total CPU time (user + system) this process has consumed, in seconds, as
/// reported by `/proc/self/stat` (Linux). Yields `None` when the file cannot
/// be read or the expected fields cannot be parsed.
///
/// USER_HZ (clock ticks per second) is hard-coded to 100 -- the Linux default,
/// and not queryable via `sysconf` under `#![forbid(unsafe_code)]`. The u64
/// counter truncates to whole seconds; that is ample for the `rate()`-derived
/// CPU utilisation the pressure engine consumes.
#[cfg(target_os = "linux")]
pub(crate) fn cumulative_cpu_seconds() -> Option<f64> {
    const USER_HZ: f64 = 100.0;
    // Position of utime among the fields that follow comm (0-based, where
    // [0] is the state field); stime is the field immediately after it.
    const UTIME_INDEX: usize = 11;

    let contents = std::fs::read_to_string("/proc/self/stat").ok()?;
    // Layout is "pid (comm) state ...". comm may itself contain spaces or
    // parentheses, so everything up to the LAST ')' is discarded.
    let (_, after_comm) = contents.rsplit_once(')')?;

    let mut ticks = after_comm.split_whitespace().skip(UTIME_INDEX);
    let utime = ticks.next()?.parse::<u64>().ok()?;
    let stime = ticks.next()?.parse::<u64>().ok()?;

    Some((utime + stime) as f64 / USER_HZ)
}
154
/// Fallback for targets other than Linux, compiled only when both the
/// `scaling` and `expression` features are enabled.
///
/// Always returns `None`: cumulative CPU seconds cannot be obtained here, so
/// the scaling engine derives no CPU term on those platforms. Production
/// targets are Linux.
#[cfg(all(not(target_os = "linux"), feature = "scaling", feature = "expression"))]
pub(crate) fn cumulative_cpu_seconds() -> Option<f64> {
    // No source of cumulative CPU time is read on these targets.
    None
}
161
#[cfg(test)]
mod tests {
    use super::*;

    /// Construction keeps the given namespace and records a positive start time.
    #[test]
    fn test_process_metrics_new() {
        let collector = ProcessMetrics::new("test");

        assert_eq!(collector.namespace, "test");
        assert!(collector.start_time > 0.0);
    }

    /// Smoke test: a single update cycle must complete without panicking.
    #[test]
    fn test_process_metrics_update() {
        ProcessMetrics::new("test").update();
    }
}