Skip to main content

runtimo_core/
telemetry.rs

1//! System Telemetry — Environment awareness for the capability runtime.
2//!
3//! Captures a full snapshot of the host machine: CPU, RAM, disk, TPU/GPU
4//! devices, running services (vLLM), and network state (public IP, tunnels).
5//!
6//! Inspired by the Kaggle session telemetry pattern. Every capability execution
7//! records telemetry before and after to detect resource deltas.
8//!
9//! # Example
10//!
11//! ```rust,ignore
12//! use runtimo_core::Telemetry;
13//!
14//! let tel = Telemetry::capture();
15//! tel.print_report();
16//! // RUNTIMO TELEMETRY [1715800000]
17//! // CPU   : AMD EPYC 7T83
18//! // RAM   : 16Gi total, 13Gi free
19//! // ...
20//! ```
21
22use crate::cmd::run_cmd;
23use serde::{Deserialize, Serialize};
24use std::sync::Mutex;
25
26static TELEMETRY_CACHE: Mutex<Option<(Telemetry, std::time::Instant)>> = Mutex::new(None);
27const CACHE_TTL_SECS: u64 = 5;
28
29/// Full system telemetry snapshot.
30///
31/// Contains four sub-structures: [`SystemInfo`], [`HardwareInfo`],
32/// [`ServiceInfo`], and [`NetworkInfo`], plus a Unix timestamp.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct Telemetry {
35    /// Unix timestamp (seconds) when the snapshot was taken.
36    pub timestamp: u64,
37    /// Basic system information (CPU model, RAM, disk, uptime, load).
38    pub system: SystemInfo,
39    /// Special hardware devices (TPU, GPU, JAX availability).
40    pub hardware: HardwareInfo,
41    /// Service status (vLLM version, running state, port binding).
42    pub services: ServiceInfo,
43    /// Network state (public IP, tunnel status).
44    pub network: NetworkInfo,
45}
46
47/// Basic system information from `/proc` and shell commands.
48#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct SystemInfo {
50    /// CPU model string (from `/proc/cpuinfo`).
51    pub cpu_model: String,
52    /// Total RAM (human-readable, e.g. `"16Gi"`).
53    pub ram_total: String,
54    /// Free RAM (human-readable, e.g. `"13Gi"`).
55    pub ram_free: String,
56    /// Total disk space (human-readable, e.g. `"100G"`).
57    pub disk_total: String,
58    /// Free disk space (human-readable).
59    pub disk_free: String,
60    /// Disk usage percentage (e.g. `"45%"`).
61    pub disk_used_percent: String,
62    /// System uptime (e.g. `"up 3 days, 2 hours"`).
63    pub uptime: String,
64    /// Load average (e.g. `" 0.50,  0.30,  0.20"`).
65    pub load_average: String,
66    // --- Numeric fields for agent threshold computation ---
67    /// Total RAM in bytes (machine-readable).
68    pub ram_total_bytes: u64,
69    /// Free RAM in bytes (machine-readable).
70    pub ram_free_bytes: u64,
71    /// Total disk space in bytes (machine-readable).
72    pub disk_total_bytes: u64,
73    /// Free disk space in bytes (machine-readable).
74    pub disk_free_bytes: u64,
75    /// Disk usage percentage as numeric (e.g. `45.0`, no `%` sign).
76    pub disk_used_percent_numeric: f64,
77}
78
79/// Special hardware device information.
80///
81/// Detects TPU accelerators (`/dev/accel*`), NVIDIA GPUs (`nvidia-smi`),
82/// and JAX availability (Python import check).
83#[derive(Debug, Clone, Serialize, Deserialize)]
84pub struct HardwareInfo {
85    /// Number of TPU accelerator devices detected.
86    pub tpu_devices: usize,
87    /// Number of NVIDIA GPU devices detected.
88    pub gpu_devices: usize,
89    /// Whether the `jax` Python package is importable.
90    pub jax_available: bool,
91    /// JAX version string (e.g. `"0.4.25"`), if available.
92    pub jax_version: Option<String>,
93    /// Number of JAX-visible devices, if available.
94    pub jax_device_count: Option<usize>,
95}
96
97/// Service status information.
98///
99/// Currently tracks vLLM: version, whether the process is running,
100/// and whether port 8200 is bound.
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct ServiceInfo {
103    /// vLLM version string (e.g. `"0.4.0"`), if installed.
104    pub vllm_version: Option<String>,
105    /// Whether a `vllm serve` process is running.
106    pub vllm_running: bool,
107    /// Whether port 8200 is currently bound.
108    pub vllm_port_bound: bool,
109}
110
111/// Network state information.
112#[derive(Debug, Clone, Serialize, Deserialize)]
113pub struct NetworkInfo {
114    /// Public IP address (from `ifconfig.me`), or `"unknown"`.
115    pub public_ip: String,
116    /// Whether a `cloudflared` tunnel process is running.
117    pub tunnel_running: bool,
118    /// The full `cloudflared` process command line, if running.
119    pub tunnel_name: Option<String>,
120}
121
122impl Telemetry {
123    /// Captures a full system telemetry snapshot.
124    ///
125    /// Results are cached for 5 seconds to avoid running 15+ shell subprocesses
126    /// on repeated calls. Network queries (public_ip, tunnel) are skipped when
127    /// returning a cached value.
128    pub fn capture() -> Self {
129        let now = std::time::Instant::now();
130        {
131            let cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
132            if let Some((cached, instant)) = cache.as_ref() {
133                if now.duration_since(*instant).as_secs() < CACHE_TTL_SECS {
134                    return cached.clone();
135                }
136            }
137        }
138
139        let timestamp = std::time::SystemTime::now()
140            .duration_since(std::time::UNIX_EPOCH)
141            .map(|d| d.as_secs())
142            .unwrap_or(0);
143
144        let telemetry = Self {
145            timestamp,
146            system: SystemInfo::capture(),
147            hardware: HardwareInfo::capture(),
148            services: ServiceInfo::capture(),
149            network: NetworkInfo::capture(),
150        };
151
152        let mut cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
153        *cache = Some((telemetry.clone(), now));
154        telemetry
155    }
156
157    /// Prints telemetry in a human-readable report to stdout.
158    pub fn print_report(&self) {
159        println!("\n{}", "=".repeat(60));
160        println!(" RUNTIMO TELEMETRY [{}]", self.timestamp);
161        println!("{}", "=".repeat(60));
162
163        println!("\n--- SYSTEM ---");
164        println!(" CPU   : {}", self.system.cpu_model);
165        println!(
166            " RAM   : {} total, {} free",
167            self.system.ram_total, self.system.ram_free
168        );
169        println!(
170            " Disk  : {} total, {} free ({}% used)",
171            self.system.disk_total, self.system.disk_free, self.system.disk_used_percent
172        );
173        println!(" Uptime: {}", self.system.uptime);
174        println!(" Load  : {}", self.system.load_average);
175
176        println!("\n--- HARDWARE ---");
177        println!(" TPU Devices: {}", self.hardware.tpu_devices);
178        println!(" GPU Devices: {}", self.hardware.gpu_devices);
179        if self.hardware.jax_available {
180            println!(
181                " JAX: v{} ({} devices)",
182                self.hardware
183                    .jax_version
184                    .clone()
185                    .unwrap_or_else(|| "unknown".into()),
186                self.hardware.jax_device_count.unwrap_or(0)
187            );
188        } else {
189            println!(" JAX: Not available");
190        }
191
192        println!("\n--- SERVICES ---");
193        match &self.services.vllm_version {
194            Some(v) => println!(
195                " vLLM: v{} ({})",
196                v,
197                if self.services.vllm_running {
198                    "running"
199                } else {
200                    "not running"
201                }
202            ),
203            None => println!(" vLLM: not installed"),
204        }
205        println!(
206            " Port 8200: {}",
207            if self.services.vllm_port_bound {
208                "BOUND"
209            } else {
210                "NOT BOUND"
211            }
212        );
213
214        println!("\n--- NETWORK ---");
215        println!(" Public IP: {}", self.network.public_ip);
216        println!(
217            " Tunnel: {} ({})",
218            if self.network.tunnel_running {
219                "running"
220            } else {
221                "not running"
222            },
223            self.network
224                .tunnel_name
225                .clone()
226                .unwrap_or_else(|| "unknown".into())
227        );
228
229        println!("\n{}", "=".repeat(60));
230    }
231}
232
233impl SystemInfo {
234    fn capture() -> Self {
235        let ram_total = run_cmd("free -h | grep Mem | awk '{print $2}'");
236        let ram_free = run_cmd("free -h | grep Mem | awk '{print $4}'");
237        let disk_total = run_cmd("df -h / | tail -1 | awk '{print $2}'");
238        let disk_free = run_cmd("df -h / | tail -1 | awk '{print $4}'");
239        let disk_pct_str = run_cmd("df / | tail -1 | awk '{print $5}'");
240        let disk_used_percent = disk_pct_str.replace('%', "");
241        let disk_used_percent_numeric = disk_used_percent.parse::<f64>().unwrap_or(0.0);
242        let ram_total_bytes = run_cmd("free -b | grep Mem | awk '{print $2}'")
243            .parse()
244            .unwrap_or(0);
245        let ram_free_bytes = run_cmd("free -b | grep Mem | awk '{print $4}'")
246            .parse()
247            .unwrap_or(0);
248        let disk_total_bytes = run_cmd("df --bytes / | tail -1 | awk '{print $2}'")
249            .parse()
250            .unwrap_or(0);
251        let disk_free_bytes = run_cmd("df --bytes / | tail -1 | awk '{print $4}'")
252            .parse()
253            .unwrap_or(0);
254
255        Self {
256            cpu_model: run_cmd("cat /proc/cpuinfo | grep 'model name' | head -1 | cut -d: -f2"),
257            ram_total,
258            ram_free,
259            disk_total,
260            disk_free,
261            disk_used_percent,
262            uptime: run_cmd("uptime -p"),
263            load_average: run_cmd("uptime | awk -F'load average:' '{print $2}'"),
264            ram_total_bytes,
265            ram_free_bytes,
266            disk_total_bytes,
267            disk_free_bytes,
268            disk_used_percent_numeric,
269        }
270    }
271}
272
273impl HardwareInfo {
274    fn capture() -> Self {
275        let tpu_devices = run_cmd("ls /dev/accel* 2>/dev/null | wc -l")
276            .parse()
277            .unwrap_or(0);
278
279        let gpu_devices = run_cmd("nvidia-smi --list-gpus 2>/dev/null | wc -l")
280            .parse()
281            .unwrap_or(0);
282
283        let jax_available =
284            run_cmd("timeout 10 python3 -c 'import jax' 2>/dev/null && echo yes || echo no") == "yes";
285        let jax_version = if jax_available {
286            Some(run_cmd("timeout 10 python3 -c 'import jax; print(jax.__version__)'"))
287        } else {
288            None
289        };
290        let jax_device_count = if jax_available {
291            run_cmd("timeout 10 python3 -c 'import jax; print(len(jax.devices()))'")
292                .parse()
293                .ok()
294        } else {
295            None
296        };
297
298        Self {
299            tpu_devices,
300            gpu_devices,
301            jax_available,
302            jax_version,
303            jax_device_count,
304        }
305    }
306}
307
308impl ServiceInfo {
309    fn capture() -> Self {
310        let vllm_version = run_cmd("timeout 10 python3 -c 'import vllm; print(vllm.__version__)' 2>/dev/null");
311        let vllm_running = !run_cmd("pgrep -fa 'vllm serve'").is_empty();
312        let vllm_port_bound =
313            !run_cmd("ss -ltn '( sport = :8200 )' 2>/dev/null | grep 8200").is_empty();
314
315        Self {
316            vllm_version: if vllm_version.is_empty() {
317                None
318            } else {
319                Some(vllm_version)
320            },
321            vllm_running,
322            vllm_port_bound,
323        }
324    }
325}
326
327impl NetworkInfo {
328    fn capture() -> Self {
329        let public_ip = run_cmd("curl -s --connect-timeout 5 --max-time 5 ifconfig.me 2>/dev/null || echo 'unknown'");
330        let tunnel_output = run_cmd("pgrep -fa cloudflared");
331        let tunnel_running = !tunnel_output.is_empty();
332        let tunnel_name = if tunnel_running {
333            Some(tunnel_output)
334        } else {
335            None
336        };
337
338        Self {
339            public_ip,
340            tunnel_running,
341            tunnel_name,
342        }
343    }
344}
345
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: capturing a snapshot succeeds and carries a non-zero
    /// Unix timestamp.
    #[test]
    fn test_telemetry_capture() {
        let snapshot = Telemetry::capture();
        assert!(snapshot.timestamp > 0);
    }
}
355}