runtimo_core/
telemetry.rs

1//! System Telemetry — Via Negativa: raw observation, no interpretation.
2//!
3//! Captures a snapshot of the host machine by reading `/proc` and `/sys`
4//! directly. Every field is backed by a raw kernel filesystem read — no
5//! shell-out for data available in `/proc`, no pgrep, no service name
6//! guessing, no version detection.
7//!
8//! # Via Negativa Philosophy
9//!
10//! This module removes everything that is not direct observation:
11//!
12//! - **No pgrep** — tunnel detection reads `/proc/[0-9]*/comm` files
13//!   (process names, not command lines). The observer no longer matches
14//!   its own shell command as a running `cloudflared` process.
15//! - **No service guessing** — port detection reads `/proc/net/tcp` and
16//!   `/proc/net/tcp6` directly, returning raw `Vec<u16>`. Port 22 is
17//!   just `22` — the consumer decides it is SSH.
18//! - **No `ss -ltnp` parsing** — eliminated >50 lines of fragile
19//!   positional output parsing.
20//! - **No version detection** — no `sshd -V`, `nginx -v`, etc.
21//! - **Raw /proc reads** — cpuinfo, meminfo, uptime, loadavg, net/tcp.
22//! - **Shell-out only where no `/proc` equivalent exists** — `df` for
23//!   disk, `curl` for public IP (opt-in), accelerator detection.
24//!
25//! # Example
26//!
27//! ```rust,ignore
28//! use runtimo_core::Telemetry;
29//!
30//! let tel = Telemetry::capture();
31//! tel.print_report();
32//! ```
33//!
34//! # Performance
35//!
36//! Results are cached for 30 seconds via an internal mutex cache to avoid
37//! repeated `/proc` reads on consecutive calls.
38
39use crate::cmd::run_cmd;
40use serde::{Deserialize, Serialize};
41use std::sync::Mutex;
42
43static TELEMETRY_CACHE: Mutex<Option<(Telemetry, std::time::Instant)>> = Mutex::new(None);
44const CACHE_TTL_SECS: u64 = 30;
45
46/// Full system telemetry snapshot.
47///
48/// Contains three sub-structures: [`SystemInfo`], [`HardwareInfo`],
49/// and [`NetworkInfo`], plus a Unix timestamp. Service detection has been
50/// removed in favor of raw listening ports in [`NetworkInfo`].
51#[derive(Debug, Clone, Serialize, Deserialize)]
52#[allow(clippy::exhaustive_structs)]
53pub struct Telemetry {
54    /// Unix timestamp (seconds) when the snapshot was taken.
55    pub timestamp: u64,
56    /// Basic system information (CPU, RAM, disk, uptime, load).
57    pub system: SystemInfo,
58    /// Special hardware devices (TPU, GPU, JAX availability).
59    pub hardware: HardwareInfo,
60    /// Network state (public IP, tunnel status, listening ports).
61    pub network: NetworkInfo,
62}
63
64/// Basic system information — direct `/proc` reads only.
65///
66/// No shell commands are used for data available in `/proc`. Disk
67/// information (`df`) is the only exception because Linux provides
68/// no per-mount usage summary in `/proc`.
69#[derive(Debug, Clone, Serialize, Deserialize)]
70#[allow(clippy::exhaustive_structs)]
71pub struct SystemInfo {
72    /// CPU model string from `/proc/cpuinfo` `model name` field.
73    pub cpu_model: String,
74    /// Logical CPU core count from `/proc/cpuinfo` (counts `processor` entries).
75    pub cpu_count: u32,
76    /// Total RAM in human-readable form (e.g. `"32Gi"`) from `/proc/meminfo`
77    /// `MemTotal` (kB → human).
78    pub ram_total: String,
79    /// Free RAM in human-readable form (e.g. `"750Mi"`) from `/proc/meminfo`
80    /// `MemFree` (kB → human).
81    pub ram_free: String,
82    /// Available RAM in human-readable form (e.g. `"22Gi"`) from `/proc/meminfo`
83    /// `MemAvailable` (kB → human). This is the memory usable for new
84    /// allocations without swapping — more useful than `ram_free` for
85    /// capacity planning.
86    pub ram_available: String,
87    /// Total disk space in human-readable form (e.g. `"100G"`) from `df -h /`.
88    pub disk_total: String,
89    /// Free disk space in human-readable form from `df -h /`.
90    pub disk_free: String,
91    /// Disk usage percentage as a string without `%` sign (e.g. `"45"`).
92    pub disk_used_percent: String,
93    /// Human-readable uptime (e.g. `"up 6 days, 3 hours"`) computed from
94    /// `/proc/uptime`.
95    pub uptime: String,
96    /// Machine-parseable uptime in seconds from `/proc/uptime` first field.
97    pub uptime_seconds: u64,
98    /// Load average string (e.g. `"0.50, 0.30, 0.20"`) from `/proc/loadavg`
99    /// first three fields.
100    pub load_average: String,
101}
102
103/// Special hardware device information.
104///
105/// Detects accelerators generically — GPUs (nvidia-smi, rocm-smi, /dev/dri),
106/// TPUs (/dev/accel*), and JAX availability. Reports what exists, not what
107/// was expected. Shell commands are used here because accelerator detection
108/// requires vendor-specific tools that have no `/proc` equivalent.
109#[derive(Debug, Clone, Serialize, Deserialize)]
110#[allow(clippy::exhaustive_structs)]
111pub struct HardwareInfo {
112    /// Detected accelerator devices (any kind). Empty vec = no accelerators found.
113    #[serde(default)]
114    pub accelerators: Vec<AcceleratorInfo>,
115    /// Whether the `jax` Python package is importable.
116    #[serde(default)]
117    pub jax_available: bool,
118    /// JAX version string (e.g. `"0.4.25"`), if available.
119    #[serde(default)]
120    pub jax_version: Option<String>,
121    /// Number of JAX-visible devices, if available.
122    #[serde(default)]
123    pub jax_device_count: Option<usize>,
124}
125
126/// A detected hardware accelerator.
127#[derive(Debug, Clone, Serialize, Deserialize)]
128#[allow(clippy::exhaustive_structs)]
129pub struct AcceleratorInfo {
130    /// Accelerator kind: "gpu", "tpu", "npu".
131    pub kind: String,
132    /// Number of devices of this kind detected.
133    pub count: usize,
134    /// Vendor name if identifiable (e.g. "nvidia", "amd", "google").
135    #[serde(default)]
136    pub vendor: Option<String>,
137    /// Device model string if available.
138    #[serde(default)]
139    pub model: Option<String>,
140}
141
142/// Network state information.
143///
144/// Public IP capture is **opt-in** via `RUNTIMO_ENABLE_PUBLIC_IP=1`.
145/// Without this env var, `public_ip` defaults to `"unknown"` to prevent
146/// unintended external network metadata leakage.
147///
148/// Tunnel detection reads `/proc/[0-9]*/comm` files (process names only,
149/// not command lines). This eliminates the self-match bug where `pgrep`
150/// would match the shell that runs `pgrep` itself.
151///
152/// Listening ports are read directly from `/proc/net/tcp` and `/proc/net/tcp6`
153/// — no `ss` shell-out, no service name guessing.
154#[derive(Debug, Clone, Serialize, Deserialize)]
155#[allow(clippy::exhaustive_structs)]
156pub struct NetworkInfo {
157    /// Public IP address (from `ifconfig.me` when `RUNTIMO_ENABLE_PUBLIC_IP=1`),
158    /// or `"unknown"`.
159    pub public_ip: String,
160    /// Whether a `cloudflared` tunnel process is running (detected via
161    /// `/proc/*/comm` content match, not pgrep).
162    pub tunnel_running: bool,
163    /// PID of the `cloudflared` process if found, extracted from the
164    /// `/proc/<pid>` directory name.
165    pub tunnel_pid: Option<u32>,
166    /// Raw listening TCP ports from `/proc/net/tcp` and `/proc/net/tcp6`.
167    /// Only ports in `LISTEN` (state `0A`) state are included.
168    /// Sorted ascending, duplicates removed.
169    #[serde(default)]
170    pub listening_ports: Vec<u16>,
171}
172
173// ── /proc file reading helpers ───────────────────────────────────────────
174
/// Loads a whole `/proc` file into memory as UTF-8 text.
///
/// # Input
///
/// `path` — Absolute path of the file to read (e.g. `"/proc/cpuinfo"`).
///
/// # Output
///
/// `Ok(String)` — Everything the file contained. An empty file yields an
/// empty string, which is a valid success (e.g. an empty `tcp6` table in a
/// container).
/// `Err(io::Error)` — The file could not be opened or read, or its contents
/// are not valid UTF-8.
fn read_proc_file(path: &str) -> std::io::Result<String> {
    use std::io::Read;

    let mut contents = String::new();
    std::fs::File::open(path)?.read_to_string(&mut contents)?;
    Ok(contents)
}
192
/// Looks up one `/proc/meminfo` entry and returns its value in kB.
///
/// Lines in `/proc/meminfo` look like `Key:    12345 kB`. The first line that
/// starts with `key` is selected and its second whitespace-separated field is
/// parsed as `u64`.
///
/// Returns `0` when no line starts with `key`, or when the selected line has
/// no second field or that field is not a valid `u64`. Only the first
/// matching line is considered.
fn parse_meminfo_kb(data: &str, key: &str) -> u64 {
    for line in data.lines() {
        if !line.starts_with(key) {
            continue;
        }
        // The first matching line decides the result, even if it is malformed.
        let mut fields = line.split_whitespace();
        let _label = fields.next();
        return fields
            .next()
            .and_then(|raw| raw.parse::<u64>().ok())
            .unwrap_or(0);
    }
    0
}
207
/// Renders a kilobyte count with a binary suffix in the style of `free -h`.
///
/// The largest unit that fits is chosen: `Gi` from 1024² kB upward, `Mi` from
/// 1024 kB upward, otherwise `Ki`. The value is divided with integer
/// division, so any fractional part is dropped. There is no unit above `Gi`.
///
/// # Examples
///
/// - `format_mem_kb(512)` → `"512Ki"`
/// - `format_mem_kb(768000)` → `"750Mi"`
/// - `format_mem_kb(16777216)` → `"16Gi"`
fn format_mem_kb(kb: u64) -> String {
    const KB_PER_MI: u64 = 1_024;
    const KB_PER_GI: u64 = 1_048_576;

    match kb {
        n if n >= KB_PER_GI => format!("{}Gi", n / KB_PER_GI),
        n if n >= KB_PER_MI => format!("{}Mi", n / KB_PER_MI),
        n => format!("{n}Ki"),
    }
}
231
/// Renders a number of seconds as an `uptime -p` style string.
///
/// The duration is split into days, hours and minutes; leftover seconds are
/// discarded. Units whose value is zero are left out, except that a duration
/// under one minute is reported as `"up 0 minutes"` so the result always
/// names at least one unit.
///
/// # Examples
///
/// - `format_uptime(60)` → `"up 1 minute"`
/// - `format_uptime(3661)` → `"up 1 hour, 1 minute"`
/// - `format_uptime(526380)` → `"up 6 days, 2 hours, 13 minutes"`
fn format_uptime(total_seconds: u64) -> String {
    // Pluralises by appending "s" for every amount other than exactly one.
    fn quantity(amount: u64, unit: &str) -> String {
        let plural = if amount == 1 { "" } else { "s" };
        format!("{amount} {unit}{plural}")
    }

    let breakdown = [
        (total_seconds / 86_400, "day"),
        ((total_seconds % 86_400) / 3_600, "hour"),
        ((total_seconds % 3_600) / 60, "minute"),
    ];

    let mut pieces: Vec<String> = breakdown
        .iter()
        .filter(|(amount, _)| *amount > 0)
        .map(|&(amount, unit)| quantity(amount, unit))
        .collect();
    if pieces.is_empty() {
        // Every component was zero: fall back to an explicit minute count.
        pieces.push(quantity(0, "minute"));
    }

    format!("up {}", pieces.join(", "))
}
269
270// ── Telemetry capture ────────────────────────────────────────────────────
271
272impl Telemetry {
273    /// Captures a full system telemetry snapshot.
274    ///
275    /// Results are cached for 30 seconds to avoid
276    /// repeated filesystem reads on consecutive calls. Network queries
277    /// (public_ip, tunnel) are included in the cached value.
278    ///
279    /// Use [`capture_lightweight`](Telemetry::capture_lightweight) for
280    /// execution paths that don't need accelerator detection or network
281    /// probing (e.g., the executor's WAL audit trail — which only needs
282    /// `/proc`-based system health data).
283    pub fn capture() -> Self {
284        let now = std::time::Instant::now();
285        {
286            let cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
287            if let Some((cached, instant)) = cache.as_ref() {
288                if now.duration_since(*instant).as_secs() < CACHE_TTL_SECS {
289                    return cached.clone();
290                }
291            }
292        }
293
294        let timestamp = std::time::SystemTime::now()
295            .duration_since(std::time::UNIX_EPOCH)
296            .map_or(0, |d| d.as_secs());
297
298        let telemetry = Self {
299            timestamp,
300            system: SystemInfo::capture(),
301            hardware: HardwareInfo::capture(),
302            network: NetworkInfo::capture(),
303        };
304
305        let mut cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
306        *cache = Some((telemetry.clone(), now));
307        telemetry
308    }
309
310    /// Captures a lightweight system telemetry snapshot without shell-outs.
311    ///
312    /// Unlike [`capture`](Telemetry::capture), this method skips all
313    /// accelerator detection (rocm-smi, nvidia-smi, JAX import), tunnel
314    /// probing, and public IP queries. Only [`SystemInfo`] is populated
315    /// from `/proc` reads and a single `df` shell-out.
316    ///
317    /// [`HardwareInfo`] is zeroed (empty accelerators, no JAX), and
318    /// [`NetworkInfo`] returns defaults (`public_ip = "unknown"`,
319    /// `tunnel_running = false`, empty `listening_ports`).
320    ///
321    /// Use this in hot paths like the executor's WAL audit trail where
322    /// GPU/TPU/JAX counts are irrelevant and shell-outs produce unwanted
323    /// stderr noise on systems without those tools installed.
324    ///
325    /// Results share the same internal cache — a previous full
326    /// [`capture`](Telemetry::capture) within 30 seconds will
327    /// be returned AS-IS (including hardware/network data). Callers
328    /// on hot paths should NOT rely on this returning empty hardware
329    /// if a full capture was recently cached.
330    #[must_use]
331    pub fn capture_lightweight() -> Self {
332        // Check cache first — if a full capture exists within TTL, return it.
333        // This is intentional: lightweight callers on hot paths benefit from
334        // cache hits without re-reading /proc.
335        let now = std::time::Instant::now();
336        {
337            let cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
338            if let Some((cached, instant)) = cache.as_ref() {
339                if now.duration_since(*instant).as_secs() < CACHE_TTL_SECS {
340                    return cached.clone();
341                }
342            }
343        }
344
345        let timestamp = std::time::SystemTime::now()
346            .duration_since(std::time::UNIX_EPOCH)
347            .map_or(0, |d| d.as_secs());
348
349        let telemetry = Self {
350            timestamp,
351            system: SystemInfo::capture(),
352            hardware: HardwareInfo {
353                accelerators: Vec::new(),
354                jax_available: false,
355                jax_version: None,
356                jax_device_count: None,
357            },
358            network: NetworkInfo {
359                public_ip: "unknown".to_string(),
360                tunnel_running: false,
361                tunnel_pid: None,
362                listening_ports: Vec::new(),
363            },
364        };
365
366        let mut cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
367        *cache = Some((telemetry.clone(), now));
368        telemetry
369    }
370
371    /// Clears the telemetry cache.
372    ///
373    /// Call this in tests or between full/lightweight captures to prevent
374    /// stale cached data from leaking across capture modes.
375    pub fn clear_cache() {
376        let mut cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
377        *cache = None;
378    }
379
380    /// Prints telemetry in a human-readable report to stdout.
381    ///
382    /// Output includes CPU cores, RAM available, machine-parseable uptime
383    /// seconds, contextualized load average (with core count), raw listening
384    /// ports, and tunnel PID.
385    pub fn print_report(&self) {
386        println!("\n{}", "=".repeat(60));
387        println!(" RUNTIMO TELEMETRY [{}]", self.timestamp);
388        println!("{}", "=".repeat(60));
389
390        println!("\n--- SYSTEM ---");
391        println!(
392            " CPU   : {} ({} cores)",
393            self.system.cpu_model, self.system.cpu_count
394        );
395        println!(
396            " RAM   : {} total, {} free, {} available",
397            self.system.ram_total, self.system.ram_free, self.system.ram_available
398        );
399        println!(
400            " Disk  : {} total, {} free ({}% used)",
401            self.system.disk_total, self.system.disk_free, self.system.disk_used_percent
402        );
403        // Machine-parseable uptime: "up 6 days (526380s)"
404        println!(
405            " Uptime: {} ({}s)",
406            self.system.uptime, self.system.uptime_seconds
407        );
408        // Contextualized load: "3.19, 4.93, 7.68 (4 cores)"
409        println!(
410            " Load  : {} ({} cores)",
411            self.system.load_average, self.system.cpu_count
412        );
413
414        println!("\n--- HARDWARE ---");
415        if self.hardware.accelerators.is_empty() {
416            println!(" Accelerators: none detected");
417        } else {
418            for acc in &self.hardware.accelerators {
419                println!(
420                    " {}: {}x {}{}",
421                    acc.kind,
422                    acc.count,
423                    acc.model.as_deref().unwrap_or("unknown"),
424                    acc.vendor
425                        .as_ref()
426                        .map(|v| format!(" ({})", v))
427                        .unwrap_or_default()
428                );
429            }
430        }
431        if self.hardware.jax_available {
432            println!(
433                " JAX: v{} ({} devices)",
434                self.hardware
435                    .jax_version
436                    .clone()
437                    .unwrap_or_else(|| "unknown".into()),
438                self.hardware.jax_device_count.unwrap_or(0)
439            );
440        }
441
442        println!("\n--- NETWORK ---");
443        println!(" Public IP: {}", self.network.public_ip);
444        // Tunnel with PID: "cloudflared (PID 1234)" or "none"
445        if self.network.tunnel_running {
446            println!(
447                " Tunnel: cloudflared (PID {})",
448                self.network
449                    .tunnel_pid
450                    .map_or_else(|| "?".to_string(), |p| p.to_string())
451            );
452        } else {
453            println!(" Tunnel: none");
454        }
455        if self.network.listening_ports.is_empty() {
456            println!(" Listening ports: none");
457        } else {
458            let ports_str = self
459                .network
460                .listening_ports
461                .iter()
462                .map(|p| p.to_string())
463                .collect::<Vec<_>>()
464                .join(", ");
465            println!(" Listening ports: {}", ports_str);
466        }
467
468        println!("\n{}", "=".repeat(60));
469    }
470}
471
472// ── SystemInfo capture — direct /proc reads ──────────────────────────────
473
474impl SystemInfo {
475    /// Captures system information from `/proc` and `/sys` files with a single
476    /// `df` shell-out for disk usage. No accelerator or network probing.
477    ///
478    /// Reads `/proc/cpuinfo` (model, count), `/proc/meminfo` (MemTotal,
479    /// MemFree, MemAvailable), `/proc/uptime`, and `/proc/loadavg`.
480    /// Disk info comes from `df` because Linux provides no per-mount usage
481    /// summary in procfs.
482    #[must_use]
483    pub(crate) fn capture() -> Self {
484        // /proc/cpuinfo: extract model name and count logical processors
485        let cpuinfo = read_proc_file("/proc/cpuinfo").unwrap_or_default();
486        let cpu_model = cpuinfo
487            .lines()
488            .find(|l| l.starts_with("model name"))
489            .and_then(|l| l.split(':').nth(1))
490            .map_or_else(|| "unknown".to_string(), |s| s.trim().to_string());
491        // Count lines beginning with "processor" — each is a logical core
492        let cpu_count: u32 = cpuinfo
493            .lines()
494            .filter(|l| l.starts_with("processor"))
495            .count()
496            .try_into()
497            .unwrap_or(0);
498
499        // /proc/meminfo: MemTotal, MemFree, MemAvailable (all in kB)
500        let meminfo = read_proc_file("/proc/meminfo").unwrap_or_default();
501        let ram_total = format_mem_kb(parse_meminfo_kb(&meminfo, "MemTotal:"));
502        let ram_free = format_mem_kb(parse_meminfo_kb(&meminfo, "MemFree:"));
503        let ram_available = format_mem_kb(parse_meminfo_kb(&meminfo, "MemAvailable:"));
504
505        // /proc/uptime: first field is uptime in seconds (fractional).
506        // The value is always non-negative; cast truncation is safe.
507        let uptime = read_proc_file("/proc/uptime").unwrap_or_default();
508        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
509        let uptime_seconds: u64 = uptime
510            .split_whitespace()
511            .next()
512            .and_then(|s| s.parse::<f64>().ok())
513            .map_or(0, |f: f64| f as u64);
514        let uptime_str = format_uptime(uptime_seconds);
515
516        // /proc/loadavg: first three fields are 1/5/15 min load averages
517        let loadavg = read_proc_file("/proc/loadavg").unwrap_or_default();
518        let load_average = {
519            // Extract first three whitespace-separated fields from /proc/loadavg
520            let mut fields = loadavg.split_whitespace();
521            match (fields.next(), fields.next(), fields.next()) {
522                (Some(one), Some(five), Some(fifteen)) => {
523                    format!("{one}, {five}, {fifteen}")
524                }
525                _ => String::from("unknown"),
526            }
527        };
528
529        // Disk: no /proc equivalent; keep df shell-out
530        let disk_total = run_cmd("df -h / | tail -1 | awk '{print $2}'").unwrap_or_default();
531        let disk_free = run_cmd("df -h / | tail -1 | awk '{print $4}'").unwrap_or_default();
532        let disk_pct_str = run_cmd("df / | tail -1 | awk '{print $5}'").unwrap_or_default();
533        let disk_used_percent = disk_pct_str.replace('%', "");
534
535        Self {
536            cpu_model,
537            cpu_count,
538            ram_total,
539            ram_free,
540            ram_available,
541            disk_total,
542            disk_free,
543            disk_used_percent,
544            uptime: uptime_str,
545            uptime_seconds,
546            load_average,
547        }
548    }
549}
550
551// ── HardwareInfo capture — vendor tools (no /proc equivalent) ────────────
552
553impl HardwareInfo {
554    fn capture() -> Self {
555        let mut accelerators = Vec::new();
556
557        // TPU devices via /dev/accel*
558        let tpu_count: usize = run_cmd("ls /dev/accel* 2>/dev/null | wc -l")
559            .unwrap_or_default()
560            .parse()
561            .unwrap_or(0);
562        if tpu_count > 0 {
563            accelerators.push(AcceleratorInfo {
564                kind: "tpu".into(),
565                count: tpu_count,
566                vendor: Some("google".into()),
567                model: None,
568            });
569        }
570
571        // NVIDIA GPUs via nvidia-smi
572        let nvidia_gpu_count: usize = run_cmd("nvidia-smi --list-gpus 2>/dev/null | wc -l")
573            .unwrap_or_default()
574            .parse()
575            .unwrap_or(0);
576        if nvidia_gpu_count > 0 {
577            let model =
578                run_cmd("nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1")
579                    .unwrap_or_default();
580            accelerators.push(AcceleratorInfo {
581                kind: "gpu".into(),
582                count: nvidia_gpu_count,
583                vendor: Some("nvidia".into()),
584                model: if model.is_empty() { None } else { Some(model) },
585            });
586        }
587
588        // AMD GPUs via rocm-smi
589        let amd_gpu_count: usize =
590            run_cmd("rocm-smi --showproductname 2>/dev/null | grep -c 'GPU\\['")
591                .unwrap_or_default()
592                .parse()
593                .unwrap_or(0);
594        if amd_gpu_count > 0 {
595            accelerators.push(AcceleratorInfo {
596                kind: "gpu".into(),
597                count: amd_gpu_count,
598                vendor: Some("amd".into()),
599                model: None,
600            });
601        }
602
603        // Generic DRM devices (fallback for any GPU)
604        if nvidia_gpu_count == 0 && amd_gpu_count == 0 {
605            let dri_count: usize = run_cmd("ls /dev/dri/render* 2>/dev/null | wc -l")
606                .unwrap_or_default()
607                .parse()
608                .unwrap_or(0);
609            if dri_count > 0 {
610                accelerators.push(AcceleratorInfo {
611                    kind: "gpu".into(),
612                    count: dri_count,
613                    vendor: None,
614                    model: Some("drm-render".into()),
615                });
616            }
617        }
618
619        let jax_available =
620            run_cmd("timeout 10 python3 -c 'import jax' 2>/dev/null && echo yes || echo no")
621                .unwrap_or_default()
622                == "yes";
623        let jax_version = if jax_available {
624            Some(
625                run_cmd("timeout 10 python3 -c 'import jax; print(jax.__version__)'")
626                    .unwrap_or_default(),
627            )
628        } else {
629            None
630        };
631        let jax_device_count = if jax_available {
632            run_cmd("timeout 10 python3 -c 'import jax; print(len(jax.devices()))'")
633                .unwrap_or_default()
634                .parse()
635                .ok()
636        } else {
637            None
638        };
639
640        Self {
641            accelerators,
642            jax_available,
643            jax_version,
644            jax_device_count,
645        }
646    }
647}
648
649// ── NetworkInfo capture — /proc for tunnels and ports ────────────────────
650
651impl NetworkInfo {
652    /// Captures network state with opt-in public IP, tunnel detection via
653    /// `/proc/*/comm`, and listening ports from `/proc/net/tcp` + `tcp6`.
654    ///
655    /// Public IP is only queried when `RUNTIMO_ENABLE_PUBLIC_IP=1`. Without it,
656    /// `public_ip` is set to `"unknown"`.
657    ///
658    /// Tunnel detection reads `/proc/[0-9]*/comm` files and checks if any
659    /// contain `"cloudflared"`. The `comm` file holds only the process name
660    /// (max 16 chars), never the command line — this eliminates the self-match
661    /// bug where `pgrep -fa cloudflared` matches its own shell invocation.
662    fn capture() -> Self {
663        let public_ip = if std::env::var("RUNTIMO_ENABLE_PUBLIC_IP").as_deref() == Ok("1") {
664            run_cmd(
665                "curl -s --connect-timeout 5 --max-time 5 ifconfig.me 2>/dev/null || echo 'unknown'",
666            )
667            .unwrap_or_else(|_| "unknown".to_string())
668        } else {
669            "unknown".to_string()
670        };
671
672        let (tunnel_running, tunnel_pid) = detect_cloudflared();
673        let listening_ports = read_listening_ports();
674
675        Self {
676            public_ip,
677            tunnel_running,
678            tunnel_pid,
679            listening_ports,
680        }
681    }
682}
683
/// Looks for a running `cloudflared` process by scanning `/proc/<pid>/comm`.
///
/// # How it works
///
/// 1. Every entry of `/proc` whose name consists solely of ASCII digits is
///    treated as a PID directory.
/// 2. The `comm` file inside that directory is read. It contains only the
///    process name (truncated to 15 chars by the kernel), never the command
///    line or its arguments.
/// 3. The first entry whose trimmed `comm` equals `"cloudflared"` and whose
///    directory name parses as `u32` wins; that number is the PID.
///
/// Entries that cannot be read (for example because the process exited in
/// the meantime) are skipped.
///
/// # Why `comm`, not `cmdline`
///
/// `/proc/[pid]/cmdline` holds the full, null-delimited command line,
/// including arguments like `--token <value>`. Using `comm` avoids:
/// - Reading potentially sensitive command-line tokens.
/// - The self-match bug: `sh -c pgrep -fa cloudflared` contains `cloudflared`
///   in its command line but NOT in its `comm` file (which would be `sh`
///   or `pgrep`).
///
/// Returns `(true, Some(pid))` if found, `(false, None)` otherwise, including
/// when `/proc` itself cannot be listed.
fn detect_cloudflared() -> (bool, Option<u32>) {
    let tunnel_pid = std::fs::read_dir("/proc").ok().and_then(|entries| {
        entries.flatten().find_map(|entry| {
            let pid_dir = entry.path();
            let pid_text = pid_dir.file_name()?.to_str()?;
            // Anything that is not purely numeric is not a PID directory.
            if pid_text.chars().any(|c| !c.is_ascii_digit()) {
                return None;
            }

            let comm = std::fs::read_to_string(pid_dir.join("comm")).ok()?;
            if comm.trim() != "cloudflared" {
                return None;
            }

            pid_text.parse::<u32>().ok()
        })
    });

    (tunnel_pid.is_some(), tunnel_pid)
}
737
738/// Reads listening TCP ports from `/proc/net/tcp` and `/proc/net/tcp6`.
739///
740/// # Format
741///
742/// Each line (after the header) has the format:
743/// ```text
744///   0: 00000000:0016 00000000:0000 0A ...
745/// ```
746///
747/// - Column 2 (`00000000:0016`) is the local address. The part after the
748///   colon (`0016`) is the port number in hexadecimal.
749/// - Column 4 (`0A`) is the socket state in hexadecimal. `0A` = `LISTEN`.
750///
751/// Only entries with state `0A` (LISTEN) are included. Ports are sorted
752/// ascending and deduplicated.
753///
754/// # Why `/proc/net/tcp`, not `ss -ltnp`
755///
756/// - `/proc/net/tcp` is a kernel-provided procfs file — no subprocess,
757///   no command parsing, no fragile positional output logic.
758/// - `ss -ltnp` requires shell-out, parses variable-width columns, and
759///   may produce output that varies across `iproute2` versions.
760/// - The procfs format is stable kernel ABI.
761fn read_listening_ports() -> Vec<u16> {
762    let mut ports = Vec::new();
763
764    for path in &["/proc/net/tcp", "/proc/net/tcp6"] {
765        let data = read_proc_file(path).unwrap_or_default();
766        // Skip header line (starts with "  sl")
767        for line in data.lines().skip(1) {
768            let parts: Vec<&str> = line.split_whitespace().collect();
769            // Minimum columns: sl(0:) + local_address + rem_address + state
770            if parts.len() < 4 {
771                continue;
772            }
773
774            // Column 2 = local_address (e.g. "00000000:0016")
775            // Column 4 = state (e.g. "0A" = LISTEN)
776            // Use .get() for clippy::indexing_slicing compliance
777            if parts.get(3) != Some(&"0A") {
778                continue;
779            }
780
781            // Extract port hex from local_address (portion after ':')
782            if let Some(port_hex) = parts.get(1).and_then(|addr| addr.split(':').nth(1)) {
783                if let Ok(port) = u16::from_str_radix(port_hex, 16) {
784                    ports.push(port);
785                }
786            }
787        }
788    }
789
790    ports.sort_unstable();
791    ports.dedup();
792    ports
793}
794
795// ── Tests ────────────────────────────────────────────────────────────────
796
#[cfg(test)]
mod tests {
    use super::*;

    // ── SystemInfo tests ────────────────────────────────────────────

    /// Smoke-tests a full capture on the live host: string fields are
    /// populated and numeric fields are positive.
    #[test]
    fn test_telemetry_capture() {
        let telemetry = Telemetry::capture();
        assert!(telemetry.timestamp > 0, "timestamp must be positive");

        let s = &telemetry.system;
        assert!(!s.cpu_model.is_empty(), "cpu_model must not be empty");
        assert!(s.cpu_count > 0, "cpu_count must be > 0");
        assert!(!s.ram_total.is_empty(), "ram_total must not be empty");
        assert!(!s.ram_free.is_empty(), "ram_free must not be empty");
        assert!(
            !s.ram_available.is_empty(),
            "ram_available must not be empty"
        );
        assert!(!s.disk_total.is_empty(), "disk_total must not be empty");
        assert!(s.uptime_seconds > 0, "uptime_seconds must be > 0");
        assert!(!s.load_average.is_empty(), "load_average must not be empty");

        // An empty accelerator list is fine; any entry present must be
        // well-formed.
        let h = &telemetry.hardware;
        assert!(
            h.accelerators.iter().all(|a| !a.kind.is_empty()),
            "accelerator kind must not be empty"
        );
        assert!(
            h.accelerators.iter().all(|a| a.count > 0),
            "accelerator count must be > 0"
        );

        let net = &telemetry.network;
        assert!(!net.public_ip.is_empty(), "public_ip must not be empty");
        // Public IP lookup is opt-in via RUNTIMO_ENABLE_PUBLIC_IP=1, so the
        // "unknown" placeholder is only guaranteed when that variable is
        // absent. Asserting it unconditionally made this test fail on any
        // host where the operator had opted in.
        if std::env::var_os("RUNTIMO_ENABLE_PUBLIC_IP").is_none() {
            assert_eq!(
                net.public_ip, "unknown",
                "public_ip should be 'unknown' by default (opt-in via RUNTIMO_ENABLE_PUBLIC_IP=1)"
            );
        }
        // listening_ports is a Vec — can be empty in container/isolated env
        assert!(
            net.listening_ports.iter().all(|p| *p > 0),
            "all listening ports must be > 0"
        );
    }

    /// Two back-to-back captures must report the same timestamp, i.e. the
    /// second call is served from the cache.
    #[test]
    fn test_telemetry_cache_works() {
        let t1 = Telemetry::capture();
        let t2 = Telemetry::capture();
        assert_eq!(
            t1.timestamp, t2.timestamp,
            "cached telemetry should be identical"
        );
    }

    /// Verifies cpu_count, ram_available, uptime and cpu_model are populated
    /// from /proc reads (not from shell commands that might fail in minimal
    /// containers).
    #[test]
    fn test_system_info_from_proc() {
        let sys = SystemInfo::capture();
        assert!(sys.cpu_count > 0, "cpu_count from /proc/cpuinfo");
        assert!(
            !sys.ram_available.is_empty(),
            "ram_available from /proc/meminfo MemAvailable"
        );
        assert!(sys.uptime_seconds > 0, "uptime_seconds from /proc/uptime");
        // The human-readable uptime string always carries the "up " prefix.
        assert!(
            sys.uptime.starts_with("up "),
            "uptime string should start with 'up ': got '{}'",
            sys.uptime
        );
        assert!(
            !sys.cpu_model.is_empty(),
            "cpu_model from /proc/cpuinfo model name"
        );
    }

    /// The cloudflared detection must NOT self-match: it reads
    /// /proc/*/comm rather than using pgrep, so the reported PID has to
    /// belong to a process whose comm is exactly `cloudflared`.
    #[test]
    fn test_cloudflared_detection() {
        let (running, pid) = detect_cloudflared();

        // When nothing is running there is nothing further to verify; the
        // call returning at all is the check.
        if running {
            let found_pid = pid.expect("tunnel_running implies tunnel_pid");
            let comm_path = format!("/proc/{}/comm", found_pid);
            // comm may be unreadable by now (e.g. the process exited after
            // detection); only verify the name when the read succeeds.
            if let Ok(content) = std::fs::read_to_string(&comm_path) {
                assert_eq!(
                    content.trim(),
                    "cloudflared",
                    "PID {} comm should be 'cloudflared', got '{}'",
                    found_pid,
                    content.trim()
                );
            }
        }
    }

    /// The live port list must be strictly ascending and free of port 0.
    /// It may legitimately be empty in isolated containers, so the length
    /// is not asserted.
    #[test]
    fn test_listening_ports() {
        let ports = read_listening_ports();

        // Strictly ascending neighbours imply both "sorted" and
        // "no duplicates" in a single pass.
        for pair in ports.windows(2) {
            if let [lower, higher] = pair {
                assert!(
                    lower < higher,
                    "listening ports must be sorted with no duplicates: {:?}",
                    ports
                );
            }
        }

        // All ports should be valid (1-65535)
        assert!(
            ports.iter().all(|&p| p > 0),
            "port 0 is not a valid listening port"
        );
    }

    // ── Helper function tests ────────────────────────────────────────

    /// Pins the unit selection and rounding of `format_mem_kb`.
    #[test]
    fn test_format_mem_kb() {
        assert_eq!(format_mem_kb(512), "512Ki");
        assert_eq!(format_mem_kb(1024), "1Mi");
        // 1536 KiB is reported as 1Mi: the fractional part is dropped.
        assert_eq!(format_mem_kb(1536), "1Mi");
        assert_eq!(format_mem_kb(1048576), "1Gi");
        assert_eq!(format_mem_kb(2097152), "2Gi");
        // 768000 / 1024 = 750
        assert_eq!(format_mem_kb(768000), "750Mi");
        // Edge: 0 KB
        assert_eq!(format_mem_kb(0), "0Ki");
    }

    /// Checks the unit words `format_uptime` emits at minute, hour and day
    /// boundaries, and the constant "up " prefix.
    #[test]
    fn test_format_uptime() {
        assert!(
            format_uptime(0).contains("minute"),
            "zero uptime: {}",
            format_uptime(0)
        );
        assert!(
            format_uptime(60).contains("1 minute"),
            "60s: {}",
            format_uptime(60)
        );
        assert!(
            format_uptime(3600).contains("1 hour"),
            "3600s: {}",
            format_uptime(3600)
        );
        assert!(
            format_uptime(86400).contains("1 day"),
            "86400s: {}",
            format_uptime(86400)
        );
        // All start with "up "
        assert!(
            format_uptime(12345).starts_with("up "),
            "uptime should start with 'up '"
        );
    }

    /// `parse_meminfo_kb` extracts the numeric kB value for a key and
    /// yields 0 when the key is missing or the input is empty.
    #[test]
    fn test_parse_meminfo_kb() {
        let sample = "MemTotal:       32768000 kB\nMemFree:         8000000 kB\nMemAvailable:   22000000 kB\n";
        assert_eq!(parse_meminfo_kb(sample, "MemTotal:"), 32_768_000);
        assert_eq!(parse_meminfo_kb(sample, "MemFree:"), 8_000_000);
        assert_eq!(parse_meminfo_kb(sample, "MemAvailable:"), 22_000_000);
        // Missing key
        assert_eq!(parse_meminfo_kb(sample, "SwapTotal:"), 0);
        // Empty input
        assert_eq!(parse_meminfo_kb("", "MemTotal:"), 0);
    }

    // ── Backward compatibility tests ─────────────────────────────────

    /// Per-kind totals can be derived from the generic accelerator list by
    /// filtering on `kind` and summing `count`.
    #[test]
    fn test_accelerators_back_compat() {
        let hw = HardwareInfo {
            accelerators: vec![
                AcceleratorInfo {
                    kind: "gpu".into(),
                    count: 4,
                    vendor: Some("nvidia".into()),
                    model: Some("A100".into()),
                },
                AcceleratorInfo {
                    kind: "tpu".into(),
                    count: 8,
                    vendor: Some("google".into()),
                    model: None,
                },
            ],
            jax_available: false,
            jax_version: None,
            jax_device_count: None,
        };

        let total_tpu: usize = hw
            .accelerators
            .iter()
            .filter(|a| a.kind == "tpu")
            .map(|a| a.count)
            .sum();
        let total_gpu: usize = hw
            .accelerators
            .iter()
            .filter(|a| a.kind == "gpu")
            .map(|a| a.count)
            .sum();

        assert_eq!(total_tpu, 8, "total tpu should be 8");
        assert_eq!(total_gpu, 4, "total gpu should be 4");
    }

    /// A host without accelerators is represented by an empty list.
    #[test]
    fn test_accelerators_empty_is_valid() {
        let hw = HardwareInfo {
            accelerators: vec![],
            jax_available: false,
            jax_version: None,
            jax_device_count: None,
        };

        assert!(hw.accelerators.is_empty());
    }

    /// `HardwareInfo` and `NetworkInfo` survive a JSON round trip.
    #[test]
    fn test_telemetry_serialization_roundtrip() {
        let hw = HardwareInfo {
            accelerators: vec![AcceleratorInfo {
                kind: "gpu".into(),
                count: 2,
                vendor: Some("nvidia".into()),
                model: Some("H100".into()),
            }],
            jax_available: true,
            jax_version: Some("0.4.30".into()),
            jax_device_count: Some(2),
        };

        let net = NetworkInfo {
            public_ip: "192.0.2.1".into(),
            tunnel_running: false,
            tunnel_pid: None,
            listening_ports: vec![22, 80, 443],
        };

        let json = serde_json::to_string(&hw).expect("HardwareInfo must serialize");
        let parsed: HardwareInfo =
            serde_json::from_str(&json).expect("HardwareInfo must deserialize");
        assert_eq!(parsed.accelerators.len(), 1);
        let first = parsed
            .accelerators
            .first()
            .expect("exactly one accelerator was serialized");
        assert_eq!(first.kind, "gpu");
        assert_eq!(first.model.as_deref(), Some("H100"));

        let json = serde_json::to_string(&net).expect("NetworkInfo must serialize");
        let parsed: NetworkInfo =
            serde_json::from_str(&json).expect("NetworkInfo must deserialize");
        assert!(parsed.listening_ports.contains(&22));
        assert!(parsed.listening_ports.contains(&443));
        assert!(!parsed.tunnel_running);
        assert!(parsed.tunnel_pid.is_none());
    }

    /// JSON written before the `accelerators` field existed must still
    /// deserialize, yielding an empty accelerator list.
    #[test]
    fn test_telemetry_deserialize_old_wal_event() {
        let old_json = r#"{
            "jax_available": true,
            "jax_version": "0.4.25",
            "jax_device_count": 8
        }"#;

        let parsed: HardwareInfo =
            serde_json::from_str(old_json).expect("old WAL event must deserialize");
        assert!(
            parsed.accelerators.is_empty(),
            "old WAL events deserialize with empty accelerators"
        );
        assert!(parsed.jax_available);
    }

    /// `listening_ports` keeps its values and order through a JSON round
    /// trip.
    #[test]
    fn test_network_info_listening_ports_roundtrip() {
        let net = NetworkInfo {
            public_ip: "unknown".into(),
            tunnel_running: false,
            tunnel_pid: None,
            listening_ports: vec![22, 11434, 3389],
        };

        let json = serde_json::to_string(&net).expect("NetworkInfo must serialize");
        let parsed: NetworkInfo =
            serde_json::from_str(&json).expect("NetworkInfo must deserialize");
        assert_eq!(parsed.listening_ports, vec![22, 11434, 3389]);
        assert!(!parsed.tunnel_running);
    }
}
runtimo_core/telemetry.rs

runtimo_core/
telemetry.rs