runtimo_core/
telemetry.rs

1//! System Telemetry — Via Negativa: raw observation, no interpretation.
2//!
3//! Captures a snapshot of the host machine by reading `/proc` and `/sys`
4//! directly. Every field is backed by a raw kernel filesystem read — no
5//! shell-out for data available in `/proc`, no pgrep, no service name
6//! guessing, no version detection.
7//!
8//! # Via Negativa Philosophy
9//!
10//! This module removes everything that is not direct observation:
11//!
12//! - **No pgrep** — tunnel detection reads `/proc/[0-9]*/comm` files
13//!   (process names, not command lines). The observer no longer matches
14//!   its own shell command as a running `cloudflared` process.
15//! - **No service guessing** — port detection reads `/proc/net/tcp` and
16//!   `/proc/net/tcp6` directly, returning raw `Vec<u16>`. Port 22 is
17//!   just `22` — the consumer decides it is SSH.
18//! - **No `ss -ltnp` parsing** — eliminated >50 lines of fragile
19//!   positional output parsing.
20//! - **No version detection** — no `sshd -V`, `nginx -v`, etc.
21//! - **Raw /proc reads** — cpuinfo, meminfo, uptime, loadavg, net/tcp.
22//! - **Shell-out only where no `/proc` equivalent exists** — `df` for
23//!   disk, `curl` for public IP (opt-in), accelerator detection.
24//!
25//! # Example
26//!
27//! ```rust,ignore
28//! use runtimo_core::Telemetry;
29//!
30//! let tel = Telemetry::capture();
31//! tel.print_report();
32//! ```
33//!
34//! # Performance
35//!
36//! Results are cached for 30 seconds via [`TELEMETRY_CACHE`] to avoid
37//! repeated `/proc` reads on consecutive calls.
38
39use crate::cmd::run_cmd;
40use serde::{Deserialize, Serialize};
41use std::sync::Mutex;
42
/// Process-wide cache holding the most recent [`Telemetry`] snapshot together
/// with the `Instant` stored alongside it by [`Telemetry::capture`].
/// `None` until the first capture has completed.
static TELEMETRY_CACHE: Mutex<Option<(Telemetry, std::time::Instant)>> = Mutex::new(None);
/// Age limit, in whole seconds, below which [`Telemetry::capture`] returns the
/// cached snapshot instead of taking a new one.
const CACHE_TTL_SECS: u64 = 30;
45
/// Full system telemetry snapshot.
///
/// Contains three sub-structures: [`SystemInfo`], [`HardwareInfo`],
/// and [`NetworkInfo`], plus a Unix timestamp. Service detection has been
/// removed in favor of raw listening ports in [`NetworkInfo`].
///
/// The type derives `Serialize` and `Deserialize`, so a snapshot can be
/// written out and read back through serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(clippy::exhaustive_structs)]
pub struct Telemetry {
    /// Unix timestamp (seconds) when the snapshot was taken. Recorded as `0`
    /// if the system clock reports a time before the Unix epoch.
    pub timestamp: u64,
    /// Basic system information (CPU, RAM, disk, uptime, load).
    pub system: SystemInfo,
    /// Special hardware devices (TPU, GPU, JAX availability).
    pub hardware: HardwareInfo,
    /// Network state (public IP, tunnel status, listening ports).
    pub network: NetworkInfo,
}
63
/// Basic system information — direct `/proc` reads only.
///
/// No shell commands are used for data available in `/proc`. Disk
/// information (`df`) is the only exception because Linux provides
/// no per-mount usage summary in `/proc`.
///
/// Fields never fail to populate: when a `/proc` file cannot be read or a
/// value cannot be parsed, the field falls back to the placeholder noted on
/// it below.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(clippy::exhaustive_structs)]
pub struct SystemInfo {
    /// CPU model string from `/proc/cpuinfo` `model name` field, or
    /// `"unknown"` when no such line is present.
    pub cpu_model: String,
    /// Logical CPU core count from `/proc/cpuinfo` (counts `processor` entries).
    /// `0` when no `processor` lines are found.
    pub cpu_count: u32,
    /// Total RAM in human-readable form (e.g. `"32Gi"`) from `/proc/meminfo`
    /// `MemTotal` (kB → human). `"0Ki"` when the key is missing or unparseable.
    pub ram_total: String,
    /// Free RAM in human-readable form (e.g. `"750Mi"`) from `/proc/meminfo`
    /// `MemFree` (kB → human). `"0Ki"` when the key is missing or unparseable.
    pub ram_free: String,
    /// Available RAM in human-readable form (e.g. `"22Gi"`) from `/proc/meminfo`
    /// `MemAvailable` (kB → human). This is the memory usable for new
    /// allocations without swapping — more useful than `ram_free` for
    /// capacity planning. `"0Ki"` when the key is missing or unparseable.
    pub ram_available: String,
    /// Total disk space in human-readable form (e.g. `"100G"`) from `df -h /`.
    pub disk_total: String,
    /// Free disk space in human-readable form from `df -h /`.
    pub disk_free: String,
    /// Disk usage percentage as a string without `%` sign (e.g. `"45"`),
    /// taken from the last line of `df /`.
    pub disk_used_percent: String,
    /// Human-readable uptime (e.g. `"up 6 days, 3 hours"`) computed from
    /// `/proc/uptime`.
    pub uptime: String,
    /// Machine-parseable uptime in seconds from `/proc/uptime` first field
    /// (fractional part dropped). `0` when the file is unreadable or the
    /// field does not parse.
    pub uptime_seconds: u64,
    /// Load average string (e.g. `"0.50, 0.30, 0.20"`) from `/proc/loadavg`
    /// first three fields, or `"unknown"` when fewer than three fields exist.
    pub load_average: String,
}
102
/// Special hardware device information.
///
/// Detects accelerators generically — GPUs (nvidia-smi, rocm-smi, /dev/dri),
/// TPUs (/dev/accel*), and JAX availability. Reports what exists, not what
/// was expected. Shell commands are used here because accelerator detection
/// requires vendor-specific tools that have no `/proc` equivalent.
///
/// Every field is marked `#[serde(default)]`, so serialized data that omits
/// a field still deserializes (empty vec / `false` / `None`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(clippy::exhaustive_structs)]
pub struct HardwareInfo {
    /// Detected accelerator devices (any kind). Empty vec = no accelerators found.
    #[serde(default)]
    pub accelerators: Vec<AcceleratorInfo>,
    /// Whether the `jax` Python package is importable.
    #[serde(default)]
    pub jax_available: bool,
    /// JAX version string (e.g. `"0.4.25"`), if available.
    /// `None` whenever `jax_available` is `false`.
    #[serde(default)]
    pub jax_version: Option<String>,
    /// Number of JAX-visible devices, if available. `None` when JAX is not
    /// importable or the reported count is not a valid integer.
    #[serde(default)]
    pub jax_device_count: Option<usize>,
}
125
/// A detected hardware accelerator.
///
/// One entry describes all devices found by a single detection method
/// (e.g. every NVIDIA GPU listed by `nvidia-smi`), not one entry per device.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(clippy::exhaustive_structs)]
pub struct AcceleratorInfo {
    /// Accelerator kind: "gpu", "tpu", "npu".
    pub kind: String,
    /// Number of devices of this kind detected.
    pub count: usize,
    /// Vendor name if identifiable (e.g. "nvidia", "amd", "google").
    /// Defaults to `None` when absent from serialized input.
    #[serde(default)]
    pub vendor: Option<String>,
    /// Device model string if available.
    /// Defaults to `None` when absent from serialized input.
    #[serde(default)]
    pub model: Option<String>,
}
141
/// Network state information.
///
/// Public IP capture is **opt-in** via `RUNTIMO_ENABLE_PUBLIC_IP=1`.
/// Without this env var, `public_ip` defaults to `"unknown"` to prevent
/// unintended external network metadata leakage.
///
/// Tunnel detection reads `/proc/[0-9]*/comm` files (process names only,
/// not command lines). This eliminates the self-match bug where `pgrep`
/// would match the shell that runs `pgrep` itself.
///
/// Listening ports are read directly from `/proc/net/tcp` and `/proc/net/tcp6`
/// — no `ss` shell-out, no service name guessing.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(clippy::exhaustive_structs)]
pub struct NetworkInfo {
    /// Public IP address (from `ifconfig.me` when `RUNTIMO_ENABLE_PUBLIC_IP=1`),
    /// or `"unknown"`.
    pub public_ip: String,
    /// Whether a `cloudflared` tunnel process is running (detected via
    /// `/proc/*/comm` content match, not pgrep).
    pub tunnel_running: bool,
    /// PID of the `cloudflared` process if found, extracted from the
    /// `/proc/<pid>` directory name. `None` when no tunnel was detected.
    pub tunnel_pid: Option<u32>,
    /// Raw listening TCP ports from `/proc/net/tcp` and `/proc/net/tcp6`.
    /// Only ports in `LISTEN` (state `0A`) state are included.
    /// Sorted ascending, duplicates removed. Defaults to an empty vec when
    /// absent from serialized input.
    #[serde(default)]
    pub listening_ports: Vec<u16>,
}
172
173// ── /proc file reading helpers ───────────────────────────────────────────
174
/// Loads a whole file (normally one under `/proc`) into memory as UTF-8 text.
///
/// # Input
///
/// `path` — Location of the file to read, e.g. `"/proc/meminfo"`.
///
/// # Output
///
/// `Ok(String)` with the complete contents on success. An empty file yields
/// `Ok` with an empty string — emptiness is not an error.
/// `Err(io::Error)` when the file cannot be opened or read (missing file,
/// insufficient permissions, invalid UTF-8, other I/O failure).
///
/// Callers decide how to degrade on `Err`; this helper never substitutes a
/// default on its own.
fn read_proc_file(path: &str) -> std::io::Result<String> {
    let contents = std::fs::read_to_string(path)?;
    Ok(contents)
}
192
/// Looks up one entry of `/proc/meminfo`-style text and returns its value in kB.
///
/// Lines look like `Key:    12345 kB`. The first line that begins with `key`
/// is selected, and its second whitespace-separated token is parsed as `u64`.
/// Pass the key with its trailing colon (e.g. `"MemTotal:"`) so that longer
/// keys sharing the same prefix are not matched by accident.
///
/// Returns `0` when no line starts with `key`, when the selected line has no
/// second token, or when that token is not a valid unsigned integer. Only the
/// first matching line is ever examined.
fn parse_meminfo_kb(data: &str, key: &str) -> u64 {
    let Some(entry) = data.lines().find(|line| line.starts_with(key)) else {
        return 0;
    };

    let mut tokens = entry.split_whitespace();
    // The first token is the key itself; the value follows it.
    tokens.next();
    match tokens.next() {
        Some(raw) => raw.parse().unwrap_or(0),
        None => 0,
    }
}
207
/// Renders a size given in KiB as a short `free -h`-style string.
///
/// The largest binary unit that fits is chosen among `Ki`, `Mi` and `Gi`
/// (there is no larger tier — a terabyte-scale value is still printed in
/// `Gi`). Unit boundaries are powers of 1024, and the quotient is truncated
/// toward zero, so no fractional digits are ever shown.
///
/// # Examples
///
/// - `format_mem_kb(0)` → `"0Ki"`
/// - `format_mem_kb(512)` → `"512Ki"`
/// - `format_mem_kb(1536)` → `"1Mi"`
/// - `format_mem_kb(768000)` → `"750Mi"`
/// - `format_mem_kb(16777216)` → `"16Gi"`
fn format_mem_kb(kb: u64) -> String {
    const KIB_PER_MIB: u64 = 1 << 10;
    const KIB_PER_GIB: u64 = 1 << 20;

    match kb {
        n if n >= KIB_PER_GIB => format!("{}Gi", n / KIB_PER_GIB),
        n if n >= KIB_PER_MIB => format!("{}Mi", n / KIB_PER_MIB),
        n => format!("{n}Ki"),
    }
}
231
/// Turns an uptime expressed in seconds into an `uptime -p`-style phrase.
///
/// The duration is split into whole days, hours and minutes; leftover
/// seconds are discarded. Units whose value is zero are left out, except
/// that minutes are always printed when nothing else would be — so the
/// result is never just `"up "`. Each unit is pluralised unless its value
/// is exactly 1.
///
/// # Examples
///
/// - `format_uptime(0)` → `"up 0 minutes"`
/// - `format_uptime(60)` → `"up 1 minute"`
/// - `format_uptime(3661)` → `"up 1 hour, 1 minute"`
/// - `format_uptime(526380)` → `"up 6 days, 2 hours, 13 minutes"`
fn format_uptime(total_seconds: u64) -> String {
    // "<n> <unit>" with an "s" appended for every count except 1.
    let label = |n: u64, unit: &str| -> String {
        let plural = if n == 1 { "" } else { "s" };
        format!("{n} {unit}{plural}")
    };

    let days = total_seconds / 86_400;
    let hours = total_seconds % 86_400 / 3_600;
    let minutes = total_seconds % 3_600 / 60;

    // Days and hours appear only when non-zero.
    let mut pieces: Vec<String> = [(days, "day"), (hours, "hour")]
        .iter()
        .filter(|(n, _)| *n > 0)
        .map(|&(n, unit)| label(n, unit))
        .collect();

    // Minutes double as the fallback so that short uptimes still print something.
    if minutes > 0 || pieces.is_empty() {
        pieces.push(label(minutes, "minute"));
    }

    format!("up {}", pieces.join(", "))
}
269
270// ── Telemetry capture ────────────────────────────────────────────────────
271
272impl Telemetry {
273    /// Captures a full system telemetry snapshot.
274    ///
275    /// Results are cached for [`CACHE_TTL_SECS`] (30 seconds) to avoid
276    /// repeated filesystem reads on consecutive calls. Network queries
277    /// (public_ip, tunnel) are included in the cached value.
278    pub fn capture() -> Self {
279        let now = std::time::Instant::now();
280        {
281            let cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
282            if let Some((cached, instant)) = cache.as_ref() {
283                if now.duration_since(*instant).as_secs() < CACHE_TTL_SECS {
284                    return cached.clone();
285                }
286            }
287        }
288
289        let timestamp = std::time::SystemTime::now()
290            .duration_since(std::time::UNIX_EPOCH)
291            .map_or(0, |d| d.as_secs());
292
293        let telemetry = Self {
294            timestamp,
295            system: SystemInfo::capture(),
296            hardware: HardwareInfo::capture(),
297            network: NetworkInfo::capture(),
298        };
299
300        let mut cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
301        *cache = Some((telemetry.clone(), now));
302        telemetry
303    }
304
305    /// Prints telemetry in a human-readable report to stdout.
306    ///
307    /// Output includes CPU cores, RAM available, machine-parseable uptime
308    /// seconds, contextualized load average (with core count), raw listening
309    /// ports, and tunnel PID.
310    pub fn print_report(&self) {
311        println!("\n{}", "=".repeat(60));
312        println!(" RUNTIMO TELEMETRY [{}]", self.timestamp);
313        println!("{}", "=".repeat(60));
314
315        println!("\n--- SYSTEM ---");
316        println!(
317            " CPU   : {} ({} cores)",
318            self.system.cpu_model, self.system.cpu_count
319        );
320        println!(
321            " RAM   : {} total, {} free, {} available",
322            self.system.ram_total, self.system.ram_free, self.system.ram_available
323        );
324        println!(
325            " Disk  : {} total, {} free ({}% used)",
326            self.system.disk_total, self.system.disk_free, self.system.disk_used_percent
327        );
328        // Machine-parseable uptime: "up 6 days (526380s)"
329        println!(
330            " Uptime: {} ({}s)",
331            self.system.uptime, self.system.uptime_seconds
332        );
333        // Contextualized load: "3.19, 4.93, 7.68 (4 cores)"
334        println!(
335            " Load  : {} ({} cores)",
336            self.system.load_average, self.system.cpu_count
337        );
338
339        println!("\n--- HARDWARE ---");
340        if self.hardware.accelerators.is_empty() {
341            println!(" Accelerators: none detected");
342        } else {
343            for acc in &self.hardware.accelerators {
344                println!(
345                    " {}: {}x {}{}",
346                    acc.kind,
347                    acc.count,
348                    acc.model.as_deref().unwrap_or("unknown"),
349                    acc.vendor
350                        .as_ref()
351                        .map(|v| format!(" ({})", v))
352                        .unwrap_or_default()
353                );
354            }
355        }
356        if self.hardware.jax_available {
357            println!(
358                " JAX: v{} ({} devices)",
359                self.hardware
360                    .jax_version
361                    .clone()
362                    .unwrap_or_else(|| "unknown".into()),
363                self.hardware.jax_device_count.unwrap_or(0)
364            );
365        }
366
367        println!("\n--- NETWORK ---");
368        println!(" Public IP: {}", self.network.public_ip);
369        // Tunnel with PID: "cloudflared (PID 1234)" or "none"
370        if self.network.tunnel_running {
371            println!(
372                " Tunnel: cloudflared (PID {})",
373                self.network
374                    .tunnel_pid
375                    .map_or_else(|| "?".to_string(), |p| p.to_string())
376            );
377        } else {
378            println!(" Tunnel: none");
379        }
380        if self.network.listening_ports.is_empty() {
381            println!(" Listening ports: none");
382        } else {
383            let ports_str = self
384                .network
385                .listening_ports
386                .iter()
387                .map(|p| p.to_string())
388                .collect::<Vec<_>>()
389                .join(", ");
390            println!(" Listening ports: {}", ports_str);
391        }
392
393        println!("\n{}", "=".repeat(60));
394    }
395}
396
397// ── SystemInfo capture — direct /proc reads ──────────────────────────────
398
399impl SystemInfo {
400    fn capture() -> Self {
401        // /proc/cpuinfo: extract model name and count logical processors
402        let cpuinfo = read_proc_file("/proc/cpuinfo").unwrap_or_default();
403        let cpu_model = cpuinfo
404            .lines()
405            .find(|l| l.starts_with("model name"))
406            .and_then(|l| l.split(':').nth(1))
407            .map_or_else(|| "unknown".to_string(), |s| s.trim().to_string());
408        // Count lines beginning with "processor" — each is a logical core
409        let cpu_count: u32 = cpuinfo
410            .lines()
411            .filter(|l| l.starts_with("processor"))
412            .count()
413            .try_into()
414            .unwrap_or(0);
415
416        // /proc/meminfo: MemTotal, MemFree, MemAvailable (all in kB)
417        let meminfo = read_proc_file("/proc/meminfo").unwrap_or_default();
418        let ram_total = format_mem_kb(parse_meminfo_kb(&meminfo, "MemTotal:"));
419        let ram_free = format_mem_kb(parse_meminfo_kb(&meminfo, "MemFree:"));
420        let ram_available = format_mem_kb(parse_meminfo_kb(&meminfo, "MemAvailable:"));
421
422        // /proc/uptime: first field is uptime in seconds (fractional).
423        // The value is always non-negative; cast truncation is safe.
424        let uptime = read_proc_file("/proc/uptime").unwrap_or_default();
425        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
426        let uptime_seconds: u64 = uptime
427            .split_whitespace()
428            .next()
429            .and_then(|s| s.parse::<f64>().ok())
430            .map_or(0, |f: f64| f as u64);
431        let uptime_str = format_uptime(uptime_seconds);
432
433        // /proc/loadavg: first three fields are 1/5/15 min load averages
434        let loadavg = read_proc_file("/proc/loadavg").unwrap_or_default();
435        let load_average = {
436            // Extract first three whitespace-separated fields from /proc/loadavg
437            let mut fields = loadavg.split_whitespace();
438            match (fields.next(), fields.next(), fields.next()) {
439                (Some(one), Some(five), Some(fifteen)) => {
440                    format!("{one}, {five}, {fifteen}")
441                }
442                _ => String::from("unknown"),
443            }
444        };
445
446        // Disk: no /proc equivalent; keep df shell-out
447        let disk_total = run_cmd("df -h / | tail -1 | awk '{print $2}'");
448        let disk_free = run_cmd("df -h / | tail -1 | awk '{print $4}'");
449        let disk_pct_str = run_cmd("df / | tail -1 | awk '{print $5}'");
450        let disk_used_percent = disk_pct_str.replace('%', "");
451
452        Self {
453            cpu_model,
454            cpu_count,
455            ram_total,
456            ram_free,
457            ram_available,
458            disk_total,
459            disk_free,
460            disk_used_percent,
461            uptime: uptime_str,
462            uptime_seconds,
463            load_average,
464        }
465    }
466}
467
468// ── HardwareInfo capture — vendor tools (no /proc equivalent) ────────────
469
470impl HardwareInfo {
471    fn capture() -> Self {
472        let mut accelerators = Vec::new();
473
474        // TPU devices via /dev/accel*
475        let tpu_count: usize = run_cmd("ls /dev/accel* 2>/dev/null | wc -l")
476            .parse()
477            .unwrap_or(0);
478        if tpu_count > 0 {
479            accelerators.push(AcceleratorInfo {
480                kind: "tpu".into(),
481                count: tpu_count,
482                vendor: Some("google".into()),
483                model: None,
484            });
485        }
486
487        // NVIDIA GPUs via nvidia-smi
488        let nvidia_gpu_count: usize = run_cmd("nvidia-smi --list-gpus 2>/dev/null | wc -l")
489            .parse()
490            .unwrap_or(0);
491        if nvidia_gpu_count > 0 {
492            let model =
493                run_cmd("nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1");
494            accelerators.push(AcceleratorInfo {
495                kind: "gpu".into(),
496                count: nvidia_gpu_count,
497                vendor: Some("nvidia".into()),
498                model: if model.is_empty() { None } else { Some(model) },
499            });
500        }
501
502        // AMD GPUs via rocm-smi
503        let amd_gpu_count: usize =
504            run_cmd("rocm-smi --showproductname 2>/dev/null | grep -c 'GPU\\['")
505                .parse()
506                .unwrap_or(0);
507        if amd_gpu_count > 0 {
508            accelerators.push(AcceleratorInfo {
509                kind: "gpu".into(),
510                count: amd_gpu_count,
511                vendor: Some("amd".into()),
512                model: None,
513            });
514        }
515
516        // Generic DRM devices (fallback for any GPU)
517        if nvidia_gpu_count == 0 && amd_gpu_count == 0 {
518            let dri_count: usize = run_cmd("ls /dev/dri/render* 2>/dev/null | wc -l")
519                .parse()
520                .unwrap_or(0);
521            if dri_count > 0 {
522                accelerators.push(AcceleratorInfo {
523                    kind: "gpu".into(),
524                    count: dri_count,
525                    vendor: None,
526                    model: Some("drm-render".into()),
527                });
528            }
529        }
530
531        let jax_available =
532            run_cmd("timeout 10 python3 -c 'import jax' 2>/dev/null && echo yes || echo no")
533                == "yes";
534        let jax_version = if jax_available {
535            Some(run_cmd(
536                "timeout 10 python3 -c 'import jax; print(jax.__version__)'",
537            ))
538        } else {
539            None
540        };
541        let jax_device_count = if jax_available {
542            run_cmd("timeout 10 python3 -c 'import jax; print(len(jax.devices()))'")
543                .parse()
544                .ok()
545        } else {
546            None
547        };
548
549        Self {
550            accelerators,
551            jax_available,
552            jax_version,
553            jax_device_count,
554        }
555    }
556}
557
558// ── NetworkInfo capture — /proc for tunnels and ports ────────────────────
559
560impl NetworkInfo {
561    /// Captures network state with opt-in public IP, tunnel detection via
562    /// `/proc/*/comm`, and listening ports from `/proc/net/tcp` + `tcp6`.
563    ///
564    /// Public IP is only queried when `RUNTIMO_ENABLE_PUBLIC_IP=1`. Without it,
565    /// `public_ip` is set to `"unknown"`.
566    ///
567    /// Tunnel detection reads `/proc/[0-9]*/comm` files and checks if any
568    /// contain `"cloudflared"`. The `comm` file holds only the process name
569    /// (max 16 chars), never the command line — this eliminates the self-match
570    /// bug where `pgrep -fa cloudflared` matches its own shell invocation.
571    fn capture() -> Self {
572        let public_ip = if std::env::var("RUNTIMO_ENABLE_PUBLIC_IP").as_deref() == Ok("1") {
573            run_cmd(
574                "curl -s --connect-timeout 5 --max-time 5 ifconfig.me 2>/dev/null || echo 'unknown'",
575            )
576        } else {
577            "unknown".to_string()
578        };
579
580        let (tunnel_running, tunnel_pid) = detect_cloudflared();
581        let listening_ports = read_listening_ports();
582
583        Self {
584            public_ip,
585            tunnel_running,
586            tunnel_pid,
587            listening_ports,
588        }
589    }
590}
591
/// Looks for a running `cloudflared` process by walking `/proc`.
///
/// # How it works
///
/// 1. Lists `/proc` and keeps only entries whose name is made up entirely of
///    ASCII digits — those are the per-process directories.
/// 2. Reads `comm` inside each such directory. That file carries just the
///    process name, not its arguments.
/// 3. The first entry whose trimmed `comm` is exactly `"cloudflared"` and
///    whose directory name parses as `u32` is reported.
///
/// Entries that cannot be read (for example because the process exited
/// mid-scan) are skipped silently.
///
/// # Why `comm`, not `cmdline`
///
/// - `cmdline` may contain secrets such as `--token <value>`; `comm` does not.
/// - A command-line match would also hit the detector itself
///   (`sh -c pgrep -fa cloudflared`), whereas that process's `comm` is `sh`
///   or `pgrep`.
///
/// Returns `(true, Some(pid))` on a match and `(false, None)` otherwise,
/// including when `/proc` itself cannot be listed.
fn detect_cloudflared() -> (bool, Option<u32>) {
    let Ok(proc_entries) = std::fs::read_dir("/proc") else {
        return (false, None);
    };

    let tunnel_pid = proc_entries.flatten().find_map(|entry| {
        let pid_dir = entry.path();
        let pid_text = pid_dir.file_name()?.to_str()?;
        // Non-numeric names (`self`, `net`, `sys`, …) are not processes.
        if !pid_text.chars().all(|c| c.is_ascii_digit()) {
            return None;
        }

        let process_name = std::fs::read_to_string(pid_dir.join("comm")).ok()?;
        if process_name.trim() != "cloudflared" {
            return None;
        }

        pid_text.parse::<u32>().ok()
    });

    (tunnel_pid.is_some(), tunnel_pid)
}
645
646/// Reads listening TCP ports from `/proc/net/tcp` and `/proc/net/tcp6`.
647///
648/// # Format
649///
650/// Each line (after the header) has the format:
651/// ```text
652///   0: 00000000:0016 00000000:0000 0A ...
653/// ```
654///
655/// - Column 2 (`00000000:0016`) is the local address. The part after the
656///   colon (`0016`) is the port number in hexadecimal.
657/// - Column 4 (`0A`) is the socket state in hexadecimal. `0A` = `LISTEN`.
658///
659/// Only entries with state `0A` (LISTEN) are included. Ports are sorted
660/// ascending and deduplicated.
661///
662/// # Why `/proc/net/tcp`, not `ss -ltnp`
663///
664/// - `/proc/net/tcp` is a kernel-provided procfs file — no subprocess,
665///   no command parsing, no fragile positional output logic.
666/// - `ss -ltnp` requires shell-out, parses variable-width columns, and
667///   may produce output that varies across `iproute2` versions.
668/// - The procfs format is stable kernel ABI.
669fn read_listening_ports() -> Vec<u16> {
670    let mut ports = Vec::new();
671
672    for path in &["/proc/net/tcp", "/proc/net/tcp6"] {
673        let data = read_proc_file(path).unwrap_or_default();
674        // Skip header line (starts with "  sl")
675        for line in data.lines().skip(1) {
676            let parts: Vec<&str> = line.split_whitespace().collect();
677            // Minimum columns: sl(0:) + local_address + rem_address + state
678            if parts.len() < 4 {
679                continue;
680            }
681
682            // Column 2 = local_address (e.g. "00000000:0016")
683            // Column 4 = state (e.g. "0A" = LISTEN)
684            // Use .get() for clippy::indexing_slicing compliance
685            if parts.get(3) != Some(&"0A") {
686                continue;
687            }
688
689            // Extract port hex from local_address (portion after ':')
690            if let Some(port_hex) = parts.get(1).and_then(|addr| addr.split(':').nth(1)) {
691                if let Ok(port) = u16::from_str_radix(port_hex, 16) {
692                    ports.push(port);
693                }
694            }
695        }
696    }
697
698    ports.sort_unstable();
699    ports.dedup();
700    ports
701}
702
703// ── Tests ────────────────────────────────────────────────────────────────
704
705#[cfg(test)]
706mod tests {
707    use super::*;
708
709    // ── SystemInfo tests ────────────────────────────────────────────
710
711    #[test]
712    fn test_telemetry_capture() {
713        let telemetry = Telemetry::capture();
714        assert!(telemetry.timestamp > 0, "timestamp must be positive");
715
716        let s = &telemetry.system;
717        assert!(!s.cpu_model.is_empty(), "cpu_model must not be empty");
718        assert!(s.cpu_count > 0, "cpu_count must be > 0");
719        assert!(!s.ram_total.is_empty(), "ram_total must not be empty");
720        assert!(!s.ram_free.is_empty(), "ram_free must not be empty");
721        assert!(
722            !s.ram_available.is_empty(),
723            "ram_available must not be empty"
724        );
725        assert!(!s.disk_total.is_empty(), "disk_total must not be empty");
726        assert!(s.uptime_seconds > 0, "uptime_seconds must be > 0");
727        assert!(!s.load_average.is_empty(), "load_average must not be empty");
728
729        let h = &telemetry.hardware;
730        assert!(
731            h.accelerators.iter().all(|a| !a.kind.is_empty()),
732            "accelerator kind must not be empty"
733        );
734        assert!(
735            h.accelerators.iter().all(|a| a.count > 0),
736            "accelerator count must be > 0"
737        );
738
739        let net = &telemetry.network;
740        assert!(!net.public_ip.is_empty(), "public_ip must not be empty");
741        // Default: public_ip is "unknown" unless RUNTIMO_ENABLE_PUBLIC_IP=1
742        assert_eq!(
743            net.public_ip, "unknown",
744            "public_ip should be 'unknown' by default (opt-in via RUNTIMO_ENABLE_PUBLIC_IP=1)"
745        );
746        // listening_ports is a Vec — can be empty in container/isolated env
747        assert!(
748            net.listening_ports.iter().all(|p| *p > 0),
749            "all listening ports must be > 0"
750        );
751    }
752
753    #[test]
754    fn test_telemetry_cache_works() {
755        let t1 = Telemetry::capture();
756        let t2 = Telemetry::capture();
757        assert_eq!(
758            t1.timestamp, t2.timestamp,
759            "cached telemetry should be identical"
760        );
761    }
762
763    #[test]
764    fn test_system_info_from_proc() {
765        // Verify cpu_count, ram_available, uptime_seconds are populated
766        // from /proc reads (not from shell commands that might fail in
767        // minimal containers).
768        let sys = SystemInfo::capture();
769        assert!(sys.cpu_count > 0, "cpu_count from /proc/cpuinfo");
770        assert!(
771            !sys.ram_available.is_empty(),
772            "ram_available from /proc/meminfo MemAvailable"
773        );
774        assert!(sys.uptime_seconds > 0, "uptime_seconds from /proc/uptime");
775        // uptime string should be non-empty and start with "up"
776        assert!(
777            sys.uptime.starts_with("up "),
778            "uptime string should start with 'up ': got '{}'",
779            sys.uptime
780        );
781        // cpu_model should be non-empty
782        assert!(
783            !sys.cpu_model.is_empty(),
784            "cpu_model from /proc/cpuinfo model name"
785        );
786    }
787
/// Checks the contract of `detect_cloudflared`: a positive result always
/// carries a PID, and that PID's `/proc/<pid>/comm` names `cloudflared`.
///
/// The test is valid whether or not a tunnel is running on the host; when
/// none is running only the final consistency assertion applies.
#[test]
fn test_cloudflared_detection() {
    let (running, pid) = detect_cloudflared();

    if running {
        // A "running" verdict without a PID is a contract violation.
        let found_pid = match pid {
            Some(p) => p,
            None => panic!("tunnel_running implies tunnel_pid"),
        };

        // Cross-check the reported PID against the kernel's process name.
        // The process may exit between detection and this read, so a
        // failed read is tolerated rather than treated as an error.
        let comm_file = format!("/proc/{}/comm", found_pid);
        let comm = std::fs::read_to_string(&comm_file).ok();
        if let Some(raw) = comm {
            let name = raw.trim();
            assert_eq!(
                name, "cloudflared",
                "PID {} comm should be 'cloudflared', got '{}'",
                found_pid, name
            );
        }
    }

    // Holds in both the running and not-running case.
    assert!(!running || pid.is_some());
}
816
/// Validates the shape of `read_listening_ports` output: no duplicates,
/// ascending order, and no port 0.
///
/// The list itself may be empty (e.g. in an isolated container), so the
/// length is deliberately not asserted.
#[test]
fn test_listening_ports() {
    let ports = read_listening_ports();

    // `Vec::dedup` only collapses *adjacent* repeats, so the copy is sorted
    // first. This keeps the duplicate check correct on its own instead of
    // silently relying on the sortedness assertion further down.
    let mut uniq = ports.clone();
    uniq.sort_unstable();
    uniq.dedup();
    assert_eq!(
        ports.len(),
        uniq.len(),
        "listening ports must have no duplicates"
    );

    // Every adjacent pair must be in non-decreasing order.
    let sorted = ports.windows(2).all(|w| w[0] <= w[1]);
    assert!(sorted, "listening ports must be sorted: {:?}", ports);

    // Port 0 is never a valid listening port.
    let all_nonzero = ports.iter().all(|&p| p > 0);
    assert!(all_nonzero, "port 0 is not a valid listening port");
}
843
844    // ── Helper function tests ────────────────────────────────────────
845
/// Table-driven check of `format_mem_kb` across the Ki / Mi / Gi ranges.
#[test]
fn test_format_mem_kb() {
    // (input in KB, expected rendering)
    let cases = [
        (512, "512Ki"),
        (1024, "1Mi"),
        (1536, "1Mi"), // above 1024 KB the value is reported in whole Mi
        (1048576, "1Gi"),
        (2097152, "2Gi"),
        (768000, "750Mi"),
        (0, "0Ki"), // edge case: zero KB
    ];

    for &(kb, expected) in cases.iter() {
        assert_eq!(format_mem_kb(kb), expected);
    }
}
857
/// Checks that `format_uptime` mentions the expected unit at each scale
/// and always renders with the "up " prefix.
#[test]
fn test_format_uptime() {
    // (label used in the failure message, seconds, required substring)
    let cases = [
        ("zero uptime", 0, "minute"),
        ("60s", 60, "1 minute"),
        ("3600s", 3600, "1 hour"),
        ("86400s", 86400, "1 day"),
    ];

    for &(label, secs, needle) in cases.iter() {
        let rendered = format_uptime(secs);
        assert!(rendered.contains(needle), "{}: {}", label, rendered);
    }

    // Regardless of magnitude, the output starts with "up ".
    let arbitrary = format_uptime(12345);
    assert!(
        arbitrary.starts_with("up "),
        "uptime should start with 'up '"
    );
}
886
/// Exercises `parse_meminfo_kb` on a fixed `/proc/meminfo`-style sample,
/// including a key that is absent and a completely empty input.
#[test]
fn test_parse_meminfo_kb() {
    let sample = "MemTotal:       32768000 kB\nMemFree:         8000000 kB\nMemAvailable:   22000000 kB\n";

    // (key, expected value in KB); the last key is absent from the sample
    // and is expected to yield 0.
    let lookups = [
        ("MemTotal:", 32_768_000),
        ("MemFree:", 8_000_000),
        ("MemAvailable:", 22_000_000),
        ("SwapTotal:", 0),
    ];
    for &(key, expected_kb) in lookups.iter() {
        assert_eq!(parse_meminfo_kb(sample, key), expected_kb);
    }

    // Empty input also yields 0.
    assert_eq!(parse_meminfo_kb("", "MemTotal:"), 0);
}
898
899    // ── Backward compatibility tests ─────────────────────────────────
900
/// Builds a `HardwareInfo` with one GPU entry and one TPU entry and checks
/// that per-kind device totals can be derived from the `accelerators` list.
#[test]
fn test_accelerators_back_compat() {
    let gpus = AcceleratorInfo {
        kind: "gpu".into(),
        count: 4,
        vendor: Some("nvidia".into()),
        model: Some("A100".into()),
    };
    let tpus = AcceleratorInfo {
        kind: "tpu".into(),
        count: 8,
        vendor: Some("google".into()),
        model: None,
    };
    let hw = HardwareInfo {
        accelerators: vec![gpus, tpus],
        jax_available: false,
        jax_version: None,
        jax_device_count: None,
    };

    // Sum the `count` of every accelerator entry of the requested kind.
    let total_of = |wanted: &str| -> usize {
        hw.accelerators
            .iter()
            .filter(|a| a.kind == wanted)
            .map(|a| a.count)
            .sum()
    };

    let total_tpu = total_of("tpu");
    let total_gpu = total_of("gpu");

    assert_eq!(total_tpu, 8, "total tpu should be 8");
    assert_eq!(total_gpu, 4, "total gpu should be 4");
}
939
/// A `HardwareInfo` with no accelerators at all can be constructed and
/// reports an empty list.
#[test]
fn test_accelerators_empty_is_valid() {
    let hw = HardwareInfo {
        jax_available: false,
        jax_version: None,
        jax_device_count: None,
        accelerators: Vec::new(),
    };

    assert!(hw.accelerators.is_empty());
}
951
/// Round-trips `HardwareInfo` and `NetworkInfo` through JSON and checks
/// that the fields of interest survive serialization unchanged.
#[test]
fn test_telemetry_serialization_roundtrip() {
    // ── HardwareInfo ──
    {
        let hw = HardwareInfo {
            accelerators: vec![AcceleratorInfo {
                kind: "gpu".into(),
                count: 2,
                vendor: Some("nvidia".into()),
                model: Some("H100".into()),
            }],
            jax_available: true,
            jax_version: Some("0.4.30".into()),
            jax_device_count: Some(2),
        };

        let hw_json = serde_json::to_string(&hw).unwrap();
        let parsed: HardwareInfo = serde_json::from_str(&hw_json).unwrap();

        assert_eq!(parsed.accelerators.len(), 1);
        let first = &parsed.accelerators[0];
        assert_eq!(first.kind, "gpu");
        assert_eq!(first.model.as_deref(), Some("H100"));
    }

    // ── NetworkInfo ──
    {
        let net = NetworkInfo {
            public_ip: "192.0.2.1".into(),
            tunnel_running: false,
            tunnel_pid: None,
            listening_ports: vec![22, 80, 443],
        };

        let net_json = serde_json::to_string(&net).unwrap();
        let parsed: NetworkInfo = serde_json::from_str(&net_json).unwrap();

        assert!(parsed.listening_ports.contains(&22));
        assert!(parsed.listening_ports.contains(&443));
        assert!(!parsed.tunnel_running);
        assert!(parsed.tunnel_pid.is_none());
    }
}
986
/// A `HardwareInfo` JSON payload that has no `accelerators` key (the shape
/// of older WAL events) must still deserialize, yielding an empty list.
#[test]
fn test_telemetry_deserialize_old_wal_event() {
    // Legacy payload: only the JAX fields are present.
    let legacy_payload = r#"{
            "jax_available": true,
            "jax_version": "0.4.25",
            "jax_device_count": 8
        }"#;

    let parsed: HardwareInfo = serde_json::from_str(legacy_payload).unwrap();

    let no_accelerators = parsed.accelerators.is_empty();
    assert!(
        no_accelerators,
        "old WAL events deserialize with empty accelerators"
    );
    assert!(parsed.jax_available);
}
1002
/// `NetworkInfo::listening_ports` must survive a JSON round-trip with its
/// values and order intact.
#[test]
fn test_network_info_listening_ports_roundtrip() {
    let expected_ports = vec![22, 11434, 3389];

    let net = NetworkInfo {
        public_ip: "unknown".into(),
        tunnel_running: false,
        tunnel_pid: None,
        listening_ports: expected_ports.clone(),
    };

    let encoded = serde_json::to_string(&net).unwrap();
    let parsed: NetworkInfo = serde_json::from_str(&encoded).unwrap();

    assert_eq!(parsed.listening_ports, expected_ports);
    assert!(!parsed.tunnel_running);
}
1018}
runtimo_core/telemetry.rs

runtimo_core/
telemetry.rs