runtimo_core/telemetry.rs
1//! System Telemetry — Via Negativa: raw observation, no interpretation.
2//!
3//! Captures a snapshot of the host machine by reading `/proc` and `/sys`
4//! directly. Every field is backed by a raw kernel filesystem read — no
5//! shell-out for data available in `/proc`, no pgrep, no service name
6//! guessing, no version detection.
7//!
8//! # Via Negativa Philosophy
9//!
10//! This module removes everything that is not direct observation:
11//!
12//! - **No pgrep** — tunnel detection reads `/proc/[0-9]*/comm` files
13//! (process names, not command lines). The observer no longer matches
14//! its own shell command as a running `cloudflared` process.
15//! - **No service guessing** — port detection reads `/proc/net/tcp` and
16//! `/proc/net/tcp6` directly, returning raw `Vec<u16>`. Port 22 is
17//! just `22` — the consumer decides it is SSH.
18//! - **No `ss -ltnp` parsing** — eliminated >50 lines of fragile
19//! positional output parsing.
20//! - **No version detection** — no `sshd -V`, `nginx -v`, etc.
21//! - **Raw /proc reads** — cpuinfo, meminfo, uptime, loadavg, net/tcp.
22//! - **Shell-out only where no `/proc` equivalent exists** — `df` for
23//! disk, `curl` for public IP (opt-in), accelerator detection.
24//!
25//! # Example
26//!
27//! ```rust,ignore
28//! use runtimo_core::Telemetry;
29//!
30//! let tel = Telemetry::capture();
31//! tel.print_report();
32//! ```
33//!
34//! # Performance
35//!
36//! Results are cached for 30 seconds via an internal mutex cache to avoid
37//! repeated `/proc` reads on consecutive calls.
38
39use crate::cmd::run_cmd;
40use serde::{Deserialize, Serialize};
41use std::sync::Mutex;
42
/// Process-wide snapshot cache: the most recent [`Telemetry`] paired with the
/// monotonic `Instant` taken when its capture started.
///
/// Read by both [`Telemetry::capture`] and [`Telemetry::capture_lightweight`];
/// reset by [`Telemetry::clear_cache`]. `None` until the first snapshot is stored.
static TELEMETRY_CACHE: Mutex<Option<(Telemetry, std::time::Instant)>> = Mutex::new(None);
/// Maximum age of a cached snapshot, compared against whole elapsed seconds.
const CACHE_TTL_SECS: u64 = 30;
45
/// Full system telemetry snapshot.
///
/// Contains three sub-structures: [`SystemInfo`], [`HardwareInfo`],
/// and [`NetworkInfo`], plus a Unix timestamp. Service detection has been
/// removed in favor of raw listening ports in [`NetworkInfo`].
///
/// A snapshot built by [`Telemetry::capture_lightweight`] (rather than served
/// from a cached full capture) carries placeholder `hardware` and `network`
/// values: no accelerators, no JAX, `"unknown"` public IP, no tunnel and no
/// listening ports.
#[derive(Debug, Clone, Serialize, Deserialize)]
// NOTE(review): presumably allowed because the fields are deliberately public
// and the struct is built with struct-literal syntax — confirm.
#[allow(clippy::exhaustive_structs)]
pub struct Telemetry {
    /// Unix timestamp (seconds) when the snapshot was taken; `0` if the
    /// system clock reads earlier than the Unix epoch.
    pub timestamp: u64,
    /// Basic system information (CPU, RAM, disk, uptime, load).
    pub system: SystemInfo,
    /// Special hardware devices (TPU, GPU, JAX availability).
    pub hardware: HardwareInfo,
    /// Network state (public IP, tunnel status, listening ports).
    pub network: NetworkInfo,
}
63
/// Basic system information — direct `/proc` reads only.
///
/// No shell commands are used for data available in `/proc`. Disk
/// information (`df`) is the only exception because Linux provides
/// no per-mount usage summary in `/proc`.
///
/// Every field degrades to a fallback value instead of failing: unreadable
/// `/proc` files are treated as empty input by the capture code.
#[derive(Debug, Clone, Serialize, Deserialize)]
// NOTE(review): presumably allowed because the fields are deliberately public
// and the struct is built with struct-literal syntax — confirm.
#[allow(clippy::exhaustive_structs)]
pub struct SystemInfo {
    /// CPU model string from `/proc/cpuinfo` `model name` field, or
    /// `"unknown"` when no such line exists.
    pub cpu_model: String,
    /// Logical CPU core count from `/proc/cpuinfo` (counts `processor` entries).
    /// `0` when the file is unreadable or has no such entries.
    pub cpu_count: u32,
    /// Total RAM in human-readable form (e.g. `"32Gi"`) from `/proc/meminfo`
    /// `MemTotal` (kB → human). Whole units only; fractions are truncated.
    pub ram_total: String,
    /// Free RAM in human-readable form (e.g. `"750Mi"`) from `/proc/meminfo`
    /// `MemFree` (kB → human).
    pub ram_free: String,
    /// Available RAM in human-readable form (e.g. `"22Gi"`) from `/proc/meminfo`
    /// `MemAvailable` (kB → human). This is the memory usable for new
    /// allocations without swapping — more useful than `ram_free` for
    /// capacity planning.
    pub ram_available: String,
    /// Total size of the `/` filesystem in human-readable form (e.g. `"100G"`)
    /// as reported by `df`. Empty when `df` yields no output.
    pub disk_total: String,
    /// Free space on the `/` filesystem in human-readable form as reported by
    /// `df`. Empty when `df` yields no output.
    pub disk_free: String,
    /// Disk usage percentage as a string without `%` sign (e.g. `"45"`).
    /// Empty when `df` yields no output.
    pub disk_used_percent: String,
    /// Human-readable uptime (e.g. `"up 6 days, 3 hours"`) computed from
    /// `/proc/uptime`.
    pub uptime: String,
    /// Machine-parseable uptime in seconds from `/proc/uptime` first field
    /// (fractional part truncated). `0` when the value cannot be read.
    pub uptime_seconds: u64,
    /// Load average string (e.g. `"0.50, 0.30, 0.20"`) from `/proc/loadavg`
    /// first three fields, or `"unknown"` when fewer than three are present.
    pub load_average: String,
}
102
/// Special hardware device information.
///
/// Detects accelerators generically — GPUs (nvidia-smi, rocm-smi, /dev/dri),
/// TPUs (/dev/accel*), and JAX availability. Reports what exists, not what
/// was expected. Shell commands are used here because accelerator detection
/// requires vendor-specific tools that have no `/proc` equivalent.
///
/// All fields carry `#[serde(default)]`, so serialized data that lacks them
/// still deserializes (to an empty list, `false` and `None` respectively).
#[derive(Debug, Clone, Serialize, Deserialize)]
// NOTE(review): presumably allowed because the fields are deliberately public
// and the struct is built with struct-literal syntax — confirm.
#[allow(clippy::exhaustive_structs)]
pub struct HardwareInfo {
    /// Detected accelerator devices (any kind). Empty vec = no accelerators found.
    #[serde(default)]
    pub accelerators: Vec<AcceleratorInfo>,
    /// Whether the `jax` Python package is importable.
    #[serde(default)]
    pub jax_available: bool,
    /// JAX version string (e.g. `"0.4.25"`), if available.
    #[serde(default)]
    pub jax_version: Option<String>,
    /// Number of JAX-visible devices, if available; `None` when JAX is absent
    /// or the count could not be parsed.
    #[serde(default)]
    pub jax_device_count: Option<usize>,
}
125
/// A detected hardware accelerator.
///
/// One entry describes all devices of one kind from one detection source;
/// entries are only created for a non-zero device count.
#[derive(Debug, Clone, Serialize, Deserialize)]
// NOTE(review): presumably allowed because the fields are deliberately public
// and the struct is built with struct-literal syntax — confirm.
#[allow(clippy::exhaustive_structs)]
pub struct AcceleratorInfo {
    /// Accelerator kind: "gpu", "tpu", "npu". The detection code in this
    /// module currently emits only "gpu" and "tpu".
    pub kind: String,
    /// Number of devices of this kind detected.
    pub count: usize,
    /// Vendor name if identifiable (e.g. "nvidia", "amd", "google").
    /// `None` for GPUs found only through the generic DRM fallback.
    #[serde(default)]
    pub vendor: Option<String>,
    /// Device model string if available ("drm-render" marks a GPU found
    /// only through the generic DRM fallback).
    #[serde(default)]
    pub model: Option<String>,
}
141
/// Network state information.
///
/// Public IP capture is **opt-in** via `RUNTIMO_ENABLE_PUBLIC_IP=1`.
/// Without this env var, `public_ip` defaults to `"unknown"` to prevent
/// unintended external network metadata leakage.
///
/// Tunnel detection reads `/proc/[0-9]*/comm` files (process names only,
/// not command lines). This eliminates the self-match bug where `pgrep`
/// would match the shell that runs `pgrep` itself.
///
/// Listening ports are read directly from `/proc/net/tcp` and `/proc/net/tcp6`
/// — no `ss` shell-out, no service name guessing.
#[derive(Debug, Clone, Serialize, Deserialize)]
// NOTE(review): presumably allowed because the fields are deliberately public
// and the struct is built with struct-literal syntax — confirm.
#[allow(clippy::exhaustive_structs)]
pub struct NetworkInfo {
    /// Public IP address (from `ifconfig.me` when `RUNTIMO_ENABLE_PUBLIC_IP=1`),
    /// or `"unknown"`.
    pub public_ip: String,
    /// Whether a `cloudflared` tunnel process is running (detected via
    /// `/proc/*/comm` content match, not pgrep).
    pub tunnel_running: bool,
    /// PID of the `cloudflared` process if found, extracted from the
    /// `/proc/<pid>` directory name. When several such processes exist,
    /// this is whichever one the `/proc` directory scan encounters first.
    pub tunnel_pid: Option<u32>,
    /// Raw listening TCP ports from `/proc/net/tcp` and `/proc/net/tcp6`.
    /// Only ports in `LISTEN` (state `0A`) state are included.
    /// Sorted ascending, duplicates removed.
    #[serde(default)]
    pub listening_ports: Vec<u16>,
}
172
173// ── /proc file reading helpers ───────────────────────────────────────────
174
/// Loads a `/proc` file fully into memory as UTF-8 text.
///
/// # Input
///
/// `path` — Absolute path of the file to read (e.g. `"/proc/meminfo"`).
///
/// # Output
///
/// `Ok(String)` — The complete contents. An empty file is a legitimate
/// success (e.g. an empty `tcp6` table inside a container).
/// `Err(io::Error)` — The file is missing, unreadable, not valid UTF-8, or
/// the read itself failed.
///
/// Callers decide how to treat the error; nothing is swallowed here.
fn read_proc_file(path: &str) -> std::io::Result<String> {
    let mut contents = String::new();
    let mut file = std::fs::File::open(path)?;
    std::io::Read::read_to_string(&mut file, &mut contents)?;
    Ok(contents)
}
192
/// Looks up one `/proc/meminfo` entry and returns its value in kB.
///
/// Lines look like `Key: 12345 kB`. The first line that starts with `key`
/// is selected, and its second whitespace-separated token is parsed as
/// `u64`. Later lines are never consulted, even if the first match does not
/// parse.
///
/// Returns `0` when no line starts with `key`, when the selected line has no
/// second token, or when that token is not a valid `u64`.
fn parse_meminfo_kb(data: &str, key: &str) -> u64 {
    for line in data.lines() {
        if !line.starts_with(key) {
            continue;
        }
        // Only the first matching line is ever considered.
        let value_field = line.split_whitespace().nth(1);
        return match value_field.map(str::parse::<u64>) {
            Some(Ok(kb)) => kb,
            _ => 0,
        };
    }
    0
}
207
/// Renders a kilobyte count as a short human-readable string.
///
/// The largest binary unit that fits is chosen — `Gi` from 1024² kB upward,
/// `Mi` from 1024 kB upward, otherwise `Ki` — and the value is divided with
/// integer division, so any fractional part is dropped. There is no unit
/// above `Gi`; larger amounts are expressed as a bigger `Gi` number.
///
/// # Examples
///
/// - `format_mem_kb(512)` → `"512Ki"`
/// - `format_mem_kb(1536)` → `"1Mi"`
/// - `format_mem_kb(768000)` → `"750Mi"`
/// - `format_mem_kb(16777216)` → `"16Gi"`
fn format_mem_kb(kb: u64) -> String {
    const KIB_PER_MIB: u64 = 1 << 10;
    const KIB_PER_GIB: u64 = 1 << 20;

    let (amount, suffix) = match kb {
        k if k >= KIB_PER_GIB => (k / KIB_PER_GIB, "Gi"),
        k if k >= KIB_PER_MIB => (k / KIB_PER_MIB, "Mi"),
        k => (k, "Ki"),
    };
    format!("{}{}", amount, suffix)
}
231
/// Turns a number of seconds into an `uptime -p`-style phrase.
///
/// The duration is split into whole days, hours and minutes (leftover
/// seconds are dropped). Units whose value is zero are left out, except that
/// minutes are always shown when nothing else would be, so the result is
/// never just `"up "`. Each unit is pluralised unless its value is exactly 1.
///
/// # Examples
///
/// - `format_uptime(0)` → `"up 0 minutes"`
/// - `format_uptime(60)` → `"up 1 minute"`
/// - `format_uptime(3661)` → `"up 1 hour, 1 minute"`
/// - `format_uptime(526380)` → `"up 6 days, 2 hours, 13 minutes"`
fn format_uptime(total_seconds: u64) -> String {
    /// Formats `n` followed by `unit`, adding an `s` unless `n` is 1.
    fn counted(n: u64, unit: &str) -> String {
        let plural = if n == 1 { "" } else { "s" };
        format!("{} {}{}", n, unit, plural)
    }

    let whole_days = total_seconds / 86_400;
    let whole_hours = total_seconds % 86_400 / 3_600;
    let whole_minutes = total_seconds % 3_600 / 60;

    // Days and hours appear only when non-zero.
    let mut pieces: Vec<String> = [(whole_days, "day"), (whole_hours, "hour")]
        .iter()
        .filter(|(n, _)| *n > 0)
        .map(|&(n, unit)| counted(n, unit))
        .collect();

    // Minutes double as the fallback so that something is always printed.
    if whole_minutes > 0 || pieces.is_empty() {
        pieces.push(counted(whole_minutes, "minute"));
    }

    format!("up {}", pieces.join(", "))
}
269
270// ── Telemetry capture ────────────────────────────────────────────────────
271
272impl Telemetry {
273 /// Captures a full system telemetry snapshot.
274 ///
275 /// Results are cached for 30 seconds to avoid
276 /// repeated filesystem reads on consecutive calls. Network queries
277 /// (public_ip, tunnel) are included in the cached value.
278 ///
279 /// Use [`capture_lightweight`](Telemetry::capture_lightweight) for
280 /// execution paths that don't need accelerator detection or network
281 /// probing (e.g., the executor's WAL audit trail — which only needs
282 /// `/proc`-based system health data).
283 pub fn capture() -> Self {
284 let now = std::time::Instant::now();
285 {
286 let cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
287 if let Some((cached, instant)) = cache.as_ref() {
288 if now.duration_since(*instant).as_secs() < CACHE_TTL_SECS {
289 return cached.clone();
290 }
291 }
292 }
293
294 let timestamp = std::time::SystemTime::now()
295 .duration_since(std::time::UNIX_EPOCH)
296 .map_or(0, |d| d.as_secs());
297
298 let telemetry = Self {
299 timestamp,
300 system: SystemInfo::capture(),
301 hardware: HardwareInfo::capture(),
302 network: NetworkInfo::capture(),
303 };
304
305 let mut cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
306 *cache = Some((telemetry.clone(), now));
307 telemetry
308 }
309
310 /// Captures a lightweight system telemetry snapshot without shell-outs.
311 ///
312 /// Unlike [`capture`](Telemetry::capture), this method skips all
313 /// accelerator detection (rocm-smi, nvidia-smi, JAX import), tunnel
314 /// probing, and public IP queries. Only [`SystemInfo`] is populated
315 /// from `/proc` reads and a single `df` shell-out.
316 ///
317 /// [`HardwareInfo`] is zeroed (empty accelerators, no JAX), and
318 /// [`NetworkInfo`] returns defaults (`public_ip = "unknown"`,
319 /// `tunnel_running = false`, empty `listening_ports`).
320 ///
321 /// Use this in hot paths like the executor's WAL audit trail where
322 /// GPU/TPU/JAX counts are irrelevant and shell-outs produce unwanted
323 /// stderr noise on systems without those tools installed.
324 ///
325 /// Results share the same internal cache — a previous full
326 /// [`capture`](Telemetry::capture) within 30 seconds will
327 /// be returned AS-IS (including hardware/network data). Callers
328 /// on hot paths should NOT rely on this returning empty hardware
329 /// if a full capture was recently cached.
330 #[must_use]
331 pub fn capture_lightweight() -> Self {
332 // Check cache first — if a full capture exists within TTL, return it.
333 // This is intentional: lightweight callers on hot paths benefit from
334 // cache hits without re-reading /proc.
335 let now = std::time::Instant::now();
336 {
337 let cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
338 if let Some((cached, instant)) = cache.as_ref() {
339 if now.duration_since(*instant).as_secs() < CACHE_TTL_SECS {
340 return cached.clone();
341 }
342 }
343 }
344
345 let timestamp = std::time::SystemTime::now()
346 .duration_since(std::time::UNIX_EPOCH)
347 .map_or(0, |d| d.as_secs());
348
349 let telemetry = Self {
350 timestamp,
351 system: SystemInfo::capture(),
352 hardware: HardwareInfo {
353 accelerators: Vec::new(),
354 jax_available: false,
355 jax_version: None,
356 jax_device_count: None,
357 },
358 network: NetworkInfo {
359 public_ip: "unknown".to_string(),
360 tunnel_running: false,
361 tunnel_pid: None,
362 listening_ports: Vec::new(),
363 },
364 };
365
366 let mut cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
367 *cache = Some((telemetry.clone(), now));
368 telemetry
369 }
370
371 /// Clears the telemetry cache.
372 ///
373 /// Call this in tests or between full/lightweight captures to prevent
374 /// stale cached data from leaking across capture modes.
375 pub fn clear_cache() {
376 let mut cache = TELEMETRY_CACHE.lock().unwrap_or_else(|e| e.into_inner());
377 *cache = None;
378 }
379
380 /// Prints telemetry in a human-readable report to stdout.
381 ///
382 /// Output includes CPU cores, RAM available, machine-parseable uptime
383 /// seconds, contextualized load average (with core count), raw listening
384 /// ports, and tunnel PID.
385 pub fn print_report(&self) {
386 println!("\n{}", "=".repeat(60));
387 println!(" RUNTIMO TELEMETRY [{}]", self.timestamp);
388 println!("{}", "=".repeat(60));
389
390 println!("\n--- SYSTEM ---");
391 println!(
392 " CPU : {} ({} cores)",
393 self.system.cpu_model, self.system.cpu_count
394 );
395 println!(
396 " RAM : {} total, {} free, {} available",
397 self.system.ram_total, self.system.ram_free, self.system.ram_available
398 );
399 println!(
400 " Disk : {} total, {} free ({}% used)",
401 self.system.disk_total, self.system.disk_free, self.system.disk_used_percent
402 );
403 // Machine-parseable uptime: "up 6 days (526380s)"
404 println!(
405 " Uptime: {} ({}s)",
406 self.system.uptime, self.system.uptime_seconds
407 );
408 // Contextualized load: "3.19, 4.93, 7.68 (4 cores)"
409 println!(
410 " Load : {} ({} cores)",
411 self.system.load_average, self.system.cpu_count
412 );
413
414 println!("\n--- HARDWARE ---");
415 if self.hardware.accelerators.is_empty() {
416 println!(" Accelerators: none detected");
417 } else {
418 for acc in &self.hardware.accelerators {
419 println!(
420 " {}: {}x {}{}",
421 acc.kind,
422 acc.count,
423 acc.model.as_deref().unwrap_or("unknown"),
424 acc.vendor
425 .as_ref()
426 .map(|v| format!(" ({})", v))
427 .unwrap_or_default()
428 );
429 }
430 }
431 if self.hardware.jax_available {
432 println!(
433 " JAX: v{} ({} devices)",
434 self.hardware
435 .jax_version
436 .clone()
437 .unwrap_or_else(|| "unknown".into()),
438 self.hardware.jax_device_count.unwrap_or(0)
439 );
440 }
441
442 println!("\n--- NETWORK ---");
443 println!(" Public IP: {}", self.network.public_ip);
444 // Tunnel with PID: "cloudflared (PID 1234)" or "none"
445 if self.network.tunnel_running {
446 println!(
447 " Tunnel: cloudflared (PID {})",
448 self.network
449 .tunnel_pid
450 .map_or_else(|| "?".to_string(), |p| p.to_string())
451 );
452 } else {
453 println!(" Tunnel: none");
454 }
455 if self.network.listening_ports.is_empty() {
456 println!(" Listening ports: none");
457 } else {
458 let ports_str = self
459 .network
460 .listening_ports
461 .iter()
462 .map(|p| p.to_string())
463 .collect::<Vec<_>>()
464 .join(", ");
465 println!(" Listening ports: {}", ports_str);
466 }
467
468 println!("\n{}", "=".repeat(60));
469 }
470}
471
472// ── SystemInfo capture — direct /proc reads ──────────────────────────────
473
474impl SystemInfo {
475 /// Captures system information from `/proc` and `/sys` files with a single
476 /// `df` shell-out for disk usage. No accelerator or network probing.
477 ///
478 /// Reads `/proc/cpuinfo` (model, count), `/proc/meminfo` (MemTotal,
479 /// MemFree, MemAvailable), `/proc/uptime`, and `/proc/loadavg`.
480 /// Disk info comes from `df` because Linux provides no per-mount usage
481 /// summary in procfs.
482 #[must_use]
483 pub(crate) fn capture() -> Self {
484 // /proc/cpuinfo: extract model name and count logical processors
485 let cpuinfo = read_proc_file("/proc/cpuinfo").unwrap_or_default();
486 let cpu_model = cpuinfo
487 .lines()
488 .find(|l| l.starts_with("model name"))
489 .and_then(|l| l.split(':').nth(1))
490 .map_or_else(|| "unknown".to_string(), |s| s.trim().to_string());
491 // Count lines beginning with "processor" — each is a logical core
492 let cpu_count: u32 = cpuinfo
493 .lines()
494 .filter(|l| l.starts_with("processor"))
495 .count()
496 .try_into()
497 .unwrap_or(0);
498
499 // /proc/meminfo: MemTotal, MemFree, MemAvailable (all in kB)
500 let meminfo = read_proc_file("/proc/meminfo").unwrap_or_default();
501 let ram_total = format_mem_kb(parse_meminfo_kb(&meminfo, "MemTotal:"));
502 let ram_free = format_mem_kb(parse_meminfo_kb(&meminfo, "MemFree:"));
503 let ram_available = format_mem_kb(parse_meminfo_kb(&meminfo, "MemAvailable:"));
504
505 // /proc/uptime: first field is uptime in seconds (fractional).
506 // The value is always non-negative; cast truncation is safe.
507 let uptime = read_proc_file("/proc/uptime").unwrap_or_default();
508 #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
509 let uptime_seconds: u64 = uptime
510 .split_whitespace()
511 .next()
512 .and_then(|s| s.parse::<f64>().ok())
513 .map_or(0, |f: f64| f as u64);
514 let uptime_str = format_uptime(uptime_seconds);
515
516 // /proc/loadavg: first three fields are 1/5/15 min load averages
517 let loadavg = read_proc_file("/proc/loadavg").unwrap_or_default();
518 let load_average = {
519 // Extract first three whitespace-separated fields from /proc/loadavg
520 let mut fields = loadavg.split_whitespace();
521 match (fields.next(), fields.next(), fields.next()) {
522 (Some(one), Some(five), Some(fifteen)) => {
523 format!("{one}, {five}, {fifteen}")
524 }
525 _ => String::from("unknown"),
526 }
527 };
528
529 // Disk: no /proc equivalent; keep df shell-out
530 let disk_total = run_cmd("df -h / | tail -1 | awk '{print $2}'").unwrap_or_default();
531 let disk_free = run_cmd("df -h / | tail -1 | awk '{print $4}'").unwrap_or_default();
532 let disk_pct_str = run_cmd("df / | tail -1 | awk '{print $5}'").unwrap_or_default();
533 let disk_used_percent = disk_pct_str.replace('%', "");
534
535 Self {
536 cpu_model,
537 cpu_count,
538 ram_total,
539 ram_free,
540 ram_available,
541 disk_total,
542 disk_free,
543 disk_used_percent,
544 uptime: uptime_str,
545 uptime_seconds,
546 load_average,
547 }
548 }
549}
550
551// ── HardwareInfo capture — vendor tools (no /proc equivalent) ────────────
552
553impl HardwareInfo {
554 fn capture() -> Self {
555 let mut accelerators = Vec::new();
556
557 // TPU devices via /dev/accel*
558 let tpu_count: usize = run_cmd("ls /dev/accel* 2>/dev/null | wc -l")
559 .unwrap_or_default()
560 .parse()
561 .unwrap_or(0);
562 if tpu_count > 0 {
563 accelerators.push(AcceleratorInfo {
564 kind: "tpu".into(),
565 count: tpu_count,
566 vendor: Some("google".into()),
567 model: None,
568 });
569 }
570
571 // NVIDIA GPUs via nvidia-smi
572 let nvidia_gpu_count: usize = run_cmd("nvidia-smi --list-gpus 2>/dev/null | wc -l")
573 .unwrap_or_default()
574 .parse()
575 .unwrap_or(0);
576 if nvidia_gpu_count > 0 {
577 let model =
578 run_cmd("nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1")
579 .unwrap_or_default();
580 accelerators.push(AcceleratorInfo {
581 kind: "gpu".into(),
582 count: nvidia_gpu_count,
583 vendor: Some("nvidia".into()),
584 model: if model.is_empty() { None } else { Some(model) },
585 });
586 }
587
588 // AMD GPUs via rocm-smi
589 let amd_gpu_count: usize =
590 run_cmd("rocm-smi --showproductname 2>/dev/null | grep -c 'GPU\\['")
591 .unwrap_or_default()
592 .parse()
593 .unwrap_or(0);
594 if amd_gpu_count > 0 {
595 accelerators.push(AcceleratorInfo {
596 kind: "gpu".into(),
597 count: amd_gpu_count,
598 vendor: Some("amd".into()),
599 model: None,
600 });
601 }
602
603 // Generic DRM devices (fallback for any GPU)
604 if nvidia_gpu_count == 0 && amd_gpu_count == 0 {
605 let dri_count: usize = run_cmd("ls /dev/dri/render* 2>/dev/null | wc -l")
606 .unwrap_or_default()
607 .parse()
608 .unwrap_or(0);
609 if dri_count > 0 {
610 accelerators.push(AcceleratorInfo {
611 kind: "gpu".into(),
612 count: dri_count,
613 vendor: None,
614 model: Some("drm-render".into()),
615 });
616 }
617 }
618
619 let jax_available =
620 run_cmd("timeout 10 python3 -c 'import jax' 2>/dev/null && echo yes || echo no")
621 .unwrap_or_default()
622 == "yes";
623 let jax_version = if jax_available {
624 Some(
625 run_cmd("timeout 10 python3 -c 'import jax; print(jax.__version__)'")
626 .unwrap_or_default(),
627 )
628 } else {
629 None
630 };
631 let jax_device_count = if jax_available {
632 run_cmd("timeout 10 python3 -c 'import jax; print(len(jax.devices()))'")
633 .unwrap_or_default()
634 .parse()
635 .ok()
636 } else {
637 None
638 };
639
640 Self {
641 accelerators,
642 jax_available,
643 jax_version,
644 jax_device_count,
645 }
646 }
647}
648
649// ── NetworkInfo capture — /proc for tunnels and ports ────────────────────
650
651impl NetworkInfo {
652 /// Captures network state with opt-in public IP, tunnel detection via
653 /// `/proc/*/comm`, and listening ports from `/proc/net/tcp` + `tcp6`.
654 ///
655 /// Public IP is only queried when `RUNTIMO_ENABLE_PUBLIC_IP=1`. Without it,
656 /// `public_ip` is set to `"unknown"`.
657 ///
658 /// Tunnel detection reads `/proc/[0-9]*/comm` files and checks if any
659 /// contain `"cloudflared"`. The `comm` file holds only the process name
660 /// (max 16 chars), never the command line — this eliminates the self-match
661 /// bug where `pgrep -fa cloudflared` matches its own shell invocation.
662 fn capture() -> Self {
663 let public_ip = if std::env::var("RUNTIMO_ENABLE_PUBLIC_IP").as_deref() == Ok("1") {
664 run_cmd(
665 "curl -s --connect-timeout 5 --max-time 5 ifconfig.me 2>/dev/null || echo 'unknown'",
666 )
667 .unwrap_or_else(|_| "unknown".to_string())
668 } else {
669 "unknown".to_string()
670 };
671
672 let (tunnel_running, tunnel_pid) = detect_cloudflared();
673 let listening_ports = read_listening_ports();
674
675 Self {
676 public_ip,
677 tunnel_running,
678 tunnel_pid,
679 listening_ports,
680 }
681 }
682}
683
/// Looks for a running `cloudflared` process by scanning `/proc`.
///
/// # How it works
///
/// 1. Lists `/proc` and keeps the entries whose names are made up solely of
///    ASCII digits — the per-process (PID) directories.
/// 2. Reads each directory's `comm` file, which holds just the process name
///    and never the command line or its arguments.
/// 3. Stops at the first entry whose trimmed `comm` is exactly
///    `"cloudflared"` and whose directory name parses as a `u32` PID.
///
/// Entries that cannot be read (e.g. a process that exited mid-scan) are
/// skipped silently.
///
/// # Why `comm`, not `cmdline`
///
/// `/proc/[pid]/cmdline` holds the full, null-delimited command line,
/// including arguments such as `--token <value>`. Reading `comm` instead
/// avoids:
/// - touching potentially sensitive command-line tokens;
/// - the self-match bug: `sh -c pgrep -fa cloudflared` has `cloudflared` in
///   its command line, but its `comm` is `sh` or `pgrep`.
///
/// Returns `(true, Some(pid))` if found, `(false, None)` otherwise —
/// including when `/proc` itself cannot be listed.
fn detect_cloudflared() -> (bool, Option<u32>) {
    let Ok(entries) = std::fs::read_dir("/proc") else {
        return (false, None);
    };

    let tunnel_pid = entries.flatten().find_map(|entry| {
        let proc_dir = entry.path();
        let dir_name = proc_dir.file_name()?.to_str()?;
        // PID directories are the ones named with digits only.
        if !dir_name.bytes().all(|b| b.is_ascii_digit()) {
            return None;
        }

        let process_name = std::fs::read_to_string(proc_dir.join("comm")).ok()?;
        if process_name.trim() != "cloudflared" {
            return None;
        }
        dir_name.parse::<u32>().ok()
    });

    (tunnel_pid.is_some(), tunnel_pid)
}
737
/// Collects the TCP ports in `LISTEN` state from `/proc/net/tcp` and
/// `/proc/net/tcp6`.
///
/// # Format
///
/// After a one-line header, every row looks like:
/// ```text
/// 0: 00000000:0016 00000000:0000 0A ...
/// ```
///
/// - The second column (`00000000:0016`) is the local address; the text
///   after the colon (`0016`) is the port in hexadecimal.
/// - The fourth column (`0A`) is the socket state in hexadecimal;
///   `0A` means `LISTEN`.
///
/// Rows with fewer than four columns, a different state, or an unparsable
/// port are skipped. A table that cannot be read contributes no ports. The
/// result is sorted ascending with duplicates removed.
///
/// # Why `/proc/net/tcp`, not `ss -ltnp`
///
/// - It is a kernel-provided procfs file: no subprocess and no parsing of
///   variable-width command output.
/// - `ss` output may vary across `iproute2` versions.
fn read_listening_ports() -> Vec<u16> {
    const LISTEN_STATE: &str = "0A";

    let mut ports: Vec<u16> = Vec::new();
    for table in ["/proc/net/tcp", "/proc/net/tcp6"].iter() {
        let contents = std::fs::read_to_string(table).unwrap_or_default();
        // The first line is the column header.
        let listening = contents.lines().skip(1).filter_map(|row| {
            let mut columns = row.split_whitespace();
            let local_address = columns.nth(1)?;
            let _remote_address = columns.next()?;
            let state = columns.next()?;
            if state != LISTEN_STATE {
                return None;
            }
            let port_hex = local_address.split(':').nth(1)?;
            u16::from_str_radix(port_hex, 16).ok()
        });
        ports.extend(listening);
    }

    ports.sort_unstable();
    ports.dedup();
    ports
}
794
795// ── Tests ────────────────────────────────────────────────────────────────
796
797#[cfg(test)]
798mod tests {
799 use super::*;
800
801 // ── SystemInfo tests ────────────────────────────────────────────
802
803 #[test]
804 fn test_telemetry_capture() {
805 let telemetry = Telemetry::capture();
806 assert!(telemetry.timestamp > 0, "timestamp must be positive");
807
808 let s = &telemetry.system;
809 assert!(!s.cpu_model.is_empty(), "cpu_model must not be empty");
810 assert!(s.cpu_count > 0, "cpu_count must be > 0");
811 assert!(!s.ram_total.is_empty(), "ram_total must not be empty");
812 assert!(!s.ram_free.is_empty(), "ram_free must not be empty");
813 assert!(
814 !s.ram_available.is_empty(),
815 "ram_available must not be empty"
816 );
817 assert!(!s.disk_total.is_empty(), "disk_total must not be empty");
818 assert!(s.uptime_seconds > 0, "uptime_seconds must be > 0");
819 assert!(!s.load_average.is_empty(), "load_average must not be empty");
820
821 let h = &telemetry.hardware;
822 assert!(
823 h.accelerators.iter().all(|a| !a.kind.is_empty()),
824 "accelerator kind must not be empty"
825 );
826 assert!(
827 h.accelerators.iter().all(|a| a.count > 0),
828 "accelerator count must be > 0"
829 );
830
831 let net = &telemetry.network;
832 assert!(!net.public_ip.is_empty(), "public_ip must not be empty");
833 // Default: public_ip is "unknown" unless RUNTIMO_ENABLE_PUBLIC_IP=1
834 assert_eq!(
835 net.public_ip, "unknown",
836 "public_ip should be 'unknown' by default (opt-in via RUNTIMO_ENABLE_PUBLIC_IP=1)"
837 );
838 // listening_ports is a Vec — can be empty in container/isolated env
839 assert!(
840 net.listening_ports.iter().all(|p| *p > 0),
841 "all listening ports must be > 0"
842 );
843 }
844
845 #[test]
846 fn test_telemetry_cache_works() {
847 let t1 = Telemetry::capture();
848 let t2 = Telemetry::capture();
849 assert_eq!(
850 t1.timestamp, t2.timestamp,
851 "cached telemetry should be identical"
852 );
853 }
854
855 #[test]
856 fn test_system_info_from_proc() {
857 // Verify cpu_count, ram_available, uptime_seconds are populated
858 // from /proc reads (not from shell commands that might fail in
859 // minimal containers).
860 let sys = SystemInfo::capture();
861 assert!(sys.cpu_count > 0, "cpu_count from /proc/cpuinfo");
862 assert!(
863 !sys.ram_available.is_empty(),
864 "ram_available from /proc/meminfo MemAvailable"
865 );
866 assert!(sys.uptime_seconds > 0, "uptime_seconds from /proc/uptime");
867 // uptime string should be non-empty and start with "up"
868 assert!(
869 sys.uptime.starts_with("up "),
870 "uptime string should start with 'up ': got '{}'",
871 sys.uptime
872 );
873 // cpu_model should be non-empty
874 assert!(
875 !sys.cpu_model.is_empty(),
876 "cpu_model from /proc/cpuinfo model name"
877 );
878 }
879
880 #[test]
881 fn test_cloudflared_detection() {
882 // The cloudflared detection must NOT self-match.
883 // This test verifies that detecting cloudflared doesn't find
884 // the shell that is running the detection command (because it reads
885 // /proc/*/comm, not pgrep).
886 let (running, pid) = detect_cloudflared();
887
888 // If cloudflared is actually running on this machine, it should be found.
889 // But it should NEVER report pid of the detection process itself.
890 if running {
891 assert!(pid.is_some(), "tunnel_running implies tunnel_pid");
892 let found_pid = pid.unwrap();
893 // Verify the PID actually belongs to a cloudflared process
894 let comm_path = format!("/proc/{}/comm", found_pid);
895 if let Ok(content) = std::fs::read_to_string(&comm_path) {
896 assert_eq!(
897 content.trim(),
898 "cloudflared",
899 "PID {} comm should be 'cloudflared', got '{}'",
900 found_pid,
901 content.trim()
902 );
903 }
904 }
905 // Even if not running, the function must return cleanly
906 assert!(!running || pid.is_some());
907 }
908
#[test]
fn test_listening_ports() {
    let ports = read_listening_ports();

    // Uniqueness: collapsing adjacent repeats must not shrink the list.
    let deduped = {
        let mut copy = ports.clone();
        copy.dedup();
        copy
    };
    assert_eq!(
        ports.len(),
        deduped.len(),
        "listening ports must have no duplicates"
    );

    // Ordering: every neighbouring pair must be non-decreasing.
    assert!(
        ports.windows(2).all(|pair| pair[0] <= pair[1]),
        "listening ports must be sorted: {:?}",
        ports
    );

    // Validity: port 0 can never be a real listening port.
    assert!(
        ports.iter().all(|&port| port > 0),
        "port 0 is not a valid listening port"
    );

    // Deliberately no assertion on length: an empty list is a legitimate
    // result, e.g. inside an isolated container.
}
935
936 // ── Helper function tests ────────────────────────────────────────
937
#[test]
fn test_format_mem_kb() {
    // Each entry pairs an input value with the exact string expected back.
    let cases = [
        (512, "512Ki"),
        (1024, "1Mi"),
        (1536, "1Mi"), // >1024 snaps to Mi
        (1048576, "1Gi"),
        (2097152, "2Gi"),
        (768000, "750Mi"), // ~750Mi
        (0, "0Ki"),        // Edge: 0 KB
    ];

    for &(kb, expected) in cases.iter() {
        assert_eq!(format_mem_kb(kb), expected);
    }
}
949
#[test]
fn test_format_uptime() {
    // (seconds, substring the rendering must contain, label for the failure message)
    let cases = [
        (0, "minute", "zero uptime"),
        (60, "1 minute", "60s"),
        (3600, "1 hour", "3600s"),
        (86400, "1 day", "86400s"),
    ];

    for &(secs, needle, label) in cases.iter() {
        let rendered = format_uptime(secs);
        assert!(rendered.contains(needle), "{}: {}", label, rendered);
    }

    // Independently of the unit chosen, the rendering starts with "up ".
    let arbitrary = format_uptime(12345);
    assert!(
        arbitrary.starts_with("up "),
        "uptime should start with 'up '"
    );
}
978
#[test]
fn test_parse_meminfo_kb() {
    let sample = "MemTotal: 32768000 kB\nMemFree: 8000000 kB\nMemAvailable: 22000000 kB\n";

    // Key → value expected from `sample`; the last key is absent from the
    // sample and must come back as 0.
    let expectations = [
        ("MemTotal:", 32_768_000),
        ("MemFree:", 8_000_000),
        ("MemAvailable:", 22_000_000),
        ("SwapTotal:", 0),
    ];
    for &(key, expected) in expectations.iter() {
        assert_eq!(parse_meminfo_kb(sample, key), expected);
    }

    // Empty input also yields 0.
    assert_eq!(parse_meminfo_kb("", "MemTotal:"), 0);
}
990
991 // ── Backward compatibility tests ─────────────────────────────────
992
#[test]
fn test_accelerators_back_compat() {
    // Two accelerator entries of different kinds in one HardwareInfo.
    let gpu_entry = AcceleratorInfo {
        kind: "gpu".into(),
        count: 4,
        vendor: Some("nvidia".into()),
        model: Some("A100".into()),
    };
    let tpu_entry = AcceleratorInfo {
        kind: "tpu".into(),
        count: 8,
        vendor: Some("google".into()),
        model: None,
    };
    let hw = HardwareInfo {
        accelerators: vec![gpu_entry, tpu_entry],
        jax_available: false,
        jax_version: None,
        jax_device_count: None,
    };

    // Per-kind totals are derived by filtering the list and summing `count`.
    let total_for = |kind: &str| -> usize {
        hw.accelerators
            .iter()
            .filter(|acc| acc.kind == kind)
            .map(|acc| acc.count)
            .sum()
    };
    let total_tpu = total_for("tpu");
    let total_gpu = total_for("gpu");

    assert_eq!(total_tpu, 8, "total tpu should be 8");
    assert_eq!(total_gpu, 4, "total gpu should be 4");
}
1031
#[test]
fn test_accelerators_empty_is_valid() {
    // A HardwareInfo with no accelerator entries can be constructed, and its
    // list reports itself as empty.
    let bare = HardwareInfo {
        jax_device_count: None,
        jax_version: None,
        jax_available: false,
        accelerators: Vec::new(),
    };

    assert!(bare.accelerators.is_empty());
}
1043
#[test]
fn test_telemetry_serialization_roundtrip() {
    // ── HardwareInfo: struct → JSON → struct ──
    let hw_json = serde_json::to_string(&HardwareInfo {
        accelerators: vec![AcceleratorInfo {
            kind: "gpu".into(),
            count: 2,
            vendor: Some("nvidia".into()),
            model: Some("H100".into()),
        }],
        jax_available: true,
        jax_version: Some("0.4.30".into()),
        jax_device_count: Some(2),
    })
    .unwrap();
    let hw_back: HardwareInfo = serde_json::from_str(&hw_json).unwrap();

    assert_eq!(hw_back.accelerators.len(), 1);
    let first = &hw_back.accelerators[0];
    assert_eq!(first.kind, "gpu");
    assert_eq!(first.model.as_deref(), Some("H100"));

    // ── NetworkInfo: struct → JSON → struct ──
    let net_json = serde_json::to_string(&NetworkInfo {
        public_ip: "192.0.2.1".into(),
        tunnel_running: false,
        tunnel_pid: None,
        listening_ports: vec![22, 80, 443],
    })
    .unwrap();
    let net_back: NetworkInfo = serde_json::from_str(&net_json).unwrap();

    assert!(net_back.listening_ports.contains(&22));
    assert!(net_back.listening_ports.contains(&443));
    assert!(!net_back.tunnel_running);
    assert!(net_back.tunnel_pid.is_none());
}
1078
#[test]
fn test_telemetry_deserialize_old_wal_event() {
    // Payload with no `accelerators` key at all; per the assertion message
    // below, this is the shape of old WAL events.
    let legacy_payload = r#"{
"jax_available": true,
"jax_version": "0.4.25",
"jax_device_count": 8
}"#;

    let hw: HardwareInfo = serde_json::from_str(legacy_payload).unwrap();

    // The missing list must come back empty instead of failing the parse.
    assert!(
        hw.accelerators.is_empty(),
        "old WAL events deserialize with empty accelerators"
    );
    assert!(hw.jax_available);
}
1094
#[test]
fn test_network_info_listening_ports_roundtrip() {
    // The port list must survive a JSON round trip with values and order intact.
    let expected_ports = vec![22, 11434, 3389];
    let original = NetworkInfo {
        public_ip: "unknown".into(),
        tunnel_running: false,
        tunnel_pid: None,
        listening_ports: expected_ports.clone(),
    };

    let encoded = serde_json::to_string(&original).unwrap();
    let restored: NetworkInfo = serde_json::from_str(&encoded).unwrap();

    assert_eq!(restored.listening_ports, expected_ports);
    assert!(!restored.tunnel_running);
}
1110}