pub mod cpu;
pub mod disk;
pub mod gpu;
pub mod memory;
pub mod network;
use crate::engines::EngineSnapshot;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tokio::sync::broadcast;
/// One complete metrics sample.
///
/// `metrics_collector` builds one of these per poll tick, serializes it to JSON
/// and sends the resulting string on its broadcast channel.
#[derive(Clone, serde::Serialize, Debug)]
pub struct MetricsSnapshot {
/// Sample time in milliseconds since the Unix epoch, as set by `metrics_collector`.
pub timestamp_ms: u64,
/// GPU readings produced by `gpu::collect_gpu_metrics`.
pub gpu: GpuMetrics,
/// CPU readings produced by `cpu::collect_cpu_metrics`.
pub cpu: CpuMetrics,
/// Memory readings produced by `memory::collect_memory_metrics`.
pub memory: MemoryMetrics,
/// Disk readings produced by `disk::collect_disk_metrics`.
pub disk: DiskMetrics,
/// Network readings produced by `network::collect_network_metrics`.
pub network: NetworkMetrics,
/// Clone of the shared engine state taken when the sample was assembled.
pub engines: Vec<EngineSnapshot>,
/// Events returned by `gpu::detect_gpu_events` for this sample.
pub gpu_events: Vec<gpu::GpuEvent>,
}
/// Samples host and GPU metrics on a fixed interval and broadcasts each sample as a
/// JSON string (Linux build; GPU readings come from NVML).
///
/// # Parameters
/// - `tx`: broadcast channel that receives every serialized [`MetricsSnapshot`].
/// - `poll_interval_ms`: sampling period in milliseconds. `0` is clamped to `1`
///   because `tokio::time::interval` panics on a zero period.
/// - `gpu_index`: NVML index of the GPU to monitor. If NVML is unavailable, the index
///   is out of range, or the device cannot be opened, a warning is logged and the
///   GPU collectors are handed `None`.
/// - `engine_state`: shared engine list; a clone of it is embedded in every snapshot.
///
/// This future never completes: the sampling loop has no exit. Send errors are
/// ignored; serialization errors are logged and that sample is dropped.
#[cfg(target_os = "linux")]
pub async fn metrics_collector(
    tx: broadcast::Sender<String>,
    poll_interval_ms: u64,
    gpu_index: u32,
    engine_state: std::sync::Arc<tokio::sync::RwLock<Vec<EngineSnapshot>>>,
) {
    // A zero period would panic inside `tokio::time::interval` and take the whole
    // collector task down, so clamp it to the smallest valid period instead.
    if poll_interval_ms == 0 {
        tracing::warn!("Poll interval of 0 ms is invalid; using 1 ms instead");
    }
    let mut interval = tokio::time::interval(Duration::from_millis(poll_interval_ms.max(1)));
    // After a stall, skip the missed ticks rather than firing them back-to-back
    // (the default `Burst` behavior), which would emit a run of near-duplicate samples.
    interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

    let mut sys = sysinfo::System::new();
    let mut networks = sysinfo::Networks::new_with_refreshed_list();
    let mut disks = sysinfo::Disks::new_with_refreshed_list();

    // NVML is optional: when it cannot be initialized the collector still runs and
    // the GPU collectors receive `None`.
    let nvml = nvml_wrapper::Nvml::init().ok();
    let device = match nvml.as_ref() {
        Some(n) => {
            let count = n.device_count().unwrap_or(0);
            tracing::info!("NVML initialized: {} GPU(s) available", count);
            if gpu_index >= count {
                tracing::warn!(
                    "--gpu-index {} is out of range (found {} GPU(s)); GPU metrics disabled",
                    gpu_index,
                    count
                );
                None
            } else {
                match n.device_by_index(gpu_index) {
                    Ok(d) => Some(d),
                    Err(e) => {
                        tracing::warn!(
                            "Failed to open GPU at index {}: {} — GPU metrics disabled",
                            gpu_index,
                            e
                        );
                        None
                    }
                }
            }
        }
        None => {
            tracing::warn!("NVML not available -- GPU metrics will be empty");
            None
        }
    };

    // Initial CPU refresh before the loop, so the first in-loop refresh has an
    // earlier measurement to compare against.
    sys.refresh_cpu_usage();

    loop {
        interval.tick().await;

        sys.refresh_cpu_usage();
        sys.refresh_memory();
        networks.refresh(true);
        disks.refresh(true);

        // A clock set before the Unix epoch yields 0 rather than an error. The
        // `as u64` cast cannot truncate for any realistic wall-clock time.
        let timestamp_ms = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_millis() as u64;

        // The read guard is a temporary, so the lock is released as soon as the
        // clone has been taken.
        let engines = engine_state.read().await.clone();
        let gpu_events = gpu::detect_gpu_events(&device, timestamp_ms);

        let snapshot = MetricsSnapshot {
            timestamp_ms,
            gpu: gpu::collect_gpu_metrics(&device),
            cpu: cpu::collect_cpu_metrics(&sys),
            // NOTE(review): memory metrics are built from the NVML device here, while
            // the non-Linux build passes `&sys`; confirm `collect_memory_metrics` does
            // not also need `sys` on Linux (it is refreshed above via `refresh_memory`).
            memory: memory::collect_memory_metrics(&device),
            disk: disk::collect_disk_metrics(&disks),
            network: network::collect_network_metrics(&networks),
            engines,
            gpu_events,
        };

        match serde_json::to_string(&snapshot) {
            Ok(json) => {
                // Best-effort delivery: a send error is deliberately ignored so the
                // collector keeps running.
                let _ = tx.send(json);
            }
            Err(e) => {
                tracing::error!("Failed to serialize metrics: {}", e);
            }
        }
    }
}
/// Samples host metrics on a fixed interval and broadcasts each sample as a JSON
/// string (non-Linux build; the GPU collectors are called without a device).
///
/// # Parameters
/// - `tx`: broadcast channel that receives every serialized [`MetricsSnapshot`].
/// - `poll_interval_ms`: sampling period in milliseconds. `0` is clamped to `1`
///   because `tokio::time::interval` panics on a zero period.
/// - `_gpu_index`: unused on this platform; kept so both builds share one signature.
/// - `engine_state`: shared engine list; a clone of it is embedded in every snapshot.
///
/// This future never completes: the sampling loop has no exit. Send errors are
/// ignored; serialization errors are logged and that sample is dropped.
#[cfg(not(target_os = "linux"))]
pub async fn metrics_collector(
    tx: broadcast::Sender<String>,
    poll_interval_ms: u64,
    _gpu_index: u32,
    engine_state: std::sync::Arc<tokio::sync::RwLock<Vec<EngineSnapshot>>>,
) {
    // A zero period would panic inside `tokio::time::interval` and take the whole
    // collector task down, so clamp it to the smallest valid period instead.
    if poll_interval_ms == 0 {
        tracing::warn!("Poll interval of 0 ms is invalid; using 1 ms instead");
    }
    let mut interval = tokio::time::interval(Duration::from_millis(poll_interval_ms.max(1)));
    // After a stall, skip the missed ticks rather than firing them back-to-back
    // (the default `Burst` behavior), which would emit a run of near-duplicate samples.
    interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

    let mut sys = sysinfo::System::new();
    let mut networks = sysinfo::Networks::new_with_refreshed_list();
    let mut disks = sysinfo::Disks::new_with_refreshed_list();

    tracing::warn!("Running on non-Linux platform -- GPU metrics will be stubs");

    // Initial CPU refresh before the loop, so the first in-loop refresh has an
    // earlier measurement to compare against.
    sys.refresh_cpu_usage();

    loop {
        interval.tick().await;

        sys.refresh_cpu_usage();
        sys.refresh_memory();
        networks.refresh(true);
        disks.refresh(true);

        // A clock set before the Unix epoch yields 0 rather than an error. The
        // `as u64` cast cannot truncate for any realistic wall-clock time.
        let timestamp_ms = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_millis() as u64;

        // The read guard is a temporary, so the lock is released as soon as the
        // clone has been taken.
        let engines = engine_state.read().await.clone();
        let gpu_events = gpu::detect_gpu_events(timestamp_ms);

        let snapshot = MetricsSnapshot {
            timestamp_ms,
            gpu: gpu::collect_gpu_metrics(),
            cpu: cpu::collect_cpu_metrics(&sys),
            memory: memory::collect_memory_metrics(&sys),
            disk: disk::collect_disk_metrics(&disks),
            network: network::collect_network_metrics(&networks),
            engines,
            gpu_events,
        };

        match serde_json::to_string(&snapshot) {
            Ok(json) => {
                // Best-effort delivery: a send error is deliberately ignored so the
                // collector keeps running.
                let _ = tx.send(json);
            }
            Err(e) => {
                tracing::error!("Failed to serialize metrics: {}", e);
            }
        }
    }
}
/// GPU readings carried in a `MetricsSnapshot`.
///
/// Every field is optional. NOTE(review): presumably `None` means the value could
/// not be read (or no GPU is available) — confirm against `gpu::collect_gpu_metrics`.
/// Units below are taken from the field names.
#[derive(Clone, serde::Serialize, Debug)]
pub struct GpuMetrics {
/// Device name.
pub name: Option<String>,
/// Utilization, in percent.
pub utilization_percent: Option<u32>,
/// Temperature, in degrees Celsius.
pub temperature_celsius: Option<u32>,
/// Power draw, in watts.
pub power_watts: Option<f64>,
/// Power limit, in watts.
pub power_limit_watts: Option<f64>,
/// Graphics clock, in MHz.
pub clock_graphics_mhz: Option<u32>,
/// SM clock, in MHz.
pub clock_sm_mhz: Option<u32>,
/// Memory clock, in MHz.
pub clock_memory_mhz: Option<u32>,
/// Fan speed, in percent.
pub fan_speed_percent: Option<u32>,
}
/// CPU readings carried in a `MetricsSnapshot`.
#[derive(Clone, serde::Serialize, Debug)]
pub struct CpuMetrics {
/// CPU name, if one is reported.
pub name: Option<String>,
/// Aggregate CPU usage, in percent (unit per the field name).
pub aggregate_percent: f32,
/// One entry per core; see `CoreMetrics`.
pub per_core: Vec<CoreMetrics>,
}
/// Usage reading for a single CPU core, as listed in `CpuMetrics::per_core`.
#[derive(Clone, serde::Serialize, Debug)]
pub struct CoreMetrics {
/// Core identifier. NOTE(review): presumably the core's index in the CPU list — confirm.
pub id: usize,
/// Usage of this core, in percent (unit per the field name).
pub usage_percent: f32,
}
/// Memory readings carried in a `MetricsSnapshot`.
///
/// All sizes are in bytes (per the field names). The GPU-related fields are optional.
#[derive(Clone, serde::Serialize, Debug)]
pub struct MemoryMetrics {
/// Total memory, in bytes.
pub total_bytes: u64,
/// Used memory, in bytes.
pub used_bytes: u64,
/// Available memory, in bytes.
pub available_bytes: u64,
/// Cached memory, in bytes.
pub cached_bytes: u64,
/// Estimated GPU memory, in bytes.
/// NOTE(review): how the estimate is derived is not visible here — see the `memory` module.
pub gpu_estimated_bytes: Option<u64>,
/// Total GPU memory, in bytes.
pub gpu_memory_total_bytes: Option<u64>,
/// Used GPU memory, in bytes.
pub gpu_memory_used_bytes: Option<u64>,
/// NOTE(review): presumably true when CPU and GPU share one memory pool — confirm.
pub is_unified: bool,
}
/// Disk throughput readings carried in a `MetricsSnapshot`.
#[derive(Clone, serde::Serialize, Debug)]
pub struct DiskMetrics {
/// Disk name, if one is reported.
/// NOTE(review): which disk(s) this covers is decided in `disk::collect_disk_metrics`.
pub name: Option<String>,
/// Read rate, in bytes per second (unit per the field name).
pub read_bytes_per_sec: u64,
/// Write rate, in bytes per second (unit per the field name).
pub write_bytes_per_sec: u64,
}
/// Network throughput readings carried in a `MetricsSnapshot`.
#[derive(Clone, serde::Serialize, Debug)]
pub struct NetworkMetrics {
/// Interface name, if one is reported.
/// NOTE(review): which interface(s) this covers is decided in `network::collect_network_metrics`.
pub name: Option<String>,
/// Receive rate, in bytes per second (unit per the field name).
pub rx_bytes_per_sec: u64,
/// Transmit rate, in bytes per second (unit per the field name).
pub tx_bytes_per_sec: u64,
}