use anyhow::Result;
use std::sync::{Arc, Mutex, OnceLock};
/// Snapshot of GPU metrics as produced by `GpuMonitor::get_stats`.
///
/// The derived `Default` yields all-zero values; `get_stats` returns that
/// default when NVML is unavailable or the `gpu` feature is disabled.
#[derive(Debug, Clone, Default)]
pub struct GpuStats {
/// GPU utilization taken from NVML's utilization rates
/// (presumably a percentage in 0-100 — the unit tests assert that range).
pub utilization: f32,
/// Used device memory as a percentage of total device memory
/// (computed as `used / total * 100`).
pub memory_utilization: f32,
/// GPU temperature sensor reading; 0 when the reading could not be obtained.
/// NOTE(review): unit is presumably degrees Celsius — confirm against NVML docs.
pub temperature: f32,
/// Power usage, i.e. the NVML reading divided by 1000; 0 when unavailable.
/// NOTE(review): presumably milliwatts converted to watts — confirm.
pub power_usage: f32,
/// Free device memory: the NVML `free` value divided by 1024 * 1024
/// (presumably bytes converted to MiB).
pub memory_free_mb: u64,
/// Total device memory: the NVML `total` value divided by 1024 * 1024
/// (presumably bytes converted to MiB).
pub memory_total_mb: u64,
}
/// Handle used to query GPU metrics.
///
/// With the `gpu` feature enabled it stores an optional NVML handle plus the
/// index of the device to query; without the feature the struct has no fields.
pub struct GpuMonitor {
/// NVML handle; `None` when NVML initialization failed in `with_device`.
#[cfg(feature = "gpu")]
nvml: Option<Arc<Mutex<nvml_wrapper::Nvml>>>,
/// Index of the GPU device whose metrics `get_stats` reads.
#[cfg(feature = "gpu")]
device_index: u32,
}
/// Lazily initialized shared monitor instance; populated on the first call to
/// `GpuMonitor::global` and cloned (as an `Arc`) on every call.
static GPU_MONITOR: OnceLock<Arc<Mutex<GpuMonitor>>> = OnceLock::new();
impl GpuMonitor {
    /// Creates a monitor bound to GPU device index 0.
    ///
    /// Shorthand for [`GpuMonitor::with_device`] called with `0`.
    pub fn new() -> Self {
        Self::with_device(0)
    }

    /// Creates a monitor bound to the GPU at `device_index`.
    ///
    /// With the `gpu` feature enabled this tries to initialize NVML. If that
    /// fails, a warning is logged and the monitor is still returned, but
    /// without an NVML handle (see [`GpuMonitor::is_available`]). The device
    /// index is not validated here; an invalid index surfaces as an error
    /// from [`GpuMonitor::get_stats`].
    ///
    /// Without the `gpu` feature, `device_index` is ignored.
    pub fn with_device(device_index: u32) -> Self {
        #[cfg(feature = "gpu")]
        {
            let nvml = match nvml_wrapper::Nvml::init() {
                Ok(handle) => Some(Arc::new(Mutex::new(handle))),
                Err(e) => {
                    tracing::warn!("Failed to initialize NVML: {}. GPU monitoring disabled.", e);
                    None
                }
            };
            Self { nvml, device_index }
        }
        #[cfg(not(feature = "gpu"))]
        {
            // No fields exist without the feature; consume the argument to
            // avoid an unused-variable warning.
            let _ = device_index;
            Self {}
        }
    }

    /// Returns the shared monitor instance, creating it with
    /// [`GpuMonitor::new`] on first use.
    ///
    /// Every call returns a clone of the same `Arc`.
    pub fn global() -> Arc<Mutex<GpuMonitor>> {
        GPU_MONITOR
            .get_or_init(|| Arc::new(Mutex::new(GpuMonitor::new())))
            .clone()
    }

    /// Reads a fresh [`GpuStats`] snapshot for the configured device.
    ///
    /// Returns `GpuStats::default()` (all zeros) when the `gpu` feature is
    /// disabled or NVML failed to initialize.
    ///
    /// Temperature and power readings are best-effort: if either query fails
    /// the corresponding field is reported as 0 instead of failing the call.
    ///
    /// # Errors
    ///
    /// With the `gpu` feature enabled, returns an error if the NVML mutex is
    /// poisoned, the device cannot be looked up by index, or the utilization
    /// or memory queries fail.
    pub fn get_stats(&self) -> Result<GpuStats> {
        #[cfg(feature = "gpu")]
        {
            let Some(nvml_arc) = &self.nvml else {
                return Ok(GpuStats::default());
            };
            let nvml = nvml_arc
                .lock()
                .map_err(|e| anyhow::anyhow!("Failed to lock NVML: {}", e))?;
            let device = nvml.device_by_index(self.device_index)?;
            let utilization = device.utilization_rates()?;
            let memory_info = device.memory_info()?;
            // Not every device exposes these sensors; fall back to 0 rather
            // than discarding the rest of the snapshot.
            let temperature = device
                .temperature(nvml_wrapper::enum_wrappers::device::TemperatureSensor::Gpu)
                .unwrap_or(0);
            // NVML reading scaled down by 1000 (presumably mW -> W).
            let power_usage = device.power_usage().unwrap_or(0) as f32 / 1000.0;
            let memory_utilization =
                Self::memory_utilization_percent(memory_info.used, memory_info.total);

            Ok(GpuStats {
                utilization: utilization.gpu as f32,
                memory_utilization,
                temperature: temperature as f32,
                power_usage,
                memory_free_mb: memory_info.free / (1024 * 1024),
                memory_total_mb: memory_info.total / (1024 * 1024),
            })
        }
        #[cfg(not(feature = "gpu"))]
        {
            Ok(GpuStats::default())
        }
    }

    /// Returns used memory as a percentage of total memory.
    ///
    /// A `total` of 0 yields 0.0 instead of the NaN that the unguarded
    /// floating-point division would produce.
    #[cfg(feature = "gpu")]
    fn memory_utilization_percent(used: u64, total: u64) -> f32 {
        if total == 0 {
            return 0.0;
        }
        (used as f32 / total as f32) * 100.0
    }

    /// Returns the current GPU utilization, or 0.0 if the stats could not be
    /// read.
    ///
    /// This is a convenience wrapper over [`GpuMonitor::get_stats`] that
    /// deliberately swallows errors.
    pub fn get_utilization(&self) -> f32 {
        self.get_stats()
            .map(|stats| stats.utilization)
            .unwrap_or(0.0)
    }

    /// Reports whether this monitor holds an initialized NVML handle.
    ///
    /// Always `false` without the `gpu` feature. A `true` result only means
    /// NVML initialization succeeded; it does not check the device index.
    pub fn is_available(&self) -> bool {
        #[cfg(feature = "gpu")]
        {
            self.nvml.is_some()
        }
        #[cfg(not(feature = "gpu"))]
        {
            false
        }
    }

    /// Returns the number of GPU devices reported by NVML.
    ///
    /// Initializes its own NVML instance for the query instead of reusing a
    /// monitor's handle. Returns 0 when the `gpu` feature is disabled, when
    /// NVML cannot be initialized, or when the count query fails.
    pub fn device_count() -> u32 {
        #[cfg(feature = "gpu")]
        {
            match nvml_wrapper::Nvml::init() {
                Ok(nvml) => nvml.device_count().unwrap_or(0),
                Err(_) => 0,
            }
        }
        #[cfg(not(feature = "gpu"))]
        {
            0
        }
    }
}
impl Default for GpuMonitor {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Construction must succeed whether or not a GPU is present.
    #[test]
    fn test_gpu_monitor_creation() {
        let gpu = GpuMonitor::new();
        let _ = gpu.is_available();
    }

    /// Reading stats must not fail; when NVML is available the reported
    /// utilization has to lie within 0..=100.
    #[test]
    fn test_gpu_stats() {
        let gpu = GpuMonitor::new();
        let outcome = gpu.get_stats();
        assert!(outcome.is_ok());

        if !gpu.is_available() {
            return;
        }
        let snapshot = outcome.expect("stats should be available");
        assert!((0.0..=100.0).contains(&snapshot.utilization));
    }

    /// Querying the device count must not panic.
    #[test]
    fn test_device_count() {
        let _ = GpuMonitor::device_count();
    }

    /// Repeated calls to `global` hand back the same shared instance.
    #[test]
    fn test_global_monitor() {
        let first = GpuMonitor::global();
        let second = GpuMonitor::global();
        assert!(Arc::ptr_eq(&first, &second));
    }
}