Skip to main content

trueno_gpu/monitor/device/
mod.rs

//! Unified Compute Device Abstraction (TRUENO-SPEC-020)
//!
//! Hardware abstraction layer providing a unified interface for CPU, NVIDIA GPU,
//! and AMD GPU monitoring.
//!
//! # Design Principles (Toyota Way)
//!
//! | Principle | Application |
//! |-----------|-------------|
//! | **Genchi Genbutsu** | Direct hardware sampling via native APIs |
//! | **Poka-Yoke** | Type-safe metrics prevent unit confusion |
//!
//! # References
//!
//! - [Nickolls2008] CUDA programming model
//! - [Jia2018] GPU microarchitecture analysis
17
18mod cpu;
19mod types;
20
21pub use cpu::*;
22pub use types::*;
23
24use crate::GpuError;
25
// ============================================================================
// Unified Device Trait (TRUENO-SPEC-020 Section 2.1)
// ============================================================================
29
30/// Unified compute device abstraction
31///
32/// All compute devices (CPU, NVIDIA GPU, AMD GPU) implement this trait
33/// for consistent monitoring across heterogeneous hardware.
34///
35/// # Example
36///
37/// ```rust,ignore
38/// use trueno_gpu::monitor::{ComputeDevice, CpuDevice};
39///
40/// let cpu = CpuDevice::new();
41/// println!("CPU: {} @ {:.1}%", cpu.device_name(), cpu.compute_utilization()?);
42/// ```
43pub trait ComputeDevice: Send + Sync {
44    /// Get the unique device identifier
45    fn device_id(&self) -> DeviceId;
46
47    /// Get the device name (e.g., "NVIDIA GeForce RTX 4090")
48    fn device_name(&self) -> &str;
49
50    /// Get the device type
51    fn device_type(&self) -> DeviceType;
52
53    /// Get compute utilization (0.0-100.0%)
54    fn compute_utilization(&self) -> Result<f64, GpuError>;
55
56    /// Get compute clock speed in MHz
57    fn compute_clock_mhz(&self) -> Result<u32, GpuError>;
58
59    /// Get compute temperature in Celsius
60    fn compute_temperature_c(&self) -> Result<f64, GpuError>;
61
62    /// Get current power consumption in Watts
63    fn compute_power_watts(&self) -> Result<f64, GpuError>;
64
65    /// Get power limit in Watts
66    fn compute_power_limit_watts(&self) -> Result<f64, GpuError>;
67
68    /// Get used memory in bytes
69    fn memory_used_bytes(&self) -> Result<u64, GpuError>;
70
71    /// Get total memory in bytes
72    fn memory_total_bytes(&self) -> Result<u64, GpuError>;
73
74    /// Get memory bandwidth in GB/s (if available)
75    fn memory_bandwidth_gbps(&self) -> Result<f64, GpuError>;
76
77    /// Get number of compute units (SMs for NVIDIA, CUs for AMD, cores for CPU)
78    fn compute_unit_count(&self) -> u32;
79
80    /// Get number of active compute units
81    fn active_compute_units(&self) -> Result<u32, GpuError>;
82
83    /// Get PCIe TX bytes per second (GPU only)
84    fn pcie_tx_bytes_per_sec(&self) -> Result<u64, GpuError>;
85
86    /// Get PCIe RX bytes per second (GPU only)
87    fn pcie_rx_bytes_per_sec(&self) -> Result<u64, GpuError>;
88
89    /// Get PCIe generation (1, 2, 3, 4, 5)
90    fn pcie_generation(&self) -> u8;
91
92    /// Get PCIe width (x1, x4, x8, x16)
93    fn pcie_width(&self) -> u8;
94
95    /// Refresh metrics from hardware
96    fn refresh(&mut self) -> Result<(), GpuError>;
97
98    // =========================================================================
99    // Default implementations for derived metrics
100    // =========================================================================
101
102    /// Get memory usage percentage (0.0-100.0)
103    fn memory_usage_percent(&self) -> Result<f64, GpuError> {
104        let used = self.memory_used_bytes()?;
105        let total = self.memory_total_bytes()?;
106        if total == 0 {
107            return Ok(0.0);
108        }
109        Ok((used as f64 / total as f64) * 100.0)
110    }
111
112    /// Get available memory in bytes
113    fn memory_available_bytes(&self) -> Result<u64, GpuError> {
114        let used = self.memory_used_bytes()?;
115        let total = self.memory_total_bytes()?;
116        Ok(total.saturating_sub(used))
117    }
118
119    /// Get memory used in MB
120    fn memory_used_mb(&self) -> Result<u64, GpuError> {
121        Ok(self.memory_used_bytes()? / (1024 * 1024))
122    }
123
124    /// Get memory total in MB
125    fn memory_total_mb(&self) -> Result<u64, GpuError> {
126        Ok(self.memory_total_bytes()? / (1024 * 1024))
127    }
128
129    /// Get memory total in GB
130    fn memory_total_gb(&self) -> Result<f64, GpuError> {
131        Ok(self.memory_total_bytes()? as f64 / (1024.0 * 1024.0 * 1024.0))
132    }
133
134    /// Get power usage percentage (current/limit * 100)
135    fn power_usage_percent(&self) -> Result<f64, GpuError> {
136        let current = self.compute_power_watts()?;
137        let limit = self.compute_power_limit_watts()?;
138        if limit == 0.0 {
139            return Ok(0.0);
140        }
141        Ok((current / limit) * 100.0)
142    }
143
144    /// Check if device is throttling due to temperature
145    fn is_thermal_throttling(&self) -> Result<bool, GpuError> {
146        let temp = self.compute_temperature_c()?;
147        // Conservative threshold - most GPUs throttle around 83-85°C
148        Ok(temp > 80.0)
149    }
150
151    /// Check if device is throttling due to power
152    fn is_power_throttling(&self) -> Result<bool, GpuError> {
153        let percent = self.power_usage_percent()?;
154        Ok(percent > 95.0)
155    }
156}
157
// ============================================================================
// Tests (Extreme TDD - TRUENO-SPEC-020)
// ============================================================================
161
162#[cfg(test)]
163mod tests_core;
164
165#[cfg(test)]
166mod tests_coverage;
167
168#[cfg(test)]
169mod tests_error_propagation;