trueno_gpu/monitor/device/mod.rs
1//! Unified Compute Device Abstraction (TRUENO-SPEC-020)
2//!
3//! Hardware abstraction layer providing a unified interface for CPU, NVIDIA GPU,
4//! and AMD GPU monitoring.
5//!
6//! # Design Principles (Toyota Way)
7//!
8//! | Principle | Application |
9//! |-----------|-------------|
10//! | **Genchi Genbutsu** | Direct hardware sampling via native APIs |
11//! | **Poka-Yoke** | Type-safe metrics prevent unit confusion |
12//!
13//! # References
14//!
15//! - [Nickolls2008] CUDA programming model
16//! - [Jia2018] GPU microarchitecture analysis
17
18mod cpu;
19mod types;
20
21pub use cpu::*;
22pub use types::*;
23
24use crate::GpuError;
25
26// ============================================================================
27// Unified Device Trait (TRUENO-SPEC-020 Section 2.1)
28// ============================================================================
29
30/// Unified compute device abstraction
31///
32/// All compute devices (CPU, NVIDIA GPU, AMD GPU) implement this trait
33/// for consistent monitoring across heterogeneous hardware.
34///
35/// # Example
36///
37/// ```rust,ignore
38/// use trueno_gpu::monitor::{ComputeDevice, CpuDevice};
39///
40/// let cpu = CpuDevice::new();
41/// println!("CPU: {} @ {:.1}%", cpu.device_name(), cpu.compute_utilization()?);
42/// ```
43pub trait ComputeDevice: Send + Sync {
44 /// Get the unique device identifier
45 fn device_id(&self) -> DeviceId;
46
47 /// Get the device name (e.g., "NVIDIA GeForce RTX 4090")
48 fn device_name(&self) -> &str;
49
50 /// Get the device type
51 fn device_type(&self) -> DeviceType;
52
53 /// Get compute utilization (0.0-100.0%)
54 fn compute_utilization(&self) -> Result<f64, GpuError>;
55
56 /// Get compute clock speed in MHz
57 fn compute_clock_mhz(&self) -> Result<u32, GpuError>;
58
59 /// Get compute temperature in Celsius
60 fn compute_temperature_c(&self) -> Result<f64, GpuError>;
61
62 /// Get current power consumption in Watts
63 fn compute_power_watts(&self) -> Result<f64, GpuError>;
64
65 /// Get power limit in Watts
66 fn compute_power_limit_watts(&self) -> Result<f64, GpuError>;
67
68 /// Get used memory in bytes
69 fn memory_used_bytes(&self) -> Result<u64, GpuError>;
70
71 /// Get total memory in bytes
72 fn memory_total_bytes(&self) -> Result<u64, GpuError>;
73
74 /// Get memory bandwidth in GB/s (if available)
75 fn memory_bandwidth_gbps(&self) -> Result<f64, GpuError>;
76
77 /// Get number of compute units (SMs for NVIDIA, CUs for AMD, cores for CPU)
78 fn compute_unit_count(&self) -> u32;
79
80 /// Get number of active compute units
81 fn active_compute_units(&self) -> Result<u32, GpuError>;
82
83 /// Get PCIe TX bytes per second (GPU only)
84 fn pcie_tx_bytes_per_sec(&self) -> Result<u64, GpuError>;
85
86 /// Get PCIe RX bytes per second (GPU only)
87 fn pcie_rx_bytes_per_sec(&self) -> Result<u64, GpuError>;
88
89 /// Get PCIe generation (1, 2, 3, 4, 5)
90 fn pcie_generation(&self) -> u8;
91
92 /// Get PCIe width (x1, x4, x8, x16)
93 fn pcie_width(&self) -> u8;
94
95 /// Refresh metrics from hardware
96 fn refresh(&mut self) -> Result<(), GpuError>;
97
98 // =========================================================================
99 // Default implementations for derived metrics
100 // =========================================================================
101
102 /// Get memory usage percentage (0.0-100.0)
103 fn memory_usage_percent(&self) -> Result<f64, GpuError> {
104 let used = self.memory_used_bytes()?;
105 let total = self.memory_total_bytes()?;
106 if total == 0 {
107 return Ok(0.0);
108 }
109 Ok((used as f64 / total as f64) * 100.0)
110 }
111
112 /// Get available memory in bytes
113 fn memory_available_bytes(&self) -> Result<u64, GpuError> {
114 let used = self.memory_used_bytes()?;
115 let total = self.memory_total_bytes()?;
116 Ok(total.saturating_sub(used))
117 }
118
119 /// Get memory used in MB
120 fn memory_used_mb(&self) -> Result<u64, GpuError> {
121 Ok(self.memory_used_bytes()? / (1024 * 1024))
122 }
123
124 /// Get memory total in MB
125 fn memory_total_mb(&self) -> Result<u64, GpuError> {
126 Ok(self.memory_total_bytes()? / (1024 * 1024))
127 }
128
129 /// Get memory total in GB
130 fn memory_total_gb(&self) -> Result<f64, GpuError> {
131 Ok(self.memory_total_bytes()? as f64 / (1024.0 * 1024.0 * 1024.0))
132 }
133
134 /// Get power usage percentage (current/limit * 100)
135 fn power_usage_percent(&self) -> Result<f64, GpuError> {
136 let current = self.compute_power_watts()?;
137 let limit = self.compute_power_limit_watts()?;
138 if limit == 0.0 {
139 return Ok(0.0);
140 }
141 Ok((current / limit) * 100.0)
142 }
143
144 /// Check if device is throttling due to temperature
145 fn is_thermal_throttling(&self) -> Result<bool, GpuError> {
146 let temp = self.compute_temperature_c()?;
147 // Conservative threshold - most GPUs throttle around 83-85°C
148 Ok(temp > 80.0)
149 }
150
151 /// Check if device is throttling due to power
152 fn is_power_throttling(&self) -> Result<bool, GpuError> {
153 let percent = self.power_usage_percent()?;
154 Ok(percent > 95.0)
155 }
156}
157
158// ============================================================================
159// Tests (Extreme TDD - TRUENO-SPEC-020)
160// ============================================================================
161
162#[cfg(test)]
163mod tests_core;
164
165#[cfg(test)]
166mod tests_coverage;
167
168#[cfg(test)]
169mod tests_error_propagation;