// trueno/hardware/mod.rs

1//! Hardware Capability Detection (PMAT-447)
2//!
3//! Detects CPU SIMD capabilities, GPU presence, and calculates
4//! theoretical peak performance for roofline analysis.
5//!
6//! Integrates with `pmat brick-score` for hardware-aware profiling.
7
8use serde::{Deserialize, Serialize};
9use std::fs;
10use std::path::Path;
11
12/// Get hostname (native only, returns "wasm" on WASM targets)
13#[cfg(not(target_arch = "wasm32"))]
14fn get_hostname() -> String {
15    hostname::get().map(|h| h.to_string_lossy().to_string()).unwrap_or_else(|e| {
16        eprintln!("warning: failed to get hostname: {e}");
17        "unknown".to_string()
18    })
19}
20
/// Get hostname (WASM fallback)
#[cfg(target_arch = "wasm32")]
fn get_hostname() -> String {
    // WASM targets expose no hostname API; report a fixed placeholder.
    String::from("wasm")
}
26
/// SIMD instruction set width
///
/// Variant names double as the serialized representation (serde derives),
/// so renaming a variant would break saved `hardware.toml` profiles.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SimdWidth {
    /// No SIMD (scalar)
    Scalar,
    /// ARM NEON (128-bit, 4×f32)
    Neon128,
    /// SSE2 (128-bit, 4×f32)
    Sse2,
    /// AVX2 (256-bit, 8×f32)
    Avx2,
    /// AVX-512 (512-bit, 16×f32)
    Avx512,
    /// WebAssembly SIMD (128-bit, 4×f32)
    WasmSimd128,
}
43
44impl SimdWidth {
45    /// Number of f32 lanes
46    pub fn lanes(&self) -> usize {
47        match self {
48            SimdWidth::Scalar => 1,
49            SimdWidth::Neon128 | SimdWidth::Sse2 | SimdWidth::WasmSimd128 => 4,
50            SimdWidth::Avx2 => 8,
51            SimdWidth::Avx512 => 16,
52        }
53    }
54
55    /// Bit width
56    pub fn bits(&self) -> usize {
57        self.lanes() * 32
58    }
59
60    /// Typical speedup factor for compute-bound operations
61    pub fn compute_speedup(&self) -> f64 {
62        match self {
63            SimdWidth::Scalar => 1.0,
64            SimdWidth::Neon128 | SimdWidth::Sse2 | SimdWidth::WasmSimd128 => 4.0,
65            SimdWidth::Avx2 => 10.0,   // 8-12x measured in trueno-zram
66            SimdWidth::Avx512 => 12.0, // 8-13x measured
67        }
68    }
69}
70
/// GPU compute backend
///
/// Ordered roughly by the detection preference in `detect_gpu`
/// (CUDA is probed first when compiled in).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum GpuBackend {
    /// No GPU available
    None,
    /// NVIDIA CUDA
    Cuda,
    /// WebGPU (cross-platform)
    Wgpu,
    /// Apple Metal
    Metal,
    /// Vulkan compute
    Vulkan,
}
85
/// CPU capabilities
///
/// Populated by `detect_cpu`; frequency and bandwidth are conservative
/// estimates, not measured values (see `detect_cpu`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuCapability {
    /// CPU vendor (Intel, AMD, Apple, etc.)
    pub vendor: String,
    /// CPU model name
    pub model: String,
    /// Number of physical cores
    pub cores: usize,
    /// Number of logical threads
    pub threads: usize,
    /// Best available SIMD width
    pub simd: SimdWidth,
    /// Base frequency in GHz
    pub base_freq_ghz: f64,
    /// Theoretical peak GFLOP/s (FMA)
    pub peak_gflops: f64,
    /// Memory bandwidth in GB/s (estimated)
    pub memory_bw_gbps: f64,
}
106
/// GPU capabilities
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuCapability {
    /// GPU vendor
    pub vendor: String,
    /// GPU model name
    pub model: String,
    /// Compute backend
    pub backend: GpuBackend,
    /// CUDA compute capability (e.g., "8.9" for RTX 4090); `None` for non-CUDA backends
    pub compute_capability: Option<String>,
    /// Peak FP32 TFLOP/s
    pub peak_tflops_fp32: f64,
    /// Peak Tensor Core TFLOP/s (NVIDIA only)
    pub peak_tflops_tensor: Option<f64>,
    /// Memory bandwidth in GB/s
    pub memory_bw_gbps: f64,
    /// VRAM in GB
    pub vram_gb: f64,
}
127
/// Complete hardware capability profile
///
/// Serialized to/from `hardware.toml` for caching between runs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HardwareCapability {
    /// Detection timestamp
    pub timestamp: String,
    /// Hostname
    pub hostname: String,
    /// CPU capabilities
    pub cpu: CpuCapability,
    /// GPU capabilities (if present)
    pub gpu: Option<GpuCapability>,
    /// Roofline model parameters
    pub roofline: RooflineParams,
    /// PMAT-452: Byte budget configuration for compression/I/O workloads
    // `#[serde(default)]` keeps profiles written before PMAT-452 loadable:
    // a missing field deserializes to `None` instead of erroring.
    #[serde(default)]
    pub byte_budget: Option<crate::brick::ByteBudget>,
}
145
/// Roofline model parameters
///
/// The "ridge point" intensities below which a workload is memory-bound
/// and above which it is compute-bound.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RooflineParams {
    /// CPU arithmetic intensity threshold (GFLOP/s ÷ GB/s)
    pub cpu_arithmetic_intensity: f64,
    /// GPU arithmetic intensity threshold; `None` when no GPU was detected
    pub gpu_arithmetic_intensity: Option<f64>,
}
154
155impl HardwareCapability {
156    /// Detect hardware capabilities at runtime
157    pub fn detect() -> Self {
158        let cpu = detect_cpu();
159        let gpu = detect_gpu();
160
161        let cpu_ai = cpu.peak_gflops / cpu.memory_bw_gbps;
162        let gpu_ai = gpu.as_ref().map(|g| g.peak_tflops_fp32 * 1000.0 / g.memory_bw_gbps);
163        // PMAT-452: Extract memory bandwidth before moving cpu
164        let byte_budget_throughput = cpu.memory_bw_gbps.min(25.0);
165
166        HardwareCapability {
167            timestamp: chrono::Utc::now().to_rfc3339(),
168            hostname: get_hostname(),
169            cpu,
170            gpu,
171            roofline: RooflineParams {
172                cpu_arithmetic_intensity: cpu_ai,
173                gpu_arithmetic_intensity: gpu_ai,
174            },
175            // PMAT-452: Default byte budget based on memory bandwidth
176            byte_budget: Some(crate::brick::ByteBudget::from_throughput(byte_budget_throughput)),
177        }
178    }
179
180    /// Load from TOML file or detect if missing
181    pub fn load_or_detect(path: &Path) -> Self {
182        if path.exists() {
183            if let Ok(content) = fs::read_to_string(path) {
184                if let Ok(cap) = toml::from_str(&content) {
185                    return cap;
186                }
187            }
188        }
189        let cap = Self::detect();
190        // Try to cache it
191        let _ = cap.save(path);
192        cap
193    }
194
195    /// Save to TOML file
196    pub fn save(&self, path: &Path) -> std::io::Result<()> {
197        if let Some(parent) = path.parent() {
198            fs::create_dir_all(parent)?;
199        }
200        let content = toml::to_string_pretty(self)
201            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
202        fs::write(path, content)
203    }
204
205    /// Get the best available backend for a workload
206    pub fn best_backend(&self) -> GpuBackend {
207        self.gpu.as_ref().map(|g| g.backend).unwrap_or(GpuBackend::None)
208    }
209
210    /// Calculate expected throughput for a brick given its arithmetic intensity
211    pub fn expected_throughput_gflops(&self, arithmetic_intensity: f64, use_gpu: bool) -> f64 {
212        if use_gpu {
213            if let Some(gpu) = &self.gpu {
214                let memory_bound = gpu.memory_bw_gbps * arithmetic_intensity;
215                let compute_bound = gpu.peak_tflops_fp32 * 1000.0;
216                memory_bound.min(compute_bound)
217            } else {
218                self.cpu_expected_throughput(arithmetic_intensity)
219            }
220        } else {
221            self.cpu_expected_throughput(arithmetic_intensity)
222        }
223    }
224
225    fn cpu_expected_throughput(&self, arithmetic_intensity: f64) -> f64 {
226        let memory_bound = self.cpu.memory_bw_gbps * arithmetic_intensity;
227        let compute_bound = self.cpu.peak_gflops;
228        memory_bound.min(compute_bound)
229    }
230
231    /// Determine if workload is memory-bound or compute-bound
232    pub fn bottleneck(&self, arithmetic_intensity: f64, use_gpu: bool) -> Bottleneck {
233        let threshold = if use_gpu {
234            self.roofline.gpu_arithmetic_intensity.unwrap_or(f64::MAX)
235        } else {
236            self.roofline.cpu_arithmetic_intensity
237        };
238
239        if arithmetic_intensity < threshold {
240            Bottleneck::Memory
241        } else {
242            Bottleneck::Compute
243        }
244    }
245}
246
/// Workload bottleneck classification
///
/// Produced by `HardwareCapability::bottleneck` from the roofline model.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Bottleneck {
    /// Limited by memory bandwidth
    Memory,
    /// Limited by compute throughput
    Compute,
}
255
256/// Detect CPU capabilities
257fn detect_cpu() -> CpuCapability {
258    let simd = detect_simd();
259    let cores = num_cpus::get_physical();
260    let threads = num_cpus::get();
261
262    // Estimate frequency (fallback to 3.0 GHz if unknown)
263    let base_freq_ghz = 3.0;
264
265    // Calculate peak GFLOP/s: cores × lanes × 2 (FMA) × freq
266    let peak_gflops = (cores as f64) * (simd.lanes() as f64) * 2.0 * base_freq_ghz;
267
268    // Estimate memory bandwidth (DDR5-5600 dual channel ≈ 89.6 GB/s)
269    let memory_bw_gbps = 80.0; // Conservative estimate
270
271    CpuCapability {
272        vendor: "Unknown".to_string(),
273        model: "Unknown".to_string(),
274        cores,
275        threads,
276        simd,
277        base_freq_ghz,
278        peak_gflops,
279        memory_bw_gbps,
280    }
281}
282
283/// Detect best available SIMD width
284fn detect_simd() -> SimdWidth {
285    #[cfg(target_arch = "x86_64")]
286    {
287        if is_x86_feature_detected!("avx512f") {
288            return SimdWidth::Avx512;
289        }
290        if is_x86_feature_detected!("avx2") {
291            return SimdWidth::Avx2;
292        }
293        if is_x86_feature_detected!("sse2") {
294            return SimdWidth::Sse2;
295        }
296    }
297
298    #[cfg(target_arch = "aarch64")]
299    {
300        // NEON is always available on aarch64
301        return SimdWidth::Neon128;
302    }
303
304    #[cfg(target_arch = "wasm32")]
305    {
306        return SimdWidth::WasmSimd128;
307    }
308
309    SimdWidth::Scalar
310}
311
312/// Detect GPU capabilities
313fn detect_gpu() -> Option<GpuCapability> {
314    // Check for CUDA first (highest performance)
315    #[cfg(feature = "cuda")]
316    {
317        if let Some(gpu) = detect_cuda_gpu() {
318            return Some(gpu);
319        }
320    }
321
322    // Fallback: no GPU detected
323    None
324}
325
/// CUDA probe stub — currently always reports no device.
#[cfg(feature = "cuda")]
fn detect_cuda_gpu() -> Option<GpuCapability> {
    // This would use cuDeviceGetAttribute in a real implementation
    // For now, return None and let the caller provide GPU info
    None
}
332
/// Default hardware.toml path.
///
/// With the `hardware-detect` feature the profile lives under
/// `$HOME/.pmat/hardware.toml` (current directory if `$HOME` is unknown);
/// otherwise it resolves relative to the working directory.
pub fn default_hardware_path() -> std::path::PathBuf {
    #[cfg(feature = "hardware-detect")]
    {
        let base = dirs::home_dir().unwrap_or_else(|| std::path::PathBuf::from("."));
        base.join(".pmat").join("hardware.toml")
    }
    #[cfg(not(feature = "hardware-detect"))]
    {
        std::path::PathBuf::from(".pmat").join("hardware.toml")
    }
}
347
// Unit tests live in the sibling `tests` module; compiled only for test builds.
#[cfg(test)]
mod tests;