1use serde::{Deserialize, Serialize};
9use std::fs;
10use std::path::Path;
11
12#[cfg(not(target_arch = "wasm32"))]
14fn get_hostname() -> String {
15 hostname::get().map(|h| h.to_string_lossy().to_string()).unwrap_or_else(|e| {
16 eprintln!("warning: failed to get hostname: {e}");
17 "unknown".to_string()
18 })
19}
20
/// Hostname stand-in for `wasm32` builds: always the fixed string `"wasm"`.
#[cfg(target_arch = "wasm32")]
fn get_hostname() -> String {
    String::from("wasm")
}
26
27#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
29pub enum SimdWidth {
30 Scalar,
32 Neon128,
34 Sse2,
36 Avx2,
38 Avx512,
40 WasmSimd128,
42}
43
44impl SimdWidth {
45 pub fn lanes(&self) -> usize {
47 match self {
48 SimdWidth::Scalar => 1,
49 SimdWidth::Neon128 | SimdWidth::Sse2 | SimdWidth::WasmSimd128 => 4,
50 SimdWidth::Avx2 => 8,
51 SimdWidth::Avx512 => 16,
52 }
53 }
54
55 pub fn bits(&self) -> usize {
57 self.lanes() * 32
58 }
59
60 pub fn compute_speedup(&self) -> f64 {
62 match self {
63 SimdWidth::Scalar => 1.0,
64 SimdWidth::Neon128 | SimdWidth::Sse2 | SimdWidth::WasmSimd128 => 4.0,
65 SimdWidth::Avx2 => 10.0, SimdWidth::Avx512 => 12.0, }
68 }
69}
70
71#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
73pub enum GpuBackend {
74 None,
76 Cuda,
78 Wgpu,
80 Metal,
82 Vulkan,
84}
85
86#[derive(Debug, Clone, Serialize, Deserialize)]
88pub struct CpuCapability {
89 pub vendor: String,
91 pub model: String,
93 pub cores: usize,
95 pub threads: usize,
97 pub simd: SimdWidth,
99 pub base_freq_ghz: f64,
101 pub peak_gflops: f64,
103 pub memory_bw_gbps: f64,
105}
106
107#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct GpuCapability {
110 pub vendor: String,
112 pub model: String,
114 pub backend: GpuBackend,
116 pub compute_capability: Option<String>,
118 pub peak_tflops_fp32: f64,
120 pub peak_tflops_tensor: Option<f64>,
122 pub memory_bw_gbps: f64,
124 pub vram_gb: f64,
126}
127
128#[derive(Debug, Clone, Serialize, Deserialize)]
130pub struct HardwareCapability {
131 pub timestamp: String,
133 pub hostname: String,
135 pub cpu: CpuCapability,
137 pub gpu: Option<GpuCapability>,
139 pub roofline: RooflineParams,
141 #[serde(default)]
143 pub byte_budget: Option<crate::brick::ByteBudget>,
144}
145
146#[derive(Debug, Clone, Serialize, Deserialize)]
148pub struct RooflineParams {
149 pub cpu_arithmetic_intensity: f64,
151 pub gpu_arithmetic_intensity: Option<f64>,
153}
154
155impl HardwareCapability {
156 pub fn detect() -> Self {
158 let cpu = detect_cpu();
159 let gpu = detect_gpu();
160
161 let cpu_ai = cpu.peak_gflops / cpu.memory_bw_gbps;
162 let gpu_ai = gpu.as_ref().map(|g| g.peak_tflops_fp32 * 1000.0 / g.memory_bw_gbps);
163 let byte_budget_throughput = cpu.memory_bw_gbps.min(25.0);
165
166 HardwareCapability {
167 timestamp: chrono::Utc::now().to_rfc3339(),
168 hostname: get_hostname(),
169 cpu,
170 gpu,
171 roofline: RooflineParams {
172 cpu_arithmetic_intensity: cpu_ai,
173 gpu_arithmetic_intensity: gpu_ai,
174 },
175 byte_budget: Some(crate::brick::ByteBudget::from_throughput(byte_budget_throughput)),
177 }
178 }
179
180 pub fn load_or_detect(path: &Path) -> Self {
182 if path.exists() {
183 if let Ok(content) = fs::read_to_string(path) {
184 if let Ok(cap) = toml::from_str(&content) {
185 return cap;
186 }
187 }
188 }
189 let cap = Self::detect();
190 let _ = cap.save(path);
192 cap
193 }
194
195 pub fn save(&self, path: &Path) -> std::io::Result<()> {
197 if let Some(parent) = path.parent() {
198 fs::create_dir_all(parent)?;
199 }
200 let content = toml::to_string_pretty(self)
201 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
202 fs::write(path, content)
203 }
204
205 pub fn best_backend(&self) -> GpuBackend {
207 self.gpu.as_ref().map(|g| g.backend).unwrap_or(GpuBackend::None)
208 }
209
210 pub fn expected_throughput_gflops(&self, arithmetic_intensity: f64, use_gpu: bool) -> f64 {
212 if use_gpu {
213 if let Some(gpu) = &self.gpu {
214 let memory_bound = gpu.memory_bw_gbps * arithmetic_intensity;
215 let compute_bound = gpu.peak_tflops_fp32 * 1000.0;
216 memory_bound.min(compute_bound)
217 } else {
218 self.cpu_expected_throughput(arithmetic_intensity)
219 }
220 } else {
221 self.cpu_expected_throughput(arithmetic_intensity)
222 }
223 }
224
225 fn cpu_expected_throughput(&self, arithmetic_intensity: f64) -> f64 {
226 let memory_bound = self.cpu.memory_bw_gbps * arithmetic_intensity;
227 let compute_bound = self.cpu.peak_gflops;
228 memory_bound.min(compute_bound)
229 }
230
231 pub fn bottleneck(&self, arithmetic_intensity: f64, use_gpu: bool) -> Bottleneck {
233 let threshold = if use_gpu {
234 self.roofline.gpu_arithmetic_intensity.unwrap_or(f64::MAX)
235 } else {
236 self.roofline.cpu_arithmetic_intensity
237 };
238
239 if arithmetic_intensity < threshold {
240 Bottleneck::Memory
241 } else {
242 Bottleneck::Compute
243 }
244 }
245}
246
247#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
249pub enum Bottleneck {
250 Memory,
252 Compute,
254}
255
256fn detect_cpu() -> CpuCapability {
258 let simd = detect_simd();
259 let cores = num_cpus::get_physical();
260 let threads = num_cpus::get();
261
262 let base_freq_ghz = 3.0;
264
265 let peak_gflops = (cores as f64) * (simd.lanes() as f64) * 2.0 * base_freq_ghz;
267
268 let memory_bw_gbps = 80.0; CpuCapability {
272 vendor: "Unknown".to_string(),
273 model: "Unknown".to_string(),
274 cores,
275 threads,
276 simd,
277 base_freq_ghz,
278 peak_gflops,
279 memory_bw_gbps,
280 }
281}
282
283fn detect_simd() -> SimdWidth {
285 #[cfg(target_arch = "x86_64")]
286 {
287 if is_x86_feature_detected!("avx512f") {
288 return SimdWidth::Avx512;
289 }
290 if is_x86_feature_detected!("avx2") {
291 return SimdWidth::Avx2;
292 }
293 if is_x86_feature_detected!("sse2") {
294 return SimdWidth::Sse2;
295 }
296 }
297
298 #[cfg(target_arch = "aarch64")]
299 {
300 return SimdWidth::Neon128;
302 }
303
304 #[cfg(target_arch = "wasm32")]
305 {
306 return SimdWidth::WasmSimd128;
307 }
308
309 SimdWidth::Scalar
310}
311
312fn detect_gpu() -> Option<GpuCapability> {
314 #[cfg(feature = "cuda")]
316 {
317 if let Some(gpu) = detect_cuda_gpu() {
318 return Some(gpu);
319 }
320 }
321
322 None
324}
325
/// CUDA device probe, compiled only with the `cuda` feature.
///
/// Currently a stub: it performs no detection and always returns `None`.
#[cfg(feature = "cuda")]
fn detect_cuda_gpu() -> Option<GpuCapability> {
    None
}
332
/// Default location of the cached hardware profile: `.pmat/hardware.toml`.
///
/// With the `hardware-detect` feature the path is rooted in the user's home
/// directory, or in `.` when no home directory can be determined. Without the
/// feature it is the bare relative path `.pmat/hardware.toml`.
pub fn default_hardware_path() -> std::path::PathBuf {
    use std::path::PathBuf;

    #[cfg(feature = "hardware-detect")]
    let base = dirs::home_dir().unwrap_or_else(|| PathBuf::from("."));

    // Joining onto an empty path yields the joined component unchanged, so this
    // produces exactly `.pmat/hardware.toml`.
    #[cfg(not(feature = "hardware-detect"))]
    let base = PathBuf::new();

    base.join(".pmat").join("hardware.toml")
}
347
348#[cfg(test)]
349mod tests;