use serde::{Deserialize, Serialize};
use std::fs;
use std::path::Path;
/// Best-effort hostname lookup used to tag capability reports.
///
/// Falls back to `"unknown"` (after printing a warning to stderr) when the
/// OS-level hostname query fails.
#[cfg(not(target_arch = "wasm32"))]
fn get_hostname() -> String {
    match hostname::get() {
        Ok(name) => name.to_string_lossy().into_owned(),
        Err(e) => {
            eprintln!("warning: failed to get hostname: {e}");
            "unknown".to_string()
        }
    }
}
/// Hostname stand-in for wasm targets, which expose no OS hostname API.
#[cfg(target_arch = "wasm32")]
fn get_hostname() -> String {
    String::from("wasm")
}
/// SIMD vector width available on the host CPU, expressed in f32 lanes
/// (see [`SimdWidth::lanes`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SimdWidth {
/// No SIMD: one f32 per operation.
Scalar,
/// AArch64 NEON, 128-bit registers (4 x f32).
Neon128,
/// x86 SSE2, 128-bit registers (4 x f32).
Sse2,
/// x86 AVX2, 256-bit registers (8 x f32).
Avx2,
/// x86 AVX-512, 512-bit registers (16 x f32).
Avx512,
/// WebAssembly SIMD128, 128-bit registers (4 x f32).
WasmSimd128,
}
impl SimdWidth {
pub fn lanes(&self) -> usize {
match self {
SimdWidth::Scalar => 1,
SimdWidth::Neon128 | SimdWidth::Sse2 | SimdWidth::WasmSimd128 => 4,
SimdWidth::Avx2 => 8,
SimdWidth::Avx512 => 16,
}
}
pub fn bits(&self) -> usize {
self.lanes() * 32
}
pub fn compute_speedup(&self) -> f64 {
match self {
SimdWidth::Scalar => 1.0,
SimdWidth::Neon128 | SimdWidth::Sse2 | SimdWidth::WasmSimd128 => 4.0,
SimdWidth::Avx2 => 10.0, SimdWidth::Avx512 => 12.0, }
}
}
/// GPU compute backend used to drive the device.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum GpuBackend {
/// No GPU detected / GPU execution unavailable.
None,
/// NVIDIA CUDA.
Cuda,
/// Cross-platform wgpu (WebGPU implementation).
Wgpu,
/// Apple Metal.
Metal,
/// Vulkan compute.
Vulkan,
}
/// Detected (or estimated) CPU characteristics used for roofline modeling.
///
/// NOTE(review): `detect_cpu` currently fills vendor/model with "Unknown"
/// and uses fixed estimates for frequency and bandwidth; treat these
/// figures as approximate.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuCapability {
/// CPU vendor string ("Unknown" when not detected).
pub vendor: String,
/// CPU model string ("Unknown" when not detected).
pub model: String,
/// Physical core count.
pub cores: usize,
/// Logical (hyper-)thread count.
pub threads: usize,
/// Widest SIMD instruction set available.
pub simd: SimdWidth,
/// Base clock in GHz (estimated, not measured).
pub base_freq_ghz: f64,
/// Peak f32 throughput in GFLOPS (derived from cores, lanes, and clock).
pub peak_gflops: f64,
/// Main-memory bandwidth in GB/s (estimated).
pub memory_bw_gbps: f64,
}
/// Detected GPU characteristics used for roofline modeling.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuCapability {
/// GPU vendor string.
pub vendor: String,
/// GPU model string.
pub model: String,
/// Backend through which the device is driven.
pub backend: GpuBackend,
/// CUDA compute capability (e.g. "8.6"); `None` for non-CUDA backends
/// — presumably; verify once `detect_cuda_gpu` is implemented.
pub compute_capability: Option<String>,
/// Peak FP32 throughput in TFLOPS.
pub peak_tflops_fp32: f64,
/// Peak tensor-core throughput in TFLOPS, if the device has tensor units.
pub peak_tflops_tensor: Option<f64>,
/// Device memory bandwidth in GB/s.
pub memory_bw_gbps: f64,
/// Device memory (VRAM) in GB.
pub vram_gb: f64,
}
/// Full hardware capability profile: CPU, optional GPU, and derived
/// roofline parameters. Serialized to/from TOML for caching on disk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HardwareCapability {
/// RFC 3339 timestamp of when detection ran.
pub timestamp: String,
/// Hostname of the machine that was profiled.
pub hostname: String,
/// CPU profile (always present).
pub cpu: CpuCapability,
/// GPU profile, if a device was detected.
pub gpu: Option<GpuCapability>,
/// Derived roofline ridge points.
pub roofline: RooflineParams,
// serde(default) keeps older cached files (written before this field
// existed) deserializable: missing key -> None.
#[serde(default)]
pub byte_budget: Option<crate::brick::ByteBudget>,
}
/// Roofline ridge points in FLOP/byte (computed as GFLOPS / GB/s).
///
/// A kernel whose arithmetic intensity falls below the ridge point is
/// memory-bound; above it, compute-bound.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RooflineParams {
/// CPU ridge point: peak_gflops / memory_bw_gbps.
pub cpu_arithmetic_intensity: f64,
/// GPU ridge point, if a GPU was detected.
pub gpu_arithmetic_intensity: Option<f64>,
}
impl HardwareCapability {
    /// Detect the host's hardware capability profile.
    ///
    /// CPU figures come from [`detect_cpu`] (partly fixed estimates); the
    /// GPU entry is present only when [`detect_gpu`] finds a device.
    pub fn detect() -> Self {
        let cpu = detect_cpu();
        let gpu = detect_gpu();
        // Roofline ridge point (FLOP/byte): below this intensity a kernel
        // is memory-bound, above it compute-bound.
        let cpu_ai = cpu.peak_gflops / cpu.memory_bw_gbps;
        // GPU peak is in TFLOPS; scale by 1000 to GFLOPS so the units match
        // the GB/s bandwidth figure.
        let gpu_ai = gpu.as_ref().map(|g| g.peak_tflops_fp32 * 1000.0 / g.memory_bw_gbps);
        // Conservative cap of 25 GB/s for the byte budget — TODO confirm
        // this matches the assumptions inside crate::brick::ByteBudget.
        let byte_budget_throughput = cpu.memory_bw_gbps.min(25.0);
        HardwareCapability {
            timestamp: chrono::Utc::now().to_rfc3339(),
            hostname: get_hostname(),
            cpu,
            gpu,
            roofline: RooflineParams {
                cpu_arithmetic_intensity: cpu_ai,
                gpu_arithmetic_intensity: gpu_ai,
            },
            byte_budget: Some(crate::brick::ByteBudget::from_throughput(byte_budget_throughput)),
        }
    }

    /// Load a cached capability profile from `path`, or run detection and
    /// cache the result.
    ///
    /// Read, parse, and save failures are reported on stderr instead of
    /// being silently swallowed; any failure still falls back to fresh
    /// detection, so the returned profile is always usable.
    pub fn load_or_detect(path: &Path) -> Self {
        if path.exists() {
            match fs::read_to_string(path) {
                Ok(content) => match toml::from_str(&content) {
                    Ok(cap) => return cap,
                    Err(e) => {
                        eprintln!(
                            "warning: invalid hardware file {}: {e}; re-detecting",
                            path.display()
                        );
                    }
                },
                Err(e) => {
                    eprintln!(
                        "warning: failed to read {}: {e}; re-detecting",
                        path.display()
                    );
                }
            }
        }
        let cap = Self::detect();
        // Caching is best-effort: a failed save is reported but not fatal.
        if let Err(e) = cap.save(path) {
            eprintln!(
                "warning: failed to cache hardware profile to {}: {e}",
                path.display()
            );
        }
        cap
    }

    /// Serialize the profile as pretty TOML and write it to `path`,
    /// creating parent directories as needed.
    ///
    /// # Errors
    /// Returns an I/O error on filesystem failure, or an
    /// `InvalidData`-wrapped error if TOML serialization fails.
    pub fn save(&self, path: &Path) -> std::io::Result<()> {
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent)?;
        }
        let content = toml::to_string_pretty(self)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
        fs::write(path, content)
    }

    /// Backend of the detected GPU, or [`GpuBackend::None`] without one.
    pub fn best_backend(&self) -> GpuBackend {
        self.gpu.as_ref().map(|g| g.backend).unwrap_or(GpuBackend::None)
    }

    /// Roofline throughput estimate (GFLOPS) for a kernel with the given
    /// arithmetic intensity (FLOP/byte).
    ///
    /// Falls back to the CPU model when `use_gpu` is set but no GPU was
    /// detected.
    pub fn expected_throughput_gflops(&self, arithmetic_intensity: f64, use_gpu: bool) -> f64 {
        if use_gpu {
            if let Some(gpu) = &self.gpu {
                let memory_bound = gpu.memory_bw_gbps * arithmetic_intensity;
                let compute_bound = gpu.peak_tflops_fp32 * 1000.0;
                return memory_bound.min(compute_bound);
            }
        }
        self.cpu_expected_throughput(arithmetic_intensity)
    }

    /// CPU roofline: min(bandwidth x intensity, peak compute), in GFLOPS.
    fn cpu_expected_throughput(&self, arithmetic_intensity: f64) -> f64 {
        let memory_bound = self.cpu.memory_bw_gbps * arithmetic_intensity;
        let compute_bound = self.cpu.peak_gflops;
        memory_bound.min(compute_bound)
    }

    /// Classify a kernel as memory- or compute-bound against the stored
    /// ridge point.
    ///
    /// With `use_gpu` and no GPU ridge point recorded, the threshold is
    /// `f64::MAX`, so everything classifies as memory-bound.
    pub fn bottleneck(&self, arithmetic_intensity: f64, use_gpu: bool) -> Bottleneck {
        let threshold = if use_gpu {
            self.roofline.gpu_arithmetic_intensity.unwrap_or(f64::MAX)
        } else {
            self.roofline.cpu_arithmetic_intensity
        };
        if arithmetic_intensity < threshold {
            Bottleneck::Memory
        } else {
            Bottleneck::Compute
        }
    }
}
/// Which resource limits a kernel under the roofline model.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Bottleneck {
/// Limited by memory bandwidth (intensity below the ridge point).
Memory,
/// Limited by compute throughput (intensity at or above the ridge point).
Compute,
}
/// Build a CPU capability estimate for the host.
///
/// Core/thread counts and SIMD width are detected; vendor, model, clock,
/// and bandwidth are fixed placeholders (see inline notes).
fn detect_cpu() -> CpuCapability {
    let simd = detect_simd();
    let physical = num_cpus::get_physical();
    let logical = num_cpus::get();
    // Placeholder clock estimate — not measured on this machine.
    let base_freq_ghz = 3.0;
    // Peak f32 throughput: cores x SIMD lanes x 2 (fused multiply-add,
    // presumably — TODO confirm) x GHz.
    let peak_gflops = physical as f64 * simd.lanes() as f64 * 2.0 * base_freq_ghz;
    // Placeholder bandwidth estimate — not measured on this machine.
    let memory_bw_gbps = 80.0;
    CpuCapability {
        vendor: String::from("Unknown"),
        model: String::from("Unknown"),
        cores: physical,
        threads: logical,
        simd,
        base_freq_ghz,
        peak_gflops,
        memory_bw_gbps,
    }
}
/// Detect the widest SIMD instruction set available on the host.
///
/// Restructured as cfg-gated inner functions: the original put an
/// unconditional `return` inside the aarch64/wasm32 blocks, which made the
/// trailing `SimdWidth::Scalar` fallback dead code and triggered an
/// `unreachable_code` warning on those targets. Exactly one `arch_simd`
/// is compiled per target, so every path is reachable.
fn detect_simd() -> SimdWidth {
    // x86_64: runtime feature detection, widest first.
    #[cfg(target_arch = "x86_64")]
    fn arch_simd() -> SimdWidth {
        if is_x86_feature_detected!("avx512f") {
            SimdWidth::Avx512
        } else if is_x86_feature_detected!("avx2") {
            SimdWidth::Avx2
        } else if is_x86_feature_detected!("sse2") {
            SimdWidth::Sse2
        } else {
            SimdWidth::Scalar
        }
    }
    // aarch64: NEON is baseline, no runtime probe needed.
    #[cfg(target_arch = "aarch64")]
    fn arch_simd() -> SimdWidth {
        SimdWidth::Neon128
    }
    // wasm32: assume SIMD128, matching the original behavior.
    #[cfg(target_arch = "wasm32")]
    fn arch_simd() -> SimdWidth {
        SimdWidth::WasmSimd128
    }
    // Any other architecture: scalar fallback.
    #[cfg(not(any(
        target_arch = "x86_64",
        target_arch = "aarch64",
        target_arch = "wasm32"
    )))]
    fn arch_simd() -> SimdWidth {
        SimdWidth::Scalar
    }
    arch_simd()
}
/// Probe for a usable GPU.
///
/// Only the CUDA probe exists today, and only when the `cuda` feature is
/// enabled; otherwise no GPU is reported.
fn detect_gpu() -> Option<GpuCapability> {
    #[cfg(feature = "cuda")]
    {
        let cuda = detect_cuda_gpu();
        if cuda.is_some() {
            return cuda;
        }
    }
    None
}
/// CUDA device probe. Currently a stub that always reports no device.
/// TODO: query the CUDA runtime/driver for real device properties.
#[cfg(feature = "cuda")]
fn detect_cuda_gpu() -> Option<GpuCapability> {
None
}
/// Location of the cached hardware capability file.
///
/// With the `hardware-detect` feature this is `~/.pmat/hardware.toml`
/// (falling back to `./.pmat/hardware.toml` when no home directory is
/// found); without the feature it is always relative to the current
/// working directory.
pub fn default_hardware_path() -> std::path::PathBuf {
    #[cfg(feature = "hardware-detect")]
    {
        let base = dirs::home_dir().unwrap_or_else(|| std::path::PathBuf::from("."));
        base.join(".pmat").join("hardware.toml")
    }
    #[cfg(not(feature = "hardware-detect"))]
    {
        std::path::PathBuf::from(".pmat").join("hardware.toml")
    }
}
// Unit tests live in the sibling `tests.rs` / `tests/` module file;
// compiled only for test builds.
#[cfg(test)]
mod tests;