Skip to main content

car_inference/
hardware.rs

1//! Hardware detection — auto-configure models and context based on system capabilities.
2
3use serde::{Serialize, Deserialize};
4
/// Snapshot of detected system capabilities plus the model/context
/// recommendations derived from them. Serializable so it can be
/// reported to a UI or cached between runs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HardwareInfo {
    /// Operating system name, e.g. "macos", "linux", "windows".
    pub os: String,
    /// CPU architecture from `std::env::consts::ARCH`, e.g. "aarch64", "x86_64".
    pub arch: String,
    /// Logical CPU core count (falls back to 1 if detection fails).
    pub cpu_cores: usize,
    /// Total physical RAM in MB (best effort; defaults to 8192 when unknown).
    pub total_ram_mb: u64,
    /// Which inference backend to use (Metal / CUDA / CPU).
    pub gpu_backend: GpuBackend,
    /// GPU-usable memory in MB; `None` when there is none to report
    /// (CPU backend, or CUDA memory could not be queried).
    pub gpu_memory_mb: Option<u64>,
    /// Recommended model based on available resources.
    pub recommended_model: String,
    /// Recommended max context length in tokens.
    pub recommended_context: usize,
    /// Maximum model size in MB that fits in memory (with headroom for KV cache).
    pub max_model_mb: u64,
}
20
/// Compute backend used for model inference.
/// Serialized in snake_case: "metal", "cuda", "cpu".
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum GpuBackend {
    /// Apple GPU (unified memory — shares system RAM).
    Metal,
    /// NVIDIA GPU via CUDA.
    Cuda,
    /// No GPU acceleration; run on the CPU.
    Cpu,
}
28
29impl HardwareInfo {
30    /// Auto-detect system hardware and compute recommendations.
31    pub fn detect() -> Self {
32        let os = detect_os();
33        let arch = std::env::consts::ARCH.to_string();
34        let cpu_cores = std::thread::available_parallelism()
35            .map(|n| n.get()).unwrap_or(1);
36        let total_ram_mb = detect_ram_mb();
37        let gpu_backend = detect_gpu_backend();
38        let gpu_memory_mb = detect_gpu_memory_mb(&gpu_backend, total_ram_mb);
39
40        // Compute how much memory is available for models
41        // On unified memory (Apple Silicon): GPU shares system RAM
42        // Budget: use ~60% of available memory for model weights.
43        // Reserve ~700MB for the embedding model (Qwen3-Embedding-0.6B Q8_0).
44        // Rest for KV cache, activations, and OS.
45        let available_mb = gpu_memory_mb.unwrap_or(total_ram_mb);
46        let embedding_model_mb: u64 = 700; // Qwen3-Embedding-0.6B-Q8_0
47        let max_model_mb = ((available_mb as f64 * 0.6) as u64).saturating_sub(embedding_model_mb);
48
49        let recommended_model = recommend_model(max_model_mb);
50        let recommended_context = recommend_context(available_mb, &recommended_model);
51
52        Self {
53            os, arch, cpu_cores, total_ram_mb,
54            gpu_backend, gpu_memory_mb,
55            recommended_model, recommended_context, max_model_mb,
56        }
57    }
58}
59
/// Return the OS name for the compile target.
///
/// `std::env::consts::OS` already yields the canonical lowercase names
/// ("macos", "linux", "windows", ...), so the previous per-OS `cfg!`
/// chain was redundant — every branch produced exactly this value.
fn detect_os() -> String {
    std::env::consts::OS.into()
}
66
/// Total physical RAM in megabytes, best effort per platform.
///
/// macOS queries `sysctl hw.memsize` (bytes); Linux parses the
/// `MemTotal:` line of `/proc/meminfo` (kB). Any probe failure — or an
/// unsupported platform — falls back to assuming 8 GB.
fn detect_ram_mb() -> u64 {
    #[cfg(target_os = "macos")]
    {
        let probed = std::process::Command::new("sysctl")
            .args(["-n", "hw.memsize"])
            .output()
            .ok()
            .and_then(|out| String::from_utf8(out.stdout).ok())
            .and_then(|text| text.trim().parse::<u64>().ok())
            .map(|bytes| bytes / (1024 * 1024));
        if let Some(mb) = probed {
            return mb;
        }
    }
    #[cfg(target_os = "linux")]
    {
        let probed = std::fs::read_to_string("/proc/meminfo")
            .ok()
            .and_then(|content| {
                content
                    .lines()
                    .find(|line| line.starts_with("MemTotal:"))
                    .and_then(|line| line.split_whitespace().nth(1))
                    .and_then(|field| field.parse::<u64>().ok())
            })
            .map(|kb| kb / 1024);
        if let Some(mb) = probed {
            return mb;
        }
    }
    // Unknown platform or probe failure: assume 8 GB.
    8192
}
101
102fn detect_gpu_backend() -> GpuBackend {
103    #[cfg(feature = "metal")]
104    { return GpuBackend::Metal; }
105    #[cfg(feature = "cuda")]
106    { return GpuBackend::Cuda; }
107    #[cfg(not(any(feature = "metal", feature = "cuda")))]
108    {
109        // Check if we're on Apple Silicon (could use Metal even if feature isn't compiled)
110        if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") {
111            GpuBackend::Metal
112        } else {
113            GpuBackend::Cpu
114        }
115    }
116}
117
118fn detect_gpu_memory_mb(backend: &GpuBackend, total_ram_mb: u64) -> Option<u64> {
119    match backend {
120        GpuBackend::Metal => {
121            // Apple Silicon has unified memory — GPU can use most of system RAM.
122            // macOS reserves ~2-4GB for OS, so usable = total - 4GB
123            Some(total_ram_mb.saturating_sub(4096))
124        }
125        GpuBackend::Cuda => {
126            // Try nvidia-smi
127            #[cfg(target_os = "linux")]
128            {
129                if let Ok(output) = std::process::Command::new("nvidia-smi")
130                    .args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"])
131                    .output()
132                {
133                    if let Ok(s) = String::from_utf8(output.stdout) {
134                        if let Ok(mb) = s.trim().parse::<u64>() {
135                            return Some(mb);
136                        }
137                    }
138                }
139            }
140            None
141        }
142        GpuBackend::Cpu => None,
143    }
144}
145
146/// Recommend the best model that fits in available memory.
/// Recommend the best model that fits in available memory.
///
/// Tiers are ordered largest-first; the first tier whose minimum
/// footprint fits wins, and anything below the smallest tier falls
/// back to the 0.6B model.
fn recommend_model(max_model_mb: u64) -> String {
    // (minimum MB required, model name), descending by size.
    const TIERS: [(u64, &str); 4] = [
        (17000, "Qwen3-30B-A3B"),
        (4900, "Qwen3-8B"),
        (2500, "Qwen3-4B"),
        (1800, "Qwen3-1.7B"),
    ];
    TIERS
        .iter()
        .find(|&&(min_mb, _)| max_model_mb >= min_mb)
        .map(|&(_, name)| name)
        .unwrap_or("Qwen3-0.6B")
        .to_string()
}
160
161/// Recommend context length based on available memory and model size.
/// Recommend context length (tokens) from available memory and model size.
///
/// Budget = available memory minus the model weights minus a 1 GB
/// reservation; that budget is divided by the model's KV-cache cost
/// (MB per 1k tokens) and clamped to [2048, 131072].
fn recommend_context(available_mb: u64, model_name: &str) -> usize {
    // (weights footprint in MB, KV-cache cost in MB per 1k tokens).
    // Unknown models are treated like the smallest one.
    let (model_mb, kv_cost_per_1k): (u64, f64) = match model_name {
        "Qwen3-1.7B" => (1800, 0.3),
        "Qwen3-4B" => (2500, 0.5),
        "Qwen3-8B" => (4900, 1.0),
        "Qwen3-30B-A3B" => (17000, 1.5),
        "Qwen3-0.6B" | _ => (650, 0.1),
    };

    let kv_budget_mb = available_mb
        .saturating_sub(model_mb)
        .saturating_sub(1024) as f64;
    let max_context = (kv_budget_mb / kv_cost_per_1k * 1000.0) as usize;

    max_context.clamp(2048, 131072)
}