Skip to main content

somatize_worker/
detect.rs

1//! Auto-detection of hardware capabilities and resource limiting.
2//!
3//! Scans the system for CPU cores, RAM, GPUs, and Python environments.
4//! Users can apply [`ResourceLimits`] to restrict what fraction of the
5//! hardware a worker exposes (Slurm-style).
6
7use crate::protocol::{Capabilities, GpuInfo};
8use std::path::Path;
9use std::process::Command;
10
/// Caps on how much of the detected hardware a worker may expose.
///
/// Every `Option` field follows the same rule: `None` means "no cap, use
/// everything that was detected", while `Some(n)` is an upper bound. A bound
/// larger than what the machine actually has is harmless, because
/// `Capabilities::with_limits` takes the minimum of the two.
///
/// The type is comparable (`PartialEq`/`Eq`) so configuration code and tests
/// can check two limit sets for equality directly.
///
/// ```bash
/// soma-worker --cpus 4 --memory 8G --gpus 1 --max-concurrent 2
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ResourceLimits {
    /// Upper bound on the number of CPU cores to expose (`None` = all detected).
    pub max_cpus: Option<usize>,
    /// Upper bound on RAM, in bytes (`None` = all detected).
    pub max_memory_bytes: Option<u64>,
    /// Upper bound on the number of GPUs to expose (`None` = all detected).
    pub max_gpus: Option<usize>,
    /// Maximum number of plans this worker will accept at the same time.
    ///
    /// Unlike the other fields this one is always set; it is not a hardware
    /// quantity and has no detected value to fall back on.
    pub max_concurrent: usize,
}
29
30impl Default for ResourceLimits {
31    fn default() -> Self {
32        Self {
33            max_cpus: None,
34            max_memory_bytes: None,
35            max_gpus: None,
36            max_concurrent: 4,
37        }
38    }
39}
40
41impl Capabilities {
42    /// Auto-detect hardware capabilities of the current machine.
43    pub fn detect() -> Self {
44        let sys = sysinfo::System::new_all();
45
46        let cpu_cores = sys.cpus().len();
47        let ram_bytes = sys.total_memory();
48        let gpus = detect_gpus();
49        let python_envs = detect_python_envs();
50
51        // Auto-tag based on detected hardware
52        let mut tags = Vec::new();
53        if !gpus.is_empty() {
54            tags.push("gpu".to_string());
55        }
56        tags.push("cpu".to_string());
57
58        Self {
59            cpu_cores,
60            ram_bytes,
61            gpus,
62            python_envs,
63            tags,
64        }
65    }
66
67    /// Apply resource limits: effective = min(detected, limit).
68    pub fn with_limits(mut self, limits: &ResourceLimits) -> Self {
69        if let Some(max_cpus) = limits.max_cpus {
70            self.cpu_cores = self.cpu_cores.min(max_cpus);
71        }
72        if let Some(max_mem) = limits.max_memory_bytes {
73            self.ram_bytes = self.ram_bytes.min(max_mem);
74        }
75        if let Some(max_gpus) = limits.max_gpus {
76            self.gpus.truncate(max_gpus);
77        }
78        self
79    }
80
81    /// Summary string for logging.
82    pub fn summary(&self) -> String {
83        let gpu_str = if self.gpus.is_empty() {
84            "none".to_string()
85        } else {
86            self.gpus
87                .iter()
88                .map(|g| {
89                    format!(
90                        "{} ({:.1} GB)",
91                        g.name,
92                        g.memory_bytes as f64 / (1024.0 * 1024.0 * 1024.0)
93                    )
94                })
95                .collect::<Vec<_>>()
96                .join(", ")
97        };
98        format!(
99            "{} CPUs, {:.1} GB RAM, GPUs: {}, Python: {:?}, tags: {:?}",
100            self.cpu_cores,
101            self.ram_bytes as f64 / (1024.0 * 1024.0 * 1024.0),
102            gpu_str,
103            self.python_envs,
104            self.tags,
105        )
106    }
107}
108
109/// Detect NVIDIA GPUs via nvidia-smi.
110fn detect_gpus() -> Vec<GpuInfo> {
111    let output = Command::new("nvidia-smi")
112        .args([
113            "--query-gpu=name,memory.total",
114            "--format=csv,noheader,nounits",
115        ])
116        .output();
117
118    let output = match output {
119        Ok(o) if o.status.success() => o,
120        _ => return vec![], // no nvidia-smi or no GPUs
121    };
122
123    let stdout = String::from_utf8_lossy(&output.stdout);
124    stdout
125        .lines()
126        .filter_map(|line| {
127            let parts: Vec<&str> = line.splitn(2, ',').map(|s| s.trim()).collect();
128            if parts.len() == 2 {
129                let name = parts[0].to_string();
130                let memory_mb: u64 = parts[1].parse().unwrap_or(0);
131                Some(GpuInfo {
132                    name,
133                    memory_bytes: memory_mb * 1024 * 1024,
134                })
135            } else {
136                None
137            }
138        })
139        .collect()
140}
141
142/// Detect available Python interpreters.
143fn detect_python_envs() -> Vec<String> {
144    let candidates = ["python3", "python"];
145    let mut envs = Vec::new();
146
147    for cmd in &candidates {
148        let Ok(output) = Command::new(cmd).args(["--version"]).output() else {
149            continue;
150        };
151        if !output.status.success() {
152            continue;
153        }
154        let version = String::from_utf8_lossy(&output.stdout);
155        let version = version.trim();
156        if let Ok(which) = Command::new("which").arg(cmd).output() {
157            let path = String::from_utf8_lossy(&which.stdout).trim().to_string();
158            envs.push(format!("{version} ({path})"));
159        } else {
160            envs.push(version.to_string());
161        }
162    }
163
164    // Detect conda envs
165    if let Ok(output) = Command::new("conda")
166        .args(["env", "list", "--json"])
167        .output()
168        && output.status.success()
169        && let Ok(json) = serde_json::from_slice::<serde_json::Value>(&output.stdout)
170        && let Some(envs_arr) = json.get("envs").and_then(|v| v.as_array())
171    {
172        for env in envs_arr {
173            if let Some(path) = env.as_str() {
174                let name = Path::new(path)
175                    .file_name()
176                    .unwrap_or_default()
177                    .to_string_lossy()
178                    .to_string();
179                if !name.is_empty() {
180                    envs.push(format!("conda:{name}"));
181                }
182            }
183        }
184    }
185
186    envs
187}
188
#[cfg(test)]
mod tests {
    use super::*;

    /// Bytes in one GiB, used to keep the fixtures readable.
    const GIB: u64 = 1024 * 1024 * 1024;

    /// An A100-like GPU entry for fixtures.
    fn a100() -> GpuInfo {
        GpuInfo {
            name: "A100".into(),
            memory_bytes: 80_000_000_000,
        }
    }

    /// A `Capabilities` value with the given hardware and no Python envs or tags.
    fn bare_caps(cpu_cores: usize, ram_bytes: u64, gpus: Vec<GpuInfo>) -> Capabilities {
        Capabilities {
            cpu_cores,
            ram_bytes,
            gpus,
            python_envs: vec![],
            tags: vec![],
        }
    }

    /// Detection on the host running the tests must report real CPU and RAM
    /// figures and always carry the "cpu" tag.
    #[test]
    fn detect_finds_cpus_and_ram() {
        let detected = Capabilities::detect();

        assert!(detected.cpu_cores > 0, "should detect at least 1 CPU");
        assert!(detected.ram_bytes > 0, "should detect RAM");
        assert!(detected.tags.contains(&"cpu".to_string()));
    }

    /// Limits below the detected values clamp CPUs, RAM and the GPU list.
    #[test]
    fn limits_restrict_capabilities() {
        let limits = ResourceLimits {
            max_cpus: Some(4),
            max_memory_bytes: Some(8 * GIB),
            max_gpus: Some(1),
            max_concurrent: 2,
        };

        let limited = bare_caps(16, 64 * GIB, vec![a100(), a100()]).with_limits(&limits);

        assert_eq!(limited.cpu_cores, 4);
        assert_eq!(limited.ram_bytes, 8 * 1024 * 1024 * 1024);
        assert_eq!(limited.gpus.len(), 1);
    }

    /// Default limits (all `None`) leave the detected values untouched.
    #[test]
    fn limits_none_keeps_all() {
        let unlimited = ResourceLimits::default();

        let limited = bare_caps(8, 32_000_000_000, vec![]).with_limits(&unlimited);

        assert_eq!(limited.cpu_cores, 8);
        assert_eq!(limited.ram_bytes, 32_000_000_000);
    }

    /// The log summary reports the CPU count and RAM with one decimal.
    #[test]
    fn summary_format() {
        let caps = Capabilities {
            python_envs: vec!["Python 3.11".into()],
            tags: vec!["cpu".into()],
            ..bare_caps(4, 8 * GIB, vec![])
        };

        let rendered = caps.summary();

        assert!(rendered.contains("4 CPUs"));
        assert!(rendered.contains("8.0 GB RAM"));
    }
}