1use ferrum_types::RuntimeConfigSnapshot;
10use serde::{Deserialize, Serialize};
11use std::collections::BTreeMap;
12
/// Snapshot of the environment a run was executed in.
///
/// [`Env::hash`] hashes the `serde_json` serialization of this struct, so the
/// fields and `serde` attributes below determine the resulting [`EnvHash`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Env {
    /// Commit SHA, supplied by the caller of [`Env::capture_minimal`].
    pub commit_sha: String,
    /// Hardware identifier; [`Env::capture_minimal`] fills it from [`detect_hw_id`].
    pub hw_id: String,
    /// NVIDIA driver version (see [`detect_nvidia_driver`]); left out of the
    /// serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub driver: Option<String>,
    /// CUDA version (see [`detect_cuda_version`]); left out of the serialized
    /// form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cuda: Option<String>,
    /// Rust compiler version (see [`detect_rust_version`]).
    pub rust: String,
    /// Feature names; [`Env::capture_minimal`] stores them sorted and deduplicated.
    pub ferrum_features: Vec<String>,

    /// Value produced by [`detect_gpu_clock_lock_mhz`]; presumably MHz, going
    /// by the field name — TODO confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gpu_clock_lock_mhz: Option<u32>,
    /// Value produced by [`detect_gpu_power_limit_w`]; presumably watts, going
    /// by the field name — TODO confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gpu_power_limit_w: Option<u32>,
    /// Persistence-mode flag (see [`detect_gpu_persistence`]).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gpu_persistence_mode: Option<bool>,
    /// Auto-boost flag; [`Env::capture_minimal`] always leaves this `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gpu_auto_boost: Option<bool>,

    /// Environment variables whose names start with `FERRUM_`
    /// (see [`capture_ferrum_env`]), keyed by variable name.
    pub ferrum_env: BTreeMap<String, String>,

    /// Runtime configuration snapshot; falls back to its `Default` when the
    /// field is missing from deserialized input.
    #[serde(default)]
    pub runtime_config: RuntimeConfigSnapshot,

    /// Optional list of vLLM arguments; [`Env::capture_minimal`] leaves this
    /// `None`, and it is left out of the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub vllm_args: Option<Vec<String>>,
}
59
/// Textual environment hash; [`Env::hash`] produces values of the form
/// `sha256:<lower-case hex digest>`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EnvHash(pub String);
63
64impl EnvHash {
65 pub fn as_str(&self) -> &str {
66 &self.0
67 }
68}
69
70impl std::fmt::Display for EnvHash {
71 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72 f.write_str(&self.0)
73 }
74}
75
76impl Env {
77 pub fn hash(&self) -> EnvHash {
84 use sha2::{Digest, Sha256};
85 let canonical = serde_json::to_vec(self).expect("Env serialization must not fail");
86 let mut hasher = Sha256::new();
87 hasher.update(&canonical);
88 let digest = hasher.finalize();
89 EnvHash(format!("sha256:{:x}", digest))
90 }
91
92 pub fn capture_minimal(commit_sha: String, ferrum_features: Vec<String>) -> Self {
96 let mut feat = ferrum_features;
97 feat.sort();
98 feat.dedup();
99 Self {
100 commit_sha,
101 hw_id: detect_hw_id(),
102 driver: detect_nvidia_driver(),
103 cuda: detect_cuda_version(),
104 rust: detect_rust_version(),
105 ferrum_features: feat,
106 gpu_clock_lock_mhz: detect_gpu_clock_lock_mhz(),
107 gpu_power_limit_w: detect_gpu_power_limit_w(),
108 gpu_persistence_mode: detect_gpu_persistence(),
109 gpu_auto_boost: None,
110 ferrum_env: capture_ferrum_env(),
111 runtime_config: RuntimeConfigSnapshot::capture_current(),
112 vllm_args: None,
113 }
114 }
115}
116
117pub fn detect_hw_id() -> String {
121 if let Some(name) = nvidia_smi_query("name") {
123 let normalized = name
125 .to_lowercase()
126 .replace("nvidia ", "")
127 .replace("geforce ", "")
128 .trim()
129 .replace(' ', "-");
130 if !normalized.is_empty() {
131 return normalized;
132 }
133 }
134 #[cfg(target_os = "macos")]
135 {
136 if let Some(brand) = std::process::Command::new("sysctl")
137 .args(["-n", "machdep.cpu.brand_string"])
138 .output()
139 .ok()
140 .and_then(|o| String::from_utf8(o.stdout).ok())
141 {
142 return brand.trim().to_lowercase().replace(' ', "-");
143 }
144 }
145 if let Ok(content) = std::fs::read_to_string("/proc/cpuinfo") {
147 for line in content.lines() {
148 if let Some(rest) = line.strip_prefix("model name") {
149 if let Some(name) = rest.split(':').nth(1) {
150 return name.trim().to_lowercase().replace(' ', "-");
151 }
152 }
153 }
154 }
155 "unknown".to_string()
156}
157
158pub fn detect_nvidia_driver() -> Option<String> {
161 nvidia_smi_query("driver_version")
162}
163
164pub fn detect_cuda_version() -> Option<String> {
168 if let Ok(out) = std::process::Command::new("nvcc").arg("--version").output() {
170 if let Ok(s) = String::from_utf8(out.stdout) {
171 for line in s.lines() {
172 if let Some(idx) = line.find("release ") {
173 let rest = &line[idx + 8..];
174 if let Some(comma) = rest.find(',') {
175 return Some(rest[..comma].trim().to_string());
176 }
177 }
178 }
179 }
180 }
181 nvidia_smi_query("cuda_version")
183}
184
/// Runs `nvidia-smi --query-gpu=<field> --format=csv,noheader,nounits` and
/// returns the trimmed first line of its standard output.
///
/// Yields `None` when the command cannot be started, exits unsuccessfully,
/// prints non-UTF-8 output, prints no lines, or when the first line is
/// empty, `[Not Supported]` or `[N/A]`.
fn nvidia_smi_query(field: &str) -> Option<String> {
    let query_flag = format!("--query-gpu={field}");
    let output = std::process::Command::new("nvidia-smi")
        .arg(&query_flag)
        .arg("--format=csv,noheader,nounits")
        .output()
        .ok()
        .filter(|out| out.status.success())?;

    let stdout = String::from_utf8(output.stdout).ok()?;
    match stdout.lines().next()?.trim() {
        "" | "[Not Supported]" | "[N/A]" => None,
        value => Some(value.to_string()),
    }
}
205
206pub fn detect_gpu_clock_lock_mhz() -> Option<u32> {
210 nvidia_smi_query("clocks.gr").and_then(|s| s.parse::<u32>().ok())
211}
212
213pub fn detect_gpu_power_limit_w() -> Option<u32> {
215 nvidia_smi_query("power.limit").and_then(|s| s.split('.').next()?.parse::<u32>().ok())
216}
217
218pub fn detect_gpu_persistence() -> Option<bool> {
220 nvidia_smi_query("persistence_mode").map(|s| s == "Enabled")
221}
222
/// Returns the second whitespace-separated word of `<rustc> --version`
/// output (the version number in `rustc 1.78.0 (...)`).
///
/// The compiler binary is taken from the `RUSTC` environment variable when
/// it is set and readable, otherwise `rustc` is used. Returns `"unknown"`
/// when the command cannot be run, its output is not UTF-8, or it has fewer
/// than two words.
pub fn detect_rust_version() -> String {
    let compiler = match std::env::var("RUSTC") {
        Ok(path) => path,
        Err(_) => "rustc".to_string(),
    };

    let version = match std::process::Command::new(compiler).arg("--version").output() {
        Ok(out) => String::from_utf8(out.stdout)
            .ok()
            .and_then(|text| text.split_whitespace().nth(1).map(str::to_owned)),
        Err(_) => None,
    };

    version.unwrap_or_else(|| "unknown".to_string())
}
240
/// Collects every environment variable whose name starts with `FERRUM_`,
/// keyed by variable name.
///
/// Iterates with `std::env::vars_os` rather than `std::env::vars`, because
/// the latter panics as soon as *any* variable in the process environment
/// (even an unrelated one) is not valid Unicode. Variables whose name is not
/// valid Unicode are skipped; a `FERRUM_` variable with a non-Unicode value
/// is kept with the invalid sequences replaced by U+FFFD.
pub fn capture_ferrum_env() -> BTreeMap<String, String> {
    std::env::vars_os()
        .filter_map(|(key, value)| {
            let key = key.into_string().ok()?;
            if !key.starts_with("FERRUM_") {
                return None;
            }
            let value = value
                .into_string()
                .unwrap_or_else(|raw| raw.to_string_lossy().into_owned());
            Some((key, value))
        })
        .collect()
}
247
#[cfg(test)]
mod tests {
    use super::*;

    /// Fully populated `Env` with fixed values, shared by the hash tests.
    fn fixture_env() -> Env {
        let ferrum_env: BTreeMap<String, String> = [
            ("FERRUM_KV_MAX_BLOCKS", "2048"),
            ("FERRUM_PREFIX_CACHE", "0"),
        ]
        .iter()
        .map(|(key, value)| (key.to_string(), value.to_string()))
        .collect();

        Env {
            commit_sha: String::from("b769bbd"),
            hw_id: String::from("rtx-4090"),
            driver: Some(String::from("555.42.06")),
            cuda: Some(String::from("12.4")),
            rust: String::from("1.78.0"),
            ferrum_features: vec![String::from("cuda"), String::from("vllm-moe-marlin")],
            gpu_clock_lock_mhz: Some(2520),
            gpu_power_limit_w: Some(350),
            gpu_persistence_mode: Some(true),
            gpu_auto_boost: Some(false),
            ferrum_env,
            runtime_config: RuntimeConfigSnapshot::default(),
            vllm_args: None,
        }
    }

    /// Fixture whose `ferrum_env` holds exactly `pairs`, inserted in the given order.
    fn fixture_with_env(pairs: &[(&str, &str)]) -> Env {
        let mut env = fixture_env();
        env.ferrum_env.clear();
        for (key, value) in pairs {
            env.ferrum_env.insert(key.to_string(), value.to_string());
        }
        env
    }

    #[test]
    fn env_hash_is_deterministic() {
        let first = fixture_env().hash();
        let second = fixture_env().hash();
        assert_eq!(first, second);
        assert!(first.as_str().starts_with("sha256:"));
        assert_eq!(first.as_str().len(), "sha256:".len() + 64);
    }

    #[test]
    fn env_hash_changes_on_clock_lock() {
        let baseline = fixture_env().hash();
        let changed = Env {
            gpu_clock_lock_mhz: Some(2400),
            ..fixture_env()
        };
        assert_ne!(baseline, changed.hash());
    }

    #[test]
    fn env_hash_changes_on_ferrum_env() {
        let baseline = fixture_env().hash();
        let mut changed = fixture_env();
        changed
            .ferrum_env
            .insert(String::from("FERRUM_VLLM_MOE"), String::from("1"));
        assert_ne!(baseline, changed.hash());
    }

    #[test]
    fn ferrum_env_order_independent() {
        let forward = fixture_with_env(&[("A", "1"), ("B", "2")]);
        let reversed = fixture_with_env(&[("B", "2"), ("A", "1")]);
        assert_eq!(forward.hash(), reversed.hash());
    }
}