// ferrum_types/auto_config.rs

1//! Startup auto-configuration and selector decision trace types.
2//!
3//! This is the typed control-plane surface for gradually replacing M3 shell
4//! env bundles with validated model/hardware/workload driven selections.
5
6use crate::{
7    parse_bool_env_value, parse_usize_env_value, RuntimeConfigEffect, RuntimeConfigEntry,
8    RuntimeConfigSnapshot, RuntimeConfigSource,
9};
10use serde::{Deserialize, Serialize};
11use std::collections::BTreeMap;
12use thiserror::Error;
13
/// Preset name identifying the M3 Qwen3-30B-A3B int4 workload profile.
pub const M3_QWEN3_30B_A3B_INT4_PRESET: &str = "m3_qwen3_30b_a3b_int4";
/// Tokens per KV-cache block assumed when converting block counts to tokens.
const DEFAULT_KV_BLOCK_SIZE_TOKENS: usize = 16;
/// KV-cache block count targeted before any hardware-driven reduction.
const DEFAULT_KV_BLOCKS: usize = 2_048;
/// One gibibyte (2^30 bytes).
const GIB: u64 = 1 << 30;
18
19#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
20pub struct ModelCapabilities {
21    pub architecture: String,
22    pub quantization: Option<String>,
23    pub moe: Option<MoeCapabilities>,
24    pub max_context_len: Option<usize>,
25    pub num_hidden_layers: Option<usize>,
26    pub head_dim: Option<usize>,
27    pub kv_heads: Option<usize>,
28    pub estimated_weight_bytes: Option<u64>,
29    pub supported_dtypes: Vec<String>,
30    pub graph_safe_moe: bool,
31}
32
33impl ModelCapabilities {
34    pub fn unknown() -> Self {
35        Self {
36            architecture: "unknown".to_string(),
37            quantization: None,
38            moe: None,
39            max_context_len: None,
40            num_hidden_layers: None,
41            head_dim: None,
42            kv_heads: None,
43            estimated_weight_bytes: None,
44            supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
45            graph_safe_moe: false,
46        }
47    }
48
49    pub fn qwen3_30b_a3b_gptq_int4() -> Self {
50        Self {
51            architecture: "qwen3_moe".to_string(),
52            quantization: Some("gptq_int4".to_string()),
53            moe: Some(MoeCapabilities {
54                num_experts: 128,
55                experts_per_token: 8,
56                moe_intermediate_size: Some(768),
57            }),
58            max_context_len: Some(40960),
59            num_hidden_layers: Some(48),
60            head_dim: Some(128),
61            kv_heads: Some(4),
62            // Conservative GPTQ int4 weight footprint including quant scales
63            // and loader/runtime overhead. This keeps the RTX 4090 M3 preset
64            // at the historical 2048 KV blocks while still allowing smaller
65            // GPUs to be downgraded before startup allocation.
66            estimated_weight_bytes: Some(18 * GIB),
67            supported_dtypes: vec!["fp16".to_string()],
68            graph_safe_moe: true,
69        }
70    }
71}
72
/// Mixture-of-experts parameters carried in [`ModelCapabilities::moe`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MoeCapabilities {
    /// Number of experts (128 in the Qwen3-30B-A3B preset).
    pub num_experts: usize,
    /// Experts used per token (8 in the Qwen3-30B-A3B preset).
    pub experts_per_token: usize,
    /// MoE intermediate size, when known (768 in the Qwen3-30B-A3B preset).
    pub moe_intermediate_size: Option<usize>,
}
79
80#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
81pub struct HardwareCapabilities {
82    pub backend: String,
83    pub cuda_runtime: Option<String>,
84    pub compute_capability: Option<String>,
85    pub vram_bytes: Option<u64>,
86    pub sm_count: Option<u32>,
87    pub supported_dtypes: Vec<String>,
88    pub supported_kv_dtypes: Vec<String>,
89    pub graph_support: bool,
90    pub compiled_features: CompiledKernelFeatures,
91}
92
93impl HardwareCapabilities {
94    pub fn unknown() -> Self {
95        Self {
96            backend: "unknown".to_string(),
97            cuda_runtime: None,
98            compute_capability: None,
99            vram_bytes: None,
100            sm_count: None,
101            supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
102            supported_kv_dtypes: vec!["fp16".to_string()],
103            graph_support: false,
104            compiled_features: CompiledKernelFeatures::default(),
105        }
106    }
107
108    pub fn rtx4090_cuda(features: CompiledKernelFeatures) -> Self {
109        Self {
110            backend: "cuda".to_string(),
111            cuda_runtime: None,
112            compute_capability: Some("8.9".to_string()),
113            vram_bytes: Some(24 * 1024 * 1024 * 1024),
114            sm_count: Some(128),
115            supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
116            supported_kv_dtypes: vec!["fp16".to_string(), "int8".to_string()],
117            graph_support: true,
118            compiled_features: features,
119        }
120    }
121}
122
123#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
124pub struct CompiledKernelFeatures {
125    pub cuda: bool,
126    pub vllm_paged_attn: bool,
127    pub vllm_moe_marlin: bool,
128    pub cuda_graph: bool,
129    pub greedy_argmax: bool,
130    pub fa2_source: bool,
131    pub fa2_direct_ffi: bool,
132}
133
134impl Default for CompiledKernelFeatures {
135    fn default() -> Self {
136        Self {
137            cuda: false,
138            vllm_paged_attn: false,
139            vllm_moe_marlin: false,
140            cuda_graph: false,
141            greedy_argmax: false,
142            fa2_source: false,
143            fa2_direct_ffi: false,
144        }
145    }
146}
147
148impl CompiledKernelFeatures {
149    pub fn m3_fast_path_without_fa2() -> Self {
150        Self {
151            cuda: true,
152            vllm_paged_attn: true,
153            vllm_moe_marlin: true,
154            cuda_graph: true,
155            greedy_argmax: true,
156            fa2_source: false,
157            fa2_direct_ffi: false,
158        }
159    }
160
161    pub fn m3_fast_path_with_source_fa2() -> Self {
162        Self {
163            fa2_source: true,
164            ..Self::m3_fast_path_without_fa2()
165        }
166    }
167}
168
169#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
170pub struct WorkloadProfile {
171    pub preset: Option<String>,
172    pub serving_mode: String,
173    pub target_concurrency: usize,
174    pub prompt_length_class: String,
175    pub output_length_class: String,
176    pub priority: WorkloadPriority,
177}
178
179impl WorkloadProfile {
180    pub fn serving_default() -> Self {
181        Self {
182            preset: None,
183            serving_mode: "openai_chat".to_string(),
184            target_concurrency: 1,
185            prompt_length_class: "unknown".to_string(),
186            output_length_class: "unknown".to_string(),
187            priority: WorkloadPriority::Balanced,
188        }
189    }
190
191    pub fn m3_qwen3_30b_a3b_int4() -> Self {
192        Self {
193            preset: Some(M3_QWEN3_30B_A3B_INT4_PRESET.to_string()),
194            serving_mode: "bench_serve".to_string(),
195            target_concurrency: 32,
196            prompt_length_class: "random_256".to_string(),
197            output_length_class: "random_128".to_string(),
198            priority: WorkloadPriority::Throughput,
199        }
200    }
201
202    fn is_m3_preset(&self) -> bool {
203        self.preset.as_deref() == Some(M3_QWEN3_30B_A3B_INT4_PRESET)
204    }
205}
206
207impl Default for WorkloadProfile {
208    fn default() -> Self {
209        Self::serving_default()
210    }
211}
212
/// Optimization goal of a [`WorkloadProfile`]; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum WorkloadPriority {
    /// Serialized as `"latency"`.
    Latency,
    /// Serialized as `"throughput"`; used by the M3 preset profile.
    Throughput,
    /// Serialized as `"balanced"`; used by the default serving profile.
    Balanced,
}
220
221#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
222pub struct ResolvedFerrumConfig {
223    pub schema_version: u32,
224    pub preset: Option<String>,
225    pub runtime_config: RuntimeConfigSnapshot,
226    pub model_capabilities: ModelCapabilities,
227    pub hardware_capabilities: HardwareCapabilities,
228    pub workload_profile: WorkloadProfile,
229    pub decisions: Vec<AutoConfigDecision>,
230}
231
232impl ResolvedFerrumConfig {
233    pub fn effective_config_document(&self) -> serde_json::Value {
234        serde_json::json!({
235            "schema_version": 1,
236            "preset": self.preset,
237            "env_hash": self.runtime_env_hash(),
238            "entries": self.runtime_config.entries,
239            "model_capabilities": self.model_capabilities,
240            "hardware_capabilities": self.hardware_capabilities,
241            "workload_profile": self.workload_profile,
242            "decisions": self.decisions,
243        })
244    }
245
246    pub fn decision_trace_jsonl(&self) -> Result<String, serde_json::Error> {
247        let mut out = String::new();
248        for decision in &self.decisions {
249            out.push_str(&serde_json::to_string(decision)?);
250            out.push('\n');
251        }
252        Ok(out)
253    }
254
255    pub fn runtime_env_hash(&self) -> String {
256        use sha2::{Digest, Sha256};
257
258        let bytes = serde_json::to_vec(&self.runtime_config.entries).unwrap_or_default();
259        let digest = Sha256::digest(bytes);
260        format!("sha256:{digest:x}")
261    }
262}
263
/// One record of the decision trace: what was selected for a given selection
/// and where the value came from.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AutoConfigDecision {
    /// Schema version of this record.
    pub schema_version: u32,
    /// Name of the selection (e.g. `"kv_block_count"`, `"max_sequences"`).
    pub selection: String,
    /// The selected value, rendered as a string.
    pub selected: String,
    /// Origin of the selected value.
    pub source: AutoConfigSource,
    /// Key associated with the source, if any
    /// (NOTE(review): presumably the runtime config key of an override — confirm).
    pub source_key: Option<String>,
    /// Candidate values that were considered.
    pub candidates: Vec<String>,
    /// Candidates that were rejected, each with a reason.
    pub rejected: Vec<RejectedCandidate>,
    /// Runtime effects this selection has.
    pub affects: Vec<RuntimeConfigEffect>,
}
275
/// A candidate value that was not selected, listed in
/// [`AutoConfigDecision::rejected`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RejectedCandidate {
    /// The rejected candidate value.
    pub value: String,
    /// Why the candidate was rejected.
    pub reason: String,
}
281
/// Origin of a resolved value; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AutoConfigSource {
    /// A built-in default that does not depend on preset or hardware.
    Default,
    /// Serialized as `"cli"`.
    Cli,
    /// Serialized as `"config_file"`.
    ConfigFile,
    /// Serialized as `"env"`.
    Env,
    /// Serialized as `"script_case"`.
    ScriptCase,
    /// Serialized as `"model_metadata"`.
    ModelMetadata,
    /// The default was reduced because of a known hardware limit.
    HardwareCapability,
    /// Serialized as `"memory_profile"`.
    MemoryProfile,
    /// The default follows from the workload profile/preset.
    WorkloadPreset,
    /// Serialized as `"compiled_feature"`.
    CompiledFeature,
}
296
/// Errors reported while resolving a configuration.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
pub enum AutoConfigError {
    /// The value supplied for `key` could not be parsed or is not acceptable;
    /// `reason` explains why.
    #[error("{key}: invalid override: {reason}")]
    InvalidOverride { key: String, reason: String },
    /// The combination of settings behind `selection` is not supported;
    /// `reason` explains why.
    #[error("{selection}: unsupported combination: {reason}")]
    UnsupportedCombination { selection: String, reason: String },
}
304
305pub struct FerrumConfigBuilder {
306    runtime_config: RuntimeConfigSnapshot,
307    model: ModelCapabilities,
308    hardware: HardwareCapabilities,
309    workload: WorkloadProfile,
310}
311
312impl FerrumConfigBuilder {
313    pub fn new(runtime_config: RuntimeConfigSnapshot) -> Self {
314        Self {
315            runtime_config,
316            model: ModelCapabilities::unknown(),
317            hardware: HardwareCapabilities::unknown(),
318            workload: WorkloadProfile::default(),
319        }
320    }
321
322    pub fn m3_qwen3_30b_a3b_int4(runtime_config: RuntimeConfigSnapshot) -> Self {
323        Self::new(runtime_config)
324            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
325            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
326                CompiledKernelFeatures::m3_fast_path_without_fa2(),
327            ))
328            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
329    }
330
331    pub fn with_model_capabilities(mut self, model: ModelCapabilities) -> Self {
332        self.model = model;
333        self
334    }
335
336    pub fn with_hardware_capabilities(mut self, hardware: HardwareCapabilities) -> Self {
337        self.hardware = hardware;
338        self
339    }
340
341    pub fn with_workload_profile(mut self, workload: WorkloadProfile) -> Self {
342        self.workload = workload;
343        self
344    }
345
346    pub fn resolve(self) -> Result<ResolvedFerrumConfig, AutoConfigError> {
347        let mut decisions = Vec::new();
348        let cuda_backend = self.is_cuda_backend();
349        let use_vllm_paged_attn = self.bool_value(
350            "FERRUM_USE_VLLM_PAGED_ATTN",
351            self.workload.is_m3_preset()
352                && cuda_backend
353                && self.hardware.compiled_features.vllm_paged_attn,
354            AutoConfigSource::WorkloadPreset,
355        )?;
356        let fa_layout =
357            self.bool_value("FERRUM_FA_LAYOUT_VARLEN", false, AutoConfigSource::Default)?;
358        let fa2_source = self.bool_value("FERRUM_FA2_SOURCE", false, AutoConfigSource::Default)?;
359        let shim_present = self.raw("FERRUM_FA2_DIRECT_FFI_SHIM").is_some();
360        let fa2_direct_ffi = self.bool_value(
361            "FERRUM_FA2_DIRECT_FFI",
362            shim_present,
363            if shim_present {
364                AutoConfigSource::Env
365            } else {
366                AutoConfigSource::Default
367            },
368        )?;
369        let vllm_v1_short = self.bool_value(
370            "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
371            use_vllm_paged_attn.value,
372            AutoConfigSource::Default,
373        )?;
374        let vllm_moe = self.bool_value(
375            "FERRUM_VLLM_MOE",
376            self.workload.is_m3_preset()
377                && cuda_backend
378                && self.hardware.compiled_features.vllm_moe_marlin,
379            AutoConfigSource::WorkloadPreset,
380        )?;
381        let device_route = self.bool_value(
382            "FERRUM_MOE_DEVICE_ROUTE",
383            self.workload.is_m3_preset() && vllm_moe.value,
384            AutoConfigSource::WorkloadPreset,
385        )?;
386        let pair_ids = self.bool_value(
387            "FERRUM_VLLM_MOE_PAIR_IDS",
388            vllm_moe.value,
389            AutoConfigSource::WorkloadPreset,
390        )?;
391        let graph = self.bool_value(
392            "FERRUM_MOE_GRAPH",
393            self.workload.is_m3_preset()
394                && vllm_moe.value
395                && self.hardware.graph_support
396                && self.hardware.compiled_features.cuda_graph,
397            AutoConfigSource::WorkloadPreset,
398        )?;
399        let greedy = self.bool_value(
400            "FERRUM_GREEDY_ARGMAX",
401            self.workload.is_m3_preset()
402                && cuda_backend
403                && self.hardware.compiled_features.greedy_argmax,
404            AutoConfigSource::WorkloadPreset,
405        )?;
406        let prefix_cache = self.bool_value(
407            "FERRUM_PREFIX_CACHE",
408            false,
409            if self.workload.is_m3_preset() {
410                AutoConfigSource::WorkloadPreset
411            } else {
412                AutoConfigSource::Default
413            },
414        )?;
415        let default_max_sequences = self.default_max_sequences();
416        let max_sequences = self.usize_value(
417            "FERRUM_PAGED_MAX_SEQS",
418            default_max_sequences.value,
419            default_max_sequences.source,
420        )?;
421        let default_kv_blocks = self.default_kv_blocks(&max_sequences);
422        let kv_blocks = self.usize_value(
423            "FERRUM_KV_MAX_BLOCKS",
424            default_kv_blocks.value,
425            default_kv_blocks.source,
426        )?;
427        let default_max_batched_tokens =
428            self.default_max_batched_tokens(&max_sequences, &kv_blocks);
429        let max_batched_tokens = self.usize_value(
430            "FERRUM_MAX_BATCHED_TOKENS",
431            default_max_batched_tokens.value,
432            default_max_batched_tokens.source,
433        )?;
434        let max_model_len = self.optional_usize_value("FERRUM_MAX_MODEL_LEN")?;
435
436        self.validate_attention(
437            use_vllm_paged_attn.value,
438            fa_layout.value,
439            fa2_source.value,
440            fa2_direct_ffi.value,
441            shim_present,
442            vllm_v1_short.value,
443        )?;
444        self.validate_moe(
445            vllm_moe.value,
446            device_route.value,
447            pair_ids.value,
448            graph.value,
449        )?;
450        self.validate_memory(
451            kv_blocks.value,
452            max_sequences.value,
453            max_batched_tokens.value,
454            max_model_len.as_ref().map(|value| value.value),
455        )?;
456        self.validate_dtypes()?;
457        self.validate_sampling(greedy.value)?;
458
459        decisions.push(self.attention_prefill_decision(
460            use_vllm_paged_attn.clone(),
461            fa_layout,
462            fa2_source,
463            fa2_direct_ffi,
464        ));
465        decisions.push(self.attention_decode_decision(use_vllm_paged_attn, vllm_v1_short));
466        decisions.push(self.moe_decision(vllm_moe, device_route, pair_ids));
467        decisions.push(self.graph_decision(graph));
468        decisions.push(self.scalar_decision(
469            "kv_block_count",
470            kv_blocks,
471            RuntimeConfigEffect::Memory,
472        ));
473        decisions.push(self.scalar_decision(
474            "max_sequences",
475            max_sequences,
476            RuntimeConfigEffect::Memory,
477        ));
478        decisions.push(self.scalar_decision(
479            "max_batched_tokens",
480            max_batched_tokens,
481            RuntimeConfigEffect::Performance,
482        ));
483        if let Some(max_model_len) = max_model_len {
484            decisions.push(self.scalar_decision(
485                "max_model_len",
486                max_model_len,
487                RuntimeConfigEffect::Memory,
488            ));
489        }
490        decisions.push(self.prefix_cache_decision(prefix_cache));
491        decisions.push(self.scheduler_decision()?);
492        decisions.push(self.sampling_decision(greedy));
493
494        Ok(ResolvedFerrumConfig {
495            schema_version: 1,
496            preset: self.workload.preset.clone(),
497            runtime_config: self.runtime_config.clone(),
498            model_capabilities: self.model.clone(),
499            hardware_capabilities: self.hardware.clone(),
500            workload_profile: self.workload.clone(),
501            decisions,
502        })
503    }
504
505    fn entries(&self) -> BTreeMap<&str, &str> {
506        self.runtime_config
507            .entries
508            .iter()
509            .map(|entry| (entry.key.as_str(), entry.effective_value.as_str()))
510            .collect()
511    }
512
513    fn raw(&self, key: &str) -> Option<&str> {
514        self.entry(key).map(|entry| entry.effective_value.as_str())
515    }
516
517    fn entry(&self, key: &str) -> Option<&RuntimeConfigEntry> {
518        self.runtime_config
519            .entries
520            .iter()
521            .find(|entry| entry.key == key)
522    }
523
524    fn source_for_key(&self, key: &str, default_source: AutoConfigSource) -> AutoConfigSource {
525        self.entry(key)
526            .map(|entry| auto_config_source_from_runtime(entry.source))
527            .unwrap_or(default_source)
528    }
529
530    fn is_cuda_backend(&self) -> bool {
531        self.hardware.backend.eq_ignore_ascii_case("cuda")
532    }
533
534    fn cuda_compute_capability_at_least(&self, major: u32, minor: u32) -> Option<bool> {
535        let (actual_major, actual_minor) =
536            parse_compute_capability(self.hardware.compute_capability.as_deref()?)?;
537        Some((actual_major, actual_minor) >= (major, minor))
538    }
539
540    fn default_max_sequences(&self) -> ResolvedValue<usize> {
541        let target = self.workload.target_concurrency.max(1);
542        let mut selected = target;
543        if self.workload.is_m3_preset() {
544            if let Some(sm_count) = self.hardware.sm_count {
545                // The M3 throughput preset assumes a large GPU. On smaller
546                // known GPUs, avoid auto-selecting a c32-sized admission
547                // window before memory profiling has a chance to refine KV.
548                selected = selected.min((sm_count as usize / 4).max(1));
549            }
550            if let Some(vram_bytes) = self.hardware.vram_bytes {
551                selected = selected.min(vram_default_max_sequences(vram_bytes));
552            }
553        }
554        ResolvedValue {
555            value: selected.max(1),
556            source: if selected < target {
557                AutoConfigSource::HardwareCapability
558            } else {
559                AutoConfigSource::WorkloadPreset
560            },
561            source_key: None,
562        }
563    }
564
565    fn default_max_batched_tokens(
566        &self,
567        max_sequences: &ResolvedValue<usize>,
568        kv_blocks: &ResolvedValue<usize>,
569    ) -> ResolvedValue<usize> {
570        let kv_token_capacity = kv_blocks
571            .value
572            .saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS)
573            .max(max_sequences.value.max(1));
574        let value = max_sequences
575            .value
576            .max(1)
577            .saturating_mul(64)
578            .min(kv_token_capacity)
579            .max(max_sequences.value.max(1));
580        ResolvedValue {
581            value,
582            source: if max_sequences.source == AutoConfigSource::HardwareCapability
583                || kv_blocks.source == AutoConfigSource::HardwareCapability
584            {
585                AutoConfigSource::HardwareCapability
586            } else {
587                AutoConfigSource::WorkloadPreset
588            },
589            source_key: None,
590        }
591    }
592
593    fn default_kv_blocks(&self, max_sequences: &ResolvedValue<usize>) -> ResolvedValue<usize> {
594        let min_blocks = ceil_div(max_sequences.value.max(1), DEFAULT_KV_BLOCK_SIZE_TOKENS);
595        let target = DEFAULT_KV_BLOCKS.max(min_blocks);
596        let selected = match (
597            self.hardware.vram_bytes,
598            self.model.estimated_weight_bytes,
599            self.kv_cache_bytes_per_token(),
600        ) {
601            (Some(vram_bytes), Some(weight_bytes), Some(kv_bytes_per_token))
602                if kv_bytes_per_token > 0 =>
603            {
604                let headroom = (vram_bytes / 10).max(2 * GIB);
605                let available = vram_bytes.saturating_sub(weight_bytes.saturating_add(headroom));
606                let kv_token_budget = (available / kv_bytes_per_token) as usize;
607                let block_budget = kv_token_budget / DEFAULT_KV_BLOCK_SIZE_TOKENS;
608                target.min(block_budget.max(min_blocks))
609            }
610            _ => target,
611        };
612        ResolvedValue {
613            value: selected.max(1),
614            source: if selected < target {
615                AutoConfigSource::HardwareCapability
616            } else {
617                AutoConfigSource::WorkloadPreset
618            },
619            source_key: None,
620        }
621    }
622
623    fn kv_cache_bytes_per_token(&self) -> Option<u64> {
624        let layers = self.model.num_hidden_layers? as u64;
625        let kv_heads = self.model.kv_heads? as u64;
626        let head_dim = self.model.head_dim? as u64;
627        layers
628            .checked_mul(2)?
629            .checked_mul(kv_heads)?
630            .checked_mul(head_dim)?
631            .checked_mul(2)
632    }
633
634    fn bool_value(
635        &self,
636        key: &str,
637        default: bool,
638        default_source: AutoConfigSource,
639    ) -> Result<ResolvedValue<bool>, AutoConfigError> {
640        match self.entry(key) {
641            Some(entry) => Ok(ResolvedValue {
642                value: parse_bool_env_value(&entry.effective_value).map_err(|reason| {
643                    AutoConfigError::InvalidOverride {
644                        key: key.to_string(),
645                        reason,
646                    }
647                })?,
648                source: auto_config_source_from_runtime(entry.source),
649                source_key: Some(key.to_string()),
650            }),
651            None => Ok(ResolvedValue {
652                value: default,
653                source: default_source,
654                source_key: None,
655            }),
656        }
657    }
658
659    fn usize_value(
660        &self,
661        key: &str,
662        default: usize,
663        default_source: AutoConfigSource,
664    ) -> Result<ResolvedValue<usize>, AutoConfigError> {
665        match self.entry(key) {
666            Some(entry) => Ok(ResolvedValue {
667                value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
668                    AutoConfigError::InvalidOverride {
669                        key: key.to_string(),
670                        reason,
671                    }
672                })?,
673                source: auto_config_source_from_runtime(entry.source),
674                source_key: Some(key.to_string()),
675            }),
676            None => Ok(ResolvedValue {
677                value: default,
678                source: default_source,
679                source_key: None,
680            }),
681        }
682    }
683
684    fn optional_usize_value(
685        &self,
686        key: &str,
687    ) -> Result<Option<ResolvedValue<usize>>, AutoConfigError> {
688        match self.entry(key) {
689            Some(entry) => Ok(Some(ResolvedValue {
690                value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
691                    AutoConfigError::InvalidOverride {
692                        key: key.to_string(),
693                        reason,
694                    }
695                })?,
696                source: auto_config_source_from_runtime(entry.source),
697                source_key: Some(key.to_string()),
698            })),
699            None => Ok(None),
700        }
701    }
702
703    fn validate_attention(
704        &self,
705        use_vllm_paged_attn: bool,
706        fa_layout: bool,
707        fa2_source: bool,
708        fa2_direct_ffi: bool,
709        shim_present: bool,
710        vllm_v1_short: bool,
711    ) -> Result<(), AutoConfigError> {
712        if use_vllm_paged_attn && !self.hardware.compiled_features.vllm_paged_attn {
713            return self.invalid(
714                "FERRUM_USE_VLLM_PAGED_ATTN",
715                "vLLM paged attention is not compiled",
716            );
717        }
718        if use_vllm_paged_attn && !self.is_cuda_backend() {
719            return self.invalid(
720                "FERRUM_USE_VLLM_PAGED_ATTN",
721                "vLLM paged attention requires CUDA backend",
722            );
723        }
724        if fa_layout && !use_vllm_paged_attn {
725            return self.invalid(
726                "FERRUM_FA_LAYOUT_VARLEN",
727                "FA layout requires vLLM paged attention layout",
728            );
729        }
730        if fa2_source && !self.hardware.compiled_features.fa2_source {
731            return self.invalid(
732                "FERRUM_FA2_SOURCE",
733                "source-built FA2 support is not compiled",
734            );
735        }
736        if fa2_source && !self.is_cuda_backend() {
737            return self.invalid(
738                "FERRUM_FA2_SOURCE",
739                "source-built FA2 requires CUDA backend",
740            );
741        }
742        if fa2_source && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
743            return self.invalid(
744                "FERRUM_FA2_SOURCE",
745                "source-built FA2 requires CUDA compute capability >= 8.0",
746            );
747        }
748        if fa2_direct_ffi && !self.hardware.compiled_features.fa2_direct_ffi {
749            return self.invalid(
750                "FERRUM_FA2_DIRECT_FFI",
751                "direct FA2 FFI shim support is not compiled",
752            );
753        }
754        if fa2_direct_ffi && !self.is_cuda_backend() {
755            return self.invalid(
756                "FERRUM_FA2_DIRECT_FFI",
757                "direct FA2 FFI shim requires CUDA backend",
758            );
759        }
760        if fa2_direct_ffi && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
761            return self.invalid(
762                "FERRUM_FA2_DIRECT_FFI",
763                "direct FA2 FFI shim requires CUDA compute capability >= 8.0",
764            );
765        }
766        if fa2_direct_ffi && !shim_present {
767            return self.invalid(
768                "FERRUM_FA2_DIRECT_FFI",
769                "requires FERRUM_FA2_DIRECT_FFI_SHIM",
770            );
771        }
772        if fa2_source && fa2_direct_ffi {
773            return self.unsupported(
774                "attention_prefill_mixed_backend",
775                "FA2 source and direct FFI shim cannot both own the prefill path",
776            );
777        }
778        if vllm_v1_short && !use_vllm_paged_attn {
779            return self.invalid(
780                "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
781                "short-context v1 requires vLLM paged attention",
782            );
783        }
784        Ok(())
785    }
786
787    fn validate_moe(
788        &self,
789        vllm_moe: bool,
790        device_route: bool,
791        pair_ids: bool,
792        graph: bool,
793    ) -> Result<(), AutoConfigError> {
794        if vllm_moe && !self.hardware.compiled_features.vllm_moe_marlin {
795            return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE is not compiled");
796        }
797        if vllm_moe && !self.is_cuda_backend() {
798            return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE requires CUDA backend");
799        }
800        if device_route && !vllm_moe {
801            return self.invalid(
802                "FERRUM_MOE_DEVICE_ROUTE",
803                "device route currently requires vLLM MoE",
804            );
805        }
806        if pair_ids && !vllm_moe {
807            return self.invalid(
808                "FERRUM_VLLM_MOE_PAIR_IDS",
809                "pair-id routing requires vLLM MoE",
810            );
811        }
812        let graph_relevant = self.model.moe.is_some() || self.workload.is_m3_preset();
813        if graph && graph_relevant && !self.hardware.graph_support {
814            return self.invalid(
815                "FERRUM_MOE_GRAPH",
816                "hardware/backend does not support CUDA graph replay",
817            );
818        }
819        if graph && graph_relevant && !self.hardware.compiled_features.cuda_graph {
820            return self.invalid("FERRUM_MOE_GRAPH", "CUDA graph support is not compiled");
821        }
822        if graph && graph_relevant && !vllm_moe {
823            return self.invalid(
824                "FERRUM_MOE_GRAPH",
825                "graph decode requires the graph-clean vLLM MoE path",
826            );
827        }
828        if graph && graph_relevant && self.model.moe.is_some() && !self.model.graph_safe_moe {
829            return self.unsupported(
830                "moe_graph_policy",
831                "model MoE path is not marked graph-safe",
832            );
833        }
834        Ok(())
835    }
836
837    fn validate_sampling(&self, greedy: bool) -> Result<(), AutoConfigError> {
838        if greedy && !self.hardware.compiled_features.greedy_argmax {
839            return self.invalid("FERRUM_GREEDY_ARGMAX", "GPU argmax is not compiled");
840        }
841        if greedy
842            && !(self.is_cuda_backend() || self.hardware.backend.eq_ignore_ascii_case("metal"))
843        {
844            return self.invalid(
845                "FERRUM_GREEDY_ARGMAX",
846                "greedy argmax requires CUDA or Metal backend",
847            );
848        }
849        Ok(())
850    }
851
852    fn validate_memory(
853        &self,
854        kv_blocks: usize,
855        max_sequences: usize,
856        max_batched_tokens: usize,
857        requested_max_model_len: Option<usize>,
858    ) -> Result<(), AutoConfigError> {
859        if kv_blocks == 0 {
860            return self.invalid("FERRUM_KV_MAX_BLOCKS", "must be greater than zero");
861        }
862        if max_sequences == 0 {
863            return self.invalid("FERRUM_PAGED_MAX_SEQS", "must be greater than zero");
864        }
865        if max_batched_tokens < max_sequences {
866            return self.invalid(
867                "FERRUM_MAX_BATCHED_TOKENS",
868                "must be at least FERRUM_PAGED_MAX_SEQS",
869            );
870        }
871        let kv_token_capacity = kv_blocks.saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS);
872        if max_batched_tokens > kv_token_capacity {
873            return self.invalid(
874                "FERRUM_MAX_BATCHED_TOKENS",
875                "exceeds KV cache token capacity",
876            );
877        }
878        if let Some(max_model_len) = requested_max_model_len {
879            if max_model_len == 0 {
880                return self.invalid("FERRUM_MAX_MODEL_LEN", "must be greater than zero");
881            }
882            if let Some(model_max) = self.model.max_context_len {
883                if max_model_len > model_max {
884                    return self.invalid(
885                        "FERRUM_MAX_MODEL_LEN",
886                        "exceeds model metadata max context length",
887                    );
888                }
889            }
890            if max_model_len > kv_token_capacity {
891                return self.invalid(
892                    "FERRUM_KV_MAX_BLOCKS",
893                    "KV cache token capacity is smaller than FERRUM_MAX_MODEL_LEN",
894                );
895            }
896        }
897        Ok(())
898    }
899
900    fn validate_dtypes(&self) -> Result<(), AutoConfigError> {
901        if let Some(dtype) = self.raw("FERRUM_DTYPE") {
902            let dtype = dtype.to_ascii_lowercase();
903            if !self.hardware.supported_dtypes.iter().any(|d| d == &dtype) {
904                return self.invalid("FERRUM_DTYPE", "dtype is not supported by hardware profile");
905            }
906        }
907        if let Some(dtype) = self.raw("FERRUM_KV_DTYPE") {
908            let dtype = dtype.to_ascii_lowercase();
909            if !self
910                .hardware
911                .supported_kv_dtypes
912                .iter()
913                .any(|d| d == &dtype)
914            {
915                return self.invalid(
916                    "FERRUM_KV_DTYPE",
917                    "KV dtype is not supported by hardware profile",
918                );
919            }
920        }
921        Ok(())
922    }
923
924    fn attention_prefill_decision(
925        &self,
926        use_vllm_paged_attn: ResolvedValue<bool>,
927        fa_layout: ResolvedValue<bool>,
928        fa2_source: ResolvedValue<bool>,
929        fa2_direct_ffi: ResolvedValue<bool>,
930    ) -> AutoConfigDecision {
931        let (selected, source, source_key) = if fa2_source.value {
932            ("fa2_source", fa2_source.source, fa2_source.source_key)
933        } else if fa2_direct_ffi.value {
934            (
935                "fa2_direct_ffi",
936                fa2_direct_ffi.source,
937                fa2_direct_ffi.source_key,
938            )
939        } else if fa_layout.value {
940            ("fa_layout_varlen", fa_layout.source, fa_layout.source_key)
941        } else if use_vllm_paged_attn.value {
942            (
943                "vllm_paged_varlen",
944                use_vllm_paged_attn.source,
945                use_vllm_paged_attn.source_key,
946            )
947        } else {
948            ("legacy_paged_varlen", AutoConfigSource::Default, None)
949        };
950        self.decision(
951            "attention_prefill_mixed_backend",
952            selected,
953            source,
954            source_key,
955            [
956                "fa2_source",
957                "fa2_direct_ffi",
958                "fa_layout_varlen",
959                "vllm_paged_varlen",
960                "legacy_paged_varlen",
961            ],
962            self.rejected_except(
963                selected,
964                [
965                    ("fa2_source", "source-built FA2 not selected"),
966                    ("fa2_direct_ffi", "diagnostic direct FFI shim not selected"),
967                    ("fa_layout_varlen", "FA-compatible layout not selected"),
968                    ("vllm_paged_varlen", "vLLM paged varlen bridge not selected"),
969                    (
970                        "legacy_paged_varlen",
971                        "a higher-priority attention path was selected",
972                    ),
973                ],
974            ),
975            vec![
976                RuntimeConfigEffect::Performance,
977                RuntimeConfigEffect::Memory,
978            ],
979        )
980    }
981
982    fn attention_decode_decision(
983        &self,
984        use_vllm_paged_attn: ResolvedValue<bool>,
985        vllm_v1_short: ResolvedValue<bool>,
986    ) -> AutoConfigDecision {
987        let (selected, source, source_key) = if use_vllm_paged_attn.value {
988            if vllm_v1_short.value {
989                (
990                    "vllm_paged_attn_v1_short",
991                    vllm_v1_short.source,
992                    vllm_v1_short.source_key,
993                )
994            } else {
995                (
996                    "vllm_paged_attn_v2",
997                    vllm_v1_short.source,
998                    vllm_v1_short.source_key,
999                )
1000            }
1001        } else {
1002            ("legacy_paged_decode", use_vllm_paged_attn.source, None)
1003        };
1004        self.decision(
1005            "attention_decode_backend",
1006            selected,
1007            source,
1008            source_key,
1009            [
1010                "vllm_paged_attn_v1_short",
1011                "vllm_paged_attn_v2",
1012                "legacy_paged_decode",
1013            ],
1014            self.rejected_except(
1015                selected,
1016                [
1017                    (
1018                        "vllm_paged_attn_v1_short",
1019                        "short-context v1 decode not selected",
1020                    ),
1021                    ("vllm_paged_attn_v2", "v2 decode not selected"),
1022                    ("legacy_paged_decode", "legacy decode not selected"),
1023                ],
1024            ),
1025            vec![RuntimeConfigEffect::Performance],
1026        )
1027    }
1028
1029    fn moe_decision(
1030        &self,
1031        vllm_moe: ResolvedValue<bool>,
1032        device_route: ResolvedValue<bool>,
1033        pair_ids: ResolvedValue<bool>,
1034    ) -> AutoConfigDecision {
1035        let selected = if vllm_moe.value && device_route.value && pair_ids.value {
1036            "vllm_marlin_moe_device_route_pair_ids"
1037        } else if vllm_moe.value && device_route.value {
1038            "vllm_marlin_moe_device_route"
1039        } else if vllm_moe.value {
1040            "vllm_marlin_moe"
1041        } else {
1042            "legacy_moe"
1043        };
1044        self.decision(
1045            "moe_implementation",
1046            selected,
1047            vllm_moe.source,
1048            vllm_moe.source_key,
1049            [
1050                "vllm_marlin_moe_device_route_pair_ids",
1051                "vllm_marlin_moe_device_route",
1052                "vllm_marlin_moe",
1053                "legacy_moe",
1054            ],
1055            self.rejected_except(
1056                selected,
1057                [
1058                    (
1059                        "vllm_marlin_moe_device_route_pair_ids",
1060                        "pair-id device route not selected",
1061                    ),
1062                    (
1063                        "vllm_marlin_moe_device_route",
1064                        "device-route MoE not selected",
1065                    ),
1066                    ("vllm_marlin_moe", "vLLM Marlin MoE not selected"),
1067                    ("legacy_moe", "legacy MoE not selected"),
1068                ],
1069            ),
1070            vec![RuntimeConfigEffect::Performance],
1071        )
1072    }
1073
1074    fn graph_decision(&self, graph: ResolvedValue<bool>) -> AutoConfigDecision {
1075        let selected = if graph.value {
1076            "graph_clean_decode"
1077        } else {
1078            "graph_disabled"
1079        };
1080        self.decision(
1081            "moe_graph_policy",
1082            selected,
1083            graph.source,
1084            graph.source_key,
1085            ["graph_clean_decode", "graph_disabled"],
1086            self.rejected_except(
1087                selected,
1088                [
1089                    ("graph_clean_decode", "graph decode not selected"),
1090                    ("graph_disabled", "graph decode selected"),
1091                ],
1092            ),
1093            vec![
1094                RuntimeConfigEffect::Performance,
1095                RuntimeConfigEffect::Correctness,
1096            ],
1097        )
1098    }
1099
1100    fn scalar_decision(
1101        &self,
1102        selection: &str,
1103        value: ResolvedValue<usize>,
1104        effect: RuntimeConfigEffect,
1105    ) -> AutoConfigDecision {
1106        self.decision(
1107            selection,
1108            &value.value.to_string(),
1109            value.source,
1110            value.source_key,
1111            [value.value.to_string()],
1112            Vec::new(),
1113            vec![effect],
1114        )
1115    }
1116
1117    fn scheduler_decision(&self) -> Result<AutoConfigDecision, AutoConfigError> {
1118        let entries = self.entries();
1119        let mut selected = "continuous_default".to_string();
1120        let mut source_key = None;
1121        if let Some(chunk) = entries.get("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK") {
1122            parse_usize_env_value(chunk).map_err(|reason| AutoConfigError::InvalidOverride {
1123                key: "FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string(),
1124                reason,
1125            })?;
1126            selected = format!("active_decode_prefill_chunk:{chunk}");
1127            source_key = Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string());
1128        } else if let Some(until) = entries.get("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE") {
1129            parse_usize_env_value(until).map_err(|reason| AutoConfigError::InvalidOverride {
1130                key: "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string(),
1131                reason,
1132            })?;
1133            selected = format!("prefill_first_until_active:{until}");
1134            source_key = Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string());
1135        } else if self
1136            .bool_value(
1137                "FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE",
1138                false,
1139                AutoConfigSource::Default,
1140            )?
1141            .value
1142        {
1143            selected = "prompt_token_estimate".to_string();
1144            source_key = Some("FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE".to_string());
1145        }
1146        self.unsupported_if(
1147            source_key.as_deref() == Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
1148                && selected.ends_with(":0"),
1149            "scheduler_admission_policy",
1150            "active decode prefill chunk must be greater than zero",
1151        )?;
1152        Ok(self.decision(
1153            "scheduler_admission_policy",
1154            &selected,
1155            source_key
1156                .as_deref()
1157                .map(|key| self.source_for_key(key, AutoConfigSource::Default))
1158                .unwrap_or(AutoConfigSource::Default),
1159            source_key,
1160            [
1161                "continuous_default",
1162                "prompt_token_estimate",
1163                "prefill_first_until_active",
1164                "active_decode_prefill_chunk",
1165            ],
1166            Vec::new(),
1167            vec![RuntimeConfigEffect::Performance],
1168        ))
1169    }
1170
1171    fn prefix_cache_decision(&self, prefix_cache: ResolvedValue<bool>) -> AutoConfigDecision {
1172        let selected = if prefix_cache.value {
1173            "prefix_cache_enabled"
1174        } else {
1175            "prefix_cache_disabled"
1176        };
1177        self.decision(
1178            "prefix_cache_policy",
1179            selected,
1180            prefix_cache.source,
1181            prefix_cache.source_key,
1182            ["prefix_cache_enabled", "prefix_cache_disabled"],
1183            self.rejected_except(
1184                selected,
1185                [
1186                    ("prefix_cache_enabled", "prefix cache not selected"),
1187                    ("prefix_cache_disabled", "prefix cache enabled"),
1188                ],
1189            ),
1190            vec![
1191                RuntimeConfigEffect::Correctness,
1192                RuntimeConfigEffect::Performance,
1193                RuntimeConfigEffect::Memory,
1194            ],
1195        )
1196    }
1197
1198    fn sampling_decision(&self, greedy: ResolvedValue<bool>) -> AutoConfigDecision {
1199        let selected = if greedy.value {
1200            "gpu_greedy_argmax"
1201        } else {
1202            "logits_readback"
1203        };
1204        self.decision(
1205            "sampling_readback_path",
1206            selected,
1207            greedy.source,
1208            greedy.source_key,
1209            ["gpu_greedy_argmax", "logits_readback"],
1210            self.rejected_except(
1211                selected,
1212                [
1213                    ("gpu_greedy_argmax", "GPU argmax not selected"),
1214                    ("logits_readback", "logits readback not selected"),
1215                ],
1216            ),
1217            vec![
1218                RuntimeConfigEffect::Performance,
1219                RuntimeConfigEffect::Correctness,
1220            ],
1221        )
1222    }
1223
1224    fn decision<I, C>(
1225        &self,
1226        selection: &str,
1227        selected: &str,
1228        source: AutoConfigSource,
1229        source_key: Option<String>,
1230        candidates: I,
1231        rejected: Vec<RejectedCandidate>,
1232        affects: Vec<RuntimeConfigEffect>,
1233    ) -> AutoConfigDecision
1234    where
1235        I: IntoIterator<Item = C>,
1236        C: Into<String>,
1237    {
1238        AutoConfigDecision {
1239            schema_version: 1,
1240            selection: selection.to_string(),
1241            selected: selected.to_string(),
1242            source,
1243            source_key,
1244            candidates: candidates.into_iter().map(Into::into).collect(),
1245            rejected,
1246            affects,
1247        }
1248    }
1249
1250    fn rejected_except<I>(&self, selected: &str, candidates: I) -> Vec<RejectedCandidate>
1251    where
1252        I: IntoIterator<Item = (&'static str, &'static str)>,
1253    {
1254        candidates
1255            .into_iter()
1256            .filter(|(value, _)| *value != selected)
1257            .map(|(value, reason)| RejectedCandidate {
1258                value: value.to_string(),
1259                reason: reason.to_string(),
1260            })
1261            .collect()
1262    }
1263
1264    fn invalid<T>(&self, key: &str, reason: &str) -> Result<T, AutoConfigError> {
1265        Err(AutoConfigError::InvalidOverride {
1266            key: key.to_string(),
1267            reason: reason.to_string(),
1268        })
1269    }
1270
1271    fn unsupported<T>(&self, selection: &str, reason: &str) -> Result<T, AutoConfigError> {
1272        Err(AutoConfigError::UnsupportedCombination {
1273            selection: selection.to_string(),
1274            reason: reason.to_string(),
1275        })
1276    }
1277
1278    fn unsupported_if(
1279        &self,
1280        condition: bool,
1281        selection: &str,
1282        reason: &str,
1283    ) -> Result<(), AutoConfigError> {
1284        if condition {
1285            self.unsupported(selection, reason)
1286        } else {
1287            Ok(())
1288        }
1289    }
1290}
1291
1292#[derive(Debug, Clone, PartialEq, Eq)]
1293struct ResolvedValue<T> {
1294    value: T,
1295    source: AutoConfigSource,
1296    source_key: Option<String>,
1297}
1298
/// Parses a compute-capability string such as `"8.9"` into `(major, minor)`.
///
/// Surrounding whitespace is ignored. A string without a dot is read as the
/// major version with minor `0`. Returns `None` for empty input or when
/// either component is not a valid `u32`.
fn parse_compute_capability(value: &str) -> Option<(u32, u32)> {
    let trimmed = value.trim();
    if trimmed.is_empty() {
        return None;
    }
    // Only the first dot splits; a second dot leaves an unparsable minor.
    let (major_text, minor_text) = match trimmed.split_once('.') {
        Some(parts) => parts,
        None => (trimmed, "0"),
    };
    let major = major_text.trim().parse::<u32>().ok()?;
    let minor = minor_text.trim().parse::<u32>().ok()?;
    Some((major, minor))
}
1307
1308fn vram_default_max_sequences(vram_bytes: u64) -> usize {
1309    match vram_bytes {
1310        bytes if bytes >= 20 * GIB => 32,
1311        bytes if bytes >= 12 * GIB => 16,
1312        bytes if bytes >= 8 * GIB => 8,
1313        _ => 4,
1314    }
1315}
1316
/// Divides `value` by `divisor`, rounding up.
///
/// Panics when `divisor` is zero, like ordinary integer division.
fn ceil_div(value: usize, divisor: usize) -> usize {
    let quotient = value / divisor;
    let has_remainder = value % divisor != 0;
    quotient + usize::from(has_remainder)
}
1320
1321fn auto_config_source_from_runtime(source: RuntimeConfigSource) -> AutoConfigSource {
1322    match source {
1323        RuntimeConfigSource::Default => AutoConfigSource::Default,
1324        RuntimeConfigSource::ConfigFile => AutoConfigSource::ConfigFile,
1325        RuntimeConfigSource::Cli => AutoConfigSource::Cli,
1326        RuntimeConfigSource::Env => AutoConfigSource::Env,
1327        RuntimeConfigSource::ScriptCase => AutoConfigSource::ScriptCase,
1328        RuntimeConfigSource::MemoryProfile => AutoConfigSource::MemoryProfile,
1329    }
1330}
1331
1332#[cfg(test)]
1333mod tests {
1334    use super::*;
1335
1336    fn snapshot(vars: &[(&str, &str)]) -> RuntimeConfigSnapshot {
1337        RuntimeConfigSnapshot::from_env_vars(vars.iter().copied())
1338    }
1339
1340    fn snapshot_with_sources(vars: &[(&str, &str, RuntimeConfigSource)]) -> RuntimeConfigSnapshot {
1341        let mut entries: Vec<_> = vars
1342            .iter()
1343            .map(|(key, effective_value, source)| RuntimeConfigEntry {
1344                key: (*key).to_string(),
1345                effective_value: (*effective_value).to_string(),
1346                source: *source,
1347                affects: vec![RuntimeConfigEffect::Performance],
1348            })
1349            .collect();
1350        entries.sort_by(|a, b| a.key.cmp(&b.key));
1351        RuntimeConfigSnapshot { entries }
1352    }
1353
1354    fn m3(vars: &[(&str, &str)], features: CompiledKernelFeatures) -> FerrumConfigBuilder {
1355        FerrumConfigBuilder::new(snapshot(vars))
1356            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1357            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(features))
1358            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1359    }
1360
1361    fn m3_with_hardware(
1362        vars: &[(&str, &str)],
1363        hardware: HardwareCapabilities,
1364    ) -> FerrumConfigBuilder {
1365        FerrumConfigBuilder::new(snapshot(vars))
1366            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1367            .with_hardware_capabilities(hardware)
1368            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1369    }
1370
1371    fn expect_invalid_key(vars: &[(&str, &str)], key: &str) {
1372        expect_invalid_key_with_features(
1373            vars,
1374            key,
1375            CompiledKernelFeatures::m3_fast_path_without_fa2(),
1376        );
1377    }
1378
1379    fn expect_invalid_key_with_features(
1380        vars: &[(&str, &str)],
1381        key: &str,
1382        features: CompiledKernelFeatures,
1383    ) {
1384        expect_invalid_key_with_hardware(vars, key, HardwareCapabilities::rtx4090_cuda(features));
1385    }
1386
1387    fn expect_invalid_key_with_hardware(
1388        vars: &[(&str, &str)],
1389        key: &str,
1390        hardware: HardwareCapabilities,
1391    ) {
1392        let err = m3_with_hardware(vars, hardware)
1393            .resolve()
1394            .expect_err("override should fail");
1395        match err {
1396            AutoConfigError::InvalidOverride { key: actual, .. } => assert_eq!(actual, key),
1397            other => panic!("expected invalid override for {key}, got {other:?}"),
1398        }
1399    }
1400
1401    fn cpu_hardware_with_features(features: CompiledKernelFeatures) -> HardwareCapabilities {
1402        HardwareCapabilities {
1403            backend: "cpu".to_string(),
1404            supported_dtypes: vec!["fp32".to_string()],
1405            supported_kv_dtypes: vec!["fp16".to_string()],
1406            compiled_features: features,
1407            ..HardwareCapabilities::unknown()
1408        }
1409    }
1410
1411    #[test]
1412    fn m3_preset_selects_current_safe_fast_path_without_fa2() {
1413        let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
1414            .resolve()
1415            .unwrap();
1416        let decisions: BTreeMap<_, _> = resolved
1417            .decisions
1418            .iter()
1419            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1420            .collect();
1421        assert_eq!(
1422            decisions["attention_prefill_mixed_backend"],
1423            "vllm_paged_varlen"
1424        );
1425        assert_eq!(
1426            decisions["attention_decode_backend"],
1427            "vllm_paged_attn_v1_short"
1428        );
1429        assert_eq!(
1430            decisions["moe_implementation"],
1431            "vllm_marlin_moe_device_route_pair_ids"
1432        );
1433        assert_eq!(decisions["moe_graph_policy"], "graph_clean_decode");
1434        assert_eq!(decisions["prefix_cache_policy"], "prefix_cache_disabled");
1435        assert_eq!(decisions["sampling_readback_path"], "gpu_greedy_argmax");
1436        assert_eq!(
1437            resolved.preset.as_deref(),
1438            Some(M3_QWEN3_30B_A3B_INT4_PRESET)
1439        );
1440    }
1441
1442    #[test]
1443    fn source_fa2_selects_only_when_compiled() {
1444        let resolved = m3(
1445            &[("FERRUM_FA2_SOURCE", "1")],
1446            CompiledKernelFeatures::m3_fast_path_with_source_fa2(),
1447        )
1448        .resolve()
1449        .unwrap();
1450        let prefill = resolved
1451            .decisions
1452            .iter()
1453            .find(|decision| decision.selection == "attention_prefill_mixed_backend")
1454            .unwrap();
1455        assert_eq!(prefill.selected, "fa2_source");
1456        expect_invalid_key(&[("FERRUM_FA2_SOURCE", "1")], "FERRUM_FA2_SOURCE");
1457    }
1458
1459    #[test]
1460    fn hardware_capabilities_keep_m3_preset_on_compatible_backend_paths() {
1461        let resolved = m3_with_hardware(
1462            &[],
1463            cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2()),
1464        )
1465        .resolve()
1466        .unwrap();
1467        let decisions: BTreeMap<_, _> = resolved
1468            .decisions
1469            .iter()
1470            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1471            .collect();
1472
1473        assert_eq!(
1474            decisions["attention_prefill_mixed_backend"],
1475            "legacy_paged_varlen"
1476        );
1477        assert_eq!(decisions["attention_decode_backend"], "legacy_paged_decode");
1478        assert_eq!(decisions["moe_implementation"], "legacy_moe");
1479        assert_eq!(decisions["moe_graph_policy"], "graph_disabled");
1480        assert_eq!(decisions["sampling_readback_path"], "logits_readback");
1481    }
1482
1483    #[test]
1484    fn hardware_incompatible_attention_and_sampling_overrides_are_rejected() {
1485        let cpu =
1486            cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2());
1487        expect_invalid_key_with_hardware(
1488            &[("FERRUM_USE_VLLM_PAGED_ATTN", "1")],
1489            "FERRUM_USE_VLLM_PAGED_ATTN",
1490            cpu.clone(),
1491        );
1492        expect_invalid_key_with_hardware(
1493            &[("FERRUM_VLLM_MOE", "1")],
1494            "FERRUM_VLLM_MOE",
1495            cpu.clone(),
1496        );
1497        expect_invalid_key_with_hardware(
1498            &[("FERRUM_GREEDY_ARGMAX", "1")],
1499            "FERRUM_GREEDY_ARGMAX",
1500            cpu.clone(),
1501        );
1502        expect_invalid_key_with_hardware(&[("FERRUM_FA2_SOURCE", "1")], "FERRUM_FA2_SOURCE", cpu);
1503
1504        let mut old_cuda = HardwareCapabilities::rtx4090_cuda(
1505            CompiledKernelFeatures::m3_fast_path_with_source_fa2(),
1506        );
1507        old_cuda.compute_capability = Some("7.5".to_string());
1508        expect_invalid_key_with_hardware(
1509            &[("FERRUM_FA2_SOURCE", "1")],
1510            "FERRUM_FA2_SOURCE",
1511            old_cuda,
1512        );
1513    }
1514
1515    #[test]
1516    fn hardware_capacity_sizes_default_sequence_budget_without_overriding_user_values() {
1517        let mut small_gpu =
1518            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1519        small_gpu.sm_count = Some(16);
1520        small_gpu.vram_bytes = Some(24 * 1024 * 1024 * 1024);
1521
1522        let resolved = m3_with_hardware(&[], small_gpu.clone()).resolve().unwrap();
1523        let decision = |selection: &str| {
1524            resolved
1525                .decisions
1526                .iter()
1527                .find(|decision| decision.selection == selection)
1528                .unwrap()
1529        };
1530        let max_sequences = decision("max_sequences");
1531        assert_eq!(max_sequences.selected, "4");
1532        assert_eq!(max_sequences.source, AutoConfigSource::HardwareCapability);
1533        let max_batched_tokens = decision("max_batched_tokens");
1534        assert_eq!(max_batched_tokens.selected, "256");
1535        assert_eq!(
1536            max_batched_tokens.source,
1537            AutoConfigSource::HardwareCapability
1538        );
1539
1540        let resolved = m3_with_hardware(&[("FERRUM_PAGED_MAX_SEQS", "16")], small_gpu)
1541            .resolve()
1542            .unwrap();
1543        let max_sequences = resolved
1544            .decisions
1545            .iter()
1546            .find(|decision| decision.selection == "max_sequences")
1547            .unwrap();
1548        assert_eq!(max_sequences.selected, "16");
1549        assert_eq!(max_sequences.source, AutoConfigSource::Env);
1550        assert_eq!(
1551            max_sequences.source_key.as_deref(),
1552            Some("FERRUM_PAGED_MAX_SEQS")
1553        );
1554    }
1555
1556    #[test]
1557    fn vram_capacity_caps_m3_default_sequence_budget() {
1558        let mut low_vram_gpu =
1559            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1560        low_vram_gpu.sm_count = Some(128);
1561        low_vram_gpu.vram_bytes = Some(7 * 1024 * 1024 * 1024);
1562
1563        let resolved = m3_with_hardware(&[], low_vram_gpu).resolve().unwrap();
1564        let max_sequences = resolved
1565            .decisions
1566            .iter()
1567            .find(|decision| decision.selection == "max_sequences")
1568            .unwrap();
1569        assert_eq!(max_sequences.selected, "4");
1570        assert_eq!(max_sequences.source, AutoConfigSource::HardwareCapability);
1571    }
1572
1573    #[test]
1574    fn memory_budget_keeps_rtx4090_m3_kv_blocks_but_caps_constrained_vram() {
1575        let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
1576            .resolve()
1577            .unwrap();
1578        let decision = |selection: &str| {
1579            resolved
1580                .decisions
1581                .iter()
1582                .find(|decision| decision.selection == selection)
1583                .unwrap()
1584        };
1585        assert_eq!(decision("kv_block_count").selected, "2048");
1586        assert_eq!(
1587            decision("kv_block_count").source,
1588            AutoConfigSource::WorkloadPreset
1589        );
1590
1591        let mut constrained =
1592            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1593        constrained.vram_bytes = Some(20 * 1024 * 1024 * 1024);
1594        let resolved = m3_with_hardware(&[], constrained).resolve().unwrap();
1595        let decision = |selection: &str| {
1596            resolved
1597                .decisions
1598                .iter()
1599                .find(|decision| decision.selection == selection)
1600                .unwrap()
1601        };
1602        assert_eq!(decision("kv_block_count").selected, "2");
1603        assert_eq!(
1604            decision("kv_block_count").source,
1605            AutoConfigSource::HardwareCapability
1606        );
1607        assert_eq!(decision("max_batched_tokens").selected, "32");
1608        assert_eq!(
1609            decision("max_batched_tokens").source,
1610            AutoConfigSource::HardwareCapability
1611        );
1612    }
1613
1614    #[test]
1615    fn compute_capability_parser_accepts_major_minor_and_major_only() {
1616        assert_eq!(parse_compute_capability("8.9"), Some((8, 9)));
1617        assert_eq!(parse_compute_capability("9"), Some((9, 0)));
1618        assert_eq!(parse_compute_capability("N/A"), None);
1619    }
1620
1621    #[test]
1622    fn vram_capacity_tiers_are_monotonic() {
1623        assert_eq!(vram_default_max_sequences(24 * 1024 * 1024 * 1024), 32);
1624        assert_eq!(vram_default_max_sequences(16 * 1024 * 1024 * 1024), 16);
1625        assert_eq!(vram_default_max_sequences(8 * 1024 * 1024 * 1024), 8);
1626        assert_eq!(vram_default_max_sequences(6 * 1024 * 1024 * 1024), 4);
1627    }
1628
1629    #[test]
1630    fn validates_invalid_override_matrix() {
1631        expect_invalid_key(
1632            &[("FERRUM_USE_VLLM_PAGED_ATTN", "maybe")],
1633            "FERRUM_USE_VLLM_PAGED_ATTN",
1634        );
1635        expect_invalid_key(&[("FERRUM_PREFIX_CACHE", "maybe")], "FERRUM_PREFIX_CACHE");
1636        expect_invalid_key(
1637            &[
1638                ("FERRUM_FA_LAYOUT_VARLEN", "1"),
1639                ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
1640            ],
1641            "FERRUM_FA_LAYOUT_VARLEN",
1642        );
1643        expect_invalid_key(&[("FERRUM_FA2_DIRECT_FFI", "1")], "FERRUM_FA2_DIRECT_FFI");
1644        expect_invalid_key_with_features(
1645            &[("FERRUM_VLLM_MOE", "1")],
1646            "FERRUM_VLLM_MOE",
1647            CompiledKernelFeatures::default(),
1648        );
1649        expect_invalid_key(
1650            &[("FERRUM_MOE_DEVICE_ROUTE", "1"), ("FERRUM_VLLM_MOE", "0")],
1651            "FERRUM_MOE_DEVICE_ROUTE",
1652        );
1653        expect_invalid_key(
1654            &[("FERRUM_VLLM_MOE_PAIR_IDS", "1"), ("FERRUM_VLLM_MOE", "0")],
1655            "FERRUM_VLLM_MOE_PAIR_IDS",
1656        );
1657        expect_invalid_key(
1658            &[("FERRUM_MOE_GRAPH", "1"), ("FERRUM_VLLM_MOE", "0")],
1659            "FERRUM_MOE_GRAPH",
1660        );
1661        expect_invalid_key(&[("FERRUM_KV_MAX_BLOCKS", "0")], "FERRUM_KV_MAX_BLOCKS");
1662        expect_invalid_key(&[("FERRUM_PAGED_MAX_SEQS", "0")], "FERRUM_PAGED_MAX_SEQS");
1663        expect_invalid_key(
1664            &[
1665                ("FERRUM_PAGED_MAX_SEQS", "32"),
1666                ("FERRUM_MAX_BATCHED_TOKENS", "16"),
1667            ],
1668            "FERRUM_MAX_BATCHED_TOKENS",
1669        );
1670        expect_invalid_key(
1671            &[
1672                ("FERRUM_KV_MAX_BLOCKS", "16"),
1673                ("FERRUM_MAX_BATCHED_TOKENS", "512"),
1674            ],
1675            "FERRUM_MAX_BATCHED_TOKENS",
1676        );
1677        expect_invalid_key(&[("FERRUM_MAX_MODEL_LEN", "0")], "FERRUM_MAX_MODEL_LEN");
1678        expect_invalid_key(&[("FERRUM_MAX_MODEL_LEN", "50000")], "FERRUM_MAX_MODEL_LEN");
1679        expect_invalid_key(
1680            &[
1681                ("FERRUM_KV_MAX_BLOCKS", "16"),
1682                ("FERRUM_MAX_MODEL_LEN", "1024"),
1683            ],
1684            "FERRUM_KV_MAX_BLOCKS",
1685        );
1686        expect_invalid_key(&[("FERRUM_DTYPE", "bf16")], "FERRUM_DTYPE");
1687        expect_invalid_key(&[("FERRUM_KV_DTYPE", "fp8")], "FERRUM_KV_DTYPE");
1688        expect_invalid_key(
1689            &[
1690                ("FERRUM_VLLM_PAGED_ATTN_V1_SHORT", "1"),
1691                ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
1692            ],
1693            "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
1694        );
1695    }
1696
/// `max_model_len` must only appear in the decision trace when
/// `FERRUM_MAX_MODEL_LEN` is supplied; a valid override is echoed back
/// together with the key it came from.
#[test]
fn requested_max_model_len_is_optional_and_reflected_when_valid() {
    // Baseline run without overrides: no `max_model_len` decision is expected.
    let baseline = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
        .resolve()
        .unwrap();
    let baseline_mentions_len = baseline
        .decisions
        .iter()
        .any(|entry| entry.selection == "max_model_len");
    assert!(!baseline_mentions_len);

    // Override run: request a max model length of 1024 alongside 64 KV blocks.
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let overridden = m3(
        &[
            ("FERRUM_KV_MAX_BLOCKS", "64"),
            ("FERRUM_MAX_MODEL_LEN", "1024"),
        ],
        features,
    )
    .resolve()
    .unwrap();
    let len_decision = overridden
        .decisions
        .iter()
        .find(|entry| entry.selection == "max_model_len")
        .unwrap();

    assert_eq!(len_decision.selected, "1024");
    assert_eq!(
        len_decision.source_key.as_deref(),
        Some("FERRUM_MAX_MODEL_LEN")
    );
}
1727
/// Requesting `FERRUM_MOE_GRAPH=1` for a model whose `graph_safe_moe` flag is
/// false must fail resolution with an `UnsupportedCombination` error reported
/// against the `moe_graph_policy` selection.
#[test]
fn graph_enabled_with_graph_unsafe_moe_is_rejected() {
    // Start from the Qwen3 preset and flip only the graph-safety flag.
    let graph_unsafe_model = ModelCapabilities {
        graph_safe_moe: false,
        ..ModelCapabilities::qwen3_30b_a3b_gptq_int4()
    };
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let hardware = HardwareCapabilities::rtx4090_cuda(features);

    let err = FerrumConfigBuilder::new(snapshot(&[("FERRUM_MOE_GRAPH", "1")]))
        .with_model_capabilities(graph_unsafe_model)
        .with_hardware_capabilities(hardware)
        .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
        .resolve()
        .expect_err("graph unsafe MoE must fail");

    let rejected_on_graph_policy = matches!(
        err,
        AutoConfigError::UnsupportedCombination { selection, .. }
            if selection == "moe_graph_policy"
    );
    assert!(rejected_on_graph_policy);
}
1748
/// An explicit `FERRUM_ACTIVE_DECODE_PREFILL_CHUNK` value must surface in the
/// `scheduler_admission_policy` decision, including the originating key.
#[test]
fn scheduler_override_is_reflected_in_decision_trace() {
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let outcome = m3(&[("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK", "64")], features)
        .resolve()
        .unwrap();

    let admission = outcome
        .decisions
        .iter()
        .find(|entry| entry.selection == "scheduler_admission_policy")
        .unwrap();

    assert_eq!(admission.selected, "active_decode_prefill_chunk:64");
    assert_eq!(
        admission.source_key.as_deref(),
        Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
    );
}
1768
/// Setting `FERRUM_PREFIX_CACHE=1` must yield a `prefix_cache_policy` decision
/// of `prefix_cache_enabled` that records the key it was derived from.
#[test]
fn prefix_cache_override_is_reflected_in_decision_trace() {
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let outcome = m3(&[("FERRUM_PREFIX_CACHE", "1")], features)
        .resolve()
        .unwrap();

    let cache_policy = outcome
        .decisions
        .iter()
        .find(|entry| entry.selection == "prefix_cache_policy")
        .unwrap();

    assert_eq!(cache_policy.selected, "prefix_cache_enabled");
    assert_eq!(
        cache_policy.source_key.as_deref(),
        Some("FERRUM_PREFIX_CACHE")
    );
}
1788
/// Runtime entries that come from a config file, the CLI, or a script case
/// must keep that provenance in the resulting decisions instead of being
/// reported under a different source.
#[test]
fn non_env_runtime_sources_are_preserved_in_decision_trace() {
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let resolved = FerrumConfigBuilder::new(snapshot_with_sources(&[
        (
            "FERRUM_FA_LAYOUT_VARLEN",
            "1",
            RuntimeConfigSource::ConfigFile,
        ),
        ("FERRUM_PAGED_MAX_SEQS", "48", RuntimeConfigSource::Cli),
        (
            "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE",
            "32",
            RuntimeConfigSource::ScriptCase,
        ),
    ]))
    .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
    .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(features))
    .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
    .resolve()
    .unwrap();

    // One row per decision: (selection, selected value, source, source key).
    let expectations = [
        (
            "attention_prefill_mixed_backend",
            "fa_layout_varlen",
            AutoConfigSource::ConfigFile,
            "FERRUM_FA_LAYOUT_VARLEN",
        ),
        (
            "max_sequences",
            "48",
            AutoConfigSource::Cli,
            "FERRUM_PAGED_MAX_SEQS",
        ),
        (
            "scheduler_admission_policy",
            "prefill_first_until_active:32",
            AutoConfigSource::ScriptCase,
            "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE",
        ),
    ];

    for (selection, expected_selected, expected_source, expected_key) in expectations {
        let found = resolved
            .decisions
            .iter()
            .find(|entry| entry.selection == selection)
            .unwrap();
        assert_eq!(found.selected, expected_selected);
        assert_eq!(found.source, expected_source);
        assert_eq!(found.source_key.as_deref(), Some(expected_key));
    }
}
1844
/// Smoke-checks both rendered artifacts for the default M3 configuration: the
/// effective-config document and the JSONL decision trace.
#[test]
fn renders_effective_config_and_decision_trace_artifacts() {
    let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
        .resolve()
        .unwrap();
    let decision_count = resolved.decisions.len();

    // Effective-config document: header fields first, then each section.
    let document = resolved.effective_config_document();
    assert_eq!(document["schema_version"], 1);
    let env_hash = document["env_hash"].as_str().unwrap();
    assert!(env_hash.starts_with("sha256:"));
    assert!(document["entries"].is_array());
    assert_eq!(document["model_capabilities"]["architecture"], "qwen3_moe");
    assert_eq!(document["hardware_capabilities"]["backend"], "cuda");
    assert_eq!(
        document["workload_profile"]["preset"],
        M3_QWEN3_30B_A3B_INT4_PRESET
    );
    let rendered_decisions = document["decisions"].as_array().unwrap();
    assert_eq!(rendered_decisions.len(), decision_count);

    // JSONL trace: one line per decision, and the attention selection is named.
    let jsonl = resolved.decision_trace_jsonl().unwrap();
    assert_eq!(jsonl.lines().count(), decision_count);
    assert!(jsonl.contains("\"attention_prefill_mixed_backend\""));
}
1871
/// Pins the shape of the serialized artifacts: sorted `FERRUM_`-prefixed
/// entries with known sources, the expected capability/workload fields, and a
/// decision trace that round-trips to the in-memory decisions.
#[test]
fn auto_config_artifacts_match_locked_schema_shape() {
    // Source labels an entry is allowed to carry in the rendered document.
    const ALLOWED_SOURCES: [&str; 6] = [
        "default",
        "config_file",
        "cli",
        "env",
        "script_case",
        "memory_profile",
    ];

    let runtime_config = snapshot_with_sources(&[
        (
            "FERRUM_FA_LAYOUT_VARLEN",
            "1",
            RuntimeConfigSource::ScriptCase,
        ),
        ("FERRUM_PAGED_MAX_SEQS", "32", RuntimeConfigSource::Cli),
    ]);
    let resolved = FerrumConfigBuilder::m3_qwen3_30b_a3b_int4(runtime_config)
        .resolve()
        .unwrap();

    let document = resolved.effective_config_document();
    assert_eq!(document["schema_version"], 1);
    let env_hash = document["env_hash"].as_str().unwrap();
    assert!(env_hash.starts_with("sha256:"));

    // Entries must already be emitted in ascending key order.
    let entries = document["entries"].as_array().unwrap();
    let keys: Vec<_> = entries
        .iter()
        .map(|item| item["key"].as_str().unwrap())
        .collect();
    let sorted_keys = {
        let mut ordered = keys.clone();
        ordered.sort_unstable();
        ordered
    };
    assert_eq!(keys, sorted_keys);

    for item in entries {
        let key = item["key"].as_str().unwrap();
        assert!(key.starts_with("FERRUM_"));
        assert!(item["effective_value"].is_string());
        let source = item["source"].as_str().unwrap();
        assert!(ALLOWED_SOURCES.contains(&source));
        let affects = item["affects"].as_array().unwrap();
        assert!(!affects.is_empty());
    }

    // Spot-check one or two fields from each capability/workload section.
    let model = &document["model_capabilities"];
    assert_eq!(model["quantization"].as_str(), Some("gptq_int4"));
    assert_eq!(model["moe"]["experts_per_token"].as_u64(), Some(8));

    let hardware = &document["hardware_capabilities"];
    assert_eq!(hardware["compute_capability"].as_str(), Some("8.9"));
    assert_eq!(
        hardware["compiled_features"]["vllm_moe_marlin"].as_bool(),
        Some(true)
    );

    let workload = &document["workload_profile"];
    assert_eq!(workload["target_concurrency"].as_u64(), Some(32));
    assert_eq!(workload["priority"].as_str(), Some("throughput"));

    // The JSONL trace and the embedded `decisions` array must both
    // deserialize back to the resolved decisions.
    let jsonl = resolved.decision_trace_jsonl().unwrap();
    let trace_decisions: Vec<AutoConfigDecision> = jsonl
        .lines()
        .map(|row| serde_json::from_str::<AutoConfigDecision>(row).unwrap())
        .collect();
    assert_eq!(trace_decisions, resolved.decisions);

    let document_decisions =
        serde_json::from_value::<Vec<AutoConfigDecision>>(document["decisions"].clone()).unwrap();
    assert_eq!(document_decisions, trace_decisions);

    for record in &trace_decisions {
        assert_eq!(record.schema_version, 1);
        assert!(!record.selection.trim().is_empty());
        assert!(!record.selected.trim().is_empty());
        assert!(!record.candidates.is_empty());
        assert!(!record.affects.is_empty());
        if let Some(key) = record.source_key.as_deref() {
            assert!(key.starts_with("FERRUM_"));
        }
        for candidate in &record.rejected {
            assert!(!candidate.value.trim().is_empty());
            assert!(!candidate.reason.trim().is_empty());
        }
    }
}
1961}