Skip to main content

ferrum_types/
auto_config.rs

1//! Startup auto-configuration and selector decision trace types.
2//!
3//! This is the typed control-plane surface for gradually replacing M3 shell
4//! env bundles with validated model/hardware/workload driven selections.
5
6use crate::{
7    parse_bool_env_value, parse_usize_env_value, RuntimeConfigEffect, RuntimeConfigEntry,
8    RuntimeConfigSnapshot, RuntimeConfigSource,
9};
10use serde::{Deserialize, Serialize};
11use std::collections::BTreeMap;
12use thiserror::Error;
13
/// Identifier of the M3 Qwen3-30B-A3B int4 workload preset.
pub const M3_QWEN3_30B_A3B_INT4_PRESET: &str = "m3_qwen3_30b_a3b_int4";
/// Tokens per KV-cache block used when converting block counts to token capacity.
const DEFAULT_KV_BLOCK_SIZE_TOKENS: usize = 16;
/// KV block count targeted when no tighter limit applies.
const DEFAULT_KV_BLOCKS: usize = 2048;
/// Number of bytes in one gibibyte (2^30).
const GIB: u64 = 1 << 30;
18
/// Model-side facts consulted while resolving the startup configuration.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ModelCapabilities {
    /// Architecture label, e.g. `"qwen3_moe"`; `"unknown"` for the placeholder.
    pub architecture: String,
    /// Quantization label, e.g. `"gptq_int4"`, when one is known.
    pub quantization: Option<String>,
    /// Mixture-of-experts parameters; `Some` marks the model as MoE for graph validation.
    pub moe: Option<MoeCapabilities>,
    /// Upper bound checked against a requested `FERRUM_MAX_MODEL_LEN`.
    pub max_context_len: Option<usize>,
    /// Layer count; one of the factors of the per-token KV cache size.
    pub num_hidden_layers: Option<usize>,
    /// Attention head dimension; one of the factors of the per-token KV cache size.
    pub head_dim: Option<usize>,
    /// KV head count; one of the factors of the per-token KV cache size.
    pub kv_heads: Option<usize>,
    /// Weight footprint in bytes, subtracted from VRAM when budgeting KV blocks.
    pub estimated_weight_bytes: Option<u64>,
    /// Dtype labels the model supports (e.g. `"fp16"`).
    pub supported_dtypes: Vec<String>,
    /// Whether the model's MoE path is marked safe for graph decode.
    pub graph_safe_moe: bool,
}
32
33impl ModelCapabilities {
34    pub fn unknown() -> Self {
35        Self {
36            architecture: "unknown".to_string(),
37            quantization: None,
38            moe: None,
39            max_context_len: None,
40            num_hidden_layers: None,
41            head_dim: None,
42            kv_heads: None,
43            estimated_weight_bytes: None,
44            supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
45            graph_safe_moe: false,
46        }
47    }
48
49    pub fn qwen3_30b_a3b_gptq_int4() -> Self {
50        Self {
51            architecture: "qwen3_moe".to_string(),
52            quantization: Some("gptq_int4".to_string()),
53            moe: Some(MoeCapabilities {
54                num_experts: 128,
55                experts_per_token: 8,
56                moe_intermediate_size: Some(768),
57            }),
58            max_context_len: Some(40960),
59            num_hidden_layers: Some(48),
60            head_dim: Some(128),
61            kv_heads: Some(4),
62            // Conservative GPTQ int4 weight footprint including quant scales
63            // and loader/runtime overhead. This keeps the RTX 4090 M3 preset
64            // at the historical 2048 KV blocks while still allowing smaller
65            // GPUs to be downgraded before startup allocation.
66            estimated_weight_bytes: Some(18 * GIB),
67            supported_dtypes: vec!["fp16".to_string()],
68            graph_safe_moe: false,
69        }
70    }
71}
72
/// Mixture-of-experts parameters attached to a model description.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MoeCapabilities {
    /// Total number of experts.
    pub num_experts: usize,
    /// Number of experts used per token.
    pub experts_per_token: usize,
    /// Intermediate size of the MoE layers, when known.
    pub moe_intermediate_size: Option<usize>,
}
79
/// Hardware/backend facts consulted while resolving the startup configuration.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct HardwareCapabilities {
    /// Backend label; compared case-insensitively against `"cuda"` and `"metal"`.
    pub backend: String,
    /// CUDA runtime label, when known.
    pub cuda_runtime: Option<String>,
    /// CUDA compute capability string such as `"8.9"`, when known.
    pub compute_capability: Option<String>,
    /// Device memory in bytes; feeds the KV block and max-sequence defaults.
    pub vram_bytes: Option<u64>,
    /// SM count; caps the default max sequences for the M3 preset.
    pub sm_count: Option<u32>,
    /// Dtype labels the hardware supports.
    pub supported_dtypes: Vec<String>,
    /// KV cache dtype labels the hardware supports.
    pub supported_kv_dtypes: Vec<String>,
    /// Whether the hardware/backend supports graph replay.
    pub graph_support: bool,
    /// Kernel features compiled into this build.
    pub compiled_features: CompiledKernelFeatures,
}
92
93impl HardwareCapabilities {
94    pub fn unknown() -> Self {
95        Self {
96            backend: "unknown".to_string(),
97            cuda_runtime: None,
98            compute_capability: None,
99            vram_bytes: None,
100            sm_count: None,
101            supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
102            supported_kv_dtypes: vec!["fp16".to_string()],
103            graph_support: false,
104            compiled_features: CompiledKernelFeatures::default(),
105        }
106    }
107
108    pub fn rtx4090_cuda(features: CompiledKernelFeatures) -> Self {
109        Self {
110            backend: "cuda".to_string(),
111            cuda_runtime: None,
112            compute_capability: Some("8.9".to_string()),
113            vram_bytes: Some(24 * 1024 * 1024 * 1024),
114            sm_count: Some(128),
115            supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
116            supported_kv_dtypes: vec!["fp16".to_string(), "int8".to_string()],
117            graph_support: true,
118            compiled_features: features,
119        }
120    }
121}
122
123#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
124pub struct CompiledKernelFeatures {
125    pub cuda: bool,
126    pub vllm_paged_attn: bool,
127    pub vllm_moe_marlin: bool,
128    pub cuda_graph: bool,
129    pub greedy_argmax: bool,
130    pub fa2_source: bool,
131    pub fa2_direct_ffi: bool,
132}
133
134impl Default for CompiledKernelFeatures {
135    fn default() -> Self {
136        Self {
137            cuda: false,
138            vllm_paged_attn: false,
139            vllm_moe_marlin: false,
140            cuda_graph: false,
141            greedy_argmax: false,
142            fa2_source: false,
143            fa2_direct_ffi: false,
144        }
145    }
146}
147
148impl CompiledKernelFeatures {
149    pub fn m3_fast_path_without_fa2() -> Self {
150        Self {
151            cuda: true,
152            vllm_paged_attn: true,
153            vllm_moe_marlin: true,
154            cuda_graph: true,
155            greedy_argmax: true,
156            fa2_source: false,
157            fa2_direct_ffi: false,
158        }
159    }
160
161    pub fn m3_fast_path_with_source_fa2() -> Self {
162        Self {
163            fa2_source: true,
164            ..Self::m3_fast_path_without_fa2()
165        }
166    }
167}
168
/// Description of the workload the server is being configured for.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct WorkloadProfile {
    /// Preset name, if the profile was built from one.
    pub preset: Option<String>,
    /// Serving mode label, e.g. `"openai_chat"` or `"bench_serve"`.
    pub serving_mode: String,
    /// Target number of concurrent sequences; seeds the default max sequences.
    pub target_concurrency: usize,
    /// Prompt length class label; `"unknown"` by default.
    pub prompt_length_class: String,
    /// Output length class label; `"unknown"` by default.
    pub output_length_class: String,
    /// Optimization priority of the workload.
    pub priority: WorkloadPriority,
}
178
179impl WorkloadProfile {
180    pub fn serving_default() -> Self {
181        Self {
182            preset: None,
183            serving_mode: "openai_chat".to_string(),
184            target_concurrency: 1,
185            prompt_length_class: "unknown".to_string(),
186            output_length_class: "unknown".to_string(),
187            priority: WorkloadPriority::Balanced,
188        }
189    }
190
191    pub fn serving_default_for_hardware(hardware: &HardwareCapabilities) -> Self {
192        let mut profile = Self::serving_default();
193        if hardware.backend.eq_ignore_ascii_case("cuda")
194            || hardware.backend.eq_ignore_ascii_case("metal")
195        {
196            profile.target_concurrency = hardware
197                .vram_bytes
198                .map(vram_default_max_sequences)
199                .unwrap_or(4)
200                .max(1);
201        }
202        profile
203    }
204
205    pub fn m3_qwen3_30b_a3b_int4() -> Self {
206        Self {
207            preset: Some(M3_QWEN3_30B_A3B_INT4_PRESET.to_string()),
208            serving_mode: "bench_serve".to_string(),
209            target_concurrency: 32,
210            prompt_length_class: "random_256".to_string(),
211            output_length_class: "random_128".to_string(),
212            priority: WorkloadPriority::Throughput,
213        }
214    }
215
216    fn is_m3_preset(&self) -> bool {
217        self.preset.as_deref() == Some(M3_QWEN3_30B_A3B_INT4_PRESET)
218    }
219}
220
221impl Default for WorkloadProfile {
222    fn default() -> Self {
223        Self::serving_default()
224    }
225}
226
/// Optimization priority of a workload; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum WorkloadPriority {
    Latency,
    /// Used by the M3 benchmark preset.
    Throughput,
    /// Used by the default serving profile.
    Balanced,
}
234
/// Result of resolving a [`FerrumConfigBuilder`]: the inputs plus every decision taken.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResolvedFerrumConfig {
    /// Schema version of this structure; `resolve` sets it to 1.
    pub schema_version: u32,
    /// Preset name copied from the workload profile.
    pub preset: Option<String>,
    /// Runtime configuration snapshot the resolution was based on.
    pub runtime_config: RuntimeConfigSnapshot,
    /// Model capabilities used during resolution.
    pub model_capabilities: ModelCapabilities,
    /// Hardware capabilities used during resolution.
    pub hardware_capabilities: HardwareCapabilities,
    /// Workload profile used during resolution.
    pub workload_profile: WorkloadProfile,
    /// Decisions in the order they were recorded.
    pub decisions: Vec<AutoConfigDecision>,
}
245
246impl ResolvedFerrumConfig {
247    pub fn effective_config_document(&self) -> serde_json::Value {
248        serde_json::json!({
249            "schema_version": 1,
250            "preset": self.preset,
251            "env_hash": self.runtime_env_hash(),
252            "entries": self.runtime_config.entries,
253            "model_capabilities": self.model_capabilities,
254            "hardware_capabilities": self.hardware_capabilities,
255            "workload_profile": self.workload_profile,
256            "decisions": self.decisions,
257        })
258    }
259
260    pub fn decision_trace_jsonl(&self) -> Result<String, serde_json::Error> {
261        let mut out = String::new();
262        for decision in &self.decisions {
263            out.push_str(&serde_json::to_string(decision)?);
264            out.push('\n');
265        }
266        Ok(out)
267    }
268
269    pub fn runtime_env_hash(&self) -> String {
270        use sha2::{Digest, Sha256};
271
272        let bytes = serde_json::to_vec(&self.runtime_config.entries).unwrap_or_default();
273        let digest = Sha256::digest(bytes);
274        format!("sha256:{digest:x}")
275    }
276}
277
/// One recorded selection made during configuration resolution.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AutoConfigDecision {
    /// Schema version of the decision record.
    pub schema_version: u32,
    /// Name of the thing being selected (e.g. `"kv_block_count"`).
    pub selection: String,
    /// The chosen value, as a string.
    pub selected: String,
    /// Where the chosen value came from.
    pub source: AutoConfigSource,
    /// Runtime config key that supplied the value, if any.
    pub source_key: Option<String>,
    /// Candidate values that were considered.
    pub candidates: Vec<String>,
    /// Candidates that were rejected, each with a reason.
    pub rejected: Vec<RejectedCandidate>,
    /// Runtime aspects this decision affects.
    pub affects: Vec<RuntimeConfigEffect>,
}
289
/// A candidate value that was not selected, with the reason why.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RejectedCandidate {
    /// The rejected candidate value.
    pub value: String,
    /// Human-readable reason for the rejection.
    pub reason: String,
}
295
/// Origin of a resolved configuration value; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AutoConfigSource {
    /// Built-in default with no override present.
    Default,
    Cli,
    ConfigFile,
    Env,
    ScriptCase,
    ModelMetadata,
    /// Value was limited by hardware (e.g. SM count or VRAM).
    HardwareCapability,
    MemoryProfile,
    /// Value follows from the workload profile/preset.
    WorkloadPreset,
    CompiledFeature,
}
310
/// Errors produced while resolving the startup configuration.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
pub enum AutoConfigError {
    /// The value supplied for `key` could not be parsed or is not allowed.
    #[error("{key}: invalid override: {reason}")]
    InvalidOverride { key: String, reason: String },
    /// The named selection combines options that cannot be used together.
    #[error("{selection}: unsupported combination: {reason}")]
    UnsupportedCombination { selection: String, reason: String },
}
318
/// Builder that combines a runtime config snapshot with model, hardware and
/// workload descriptions and resolves them into a [`ResolvedFerrumConfig`].
pub struct FerrumConfigBuilder {
    // Snapshot of runtime overrides, looked up by key during resolution.
    runtime_config: RuntimeConfigSnapshot,
    // Model description; starts as `ModelCapabilities::unknown()`.
    model: ModelCapabilities,
    // Hardware description; starts as `HardwareCapabilities::unknown()`.
    hardware: HardwareCapabilities,
    // Workload description; starts as `WorkloadProfile::default()`.
    workload: WorkloadProfile,
}
325
326impl FerrumConfigBuilder {
327    pub fn new(runtime_config: RuntimeConfigSnapshot) -> Self {
328        Self {
329            runtime_config,
330            model: ModelCapabilities::unknown(),
331            hardware: HardwareCapabilities::unknown(),
332            workload: WorkloadProfile::default(),
333        }
334    }
335
336    pub fn m3_qwen3_30b_a3b_int4(runtime_config: RuntimeConfigSnapshot) -> Self {
337        Self::new(runtime_config)
338            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
339            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
340                CompiledKernelFeatures::m3_fast_path_without_fa2(),
341            ))
342            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
343    }
344
345    pub fn with_model_capabilities(mut self, model: ModelCapabilities) -> Self {
346        self.model = model;
347        self
348    }
349
350    pub fn with_hardware_capabilities(mut self, hardware: HardwareCapabilities) -> Self {
351        self.hardware = hardware;
352        self
353    }
354
355    pub fn with_workload_profile(mut self, workload: WorkloadProfile) -> Self {
356        self.workload = workload;
357        self
358    }
359
360    pub fn resolve(self) -> Result<ResolvedFerrumConfig, AutoConfigError> {
361        let mut decisions = Vec::new();
362        let cuda_backend = self.is_cuda_backend();
363        let use_vllm_paged_attn = self.bool_value(
364            "FERRUM_USE_VLLM_PAGED_ATTN",
365            self.workload.is_m3_preset()
366                && cuda_backend
367                && self.hardware.compiled_features.vllm_paged_attn,
368            AutoConfigSource::WorkloadPreset,
369        )?;
370        let fa_layout =
371            self.bool_value("FERRUM_FA_LAYOUT_VARLEN", false, AutoConfigSource::Default)?;
372        let fa2_source = self.bool_value("FERRUM_FA2_SOURCE", false, AutoConfigSource::Default)?;
373        let shim_present = self.raw("FERRUM_FA2_DIRECT_FFI_SHIM").is_some();
374        let fa2_direct_ffi = self.bool_value(
375            "FERRUM_FA2_DIRECT_FFI",
376            shim_present,
377            if shim_present {
378                AutoConfigSource::Env
379            } else {
380                AutoConfigSource::Default
381            },
382        )?;
383        let vllm_v1_short = self.bool_value(
384            "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
385            use_vllm_paged_attn.value,
386            AutoConfigSource::Default,
387        )?;
388        let vllm_moe = self.bool_value(
389            "FERRUM_VLLM_MOE",
390            self.workload.is_m3_preset()
391                && cuda_backend
392                && self.hardware.compiled_features.vllm_moe_marlin,
393            AutoConfigSource::WorkloadPreset,
394        )?;
395        let device_route = self.bool_value(
396            "FERRUM_MOE_DEVICE_ROUTE",
397            self.workload.is_m3_preset() && vllm_moe.value,
398            AutoConfigSource::WorkloadPreset,
399        )?;
400        let pair_ids = self.bool_value(
401            "FERRUM_VLLM_MOE_PAIR_IDS",
402            vllm_moe.value,
403            AutoConfigSource::WorkloadPreset,
404        )?;
405        let graph = self.bool_value("FERRUM_MOE_GRAPH", false, AutoConfigSource::WorkloadPreset)?;
406        let greedy = self.bool_value(
407            "FERRUM_GREEDY_ARGMAX",
408            self.workload.is_m3_preset()
409                && cuda_backend
410                && self.hardware.compiled_features.greedy_argmax,
411            AutoConfigSource::WorkloadPreset,
412        )?;
413        let prefix_cache = self.bool_value(
414            "FERRUM_PREFIX_CACHE",
415            false,
416            if self.workload.is_m3_preset() {
417                AutoConfigSource::WorkloadPreset
418            } else {
419                AutoConfigSource::Default
420            },
421        )?;
422        let default_max_sequences = self.default_max_sequences();
423        let max_sequences = self.usize_value(
424            "FERRUM_PAGED_MAX_SEQS",
425            default_max_sequences.value,
426            default_max_sequences.source,
427        )?;
428        let default_kv_blocks = self.default_kv_blocks(&max_sequences);
429        let kv_blocks = self.usize_value(
430            "FERRUM_KV_MAX_BLOCKS",
431            default_kv_blocks.value,
432            default_kv_blocks.source,
433        )?;
434        let default_max_batched_tokens =
435            self.default_max_batched_tokens(&max_sequences, &kv_blocks);
436        let max_batched_tokens = self.usize_value(
437            "FERRUM_MAX_BATCHED_TOKENS",
438            default_max_batched_tokens.value,
439            default_max_batched_tokens.source,
440        )?;
441        let max_model_len = self.optional_usize_value("FERRUM_MAX_MODEL_LEN")?;
442
443        self.validate_attention(
444            use_vllm_paged_attn.value,
445            fa_layout.value,
446            fa2_source.value,
447            fa2_direct_ffi.value,
448            shim_present,
449            vllm_v1_short.value,
450        )?;
451        self.validate_moe(
452            vllm_moe.value,
453            device_route.value,
454            pair_ids.value,
455            graph.value,
456        )?;
457        self.validate_memory(
458            kv_blocks.value,
459            max_sequences.value,
460            max_batched_tokens.value,
461            max_model_len.as_ref().map(|value| value.value),
462        )?;
463        self.validate_dtypes()?;
464        self.validate_sampling(greedy.value)?;
465
466        decisions.push(self.attention_prefill_decision(
467            use_vllm_paged_attn.clone(),
468            fa_layout,
469            fa2_source,
470            fa2_direct_ffi,
471        ));
472        decisions.push(self.attention_decode_decision(use_vllm_paged_attn, vllm_v1_short));
473        decisions.push(self.moe_decision(vllm_moe, device_route, pair_ids));
474        decisions.push(self.graph_decision(graph));
475        decisions.push(self.scalar_decision(
476            "kv_block_count",
477            kv_blocks,
478            RuntimeConfigEffect::Memory,
479        ));
480        decisions.push(self.scalar_decision(
481            "max_sequences",
482            max_sequences,
483            RuntimeConfigEffect::Memory,
484        ));
485        decisions.push(self.scalar_decision(
486            "max_batched_tokens",
487            max_batched_tokens,
488            RuntimeConfigEffect::Performance,
489        ));
490        if let Some(max_model_len) = max_model_len {
491            decisions.push(self.scalar_decision(
492                "max_model_len",
493                max_model_len,
494                RuntimeConfigEffect::Memory,
495            ));
496        }
497        decisions.push(self.prefix_cache_decision(prefix_cache));
498        decisions.push(self.scheduler_decision()?);
499        decisions.push(self.sampling_decision(greedy));
500
501        Ok(ResolvedFerrumConfig {
502            schema_version: 1,
503            preset: self.workload.preset.clone(),
504            runtime_config: self.runtime_config.clone(),
505            model_capabilities: self.model.clone(),
506            hardware_capabilities: self.hardware.clone(),
507            workload_profile: self.workload.clone(),
508            decisions,
509        })
510    }
511
512    fn entries(&self) -> BTreeMap<&str, &str> {
513        self.runtime_config
514            .entries
515            .iter()
516            .map(|entry| (entry.key.as_str(), entry.effective_value.as_str()))
517            .collect()
518    }
519
520    fn raw(&self, key: &str) -> Option<&str> {
521        self.entry(key).map(|entry| entry.effective_value.as_str())
522    }
523
524    fn entry(&self, key: &str) -> Option<&RuntimeConfigEntry> {
525        self.runtime_config
526            .entries
527            .iter()
528            .find(|entry| entry.key == key)
529    }
530
531    fn source_for_key(&self, key: &str, default_source: AutoConfigSource) -> AutoConfigSource {
532        self.entry(key)
533            .map(|entry| auto_config_source_from_runtime(entry.source))
534            .unwrap_or(default_source)
535    }
536
537    fn is_cuda_backend(&self) -> bool {
538        self.hardware.backend.eq_ignore_ascii_case("cuda")
539    }
540
541    fn cuda_compute_capability_at_least(&self, major: u32, minor: u32) -> Option<bool> {
542        let (actual_major, actual_minor) =
543            parse_compute_capability(self.hardware.compute_capability.as_deref()?)?;
544        Some((actual_major, actual_minor) >= (major, minor))
545    }
546
547    fn default_max_sequences(&self) -> ResolvedValue<usize> {
548        let target = self.workload.target_concurrency.max(1);
549        let mut selected = target;
550        if self.workload.is_m3_preset() {
551            if let Some(sm_count) = self.hardware.sm_count {
552                // The M3 throughput preset assumes a large GPU. On smaller
553                // known GPUs, avoid auto-selecting a c32-sized admission
554                // window before memory profiling has a chance to refine KV.
555                selected = selected.min((sm_count as usize / 4).max(1));
556            }
557            if let Some(vram_bytes) = self.hardware.vram_bytes {
558                selected = selected.min(vram_default_max_sequences(vram_bytes));
559            }
560        }
561        ResolvedValue {
562            value: selected.max(1),
563            source: if selected < target {
564                AutoConfigSource::HardwareCapability
565            } else {
566                AutoConfigSource::WorkloadPreset
567            },
568            source_key: None,
569        }
570    }
571
572    fn default_max_batched_tokens(
573        &self,
574        max_sequences: &ResolvedValue<usize>,
575        kv_blocks: &ResolvedValue<usize>,
576    ) -> ResolvedValue<usize> {
577        let kv_token_capacity = kv_blocks
578            .value
579            .saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS)
580            .max(max_sequences.value.max(1));
581        let value = max_sequences
582            .value
583            .max(1)
584            .saturating_mul(64)
585            .min(kv_token_capacity)
586            .max(max_sequences.value.max(1));
587        ResolvedValue {
588            value,
589            source: if max_sequences.source == AutoConfigSource::HardwareCapability
590                || kv_blocks.source == AutoConfigSource::HardwareCapability
591            {
592                AutoConfigSource::HardwareCapability
593            } else {
594                AutoConfigSource::WorkloadPreset
595            },
596            source_key: None,
597        }
598    }
599
600    fn default_kv_blocks(&self, max_sequences: &ResolvedValue<usize>) -> ResolvedValue<usize> {
601        let min_blocks = ceil_div(max_sequences.value.max(1), DEFAULT_KV_BLOCK_SIZE_TOKENS);
602        let target = DEFAULT_KV_BLOCKS.max(min_blocks);
603        let selected = match (
604            self.hardware.vram_bytes,
605            self.model.estimated_weight_bytes,
606            self.kv_cache_bytes_per_token(),
607        ) {
608            (Some(vram_bytes), Some(weight_bytes), Some(kv_bytes_per_token))
609                if kv_bytes_per_token > 0 =>
610            {
611                let headroom = (vram_bytes / 10).max(2 * GIB);
612                let available = vram_bytes.saturating_sub(weight_bytes.saturating_add(headroom));
613                let kv_token_budget = (available / kv_bytes_per_token) as usize;
614                let block_budget = kv_token_budget / DEFAULT_KV_BLOCK_SIZE_TOKENS;
615                target.min(block_budget.max(min_blocks))
616            }
617            _ => target,
618        };
619        ResolvedValue {
620            value: selected.max(1),
621            source: if selected < target {
622                AutoConfigSource::HardwareCapability
623            } else {
624                AutoConfigSource::WorkloadPreset
625            },
626            source_key: None,
627        }
628    }
629
630    fn kv_cache_bytes_per_token(&self) -> Option<u64> {
631        let layers = self.model.num_hidden_layers? as u64;
632        let kv_heads = self.model.kv_heads? as u64;
633        let head_dim = self.model.head_dim? as u64;
634        layers
635            .checked_mul(2)?
636            .checked_mul(kv_heads)?
637            .checked_mul(head_dim)?
638            .checked_mul(2)
639    }
640
641    fn bool_value(
642        &self,
643        key: &str,
644        default: bool,
645        default_source: AutoConfigSource,
646    ) -> Result<ResolvedValue<bool>, AutoConfigError> {
647        match self.entry(key) {
648            Some(entry) => Ok(ResolvedValue {
649                value: parse_bool_env_value(&entry.effective_value).map_err(|reason| {
650                    AutoConfigError::InvalidOverride {
651                        key: key.to_string(),
652                        reason,
653                    }
654                })?,
655                source: auto_config_source_from_runtime(entry.source),
656                source_key: Some(key.to_string()),
657            }),
658            None => Ok(ResolvedValue {
659                value: default,
660                source: default_source,
661                source_key: None,
662            }),
663        }
664    }
665
666    fn usize_value(
667        &self,
668        key: &str,
669        default: usize,
670        default_source: AutoConfigSource,
671    ) -> Result<ResolvedValue<usize>, AutoConfigError> {
672        match self.entry(key) {
673            Some(entry) => Ok(ResolvedValue {
674                value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
675                    AutoConfigError::InvalidOverride {
676                        key: key.to_string(),
677                        reason,
678                    }
679                })?,
680                source: auto_config_source_from_runtime(entry.source),
681                source_key: Some(key.to_string()),
682            }),
683            None => Ok(ResolvedValue {
684                value: default,
685                source: default_source,
686                source_key: None,
687            }),
688        }
689    }
690
691    fn optional_usize_value(
692        &self,
693        key: &str,
694    ) -> Result<Option<ResolvedValue<usize>>, AutoConfigError> {
695        match self.entry(key) {
696            Some(entry) => Ok(Some(ResolvedValue {
697                value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
698                    AutoConfigError::InvalidOverride {
699                        key: key.to_string(),
700                        reason,
701                    }
702                })?,
703                source: auto_config_source_from_runtime(entry.source),
704                source_key: Some(key.to_string()),
705            })),
706            None => Ok(None),
707        }
708    }
709
710    fn validate_attention(
711        &self,
712        use_vllm_paged_attn: bool,
713        fa_layout: bool,
714        fa2_source: bool,
715        fa2_direct_ffi: bool,
716        shim_present: bool,
717        vllm_v1_short: bool,
718    ) -> Result<(), AutoConfigError> {
719        if use_vllm_paged_attn && !self.hardware.compiled_features.vllm_paged_attn {
720            return self.invalid(
721                "FERRUM_USE_VLLM_PAGED_ATTN",
722                "vLLM paged attention is not compiled",
723            );
724        }
725        if use_vllm_paged_attn && !self.is_cuda_backend() {
726            return self.invalid(
727                "FERRUM_USE_VLLM_PAGED_ATTN",
728                "vLLM paged attention requires CUDA backend",
729            );
730        }
731        if fa_layout && !use_vllm_paged_attn {
732            return self.invalid(
733                "FERRUM_FA_LAYOUT_VARLEN",
734                "FA layout requires vLLM paged attention layout",
735            );
736        }
737        if fa2_source && !self.hardware.compiled_features.fa2_source {
738            return self.invalid(
739                "FERRUM_FA2_SOURCE",
740                "source-linked FA2 support is not compiled",
741            );
742        }
743        if fa2_source && !self.is_cuda_backend() {
744            return self.invalid(
745                "FERRUM_FA2_SOURCE",
746                "source-linked FA2 requires CUDA backend",
747            );
748        }
749        if fa2_source && !use_vllm_paged_attn {
750            return self.invalid(
751                "FERRUM_FA2_SOURCE",
752                "source-linked FA2 requires vLLM paged attention layout",
753            );
754        }
755        if fa2_source && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
756            return self.invalid(
757                "FERRUM_FA2_SOURCE",
758                "source-linked FA2 requires CUDA compute capability >= 8.0",
759            );
760        }
761        if fa2_direct_ffi && !self.hardware.compiled_features.fa2_direct_ffi {
762            return self.invalid(
763                "FERRUM_FA2_DIRECT_FFI",
764                "direct FA2 FFI shim support is not compiled",
765            );
766        }
767        if fa2_direct_ffi && !self.is_cuda_backend() {
768            return self.invalid(
769                "FERRUM_FA2_DIRECT_FFI",
770                "direct FA2 FFI shim requires CUDA backend",
771            );
772        }
773        if fa2_direct_ffi && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
774            return self.invalid(
775                "FERRUM_FA2_DIRECT_FFI",
776                "direct FA2 FFI shim requires CUDA compute capability >= 8.0",
777            );
778        }
779        if fa2_direct_ffi && !shim_present {
780            return self.invalid(
781                "FERRUM_FA2_DIRECT_FFI",
782                "requires FERRUM_FA2_DIRECT_FFI_SHIM",
783            );
784        }
785        if fa2_source && fa2_direct_ffi {
786            return self.unsupported(
787                "attention_prefill_mixed_backend",
788                "FA2 source and direct FFI shim cannot both own the prefill path",
789            );
790        }
791        if vllm_v1_short && !use_vllm_paged_attn {
792            return self.invalid(
793                "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
794                "short-context v1 requires vLLM paged attention",
795            );
796        }
797        Ok(())
798    }
799
800    fn validate_moe(
801        &self,
802        vllm_moe: bool,
803        device_route: bool,
804        pair_ids: bool,
805        graph: bool,
806    ) -> Result<(), AutoConfigError> {
807        if vllm_moe && !self.hardware.compiled_features.vllm_moe_marlin {
808            return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE is not compiled");
809        }
810        if vllm_moe && !self.is_cuda_backend() {
811            return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE requires CUDA backend");
812        }
813        if device_route && !vllm_moe {
814            return self.invalid(
815                "FERRUM_MOE_DEVICE_ROUTE",
816                "device route currently requires vLLM MoE",
817            );
818        }
819        if pair_ids && !vllm_moe {
820            return self.invalid(
821                "FERRUM_VLLM_MOE_PAIR_IDS",
822                "pair-id routing requires vLLM MoE",
823            );
824        }
825        let graph_relevant = self.model.moe.is_some() || self.workload.is_m3_preset();
826        if graph && graph_relevant && !self.hardware.graph_support {
827            return self.invalid(
828                "FERRUM_MOE_GRAPH",
829                "hardware/backend does not support CUDA graph replay",
830            );
831        }
832        if graph && graph_relevant && !self.hardware.compiled_features.cuda_graph {
833            return self.invalid("FERRUM_MOE_GRAPH", "CUDA graph support is not compiled");
834        }
835        if graph && graph_relevant && !vllm_moe {
836            return self.invalid(
837                "FERRUM_MOE_GRAPH",
838                "graph decode requires the graph-clean vLLM MoE path",
839            );
840        }
841        if graph && graph_relevant && self.model.moe.is_some() && !self.model.graph_safe_moe {
842            return self.unsupported(
843                "moe_graph_policy",
844                "model MoE path is not marked graph-safe",
845            );
846        }
847        Ok(())
848    }
849
850    fn validate_sampling(&self, greedy: bool) -> Result<(), AutoConfigError> {
851        if greedy && !self.hardware.compiled_features.greedy_argmax {
852            return self.invalid("FERRUM_GREEDY_ARGMAX", "GPU argmax is not compiled");
853        }
854        if greedy
855            && !(self.is_cuda_backend() || self.hardware.backend.eq_ignore_ascii_case("metal"))
856        {
857            return self.invalid(
858                "FERRUM_GREEDY_ARGMAX",
859                "greedy argmax requires CUDA or Metal backend",
860            );
861        }
862        Ok(())
863    }
864
865    fn validate_memory(
866        &self,
867        kv_blocks: usize,
868        max_sequences: usize,
869        max_batched_tokens: usize,
870        requested_max_model_len: Option<usize>,
871    ) -> Result<(), AutoConfigError> {
872        if kv_blocks == 0 {
873            return self.invalid("FERRUM_KV_MAX_BLOCKS", "must be greater than zero");
874        }
875        if max_sequences == 0 {
876            return self.invalid("FERRUM_PAGED_MAX_SEQS", "must be greater than zero");
877        }
878        if max_batched_tokens < max_sequences {
879            return self.invalid(
880                "FERRUM_MAX_BATCHED_TOKENS",
881                "must be at least FERRUM_PAGED_MAX_SEQS",
882            );
883        }
884        let kv_token_capacity = kv_blocks.saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS);
885        if max_batched_tokens > kv_token_capacity {
886            return self.invalid(
887                "FERRUM_MAX_BATCHED_TOKENS",
888                "exceeds KV cache token capacity",
889            );
890        }
891        if let Some(max_model_len) = requested_max_model_len {
892            if max_model_len == 0 {
893                return self.invalid("FERRUM_MAX_MODEL_LEN", "must be greater than zero");
894            }
895            if let Some(model_max) = self.model.max_context_len {
896                if max_model_len > model_max {
897                    return self.invalid(
898                        "FERRUM_MAX_MODEL_LEN",
899                        "exceeds model metadata max context length",
900                    );
901                }
902            }
903            if max_model_len > kv_token_capacity {
904                return self.invalid(
905                    "FERRUM_KV_MAX_BLOCKS",
906                    "KV cache token capacity is smaller than FERRUM_MAX_MODEL_LEN",
907                );
908            }
909        }
910        Ok(())
911    }
912
913    fn validate_dtypes(&self) -> Result<(), AutoConfigError> {
914        if let Some(dtype) = self.raw("FERRUM_DTYPE") {
915            let dtype = dtype.to_ascii_lowercase();
916            if !self.hardware.supported_dtypes.iter().any(|d| d == &dtype) {
917                return self.invalid("FERRUM_DTYPE", "dtype is not supported by hardware profile");
918            }
919        }
920        if let Some(dtype) = self.raw("FERRUM_KV_DTYPE") {
921            let dtype = dtype.to_ascii_lowercase();
922            if !self
923                .hardware
924                .supported_kv_dtypes
925                .iter()
926                .any(|d| d == &dtype)
927            {
928                return self.invalid(
929                    "FERRUM_KV_DTYPE",
930                    "KV dtype is not supported by hardware profile",
931                );
932            }
933        }
934        Ok(())
935    }
936
937    fn attention_prefill_decision(
938        &self,
939        use_vllm_paged_attn: ResolvedValue<bool>,
940        fa_layout: ResolvedValue<bool>,
941        fa2_source: ResolvedValue<bool>,
942        fa2_direct_ffi: ResolvedValue<bool>,
943    ) -> AutoConfigDecision {
944        let (selected, source, source_key) = if fa2_source.value {
945            ("fa2_source", fa2_source.source, fa2_source.source_key)
946        } else if fa2_direct_ffi.value {
947            (
948                "fa2_direct_ffi",
949                fa2_direct_ffi.source,
950                fa2_direct_ffi.source_key,
951            )
952        } else if fa_layout.value {
953            ("fa_layout_varlen", fa_layout.source, fa_layout.source_key)
954        } else if use_vllm_paged_attn.value {
955            (
956                "vllm_paged_varlen",
957                use_vllm_paged_attn.source,
958                use_vllm_paged_attn.source_key,
959            )
960        } else {
961            ("legacy_paged_varlen", AutoConfigSource::Default, None)
962        };
963        self.decision(
964            "attention_prefill_mixed_backend",
965            selected,
966            source,
967            source_key,
968            [
969                "fa2_source",
970                "fa2_direct_ffi",
971                "fa_layout_varlen",
972                "vllm_paged_varlen",
973                "legacy_paged_varlen",
974            ],
975            self.rejected_except(
976                selected,
977                [
978                    ("fa2_source", "source-linked FA2 path not selected"),
979                    ("fa2_direct_ffi", "diagnostic direct FFI shim not selected"),
980                    ("fa_layout_varlen", "FA-compatible layout not selected"),
981                    ("vllm_paged_varlen", "vLLM paged varlen bridge not selected"),
982                    (
983                        "legacy_paged_varlen",
984                        "a higher-priority attention path was selected",
985                    ),
986                ],
987            ),
988            vec![
989                RuntimeConfigEffect::Performance,
990                RuntimeConfigEffect::Memory,
991            ],
992        )
993    }
994
995    fn attention_decode_decision(
996        &self,
997        use_vllm_paged_attn: ResolvedValue<bool>,
998        vllm_v1_short: ResolvedValue<bool>,
999    ) -> AutoConfigDecision {
1000        let (selected, source, source_key) = if use_vllm_paged_attn.value {
1001            if vllm_v1_short.value {
1002                (
1003                    "vllm_paged_attn_v1_short",
1004                    vllm_v1_short.source,
1005                    vllm_v1_short.source_key,
1006                )
1007            } else {
1008                (
1009                    "vllm_paged_attn_v2",
1010                    vllm_v1_short.source,
1011                    vllm_v1_short.source_key,
1012                )
1013            }
1014        } else {
1015            ("legacy_paged_decode", use_vllm_paged_attn.source, None)
1016        };
1017        self.decision(
1018            "attention_decode_backend",
1019            selected,
1020            source,
1021            source_key,
1022            [
1023                "vllm_paged_attn_v1_short",
1024                "vllm_paged_attn_v2",
1025                "legacy_paged_decode",
1026            ],
1027            self.rejected_except(
1028                selected,
1029                [
1030                    (
1031                        "vllm_paged_attn_v1_short",
1032                        "short-context v1 decode not selected",
1033                    ),
1034                    ("vllm_paged_attn_v2", "v2 decode not selected"),
1035                    ("legacy_paged_decode", "legacy decode not selected"),
1036                ],
1037            ),
1038            vec![RuntimeConfigEffect::Performance],
1039        )
1040    }
1041
1042    fn moe_decision(
1043        &self,
1044        vllm_moe: ResolvedValue<bool>,
1045        device_route: ResolvedValue<bool>,
1046        pair_ids: ResolvedValue<bool>,
1047    ) -> AutoConfigDecision {
1048        let selected = if vllm_moe.value && device_route.value && pair_ids.value {
1049            "vllm_marlin_moe_device_route_pair_ids"
1050        } else if vllm_moe.value && device_route.value {
1051            "vllm_marlin_moe_device_route"
1052        } else if vllm_moe.value {
1053            "vllm_marlin_moe"
1054        } else {
1055            "legacy_moe"
1056        };
1057        self.decision(
1058            "moe_implementation",
1059            selected,
1060            vllm_moe.source,
1061            vllm_moe.source_key,
1062            [
1063                "vllm_marlin_moe_device_route_pair_ids",
1064                "vllm_marlin_moe_device_route",
1065                "vllm_marlin_moe",
1066                "legacy_moe",
1067            ],
1068            self.rejected_except(
1069                selected,
1070                [
1071                    (
1072                        "vllm_marlin_moe_device_route_pair_ids",
1073                        "pair-id device route not selected",
1074                    ),
1075                    (
1076                        "vllm_marlin_moe_device_route",
1077                        "device-route MoE not selected",
1078                    ),
1079                    ("vllm_marlin_moe", "vLLM Marlin MoE not selected"),
1080                    ("legacy_moe", "legacy MoE not selected"),
1081                ],
1082            ),
1083            vec![RuntimeConfigEffect::Performance],
1084        )
1085    }
1086
1087    fn graph_decision(&self, graph: ResolvedValue<bool>) -> AutoConfigDecision {
1088        let selected = if graph.value {
1089            "graph_clean_decode"
1090        } else {
1091            "graph_disabled"
1092        };
1093        self.decision(
1094            "moe_graph_policy",
1095            selected,
1096            graph.source,
1097            graph.source_key,
1098            ["graph_clean_decode", "graph_disabled"],
1099            self.rejected_except(
1100                selected,
1101                [
1102                    ("graph_clean_decode", "graph decode not selected"),
1103                    ("graph_disabled", "graph decode selected"),
1104                ],
1105            ),
1106            vec![
1107                RuntimeConfigEffect::Performance,
1108                RuntimeConfigEffect::Correctness,
1109            ],
1110        )
1111    }
1112
1113    fn scalar_decision(
1114        &self,
1115        selection: &str,
1116        value: ResolvedValue<usize>,
1117        effect: RuntimeConfigEffect,
1118    ) -> AutoConfigDecision {
1119        self.decision(
1120            selection,
1121            &value.value.to_string(),
1122            value.source,
1123            value.source_key,
1124            [value.value.to_string()],
1125            Vec::new(),
1126            vec![effect],
1127        )
1128    }
1129
1130    fn scheduler_decision(&self) -> Result<AutoConfigDecision, AutoConfigError> {
1131        let entries = self.entries();
1132        let mut selected = "continuous_default".to_string();
1133        let mut source_key = None;
1134        if let Some(chunk) = entries.get("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK") {
1135            parse_usize_env_value(chunk).map_err(|reason| AutoConfigError::InvalidOverride {
1136                key: "FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string(),
1137                reason,
1138            })?;
1139            selected = format!("active_decode_prefill_chunk:{chunk}");
1140            source_key = Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string());
1141        } else if let Some(until) = entries.get("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE") {
1142            parse_usize_env_value(until).map_err(|reason| AutoConfigError::InvalidOverride {
1143                key: "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string(),
1144                reason,
1145            })?;
1146            selected = format!("prefill_first_until_active:{until}");
1147            source_key = Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string());
1148        } else if self
1149            .bool_value(
1150                "FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE",
1151                false,
1152                AutoConfigSource::Default,
1153            )?
1154            .value
1155        {
1156            selected = "prompt_token_estimate".to_string();
1157            source_key = Some("FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE".to_string());
1158        }
1159        self.unsupported_if(
1160            source_key.as_deref() == Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
1161                && selected.ends_with(":0"),
1162            "scheduler_admission_policy",
1163            "active decode prefill chunk must be greater than zero",
1164        )?;
1165        Ok(self.decision(
1166            "scheduler_admission_policy",
1167            &selected,
1168            source_key
1169                .as_deref()
1170                .map(|key| self.source_for_key(key, AutoConfigSource::Default))
1171                .unwrap_or(AutoConfigSource::Default),
1172            source_key,
1173            [
1174                "continuous_default",
1175                "prompt_token_estimate",
1176                "prefill_first_until_active",
1177                "active_decode_prefill_chunk",
1178            ],
1179            Vec::new(),
1180            vec![RuntimeConfigEffect::Performance],
1181        ))
1182    }
1183
1184    fn prefix_cache_decision(&self, prefix_cache: ResolvedValue<bool>) -> AutoConfigDecision {
1185        let selected = if prefix_cache.value {
1186            "prefix_cache_enabled"
1187        } else {
1188            "prefix_cache_disabled"
1189        };
1190        self.decision(
1191            "prefix_cache_policy",
1192            selected,
1193            prefix_cache.source,
1194            prefix_cache.source_key,
1195            ["prefix_cache_enabled", "prefix_cache_disabled"],
1196            self.rejected_except(
1197                selected,
1198                [
1199                    ("prefix_cache_enabled", "prefix cache not selected"),
1200                    ("prefix_cache_disabled", "prefix cache enabled"),
1201                ],
1202            ),
1203            vec![
1204                RuntimeConfigEffect::Correctness,
1205                RuntimeConfigEffect::Performance,
1206                RuntimeConfigEffect::Memory,
1207            ],
1208        )
1209    }
1210
1211    fn sampling_decision(&self, greedy: ResolvedValue<bool>) -> AutoConfigDecision {
1212        let selected = if greedy.value {
1213            "gpu_greedy_argmax"
1214        } else {
1215            "logits_readback"
1216        };
1217        self.decision(
1218            "sampling_readback_path",
1219            selected,
1220            greedy.source,
1221            greedy.source_key,
1222            ["gpu_greedy_argmax", "logits_readback"],
1223            self.rejected_except(
1224                selected,
1225                [
1226                    ("gpu_greedy_argmax", "GPU argmax not selected"),
1227                    ("logits_readback", "logits readback not selected"),
1228                ],
1229            ),
1230            vec![
1231                RuntimeConfigEffect::Performance,
1232                RuntimeConfigEffect::Correctness,
1233            ],
1234        )
1235    }
1236
1237    fn decision<I, C>(
1238        &self,
1239        selection: &str,
1240        selected: &str,
1241        source: AutoConfigSource,
1242        source_key: Option<String>,
1243        candidates: I,
1244        rejected: Vec<RejectedCandidate>,
1245        affects: Vec<RuntimeConfigEffect>,
1246    ) -> AutoConfigDecision
1247    where
1248        I: IntoIterator<Item = C>,
1249        C: Into<String>,
1250    {
1251        AutoConfigDecision {
1252            schema_version: 1,
1253            selection: selection.to_string(),
1254            selected: selected.to_string(),
1255            source,
1256            source_key,
1257            candidates: candidates.into_iter().map(Into::into).collect(),
1258            rejected,
1259            affects,
1260        }
1261    }
1262
1263    fn rejected_except<I>(&self, selected: &str, candidates: I) -> Vec<RejectedCandidate>
1264    where
1265        I: IntoIterator<Item = (&'static str, &'static str)>,
1266    {
1267        candidates
1268            .into_iter()
1269            .filter(|(value, _)| *value != selected)
1270            .map(|(value, reason)| RejectedCandidate {
1271                value: value.to_string(),
1272                reason: reason.to_string(),
1273            })
1274            .collect()
1275    }
1276
1277    fn invalid<T>(&self, key: &str, reason: &str) -> Result<T, AutoConfigError> {
1278        Err(AutoConfigError::InvalidOverride {
1279            key: key.to_string(),
1280            reason: reason.to_string(),
1281        })
1282    }
1283
1284    fn unsupported<T>(&self, selection: &str, reason: &str) -> Result<T, AutoConfigError> {
1285        Err(AutoConfigError::UnsupportedCombination {
1286            selection: selection.to_string(),
1287            reason: reason.to_string(),
1288        })
1289    }
1290
1291    fn unsupported_if(
1292        &self,
1293        condition: bool,
1294        selection: &str,
1295        reason: &str,
1296    ) -> Result<(), AutoConfigError> {
1297        if condition {
1298            self.unsupported(selection, reason)
1299        } else {
1300            Ok(())
1301        }
1302    }
1303}
1304
1305#[derive(Debug, Clone, PartialEq, Eq)]
1306struct ResolvedValue<T> {
1307    value: T,
1308    source: AutoConfigSource,
1309    source_key: Option<String>,
1310}
1311
/// Parses a compute-capability string into `(major, minor)`.
///
/// Accepts `"major.minor"` or a bare `"major"` (minor then defaults to `0`), with
/// surrounding whitespace ignored around the whole string and around each part.
/// Returns `None` for empty input or when either part is not a valid `u32`.
fn parse_compute_capability(value: &str) -> Option<(u32, u32)> {
    let trimmed = value.trim();
    if trimmed.is_empty() {
        return None;
    }
    // Only the first '.' splits; anything after it must parse as the minor part.
    let (major_text, minor_text) = match trimmed.split_once('.') {
        Some(parts) => parts,
        None => (trimmed, "0"),
    };
    let major = major_text.trim().parse::<u32>().ok()?;
    let minor = minor_text.trim().parse::<u32>().ok()?;
    Some((major, minor))
}
1320
1321fn vram_default_max_sequences(vram_bytes: u64) -> usize {
1322    match vram_bytes {
1323        bytes if bytes >= 20 * GIB => 32,
1324        bytes if bytes >= 12 * GIB => 16,
1325        bytes if bytes >= 8 * GIB => 8,
1326        _ => 4,
1327    }
1328}
1329
/// Divides `value` by `divisor`, rounding the quotient up.
///
/// # Panics
///
/// Panics when `divisor` is zero.
fn ceil_div(value: usize, divisor: usize) -> usize {
    usize::div_ceil(value, divisor)
}
1333
1334fn auto_config_source_from_runtime(source: RuntimeConfigSource) -> AutoConfigSource {
1335    match source {
1336        RuntimeConfigSource::Default => AutoConfigSource::Default,
1337        RuntimeConfigSource::ConfigFile => AutoConfigSource::ConfigFile,
1338        RuntimeConfigSource::Cli => AutoConfigSource::Cli,
1339        RuntimeConfigSource::Env => AutoConfigSource::Env,
1340        RuntimeConfigSource::ScriptCase => AutoConfigSource::ScriptCase,
1341        RuntimeConfigSource::MemoryProfile => AutoConfigSource::MemoryProfile,
1342    }
1343}
1344
1345#[cfg(test)]
1346mod tests {
1347    use super::*;
1348
1349    fn snapshot(vars: &[(&str, &str)]) -> RuntimeConfigSnapshot {
1350        RuntimeConfigSnapshot::from_env_vars(vars.iter().copied())
1351    }
1352
1353    fn snapshot_with_sources(vars: &[(&str, &str, RuntimeConfigSource)]) -> RuntimeConfigSnapshot {
1354        let mut entries: Vec<_> = vars
1355            .iter()
1356            .map(|(key, effective_value, source)| RuntimeConfigEntry {
1357                key: (*key).to_string(),
1358                effective_value: (*effective_value).to_string(),
1359                source: *source,
1360                affects: vec![RuntimeConfigEffect::Performance],
1361            })
1362            .collect();
1363        entries.sort_by(|a, b| a.key.cmp(&b.key));
1364        RuntimeConfigSnapshot { entries }
1365    }
1366
1367    fn m3(vars: &[(&str, &str)], features: CompiledKernelFeatures) -> FerrumConfigBuilder {
1368        FerrumConfigBuilder::new(snapshot(vars))
1369            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1370            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(features))
1371            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1372    }
1373
1374    fn m3_with_hardware(
1375        vars: &[(&str, &str)],
1376        hardware: HardwareCapabilities,
1377    ) -> FerrumConfigBuilder {
1378        FerrumConfigBuilder::new(snapshot(vars))
1379            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1380            .with_hardware_capabilities(hardware)
1381            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1382    }
1383
1384    fn expect_invalid_key(vars: &[(&str, &str)], key: &str) {
1385        expect_invalid_key_with_features(
1386            vars,
1387            key,
1388            CompiledKernelFeatures::m3_fast_path_without_fa2(),
1389        );
1390    }
1391
1392    fn expect_invalid_key_with_features(
1393        vars: &[(&str, &str)],
1394        key: &str,
1395        features: CompiledKernelFeatures,
1396    ) {
1397        expect_invalid_key_with_hardware(vars, key, HardwareCapabilities::rtx4090_cuda(features));
1398    }
1399
1400    fn expect_invalid_key_with_hardware(
1401        vars: &[(&str, &str)],
1402        key: &str,
1403        hardware: HardwareCapabilities,
1404    ) {
1405        let err = m3_with_hardware(vars, hardware)
1406            .resolve()
1407            .expect_err("override should fail");
1408        match err {
1409            AutoConfigError::InvalidOverride { key: actual, .. } => assert_eq!(actual, key),
1410            other => panic!("expected invalid override for {key}, got {other:?}"),
1411        }
1412    }
1413
1414    fn cpu_hardware_with_features(features: CompiledKernelFeatures) -> HardwareCapabilities {
1415        HardwareCapabilities {
1416            backend: "cpu".to_string(),
1417            supported_dtypes: vec!["fp32".to_string()],
1418            supported_kv_dtypes: vec!["fp16".to_string()],
1419            compiled_features: features,
1420            ..HardwareCapabilities::unknown()
1421        }
1422    }
1423
1424    #[test]
1425    fn m3_preset_selects_current_safe_fast_path_without_fa2() {
1426        let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
1427            .resolve()
1428            .unwrap();
1429        let decisions: BTreeMap<_, _> = resolved
1430            .decisions
1431            .iter()
1432            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1433            .collect();
1434        assert_eq!(
1435            decisions["attention_prefill_mixed_backend"],
1436            "vllm_paged_varlen"
1437        );
1438        assert_eq!(
1439            decisions["attention_decode_backend"],
1440            "vllm_paged_attn_v1_short"
1441        );
1442        assert_eq!(
1443            decisions["moe_implementation"],
1444            "vllm_marlin_moe_device_route_pair_ids"
1445        );
1446        assert_eq!(decisions["moe_graph_policy"], "graph_disabled");
1447        assert_eq!(decisions["prefix_cache_policy"], "prefix_cache_disabled");
1448        assert_eq!(decisions["sampling_readback_path"], "gpu_greedy_argmax");
1449        assert_eq!(
1450            resolved.preset.as_deref(),
1451            Some(M3_QWEN3_30B_A3B_INT4_PRESET)
1452        );
1453    }
1454
1455    #[test]
1456    fn source_fa2_selects_source_linked_attention_when_compiled() {
1457        let resolved = m3(
1458            &[("FERRUM_FA2_SOURCE", "1")],
1459            CompiledKernelFeatures::m3_fast_path_with_source_fa2(),
1460        )
1461        .resolve()
1462        .unwrap();
1463        let decisions: BTreeMap<_, _> = resolved
1464            .decisions
1465            .iter()
1466            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1467            .collect();
1468
1469        assert_eq!(decisions["attention_prefill_mixed_backend"], "fa2_source");
1470    }
1471
1472    #[test]
1473    fn source_fa2_is_rejected_when_not_compiled() {
1474        expect_invalid_key(&[("FERRUM_FA2_SOURCE", "1")], "FERRUM_FA2_SOURCE");
1475    }
1476
1477    #[test]
1478    fn hardware_capabilities_keep_m3_preset_on_compatible_backend_paths() {
1479        let resolved = m3_with_hardware(
1480            &[],
1481            cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2()),
1482        )
1483        .resolve()
1484        .unwrap();
1485        let decisions: BTreeMap<_, _> = resolved
1486            .decisions
1487            .iter()
1488            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1489            .collect();
1490
1491        assert_eq!(
1492            decisions["attention_prefill_mixed_backend"],
1493            "legacy_paged_varlen"
1494        );
1495        assert_eq!(decisions["attention_decode_backend"], "legacy_paged_decode");
1496        assert_eq!(decisions["moe_implementation"], "legacy_moe");
1497        assert_eq!(decisions["moe_graph_policy"], "graph_disabled");
1498        assert_eq!(decisions["sampling_readback_path"], "logits_readback");
1499    }
1500
1501    #[test]
1502    fn hardware_incompatible_attention_and_sampling_overrides_are_rejected() {
1503        let cpu =
1504            cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2());
1505        expect_invalid_key_with_hardware(
1506            &[("FERRUM_USE_VLLM_PAGED_ATTN", "1")],
1507            "FERRUM_USE_VLLM_PAGED_ATTN",
1508            cpu.clone(),
1509        );
1510        expect_invalid_key_with_hardware(
1511            &[("FERRUM_VLLM_MOE", "1")],
1512            "FERRUM_VLLM_MOE",
1513            cpu.clone(),
1514        );
1515        expect_invalid_key_with_hardware(
1516            &[("FERRUM_GREEDY_ARGMAX", "1")],
1517            "FERRUM_GREEDY_ARGMAX",
1518            cpu.clone(),
1519        );
1520        expect_invalid_key_with_hardware(&[("FERRUM_FA2_SOURCE", "1")], "FERRUM_FA2_SOURCE", cpu);
1521
1522        let mut old_cuda = HardwareCapabilities::rtx4090_cuda(
1523            CompiledKernelFeatures::m3_fast_path_with_source_fa2(),
1524        );
1525        old_cuda.compute_capability = Some("7.5".to_string());
1526        expect_invalid_key_with_hardware(
1527            &[("FERRUM_FA2_SOURCE", "1")],
1528            "FERRUM_FA2_SOURCE",
1529            old_cuda,
1530        );
1531    }
1532
1533    #[test]
1534    fn hardware_capacity_sizes_default_sequence_budget_without_overriding_user_values() {
1535        let mut small_gpu =
1536            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1537        small_gpu.sm_count = Some(16);
1538        small_gpu.vram_bytes = Some(24 * 1024 * 1024 * 1024);
1539
1540        let resolved = m3_with_hardware(&[], small_gpu.clone()).resolve().unwrap();
1541        let decision = |selection: &str| {
1542            resolved
1543                .decisions
1544                .iter()
1545                .find(|decision| decision.selection == selection)
1546                .unwrap()
1547        };
1548        let max_sequences = decision("max_sequences");
1549        assert_eq!(max_sequences.selected, "4");
1550        assert_eq!(max_sequences.source, AutoConfigSource::HardwareCapability);
1551        let max_batched_tokens = decision("max_batched_tokens");
1552        assert_eq!(max_batched_tokens.selected, "256");
1553        assert_eq!(
1554            max_batched_tokens.source,
1555            AutoConfigSource::HardwareCapability
1556        );
1557
1558        let resolved = m3_with_hardware(&[("FERRUM_PAGED_MAX_SEQS", "16")], small_gpu)
1559            .resolve()
1560            .unwrap();
1561        let max_sequences = resolved
1562            .decisions
1563            .iter()
1564            .find(|decision| decision.selection == "max_sequences")
1565            .unwrap();
1566        assert_eq!(max_sequences.selected, "16");
1567        assert_eq!(max_sequences.source, AutoConfigSource::Env);
1568        assert_eq!(
1569            max_sequences.source_key.as_deref(),
1570            Some("FERRUM_PAGED_MAX_SEQS")
1571        );
1572    }
1573
1574    #[test]
1575    fn vram_capacity_caps_m3_default_sequence_budget() {
1576        let mut low_vram_gpu =
1577            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1578        low_vram_gpu.sm_count = Some(128);
1579        low_vram_gpu.vram_bytes = Some(7 * 1024 * 1024 * 1024);
1580
1581        let resolved = m3_with_hardware(&[], low_vram_gpu).resolve().unwrap();
1582        let max_sequences = resolved
1583            .decisions
1584            .iter()
1585            .find(|decision| decision.selection == "max_sequences")
1586            .unwrap();
1587        assert_eq!(max_sequences.selected, "4");
1588        assert_eq!(max_sequences.source, AutoConfigSource::HardwareCapability);
1589    }
1590
1591    #[test]
1592    fn memory_budget_keeps_rtx4090_m3_kv_blocks_but_caps_constrained_vram() {
1593        let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
1594            .resolve()
1595            .unwrap();
1596        let decision = |selection: &str| {
1597            resolved
1598                .decisions
1599                .iter()
1600                .find(|decision| decision.selection == selection)
1601                .unwrap()
1602        };
1603        assert_eq!(decision("kv_block_count").selected, "2048");
1604        assert_eq!(
1605            decision("kv_block_count").source,
1606            AutoConfigSource::WorkloadPreset
1607        );
1608
1609        let mut constrained =
1610            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1611        constrained.vram_bytes = Some(20 * 1024 * 1024 * 1024);
1612        let resolved = m3_with_hardware(&[], constrained).resolve().unwrap();
1613        let decision = |selection: &str| {
1614            resolved
1615                .decisions
1616                .iter()
1617                .find(|decision| decision.selection == selection)
1618                .unwrap()
1619        };
1620        assert_eq!(decision("kv_block_count").selected, "2");
1621        assert_eq!(
1622            decision("kv_block_count").source,
1623            AutoConfigSource::HardwareCapability
1624        );
1625        assert_eq!(decision("max_batched_tokens").selected, "32");
1626        assert_eq!(
1627            decision("max_batched_tokens").source,
1628            AutoConfigSource::HardwareCapability
1629        );
1630    }
1631
1632    #[test]
1633    fn compute_capability_parser_accepts_major_minor_and_major_only() {
1634        assert_eq!(parse_compute_capability("8.9"), Some((8, 9)));
1635        assert_eq!(parse_compute_capability("9"), Some((9, 0)));
1636        assert_eq!(parse_compute_capability("N/A"), None);
1637    }
1638
1639    #[test]
1640    fn vram_capacity_tiers_are_monotonic() {
1641        assert_eq!(vram_default_max_sequences(24 * 1024 * 1024 * 1024), 32);
1642        assert_eq!(vram_default_max_sequences(16 * 1024 * 1024 * 1024), 16);
1643        assert_eq!(vram_default_max_sequences(8 * 1024 * 1024 * 1024), 8);
1644        assert_eq!(vram_default_max_sequences(6 * 1024 * 1024 * 1024), 4);
1645    }
1646
1647    #[test]
1648    fn accelerator_serving_default_uses_hardware_concurrency_budget() {
1649        let hardware =
1650            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1651        let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
1652        assert_eq!(workload.target_concurrency, 32);
1653
1654        let resolved = FerrumConfigBuilder::new(snapshot(&[]))
1655            .with_model_capabilities(ModelCapabilities::unknown())
1656            .with_hardware_capabilities(hardware)
1657            .with_workload_profile(workload)
1658            .resolve()
1659            .unwrap();
1660        let max_sequences = resolved
1661            .decisions
1662            .iter()
1663            .find(|decision| decision.selection == "max_sequences")
1664            .unwrap();
1665        assert_eq!(max_sequences.selected, "32");
1666    }
1667
1668    #[test]
1669    fn cpu_serving_default_keeps_single_sequence_budget() {
1670        let hardware = HardwareCapabilities {
1671            backend: "cpu".to_string(),
1672            supported_dtypes: vec!["fp32".to_string()],
1673            ..HardwareCapabilities::unknown()
1674        };
1675        let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
1676        assert_eq!(workload.target_concurrency, 1);
1677    }
1678
1679    #[test]
1680    fn validates_invalid_override_matrix() {
1681        expect_invalid_key(
1682            &[("FERRUM_USE_VLLM_PAGED_ATTN", "maybe")],
1683            "FERRUM_USE_VLLM_PAGED_ATTN",
1684        );
1685        expect_invalid_key(&[("FERRUM_PREFIX_CACHE", "maybe")], "FERRUM_PREFIX_CACHE");
1686        expect_invalid_key(
1687            &[
1688                ("FERRUM_FA_LAYOUT_VARLEN", "1"),
1689                ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
1690            ],
1691            "FERRUM_FA_LAYOUT_VARLEN",
1692        );
1693        expect_invalid_key(&[("FERRUM_FA2_DIRECT_FFI", "1")], "FERRUM_FA2_DIRECT_FFI");
1694        expect_invalid_key_with_features(
1695            &[("FERRUM_VLLM_MOE", "1")],
1696            "FERRUM_VLLM_MOE",
1697            CompiledKernelFeatures::default(),
1698        );
1699        expect_invalid_key(
1700            &[("FERRUM_MOE_DEVICE_ROUTE", "1"), ("FERRUM_VLLM_MOE", "0")],
1701            "FERRUM_MOE_DEVICE_ROUTE",
1702        );
1703        expect_invalid_key(
1704            &[("FERRUM_VLLM_MOE_PAIR_IDS", "1"), ("FERRUM_VLLM_MOE", "0")],
1705            "FERRUM_VLLM_MOE_PAIR_IDS",
1706        );
1707        expect_invalid_key(
1708            &[("FERRUM_MOE_GRAPH", "1"), ("FERRUM_VLLM_MOE", "0")],
1709            "FERRUM_MOE_GRAPH",
1710        );
1711        expect_invalid_key(&[("FERRUM_KV_MAX_BLOCKS", "0")], "FERRUM_KV_MAX_BLOCKS");
1712        expect_invalid_key(&[("FERRUM_PAGED_MAX_SEQS", "0")], "FERRUM_PAGED_MAX_SEQS");
1713        expect_invalid_key(
1714            &[
1715                ("FERRUM_PAGED_MAX_SEQS", "32"),
1716                ("FERRUM_MAX_BATCHED_TOKENS", "16"),
1717            ],
1718            "FERRUM_MAX_BATCHED_TOKENS",
1719        );
1720        expect_invalid_key(
1721            &[
1722                ("FERRUM_KV_MAX_BLOCKS", "16"),
1723                ("FERRUM_MAX_BATCHED_TOKENS", "512"),
1724            ],
1725            "FERRUM_MAX_BATCHED_TOKENS",
1726        );
1727        expect_invalid_key(&[("FERRUM_MAX_MODEL_LEN", "0")], "FERRUM_MAX_MODEL_LEN");
1728        expect_invalid_key(&[("FERRUM_MAX_MODEL_LEN", "50000")], "FERRUM_MAX_MODEL_LEN");
1729        expect_invalid_key(
1730            &[
1731                ("FERRUM_KV_MAX_BLOCKS", "16"),
1732                ("FERRUM_MAX_MODEL_LEN", "1024"),
1733            ],
1734            "FERRUM_KV_MAX_BLOCKS",
1735        );
1736        expect_invalid_key(&[("FERRUM_DTYPE", "bf16")], "FERRUM_DTYPE");
1737        expect_invalid_key(&[("FERRUM_KV_DTYPE", "fp8")], "FERRUM_KV_DTYPE");
1738        expect_invalid_key(
1739            &[
1740                ("FERRUM_VLLM_PAGED_ATTN_V1_SHORT", "1"),
1741                ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
1742            ],
1743            "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
1744        );
1745    }
1746
1747    #[test]
1748    fn requested_max_model_len_is_optional_and_reflected_when_valid() {
1749        let default_resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
1750            .resolve()
1751            .unwrap();
1752        assert!(!default_resolved
1753            .decisions
1754            .iter()
1755            .any(|decision| decision.selection == "max_model_len"));
1756
1757        let resolved = m3(
1758            &[
1759                ("FERRUM_KV_MAX_BLOCKS", "64"),
1760                ("FERRUM_MAX_MODEL_LEN", "1024"),
1761            ],
1762            CompiledKernelFeatures::m3_fast_path_without_fa2(),
1763        )
1764        .resolve()
1765        .unwrap();
1766        let max_model_len = resolved
1767            .decisions
1768            .iter()
1769            .find(|decision| decision.selection == "max_model_len")
1770            .unwrap();
1771        assert_eq!(max_model_len.selected, "1024");
1772        assert_eq!(
1773            max_model_len.source_key.as_deref(),
1774            Some("FERRUM_MAX_MODEL_LEN")
1775        );
1776    }
1777
1778    #[test]
1779    fn graph_enabled_with_graph_unsafe_moe_is_rejected() {
1780        let mut model = ModelCapabilities::qwen3_30b_a3b_gptq_int4();
1781        model.graph_safe_moe = false;
1782        let err = FerrumConfigBuilder::new(snapshot(&[("FERRUM_MOE_GRAPH", "1")]))
1783            .with_model_capabilities(model)
1784            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
1785                CompiledKernelFeatures::m3_fast_path_without_fa2(),
1786            ))
1787            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1788            .resolve()
1789            .expect_err("graph unsafe MoE must fail");
1790        assert!(matches!(
1791            err,
1792            AutoConfigError::UnsupportedCombination {
1793                selection,
1794                ..
1795            } if selection == "moe_graph_policy"
1796        ));
1797    }
1798
1799    #[test]
1800    fn scheduler_override_is_reflected_in_decision_trace() {
1801        let resolved = m3(
1802            &[("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK", "64")],
1803            CompiledKernelFeatures::m3_fast_path_without_fa2(),
1804        )
1805        .resolve()
1806        .unwrap();
1807        let scheduler = resolved
1808            .decisions
1809            .iter()
1810            .find(|decision| decision.selection == "scheduler_admission_policy")
1811            .unwrap();
1812        assert_eq!(scheduler.selected, "active_decode_prefill_chunk:64");
1813        assert_eq!(
1814            scheduler.source_key.as_deref(),
1815            Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
1816        );
1817    }
1818
1819    #[test]
1820    fn prefix_cache_override_is_reflected_in_decision_trace() {
1821        let resolved = m3(
1822            &[("FERRUM_PREFIX_CACHE", "1")],
1823            CompiledKernelFeatures::m3_fast_path_without_fa2(),
1824        )
1825        .resolve()
1826        .unwrap();
1827        let prefix_cache = resolved
1828            .decisions
1829            .iter()
1830            .find(|decision| decision.selection == "prefix_cache_policy")
1831            .unwrap();
1832        assert_eq!(prefix_cache.selected, "prefix_cache_enabled");
1833        assert_eq!(
1834            prefix_cache.source_key.as_deref(),
1835            Some("FERRUM_PREFIX_CACHE")
1836        );
1837    }
1838
1839    #[test]
1840    fn non_env_runtime_sources_are_preserved_in_decision_trace() {
1841        let runtime_config = snapshot_with_sources(&[
1842            (
1843                "FERRUM_FA_LAYOUT_VARLEN",
1844                "1",
1845                RuntimeConfigSource::ConfigFile,
1846            ),
1847            ("FERRUM_PAGED_MAX_SEQS", "48", RuntimeConfigSource::Cli),
1848            (
1849                "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE",
1850                "32",
1851                RuntimeConfigSource::ScriptCase,
1852            ),
1853        ]);
1854        let resolved = FerrumConfigBuilder::new(runtime_config)
1855            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1856            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
1857                CompiledKernelFeatures::m3_fast_path_without_fa2(),
1858            ))
1859            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1860            .resolve()
1861            .unwrap();
1862
1863        let decision = |selection: &str| {
1864            resolved
1865                .decisions
1866                .iter()
1867                .find(|decision| decision.selection == selection)
1868                .unwrap()
1869        };
1870        let attention = decision("attention_prefill_mixed_backend");
1871        assert_eq!(attention.selected, "fa_layout_varlen");
1872        assert_eq!(attention.source, AutoConfigSource::ConfigFile);
1873        assert_eq!(
1874            attention.source_key.as_deref(),
1875            Some("FERRUM_FA_LAYOUT_VARLEN")
1876        );
1877
1878        let max_sequences = decision("max_sequences");
1879        assert_eq!(max_sequences.selected, "48");
1880        assert_eq!(max_sequences.source, AutoConfigSource::Cli);
1881        assert_eq!(
1882            max_sequences.source_key.as_deref(),
1883            Some("FERRUM_PAGED_MAX_SEQS")
1884        );
1885
1886        let scheduler = decision("scheduler_admission_policy");
1887        assert_eq!(scheduler.selected, "prefill_first_until_active:32");
1888        assert_eq!(scheduler.source, AutoConfigSource::ScriptCase);
1889        assert_eq!(
1890            scheduler.source_key.as_deref(),
1891            Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE")
1892        );
1893    }
1894
1895    #[test]
1896    fn renders_effective_config_and_decision_trace_artifacts() {
1897        let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
1898            .resolve()
1899            .unwrap();
1900        let effective = resolved.effective_config_document();
1901        assert_eq!(effective["schema_version"], 1);
1902        assert!(effective["env_hash"]
1903            .as_str()
1904            .unwrap()
1905            .starts_with("sha256:"));
1906        assert!(effective["entries"].is_array());
1907        assert_eq!(effective["model_capabilities"]["architecture"], "qwen3_moe");
1908        assert_eq!(effective["hardware_capabilities"]["backend"], "cuda");
1909        assert_eq!(
1910            effective["workload_profile"]["preset"],
1911            M3_QWEN3_30B_A3B_INT4_PRESET
1912        );
1913        assert_eq!(
1914            effective["decisions"].as_array().unwrap().len(),
1915            resolved.decisions.len()
1916        );
1917        let trace = resolved.decision_trace_jsonl().unwrap();
1918        assert_eq!(trace.lines().count(), resolved.decisions.len());
1919        assert!(trace.contains("\"attention_prefill_mixed_backend\""));
1920    }
1921
1922    #[test]
1923    fn auto_config_artifacts_match_locked_schema_shape() {
1924        let resolved = FerrumConfigBuilder::m3_qwen3_30b_a3b_int4(snapshot_with_sources(&[
1925            (
1926                "FERRUM_FA_LAYOUT_VARLEN",
1927                "1",
1928                RuntimeConfigSource::ScriptCase,
1929            ),
1930            ("FERRUM_PAGED_MAX_SEQS", "32", RuntimeConfigSource::Cli),
1931        ]))
1932        .resolve()
1933        .unwrap();
1934
1935        let effective = resolved.effective_config_document();
1936        assert_eq!(effective["schema_version"], 1);
1937        assert!(effective["env_hash"]
1938            .as_str()
1939            .unwrap()
1940            .starts_with("sha256:"));
1941
1942        let entries = effective["entries"].as_array().unwrap();
1943        let keys: Vec<_> = entries
1944            .iter()
1945            .map(|entry| entry["key"].as_str().unwrap())
1946            .collect();
1947        let mut sorted_keys = keys.clone();
1948        sorted_keys.sort_unstable();
1949        assert_eq!(keys, sorted_keys);
1950        for entry in entries {
1951            assert!(entry["key"].as_str().unwrap().starts_with("FERRUM_"));
1952            assert!(entry["effective_value"].is_string());
1953            assert!(matches!(
1954                entry["source"].as_str().unwrap(),
1955                "default" | "config_file" | "cli" | "env" | "script_case" | "memory_profile"
1956            ));
1957            assert!(!entry["affects"].as_array().unwrap().is_empty());
1958        }
1959        assert_eq!(
1960            effective["model_capabilities"]["quantization"].as_str(),
1961            Some("gptq_int4")
1962        );
1963        assert_eq!(
1964            effective["model_capabilities"]["moe"]["experts_per_token"].as_u64(),
1965            Some(8)
1966        );
1967        assert_eq!(
1968            effective["hardware_capabilities"]["compute_capability"].as_str(),
1969            Some("8.9")
1970        );
1971        assert_eq!(
1972            effective["hardware_capabilities"]["compiled_features"]["vllm_moe_marlin"].as_bool(),
1973            Some(true)
1974        );
1975        assert_eq!(
1976            effective["workload_profile"]["target_concurrency"].as_u64(),
1977            Some(32)
1978        );
1979        assert_eq!(
1980            effective["workload_profile"]["priority"].as_str(),
1981            Some("throughput")
1982        );
1983
1984        let trace = resolved.decision_trace_jsonl().unwrap();
1985        let trace_decisions: Vec<AutoConfigDecision> = trace
1986            .lines()
1987            .map(|line| serde_json::from_str(line).unwrap())
1988            .collect();
1989        assert_eq!(trace_decisions, resolved.decisions);
1990        assert_eq!(
1991            serde_json::from_value::<Vec<AutoConfigDecision>>(effective["decisions"].clone())
1992                .unwrap(),
1993            trace_decisions
1994        );
1995
1996        for decision in &trace_decisions {
1997            assert_eq!(decision.schema_version, 1);
1998            assert!(!decision.selection.trim().is_empty());
1999            assert!(!decision.selected.trim().is_empty());
2000            assert!(!decision.candidates.is_empty());
2001            assert!(!decision.affects.is_empty());
2002            if let Some(source_key) = &decision.source_key {
2003                assert!(source_key.starts_with("FERRUM_"));
2004            }
2005            for rejected in &decision.rejected {
2006                assert!(!rejected.value.trim().is_empty());
2007                assert!(!rejected.reason.trim().is_empty());
2008            }
2009        }
2010    }
2011}