Skip to main content

ferrum_types/
auto_config.rs

1//! Startup auto-configuration and selector decision trace types.
2//!
3//! This is the typed control-plane surface for gradually replacing M3 shell
4//! env bundles with validated model/hardware/workload driven selections.
5
6use crate::{
7    parse_bool_env_value, parse_usize_env_value, RuntimeConfigEffect, RuntimeConfigEntry,
8    RuntimeConfigSnapshot, RuntimeConfigSource,
9};
10use serde::{Deserialize, Serialize};
11use std::collections::BTreeMap;
12use thiserror::Error;
13
/// Preset identifier carried by [`WorkloadProfile::m3_qwen3_30b_a3b_int4`].
pub const M3_QWEN3_30B_A3B_INT4_PRESET: &str = "m3_qwen3_30b_a3b_int4";
/// Preset identifier carried by
/// [`WorkloadProfile::qwen25_72b_gptq_int4_2x4090_layer_split`].
pub const QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET: &str =
    "qwen25_72b_gptq_int4_2x4090_layer_split";
// Tokens per KV block; multiplied with a KV block count to obtain the KV
// token capacity.
const DEFAULT_KV_BLOCK_SIZE_TOKENS: usize = 16;
// Default number of KV blocks.
const DEFAULT_KV_BLOCKS: usize = 2048;
// Number of bytes in one GiB.
const GIB: u64 = 1024 * 1024 * 1024;
20
/// Static description of a model that the auto-configuration logic uses when
/// picking kernels and sizing memory.
///
/// Every `Option` field is `None` when the value is not known (see
/// [`ModelCapabilities::unknown`]).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ModelCapabilities {
    /// Architecture identifier, e.g. `"qwen3_moe"`, `"qwen2"` or `"unknown"`.
    pub architecture: String,
    /// Quantization scheme identifier, e.g. `"gptq_int4"`.
    pub quantization: Option<String>,
    /// Mixture-of-experts parameters; `None` for models without MoE.
    pub moe: Option<MoeCapabilities>,
    /// Maximum context length.
    pub max_context_len: Option<usize>,
    /// Number of hidden layers.
    pub num_hidden_layers: Option<usize>,
    /// Attention head dimension.
    pub head_dim: Option<usize>,
    /// Number of key/value heads.
    pub kv_heads: Option<usize>,
    /// Estimated weight footprint in bytes.
    pub estimated_weight_bytes: Option<u64>,
    /// Dtype names the model supports, e.g. `"fp16"`.
    pub supported_dtypes: Vec<String>,
    /// Flag marking the model's MoE path as graph-safe.
    pub graph_safe_moe: bool,
}
34
35impl ModelCapabilities {
36    pub fn unknown() -> Self {
37        Self {
38            architecture: "unknown".to_string(),
39            quantization: None,
40            moe: None,
41            max_context_len: None,
42            num_hidden_layers: None,
43            head_dim: None,
44            kv_heads: None,
45            estimated_weight_bytes: None,
46            supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
47            graph_safe_moe: false,
48        }
49    }
50
51    pub fn qwen3_30b_a3b_gptq_int4() -> Self {
52        Self {
53            architecture: "qwen3_moe".to_string(),
54            quantization: Some("gptq_int4".to_string()),
55            moe: Some(MoeCapabilities {
56                num_experts: 128,
57                experts_per_token: 8,
58                moe_intermediate_size: Some(768),
59            }),
60            max_context_len: Some(40960),
61            num_hidden_layers: Some(48),
62            head_dim: Some(128),
63            kv_heads: Some(4),
64            // Conservative GPTQ int4 weight footprint including quant scales
65            // and loader/runtime overhead. This keeps the RTX 4090 M3 preset
66            // at the historical 2048 KV blocks while still allowing smaller
67            // GPUs to be downgraded before startup allocation.
68            estimated_weight_bytes: Some(18 * GIB),
69            supported_dtypes: vec!["fp16".to_string()],
70            graph_safe_moe: false,
71        }
72    }
73
74    pub fn qwen25_72b_gptq_int4() -> Self {
75        Self {
76            architecture: "qwen2".to_string(),
77            quantization: Some("gptq_int4".to_string()),
78            moe: None,
79            max_context_len: Some(32_768),
80            num_hidden_layers: Some(80),
81            head_dim: Some(128),
82            kv_heads: Some(8),
83            estimated_weight_bytes: Some(39 * GIB),
84            supported_dtypes: vec!["fp16".to_string()],
85            graph_safe_moe: false,
86        }
87    }
88}
89
/// Mixture-of-experts parameters of a model.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MoeCapabilities {
    /// Total number of experts.
    pub num_experts: usize,
    /// Number of experts selected per token.
    pub experts_per_token: usize,
    /// Intermediate size of the MoE layers, when known.
    pub moe_intermediate_size: Option<usize>,
}
96
/// Description of the hardware/backend that the auto-configuration logic
/// resolves against.
///
/// Every `Option` field is `None` when the value is not known (see
/// [`HardwareCapabilities::unknown`]).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct HardwareCapabilities {
    /// Backend name; compared case-insensitively against values such as
    /// `"cuda"`, `"metal"` and `"cpu"`.
    pub backend: String,
    /// CUDA runtime identifier, when known.
    pub cuda_runtime: Option<String>,
    /// Compute capability as a string, e.g. `"8.9"`.
    pub compute_capability: Option<String>,
    /// Device memory size in bytes.
    pub vram_bytes: Option<u64>,
    /// Number of streaming multiprocessors.
    pub sm_count: Option<u32>,
    /// Dtype names supported by the hardware, e.g. `"fp16"`.
    pub supported_dtypes: Vec<String>,
    /// Dtype names supported for the KV cache, e.g. `"fp16"`, `"int8"`.
    pub supported_kv_dtypes: Vec<String>,
    /// Whether graph execution is supported.
    pub graph_support: bool,
    /// Kernel features compiled into this build.
    pub compiled_features: CompiledKernelFeatures,
}
109
110impl HardwareCapabilities {
111    pub fn unknown() -> Self {
112        Self {
113            backend: "unknown".to_string(),
114            cuda_runtime: None,
115            compute_capability: None,
116            vram_bytes: None,
117            sm_count: None,
118            supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
119            supported_kv_dtypes: vec!["fp16".to_string()],
120            graph_support: false,
121            compiled_features: CompiledKernelFeatures::default(),
122        }
123    }
124
125    pub fn rtx4090_cuda(features: CompiledKernelFeatures) -> Self {
126        Self {
127            backend: "cuda".to_string(),
128            cuda_runtime: None,
129            compute_capability: Some("8.9".to_string()),
130            vram_bytes: Some(24 * 1024 * 1024 * 1024),
131            sm_count: Some(128),
132            supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
133            supported_kv_dtypes: vec!["fp16".to_string(), "int8".to_string()],
134            graph_support: true,
135            compiled_features: features,
136        }
137    }
138}
139
140#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
141pub struct CompiledKernelFeatures {
142    pub cuda: bool,
143    pub vllm_paged_attn: bool,
144    pub vllm_moe_marlin: bool,
145    pub cuda_graph: bool,
146    pub greedy_argmax: bool,
147    pub fa2_source: bool,
148    pub fa2_direct_ffi: bool,
149}
150
151impl Default for CompiledKernelFeatures {
152    fn default() -> Self {
153        Self {
154            cuda: false,
155            vllm_paged_attn: false,
156            vllm_moe_marlin: false,
157            cuda_graph: false,
158            greedy_argmax: false,
159            fa2_source: false,
160            fa2_direct_ffi: false,
161        }
162    }
163}
164
165impl CompiledKernelFeatures {
166    pub fn m3_fast_path_without_fa2() -> Self {
167        Self {
168            cuda: true,
169            vllm_paged_attn: true,
170            vllm_moe_marlin: true,
171            cuda_graph: true,
172            greedy_argmax: true,
173            fa2_source: false,
174            fa2_direct_ffi: false,
175        }
176    }
177
178    pub fn m3_fast_path_with_source_fa2() -> Self {
179        Self {
180            fa2_source: true,
181            ..Self::m3_fast_path_without_fa2()
182        }
183    }
184}
185
/// Description of the workload the server is being configured for.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct WorkloadProfile {
    /// Name of the preset this profile was built from, if any.
    pub preset: Option<String>,
    /// Serving mode label, e.g. `"openai_chat"` or `"bench_serve"`.
    pub serving_mode: String,
    /// Target number of concurrent sequences.
    pub target_concurrency: usize,
    /// Prompt length class label, e.g. `"random_256"` or `"unknown"`.
    pub prompt_length_class: String,
    /// Output length class label, e.g. `"random_128"` or `"unknown"`.
    pub output_length_class: String,
    /// Optimization priority of the workload.
    pub priority: WorkloadPriority,
}
195
196impl WorkloadProfile {
197    pub fn serving_default() -> Self {
198        Self {
199            preset: None,
200            serving_mode: "openai_chat".to_string(),
201            target_concurrency: 1,
202            prompt_length_class: "unknown".to_string(),
203            output_length_class: "unknown".to_string(),
204            priority: WorkloadPriority::Balanced,
205        }
206    }
207
208    pub fn serving_default_for_hardware(hardware: &HardwareCapabilities) -> Self {
209        let mut profile = Self::serving_default();
210        if hardware.backend.eq_ignore_ascii_case("cuda")
211            || hardware.backend.eq_ignore_ascii_case("metal")
212        {
213            profile.target_concurrency = hardware
214                .vram_bytes
215                .map(vram_default_max_sequences)
216                .unwrap_or(4)
217                .max(1);
218        }
219        profile
220    }
221
222    pub fn m3_qwen3_30b_a3b_int4() -> Self {
223        Self {
224            preset: Some(M3_QWEN3_30B_A3B_INT4_PRESET.to_string()),
225            serving_mode: "bench_serve".to_string(),
226            target_concurrency: 32,
227            prompt_length_class: "random_256".to_string(),
228            output_length_class: "random_128".to_string(),
229            priority: WorkloadPriority::Throughput,
230        }
231    }
232
233    pub fn qwen25_72b_gptq_int4_2x4090_layer_split() -> Self {
234        Self {
235            preset: Some(QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET.to_string()),
236            serving_mode: "bench_serve".to_string(),
237            target_concurrency: 16,
238            prompt_length_class: "random_256".to_string(),
239            output_length_class: "random_128".to_string(),
240            priority: WorkloadPriority::Throughput,
241        }
242    }
243
244    fn is_m3_preset(&self) -> bool {
245        self.is_preset(M3_QWEN3_30B_A3B_INT4_PRESET)
246    }
247
248    fn is_preset(&self, preset: &str) -> bool {
249        self.preset.as_deref() == Some(preset)
250    }
251}
252
/// The default workload profile is [`WorkloadProfile::serving_default`].
impl Default for WorkloadProfile {
    fn default() -> Self {
        Self::serving_default()
    }
}
258
/// Optimization priority of a workload. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum WorkloadPriority {
    /// Favor latency.
    Latency,
    /// Favor throughput.
    Throughput,
    /// No preference between latency and throughput.
    Balanced,
}
266
/// Result of [`FerrumConfigBuilder::resolve`]: the effective runtime config
/// together with the inputs and the decision trace that produced it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResolvedFerrumConfig {
    /// Schema version of this structure.
    pub schema_version: u32,
    /// Preset name copied from the workload profile, if any.
    pub preset: Option<String>,
    /// Runtime config snapshot after resolution.
    pub runtime_config: RuntimeConfigSnapshot,
    /// Model capabilities the resolution was based on.
    pub model_capabilities: ModelCapabilities,
    /// Hardware capabilities the resolution was based on.
    pub hardware_capabilities: HardwareCapabilities,
    /// Workload profile the resolution was based on.
    pub workload_profile: WorkloadProfile,
    /// Decisions recorded during resolution, in the order they were made.
    pub decisions: Vec<AutoConfigDecision>,
}
277
278impl ResolvedFerrumConfig {
279    pub fn effective_config_document(&self) -> serde_json::Value {
280        let backend = self.hardware_capabilities.backend.clone();
281        let requested_gpu_devices = self
282            .runtime_csv_usize("FERRUM_REQUESTED_GPU_DEVICES")
283            .or_else(|| default_gpu_devices_for_backend(&backend));
284        let selected_gpu_devices = self
285            .runtime_csv_usize("FERRUM_SELECTED_GPU_DEVICES")
286            .or_else(|| requested_gpu_devices.clone())
287            .or_else(|| default_gpu_devices_for_backend(&backend));
288        let cuda_device_count = self
289            .runtime_usize("FERRUM_CUDA_DEVICE_COUNT")
290            .or_else(|| {
291                backend.eq_ignore_ascii_case("cuda").then(|| {
292                    selected_gpu_devices
293                        .as_ref()
294                        .map(|devices| devices.len())
295                        .unwrap_or(1)
296                })
297            })
298            .unwrap_or(0);
299        let selected_distributed_strategy = self
300            .runtime_entry_value("FERRUM_SELECTED_DISTRIBUTED_STRATEGY")
301            .unwrap_or_else(|| {
302                if selected_gpu_devices
303                    .as_ref()
304                    .map(|devices| devices.len() > 1)
305                    .unwrap_or(false)
306                {
307                    "layer_split".to_string()
308                } else if backend.eq_ignore_ascii_case("cuda") {
309                    "single_gpu".to_string()
310                } else {
311                    "none".to_string()
312                }
313            });
314        let selected_layer_split_plan =
315            self.runtime_entry_value("FERRUM_SELECTED_LAYER_SPLIT_PLAN");
316        let selected_layer_split_stages =
317            self.runtime_json_value("FERRUM_SELECTED_LAYER_SPLIT_STAGES");
318        let selected_layer_split_stage_count = selected_layer_split_stages
319            .as_ref()
320            .and_then(|value| value.as_array().map(|stages| stages.len()))
321            .or_else(|| {
322                selected_layer_split_plan
323                    .as_ref()
324                    .and_then(|_| selected_gpu_devices.as_ref().map(Vec::len))
325            });
326        let requested_pipeline_mode = self.runtime_entry_value("FERRUM_LAYER_SPLIT_PIPELINE_MODE");
327        let selected_pipeline_mode = if selected_layer_split_plan.is_some() {
328            requested_pipeline_mode.unwrap_or_else(|| {
329                if selected_layer_split_stage_count == Some(2) {
330                    "overlapped".to_string()
331                } else {
332                    "batch".to_string()
333                }
334            })
335        } else {
336            "sequential".to_string()
337        };
338        let selected_max_sequences = self.selected_usize("max_sequences");
339        let selected_microbatch_size = if selected_layer_split_plan.is_some() {
340            selected_max_sequences.map(|max_sequences| {
341                if selected_pipeline_mode == "overlapped" {
342                    max_sequences.div_ceil(2).max(1)
343                } else {
344                    max_sequences
345                }
346            })
347        } else {
348            Some(1)
349        };
350        let selected_stage_bridge = selected_layer_split_plan.as_ref().map(|_| "host");
351        let selected_max_model_len = self.selected_usize("max_model_len");
352        let selected_kv_capacity = self.runtime_usize("FERRUM_KV_CAPACITY");
353        let selected_max_batched_tokens = self.selected_usize("max_batched_tokens");
354        serde_json::json!({
355            "schema_version": 1,
356            "preset": self.preset,
357            "env_hash": self.runtime_env_hash(),
358            "backend": backend.clone(),
359            "requested_gpu_devices": requested_gpu_devices.clone(),
360            "selected_gpu_devices": selected_gpu_devices.clone(),
361            "cuda_device_count": cuda_device_count,
362            "selected_distributed_strategy": selected_distributed_strategy.clone(),
363            "selected_layer_split_plan": selected_layer_split_plan.clone(),
364            "selected_layer_split_stages": selected_layer_split_stages,
365            "selected_pipeline_mode": selected_pipeline_mode,
366            "selected_microbatch_size": selected_microbatch_size,
367            "selected_stage_bridge": selected_stage_bridge,
368            "selected_weight_placement": if selected_layer_split_plan.is_some() { "layer_split" } else { "single_device" },
369            "selected_kv_layout": if backend.eq_ignore_ascii_case("cpu") { "contiguous" } else { "paged" },
370            "selected_attention_impl": self.selected_string("attention_decode_backend"),
371            "selected_graph_mode": self.selected_string("moe_graph_policy"),
372            "selected_max_sequences": selected_max_sequences,
373            "selected_max_model_len": selected_max_model_len,
374            "selected_kv_capacity": selected_kv_capacity,
375            "selected_max_batched_tokens": selected_max_batched_tokens,
376            "selected_admission_limit": selected_max_sequences,
377            "entries": self.runtime_config.entries,
378            "model_capabilities": self.model_capabilities,
379            "hardware_capabilities": self.hardware_capabilities,
380            "workload_profile": self.workload_profile,
381            "admission": self.admission_summary_document(),
382            "decisions": self.decisions,
383        })
384    }
385
386    pub fn admission_summary_document(&self) -> serde_json::Value {
387        let max_sequences = self.selected_usize("max_sequences");
388        let kv_blocks = self.selected_usize("kv_block_count");
389        let max_batched_tokens = self.selected_usize("max_batched_tokens");
390        let max_model_len = self.selected_usize("max_model_len");
391        let kv_capacity_tokens =
392            kv_blocks.map(|blocks| blocks.saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS));
393        let kv_bytes_per_token = kv_cache_bytes_per_token_for_model(&self.model_capabilities);
394        let scheduler_policy = self
395            .selected_string("scheduler_admission_policy")
396            .unwrap_or_else(|| "unknown".to_string());
397        serde_json::json!({
398            "schema_version": 1,
399            "backend": self.hardware_capabilities.backend,
400            "model_architecture": self.model_capabilities.architecture,
401            "scheduler_policy": scheduler_policy,
402            "effective_max_concurrent": max_sequences,
403            "queue_depth": 0u64,
404            "active_prefill": 0u64,
405            "active_decode": 0u64,
406            "current_batch_size": 0u64,
407            "rejected_requests_total": 0u64,
408            "failed_requests_total": 0u64,
409            "completed_requests_total": 0u64,
410            "max_sequences": max_sequences,
411            "kv_block_count": kv_blocks,
412            "kv_block_size_tokens": DEFAULT_KV_BLOCK_SIZE_TOKENS,
413            "kv_capacity_tokens": kv_capacity_tokens,
414            "max_model_length": max_model_len,
415            "max_batched_tokens": max_batched_tokens,
416            "memory_estimate": {
417                "vram_bytes": self.hardware_capabilities.vram_bytes,
418                "estimated_weight_bytes": self.model_capabilities.estimated_weight_bytes,
419                "kv_bytes_per_token": kv_bytes_per_token,
420                "kv_capacity_bytes": match (kv_capacity_tokens, kv_bytes_per_token) {
421                    (Some(tokens), Some(bytes_per_token)) => {
422                        (tokens as u64).checked_mul(bytes_per_token)
423                    }
424                    _ => None,
425                },
426            },
427        })
428    }
429
430    pub fn decision_trace_jsonl(&self) -> Result<String, serde_json::Error> {
431        let mut out = String::new();
432        for decision in &self.decisions {
433            out.push_str(&serde_json::to_string(decision)?);
434            out.push('\n');
435        }
436        Ok(out)
437    }
438
439    pub fn runtime_env_hash(&self) -> String {
440        use sha2::{Digest, Sha256};
441
442        let bytes = serde_json::to_vec(&self.runtime_config.entries).unwrap_or_default();
443        let digest = Sha256::digest(bytes);
444        format!("sha256:{digest:x}")
445    }
446
447    fn selected_usize(&self, selection: &str) -> Option<usize> {
448        self.selected_string(selection)?.parse().ok()
449    }
450
451    fn selected_string(&self, selection: &str) -> Option<String> {
452        self.decisions
453            .iter()
454            .find(|decision| decision.selection == selection)
455            .map(|decision| decision.selected.clone())
456    }
457
458    fn runtime_entry_value(&self, key: &str) -> Option<String> {
459        self.runtime_config
460            .entries
461            .iter()
462            .find(|entry| entry.key == key)
463            .map(|entry| entry.effective_value.clone())
464    }
465
466    fn runtime_usize(&self, key: &str) -> Option<usize> {
467        self.runtime_entry_value(key)?.parse().ok()
468    }
469
470    fn runtime_csv_usize(&self, key: &str) -> Option<Vec<usize>> {
471        let raw = self.runtime_entry_value(key)?;
472        let mut out = Vec::new();
473        for part in raw.split(',') {
474            let value = part.trim();
475            if value.is_empty() {
476                return None;
477            }
478            out.push(value.parse().ok()?);
479        }
480        Some(out)
481    }
482
483    fn runtime_json_value(&self, key: &str) -> Option<serde_json::Value> {
484        serde_json::from_str(&self.runtime_entry_value(key)?).ok()
485    }
486}
487
/// One entry of the auto-configuration decision trace.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AutoConfigDecision {
    /// Schema version of this record.
    pub schema_version: u32,
    /// Name of the selection this decision is about, e.g. `"max_sequences"`.
    pub selection: String,
    /// The selected value, rendered as a string.
    pub selected: String,
    /// Where the selected value came from.
    pub source: AutoConfigSource,
    /// Key associated with the source, if any.
    pub source_key: Option<String>,
    /// Candidate values that were considered.
    pub candidates: Vec<String>,
    /// Candidates that were rejected, each with a reason.
    pub rejected: Vec<RejectedCandidate>,
    /// Runtime config effects associated with this decision.
    pub affects: Vec<RuntimeConfigEffect>,
}
499
/// A candidate value that was not selected, together with the reason.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RejectedCandidate {
    /// The rejected candidate value.
    pub value: String,
    /// Why the candidate was rejected.
    pub reason: String,
}
505
/// Origin of a resolved value in the decision trace. Serialized in
/// `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AutoConfigSource {
    /// Built-in default.
    Default,
    /// Command-line argument.
    Cli,
    /// Configuration file.
    ConfigFile,
    /// Environment variable.
    Env,
    /// Script case.
    ScriptCase,
    /// Derived from model metadata.
    ModelMetadata,
    /// Derived from hardware capabilities.
    HardwareCapability,
    /// Derived from a memory profile.
    MemoryProfile,
    /// Derived from the workload preset.
    WorkloadPreset,
    /// Derived from a compiled-in feature.
    CompiledFeature,
}
520
/// Errors reported while resolving the auto-configuration.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
pub enum AutoConfigError {
    /// An override for `key` was present but invalid; `reason` explains why.
    #[error("{key}: invalid override: {reason}")]
    InvalidOverride { key: String, reason: String },
    /// The values chosen for `selection` form an unsupported combination;
    /// `reason` explains why.
    #[error("{selection}: unsupported combination: {reason}")]
    UnsupportedCombination { selection: String, reason: String },
}
528
529pub struct FerrumConfigBuilder {
530    runtime_config: RuntimeConfigSnapshot,
531    model: ModelCapabilities,
532    hardware: HardwareCapabilities,
533    workload: WorkloadProfile,
534}
535
536impl FerrumConfigBuilder {
537    pub fn new(runtime_config: RuntimeConfigSnapshot) -> Self {
538        Self {
539            runtime_config,
540            model: ModelCapabilities::unknown(),
541            hardware: HardwareCapabilities::unknown(),
542            workload: WorkloadProfile::default(),
543        }
544    }
545
546    pub fn m3_qwen3_30b_a3b_int4(runtime_config: RuntimeConfigSnapshot) -> Self {
547        Self::new(runtime_config)
548            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
549            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
550                CompiledKernelFeatures::m3_fast_path_without_fa2(),
551            ))
552            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
553    }
554
555    pub fn with_model_capabilities(mut self, model: ModelCapabilities) -> Self {
556        self.model = model;
557        self
558    }
559
560    pub fn with_hardware_capabilities(mut self, hardware: HardwareCapabilities) -> Self {
561        self.hardware = hardware;
562        self
563    }
564
565    pub fn with_workload_profile(mut self, workload: WorkloadProfile) -> Self {
566        self.workload = workload;
567        self
568    }
569
    /// Resolves the builder into a [`ResolvedFerrumConfig`].
    ///
    /// The steps run in this order:
    ///
    /// 1. Resolve every boolean and numeric knob, passing the computed
    ///    default and its source to `bool_value` / `usize_value` together
    ///    with the corresponding `FERRUM_*` key.
    /// 2. Validate the attention, MoE, memory, dtype, layer-split pipeline
    ///    and sampling combinations.
    /// 3. Write the resolved attention/MoE fast-path flags back into a copy
    ///    of the runtime config.
    /// 4. Record one decision per selection and assemble the result.
    ///
    /// # Errors
    ///
    /// Propagates any [`AutoConfigError`] returned by the value resolvers,
    /// the validators, or `scheduler_decision`.
    pub fn resolve(self) -> Result<ResolvedFerrumConfig, AutoConfigError> {
        let mut decisions = Vec::new();
        let cuda_backend = self.is_cuda_backend();
        // Any CUDA GPTQ/INT4 MoE model gets the vLLM-Marlin fast MoE path when
        // the kernel is compiled — not only the m3 bench preset. `ferrum run`
        // resolves with the serving-default workload (not the m3 preset), so
        // without this it silently fell back to the slow host-route MoE
        // (~9.7 vs ~59 tok/s on a 4090 for Qwen3-30B-A3B). Capability-gated,
        // never model-name-gated.
        let cuda_gptq_moe = cuda_backend
            && self.model.moe.is_some()
            && self.model.quantization.as_deref().is_some_and(|q| {
                let q = q.to_ascii_lowercase();
                q.contains("gptq") || q.contains("int4")
            });
        let cuda_qwen3_moe = cuda_backend
            && self.model.moe.is_some()
            && self.model.architecture.eq_ignore_ascii_case("qwen3_moe");
        // Paged attention defaults on for the m3 preset or a CUDA qwen3_moe
        // model, provided the kernel is compiled in.
        let use_vllm_paged_attn = self.bool_value(
            "FERRUM_USE_VLLM_PAGED_ATTN",
            (self.workload.is_m3_preset() || cuda_qwen3_moe)
                && cuda_backend
                && self.hardware.compiled_features.vllm_paged_attn,
            AutoConfigSource::WorkloadPreset,
        )?;
        let fa_layout =
            self.bool_value("FERRUM_FA_LAYOUT_VARLEN", false, AutoConfigSource::Default)?;
        let fa2_source = self.bool_value("FERRUM_FA2_SOURCE", false, AutoConfigSource::Default)?;
        // The direct-FFI path defaults on exactly when a shim entry is present.
        let shim_present = self.raw("FERRUM_FA2_DIRECT_FFI_SHIM").is_some();
        let fa2_direct_ffi = self.bool_value(
            "FERRUM_FA2_DIRECT_FFI",
            shim_present,
            if shim_present {
                AutoConfigSource::Env
            } else {
                AutoConfigSource::Default
            },
        )?;
        let vllm_v1_short = self.bool_value(
            "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
            use_vllm_paged_attn.value,
            AutoConfigSource::Default,
        )?;
        let vllm_moe = self.bool_value(
            "FERRUM_VLLM_MOE",
            (cuda_gptq_moe || (self.workload.is_m3_preset() && cuda_backend))
                && self.hardware.compiled_features.vllm_moe_marlin,
            AutoConfigSource::WorkloadPreset,
        )?;
        // Device routing and pair ids default to whatever vLLM MoE resolved to.
        let device_route = self.bool_value(
            "FERRUM_MOE_DEVICE_ROUTE",
            vllm_moe.value,
            AutoConfigSource::WorkloadPreset,
        )?;
        let pair_ids = self.bool_value(
            "FERRUM_VLLM_MOE_PAIR_IDS",
            vllm_moe.value,
            AutoConfigSource::WorkloadPreset,
        )?;
        let graph = self.bool_value("FERRUM_MOE_GRAPH", false, AutoConfigSource::WorkloadPreset)?;
        let greedy = self.bool_value(
            "FERRUM_GREEDY_ARGMAX",
            self.workload.is_m3_preset()
                && cuda_backend
                && self.hardware.compiled_features.greedy_argmax,
            AutoConfigSource::WorkloadPreset,
        )?;
        // Prefix cache defaults off either way; only the reported source differs.
        let prefix_cache = self.bool_value(
            "FERRUM_PREFIX_CACHE",
            false,
            if self.workload.is_m3_preset() {
                AutoConfigSource::WorkloadPreset
            } else {
                AutoConfigSource::Default
            },
        )?;
        // Memory knobs are resolved in dependency order: max sequences, then
        // KV blocks (which take max sequences as input), then max batched
        // tokens (which take both).
        let default_max_sequences = self.default_max_sequences();
        let max_sequences = self.usize_value(
            "FERRUM_PAGED_MAX_SEQS",
            default_max_sequences.value,
            default_max_sequences.source,
        )?;
        let default_kv_blocks = self.default_kv_blocks(&max_sequences);
        let kv_blocks = self.usize_value(
            "FERRUM_KV_MAX_BLOCKS",
            default_kv_blocks.value,
            default_kv_blocks.source,
        )?;
        let default_max_batched_tokens =
            self.default_max_batched_tokens(&max_sequences, &kv_blocks);
        let max_batched_tokens = self.usize_value(
            "FERRUM_MAX_BATCHED_TOKENS",
            default_max_batched_tokens.value,
            default_max_batched_tokens.source,
        )?;
        let max_model_len = self.optional_usize_value("FERRUM_MAX_MODEL_LEN")?;

        // All validation happens before any decision is recorded.
        self.validate_attention(
            use_vllm_paged_attn.value,
            fa_layout.value,
            fa2_source.value,
            fa2_direct_ffi.value,
            shim_present,
            vllm_v1_short.value,
        )?;
        self.validate_moe(
            vllm_moe.value,
            device_route.value,
            pair_ids.value,
            graph.value,
        )?;
        self.validate_memory(
            kv_blocks.value,
            max_sequences.value,
            max_batched_tokens.value,
            max_model_len.as_ref().map(|value| value.value),
        )?;
        self.validate_dtypes()?;
        self.validate_layer_split_pipeline_mode()?;
        self.validate_sampling(greedy.value)?;

        decisions.push(self.attention_prefill_decision(
            use_vllm_paged_attn.clone(),
            fa_layout,
            fa2_source,
            fa2_direct_ffi,
        ));
        decisions.push(
            self.attention_decode_decision(use_vllm_paged_attn.clone(), vllm_v1_short.clone()),
        );
        // Materialize the auto-resolved fast-path MoE knobs into the effective
        // config BEFORE moe_decision consumes them, so they reach the model
        // (which reads FERRUM_*, not the decisions). Only auto-derived values —
        // user/env entries are already present. Without this, `ferrum run`'s
        // non-preset path resolved FERRUM_VLLM_MOE as a decision only and the
        // model never saw it (~9.7 vs ~59 tok/s on a 4090 for Qwen3-30B-A3B).
        let mut runtime_config = self.runtime_config.clone();
        for (key, resolved) in [
            ("FERRUM_USE_VLLM_PAGED_ATTN", &use_vllm_paged_attn),
            ("FERRUM_VLLM_PAGED_ATTN_V1_SHORT", &vllm_v1_short),
            ("FERRUM_VLLM_MOE", &vllm_moe),
            ("FERRUM_MOE_DEVICE_ROUTE", &device_route),
            ("FERRUM_VLLM_MOE_PAIR_IDS", &pair_ids),
        ] {
            // NOTE(review): only `Env` is excluded here. If `bool_value`
            // reports other user-provided sources (e.g. `Cli`, `ConfigFile`),
            // those entries are upserted as well and re-tagged as
            // `MemoryProfile` — confirm that this is intended.
            if resolved.source != AutoConfigSource::Env {
                runtime_config.upsert(
                    key,
                    if resolved.value { "1" } else { "0" },
                    RuntimeConfigSource::MemoryProfile,
                );
            }
        }
        decisions.push(self.moe_decision(vllm_moe, device_route, pair_ids));
        decisions.push(self.graph_decision(graph));
        decisions.push(self.scalar_decision(
            "kv_block_count",
            kv_blocks,
            RuntimeConfigEffect::Memory,
        ));
        decisions.push(self.scalar_decision(
            "max_sequences",
            max_sequences,
            RuntimeConfigEffect::Memory,
        ));
        decisions.push(self.scalar_decision(
            "max_batched_tokens",
            max_batched_tokens,
            RuntimeConfigEffect::Performance,
        ));
        // `max_model_len` is only recorded when a value was resolved for it.
        if let Some(max_model_len) = max_model_len {
            decisions.push(self.scalar_decision(
                "max_model_len",
                max_model_len,
                RuntimeConfigEffect::Memory,
            ));
        }
        decisions.push(self.prefix_cache_decision(prefix_cache));
        decisions.push(self.scheduler_decision()?);
        decisions.push(self.sampling_decision(greedy));

        Ok(ResolvedFerrumConfig {
            schema_version: 1,
            preset: self.workload.preset.clone(),
            runtime_config,
            model_capabilities: self.model.clone(),
            hardware_capabilities: self.hardware.clone(),
            workload_profile: self.workload.clone(),
            decisions,
        })
    }
760
761    fn entries(&self) -> BTreeMap<&str, &str> {
762        self.runtime_config
763            .entries
764            .iter()
765            .map(|entry| (entry.key.as_str(), entry.effective_value.as_str()))
766            .collect()
767    }
768
769    fn raw(&self, key: &str) -> Option<&str> {
770        self.entry(key).map(|entry| entry.effective_value.as_str())
771    }
772
773    fn entry(&self, key: &str) -> Option<&RuntimeConfigEntry> {
774        self.runtime_config
775            .entries
776            .iter()
777            .find(|entry| entry.key == key)
778    }
779
780    fn source_for_key(&self, key: &str, default_source: AutoConfigSource) -> AutoConfigSource {
781        self.entry(key)
782            .map(|entry| auto_config_source_from_runtime(entry.source))
783            .unwrap_or(default_source)
784    }
785
786    fn is_cuda_backend(&self) -> bool {
787        self.hardware.backend.eq_ignore_ascii_case("cuda")
788    }
789
790    fn cuda_compute_capability_at_least(&self, major: u32, minor: u32) -> Option<bool> {
791        let (actual_major, actual_minor) =
792            parse_compute_capability(self.hardware.compute_capability.as_deref()?)?;
793        Some((actual_major, actual_minor) >= (major, minor))
794    }
795
796    fn default_max_sequences(&self) -> ResolvedValue<usize> {
797        let target = self.workload.target_concurrency.max(1);
798        let mut selected = target;
799        if self.workload.is_m3_preset() {
800            if let Some(sm_count) = self.hardware.sm_count {
801                // The M3 throughput preset assumes a large GPU. On smaller
802                // known GPUs, avoid auto-selecting a c32-sized admission
803                // window before memory profiling has a chance to refine KV.
804                selected = selected.min((sm_count as usize / 4).max(1));
805            }
806            if let Some(vram_bytes) = self.hardware.vram_bytes {
807                selected = selected.min(vram_default_max_sequences(vram_bytes));
808            }
809        }
810        ResolvedValue {
811            value: selected.max(1),
812            source: if selected < target {
813                AutoConfigSource::HardwareCapability
814            } else {
815                AutoConfigSource::WorkloadPreset
816            },
817            source_key: None,
818        }
819    }
820
821    fn default_max_batched_tokens(
822        &self,
823        max_sequences: &ResolvedValue<usize>,
824        kv_blocks: &ResolvedValue<usize>,
825    ) -> ResolvedValue<usize> {
826        let kv_token_capacity = kv_blocks
827            .value
828            .saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS)
829            .max(max_sequences.value.max(1));
830        let target = if self
831            .workload
832            .is_preset(QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET)
833        {
834            1536
835        } else {
836            max_sequences.value.max(1).saturating_mul(64)
837        };
838        let value = target
839            .min(kv_token_capacity)
840            .max(max_sequences.value.max(1));
841        ResolvedValue {
842            value,
843            source: if max_sequences.source == AutoConfigSource::HardwareCapability
844                || kv_blocks.source == AutoConfigSource::HardwareCapability
845            {
846                AutoConfigSource::HardwareCapability
847            } else {
848                AutoConfigSource::WorkloadPreset
849            },
850            source_key: None,
851        }
852    }
853
854    fn default_kv_blocks(&self, max_sequences: &ResolvedValue<usize>) -> ResolvedValue<usize> {
855        let min_blocks = ceil_div(max_sequences.value.max(1), DEFAULT_KV_BLOCK_SIZE_TOKENS);
856        if self
857            .workload
858            .is_preset(QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET)
859        {
860            return ResolvedValue {
861                value: 1024.max(min_blocks),
862                source: AutoConfigSource::WorkloadPreset,
863                source_key: None,
864            };
865        }
866        let target = DEFAULT_KV_BLOCKS.max(min_blocks);
867        let selected = match (
868            self.hardware.vram_bytes,
869            self.model.estimated_weight_bytes,
870            self.kv_cache_bytes_per_token(),
871        ) {
872            (Some(vram_bytes), Some(weight_bytes), Some(kv_bytes_per_token))
873                if kv_bytes_per_token > 0 =>
874            {
875                let headroom = (vram_bytes / 10).max(2 * GIB);
876                let available = vram_bytes.saturating_sub(weight_bytes.saturating_add(headroom));
877                let kv_token_budget = (available / kv_bytes_per_token) as usize;
878                let block_budget = kv_token_budget / DEFAULT_KV_BLOCK_SIZE_TOKENS;
879                target.min(block_budget.max(min_blocks))
880            }
881            _ => target,
882        };
883        ResolvedValue {
884            value: selected.max(1),
885            source: if selected < target {
886                AutoConfigSource::HardwareCapability
887            } else {
888                AutoConfigSource::WorkloadPreset
889            },
890            source_key: None,
891        }
892    }
893
894    fn kv_cache_bytes_per_token(&self) -> Option<u64> {
895        kv_cache_bytes_per_token_for_model(&self.model)
896    }
897
898    fn bool_value(
899        &self,
900        key: &str,
901        default: bool,
902        default_source: AutoConfigSource,
903    ) -> Result<ResolvedValue<bool>, AutoConfigError> {
904        match self.entry(key) {
905            Some(entry) => Ok(ResolvedValue {
906                value: parse_bool_env_value(&entry.effective_value).map_err(|reason| {
907                    AutoConfigError::InvalidOverride {
908                        key: key.to_string(),
909                        reason,
910                    }
911                })?,
912                source: auto_config_source_from_runtime(entry.source),
913                source_key: Some(key.to_string()),
914            }),
915            None => Ok(ResolvedValue {
916                value: default,
917                source: default_source,
918                source_key: None,
919            }),
920        }
921    }
922
923    fn usize_value(
924        &self,
925        key: &str,
926        default: usize,
927        default_source: AutoConfigSource,
928    ) -> Result<ResolvedValue<usize>, AutoConfigError> {
929        match self.entry(key) {
930            Some(entry) => Ok(ResolvedValue {
931                value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
932                    AutoConfigError::InvalidOverride {
933                        key: key.to_string(),
934                        reason,
935                    }
936                })?,
937                source: auto_config_source_from_runtime(entry.source),
938                source_key: Some(key.to_string()),
939            }),
940            None => Ok(ResolvedValue {
941                value: default,
942                source: default_source,
943                source_key: None,
944            }),
945        }
946    }
947
948    fn optional_usize_value(
949        &self,
950        key: &str,
951    ) -> Result<Option<ResolvedValue<usize>>, AutoConfigError> {
952        match self.entry(key) {
953            Some(entry) => Ok(Some(ResolvedValue {
954                value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
955                    AutoConfigError::InvalidOverride {
956                        key: key.to_string(),
957                        reason,
958                    }
959                })?,
960                source: auto_config_source_from_runtime(entry.source),
961                source_key: Some(key.to_string()),
962            })),
963            None => Ok(None),
964        }
965    }
966
967    fn validate_attention(
968        &self,
969        use_vllm_paged_attn: bool,
970        fa_layout: bool,
971        fa2_source: bool,
972        fa2_direct_ffi: bool,
973        shim_present: bool,
974        vllm_v1_short: bool,
975    ) -> Result<(), AutoConfigError> {
976        if use_vllm_paged_attn && !self.hardware.compiled_features.vllm_paged_attn {
977            return self.invalid(
978                "FERRUM_USE_VLLM_PAGED_ATTN",
979                "vLLM paged attention is not compiled",
980            );
981        }
982        if use_vllm_paged_attn && !self.is_cuda_backend() {
983            return self.invalid(
984                "FERRUM_USE_VLLM_PAGED_ATTN",
985                "vLLM paged attention requires CUDA backend",
986            );
987        }
988        if fa_layout && !use_vllm_paged_attn {
989            return self.invalid(
990                "FERRUM_FA_LAYOUT_VARLEN",
991                "FA layout requires vLLM paged attention layout",
992            );
993        }
994        if fa2_source && !self.hardware.compiled_features.fa2_source {
995            return self.invalid(
996                "FERRUM_FA2_SOURCE",
997                "source-linked FA2 support is not compiled",
998            );
999        }
1000        if fa2_source && !self.is_cuda_backend() {
1001            return self.invalid(
1002                "FERRUM_FA2_SOURCE",
1003                "source-linked FA2 requires CUDA backend",
1004            );
1005        }
1006        if fa2_source && !use_vllm_paged_attn {
1007            return self.invalid(
1008                "FERRUM_FA2_SOURCE",
1009                "source-linked FA2 requires vLLM paged attention layout",
1010            );
1011        }
1012        if fa2_source && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
1013            return self.invalid(
1014                "FERRUM_FA2_SOURCE",
1015                "source-linked FA2 requires CUDA compute capability >= 8.0",
1016            );
1017        }
1018        if fa2_direct_ffi && !self.hardware.compiled_features.fa2_direct_ffi {
1019            return self.invalid(
1020                "FERRUM_FA2_DIRECT_FFI",
1021                "direct FA2 FFI shim support is not compiled",
1022            );
1023        }
1024        if fa2_direct_ffi && !self.is_cuda_backend() {
1025            return self.invalid(
1026                "FERRUM_FA2_DIRECT_FFI",
1027                "direct FA2 FFI shim requires CUDA backend",
1028            );
1029        }
1030        if fa2_direct_ffi && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
1031            return self.invalid(
1032                "FERRUM_FA2_DIRECT_FFI",
1033                "direct FA2 FFI shim requires CUDA compute capability >= 8.0",
1034            );
1035        }
1036        if fa2_direct_ffi && !shim_present {
1037            return self.invalid(
1038                "FERRUM_FA2_DIRECT_FFI",
1039                "requires FERRUM_FA2_DIRECT_FFI_SHIM",
1040            );
1041        }
1042        if fa2_source && fa2_direct_ffi {
1043            return self.unsupported(
1044                "attention_prefill_mixed_backend",
1045                "FA2 source and direct FFI shim cannot both own the prefill path",
1046            );
1047        }
1048        if vllm_v1_short && !use_vllm_paged_attn {
1049            return self.invalid(
1050                "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
1051                "short-context v1 requires vLLM paged attention",
1052            );
1053        }
1054        Ok(())
1055    }
1056
1057    fn validate_moe(
1058        &self,
1059        vllm_moe: bool,
1060        device_route: bool,
1061        pair_ids: bool,
1062        graph: bool,
1063    ) -> Result<(), AutoConfigError> {
1064        if vllm_moe && !self.hardware.compiled_features.vllm_moe_marlin {
1065            return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE is not compiled");
1066        }
1067        if vllm_moe && !self.is_cuda_backend() {
1068            return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE requires CUDA backend");
1069        }
1070        if device_route && !vllm_moe {
1071            return self.invalid(
1072                "FERRUM_MOE_DEVICE_ROUTE",
1073                "device route currently requires vLLM MoE",
1074            );
1075        }
1076        if pair_ids && !vllm_moe {
1077            return self.invalid(
1078                "FERRUM_VLLM_MOE_PAIR_IDS",
1079                "pair-id routing requires vLLM MoE",
1080            );
1081        }
1082        let graph_relevant = self.model.moe.is_some() || self.workload.is_m3_preset();
1083        if graph && graph_relevant && !self.hardware.graph_support {
1084            return self.invalid(
1085                "FERRUM_MOE_GRAPH",
1086                "hardware/backend does not support CUDA graph replay",
1087            );
1088        }
1089        if graph && graph_relevant && !self.hardware.compiled_features.cuda_graph {
1090            return self.invalid("FERRUM_MOE_GRAPH", "CUDA graph support is not compiled");
1091        }
1092        if graph && graph_relevant && !vllm_moe {
1093            return self.invalid(
1094                "FERRUM_MOE_GRAPH",
1095                "graph decode requires the graph-clean vLLM MoE path",
1096            );
1097        }
1098        if graph && graph_relevant && self.model.moe.is_some() && !self.model.graph_safe_moe {
1099            return self.unsupported(
1100                "moe_graph_policy",
1101                "model MoE path is not marked graph-safe",
1102            );
1103        }
1104        Ok(())
1105    }
1106
1107    fn validate_sampling(&self, greedy: bool) -> Result<(), AutoConfigError> {
1108        if greedy && !self.hardware.compiled_features.greedy_argmax {
1109            return self.invalid("FERRUM_GREEDY_ARGMAX", "GPU argmax is not compiled");
1110        }
1111        if greedy
1112            && !(self.is_cuda_backend() || self.hardware.backend.eq_ignore_ascii_case("metal"))
1113        {
1114            return self.invalid(
1115                "FERRUM_GREEDY_ARGMAX",
1116                "greedy argmax requires CUDA or Metal backend",
1117            );
1118        }
1119        Ok(())
1120    }
1121
1122    fn validate_memory(
1123        &self,
1124        kv_blocks: usize,
1125        max_sequences: usize,
1126        max_batched_tokens: usize,
1127        requested_max_model_len: Option<usize>,
1128    ) -> Result<(), AutoConfigError> {
1129        if kv_blocks == 0 {
1130            return self.invalid("FERRUM_KV_MAX_BLOCKS", "must be greater than zero");
1131        }
1132        if max_sequences == 0 {
1133            return self.invalid("FERRUM_PAGED_MAX_SEQS", "must be greater than zero");
1134        }
1135        if max_batched_tokens < max_sequences {
1136            return self.invalid(
1137                "FERRUM_MAX_BATCHED_TOKENS",
1138                "must be at least FERRUM_PAGED_MAX_SEQS",
1139            );
1140        }
1141        let kv_token_capacity = kv_blocks.saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS);
1142        if max_batched_tokens > kv_token_capacity {
1143            return self.invalid(
1144                "FERRUM_MAX_BATCHED_TOKENS",
1145                "exceeds KV cache token capacity",
1146            );
1147        }
1148        if let Some(max_model_len) = requested_max_model_len {
1149            if max_model_len == 0 {
1150                return self.invalid("FERRUM_MAX_MODEL_LEN", "must be greater than zero");
1151            }
1152            if let Some(model_max) = self.model.max_context_len {
1153                if max_model_len > model_max {
1154                    return self.invalid(
1155                        "FERRUM_MAX_MODEL_LEN",
1156                        "exceeds model metadata max context length",
1157                    );
1158                }
1159            }
1160            if max_model_len > kv_token_capacity {
1161                return self.invalid(
1162                    "FERRUM_KV_MAX_BLOCKS",
1163                    "KV cache token capacity is smaller than FERRUM_MAX_MODEL_LEN",
1164                );
1165            }
1166        }
1167        Ok(())
1168    }
1169
1170    fn validate_dtypes(&self) -> Result<(), AutoConfigError> {
1171        if let Some(dtype) = self.raw("FERRUM_DTYPE") {
1172            let dtype = dtype.to_ascii_lowercase();
1173            if !self.hardware.supported_dtypes.iter().any(|d| d == &dtype) {
1174                return self.invalid("FERRUM_DTYPE", "dtype is not supported by hardware profile");
1175            }
1176        }
1177        if let Some(dtype) = self.raw("FERRUM_KV_DTYPE") {
1178            let dtype = dtype.to_ascii_lowercase();
1179            if !self
1180                .hardware
1181                .supported_kv_dtypes
1182                .iter()
1183                .any(|d| d == &dtype)
1184            {
1185                return self.invalid(
1186                    "FERRUM_KV_DTYPE",
1187                    "KV dtype is not supported by hardware profile",
1188                );
1189            }
1190        }
1191        Ok(())
1192    }
1193
1194    fn validate_layer_split_pipeline_mode(&self) -> Result<(), AutoConfigError> {
1195        let Some(mode) = self.raw("FERRUM_LAYER_SPLIT_PIPELINE_MODE") else {
1196            return Ok(());
1197        };
1198        match mode.trim().to_ascii_lowercase().as_str() {
1199            "batch" | "overlapped" => Ok(()),
1200            _ => self.invalid(
1201                "FERRUM_LAYER_SPLIT_PIPELINE_MODE",
1202                "must be batch or overlapped",
1203            ),
1204        }
1205    }
1206
1207    fn attention_prefill_decision(
1208        &self,
1209        use_vllm_paged_attn: ResolvedValue<bool>,
1210        fa_layout: ResolvedValue<bool>,
1211        fa2_source: ResolvedValue<bool>,
1212        fa2_direct_ffi: ResolvedValue<bool>,
1213    ) -> AutoConfigDecision {
1214        let (selected, source, source_key) = if fa2_source.value {
1215            ("fa2_source", fa2_source.source, fa2_source.source_key)
1216        } else if fa2_direct_ffi.value {
1217            (
1218                "fa2_direct_ffi",
1219                fa2_direct_ffi.source,
1220                fa2_direct_ffi.source_key,
1221            )
1222        } else if fa_layout.value {
1223            ("fa_layout_varlen", fa_layout.source, fa_layout.source_key)
1224        } else if use_vllm_paged_attn.value {
1225            (
1226                "vllm_paged_varlen",
1227                use_vllm_paged_attn.source,
1228                use_vllm_paged_attn.source_key,
1229            )
1230        } else {
1231            ("legacy_paged_varlen", AutoConfigSource::Default, None)
1232        };
1233        self.decision(
1234            "attention_prefill_mixed_backend",
1235            selected,
1236            source,
1237            source_key,
1238            [
1239                "fa2_source",
1240                "fa2_direct_ffi",
1241                "fa_layout_varlen",
1242                "vllm_paged_varlen",
1243                "legacy_paged_varlen",
1244            ],
1245            self.rejected_except(
1246                selected,
1247                [
1248                    ("fa2_source", "source-linked FA2 path not selected"),
1249                    ("fa2_direct_ffi", "diagnostic direct FFI shim not selected"),
1250                    ("fa_layout_varlen", "FA-compatible layout not selected"),
1251                    ("vllm_paged_varlen", "vLLM paged varlen bridge not selected"),
1252                    (
1253                        "legacy_paged_varlen",
1254                        "a higher-priority attention path was selected",
1255                    ),
1256                ],
1257            ),
1258            vec![
1259                RuntimeConfigEffect::Performance,
1260                RuntimeConfigEffect::Memory,
1261            ],
1262        )
1263    }
1264
1265    fn attention_decode_decision(
1266        &self,
1267        use_vllm_paged_attn: ResolvedValue<bool>,
1268        vllm_v1_short: ResolvedValue<bool>,
1269    ) -> AutoConfigDecision {
1270        let (selected, source, source_key) = if use_vllm_paged_attn.value {
1271            if vllm_v1_short.value {
1272                (
1273                    "vllm_paged_attn_v1_short",
1274                    vllm_v1_short.source,
1275                    vllm_v1_short.source_key,
1276                )
1277            } else {
1278                (
1279                    "vllm_paged_attn_v2",
1280                    vllm_v1_short.source,
1281                    vllm_v1_short.source_key,
1282                )
1283            }
1284        } else {
1285            ("legacy_paged_decode", use_vllm_paged_attn.source, None)
1286        };
1287        self.decision(
1288            "attention_decode_backend",
1289            selected,
1290            source,
1291            source_key,
1292            [
1293                "vllm_paged_attn_v1_short",
1294                "vllm_paged_attn_v2",
1295                "legacy_paged_decode",
1296            ],
1297            self.rejected_except(
1298                selected,
1299                [
1300                    (
1301                        "vllm_paged_attn_v1_short",
1302                        "short-context v1 decode not selected",
1303                    ),
1304                    ("vllm_paged_attn_v2", "v2 decode not selected"),
1305                    ("legacy_paged_decode", "legacy decode not selected"),
1306                ],
1307            ),
1308            vec![RuntimeConfigEffect::Performance],
1309        )
1310    }
1311
1312    fn moe_decision(
1313        &self,
1314        vllm_moe: ResolvedValue<bool>,
1315        device_route: ResolvedValue<bool>,
1316        pair_ids: ResolvedValue<bool>,
1317    ) -> AutoConfigDecision {
1318        let selected = if vllm_moe.value && device_route.value && pair_ids.value {
1319            "vllm_marlin_moe_device_route_pair_ids"
1320        } else if vllm_moe.value && device_route.value {
1321            "vllm_marlin_moe_device_route"
1322        } else if vllm_moe.value {
1323            "vllm_marlin_moe"
1324        } else {
1325            "legacy_moe"
1326        };
1327        self.decision(
1328            "moe_implementation",
1329            selected,
1330            vllm_moe.source,
1331            vllm_moe.source_key,
1332            [
1333                "vllm_marlin_moe_device_route_pair_ids",
1334                "vllm_marlin_moe_device_route",
1335                "vllm_marlin_moe",
1336                "legacy_moe",
1337            ],
1338            self.rejected_except(
1339                selected,
1340                [
1341                    (
1342                        "vllm_marlin_moe_device_route_pair_ids",
1343                        "pair-id device route not selected",
1344                    ),
1345                    (
1346                        "vllm_marlin_moe_device_route",
1347                        "device-route MoE not selected",
1348                    ),
1349                    ("vllm_marlin_moe", "vLLM Marlin MoE not selected"),
1350                    ("legacy_moe", "legacy MoE not selected"),
1351                ],
1352            ),
1353            vec![RuntimeConfigEffect::Performance],
1354        )
1355    }
1356
1357    fn graph_decision(&self, graph: ResolvedValue<bool>) -> AutoConfigDecision {
1358        let selected = if graph.value {
1359            "graph_clean_decode"
1360        } else {
1361            "graph_disabled"
1362        };
1363        self.decision(
1364            "moe_graph_policy",
1365            selected,
1366            graph.source,
1367            graph.source_key,
1368            ["graph_clean_decode", "graph_disabled"],
1369            self.rejected_except(
1370                selected,
1371                [
1372                    ("graph_clean_decode", "graph decode not selected"),
1373                    ("graph_disabled", "graph decode selected"),
1374                ],
1375            ),
1376            vec![
1377                RuntimeConfigEffect::Performance,
1378                RuntimeConfigEffect::Correctness,
1379            ],
1380        )
1381    }
1382
1383    fn scalar_decision(
1384        &self,
1385        selection: &str,
1386        value: ResolvedValue<usize>,
1387        effect: RuntimeConfigEffect,
1388    ) -> AutoConfigDecision {
1389        self.decision(
1390            selection,
1391            &value.value.to_string(),
1392            value.source,
1393            value.source_key,
1394            [value.value.to_string()],
1395            Vec::new(),
1396            vec![effect],
1397        )
1398    }
1399
1400    fn scheduler_decision(&self) -> Result<AutoConfigDecision, AutoConfigError> {
1401        let entries = self.entries();
1402        let mut selected = "continuous_default".to_string();
1403        let mut source_key = None;
1404        if let Some(chunk) = entries.get("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK") {
1405            parse_usize_env_value(chunk).map_err(|reason| AutoConfigError::InvalidOverride {
1406                key: "FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string(),
1407                reason,
1408            })?;
1409            selected = format!("active_decode_prefill_chunk:{chunk}");
1410            source_key = Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string());
1411        } else if let Some(until) = entries.get("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE") {
1412            parse_usize_env_value(until).map_err(|reason| AutoConfigError::InvalidOverride {
1413                key: "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string(),
1414                reason,
1415            })?;
1416            selected = format!("prefill_first_until_active:{until}");
1417            source_key = Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string());
1418        } else if self
1419            .bool_value(
1420                "FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE",
1421                false,
1422                AutoConfigSource::Default,
1423            )?
1424            .value
1425        {
1426            selected = "prompt_token_estimate".to_string();
1427            source_key = Some("FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE".to_string());
1428        }
1429        self.unsupported_if(
1430            source_key.as_deref() == Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
1431                && selected.ends_with(":0"),
1432            "scheduler_admission_policy",
1433            "active decode prefill chunk must be greater than zero",
1434        )?;
1435        Ok(self.decision(
1436            "scheduler_admission_policy",
1437            &selected,
1438            source_key
1439                .as_deref()
1440                .map(|key| self.source_for_key(key, AutoConfigSource::Default))
1441                .unwrap_or(AutoConfigSource::Default),
1442            source_key,
1443            [
1444                "continuous_default",
1445                "prompt_token_estimate",
1446                "prefill_first_until_active",
1447                "active_decode_prefill_chunk",
1448            ],
1449            Vec::new(),
1450            vec![RuntimeConfigEffect::Performance],
1451        ))
1452    }
1453
1454    fn prefix_cache_decision(&self, prefix_cache: ResolvedValue<bool>) -> AutoConfigDecision {
1455        let selected = if prefix_cache.value {
1456            "prefix_cache_enabled"
1457        } else {
1458            "prefix_cache_disabled"
1459        };
1460        self.decision(
1461            "prefix_cache_policy",
1462            selected,
1463            prefix_cache.source,
1464            prefix_cache.source_key,
1465            ["prefix_cache_enabled", "prefix_cache_disabled"],
1466            self.rejected_except(
1467                selected,
1468                [
1469                    ("prefix_cache_enabled", "prefix cache not selected"),
1470                    ("prefix_cache_disabled", "prefix cache enabled"),
1471                ],
1472            ),
1473            vec![
1474                RuntimeConfigEffect::Correctness,
1475                RuntimeConfigEffect::Performance,
1476                RuntimeConfigEffect::Memory,
1477            ],
1478        )
1479    }
1480
1481    fn sampling_decision(&self, greedy: ResolvedValue<bool>) -> AutoConfigDecision {
1482        let selected = if greedy.value {
1483            "gpu_greedy_argmax"
1484        } else {
1485            "logits_readback"
1486        };
1487        self.decision(
1488            "sampling_readback_path",
1489            selected,
1490            greedy.source,
1491            greedy.source_key,
1492            ["gpu_greedy_argmax", "logits_readback"],
1493            self.rejected_except(
1494                selected,
1495                [
1496                    ("gpu_greedy_argmax", "GPU argmax not selected"),
1497                    ("logits_readback", "logits readback not selected"),
1498                ],
1499            ),
1500            vec![
1501                RuntimeConfigEffect::Performance,
1502                RuntimeConfigEffect::Correctness,
1503            ],
1504        )
1505    }
1506
1507    fn decision<I, C>(
1508        &self,
1509        selection: &str,
1510        selected: &str,
1511        source: AutoConfigSource,
1512        source_key: Option<String>,
1513        candidates: I,
1514        rejected: Vec<RejectedCandidate>,
1515        affects: Vec<RuntimeConfigEffect>,
1516    ) -> AutoConfigDecision
1517    where
1518        I: IntoIterator<Item = C>,
1519        C: Into<String>,
1520    {
1521        AutoConfigDecision {
1522            schema_version: 1,
1523            selection: selection.to_string(),
1524            selected: selected.to_string(),
1525            source,
1526            source_key,
1527            candidates: candidates.into_iter().map(Into::into).collect(),
1528            rejected,
1529            affects,
1530        }
1531    }
1532
1533    fn rejected_except<I>(&self, selected: &str, candidates: I) -> Vec<RejectedCandidate>
1534    where
1535        I: IntoIterator<Item = (&'static str, &'static str)>,
1536    {
1537        candidates
1538            .into_iter()
1539            .filter(|(value, _)| *value != selected)
1540            .map(|(value, reason)| RejectedCandidate {
1541                value: value.to_string(),
1542                reason: reason.to_string(),
1543            })
1544            .collect()
1545    }
1546
1547    fn invalid<T>(&self, key: &str, reason: &str) -> Result<T, AutoConfigError> {
1548        Err(AutoConfigError::InvalidOverride {
1549            key: key.to_string(),
1550            reason: reason.to_string(),
1551        })
1552    }
1553
1554    fn unsupported<T>(&self, selection: &str, reason: &str) -> Result<T, AutoConfigError> {
1555        Err(AutoConfigError::UnsupportedCombination {
1556            selection: selection.to_string(),
1557            reason: reason.to_string(),
1558        })
1559    }
1560
1561    fn unsupported_if(
1562        &self,
1563        condition: bool,
1564        selection: &str,
1565        reason: &str,
1566    ) -> Result<(), AutoConfigError> {
1567        if condition {
1568            self.unsupported(selection, reason)
1569        } else {
1570            Ok(())
1571        }
1572    }
1573}
1574
1575fn kv_cache_bytes_per_token_for_model(model: &ModelCapabilities) -> Option<u64> {
1576    let layers = model.num_hidden_layers? as u64;
1577    let kv_heads = model.kv_heads? as u64;
1578    let head_dim = model.head_dim? as u64;
1579    layers
1580        .checked_mul(2)?
1581        .checked_mul(kv_heads)?
1582        .checked_mul(head_dim)?
1583        .checked_mul(2)
1584}
1585
1586#[derive(Debug, Clone, PartialEq, Eq)]
1587struct ResolvedValue<T> {
1588    value: T,
1589    source: AutoConfigSource,
1590    source_key: Option<String>,
1591}
1592
/// Parses a compute capability such as `"8.9"` into `(major, minor)`.
///
/// Surrounding whitespace is ignored, both around the whole string and around
/// each component. A value without a `.` is treated as `major` with minor `0`.
/// Returns `None` for empty input or when either component is not a `u32`.
fn parse_compute_capability(value: &str) -> Option<(u32, u32)> {
    let mut components = value.trim().splitn(2, '.');
    // An empty input yields an empty major component, which fails to parse.
    let major_text = components.next()?;
    let minor_text = components.next().unwrap_or("0");
    let major = major_text.trim().parse::<u32>().ok()?;
    let minor = minor_text.trim().parse::<u32>().ok()?;
    Some((major, minor))
}
1601
1602fn vram_default_max_sequences(vram_bytes: u64) -> usize {
1603    match vram_bytes {
1604        bytes if bytes >= 20 * GIB => 32,
1605        bytes if bytes >= 12 * GIB => 16,
1606        bytes if bytes >= 8 * GIB => 8,
1607        _ => 4,
1608    }
1609}
1610
/// Returns the default device list `[0]` for the `cuda` backend (matched
/// ASCII case-insensitively) and `None` for every other backend name.
fn default_gpu_devices_for_backend(backend: &str) -> Option<Vec<usize>> {
    if backend.eq_ignore_ascii_case("cuda") {
        Some(vec![0])
    } else {
        None
    }
}
1614
/// Divides `value` by `divisor`, rounding the quotient up.
///
/// # Panics
///
/// Panics when `divisor` is zero.
fn ceil_div(value: usize, divisor: usize) -> usize {
    usize::div_ceil(value, divisor)
}
1618
1619fn auto_config_source_from_runtime(source: RuntimeConfigSource) -> AutoConfigSource {
1620    match source {
1621        RuntimeConfigSource::Default => AutoConfigSource::Default,
1622        RuntimeConfigSource::ConfigFile => AutoConfigSource::ConfigFile,
1623        RuntimeConfigSource::Cli => AutoConfigSource::Cli,
1624        RuntimeConfigSource::Env => AutoConfigSource::Env,
1625        RuntimeConfigSource::ScriptCase => AutoConfigSource::ScriptCase,
1626        RuntimeConfigSource::MemoryProfile => AutoConfigSource::MemoryProfile,
1627    }
1628}
1629
1630#[cfg(test)]
1631mod tests {
1632    use super::*;
1633
1634    fn snapshot(vars: &[(&str, &str)]) -> RuntimeConfigSnapshot {
1635        RuntimeConfigSnapshot::from_env_vars(vars.iter().copied())
1636    }
1637
1638    fn snapshot_with_sources(vars: &[(&str, &str, RuntimeConfigSource)]) -> RuntimeConfigSnapshot {
1639        let mut entries: Vec<_> = vars
1640            .iter()
1641            .map(|(key, effective_value, source)| RuntimeConfigEntry {
1642                key: (*key).to_string(),
1643                effective_value: (*effective_value).to_string(),
1644                source: *source,
1645                affects: vec![RuntimeConfigEffect::Performance],
1646            })
1647            .collect();
1648        entries.sort_by(|a, b| a.key.cmp(&b.key));
1649        RuntimeConfigSnapshot { entries }
1650    }
1651
1652    fn m3(vars: &[(&str, &str)], features: CompiledKernelFeatures) -> FerrumConfigBuilder {
1653        FerrumConfigBuilder::new(snapshot(vars))
1654            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1655            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(features))
1656            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1657    }
1658
1659    fn m3_with_hardware(
1660        vars: &[(&str, &str)],
1661        hardware: HardwareCapabilities,
1662    ) -> FerrumConfigBuilder {
1663        FerrumConfigBuilder::new(snapshot(vars))
1664            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1665            .with_hardware_capabilities(hardware)
1666            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1667    }
1668
1669    fn qwen25_layer_split_runtime_entries(source: RuntimeConfigSource) -> RuntimeConfigSnapshot {
1670        snapshot_with_sources(&[
1671            ("FERRUM_REQUESTED_GPU_DEVICES", "0,1", source),
1672            ("FERRUM_SELECTED_GPU_DEVICES", "0,1", source),
1673            ("FERRUM_CUDA_DEVICE_COUNT", "2", source),
1674            (
1675                "FERRUM_SELECTED_DISTRIBUTED_STRATEGY",
1676                "layer_split",
1677                source,
1678            ),
1679            (
1680                "FERRUM_SELECTED_LAYER_SPLIT_PLAN",
1681                "stage0:cuda:0:layers=0-39;stage1:cuda:1:layers=40-79",
1682                source,
1683            ),
1684            ("FERRUM_LAYER_SPLIT_PIPELINE_MODE", "batch", source),
1685            ("FERRUM_MAX_MODEL_LEN", "4096", source),
1686            ("FERRUM_KV_MAX_BLOCKS", "1024", source),
1687            ("FERRUM_KV_CAPACITY", "1024", source),
1688            ("FERRUM_PAGED_MAX_SEQS", "16", source),
1689            ("FERRUM_MAX_BATCHED_TOKENS", "1536", source),
1690            ("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE", "16", source),
1691        ])
1692    }
1693
1694    fn expect_invalid_key(vars: &[(&str, &str)], key: &str) {
1695        expect_invalid_key_with_features(
1696            vars,
1697            key,
1698            CompiledKernelFeatures::m3_fast_path_without_fa2(),
1699        );
1700    }
1701
1702    fn expect_invalid_key_with_features(
1703        vars: &[(&str, &str)],
1704        key: &str,
1705        features: CompiledKernelFeatures,
1706    ) {
1707        expect_invalid_key_with_hardware(vars, key, HardwareCapabilities::rtx4090_cuda(features));
1708    }
1709
1710    fn expect_invalid_key_with_hardware(
1711        vars: &[(&str, &str)],
1712        key: &str,
1713        hardware: HardwareCapabilities,
1714    ) {
1715        let err = m3_with_hardware(vars, hardware)
1716            .resolve()
1717            .expect_err("override should fail");
1718        match err {
1719            AutoConfigError::InvalidOverride { key: actual, .. } => assert_eq!(actual, key),
1720            other => panic!("expected invalid override for {key}, got {other:?}"),
1721        }
1722    }
1723
1724    fn cpu_hardware_with_features(features: CompiledKernelFeatures) -> HardwareCapabilities {
1725        HardwareCapabilities {
1726            backend: "cpu".to_string(),
1727            supported_dtypes: vec!["fp32".to_string()],
1728            supported_kv_dtypes: vec!["fp16".to_string()],
1729            compiled_features: features,
1730            ..HardwareCapabilities::unknown()
1731        }
1732    }
1733
1734    #[test]
1735    fn m3_preset_selects_current_safe_fast_path_without_fa2() {
1736        let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
1737            .resolve()
1738            .unwrap();
1739        let decisions: BTreeMap<_, _> = resolved
1740            .decisions
1741            .iter()
1742            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1743            .collect();
1744        assert_eq!(
1745            decisions["attention_prefill_mixed_backend"],
1746            "vllm_paged_varlen"
1747        );
1748        assert_eq!(
1749            decisions["attention_decode_backend"],
1750            "vllm_paged_attn_v1_short"
1751        );
1752        assert_eq!(
1753            decisions["moe_implementation"],
1754            "vllm_marlin_moe_device_route_pair_ids"
1755        );
1756        assert_eq!(decisions["moe_graph_policy"], "graph_disabled");
1757        assert_eq!(decisions["prefix_cache_policy"], "prefix_cache_disabled");
1758        assert_eq!(decisions["sampling_readback_path"], "gpu_greedy_argmax");
1759        assert_eq!(
1760            resolved.preset.as_deref(),
1761            Some(M3_QWEN3_30B_A3B_INT4_PRESET)
1762        );
1763    }
1764
1765    #[test]
1766    fn cuda_gptq_moe_enables_vllm_marlin_without_m3_preset() {
1767        // `ferrum run` resolves with the serving-default workload, NOT the m3
1768        // bench preset, so the old `is_m3_preset()`-gated FERRUM_VLLM_MOE never
1769        // fired and the 30B fell back to the slow host-route MoE (~9.7 vs ~59
1770        // tok/s on a 4090). A CUDA GPTQ MoE must get the vLLM-Marlin fast path
1771        // on capability alone.
1772        let hardware =
1773            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1774        let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
1775        let resolved = FerrumConfigBuilder::new(snapshot(&[]))
1776            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1777            .with_hardware_capabilities(hardware)
1778            .with_workload_profile(workload)
1779            .resolve()
1780            .unwrap();
1781        let decisions: BTreeMap<_, _> = resolved
1782            .decisions
1783            .iter()
1784            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1785            .collect();
1786        assert_ne!(
1787            resolved.preset.as_deref(),
1788            Some(M3_QWEN3_30B_A3B_INT4_PRESET),
1789            "serving-default workload must not be the m3 preset"
1790        );
1791        assert_eq!(
1792            decisions["moe_implementation"], "vllm_marlin_moe_device_route_pair_ids",
1793            "CUDA GPTQ MoE should get the fast vLLM-Marlin path without the m3 preset"
1794        );
1795        // The decision is not enough — the model reads FERRUM_VLLM_MOE from the
1796        // effective config, not the decisions. The resolved knob must be a
1797        // runtime_config entry so `ferrum run`'s materialize/apply propagates it.
1798        let entry = resolved
1799            .runtime_config
1800            .entries
1801            .iter()
1802            .find(|e| e.key == "FERRUM_VLLM_MOE");
1803        assert_eq!(
1804            entry.map(|e| e.effective_value.as_str()),
1805            Some("1"),
1806            "resolved FERRUM_VLLM_MOE must be materialized into the effective config"
1807        );
1808    }
1809
1810    #[test]
1811    fn cuda_qwen3_moe_enables_vllm_paged_attn_without_m3_preset() {
1812        // `ferrum run` and ordinary `serve` use the serving-default workload,
1813        // not the m3 preset. Qwen3-MoE on CUDA with the VPA kernel compiled
1814        // must still select and materialize the paged-attention runtime knob,
1815        // otherwise the effective config/decision trace says "legacy" while
1816        // the model runtime can take the VPA path through its own defaults.
1817        let hardware =
1818            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1819        let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
1820        let resolved = FerrumConfigBuilder::new(snapshot(&[]))
1821            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1822            .with_hardware_capabilities(hardware)
1823            .with_workload_profile(workload)
1824            .resolve()
1825            .unwrap();
1826        let decisions: BTreeMap<_, _> = resolved
1827            .decisions
1828            .iter()
1829            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1830            .collect();
1831        assert_eq!(
1832            decisions["attention_decode_backend"], "vllm_paged_attn_v1_short",
1833            "CUDA Qwen3-MoE should get VPA decode without the m3 preset"
1834        );
1835        let entry = |key: &str| {
1836            resolved
1837                .runtime_config
1838                .entries
1839                .iter()
1840                .find(|entry| entry.key == key)
1841                .unwrap_or_else(|| panic!("missing runtime config entry {key}"))
1842        };
1843        assert_eq!(entry("FERRUM_USE_VLLM_PAGED_ATTN").effective_value, "1");
1844        assert_eq!(
1845            entry("FERRUM_VLLM_PAGED_ATTN_V1_SHORT").effective_value,
1846            "1"
1847        );
1848    }
1849
1850    #[test]
1851    fn cuda_qwen3_moe_vllm_paged_attn_env_opt_out_is_materialized() {
1852        let hardware =
1853            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1854        let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
1855        let resolved = FerrumConfigBuilder::new(snapshot(&[("FERRUM_USE_VLLM_PAGED_ATTN", "0")]))
1856            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1857            .with_hardware_capabilities(hardware)
1858            .with_workload_profile(workload)
1859            .resolve()
1860            .unwrap();
1861        let decisions: BTreeMap<_, _> = resolved
1862            .decisions
1863            .iter()
1864            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1865            .collect();
1866        assert_eq!(decisions["attention_decode_backend"], "legacy_paged_decode");
1867        let entry = resolved
1868            .runtime_config
1869            .entries
1870            .iter()
1871            .find(|entry| entry.key == "FERRUM_USE_VLLM_PAGED_ATTN")
1872            .expect("env opt-out should stay in effective config");
1873        assert_eq!(entry.effective_value, "0");
1874        assert_eq!(entry.source, RuntimeConfigSource::Env);
1875    }
1876
1877    #[test]
1878    fn qwen25_72b_layer_split_preset_selects_batch_tuned_defaults() {
1879        let resolved = FerrumConfigBuilder::new(qwen25_layer_split_runtime_entries(
1880            RuntimeConfigSource::Default,
1881        ))
1882        .with_model_capabilities(ModelCapabilities::qwen25_72b_gptq_int4())
1883        .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
1884            CompiledKernelFeatures::m3_fast_path_without_fa2(),
1885        ))
1886        .with_workload_profile(WorkloadProfile::qwen25_72b_gptq_int4_2x4090_layer_split())
1887        .resolve()
1888        .unwrap();
1889        let decision = |selection: &str| {
1890            resolved
1891                .decisions
1892                .iter()
1893                .find(|decision| decision.selection == selection)
1894                .unwrap_or_else(|| panic!("missing decision {selection}"))
1895        };
1896
1897        assert_eq!(
1898            resolved.preset.as_deref(),
1899            Some(QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET)
1900        );
1901        assert_eq!(decision("kv_block_count").selected, "1024");
1902        assert_eq!(decision("max_sequences").selected, "16");
1903        assert_eq!(decision("max_batched_tokens").selected, "1536");
1904        assert_eq!(decision("max_model_len").selected, "4096");
1905        assert_eq!(
1906            decision("scheduler_admission_policy").selected,
1907            "prefill_first_until_active:16"
1908        );
1909        assert_eq!(
1910            decision("scheduler_admission_policy").source,
1911            AutoConfigSource::Default
1912        );
1913
1914        let doc = resolved.effective_config_document();
1915        assert_eq!(doc["selected_pipeline_mode"], "batch");
1916        assert_eq!(doc["selected_microbatch_size"], 16);
1917        assert_eq!(doc["selected_kv_capacity"], 1024);
1918    }
1919
1920    #[test]
1921    fn source_fa2_selects_source_linked_attention_when_compiled() {
1922        let resolved = m3(
1923            &[("FERRUM_FA2_SOURCE", "1")],
1924            CompiledKernelFeatures::m3_fast_path_with_source_fa2(),
1925        )
1926        .resolve()
1927        .unwrap();
1928        let decisions: BTreeMap<_, _> = resolved
1929            .decisions
1930            .iter()
1931            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1932            .collect();
1933
1934        assert_eq!(decisions["attention_prefill_mixed_backend"], "fa2_source");
1935    }
1936
1937    #[test]
1938    fn source_fa2_is_rejected_when_not_compiled() {
1939        expect_invalid_key(&[("FERRUM_FA2_SOURCE", "1")], "FERRUM_FA2_SOURCE");
1940    }
1941
1942    #[test]
1943    fn hardware_capabilities_keep_m3_preset_on_compatible_backend_paths() {
1944        let resolved = m3_with_hardware(
1945            &[],
1946            cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2()),
1947        )
1948        .resolve()
1949        .unwrap();
1950        let decisions: BTreeMap<_, _> = resolved
1951            .decisions
1952            .iter()
1953            .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1954            .collect();
1955
1956        assert_eq!(
1957            decisions["attention_prefill_mixed_backend"],
1958            "legacy_paged_varlen"
1959        );
1960        assert_eq!(decisions["attention_decode_backend"], "legacy_paged_decode");
1961        assert_eq!(decisions["moe_implementation"], "legacy_moe");
1962        assert_eq!(decisions["moe_graph_policy"], "graph_disabled");
1963        assert_eq!(decisions["sampling_readback_path"], "logits_readback");
1964    }
1965
1966    #[test]
1967    fn effective_config_document_records_cuda_gpu_device_selection() {
1968        let resolved = FerrumConfigBuilder::new(snapshot_with_sources(&[
1969            (
1970                "FERRUM_REQUESTED_GPU_DEVICES",
1971                "0,1",
1972                RuntimeConfigSource::Cli,
1973            ),
1974            (
1975                "FERRUM_SELECTED_GPU_DEVICES",
1976                "0,1",
1977                RuntimeConfigSource::Cli,
1978            ),
1979            ("FERRUM_CUDA_DEVICE_COUNT", "2", RuntimeConfigSource::Cli),
1980            (
1981                "FERRUM_SELECTED_DISTRIBUTED_STRATEGY",
1982                "layer_split",
1983                RuntimeConfigSource::Cli,
1984            ),
1985            (
1986                "FERRUM_SELECTED_LAYER_SPLIT_PLAN",
1987                "stage0:cuda:0:layers=0-39;stage1:cuda:1:layers=40-79",
1988                RuntimeConfigSource::Cli,
1989            ),
1990            (
1991                "FERRUM_SELECTED_LAYER_SPLIT_STAGES",
1992                r#"[{"stage":0,"device":0,"layer_start":0,"layer_end":39},{"stage":1,"device":1,"layer_start":40,"layer_end":79}]"#,
1993                RuntimeConfigSource::Cli,
1994            ),
1995            ("FERRUM_KV_CAPACITY", "512", RuntimeConfigSource::Cli),
1996        ]))
1997        .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
1998            CompiledKernelFeatures::m3_fast_path_without_fa2(),
1999        ))
2000        .resolve()
2001        .unwrap();
2002
2003        let doc = resolved.effective_config_document();
2004        assert_eq!(doc["backend"], "cuda");
2005        assert_eq!(doc["requested_gpu_devices"], serde_json::json!([0, 1]));
2006        assert_eq!(doc["selected_gpu_devices"], serde_json::json!([0, 1]));
2007        assert_eq!(doc["cuda_device_count"], 2);
2008        assert_eq!(doc["selected_distributed_strategy"], "layer_split");
2009        assert_eq!(
2010            doc["selected_layer_split_plan"],
2011            "stage0:cuda:0:layers=0-39;stage1:cuda:1:layers=40-79"
2012        );
2013        assert_eq!(
2014            doc["selected_layer_split_stages"],
2015            serde_json::json!([
2016                {"stage": 0, "device": 0, "layer_start": 0, "layer_end": 39},
2017                {"stage": 1, "device": 1, "layer_start": 40, "layer_end": 79}
2018            ])
2019        );
2020        assert_eq!(doc["selected_weight_placement"], "layer_split");
2021        assert_eq!(doc["selected_pipeline_mode"], "overlapped");
2022        assert_eq!(doc["selected_stage_bridge"], "host");
2023        assert_eq!(
2024            doc["selected_microbatch_size"],
2025            serde_json::json!(doc["selected_max_sequences"].as_u64().unwrap().div_ceil(2))
2026        );
2027        assert_eq!(
2028            doc["selected_admission_limit"],
2029            doc["selected_max_sequences"]
2030        );
2031        assert_eq!(doc["selected_kv_capacity"], 512);
2032    }
2033
2034    #[test]
2035    fn effective_config_document_honors_explicit_layer_split_batch_mode() {
2036        let resolved = FerrumConfigBuilder::new(snapshot_with_sources(&[
2037            (
2038                "FERRUM_REQUESTED_GPU_DEVICES",
2039                "0,1",
2040                RuntimeConfigSource::Cli,
2041            ),
2042            (
2043                "FERRUM_SELECTED_GPU_DEVICES",
2044                "0,1",
2045                RuntimeConfigSource::Cli,
2046            ),
2047            (
2048                "FERRUM_SELECTED_DISTRIBUTED_STRATEGY",
2049                "layer_split",
2050                RuntimeConfigSource::Cli,
2051            ),
2052            (
2053                "FERRUM_SELECTED_LAYER_SPLIT_PLAN",
2054                "stage0:cuda:0:layers=0-39;stage1:cuda:1:layers=40-79",
2055                RuntimeConfigSource::Cli,
2056            ),
2057            (
2058                "FERRUM_LAYER_SPLIT_PIPELINE_MODE",
2059                "batch",
2060                RuntimeConfigSource::Cli,
2061            ),
2062            ("FERRUM_PAGED_MAX_SEQS", "16", RuntimeConfigSource::Cli),
2063        ]))
2064        .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
2065            CompiledKernelFeatures::m3_fast_path_without_fa2(),
2066        ))
2067        .resolve()
2068        .unwrap();
2069
2070        let doc = resolved.effective_config_document();
2071        assert_eq!(doc["selected_pipeline_mode"], "batch");
2072        assert_eq!(doc["selected_microbatch_size"], 16);
2073    }
2074
2075    #[test]
2076    fn invalid_layer_split_pipeline_mode_is_rejected() {
2077        expect_invalid_key_with_hardware(
2078            &[("FERRUM_LAYER_SPLIT_PIPELINE_MODE", "serial")],
2079            "FERRUM_LAYER_SPLIT_PIPELINE_MODE",
2080            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2()),
2081        );
2082    }
2083
2084    #[test]
2085    fn hardware_incompatible_attention_and_sampling_overrides_are_rejected() {
2086        let cpu =
2087            cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2());
2088        expect_invalid_key_with_hardware(
2089            &[("FERRUM_USE_VLLM_PAGED_ATTN", "1")],
2090            "FERRUM_USE_VLLM_PAGED_ATTN",
2091            cpu.clone(),
2092        );
2093        expect_invalid_key_with_hardware(
2094            &[("FERRUM_VLLM_MOE", "1")],
2095            "FERRUM_VLLM_MOE",
2096            cpu.clone(),
2097        );
2098        expect_invalid_key_with_hardware(
2099            &[("FERRUM_GREEDY_ARGMAX", "1")],
2100            "FERRUM_GREEDY_ARGMAX",
2101            cpu.clone(),
2102        );
2103        expect_invalid_key_with_hardware(&[("FERRUM_FA2_SOURCE", "1")], "FERRUM_FA2_SOURCE", cpu);
2104
2105        let mut old_cuda = HardwareCapabilities::rtx4090_cuda(
2106            CompiledKernelFeatures::m3_fast_path_with_source_fa2(),
2107        );
2108        old_cuda.compute_capability = Some("7.5".to_string());
2109        expect_invalid_key_with_hardware(
2110            &[("FERRUM_FA2_SOURCE", "1")],
2111            "FERRUM_FA2_SOURCE",
2112            old_cuda,
2113        );
2114    }
2115
2116    #[test]
2117    fn hardware_capacity_sizes_default_sequence_budget_without_overriding_user_values() {
2118        let mut small_gpu =
2119            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
2120        small_gpu.sm_count = Some(16);
2121        small_gpu.vram_bytes = Some(24 * 1024 * 1024 * 1024);
2122
2123        let resolved = m3_with_hardware(&[], small_gpu.clone()).resolve().unwrap();
2124        let decision = |selection: &str| {
2125            resolved
2126                .decisions
2127                .iter()
2128                .find(|decision| decision.selection == selection)
2129                .unwrap()
2130        };
2131        let max_sequences = decision("max_sequences");
2132        assert_eq!(max_sequences.selected, "4");
2133        assert_eq!(max_sequences.source, AutoConfigSource::HardwareCapability);
2134        let max_batched_tokens = decision("max_batched_tokens");
2135        assert_eq!(max_batched_tokens.selected, "256");
2136        assert_eq!(
2137            max_batched_tokens.source,
2138            AutoConfigSource::HardwareCapability
2139        );
2140
2141        let resolved = m3_with_hardware(&[("FERRUM_PAGED_MAX_SEQS", "16")], small_gpu)
2142            .resolve()
2143            .unwrap();
2144        let max_sequences = resolved
2145            .decisions
2146            .iter()
2147            .find(|decision| decision.selection == "max_sequences")
2148            .unwrap();
2149        assert_eq!(max_sequences.selected, "16");
2150        assert_eq!(max_sequences.source, AutoConfigSource::Env);
2151        assert_eq!(
2152            max_sequences.source_key.as_deref(),
2153            Some("FERRUM_PAGED_MAX_SEQS")
2154        );
2155    }
2156
2157    #[test]
2158    fn vram_capacity_caps_m3_default_sequence_budget() {
2159        let mut low_vram_gpu =
2160            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
2161        low_vram_gpu.sm_count = Some(128);
2162        low_vram_gpu.vram_bytes = Some(7 * 1024 * 1024 * 1024);
2163
2164        let resolved = m3_with_hardware(&[], low_vram_gpu).resolve().unwrap();
2165        let max_sequences = resolved
2166            .decisions
2167            .iter()
2168            .find(|decision| decision.selection == "max_sequences")
2169            .unwrap();
2170        assert_eq!(max_sequences.selected, "4");
2171        assert_eq!(max_sequences.source, AutoConfigSource::HardwareCapability);
2172    }
2173
2174    #[test]
2175    fn memory_budget_keeps_rtx4090_m3_kv_blocks_but_caps_constrained_vram() {
2176        let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
2177            .resolve()
2178            .unwrap();
2179        let decision = |selection: &str| {
2180            resolved
2181                .decisions
2182                .iter()
2183                .find(|decision| decision.selection == selection)
2184                .unwrap()
2185        };
2186        assert_eq!(decision("kv_block_count").selected, "2048");
2187        assert_eq!(
2188            decision("kv_block_count").source,
2189            AutoConfigSource::WorkloadPreset
2190        );
2191
2192        let mut constrained =
2193            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
2194        constrained.vram_bytes = Some(20 * 1024 * 1024 * 1024);
2195        let resolved = m3_with_hardware(&[], constrained).resolve().unwrap();
2196        let decision = |selection: &str| {
2197            resolved
2198                .decisions
2199                .iter()
2200                .find(|decision| decision.selection == selection)
2201                .unwrap()
2202        };
2203        assert_eq!(decision("kv_block_count").selected, "2");
2204        assert_eq!(
2205            decision("kv_block_count").source,
2206            AutoConfigSource::HardwareCapability
2207        );
2208        assert_eq!(decision("max_batched_tokens").selected, "32");
2209        assert_eq!(
2210            decision("max_batched_tokens").source,
2211            AutoConfigSource::HardwareCapability
2212        );
2213    }
2214
2215    #[test]
2216    fn compute_capability_parser_accepts_major_minor_and_major_only() {
2217        assert_eq!(parse_compute_capability("8.9"), Some((8, 9)));
2218        assert_eq!(parse_compute_capability("9"), Some((9, 0)));
2219        assert_eq!(parse_compute_capability("N/A"), None);
2220    }
2221
2222    #[test]
2223    fn vram_capacity_tiers_are_monotonic() {
2224        assert_eq!(vram_default_max_sequences(24 * 1024 * 1024 * 1024), 32);
2225        assert_eq!(vram_default_max_sequences(16 * 1024 * 1024 * 1024), 16);
2226        assert_eq!(vram_default_max_sequences(8 * 1024 * 1024 * 1024), 8);
2227        assert_eq!(vram_default_max_sequences(6 * 1024 * 1024 * 1024), 4);
2228    }
2229
2230    #[test]
2231    fn accelerator_serving_default_uses_hardware_concurrency_budget() {
2232        let hardware =
2233            HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
2234        let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
2235        assert_eq!(workload.target_concurrency, 32);
2236
2237        let resolved = FerrumConfigBuilder::new(snapshot(&[]))
2238            .with_model_capabilities(ModelCapabilities::unknown())
2239            .with_hardware_capabilities(hardware)
2240            .with_workload_profile(workload)
2241            .resolve()
2242            .unwrap();
2243        let max_sequences = resolved
2244            .decisions
2245            .iter()
2246            .find(|decision| decision.selection == "max_sequences")
2247            .unwrap();
2248        assert_eq!(max_sequences.selected, "32");
2249    }
2250
2251    #[test]
2252    fn cpu_serving_default_keeps_single_sequence_budget() {
2253        let hardware = HardwareCapabilities {
2254            backend: "cpu".to_string(),
2255            supported_dtypes: vec!["fp32".to_string()],
2256            ..HardwareCapabilities::unknown()
2257        };
2258        let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
2259        assert_eq!(workload.target_concurrency, 1);
2260    }
2261
2262    #[test]
2263    fn validates_invalid_override_matrix() {
2264        expect_invalid_key(
2265            &[("FERRUM_USE_VLLM_PAGED_ATTN", "maybe")],
2266            "FERRUM_USE_VLLM_PAGED_ATTN",
2267        );
2268        expect_invalid_key(&[("FERRUM_PREFIX_CACHE", "maybe")], "FERRUM_PREFIX_CACHE");
2269        expect_invalid_key(
2270            &[
2271                ("FERRUM_FA_LAYOUT_VARLEN", "1"),
2272                ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
2273            ],
2274            "FERRUM_FA_LAYOUT_VARLEN",
2275        );
2276        expect_invalid_key(&[("FERRUM_FA2_DIRECT_FFI", "1")], "FERRUM_FA2_DIRECT_FFI");
2277        expect_invalid_key_with_features(
2278            &[("FERRUM_VLLM_MOE", "1")],
2279            "FERRUM_VLLM_MOE",
2280            CompiledKernelFeatures::default(),
2281        );
2282        expect_invalid_key(
2283            &[("FERRUM_MOE_DEVICE_ROUTE", "1"), ("FERRUM_VLLM_MOE", "0")],
2284            "FERRUM_MOE_DEVICE_ROUTE",
2285        );
2286        expect_invalid_key(
2287            &[("FERRUM_VLLM_MOE_PAIR_IDS", "1"), ("FERRUM_VLLM_MOE", "0")],
2288            "FERRUM_VLLM_MOE_PAIR_IDS",
2289        );
2290        expect_invalid_key(
2291            &[("FERRUM_MOE_GRAPH", "1"), ("FERRUM_VLLM_MOE", "0")],
2292            "FERRUM_MOE_GRAPH",
2293        );
2294        expect_invalid_key(&[("FERRUM_KV_MAX_BLOCKS", "0")], "FERRUM_KV_MAX_BLOCKS");
2295        expect_invalid_key(&[("FERRUM_PAGED_MAX_SEQS", "0")], "FERRUM_PAGED_MAX_SEQS");
2296        expect_invalid_key(
2297            &[
2298                ("FERRUM_PAGED_MAX_SEQS", "32"),
2299                ("FERRUM_MAX_BATCHED_TOKENS", "16"),
2300            ],
2301            "FERRUM_MAX_BATCHED_TOKENS",
2302        );
2303        expect_invalid_key(
2304            &[
2305                ("FERRUM_KV_MAX_BLOCKS", "16"),
2306                ("FERRUM_MAX_BATCHED_TOKENS", "512"),
2307            ],
2308            "FERRUM_MAX_BATCHED_TOKENS",
2309        );
2310        expect_invalid_key(&[("FERRUM_MAX_MODEL_LEN", "0")], "FERRUM_MAX_MODEL_LEN");
2311        expect_invalid_key(&[("FERRUM_MAX_MODEL_LEN", "50000")], "FERRUM_MAX_MODEL_LEN");
2312        expect_invalid_key(
2313            &[
2314                ("FERRUM_KV_MAX_BLOCKS", "16"),
2315                ("FERRUM_MAX_MODEL_LEN", "1024"),
2316            ],
2317            "FERRUM_KV_MAX_BLOCKS",
2318        );
2319        expect_invalid_key(&[("FERRUM_DTYPE", "bf16")], "FERRUM_DTYPE");
2320        expect_invalid_key(&[("FERRUM_KV_DTYPE", "fp8")], "FERRUM_KV_DTYPE");
2321        expect_invalid_key(
2322            &[
2323                ("FERRUM_VLLM_PAGED_ATTN_V1_SHORT", "1"),
2324                ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
2325            ],
2326            "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
2327        );
2328    }
2329
2330    #[test]
2331    fn requested_max_model_len_is_optional_and_reflected_when_valid() {
2332        let default_resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
2333            .resolve()
2334            .unwrap();
2335        assert!(!default_resolved
2336            .decisions
2337            .iter()
2338            .any(|decision| decision.selection == "max_model_len"));
2339
2340        let resolved = m3(
2341            &[
2342                ("FERRUM_KV_MAX_BLOCKS", "64"),
2343                ("FERRUM_MAX_MODEL_LEN", "1024"),
2344            ],
2345            CompiledKernelFeatures::m3_fast_path_without_fa2(),
2346        )
2347        .resolve()
2348        .unwrap();
2349        let max_model_len = resolved
2350            .decisions
2351            .iter()
2352            .find(|decision| decision.selection == "max_model_len")
2353            .unwrap();
2354        assert_eq!(max_model_len.selected, "1024");
2355        assert_eq!(
2356            max_model_len.source_key.as_deref(),
2357            Some("FERRUM_MAX_MODEL_LEN")
2358        );
2359    }
2360
2361    #[test]
2362    fn graph_enabled_with_graph_unsafe_moe_is_rejected() {
2363        let mut model = ModelCapabilities::qwen3_30b_a3b_gptq_int4();
2364        model.graph_safe_moe = false;
2365        let err = FerrumConfigBuilder::new(snapshot(&[("FERRUM_MOE_GRAPH", "1")]))
2366            .with_model_capabilities(model)
2367            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
2368                CompiledKernelFeatures::m3_fast_path_without_fa2(),
2369            ))
2370            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
2371            .resolve()
2372            .expect_err("graph unsafe MoE must fail");
2373        assert!(matches!(
2374            err,
2375            AutoConfigError::UnsupportedCombination {
2376                selection,
2377                ..
2378            } if selection == "moe_graph_policy"
2379        ));
2380    }
2381
2382    #[test]
2383    fn scheduler_override_is_reflected_in_decision_trace() {
2384        let resolved = m3(
2385            &[("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK", "64")],
2386            CompiledKernelFeatures::m3_fast_path_without_fa2(),
2387        )
2388        .resolve()
2389        .unwrap();
2390        let scheduler = resolved
2391            .decisions
2392            .iter()
2393            .find(|decision| decision.selection == "scheduler_admission_policy")
2394            .unwrap();
2395        assert_eq!(scheduler.selected, "active_decode_prefill_chunk:64");
2396        assert_eq!(
2397            scheduler.source_key.as_deref(),
2398            Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
2399        );
2400    }
2401
2402    #[test]
2403    fn prefix_cache_override_is_reflected_in_decision_trace() {
2404        let resolved = m3(
2405            &[("FERRUM_PREFIX_CACHE", "1")],
2406            CompiledKernelFeatures::m3_fast_path_without_fa2(),
2407        )
2408        .resolve()
2409        .unwrap();
2410        let prefix_cache = resolved
2411            .decisions
2412            .iter()
2413            .find(|decision| decision.selection == "prefix_cache_policy")
2414            .unwrap();
2415        assert_eq!(prefix_cache.selected, "prefix_cache_enabled");
2416        assert_eq!(
2417            prefix_cache.source_key.as_deref(),
2418            Some("FERRUM_PREFIX_CACHE")
2419        );
2420    }
2421
2422    #[test]
2423    fn non_env_runtime_sources_are_preserved_in_decision_trace() {
2424        let runtime_config = snapshot_with_sources(&[
2425            (
2426                "FERRUM_FA_LAYOUT_VARLEN",
2427                "1",
2428                RuntimeConfigSource::ConfigFile,
2429            ),
2430            ("FERRUM_PAGED_MAX_SEQS", "48", RuntimeConfigSource::Cli),
2431            (
2432                "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE",
2433                "32",
2434                RuntimeConfigSource::ScriptCase,
2435            ),
2436        ]);
2437        let resolved = FerrumConfigBuilder::new(runtime_config)
2438            .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
2439            .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
2440                CompiledKernelFeatures::m3_fast_path_without_fa2(),
2441            ))
2442            .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
2443            .resolve()
2444            .unwrap();
2445
2446        let decision = |selection: &str| {
2447            resolved
2448                .decisions
2449                .iter()
2450                .find(|decision| decision.selection == selection)
2451                .unwrap()
2452        };
2453        let attention = decision("attention_prefill_mixed_backend");
2454        assert_eq!(attention.selected, "fa_layout_varlen");
2455        assert_eq!(attention.source, AutoConfigSource::ConfigFile);
2456        assert_eq!(
2457            attention.source_key.as_deref(),
2458            Some("FERRUM_FA_LAYOUT_VARLEN")
2459        );
2460
2461        let max_sequences = decision("max_sequences");
2462        assert_eq!(max_sequences.selected, "48");
2463        assert_eq!(max_sequences.source, AutoConfigSource::Cli);
2464        assert_eq!(
2465            max_sequences.source_key.as_deref(),
2466            Some("FERRUM_PAGED_MAX_SEQS")
2467        );
2468
2469        let scheduler = decision("scheduler_admission_policy");
2470        assert_eq!(scheduler.selected, "prefill_first_until_active:32");
2471        assert_eq!(scheduler.source, AutoConfigSource::ScriptCase);
2472        assert_eq!(
2473            scheduler.source_key.as_deref(),
2474            Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE")
2475        );
2476    }
2477
2478    #[test]
2479    fn renders_effective_config_and_decision_trace_artifacts() {
2480        let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
2481            .resolve()
2482            .unwrap();
2483        let effective = resolved.effective_config_document();
2484        assert_eq!(effective["schema_version"], 1);
2485        assert!(effective["env_hash"]
2486            .as_str()
2487            .unwrap()
2488            .starts_with("sha256:"));
2489        assert!(effective["entries"].is_array());
2490        assert_eq!(effective["model_capabilities"]["architecture"], "qwen3_moe");
2491        assert_eq!(effective["hardware_capabilities"]["backend"], "cuda");
2492        assert_eq!(
2493            effective["workload_profile"]["preset"],
2494            M3_QWEN3_30B_A3B_INT4_PRESET
2495        );
2496        assert_eq!(
2497            effective["decisions"].as_array().unwrap().len(),
2498            resolved.decisions.len()
2499        );
2500        let trace = resolved.decision_trace_jsonl().unwrap();
2501        assert_eq!(trace.lines().count(), resolved.decisions.len());
2502        assert!(trace.contains("\"attention_prefill_mixed_backend\""));
2503    }
2504
2505    #[test]
2506    fn auto_config_artifacts_match_locked_schema_shape() {
2507        let resolved = FerrumConfigBuilder::m3_qwen3_30b_a3b_int4(snapshot_with_sources(&[
2508            (
2509                "FERRUM_FA_LAYOUT_VARLEN",
2510                "1",
2511                RuntimeConfigSource::ScriptCase,
2512            ),
2513            ("FERRUM_PAGED_MAX_SEQS", "32", RuntimeConfigSource::Cli),
2514        ]))
2515        .resolve()
2516        .unwrap();
2517
2518        let effective = resolved.effective_config_document();
2519        assert_eq!(effective["schema_version"], 1);
2520        assert!(effective["env_hash"]
2521            .as_str()
2522            .unwrap()
2523            .starts_with("sha256:"));
2524
2525        let entries = effective["entries"].as_array().unwrap();
2526        let keys: Vec<_> = entries
2527            .iter()
2528            .map(|entry| entry["key"].as_str().unwrap())
2529            .collect();
2530        let mut sorted_keys = keys.clone();
2531        sorted_keys.sort_unstable();
2532        assert_eq!(keys, sorted_keys);
2533        for entry in entries {
2534            assert!(entry["key"].as_str().unwrap().starts_with("FERRUM_"));
2535            assert!(entry["effective_value"].is_string());
2536            assert!(matches!(
2537                entry["source"].as_str().unwrap(),
2538                "default" | "config_file" | "cli" | "env" | "script_case" | "memory_profile"
2539            ));
2540            assert!(!entry["affects"].as_array().unwrap().is_empty());
2541        }
2542        assert_eq!(
2543            effective["model_capabilities"]["quantization"].as_str(),
2544            Some("gptq_int4")
2545        );
2546        assert_eq!(
2547            effective["model_capabilities"]["moe"]["experts_per_token"].as_u64(),
2548            Some(8)
2549        );
2550        assert_eq!(
2551            effective["hardware_capabilities"]["compute_capability"].as_str(),
2552            Some("8.9")
2553        );
2554        assert_eq!(
2555            effective["hardware_capabilities"]["compiled_features"]["vllm_moe_marlin"].as_bool(),
2556            Some(true)
2557        );
2558        assert_eq!(
2559            effective["workload_profile"]["target_concurrency"].as_u64(),
2560            Some(32)
2561        );
2562        assert_eq!(
2563            effective["workload_profile"]["priority"].as_str(),
2564            Some("throughput")
2565        );
2566        let admission = &effective["admission"];
2567        for field in [
2568            "effective_max_concurrent",
2569            "queue_depth",
2570            "active_prefill",
2571            "active_decode",
2572            "current_batch_size",
2573            "rejected_requests_total",
2574            "failed_requests_total",
2575            "completed_requests_total",
2576        ] {
2577            assert!(admission[field].is_number(), "admission.{field} missing");
2578        }
2579
2580        let trace = resolved.decision_trace_jsonl().unwrap();
2581        let trace_decisions: Vec<AutoConfigDecision> = trace
2582            .lines()
2583            .map(|line| serde_json::from_str(line).unwrap())
2584            .collect();
2585        assert_eq!(trace_decisions, resolved.decisions);
2586        assert_eq!(
2587            serde_json::from_value::<Vec<AutoConfigDecision>>(effective["decisions"].clone())
2588                .unwrap(),
2589            trace_decisions
2590        );
2591
2592        for decision in &trace_decisions {
2593            assert_eq!(decision.schema_version, 1);
2594            assert!(!decision.selection.trim().is_empty());
2595            assert!(!decision.selected.trim().is_empty());
2596            assert!(!decision.candidates.is_empty());
2597            assert!(!decision.affects.is_empty());
2598            if let Some(source_key) = &decision.source_key {
2599                assert!(source_key.starts_with("FERRUM_"));
2600            }
2601            for rejected in &decision.rejected {
2602                assert!(!rejected.value.trim().is_empty());
2603                assert!(!rejected.reason.trim().is_empty());
2604            }
2605        }
2606    }
2607}