1use crate::{
7 parse_bool_env_value, parse_usize_env_value, RuntimeConfigEffect, RuntimeConfigEntry,
8 RuntimeConfigSnapshot, RuntimeConfigSource,
9};
10use serde::{Deserialize, Serialize};
11use std::collections::BTreeMap;
12use thiserror::Error;
13
/// Preset name identifying the M3 Qwen3-30B-A3B INT4 workload profile.
pub const M3_QWEN3_30B_A3B_INT4_PRESET: &str = "m3_qwen3_30b_a3b_int4";
/// Tokens per KV-cache block used when converting block counts to token capacity.
const DEFAULT_KV_BLOCK_SIZE_TOKENS: usize = 16;
/// KV-cache block count targeted when neither an override nor a memory limit applies.
const DEFAULT_KV_BLOCKS: usize = 2048;
/// One gibibyte (2^30 bytes).
const GIB: u64 = 1 << 30;
18
19#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
20pub struct ModelCapabilities {
21 pub architecture: String,
22 pub quantization: Option<String>,
23 pub moe: Option<MoeCapabilities>,
24 pub max_context_len: Option<usize>,
25 pub num_hidden_layers: Option<usize>,
26 pub head_dim: Option<usize>,
27 pub kv_heads: Option<usize>,
28 pub estimated_weight_bytes: Option<u64>,
29 pub supported_dtypes: Vec<String>,
30 pub graph_safe_moe: bool,
31}
32
33impl ModelCapabilities {
34 pub fn unknown() -> Self {
35 Self {
36 architecture: "unknown".to_string(),
37 quantization: None,
38 moe: None,
39 max_context_len: None,
40 num_hidden_layers: None,
41 head_dim: None,
42 kv_heads: None,
43 estimated_weight_bytes: None,
44 supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
45 graph_safe_moe: false,
46 }
47 }
48
49 pub fn qwen3_30b_a3b_gptq_int4() -> Self {
50 Self {
51 architecture: "qwen3_moe".to_string(),
52 quantization: Some("gptq_int4".to_string()),
53 moe: Some(MoeCapabilities {
54 num_experts: 128,
55 experts_per_token: 8,
56 moe_intermediate_size: Some(768),
57 }),
58 max_context_len: Some(40960),
59 num_hidden_layers: Some(48),
60 head_dim: Some(128),
61 kv_heads: Some(4),
62 estimated_weight_bytes: Some(18 * GIB),
67 supported_dtypes: vec!["fp16".to_string()],
68 graph_safe_moe: false,
69 }
70 }
71}
72
73#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
74pub struct MoeCapabilities {
75 pub num_experts: usize,
76 pub experts_per_token: usize,
77 pub moe_intermediate_size: Option<usize>,
78}
79
80#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
81pub struct HardwareCapabilities {
82 pub backend: String,
83 pub cuda_runtime: Option<String>,
84 pub compute_capability: Option<String>,
85 pub vram_bytes: Option<u64>,
86 pub sm_count: Option<u32>,
87 pub supported_dtypes: Vec<String>,
88 pub supported_kv_dtypes: Vec<String>,
89 pub graph_support: bool,
90 pub compiled_features: CompiledKernelFeatures,
91}
92
93impl HardwareCapabilities {
94 pub fn unknown() -> Self {
95 Self {
96 backend: "unknown".to_string(),
97 cuda_runtime: None,
98 compute_capability: None,
99 vram_bytes: None,
100 sm_count: None,
101 supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
102 supported_kv_dtypes: vec!["fp16".to_string()],
103 graph_support: false,
104 compiled_features: CompiledKernelFeatures::default(),
105 }
106 }
107
108 pub fn rtx4090_cuda(features: CompiledKernelFeatures) -> Self {
109 Self {
110 backend: "cuda".to_string(),
111 cuda_runtime: None,
112 compute_capability: Some("8.9".to_string()),
113 vram_bytes: Some(24 * 1024 * 1024 * 1024),
114 sm_count: Some(128),
115 supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
116 supported_kv_dtypes: vec!["fp16".to_string(), "int8".to_string()],
117 graph_support: true,
118 compiled_features: features,
119 }
120 }
121}
122
123#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
124pub struct CompiledKernelFeatures {
125 pub cuda: bool,
126 pub vllm_paged_attn: bool,
127 pub vllm_moe_marlin: bool,
128 pub cuda_graph: bool,
129 pub greedy_argmax: bool,
130 pub fa2_source: bool,
131 pub fa2_direct_ffi: bool,
132}
133
134impl Default for CompiledKernelFeatures {
135 fn default() -> Self {
136 Self {
137 cuda: false,
138 vllm_paged_attn: false,
139 vllm_moe_marlin: false,
140 cuda_graph: false,
141 greedy_argmax: false,
142 fa2_source: false,
143 fa2_direct_ffi: false,
144 }
145 }
146}
147
148impl CompiledKernelFeatures {
149 pub fn m3_fast_path_without_fa2() -> Self {
150 Self {
151 cuda: true,
152 vllm_paged_attn: true,
153 vllm_moe_marlin: true,
154 cuda_graph: true,
155 greedy_argmax: true,
156 fa2_source: false,
157 fa2_direct_ffi: false,
158 }
159 }
160
161 pub fn m3_fast_path_with_source_fa2() -> Self {
162 Self {
163 fa2_source: true,
164 ..Self::m3_fast_path_without_fa2()
165 }
166 }
167}
168
169#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
170pub struct WorkloadProfile {
171 pub preset: Option<String>,
172 pub serving_mode: String,
173 pub target_concurrency: usize,
174 pub prompt_length_class: String,
175 pub output_length_class: String,
176 pub priority: WorkloadPriority,
177}
178
179impl WorkloadProfile {
180 pub fn serving_default() -> Self {
181 Self {
182 preset: None,
183 serving_mode: "openai_chat".to_string(),
184 target_concurrency: 1,
185 prompt_length_class: "unknown".to_string(),
186 output_length_class: "unknown".to_string(),
187 priority: WorkloadPriority::Balanced,
188 }
189 }
190
191 pub fn serving_default_for_hardware(hardware: &HardwareCapabilities) -> Self {
192 let mut profile = Self::serving_default();
193 if hardware.backend.eq_ignore_ascii_case("cuda")
194 || hardware.backend.eq_ignore_ascii_case("metal")
195 {
196 profile.target_concurrency = hardware
197 .vram_bytes
198 .map(vram_default_max_sequences)
199 .unwrap_or(4)
200 .max(1);
201 }
202 profile
203 }
204
205 pub fn m3_qwen3_30b_a3b_int4() -> Self {
206 Self {
207 preset: Some(M3_QWEN3_30B_A3B_INT4_PRESET.to_string()),
208 serving_mode: "bench_serve".to_string(),
209 target_concurrency: 32,
210 prompt_length_class: "random_256".to_string(),
211 output_length_class: "random_128".to_string(),
212 priority: WorkloadPriority::Throughput,
213 }
214 }
215
216 fn is_m3_preset(&self) -> bool {
217 self.preset.as_deref() == Some(M3_QWEN3_30B_A3B_INT4_PRESET)
218 }
219}
220
221impl Default for WorkloadProfile {
222 fn default() -> Self {
223 Self::serving_default()
224 }
225}
226
227#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
228#[serde(rename_all = "snake_case")]
229pub enum WorkloadPriority {
230 Latency,
231 Throughput,
232 Balanced,
233}
234
235#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
236pub struct ResolvedFerrumConfig {
237 pub schema_version: u32,
238 pub preset: Option<String>,
239 pub runtime_config: RuntimeConfigSnapshot,
240 pub model_capabilities: ModelCapabilities,
241 pub hardware_capabilities: HardwareCapabilities,
242 pub workload_profile: WorkloadProfile,
243 pub decisions: Vec<AutoConfigDecision>,
244}
245
246impl ResolvedFerrumConfig {
247 pub fn effective_config_document(&self) -> serde_json::Value {
248 serde_json::json!({
249 "schema_version": 1,
250 "preset": self.preset,
251 "env_hash": self.runtime_env_hash(),
252 "entries": self.runtime_config.entries,
253 "model_capabilities": self.model_capabilities,
254 "hardware_capabilities": self.hardware_capabilities,
255 "workload_profile": self.workload_profile,
256 "decisions": self.decisions,
257 })
258 }
259
260 pub fn decision_trace_jsonl(&self) -> Result<String, serde_json::Error> {
261 let mut out = String::new();
262 for decision in &self.decisions {
263 out.push_str(&serde_json::to_string(decision)?);
264 out.push('\n');
265 }
266 Ok(out)
267 }
268
269 pub fn runtime_env_hash(&self) -> String {
270 use sha2::{Digest, Sha256};
271
272 let bytes = serde_json::to_vec(&self.runtime_config.entries).unwrap_or_default();
273 let digest = Sha256::digest(bytes);
274 format!("sha256:{digest:x}")
275 }
276}
277
278#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
279pub struct AutoConfigDecision {
280 pub schema_version: u32,
281 pub selection: String,
282 pub selected: String,
283 pub source: AutoConfigSource,
284 pub source_key: Option<String>,
285 pub candidates: Vec<String>,
286 pub rejected: Vec<RejectedCandidate>,
287 pub affects: Vec<RuntimeConfigEffect>,
288}
289
290#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
291pub struct RejectedCandidate {
292 pub value: String,
293 pub reason: String,
294}
295
296#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
297#[serde(rename_all = "snake_case")]
298pub enum AutoConfigSource {
299 Default,
300 Cli,
301 ConfigFile,
302 Env,
303 ScriptCase,
304 ModelMetadata,
305 HardwareCapability,
306 MemoryProfile,
307 WorkloadPreset,
308 CompiledFeature,
309}
310
311#[derive(Debug, Clone, PartialEq, Eq, Error)]
312pub enum AutoConfigError {
313 #[error("{key}: invalid override: {reason}")]
314 InvalidOverride { key: String, reason: String },
315 #[error("{selection}: unsupported combination: {reason}")]
316 UnsupportedCombination { selection: String, reason: String },
317}
318
319pub struct FerrumConfigBuilder {
320 runtime_config: RuntimeConfigSnapshot,
321 model: ModelCapabilities,
322 hardware: HardwareCapabilities,
323 workload: WorkloadProfile,
324}
325
326impl FerrumConfigBuilder {
327 pub fn new(runtime_config: RuntimeConfigSnapshot) -> Self {
328 Self {
329 runtime_config,
330 model: ModelCapabilities::unknown(),
331 hardware: HardwareCapabilities::unknown(),
332 workload: WorkloadProfile::default(),
333 }
334 }
335
336 pub fn m3_qwen3_30b_a3b_int4(runtime_config: RuntimeConfigSnapshot) -> Self {
337 Self::new(runtime_config)
338 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
339 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
340 CompiledKernelFeatures::m3_fast_path_without_fa2(),
341 ))
342 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
343 }
344
345 pub fn with_model_capabilities(mut self, model: ModelCapabilities) -> Self {
346 self.model = model;
347 self
348 }
349
350 pub fn with_hardware_capabilities(mut self, hardware: HardwareCapabilities) -> Self {
351 self.hardware = hardware;
352 self
353 }
354
355 pub fn with_workload_profile(mut self, workload: WorkloadProfile) -> Self {
356 self.workload = workload;
357 self
358 }
359
360 pub fn resolve(self) -> Result<ResolvedFerrumConfig, AutoConfigError> {
361 let mut decisions = Vec::new();
362 let cuda_backend = self.is_cuda_backend();
363 let use_vllm_paged_attn = self.bool_value(
364 "FERRUM_USE_VLLM_PAGED_ATTN",
365 self.workload.is_m3_preset()
366 && cuda_backend
367 && self.hardware.compiled_features.vllm_paged_attn,
368 AutoConfigSource::WorkloadPreset,
369 )?;
370 let fa_layout =
371 self.bool_value("FERRUM_FA_LAYOUT_VARLEN", false, AutoConfigSource::Default)?;
372 let fa2_source = self.bool_value("FERRUM_FA2_SOURCE", false, AutoConfigSource::Default)?;
373 let shim_present = self.raw("FERRUM_FA2_DIRECT_FFI_SHIM").is_some();
374 let fa2_direct_ffi = self.bool_value(
375 "FERRUM_FA2_DIRECT_FFI",
376 shim_present,
377 if shim_present {
378 AutoConfigSource::Env
379 } else {
380 AutoConfigSource::Default
381 },
382 )?;
383 let vllm_v1_short = self.bool_value(
384 "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
385 use_vllm_paged_attn.value,
386 AutoConfigSource::Default,
387 )?;
388 let vllm_moe = self.bool_value(
389 "FERRUM_VLLM_MOE",
390 self.workload.is_m3_preset()
391 && cuda_backend
392 && self.hardware.compiled_features.vllm_moe_marlin,
393 AutoConfigSource::WorkloadPreset,
394 )?;
395 let device_route = self.bool_value(
396 "FERRUM_MOE_DEVICE_ROUTE",
397 self.workload.is_m3_preset() && vllm_moe.value,
398 AutoConfigSource::WorkloadPreset,
399 )?;
400 let pair_ids = self.bool_value(
401 "FERRUM_VLLM_MOE_PAIR_IDS",
402 vllm_moe.value,
403 AutoConfigSource::WorkloadPreset,
404 )?;
405 let graph = self.bool_value("FERRUM_MOE_GRAPH", false, AutoConfigSource::WorkloadPreset)?;
406 let greedy = self.bool_value(
407 "FERRUM_GREEDY_ARGMAX",
408 self.workload.is_m3_preset()
409 && cuda_backend
410 && self.hardware.compiled_features.greedy_argmax,
411 AutoConfigSource::WorkloadPreset,
412 )?;
413 let prefix_cache = self.bool_value(
414 "FERRUM_PREFIX_CACHE",
415 false,
416 if self.workload.is_m3_preset() {
417 AutoConfigSource::WorkloadPreset
418 } else {
419 AutoConfigSource::Default
420 },
421 )?;
422 let default_max_sequences = self.default_max_sequences();
423 let max_sequences = self.usize_value(
424 "FERRUM_PAGED_MAX_SEQS",
425 default_max_sequences.value,
426 default_max_sequences.source,
427 )?;
428 let default_kv_blocks = self.default_kv_blocks(&max_sequences);
429 let kv_blocks = self.usize_value(
430 "FERRUM_KV_MAX_BLOCKS",
431 default_kv_blocks.value,
432 default_kv_blocks.source,
433 )?;
434 let default_max_batched_tokens =
435 self.default_max_batched_tokens(&max_sequences, &kv_blocks);
436 let max_batched_tokens = self.usize_value(
437 "FERRUM_MAX_BATCHED_TOKENS",
438 default_max_batched_tokens.value,
439 default_max_batched_tokens.source,
440 )?;
441 let max_model_len = self.optional_usize_value("FERRUM_MAX_MODEL_LEN")?;
442
443 self.validate_attention(
444 use_vllm_paged_attn.value,
445 fa_layout.value,
446 fa2_source.value,
447 fa2_direct_ffi.value,
448 shim_present,
449 vllm_v1_short.value,
450 )?;
451 self.validate_moe(
452 vllm_moe.value,
453 device_route.value,
454 pair_ids.value,
455 graph.value,
456 )?;
457 self.validate_memory(
458 kv_blocks.value,
459 max_sequences.value,
460 max_batched_tokens.value,
461 max_model_len.as_ref().map(|value| value.value),
462 )?;
463 self.validate_dtypes()?;
464 self.validate_sampling(greedy.value)?;
465
466 decisions.push(self.attention_prefill_decision(
467 use_vllm_paged_attn.clone(),
468 fa_layout,
469 fa2_source,
470 fa2_direct_ffi,
471 ));
472 decisions.push(self.attention_decode_decision(use_vllm_paged_attn, vllm_v1_short));
473 decisions.push(self.moe_decision(vllm_moe, device_route, pair_ids));
474 decisions.push(self.graph_decision(graph));
475 decisions.push(self.scalar_decision(
476 "kv_block_count",
477 kv_blocks,
478 RuntimeConfigEffect::Memory,
479 ));
480 decisions.push(self.scalar_decision(
481 "max_sequences",
482 max_sequences,
483 RuntimeConfigEffect::Memory,
484 ));
485 decisions.push(self.scalar_decision(
486 "max_batched_tokens",
487 max_batched_tokens,
488 RuntimeConfigEffect::Performance,
489 ));
490 if let Some(max_model_len) = max_model_len {
491 decisions.push(self.scalar_decision(
492 "max_model_len",
493 max_model_len,
494 RuntimeConfigEffect::Memory,
495 ));
496 }
497 decisions.push(self.prefix_cache_decision(prefix_cache));
498 decisions.push(self.scheduler_decision()?);
499 decisions.push(self.sampling_decision(greedy));
500
501 Ok(ResolvedFerrumConfig {
502 schema_version: 1,
503 preset: self.workload.preset.clone(),
504 runtime_config: self.runtime_config.clone(),
505 model_capabilities: self.model.clone(),
506 hardware_capabilities: self.hardware.clone(),
507 workload_profile: self.workload.clone(),
508 decisions,
509 })
510 }
511
512 fn entries(&self) -> BTreeMap<&str, &str> {
513 self.runtime_config
514 .entries
515 .iter()
516 .map(|entry| (entry.key.as_str(), entry.effective_value.as_str()))
517 .collect()
518 }
519
520 fn raw(&self, key: &str) -> Option<&str> {
521 self.entry(key).map(|entry| entry.effective_value.as_str())
522 }
523
524 fn entry(&self, key: &str) -> Option<&RuntimeConfigEntry> {
525 self.runtime_config
526 .entries
527 .iter()
528 .find(|entry| entry.key == key)
529 }
530
531 fn source_for_key(&self, key: &str, default_source: AutoConfigSource) -> AutoConfigSource {
532 self.entry(key)
533 .map(|entry| auto_config_source_from_runtime(entry.source))
534 .unwrap_or(default_source)
535 }
536
537 fn is_cuda_backend(&self) -> bool {
538 self.hardware.backend.eq_ignore_ascii_case("cuda")
539 }
540
541 fn cuda_compute_capability_at_least(&self, major: u32, minor: u32) -> Option<bool> {
542 let (actual_major, actual_minor) =
543 parse_compute_capability(self.hardware.compute_capability.as_deref()?)?;
544 Some((actual_major, actual_minor) >= (major, minor))
545 }
546
547 fn default_max_sequences(&self) -> ResolvedValue<usize> {
548 let target = self.workload.target_concurrency.max(1);
549 let mut selected = target;
550 if self.workload.is_m3_preset() {
551 if let Some(sm_count) = self.hardware.sm_count {
552 selected = selected.min((sm_count as usize / 4).max(1));
556 }
557 if let Some(vram_bytes) = self.hardware.vram_bytes {
558 selected = selected.min(vram_default_max_sequences(vram_bytes));
559 }
560 }
561 ResolvedValue {
562 value: selected.max(1),
563 source: if selected < target {
564 AutoConfigSource::HardwareCapability
565 } else {
566 AutoConfigSource::WorkloadPreset
567 },
568 source_key: None,
569 }
570 }
571
572 fn default_max_batched_tokens(
573 &self,
574 max_sequences: &ResolvedValue<usize>,
575 kv_blocks: &ResolvedValue<usize>,
576 ) -> ResolvedValue<usize> {
577 let kv_token_capacity = kv_blocks
578 .value
579 .saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS)
580 .max(max_sequences.value.max(1));
581 let value = max_sequences
582 .value
583 .max(1)
584 .saturating_mul(64)
585 .min(kv_token_capacity)
586 .max(max_sequences.value.max(1));
587 ResolvedValue {
588 value,
589 source: if max_sequences.source == AutoConfigSource::HardwareCapability
590 || kv_blocks.source == AutoConfigSource::HardwareCapability
591 {
592 AutoConfigSource::HardwareCapability
593 } else {
594 AutoConfigSource::WorkloadPreset
595 },
596 source_key: None,
597 }
598 }
599
600 fn default_kv_blocks(&self, max_sequences: &ResolvedValue<usize>) -> ResolvedValue<usize> {
601 let min_blocks = ceil_div(max_sequences.value.max(1), DEFAULT_KV_BLOCK_SIZE_TOKENS);
602 let target = DEFAULT_KV_BLOCKS.max(min_blocks);
603 let selected = match (
604 self.hardware.vram_bytes,
605 self.model.estimated_weight_bytes,
606 self.kv_cache_bytes_per_token(),
607 ) {
608 (Some(vram_bytes), Some(weight_bytes), Some(kv_bytes_per_token))
609 if kv_bytes_per_token > 0 =>
610 {
611 let headroom = (vram_bytes / 10).max(2 * GIB);
612 let available = vram_bytes.saturating_sub(weight_bytes.saturating_add(headroom));
613 let kv_token_budget = (available / kv_bytes_per_token) as usize;
614 let block_budget = kv_token_budget / DEFAULT_KV_BLOCK_SIZE_TOKENS;
615 target.min(block_budget.max(min_blocks))
616 }
617 _ => target,
618 };
619 ResolvedValue {
620 value: selected.max(1),
621 source: if selected < target {
622 AutoConfigSource::HardwareCapability
623 } else {
624 AutoConfigSource::WorkloadPreset
625 },
626 source_key: None,
627 }
628 }
629
630 fn kv_cache_bytes_per_token(&self) -> Option<u64> {
631 let layers = self.model.num_hidden_layers? as u64;
632 let kv_heads = self.model.kv_heads? as u64;
633 let head_dim = self.model.head_dim? as u64;
634 layers
635 .checked_mul(2)?
636 .checked_mul(kv_heads)?
637 .checked_mul(head_dim)?
638 .checked_mul(2)
639 }
640
641 fn bool_value(
642 &self,
643 key: &str,
644 default: bool,
645 default_source: AutoConfigSource,
646 ) -> Result<ResolvedValue<bool>, AutoConfigError> {
647 match self.entry(key) {
648 Some(entry) => Ok(ResolvedValue {
649 value: parse_bool_env_value(&entry.effective_value).map_err(|reason| {
650 AutoConfigError::InvalidOverride {
651 key: key.to_string(),
652 reason,
653 }
654 })?,
655 source: auto_config_source_from_runtime(entry.source),
656 source_key: Some(key.to_string()),
657 }),
658 None => Ok(ResolvedValue {
659 value: default,
660 source: default_source,
661 source_key: None,
662 }),
663 }
664 }
665
666 fn usize_value(
667 &self,
668 key: &str,
669 default: usize,
670 default_source: AutoConfigSource,
671 ) -> Result<ResolvedValue<usize>, AutoConfigError> {
672 match self.entry(key) {
673 Some(entry) => Ok(ResolvedValue {
674 value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
675 AutoConfigError::InvalidOverride {
676 key: key.to_string(),
677 reason,
678 }
679 })?,
680 source: auto_config_source_from_runtime(entry.source),
681 source_key: Some(key.to_string()),
682 }),
683 None => Ok(ResolvedValue {
684 value: default,
685 source: default_source,
686 source_key: None,
687 }),
688 }
689 }
690
691 fn optional_usize_value(
692 &self,
693 key: &str,
694 ) -> Result<Option<ResolvedValue<usize>>, AutoConfigError> {
695 match self.entry(key) {
696 Some(entry) => Ok(Some(ResolvedValue {
697 value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
698 AutoConfigError::InvalidOverride {
699 key: key.to_string(),
700 reason,
701 }
702 })?,
703 source: auto_config_source_from_runtime(entry.source),
704 source_key: Some(key.to_string()),
705 })),
706 None => Ok(None),
707 }
708 }
709
710 fn validate_attention(
711 &self,
712 use_vllm_paged_attn: bool,
713 fa_layout: bool,
714 fa2_source: bool,
715 fa2_direct_ffi: bool,
716 shim_present: bool,
717 vllm_v1_short: bool,
718 ) -> Result<(), AutoConfigError> {
719 if use_vllm_paged_attn && !self.hardware.compiled_features.vllm_paged_attn {
720 return self.invalid(
721 "FERRUM_USE_VLLM_PAGED_ATTN",
722 "vLLM paged attention is not compiled",
723 );
724 }
725 if use_vllm_paged_attn && !self.is_cuda_backend() {
726 return self.invalid(
727 "FERRUM_USE_VLLM_PAGED_ATTN",
728 "vLLM paged attention requires CUDA backend",
729 );
730 }
731 if fa_layout && !use_vllm_paged_attn {
732 return self.invalid(
733 "FERRUM_FA_LAYOUT_VARLEN",
734 "FA layout requires vLLM paged attention layout",
735 );
736 }
737 if fa2_source && !self.hardware.compiled_features.fa2_source {
738 return self.invalid(
739 "FERRUM_FA2_SOURCE",
740 "source-linked FA2 support is not compiled",
741 );
742 }
743 if fa2_source && !self.is_cuda_backend() {
744 return self.invalid(
745 "FERRUM_FA2_SOURCE",
746 "source-linked FA2 requires CUDA backend",
747 );
748 }
749 if fa2_source && !use_vllm_paged_attn {
750 return self.invalid(
751 "FERRUM_FA2_SOURCE",
752 "source-linked FA2 requires vLLM paged attention layout",
753 );
754 }
755 if fa2_source && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
756 return self.invalid(
757 "FERRUM_FA2_SOURCE",
758 "source-linked FA2 requires CUDA compute capability >= 8.0",
759 );
760 }
761 if fa2_direct_ffi && !self.hardware.compiled_features.fa2_direct_ffi {
762 return self.invalid(
763 "FERRUM_FA2_DIRECT_FFI",
764 "direct FA2 FFI shim support is not compiled",
765 );
766 }
767 if fa2_direct_ffi && !self.is_cuda_backend() {
768 return self.invalid(
769 "FERRUM_FA2_DIRECT_FFI",
770 "direct FA2 FFI shim requires CUDA backend",
771 );
772 }
773 if fa2_direct_ffi && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
774 return self.invalid(
775 "FERRUM_FA2_DIRECT_FFI",
776 "direct FA2 FFI shim requires CUDA compute capability >= 8.0",
777 );
778 }
779 if fa2_direct_ffi && !shim_present {
780 return self.invalid(
781 "FERRUM_FA2_DIRECT_FFI",
782 "requires FERRUM_FA2_DIRECT_FFI_SHIM",
783 );
784 }
785 if fa2_source && fa2_direct_ffi {
786 return self.unsupported(
787 "attention_prefill_mixed_backend",
788 "FA2 source and direct FFI shim cannot both own the prefill path",
789 );
790 }
791 if vllm_v1_short && !use_vllm_paged_attn {
792 return self.invalid(
793 "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
794 "short-context v1 requires vLLM paged attention",
795 );
796 }
797 Ok(())
798 }
799
800 fn validate_moe(
801 &self,
802 vllm_moe: bool,
803 device_route: bool,
804 pair_ids: bool,
805 graph: bool,
806 ) -> Result<(), AutoConfigError> {
807 if vllm_moe && !self.hardware.compiled_features.vllm_moe_marlin {
808 return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE is not compiled");
809 }
810 if vllm_moe && !self.is_cuda_backend() {
811 return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE requires CUDA backend");
812 }
813 if device_route && !vllm_moe {
814 return self.invalid(
815 "FERRUM_MOE_DEVICE_ROUTE",
816 "device route currently requires vLLM MoE",
817 );
818 }
819 if pair_ids && !vllm_moe {
820 return self.invalid(
821 "FERRUM_VLLM_MOE_PAIR_IDS",
822 "pair-id routing requires vLLM MoE",
823 );
824 }
825 let graph_relevant = self.model.moe.is_some() || self.workload.is_m3_preset();
826 if graph && graph_relevant && !self.hardware.graph_support {
827 return self.invalid(
828 "FERRUM_MOE_GRAPH",
829 "hardware/backend does not support CUDA graph replay",
830 );
831 }
832 if graph && graph_relevant && !self.hardware.compiled_features.cuda_graph {
833 return self.invalid("FERRUM_MOE_GRAPH", "CUDA graph support is not compiled");
834 }
835 if graph && graph_relevant && !vllm_moe {
836 return self.invalid(
837 "FERRUM_MOE_GRAPH",
838 "graph decode requires the graph-clean vLLM MoE path",
839 );
840 }
841 if graph && graph_relevant && self.model.moe.is_some() && !self.model.graph_safe_moe {
842 return self.unsupported(
843 "moe_graph_policy",
844 "model MoE path is not marked graph-safe",
845 );
846 }
847 Ok(())
848 }
849
850 fn validate_sampling(&self, greedy: bool) -> Result<(), AutoConfigError> {
851 if greedy && !self.hardware.compiled_features.greedy_argmax {
852 return self.invalid("FERRUM_GREEDY_ARGMAX", "GPU argmax is not compiled");
853 }
854 if greedy
855 && !(self.is_cuda_backend() || self.hardware.backend.eq_ignore_ascii_case("metal"))
856 {
857 return self.invalid(
858 "FERRUM_GREEDY_ARGMAX",
859 "greedy argmax requires CUDA or Metal backend",
860 );
861 }
862 Ok(())
863 }
864
865 fn validate_memory(
866 &self,
867 kv_blocks: usize,
868 max_sequences: usize,
869 max_batched_tokens: usize,
870 requested_max_model_len: Option<usize>,
871 ) -> Result<(), AutoConfigError> {
872 if kv_blocks == 0 {
873 return self.invalid("FERRUM_KV_MAX_BLOCKS", "must be greater than zero");
874 }
875 if max_sequences == 0 {
876 return self.invalid("FERRUM_PAGED_MAX_SEQS", "must be greater than zero");
877 }
878 if max_batched_tokens < max_sequences {
879 return self.invalid(
880 "FERRUM_MAX_BATCHED_TOKENS",
881 "must be at least FERRUM_PAGED_MAX_SEQS",
882 );
883 }
884 let kv_token_capacity = kv_blocks.saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS);
885 if max_batched_tokens > kv_token_capacity {
886 return self.invalid(
887 "FERRUM_MAX_BATCHED_TOKENS",
888 "exceeds KV cache token capacity",
889 );
890 }
891 if let Some(max_model_len) = requested_max_model_len {
892 if max_model_len == 0 {
893 return self.invalid("FERRUM_MAX_MODEL_LEN", "must be greater than zero");
894 }
895 if let Some(model_max) = self.model.max_context_len {
896 if max_model_len > model_max {
897 return self.invalid(
898 "FERRUM_MAX_MODEL_LEN",
899 "exceeds model metadata max context length",
900 );
901 }
902 }
903 if max_model_len > kv_token_capacity {
904 return self.invalid(
905 "FERRUM_KV_MAX_BLOCKS",
906 "KV cache token capacity is smaller than FERRUM_MAX_MODEL_LEN",
907 );
908 }
909 }
910 Ok(())
911 }
912
913 fn validate_dtypes(&self) -> Result<(), AutoConfigError> {
914 if let Some(dtype) = self.raw("FERRUM_DTYPE") {
915 let dtype = dtype.to_ascii_lowercase();
916 if !self.hardware.supported_dtypes.iter().any(|d| d == &dtype) {
917 return self.invalid("FERRUM_DTYPE", "dtype is not supported by hardware profile");
918 }
919 }
920 if let Some(dtype) = self.raw("FERRUM_KV_DTYPE") {
921 let dtype = dtype.to_ascii_lowercase();
922 if !self
923 .hardware
924 .supported_kv_dtypes
925 .iter()
926 .any(|d| d == &dtype)
927 {
928 return self.invalid(
929 "FERRUM_KV_DTYPE",
930 "KV dtype is not supported by hardware profile",
931 );
932 }
933 }
934 Ok(())
935 }
936
937 fn attention_prefill_decision(
938 &self,
939 use_vllm_paged_attn: ResolvedValue<bool>,
940 fa_layout: ResolvedValue<bool>,
941 fa2_source: ResolvedValue<bool>,
942 fa2_direct_ffi: ResolvedValue<bool>,
943 ) -> AutoConfigDecision {
944 let (selected, source, source_key) = if fa2_source.value {
945 ("fa2_source", fa2_source.source, fa2_source.source_key)
946 } else if fa2_direct_ffi.value {
947 (
948 "fa2_direct_ffi",
949 fa2_direct_ffi.source,
950 fa2_direct_ffi.source_key,
951 )
952 } else if fa_layout.value {
953 ("fa_layout_varlen", fa_layout.source, fa_layout.source_key)
954 } else if use_vllm_paged_attn.value {
955 (
956 "vllm_paged_varlen",
957 use_vllm_paged_attn.source,
958 use_vllm_paged_attn.source_key,
959 )
960 } else {
961 ("legacy_paged_varlen", AutoConfigSource::Default, None)
962 };
963 self.decision(
964 "attention_prefill_mixed_backend",
965 selected,
966 source,
967 source_key,
968 [
969 "fa2_source",
970 "fa2_direct_ffi",
971 "fa_layout_varlen",
972 "vllm_paged_varlen",
973 "legacy_paged_varlen",
974 ],
975 self.rejected_except(
976 selected,
977 [
978 ("fa2_source", "source-linked FA2 path not selected"),
979 ("fa2_direct_ffi", "diagnostic direct FFI shim not selected"),
980 ("fa_layout_varlen", "FA-compatible layout not selected"),
981 ("vllm_paged_varlen", "vLLM paged varlen bridge not selected"),
982 (
983 "legacy_paged_varlen",
984 "a higher-priority attention path was selected",
985 ),
986 ],
987 ),
988 vec![
989 RuntimeConfigEffect::Performance,
990 RuntimeConfigEffect::Memory,
991 ],
992 )
993 }
994
995 fn attention_decode_decision(
996 &self,
997 use_vllm_paged_attn: ResolvedValue<bool>,
998 vllm_v1_short: ResolvedValue<bool>,
999 ) -> AutoConfigDecision {
1000 let (selected, source, source_key) = if use_vllm_paged_attn.value {
1001 if vllm_v1_short.value {
1002 (
1003 "vllm_paged_attn_v1_short",
1004 vllm_v1_short.source,
1005 vllm_v1_short.source_key,
1006 )
1007 } else {
1008 (
1009 "vllm_paged_attn_v2",
1010 vllm_v1_short.source,
1011 vllm_v1_short.source_key,
1012 )
1013 }
1014 } else {
1015 ("legacy_paged_decode", use_vllm_paged_attn.source, None)
1016 };
1017 self.decision(
1018 "attention_decode_backend",
1019 selected,
1020 source,
1021 source_key,
1022 [
1023 "vllm_paged_attn_v1_short",
1024 "vllm_paged_attn_v2",
1025 "legacy_paged_decode",
1026 ],
1027 self.rejected_except(
1028 selected,
1029 [
1030 (
1031 "vllm_paged_attn_v1_short",
1032 "short-context v1 decode not selected",
1033 ),
1034 ("vllm_paged_attn_v2", "v2 decode not selected"),
1035 ("legacy_paged_decode", "legacy decode not selected"),
1036 ],
1037 ),
1038 vec![RuntimeConfigEffect::Performance],
1039 )
1040 }
1041
1042 fn moe_decision(
1043 &self,
1044 vllm_moe: ResolvedValue<bool>,
1045 device_route: ResolvedValue<bool>,
1046 pair_ids: ResolvedValue<bool>,
1047 ) -> AutoConfigDecision {
1048 let selected = if vllm_moe.value && device_route.value && pair_ids.value {
1049 "vllm_marlin_moe_device_route_pair_ids"
1050 } else if vllm_moe.value && device_route.value {
1051 "vllm_marlin_moe_device_route"
1052 } else if vllm_moe.value {
1053 "vllm_marlin_moe"
1054 } else {
1055 "legacy_moe"
1056 };
1057 self.decision(
1058 "moe_implementation",
1059 selected,
1060 vllm_moe.source,
1061 vllm_moe.source_key,
1062 [
1063 "vllm_marlin_moe_device_route_pair_ids",
1064 "vllm_marlin_moe_device_route",
1065 "vllm_marlin_moe",
1066 "legacy_moe",
1067 ],
1068 self.rejected_except(
1069 selected,
1070 [
1071 (
1072 "vllm_marlin_moe_device_route_pair_ids",
1073 "pair-id device route not selected",
1074 ),
1075 (
1076 "vllm_marlin_moe_device_route",
1077 "device-route MoE not selected",
1078 ),
1079 ("vllm_marlin_moe", "vLLM Marlin MoE not selected"),
1080 ("legacy_moe", "legacy MoE not selected"),
1081 ],
1082 ),
1083 vec![RuntimeConfigEffect::Performance],
1084 )
1085 }
1086
1087 fn graph_decision(&self, graph: ResolvedValue<bool>) -> AutoConfigDecision {
1088 let selected = if graph.value {
1089 "graph_clean_decode"
1090 } else {
1091 "graph_disabled"
1092 };
1093 self.decision(
1094 "moe_graph_policy",
1095 selected,
1096 graph.source,
1097 graph.source_key,
1098 ["graph_clean_decode", "graph_disabled"],
1099 self.rejected_except(
1100 selected,
1101 [
1102 ("graph_clean_decode", "graph decode not selected"),
1103 ("graph_disabled", "graph decode selected"),
1104 ],
1105 ),
1106 vec![
1107 RuntimeConfigEffect::Performance,
1108 RuntimeConfigEffect::Correctness,
1109 ],
1110 )
1111 }
1112
1113 fn scalar_decision(
1114 &self,
1115 selection: &str,
1116 value: ResolvedValue<usize>,
1117 effect: RuntimeConfigEffect,
1118 ) -> AutoConfigDecision {
1119 self.decision(
1120 selection,
1121 &value.value.to_string(),
1122 value.source,
1123 value.source_key,
1124 [value.value.to_string()],
1125 Vec::new(),
1126 vec![effect],
1127 )
1128 }
1129
1130 fn scheduler_decision(&self) -> Result<AutoConfigDecision, AutoConfigError> {
1131 let entries = self.entries();
1132 let mut selected = "continuous_default".to_string();
1133 let mut source_key = None;
1134 if let Some(chunk) = entries.get("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK") {
1135 parse_usize_env_value(chunk).map_err(|reason| AutoConfigError::InvalidOverride {
1136 key: "FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string(),
1137 reason,
1138 })?;
1139 selected = format!("active_decode_prefill_chunk:{chunk}");
1140 source_key = Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string());
1141 } else if let Some(until) = entries.get("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE") {
1142 parse_usize_env_value(until).map_err(|reason| AutoConfigError::InvalidOverride {
1143 key: "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string(),
1144 reason,
1145 })?;
1146 selected = format!("prefill_first_until_active:{until}");
1147 source_key = Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string());
1148 } else if self
1149 .bool_value(
1150 "FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE",
1151 false,
1152 AutoConfigSource::Default,
1153 )?
1154 .value
1155 {
1156 selected = "prompt_token_estimate".to_string();
1157 source_key = Some("FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE".to_string());
1158 }
1159 self.unsupported_if(
1160 source_key.as_deref() == Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
1161 && selected.ends_with(":0"),
1162 "scheduler_admission_policy",
1163 "active decode prefill chunk must be greater than zero",
1164 )?;
1165 Ok(self.decision(
1166 "scheduler_admission_policy",
1167 &selected,
1168 source_key
1169 .as_deref()
1170 .map(|key| self.source_for_key(key, AutoConfigSource::Default))
1171 .unwrap_or(AutoConfigSource::Default),
1172 source_key,
1173 [
1174 "continuous_default",
1175 "prompt_token_estimate",
1176 "prefill_first_until_active",
1177 "active_decode_prefill_chunk",
1178 ],
1179 Vec::new(),
1180 vec![RuntimeConfigEffect::Performance],
1181 ))
1182 }
1183
1184 fn prefix_cache_decision(&self, prefix_cache: ResolvedValue<bool>) -> AutoConfigDecision {
1185 let selected = if prefix_cache.value {
1186 "prefix_cache_enabled"
1187 } else {
1188 "prefix_cache_disabled"
1189 };
1190 self.decision(
1191 "prefix_cache_policy",
1192 selected,
1193 prefix_cache.source,
1194 prefix_cache.source_key,
1195 ["prefix_cache_enabled", "prefix_cache_disabled"],
1196 self.rejected_except(
1197 selected,
1198 [
1199 ("prefix_cache_enabled", "prefix cache not selected"),
1200 ("prefix_cache_disabled", "prefix cache enabled"),
1201 ],
1202 ),
1203 vec![
1204 RuntimeConfigEffect::Correctness,
1205 RuntimeConfigEffect::Performance,
1206 RuntimeConfigEffect::Memory,
1207 ],
1208 )
1209 }
1210
1211 fn sampling_decision(&self, greedy: ResolvedValue<bool>) -> AutoConfigDecision {
1212 let selected = if greedy.value {
1213 "gpu_greedy_argmax"
1214 } else {
1215 "logits_readback"
1216 };
1217 self.decision(
1218 "sampling_readback_path",
1219 selected,
1220 greedy.source,
1221 greedy.source_key,
1222 ["gpu_greedy_argmax", "logits_readback"],
1223 self.rejected_except(
1224 selected,
1225 [
1226 ("gpu_greedy_argmax", "GPU argmax not selected"),
1227 ("logits_readback", "logits readback not selected"),
1228 ],
1229 ),
1230 vec![
1231 RuntimeConfigEffect::Performance,
1232 RuntimeConfigEffect::Correctness,
1233 ],
1234 )
1235 }
1236
1237 fn decision<I, C>(
1238 &self,
1239 selection: &str,
1240 selected: &str,
1241 source: AutoConfigSource,
1242 source_key: Option<String>,
1243 candidates: I,
1244 rejected: Vec<RejectedCandidate>,
1245 affects: Vec<RuntimeConfigEffect>,
1246 ) -> AutoConfigDecision
1247 where
1248 I: IntoIterator<Item = C>,
1249 C: Into<String>,
1250 {
1251 AutoConfigDecision {
1252 schema_version: 1,
1253 selection: selection.to_string(),
1254 selected: selected.to_string(),
1255 source,
1256 source_key,
1257 candidates: candidates.into_iter().map(Into::into).collect(),
1258 rejected,
1259 affects,
1260 }
1261 }
1262
1263 fn rejected_except<I>(&self, selected: &str, candidates: I) -> Vec<RejectedCandidate>
1264 where
1265 I: IntoIterator<Item = (&'static str, &'static str)>,
1266 {
1267 candidates
1268 .into_iter()
1269 .filter(|(value, _)| *value != selected)
1270 .map(|(value, reason)| RejectedCandidate {
1271 value: value.to_string(),
1272 reason: reason.to_string(),
1273 })
1274 .collect()
1275 }
1276
1277 fn invalid<T>(&self, key: &str, reason: &str) -> Result<T, AutoConfigError> {
1278 Err(AutoConfigError::InvalidOverride {
1279 key: key.to_string(),
1280 reason: reason.to_string(),
1281 })
1282 }
1283
1284 fn unsupported<T>(&self, selection: &str, reason: &str) -> Result<T, AutoConfigError> {
1285 Err(AutoConfigError::UnsupportedCombination {
1286 selection: selection.to_string(),
1287 reason: reason.to_string(),
1288 })
1289 }
1290
1291 fn unsupported_if(
1292 &self,
1293 condition: bool,
1294 selection: &str,
1295 reason: &str,
1296 ) -> Result<(), AutoConfigError> {
1297 if condition {
1298 self.unsupported(selection, reason)
1299 } else {
1300 Ok(())
1301 }
1302 }
1303}
1304
1305#[derive(Debug, Clone, PartialEq, Eq)]
1306struct ResolvedValue<T> {
1307 value: T,
1308 source: AutoConfigSource,
1309 source_key: Option<String>,
1310}
1311
/// Parses a compute capability written as `major.minor` or just `major`.
///
/// Surrounding whitespace is ignored, also around each component. A missing
/// minor part is read as `0`. Returns `None` for empty input or when either
/// component is not a valid `u32`; only the first `.` separates the parts,
/// so `"8.9.1"` is rejected.
fn parse_compute_capability(value: &str) -> Option<(u32, u32)> {
    let trimmed = value.trim();
    if trimmed.is_empty() {
        return None;
    }
    let mut parts = trimmed.splitn(2, '.');
    let major = parts.next()?.trim().parse().ok()?;
    let minor = match parts.next() {
        Some(text) => text.trim().parse().ok()?,
        None => 0,
    };
    Some((major, minor))
}
1320
1321fn vram_default_max_sequences(vram_bytes: u64) -> usize {
1322 match vram_bytes {
1323 bytes if bytes >= 20 * GIB => 32,
1324 bytes if bytes >= 12 * GIB => 16,
1325 bytes if bytes >= 8 * GIB => 8,
1326 _ => 4,
1327 }
1328}
1329
/// Divides `value` by `divisor`, rounding the quotient up.
///
/// # Panics
///
/// Panics when `divisor` is zero.
fn ceil_div(value: usize, divisor: usize) -> usize {
    usize::div_ceil(value, divisor)
}
1333
1334fn auto_config_source_from_runtime(source: RuntimeConfigSource) -> AutoConfigSource {
1335 match source {
1336 RuntimeConfigSource::Default => AutoConfigSource::Default,
1337 RuntimeConfigSource::ConfigFile => AutoConfigSource::ConfigFile,
1338 RuntimeConfigSource::Cli => AutoConfigSource::Cli,
1339 RuntimeConfigSource::Env => AutoConfigSource::Env,
1340 RuntimeConfigSource::ScriptCase => AutoConfigSource::ScriptCase,
1341 RuntimeConfigSource::MemoryProfile => AutoConfigSource::MemoryProfile,
1342 }
1343}
1344
1345#[cfg(test)]
1346mod tests {
1347 use super::*;
1348
1349 fn snapshot(vars: &[(&str, &str)]) -> RuntimeConfigSnapshot {
1350 RuntimeConfigSnapshot::from_env_vars(vars.iter().copied())
1351 }
1352
1353 fn snapshot_with_sources(vars: &[(&str, &str, RuntimeConfigSource)]) -> RuntimeConfigSnapshot {
1354 let mut entries: Vec<_> = vars
1355 .iter()
1356 .map(|(key, effective_value, source)| RuntimeConfigEntry {
1357 key: (*key).to_string(),
1358 effective_value: (*effective_value).to_string(),
1359 source: *source,
1360 affects: vec![RuntimeConfigEffect::Performance],
1361 })
1362 .collect();
1363 entries.sort_by(|a, b| a.key.cmp(&b.key));
1364 RuntimeConfigSnapshot { entries }
1365 }
1366
1367 fn m3(vars: &[(&str, &str)], features: CompiledKernelFeatures) -> FerrumConfigBuilder {
1368 FerrumConfigBuilder::new(snapshot(vars))
1369 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1370 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(features))
1371 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1372 }
1373
1374 fn m3_with_hardware(
1375 vars: &[(&str, &str)],
1376 hardware: HardwareCapabilities,
1377 ) -> FerrumConfigBuilder {
1378 FerrumConfigBuilder::new(snapshot(vars))
1379 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1380 .with_hardware_capabilities(hardware)
1381 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1382 }
1383
1384 fn expect_invalid_key(vars: &[(&str, &str)], key: &str) {
1385 expect_invalid_key_with_features(
1386 vars,
1387 key,
1388 CompiledKernelFeatures::m3_fast_path_without_fa2(),
1389 );
1390 }
1391
1392 fn expect_invalid_key_with_features(
1393 vars: &[(&str, &str)],
1394 key: &str,
1395 features: CompiledKernelFeatures,
1396 ) {
1397 expect_invalid_key_with_hardware(vars, key, HardwareCapabilities::rtx4090_cuda(features));
1398 }
1399
1400 fn expect_invalid_key_with_hardware(
1401 vars: &[(&str, &str)],
1402 key: &str,
1403 hardware: HardwareCapabilities,
1404 ) {
1405 let err = m3_with_hardware(vars, hardware)
1406 .resolve()
1407 .expect_err("override should fail");
1408 match err {
1409 AutoConfigError::InvalidOverride { key: actual, .. } => assert_eq!(actual, key),
1410 other => panic!("expected invalid override for {key}, got {other:?}"),
1411 }
1412 }
1413
1414 fn cpu_hardware_with_features(features: CompiledKernelFeatures) -> HardwareCapabilities {
1415 HardwareCapabilities {
1416 backend: "cpu".to_string(),
1417 supported_dtypes: vec!["fp32".to_string()],
1418 supported_kv_dtypes: vec!["fp16".to_string()],
1419 compiled_features: features,
1420 ..HardwareCapabilities::unknown()
1421 }
1422 }
1423
/// With no overrides, the M3 setup without FA2 resolves to the expected
/// backend selections and reports the M3 preset name.
#[test]
fn m3_preset_selects_current_safe_fast_path_without_fa2() {
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let resolved = m3(&[], features).resolve().unwrap();

    let mut decisions = BTreeMap::new();
    for decision in resolved.decisions.iter() {
        decisions.insert(decision.selection.as_str(), decision.selected.as_str());
    }

    let expected = [
        ("attention_prefill_mixed_backend", "vllm_paged_varlen"),
        ("attention_decode_backend", "vllm_paged_attn_v1_short"),
        (
            "moe_implementation",
            "vllm_marlin_moe_device_route_pair_ids",
        ),
        ("moe_graph_policy", "graph_disabled"),
        ("prefix_cache_policy", "prefix_cache_disabled"),
        ("sampling_readback_path", "gpu_greedy_argmax"),
    ];
    for (selection, selected) in expected {
        assert_eq!(decisions[selection], selected);
    }
    assert_eq!(
        resolved.preset.as_deref(),
        Some(M3_QWEN3_30B_A3B_INT4_PRESET)
    );
}
1454
/// `FERRUM_FA2_SOURCE=1` selects `fa2_source` for prefill/mixed attention
/// when the source FA2 feature set is compiled in.
#[test]
fn source_fa2_selects_source_linked_attention_when_compiled() {
    let features = CompiledKernelFeatures::m3_fast_path_with_source_fa2();
    let resolved = m3(&[("FERRUM_FA2_SOURCE", "1")], features)
        .resolve()
        .unwrap();

    let mut decisions = BTreeMap::new();
    for decision in resolved.decisions.iter() {
        decisions.insert(decision.selection.as_str(), decision.selected.as_str());
    }

    assert_eq!(decisions["attention_prefill_mixed_backend"], "fa2_source");
}
1471
/// `FERRUM_FA2_SOURCE=1` is an invalid override with the default test
/// feature set (M3 fast path without FA2).
#[test]
fn source_fa2_is_rejected_when_not_compiled() {
    let key = "FERRUM_FA2_SOURCE";
    expect_invalid_key(&[(key, "1")], key);
}
1476
/// On CPU hardware with no overrides, the M3 workload resolves to the legacy
/// attention/MoE paths, disabled graphs, and logits readback.
#[test]
fn hardware_capabilities_keep_m3_preset_on_compatible_backend_paths() {
    let cpu = cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2());
    let resolved = m3_with_hardware(&[], cpu).resolve().unwrap();

    let mut decisions = BTreeMap::new();
    for decision in resolved.decisions.iter() {
        decisions.insert(decision.selection.as_str(), decision.selected.as_str());
    }

    let expected = [
        ("attention_prefill_mixed_backend", "legacy_paged_varlen"),
        ("attention_decode_backend", "legacy_paged_decode"),
        ("moe_implementation", "legacy_moe"),
        ("moe_graph_policy", "graph_disabled"),
        ("sampling_readback_path", "logits_readback"),
    ];
    for (selection, selected) in expected {
        assert_eq!(decisions[selection], selected);
    }
}
1500
/// Overrides that the hardware cannot honour are reported as invalid: four
/// toggles on CPU, and `FERRUM_FA2_SOURCE` on CUDA with compute capability 7.5.
#[test]
fn hardware_incompatible_attention_and_sampling_overrides_are_rejected() {
    let cpu = cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2());
    let cpu_rejected_keys = [
        "FERRUM_USE_VLLM_PAGED_ATTN",
        "FERRUM_VLLM_MOE",
        "FERRUM_GREEDY_ARGMAX",
        "FERRUM_FA2_SOURCE",
    ];
    for key in cpu_rejected_keys {
        expect_invalid_key_with_hardware(&[(key, "1")], key, cpu.clone());
    }

    let old_cuda = HardwareCapabilities {
        compute_capability: Some("7.5".to_string()),
        ..HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_with_source_fa2())
    };
    expect_invalid_key_with_hardware(
        &[("FERRUM_FA2_SOURCE", "1")],
        "FERRUM_FA2_SOURCE",
        old_cuda,
    );
}
1532
/// With `sm_count = 16` the default sequence and batched-token budgets come
/// from the hardware, while an explicit `FERRUM_PAGED_MAX_SEQS` is kept.
#[test]
fn hardware_capacity_sizes_default_sequence_budget_without_overriding_user_values() {
    let mut small_gpu =
        HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
    small_gpu.sm_count = Some(16);
    small_gpu.vram_bytes = Some(24 * 1024 * 1024 * 1024);

    // No overrides: both budgets are attributed to the hardware.
    let defaults = m3_with_hardware(&[], small_gpu.clone()).resolve().unwrap();
    let lookup = |name: &str| {
        defaults
            .decisions
            .iter()
            .find(|entry| entry.selection == name)
            .unwrap()
    };
    let sequences = lookup("max_sequences");
    assert_eq!(sequences.selected, "4");
    assert_eq!(sequences.source, AutoConfigSource::HardwareCapability);
    let batched_tokens = lookup("max_batched_tokens");
    assert_eq!(batched_tokens.selected, "256");
    assert_eq!(
        batched_tokens.source,
        AutoConfigSource::HardwareCapability
    );

    // Explicit override: the user value and its env source are reported.
    let overridden = m3_with_hardware(&[("FERRUM_PAGED_MAX_SEQS", "16")], small_gpu)
        .resolve()
        .unwrap();
    let sequences = overridden
        .decisions
        .iter()
        .find(|entry| entry.selection == "max_sequences")
        .unwrap();
    assert_eq!(sequences.selected, "16");
    assert_eq!(sequences.source, AutoConfigSource::Env);
    assert_eq!(
        sequences.source_key.as_deref(),
        Some("FERRUM_PAGED_MAX_SEQS")
    );
}
1573
/// With 7 GiB of VRAM and 128 SMs the default `max_sequences` is 4 and is
/// attributed to the hardware.
#[test]
fn vram_capacity_caps_m3_default_sequence_budget() {
    let mut low_vram_gpu =
        HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
    low_vram_gpu.sm_count = Some(128);
    low_vram_gpu.vram_bytes = Some(7 * 1024 * 1024 * 1024);

    let resolved = m3_with_hardware(&[], low_vram_gpu).resolve().unwrap();
    let sequences = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "max_sequences")
        .unwrap();
    assert_eq!(sequences.selected, "4");
    assert_eq!(sequences.source, AutoConfigSource::HardwareCapability);
}
1590
/// The stock RTX 4090 description keeps the preset KV block count, while a
/// 20 GiB variant gets hardware-derived KV block and batched-token budgets.
#[test]
fn memory_budget_keeps_rtx4090_m3_kv_blocks_but_caps_constrained_vram() {
    let stock = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
        .resolve()
        .unwrap();
    let stock_lookup = |name: &str| {
        stock
            .decisions
            .iter()
            .find(|entry| entry.selection == name)
            .unwrap()
    };
    let kv_blocks = stock_lookup("kv_block_count");
    assert_eq!(kv_blocks.selected, "2048");
    assert_eq!(kv_blocks.source, AutoConfigSource::WorkloadPreset);

    let mut constrained =
        HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
    constrained.vram_bytes = Some(20 * 1024 * 1024 * 1024);
    let capped = m3_with_hardware(&[], constrained).resolve().unwrap();
    let capped_lookup = |name: &str| {
        capped
            .decisions
            .iter()
            .find(|entry| entry.selection == name)
            .unwrap()
    };
    let kv_blocks = capped_lookup("kv_block_count");
    assert_eq!(kv_blocks.selected, "2");
    assert_eq!(kv_blocks.source, AutoConfigSource::HardwareCapability);
    let batched_tokens = capped_lookup("max_batched_tokens");
    assert_eq!(batched_tokens.selected, "32");
    assert_eq!(
        batched_tokens.source,
        AutoConfigSource::HardwareCapability
    );
}
1631
/// The parser accepts `major.minor` and bare `major`, and rejects text that
/// is not numeric.
#[test]
fn compute_capability_parser_accepts_major_minor_and_major_only() {
    let cases = [
        ("8.9", Some((8, 9))),
        ("9", Some((9, 0))),
        ("N/A", None),
    ];
    for (input, expected) in cases {
        assert_eq!(parse_compute_capability(input), expected);
    }
}
1638
/// Spot-checks one VRAM size per tier of `vram_default_max_sequences`.
#[test]
fn vram_capacity_tiers_are_monotonic() {
    let cases = [(24, 32), (16, 16), (8, 8), (6, 4)];
    for (vram_gib, expected) in cases {
        assert_eq!(vram_default_max_sequences(vram_gib * GIB), expected);
    }
}
1646
/// The serving default for the RTX 4090 description targets a concurrency of
/// 32, and resolving with it yields `max_sequences` of 32.
#[test]
fn accelerator_serving_default_uses_hardware_concurrency_budget() {
    let hardware =
        HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
    let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
    assert_eq!(workload.target_concurrency, 32);

    let builder = FerrumConfigBuilder::new(snapshot(&[]))
        .with_model_capabilities(ModelCapabilities::unknown())
        .with_hardware_capabilities(hardware)
        .with_workload_profile(workload);
    let resolved = builder.resolve().unwrap();
    let sequences = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "max_sequences")
        .unwrap();
    assert_eq!(sequences.selected, "32");
}
1667
/// The serving default for a CPU backend targets a concurrency of 1.
#[test]
fn cpu_serving_default_keeps_single_sequence_budget() {
    let mut hardware = HardwareCapabilities::unknown();
    hardware.backend = "cpu".to_string();
    hardware.supported_dtypes = vec!["fp32".to_string()];

    let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
    assert_eq!(workload.target_concurrency, 1);
}
1678
/// Each row is a set of overrides plus the key that resolution must report
/// as invalid. Rows run in the listed order; the one case that needs a
/// different feature set runs between the two tables.
#[test]
fn validates_invalid_override_matrix() {
    type Case<'a> = (&'a [(&'a str, &'a str)], &'a str);

    let leading_cases: &[Case] = &[
        (
            &[("FERRUM_USE_VLLM_PAGED_ATTN", "maybe")],
            "FERRUM_USE_VLLM_PAGED_ATTN",
        ),
        (&[("FERRUM_PREFIX_CACHE", "maybe")], "FERRUM_PREFIX_CACHE"),
        (
            &[
                ("FERRUM_FA_LAYOUT_VARLEN", "1"),
                ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
            ],
            "FERRUM_FA_LAYOUT_VARLEN",
        ),
        (&[("FERRUM_FA2_DIRECT_FFI", "1")], "FERRUM_FA2_DIRECT_FFI"),
    ];
    for &(vars, key) in leading_cases {
        expect_invalid_key(vars, key);
    }

    // Default (empty) compiled features instead of the M3 fast path.
    expect_invalid_key_with_features(
        &[("FERRUM_VLLM_MOE", "1")],
        "FERRUM_VLLM_MOE",
        CompiledKernelFeatures::default(),
    );

    let trailing_cases: &[Case] = &[
        (
            &[("FERRUM_MOE_DEVICE_ROUTE", "1"), ("FERRUM_VLLM_MOE", "0")],
            "FERRUM_MOE_DEVICE_ROUTE",
        ),
        (
            &[("FERRUM_VLLM_MOE_PAIR_IDS", "1"), ("FERRUM_VLLM_MOE", "0")],
            "FERRUM_VLLM_MOE_PAIR_IDS",
        ),
        (
            &[("FERRUM_MOE_GRAPH", "1"), ("FERRUM_VLLM_MOE", "0")],
            "FERRUM_MOE_GRAPH",
        ),
        (&[("FERRUM_KV_MAX_BLOCKS", "0")], "FERRUM_KV_MAX_BLOCKS"),
        (&[("FERRUM_PAGED_MAX_SEQS", "0")], "FERRUM_PAGED_MAX_SEQS"),
        (
            &[
                ("FERRUM_PAGED_MAX_SEQS", "32"),
                ("FERRUM_MAX_BATCHED_TOKENS", "16"),
            ],
            "FERRUM_MAX_BATCHED_TOKENS",
        ),
        (
            &[
                ("FERRUM_KV_MAX_BLOCKS", "16"),
                ("FERRUM_MAX_BATCHED_TOKENS", "512"),
            ],
            "FERRUM_MAX_BATCHED_TOKENS",
        ),
        (&[("FERRUM_MAX_MODEL_LEN", "0")], "FERRUM_MAX_MODEL_LEN"),
        (&[("FERRUM_MAX_MODEL_LEN", "50000")], "FERRUM_MAX_MODEL_LEN"),
        (
            &[
                ("FERRUM_KV_MAX_BLOCKS", "16"),
                ("FERRUM_MAX_MODEL_LEN", "1024"),
            ],
            "FERRUM_KV_MAX_BLOCKS",
        ),
        (&[("FERRUM_DTYPE", "bf16")], "FERRUM_DTYPE"),
        (&[("FERRUM_KV_DTYPE", "fp8")], "FERRUM_KV_DTYPE"),
        (
            &[
                ("FERRUM_VLLM_PAGED_ATTN_V1_SHORT", "1"),
                ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
            ],
            "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
        ),
    ];
    for &(vars, key) in trailing_cases {
        expect_invalid_key(vars, key);
    }
}
1746
/// No `max_model_len` decision is emitted by default; a valid
/// `FERRUM_MAX_MODEL_LEN` override produces one that names its key.
#[test]
fn requested_max_model_len_is_optional_and_reflected_when_valid() {
    let defaults = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
        .resolve()
        .unwrap();
    assert!(defaults
        .decisions
        .iter()
        .all(|entry| entry.selection != "max_model_len"));

    let overrides = [
        ("FERRUM_KV_MAX_BLOCKS", "64"),
        ("FERRUM_MAX_MODEL_LEN", "1024"),
    ];
    let resolved = m3(
        &overrides,
        CompiledKernelFeatures::m3_fast_path_without_fa2(),
    )
    .resolve()
    .unwrap();
    let max_model_len = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "max_model_len")
        .unwrap();
    assert_eq!(max_model_len.selected, "1024");
    assert_eq!(
        max_model_len.source_key.as_deref(),
        Some("FERRUM_MAX_MODEL_LEN")
    );
}
1777
/// `FERRUM_MOE_GRAPH=1` with a model whose MoE is not graph safe fails as an
/// unsupported combination for `moe_graph_policy`.
#[test]
fn graph_enabled_with_graph_unsafe_moe_is_rejected() {
    let mut model = ModelCapabilities::qwen3_30b_a3b_gptq_int4();
    model.graph_safe_moe = false;
    let hardware =
        HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());

    let outcome = FerrumConfigBuilder::new(snapshot(&[("FERRUM_MOE_GRAPH", "1")]))
        .with_model_capabilities(model)
        .with_hardware_capabilities(hardware)
        .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
        .resolve();
    let err = outcome.expect_err("graph unsafe MoE must fail");

    let rejected_graph_policy = matches!(
        err,
        AutoConfigError::UnsupportedCombination {
            selection,
            ..
        } if selection == "moe_graph_policy"
    );
    assert!(rejected_graph_policy);
}
1798
/// `FERRUM_ACTIVE_DECODE_PREFILL_CHUNK=64` shows up in the scheduler decision
/// as both the selected value and the source key.
#[test]
fn scheduler_override_is_reflected_in_decision_trace() {
    let key = "FERRUM_ACTIVE_DECODE_PREFILL_CHUNK";
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let resolved = m3(&[(key, "64")], features).resolve().unwrap();

    let scheduler = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "scheduler_admission_policy")
        .unwrap();
    assert_eq!(scheduler.selected, "active_decode_prefill_chunk:64");
    assert_eq!(scheduler.source_key.as_deref(), Some(key));
}
1818
/// `FERRUM_PREFIX_CACHE=1` shows up in the prefix-cache decision as the
/// enabled selection with the override as its source key.
#[test]
fn prefix_cache_override_is_reflected_in_decision_trace() {
    let key = "FERRUM_PREFIX_CACHE";
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let resolved = m3(&[(key, "1")], features).resolve().unwrap();

    let prefix_cache = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "prefix_cache_policy")
        .unwrap();
    assert_eq!(prefix_cache.selected, "prefix_cache_enabled");
    assert_eq!(prefix_cache.source_key.as_deref(), Some(key));
}
1838
/// Overrides tagged as config-file, CLI, and script-case sources keep that
/// source (and their key) in the corresponding decisions.
#[test]
fn non_env_runtime_sources_are_preserved_in_decision_trace() {
    let overrides = [
        (
            "FERRUM_FA_LAYOUT_VARLEN",
            "1",
            RuntimeConfigSource::ConfigFile,
        ),
        ("FERRUM_PAGED_MAX_SEQS", "48", RuntimeConfigSource::Cli),
        (
            "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE",
            "32",
            RuntimeConfigSource::ScriptCase,
        ),
    ];
    let hardware =
        HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
    let resolved = FerrumConfigBuilder::new(snapshot_with_sources(&overrides))
        .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
        .with_hardware_capabilities(hardware)
        .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
        .resolve()
        .unwrap();

    let lookup = |name: &str| {
        resolved
            .decisions
            .iter()
            .find(|entry| entry.selection == name)
            .unwrap()
    };

    let attention = lookup("attention_prefill_mixed_backend");
    assert_eq!(attention.selected, "fa_layout_varlen");
    assert_eq!(attention.source, AutoConfigSource::ConfigFile);
    assert_eq!(
        attention.source_key.as_deref(),
        Some("FERRUM_FA_LAYOUT_VARLEN")
    );

    let sequences = lookup("max_sequences");
    assert_eq!(sequences.selected, "48");
    assert_eq!(sequences.source, AutoConfigSource::Cli);
    assert_eq!(
        sequences.source_key.as_deref(),
        Some("FERRUM_PAGED_MAX_SEQS")
    );

    let scheduler = lookup("scheduler_admission_policy");
    assert_eq!(scheduler.selected, "prefill_first_until_active:32");
    assert_eq!(scheduler.source, AutoConfigSource::ScriptCase);
    assert_eq!(
        scheduler.source_key.as_deref(),
        Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE")
    );
}
1894
/// The effective-config document and the JSONL decision trace both cover
/// every resolved decision and carry the expected top-level fields.
#[test]
fn renders_effective_config_and_decision_trace_artifacts() {
    let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
        .resolve()
        .unwrap();
    let decision_count = resolved.decisions.len();

    let effective = resolved.effective_config_document();
    assert_eq!(effective["schema_version"], 1);
    let env_hash = effective["env_hash"].as_str().unwrap();
    assert!(env_hash.starts_with("sha256:"));
    assert!(effective["entries"].is_array());
    assert_eq!(effective["model_capabilities"]["architecture"], "qwen3_moe");
    assert_eq!(effective["hardware_capabilities"]["backend"], "cuda");
    assert_eq!(
        effective["workload_profile"]["preset"],
        M3_QWEN3_30B_A3B_INT4_PRESET
    );
    let document_decisions = effective["decisions"].as_array().unwrap();
    assert_eq!(document_decisions.len(), decision_count);

    let trace = resolved.decision_trace_jsonl().unwrap();
    assert_eq!(trace.lines().count(), decision_count);
    assert!(trace.contains("\"attention_prefill_mixed_backend\""));
}
1921
/// Checks the shape of the effective-config document and the decision trace
/// produced by the M3 builder: sorted `FERRUM_` entries with known sources,
/// expected capability/workload fields, and decisions that round-trip
/// through JSON.
#[test]
fn auto_config_artifacts_match_locked_schema_shape() {
    let runtime_config = snapshot_with_sources(&[
        (
            "FERRUM_FA_LAYOUT_VARLEN",
            "1",
            RuntimeConfigSource::ScriptCase,
        ),
        ("FERRUM_PAGED_MAX_SEQS", "32", RuntimeConfigSource::Cli),
    ]);
    let resolved = FerrumConfigBuilder::m3_qwen3_30b_a3b_int4(runtime_config)
        .resolve()
        .unwrap();

    let effective = resolved.effective_config_document();
    assert_eq!(effective["schema_version"], 1);
    let env_hash = effective["env_hash"].as_str().unwrap();
    assert!(env_hash.starts_with("sha256:"));

    // Entries must already be ordered by key.
    let entries = effective["entries"].as_array().unwrap();
    let keys: Vec<&str> = entries
        .iter()
        .map(|entry| entry["key"].as_str().unwrap())
        .collect();
    assert!(keys.windows(2).all(|pair| pair[0] <= pair[1]));

    let known_sources = [
        "default",
        "config_file",
        "cli",
        "env",
        "script_case",
        "memory_profile",
    ];
    for entry in entries {
        let key = entry["key"].as_str().unwrap();
        assert!(key.starts_with("FERRUM_"));
        assert!(entry["effective_value"].is_string());
        let source = entry["source"].as_str().unwrap();
        assert!(known_sources.contains(&source));
        let affects = entry["affects"].as_array().unwrap();
        assert!(!affects.is_empty());
    }

    let model = &effective["model_capabilities"];
    assert_eq!(model["quantization"].as_str(), Some("gptq_int4"));
    assert_eq!(model["moe"]["experts_per_token"].as_u64(), Some(8));

    let hardware = &effective["hardware_capabilities"];
    assert_eq!(hardware["compute_capability"].as_str(), Some("8.9"));
    assert_eq!(
        hardware["compiled_features"]["vllm_moe_marlin"].as_bool(),
        Some(true)
    );

    let workload = &effective["workload_profile"];
    assert_eq!(workload["target_concurrency"].as_u64(), Some(32));
    assert_eq!(workload["priority"].as_str(), Some("throughput"));

    // The JSONL trace and the embedded decisions must both deserialize back
    // to the resolved decisions.
    let trace = resolved.decision_trace_jsonl().unwrap();
    let mut trace_decisions: Vec<AutoConfigDecision> = Vec::new();
    for line in trace.lines() {
        trace_decisions.push(serde_json::from_str(line).unwrap());
    }
    assert_eq!(trace_decisions, resolved.decisions);
    let document_decisions: Vec<AutoConfigDecision> =
        serde_json::from_value(effective["decisions"].clone()).unwrap();
    assert_eq!(document_decisions, trace_decisions);

    for decision in &trace_decisions {
        assert_eq!(decision.schema_version, 1);
        assert!(!decision.selection.trim().is_empty());
        assert!(!decision.selected.trim().is_empty());
        assert!(!decision.candidates.is_empty());
        assert!(!decision.affects.is_empty());
        assert!(decision
            .source_key
            .iter()
            .all(|source_key| source_key.starts_with("FERRUM_")));
        for rejected in &decision.rejected {
            assert!(!rejected.value.trim().is_empty());
            assert!(!rejected.reason.trim().is_empty());
        }
    }
}
2011}