1use crate::{
7 parse_bool_env_value, parse_usize_env_value, RuntimeConfigEffect, RuntimeConfigEntry,
8 RuntimeConfigSnapshot, RuntimeConfigSource,
9};
10use serde::{Deserialize, Serialize};
11use std::collections::BTreeMap;
12use thiserror::Error;
13
/// Preset identifier that selects the M3 Qwen3-30B-A3B INT4 workload profile.
pub const M3_QWEN3_30B_A3B_INT4_PRESET: &str = "m3_qwen3_30b_a3b_int4";
/// Tokens per KV-cache block assumed when converting block counts to token capacity.
const DEFAULT_KV_BLOCK_SIZE_TOKENS: usize = 16;
/// KV-cache block count targeted before any memory budget is applied.
const DEFAULT_KV_BLOCKS: usize = 2_048;
/// Number of bytes in one gibibyte (2^30).
const GIB: u64 = 1 << 30;
18
/// Model properties consulted while resolving and validating a configuration.
///
/// Field order is the serialization order, so keep it stable.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ModelCapabilities {
    /// Architecture identifier, e.g. `"qwen3_moe"`; `"unknown"` when not inspected.
    pub architecture: String,
    /// Quantization scheme identifier, e.g. `"gptq_int4"`, when one is known.
    pub quantization: Option<String>,
    /// Mixture-of-experts parameters; presumably `None` for dense models — TODO confirm.
    pub moe: Option<MoeCapabilities>,
    /// Upper bound enforced on `FERRUM_MAX_MODEL_LEN` during memory validation.
    pub max_context_len: Option<usize>,
    /// Layer count; one factor of the per-token KV-cache size estimate.
    pub num_hidden_layers: Option<usize>,
    /// Attention head dimension; one factor of the per-token KV-cache size estimate.
    pub head_dim: Option<usize>,
    /// KV head count; one factor of the per-token KV-cache size estimate.
    pub kv_heads: Option<usize>,
    /// Estimated weight footprint, subtracted from VRAM when budgeting KV blocks.
    pub estimated_weight_bytes: Option<u64>,
    /// Dtype names the model supports (e.g. `"fp16"`).
    pub supported_dtypes: Vec<String>,
    /// Whether the model's MoE path is marked graph-safe; required for graph decode on MoE models.
    pub graph_safe_moe: bool,
}
32
33impl ModelCapabilities {
34 pub fn unknown() -> Self {
35 Self {
36 architecture: "unknown".to_string(),
37 quantization: None,
38 moe: None,
39 max_context_len: None,
40 num_hidden_layers: None,
41 head_dim: None,
42 kv_heads: None,
43 estimated_weight_bytes: None,
44 supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
45 graph_safe_moe: false,
46 }
47 }
48
49 pub fn qwen3_30b_a3b_gptq_int4() -> Self {
50 Self {
51 architecture: "qwen3_moe".to_string(),
52 quantization: Some("gptq_int4".to_string()),
53 moe: Some(MoeCapabilities {
54 num_experts: 128,
55 experts_per_token: 8,
56 moe_intermediate_size: Some(768),
57 }),
58 max_context_len: Some(40960),
59 num_hidden_layers: Some(48),
60 head_dim: Some(128),
61 kv_heads: Some(4),
62 estimated_weight_bytes: Some(18 * GIB),
67 supported_dtypes: vec!["fp16".to_string()],
68 graph_safe_moe: true,
69 }
70 }
71}
72
/// Mixture-of-experts parameters attached to a model description.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MoeCapabilities {
    /// Total number of experts.
    pub num_experts: usize,
    /// Number of experts used per token.
    pub experts_per_token: usize,
    /// Intermediate size of the MoE layers, when known.
    pub moe_intermediate_size: Option<usize>,
}
79
/// Hardware and build properties consulted while resolving a configuration.
///
/// Field order is the serialization order, so keep it stable.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct HardwareCapabilities {
    /// Backend name; compared case-insensitively against `"cuda"` (and `"metal"` for sampling).
    pub backend: String,
    /// CUDA runtime identifier, when known.
    pub cuda_runtime: Option<String>,
    /// Compute capability string such as `"8.9"`; parsed for the FA2 minimum-capability checks.
    pub compute_capability: Option<String>,
    /// Device memory in bytes; feeds the KV-block and max-sequence defaults.
    pub vram_bytes: Option<u64>,
    /// Streaming multiprocessor count; caps the default max sequences for the M3 preset.
    pub sm_count: Option<u32>,
    /// Dtype names accepted for `FERRUM_DTYPE` (matched against the lower-cased value).
    pub supported_dtypes: Vec<String>,
    /// Dtype names accepted for `FERRUM_KV_DTYPE` (matched against the lower-cased value).
    pub supported_kv_dtypes: Vec<String>,
    /// Whether the hardware/backend supports CUDA graph replay.
    pub graph_support: bool,
    /// Kernel features compiled into this build.
    pub compiled_features: CompiledKernelFeatures,
}
92
93impl HardwareCapabilities {
94 pub fn unknown() -> Self {
95 Self {
96 backend: "unknown".to_string(),
97 cuda_runtime: None,
98 compute_capability: None,
99 vram_bytes: None,
100 sm_count: None,
101 supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
102 supported_kv_dtypes: vec!["fp16".to_string()],
103 graph_support: false,
104 compiled_features: CompiledKernelFeatures::default(),
105 }
106 }
107
108 pub fn rtx4090_cuda(features: CompiledKernelFeatures) -> Self {
109 Self {
110 backend: "cuda".to_string(),
111 cuda_runtime: None,
112 compute_capability: Some("8.9".to_string()),
113 vram_bytes: Some(24 * 1024 * 1024 * 1024),
114 sm_count: Some(128),
115 supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
116 supported_kv_dtypes: vec!["fp16".to_string(), "int8".to_string()],
117 graph_support: true,
118 compiled_features: features,
119 }
120 }
121}
122
/// Kernel features that were compiled into the running build.
///
/// Validation rejects a requested path whose matching flag here is `false`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CompiledKernelFeatures {
    /// CUDA support is compiled in.
    pub cuda: bool,
    /// vLLM paged attention kernels are compiled in.
    pub vllm_paged_attn: bool,
    /// vLLM Marlin MoE kernels are compiled in.
    pub vllm_moe_marlin: bool,
    /// CUDA graph support is compiled in.
    pub cuda_graph: bool,
    /// GPU argmax (greedy sampling) is compiled in.
    pub greedy_argmax: bool,
    /// Source-built FA2 support is compiled in.
    pub fa2_source: bool,
    /// Direct FA2 FFI shim support is compiled in.
    pub fa2_direct_ffi: bool,
}
133
134impl Default for CompiledKernelFeatures {
135 fn default() -> Self {
136 Self {
137 cuda: false,
138 vllm_paged_attn: false,
139 vllm_moe_marlin: false,
140 cuda_graph: false,
141 greedy_argmax: false,
142 fa2_source: false,
143 fa2_direct_ffi: false,
144 }
145 }
146}
147
148impl CompiledKernelFeatures {
149 pub fn m3_fast_path_without_fa2() -> Self {
150 Self {
151 cuda: true,
152 vllm_paged_attn: true,
153 vllm_moe_marlin: true,
154 cuda_graph: true,
155 greedy_argmax: true,
156 fa2_source: false,
157 fa2_direct_ffi: false,
158 }
159 }
160
161 pub fn m3_fast_path_with_source_fa2() -> Self {
162 Self {
163 fa2_source: true,
164 ..Self::m3_fast_path_without_fa2()
165 }
166 }
167}
168
/// Description of the workload the configuration is being resolved for.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct WorkloadProfile {
    /// Preset identifier; preset-specific defaults apply only when it matches a known preset.
    pub preset: Option<String>,
    /// Serving mode label, e.g. `"openai_chat"` or `"bench_serve"`.
    pub serving_mode: String,
    /// Target number of concurrent sequences; the starting point for the max-sequences default.
    pub target_concurrency: usize,
    /// Prompt length class label, e.g. `"random_256"` or `"unknown"`.
    pub prompt_length_class: String,
    /// Output length class label, e.g. `"random_128"` or `"unknown"`.
    pub output_length_class: String,
    /// What the workload prioritizes.
    pub priority: WorkloadPriority,
}
178
179impl WorkloadProfile {
180 pub fn serving_default() -> Self {
181 Self {
182 preset: None,
183 serving_mode: "openai_chat".to_string(),
184 target_concurrency: 1,
185 prompt_length_class: "unknown".to_string(),
186 output_length_class: "unknown".to_string(),
187 priority: WorkloadPriority::Balanced,
188 }
189 }
190
191 pub fn m3_qwen3_30b_a3b_int4() -> Self {
192 Self {
193 preset: Some(M3_QWEN3_30B_A3B_INT4_PRESET.to_string()),
194 serving_mode: "bench_serve".to_string(),
195 target_concurrency: 32,
196 prompt_length_class: "random_256".to_string(),
197 output_length_class: "random_128".to_string(),
198 priority: WorkloadPriority::Throughput,
199 }
200 }
201
202 fn is_m3_preset(&self) -> bool {
203 self.preset.as_deref() == Some(M3_QWEN3_30B_A3B_INT4_PRESET)
204 }
205}
206
207impl Default for WorkloadProfile {
208 fn default() -> Self {
209 Self::serving_default()
210 }
211}
212
/// Optimization goal of a workload; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum WorkloadPriority {
    /// Prioritize latency.
    Latency,
    /// Prioritize throughput.
    Throughput,
    /// No single priority; the default serving profile uses this.
    Balanced,
}
220
/// Outcome of a successful resolution: the inputs that were considered plus
/// the decisions that were taken.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResolvedFerrumConfig {
    /// Schema version of this structure.
    pub schema_version: u32,
    /// Workload preset identifier, if one was used.
    pub preset: Option<String>,
    /// Snapshot of the runtime configuration entries the resolution read.
    pub runtime_config: RuntimeConfigSnapshot,
    /// Model description used during resolution.
    pub model_capabilities: ModelCapabilities,
    /// Hardware description used during resolution.
    pub hardware_capabilities: HardwareCapabilities,
    /// Workload description used during resolution.
    pub workload_profile: WorkloadProfile,
    /// One record per resolved selection, in the order they were produced.
    pub decisions: Vec<AutoConfigDecision>,
}
231
232impl ResolvedFerrumConfig {
233 pub fn effective_config_document(&self) -> serde_json::Value {
234 serde_json::json!({
235 "schema_version": 1,
236 "preset": self.preset,
237 "env_hash": self.runtime_env_hash(),
238 "entries": self.runtime_config.entries,
239 "model_capabilities": self.model_capabilities,
240 "hardware_capabilities": self.hardware_capabilities,
241 "workload_profile": self.workload_profile,
242 "decisions": self.decisions,
243 })
244 }
245
246 pub fn decision_trace_jsonl(&self) -> Result<String, serde_json::Error> {
247 let mut out = String::new();
248 for decision in &self.decisions {
249 out.push_str(&serde_json::to_string(decision)?);
250 out.push('\n');
251 }
252 Ok(out)
253 }
254
255 pub fn runtime_env_hash(&self) -> String {
256 use sha2::{Digest, Sha256};
257
258 let bytes = serde_json::to_vec(&self.runtime_config.entries).unwrap_or_default();
259 let digest = Sha256::digest(bytes);
260 format!("sha256:{digest:x}")
261 }
262}
263
/// Trace record describing one resolved selection.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AutoConfigDecision {
    /// Schema version of this record.
    pub schema_version: u32,
    /// Name of the selection being decided, e.g. `"moe_implementation"`.
    pub selection: String,
    /// The chosen candidate.
    pub selected: String,
    /// Where the choice came from.
    pub source: AutoConfigSource,
    /// Configuration key that supplied the choice, when one did.
    pub source_key: Option<String>,
    /// Candidates that were considered for this selection.
    pub candidates: Vec<String>,
    /// Candidates that were not chosen, each with a reason.
    pub rejected: Vec<RejectedCandidate>,
    /// Aspects of the runtime this selection affects.
    pub affects: Vec<RuntimeConfigEffect>,
}
275
/// A candidate that was not selected, together with the reason.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RejectedCandidate {
    /// The candidate value.
    pub value: String,
    /// Human-readable reason it was not selected.
    pub reason: String,
}
281
/// Origin of a resolved value or decision; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AutoConfigSource {
    /// Built-in default.
    Default,
    /// Command-line argument.
    Cli,
    /// Configuration file.
    ConfigFile,
    /// Environment variable.
    Env,
    /// Script case — NOTE(review): exact meaning is not visible here; confirm.
    ScriptCase,
    /// Model metadata.
    ModelMetadata,
    /// Hardware capability; used when a hardware limit lowered a default.
    HardwareCapability,
    /// Memory profile.
    MemoryProfile,
    /// Workload preset.
    WorkloadPreset,
    /// Compiled feature.
    CompiledFeature,
}
296
/// Errors reported while resolving a configuration.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
pub enum AutoConfigError {
    /// A configuration key carries a value that cannot be parsed or honored.
    #[error("{key}: invalid override: {reason}")]
    InvalidOverride { key: String, reason: String },
    /// The requested options cannot be combined for the named selection.
    #[error("{selection}: unsupported combination: {reason}")]
    UnsupportedCombination { selection: String, reason: String },
}
304
/// Builder that combines a runtime configuration snapshot with model,
/// hardware and workload descriptions and resolves them into a
/// `ResolvedFerrumConfig`.
pub struct FerrumConfigBuilder {
    // Snapshot of configuration entries; explicit entries override computed defaults.
    runtime_config: RuntimeConfigSnapshot,
    // Model description (starts as `ModelCapabilities::unknown()`).
    model: ModelCapabilities,
    // Hardware description (starts as `HardwareCapabilities::unknown()`).
    hardware: HardwareCapabilities,
    // Workload description (starts as `WorkloadProfile::default()`).
    workload: WorkloadProfile,
}
311
312impl FerrumConfigBuilder {
313 pub fn new(runtime_config: RuntimeConfigSnapshot) -> Self {
314 Self {
315 runtime_config,
316 model: ModelCapabilities::unknown(),
317 hardware: HardwareCapabilities::unknown(),
318 workload: WorkloadProfile::default(),
319 }
320 }
321
322 pub fn m3_qwen3_30b_a3b_int4(runtime_config: RuntimeConfigSnapshot) -> Self {
323 Self::new(runtime_config)
324 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
325 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
326 CompiledKernelFeatures::m3_fast_path_without_fa2(),
327 ))
328 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
329 }
330
331 pub fn with_model_capabilities(mut self, model: ModelCapabilities) -> Self {
332 self.model = model;
333 self
334 }
335
336 pub fn with_hardware_capabilities(mut self, hardware: HardwareCapabilities) -> Self {
337 self.hardware = hardware;
338 self
339 }
340
341 pub fn with_workload_profile(mut self, workload: WorkloadProfile) -> Self {
342 self.workload = workload;
343 self
344 }
345
346 pub fn resolve(self) -> Result<ResolvedFerrumConfig, AutoConfigError> {
347 let mut decisions = Vec::new();
348 let cuda_backend = self.is_cuda_backend();
349 let use_vllm_paged_attn = self.bool_value(
350 "FERRUM_USE_VLLM_PAGED_ATTN",
351 self.workload.is_m3_preset()
352 && cuda_backend
353 && self.hardware.compiled_features.vllm_paged_attn,
354 AutoConfigSource::WorkloadPreset,
355 )?;
356 let fa_layout =
357 self.bool_value("FERRUM_FA_LAYOUT_VARLEN", false, AutoConfigSource::Default)?;
358 let fa2_source = self.bool_value("FERRUM_FA2_SOURCE", false, AutoConfigSource::Default)?;
359 let shim_present = self.raw("FERRUM_FA2_DIRECT_FFI_SHIM").is_some();
360 let fa2_direct_ffi = self.bool_value(
361 "FERRUM_FA2_DIRECT_FFI",
362 shim_present,
363 if shim_present {
364 AutoConfigSource::Env
365 } else {
366 AutoConfigSource::Default
367 },
368 )?;
369 let vllm_v1_short = self.bool_value(
370 "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
371 use_vllm_paged_attn.value,
372 AutoConfigSource::Default,
373 )?;
374 let vllm_moe = self.bool_value(
375 "FERRUM_VLLM_MOE",
376 self.workload.is_m3_preset()
377 && cuda_backend
378 && self.hardware.compiled_features.vllm_moe_marlin,
379 AutoConfigSource::WorkloadPreset,
380 )?;
381 let device_route = self.bool_value(
382 "FERRUM_MOE_DEVICE_ROUTE",
383 self.workload.is_m3_preset() && vllm_moe.value,
384 AutoConfigSource::WorkloadPreset,
385 )?;
386 let pair_ids = self.bool_value(
387 "FERRUM_VLLM_MOE_PAIR_IDS",
388 vllm_moe.value,
389 AutoConfigSource::WorkloadPreset,
390 )?;
391 let graph = self.bool_value(
392 "FERRUM_MOE_GRAPH",
393 self.workload.is_m3_preset()
394 && vllm_moe.value
395 && self.hardware.graph_support
396 && self.hardware.compiled_features.cuda_graph,
397 AutoConfigSource::WorkloadPreset,
398 )?;
399 let greedy = self.bool_value(
400 "FERRUM_GREEDY_ARGMAX",
401 self.workload.is_m3_preset()
402 && cuda_backend
403 && self.hardware.compiled_features.greedy_argmax,
404 AutoConfigSource::WorkloadPreset,
405 )?;
406 let prefix_cache = self.bool_value(
407 "FERRUM_PREFIX_CACHE",
408 false,
409 if self.workload.is_m3_preset() {
410 AutoConfigSource::WorkloadPreset
411 } else {
412 AutoConfigSource::Default
413 },
414 )?;
415 let default_max_sequences = self.default_max_sequences();
416 let max_sequences = self.usize_value(
417 "FERRUM_PAGED_MAX_SEQS",
418 default_max_sequences.value,
419 default_max_sequences.source,
420 )?;
421 let default_kv_blocks = self.default_kv_blocks(&max_sequences);
422 let kv_blocks = self.usize_value(
423 "FERRUM_KV_MAX_BLOCKS",
424 default_kv_blocks.value,
425 default_kv_blocks.source,
426 )?;
427 let default_max_batched_tokens =
428 self.default_max_batched_tokens(&max_sequences, &kv_blocks);
429 let max_batched_tokens = self.usize_value(
430 "FERRUM_MAX_BATCHED_TOKENS",
431 default_max_batched_tokens.value,
432 default_max_batched_tokens.source,
433 )?;
434 let max_model_len = self.optional_usize_value("FERRUM_MAX_MODEL_LEN")?;
435
436 self.validate_attention(
437 use_vllm_paged_attn.value,
438 fa_layout.value,
439 fa2_source.value,
440 fa2_direct_ffi.value,
441 shim_present,
442 vllm_v1_short.value,
443 )?;
444 self.validate_moe(
445 vllm_moe.value,
446 device_route.value,
447 pair_ids.value,
448 graph.value,
449 )?;
450 self.validate_memory(
451 kv_blocks.value,
452 max_sequences.value,
453 max_batched_tokens.value,
454 max_model_len.as_ref().map(|value| value.value),
455 )?;
456 self.validate_dtypes()?;
457 self.validate_sampling(greedy.value)?;
458
459 decisions.push(self.attention_prefill_decision(
460 use_vllm_paged_attn.clone(),
461 fa_layout,
462 fa2_source,
463 fa2_direct_ffi,
464 ));
465 decisions.push(self.attention_decode_decision(use_vllm_paged_attn, vllm_v1_short));
466 decisions.push(self.moe_decision(vllm_moe, device_route, pair_ids));
467 decisions.push(self.graph_decision(graph));
468 decisions.push(self.scalar_decision(
469 "kv_block_count",
470 kv_blocks,
471 RuntimeConfigEffect::Memory,
472 ));
473 decisions.push(self.scalar_decision(
474 "max_sequences",
475 max_sequences,
476 RuntimeConfigEffect::Memory,
477 ));
478 decisions.push(self.scalar_decision(
479 "max_batched_tokens",
480 max_batched_tokens,
481 RuntimeConfigEffect::Performance,
482 ));
483 if let Some(max_model_len) = max_model_len {
484 decisions.push(self.scalar_decision(
485 "max_model_len",
486 max_model_len,
487 RuntimeConfigEffect::Memory,
488 ));
489 }
490 decisions.push(self.prefix_cache_decision(prefix_cache));
491 decisions.push(self.scheduler_decision()?);
492 decisions.push(self.sampling_decision(greedy));
493
494 Ok(ResolvedFerrumConfig {
495 schema_version: 1,
496 preset: self.workload.preset.clone(),
497 runtime_config: self.runtime_config.clone(),
498 model_capabilities: self.model.clone(),
499 hardware_capabilities: self.hardware.clone(),
500 workload_profile: self.workload.clone(),
501 decisions,
502 })
503 }
504
505 fn entries(&self) -> BTreeMap<&str, &str> {
506 self.runtime_config
507 .entries
508 .iter()
509 .map(|entry| (entry.key.as_str(), entry.effective_value.as_str()))
510 .collect()
511 }
512
513 fn raw(&self, key: &str) -> Option<&str> {
514 self.entry(key).map(|entry| entry.effective_value.as_str())
515 }
516
517 fn entry(&self, key: &str) -> Option<&RuntimeConfigEntry> {
518 self.runtime_config
519 .entries
520 .iter()
521 .find(|entry| entry.key == key)
522 }
523
524 fn source_for_key(&self, key: &str, default_source: AutoConfigSource) -> AutoConfigSource {
525 self.entry(key)
526 .map(|entry| auto_config_source_from_runtime(entry.source))
527 .unwrap_or(default_source)
528 }
529
530 fn is_cuda_backend(&self) -> bool {
531 self.hardware.backend.eq_ignore_ascii_case("cuda")
532 }
533
534 fn cuda_compute_capability_at_least(&self, major: u32, minor: u32) -> Option<bool> {
535 let (actual_major, actual_minor) =
536 parse_compute_capability(self.hardware.compute_capability.as_deref()?)?;
537 Some((actual_major, actual_minor) >= (major, minor))
538 }
539
540 fn default_max_sequences(&self) -> ResolvedValue<usize> {
541 let target = self.workload.target_concurrency.max(1);
542 let mut selected = target;
543 if self.workload.is_m3_preset() {
544 if let Some(sm_count) = self.hardware.sm_count {
545 selected = selected.min((sm_count as usize / 4).max(1));
549 }
550 if let Some(vram_bytes) = self.hardware.vram_bytes {
551 selected = selected.min(vram_default_max_sequences(vram_bytes));
552 }
553 }
554 ResolvedValue {
555 value: selected.max(1),
556 source: if selected < target {
557 AutoConfigSource::HardwareCapability
558 } else {
559 AutoConfigSource::WorkloadPreset
560 },
561 source_key: None,
562 }
563 }
564
565 fn default_max_batched_tokens(
566 &self,
567 max_sequences: &ResolvedValue<usize>,
568 kv_blocks: &ResolvedValue<usize>,
569 ) -> ResolvedValue<usize> {
570 let kv_token_capacity = kv_blocks
571 .value
572 .saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS)
573 .max(max_sequences.value.max(1));
574 let value = max_sequences
575 .value
576 .max(1)
577 .saturating_mul(64)
578 .min(kv_token_capacity)
579 .max(max_sequences.value.max(1));
580 ResolvedValue {
581 value,
582 source: if max_sequences.source == AutoConfigSource::HardwareCapability
583 || kv_blocks.source == AutoConfigSource::HardwareCapability
584 {
585 AutoConfigSource::HardwareCapability
586 } else {
587 AutoConfigSource::WorkloadPreset
588 },
589 source_key: None,
590 }
591 }
592
593 fn default_kv_blocks(&self, max_sequences: &ResolvedValue<usize>) -> ResolvedValue<usize> {
594 let min_blocks = ceil_div(max_sequences.value.max(1), DEFAULT_KV_BLOCK_SIZE_TOKENS);
595 let target = DEFAULT_KV_BLOCKS.max(min_blocks);
596 let selected = match (
597 self.hardware.vram_bytes,
598 self.model.estimated_weight_bytes,
599 self.kv_cache_bytes_per_token(),
600 ) {
601 (Some(vram_bytes), Some(weight_bytes), Some(kv_bytes_per_token))
602 if kv_bytes_per_token > 0 =>
603 {
604 let headroom = (vram_bytes / 10).max(2 * GIB);
605 let available = vram_bytes.saturating_sub(weight_bytes.saturating_add(headroom));
606 let kv_token_budget = (available / kv_bytes_per_token) as usize;
607 let block_budget = kv_token_budget / DEFAULT_KV_BLOCK_SIZE_TOKENS;
608 target.min(block_budget.max(min_blocks))
609 }
610 _ => target,
611 };
612 ResolvedValue {
613 value: selected.max(1),
614 source: if selected < target {
615 AutoConfigSource::HardwareCapability
616 } else {
617 AutoConfigSource::WorkloadPreset
618 },
619 source_key: None,
620 }
621 }
622
623 fn kv_cache_bytes_per_token(&self) -> Option<u64> {
624 let layers = self.model.num_hidden_layers? as u64;
625 let kv_heads = self.model.kv_heads? as u64;
626 let head_dim = self.model.head_dim? as u64;
627 layers
628 .checked_mul(2)?
629 .checked_mul(kv_heads)?
630 .checked_mul(head_dim)?
631 .checked_mul(2)
632 }
633
634 fn bool_value(
635 &self,
636 key: &str,
637 default: bool,
638 default_source: AutoConfigSource,
639 ) -> Result<ResolvedValue<bool>, AutoConfigError> {
640 match self.entry(key) {
641 Some(entry) => Ok(ResolvedValue {
642 value: parse_bool_env_value(&entry.effective_value).map_err(|reason| {
643 AutoConfigError::InvalidOverride {
644 key: key.to_string(),
645 reason,
646 }
647 })?,
648 source: auto_config_source_from_runtime(entry.source),
649 source_key: Some(key.to_string()),
650 }),
651 None => Ok(ResolvedValue {
652 value: default,
653 source: default_source,
654 source_key: None,
655 }),
656 }
657 }
658
659 fn usize_value(
660 &self,
661 key: &str,
662 default: usize,
663 default_source: AutoConfigSource,
664 ) -> Result<ResolvedValue<usize>, AutoConfigError> {
665 match self.entry(key) {
666 Some(entry) => Ok(ResolvedValue {
667 value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
668 AutoConfigError::InvalidOverride {
669 key: key.to_string(),
670 reason,
671 }
672 })?,
673 source: auto_config_source_from_runtime(entry.source),
674 source_key: Some(key.to_string()),
675 }),
676 None => Ok(ResolvedValue {
677 value: default,
678 source: default_source,
679 source_key: None,
680 }),
681 }
682 }
683
684 fn optional_usize_value(
685 &self,
686 key: &str,
687 ) -> Result<Option<ResolvedValue<usize>>, AutoConfigError> {
688 match self.entry(key) {
689 Some(entry) => Ok(Some(ResolvedValue {
690 value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
691 AutoConfigError::InvalidOverride {
692 key: key.to_string(),
693 reason,
694 }
695 })?,
696 source: auto_config_source_from_runtime(entry.source),
697 source_key: Some(key.to_string()),
698 })),
699 None => Ok(None),
700 }
701 }
702
703 fn validate_attention(
704 &self,
705 use_vllm_paged_attn: bool,
706 fa_layout: bool,
707 fa2_source: bool,
708 fa2_direct_ffi: bool,
709 shim_present: bool,
710 vllm_v1_short: bool,
711 ) -> Result<(), AutoConfigError> {
712 if use_vllm_paged_attn && !self.hardware.compiled_features.vllm_paged_attn {
713 return self.invalid(
714 "FERRUM_USE_VLLM_PAGED_ATTN",
715 "vLLM paged attention is not compiled",
716 );
717 }
718 if use_vllm_paged_attn && !self.is_cuda_backend() {
719 return self.invalid(
720 "FERRUM_USE_VLLM_PAGED_ATTN",
721 "vLLM paged attention requires CUDA backend",
722 );
723 }
724 if fa_layout && !use_vllm_paged_attn {
725 return self.invalid(
726 "FERRUM_FA_LAYOUT_VARLEN",
727 "FA layout requires vLLM paged attention layout",
728 );
729 }
730 if fa2_source && !self.hardware.compiled_features.fa2_source {
731 return self.invalid(
732 "FERRUM_FA2_SOURCE",
733 "source-built FA2 support is not compiled",
734 );
735 }
736 if fa2_source && !self.is_cuda_backend() {
737 return self.invalid(
738 "FERRUM_FA2_SOURCE",
739 "source-built FA2 requires CUDA backend",
740 );
741 }
742 if fa2_source && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
743 return self.invalid(
744 "FERRUM_FA2_SOURCE",
745 "source-built FA2 requires CUDA compute capability >= 8.0",
746 );
747 }
748 if fa2_direct_ffi && !self.hardware.compiled_features.fa2_direct_ffi {
749 return self.invalid(
750 "FERRUM_FA2_DIRECT_FFI",
751 "direct FA2 FFI shim support is not compiled",
752 );
753 }
754 if fa2_direct_ffi && !self.is_cuda_backend() {
755 return self.invalid(
756 "FERRUM_FA2_DIRECT_FFI",
757 "direct FA2 FFI shim requires CUDA backend",
758 );
759 }
760 if fa2_direct_ffi && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
761 return self.invalid(
762 "FERRUM_FA2_DIRECT_FFI",
763 "direct FA2 FFI shim requires CUDA compute capability >= 8.0",
764 );
765 }
766 if fa2_direct_ffi && !shim_present {
767 return self.invalid(
768 "FERRUM_FA2_DIRECT_FFI",
769 "requires FERRUM_FA2_DIRECT_FFI_SHIM",
770 );
771 }
772 if fa2_source && fa2_direct_ffi {
773 return self.unsupported(
774 "attention_prefill_mixed_backend",
775 "FA2 source and direct FFI shim cannot both own the prefill path",
776 );
777 }
778 if vllm_v1_short && !use_vllm_paged_attn {
779 return self.invalid(
780 "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
781 "short-context v1 requires vLLM paged attention",
782 );
783 }
784 Ok(())
785 }
786
787 fn validate_moe(
788 &self,
789 vllm_moe: bool,
790 device_route: bool,
791 pair_ids: bool,
792 graph: bool,
793 ) -> Result<(), AutoConfigError> {
794 if vllm_moe && !self.hardware.compiled_features.vllm_moe_marlin {
795 return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE is not compiled");
796 }
797 if vllm_moe && !self.is_cuda_backend() {
798 return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE requires CUDA backend");
799 }
800 if device_route && !vllm_moe {
801 return self.invalid(
802 "FERRUM_MOE_DEVICE_ROUTE",
803 "device route currently requires vLLM MoE",
804 );
805 }
806 if pair_ids && !vllm_moe {
807 return self.invalid(
808 "FERRUM_VLLM_MOE_PAIR_IDS",
809 "pair-id routing requires vLLM MoE",
810 );
811 }
812 let graph_relevant = self.model.moe.is_some() || self.workload.is_m3_preset();
813 if graph && graph_relevant && !self.hardware.graph_support {
814 return self.invalid(
815 "FERRUM_MOE_GRAPH",
816 "hardware/backend does not support CUDA graph replay",
817 );
818 }
819 if graph && graph_relevant && !self.hardware.compiled_features.cuda_graph {
820 return self.invalid("FERRUM_MOE_GRAPH", "CUDA graph support is not compiled");
821 }
822 if graph && graph_relevant && !vllm_moe {
823 return self.invalid(
824 "FERRUM_MOE_GRAPH",
825 "graph decode requires the graph-clean vLLM MoE path",
826 );
827 }
828 if graph && graph_relevant && self.model.moe.is_some() && !self.model.graph_safe_moe {
829 return self.unsupported(
830 "moe_graph_policy",
831 "model MoE path is not marked graph-safe",
832 );
833 }
834 Ok(())
835 }
836
837 fn validate_sampling(&self, greedy: bool) -> Result<(), AutoConfigError> {
838 if greedy && !self.hardware.compiled_features.greedy_argmax {
839 return self.invalid("FERRUM_GREEDY_ARGMAX", "GPU argmax is not compiled");
840 }
841 if greedy
842 && !(self.is_cuda_backend() || self.hardware.backend.eq_ignore_ascii_case("metal"))
843 {
844 return self.invalid(
845 "FERRUM_GREEDY_ARGMAX",
846 "greedy argmax requires CUDA or Metal backend",
847 );
848 }
849 Ok(())
850 }
851
852 fn validate_memory(
853 &self,
854 kv_blocks: usize,
855 max_sequences: usize,
856 max_batched_tokens: usize,
857 requested_max_model_len: Option<usize>,
858 ) -> Result<(), AutoConfigError> {
859 if kv_blocks == 0 {
860 return self.invalid("FERRUM_KV_MAX_BLOCKS", "must be greater than zero");
861 }
862 if max_sequences == 0 {
863 return self.invalid("FERRUM_PAGED_MAX_SEQS", "must be greater than zero");
864 }
865 if max_batched_tokens < max_sequences {
866 return self.invalid(
867 "FERRUM_MAX_BATCHED_TOKENS",
868 "must be at least FERRUM_PAGED_MAX_SEQS",
869 );
870 }
871 let kv_token_capacity = kv_blocks.saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS);
872 if max_batched_tokens > kv_token_capacity {
873 return self.invalid(
874 "FERRUM_MAX_BATCHED_TOKENS",
875 "exceeds KV cache token capacity",
876 );
877 }
878 if let Some(max_model_len) = requested_max_model_len {
879 if max_model_len == 0 {
880 return self.invalid("FERRUM_MAX_MODEL_LEN", "must be greater than zero");
881 }
882 if let Some(model_max) = self.model.max_context_len {
883 if max_model_len > model_max {
884 return self.invalid(
885 "FERRUM_MAX_MODEL_LEN",
886 "exceeds model metadata max context length",
887 );
888 }
889 }
890 if max_model_len > kv_token_capacity {
891 return self.invalid(
892 "FERRUM_KV_MAX_BLOCKS",
893 "KV cache token capacity is smaller than FERRUM_MAX_MODEL_LEN",
894 );
895 }
896 }
897 Ok(())
898 }
899
900 fn validate_dtypes(&self) -> Result<(), AutoConfigError> {
901 if let Some(dtype) = self.raw("FERRUM_DTYPE") {
902 let dtype = dtype.to_ascii_lowercase();
903 if !self.hardware.supported_dtypes.iter().any(|d| d == &dtype) {
904 return self.invalid("FERRUM_DTYPE", "dtype is not supported by hardware profile");
905 }
906 }
907 if let Some(dtype) = self.raw("FERRUM_KV_DTYPE") {
908 let dtype = dtype.to_ascii_lowercase();
909 if !self
910 .hardware
911 .supported_kv_dtypes
912 .iter()
913 .any(|d| d == &dtype)
914 {
915 return self.invalid(
916 "FERRUM_KV_DTYPE",
917 "KV dtype is not supported by hardware profile",
918 );
919 }
920 }
921 Ok(())
922 }
923
924 fn attention_prefill_decision(
925 &self,
926 use_vllm_paged_attn: ResolvedValue<bool>,
927 fa_layout: ResolvedValue<bool>,
928 fa2_source: ResolvedValue<bool>,
929 fa2_direct_ffi: ResolvedValue<bool>,
930 ) -> AutoConfigDecision {
931 let (selected, source, source_key) = if fa2_source.value {
932 ("fa2_source", fa2_source.source, fa2_source.source_key)
933 } else if fa2_direct_ffi.value {
934 (
935 "fa2_direct_ffi",
936 fa2_direct_ffi.source,
937 fa2_direct_ffi.source_key,
938 )
939 } else if fa_layout.value {
940 ("fa_layout_varlen", fa_layout.source, fa_layout.source_key)
941 } else if use_vllm_paged_attn.value {
942 (
943 "vllm_paged_varlen",
944 use_vllm_paged_attn.source,
945 use_vllm_paged_attn.source_key,
946 )
947 } else {
948 ("legacy_paged_varlen", AutoConfigSource::Default, None)
949 };
950 self.decision(
951 "attention_prefill_mixed_backend",
952 selected,
953 source,
954 source_key,
955 [
956 "fa2_source",
957 "fa2_direct_ffi",
958 "fa_layout_varlen",
959 "vllm_paged_varlen",
960 "legacy_paged_varlen",
961 ],
962 self.rejected_except(
963 selected,
964 [
965 ("fa2_source", "source-built FA2 not selected"),
966 ("fa2_direct_ffi", "diagnostic direct FFI shim not selected"),
967 ("fa_layout_varlen", "FA-compatible layout not selected"),
968 ("vllm_paged_varlen", "vLLM paged varlen bridge not selected"),
969 (
970 "legacy_paged_varlen",
971 "a higher-priority attention path was selected",
972 ),
973 ],
974 ),
975 vec![
976 RuntimeConfigEffect::Performance,
977 RuntimeConfigEffect::Memory,
978 ],
979 )
980 }
981
982 fn attention_decode_decision(
983 &self,
984 use_vllm_paged_attn: ResolvedValue<bool>,
985 vllm_v1_short: ResolvedValue<bool>,
986 ) -> AutoConfigDecision {
987 let (selected, source, source_key) = if use_vllm_paged_attn.value {
988 if vllm_v1_short.value {
989 (
990 "vllm_paged_attn_v1_short",
991 vllm_v1_short.source,
992 vllm_v1_short.source_key,
993 )
994 } else {
995 (
996 "vllm_paged_attn_v2",
997 vllm_v1_short.source,
998 vllm_v1_short.source_key,
999 )
1000 }
1001 } else {
1002 ("legacy_paged_decode", use_vllm_paged_attn.source, None)
1003 };
1004 self.decision(
1005 "attention_decode_backend",
1006 selected,
1007 source,
1008 source_key,
1009 [
1010 "vllm_paged_attn_v1_short",
1011 "vllm_paged_attn_v2",
1012 "legacy_paged_decode",
1013 ],
1014 self.rejected_except(
1015 selected,
1016 [
1017 (
1018 "vllm_paged_attn_v1_short",
1019 "short-context v1 decode not selected",
1020 ),
1021 ("vllm_paged_attn_v2", "v2 decode not selected"),
1022 ("legacy_paged_decode", "legacy decode not selected"),
1023 ],
1024 ),
1025 vec![RuntimeConfigEffect::Performance],
1026 )
1027 }
1028
1029 fn moe_decision(
1030 &self,
1031 vllm_moe: ResolvedValue<bool>,
1032 device_route: ResolvedValue<bool>,
1033 pair_ids: ResolvedValue<bool>,
1034 ) -> AutoConfigDecision {
1035 let selected = if vllm_moe.value && device_route.value && pair_ids.value {
1036 "vllm_marlin_moe_device_route_pair_ids"
1037 } else if vllm_moe.value && device_route.value {
1038 "vllm_marlin_moe_device_route"
1039 } else if vllm_moe.value {
1040 "vllm_marlin_moe"
1041 } else {
1042 "legacy_moe"
1043 };
1044 self.decision(
1045 "moe_implementation",
1046 selected,
1047 vllm_moe.source,
1048 vllm_moe.source_key,
1049 [
1050 "vllm_marlin_moe_device_route_pair_ids",
1051 "vllm_marlin_moe_device_route",
1052 "vllm_marlin_moe",
1053 "legacy_moe",
1054 ],
1055 self.rejected_except(
1056 selected,
1057 [
1058 (
1059 "vllm_marlin_moe_device_route_pair_ids",
1060 "pair-id device route not selected",
1061 ),
1062 (
1063 "vllm_marlin_moe_device_route",
1064 "device-route MoE not selected",
1065 ),
1066 ("vllm_marlin_moe", "vLLM Marlin MoE not selected"),
1067 ("legacy_moe", "legacy MoE not selected"),
1068 ],
1069 ),
1070 vec![RuntimeConfigEffect::Performance],
1071 )
1072 }
1073
1074 fn graph_decision(&self, graph: ResolvedValue<bool>) -> AutoConfigDecision {
1075 let selected = if graph.value {
1076 "graph_clean_decode"
1077 } else {
1078 "graph_disabled"
1079 };
1080 self.decision(
1081 "moe_graph_policy",
1082 selected,
1083 graph.source,
1084 graph.source_key,
1085 ["graph_clean_decode", "graph_disabled"],
1086 self.rejected_except(
1087 selected,
1088 [
1089 ("graph_clean_decode", "graph decode not selected"),
1090 ("graph_disabled", "graph decode selected"),
1091 ],
1092 ),
1093 vec![
1094 RuntimeConfigEffect::Performance,
1095 RuntimeConfigEffect::Correctness,
1096 ],
1097 )
1098 }
1099
1100 fn scalar_decision(
1101 &self,
1102 selection: &str,
1103 value: ResolvedValue<usize>,
1104 effect: RuntimeConfigEffect,
1105 ) -> AutoConfigDecision {
1106 self.decision(
1107 selection,
1108 &value.value.to_string(),
1109 value.source,
1110 value.source_key,
1111 [value.value.to_string()],
1112 Vec::new(),
1113 vec![effect],
1114 )
1115 }
1116
1117 fn scheduler_decision(&self) -> Result<AutoConfigDecision, AutoConfigError> {
1118 let entries = self.entries();
1119 let mut selected = "continuous_default".to_string();
1120 let mut source_key = None;
1121 if let Some(chunk) = entries.get("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK") {
1122 parse_usize_env_value(chunk).map_err(|reason| AutoConfigError::InvalidOverride {
1123 key: "FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string(),
1124 reason,
1125 })?;
1126 selected = format!("active_decode_prefill_chunk:{chunk}");
1127 source_key = Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string());
1128 } else if let Some(until) = entries.get("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE") {
1129 parse_usize_env_value(until).map_err(|reason| AutoConfigError::InvalidOverride {
1130 key: "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string(),
1131 reason,
1132 })?;
1133 selected = format!("prefill_first_until_active:{until}");
1134 source_key = Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string());
1135 } else if self
1136 .bool_value(
1137 "FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE",
1138 false,
1139 AutoConfigSource::Default,
1140 )?
1141 .value
1142 {
1143 selected = "prompt_token_estimate".to_string();
1144 source_key = Some("FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE".to_string());
1145 }
1146 self.unsupported_if(
1147 source_key.as_deref() == Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
1148 && selected.ends_with(":0"),
1149 "scheduler_admission_policy",
1150 "active decode prefill chunk must be greater than zero",
1151 )?;
1152 Ok(self.decision(
1153 "scheduler_admission_policy",
1154 &selected,
1155 source_key
1156 .as_deref()
1157 .map(|key| self.source_for_key(key, AutoConfigSource::Default))
1158 .unwrap_or(AutoConfigSource::Default),
1159 source_key,
1160 [
1161 "continuous_default",
1162 "prompt_token_estimate",
1163 "prefill_first_until_active",
1164 "active_decode_prefill_chunk",
1165 ],
1166 Vec::new(),
1167 vec![RuntimeConfigEffect::Performance],
1168 ))
1169 }
1170
1171 fn prefix_cache_decision(&self, prefix_cache: ResolvedValue<bool>) -> AutoConfigDecision {
1172 let selected = if prefix_cache.value {
1173 "prefix_cache_enabled"
1174 } else {
1175 "prefix_cache_disabled"
1176 };
1177 self.decision(
1178 "prefix_cache_policy",
1179 selected,
1180 prefix_cache.source,
1181 prefix_cache.source_key,
1182 ["prefix_cache_enabled", "prefix_cache_disabled"],
1183 self.rejected_except(
1184 selected,
1185 [
1186 ("prefix_cache_enabled", "prefix cache not selected"),
1187 ("prefix_cache_disabled", "prefix cache enabled"),
1188 ],
1189 ),
1190 vec![
1191 RuntimeConfigEffect::Correctness,
1192 RuntimeConfigEffect::Performance,
1193 RuntimeConfigEffect::Memory,
1194 ],
1195 )
1196 }
1197
1198 fn sampling_decision(&self, greedy: ResolvedValue<bool>) -> AutoConfigDecision {
1199 let selected = if greedy.value {
1200 "gpu_greedy_argmax"
1201 } else {
1202 "logits_readback"
1203 };
1204 self.decision(
1205 "sampling_readback_path",
1206 selected,
1207 greedy.source,
1208 greedy.source_key,
1209 ["gpu_greedy_argmax", "logits_readback"],
1210 self.rejected_except(
1211 selected,
1212 [
1213 ("gpu_greedy_argmax", "GPU argmax not selected"),
1214 ("logits_readback", "logits readback not selected"),
1215 ],
1216 ),
1217 vec![
1218 RuntimeConfigEffect::Performance,
1219 RuntimeConfigEffect::Correctness,
1220 ],
1221 )
1222 }
1223
1224 fn decision<I, C>(
1225 &self,
1226 selection: &str,
1227 selected: &str,
1228 source: AutoConfigSource,
1229 source_key: Option<String>,
1230 candidates: I,
1231 rejected: Vec<RejectedCandidate>,
1232 affects: Vec<RuntimeConfigEffect>,
1233 ) -> AutoConfigDecision
1234 where
1235 I: IntoIterator<Item = C>,
1236 C: Into<String>,
1237 {
1238 AutoConfigDecision {
1239 schema_version: 1,
1240 selection: selection.to_string(),
1241 selected: selected.to_string(),
1242 source,
1243 source_key,
1244 candidates: candidates.into_iter().map(Into::into).collect(),
1245 rejected,
1246 affects,
1247 }
1248 }
1249
1250 fn rejected_except<I>(&self, selected: &str, candidates: I) -> Vec<RejectedCandidate>
1251 where
1252 I: IntoIterator<Item = (&'static str, &'static str)>,
1253 {
1254 candidates
1255 .into_iter()
1256 .filter(|(value, _)| *value != selected)
1257 .map(|(value, reason)| RejectedCandidate {
1258 value: value.to_string(),
1259 reason: reason.to_string(),
1260 })
1261 .collect()
1262 }
1263
1264 fn invalid<T>(&self, key: &str, reason: &str) -> Result<T, AutoConfigError> {
1265 Err(AutoConfigError::InvalidOverride {
1266 key: key.to_string(),
1267 reason: reason.to_string(),
1268 })
1269 }
1270
1271 fn unsupported<T>(&self, selection: &str, reason: &str) -> Result<T, AutoConfigError> {
1272 Err(AutoConfigError::UnsupportedCombination {
1273 selection: selection.to_string(),
1274 reason: reason.to_string(),
1275 })
1276 }
1277
1278 fn unsupported_if(
1279 &self,
1280 condition: bool,
1281 selection: &str,
1282 reason: &str,
1283 ) -> Result<(), AutoConfigError> {
1284 if condition {
1285 self.unsupported(selection, reason)
1286 } else {
1287 Ok(())
1288 }
1289 }
1290}
1291
/// A resolved setting bundled with where it came from.
///
/// The decision builders in this file forward `source` and `source_key`
/// unchanged into the resulting `AutoConfigDecision`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ResolvedValue<T> {
    /// The effective value of the setting.
    value: T,
    /// The origin that supplied the value.
    source: AutoConfigSource,
    /// Key that supplied the value, when there is one.
    source_key: Option<String>,
}
1298
/// Parses a compute capability written as `major.minor` or just `major`.
///
/// Surrounding whitespace is ignored and a missing minor part counts as `0`.
/// Returns `None` for empty input or when either part is not a valid `u32`.
fn parse_compute_capability(value: &str) -> Option<(u32, u32)> {
    let trimmed = value.trim();
    if trimmed.is_empty() {
        return None;
    }
    // Only the first '.' separates the parts; anything after it must parse
    // as the minor number in full.
    let (major_text, minor_text) = match trimmed.split_once('.') {
        Some(parts) => parts,
        None => (trimmed, "0"),
    };
    let major: u32 = major_text.trim().parse().ok()?;
    let minor: u32 = minor_text.trim().parse().ok()?;
    Some((major, minor))
}
1307
1308fn vram_default_max_sequences(vram_bytes: u64) -> usize {
1309 match vram_bytes {
1310 bytes if bytes >= 20 * GIB => 32,
1311 bytes if bytes >= 12 * GIB => 16,
1312 bytes if bytes >= 8 * GIB => 8,
1313 _ => 4,
1314 }
1315}
1316
/// Divides `value` by `divisor`, rounding the quotient up.
///
/// # Panics
///
/// Panics when `divisor` is zero.
fn ceil_div(value: usize, divisor: usize) -> usize {
    usize::div_ceil(value, divisor)
}
1320
1321fn auto_config_source_from_runtime(source: RuntimeConfigSource) -> AutoConfigSource {
1322 match source {
1323 RuntimeConfigSource::Default => AutoConfigSource::Default,
1324 RuntimeConfigSource::ConfigFile => AutoConfigSource::ConfigFile,
1325 RuntimeConfigSource::Cli => AutoConfigSource::Cli,
1326 RuntimeConfigSource::Env => AutoConfigSource::Env,
1327 RuntimeConfigSource::ScriptCase => AutoConfigSource::ScriptCase,
1328 RuntimeConfigSource::MemoryProfile => AutoConfigSource::MemoryProfile,
1329 }
1330}
1331
1332#[cfg(test)]
1333mod tests {
1334 use super::*;
1335
1336 fn snapshot(vars: &[(&str, &str)]) -> RuntimeConfigSnapshot {
1337 RuntimeConfigSnapshot::from_env_vars(vars.iter().copied())
1338 }
1339
1340 fn snapshot_with_sources(vars: &[(&str, &str, RuntimeConfigSource)]) -> RuntimeConfigSnapshot {
1341 let mut entries: Vec<_> = vars
1342 .iter()
1343 .map(|(key, effective_value, source)| RuntimeConfigEntry {
1344 key: (*key).to_string(),
1345 effective_value: (*effective_value).to_string(),
1346 source: *source,
1347 affects: vec![RuntimeConfigEffect::Performance],
1348 })
1349 .collect();
1350 entries.sort_by(|a, b| a.key.cmp(&b.key));
1351 RuntimeConfigSnapshot { entries }
1352 }
1353
1354 fn m3(vars: &[(&str, &str)], features: CompiledKernelFeatures) -> FerrumConfigBuilder {
1355 FerrumConfigBuilder::new(snapshot(vars))
1356 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1357 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(features))
1358 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1359 }
1360
1361 fn m3_with_hardware(
1362 vars: &[(&str, &str)],
1363 hardware: HardwareCapabilities,
1364 ) -> FerrumConfigBuilder {
1365 FerrumConfigBuilder::new(snapshot(vars))
1366 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1367 .with_hardware_capabilities(hardware)
1368 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1369 }
1370
1371 fn expect_invalid_key(vars: &[(&str, &str)], key: &str) {
1372 expect_invalid_key_with_features(
1373 vars,
1374 key,
1375 CompiledKernelFeatures::m3_fast_path_without_fa2(),
1376 );
1377 }
1378
1379 fn expect_invalid_key_with_features(
1380 vars: &[(&str, &str)],
1381 key: &str,
1382 features: CompiledKernelFeatures,
1383 ) {
1384 expect_invalid_key_with_hardware(vars, key, HardwareCapabilities::rtx4090_cuda(features));
1385 }
1386
1387 fn expect_invalid_key_with_hardware(
1388 vars: &[(&str, &str)],
1389 key: &str,
1390 hardware: HardwareCapabilities,
1391 ) {
1392 let err = m3_with_hardware(vars, hardware)
1393 .resolve()
1394 .expect_err("override should fail");
1395 match err {
1396 AutoConfigError::InvalidOverride { key: actual, .. } => assert_eq!(actual, key),
1397 other => panic!("expected invalid override for {key}, got {other:?}"),
1398 }
1399 }
1400
1401 fn cpu_hardware_with_features(features: CompiledKernelFeatures) -> HardwareCapabilities {
1402 HardwareCapabilities {
1403 backend: "cpu".to_string(),
1404 supported_dtypes: vec!["fp32".to_string()],
1405 supported_kv_dtypes: vec!["fp16".to_string()],
1406 compiled_features: features,
1407 ..HardwareCapabilities::unknown()
1408 }
1409 }
1410
/// With no overrides and the `m3_fast_path_without_fa2` feature set, the M3
/// preset resolves to the expected backend, MoE, graph, prefix-cache and
/// sampling selections and reports the M3 preset name.
#[test]
fn m3_preset_selects_current_safe_fast_path_without_fa2() {
    let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
        .resolve()
        .unwrap();
    let mut selected_by_name = BTreeMap::new();
    for entry in &resolved.decisions {
        selected_by_name.insert(entry.selection.as_str(), entry.selected.as_str());
    }

    let expectations = [
        ("attention_prefill_mixed_backend", "vllm_paged_varlen"),
        ("attention_decode_backend", "vllm_paged_attn_v1_short"),
        (
            "moe_implementation",
            "vllm_marlin_moe_device_route_pair_ids",
        ),
        ("moe_graph_policy", "graph_clean_decode"),
        ("prefix_cache_policy", "prefix_cache_disabled"),
        ("sampling_readback_path", "gpu_greedy_argmax"),
    ];
    for (selection, expected) in expectations {
        assert_eq!(selected_by_name[selection], expected);
    }
    assert_eq!(
        resolved.preset.as_deref(),
        Some(M3_QWEN3_30B_A3B_INT4_PRESET)
    );
}
1441
/// `FERRUM_FA2_SOURCE=1` selects `fa2_source` when the source FA2 feature set
/// is compiled in, and is rejected as an invalid override with the default
/// helper feature set.
#[test]
fn source_fa2_selects_only_when_compiled() {
    let overrides = [("FERRUM_FA2_SOURCE", "1")];
    let features = CompiledKernelFeatures::m3_fast_path_with_source_fa2();
    let resolved = m3(&overrides, features).resolve().unwrap();
    let prefill = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "attention_prefill_mixed_backend")
        .unwrap();
    assert_eq!(prefill.selected, "fa2_source");

    expect_invalid_key(&overrides, "FERRUM_FA2_SOURCE");
}
1458
/// On the CPU test hardware the M3 preset resolves to the legacy attention
/// and MoE paths, with graph disabled and logits readback.
#[test]
fn hardware_capabilities_keep_m3_preset_on_compatible_backend_paths() {
    let cpu = cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2());
    let resolved = m3_with_hardware(&[], cpu).resolve().unwrap();
    let mut selected_by_name = BTreeMap::new();
    for entry in &resolved.decisions {
        selected_by_name.insert(entry.selection.as_str(), entry.selected.as_str());
    }

    let expectations = [
        ("attention_prefill_mixed_backend", "legacy_paged_varlen"),
        ("attention_decode_backend", "legacy_paged_decode"),
        ("moe_implementation", "legacy_moe"),
        ("moe_graph_policy", "graph_disabled"),
        ("sampling_readback_path", "logits_readback"),
    ];
    for (selection, expected) in expectations {
        assert_eq!(selected_by_name[selection], expected);
    }
}
1482
/// Overrides that enable the vLLM attention, vLLM MoE, greedy argmax or FA2
/// source paths are rejected on the CPU test hardware, and FA2 source is also
/// rejected on CUDA hardware reporting compute capability 7.5.
#[test]
fn hardware_incompatible_attention_and_sampling_overrides_are_rejected() {
    let cpu = cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2());
    let rejected_on_cpu = [
        "FERRUM_USE_VLLM_PAGED_ATTN",
        "FERRUM_VLLM_MOE",
        "FERRUM_GREEDY_ARGMAX",
        "FERRUM_FA2_SOURCE",
    ];
    for key in rejected_on_cpu {
        expect_invalid_key_with_hardware(&[(key, "1")], key, cpu.clone());
    }

    let features = CompiledKernelFeatures::m3_fast_path_with_source_fa2();
    let mut old_cuda = HardwareCapabilities::rtx4090_cuda(features);
    old_cuda.compute_capability = Some("7.5".to_string());
    expect_invalid_key_with_hardware(
        &[("FERRUM_FA2_SOURCE", "1")],
        "FERRUM_FA2_SOURCE",
        old_cuda,
    );
}
1514
/// With 16 SMs and 24 GiB of VRAM the defaults resolve to 4 sequences and 256
/// batched tokens, both attributed to hardware capability; an explicit
/// `FERRUM_PAGED_MAX_SEQS` still wins and is attributed to the environment.
#[test]
fn hardware_capacity_sizes_default_sequence_budget_without_overriding_user_values() {
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let mut small_gpu = HardwareCapabilities::rtx4090_cuda(features);
    small_gpu.sm_count = Some(16);
    small_gpu.vram_bytes = Some(24 * 1024 * 1024 * 1024);

    let defaults = m3_with_hardware(&[], small_gpu.clone()).resolve().unwrap();
    let lookup = |name: &str| {
        defaults
            .decisions
            .iter()
            .find(|entry| entry.selection == name)
            .unwrap()
    };
    let default_sequences = lookup("max_sequences");
    assert_eq!(default_sequences.selected, "4");
    assert_eq!(
        default_sequences.source,
        AutoConfigSource::HardwareCapability
    );
    let default_tokens = lookup("max_batched_tokens");
    assert_eq!(default_tokens.selected, "256");
    assert_eq!(default_tokens.source, AutoConfigSource::HardwareCapability);

    let overrides = [("FERRUM_PAGED_MAX_SEQS", "16")];
    let overridden = m3_with_hardware(&overrides, small_gpu).resolve().unwrap();
    let user_sequences = overridden
        .decisions
        .iter()
        .find(|entry| entry.selection == "max_sequences")
        .unwrap();
    assert_eq!(user_sequences.selected, "16");
    assert_eq!(user_sequences.source, AutoConfigSource::Env);
    assert_eq!(
        user_sequences.source_key.as_deref(),
        Some("FERRUM_PAGED_MAX_SEQS")
    );
}
1555
/// With 128 SMs but only 7 GiB of VRAM the default sequence budget resolves
/// to 4 and is attributed to hardware capability.
#[test]
fn vram_capacity_caps_m3_default_sequence_budget() {
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let mut low_vram_gpu = HardwareCapabilities::rtx4090_cuda(features);
    low_vram_gpu.sm_count = Some(128);
    low_vram_gpu.vram_bytes = Some(7 * 1024 * 1024 * 1024);

    let resolved = m3_with_hardware(&[], low_vram_gpu).resolve().unwrap();
    let sequences = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "max_sequences")
        .unwrap();
    assert_eq!(sequences.selected, "4");
    assert_eq!(sequences.source, AutoConfigSource::HardwareCapability);
}
1572
/// The stock RTX 4090 hardware keeps 2048 KV blocks from the workload preset;
/// with VRAM set to 20 GiB the KV block count resolves to 2 and batched
/// tokens to 32, both attributed to hardware capability.
#[test]
fn memory_budget_keeps_rtx4090_m3_kv_blocks_but_caps_constrained_vram() {
    let roomy = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
        .resolve()
        .unwrap();
    let roomy_lookup = |name: &str| {
        roomy
            .decisions
            .iter()
            .find(|entry| entry.selection == name)
            .unwrap()
    };
    assert_eq!(roomy_lookup("kv_block_count").selected, "2048");
    assert_eq!(
        roomy_lookup("kv_block_count").source,
        AutoConfigSource::WorkloadPreset
    );

    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let mut constrained = HardwareCapabilities::rtx4090_cuda(features);
    constrained.vram_bytes = Some(20 * 1024 * 1024 * 1024);
    let tight = m3_with_hardware(&[], constrained).resolve().unwrap();
    let tight_lookup = |name: &str| {
        tight
            .decisions
            .iter()
            .find(|entry| entry.selection == name)
            .unwrap()
    };
    assert_eq!(tight_lookup("kv_block_count").selected, "2");
    assert_eq!(
        tight_lookup("kv_block_count").source,
        AutoConfigSource::HardwareCapability
    );
    assert_eq!(tight_lookup("max_batched_tokens").selected, "32");
    assert_eq!(
        tight_lookup("max_batched_tokens").source,
        AutoConfigSource::HardwareCapability
    );
}
1613
/// The parser accepts `major.minor` and bare `major`, and rejects
/// non-numeric text.
#[test]
fn compute_capability_parser_accepts_major_minor_and_major_only() {
    let cases: [(&str, Option<(u32, u32)>); 3] = [
        ("8.9", Some((8, 9))),
        ("9", Some((9, 0))),
        ("N/A", None),
    ];
    for (input, expected) in cases {
        assert_eq!(parse_compute_capability(input), expected);
    }
}
1620
/// Spot-checks one VRAM size per tier: 24, 16, 8 and 6 GiB map to 32, 16, 8
/// and 4 sequences.
#[test]
fn vram_capacity_tiers_are_monotonic() {
    let cases: [(u64, usize); 4] = [(24, 32), (16, 16), (8, 8), (6, 4)];
    for (gib, expected) in cases {
        assert_eq!(
            vram_default_max_sequences(gib * 1024 * 1024 * 1024),
            expected
        );
    }
}
1628
/// Runs a matrix of malformed or conflicting overrides and checks that each
/// one is rejected as an invalid override naming the expected key.
#[test]
fn validates_invalid_override_matrix() {
    let reject = |vars: &[(&str, &str)], key: &str| expect_invalid_key(vars, key);

    // Attention and prefix-cache overrides.
    reject(
        &[("FERRUM_USE_VLLM_PAGED_ATTN", "maybe")],
        "FERRUM_USE_VLLM_PAGED_ATTN",
    );
    reject(&[("FERRUM_PREFIX_CACHE", "maybe")], "FERRUM_PREFIX_CACHE");
    reject(
        &[
            ("FERRUM_FA_LAYOUT_VARLEN", "1"),
            ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
        ],
        "FERRUM_FA_LAYOUT_VARLEN",
    );
    reject(&[("FERRUM_FA2_DIRECT_FFI", "1")], "FERRUM_FA2_DIRECT_FFI");

    // MoE overrides.
    expect_invalid_key_with_features(
        &[("FERRUM_VLLM_MOE", "1")],
        "FERRUM_VLLM_MOE",
        CompiledKernelFeatures::default(),
    );
    reject(
        &[("FERRUM_MOE_DEVICE_ROUTE", "1"), ("FERRUM_VLLM_MOE", "0")],
        "FERRUM_MOE_DEVICE_ROUTE",
    );
    reject(
        &[("FERRUM_VLLM_MOE_PAIR_IDS", "1"), ("FERRUM_VLLM_MOE", "0")],
        "FERRUM_VLLM_MOE_PAIR_IDS",
    );
    reject(
        &[("FERRUM_MOE_GRAPH", "1"), ("FERRUM_VLLM_MOE", "0")],
        "FERRUM_MOE_GRAPH",
    );

    // Capacity overrides.
    reject(&[("FERRUM_KV_MAX_BLOCKS", "0")], "FERRUM_KV_MAX_BLOCKS");
    reject(&[("FERRUM_PAGED_MAX_SEQS", "0")], "FERRUM_PAGED_MAX_SEQS");
    reject(
        &[
            ("FERRUM_PAGED_MAX_SEQS", "32"),
            ("FERRUM_MAX_BATCHED_TOKENS", "16"),
        ],
        "FERRUM_MAX_BATCHED_TOKENS",
    );
    reject(
        &[
            ("FERRUM_KV_MAX_BLOCKS", "16"),
            ("FERRUM_MAX_BATCHED_TOKENS", "512"),
        ],
        "FERRUM_MAX_BATCHED_TOKENS",
    );
    reject(&[("FERRUM_MAX_MODEL_LEN", "0")], "FERRUM_MAX_MODEL_LEN");
    reject(&[("FERRUM_MAX_MODEL_LEN", "50000")], "FERRUM_MAX_MODEL_LEN");
    reject(
        &[
            ("FERRUM_KV_MAX_BLOCKS", "16"),
            ("FERRUM_MAX_MODEL_LEN", "1024"),
        ],
        "FERRUM_KV_MAX_BLOCKS",
    );

    // Dtype overrides.
    reject(&[("FERRUM_DTYPE", "bf16")], "FERRUM_DTYPE");
    reject(&[("FERRUM_KV_DTYPE", "fp8")], "FERRUM_KV_DTYPE");

    // Short paged-attention variant without paged attention.
    reject(
        &[
            ("FERRUM_VLLM_PAGED_ATTN_V1_SHORT", "1"),
            ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
        ],
        "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
    );
}
1696
/// No `max_model_len` decision is emitted by default; when
/// `FERRUM_MAX_MODEL_LEN` is supplied with enough KV blocks the decision
/// appears and reflects the override.
#[test]
fn requested_max_model_len_is_optional_and_reflected_when_valid() {
    let defaults = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
        .resolve()
        .unwrap();
    let has_default_decision = defaults
        .decisions
        .iter()
        .any(|entry| entry.selection == "max_model_len");
    assert!(!has_default_decision);

    let overrides = [
        ("FERRUM_KV_MAX_BLOCKS", "64"),
        ("FERRUM_MAX_MODEL_LEN", "1024"),
    ];
    let resolved = m3(
        &overrides,
        CompiledKernelFeatures::m3_fast_path_without_fa2(),
    )
    .resolve()
    .unwrap();
    let requested = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "max_model_len")
        .unwrap();
    assert_eq!(requested.selected, "1024");
    assert_eq!(
        requested.source_key.as_deref(),
        Some("FERRUM_MAX_MODEL_LEN")
    );
}
1727
/// Forcing `FERRUM_MOE_GRAPH=1` on a model whose MoE is not graph-safe fails
/// with an unsupported-combination error for `moe_graph_policy`.
#[test]
fn graph_enabled_with_graph_unsafe_moe_is_rejected() {
    let model = ModelCapabilities {
        graph_safe_moe: false,
        ..ModelCapabilities::qwen3_30b_a3b_gptq_int4()
    };
    let runtime_config = snapshot(&[("FERRUM_MOE_GRAPH", "1")]);
    let hardware =
        HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
    let err = FerrumConfigBuilder::new(runtime_config)
        .with_model_capabilities(model)
        .with_hardware_capabilities(hardware)
        .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
        .resolve()
        .expect_err("graph unsafe MoE must fail");

    let rejected_graph_policy = matches!(
        &err,
        AutoConfigError::UnsupportedCombination { selection, .. }
            if selection == "moe_graph_policy"
    );
    assert!(rejected_graph_policy);
}
1748
/// `FERRUM_ACTIVE_DECODE_PREFILL_CHUNK=64` shows up in the scheduler decision
/// as both the selected value and the source key.
#[test]
fn scheduler_override_is_reflected_in_decision_trace() {
    let overrides = [("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK", "64")];
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let resolved = m3(&overrides, features).resolve().unwrap();

    let scheduler = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "scheduler_admission_policy")
        .unwrap();
    assert_eq!(scheduler.selected, "active_decode_prefill_chunk:64");
    assert_eq!(
        scheduler.source_key.as_deref(),
        Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
    );
}
1768
/// `FERRUM_PREFIX_CACHE=1` shows up in the prefix-cache decision as the
/// enabled selection with the override as its source key.
#[test]
fn prefix_cache_override_is_reflected_in_decision_trace() {
    let overrides = [("FERRUM_PREFIX_CACHE", "1")];
    let features = CompiledKernelFeatures::m3_fast_path_without_fa2();
    let resolved = m3(&overrides, features).resolve().unwrap();

    let prefix_cache = resolved
        .decisions
        .iter()
        .find(|entry| entry.selection == "prefix_cache_policy")
        .unwrap();
    assert_eq!(prefix_cache.selected, "prefix_cache_enabled");
    assert_eq!(
        prefix_cache.source_key.as_deref(),
        Some("FERRUM_PREFIX_CACHE")
    );
}
1788
/// Overrides that come from a config file, the CLI or a script case keep that
/// source (and their key) in the resulting decisions.
#[test]
fn non_env_runtime_sources_are_preserved_in_decision_trace() {
    let runtime_config = snapshot_with_sources(&[
        (
            "FERRUM_FA_LAYOUT_VARLEN",
            "1",
            RuntimeConfigSource::ConfigFile,
        ),
        ("FERRUM_PAGED_MAX_SEQS", "48", RuntimeConfigSource::Cli),
        (
            "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE",
            "32",
            RuntimeConfigSource::ScriptCase,
        ),
    ]);
    let hardware =
        HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
    let resolved = FerrumConfigBuilder::new(runtime_config)
        .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
        .with_hardware_capabilities(hardware)
        .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
        .resolve()
        .unwrap();
    let lookup = |name: &str| {
        resolved
            .decisions
            .iter()
            .find(|entry| entry.selection == name)
            .unwrap()
    };

    let attention = lookup("attention_prefill_mixed_backend");
    assert_eq!(attention.selected, "fa_layout_varlen");
    assert_eq!(attention.source, AutoConfigSource::ConfigFile);
    assert_eq!(
        attention.source_key.as_deref(),
        Some("FERRUM_FA_LAYOUT_VARLEN")
    );

    let sequences = lookup("max_sequences");
    assert_eq!(sequences.selected, "48");
    assert_eq!(sequences.source, AutoConfigSource::Cli);
    assert_eq!(
        sequences.source_key.as_deref(),
        Some("FERRUM_PAGED_MAX_SEQS")
    );

    let scheduler = lookup("scheduler_admission_policy");
    assert_eq!(scheduler.selected, "prefill_first_until_active:32");
    assert_eq!(scheduler.source, AutoConfigSource::ScriptCase);
    assert_eq!(
        scheduler.source_key.as_deref(),
        Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE")
    );
}
1844
/// The effective-config document exposes the schema version, env hash,
/// entries, capabilities, workload preset and every decision, and the JSONL
/// trace has one line per decision.
#[test]
fn renders_effective_config_and_decision_trace_artifacts() {
    let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
        .resolve()
        .unwrap();
    let decision_count = resolved.decisions.len();

    let effective = resolved.effective_config_document();
    assert_eq!(effective["schema_version"], 1);
    let env_hash = effective["env_hash"].as_str().unwrap();
    assert!(env_hash.starts_with("sha256:"));
    assert!(effective["entries"].is_array());
    assert_eq!(effective["model_capabilities"]["architecture"], "qwen3_moe");
    assert_eq!(effective["hardware_capabilities"]["backend"], "cuda");
    assert_eq!(
        effective["workload_profile"]["preset"],
        M3_QWEN3_30B_A3B_INT4_PRESET
    );
    let rendered_decisions = effective["decisions"].as_array().unwrap();
    assert_eq!(rendered_decisions.len(), decision_count);

    let trace = resolved.decision_trace_jsonl().unwrap();
    assert_eq!(trace.lines().count(), decision_count);
    assert!(trace.contains("\"attention_prefill_mixed_backend\""));
}
1871
/// Pins the shape of the emitted artifacts: sorted `FERRUM_` entries with
/// known source names, selected capability and workload fields, a JSONL trace
/// that round-trips to the resolved decisions, and non-empty decision fields.
#[test]
fn auto_config_artifacts_match_locked_schema_shape() {
    let runtime_config = snapshot_with_sources(&[
        (
            "FERRUM_FA_LAYOUT_VARLEN",
            "1",
            RuntimeConfigSource::ScriptCase,
        ),
        ("FERRUM_PAGED_MAX_SEQS", "32", RuntimeConfigSource::Cli),
    ]);
    let resolved = FerrumConfigBuilder::m3_qwen3_30b_a3b_int4(runtime_config)
        .resolve()
        .unwrap();

    let effective = resolved.effective_config_document();
    assert_eq!(effective["schema_version"], 1);
    let env_hash = effective["env_hash"].as_str().unwrap();
    assert!(env_hash.starts_with("sha256:"));

    // Entries must already be ordered by key.
    let entries = effective["entries"].as_array().unwrap();
    let keys: Vec<&str> = entries
        .iter()
        .map(|entry| entry["key"].as_str().unwrap())
        .collect();
    assert!(keys.windows(2).all(|pair| pair[0] <= pair[1]));
    for entry in entries {
        let key = entry["key"].as_str().unwrap();
        assert!(key.starts_with("FERRUM_"));
        assert!(entry["effective_value"].is_string());
        let source = entry["source"].as_str().unwrap();
        assert!(matches!(
            source,
            "default" | "config_file" | "cli" | "env" | "script_case" | "memory_profile"
        ));
        let affects = entry["affects"].as_array().unwrap();
        assert!(!affects.is_empty());
    }

    let model = &effective["model_capabilities"];
    assert_eq!(model["quantization"].as_str(), Some("gptq_int4"));
    assert_eq!(model["moe"]["experts_per_token"].as_u64(), Some(8));
    let hardware = &effective["hardware_capabilities"];
    assert_eq!(hardware["compute_capability"].as_str(), Some("8.9"));
    assert_eq!(
        hardware["compiled_features"]["vllm_moe_marlin"].as_bool(),
        Some(true)
    );
    let workload = &effective["workload_profile"];
    assert_eq!(workload["target_concurrency"].as_u64(), Some(32));
    assert_eq!(workload["priority"].as_str(), Some("throughput"));

    // The JSONL trace and the embedded decisions must both round-trip.
    let trace = resolved.decision_trace_jsonl().unwrap();
    let mut trace_decisions: Vec<AutoConfigDecision> = Vec::new();
    for line in trace.lines() {
        trace_decisions.push(serde_json::from_str(line).unwrap());
    }
    assert_eq!(trace_decisions, resolved.decisions);
    let embedded: Vec<AutoConfigDecision> =
        serde_json::from_value(effective["decisions"].clone()).unwrap();
    assert_eq!(embedded, trace_decisions);

    for entry in &trace_decisions {
        assert_eq!(entry.schema_version, 1);
        assert!(!entry.selection.trim().is_empty());
        assert!(!entry.selected.trim().is_empty());
        assert!(!entry.candidates.is_empty());
        assert!(!entry.affects.is_empty());
        if let Some(key) = entry.source_key.as_deref() {
            assert!(key.starts_with("FERRUM_"));
        }
        for candidate in &entry.rejected {
            assert!(!candidate.value.trim().is_empty());
            assert!(!candidate.reason.trim().is_empty());
        }
    }
}
1961}