1use crate::{
7 parse_bool_env_value, parse_usize_env_value, RuntimeConfigEffect, RuntimeConfigEntry,
8 RuntimeConfigSnapshot, RuntimeConfigSource,
9};
10use serde::{Deserialize, Serialize};
11use std::collections::BTreeMap;
12use thiserror::Error;
13
/// Preset identifier for the M3 Qwen3-30B-A3B INT4 workload profile.
pub const M3_QWEN3_30B_A3B_INT4_PRESET: &str = "m3_qwen3_30b_a3b_int4";
/// Preset identifier for the Qwen2.5-72B GPTQ INT4 layer-split workload profile
/// (named for a 2x4090 setup).
pub const QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET: &str =
    "qwen25_72b_gptq_int4_2x4090_layer_split";
/// Tokens per KV block used when converting block counts into token capacity.
const DEFAULT_KV_BLOCK_SIZE_TOKENS: usize = 16;
/// KV block count targeted when nothing constrains it further.
const DEFAULT_KV_BLOCKS: usize = 2048;
/// Bytes in one gibibyte (2^30).
const GIB: u64 = 1 << 30;
20
/// Static description of a model that the auto-config resolver consults.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ModelCapabilities {
    /// Architecture label; the built-in constructors use `"unknown"`, `"qwen3_moe"` and `"qwen2"`.
    pub architecture: String,
    /// Quantization label (the built-in presets use `"gptq_int4"`), or `None` when not known.
    pub quantization: Option<String>,
    /// Mixture-of-experts parameters; `None` for models without MoE.
    pub moe: Option<MoeCapabilities>,
    /// Maximum context length — presumably in tokens; confirm against the producer.
    pub max_context_len: Option<usize>,
    /// Number of hidden layers, when known.
    pub num_hidden_layers: Option<usize>,
    /// Attention head dimension, when known.
    pub head_dim: Option<usize>,
    /// Number of key/value heads, when known.
    pub kv_heads: Option<usize>,
    /// Estimated size of the model weights in bytes.
    pub estimated_weight_bytes: Option<u64>,
    /// Dtype labels the model supports (e.g. `"fp16"`, `"fp32"`).
    pub supported_dtypes: Vec<String>,
    /// Whether the model's MoE path is marked graph-safe; MoE graph mode is rejected for
    /// MoE models when this is `false`.
    pub graph_safe_moe: bool,
}
34
35impl ModelCapabilities {
36 pub fn unknown() -> Self {
37 Self {
38 architecture: "unknown".to_string(),
39 quantization: None,
40 moe: None,
41 max_context_len: None,
42 num_hidden_layers: None,
43 head_dim: None,
44 kv_heads: None,
45 estimated_weight_bytes: None,
46 supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
47 graph_safe_moe: false,
48 }
49 }
50
51 pub fn qwen3_30b_a3b_gptq_int4() -> Self {
52 Self {
53 architecture: "qwen3_moe".to_string(),
54 quantization: Some("gptq_int4".to_string()),
55 moe: Some(MoeCapabilities {
56 num_experts: 128,
57 experts_per_token: 8,
58 moe_intermediate_size: Some(768),
59 }),
60 max_context_len: Some(40960),
61 num_hidden_layers: Some(48),
62 head_dim: Some(128),
63 kv_heads: Some(4),
64 estimated_weight_bytes: Some(18 * GIB),
69 supported_dtypes: vec!["fp16".to_string()],
70 graph_safe_moe: false,
71 }
72 }
73
74 pub fn qwen25_72b_gptq_int4() -> Self {
75 Self {
76 architecture: "qwen2".to_string(),
77 quantization: Some("gptq_int4".to_string()),
78 moe: None,
79 max_context_len: Some(32_768),
80 num_hidden_layers: Some(80),
81 head_dim: Some(128),
82 kv_heads: Some(8),
83 estimated_weight_bytes: Some(39 * GIB),
84 supported_dtypes: vec!["fp16".to_string()],
85 graph_safe_moe: false,
86 }
87 }
88}
89
/// Mixture-of-experts parameters of a model.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MoeCapabilities {
    /// Total number of experts.
    pub num_experts: usize,
    /// Number of experts used per token.
    pub experts_per_token: usize,
    /// Intermediate size of the MoE layers, when known.
    pub moe_intermediate_size: Option<usize>,
}
96
/// Description of the hardware/backend that the auto-config resolver consults.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct HardwareCapabilities {
    /// Backend label; this file compares it case-insensitively against `"cuda"`, `"metal"`
    /// and `"cpu"`, and uses `"unknown"` as the placeholder.
    pub backend: String,
    /// CUDA runtime identifier, when known (the built-in constructors leave it `None`).
    pub cuda_runtime: Option<String>,
    /// CUDA compute capability as a string such as `"8.9"`.
    pub compute_capability: Option<String>,
    /// Device memory in bytes, when known.
    pub vram_bytes: Option<u64>,
    /// Streaming multiprocessor count, when known.
    pub sm_count: Option<u32>,
    /// Dtype labels the hardware supports (e.g. `"fp16"`, `"fp32"`).
    pub supported_dtypes: Vec<String>,
    /// KV cache dtype labels the hardware supports (e.g. `"fp16"`, `"int8"`).
    pub supported_kv_dtypes: Vec<String>,
    /// Whether the hardware/backend supports graph replay; required for MoE graph mode.
    pub graph_support: bool,
    /// Kernel features compiled into this build.
    pub compiled_features: CompiledKernelFeatures,
}
109
110impl HardwareCapabilities {
111 pub fn unknown() -> Self {
112 Self {
113 backend: "unknown".to_string(),
114 cuda_runtime: None,
115 compute_capability: None,
116 vram_bytes: None,
117 sm_count: None,
118 supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
119 supported_kv_dtypes: vec!["fp16".to_string()],
120 graph_support: false,
121 compiled_features: CompiledKernelFeatures::default(),
122 }
123 }
124
125 pub fn rtx4090_cuda(features: CompiledKernelFeatures) -> Self {
126 Self {
127 backend: "cuda".to_string(),
128 cuda_runtime: None,
129 compute_capability: Some("8.9".to_string()),
130 vram_bytes: Some(24 * 1024 * 1024 * 1024),
131 sm_count: Some(128),
132 supported_dtypes: vec!["fp16".to_string(), "fp32".to_string()],
133 supported_kv_dtypes: vec!["fp16".to_string(), "int8".to_string()],
134 graph_support: true,
135 compiled_features: features,
136 }
137 }
138}
139
140#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
141pub struct CompiledKernelFeatures {
142 pub cuda: bool,
143 pub vllm_paged_attn: bool,
144 pub vllm_moe_marlin: bool,
145 pub cuda_graph: bool,
146 pub greedy_argmax: bool,
147 pub fa2_source: bool,
148 pub fa2_direct_ffi: bool,
149}
150
151impl Default for CompiledKernelFeatures {
152 fn default() -> Self {
153 Self {
154 cuda: false,
155 vllm_paged_attn: false,
156 vllm_moe_marlin: false,
157 cuda_graph: false,
158 greedy_argmax: false,
159 fa2_source: false,
160 fa2_direct_ffi: false,
161 }
162 }
163}
164
165impl CompiledKernelFeatures {
166 pub fn m3_fast_path_without_fa2() -> Self {
167 Self {
168 cuda: true,
169 vllm_paged_attn: true,
170 vllm_moe_marlin: true,
171 cuda_graph: true,
172 greedy_argmax: true,
173 fa2_source: false,
174 fa2_direct_ffi: false,
175 }
176 }
177
178 pub fn m3_fast_path_with_source_fa2() -> Self {
179 Self {
180 fa2_source: true,
181 ..Self::m3_fast_path_without_fa2()
182 }
183 }
184}
185
/// Description of the workload the configuration is being resolved for.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct WorkloadProfile {
    /// Preset identifier, or `None` for the generic serving default.
    pub preset: Option<String>,
    /// Serving mode label (the constructors use `"openai_chat"` and `"bench_serve"`).
    pub serving_mode: String,
    /// Concurrency the workload targets; used as the starting point for the default
    /// maximum number of sequences.
    pub target_concurrency: usize,
    /// Prompt length class label (e.g. `"random_256"` or `"unknown"`).
    pub prompt_length_class: String,
    /// Output length class label (e.g. `"random_128"` or `"unknown"`).
    pub output_length_class: String,
    /// What the workload prioritizes.
    pub priority: WorkloadPriority,
}
195
196impl WorkloadProfile {
197 pub fn serving_default() -> Self {
198 Self {
199 preset: None,
200 serving_mode: "openai_chat".to_string(),
201 target_concurrency: 1,
202 prompt_length_class: "unknown".to_string(),
203 output_length_class: "unknown".to_string(),
204 priority: WorkloadPriority::Balanced,
205 }
206 }
207
208 pub fn serving_default_for_hardware(hardware: &HardwareCapabilities) -> Self {
209 let mut profile = Self::serving_default();
210 if hardware.backend.eq_ignore_ascii_case("cuda")
211 || hardware.backend.eq_ignore_ascii_case("metal")
212 {
213 profile.target_concurrency = hardware
214 .vram_bytes
215 .map(vram_default_max_sequences)
216 .unwrap_or(4)
217 .max(1);
218 }
219 profile
220 }
221
222 pub fn m3_qwen3_30b_a3b_int4() -> Self {
223 Self {
224 preset: Some(M3_QWEN3_30B_A3B_INT4_PRESET.to_string()),
225 serving_mode: "bench_serve".to_string(),
226 target_concurrency: 32,
227 prompt_length_class: "random_256".to_string(),
228 output_length_class: "random_128".to_string(),
229 priority: WorkloadPriority::Throughput,
230 }
231 }
232
233 pub fn qwen25_72b_gptq_int4_2x4090_layer_split() -> Self {
234 Self {
235 preset: Some(QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET.to_string()),
236 serving_mode: "bench_serve".to_string(),
237 target_concurrency: 16,
238 prompt_length_class: "random_256".to_string(),
239 output_length_class: "random_128".to_string(),
240 priority: WorkloadPriority::Throughput,
241 }
242 }
243
244 fn is_m3_preset(&self) -> bool {
245 self.is_preset(M3_QWEN3_30B_A3B_INT4_PRESET)
246 }
247
248 fn is_preset(&self, preset: &str) -> bool {
249 self.preset.as_deref() == Some(preset)
250 }
251}
252
253impl Default for WorkloadProfile {
254 fn default() -> Self {
255 Self::serving_default()
256 }
257}
258
/// What a workload prioritizes; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum WorkloadPriority {
    /// Prioritize latency.
    Latency,
    /// Prioritize throughput (used by the benchmark presets).
    Throughput,
    /// No particular priority (used by the serving default).
    Balanced,
}
266
/// Outcome of resolving a configuration: the inputs that were consulted plus the list of
/// decisions that were taken.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResolvedFerrumConfig {
    /// Schema version of this structure.
    pub schema_version: u32,
    /// Preset of the workload profile, if any.
    pub preset: Option<String>,
    /// Runtime configuration snapshot after resolution.
    pub runtime_config: RuntimeConfigSnapshot,
    /// Model capabilities used during resolution.
    pub model_capabilities: ModelCapabilities,
    /// Hardware capabilities used during resolution.
    pub hardware_capabilities: HardwareCapabilities,
    /// Workload profile used during resolution.
    pub workload_profile: WorkloadProfile,
    /// Decisions taken, in the order they were recorded.
    pub decisions: Vec<AutoConfigDecision>,
}
277
278impl ResolvedFerrumConfig {
279 pub fn effective_config_document(&self) -> serde_json::Value {
280 let backend = self.hardware_capabilities.backend.clone();
281 let requested_gpu_devices = self
282 .runtime_csv_usize("FERRUM_REQUESTED_GPU_DEVICES")
283 .or_else(|| default_gpu_devices_for_backend(&backend));
284 let selected_gpu_devices = self
285 .runtime_csv_usize("FERRUM_SELECTED_GPU_DEVICES")
286 .or_else(|| requested_gpu_devices.clone())
287 .or_else(|| default_gpu_devices_for_backend(&backend));
288 let cuda_device_count = self
289 .runtime_usize("FERRUM_CUDA_DEVICE_COUNT")
290 .or_else(|| {
291 backend.eq_ignore_ascii_case("cuda").then(|| {
292 selected_gpu_devices
293 .as_ref()
294 .map(|devices| devices.len())
295 .unwrap_or(1)
296 })
297 })
298 .unwrap_or(0);
299 let selected_distributed_strategy = self
300 .runtime_entry_value("FERRUM_SELECTED_DISTRIBUTED_STRATEGY")
301 .unwrap_or_else(|| {
302 if selected_gpu_devices
303 .as_ref()
304 .map(|devices| devices.len() > 1)
305 .unwrap_or(false)
306 {
307 "layer_split".to_string()
308 } else if backend.eq_ignore_ascii_case("cuda") {
309 "single_gpu".to_string()
310 } else {
311 "none".to_string()
312 }
313 });
314 let selected_layer_split_plan =
315 self.runtime_entry_value("FERRUM_SELECTED_LAYER_SPLIT_PLAN");
316 let selected_layer_split_stages =
317 self.runtime_json_value("FERRUM_SELECTED_LAYER_SPLIT_STAGES");
318 let selected_layer_split_stage_count = selected_layer_split_stages
319 .as_ref()
320 .and_then(|value| value.as_array().map(|stages| stages.len()))
321 .or_else(|| {
322 selected_layer_split_plan
323 .as_ref()
324 .and_then(|_| selected_gpu_devices.as_ref().map(Vec::len))
325 });
326 let requested_pipeline_mode = self.runtime_entry_value("FERRUM_LAYER_SPLIT_PIPELINE_MODE");
327 let selected_pipeline_mode = if selected_layer_split_plan.is_some() {
328 requested_pipeline_mode.unwrap_or_else(|| {
329 if selected_layer_split_stage_count == Some(2) {
330 "overlapped".to_string()
331 } else {
332 "batch".to_string()
333 }
334 })
335 } else {
336 "sequential".to_string()
337 };
338 let selected_max_sequences = self.selected_usize("max_sequences");
339 let selected_microbatch_size = if selected_layer_split_plan.is_some() {
340 selected_max_sequences.map(|max_sequences| {
341 if selected_pipeline_mode == "overlapped" {
342 max_sequences.div_ceil(2).max(1)
343 } else {
344 max_sequences
345 }
346 })
347 } else {
348 Some(1)
349 };
350 let selected_stage_bridge = selected_layer_split_plan.as_ref().map(|_| "host");
351 let selected_max_model_len = self.selected_usize("max_model_len");
352 let selected_kv_capacity = self.runtime_usize("FERRUM_KV_CAPACITY");
353 let selected_max_batched_tokens = self.selected_usize("max_batched_tokens");
354 serde_json::json!({
355 "schema_version": 1,
356 "preset": self.preset,
357 "env_hash": self.runtime_env_hash(),
358 "backend": backend.clone(),
359 "requested_gpu_devices": requested_gpu_devices.clone(),
360 "selected_gpu_devices": selected_gpu_devices.clone(),
361 "cuda_device_count": cuda_device_count,
362 "selected_distributed_strategy": selected_distributed_strategy.clone(),
363 "selected_layer_split_plan": selected_layer_split_plan.clone(),
364 "selected_layer_split_stages": selected_layer_split_stages,
365 "selected_pipeline_mode": selected_pipeline_mode,
366 "selected_microbatch_size": selected_microbatch_size,
367 "selected_stage_bridge": selected_stage_bridge,
368 "selected_weight_placement": if selected_layer_split_plan.is_some() { "layer_split" } else { "single_device" },
369 "selected_kv_layout": if backend.eq_ignore_ascii_case("cpu") { "contiguous" } else { "paged" },
370 "selected_attention_impl": self.selected_string("attention_decode_backend"),
371 "selected_graph_mode": self.selected_string("moe_graph_policy"),
372 "selected_max_sequences": selected_max_sequences,
373 "selected_max_model_len": selected_max_model_len,
374 "selected_kv_capacity": selected_kv_capacity,
375 "selected_max_batched_tokens": selected_max_batched_tokens,
376 "selected_admission_limit": selected_max_sequences,
377 "entries": self.runtime_config.entries,
378 "model_capabilities": self.model_capabilities,
379 "hardware_capabilities": self.hardware_capabilities,
380 "workload_profile": self.workload_profile,
381 "admission": self.admission_summary_document(),
382 "decisions": self.decisions,
383 })
384 }
385
386 pub fn admission_summary_document(&self) -> serde_json::Value {
387 let max_sequences = self.selected_usize("max_sequences");
388 let kv_blocks = self.selected_usize("kv_block_count");
389 let max_batched_tokens = self.selected_usize("max_batched_tokens");
390 let max_model_len = self.selected_usize("max_model_len");
391 let kv_capacity_tokens =
392 kv_blocks.map(|blocks| blocks.saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS));
393 let kv_bytes_per_token = kv_cache_bytes_per_token_for_model(&self.model_capabilities);
394 let scheduler_policy = self
395 .selected_string("scheduler_admission_policy")
396 .unwrap_or_else(|| "unknown".to_string());
397 serde_json::json!({
398 "schema_version": 1,
399 "backend": self.hardware_capabilities.backend,
400 "model_architecture": self.model_capabilities.architecture,
401 "scheduler_policy": scheduler_policy,
402 "effective_max_concurrent": max_sequences,
403 "queue_depth": 0u64,
404 "active_prefill": 0u64,
405 "active_decode": 0u64,
406 "current_batch_size": 0u64,
407 "rejected_requests_total": 0u64,
408 "failed_requests_total": 0u64,
409 "completed_requests_total": 0u64,
410 "max_sequences": max_sequences,
411 "kv_block_count": kv_blocks,
412 "kv_block_size_tokens": DEFAULT_KV_BLOCK_SIZE_TOKENS,
413 "kv_capacity_tokens": kv_capacity_tokens,
414 "max_model_length": max_model_len,
415 "max_batched_tokens": max_batched_tokens,
416 "memory_estimate": {
417 "vram_bytes": self.hardware_capabilities.vram_bytes,
418 "estimated_weight_bytes": self.model_capabilities.estimated_weight_bytes,
419 "kv_bytes_per_token": kv_bytes_per_token,
420 "kv_capacity_bytes": match (kv_capacity_tokens, kv_bytes_per_token) {
421 (Some(tokens), Some(bytes_per_token)) => {
422 (tokens as u64).checked_mul(bytes_per_token)
423 }
424 _ => None,
425 },
426 },
427 })
428 }
429
430 pub fn decision_trace_jsonl(&self) -> Result<String, serde_json::Error> {
431 let mut out = String::new();
432 for decision in &self.decisions {
433 out.push_str(&serde_json::to_string(decision)?);
434 out.push('\n');
435 }
436 Ok(out)
437 }
438
439 pub fn runtime_env_hash(&self) -> String {
440 use sha2::{Digest, Sha256};
441
442 let bytes = serde_json::to_vec(&self.runtime_config.entries).unwrap_or_default();
443 let digest = Sha256::digest(bytes);
444 format!("sha256:{digest:x}")
445 }
446
447 fn selected_usize(&self, selection: &str) -> Option<usize> {
448 self.selected_string(selection)?.parse().ok()
449 }
450
451 fn selected_string(&self, selection: &str) -> Option<String> {
452 self.decisions
453 .iter()
454 .find(|decision| decision.selection == selection)
455 .map(|decision| decision.selected.clone())
456 }
457
458 fn runtime_entry_value(&self, key: &str) -> Option<String> {
459 self.runtime_config
460 .entries
461 .iter()
462 .find(|entry| entry.key == key)
463 .map(|entry| entry.effective_value.clone())
464 }
465
466 fn runtime_usize(&self, key: &str) -> Option<usize> {
467 self.runtime_entry_value(key)?.parse().ok()
468 }
469
470 fn runtime_csv_usize(&self, key: &str) -> Option<Vec<usize>> {
471 let raw = self.runtime_entry_value(key)?;
472 let mut out = Vec::new();
473 for part in raw.split(',') {
474 let value = part.trim();
475 if value.is_empty() {
476 return None;
477 }
478 out.push(value.parse().ok()?);
479 }
480 Some(out)
481 }
482
483 fn runtime_json_value(&self, key: &str) -> Option<serde_json::Value> {
484 serde_json::from_str(&self.runtime_entry_value(key)?).ok()
485 }
486}
487
/// One recorded auto-config decision.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AutoConfigDecision {
    /// Schema version of this record.
    pub schema_version: u32,
    /// Name of what was decided (e.g. `"kv_block_count"`, `"max_sequences"`).
    pub selection: String,
    /// The chosen value, as a string; numeric selections are parsed back from it.
    pub selected: String,
    /// Where the chosen value came from.
    pub source: AutoConfigSource,
    /// Key of the runtime config entry that supplied the value, if any.
    pub source_key: Option<String>,
    /// Candidate values — presumably those considered for this selection; confirm against
    /// the decision builders.
    pub candidates: Vec<String>,
    /// Candidates that were rejected, each with a reason.
    pub rejected: Vec<RejectedCandidate>,
    /// Runtime effects associated with this decision.
    pub affects: Vec<RuntimeConfigEffect>,
}
499
/// A candidate value that was not selected, with the reason.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RejectedCandidate {
    /// The rejected value.
    pub value: String,
    /// Why it was rejected.
    pub reason: String,
}
505
/// Origin of a resolved value; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AutoConfigSource {
    /// Built-in default.
    Default,
    /// Command-line argument.
    Cli,
    /// Configuration file.
    ConfigFile,
    /// Environment variable.
    Env,
    /// Script case — NOTE(review): exact meaning is not visible here; confirm.
    ScriptCase,
    /// Model metadata.
    ModelMetadata,
    /// Hardware capability (used when a hardware limit lowered a default).
    HardwareCapability,
    /// Memory profile.
    MemoryProfile,
    /// Workload preset.
    WorkloadPreset,
    /// Compiled feature.
    CompiledFeature,
}
520
/// Errors produced while resolving the configuration.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
pub enum AutoConfigError {
    /// The override stored under `key` is invalid (e.g. it failed to parse).
    #[error("{key}: invalid override: {reason}")]
    InvalidOverride { key: String, reason: String },
    /// The combination of settings behind `selection` is not supported.
    #[error("{selection}: unsupported combination: {reason}")]
    UnsupportedCombination { selection: String, reason: String },
}
528
/// Builder that combines a runtime config snapshot with model, hardware and workload
/// descriptions and resolves them into a [`ResolvedFerrumConfig`].
pub struct FerrumConfigBuilder {
    // Explicit overrides; an entry present here takes precedence over computed defaults.
    runtime_config: RuntimeConfigSnapshot,
    // Model description (starts as `ModelCapabilities::unknown()`).
    model: ModelCapabilities,
    // Hardware description (starts as `HardwareCapabilities::unknown()`).
    hardware: HardwareCapabilities,
    // Workload description (starts as `WorkloadProfile::default()`).
    workload: WorkloadProfile,
}
535
536impl FerrumConfigBuilder {
537 pub fn new(runtime_config: RuntimeConfigSnapshot) -> Self {
538 Self {
539 runtime_config,
540 model: ModelCapabilities::unknown(),
541 hardware: HardwareCapabilities::unknown(),
542 workload: WorkloadProfile::default(),
543 }
544 }
545
546 pub fn m3_qwen3_30b_a3b_int4(runtime_config: RuntimeConfigSnapshot) -> Self {
547 Self::new(runtime_config)
548 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
549 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
550 CompiledKernelFeatures::m3_fast_path_without_fa2(),
551 ))
552 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
553 }
554
555 pub fn with_model_capabilities(mut self, model: ModelCapabilities) -> Self {
556 self.model = model;
557 self
558 }
559
560 pub fn with_hardware_capabilities(mut self, hardware: HardwareCapabilities) -> Self {
561 self.hardware = hardware;
562 self
563 }
564
565 pub fn with_workload_profile(mut self, workload: WorkloadProfile) -> Self {
566 self.workload = workload;
567 self
568 }
569
    /// Resolves every setting, validates the combination and records the decisions.
    ///
    /// Each setting is read from the runtime config entry of the same key when one exists;
    /// otherwise a default computed from the model, hardware and workload is used.
    /// Validation runs only after all values have been resolved, and decisions are recorded
    /// only after validation has passed.
    ///
    /// # Errors
    ///
    /// Returns [`AutoConfigError::InvalidOverride`] when an entry fails to parse, and
    /// whatever error the `validate_*` helpers or `scheduler_decision` report.
    pub fn resolve(self) -> Result<ResolvedFerrumConfig, AutoConfigError> {
        let mut decisions = Vec::new();
        let cuda_backend = self.is_cuda_backend();
        // CUDA + MoE model whose quantization label mentions "gptq" or "int4".
        let cuda_gptq_moe = cuda_backend
            && self.model.moe.is_some()
            && self.model.quantization.as_deref().is_some_and(|q| {
                let q = q.to_ascii_lowercase();
                q.contains("gptq") || q.contains("int4")
            });
        // CUDA + MoE model with the "qwen3_moe" architecture.
        let cuda_qwen3_moe = cuda_backend
            && self.model.moe.is_some()
            && self.model.architecture.eq_ignore_ascii_case("qwen3_moe");
        // --- Attention settings ---
        let use_vllm_paged_attn = self.bool_value(
            "FERRUM_USE_VLLM_PAGED_ATTN",
            (self.workload.is_m3_preset() || cuda_qwen3_moe)
                && cuda_backend
                && self.hardware.compiled_features.vllm_paged_attn,
            AutoConfigSource::WorkloadPreset,
        )?;
        let fa_layout =
            self.bool_value("FERRUM_FA_LAYOUT_VARLEN", false, AutoConfigSource::Default)?;
        let fa2_source = self.bool_value("FERRUM_FA2_SOURCE", false, AutoConfigSource::Default)?;
        // The direct FFI path defaults to on exactly when a shim entry is present.
        let shim_present = self.raw("FERRUM_FA2_DIRECT_FFI_SHIM").is_some();
        let fa2_direct_ffi = self.bool_value(
            "FERRUM_FA2_DIRECT_FFI",
            shim_present,
            if shim_present {
                AutoConfigSource::Env
            } else {
                AutoConfigSource::Default
            },
        )?;
        // Short-context v1 follows the paged-attention choice by default.
        let vllm_v1_short = self.bool_value(
            "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
            use_vllm_paged_attn.value,
            AutoConfigSource::Default,
        )?;
        // --- MoE settings ---
        let vllm_moe = self.bool_value(
            "FERRUM_VLLM_MOE",
            (cuda_gptq_moe || (self.workload.is_m3_preset() && cuda_backend))
                && self.hardware.compiled_features.vllm_moe_marlin,
            AutoConfigSource::WorkloadPreset,
        )?;
        // Device routing and pair ids default to whatever was chosen for vLLM MoE.
        let device_route = self.bool_value(
            "FERRUM_MOE_DEVICE_ROUTE",
            vllm_moe.value,
            AutoConfigSource::WorkloadPreset,
        )?;
        let pair_ids = self.bool_value(
            "FERRUM_VLLM_MOE_PAIR_IDS",
            vllm_moe.value,
            AutoConfigSource::WorkloadPreset,
        )?;
        let graph = self.bool_value("FERRUM_MOE_GRAPH", false, AutoConfigSource::WorkloadPreset)?;
        // --- Sampling and caching ---
        let greedy = self.bool_value(
            "FERRUM_GREEDY_ARGMAX",
            self.workload.is_m3_preset()
                && cuda_backend
                && self.hardware.compiled_features.greedy_argmax,
            AutoConfigSource::WorkloadPreset,
        )?;
        let prefix_cache = self.bool_value(
            "FERRUM_PREFIX_CACHE",
            false,
            if self.workload.is_m3_preset() {
                AutoConfigSource::WorkloadPreset
            } else {
                AutoConfigSource::Default
            },
        )?;
        // --- Memory sizing: each default depends on the values resolved before it ---
        let default_max_sequences = self.default_max_sequences();
        let max_sequences = self.usize_value(
            "FERRUM_PAGED_MAX_SEQS",
            default_max_sequences.value,
            default_max_sequences.source,
        )?;
        let default_kv_blocks = self.default_kv_blocks(&max_sequences);
        let kv_blocks = self.usize_value(
            "FERRUM_KV_MAX_BLOCKS",
            default_kv_blocks.value,
            default_kv_blocks.source,
        )?;
        let default_max_batched_tokens =
            self.default_max_batched_tokens(&max_sequences, &kv_blocks);
        let max_batched_tokens = self.usize_value(
            "FERRUM_MAX_BATCHED_TOKENS",
            default_max_batched_tokens.value,
            default_max_batched_tokens.source,
        )?;
        // No default here: the value exists only when an entry is present.
        let max_model_len = self.optional_usize_value("FERRUM_MAX_MODEL_LEN")?;

        // --- Validation: the first failing check determines the returned error ---
        self.validate_attention(
            use_vllm_paged_attn.value,
            fa_layout.value,
            fa2_source.value,
            fa2_direct_ffi.value,
            shim_present,
            vllm_v1_short.value,
        )?;
        self.validate_moe(
            vllm_moe.value,
            device_route.value,
            pair_ids.value,
            graph.value,
        )?;
        self.validate_memory(
            kv_blocks.value,
            max_sequences.value,
            max_batched_tokens.value,
            max_model_len.as_ref().map(|value| value.value),
        )?;
        self.validate_dtypes()?;
        self.validate_layer_split_pipeline_mode()?;
        self.validate_sampling(greedy.value)?;

        // --- Decision recording ---
        decisions.push(self.attention_prefill_decision(
            use_vllm_paged_attn.clone(),
            fa_layout,
            fa2_source,
            fa2_direct_ffi,
        ));
        decisions.push(
            self.attention_decode_decision(use_vllm_paged_attn.clone(), vllm_v1_short.clone()),
        );
        // Write the resolved flags into a copy of the runtime config, as "1"/"0" with the
        // MemoryProfile source; flags whose source is Env are left untouched.
        let mut runtime_config = self.runtime_config.clone();
        for (key, resolved) in [
            ("FERRUM_USE_VLLM_PAGED_ATTN", &use_vllm_paged_attn),
            ("FERRUM_VLLM_PAGED_ATTN_V1_SHORT", &vllm_v1_short),
            ("FERRUM_VLLM_MOE", &vllm_moe),
            ("FERRUM_MOE_DEVICE_ROUTE", &device_route),
            ("FERRUM_VLLM_MOE_PAIR_IDS", &pair_ids),
        ] {
            if resolved.source != AutoConfigSource::Env {
                runtime_config.upsert(
                    key,
                    if resolved.value { "1" } else { "0" },
                    RuntimeConfigSource::MemoryProfile,
                );
            }
        }
        decisions.push(self.moe_decision(vllm_moe, device_route, pair_ids));
        decisions.push(self.graph_decision(graph));
        decisions.push(self.scalar_decision(
            "kv_block_count",
            kv_blocks,
            RuntimeConfigEffect::Memory,
        ));
        decisions.push(self.scalar_decision(
            "max_sequences",
            max_sequences,
            RuntimeConfigEffect::Memory,
        ));
        decisions.push(self.scalar_decision(
            "max_batched_tokens",
            max_batched_tokens,
            RuntimeConfigEffect::Performance,
        ));
        // Recorded only when an explicit max model length was supplied.
        if let Some(max_model_len) = max_model_len {
            decisions.push(self.scalar_decision(
                "max_model_len",
                max_model_len,
                RuntimeConfigEffect::Memory,
            ));
        }
        decisions.push(self.prefix_cache_decision(prefix_cache));
        decisions.push(self.scheduler_decision()?);
        decisions.push(self.sampling_decision(greedy));

        Ok(ResolvedFerrumConfig {
            schema_version: 1,
            preset: self.workload.preset.clone(),
            runtime_config,
            model_capabilities: self.model.clone(),
            hardware_capabilities: self.hardware.clone(),
            workload_profile: self.workload.clone(),
            decisions,
        })
    }
760
761 fn entries(&self) -> BTreeMap<&str, &str> {
762 self.runtime_config
763 .entries
764 .iter()
765 .map(|entry| (entry.key.as_str(), entry.effective_value.as_str()))
766 .collect()
767 }
768
769 fn raw(&self, key: &str) -> Option<&str> {
770 self.entry(key).map(|entry| entry.effective_value.as_str())
771 }
772
773 fn entry(&self, key: &str) -> Option<&RuntimeConfigEntry> {
774 self.runtime_config
775 .entries
776 .iter()
777 .find(|entry| entry.key == key)
778 }
779
780 fn source_for_key(&self, key: &str, default_source: AutoConfigSource) -> AutoConfigSource {
781 self.entry(key)
782 .map(|entry| auto_config_source_from_runtime(entry.source))
783 .unwrap_or(default_source)
784 }
785
786 fn is_cuda_backend(&self) -> bool {
787 self.hardware.backend.eq_ignore_ascii_case("cuda")
788 }
789
790 fn cuda_compute_capability_at_least(&self, major: u32, minor: u32) -> Option<bool> {
791 let (actual_major, actual_minor) =
792 parse_compute_capability(self.hardware.compute_capability.as_deref()?)?;
793 Some((actual_major, actual_minor) >= (major, minor))
794 }
795
796 fn default_max_sequences(&self) -> ResolvedValue<usize> {
797 let target = self.workload.target_concurrency.max(1);
798 let mut selected = target;
799 if self.workload.is_m3_preset() {
800 if let Some(sm_count) = self.hardware.sm_count {
801 selected = selected.min((sm_count as usize / 4).max(1));
805 }
806 if let Some(vram_bytes) = self.hardware.vram_bytes {
807 selected = selected.min(vram_default_max_sequences(vram_bytes));
808 }
809 }
810 ResolvedValue {
811 value: selected.max(1),
812 source: if selected < target {
813 AutoConfigSource::HardwareCapability
814 } else {
815 AutoConfigSource::WorkloadPreset
816 },
817 source_key: None,
818 }
819 }
820
821 fn default_max_batched_tokens(
822 &self,
823 max_sequences: &ResolvedValue<usize>,
824 kv_blocks: &ResolvedValue<usize>,
825 ) -> ResolvedValue<usize> {
826 let kv_token_capacity = kv_blocks
827 .value
828 .saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS)
829 .max(max_sequences.value.max(1));
830 let target = if self
831 .workload
832 .is_preset(QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET)
833 {
834 1536
835 } else {
836 max_sequences.value.max(1).saturating_mul(64)
837 };
838 let value = target
839 .min(kv_token_capacity)
840 .max(max_sequences.value.max(1));
841 ResolvedValue {
842 value,
843 source: if max_sequences.source == AutoConfigSource::HardwareCapability
844 || kv_blocks.source == AutoConfigSource::HardwareCapability
845 {
846 AutoConfigSource::HardwareCapability
847 } else {
848 AutoConfigSource::WorkloadPreset
849 },
850 source_key: None,
851 }
852 }
853
854 fn default_kv_blocks(&self, max_sequences: &ResolvedValue<usize>) -> ResolvedValue<usize> {
855 let min_blocks = ceil_div(max_sequences.value.max(1), DEFAULT_KV_BLOCK_SIZE_TOKENS);
856 if self
857 .workload
858 .is_preset(QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET)
859 {
860 return ResolvedValue {
861 value: 1024.max(min_blocks),
862 source: AutoConfigSource::WorkloadPreset,
863 source_key: None,
864 };
865 }
866 let target = DEFAULT_KV_BLOCKS.max(min_blocks);
867 let selected = match (
868 self.hardware.vram_bytes,
869 self.model.estimated_weight_bytes,
870 self.kv_cache_bytes_per_token(),
871 ) {
872 (Some(vram_bytes), Some(weight_bytes), Some(kv_bytes_per_token))
873 if kv_bytes_per_token > 0 =>
874 {
875 let headroom = (vram_bytes / 10).max(2 * GIB);
876 let available = vram_bytes.saturating_sub(weight_bytes.saturating_add(headroom));
877 let kv_token_budget = (available / kv_bytes_per_token) as usize;
878 let block_budget = kv_token_budget / DEFAULT_KV_BLOCK_SIZE_TOKENS;
879 target.min(block_budget.max(min_blocks))
880 }
881 _ => target,
882 };
883 ResolvedValue {
884 value: selected.max(1),
885 source: if selected < target {
886 AutoConfigSource::HardwareCapability
887 } else {
888 AutoConfigSource::WorkloadPreset
889 },
890 source_key: None,
891 }
892 }
893
894 fn kv_cache_bytes_per_token(&self) -> Option<u64> {
895 kv_cache_bytes_per_token_for_model(&self.model)
896 }
897
898 fn bool_value(
899 &self,
900 key: &str,
901 default: bool,
902 default_source: AutoConfigSource,
903 ) -> Result<ResolvedValue<bool>, AutoConfigError> {
904 match self.entry(key) {
905 Some(entry) => Ok(ResolvedValue {
906 value: parse_bool_env_value(&entry.effective_value).map_err(|reason| {
907 AutoConfigError::InvalidOverride {
908 key: key.to_string(),
909 reason,
910 }
911 })?,
912 source: auto_config_source_from_runtime(entry.source),
913 source_key: Some(key.to_string()),
914 }),
915 None => Ok(ResolvedValue {
916 value: default,
917 source: default_source,
918 source_key: None,
919 }),
920 }
921 }
922
923 fn usize_value(
924 &self,
925 key: &str,
926 default: usize,
927 default_source: AutoConfigSource,
928 ) -> Result<ResolvedValue<usize>, AutoConfigError> {
929 match self.entry(key) {
930 Some(entry) => Ok(ResolvedValue {
931 value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
932 AutoConfigError::InvalidOverride {
933 key: key.to_string(),
934 reason,
935 }
936 })?,
937 source: auto_config_source_from_runtime(entry.source),
938 source_key: Some(key.to_string()),
939 }),
940 None => Ok(ResolvedValue {
941 value: default,
942 source: default_source,
943 source_key: None,
944 }),
945 }
946 }
947
948 fn optional_usize_value(
949 &self,
950 key: &str,
951 ) -> Result<Option<ResolvedValue<usize>>, AutoConfigError> {
952 match self.entry(key) {
953 Some(entry) => Ok(Some(ResolvedValue {
954 value: parse_usize_env_value(&entry.effective_value).map_err(|reason| {
955 AutoConfigError::InvalidOverride {
956 key: key.to_string(),
957 reason,
958 }
959 })?,
960 source: auto_config_source_from_runtime(entry.source),
961 source_key: Some(key.to_string()),
962 })),
963 None => Ok(None),
964 }
965 }
966
967 fn validate_attention(
968 &self,
969 use_vllm_paged_attn: bool,
970 fa_layout: bool,
971 fa2_source: bool,
972 fa2_direct_ffi: bool,
973 shim_present: bool,
974 vllm_v1_short: bool,
975 ) -> Result<(), AutoConfigError> {
976 if use_vllm_paged_attn && !self.hardware.compiled_features.vllm_paged_attn {
977 return self.invalid(
978 "FERRUM_USE_VLLM_PAGED_ATTN",
979 "vLLM paged attention is not compiled",
980 );
981 }
982 if use_vllm_paged_attn && !self.is_cuda_backend() {
983 return self.invalid(
984 "FERRUM_USE_VLLM_PAGED_ATTN",
985 "vLLM paged attention requires CUDA backend",
986 );
987 }
988 if fa_layout && !use_vllm_paged_attn {
989 return self.invalid(
990 "FERRUM_FA_LAYOUT_VARLEN",
991 "FA layout requires vLLM paged attention layout",
992 );
993 }
994 if fa2_source && !self.hardware.compiled_features.fa2_source {
995 return self.invalid(
996 "FERRUM_FA2_SOURCE",
997 "source-linked FA2 support is not compiled",
998 );
999 }
1000 if fa2_source && !self.is_cuda_backend() {
1001 return self.invalid(
1002 "FERRUM_FA2_SOURCE",
1003 "source-linked FA2 requires CUDA backend",
1004 );
1005 }
1006 if fa2_source && !use_vllm_paged_attn {
1007 return self.invalid(
1008 "FERRUM_FA2_SOURCE",
1009 "source-linked FA2 requires vLLM paged attention layout",
1010 );
1011 }
1012 if fa2_source && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
1013 return self.invalid(
1014 "FERRUM_FA2_SOURCE",
1015 "source-linked FA2 requires CUDA compute capability >= 8.0",
1016 );
1017 }
1018 if fa2_direct_ffi && !self.hardware.compiled_features.fa2_direct_ffi {
1019 return self.invalid(
1020 "FERRUM_FA2_DIRECT_FFI",
1021 "direct FA2 FFI shim support is not compiled",
1022 );
1023 }
1024 if fa2_direct_ffi && !self.is_cuda_backend() {
1025 return self.invalid(
1026 "FERRUM_FA2_DIRECT_FFI",
1027 "direct FA2 FFI shim requires CUDA backend",
1028 );
1029 }
1030 if fa2_direct_ffi && self.cuda_compute_capability_at_least(8, 0) == Some(false) {
1031 return self.invalid(
1032 "FERRUM_FA2_DIRECT_FFI",
1033 "direct FA2 FFI shim requires CUDA compute capability >= 8.0",
1034 );
1035 }
1036 if fa2_direct_ffi && !shim_present {
1037 return self.invalid(
1038 "FERRUM_FA2_DIRECT_FFI",
1039 "requires FERRUM_FA2_DIRECT_FFI_SHIM",
1040 );
1041 }
1042 if fa2_source && fa2_direct_ffi {
1043 return self.unsupported(
1044 "attention_prefill_mixed_backend",
1045 "FA2 source and direct FFI shim cannot both own the prefill path",
1046 );
1047 }
1048 if vllm_v1_short && !use_vllm_paged_attn {
1049 return self.invalid(
1050 "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
1051 "short-context v1 requires vLLM paged attention",
1052 );
1053 }
1054 Ok(())
1055 }
1056
1057 fn validate_moe(
1058 &self,
1059 vllm_moe: bool,
1060 device_route: bool,
1061 pair_ids: bool,
1062 graph: bool,
1063 ) -> Result<(), AutoConfigError> {
1064 if vllm_moe && !self.hardware.compiled_features.vllm_moe_marlin {
1065 return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE is not compiled");
1066 }
1067 if vllm_moe && !self.is_cuda_backend() {
1068 return self.invalid("FERRUM_VLLM_MOE", "vLLM Marlin MoE requires CUDA backend");
1069 }
1070 if device_route && !vllm_moe {
1071 return self.invalid(
1072 "FERRUM_MOE_DEVICE_ROUTE",
1073 "device route currently requires vLLM MoE",
1074 );
1075 }
1076 if pair_ids && !vllm_moe {
1077 return self.invalid(
1078 "FERRUM_VLLM_MOE_PAIR_IDS",
1079 "pair-id routing requires vLLM MoE",
1080 );
1081 }
1082 let graph_relevant = self.model.moe.is_some() || self.workload.is_m3_preset();
1083 if graph && graph_relevant && !self.hardware.graph_support {
1084 return self.invalid(
1085 "FERRUM_MOE_GRAPH",
1086 "hardware/backend does not support CUDA graph replay",
1087 );
1088 }
1089 if graph && graph_relevant && !self.hardware.compiled_features.cuda_graph {
1090 return self.invalid("FERRUM_MOE_GRAPH", "CUDA graph support is not compiled");
1091 }
1092 if graph && graph_relevant && !vllm_moe {
1093 return self.invalid(
1094 "FERRUM_MOE_GRAPH",
1095 "graph decode requires the graph-clean vLLM MoE path",
1096 );
1097 }
1098 if graph && graph_relevant && self.model.moe.is_some() && !self.model.graph_safe_moe {
1099 return self.unsupported(
1100 "moe_graph_policy",
1101 "model MoE path is not marked graph-safe",
1102 );
1103 }
1104 Ok(())
1105 }
1106
1107 fn validate_sampling(&self, greedy: bool) -> Result<(), AutoConfigError> {
1108 if greedy && !self.hardware.compiled_features.greedy_argmax {
1109 return self.invalid("FERRUM_GREEDY_ARGMAX", "GPU argmax is not compiled");
1110 }
1111 if greedy
1112 && !(self.is_cuda_backend() || self.hardware.backend.eq_ignore_ascii_case("metal"))
1113 {
1114 return self.invalid(
1115 "FERRUM_GREEDY_ARGMAX",
1116 "greedy argmax requires CUDA or Metal backend",
1117 );
1118 }
1119 Ok(())
1120 }
1121
1122 fn validate_memory(
1123 &self,
1124 kv_blocks: usize,
1125 max_sequences: usize,
1126 max_batched_tokens: usize,
1127 requested_max_model_len: Option<usize>,
1128 ) -> Result<(), AutoConfigError> {
1129 if kv_blocks == 0 {
1130 return self.invalid("FERRUM_KV_MAX_BLOCKS", "must be greater than zero");
1131 }
1132 if max_sequences == 0 {
1133 return self.invalid("FERRUM_PAGED_MAX_SEQS", "must be greater than zero");
1134 }
1135 if max_batched_tokens < max_sequences {
1136 return self.invalid(
1137 "FERRUM_MAX_BATCHED_TOKENS",
1138 "must be at least FERRUM_PAGED_MAX_SEQS",
1139 );
1140 }
1141 let kv_token_capacity = kv_blocks.saturating_mul(DEFAULT_KV_BLOCK_SIZE_TOKENS);
1142 if max_batched_tokens > kv_token_capacity {
1143 return self.invalid(
1144 "FERRUM_MAX_BATCHED_TOKENS",
1145 "exceeds KV cache token capacity",
1146 );
1147 }
1148 if let Some(max_model_len) = requested_max_model_len {
1149 if max_model_len == 0 {
1150 return self.invalid("FERRUM_MAX_MODEL_LEN", "must be greater than zero");
1151 }
1152 if let Some(model_max) = self.model.max_context_len {
1153 if max_model_len > model_max {
1154 return self.invalid(
1155 "FERRUM_MAX_MODEL_LEN",
1156 "exceeds model metadata max context length",
1157 );
1158 }
1159 }
1160 if max_model_len > kv_token_capacity {
1161 return self.invalid(
1162 "FERRUM_KV_MAX_BLOCKS",
1163 "KV cache token capacity is smaller than FERRUM_MAX_MODEL_LEN",
1164 );
1165 }
1166 }
1167 Ok(())
1168 }
1169
1170 fn validate_dtypes(&self) -> Result<(), AutoConfigError> {
1171 if let Some(dtype) = self.raw("FERRUM_DTYPE") {
1172 let dtype = dtype.to_ascii_lowercase();
1173 if !self.hardware.supported_dtypes.iter().any(|d| d == &dtype) {
1174 return self.invalid("FERRUM_DTYPE", "dtype is not supported by hardware profile");
1175 }
1176 }
1177 if let Some(dtype) = self.raw("FERRUM_KV_DTYPE") {
1178 let dtype = dtype.to_ascii_lowercase();
1179 if !self
1180 .hardware
1181 .supported_kv_dtypes
1182 .iter()
1183 .any(|d| d == &dtype)
1184 {
1185 return self.invalid(
1186 "FERRUM_KV_DTYPE",
1187 "KV dtype is not supported by hardware profile",
1188 );
1189 }
1190 }
1191 Ok(())
1192 }
1193
1194 fn validate_layer_split_pipeline_mode(&self) -> Result<(), AutoConfigError> {
1195 let Some(mode) = self.raw("FERRUM_LAYER_SPLIT_PIPELINE_MODE") else {
1196 return Ok(());
1197 };
1198 match mode.trim().to_ascii_lowercase().as_str() {
1199 "batch" | "overlapped" => Ok(()),
1200 _ => self.invalid(
1201 "FERRUM_LAYER_SPLIT_PIPELINE_MODE",
1202 "must be batch or overlapped",
1203 ),
1204 }
1205 }
1206
1207 fn attention_prefill_decision(
1208 &self,
1209 use_vllm_paged_attn: ResolvedValue<bool>,
1210 fa_layout: ResolvedValue<bool>,
1211 fa2_source: ResolvedValue<bool>,
1212 fa2_direct_ffi: ResolvedValue<bool>,
1213 ) -> AutoConfigDecision {
1214 let (selected, source, source_key) = if fa2_source.value {
1215 ("fa2_source", fa2_source.source, fa2_source.source_key)
1216 } else if fa2_direct_ffi.value {
1217 (
1218 "fa2_direct_ffi",
1219 fa2_direct_ffi.source,
1220 fa2_direct_ffi.source_key,
1221 )
1222 } else if fa_layout.value {
1223 ("fa_layout_varlen", fa_layout.source, fa_layout.source_key)
1224 } else if use_vllm_paged_attn.value {
1225 (
1226 "vllm_paged_varlen",
1227 use_vllm_paged_attn.source,
1228 use_vllm_paged_attn.source_key,
1229 )
1230 } else {
1231 ("legacy_paged_varlen", AutoConfigSource::Default, None)
1232 };
1233 self.decision(
1234 "attention_prefill_mixed_backend",
1235 selected,
1236 source,
1237 source_key,
1238 [
1239 "fa2_source",
1240 "fa2_direct_ffi",
1241 "fa_layout_varlen",
1242 "vllm_paged_varlen",
1243 "legacy_paged_varlen",
1244 ],
1245 self.rejected_except(
1246 selected,
1247 [
1248 ("fa2_source", "source-linked FA2 path not selected"),
1249 ("fa2_direct_ffi", "diagnostic direct FFI shim not selected"),
1250 ("fa_layout_varlen", "FA-compatible layout not selected"),
1251 ("vllm_paged_varlen", "vLLM paged varlen bridge not selected"),
1252 (
1253 "legacy_paged_varlen",
1254 "a higher-priority attention path was selected",
1255 ),
1256 ],
1257 ),
1258 vec![
1259 RuntimeConfigEffect::Performance,
1260 RuntimeConfigEffect::Memory,
1261 ],
1262 )
1263 }
1264
1265 fn attention_decode_decision(
1266 &self,
1267 use_vllm_paged_attn: ResolvedValue<bool>,
1268 vllm_v1_short: ResolvedValue<bool>,
1269 ) -> AutoConfigDecision {
1270 let (selected, source, source_key) = if use_vllm_paged_attn.value {
1271 if vllm_v1_short.value {
1272 (
1273 "vllm_paged_attn_v1_short",
1274 vllm_v1_short.source,
1275 vllm_v1_short.source_key,
1276 )
1277 } else {
1278 (
1279 "vllm_paged_attn_v2",
1280 vllm_v1_short.source,
1281 vllm_v1_short.source_key,
1282 )
1283 }
1284 } else {
1285 ("legacy_paged_decode", use_vllm_paged_attn.source, None)
1286 };
1287 self.decision(
1288 "attention_decode_backend",
1289 selected,
1290 source,
1291 source_key,
1292 [
1293 "vllm_paged_attn_v1_short",
1294 "vllm_paged_attn_v2",
1295 "legacy_paged_decode",
1296 ],
1297 self.rejected_except(
1298 selected,
1299 [
1300 (
1301 "vllm_paged_attn_v1_short",
1302 "short-context v1 decode not selected",
1303 ),
1304 ("vllm_paged_attn_v2", "v2 decode not selected"),
1305 ("legacy_paged_decode", "legacy decode not selected"),
1306 ],
1307 ),
1308 vec![RuntimeConfigEffect::Performance],
1309 )
1310 }
1311
1312 fn moe_decision(
1313 &self,
1314 vllm_moe: ResolvedValue<bool>,
1315 device_route: ResolvedValue<bool>,
1316 pair_ids: ResolvedValue<bool>,
1317 ) -> AutoConfigDecision {
1318 let selected = if vllm_moe.value && device_route.value && pair_ids.value {
1319 "vllm_marlin_moe_device_route_pair_ids"
1320 } else if vllm_moe.value && device_route.value {
1321 "vllm_marlin_moe_device_route"
1322 } else if vllm_moe.value {
1323 "vllm_marlin_moe"
1324 } else {
1325 "legacy_moe"
1326 };
1327 self.decision(
1328 "moe_implementation",
1329 selected,
1330 vllm_moe.source,
1331 vllm_moe.source_key,
1332 [
1333 "vllm_marlin_moe_device_route_pair_ids",
1334 "vllm_marlin_moe_device_route",
1335 "vllm_marlin_moe",
1336 "legacy_moe",
1337 ],
1338 self.rejected_except(
1339 selected,
1340 [
1341 (
1342 "vllm_marlin_moe_device_route_pair_ids",
1343 "pair-id device route not selected",
1344 ),
1345 (
1346 "vllm_marlin_moe_device_route",
1347 "device-route MoE not selected",
1348 ),
1349 ("vllm_marlin_moe", "vLLM Marlin MoE not selected"),
1350 ("legacy_moe", "legacy MoE not selected"),
1351 ],
1352 ),
1353 vec![RuntimeConfigEffect::Performance],
1354 )
1355 }
1356
1357 fn graph_decision(&self, graph: ResolvedValue<bool>) -> AutoConfigDecision {
1358 let selected = if graph.value {
1359 "graph_clean_decode"
1360 } else {
1361 "graph_disabled"
1362 };
1363 self.decision(
1364 "moe_graph_policy",
1365 selected,
1366 graph.source,
1367 graph.source_key,
1368 ["graph_clean_decode", "graph_disabled"],
1369 self.rejected_except(
1370 selected,
1371 [
1372 ("graph_clean_decode", "graph decode not selected"),
1373 ("graph_disabled", "graph decode selected"),
1374 ],
1375 ),
1376 vec![
1377 RuntimeConfigEffect::Performance,
1378 RuntimeConfigEffect::Correctness,
1379 ],
1380 )
1381 }
1382
1383 fn scalar_decision(
1384 &self,
1385 selection: &str,
1386 value: ResolvedValue<usize>,
1387 effect: RuntimeConfigEffect,
1388 ) -> AutoConfigDecision {
1389 self.decision(
1390 selection,
1391 &value.value.to_string(),
1392 value.source,
1393 value.source_key,
1394 [value.value.to_string()],
1395 Vec::new(),
1396 vec![effect],
1397 )
1398 }
1399
1400 fn scheduler_decision(&self) -> Result<AutoConfigDecision, AutoConfigError> {
1401 let entries = self.entries();
1402 let mut selected = "continuous_default".to_string();
1403 let mut source_key = None;
1404 if let Some(chunk) = entries.get("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK") {
1405 parse_usize_env_value(chunk).map_err(|reason| AutoConfigError::InvalidOverride {
1406 key: "FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string(),
1407 reason,
1408 })?;
1409 selected = format!("active_decode_prefill_chunk:{chunk}");
1410 source_key = Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK".to_string());
1411 } else if let Some(until) = entries.get("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE") {
1412 parse_usize_env_value(until).map_err(|reason| AutoConfigError::InvalidOverride {
1413 key: "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string(),
1414 reason,
1415 })?;
1416 selected = format!("prefill_first_until_active:{until}");
1417 source_key = Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE".to_string());
1418 } else if self
1419 .bool_value(
1420 "FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE",
1421 false,
1422 AutoConfigSource::Default,
1423 )?
1424 .value
1425 {
1426 selected = "prompt_token_estimate".to_string();
1427 source_key = Some("FERRUM_SCHED_PROMPT_TOKEN_ESTIMATE".to_string());
1428 }
1429 self.unsupported_if(
1430 source_key.as_deref() == Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
1431 && selected.ends_with(":0"),
1432 "scheduler_admission_policy",
1433 "active decode prefill chunk must be greater than zero",
1434 )?;
1435 Ok(self.decision(
1436 "scheduler_admission_policy",
1437 &selected,
1438 source_key
1439 .as_deref()
1440 .map(|key| self.source_for_key(key, AutoConfigSource::Default))
1441 .unwrap_or(AutoConfigSource::Default),
1442 source_key,
1443 [
1444 "continuous_default",
1445 "prompt_token_estimate",
1446 "prefill_first_until_active",
1447 "active_decode_prefill_chunk",
1448 ],
1449 Vec::new(),
1450 vec![RuntimeConfigEffect::Performance],
1451 ))
1452 }
1453
1454 fn prefix_cache_decision(&self, prefix_cache: ResolvedValue<bool>) -> AutoConfigDecision {
1455 let selected = if prefix_cache.value {
1456 "prefix_cache_enabled"
1457 } else {
1458 "prefix_cache_disabled"
1459 };
1460 self.decision(
1461 "prefix_cache_policy",
1462 selected,
1463 prefix_cache.source,
1464 prefix_cache.source_key,
1465 ["prefix_cache_enabled", "prefix_cache_disabled"],
1466 self.rejected_except(
1467 selected,
1468 [
1469 ("prefix_cache_enabled", "prefix cache not selected"),
1470 ("prefix_cache_disabled", "prefix cache enabled"),
1471 ],
1472 ),
1473 vec![
1474 RuntimeConfigEffect::Correctness,
1475 RuntimeConfigEffect::Performance,
1476 RuntimeConfigEffect::Memory,
1477 ],
1478 )
1479 }
1480
1481 fn sampling_decision(&self, greedy: ResolvedValue<bool>) -> AutoConfigDecision {
1482 let selected = if greedy.value {
1483 "gpu_greedy_argmax"
1484 } else {
1485 "logits_readback"
1486 };
1487 self.decision(
1488 "sampling_readback_path",
1489 selected,
1490 greedy.source,
1491 greedy.source_key,
1492 ["gpu_greedy_argmax", "logits_readback"],
1493 self.rejected_except(
1494 selected,
1495 [
1496 ("gpu_greedy_argmax", "GPU argmax not selected"),
1497 ("logits_readback", "logits readback not selected"),
1498 ],
1499 ),
1500 vec![
1501 RuntimeConfigEffect::Performance,
1502 RuntimeConfigEffect::Correctness,
1503 ],
1504 )
1505 }
1506
1507 fn decision<I, C>(
1508 &self,
1509 selection: &str,
1510 selected: &str,
1511 source: AutoConfigSource,
1512 source_key: Option<String>,
1513 candidates: I,
1514 rejected: Vec<RejectedCandidate>,
1515 affects: Vec<RuntimeConfigEffect>,
1516 ) -> AutoConfigDecision
1517 where
1518 I: IntoIterator<Item = C>,
1519 C: Into<String>,
1520 {
1521 AutoConfigDecision {
1522 schema_version: 1,
1523 selection: selection.to_string(),
1524 selected: selected.to_string(),
1525 source,
1526 source_key,
1527 candidates: candidates.into_iter().map(Into::into).collect(),
1528 rejected,
1529 affects,
1530 }
1531 }
1532
1533 fn rejected_except<I>(&self, selected: &str, candidates: I) -> Vec<RejectedCandidate>
1534 where
1535 I: IntoIterator<Item = (&'static str, &'static str)>,
1536 {
1537 candidates
1538 .into_iter()
1539 .filter(|(value, _)| *value != selected)
1540 .map(|(value, reason)| RejectedCandidate {
1541 value: value.to_string(),
1542 reason: reason.to_string(),
1543 })
1544 .collect()
1545 }
1546
1547 fn invalid<T>(&self, key: &str, reason: &str) -> Result<T, AutoConfigError> {
1548 Err(AutoConfigError::InvalidOverride {
1549 key: key.to_string(),
1550 reason: reason.to_string(),
1551 })
1552 }
1553
1554 fn unsupported<T>(&self, selection: &str, reason: &str) -> Result<T, AutoConfigError> {
1555 Err(AutoConfigError::UnsupportedCombination {
1556 selection: selection.to_string(),
1557 reason: reason.to_string(),
1558 })
1559 }
1560
1561 fn unsupported_if(
1562 &self,
1563 condition: bool,
1564 selection: &str,
1565 reason: &str,
1566 ) -> Result<(), AutoConfigError> {
1567 if condition {
1568 self.unsupported(selection, reason)
1569 } else {
1570 Ok(())
1571 }
1572 }
1573}
1574
1575fn kv_cache_bytes_per_token_for_model(model: &ModelCapabilities) -> Option<u64> {
1576 let layers = model.num_hidden_layers? as u64;
1577 let kv_heads = model.kv_heads? as u64;
1578 let head_dim = model.head_dim? as u64;
1579 layers
1580 .checked_mul(2)?
1581 .checked_mul(kv_heads)?
1582 .checked_mul(head_dim)?
1583 .checked_mul(2)
1584}
1585
/// A resolved configuration value together with its provenance.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ResolvedValue<T> {
    /// The effective value.
    value: T,
    /// The configuration layer that supplied the value.
    source: AutoConfigSource,
    /// The runtime-config key associated with the value, when one is recorded.
    source_key: Option<String>,
}
1592
/// Parses a compute capability written as `major.minor` or just `major`.
///
/// Surrounding whitespace is ignored, both around the whole string and around
/// each component. A missing minor component defaults to `0`. Returns `None`
/// for empty input or when either component is not a valid `u32`.
fn parse_compute_capability(value: &str) -> Option<(u32, u32)> {
    let trimmed = value.trim();
    if trimmed.is_empty() {
        return None;
    }
    let (major_text, minor_text) = match trimmed.split_once('.') {
        Some(parts) => parts,
        None => (trimmed, "0"),
    };
    let major = major_text.trim().parse::<u32>().ok()?;
    let minor = minor_text.trim().parse::<u32>().ok()?;
    Some((major, minor))
}
1601
1602fn vram_default_max_sequences(vram_bytes: u64) -> usize {
1603 match vram_bytes {
1604 bytes if bytes >= 20 * GIB => 32,
1605 bytes if bytes >= 12 * GIB => 16,
1606 bytes if bytes >= 8 * GIB => 8,
1607 _ => 4,
1608 }
1609}
1610
/// Returns the default device list `[0]` when `backend` is `cuda` (compared
/// ASCII case-insensitively), and `None` for every other backend name.
fn default_gpu_devices_for_backend(backend: &str) -> Option<Vec<usize>> {
    if backend.eq_ignore_ascii_case("cuda") {
        Some(vec![0])
    } else {
        None
    }
}
1614
/// Divides `value` by `divisor`, rounding the quotient up.
///
/// # Panics
///
/// Panics when `divisor` is zero.
fn ceil_div(value: usize, divisor: usize) -> usize {
    usize::div_ceil(value, divisor)
}
1618
1619fn auto_config_source_from_runtime(source: RuntimeConfigSource) -> AutoConfigSource {
1620 match source {
1621 RuntimeConfigSource::Default => AutoConfigSource::Default,
1622 RuntimeConfigSource::ConfigFile => AutoConfigSource::ConfigFile,
1623 RuntimeConfigSource::Cli => AutoConfigSource::Cli,
1624 RuntimeConfigSource::Env => AutoConfigSource::Env,
1625 RuntimeConfigSource::ScriptCase => AutoConfigSource::ScriptCase,
1626 RuntimeConfigSource::MemoryProfile => AutoConfigSource::MemoryProfile,
1627 }
1628}
1629
1630#[cfg(test)]
1631mod tests {
1632 use super::*;
1633
1634 fn snapshot(vars: &[(&str, &str)]) -> RuntimeConfigSnapshot {
1635 RuntimeConfigSnapshot::from_env_vars(vars.iter().copied())
1636 }
1637
1638 fn snapshot_with_sources(vars: &[(&str, &str, RuntimeConfigSource)]) -> RuntimeConfigSnapshot {
1639 let mut entries: Vec<_> = vars
1640 .iter()
1641 .map(|(key, effective_value, source)| RuntimeConfigEntry {
1642 key: (*key).to_string(),
1643 effective_value: (*effective_value).to_string(),
1644 source: *source,
1645 affects: vec![RuntimeConfigEffect::Performance],
1646 })
1647 .collect();
1648 entries.sort_by(|a, b| a.key.cmp(&b.key));
1649 RuntimeConfigSnapshot { entries }
1650 }
1651
1652 fn m3(vars: &[(&str, &str)], features: CompiledKernelFeatures) -> FerrumConfigBuilder {
1653 FerrumConfigBuilder::new(snapshot(vars))
1654 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1655 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(features))
1656 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1657 }
1658
1659 fn m3_with_hardware(
1660 vars: &[(&str, &str)],
1661 hardware: HardwareCapabilities,
1662 ) -> FerrumConfigBuilder {
1663 FerrumConfigBuilder::new(snapshot(vars))
1664 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1665 .with_hardware_capabilities(hardware)
1666 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
1667 }
1668
1669 fn qwen25_layer_split_runtime_entries(source: RuntimeConfigSource) -> RuntimeConfigSnapshot {
1670 snapshot_with_sources(&[
1671 ("FERRUM_REQUESTED_GPU_DEVICES", "0,1", source),
1672 ("FERRUM_SELECTED_GPU_DEVICES", "0,1", source),
1673 ("FERRUM_CUDA_DEVICE_COUNT", "2", source),
1674 (
1675 "FERRUM_SELECTED_DISTRIBUTED_STRATEGY",
1676 "layer_split",
1677 source,
1678 ),
1679 (
1680 "FERRUM_SELECTED_LAYER_SPLIT_PLAN",
1681 "stage0:cuda:0:layers=0-39;stage1:cuda:1:layers=40-79",
1682 source,
1683 ),
1684 ("FERRUM_LAYER_SPLIT_PIPELINE_MODE", "batch", source),
1685 ("FERRUM_MAX_MODEL_LEN", "4096", source),
1686 ("FERRUM_KV_MAX_BLOCKS", "1024", source),
1687 ("FERRUM_KV_CAPACITY", "1024", source),
1688 ("FERRUM_PAGED_MAX_SEQS", "16", source),
1689 ("FERRUM_MAX_BATCHED_TOKENS", "1536", source),
1690 ("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE", "16", source),
1691 ])
1692 }
1693
1694 fn expect_invalid_key(vars: &[(&str, &str)], key: &str) {
1695 expect_invalid_key_with_features(
1696 vars,
1697 key,
1698 CompiledKernelFeatures::m3_fast_path_without_fa2(),
1699 );
1700 }
1701
1702 fn expect_invalid_key_with_features(
1703 vars: &[(&str, &str)],
1704 key: &str,
1705 features: CompiledKernelFeatures,
1706 ) {
1707 expect_invalid_key_with_hardware(vars, key, HardwareCapabilities::rtx4090_cuda(features));
1708 }
1709
1710 fn expect_invalid_key_with_hardware(
1711 vars: &[(&str, &str)],
1712 key: &str,
1713 hardware: HardwareCapabilities,
1714 ) {
1715 let err = m3_with_hardware(vars, hardware)
1716 .resolve()
1717 .expect_err("override should fail");
1718 match err {
1719 AutoConfigError::InvalidOverride { key: actual, .. } => assert_eq!(actual, key),
1720 other => panic!("expected invalid override for {key}, got {other:?}"),
1721 }
1722 }
1723
1724 fn cpu_hardware_with_features(features: CompiledKernelFeatures) -> HardwareCapabilities {
1725 HardwareCapabilities {
1726 backend: "cpu".to_string(),
1727 supported_dtypes: vec!["fp32".to_string()],
1728 supported_kv_dtypes: vec!["fp16".to_string()],
1729 compiled_features: features,
1730 ..HardwareCapabilities::unknown()
1731 }
1732 }
1733
1734 #[test]
1735 fn m3_preset_selects_current_safe_fast_path_without_fa2() {
1736 let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
1737 .resolve()
1738 .unwrap();
1739 let decisions: BTreeMap<_, _> = resolved
1740 .decisions
1741 .iter()
1742 .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1743 .collect();
1744 assert_eq!(
1745 decisions["attention_prefill_mixed_backend"],
1746 "vllm_paged_varlen"
1747 );
1748 assert_eq!(
1749 decisions["attention_decode_backend"],
1750 "vllm_paged_attn_v1_short"
1751 );
1752 assert_eq!(
1753 decisions["moe_implementation"],
1754 "vllm_marlin_moe_device_route_pair_ids"
1755 );
1756 assert_eq!(decisions["moe_graph_policy"], "graph_disabled");
1757 assert_eq!(decisions["prefix_cache_policy"], "prefix_cache_disabled");
1758 assert_eq!(decisions["sampling_readback_path"], "gpu_greedy_argmax");
1759 assert_eq!(
1760 resolved.preset.as_deref(),
1761 Some(M3_QWEN3_30B_A3B_INT4_PRESET)
1762 );
1763 }
1764
1765 #[test]
1766 fn cuda_gptq_moe_enables_vllm_marlin_without_m3_preset() {
1767 let hardware =
1773 HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1774 let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
1775 let resolved = FerrumConfigBuilder::new(snapshot(&[]))
1776 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1777 .with_hardware_capabilities(hardware)
1778 .with_workload_profile(workload)
1779 .resolve()
1780 .unwrap();
1781 let decisions: BTreeMap<_, _> = resolved
1782 .decisions
1783 .iter()
1784 .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1785 .collect();
1786 assert_ne!(
1787 resolved.preset.as_deref(),
1788 Some(M3_QWEN3_30B_A3B_INT4_PRESET),
1789 "serving-default workload must not be the m3 preset"
1790 );
1791 assert_eq!(
1792 decisions["moe_implementation"], "vllm_marlin_moe_device_route_pair_ids",
1793 "CUDA GPTQ MoE should get the fast vLLM-Marlin path without the m3 preset"
1794 );
1795 let entry = resolved
1799 .runtime_config
1800 .entries
1801 .iter()
1802 .find(|e| e.key == "FERRUM_VLLM_MOE");
1803 assert_eq!(
1804 entry.map(|e| e.effective_value.as_str()),
1805 Some("1"),
1806 "resolved FERRUM_VLLM_MOE must be materialized into the effective config"
1807 );
1808 }
1809
1810 #[test]
1811 fn cuda_qwen3_moe_enables_vllm_paged_attn_without_m3_preset() {
1812 let hardware =
1818 HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1819 let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
1820 let resolved = FerrumConfigBuilder::new(snapshot(&[]))
1821 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1822 .with_hardware_capabilities(hardware)
1823 .with_workload_profile(workload)
1824 .resolve()
1825 .unwrap();
1826 let decisions: BTreeMap<_, _> = resolved
1827 .decisions
1828 .iter()
1829 .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1830 .collect();
1831 assert_eq!(
1832 decisions["attention_decode_backend"], "vllm_paged_attn_v1_short",
1833 "CUDA Qwen3-MoE should get VPA decode without the m3 preset"
1834 );
1835 let entry = |key: &str| {
1836 resolved
1837 .runtime_config
1838 .entries
1839 .iter()
1840 .find(|entry| entry.key == key)
1841 .unwrap_or_else(|| panic!("missing runtime config entry {key}"))
1842 };
1843 assert_eq!(entry("FERRUM_USE_VLLM_PAGED_ATTN").effective_value, "1");
1844 assert_eq!(
1845 entry("FERRUM_VLLM_PAGED_ATTN_V1_SHORT").effective_value,
1846 "1"
1847 );
1848 }
1849
1850 #[test]
1851 fn cuda_qwen3_moe_vllm_paged_attn_env_opt_out_is_materialized() {
1852 let hardware =
1853 HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
1854 let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
1855 let resolved = FerrumConfigBuilder::new(snapshot(&[("FERRUM_USE_VLLM_PAGED_ATTN", "0")]))
1856 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
1857 .with_hardware_capabilities(hardware)
1858 .with_workload_profile(workload)
1859 .resolve()
1860 .unwrap();
1861 let decisions: BTreeMap<_, _> = resolved
1862 .decisions
1863 .iter()
1864 .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1865 .collect();
1866 assert_eq!(decisions["attention_decode_backend"], "legacy_paged_decode");
1867 let entry = resolved
1868 .runtime_config
1869 .entries
1870 .iter()
1871 .find(|entry| entry.key == "FERRUM_USE_VLLM_PAGED_ATTN")
1872 .expect("env opt-out should stay in effective config");
1873 assert_eq!(entry.effective_value, "0");
1874 assert_eq!(entry.source, RuntimeConfigSource::Env);
1875 }
1876
1877 #[test]
1878 fn qwen25_72b_layer_split_preset_selects_batch_tuned_defaults() {
1879 let resolved = FerrumConfigBuilder::new(qwen25_layer_split_runtime_entries(
1880 RuntimeConfigSource::Default,
1881 ))
1882 .with_model_capabilities(ModelCapabilities::qwen25_72b_gptq_int4())
1883 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
1884 CompiledKernelFeatures::m3_fast_path_without_fa2(),
1885 ))
1886 .with_workload_profile(WorkloadProfile::qwen25_72b_gptq_int4_2x4090_layer_split())
1887 .resolve()
1888 .unwrap();
1889 let decision = |selection: &str| {
1890 resolved
1891 .decisions
1892 .iter()
1893 .find(|decision| decision.selection == selection)
1894 .unwrap_or_else(|| panic!("missing decision {selection}"))
1895 };
1896
1897 assert_eq!(
1898 resolved.preset.as_deref(),
1899 Some(QWEN25_72B_GPTQ_INT4_2X4090_LAYER_SPLIT_PRESET)
1900 );
1901 assert_eq!(decision("kv_block_count").selected, "1024");
1902 assert_eq!(decision("max_sequences").selected, "16");
1903 assert_eq!(decision("max_batched_tokens").selected, "1536");
1904 assert_eq!(decision("max_model_len").selected, "4096");
1905 assert_eq!(
1906 decision("scheduler_admission_policy").selected,
1907 "prefill_first_until_active:16"
1908 );
1909 assert_eq!(
1910 decision("scheduler_admission_policy").source,
1911 AutoConfigSource::Default
1912 );
1913
1914 let doc = resolved.effective_config_document();
1915 assert_eq!(doc["selected_pipeline_mode"], "batch");
1916 assert_eq!(doc["selected_microbatch_size"], 16);
1917 assert_eq!(doc["selected_kv_capacity"], 1024);
1918 }
1919
1920 #[test]
1921 fn source_fa2_selects_source_linked_attention_when_compiled() {
1922 let resolved = m3(
1923 &[("FERRUM_FA2_SOURCE", "1")],
1924 CompiledKernelFeatures::m3_fast_path_with_source_fa2(),
1925 )
1926 .resolve()
1927 .unwrap();
1928 let decisions: BTreeMap<_, _> = resolved
1929 .decisions
1930 .iter()
1931 .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1932 .collect();
1933
1934 assert_eq!(decisions["attention_prefill_mixed_backend"], "fa2_source");
1935 }
1936
1937 #[test]
1938 fn source_fa2_is_rejected_when_not_compiled() {
1939 expect_invalid_key(&[("FERRUM_FA2_SOURCE", "1")], "FERRUM_FA2_SOURCE");
1940 }
1941
1942 #[test]
1943 fn hardware_capabilities_keep_m3_preset_on_compatible_backend_paths() {
1944 let resolved = m3_with_hardware(
1945 &[],
1946 cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2()),
1947 )
1948 .resolve()
1949 .unwrap();
1950 let decisions: BTreeMap<_, _> = resolved
1951 .decisions
1952 .iter()
1953 .map(|decision| (decision.selection.as_str(), decision.selected.as_str()))
1954 .collect();
1955
1956 assert_eq!(
1957 decisions["attention_prefill_mixed_backend"],
1958 "legacy_paged_varlen"
1959 );
1960 assert_eq!(decisions["attention_decode_backend"], "legacy_paged_decode");
1961 assert_eq!(decisions["moe_implementation"], "legacy_moe");
1962 assert_eq!(decisions["moe_graph_policy"], "graph_disabled");
1963 assert_eq!(decisions["sampling_readback_path"], "logits_readback");
1964 }
1965
1966 #[test]
1967 fn effective_config_document_records_cuda_gpu_device_selection() {
1968 let resolved = FerrumConfigBuilder::new(snapshot_with_sources(&[
1969 (
1970 "FERRUM_REQUESTED_GPU_DEVICES",
1971 "0,1",
1972 RuntimeConfigSource::Cli,
1973 ),
1974 (
1975 "FERRUM_SELECTED_GPU_DEVICES",
1976 "0,1",
1977 RuntimeConfigSource::Cli,
1978 ),
1979 ("FERRUM_CUDA_DEVICE_COUNT", "2", RuntimeConfigSource::Cli),
1980 (
1981 "FERRUM_SELECTED_DISTRIBUTED_STRATEGY",
1982 "layer_split",
1983 RuntimeConfigSource::Cli,
1984 ),
1985 (
1986 "FERRUM_SELECTED_LAYER_SPLIT_PLAN",
1987 "stage0:cuda:0:layers=0-39;stage1:cuda:1:layers=40-79",
1988 RuntimeConfigSource::Cli,
1989 ),
1990 (
1991 "FERRUM_SELECTED_LAYER_SPLIT_STAGES",
1992 r#"[{"stage":0,"device":0,"layer_start":0,"layer_end":39},{"stage":1,"device":1,"layer_start":40,"layer_end":79}]"#,
1993 RuntimeConfigSource::Cli,
1994 ),
1995 ("FERRUM_KV_CAPACITY", "512", RuntimeConfigSource::Cli),
1996 ]))
1997 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
1998 CompiledKernelFeatures::m3_fast_path_without_fa2(),
1999 ))
2000 .resolve()
2001 .unwrap();
2002
2003 let doc = resolved.effective_config_document();
2004 assert_eq!(doc["backend"], "cuda");
2005 assert_eq!(doc["requested_gpu_devices"], serde_json::json!([0, 1]));
2006 assert_eq!(doc["selected_gpu_devices"], serde_json::json!([0, 1]));
2007 assert_eq!(doc["cuda_device_count"], 2);
2008 assert_eq!(doc["selected_distributed_strategy"], "layer_split");
2009 assert_eq!(
2010 doc["selected_layer_split_plan"],
2011 "stage0:cuda:0:layers=0-39;stage1:cuda:1:layers=40-79"
2012 );
2013 assert_eq!(
2014 doc["selected_layer_split_stages"],
2015 serde_json::json!([
2016 {"stage": 0, "device": 0, "layer_start": 0, "layer_end": 39},
2017 {"stage": 1, "device": 1, "layer_start": 40, "layer_end": 79}
2018 ])
2019 );
2020 assert_eq!(doc["selected_weight_placement"], "layer_split");
2021 assert_eq!(doc["selected_pipeline_mode"], "overlapped");
2022 assert_eq!(doc["selected_stage_bridge"], "host");
2023 assert_eq!(
2024 doc["selected_microbatch_size"],
2025 serde_json::json!(doc["selected_max_sequences"].as_u64().unwrap().div_ceil(2))
2026 );
2027 assert_eq!(
2028 doc["selected_admission_limit"],
2029 doc["selected_max_sequences"]
2030 );
2031 assert_eq!(doc["selected_kv_capacity"], 512);
2032 }
2033
2034 #[test]
2035 fn effective_config_document_honors_explicit_layer_split_batch_mode() {
2036 let resolved = FerrumConfigBuilder::new(snapshot_with_sources(&[
2037 (
2038 "FERRUM_REQUESTED_GPU_DEVICES",
2039 "0,1",
2040 RuntimeConfigSource::Cli,
2041 ),
2042 (
2043 "FERRUM_SELECTED_GPU_DEVICES",
2044 "0,1",
2045 RuntimeConfigSource::Cli,
2046 ),
2047 (
2048 "FERRUM_SELECTED_DISTRIBUTED_STRATEGY",
2049 "layer_split",
2050 RuntimeConfigSource::Cli,
2051 ),
2052 (
2053 "FERRUM_SELECTED_LAYER_SPLIT_PLAN",
2054 "stage0:cuda:0:layers=0-39;stage1:cuda:1:layers=40-79",
2055 RuntimeConfigSource::Cli,
2056 ),
2057 (
2058 "FERRUM_LAYER_SPLIT_PIPELINE_MODE",
2059 "batch",
2060 RuntimeConfigSource::Cli,
2061 ),
2062 ("FERRUM_PAGED_MAX_SEQS", "16", RuntimeConfigSource::Cli),
2063 ]))
2064 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
2065 CompiledKernelFeatures::m3_fast_path_without_fa2(),
2066 ))
2067 .resolve()
2068 .unwrap();
2069
2070 let doc = resolved.effective_config_document();
2071 assert_eq!(doc["selected_pipeline_mode"], "batch");
2072 assert_eq!(doc["selected_microbatch_size"], 16);
2073 }
2074
2075 #[test]
2076 fn invalid_layer_split_pipeline_mode_is_rejected() {
2077 expect_invalid_key_with_hardware(
2078 &[("FERRUM_LAYER_SPLIT_PIPELINE_MODE", "serial")],
2079 "FERRUM_LAYER_SPLIT_PIPELINE_MODE",
2080 HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2()),
2081 );
2082 }
2083
2084 #[test]
2085 fn hardware_incompatible_attention_and_sampling_overrides_are_rejected() {
2086 let cpu =
2087 cpu_hardware_with_features(CompiledKernelFeatures::m3_fast_path_with_source_fa2());
2088 expect_invalid_key_with_hardware(
2089 &[("FERRUM_USE_VLLM_PAGED_ATTN", "1")],
2090 "FERRUM_USE_VLLM_PAGED_ATTN",
2091 cpu.clone(),
2092 );
2093 expect_invalid_key_with_hardware(
2094 &[("FERRUM_VLLM_MOE", "1")],
2095 "FERRUM_VLLM_MOE",
2096 cpu.clone(),
2097 );
2098 expect_invalid_key_with_hardware(
2099 &[("FERRUM_GREEDY_ARGMAX", "1")],
2100 "FERRUM_GREEDY_ARGMAX",
2101 cpu.clone(),
2102 );
2103 expect_invalid_key_with_hardware(&[("FERRUM_FA2_SOURCE", "1")], "FERRUM_FA2_SOURCE", cpu);
2104
2105 let mut old_cuda = HardwareCapabilities::rtx4090_cuda(
2106 CompiledKernelFeatures::m3_fast_path_with_source_fa2(),
2107 );
2108 old_cuda.compute_capability = Some("7.5".to_string());
2109 expect_invalid_key_with_hardware(
2110 &[("FERRUM_FA2_SOURCE", "1")],
2111 "FERRUM_FA2_SOURCE",
2112 old_cuda,
2113 );
2114 }
2115
2116 #[test]
2117 fn hardware_capacity_sizes_default_sequence_budget_without_overriding_user_values() {
2118 let mut small_gpu =
2119 HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
2120 small_gpu.sm_count = Some(16);
2121 small_gpu.vram_bytes = Some(24 * 1024 * 1024 * 1024);
2122
2123 let resolved = m3_with_hardware(&[], small_gpu.clone()).resolve().unwrap();
2124 let decision = |selection: &str| {
2125 resolved
2126 .decisions
2127 .iter()
2128 .find(|decision| decision.selection == selection)
2129 .unwrap()
2130 };
2131 let max_sequences = decision("max_sequences");
2132 assert_eq!(max_sequences.selected, "4");
2133 assert_eq!(max_sequences.source, AutoConfigSource::HardwareCapability);
2134 let max_batched_tokens = decision("max_batched_tokens");
2135 assert_eq!(max_batched_tokens.selected, "256");
2136 assert_eq!(
2137 max_batched_tokens.source,
2138 AutoConfigSource::HardwareCapability
2139 );
2140
2141 let resolved = m3_with_hardware(&[("FERRUM_PAGED_MAX_SEQS", "16")], small_gpu)
2142 .resolve()
2143 .unwrap();
2144 let max_sequences = resolved
2145 .decisions
2146 .iter()
2147 .find(|decision| decision.selection == "max_sequences")
2148 .unwrap();
2149 assert_eq!(max_sequences.selected, "16");
2150 assert_eq!(max_sequences.source, AutoConfigSource::Env);
2151 assert_eq!(
2152 max_sequences.source_key.as_deref(),
2153 Some("FERRUM_PAGED_MAX_SEQS")
2154 );
2155 }
2156
2157 #[test]
2158 fn vram_capacity_caps_m3_default_sequence_budget() {
2159 let mut low_vram_gpu =
2160 HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
2161 low_vram_gpu.sm_count = Some(128);
2162 low_vram_gpu.vram_bytes = Some(7 * 1024 * 1024 * 1024);
2163
2164 let resolved = m3_with_hardware(&[], low_vram_gpu).resolve().unwrap();
2165 let max_sequences = resolved
2166 .decisions
2167 .iter()
2168 .find(|decision| decision.selection == "max_sequences")
2169 .unwrap();
2170 assert_eq!(max_sequences.selected, "4");
2171 assert_eq!(max_sequences.source, AutoConfigSource::HardwareCapability);
2172 }
2173
2174 #[test]
2175 fn memory_budget_keeps_rtx4090_m3_kv_blocks_but_caps_constrained_vram() {
2176 let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
2177 .resolve()
2178 .unwrap();
2179 let decision = |selection: &str| {
2180 resolved
2181 .decisions
2182 .iter()
2183 .find(|decision| decision.selection == selection)
2184 .unwrap()
2185 };
2186 assert_eq!(decision("kv_block_count").selected, "2048");
2187 assert_eq!(
2188 decision("kv_block_count").source,
2189 AutoConfigSource::WorkloadPreset
2190 );
2191
2192 let mut constrained =
2193 HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
2194 constrained.vram_bytes = Some(20 * 1024 * 1024 * 1024);
2195 let resolved = m3_with_hardware(&[], constrained).resolve().unwrap();
2196 let decision = |selection: &str| {
2197 resolved
2198 .decisions
2199 .iter()
2200 .find(|decision| decision.selection == selection)
2201 .unwrap()
2202 };
2203 assert_eq!(decision("kv_block_count").selected, "2");
2204 assert_eq!(
2205 decision("kv_block_count").source,
2206 AutoConfigSource::HardwareCapability
2207 );
2208 assert_eq!(decision("max_batched_tokens").selected, "32");
2209 assert_eq!(
2210 decision("max_batched_tokens").source,
2211 AutoConfigSource::HardwareCapability
2212 );
2213 }
2214
2215 #[test]
2216 fn compute_capability_parser_accepts_major_minor_and_major_only() {
2217 assert_eq!(parse_compute_capability("8.9"), Some((8, 9)));
2218 assert_eq!(parse_compute_capability("9"), Some((9, 0)));
2219 assert_eq!(parse_compute_capability("N/A"), None);
2220 }
2221
2222 #[test]
2223 fn vram_capacity_tiers_are_monotonic() {
2224 assert_eq!(vram_default_max_sequences(24 * 1024 * 1024 * 1024), 32);
2225 assert_eq!(vram_default_max_sequences(16 * 1024 * 1024 * 1024), 16);
2226 assert_eq!(vram_default_max_sequences(8 * 1024 * 1024 * 1024), 8);
2227 assert_eq!(vram_default_max_sequences(6 * 1024 * 1024 * 1024), 4);
2228 }
2229
2230 #[test]
2231 fn accelerator_serving_default_uses_hardware_concurrency_budget() {
2232 let hardware =
2233 HardwareCapabilities::rtx4090_cuda(CompiledKernelFeatures::m3_fast_path_without_fa2());
2234 let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
2235 assert_eq!(workload.target_concurrency, 32);
2236
2237 let resolved = FerrumConfigBuilder::new(snapshot(&[]))
2238 .with_model_capabilities(ModelCapabilities::unknown())
2239 .with_hardware_capabilities(hardware)
2240 .with_workload_profile(workload)
2241 .resolve()
2242 .unwrap();
2243 let max_sequences = resolved
2244 .decisions
2245 .iter()
2246 .find(|decision| decision.selection == "max_sequences")
2247 .unwrap();
2248 assert_eq!(max_sequences.selected, "32");
2249 }
2250
2251 #[test]
2252 fn cpu_serving_default_keeps_single_sequence_budget() {
2253 let hardware = HardwareCapabilities {
2254 backend: "cpu".to_string(),
2255 supported_dtypes: vec!["fp32".to_string()],
2256 ..HardwareCapabilities::unknown()
2257 };
2258 let workload = WorkloadProfile::serving_default_for_hardware(&hardware);
2259 assert_eq!(workload.target_concurrency, 1);
2260 }
2261
2262 #[test]
2263 fn validates_invalid_override_matrix() {
2264 expect_invalid_key(
2265 &[("FERRUM_USE_VLLM_PAGED_ATTN", "maybe")],
2266 "FERRUM_USE_VLLM_PAGED_ATTN",
2267 );
2268 expect_invalid_key(&[("FERRUM_PREFIX_CACHE", "maybe")], "FERRUM_PREFIX_CACHE");
2269 expect_invalid_key(
2270 &[
2271 ("FERRUM_FA_LAYOUT_VARLEN", "1"),
2272 ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
2273 ],
2274 "FERRUM_FA_LAYOUT_VARLEN",
2275 );
2276 expect_invalid_key(&[("FERRUM_FA2_DIRECT_FFI", "1")], "FERRUM_FA2_DIRECT_FFI");
2277 expect_invalid_key_with_features(
2278 &[("FERRUM_VLLM_MOE", "1")],
2279 "FERRUM_VLLM_MOE",
2280 CompiledKernelFeatures::default(),
2281 );
2282 expect_invalid_key(
2283 &[("FERRUM_MOE_DEVICE_ROUTE", "1"), ("FERRUM_VLLM_MOE", "0")],
2284 "FERRUM_MOE_DEVICE_ROUTE",
2285 );
2286 expect_invalid_key(
2287 &[("FERRUM_VLLM_MOE_PAIR_IDS", "1"), ("FERRUM_VLLM_MOE", "0")],
2288 "FERRUM_VLLM_MOE_PAIR_IDS",
2289 );
2290 expect_invalid_key(
2291 &[("FERRUM_MOE_GRAPH", "1"), ("FERRUM_VLLM_MOE", "0")],
2292 "FERRUM_MOE_GRAPH",
2293 );
2294 expect_invalid_key(&[("FERRUM_KV_MAX_BLOCKS", "0")], "FERRUM_KV_MAX_BLOCKS");
2295 expect_invalid_key(&[("FERRUM_PAGED_MAX_SEQS", "0")], "FERRUM_PAGED_MAX_SEQS");
2296 expect_invalid_key(
2297 &[
2298 ("FERRUM_PAGED_MAX_SEQS", "32"),
2299 ("FERRUM_MAX_BATCHED_TOKENS", "16"),
2300 ],
2301 "FERRUM_MAX_BATCHED_TOKENS",
2302 );
2303 expect_invalid_key(
2304 &[
2305 ("FERRUM_KV_MAX_BLOCKS", "16"),
2306 ("FERRUM_MAX_BATCHED_TOKENS", "512"),
2307 ],
2308 "FERRUM_MAX_BATCHED_TOKENS",
2309 );
2310 expect_invalid_key(&[("FERRUM_MAX_MODEL_LEN", "0")], "FERRUM_MAX_MODEL_LEN");
2311 expect_invalid_key(&[("FERRUM_MAX_MODEL_LEN", "50000")], "FERRUM_MAX_MODEL_LEN");
2312 expect_invalid_key(
2313 &[
2314 ("FERRUM_KV_MAX_BLOCKS", "16"),
2315 ("FERRUM_MAX_MODEL_LEN", "1024"),
2316 ],
2317 "FERRUM_KV_MAX_BLOCKS",
2318 );
2319 expect_invalid_key(&[("FERRUM_DTYPE", "bf16")], "FERRUM_DTYPE");
2320 expect_invalid_key(&[("FERRUM_KV_DTYPE", "fp8")], "FERRUM_KV_DTYPE");
2321 expect_invalid_key(
2322 &[
2323 ("FERRUM_VLLM_PAGED_ATTN_V1_SHORT", "1"),
2324 ("FERRUM_USE_VLLM_PAGED_ATTN", "0"),
2325 ],
2326 "FERRUM_VLLM_PAGED_ATTN_V1_SHORT",
2327 );
2328 }
2329
2330 #[test]
2331 fn requested_max_model_len_is_optional_and_reflected_when_valid() {
2332 let default_resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
2333 .resolve()
2334 .unwrap();
2335 assert!(!default_resolved
2336 .decisions
2337 .iter()
2338 .any(|decision| decision.selection == "max_model_len"));
2339
2340 let resolved = m3(
2341 &[
2342 ("FERRUM_KV_MAX_BLOCKS", "64"),
2343 ("FERRUM_MAX_MODEL_LEN", "1024"),
2344 ],
2345 CompiledKernelFeatures::m3_fast_path_without_fa2(),
2346 )
2347 .resolve()
2348 .unwrap();
2349 let max_model_len = resolved
2350 .decisions
2351 .iter()
2352 .find(|decision| decision.selection == "max_model_len")
2353 .unwrap();
2354 assert_eq!(max_model_len.selected, "1024");
2355 assert_eq!(
2356 max_model_len.source_key.as_deref(),
2357 Some("FERRUM_MAX_MODEL_LEN")
2358 );
2359 }
2360
2361 #[test]
2362 fn graph_enabled_with_graph_unsafe_moe_is_rejected() {
2363 let mut model = ModelCapabilities::qwen3_30b_a3b_gptq_int4();
2364 model.graph_safe_moe = false;
2365 let err = FerrumConfigBuilder::new(snapshot(&[("FERRUM_MOE_GRAPH", "1")]))
2366 .with_model_capabilities(model)
2367 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
2368 CompiledKernelFeatures::m3_fast_path_without_fa2(),
2369 ))
2370 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
2371 .resolve()
2372 .expect_err("graph unsafe MoE must fail");
2373 assert!(matches!(
2374 err,
2375 AutoConfigError::UnsupportedCombination {
2376 selection,
2377 ..
2378 } if selection == "moe_graph_policy"
2379 ));
2380 }
2381
2382 #[test]
2383 fn scheduler_override_is_reflected_in_decision_trace() {
2384 let resolved = m3(
2385 &[("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK", "64")],
2386 CompiledKernelFeatures::m3_fast_path_without_fa2(),
2387 )
2388 .resolve()
2389 .unwrap();
2390 let scheduler = resolved
2391 .decisions
2392 .iter()
2393 .find(|decision| decision.selection == "scheduler_admission_policy")
2394 .unwrap();
2395 assert_eq!(scheduler.selected, "active_decode_prefill_chunk:64");
2396 assert_eq!(
2397 scheduler.source_key.as_deref(),
2398 Some("FERRUM_ACTIVE_DECODE_PREFILL_CHUNK")
2399 );
2400 }
2401
2402 #[test]
2403 fn prefix_cache_override_is_reflected_in_decision_trace() {
2404 let resolved = m3(
2405 &[("FERRUM_PREFIX_CACHE", "1")],
2406 CompiledKernelFeatures::m3_fast_path_without_fa2(),
2407 )
2408 .resolve()
2409 .unwrap();
2410 let prefix_cache = resolved
2411 .decisions
2412 .iter()
2413 .find(|decision| decision.selection == "prefix_cache_policy")
2414 .unwrap();
2415 assert_eq!(prefix_cache.selected, "prefix_cache_enabled");
2416 assert_eq!(
2417 prefix_cache.source_key.as_deref(),
2418 Some("FERRUM_PREFIX_CACHE")
2419 );
2420 }
2421
2422 #[test]
2423 fn non_env_runtime_sources_are_preserved_in_decision_trace() {
2424 let runtime_config = snapshot_with_sources(&[
2425 (
2426 "FERRUM_FA_LAYOUT_VARLEN",
2427 "1",
2428 RuntimeConfigSource::ConfigFile,
2429 ),
2430 ("FERRUM_PAGED_MAX_SEQS", "48", RuntimeConfigSource::Cli),
2431 (
2432 "FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE",
2433 "32",
2434 RuntimeConfigSource::ScriptCase,
2435 ),
2436 ]);
2437 let resolved = FerrumConfigBuilder::new(runtime_config)
2438 .with_model_capabilities(ModelCapabilities::qwen3_30b_a3b_gptq_int4())
2439 .with_hardware_capabilities(HardwareCapabilities::rtx4090_cuda(
2440 CompiledKernelFeatures::m3_fast_path_without_fa2(),
2441 ))
2442 .with_workload_profile(WorkloadProfile::m3_qwen3_30b_a3b_int4())
2443 .resolve()
2444 .unwrap();
2445
2446 let decision = |selection: &str| {
2447 resolved
2448 .decisions
2449 .iter()
2450 .find(|decision| decision.selection == selection)
2451 .unwrap()
2452 };
2453 let attention = decision("attention_prefill_mixed_backend");
2454 assert_eq!(attention.selected, "fa_layout_varlen");
2455 assert_eq!(attention.source, AutoConfigSource::ConfigFile);
2456 assert_eq!(
2457 attention.source_key.as_deref(),
2458 Some("FERRUM_FA_LAYOUT_VARLEN")
2459 );
2460
2461 let max_sequences = decision("max_sequences");
2462 assert_eq!(max_sequences.selected, "48");
2463 assert_eq!(max_sequences.source, AutoConfigSource::Cli);
2464 assert_eq!(
2465 max_sequences.source_key.as_deref(),
2466 Some("FERRUM_PAGED_MAX_SEQS")
2467 );
2468
2469 let scheduler = decision("scheduler_admission_policy");
2470 assert_eq!(scheduler.selected, "prefill_first_until_active:32");
2471 assert_eq!(scheduler.source, AutoConfigSource::ScriptCase);
2472 assert_eq!(
2473 scheduler.source_key.as_deref(),
2474 Some("FERRUM_SCHED_PREFILL_FIRST_UNTIL_ACTIVE")
2475 );
2476 }
2477
2478 #[test]
2479 fn renders_effective_config_and_decision_trace_artifacts() {
2480 let resolved = m3(&[], CompiledKernelFeatures::m3_fast_path_without_fa2())
2481 .resolve()
2482 .unwrap();
2483 let effective = resolved.effective_config_document();
2484 assert_eq!(effective["schema_version"], 1);
2485 assert!(effective["env_hash"]
2486 .as_str()
2487 .unwrap()
2488 .starts_with("sha256:"));
2489 assert!(effective["entries"].is_array());
2490 assert_eq!(effective["model_capabilities"]["architecture"], "qwen3_moe");
2491 assert_eq!(effective["hardware_capabilities"]["backend"], "cuda");
2492 assert_eq!(
2493 effective["workload_profile"]["preset"],
2494 M3_QWEN3_30B_A3B_INT4_PRESET
2495 );
2496 assert_eq!(
2497 effective["decisions"].as_array().unwrap().len(),
2498 resolved.decisions.len()
2499 );
2500 let trace = resolved.decision_trace_jsonl().unwrap();
2501 assert_eq!(trace.lines().count(), resolved.decisions.len());
2502 assert!(trace.contains("\"attention_prefill_mixed_backend\""));
2503 }
2504
2505 #[test]
2506 fn auto_config_artifacts_match_locked_schema_shape() {
2507 let resolved = FerrumConfigBuilder::m3_qwen3_30b_a3b_int4(snapshot_with_sources(&[
2508 (
2509 "FERRUM_FA_LAYOUT_VARLEN",
2510 "1",
2511 RuntimeConfigSource::ScriptCase,
2512 ),
2513 ("FERRUM_PAGED_MAX_SEQS", "32", RuntimeConfigSource::Cli),
2514 ]))
2515 .resolve()
2516 .unwrap();
2517
2518 let effective = resolved.effective_config_document();
2519 assert_eq!(effective["schema_version"], 1);
2520 assert!(effective["env_hash"]
2521 .as_str()
2522 .unwrap()
2523 .starts_with("sha256:"));
2524
2525 let entries = effective["entries"].as_array().unwrap();
2526 let keys: Vec<_> = entries
2527 .iter()
2528 .map(|entry| entry["key"].as_str().unwrap())
2529 .collect();
2530 let mut sorted_keys = keys.clone();
2531 sorted_keys.sort_unstable();
2532 assert_eq!(keys, sorted_keys);
2533 for entry in entries {
2534 assert!(entry["key"].as_str().unwrap().starts_with("FERRUM_"));
2535 assert!(entry["effective_value"].is_string());
2536 assert!(matches!(
2537 entry["source"].as_str().unwrap(),
2538 "default" | "config_file" | "cli" | "env" | "script_case" | "memory_profile"
2539 ));
2540 assert!(!entry["affects"].as_array().unwrap().is_empty());
2541 }
2542 assert_eq!(
2543 effective["model_capabilities"]["quantization"].as_str(),
2544 Some("gptq_int4")
2545 );
2546 assert_eq!(
2547 effective["model_capabilities"]["moe"]["experts_per_token"].as_u64(),
2548 Some(8)
2549 );
2550 assert_eq!(
2551 effective["hardware_capabilities"]["compute_capability"].as_str(),
2552 Some("8.9")
2553 );
2554 assert_eq!(
2555 effective["hardware_capabilities"]["compiled_features"]["vllm_moe_marlin"].as_bool(),
2556 Some(true)
2557 );
2558 assert_eq!(
2559 effective["workload_profile"]["target_concurrency"].as_u64(),
2560 Some(32)
2561 );
2562 assert_eq!(
2563 effective["workload_profile"]["priority"].as_str(),
2564 Some("throughput")
2565 );
2566 let admission = &effective["admission"];
2567 for field in [
2568 "effective_max_concurrent",
2569 "queue_depth",
2570 "active_prefill",
2571 "active_decode",
2572 "current_batch_size",
2573 "rejected_requests_total",
2574 "failed_requests_total",
2575 "completed_requests_total",
2576 ] {
2577 assert!(admission[field].is_number(), "admission.{field} missing");
2578 }
2579
2580 let trace = resolved.decision_trace_jsonl().unwrap();
2581 let trace_decisions: Vec<AutoConfigDecision> = trace
2582 .lines()
2583 .map(|line| serde_json::from_str(line).unwrap())
2584 .collect();
2585 assert_eq!(trace_decisions, resolved.decisions);
2586 assert_eq!(
2587 serde_json::from_value::<Vec<AutoConfigDecision>>(effective["decisions"].clone())
2588 .unwrap(),
2589 trace_decisions
2590 );
2591
2592 for decision in &trace_decisions {
2593 assert_eq!(decision.schema_version, 1);
2594 assert!(!decision.selection.trim().is_empty());
2595 assert!(!decision.selected.trim().is_empty());
2596 assert!(!decision.candidates.is_empty());
2597 assert!(!decision.affects.is_empty());
2598 if let Some(source_key) = &decision.source_key {
2599 assert!(source_key.starts_with("FERRUM_"));
2600 }
2601 for rejected in &decision.rejected {
2602 assert!(!rejected.value.trim().is_empty());
2603 assert!(!rejected.reason.trim().is_empty());
2604 }
2605 }
2606 }
2607}