//! inference_lab/config/scheduler.rs — scheduler configuration types.

use serde::Deserialize;

/// Runtime configuration for the request scheduler.
///
/// Deserialized from the lab's config file; field names correspond to the
/// config keys. Fields without `#[serde(default)]` are required in the config.
#[derive(Debug, Clone, Deserialize)]
pub struct SchedulerConfig {
    /// Maximum number of tokens processed in a single scheduler iteration
    /// (upper bound on the combined token count of one batch).
    pub max_num_batched_tokens: u32,

    /// Maximum number of sequences that can run concurrently.
    pub max_num_seqs: u32,

    /// Scheduling policy: "fcfs" or "priority".
    // NOTE(review): stringly-typed; values other than the two above are not
    // validated here — presumably rejected by the consumer. Verify, or
    // consider an enum with #[serde(rename_all = "lowercase")].
    pub policy: String,

    /// Enable chunked prefilling (splitting a long prompt prefill across
    /// multiple iterations).
    pub enable_chunked_prefill: bool,

    /// Maximum tokens to prefill in a single iteration (vLLM's
    /// `long_prefill_token_threshold`).
    ///
    /// 0 means "not specified": `set_default_prefill_threshold` replaces a
    /// zero value with 4% of `max_model_len` (matching vLLM).
    #[serde(default)]
    pub long_prefill_token_threshold: u32,

    /// Maximum number of sequences that can be partially prefilled
    /// concurrently (vLLM default: 1). This limits how many NEW waiting
    /// requests can start prefilling per iteration.
    #[serde(default = "default_max_num_partial_prefills")]
    pub max_num_partial_prefills: u32,

    /// Block size for the KV cache, in tokens.
    pub block_size: u32,

    /// Enable preemption-free scheduling mode.
    /// When enabled, uses conservative admission control to guarantee zero
    /// preemptions.
    #[serde(default)]
    pub enable_preemption_free: bool,
}
35
/// vLLM's default for `max_num_partial_prefills`: one partial prefill at a time.
const DEFAULT_MAX_NUM_PARTIAL_PREFILLS: u32 = 1;

/// Serde default hook for [`SchedulerConfig::max_num_partial_prefills`].
fn default_max_num_partial_prefills() -> u32 {
    DEFAULT_MAX_NUM_PARTIAL_PREFILLS
}
39
40impl SchedulerConfig {
41 /// Set default prefill threshold based on max model length (vLLM uses 4%)
42 /// Only sets threshold if max_num_partial_prefills > 1 (matching vLLM behavior)
43 pub fn set_default_prefill_threshold(&mut self, max_model_len: u32) {
44 if self.enable_chunked_prefill
45 && self.max_num_partial_prefills > 1
46 && self.long_prefill_token_threshold == 0
47 {
48 self.long_prefill_token_threshold = (max_model_len as f64 * 0.04) as u32;
49 }
50 }
51}