//! inference_lab/config/scheduler.rs — scheduler configuration.

use serde::Deserialize;

/// Scheduler configuration, deserialized from the application config via serde.
#[derive(Debug, Clone, Deserialize)]
pub struct SchedulerConfig {
    /// Maximum number of tokens processed in a single iteration.
    pub max_num_batched_tokens: u32,

    /// Maximum number of sequences that can run concurrently.
    pub max_num_seqs: u32,

    /// Scheduling policy: "fcfs" or "priority".
    /// NOTE(review): stringly-typed — an enum with
    /// `#[serde(rename_all = "lowercase")]` would reject typos at
    /// deserialization time; verify no caller relies on other string values.
    pub policy: String,

    /// Enable chunked prefilling.
    pub enable_chunked_prefill: bool,

    /// Maximum tokens to prefill in a single iteration (vLLM's
    /// `long_prefill_token_threshold`).
    /// Deserializes to 0 when absent; `set_default_prefill_threshold`
    /// later derives 4% of `max_model_len` for the 0-means-unset case.
    #[serde(default)]
    pub long_prefill_token_threshold: u32,

    /// Maximum number of sequences that can be partially prefilled
    /// concurrently (vLLM default: 1).
    /// This limits how many NEW waiting requests can start prefilling per
    /// iteration. Defaults via `default_max_num_partial_prefills`.
    #[serde(default = "default_max_num_partial_prefills")]
    pub max_num_partial_prefills: u32,

    /// Block size for KV cache (in tokens).
    pub block_size: u32,

    /// Enable preemption-free scheduling mode.
    /// When enabled, uses conservative admission control to guarantee zero
    /// preemptions.
    #[serde(default)]
    pub enable_preemption_free: bool,
}
35
/// Serde default for [`SchedulerConfig::max_num_partial_prefills`]
/// (matches vLLM's default of a single concurrent partial prefill).
fn default_max_num_partial_prefills() -> u32 {
    1
}
39
40impl SchedulerConfig {
41    /// Set default prefill threshold based on max model length (vLLM uses 4%)
42    /// Only sets threshold if max_num_partial_prefills > 1 (matching vLLM behavior)
43    pub fn set_default_prefill_threshold(&mut self, max_model_len: u32) {
44        if self.enable_chunked_prefill
45            && self.max_num_partial_prefills > 1
46            && self.long_prefill_token_threshold == 0
47        {
48            self.long_prefill_token_threshold = (max_model_len as f64 * 0.04) as u32;
49        }
50    }
51}