use std::sync::Arc;
use crate::common::perf_model::PerfModel;
use crate::common::protocols::{MockEngineArgs, WorkerType};
// Fallbacks applied by `SglangConfig::from_args` when the SGLang-specific
// arguments are missing or leave the corresponding field unset.

/// Default for `max_prefill_tokens`.
const DEFAULT_MAX_PREFILL_TOKENS: usize = 16384;
/// Default for `chunked_prefill_size`.
const DEFAULT_CHUNKED_PREFILL_SIZE: usize = 8192;
/// Default for `clip_max_new_tokens`.
const DEFAULT_CLIP_MAX_NEW_TOKENS: usize = 4096;
/// Base value of `init_new_token_ratio`; `from_args` multiplies it by
/// `schedule_conservativeness` (1.0 when that option is unset).
const DEFAULT_INIT_NEW_TOKEN_RATIO: f64 = 0.7;
/// `min_new_token_ratio` is `init_new_token_ratio` multiplied by this factor.
const DEFAULT_MIN_NEW_TOKEN_RATIO_FACTOR: f64 = 0.14;
/// Divisor applied to `init_new_token_ratio - min_new_token_ratio` to obtain
/// `new_token_ratio_decay_step`.
const DEFAULT_NEW_TOKEN_RATIO_DECAY_STEPS: f64 = 600.0;

// The thresholds below are exported to the parent module and are not used in
// this part of the file.

/// NOTE(review): presumably the size limit beyond which the LPM policy falls
/// back to a simpler ordering -- confirm against the scheduler that reads it.
pub(super) const LPM_FALLBACK_THRESHOLD: usize = 128;
/// NOTE(review): presumably a threshold used by an in-batch prefix-caching
/// check -- unit and comparison direction are not visible here; confirm.
pub(super) const IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD: usize = 32;
/// NOTE(review): presumably a threshold used to deprioritize requests during
/// in-batch prefix caching -- confirm against the scheduler that reads it.
pub(super) const IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD: usize = 32;
/// Scheduling policy selected from the `schedule_policy` string in the
/// SGLang arguments (see `SglangConfig::from_args`).
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum SchedulePolicy {
/// The default policy. `from_args` selects it for `"fifo"`, `"fcfs"`, a
/// missing policy string, and (with a warning) any unrecognized string.
#[default]
Fifo,
/// Selected by the policy string `"lpm"`.
/// NOTE(review): presumably "longest prefix match" -- confirm.
Lpm,
}
/// Resolved scheduler settings, built from `MockEngineArgs` by
/// [`SglangConfig::from_args`].
pub(super) struct SglangConfig {
/// Policy parsed from the optional `schedule_policy` string.
pub(super) schedule_policy: SchedulePolicy,
/// SGLang `max_prefill_tokens`, or `DEFAULT_MAX_PREFILL_TOKENS` if unset.
pub(super) max_prefill_tokens: usize,
/// SGLang `chunked_prefill_size`, or `DEFAULT_CHUNKED_PREFILL_SIZE` if unset.
pub(super) chunked_prefill_size: usize,
/// SGLang `clip_max_new_tokens`, or `DEFAULT_CLIP_MAX_NEW_TOKENS` if unset.
pub(super) clip_max_new_tokens: usize,
/// `DEFAULT_INIT_NEW_TOKEN_RATIO * schedule_conservativeness`.
pub(super) init_new_token_ratio: f64,
/// `init_new_token_ratio * DEFAULT_MIN_NEW_TOKEN_RATIO_FACTOR`.
pub(super) min_new_token_ratio: f64,
/// `(init_new_token_ratio - min_new_token_ratio) / DEFAULT_NEW_TOKEN_RATIO_DECAY_STEPS`.
pub(super) new_token_ratio_decay_step: f64,
/// Shared handle to the performance model, cloned from the engine args.
pub(super) perf_model: Arc<PerfModel>,
/// Copied verbatim from `MockEngineArgs::speedup_ratio`.
pub(super) speedup_ratio: f64,
/// Copied verbatim from `MockEngineArgs::decode_speedup_ratio`.
pub(super) decode_speedup_ratio: f64,
/// Copied verbatim from `MockEngineArgs::worker_type`.
pub(super) worker_type: WorkerType,
/// Copied verbatim from `MockEngineArgs::block_size`.
pub(super) block_size: usize,
/// `num_gpu_blocks * block_size` from the engine args.
pub(super) total_kv_tokens: usize,
/// Copied verbatim from `MockEngineArgs::kv_bytes_per_token`; may be absent.
pub(super) kv_bytes_per_token: Option<usize>,
/// Copied verbatim from `MockEngineArgs::kv_transfer_bandwidth`; may be
/// absent. NOTE(review): unit is not visible here -- confirm with the consumer.
pub(super) kv_transfer_bandwidth: Option<f64>,
}
impl SglangConfig {
pub(super) fn from_args(args: &MockEngineArgs) -> Self {
let sglang = args.sglang.as_ref();
let schedule_conservativeness = sglang
.and_then(|s| s.schedule_conservativeness)
.unwrap_or(1.0);
let init_new_token_ratio = DEFAULT_INIT_NEW_TOKEN_RATIO * schedule_conservativeness;
let min_new_token_ratio = init_new_token_ratio * DEFAULT_MIN_NEW_TOKEN_RATIO_FACTOR;
let decay_steps = DEFAULT_NEW_TOKEN_RATIO_DECAY_STEPS;
let decay_step = (init_new_token_ratio - min_new_token_ratio) / decay_steps;
let policy_str = sglang.and_then(|s| s.schedule_policy.as_deref());
let schedule_policy = match policy_str {
Some("lpm") => SchedulePolicy::Lpm,
Some("fifo") | Some("fcfs") | None => SchedulePolicy::Fifo,
Some(other) => {
tracing::warn!(
"Unknown sglang schedule_policy '{}', falling back to FIFO",
other
);
SchedulePolicy::Fifo
}
};
Self {
schedule_policy,
max_prefill_tokens: sglang
.and_then(|s| s.max_prefill_tokens)
.unwrap_or(DEFAULT_MAX_PREFILL_TOKENS),
chunked_prefill_size: sglang
.and_then(|s| s.chunked_prefill_size)
.unwrap_or(DEFAULT_CHUNKED_PREFILL_SIZE),
clip_max_new_tokens: sglang
.and_then(|s| s.clip_max_new_tokens)
.unwrap_or(DEFAULT_CLIP_MAX_NEW_TOKENS),
init_new_token_ratio,
min_new_token_ratio,
new_token_ratio_decay_step: decay_step,
perf_model: args.perf_model.clone(),
speedup_ratio: args.speedup_ratio,
decode_speedup_ratio: args.decode_speedup_ratio,
worker_type: args.worker_type,
block_size: args.block_size,
total_kv_tokens: args.num_gpu_blocks * args.block_size,
kv_bytes_per_token: args.kv_bytes_per_token,
kv_transfer_bandwidth: args.kv_transfer_bandwidth,
}
}
}
/// Rounds `tokens` up to the nearest multiple of `block_size`.
///
/// Zero tokens always yields zero, regardless of `block_size`.
///
/// # Panics
///
/// Panics if `block_size` is zero while `tokens` is non-zero.
pub(super) fn ceil_to_block(tokens: usize, block_size: usize) -> usize {
    match tokens {
        // Handled separately so that a zero `block_size` is never divided by.
        0 => 0,
        count => count.div_ceil(block_size) * block_size,
    }
}
/// Rounds `tokens` down to the nearest multiple of `block_size`.
///
/// # Panics
///
/// Panics if `block_size` is zero.
pub(super) fn floor_to_block(tokens: usize, block_size: usize) -> usize {
    // Integer division discards the partial trailing block.
    let whole_blocks = tokens / block_size;
    whole_blocks * block_size
}