use crate::common::protocols::SchedulingPolicy;
use crate::common::sequence::ActiveSequence;
use crate::kv_manager::KvManager;
/// Returns how many additional KV blocks `sequence` may still require to run
/// to completion.
///
/// The worst-case footprint is
/// `ceil((num_input_tokens + max_output_tokens) / block_size)` blocks. From
/// that total one of two credits is subtracted:
///
/// * If the sequence has no allocated tokens yet, the credit is the number of
///   whole blocks covered by `active_cached_tokens`, as reported by
///   `kv_manager.get_prefill_cost(sequence)` (presumably prefix tokens that
///   are already resident and reusable — confirm against `KvManager`).
/// * Otherwise, the credit is the number of blocks covering the tokens already
///   allocated to the sequence, rounded up.
///
/// All arithmetic saturates, so the result is `0` when the credit meets or
/// exceeds the worst-case footprint.
///
/// # Panics
///
/// Panics if `block_size` is zero (division by zero).
pub(crate) fn blocks_needed_to_finish(
    sequence: &ActiveSequence,
    block_size: usize,
    kv_manager: &KvManager,
) -> usize {
    // Saturate instead of using a bare `+`: an un-normalized, very large
    // `max_output_tokens` would otherwise panic in debug builds and wrap in
    // release builds, where a wrapped value under-counts the reservation.
    let full_blocks = sequence
        .num_input_tokens()
        .saturating_add(sequence.max_output_tokens())
        .div_ceil(block_size);

    if sequence.num_allocated_tokens() == 0 {
        // Only fully covered blocks count as reusable, hence floor division.
        let reusable_blocks =
            kv_manager.get_prefill_cost(sequence).active_cached_tokens / block_size;
        full_blocks.saturating_sub(reusable_blocks)
    } else {
        // A partially filled block is still occupied, hence ceiling division.
        let allocated_blocks = sequence.num_allocated_tokens().div_ceil(block_size);
        full_blocks.saturating_sub(allocated_blocks)
    }
}
/// Computes how many GPU blocks can still be handed out once every running
/// sequence's remaining demand has been set aside.
///
/// The remaining demand of each sequence in `running` is obtained from
/// [`blocks_needed_to_finish`] and summed. That sum is then subtracted from
/// the blocks not currently active (`num_gpu_blocks` minus
/// `kv_manager.num_active_blocks()`). Both subtractions saturate at zero.
pub(crate) fn available_blocks<'a>(
    running: impl Iterator<Item = &'a ActiveSequence>,
    num_gpu_blocks: usize,
    block_size: usize,
    kv_manager: &KvManager,
) -> usize {
    // Blocks that the running sequences may still claim before they finish.
    let mut committed: usize = 0;
    for sequence in running {
        committed += blocks_needed_to_finish(sequence, block_size, kv_manager);
    }

    // Blocks that are not held by any active sequence right now.
    let unoccupied = num_gpu_blocks.saturating_sub(kv_manager.num_active_blocks());

    unoccupied.saturating_sub(committed)
}
/// Returns `true` when `policy` is `SchedulingPolicy::TrtllmGuaranteedNoEvict`,
/// and `false` for every other policy.
pub(crate) fn is_no_evict(policy: SchedulingPolicy) -> bool {
    matches!(policy, SchedulingPolicy::TrtllmGuaranteedNoEvict)
}
/// Clamps a requested output length to what the KV cache can hold after the
/// prompt.
///
/// The cache capacity is `num_gpu_blocks * block_size` tokens (saturating on
/// overflow).
///
/// # Returns
///
/// * `None` if the prompt alone fills or exceeds the capacity, leaving no room
///   for any output token.
/// * `Some(n)` otherwise, where `n` is the smaller of `max_output_tokens` and
///   the capacity remaining after the prompt.
pub(crate) fn normalize_max_output_tokens(
    prompt_len: usize,
    max_output_tokens: usize,
    num_gpu_blocks: usize,
    block_size: usize,
) -> Option<usize> {
    let total_slots = num_gpu_blocks.saturating_mul(block_size);

    match total_slots.checked_sub(prompt_len) {
        // Prompt is larger than the cache, or fills it exactly.
        None | Some(0) => None,
        Some(headroom) => Some(max_output_tokens.min(headroom)),
    }
}
/// Reports that the no-evict guarantee was broken, i.e. (per the messages
/// below) that preemption was required under `GUARANTEED_NO_EVICT`.
///
/// The behavior depends on the build configuration:
///
/// * With debug assertions enabled, the unconditional `debug_assert!(false, ..)`
///   panics, so the error log after it is not reached.
/// * Without debug assertions, the assertion is compiled out and an
///   error-level `tracing` event is emitted instead.
///
/// # Panics
///
/// Always panics when debug assertions are enabled.
pub(crate) fn report_no_evict_violation() {
// Fail loudly in debug/test builds so the invariant breach is caught early.
debug_assert!(
false,
"no-evict invariant violated: trtllm GUARANTEED_NO_EVICT required preemption"
);
// Builds without debug assertions only log the breach and keep running.
tracing::error!(
"trtllm GUARANTEED_NO_EVICT required preemption; reservation under-counted physical KV demand"
);
}