pub struct Request {
pub request_id: String,
pub priority: i32,
pub arrival_time: f64,
pub status: RequestStatus,
pub num_prompt_tokens: u32,
pub max_output_tokens: u32,
pub target_output_tokens: u32,
pub num_computed_tokens: u32,
pub num_output_tokens: u32,
pub num_tokens: u32,
pub num_cached_tokens: u32,
pub prompt_block_hashes: Vec<u64>,
pub kv_blocks: Vec<BlockId>,
pub num_preemptions: u32,
pub first_token_time: Option<f64>,
pub completion_time: Option<f64>,
pub token_generation_times: Vec<f64>,
pub preempted_time: f64,
pub last_preempted_at: Option<f64>,
}
Request represents a single inference request in the simulation.
Fields§
request_id: String — Unique request ID
priority: i32 — Client priority (lower = higher priority)
arrival_time: f64 — Arrival time (simulated time)
status: RequestStatus — Request status
num_prompt_tokens: u32 — Number of input tokens
max_output_tokens: u32 — Maximum number of output tokens to generate
target_output_tokens: u32 — Actual number of output tokens to generate (sampled, may be less than max). This simulates hitting an EOS token.
num_computed_tokens: u32 — Number of tokens computed so far
num_output_tokens: u32 — Number of output tokens generated so far
num_tokens: u32 — Total tokens (prompt + output)
num_cached_tokens: u32 — Number of prefix-cached tokens (set by cache manager)
prompt_block_hashes: Vec<u64> — Synthetic block hashes for prefix caching modeling. In synthetic mode: pre-generated hashes (some shared, some unique). In semantic mode: will be computed from actual token content.
kv_blocks: Vec<BlockId> — KV cache blocks allocated to this request
num_preemptions: u32 — Number of times this request has been preempted
first_token_time: Option<f64> — Time when first token was generated (TTFT tracking)
completion_time: Option<f64> — Time when request completed
token_generation_times: Vec<f64> — Per-token generation times
preempted_time: f64 — Time spent preempted (not running)
last_preempted_at: Option<f64> — Last preemption start time
Implementations§
Source§impl Request
impl Request
pub fn new_with_target(
request_id: String,
priority: i32,
arrival_time: f64,
num_prompt_tokens: u32,
max_output_tokens: u32,
target_output_tokens: u32,
) -> Self
pub fn new_with_target( request_id: String, priority: i32, arrival_time: f64, num_prompt_tokens: u32, max_output_tokens: u32, target_output_tokens: u32, ) -> Self
Create a new request with a target output length
pub fn new(
request_id: String,
priority: i32,
arrival_time: f64,
num_prompt_tokens: u32,
max_output_tokens: u32,
) -> Self
pub fn new( request_id: String, priority: i32, arrival_time: f64, num_prompt_tokens: u32, max_output_tokens: u32, ) -> Self
Create a new request (target = max)
pub fn get_prompt_block_hashes(&self) -> &[u64]
pub fn get_prompt_block_hashes(&self) -> &[u64]
Get block hashes for the prompt. These should be thought of as ‘incremental hashes’, i.e. the hash of block n is the hash of all the tokens up to that block (not just that block alone). In synthetic mode: returns pre-generated hashes. In semantic mode: will compute from actual token content.
pub fn is_prefill(&self) -> bool
pub fn is_prefill(&self) -> bool
Check if this is in prefill phase
pub fn tokens_to_process(&self) -> u32
pub fn tokens_to_process(&self) -> u32
Get number of tokens needed to process
pub fn is_finished(&self) -> bool
pub fn is_finished(&self) -> bool
Check if request is done
pub fn total_tokens(&self) -> u32
pub fn total_tokens(&self) -> u32
Get total tokens (prompt + max output)
pub fn remaining_tokens(&self) -> u32
pub fn remaining_tokens(&self) -> u32
Get remaining tokens to process
pub fn kv_cache_size(&self, model: &ModelConfig) -> u64
pub fn kv_cache_size(&self, model: &ModelConfig) -> u64
Calculate KV cache requirement for this request
pub fn record_generated_tokens(
&mut self,
num_new_tokens: u32,
current_time: f64,
)
pub fn record_generated_tokens( &mut self, num_new_tokens: u32, current_time: f64, )
Record that tokens were generated (update output token count and total)
pub fn mark_preempted(&mut self, current_time: f64)
pub fn mark_preempted(&mut self, current_time: f64)
Mark request as preempted