pub struct RealtimeOptimizer { /* private fields */ }
Expand description
Real-time optimizer for LLM inference
Implementations§
Source§impl RealtimeOptimizer
impl RealtimeOptimizer
Sourcepub fn new(config: RealtimeConfig) -> Self
pub fn new(config: RealtimeConfig) -> Self
Create a new realtime optimizer
Sourcepub fn optimize_batch_size(&self, recent_latencies: &[f32]) -> usize
pub fn optimize_batch_size(&self, recent_latencies: &[f32]) -> usize
Optimize batch size based on recent latency measurements
Sourcepub fn should_evict_kv_cache(&self) -> bool
pub fn should_evict_kv_cache(&self) -> bool
Check if KV cache eviction is needed
Sourcepub fn update_kv_cache_pressure(&self, pressure: f32)
pub fn update_kv_cache_pressure(&self, pressure: f32)
Update KV cache pressure
Sourcepub fn kv_cache_pressure(&self) -> f32
pub fn kv_cache_pressure(&self) -> f32
Get KV cache pressure
Sourcepub fn allocate_token_budget(
&self,
requests: &[Request],
) -> Vec<TokenBudgetAllocation>
pub fn allocate_token_budget( &self, requests: &[Request], ) -> Vec<TokenBudgetAllocation>
Allocate token budgets for a set of requests
Sourcepub fn enable_speculative_decoding(&self, draft_model: &str)
pub fn enable_speculative_decoding(&self, draft_model: &str)
Enable speculative decoding
Sourcepub fn disable_speculative_decoding(&self)
pub fn disable_speculative_decoding(&self)
Disable speculative decoding
Sourcepub fn update_speculation_stats(
&self,
accepted_count: usize,
total_drafted: usize,
)
pub fn update_speculation_stats( &self, accepted_count: usize, total_drafted: usize, )
Update speculation statistics for learning/monitoring
This records the acceptance rate of speculative decoding rounds to help tune the lookahead parameter adaptively.
§Arguments
- accepted_count - Number of draft tokens that were accepted
- total_drafted - Total number of draft tokens generated
Sourcepub fn is_speculative_active(&self) -> bool
pub fn is_speculative_active(&self) -> bool
Check if speculative decoding is active
Sourcepub fn draft_model(&self) -> Option<String>
pub fn draft_model(&self) -> Option<String>
Get the draft model identifier
Sourcepub fn record_latency(&self, latency_ms: f32)
pub fn record_latency(&self, latency_ms: f32)
Record a latency measurement
Sourcepub fn record_throughput(&self, tps: f32)
pub fn record_throughput(&self, tps: f32)
Record a throughput measurement
Sourcepub fn average_latency(&self) -> f32
pub fn average_latency(&self) -> f32
Get average latency
Sourcepub fn average_throughput(&self) -> f32
pub fn average_throughput(&self) -> f32
Get average throughput
Sourcepub fn update_memory_usage(&self, bytes: usize)
pub fn update_memory_usage(&self, bytes: usize)
Update memory usage
Sourcepub fn memory_pressure(&self) -> f32
pub fn memory_pressure(&self) -> f32
Get memory pressure (0.0 - 1.0)
Sourcepub fn optimize(&self, metrics: &InferenceMetrics) -> OptimizationDecision
pub fn optimize(&self, metrics: &InferenceMetrics) -> OptimizationDecision
Make a comprehensive optimization decision
Sourcepub fn add_request(&self, request: Request)
pub fn add_request(&self, request: Request)
Add a pending request
Sourcepub fn remove_request(&self, request_id: &str)
pub fn remove_request(&self, request_id: &str)
Remove a completed request
Sourcepub fn pending_request_count(&self) -> usize
pub fn pending_request_count(&self) -> usize
Get pending request count
Sourcepub fn current_batch_size(&self) -> usize
pub fn current_batch_size(&self) -> usize
Get current batch size
Sourcepub fn update_config(&self, config: RealtimeConfig)
pub fn update_config(&self, config: RealtimeConfig)
Update configuration
Sourcepub fn config(&self) -> RealtimeConfig
pub fn config(&self) -> RealtimeConfig
Get current configuration
Source§impl RealtimeOptimizer
impl RealtimeOptimizer
Sourcepub fn should_use_speculative(&self, params: &GenerateParams) -> bool
pub fn should_use_speculative(&self, params: &GenerateParams) -> bool
Check if speculative decoding should be used for these generation parameters
Returns true when:
- Temperature is low (< 0.5) - deterministic generation benefits most
- Greedy decoding (top_k = 1)
- Speculative decoding is enabled in config
Sourcepub fn get_speculative_config(&self) -> SpeculativeConfig
pub fn get_speculative_config(&self) -> SpeculativeConfig
Get recommended speculative decoding configuration based on current metrics
Trait Implementations§
Auto Trait Implementations§
impl !Freeze for RealtimeOptimizer
impl !RefUnwindSafe for RealtimeOptimizer
impl Send for RealtimeOptimizer
impl Sync for RealtimeOptimizer
impl Unpin for RealtimeOptimizer
impl UnwindSafe for RealtimeOptimizer
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more