ferrum_interfaces/
model_executor.rs

1//! Model execution interface with clear prefill/decode separation
2//!
3//! This module provides the ModelExecutor trait that replaces the "fat" Model
4//! interface, focusing purely on tensor operations without tokenization or sampling.
5
6use crate::{KvCacheHandle, TensorRef};
7use async_trait::async_trait;
8use ferrum_types::{ModelInfo, Result};
9use serde::{Deserialize, Serialize};
10use std::{collections::HashMap, sync::Arc};
11
12/// Input for prefill phase (processing the initial prompt)
13#[derive(Debug, Clone)]
14pub struct PrefillInput {
15    /// Input token IDs [batch_size, sequence_length]
16    pub input_ids: TensorRef,
17    /// Attention mask [batch_size, sequence_length] (optional)
18    pub attention_mask: Option<TensorRef>,
19    /// Position IDs [batch_size, sequence_length] (optional, for RoPE)
20    pub position_ids: Option<TensorRef>,
21    /// Pre-allocated KV cache handle (optional, for paged attention)
22    pub kv_cache: Option<Arc<dyn KvCacheHandle>>,
23}
24
25impl PrefillInput {
26    /// Create new prefill input
27    pub fn new(input_ids: TensorRef) -> Self {
28        Self {
29            input_ids,
30            attention_mask: None,
31            position_ids: None,
32            kv_cache: None,
33        }
34    }
35
36    /// Create prefill input with a pre-allocated KV cache handle.
37    pub fn with_kv_cache(mut self, kv_cache: Arc<dyn KvCacheHandle>) -> Self {
38        self.kv_cache = Some(kv_cache);
39        self
40    }
41
42    /// Add attention mask
43    pub fn with_attention_mask(mut self, mask: TensorRef) -> Self {
44        self.attention_mask = Some(mask);
45        self
46    }
47
48    /// Add position IDs
49    pub fn with_position_ids(mut self, positions: TensorRef) -> Self {
50        self.position_ids = Some(positions);
51        self
52    }
53
54    /// Get batch size
55    pub fn batch_size(&self) -> usize {
56        self.input_ids.shape()[0]
57    }
58
59    /// Get sequence length
60    pub fn sequence_length(&self) -> usize {
61        if self.input_ids.shape().len() >= 2 {
62            self.input_ids.shape()[1]
63        } else {
64            1
65        }
66    }
67}
68
69/// Output from prefill phase
70#[derive(Debug, Clone)]
71pub struct PrefillOutput {
72    /// Logits for all positions [batch_size, sequence_length, vocab_size]
73    pub logits: TensorRef,
74    /// KV cache handle populated with prompt states
75    pub kv_cache: Arc<dyn KvCacheHandle>,
76    /// Hidden states at each layer (optional, for analysis)
77    pub hidden_states: Option<Vec<TensorRef>>,
78    /// Attention weights (optional, for analysis)
79    pub attention_weights: Option<Vec<TensorRef>>,
80}
81
82impl PrefillOutput {
83    /// Create new prefill output
84    pub fn new(logits: TensorRef, kv_cache: Arc<dyn KvCacheHandle>) -> Self {
85        Self {
86            logits,
87            kv_cache,
88            hidden_states: None,
89            attention_weights: None,
90        }
91    }
92
93    /// Get logits for last position (for next token generation)
94    pub fn last_token_logits(&self) -> Result<TensorRef> {
95        let shape = self.logits.shape();
96        if shape.len() != 3 {
97            return Err(ferrum_types::FerrumError::backend(
98                "Expected 3D logits tensor [batch, seq, vocab]",
99            ));
100        }
101
102        let seq_len = shape[1];
103        if seq_len == 0 {
104            return Err(ferrum_types::FerrumError::backend("Empty sequence"));
105        }
106
107        // Extract last position: [batch, seq-1:seq, vocab] -> [batch, vocab]
108        self.logits
109            .view(&[0, seq_len - 1, 0], &[shape[0], seq_len, shape[2]])
110    }
111}
112
113/// Input for decode phase (generating one token at a time)
114#[derive(Debug, Clone)]
115pub struct DecodeInput {
116    /// Input token ID for current step [batch_size, 1]
117    pub input_ids: TensorRef,
118    /// Existing KV cache from previous steps
119    pub kv_cache: Arc<dyn KvCacheHandle>,
120    /// Position IDs for current step [batch_size, 1] (optional)
121    pub position_ids: Option<TensorRef>,
122}
123
124impl DecodeInput {
125    /// Create new decode input
126    pub fn new(input_ids: TensorRef, kv_cache: Arc<dyn KvCacheHandle>) -> Self {
127        Self {
128            input_ids,
129            kv_cache,
130            position_ids: None,
131        }
132    }
133
134    /// Add position IDs
135    pub fn with_position_ids(mut self, positions: TensorRef) -> Self {
136        self.position_ids = Some(positions);
137        self
138    }
139
140    /// Get batch size
141    pub fn batch_size(&self) -> usize {
142        self.input_ids.shape()[0]
143    }
144}
145
146/// One sequence's contribution to a unified mixed-batch forward.
147///
148/// A unified batch lets a single model forward pass process a mix of
149/// per-sequence work units: a prefill chunk (q_tokens.len() ≥ 1, possibly
150/// continuing from `pos_offset > 0` for chunked prefill) and a decode step
151/// (q_tokens.len() == 1, `pos_offset` = current cache length) coexist in
152/// the same call. The model layer concatenates all `q_tokens` into one
153/// [M_total, hidden] tensor and runs all GEMMs / norms once; only the
154/// attention kernel sees per-item segmentation.
155///
156/// This is the abstraction that enables vLLM-style chunked prefill where
157/// decode tokens for already-running sequences are produced in the same
158/// iter as a prefill chunk for a newly-arriving sequence.
159#[derive(Clone)]
160pub struct UnifiedBatchItem {
161    /// Identifier matching the sequence's KV cache (model-side keying).
162    pub seq_id: String,
163    /// Tokens to process this iter. For decode this is exactly 1 token;
164    /// for prefill (chunked or whole) this is the chunk's tokens.
165    pub q_tokens: Vec<u32>,
166    /// KV cache handle for this sequence.
167    pub kv_cache: Arc<dyn KvCacheHandle>,
168    /// Starting absolute position for the FIRST token in `q_tokens`.
169    /// 0 for a fresh prefill, `kv_len` for a decode step or a continuing
170    /// chunked-prefill slice.
171    pub pos_offset: usize,
172    /// True iff this item completes the request's prefill (or is a decode
173    /// item) — i.e. logits at the last token of `q_tokens` should be
174    /// returned for sampling. Intermediate prefill chunks set this false
175    /// to skip the lm_head + sampling path.
176    pub is_final_chunk: bool,
177}
178
179impl std::fmt::Debug for UnifiedBatchItem {
180    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
181        f.debug_struct("UnifiedBatchItem")
182            .field("seq_id", &self.seq_id)
183            .field("q_len", &self.q_tokens.len())
184            .field("pos_offset", &self.pos_offset)
185            .field("is_final_chunk", &self.is_final_chunk)
186            .finish()
187    }
188}
189
190/// A mixed-batch forward request: any combination of in-progress prefill
191/// chunks and decode steps. See [`UnifiedBatchItem`] for the per-item
192/// semantics. The producer (engine) groups all sequences active in this
193/// iter into a single batch; the consumer (model) runs one forward and
194/// returns per-item logits (only for items with `is_final_chunk = true`,
195/// in the order they appear in `items`).
196#[derive(Debug, Clone, Default)]
197pub struct UnifiedBatch {
198    pub items: Vec<UnifiedBatchItem>,
199}
200
201impl UnifiedBatch {
202    pub fn new() -> Self {
203        Self::default()
204    }
205
206    /// Total query tokens across all items — corresponds to the M dim of
207    /// the model's per-layer GEMMs in the unified forward.
208    pub fn total_q_tokens(&self) -> usize {
209        self.items.iter().map(|it| it.q_tokens.len()).sum()
210    }
211
212    /// Number of items that will produce a logits vector (decode items
213    /// always; prefill items only on their final chunk).
214    pub fn num_sampled_items(&self) -> usize {
215        self.items.iter().filter(|it| it.is_final_chunk).count()
216    }
217}
218
219/// Output from decode phase
220#[derive(Debug, Clone)]
221pub struct DecodeOutput {
222    /// Logits for next token [batch_size, vocab_size]
223    pub logits: TensorRef,
224    /// Updated KV cache with new token state
225    pub kv_cache: Arc<dyn KvCacheHandle>,
226    /// Hidden state for current token (optional)
227    pub hidden_state: Option<TensorRef>,
228    /// Attention weights for current token (optional)
229    pub attention_weights: Option<Vec<TensorRef>>,
230}
231
232impl DecodeOutput {
233    /// Create new decode output
234    pub fn new(logits: TensorRef, kv_cache: Arc<dyn KvCacheHandle>) -> Self {
235        Self {
236            logits,
237            kv_cache,
238            hidden_state: None,
239            attention_weights: None,
240        }
241    }
242}
243
244/// Core model executor trait focusing on tensor operations
245#[async_trait]
246pub trait ModelExecutor: Send + Sync {
247    /// Get model information and metadata
248    fn info(&self) -> &ModelInfo;
249
250    /// Execute prefill phase (process initial prompt)
251    async fn prefill(&self, input: &PrefillInput) -> Result<PrefillOutput>;
252
253    /// Batch prefill: process multiple prompts' prefill in ONE forward pass.
254    ///
255    /// Default implementation falls back to per-request `prefill()` (serial,
256    /// which is the current behavior the engine sees today). Executors that
257    /// support unified mixed-batch forward (e.g. via `model.unified_forward`
258    /// over a varlen QKV path) should override this to amortize launch /
259    /// kernel-overhead across all `inputs` items in one call.
260    ///
261    /// Used by the continuous-batching engine to coalesce a cohort of new
262    /// prefills (apples M3 c=32 sees 32 simultaneous prefills as one logical
263    /// batch; the serial fallback runs each in ~47 ms while a true batched
264    /// path runs all 32 in ~100 ms).
265    async fn batch_prefill(&self, inputs: &[PrefillInput]) -> Result<Vec<PrefillOutput>> {
266        let mut outputs = Vec::with_capacity(inputs.len());
267        for input in inputs {
268            outputs.push(self.prefill(input).await?);
269        }
270        Ok(outputs)
271    }
272
273    /// Execute decode phase (generate next token)
274    async fn decode(&self, input: &DecodeInput) -> Result<DecodeOutput>;
275
276    /// Batch decode: process multiple sequences in one forward pass.
277    ///
278    /// Default implementation falls back to per-request `decode()`.
279    /// Executors with batched CUDA runners should override this.
280    async fn batch_decode(&self, inputs: &[DecodeInput]) -> Result<Vec<DecodeOutput>> {
281        let mut outputs = Vec::with_capacity(inputs.len());
282        for input in inputs {
283            outputs.push(self.decode(input).await?);
284        }
285        Ok(outputs)
286    }
287
288    /// Unified mixed-batch forward: process a [`UnifiedBatch`] containing
289    /// any combination of prefill chunks (one or more `q_tokens` per item,
290    /// possibly continuing from `pos_offset > 0`) and decode steps
291    /// (`q_tokens.len() == 1`, `is_final_chunk = true`) in a single model
292    /// forward pass.
293    ///
294    /// Returns one element per `batch.items[i]`:
295    /// - `Some(logits)` for items with `is_final_chunk = true` (the
296    ///   request's final-position logits, ready for sampling)
297    /// - `None` for intermediate prefill chunks (no lm_head executed —
298    ///   model only updates KV state)
299    ///
300    /// Default implementation returns `Err(unsupported)`. Concrete LLM
301    /// executors should override with either:
302    /// - A behavioral fallback that dispatches each chunk via existing
303    ///   `prefill()` and groups decode items into `batch_decode()` (this
304    ///   preserves current behavior; no perf change), OR
305    /// - A real unified-forward path that runs all items through one
306    ///   `[M_total, hidden]` GEMM chain with a varlen attention kernel
307    ///   (this is the chunked-prefill perf unlock).
308    async fn unified_decode(&self, _batch: &UnifiedBatch) -> Result<Vec<Option<Vec<f32>>>> {
309        Err(ferrum_types::FerrumError::unsupported(
310            "unified_decode not implemented for this executor",
311        ))
312    }
313
314    /// Optional: full forward pass (for non-autoregressive use cases)
315    async fn forward(&self, _input: &TensorRef) -> Result<TensorRef> {
316        // Default implementation not supported
317        Err(ferrum_types::FerrumError::unsupported(
318            "Full forward pass not supported by this executor",
319        ))
320    }
321
322    /// Roll the KV cache for this executor's sequence back to `new_len`.
323    /// Used by speculative decoding on partial rejection so the next
324    /// iteration sees a KV prefix that matches the accepted token stream.
325    /// Default: Ok(()) — executors that don't cache per-sequence state
326    /// (stub, mock) are inherently tolerant; real LLM executors override.
327    async fn truncate_kv(
328        &self,
329        _kv_cache: &std::sync::Arc<dyn crate::KvCacheHandle>,
330        _new_len: usize,
331    ) -> Result<()> {
332        Ok(())
333    }
334
335    /// Multi-position decode-verify: one forward over `N+1` tokens,
336    /// producing one logits row per position. Used by speculative
337    /// decoding's target path so we don't pay N+1 sequential forwards.
338    ///
339    /// Default falls back to N+1 sequential `decode()` calls — correct
340    /// but slow; real LLM executors override.
341    ///
342    /// Returns a `Vec<DecodeOutput>` of length `inputs.len()` with the
343    /// final KV handle attached to the last element.
344    async fn forward_verify(&self, inputs: &[DecodeInput]) -> Result<Vec<DecodeOutput>> {
345        let mut out = Vec::with_capacity(inputs.len());
346        for input in inputs {
347            out.push(self.decode(input).await?);
348        }
349        Ok(out)
350    }
351
352    /// Get executor capabilities
353    fn capabilities(&self) -> ExecutorCapabilities;
354
355    /// Get current executor status
356    fn status(&self) -> ExecutorStatus;
357
358    /// Warm up executor (load model, allocate memory, etc.)
359    async fn warmup(&mut self) -> Result<()> {
360        // Default no-op implementation
361        Ok(())
362    }
363
364    /// Shutdown executor gracefully
365    async fn shutdown(&mut self) -> Result<()> {
366        // Default no-op implementation
367        Ok(())
368    }
369
370    /// Release KV cache and state for a completed sequence.
371    ///
372    /// Called by the engine when a request finishes (success or error) to free
373    /// GPU memory held by the sequence's KV cache. The `cache_id` matches the
374    /// value embedded in the `KvCacheHandle` returned by prefill/decode.
375    fn release_cache(&self, _cache_id: &str) {
376        // Default no-op — executors that manage per-sequence KV caches should override.
377    }
378}
379
380/// Executor capabilities and configuration
381#[derive(Debug, Clone, Serialize, Deserialize)]
382pub struct ExecutorCapabilities {
383    /// Maximum supported batch size
384    pub max_batch_size: usize,
385    /// Maximum sequence length
386    pub max_sequence_length: usize,
387    /// Supported attention mechanisms
388    pub attention_mechanisms: Vec<AttentionType>,
389    /// Whether executor supports dynamic batching
390    pub supports_dynamic_batching: bool,
391    /// Whether executor supports continuous batching
392    pub supports_continuous_batching: bool,
393    /// Whether executor supports speculative decoding
394    pub supports_speculative_decoding: bool,
395    /// Whether executor supports tensor parallelism
396    pub supports_tensor_parallelism: bool,
397    /// Whether executor supports pipeline parallelism
398    pub supports_pipeline_parallelism: bool,
399    /// Supported data types
400    pub supported_dtypes: Vec<ferrum_types::DataType>,
401    /// Supported devices
402    pub supported_devices: Vec<ferrum_types::Device>,
403    /// Memory requirements estimation
404    pub memory_requirements: MemoryRequirements,
405}
406
407/// Attention mechanism types
408#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
409pub enum AttentionType {
410    /// Standard multi-head attention
411    MultiHead,
412    /// Multi-query attention (MQA)
413    MultiQuery,
414    /// Grouped-query attention (GQA)
415    GroupedQuery,
416    /// Flash attention
417    Flash,
418    /// Paged attention
419    Paged,
420    /// Sliding window attention
421    SlidingWindow,
422}
423
424/// Memory requirements for model execution
425#[derive(Debug, Clone, Serialize, Deserialize)]
426pub struct MemoryRequirements {
427    /// Model parameter memory in bytes
428    pub parameter_memory: u64,
429    /// Minimum activation memory per token
430    pub activation_memory_per_token: usize,
431    /// KV cache memory per token per layer
432    pub kv_cache_memory_per_token: usize,
433    /// Additional overhead memory
434    pub overhead_memory: u64,
435}
436
437impl MemoryRequirements {
438    /// Calculate total memory for given configuration
439    pub fn calculate_total_memory(
440        &self,
441        batch_size: usize,
442        sequence_length: usize,
443        num_layers: usize,
444    ) -> u64 {
445        let activation_mem =
446            (self.activation_memory_per_token * batch_size * sequence_length) as u64;
447        let kv_cache_mem =
448            (self.kv_cache_memory_per_token * batch_size * sequence_length * num_layers) as u64;
449
450        self.parameter_memory + activation_mem + kv_cache_mem + self.overhead_memory
451    }
452}
453
454/// Executor status information
455#[derive(Debug, Clone, Serialize, Deserialize)]
456pub struct ExecutorStatus {
457    /// Current executor state
458    pub state: ExecutorState,
459    /// Whether executor is ready to accept requests
460    pub is_ready: bool,
461    /// Current batch size being processed
462    pub current_batch_size: usize,
463    /// Number of prefill operations completed
464    pub prefill_operations: u64,
465    /// Number of decode operations completed
466    pub decode_operations: u64,
467    /// Average prefill time in milliseconds
468    pub avg_prefill_time_ms: f64,
469    /// Average decode time in milliseconds
470    pub avg_decode_time_ms: f64,
471    /// Memory usage statistics
472    pub memory_usage: ExecutorMemoryUsage,
473    /// Last operation timestamp
474    #[serde(skip)]
475    pub last_operation: Option<std::time::Instant>,
476}
477
478/// Executor state
479#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
480pub enum ExecutorState {
481    /// Executor is initializing
482    Initializing,
483    /// Executor is ready to accept requests
484    Ready,
485    /// Executor is processing requests
486    Busy,
487    /// Executor encountered an error
488    Error,
489    /// Executor is shutting down
490    Shutdown,
491}
492
493/// Executor memory usage
494#[derive(Debug, Clone, Serialize, Deserialize)]
495pub struct ExecutorMemoryUsage {
496    /// Total allocated memory in bytes
497    pub allocated_bytes: usize,
498    /// Currently used memory in bytes
499    pub used_bytes: usize,
500    /// Peak memory usage
501    pub peak_bytes: usize,
502    /// Memory utilization percentage
503    pub utilization_percent: f32,
504}
505
506/// Batch model executor for processing multiple requests efficiently
507#[async_trait]
508pub trait BatchModelExecutor: ModelExecutor {
509    /// Execute batch prefill for multiple sequences
510    async fn batch_prefill(&self, inputs: &[PrefillInput]) -> Result<Vec<PrefillOutput>>;
511
512    /// Execute batch decode for multiple sequences
513    async fn batch_decode(&self, inputs: &[DecodeInput]) -> Result<Vec<DecodeOutput>>;
514
515    /// Get optimal batch size for current conditions
516    fn optimal_batch_size(&self) -> usize;
517
518    /// Check if batch size is supported
519    fn supports_batch_size(&self, batch_size: usize) -> bool;
520}
521
522/// Speculative execution support
523#[async_trait]
524pub trait SpeculativeExecutor: ModelExecutor {
525    /// Execute speculative decoding with draft model
526    async fn speculative_decode(
527        &self,
528        input: &DecodeInput,
529        draft_tokens: &[ferrum_types::TokenId],
530        acceptance_threshold: f32,
531    ) -> Result<SpeculativeDecodeOutput>;
532}
533
534/// Output from speculative decoding
535#[derive(Debug, Clone)]
536pub struct SpeculativeDecodeOutput {
537    /// Accepted tokens (subset of draft tokens)
538    pub accepted_tokens: Vec<ferrum_types::TokenId>,
539    /// Logits for the next token after last accepted
540    pub next_logits: TensorRef,
541    /// Updated KV cache
542    pub kv_cache: Arc<dyn KvCacheHandle>,
543    /// Number of draft tokens accepted
544    pub acceptance_count: usize,
545}
546
547/// Model executor factory
548#[async_trait]
549pub trait ModelExecutorFactory: Send + Sync {
550    /// Create executor from model configuration
551    async fn create_executor(&self, config: &ExecutorConfig) -> Result<Box<dyn ModelExecutor>>;
552
553    /// Create batch executor
554    async fn create_batch_executor(
555        &self,
556        config: &ExecutorConfig,
557    ) -> Result<Box<dyn BatchModelExecutor>>;
558
559    /// Get supported executor types
560    fn supported_types(&self) -> Vec<ExecutorType>;
561
562    /// Validate configuration
563    fn validate_config(&self, config: &ExecutorConfig) -> Result<()>;
564}
565
566/// Executor configuration
567#[derive(Debug, Clone, Serialize, Deserialize)]
568pub struct ExecutorConfig {
569    /// Model information
570    pub model_info: ModelInfo,
571    /// Target device
572    pub device: ferrum_types::Device,
573    /// Data type for computation
574    pub dtype: ferrum_types::DataType,
575    /// Maximum batch size
576    pub max_batch_size: usize,
577    /// Maximum sequence length
578    pub max_sequence_length: usize,
579    /// Attention configuration
580    pub attention_config: ExecutorAttentionConfig,
581    /// Memory configuration
582    pub memory_config: ExecutorMemoryConfig,
583    /// Optimization settings
584    pub optimization_config: OptimizationConfig,
585    /// Additional executor-specific options
586    pub executor_options: HashMap<String, serde_json::Value>,
587}
588
589/// Runtime attention configuration for model executor
590///
591/// Note: This is different from ferrum_types::AttentionConfig which describes
592/// the model architecture's attention configuration from config.json.
593/// This type describes the runtime execution settings.
594#[derive(Debug, Clone, Serialize, Deserialize)]
595pub struct ExecutorAttentionConfig {
596    /// Type of attention to use
597    pub attention_type: AttentionType,
598    /// Enable flash attention if available
599    pub enable_flash_attention: bool,
600    /// Enable paged attention
601    pub enable_paged_attention: bool,
602    /// Block size for paged attention
603    pub block_size: Option<usize>,
604    /// Sliding window size (if using sliding window attention)
605    pub sliding_window_size: Option<usize>,
606}
607
608/// Memory configuration for executor
609#[derive(Debug, Clone, Serialize, Deserialize)]
610pub struct ExecutorMemoryConfig {
611    /// Enable memory pooling
612    pub enable_memory_pooling: bool,
613    /// Memory pool size in bytes (None for auto)
614    pub memory_pool_size: Option<usize>,
615    /// Enable KV cache sharing
616    pub enable_kv_cache_sharing: bool,
617    /// Maximum memory usage percentage
618    pub max_memory_usage: f32,
619}
620
621/// Optimization configuration
622#[derive(Debug, Clone, Serialize, Deserialize)]
623pub struct OptimizationConfig {
624    /// Enable CUDA graphs (if supported)
625    pub enable_cuda_graphs: bool,
626    /// Enable kernel fusion
627    pub enable_kernel_fusion: bool,
628    /// Enable mixed precision
629    pub enable_mixed_precision: bool,
630    /// Optimization level (0-3)
631    pub optimization_level: u8,
632    /// Custom optimization flags
633    pub custom_flags: HashMap<String, bool>,
634}
635
636/// Supported executor types
637#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
638pub enum ExecutorType {
639    /// Standard sequential executor
640    Sequential,
641    /// Batch executor for parallel processing
642    Batch,
643    /// Continuous batching executor
644    ContinuousBatch,
645    /// Speculative decoding executor
646    Speculative,
647    /// Pipeline parallel executor
648    PipelineParallel,
649    /// Tensor parallel executor
650    TensorParallel,
651}
652
653/// Executor performance metrics
654#[derive(Debug, Clone, Serialize, Deserialize)]
655pub struct ExecutorMetrics {
656    /// Total operations executed
657    pub total_operations: u64,
658    /// Prefill operations
659    pub prefill_operations: u64,
660    /// Decode operations
661    pub decode_operations: u64,
662    /// Average prefill latency (ms)
663    pub avg_prefill_latency: f64,
664    /// Average decode latency (ms)
665    pub avg_decode_latency: f64,
666    /// P95 prefill latency (ms)
667    pub p95_prefill_latency: f64,
668    /// P95 decode latency (ms)
669    pub p95_decode_latency: f64,
670    /// Throughput (tokens per second)
671    pub throughput_tps: f64,
672    /// Memory efficiency (used/allocated)
673    pub memory_efficiency: f32,
674    /// Batch utilization
675    pub batch_utilization: f32,
676}
677
678/// Executor registry for managing multiple executors
679pub trait ExecutorRegistry: Send + Sync {
680    /// Register executor with name
681    fn register(&mut self, name: &str, executor: Box<dyn ModelExecutor>) -> Result<()>;
682
683    /// Get executor by name
684    fn get(&self, name: &str) -> Option<&dyn ModelExecutor>;
685
686    /// Remove executor by name
687    fn remove(&mut self, name: &str) -> Option<Box<dyn ModelExecutor>>;
688
689    /// List registered executor names
690    fn list_names(&self) -> Vec<String>;
691
692    /// Get executor metrics
693    fn get_metrics(&self, name: &str) -> Option<ExecutorMetrics>;
694}
ferrum_interfaces/model_executor.rs

ferrum_interfaces/
model_executor.rs