ferrum_interfaces/
engine.rs

//! Inference engine interfaces — split per modality.
//!
//! Phase 5a step 2 splits the historical mega-trait (which mixed LLM
//! generation, embedding, transcription, and TTS in one) into a base
//! lifecycle trait and four modality-specific subtraits that extend it.
//! Each engine impl now implements exactly the trait its modality needs;
//! no more inert "unsupported" stubs.
8
9use async_trait::async_trait;
10use ferrum_types::{EngineConfig, InferenceRequest, InferenceResponse, Result, StreamChunk};
11use futures::Stream;
12use std::pin::Pin;
13
14/// Lifecycle / status methods shared by every engine kind.
15///
16/// LLM engines, embedders, transcribers, and TTS services all expose
17/// the same minimal status/metrics surface to the server / CLI. The
18/// modality-specific traits below extend this base.
19#[async_trait]
20pub trait InferenceEngine: Send + Sync {
21    /// Get current engine status.
22    async fn status(&self) -> ferrum_types::EngineStatus;
23
24    /// Shutdown engine gracefully.
25    async fn shutdown(&self) -> Result<()>;
26
27    /// Get engine configuration.
28    fn config(&self) -> &EngineConfig;
29
30    /// Get engine metrics.
31    fn metrics(&self) -> ferrum_types::EngineMetrics;
32
33    /// Health check.
34    async fn health_check(&self) -> ferrum_types::HealthStatus;
35
36    /// Optional cache metrics emitted by concrete LLM engines.
37    ///
38    /// The default keeps non-LLM and stub engines source-compatible. Real
39    /// engines can expose prefix/session cache counters without forcing those
40    /// fields into every modality's core metrics type.
41    fn cache_metrics_snapshot(&self) -> Option<serde_json::Value> {
42        None
43    }
44
45    /// Optional LoRA runtime metrics emitted by concrete LLM engines.
46    fn lora_metrics_snapshot(&self) -> Option<serde_json::Value> {
47        None
48    }
49}
50
51/// LLM text-generation engine.
52///
53/// Implemented by `ContinuousBatchEngine` (the production path) and
54/// `DefaultInferenceEngine` (legacy reference path). Backs
55/// `/v1/chat/completions` and `/v1/completions`.
56#[async_trait]
57pub trait LlmInferenceEngine: InferenceEngine {
58    /// Execute single inference request.
59    async fn infer(&self, request: InferenceRequest) -> Result<InferenceResponse>;
60
61    /// Execute streaming inference request.
62    async fn infer_stream(
63        &self,
64        request: InferenceRequest,
65    ) -> Result<Pin<Box<dyn Stream<Item = Result<StreamChunk>> + Send>>>;
66}
67
68/// Embedding engine (CLIP, BERT, etc.).
69///
70/// Backs `/v1/embeddings`. Distinct from LLM engines — no token
71/// generation, no scheduling, no KV cache.
72#[async_trait]
73pub trait EmbedEngine: InferenceEngine {
74    /// Embed raw text string → float vector (engine handles tokenization).
75    async fn embed_text(&self, text: &str) -> Result<Vec<f32>>;
76
77    /// Embed image (file path or base64) → float vector.
78    async fn embed_image(&self, image: &str) -> Result<Vec<f32>>;
79
80    /// Get embedding dimension.
81    fn embedding_dim(&self) -> usize;
82}
83
84/// Speech-to-text (Whisper) engine.
85///
86/// Backs `/v1/audio/transcriptions`.
87#[async_trait]
88pub trait TranscribeEngine: InferenceEngine {
89    /// Transcribe audio file → text.
90    async fn transcribe_file(&self, path: &str, language: Option<&str>) -> Result<String>;
91
92    /// Transcribe audio bytes (WAV / etc.) → text.
93    async fn transcribe_bytes(&self, data: &[u8], language: Option<&str>) -> Result<String>;
94}
95
96/// Text-to-speech (Qwen3-TTS, etc.) engine.
97///
98/// Backs `/v1/audio/speech`.
99#[async_trait]
100pub trait TtsEngine: InferenceEngine {
101    /// Synthesize speech → PCM audio chunks (streaming).
102    /// Returns Vec of PCM f32 samples per chunk.
103    async fn synthesize_speech(
104        &self,
105        text: &str,
106        language: Option<&str>,
107        chunk_frames: usize,
108    ) -> Result<Vec<Vec<f32>>>;
109
110    /// Get TTS sample rate.
111    fn tts_sample_rate(&self) -> u32;
112}
113
114/// Advanced engine capabilities — opt-in addition to LLM engines that
115/// support batching / speculation / runtime reconfig / diagnostics.
116#[async_trait]
117pub trait AdvancedInferenceEngine: LlmInferenceEngine {
118    /// Execute batch inference.
119    async fn infer_batch(
120        &self,
121        requests: Vec<InferenceRequest>,
122    ) -> Result<Vec<Result<InferenceResponse>>>;
123
124    /// Execute speculative inference.
125    async fn infer_speculative(
126        &self,
127        request: InferenceRequest,
128        speculation_config: ferrum_types::SpeculationConfig,
129    ) -> Result<InferenceResponse>;
130
131    /// Warm up engine with sample requests.
132    async fn warmup(
133        &mut self,
134        warmup_requests: Vec<InferenceRequest>,
135    ) -> Result<ferrum_types::WarmupResult>;
136
137    /// Configure engine at runtime.
138    async fn reconfigure(&mut self, config: EngineConfig) -> Result<()>;
139
140    /// Get detailed diagnostics.
141    async fn diagnostics(&self) -> ferrum_types::DiagnosticsReport;
142
143    /// Export engine state for debugging.
144    async fn export_state(&self) -> Result<ferrum_types::EngineState>;
145
146    /// Import engine state for debugging/testing.
147    async fn import_state(&mut self, state: ferrum_types::EngineState) -> Result<()>;
148}
149
150/// Speculation configuration for speculative decoding.
151pub type SpeculationConfig = ferrum_types::SpeculationConfig;
152
153/// Hardware constraints alias.
154pub type HardwareConstraints = ferrum_types::HardwareConstraints;
155
156/// Request characteristics alias.
157pub type RequestCharacteristics = ferrum_types::RequestCharacteristics;
158
159/// Latency requirements alias.
160pub type LatencyRequirements = ferrum_types::LatencyRequirements;
ferrum_interfaces/engine.rs

ferrum_interfaces/
engine.rs