ferrum_interfaces/engine.rs
1//! Inference engine interfaces — split per modality.
2//!
//! Phase 5a step 2 splits the historical mega-trait (which mixed LLM
//! generation, embedding, transcription, and TTS in one) into a base
//! lifecycle trait and four modality-specific subtraits. Each
//! engine impl now implements exactly the trait its modality needs;
//! no more inert "unsupported" stubs.
8
9use async_trait::async_trait;
10use ferrum_types::{EngineConfig, InferenceRequest, InferenceResponse, Result, StreamChunk};
11use futures::Stream;
12use std::pin::Pin;
13
14/// Lifecycle / status methods shared by every engine kind.
15///
16/// LLM engines, embedders, transcribers, and TTS services all expose
17/// the same minimal status/metrics surface to the server / CLI. The
18/// modality-specific traits below extend this base.
19#[async_trait]
20pub trait InferenceEngine: Send + Sync {
21 /// Get current engine status.
22 async fn status(&self) -> ferrum_types::EngineStatus;
23
24 /// Shutdown engine gracefully.
25 async fn shutdown(&self) -> Result<()>;
26
27 /// Get engine configuration.
28 fn config(&self) -> &EngineConfig;
29
30 /// Get engine metrics.
31 fn metrics(&self) -> ferrum_types::EngineMetrics;
32
33 /// Health check.
34 async fn health_check(&self) -> ferrum_types::HealthStatus;
35
36 /// Optional cache metrics emitted by concrete LLM engines.
37 ///
38 /// The default keeps non-LLM and stub engines source-compatible. Real
39 /// engines can expose prefix/session cache counters without forcing those
40 /// fields into every modality's core metrics type.
41 fn cache_metrics_snapshot(&self) -> Option<serde_json::Value> {
42 None
43 }
44
45 /// Optional LoRA runtime metrics emitted by concrete LLM engines.
46 fn lora_metrics_snapshot(&self) -> Option<serde_json::Value> {
47 None
48 }
49}
50
51/// LLM text-generation engine.
52///
53/// Implemented by `ContinuousBatchEngine` (the production path) and
54/// `DefaultInferenceEngine` (legacy reference path). Backs
55/// `/v1/chat/completions` and `/v1/completions`.
56#[async_trait]
57pub trait LlmInferenceEngine: InferenceEngine {
58 /// Execute single inference request.
59 async fn infer(&self, request: InferenceRequest) -> Result<InferenceResponse>;
60
61 /// Execute streaming inference request.
62 async fn infer_stream(
63 &self,
64 request: InferenceRequest,
65 ) -> Result<Pin<Box<dyn Stream<Item = Result<StreamChunk>> + Send>>>;
66}
67
68/// Embedding engine (CLIP, BERT, etc.).
69///
70/// Backs `/v1/embeddings`. Distinct from LLM engines — no token
71/// generation, no scheduling, no KV cache.
72#[async_trait]
73pub trait EmbedEngine: InferenceEngine {
74 /// Embed raw text string → float vector (engine handles tokenization).
75 async fn embed_text(&self, text: &str) -> Result<Vec<f32>>;
76
77 /// Embed image (file path or base64) → float vector.
78 async fn embed_image(&self, image: &str) -> Result<Vec<f32>>;
79
80 /// Get embedding dimension.
81 fn embedding_dim(&self) -> usize;
82}
83
84/// Speech-to-text (Whisper) engine.
85///
86/// Backs `/v1/audio/transcriptions`.
87#[async_trait]
88pub trait TranscribeEngine: InferenceEngine {
89 /// Transcribe audio file → text.
90 async fn transcribe_file(&self, path: &str, language: Option<&str>) -> Result<String>;
91
92 /// Transcribe audio bytes (WAV / etc.) → text.
93 async fn transcribe_bytes(&self, data: &[u8], language: Option<&str>) -> Result<String>;
94}
95
96/// Text-to-speech (Qwen3-TTS, etc.) engine.
97///
98/// Backs `/v1/audio/speech`.
99#[async_trait]
100pub trait TtsEngine: InferenceEngine {
101 /// Synthesize speech → PCM audio chunks (streaming).
102 /// Returns Vec of PCM f32 samples per chunk.
103 async fn synthesize_speech(
104 &self,
105 text: &str,
106 language: Option<&str>,
107 chunk_frames: usize,
108 ) -> Result<Vec<Vec<f32>>>;
109
110 /// Get TTS sample rate.
111 fn tts_sample_rate(&self) -> u32;
112}
113
114/// Advanced engine capabilities — opt-in addition to LLM engines that
115/// support batching / speculation / runtime reconfig / diagnostics.
116#[async_trait]
117pub trait AdvancedInferenceEngine: LlmInferenceEngine {
118 /// Execute batch inference.
119 async fn infer_batch(
120 &self,
121 requests: Vec<InferenceRequest>,
122 ) -> Result<Vec<Result<InferenceResponse>>>;
123
124 /// Execute speculative inference.
125 async fn infer_speculative(
126 &self,
127 request: InferenceRequest,
128 speculation_config: ferrum_types::SpeculationConfig,
129 ) -> Result<InferenceResponse>;
130
131 /// Warm up engine with sample requests.
132 async fn warmup(
133 &mut self,
134 warmup_requests: Vec<InferenceRequest>,
135 ) -> Result<ferrum_types::WarmupResult>;
136
137 /// Configure engine at runtime.
138 async fn reconfigure(&mut self, config: EngineConfig) -> Result<()>;
139
140 /// Get detailed diagnostics.
141 async fn diagnostics(&self) -> ferrum_types::DiagnosticsReport;
142
143 /// Export engine state for debugging.
144 async fn export_state(&self) -> Result<ferrum_types::EngineState>;
145
146 /// Import engine state for debugging/testing.
147 async fn import_state(&mut self, state: ferrum_types::EngineState) -> Result<()>;
148}
149
150/// Speculation configuration for speculative decoding.
151pub type SpeculationConfig = ferrum_types::SpeculationConfig;
152
153/// Hardware constraints alias.
154pub type HardwareConstraints = ferrum_types::HardwareConstraints;
155
156/// Request characteristics alias.
157pub type RequestCharacteristics = ferrum_types::RequestCharacteristics;
158
159/// Latency requirements alias.
160pub type LatencyRequirements = ferrum_types::LatencyRequirements;