ferrum_interfaces/engine.rs
//! Inference engine interfaces — split per modality.
//!
//! Phase 5a step 2 splits the historical mega-trait (which mixed LLM
//! generation, embedding, transcription, and TTS in one) into a base
//! lifecycle trait and four modality-specific traits that extend it
//! (the base trait is their supertrait). Each engine impl now
//! implements exactly the trait its modality needs, so there are no
//! more inert "unsupported" stubs.
8
9use async_trait::async_trait;
10use ferrum_types::{EngineConfig, InferenceRequest, InferenceResponse, Result, StreamChunk};
11use futures::Stream;
12use std::pin::Pin;
13
14/// Lifecycle / status methods shared by every engine kind.
15///
16/// LLM engines, embedders, transcribers, and TTS services all expose
17/// the same minimal status/metrics surface to the server / CLI. The
18/// modality-specific traits below extend this base.
19#[async_trait]
20pub trait InferenceEngine: Send + Sync {
21 /// Get current engine status.
22 async fn status(&self) -> ferrum_types::EngineStatus;
23
24 /// Shutdown engine gracefully.
25 async fn shutdown(&self) -> Result<()>;
26
27 /// Get engine configuration.
28 fn config(&self) -> &EngineConfig;
29
30 /// Get engine metrics.
31 fn metrics(&self) -> ferrum_types::EngineMetrics;
32
33 /// Health check.
34 async fn health_check(&self) -> ferrum_types::HealthStatus;
35}
36
37/// LLM text-generation engine.
38///
39/// Implemented by `ContinuousBatchEngine` (the production path) and
40/// `DefaultInferenceEngine` (legacy reference path). Backs
41/// `/v1/chat/completions` and `/v1/completions`.
42#[async_trait]
43pub trait LlmInferenceEngine: InferenceEngine {
44 /// Execute single inference request.
45 async fn infer(&self, request: InferenceRequest) -> Result<InferenceResponse>;
46
47 /// Execute streaming inference request.
48 async fn infer_stream(
49 &self,
50 request: InferenceRequest,
51 ) -> Result<Pin<Box<dyn Stream<Item = Result<StreamChunk>> + Send>>>;
52}
53
54/// Embedding engine (CLIP, BERT, etc.).
55///
56/// Backs `/v1/embeddings`. Distinct from LLM engines — no token
57/// generation, no scheduling, no KV cache.
58#[async_trait]
59pub trait EmbedEngine: InferenceEngine {
60 /// Embed raw text string → float vector (engine handles tokenization).
61 async fn embed_text(&self, text: &str) -> Result<Vec<f32>>;
62
63 /// Embed image (file path or base64) → float vector.
64 async fn embed_image(&self, image: &str) -> Result<Vec<f32>>;
65
66 /// Get embedding dimension.
67 fn embedding_dim(&self) -> usize;
68}
69
70/// Speech-to-text (Whisper) engine.
71///
72/// Backs `/v1/audio/transcriptions`.
73#[async_trait]
74pub trait TranscribeEngine: InferenceEngine {
75 /// Transcribe audio file → text.
76 async fn transcribe_file(&self, path: &str, language: Option<&str>) -> Result<String>;
77
78 /// Transcribe audio bytes (WAV / etc.) → text.
79 async fn transcribe_bytes(&self, data: &[u8], language: Option<&str>) -> Result<String>;
80}
81
82/// Text-to-speech (Qwen3-TTS, etc.) engine.
83///
84/// Backs `/v1/audio/speech`.
85#[async_trait]
86pub trait TtsEngine: InferenceEngine {
87 /// Synthesize speech → PCM audio chunks (streaming).
88 /// Returns Vec of PCM f32 samples per chunk.
89 async fn synthesize_speech(
90 &self,
91 text: &str,
92 language: Option<&str>,
93 chunk_frames: usize,
94 ) -> Result<Vec<Vec<f32>>>;
95
96 /// Get TTS sample rate.
97 fn tts_sample_rate(&self) -> u32;
98}
99
100/// Advanced engine capabilities — opt-in addition to LLM engines that
101/// support batching / speculation / runtime reconfig / diagnostics.
102#[async_trait]
103pub trait AdvancedInferenceEngine: LlmInferenceEngine {
104 /// Execute batch inference.
105 async fn infer_batch(
106 &self,
107 requests: Vec<InferenceRequest>,
108 ) -> Result<Vec<Result<InferenceResponse>>>;
109
110 /// Execute speculative inference.
111 async fn infer_speculative(
112 &self,
113 request: InferenceRequest,
114 speculation_config: ferrum_types::SpeculationConfig,
115 ) -> Result<InferenceResponse>;
116
117 /// Warm up engine with sample requests.
118 async fn warmup(
119 &mut self,
120 warmup_requests: Vec<InferenceRequest>,
121 ) -> Result<ferrum_types::WarmupResult>;
122
123 /// Configure engine at runtime.
124 async fn reconfigure(&mut self, config: EngineConfig) -> Result<()>;
125
126 /// Get detailed diagnostics.
127 async fn diagnostics(&self) -> ferrum_types::DiagnosticsReport;
128
129 /// Export engine state for debugging.
130 async fn export_state(&self) -> Result<ferrum_types::EngineState>;
131
132 /// Import engine state for debugging/testing.
133 async fn import_state(&mut self, state: ferrum_types::EngineState) -> Result<()>;
134}
135
136/// Speculation configuration for speculative decoding.
137pub type SpeculationConfig = ferrum_types::SpeculationConfig;
138
139/// Hardware constraints alias.
140pub type HardwareConstraints = ferrum_types::HardwareConstraints;
141
142/// Request characteristics alias.
143pub type RequestCharacteristics = ferrum_types::RequestCharacteristics;
144
145/// Latency requirements alias.
146pub type LatencyRequirements = ferrum_types::LatencyRequirements;