// ferrum_interfaces/engine.rs

//! Inference engine interfaces — split per modality.
//!
//! Phase 5a step 2 splits the historical mega-trait (which mixed LLM
//! generation, embedding, transcription, and TTS in one) into a base
//! lifecycle trait and four modality-specific subtraits that extend
//! it. Each engine impl now implements exactly the trait its modality
//! needs; no more inert "unsupported" stubs.

use async_trait::async_trait;
use ferrum_types::{EngineConfig, InferenceRequest, InferenceResponse, Result, StreamChunk};
use futures::Stream;
use std::pin::Pin;

14/// Lifecycle / status methods shared by every engine kind.
15///
16/// LLM engines, embedders, transcribers, and TTS services all expose
17/// the same minimal status/metrics surface to the server / CLI. The
18/// modality-specific traits below extend this base.
19#[async_trait]
20pub trait InferenceEngine: Send + Sync {
21    /// Get current engine status.
22    async fn status(&self) -> ferrum_types::EngineStatus;
23
24    /// Shutdown engine gracefully.
25    async fn shutdown(&self) -> Result<()>;
26
27    /// Get engine configuration.
28    fn config(&self) -> &EngineConfig;
29
30    /// Get engine metrics.
31    fn metrics(&self) -> ferrum_types::EngineMetrics;
32
33    /// Health check.
34    async fn health_check(&self) -> ferrum_types::HealthStatus;
35}
36
37/// LLM text-generation engine.
38///
39/// Implemented by `ContinuousBatchEngine` (the production path) and
40/// `DefaultInferenceEngine` (legacy reference path). Backs
41/// `/v1/chat/completions` and `/v1/completions`.
42#[async_trait]
43pub trait LlmInferenceEngine: InferenceEngine {
44    /// Execute single inference request.
45    async fn infer(&self, request: InferenceRequest) -> Result<InferenceResponse>;
46
47    /// Execute streaming inference request.
48    async fn infer_stream(
49        &self,
50        request: InferenceRequest,
51    ) -> Result<Pin<Box<dyn Stream<Item = Result<StreamChunk>> + Send>>>;
52}
53
54/// Embedding engine (CLIP, BERT, etc.).
55///
56/// Backs `/v1/embeddings`. Distinct from LLM engines — no token
57/// generation, no scheduling, no KV cache.
58#[async_trait]
59pub trait EmbedEngine: InferenceEngine {
60    /// Embed raw text string → float vector (engine handles tokenization).
61    async fn embed_text(&self, text: &str) -> Result<Vec<f32>>;
62
63    /// Embed image (file path or base64) → float vector.
64    async fn embed_image(&self, image: &str) -> Result<Vec<f32>>;
65
66    /// Get embedding dimension.
67    fn embedding_dim(&self) -> usize;
68}
69
70/// Speech-to-text (Whisper) engine.
71///
72/// Backs `/v1/audio/transcriptions`.
73#[async_trait]
74pub trait TranscribeEngine: InferenceEngine {
75    /// Transcribe audio file → text.
76    async fn transcribe_file(&self, path: &str, language: Option<&str>) -> Result<String>;
77
78    /// Transcribe audio bytes (WAV / etc.) → text.
79    async fn transcribe_bytes(&self, data: &[u8], language: Option<&str>) -> Result<String>;
80}
81
82/// Text-to-speech (Qwen3-TTS, etc.) engine.
83///
84/// Backs `/v1/audio/speech`.
85#[async_trait]
86pub trait TtsEngine: InferenceEngine {
87    /// Synthesize speech → PCM audio chunks (streaming).
88    /// Returns Vec of PCM f32 samples per chunk.
89    async fn synthesize_speech(
90        &self,
91        text: &str,
92        language: Option<&str>,
93        chunk_frames: usize,
94    ) -> Result<Vec<Vec<f32>>>;
95
96    /// Get TTS sample rate.
97    fn tts_sample_rate(&self) -> u32;
98}
99
100/// Advanced engine capabilities — opt-in addition to LLM engines that
101/// support batching / speculation / runtime reconfig / diagnostics.
102#[async_trait]
103pub trait AdvancedInferenceEngine: LlmInferenceEngine {
104    /// Execute batch inference.
105    async fn infer_batch(
106        &self,
107        requests: Vec<InferenceRequest>,
108    ) -> Result<Vec<Result<InferenceResponse>>>;
109
110    /// Execute speculative inference.
111    async fn infer_speculative(
112        &self,
113        request: InferenceRequest,
114        speculation_config: ferrum_types::SpeculationConfig,
115    ) -> Result<InferenceResponse>;
116
117    /// Warm up engine with sample requests.
118    async fn warmup(
119        &mut self,
120        warmup_requests: Vec<InferenceRequest>,
121    ) -> Result<ferrum_types::WarmupResult>;
122
123    /// Configure engine at runtime.
124    async fn reconfigure(&mut self, config: EngineConfig) -> Result<()>;
125
126    /// Get detailed diagnostics.
127    async fn diagnostics(&self) -> ferrum_types::DiagnosticsReport;
128
129    /// Export engine state for debugging.
130    async fn export_state(&self) -> Result<ferrum_types::EngineState>;
131
132    /// Import engine state for debugging/testing.
133    async fn import_state(&mut self, state: ferrum_types::EngineState) -> Result<()>;
134}
135
136/// Speculation configuration for speculative decoding.
137pub type SpeculationConfig = ferrum_types::SpeculationConfig;
138
139/// Hardware constraints alias.
140pub type HardwareConstraints = ferrum_types::HardwareConstraints;
141
142/// Request characteristics alias.
143pub type RequestCharacteristics = ferrum_types::RequestCharacteristics;
144
145/// Latency requirements alias.
146pub type LatencyRequirements = ferrum_types::LatencyRequirements;
// ferrum_interfaces/engine.rs