Skip to main content

ferrum_models/common/
families.rs

//! Model-family traits beyond `DecoderOnlyLLM`.
//!
//! Extension points only at this stage — no model in the current tree
//! implements them yet. Landing order in Phase D:
//!
//!   `MultimodalLLM`      Qwen-VL / LLaVA (ViT backbone + LLM decoder)
//!   `EncoderDecoderLM`   Whisper (encoder hidden + decoder loop)
//!   `EmbeddingModel`     Bert / E5 / multilingual-e5 (single forward → hidden)
//!   `Transcriber`        Whisper CLI-facing API
//!   `TtsModel`           Qwen3-TTS (talker + vocoder pipeline)
//!
//! Each trait is written so it composes with the existing
//! `DecoderOnlyLLM` where appropriate (Multimodal reuses decoder loop,
//! Transcriber wraps an EncoderDecoderLM + mel frontend, etc.).
15
16use crate::common::llm::DecoderOnlyLLM;
17
/// Opaque block of visual tokens produced by a vision encoder.
///
/// The alias is a flat `Vec<f32>` and carries no shape information of its
/// own. The exact shape depends on the model; commonly `[num_patches, hidden]`.
pub type VisualTokens = Vec<f32>;
21
/// Opaque block of audio tokens (mel spectrogram features or encoder output).
///
/// Like the other token aliases this is a flat `Vec<f32>`; any shape is
/// model-specific and not recorded in the type.
pub type AudioTokens = Vec<f32>;
24
/// Opaque image buffer: raw pixel data as bytes.
///
/// The pixel format and image dimensions are not encoded in this type.
pub type ImageBuffer = Vec<u8>;
27
/// PCM audio buffer — `f32` mono samples.
///
/// The sample rate is not part of this type.
pub type PcmSamples = Vec<f32>;
30
/// One output segment from a transcriber: a time span plus its text.
#[derive(Clone, Debug, PartialEq)]
pub struct TranscriptSegment {
    /// Segment start, in seconds.
    pub start_sec: f32,
    /// Segment end, in seconds.
    pub end_sec: f32,
    /// Text recognized for this span.
    pub text: String,
}
38
/// One synthesized audio chunk (stereo not supported yet).
#[derive(Clone, Debug, PartialEq)]
pub struct AudioBuffer {
    /// Sample rate of `samples` (presumably in Hz — confirm with producers).
    pub sample_rate: u32,
    /// Audio samples as `f32`; a single channel, since stereo is not
    /// supported yet.
    pub samples: Vec<f32>,
}
45
/// Optional reference for voice-cloning-style TTS.
#[derive(Clone, Debug, PartialEq)]
pub struct SpeakerRef {
    /// Reference audio samples.
    ///
    /// NOTE(review): sample rate and channel layout are not specified by
    /// this type — confirm with the consuming model.
    pub ref_audio: Vec<f32>,
    /// Optional text accompanying the reference audio (presumably its
    /// transcript — confirm).
    pub ref_text: Option<String>,
}
52
// ── Multimodal LLM ──────────────────────────────────────────────────────
//
// A multimodal LLM is a decoder-only LLM that additionally accepts visual
// and/or audio inputs (Qwen-VL, LLaVA, etc.). The image/audio encoders
// typically share the Backend trait but have dedicated model code
// (separate file per model family).
59
60pub trait MultimodalLLM: DecoderOnlyLLM {
61    /// Encode an image into visual tokens that can be injected into the
62    /// decoder's prefill token stream (typical LLaVA/Qwen-VL flow).
63    fn encode_image(&mut self, pixels: &ImageBuffer) -> VisualTokens;
64
65    /// Optional audio path; models that don't support audio leave the default.
66    fn encode_audio(&mut self, _pcm: &PcmSamples) -> AudioTokens {
67        Vec::new()
68    }
69}
70
// ── Encoder + Decoder ────────────────────────────────────────────────────
//
// Encoder-decoder models (Whisper, T5, BART) keep encoder hidden state
// around for the duration of decode. The encoder state is opaque to the
// engine — each model defines its own.
76
77pub trait EncoderDecoderLM: Send + Sync {
78    /// Encoded side output. Type-erased so different models can carry
79    /// different shapes (Whisper: `[n_audio_frames, hidden]`).
80    fn encode(&mut self, cache_id: &str, input: &[u32]) -> EncoderState;
81
82    /// Advance the decoder one step, conditioned on `encoder` produced earlier.
83    fn decode_step(
84        &mut self,
85        cache_id: &str,
86        token: u32,
87        pos: u32,
88        encoder: &EncoderState,
89    ) -> Vec<f32>;
90
91    fn release(&mut self, cache_id: &str);
92}
93
/// Encoder-side state handed back from `encode()` and passed into
/// `decode_step()`.
///
/// The engine is meant to treat this as opaque and only pass it through;
/// the fields are public so model code can build and read it.
#[derive(Clone, Debug, PartialEq)]
pub struct EncoderState {
    /// Flat buffer of encoder output values.
    pub hidden: Vec<f32>,
    /// Dimensions describing `hidden`.
    ///
    /// NOTE(review): nothing in this type enforces that the product of
    /// `shape` equals `hidden.len()` — producers must keep them consistent.
    pub shape: Vec<usize>,
}
101
// ── Embedding Model ──────────────────────────────────────────────────────
103
/// Model that maps a token sequence to a single embedding vector.
pub trait EmbeddingModel: Send + Sync {
    /// Run a single forward pass over a token sequence and return the pooled
    /// embedding (typical CLS pooling: `[hidden]`).
    ///
    /// Takes `&mut self`, so implementors may mutate internal state during
    /// the pass.
    fn embed(&mut self, tokens: &[u32]) -> Vec<f32>;
}
109
// ── Transcriber ──────────────────────────────────────────────────────────
//
// Higher-level audio-to-text API. Wraps an internal encoder-decoder model
// plus mel-spectrogram frontend + sampler; CLI only sees this trait.
114
115pub trait Transcriber: Send + Sync {
116    fn transcribe(&mut self, pcm: &PcmSamples, language: Option<&str>) -> Vec<TranscriptSegment>;
117}
118
// ── TTS Model ────────────────────────────────────────────────────────────
120
121pub trait TtsModel: Send + Sync {
122    fn synthesize(&mut self, text: &str, speaker: Option<&SpeakerRef>) -> AudioBuffer;
123}