ferrum_models/common/families.rs
1//! Model-family traits beyond `DecoderOnlyLLM`.
2//!
3//! Extension points only at this stage — no model in the current tree
4//! implements them yet. Landing order in Phase D:
5//!
//! - `MultimodalLLM` — Qwen-VL / LLaVA (ViT backbone + LLM decoder)
//! - `EncoderDecoderLM` — Whisper (encoder hidden + decoder loop)
//! - `EmbeddingModel` — Bert / E5 / multilingual-e5 (single forward → hidden)
//! - `Transcriber` — Whisper CLI-facing API
//! - `TtsModel` — Qwen3-TTS (talker + vocoder pipeline)
11//!
12//! Each trait is written so it composes with the existing
13//! `DecoderOnlyLLM` where appropriate (Multimodal reuses decoder loop,
14//! Transcriber wraps an EncoderDecoderLM + mel frontend, etc.).
15
16use crate::common::llm::DecoderOnlyLLM;
17
/// Visual tokens emitted by a vision encoder, stored as one flat `f32` buffer.
///
/// The logical shape is model-specific (commonly `[num_patches, hidden]`) and
/// is not carried by this alias.
// NOTE(review): element ordering within the flat buffer is not specified here —
// confirm with the producing encoder.
pub type VisualTokens = Vec<f32>;
21
/// Audio tokens stored as one flat `f32` buffer.
///
/// Depending on the model these are mel-spectrogram features or the output of
/// an audio encoder; the alias itself does not distinguish between the two.
pub type AudioTokens = Vec<f32>;
24
/// Image pixel data as an opaque byte buffer.
// NOTE(review): pixel format and dimensions are not encoded in this type —
// confirm the expected layout with the consuming encoder.
pub type ImageBuffer = Vec<u8>;
27
/// PCM audio: mono samples, one `f32` per sample.
///
/// The sample rate is not part of this type and has to be agreed on by the
/// caller and the consumer.
pub type PcmSamples = Vec<f32>;
30
/// One output segment from a transcriber: a time span plus the text
/// recognized within it.
///
/// Derives `PartialEq` so segments can be compared directly (for example in
/// `assert_eq!`). `Eq` is intentionally absent because the time fields are
/// floating point.
#[derive(Clone, Debug, PartialEq)]
pub struct TranscriptSegment {
    /// Segment start, in seconds.
    pub start_sec: f32,
    /// Segment end, in seconds.
    pub end_sec: f32,
    /// Text transcribed for this segment.
    pub text: String,
}
38
/// One synthesized audio chunk (stereo not supported yet).
///
/// Derives `PartialEq` so buffers can be compared directly; `Eq` is not
/// available because the samples are floating point.
#[derive(Clone, Debug, PartialEq)]
pub struct AudioBuffer {
    /// Sample rate of `samples`.
    // NOTE(review): presumably in Hz — confirm against the producing model.
    pub sample_rate: u32,
    /// Audio samples as `f32` values.
    pub samples: Vec<f32>,
}
45
/// Optional reference for voice-cloning-style TTS.
///
/// Derives `PartialEq` so references can be compared directly; `Eq` is not
/// available because the audio samples are floating point.
#[derive(Clone, Debug, PartialEq)]
pub struct SpeakerRef {
    /// Reference audio samples for the target voice.
    // NOTE(review): presumably mono PCM like `PcmSamples`; the sample rate is
    // not recorded here — confirm with the TTS implementation.
    pub ref_audio: Vec<f32>,
    /// Text accompanying the reference audio, when available.
    // NOTE(review): presumably the transcript of `ref_audio` — confirm.
    pub ref_text: Option<String>,
}
52
53// ── Multimodal LLM ──────────────────────────────────────────────────────
54//
55// A multimodal LLM is a decoder-only LLM that additionally accepts visual
56// and/or audio inputs (Qwen-VL, LLaVA, etc.). The image/audio encoders
57// typically share the Backend trait but have dedicated model code
58// (separate file per model family).
59
60pub trait MultimodalLLM: DecoderOnlyLLM {
61 /// Encode an image into visual tokens that can be injected into the
62 /// decoder's prefill token stream (typical LLaVA/Qwen-VL flow).
63 fn encode_image(&mut self, pixels: &ImageBuffer) -> VisualTokens;
64
65 /// Optional audio path; models that don't support audio leave the default.
66 fn encode_audio(&mut self, _pcm: &PcmSamples) -> AudioTokens {
67 Vec::new()
68 }
69}
70
71// ── Encoder + Decoder ────────────────────────────────────────────────────
72//
73// Encoder-decoder models (Whisper, T5, BART) keep encoder hidden state
74// around for the duration of decode. The encoder state is opaque to the
75// engine — each model defines its own.
76
77pub trait EncoderDecoderLM: Send + Sync {
78 /// Encoded side output. Type-erased so different models can carry
79 /// different shapes (Whisper: `[n_audio_frames, hidden]`).
80 fn encode(&mut self, cache_id: &str, input: &[u32]) -> EncoderState;
81
82 /// Advance the decoder one step, conditioned on `encoder` produced earlier.
83 fn decode_step(
84 &mut self,
85 cache_id: &str,
86 token: u32,
87 pos: u32,
88 encoder: &EncoderState,
89 ) -> Vec<f32>;
90
91 fn release(&mut self, cache_id: &str);
92}
93
/// Encoder-side state handed back from `encode()` and passed into
/// `decode_step()`. Opaque to the engine.
///
/// Derives `Debug` for consistency with the other public data types in this
/// module, and `PartialEq` so states can be compared directly.
#[derive(Clone, Debug, PartialEq)]
pub struct EncoderState {
    /// Encoder hidden values as a flat buffer.
    pub hidden: Vec<f32>,
    /// Logical dimensions of `hidden` (for example `[n_audio_frames, hidden]`
    /// for Whisper).
    // NOTE(review): nothing here enforces that the product of `shape` equals
    // `hidden.len()` — confirm producers uphold it.
    pub shape: Vec<usize>,
}
101
102// ── Embedding Model ──────────────────────────────────────────────────────
103
/// Interface for models that map a token sequence to a single embedding
/// vector.
///
/// Implementors must be `Send + Sync`.
pub trait EmbeddingModel: Send + Sync {
    /// Performs one forward pass over `tokens` and returns the pooled
    /// embedding (typically CLS pooling, i.e. a `[hidden]`-sized vector).
    fn embed(&mut self, tokens: &[u32]) -> Vec<f32>;
}
109
110// ── Transcriber ──────────────────────────────────────────────────────────
111//
112// Higher-level audio-to-text API. Wraps an internal encoder-decoder model
113// plus mel-spectrogram frontend + sampler; CLI only sees this trait.
114
115pub trait Transcriber: Send + Sync {
116 fn transcribe(&mut self, pcm: &PcmSamples, language: Option<&str>) -> Vec<TranscriptSegment>;
117}
118
119// ── TTS Model ────────────────────────────────────────────────────────────
120
121pub trait TtsModel: Send + Sync {
122 fn synthesize(&mut self, text: &str, speaker: Option<&SpeakerRef>) -> AudioBuffer;
123}