rlx_vlm_base/lib.rs
// RLX — versatile ML compiler + runtime.
// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
//! Shared base types for vision-language and omni runners (PLAN.md M7).
//!
//! `rlx-qwen3-vl`, `rlx-lfm-vl`, and `rlx-nemotron-omni` all need the
//! same shape of plumbing: a per-image preprocessor (resize +
//! patchify), a vision-tower trait, an MLP projector trait, and a
//! multimodal turn interleaver that mixes image / text / (audio)
//! into a single LM token stream. This crate hosts those traits so
//! the family crates stay thin.
//!
//! **Status:** TYPE SKELETON. The traits and supporting structs are
//! in place; implementations land alongside the per-family crates as
//! M7 progresses.
28
29use anyhow::Result;
30
/// Tags which kind of content one chunk of a multimodal prompt carries.
///
/// The tag travels alongside the LM token stream so a runner can tell
/// when a chunk has to go through the vision tower or the audio encoder
/// rather than being consumed as raw token ids.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Modality {
    /// Text content, consumed as raw token ids.
    Text,
    /// Image content, routed through the vision tower.
    Image,
    /// Audio content, routed through the audio encoder.
    Audio,
}
40
/// A single image after the preprocessor has resized and patchified it.
///
/// The flat `patches` buffer is expected to satisfy
/// `patches.len() == grid_h * grid_w * channels * patch_h * patch_w`.
/// How the values are ordered inside that buffer depends on the model
/// family.
#[derive(Clone, Debug)]
pub struct ImagePatches {
    /// Flattened patch data for the whole image.
    pub patches: Vec<f32>,
    /// Height of the patch grid, counted in patches.
    pub grid_h: usize,
    /// Width of the patch grid, counted in patches.
    pub grid_w: usize,
    /// Height of one patch.
    pub patch_h: usize,
    /// Width of one patch.
    pub patch_w: usize,
    /// Number of channels in each patch.
    pub channels: usize,
}

impl ImagePatches {
    /// Total number of patches in the grid, i.e. `grid_h * grid_w`.
    pub fn num_patches(&self) -> usize {
        let (rows, cols) = (self.grid_h, self.grid_w);
        rows * cols
    }

    /// Number of values that make up one patch, i.e.
    /// `channels * patch_h * patch_w`.
    pub fn patch_dim(&self) -> usize {
        // Multiply in the same left-to-right order as the documented
        // formula so overflow behaviour is unchanged.
        let channels_by_height = self.channels * self.patch_h;
        channels_by_height * self.patch_w
    }
}
62
63/// Image preprocessor. Implementations resize/letterbox/normalise per
64/// the family's training pipeline (Qwen3-VL uses SigLIP norms,
65/// LFM2.5-VL uses its own, etc.).
66pub trait ImagePreprocessor: Send {
67 fn preprocess_path(&self, path: &std::path::Path) -> Result<ImagePatches>;
68 fn preprocess_bytes(&self, bytes: &[u8]) -> Result<ImagePatches>;
69}
70
71/// Vision tower — embeds patches into the model's hidden dim.
72/// Output shape is `[num_patches, hidden]`.
73pub trait VisionTower: Send {
74 fn embed(&mut self, patches: &ImagePatches) -> Result<Vec<f32>>;
75 fn hidden_size(&self) -> usize;
76}
77
/// Projector: maps vision-tower embeddings into the LM's embedding
/// space so they can sit next to text token embeddings.
///
/// Typically a 2-layer MLP with GeLU.
pub trait Projector: Send {
    /// Project the embeddings of `num_patches` patches, passed as the
    /// flat buffer `vision_embed`.
    ///
    /// NOTE(review): presumably the result holds
    /// `num_patches * output_dim()` values — confirm with implementors.
    ///
    /// # Errors
    /// Returns whatever error the implementation reports.
    fn project(&mut self, vision_embed: &[f32], num_patches: usize) -> Result<Vec<f32>>;

    /// Size of the output dimension reported by this projector.
    fn output_dim(&self) -> usize;
}
85
/// Audio encoder for omni models: audio in, hidden embeddings out,
/// going through mel features along the way.
///
/// Reuse `rlx-whisper`'s mel encoder where possible; this trait is the
/// contract a family crate adapts to.
pub trait AudioEncoder: Send {
    /// Embed the audio given as `f32` samples at `sample_rate`.
    ///
    /// NOTE(review): the layout of the returned buffer is not stated
    /// here — presumably `[frames, hidden_size()]`; confirm.
    ///
    /// # Errors
    /// Returns whatever error the implementation reports.
    fn embed_audio(&mut self, samples: &[f32], sample_rate: u32) -> Result<Vec<f32>>;

    /// Size of the hidden dimension reported by this encoder.
    fn hidden_size(&self) -> usize;
}
93
94/// Multimodal prompt — turn-ordered list of `(modality, payload)`
95/// chunks. The runner consumes this and assembles the LM token
96/// stream by interleaving text token ids with image/audio embeddings
97/// after passing each non-text chunk through the relevant
98/// encoder + projector.
99#[derive(Debug, Clone, Default)]
100pub struct MultimodalPrompt {
101 pub chunks: Vec<PromptChunk>,
102}
103
104#[derive(Debug, Clone)]
105pub enum PromptChunk {
106 /// Raw LM token ids (caller already ran the chat template +
107 /// tokenizer on the text portion).
108 Text(Vec<u32>),
109 /// Preprocessed image patches.
110 Image(ImagePatches),
111 /// PCM-f32 audio at the given sample rate.
112 Audio { samples: Vec<f32>, sample_rate: u32 },
113}
114
115impl MultimodalPrompt {
116 pub fn push(&mut self, chunk: PromptChunk) {
117 self.chunks.push(chunk);
118 }
119 pub fn is_text_only(&self) -> bool {
120 self.chunks
121 .iter()
122 .all(|c| matches!(c, PromptChunk::Text(_)))
123 }
124 pub fn num_chunks(&self) -> usize {
125 self.chunks.len()
126 }
127}