rlx_models/run.rs
1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! High-level runner API — re-exported from per-model crates.
17//!
18//! Prefer depending on a specific model crate (`rlx-qwen3`, …) and
19//! its `rlx-<family>` binary when you only need one family.
20
21pub use crate::sam_runner::{SamArch, SamPredictionAny, SamRunner, SamRunnerBuilder};
22pub use rlx_cli::{
23 AssembledTurn, ChatMessage, ChatTemplate, ChatTemplateSource, CompatSource,
24 CompatibilityReport, CompatibilityStatus, GgufRequiredFields, LmRunner, MediaSource,
25 ModelRunner, MtmdContext, MtmdTurn, SniffedFrom, SniffedRunner, UnimplementedArch,
26 WeightFormat, arch_runner_name, auto_chat_template, auto_dispatch, auto_runner_name,
27 auto_sniff, check_hf_repo, check_path, debug_resolve_name, dispatch, dispatch_help,
28 known_unimplemented_arch, known_unimplemented_keys, list_mtp_keys, looks_like_hf_repo,
29 model_type_runner_name, open_gguf_loader, open_loader, open_loader_resolved,
30 open_loader_with_format, register_cli, register_runner, registered_runners, run_auto,
31 run_check, run_inspect, run_registered,
32};
33pub use rlx_dinov2::{DinoV2Output, DinoV2Runner, DinoV2RunnerBuilder, DinoV2Variant};
34pub use rlx_flux2::{Flux2Output, Flux2Runner, Flux2RunnerBuilder};
35pub use rlx_gemma::{GemmaConfigSource, GemmaRunner, GemmaRunnerBuilder};
36pub use rlx_llama32::{Llama32ConfigSource, Llama32Runner, Llama32RunnerBuilder};
37pub use rlx_qwen3::{Precision, Qwen3ConfigSource, Qwen3Runner, Qwen3RunnerBuilder};
38pub use rlx_qwen35::{Qwen35ConfigSource, Qwen35Runner, Qwen35RunnerBuilder};
39pub use rlx_vjepa2::{
40 Vjepa2Output, Vjepa2PoolOutput, Vjepa2PredictOutput, Vjepa2Runner, Vjepa2RunnerBuilder,
41};
42pub use rlx_wav2vec2_bert::{Wav2Vec2BertRunner, Wav2Vec2BertRunnerBuilder};
43
44/// Back-compat alias.
45pub type ConfigSource = Qwen3ConfigSource;
46
47use anyhow::{Result, bail};
48use std::path::Path;
49
50/// Sniff `path` for its GGUF / safetensors arch and return a boxed
51/// runner that implements [`LmRunner`]. The factory uses the existing
52/// [`auto_sniff`] arch-dispatch and constructs the per-family runner
53/// via its default builder.
54///
55/// Today this covers the four `text` LM families with a stable
56/// `predict_logits` API: `qwen3`, `qwen35`, `gemma`, `llama32`. Other
57/// families (vision-language, diffusion, embed) don't fit the
58/// `LmRunner` shape and return an error here. They keep their
59/// per-family builders.
60///
61/// PLAN.md M3. The `LmRunner` trait gained a default `generate(..)`
62/// in M8, so a boxed runner from this function can stream tokens too.
63pub fn auto_runner(path: &Path) -> Result<Box<dyn LmRunner>> {
64 auto_runner_with_mmproj(path, None)
65}
66
67/// Same as [`auto_runner`] but also attaches an mmproj vision encoder
68/// when the model family supports multimodal prefill (today: `qwen35`
69/// non-MTP path). For other families `mmproj` is silently ignored —
70/// matches llama-cpp's behaviour where mmproj on a text-only model is
71/// a no-op. The returned runner's [`LmRunner::supports_multimodal`]
72/// will report `true` only when both the family is multimodal-capable
73/// and `mmproj` was attached.
74pub fn auto_runner_with_mmproj(path: &Path, mmproj: Option<&Path>) -> Result<Box<dyn LmRunner>> {
75 let sniff = auto_sniff(path)?;
76 let weights = sniff.path.as_path();
77 // Packed-K-quant auto-detection is now inside each runner's
78 // `.build()` (matches llama.cpp's behaviour — K-quant tensors stay
79 // packed in memory, never materialise to a dense F32 matrix).
80 let runner: Box<dyn LmRunner> = match sniff.runner_name {
81 "qwen3" => Box::new(Qwen3Runner::builder().weights(weights).build()?),
82 "qwen35" => {
83 // PLAN.md M6 — auto-route MTP-equipped GGUFs through
84 // `Qwen35SpecRunner` for speculative decode. The
85 // `Qwen35MtpHead` HIR op now dispatches `DequantMatMul`
86 // per-weight (via `weight_schemes` plumbed through
87 // `lower_qwen35_mtp_head`), so packed K-quant GGUFs can
88 // run MTP without falling back to F32-only.
89 if gguf_has_mtp_heads(weights).unwrap_or(false) {
90 Box::new(
91 rlx_qwen35::Qwen35SpecRunner::builder()
92 .weights(weights)
93 .build()?,
94 )
95 } else {
96 let mut b = Qwen35Runner::builder().weights(weights);
97 if let Some(mp) = mmproj {
98 b = b.mmproj(mp);
99 }
100 Box::new(b.build()?)
101 }
102 }
103 "gemma" => Box::new(GemmaRunner::builder().weights(weights).build()?),
104 "llama32" => Box::new(Llama32Runner::builder().weights(weights).build()?),
105 "lfm" => Box::new(rlx_lfm::LfmRunner::builder().weights(weights).build()?),
106 other => bail!(
107 "auto_runner: runner `{other}` (sniffed from {:?}) has no `LmRunner` impl yet — \
108 use its per-family builder directly",
109 sniff.from
110 ),
111 };
112 Ok(runner)
113}
114
115/// Peek at a GGUF's `<arch>.nextn_predict_layers` metadata key without
116/// fully loading weights. Returns `Ok(true)` when the file declares ≥1
117/// MTP head. Non-GGUF or missing-key → `Ok(false)`.
118fn gguf_has_mtp_heads(path: &Path) -> Result<bool> {
119 use rlx_gguf::{GgufFile, MetaValue};
120 let is_gguf = path
121 .extension()
122 .and_then(|s| s.to_str())
123 .map(|s| s.eq_ignore_ascii_case("gguf"))
124 .unwrap_or(false);
125 if !is_gguf {
126 return Ok(false);
127 }
128 let raw = GgufFile::from_path(path)?;
129 let arch = raw
130 .metadata
131 .get("general.architecture")
132 .and_then(MetaValue::as_str)
133 .unwrap_or("");
134 // Try `<arch>.nextn_predict_layers` first; fall back to `qwen35.*` for
135 // converters that reuse the qwen35 prefix on qwen36 files.
136 for k in [
137 format!("{arch}.nextn_predict_layers"),
138 "qwen35.nextn_predict_layers".to_string(),
139 "qwen36.nextn_predict_layers".to_string(),
140 ] {
141 if let Some(MetaValue::U32(n)) = raw.metadata.get(&k) {
142 return Ok(*n > 0);
143 }
144 }
145 Ok(false)
146}
147
148/// Encode `text` to LM token ids using a HuggingFace `tokenizer.json`
149/// resolved next to the GGUF / safetensors at `weights_path`. Pass
150/// `explicit_tokenizer` to override the auto-discovery (sibling
151/// `<weights>.tokenizer.json` or `tokenizer.json` in the weights dir).
152///
153/// PLAN.md M8 — closes the loop between [`auto_chat_template`] (which
154/// returns a rendered string) and [`LmRunner::predict_logits`] /
155/// [`LmRunner::generate`] (which take raw token ids).
156///
157/// **Fallback (PLAN.md M8):** when no `tokenizer.json` is available
158/// and the weights are a GGUF, `encode_prompt_auto` automatically
159/// reconstructs a byte-level BPE tokenizer from
160/// `tokenizer.ggml.{tokens, merges}`. Works for the GPT-2/Qwen/Llama
161/// family (`tokenizer.ggml.model = "gpt2"`); SentencePiece tokenizers
162/// (`tokenizer.ggml.model = "llama"` legacy) still require a sibling
163/// `tokenizer.json`.
164pub fn auto_tokenize(
165 weights_path: &Path,
166 text: &str,
167 explicit_tokenizer: Option<&Path>,
168) -> Result<Vec<u32>> {
169 use anyhow::Context;
170 match rlx_qwen35::encode_prompt_auto(weights_path, explicit_tokenizer, text) {
171 Ok(ids) => Ok(ids),
172 Err(e) => {
173 // Augment with the GGUF-vocab fallback hint when applicable.
174 let is_gguf = weights_path
175 .extension()
176 .and_then(|s| s.to_str())
177 .map(|s| s.eq_ignore_ascii_case("gguf"))
178 .unwrap_or(false);
179 if !is_gguf {
180 return Err(e);
181 }
182 Err(e).with_context(|| {
183 format!(
184 "auto_tokenize: no `tokenizer.json` resolved for {weights_path:?}. \
185 The GGUF ships a vocab at `tokenizer.ggml.tokens` but \
186 reconstructing a BPE encoder from GGUF-only metadata is \
187 per-family work (PLAN.md M8 follow-up). Options: \
188 (1) place `tokenizer.json` next to the GGUF; \
189 (2) pass an explicit path via the `explicit_tokenizer` arg; \
190 (3) download the matching `tokenizer.json` from the model's \
191 HF repo and point at it"
192 )
193 })
194 }
195 }
196}
197
198/// Inverse of [`auto_tokenize`] — turn `ids` back into text, using the
199/// same tokenizer resolution chain (sibling `tokenizer.json` →
200/// `explicit_tokenizer` → GGUF-embedded byte-level BPE vocab).
201///
202/// `skip_special_tokens=true` removes EOS / chat-template control
203/// tokens (`<|im_end|>`, `<|endoftext|>`, …) — what you want for
204/// streaming user-facing chat output. Set `false` to keep them
205/// (useful for debugging or stop-string matching).
206pub fn auto_detokenize(
207 weights_path: &Path,
208 ids: &[u32],
209 explicit_tokenizer: Option<&Path>,
210 skip_special_tokens: bool,
211) -> Result<String> {
212 rlx_qwen35::decode_ids_auto(weights_path, explicit_tokenizer, ids, skip_special_tokens)
213}