llama_cpp_4/
mtmd.rs

1//! Safe wrappers for the `libmtmd` multimodal support library.
2//!
3//! `libmtmd` extends llama.cpp with the ability to encode image and audio
4//! inputs (bitmaps) into token embeddings that can then be fed into a
5//! standard [`crate::context::LlamaContext::decode`] call alongside normal text tokens.
6//!
7//! # Quick-start
8//!
9//! ```no_run
10//! # #[cfg(feature = "mtmd")]
11//! # {
12//! use std::path::Path;
13//! use llama_cpp_4::{
14//!     llama_backend::LlamaBackend,
15//!     model::{LlamaModel, params::LlamaModelParams, AddBos},
16//!     context::params::LlamaContextParams,
17//!     mtmd::{MtmdContext, MtmdContextParams, MtmdBitmap, MtmdInputChunks, MtmdInputText},
18//! };
19//!
20//! let backend  = LlamaBackend::init().unwrap();
21//! let model    = LlamaModel::load_from_file(&backend, Path::new("model.gguf"),
22//!                                            &LlamaModelParams::default()).unwrap();
23//! let mut lctx = model.new_context(&backend, LlamaContextParams::default()).unwrap();
24//!
25//! // Load the multimodal projector (mmproj) model.
26//! let ctx_params = MtmdContextParams::default();
27//! let mtmd_ctx   = MtmdContext::init_from_file(Path::new("mmproj.gguf"), &model, ctx_params)
28//!                               .unwrap();
29//!
30//! // Load an image from a file.
31//! let bitmap = MtmdBitmap::from_file(&mtmd_ctx, Path::new("image.jpg")).unwrap();
32//!
33//! // Tokenize a prompt that contains the media marker.
34//! let marker  = MtmdContext::default_marker();
35//! let prompt  = format!("Describe this image: {marker}");
36//! let text    = MtmdInputText::new(&prompt, true, true);
37//! let bitmaps = [&bitmap];
38//!
39//! let mut chunks = MtmdInputChunks::new();
40//! mtmd_ctx.tokenize(&text, &bitmaps, &mut chunks).unwrap();
41//!
42//! // Evaluate / decode all chunks.
43//! let n_batch = lctx.n_batch() as i32;
44//! let mut n_past = 0i32;
45//! mtmd_ctx.eval_chunks(lctx.as_ptr(), &chunks, 0, 0, n_batch, true, &mut n_past).unwrap();
46//! # }
47//! ```
48//!
49//! # Feature flag
50//!
51//! This module is only compiled when the `mtmd` Cargo feature is enabled.
52
53use std::ffi::{CStr, CString};
54use std::os::raw::c_void;
55use std::path::Path;
56use std::ptr::NonNull;
57use std::slice;
58
59use llama_cpp_sys_4 as sys;
60
61use crate::model::LlamaModel;
62
63// ─────────────────────────────────────────────────────────────────────────────
64// Error types
65// ─────────────────────────────────────────────────────────────────────────────
66
/// All errors that can be returned by the mtmd module.
///
/// The `Display` text of each variant comes from its `#[error(...)]`
/// attribute.
#[derive(Debug, thiserror::Error)]
pub enum MtmdError {
    /// The context could not be created (e.g. bad mmproj file).
    #[error("failed to create mtmd context (null return from mtmd_init_from_file)")]
    ContextCreateFailed,

    /// The bitmap could not be created.
    #[error("failed to create mtmd bitmap")]
    BitmapCreateFailed,

    /// A string handed to the C API (a path, or a media-marker override)
    /// contained an interior NUL byte and could not be turned into a C string.
    ///
    /// Non-UTF-8 paths are reported separately as [`MtmdError::PathNotUtf8`].
    #[error("invalid path: {0}")]
    InvalidPath(#[from] std::ffi::NulError),

    /// A path was not representable as UTF-8.
    #[error("path is not valid UTF-8")]
    PathNotUtf8,

    /// `mtmd_tokenize` returned an error code.
    #[error("tokenize error: code {0} (1 = bitmap count mismatch, 2 = preprocessing error)")]
    TokenizeError(i32),

    /// `mtmd_encode_chunk` returned a non-zero code.
    #[error("encode error: code {0}")]
    EncodeError(i32),

    /// `mtmd_helper_eval_chunks` (or single-chunk variant) returned a non-zero code.
    #[error("eval error: code {0}")]
    EvalError(i32),

    /// A video stream could not be opened. Common causes: the build lacks
    /// video support (`MTMD_VIDEO` was OFF), `ffmpeg`/`ffprobe` is not on
    /// `PATH`, or the file is unreadable.
    #[error("failed to open video stream (null return from mtmd_helper_video_init)")]
    VideoInitFailed,

    /// `mtmd_helper_video_read_next` returned an error code (`-2`).
    #[error("video read error: code {0}")]
    VideoReadError(i32),
}
108
/// A convenience `Result` alias for this module.
///
/// Shorthand for `std::result::Result<T, MtmdError>`.
pub type Result<T> = std::result::Result<T, MtmdError>;
111
/// Progress callback invoked while the CLIP/mmproj weights are loading.
///
/// Receives a value in `[0.0, 1.0]`. Return `true` to continue loading or
/// `false` to abort immediately.
///
/// `user_data` is the opaque pointer registered together with the callback
/// through [`MtmdContextParams::with_progress_callback`].
pub type MtmdProgressCallback = unsafe extern "C" fn(progress: f32, user_data: *mut c_void) -> bool;
117
118// ─────────────────────────────────────────────────────────────────────────────
119// MtmdContextParams
120// ─────────────────────────────────────────────────────────────────────────────
121
/// Parameters used when creating an [`MtmdContext`].
///
/// Obtain a default-initialised instance via [`MtmdContextParams::default()`],
/// then adjust individual settings with the by-value builder methods.
pub struct MtmdContextParams {
    // Raw FFI parameter struct; handed by value to `mtmd_init_from_file`
    // inside `MtmdContext::init_from_file`.
    pub(crate) params: sys::mtmd_context_params,
}
128
129impl std::fmt::Debug for MtmdContextParams {
130    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
131        f.debug_struct("MtmdContextParams")
132            .field("use_gpu", &self.params.use_gpu)
133            .field("print_timings", &self.params.print_timings)
134            .field("n_threads", &self.params.n_threads)
135            .field("warmup", &self.params.warmup)
136            .field("image_min_tokens", &self.params.image_min_tokens)
137            .field("image_max_tokens", &self.params.image_max_tokens)
138            .finish()
139    }
140}
141
142impl Default for MtmdContextParams {
143    fn default() -> Self {
144        let params = unsafe { sys::mtmd_context_params_default() };
145        Self { params }
146    }
147}
148
149impl MtmdContextParams {
150    /// Whether to run the vision/audio encoder on the GPU (default: `true`).
151    #[must_use]
152    pub fn use_gpu(mut self, v: bool) -> Self {
153        self.params.use_gpu = v;
154        self
155    }
156
157    /// Whether to print timing info after each encode (default: `false`).
158    #[must_use]
159    pub fn print_timings(mut self, v: bool) -> Self {
160        self.params.print_timings = v;
161        self
162    }
163
164    /// Number of threads used for the vision encoder (default taken from
165    /// `mtmd_context_params_default`).
166    #[must_use]
167    pub fn n_threads(mut self, n: i32) -> Self {
168        self.params.n_threads = n;
169        self
170    }
171
172    /// Whether to run a warm-up encode pass after initialisation.
173    #[must_use]
174    pub fn warmup(mut self, v: bool) -> Self {
175        self.params.warmup = v;
176        self
177    }
178
179    /// Minimum number of image tokens (0 = use model default).
180    #[must_use]
181    pub fn image_min_tokens(mut self, n: i32) -> Self {
182        self.params.image_min_tokens = n;
183        self
184    }
185
186    /// Maximum number of image tokens (0 = use model default).
187    #[must_use]
188    pub fn image_max_tokens(mut self, n: i32) -> Self {
189        self.params.image_max_tokens = n;
190        self
191    }
192
193    /// Maximum number of multimodal output tokens per batch.
194    ///
195    /// Maps to `mtmd_context_params.batch_max_tokens`. The upstream default
196    /// is `1024`. Increase for large images or long audio segments.
197    ///
198    /// # Examples
199    ///
200    /// ```rust
201    /// # #[cfg(feature = "mtmd")]
202    /// # {
203    /// use llama_cpp_4::mtmd::MtmdContextParams;
204    /// let params = MtmdContextParams::default().with_batch_max_tokens(2048);
205    /// assert_eq!(params.batch_max_tokens(), 2048);
206    /// # }
207    /// ```
208    #[must_use]
209    pub fn with_batch_max_tokens(mut self, n: i32) -> Self {
210        self.params.batch_max_tokens = n;
211        self
212    }
213
214    /// Get the configured batch token cap (`batch_max_tokens`).
215    #[must_use]
216    pub fn batch_max_tokens(&self) -> i32 {
217        self.params.batch_max_tokens
218    }
219
220    /// Set flash-attention mode for the vision encoder.
221    ///
222    /// Maps to `mtmd_context_params.flash_attn_type`. Uses the same
223    /// [`crate::context::params::LlamaFlashAttnType`] enum as text contexts.
224    ///
225    /// # Examples
226    ///
227    /// ```rust
228    /// # #[cfg(feature = "mtmd")]
229    /// # {
230    /// use llama_cpp_4::context::params::LlamaFlashAttnType;
231    /// use llama_cpp_4::mtmd::MtmdContextParams;
232    /// let params = MtmdContextParams::default()
233    ///     .with_flash_attn_type(LlamaFlashAttnType::Auto);
234    /// assert_eq!(params.flash_attn_type(), LlamaFlashAttnType::Auto);
235    /// # }
236    /// ```
237    #[must_use]
238    pub fn with_flash_attn_type(
239        mut self,
240        flash_attn_type: crate::context::params::LlamaFlashAttnType,
241    ) -> Self {
242        self.params.flash_attn_type = flash_attn_type.into();
243        self
244    }
245
246    /// Get flash-attention mode for the vision encoder.
247    #[must_use]
248    pub fn flash_attn_type(&self) -> crate::context::params::LlamaFlashAttnType {
249        crate::context::params::LlamaFlashAttnType::from(self.params.flash_attn_type)
250    }
251
252    /// Register a callback invoked while mmproj weights load.
253    ///
254    /// Maps to `mtmd_context_params.progress_callback`. Pass `None` to disable
255    /// progress reporting. The callback may return `false` to abort loading
256    /// early; see [`MtmdProgressCallback`].
257    ///
258    /// `user_data` is forwarded to each invocation and must remain valid until
259    /// [`MtmdContext::init_from_file`] returns.
260    #[must_use]
261    pub fn with_progress_callback(
262        mut self,
263        callback: Option<MtmdProgressCallback>,
264        user_data: *mut c_void,
265    ) -> Self {
266        self.params.progress_callback = callback;
267        self.params.progress_callback_user_data = user_data;
268        self
269    }
270
271    /// Override the media marker string (e.g. `"<image>"`).
272    ///
273    /// The provided string must not contain interior NUL bytes.  Pass `None`
274    /// to use the library default (`mtmd_default_marker()`).
275    ///
276    /// **Note:** the `CString` is stored inside the params so the pointer
277    /// remains valid as long as this `MtmdContextParams` lives.
278    /// # Errors
279    ///
280    /// Returns [`MtmdError`] if the marker string contains a NUL byte.
281    pub fn media_marker(mut self, marker: Option<&str>) -> std::result::Result<Self, MtmdError> {
282        match marker {
283            None => {
284                self.params.media_marker = std::ptr::null();
285                Ok(self)
286            }
287            Some(s) => {
288                let cs = CString::new(s)?;
289                self.params.media_marker = cs.as_ptr();
290                // Leak the CString so the raw pointer stays valid; the caller
291                // must ensure the params don't outlive the string.  Since
292                // MtmdContextParams is consumed by MtmdContext::init_from_file,
293                // this is safe.
294                std::mem::forget(cs);
295                Ok(self)
296            }
297        }
298    }
299}
300
301// ─────────────────────────────────────────────────────────────────────────────
302// MtmdContext
303// ─────────────────────────────────────────────────────────────────────────────
304
/// The main multimodal context.
///
/// Wraps a `mtmd_context *`.  This context is tied to a specific mmproj model
/// file and a loaded [`LlamaModel`].  It is safe to share across threads for
/// `tokenize` calls (read-only), but `encode_chunk` / eval helpers mutate
/// internal state and must not be called concurrently.
pub struct MtmdContext {
    // Non-null handle obtained from `mtmd_init_from_file`; released with
    // `mtmd_free` when the wrapper is dropped.
    ptr: NonNull<sys::mtmd_context>,
}
314
// SAFETY (as stated by the original author): the underlying mtmd_context is
// internally synchronised for tokenize(). encode / decode must be called from
// a single thread at a time (caller's responsibility, enforced by the
// inference semaphore in the server).
//
// NOTE(review): nothing in this file enforces that exclusivity — the encode
// and eval methods take `&self` — so these impls rely entirely on callers.
// Confirm the upstream thread-safety guarantees before relying on `Sync`.
unsafe impl Send for MtmdContext {}
unsafe impl Sync for MtmdContext {}
320
321impl std::fmt::Debug for MtmdContext {
322    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
323        f.debug_struct("MtmdContext")
324            .field("ptr", &self.ptr)
325            .finish()
326    }
327}
328
329impl Drop for MtmdContext {
330    fn drop(&mut self) {
331        unsafe { sys::mtmd_free(self.ptr.as_ptr()) }
332    }
333}
334
335impl MtmdContext {
336    /// Returns the default media marker string used in prompts
337    /// (currently `"<__media__>"`).
338    #[must_use]
339    pub fn default_marker() -> &'static str {
340        let ptr = unsafe { sys::mtmd_default_marker() };
341        unsafe { CStr::from_ptr(ptr) }
342            .to_str()
343            .unwrap_or("<__media__>")
344    }
345
346    /// Initialise a multimodal context from an mmproj GGUF file.
347    ///
348    /// # Parameters
349    ///
350    /// * `mmproj_path` – path to the mmproj `.gguf` file
351    /// * `text_model`  – the already-loaded text model
352    /// * `params`      – context parameters (use [`MtmdContextParams::default()`])
353    ///
354    /// # Errors
355    ///
356    /// Returns [`MtmdError::ContextCreateFailed`] if the underlying C call
357    /// returns a null pointer.
358    #[allow(clippy::needless_pass_by_value)]
359    pub fn init_from_file(
360        mmproj_path: impl AsRef<Path>,
361        text_model: &LlamaModel,
362        params: MtmdContextParams,
363    ) -> Result<Self> {
364        let path = mmproj_path
365            .as_ref()
366            .to_str()
367            .ok_or(MtmdError::PathNotUtf8)?;
368        let c_path = CString::new(path)?;
369
370        let ptr = unsafe {
371            sys::mtmd_init_from_file(c_path.as_ptr(), text_model.model.as_ptr(), params.params)
372        };
373
374        let ptr = NonNull::new(ptr).ok_or(MtmdError::ContextCreateFailed)?;
375        Ok(Self { ptr })
376    }
377
378    // ── Logging ──────────────────────────────────────────────────────────
379
380    /// Silence all clip/mtmd log output by installing a no-op callback.
381    ///
382    /// Call this right after [`init_from_file`](Self::init_from_file) to
383    /// suppress the verbose `clip_model_loader: tensor[N]…` lines that
384    /// clip.cpp emits to its own private logger (separate from `llama_log_set`).
385    pub fn void_logs() {
386        unsafe extern "C" fn noop(
387            _level: sys::ggml_log_level,
388            _text: *const ::std::os::raw::c_char,
389            _ud: *mut ::std::os::raw::c_void,
390        ) {
391        }
392        unsafe { sys::mtmd_log_set(Some(noop), std::ptr::null_mut()) };
393    }
394
395    /// Like [`void_logs`](Self::void_logs), but additionally silences logs
396    /// emitted by the `mtmd_helper_*` layer (e.g. eval/decode helpers).
397    ///
398    /// Internally calls `mtmd_helper_log_set` which also routes through
399    /// `mtmd_log_set`, so this is a strict superset of `void_logs`.
400    pub fn void_helper_logs() {
401        unsafe extern "C" fn noop(
402            _level: sys::ggml_log_level,
403            _text: *const ::std::os::raw::c_char,
404            _ud: *mut ::std::os::raw::c_void,
405        ) {
406        }
407        unsafe { sys::mtmd_helper_log_set(Some(noop), std::ptr::null_mut()) };
408    }
409
410    // ── Capability queries ────────────────────────────────────────────────
411
412    /// Returns `true` if the model supports vision (image) input.
413    #[must_use]
414    pub fn supports_vision(&self) -> bool {
415        unsafe { sys::mtmd_support_vision(self.ptr.as_ptr()) }
416    }
417
418    /// Returns `true` if the model supports audio input.
419    #[must_use]
420    pub fn supports_audio(&self) -> bool {
421        unsafe { sys::mtmd_support_audio(self.ptr.as_ptr()) }
422    }
423
424    /// Returns `true` if this build and model support video input.
425    ///
426    /// Video support additionally requires `ffmpeg`/`ffprobe` to be available
427    /// at runtime (see [`MtmdVideo`]). Wraps `mtmd_helper_support_video`.
428    #[must_use]
429    pub fn supports_video(&self) -> bool {
430        unsafe { sys::mtmd_helper_support_video(self.ptr.as_ptr()) }
431    }
432
433    /// Returns the media marker string configured for *this* context.
434    ///
435    /// Unlike [`default_marker`](Self::default_marker) (the library-wide
436    /// default), this reflects any override passed via
437    /// [`MtmdContextParams::media_marker`]. Wraps `mtmd_get_marker`.
438    #[must_use]
439    pub fn marker(&self) -> &str {
440        let ptr = unsafe { sys::mtmd_get_marker(self.ptr.as_ptr()) };
441        if ptr.is_null() {
442            return Self::default_marker();
443        }
444        unsafe { CStr::from_ptr(ptr) }
445            .to_str()
446            .unwrap_or_else(|_| Self::default_marker())
447    }
448
449    /// Returns the audio sample rate in Hz (e.g. `16_000` for Whisper), or `-1` if
450    /// audio is not supported.
451    #[must_use]
452    pub fn audio_sample_rate(&self) -> i32 {
453        unsafe { sys::mtmd_get_audio_sample_rate(self.ptr.as_ptr()) }
454    }
455
456    /// Whether `llama_decode` must use a non-causal attention mask when
457    /// decoding image embeddings for this model.
458    #[must_use]
459    pub fn decode_use_non_causal(&self, chunk: &MtmdInputChunk<'_>) -> bool {
460        unsafe { sys::mtmd_decode_use_non_causal(self.ptr.as_ptr(), chunk.as_ptr()) }
461    }
462
463    /// Whether the model uses M-RoPE for `llama_decode`.
464    #[must_use]
465    pub fn decode_use_mrope(&self) -> bool {
466        unsafe { sys::mtmd_decode_use_mrope(self.ptr.as_ptr()) }
467    }
468
469    // ── Core API ──────────────────────────────────────────────────────────
470
471    /// Tokenize a text prompt that contains one or more media markers.
472    ///
473    /// The number of `bitmaps` must equal the number of media markers in the
474    /// prompt text, otherwise [`MtmdError::TokenizeError`] with code `1` is returned.
475    ///
476    /// This call is **thread-safe** (shared `&self`).
477    ///
478    /// # Parameters
479    ///
480    /// * `text`    – text + tokenisation options
481    /// * `bitmaps` – slice of [`MtmdBitmap`] references, one per media marker
482    /// * `output`  – an [`MtmdInputChunks`] that will be populated with the result
483    ///
484    /// # Errors
485    ///
486    /// Returns [`MtmdError::TokenizeError`] if tokenization fails.
487    pub fn tokenize(
488        &self,
489        text: &MtmdInputText<'_>,
490        bitmaps: &[&MtmdBitmap],
491        output: &mut MtmdInputChunks,
492    ) -> Result<()> {
493        // The C signature is: mtmd_tokenize(..., mtmd_bitmap ** bitmaps, ...)
494        // where each element is a `const mtmd_bitmap *`.  We build a Vec of
495        // `*const mtmd_bitmap` and pass a mutable pointer to its first element
496        // (i.e. `*mut *const mtmd_bitmap`) to satisfy the C API.
497        let mut bitmap_ptrs: Vec<*const sys::mtmd_bitmap> = bitmaps
498            .iter()
499            .map(|b| b.ptr.as_ptr().cast_const())
500            .collect();
501
502        let c_text = sys::mtmd_input_text {
503            text: text.c_text.as_ptr(),
504            add_special: text.add_special,
505            parse_special: text.parse_special,
506        };
507
508        let ret = unsafe {
509            sys::mtmd_tokenize(
510                self.ptr.as_ptr(),
511                output.ptr.as_ptr(),
512                &raw const c_text,
513                bitmap_ptrs.as_mut_ptr(),
514                bitmap_ptrs.len(),
515            )
516        };
517
518        if ret != 0 {
519            return Err(MtmdError::TokenizeError(ret));
520        }
521        Ok(())
522    }
523
524    /// Encode a single input chunk (image or audio) and store the resulting
525    /// embeddings inside the context.
526    ///
527    /// After a successful call, the embeddings can be retrieved with
528    /// [`MtmdContext::output_embd`].
529    ///
530    /// This call is **NOT thread-safe**.
531    ///
532    /// # Errors
533    ///
534    /// Returns [`MtmdError::EncodeError`] if encoding fails.
535    pub fn encode_chunk(&self, chunk: &MtmdInputChunk<'_>) -> Result<()> {
536        let ret = unsafe { sys::mtmd_encode_chunk(self.ptr.as_ptr(), chunk.ptr) };
537        if ret != 0 {
538            return Err(MtmdError::EncodeError(ret));
539        }
540        Ok(())
541    }
542
543    /// Return a slice over the embeddings produced by the last
544    /// [`encode_chunk`](Self::encode_chunk) call.
545    ///
546    /// The length (in `f32` elements) is:
547    /// ```text
548    /// n_embd_inp(model)  *  chunk.n_tokens()
549    /// ```
550    ///
551    /// # Safety
552    ///
553    /// The returned slice is valid until the next call that mutates the
554    /// context (e.g. another `encode_chunk`).
555    #[must_use]
556    pub fn output_embd(&self, n_elements: usize) -> &[f32] {
557        let ptr = unsafe { sys::mtmd_get_output_embd(self.ptr.as_ptr()) };
558        if ptr.is_null() || n_elements == 0 {
559            return &[];
560        }
561        unsafe { slice::from_raw_parts(ptr, n_elements) }
562    }
563
564    // ── Helper API ────────────────────────────────────────────────────────
565
566    /// High-level helper: evaluate (decode) all chunks in sequence.
567    ///
568    /// * Text chunks are decoded via `llama_decode`.
569    /// * Image/audio chunks are first encoded with `mtmd_encode_chunk` and
570    ///   then decoded via `llama_decode`.
571    ///
572    /// On success `new_n_past` is updated with the new past position.
573    ///
574    /// This call is **NOT thread-safe**.
575    ///
576    /// # Parameters
577    ///
578    /// * `lctx`        – raw pointer to the llama context (from [`LlamaContext::as_ptr`])
579    /// * `chunks`      – the tokenized chunks to evaluate
580    /// * `n_past`      – current KV-cache position
581    /// * `seq_id`      – sequence ID
582    /// * `n_batch`     – maximum batch size (must be ≥ 1)
583    /// * `logits_last` – if `true`, compute logits only for the final token
584    /// * `new_n_past`  – updated KV-cache position after the call
585    ///
586    /// # Errors
587    ///
588    /// Returns [`MtmdError::EvalError`] if evaluation fails.
589    #[allow(clippy::too_many_arguments, clippy::not_unsafe_ptr_arg_deref)]
590    pub fn eval_chunks(
591        &self,
592        lctx: *mut sys::llama_context,
593        chunks: &MtmdInputChunks,
594        n_past: i32,
595        seq_id: i32,
596        n_batch: i32,
597        logits_last: bool,
598        new_n_past: &mut i32,
599    ) -> Result<()> {
600        let ret = unsafe {
601            sys::mtmd_helper_eval_chunks(
602                self.ptr.as_ptr(),
603                lctx,
604                chunks.ptr.as_ptr(),
605                n_past,
606                seq_id,
607                n_batch,
608                logits_last,
609                new_n_past,
610            )
611        };
612        if ret != 0 {
613            return Err(MtmdError::EvalError(ret));
614        }
615        Ok(())
616    }
617
618    /// High-level helper: evaluate a single chunk.
619    ///
620    /// Works identically to [`eval_chunks`](Self::eval_chunks) but operates on
621    /// one chunk at a time.
622    ///
623    /// # Errors
624    ///
625    /// Returns [`MtmdError::EvalError`] if evaluation fails.
626    #[allow(clippy::too_many_arguments, clippy::not_unsafe_ptr_arg_deref)]
627    pub fn eval_chunk_single(
628        &self,
629        lctx: *mut sys::llama_context,
630        chunk: &MtmdInputChunk<'_>,
631        n_past: i32,
632        seq_id: i32,
633        n_batch: i32,
634        logits_last: bool,
635        new_n_past: &mut i32,
636    ) -> Result<()> {
637        let ret = unsafe {
638            sys::mtmd_helper_eval_chunk_single(
639                self.ptr.as_ptr(),
640                lctx,
641                chunk.ptr,
642                n_past,
643                seq_id,
644                n_batch,
645                logits_last,
646                new_n_past,
647            )
648        };
649        if ret != 0 {
650            return Err(MtmdError::EvalError(ret));
651        }
652        Ok(())
653    }
654
655    /// Decode an image/audio chunk whose embeddings have already been
656    /// computed (e.g. via [`encode_chunk`](Self::encode_chunk) followed by
657    /// [`output_embd`](Self::output_embd)).
658    ///
659    /// Unlike [`eval_chunk_single`](Self::eval_chunk_single), this helper
660    /// handles batching plus the non-causal-attention setup required by
661    /// some models (e.g. Gemma 3, Gemma 4 audio) and the M-RoPE position
662    /// layout. Use it when the embeddings are already in hand and you want
663    /// the helper to take care of `llama_decode` plumbing.
664    ///
665    /// `encoded_embd` must contain `mtmd_image_tokens_get_n_tokens(chunk) *
666    /// llama_model_n_embd_inp(model)` `f32` elements. This call is **NOT
667    /// thread-safe**.
668    ///
669    /// # Errors
670    ///
671    /// Returns [`MtmdError::EvalError`] with code `-1` if `chunk` is not an
672    /// image/audio chunk, or `1` if `llama_decode` fails.
673    #[allow(clippy::too_many_arguments, clippy::not_unsafe_ptr_arg_deref)]
674    pub fn decode_image_chunk(
675        &self,
676        lctx: *mut sys::llama_context,
677        chunk: &MtmdInputChunk<'_>,
678        encoded_embd: &[f32],
679        n_past: i32,
680        seq_id: i32,
681        n_batch: i32,
682        new_n_past: &mut i32,
683    ) -> Result<()> {
684        let ret = unsafe {
685            sys::mtmd_helper_decode_image_chunk(
686                self.ptr.as_ptr(),
687                lctx,
688                chunk.ptr,
689                encoded_embd.as_ptr().cast_mut(),
690                n_past,
691                seq_id,
692                n_batch,
693                new_n_past,
694                // No post-decode callback; preserves prior single-shot behavior.
695                None,
696                std::ptr::null_mut(),
697            )
698        };
699        if ret != 0 {
700            return Err(MtmdError::EvalError(ret));
701        }
702        Ok(())
703    }
704
705    /// Returns a raw pointer to the underlying `mtmd_context`.
706    ///
707    /// # Safety
708    ///
709    /// The returned pointer is valid for the lifetime of this `MtmdContext`.
710    /// The caller must not free it.
711    #[must_use]
712    pub fn as_ptr(&self) -> *mut sys::mtmd_context {
713        self.ptr.as_ptr()
714    }
715}
716
717// ─────────────────────────────────────────────────────────────────────────────
718// MtmdInputText
719// ─────────────────────────────────────────────────────────────────────────────
720
/// Text input for [`MtmdContext::tokenize`].
///
/// The prompt string must contain the media marker (see
/// [`MtmdContext::default_marker`]) once for every bitmap to be embedded.
#[derive(Debug)]
pub struct MtmdInputText<'a> {
    // Owned, NUL-terminated copy of the prompt.
    c_text: CString,
    // Whether BOS/EOS tokens should be added.
    add_special: bool,
    // Whether special tokens embedded in the text should be parsed.
    parse_special: bool,
    // Ties the value to the lifetime of the `&str` it was built from, even
    // though the text itself is copied into `c_text`.
    _marker: std::marker::PhantomData<&'a ()>,
}

impl<'a> MtmdInputText<'a> {
    /// Build an `MtmdInputText`, panicking on invalid text.
    ///
    /// * `text`          – the prompt (must not contain interior NUL bytes)
    /// * `add_special`   – whether to add BOS/EOS tokens
    /// * `parse_special` – whether to parse special tokens embedded in the text
    ///
    /// Use [`try_new`](Self::try_new) for the non-panicking variant.
    ///
    /// # Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    #[must_use]
    pub fn new(text: &'a str, add_special: bool, parse_special: bool) -> Self {
        Self::try_new(text, add_special, parse_special)
            .expect("MtmdInputText: text must not contain NUL bytes")
    }

    /// Build an `MtmdInputText`, reporting invalid text as an error instead
    /// of panicking.
    ///
    /// # Errors
    ///
    /// Returns [`std::ffi::NulError`] if `text` contains a NUL byte.
    pub fn try_new(
        text: &'a str,
        add_special: bool,
        parse_special: bool,
    ) -> std::result::Result<Self, std::ffi::NulError> {
        CString::new(text).map(|c_text| Self {
            c_text,
            add_special,
            parse_special,
            _marker: std::marker::PhantomData,
        })
    }
}
774
775// ─────────────────────────────────────────────────────────────────────────────
776// MtmdBitmap
777// ─────────────────────────────────────────────────────────────────────────────
778
/// An image or audio bitmap ready for multimodal encoding.
///
/// # Image bitmaps
///
/// The raw pixel data must be in RGBRGBRGB… (interleaved) format.  The total
/// number of bytes must be `nx * ny * 3`.
///
/// # Audio bitmaps
///
/// The raw sample data must be little-endian `f32` PCM samples.  The total
/// number of bytes must be `n_samples * 4`.
pub struct MtmdBitmap {
    // Non-null handle to the native `mtmd_bitmap`; released with
    // `mtmd_bitmap_free` when the wrapper is dropped.
    ptr: NonNull<sys::mtmd_bitmap>,
}
793
// NOTE(review): no justification is recorded for these impls. The wrapper
// owns its `mtmd_bitmap` handle (it frees it in `Drop`) and the accessors
// visible in this file take `&self`; presumably the C getters are read-only —
// confirm against upstream before relying on `Sync`.
unsafe impl Send for MtmdBitmap {}
unsafe impl Sync for MtmdBitmap {}
796
797impl std::fmt::Debug for MtmdBitmap {
798    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
799        f.debug_struct("MtmdBitmap")
800            .field("nx", &self.nx())
801            .field("ny", &self.ny())
802            .field("n_bytes", &self.n_bytes())
803            .field("is_audio", &self.is_audio())
804            .finish()
805    }
806}
807
808impl Drop for MtmdBitmap {
809    fn drop(&mut self) {
810        unsafe { sys::mtmd_bitmap_free(self.ptr.as_ptr()) }
811    }
812}
813
814impl MtmdBitmap {
815    /// Create a bitmap from raw RGB pixel data.
816    ///
817    /// * `nx`   – image width in pixels
818    /// * `ny`   – image height in pixels
819    /// * `data` – raw pixel bytes in RGBRGB… format; must be `nx * ny * 3` bytes
820    ///
821    /// # Errors
822    ///
823    /// Returns [`MtmdError::BitmapCreateFailed`] if the underlying C call
824    /// returns null.
825    pub fn from_rgb(nx: u32, ny: u32, data: &[u8]) -> Result<Self> {
826        let ptr = unsafe { sys::mtmd_bitmap_init(nx, ny, data.as_ptr()) };
827        let ptr = NonNull::new(ptr).ok_or(MtmdError::BitmapCreateFailed)?;
828        Ok(Self { ptr })
829    }
830
831    /// Create an audio bitmap from PCM `f32` samples.
832    ///
833    /// * `samples` – slice of PCM float samples
834    ///
835    /// # Errors
836    ///
837    /// Returns [`MtmdError::BitmapCreateFailed`] if the underlying C call
838    /// returns null.
839    pub fn from_audio(samples: &[f32]) -> Result<Self> {
840        let ptr = unsafe { sys::mtmd_bitmap_init_from_audio(samples.len(), samples.as_ptr()) };
841        let ptr = NonNull::new(ptr).ok_or(MtmdError::BitmapCreateFailed)?;
842        Ok(Self { ptr })
843    }
844
845    /// Build an `MtmdBitmap` from a `mtmd_helper_bitmap_wrapper`, taking
846    /// ownership of the `bitmap` and freeing any `video_ctx`.
847    ///
848    /// The `from_file`/`from_buf` constructors only support image/audio input.
849    /// When the input is a video the helper returns a non-null `video_ctx`
850    /// (an open ffmpeg stream) which is not representable as an `MtmdBitmap`;
851    /// we free it here to avoid leaking it. Use [`MtmdVideo`] for video input.
852    fn from_wrapper(wrapper: sys::mtmd_helper_bitmap_wrapper) -> Result<Self> {
853        if !wrapper.video_ctx.is_null() {
854            unsafe { sys::mtmd_helper_video_free(wrapper.video_ctx) };
855        }
856        let ptr = NonNull::new(wrapper.bitmap).ok_or(MtmdError::BitmapCreateFailed)?;
857        Ok(Self { ptr })
858    }
859
860    /// Load a bitmap from a file (image or audio).
861    ///
862    /// Supported image formats: JPEG, PNG, BMP, GIF, and others handled by
863    /// `stb_image`.  Supported audio formats: WAV, MP3, FLAC (via miniaudio).
864    ///
865    /// # Errors
866    ///
867    /// Returns [`MtmdError::BitmapCreateFailed`] if the file cannot be loaded.
868    pub fn from_file(ctx: &MtmdContext, path: impl AsRef<Path>) -> Result<Self> {
869        let path = path.as_ref().to_str().ok_or(MtmdError::PathNotUtf8)?;
870        let c_path = CString::new(path)?;
871
872        // `placeholder = false`: load the real bitmap data (not a token-count
873        // placeholder). For image/audio the returned `video_ctx` is always null.
874        let wrapper = unsafe {
875            sys::mtmd_helper_bitmap_init_from_file(ctx.ptr.as_ptr(), c_path.as_ptr(), false)
876        };
877        Self::from_wrapper(wrapper)
878    }
879
880    /// Load a bitmap from an in-memory buffer containing a file.
881    ///
882    /// The format is auto-detected (image vs audio via magic bytes).
883    ///
884    /// # Errors
885    ///
886    /// Returns [`MtmdError::BitmapCreateFailed`] if decoding fails.
887    pub fn from_buf(ctx: &MtmdContext, buf: &[u8]) -> Result<Self> {
888        // `placeholder = false`: load the real bitmap data (not a token-count
889        // placeholder). For image/audio the returned `video_ctx` is always null.
890        let wrapper = unsafe {
891            sys::mtmd_helper_bitmap_init_from_buf(ctx.ptr.as_ptr(), buf.as_ptr(), buf.len(), false)
892        };
893        Self::from_wrapper(wrapper)
894    }
895
896    // ── Getters ───────────────────────────────────────────────────────────
897
898    /// Width in pixels (for images) or 0 (for audio).
899    #[must_use]
900    pub fn nx(&self) -> u32 {
901        unsafe { sys::mtmd_bitmap_get_nx(self.ptr.as_ptr()) }
902    }
903
904    /// Height in pixels (for images) or 0 (for audio).
905    #[must_use]
906    pub fn ny(&self) -> u32 {
907        unsafe { sys::mtmd_bitmap_get_ny(self.ptr.as_ptr()) }
908    }
909
910    /// Total number of bytes in the bitmap data.
911    #[must_use]
912    pub fn n_bytes(&self) -> usize {
913        unsafe { sys::mtmd_bitmap_get_n_bytes(self.ptr.as_ptr()) }
914    }
915
916    /// Returns `true` if this bitmap contains audio (rather than image) data.
917    #[must_use]
918    pub fn is_audio(&self) -> bool {
919        unsafe { sys::mtmd_bitmap_is_audio(self.ptr.as_ptr()) }
920    }
921
922    /// Return the raw pixel / sample data.
923    #[must_use]
924    pub fn data(&self) -> &[u8] {
925        let n = self.n_bytes();
926        if n == 0 {
927            return &[];
928        }
929        let ptr = unsafe { sys::mtmd_bitmap_get_data(self.ptr.as_ptr()) };
930        unsafe { slice::from_raw_parts(ptr, n) }
931    }
932
933    /// Return the optional ID string attached to this bitmap (used for KV
934    /// cache tracking), or `None` if no ID has been set.
935    #[must_use]
936    pub fn id(&self) -> Option<&str> {
937        let ptr = unsafe { sys::mtmd_bitmap_get_id(self.ptr.as_ptr()) };
938        if ptr.is_null() {
939            return None;
940        }
941        unsafe { CStr::from_ptr(ptr) }.to_str().ok()
942    }
943
944    /// Attach an optional ID string to this bitmap (used for KV cache
945    /// tracking).
946    ///
947    /// # Errors
948    ///
949    /// Returns an error if `id` contains an interior NUL byte.
950    pub fn set_id(&mut self, id: &str) -> std::result::Result<(), std::ffi::NulError> {
951        let cs = CString::new(id)?;
952        unsafe { sys::mtmd_bitmap_set_id(self.ptr.as_ptr(), cs.as_ptr()) };
953        Ok(())
954    }
955}
956
957// ─────────────────────────────────────────────────────────────────────────────
958// Video input
959// ─────────────────────────────────────────────────────────────────────────────
960
// The C allocator's `free()`. `mtmd_helper_video_read_next` hands back its
// timestamp text as a heap string (strdup/malloc on the C side) and documents
// that the caller must release it with `free()`, so we bind it directly here.
extern "C" {
    fn free(ptr: *mut std::ffi::c_void);
}
967
968/// Parameters controlling how a [`MtmdVideo`] stream is opened and sampled.
969///
970/// Obtain a default-initialised instance via [`MtmdVideoParams::default()`]
971/// (which mirrors `mtmd_helper_video_init_params_default`: ~4 fps, native
972/// `ffmpeg`/`ffprobe` from `PATH`, and a 5 s timestamp interval) and tweak it
973/// with the builder methods.
974pub struct MtmdVideoParams {
975    params: sys::mtmd_helper_video_init_params,
976    // Keeps the `ffmpeg_bin_dir` C string alive for as long as `params`
977    // borrows it via a raw pointer.
978    ffmpeg_bin_dir: Option<CString>,
979}
980
981impl std::fmt::Debug for MtmdVideoParams {
982    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
983        f.debug_struct("MtmdVideoParams")
984            .field("fps_target", &self.params.fps_target)
985            .field("timestamp_interval_ms", &self.params.timestamp_interval_ms)
986            .field("ffmpeg_bin_dir", &self.ffmpeg_bin_dir)
987            .finish()
988    }
989}
990
991impl Default for MtmdVideoParams {
992    fn default() -> Self {
993        let params = unsafe { sys::mtmd_helper_video_init_params_default() };
994        Self {
995            params,
996            ffmpeg_bin_dir: None,
997        }
998    }
999}
1000
1001impl MtmdVideoParams {
1002    /// Desired output frame rate. Values `<= 0` mean "use the video's native
1003    /// fps" (the default is ~4 fps).
1004    #[must_use]
1005    pub fn fps_target(mut self, fps: f32) -> Self {
1006        self.params.fps_target = fps;
1007        self
1008    }
1009
1010    /// Interval, in milliseconds, between inserted timestamp text chunks (e.g.
1011    /// `"[10m50.5s]"`). Values `<= 0` disable timestamps (default 5000 ms).
1012    #[must_use]
1013    pub fn timestamp_interval_ms(mut self, ms: i64) -> Self {
1014        self.params.timestamp_interval_ms = ms;
1015        self
1016    }
1017
1018    /// Directory containing the `ffmpeg`/`ffprobe` binaries. Pass `None` to
1019    /// search `PATH` (the default).
1020    ///
1021    /// # Errors
1022    ///
1023    /// Returns an error if `dir` contains an interior NUL byte.
1024    pub fn ffmpeg_bin_dir(mut self, dir: Option<&str>) -> Result<Self> {
1025        match dir {
1026            None => {
1027                self.params.ffmpeg_bin_dir = std::ptr::null();
1028                self.ffmpeg_bin_dir = None;
1029            }
1030            Some(d) => {
1031                let cs = CString::new(d)?;
1032                self.params.ffmpeg_bin_dir = cs.as_ptr();
1033                // Store the owner so the pointer above stays valid.
1034                self.ffmpeg_bin_dir = Some(cs);
1035            }
1036        }
1037        Ok(self)
1038    }
1039}
1040
/// Stream metadata reported for an open [`MtmdVideo`].
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct MtmdVideoInfo {
    /// Width of each frame, in pixels.
    pub width: u32,
    /// Height of each frame, in pixels.
    pub height: u32,
    /// Effective frame rate: the requested `fps_target` when one is set,
    /// otherwise the video's native fps.
    pub fps: f32,
    /// Estimated number of frames at the effective fps; `-1` when unknown.
    pub n_frames: i32,
}
1053
1054/// One item read from a [`MtmdVideo`] stream by [`MtmdVideo::read_next`].
1055#[derive(Debug)]
1056pub enum MtmdVideoItem {
1057    /// A decoded video frame, ready to be tokenized like any other image
1058    /// [`MtmdBitmap`].
1059    Frame(MtmdBitmap),
1060    /// A timestamp text marker (e.g. `"[10m50.5s]"`) to be inserted into the
1061    /// prompt between frames.
1062    Text(String),
1063}
1064
1065/// An open video stream, decoded frame-by-frame via `ffmpeg`.
1066///
1067/// The notion of "video" exists only at the helper level — it is decoded into
1068/// a sequence of image [frames](MtmdVideoItem::Frame) and timestamp
1069/// [text markers](MtmdVideoItem::Text) which are then fed through the normal
1070/// multimodal pipeline.
1071///
1072/// Requires a build with video support (see [`MtmdContext::supports_video`])
1073/// and `ffmpeg`/`ffprobe` available at runtime.
1074///
1075/// # Example
1076///
1077/// ```no_run
1078/// # #[cfg(feature = "mtmd")]
1079/// # fn run(mtmd_ctx: &llama_cpp_4::mtmd::MtmdContext) -> Result<(), llama_cpp_4::mtmd::MtmdError> {
1080/// use std::path::Path;
1081/// use llama_cpp_4::mtmd::{MtmdVideo, MtmdVideoParams, MtmdVideoItem};
1082///
1083/// let mut video = MtmdVideo::from_file(mtmd_ctx, Path::new("clip.mp4"),
1084///                                      &MtmdVideoParams::default())?;
1085/// while let Some(item) = video.read_next()? {
1086///     match item {
1087///         MtmdVideoItem::Frame(bitmap) => { /* tokenize the frame */ }
1088///         MtmdVideoItem::Text(ts)      => { /* insert the timestamp marker */ }
1089///     }
1090/// }
1091/// # Ok(())
1092/// # }
1093/// ```
1094pub struct MtmdVideo {
1095    ptr: NonNull<sys::mtmd_helper_video>,
1096}
1097
1098impl std::fmt::Debug for MtmdVideo {
1099    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1100        f.debug_struct("MtmdVideo")
1101            .field("info", &self.info())
1102            .finish()
1103    }
1104}
1105
1106impl Drop for MtmdVideo {
1107    fn drop(&mut self) {
1108        unsafe { sys::mtmd_helper_video_free(self.ptr.as_ptr()) }
1109    }
1110}
1111
1112impl MtmdVideo {
1113    /// Open a video file for frame-by-frame decoding.
1114    ///
1115    /// # Errors
1116    ///
1117    /// Returns [`MtmdError::VideoInitFailed`] if the stream cannot be opened
1118    /// (no video support compiled in, `ffprobe` not found, file unreadable,
1119    /// …), or [`MtmdError::InvalidPath`] / [`MtmdError::PathNotUtf8`] for a bad
1120    /// path.
1121    pub fn from_file(
1122        ctx: &MtmdContext,
1123        path: impl AsRef<Path>,
1124        params: &MtmdVideoParams,
1125    ) -> Result<Self> {
1126        let path = path.as_ref().to_str().ok_or(MtmdError::PathNotUtf8)?;
1127        let c_path = CString::new(path)?;
1128        let ptr = unsafe {
1129            sys::mtmd_helper_video_init(ctx.ptr.as_ptr(), c_path.as_ptr(), params.params)
1130        };
1131        let ptr = NonNull::new(ptr).ok_or(MtmdError::VideoInitFailed)?;
1132        Ok(Self { ptr })
1133    }
1134
1135    /// Open a video from an in-memory buffer. The buffer is copied internally,
1136    /// so it need not outlive this call.
1137    ///
1138    /// # Errors
1139    ///
1140    /// Returns [`MtmdError::VideoInitFailed`] if the stream cannot be opened.
1141    pub fn from_buf(ctx: &MtmdContext, buf: &[u8], params: &MtmdVideoParams) -> Result<Self> {
1142        let ptr = unsafe {
1143            sys::mtmd_helper_video_init_from_buf(
1144                ctx.ptr.as_ptr(),
1145                buf.as_ptr(),
1146                buf.len(),
1147                params.params,
1148            )
1149        };
1150        let ptr = NonNull::new(ptr).ok_or(MtmdError::VideoInitFailed)?;
1151        Ok(Self { ptr })
1152    }
1153
1154    /// Return metadata (resolution, effective fps, estimated frame count) for
1155    /// this stream.
1156    #[must_use]
1157    pub fn info(&self) -> MtmdVideoInfo {
1158        let info = unsafe { sys::mtmd_helper_video_get_info(self.ptr.as_ptr()) };
1159        MtmdVideoInfo {
1160            width: info.width,
1161            height: info.height,
1162            fps: info.fps,
1163            n_frames: info.n_frames,
1164        }
1165    }
1166
1167    /// Read the next item from the stream.
1168    ///
1169    /// Returns `Ok(Some(item))` for each frame or timestamp marker, and
1170    /// `Ok(None)` once the end of the stream is reached.
1171    ///
1172    /// # Errors
1173    ///
1174    /// Returns [`MtmdError::VideoReadError`] on a decode error.
1175    pub fn read_next(&mut self) -> Result<Option<MtmdVideoItem>> {
1176        let mut out_bitmap: *mut sys::mtmd_bitmap = std::ptr::null_mut();
1177        let mut out_text: *mut std::os::raw::c_char = std::ptr::null_mut();
1178        let ret = unsafe {
1179            sys::mtmd_helper_video_read_next(
1180                self.ptr.as_ptr(),
1181                &raw mut out_bitmap,
1182                &raw mut out_text,
1183            )
1184        };
1185        match ret {
1186            0 => {
1187                if let Some(ptr) = NonNull::new(out_bitmap) {
1188                    Ok(Some(MtmdVideoItem::Frame(MtmdBitmap { ptr })))
1189                } else if !out_text.is_null() {
1190                    let text = unsafe { CStr::from_ptr(out_text) }
1191                        .to_string_lossy()
1192                        .into_owned();
1193                    // The C side allocated this with strdup/malloc; release it.
1194                    unsafe { free(out_text.cast()) };
1195                    Ok(Some(MtmdVideoItem::Text(text)))
1196                } else {
1197                    // Success but nothing produced — treat as end of stream.
1198                    Ok(None)
1199                }
1200            }
1201            -1 => Ok(None), // EOF
1202            other => Err(MtmdError::VideoReadError(other)),
1203        }
1204    }
1205}
1206
1207// ─────────────────────────────────────────────────────────────────────────────
1208// MtmdInputChunks
1209// ─────────────────────────────────────────────────────────────────────────────
1210
1211/// A list of tokenized input chunks produced by [`MtmdContext::tokenize`].
1212///
1213/// Each chunk is either a text token sequence or a set of image/audio tokens.
1214pub struct MtmdInputChunks {
1215    ptr: NonNull<sys::mtmd_input_chunks>,
1216}
1217
1218impl std::fmt::Debug for MtmdInputChunks {
1219    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1220        f.debug_struct("MtmdInputChunks")
1221            .field("len", &self.len())
1222            .finish()
1223    }
1224}
1225
1226impl Drop for MtmdInputChunks {
1227    fn drop(&mut self) {
1228        unsafe { sys::mtmd_input_chunks_free(self.ptr.as_ptr()) }
1229    }
1230}
1231
1232impl MtmdInputChunks {
1233    /// Create a new, empty chunk list.  Populated by
1234    /// [`MtmdContext::tokenize`].
1235    ///
1236    /// # Panics
1237    ///
1238    /// Panics if the underlying C allocation fails (OOM).
1239    #[must_use]
1240    pub fn new() -> Self {
1241        let ptr = unsafe { sys::mtmd_input_chunks_init() };
1242        let ptr = NonNull::new(ptr).expect("mtmd_input_chunks_init returned null");
1243        Self { ptr }
1244    }
1245
1246    /// Number of chunks in this list.
1247    #[must_use]
1248    pub fn len(&self) -> usize {
1249        unsafe { sys::mtmd_input_chunks_size(self.ptr.as_ptr()) }
1250    }
1251
1252    /// Returns `true` if there are no chunks.
1253    #[must_use]
1254    pub fn is_empty(&self) -> bool {
1255        self.len() == 0
1256    }
1257
1258    /// Get the `idx`-th chunk.  Returns `None` if `idx >= len()`.
1259    #[must_use]
1260    pub fn get(&self, idx: usize) -> Option<MtmdInputChunk<'_>> {
1261        if idx >= self.len() {
1262            return None;
1263        }
1264        let ptr = unsafe { sys::mtmd_input_chunks_get(self.ptr.as_ptr(), idx) };
1265        if ptr.is_null() {
1266            return None;
1267        }
1268        Some(MtmdInputChunk {
1269            ptr,
1270            _marker: std::marker::PhantomData,
1271        })
1272    }
1273
1274    /// Iterate over all chunks.
1275    pub fn iter(&self) -> impl Iterator<Item = MtmdInputChunk<'_>> {
1276        (0..self.len()).filter_map(|i| self.get(i))
1277    }
1278
1279    /// Total number of tokens across all chunks.
1280    ///
1281    /// Equivalent to `mtmd_helper_get_n_tokens`.
1282    #[must_use]
1283    pub fn n_tokens(&self) -> usize {
1284        unsafe { sys::mtmd_helper_get_n_tokens(self.ptr.as_ptr()) }
1285    }
1286
1287    /// Total number of *positions* across all chunks (used for KV-cache
1288    /// tracking with M-RoPE models where positions ≠ tokens).
1289    ///
1290    /// Equivalent to `mtmd_helper_get_n_pos`.
1291    #[must_use]
1292    pub fn n_pos(&self) -> i32 {
1293        unsafe { sys::mtmd_helper_get_n_pos(self.ptr.as_ptr()) }
1294    }
1295}
1296
1297impl Default for MtmdInputChunks {
1298    fn default() -> Self {
1299        Self::new()
1300    }
1301}
1302
1303// ─────────────────────────────────────────────────────────────────────────────
1304// MtmdInputChunkType
1305// ─────────────────────────────────────────────────────────────────────────────
1306
/// What kind of content an [`MtmdInputChunk`] carries.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MtmdInputChunkType {
    /// Ordinary text tokens.
    Text,
    /// Image tokens, i.e. embeddings produced by the vision encoder.
    Image,
    /// Audio tokens, i.e. embeddings produced by the audio encoder.
    Audio,
}
1317
1318impl From<sys::mtmd_input_chunk_type> for MtmdInputChunkType {
1319    fn from(v: sys::mtmd_input_chunk_type) -> Self {
1320        // mtmd_input_chunk_type is a plain C `typedef unsigned int`.
1321        // The variants are exported as free-standing constants.
1322        if v == sys::MTMD_INPUT_CHUNK_TYPE_IMAGE {
1323            Self::Image
1324        } else if v == sys::MTMD_INPUT_CHUNK_TYPE_AUDIO {
1325            Self::Audio
1326        } else {
1327            Self::Text
1328        }
1329    }
1330}
1331
1332// ─────────────────────────────────────────────────────────────────────────────
1333// MtmdInputChunk
1334// ─────────────────────────────────────────────────────────────────────────────
1335
1336/// A single tokenized input chunk (text, image, or audio).
1337///
1338/// Instances are borrowed from an [`MtmdInputChunks`] list and live as long
1339/// as that list.
1340#[derive(Debug)]
1341pub struct MtmdInputChunk<'chunks> {
1342    ptr: *const sys::mtmd_input_chunk,
1343    _marker: std::marker::PhantomData<&'chunks MtmdInputChunks>,
1344}
1345
1346impl<'chunks> MtmdInputChunk<'chunks> {
1347    /// The type of this chunk.
1348    #[must_use]
1349    pub fn chunk_type(&self) -> MtmdInputChunkType {
1350        let t = unsafe { sys::mtmd_input_chunk_get_type(self.ptr) };
1351        MtmdInputChunkType::from(t)
1352    }
1353
1354    /// Total number of tokens in this chunk.
1355    #[must_use]
1356    pub fn n_tokens(&self) -> usize {
1357        unsafe { sys::mtmd_input_chunk_get_n_tokens(self.ptr) }
1358    }
1359
1360    /// Number of temporal positions (equals `n_tokens` for non-M-RoPE models).
1361    #[must_use]
1362    pub fn n_pos(&self) -> i32 {
1363        unsafe { sys::mtmd_input_chunk_get_n_pos(self.ptr) }
1364    }
1365
1366    /// Return the raw llama token IDs for a **text** chunk.
1367    ///
1368    /// Returns `None` if this chunk is not a text chunk.
1369    #[must_use]
1370    pub fn text_tokens(&self) -> Option<&[i32]> {
1371        if self.chunk_type() != MtmdInputChunkType::Text {
1372            return None;
1373        }
1374        let mut n: usize = 0;
1375        let ptr = unsafe { sys::mtmd_input_chunk_get_tokens_text(self.ptr, &raw mut n) };
1376        if ptr.is_null() || n == 0 {
1377            return Some(&[]);
1378        }
1379        Some(unsafe { slice::from_raw_parts(ptr, n) })
1380    }
1381
1382    /// Return the image token metadata for an **image** or **audio** chunk.
1383    ///
1384    /// Returns `None` for text chunks.
1385    #[must_use]
1386    pub fn image_tokens(&self) -> Option<MtmdImageTokens<'chunks>> {
1387        match self.chunk_type() {
1388            MtmdInputChunkType::Image | MtmdInputChunkType::Audio => {}
1389            MtmdInputChunkType::Text => return None,
1390        }
1391        let ptr = unsafe { sys::mtmd_input_chunk_get_tokens_image(self.ptr) };
1392        if ptr.is_null() {
1393            return None;
1394        }
1395        Some(MtmdImageTokens {
1396            ptr,
1397            _marker: std::marker::PhantomData,
1398        })
1399    }
1400
1401    /// Optional ID attached to this chunk (used for KV cache tracking).
1402    #[must_use]
1403    pub fn id(&self) -> Option<&str> {
1404        let ptr = unsafe { sys::mtmd_input_chunk_get_id(self.ptr) };
1405        if ptr.is_null() {
1406            return None;
1407        }
1408        unsafe { CStr::from_ptr(ptr) }.to_str().ok()
1409    }
1410
1411    /// Returns the raw `*const mtmd_input_chunk` pointer.
1412    ///
1413    /// # Safety
1414    ///
1415    /// The returned pointer is valid for the lifetime of the parent
1416    /// `MtmdInputChunks`.
1417    #[must_use]
1418    pub fn as_ptr(&self) -> *const sys::mtmd_input_chunk {
1419        self.ptr
1420    }
1421}
1422
1423// ─────────────────────────────────────────────────────────────────────────────
1424// MtmdDecoderPos
1425// ─────────────────────────────────────────────────────────────────────────────
1426
/// Position of a single token as consumed by M-RoPE decoder attention.
///
/// `t` runs along the temporal axis and `x`/`y` along the spatial axes; `z`
/// is reserved for future use. All values are *relative* to the base `pos_0`
/// supplied when the position is computed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[repr(C)]
pub struct MtmdDecoderPos {
    /// Index along the temporal axis.
    pub t: u32,
    /// Coordinate along the spatial X axis.
    pub x: u32,
    /// Coordinate along the spatial Y axis.
    pub y: u32,
    /// Reserved for future use.
    pub z: u32,
}
1444
1445// ─────────────────────────────────────────────────────────────────────────────
1446// MtmdImageTokens
1447// ─────────────────────────────────────────────────────────────────────────────
1448
1449/// Image/audio token metadata attached to a non-text [`MtmdInputChunk`].
1450#[derive(Debug)]
1451pub struct MtmdImageTokens<'chunks> {
1452    ptr: *const sys::mtmd_image_tokens,
1453    _marker: std::marker::PhantomData<&'chunks MtmdInputChunks>,
1454}
1455
1456impl MtmdImageTokens<'_> {
1457    /// Total number of embedding tokens.
1458    #[must_use]
1459    pub fn n_tokens(&self) -> usize {
1460        unsafe { sys::mtmd_image_tokens_get_n_tokens(self.ptr) }
1461    }
1462
1463    /// Width of the token grid.
1464    #[must_use]
1465    pub fn nx(&self) -> usize {
1466        unsafe { sys::mtmd_image_tokens_get_nx(self.ptr) }
1467    }
1468
1469    /// Height of the token grid.
1470    #[must_use]
1471    pub fn ny(&self) -> usize {
1472        unsafe { sys::mtmd_image_tokens_get_ny(self.ptr) }
1473    }
1474
1475    /// Number of temporal positions (M-RoPE variant; equals `n_tokens` otherwise).
1476    #[must_use]
1477    pub fn n_pos(&self) -> i32 {
1478        unsafe { sys::mtmd_image_tokens_get_n_pos(self.ptr) }
1479    }
1480
1481    /// Optional ID for KV cache tracking.
1482    #[must_use]
1483    pub fn id(&self) -> Option<&str> {
1484        let ptr = unsafe { sys::mtmd_image_tokens_get_id(self.ptr) };
1485        if ptr.is_null() {
1486            return None;
1487        }
1488        unsafe { CStr::from_ptr(ptr) }.to_str().ok()
1489    }
1490
1491    /// Compute the per-token decoder positions used by M-RoPE models.
1492    ///
1493    /// Returns a vector of length [`n_tokens`](Self::n_tokens). Each entry
1494    /// is relative to `pos_0`; for non-M-RoPE models this typically reduces
1495    /// to `(0, i, 0, 0)` for the i-th token.
1496    ///
1497    /// Wraps `mtmd_helper_image_get_decoder_pos`.
1498    #[must_use]
1499    pub fn decoder_positions(&self, pos_0: i32) -> Vec<MtmdDecoderPos> {
1500        let n = self.n_tokens();
1501        let mut out = vec![MtmdDecoderPos::default(); n];
1502        if n == 0 {
1503            return out;
1504        }
1505        unsafe {
1506            sys::mtmd_helper_image_get_decoder_pos(
1507                self.ptr,
1508                pos_0,
1509                out.as_mut_ptr().cast::<sys::mtmd_decoder_pos>(),
1510            );
1511        }
1512        out
1513    }
1514}
1515
1516// ─────────────────────────────────────────────────────────────────────────────
1517// LlamaContext extension
1518// ─────────────────────────────────────────────────────────────────────────────
1519
1520use crate::context::LlamaContext;
1521
1522impl LlamaContext<'_> {
1523    /// Expose the raw `llama_context` pointer for use with mtmd helpers.
1524    ///
1525    /// # Safety
1526    ///
1527    /// The pointer is valid for the lifetime of this `LlamaContext` and must
1528    /// not be freed by the caller.
1529    #[must_use]
1530    pub fn as_ptr(&self) -> *mut sys::llama_context {
1531        self.context.as_ptr()
1532    }
1533}
1534
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    /// `MtmdImageTokens::decoder_positions` casts a `*mut MtmdDecoderPos` to
    /// `*mut sys::mtmd_decoder_pos` at the FFI boundary; this guards the
    /// layout assumption behind that cast.
    #[test]
    fn decoder_pos_layout_matches_sys() {
        assert_eq!(
            mem::size_of::<MtmdDecoderPos>(),
            mem::size_of::<sys::mtmd_decoder_pos>(),
        );
        assert_eq!(
            mem::align_of::<MtmdDecoderPos>(),
            mem::align_of::<sys::mtmd_decoder_pos>(),
        );

        // (actual offset, expected offset) for each field, in declaration order.
        let offsets = [
            (mem::offset_of!(MtmdDecoderPos, t), 0),
            (mem::offset_of!(MtmdDecoderPos, x), 4),
            (mem::offset_of!(MtmdDecoderPos, y), 8),
            (mem::offset_of!(MtmdDecoderPos, z), 12),
        ];
        for (actual, expected) in offsets {
            assert_eq!(actual, expected);
        }
    }
}
llama_cpp_4/mtmd.rs

llama_cpp_4/
mtmd.rs