llama_cpp_4/
mtmd.rs

1//! Safe wrappers for the `libmtmd` multimodal support library.
2//!
3//! `libmtmd` extends llama.cpp with the ability to encode image and audio
4//! inputs (bitmaps) into token embeddings that can then be fed into a
5//! standard [`llama_decode`] call alongside normal text tokens.
6//!
7//! # Quick-start
8//!
9//! ```no_run
10//! # #[cfg(feature = "mtmd")]
11//! # {
12//! use std::path::Path;
13//! use llama_cpp_4::{
14//!     llama_backend::LlamaBackend,
15//!     model::{LlamaModel, params::LlamaModelParams, AddBos},
16//!     context::params::LlamaContextParams,
17//!     mtmd::{MtmdContext, MtmdContextParams, MtmdBitmap, MtmdInputChunks, MtmdInputText},
18//! };
19//!
20//! let backend  = LlamaBackend::init().unwrap();
21//! let model    = LlamaModel::load_from_file(&backend, Path::new("model.gguf"),
22//!                                            &LlamaModelParams::default()).unwrap();
23//! let mut lctx = model.new_context(&backend, LlamaContextParams::default()).unwrap();
24//!
25//! // Load the multimodal projector (mmproj) model.
26//! let ctx_params = MtmdContextParams::default();
27//! let mtmd_ctx   = MtmdContext::init_from_file(Path::new("mmproj.gguf"), &model, ctx_params)
28//!                               .unwrap();
29//!
30//! // Load an image from a file.
31//! let bitmap = MtmdBitmap::from_file(&mtmd_ctx, Path::new("image.jpg")).unwrap();
32//!
33//! // Tokenize a prompt that contains the media marker.
34//! let marker  = MtmdContext::default_marker();
35//! let prompt  = format!("Describe this image: {marker}");
36//! let text    = MtmdInputText::new(&prompt, true, true);
37//! let bitmaps = [&bitmap];
38//!
39//! let mut chunks = MtmdInputChunks::new();
40//! mtmd_ctx.tokenize(&text, &bitmaps, &mut chunks).unwrap();
41//!
42//! // Evaluate / decode all chunks.
43//! let n_batch = lctx.n_batch() as i32;
44//! let mut n_past = 0i32;
45//! mtmd_ctx.eval_chunks(lctx.as_ptr(), &chunks, 0, 0, n_batch, true, &mut n_past).unwrap();
46//! # }
47//! ```
48//!
49//! # Feature flag
50//!
51//! This module is only compiled when the `mtmd` Cargo feature is enabled.
52
53use std::ffi::{CStr, CString};
54use std::path::Path;
55use std::ptr::NonNull;
56use std::slice;
57
58use llama_cpp_sys_4 as sys;
59
60use crate::model::LlamaModel;
61
62// ─────────────────────────────────────────────────────────────────────────────
63// Error types
64// ─────────────────────────────────────────────────────────────────────────────
65
/// All errors that can be returned by the mtmd module.
#[derive(Debug, thiserror::Error)]
pub enum MtmdError {
    /// The context could not be created (e.g. bad mmproj file).
    ///
    /// Produced when `mtmd_init_from_file` hands back a null pointer.
    #[error("failed to create mtmd context (null return from mtmd_init_from_file)")]
    ContextCreateFailed,

    /// The bitmap could not be created (the C constructor / loader returned
    /// a null bitmap pointer).
    #[error("failed to create mtmd bitmap")]
    BitmapCreateFailed,

    /// A string could not be converted to a C string because it contains an
    /// interior NUL byte.
    ///
    /// Despite the variant name this is what any `CString::new(..)?` in this
    /// module converts into, so it is reported for paths *and* for the
    /// media-marker override. Non-UTF-8 paths are reported separately as
    /// [`MtmdError::PathNotUtf8`].
    #[error("invalid path: {0}")]
    InvalidPath(#[from] std::ffi::NulError),

    /// A path was not representable as UTF-8.
    #[error("path is not valid UTF-8")]
    PathNotUtf8,

    /// `mtmd_tokenize` returned a non-zero error code (carried verbatim).
    #[error("tokenize error: code {0} (1 = bitmap count mismatch, 2 = preprocessing error)")]
    TokenizeError(i32),

    /// `mtmd_encode_chunk` returned a non-zero code (carried verbatim).
    #[error("encode error: code {0}")]
    EncodeError(i32),

    /// `mtmd_helper_eval_chunks` (or the single-chunk / image-chunk variant)
    /// returned a non-zero code (carried verbatim).
    #[error("eval error: code {0}")]
    EvalError(i32),

    /// A video stream could not be opened. Common causes: the build lacks
    /// video support (`MTMD_VIDEO` was OFF), `ffmpeg`/`ffprobe` is not on
    /// `PATH`, or the file is unreadable.
    #[error("failed to open video stream (null return from mtmd_helper_video_init)")]
    VideoInitFailed,

    /// `mtmd_helper_video_read_next` returned an error code (`-2`).
    #[error("video read error: code {0}")]
    VideoReadError(i32),
}
107
108/// A convenience `Result` alias for this module.
109pub type Result<T> = std::result::Result<T, MtmdError>;
110
111// ─────────────────────────────────────────────────────────────────────────────
112// MtmdContextParams
113// ─────────────────────────────────────────────────────────────────────────────
114
115/// Parameters used when creating an [`MtmdContext`].
116///
117/// Obtain a default-initialised instance via [`MtmdContextParams::default()`].
118pub struct MtmdContextParams {
119    pub(crate) params: sys::mtmd_context_params,
120}
121
122impl std::fmt::Debug for MtmdContextParams {
123    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124        f.debug_struct("MtmdContextParams")
125            .field("use_gpu", &self.params.use_gpu)
126            .field("print_timings", &self.params.print_timings)
127            .field("n_threads", &self.params.n_threads)
128            .field("warmup", &self.params.warmup)
129            .field("image_min_tokens", &self.params.image_min_tokens)
130            .field("image_max_tokens", &self.params.image_max_tokens)
131            .finish()
132    }
133}
134
135impl Default for MtmdContextParams {
136    fn default() -> Self {
137        let params = unsafe { sys::mtmd_context_params_default() };
138        Self { params }
139    }
140}
141
142impl MtmdContextParams {
143    /// Whether to run the vision/audio encoder on the GPU (default: `true`).
144    #[must_use]
145    pub fn use_gpu(mut self, v: bool) -> Self {
146        self.params.use_gpu = v;
147        self
148    }
149
150    /// Whether to print timing info after each encode (default: `false`).
151    #[must_use]
152    pub fn print_timings(mut self, v: bool) -> Self {
153        self.params.print_timings = v;
154        self
155    }
156
157    /// Number of threads used for the vision encoder (default taken from
158    /// `mtmd_context_params_default`).
159    #[must_use]
160    pub fn n_threads(mut self, n: i32) -> Self {
161        self.params.n_threads = n;
162        self
163    }
164
165    /// Whether to run a warm-up encode pass after initialisation.
166    #[must_use]
167    pub fn warmup(mut self, v: bool) -> Self {
168        self.params.warmup = v;
169        self
170    }
171
172    /// Minimum number of image tokens (0 = use model default).
173    #[must_use]
174    pub fn image_min_tokens(mut self, n: i32) -> Self {
175        self.params.image_min_tokens = n;
176        self
177    }
178
179    /// Maximum number of image tokens (0 = use model default).
180    #[must_use]
181    pub fn image_max_tokens(mut self, n: i32) -> Self {
182        self.params.image_max_tokens = n;
183        self
184    }
185
186    /// Override the media marker string (e.g. `"<image>"`).
187    ///
188    /// The provided string must not contain interior NUL bytes.  Pass `None`
189    /// to use the library default (`mtmd_default_marker()`).
190    ///
191    /// **Note:** the `CString` is stored inside the params so the pointer
192    /// remains valid as long as this `MtmdContextParams` lives.
193    /// # Errors
194    ///
195    /// Returns [`MtmdError`] if the marker string contains a NUL byte.
196    pub fn media_marker(mut self, marker: Option<&str>) -> std::result::Result<Self, MtmdError> {
197        match marker {
198            None => {
199                self.params.media_marker = std::ptr::null();
200                Ok(self)
201            }
202            Some(s) => {
203                let cs = CString::new(s)?;
204                self.params.media_marker = cs.as_ptr();
205                // Leak the CString so the raw pointer stays valid; the caller
206                // must ensure the params don't outlive the string.  Since
207                // MtmdContextParams is consumed by MtmdContext::init_from_file,
208                // this is safe.
209                std::mem::forget(cs);
210                Ok(self)
211            }
212        }
213    }
214}
215
216// ─────────────────────────────────────────────────────────────────────────────
217// MtmdContext
218// ─────────────────────────────────────────────────────────────────────────────
219
220/// The main multimodal context.
221///
222/// Wraps a `mtmd_context *`.  This context is tied to a specific mmproj model
223/// file and a loaded [`LlamaModel`].  It is safe to share across threads for
224/// `tokenize` calls (read-only), but `encode_chunk` / eval helpers mutate
225/// internal state and must not be called concurrently.
226pub struct MtmdContext {
227    ptr: NonNull<sys::mtmd_context>,
228}
229
230// The underlying mtmd_context is internally synchronised for tokenize().
231// encode / decode must be called from a single thread at a time (caller's
232// responsibility, enforced by the inference semaphore in the server).
233unsafe impl Send for MtmdContext {}
234unsafe impl Sync for MtmdContext {}
235
236impl std::fmt::Debug for MtmdContext {
237    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
238        f.debug_struct("MtmdContext")
239            .field("ptr", &self.ptr)
240            .finish()
241    }
242}
243
244impl Drop for MtmdContext {
245    fn drop(&mut self) {
246        unsafe { sys::mtmd_free(self.ptr.as_ptr()) }
247    }
248}
249
250impl MtmdContext {
251    /// Returns the default media marker string used in prompts
252    /// (currently `"<__media__>"`).
253    #[must_use]
254    pub fn default_marker() -> &'static str {
255        let ptr = unsafe { sys::mtmd_default_marker() };
256        unsafe { CStr::from_ptr(ptr) }
257            .to_str()
258            .unwrap_or("<__media__>")
259    }
260
261    /// Initialise a multimodal context from an mmproj GGUF file.
262    ///
263    /// # Parameters
264    ///
265    /// * `mmproj_path` – path to the mmproj `.gguf` file
266    /// * `text_model`  – the already-loaded text model
267    /// * `params`      – context parameters (use [`MtmdContextParams::default()`])
268    ///
269    /// # Errors
270    ///
271    /// Returns [`MtmdError::ContextCreateFailed`] if the underlying C call
272    /// returns a null pointer.
273    #[allow(clippy::needless_pass_by_value)]
274    pub fn init_from_file(
275        mmproj_path: impl AsRef<Path>,
276        text_model: &LlamaModel,
277        params: MtmdContextParams,
278    ) -> Result<Self> {
279        let path = mmproj_path
280            .as_ref()
281            .to_str()
282            .ok_or(MtmdError::PathNotUtf8)?;
283        let c_path = CString::new(path)?;
284
285        let ptr = unsafe {
286            sys::mtmd_init_from_file(c_path.as_ptr(), text_model.model.as_ptr(), params.params)
287        };
288
289        let ptr = NonNull::new(ptr).ok_or(MtmdError::ContextCreateFailed)?;
290        Ok(Self { ptr })
291    }
292
293    // ── Logging ──────────────────────────────────────────────────────────
294
295    /// Silence all clip/mtmd log output by installing a no-op callback.
296    ///
297    /// Call this right after [`init_from_file`](Self::init_from_file) to
298    /// suppress the verbose `clip_model_loader: tensor[N]…` lines that
299    /// clip.cpp emits to its own private logger (separate from `llama_log_set`).
300    pub fn void_logs() {
301        unsafe extern "C" fn noop(
302            _level: sys::ggml_log_level,
303            _text: *const ::std::os::raw::c_char,
304            _ud: *mut ::std::os::raw::c_void,
305        ) {
306        }
307        unsafe { sys::mtmd_log_set(Some(noop), std::ptr::null_mut()) };
308    }
309
310    /// Like [`void_logs`](Self::void_logs), but additionally silences logs
311    /// emitted by the `mtmd_helper_*` layer (e.g. eval/decode helpers).
312    ///
313    /// Internally calls `mtmd_helper_log_set` which also routes through
314    /// `mtmd_log_set`, so this is a strict superset of `void_logs`.
315    pub fn void_helper_logs() {
316        unsafe extern "C" fn noop(
317            _level: sys::ggml_log_level,
318            _text: *const ::std::os::raw::c_char,
319            _ud: *mut ::std::os::raw::c_void,
320        ) {
321        }
322        unsafe { sys::mtmd_helper_log_set(Some(noop), std::ptr::null_mut()) };
323    }
324
325    // ── Capability queries ────────────────────────────────────────────────
326
327    /// Returns `true` if the model supports vision (image) input.
328    #[must_use]
329    pub fn supports_vision(&self) -> bool {
330        unsafe { sys::mtmd_support_vision(self.ptr.as_ptr()) }
331    }
332
333    /// Returns `true` if the model supports audio input.
334    #[must_use]
335    pub fn supports_audio(&self) -> bool {
336        unsafe { sys::mtmd_support_audio(self.ptr.as_ptr()) }
337    }
338
339    /// Returns `true` if this build and model support video input.
340    ///
341    /// Video support additionally requires `ffmpeg`/`ffprobe` to be available
342    /// at runtime (see [`MtmdVideo`]). Wraps `mtmd_helper_support_video`.
343    #[must_use]
344    pub fn supports_video(&self) -> bool {
345        unsafe { sys::mtmd_helper_support_video(self.ptr.as_ptr()) }
346    }
347
348    /// Returns the media marker string configured for *this* context.
349    ///
350    /// Unlike [`default_marker`](Self::default_marker) (the library-wide
351    /// default), this reflects any override passed via
352    /// [`MtmdContextParams::media_marker`]. Wraps `mtmd_get_marker`.
353    #[must_use]
354    pub fn marker(&self) -> &str {
355        let ptr = unsafe { sys::mtmd_get_marker(self.ptr.as_ptr()) };
356        if ptr.is_null() {
357            return Self::default_marker();
358        }
359        unsafe { CStr::from_ptr(ptr) }
360            .to_str()
361            .unwrap_or_else(|_| Self::default_marker())
362    }
363
364    /// Returns the audio sample rate in Hz (e.g. 16 000 for Whisper), or
365    /// `-1` if audio is not supported.
366    #[must_use]
367    #[deprecated(note = "use audio_sample_rate() instead")]
368    pub fn audio_bitrate(&self) -> i32 {
369        self.audio_sample_rate()
370    }
371
372    /// Returns the audio sample rate in Hz.
373    #[must_use]
374    pub fn audio_sample_rate(&self) -> i32 {
375        unsafe { sys::mtmd_get_audio_sample_rate(self.ptr.as_ptr()) }
376    }
377
378    /// Whether `llama_decode` must use a non-causal attention mask when
379    /// decoding image embeddings for this model.
380    #[must_use]
381    pub fn decode_use_non_causal(&self, chunk: &MtmdInputChunk<'_>) -> bool {
382        unsafe { sys::mtmd_decode_use_non_causal(self.ptr.as_ptr(), chunk.as_ptr()) }
383    }
384
385    /// Whether the model uses M-RoPE for `llama_decode`.
386    #[must_use]
387    pub fn decode_use_mrope(&self) -> bool {
388        unsafe { sys::mtmd_decode_use_mrope(self.ptr.as_ptr()) }
389    }
390
391    // ── Core API ──────────────────────────────────────────────────────────
392
393    /// Tokenize a text prompt that contains one or more media markers.
394    ///
395    /// The number of `bitmaps` must equal the number of media markers in the
396    /// prompt text, otherwise [`MtmdError::TokenizeError(1)`] is returned.
397    ///
398    /// This call is **thread-safe** (shared `&self`).
399    ///
400    /// # Parameters
401    ///
402    /// * `text`    – text + tokenisation options
403    /// * `bitmaps` – slice of [`MtmdBitmap`] references, one per media marker
404    /// * `output`  – an [`MtmdInputChunks`] that will be populated with the result
405    ///
406    /// # Errors
407    ///
408    /// Returns [`MtmdError::TokenizeError`] if tokenization fails.
409    pub fn tokenize(
410        &self,
411        text: &MtmdInputText<'_>,
412        bitmaps: &[&MtmdBitmap],
413        output: &mut MtmdInputChunks,
414    ) -> Result<()> {
415        // The C signature is: mtmd_tokenize(..., mtmd_bitmap ** bitmaps, ...)
416        // where each element is a `const mtmd_bitmap *`.  We build a Vec of
417        // `*const mtmd_bitmap` and pass a mutable pointer to its first element
418        // (i.e. `*mut *const mtmd_bitmap`) to satisfy the C API.
419        let mut bitmap_ptrs: Vec<*const sys::mtmd_bitmap> = bitmaps
420            .iter()
421            .map(|b| b.ptr.as_ptr().cast_const())
422            .collect();
423
424        let c_text = sys::mtmd_input_text {
425            text: text.c_text.as_ptr(),
426            add_special: text.add_special,
427            parse_special: text.parse_special,
428        };
429
430        let ret = unsafe {
431            sys::mtmd_tokenize(
432                self.ptr.as_ptr(),
433                output.ptr.as_ptr(),
434                &raw const c_text,
435                bitmap_ptrs.as_mut_ptr(),
436                bitmap_ptrs.len(),
437            )
438        };
439
440        if ret != 0 {
441            return Err(MtmdError::TokenizeError(ret));
442        }
443        Ok(())
444    }
445
446    /// Encode a single input chunk (image or audio) and store the resulting
447    /// embeddings inside the context.
448    ///
449    /// After a successful call, the embeddings can be retrieved with
450    /// [`MtmdContext::output_embd`].
451    ///
452    /// This call is **NOT thread-safe**.
453    ///
454    /// # Errors
455    ///
456    /// Returns [`MtmdError::EncodeError`] if encoding fails.
457    pub fn encode_chunk(&self, chunk: &MtmdInputChunk<'_>) -> Result<()> {
458        let ret = unsafe { sys::mtmd_encode_chunk(self.ptr.as_ptr(), chunk.ptr) };
459        if ret != 0 {
460            return Err(MtmdError::EncodeError(ret));
461        }
462        Ok(())
463    }
464
465    /// Return a slice over the embeddings produced by the last
466    /// [`encode_chunk`](Self::encode_chunk) call.
467    ///
468    /// The length (in `f32` elements) is:
469    /// ```text
470    /// n_embd_inp(model)  *  chunk.n_tokens()
471    /// ```
472    ///
473    /// # Safety
474    ///
475    /// The returned slice is valid until the next call that mutates the
476    /// context (e.g. another `encode_chunk`).
477    #[must_use]
478    pub fn output_embd(&self, n_elements: usize) -> &[f32] {
479        let ptr = unsafe { sys::mtmd_get_output_embd(self.ptr.as_ptr()) };
480        if ptr.is_null() || n_elements == 0 {
481            return &[];
482        }
483        unsafe { slice::from_raw_parts(ptr, n_elements) }
484    }
485
486    // ── Helper API ────────────────────────────────────────────────────────
487
488    /// High-level helper: evaluate (decode) all chunks in sequence.
489    ///
490    /// * Text chunks are decoded via `llama_decode`.
491    /// * Image/audio chunks are first encoded with `mtmd_encode_chunk` and
492    ///   then decoded via `llama_decode`.
493    ///
494    /// On success `new_n_past` is updated with the new past position.
495    ///
496    /// This call is **NOT thread-safe**.
497    ///
498    /// # Parameters
499    ///
500    /// * `lctx`        – raw pointer to the llama context (from [`LlamaContext::as_ptr`])
501    /// * `chunks`      – the tokenized chunks to evaluate
502    /// * `n_past`      – current KV-cache position
503    /// * `seq_id`      – sequence ID
504    /// * `n_batch`     – maximum batch size (must be ≥ 1)
505    /// * `logits_last` – if `true`, compute logits only for the final token
506    /// * `new_n_past`  – updated KV-cache position after the call
507    ///
508    /// # Errors
509    ///
510    /// Returns [`MtmdError::EvalError`] if evaluation fails.
511    #[allow(clippy::too_many_arguments, clippy::not_unsafe_ptr_arg_deref)]
512    pub fn eval_chunks(
513        &self,
514        lctx: *mut sys::llama_context,
515        chunks: &MtmdInputChunks,
516        n_past: i32,
517        seq_id: i32,
518        n_batch: i32,
519        logits_last: bool,
520        new_n_past: &mut i32,
521    ) -> Result<()> {
522        let ret = unsafe {
523            sys::mtmd_helper_eval_chunks(
524                self.ptr.as_ptr(),
525                lctx,
526                chunks.ptr.as_ptr(),
527                n_past,
528                seq_id,
529                n_batch,
530                logits_last,
531                new_n_past,
532            )
533        };
534        if ret != 0 {
535            return Err(MtmdError::EvalError(ret));
536        }
537        Ok(())
538    }
539
540    /// High-level helper: evaluate a single chunk.
541    ///
542    /// Works identically to [`eval_chunks`](Self::eval_chunks) but operates on
543    /// one chunk at a time.
544    ///
545    /// # Errors
546    ///
547    /// Returns [`MtmdError::EvalError`] if evaluation fails.
548    #[allow(clippy::too_many_arguments, clippy::not_unsafe_ptr_arg_deref)]
549    pub fn eval_chunk_single(
550        &self,
551        lctx: *mut sys::llama_context,
552        chunk: &MtmdInputChunk<'_>,
553        n_past: i32,
554        seq_id: i32,
555        n_batch: i32,
556        logits_last: bool,
557        new_n_past: &mut i32,
558    ) -> Result<()> {
559        let ret = unsafe {
560            sys::mtmd_helper_eval_chunk_single(
561                self.ptr.as_ptr(),
562                lctx,
563                chunk.ptr,
564                n_past,
565                seq_id,
566                n_batch,
567                logits_last,
568                new_n_past,
569            )
570        };
571        if ret != 0 {
572            return Err(MtmdError::EvalError(ret));
573        }
574        Ok(())
575    }
576
577    /// Decode an image/audio chunk whose embeddings have already been
578    /// computed (e.g. via [`encode_chunk`](Self::encode_chunk) followed by
579    /// [`output_embd`](Self::output_embd)).
580    ///
581    /// Unlike [`eval_chunk_single`](Self::eval_chunk_single), this helper
582    /// handles batching plus the non-causal-attention setup required by
583    /// some models (e.g. Gemma 3, Gemma 4 audio) and the M-RoPE position
584    /// layout. Use it when the embeddings are already in hand and you want
585    /// the helper to take care of `llama_decode` plumbing.
586    ///
587    /// `encoded_embd` must contain `mtmd_image_tokens_get_n_tokens(chunk) *
588    /// llama_model_n_embd_inp(model)` `f32` elements. This call is **NOT
589    /// thread-safe**.
590    ///
591    /// # Errors
592    ///
593    /// Returns [`MtmdError::EvalError`] with code `-1` if `chunk` is not an
594    /// image/audio chunk, or `1` if `llama_decode` fails.
595    #[allow(clippy::too_many_arguments, clippy::not_unsafe_ptr_arg_deref)]
596    pub fn decode_image_chunk(
597        &self,
598        lctx: *mut sys::llama_context,
599        chunk: &MtmdInputChunk<'_>,
600        encoded_embd: &[f32],
601        n_past: i32,
602        seq_id: i32,
603        n_batch: i32,
604        new_n_past: &mut i32,
605    ) -> Result<()> {
606        let ret = unsafe {
607            sys::mtmd_helper_decode_image_chunk(
608                self.ptr.as_ptr(),
609                lctx,
610                chunk.ptr,
611                encoded_embd.as_ptr().cast_mut(),
612                n_past,
613                seq_id,
614                n_batch,
615                new_n_past,
616                // No post-decode callback; preserves prior single-shot behavior.
617                None,
618                std::ptr::null_mut(),
619            )
620        };
621        if ret != 0 {
622            return Err(MtmdError::EvalError(ret));
623        }
624        Ok(())
625    }
626
627    /// Returns a raw pointer to the underlying `mtmd_context`.
628    ///
629    /// # Safety
630    ///
631    /// The returned pointer is valid for the lifetime of this `MtmdContext`.
632    /// The caller must not free it.
633    #[must_use]
634    pub fn as_ptr(&self) -> *mut sys::mtmd_context {
635        self.ptr.as_ptr()
636    }
637}
638
639// ─────────────────────────────────────────────────────────────────────────────
640// MtmdInputText
641// ─────────────────────────────────────────────────────────────────────────────
642
/// Prompt text plus tokenisation flags, as consumed by
/// [`MtmdContext::tokenize`].
///
/// The prompt has to contain the media marker (see
/// [`MtmdContext::default_marker`]) exactly once per bitmap that should be
/// embedded.
#[derive(Debug)]
pub struct MtmdInputText<'a> {
    // NUL-terminated copy of the prompt, owned by this value.
    c_text: CString,
    add_special: bool,
    parse_special: bool,
    _marker: std::marker::PhantomData<&'a ()>,
}

impl<'a> MtmdInputText<'a> {
    /// Build an `MtmdInputText`, panicking on invalid text.
    ///
    /// * `text`          – the prompt (must not contain interior NUL bytes)
    /// * `add_special`   – whether to add BOS/EOS tokens
    /// * `parse_special` – whether to parse special tokens embedded in the text
    ///
    /// # Panics
    ///
    /// Panics if `text` contains an interior NUL byte. Use
    /// [`try_new`](Self::try_new) for a non-panicking variant.
    #[must_use]
    pub fn new(text: &'a str, add_special: bool, parse_special: bool) -> Self {
        Self::try_new(text, add_special, parse_special)
            .expect("MtmdInputText: text must not contain NUL bytes")
    }

    /// Fallible counterpart of [`new`](Self::new): reports an interior NUL
    /// byte in `text` as an error instead of panicking.
    ///
    /// # Errors
    ///
    /// Returns [`std::ffi::NulError`] if `text` contains a NUL byte.
    pub fn try_new(
        text: &'a str,
        add_special: bool,
        parse_special: bool,
    ) -> std::result::Result<Self, std::ffi::NulError> {
        CString::new(text).map(|c_text| Self {
            c_text,
            add_special,
            parse_special,
            _marker: std::marker::PhantomData,
        })
    }
}
696
697// ─────────────────────────────────────────────────────────────────────────────
698// MtmdBitmap
699// ─────────────────────────────────────────────────────────────────────────────
700
701/// An image or audio bitmap ready for multimodal encoding.
702///
703/// # Image bitmaps
704///
705/// The raw pixel data must be in RGBRGBRGB… (interleaved) format.  The total
706/// number of bytes must be `nx * ny * 3`.
707///
708/// # Audio bitmaps
709///
710/// The raw sample data must be little-endian `f32` PCM samples.  The total
711/// number of bytes must be `n_samples * 4`.
712pub struct MtmdBitmap {
713    ptr: NonNull<sys::mtmd_bitmap>,
714}
715
716unsafe impl Send for MtmdBitmap {}
717unsafe impl Sync for MtmdBitmap {}
718
719impl std::fmt::Debug for MtmdBitmap {
720    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
721        f.debug_struct("MtmdBitmap")
722            .field("nx", &self.nx())
723            .field("ny", &self.ny())
724            .field("n_bytes", &self.n_bytes())
725            .field("is_audio", &self.is_audio())
726            .finish()
727    }
728}
729
730impl Drop for MtmdBitmap {
731    fn drop(&mut self) {
732        unsafe { sys::mtmd_bitmap_free(self.ptr.as_ptr()) }
733    }
734}
735
736impl MtmdBitmap {
737    /// Create a bitmap from raw RGB pixel data.
738    ///
739    /// * `nx`   – image width in pixels
740    /// * `ny`   – image height in pixels
741    /// * `data` – raw pixel bytes in RGBRGB… format; must be `nx * ny * 3` bytes
742    ///
743    /// # Errors
744    ///
745    /// Returns [`MtmdError::BitmapCreateFailed`] if the underlying C call
746    /// returns null.
747    pub fn from_rgb(nx: u32, ny: u32, data: &[u8]) -> Result<Self> {
748        let ptr = unsafe { sys::mtmd_bitmap_init(nx, ny, data.as_ptr()) };
749        let ptr = NonNull::new(ptr).ok_or(MtmdError::BitmapCreateFailed)?;
750        Ok(Self { ptr })
751    }
752
753    /// Create an audio bitmap from PCM `f32` samples.
754    ///
755    /// * `samples` – slice of PCM float samples
756    ///
757    /// # Errors
758    ///
759    /// Returns [`MtmdError::BitmapCreateFailed`] if the underlying C call
760    /// returns null.
761    pub fn from_audio(samples: &[f32]) -> Result<Self> {
762        let ptr = unsafe { sys::mtmd_bitmap_init_from_audio(samples.len(), samples.as_ptr()) };
763        let ptr = NonNull::new(ptr).ok_or(MtmdError::BitmapCreateFailed)?;
764        Ok(Self { ptr })
765    }
766
767    /// Build an `MtmdBitmap` from a `mtmd_helper_bitmap_wrapper`, taking
768    /// ownership of the `bitmap` and freeing any `video_ctx`.
769    ///
770    /// The `from_file`/`from_buf` constructors only support image/audio input.
771    /// When the input is a video the helper returns a non-null `video_ctx`
772    /// (an open ffmpeg stream) which is not representable as an `MtmdBitmap`;
773    /// we free it here to avoid leaking it. Use [`MtmdVideo`] for video input.
774    fn from_wrapper(wrapper: sys::mtmd_helper_bitmap_wrapper) -> Result<Self> {
775        if !wrapper.video_ctx.is_null() {
776            unsafe { sys::mtmd_helper_video_free(wrapper.video_ctx) };
777        }
778        let ptr = NonNull::new(wrapper.bitmap).ok_or(MtmdError::BitmapCreateFailed)?;
779        Ok(Self { ptr })
780    }
781
782    /// Load a bitmap from a file (image or audio).
783    ///
784    /// Supported image formats: JPEG, PNG, BMP, GIF, and others handled by
785    /// `stb_image`.  Supported audio formats: WAV, MP3, FLAC (via miniaudio).
786    ///
787    /// # Errors
788    ///
789    /// Returns [`MtmdError::BitmapCreateFailed`] if the file cannot be loaded.
790    pub fn from_file(ctx: &MtmdContext, path: impl AsRef<Path>) -> Result<Self> {
791        let path = path.as_ref().to_str().ok_or(MtmdError::PathNotUtf8)?;
792        let c_path = CString::new(path)?;
793
794        // `placeholder = false`: load the real bitmap data (not a token-count
795        // placeholder). For image/audio the returned `video_ctx` is always null.
796        let wrapper = unsafe {
797            sys::mtmd_helper_bitmap_init_from_file(ctx.ptr.as_ptr(), c_path.as_ptr(), false)
798        };
799        Self::from_wrapper(wrapper)
800    }
801
802    /// Load a bitmap from an in-memory buffer containing a file.
803    ///
804    /// The format is auto-detected (image vs audio via magic bytes).
805    ///
806    /// # Errors
807    ///
808    /// Returns [`MtmdError::BitmapCreateFailed`] if decoding fails.
809    pub fn from_buf(ctx: &MtmdContext, buf: &[u8]) -> Result<Self> {
810        // `placeholder = false`: load the real bitmap data (not a token-count
811        // placeholder). For image/audio the returned `video_ctx` is always null.
812        let wrapper = unsafe {
813            sys::mtmd_helper_bitmap_init_from_buf(ctx.ptr.as_ptr(), buf.as_ptr(), buf.len(), false)
814        };
815        Self::from_wrapper(wrapper)
816    }
817
818    // ── Getters ───────────────────────────────────────────────────────────
819
820    /// Width in pixels (for images) or 0 (for audio).
821    #[must_use]
822    pub fn nx(&self) -> u32 {
823        unsafe { sys::mtmd_bitmap_get_nx(self.ptr.as_ptr()) }
824    }
825
826    /// Height in pixels (for images) or 0 (for audio).
827    #[must_use]
828    pub fn ny(&self) -> u32 {
829        unsafe { sys::mtmd_bitmap_get_ny(self.ptr.as_ptr()) }
830    }
831
832    /// Total number of bytes in the bitmap data.
833    #[must_use]
834    pub fn n_bytes(&self) -> usize {
835        unsafe { sys::mtmd_bitmap_get_n_bytes(self.ptr.as_ptr()) }
836    }
837
838    /// Returns `true` if this bitmap contains audio (rather than image) data.
839    #[must_use]
840    pub fn is_audio(&self) -> bool {
841        unsafe { sys::mtmd_bitmap_is_audio(self.ptr.as_ptr()) }
842    }
843
844    /// Return the raw pixel / sample data.
845    #[must_use]
846    pub fn data(&self) -> &[u8] {
847        let n = self.n_bytes();
848        if n == 0 {
849            return &[];
850        }
851        let ptr = unsafe { sys::mtmd_bitmap_get_data(self.ptr.as_ptr()) };
852        unsafe { slice::from_raw_parts(ptr, n) }
853    }
854
855    /// Return the optional ID string attached to this bitmap (used for KV
856    /// cache tracking), or `None` if no ID has been set.
857    #[must_use]
858    pub fn id(&self) -> Option<&str> {
859        let ptr = unsafe { sys::mtmd_bitmap_get_id(self.ptr.as_ptr()) };
860        if ptr.is_null() {
861            return None;
862        }
863        unsafe { CStr::from_ptr(ptr) }.to_str().ok()
864    }
865
866    /// Attach an optional ID string to this bitmap (used for KV cache
867    /// tracking).
868    ///
869    /// # Errors
870    ///
871    /// Returns an error if `id` contains an interior NUL byte.
872    pub fn set_id(&mut self, id: &str) -> std::result::Result<(), std::ffi::NulError> {
873        let cs = CString::new(id)?;
874        unsafe { sys::mtmd_bitmap_set_id(self.ptr.as_ptr(), cs.as_ptr()) };
875        Ok(())
876    }
877}
878
879// ─────────────────────────────────────────────────────────────────────────────
880// Video input
881// ─────────────────────────────────────────────────────────────────────────────
882
// Binding for the C runtime's `free()`. It releases the heap-allocated text
// handed back by `mtmd_helper_video_read_next`: the C side allocates that
// string with strdup/malloc and documents that the caller must release it
// with `free()`.
extern "C" {
    fn free(ptr: *mut std::ffi::c_void);
}
889
890/// Parameters controlling how a [`MtmdVideo`] stream is opened and sampled.
891///
892/// Obtain a default-initialised instance via [`MtmdVideoParams::default()`]
893/// (which mirrors `mtmd_helper_video_init_params_default`: ~4 fps, native
894/// `ffmpeg`/`ffprobe` from `PATH`, and a 5 s timestamp interval) and tweak it
895/// with the builder methods.
896pub struct MtmdVideoParams {
897    params: sys::mtmd_helper_video_init_params,
898    // Keeps the `ffmpeg_bin_dir` C string alive for as long as `params`
899    // borrows it via a raw pointer.
900    ffmpeg_bin_dir: Option<CString>,
901}
902
903impl std::fmt::Debug for MtmdVideoParams {
904    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
905        f.debug_struct("MtmdVideoParams")
906            .field("fps_target", &self.params.fps_target)
907            .field("timestamp_interval_ms", &self.params.timestamp_interval_ms)
908            .field("ffmpeg_bin_dir", &self.ffmpeg_bin_dir)
909            .finish()
910    }
911}
912
913impl Default for MtmdVideoParams {
914    fn default() -> Self {
915        let params = unsafe { sys::mtmd_helper_video_init_params_default() };
916        Self {
917            params,
918            ffmpeg_bin_dir: None,
919        }
920    }
921}
922
923impl MtmdVideoParams {
924    /// Desired output frame rate. Values `<= 0` mean "use the video's native
925    /// fps" (the default is ~4 fps).
926    #[must_use]
927    pub fn fps_target(mut self, fps: f32) -> Self {
928        self.params.fps_target = fps;
929        self
930    }
931
932    /// Interval, in milliseconds, between inserted timestamp text chunks (e.g.
933    /// `"[10m50.5s]"`). Values `<= 0` disable timestamps (default 5000 ms).
934    #[must_use]
935    pub fn timestamp_interval_ms(mut self, ms: i64) -> Self {
936        self.params.timestamp_interval_ms = ms;
937        self
938    }
939
940    /// Directory containing the `ffmpeg`/`ffprobe` binaries. Pass `None` to
941    /// search `PATH` (the default).
942    ///
943    /// # Errors
944    ///
945    /// Returns an error if `dir` contains an interior NUL byte.
946    pub fn ffmpeg_bin_dir(mut self, dir: Option<&str>) -> Result<Self> {
947        match dir {
948            None => {
949                self.params.ffmpeg_bin_dir = std::ptr::null();
950                self.ffmpeg_bin_dir = None;
951            }
952            Some(d) => {
953                let cs = CString::new(d)?;
954                self.params.ffmpeg_bin_dir = cs.as_ptr();
955                // Store the owner so the pointer above stays valid.
956                self.ffmpeg_bin_dir = Some(cs);
957            }
958        }
959        Ok(self)
960    }
961}
962
/// Stream metadata reported for an open [`MtmdVideo`].
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct MtmdVideoInfo {
    /// Width of each frame, in pixels.
    pub width: u32,
    /// Height of each frame, in pixels.
    pub height: u32,
    /// Effective frame rate: the `fps_target` when one is set, otherwise the
    /// video's native fps.
    pub fps: f32,
    /// Estimated number of frames at the effective fps, or `-1` when the
    /// count is unknown.
    pub n_frames: i32,
}
975
976/// One item read from a [`MtmdVideo`] stream by [`MtmdVideo::read_next`].
977#[derive(Debug)]
978pub enum MtmdVideoItem {
979    /// A decoded video frame, ready to be tokenized like any other image
980    /// [`MtmdBitmap`].
981    Frame(MtmdBitmap),
982    /// A timestamp text marker (e.g. `"[10m50.5s]"`) to be inserted into the
983    /// prompt between frames.
984    Text(String),
985}
986
987/// An open video stream, decoded frame-by-frame via `ffmpeg`.
988///
989/// The notion of "video" exists only at the helper level — it is decoded into
990/// a sequence of image [frames](MtmdVideoItem::Frame) and timestamp
991/// [text markers](MtmdVideoItem::Text) which are then fed through the normal
992/// multimodal pipeline.
993///
994/// Requires a build with video support (see [`MtmdContext::supports_video`])
995/// and `ffmpeg`/`ffprobe` available at runtime.
996///
997/// # Example
998///
999/// ```no_run
1000/// # #[cfg(feature = "mtmd")]
1001/// # fn run(mtmd_ctx: &llama_cpp_4::mtmd::MtmdContext) -> Result<(), llama_cpp_4::mtmd::MtmdError> {
1002/// use std::path::Path;
1003/// use llama_cpp_4::mtmd::{MtmdVideo, MtmdVideoParams, MtmdVideoItem};
1004///
1005/// let mut video = MtmdVideo::from_file(mtmd_ctx, Path::new("clip.mp4"),
1006///                                      &MtmdVideoParams::default())?;
1007/// while let Some(item) = video.read_next()? {
1008///     match item {
1009///         MtmdVideoItem::Frame(bitmap) => { /* tokenize the frame */ }
1010///         MtmdVideoItem::Text(ts)      => { /* insert the timestamp marker */ }
1011///     }
1012/// }
1013/// # Ok(())
1014/// # }
1015/// ```
1016pub struct MtmdVideo {
1017    ptr: NonNull<sys::mtmd_helper_video>,
1018}
1019
1020impl std::fmt::Debug for MtmdVideo {
1021    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1022        f.debug_struct("MtmdVideo").field("info", &self.info()).finish()
1023    }
1024}
1025
1026impl Drop for MtmdVideo {
1027    fn drop(&mut self) {
1028        unsafe { sys::mtmd_helper_video_free(self.ptr.as_ptr()) }
1029    }
1030}
1031
1032impl MtmdVideo {
1033    /// Open a video file for frame-by-frame decoding.
1034    ///
1035    /// # Errors
1036    ///
1037    /// Returns [`MtmdError::VideoInitFailed`] if the stream cannot be opened
1038    /// (no video support compiled in, `ffprobe` not found, file unreadable,
1039    /// …), or [`MtmdError::InvalidPath`] / [`MtmdError::PathNotUtf8`] for a bad
1040    /// path.
1041    pub fn from_file(
1042        ctx: &MtmdContext,
1043        path: impl AsRef<Path>,
1044        params: &MtmdVideoParams,
1045    ) -> Result<Self> {
1046        let path = path.as_ref().to_str().ok_or(MtmdError::PathNotUtf8)?;
1047        let c_path = CString::new(path)?;
1048        let ptr = unsafe {
1049            sys::mtmd_helper_video_init(ctx.ptr.as_ptr(), c_path.as_ptr(), params.params)
1050        };
1051        let ptr = NonNull::new(ptr).ok_or(MtmdError::VideoInitFailed)?;
1052        Ok(Self { ptr })
1053    }
1054
1055    /// Open a video from an in-memory buffer. The buffer is copied internally,
1056    /// so it need not outlive this call.
1057    ///
1058    /// # Errors
1059    ///
1060    /// Returns [`MtmdError::VideoInitFailed`] if the stream cannot be opened.
1061    pub fn from_buf(ctx: &MtmdContext, buf: &[u8], params: &MtmdVideoParams) -> Result<Self> {
1062        let ptr = unsafe {
1063            sys::mtmd_helper_video_init_from_buf(
1064                ctx.ptr.as_ptr(),
1065                buf.as_ptr(),
1066                buf.len(),
1067                params.params,
1068            )
1069        };
1070        let ptr = NonNull::new(ptr).ok_or(MtmdError::VideoInitFailed)?;
1071        Ok(Self { ptr })
1072    }
1073
1074    /// Return metadata (resolution, effective fps, estimated frame count) for
1075    /// this stream.
1076    #[must_use]
1077    pub fn info(&self) -> MtmdVideoInfo {
1078        let info = unsafe { sys::mtmd_helper_video_get_info(self.ptr.as_ptr()) };
1079        MtmdVideoInfo {
1080            width: info.width,
1081            height: info.height,
1082            fps: info.fps,
1083            n_frames: info.n_frames,
1084        }
1085    }
1086
1087    /// Read the next item from the stream.
1088    ///
1089    /// Returns `Ok(Some(item))` for each frame or timestamp marker, and
1090    /// `Ok(None)` once the end of the stream is reached.
1091    ///
1092    /// # Errors
1093    ///
1094    /// Returns [`MtmdError::VideoReadError`] on a decode error.
1095    pub fn read_next(&mut self) -> Result<Option<MtmdVideoItem>> {
1096        let mut out_bitmap: *mut sys::mtmd_bitmap = std::ptr::null_mut();
1097        let mut out_text: *mut std::os::raw::c_char = std::ptr::null_mut();
1098        let ret = unsafe {
1099            sys::mtmd_helper_video_read_next(self.ptr.as_ptr(), &raw mut out_bitmap, &raw mut out_text)
1100        };
1101        match ret {
1102            0 => {
1103                if let Some(ptr) = NonNull::new(out_bitmap) {
1104                    Ok(Some(MtmdVideoItem::Frame(MtmdBitmap { ptr })))
1105                } else if !out_text.is_null() {
1106                    let text = unsafe { CStr::from_ptr(out_text) }
1107                        .to_string_lossy()
1108                        .into_owned();
1109                    // The C side allocated this with strdup/malloc; release it.
1110                    unsafe { free(out_text.cast()) };
1111                    Ok(Some(MtmdVideoItem::Text(text)))
1112                } else {
1113                    // Success but nothing produced — treat as end of stream.
1114                    Ok(None)
1115                }
1116            }
1117            -1 => Ok(None), // EOF
1118            other => Err(MtmdError::VideoReadError(other)),
1119        }
1120    }
1121}
1122
1123// ─────────────────────────────────────────────────────────────────────────────
1124// MtmdInputChunks
1125// ─────────────────────────────────────────────────────────────────────────────
1126
1127/// A list of tokenized input chunks produced by [`MtmdContext::tokenize`].
1128///
1129/// Each chunk is either a text token sequence or a set of image/audio tokens.
1130pub struct MtmdInputChunks {
1131    ptr: NonNull<sys::mtmd_input_chunks>,
1132}
1133
1134impl std::fmt::Debug for MtmdInputChunks {
1135    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1136        f.debug_struct("MtmdInputChunks")
1137            .field("len", &self.len())
1138            .finish()
1139    }
1140}
1141
1142impl Drop for MtmdInputChunks {
1143    fn drop(&mut self) {
1144        unsafe { sys::mtmd_input_chunks_free(self.ptr.as_ptr()) }
1145    }
1146}
1147
1148impl MtmdInputChunks {
1149    /// Create a new, empty chunk list.  Populated by
1150    /// [`MtmdContext::tokenize`].
1151    ///
1152    /// # Panics
1153    ///
1154    /// Panics if the underlying C allocation fails (OOM).
1155    #[must_use]
1156    pub fn new() -> Self {
1157        let ptr = unsafe { sys::mtmd_input_chunks_init() };
1158        let ptr = NonNull::new(ptr).expect("mtmd_input_chunks_init returned null");
1159        Self { ptr }
1160    }
1161
1162    /// Number of chunks in this list.
1163    #[must_use]
1164    pub fn len(&self) -> usize {
1165        unsafe { sys::mtmd_input_chunks_size(self.ptr.as_ptr()) }
1166    }
1167
1168    /// Returns `true` if there are no chunks.
1169    #[must_use]
1170    pub fn is_empty(&self) -> bool {
1171        self.len() == 0
1172    }
1173
1174    /// Get the `idx`-th chunk.  Returns `None` if `idx >= len()`.
1175    #[must_use]
1176    pub fn get(&self, idx: usize) -> Option<MtmdInputChunk<'_>> {
1177        if idx >= self.len() {
1178            return None;
1179        }
1180        let ptr = unsafe { sys::mtmd_input_chunks_get(self.ptr.as_ptr(), idx) };
1181        if ptr.is_null() {
1182            return None;
1183        }
1184        Some(MtmdInputChunk {
1185            ptr,
1186            _marker: std::marker::PhantomData,
1187        })
1188    }
1189
1190    /// Iterate over all chunks.
1191    pub fn iter(&self) -> impl Iterator<Item = MtmdInputChunk<'_>> {
1192        (0..self.len()).filter_map(|i| self.get(i))
1193    }
1194
1195    /// Total number of tokens across all chunks.
1196    ///
1197    /// Equivalent to `mtmd_helper_get_n_tokens`.
1198    #[must_use]
1199    pub fn n_tokens(&self) -> usize {
1200        unsafe { sys::mtmd_helper_get_n_tokens(self.ptr.as_ptr()) }
1201    }
1202
1203    /// Total number of *positions* across all chunks (used for KV-cache
1204    /// tracking with M-RoPE models where positions ≠ tokens).
1205    ///
1206    /// Equivalent to `mtmd_helper_get_n_pos`.
1207    #[must_use]
1208    pub fn n_pos(&self) -> i32 {
1209        unsafe { sys::mtmd_helper_get_n_pos(self.ptr.as_ptr()) }
1210    }
1211}
1212
1213impl Default for MtmdInputChunks {
1214    fn default() -> Self {
1215        Self::new()
1216    }
1217}
1218
1219// ─────────────────────────────────────────────────────────────────────────────
1220// MtmdInputChunkType
1221// ─────────────────────────────────────────────────────────────────────────────
1222
/// What kind of content an [`MtmdInputChunk`] carries.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum MtmdInputChunkType {
    /// Ordinary text tokens.
    Text,
    /// Image tokens, i.e. embeddings produced by the vision encoder.
    Image,
    /// Audio tokens, i.e. embeddings produced by the audio encoder.
    Audio,
}
1233
1234impl From<sys::mtmd_input_chunk_type> for MtmdInputChunkType {
1235    fn from(v: sys::mtmd_input_chunk_type) -> Self {
1236        // mtmd_input_chunk_type is a plain C `typedef unsigned int`.
1237        // The variants are exported as free-standing constants.
1238        if v == sys::MTMD_INPUT_CHUNK_TYPE_IMAGE {
1239            Self::Image
1240        } else if v == sys::MTMD_INPUT_CHUNK_TYPE_AUDIO {
1241            Self::Audio
1242        } else {
1243            Self::Text
1244        }
1245    }
1246}
1247
1248// ─────────────────────────────────────────────────────────────────────────────
1249// MtmdInputChunk
1250// ─────────────────────────────────────────────────────────────────────────────
1251
1252/// A single tokenized input chunk (text, image, or audio).
1253///
1254/// Instances are borrowed from an [`MtmdInputChunks`] list and live as long
1255/// as that list.
1256#[derive(Debug)]
1257pub struct MtmdInputChunk<'chunks> {
1258    ptr: *const sys::mtmd_input_chunk,
1259    _marker: std::marker::PhantomData<&'chunks MtmdInputChunks>,
1260}
1261
1262impl<'chunks> MtmdInputChunk<'chunks> {
1263    /// The type of this chunk.
1264    #[must_use]
1265    pub fn chunk_type(&self) -> MtmdInputChunkType {
1266        let t = unsafe { sys::mtmd_input_chunk_get_type(self.ptr) };
1267        MtmdInputChunkType::from(t)
1268    }
1269
1270    /// Total number of tokens in this chunk.
1271    #[must_use]
1272    pub fn n_tokens(&self) -> usize {
1273        unsafe { sys::mtmd_input_chunk_get_n_tokens(self.ptr) }
1274    }
1275
1276    /// Number of temporal positions (equals `n_tokens` for non-M-RoPE models).
1277    #[must_use]
1278    pub fn n_pos(&self) -> i32 {
1279        unsafe { sys::mtmd_input_chunk_get_n_pos(self.ptr) }
1280    }
1281
1282    /// Return the raw llama token IDs for a **text** chunk.
1283    ///
1284    /// Returns `None` if this chunk is not a text chunk.
1285    #[must_use]
1286    pub fn text_tokens(&self) -> Option<&[i32]> {
1287        if self.chunk_type() != MtmdInputChunkType::Text {
1288            return None;
1289        }
1290        let mut n: usize = 0;
1291        let ptr = unsafe { sys::mtmd_input_chunk_get_tokens_text(self.ptr, &raw mut n) };
1292        if ptr.is_null() || n == 0 {
1293            return Some(&[]);
1294        }
1295        Some(unsafe { slice::from_raw_parts(ptr, n) })
1296    }
1297
1298    /// Return the image token metadata for an **image** or **audio** chunk.
1299    ///
1300    /// Returns `None` for text chunks.
1301    #[must_use]
1302    pub fn image_tokens(&self) -> Option<MtmdImageTokens<'chunks>> {
1303        match self.chunk_type() {
1304            MtmdInputChunkType::Image | MtmdInputChunkType::Audio => {}
1305            MtmdInputChunkType::Text => return None,
1306        }
1307        let ptr = unsafe { sys::mtmd_input_chunk_get_tokens_image(self.ptr) };
1308        if ptr.is_null() {
1309            return None;
1310        }
1311        Some(MtmdImageTokens {
1312            ptr,
1313            _marker: std::marker::PhantomData,
1314        })
1315    }
1316
1317    /// Optional ID attached to this chunk (used for KV cache tracking).
1318    #[must_use]
1319    pub fn id(&self) -> Option<&str> {
1320        let ptr = unsafe { sys::mtmd_input_chunk_get_id(self.ptr) };
1321        if ptr.is_null() {
1322            return None;
1323        }
1324        unsafe { CStr::from_ptr(ptr) }.to_str().ok()
1325    }
1326
1327    /// Returns the raw `*const mtmd_input_chunk` pointer.
1328    ///
1329    /// # Safety
1330    ///
1331    /// The returned pointer is valid for the lifetime of the parent
1332    /// `MtmdInputChunks`.
1333    #[must_use]
1334    pub fn as_ptr(&self) -> *const sys::mtmd_input_chunk {
1335        self.ptr
1336    }
1337}
1338
1339// ─────────────────────────────────────────────────────────────────────────────
1340// MtmdDecoderPos
1341// ─────────────────────────────────────────────────────────────────────────────
1342
/// Position of one token as seen by M-RoPE decoder attention.
///
/// `t` runs along the temporal axis and `x`/`y` along the spatial axes; `z`
/// is reserved for future use. All values are *relative* to the base `pos_0`
/// supplied when the position is computed.
///
/// The struct is `#[repr(C)]` because a buffer of it is cast to
/// `sys::mtmd_decoder_pos` at the FFI boundary.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[repr(C)]
pub struct MtmdDecoderPos {
    /// Index along the temporal axis.
    pub t: u32,
    /// Spatial X coordinate.
    pub x: u32,
    /// Spatial Y coordinate.
    pub y: u32,
    /// Reserved for future use.
    pub z: u32,
}
1360
1361// ─────────────────────────────────────────────────────────────────────────────
1362// MtmdImageTokens
1363// ─────────────────────────────────────────────────────────────────────────────
1364
1365/// Image/audio token metadata attached to a non-text [`MtmdInputChunk`].
1366#[derive(Debug)]
1367pub struct MtmdImageTokens<'chunks> {
1368    ptr: *const sys::mtmd_image_tokens,
1369    _marker: std::marker::PhantomData<&'chunks MtmdInputChunks>,
1370}
1371
1372impl MtmdImageTokens<'_> {
1373    /// Total number of embedding tokens.
1374    #[must_use]
1375    pub fn n_tokens(&self) -> usize {
1376        unsafe { sys::mtmd_image_tokens_get_n_tokens(self.ptr) }
1377    }
1378
1379    /// Width of the token grid.
1380    #[must_use]
1381    pub fn nx(&self) -> usize {
1382        unsafe { sys::mtmd_image_tokens_get_nx(self.ptr) }
1383    }
1384
1385    /// Height of the token grid.
1386    #[must_use]
1387    pub fn ny(&self) -> usize {
1388        unsafe { sys::mtmd_image_tokens_get_ny(self.ptr) }
1389    }
1390
1391    /// Number of temporal positions (M-RoPE variant; equals `n_tokens` otherwise).
1392    #[must_use]
1393    pub fn n_pos(&self) -> i32 {
1394        unsafe { sys::mtmd_image_tokens_get_n_pos(self.ptr) }
1395    }
1396
1397    /// Optional ID for KV cache tracking.
1398    #[must_use]
1399    pub fn id(&self) -> Option<&str> {
1400        let ptr = unsafe { sys::mtmd_image_tokens_get_id(self.ptr) };
1401        if ptr.is_null() {
1402            return None;
1403        }
1404        unsafe { CStr::from_ptr(ptr) }.to_str().ok()
1405    }
1406
1407    /// Compute the per-token decoder positions used by M-RoPE models.
1408    ///
1409    /// Returns a vector of length [`n_tokens`](Self::n_tokens). Each entry
1410    /// is relative to `pos_0`; for non-M-RoPE models this typically reduces
1411    /// to `(0, i, 0, 0)` for the i-th token.
1412    ///
1413    /// Wraps `mtmd_helper_image_get_decoder_pos`.
1414    #[must_use]
1415    pub fn decoder_positions(&self, pos_0: i32) -> Vec<MtmdDecoderPos> {
1416        let n = self.n_tokens();
1417        let mut out = vec![MtmdDecoderPos::default(); n];
1418        if n == 0 {
1419            return out;
1420        }
1421        unsafe {
1422            sys::mtmd_helper_image_get_decoder_pos(
1423                self.ptr,
1424                pos_0,
1425                out.as_mut_ptr().cast::<sys::mtmd_decoder_pos>(),
1426            );
1427        }
1428        out
1429    }
1430}
1431
1432// ─────────────────────────────────────────────────────────────────────────────
1433// LlamaContext extension
1434// ─────────────────────────────────────────────────────────────────────────────
1435
1436use crate::context::LlamaContext;
1437
1438impl LlamaContext<'_> {
1439    /// Expose the raw `llama_context` pointer for use with mtmd helpers.
1440    ///
1441    /// # Safety
1442    ///
1443    /// The pointer is valid for the lifetime of this `LlamaContext` and must
1444    /// not be freed by the caller.
1445    #[must_use]
1446    pub fn as_ptr(&self) -> *mut sys::llama_context {
1447        self.context.as_ptr()
1448    }
1449}
1450
#[cfg(test)]
mod tests {
    use super::*;

    /// `MtmdImageTokens::decoder_positions` casts a `MtmdDecoderPos` buffer
    /// to `sys::mtmd_decoder_pos` at the FFI boundary, so the two types must
    /// agree in size and alignment and the Rust fields must sit at the
    /// expected offsets.
    #[test]
    fn decoder_pos_layout_matches_sys() {
        let rust_size = std::mem::size_of::<MtmdDecoderPos>();
        let sys_size = std::mem::size_of::<sys::mtmd_decoder_pos>();
        assert_eq!(rust_size, sys_size);

        let rust_align = std::mem::align_of::<MtmdDecoderPos>();
        let sys_align = std::mem::align_of::<sys::mtmd_decoder_pos>();
        assert_eq!(rust_align, sys_align);

        // (actual offset, expected offset) for each field, in declaration order.
        let offsets = [
            (std::mem::offset_of!(MtmdDecoderPos, t), 0),
            (std::mem::offset_of!(MtmdDecoderPos, x), 4),
            (std::mem::offset_of!(MtmdDecoderPos, y), 8),
            (std::mem::offset_of!(MtmdDecoderPos, z), 12),
        ];
        for (actual, expected) in offsets {
            assert_eq!(actual, expected);
        }
    }
}
llama_cpp_4/mtmd.rs

llama_cpp_4/
mtmd.rs