Skip to main content

llama_cpp_bindings/
model.rs

1//! A safe wrapper around `llama_model`.
2
3pub mod add_bos;
4pub mod llama_chat_message;
5pub mod llama_chat_template;
6pub mod llama_lora_adapter;
7pub mod llama_split_mode_parse_error;
8pub mod params;
9pub mod rope_type;
10pub mod split_mode;
11pub mod vocab_type;
12pub mod vocab_type_from_int_error;
13
14use std::ffi::{CStr, CString, c_char};
15use std::num::NonZeroU16;
16use std::os::raw::c_int;
17use std::path::Path;
18use std::ptr;
19use std::ptr::NonNull;
20use std::sync::Arc;
21use std::sync::OnceLock;
22
23use toktrie::ApproximateTokEnv;
24use toktrie::TokRxInfo;
25use toktrie::TokTrie;
26
27use llama_cpp_bindings_types::ParsedChatMessage;
28use llama_cpp_bindings_types::ParsedToolCall;
29use llama_cpp_bindings_types::ReasoningMarkers;
30use llama_cpp_bindings_types::ToolCallArguments;
31use llama_cpp_bindings_types::ToolCallMarkers;
32
33use crate::chat_message_parse_outcome::ChatMessageParseOutcome;
34use crate::llama_backend::LlamaBackend;
35use crate::llama_token_attrs::LlamaTokenAttrs;
36use crate::llama_token_attrs_from_int_error::LlamaTokenAttrsFromIntError;
37use crate::raw_chat_message::RawChatMessage;
38use crate::resolved_tool_call_markers::ResolvedToolCallMarkers;
39use crate::sampled_token::SampledToken;
40use crate::sampled_token_classifier::SampledTokenClassifier;
41use crate::streaming_markers::StreamingMarkers;
42use crate::token::LlamaToken;
43use crate::tool_call_format;
44use crate::tool_call_format::ToolCallFormatOutcome;
45use crate::tool_call_template_overrides;
46use crate::{
47    ApplyChatTemplateError, ChatTemplateError, LlamaLoraAdapterInitError, LlamaModelLoadError,
48    MarkerDetectionError, MetaValError, ParseChatMessageError, StringToTokenError,
49    TokenToStringError,
50};
51
52pub use add_bos::AddBos;
53pub use llama_chat_message::LlamaChatMessage;
54pub use llama_chat_template::LlamaChatTemplate;
55pub use llama_lora_adapter::LlamaLoraAdapter;
56pub use rope_type::RopeType;
57pub use vocab_type::VocabType;
58pub use vocab_type_from_int_error::VocabTypeFromIntError;
59
60use params::LlamaModelParams;
61
62fn truncated_buffer_to_string(
63    mut buffer: Vec<u8>,
64    length: usize,
65) -> Result<String, ApplyChatTemplateError> {
66    buffer.truncate(length);
67
68    Ok(String::from_utf8(buffer)?)
69}
70
71fn validate_string_length_for_tokenizer(length: usize) -> Result<c_int, StringToTokenError> {
72    Ok(c_int::try_from(length)?)
73}
74
75fn cstring_with_validated_len(str: &str) -> Result<(CString, c_int), StringToTokenError> {
76    let c_string = CString::new(str)?;
77    let len = validate_string_length_for_tokenizer(c_string.as_bytes().len())?;
78    Ok((c_string, len))
79}
80
/// A safe wrapper around `llama_model`.
///
/// Holds the non-null handle to the underlying llama.cpp model together with a
/// write-once cache slot for the token environment.
pub struct LlamaModel {
    /// Raw pointer to the underlying `llama_model`.
    pub model: NonNull<llama_cpp_bindings_sys::llama_model>,
    /// Cached token environment; starts out empty (`OnceLock::new()`) when the model is loaded.
    /// NOTE(review): the code that populates it is outside this section — confirm where it is set.
    tok_env: OnceLock<Arc<ApproximateTokEnv>>,
}
87
88impl std::fmt::Debug for LlamaModel {
89    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
90        f.debug_struct("LlamaModel")
91            .field("model", &self.model)
92            .finish_non_exhaustive()
93    }
94}
95
// `NonNull` is not `Send`, so the marker is asserted manually here.
// NOTE(review): presumably sound because llama.cpp allows a loaded model handle to be
// moved to and used from another thread — confirm against the llama.cpp threading contract.
unsafe impl Send for LlamaModel {}
97
// `NonNull` is not `Sync`, so the marker is asserted manually here.
// NOTE(review): presumably sound because the methods taking `&self` only read from the
// model through llama.cpp — confirm that concurrent shared access is actually allowed.
unsafe impl Sync for LlamaModel {}
99
100impl LlamaModel {
101    /// Returns a raw pointer to the model's vocabulary.
102    #[must_use]
103    pub fn vocab_ptr(&self) -> *const llama_cpp_bindings_sys::llama_vocab {
104        unsafe { llama_cpp_bindings_sys::llama_model_get_vocab(self.model.as_ptr()) }
105    }
106
107    /// Get the number of tokens the model was trained on.
108    ///
109    /// # Errors
110    ///
111    /// Returns an error if the value returned by llama.cpp does not fit into a `u32`.
112    pub fn n_ctx_train(&self) -> Result<u32, std::num::TryFromIntError> {
113        let n_ctx_train = unsafe { llama_cpp_bindings_sys::llama_n_ctx_train(self.model.as_ptr()) };
114
115        u32::try_from(n_ctx_train)
116    }
117
118    /// Get all tokens in the model.
119    pub fn tokens(
120        &self,
121        decode_special: bool,
122    ) -> impl Iterator<Item = (LlamaToken, Result<String, TokenToStringError>)> + '_ {
123        (0..self.n_vocab())
124            .map(LlamaToken::new)
125            .map(move |llama_token| {
126                let mut decoder = encoding_rs::UTF_8.new_decoder();
127                (
128                    llama_token,
129                    self.token_to_piece(
130                        &SampledToken::Content(llama_token),
131                        &mut decoder,
132                        decode_special,
133                        None,
134                    ),
135                )
136            })
137    }
138
139    /// Get the beginning of stream token.
140    #[must_use]
141    pub fn token_bos(&self) -> LlamaToken {
142        let token = unsafe { llama_cpp_bindings_sys::llama_token_bos(self.vocab_ptr()) };
143        LlamaToken(token)
144    }
145
146    /// Get the end of stream token.
147    #[must_use]
148    pub fn token_eos(&self) -> LlamaToken {
149        let token = unsafe { llama_cpp_bindings_sys::llama_token_eos(self.vocab_ptr()) };
150        LlamaToken(token)
151    }
152
153    /// Get the newline token.
154    #[must_use]
155    pub fn token_nl(&self) -> LlamaToken {
156        let token = unsafe { llama_cpp_bindings_sys::llama_token_nl(self.vocab_ptr()) };
157        LlamaToken(token)
158    }
159
160    /// Check if a token represents the end of generation (end of turn, end of sequence, etc.)
161    #[must_use]
162    pub fn is_eog_token(&self, token: &SampledToken) -> bool {
163        let (SampledToken::Content(LlamaToken(id))
164        | SampledToken::Reasoning(LlamaToken(id))
165        | SampledToken::ToolCall(LlamaToken(id))
166        | SampledToken::Undeterminable(LlamaToken(id))) = *token;
167
168        unsafe { llama_cpp_bindings_sys::llama_token_is_eog(self.vocab_ptr(), id) }
169    }
170
171    /// Get the decoder start token.
172    #[must_use]
173    pub fn decode_start_token(&self) -> LlamaToken {
174        let token =
175            unsafe { llama_cpp_bindings_sys::llama_model_decoder_start_token(self.model.as_ptr()) };
176        LlamaToken(token)
177    }
178
179    /// Get the separator token (SEP).
180    #[must_use]
181    pub fn token_sep(&self) -> LlamaToken {
182        let token = unsafe { llama_cpp_bindings_sys::llama_vocab_sep(self.vocab_ptr()) };
183        LlamaToken(token)
184    }
185
186    /// Convert a string to a Vector of tokens.
187    ///
188    /// # Errors
189    ///
190    /// - if [`str`] contains a null byte
191    /// - if an integer conversion fails during tokenization
192    ///
193    ///
194    /// ```no_run
195    /// use llama_cpp_bindings::model::LlamaModel;
196    ///
197    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
198    /// use std::path::Path;
199    /// use llama_cpp_bindings::model::AddBos;
200    /// let backend = llama_cpp_bindings::llama_backend::LlamaBackend::init()?;
201    /// let model = LlamaModel::load_from_file(&backend, Path::new("path/to/model"), &Default::default())?;
202    /// let tokens = model.str_to_token("Hello, World!", AddBos::Always)?;
203    /// # Ok(())
204    /// # }
205    pub fn str_to_token(
206        &self,
207        str: &str,
208        add_bos: AddBos,
209    ) -> Result<Vec<LlamaToken>, StringToTokenError> {
210        let add_bos = match add_bos {
211            AddBos::Always => true,
212            AddBos::Never => false,
213        };
214
215        let tokens_estimation = std::cmp::max(8, (str.len() / 2) + usize::from(add_bos));
216        let mut buffer: Vec<LlamaToken> = Vec::with_capacity(tokens_estimation);
217
218        let (c_string, c_string_len) = cstring_with_validated_len(str)?;
219        let buffer_capacity = c_int::try_from(buffer.capacity())?;
220
221        let size = invoke_rs_tokenize(
222            self.vocab_ptr(),
223            c_string.as_ptr(),
224            c_string_len,
225            buffer
226                .as_mut_ptr()
227                .cast::<llama_cpp_bindings_sys::llama_token>(),
228            buffer_capacity,
229            add_bos,
230        )?;
231
232        let size = if size.is_negative() {
233            buffer.reserve_exact(usize::try_from(-size)?);
234            invoke_rs_tokenize(
235                self.vocab_ptr(),
236                c_string.as_ptr(),
237                c_string_len,
238                buffer
239                    .as_mut_ptr()
240                    .cast::<llama_cpp_bindings_sys::llama_token>(),
241                -size,
242                add_bos,
243            )?
244        } else {
245            size
246        };
247
248        let size = usize::try_from(size)?;
249
250        // SAFETY: `size` < `capacity` and llama-cpp has initialized elements up to `size`
251        unsafe { buffer.set_len(size) }
252
253        Ok(buffer)
254    }
255
256    /// Get the type of a token.
257    ///
258    /// # Errors
259    ///
260    /// Returns an error if the token type is not known to this library.
261    pub fn token_attr(
262        &self,
263        LlamaToken(id): LlamaToken,
264    ) -> Result<LlamaTokenAttrs, LlamaTokenAttrsFromIntError> {
265        let token_type =
266            unsafe { llama_cpp_bindings_sys::llama_token_get_attr(self.vocab_ptr(), id) };
267
268        LlamaTokenAttrs::try_from(token_type)
269    }
270
271    /// Convert a token to a string using the underlying llama.cpp `llama_token_to_piece` function.
272    ///
273    /// This is the new default function for token decoding and provides direct access to
274    /// the llama.cpp token decoding functionality without any special logic or filtering.
275    ///
276    /// Decoding raw string requires using an decoder, tokens from language models may not always map
277    /// to full characters depending on the encoding so stateful decoding is required, otherwise partial strings may be lost!
278    /// Invalid characters are mapped to REPLACEMENT CHARACTER making the method safe to use even if the model inherently produces
279    /// garbage.
280    ///
281    /// # Errors
282    ///
283    /// - if the token type is unknown
284    ///
285    /// - if the returned size from llama.cpp does not fit into a `usize`
286    pub fn token_to_piece(
287        &self,
288        token: &SampledToken,
289        decoder: &mut encoding_rs::Decoder,
290        special: bool,
291        lstrip: Option<NonZeroU16>,
292    ) -> Result<String, TokenToStringError> {
293        let (SampledToken::Content(inner)
294        | SampledToken::Reasoning(inner)
295        | SampledToken::ToolCall(inner)
296        | SampledToken::Undeterminable(inner)) = *token;
297        let bytes = match self.token_to_piece_bytes(inner, 8, special, lstrip) {
298            Err(TokenToStringError::InsufficientBufferSpace(required_size)) => {
299                let buffer_size: usize = (-required_size).try_into()?;
300
301                self.token_to_piece_bytes(inner, buffer_size, special, lstrip)
302            }
303            other => other,
304        }?;
305
306        let mut output_piece = String::with_capacity(bytes.len());
307        let (_result, _decoded_size, _had_replacements) =
308            decoder.decode_to_string(&bytes, &mut output_piece, false);
309
310        Ok(output_piece)
311    }
312
313    /// Raw token decoding to bytes, use if you want to handle the decoding model output yourself
314    ///
315    /// Convert a token to bytes using the underlying llama.cpp `llama_token_to_piece` function. This is mostly
316    /// a thin wrapper around `llama_token_to_piece` function, that handles rust <-> c type conversions while
317    /// letting the caller handle errors. For a safer interface returning rust strings directly use `token_to_piece` instead!
318    ///
319    /// # Errors
320    ///
321    /// - if the token type is unknown
322    /// - the resultant token is larger than `buffer_size`.
323    /// - if an integer conversion fails
324    pub fn token_to_piece_bytes(
325        &self,
326        token: LlamaToken,
327        buffer_size: usize,
328        special: bool,
329        lstrip: Option<NonZeroU16>,
330    ) -> Result<Vec<u8>, TokenToStringError> {
331        let mut buffer: Vec<u8> = vec![0u8; buffer_size];
332        let buffer_len = c_int::try_from(buffer.len())?;
333        let lstrip = lstrip.map_or(0, |strip_count| i32::from(strip_count.get()));
334        let size = unsafe {
335            llama_cpp_bindings_sys::llama_token_to_piece(
336                self.vocab_ptr(),
337                token.0,
338                buffer.as_mut_ptr().cast::<c_char>(),
339                buffer_len,
340                lstrip,
341                special,
342            )
343        };
344
345        match size {
346            0 => Err(TokenToStringError::UnknownTokenType),
347            error_code if error_code.is_negative() => {
348                Err(TokenToStringError::InsufficientBufferSpace(error_code))
349            }
350            size => {
351                let written = usize::try_from(size)?;
352                buffer.truncate(written);
353
354                Ok(buffer)
355            }
356        }
357    }
358
359    /// The number of tokens the model was trained on.
360    ///
361    /// This returns a `c_int` for maximum compatibility. Most of the time it can be cast to an i32
362    /// without issue.
363    #[must_use]
364    pub fn n_vocab(&self) -> i32 {
365        unsafe { llama_cpp_bindings_sys::llama_n_vocab(self.vocab_ptr()) }
366    }
367
368    /// The type of vocab the model was trained on.
369    ///
370    /// # Errors
371    ///
372    /// Returns an error if llama.cpp emits a vocab type that is not known to this library.
373    pub fn vocab_type(&self) -> Result<VocabType, VocabTypeFromIntError> {
374        let vocab_type = unsafe { llama_cpp_bindings_sys::llama_vocab_type(self.vocab_ptr()) };
375
376        VocabType::try_from(vocab_type)
377    }
378
379    /// This returns a `c_int` for maximum compatibility. Most of the time it can be cast to an i32
380    /// without issue.
381    #[must_use]
382    pub fn n_embd(&self) -> c_int {
383        unsafe { llama_cpp_bindings_sys::llama_n_embd(self.model.as_ptr()) }
384    }
385
386    /// Returns the total size of all the tensors in the model in bytes.
387    #[must_use]
388    pub fn size(&self) -> u64 {
389        unsafe { llama_cpp_bindings_sys::llama_model_size(self.model.as_ptr()) }
390    }
391
392    /// Returns the number of parameters in the model.
393    #[must_use]
394    pub fn n_params(&self) -> u64 {
395        unsafe { llama_cpp_bindings_sys::llama_model_n_params(self.model.as_ptr()) }
396    }
397
398    /// Returns whether the model is a recurrent network (Mamba, RWKV, etc)
399    #[must_use]
400    pub fn is_recurrent(&self) -> bool {
401        unsafe { llama_cpp_bindings_sys::llama_model_is_recurrent(self.model.as_ptr()) }
402    }
403
404    /// Returns the number of layers within the model.
405    ///
406    /// # Errors
407    ///
408    /// Returns an error if the layer count returned by llama.cpp does not fit into a `u32`.
409    pub fn n_layer(&self) -> Result<u32, std::num::TryFromIntError> {
410        u32::try_from(unsafe { llama_cpp_bindings_sys::llama_model_n_layer(self.model.as_ptr()) })
411    }
412
413    /// Returns the number of attention heads within the model.
414    ///
415    /// # Errors
416    ///
417    /// Returns an error if the head count returned by llama.cpp does not fit into a `u32`.
418    pub fn n_head(&self) -> Result<u32, std::num::TryFromIntError> {
419        u32::try_from(unsafe { llama_cpp_bindings_sys::llama_model_n_head(self.model.as_ptr()) })
420    }
421
422    /// Returns the number of KV attention heads.
423    ///
424    /// # Errors
425    ///
426    /// Returns an error if the KV head count returned by llama.cpp does not fit into a `u32`.
427    pub fn n_head_kv(&self) -> Result<u32, std::num::TryFromIntError> {
428        u32::try_from(unsafe { llama_cpp_bindings_sys::llama_model_n_head_kv(self.model.as_ptr()) })
429    }
430
431    /// Returns whether the model is a hybrid network (Jamba, Granite, Qwen3xx, etc.)
432    ///
433    /// Hybrid models have both attention layers and recurrent/SSM layers.
434    #[must_use]
435    pub fn is_hybrid(&self) -> bool {
436        unsafe { llama_cpp_bindings_sys::llama_model_is_hybrid(self.model.as_ptr()) }
437    }
438
439    /// Get metadata value as a string by key name
440    ///
441    /// # Errors
442    /// Returns an error if the key is not found or the value is not valid UTF-8.
443    pub fn meta_val_str(&self, key: &str) -> Result<String, MetaValError> {
444        let key_cstring = CString::new(key)?;
445        let key_ptr = key_cstring.as_ptr();
446
447        extract_meta_string(
448            |buf_ptr, buf_len| unsafe {
449                llama_cpp_bindings_sys::llama_model_meta_val_str(
450                    self.model.as_ptr(),
451                    key_ptr,
452                    buf_ptr,
453                    buf_len,
454                )
455            },
456            256,
457        )
458    }
459
460    /// Get the number of metadata key/value pairs
461    #[must_use]
462    pub fn meta_count(&self) -> i32 {
463        unsafe { llama_cpp_bindings_sys::llama_model_meta_count(self.model.as_ptr()) }
464    }
465
466    /// Get metadata key name by index
467    ///
468    /// # Errors
469    /// Returns an error if the index is out of range or the key is not valid UTF-8.
470    pub fn meta_key_by_index(&self, index: i32) -> Result<String, MetaValError> {
471        extract_meta_string(
472            |buf_ptr, buf_len| unsafe {
473                llama_cpp_bindings_sys::llama_model_meta_key_by_index(
474                    self.model.as_ptr(),
475                    index,
476                    buf_ptr,
477                    buf_len,
478                )
479            },
480            256,
481        )
482    }
483
484    /// Get metadata value as a string by index
485    ///
486    /// # Errors
487    /// Returns an error if the index is out of range or the value is not valid UTF-8.
488    pub fn meta_val_str_by_index(&self, index: i32) -> Result<String, MetaValError> {
489        extract_meta_string(
490            |buf_ptr, buf_len| unsafe {
491                llama_cpp_bindings_sys::llama_model_meta_val_str_by_index(
492                    self.model.as_ptr(),
493                    index,
494                    buf_ptr,
495                    buf_len,
496                )
497            },
498            256,
499        )
500    }
501
502    /// Returns the rope type of the model.
503    #[must_use]
504    pub fn rope_type(&self) -> Option<RopeType> {
505        let raw = unsafe { llama_cpp_bindings_sys::llama_model_rope_type(self.model.as_ptr()) };
506
507        rope_type::rope_type_from_raw(raw)
508    }
509
510    /// Get chat template from model by name. If the name parameter is None, the default chat template will be returned.
511    ///
512    /// You supply this into [`Self::apply_chat_template`] to get back a string with the appropriate template
513    /// substitution applied to convert a list of messages into a prompt the LLM can use to complete
514    /// the chat.
515    ///
516    /// You could also use an external jinja parser, like [minijinja](https://github.com/mitsuhiko/minijinja),
517    /// to parse jinja templates not supported by the llama.cpp template engine.
518    ///
519    /// # Errors
520    ///
521    /// * If the model has no chat template by that name
522    ///
523    /// # Panics
524    ///
525    /// Panics if the C-returned chat template string contains interior null bytes
526    /// (should never happen with valid model data).
527    pub fn chat_template(
528        &self,
529        name: Option<&str>,
530    ) -> Result<LlamaChatTemplate, ChatTemplateError> {
531        let name_cstr = name.map(CString::new);
532        let name_ptr = match name_cstr {
533            Some(Ok(name)) => name.as_ptr(),
534            _ => ptr::null(),
535        };
536        let result = unsafe {
537            llama_cpp_bindings_sys::llama_model_chat_template(self.model.as_ptr(), name_ptr)
538        };
539
540        if result.is_null() {
541            Err(ChatTemplateError::MissingTemplate)
542        } else {
543            let chat_template_cstr = unsafe { CStr::from_ptr(result) };
544
545            Ok(LlamaChatTemplate(chat_template_cstr.to_owned()))
546        }
547    }
548
549    /// Loads a model from a file.
550    ///
551    /// # Errors
552    ///
553    /// See [`LlamaModelLoadError`] for more information.
554    ///
555    /// # Panics
556    ///
557    /// Panics if a valid UTF-8 path somehow contains interior null bytes (should never happen).
558    pub fn load_from_file(
559        _: &LlamaBackend,
560        path: impl AsRef<Path>,
561        params: &LlamaModelParams,
562    ) -> Result<Self, LlamaModelLoadError> {
563        let path = path.as_ref();
564
565        let path_str = path
566            .to_str()
567            .ok_or_else(|| LlamaModelLoadError::PathToStrError(path.to_path_buf()))?;
568
569        if !path.exists() {
570            return Err(LlamaModelLoadError::FileNotFound(path.to_path_buf()));
571        }
572
573        let cstr = CString::new(path_str)?;
574        let mut out_model: *mut llama_cpp_bindings_sys::llama_model = ptr::null_mut();
575        let mut out_error: *mut c_char = ptr::null_mut();
576        let status = unsafe {
577            llama_cpp_bindings_sys::llama_rs_load_model_from_file(
578                cstr.as_ptr(),
579                params.params,
580                &raw mut out_model,
581                &raw mut out_error,
582            )
583        };
584        match status {
585            llama_cpp_bindings_sys::LLAMA_RS_LOAD_MODEL_FROM_FILE_OK => {
586                let model = NonNull::new(out_model)
587                    .ok_or(LlamaModelLoadError::Unloadable)?;
588                Ok(Self {
589                    model,
590                    tok_env: OnceLock::new(),
591                })
592            }
593            llama_cpp_bindings_sys::LLAMA_RS_LOAD_MODEL_FROM_FILE_VENDORED_RETURNED_NULL => {
594                if path.exists() {
595                    Err(LlamaModelLoadError::Unloadable)
596                } else {
597                    Err(LlamaModelLoadError::FileNotFound(path.to_path_buf()))
598                }
599            }
600            llama_cpp_bindings_sys::LLAMA_RS_LOAD_MODEL_FROM_FILE_ERROR_STRING_ALLOCATION_FAILED => {
601                Err(LlamaModelLoadError::NotEnoughMemory)
602            }
603            llama_cpp_bindings_sys::LLAMA_RS_LOAD_MODEL_FROM_FILE_VENDORED_THREW_CXX_EXCEPTION => {
604                let message = unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
605                Err(LlamaModelLoadError::Reported { message })
606            }
607            other => unreachable!(
608                "llama_rs_load_model_from_file returned unrecognized status {other}"
609            ),
610        }
611    }
612
613    /// Initializes a lora adapter from a file.
614    ///
615    /// # Errors
616    ///
617    /// See [`LlamaLoraAdapterInitError`] for more information.
618    pub fn lora_adapter_init(
619        &self,
620        path: impl AsRef<Path>,
621    ) -> Result<LlamaLoraAdapter, LlamaLoraAdapterInitError> {
622        let path = path.as_ref();
623
624        let path_str = path
625            .to_str()
626            .ok_or_else(|| LlamaLoraAdapterInitError::PathToStrError(path.to_path_buf()))?;
627
628        if !path.exists() {
629            return Err(LlamaLoraAdapterInitError::FileNotFound(path.to_path_buf()));
630        }
631
632        let cstr = CString::new(path_str)?;
633        let raw_adapter = unsafe {
634            llama_cpp_bindings_sys::llama_adapter_lora_init(self.model.as_ptr(), cstr.as_ptr())
635        };
636
637        let Some(adapter) = NonNull::new(raw_adapter) else {
638            return Err(LlamaLoraAdapterInitError::Unloadable);
639        };
640
641        Ok(LlamaLoraAdapter {
642            lora_adapter: adapter,
643        })
644    }
645
646    /// Apply the models chat template to some messages.
647    /// See <https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template>
648    ///
649    /// Unlike the llama.cpp `apply_chat_template` which just randomly uses the `ChatML` template when given
650    /// a null pointer for the template, this requires an explicit template to be specified. If you want to
651    /// use "chatml", then just do `LlamaChatTemplate::new("chatml")` or any other model name or template
652    /// string.
653    ///
654    /// Use [`Self::chat_template`] to retrieve the template baked into the model (this is the preferred
655    /// mechanism as using the wrong chat template can result in really unexpected responses from the LLM).
656    ///
657    /// You probably want to set `add_ass` to true so that the generated template string ends with a the
658    /// opening tag of the assistant. If you fail to leave a hanging chat tag, the model will likely generate
659    /// one into the output and the output may also have unexpected output aside from that.
660    ///
661    /// # Errors
662    /// There are many ways this can fail. See [`ApplyChatTemplateError`] for more information.
663    pub fn apply_chat_template(
664        &self,
665        tmpl: &LlamaChatTemplate,
666        chat: &[LlamaChatMessage],
667        add_ass: bool,
668    ) -> Result<String, ApplyChatTemplateError> {
669        let message_length = chat.iter().fold(0, |acc, chat_message| {
670            acc + chat_message.role.to_bytes().len() + chat_message.content.to_bytes().len()
671        });
672        let mut buff: Vec<u8> = vec![0; message_length * 2];
673
674        let chat: Vec<llama_cpp_bindings_sys::llama_chat_message> = chat
675            .iter()
676            .map(|chat_message| llama_cpp_bindings_sys::llama_chat_message {
677                role: chat_message.role.as_ptr(),
678                content: chat_message.content.as_ptr(),
679            })
680            .collect();
681
682        let tmpl_ptr = tmpl.0.as_ptr();
683
684        let buff_len: i32 = buff.len().try_into()?;
685
686        let res = unsafe {
687            llama_cpp_bindings_sys::llama_chat_apply_template(
688                tmpl_ptr,
689                chat.as_ptr(),
690                chat.len(),
691                add_ass,
692                buff.as_mut_ptr().cast::<c_char>(),
693                buff_len,
694            )
695        };
696
697        if res > buff_len {
698            let required_size: usize = res.try_into()?;
699            buff.resize(required_size, 0);
700
701            let new_buff_len: i32 = buff.len().try_into()?;
702
703            let res = unsafe {
704                llama_cpp_bindings_sys::llama_chat_apply_template(
705                    tmpl_ptr,
706                    chat.as_ptr(),
707                    chat.len(),
708                    add_ass,
709                    buff.as_mut_ptr().cast::<c_char>(),
710                    new_buff_len,
711                )
712            };
713            let final_size: usize = res.try_into()?;
714
715            return truncated_buffer_to_string(buff, final_size);
716        }
717
718        let final_size: usize = res.try_into()?;
719
720        truncated_buffer_to_string(buff, final_size)
721    }
722
723    /// Build a streaming [`SampledTokenClassifier`] for this model.
724    ///
725    /// At construction the bindings detect reasoning markers (via the
726    /// autoparser, with a chunked-thinking fallback for templates that consume
727    /// thoughts via content blocks), tool-call markers, and the trailing
728    /// generation-prompt slice. The classifier then runs a state machine over
729    /// the decoded token stream — no per-model branches.
730    ///
731    /// If the model has no usable chat template the classifier is built in a
732    /// blind mode that classifies every token as
733    /// [`SampledToken::Undeterminable`].
734    pub fn sampled_token_classifier(&self) -> SampledTokenClassifier<'_> {
735        let markers = match self.streaming_markers() {
736            Ok(markers) => markers,
737            Err(detection_error) => {
738                log::warn!(
739                    "streaming markers detection failed; classifier will run blind: {detection_error}",
740                );
741                StreamingMarkers::default()
742            }
743        };
744
745        SampledTokenClassifier::new(self, markers)
746    }
747
748    /// Detect reasoning / tool-call markers (as token-ID sequences) and the
749    /// trailing generation-prompt slice for this model's chat template. The
750    /// returned `StreamingMarkers` carry tokenised markers — never raw strings
751    /// — so the classifier matches by `LlamaToken` equality rather than text
752    /// scanning.
753    ///
754    /// # Errors
755    /// Returns [`MarkerDetectionError`] when any underlying FFI call fails.
756    pub fn streaming_markers(&self) -> Result<StreamingMarkers, MarkerDetectionError> {
757        let (reasoning_open_str, reasoning_close_str) =
758            invoke_detect_reasoning_markers(self.model.as_ptr())?;
759
760        let tool_call_haystack = invoke_compute_tool_call_haystack(self.model.as_ptr())?;
761
762        let autoparser_pair = tool_call_haystack.as_deref().and_then(
763            crate::extract_tool_call_markers_from_haystack::extract_tool_call_markers_from_haystack,
764        );
765
766        let (autoparser_open, autoparser_close) = match autoparser_pair {
767            Some(crate::tool_call_marker_pair::ToolCallMarkerPair { open, close }) => {
768                (Some(open), Some(close))
769            }
770            None => (None, None),
771        };
772
773        let resolved_tool_call_markers =
774            self.resolve_tool_call_marker_strings(autoparser_open, autoparser_close);
775
776        Ok(StreamingMarkers {
777            reasoning_open: self.tokenize_marker(reasoning_open_str.as_deref()),
778            reasoning_close: self.tokenize_marker(reasoning_close_str.as_deref()),
779            tool_call_open: self.tokenize_marker(resolved_tool_call_markers.open.as_deref()),
780            tool_call_close: self.tokenize_marker(resolved_tool_call_markers.close.as_deref()),
781        })
782    }
783
784    /// When the autoparser-driven FFI returned no tool-call markers, consult the
785    /// per-template override registry so wrapper-known templates (Gemma 4,
786    /// Mistral 3, ...) still drive the classifier.
787    fn resolve_tool_call_marker_strings(
788        &self,
789        autoparser_open: Option<String>,
790        autoparser_close: Option<String>,
791    ) -> ResolvedToolCallMarkers {
792        if autoparser_open
793            .as_deref()
794            .is_some_and(|raw| !raw.trim().is_empty())
795        {
796            return ResolvedToolCallMarkers {
797                open: autoparser_open,
798                close: autoparser_close,
799            };
800        }
801        let Some(markers) = self.tool_call_markers() else {
802            return ResolvedToolCallMarkers {
803                open: autoparser_open,
804                close: autoparser_close,
805            };
806        };
807        let close = if markers.close.is_empty() {
808            None
809        } else {
810            Some(markers.close)
811        };
812        ResolvedToolCallMarkers {
813            open: Some(markers.open),
814            close,
815        }
816    }
817
818    /// # Errors
819    /// Returns [`MarkerDetectionError`] when the underlying FFI call fails.
820    pub fn reasoning_markers(&self) -> Result<Option<ReasoningMarkers>, MarkerDetectionError> {
821        let (open, close) = invoke_detect_reasoning_markers(self.model.as_ptr())?;
822
823        match (open, close) {
824            (Some(open), Some(close)) if !open.is_empty() && !close.is_empty() => {
825                Ok(Some(ReasoningMarkers { open, close }))
826            }
827            _ => Ok(None),
828        }
829    }
830
831    /// Returns the rich tool-call marker bundle (open / separator / close /
832    /// optional value-quote pair) for this model's chat template, sourced from
833    /// the wrapper's per-template override registry. Returns `None` when no
834    /// registered override matches — callers in that case fall back to
835    /// llama.cpp's autoparser via [`Self::parse_chat_message`].
836    #[must_use]
837    pub fn tool_call_markers(&self) -> Option<ToolCallMarkers> {
838        let template = match self.chat_template(None) {
839            Ok(template) => template,
840            Err(error) => {
841                log::debug!(
842                    "tool-call markers unavailable: chat template missing or invalid: {error}",
843                );
844                return None;
845            }
846        };
847        let template_str = match template.to_str() {
848            Ok(template_str) => template_str,
849            Err(error) => {
850                log::debug!(
851                    "tool-call markers unavailable: chat template is not valid UTF-8: {error}",
852                );
853                return None;
854            }
855        };
856        tool_call_template_overrides::detect(template_str)
857    }
858
859    fn tokenize_marker(&self, marker: Option<&str>) -> Option<Vec<LlamaToken>> {
860        let marker = marker?.trim();
861        if marker.is_empty() {
862            return None;
863        }
864        match self.str_to_token(marker, AddBos::Never) {
865            Ok(tokens) if !tokens.is_empty() => Some(tokens),
866            Ok(_) => None,
867            Err(tokenize_error) => {
868                log::debug!(
869                    "marker {marker:?} failed to tokenise; classifier will ignore it: {tokenize_error}",
870                );
871                None
872            }
873        }
874    }
875
876    /// Parse the assistant's output text into structured content, reasoning,
877    /// and tool calls.
878    ///
879    /// Two passes, in order:
880    /// 1. Duck-type the wrapper-side parsers across every known shape
881    ///    (Qwen XML, GLM key-value, Gemma paired-quote, Mistral bracketed-JSON).
882    ///    First match wins. The shapes are ordered so that more restrictive
883    ///    shapes run first, which keeps the duck-type pass safe for inputs
884    ///    that share an open marker but differ in inner structure.
885    /// 2. Delegate to llama.cpp's `common_chat_parse`. If it succeeds the
886    ///    result is `Recognized`; if it throws `ParseException` the result is
887    ///    `Unrecognized` with the raw input plus the FFI's diagnostic, so the
888    ///    caller can pass the unstructured tokens to the client.
889    ///
890    /// Empty tool-call `id` fields are filled with `call_{index}` before
891    /// returning, so callers always see well-formed identifiers.
892    ///
893    /// `tools_json` is a JSON-array string of OpenAI-style tool definitions
894    /// (use `"[]"` when no tools are in scope). `is_partial` switches between
895    /// mid-stream (lenient) and final (strict) parses for the FFI step.
896    ///
897    /// # Errors
898    ///
899    /// Returns [`ParseChatMessageError`] when `tools_json` is not valid JSON,
900    /// the FFI returns a non-OK status other than `ParseException`, or
901    /// accessor strings are not valid UTF-8.
902    pub fn parse_chat_message(
903        &self,
904        tools_json: &str,
905        input: &str,
906        is_partial: bool,
907    ) -> Result<ChatMessageParseOutcome, ParseChatMessageError> {
908        let tools_value: serde_json::Value =
909            serde_json::from_str(tools_json).map_err(ParseChatMessageError::ToolsJsonInvalid)?;
910        if !tools_value.is_array() {
911            return Err(ParseChatMessageError::ToolsJsonNotArray);
912        }
913
914        let reasoning_markers = self.reasoning_markers().ok().flatten();
915
916        for candidate in tool_call_template_overrides::known_marker_candidates() {
917            if let ToolCallFormatOutcome::Parsed(calls) =
918                tool_call_format::try_parse(input, &candidate)
919            {
920                let split =
921                    split_reasoning_prefix(input, reasoning_markers.as_ref(), &candidate.open);
922                let mut parsed = ParsedChatMessage::new(split.content, split.reasoning, calls);
923                synthesize_missing_tool_call_ids(&mut parsed.tool_calls);
924                return Ok(ChatMessageParseOutcome::Recognized(parsed));
925            }
926        }
927
928        match self.parse_chat_message_via_ffi(tools_json, input, is_partial) {
929            Ok(mut parsed) => {
930                synthesize_missing_tool_call_ids(&mut parsed.tool_calls);
931                Ok(ChatMessageParseOutcome::Recognized(parsed))
932            }
933            Err(ParseChatMessageError::ParseFailed { message }) => {
934                Ok(ChatMessageParseOutcome::Unrecognized(RawChatMessage {
935                    tools_json: tools_json.to_owned(),
936                    text: input.to_owned(),
937                    is_partial,
938                    ffi_error_message: message,
939                }))
940            }
941            Err(other) => Err(other),
942        }
943    }
944
945    fn parse_chat_message_via_ffi(
946        &self,
947        tools_json: &str,
948        input: &str,
949        is_partial: bool,
950    ) -> Result<ParsedChatMessage, ParseChatMessageError> {
951        let tools_cstring = CString::new(tools_json)
952            .map_err(|err| ParseChatMessageError::ToolsSerialization(err.to_string()))?;
953        let input_cstring = CString::new(input)
954            .map_err(|err| ParseChatMessageError::ToolsSerialization(err.to_string()))?;
955
956        let mut handle: *mut llama_cpp_bindings_sys::llama_rs_parsed_chat = ptr::null_mut();
957        let mut out_error: *mut c_char = ptr::null_mut();
958
959        let status = unsafe {
960            llama_cpp_bindings_sys::llama_rs_parse_chat_message(
961                self.model.as_ptr(),
962                tools_cstring.as_ptr(),
963                input_cstring.as_ptr(),
964                i32::from(is_partial),
965                &raw mut handle,
966                &raw mut out_error,
967            )
968        };
969
970        let parsed = match status {
971            llama_cpp_bindings_sys::LLAMA_RS_PARSE_CHAT_MESSAGE_OK => {
972                collect_parsed_chat_message(handle)
973            }
974            llama_cpp_bindings_sys::LLAMA_RS_PARSE_CHAT_MESSAGE_MODEL_HAS_NO_CHAT_TEMPLATE => {
975                Err(ParseChatMessageError::NoChatTemplate)
976            }
977            llama_cpp_bindings_sys::LLAMA_RS_PARSE_CHAT_MESSAGE_MODEL_HAS_NO_VOCAB => {
978                Err(ParseChatMessageError::NoVocab)
979            }
980            llama_cpp_bindings_sys::LLAMA_RS_PARSE_CHAT_MESSAGE_ERROR_STRING_ALLOCATION_FAILED => {
981                Err(ParseChatMessageError::NotEnoughMemory)
982            }
983            llama_cpp_bindings_sys::LLAMA_RS_PARSE_CHAT_MESSAGE_VENDORED_THREW_CXX_EXCEPTION => {
984                let message =
985                    unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
986                out_error = ptr::null_mut();
987                Err(ParseChatMessageError::ParseFailed { message })
988            }
989            other => {
990                unreachable!("llama_rs_parse_chat_message returned unrecognized status {other}")
991            }
992        };
993
994        let mut free_error: *mut c_char = ptr::null_mut();
995        let free_status = unsafe {
996            llama_cpp_bindings_sys::llama_rs_parsed_chat_free(handle, &raw mut free_error)
997        };
998        match (parsed, free_status) {
999            (Ok(value), llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_FREE_OK) => {
1000                unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1001                Ok(value)
1002            }
1003            (
1004                Ok(_),
1005                llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_FREE_DESTRUCTOR_THREW_CXX_EXCEPTION,
1006            ) => {
1007                unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1008                let message =
1009                    unsafe { crate::ffi_error_reader::read_and_free_cpp_error(free_error) };
1010                Err(ParseChatMessageError::DestructorFailed { message })
1011            }
1012            (
1013                Ok(_),
1014                llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_FREE_ERROR_STRING_ALLOCATION_FAILED,
1015            ) => {
1016                unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1017                Err(ParseChatMessageError::NotEnoughMemory)
1018            }
1019            (Ok(_), other) => {
1020                unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1021                unsafe { llama_cpp_bindings_sys::llama_rs_string_free(free_error) };
1022                unreachable!("llama_rs_parsed_chat_free returned unrecognized status {other}")
1023            }
1024            (Err(parse_err), _) => {
1025                unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1026                unsafe { llama_cpp_bindings_sys::llama_rs_string_free(free_error) };
1027                Err(parse_err)
1028            }
1029        }
1030    }
1031
1032    /// Render the model's chat template with the autoparser's synthetic
1033    /// no-tools and with-tools inputs. Returns `(output_no_tools,
1034    /// output_with_tools)`. Either side can be empty when the template throws
1035    /// during rendering. Useful for debugging tool-call marker detection.
1036    ///
1037    /// # Errors
1038    ///
1039    /// Returns [`MarkerDetectionError`] when the C++ analyzer throws or the FFI
1040    /// returns a non-OK status.
1041    pub fn diagnose_tool_call_synthetic_renders(
1042        &self,
1043    ) -> Result<(String, String), MarkerDetectionError> {
1044        let (no_tools, with_tools) =
1045            invoke_diagnose_tool_call_synthetic_renders(self.model.as_ptr())?;
1046
1047        Ok((no_tools.unwrap_or_default(), with_tools.unwrap_or_default()))
1048    }
1049}
1050
1051impl LlamaModel {
1052    /// Returns a process-cached, approximate token environment built from this model's vocabulary.
1053    ///
1054    /// The first call iterates the full vocabulary and constructs the trie; subsequent calls
1055    /// return the cached `Arc` without further FFI work.
1056    pub fn approximate_tok_env(&self) -> Arc<ApproximateTokEnv> {
1057        Arc::clone(self.tok_env.get_or_init(|| build_approximate_tok_env(self)))
1058    }
1059}
1060
1061fn build_approximate_tok_env(model: &LlamaModel) -> Arc<ApproximateTokEnv> {
1062    let n_vocab = model.n_vocab().cast_unsigned();
1063    let tok_eos = {
1064        let eot = unsafe { llama_cpp_bindings_sys::llama_vocab_eot(model.vocab_ptr()) };
1065        if eot == -1 {
1066            model.token_eos().0.cast_unsigned()
1067        } else {
1068            eot.cast_unsigned()
1069        }
1070    };
1071    let info = TokRxInfo::new(n_vocab, tok_eos);
1072
1073    let mut words = Vec::with_capacity(n_vocab as usize);
1074
1075    for token_id in 0..n_vocab.cast_signed() {
1076        let token = LlamaToken(token_id);
1077        let bytes = model
1078            .token_to_piece_bytes(token, 32, false, None)
1079            .unwrap_or_default();
1080        if bytes.is_empty() {
1081            let special_bytes = model
1082                .token_to_piece_bytes(token, 32, true, None)
1083                .unwrap_or_default();
1084            if special_bytes.is_empty() {
1085                words.push(vec![]);
1086            } else {
1087                let mut marked = Vec::with_capacity(special_bytes.len() + 1);
1088                marked.push(0xFF);
1089                marked.extend(special_bytes);
1090                words.push(marked);
1091            }
1092        } else {
1093            words.push(bytes);
1094        }
1095    }
1096
1097    let trie = TokTrie::from(&info, &words);
1098    Arc::new(ApproximateTokEnv::new(trie))
1099}
1100
1101fn collect_parsed_chat_message(
1102    handle: *mut llama_cpp_bindings_sys::llama_rs_parsed_chat,
1103) -> Result<ParsedChatMessage, ParseChatMessageError> {
1104    if handle.is_null() {
1105        return Ok(ParsedChatMessage::default());
1106    }
1107
1108    let content = read_parsed_chat_content(handle)?;
1109    let reasoning_content = read_parsed_chat_reasoning_content(handle)?;
1110    let count = read_parsed_chat_tool_call_count(handle)?;
1111
1112    let mut tool_calls = Vec::with_capacity(count);
1113    for index in 0..count {
1114        let id = read_parsed_chat_tool_call_id(handle, index)?;
1115        let name = read_parsed_chat_tool_call_name(handle, index)?;
1116        let arguments_json = read_parsed_chat_tool_call_arguments(handle, index)?;
1117
1118        let arguments = ToolCallArguments::from_string(arguments_json);
1119        tool_calls.push(ParsedToolCall::new(id, name, arguments));
1120    }
1121
1122    Ok(ParsedChatMessage::new(
1123        content,
1124        reasoning_content,
1125        tool_calls,
1126    ))
1127}
1128
1129fn read_parsed_chat_content(
1130    handle: *mut llama_cpp_bindings_sys::llama_rs_parsed_chat,
1131) -> Result<String, ParseChatMessageError> {
1132    let mut out_string: *mut c_char = ptr::null_mut();
1133    let mut out_error: *mut c_char = ptr::null_mut();
1134    let status = unsafe {
1135        llama_cpp_bindings_sys::llama_rs_parsed_chat_content(
1136            handle,
1137            &raw mut out_string,
1138            &raw mut out_error,
1139        )
1140    };
1141    match status {
1142        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_CONTENT_OK => {
1143            consume_accessor_string(out_string)
1144        }
1145        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_CONTENT_ERROR_STRING_ALLOCATION_FAILED => {
1146            unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1147            Err(ParseChatMessageError::NotEnoughMemory)
1148        }
1149        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_CONTENT_VENDORED_THREW_CXX_EXCEPTION => {
1150            let message = unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1151            Err(ParseChatMessageError::Reported { message })
1152        }
1153        other => unreachable!("llama_rs_parsed_chat_content returned unrecognized status {other}"),
1154    }
1155}
1156
1157fn read_parsed_chat_reasoning_content(
1158    handle: *mut llama_cpp_bindings_sys::llama_rs_parsed_chat,
1159) -> Result<String, ParseChatMessageError> {
1160    let mut out_string: *mut c_char = ptr::null_mut();
1161    let mut out_error: *mut c_char = ptr::null_mut();
1162    let status = unsafe {
1163        llama_cpp_bindings_sys::llama_rs_parsed_chat_reasoning_content(
1164            handle,
1165            &raw mut out_string,
1166            &raw mut out_error,
1167        )
1168    };
1169    match status {
1170        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_REASONING_CONTENT_OK => {
1171            consume_accessor_string(out_string)
1172        }
1173        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_REASONING_CONTENT_ERROR_STRING_ALLOCATION_FAILED => {
1174            unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1175            Err(ParseChatMessageError::NotEnoughMemory)
1176        }
1177        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_REASONING_CONTENT_VENDORED_THREW_CXX_EXCEPTION => {
1178            let message =
1179                unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1180            Err(ParseChatMessageError::Reported { message })
1181        }
1182        other => unreachable!(
1183            "llama_rs_parsed_chat_reasoning_content returned unrecognized status {other}"
1184        ),
1185    }
1186}
1187
1188fn read_parsed_chat_tool_call_count(
1189    handle: *mut llama_cpp_bindings_sys::llama_rs_parsed_chat,
1190) -> Result<usize, ParseChatMessageError> {
1191    let mut out_count: usize = 0;
1192    let mut out_error: *mut c_char = ptr::null_mut();
1193    let status = unsafe {
1194        llama_cpp_bindings_sys::llama_rs_parsed_chat_tool_call_count(
1195            handle,
1196            &raw mut out_count,
1197            &raw mut out_error,
1198        )
1199    };
1200    match status {
1201        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_COUNT_OK => Ok(out_count),
1202        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_COUNT_ERROR_STRING_ALLOCATION_FAILED => {
1203            unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1204            Err(ParseChatMessageError::NotEnoughMemory)
1205        }
1206        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_COUNT_VENDORED_THREW_CXX_EXCEPTION => {
1207            let message =
1208                unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1209            Err(ParseChatMessageError::Reported { message })
1210        }
1211        other => unreachable!(
1212            "llama_rs_parsed_chat_tool_call_count returned unrecognized status {other}"
1213        ),
1214    }
1215}
1216
1217fn read_parsed_chat_tool_call_id(
1218    handle: *mut llama_cpp_bindings_sys::llama_rs_parsed_chat,
1219    index: usize,
1220) -> Result<String, ParseChatMessageError> {
1221    let mut out_string: *mut c_char = ptr::null_mut();
1222    let mut out_error: *mut c_char = ptr::null_mut();
1223    let status = unsafe {
1224        llama_cpp_bindings_sys::llama_rs_parsed_chat_tool_call_id(
1225            handle,
1226            index,
1227            &raw mut out_string,
1228            &raw mut out_error,
1229        )
1230    };
1231    match status {
1232        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_ID_OK => {
1233            consume_accessor_string(out_string)
1234        }
1235        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_ID_INDEX_OUT_OF_BOUNDS => {
1236            Err(ParseChatMessageError::ToolCallIdIndexOutOfBounds { index })
1237        }
1238        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_ID_ERROR_STRING_ALLOCATION_FAILED => {
1239            unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1240            Err(ParseChatMessageError::NotEnoughMemory)
1241        }
1242        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_ID_VENDORED_THREW_CXX_EXCEPTION => {
1243            let message =
1244                unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1245            Err(ParseChatMessageError::Reported { message })
1246        }
1247        other => unreachable!(
1248            "llama_rs_parsed_chat_tool_call_id returned unrecognized status {other}"
1249        ),
1250    }
1251}
1252
1253fn read_parsed_chat_tool_call_name(
1254    handle: *mut llama_cpp_bindings_sys::llama_rs_parsed_chat,
1255    index: usize,
1256) -> Result<String, ParseChatMessageError> {
1257    let mut out_string: *mut c_char = ptr::null_mut();
1258    let mut out_error: *mut c_char = ptr::null_mut();
1259    let status = unsafe {
1260        llama_cpp_bindings_sys::llama_rs_parsed_chat_tool_call_name(
1261            handle,
1262            index,
1263            &raw mut out_string,
1264            &raw mut out_error,
1265        )
1266    };
1267    match status {
1268        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_NAME_OK => {
1269            consume_accessor_string(out_string)
1270        }
1271        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_NAME_INDEX_OUT_OF_BOUNDS => {
1272            Err(ParseChatMessageError::ToolCallNameIndexOutOfBounds { index })
1273        }
1274        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_NAME_ERROR_STRING_ALLOCATION_FAILED => {
1275            unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1276            Err(ParseChatMessageError::NotEnoughMemory)
1277        }
1278        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_NAME_VENDORED_THREW_CXX_EXCEPTION => {
1279            let message =
1280                unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1281            Err(ParseChatMessageError::Reported { message })
1282        }
1283        other => unreachable!(
1284            "llama_rs_parsed_chat_tool_call_name returned unrecognized status {other}"
1285        ),
1286    }
1287}
1288
1289fn read_parsed_chat_tool_call_arguments(
1290    handle: *mut llama_cpp_bindings_sys::llama_rs_parsed_chat,
1291    index: usize,
1292) -> Result<String, ParseChatMessageError> {
1293    let mut out_string: *mut c_char = ptr::null_mut();
1294    let mut out_error: *mut c_char = ptr::null_mut();
1295    let status = unsafe {
1296        llama_cpp_bindings_sys::llama_rs_parsed_chat_tool_call_arguments(
1297            handle,
1298            index,
1299            &raw mut out_string,
1300            &raw mut out_error,
1301        )
1302    };
1303    match status {
1304        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_ARGUMENTS_OK => {
1305            consume_accessor_string(out_string)
1306        }
1307        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_ARGUMENTS_INDEX_OUT_OF_BOUNDS => {
1308            Err(ParseChatMessageError::ToolCallArgumentsIndexOutOfBounds { index })
1309        }
1310        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_ARGUMENTS_ERROR_STRING_ALLOCATION_FAILED => {
1311            unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1312            Err(ParseChatMessageError::NotEnoughMemory)
1313        }
1314        llama_cpp_bindings_sys::LLAMA_RS_PARSED_CHAT_TOOL_CALL_ARGUMENTS_VENDORED_THREW_CXX_EXCEPTION => {
1315            let message =
1316                unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1317            Err(ParseChatMessageError::Reported { message })
1318        }
1319        other => unreachable!(
1320            "llama_rs_parsed_chat_tool_call_arguments returned unrecognized status {other}"
1321        ),
1322    }
1323}
1324
1325fn consume_accessor_string(ptr: *mut c_char) -> Result<String, ParseChatMessageError> {
1326    if ptr.is_null() {
1327        return Ok(String::new());
1328    }
1329    let bytes = unsafe { CStr::from_ptr(ptr) }.to_bytes().to_vec();
1330    unsafe { llama_cpp_bindings_sys::llama_rs_string_free(ptr) };
1331    Ok(String::from_utf8(bytes)?)
1332}
1333
/// Result of separating an assistant message into its reasoning and content parts.
struct ReasoningSplit {
    /// Text that precedes the tool-call open marker (after any reasoning section).
    content: String,
    /// Text found between the reasoning markers; empty when there was none.
    reasoning: String,
}
1338
1339fn split_reasoning_prefix(
1340    input: &str,
1341    reasoning_markers: Option<&ReasoningMarkers>,
1342    tool_call_open: &str,
1343) -> ReasoningSplit {
1344    let content_only = || ReasoningSplit {
1345        reasoning: String::new(),
1346        content: prefix_before(input, tool_call_open),
1347    };
1348
1349    let Some(reasoning_markers) = reasoning_markers else {
1350        return content_only();
1351    };
1352    let Some(open_pos) = input.find(&reasoning_markers.open) else {
1353        return content_only();
1354    };
1355
1356    let after_open = &input[open_pos + reasoning_markers.open.len()..];
1357    let Some(close_offset) = after_open.find(&reasoning_markers.close) else {
1358        return content_only();
1359    };
1360
1361    let reasoning = after_open[..close_offset].to_owned();
1362    let after_close = &after_open[close_offset + reasoning_markers.close.len()..];
1363
1364    ReasoningSplit {
1365        reasoning,
1366        content: prefix_before(after_close, tool_call_open),
1367    }
1368}
1369
/// Returns the part of `text` that comes before the first occurrence of
/// `marker`, or all of `text` when the marker does not occur.
fn prefix_before(text: &str, marker: &str) -> String {
    let head = text.split_once(marker).map_or(text, |(head, _)| head);
    head.to_owned()
}
1374
1375fn synthesize_missing_tool_call_ids(tool_calls: &mut [ParsedToolCall]) {
1376    for (index, call) in tool_calls.iter_mut().enumerate() {
1377        if call.id.is_empty() {
1378            call.id = format!("call_{index}");
1379        }
1380    }
1381}
1382
1383fn invoke_detect_reasoning_markers(
1384    model: *const llama_cpp_bindings_sys::llama_model,
1385) -> Result<(Option<String>, Option<String>), MarkerDetectionError> {
1386    let mut out_open: *mut c_char = ptr::null_mut();
1387    let mut out_close: *mut c_char = ptr::null_mut();
1388    let mut out_error: *mut c_char = ptr::null_mut();
1389
1390    let status = unsafe {
1391        llama_cpp_bindings_sys::llama_rs_detect_reasoning_markers(
1392            model,
1393            &raw mut out_open,
1394            &raw mut out_close,
1395            &raw mut out_error,
1396        )
1397    };
1398
1399    let parsed = match status {
1400        llama_cpp_bindings_sys::LLAMA_RS_DETECT_REASONING_MARKERS_OK => {
1401            collect_optional_cstr_pair(out_open, out_close)
1402        }
1403        llama_cpp_bindings_sys::LLAMA_RS_DETECT_REASONING_MARKERS_ERROR_STRING_ALLOCATION_FAILED => {
1404            Err(MarkerDetectionError::NotEnoughMemory)
1405        }
1406        llama_cpp_bindings_sys::LLAMA_RS_DETECT_REASONING_MARKERS_VENDORED_THREW_CXX_EXCEPTION => {
1407            let message = unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1408            Err(MarkerDetectionError::ReasoningMarkerDetectionFailed { message })
1409        }
1410        other => unreachable!(
1411            "llama_rs_detect_reasoning_markers returned unrecognized status {other}"
1412        ),
1413    };
1414
1415    unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_open) };
1416    unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_close) };
1417    if !matches!(
1418        parsed,
1419        Err(MarkerDetectionError::ReasoningMarkerDetectionFailed { .. })
1420    ) {
1421        unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1422    }
1423
1424    parsed
1425}
1426
1427fn invoke_compute_tool_call_haystack(
1428    model: *const llama_cpp_bindings_sys::llama_model,
1429) -> Result<Option<String>, MarkerDetectionError> {
1430    let mut out_haystack: *mut c_char = ptr::null_mut();
1431    let mut out_error: *mut c_char = ptr::null_mut();
1432
1433    let status = unsafe {
1434        llama_cpp_bindings_sys::llama_rs_compute_tool_call_haystack(
1435            model,
1436            &raw mut out_haystack,
1437            &raw mut out_error,
1438        )
1439    };
1440
1441    let parsed = match status {
1442        llama_cpp_bindings_sys::LLAMA_RS_COMPUTE_TOOL_CALL_HAYSTACK_OK => {
1443            read_optional_owned_cstr(out_haystack)
1444        }
1445        llama_cpp_bindings_sys::LLAMA_RS_COMPUTE_TOOL_CALL_HAYSTACK_ERROR_STRING_ALLOCATION_FAILED => {
1446            Err(MarkerDetectionError::NotEnoughMemory)
1447        }
1448        llama_cpp_bindings_sys::LLAMA_RS_COMPUTE_TOOL_CALL_HAYSTACK_VENDORED_THREW_CXX_EXCEPTION => {
1449            let message = unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1450            Err(MarkerDetectionError::ToolCallHaystackComputationFailed { message })
1451        }
1452        other => unreachable!(
1453            "llama_rs_compute_tool_call_haystack returned unrecognized status {other}"
1454        ),
1455    };
1456
1457    unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_haystack) };
1458    if !matches!(
1459        parsed,
1460        Err(MarkerDetectionError::ToolCallHaystackComputationFailed { .. })
1461    ) {
1462        unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1463    }
1464
1465    parsed
1466}
1467
1468fn invoke_diagnose_tool_call_synthetic_renders(
1469    model: *const llama_cpp_bindings_sys::llama_model,
1470) -> Result<(Option<String>, Option<String>), MarkerDetectionError> {
1471    let mut out_no_tools: *mut c_char = ptr::null_mut();
1472    let mut out_with_tools: *mut c_char = ptr::null_mut();
1473    let mut out_error: *mut c_char = ptr::null_mut();
1474
1475    let status = unsafe {
1476        llama_cpp_bindings_sys::llama_rs_diagnose_tool_call_synthetic_renders(
1477            model,
1478            &raw mut out_no_tools,
1479            &raw mut out_with_tools,
1480            &raw mut out_error,
1481        )
1482    };
1483
1484    let parsed = match status {
1485        llama_cpp_bindings_sys::LLAMA_RS_DIAGNOSE_TOOL_CALL_SYNTHETIC_RENDERS_OK => {
1486            collect_optional_cstr_pair(out_no_tools, out_with_tools)
1487        }
1488        llama_cpp_bindings_sys::LLAMA_RS_DIAGNOSE_TOOL_CALL_SYNTHETIC_RENDERS_ERROR_STRING_ALLOCATION_FAILED => {
1489            Err(MarkerDetectionError::NotEnoughMemory)
1490        }
1491        llama_cpp_bindings_sys::LLAMA_RS_DIAGNOSE_TOOL_CALL_SYNTHETIC_RENDERS_VENDORED_THREW_CXX_EXCEPTION => {
1492            let message = unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1493            Err(MarkerDetectionError::ToolCallSyntheticRenderDiagnosisFailed { message })
1494        }
1495        other => unreachable!(
1496            "llama_rs_diagnose_tool_call_synthetic_renders returned unrecognized status {other}"
1497        ),
1498    };
1499
1500    unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_no_tools) };
1501    unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_with_tools) };
1502    if !matches!(
1503        parsed,
1504        Err(MarkerDetectionError::ToolCallSyntheticRenderDiagnosisFailed { .. })
1505    ) {
1506        unsafe { llama_cpp_bindings_sys::llama_rs_string_free(out_error) };
1507    }
1508
1509    parsed
1510}
1511
1512fn read_optional_owned_cstr(ptr: *const c_char) -> Result<Option<String>, MarkerDetectionError> {
1513    if ptr.is_null() {
1514        return Ok(None);
1515    }
1516
1517    let bytes = unsafe { CStr::from_ptr(ptr) }.to_bytes().to_vec();
1518
1519    Ok(Some(String::from_utf8(bytes)?))
1520}
1521
1522fn invoke_rs_tokenize(
1523    vocab: *const llama_cpp_bindings_sys::llama_vocab,
1524    text: *const c_char,
1525    text_len: c_int,
1526    tokens: *mut llama_cpp_bindings_sys::llama_token,
1527    n_tokens_max: c_int,
1528    add_bos: bool,
1529) -> Result<c_int, StringToTokenError> {
1530    let mut out_count: i32 = 0;
1531    let mut out_error: *mut c_char = ptr::null_mut();
1532    let status = unsafe {
1533        llama_cpp_bindings_sys::llama_rs_tokenize(
1534            vocab,
1535            text,
1536            text_len,
1537            tokens,
1538            n_tokens_max,
1539            add_bos,
1540            true,
1541            &raw mut out_count,
1542            &raw mut out_error,
1543        )
1544    };
1545    match status {
1546        llama_cpp_bindings_sys::LLAMA_RS_TOKENIZE_OK => Ok(out_count),
1547        llama_cpp_bindings_sys::LLAMA_RS_TOKENIZE_ERROR_STRING_ALLOCATION_FAILED => {
1548            Err(StringToTokenError::NotEnoughMemory)
1549        }
1550        llama_cpp_bindings_sys::LLAMA_RS_TOKENIZE_VENDORED_THREW_CXX_EXCEPTION => {
1551            let message = unsafe { crate::ffi_error_reader::read_and_free_cpp_error(out_error) };
1552            Err(StringToTokenError::Reported { message })
1553        }
1554        other => unreachable!("llama_rs_tokenize returned unrecognized status {other}"),
1555    }
1556}
1557
1558fn collect_optional_cstr_pair(
1559    first_ptr: *const c_char,
1560    second_ptr: *const c_char,
1561) -> Result<(Option<String>, Option<String>), MarkerDetectionError> {
1562    let first = read_optional_owned_cstr(first_ptr)?;
1563    let second = read_optional_owned_cstr(second_ptr)?;
1564    Ok((first, second))
1565}
1566
1567fn extract_meta_string<TCFunction>(
1568    c_function: TCFunction,
1569    capacity: usize,
1570) -> Result<String, MetaValError>
1571where
1572    TCFunction: Fn(*mut c_char, usize) -> i32,
1573{
1574    let mut buffer = vec![0u8; capacity];
1575    let result = c_function(buffer.as_mut_ptr().cast::<c_char>(), buffer.len());
1576
1577    if result < 0 {
1578        return Err(MetaValError::NegativeReturn(result));
1579    }
1580
1581    let returned_len = result.cast_unsigned() as usize;
1582
1583    if returned_len >= capacity {
1584        return extract_meta_string(c_function, returned_len + 1);
1585    }
1586
1587    if buffer.get(returned_len) != Some(&0) {
1588        return Err(MetaValError::NegativeReturn(-1));
1589    }
1590
1591    buffer.truncate(returned_len);
1592
1593    Ok(String::from_utf8(buffer)?)
1594}
1595
1596impl Drop for LlamaModel {
1597    fn drop(&mut self) {
1598        unsafe { llama_cpp_bindings_sys::llama_free_model(self.model.as_ptr()) }
1599    }
1600}
1601
/// Unit tests for `extract_meta_string` and a few sibling string helpers.
#[cfg(test)]
mod extract_meta_string_tests {
    use std::cell::Cell;
    use std::ffi::c_char;

    use super::extract_meta_string;
    use crate::MetaValError;

    /// Copies `bytes` to the start of the buffer handed to a callback.
    fn write_prefix(buf_ptr: *mut c_char, buf_len: usize, bytes: &[u8]) {
        // SAFETY: `extract_meta_string` passes the pointer and length of a live `Vec<u8>`.
        let buffer = unsafe { std::slice::from_raw_parts_mut(buf_ptr.cast::<u8>(), buf_len) };
        buffer[..bytes.len()].copy_from_slice(bytes);
    }

    #[test]
    fn returns_error_when_null_terminator_missing() {
        // The byte at the reported length (index 2) is `c`, not NUL.
        let outcome = extract_meta_string(
            |buf_ptr, buf_len| {
                write_prefix(buf_ptr, buf_len, b"abc");
                2
            },
            4,
        );

        assert_eq!(outcome.unwrap_err(), MetaValError::NegativeReturn(-1));
    }

    #[test]
    fn returns_error_for_negative_return_value() {
        let outcome = extract_meta_string(|_buf_ptr, _buf_len| -5, 4);

        assert_eq!(outcome.unwrap_err(), MetaValError::NegativeReturn(-5));
    }

    #[test]
    fn returns_error_for_invalid_utf8_data() {
        // Properly terminated, but the two payload bytes are not valid UTF-8.
        let outcome = extract_meta_string(
            |buf_ptr, buf_len| {
                write_prefix(buf_ptr, buf_len, &[0xFF, 0xFE, 0]);
                2
            },
            4,
        );

        assert!(outcome.is_err());

        let rendered = outcome.unwrap_err().to_string();
        assert!(rendered.contains("FromUtf8Error"));
    }

    #[test]
    fn triggers_buffer_resize_when_returned_len_exceeds_capacity() {
        let initial_capacity: usize = 4;
        let length_exceeding_initial_capacity = 10;
        let written_length = 2;
        let calls_so_far = Cell::new(0);

        let outcome = extract_meta_string(
            |buf_ptr, buf_len| {
                let is_first_call = calls_so_far.replace(calls_so_far.get() + 1) == 0;

                if is_first_call {
                    // Report a length that cannot fit, forcing a second attempt.
                    return length_exceeding_initial_capacity;
                }

                write_prefix(buf_ptr, buf_len, b"hi\0");
                written_length
            },
            initial_capacity,
        );

        assert_eq!(outcome.unwrap(), "hi");
    }

    #[test]
    fn cstring_with_validated_len_null_byte_returns_error() {
        let outcome = super::cstring_with_validated_len("null\0byte");

        assert!(outcome.is_err());
    }

    #[test]
    fn validate_string_length_overflow_returns_error() {
        let outcome = super::validate_string_length_for_tokenizer(usize::MAX);

        assert!(outcome.is_err());
    }

    #[test]
    fn truncated_buffer_to_string_with_invalid_utf8_returns_error() {
        let invalid_utf8 = vec![0xff, 0xfe, 0xfd];

        let outcome = super::truncated_buffer_to_string(invalid_utf8, 3);

        assert!(outcome.is_err());
    }
}