#[cfg(feature = "llm-llamacpp")]
use std::ffi::CString;
#[cfg(feature = "llm-llamacpp")]
use std::os::raw::{c_char, c_float, c_int, c_void};
#[cfg(feature = "llm-llamacpp")]
use std::ptr;
use crate::runtime_adapter::llm::ChatMessage;
use crate::runtime_adapter::AdapterError;
/// Owning handle to a model loaded through the llama.cpp C wrapper.
///
/// Dropping the handle releases the native model with `llama_free_model_c`.
#[cfg(feature = "llm-llamacpp")]
pub struct LlamaModel {
    /// Raw model pointer returned by the C wrapper; null once it has been released.
    ptr: *mut c_void,
}

// NOTE(review): these marker impls assert that the native model may be moved to and
// shared between threads. Nothing in this file enforces that — confirm against the
// wrapper's threading guarantees.
#[cfg(feature = "llm-llamacpp")]
unsafe impl Send for LlamaModel {}
#[cfg(feature = "llm-llamacpp")]
unsafe impl Sync for LlamaModel {}

#[cfg(feature = "llm-llamacpp")]
impl Drop for LlamaModel {
    /// Frees the native model at most once and leaves a null pointer behind.
    fn drop(&mut self) {
        // Detach the pointer first so the handle never keeps a dangling value.
        let raw = std::mem::replace(&mut self.ptr, std::ptr::null_mut());
        if !raw.is_null() {
            // SAFETY: `raw` is non-null and no longer reachable through `self`, so it
            // cannot be freed a second time via this handle.
            unsafe { llama_free_model_c(raw) };
        }
    }
}
/// Owning handle to an inference context created through the llama.cpp C wrapper.
///
/// Dropping the handle releases the native context with `llama_free_c`.
#[cfg(feature = "llm-llamacpp")]
pub struct LlamaContext {
    /// Raw context pointer returned by the C wrapper; null once it has been released.
    ptr: *mut c_void,
}

// NOTE(review): only `Send` is asserted here (no `Sync`), i.e. a context may move
// between threads but is not shared by reference — confirm this matches the wrapper.
#[cfg(feature = "llm-llamacpp")]
unsafe impl Send for LlamaContext {}

#[cfg(feature = "llm-llamacpp")]
impl Drop for LlamaContext {
    /// Frees the native context at most once and leaves a null pointer behind.
    fn drop(&mut self) {
        // Detach the pointer first so the handle never keeps a dangling value.
        let raw = std::mem::replace(&mut self.ptr, std::ptr::null_mut());
        if !raw.is_null() {
            // SAFETY: `raw` is non-null and no longer reachable through `self`, so it
            // cannot be freed a second time via this handle.
            unsafe { llama_free_c(raw) };
        }
    }
}
// Raw declarations of the C wrapper entry points (all carry a `_c` suffix).
// The error message in `llama_generate_with_stops` points at `llama_wrapper.cpp`
// as the place where the wrapper is implemented.
// Models and contexts cross the boundary as untyped `c_void` pointers; the safe
// wrappers further down in this file own those pointers via `LlamaModel` and
// `LlamaContext`.
#[cfg(feature = "llm-llamacpp")]
extern "C" {
// --- Backend lifecycle and logging ---
fn llama_backend_init_c();
fn llama_backend_free_c();
fn llama_log_set_verbosity_c(level: c_int);
fn llama_log_get_verbosity_c() -> c_int;
// --- Model / context creation and destruction ---
// Both constructors are treated by the Rust callers as returning null on failure.
fn llama_load_model_from_file_c(path_model: *const c_char, n_gpu_layers: c_int) -> *mut c_void;
fn llama_free_model_c(model: *mut c_void);
fn llama_new_context_with_model_c(
model: *mut c_void,
n_ctx: c_int,
n_threads: c_int,
n_batch: c_int,
flash_attn: bool,
) -> *mut c_void;
fn llama_free_c(ctx: *mut c_void);
fn llama_kv_cache_clear_c(ctx: *mut c_void);
// --- Tokenization ---
// The Rust callers first call this with a null buffer and `n_tokens_max == 0`
// and read the magnitude of the return value as the required token count.
fn llama_tokenize_c(
model: *const c_void,
text: *const c_char,
text_len: c_int,
tokens: *mut i32,
n_tokens_max: c_int,
add_special: bool,
parse_special: bool,
) -> c_int;
// Writes the text of one token into `buf`; the Rust caller treats a positive
// return as the piece length in bytes.
fn llama_token_to_piece_c(
model: *const c_void,
token: i32,
buf: *mut c_char,
length: c_int,
lstrip: c_int,
special: bool,
) -> c_int;
// --- Vocabulary / model metadata ---
fn llama_token_bos_c(model: *const c_void) -> i32;
fn llama_token_eos_c(model: *const c_void) -> i32;
fn llama_vocab_is_eog_c(model: *const c_void, token: i32) -> bool;
// May return null; the Rust wrapper maps null to `None`.
fn llama_model_chat_template_c(model: *const c_void) -> *const c_char;
fn llama_n_vocab_c(model: *const c_void) -> c_int;
fn llama_n_ctx_c(ctx: *const c_void) -> c_int;
fn llama_model_is_recurrent_c(model: *const c_void) -> bool;
fn llama_model_has_recurrent_state_c(model: *const c_void) -> bool;
// --- Low-level decode access (no safe wrapper is visible in this file) ---
fn llama_decode_c(ctx: *mut c_void, batch: *const c_void) -> c_int;
fn llama_get_logits_c(ctx: *mut c_void) -> *mut c_float;
// --- Chat templating ---
fn llama_chat_apply_template_c(
tmpl: *const c_char,
chat: *const c_void,
n_msg: usize,
add_ass: bool,
buf: *mut c_char,
length: c_int,
) -> c_int;
// `roles` and `contents` are parallel arrays of `n_msg` C strings. The Rust
// caller treats a negative return as failure and a return >= `buf_size` as
// "buffer too small, retry with a larger one".
fn llama_format_chat_with_model_c(
model: *const c_void,
roles: *const *const c_char,
contents: *const *const c_char,
n_msg: usize,
buf: *mut c_char,
buf_size: c_int,
) -> c_int;
// --- Generation ---
// Stop sequences are passed flattened: `stop_seqs` holds all stop tokens back
// to back and `stop_lens` holds the length of each of the `n_stop_seqs`
// sequences. The Rust caller maps return codes -1..-4 to error descriptions
// and treats a non-negative return as the number of tokens written to
// `output_tokens`.
fn llama_generate_c(
ctx: *mut c_void,
model: *const c_void,
input_tokens: *const i32,
n_input: c_int,
output_tokens: *mut i32,
max_tokens: c_int,
temperature: c_float,
top_p: c_float,
min_p: c_float,
top_k: c_int,
repeat_penalty: c_float,
seed: u32,
stop_seqs: *const i32,
stop_lens: *const c_int,
n_stop_seqs: c_int,
) -> c_int;
// Same parameters as `llama_generate_c` plus a per-token `callback`, the opaque
// `user_data` handed back to it, and `n_past_in`. The Rust caller treats
// -1..-4 as error codes and any other negative value as "stopped by callback"
// with the magnitude being the generated token count.
fn llama_generate_streaming_c(
ctx: *mut c_void,
model: *const c_void,
input_tokens: *const i32,
n_input: c_int,
output_tokens: *mut i32,
max_tokens: c_int,
temperature: c_float,
top_p: c_float,
min_p: c_float,
top_k: c_int,
repeat_penalty: c_float,
seed: u32,
stop_seqs: *const i32,
stop_lens: *const c_int,
n_stop_seqs: c_int,
callback: Option<TokenCallback>,
user_data: *mut c_void,
n_past_in: c_int,
) -> c_int;
// The safe wrapper discards this function's return value.
fn llama_kv_cache_seq_rm_c(ctx: *mut c_void, seq_id: c_int, p_keep: c_int) -> c_int;
}
/// C-ABI signature of the per-token callback passed to `llama_generate_streaming_c`.
///
/// Receives the token id, a C string with the token text (the trampoline in this
/// file tolerates a null pointer), and the opaque `user_data` pointer supplied at
/// call time. The trampoline in this file returns `0` after a successful callback
/// and `1` to ask the C side to stop streaming.
#[cfg(feature = "llm-llamacpp")]
pub type TokenCallback =
extern "C" fn(token_id: i32, token_text: *const c_char, user_data: *mut c_void) -> c_int;
/// Initializes the llama.cpp backend by calling the C wrapper's `llama_backend_init_c`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_backend_init() {
    // SAFETY: the wrapper function takes no arguments and returns nothing.
    unsafe { llama_backend_init_c() }
}
/// Tears down the llama.cpp backend by calling the C wrapper's `llama_backend_free_c`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_backend_free() {
    // SAFETY: the wrapper function takes no arguments and returns nothing.
    unsafe { llama_backend_free_c() }
}
/// Forwards `level` to the C wrapper's log-verbosity setter.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_log_set_verbosity(level: i32) {
    let c_level = level as c_int;
    // SAFETY: a plain integer is passed by value; no pointers are involved.
    unsafe { llama_log_set_verbosity_c(c_level) }
}
/// Returns the log verbosity currently reported by the C wrapper.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_log_get_verbosity() -> i32 {
    // SAFETY: the wrapper function takes no arguments and returns an integer by value.
    let level: c_int = unsafe { llama_log_get_verbosity_c() };
    level as i32
}
/// Loads a model file through the C wrapper and wraps it in an owning [`LlamaModel`].
///
/// # Errors
/// - `AdapterError::InvalidInput` if `path` contains an interior NUL byte.
/// - `AdapterError::RuntimeError` if the wrapper returns a null model pointer.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_load_model_from_file(
    path: &str,
    n_gpu_layers: i32,
) -> Result<LlamaModel, AdapterError> {
    let c_path = match CString::new(path) {
        Ok(converted) => converted,
        Err(_) => {
            return Err(AdapterError::InvalidInput(
                "Invalid path encoding".to_string(),
            ))
        }
    };

    // SAFETY: `c_path` is a valid NUL-terminated string that outlives the call.
    let raw = unsafe { llama_load_model_from_file_c(c_path.as_ptr(), n_gpu_layers as c_int) };

    if raw.is_null() {
        Err(AdapterError::RuntimeError(format!(
            "Failed to load model from {}",
            path
        )))
    } else {
        Ok(LlamaModel { ptr: raw })
    }
}
/// Explicitly releases a model handle.
///
/// Taking the handle by value and dropping it runs `LlamaModel`'s `Drop`
/// implementation, which frees a non-null native pointer exactly once and nulls it.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_free_model(model: LlamaModel) {
    drop(model);
}
/// Creates an inference context for `model` through the C wrapper.
///
/// # Parameters
/// - `n_ctx`, `n_threads`, `n_batch`: sizes forwarded to the wrapper as `c_int`.
/// - `flash_attn`: forwarded unchanged.
///
/// # Errors
/// - `AdapterError::InvalidInput` if any size does not fit in a `c_int`. Previously
///   such values were silently wrapped by an `as` cast.
/// - `AdapterError::RuntimeError` if the wrapper returns a null context pointer.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_new_context_with_model(
    model: &LlamaModel,
    n_ctx: usize,
    n_threads: usize,
    n_batch: usize,
    flash_attn: bool,
) -> Result<LlamaContext, AdapterError> {
    /// Converts a size to `c_int`, rejecting values the C ABI cannot represent.
    fn to_c_int(what: &str, value: usize) -> Result<c_int, AdapterError> {
        if value > c_int::MAX as usize {
            return Err(AdapterError::InvalidInput(format!(
                "{} ({}) exceeds the supported maximum of {}",
                what,
                value,
                c_int::MAX
            )));
        }
        Ok(value as c_int)
    }

    let n_ctx_c = to_c_int("n_ctx", n_ctx)?;
    let n_threads_c = to_c_int("n_threads", n_threads)?;
    let n_batch_c = to_c_int("n_batch", n_batch)?;

    // SAFETY: `model.ptr` comes from a live `LlamaModel`; all other arguments are
    // plain values that were range-checked above.
    let ptr = unsafe {
        llama_new_context_with_model_c(model.ptr, n_ctx_c, n_threads_c, n_batch_c, flash_attn)
    };
    if ptr.is_null() {
        return Err(AdapterError::RuntimeError(
            "Failed to create context".to_string(),
        ));
    }
    Ok(LlamaContext { ptr })
}
/// Explicitly releases a context handle.
///
/// Taking the handle by value and dropping it runs `LlamaContext`'s `Drop`
/// implementation, which frees a non-null native pointer exactly once and nulls it.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_free(ctx: LlamaContext) {
    drop(ctx);
}
/// Asks the C wrapper to clear the KV cache of `ctx`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_kv_cache_clear(ctx: &LlamaContext) {
    let raw_ctx = ctx.ptr;
    // SAFETY: the pointer is taken from a live `LlamaContext` handle.
    unsafe { llama_kv_cache_clear_c(raw_ctx) }
}
/// Forwards a KV-cache sequence removal request for `seq_id` to the C wrapper.
///
/// `p_keep` is clamped to `c_int::MAX` before crossing the FFI boundary. The
/// wrapper's return value is intentionally discarded.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_kv_cache_seq_rm(ctx: &LlamaContext, seq_id: i32, p_keep: usize) {
    let keep: c_int = if p_keep > c_int::MAX as usize {
        c_int::MAX
    } else {
        p_keep as c_int
    };
    // SAFETY: the pointer is taken from a live `LlamaContext` handle; the other
    // arguments are plain integers.
    let _status = unsafe { llama_kv_cache_seq_rm_c(ctx.ptr, seq_id, keep) };
}
/// Returns the token id the C wrapper reports from `llama_token_bos_c` for `model`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_token_bos(model: &LlamaModel) -> i32 {
    let raw_model = model.ptr;
    // SAFETY: the pointer is taken from a live `LlamaModel` handle.
    unsafe { llama_token_bos_c(raw_model) }
}
/// Returns the token id the C wrapper reports from `llama_token_eos_c` for `model`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_token_eos(model: &LlamaModel) -> i32 {
    let raw_model = model.ptr;
    // SAFETY: the pointer is taken from a live `LlamaModel` handle.
    unsafe { llama_token_eos_c(raw_model) }
}
/// Returns the C wrapper's `llama_vocab_is_eog_c` verdict for `token` under `model`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_vocab_is_eog(model: &LlamaModel, token: i32) -> bool {
    let raw_model = model.ptr;
    // SAFETY: the pointer is taken from a live `LlamaModel` handle.
    unsafe { llama_vocab_is_eog_c(raw_model, token) }
}
/// Returns the value reported by `llama_n_vocab_c` for `model`, cast to `usize`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_n_vocab(model: &LlamaModel) -> usize {
    // SAFETY: the pointer is taken from a live `LlamaModel` handle.
    let count: c_int = unsafe { llama_n_vocab_c(model.ptr) };
    count as usize
}
/// Returns the value reported by `llama_n_ctx_c` for `ctx`, cast to `usize`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_n_ctx(ctx: &LlamaContext) -> usize {
    // SAFETY: the pointer is taken from a live `LlamaContext` handle.
    let size: c_int = unsafe { llama_n_ctx_c(ctx.ptr) };
    size as usize
}
/// Returns the C wrapper's `llama_model_is_recurrent_c` answer for `model`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_model_is_recurrent(model: &LlamaModel) -> bool {
    let raw_model = model.ptr;
    // SAFETY: the pointer is taken from a live `LlamaModel` handle.
    unsafe { llama_model_is_recurrent_c(raw_model) }
}
/// Returns the C wrapper's `llama_model_has_recurrent_state_c` answer for `model`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_model_has_recurrent_state(model: &LlamaModel) -> bool {
    let raw_model = model.ptr;
    // SAFETY: the pointer is taken from a live `LlamaModel` handle.
    unsafe { llama_model_has_recurrent_state_c(raw_model) }
}
/// Returns the chat template string the C wrapper reports for `model`.
///
/// Yields `None` when the wrapper returns a null pointer or when the template is
/// not valid UTF-8.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_model_chat_template(model: &LlamaModel) -> Option<String> {
    // SAFETY: the pointer is taken from a live `LlamaModel` handle.
    let raw = unsafe { llama_model_chat_template_c(model.ptr) };
    if raw.is_null() {
        return None;
    }
    // SAFETY: `raw` is non-null; it is read as a NUL-terminated C string.
    let template = unsafe { std::ffi::CStr::from_ptr(raw) };
    match template.to_str() {
        Ok(text) => Some(text.to_string()),
        Err(_) => None,
    }
}
/// Stub used when the `llm-llamacpp` feature is disabled: no template is available,
/// so this always returns `None`.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_model_chat_template(_model: &LlamaModel) -> Option<String> {
None
}
/// Renders `messages` into a prompt using the model's own chat template.
///
/// The C wrapper writes into a 4096-byte buffer first. If it reports a length that
/// does not fit, the buffer is grown to that length plus one and the call is
/// repeated. Whenever the wrapper fails, reports a length outside the buffer, or
/// produces invalid UTF-8, the function falls back to the ChatML layout built by
/// `llama_format_chat_chatml`.
///
/// # Errors
/// `AdapterError::InvalidInput` if `messages` is empty or any role/content contains
/// an interior NUL byte.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_format_chat(
    model: &LlamaModel,
    messages: &[ChatMessage],
) -> Result<String, AdapterError> {
    if messages.is_empty() {
        return Err(AdapterError::InvalidInput("Empty messages".to_string()));
    }
    let roles: Vec<CString> = messages
        .iter()
        .map(|m| {
            CString::new(m.role.as_str()).map_err(|_| {
                AdapterError::InvalidInput(format!(
                    "Chat message role '{}' contains null byte",
                    m.role
                ))
            })
        })
        .collect::<Result<Vec<_>, _>>()?;
    let contents: Vec<CString> = messages
        .iter()
        .map(|m| {
            CString::new(m.content.as_str()).map_err(|_| {
                AdapterError::InvalidInput("Chat message content contains null byte".to_string())
            })
        })
        .collect::<Result<Vec<_>, _>>()?;
    // The pointer arrays borrow from `roles` / `contents`, which stay alive until
    // the end of this function.
    let role_ptrs: Vec<*const c_char> = roles.iter().map(|s| s.as_ptr()).collect();
    let content_ptrs: Vec<*const c_char> = contents.iter().map(|s| s.as_ptr()).collect();

    // Single place that performs the FFI call, so both attempts pass identical
    // arguments and the buffer size can never disagree with the buffer.
    let render = |out: &mut Vec<u8>| -> c_int {
        let capacity = out.len().min(c_int::MAX as usize) as c_int;
        // SAFETY: `model.ptr` comes from a live handle, both pointer arrays hold
        // `messages.len()` valid C strings, and `out` provides at least `capacity`
        // writable bytes.
        unsafe {
            llama_format_chat_with_model_c(
                model.ptr,
                role_ptrs.as_ptr(),
                content_ptrs.as_ptr(),
                messages.len(),
                out.as_mut_ptr() as *mut c_char,
                capacity,
            )
        }
    };

    let mut buf = vec![0u8; 4096];
    let first = render(&mut buf);
    if first < 0 {
        log::warn!(
            target: "xybrid_core",
            "Model chat template failed (code {}), falling back to ChatML format",
            first
        );
        return llama_format_chat_chatml(messages);
    }

    let mut len = first as usize;
    if len >= buf.len() {
        // The reported length does not fit: grow to the reported size and retry.
        buf.resize(len + 1, 0);
        let second = render(&mut buf);
        if second < 0 {
            log::warn!(
                target: "xybrid_core",
                "Model chat template failed (code {}), falling back to ChatML format",
                second
            );
            return llama_format_chat_chatml(messages);
        }
        len = second as usize;
    }

    // `get` instead of indexing: a reported length beyond the buffer falls back to
    // ChatML rather than panicking.
    match buf.get(..len).map(std::str::from_utf8) {
        Some(Ok(prompt)) => Ok(prompt.to_string()),
        _ => llama_format_chat_chatml(messages),
    }
}
/// Builds a ChatML prompt from `messages` without consulting the model.
///
/// Each message becomes an `<|im_start|>ROLE ... <|im_end|>` section. Roles other
/// than `system`, `user`, and `assistant` are emitted as `user`. The prompt ends
/// with an opened assistant section. This function never returns `Err`.
#[cfg(feature = "llm-llamacpp")]
fn llama_format_chat_chatml(messages: &[ChatMessage]) -> Result<String, AdapterError> {
    let mut prompt: String = messages
        .iter()
        .map(|msg| match msg.role.as_str() {
            "system" => format!("<|im_start|>system\n{}<|im_end|>\n", msg.content),
            "assistant" => format!("<|im_start|>assistant\n{}<|im_end|>\n", msg.content),
            // "user" and every unrecognized role share the user layout.
            _ => format!("<|im_start|>user\n{}<|im_end|>\n", msg.content),
        })
        .collect();
    prompt.push_str("<|im_start|>assistant\n");
    Ok(prompt)
}
/// Tokenizes `text` with the model's vocabulary, treating special-token markup in
/// the text as plain text (`parse_special = false`).
///
/// # Errors
/// - `AdapterError::InvalidInput` if `text` contains an interior NUL byte or is
///   longer than `c_int::MAX` bytes.
/// - `AdapterError::RuntimeError` if the wrapper reports a failure.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_tokenize(
    model: &LlamaModel,
    text: &str,
    add_special: bool,
) -> Result<Vec<i32>, AdapterError> {
    tokenize_with_flags(model, text, add_special, false)
}

/// Tokenizes `text` with the model's vocabulary, letting the tokenizer parse
/// special-token markup in the text (`parse_special = true`).
///
/// # Errors
/// Same as [`llama_tokenize`].
#[cfg(feature = "llm-llamacpp")]
pub fn llama_tokenize_special(
    model: &LlamaModel,
    text: &str,
    add_special: bool,
) -> Result<Vec<i32>, AdapterError> {
    tokenize_with_flags(model, text, add_special, true)
}

/// Shared two-pass tokenizer: probe for the token count, then fill a buffer.
#[cfg(feature = "llm-llamacpp")]
fn tokenize_with_flags(
    model: &LlamaModel,
    text: &str,
    add_special: bool,
    parse_special: bool,
) -> Result<Vec<i32>, AdapterError> {
    let c_text = CString::new(text)
        .map_err(|_| AdapterError::InvalidInput("Invalid text encoding".to_string()))?;

    // The byte length crosses the FFI boundary as `c_int`; reject anything that
    // would be silently truncated by the cast.
    if text.len() > c_int::MAX as usize {
        return Err(AdapterError::InvalidInput(format!(
            "Text of {} bytes is too long to tokenize",
            text.len()
        )));
    }
    let text_len = text.len() as c_int;

    // Pass 1: null buffer of size 0; the magnitude of the result is the token count.
    // SAFETY: `model.ptr` comes from a live handle, `c_text` is a valid C string of
    // `text_len` bytes, and a null output buffer is paired with capacity 0.
    let probe = unsafe {
        llama_tokenize_c(
            model.ptr,
            c_text.as_ptr(),
            text_len,
            ptr::null_mut(),
            0,
            add_special,
            parse_special,
        )
    };
    // `c_int::MIN` cannot be negated. llama.cpp documents INT32_MIN as its overflow
    // signal, so surface it as an error instead of panicking (debug) or returning an
    // empty result (release).
    if probe == c_int::MIN {
        return Err(AdapterError::RuntimeError(
            "Tokenization failed: token count exceeds the i32 range".to_string(),
        ));
    }
    let required = probe.abs();
    if required == 0 {
        return Ok(Vec::new());
    }

    // Pass 2: a little headroom on top of the probed size, saturating so the
    // capacity handed to C always matches the allocation.
    let capacity = required.saturating_add(16);
    let mut tokens = vec![0i32; capacity as usize];
    // SAFETY: as above, and `tokens` provides exactly `capacity` writable slots.
    let written = unsafe {
        llama_tokenize_c(
            model.ptr,
            c_text.as_ptr(),
            text_len,
            tokens.as_mut_ptr(),
            capacity,
            add_special,
            parse_special,
        )
    };
    if written < 0 {
        return Err(AdapterError::RuntimeError(
            "Tokenization failed".to_string(),
        ));
    }
    tokens.truncate(written as usize);
    Ok(tokens)
}
/// Converts `tokens` back into text by concatenating each token's piece.
///
/// The raw bytes of all pieces are collected first and decoded once at the end.
/// Decoding each piece on its own (the previous behavior) silently dropped any
/// multi-byte UTF-8 character split across adjacent tokens. Bytes that are still
/// invalid after concatenation are replaced with U+FFFD.
///
/// Tokens for which the wrapper reports zero bytes or an error are skipped. This
/// function currently never returns `Err`.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_detokenize(model: &LlamaModel, tokens: &[i32]) -> Result<String, AdapterError> {
    let mut bytes: Vec<u8> = Vec::new();
    // Scratch buffer reused for every token; grown on demand.
    let mut buf = vec![0u8; 256];

    for &token in tokens {
        let write_piece = |out: &mut Vec<u8>| -> c_int {
            let capacity = out.len().min(c_int::MAX as usize) as c_int;
            // SAFETY: `model.ptr` comes from a live handle and `out` provides at
            // least `capacity` writable bytes.
            unsafe {
                llama_token_to_piece_c(
                    model.ptr,
                    token,
                    out.as_mut_ptr() as *mut c_char,
                    capacity,
                    0,
                    true,
                )
            }
        };

        let first = write_piece(&mut buf);
        // Widen before taking the magnitude so `c_int::MIN` cannot overflow.
        let reported = i64::from(first).abs() as usize;
        // A positive length that fills the buffer triggers a retry, as before.
        // NOTE(review): llama.cpp documents a negative return as "buffer too small,
        // magnitude is the required size"; this assumes the `_c` wrapper passes that
        // through — confirm. Negative values that already fit are treated as errors.
        let too_small = if first > 0 {
            reported >= buf.len()
        } else {
            reported > buf.len()
        };
        let written = if too_small {
            buf.resize(reported + 1, 0);
            write_piece(&mut buf)
        } else {
            first
        };

        if written > 0 {
            // Clamp so an over-reported length can never slice past the buffer.
            let n = (written as usize).min(buf.len());
            bytes.extend_from_slice(&buf[..n]);
        }
    }

    Ok(String::from_utf8_lossy(&bytes).into_owned())
}
/// Bundle of sampling settings with the defaults defined below.
///
/// NOTE(review): no function visible in this file takes a `SamplingParams`; the
/// generation functions accept these values (plus `min_p`) as separate arguments —
/// confirm how callers use this struct.
#[cfg(feature = "llm-llamacpp")]
#[derive(Clone)]
pub struct SamplingParams {
/// Sampling temperature.
pub temperature: f32,
/// Top-p sampling threshold.
pub top_p: f32,
/// Top-k sampling cutoff.
pub top_k: usize,
/// Repetition penalty factor.
pub repeat_penalty: f32,
}
#[cfg(feature = "llm-llamacpp")]
impl Default for SamplingParams {
/// Defaults: temperature 0.7, top_p 0.9, top_k 40, repeat_penalty 1.1.
fn default() -> Self {
Self {
temperature: 0.7,
top_p: 0.9,
top_k: 40,
repeat_penalty: 1.1,
}
}
}
/// Runs a blocking generation through the C wrapper and returns the generated tokens.
///
/// # Parameters
/// - `input_tokens`: prompt tokens; must not be empty.
/// - `max_tokens`: capacity of the output buffer handed to the wrapper.
/// - `temperature`, `top_p`, `min_p`, `top_k`, `repeat_penalty`: sampling settings
///   forwarded to the wrapper (`top_k` is clamped to `c_int::MAX`).
/// - `stop_sequences`: each is tokenized with special-token parsing; sequences that
///   tokenize to nothing are skipped.
///
/// The sampling seed is derived from the current system time (42 if the clock is
/// before the Unix epoch).
///
/// # Errors
/// - `AdapterError::InvalidInput` if `input_tokens` is empty, or if the input length
///   or `max_tokens` does not fit in a `c_int` (previously silently wrapped).
/// - `AdapterError::RuntimeError` if the wrapper returns a negative code.
/// - Any error from tokenizing a stop sequence.
#[cfg(feature = "llm-llamacpp")]
// The parameter list mirrors the C wrapper's sampling knobs one-to-one.
#[allow(clippy::too_many_arguments)]
pub fn llama_generate_with_stops(
    ctx: &LlamaContext,
    model: &LlamaModel,
    input_tokens: &[i32],
    max_tokens: usize,
    temperature: f32,
    top_p: f32,
    min_p: f32,
    top_k: usize,
    repeat_penalty: f32,
    stop_sequences: &[String],
) -> Result<Vec<i32>, AdapterError> {
    /// Converts a size to `c_int`, rejecting values the C ABI cannot represent.
    fn to_c_int(what: &str, value: usize) -> Result<c_int, AdapterError> {
        if value > c_int::MAX as usize {
            return Err(AdapterError::InvalidInput(format!(
                "{} ({}) exceeds the supported maximum of {}",
                what,
                value,
                c_int::MAX
            )));
        }
        Ok(value as c_int)
    }

    if input_tokens.is_empty() {
        return Err(AdapterError::InvalidInput("Empty input tokens".to_string()));
    }
    // Validate sizes before allocating, so the buffer and the count passed to C
    // can never disagree.
    let n_input = to_c_int("input token count", input_tokens.len())?;
    let max_tokens_c = to_c_int("max_tokens", max_tokens)?;
    let top_k_c = top_k.min(c_int::MAX as usize) as c_int;

    // Stop sequences are flattened: all tokens back to back plus one length each.
    let mut stop_tokens: Vec<i32> = Vec::new();
    let mut stop_lens: Vec<c_int> = Vec::new();
    for seq in stop_sequences {
        let tokens = llama_tokenize_special(model, seq, false)?;
        log::debug!(
            target: "xybrid_core",
            "Tokenized stop sequence '{}' -> {:?} ({} tokens)",
            seq, tokens, tokens.len()
        );
        if !tokens.is_empty() {
            stop_lens.push(tokens.len() as c_int);
            stop_tokens.extend(tokens);
        }
    }
    log::debug!(
        target: "xybrid_core",
        "Total stop tokens: {:?}, lengths: {:?}",
        stop_tokens, stop_lens
    );

    let mut output_tokens = vec![0i32; max_tokens];
    // Truncating the nanosecond count to 32 bits is intentional: only a varying
    // seed is needed.
    let seed = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_nanos() as u32)
        .unwrap_or(42);
    // The count is taken from `stop_lens` so it matches the filtered sequences.
    let (stop_seqs_ptr, stop_lens_ptr, n_stop_seqs) = if stop_lens.is_empty() {
        (ptr::null(), ptr::null(), 0)
    } else {
        (
            stop_tokens.as_ptr(),
            stop_lens.as_ptr(),
            stop_lens.len() as c_int,
        )
    };

    // SAFETY: both handles are live; `input_tokens` holds `n_input` elements,
    // `output_tokens` holds `max_tokens_c` writable elements, and the stop arrays
    // are either both null with count 0 or both valid for `n_stop_seqs` sequences.
    let result = unsafe {
        llama_generate_c(
            ctx.ptr,
            model.ptr,
            input_tokens.as_ptr(),
            n_input,
            output_tokens.as_mut_ptr(),
            max_tokens_c,
            temperature,
            top_p,
            min_p,
            top_k_c,
            repeat_penalty,
            seed,
            stop_seqs_ptr,
            stop_lens_ptr,
            n_stop_seqs,
        )
    };
    if result < 0 {
        let detail = match result {
            -1 => "invalid arguments (null context/model/input or non-positive sizes)",
            -2 => "sampler chain creation failed",
            -3 => {
                "llama_decode failed on prefill \
                 (the wrapper logs the actual llama_decode return code + chunk \
                 position to stderr; see `llama_generate_c` in llama_wrapper.cpp)"
            }
            -4 => "input exceeds context window",
            _ => "unknown",
        };
        return Err(AdapterError::RuntimeError(format!(
            "Generation failed with error code {} ({})",
            result, detail
        )));
    }
    output_tokens.truncate(result as usize);
    Ok(output_tokens)
}
/// Convenience wrapper around `llama_generate_with_stops` with fixed extras:
/// `min_p = 0.05`, `repeat_penalty = 1.1`, and no stop sequences.
#[cfg(feature = "llm-llamacpp")]
pub fn llama_generate(
    ctx: &LlamaContext,
    model: &LlamaModel,
    input_tokens: &[i32],
    max_tokens: usize,
    temperature: f32,
    top_p: f32,
    top_k: usize,
) -> Result<Vec<i32>, AdapterError> {
    const DEFAULT_MIN_P: f32 = 0.05;
    const DEFAULT_REPEAT_PENALTY: f32 = 1.1;
    let no_stop_sequences: &[String] = &[];

    llama_generate_with_stops(
        ctx,
        model,
        input_tokens,
        max_tokens,
        temperature,
        top_p,
        DEFAULT_MIN_P,
        top_k,
        DEFAULT_REPEAT_PENALTY,
        no_stop_sequences,
    )
}
/// State shared with `streaming_trampoline` through the C side's `user_data` pointer.
///
/// Holds the caller's per-token callback and a slot where the trampoline stores the
/// callback's error, so the Rust side can read it after the C call returns.
#[cfg(feature = "llm-llamacpp")]
struct StreamingContext<'a, F>
where
F: FnMut(i32, &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>>,
{
/// Caller-supplied callback invoked with `(token_id, token_text)`.
callback: &'a mut F,
/// Error returned by the callback, if any; `None` until a callback fails.
error: Option<Box<dyn std::error::Error + Send + Sync>>,
}
/// C-ABI shim that forwards one streamed token to the Rust callback stored in the
/// `StreamingContext<F>` behind `user_data`.
///
/// A null `token_text` or one that is not valid UTF-8 is passed to the callback as
/// an empty string. Returns `0` when the callback succeeds. When it fails, the error
/// is stored in the context and `1` is returned to stop the stream.
#[cfg(feature = "llm-llamacpp")]
extern "C" fn streaming_trampoline<F>(
    token_id: i32,
    token_text: *const c_char,
    user_data: *mut c_void,
) -> c_int
where
    F: FnMut(i32, &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>>,
{
    // SAFETY: the caller passes a pointer to a live `StreamingContext<F>` as
    // `user_data` and does not touch it while the C call is running.
    let state = unsafe { &mut *(user_data as *mut StreamingContext<F>) };

    let mut text = "";
    if !token_text.is_null() {
        // SAFETY: `token_text` is non-null; it is read as a NUL-terminated C string.
        let piece = unsafe { std::ffi::CStr::from_ptr(token_text) };
        if let Ok(valid) = piece.to_str() {
            text = valid;
        }
    }

    if let Err(e) = (state.callback)(token_id, text) {
        state.error = Some(e);
        return 1;
    }
    0
}
/// Runs a streaming generation through the C wrapper, invoking `on_token` for every
/// generated token.
///
/// # Parameters
/// - `input_tokens`: prompt tokens; must not be empty.
/// - `max_tokens`: capacity of the output buffer handed to the wrapper.
/// - `temperature`, `top_p`, `min_p`, `top_k`, `repeat_penalty`: sampling settings
///   forwarded to the wrapper (`top_k` is clamped to `c_int::MAX`).
/// - `stop_sequences`: each is tokenized with special-token parsing; sequences that
///   tokenize to nothing are skipped.
/// - `on_token`: called with `(token_id, token_text)`; returning `Err` stops the
///   stream and that error is returned to the caller.
/// - `n_past_in`: forwarded to the wrapper, clamped to `c_int::MAX`.
///
/// # Returns
/// The generated tokens and a flag that is `true` when the wrapper reported that
/// the callback stopped it (a negative result outside -4..=-1).
///
/// # Errors
/// - `AdapterError::InvalidInput` if `input_tokens` is empty, or if the input length
///   or `max_tokens` does not fit in a `c_int` (previously silently wrapped).
/// - The callback's own error, converted via
///   `AdapterError::from_streaming_callback_error`.
/// - `AdapterError::RuntimeError` if the wrapper returns an error code in -4..=-1.
/// - Any error from tokenizing a stop sequence.
#[cfg(feature = "llm-llamacpp")]
#[allow(clippy::too_many_arguments)]
pub fn llama_generate_streaming<F>(
    ctx: &LlamaContext,
    model: &LlamaModel,
    input_tokens: &[i32],
    max_tokens: usize,
    temperature: f32,
    top_p: f32,
    min_p: f32,
    top_k: usize,
    repeat_penalty: f32,
    stop_sequences: &[String],
    mut on_token: F,
    n_past_in: usize,
) -> Result<(Vec<i32>, bool), AdapterError>
where
    F: FnMut(i32, &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>>,
{
    /// Converts a size to `c_int`, rejecting values the C ABI cannot represent.
    fn to_c_int(what: &str, value: usize) -> Result<c_int, AdapterError> {
        if value > c_int::MAX as usize {
            return Err(AdapterError::InvalidInput(format!(
                "{} ({}) exceeds the supported maximum of {}",
                what,
                value,
                c_int::MAX
            )));
        }
        Ok(value as c_int)
    }

    if input_tokens.is_empty() {
        return Err(AdapterError::InvalidInput("Empty input tokens".to_string()));
    }
    // Validate sizes before allocating, so the buffer and the count passed to C
    // can never disagree.
    let n_input = to_c_int("input token count", input_tokens.len())?;
    let max_tokens_c = to_c_int("max_tokens", max_tokens)?;
    let top_k_c = top_k.min(c_int::MAX as usize) as c_int;

    // Stop sequences are flattened: all tokens back to back plus one length each.
    let mut stop_tokens: Vec<i32> = Vec::new();
    let mut stop_lens: Vec<c_int> = Vec::new();
    for seq in stop_sequences {
        let tokens = llama_tokenize_special(model, seq, false)?;
        if !tokens.is_empty() {
            stop_lens.push(tokens.len() as c_int);
            stop_tokens.extend(tokens);
        }
    }

    let mut output_tokens = vec![0i32; max_tokens];
    // Truncating the nanosecond count to 32 bits is intentional: only a varying
    // seed is needed.
    let seed = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_nanos() as u32)
        .unwrap_or(42);
    // The count is taken from `stop_lens` so it matches the filtered sequences.
    let (stop_seqs_ptr, stop_lens_ptr, n_stop_seqs) = if stop_lens.is_empty() {
        (ptr::null(), ptr::null(), 0)
    } else {
        (
            stop_tokens.as_ptr(),
            stop_lens.as_ptr(),
            stop_lens.len() as c_int,
        )
    };

    let mut streaming_ctx = StreamingContext {
        callback: &mut on_token,
        error: None,
    };
    // SAFETY: both handles are live; `input_tokens` holds `n_input` elements,
    // `output_tokens` holds `max_tokens_c` writable elements, the stop arrays are
    // either both null with count 0 or both valid, and `streaming_ctx` outlives the
    // call and has exactly the type the trampoline casts `user_data` back to.
    let result = unsafe {
        llama_generate_streaming_c(
            ctx.ptr,
            model.ptr,
            input_tokens.as_ptr(),
            n_input,
            output_tokens.as_mut_ptr(),
            max_tokens_c,
            temperature,
            top_p,
            min_p,
            top_k_c,
            repeat_penalty,
            seed,
            stop_seqs_ptr,
            stop_lens_ptr,
            n_stop_seqs,
            Some(streaming_trampoline::<F>),
            &mut streaming_ctx as *mut StreamingContext<F> as *mut c_void,
            n_past_in.min(c_int::MAX as usize) as c_int,
        )
    };

    // Check the callback error first. A stored error proves the callback ran and
    // asked to stop, so it must win over the numeric result. Otherwise a stop
    // reported as a small negative count could be mistaken for error codes -1..-4
    // and the typed callback error would be lost.
    // NOTE(review): assumes error codes -1..-4 are only produced before the first
    // callback invocation — confirm against `llama_generate_streaming_c`.
    if let Some(err) = streaming_ctx.error {
        return Err(AdapterError::from_streaming_callback_error(err));
    }
    if (-4..=-1).contains(&result) {
        let detail = match result {
            -1 => "invalid arguments (null context/model/input or non-positive sizes)",
            -2 => "sampler chain creation failed",
            -3 => {
                "llama_decode failed on prefill (KV-cache state mismatch likely; \
                 see stderr for the wrapper-level diagnostic line emitted by \
                 `llama_generate_streaming_c`)"
            }
            -4 => "input + prefix exceeds context window (n_past_in + n_input >= n_ctx)",
            _ => "unknown",
        };
        return Err(AdapterError::RuntimeError(format!(
            "Generation failed with error code {} ({}; n_past_in={})",
            result, detail, n_past_in
        )));
    }

    // Any remaining negative value means "stopped by callback"; its magnitude is the
    // token count. Widen before taking the magnitude so `c_int::MIN` cannot overflow.
    let stopped_by_callback = result < 0;
    let n_generated = i64::from(result).abs() as usize;
    output_tokens.truncate(n_generated);
    Ok((output_tokens, stopped_by_callback))
}
/// Placeholder model type used when the `llm-llamacpp` feature is disabled, so the
/// stub functions below keep the same signatures as the real bindings.
#[cfg(not(feature = "llm-llamacpp"))]
pub struct LlamaModel;
/// Placeholder context type used when the `llm-llamacpp` feature is disabled.
#[cfg(not(feature = "llm-llamacpp"))]
pub struct LlamaContext;
/// No-op stub used when the `llm-llamacpp` feature is disabled.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_backend_init() {}
/// No-op stub used when the `llm-llamacpp` feature is disabled.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_backend_free() {}
/// No-op stub used when the `llm-llamacpp` feature is disabled; `_level` is ignored.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_log_set_verbosity(_level: i32) {}
/// Stub used when the `llm-llamacpp` feature is disabled; always reports verbosity 0.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_log_get_verbosity() -> i32 {
    const DISABLED_VERBOSITY: i32 = 0;
    DISABLED_VERBOSITY
}
/// Stub used when the `llm-llamacpp` feature is disabled; always fails with
/// `AdapterError::RuntimeError`.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_load_model_from_file(
    _path: &str,
    _n_gpu_layers: i32,
) -> Result<LlamaModel, AdapterError> {
    let reason = String::from("llm-llamacpp feature not enabled");
    Err(AdapterError::RuntimeError(reason))
}
/// No-op stub used when the `llm-llamacpp` feature is disabled; the placeholder
/// handle is simply consumed.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_free_model(_model: LlamaModel) {}
/// Stub used when the `llm-llamacpp` feature is disabled; always fails with
/// `AdapterError::RuntimeError`.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_new_context_with_model(
    _model: &LlamaModel,
    _n_ctx: usize,
    _n_threads: usize,
    _n_batch: usize,
    _flash_attn: bool,
) -> Result<LlamaContext, AdapterError> {
    let reason = String::from("llm-llamacpp feature not enabled");
    Err(AdapterError::RuntimeError(reason))
}
/// No-op stub used when the `llm-llamacpp` feature is disabled; the placeholder
/// handle is simply consumed.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_free(_ctx: LlamaContext) {}
/// No-op stub used when the `llm-llamacpp` feature is disabled.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_kv_cache_clear(_ctx: &LlamaContext) {}
/// No-op stub used when the `llm-llamacpp` feature is disabled; all arguments are ignored.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_kv_cache_seq_rm(_ctx: &LlamaContext, _seq_id: i32, _p_keep: usize) {}
/// Stub used when the `llm-llamacpp` feature is disabled; always fails with
/// `AdapterError::RuntimeError`.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_format_chat(
    _model: &LlamaModel,
    _messages: &[ChatMessage],
) -> Result<String, AdapterError> {
    let reason = String::from("llm-llamacpp feature not enabled");
    Err(AdapterError::RuntimeError(reason))
}
/// Stub used when the `llm-llamacpp` feature is disabled; always fails with
/// `AdapterError::RuntimeError`.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_tokenize(
    _model: &LlamaModel,
    _text: &str,
    _add_special: bool,
) -> Result<Vec<i32>, AdapterError> {
    let reason = String::from("llm-llamacpp feature not enabled");
    Err(AdapterError::RuntimeError(reason))
}
/// Stub used when the `llm-llamacpp` feature is disabled; always fails with
/// `AdapterError::RuntimeError`.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_tokenize_special(
    _model: &LlamaModel,
    _text: &str,
    _add_special: bool,
) -> Result<Vec<i32>, AdapterError> {
    let reason = String::from("llm-llamacpp feature not enabled");
    Err(AdapterError::RuntimeError(reason))
}
/// Stub used when the `llm-llamacpp` feature is disabled; always fails with
/// `AdapterError::RuntimeError`.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_detokenize(_model: &LlamaModel, _tokens: &[i32]) -> Result<String, AdapterError> {
    let reason = String::from("llm-llamacpp feature not enabled");
    Err(AdapterError::RuntimeError(reason))
}
/// Stub used when the `llm-llamacpp` feature is disabled; always fails with
/// `AdapterError::RuntimeError`.
#[cfg(not(feature = "llm-llamacpp"))]
// The signature must match the feature-enabled binding, which has one parameter per
// sampling knob; the streaming stub carries the same allowance.
#[allow(clippy::too_many_arguments)]
pub fn llama_generate_with_stops(
    _ctx: &LlamaContext,
    _model: &LlamaModel,
    _input_tokens: &[i32],
    _max_tokens: usize,
    _temperature: f32,
    _top_p: f32,
    _min_p: f32,
    _top_k: usize,
    _repeat_penalty: f32,
    _stop_sequences: &[String],
) -> Result<Vec<i32>, AdapterError> {
    Err(AdapterError::RuntimeError(
        "llm-llamacpp feature not enabled".to_string(),
    ))
}
/// Stub used when the `llm-llamacpp` feature is disabled; always fails with
/// `AdapterError::RuntimeError`.
#[cfg(not(feature = "llm-llamacpp"))]
pub fn llama_generate(
    _ctx: &LlamaContext,
    _model: &LlamaModel,
    _input_tokens: &[i32],
    _max_tokens: usize,
    _temperature: f32,
    _top_p: f32,
    _top_k: usize,
) -> Result<Vec<i32>, AdapterError> {
    let reason = String::from("llm-llamacpp feature not enabled");
    Err(AdapterError::RuntimeError(reason))
}
/// Stub used when the `llm-llamacpp` feature is disabled; never invokes `_on_token`
/// and always fails with `AdapterError::RuntimeError`.
#[cfg(not(feature = "llm-llamacpp"))]
#[allow(clippy::too_many_arguments)]
pub fn llama_generate_streaming<F>(
    _ctx: &LlamaContext,
    _model: &LlamaModel,
    _input_tokens: &[i32],
    _max_tokens: usize,
    _temperature: f32,
    _top_p: f32,
    _min_p: f32,
    _top_k: usize,
    _repeat_penalty: f32,
    _stop_sequences: &[String],
    _on_token: F,
    _n_past_in: usize,
) -> Result<(Vec<i32>, bool), AdapterError>
where
    F: FnMut(i32, &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>>,
{
    let reason = String::from("llm-llamacpp feature not enabled");
    Err(AdapterError::RuntimeError(reason))
}
#[cfg(test)]
mod tests {
    use std::os::raw::{c_int, c_void};

    /// Empty tokenizations are skipped, so the sequence count must follow the
    /// filtered length list rather than the number of requested stop sequences.
    #[test]
    fn test_stop_sequence_count_matches_filtered_lens() {
        let tokenize_results: Vec<Vec<i32>> = vec![vec![32000, 32001], vec![], vec![32002]];

        let kept: Vec<&Vec<i32>> = tokenize_results
            .iter()
            .filter(|tokens| !tokens.is_empty())
            .collect();
        let stop_lens: Vec<c_int> = kept.iter().map(|tokens| tokens.len() as c_int).collect();
        let stop_tokens: Vec<i32> = kept
            .iter()
            .flat_map(|tokens| tokens.iter().copied())
            .collect();

        let n_stop_seqs = stop_lens.len() as c_int;
        assert_eq!(n_stop_seqs, 2, "n_stop_seqs must match stop_lens.len()");
        assert_eq!(stop_lens.len(), 2);
        assert_eq!(stop_tokens.len(), 3);
        assert_eq!(stop_lens[0], 2);
        assert_eq!(stop_lens[1], 1);
    }

    /// A failing callback must make the trampoline return 1 quickly and keep the
    /// typed abort error available to the Rust side.
    #[cfg(feature = "llm-llamacpp")]
    #[test]
    fn streaming_trampoline_preserves_cloud_fallback_abort_marker() {
        use super::{streaming_trampoline, StreamingContext};
        use crate::abort::{cloud_fallback_reason_from_error, AbortReason, CloudFallbackAbort};
        use std::ffi::CString;
        use std::time::{Duration, Instant};

        type Callback = fn(i32, &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>>;

        fn abort_callback(
            _token_id: i32,
            _text: &str,
        ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            Err(Box::new(CloudFallbackAbort::new(AbortReason::StressMemory)))
        }

        let mut callback: Callback = abort_callback;
        let mut state = StreamingContext {
            callback: &mut callback,
            error: None,
        };
        let token_text = CString::new("token").unwrap();
        let user_data = &mut state as *mut StreamingContext<Callback> as *mut c_void;

        let started = Instant::now();
        let stop = streaming_trampoline::<Callback>(42, token_text.as_ptr(), user_data);
        let elapsed = started.elapsed();

        assert_eq!(stop, 1, "callback errors must stop the C stream");
        assert!(
            elapsed <= Duration::from_millis(50),
            "llama.cpp trampoline abort exceeded M-series cancellation budget: {:?}",
            elapsed
        );
        let err = state.error.take().expect("callback error must be stored");
        assert_eq!(
            cloud_fallback_reason_from_error(err.as_ref()),
            Some(AbortReason::StressMemory),
            "llama.cpp trampoline must keep the typed CloudFallbackAbort marker for the Rust layer"
        );
    }

    /// After a resize-and-retry, the length of the second call is the one to use.
    #[test]
    fn test_format_chat_retry_uses_correct_length() {
        let buf_len: usize = 4096;
        let result: c_int = 5000;
        let needs_retry = result as usize >= buf_len;
        assert!(needs_retry, "Should trigger resize path");

        let _new_buf_len = (result as usize) + 1;
        let retry_result: c_int = 4998;
        let chosen = if needs_retry { retry_result } else { result };
        let len = chosen as usize;

        assert_eq!(
            len, 4998,
            "Must use retry_result (4998), not first result (5000)"
        );
    }

    /// Inputs of `n_ctx` tokens or more are rejected; anything shorter is accepted.
    #[test]
    fn test_context_window_bounds_check() {
        let n_ctx: usize = 4096;

        let rejected: [(usize, &str); 2] = [
            (4096, "Input at context limit should be rejected"),
            (5000, "Input exceeding context limit should be rejected"),
        ];
        for &(n_tokens, message) in rejected.iter() {
            let tokens = vec![0i32; n_tokens];
            assert!(tokens.len() >= n_ctx, "{}", message);
        }

        let accepted: [(usize, &str); 2] = [
            (2000, "Input within context limit should be accepted"),
            (
                4095,
                "Input at n_ctx-1 should be accepted (room for 1 generated token)",
            ),
        ];
        for &(n_tokens, message) in accepted.iter() {
            let tokens = vec![0i32; n_tokens];
            assert!(tokens.len() < n_ctx, "{}", message);
        }
    }

    /// The batch size grows to the input length whenever the input exceeds the
    /// fixed default of 512.
    #[test]
    fn test_batch_size_must_fit_input_tokens() {
        let fixed_batch_size: usize = 512;
        let pick_batch = |n_input: usize| -> usize {
            if n_input > fixed_batch_size {
                n_input
            } else {
                fixed_batch_size
            }
        };

        let small_input = 100;
        assert!(pick_batch(small_input) >= small_input);

        let large_input = 2000;
        let batch_size = pick_batch(large_input);
        assert_eq!(batch_size, 2000, "Batch must grow to fit large input");
        assert!(batch_size >= large_input);

        let exact_input = 512;
        let batch_size = pick_batch(exact_input);
        assert_eq!(batch_size, 512);
        assert!(batch_size >= exact_input);

        let over_input = 513;
        let batch_size = pick_batch(over_input);
        assert_eq!(
            batch_size, 513,
            "Batch must not use fixed 512 when input is 513"
        );
    }
}