use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use crate::error::CoreResult;
/// Boxed callback type accepted by `LlmBackend::generate` as its `on_token` argument.
///
/// The closure is `FnMut`, so it may mutate state it captured between invocations,
/// and `Send`, so the box can be moved to another thread.
///
/// NOTE(review): the meaning of the three arguments is not defined in this file.
/// Judging by the alias name they are presumably the token text (`&str`), a token
/// count or index (`u32`) and a timing/throughput figure (`f64`) -- confirm against
/// the backend implementations before relying on this.
pub type TokenCallback = Box<dyn FnMut(&str, u32, f64) + Send>;
/// Settings handed by reference to `LlmBackend::generate`.
///
/// A ready-made value is available through `InferenceParams::default()`; the
/// per-field defaults quoted below are the ones that implementation assigns.
#[derive(Debug, Clone)]
pub struct InferenceParams {
    /// Token limit for a generation (default `2048`).
    ///
    /// NOTE(review): presumably an upper bound on generated tokens -- confirm how
    /// backends enforce it.
    pub max_tokens: u32,

    /// Temperature setting (default `0.7`).
    ///
    /// NOTE(review): presumably the sampling temperature -- confirm with the backend.
    pub temperature: f32,

    /// Context size (default `4096`).
    ///
    /// NOTE(review): presumably measured in tokens -- confirm with the backend.
    pub context_size: u32,

    /// Number of threads to use. The default is derived from
    /// `std::thread::available_parallelism()`.
    pub n_threads: u32,
}
impl Default for InferenceParams {
fn default() -> Self {
let default_threads = std::thread::available_parallelism()
.map(|n| (n.get() as u32).saturating_sub(2).max(1))
.unwrap_or(4);
Self {
max_tokens: 2048,
temperature: 0.7,
context_size: 4096,
n_threads: default_threads,
}
}
}
/// Value returned (inside `CoreResult`) by `LlmBackend::generate`.
///
/// Carries the produced text together with token counts and timing figures.
/// How each figure is measured is up to the backend that fills the struct in.
#[derive(Debug, Clone)]
pub struct GenerationResult {
    /// The generated text.
    pub text: String,

    /// Number of tokens generated.
    pub tokens_generated: u32,

    /// Number of tokens in the prompt.
    pub prompt_tokens: u32,

    /// Generation throughput, in tokens per second.
    pub tokens_per_sec: f64,

    /// Time to the first token, in milliseconds.
    ///
    /// NOTE(review): the starting point of this measurement is not visible
    /// here -- confirm with the backend.
    pub time_to_first_token_ms: f64,

    /// Generation time, in milliseconds.
    ///
    /// NOTE(review): whether this includes prompt processing is not visible
    /// here -- confirm with the backend.
    pub generation_time_ms: f64,
}
/// Interface implemented by text-generation backends.
///
/// The `Send + Sync` supertraits require every implementor to be shareable
/// across threads.
pub trait LlmBackend: Send + Sync {
    /// Generates text for `prompt` using the settings in `params`.
    ///
    /// * `abort` -- shared atomic flag. NOTE(review): presumably polled by the
    ///   backend so a caller can cancel a running generation -- confirm in the
    ///   implementations.
    /// * `on_token` -- callback owned by this call. NOTE(review): presumably
    ///   invoked as tokens are produced -- confirm in the implementations.
    ///
    /// Returns a `GenerationResult` wrapped in `CoreResult`.
    ///
    /// # Errors
    ///
    /// Failures are reported through `CoreResult`; the exact conditions are
    /// defined by each implementation.
    fn generate(
        &self,
        prompt: &str,
        params: &InferenceParams,
        abort: Arc<AtomicBool>,
        on_token: TokenCallback,
    ) -> CoreResult<GenerationResult>;

    /// Returns the token count for `text` as a `u32`.
    ///
    /// # Errors
    ///
    /// Failures are reported through `CoreResult`; the exact conditions are
    /// defined by each implementation.
    fn tokenize_count(&self, text: &str) -> CoreResult<u32>;

    /// Reports whether the backend is ready.
    ///
    /// NOTE(review): what "ready" entails (e.g. a model being loaded) is
    /// decided by the implementor -- confirm per backend.
    fn is_ready(&self) -> bool;
}