
llama_engine/lib.rs

//! # llama-engine
//!
//! The "narrow waist" of the llama.rs stack. Defines the core [`LlamaEngine`] trait
//! and associated types that all other crates depend on. Implementations can swap
//! CPU/Metal/FFI backends without changing application code.
//!
//! ## Design Notes
//!
//! ### Interior Mutability
//! `LlamaEngine` methods take `&self` (not `&mut self`) so that a single engine can
//! be shared across multiple sessions and drive concurrent inference without
//! exclusive borrows or external synchronization at call sites. Backends that
//! mutate internal state are responsible for their own synchronization (e.g.,
//! `Mutex`, `RwLock`) to keep that state thread-safe.
//!
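//! A minimal sketch of this pattern (the `CpuEngine` and `CpuState` names are
//! hypothetical, not part of this crate):
//!
//! ```ignore
//! use std::sync::Mutex;
//!
//! struct CpuState { /* KV cache, scratch buffers, ... */ }
//!
//! struct CpuEngine {
//!     // Mutable runtime state lives behind a lock, so trait methods can
//!     // take `&self` while still advancing it.
//!     state: Mutex<CpuState>,
//! }
//!
//! impl CpuEngine {
//!     fn step(&self) {
//!         // Hold the lock only for the duration of one step.
//!         let mut _state = self.state.lock().unwrap();
//!         // ... mutate the KV cache here ...
//!     }
//! }
//! ```
//!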
//! ### Token Type
//! `TokenId` is aliased to `i32` for FFI compatibility, though token IDs are logically
//! non-negative. This will be reconsidered if a `u32`/`usize` conversion barrier emerges.

/// Crate-wide result alias using [`LlamaError`].
pub type Result<T> = std::result::Result<T, LlamaError>;

/// Token ID type (`i32` for FFI compat; logically non-negative).
pub type TokenId = i32;

/// Top-level error type for all engine operations.
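///
/// Backends typically construct these by stringifying an underlying error,
/// e.g. `LlamaError::ModelLoad(e.to_string())`.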
#[derive(Debug, thiserror::Error)]
pub enum LlamaError {
    #[error("Model loading failed: {0}")]
    ModelLoad(String),
    #[error("Tokenization failed: {0}")]
    Tokenization(String),
    #[error("Inference failed: {0}")]
    Inference(String),
}

/// Specification for loading a model.
pub struct ModelSpec {
    /// Filesystem path to the model file on disk.
    pub path: String,
    /// Maximum context length, in tokens.
    pub context_size: usize,
}

/// Opaque handle to a loaded model.
///
/// No public fields — backends use their own representation (pointers, indices, etc.).
/// This is intentional so application code does not depend on backend internals.
pub struct ModelHandle;

/// Represents an active inference session with its own KV cache state.
///
/// Sessions hold runtime state (KV cache, token history, etc.) that persists
/// across prefill and decode phases. Multiple sessions can exist simultaneously,
/// each with its own independent state.
///
/// Sessions are intentionally not `Clone` — cloning would imply duplicating
/// KV cache state, which is not a cheap or well-defined operation.
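///
/// A minimal usage sketch:
///
/// ```
/// use llama_engine::Session;
///
/// let session = Session::new();
/// // Reconstruct a session key explicitly, e.g. for testing or replay.
/// let replayed = Session::with_id(session.id());
/// assert_eq!(session.id(), replayed.id());
/// ```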
#[derive(Debug)]
pub struct Session {
    /// Unique session ID for tracking and logging (private so backends can
    /// rely on it as a stable key).
    id: uuid::Uuid,
}

impl Session {
    /// Create a new inference session with a random UUID.
    pub fn new() -> Self {
        Self {
            id: uuid::Uuid::new_v4(),
        }
    }

    /// Create a session with an explicit ID (useful for testing/replay).
    pub fn with_id(id: uuid::Uuid) -> Self {
        Self { id }
    }

    /// Return the unique session ID for tracking and logging.
    pub fn id(&self) -> uuid::Uuid {
        self.id
    }
}

impl Default for Session {
    fn default() -> Self {
        Self::new()
    }
}

/// Result of the prefill phase (prompt processing).
#[derive(Debug, Clone)]
#[must_use]
pub struct PrefillResult {
    /// Number of tokens processed.
    pub tokens_processed: usize,
}

/// Result of a single decode step.
#[derive(Debug, Clone)]
#[must_use]
pub struct DecodeResult {
    /// The decoded token.
    pub token: TokenId,
}

/// The core engine trait — everything else plugs into this.
///
/// Implementations provide inference, tokenization, and embedding functionality.
/// oxidizedRAG and oxidizedgraph depend on *engine behavior*, not implementation
/// details. Swap CPU/Metal/FFI backends without changing application code.
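///
/// A typical generation loop, sketched against a hypothetical `engine` value
/// implementing this trait (`prompt` and `max_new_tokens` are assumed inputs):
///
/// ```ignore
/// let mut session = Session::new();
/// engine.prefill(&mut session, &engine.tokenize(prompt)?)?;
///
/// let mut out = Vec::new();
/// for _ in 0..max_new_tokens {
///     // Each decode step yields exactly one new token.
///     out.push(engine.decode(&mut session)?.token);
/// }
/// let text = engine.detokenize(&out)?;
/// ```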
pub trait LlamaEngine: Send + Sync {
    /// Load a model from disk given a specification.
    fn load_model(&self, spec: &ModelSpec) -> Result<ModelHandle>;

    /// Convert text into a sequence of token IDs.
    fn tokenize(&self, text: &str) -> Result<Vec<TokenId>>;

    /// Convert token IDs back into text.
    fn detokenize(&self, tokens: &[TokenId]) -> Result<String>;

    /// Run the prefill phase: process prompt tokens and populate the KV cache.
    fn prefill(&self, session: &mut Session, tokens: &[TokenId]) -> Result<PrefillResult>;

    /// Run the decode phase: produce the next token from the model.
    fn decode(&self, session: &mut Session) -> Result<DecodeResult>;

    /// Generate embeddings for a batch of texts (for oxidizedRAG integration).
    fn embed(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>>;
}
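
// A test-only mock backend sketch: `EchoEngine` is hypothetical and exists
// purely to exercise the trait surface end to end (tokenize -> prefill ->
// decode -> detokenize). It is not a real inference backend.
#[cfg(test)]
mod tests {
    use super::*;

    /// "Tokenizes" UTF-8 bytes to their numeric values and always decodes `'!'`.
    struct EchoEngine;

    impl LlamaEngine for EchoEngine {
        fn load_model(&self, _spec: &ModelSpec) -> Result<ModelHandle> {
            Ok(ModelHandle)
        }

        fn tokenize(&self, text: &str) -> Result<Vec<TokenId>> {
            Ok(text.bytes().map(TokenId::from).collect())
        }

        fn detokenize(&self, tokens: &[TokenId]) -> Result<String> {
            let bytes: Vec<u8> = tokens.iter().map(|&t| t as u8).collect();
            String::from_utf8(bytes).map_err(|e| LlamaError::Tokenization(e.to_string()))
        }

        fn prefill(&self, _session: &mut Session, tokens: &[TokenId]) -> Result<PrefillResult> {
            Ok(PrefillResult {
                tokens_processed: tokens.len(),
            })
        }

        fn decode(&self, _session: &mut Session) -> Result<DecodeResult> {
            Ok(DecodeResult {
                token: TokenId::from(b'!'),
            })
        }

        fn embed(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
            Ok(texts.iter().map(|_| vec![0.0; 4]).collect())
        }
    }

    #[test]
    fn prefill_then_decode_round_trip() {
        let engine = EchoEngine;
        let _model = engine
            .load_model(&ModelSpec {
                path: "dummy.bin".into(), // placeholder path; no file is read
                context_size: 2048,
            })
            .unwrap();

        let mut session = Session::new();
        let prompt = engine.tokenize("hi").unwrap();
        assert_eq!(
            engine.prefill(&mut session, &prompt).unwrap().tokens_processed,
            2
        );

        // One decode step per generated token, as a generation loop would do.
        let step = engine.decode(&mut session).unwrap();
        assert_eq!(engine.detokenize(&[step.token]).unwrap(), "!");
    }
}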