llama_engine/lib.rs
//! # llama-engine
//!
//! The "narrow waist" of the llama.rs stack. Defines the core [`LlamaEngine`] trait
//! and associated types that all other crates depend on. Implementations can swap
//! CPU/Metal/FFI backends without changing application code.
//!
//! ## Design Notes
//!
//! ### Interior Mutability
//! `LlamaEngine` methods take `&self` (not `&mut self`) so that a single engine can
//! be shared across multiple sessions and run concurrent inference without exclusive
//! borrows or external synchronization at call sites. Backends that hold shared
//! mutable state are responsible for their own internal synchronization (e.g.,
//! `Mutex` or `RwLock`) to keep that state thread-safe.
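//!
//! A minimal sketch of the pattern (the `CpuBackend` and `KvCache` types are
//! hypothetical, not part of this crate; marked `ignore` because it is incomplete):
//!
//! ```ignore
//! struct CpuBackend {
//!     // Shared mutable state sits behind a lock so trait methods can take `&self`.
//!     kv_caches: std::sync::Mutex<std::collections::HashMap<uuid::Uuid, KvCache>>,
//! }
//!
//! impl LlamaEngine for CpuBackend {
//!     fn decode(&self, session: &mut Session) -> Result<DecodeResult> {
//!         let mut caches = self.kv_caches.lock().expect("engine lock poisoned");
//!         let cache = caches.get_mut(&session.id()); // keyed by the stable session ID
//!         // ... advance the model one step against `cache` ...
//!     }
//!     // ... remaining methods elided ...
//! }
//! ```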
//!
//! ### Token Type
//! `TokenId` is aliased as `i32` for FFI compatibility, though token IDs are logically
//! non-negative. This will be reconsidered if a `u32`/`usize` conversion barrier emerges.
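//!
//! A checked-cast sketch for using a `TokenId` as an index (purely illustrative):
//!
//! ```
//! # use llama_engine::TokenId;
//! let tok: TokenId = 42;
//! let idx = usize::try_from(tok).expect("token IDs are logically non-negative");
//! assert_eq!(idx, 42);
//! ```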

pub type Result<T> = std::result::Result<T, LlamaError>;

/// Token ID type (i32 for FFI compat; logically non-negative).
pub type TokenId = i32;

/// Top-level error type for all engine operations.
#[derive(Debug, thiserror::Error)]
pub enum LlamaError {
    #[error("Model loading failed: {0}")]
    ModelLoad(String),
    #[error("Tokenization failed: {0}")]
    Tokenization(String),
    #[error("Inference failed: {0}")]
    Inference(String),
}

/// Specification for loading a model.
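///
/// A construction sketch (the model path is hypothetical):
///
/// ```
/// use llama_engine::ModelSpec;
///
/// let spec = ModelSpec {
///     path: "models/7b-q4.gguf".to_string(),
///     context_size: 4096,
/// };
/// assert_eq!(spec.context_size, 4096);
/// ```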
pub struct ModelSpec {
    pub path: String,
    pub context_size: usize,
}

/// Opaque handle to a loaded model.
///
/// Intentionally exposes no public fields: backends attach their own representation
/// (pointers, indices, etc.) behind it, so application code never depends on backend
/// internals. It is currently a zero-sized placeholder.
pub struct ModelHandle;

/// Represents an active inference session with its own KV cache state.
///
/// Sessions hold runtime state (KV cache, token history, etc.) that persists
/// across prefill and decode phases. Multiple sessions can exist simultaneously,
/// each with its own independent state.
///
/// Sessions are intentionally not `Clone`: cloning would imply duplicating
/// KV cache state, which is neither a cheap nor a well-defined operation.
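///
/// # Examples
///
/// ```
/// use llama_engine::Session;
///
/// // Each new session gets its own random ID.
/// let a = Session::new();
/// let b = Session::new();
/// assert_ne!(a.id(), b.id());
///
/// // Tests and replay can pin an explicit ID instead.
/// let replay = Session::with_id(a.id());
/// assert_eq!(replay.id(), a.id());
/// ```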
#[derive(Debug)]
pub struct Session {
    /// Unique session ID for tracking and logging (private so backends can rely
    /// on it as a stable key).
    id: uuid::Uuid,
}

impl Session {
    /// Create a new inference session with a random UUID.
    pub fn new() -> Self {
        Self {
            id: uuid::Uuid::new_v4(),
        }
    }

    /// Create a session with an explicit ID (useful for testing/replay).
    pub fn with_id(id: uuid::Uuid) -> Self {
        Self { id }
    }

    /// Return the unique session ID for tracking and logging.
    pub fn id(&self) -> uuid::Uuid {
        self.id
    }
}

impl Default for Session {
    fn default() -> Self {
        Self::new()
    }
}

/// Result of the prefill phase (prompt processing).
#[derive(Debug, Clone)]
#[must_use]
pub struct PrefillResult {
    /// Number of tokens processed.
    pub tokens_processed: usize,
}

/// Result of a single decode step.
#[derive(Debug, Clone)]
#[must_use]
pub struct DecodeResult {
    /// The decoded token.
    pub token: TokenId,
}

/// The core engine trait: everything else plugs into this.
///
/// Implementations provide inference, tokenization, and embedding functionality.
/// oxidizedRAG and oxidizedgraph depend on *engine behavior*, not implementation
/// details. Swap CPU/Metal/FFI backends without changing application code.
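///
/// # Examples
///
/// A generation-loop sketch over any backend (the fixed-length stop condition is
/// an assumption; marked `ignore` because this crate ships no backend to run):
///
/// ```ignore
/// use llama_engine::{LlamaEngine, Result, Session, TokenId};
///
/// fn generate(engine: &dyn LlamaEngine, prompt: &str, max_tokens: usize) -> Result<String> {
///     let prompt_tokens = engine.tokenize(prompt)?;
///     let mut session = Session::new();
///     // Prefill populates the session's KV cache with the prompt.
///     let _prefill = engine.prefill(&mut session, &prompt_tokens)?;
///
///     // Decode one token at a time until the budget is exhausted.
///     let mut output: Vec<TokenId> = Vec::new();
///     for _ in 0..max_tokens {
///         output.push(engine.decode(&mut session)?.token);
///     }
///     engine.detokenize(&output)
/// }
/// ```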
pub trait LlamaEngine: Send + Sync {
    /// Load a model from disk given a specification.
    fn load_model(&self, spec: &ModelSpec) -> Result<ModelHandle>;

    /// Convert text into a sequence of token IDs.
    fn tokenize(&self, text: &str) -> Result<Vec<TokenId>>;

    /// Convert token IDs back into text.
    fn detokenize(&self, tokens: &[TokenId]) -> Result<String>;

    /// Run the prefill phase: process prompt tokens and populate the KV cache.
    fn prefill(&self, session: &mut Session, tokens: &[TokenId]) -> Result<PrefillResult>;

    /// Run the decode phase: produce the next token from the model.
    fn decode(&self, session: &mut Session) -> Result<DecodeResult>;

    /// Generate embeddings for a batch of texts (for oxidizedRAG integration).
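    ///
    /// A usage sketch (the `engine` value is an assumption; marked `ignore`
    /// because this crate ships no backend):
    ///
    /// ```ignore
    /// let vectors = engine.embed(&["first chunk", "second chunk"])?;
    /// assert_eq!(vectors.len(), 2); // one embedding per input text
    /// ```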
    fn embed(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>>;
}
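
// A hedged sketch, not a real backend: `EchoEngine` is a trivial test double that
// shows the minimum surface a backend must cover and exercises the trait end to end.
#[cfg(test)]
mod tests {
    use super::*;

    struct EchoEngine;

    impl LlamaEngine for EchoEngine {
        fn load_model(&self, _spec: &ModelSpec) -> Result<ModelHandle> {
            Ok(ModelHandle)
        }

        fn tokenize(&self, text: &str) -> Result<Vec<TokenId>> {
            // Bytes stand in for tokens: deterministic and trivially reversible.
            Ok(text.bytes().map(TokenId::from).collect())
        }

        fn detokenize(&self, tokens: &[TokenId]) -> Result<String> {
            let bytes = tokens
                .iter()
                .map(|&t| u8::try_from(t).map_err(|e| LlamaError::Tokenization(e.to_string())))
                .collect::<Result<Vec<u8>>>()?;
            String::from_utf8(bytes).map_err(|e| LlamaError::Tokenization(e.to_string()))
        }

        fn prefill(&self, _session: &mut Session, tokens: &[TokenId]) -> Result<PrefillResult> {
            Ok(PrefillResult {
                tokens_processed: tokens.len(),
            })
        }

        fn decode(&self, _session: &mut Session) -> Result<DecodeResult> {
            Ok(DecodeResult { token: 0 })
        }

        fn embed(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
            // One placeholder vector per input text.
            Ok(texts.iter().map(|t| vec![t.len() as f32]).collect())
        }
    }

    #[test]
    fn tokenize_round_trips_through_detokenize() {
        let engine = EchoEngine;
        let tokens = engine.tokenize("hello").unwrap();
        assert_eq!(engine.detokenize(&tokens).unwrap(), "hello");
    }
}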