llama_rs/
lib.rs

1//! llama-rs: A Rust implementation of llama.cpp
2//!
3//! High-performance LLM inference engine with support for GGUF and ONNX models.
4//!
5//! # Features
6//!
7//! - Full GGUF file format support (v1, v2, v3)
8//! - All quantization formats (Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, K-quants)
9//! - Memory-mapped model loading
10//! - CPU backend with SIMD and parallel operations
11//! - LLaMA model architecture support
12//!
13//! # Example
14//!
15//! ```no_run
16//! use llama_rs::{GgufFile, default_backend};
17//!
18//! // Load a GGUF model
19//! let file = GgufFile::open("model.gguf").unwrap();
20//! println!("Model architecture: {:?}", file.data.get_string("general.architecture"));
21//!
22//! // Get the default backend
23//! let backend = default_backend();
24//! println!("Using backend: {}", backend.name());
25//! ```
26
27pub mod backend;
28#[cfg(feature = "client")]
29pub mod client;
30pub mod config;
31pub mod diagnostics;
32pub mod engine;
33#[cfg(feature = "server")]
34pub mod engine_batched;
35pub mod gguf;
36#[cfg(feature = "huggingface")]
37pub mod huggingface;
38pub mod model;
39#[cfg(feature = "onnx")]
40pub mod onnx;
41#[cfg(feature = "distributed")]
42pub mod distributed;
43#[cfg(feature = "council")]
44pub mod council;
45pub mod rag;
46pub mod safetensors;
47pub mod sampling;
48#[cfg(feature = "server")]
49pub mod server;
50pub mod tensor;
51pub mod tokenizer;
52
53// Re-export main types
54pub use config::{Config, ConfigError};
55pub use engine::{ChatEngine, ChatTemplate, Engine, EngineConfig, EngineError};
56pub use backend::{default_backend, Backend, BackendError};
57pub use backend::tensor_parallel::{
58    ShardingPlan, SingleDeviceTP, TPConfig, TensorParallel, merge_shards, shard_weight,
59};
60pub use gguf::{
61    GgufBuilder, GgufData, GgufFile, GgufReader, GgufWriter, TensorToWrite,
62    QuantizeOptions, QuantizeStats, quantize_model,
63};
64pub use model::{
65    Architecture, InferenceContext, KVCache, LlamaModel, Model, ModelConfig, ModelError,
66    ModelLoader, ModelSource, build_llama_model, load_llama_model,
67    // DeltaNet / SSM
68    AttentionLayer, DeltaNetConfig, DeltaNetLayer, DeltaNetState, RecurrentState,
69    // LoRA
70    LoraAdapter, LoraAdapters, LoraConfig,
71    // MoE
72    MoeConfig, MoeExpert, MoeLayer, MoeRouter, MoeStats,
73    // Speculative decoding
74    SpeculativeConfig, SpeculativeDecoder, SpeculativeMode, SpeculativeStats,
75    // Embeddings
76    EmbeddingConfig, EmbeddingError, EmbeddingExtractor, PoolingStrategy, TruncationStrategy,
77    cosine_similarity, dot_product, euclidean_distance, find_nearest,
78    // Prompt cache
79    CachedPrefix, PrefixId, PrefixSharing, PromptCache, PromptCacheConfig, PromptCacheStats,
80    // KV cache quantization
81    KVCacheFormat, QuantizedKVCache,
82    // Paged attention
83    BlockId, BlockTable, PageAllocator, PagedKVPool, PagedSequence, DEFAULT_BLOCK_SIZE,
84};
85pub use sampling::{
86    Grammar, GrammarSampler, GbnfGrammar, JsonGrammar, RegexGrammar,
87    MirostatConfig, Sampler, SamplerConfig,
88};
89pub use tensor::{DType, Tensor, TensorError, TensorStorage};
90pub use tokenizer::{Tokenizer, TokenizerError};
91#[cfg(feature = "huggingface")]
92pub use huggingface::{HfClient, HfError, HfFileInfo, RepoType, format_bytes};
93#[cfg(feature = "onnx")]
94pub use onnx::{HfConfig, OnnxError, OnnxFile, OnnxMetadata, OnnxModelLoader, OnnxTensorInfo};
95#[cfg(feature = "rag")]
96pub use rag::{
97    RagConfig, RagStore, RagError, RagResult, Document, NewDocument, RagContextBuilder, TextChunker,
98    // Config types
99    IndexType, SearchType, DistanceMetric, DatabaseConfig, EmbeddingsConfig, SearchConfig,
100    // Knowledge base
101    KnowledgeBase, KnowledgeBaseBuilder, KnowledgeBaseConfig, DataSource, ChunkingStrategy,
102    RetrievalConfig, RetrievalResponse, RetrieveAndGenerateResponse, RetrievedChunk,
103    Citation, SourceLocation, IngestionResult,
104    // Embeddings
105    EmbeddingGenerator,
106    // Metadata filtering
107    MetadataFilter,
108};
109
110#[cfg(feature = "rag-sqlite")]
111pub use rag::{
112    SqliteStore, SqliteConfig, SqliteDocument, SqliteNewDocument, SqliteMetadataFilter,
113    SqliteDistanceMetric,
114};
115
116#[cfg(all(feature = "rag-sqlite", not(feature = "rag")))]
117pub use rag::{RagError, RagResult};
118
119#[cfg(feature = "server")]
120pub use engine_batched::{
121    BatchFinishReason, BatchRequest, BatchToken, BatchedEngine, BatchedEngineConfig,
122};
123
124#[cfg(feature = "distributed")]
125pub use distributed::{
126    ClusterConfig, Coordinator, DistributedError, DistributedModel, DistributedResult,
127    PipelineExecutor, ShardServer, ShardSpec,
128};
129
130/// Library-wide error type
131#[derive(thiserror::Error, Debug)]
132pub enum Error {
133    #[error("IO error: {0}")]
134    Io(#[from] std::io::Error),
135
136    #[error("GGUF error: {0}")]
137    Gguf(#[from] gguf::GgufError),
138
139    #[error("Tensor error: {0}")]
140    Tensor(#[from] tensor::TensorError),
141
142    #[error("Backend error: {0}")]
143    Backend(#[from] backend::BackendError),
144}
145
146pub type Result<T> = std::result::Result<T, Error>;
llama_rs/lib.rs

llama_rs/
lib.rs