// llama_gguf — crate root (src/lib.rs)
//! llama-rs: A Rust implementation of llama.cpp
//!
//! High-performance LLM inference engine with support for GGUF and ONNX models.
//!
//! # Features
//!
//! - Full GGUF file format support (v1, v2, v3)
//! - All quantization formats (Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, K-quants)
//! - Memory-mapped model loading
//! - CPU backend with SIMD and parallel operations
//! - LLaMA model architecture support
//!
//! # Example
//!
//! ```no_run
//! use llama_gguf::{GgufFile, default_backend};
//!
//! // Load a GGUF model
//! let file = GgufFile::open("model.gguf").unwrap();
//! println!("Model architecture: {:?}", file.data.get_string("general.architecture"));
//!
//! // Get the default backend
//! let backend = default_backend();
//! println!("Using backend: {}", backend.name());
//! ```
27pub mod backend;
28#[cfg(feature = "client")]
29pub mod client;
30pub mod config;
31pub mod engine;
32#[cfg(feature = "server")]
33pub mod engine_batched;
34pub mod gguf;
35#[cfg(feature = "huggingface")]
36pub mod huggingface;
37pub mod model;
38#[cfg(feature = "onnx")]
39pub mod onnx;
40#[cfg(feature = "distributed")]
41pub mod distributed;
42pub mod rag;
43pub mod sampling;
44#[cfg(feature = "server")]
45pub mod server;
46pub mod tensor;
47pub mod tokenizer;
49// Re-export main types
50pub use config::{Config, ConfigError};
51pub use engine::{ChatEngine, ChatTemplate, Engine, EngineConfig, EngineError};
52pub use backend::{default_backend, Backend, BackendError};
53pub use backend::tensor_parallel::{
54    ShardingPlan, SingleDeviceTP, TPConfig, TensorParallel, merge_shards, shard_weight,
55};
56pub use gguf::{
57    GgufBuilder, GgufData, GgufFile, GgufReader, GgufWriter, TensorToWrite,
58    QuantizeOptions, QuantizeStats, quantize_model,
59};
60pub use model::{
61    Architecture, InferenceContext, KVCache, LlamaModel, Model, ModelConfig, ModelError,
62    ModelLoader, load_llama_model,
63    // DeltaNet / SSM
64    AttentionLayer, DeltaNetConfig, DeltaNetLayer, DeltaNetState, RecurrentState,
65    // LoRA
66    LoraAdapter, LoraAdapters, LoraConfig,
67    // MoE
68    MoeConfig, MoeExpert, MoeLayer, MoeRouter, MoeStats,
69    // Speculative decoding
70    SpeculativeConfig, SpeculativeDecoder, SpeculativeMode, SpeculativeStats,
71    // Embeddings
72    EmbeddingConfig, EmbeddingError, EmbeddingExtractor, PoolingStrategy, TruncationStrategy,
73    cosine_similarity, dot_product, euclidean_distance, find_nearest,
74    // Prompt cache
75    CachedPrefix, PrefixId, PrefixSharing, PromptCache, PromptCacheConfig, PromptCacheStats,
76    // KV cache quantization
77    KVCacheFormat, QuantizedKVCache,
78    // Paged attention
79    BlockId, BlockTable, PageAllocator, PagedKVPool, PagedSequence, DEFAULT_BLOCK_SIZE,
80};
81pub use sampling::{
82    Grammar, GrammarSampler, GbnfGrammar, JsonGrammar, RegexGrammar,
83    MirostatConfig, Sampler, SamplerConfig,
84};
85pub use tensor::{DType, Tensor, TensorError, TensorStorage};
86pub use tokenizer::{Tokenizer, TokenizerError};
87#[cfg(feature = "huggingface")]
88pub use huggingface::{HfClient, HfError, HfFileInfo, format_bytes};
89#[cfg(feature = "onnx")]
90pub use onnx::{HfConfig, OnnxError, OnnxFile, OnnxMetadata, OnnxModelLoader, OnnxTensorInfo};
91#[cfg(feature = "rag")]
92pub use rag::{
93    RagConfig, RagStore, RagError, RagResult, Document, NewDocument, RagContextBuilder, TextChunker,
94    // Config types
95    IndexType, SearchType, DistanceMetric, DatabaseConfig, EmbeddingsConfig, SearchConfig,
96    // Knowledge base
97    KnowledgeBase, KnowledgeBaseBuilder, KnowledgeBaseConfig, DataSource, ChunkingStrategy,
98    RetrievalConfig, RetrievalResponse, RetrieveAndGenerateResponse, RetrievedChunk,
99    Citation, SourceLocation, IngestionResult,
100    // Embeddings
101    EmbeddingGenerator,
102    // Metadata filtering
103    MetadataFilter,
104};
105
106#[cfg(feature = "rag-sqlite")]
107pub use rag::{
108    SqliteStore, SqliteConfig, SqliteDocument, SqliteNewDocument, SqliteMetadataFilter,
109    SqliteDistanceMetric,
110};
111
112#[cfg(all(feature = "rag-sqlite", not(feature = "rag")))]
113pub use rag::{RagError, RagResult};
114
115#[cfg(feature = "server")]
116pub use engine_batched::{
117    BatchFinishReason, BatchRequest, BatchToken, BatchedEngine, BatchedEngineConfig,
118};
119
120#[cfg(feature = "distributed")]
121pub use distributed::{
122    ClusterConfig, Coordinator, DistributedError, DistributedModel, DistributedResult,
123    PipelineExecutor, ShardServer, ShardSpec,
124};
126/// Library-wide error type
127#[derive(thiserror::Error, Debug)]
128pub enum Error {
129    #[error("IO error: {0}")]
130    Io(#[from] std::io::Error),
131
132    #[error("GGUF error: {0}")]
133    Gguf(#[from] gguf::GgufError),
134
135    #[error("Tensor error: {0}")]
136    Tensor(#[from] tensor::TensorError),
137
138    #[error("Backend error: {0}")]
139    Backend(#[from] backend::BackendError),
140}
141
142pub type Result<T> = std::result::Result<T, Error>;