1pub mod backend;
28#[cfg(feature = "client")]
29pub mod client;
30pub mod config;
31pub mod engine;
32#[cfg(feature = "server")]
33pub mod engine_batched;
34pub mod gguf;
35#[cfg(feature = "huggingface")]
36pub mod huggingface;
37pub mod model;
38#[cfg(feature = "onnx")]
39pub mod onnx;
40#[cfg(feature = "distributed")]
41pub mod distributed;
42pub mod rag;
43pub mod sampling;
44#[cfg(feature = "server")]
45pub mod server;
46pub mod tensor;
47pub mod tokenizer;
48
49pub use config::{Config, ConfigError};
51pub use engine::{ChatEngine, ChatTemplate, Engine, EngineConfig, EngineError};
52pub use backend::{default_backend, Backend, BackendError};
53pub use backend::tensor_parallel::{
54 ShardingPlan, SingleDeviceTP, TPConfig, TensorParallel, merge_shards, shard_weight,
55};
56pub use gguf::{
57 GgufBuilder, GgufData, GgufFile, GgufReader, GgufWriter, TensorToWrite,
58 QuantizeOptions, QuantizeStats, quantize_model,
59};
60pub use model::{
61 Architecture, InferenceContext, KVCache, LlamaModel, Model, ModelConfig, ModelError,
62 ModelLoader, load_llama_model,
63 AttentionLayer, DeltaNetConfig, DeltaNetLayer, DeltaNetState, RecurrentState,
65 LoraAdapter, LoraAdapters, LoraConfig,
67 MoeConfig, MoeExpert, MoeLayer, MoeRouter, MoeStats,
69 SpeculativeConfig, SpeculativeDecoder, SpeculativeMode, SpeculativeStats,
71 EmbeddingConfig, EmbeddingError, EmbeddingExtractor, PoolingStrategy, TruncationStrategy,
73 cosine_similarity, dot_product, euclidean_distance, find_nearest,
74 CachedPrefix, PrefixId, PrefixSharing, PromptCache, PromptCacheConfig, PromptCacheStats,
76 KVCacheFormat, QuantizedKVCache,
78 BlockId, BlockTable, PageAllocator, PagedKVPool, PagedSequence, DEFAULT_BLOCK_SIZE,
80};
81pub use sampling::{
82 Grammar, GrammarSampler, GbnfGrammar, JsonGrammar, RegexGrammar,
83 MirostatConfig, Sampler, SamplerConfig,
84};
85pub use tensor::{DType, Tensor, TensorError, TensorStorage};
86pub use tokenizer::{Tokenizer, TokenizerError};
87#[cfg(feature = "huggingface")]
88pub use huggingface::{HfClient, HfError, HfFileInfo, format_bytes};
89#[cfg(feature = "onnx")]
90pub use onnx::{HfConfig, OnnxError, OnnxFile, OnnxMetadata, OnnxModelLoader, OnnxTensorInfo};
91#[cfg(feature = "rag")]
92pub use rag::{
93 RagConfig, RagStore, RagError, RagResult, Document, NewDocument, RagContextBuilder, TextChunker,
94 IndexType, SearchType, DistanceMetric, DatabaseConfig, EmbeddingsConfig, SearchConfig,
96 KnowledgeBase, KnowledgeBaseBuilder, KnowledgeBaseConfig, DataSource, ChunkingStrategy,
98 RetrievalConfig, RetrievalResponse, RetrieveAndGenerateResponse, RetrievedChunk,
99 Citation, SourceLocation, IngestionResult,
100 EmbeddingGenerator,
102 MetadataFilter,
104};
105
106#[cfg(feature = "rag-sqlite")]
107pub use rag::{
108 SqliteStore, SqliteConfig, SqliteDocument, SqliteNewDocument, SqliteMetadataFilter,
109 SqliteDistanceMetric,
110};
111
112#[cfg(all(feature = "rag-sqlite", not(feature = "rag")))]
113pub use rag::{RagError, RagResult};
114
115#[cfg(feature = "server")]
116pub use engine_batched::{
117 BatchFinishReason, BatchRequest, BatchToken, BatchedEngine, BatchedEngineConfig,
118};
119
120#[cfg(feature = "distributed")]
121pub use distributed::{
122 ClusterConfig, Coordinator, DistributedError, DistributedModel, DistributedResult,
123 PipelineExecutor, ShardServer, ShardSpec,
124};
125
/// Top-level error type for the crate, aggregating failures from the core
/// subsystems (I/O, GGUF parsing, tensor operations, and compute backends).
///
/// Each variant uses `#[from]`, so errors from those modules convert
/// automatically via the `?` operator in functions returning [`Result`].
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// Underlying I/O failure (file reads/writes, streams).
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Failure while reading, writing, or quantizing a GGUF model file.
    #[error("GGUF error: {0}")]
    Gguf(#[from] gguf::GgufError),

    /// Failure in a tensor operation (shape/dtype mismatch, etc. — see `tensor`).
    #[error("Tensor error: {0}")]
    Tensor(#[from] tensor::TensorError),

    /// Failure reported by a compute backend (see `backend`).
    #[error("Backend error: {0}")]
    Backend(#[from] backend::BackendError),
}
141
/// Convenience alias: a [`std::result::Result`] specialized to this crate's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;