1#![deny(missing_docs)]
69#![deny(clippy::all)]
70#![warn(clippy::pedantic)]
71#![allow(clippy::missing_errors_doc)]
72#![allow(clippy::missing_panics_doc)]
73#![allow(clippy::module_name_repetitions)]
74#![allow(clippy::cast_precision_loss)]
75#![allow(clippy::cast_possible_truncation)]
76#![allow(clippy::doc_markdown)]
77#![allow(clippy::map_unwrap_or)]
78#![allow(clippy::redundant_closure_for_method_calls)]
79#![allow(clippy::unnecessary_literal_bound)]
80#![allow(clippy::cloned_instead_of_copied)]
81#![allow(clippy::must_use_candidate)]
82#![allow(clippy::assigning_clones)]
83#![allow(clippy::manual_div_ceil)]
84#![allow(clippy::unnecessary_map_or)]
85#![allow(clippy::derivable_impls)]
86
87pub mod chunk;
88#[cfg(feature = "compression")]
89pub mod compressed;
90pub mod embed;
91pub mod error;
92#[cfg(feature = "eval")]
93pub mod eval;
94pub mod fusion;
95pub mod index;
96pub mod loader;
97pub mod media;
98pub mod metrics;
99#[cfg(feature = "multivector")]
100pub mod multivector;
101pub mod pipeline;
102pub mod preprocess;
103pub mod rerank;
104pub mod retrieve;
105#[cfg(feature = "sqlite")]
106#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss, clippy::cast_possible_truncation)]
107pub mod sqlite;
108
109pub use chunk::{
110 Chunk, ChunkId, ChunkMetadata, Chunker, ChunkingStrategy, FixedSizeChunker, ParagraphChunker,
111 RecursiveChunker, SemanticChunker, SentenceChunker, StructuralChunker, TimestampChunker,
112};
113#[cfg(feature = "compression")]
114pub use compressed::Compression;
115pub use embed::{Embedder, EmbeddingConfig, PoolingStrategy};
116#[cfg(feature = "embeddings")]
117pub use embed::{EmbeddingModelType, FastEmbedder};
118pub use error::{Error, Result};
119pub use fusion::FusionStrategy;
120pub use index::{BM25Index, SparseIndex, VectorStore};
121#[cfg(feature = "transcription")]
122pub use loader::transcription::{TranscriptionBackend, TranscriptionConfig, TranscriptionLoader};
123#[cfg(feature = "ocr")]
124pub use loader::ImageLoader;
125pub use loader::{DocumentLoader, LoaderRegistry, SubtitleLoader, TextLoader};
126pub use media::{parse_subtitles, SubtitleCue, SubtitleFormat, SubtitleTrack};
127pub use metrics::{AggregatedMetrics, RetrievalMetrics};
128pub use pipeline::{ContextAssembler, RagPipeline};
129pub use rerank::Reranker;
130pub use retrieve::{HybridRetriever, RetrievalResult};
131
132#[cfg(feature = "sqlite")]
133pub use sqlite::{SqliteIndex, SqliteStore};
134
135#[cfg(feature = "multivector")]
136pub use multivector::{
137 exact_maxsim, MockMultiVectorEmbedder, MultiVectorEmbedder, MultiVectorEmbedding,
138 ResidualCodec, WarpIndex, WarpIndexConfig, WarpSearchConfig,
139};
140#[cfg(feature = "multivector")]
141pub use retrieve::MultiVectorRetriever;
142
143#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
145pub struct DocumentId(pub uuid::Uuid);
146
147impl DocumentId {
148 #[must_use]
150 pub fn new() -> Self {
151 Self(uuid::Uuid::new_v4())
152 }
153}
154
155impl Default for DocumentId {
156 fn default() -> Self {
157 Self::new()
158 }
159}
160
161impl std::fmt::Display for DocumentId {
162 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
163 write!(f, "{}", self.0)
164 }
165}
166
167#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
169pub struct Document {
170 pub id: DocumentId,
172 pub content: String,
174 pub title: Option<String>,
176 pub source: Option<String>,
178 pub metadata: std::collections::HashMap<String, serde_json::Value>,
180}
181
182impl Document {
183 #[must_use]
185 pub fn new(content: impl Into<String>) -> Self {
186 Self {
187 id: DocumentId::new(),
188 content: content.into(),
189 title: None,
190 source: None,
191 metadata: std::collections::HashMap::new(),
192 }
193 }
194
195 #[must_use]
197 pub fn with_title(mut self, title: impl Into<String>) -> Self {
198 self.title = Some(title.into());
199 self
200 }
201
202 #[must_use]
204 pub fn with_source(mut self, source: impl Into<String>) -> Self {
205 self.source = Some(source.into());
206 self
207 }
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    // Two independently generated identifiers must differ.
    #[test]
    fn test_document_id_unique() {
        let first = DocumentId::new();
        let second = DocumentId::new();
        assert_ne!(first, second);
    }

    // A freshly created document keeps its content and has no title or source.
    #[test]
    fn test_document_creation() {
        let document = Document::new("Hello, world!");
        assert_eq!(document.content, "Hello, world!");
        assert_eq!(document.title, None);
        assert_eq!(document.source, None);
    }

    // The builder methods set the title and source without touching the content.
    #[test]
    fn test_document_builder() {
        let document = Document::new("Content")
            .with_title("Test Title")
            .with_source("https://example.com");

        assert_eq!(document.content, "Content");
        assert_eq!(document.title.as_deref(), Some("Test Title"));
        assert_eq!(document.source.as_deref(), Some("https://example.com"));
    }

    // The displayed form is non-empty and contains at least one hyphen.
    #[test]
    fn test_document_id_display() {
        let rendered = DocumentId::new().to_string();
        assert!(!rendered.is_empty());
        assert!(rendered.contains('-'));
    }

    // An identifier survives a JSON serialize/deserialize round trip unchanged.
    #[test]
    fn test_document_id_serialization() {
        let original = DocumentId::new();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: DocumentId = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}