1#![deny(missing_docs)]
68#![deny(clippy::all)]
69#![allow(clippy::disallowed_methods)] #![allow(clippy::return_self_not_must_use)]
71#![warn(clippy::pedantic)]
72#![allow(clippy::missing_errors_doc)]
73#![allow(clippy::missing_panics_doc)]
74#![allow(clippy::module_name_repetitions)]
75#![allow(clippy::cast_precision_loss)]
76#![allow(clippy::cast_possible_truncation)]
77#![allow(clippy::doc_markdown)]
78#![allow(clippy::map_unwrap_or)]
79#![allow(clippy::redundant_closure_for_method_calls)]
80#![allow(clippy::unnecessary_literal_bound)]
81#![allow(clippy::cloned_instead_of_copied)]
82#![allow(clippy::must_use_candidate)]
83#![allow(clippy::assigning_clones)]
84#![allow(clippy::manual_div_ceil)]
85#![allow(clippy::unnecessary_map_or)]
86#![allow(clippy::derivable_impls)]
87#[macro_use]
88#[allow(unused_macros)]
89mod generated_contracts;
90pub mod chunk;
91#[cfg(feature = "compression")]
92pub mod compressed;
93pub mod embed;
94pub mod error;
95#[cfg(feature = "eval")]
96pub mod eval;
97pub mod fusion;
98pub mod index;
99pub mod loader;
100pub mod media;
101pub mod metrics;
102pub mod mmr;
103#[cfg(feature = "multivector")]
104pub mod multivector;
105pub mod pipeline;
106pub mod preprocess;
107pub mod rerank;
108pub mod retrieve;
109#[cfg(feature = "sqlite")]
110#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss, clippy::cast_possible_truncation)]
111pub mod sqlite;
112pub mod tokenizer;
113pub use chunk::{
114 Chunk, ChunkId, ChunkMetadata, Chunker, ChunkingStrategy, FixedSizeChunker, ParagraphChunker,
115 RecursiveChunker, SemanticChunker, SentenceChunker, StructuralChunker, TimestampChunker,
116};
117#[cfg(feature = "compression")]
118pub use compressed::Compression;
119pub use embed::{Embedder, EmbeddingConfig, PoolingStrategy};
120#[cfg(feature = "embeddings")]
121pub use embed::{EmbeddingModelType, FastEmbedder};
122pub use error::{Error, Result};
123pub use fusion::FusionStrategy;
124pub use index::{BM25Index, SparseIndex, VectorStore};
125#[cfg(feature = "transcription")]
126pub use loader::transcription::{TranscriptionBackend, TranscriptionConfig, TranscriptionLoader};
127#[cfg(feature = "ocr")]
128pub use loader::ImageLoader;
129pub use loader::{DocumentLoader, LoaderRegistry, SubtitleLoader, TextLoader};
130pub use media::{parse_subtitles, SubtitleCue, SubtitleFormat, SubtitleTrack};
131pub use metrics::{AggregatedMetrics, RetrievalMetrics};
132#[cfg(feature = "multivector")]
133pub use multivector::{
134 exact_maxsim, MockMultiVectorEmbedder, MultiVectorEmbedder, MultiVectorEmbedding,
135 ResidualCodec, WarpIndex, WarpIndexConfig, WarpSearchConfig,
136};
137pub use pipeline::{ContextAssembler, RagPipeline};
138pub use rerank::Reranker;
139#[cfg(feature = "multivector")]
140pub use retrieve::MultiVectorRetriever;
141pub use retrieve::{HybridRetriever, RetrievalResult};
142#[cfg(feature = "sqlite")]
143pub use sqlite::{SqliteIndex, SqliteStore};
144#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
146pub struct DocumentId(pub uuid::Uuid);
147impl DocumentId {
148 #[must_use]
150 pub fn new() -> Self {
151 Self(uuid::Uuid::new_v4())
152 }
153}
154impl Default for DocumentId {
155 fn default() -> Self {
156 Self::new()
157 }
158}
159impl std::fmt::Display for DocumentId {
160 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
161 write!(f, "{}", self.0)
162 }
163}
164#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
166pub struct Document {
167 pub id: DocumentId,
169 pub content: String,
171 pub title: Option<String>,
173 pub source: Option<String>,
175 pub metadata: std::collections::HashMap<String, serde_json::Value>,
177}
178impl Document {
179 #[must_use]
181 pub fn new(content: impl Into<String>) -> Self {
182 Self {
183 id: DocumentId::new(),
184 content: content.into(),
185 title: None,
186 source: None,
187 metadata: std::collections::HashMap::new(),
188 }
189 }
190 #[must_use]
192 pub fn with_title(mut self, title: impl Into<String>) -> Self {
193 self.title = Some(title.into());
194 self
195 }
196 #[must_use]
198 pub fn with_source(mut self, source: impl Into<String>) -> Self {
199 self.source = Some(source.into());
200 self
201 }
202}
#[cfg(test)]
mod tests {
    use super::*;

    // Two independently generated ids must not compare equal.
    #[test]
    fn test_document_id_unique() {
        let first = DocumentId::new();
        let second = DocumentId::new();
        assert_ne!(first, second);
    }

    // A freshly created document keeps its content and has no title or source.
    #[test]
    fn test_document_creation() {
        let doc = Document::new("Hello, world!");
        assert_eq!(doc.content, "Hello, world!");
        assert_eq!(doc.title, None);
        assert_eq!(doc.source, None);
    }

    // The builder methods fill in title and source without touching the content.
    #[test]
    fn test_document_builder() {
        let base = Document::new("Content");
        let doc = base.with_title("Test Title").with_source("https://example.com");
        assert_eq!(doc.content, "Content");
        assert_eq!(doc.title.as_deref(), Some("Test Title"));
        assert_eq!(doc.source.as_deref(), Some("https://example.com"));
    }

    // The displayed form is non-empty and contains a hyphen.
    #[test]
    fn test_document_id_display() {
        let id = DocumentId::new();
        let display = id.to_string();
        assert!(!display.is_empty());
        assert!(display.contains('-'));
    }

    // An id survives a JSON serialize/deserialize round trip unchanged.
    #[test]
    fn test_document_id_serialization() {
        let id = DocumentId::new();
        let encoded = serde_json::to_string(&id).unwrap();
        let decoded: DocumentId = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded, id);
    }
}