1#![deny(missing_docs)]
68#![deny(clippy::all)]
69#![allow(clippy::disallowed_methods)] #![allow(clippy::return_self_not_must_use)]
71#![warn(clippy::pedantic)]
72#![allow(clippy::missing_errors_doc)]
73#![allow(clippy::missing_panics_doc)]
74#![allow(clippy::module_name_repetitions)]
75#![allow(clippy::cast_precision_loss)]
76#![allow(clippy::cast_possible_truncation)]
77#![allow(clippy::doc_markdown)]
78#![allow(clippy::map_unwrap_or)]
79#![allow(clippy::redundant_closure_for_method_calls)]
80#![allow(clippy::unnecessary_literal_bound)]
81#![allow(clippy::cloned_instead_of_copied)]
82#![allow(clippy::must_use_candidate)]
83#![allow(clippy::assigning_clones)]
84#![allow(clippy::manual_div_ceil)]
85#![allow(clippy::unnecessary_map_or)]
86#![allow(clippy::derivable_impls)]
87#[macro_use]
88#[allow(unused_macros)]
89mod generated_contracts;
90pub mod chunk;
91#[cfg(feature = "compression")]
92pub mod compressed;
93pub mod embed;
94pub mod error;
95#[cfg(feature = "eval")]
96pub mod eval;
97pub mod fusion;
98pub mod index;
99pub mod loader;
100pub mod media;
101pub mod metrics;
102#[cfg(feature = "multivector")]
103pub mod multivector;
104pub mod pipeline;
105pub mod preprocess;
106pub mod rerank;
107pub mod retrieve;
108#[cfg(feature = "sqlite")]
109#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss, clippy::cast_possible_truncation)]
110pub mod sqlite;
111pub use chunk::{
112 Chunk, ChunkId, ChunkMetadata, Chunker, ChunkingStrategy, FixedSizeChunker, ParagraphChunker,
113 RecursiveChunker, SemanticChunker, SentenceChunker, StructuralChunker, TimestampChunker,
114};
115#[cfg(feature = "compression")]
116pub use compressed::Compression;
117pub use embed::{Embedder, EmbeddingConfig, PoolingStrategy};
118#[cfg(feature = "embeddings")]
119pub use embed::{EmbeddingModelType, FastEmbedder};
120pub use error::{Error, Result};
121pub use fusion::FusionStrategy;
122pub use index::{BM25Index, SparseIndex, VectorStore};
123#[cfg(feature = "transcription")]
124pub use loader::transcription::{TranscriptionBackend, TranscriptionConfig, TranscriptionLoader};
125#[cfg(feature = "ocr")]
126pub use loader::ImageLoader;
127pub use loader::{DocumentLoader, LoaderRegistry, SubtitleLoader, TextLoader};
128pub use media::{parse_subtitles, SubtitleCue, SubtitleFormat, SubtitleTrack};
129pub use metrics::{AggregatedMetrics, RetrievalMetrics};
130#[cfg(feature = "multivector")]
131pub use multivector::{
132 exact_maxsim, MockMultiVectorEmbedder, MultiVectorEmbedder, MultiVectorEmbedding,
133 ResidualCodec, WarpIndex, WarpIndexConfig, WarpSearchConfig,
134};
135pub use pipeline::{ContextAssembler, RagPipeline};
136pub use rerank::Reranker;
137#[cfg(feature = "multivector")]
138pub use retrieve::MultiVectorRetriever;
139pub use retrieve::{HybridRetriever, RetrievalResult};
140#[cfg(feature = "sqlite")]
141pub use sqlite::{SqliteIndex, SqliteStore};
142#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
144pub struct DocumentId(pub uuid::Uuid);
145impl DocumentId {
146 #[must_use]
148 pub fn new() -> Self {
149 Self(uuid::Uuid::new_v4())
150 }
151}
152impl Default for DocumentId {
153 fn default() -> Self {
154 Self::new()
155 }
156}
157impl std::fmt::Display for DocumentId {
158 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
159 write!(f, "{}", self.0)
160 }
161}
162#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
164pub struct Document {
165 pub id: DocumentId,
167 pub content: String,
169 pub title: Option<String>,
171 pub source: Option<String>,
173 pub metadata: std::collections::HashMap<String, serde_json::Value>,
175}
176impl Document {
177 #[must_use]
179 pub fn new(content: impl Into<String>) -> Self {
180 Self {
181 id: DocumentId::new(),
182 content: content.into(),
183 title: None,
184 source: None,
185 metadata: std::collections::HashMap::new(),
186 }
187 }
188 #[must_use]
190 pub fn with_title(mut self, title: impl Into<String>) -> Self {
191 self.title = Some(title.into());
192 self
193 }
194 #[must_use]
196 pub fn with_source(mut self, source: impl Into<String>) -> Self {
197 self.source = Some(source.into());
198 self
199 }
200}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two independently generated identifiers must differ.
    #[test]
    fn test_document_id_unique() {
        let first = DocumentId::new();
        let second = DocumentId::new();
        assert_ne!(first, second);
    }

    /// A new document keeps its content and has no title or source.
    #[test]
    fn test_document_creation() {
        let created = Document::new("Hello, world!");
        assert_eq!(created.content, "Hello, world!");
        assert!(created.title.is_none());
        assert!(created.source.is_none());
    }

    /// The builder methods set the title and source and keep the content.
    #[test]
    fn test_document_builder() {
        let built = Document::new("Content")
            .with_title("Test Title")
            .with_source("https://example.com");
        assert_eq!(built.content, "Content");
        assert_eq!(built.title.as_deref(), Some("Test Title"));
        assert_eq!(built.source.as_deref(), Some("https://example.com"));
    }

    /// The `Display` output is non-empty and contains a hyphen.
    #[test]
    fn test_document_id_display() {
        let rendered = DocumentId::new().to_string();
        assert!(!rendered.is_empty());
        assert!(rendered.contains('-'));
    }

    /// An identifier survives a JSON round trip unchanged.
    #[test]
    fn test_document_id_serialization() {
        let original = DocumentId::new();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: DocumentId = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}