Skip to main content

trueno_rag/
lib.rs

1//! Trueno-RAG: Pure-Rust Retrieval-Augmented Generation Pipeline
2//!
3//! This crate provides a complete RAG stack built on Trueno compute primitives
4//! with zero Python/C++ dependencies.
5//!
6//! # Quick Start
7//!
8//! ```rust
9//! use trueno_rag::{
10//!     pipeline::RagPipelineBuilder,
11//!     chunk::RecursiveChunker,
12//!     embed::MockEmbedder,
13//!     rerank::NoOpReranker,
14//!     fusion::FusionStrategy,
15//!     Document,
16//! };
17//!
18//! // Build a RAG pipeline
19//! let mut pipeline = RagPipelineBuilder::new()
20//!     .chunker(RecursiveChunker::new(512, 50))
21//!     .embedder(MockEmbedder::new(384))
22//!     .reranker(NoOpReranker::new())
23//!     .fusion(FusionStrategy::RRF { k: 60.0 })
24//!     .build()
25//!     .unwrap();
26//!
27//! // Index a document
28//! let doc = Document::new("Machine learning enables computers to learn from data.")
29//!     .with_title("ML Intro");
30//! pipeline.index_document(&doc).unwrap();
31//!
32//! // Query the pipeline
33//! let results = pipeline.query("machine learning", 5).unwrap();
34//! assert!(!results.is_empty());
35//! ```
36//!
37//! # Chunking Strategies
38//!
39//! Multiple chunking strategies are available:
40//!
41//! - [`RecursiveChunker`] - Hierarchical splitting (default)
42//! - [`FixedSizeChunker`] - Character-based splitting
43//! - [`SentenceChunker`] - Sentence-boundary aware
44//! - [`ParagraphChunker`] - Paragraph grouping
45//! - [`SemanticChunker`] - Embedding similarity-based
46//! - [`StructuralChunker`] - Header/section-aware
47//! - [`TimestampChunker`] - Subtitle/transcript time-boundary aware
48//!
49//! # Fusion Strategies
50//!
51//! Combine dense and sparse retrieval results:
52//!
53//! - [`FusionStrategy::RRF`] - Reciprocal Rank Fusion (recommended)
54//! - [`FusionStrategy::Linear`] - Weighted combination
55//! - [`FusionStrategy::DBSF`] - Distribution-based score fusion
56//!
57//! # Example: Custom Chunking
58//!
59//! ```rust
60//! use trueno_rag::{chunk::{ParagraphChunker, Chunker}, Document};
61//!
62//! let chunker = ParagraphChunker::new(2); // 2 paragraphs per chunk
63//! let doc = Document::new("Para 1.\n\nPara 2.\n\nPara 3.");
64//! let chunks = chunker.chunk(&doc).unwrap();
65//! assert_eq!(chunks.len(), 2);
66//! ```
67#![deny(missing_docs)]
68#![deny(clippy::all)]
69#![allow(clippy::disallowed_methods)] // json! macro internally uses unwrap
70#![allow(clippy::return_self_not_must_use)]
71#![warn(clippy::pedantic)]
72#![allow(clippy::missing_errors_doc)]
73#![allow(clippy::missing_panics_doc)]
74#![allow(clippy::module_name_repetitions)]
75#![allow(clippy::cast_precision_loss)]
76#![allow(clippy::cast_possible_truncation)]
77#![allow(clippy::doc_markdown)]
78#![allow(clippy::map_unwrap_or)]
79#![allow(clippy::redundant_closure_for_method_calls)]
80#![allow(clippy::unnecessary_literal_bound)]
81#![allow(clippy::cloned_instead_of_copied)]
82#![allow(clippy::must_use_candidate)]
83#![allow(clippy::assigning_clones)]
84#![allow(clippy::manual_div_ceil)]
85#![allow(clippy::unnecessary_map_or)]
86#![allow(clippy::derivable_impls)]
87#[macro_use]
88#[allow(unused_macros)]
89mod generated_contracts;
90pub mod chunk;
91#[cfg(feature = "compression")]
92pub mod compressed;
93pub mod embed;
94pub mod error;
95#[cfg(feature = "eval")]
96pub mod eval;
97pub mod fusion;
98pub mod index;
99pub mod loader;
100pub mod media;
101pub mod metrics;
102#[cfg(feature = "multivector")]
103pub mod multivector;
104pub mod pipeline;
105pub mod preprocess;
106pub mod rerank;
107pub mod retrieve;
108#[cfg(feature = "sqlite")]
109#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss, clippy::cast_possible_truncation)]
110pub mod sqlite;
111pub use chunk::{
112    Chunk, ChunkId, ChunkMetadata, Chunker, ChunkingStrategy, FixedSizeChunker, ParagraphChunker,
113    RecursiveChunker, SemanticChunker, SentenceChunker, StructuralChunker, TimestampChunker,
114};
115#[cfg(feature = "compression")]
116pub use compressed::Compression;
117pub use embed::{Embedder, EmbeddingConfig, PoolingStrategy};
118#[cfg(feature = "embeddings")]
119pub use embed::{EmbeddingModelType, FastEmbedder};
120pub use error::{Error, Result};
121pub use fusion::FusionStrategy;
122pub use index::{BM25Index, SparseIndex, VectorStore};
123#[cfg(feature = "transcription")]
124pub use loader::transcription::{TranscriptionBackend, TranscriptionConfig, TranscriptionLoader};
125#[cfg(feature = "ocr")]
126pub use loader::ImageLoader;
127pub use loader::{DocumentLoader, LoaderRegistry, SubtitleLoader, TextLoader};
128pub use media::{parse_subtitles, SubtitleCue, SubtitleFormat, SubtitleTrack};
129pub use metrics::{AggregatedMetrics, RetrievalMetrics};
130#[cfg(feature = "multivector")]
131pub use multivector::{
132    exact_maxsim, MockMultiVectorEmbedder, MultiVectorEmbedder, MultiVectorEmbedding,
133    ResidualCodec, WarpIndex, WarpIndexConfig, WarpSearchConfig,
134};
135pub use pipeline::{ContextAssembler, RagPipeline};
136pub use rerank::Reranker;
137#[cfg(feature = "multivector")]
138pub use retrieve::MultiVectorRetriever;
139pub use retrieve::{HybridRetriever, RetrievalResult};
140#[cfg(feature = "sqlite")]
141pub use sqlite::{SqliteIndex, SqliteStore};
142/// Document identifier
143#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
144pub struct DocumentId(pub uuid::Uuid);
145impl DocumentId {
146    /// Create a new random document ID
147    #[must_use]
148    pub fn new() -> Self {
149        Self(uuid::Uuid::new_v4())
150    }
151}
152impl Default for DocumentId {
153    fn default() -> Self {
154        Self::new()
155    }
156}
157impl std::fmt::Display for DocumentId {
158    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
159        write!(f, "{}", self.0)
160    }
161}
162/// A document to be indexed
163#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
164pub struct Document {
165    /// Unique document identifier
166    pub id: DocumentId,
167    /// Document content
168    pub content: String,
169    /// Document title
170    pub title: Option<String>,
171    /// Source URL or path
172    pub source: Option<String>,
173    /// Custom metadata
174    pub metadata: std::collections::HashMap<String, serde_json::Value>,
175}
176impl Document {
177    /// Create a new document with the given content
178    #[must_use]
179    pub fn new(content: impl Into<String>) -> Self {
180        Self {
181            id: DocumentId::new(),
182            content: content.into(),
183            title: None,
184            source: None,
185            metadata: std::collections::HashMap::new(),
186        }
187    }
188    /// Set the document title
189    #[must_use]
190    pub fn with_title(mut self, title: impl Into<String>) -> Self {
191        self.title = Some(title.into());
192        self
193    }
194    /// Set the document source
195    #[must_use]
196    pub fn with_source(mut self, source: impl Into<String>) -> Self {
197        self.source = Some(source.into());
198        self
199    }
200}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two independently generated IDs must not collide.
    #[test]
    fn test_document_id_unique() {
        let id1 = DocumentId::new();
        let id2 = DocumentId::new();
        assert_ne!(id1, id2);
    }

    /// `Default` goes through `new`, so it must also produce distinct IDs
    /// rather than a fixed (e.g. nil) value.
    #[test]
    fn test_document_id_default_unique() {
        assert_ne!(DocumentId::default(), DocumentId::default());
    }

    /// A new document carries only its content; every optional part is unset.
    #[test]
    fn test_document_creation() {
        let doc = Document::new("Hello, world!");
        assert_eq!(doc.content, "Hello, world!");
        assert!(doc.title.is_none());
        assert!(doc.source.is_none());
        assert!(doc.metadata.is_empty());
    }

    /// Builder methods set their own field and leave the others untouched.
    #[test]
    fn test_document_builder() {
        let doc =
            Document::new("Content").with_title("Test Title").with_source("https://example.com");
        assert_eq!(doc.content, "Content");
        assert_eq!(doc.title, Some("Test Title".to_string()));
        assert_eq!(doc.source, Some("https://example.com".to_string()));
        assert!(doc.metadata.is_empty());
    }

    /// `Display` output is exactly the wrapped UUID's textual form.
    #[test]
    fn test_document_id_display() {
        let id = DocumentId::new();
        let display = format!("{id}");
        assert!(!display.is_empty());
        assert!(display.contains('-')); // UUID format
        assert_eq!(display, id.0.to_string());
    }

    /// A `DocumentId` survives a JSON round-trip unchanged.
    #[test]
    fn test_document_id_serialization() {
        let id = DocumentId::new();
        let json = serde_json::to_string(&id).unwrap();
        let deserialized: DocumentId = serde_json::from_str(&json).unwrap();
        assert_eq!(id, deserialized);
    }

    /// A full `Document` survives a JSON round-trip with every field intact.
    #[test]
    fn test_document_serialization() {
        let doc = Document::new("Content").with_title("Title").with_source("src");
        let json = serde_json::to_string(&doc).unwrap();
        let restored: Document = serde_json::from_str(&json).unwrap();
        assert_eq!(restored.id, doc.id);
        assert_eq!(restored.content, doc.content);
        assert_eq!(restored.title, doc.title);
        assert_eq!(restored.source, doc.source);
        assert_eq!(restored.metadata, doc.metadata);
    }
}