Skip to main content

trueno_rag/
lib.rs

1//! Trueno-RAG: Pure-Rust Retrieval-Augmented Generation Pipeline
2//!
3//! This crate provides a complete RAG stack built on Trueno compute primitives
4//! with zero Python/C++ dependencies.
5//!
6//! # Quick Start
7//!
8//! ```rust
9//! use trueno_rag::{
10//!     pipeline::RagPipelineBuilder,
11//!     chunk::RecursiveChunker,
12//!     embed::MockEmbedder,
13//!     rerank::NoOpReranker,
14//!     fusion::FusionStrategy,
15//!     Document,
16//! };
17//!
18//! // Build a RAG pipeline
19//! let mut pipeline = RagPipelineBuilder::new()
20//!     .chunker(RecursiveChunker::new(512, 50))
21//!     .embedder(MockEmbedder::new(384))
22//!     .reranker(NoOpReranker::new())
23//!     .fusion(FusionStrategy::RRF { k: 60.0 })
24//!     .build()
25//!     .unwrap();
26//!
27//! // Index a document
28//! let doc = Document::new("Machine learning enables computers to learn from data.")
29//!     .with_title("ML Intro");
30//! pipeline.index_document(&doc).unwrap();
31//!
32//! // Query the pipeline
33//! let results = pipeline.query("machine learning", 5).unwrap();
34//! assert!(!results.is_empty());
35//! ```
36//!
37//! # Chunking Strategies
38//!
39//! Multiple chunking strategies are available:
40//!
41//! - [`RecursiveChunker`] - Hierarchical splitting (default)
42//! - [`FixedSizeChunker`] - Character-based splitting
43//! - [`SentenceChunker`] - Sentence-boundary aware
44//! - [`ParagraphChunker`] - Paragraph grouping
45//! - [`SemanticChunker`] - Embedding similarity-based
46//! - [`StructuralChunker`] - Header/section-aware
47//!
48//! # Fusion Strategies
49//!
50//! Combine dense and sparse retrieval results:
51//!
52//! - [`FusionStrategy::RRF`] - Reciprocal Rank Fusion (recommended)
53//! - [`FusionStrategy::Linear`] - Weighted combination
54//! - [`FusionStrategy::DBSF`] - Distribution-based score fusion
55//!
56//! # Example: Custom Chunking
57//!
58//! ```rust
59//! use trueno_rag::{chunk::{ParagraphChunker, Chunker}, Document};
60//!
61//! let chunker = ParagraphChunker::new(2); // 2 paragraphs per chunk
62//! let doc = Document::new("Para 1.\n\nPara 2.\n\nPara 3.");
63//! let chunks = chunker.chunk(&doc).unwrap();
64//! assert_eq!(chunks.len(), 2);
65//! ```
66
67#![deny(missing_docs)]
68#![deny(clippy::all)]
69#![warn(clippy::pedantic)]
70#![allow(clippy::missing_errors_doc)]
71#![allow(clippy::missing_panics_doc)]
72#![allow(clippy::module_name_repetitions)]
73#![allow(clippy::cast_precision_loss)]
74#![allow(clippy::cast_possible_truncation)]
75#![allow(clippy::doc_markdown)]
76#![allow(clippy::map_unwrap_or)]
77#![allow(clippy::redundant_closure_for_method_calls)]
78#![allow(clippy::unnecessary_literal_bound)]
79#![allow(clippy::cloned_instead_of_copied)]
80#![allow(clippy::must_use_candidate)]
81#![allow(clippy::assigning_clones)]
82#![allow(clippy::manual_div_ceil)]
83#![allow(clippy::unnecessary_map_or)]
84#![allow(clippy::derivable_impls)]
85
86pub mod chunk;
87#[cfg(feature = "compression")]
88pub mod compressed;
89pub mod embed;
90pub mod error;
91pub mod fusion;
92pub mod index;
93pub mod metrics;
94pub mod pipeline;
95pub mod rerank;
96pub mod retrieve;
97
98pub use chunk::{
99    Chunk, ChunkId, ChunkMetadata, Chunker, ChunkingStrategy, FixedSizeChunker, ParagraphChunker,
100    RecursiveChunker, SemanticChunker, SentenceChunker, StructuralChunker,
101};
102#[cfg(feature = "compression")]
103pub use compressed::Compression;
104pub use embed::{Embedder, EmbeddingConfig, PoolingStrategy};
105#[cfg(feature = "embeddings")]
106pub use embed::{EmbeddingModelType, FastEmbedder};
107pub use error::{Error, Result};
108pub use fusion::FusionStrategy;
109pub use index::{BM25Index, SparseIndex, VectorStore};
110pub use metrics::{AggregatedMetrics, RetrievalMetrics};
111pub use pipeline::{ContextAssembler, RagPipeline};
112pub use rerank::Reranker;
113pub use retrieve::{HybridRetriever, RetrievalResult};
114
115/// Document identifier
116#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
117pub struct DocumentId(pub uuid::Uuid);
118
119impl DocumentId {
120    /// Create a new random document ID
121    #[must_use]
122    pub fn new() -> Self {
123        Self(uuid::Uuid::new_v4())
124    }
125}
126
127impl Default for DocumentId {
128    fn default() -> Self {
129        Self::new()
130    }
131}
132
133impl std::fmt::Display for DocumentId {
134    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
135        write!(f, "{}", self.0)
136    }
137}
138
139/// A document to be indexed
140#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
141pub struct Document {
142    /// Unique document identifier
143    pub id: DocumentId,
144    /// Document content
145    pub content: String,
146    /// Document title
147    pub title: Option<String>,
148    /// Source URL or path
149    pub source: Option<String>,
150    /// Custom metadata
151    pub metadata: std::collections::HashMap<String, serde_json::Value>,
152}
153
154impl Document {
155    /// Create a new document with the given content
156    #[must_use]
157    pub fn new(content: impl Into<String>) -> Self {
158        Self {
159            id: DocumentId::new(),
160            content: content.into(),
161            title: None,
162            source: None,
163            metadata: std::collections::HashMap::new(),
164        }
165    }
166
167    /// Set the document title
168    #[must_use]
169    pub fn with_title(mut self, title: impl Into<String>) -> Self {
170        self.title = Some(title.into());
171        self
172    }
173
174    /// Set the document source
175    #[must_use]
176    pub fn with_source(mut self, source: impl Into<String>) -> Self {
177        self.source = Some(source.into());
178        self
179    }
180}
181
#[cfg(test)]
mod tests {
    use super::*;

    // Two freshly generated identifiers must differ.
    #[test]
    fn test_document_id_unique() {
        let first = DocumentId::new();
        let second = DocumentId::new();
        assert_ne!(first, second);
    }

    // A document built from content alone has no title and no source.
    #[test]
    fn test_document_creation() {
        let document = Document::new("Hello, world!");
        assert_eq!(document.content, "Hello, world!");
        assert_eq!(document.title, None);
        assert_eq!(document.source, None);
    }

    // The builder methods set the optional fields and leave the content alone.
    #[test]
    fn test_document_builder() {
        let document = Document::new("Content")
            .with_title("Test Title")
            .with_source("https://example.com");

        assert_eq!(document.content, "Content");
        assert_eq!(document.title.as_deref(), Some("Test Title"));
        assert_eq!(document.source.as_deref(), Some("https://example.com"));
    }

    // The displayed form is non-empty and hyphenated like a UUID.
    #[test]
    fn test_document_id_display() {
        let rendered = DocumentId::new().to_string();
        assert!(!rendered.is_empty());
        assert!(rendered.contains('-')); // UUID format
    }

    // An identifier survives a JSON round trip unchanged.
    #[test]
    fn test_document_id_serialization() {
        let original = DocumentId::new();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: DocumentId = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}