lethe_core_rust/
lib.rs

1//! # lethe-core-rust
2//! 
3//! A high-performance hybrid retrieval engine that combines BM25 lexical search with vector similarity 
4//! using z-score fusion. Lethe Core provides state-of-the-art context selection for conversational AI 
5//! and retrieval-augmented generation (RAG) systems.
6//!
7//! ## Features
8//!
9//! - **Hybrid Retrieval**: Combines BM25 lexical search with vector similarity for optimal relevance
10//! - **Z-Score Fusion**: Normalizes and fuses scores using statistical z-score transformation (α=0.5, β=0.5)  
//! - **Hero Configuration**: Pre-tuned parameters that achieve parity with the SPLADE baseline
12//! - **Gamma Boosting**: Context-aware score boosting for code, errors, and technical content
13//! - **Chunking Pipeline**: Intelligent text segmentation with sentence-level granularity
14//! - **Async-First**: Built on Tokio for high-performance concurrent operations
15//!
16//! ## Quick Start
17//!
18//! ```rust
19//! use lethe_core_rust::{get_hero_config, apply_zscore_fusion, Candidate};
20//!
21//! # #[tokio::main]
22//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
23//! // Get the hero configuration (optimal for splade parity)
24//! let config = get_hero_config();
25//! println!("Hero config: α={}, β={}", config.alpha, config.beta);
26//! 
27//! // Example candidates from BM25 and vector search
28//! let bm25_candidates = vec![
29//!     Candidate {
30//!         doc_id: "doc1".to_string(),
31//!         score: 0.8,
32//!         text: Some("Rust async programming".to_string()),
33//!         kind: Some("bm25".to_string()),
34//!     },
35//! ];
36//! 
37//! let vector_candidates = vec![
38//!     Candidate {
39//!         doc_id: "doc1".to_string(),
40//!         score: 0.9,
41//!         text: Some("Rust async programming".to_string()),
42//!         kind: Some("vector".to_string()),
43//!     },
44//! ];
45//! 
46//! // Apply z-score fusion with hero configuration (α=0.5)
47//! let results = apply_zscore_fusion(bm25_candidates, vector_candidates, 0.5);
48//! println!("Fused {} candidates", results.len());
49//! # Ok(())
50//! # }
51//! ```
52
// Shared types and utilities (re-exported at the crate root below)
54pub mod types;
55pub mod error;
56pub mod config;
57pub mod utils;
58
59// Domain services and business logic
60pub mod chunker;
61pub mod retrieval;
62pub mod embeddings;
63pub mod hyde;
64pub mod query_understanding;
65pub mod ml_prediction;
66pub mod pipeline;
67
68// Infrastructure services (optional with database feature)
69#[cfg(feature = "database")]
70pub mod database;
71#[cfg(feature = "database")]
72pub mod repositories;
73
74// Re-export everything for convenience
75pub use types::*;
76pub use error::*;
77pub use config::*;
78pub use utils::*;
79
80pub use chunker::*;
81pub use retrieval::*;
82pub use embeddings::*;
83pub use hyde::*;
84pub use query_understanding::*;
85pub use ml_prediction::*;
86pub use pipeline::*;
87
88#[cfg(feature = "database")]
89pub use database::*;
90#[cfg(feature = "database")]
91pub use repositories::*;
92
93/// Get a hero configuration for testing and benchmarks
94/// 
95/// The hero configuration provides optimal parameters validated against splade baseline performance:
96/// - α = 0.5, β = 0.5: Equal weighting of lexical and semantic signals
97/// - k_initial = 200: Large candidate pool for comprehensive coverage  
98/// - k_final = 5: Focused results optimized for Recall@5 metrics
99/// - Diversification = "splade": Advanced diversification matching baseline method
100/// - Gamma boosting disabled: Clean z-score fusion without latent multipliers
101/// 
102/// # Examples
103///
104/// ```rust
105/// use lethe_core_rust::get_hero_config;
106/// 
107/// let config = get_hero_config();
108/// assert_eq!(config.alpha, 0.5);
109/// assert_eq!(config.beta, 0.5);
110/// assert_eq!(config.k_final, 5);
111/// ```
112pub fn get_hero_config() -> HybridRetrievalConfig {
113    HybridRetrievalConfig::hero()
114}
115
116/// Process candidates using z-score fusion
117/// 
118/// This function demonstrates the core z-score fusion algorithm by:
119/// 1. Converting raw scores to z-scores (mean=0, std=1)
120/// 2. Combining using weighted fusion: `α * z_bm25 + β * z_vector`
121/// 3. Returning unified candidates sorted by hybrid score
122///
123/// # Arguments
124///
125/// * `bm25_candidates` - Candidates from BM25 lexical search
126/// * `vector_candidates` - Candidates from vector semantic search
127/// * `alpha` - Weight for BM25 z-scores (typically 0.5 for equal weighting)
128///
129/// # Returns
130///
131/// Vector of candidates with hybrid scores, sorted descending by relevance
132///
133/// # Examples
134///
135/// ```rust
136/// use lethe_core_rust::{apply_zscore_fusion, Candidate};
137/// 
138/// let bm25_candidates = vec![
139///     Candidate {
140///         doc_id: "doc1".to_string(),
141///         score: 0.8,
142///         text: Some("Programming tutorial".to_string()),
143///         kind: Some("bm25".to_string()),
144///     },
145/// ];
146/// 
147/// let vector_candidates = vec![
148///     Candidate {
149///         doc_id: "doc1".to_string(),
150///         score: 0.9,
151///         text: Some("Programming tutorial".to_string()),
152///         kind: Some("vector".to_string()),
153///     },
154/// ];
155/// 
156/// let results = apply_zscore_fusion(bm25_candidates, vector_candidates, 0.5);
157/// assert!(!results.is_empty());
158/// assert_eq!(results[0].kind, Some("hybrid".to_string()));
159/// ```
160pub fn apply_zscore_fusion(
161    bm25_candidates: Vec<Candidate>, 
162    vector_candidates: Vec<Candidate>, 
163    alpha: f64
164) -> Vec<Candidate> {
165    let service = HybridRetrievalService::mock_for_testing();
166    
167    // Calculate z-scores for each set
168    let zscore_bm25 = service.calculate_zscores(&bm25_candidates);
169    let zscore_vector = service.calculate_zscores(&vector_candidates);
170    
171    // Combine with weighted fusion
172    let mut combined = Vec::new();
173    let mut doc_scores = std::collections::HashMap::new();
174    
175    for candidate in zscore_bm25 {
176        doc_scores.insert(candidate.doc_id.clone(), alpha * candidate.score);
177    }
178    
179    for candidate in zscore_vector {
180        let entry = doc_scores.entry(candidate.doc_id.clone()).or_insert(0.0);
181        *entry += (1.0 - alpha) * candidate.score;
182    }
183    
184    // Convert back to candidates
185    for (doc_id, score) in doc_scores {
186        combined.push(Candidate {
187            doc_id,
188            score,
189            text: None,
190            kind: Some("hybrid".to_string()),
191        });
192    }
193    
194    // Sort by score descending
195    combined.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
196    combined
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fully populated `Candidate` fixture.
    fn make_candidate(doc_id: &str, score: f64, text: &str, kind: &str) -> Candidate {
        Candidate {
            doc_id: doc_id.to_string(),
            score,
            text: Some(text.to_string()),
            kind: Some(kind.to_string()),
        }
    }

    /// The hero configuration must expose the documented, pre-tuned parameters.
    #[test]
    fn test_hero_config() {
        let hero = get_hero_config();

        // Fusion weights and candidate pool sizes
        assert_eq!(hero.alpha, 0.5);
        assert_eq!(hero.beta, 0.5);
        assert_eq!(hero.k_final, 5);
        assert_eq!(hero.k_initial, 200);
        assert_eq!(hero.diversify_method, "splade");

        // No latent multipliers: gamma boosting is off unless explicitly configured
        assert!(hero.gamma_kind_boost.is_empty(), "Gamma boosting should be disabled by default");
    }

    /// Fusion merges both lists by `doc_id`, sorts descending and tags results as hybrid.
    #[test]
    fn test_zscore_fusion() {
        let lexical = vec![
            make_candidate("doc1", 0.8, "Rust programming", "bm25"),
            make_candidate("doc2", 0.6, "Python data", "bm25"),
        ];
        let semantic = vec![
            make_candidate("doc1", 0.9, "Rust programming", "vector"),
            make_candidate("doc3", 0.7, "Machine learning", "vector"),
        ];

        // alpha = 0.5 is the hero configuration's weighting
        let fused = apply_zscore_fusion(lexical, semantic, 0.5);

        // doc1 appears in both lists, so three distinct documents remain
        assert_eq!(fused.len(), 3);

        // Every adjacent pair must be in non-increasing score order
        assert!(fused.windows(2).all(|pair| pair[0].score >= pair[1].score));

        // Every fused candidate carries the hybrid marker
        for hit in &fused {
            assert_eq!(hit.kind.as_deref(), Some("hybrid"));
        }
    }

    /// Smoke test: the crate-root re-exports resolve and the main types can be constructed.
    #[test]
    fn test_library_integration() {
        // Configuration and service entry points
        let _hero = get_hero_config();
        let _chunking = ChunkingConfig::default();
        let _retrieval = HybridRetrievalService::mock_for_testing();

        // Error type
        let _invalid_query = LetheError::InvalidQuery("test".to_string());

        // Core data type, built with its optional fields left unset
        let _bare_candidate = Candidate {
            doc_id: "test".to_string(),
            score: 1.0,
            text: None,
            kind: None,
        };
    }
}