lethe_core_rust/lib.rs
1//! # lethe-core-rust
2//!
3//! A high-performance hybrid retrieval engine that combines BM25 lexical search with vector similarity
4//! using z-score fusion. Lethe Core provides state-of-the-art context selection for conversational AI
5//! and retrieval-augmented generation (RAG) systems.
6//!
7//! ## Features
8//!
9//! - **Hybrid Retrieval**: Combines BM25 lexical search with vector similarity for optimal relevance
10//! - **Z-Score Fusion**: Normalizes and fuses scores using statistical z-score transformation (α=0.5, β=0.5)
11//! - **Hero Configuration**: Pre-tuned parameters achieving parity with splade baseline performance
12//! - **Gamma Boosting**: Context-aware score boosting for code, errors, and technical content
13//! - **Chunking Pipeline**: Intelligent text segmentation with sentence-level granularity
14//! - **Async-First**: Built on Tokio for high-performance concurrent operations
15//!
16//! ## Quick Start
17//!
18//! ```rust
19//! use lethe_core_rust::{get_hero_config, apply_zscore_fusion, Candidate};
20//!
21//! # #[tokio::main]
22//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
23//! // Get the hero configuration (optimal for splade parity)
24//! let config = get_hero_config();
25//! println!("Hero config: α={}, β={}", config.alpha, config.beta);
26//!
27//! // Example candidates from BM25 and vector search
28//! let bm25_candidates = vec![
29//! Candidate {
30//! doc_id: "doc1".to_string(),
31//! score: 0.8,
32//! text: Some("Rust async programming".to_string()),
33//! kind: Some("bm25".to_string()),
34//! },
35//! ];
36//!
37//! let vector_candidates = vec![
38//! Candidate {
39//! doc_id: "doc1".to_string(),
40//! score: 0.9,
41//! text: Some("Rust async programming".to_string()),
42//! kind: Some("vector".to_string()),
43//! },
44//! ];
45//!
46//! // Apply z-score fusion with hero configuration (α=0.5)
47//! let results = apply_zscore_fusion(bm25_candidates, vector_candidates, 0.5);
48//! println!("Fused {} candidates", results.len());
49//! # Ok(())
50//! # }
51//! ```
52
// Shared types and utilities (glob re-exported further below)
54pub mod types;
55pub mod error;
56pub mod config;
57pub mod utils;
58
59// Domain services and business logic
60pub mod chunker;
61pub mod retrieval;
62pub mod embeddings;
63pub mod hyde;
64pub mod query_understanding;
65pub mod ml_prediction;
66pub mod pipeline;
67
68// Infrastructure services (optional with database feature)
69#[cfg(feature = "database")]
70pub mod database;
71#[cfg(feature = "database")]
72pub mod repositories;
73
74// Re-export everything for convenience
75pub use types::*;
76pub use error::*;
77pub use config::*;
78pub use utils::*;
79
80pub use chunker::*;
81pub use retrieval::*;
82pub use embeddings::*;
83pub use hyde::*;
84pub use query_understanding::*;
85pub use ml_prediction::*;
86pub use pipeline::*;
87
88#[cfg(feature = "database")]
89pub use database::*;
90#[cfg(feature = "database")]
91pub use repositories::*;
92
93/// Get a hero configuration for testing and benchmarks
94///
95/// The hero configuration provides optimal parameters validated against splade baseline performance:
96/// - α = 0.5, β = 0.5: Equal weighting of lexical and semantic signals
97/// - k_initial = 200: Large candidate pool for comprehensive coverage
98/// - k_final = 5: Focused results optimized for Recall@5 metrics
99/// - Diversification = "splade": Advanced diversification matching baseline method
100/// - Gamma boosting disabled: Clean z-score fusion without latent multipliers
101///
102/// # Examples
103///
104/// ```rust
105/// use lethe_core_rust::get_hero_config;
106///
107/// let config = get_hero_config();
108/// assert_eq!(config.alpha, 0.5);
109/// assert_eq!(config.beta, 0.5);
110/// assert_eq!(config.k_final, 5);
111/// ```
112pub fn get_hero_config() -> HybridRetrievalConfig {
113 HybridRetrievalConfig::hero()
114}
115
116/// Process candidates using z-score fusion
117///
118/// This function demonstrates the core z-score fusion algorithm by:
119/// 1. Converting raw scores to z-scores (mean=0, std=1)
120/// 2. Combining using weighted fusion: `α * z_bm25 + β * z_vector`
121/// 3. Returning unified candidates sorted by hybrid score
122///
123/// # Arguments
124///
125/// * `bm25_candidates` - Candidates from BM25 lexical search
126/// * `vector_candidates` - Candidates from vector semantic search
127/// * `alpha` - Weight for BM25 z-scores (typically 0.5 for equal weighting)
128///
129/// # Returns
130///
131/// Vector of candidates with hybrid scores, sorted descending by relevance
132///
133/// # Examples
134///
135/// ```rust
136/// use lethe_core_rust::{apply_zscore_fusion, Candidate};
137///
138/// let bm25_candidates = vec![
139/// Candidate {
140/// doc_id: "doc1".to_string(),
141/// score: 0.8,
142/// text: Some("Programming tutorial".to_string()),
143/// kind: Some("bm25".to_string()),
144/// },
145/// ];
146///
147/// let vector_candidates = vec![
148/// Candidate {
149/// doc_id: "doc1".to_string(),
150/// score: 0.9,
151/// text: Some("Programming tutorial".to_string()),
152/// kind: Some("vector".to_string()),
153/// },
154/// ];
155///
156/// let results = apply_zscore_fusion(bm25_candidates, vector_candidates, 0.5);
157/// assert!(!results.is_empty());
158/// assert_eq!(results[0].kind, Some("hybrid".to_string()));
159/// ```
160pub fn apply_zscore_fusion(
161 bm25_candidates: Vec<Candidate>,
162 vector_candidates: Vec<Candidate>,
163 alpha: f64
164) -> Vec<Candidate> {
165 let service = HybridRetrievalService::mock_for_testing();
166
167 // Calculate z-scores for each set
168 let zscore_bm25 = service.calculate_zscores(&bm25_candidates);
169 let zscore_vector = service.calculate_zscores(&vector_candidates);
170
171 // Combine with weighted fusion
172 let mut combined = Vec::new();
173 let mut doc_scores = std::collections::HashMap::new();
174
175 for candidate in zscore_bm25 {
176 doc_scores.insert(candidate.doc_id.clone(), alpha * candidate.score);
177 }
178
179 for candidate in zscore_vector {
180 let entry = doc_scores.entry(candidate.doc_id.clone()).or_insert(0.0);
181 *entry += (1.0 - alpha) * candidate.score;
182 }
183
184 // Convert back to candidates
185 for (doc_id, score) in doc_scores {
186 combined.push(Candidate {
187 doc_id,
188 score,
189 text: None,
190 kind: Some("hybrid".to_string()),
191 });
192 }
193
194 // Sort by score descending
195 combined.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
196 combined
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a candidate with the given id, raw score, text, and source kind.
    fn candidate(doc_id: &str, score: f64, text: &str, kind: &str) -> Candidate {
        Candidate {
            doc_id: doc_id.to_string(),
            score,
            text: Some(text.to_string()),
            kind: Some(kind.to_string()),
        }
    }

    /// The hero preset must expose the documented parameter values.
    #[test]
    fn test_hero_config() {
        let hero = get_hero_config();

        // Fusion weights and candidate pool sizes
        assert_eq!(hero.alpha, 0.5);
        assert_eq!(hero.beta, 0.5);
        assert_eq!(hero.k_final, 5);
        assert_eq!(hero.k_initial, 200);
        assert_eq!(hero.diversify_method, "splade");

        // No latent multipliers: gamma boosting is off unless configured
        assert!(hero.gamma_kind_boost.is_empty(), "Gamma boosting should be disabled by default");
    }

    /// Fusion merges both lists by `doc_id`, sorts descending, and tags results as hybrid.
    #[test]
    fn test_zscore_fusion() {
        // doc1 appears in both lists; doc2 and doc3 appear in one list each
        let bm25_candidates = vec![
            candidate("doc1", 0.8, "Rust programming", "bm25"),
            candidate("doc2", 0.6, "Python data", "bm25"),
        ];
        let vector_candidates = vec![
            candidate("doc1", 0.9, "Rust programming", "vector"),
            candidate("doc3", 0.7, "Machine learning", "vector"),
        ];

        // alpha = 0.5 matches the hero configuration
        let results = apply_zscore_fusion(bm25_candidates, vector_candidates, 0.5);

        // Three unique documents: doc1, doc2, doc3
        assert_eq!(results.len(), 3);

        // Every adjacent pair must be in non-increasing score order
        for pair in results.windows(2) {
            assert!(pair[0].score >= pair[1].score);
        }

        // Every fused result is labelled as hybrid
        let hybrid = Some("hybrid".to_string());
        for fused in results.iter() {
            assert_eq!(fused.kind, hybrid);
        }
    }

    /// Smoke test: the main re-exported items resolve and can be constructed together.
    #[test]
    fn test_library_integration() {
        // Configuration and service entry points
        let _hero = get_hero_config();
        let _chunking = ChunkingConfig::default();
        let _retrieval = HybridRetrievalService::mock_for_testing();

        // Error types are reachable from the crate root
        let _invalid_query = LetheError::InvalidQuery("test".to_string());

        // Shared data types are reachable from the crate root
        let _bare_candidate = Candidate {
            doc_id: "test".to_string(),
            score: 1.0,
            text: None,
            kind: None,
        };
    }
}