lethe_core_rust/
lib.rs

1//! # lethe-core-rust
2//! 
3//! A high-performance hybrid retrieval engine that combines BM25 lexical search with vector similarity 
4//! using z-score fusion. Lethe Core provides state-of-the-art context selection for conversational AI 
5//! and retrieval-augmented generation (RAG) systems.
6//!
7//! ## Features
8//!
9//! - **Hybrid Retrieval**: Combines BM25 lexical search with vector similarity for optimal relevance
10//! - **Z-Score Fusion**: Normalizes and fuses scores using statistical z-score transformation (α=0.5, β=0.5)  
11//! - **Hero Configuration**: Pre-tuned parameters achieving parity with splade baseline performance
12//! - **Gamma Boosting**: Context-aware score boosting for code, errors, and technical content
13//! - **Chunking Pipeline**: Intelligent text segmentation with sentence-level granularity
14//! - **Async-First**: Built on Tokio for high-performance concurrent operations
15//!
16//! ## Quick Start
17//!
18//! ```rust
19//! use lethe_core_rust::{get_hero_config, apply_zscore_fusion, Candidate};
20//!
21//! # #[tokio::main]
22//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
23//! // Get the hero configuration (optimal for splade parity)
24//! let config = get_hero_config();
25//! println!("Hero config: α={}, β={}", config.alpha, config.beta);
26//! 
27//! // Example candidates from BM25 and vector search
28//! let bm25_candidates = vec![
29//!     Candidate {
30//!         doc_id: "doc1".to_string(),
31//!         score: 0.8,
32//!         text: Some("Rust async programming".to_string()),
33//!         kind: Some("bm25".to_string()),
34//!     },
35//! ];
36//! 
37//! let vector_candidates = vec![
38//!     Candidate {
39//!         doc_id: "doc1".to_string(),
40//!         score: 0.9,
41//!         text: Some("Rust async programming".to_string()),
42//!         kind: Some("vector".to_string()),
43//!     },
44//! ];
45//! 
46//! // Apply z-score fusion with hero configuration (α=0.5)
47//! let results = apply_zscore_fusion(bm25_candidates, vector_candidates, 0.5);
48//! println!("Fused {} candidates", results.len());
49//! # Ok(())
50//! # }
51//! ```
52
// Shared types, errors, configuration, and utilities (re-exported further below)
54pub mod types;
55pub mod error;
56pub mod config;
57pub mod utils;
58
59// Domain services and business logic
60pub mod chunker;
61pub mod retrieval;
62pub mod embeddings;
63pub mod hyde;
64pub mod query_understanding;
65pub mod ml_prediction;
66pub mod pipeline;
67
68// Infrastructure services (optional with database feature)
69#[cfg(feature = "database")]
70pub mod database;
71#[cfg(feature = "database")]
72pub mod repositories;
73
74// Re-export everything for convenience
75pub use types::*;
76pub use error::*;
77pub use config::*;
78pub use utils::*;
79
80pub use chunker::*;
81pub use retrieval::*;
82pub use embeddings::*;
83pub use hyde::*;
84pub use query_understanding::*;
85pub use ml_prediction::*;
86pub use pipeline::*;
87
88#[cfg(feature = "database")]
89pub use database::*;
90#[cfg(feature = "database")]
91pub use repositories::*;
92
93/// Get a hero configuration for testing and benchmarks
94/// 
95/// The hero configuration provides optimal parameters validated against splade baseline performance:
96/// - α = 0.5, β = 0.5: Equal weighting of lexical and semantic signals
97/// - k_initial = 200: Large candidate pool for comprehensive coverage  
98/// - k_final = 5: Focused results optimized for Recall@5 metrics
99/// - Diversification = "splade": Advanced diversification matching baseline method
100/// - Gamma boosting disabled: Clean z-score fusion without latent multipliers
101/// 
102/// # Examples
103///
104/// ```rust
105/// use lethe_core_rust::get_hero_config;
106/// 
107/// let config = get_hero_config();
108/// assert_eq!(config.alpha, 0.5);
109/// assert_eq!(config.beta, 0.5);
110/// assert_eq!(config.k_final, 5);
111/// ```
112pub fn get_hero_config() -> HybridRetrievalConfig {
113    HybridRetrievalConfig::hero()
114}
115
116/// Get a validated hero configuration against canonical hash
117/// 
118/// This function creates a hero configuration and validates it against the expected
119/// hash from the canonical audit manifest. It will refuse to run unless the 
120/// configuration matches exactly, providing integrity verification.
121/// 
122/// # Arguments
123/// 
124/// * `expected_hash` - The expected SHA-256 hash of the canonical hero configuration
125/// * `allow_override` - Whether to continue with mismatched config (logs warning)
126/// 
127/// # Returns
128/// 
129/// A validated `HybridRetrievalConfig` or an error if validation fails
130/// 
131/// # Examples
132/// 
133/// ```rust
134/// use lethe_core_rust::{get_hero_config_validated, LetheError};
135/// 
136/// // With canonical hash (this would be from your manifest)
137/// let canonical_hash = "8dd2de7e89ed4af1aede4cc89d8c9d8435d03a340d918f45e640b4b84959a80f";
138/// 
139/// // Strict validation - fails on mismatch
140/// match get_hero_config_validated(canonical_hash, false) {
141///     Ok(config) => {
142///         // Use validated config
143///         assert_eq!(config.alpha, 0.5);
144///     },
145///     Err(e) => {
146///         // Handle configuration validation failure
147///         eprintln!("Hero config validation failed: {}", e);
148///     }
149/// }
150/// 
151/// // Permissive validation - warns on mismatch but continues
152/// let config = get_hero_config_validated(canonical_hash, true)
153///     .expect("Should not fail with override");
154/// ```
155pub fn get_hero_config_validated(expected_hash: &str, allow_override: bool) -> Result<HybridRetrievalConfig> {
156    HybridRetrievalConfig::hero_with_validation(expected_hash, allow_override)
157}
158
159/// Process candidates using z-score fusion
160/// 
161/// This function demonstrates the core z-score fusion algorithm by:
162/// 1. Converting raw scores to z-scores (mean=0, std=1)
163/// 2. Combining using weighted fusion: `α * z_bm25 + β * z_vector`
164/// 3. Returning unified candidates sorted by hybrid score
165///
166/// # Arguments
167///
168/// * `bm25_candidates` - Candidates from BM25 lexical search
169/// * `vector_candidates` - Candidates from vector semantic search
170/// * `alpha` - Weight for BM25 z-scores (typically 0.5 for equal weighting)
171///
172/// # Returns
173///
174/// Vector of candidates with hybrid scores, sorted descending by relevance
175///
176/// # Examples
177///
178/// ```rust
179/// use lethe_core_rust::{apply_zscore_fusion, Candidate};
180/// 
181/// let bm25_candidates = vec![
182///     Candidate {
183///         doc_id: "doc1".to_string(),
184///         score: 0.8,
185///         text: Some("Programming tutorial".to_string()),
186///         kind: Some("bm25".to_string()),
187///     },
188/// ];
189/// 
190/// let vector_candidates = vec![
191///     Candidate {
192///         doc_id: "doc1".to_string(),
193///         score: 0.9,
194///         text: Some("Programming tutorial".to_string()),
195///         kind: Some("vector".to_string()),
196///     },
197/// ];
198/// 
199/// let results = apply_zscore_fusion(bm25_candidates, vector_candidates, 0.5);
200/// assert!(!results.is_empty());
201/// assert_eq!(results[0].kind, Some("hybrid".to_string()));
202/// ```
203pub fn apply_zscore_fusion(
204    bm25_candidates: Vec<Candidate>, 
205    vector_candidates: Vec<Candidate>, 
206    alpha: f64
207) -> Vec<Candidate> {
208    let service = HybridRetrievalService::mock_for_testing();
209    
210    // Calculate z-scores for each set
211    let zscore_bm25 = service.calculate_zscores(&bm25_candidates);
212    let zscore_vector = service.calculate_zscores(&vector_candidates);
213    
214    // Combine with weighted fusion
215    let mut combined = Vec::new();
216    let mut doc_scores = std::collections::HashMap::new();
217    
218    for candidate in zscore_bm25 {
219        doc_scores.insert(candidate.doc_id.clone(), alpha * candidate.score);
220    }
221    
222    for candidate in zscore_vector {
223        let entry = doc_scores.entry(candidate.doc_id.clone()).or_insert(0.0);
224        *entry += (1.0 - alpha) * candidate.score;
225    }
226    
227    // Convert back to candidates
228    for (doc_id, score) in doc_scores {
229        combined.push(Candidate {
230            doc_id,
231            score,
232            text: None,
233            kind: Some("hybrid".to_string()),
234        });
235    }
236    
237    // Sort by score descending
238    combined.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
239    combined
240}
241
#[cfg(test)]
mod tests {
    use super::*;

    /// Hash the hero configuration is expected to produce.
    const CANONICAL_HERO_HASH: &str =
        "91ad48c46bfb83257f69b329bf4153a7862765840fbbcfb6fb28ed2408ffe759";

    /// Build a candidate with both `text` and `kind` populated.
    fn candidate(doc_id: &str, score: f64, text: &str, kind: &str) -> Candidate {
        Candidate {
            doc_id: doc_id.to_string(),
            score,
            text: Some(text.to_string()),
            kind: Some(kind.to_string()),
        }
    }

    /// The hero configuration exposes the documented parameter values.
    #[test]
    fn test_hero_config() {
        let hero = get_hero_config();

        assert_eq!(hero.alpha, 0.5);
        assert_eq!(hero.beta, 0.5);
        assert_eq!(hero.k_final, 5);
        assert_eq!(hero.k_initial, 200);
        assert_eq!(hero.diversify_method, "splade");

        // No per-kind multipliers: gamma boosting is off in the hero configuration.
        assert!(hero.gamma_kind_boost.is_empty(), "Gamma boosting should be disabled by default");
    }

    /// Fusion merges both lists by document id, sorts descending, and tags results as hybrid.
    #[test]
    fn test_zscore_fusion() {
        let lexical = vec![
            candidate("doc1", 0.8, "Rust programming", "bm25"),
            candidate("doc2", 0.6, "Python data", "bm25"),
        ];
        let semantic = vec![
            candidate("doc1", 0.9, "Rust programming", "vector"),
            candidate("doc3", 0.7, "Machine learning", "vector"),
        ];

        // alpha = 0.5 mirrors the hero configuration
        let fused = apply_zscore_fusion(lexical, semantic, 0.5);

        // doc1 appears in both lists, so three distinct documents remain
        assert_eq!(fused.len(), 3);

        // Every adjacent pair must be in non-increasing score order
        assert!(fused.windows(2).all(|pair| pair[0].score >= pair[1].score));

        // Every fused candidate is tagged as hybrid
        for item in &fused {
            assert_eq!(item.kind, Some("hybrid".to_string()));
        }
    }

    /// Smoke test: the main re-exported items resolve and can be constructed together.
    #[test]
    fn test_library_integration() {
        let _config = get_hero_config();
        let _chunking_config = ChunkingConfig::default();
        let _service = HybridRetrievalService::mock_for_testing();

        // Error constructors are reachable through the crate root
        let _error = LetheError::config("test");

        // Plain data types are reachable through the crate root
        let _candidate = Candidate {
            doc_id: String::from("test"),
            score: 1.0,
            text: None,
            kind: None,
        };
    }

    /// The hero configuration hash is stable, well-formed, and equal to the canonical value.
    #[test]
    fn test_hero_config_hash_computation() {
        let hero = get_hero_config();
        let first_hash = hero.compute_hash();

        // Emit the hash and the serialized configuration to ease debugging a mismatch
        println!("Hero configuration hash: {}", first_hash);
        let pretty = serde_json::to_string_pretty(&hero).expect("Failed to serialize");
        println!("Configuration JSON:\n{}", pretty);

        // Hashing the same configuration twice must give the same result
        let second_hash = hero.compute_hash();
        assert_eq!(first_hash, second_hash);

        // SHA-256 rendered as hex: 64 characters, all hex digits
        assert_eq!(first_hash.len(), 64);
        assert!(first_hash.chars().all(|ch| ch.is_ascii_hexdigit()));

        // And it must equal the canonical value
        assert_eq!(first_hash, CANONICAL_HERO_HASH);
    }

    /// With the correct hash, both strict and permissive validation succeed.
    #[test]
    fn test_hero_config_validation_success() {
        // Strict mode
        let strict = get_hero_config_validated(CANONICAL_HERO_HASH, false).unwrap();
        assert_eq!(strict.alpha, 0.5);
        assert_eq!(strict.beta, 0.5);
        assert_eq!(strict.k_final, 5);

        // Permissive mode
        let permissive = get_hero_config_validated(CANONICAL_HERO_HASH, true).unwrap();
        assert_eq!(permissive.alpha, 0.5);
    }

    /// With a wrong hash, strict validation errors while permissive validation still returns.
    #[test]
    fn test_hero_config_validation_failure() {
        let wrong_hash = "0000000000000000000000000000000000000000000000000000000000000000";

        // Strict mode rejects the mismatch
        assert!(get_hero_config_validated(wrong_hash, false).is_err());

        // Permissive mode tolerates the mismatch and hands back a usable configuration
        let permissive = get_hero_config_validated(wrong_hash, true).unwrap();
        assert_eq!(permissive.alpha, 0.5);
    }
}