lethe_core_rust/lib.rs
1//! # lethe-core-rust
2//!
3//! A high-performance hybrid retrieval engine that combines BM25 lexical search with vector similarity
4//! using z-score fusion. Lethe Core provides state-of-the-art context selection for conversational AI
5//! and retrieval-augmented generation (RAG) systems.
6//!
7//! ## Features
8//!
9//! - **Hybrid Retrieval**: Combines BM25 lexical search with vector similarity for optimal relevance
10//! - **Z-Score Fusion**: Normalizes and fuses scores using statistical z-score transformation (α=0.5, β=0.5)
11//! - **Hero Configuration**: Pre-tuned parameters achieving parity with splade baseline performance
12//! - **Gamma Boosting**: Context-aware score boosting for code, errors, and technical content
13//! - **Chunking Pipeline**: Intelligent text segmentation with sentence-level granularity
14//! - **Async-First**: Built on Tokio for high-performance concurrent operations
15//!
16//! ## Quick Start
17//!
18//! ```rust
19//! use lethe_core_rust::{get_hero_config, apply_zscore_fusion, Candidate};
20//!
21//! # #[tokio::main]
22//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
23//! // Get the hero configuration (optimal for splade parity)
24//! let config = get_hero_config();
25//! println!("Hero config: α={}, β={}", config.alpha, config.beta);
26//!
27//! // Example candidates from BM25 and vector search
28//! let bm25_candidates = vec![
29//! Candidate {
30//! doc_id: "doc1".to_string(),
31//! score: 0.8,
32//! text: Some("Rust async programming".to_string()),
33//! kind: Some("bm25".to_string()),
34//! },
35//! ];
36//!
37//! let vector_candidates = vec![
38//! Candidate {
39//! doc_id: "doc1".to_string(),
40//! score: 0.9,
41//! text: Some("Rust async programming".to_string()),
42//! kind: Some("vector".to_string()),
43//! },
44//! ];
45//!
46//! // Apply z-score fusion with hero configuration (α=0.5)
47//! let results = apply_zscore_fusion(bm25_candidates, vector_candidates, 0.5);
48//! println!("Fused {} candidates", results.len());
49//! # Ok(())
50//! # }
51//! ```
52
53// Re-export all shared types and utilities
54pub mod types;
55pub mod error;
56pub mod config;
57pub mod utils;
58
59// Domain services and business logic
60pub mod chunker;
61pub mod retrieval;
62pub mod embeddings;
63pub mod hyde;
64pub mod query_understanding;
65pub mod ml_prediction;
66pub mod pipeline;
67
68// Infrastructure services (optional with database feature)
69#[cfg(feature = "database")]
70pub mod database;
71#[cfg(feature = "database")]
72pub mod repositories;
73
74// Re-export everything for convenience
75pub use types::*;
76pub use error::*;
77pub use config::*;
78pub use utils::*;
79
80pub use chunker::*;
81pub use retrieval::*;
82pub use embeddings::*;
83pub use hyde::*;
84pub use query_understanding::*;
85pub use ml_prediction::*;
86pub use pipeline::*;
87
88#[cfg(feature = "database")]
89pub use database::*;
90#[cfg(feature = "database")]
91pub use repositories::*;
92
93/// Get a hero configuration for testing and benchmarks
94///
95/// The hero configuration provides optimal parameters validated against splade baseline performance:
96/// - α = 0.5, β = 0.5: Equal weighting of lexical and semantic signals
97/// - k_initial = 200: Large candidate pool for comprehensive coverage
98/// - k_final = 5: Focused results optimized for Recall@5 metrics
99/// - Diversification = "splade": Advanced diversification matching baseline method
100/// - Gamma boosting disabled: Clean z-score fusion without latent multipliers
101///
102/// # Examples
103///
104/// ```rust
105/// use lethe_core_rust::get_hero_config;
106///
107/// let config = get_hero_config();
108/// assert_eq!(config.alpha, 0.5);
109/// assert_eq!(config.beta, 0.5);
110/// assert_eq!(config.k_final, 5);
111/// ```
112pub fn get_hero_config() -> HybridRetrievalConfig {
113 HybridRetrievalConfig::hero()
114}
115
116/// Get a validated hero configuration against canonical hash
117///
118/// This function creates a hero configuration and validates it against the expected
119/// hash from the canonical audit manifest. It will refuse to run unless the
120/// configuration matches exactly, providing integrity verification.
121///
122/// # Arguments
123///
124/// * `expected_hash` - The expected SHA-256 hash of the canonical hero configuration
125/// * `allow_override` - Whether to continue with mismatched config (logs warning)
126///
127/// # Returns
128///
129/// A validated `HybridRetrievalConfig` or an error if validation fails
130///
131/// # Examples
132///
133/// ```rust
134/// use lethe_core_rust::{get_hero_config_validated, LetheError};
135///
136/// // With canonical hash (this would be from your manifest)
137/// let canonical_hash = "8dd2de7e89ed4af1aede4cc89d8c9d8435d03a340d918f45e640b4b84959a80f";
138///
139/// // Strict validation - fails on mismatch
140/// match get_hero_config_validated(canonical_hash, false) {
141/// Ok(config) => {
142/// // Use validated config
143/// assert_eq!(config.alpha, 0.5);
144/// },
145/// Err(e) => {
146/// // Handle configuration validation failure
147/// eprintln!("Hero config validation failed: {}", e);
148/// }
149/// }
150///
151/// // Permissive validation - warns on mismatch but continues
152/// let config = get_hero_config_validated(canonical_hash, true)
153/// .expect("Should not fail with override");
154/// ```
155pub fn get_hero_config_validated(expected_hash: &str, allow_override: bool) -> Result<HybridRetrievalConfig> {
156 HybridRetrievalConfig::hero_with_validation(expected_hash, allow_override)
157}
158
159/// Process candidates using z-score fusion
160///
161/// This function demonstrates the core z-score fusion algorithm by:
162/// 1. Converting raw scores to z-scores (mean=0, std=1)
163/// 2. Combining using weighted fusion: `α * z_bm25 + β * z_vector`
164/// 3. Returning unified candidates sorted by hybrid score
165///
166/// # Arguments
167///
168/// * `bm25_candidates` - Candidates from BM25 lexical search
169/// * `vector_candidates` - Candidates from vector semantic search
170/// * `alpha` - Weight for BM25 z-scores (typically 0.5 for equal weighting)
171///
172/// # Returns
173///
174/// Vector of candidates with hybrid scores, sorted descending by relevance
175///
176/// # Examples
177///
178/// ```rust
179/// use lethe_core_rust::{apply_zscore_fusion, Candidate};
180///
181/// let bm25_candidates = vec![
182/// Candidate {
183/// doc_id: "doc1".to_string(),
184/// score: 0.8,
185/// text: Some("Programming tutorial".to_string()),
186/// kind: Some("bm25".to_string()),
187/// },
188/// ];
189///
190/// let vector_candidates = vec![
191/// Candidate {
192/// doc_id: "doc1".to_string(),
193/// score: 0.9,
194/// text: Some("Programming tutorial".to_string()),
195/// kind: Some("vector".to_string()),
196/// },
197/// ];
198///
199/// let results = apply_zscore_fusion(bm25_candidates, vector_candidates, 0.5);
200/// assert!(!results.is_empty());
201/// assert_eq!(results[0].kind, Some("hybrid".to_string()));
202/// ```
203pub fn apply_zscore_fusion(
204 bm25_candidates: Vec<Candidate>,
205 vector_candidates: Vec<Candidate>,
206 alpha: f64
207) -> Vec<Candidate> {
208 let service = HybridRetrievalService::mock_for_testing();
209
210 // Calculate z-scores for each set
211 let zscore_bm25 = service.calculate_zscores(&bm25_candidates);
212 let zscore_vector = service.calculate_zscores(&vector_candidates);
213
214 // Combine with weighted fusion
215 let mut combined = Vec::new();
216 let mut doc_scores = std::collections::HashMap::new();
217
218 for candidate in zscore_bm25 {
219 doc_scores.insert(candidate.doc_id.clone(), alpha * candidate.score);
220 }
221
222 for candidate in zscore_vector {
223 let entry = doc_scores.entry(candidate.doc_id.clone()).or_insert(0.0);
224 *entry += (1.0 - alpha) * candidate.score;
225 }
226
227 // Convert back to candidates
228 for (doc_id, score) in doc_scores {
229 combined.push(Candidate {
230 doc_id,
231 score,
232 text: None,
233 kind: Some("hybrid".to_string()),
234 });
235 }
236
237 // Sort by score descending
238 combined.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
239 combined
240}
241
#[cfg(test)]
mod tests {
    use super::*;

    /// Hash the hero configuration is expected to produce, as pinned by these tests.
    const CANONICAL_HERO_HASH: &str =
        "91ad48c46bfb83257f69b329bf4153a7862765840fbbcfb6fb28ed2408ffe759";

    /// Well-formed 64-character hash that the hero configuration is expected not to match.
    const MISMATCHED_HASH: &str =
        "0000000000000000000000000000000000000000000000000000000000000000";

    /// Build a fully populated candidate for the fusion tests.
    fn candidate(doc_id: &str, score: f64, text: &str, kind: &str) -> Candidate {
        Candidate {
            doc_id: doc_id.to_string(),
            score,
            text: Some(text.to_string()),
            kind: Some(kind.to_string()),
        }
    }

    /// The hero preset exposes the documented parameters.
    #[test]
    fn test_hero_config() {
        let hero = get_hero_config();

        assert_eq!(hero.alpha, 0.5);
        assert_eq!(hero.beta, 0.5);
        assert_eq!(hero.k_final, 5);
        assert_eq!(hero.k_initial, 200);
        assert_eq!(hero.diversify_method, "splade");

        // No per-kind multipliers: gamma boosting is off in the hero preset.
        assert!(hero.gamma_kind_boost.is_empty(), "Gamma boosting should be disabled by default");
    }

    /// Fusion merges both lists by document id, sorts descending and tags results as hybrid.
    #[test]
    fn test_zscore_fusion() {
        let lexical = vec![
            candidate("doc1", 0.8, "Rust programming", "bm25"),
            candidate("doc2", 0.6, "Python data", "bm25"),
        ];
        let semantic = vec![
            candidate("doc1", 0.9, "Rust programming", "vector"),
            candidate("doc3", 0.7, "Machine learning", "vector"),
        ];

        // alpha = 0.5 is the hero weighting.
        let fused = apply_zscore_fusion(lexical, semantic, 0.5);

        // doc1 is shared, doc2 and doc3 each come from one list only.
        assert_eq!(fused.len(), 3);

        // Every neighbouring pair must be in non-increasing score order.
        assert!(fused.windows(2).all(|pair| pair[0].score >= pair[1].score));

        // Every fused result is relabelled as hybrid.
        assert!(fused.iter().all(|item| item.kind == Some("hybrid".to_string())));
    }

    /// Smoke test: the main re-exported items are reachable from the crate root.
    #[test]
    fn test_library_integration() {
        let _config = get_hero_config();
        let _chunking_config = ChunkingConfig::default();
        let _service = HybridRetrievalService::mock_for_testing();

        // Error constructors are re-exported.
        let _error = LetheError::config("test");

        // Plain data types can be built directly.
        let _candidate = Candidate {
            doc_id: "test".to_string(),
            score: 1.0,
            text: None,
            kind: None,
        };
    }

    /// The hero hash is stable, well-formed and equal to the pinned canonical value.
    #[test]
    fn test_hero_config_hash_computation() {
        let hero = get_hero_config();
        let first_hash = hero.compute_hash();

        // Emit the hash and the serialized configuration to aid debugging on failure.
        println!("Hero configuration hash: {}", first_hash);
        let pretty_json = serde_json::to_string_pretty(&hero).expect("Failed to serialize");
        println!("Configuration JSON:\n{}", pretty_json);

        // Hashing the same configuration twice must give the same answer.
        let second_hash = hero.compute_hash();
        assert_eq!(first_hash, second_hash);

        // SHA-256 rendered as hex is 64 hex digits.
        assert_eq!(first_hash.len(), 64);
        assert!(first_hash.chars().all(|c| c.is_ascii_hexdigit()));

        assert_eq!(first_hash, CANONICAL_HERO_HASH);
    }

    /// With the correct hash both strict and permissive validation succeed.
    #[test]
    fn test_hero_config_validation_success() {
        let strict = get_hero_config_validated(CANONICAL_HERO_HASH, false).unwrap();
        assert_eq!(strict.alpha, 0.5);
        assert_eq!(strict.beta, 0.5);
        assert_eq!(strict.k_final, 5);

        let permissive = get_hero_config_validated(CANONICAL_HERO_HASH, true).unwrap();
        assert_eq!(permissive.alpha, 0.5);
    }

    /// With a wrong hash strict validation fails while permissive validation still succeeds.
    #[test]
    fn test_hero_config_validation_failure() {
        let strict = get_hero_config_validated(MISMATCHED_HASH, false);
        assert!(strict.is_err());

        // Override mode hands back the usual hero configuration despite the mismatch.
        let permissive = get_hero_config_validated(MISMATCHED_HASH, true).unwrap();
        assert_eq!(permissive.alpha, 0.5);
    }
}