lethe_core_rust/utils.rs

use regex::Regex;
use sha2::{Digest, Sha256};
use std::collections::HashSet;
use std::sync::OnceLock;

/// Pre-compiled regexes for performance
struct CompiledRegexes {
    alphanumeric: Regex,
    punctuation: Regex,
    sentence_split: Regex,
    code_fence: Regex,
    word_boundary: Regex,
    code_symbol: Regex,
    error_token: Regex,
    path_file: Regex,
    numeric_id: Regex,
}

impl CompiledRegexes {
    fn new() -> Self {
        Self {
            alphanumeric: Regex::new(r"[a-zA-Z0-9]+").unwrap(),
            punctuation: Regex::new(r"[^\w\s]").unwrap(),
            sentence_split: Regex::new(r"[.!?]\s+").unwrap(),
            code_fence: Regex::new(r"```[\s\S]*?```").unwrap(),
            word_boundary: Regex::new(r"\b\w+\b").unwrap(),
            code_symbol: Regex::new(r"[_a-zA-Z][\w]*\(|\b[A-Z][A-Za-z0-9]+::[A-Za-z0-9]+\b").unwrap(),
            error_token: Regex::new(r"(?i)(Exception|Error|stack trace|errno|\bE\d{2,}\b)").unwrap(),
            path_file: Regex::new(r"/[^\s]+\.[a-zA-Z0-9]+|[A-Za-z]:\\[^\s]+\.[a-zA-Z0-9]+").unwrap(),
            numeric_id: Regex::new(r"\b\d{3,}\b").unwrap(),
        }
    }
}

/// Global regex cache to avoid repeated compilation
static REGEX_CACHE: OnceLock<CompiledRegexes> = OnceLock::new();

fn get_regex_cache() -> &'static CompiledRegexes {
    REGEX_CACHE.get_or_init(CompiledRegexes::new)
}
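
// A minimal sketch of the caching behavior (illustrative only): the first
// call pays the regex-compilation cost once, and every later call returns
// the same `&'static` instance.
//
//     let first = get_regex_cache();
//     let later = get_regex_cache();
//     assert!(std::ptr::eq(first, later)); // one shared, lazily-built set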

/// Token counting utilities
pub struct TokenCounter;

impl TokenCounter {
    /// Count tokens in text using a GPT-style approximation.
    /// This provides a rough estimate - for actual tokenization, use a proper tokenizer.
    pub fn count_tokens(text: &str) -> i32 {
        if text.is_empty() {
            return 0;
        }

        Self::count_tokens_detailed(text).total_tokens
    }

    /// Count tokens with a detailed breakdown for debugging
    pub fn count_tokens_detailed(text: &str) -> TokenCounts {
        if text.is_empty() {
            return TokenCounts::default();
        }

        let regex_cache = get_regex_cache();
        let words: Vec<&str> = text.split_whitespace().collect();
        if words.is_empty() {
            return TokenCounts::default();
        }

        let mut alphanumeric_tokens = 0;
        let mut punctuation_tokens = 0;

        for word in &words {
            // Count alphanumeric sequences
            alphanumeric_tokens += regex_cache.alphanumeric.find_iter(word).count() as i32;

            // Count punctuation separately
            punctuation_tokens += regex_cache.punctuation.find_iter(word).count() as i32;
        }

        // Count whitespace gaps between words (words.len() - 1 of them)
        let whitespace_tokens = words.len().saturating_sub(1) as i32;

        // Total approximation: alphanumeric + ceil(punctuation / 2) + whitespace
        let total_tokens = alphanumeric_tokens + (punctuation_tokens + 1) / 2 + whitespace_tokens;

        TokenCounts {
            alphanumeric_tokens,
            punctuation_tokens,
            whitespace_tokens,
            total_tokens: std::cmp::max(1, total_tokens),
        }
    }
}
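
// Worked example of the approximation (illustrative, not part of the API):
// for "fn main() {}" the breakdown is
//   alphanumeric: "fn", "main"           -> 2
//   punctuation:  "(", ")", "{", "}"     -> 4, counted as (4 + 1) / 2 = 2
//   whitespace:   2 gaps between 3 words -> 2
// so `TokenCounter::count_tokens("fn main() {}")` returns 6.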

#[derive(Debug, Clone, Default)]
pub struct TokenCounts {
    pub alphanumeric_tokens: i32,
    pub punctuation_tokens: i32,
    pub whitespace_tokens: i32,
    pub total_tokens: i32,
}

/// Configuration options for sentence splitting
#[derive(Debug, Clone)]
pub struct SentenceSplitOptions {
    pub min_sentence_length: usize,
    pub min_word_length: usize,
    pub fallback_to_words: bool,
}

impl Default for SentenceSplitOptions {
    fn default() -> Self {
        Self {
            min_sentence_length: 1,
            min_word_length: 1,
            fallback_to_words: false,
        }
    }
}

/// Configuration options for code fence extraction
#[derive(Debug, Clone)]
pub struct CodeFenceOptions {
    pub skip_empty_text: bool,
    pub min_code_length: usize,
}

impl Default for CodeFenceOptions {
    fn default() -> Self {
        Self {
            skip_empty_text: true,
            min_code_length: 6, // Length of an empty fence (``````), the smallest regex match
        }
    }
}

/// Configuration options for tokenization
#[derive(Debug, Clone)]
pub struct TokenizeOptions {
    pub min_word_length: usize,
    pub to_lowercase: bool,
}

impl Default for TokenizeOptions {
    fn default() -> Self {
        Self {
            min_word_length: 2,
            to_lowercase: true,
        }
    }
}

/// Text processing utilities
pub struct TextProcessor;

impl TextProcessor {
    /// Split text into sentences, falling back to words when there is no
    /// sentence-ending punctuation
    pub fn split_sentences(text: &str) -> Vec<String> {
        if text.is_empty() {
            return Vec::new();
        }

        Self::split_sentences_advanced(text, SentenceSplitOptions::default())
    }

    /// Split sentences with configurable options
    pub fn split_sentences_advanced(text: &str, options: SentenceSplitOptions) -> Vec<String> {
        if text.is_empty() {
            return Vec::new();
        }

        let regex_cache = get_regex_cache();
        let mut sentences = Vec::new();
        let mut current_start = 0;

        for mat in regex_cache.sentence_split.find_iter(text) {
            let end = mat.start() + 1; // Include the terminating punctuation
            let sentence = text[current_start..end].trim();
            if !sentence.is_empty() && sentence.len() >= options.min_sentence_length {
                sentences.push(sentence.to_string());
            }
            current_start = mat.end();
        }

        // Add the remaining text, if any
        if current_start < text.len() {
            let sentence = text[current_start..].trim();
            if !sentence.is_empty() && sentence.len() >= options.min_sentence_length {
                sentences.push(sentence.to_string());
            }
        }

        // Fall back to word splitting if no sentence boundaries were found
        // or if explicitly requested
        if (sentences.len() <= 1 && !text.contains(['.', '!', '?'])) || options.fallback_to_words {
            return text
                .split_whitespace()
                .map(|w| w.to_string())
                .filter(|w| !w.is_empty() && w.len() >= options.min_word_length)
                .collect();
        }

        sentences
    }

    /// Extract code fences and text parts
    pub fn extract_code_fences(text: &str) -> Vec<TextPart> {
        if text.is_empty() {
            return vec![TextPart {
                kind: TextPartKind::Text,
                content: String::new(),
                start: 0,
                end: 0,
            }];
        }

        Self::extract_code_fences_with_options(text, CodeFenceOptions::default())
    }

    /// Extract code fences with configurable options
    pub fn extract_code_fences_with_options(text: &str, options: CodeFenceOptions) -> Vec<TextPart> {
        let mut parts = Vec::new();
        let regex_cache = get_regex_cache();
        let mut last_end = 0;

        for mat in regex_cache.code_fence.find_iter(text) {
            // Add text before the code block
            if mat.start() > last_end {
                let text_content = &text[last_end..mat.start()];
                if !text_content.trim().is_empty() || !options.skip_empty_text {
                    parts.push(TextPart {
                        kind: TextPartKind::Text,
                        content: text_content.to_string(),
                        start: last_end,
                        end: mat.start(),
                    });
                }
            }

            // Add the code block
            let code_content = mat.as_str();
            if code_content.len() >= options.min_code_length {
                parts.push(TextPart {
                    kind: TextPartKind::Code,
                    content: code_content.to_string(),
                    start: mat.start(),
                    end: mat.end(),
                });
            }

            last_end = mat.end();
        }

        // Add any remaining text
        if last_end < text.len() {
            let text_content = &text[last_end..];
            if !text_content.trim().is_empty() || !options.skip_empty_text {
                parts.push(TextPart {
                    kind: TextPartKind::Text,
                    content: text_content.to_string(),
                    start: last_end,
                    end: text.len(),
                });
            }
        }

        // If no parts were found, treat the input as a single text part
        if parts.is_empty() {
            parts.push(TextPart {
                kind: TextPartKind::Text,
                content: text.to_string(),
                start: 0,
                end: text.len(),
            });
        }

        parts
    }

    /// Pass text through unchanged. Rust strings are already valid UTF-8;
    /// true Unicode NFC normalization would require the
    /// `unicode-normalization` crate.
    pub fn normalize_text(text: &str) -> String {
        text.to_string()
    }

    /// Tokenize text for search (TF-IDF-style term extraction)
    pub fn tokenize(text: &str) -> Vec<String> {
        if text.is_empty() {
            return Vec::new();
        }

        Self::tokenize_with_options(text, TokenizeOptions::default())
    }

    /// Tokenize with configurable options
    pub fn tokenize_with_options(text: &str, options: TokenizeOptions) -> Vec<String> {
        let regex_cache = get_regex_cache();
        let text_to_process = if options.to_lowercase { text.to_lowercase() } else { text.to_string() };

        regex_cache
            .word_boundary
            .find_iter(&text_to_process)
            .map(|mat| mat.as_str().to_string())
            .filter(|word| word.len() >= options.min_word_length)
            .collect()
    }
}
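
// Typical usage sketch (illustrative; all names are from this module): split
// a message into prose and code parts, then tokenize only the prose.
//
//     let parts = TextProcessor::extract_code_fences(message);
//     for part in &parts {
//         match part.kind {
//             TextPartKind::Code => { /* keep verbatim for code search */ }
//             TextPartKind::Text => {
//                 let terms = TextProcessor::tokenize(&part.content);
//                 /* feed `terms` into the lexical index */
//             }
//         }
//     }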

/// Hash utilities
pub struct HashUtils;

impl HashUtils {
    /// Generate a hex-encoded SHA-256 hash of the input
    pub fn sha256_hash(input: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(input.as_bytes());
        hex::encode(hasher.finalize())
    }

    /// Generate a short hash (first 16 hex chars, 64 bits) for IDs
    pub fn short_hash(input: &str) -> String {
        Self::sha256_hash(input)[..16].to_string()
    }
}

/// Query feature detection
pub struct QueryFeatures;

impl QueryFeatures {
    /// Extract features from query text using the cached regexes
    pub fn extract_features(query: &str) -> QueryFeatureFlags {
        if query.is_empty() {
            return QueryFeatureFlags::default();
        }

        let regex_cache = get_regex_cache();

        QueryFeatureFlags {
            has_code_symbol: regex_cache.code_symbol.is_match(query),
            has_error_token: regex_cache.error_token.is_match(query),
            has_path_or_file: regex_cache.path_file.is_match(query),
            has_numeric_id: regex_cache.numeric_id.is_match(query),
        }
    }

    /// Calculate an additive gamma boost from query features and content kind
    pub fn gamma_boost(kind: &str, features: &QueryFeatureFlags) -> f64 {
        let mut boost = 0.0;

        if features.has_code_symbol && (kind == "code" || kind == "user_code") {
            boost += 0.10;
        }

        if features.has_error_token && kind == "tool_result" {
            boost += 0.08;
        }

        if features.has_path_or_file && kind == "code" {
            boost += 0.04;
        }

        boost
    }
}
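
// Worked example (illustrative): for the query "TypeError in /src/main.rs"
// the flags are has_error_token = true (the case-insensitive regex matches
// "Error" inside "TypeError") and has_path_or_file = true. Against
// kind = "code" only the path rule fires, so gamma_boost returns 0.04;
// against kind = "tool_result" only the error rule fires, returning 0.08.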

/// Overlap calculation utilities
pub struct OverlapUtils;

impl OverlapUtils {
    /// Calculate the overlap ratio (Jaccard index) between two sets of
    /// document IDs
    pub fn calculate_overlap_ratio(set1: &[String], set2: &[String]) -> f64 {
        if set1.is_empty() || set2.is_empty() {
            return 0.0;
        }

        let ids1: HashSet<_> = set1.iter().collect();
        let ids2: HashSet<_> = set2.iter().collect();

        let intersection_size = ids1.intersection(&ids2).count();
        let union_size = ids1.union(&ids2).count();

        // union_size is nonzero here since both inputs are non-empty, but
        // guard anyway to keep the function total.
        if union_size == 0 {
            0.0
        } else {
            intersection_size as f64 / union_size as f64
        }
    }
}

/// A text part from code fence extraction; `start` and `end` are byte
/// offsets into the original input
#[derive(Debug, Clone)]
pub struct TextPart {
    pub kind: TextPartKind,
    pub content: String,
    pub start: usize,
    pub end: usize,
}

/// Kind of text part
#[derive(Debug, Clone, PartialEq)]
pub enum TextPartKind {
    Text,
    Code,
}

/// Query feature flags
#[derive(Debug, Clone, Default)]
pub struct QueryFeatureFlags {
    pub has_code_symbol: bool,
    pub has_error_token: bool,
    pub has_path_or_file: bool,
    pub has_numeric_id: bool,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_counting() {
        assert_eq!(TokenCounter::count_tokens(""), 0);
        assert_eq!(TokenCounter::count_tokens("hello"), 1);
        assert_eq!(TokenCounter::count_tokens("hello world"), 3); // "hello" + "world" + whitespace = 3
        // "_" is not in the alphanumeric class, so this counts as
        // "function" + "name" + ceil(2 punctuation / 2) = 3
        assert_eq!(TokenCounter::count_tokens("function_name()"), 3);

        // Test the detailed counting for debugging
        let detailed = TokenCounter::count_tokens_detailed("hello world");
        assert_eq!(detailed.alphanumeric_tokens, 2); // "hello", "world"
        assert_eq!(detailed.whitespace_tokens, 1); // one space
        assert_eq!(detailed.total_tokens, 3); // 2 + 0 + 1 = 3
    }
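
    // Illustrative edge cases for the arithmetic documented in
    // count_tokens_detailed: punctuation contributes ceil(n / 2), and
    // non-empty input never counts as fewer than 1 token.
    #[test]
    fn test_token_counting_punctuation_rounding() {
        assert_eq!(TokenCounter::count_tokens("???"), 2); // (3 + 1) / 2 = 2
        // "_" matches neither the alphanumeric nor the punctuation class,
        // so the total is clamped up to 1.
        assert_eq!(TokenCounter::count_tokens("_"), 1);
    }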

    #[test]
    fn test_sentence_splitting() {
        let sentences = TextProcessor::split_sentences("Hello world. How are you? Fine thanks!");
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world.");
        assert_eq!(sentences[1], "How are you?");
        assert_eq!(sentences[2], "Fine thanks!");
    }
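
    // Sketch of the fallback path: with no sentence-ending punctuation the
    // splitter degrades to whitespace-delimited words.
    #[test]
    fn test_sentence_splitting_word_fallback() {
        let words = TextProcessor::split_sentences("no punctuation here at all");
        assert_eq!(words.len(), 5);
        assert_eq!(words[0], "no");
    }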

    #[test]
    fn test_code_fence_extraction() {
        let text = "Some text\n```rust\nfn main() {}\n```\nMore text";
        let parts = TextProcessor::extract_code_fences(text);
        assert_eq!(parts.len(), 3);
        assert!(matches!(parts[0].kind, TextPartKind::Text));
        assert!(matches!(parts[1].kind, TextPartKind::Code));
        assert!(matches!(parts[2].kind, TextPartKind::Text));
    }
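
    // Since start/end are byte offsets into the original input, every
    // part's content should be recoverable by slicing the source text.
    #[test]
    fn test_code_fence_offsets_round_trip() {
        let text = "Some text\n```rust\nfn main() {}\n```\nMore text";
        for part in TextProcessor::extract_code_fences(text) {
            assert_eq!(&text[part.start..part.end], part.content);
        }
    }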

    #[test]
    fn test_query_features() {
        let features = QueryFeatures::extract_features("function_name() error in /path/file.rs");
        assert!(features.has_code_symbol);
        assert!(features.has_error_token);
        assert!(features.has_path_or_file);
    }

    #[test]
    fn test_overlap_calculation() {
        let set1 = vec!["a".to_string(), "b".to_string(), "c".to_string()];
        let set2 = vec!["b".to_string(), "c".to_string(), "d".to_string()];
        let ratio = OverlapUtils::calculate_overlap_ratio(&set1, &set2);
        assert!((ratio - 0.5).abs() < f64::EPSILON); // 2 intersection / 4 union = 0.5
    }
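
    // Boundary behavior of the Jaccard ratio: disjoint sets score 0.0,
    // identical sets score 1.0, and an empty side short-circuits to 0.0.
    #[test]
    fn test_overlap_boundary_cases() {
        let a = vec!["a".to_string(), "b".to_string()];
        let b = vec!["c".to_string()];
        assert_eq!(OverlapUtils::calculate_overlap_ratio(&a, &b), 0.0);
        assert!((OverlapUtils::calculate_overlap_ratio(&a, &a) - 1.0).abs() < f64::EPSILON);
        assert_eq!(OverlapUtils::calculate_overlap_ratio(&a, &[]), 0.0);
    }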

    #[test]
    fn test_hash_generation() {
        let hash = HashUtils::short_hash("test input");
        assert_eq!(hash.len(), 16);

        // Same input should produce the same hash
        let hash2 = HashUtils::short_hash("test input");
        assert_eq!(hash, hash2);

        // Different input should produce a different hash
        let hash3 = HashUtils::short_hash("different input");
        assert_ne!(hash, hash3);
    }

    #[test]
    fn test_tokenize_options_default() {
        let options = TokenizeOptions::default();
        assert_eq!(options.min_word_length, 2);
        assert!(options.to_lowercase);
    }

    #[test]
    fn test_text_processor_empty_input() {
        let sentences = TextProcessor::split_sentences("");
        assert!(sentences.is_empty());

        let parts = TextProcessor::extract_code_fences("");
        assert_eq!(parts.len(), 1);
        assert!(matches!(parts[0].kind, TextPartKind::Text));
        assert_eq!(parts[0].content, "");
    }

    #[test]
    fn test_query_features_empty_query() {
        let features = QueryFeatures::extract_features("");
        assert!(!features.has_code_symbol);
        assert!(!features.has_error_token);
        assert!(!features.has_path_or_file);
        assert!(!features.has_numeric_id);
    }

    #[test]
    fn test_query_features_all_features() {
        let features = QueryFeatures::extract_features("error: function_name() failed in /path/to/file.rs:123 with id 456");
        assert!(features.has_code_symbol);
        assert!(features.has_error_token);
        assert!(features.has_path_or_file);
        assert!(features.has_numeric_id);
    }

    #[test]
    fn test_hash_generation_edge_cases() {
        // Hashing is deterministic across calls
        let hash1 = HashUtils::short_hash("test content");
        let hash2 = HashUtils::short_hash("test content");
        let hash3 = HashUtils::short_hash("different content");

        assert_eq!(hash1, hash2); // Same content should produce the same hash
        assert_ne!(hash1, hash3); // Different content should produce a different hash

        // The empty string still hashes to 16 characters
        let empty_hash = HashUtils::short_hash("");
        assert!(!empty_hash.is_empty());
        assert_eq!(empty_hash.len(), 16);
    }

    #[test]
    fn test_text_processor_edge_cases() {
        // Mixed newline styles and no sentence punctuation: the splitter
        // falls back to whitespace-delimited words
        let text = "Line 1\nLine 2\r\nLine 3\rLine 4";
        let sentences = TextProcessor::split_sentences(text);
        assert!(sentences.len() >= 4);

        // Punctuation-only input still yields non-empty sentences
        let text = "... !!! ???";
        let sentences = TextProcessor::split_sentences(text);
        assert!(!sentences.is_empty());
    }

    #[test]
    fn test_query_features_partial_matches() {
        // Each feature can fire independently of the others
        let features = QueryFeatures::extract_features("just some error here");
        assert!(features.has_error_token);
        assert!(!features.has_code_symbol);
        assert!(!features.has_path_or_file);
        assert!(!features.has_numeric_id);

        let features = QueryFeatures::extract_features("function() call");
        assert!(features.has_code_symbol);
        assert!(!features.has_error_token);

        let features = QueryFeatures::extract_features("/home/user/file.txt");
        assert!(features.has_path_or_file);

        let features = QueryFeatures::extract_features("user id 12345");
        assert!(features.has_numeric_id);
    }
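
    // A sketch of the additive boost rules; the constants mirror the ones
    // in gamma_boost above.
    #[test]
    fn test_gamma_boost_combination() {
        let features = QueryFeatures::extract_features("parse_config() failed in /etc/app.toml");
        assert!(features.has_code_symbol);
        assert!(features.has_path_or_file);

        // Code symbol (+0.10) and path (+0.04) both apply to "code"
        let boost = QueryFeatures::gamma_boost("code", &features);
        assert!((boost - 0.14).abs() < 1e-9);

        // No rule matches a kind outside code/user_code/tool_result
        assert_eq!(QueryFeatures::gamma_boost("chat", &features), 0.0);
    }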
}