use regex::Regex;
use sha2::{Digest, Sha256};
use std::collections::HashSet;
use std::sync::OnceLock;

struct CompiledRegexes {
    alphanumeric: Regex,
    punctuation: Regex,
    sentence_split: Regex,
    code_fence: Regex,
    word_boundary: Regex,
    code_symbol: Regex,
    error_token: Regex,
    path_file: Regex,
    numeric_id: Regex,
}

impl CompiledRegexes {
    fn new() -> Self {
        Self {
            alphanumeric: Regex::new(r"[a-zA-Z0-9]+").unwrap(),
            punctuation: Regex::new(r"[^\w\s]").unwrap(),
            sentence_split: Regex::new(r"[.!?]\s+").unwrap(),
            code_fence: Regex::new(r"```[\s\S]*?```").unwrap(),
            word_boundary: Regex::new(r"\b\w+\b").unwrap(),
            code_symbol: Regex::new(r"[_a-zA-Z][\w]*\(|\b[A-Z][A-Za-z0-9]+::[A-Za-z0-9]+\b").unwrap(),
            error_token: Regex::new(r"(?i)(Exception|Error|stack trace|errno|\bE\d{2,}\b)").unwrap(),
            path_file: Regex::new(r"/[^\s]+\.[a-zA-Z0-9]+|[A-Za-z]:\\[^\s]+\.[a-zA-Z0-9]+").unwrap(),
            numeric_id: Regex::new(r"\b\d{3,}\b").unwrap(),
        }
    }
}

static REGEX_CACHE: OnceLock<CompiledRegexes> = OnceLock::new();

fn get_regex_cache() -> &'static CompiledRegexes {
    REGEX_CACHE.get_or_init(CompiledRegexes::new)
}
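
// `OnceLock::get_or_init` compiles the patterns exactly once, even if several
// threads race on first use; every later call is a cheap atomic load, so the
// cached regexes are safe to fetch in hot paths.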

pub struct TokenCounter;

impl TokenCounter {
    pub fn count_tokens(text: &str) -> i32 {
        if text.is_empty() {
            return 0;
        }

        Self::count_tokens_detailed(text).total_tokens
    }

    pub fn count_tokens_detailed(text: &str) -> TokenCounts {
        if text.is_empty() {
            return TokenCounts::default();
        }

        let regex_cache = get_regex_cache();
        let words: Vec<&str> = text.split_whitespace().collect();
        if words.is_empty() {
            return TokenCounts::default();
        }

        let mut alphanumeric_tokens = 0;
        let mut punctuation_tokens = 0;

        for word in &words {
            // Each run of letters/digits is one token, so "snake_case" counts as two.
            alphanumeric_tokens += regex_cache.alphanumeric.find_iter(word).count() as i32;
            // Every non-word, non-space character counts as punctuation.
            punctuation_tokens += regex_cache.punctuation.find_iter(word).count() as i32;
        }

        // One separator token per gap between words.
        let whitespace_tokens = words.len().saturating_sub(1) as i32;

        // Punctuation is weighted as half a token, rounded up.
        let total_tokens = alphanumeric_tokens + (punctuation_tokens + 1) / 2 + whitespace_tokens;

        TokenCounts {
            alphanumeric_tokens,
            punctuation_tokens,
            whitespace_tokens,
            total_tokens: std::cmp::max(1, total_tokens),
        }
    }
}
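
// Worked example of the heuristic above: for "foo.bar()" the counts are
// alphanumeric = 2 ("foo", "bar"), punctuation = 3 (".", "(", ")"), and
// whitespace = 0, so total = 2 + (3 + 1) / 2 + 0 = 4.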

#[derive(Debug, Clone, Default)]
pub struct TokenCounts {
    pub alphanumeric_tokens: i32,
    pub punctuation_tokens: i32,
    pub whitespace_tokens: i32,
    pub total_tokens: i32,
}

#[derive(Debug, Clone)]
pub struct SentenceSplitOptions {
    pub min_sentence_length: usize,
    pub min_word_length: usize,
    pub fallback_to_words: bool,
}

impl Default for SentenceSplitOptions {
    fn default() -> Self {
        Self {
            min_sentence_length: 1,
            min_word_length: 1,
            fallback_to_words: false,
        }
    }
}

#[derive(Debug, Clone)]
pub struct CodeFenceOptions {
    pub skip_empty_text: bool,
    pub min_code_length: usize,
}

impl Default for CodeFenceOptions {
    fn default() -> Self {
        Self {
            skip_empty_text: true,
            min_code_length: 6,
        }
    }
}

#[derive(Debug, Clone)]
pub struct TokenizeOptions {
    pub min_word_length: usize,
    pub to_lowercase: bool,
}

impl Default for TokenizeOptions {
    fn default() -> Self {
        Self {
            min_word_length: 2,
            to_lowercase: true,
        }
    }
}

pub struct TextProcessor;

impl TextProcessor {
    pub fn split_sentences(text: &str) -> Vec<String> {
        if text.is_empty() {
            return Vec::new();
        }

        Self::split_sentences_advanced(text, SentenceSplitOptions::default())
    }

    pub fn split_sentences_advanced(text: &str, options: SentenceSplitOptions) -> Vec<String> {
        if text.is_empty() {
            return Vec::new();
        }

        let regex_cache = get_regex_cache();
        let mut sentences = Vec::new();
        let mut current_start = 0;

        for mat in regex_cache.sentence_split.find_iter(text) {
            // Include the terminating punctuation character in the sentence.
            let end = mat.start() + 1;
            let sentence = text[current_start..end].trim();
            if !sentence.is_empty() && sentence.len() >= options.min_sentence_length {
                sentences.push(sentence.to_string());
            }
            current_start = mat.end();
        }

        // Keep any trailing text that has no terminating punctuation.
        if current_start < text.len() {
            let sentence = text[current_start..].trim();
            if !sentence.is_empty() && sentence.len() >= options.min_sentence_length {
                sentences.push(sentence.to_string());
            }
        }

        // Fall back to individual words when no real sentence boundaries were
        // found, or when the caller asks for word-level output.
        if (sentences.len() <= 1 && !text.contains(['.', '!', '?'])) || options.fallback_to_words {
            return text
                .split_whitespace()
                .map(|w| w.to_string())
                .filter(|w| !w.is_empty() && w.len() >= options.min_word_length)
                .collect();
        }

        sentences
    }
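
    // Illustrative behavior of split_sentences_advanced: "Hello world. How are you?"
    // yields ["Hello world.", "How are you?"], while input with no terminal
    // punctuation (e.g. "alpha beta gamma") falls back to the word list
    // ["alpha", "beta", "gamma"].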

    pub fn extract_code_fences(text: &str) -> Vec<TextPart> {
        if text.is_empty() {
            return vec![TextPart {
                kind: TextPartKind::Text,
                content: String::new(),
                start: 0,
                end: 0,
            }];
        }

        Self::extract_code_fences_with_options(text, CodeFenceOptions::default())
    }

    pub fn extract_code_fences_with_options(text: &str, options: CodeFenceOptions) -> Vec<TextPart> {
        let mut parts = Vec::new();
        let regex_cache = get_regex_cache();
        let mut last_end = 0;

        for mat in regex_cache.code_fence.find_iter(text) {
            if mat.start() > last_end {
                let text_content = &text[last_end..mat.start()];
                if !text_content.trim().is_empty() || !options.skip_empty_text {
                    parts.push(TextPart {
                        kind: TextPartKind::Text,
                        content: text_content.to_string(),
                        start: last_end,
                        end: mat.start(),
                    });
                }
            }

            let code_content = mat.as_str();
            if code_content.len() >= options.min_code_length {
                parts.push(TextPart {
                    kind: TextPartKind::Code,
                    content: code_content.to_string(),
                    start: mat.start(),
                    end: mat.end(),
                });
            }

            last_end = mat.end();
        }

        if last_end < text.len() {
            let text_content = &text[last_end..];
            if !text_content.trim().is_empty() || !options.skip_empty_text {
                parts.push(TextPart {
                    kind: TextPartKind::Text,
                    content: text_content.to_string(),
                    start: last_end,
                    end: text.len(),
                });
            }
        }

        if parts.is_empty() {
            parts.push(TextPart {
                kind: TextPartKind::Text,
                content: text.to_string(),
                start: 0,
                end: text.len(),
            });
        }

        parts
    }
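
    // Note: a fence shorter than min_code_length is dropped entirely; because
    // last_end still advances past it, the skipped span is not re-emitted as text.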

    pub fn normalize_text(text: &str) -> String {
        // Currently an identity copy: no normalization is applied yet.
        text.chars().collect::<String>()
    }

    pub fn tokenize(text: &str) -> Vec<String> {
        if text.is_empty() {
            return Vec::new();
        }

        Self::tokenize_with_options(text, TokenizeOptions::default())
    }

    pub fn tokenize_with_options(text: &str, options: TokenizeOptions) -> Vec<String> {
        let regex_cache = get_regex_cache();
        let text_to_process = if options.to_lowercase { text.to_lowercase() } else { text.to_string() };

        regex_cache
            .word_boundary
            .find_iter(&text_to_process)
            .map(|mat| mat.as_str().to_string())
            .filter(|word| word.len() >= options.min_word_length)
            .collect()
    }
}
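
// Example: `TextProcessor::tokenize("Hello, World!")` lowercases the input, keeps
// the `\b\w+\b` matches, and filters out words shorter than two characters,
// producing ["hello", "world"].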

pub struct HashUtils;

impl HashUtils {
    pub fn sha256_hash(input: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(input.as_bytes());
        hex::encode(hasher.finalize())
    }

    pub fn short_hash(input: &str) -> String {
        // First 16 hex characters, i.e. the leading 64 bits of the digest.
        Self::sha256_hash(input)[..16].to_string()
    }
}
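
// Sanity check (well-known test vector): the SHA-256 of the empty string is
// "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", so
// short_hash("") returns "e3b0c44298fc1c14".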

pub struct QueryFeatures;

impl QueryFeatures {
    pub fn extract_features(query: &str) -> QueryFeatureFlags {
        if query.is_empty() {
            return QueryFeatureFlags::default();
        }

        let regex_cache = get_regex_cache();

        QueryFeatureFlags {
            has_code_symbol: regex_cache.code_symbol.is_match(query),
            has_error_token: regex_cache.error_token.is_match(query),
            has_path_or_file: regex_cache.path_file.is_match(query),
            has_numeric_id: regex_cache.numeric_id.is_match(query),
        }
    }

    pub fn gamma_boost(kind: &str, features: &QueryFeatureFlags) -> f64 {
        let mut boost = 0.0;

        if features.has_code_symbol && (kind == "code" || kind == "user_code") {
            boost += 0.10;
        }

        if features.has_error_token && kind == "tool_result" {
            boost += 0.08;
        }

        if features.has_path_or_file && kind == "code" {
            boost += 0.04;
        }

        boost
    }
}
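
// Example: for a query that matches both a code symbol and an error token (but no
// path), gamma_boost returns 0.10 for kind "code", 0.08 for "tool_result", and
// 0.0 for any other kind.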

pub struct OverlapUtils;

impl OverlapUtils {
    // Jaccard index of the two inputs: |intersection| / |union|.
    pub fn calculate_overlap_ratio(set1: &[String], set2: &[String]) -> f64 {
        if set1.is_empty() || set2.is_empty() {
            return 0.0;
        }

        let ids1: HashSet<_> = set1.iter().collect();
        let ids2: HashSet<_> = set2.iter().collect();

        let intersection_size = ids1.intersection(&ids2).count();
        let union_size = ids1.union(&ids2).count();

        if union_size == 0 {
            0.0
        } else {
            intersection_size as f64 / union_size as f64
        }
    }
}

#[derive(Debug, Clone)]
pub struct TextPart {
    pub kind: TextPartKind,
    pub content: String,
    pub start: usize,
    pub end: usize,
}

#[derive(Debug, Clone, PartialEq)]
pub enum TextPartKind {
    Text,
    Code,
}

#[derive(Debug, Clone, Default)]
pub struct QueryFeatureFlags {
    pub has_code_symbol: bool,
    pub has_error_token: bool,
    pub has_path_or_file: bool,
    pub has_numeric_id: bool,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_counting() {
        assert_eq!(TokenCounter::count_tokens(""), 0);
        assert_eq!(TokenCounter::count_tokens("hello"), 1);
        assert_eq!(TokenCounter::count_tokens("hello world"), 3);
        assert_eq!(TokenCounter::count_tokens("function_name()"), 3);

        let detailed = TokenCounter::count_tokens_detailed("hello world");
        assert_eq!(detailed.alphanumeric_tokens, 2);
        assert_eq!(detailed.whitespace_tokens, 1);
        assert_eq!(detailed.total_tokens, 3);
    }

    #[test]
    fn test_sentence_splitting() {
        let sentences = TextProcessor::split_sentences("Hello world. How are you? Fine thanks!");
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world.");
        assert_eq!(sentences[1], "How are you?");
        assert_eq!(sentences[2], "Fine thanks!");
    }

    #[test]
    fn test_code_fence_extraction() {
        let text = "Some text\n```rust\nfn main() {}\n```\nMore text";
        let parts = TextProcessor::extract_code_fences(text);
        assert_eq!(parts.len(), 3);
        assert!(matches!(parts[0].kind, TextPartKind::Text));
        assert!(matches!(parts[1].kind, TextPartKind::Code));
        assert!(matches!(parts[2].kind, TextPartKind::Text));
    }

    #[test]
    fn test_query_features() {
        let features = QueryFeatures::extract_features("function_name() error in /path/file.rs");
        assert!(features.has_code_symbol);
        assert!(features.has_error_token);
        assert!(features.has_path_or_file);
    }

    #[test]
    fn test_overlap_calculation() {
        let set1 = vec!["a".to_string(), "b".to_string(), "c".to_string()];
        let set2 = vec!["b".to_string(), "c".to_string(), "d".to_string()];
        let ratio = OverlapUtils::calculate_overlap_ratio(&set1, &set2);
        assert!((ratio - 0.5).abs() < f64::EPSILON);
    }
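
    // Illustrative check of the gamma_boost weights hard-coded in QueryFeatures.
    #[test]
    fn test_gamma_boost_weights() {
        let features = QueryFeatureFlags {
            has_code_symbol: true,
            has_error_token: true,
            has_path_or_file: false,
            has_numeric_id: false,
        };
        assert!((QueryFeatures::gamma_boost("code", &features) - 0.10).abs() < f64::EPSILON);
        assert!((QueryFeatures::gamma_boost("tool_result", &features) - 0.08).abs() < f64::EPSILON);
        assert_eq!(QueryFeatures::gamma_boost("prose", &features), 0.0);
    }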

    #[test]
    fn test_hash_generation() {
        let hash = HashUtils::short_hash("test input");
        assert_eq!(hash.len(), 16);

        let hash2 = HashUtils::short_hash("test input");
        assert_eq!(hash, hash2);

        let hash3 = HashUtils::short_hash("different input");
        assert_ne!(hash, hash3);
    }

    #[test]
    fn test_tokenize_options_default() {
        let options = TokenizeOptions::default();
        assert_eq!(options.min_word_length, 2);
        assert!(options.to_lowercase);
    }
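
    // Illustrative check: tokenize lowercases, then keeps `\b\w+\b` matches of
    // length >= 2, so single letters are dropped.
    #[test]
    fn test_tokenize_basic() {
        let tokens = TextProcessor::tokenize("Hello, World! A b2");
        assert_eq!(tokens, vec!["hello", "world", "b2"]);
    }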

    #[test]
    fn test_text_processor_empty_input() {
        let sentences = TextProcessor::split_sentences("");
        assert!(sentences.is_empty());

        let parts = TextProcessor::extract_code_fences("");
        assert_eq!(parts.len(), 1);
        assert!(matches!(parts[0].kind, TextPartKind::Text));
        assert_eq!(parts[0].content, "");
    }

    #[test]
    fn test_query_features_empty_query() {
        let features = QueryFeatures::extract_features("");
        assert!(!features.has_code_symbol);
        assert!(!features.has_error_token);
        assert!(!features.has_path_or_file);
        assert!(!features.has_numeric_id);
    }

    #[test]
    fn test_query_features_all_features() {
        let features = QueryFeatures::extract_features("error: function_name() failed in /path/to/file.rs:123 with id 456");
        assert!(features.has_code_symbol);
        assert!(features.has_error_token);
        assert!(features.has_path_or_file);
        assert!(features.has_numeric_id);
    }

    #[test]
    fn test_hash_generation_edge_cases() {
        let hash1 = HashUtils::short_hash("test content");
        let hash2 = HashUtils::short_hash("test content");
        let hash3 = HashUtils::short_hash("different content");

        assert_eq!(hash1, hash2);
        assert_ne!(hash1, hash3);

        let empty_hash = HashUtils::short_hash("");
        assert!(!empty_hash.is_empty());
        assert_eq!(empty_hash.len(), 16);
    }
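
    // Illustrative check: a fence shorter than min_code_length is dropped entirely.
    #[test]
    fn test_code_fence_min_length_filter() {
        let options = CodeFenceOptions {
            skip_empty_text: true,
            min_code_length: 100,
        };
        let parts = TextProcessor::extract_code_fences_with_options("before ```x``` after", options);
        assert_eq!(parts.len(), 2);
        assert!(parts.iter().all(|p| p.kind == TextPartKind::Text));
    }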

    #[test]
    fn test_text_processor_edge_cases() {
        let text = "Line 1\nLine 2\r\nLine 3\rLine 4";
        let sentences = TextProcessor::split_sentences(text);
        assert!(sentences.len() >= 4);

        let text = "... !!! ???";
        let sentences = TextProcessor::split_sentences(text);
        assert!(!sentences.is_empty());
    }
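
    // Illustrative check: with fallback_to_words set, the splitter returns
    // whitespace-separated words even when sentence punctuation is present.
    #[test]
    fn test_split_sentences_fallback_to_words() {
        let options = SentenceSplitOptions {
            fallback_to_words: true,
            ..Default::default()
        };
        let words = TextProcessor::split_sentences_advanced("One two. Three", options);
        assert_eq!(words, vec!["One", "two.", "Three"]);
    }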

    #[test]
    fn test_query_features_partial_matches() {
        let features = QueryFeatures::extract_features("just some error here");
        assert!(features.has_error_token);
        assert!(!features.has_code_symbol);
        assert!(!features.has_path_or_file);
        assert!(!features.has_numeric_id);

        let features = QueryFeatures::extract_features("function() call");
        assert!(features.has_code_symbol);
        assert!(!features.has_error_token);

        let features = QueryFeatures::extract_features("/home/user/file.txt");
        assert!(features.has_path_or_file);

        let features = QueryFeatures::extract_features("user id 12345");
        assert!(features.has_numeric_id);
    }
}