Skip to main content

offline_intelligence/utils/
text_utils.rs

1//! Efficient text processing utilities
2
3use std::borrow::Cow;
4use regex::Regex;
5use lazy_static::lazy_static;
6
// Shared regex matching a run of one or more whitespace characters (`\s+`).
// Declared through `lazy_static!` so the pattern is built lazily rather than
// on every call; the pattern is a fixed literal, so the `unwrap` is only
// reachable if the literal itself is invalid.
lazy_static! {
    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}
10
/// Namespace for stateless text helpers.
///
/// This is a unit struct with no fields; its associated functions take no
/// `self` and are called as `TextUtils::function_name(...)`. Several of them
/// return `Cow<str>` so they can hand back a borrow of the input when no new
/// string has to be built.
pub struct TextUtils;
13
14impl TextUtils {
15    /// Check if text contains pattern (case-insensitive) without allocation
16    pub fn contains_ignore_case(text: &str, pattern: &str) -> bool {
17        if pattern.len() > text.len() {
18            return false;
19        }
20        
21        // Fast path for short patterns
22        if pattern.len() <= 32 {
23            text.to_lowercase().contains(&pattern.to_lowercase())
24        } else {
25            // Use more efficient algorithm for longer patterns
26            text.chars()
27                .flat_map(char::to_lowercase)
28                .collect::<String>()
29                .contains(&pattern.to_lowercase())
30        }
31    }
32    
33    /// Normalize whitespace efficiently
34    pub fn normalize_whitespace(text: &str) -> Cow<'_, str> {
35        if WHITESPACE_REGEX.is_match(text) {
36            Cow::Owned(WHITESPACE_REGEX.replace_all(text, " ").trim().to_string())
37        } else {
38            Cow::Borrowed(text)
39        }
40    }
41    
42    /// Extract first N words efficiently
43    pub fn first_words(text: &str, n: usize) -> Cow<'_, str> {
44        if n == 0 || text.is_empty() {
45            return Cow::Borrowed("");
46        }
47        
48        let mut word_count = 0;
49        let mut end_pos = 0;
50        
51        for (pos, _) in text.match_indices(' ') {
52            word_count += 1;
53            if word_count >= n {
54                end_pos = pos;
55                break;
56            }
57        }
58        
59        if end_pos > 0 {
60            Cow::Borrowed(&text[..end_pos])
61        } else {
62            Cow::Borrowed(text)
63        }
64    }
65    
66    /// Count words efficiently (no allocation)
67    pub fn count_words(text: &str) -> usize {
68        if text.is_empty() {
69            return 0;
70        }
71        
72        text.split_whitespace().count()
73    }
74    
75    /// Truncate text to max length with ellipsis if needed
76    pub fn truncate_with_ellipsis(text: &str, max_len: usize) -> Cow<'_, str> {
77        if text.len() <= max_len {
78            Cow::Borrowed(text)
79        } else if max_len <= 3 {
80            Cow::Borrowed("...")
81        } else {
82            let mut result = String::with_capacity(max_len);
83            result.push_str(&text[..max_len - 3]);
84            result.push_str("...");
85            Cow::Owned(result)
86        }
87    }
88    
89    /// Check if word is significant (not stop word, long enough)
90    pub fn is_significant_word(word: &str, min_len: usize) -> bool {
91        if word.len() < min_len {
92            return false;
93        }
94        
95        // Quick ASCII-only check for common stop words
96        !matches!(word.to_lowercase().as_str(), "the" | "a" | "an" | "and" | "or" | "but" | "in" | "on" | "at" | "to" | "for" |
97            "of" | "with" | "by" | "is" | "am" | "are" | "was" | "were" | "be" | "been" |
98            "being" | "have" | "has" | "had" | "do" | "does" | "did")
99    }
100}