// sklears_utils/text_processing.rs

1//! Text processing utilities for machine learning workflows
2//!
3//! This module provides utilities for text parsing, string similarity measures,
4//! regular expression helpers, unicode handling, and text normalization.
5
6use crate::UtilsResult;
7use std::cmp;
8use std::collections::{HashMap, HashSet};
9
10// ===== TEXT PARSING UTILITIES =====
11
12/// Text parser for extracting structured data from text
13pub struct TextParser;
14
15impl TextParser {
16    /// Split text into tokens using various delimiters
17    pub fn tokenize(text: &str, delimiters: &[char]) -> Vec<String> {
18        if delimiters.is_empty() {
19            return vec![text.to_string()];
20        }
21
22        let mut tokens = Vec::new();
23        let mut current_token = String::new();
24
25        for ch in text.chars() {
26            if delimiters.contains(&ch) {
27                if !current_token.is_empty() {
28                    tokens.push(current_token.trim().to_string());
29                    current_token.clear();
30                }
31            } else {
32                current_token.push(ch);
33            }
34        }
35
36        if !current_token.is_empty() {
37            tokens.push(current_token.trim().to_string());
38        }
39
40        tokens
41    }
42
43    /// Extract numbers from text
44    pub fn extract_numbers(text: &str) -> Vec<f64> {
45        let mut numbers = Vec::new();
46        let mut current_number = String::new();
47
48        for ch in text.chars() {
49            if ch.is_ascii_digit() || ch == '.' || ch == '-' || ch == '+' {
50                current_number.push(ch);
51            } else if !current_number.is_empty() {
52                if let Ok(num) = current_number.parse::<f64>() {
53                    numbers.push(num);
54                }
55                current_number.clear();
56            }
57        }
58
59        if !current_number.is_empty() {
60            if let Ok(num) = current_number.parse::<f64>() {
61                numbers.push(num);
62            }
63        }
64
65        numbers
66    }
67
68    /// Extract key-value pairs from text
69    pub fn extract_key_value_pairs(
70        text: &str,
71        pair_delimiter: char,
72        kv_delimiter: char,
73    ) -> HashMap<String, String> {
74        let mut pairs = HashMap::new();
75
76        for pair in text.split(pair_delimiter) {
77            if let Some(kv_pos) = pair.find(kv_delimiter) {
78                let key = pair[..kv_pos].trim().to_string();
79                let value = pair[kv_pos + 1..].trim().to_string();
80                pairs.insert(key, value);
81            }
82        }
83
84        pairs
85    }
86
87    /// Parse structured text lines (e.g., log files)
88    pub fn parse_structured_lines<F, T>(lines: &[String], parser: F) -> UtilsResult<Vec<T>>
89    where
90        F: Fn(&str) -> Option<T>,
91    {
92        let mut results = Vec::new();
93
94        for line in lines {
95            if let Some(parsed) = parser(line) {
96                results.push(parsed);
97            }
98        }
99
100        Ok(results)
101    }
102
103    /// Extract words and their frequencies
104    pub fn word_frequency(text: &str) -> HashMap<String, usize> {
105        let mut frequencies = HashMap::new();
106
107        let words = Self::tokenize(text, &[' ', '\t', '\n', '\r', '.', ',', '!', '?', ';', ':']);
108
109        for word in words {
110            let word_lower = word.to_lowercase();
111            if !word_lower.is_empty() {
112                *frequencies.entry(word_lower).or_insert(0) += 1;
113            }
114        }
115
116        frequencies
117    }
118}
119
120// ===== STRING SIMILARITY MEASURES =====
121
122/// String similarity utilities
123pub struct StringSimilarity;
124
125impl StringSimilarity {
126    /// Calculate Levenshtein distance between two strings
127    pub fn levenshtein_distance(s1: &str, s2: &str) -> usize {
128        let s1_chars: Vec<char> = s1.chars().collect();
129        let s2_chars: Vec<char> = s2.chars().collect();
130        let m = s1_chars.len();
131        let n = s2_chars.len();
132
133        if m == 0 {
134            return n;
135        }
136        if n == 0 {
137            return m;
138        }
139
140        let mut dp = vec![vec![0; n + 1]; m + 1];
141
142        // Initialize first row and column
143        for (i, row) in dp.iter_mut().enumerate().take(m + 1) {
144            row[0] = i;
145        }
146        for j in 0..=n {
147            dp[0][j] = j;
148        }
149
150        // Fill the DP table
151        for i in 1..=m {
152            for j in 1..=n {
153                let cost = if s1_chars[i - 1] == s2_chars[j - 1] {
154                    0
155                } else {
156                    1
157                };
158                dp[i][j] = cmp::min(
159                    dp[i - 1][j] + 1, // deletion
160                    cmp::min(
161                        dp[i][j - 1] + 1,        // insertion
162                        dp[i - 1][j - 1] + cost, // substitution
163                    ),
164                );
165            }
166        }
167
168        dp[m][n]
169    }
170
171    /// Calculate normalized Levenshtein similarity (0.0 to 1.0)
172    pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
173        let max_len = cmp::max(s1.len(), s2.len());
174        if max_len == 0 {
175            return 1.0;
176        }
177
178        let distance = Self::levenshtein_distance(s1, s2);
179        1.0 - (distance as f64 / max_len as f64)
180    }
181
182    /// Calculate Jaccard similarity between two strings (based on character n-grams)
183    pub fn jaccard_similarity(s1: &str, s2: &str, n: usize) -> f64 {
184        if n == 0 {
185            return 0.0;
186        }
187
188        let ngrams1 = Self::character_ngrams(s1, n);
189        let ngrams2 = Self::character_ngrams(s2, n);
190
191        if ngrams1.is_empty() && ngrams2.is_empty() {
192            return 1.0;
193        }
194
195        let intersection: HashSet<_> = ngrams1.intersection(&ngrams2).collect();
196        let union: HashSet<_> = ngrams1.union(&ngrams2).collect();
197
198        intersection.len() as f64 / union.len() as f64
199    }
200
201    /// Calculate cosine similarity between two strings (based on word frequencies)
202    pub fn cosine_similarity(s1: &str, s2: &str) -> f64 {
203        let freq1 = TextParser::word_frequency(s1);
204        let freq2 = TextParser::word_frequency(s2);
205
206        if freq1.is_empty() || freq2.is_empty() {
207            return 0.0;
208        }
209
210        let mut dot_product = 0.0;
211        let mut norm1 = 0.0;
212        let mut norm2 = 0.0;
213
214        let all_words: HashSet<_> = freq1.keys().chain(freq2.keys()).collect();
215
216        for word in all_words {
217            let f1 = *freq1.get(word).unwrap_or(&0) as f64;
218            let f2 = *freq2.get(word).unwrap_or(&0) as f64;
219
220            dot_product += f1 * f2;
221            norm1 += f1 * f1;
222            norm2 += f2 * f2;
223        }
224
225        if norm1 == 0.0 || norm2 == 0.0 {
226            return 0.0;
227        }
228
229        dot_product / (norm1.sqrt() * norm2.sqrt())
230    }
231
232    /// Generate character n-grams from a string
233    fn character_ngrams(s: &str, n: usize) -> HashSet<String> {
234        let chars: Vec<char> = s.chars().collect();
235        let mut ngrams = HashSet::new();
236
237        if chars.len() < n {
238            return ngrams;
239        }
240
241        for i in 0..=chars.len() - n {
242            let ngram: String = chars[i..i + n].iter().collect();
243            ngrams.insert(ngram);
244        }
245
246        ngrams
247    }
248
249    /// Find the best matching string from a list
250    pub fn find_best_match(
251        target: &str,
252        candidates: &[String],
253        similarity_fn: fn(&str, &str) -> f64,
254        threshold: f64,
255    ) -> Option<(String, f64)> {
256        let mut best_match = None;
257        let mut best_score = threshold;
258
259        for candidate in candidates {
260            let score = similarity_fn(target, candidate);
261            if score > best_score {
262                best_score = score;
263                best_match = Some((candidate.clone(), score));
264            }
265        }
266
267        best_match
268    }
269}
270
271// ===== REGULAR EXPRESSION HELPERS =====
272
/// Regular expression utilities for common patterns.
///
/// NOTE(review): these are heuristic checks, not a real regex engine — see
/// `matches_pattern`. They trade accuracy for zero dependencies.
pub struct RegexUtils;

impl RegexUtils {
    /// Check if string is a valid email address (simple heuristic).
    pub fn is_email(text: &str) -> bool {
        let email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$";
        Self::matches_pattern(text, email_pattern)
    }

    /// Check if string is a valid URL (simple heuristic).
    pub fn is_url(text: &str) -> bool {
        let url_pattern = r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(/.*)?$";
        Self::matches_pattern(text, url_pattern)
    }

    /// Check if string is non-empty and contains only ASCII digits.
    pub fn is_numeric(text: &str) -> bool {
        !text.is_empty() && text.chars().all(|c| c.is_ascii_digit())
    }

    /// Check if string is non-empty and every char is alphanumeric
    /// (Unicode-aware, per `char::is_alphanumeric`).
    pub fn is_alphanumeric(text: &str) -> bool {
        !text.is_empty() && text.chars().all(|c| c.is_alphanumeric())
    }

    /// Extract all words from text: split on whitespace, keep only the
    /// alphabetic characters of each piece, and drop pieces that end up
    /// empty (e.g. pure numbers or punctuation).
    pub fn extract_words(text: &str) -> Vec<String> {
        text.split_whitespace()
            .map(|word| {
                word.chars()
                    .filter(|c| c.is_alphabetic())
                    .collect::<String>()
            })
            .filter(|word| !word.is_empty())
            .collect()
    }

    /// Simple pattern matching (without the regex crate dependency).
    ///
    /// Only the two hard-coded patterns above are recognized; every other
    /// pattern string returns `false`. The checks are intentionally loose
    /// approximations of the named patterns.
    fn matches_pattern(text: &str, pattern: &str) -> bool {
        match pattern {
            r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$" => {
                text.contains('@')
                    && text.contains('.')
                    && !text.starts_with('@')
                    && !text.ends_with('@')
            }
            r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(/.*)?$" => {
                text.starts_with("http://") || text.starts_with("https://")
            }
            _ => false,
        }
    }

    /// Find the starting byte offsets of all (possibly overlapping)
    /// occurrences of `pattern` in `text`.
    ///
    /// Fixes two defects in the previous version: an empty pattern now
    /// returns an empty vec instead of panicking, and the search resumes at
    /// the next character boundary (first-char `len_utf8`) instead of the
    /// next byte, which panicked when `pattern` began with a multi-byte
    /// character.
    pub fn find_all_occurrences(text: &str, pattern: &str) -> Vec<usize> {
        let mut positions = Vec::new();
        if pattern.is_empty() {
            return positions;
        }

        // Width of the pattern's first char: the minimum safe step that
        // still allows overlapping matches. `pattern` is non-empty here.
        let step = pattern.chars().next().map_or(1, char::len_utf8);
        let mut start = 0;

        // `start` stays on a char boundary and never exceeds `text.len()`,
        // so the slice below cannot panic.
        while let Some(pos) = text[start..].find(pattern) {
            let absolute_pos = start + pos;
            positions.push(absolute_pos);
            start = absolute_pos + step;
        }

        positions
    }
}
343
344// ===== UNICODE HANDLING =====
345
/// Unicode handling utilities
pub struct UnicodeUtils;

impl UnicodeUtils {
    /// Fold a small set of Latin accented characters to their unaccented
    /// ASCII counterparts; any other character passes through unchanged.
    pub fn simple_normalize(text: &str) -> String {
        let fold = |c: char| -> char {
            match c {
                'À'..='Ä' | 'à'..='ä' => 'a',
                'È'..='Ë' | 'è'..='ë' => 'e',
                'Ì'..='Ï' | 'ì'..='ï' => 'i',
                'Ò'..='Ö' | 'ò'..='ö' => 'o',
                'Ù'..='Ü' | 'ù'..='ü' => 'u',
                'Ñ' | 'ñ' => 'n',
                'Ç' | 'ç' => 'c',
                other => other,
            }
        };
        text.chars().map(fold).collect()
    }

    /// Remove diacritics (accents) from text; alias for `simple_normalize`.
    pub fn remove_diacritics(text: &str) -> String {
        Self::simple_normalize(text)
    }

    /// True when at least one character falls outside the ASCII range.
    pub fn has_non_ascii(text: &str) -> bool {
        text.chars().any(|c| !c.is_ascii())
    }

    /// Number of Unicode scalar values in `text` (not bytes).
    pub fn char_count(text: &str) -> usize {
        text.chars().count()
    }

    /// Tally per-character categories for `text`.
    ///
    /// Categories are not mutually exclusive: a single character may bump
    /// several counters (e.g. a non-ASCII letter increments both
    /// `alphabetic` and `non_ascii`).
    pub fn analyze_text(text: &str) -> TextAnalysis {
        text.chars().fold(TextAnalysis::default(), |mut acc, ch| {
            acc.total_chars += 1;
            acc.alphabetic += usize::from(ch.is_alphabetic());
            acc.numeric += usize::from(ch.is_numeric());
            acc.whitespace += usize::from(ch.is_whitespace());
            acc.punctuation += usize::from(ch.is_ascii_punctuation());
            acc.non_ascii += usize::from(!ch.is_ascii());
            acc
        })
    }
}

/// Per-category character counts produced by [`UnicodeUtils::analyze_text`].
#[derive(Debug, Default, Clone)]
pub struct TextAnalysis {
    pub total_chars: usize,
    pub alphabetic: usize,
    pub numeric: usize,
    pub whitespace: usize,
    pub punctuation: usize,
    pub non_ascii: usize,
}
419
420// ===== TEXT NORMALIZATION =====
421
422/// Text normalization utilities
423pub struct TextNormalizer;
424
425impl TextNormalizer {
426    /// Normalize text for machine learning (lowercase, trim, etc.)
427    pub fn normalize_for_ml(text: &str) -> String {
428        text.to_lowercase()
429            .trim()
430            .chars()
431            .filter(|c| c.is_alphanumeric() || c.is_whitespace())
432            .collect::<String>()
433            .split_whitespace()
434            .collect::<Vec<_>>()
435            .join(" ")
436    }
437
438    /// Remove extra whitespace
439    pub fn normalize_whitespace(text: &str) -> String {
440        text.split_whitespace().collect::<Vec<_>>().join(" ")
441    }
442
443    /// Convert to title case
444    pub fn to_title_case(text: &str) -> String {
445        text.split_whitespace()
446            .map(|word| {
447                let mut chars = word.chars();
448                match chars.next() {
449                    None => String::new(),
450                    Some(first) => {
451                        first.to_uppercase().collect::<String>()
452                            + chars.as_str().to_lowercase().as_str()
453                    }
454                }
455            })
456            .collect::<Vec<_>>()
457            .join(" ")
458    }
459
460    /// Remove HTML tags (simple implementation)
461    pub fn remove_html_tags(text: &str) -> String {
462        let mut result = String::new();
463        let mut in_tag = false;
464
465        for ch in text.chars() {
466            match ch {
467                '<' => in_tag = true,
468                '>' => in_tag = false,
469                _ if !in_tag => result.push(ch),
470                _ => {}
471            }
472        }
473
474        result
475    }
476
477    /// Clean text for analysis (remove punctuation, normalize case)
478    pub fn clean_for_analysis(text: &str) -> String {
479        let cleaned = Self::remove_html_tags(text);
480        let cleaned = UnicodeUtils::remove_diacritics(&cleaned);
481        Self::normalize_for_ml(&cleaned)
482    }
483
484    /// Truncate text to specified length with ellipsis
485    pub fn truncate(text: &str, max_length: usize, add_ellipsis: bool) -> String {
486        if text.len() <= max_length {
487            return text.to_string();
488        }
489
490        let truncated = &text[..max_length.saturating_sub(if add_ellipsis { 3 } else { 0 })];
491        if add_ellipsis {
492            format!("{truncated}...")
493        } else {
494            truncated.to_string()
495        }
496    }
497}
498
// NOTE(review): `non_snake_case` appears unnecessary — every test name
// below is already snake_case; consider removing the attribute.
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;

    // TextParser: tokenization, number extraction, and word frequencies.
    #[test]
    fn test_text_parser() {
        let text = "Hello, world! How are you?";
        let tokens = TextParser::tokenize(text, &[' ', ',', '!', '?']);
        assert_eq!(tokens, vec!["Hello", "world", "How", "are", "you"]);

        // '$' is not a number character, so only the numeric runs survive.
        let numbers = TextParser::extract_numbers("Price: $12.99, Quantity: 5, Discount: -2.5");
        assert_eq!(numbers, vec![12.99, 5.0, -2.5]);

        // Frequencies are case-insensitive ("hello" appears twice).
        let freq = TextParser::word_frequency("hello world hello");
        assert_eq!(*freq.get("hello").unwrap(), 2);
        assert_eq!(*freq.get("world").unwrap(), 1);
    }

    // StringSimilarity: edit distance plus normalized/ngram/word measures.
    #[test]
    fn test_string_similarity() {
        assert_eq!(StringSimilarity::levenshtein_distance("cat", "bat"), 1);
        // Classic textbook pair: kitten -> sitting is 3 edits.
        assert_eq!(
            StringSimilarity::levenshtein_distance("kitten", "sitting"),
            3
        );

        let similarity = StringSimilarity::levenshtein_similarity("hello", "hallo");
        assert!(similarity > 0.5);

        let jaccard = StringSimilarity::jaccard_similarity("hello", "hallo", 2);
        assert!(jaccard > 0.0);

        // Shared word "hello" guarantees a positive cosine score.
        let cosine = StringSimilarity::cosine_similarity("hello world", "hello earth");
        assert!(cosine > 0.0);
    }

    // RegexUtils: heuristic email/URL checks and substring search.
    #[test]
    fn test_regex_utils() {
        assert!(RegexUtils::is_email("test@example.com"));
        assert!(!RegexUtils::is_email("invalid.email"));

        assert!(RegexUtils::is_url("https://example.com"));
        assert!(!RegexUtils::is_url("not-a-url"));

        assert!(RegexUtils::is_numeric("12345"));
        assert!(!RegexUtils::is_numeric("123a45"));

        // Digits and punctuation are stripped from extracted words.
        let words = RegexUtils::extract_words("Hello, world! 123");
        assert_eq!(words, vec!["Hello", "world"]);

        let positions = RegexUtils::find_all_occurrences("hello hello world", "hello");
        assert_eq!(positions, vec![0, 6]);
    }

    // UnicodeUtils: accent folding, ASCII checks, char-based counting.
    #[test]
    fn test_unicode_utils() {
        let normalized = UnicodeUtils::simple_normalize("café");
        assert_eq!(normalized, "cafe");

        assert!(UnicodeUtils::has_non_ascii("café"));
        assert!(!UnicodeUtils::has_non_ascii("cafe"));

        // 4 chars even though "café" is 5 bytes in UTF-8.
        assert_eq!(UnicodeUtils::char_count("café"), 4);

        let analysis = UnicodeUtils::analyze_text("Hello, 世界!");
        assert!(analysis.total_chars > 0);
        assert!(analysis.alphabetic > 0);
        assert!(analysis.non_ascii > 0);
    }

    // TextNormalizer: ML normalization, case conversion, HTML stripping,
    // truncation with ellipsis.
    #[test]
    fn test_text_normalizer() {
        let normalized = TextNormalizer::normalize_for_ml("  Hello, WORLD!  ");
        assert_eq!(normalized, "hello world");

        let whitespace = TextNormalizer::normalize_whitespace("  hello   world  ");
        assert_eq!(whitespace, "hello world");

        let title = TextNormalizer::to_title_case("hello world");
        assert_eq!(title, "Hello World");

        let no_html = TextNormalizer::remove_html_tags("<p>Hello <b>world</b>!</p>");
        assert_eq!(no_html, "Hello world!");

        // The "..." counts toward the 8-char budget: 5 chars + ellipsis.
        let truncated = TextNormalizer::truncate("Hello, world!", 8, true);
        assert_eq!(truncated, "Hello...");
    }

    // analyze_text category tallies on a known mixed-content string.
    #[test]
    fn test_text_analysis() {
        let analysis = UnicodeUtils::analyze_text("Hello123! ");
        assert_eq!(analysis.total_chars, 10);
        assert_eq!(analysis.alphabetic, 5);
        assert_eq!(analysis.numeric, 3);
        assert_eq!(analysis.whitespace, 1);
        assert_eq!(analysis.punctuation, 1);
    }
}