1use crate::UtilsResult;
7use std::cmp;
8use std::collections::{HashMap, HashSet};
9
10pub struct TextParser;
14
15impl TextParser {
16 pub fn tokenize(text: &str, delimiters: &[char]) -> Vec<String> {
18 if delimiters.is_empty() {
19 return vec![text.to_string()];
20 }
21
22 let mut tokens = Vec::new();
23 let mut current_token = String::new();
24
25 for ch in text.chars() {
26 if delimiters.contains(&ch) {
27 if !current_token.is_empty() {
28 tokens.push(current_token.trim().to_string());
29 current_token.clear();
30 }
31 } else {
32 current_token.push(ch);
33 }
34 }
35
36 if !current_token.is_empty() {
37 tokens.push(current_token.trim().to_string());
38 }
39
40 tokens
41 }
42
43 pub fn extract_numbers(text: &str) -> Vec<f64> {
45 let mut numbers = Vec::new();
46 let mut current_number = String::new();
47
48 for ch in text.chars() {
49 if ch.is_ascii_digit() || ch == '.' || ch == '-' || ch == '+' {
50 current_number.push(ch);
51 } else if !current_number.is_empty() {
52 if let Ok(num) = current_number.parse::<f64>() {
53 numbers.push(num);
54 }
55 current_number.clear();
56 }
57 }
58
59 if !current_number.is_empty() {
60 if let Ok(num) = current_number.parse::<f64>() {
61 numbers.push(num);
62 }
63 }
64
65 numbers
66 }
67
68 pub fn extract_key_value_pairs(
70 text: &str,
71 pair_delimiter: char,
72 kv_delimiter: char,
73 ) -> HashMap<String, String> {
74 let mut pairs = HashMap::new();
75
76 for pair in text.split(pair_delimiter) {
77 if let Some(kv_pos) = pair.find(kv_delimiter) {
78 let key = pair[..kv_pos].trim().to_string();
79 let value = pair[kv_pos + 1..].trim().to_string();
80 pairs.insert(key, value);
81 }
82 }
83
84 pairs
85 }
86
87 pub fn parse_structured_lines<F, T>(lines: &[String], parser: F) -> UtilsResult<Vec<T>>
89 where
90 F: Fn(&str) -> Option<T>,
91 {
92 let mut results = Vec::new();
93
94 for line in lines {
95 if let Some(parsed) = parser(line) {
96 results.push(parsed);
97 }
98 }
99
100 Ok(results)
101 }
102
103 pub fn word_frequency(text: &str) -> HashMap<String, usize> {
105 let mut frequencies = HashMap::new();
106
107 let words = Self::tokenize(text, &[' ', '\t', '\n', '\r', '.', ',', '!', '?', ';', ':']);
108
109 for word in words {
110 let word_lower = word.to_lowercase();
111 if !word_lower.is_empty() {
112 *frequencies.entry(word_lower).or_insert(0) += 1;
113 }
114 }
115
116 frequencies
117 }
118}
119
120pub struct StringSimilarity;
124
125impl StringSimilarity {
126 pub fn levenshtein_distance(s1: &str, s2: &str) -> usize {
128 let s1_chars: Vec<char> = s1.chars().collect();
129 let s2_chars: Vec<char> = s2.chars().collect();
130 let m = s1_chars.len();
131 let n = s2_chars.len();
132
133 if m == 0 {
134 return n;
135 }
136 if n == 0 {
137 return m;
138 }
139
140 let mut dp = vec![vec![0; n + 1]; m + 1];
141
142 for (i, row) in dp.iter_mut().enumerate().take(m + 1) {
144 row[0] = i;
145 }
146 for j in 0..=n {
147 dp[0][j] = j;
148 }
149
150 for i in 1..=m {
152 for j in 1..=n {
153 let cost = if s1_chars[i - 1] == s2_chars[j - 1] {
154 0
155 } else {
156 1
157 };
158 dp[i][j] = cmp::min(
159 dp[i - 1][j] + 1, cmp::min(
161 dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost, ),
164 );
165 }
166 }
167
168 dp[m][n]
169 }
170
171 pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
173 let max_len = cmp::max(s1.len(), s2.len());
174 if max_len == 0 {
175 return 1.0;
176 }
177
178 let distance = Self::levenshtein_distance(s1, s2);
179 1.0 - (distance as f64 / max_len as f64)
180 }
181
182 pub fn jaccard_similarity(s1: &str, s2: &str, n: usize) -> f64 {
184 if n == 0 {
185 return 0.0;
186 }
187
188 let ngrams1 = Self::character_ngrams(s1, n);
189 let ngrams2 = Self::character_ngrams(s2, n);
190
191 if ngrams1.is_empty() && ngrams2.is_empty() {
192 return 1.0;
193 }
194
195 let intersection: HashSet<_> = ngrams1.intersection(&ngrams2).collect();
196 let union: HashSet<_> = ngrams1.union(&ngrams2).collect();
197
198 intersection.len() as f64 / union.len() as f64
199 }
200
201 pub fn cosine_similarity(s1: &str, s2: &str) -> f64 {
203 let freq1 = TextParser::word_frequency(s1);
204 let freq2 = TextParser::word_frequency(s2);
205
206 if freq1.is_empty() || freq2.is_empty() {
207 return 0.0;
208 }
209
210 let mut dot_product = 0.0;
211 let mut norm1 = 0.0;
212 let mut norm2 = 0.0;
213
214 let all_words: HashSet<_> = freq1.keys().chain(freq2.keys()).collect();
215
216 for word in all_words {
217 let f1 = *freq1.get(word).unwrap_or(&0) as f64;
218 let f2 = *freq2.get(word).unwrap_or(&0) as f64;
219
220 dot_product += f1 * f2;
221 norm1 += f1 * f1;
222 norm2 += f2 * f2;
223 }
224
225 if norm1 == 0.0 || norm2 == 0.0 {
226 return 0.0;
227 }
228
229 dot_product / (norm1.sqrt() * norm2.sqrt())
230 }
231
232 fn character_ngrams(s: &str, n: usize) -> HashSet<String> {
234 let chars: Vec<char> = s.chars().collect();
235 let mut ngrams = HashSet::new();
236
237 if chars.len() < n {
238 return ngrams;
239 }
240
241 for i in 0..=chars.len() - n {
242 let ngram: String = chars[i..i + n].iter().collect();
243 ngrams.insert(ngram);
244 }
245
246 ngrams
247 }
248
249 pub fn find_best_match(
251 target: &str,
252 candidates: &[String],
253 similarity_fn: fn(&str, &str) -> f64,
254 threshold: f64,
255 ) -> Option<(String, f64)> {
256 let mut best_match = None;
257 let mut best_score = threshold;
258
259 for candidate in candidates {
260 let score = similarity_fn(target, candidate);
261 if score > best_score {
262 best_score = score;
263 best_match = Some((candidate.clone(), score));
264 }
265 }
266
267 best_match
268 }
269}
270
/// Pattern-matching helpers implemented without a regex dependency.
pub struct RegexUtils;

impl RegexUtils {
    /// Heuristic email check (see `matches_pattern` — not a real regex).
    pub fn is_email(text: &str) -> bool {
        let email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$";
        Self::matches_pattern(text, email_pattern)
    }

    /// Heuristic URL check (see `matches_pattern` — not a real regex).
    pub fn is_url(text: &str) -> bool {
        let url_pattern = r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(/.*)?$";
        Self::matches_pattern(text, url_pattern)
    }

    /// True for a non-empty string of ASCII digits only.
    pub fn is_numeric(text: &str) -> bool {
        !text.is_empty() && text.chars().all(|c| c.is_ascii_digit())
    }

    /// True for a non-empty string of Unicode alphanumeric chars only.
    pub fn is_alphanumeric(text: &str) -> bool {
        !text.is_empty() && text.chars().all(|c| c.is_alphanumeric())
    }

    /// Splits on whitespace and keeps only the alphabetic characters of
    /// each word, dropping words that end up empty.
    pub fn extract_words(text: &str) -> Vec<String> {
        text.split_whitespace()
            .map(|word| {
                word.chars()
                    .filter(|c| c.is_alphabetic())
                    .collect::<String>()
            })
            .filter(|word| !word.is_empty())
            .collect()
    }

    /// Cheap stand-in for regex matching: recognizes only the two patterns
    /// used above and applies a hand-rolled heuristic for each. Any other
    /// pattern string matches nothing.
    fn matches_pattern(text: &str, pattern: &str) -> bool {
        match pattern {
            r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$" => {
                text.contains('@')
                    && text.contains('.')
                    && !text.starts_with('@')
                    && !text.ends_with('@')
            }
            r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(/.*)?$" => {
                text.starts_with("http://") || text.starts_with("https://")
            }
            _ => false,
        }
    }

    /// Byte offsets of every (possibly overlapping) occurrence of `pattern`
    /// in `text`. An empty pattern yields no positions.
    pub fn find_all_occurrences(text: &str, pattern: &str) -> Vec<usize> {
        // Guard: "".find("") is always Some(0), so the old loop never
        // terminated cleanly for an empty pattern.
        if pattern.is_empty() {
            return Vec::new();
        }

        let mut positions = Vec::new();
        let mut start = 0;

        while let Some(pos) = text[start..].find(pattern) {
            let absolute_pos = start + pos;
            positions.push(absolute_pos);
            // Advance by one full character, not one byte: `absolute_pos + 1`
            // may fall inside a multi-byte UTF-8 sequence and the subsequent
            // slice would panic. Stepping a single char still finds
            // overlapping matches (e.g. "aa" in "aaa" -> [0, 1]).
            let step = text[absolute_pos..].chars().next().map_or(1, char::len_utf8);
            start = absolute_pos + step;
        }

        positions
    }
}
343
/// Lightweight Unicode helpers that avoid external normalization crates.
pub struct UnicodeUtils;

impl UnicodeUtils {
    /// Folds a fixed set of common Latin accented characters onto their
    /// ASCII base letters; every other character passes through unchanged.
    pub fn simple_normalize(text: &str) -> String {
        let fold = |c: char| match c {
            'À'..='Ä' | 'à'..='ä' => 'a',
            'È'..='Ë' | 'è'..='ë' => 'e',
            'Ì'..='Ï' | 'ì'..='ï' => 'i',
            'Ò'..='Ö' | 'ò'..='ö' => 'o',
            'Ù'..='Ü' | 'ù'..='ü' => 'u',
            'Ñ' | 'ñ' => 'n',
            'Ç' | 'ç' => 'c',
            other => other,
        };
        text.chars().map(fold).collect()
    }

    /// Alias for [`Self::simple_normalize`], kept for a friendlier name.
    pub fn remove_diacritics(text: &str) -> String {
        Self::simple_normalize(text)
    }

    /// True when at least one byte falls outside the ASCII range.
    pub fn has_non_ascii(text: &str) -> bool {
        !text.is_ascii()
    }

    /// Number of Unicode scalar values (chars) in `text` — not bytes.
    pub fn char_count(text: &str) -> usize {
        text.chars().count()
    }

    /// Tallies per-character categories in a single pass over `text`.
    ///
    /// Categories are not mutually exclusive: one character may increment
    /// several counters (e.g. a non-ASCII letter is both `alphabetic` and
    /// `non_ascii`).
    pub fn analyze_text(text: &str) -> TextAnalysis {
        text.chars().fold(TextAnalysis::default(), |mut acc, ch| {
            acc.total_chars += 1;
            acc.alphabetic += usize::from(ch.is_alphabetic());
            acc.numeric += usize::from(ch.is_numeric());
            acc.whitespace += usize::from(ch.is_whitespace());
            acc.punctuation += usize::from(ch.is_ascii_punctuation());
            acc.non_ascii += usize::from(!ch.is_ascii());
            acc
        })
    }
}

/// Per-category character counts produced by [`UnicodeUtils::analyze_text`].
#[derive(Debug, Default, Clone)]
pub struct TextAnalysis {
    /// Total number of chars examined.
    pub total_chars: usize,
    /// Chars for which `char::is_alphabetic` holds.
    pub alphabetic: usize,
    /// Chars for which `char::is_numeric` holds.
    pub numeric: usize,
    /// Whitespace chars.
    pub whitespace: usize,
    /// ASCII punctuation chars.
    pub punctuation: usize,
    /// Chars outside the ASCII range.
    pub non_ascii: usize,
}
419
420pub struct TextNormalizer;
424
425impl TextNormalizer {
426 pub fn normalize_for_ml(text: &str) -> String {
428 text.to_lowercase()
429 .trim()
430 .chars()
431 .filter(|c| c.is_alphanumeric() || c.is_whitespace())
432 .collect::<String>()
433 .split_whitespace()
434 .collect::<Vec<_>>()
435 .join(" ")
436 }
437
438 pub fn normalize_whitespace(text: &str) -> String {
440 text.split_whitespace().collect::<Vec<_>>().join(" ")
441 }
442
443 pub fn to_title_case(text: &str) -> String {
445 text.split_whitespace()
446 .map(|word| {
447 let mut chars = word.chars();
448 match chars.next() {
449 None => String::new(),
450 Some(first) => {
451 first.to_uppercase().collect::<String>()
452 + chars.as_str().to_lowercase().as_str()
453 }
454 }
455 })
456 .collect::<Vec<_>>()
457 .join(" ")
458 }
459
460 pub fn remove_html_tags(text: &str) -> String {
462 let mut result = String::new();
463 let mut in_tag = false;
464
465 for ch in text.chars() {
466 match ch {
467 '<' => in_tag = true,
468 '>' => in_tag = false,
469 _ if !in_tag => result.push(ch),
470 _ => {}
471 }
472 }
473
474 result
475 }
476
477 pub fn clean_for_analysis(text: &str) -> String {
479 let cleaned = Self::remove_html_tags(text);
480 let cleaned = UnicodeUtils::remove_diacritics(&cleaned);
481 Self::normalize_for_ml(&cleaned)
482 }
483
484 pub fn truncate(text: &str, max_length: usize, add_ellipsis: bool) -> String {
486 if text.len() <= max_length {
487 return text.to_string();
488 }
489
490 let truncated = &text[..max_length.saturating_sub(if add_ellipsis { 3 } else { 0 })];
491 if add_ellipsis {
492 format!("{truncated}...")
493 } else {
494 truncated.to_string()
495 }
496 }
497}
498
#[cfg(test)]
mod tests {
    // NOTE: the former `#[allow(non_snake_case)]` on this module was
    // removed — every item here is already snake_case, so the attribute
    // suppressed nothing and would only mask future lint hits.
    use super::*;

    #[test]
    fn test_text_parser() {
        // Tokenization drops delimiter runs and trims tokens.
        let text = "Hello, world! How are you?";
        let tokens = TextParser::tokenize(text, &[' ', ',', '!', '?']);
        assert_eq!(tokens, vec!["Hello", "world", "How", "are", "you"]);

        let numbers = TextParser::extract_numbers("Price: $12.99, Quantity: 5, Discount: -2.5");
        assert_eq!(numbers, vec![12.99, 5.0, -2.5]);

        // Frequencies are case-insensitive.
        let freq = TextParser::word_frequency("hello world hello");
        assert_eq!(*freq.get("hello").unwrap(), 2);
        assert_eq!(*freq.get("world").unwrap(), 1);
    }

    #[test]
    fn test_string_similarity() {
        assert_eq!(StringSimilarity::levenshtein_distance("cat", "bat"), 1);
        assert_eq!(
            StringSimilarity::levenshtein_distance("kitten", "sitting"),
            3
        );

        let similarity = StringSimilarity::levenshtein_similarity("hello", "hallo");
        assert!(similarity > 0.5);

        let jaccard = StringSimilarity::jaccard_similarity("hello", "hallo", 2);
        assert!(jaccard > 0.0);

        let cosine = StringSimilarity::cosine_similarity("hello world", "hello earth");
        assert!(cosine > 0.0);
    }

    #[test]
    fn test_regex_utils() {
        assert!(RegexUtils::is_email("test@example.com"));
        assert!(!RegexUtils::is_email("invalid.email"));

        assert!(RegexUtils::is_url("https://example.com"));
        assert!(!RegexUtils::is_url("not-a-url"));

        assert!(RegexUtils::is_numeric("12345"));
        assert!(!RegexUtils::is_numeric("123a45"));

        // Non-alphabetic characters are stripped from words.
        let words = RegexUtils::extract_words("Hello, world! 123");
        assert_eq!(words, vec!["Hello", "world"]);

        let positions = RegexUtils::find_all_occurrences("hello hello world", "hello");
        assert_eq!(positions, vec![0, 6]);
    }

    #[test]
    fn test_unicode_utils() {
        let normalized = UnicodeUtils::simple_normalize("café");
        assert_eq!(normalized, "cafe");

        assert!(UnicodeUtils::has_non_ascii("café"));
        assert!(!UnicodeUtils::has_non_ascii("cafe"));

        // char_count counts scalar values, not bytes.
        assert_eq!(UnicodeUtils::char_count("café"), 4);

        let analysis = UnicodeUtils::analyze_text("Hello, 世界!");
        assert!(analysis.total_chars > 0);
        assert!(analysis.alphabetic > 0);
        assert!(analysis.non_ascii > 0);
    }

    #[test]
    fn test_text_normalizer() {
        let normalized = TextNormalizer::normalize_for_ml("  Hello, WORLD!  ");
        assert_eq!(normalized, "hello world");

        let whitespace = TextNormalizer::normalize_whitespace("  hello   world  ");
        assert_eq!(whitespace, "hello world");

        let title = TextNormalizer::to_title_case("hello world");
        assert_eq!(title, "Hello World");

        let no_html = TextNormalizer::remove_html_tags("<p>Hello <b>world</b>!</p>");
        assert_eq!(no_html, "Hello world!");

        // Ellipsis counts toward the length limit: 5 chars + "...".
        let truncated = TextNormalizer::truncate("Hello, world!", 8, true);
        assert_eq!(truncated, "Hello...");
    }

    #[test]
    fn test_text_analysis() {
        let analysis = UnicodeUtils::analyze_text("Hello123! ");
        assert_eq!(analysis.total_chars, 10);
        assert_eq!(analysis.alphabetic, 5);
        assert_eq!(analysis.numeric, 3);
        assert_eq!(analysis.whitespace, 1);
        assert_eq!(analysis.punctuation, 1);
    }
}