1use crate::error::{Result, TextError};
26use std::collections::{HashMap, HashSet};
27
28#[derive(Debug, Clone)]
34pub struct PreprocessConfig {
35 pub strip_html: bool,
37 pub handle_urls: UrlHandling,
39 pub handle_emails: EmailHandling,
41 pub handle_mentions: MentionHandling,
43 pub normalize_numbers: bool,
45 pub number_token: String,
47 pub expand_contractions: bool,
49 pub spell_check: bool,
51 pub max_edit_distance: usize,
53 pub remove_diacritics: bool,
55 pub unicode_normalize: bool,
57 pub lowercase: bool,
59 pub normalize_whitespace: bool,
61 pub remove_punctuation: bool,
63}
64
65impl Default for PreprocessConfig {
66 fn default() -> Self {
67 Self {
68 strip_html: true,
69 handle_urls: UrlHandling::Remove,
70 handle_emails: EmailHandling::Remove,
71 handle_mentions: MentionHandling::Remove,
72 normalize_numbers: false,
73 number_token: "<NUM>".to_string(),
74 expand_contractions: true,
75 spell_check: false,
76 max_edit_distance: 2,
77 remove_diacritics: false,
78 unicode_normalize: true,
79 lowercase: false,
80 normalize_whitespace: true,
81 remove_punctuation: false,
82 }
83 }
84}
85
/// Policy applied to URLs detected in the input text.
///
/// `Eq` is derived alongside `PartialEq`: equality on these variants is a
/// total equivalence, and `Eq` lets the policy be used wherever a full
/// equivalence relation is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UrlHandling {
    /// Leave URLs in the text; nothing is extracted.
    Keep,
    /// Delete URLs from the text and report them as extracted.
    Remove,
    /// Substitute every URL with the given token and report it as extracted.
    Replace(String),
}
96
/// Policy applied to e-mail addresses detected in the input text.
///
/// `Eq` is derived alongside `PartialEq`: equality on these variants is a
/// total equivalence, and `Eq` lets the policy be used wherever a full
/// equivalence relation is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EmailHandling {
    /// Leave addresses in the text; nothing is extracted.
    Keep,
    /// Delete addresses from the text and report them as extracted.
    Remove,
    /// Substitute every address with the given token and report it as
    /// extracted.
    Replace(String),
}
107
/// Policy applied to `@mentions` detected in the input text.
///
/// `Eq` is derived alongside `PartialEq`: equality on these variants is a
/// total equivalence, and `Eq` lets the policy be used wherever a full
/// equivalence relation is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MentionHandling {
    /// Leave mentions in the text; nothing is extracted.
    Keep,
    /// Delete mentions from the text and report them as extracted.
    Remove,
    /// Substitute every mention with the given token and report it as
    /// extracted.
    Replace(String),
}
118
/// Output of [`TextPreprocessor::process`]: the cleaned text plus everything
/// that was pulled out of it along the way.
///
/// `Default` yields an empty result, and `PartialEq`/`Eq` allow whole results
/// to be compared directly (e.g. in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PreprocessResult {
    /// The processed text.
    pub text: String,
    /// URLs removed or replaced, in order of appearance. Empty when URLs are
    /// kept.
    pub extracted_urls: Vec<String>,
    /// E-mail addresses removed or replaced, in order of appearance. Empty
    /// when addresses are kept.
    pub extracted_emails: Vec<String>,
    /// Mentions (including the leading `@`) removed or replaced, in order of
    /// appearance. Empty when mentions are kept.
    pub extracted_mentions: Vec<String>,
    /// Numeric literals replaced by the number token. Empty unless number
    /// normalization is enabled.
    pub extracted_numbers: Vec<String>,
    /// `(lowercased misspelled word, chosen dictionary word)` pairs produced
    /// by the spell checker.
    pub spelling_corrections: Vec<(String, String)>,
}
139
140#[derive(Debug, Clone)]
146pub struct TextPreprocessor {
147 config: PreprocessConfig,
148 dictionary: HashSet<String>,
150 contractions: HashMap<String, String>,
152}
153
154impl TextPreprocessor {
155 pub fn new(config: PreprocessConfig) -> Self {
157 let contractions = build_contraction_map();
158 Self {
159 config,
160 dictionary: HashSet::new(),
161 contractions,
162 }
163 }
164
165 pub fn with_dictionary(mut self, words: impl IntoIterator<Item = String>) -> Self {
167 self.dictionary = words.into_iter().collect();
168 self
169 }
170
171 pub fn add_dictionary_words(&mut self, words: impl IntoIterator<Item = String>) {
173 self.dictionary.extend(words);
174 }
175
176 pub fn with_basic_dictionary(mut self) -> Self {
178 self.dictionary = build_basic_dictionary();
179 self
180 }
181
182 pub fn process(&self, text: &str) -> Result<PreprocessResult> {
184 let mut result = PreprocessResult {
185 text: text.to_string(),
186 extracted_urls: Vec::new(),
187 extracted_emails: Vec::new(),
188 extracted_mentions: Vec::new(),
189 extracted_numbers: Vec::new(),
190 spelling_corrections: Vec::new(),
191 };
192
193 if self.config.unicode_normalize {
195 result.text = unicode_nfc_normalize(&result.text);
196 }
197
198 if self.config.strip_html {
200 result.text = strip_html_tags(&result.text);
201 }
202
203 let (text_after_urls, urls) =
205 extract_and_handle_urls(&result.text, &self.config.handle_urls);
206 result.text = text_after_urls;
207 result.extracted_urls = urls;
208
209 let (text_after_emails, emails) =
211 extract_and_handle_emails(&result.text, &self.config.handle_emails);
212 result.text = text_after_emails;
213 result.extracted_emails = emails;
214
215 let (text_after_mentions, mentions) =
217 extract_and_handle_mentions(&result.text, &self.config.handle_mentions);
218 result.text = text_after_mentions;
219 result.extracted_mentions = mentions;
220
221 if self.config.expand_contractions {
223 result.text = self.expand_contractions_text(&result.text);
224 }
225
226 if self.config.normalize_numbers {
228 let (text, numbers) = normalize_numbers(&result.text, &self.config.number_token);
229 result.text = text;
230 result.extracted_numbers = numbers;
231 }
232
233 if self.config.remove_diacritics {
235 result.text = remove_diacritics_from_text(&result.text);
236 }
237
238 if self.config.lowercase {
240 result.text = result.text.to_lowercase();
241 }
242
243 if self.config.remove_punctuation {
245 result.text = remove_punctuation(&result.text);
246 }
247
248 if self.config.spell_check && !self.dictionary.is_empty() {
250 let (text, corrections) =
251 self.spell_check_text(&result.text, self.config.max_edit_distance);
252 result.text = text;
253 result.spelling_corrections = corrections;
254 }
255
256 if self.config.normalize_whitespace {
258 result.text = normalize_whitespace(&result.text);
259 }
260
261 Ok(result)
262 }
263
264 fn expand_contractions_text(&self, text: &str) -> String {
266 let mut result = text.to_string();
267
268 let mut sorted_contractions: Vec<(&String, &String)> = self.contractions.iter().collect();
270 sorted_contractions.sort_by_key(|(k, _)| std::cmp::Reverse(k.len()));
271
272 for (contraction, expansion) in &sorted_contractions {
273 let lower = result.to_lowercase();
275 let contraction_lower = contraction.to_lowercase();
276 let mut new_result = String::with_capacity(result.len());
277 let mut search_from = 0;
278
279 loop {
280 let lower_slice = &lower[search_from..];
281 match lower_slice.find(&contraction_lower) {
282 Some(pos) => {
283 new_result.push_str(&result[search_from..search_from + pos]);
284 new_result.push_str(expansion);
285 search_from += pos + contraction.len();
286 }
287 None => {
288 new_result.push_str(&result[search_from..]);
289 break;
290 }
291 }
292 }
293 result = new_result;
294 }
295 result
296 }
297
298 fn spell_check_text(&self, text: &str, max_distance: usize) -> (String, Vec<(String, String)>) {
300 let mut corrections = Vec::new();
301 let words: Vec<&str> = text.split_whitespace().collect();
302 let mut result_words = Vec::with_capacity(words.len());
303
304 for word in &words {
305 let clean_word = word
306 .trim_matches(|c: char| !c.is_alphanumeric())
307 .to_lowercase();
308
309 if clean_word.is_empty() || self.dictionary.contains(&clean_word) {
310 result_words.push(word.to_string());
311 continue;
312 }
313
314 if let Some(correction) = find_closest_word(&clean_word, &self.dictionary, max_distance)
316 {
317 corrections.push((clean_word.clone(), correction.clone()));
318 let corrected = transfer_casing(word, &correction);
320 result_words.push(corrected);
321 } else {
322 result_words.push(word.to_string());
323 }
324 }
325
326 (result_words.join(" "), corrections)
327 }
328}
329
330pub fn strip_html_tags(text: &str) -> String {
338 let mut result = String::with_capacity(text.len());
339 let mut in_tag = false;
340 let chars: Vec<char> = text.chars().collect();
341 let mut i = 0;
342
343 while i < chars.len() {
344 if chars[i] == '<' {
345 in_tag = true;
346 i += 1;
347 continue;
348 }
349 if chars[i] == '>' && in_tag {
350 in_tag = false;
351 i += 1;
352 continue;
353 }
354 if !in_tag {
355 if chars[i] == '&' {
357 if let Some(entity_result) = try_decode_entity(&chars, i) {
358 result.push(entity_result.0);
359 i = entity_result.1;
360 continue;
361 }
362 }
363 result.push(chars[i]);
364 }
365 i += 1;
366 }
367 result
368}
369
/// Tries to decode the HTML character reference starting at `chars[start]`.
///
/// Recognises the named references `amp`, `lt`, `gt`, `quot`, `apos` and
/// `nbsp` (decoded to a plain space), plus decimal (`#NNN`) and hexadecimal
/// (`#xHHH`) numeric references. The terminating `;` must occur within the
/// first ten characters of the reference.
///
/// Returns the decoded character and the index just past the `;`, or `None`
/// when `chars[start]` is not `&`, no `;` is found in range, or the name is
/// not recognised.
fn try_decode_entity(chars: &[char], start: usize) -> Option<(char, usize)> {
    /// Longest reference considered, counted from `&` through `;`.
    const MAX_ENTITY_LEN: usize = 10;

    /// Parses an unsigned code point; signs and empty strings are rejected.
    fn parse_code_point(digits: &str, radix: u32) -> Option<char> {
        if digits.is_empty() || !digits.chars().all(|d| d.is_digit(radix)) {
            return None;
        }
        u32::from_str_radix(digits, radix)
            .ok()
            .and_then(char::from_u32)
    }

    if chars.get(start) != Some(&'&') {
        return None;
    }

    // Only the first `;` counts: if what precedes it is not a known
    // reference, the ampersand is plain text.
    let window_end = chars.len().min(start + MAX_ENTITY_LEN);
    let semicolon = (start + 1..window_end).find(|&idx| chars[idx] == ';')?;

    // Match on the bare name (without `&` and `;`) so this table does not
    // itself contain character references that tooling could decode.
    let name: String = chars[start + 1..semicolon].iter().collect();
    let decoded = match name.as_str() {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        "nbsp" => Some(' '),
        other => {
            if let Some(hex) = other
                .strip_prefix("#x")
                .or_else(|| other.strip_prefix("#X"))
            {
                parse_code_point(hex, 16)
            } else if let Some(decimal) = other.strip_prefix('#') {
                parse_code_point(decimal, 10)
            } else {
                None
            }
        }
    };

    decoded.map(|c| (c, semicolon + 1))
}
409
410fn extract_and_handle_urls(text: &str, handling: &UrlHandling) -> (String, Vec<String>) {
416 let mut urls = Vec::new();
417
418 match handling {
419 UrlHandling::Keep => (text.to_string(), urls),
420 UrlHandling::Remove | UrlHandling::Replace(_) => {
421 let replacement = match handling {
422 UrlHandling::Replace(token) => token.as_str(),
423 _ => "",
424 };
425 let result =
426 replace_pattern_simple(text, is_url_start, find_url_end, replacement, &mut urls);
427 (result, urls)
428 }
429 }
430}
431
/// Reports whether a URL begins at byte offset `pos` of `text`.
///
/// A URL start is one of the prefixes `http://`, `https://`, `ftp://` or
/// `www.`, compared ASCII case-insensitively (URI schemes and host names are
/// case-insensitive). Returns `false`, rather than panicking, when `pos` is
/// out of range or not on a character boundary.
fn is_url_start(text: &str, pos: usize) -> bool {
    const URL_PREFIXES: [&str; 4] = ["http://", "https://", "ftp://", "www."];

    let remaining = match text.get(pos..) {
        Some(rest) => rest.as_bytes(),
        None => return false,
    };

    URL_PREFIXES.iter().any(|prefix| {
        remaining.len() >= prefix.len()
            && remaining[..prefix.len()].eq_ignore_ascii_case(prefix.as_bytes())
    })
}
440
/// Returns the byte offset just past the URL that starts at `start`.
///
/// The URL runs until the first whitespace character (any Unicode
/// whitespace, not only ASCII), `<`, `>` or `"`; none of these can occur in
/// a URL. Trailing sentence punctuation (`. , ) ] ; : ! ?`) is then excluded
/// so that "see https://a.com." does not capture the final period.
///
/// Returns `start` unchanged when it is out of range or not on a character
/// boundary.
fn find_url_end(text: &str, start: usize) -> usize {
    let tail = match text.get(start..) {
        Some(rest) => rest,
        None => return start,
    };

    let raw_len = tail
        .find(|c: char| c.is_whitespace() || matches!(c, '<' | '>' | '"'))
        .unwrap_or(tail.len());

    let url = tail[..raw_len]
        .trim_end_matches(|c: char| matches!(c, '.' | ',' | ')' | ']' | ';' | ':' | '!' | '?'));

    start + url.len()
}
472
473fn extract_and_handle_emails(text: &str, handling: &EmailHandling) -> (String, Vec<String>) {
479 let mut emails = Vec::new();
480
481 match handling {
482 EmailHandling::Keep => (text.to_string(), emails),
483 EmailHandling::Remove | EmailHandling::Replace(_) => {
484 let replacement = match handling {
485 EmailHandling::Replace(token) => token.as_str(),
486 _ => "",
487 };
488 let result = find_and_replace_emails(text, replacement, &mut emails);
489 (result, emails)
490 }
491 }
492}
493
/// Replaces every e-mail address in `text` with `replacement`, appending the
/// addresses to `extracted` in order of appearance.
///
/// An address is an `@` with a non-empty local part before it (alphanumerics
/// and `. _ + - %`) and a domain after it (alphanumerics, `.` and `-`) that
/// contains at least one dot. Trailing `.` or `-` characters are treated as
/// surrounding punctuation and are not part of the domain, so
/// "user@example.com." keeps its final period. The local part never reaches
/// back into text that already belonged to a previous address.
fn find_and_replace_emails(text: &str, replacement: &str, extracted: &mut Vec<String>) -> String {
    fn is_local_char(c: char) -> bool {
        c.is_alphanumeric() || matches!(c, '.' | '_' | '+' | '-' | '%')
    }
    fn is_domain_char(c: char) -> bool {
        c.is_alphanumeric() || matches!(c, '.' | '-')
    }

    let chars: Vec<char> = text.chars().collect();
    let mut result = String::with_capacity(text.len());
    // Everything before this index has already been written to `result`
    // (either verbatim or as a replacement).
    let mut copied = 0;
    let mut i = 0;

    while i < chars.len() {
        if chars[i] != '@' {
            i += 1;
            continue;
        }

        // Walk backwards over the local part, but never past consumed text.
        let mut local_start = i;
        while local_start > copied && is_local_char(chars[local_start - 1]) {
            local_start -= 1;
        }

        // Walk forwards over the domain, then give back trailing punctuation.
        let mut domain_end = i + 1;
        while domain_end < chars.len() && is_domain_char(chars[domain_end]) {
            domain_end += 1;
        }
        while domain_end > i + 1 && matches!(chars[domain_end - 1], '.' | '-') {
            domain_end -= 1;
        }

        let domain = &chars[i + 1..domain_end];
        if local_start < i && domain.contains(&'.') {
            result.extend(&chars[copied..local_start]);
            extracted.push(chars[local_start..domain_end].iter().collect());
            result.push_str(replacement);
            copied = domain_end;
            i = domain_end;
        } else {
            i += 1;
        }
    }

    result.extend(&chars[copied..]);
    result
}
550
551fn extract_and_handle_mentions(text: &str, handling: &MentionHandling) -> (String, Vec<String>) {
557 let mut mentions = Vec::new();
558
559 match handling {
560 MentionHandling::Keep => (text.to_string(), mentions),
561 MentionHandling::Remove | MentionHandling::Replace(_) => {
562 let replacement = match handling {
563 MentionHandling::Replace(token) => token.as_str(),
564 _ => "",
565 };
566 let result = find_and_replace_mentions(text, replacement, &mut mentions);
567 (result, mentions)
568 }
569 }
570}
571
/// Replaces every `@handle` in `text` with `replacement`, appending the
/// mentions (including the `@`) to `extracted`.
///
/// A mention must start at the beginning of the text or directly after
/// whitespace, and its handle is a non-empty run of alphanumerics and
/// underscores.
fn find_and_replace_mentions(text: &str, replacement: &str, extracted: &mut Vec<String>) -> String {
    let symbols: Vec<char> = text.chars().collect();
    let mut output = String::with_capacity(text.len());
    let mut cursor = 0;

    while let Some(&current) = symbols.get(cursor) {
        let after_gap = cursor == 0 || symbols[cursor - 1].is_whitespace();

        if current == '@' && after_gap {
            let handle_len = symbols[cursor + 1..]
                .iter()
                .take_while(|c| c.is_alphanumeric() || **c == '_')
                .count();

            // A bare '@' with no handle is ordinary text.
            if handle_len > 0 {
                let stop = cursor + 1 + handle_len;
                extracted.push(symbols[cursor..stop].iter().collect());
                output.push_str(replacement);
                cursor = stop;
                continue;
            }
        }

        output.push(current);
        cursor += 1;
    }

    output
}
601
/// Replaces every numeric literal in `text` with `token` and returns the
/// rewritten text together with the literals that were replaced.
///
/// A literal is a run of ASCII digits, optionally followed by
/// comma-separated digit groups, a fractional part and an exponent
/// (`e`/`E`, optional sign, digits). A leading `-` is part of the literal
/// only at the start of the text or directly after whitespace.
fn normalize_numbers(text: &str, token: &str) -> (String, Vec<String>) {
    let symbols: Vec<char> = text.chars().collect();
    let total = symbols.len();

    let digit_at = |idx: usize| idx < total && symbols[idx].is_ascii_digit();
    // Index of the first non-digit at or after `idx`.
    let skip_digits = |mut idx: usize| {
        while digit_at(idx) {
            idx += 1;
        }
        idx
    };

    let mut output = String::with_capacity(text.len());
    let mut found = Vec::new();
    let mut pos = 0;

    while pos < total {
        let signed_start = symbols[pos] == '-'
            && digit_at(pos + 1)
            && (pos == 0 || symbols[pos - 1].is_whitespace());

        if !(digit_at(pos) || signed_start) {
            output.push(symbols[pos]);
            pos += 1;
            continue;
        }

        let begin = pos;
        if signed_start {
            pos += 1;
        }

        // Integer part.
        pos = skip_digits(pos);

        // Comma-separated groups: a comma only counts when a digit follows.
        while pos < total && symbols[pos] == ',' && digit_at(pos + 1) {
            pos = skip_digits(pos + 1);
        }

        // Fractional part: a dot only counts when a digit follows.
        if pos < total && symbols[pos] == '.' && digit_at(pos + 1) {
            pos = skip_digits(pos + 1);
        }

        // Exponent: only consumed when at least one digit follows the
        // marker and optional sign; otherwise the marker is left as text.
        if pos < total && matches!(symbols[pos], 'e' | 'E') {
            let mut probe = pos + 1;
            if probe < total && matches!(symbols[probe], '+' | '-') {
                probe += 1;
            }
            if digit_at(probe) {
                pos = skip_digits(probe);
            }
        }

        found.push(symbols[begin..pos].iter().collect());
        output.push_str(token);
    }

    (output, found)
}
672
673pub fn remove_diacritics_from_text(text: &str) -> String {
681 use unicode_normalization::UnicodeNormalization;
682
683 text.nfd().filter(|c| !is_combining_mark(*c)).collect()
684}
685
/// Reports whether `c` lies in one of the combining-mark ranges that
/// diacritic removal strips: U+0300..=U+036F, U+1AB0..=U+1AFF,
/// U+1DC0..=U+1DFF and U+FE20..=U+FE2F.
fn is_combining_mark(c: char) -> bool {
    matches!(
        c,
        '\u{0300}'..='\u{036F}'
            | '\u{1AB0}'..='\u{1AFF}'
            | '\u{1DC0}'..='\u{1DFF}'
            | '\u{FE20}'..='\u{FE2F}'
    )
}
698
699fn unicode_nfc_normalize(text: &str) -> String {
705 use unicode_normalization::UnicodeNormalization;
706 text.nfc().collect()
707}
708
/// Collapses every run of whitespace in `text` to a single ASCII space and
/// removes leading and trailing whitespace.
pub fn normalize_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for (index, word) in text.split_whitespace().enumerate() {
        if index > 0 {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
736
/// Replaces every ASCII punctuation character in `text` with a space.
///
/// Non-ASCII punctuation is left untouched, and the replacement spaces are
/// not collapsed here.
fn remove_punctuation(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for ch in text.chars() {
        cleaned.push(if ch.is_ascii_punctuation() { ' ' } else { ch });
    }
    cleaned
}
747
/// Computes the Levenshtein distance between `a` and `b`: the minimum number
/// of single-character insertions, deletions and substitutions needed to
/// turn one into the other. Characters are compared as Unicode scalar
/// values.
///
/// Only two rows of the dynamic-programming table are kept at a time.
pub fn edit_distance(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    if source.is_empty() {
        return target.len();
    }
    if target.is_empty() {
        return source.len();
    }

    // `previous[col]` is the distance between the processed prefix of
    // `source` and the first `col` characters of `target`.
    let mut previous: Vec<usize> = (0..=target.len()).collect();
    let mut current = vec![0usize; target.len() + 1];

    for (row, &from) in source.iter().enumerate() {
        current[0] = row + 1;
        for (col, &to) in target.iter().enumerate() {
            let substitution = previous[col] + usize::from(from != to);
            let deletion = previous[col + 1] + 1;
            let insertion = current[col] + 1;
            current[col + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut previous, &mut current);
    }

    previous[target.len()]
}
787
788fn find_closest_word(
790 word: &str,
791 dictionary: &HashSet<String>,
792 max_distance: usize,
793) -> Option<String> {
794 let mut best: Option<(String, usize)> = None;
795
796 for dict_word in dictionary {
797 let len_diff = if word.len() > dict_word.len() {
799 word.len() - dict_word.len()
800 } else {
801 dict_word.len() - word.len()
802 };
803 if len_diff > max_distance {
804 continue;
805 }
806
807 let dist = edit_distance(word, dict_word);
808 if dist <= max_distance {
809 match &best {
810 None => best = Some((dict_word.clone(), dist)),
811 Some((_, best_dist)) => {
812 if dist < *best_dist {
813 best = Some((dict_word.clone(), dist));
814 }
815 }
816 }
817 }
818 }
819
820 best.map(|(w, _)| w)
821}
822
/// Applies the capitalisation pattern of `source` to `target`.
///
/// Only cased letters of `source` are inspected, so attached punctuation or
/// digits do not hide the pattern:
///
/// * all cased letters uppercase -> `target` uppercased;
/// * first cased letter uppercase -> first character of `target` uppercased;
/// * otherwise, or when `source` has no cased letters -> `target` unchanged.
///
/// An empty `target` yields an empty string.
fn transfer_casing(source: &str, target: &str) -> String {
    let mut cased = source
        .chars()
        .filter(|c| c.is_uppercase() || c.is_lowercase())
        .peekable();

    let starts_upper = match cased.peek() {
        Some(first) => first.is_uppercase(),
        // Nothing to learn the casing from.
        None => return target.to_string(),
    };

    if cased.all(|c| c.is_uppercase()) {
        return target.to_uppercase();
    }
    if !starts_upper {
        return target.to_string();
    }

    let mut rest = target.chars();
    match rest.next() {
        // Uppercasing one char may yield several (e.g. German sharp s).
        Some(head) => head.to_uppercase().chain(rest).collect(),
        None => String::new(),
    }
}
849
/// Builds the lowercase contraction -> expansion table used by contraction
/// expansion.
fn build_contraction_map() -> HashMap<String, String> {
    const CONTRACTIONS: &[(&str, &str)] = &[
        // Negations.
        ("can't", "cannot"),
        ("won't", "will not"),
        ("don't", "do not"),
        ("doesn't", "does not"),
        ("didn't", "did not"),
        ("isn't", "is not"),
        ("aren't", "are not"),
        ("wasn't", "was not"),
        ("weren't", "were not"),
        ("hasn't", "has not"),
        ("haven't", "have not"),
        ("hadn't", "had not"),
        ("wouldn't", "would not"),
        ("couldn't", "could not"),
        ("shouldn't", "should not"),
        ("mustn't", "must not"),
        ("needn't", "need not"),
        ("shan't", "shall not"),
        ("mightn't", "might not"),
        // "'s" forms.
        ("it's", "it is"),
        ("that's", "that is"),
        ("what's", "what is"),
        ("where's", "where is"),
        ("who's", "who is"),
        ("there's", "there is"),
        ("here's", "here is"),
        ("let's", "let us"),
        // "am" / "are".
        ("i'm", "i am"),
        ("you're", "you are"),
        ("we're", "we are"),
        ("they're", "they are"),
        // "have".
        ("i've", "i have"),
        ("you've", "you have"),
        ("we've", "we have"),
        ("they've", "they have"),
        // "will".
        ("i'll", "i will"),
        ("you'll", "you will"),
        ("he'll", "he will"),
        ("she'll", "she will"),
        ("it'll", "it will"),
        ("we'll", "we will"),
        ("they'll", "they will"),
        // "would".
        ("i'd", "i would"),
        ("you'd", "you would"),
        ("he'd", "he would"),
        ("she'd", "she would"),
        ("we'd", "we would"),
        ("they'd", "they would"),
    ];

    CONTRACTIONS
        .iter()
        .map(|&(short, long)| (short.to_owned(), long.to_owned()))
        .collect()
}
912
/// Builds the small built-in spelling dictionary: a list of very common
/// English words plus a handful of computing terms, all lowercase.
fn build_basic_dictionary() -> HashSet<String> {
    const COMMON_WORDS: &[&str] = &[
        "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
        "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
        "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
        "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
        "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
        "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
        "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
        "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
        "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
        "even", "new", "want", "because", "any", "these", "give", "day", "most", "us",
        "great", "world", "very", "much", "been",
    ];
    // "world" appears in both lists; the set keeps a single copy.
    const DOMAIN_WORDS: &[&str] = &[
        "hello", "world", "computer", "science", "data", "machine", "learning",
        "algorithm", "programming", "software", "system", "network", "internet",
        "technology", "digital", "information", "process", "language", "text",
    ];

    COMMON_WORDS
        .iter()
        .chain(DOMAIN_WORDS.iter())
        .map(|word| (*word).to_owned())
        .collect()
}
1047
/// Scans `text` left to right and replaces every span recognised by the
/// `is_start` / `find_end` pair with `replacement`.
///
/// `is_start(text, offset)` says whether a span begins at the byte offset,
/// and `find_end(text, offset)` returns the offset just past that span. A
/// span is only replaced when it is non-empty; the matched text is appended
/// to `extracted`. All other characters are copied through unchanged.
fn replace_pattern_simple(
    text: &str,
    is_start: fn(&str, usize) -> bool,
    find_end: fn(&str, usize) -> usize,
    replacement: &str,
    extracted: &mut Vec<String>,
) -> String {
    let mut output = String::with_capacity(text.len());
    let mut cursor = 0;

    // `cursor` always sits on a character boundary, so the next character
    // can be read directly from the remaining slice.
    while let Some(current) = text[cursor..].chars().next() {
        if is_start(text, cursor) {
            let stop = find_end(text, cursor);
            if stop > cursor {
                extracted.push(text[cursor..stop].to_string());
                output.push_str(replacement);
                cursor = stop;
                continue;
            }
        }

        output.push(current);
        cursor += current.len_utf8();
    }

    output
}
1081
#[cfg(test)]
mod tests {
    //! Unit tests for the preprocessing helpers and the full pipeline.
    //!
    //! In the entity tests the ampersand of each character reference is
    //! spelled `\u{26}`. That keeps the inputs intact even if this file is
    //! passed through tooling that HTML-unescapes text, which would otherwise
    //! turn those tests into no-ops or outright failures.

    use super::*;

    #[test]
    fn test_strip_html_basic() {
        assert_eq!(strip_html_tags("<p>hello</p>"), "hello");
        assert_eq!(strip_html_tags("<b>bold</b> text"), "bold text");
    }

    #[test]
    fn test_strip_html_nested() {
        assert_eq!(strip_html_tags("<div><p>nested</p></div>"), "nested");
    }

    #[test]
    fn test_strip_html_entities() {
        assert_eq!(strip_html_tags("a \u{26}amp; b"), "a & b");
        assert_eq!(strip_html_tags("a \u{26}lt; b"), "a < b");
        assert_eq!(strip_html_tags("a \u{26}gt; b"), "a > b");
    }

    #[test]
    fn test_strip_html_no_tags() {
        assert_eq!(strip_html_tags("no tags here"), "no tags here");
    }

    #[test]
    fn test_url_detection() {
        let (text, urls) =
            extract_and_handle_urls("visit https://example.com for info", &UrlHandling::Remove);
        assert!(!text.contains("https://"));
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0], "https://example.com");
    }

    #[test]
    fn test_url_replacement() {
        let (text, urls) = extract_and_handle_urls(
            "check https://example.com now",
            &UrlHandling::Replace("<URL>".to_string()),
        );
        assert!(text.contains("<URL>"));
        assert_eq!(urls.len(), 1);
    }

    #[test]
    fn test_url_keep() {
        let (text, urls) = extract_and_handle_urls("see https://example.com", &UrlHandling::Keep);
        assert!(text.contains("https://example.com"));
        assert!(urls.is_empty());
    }

    #[test]
    fn test_email_detection() {
        let (text, emails) =
            extract_and_handle_emails("contact user@example.com for help", &EmailHandling::Remove);
        assert!(!text.contains("user@example.com"));
        assert_eq!(emails.len(), 1);
        assert_eq!(emails[0], "user@example.com");
    }

    #[test]
    fn test_mention_detection() {
        let (text, mentions) =
            extract_and_handle_mentions("hello @user123 how are you", &MentionHandling::Remove);
        assert!(!text.contains("@user123"));
        assert_eq!(mentions.len(), 1);
        assert_eq!(mentions[0], "@user123");
    }

    #[test]
    fn test_mention_replacement() {
        let (text, _) = extract_and_handle_mentions(
            "hi @alice and @bob",
            &MentionHandling::Replace("<MENTION>".to_string()),
        );
        assert!(text.contains("<MENTION>"));
        assert!(!text.contains("@alice"));
    }

    #[test]
    fn test_number_normalization() {
        let (text, numbers) = normalize_numbers("I have 42 cats and 3.14 dogs", "<NUM>");
        assert!(text.contains("<NUM>"));
        assert_eq!(numbers.len(), 2);
        assert!(numbers.contains(&"42".to_string()));
        assert!(numbers.contains(&"3.14".to_string()));
    }

    #[test]
    fn test_number_with_commas() {
        let (text, numbers) = normalize_numbers("population: 1,234,567", "<NUM>");
        assert!(text.contains("<NUM>"));
        assert_eq!(numbers.len(), 1);
    }

    #[test]
    fn test_contraction_expansion() {
        let preprocessor = TextPreprocessor::new(PreprocessConfig {
            strip_html: false,
            expand_contractions: true,
            ..Default::default()
        });
        let result = preprocessor
            .process("I can't do this")
            .expect("process failed");
        assert!(result.text.contains("cannot"));
    }

    #[test]
    fn test_contraction_wont() {
        let preprocessor = TextPreprocessor::new(PreprocessConfig {
            strip_html: false,
            expand_contractions: true,
            ..Default::default()
        });
        let result = preprocessor.process("I won't go").expect("process failed");
        assert!(result.text.contains("will not"));
    }

    #[test]
    fn test_diacritics_removal() {
        // "e" followed by a combining acute accent.
        let result = remove_diacritics_from_text("cafe\u{0301}");
        assert_eq!(result, "cafe");
    }

    #[test]
    fn test_diacritics_spanish() {
        // Precomposed n-with-tilde; decomposition exposes the combining tilde.
        let result = remove_diacritics_from_text("ni\u{00f1}o");
        assert_eq!(result, "nino");
    }

    #[test]
    fn test_whitespace_normalization() {
        assert_eq!(normalize_whitespace("  hello   world  "), "hello world");
        assert_eq!(normalize_whitespace("a\t\nb"), "a b");
    }

    #[test]
    fn test_edit_distance() {
        assert_eq!(edit_distance("kitten", "sitting"), 3);
        assert_eq!(edit_distance("", "abc"), 3);
        assert_eq!(edit_distance("abc", "abc"), 0);
        assert_eq!(edit_distance("abc", ""), 3);
    }

    #[test]
    fn test_spell_check() {
        let mut dictionary = HashSet::new();
        dictionary.insert("hello".to_string());
        dictionary.insert("world".to_string());
        dictionary.insert("computer".to_string());

        let closest = find_closest_word("helo", &dictionary, 2);
        assert_eq!(closest, Some("hello".to_string()));
    }

    #[test]
    fn test_spell_check_no_match() {
        let mut dictionary = HashSet::new();
        dictionary.insert("hello".to_string());

        let closest = find_closest_word("zzzzz", &dictionary, 1);
        assert!(closest.is_none());
    }

    #[test]
    fn test_full_pipeline() {
        let config = PreprocessConfig {
            strip_html: true,
            handle_urls: UrlHandling::Replace("<URL>".to_string()),
            handle_emails: EmailHandling::Replace("<EMAIL>".to_string()),
            handle_mentions: MentionHandling::Replace("<MENTION>".to_string()),
            normalize_numbers: true,
            expand_contractions: true,
            unicode_normalize: true,
            normalize_whitespace: true,
            ..Default::default()
        };

        let preprocessor = TextPreprocessor::new(config);
        let text = "<p>I can't believe https://example.com has @user with 42 items!</p>";
        let result = preprocessor.process(text).expect("process failed");

        assert!(!result.text.contains("<p>"));
        assert!(result.text.contains("cannot"));
        assert!(result.text.contains("<URL>"));
        assert!(result.text.contains("<MENTION>"));
        assert!(!result.extracted_urls.is_empty());
        assert!(!result.extracted_mentions.is_empty());
    }

    #[test]
    fn test_pipeline_defaults() {
        let preprocessor = TextPreprocessor::new(PreprocessConfig::default());
        let result = preprocessor.process("Hello World").expect("process failed");
        assert_eq!(result.text, "Hello World");
    }

    #[test]
    fn test_punctuation_removal() {
        let text = remove_punctuation("Hello, world! How are you?");
        assert!(!text.contains(','));
        assert!(!text.contains('!'));
        assert!(!text.contains('?'));
    }

    #[test]
    fn test_transfer_casing() {
        assert_eq!(transfer_casing("Hello", "world"), "World");
        assert_eq!(transfer_casing("HELLO", "world"), "WORLD");
        assert_eq!(transfer_casing("hello", "WORLD"), "WORLD");
    }

    #[test]
    fn test_basic_dictionary() {
        let dict = build_basic_dictionary();
        assert!(dict.contains("the"));
        assert!(dict.contains("hello"));
    }

    #[test]
    fn test_spell_check_integration() {
        let config = PreprocessConfig {
            strip_html: false,
            expand_contractions: false,
            spell_check: true,
            max_edit_distance: 2,
            normalize_whitespace: true,
            ..Default::default()
        };

        let preprocessor = TextPreprocessor::new(config).with_basic_dictionary();
        let result = preprocessor.process("helo wrld").expect("process failed");
        assert!(!result.spelling_corrections.is_empty());
    }

    #[test]
    fn test_numeric_entity_decode() {
        // Decimal and hexadecimal references for 'A'.
        assert_eq!(strip_html_tags("\u{26}#65;"), "A");
        assert_eq!(strip_html_tags("\u{26}#x41;"), "A");
    }

    #[test]
    fn test_empty_input() {
        let preprocessor = TextPreprocessor::new(PreprocessConfig::default());
        let result = preprocessor.process("").expect("process failed");
        assert_eq!(result.text, "");
    }

    #[test]
    fn test_multiple_urls() {
        let (text, urls) =
            extract_and_handle_urls("see https://a.com and https://b.com", &UrlHandling::Remove);
        assert_eq!(urls.len(), 2);
        assert!(!text.contains("https://"));
    }

    #[test]
    fn test_lowercase() {
        let config = PreprocessConfig {
            strip_html: false,
            expand_contractions: false,
            lowercase: true,
            ..Default::default()
        };
        let preprocessor = TextPreprocessor::new(config);
        let result = preprocessor.process("Hello WORLD").expect("process failed");
        assert_eq!(result.text, "hello world");
    }

    #[test]
    fn test_scientific_notation() {
        let (text, numbers) = normalize_numbers("value is 1.5e10 and 2E-3", "<NUM>");
        assert_eq!(numbers.len(), 2);
        assert!(text.contains("<NUM>"));
    }

    #[test]
    fn test_negative_numbers() {
        let (text, numbers) = normalize_numbers("temperature: -42 degrees", "<NUM>");
        assert!(numbers.contains(&"-42".to_string()));
        assert!(text.contains("<NUM>"));
    }

    #[test]
    fn test_html_self_closing() {
        assert_eq!(strip_html_tags("before<br/>after"), "beforeafter");
        assert_eq!(strip_html_tags("a<img src='x'/>b"), "ab");
    }

    #[test]
    fn test_email_no_email() {
        let (text, emails) = extract_and_handle_emails("no email here", &EmailHandling::Remove);
        assert_eq!(text, "no email here");
        assert!(emails.is_empty());
    }

    #[test]
    fn test_mention_not_at_word_boundary() {
        // An '@' glued to the preceding word is not a mention.
        let (text, mentions) =
            extract_and_handle_mentions("test@notamention", &MentionHandling::Remove);
        assert!(mentions.is_empty());
        assert!(text.contains("test@notamention"));
    }
}