scirs2_text/
text_preprocess.rs

1//! # Text Preprocessing Pipeline
2//!
3//! A configurable text preprocessing pipeline with the following capabilities:
4//!
5//! - **HTML tag removal**: Strip HTML/XML tags from text
6//! - **URL/email/mention extraction**: Detect and extract or remove URLs, emails, @mentions
//! - **Number normalization**: Replace numbers with a configurable placeholder token
8//! - **Contraction expansion**: Expand English contractions (e.g., "can't" -> "cannot")
9//! - **Spell checking**: Edit-distance-based spelling correction with dictionary lookup
10//! - **Text normalization**: Unicode normalization, diacritics removal
11//!
12//! ## Example
13//!
14//! ```rust
15//! use scirs2_text::text_preprocess::{TextPreprocessor, PreprocessConfig};
16//!
17//! let config = PreprocessConfig::default();
18//! let preprocessor = TextPreprocessor::new(config);
19//!
20//! let text = "<p>I can't believe it's only $9.99!</p>";
21//! let result = preprocessor.process(text).unwrap();
22//! assert!(!result.text.contains("<p>"));
23//! ```
24
25use crate::error::{Result, TextError};
26use std::collections::{HashMap, HashSet};
27
28// ---------------------------------------------------------------------------
29// Configuration
30// ---------------------------------------------------------------------------
31
/// Configuration for the text preprocessing pipeline.
///
/// Each field enables or parameterizes one stage of
/// `TextPreprocessor::process`. The stages run in a fixed order that is
/// independent of the field order below.
#[derive(Debug, Clone)]
pub struct PreprocessConfig {
    /// Remove HTML/XML tags.
    pub strip_html: bool,
    /// How URLs are treated: kept, removed, or replaced by a token.
    pub handle_urls: UrlHandling,
    /// How email addresses are treated: kept, removed, or replaced by a token.
    pub handle_emails: EmailHandling,
    /// How @mentions are treated: kept, removed, or replaced by a token.
    pub handle_mentions: MentionHandling,
    /// Normalize numbers by replacing each one with `number_token`.
    pub normalize_numbers: bool,
    /// Number replacement token; only consulted when `normalize_numbers` is set.
    pub number_token: String,
    /// Expand English contractions (e.g. "can't" -> "cannot").
    pub expand_contractions: bool,
    /// Enable spell checking. The stage is skipped while the preprocessor's
    /// dictionary is empty.
    pub spell_check: bool,
    /// Maximum edit distance a dictionary word may have from a token and
    /// still be accepted as its correction.
    pub max_edit_distance: usize,
    /// Remove diacritics/accents.
    pub remove_diacritics: bool,
    /// Perform unicode normalization (NFC).
    pub unicode_normalize: bool,
    /// Convert to lowercase.
    pub lowercase: bool,
    /// Collapse runs of whitespace into single spaces and trim both ends.
    pub normalize_whitespace: bool,
    /// Replace ASCII punctuation characters with spaces.
    pub remove_punctuation: bool,
}
64
impl Default for PreprocessConfig {
    /// Default configuration: strip HTML, remove URLs / emails / mentions,
    /// expand contractions, apply NFC normalization and whitespace
    /// normalization. Number normalization, spell checking, diacritics
    /// removal, lowercasing and punctuation removal are off.
    fn default() -> Self {
        Self {
            strip_html: true,
            handle_urls: UrlHandling::Remove,
            handle_emails: EmailHandling::Remove,
            handle_mentions: MentionHandling::Remove,
            normalize_numbers: false,
            // Only used once `normalize_numbers` is switched on.
            number_token: "<NUM>".to_string(),
            expand_contractions: true,
            spell_check: false,
            // Only used once `spell_check` is switched on.
            max_edit_distance: 2,
            remove_diacritics: false,
            unicode_normalize: true,
            lowercase: false,
            normalize_whitespace: true,
            remove_punctuation: false,
        }
    }
}
85
/// How to handle URLs.
///
/// With `Remove` and `Replace`, every URL that was found is also reported in
/// `PreprocessResult::extracted_urls`; with `Keep` no extraction takes place.
#[derive(Debug, Clone, PartialEq)]
pub enum UrlHandling {
    /// Leave URLs as-is.
    Keep,
    /// Remove URLs entirely.
    Remove,
    /// Replace each URL with the given token.
    Replace(String),
}
96
/// How to handle email addresses.
///
/// With `Remove` and `Replace`, every address that was found is also reported
/// in `PreprocessResult::extracted_emails`; with `Keep` no extraction takes
/// place.
#[derive(Debug, Clone, PartialEq)]
pub enum EmailHandling {
    /// Leave emails as-is.
    Keep,
    /// Remove emails entirely.
    Remove,
    /// Replace each email address with the given token.
    Replace(String),
}
107
/// How to handle @mentions.
///
/// With `Remove` and `Replace`, every mention that was found is also reported
/// in `PreprocessResult::extracted_mentions`; with `Keep` no extraction takes
/// place.
#[derive(Debug, Clone, PartialEq)]
pub enum MentionHandling {
    /// Leave mentions as-is.
    Keep,
    /// Remove mentions entirely.
    Remove,
    /// Replace each mention with the given token.
    Replace(String),
}
118
119// ---------------------------------------------------------------------------
120// Preprocessing result
121// ---------------------------------------------------------------------------
122
/// Result of text preprocessing.
///
/// Besides the transformed text, this carries everything the pipeline pulled
/// out of the input along the way. A list stays empty when its stage is
/// disabled or configured to keep the items in place.
#[derive(Debug, Clone)]
pub struct PreprocessResult {
    /// The preprocessed text.
    pub text: String,
    /// URLs extracted from the text.
    pub extracted_urls: Vec<String>,
    /// Email addresses extracted from the text.
    pub extracted_emails: Vec<String>,
    /// @mentions extracted from the text (including the leading `@`).
    pub extracted_mentions: Vec<String>,
    /// Numbers found in the text, as they appeared before being replaced.
    pub extracted_numbers: Vec<String>,
    /// Spelling corrections made (original -> corrected). The original is
    /// recorded lowercased and without surrounding punctuation.
    pub spelling_corrections: Vec<(String, String)>,
}
139
140// ---------------------------------------------------------------------------
141// Text Preprocessor
142// ---------------------------------------------------------------------------
143
/// The main text preprocessing pipeline.
///
/// Holds the configuration together with the lookup tables the individual
/// stages need. Build one with `TextPreprocessor::new` and run it with
/// `TextPreprocessor::process`.
#[derive(Debug, Clone)]
pub struct TextPreprocessor {
    /// Stage switches and parameters.
    config: PreprocessConfig,
    /// Spell check dictionary. Tokens are lowercased before lookup, so
    /// entries are expected to be lowercase.
    dictionary: HashSet<String>,
    /// Contraction mapping (contraction -> expansion).
    contractions: HashMap<String, String>,
}
153
154impl TextPreprocessor {
155    /// Create a new preprocessor with the given configuration.
156    pub fn new(config: PreprocessConfig) -> Self {
157        let contractions = build_contraction_map();
158        Self {
159            config,
160            dictionary: HashSet::new(),
161            contractions,
162        }
163    }
164
165    /// Set a custom dictionary for spell checking.
166    pub fn with_dictionary(mut self, words: impl IntoIterator<Item = String>) -> Self {
167        self.dictionary = words.into_iter().collect();
168        self
169    }
170
171    /// Add words to the spell checking dictionary.
172    pub fn add_dictionary_words(&mut self, words: impl IntoIterator<Item = String>) {
173        self.dictionary.extend(words);
174    }
175
176    /// Load a basic English dictionary (common words).
177    pub fn with_basic_dictionary(mut self) -> Self {
178        self.dictionary = build_basic_dictionary();
179        self
180    }
181
182    /// Process a text through the preprocessing pipeline.
183    pub fn process(&self, text: &str) -> Result<PreprocessResult> {
184        let mut result = PreprocessResult {
185            text: text.to_string(),
186            extracted_urls: Vec::new(),
187            extracted_emails: Vec::new(),
188            extracted_mentions: Vec::new(),
189            extracted_numbers: Vec::new(),
190            spelling_corrections: Vec::new(),
191        };
192
193        // 1. Unicode normalization
194        if self.config.unicode_normalize {
195            result.text = unicode_nfc_normalize(&result.text);
196        }
197
198        // 2. Strip HTML
199        if self.config.strip_html {
200            result.text = strip_html_tags(&result.text);
201        }
202
203        // 3. Handle URLs
204        let (text_after_urls, urls) =
205            extract_and_handle_urls(&result.text, &self.config.handle_urls);
206        result.text = text_after_urls;
207        result.extracted_urls = urls;
208
209        // 4. Handle emails
210        let (text_after_emails, emails) =
211            extract_and_handle_emails(&result.text, &self.config.handle_emails);
212        result.text = text_after_emails;
213        result.extracted_emails = emails;
214
215        // 5. Handle mentions
216        let (text_after_mentions, mentions) =
217            extract_and_handle_mentions(&result.text, &self.config.handle_mentions);
218        result.text = text_after_mentions;
219        result.extracted_mentions = mentions;
220
221        // 6. Expand contractions
222        if self.config.expand_contractions {
223            result.text = self.expand_contractions_text(&result.text);
224        }
225
226        // 7. Normalize numbers
227        if self.config.normalize_numbers {
228            let (text, numbers) = normalize_numbers(&result.text, &self.config.number_token);
229            result.text = text;
230            result.extracted_numbers = numbers;
231        }
232
233        // 8. Remove diacritics
234        if self.config.remove_diacritics {
235            result.text = remove_diacritics_from_text(&result.text);
236        }
237
238        // 9. Lowercase
239        if self.config.lowercase {
240            result.text = result.text.to_lowercase();
241        }
242
243        // 10. Remove punctuation
244        if self.config.remove_punctuation {
245            result.text = remove_punctuation(&result.text);
246        }
247
248        // 11. Spell check
249        if self.config.spell_check && !self.dictionary.is_empty() {
250            let (text, corrections) =
251                self.spell_check_text(&result.text, self.config.max_edit_distance);
252            result.text = text;
253            result.spelling_corrections = corrections;
254        }
255
256        // 12. Normalize whitespace (always last for clean output)
257        if self.config.normalize_whitespace {
258            result.text = normalize_whitespace(&result.text);
259        }
260
261        Ok(result)
262    }
263
264    /// Expand contractions in text.
265    fn expand_contractions_text(&self, text: &str) -> String {
266        let mut result = text.to_string();
267
268        // Sort contractions by length descending to match longer ones first
269        let mut sorted_contractions: Vec<(&String, &String)> = self.contractions.iter().collect();
270        sorted_contractions.sort_by_key(|(k, _)| std::cmp::Reverse(k.len()));
271
272        for (contraction, expansion) in &sorted_contractions {
273            // Case-insensitive replacement
274            let lower = result.to_lowercase();
275            let contraction_lower = contraction.to_lowercase();
276            let mut new_result = String::with_capacity(result.len());
277            let mut search_from = 0;
278
279            loop {
280                let lower_slice = &lower[search_from..];
281                match lower_slice.find(&contraction_lower) {
282                    Some(pos) => {
283                        new_result.push_str(&result[search_from..search_from + pos]);
284                        new_result.push_str(expansion);
285                        search_from += pos + contraction.len();
286                    }
287                    None => {
288                        new_result.push_str(&result[search_from..]);
289                        break;
290                    }
291                }
292            }
293            result = new_result;
294        }
295        result
296    }
297
298    /// Spell check text and return corrected text with corrections list.
299    fn spell_check_text(&self, text: &str, max_distance: usize) -> (String, Vec<(String, String)>) {
300        let mut corrections = Vec::new();
301        let words: Vec<&str> = text.split_whitespace().collect();
302        let mut result_words = Vec::with_capacity(words.len());
303
304        for word in &words {
305            let clean_word = word
306                .trim_matches(|c: char| !c.is_alphanumeric())
307                .to_lowercase();
308
309            if clean_word.is_empty() || self.dictionary.contains(&clean_word) {
310                result_words.push(word.to_string());
311                continue;
312            }
313
314            // Find closest dictionary word
315            if let Some(correction) = find_closest_word(&clean_word, &self.dictionary, max_distance)
316            {
317                corrections.push((clean_word.clone(), correction.clone()));
318                // Preserve original casing pattern for the replacement
319                let corrected = transfer_casing(word, &correction);
320                result_words.push(corrected);
321            } else {
322                result_words.push(word.to_string());
323            }
324        }
325
326        (result_words.join(" "), corrections)
327    }
328}
329
330// ---------------------------------------------------------------------------
331// HTML stripping
332// ---------------------------------------------------------------------------
333
/// Remove HTML/XML tags from text.
///
/// Handles self-closing tags, attributes, and HTML entities.
///
/// A `<` only opens a tag when it is followed by a tag-like character
/// (a letter, `/`, `!`, `?`, `_` or `:`) and a closing `>` exists further on.
/// Any other `<`, as in `"1 < 2"` or `"<3"`, is kept as literal text instead
/// of swallowing the rest of the input. Entities outside of tags are decoded;
/// unknown or malformed entities are left untouched.
pub fn strip_html_tags(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut result = String::with_capacity(text.len());
    let mut i = 0;

    while i < chars.len() {
        match chars[i] {
            '<' => match find_html_tag_end(&chars, i) {
                // Skip the whole tag, including its closing '>'.
                Some(close) => i = close + 1,
                None => {
                    result.push('<');
                    i += 1;
                }
            },
            '&' => match try_decode_entity(&chars, i) {
                Some((decoded, next)) => {
                    result.push(decoded);
                    i = next;
                }
                None => {
                    result.push('&');
                    i += 1;
                }
            },
            other => {
                result.push(other);
                i += 1;
            }
        }
    }
    result
}

/// Locate the `>` that closes the tag opened by the `<` at `start`.
/// Returns `None` when the `<` does not look like the start of a tag or the
/// tag is never closed.
fn find_html_tag_end(chars: &[char], start: usize) -> Option<usize> {
    let opener = *chars.get(start + 1)?;
    let looks_like_tag = opener.is_alphabetic()
        || opener == '/'
        || opener == '!'
        || opener == '?'
        || opener == '_'
        || opener == ':';
    if !looks_like_tag {
        return None;
    }
    chars[start + 1..]
        .iter()
        .position(|&c| c == '>')
        .map(|offset| start + 1 + offset)
}

/// Try to decode an HTML entity starting at position `start`.
/// Returns (decoded_char, next_position) or None.
///
/// Supports the named entities `amp`, `lt`, `gt`, `quot`, `apos`, `nbsp`
/// and numeric references in decimal (`&#65;`) or hexadecimal (`&#x41;`)
/// form. The terminating `;` must appear within 9 characters after `start`.
fn try_decode_entity(chars: &[char], start: usize) -> Option<(char, usize)> {
    // Find the semicolon, looking at most 9 characters ahead.
    let limit = chars.len().min(start + 10);
    let end = (start + 1..limit).find(|&idx| chars[idx] == ';')?;

    let entity: String = chars[start..=end].iter().collect();
    let decoded = match entity.as_str() {
        "&amp;" => Some('&'),
        "&lt;" => Some('<'),
        "&gt;" => Some('>'),
        "&quot;" => Some('"'),
        "&apos;" => Some('\''),
        "&nbsp;" => Some(' '),
        other => {
            // Numeric entities; `body` is the entity without the final ';'.
            let body = &other[..other.len() - 1];
            if let Some(hex) = body.strip_prefix("&#x").or_else(|| body.strip_prefix("&#X")) {
                // Digits only: the integer parser would also accept a sign.
                if hex.bytes().all(|b| b.is_ascii_hexdigit()) {
                    u32::from_str_radix(hex, 16).ok().and_then(char::from_u32)
                } else {
                    None
                }
            } else if let Some(dec) = body.strip_prefix("&#") {
                if dec.bytes().all(|b| b.is_ascii_digit()) {
                    dec.parse::<u32>().ok().and_then(char::from_u32)
                } else {
                    None
                }
            } else {
                None
            }
        }
    };
    decoded.map(|c| (c, end + 1))
}
409
410// ---------------------------------------------------------------------------
411// URL extraction and handling
412// ---------------------------------------------------------------------------
413
414/// Extract and handle URLs in text.
415fn extract_and_handle_urls(text: &str, handling: &UrlHandling) -> (String, Vec<String>) {
416    let mut urls = Vec::new();
417
418    match handling {
419        UrlHandling::Keep => (text.to_string(), urls),
420        UrlHandling::Remove | UrlHandling::Replace(_) => {
421            let replacement = match handling {
422                UrlHandling::Replace(token) => token.as_str(),
423                _ => "",
424            };
425            let result =
426                replace_pattern_simple(text, is_url_start, find_url_end, replacement, &mut urls);
427            (result, urls)
428        }
429    }
430}
431
/// Check if a URL starts at this position.
///
/// `pos` is a byte offset into `text`. A URL is recognized by one of the
/// prefixes `http://`, `https://`, `ftp://` or `www.`, compared without
/// regard to ASCII case (URI schemes are case-insensitive). Returns `false`
/// rather than panicking when `pos` is out of range or not on a character
/// boundary.
fn is_url_start(text: &str, pos: usize) -> bool {
    const URL_PREFIXES: [&str; 4] = ["http://", "https://", "ftp://", "www."];

    let remaining = match text.get(pos..) {
        Some(rest) => rest.as_bytes(),
        None => return false,
    };
    URL_PREFIXES.iter().any(|prefix| {
        remaining
            .get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    })
}
440
/// Find end of URL starting at `start`.
///
/// Both `start` and the returned value are byte offsets into `text`. The URL
/// runs up to the first space, tab, newline, carriage return, `>` or `"`;
/// trailing `. , ) ] ; : ! ?` are then dropped because they usually belong
/// to the surrounding sentence. Returns `start` if nothing remains.
fn find_url_end(text: &str, start: usize) -> usize {
    const TERMINATORS: &[u8] = b" \t\n\r>\"";
    const TRAILING_PUNCTUATION: &[u8] = b".,)];:!?";

    let tail = text.as_bytes().get(start..).unwrap_or(&[]);

    // Length of the run up to (not including) the first terminator.
    let raw_len = tail
        .iter()
        .position(|b| TERMINATORS.contains(b))
        .unwrap_or(tail.len());

    // Keep everything up to the last byte that is not trailing punctuation.
    let kept_len = tail[..raw_len]
        .iter()
        .rposition(|b| !TRAILING_PUNCTUATION.contains(b))
        .map_or(0, |last| last + 1);

    start + kept_len
}
472
473// ---------------------------------------------------------------------------
474// Email extraction and handling
475// ---------------------------------------------------------------------------
476
477/// Extract and handle email addresses.
478fn extract_and_handle_emails(text: &str, handling: &EmailHandling) -> (String, Vec<String>) {
479    let mut emails = Vec::new();
480
481    match handling {
482        EmailHandling::Keep => (text.to_string(), emails),
483        EmailHandling::Remove | EmailHandling::Replace(_) => {
484            let replacement = match handling {
485                EmailHandling::Replace(token) => token.as_str(),
486                _ => "",
487            };
488            let result = find_and_replace_emails(text, replacement, &mut emails);
489            (result, emails)
490        }
491    }
492}
493
/// Find and replace email addresses in text.
///
/// An address is an `@` with a non-empty local part before it (letters,
/// digits, `.`, `_`, `+`, `-`, `%`) and a domain after it (letters, digits,
/// `.`, `-`) that contains at least one dot. Trailing `.` / `-` characters
/// are not counted as part of the domain, so sentence punctuation directly
/// after an address stays in the text.
///
/// Every address found is appended to `extracted` and replaced by
/// `replacement` in the returned string.
fn find_and_replace_emails(text: &str, replacement: &str, extracted: &mut Vec<String>) -> String {
    let is_local_char =
        |c: char| c.is_alphanumeric() || c == '.' || c == '_' || c == '+' || c == '-' || c == '%';
    let is_domain_char = |c: char| c.is_alphanumeric() || c == '.' || c == '-';

    let chars: Vec<char> = text.chars().collect();
    let mut result = String::with_capacity(text.len());
    // Index of the first char not swallowed by an earlier match. Every char
    // from here up to `i` has been pushed to `result` one-to-one.
    let mut consumed = 0;
    let mut i = 0;

    while i < chars.len() {
        if chars[i] == '@' {
            // Find local part (before @). Never walk back into an earlier
            // match: those chars are no longer in `result`, so popping them
            // would eat unrelated output.
            let mut local_start = i;
            while local_start > consumed && is_local_char(chars[local_start - 1]) {
                local_start -= 1;
            }

            // Find domain part (after @)
            let mut domain_end = i + 1;
            while domain_end < chars.len() && is_domain_char(chars[domain_end]) {
                domain_end += 1;
            }
            // A domain cannot end in '.' or '-'; such characters belong to
            // the surrounding text.
            while domain_end > i + 1
                && (chars[domain_end - 1] == '.' || chars[domain_end - 1] == '-')
            {
                domain_end -= 1;
            }
            let has_dot = chars[i + 1..domain_end].contains(&'.');

            if local_start < i && has_dot {
                extracted.push(chars[local_start..domain_end].iter().collect());

                // Remove what we already wrote for the local part
                for _ in local_start..i {
                    result.pop();
                }

                result.push_str(replacement);
                i = domain_end;
                consumed = domain_end;
                continue;
            }
        }

        result.push(chars[i]);
        i += 1;
    }
    result
}
550
551// ---------------------------------------------------------------------------
552// Mention extraction and handling
553// ---------------------------------------------------------------------------
554
555/// Extract and handle @mentions.
556fn extract_and_handle_mentions(text: &str, handling: &MentionHandling) -> (String, Vec<String>) {
557    let mut mentions = Vec::new();
558
559    match handling {
560        MentionHandling::Keep => (text.to_string(), mentions),
561        MentionHandling::Remove | MentionHandling::Replace(_) => {
562            let replacement = match handling {
563                MentionHandling::Replace(token) => token.as_str(),
564                _ => "",
565            };
566            let result = find_and_replace_mentions(text, replacement, &mut mentions);
567            (result, mentions)
568        }
569    }
570}
571
/// Find and replace @mentions.
///
/// A mention is an `@` at the start of the text or directly after
/// whitespace, followed by at least one alphanumeric character or
/// underscore. Each mention (including its `@`) is appended to `extracted`
/// and replaced by `replacement` in the returned string.
fn find_and_replace_mentions(text: &str, replacement: &str, extracted: &mut Vec<String>) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::with_capacity(text.len());
    let mut idx = 0;

    while let Some(&current) = chars.get(idx) {
        let at_word_start = idx == 0 || chars[idx - 1].is_whitespace();
        if current == '@' && at_word_start {
            // Number of handle characters following the '@'.
            let handle_len = chars[idx + 1..]
                .iter()
                .take_while(|c| c.is_alphanumeric() || **c == '_')
                .count();
            if handle_len > 0 {
                let stop = idx + 1 + handle_len;
                extracted.push(chars[idx..stop].iter().collect());
                output.push_str(replacement);
                idx = stop;
                continue;
            }
        }
        output.push(current);
        idx += 1;
    }
    output
}
601
602// ---------------------------------------------------------------------------
603// Number normalization
604// ---------------------------------------------------------------------------
605
/// Normalize numbers in text by replacing them with a token.
///
/// A number is a run of ASCII digits, optionally with a leading `-` (only at
/// the start of the text or after whitespace), comma-separated digit groups,
/// a decimal fraction and an exponent (`e`/`E`, optional sign, digits).
/// Returns the rewritten text and the numbers in order of appearance.
fn normalize_numbers(text: &str, token: &str) -> (String, Vec<String>) {
    // Advance `pos` past a (possibly empty) run of ASCII digits.
    fn skip_digits(chars: &[char], mut pos: usize) -> usize {
        while pos < chars.len() && chars[pos].is_ascii_digit() {
            pos += 1;
        }
        pos
    }

    let chars: Vec<char> = text.chars().collect();
    let digit_at = |pos: usize| chars.get(pos).map_or(false, |c| c.is_ascii_digit());

    let mut replaced = String::with_capacity(text.len());
    let mut found = Vec::new();
    let mut cursor = 0;

    while cursor < chars.len() {
        let current = chars[cursor];
        // A minus sign only counts when it starts a token and digits follow.
        let signed = current == '-'
            && digit_at(cursor + 1)
            && (cursor == 0 || chars[cursor - 1].is_whitespace());

        if !current.is_ascii_digit() && !signed {
            replaced.push(current);
            cursor += 1;
            continue;
        }

        let start = cursor;
        if signed {
            cursor += 1;
        }

        // Integer part
        cursor = skip_digits(&chars, cursor);

        // Comma-separated groups: a comma must be followed by a digit
        while chars.get(cursor) == Some(&',') && digit_at(cursor + 1) {
            cursor = skip_digits(&chars, cursor + 1);
        }

        // Decimal part: a dot must be followed by a digit
        if chars.get(cursor) == Some(&'.') && digit_at(cursor + 1) {
            cursor = skip_digits(&chars, cursor + 1);
        }

        // Scientific notation: only consumed when digits follow the marker
        if cursor < chars.len() && (chars[cursor] == 'e' || chars[cursor] == 'E') {
            let mut probe = cursor + 1;
            if probe < chars.len() && (chars[probe] == '+' || chars[probe] == '-') {
                probe += 1;
            }
            if digit_at(probe) {
                cursor = skip_digits(&chars, probe);
            }
        }

        found.push(chars[start..cursor].iter().collect());
        replaced.push_str(token);
    }

    (replaced, found)
}
672
673// ---------------------------------------------------------------------------
674// Diacritics removal
675// ---------------------------------------------------------------------------
676
677/// Remove diacritics/accents from text.
678///
679/// Uses Unicode decomposition to separate base characters from combining marks.
680pub fn remove_diacritics_from_text(text: &str) -> String {
681    use unicode_normalization::UnicodeNormalization;
682
683    text.nfd().filter(|c| !is_combining_mark(*c)).collect()
684}
685
/// Check if a character is a Unicode combining mark.
///
/// Only the four combining-diacritics blocks listed below are considered.
fn is_combining_mark(c: char) -> bool {
    // (first, last) code point of each block, inclusive.
    const COMBINING_BLOCKS: [(u32, u32); 4] = [
        (0x0300, 0x036F), // Combining Diacritical Marks
        (0x1AB0, 0x1AFF), // Combining Diacritical Marks Extended
        (0x1DC0, 0x1DFF), // Combining Diacritical Marks Supplement
        (0xFE20, 0xFE2F), // Combining Half Marks
    ];

    let code = u32::from(c);
    COMBINING_BLOCKS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code))
}
698
699// ---------------------------------------------------------------------------
700// Unicode normalization
701// ---------------------------------------------------------------------------
702
703/// Apply Unicode NFC normalization.
704fn unicode_nfc_normalize(text: &str) -> String {
705    use unicode_normalization::UnicodeNormalization;
706    text.nfc().collect()
707}
708
709// ---------------------------------------------------------------------------
710// Whitespace normalization
711// ---------------------------------------------------------------------------
712
/// Normalize whitespace: collapse multiple spaces, trim.
///
/// Every run of Unicode whitespace becomes a single ASCII space, and leading
/// and trailing whitespace is removed.
pub fn normalize_whitespace(text: &str) -> String {
    text.split_whitespace().collect::<Vec<&str>>().join(" ")
}
736
737// ---------------------------------------------------------------------------
738// Punctuation removal
739// ---------------------------------------------------------------------------
740
/// Remove punctuation from text.
///
/// Each ASCII punctuation character is replaced by a single space, so
/// neighbouring words do not run together. Non-ASCII punctuation is left
/// untouched.
fn remove_punctuation(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for ch in text.chars() {
        cleaned.push(if ch.is_ascii_punctuation() { ' ' } else { ch });
    }
    cleaned
}
747
748// ---------------------------------------------------------------------------
749// Spell checking
750// ---------------------------------------------------------------------------
751
/// Compute edit distance (Levenshtein) between two strings.
///
/// The distance is the minimum number of single-character insertions,
/// deletions and substitutions that turn `a` into `b`. Characters are
/// Unicode scalar values, not bytes.
pub fn edit_distance(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    if source.is_empty() {
        return target.len();
    }
    if target.is_empty() {
        return source.len();
    }

    // Two rolling rows of the DP table: `previous` holds the distances for
    // the source prefix one char shorter than the one being filled in.
    let mut previous: Vec<usize> = (0..=target.len()).collect();
    let mut current = vec![0usize; target.len() + 1];

    for (row, &source_char) in source.iter().enumerate() {
        current[0] = row + 1;
        for (col, &target_char) in target.iter().enumerate() {
            let deletion = previous[col + 1] + 1;
            let insertion = current[col] + 1;
            let substitution = previous[col] + usize::from(source_char != target_char);
            current[col + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut previous, &mut current);
    }
    previous[target.len()]
}
787
788/// Find the closest word in the dictionary within max_distance.
789fn find_closest_word(
790    word: &str,
791    dictionary: &HashSet<String>,
792    max_distance: usize,
793) -> Option<String> {
794    let mut best: Option<(String, usize)> = None;
795
796    for dict_word in dictionary {
797        // Quick length filter
798        let len_diff = if word.len() > dict_word.len() {
799            word.len() - dict_word.len()
800        } else {
801            dict_word.len() - word.len()
802        };
803        if len_diff > max_distance {
804            continue;
805        }
806
807        let dist = edit_distance(word, dict_word);
808        if dist <= max_distance {
809            match &best {
810                None => best = Some((dict_word.clone(), dist)),
811                Some((_, best_dist)) => {
812                    if dist < *best_dist {
813                        best = Some((dict_word.clone(), dist));
814                    }
815                }
816            }
817        }
818    }
819
820    best.map(|(w, _)| w)
821}
822
/// Transfer casing pattern from source to target.
///
/// Only the letters of `source` are inspected, so digits or punctuation
/// around a word do not hide its pattern:
///
/// - every letter uppercase: `target` is returned in uppercase,
/// - first letter uppercase: the first character of `target` is uppercased,
/// - anything else (including a `source` without letters): `target` is
///   returned unchanged.
///
/// An empty `target` yields an empty string.
fn transfer_casing(source: &str, target: &str) -> String {
    let mut letters = source.chars().filter(|c| c.is_alphabetic()).peekable();

    let starts_upper = match letters.peek() {
        Some(first) => first.is_uppercase(),
        // No letters at all: there is no casing pattern to transfer.
        None => return target.to_string(),
    };
    if !starts_upper {
        return target.to_string();
    }

    // `peek` did not consume the first letter, so this checks all of them.
    if letters.all(|c| c.is_uppercase()) {
        return target.to_uppercase();
    }

    // Capitalized: uppercase the first char, keep the rest as-is.
    let mut rest = target.chars();
    match rest.next() {
        Some(first) => first.to_uppercase().chain(rest).collect(),
        None => String::new(),
    }
}
849
850// ---------------------------------------------------------------------------
851// Contraction map
852// ---------------------------------------------------------------------------
853
/// Build the English contraction mapping.
///
/// Keys are lowercase contractions, values their lowercase expansions.
fn build_contraction_map() -> HashMap<String, String> {
    const CONTRACTIONS: &[(&str, &str)] = &[
        // Negations
        ("can't", "cannot"),
        ("won't", "will not"),
        ("don't", "do not"),
        ("doesn't", "does not"),
        ("didn't", "did not"),
        ("isn't", "is not"),
        ("aren't", "are not"),
        ("wasn't", "was not"),
        ("weren't", "were not"),
        ("hasn't", "has not"),
        ("haven't", "have not"),
        ("hadn't", "had not"),
        ("wouldn't", "would not"),
        ("couldn't", "could not"),
        ("shouldn't", "should not"),
        ("mustn't", "must not"),
        ("needn't", "need not"),
        ("shan't", "shall not"),
        ("mightn't", "might not"),
        // "'s" forms
        ("it's", "it is"),
        ("that's", "that is"),
        ("what's", "what is"),
        ("where's", "where is"),
        ("who's", "who is"),
        ("there's", "there is"),
        ("here's", "here is"),
        ("let's", "let us"),
        // "'m" / "'re" forms
        ("i'm", "i am"),
        ("you're", "you are"),
        ("we're", "we are"),
        ("they're", "they are"),
        // "'ve" forms
        ("i've", "i have"),
        ("you've", "you have"),
        ("we've", "we have"),
        ("they've", "they have"),
        // "'ll" forms
        ("i'll", "i will"),
        ("you'll", "you will"),
        ("he'll", "he will"),
        ("she'll", "she will"),
        ("it'll", "it will"),
        ("we'll", "we will"),
        ("they'll", "they will"),
        // "'d" forms
        ("i'd", "i would"),
        ("you'd", "you would"),
        ("he'd", "he would"),
        ("she'd", "she would"),
        ("we'd", "we would"),
        ("they'd", "they would"),
    ];

    CONTRACTIONS
        .iter()
        .map(|&(contraction, expansion)| (contraction.to_string(), expansion.to_string()))
        .collect()
}
912
913// ---------------------------------------------------------------------------
914// Basic dictionary
915// ---------------------------------------------------------------------------
916
/// Assemble the small built-in English word list used as a spelling dictionary.
///
/// The list holds lowercase common English words followed by a handful of
/// computing-related terms. Entries are collected into a set, so a word
/// listed more than once is stored only once.
fn build_basic_dictionary() -> HashSet<String> {
    #[rustfmt::skip]
    const WORDS: &[&str] = &[
        // Common English words
        "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
        "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
        "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
        "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
        "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
        "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
        "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
        "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
        "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
        "even", "new", "want", "because", "any", "these", "give", "day", "most", "us",
        "great", "world", "very", "much", "been",
        // Greeting and computing vocabulary
        "hello", "world", "computer", "science", "data", "machine", "learning",
        "algorithm", "programming", "software", "system", "network", "internet",
        "technology", "digital", "information", "process", "language", "text",
    ];

    let mut dictionary = HashSet::with_capacity(WORDS.len());
    for &word in WORDS {
        dictionary.insert(String::from(word));
    }
    dictionary
}
1047
1048// ---------------------------------------------------------------------------
1049// Generic pattern replacement helper
1050// ---------------------------------------------------------------------------
1051
/// Replace every detector-identified span of `text` with `replacement`.
///
/// The text is scanned left to right, one character at a time. At each
/// character boundary `i`, `is_start(text, i)` is asked whether a pattern
/// begins there; if so, `find_end(text, i)` supplies the exclusive end byte
/// offset of that pattern. The matched slice `text[i..end]` is appended to
/// `extracted`, `replacement` is written to the output in its place, and
/// scanning resumes at `end`. Every other character is copied through
/// unchanged.
///
/// A candidate is ignored (and its first character copied verbatim) when
/// `find_end` reports an empty span, an offset past the end of `text`, or an
/// offset that does not fall on a UTF-8 character boundary. This keeps a
/// misbehaving detector from causing a slicing panic.
///
/// # Arguments
///
/// * `text` - Input to scan.
/// * `is_start` - Returns `true` when a pattern starts at the given byte offset.
/// * `find_end` - Returns the exclusive end byte offset of the pattern that
///   starts at the given byte offset.
/// * `replacement` - Text written in place of each match (may be empty).
/// * `extracted` - Receives each matched span, in order of appearance.
///
/// # Returns
///
/// The rewritten text.
fn replace_pattern_simple(
    text: &str,
    is_start: fn(&str, usize) -> bool,
    find_end: fn(&str, usize) -> usize,
    replacement: &str,
    extracted: &mut Vec<String>,
) -> String {
    let mut result = String::with_capacity(text.len());
    // Invariant: `i` is always a char boundary with `i <= text.len()`, because
    // it only advances by whole characters or to an `end` validated by `get`.
    let mut i = 0;

    while let Some(c) = text[i..].chars().next() {
        if is_start(text, i) {
            let end = find_end(text, i);
            if end > i {
                // Checked slicing: `None` for out-of-range or mid-character
                // offsets, in which case the candidate is treated as no match.
                if let Some(matched) = text.get(i..end) {
                    extracted.push(matched.to_string());
                    result.push_str(replacement);
                    i = end;
                    continue;
                }
            }
        }
        // No (valid) match here: copy exactly one UTF-8 character.
        result.push(c);
        i += c.len_utf8();
    }
    result
}
1081
1082// ---------------------------------------------------------------------------
1083// Tests
1084// ---------------------------------------------------------------------------
1085
#[cfg(test)]
mod tests {
    use super::*;

    /// Configuration with HTML stripping disabled and contraction expansion enabled.
    fn contraction_config() -> PreprocessConfig {
        PreprocessConfig {
            strip_html: false,
            expand_contractions: true,
            ..Default::default()
        }
    }

    // ----- HTML stripping -----

    #[test]
    fn test_strip_html_basic() {
        let cases = [("<p>hello</p>", "hello"), ("<b>bold</b> text", "bold text")];
        for &(input, expected) in &cases {
            assert_eq!(strip_html_tags(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_strip_html_nested() {
        let stripped = strip_html_tags("<div><p>nested</p></div>");
        assert_eq!(stripped, "nested");
    }

    #[test]
    fn test_strip_html_entities() {
        let cases = [
            ("a &amp; b", "a & b"),
            ("a &lt; b", "a < b"),
            ("a &gt; b", "a > b"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(strip_html_tags(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_strip_html_no_tags() {
        let plain = "no tags here";
        assert_eq!(strip_html_tags(plain), "no tags here");
    }

    // ----- URLs -----

    #[test]
    fn test_url_detection() {
        let mode = UrlHandling::Remove;
        let (cleaned, found) = extract_and_handle_urls("visit https://example.com for info", &mode);
        assert!(!cleaned.contains("https://"));
        assert_eq!(found.len(), 1);
        assert_eq!(found[0], "https://example.com");
    }

    #[test]
    fn test_url_replacement() {
        let mode = UrlHandling::Replace("<URL>".to_string());
        let (cleaned, found) = extract_and_handle_urls("check https://example.com now", &mode);
        assert!(cleaned.contains("<URL>"));
        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_url_keep() {
        let mode = UrlHandling::Keep;
        let (cleaned, found) = extract_and_handle_urls("see https://example.com", &mode);
        assert!(cleaned.contains("https://example.com"));
        assert!(found.is_empty());
    }

    // ----- Emails and mentions -----

    #[test]
    fn test_email_detection() {
        let mode = EmailHandling::Remove;
        let (cleaned, found) =
            extract_and_handle_emails("contact user@example.com for help", &mode);
        assert!(!cleaned.contains("user@example.com"));
        assert_eq!(found.len(), 1);
        assert_eq!(found[0], "user@example.com");
    }

    #[test]
    fn test_mention_detection() {
        let mode = MentionHandling::Remove;
        let (cleaned, found) = extract_and_handle_mentions("hello @user123 how are you", &mode);
        assert!(!cleaned.contains("@user123"));
        assert_eq!(found.len(), 1);
        assert_eq!(found[0], "@user123");
    }

    #[test]
    fn test_mention_replacement() {
        let mode = MentionHandling::Replace("<MENTION>".to_string());
        let (cleaned, _) = extract_and_handle_mentions("hi @alice and @bob", &mode);
        assert!(cleaned.contains("<MENTION>"));
        assert!(!cleaned.contains("@alice"));
    }

    // ----- Numbers -----

    #[test]
    fn test_number_normalization() {
        let (replaced, found) = normalize_numbers("I have 42 cats and 3.14 dogs", "<NUM>");
        assert!(replaced.contains("<NUM>"));
        assert_eq!(found.len(), 2);
        for literal in &["42", "3.14"] {
            assert!(found.contains(&literal.to_string()), "missing {:?}", literal);
        }
    }

    #[test]
    fn test_number_with_commas() {
        let (replaced, found) = normalize_numbers("population: 1,234,567", "<NUM>");
        assert!(replaced.contains("<NUM>"));
        assert_eq!(found.len(), 1);
    }

    // ----- Contractions -----

    #[test]
    fn test_contraction_expansion() {
        let pipeline = TextPreprocessor::new(contraction_config());
        let output = pipeline.process("I can't do this").expect("process failed");
        assert!(output.text.contains("cannot"));
    }

    #[test]
    fn test_contraction_wont() {
        let pipeline = TextPreprocessor::new(contraction_config());
        let output = pipeline.process("I won't go").expect("process failed");
        assert!(output.text.contains("will not"));
    }

    // ----- Unicode and whitespace -----

    #[test]
    fn test_diacritics_removal() {
        // "café" spelled with a combining acute accent (U+0301).
        let stripped = remove_diacritics_from_text("cafe\u{0301}");
        assert_eq!(stripped, "cafe");
    }

    #[test]
    fn test_diacritics_spanish() {
        // "niño" spelled with the precomposed U+00F1.
        let stripped = remove_diacritics_from_text("ni\u{00f1}o");
        assert_eq!(stripped, "nino");
    }

    #[test]
    fn test_whitespace_normalization() {
        let cases = [("  hello   world  ", "hello world"), ("a\t\nb", "a b")];
        for &(input, expected) in &cases {
            assert_eq!(normalize_whitespace(input), expected, "input: {:?}", input);
        }
    }

    // ----- Spell checking primitives -----

    #[test]
    fn test_edit_distance() {
        let cases = [
            ("kitten", "sitting", 3),
            ("", "abc", 3),
            ("abc", "abc", 0),
            ("abc", "", 3),
        ];
        for &(left, right, expected) in &cases {
            assert_eq!(
                edit_distance(left, right),
                expected,
                "pair: {:?} / {:?}",
                left,
                right
            );
        }
    }

    #[test]
    fn test_spell_check() {
        let dictionary: HashSet<String> = ["hello", "world", "computer"]
            .iter()
            .map(|word| word.to_string())
            .collect();

        let suggestion = find_closest_word("helo", &dictionary, 2);
        assert_eq!(suggestion, Some("hello".to_string()));
    }

    #[test]
    fn test_spell_check_no_match() {
        let dictionary: HashSet<String> = std::iter::once("hello".to_string()).collect();

        let suggestion = find_closest_word("zzzzz", &dictionary, 1);
        assert!(suggestion.is_none());
    }

    // ----- Full pipeline -----

    #[test]
    fn test_full_pipeline() {
        let pipeline = TextPreprocessor::new(PreprocessConfig {
            strip_html: true,
            handle_urls: UrlHandling::Replace("<URL>".to_string()),
            handle_emails: EmailHandling::Replace("<EMAIL>".to_string()),
            handle_mentions: MentionHandling::Replace("<MENTION>".to_string()),
            normalize_numbers: true,
            expand_contractions: true,
            unicode_normalize: true,
            normalize_whitespace: true,
            ..Default::default()
        });

        let input = "<p>I can't believe https://example.com has @user with 42 items!</p>";
        let output = pipeline.process(input).expect("process failed");

        // Markup is gone and the contraction was expanded.
        assert!(!output.text.contains("<p>"));
        assert!(output.text.contains("cannot"));
        // Placeholders were substituted ...
        assert!(output.text.contains("<URL>"));
        assert!(output.text.contains("<MENTION>"));
        // ... and the originals were recorded.
        assert!(!output.extracted_urls.is_empty());
        assert!(!output.extracted_mentions.is_empty());
    }

    #[test]
    fn test_pipeline_defaults() {
        let pipeline = TextPreprocessor::new(PreprocessConfig::default());
        let output = pipeline.process("Hello World").expect("process failed");
        assert_eq!(output.text, "Hello World");
    }

    #[test]
    fn test_punctuation_removal() {
        let stripped = remove_punctuation("Hello, world! How are you?");
        for &mark in &[',', '!', '?'] {
            assert!(!stripped.contains(mark), "still contains {:?}", mark);
        }
    }

    #[test]
    fn test_transfer_casing() {
        let cases = [
            ("Hello", "world", "World"),
            ("HELLO", "world", "WORLD"),
            ("hello", "WORLD", "WORLD"),
        ];
        for &(source, target, expected) in &cases {
            assert_eq!(
                transfer_casing(source, target),
                expected,
                "source: {:?}, target: {:?}",
                source,
                target
            );
        }
    }

    #[test]
    fn test_basic_dictionary() {
        let dictionary = build_basic_dictionary();
        for word in &["the", "hello"] {
            assert!(dictionary.contains(*word), "missing {:?}", word);
        }
    }

    #[test]
    fn test_spell_check_integration() {
        let pipeline = TextPreprocessor::new(PreprocessConfig {
            strip_html: false,
            expand_contractions: false,
            spell_check: true,
            max_edit_distance: 2,
            normalize_whitespace: true,
            ..Default::default()
        })
        .with_basic_dictionary();

        let output = pipeline.process("helo wrld").expect("process failed");
        // Expected corrections are "helo" -> "hello" and "wrld" -> "world";
        // only the presence of at least one correction is asserted here.
        assert!(!output.spelling_corrections.is_empty());
    }

    #[test]
    fn test_numeric_entity_decode() {
        // Decimal and hexadecimal numeric character references for 'A'.
        for &entity in &["&#65;", "&#x41;"] {
            assert_eq!(strip_html_tags(entity), "A", "entity: {:?}", entity);
        }
    }

    #[test]
    fn test_empty_input() {
        let pipeline = TextPreprocessor::new(PreprocessConfig::default());
        let output = pipeline.process("").expect("process failed");
        assert_eq!(output.text, "");
    }

    #[test]
    fn test_multiple_urls() {
        let mode = UrlHandling::Remove;
        let (cleaned, found) =
            extract_and_handle_urls("see https://a.com and https://b.com", &mode);
        assert_eq!(found.len(), 2);
        assert!(!cleaned.contains("https://"));
    }

    #[test]
    fn test_lowercase() {
        let pipeline = TextPreprocessor::new(PreprocessConfig {
            strip_html: false,
            expand_contractions: false,
            lowercase: true,
            ..Default::default()
        });
        let output = pipeline.process("Hello WORLD").expect("process failed");
        assert_eq!(output.text, "hello world");
    }

    #[test]
    fn test_scientific_notation() {
        let (replaced, found) = normalize_numbers("value is 1.5e10 and 2E-3", "<NUM>");
        assert_eq!(found.len(), 2);
        assert!(replaced.contains("<NUM>"));
    }

    #[test]
    fn test_negative_numbers() {
        let (replaced, found) = normalize_numbers("temperature: -42 degrees", "<NUM>");
        assert!(found.contains(&"-42".to_string()));
        assert!(replaced.contains("<NUM>"));
    }

    #[test]
    fn test_html_self_closing() {
        let cases = [("before<br/>after", "beforeafter"), ("a<img src='x'/>b", "ab")];
        for &(input, expected) in &cases {
            assert_eq!(strip_html_tags(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_email_no_email() {
        let mode = EmailHandling::Remove;
        let (cleaned, found) = extract_and_handle_emails("no email here", &mode);
        assert_eq!(cleaned, "no email here");
        assert!(found.is_empty());
    }

    #[test]
    fn test_mention_not_at_word_boundary() {
        // An '@' in the middle of a word must not be treated as a mention.
        let mode = MentionHandling::Remove;
        let (cleaned, found) = extract_and_handle_mentions("test@notamention", &mode);
        // The '@' is not preceded by whitespace or the start of the text, so
        // nothing is extracted and the input is kept as-is.
        assert!(found.is_empty());
        assert!(cleaned.contains("test@notamention"));
    }
}
scirs2_text/text_preprocess.rs

scirs2_text/
text_preprocess.rs