// scirs2_text/preprocess.rs

1//! Text preprocessing utilities
2//!
3//! This module provides functionality for text normalization,
4//! cleaning, and other preprocessing operations.
5
6use crate::error::Result;
7use lazy_static::lazy_static;
8use regex::Regex;
9use std::collections::HashSet;
10use unicode_normalization::UnicodeNormalization;
11
12lazy_static! {
13    static ref SPECIAL_CHARS: Regex = Regex::new(r"[^\w\s]").unwrap();
14    static ref WHITESPACE: Regex = Regex::new(r"\s+").unwrap();
15
16    // Common English stopwords
17    static ref DEFAULT_STOPWORDS: HashSet<String> = {
18        let words = vec![
19            "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
20            "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
21            "to", "was", "were", "will", "with"
22        ];
23        words.into_iter().map(String::from).collect()
24    };
25}
26
27/// Trait for text normalization operations
28pub trait TextNormalizer {
29    /// Normalize the input text
30    fn normalize(&self, text: &str) -> Result<String>;
31
32    /// Normalize a batch of texts
33    fn normalize_batch(&self, texts: &[&str]) -> Result<Vec<String>> {
34        texts.iter().map(|text| self.normalize(text)).collect()
35    }
36}
37
38/// Trait for text cleaning operations
39pub trait TextCleaner {
40    /// Clean the input text
41    fn clean(&self, text: &str) -> Result<String>;
42
43    /// Clean a batch of texts
44    fn clean_batch(&self, texts: &[&str]) -> Result<Vec<String>> {
45        texts.iter().map(|text| self.clean(text)).collect()
46    }
47}
48
/// Basic text normalizer that handles case folding and unicode normalization
#[derive(Debug, Clone)]
pub struct BasicNormalizer {
    /// Fold text to lowercase when `true`
    lowercase: bool,
    /// Apply Unicode NFC normalization when `true`
    unicode_normalization: bool,
}
55
56impl BasicNormalizer {
57    /// Create a new basic normalizer
58    pub fn new(_lowercase: bool, unicodenormalization: bool) -> Self {
59        Self {
60            lowercase: _lowercase,
61            unicode_normalization: unicodenormalization,
62        }
63    }
64}
65
66impl Default for BasicNormalizer {
67    fn default() -> Self {
68        Self::new(true, true)
69    }
70}
71
72impl TextNormalizer for BasicNormalizer {
73    fn normalize(&self, text: &str) -> Result<String> {
74        let mut normalized = text.to_string();
75
76        // Apply Unicode normalization (NFC form)
77        if self.unicode_normalization {
78            normalized = normalized.nfc().collect();
79        }
80
81        // Apply case folding
82        if self.lowercase {
83            normalized = normalized.to_lowercase();
84        }
85
86        Ok(normalized)
87    }
88}
89
/// Text cleaner for removing special characters, extra whitespace, and stopwords
#[derive(Debug, Clone)]
pub struct BasicTextCleaner {
    /// Replace non-word, non-space characters with spaces when `true`
    remove_special_chars: bool,
    /// Drop words found in `stopwords` when `true`
    remove_stopwords: bool,
    /// Collapse whitespace runs into single spaces and trim when `true`
    normalize_whitespace: bool,
    /// Set of words treated as stopwords (exact, case-sensitive match)
    stopwords: HashSet<String>,
}
98
99impl BasicTextCleaner {
100    /// Create a new text cleaner
101    pub fn new(
102        remove_special_chars: bool,
103        remove_stopwords: bool,
104        normalize_whitespace: bool,
105    ) -> Self {
106        Self {
107            remove_special_chars,
108            remove_stopwords,
109            normalize_whitespace: true,
110            stopwords: DEFAULT_STOPWORDS.clone(),
111        }
112    }
113
114    /// Create a text cleaner with custom stopwords
115    pub fn with_stopwords(
116        remove_special_chars: bool,
117        remove_stopwords: bool,
118        normalize_whitespace: bool,
119        stopwords: HashSet<String>,
120    ) -> Self {
121        Self {
122            remove_special_chars,
123            remove_stopwords,
124            normalize_whitespace,
125            stopwords,
126        }
127    }
128
129    /// Add stopwords to the cleaner
130    pub fn add_stopwords(&mut self, words: &[&str]) {
131        for word in words {
132            self.stopwords.insert(word.to_string());
133        }
134    }
135
136    /// Check if a word is a stopword
137    pub fn is_stopword(&self, word: &str) -> bool {
138        self.stopwords.contains(word)
139    }
140}
141
142impl Default for BasicTextCleaner {
143    fn default() -> Self {
144        Self::new(true, true, true)
145    }
146}
147
148impl TextCleaner for BasicTextCleaner {
149    fn clean(&self, text: &str) -> Result<String> {
150        let mut cleaned = text.to_string();
151
152        // Remove special characters
153        if self.remove_special_chars {
154            cleaned = SPECIAL_CHARS.replace_all(&cleaned, " ").to_string();
155        }
156
157        // Normalize whitespace
158        if self.normalize_whitespace {
159            cleaned = WHITESPACE.replace_all(&cleaned, " ").trim().to_string();
160        }
161
162        // Remove stopwords
163        if self.remove_stopwords {
164            cleaned = cleaned
165                .split_whitespace()
166                .filter(|word| !self.is_stopword(word))
167                .collect::<Vec<_>>()
168                .join(" ");
169        }
170
171        Ok(cleaned)
172    }
173}
174
/// Pipeline for text preprocessing that combines normalization and cleaning
#[derive(Debug, Clone)]
pub struct TextPreprocessor {
    /// Stage 1: case folding / Unicode normalization
    normalizer: BasicNormalizer,
    /// Stage 2: special-character, whitespace, and stopword cleanup
    cleaner: BasicTextCleaner,
}
181
182impl TextPreprocessor {
183    /// Create a new text preprocessor
184    pub fn new(normalizer: BasicNormalizer, cleaner: BasicTextCleaner) -> Self {
185        Self {
186            normalizer,
187            cleaner,
188        }
189    }
190
191    /// Process a text using the normalization and cleaning pipeline
192    pub fn process(&self, text: &str) -> Result<String> {
193        let normalized = self.normalizer.normalize(text)?;
194        let cleaned = self.cleaner.clean(&normalized)?;
195        Ok(cleaned)
196    }
197
198    /// Process a batch of texts
199    pub fn process_batch(&self, texts: &[&str]) -> Result<Vec<String>> {
200        texts.iter().map(|text| self.process(text)).collect()
201    }
202}
203
204impl Default for TextPreprocessor {
205    fn default() -> Self {
206        Self::new(BasicNormalizer::default(), BasicTextCleaner::default())
207    }
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_normalizer() {
        let normalizer = BasicNormalizer::default();
        let text = "Héllo, World!";
        let normalized = normalizer.normalize(text).unwrap();
        assert_eq!(normalized, "héllo, world!");
    }

    // Renamed from `testtext_cleaner` (missing underscore).
    #[test]
    fn test_text_cleaner() {
        let cleaner = BasicTextCleaner::default();
        let text = "Hello, world! This is a test.";
        let cleaned = cleaner.clean(text).unwrap();
        // "This" survives: stopword matching is case-sensitive.
        assert_eq!(cleaned, "Hello world This test");
    }

    // Renamed from `testtext_preprocessor` (missing underscore).
    #[test]
    fn test_text_preprocessor() {
        let preprocessor = TextPreprocessor::default();
        let text = "Héllo, World! This is a test.";
        let processed = preprocessor.process(text).unwrap();
        assert_eq!(processed, "héllo world this test");
    }
}