//! Text preprocessing utilities: normalization and cleaning.

use crate::error::Result;
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashSet;
use unicode_normalization::UnicodeNormalization;

12lazy_static! {
13 static ref SPECIAL_CHARS: Regex = Regex::new(r"[^\w\s]").unwrap();
14 static ref WHITESPACE: Regex = Regex::new(r"\s+").unwrap();
15
16 static ref DEFAULT_STOPWORDS: HashSet<String> = {
18 let words = vec![
19 "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
20 "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
21 "to", "was", "were", "will", "with"
22 ];
23 words.into_iter().map(String::from).collect()
24 };
25}
26
27pub trait TextNormalizer {
29 fn normalize(&self, text: &str) -> Result<String>;
31
32 fn normalize_batch(&self, texts: &[&str]) -> Result<Vec<String>> {
34 texts.iter().map(|text| self.normalize(text)).collect()
35 }
36}
37
38pub trait TextCleaner {
40 fn clean(&self, text: &str) -> Result<String>;
42
43 fn clean_batch(&self, texts: &[&str]) -> Result<Vec<String>> {
45 texts.iter().map(|text| self.clean(text)).collect()
46 }
47}
48
/// Configurable normalizer applying Unicode NFC normalization and/or
/// lowercasing to input text.
#[derive(Debug, Clone)]
pub struct BasicNormalizer {
    // When true, `normalize` lowercases the text.
    lowercase: bool,
    // When true, `normalize` applies Unicode NFC normalization before lowercasing.
    unicode_normalization: bool,
}
55
56impl BasicNormalizer {
57 pub fn new(_lowercase: bool, unicodenormalization: bool) -> Self {
59 Self {
60 lowercase: _lowercase,
61 unicode_normalization: unicodenormalization,
62 }
63 }
64}
65
66impl Default for BasicNormalizer {
67 fn default() -> Self {
68 Self::new(true, true)
69 }
70}
71
72impl TextNormalizer for BasicNormalizer {
73 fn normalize(&self, text: &str) -> Result<String> {
74 let mut normalized = text.to_string();
75
76 if self.unicode_normalization {
78 normalized = normalized.nfc().collect();
79 }
80
81 if self.lowercase {
83 normalized = normalized.to_lowercase();
84 }
85
86 Ok(normalized)
87 }
88}
89
/// Configurable cleaner that can strip special characters, drop stopwords,
/// and collapse runs of whitespace.
#[derive(Debug, Clone)]
pub struct BasicTextCleaner {
    // When true, `clean` replaces non-word, non-space characters with a space.
    remove_special_chars: bool,
    // When true, `clean` drops words found in `stopwords` (case-sensitive).
    remove_stopwords: bool,
    // When true, `clean` collapses whitespace runs and trims the ends.
    normalize_whitespace: bool,
    // The stopword set consulted by `is_stopword`.
    stopwords: HashSet<String>,
}
98
99impl BasicTextCleaner {
100 pub fn new(
102 remove_special_chars: bool,
103 remove_stopwords: bool,
104 normalize_whitespace: bool,
105 ) -> Self {
106 Self {
107 remove_special_chars,
108 remove_stopwords,
109 normalize_whitespace: true,
110 stopwords: DEFAULT_STOPWORDS.clone(),
111 }
112 }
113
114 pub fn with_stopwords(
116 remove_special_chars: bool,
117 remove_stopwords: bool,
118 normalize_whitespace: bool,
119 stopwords: HashSet<String>,
120 ) -> Self {
121 Self {
122 remove_special_chars,
123 remove_stopwords,
124 normalize_whitespace,
125 stopwords,
126 }
127 }
128
129 pub fn add_stopwords(&mut self, words: &[&str]) {
131 for word in words {
132 self.stopwords.insert(word.to_string());
133 }
134 }
135
136 pub fn is_stopword(&self, word: &str) -> bool {
138 self.stopwords.contains(word)
139 }
140}
141
142impl Default for BasicTextCleaner {
143 fn default() -> Self {
144 Self::new(true, true, true)
145 }
146}
147
148impl TextCleaner for BasicTextCleaner {
149 fn clean(&self, text: &str) -> Result<String> {
150 let mut cleaned = text.to_string();
151
152 if self.remove_special_chars {
154 cleaned = SPECIAL_CHARS.replace_all(&cleaned, " ").to_string();
155 }
156
157 if self.normalize_whitespace {
159 cleaned = WHITESPACE.replace_all(&cleaned, " ").trim().to_string();
160 }
161
162 if self.remove_stopwords {
164 cleaned = cleaned
165 .split_whitespace()
166 .filter(|word| !self.is_stopword(word))
167 .collect::<Vec<_>>()
168 .join(" ");
169 }
170
171 Ok(cleaned)
172 }
173}
174
/// Two-stage preprocessing pipeline: normalization followed by cleaning.
#[derive(Debug, Clone)]
pub struct TextPreprocessor {
    // Stage 1: applied first by `process`.
    normalizer: BasicNormalizer,
    // Stage 2: applied to the normalizer's output.
    cleaner: BasicTextCleaner,
}
181
182impl TextPreprocessor {
183 pub fn new(normalizer: BasicNormalizer, cleaner: BasicTextCleaner) -> Self {
185 Self {
186 normalizer,
187 cleaner,
188 }
189 }
190
191 pub fn process(&self, text: &str) -> Result<String> {
193 let normalized = self.normalizer.normalize(text)?;
194 let cleaned = self.cleaner.clean(&normalized)?;
195 Ok(cleaned)
196 }
197
198 pub fn process_batch(&self, texts: &[&str]) -> Result<Vec<String>> {
200 texts.iter().map(|text| self.process(text)).collect()
201 }
202}
203
204impl Default for TextPreprocessor {
205 fn default() -> Self {
206 Self::new(BasicNormalizer::default(), BasicTextCleaner::default())
207 }
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_normalizer() {
        let normalizer = BasicNormalizer::default();
        let text = "Héllo, World!";
        let normalized = normalizer.normalize(text).unwrap();
        assert_eq!(normalized, "héllo, world!");
    }

    // Renamed from `testtext_cleaner` — the original name was missing the
    // separator and broke the file's `test_*` naming convention.
    #[test]
    fn test_text_cleaner() {
        let cleaner = BasicTextCleaner::default();
        let text = "Hello, world! This is a test.";
        let cleaned = cleaner.clean(text).unwrap();
        // Stopword matching is case-sensitive, so "This" survives.
        assert_eq!(cleaned, "Hello world This test");
    }

    // Renamed from `testtext_preprocessor` for the same reason.
    #[test]
    fn test_text_preprocessor() {
        let preprocessor = TextPreprocessor::default();
        let text = "Héllo, World! This is a test.";
        let processed = preprocessor.process(text).unwrap();
        // Lowercasing happens before cleaning, so "this" is now a stopword... no:
        // "this" is not in DEFAULT_STOPWORDS; "is" and "a" are removed.
        assert_eq!(processed, "héllo world this test");
    }
}