//! unobtanium_segmenter/normalization/rust_stemmers.rs

use rust_stemmers::Algorithm;
use rust_stemmers::Stemmer;
use whatlang::Lang;

use crate::SegmentedToken;
use crate::SegmentedTokenKind;
use crate::augmentation::Augmenter;
/// Token normalizer that stems words using the Snowball stemmers from the
/// `rust_stemmers` crate, choosing the algorithm from the token's
/// normalization language or (as a fallback) its detected language.
#[derive(Debug, Clone)]
pub struct NormalizationRustStemmers {
    /// Stem using the detected language even when detection is not flagged
    /// as reliable, as long as the detection confidence exceeds this
    /// threshold (default 0.4; exclusive comparison).
    pub anyway_above_confidence: f64,

    /// When `true`, tokens that already carry a normalized string are
    /// stemmed again on top of that normalization; when `false` (default)
    /// such tokens are passed through untouched.
    pub process_already_normalized: bool,
}
43
44impl NormalizationRustStemmers {
45 pub fn new() -> Self {
47 Default::default()
48 }
49
50 pub fn set_anyway_above_confidence(mut self, anyway_above_confidence: f64) -> Self {
52 self.anyway_above_confidence = anyway_above_confidence;
53 return self;
54 }
55
56 pub fn set_process_already_normalized(mut self, process_already_normalized: bool) -> Self {
58 self.process_already_normalized = process_already_normalized;
59 return self;
60 }
61}
62
63impl Default for NormalizationRustStemmers {
64 fn default() -> Self {
65 Self {
66 anyway_above_confidence: 0.4,
67 process_already_normalized: false,
68 }
69 }
70}
71
72impl Augmenter for NormalizationRustStemmers {
73 #[allow(clippy::collapsible_if)]
74 fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
75 if !matches!(token.kind, Some(SegmentedTokenKind::AlphaNumeric) | None) {
76 return token;
77 }
78 if token.was_normalized() && !self.process_already_normalized {
79 return token;
80 }
81 if let Some(algorithm) = token
82 .normalization_language
83 .and_then(get_stemming_algorithm_for_lang)
84 {
85 let stemmer = Stemmer::create(algorithm);
86 token.update_normalized_string(
87 stemmer.stem(token.get_text_prefer_normalized()).to_string(),
88 token.normalization_language,
89 );
90 } else if token.is_detected_language_relible
91 || token.detected_language_confidence > self.anyway_above_confidence
92 {
93 if let Some(language) = token.normalization_language.or(token.detected_language) {
94 if let Some(algorithm) = get_stemming_algorithm_for_lang(language) {
95 let stemmer = Stemmer::create(algorithm);
96 token.update_normalized_string(
97 stemmer.stem(token.get_text_prefer_normalized()).to_string(),
98 Some(language),
99 );
100 }
101 }
102 }
103 return token;
104 }
105}
106
107fn get_stemming_algorithm_for_lang(lang: Lang) -> Option<Algorithm> {
109 Some(match lang {
110 Lang::Ara => Algorithm::Arabic,
111 Lang::Dan => Algorithm::Danish,
112 Lang::Nld => Algorithm::Dutch,
113 Lang::Eng => Algorithm::English,
114 Lang::Fin => Algorithm::Finnish,
115 Lang::Fra => Algorithm::French,
116 Lang::Deu => Algorithm::German,
117 Lang::Ell => Algorithm::Greek,
118 Lang::Hun => Algorithm::Hungarian,
119 Lang::Ita => Algorithm::Italian,
120 Lang::Por => Algorithm::Portuguese,
122 Lang::Ron => Algorithm::Romanian,
123 Lang::Rus => Algorithm::Russian,
124 Lang::Spa => Algorithm::Spanish,
125 Lang::Swe => Algorithm::Swedish,
126 Lang::Tam => Algorithm::Tamil,
127 Lang::Tur => Algorithm::Turkish,
128 _ => {
129 return None;
130 }
131 })
132}
133
#[cfg(test)]
mod test {

    use super::*;

    use crate::chain::*;

    use crate::augmentation::AugmentationDetectLanguage;
    use crate::normalization::NormalizationLowercase;
    use crate::segmentation::UnicodeSentenceSplitter;
    use crate::segmentation::UnicodeWordSplitter;

    /// End-to-end pipeline test: sentence split → language detection →
    /// word split → stemming. Mixed German/English input; the low 0.1
    /// confidence threshold makes both sentences eligible for stemming.
    /// The third, ALL-CAPS sentence documents that screaming-case words
    /// are NOT stemmed without prior lowercasing (compare the test below).
    #[test]
    fn test_stemmed_unicode_word_split() {
        let test_text = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog. CAN NOT HANDLE SCREAMING CASE OUTRAGE!";

        let sentence_splitter = UnicodeSentenceSplitter::new();
        let language_detector = AugmentationDetectLanguage::new();
        let word_splitter = UnicodeWordSplitter::new();

        let result: Vec<String> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&sentence_splitter)
            .chain_augmenter(&language_detector)
            // Debug output only; inspect() does not alter the token stream.
            .inspect(|x| {
                println!("{x:?}");
            })
            .chain_segmenter(&word_splitter)
            .inspect(|x| {
                println!("word: {x:?}");
            })
            .chain_augmenter(&NormalizationRustStemmers::new().set_anyway_above_confidence(0.1))
            .map(|t| t.get_text_prefer_normalized_owned())
            .collect();

        // Expected stream includes whitespace/punctuation tokens and empty
        // sentence-boundary tokens ("") emitted by the splitters.
        let expected_tokens: Vec<String> = vec![
            "Fisch",
            " ",
            "Fritz",
            " ",
            "fischt",
            " ",
            "frisch",
            " ",
            "Fisch",
            "!",
            " ",
            "",
            "The",
            " ",
            "jump",
            " ",
            "brown",
            " ",
            "fox",
            " ",
            "quick",
            " ",
            "jump",
            " ",
            "over",
            " ",
            "the",
            " ",
            "sleep",
            " ",
            "dog",
            ".",
            " ",
            "",
            "CAN",
            " ",
            "NOT",
            " ",
            "HANDLE",
            " ",
            "SCREAMING",
            " ",
            "CASE",
            " ",
            "OUTRAGE",
            "!",
            "",
        ]
        .iter()
        .map(|s| s.to_string())
        .collect();

        assert_eq!(result, expected_tokens);
    }

    /// Same pipeline as above, but with a lowercasing normalization pass
    /// before stemming and `process_already_normalized(true)` so the
    /// stemmer re-processes the lowercased tokens — the ALL-CAPS sentence
    /// is now mostly stemmed ("handl", "cas", "outrag").
    #[test]
    fn test_stemmed_unicode_word_split_lowercase() {
        let test_text = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog. CAN ALSO HANDLE SCREAMING CASE OUTRAGE!";

        let sentence_splitter = UnicodeSentenceSplitter::new();
        let language_detector = AugmentationDetectLanguage::new();
        let word_splitter = UnicodeWordSplitter::new();

        let result: Vec<String> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&sentence_splitter)
            .chain_augmenter(&language_detector)
            .inspect(|x| {
                println!("{x:?}");
            })
            .chain_segmenter(&word_splitter)
            .inspect(|x| {
                println!("word: {x:?}");
            })
            // Lowercase first, then stem the already-normalized tokens.
            .chain_augmenter(&NormalizationLowercase::new())
            .chain_augmenter(
                &NormalizationRustStemmers::new()
                    .set_anyway_above_confidence(0.1)
                    .set_process_already_normalized(true),
            )
            .map(|t| t.get_text_prefer_normalized_owned())
            .collect();

        let expected_tokens: Vec<String> = vec![
            "fisch",
            " ",
            "fritz",
            " ",
            "fischt",
            " ",
            "frisch",
            " ",
            "fisch",
            "!",
            " ",
            "",
            "the",
            " ",
            "jump",
            " ",
            "brown",
            " ",
            "fox",
            " ",
            "quick",
            " ",
            "jump",
            " ",
            "over",
            " ",
            "the",
            " ",
            "sleep",
            " ",
            "dog",
            ".",
            " ",
            "",
            "can",
            " ",
            "also",
            " ",
            "handl",
            " ",
            // "screaming" survives unstemmed — presumably the English
            // stemmer keeps it; pinned by the original expectation.
            "screaming",
            " ",
            "cas",
            " ",
            "outrag",
            "!",
            "",
        ]
        .iter()
        .map(|s| s.to_string())
        .collect();

        assert_eq!(result, expected_tokens);
    }
}