unobtanium_segmenter/normalization/
rust_stemmers.rs

1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use rust_stemmers::Algorithm;
6use rust_stemmers::Stemmer;
7use whatlang::Lang;
8
9use crate::SegmentedToken;
10use crate::SegmentedTokenKind;
11use crate::augmentation::Augmenter;
12
13/// Will run stemming with the language tagged onto the token if an algorithm is available.
14///
15/// This uses the [rust_stemmers] crate under the hood.
16///
17/// This is recommended to be run after an [AugmentationDetectLanguage][crate::augmentation::AugmentationDetectLanguage] has been used, it will not do anything if no language metadata is available!
18///
19/// If you need lowercase normalization, do that **before** this normalizer and set `process_already_normalized` to `true`. This is because some normalizers can't handle SCREAMING CASE.
20///
21/// Tokens will be ignored if:
22/// * They are known to not be an [SegmentedTokenKind::AlphaNumeric]
23/// * They already have `normalized_text` set when `process_already_normalized` is `false` (default)
24///
25/// If the tokens [normalization_language](SegmentedToken::normalization_language) is already set to a `Some` value that one wil be used and the detected language ignored.
26#[derive(Debug, Clone)]
27pub struct NormalizationRustStemmers {
28	/// Thereshold above which the flag about the lnguage detection flagging itself as reliable is ignored and the detected lnguage used for normalization anyway.
29	/// Setting this can help with shorter texts.
30	///
31	/// 1.0 which translates to never ignore the flag.
32	/// 0.0 would mean to always ignore it.
33	///
34	/// Default is 0.4 as that is usually "good enough" for correct stemming.
35	pub anyway_above_confidence: f64,
36
37	/// Wheter to process tokens that are already normalized.
38	/// You want to enable this, if your pipeline does some generic preprocessing like lowercasing.
39	///
40	/// Default is `false` for backwards compatibility.
41	pub process_already_normalized: bool,
42}
43
44impl NormalizationRustStemmers {
45	/// Create a new NormalizationRustStemmers instance with the default settings.
46	pub fn new() -> Self {
47		Default::default()
48	}
49
50	/// Adjust the value of [anyway_above_confidence][Self::anyway_above_confidence] builder style.
51	pub fn set_anyway_above_confidence(mut self, anyway_above_confidence: f64) -> Self {
52		self.anyway_above_confidence = anyway_above_confidence;
53		return self;
54	}
55
56	/// Adjust the value of [process_already_normalized][Self::process_already_normalized] builder style.
57	pub fn set_process_already_normalized(mut self, process_already_normalized: bool) -> Self {
58		self.process_already_normalized = process_already_normalized;
59		return self;
60	}
61}
62
63impl Default for NormalizationRustStemmers {
64	fn default() -> Self {
65		Self {
66			anyway_above_confidence: 0.4,
67			process_already_normalized: false,
68		}
69	}
70}
71
72impl Augmenter for NormalizationRustStemmers {
73	#[allow(clippy::collapsible_if)]
74	fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
75		if !matches!(token.kind, Some(SegmentedTokenKind::AlphaNumeric) | None) {
76			return token;
77		}
78		if token.was_normalized() && !self.process_already_normalized {
79			return token;
80		}
81		if let Some(algorithm) = token
82			.normalization_language
83			.and_then(get_stemming_algorithm_for_lang)
84		{
85			let stemmer = Stemmer::create(algorithm);
86			token.update_normalized_string(
87				stemmer.stem(token.get_text_prefer_normalized()).to_string(),
88				token.normalization_language,
89			);
90		} else if token.is_detected_language_relible
91			|| token.detected_language_confidence > self.anyway_above_confidence
92		{
93			if let Some(language) = token.normalization_language.or(token.detected_language) {
94				if let Some(algorithm) = get_stemming_algorithm_for_lang(language) {
95					let stemmer = Stemmer::create(algorithm);
96					token.update_normalized_string(
97						stemmer.stem(token.get_text_prefer_normalized()).to_string(),
98						Some(language),
99					);
100				}
101			}
102		}
103		return token;
104	}
105}
106
107/// Map Whatlang languages to Implemented normalization algorithms
108fn get_stemming_algorithm_for_lang(lang: Lang) -> Option<Algorithm> {
109	Some(match lang {
110		Lang::Ara => Algorithm::Arabic,
111		Lang::Dan => Algorithm::Danish,
112		Lang::Nld => Algorithm::Dutch,
113		Lang::Eng => Algorithm::English,
114		Lang::Fin => Algorithm::Finnish,
115		Lang::Fra => Algorithm::French,
116		Lang::Deu => Algorithm::German,
117		Lang::Ell => Algorithm::Greek,
118		Lang::Hun => Algorithm::Hungarian,
119		Lang::Ita => Algorithm::Italian,
120		// Missing: Norwegian, whatlang can't detect it
121		Lang::Por => Algorithm::Portuguese,
122		Lang::Ron => Algorithm::Romanian,
123		Lang::Rus => Algorithm::Russian,
124		Lang::Spa => Algorithm::Spanish,
125		Lang::Swe => Algorithm::Swedish,
126		Lang::Tam => Algorithm::Tamil,
127		Lang::Tur => Algorithm::Turkish,
128		_ => {
129			return None;
130		}
131	})
132}
133
134#[cfg(test)]
135mod test {
136
137	use super::*;
138
139	use crate::chain::*;
140
141	use crate::augmentation::AugmentationDetectLanguage;
142	use crate::normalization::NormalizationLowercase;
143	use crate::segmentation::UnicodeSentenceSplitter;
144	use crate::segmentation::UnicodeWordSplitter;
145
146	#[test]
147	fn test_stemmed_unicode_word_split() {
148		let test_text = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog. CAN NOT HANDLE SCREAMING CASE OUTRAGE!";
149
150		let sentence_splitter = UnicodeSentenceSplitter::new();
151		let language_detector = AugmentationDetectLanguage::new();
152		let word_splitter = UnicodeWordSplitter::new();
153
154		let result: Vec<String> = test_text
155			.start_segmentation_chain()
156			.chain_segmenter(&sentence_splitter)
157			.chain_augmenter(&language_detector)
158			.inspect(|x| {
159				println!("{x:?}");
160			})
161			.chain_segmenter(&word_splitter)
162			.inspect(|x| {
163				println!("word: {x:?}");
164			})
165			.chain_augmenter(&NormalizationRustStemmers::new().set_anyway_above_confidence(0.1))
166			.map(|t| t.get_text_prefer_normalized_owned())
167			.collect();
168
169		let expected_tokens: Vec<String> = vec![
170			"Fisch",
171			" ",
172			"Fritz",
173			" ",
174			"fischt",
175			" ",
176			"frisch",
177			" ",
178			"Fisch",
179			"!",
180			" ",
181			"",
182			"The",
183			" ",
184			"jump",
185			" ",
186			"brown",
187			" ",
188			"fox",
189			" ",
190			"quick",
191			" ",
192			"jump",
193			" ",
194			"over",
195			" ",
196			"the",
197			" ",
198			"sleep",
199			" ",
200			"dog",
201			".",
202			" ",
203			"",
204			"CAN",
205			" ",
206			"NOT",
207			" ",
208			"HANDLE",
209			" ",
210			"SCREAMING",
211			" ",
212			"CASE",
213			" ",
214			"OUTRAGE",
215			"!",
216			"",
217		]
218		.iter()
219		.map(|s| s.to_string())
220		.collect();
221
222		assert_eq!(result, expected_tokens);
223	}
224
225	#[test]
226	fn test_stemmed_unicode_word_split_lowercase() {
227		let test_text = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog. CAN ALSO HANDLE SCREAMING CASE OUTRAGE!";
228
229		let sentence_splitter = UnicodeSentenceSplitter::new();
230		let language_detector = AugmentationDetectLanguage::new();
231		let word_splitter = UnicodeWordSplitter::new();
232
233		let result: Vec<String> = test_text
234			.start_segmentation_chain()
235			.chain_segmenter(&sentence_splitter)
236			.chain_augmenter(&language_detector)
237			.inspect(|x| {
238				println!("{x:?}");
239			})
240			.chain_segmenter(&word_splitter)
241			.inspect(|x| {
242				println!("word: {x:?}");
243			})
244			.chain_augmenter(&NormalizationLowercase::new())
245			.chain_augmenter(
246				&NormalizationRustStemmers::new()
247					.set_anyway_above_confidence(0.1)
248					.set_process_already_normalized(true),
249			)
250			.map(|t| t.get_text_prefer_normalized_owned())
251			.collect();
252
253		let expected_tokens: Vec<String> = vec![
254			"fisch",
255			" ",
256			"fritz",
257			" ",
258			"fischt",
259			" ",
260			"frisch",
261			" ",
262			"fisch",
263			"!",
264			" ",
265			"",
266			"the",
267			" ",
268			"jump",
269			" ",
270			"brown",
271			" ",
272			"fox",
273			" ",
274			"quick",
275			" ",
276			"jump",
277			" ",
278			"over",
279			" ",
280			"the",
281			" ",
282			"sleep",
283			" ",
284			"dog",
285			".",
286			" ",
287			"",
288			"can",
289			" ",
290			"also",
291			" ",
292			"handl",
293			" ",
294			"screaming",
295			" ",
296			"cas",
297			" ",
298			"outrag",
299			"!",
300			"",
301		]
302		.iter()
303		.map(|s| s.to_string())
304		.collect();
305
306		assert_eq!(result, expected_tokens);
307	}
308}