unobtanium_segmenter/normalization/
rust_stemmers.rs

1use rust_stemmers::Algorithm;
2use rust_stemmers::Stemmer;
3use whatlang::Lang;
4
5use crate::augmentation::Augmenter;
6use crate::SegmentedToken;
7use crate::SegmentedTokenKind;
8
/// Will run stemming with the language tagged onto the token if an algorithm is available.
///
/// This uses the [rust_stemmers] crate under the hood.
///
/// This is recommended to be run after an [AugmentationDetectLanguage][crate::augmentation::AugmentationDetectLanguage] has been used, it will not do anything if no language metadata is available!
///
/// Tokens will be ignored if:
/// * They are known to not be an [SegmentedTokenKind::AlphaNumeric]
/// * They already have `normalized_text` set. Apply things like lowercasing after this.
///
#[derive(Debug, Clone)]
pub struct NormalizationRustStemmers {
	/// Threshold above which the language detection's "reliable" flag is ignored and the
	/// detected language is used for normalization anyway.
	/// Setting this can help with shorter texts.
	///
	/// 1.0 translates to never ignoring the flag.
	/// 0.0 would mean to always ignore it.
	///
	/// Default is 0.4 as that is usually "good enough" for correct stemming.
	pub anyway_above_confidence: f64,
}
30
31impl NormalizationRustStemmers {
32	/// Create a new NormalizationRustStemmers instance with the default settings.
33	pub fn new() -> Self {
34		Default::default()
35	}
36
37	/// Adjust the value of [anyway_above_confidence][Self::anyway_above_confidence] builder style.
38	pub fn set_anyway_above_confidence(mut self, anyway_above_confidence: f64) -> Self {
39		self.anyway_above_confidence = anyway_above_confidence;
40		return self;
41	}
42}
43
44impl Default for NormalizationRustStemmers {
45	fn default() -> Self {
46		Self {
47			anyway_above_confidence: 0.4,
48		}
49	}
50}
51
52impl Augmenter for NormalizationRustStemmers {
53	fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
54		if (token.is_detected_language_relible
55			|| token.detected_language_confidence > self.anyway_above_confidence)
56			&& token.normalized_text.is_none()
57			&& matches!(token.kind, Some(SegmentedTokenKind::AlphaNumeric) | None)
58		{
59			if let Some(language) = token.detected_language {
60				if let Some(algorithm) = get_stemming_algorithm_for_lang(language) {
61					let stemmer = Stemmer::create(algorithm);
62					let stemmed = stemmer.stem(token.get_text_prefer_normalized());
63					if stemmed != token.text {
64						token.normalized_text = Some(stemmed.to_string());
65					}
66				}
67			}
68		}
69		return token;
70	}
71}
72
73/// Map Whatlang languages to Implemented normalization algorithms
74fn get_stemming_algorithm_for_lang(lang: Lang) -> Option<Algorithm> {
75	Some(match lang {
76		Lang::Ara => Algorithm::Arabic,
77		Lang::Dan => Algorithm::Danish,
78		Lang::Nld => Algorithm::Dutch,
79		Lang::Eng => Algorithm::English,
80		Lang::Fin => Algorithm::Finnish,
81		Lang::Fra => Algorithm::French,
82		Lang::Deu => Algorithm::German,
83		Lang::Ell => Algorithm::Greek,
84		Lang::Hun => Algorithm::Hungarian,
85		Lang::Ita => Algorithm::Italian,
86		// Missing: Norwegian, whatlang can't detect it
87		Lang::Por => Algorithm::Portuguese,
88		Lang::Ron => Algorithm::Romanian,
89		Lang::Rus => Algorithm::Russian,
90		Lang::Spa => Algorithm::Spanish,
91		Lang::Swe => Algorithm::Swedish,
92		Lang::Tam => Algorithm::Tamil,
93		Lang::Tur => Algorithm::Turkish,
94		_ => {
95			return None;
96		}
97	})
98}
99
#[cfg(test)]
mod test {

	use super::*;

	use crate::chain::*;

	use crate::augmentation::AugmentationDetectLanguage;
	use crate::segmentation::UnicodeSentenceSplitter;
	use crate::segmentation::UnicodeWordSplitter;

	/// End-to-end pipeline check: sentence split, language detection, word
	/// split, then stemming with a lowered confidence threshold.
	#[test]
	fn test_stemmed_unicode_word_split() {
		let test_text = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog.";

		let sentence_splitter = UnicodeSentenceSplitter::new();
		let language_detector = AugmentationDetectLanguage::new();
		let word_splitter = UnicodeWordSplitter::new();
		let stemmer = NormalizationRustStemmers::new().set_anyway_above_confidence(0.1);

		let result: Vec<String> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&sentence_splitter)
			.chain_augmenter(&language_detector)
			.inspect(|x| println!("{x:?}"))
			.chain_segmenter(&word_splitter)
			.inspect(|x| println!("word: {x:?}"))
			.chain_augmenter(&stemmer)
			.map(|t| t.get_text_prefer_normalized_owned())
			.collect();

		let expected_tokens: Vec<String> = [
			"Fisch", " ", "Fritz", " ", "fischt", " ", "frisch", " ", "Fisch", "!", " ", "", "The",
			" ", "jump", " ", "brown", " ", "fox", " ", "quick", " ", "jump", " ", "over", " ",
			"the", " ", "sleep", " ", "dog", ".", "",
		]
		.into_iter()
		.map(String::from)
		.collect();

		assert_eq!(result, expected_tokens);
	}
}