// unobtanium_segmenter/normalization/rust_stemmers.rs
use rust_stemmers::Algorithm;
use rust_stemmers::Stemmer;
use whatlang::Lang;

use crate::augmentation::Augmenter;
use crate::SegmentedToken;
use crate::SegmentedTokenKind;
/// Token-normalization augmenter that applies Snowball stemming
/// (via the `rust-stemmers` crate) to alphanumeric tokens whose
/// language has been detected upstream.
#[derive(Debug, Clone)]
pub struct NormalizationRustStemmers {
    /// Stem a token even when its language detection is not flagged as
    /// reliable, provided the detection confidence exceeds this threshold.
    /// Defaults to 0.4 (see the `Default` impl below).
    pub anyway_above_confidence: f64,
}
30
31impl NormalizationRustStemmers {
32 pub fn new() -> Self {
34 Default::default()
35 }
36
37 pub fn set_anyway_above_confidence(mut self, anyway_above_confidence: f64) -> Self {
39 self.anyway_above_confidence = anyway_above_confidence;
40 return self;
41 }
42}
43
44impl Default for NormalizationRustStemmers {
45 fn default() -> Self {
46 Self {
47 anyway_above_confidence: 0.4,
48 }
49 }
50}
51
52impl Augmenter for NormalizationRustStemmers {
53 fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
54 if (token.is_detected_language_relible
55 || token.detected_language_confidence > self.anyway_above_confidence)
56 && token.normalized_text.is_none()
57 && matches!(token.kind, Some(SegmentedTokenKind::AlphaNumeric) | None)
58 {
59 if let Some(language) = token.detected_language {
60 if let Some(algorithm) = get_stemming_algorithm_for_lang(language) {
61 let stemmer = Stemmer::create(algorithm);
62 let stemmed = stemmer.stem(token.get_text_prefer_normalized());
63 if stemmed != token.text {
64 token.normalized_text = Some(stemmed.to_string());
65 }
66 }
67 }
68 }
69 return token;
70 }
71}
72
73fn get_stemming_algorithm_for_lang(lang: Lang) -> Option<Algorithm> {
75 Some(match lang {
76 Lang::Ara => Algorithm::Arabic,
77 Lang::Dan => Algorithm::Danish,
78 Lang::Nld => Algorithm::Dutch,
79 Lang::Eng => Algorithm::English,
80 Lang::Fin => Algorithm::Finnish,
81 Lang::Fra => Algorithm::French,
82 Lang::Deu => Algorithm::German,
83 Lang::Ell => Algorithm::Greek,
84 Lang::Hun => Algorithm::Hungarian,
85 Lang::Ita => Algorithm::Italian,
86 Lang::Por => Algorithm::Portuguese,
88 Lang::Ron => Algorithm::Romanian,
89 Lang::Rus => Algorithm::Russian,
90 Lang::Spa => Algorithm::Spanish,
91 Lang::Swe => Algorithm::Swedish,
92 Lang::Tam => Algorithm::Tamil,
93 Lang::Tur => Algorithm::Turkish,
94 _ => {
95 return None;
96 }
97 })
98}
99
#[cfg(test)]
mod test {

    use super::*;

    use crate::chain::*;

    use crate::augmentation::AugmentationDetectLanguage;
    use crate::segmentation::UnicodeSentenceSplitter;
    use crate::segmentation::UnicodeWordSplitter;

    /// End-to-end pipeline check: sentence split -> language detection ->
    /// word split -> stemming. The input mixes a German and an English
    /// sentence; both should be stemmed with their respective algorithms.
    #[test]
    fn test_stemmed_unicode_word_split() {
        let test_text = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog.";

        let sentence_splitter = UnicodeSentenceSplitter::new();
        let language_detector = AugmentationDetectLanguage::new();
        let word_splitter = UnicodeWordSplitter::new();

        let result: Vec<String> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&sentence_splitter)
            .chain_augmenter(&language_detector)
            // Debug print of each sentence-level token with its detected language.
            .inspect(|x| {
                println!("{x:?}");
            })
            .chain_segmenter(&word_splitter)
            .inspect(|x| {
                println!("word: {x:?}");
            })
            // Threshold lowered to 0.1 so both short sentences are stemmed
            // even when detection confidence is modest.
            .chain_augmenter(&NormalizationRustStemmers::new().set_anyway_above_confidence(0.1))
            .map(|t| t.get_text_prefer_normalized_owned())
            .collect();

        // NOTE(review): the "" entries look like sentence-boundary tokens
        // emitted by the splitter — confirm against UnicodeSentenceSplitter.
        let expected_tokens: Vec<String> = vec![
            "Fisch", " ", "Fritz", " ", "fischt", " ", "frisch", " ", "Fisch", "!", " ", "", "The",
            " ", "jump", " ", "brown", " ", "fox", " ", "quick", " ", "jump", " ", "over", " ",
            "the", " ", "sleep", " ", "dog", ".", "",
        ]
        .iter()
        .map(|s| s.to_string())
        .collect();

        assert_eq!(result, expected_tokens);
    }
}