use rust_stemmers::Algorithm;
use rust_stemmers::Stemmer;
use whatlang::Lang;
use crate::SegmentedToken;
use crate::SegmentedTokenKind;
use crate::augmentation::Augmenter;
/// Configuration for the stemming augmenter backed by the `rust_stemmers` crate.
///
/// Both fields are consulted by the `Augmenter::augment` implementation for this type.
#[derive(Clone, Debug)]
pub struct NormalizationRustStemmers {
    /// Confidence threshold for using a detected language that is not flagged as
    /// reliable: such a token is still stemmed when its detection confidence is
    /// strictly greater than this value.
    pub anyway_above_confidence: f64,
    /// When `false`, tokens that report `was_normalized()` are passed through
    /// untouched; when `true`, they are stemmed as well.
    pub process_already_normalized: bool,
}
impl NormalizationRustStemmers {
    /// Creates an instance populated with the values from the `Default` implementation.
    pub fn new() -> Self {
        Self::default()
    }

    /// Builder-style setter: consumes `self` and returns it with
    /// `anyway_above_confidence` replaced by the given value.
    pub fn set_anyway_above_confidence(self, anyway_above_confidence: f64) -> Self {
        Self {
            anyway_above_confidence,
            ..self
        }
    }

    /// Builder-style setter: consumes `self` and returns it with
    /// `process_already_normalized` replaced by the given value.
    pub fn set_process_already_normalized(self, process_already_normalized: bool) -> Self {
        Self {
            process_already_normalized,
            ..self
        }
    }
}
impl Default for NormalizationRustStemmers {
fn default() -> Self {
Self {
anyway_above_confidence: 0.4,
process_already_normalized: false,
}
}
}
impl Augmenter for NormalizationRustStemmers {
    /// Stems the token's text (preferring an existing normalized form) and stores
    /// the result as the token's normalized string.
    ///
    /// The token is returned unchanged when:
    /// * its kind is neither `AlphaNumeric` nor `None`,
    /// * it was already normalized and `process_already_normalized` is `false`,
    /// * no stemming algorithm could be selected for it.
    ///
    /// Algorithm selection:
    /// 1. If the token's `normalization_language` maps to a stemming algorithm,
    ///    that algorithm is used and `normalization_language` is recorded.
    /// 2. Otherwise, only if the detected language is flagged reliable or its
    ///    confidence is strictly above `anyway_above_confidence`, the language
    ///    `normalization_language.or(detected_language)` is looked up and, when
    ///    it maps to an algorithm, used and recorded.
    fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
        let kind_is_eligible = matches!(token.kind, Some(SegmentedTokenKind::AlphaNumeric) | None);
        if !kind_is_eligible {
            return token;
        }

        let skip_as_already_normalized =
            !self.process_already_normalized && token.was_normalized();
        if skip_as_already_normalized {
            return token;
        }

        // Resolve which algorithm to run together with the language that will be
        // recorded alongside the stemmed text.
        let preferred = token
            .normalization_language
            .and_then(get_stemming_algorithm_for_lang);
        let selection = match preferred {
            Some(algorithm) => Some((algorithm, token.normalization_language)),
            None => {
                let detection_is_trusted = token.is_detected_language_relible
                    || token.detected_language_confidence > self.anyway_above_confidence;
                if detection_is_trusted {
                    // A set `normalization_language` still takes precedence over the
                    // detected one here, even though it has no algorithm of its own.
                    token
                        .normalization_language
                        .or(token.detected_language)
                        .and_then(|language| {
                            get_stemming_algorithm_for_lang(language)
                                .map(|algorithm| (algorithm, Some(language)))
                        })
                } else {
                    None
                }
            }
        };

        if let Some((algorithm, language)) = selection {
            let stemmed = Stemmer::create(algorithm)
                .stem(token.get_text_prefer_normalized())
                .to_string();
            token.update_normalized_string(stemmed, language);
        }

        token
    }
}
/// Maps a `whatlang` language to the corresponding `rust_stemmers` algorithm.
///
/// Returns `None` for every language that has no stemmer listed here.
///
/// Norwegian (`Lang::Nob`, Bokmål) is mapped to `Algorithm::Norwegian`; it was
/// previously the only `rust_stemmers` algorithm without an entry, so Norwegian
/// tokens were never stemmed.
fn get_stemming_algorithm_for_lang(lang: Lang) -> Option<Algorithm> {
    let algorithm = match lang {
        Lang::Ara => Algorithm::Arabic,
        Lang::Dan => Algorithm::Danish,
        Lang::Nld => Algorithm::Dutch,
        Lang::Eng => Algorithm::English,
        Lang::Fin => Algorithm::Finnish,
        Lang::Fra => Algorithm::French,
        Lang::Deu => Algorithm::German,
        Lang::Ell => Algorithm::Greek,
        Lang::Hun => Algorithm::Hungarian,
        Lang::Ita => Algorithm::Italian,
        Lang::Nob => Algorithm::Norwegian,
        Lang::Por => Algorithm::Portuguese,
        Lang::Ron => Algorithm::Romanian,
        Lang::Rus => Algorithm::Russian,
        Lang::Spa => Algorithm::Spanish,
        Lang::Swe => Algorithm::Swedish,
        Lang::Tam => Algorithm::Tamil,
        Lang::Tur => Algorithm::Turkish,
        // Every remaining language has no stemmer available.
        _ => return None,
    };
    Some(algorithm)
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::augmentation::AugmentationDetectLanguage;
    use crate::chain::*;
    use crate::normalization::NormalizationLowercase;
    use crate::segmentation::UnicodeSentenceSplitter;
    use crate::segmentation::UnicodeWordSplitter;

    /// Pipeline: sentence split -> language detection -> word split -> stemming
    /// (confidence threshold lowered to 0.1). The expected output stems the
    /// German and English sentences and leaves the all-caps sentence as is.
    #[test]
    fn test_stemmed_unicode_word_split() {
        let input = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog. CAN NOT HANDLE SCREAMING CASE OUTRAGE!";

        let sentences = UnicodeSentenceSplitter::new();
        let detector = AugmentationDetectLanguage::new();
        let words = UnicodeWordSplitter::new();
        let stemmer = NormalizationRustStemmers::new().set_anyway_above_confidence(0.1);

        let actual: Vec<String> = input
            .start_segmentation_chain()
            .chain_segmenter(&sentences)
            .chain_augmenter(&detector)
            .inspect(|segment| println!("{segment:?}"))
            .chain_segmenter(&words)
            .inspect(|segment| println!("word: {segment:?}"))
            .chain_augmenter(&stemmer)
            .map(|token| token.get_text_prefer_normalized_owned())
            .collect();

        #[rustfmt::skip]
        let expected: Vec<String> = vec![
            // First sentence.
            "Fisch", " ", "Fritz", " ", "fischt", " ", "frisch", " ", "Fisch", "!", " ", "",
            // Second sentence.
            "The", " ", "jump", " ", "brown", " ", "fox", " ", "quick", " ", "jump", " ",
            "over", " ", "the", " ", "sleep", " ", "dog", ".", " ", "",
            // Third sentence (all caps).
            "CAN", " ", "NOT", " ", "HANDLE", " ", "SCREAMING", " ", "CASE", " ", "OUTRAGE", "!", "",
        ]
        .into_iter()
        .map(String::from)
        .collect();

        assert_eq!(actual, expected);
    }

    /// Same pipeline, but tokens are lowercased before stemming and the stemmer is
    /// told to process already-normalized tokens, so the all-caps sentence is
    /// expected to come out lowercased and stemmed as well.
    #[test]
    fn test_stemmed_unicode_word_split_lowercase() {
        let input = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog. CAN ALSO HANDLE SCREAMING CASE OUTRAGE!";

        let sentences = UnicodeSentenceSplitter::new();
        let detector = AugmentationDetectLanguage::new();
        let words = UnicodeWordSplitter::new();
        let lowercase = NormalizationLowercase::new();
        let stemmer = NormalizationRustStemmers::new()
            .set_anyway_above_confidence(0.1)
            .set_process_already_normalized(true);

        let actual: Vec<String> = input
            .start_segmentation_chain()
            .chain_segmenter(&sentences)
            .chain_augmenter(&detector)
            .inspect(|segment| println!("{segment:?}"))
            .chain_segmenter(&words)
            .inspect(|segment| println!("word: {segment:?}"))
            .chain_augmenter(&lowercase)
            .chain_augmenter(&stemmer)
            .map(|token| token.get_text_prefer_normalized_owned())
            .collect();

        #[rustfmt::skip]
        let expected: Vec<String> = vec![
            // First sentence.
            "fisch", " ", "fritz", " ", "fischt", " ", "frisch", " ", "fisch", "!", " ", "",
            // Second sentence.
            "the", " ", "jump", " ", "brown", " ", "fox", " ", "quick", " ", "jump", " ",
            "over", " ", "the", " ", "sleep", " ", "dog", ".", " ", "",
            // Third sentence (originally all caps).
            "can", " ", "also", " ", "handl", " ", "screaming", " ", "cas", " ", "outrag", "!", "",
        ]
        .into_iter()
        .map(String::from)
        .collect();

        assert_eq!(actual, expected);
    }
}