use crate::AprenderError;
pub trait Stemmer {
fn stem(&self, word: &str) -> Result<String, AprenderError>;
fn stem_tokens<S: AsRef<str>>(&self, tokens: &[S]) -> Result<Vec<String>, AprenderError> {
tokens
.iter()
.map(|token| self.stem(token.as_ref()))
.collect()
}
}
#[derive(Debug, Clone, Default)]
pub struct PorterStemmer;
impl PorterStemmer {
#[must_use]
pub fn new() -> Self {
Self
}
fn is_vowel(c: char) -> bool {
matches!(c, 'a' | 'e' | 'i' | 'o' | 'u')
}
fn measure(word: &str) -> usize {
let mut count = 0;
let mut prev_is_vowel = false;
for c in word.chars() {
let is_vowel = Self::is_vowel(c);
if !is_vowel && prev_is_vowel {
count += 1;
}
prev_is_vowel = is_vowel;
}
count
}
fn ends_with_double_consonant(word: &str) -> bool {
if word.len() < 2 {
return false;
}
let chars: Vec<char> = word.chars().collect();
let last = chars[chars.len() - 1];
let second_last = chars[chars.len() - 2];
!Self::is_vowel(last) && last == second_last
}
fn ends_with_cvc(word: &str) -> bool {
if word.len() < 3 {
return false;
}
let chars: Vec<char> = word.chars().collect();
let len = chars.len();
let last = chars[len - 1];
let middle = chars[len - 2];
let first = chars[len - 3];
!Self::is_vowel(last)
&& Self::is_vowel(middle)
&& !Self::is_vowel(first)
&& !matches!(last, 'w' | 'x' | 'y')
}
fn replace_suffix(word: &str, suffix: &str, replacement: &str, min_measure: usize) -> String {
if let Some(stem) = word.strip_suffix(suffix) {
if Self::measure(stem) > min_measure {
return format!("{stem}{replacement}");
}
}
word.to_string()
}
}
impl PorterStemmer {
fn apply_step1(word: &mut String) {
if word.ends_with("sses") || word.ends_with("ies") {
word.truncate(word.len() - 2);
} else if !word.ends_with("ss") && word.ends_with('s') && word.len() > 1 {
word.truncate(word.len() - 1);
}
let mut step1b_flag = false;
if word.ends_with("eed") {
let stem = &word[..word.len() - 3];
if Self::measure(stem) > 0 {
word.truncate(word.len() - 1); }
} else if word.ends_with("ed") {
let stem_len = word.len() - 2;
if word[..stem_len].chars().any(Self::is_vowel) {
word.truncate(stem_len);
step1b_flag = true;
}
} else if word.ends_with("ing") {
let stem_len = word.len() - 3;
if word[..stem_len].chars().any(Self::is_vowel) {
word.truncate(stem_len);
step1b_flag = true;
}
}
if step1b_flag {
Self::apply_step1b_fixup(word);
}
if word.ends_with('y')
&& word.len() > 1
&& word[..word.len() - 1].chars().any(Self::is_vowel)
{
word.truncate(word.len() - 1);
word.push('i');
}
}
fn apply_step1b_fixup(word: &mut String) {
if word.ends_with("at") || word.ends_with("bl") || word.ends_with("iz") {
word.push('e');
} else if Self::ends_with_double_consonant(word)
&& !word.ends_with('l')
&& !word.ends_with('s')
&& !word.ends_with('z')
{
word.pop();
} else if Self::measure(word) == 1 && Self::ends_with_cvc(word) {
word.push('e');
}
}
fn apply_step4(word: &mut String) {
if Self::measure(word) <= 1 {
return;
}
const SUFFIXES: &[(&str, usize)] = &[
("ement", 5),
("ance", 4),
("ence", 4),
("able", 4),
("ible", 4),
("ment", 4),
("ant", 3),
("ent", 3),
("ism", 3),
("ate", 3),
("iti", 3),
("ous", 3),
("ive", 3),
("ize", 3),
("al", 2),
("er", 2),
("ic", 2),
("ou", 2),
];
for &(suffix, trim) in SUFFIXES {
if word.ends_with(suffix) {
word.truncate(word.len() - trim);
return;
}
}
if word.ends_with("ion")
&& word.len() > 3
&& matches!(word.as_bytes().get(word.len() - 4), Some(b's' | b't'))
{
word.truncate(word.len() - 3);
}
}
}
impl Stemmer for PorterStemmer {
fn stem(&self, word: &str) -> Result<String, AprenderError> {
let mut word = word.to_lowercase();
if word.len() <= 2 {
return Ok(word);
}
Self::apply_step1(&mut word);
word = Self::replace_suffix(&word, "ational", "ate", 0);
word = Self::replace_suffix(&word, "tional", "tion", 0);
word = Self::replace_suffix(&word, "enci", "ence", 0);
word = Self::replace_suffix(&word, "anci", "ance", 0);
word = Self::replace_suffix(&word, "izer", "ize", 0);
word = Self::replace_suffix(&word, "abli", "able", 0);
word = Self::replace_suffix(&word, "alli", "al", 0);
word = Self::replace_suffix(&word, "entli", "ent", 0);
word = Self::replace_suffix(&word, "eli", "e", 0);
word = Self::replace_suffix(&word, "ousli", "ous", 0);
word = Self::replace_suffix(&word, "ization", "ize", 0);
word = Self::replace_suffix(&word, "ation", "ate", 0);
word = Self::replace_suffix(&word, "ator", "ate", 0);
word = Self::replace_suffix(&word, "alism", "al", 0);
word = Self::replace_suffix(&word, "iveness", "ive", 0);
word = Self::replace_suffix(&word, "fulness", "ful", 0);
word = Self::replace_suffix(&word, "ousness", "ous", 0);
word = Self::replace_suffix(&word, "aliti", "al", 0);
word = Self::replace_suffix(&word, "iviti", "ive", 0);
word = Self::replace_suffix(&word, "biliti", "ble", 0);
word = Self::replace_suffix(&word, "icate", "ic", 0);
word = Self::replace_suffix(&word, "ative", "", 0);
word = Self::replace_suffix(&word, "alize", "al", 0);
word = Self::replace_suffix(&word, "iciti", "ic", 0);
word = Self::replace_suffix(&word, "ical", "ic", 0);
word = Self::replace_suffix(&word, "ful", "", 0);
word = Self::replace_suffix(&word, "ness", "", 0);
Self::apply_step4(&mut word);
if word.ends_with('e') {
let m = Self::measure(&word[..word.len() - 1]);
if m > 1 || (m == 1 && !Self::ends_with_cvc(&word[..word.len() - 1])) {
word.truncate(word.len() - 1);
}
}
if word.ends_with("ll") && Self::measure(&word) > 1 {
word.pop();
}
Ok(word)
}
}
#[cfg(test)]
#[path = "stem_tests.rs"]
mod tests;
#[cfg(test)]
#[path = "tests_stem_contract.rs"]
mod tests_stem_contract;