/// Suffixes that `stem` tries to strip, in priority order.
///
/// `stem` walks this table from top to bottom and returns on the first entry
/// the word ends with, so the order is significant: a longer suffix must be
/// listed before any shorter entry that it itself ends with (for example the
/// four-letter "هایم" comes before the two-letter "یم"). Otherwise the shorter
/// entry would match first and leave part of the longer suffix on the stem.
const SUFFIXES: &[&str] = &[
// Compound endings built on "های"; each precedes the shorter tail it ends with.
"هایمان",
"هایتان",
"هایشان",
"هایم",
"هایت",
"هایش",
// Compound endings built on "ان"; same longest-first rule.
"انمان",
"انتان",
"انشان",
"انم",
"انت",
"انش",
// "های" must be tried before "ها" and before the trailing "ای" entry.
"های",
"ها",
// Three-letter endings that all end in "ان", so they precede the bare "ان".
"مان",
"تان",
"شان",
"ام",
"ات",
"اش",
"ترین",
"تر",
"یم",
"ید",
"ند",
"ست",
// Shortest fallbacks; kept last because several entries above end with them.
"ان",
"ای",
];
/// Minimum number of characters (`char`s, not bytes) that must remain after a
/// suffix is removed. `stem` skips any suffix whose removal would leave fewer.
const MIN_STEM_LEN: usize = 3;
/// Returns `word` with zero-width non-joiners removed and at most one suffix stripped.
///
/// Every U+200C (ZWNJ) is dropped first, so a suffix matches whether or not it
/// was written joined to the word. `SUFFIXES` is then scanned in order, and the
/// first entry that the cleaned word ends with, and whose removal leaves at
/// least `MIN_STEM_LEN` characters, is cut off. When no entry qualifies, the
/// cleaned word is returned as is.
///
/// Lengths are counted in Unicode scalar values (`char`s), not bytes.
#[must_use]
pub fn stem(word: &str) -> String {
    let mut cleaned: String = word.chars().filter(|&c| c != '\u{200C}').collect();

    // Byte length of the stem left by the first qualifying suffix. Matching is
    // done on `&str` directly: both sides are valid UTF-8, so a byte-level
    // suffix match is also a `char`-level one and no `Vec<char>` is needed.
    let stem_len = SUFFIXES.iter().find_map(|&suffix| {
        cleaned
            .strip_suffix(suffix)
            .filter(|rest| rest.chars().count() >= MIN_STEM_LEN)
            .map(str::len)
    });

    // `len` is the length of a prefix `&str` of `cleaned`, hence a char boundary.
    if let Some(len) = stem_len {
        cleaned.truncate(len);
    }
    cleaned
}
/// Stems every token in `tokens`, preserving order.
///
/// The result has exactly one entry per input token: the output of `stem` for
/// the token at the same index.
#[must_use]
pub fn stem_tokens(tokens: &[String]) -> Vec<String> {
    let mut stems = Vec::with_capacity(tokens.len());
    for token in tokens {
        stems.push(stem(token));
    }
    stems
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn plural_ha() {
        assert_eq!(stem("کتابها"), "کتاب");
        // Same word with a zero-width non-joiner (U+200C) before the suffix;
        // the ZWNJ must be dropped before suffix matching.
        assert_eq!(stem("کتاب\u{200C}ها"), "کتاب");
    }

    #[test]
    fn plural_with_possessive() {
        assert_eq!(stem("کتابهایم"), "کتاب");
    }

    #[test]
    fn comparative() {
        assert_eq!(stem("بزرگتر"), "بزرگ");
        assert_eq!(stem("بزرگترین"), "بزرگ");
    }

    #[test]
    fn possessive() {
        assert_eq!(stem("دوستانم"), "دوست");
        // No entry in the suffix table matches the ending of this word, so it
        // is returned unchanged.
        assert_eq!(stem("کتابم"), "کتابم");
    }

    #[test]
    fn no_suffix_returns_word() {
        assert_eq!(stem("کتاب"), "کتاب");
        // ZWNJ is still removed when no suffix is stripped.
        assert_eq!(stem("می\u{200C}روم"), "میروم");
    }

    #[test]
    fn too_short_word_unchanged() {
        assert_eq!(stem("پی"), "پی");
        assert_eq!(stem("ha".replace("ha", "ها").as_str()), "ها");
    }

    #[test]
    fn batch_stems() {
        let tokens = vec![
            "کتابها".to_string(),
            "گلها".to_string(),
            "روزها".to_string(),
        ];
        let stems = stem_tokens(&tokens);
        // The middle token keeps its suffix: stripping it would leave only two
        // characters, fewer than MIN_STEM_LEN.
        assert_eq!(stems, vec!["کتاب", "گلها", "روز"]);
    }
}