use dashmap::DashMap;
use smallvec::SmallVec;
use std::path::Path;
use std::sync::Arc;
#[cfg(feature = "python")]
use pyo3::prelude::*;
/// Minimum byte length a remainder must have for a prefix, inflectional-suffix
/// or `-nya` strip to be accepted; shorter input words are returned unchanged.
const MIN_STEM: usize = 3;
/// Stricter minimum byte length applied when removing derivational suffixes
/// and when preferring one candidate stem over another.
const MIN_STEM_DERIV: usize = 4;
/// Upper bound on the number of stripping rounds performed for a single word.
const MAX_ITER: usize = 4;
/// Inflectional particle suffixes, tried in this order.
static INFLECTIONAL: &[&str] = &["lah", "kah", "tah", "pun"];
/// Suffix removed up front, before any other stripping is attempted.
static NYA: &str = "nya";
/// Derivational suffixes, tried in this order (`kan` before its own tail `an`).
static DERIV_SUFFIXES: &[&str] = &["kan", "an", "i"];
/// Reports whether `c` is one of the lowercase ASCII vowels `a`, `e`, `i`, `o`, `u`.
///
/// Uppercase and accented letters are not treated as vowels.
#[inline]
fn is_vowel(c: char) -> bool {
    "aeiou".contains(c)
}
/// Returns the first Unicode scalar value of `s`, or `None` when `s` is empty.
#[inline]
fn first_char(s: &str) -> Option<char> {
    let mut scalars = s.chars();
    scalars.next()
}
/// Reports whether `word` begins with one of the two-letter prefix heads
/// `me`, `pe`, `be`, `te`, `se` or `di`.
#[inline]
fn has_known_prefix(word: &str) -> bool {
    const HEADS: [&str; 6] = ["me", "pe", "be", "te", "se", "di"];
    HEADS.iter().any(|&head| word.starts_with(head))
}
/// Decides whether `s` has an acceptable opening for a candidate stem.
///
/// A candidate is accepted when its first character is a vowel, or when its
/// first character is not a vowel but its second character is. Empty and
/// single-consonant strings are rejected.
#[inline]
fn valid_stem_start(s: &str) -> bool {
    let mut scalars = s.chars();
    let first = scalars.next();
    let second = scalars.next();
    match (first, second) {
        (Some(a), _) if is_vowel(a) => true,
        (Some(_), Some(b)) => is_vowel(b),
        _ => false,
    }
}
/// Removes one inflectional particle (see `INFLECTIONAL`) from the end of `word`.
///
/// The particles are tried in table order; the first one whose removal leaves
/// at least `MIN_STEM` bytes wins. When none qualifies, `word` is returned
/// unchanged.
fn strip_inflectional(word: &str) -> &str {
    INFLECTIONAL
        .iter()
        .filter_map(|particle| word.strip_suffix(*particle))
        .find(|base| base.len() >= MIN_STEM)
        .unwrap_or(word)
}
/// Removes a trailing `nya` from `word` when at least `MIN_STEM` bytes remain;
/// otherwise returns `word` unchanged.
fn strip_nya(word: &str) -> &str {
    match word.strip_suffix(NYA) {
        Some(base) if base.len() >= MIN_STEM => base,
        _ => word,
    }
}
/// Removes one derivational suffix (see `DERIV_SUFFIXES`) from the end of `word`.
///
/// Suffixes are tried in table order and only the first acceptable result is
/// produced, so the returned vector holds at most one element. A removal is
/// acceptable when the remainder is at least `MIN_STEM_DERIV` bytes long.
fn strip_deriv_suffix(word: &str) -> SmallVec<[String; 1]> {
    DERIV_SUFFIXES
        .iter()
        .filter_map(|suffix| word.strip_suffix(*suffix))
        .find(|base| base.len() >= MIN_STEM_DERIV)
        .map(String::from)
        .into_iter()
        .collect()
}
fn strip_me(word: &str) -> SmallVec<[String; 4]> {
let mut out: SmallVec<[String; 4]> = SmallVec::new();
macro_rules! push {
($s:expr) => {
if $s.len() >= MIN_STEM {
out.push($s.to_string());
}
};
(fmt: $s:expr) => {
if $s.len() >= MIN_STEM {
out.push($s);
}
};
}
if let Some(rest) = word.strip_prefix("memper") {
push!(rest);
return out;
}
if let Some(rest) = word.strip_prefix("mempel") {
push!(fmt: format!("pel{}", rest));
return out;
}
if let Some(rest) = word.strip_prefix("menge") {
push!(rest);
}
if let Some(rest) = word.strip_prefix("meny") {
if first_char(rest).map(is_vowel).unwrap_or(false) {
push!(fmt: format!("s{}", rest)); } else {
push!(rest); }
return out;
}
if let Some(rest) = word.strip_prefix("meng") {
if let Some(c) = first_char(rest) {
if is_vowel(c) {
push!(fmt: format!("k{}", rest)); push!(rest); } else {
push!(rest); }
}
return out;
}
if let Some(rest) = word.strip_prefix("men") {
if let Some(c) = first_char(rest) {
match c {
'a' | 'e' | 'i' | 'o' | 'u' => {
push!(fmt: format!("t{}", rest)); push!(rest);
}
'd' | 'c' | 'j' | 'z' | 'n' => push!(rest),
_ => push!(rest),
}
}
return out;
}
if let Some(rest) = word.strip_prefix("mem") {
if let Some(c) = first_char(rest) {
match c {
'b' | 'f' | 'v' => push!(rest), 'p' => {
push!(rest); push!(fmt: format!("p{}", rest)); }
_ => {
push!(fmt: format!("p{}", rest)); push!(rest);
}
}
}
return out;
}
if let Some(rest) = word.strip_prefix("me") {
push!(rest);
}
out
}
fn strip_pe(word: &str) -> SmallVec<[String; 4]> {
let mut out: SmallVec<[String; 4]> = SmallVec::new();
macro_rules! push {
($s:expr) => {
if $s.len() >= MIN_STEM {
out.push($s.to_string());
}
};
(fmt: $s:expr) => {
if $s.len() >= MIN_STEM {
out.push($s);
}
};
}
if let Some(rest) = word.strip_prefix("pemper") {
push!(rest);
return out;
}
if let Some(rest) = word.strip_prefix("pempel") {
push!(fmt: format!("pel{}", rest));
return out;
}
if let Some(rest) = word.strip_prefix("penge") {
push!(rest);
}
if let Some(rest) = word.strip_prefix("peny") {
if first_char(rest).map(is_vowel).unwrap_or(false) {
push!(fmt: format!("s{}", rest)); } else {
push!(rest);
}
return out;
}
if let Some(rest) = word.strip_prefix("peng") {
if let Some(c) = first_char(rest) {
if is_vowel(c) {
push!(fmt: format!("k{}", rest)); push!(rest);
} else {
push!(rest);
}
}
return out;
}
if let Some(rest) = word.strip_prefix("pen") {
if let Some(c) = first_char(rest) {
match c {
'a' | 'e' | 'i' | 'o' | 'u' => {
push!(fmt: format!("t{}", rest)); push!(rest);
}
'd' | 'c' | 'j' | 'z' | 'n' => push!(rest),
_ => push!(rest),
}
}
return out;
}
if let Some(rest) = word.strip_prefix("pem") {
if let Some(c) = first_char(rest) {
match c {
'b' | 'f' | 'v' => push!(rest), 'p' => {
push!(rest);
push!(fmt: format!("p{}", rest));
}
_ => {
push!(fmt: format!("p{}", rest));
push!(rest);
}
}
}
return out;
}
if let Some(rest) = word.strip_prefix("pel") {
push!(rest); return out;
}
if let Some(rest) = word.strip_prefix("pe") {
push!(rest); }
if word.starts_with("per") {
if let Some(rest) = word.strip_prefix("per") {
push!(rest); }
}
out
}
/// Removes a `bel`, `ber` or `be` prefix from `word`.
///
/// Only the first matching form (in that order) is considered; its remainder
/// is returned when it is at least `MIN_STEM` bytes long, otherwise the result
/// is empty.
fn strip_ber(word: &str) -> SmallVec<[String; 2]> {
    ["bel", "ber", "be"]
        .iter()
        .find_map(|form| word.strip_prefix(*form))
        .filter(|rest| rest.len() >= MIN_STEM)
        .map(String::from)
        .into_iter()
        .collect()
}
/// Removes a `ter` or `te` prefix from `word`.
///
/// Only the first matching form (in that order) is considered; its remainder
/// is returned when it is at least `MIN_STEM` bytes long, otherwise the result
/// is empty.
fn strip_ter(word: &str) -> SmallVec<[String; 2]> {
    let remainder = match word.strip_prefix("ter") {
        Some(rest) => Some(rest),
        None => word.strip_prefix("te"),
    };
    remainder
        .filter(|rest| rest.len() >= MIN_STEM)
        .map(String::from)
        .into_iter()
        .collect()
}
/// Removes a leading `se` from `word`, yielding the remainder when it is at
/// least `MIN_STEM` bytes long and nothing otherwise.
fn strip_se(word: &str) -> SmallVec<[String; 1]> {
    word.strip_prefix("se")
        .filter(|rest| rest.len() >= MIN_STEM)
        .map(String::from)
        .into_iter()
        .collect()
}
/// Removes a leading `ke` from `word`, yielding the remainder when it is at
/// least `MIN_STEM` bytes long and nothing otherwise.
fn strip_ke(word: &str) -> SmallVec<[String; 1]> {
    word.strip_prefix("ke")
        .filter(|rest| rest.len() >= MIN_STEM)
        .map(String::from)
        .into_iter()
        .collect()
}
/// Removes a leading `di` from `word`, yielding the remainder when it is at
/// least `MIN_STEM` bytes long and nothing otherwise.
fn strip_di(word: &str) -> SmallVec<[String; 1]> {
    word.strip_prefix("di")
        .filter(|rest| rest.len() >= MIN_STEM)
        .map(String::from)
        .into_iter()
        .collect()
}
/// Dispatches `word` to the prefix stripper selected by its first two letters
/// and keeps only the candidates accepted by `valid_stem_start`.
///
/// Words that do not begin with `me`, `pe`, `be`, `te`, `se`, `ke` or `di`
/// produce no candidates.
fn strip_any_prefix(word: &str) -> SmallVec<[String; 6]> {
    let mut stems: SmallVec<[String; 6]> = SmallVec::new();

    // All heads are two ASCII bytes, so a failed slice (too short, or cut inside
    // a multi-byte character) simply means none of them can match.
    match word.get(..2) {
        Some("me") => stems.extend(strip_me(word)),
        Some("pe") => stems.extend(strip_pe(word)),
        Some("be") => stems.extend(strip_ber(word)),
        Some("te") => stems.extend(strip_ter(word)),
        Some("se") => stems.extend(strip_se(word)),
        Some("ke") => stems.extend(strip_ke(word)),
        Some("di") => stems.extend(strip_di(word)),
        _ => {}
    }

    stems
        .into_iter()
        .filter(|stem| valid_stem_start(stem))
        .collect()
}
/// Picks a stem from `candidates` without consulting a dictionary.
///
/// Returns the first candidate of at least `MIN_STEM_DERIV` bytes; when none is
/// that long, the very first candidate; and when the list is empty, a copy of
/// `original`.
fn best_candidate(candidates: &[String], original: &str) -> String {
    match candidates.first() {
        None => original.to_string(),
        Some(fallback) => candidates
            .iter()
            .find(|candidate| candidate.len() >= MIN_STEM_DERIV)
            .unwrap_or(fallback)
            .clone(),
    }
}
/// Picks a stem from `candidates`, preferring dictionary words.
///
/// The first candidate contained in `dict` is returned; when no candidate is a
/// dictionary word the choice falls back to `best_candidate`.
fn best_candidate_dict<D: Dictionary + ?Sized>(
    candidates: &[String],
    original: &str,
    dict: &D,
) -> String {
    candidates
        .iter()
        .find(|candidate| dict.contains(candidate.as_str()))
        .cloned()
        .unwrap_or_else(|| best_candidate(candidates, original))
}
/// Word-list lookup used by the stemmer to recognise root words.
///
/// Implementations must be `Send + Sync`.
pub trait Dictionary: Send + Sync {
/// Returns `true` when `word` is present in the dictionary.
fn contains(&self, word: &str) -> bool;
/// Number of entries in the dictionary; the default implementation reports `0`.
fn size(&self) -> usize {
0
}
}
/// Empty dictionary: contains no words, so stemming relies on the
/// length-based heuristics only.
pub struct NullDict;
impl Dictionary for NullDict {
/// Always `false`; `size` keeps the trait default of `0`.
#[inline]
fn contains(&self, _word: &str) -> bool {
false
}
}
/// Dictionary backed by an `fst::Set` built over a memory-mapped file.
pub struct FstDict {
// The set borrows its bytes directly from the mapping it owns.
set: fst::Set<memmap2::Mmap>,
}
impl FstDict {
/// Opens the file at `path`, memory-maps it and interprets it as an FST set.
///
/// # Errors
///
/// Returns the underlying error when the file cannot be opened, cannot be
/// mapped, or is rejected by `fst::Set::new`.
pub fn open(path: impl AsRef<Path>) -> Result<Self, Box<dyn std::error::Error>> {
let file = std::fs::File::open(path)?;
// SAFETY: NOTE(review): mapping a file is only sound while the file is not
// modified or truncated by anyone for the lifetime of the mapping. Nothing
// here enforces that, so the dictionary file must be treated as read-only
// while a `FstDict` is alive — confirm this holds for deployments.
let mmap = unsafe { memmap2::Mmap::map(&file)? };
let set = fst::Set::new(mmap)?;
Ok(Self { set })
}
}
// Both methods delegate directly to the underlying `fst::Set`.
impl Dictionary for FstDict {
/// Returns `true` when `word` is a key of the underlying set.
#[inline]
fn contains(&self, word: &str) -> bool {
self.set.contains(word)
}
/// Number of keys in the underlying set.
#[inline]
fn size(&self) -> usize {
self.set.len()
}
}
/// Runs one round of affix stripping on `word` and returns every candidate found.
///
/// Two routes are explored: prefix first (optionally followed by a derivational
/// suffix) and derivational suffix first (optionally followed by a prefix).
/// Candidates that lost both a prefix and a suffix are listed before those
/// that lost only one of them; within each group discovery order is kept.
fn one_pass(word: &str) -> SmallVec<[String; 8]> {
    let mut both_ends: SmallVec<[String; 4]> = SmallVec::new();
    let mut one_end: SmallVec<[String; 4]> = SmallVec::new();

    // Route 1: strip a prefix, then try a suffix on each result. The
    // prefix-only form is always kept as a lower-priority candidate.
    for without_prefix in strip_any_prefix(word) {
        both_ends.extend(strip_deriv_suffix(&without_prefix));
        one_end.push(without_prefix);
    }

    // Route 2: strip a suffix, then try a prefix on the result. The
    // suffix-only form is kept only when no prefix could be removed.
    for without_suffix in strip_deriv_suffix(word) {
        let deeper = strip_any_prefix(&without_suffix);
        if deeper.is_empty() {
            one_end.push(without_suffix);
        } else {
            both_ends.extend(deeper);
        }
    }

    both_ends.into_iter().chain(one_end).collect()
}
/// Stems a single, already lower-cased `word`, consulting `dict` for root words.
///
/// Steps, in order:
/// 1. Words shorter than `MIN_STEM` bytes and dictionary words are returned as is.
/// 2. A trailing `nya` is removed; if the result is a dictionary word it is returned.
/// 3. Up to `MAX_ITER` rounds of `one_pass` are run, each time moving to the
///    candidate chosen by `best_candidate_dict`. The loop stops when no candidate
///    is produced, when the choice does not change the word, when a dictionary
///    word is reached, or when the word no longer starts with a known prefix
///    (in which case one trailing `kan`/`an` is still removed if at least
///    `MIN_STEM_DERIV` bytes remain).
/// 4. If none of the rounds changed the word, an inflectional particle is
///    removed instead.
fn ecs_stem<D: Dictionary + ?Sized>(word: &str, dict: &D) -> String {
    if word.len() < MIN_STEM {
        return word.to_string();
    }
    if dict.contains(word) {
        return word.to_string();
    }

    let base = strip_nya(word);
    if dict.contains(base) {
        return base.to_string();
    }

    let mut current = base.to_string();
    for _ in 0..MAX_ITER {
        let candidates = one_pass(&current);
        if candidates.is_empty() {
            break;
        }

        let next = best_candidate_dict(&candidates, &current, dict);
        if next == current {
            // No progress: further rounds would repeat the same choice.
            break;
        }
        current = next;

        if dict.contains(&current) {
            break;
        }

        if !has_known_prefix(&current) {
            // Nothing left to strip at the front; remove one leftover
            // derivational suffix ("kan" preferred over "an") and stop.
            let trimmed: Option<String> = current
                .strip_suffix("kan")
                .filter(|rest| rest.len() >= MIN_STEM_DERIV)
                .or_else(|| {
                    current
                        .strip_suffix("an")
                        .filter(|rest| rest.len() >= MIN_STEM_DERIV)
                })
                .map(String::from);
            if let Some(shorter) = trimmed {
                current = shorter;
            }
            break;
        }
    }

    // Particles are only removed when affix stripping achieved nothing.
    if current == base {
        let without_particle = strip_inflectional(base);
        if without_particle != base {
            return without_particle.to_string();
        }
    }
    current
}
/// Configuration holder for a stemmer.
///
/// NOTE(review): not referenced by any other code in the visible source —
/// confirm whether external callers rely on it.
#[derive(Debug, Clone, Default)]
pub struct StemmerConfig {
/// Optional path to an FST dictionary file.
pub fst_path: Option<String>,
}
/// Stemmer combining a root-word dictionary with a memoisation cache.
pub struct IndonesianStemmer {
// Dictionary consulted during stemming; `NullDict` when none was loaded.
dict: Arc<dyn Dictionary>,
// Maps lower-cased input words to their computed stems.
cache: DashMap<String, String>,
}
impl IndonesianStemmer {
    /// Creates a stemmer without a dictionary and with a cache pre-sized for
    /// 4096 entries.
    pub fn new() -> Self {
        let dict: Arc<dyn Dictionary> = Arc::new(NullDict);
        let cache = DashMap::with_capacity(4096);
        Self { dict, cache }
    }

    /// Creates a stemmer that uses the FST dictionary stored at `path`.
    ///
    /// Loading failures are not fatal: a message is written to stderr and the
    /// stemmer falls back to running without a dictionary. The cache is
    /// pre-sized for 16384 entries.
    pub fn with_fst(path: impl AsRef<Path>) -> Self {
        let location = path.as_ref();
        let dict: Arc<dyn Dictionary> = match FstDict::open(location) {
            Ok(loaded) => {
                eprintln!(
                    "[harmorp] FST dictionary loaded from {}",
                    location.display()
                );
                Arc::new(loaded)
            }
            Err(cause) => {
                eprintln!(
                    "[harmorp] FST not found ({}), running without dictionary",
                    cause
                );
                Arc::new(NullDict)
            }
        };
        let cache = DashMap::with_capacity(16384);
        Self { dict, cache }
    }

    /// Returns the stem of `word`.
    ///
    /// The word is lower-cased first; results are memoised per lower-cased form.
    pub fn stem(&self, word: &str) -> String {
        let key = word.to_lowercase();

        // Copy the cached value out so the map entry guard is released
        // before any insertion below.
        let hit = self.cache.get(&key).map(|entry| entry.value().clone());
        match hit {
            Some(stem) => stem,
            None => {
                let stem = ecs_stem(&key, self.dict.as_ref());
                self.cache.insert(key, stem.clone());
                stem
            }
        }
    }

    /// Stems every word of `words`, preserving order.
    pub fn stem_batch(&self, words: &[String]) -> Vec<String> {
        let mut stems = Vec::with_capacity(words.len());
        for word in words {
            stems.push(self.stem(word));
        }
        stems
    }

    /// Reports whether the lower-cased form of `word` is in the dictionary.
    pub fn in_dict(&self, word: &str) -> bool {
        let lowered = word.to_lowercase();
        self.dict.contains(&lowered)
    }

    /// Number of memoised words.
    pub fn cache_len(&self) -> usize {
        self.cache.len()
    }

    /// Number of entries reported by the dictionary.
    pub fn dict_size(&self) -> usize {
        self.dict.size()
    }

    /// Drops every memoised result.
    pub fn clear_cache(&self) {
        self.cache.clear();
    }
}
// The default stemmer is the dictionary-less one built by `new`.
impl Default for IndonesianStemmer {
fn default() -> Self {
Self::new()
}
}
// Python-facing wrapper, exported to Python under the class name `Stemmer`.
// Only compiled when the `python` feature is enabled.
#[cfg(feature = "python")]
#[pyclass(name = "Stemmer")]
struct PyStemmer {
// Shared handle to the Rust stemmer that does the actual work.
inner: Arc<IndonesianStemmer>,
}
// Methods exposed to Python; each one forwards to the wrapped `IndonesianStemmer`.
#[cfg(feature = "python")]
#[pymethods]
impl PyStemmer {
// Constructor: `Stemmer(fst_path=None)`. With a path the stemmer is built via
// `with_fst`, otherwise via `new` (no dictionary).
#[new]
#[pyo3(signature = (fst_path=None))]
fn new(fst_path: Option<&str>) -> Self {
let inner = match fst_path {
Some(p) => IndonesianStemmer::with_fst(p),
None => IndonesianStemmer::new(),
};
Self {
inner: Arc::new(inner),
}
}
// Stems a single word.
fn stem(&self, word: &str) -> String {
self.inner.stem(word)
}
// Stems a list of words inside `py.allow_threads`, i.e. with the GIL
// released for the duration of the batch (per pyo3's documented behavior).
fn stem_batch(&self, py: Python<'_>, words: Vec<String>) -> Vec<String> {
py.allow_threads(|| self.inner.stem_batch(&words))
}
// Dictionary membership test for a single word.
fn in_dict(&self, word: &str) -> bool {
self.inner.in_dict(word)
}
// Empties the memoisation cache of the wrapped stemmer.
fn clear_cache(&self) {
self.inner.clear_cache();
}
}
// Python module initialiser for `harmorp`: registers the `PyStemmer` class.
#[cfg(feature = "python")]
#[pymodule]
fn harmorp(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_class::<PyStemmer>()?;
Ok(())
}
// Unit tests. Every test uses the dictionary-less stemmer, so the expected
// values pin down the behavior of the length-based heuristics alone.
#[cfg(test)]
mod tests {
use super::*;
// Builds a fresh stemmer without a dictionary (and with an empty cache).
fn s() -> IndonesianStemmer {
IndonesianStemmer::new()
}
// Particles -lah / -kah / -pun are removed when no affix stripping applies.
#[test]
fn test_inflectional() {
let st = s();
assert_eq!(st.stem("bukulah"), "buku");
assert_eq!(st.stem("apakah"), "apa");
assert_eq!(st.stem("meskipun"), "meski");
assert_eq!(st.stem("biarlah"), "biar");
assert_eq!(st.stem("walaupun"), "walau");
}
// Derivational suffixes -an / -kan on words without a prefix.
#[test]
fn test_derivational() {
let st = s();
assert_eq!(st.stem("makanan"), "makan");
assert_eq!(st.stem("jatuhkan"), "jatuh");
assert_eq!(st.stem("bangunkan"), "bangun");
}
// me- family prefixes, including forms where an initial consonant is restored.
#[test]
fn test_me_prefix() {
let st = s();
assert_eq!(st.stem("membaca"), "baca");
assert_eq!(st.stem("membuat"), "buat");
assert_eq!(st.stem("memfasilitasi"), "fasilitas");
assert_eq!(st.stem("menyapu"), "sapu");
assert_eq!(st.stem("menulis"), "tulis");
assert_eq!(st.stem("mendengar"), "dengar");
assert_eq!(st.stem("memperbaiki"), "baik");
assert_eq!(st.stem("mengecil"), "kecil");
}
// pe- family prefixes, including pel- and per-.
#[test]
fn test_pe_prefix() {
let st = s();
assert_eq!(st.stem("pembaca"), "baca");
assert_eq!(st.stem("pembuat"), "buat");
assert_eq!(st.stem("penulis"), "tulis");
assert_eq!(st.stem("pelajar"), "ajar");
assert_eq!(st.stem("pekerja"), "kerja");
assert_eq!(st.stem("perumahan"), "rumah");
}
// ber- / bel- / be-, ter- and se- prefixes.
#[test]
fn test_be_te_se() {
let st = s();
assert_eq!(st.stem("bermain"), "main");
assert_eq!(st.stem("belajar"), "ajar");
assert_eq!(st.stem("bekerja"), "kerja");
assert_eq!(st.stem("tertawa"), "tawa");
assert_eq!(st.stem("seratus"), "ratus");
}
// Words carrying several affixes at once.
#[test]
fn test_complex() {
let st = s();
assert_eq!(st.stem("pembelajaran"), "ajar");
assert_eq!(st.stem("membukakan"), "buka");
assert_eq!(st.stem("mengepulangkan"), "pulang");
assert_eq!(st.stem("mempertimbangkan"), "timbang");
assert_eq!(st.stem("pengembangan"), "kembang");
assert_eq!(st.stem("berdiskusi"), "diskus");
assert_eq!(st.stem("tercatat"), "catat");
}
// Words that must come back unchanged.
#[test]
fn test_no_stripping() {
let st = s();
for w in &[
"buku", "makan", "rumah", "anak", "ayah", "kata", "yang", "untuk", "di", "ke",
] {
assert_eq!(st.stem(w), *w, "word '{}' should not be modified", w);
}
}
// Stemming the same word twice yields the same result and one cache entry.
#[test]
fn test_cache() {
let st = s();
let r1 = st.stem("membaca");
let r2 = st.stem("membaca");
assert_eq!(r1, r2);
assert_eq!(st.cache.len(), 1);
}
// Batch stemming preserves input order.
#[test]
fn test_batch() {
let st = s();
let words = vec![
"bukulah".to_string(),
"makanan".to_string(),
"bermain".to_string(),
];
let res = st.stem_batch(&words);
assert_eq!(res, vec!["buku", "makan", "main"]);
}
}