use alloc::string::String;
use alloc::vec::Vec;
/// Raw text of the built-in stopword list, embedded into the binary at compile
/// time from `../data/stopwords_th.txt` (resolved relative to this source file).
///
/// The text is not parsed here; `StopwordSet::builtin` hands it to
/// `StopwordSet::from_text`, which defines the accepted line format.
static BUILTIN_STOPWORDS: &str = include_str!("../data/stopwords_th.txt");
/// An immutable set of stopwords that supports exact-match membership tests.
///
/// Instances are normally created with `StopwordSet::builtin` or
/// `StopwordSet::from_text`; `Default` yields an empty set.
///
/// Equality compares the stored word lists, which is set equality as long as
/// both sides uphold the sorted-and-deduplicated invariant described on the
/// field below.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct StopwordSet {
    /// Stopwords in ascending (byte-wise `String`) order with no duplicates.
    ///
    /// Lookups use binary search, so every constructor must keep this vector
    /// sorted and deduplicated.
    words: Vec<String>,
}
impl StopwordSet {
pub fn builtin() -> Self {
Self::from_text(BUILTIN_STOPWORDS)
}
pub fn from_text(data: &str) -> Self {
let mut words: Vec<String> = data
.lines()
.map(|l| l.trim_start_matches('\u{FEFF}').trim())
.filter(|l| !l.is_empty() && !l.starts_with('#'))
.map(String::from)
.collect();
words.sort_unstable();
words.dedup();
StopwordSet { words }
}
#[inline]
pub fn contains(&self, word: &str) -> bool {
self.words
.binary_search_by(|w| w.as_str().cmp(word))
.is_ok()
}
#[inline]
pub fn len(&self) -> usize {
self.words.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.words.is_empty()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared constructor for the tests that exercise the embedded word list.
    fn builtin_set() -> StopwordSet {
        StopwordSet::builtin()
    }

    // ---- Built-in list -----------------------------------------------------

    /// Parsing the embedded list must complete without panicking.
    #[test]
    fn builtin_loads_without_panic() {
        let _ = builtin_set();
    }

    /// Guards against the embedded data file being truncated or emptied.
    #[test]
    fn builtin_has_expected_count() {
        let count = builtin_set().len();
        assert!(count >= 1000, "expected ≥1000 stopwords, got {}", count);
    }

    /// A sample of words that the embedded list is expected to contain.
    #[test]
    fn common_function_words_are_stopwords() {
        let set = builtin_set();
        let expected_present = ["และ", "ที่", "ของ", "ใน", "ไม่", "ได้", "กับ", "จาก"];
        for word in expected_present {
            assert!(set.contains(word), "expected '{word}' to be a stopword");
        }
    }

    /// A sample of words that the embedded list is expected NOT to contain.
    #[test]
    fn content_words_are_not_stopwords() {
        let set = builtin_set();
        let expected_absent = ["กินข้าว", "โรงพยาบาล", "คอมพิวเตอร์", "ประเทศไทย"];
        for word in expected_absent {
            assert!(!set.contains(word), "'{word}' should not be a stopword");
        }
    }

    /// Blank lines are dropped while parsing, so "" can never be a member.
    #[test]
    fn empty_string_is_not_a_stopword() {
        let set = builtin_set();
        assert!(!set.contains(""));
    }

    // ---- Parsing rules of `from_text` --------------------------------------

    /// Lines starting with `#` are skipped and do not count as entries.
    #[test]
    fn from_text_ignores_comment_lines() {
        let parsed = StopwordSet::from_text("# comment\nกิน\nข้าว\n");
        assert!(parsed.contains("กิน"));
        assert!(parsed.contains("ข้าว"));
        assert_eq!(parsed.len(), 2);
    }

    /// Empty lines between entries are skipped.
    #[test]
    fn from_text_ignores_blank_lines() {
        let parsed = StopwordSet::from_text("\nกิน\n\nข้าว\n");
        assert_eq!(parsed.len(), 2);
    }

    /// A leading byte-order mark must not become part of the first word.
    #[test]
    fn from_text_strips_bom() {
        let parsed = StopwordSet::from_text("\u{FEFF}กิน\nข้าว\n");
        assert!(parsed.contains("กิน"), "BOM should be stripped before lookup");
    }

    /// Repeated entries collapse into a single member.
    #[test]
    fn from_text_deduplicates() {
        let parsed = StopwordSet::from_text("กิน\nกิน\nกิน\n");
        assert_eq!(parsed.len(), 1);
    }

    /// Parsing an empty string yields an empty set.
    #[test]
    fn empty_input_produces_empty_set() {
        let parsed = StopwordSet::from_text("");
        assert!(parsed.is_empty());
    }

    // ---- Lookup semantics --------------------------------------------------

    /// Lookups neither trim the query nor match on prefixes.
    #[test]
    fn contains_is_exact_match() {
        let parsed = StopwordSet::from_text("กิน\n");
        assert!(parsed.contains("กิน"));
        assert!(!parsed.contains("กิน "));
        assert!(!parsed.contains("กินข้าว"));
    }
}