1use alloc::string::String;
20use alloc::vec::Vec;
21
/// Raw text of the built-in stopword list, embedded into the binary at compile
/// time by `include_str!` from `../data/stopwords_th.txt` (resolved relative to
/// this source file).
///
/// The text is parsed by `StopwordSet::from_text` when `StopwordSet::builtin`
/// is called. NOTE(review): the `_th` suffix and the unit tests suggest the
/// list is Thai — confirm against the data file.
static BUILTIN_STOPWORDS: &str = include_str!("../data/stopwords_th.txt");
23
/// A set of stopwords stored as a vector of owned strings.
///
/// Instances produced by `StopwordSet::from_text` keep the vector sorted and
/// free of duplicates, which is what allows `StopwordSet::contains` to use a
/// binary search. Because of that canonical ordering, two sets built from the
/// same words compare equal regardless of the order of the input lines.
///
/// `Default` yields an empty set.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct StopwordSet {
    /// Stopwords in ascending byte-wise order, without duplicates.
    words: Vec<String>,
}
31
32impl StopwordSet {
33 pub fn builtin() -> Self {
35 Self::from_text(BUILTIN_STOPWORDS)
36 }
37
38 pub fn from_text(data: &str) -> Self {
44 let mut words: Vec<String> = data
45 .lines()
46 .map(|l| l.trim_start_matches('\u{FEFF}').trim())
47 .filter(|l| !l.is_empty() && !l.starts_with('#'))
48 .map(String::from)
49 .collect();
50 words.sort_unstable();
51 words.dedup();
52 StopwordSet { words }
53 }
54
55 #[inline]
57 pub fn contains(&self, word: &str) -> bool {
58 self.words
59 .binary_search_by(|w| w.as_str().cmp(word))
60 .is_ok()
61 }
62
63 #[inline]
65 pub fn len(&self) -> usize {
66 self.words.len()
67 }
68
69 #[inline]
71 pub fn is_empty(&self) -> bool {
72 self.words.is_empty()
73 }
74}
75
#[cfg(test)]
mod tests {
    use super::*;

    /// Loads the stopword list that is compiled into the binary.
    fn builtin_set() -> StopwordSet {
        StopwordSet::builtin()
    }

    // ---- built-in list -------------------------------------------------

    #[test]
    fn builtin_loads_without_panic() {
        // Only construction is exercised; the value itself is irrelevant.
        drop(builtin_set());
    }

    #[test]
    fn builtin_has_expected_count() {
        let count = builtin_set().len();
        assert!(count >= 1000, "expected ≥1000 stopwords, got {}", count);
    }

    #[test]
    fn common_function_words_are_stopwords() {
        let set = builtin_set();
        let function_words = ["และ", "ที่", "ของ", "ใน", "ไม่", "ได้", "กับ", "จาก"];
        for word in function_words {
            assert!(set.contains(word), "expected '{word}' to be a stopword");
        }
    }

    #[test]
    fn content_words_are_not_stopwords() {
        let set = builtin_set();
        let content_words = ["กินข้าว", "โรงพยาบาล", "คอมพิวเตอร์", "ประเทศไทย"];
        for word in content_words {
            assert!(!set.contains(word), "'{word}' should not be a stopword");
        }
    }

    #[test]
    fn empty_string_is_not_a_stopword() {
        let set = builtin_set();
        assert!(!set.contains(""));
    }

    // ---- parsing rules of `from_text` ----------------------------------

    #[test]
    fn from_text_ignores_comment_lines() {
        let parsed = StopwordSet::from_text("# comment\nกิน\nข้าว\n");
        assert_eq!(parsed.len(), 2);
        assert!(parsed.contains("กิน"));
        assert!(parsed.contains("ข้าว"));
    }

    #[test]
    fn from_text_ignores_blank_lines() {
        let parsed = StopwordSet::from_text("\nกิน\n\nข้าว\n");
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn from_text_strips_bom() {
        let parsed = StopwordSet::from_text("\u{FEFF}กิน\nข้าว\n");
        let found = parsed.contains("กิน");
        assert!(found, "BOM should be stripped before lookup");
    }

    #[test]
    fn from_text_deduplicates() {
        let parsed = StopwordSet::from_text("กิน\nกิน\nกิน\n");
        assert_eq!(parsed.len(), 1);
    }

    #[test]
    fn empty_input_produces_empty_set() {
        assert!(StopwordSet::from_text("").is_empty());
    }

    // ---- lookup semantics ----------------------------------------------

    #[test]
    fn contains_is_exact_match() {
        let parsed = StopwordSet::from_text("กิน\n");
        // The stored word itself matches ...
        assert!(parsed.contains("กิน"));
        // ... but neither a padded variant nor a longer word with the same prefix does.
        assert!(!parsed.contains("กิน "));
        assert!(!parsed.contains("กินข้าว"));
    }
}