1use alloc::string::String;
20use alloc::vec::Vec;
21
/// Raw text of the built-in stopword list, embedded into the binary at compile
/// time by `include_str!` from `../data/stopwords_th.txt` (resolved relative to
/// this source file).
///
/// The text is handed to `StopwordSet::from_text`, so it is read one entry per
/// line; blank lines and lines starting with `#` are skipped there.
// NOTE(review): the `_th` suffix and the unit tests suggest a Thai word list —
// confirm against the data file.
static BUILTIN_STOPWORDS: &str = include_str!("../data/stopwords_th.txt");
23
/// An immutable set of stopwords with exact-match lookup.
///
/// Build one with [`StopwordSet::builtin`], [`StopwordSet::from_text`] or
/// [`StopwordSet::builtin_with_extra`]; query it with
/// [`StopwordSet::contains`].
///
/// Two sets compare equal when they hold exactly the same words.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StopwordSet {
    /// The words, kept sorted and free of duplicates by every constructor so
    /// that `contains` can binary-search them.
    words: Vec<String>,
}
31
32impl StopwordSet {
33 pub fn builtin() -> Self {
35 Self::from_text(BUILTIN_STOPWORDS)
36 }
37
38 pub fn from_text(data: &str) -> Self {
44 let mut words: Vec<String> = data
45 .lines()
46 .map(|l| l.trim_start_matches('\u{FEFF}').trim())
47 .filter(|l| !l.is_empty() && !l.starts_with('#'))
48 .map(String::from)
49 .collect();
50 words.sort_unstable();
51 words.dedup();
52 StopwordSet { words }
53 }
54
55 pub fn builtin_with_extra(extra: &str) -> Self {
76 let mut words: Vec<String> = BUILTIN_STOPWORDS
77 .lines()
78 .chain(extra.lines())
79 .map(|l| l.trim_start_matches('\u{FEFF}').trim())
80 .filter(|l| !l.is_empty() && !l.starts_with('#'))
81 .map(String::from)
82 .collect();
83 words.sort_unstable();
84 words.dedup();
85 StopwordSet { words }
86 }
87
88 #[inline]
90 pub fn contains(&self, word: &str) -> bool {
91 self.words
92 .binary_search_by(|w| w.as_str().cmp(word))
93 .is_ok()
94 }
95
96 #[inline]
98 pub fn len(&self) -> usize {
99 self.words.len()
100 }
101
102 #[inline]
104 pub fn is_empty(&self) -> bool {
105 self.words.is_empty()
106 }
107}
108
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the built-in set used by most tests below.
    fn stops() -> StopwordSet {
        StopwordSet::builtin()
    }

    // ---- built-in list -------------------------------------------------

    /// Loading the embedded list must not panic.
    #[test]
    fn builtin_loads_without_panic() {
        drop(stops());
    }

    /// The embedded list is expected to hold at least 1000 entries.
    #[test]
    fn builtin_has_expected_count() {
        let count = stops().len();
        assert!(count >= 1000, "expected ≥1000 stopwords, got {}", count);
    }

    /// A sample of function words must be found in the built-in list.
    #[test]
    fn common_function_words_are_stopwords() {
        let set = stops();
        let expected = ["และ", "ที่", "ของ", "ใน", "ไม่", "ได้", "กับ", "จาก"];
        for &word in &expected {
            assert!(set.contains(word), "expected '{word}' to be a stopword");
        }
    }

    /// A sample of content words must not be found in the built-in list.
    #[test]
    fn content_words_are_not_stopwords() {
        let set = stops();
        let unexpected = ["กินข้าว", "โรงพยาบาล", "คอมพิวเตอร์", "ประเทศไทย"];
        for &word in &unexpected {
            assert!(!set.contains(word), "'{word}' should not be a stopword");
        }
    }

    /// The empty string is never reported as a stopword.
    #[test]
    fn empty_string_is_not_a_stopword() {
        let set = stops();
        assert!(!set.contains(""));
    }

    // ---- from_text -----------------------------------------------------

    /// Lines starting with `#` contribute no entries.
    #[test]
    fn from_text_ignores_comment_lines() {
        let set = StopwordSet::from_text("# comment\nกิน\nข้าว\n");
        assert!(set.contains("กิน"));
        assert!(set.contains("ข้าว"));
        assert_eq!(set.len(), 2);
    }

    /// Blank lines contribute no entries.
    #[test]
    fn from_text_ignores_blank_lines() {
        let set = StopwordSet::from_text("\nกิน\n\nข้าว\n");
        assert_eq!(set.len(), 2);
    }

    /// A leading byte-order mark must not end up inside the first word.
    #[test]
    fn from_text_strips_bom() {
        let set = StopwordSet::from_text("\u{FEFF}กิน\nข้าว\n");
        assert!(set.contains("กิน"), "BOM should be stripped before lookup");
    }

    /// Repeated lines collapse to a single entry.
    #[test]
    fn from_text_deduplicates() {
        let set = StopwordSet::from_text("กิน\nกิน\nกิน\n");
        assert_eq!(set.len(), 1);
    }

    /// Empty input yields an empty set.
    #[test]
    fn empty_input_produces_empty_set() {
        let set = StopwordSet::from_text("");
        assert!(set.is_empty());
    }

    /// Lookup matches whole words only: no trimming, no prefix matching.
    #[test]
    fn contains_is_exact_match() {
        let set = StopwordSet::from_text("กิน\n");
        assert!(set.contains("กิน"));
        assert!(!set.contains("กิน "));
        assert!(!set.contains("กินข้าว"));
    }

    // ---- builtin_with_extra --------------------------------------------

    /// Built-in words survive when extra words are supplied.
    #[test]
    fn builtin_with_extra_includes_builtin_words() {
        let set = StopwordSet::builtin_with_extra("ดาวน์โหลด\n");
        assert!(set.contains("และ"), "built-in word should be present");
        assert!(set.contains("ที่"), "built-in word should be present");
    }

    /// Every supplied extra word becomes a member.
    #[test]
    fn builtin_with_extra_includes_extra_words() {
        let set = StopwordSet::builtin_with_extra("ดาวน์โหลด\nอัปโหลด\n");
        assert!(set.contains("ดาวน์โหลด"), "extra word should be present");
        assert!(set.contains("อัปโหลด"), "extra word should be present");
    }

    /// An extra word that is already built in does not grow the set.
    #[test]
    fn builtin_with_extra_deduplicates_overlap() {
        let baseline = StopwordSet::builtin().len();
        let merged = StopwordSet::builtin_with_extra("และ\n").len();
        assert_eq!(
            merged, baseline,
            "duplicate word should not increase set size"
        );
    }

    /// With nothing extra, the result has the same size as the built-in set.
    #[test]
    fn builtin_with_extra_empty_extra_equals_builtin() {
        let plain = StopwordSet::builtin().len();
        let with_nothing = StopwordSet::builtin_with_extra("").len();
        assert_eq!(plain, with_nothing);
    }

    /// Comment lines, blank lines and duplicates in `extra` add no entries.
    #[test]
    fn builtin_with_extra_ignores_comment_and_blank_lines() {
        let baseline = StopwordSet::builtin().len();
        let merged = StopwordSet::builtin_with_extra("# comment\n\nและ\n").len();
        assert_eq!(
            merged, baseline,
            "comment/blank/duplicate should not add entries"
        );
    }
}