Skip to main content

kham_core/
stopwords.rs

1//! Thai stopword filter.
2//!
3//! [`StopwordSet`] identifies common Thai function words (particles, conjunctions,
4//! pronouns, discourse markers) that carry little lexical meaning and should be
5//! excluded from full-text search indexes.
6//!
7//! The built-in list (1 029 entries) is sourced from PyThaiNLP (Apache-2.0).
8//!
9//! # Example
10//!
11//! ```rust
12//! use kham_core::stopwords::StopwordSet;
13//!
14//! let stops = StopwordSet::builtin();
15//! assert!(stops.contains("และ"));
16//! assert!(!stops.contains("กินข้าว"));
17//! ```
18
19use alloc::string::String;
20use alloc::vec::Vec;
21
/// Raw text of the built-in stopword list, embedded into the binary at compile
/// time from `../data/stopwords_th.txt` and parsed by [`StopwordSet::from_text`].
static BUILTIN_STOPWORDS: &str = include_str!("../data/stopwords_th.txt");
23
/// A sorted, deduplicated set of stopwords supporting O(log n) lookup.
///
/// Construct once per process with [`StopwordSet::builtin`] and reuse across
/// segmentation calls.
///
/// The type implements [`Debug`], [`Clone`], [`PartialEq`] and [`Eq`] so it can
/// be logged, copied into worker state, and compared in tests. Because the
/// backing vector is always sorted and free of duplicates, `==` is true set
/// equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StopwordSet {
    /// Stopwords in ascending order with no duplicates; lookups binary-search
    /// this vector, so the ordering must be preserved by every constructor.
    words: Vec<String>,
}
31
32impl StopwordSet {
33    /// Load the built-in Thai stopword list (1 029 entries, PyThaiNLP Apache-2.0).
34    pub fn builtin() -> Self {
35        Self::from_text(BUILTIN_STOPWORDS)
36    }
37
38    /// Build a [`StopwordSet`] from a newline-separated word list.
39    ///
40    /// Lines beginning with `#` and blank lines are ignored.
41    /// BOM characters (`\u{FEFF}`) are stripped from every line.
42    /// The resulting set is sorted and deduplicated.
43    pub fn from_text(data: &str) -> Self {
44        let mut words: Vec<String> = data
45            .lines()
46            .map(|l| l.trim_start_matches('\u{FEFF}').trim())
47            .filter(|l| !l.is_empty() && !l.starts_with('#'))
48            .map(String::from)
49            .collect();
50        words.sort_unstable();
51        words.dedup();
52        StopwordSet { words }
53    }
54
55    /// Load the built-in list plus additional words from `extra`.
56    ///
57    /// `extra` uses the same format as [`from_text`]: newline-separated words,
58    /// `#` comment lines and blank lines ignored, BOM stripped.
59    /// The combined set is sorted and deduplicated.
60    ///
61    /// Use this when you have domain-specific function words to suppress in
62    /// addition to the standard Thai stopword list.
63    ///
64    /// # Example
65    ///
66    /// ```rust
67    /// use kham_core::stopwords::StopwordSet;
68    ///
69    /// let stops = StopwordSet::builtin_with_extra("ดาวน์โหลด\nอัปโหลด\n");
70    /// assert!(stops.contains("และ"));       // built-in
71    /// assert!(stops.contains("ดาวน์โหลด")); // extra
72    /// ```
73    ///
74    /// [`from_text`]: StopwordSet::from_text
75    pub fn builtin_with_extra(extra: &str) -> Self {
76        let mut words: Vec<String> = BUILTIN_STOPWORDS
77            .lines()
78            .chain(extra.lines())
79            .map(|l| l.trim_start_matches('\u{FEFF}').trim())
80            .filter(|l| !l.is_empty() && !l.starts_with('#'))
81            .map(String::from)
82            .collect();
83        words.sort_unstable();
84        words.dedup();
85        StopwordSet { words }
86    }
87
88    /// Return `true` if `word` is in the stopword set.
89    #[inline]
90    pub fn contains(&self, word: &str) -> bool {
91        self.words
92            .binary_search_by(|w| w.as_str().cmp(word))
93            .is_ok()
94    }
95
96    /// Number of stopwords in this set.
97    #[inline]
98    pub fn len(&self) -> usize {
99        self.words.len()
100    }
101
102    /// Return `true` if the set is empty.
103    #[inline]
104    pub fn is_empty(&self) -> bool {
105        self.words.is_empty()
106    }
107}
108
109// ---------------------------------------------------------------------------
110// Tests
111// ---------------------------------------------------------------------------
112
#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh copy of the built-in stopword set shared by the tests below.
    fn builtin_set() -> StopwordSet {
        StopwordSet::builtin()
    }

    /// Parsing the embedded list must complete without panicking.
    #[test]
    fn builtin_loads_without_panic() {
        drop(builtin_set());
    }

    /// The bundled list is expected to hold at least a thousand entries.
    #[test]
    fn builtin_has_expected_count() {
        let count = builtin_set().len();
        assert!(count >= 1000, "expected ≥1000 stopwords, got {}", count);
    }

    /// A sample of everyday Thai function words must all be recognised.
    #[test]
    fn common_function_words_are_stopwords() {
        let set = builtin_set();
        ["และ", "ที่", "ของ", "ใน", "ไม่", "ได้", "กับ", "จาก"]
            .iter()
            .copied()
            .for_each(|word| {
                assert!(set.contains(word), "expected '{word}' to be a stopword");
            });
    }

    /// Ordinary content words must not be filtered out.
    #[test]
    fn content_words_are_not_stopwords() {
        let set = builtin_set();
        ["กินข้าว", "โรงพยาบาล", "คอมพิวเตอร์", "ประเทศไทย"]
            .iter()
            .copied()
            .for_each(|word| {
                assert!(!set.contains(word), "'{word}' should not be a stopword");
            });
    }

    /// The empty string never counts as a stopword.
    #[test]
    fn empty_string_is_not_a_stopword() {
        let set = builtin_set();
        assert!(!set.contains(""));
    }

    /// `#` lines are skipped; only the real words are stored.
    #[test]
    fn from_text_ignores_comment_lines() {
        let parsed = StopwordSet::from_text("# comment\nกิน\nข้าว\n");
        assert!(parsed.contains("กิน"));
        assert!(parsed.contains("ข้าว"));
        assert_eq!(parsed.len(), 2);
    }

    /// Blank lines contribute no entries.
    #[test]
    fn from_text_ignores_blank_lines() {
        let parsed = StopwordSet::from_text("\nกิน\n\nข้าว\n");
        assert_eq!(parsed.len(), 2);
    }

    /// A leading byte-order mark must not become part of the first word.
    #[test]
    fn from_text_strips_bom() {
        let parsed = StopwordSet::from_text("\u{FEFF}กิน\nข้าว\n");
        assert!(parsed.contains("กิน"), "BOM should be stripped before lookup");
    }

    /// Repeated words collapse into a single entry.
    #[test]
    fn from_text_deduplicates() {
        let parsed = StopwordSet::from_text("กิน\nกิน\nกิน\n");
        assert_eq!(parsed.len(), 1);
    }

    /// No input text means no stopwords.
    #[test]
    fn empty_input_produces_empty_set() {
        assert!(StopwordSet::from_text("").is_empty());
    }

    /// Lookup is exact: no trimming and no prefix matching.
    #[test]
    fn contains_is_exact_match() {
        let parsed = StopwordSet::from_text("กิน\n");
        assert!(parsed.contains("กิน"));
        assert!(!parsed.contains("กิน "));
        assert!(!parsed.contains("กินข้าว"));
    }

    /// Adding extras must not drop anything from the built-in list.
    #[test]
    fn builtin_with_extra_includes_builtin_words() {
        let merged = StopwordSet::builtin_with_extra("ดาวน์โหลด\n");
        assert!(merged.contains("และ"), "built-in word should be present");
        assert!(merged.contains("ที่"), "built-in word should be present");
    }

    /// Every extra word is queryable afterwards.
    #[test]
    fn builtin_with_extra_includes_extra_words() {
        let merged = StopwordSet::builtin_with_extra("ดาวน์โหลด\nอัปโหลด\n");
        assert!(merged.contains("ดาวน์โหลด"), "extra word should be present");
        assert!(merged.contains("อัปโหลด"), "extra word should be present");
    }

    /// An extra word that is already built in must not be counted twice.
    #[test]
    fn builtin_with_extra_deduplicates_overlap() {
        let expected = StopwordSet::builtin().len();
        // "และ" already ships in the built-in list, so the size must stay put.
        let actual = StopwordSet::builtin_with_extra("และ\n").len();
        assert_eq!(
            actual, expected,
            "duplicate word should not increase set size"
        );
    }

    /// With no extras the result has the same size as the plain built-in set.
    #[test]
    fn builtin_with_extra_empty_extra_equals_builtin() {
        let plain = StopwordSet::builtin();
        let merged = StopwordSet::builtin_with_extra("");
        assert_eq!(plain.len(), merged.len());
    }

    /// Comments, blank lines and duplicates in `extra` add nothing.
    #[test]
    fn builtin_with_extra_ignores_comment_and_blank_lines() {
        let expected = StopwordSet::builtin().len();
        let actual = StopwordSet::builtin_with_extra("# comment\n\nและ\n").len();
        assert_eq!(
            actual, expected,
            "comment/blank/duplicate should not add entries"
        );
    }
}
238}