Skip to main content

kham_core/
stopwords.rs

//! Thai stopword filter.
//!
//! [`StopwordSet`] identifies common Thai function words (particles, conjunctions,
//! pronouns, discourse markers) that carry little lexical meaning and should be
//! excluded from full-text search indexes.
//!
//! The built-in list (1 029 entries) is sourced from PyThaiNLP (Apache-2.0).
//!
//! # Example
//!
//! ```rust
//! use kham_core::stopwords::StopwordSet;
//!
//! let stops = StopwordSet::builtin();
//! assert!(stops.contains("และ"));
//! assert!(!stops.contains("กินข้าว"));
//! ```
18
19use alloc::string::String;
20use alloc::vec::Vec;
21
22static BUILTIN_STOPWORDS: &str = include_str!("../data/stopwords_th.txt");
23
/// A sorted set of stopwords supporting O(log n) lookup.
///
/// Construct once per process with [`StopwordSet::builtin`] and reuse across
/// segmentation calls.
///
/// Two sets compare equal when they hold the same words: the backing vector is
/// always sorted and deduplicated, so element-wise equality is set equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StopwordSet {
    /// Stopword entries, kept sorted and deduplicated so that membership
    /// checks can use binary search.
    words: Vec<String>,
}
31
32impl StopwordSet {
33    /// Load the built-in Thai stopword list (1 029 entries, PyThaiNLP Apache-2.0).
34    pub fn builtin() -> Self {
35        Self::from_text(BUILTIN_STOPWORDS)
36    }
37
38    /// Build a [`StopwordSet`] from a newline-separated word list.
39    ///
40    /// Lines beginning with `#` and blank lines are ignored.
41    /// BOM characters (`\u{FEFF}`) are stripped from every line.
42    /// The resulting set is sorted and deduplicated.
43    pub fn from_text(data: &str) -> Self {
44        let mut words: Vec<String> = data
45            .lines()
46            .map(|l| l.trim_start_matches('\u{FEFF}').trim())
47            .filter(|l| !l.is_empty() && !l.starts_with('#'))
48            .map(String::from)
49            .collect();
50        words.sort_unstable();
51        words.dedup();
52        StopwordSet { words }
53    }
54
55    /// Return `true` if `word` is in the stopword set.
56    #[inline]
57    pub fn contains(&self, word: &str) -> bool {
58        self.words
59            .binary_search_by(|w| w.as_str().cmp(word))
60            .is_ok()
61    }
62
63    /// Number of stopwords in this set.
64    #[inline]
65    pub fn len(&self) -> usize {
66        self.words.len()
67    }
68
69    /// Return `true` if the set is empty.
70    #[inline]
71    pub fn is_empty(&self) -> bool {
72        self.words.is_empty()
73    }
74}
75
76// ---------------------------------------------------------------------------
77// Tests
78// ---------------------------------------------------------------------------
79
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: the set built from the embedded stopword list.
    fn builtin_set() -> StopwordSet {
        StopwordSet::builtin()
    }

    // --- built-in list -----------------------------------------------------

    /// Constructing the built-in set must not panic.
    #[test]
    fn builtin_loads_without_panic() {
        let _unused = builtin_set();
    }

    /// The embedded list should hold at least a thousand entries.
    #[test]
    fn builtin_has_expected_count() {
        let count = builtin_set().len();
        assert!(count >= 1000, "expected ≥1000 stopwords, got {}", count);
    }

    /// Frequent Thai function words are present in the built-in list.
    #[test]
    fn common_function_words_are_stopwords() {
        const FUNCTION_WORDS: [&str; 8] =
            ["และ", "ที่", "ของ", "ใน", "ไม่", "ได้", "กับ", "จาก"];

        let set = builtin_set();
        for word in FUNCTION_WORDS {
            assert!(set.contains(word), "expected '{word}' to be a stopword");
        }
    }

    /// Ordinary content words are absent from the built-in list.
    #[test]
    fn content_words_are_not_stopwords() {
        const CONTENT_WORDS: [&str; 4] =
            ["กินข้าว", "โรงพยาบาล", "คอมพิวเตอร์", "ประเทศไทย"];

        let set = builtin_set();
        for word in CONTENT_WORDS {
            assert!(!set.contains(word), "'{word}' should not be a stopword");
        }
    }

    /// The empty string never counts as a stopword.
    #[test]
    fn empty_string_is_not_a_stopword() {
        let set = builtin_set();
        assert!(!set.contains(""));
    }

    // --- from_text parsing ---------------------------------------------------

    /// `#`-prefixed lines are skipped; the remaining words are kept.
    #[test]
    fn from_text_ignores_comment_lines() {
        let set = StopwordSet::from_text("# comment\nกิน\nข้าว\n");
        assert_eq!(set.len(), 2);
        assert!(set.contains("กิน"));
        assert!(set.contains("ข้าว"));
    }

    /// Blank lines contribute no entries.
    #[test]
    fn from_text_ignores_blank_lines() {
        let set = StopwordSet::from_text("\nกิน\n\nข้าว\n");
        assert_eq!(set.len(), 2);
    }

    /// A leading byte-order mark does not end up inside the stored word.
    #[test]
    fn from_text_strips_bom() {
        let set = StopwordSet::from_text("\u{FEFF}กิน\nข้าว\n");
        assert!(set.contains("กิน"), "BOM should be stripped before lookup");
    }

    /// Repeated words collapse to a single entry.
    #[test]
    fn from_text_deduplicates() {
        let set = StopwordSet::from_text("กิน\nกิน\nกิน\n");
        assert_eq!(set.len(), 1);
    }

    /// Empty input yields an empty set.
    #[test]
    fn empty_input_produces_empty_set() {
        let set = StopwordSet::from_text("");
        assert!(set.is_empty());
    }

    // --- lookup --------------------------------------------------------------

    /// Lookup matches whole words only: no trimming and no prefix matching.
    #[test]
    fn contains_is_exact_match() {
        let set = StopwordSet::from_text("กิน\n");
        assert!(set.contains("กิน"));
        assert!(!set.contains("กิน "));
        assert!(!set.contains("กินข้าว"));
    }
}