Skip to main content

kham_core/
synonym.rs

1//! Synonym expansion for Thai full-text search.
2//!
3//! [`SynonymMap`] maps a canonical form to a list of equivalent terms. During
4//! FTS indexing, each matched canonical token is expanded to its synonyms so
5//! that queries on any variant find the document.
6//!
7//! # Data format
8//!
9//! A tab-separated text file, one rule per line:
10//!
11//! ```text
12//! # canonical<TAB>syn1<TAB>syn2 ...
13//! คอม<TAB>คอมพิวเตอร์<TAB>computer
14//! รถไฟฟ้า<TAB>BTS<TAB>MRT<TAB>รถไฟใต้ดิน
15//! ```
16//!
17//! Lines beginning with `#` and blank lines are ignored.
18//!
19//! # Example
20//!
21//! ```rust
22//! use kham_core::synonym::SynonymMap;
23//!
24//! let map = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
25//! let expansions = map.expand("คอม").unwrap_or_default();
26//! assert!(expansions.contains(&"คอมพิวเตอร์".to_string()));
27//! ```
28
29use alloc::collections::BTreeMap;
30use alloc::string::String;
31use alloc::vec::Vec;
32
/// A canonical-form → synonym-list lookup table.
///
/// Built from tab-separated data via [`SynonymMap::from_tsv`]. Lookup is
/// O(log n) via [`BTreeMap`] (suitable for sets up to ~5 000 entries; use an
/// FST-backed map for larger corpora).
///
/// The [`Default`] value is the empty map, identical to [`SynonymMap::empty`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SynonymMap(BTreeMap<String, Vec<String>>);

impl SynonymMap {
    /// Create an empty [`SynonymMap`] (no expansions).
    pub fn empty() -> Self {
        Self::default()
    }

    /// Parse a tab-separated synonym table.
    ///
    /// Format: `canonical\tsyn1\tsyn2\t…` — one rule per line.
    ///
    /// * Lines beginning with `#` and blank lines are skipped.
    /// * Lines with fewer than two tab-separated fields are skipped silently.
    /// * Surrounding whitespace is trimmed from the canonical form and from
    ///   every synonym; fields that are empty after trimming are dropped.
    /// * Several rules for the same canonical form are merged, keeping the
    ///   synonyms in the order they appear in `data`.
    pub fn from_tsv(data: &str) -> Self {
        let mut map: BTreeMap<String, Vec<String>> = BTreeMap::new();
        for line in data.lines() {
            let line = line.trim();
            if line.is_empty() || line.starts_with('#') {
                continue;
            }

            // First field is the canonical form, the remainder holds the synonyms.
            let mut fields = line.splitn(2, '\t');
            let (canonical, rest) = match (fields.next(), fields.next()) {
                // Trim the key as well: the synonyms are trimmed below, and an
                // untrimmed key such as "word " would never match a lookup for
                // "word".
                (Some(c), Some(r)) => (c.trim(), r),
                // No tab on this line, so there is nothing to expand to.
                _ => continue,
            };
            if canonical.is_empty() {
                continue;
            }

            let synonyms: Vec<String> = rest
                .split('\t')
                .map(str::trim)
                .filter(|s| !s.is_empty())
                .map(String::from)
                .collect();
            if synonyms.is_empty() {
                continue;
            }

            map.entry(String::from(canonical))
                .or_default()
                .extend(synonyms);
        }
        SynonymMap(map)
    }

    /// Look up synonym expansions for `word`.
    ///
    /// Returns `None` if `word` has no entry. Returns the full synonym slice
    /// otherwise — the caller should index all returned strings alongside the
    /// canonical form. The match on `word` is exact (no normalisation is
    /// applied to the query).
    pub fn expand(&self, word: &str) -> Option<&[String]> {
        self.0.get(word).map(Vec::as_slice)
    }

    /// Return `true` if `word` has synonym expansions.
    #[inline]
    pub fn has_synonyms(&self, word: &str) -> bool {
        self.0.contains_key(word)
    }

    /// Number of canonical entries in the map.
    #[inline]
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Return `true` if the map has no entries.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
108
109// ---------------------------------------------------------------------------
110// Tests
111// ---------------------------------------------------------------------------
112
#[cfg(test)]
mod tests {
    use super::*;

    /// The abbreviation used as the canonical form throughout these tests.
    const KEY: &str = "คอม";

    /// A single rule mapping [`KEY`] to its long form.
    const SINGLE_RULE: &str = "คอม\tคอมพิวเตอร์\n";

    /// Build an owned `String`, keeping the assertions below short.
    fn owned(text: &str) -> String {
        String::from(text)
    }

    /// Fetch the synonyms for `word`, failing the test when there are none.
    fn synonyms_of<'a>(map: &'a SynonymMap, word: &str) -> &'a [String] {
        map.expand(word).expect("should have synonyms")
    }

    // --- Construction ------------------------------------------------------

    #[test]
    fn empty_map_returns_none() {
        let map = SynonymMap::empty();
        assert!(map.expand(KEY).is_none());
        assert!(map.is_empty());
    }

    #[test]
    fn empty_input_produces_empty_map() {
        let map = SynonymMap::from_tsv("");
        assert!(map.is_empty());
    }

    // --- Parsing -----------------------------------------------------------

    #[test]
    fn single_synonym_parsed() {
        let map = SynonymMap::from_tsv(SINGLE_RULE);
        assert_eq!(synonyms_of(&map, KEY), &[owned("คอมพิวเตอร์")]);
    }

    #[test]
    fn multiple_synonyms_parsed() {
        let map = SynonymMap::from_tsv("รถไฟฟ้า\tBTS\tMRT\tรถไฟใต้ดิน\n");
        let found = synonyms_of(&map, "รถไฟฟ้า");
        assert_eq!(found.len(), 3);
        for expected in ["BTS", "MRT", "รถไฟใต้ดิน"] {
            assert!(found.contains(&owned(expected)));
        }
    }

    #[test]
    fn comment_lines_skipped() {
        let map = SynonymMap::from_tsv("# this is a comment\nคอม\tคอมพิวเตอร์\n");
        assert_eq!(map.len(), 1);
    }

    #[test]
    fn blank_lines_skipped() {
        let map = SynonymMap::from_tsv("\n\nคอม\tคอมพิวเตอร์\n\n");
        assert_eq!(map.len(), 1);
    }

    #[test]
    fn line_without_tab_skipped() {
        let map = SynonymMap::from_tsv("คอม\n");
        assert!(map.expand(KEY).is_none());
    }

    #[test]
    fn duplicate_canonical_merges_synonyms() {
        let map = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\nคอม\tcomputer\n");
        let found = synonyms_of(&map, KEY);
        for expected in ["คอมพิวเตอร์", "computer"] {
            assert!(found.contains(&owned(expected)));
        }
    }

    #[test]
    fn whitespace_trimmed_from_synonyms() {
        let map = SynonymMap::from_tsv("คอม\t คอมพิวเตอร์ \n");
        assert_eq!(synonyms_of(&map, KEY), &[owned("คอมพิวเตอร์")]);
    }

    // --- Lookup ------------------------------------------------------------

    #[test]
    fn unknown_word_returns_none() {
        let map = SynonymMap::from_tsv(SINGLE_RULE);
        assert!(map.expand("xyz").is_none());
        assert!(!map.has_synonyms("xyz"));
    }

    #[test]
    fn has_synonyms_true_for_known_word() {
        let map = SynonymMap::from_tsv(SINGLE_RULE);
        assert!(map.has_synonyms(KEY));
    }
}