Skip to main content

kham_core/
spell.rs

1//! Thai spell correction using edit distance and phonetic ranking.
2//!
3//! [`SpellChecker`] finds words in the built-in dictionary that are close to a
4//! given input, ranked first by phonetic similarity (lk82 Soundex), then by
5//! edit distance, then by TNC corpus frequency.
6//!
7//! ```rust
8//! use kham_core::spell::SpellChecker;
9//!
10//! let checker = SpellChecker::builtin();
11//! let suggestions = checker.suggestions("กาน", 5);
12//! // All candidates are within edit distance 2
13//! assert!(suggestions.iter().all(|s| s.edit_distance <= 2));
14//! // Phonetically similar words appear first
15//! if let Some(first) = suggestions.first() {
16//!     assert!(first.soundex_match || first.edit_distance <= 1);
17//! }
18//! ```
19
20use alloc::string::String;
21use alloc::vec::Vec;
22
23use crate::dict::BUILTIN_WORDS;
24use crate::freq::FreqMap;
25use crate::soundex::lk82;
26
27// ---------------------------------------------------------------------------
28// Public types
29// ---------------------------------------------------------------------------
30
31/// A spelling suggestion returned by [`SpellChecker::suggestions`].
32///
33/// Suggestions are sorted by:
34/// 1. `soundex_match` descending — phonetically similar words first
35/// 2. `edit_distance` ascending — closer edits first
36/// 3. `freq_score` descending — more common words first
37#[derive(Debug, Clone, PartialEq, Eq)]
38pub struct Suggestion {
39    /// Candidate word from the built-in dictionary.
40    pub word: String,
41    /// Levenshtein edit distance between the input and this candidate.
42    pub edit_distance: u8,
43    /// `true` if the lk82 phonetic codes of the input and candidate match.
44    pub soundex_match: bool,
45    /// TNC frequency score; `0` if the word is not in the frequency table.
46    pub freq_score: u32,
47}
48
49/// Thai spell checker using edit distance and lk82 phonetic ranking.
50///
51/// Candidates are drawn from the built-in 62k-word dictionary and filtered to
52/// edit distance ≤ 2. Phonetically similar candidates (matching lk82 code) are
53/// ranked above purely orthographic matches.
54///
55/// # Examples
56///
57/// ```rust
58/// use kham_core::spell::SpellChecker;
59///
60/// let checker = SpellChecker::builtin();
61///
62/// // Misspelled word — expect near-miss suggestions
63/// let suggestions = checker.suggestions("กานข้าว", 5);
64/// assert!(suggestions.iter().all(|s| s.edit_distance <= 2));
65/// ```
66///
67/// Correctly spelled word — edit_distance 0 if it is in the dictionary:
68///
69/// ```rust
70/// use kham_core::spell::SpellChecker;
71///
72/// let checker = SpellChecker::builtin();
73/// let suggestions = checker.suggestions("กิน", 10);
74/// assert!(suggestions.iter().any(|s| s.word == "กิน" && s.edit_distance == 0));
75/// ```
76pub struct SpellChecker {
77    words_text: &'static str,
78    freq: FreqMap,
79}
80
81impl SpellChecker {
82    /// Create a spell checker backed by the built-in dictionary and TNC
83    /// frequency table.
84    ///
85    /// Construction loads the TNC frequency map (~106k entries) — reuse the
86    /// returned instance rather than calling `builtin()` on every query.
87    ///
88    /// # Examples
89    ///
90    /// ```rust
91    /// use kham_core::spell::SpellChecker;
92    ///
93    /// let checker = SpellChecker::builtin();
94    /// assert!(!checker.suggestions("สวัดสี", 5).is_empty());
95    /// ```
96    pub fn builtin() -> Self {
97        Self {
98            words_text: BUILTIN_WORDS,
99            freq: FreqMap::builtin(),
100        }
101    }
102
103    /// Return up to `max_n` spelling suggestions for `word`.
104    ///
105    /// Only candidates with edit distance ≤ 2 are returned. Results are sorted
106    /// by phonetic match (lk82), then edit distance, then TNC frequency.
107    ///
108    /// Returns an empty `Vec` when `word` is empty or `max_n` is zero.
109    ///
110    /// # Examples
111    ///
112    /// ```rust
113    /// use kham_core::spell::SpellChecker;
114    ///
115    /// let checker = SpellChecker::builtin();
116    ///
117    /// // Empty input → no suggestions
118    /// assert!(checker.suggestions("", 5).is_empty());
119    ///
120    /// // max_n = 0 → no suggestions
121    /// assert!(checker.suggestions("กาน", 0).is_empty());
122    ///
123    /// // Results respect the distance threshold
124    /// let suggs = checker.suggestions("กิน", 10);
125    /// assert!(suggs.iter().all(|s| s.edit_distance <= 2));
126    /// ```
127    pub fn suggestions(&self, word: &str, max_n: usize) -> Vec<Suggestion> {
128        if word.is_empty() || max_n == 0 {
129            return Vec::new();
130        }
131
132        let word_chars: Vec<char> = word.chars().collect();
133        let word_len = word_chars.len();
134        let input_lk82 = lk82(word);
135
136        let mut results: Vec<Suggestion> = Vec::new();
137
138        for line in self.words_text.lines() {
139            let candidate = line.trim();
140            if candidate.is_empty() || candidate.starts_with('#') {
141                continue;
142            }
143
144            // Length pre-filter: Levenshtein distance ≤ 2 implies |len_a - len_b| ≤ 2.
145            let cand_len = candidate.chars().count();
146            if cand_len + 2 < word_len || word_len + 2 < cand_len {
147                continue;
148            }
149
150            let dist = levenshtein(&word_chars, candidate, 2);
151            if dist > 2 {
152                continue;
153            }
154
155            let soundex_match = lk82(candidate) == input_lk82;
156            let freq_score = self.freq.get(candidate);
157
158            results.push(Suggestion {
159                word: String::from(candidate),
160                edit_distance: dist as u8,
161                soundex_match,
162                freq_score,
163            });
164        }
165
166        // Sort: soundex_match DESC, edit_distance ASC, freq_score DESC
167        results.sort_unstable_by(|a, b| {
168            b.soundex_match
169                .cmp(&a.soundex_match)
170                .then(a.edit_distance.cmp(&b.edit_distance))
171                .then(b.freq_score.cmp(&a.freq_score))
172        });
173
174        results.truncate(max_n);
175        results
176    }
177}
178
179// ---------------------------------------------------------------------------
180// Levenshtein distance (character-level, with early exit)
181// ---------------------------------------------------------------------------
182
183/// Compute Levenshtein distance between `a_chars` (pre-collected) and `b` (str).
184///
185/// Returns `max_dist + 1` as soon as the running minimum exceeds `max_dist`,
186/// avoiding unnecessary work for clearly dissimilar pairs.
187fn levenshtein(a_chars: &[char], b: &str, max_dist: usize) -> usize {
188    let b_chars: Vec<char> = b.chars().collect();
189    let m = a_chars.len();
190    let n = b_chars.len();
191
192    if m == 0 {
193        return n;
194    }
195    if n == 0 {
196        return m;
197    }
198
199    // Length difference is a hard lower bound on edit distance.
200    if m.abs_diff(n) > max_dist {
201        return max_dist + 1;
202    }
203
204    // Two-row DP. `prev[j]` = edit distance for a[0..i-1] vs b[0..j].
205    let mut prev: Vec<usize> = (0..=n).collect();
206    let mut curr: Vec<usize> = alloc::vec![0usize; n + 1];
207
208    for i in 1..=m {
209        curr[0] = i;
210        let mut row_min = i;
211
212        for j in 1..=n {
213            let substitution_cost = usize::from(a_chars[i - 1] != b_chars[j - 1]);
214            curr[j] = (prev[j] + 1)
215                .min(curr[j - 1] + 1)
216                .min(prev[j - 1] + substitution_cost);
217            if curr[j] < row_min {
218                row_min = curr[j];
219            }
220        }
221
222        // Every cell in this row exceeds max_dist — no path can recover.
223        if row_min > max_dist {
224            return max_dist + 1;
225        }
226
227        core::mem::swap(&mut prev, &mut curr);
228    }
229
230    prev[n]
231}
232
233// ---------------------------------------------------------------------------
234// Tests
235// ---------------------------------------------------------------------------
236
237#[cfg(test)]
238mod tests {
239    use super::*;
240
241    fn checker() -> SpellChecker {
242        SpellChecker::builtin()
243    }
244
245    #[test]
246    fn empty_input_returns_empty() {
247        assert!(checker().suggestions("", 5).is_empty());
248    }
249
250    #[test]
251    fn zero_max_n_returns_empty() {
252        assert!(checker().suggestions("กาน", 0).is_empty());
253    }
254
255    #[test]
256    fn all_results_within_threshold() {
257        let suggs = checker().suggestions("กาน", 20);
258        assert!(
259            suggs.iter().all(|s| s.edit_distance <= 2),
260            "got out-of-range result: {suggs:?}"
261        );
262    }
263
264    #[test]
265    fn respects_max_n() {
266        let suggs = checker().suggestions("กาน", 3);
267        assert!(suggs.len() <= 3);
268    }
269
270    #[test]
271    fn word_in_dict_appears_as_dist_zero() {
272        // "กิน" is definitely in the built-in dictionary.
273        let suggs = checker().suggestions("กิน", 10);
274        let exact = suggs.iter().find(|s| s.word == "กิน");
275        assert!(exact.is_some(), "expected กิน in suggestions");
276        assert_eq!(exact.unwrap().edit_distance, 0);
277    }
278
279    #[test]
280    fn sorted_soundex_first_then_distance() {
281        let suggs = checker().suggestions("กาน", 20);
282        // Verify sort order: soundex DESC, edit_distance ASC, freq DESC
283        for window in suggs.windows(2) {
284            let (a, b) = (&window[0], &window[1]);
285            let ok = (a.soundex_match & !b.soundex_match)
286                || (a.soundex_match == b.soundex_match && a.edit_distance < b.edit_distance)
287                || (a.soundex_match == b.soundex_match
288                    && a.edit_distance == b.edit_distance
289                    && a.freq_score >= b.freq_score);
290            assert!(ok, "sort order violated: {a:?} before {b:?}");
291        }
292    }
293
294    #[test]
295    fn transposition_within_distance_two() {
296        // "สวสัดี" is a character transposition of "สวัสดี" (ั and ส swapped)
297        // Standard Levenshtein counts this as 2 edits — within the threshold.
298        let suggs = checker().suggestions("สวสัดี", 5);
299        let hit = suggs.iter().find(|s| s.word == "สวัสดี");
300        assert!(
301            hit.is_some(),
302            "expected สวัสดี in suggestions for สวสัดี; got: {suggs:?}"
303        );
304        assert!(hit.unwrap().edit_distance <= 2);
305    }
306
307    #[test]
308    fn single_char_deletion_suggestion() {
309        // "กินข้า" is "กินข้าว" with the final ว deleted — edit distance 1.
310        let suggs = checker().suggestions("กินข้า", 5);
311        let hit = suggs.iter().find(|s| s.word == "กินข้าว");
312        assert!(
313            hit.is_some(),
314            "expected กินข้าว in suggestions for กินข้า; got: {suggs:?}"
315        );
316        assert_eq!(hit.unwrap().edit_distance, 1);
317    }
318
319    // Levenshtein unit tests ------------------------------------------------
320
321    fn lev(a: &str, b: &str) -> usize {
322        let a_chars: Vec<char> = a.chars().collect();
323        levenshtein(&a_chars, b, 255)
324    }
325
326    #[test]
327    fn lev_identical() {
328        assert_eq!(lev("กิน", "กิน"), 0);
329    }
330
331    #[test]
332    fn lev_single_substitution() {
333        assert_eq!(lev("กาน", "กาล"), 1);
334    }
335
336    #[test]
337    fn lev_single_deletion() {
338        assert_eq!(lev("กินข้าว", "กินข้า"), 1);
339    }
340
341    #[test]
342    fn lev_single_insertion() {
343        assert_eq!(lev("กินข้า", "กินข้าว"), 1);
344    }
345
346    #[test]
347    fn lev_transposition_is_two() {
348        // Standard Levenshtein: transposition = delete + insert = 2
349        assert_eq!(lev("สวสัดี", "สวัสดี"), 2);
350    }
351
352    #[test]
353    fn lev_empty_strings() {
354        assert_eq!(lev("", ""), 0);
355        assert_eq!(lev("กิน", ""), 3);
356        assert_eq!(lev("", "กิน"), 3);
357    }
358
359    #[test]
360    fn lev_early_exit_above_max() {
361        let a_chars: Vec<char> = "กิน".chars().collect();
362        // Distance is 5 but max_dist is 2 — should return 3 (max_dist + 1).
363        assert_eq!(levenshtein(&a_chars, "สวัสดีครับ", 2), 3);
364    }
365}