Skip to main content

kham_core/
spell.rs

1//! Thai spell correction using edit distance and phonetic ranking.
2//!
3//! [`SpellChecker`] finds words in the built-in dictionary that are close to a
4//! given input, ranked first by phonetic similarity (lk82 Soundex), then by
5//! edit distance, then by TNC corpus frequency.
6//!
7//! ```rust
8//! use kham_core::spell::SpellChecker;
9//!
10//! let checker = SpellChecker::builtin();
11//! let suggestions = checker.suggestions("กาน", 5);
12//! // All candidates are within edit distance 2
13//! assert!(suggestions.iter().all(|s| s.edit_distance <= 2));
14//! // Phonetically similar words appear first
15//! if let Some(first) = suggestions.first() {
16//!     assert!(first.soundex_match || first.edit_distance <= 1);
17//! }
18//! ```
19
20use alloc::string::String;
21use alloc::vec::Vec;
22
23use crate::dict::BUILTIN_WORDS;
24use crate::freq::FreqMap;
25use crate::segmenter::Tokenizer;
26use crate::soundex::lk82;
27use crate::token::TokenKind;
28
29// ---------------------------------------------------------------------------
30// Public types
31// ---------------------------------------------------------------------------
32
33/// A spelling suggestion returned by [`SpellChecker::suggestions`].
34///
35/// Suggestions are sorted by:
36/// 1. `soundex_match` descending — phonetically similar words first
37/// 2. `edit_distance` ascending — closer edits first
38/// 3. `freq_score` descending — more common words first
39#[derive(Debug, Clone, PartialEq, Eq)]
40pub struct Suggestion {
41    /// Candidate word from the built-in dictionary.
42    pub word: String,
43    /// Levenshtein edit distance between the input and this candidate.
44    pub edit_distance: u8,
45    /// `true` if the lk82 phonetic codes of the input and candidate match.
46    pub soundex_match: bool,
47    /// TNC frequency score; `0` if the word is not in the frequency table.
48    pub freq_score: u32,
49}
50
51/// Thai spell checker using edit distance and lk82 phonetic ranking.
52///
53/// Candidates are drawn from the built-in 62k-word dictionary and filtered to
54/// edit distance ≤ 2. Phonetically similar candidates (matching lk82 code) are
55/// ranked above purely orthographic matches.
56///
57/// # Examples
58///
59/// ```rust
60/// use kham_core::spell::SpellChecker;
61///
62/// let checker = SpellChecker::builtin();
63///
64/// // Misspelled word — expect near-miss suggestions
65/// let suggestions = checker.suggestions("กานข้าว", 5);
66/// assert!(suggestions.iter().all(|s| s.edit_distance <= 2));
67/// ```
68///
69/// Correctly spelled word — edit_distance 0 if it is in the dictionary:
70///
71/// ```rust
72/// use kham_core::spell::SpellChecker;
73///
74/// let checker = SpellChecker::builtin();
75/// let suggestions = checker.suggestions("กิน", 10);
76/// assert!(suggestions.iter().any(|s| s.word == "กิน" && s.edit_distance == 0));
77/// ```
78pub struct SpellChecker {
79    words_text: &'static str,
80    freq: FreqMap,
81    tokenizer: Tokenizer,
82}
83
84impl SpellChecker {
85    /// Create a spell checker backed by the built-in dictionary and TNC
86    /// frequency table.
87    ///
88    /// Construction loads the TNC frequency map (~106k entries) — reuse the
89    /// returned instance rather than calling `builtin()` on every query.
90    ///
91    /// # Examples
92    ///
93    /// ```rust
94    /// use kham_core::spell::SpellChecker;
95    ///
96    /// let checker = SpellChecker::builtin();
97    /// assert!(!checker.suggestions("สวัดสี", 5).is_empty());
98    /// ```
99    pub fn builtin() -> Self {
100        Self {
101            words_text: BUILTIN_WORDS,
102            freq: FreqMap::builtin(),
103            tokenizer: Tokenizer::new(),
104        }
105    }
106
107    /// Return up to `max_n` spelling suggestions for `word`.
108    ///
109    /// Only candidates with edit distance ≤ 2 are returned. Results are sorted
110    /// by phonetic match (lk82), then edit distance, then TNC frequency.
111    ///
112    /// Returns an empty `Vec` when `word` is empty or `max_n` is zero.
113    ///
114    /// # Examples
115    ///
116    /// ```rust
117    /// use kham_core::spell::SpellChecker;
118    ///
119    /// let checker = SpellChecker::builtin();
120    ///
121    /// // Empty input → no suggestions
122    /// assert!(checker.suggestions("", 5).is_empty());
123    ///
124    /// // max_n = 0 → no suggestions
125    /// assert!(checker.suggestions("กาน", 0).is_empty());
126    ///
127    /// // Results respect the distance threshold
128    /// let suggs = checker.suggestions("กิน", 10);
129    /// assert!(suggs.iter().all(|s| s.edit_distance <= 2));
130    /// ```
131    pub fn suggestions(&self, word: &str, max_n: usize) -> Vec<Suggestion> {
132        if word.is_empty() || max_n == 0 {
133            return Vec::new();
134        }
135
136        let word_chars: Vec<char> = word.chars().collect();
137        let word_len = word_chars.len();
138        let input_lk82 = lk82(word);
139
140        let mut results: Vec<Suggestion> = Vec::new();
141
142        for line in self.words_text.lines() {
143            let candidate = line.trim();
144            if candidate.is_empty() || candidate.starts_with('#') {
145                continue;
146            }
147
148            // Length pre-filter: Levenshtein distance ≤ 2 implies |len_a - len_b| ≤ 2.
149            let cand_len = candidate.chars().count();
150            if cand_len + 2 < word_len || word_len + 2 < cand_len {
151                continue;
152            }
153
154            let dist = levenshtein(&word_chars, candidate, 2);
155            if dist > 2 {
156                continue;
157            }
158
159            let soundex_match = lk82(candidate) == input_lk82;
160            let freq_score = self.freq.get(candidate);
161
162            results.push(Suggestion {
163                word: String::from(candidate),
164                edit_distance: dist as u8,
165                soundex_match,
166                freq_score,
167            });
168        }
169
170        // Sort: soundex_match DESC, edit_distance ASC, freq_score DESC
171        results.sort_unstable_by(|a, b| {
172            b.soundex_match
173                .cmp(&a.soundex_match)
174                .then(a.edit_distance.cmp(&b.edit_distance))
175                .then(b.freq_score.cmp(&a.freq_score))
176        });
177
178        results.truncate(max_n);
179        results
180    }
181
182    /// Return the single best spelling correction for `word`, or `None` if the
183    /// word appears correctly in the dictionary (edit distance 0).
184    pub fn did_you_mean(&self, word: &str) -> Option<String> {
185        match self.suggestions(word, 1).into_iter().next() {
186            Some(s) if s.edit_distance == 0 => None,
187            Some(s) => Some(s.word),
188            None => None,
189        }
190    }
191
192    /// Correct an entire Thai text by replacing Unknown tokens with their best
193    /// spelling suggestion.
194    ///
195    /// Segments `text` with the built-in tokenizer, then for every
196    /// `TokenKind::Unknown` token that is at least 2 characters long, looks up
197    /// the best suggestion and substitutes it. All other tokens (including known
198    /// Thai words, numbers, Latin, punctuation) are passed through unchanged.
199    ///
200    /// Returns the input unchanged when no Unknown tokens are found or no
201    /// correction candidates exist.
202    ///
203    /// # Example
204    /// ```rust
205    /// use kham_core::spell::SpellChecker;
206    /// let checker = SpellChecker::builtin();
207    /// // A correctly spelled sentence should come back unchanged word-for-word.
208    /// let out = checker.correct_text("กินข้าวกับปลา");
209    /// assert!(!out.is_empty());
210    /// ```
211    pub fn correct_text(&self, text: &str) -> String {
212        if text.is_empty() {
213            return String::new();
214        }
215        let tokens = self.tokenizer.segment(text);
216        let mut result = String::with_capacity(text.len());
217        for token in &tokens {
218            if token.kind == TokenKind::Unknown && token.text.chars().count() >= 2 {
219                match self.did_you_mean(token.text) {
220                    Some(correction) => result.push_str(&correction),
221                    None => result.push_str(token.text),
222                }
223            } else {
224                result.push_str(token.text);
225            }
226        }
227        result
228    }
229}
230
231// ---------------------------------------------------------------------------
232// Levenshtein distance (character-level, with early exit)
233// ---------------------------------------------------------------------------
234
235/// Compute Levenshtein distance between `a_chars` (pre-collected) and `b` (str).
236///
237/// Returns `max_dist + 1` as soon as the running minimum exceeds `max_dist`,
238/// avoiding unnecessary work for clearly dissimilar pairs.
239fn levenshtein(a_chars: &[char], b: &str, max_dist: usize) -> usize {
240    let b_chars: Vec<char> = b.chars().collect();
241    let m = a_chars.len();
242    let n = b_chars.len();
243
244    if m == 0 {
245        return n;
246    }
247    if n == 0 {
248        return m;
249    }
250
251    // Length difference is a hard lower bound on edit distance.
252    if m.abs_diff(n) > max_dist {
253        return max_dist + 1;
254    }
255
256    // Two-row DP. `prev[j]` = edit distance for a[0..i-1] vs b[0..j].
257    let mut prev: Vec<usize> = (0..=n).collect();
258    let mut curr: Vec<usize> = alloc::vec![0usize; n + 1];
259
260    for i in 1..=m {
261        curr[0] = i;
262        let mut row_min = i;
263
264        for j in 1..=n {
265            let substitution_cost = usize::from(a_chars[i - 1] != b_chars[j - 1]);
266            curr[j] = (prev[j] + 1)
267                .min(curr[j - 1] + 1)
268                .min(prev[j - 1] + substitution_cost);
269            if curr[j] < row_min {
270                row_min = curr[j];
271            }
272        }
273
274        // Every cell in this row exceeds max_dist — no path can recover.
275        if row_min > max_dist {
276            return max_dist + 1;
277        }
278
279        core::mem::swap(&mut prev, &mut curr);
280    }
281
282    prev[n]
283}
284
285// ---------------------------------------------------------------------------
286// Tests
287// ---------------------------------------------------------------------------
288
289#[cfg(test)]
290mod tests {
291    use super::*;
292
293    fn checker() -> SpellChecker {
294        SpellChecker::builtin()
295    }
296
297    #[test]
298    fn empty_input_returns_empty() {
299        assert!(checker().suggestions("", 5).is_empty());
300    }
301
302    #[test]
303    fn zero_max_n_returns_empty() {
304        assert!(checker().suggestions("กาน", 0).is_empty());
305    }
306
307    #[test]
308    fn all_results_within_threshold() {
309        let suggs = checker().suggestions("กาน", 20);
310        assert!(
311            suggs.iter().all(|s| s.edit_distance <= 2),
312            "got out-of-range result: {suggs:?}"
313        );
314    }
315
316    #[test]
317    fn respects_max_n() {
318        let suggs = checker().suggestions("กาน", 3);
319        assert!(suggs.len() <= 3);
320    }
321
322    #[test]
323    fn word_in_dict_appears_as_dist_zero() {
324        // "กิน" is definitely in the built-in dictionary.
325        let suggs = checker().suggestions("กิน", 10);
326        let exact = suggs.iter().find(|s| s.word == "กิน");
327        assert!(exact.is_some(), "expected กิน in suggestions");
328        assert_eq!(exact.unwrap().edit_distance, 0);
329    }
330
331    #[test]
332    fn sorted_soundex_first_then_distance() {
333        let suggs = checker().suggestions("กาน", 20);
334        // Verify sort order: soundex DESC, edit_distance ASC, freq DESC
335        for window in suggs.windows(2) {
336            let (a, b) = (&window[0], &window[1]);
337            let ok = (a.soundex_match & !b.soundex_match)
338                || (a.soundex_match == b.soundex_match && a.edit_distance < b.edit_distance)
339                || (a.soundex_match == b.soundex_match
340                    && a.edit_distance == b.edit_distance
341                    && a.freq_score >= b.freq_score);
342            assert!(ok, "sort order violated: {a:?} before {b:?}");
343        }
344    }
345
346    #[test]
347    fn transposition_within_distance_two() {
348        // "สวสัดี" is a character transposition of "สวัสดี" (ั and ส swapped)
349        // Standard Levenshtein counts this as 2 edits — within the threshold.
350        let suggs = checker().suggestions("สวสัดี", 5);
351        let hit = suggs.iter().find(|s| s.word == "สวัสดี");
352        assert!(
353            hit.is_some(),
354            "expected สวัสดี in suggestions for สวสัดี; got: {suggs:?}"
355        );
356        assert!(hit.unwrap().edit_distance <= 2);
357    }
358
359    #[test]
360    fn single_char_deletion_suggestion() {
361        // "กินข้า" is "กินข้าว" with the final ว deleted — edit distance 1.
362        let suggs = checker().suggestions("กินข้า", 5);
363        let hit = suggs.iter().find(|s| s.word == "กินข้าว");
364        assert!(
365            hit.is_some(),
366            "expected กินข้าว in suggestions for กินข้า; got: {suggs:?}"
367        );
368        assert_eq!(hit.unwrap().edit_distance, 1);
369    }
370
371    // Levenshtein unit tests ------------------------------------------------
372
373    fn lev(a: &str, b: &str) -> usize {
374        let a_chars: Vec<char> = a.chars().collect();
375        levenshtein(&a_chars, b, 255)
376    }
377
378    #[test]
379    fn lev_identical() {
380        assert_eq!(lev("กิน", "กิน"), 0);
381    }
382
383    #[test]
384    fn lev_single_substitution() {
385        assert_eq!(lev("กาน", "กาล"), 1);
386    }
387
388    #[test]
389    fn lev_single_deletion() {
390        assert_eq!(lev("กินข้าว", "กินข้า"), 1);
391    }
392
393    #[test]
394    fn lev_single_insertion() {
395        assert_eq!(lev("กินข้า", "กินข้าว"), 1);
396    }
397
398    #[test]
399    fn lev_transposition_is_two() {
400        // Standard Levenshtein: transposition = delete + insert = 2
401        assert_eq!(lev("สวสัดี", "สวัสดี"), 2);
402    }
403
404    #[test]
405    fn lev_empty_strings() {
406        assert_eq!(lev("", ""), 0);
407        assert_eq!(lev("กิน", ""), 3);
408        assert_eq!(lev("", "กิน"), 3);
409    }
410
411    #[test]
412    fn lev_early_exit_above_max() {
413        let a_chars: Vec<char> = "กิน".chars().collect();
414        // Distance is 5 but max_dist is 2 — should return 3 (max_dist + 1).
415        assert_eq!(levenshtein(&a_chars, "สวัสดีครับ", 2), 3);
416    }
417
418    // did_you_mean / correct_text tests ----------------------------------------
419
420    #[test]
421    fn did_you_mean_correct_word_returns_none() {
422        // "กิน" is in the built-in dictionary — edit distance 0 → None
423        assert_eq!(checker().did_you_mean("กิน"), None);
424    }
425
426    #[test]
427    fn did_you_mean_misspelled_word_returns_suggestion() {
428        // "กินข้า" is "กินข้าว" with the final ว deleted — not in the dictionary
429        let result = checker().did_you_mean("กินข้า");
430        assert!(result.is_some(), "expected a suggestion for กินข้า");
431        assert_ne!(
432            result.as_deref(),
433            Some("กินข้า"),
434            "suggestion should differ from input"
435        );
436    }
437
438    #[test]
439    fn correct_text_passthrough_clean_input() {
440        let out = checker().correct_text("กินข้าวกับปลา");
441        assert!(!out.is_empty(), "output should not be empty");
442        // No Unknown tokens expected for well-known words
443        let tokenizer = crate::segmenter::Tokenizer::new();
444        let tokens = tokenizer.segment(&out);
445        assert!(
446            tokens
447                .iter()
448                .all(|t| t.kind != crate::token::TokenKind::Unknown),
449            "expected no Unknown tokens in corrected output; got: {tokens:?}"
450        );
451    }
452}