kham_core/spell.rs
1//! Thai spell correction using edit distance and phonetic ranking.
2//!
3//! [`SpellChecker`] finds words in the built-in dictionary that are close to a
4//! given input, ranked first by phonetic similarity (lk82 Soundex), then by
5//! edit distance, then by TNC corpus frequency.
6//!
7//! ```rust
8//! use kham_core::spell::SpellChecker;
9//!
10//! let checker = SpellChecker::builtin();
11//! let suggestions = checker.suggestions("กาน", 5);
12//! // All candidates are within edit distance 2
13//! assert!(suggestions.iter().all(|s| s.edit_distance <= 2));
14//! // Phonetically similar words appear first
15//! if let Some(first) = suggestions.first() {
16//! assert!(first.soundex_match || first.edit_distance <= 1);
17//! }
18//! ```
19
20use alloc::string::String;
21use alloc::vec::Vec;
22
23use crate::dict::BUILTIN_WORDS;
24use crate::freq::FreqMap;
25use crate::soundex::lk82;
26
27// ---------------------------------------------------------------------------
28// Public types
29// ---------------------------------------------------------------------------
30
31/// A spelling suggestion returned by [`SpellChecker::suggestions`].
32///
33/// Suggestions are sorted by:
34/// 1. `soundex_match` descending — phonetically similar words first
35/// 2. `edit_distance` ascending — closer edits first
36/// 3. `freq_score` descending — more common words first
37#[derive(Debug, Clone, PartialEq, Eq)]
38pub struct Suggestion {
39 /// Candidate word from the built-in dictionary.
40 pub word: String,
41 /// Levenshtein edit distance between the input and this candidate.
42 pub edit_distance: u8,
43 /// `true` if the lk82 phonetic codes of the input and candidate match.
44 pub soundex_match: bool,
45 /// TNC frequency score; `0` if the word is not in the frequency table.
46 pub freq_score: u32,
47}
48
49/// Thai spell checker using edit distance and lk82 phonetic ranking.
50///
51/// Candidates are drawn from the built-in 62k-word dictionary and filtered to
52/// edit distance ≤ 2. Phonetically similar candidates (matching lk82 code) are
53/// ranked above purely orthographic matches.
54///
55/// # Examples
56///
57/// ```rust
58/// use kham_core::spell::SpellChecker;
59///
60/// let checker = SpellChecker::builtin();
61///
62/// // Misspelled word — expect near-miss suggestions
63/// let suggestions = checker.suggestions("กานข้าว", 5);
64/// assert!(suggestions.iter().all(|s| s.edit_distance <= 2));
65/// ```
66///
67/// Correctly spelled word — edit_distance 0 if it is in the dictionary:
68///
69/// ```rust
70/// use kham_core::spell::SpellChecker;
71///
72/// let checker = SpellChecker::builtin();
73/// let suggestions = checker.suggestions("กิน", 10);
74/// assert!(suggestions.iter().any(|s| s.word == "กิน" && s.edit_distance == 0));
75/// ```
76pub struct SpellChecker {
77 words_text: &'static str,
78 freq: FreqMap,
79}
80
81impl SpellChecker {
82 /// Create a spell checker backed by the built-in dictionary and TNC
83 /// frequency table.
84 ///
85 /// Construction loads the TNC frequency map (~106k entries) — reuse the
86 /// returned instance rather than calling `builtin()` on every query.
87 ///
88 /// # Examples
89 ///
90 /// ```rust
91 /// use kham_core::spell::SpellChecker;
92 ///
93 /// let checker = SpellChecker::builtin();
94 /// assert!(!checker.suggestions("สวัดสี", 5).is_empty());
95 /// ```
96 pub fn builtin() -> Self {
97 Self {
98 words_text: BUILTIN_WORDS,
99 freq: FreqMap::builtin(),
100 }
101 }
102
103 /// Return up to `max_n` spelling suggestions for `word`.
104 ///
105 /// Only candidates with edit distance ≤ 2 are returned. Results are sorted
106 /// by phonetic match (lk82), then edit distance, then TNC frequency.
107 ///
108 /// Returns an empty `Vec` when `word` is empty or `max_n` is zero.
109 ///
110 /// # Examples
111 ///
112 /// ```rust
113 /// use kham_core::spell::SpellChecker;
114 ///
115 /// let checker = SpellChecker::builtin();
116 ///
117 /// // Empty input → no suggestions
118 /// assert!(checker.suggestions("", 5).is_empty());
119 ///
120 /// // max_n = 0 → no suggestions
121 /// assert!(checker.suggestions("กาน", 0).is_empty());
122 ///
123 /// // Results respect the distance threshold
124 /// let suggs = checker.suggestions("กิน", 10);
125 /// assert!(suggs.iter().all(|s| s.edit_distance <= 2));
126 /// ```
127 pub fn suggestions(&self, word: &str, max_n: usize) -> Vec<Suggestion> {
128 if word.is_empty() || max_n == 0 {
129 return Vec::new();
130 }
131
132 let word_chars: Vec<char> = word.chars().collect();
133 let word_len = word_chars.len();
134 let input_lk82 = lk82(word);
135
136 let mut results: Vec<Suggestion> = Vec::new();
137
138 for line in self.words_text.lines() {
139 let candidate = line.trim();
140 if candidate.is_empty() || candidate.starts_with('#') {
141 continue;
142 }
143
144 // Length pre-filter: Levenshtein distance ≤ 2 implies |len_a - len_b| ≤ 2.
145 let cand_len = candidate.chars().count();
146 if cand_len + 2 < word_len || word_len + 2 < cand_len {
147 continue;
148 }
149
150 let dist = levenshtein(&word_chars, candidate, 2);
151 if dist > 2 {
152 continue;
153 }
154
155 let soundex_match = lk82(candidate) == input_lk82;
156 let freq_score = self.freq.get(candidate);
157
158 results.push(Suggestion {
159 word: String::from(candidate),
160 edit_distance: dist as u8,
161 soundex_match,
162 freq_score,
163 });
164 }
165
166 // Sort: soundex_match DESC, edit_distance ASC, freq_score DESC
167 results.sort_unstable_by(|a, b| {
168 b.soundex_match
169 .cmp(&a.soundex_match)
170 .then(a.edit_distance.cmp(&b.edit_distance))
171 .then(b.freq_score.cmp(&a.freq_score))
172 });
173
174 results.truncate(max_n);
175 results
176 }
177}
178
179// ---------------------------------------------------------------------------
180// Levenshtein distance (character-level, with early exit)
181// ---------------------------------------------------------------------------
182
183/// Compute Levenshtein distance between `a_chars` (pre-collected) and `b` (str).
184///
185/// Returns `max_dist + 1` as soon as the running minimum exceeds `max_dist`,
186/// avoiding unnecessary work for clearly dissimilar pairs.
187fn levenshtein(a_chars: &[char], b: &str, max_dist: usize) -> usize {
188 let b_chars: Vec<char> = b.chars().collect();
189 let m = a_chars.len();
190 let n = b_chars.len();
191
192 if m == 0 {
193 return n;
194 }
195 if n == 0 {
196 return m;
197 }
198
199 // Length difference is a hard lower bound on edit distance.
200 if m.abs_diff(n) > max_dist {
201 return max_dist + 1;
202 }
203
204 // Two-row DP. `prev[j]` = edit distance for a[0..i-1] vs b[0..j].
205 let mut prev: Vec<usize> = (0..=n).collect();
206 let mut curr: Vec<usize> = alloc::vec![0usize; n + 1];
207
208 for i in 1..=m {
209 curr[0] = i;
210 let mut row_min = i;
211
212 for j in 1..=n {
213 let substitution_cost = usize::from(a_chars[i - 1] != b_chars[j - 1]);
214 curr[j] = (prev[j] + 1)
215 .min(curr[j - 1] + 1)
216 .min(prev[j - 1] + substitution_cost);
217 if curr[j] < row_min {
218 row_min = curr[j];
219 }
220 }
221
222 // Every cell in this row exceeds max_dist — no path can recover.
223 if row_min > max_dist {
224 return max_dist + 1;
225 }
226
227 core::mem::swap(&mut prev, &mut curr);
228 }
229
230 prev[n]
231}
232
233// ---------------------------------------------------------------------------
234// Tests
235// ---------------------------------------------------------------------------
236
237#[cfg(test)]
238mod tests {
239 use super::*;
240
241 fn checker() -> SpellChecker {
242 SpellChecker::builtin()
243 }
244
245 #[test]
246 fn empty_input_returns_empty() {
247 assert!(checker().suggestions("", 5).is_empty());
248 }
249
250 #[test]
251 fn zero_max_n_returns_empty() {
252 assert!(checker().suggestions("กาน", 0).is_empty());
253 }
254
255 #[test]
256 fn all_results_within_threshold() {
257 let suggs = checker().suggestions("กาน", 20);
258 assert!(
259 suggs.iter().all(|s| s.edit_distance <= 2),
260 "got out-of-range result: {suggs:?}"
261 );
262 }
263
264 #[test]
265 fn respects_max_n() {
266 let suggs = checker().suggestions("กาน", 3);
267 assert!(suggs.len() <= 3);
268 }
269
270 #[test]
271 fn word_in_dict_appears_as_dist_zero() {
272 // "กิน" is definitely in the built-in dictionary.
273 let suggs = checker().suggestions("กิน", 10);
274 let exact = suggs.iter().find(|s| s.word == "กิน");
275 assert!(exact.is_some(), "expected กิน in suggestions");
276 assert_eq!(exact.unwrap().edit_distance, 0);
277 }
278
279 #[test]
280 fn sorted_soundex_first_then_distance() {
281 let suggs = checker().suggestions("กาน", 20);
282 // Verify sort order: soundex DESC, edit_distance ASC, freq DESC
283 for window in suggs.windows(2) {
284 let (a, b) = (&window[0], &window[1]);
285 let ok = (a.soundex_match & !b.soundex_match)
286 || (a.soundex_match == b.soundex_match && a.edit_distance < b.edit_distance)
287 || (a.soundex_match == b.soundex_match
288 && a.edit_distance == b.edit_distance
289 && a.freq_score >= b.freq_score);
290 assert!(ok, "sort order violated: {a:?} before {b:?}");
291 }
292 }
293
294 #[test]
295 fn transposition_within_distance_two() {
296 // "สวสัดี" is a character transposition of "สวัสดี" (ั and ส swapped)
297 // Standard Levenshtein counts this as 2 edits — within the threshold.
298 let suggs = checker().suggestions("สวสัดี", 5);
299 let hit = suggs.iter().find(|s| s.word == "สวัสดี");
300 assert!(
301 hit.is_some(),
302 "expected สวัสดี in suggestions for สวสัดี; got: {suggs:?}"
303 );
304 assert!(hit.unwrap().edit_distance <= 2);
305 }
306
307 #[test]
308 fn single_char_deletion_suggestion() {
309 // "กินข้า" is "กินข้าว" with the final ว deleted — edit distance 1.
310 let suggs = checker().suggestions("กินข้า", 5);
311 let hit = suggs.iter().find(|s| s.word == "กินข้าว");
312 assert!(
313 hit.is_some(),
314 "expected กินข้าว in suggestions for กินข้า; got: {suggs:?}"
315 );
316 assert_eq!(hit.unwrap().edit_distance, 1);
317 }
318
319 // Levenshtein unit tests ------------------------------------------------
320
321 fn lev(a: &str, b: &str) -> usize {
322 let a_chars: Vec<char> = a.chars().collect();
323 levenshtein(&a_chars, b, 255)
324 }
325
326 #[test]
327 fn lev_identical() {
328 assert_eq!(lev("กิน", "กิน"), 0);
329 }
330
331 #[test]
332 fn lev_single_substitution() {
333 assert_eq!(lev("กาน", "กาล"), 1);
334 }
335
336 #[test]
337 fn lev_single_deletion() {
338 assert_eq!(lev("กินข้าว", "กินข้า"), 1);
339 }
340
341 #[test]
342 fn lev_single_insertion() {
343 assert_eq!(lev("กินข้า", "กินข้าว"), 1);
344 }
345
346 #[test]
347 fn lev_transposition_is_two() {
348 // Standard Levenshtein: transposition = delete + insert = 2
349 assert_eq!(lev("สวสัดี", "สวัสดี"), 2);
350 }
351
352 #[test]
353 fn lev_empty_strings() {
354 assert_eq!(lev("", ""), 0);
355 assert_eq!(lev("กิน", ""), 3);
356 assert_eq!(lev("", "กิน"), 3);
357 }
358
359 #[test]
360 fn lev_early_exit_above_max() {
361 let a_chars: Vec<char> = "กิน".chars().collect();
362 // Distance is 5 but max_dist is 2 — should return 3 (max_dist + 1).
363 assert_eq!(levenshtein(&a_chars, "สวัสดีครับ", 2), 3);
364 }
365}