kham_core/spell.rs
1//! Thai spell correction using edit distance and phonetic ranking.
2//!
3//! [`SpellChecker`] finds words in the built-in dictionary that are close to a
4//! given input, ranked first by phonetic similarity (lk82 Soundex), then by
5//! edit distance, then by TNC corpus frequency.
6//!
7//! ```rust
8//! use kham_core::spell::SpellChecker;
9//!
10//! let checker = SpellChecker::builtin();
11//! let suggestions = checker.suggestions("กาน", 5);
12//! // All candidates are within edit distance 2
13//! assert!(suggestions.iter().all(|s| s.edit_distance <= 2));
14//! // Phonetically similar words appear first
15//! if let Some(first) = suggestions.first() {
16//! assert!(first.soundex_match || first.edit_distance <= 1);
17//! }
18//! ```
19
20use alloc::string::String;
21use alloc::vec::Vec;
22
23use crate::dict::BUILTIN_WORDS;
24use crate::freq::FreqMap;
25use crate::segmenter::Tokenizer;
26use crate::soundex::lk82;
27use crate::token::TokenKind;
28
29// ---------------------------------------------------------------------------
30// Public types
31// ---------------------------------------------------------------------------
32
33/// A spelling suggestion returned by [`SpellChecker::suggestions`].
34///
35/// Suggestions are sorted by:
36/// 1. `soundex_match` descending — phonetically similar words first
37/// 2. `edit_distance` ascending — closer edits first
38/// 3. `freq_score` descending — more common words first
39#[derive(Debug, Clone, PartialEq, Eq)]
40pub struct Suggestion {
41 /// Candidate word from the built-in dictionary.
42 pub word: String,
43 /// Levenshtein edit distance between the input and this candidate.
44 pub edit_distance: u8,
45 /// `true` if the lk82 phonetic codes of the input and candidate match.
46 pub soundex_match: bool,
47 /// TNC frequency score; `0` if the word is not in the frequency table.
48 pub freq_score: u32,
49}
50
51/// Thai spell checker using edit distance and lk82 phonetic ranking.
52///
53/// Candidates are drawn from the built-in 62k-word dictionary and filtered to
54/// edit distance ≤ 2. Phonetically similar candidates (matching lk82 code) are
55/// ranked above purely orthographic matches.
56///
57/// # Examples
58///
59/// ```rust
60/// use kham_core::spell::SpellChecker;
61///
62/// let checker = SpellChecker::builtin();
63///
64/// // Misspelled word — expect near-miss suggestions
65/// let suggestions = checker.suggestions("กานข้าว", 5);
66/// assert!(suggestions.iter().all(|s| s.edit_distance <= 2));
67/// ```
68///
69/// Correctly spelled word — edit_distance 0 if it is in the dictionary:
70///
71/// ```rust
72/// use kham_core::spell::SpellChecker;
73///
74/// let checker = SpellChecker::builtin();
75/// let suggestions = checker.suggestions("กิน", 10);
76/// assert!(suggestions.iter().any(|s| s.word == "กิน" && s.edit_distance == 0));
77/// ```
78pub struct SpellChecker {
79 words_text: &'static str,
80 freq: FreqMap,
81 tokenizer: Tokenizer,
82}
83
84impl SpellChecker {
85 /// Create a spell checker backed by the built-in dictionary and TNC
86 /// frequency table.
87 ///
88 /// Construction loads the TNC frequency map (~106k entries) — reuse the
89 /// returned instance rather than calling `builtin()` on every query.
90 ///
91 /// # Examples
92 ///
93 /// ```rust
94 /// use kham_core::spell::SpellChecker;
95 ///
96 /// let checker = SpellChecker::builtin();
97 /// assert!(!checker.suggestions("สวัดสี", 5).is_empty());
98 /// ```
99 pub fn builtin() -> Self {
100 Self {
101 words_text: BUILTIN_WORDS,
102 freq: FreqMap::builtin(),
103 tokenizer: Tokenizer::new(),
104 }
105 }
106
107 /// Return up to `max_n` spelling suggestions for `word`.
108 ///
109 /// Only candidates with edit distance ≤ 2 are returned. Results are sorted
110 /// by phonetic match (lk82), then edit distance, then TNC frequency.
111 ///
112 /// Returns an empty `Vec` when `word` is empty or `max_n` is zero.
113 ///
114 /// # Examples
115 ///
116 /// ```rust
117 /// use kham_core::spell::SpellChecker;
118 ///
119 /// let checker = SpellChecker::builtin();
120 ///
121 /// // Empty input → no suggestions
122 /// assert!(checker.suggestions("", 5).is_empty());
123 ///
124 /// // max_n = 0 → no suggestions
125 /// assert!(checker.suggestions("กาน", 0).is_empty());
126 ///
127 /// // Results respect the distance threshold
128 /// let suggs = checker.suggestions("กิน", 10);
129 /// assert!(suggs.iter().all(|s| s.edit_distance <= 2));
130 /// ```
131 pub fn suggestions(&self, word: &str, max_n: usize) -> Vec<Suggestion> {
132 if word.is_empty() || max_n == 0 {
133 return Vec::new();
134 }
135
136 let word_chars: Vec<char> = word.chars().collect();
137 let word_len = word_chars.len();
138 let input_lk82 = lk82(word);
139
140 let mut results: Vec<Suggestion> = Vec::new();
141
142 for line in self.words_text.lines() {
143 let candidate = line.trim();
144 if candidate.is_empty() || candidate.starts_with('#') {
145 continue;
146 }
147
148 // Length pre-filter: Levenshtein distance ≤ 2 implies |len_a - len_b| ≤ 2.
149 let cand_len = candidate.chars().count();
150 if cand_len + 2 < word_len || word_len + 2 < cand_len {
151 continue;
152 }
153
154 let dist = levenshtein(&word_chars, candidate, 2);
155 if dist > 2 {
156 continue;
157 }
158
159 let soundex_match = lk82(candidate) == input_lk82;
160 let freq_score = self.freq.get(candidate);
161
162 results.push(Suggestion {
163 word: String::from(candidate),
164 edit_distance: dist as u8,
165 soundex_match,
166 freq_score,
167 });
168 }
169
170 // Sort: soundex_match DESC, edit_distance ASC, freq_score DESC
171 results.sort_unstable_by(|a, b| {
172 b.soundex_match
173 .cmp(&a.soundex_match)
174 .then(a.edit_distance.cmp(&b.edit_distance))
175 .then(b.freq_score.cmp(&a.freq_score))
176 });
177
178 results.truncate(max_n);
179 results
180 }
181
182 /// Return the single best spelling correction for `word`, or `None` if the
183 /// word appears correctly in the dictionary (edit distance 0).
184 pub fn did_you_mean(&self, word: &str) -> Option<String> {
185 match self.suggestions(word, 1).into_iter().next() {
186 Some(s) if s.edit_distance == 0 => None,
187 Some(s) => Some(s.word),
188 None => None,
189 }
190 }
191
192 /// Correct an entire Thai text by replacing Unknown tokens with their best
193 /// spelling suggestion.
194 ///
195 /// Segments `text` with the built-in tokenizer, then for every
196 /// `TokenKind::Unknown` token that is at least 2 characters long, looks up
197 /// the best suggestion and substitutes it. All other tokens (including known
198 /// Thai words, numbers, Latin, punctuation) are passed through unchanged.
199 ///
200 /// Returns the input unchanged when no Unknown tokens are found or no
201 /// correction candidates exist.
202 ///
203 /// # Example
204 /// ```rust
205 /// use kham_core::spell::SpellChecker;
206 /// let checker = SpellChecker::builtin();
207 /// // A correctly spelled sentence should come back unchanged word-for-word.
208 /// let out = checker.correct_text("กินข้าวกับปลา");
209 /// assert!(!out.is_empty());
210 /// ```
211 pub fn correct_text(&self, text: &str) -> String {
212 if text.is_empty() {
213 return String::new();
214 }
215 let tokens = self.tokenizer.segment(text);
216 let mut result = String::with_capacity(text.len());
217 for token in &tokens {
218 if token.kind == TokenKind::Unknown && token.text.chars().count() >= 2 {
219 match self.did_you_mean(token.text) {
220 Some(correction) => result.push_str(&correction),
221 None => result.push_str(token.text),
222 }
223 } else {
224 result.push_str(token.text);
225 }
226 }
227 result
228 }
229}
230
231// ---------------------------------------------------------------------------
232// Levenshtein distance (character-level, with early exit)
233// ---------------------------------------------------------------------------
234
235/// Compute Levenshtein distance between `a_chars` (pre-collected) and `b` (str).
236///
237/// Returns `max_dist + 1` as soon as the running minimum exceeds `max_dist`,
238/// avoiding unnecessary work for clearly dissimilar pairs.
239fn levenshtein(a_chars: &[char], b: &str, max_dist: usize) -> usize {
240 let b_chars: Vec<char> = b.chars().collect();
241 let m = a_chars.len();
242 let n = b_chars.len();
243
244 if m == 0 {
245 return n;
246 }
247 if n == 0 {
248 return m;
249 }
250
251 // Length difference is a hard lower bound on edit distance.
252 if m.abs_diff(n) > max_dist {
253 return max_dist + 1;
254 }
255
256 // Two-row DP. `prev[j]` = edit distance for a[0..i-1] vs b[0..j].
257 let mut prev: Vec<usize> = (0..=n).collect();
258 let mut curr: Vec<usize> = alloc::vec![0usize; n + 1];
259
260 for i in 1..=m {
261 curr[0] = i;
262 let mut row_min = i;
263
264 for j in 1..=n {
265 let substitution_cost = usize::from(a_chars[i - 1] != b_chars[j - 1]);
266 curr[j] = (prev[j] + 1)
267 .min(curr[j - 1] + 1)
268 .min(prev[j - 1] + substitution_cost);
269 if curr[j] < row_min {
270 row_min = curr[j];
271 }
272 }
273
274 // Every cell in this row exceeds max_dist — no path can recover.
275 if row_min > max_dist {
276 return max_dist + 1;
277 }
278
279 core::mem::swap(&mut prev, &mut curr);
280 }
281
282 prev[n]
283}
284
285// ---------------------------------------------------------------------------
286// Tests
287// ---------------------------------------------------------------------------
288
289#[cfg(test)]
290mod tests {
291 use super::*;
292
293 fn checker() -> SpellChecker {
294 SpellChecker::builtin()
295 }
296
297 #[test]
298 fn empty_input_returns_empty() {
299 assert!(checker().suggestions("", 5).is_empty());
300 }
301
302 #[test]
303 fn zero_max_n_returns_empty() {
304 assert!(checker().suggestions("กาน", 0).is_empty());
305 }
306
307 #[test]
308 fn all_results_within_threshold() {
309 let suggs = checker().suggestions("กาน", 20);
310 assert!(
311 suggs.iter().all(|s| s.edit_distance <= 2),
312 "got out-of-range result: {suggs:?}"
313 );
314 }
315
316 #[test]
317 fn respects_max_n() {
318 let suggs = checker().suggestions("กาน", 3);
319 assert!(suggs.len() <= 3);
320 }
321
322 #[test]
323 fn word_in_dict_appears_as_dist_zero() {
324 // "กิน" is definitely in the built-in dictionary.
325 let suggs = checker().suggestions("กิน", 10);
326 let exact = suggs.iter().find(|s| s.word == "กิน");
327 assert!(exact.is_some(), "expected กิน in suggestions");
328 assert_eq!(exact.unwrap().edit_distance, 0);
329 }
330
331 #[test]
332 fn sorted_soundex_first_then_distance() {
333 let suggs = checker().suggestions("กาน", 20);
334 // Verify sort order: soundex DESC, edit_distance ASC, freq DESC
335 for window in suggs.windows(2) {
336 let (a, b) = (&window[0], &window[1]);
337 let ok = (a.soundex_match & !b.soundex_match)
338 || (a.soundex_match == b.soundex_match && a.edit_distance < b.edit_distance)
339 || (a.soundex_match == b.soundex_match
340 && a.edit_distance == b.edit_distance
341 && a.freq_score >= b.freq_score);
342 assert!(ok, "sort order violated: {a:?} before {b:?}");
343 }
344 }
345
346 #[test]
347 fn transposition_within_distance_two() {
348 // "สวสัดี" is a character transposition of "สวัสดี" (ั and ส swapped)
349 // Standard Levenshtein counts this as 2 edits — within the threshold.
350 let suggs = checker().suggestions("สวสัดี", 5);
351 let hit = suggs.iter().find(|s| s.word == "สวัสดี");
352 assert!(
353 hit.is_some(),
354 "expected สวัสดี in suggestions for สวสัดี; got: {suggs:?}"
355 );
356 assert!(hit.unwrap().edit_distance <= 2);
357 }
358
359 #[test]
360 fn single_char_deletion_suggestion() {
361 // "กินข้า" is "กินข้าว" with the final ว deleted — edit distance 1.
362 let suggs = checker().suggestions("กินข้า", 5);
363 let hit = suggs.iter().find(|s| s.word == "กินข้าว");
364 assert!(
365 hit.is_some(),
366 "expected กินข้าว in suggestions for กินข้า; got: {suggs:?}"
367 );
368 assert_eq!(hit.unwrap().edit_distance, 1);
369 }
370
371 // Levenshtein unit tests ------------------------------------------------
372
373 fn lev(a: &str, b: &str) -> usize {
374 let a_chars: Vec<char> = a.chars().collect();
375 levenshtein(&a_chars, b, 255)
376 }
377
378 #[test]
379 fn lev_identical() {
380 assert_eq!(lev("กิน", "กิน"), 0);
381 }
382
383 #[test]
384 fn lev_single_substitution() {
385 assert_eq!(lev("กาน", "กาล"), 1);
386 }
387
388 #[test]
389 fn lev_single_deletion() {
390 assert_eq!(lev("กินข้าว", "กินข้า"), 1);
391 }
392
393 #[test]
394 fn lev_single_insertion() {
395 assert_eq!(lev("กินข้า", "กินข้าว"), 1);
396 }
397
398 #[test]
399 fn lev_transposition_is_two() {
400 // Standard Levenshtein: transposition = delete + insert = 2
401 assert_eq!(lev("สวสัดี", "สวัสดี"), 2);
402 }
403
404 #[test]
405 fn lev_empty_strings() {
406 assert_eq!(lev("", ""), 0);
407 assert_eq!(lev("กิน", ""), 3);
408 assert_eq!(lev("", "กิน"), 3);
409 }
410
411 #[test]
412 fn lev_early_exit_above_max() {
413 let a_chars: Vec<char> = "กิน".chars().collect();
414 // Distance is 5 but max_dist is 2 — should return 3 (max_dist + 1).
415 assert_eq!(levenshtein(&a_chars, "สวัสดีครับ", 2), 3);
416 }
417
418 // did_you_mean / correct_text tests ----------------------------------------
419
420 #[test]
421 fn did_you_mean_correct_word_returns_none() {
422 // "กิน" is in the built-in dictionary — edit distance 0 → None
423 assert_eq!(checker().did_you_mean("กิน"), None);
424 }
425
426 #[test]
427 fn did_you_mean_misspelled_word_returns_suggestion() {
428 // "กินข้า" is "กินข้าว" with the final ว deleted — not in the dictionary
429 let result = checker().did_you_mean("กินข้า");
430 assert!(result.is_some(), "expected a suggestion for กินข้า");
431 assert_ne!(
432 result.as_deref(),
433 Some("กินข้า"),
434 "suggestion should differ from input"
435 );
436 }
437
438 #[test]
439 fn correct_text_passthrough_clean_input() {
440 let out = checker().correct_text("กินข้าวกับปลา");
441 assert!(!out.is_empty(), "output should not be empty");
442 // No Unknown tokens expected for well-known words
443 let tokenizer = crate::segmenter::Tokenizer::new();
444 let tokens = tokenizer.segment(&out);
445 assert!(
446 tokens
447 .iter()
448 .all(|t| t.kind != crate::token::TokenKind::Unknown),
449 "expected no Unknown tokens in corrected output; got: {tokens:?}"
450 );
451 }
452}