kham_core/
soundex.rs

1//! Thai phonetic encoding (Soundex) — lk82, udom83, MetaSound, and Thai–English cross-language.
2//!
3//! Groups Thai words by sound so that spelling variants and near-homophones
4//! share the same code, enabling fuzzy search and name matching.
5//!
6//! ```
7//! use kham_core::soundex::{soundex, SoundexAlgorithm};
8//!
9//! // กาน / ขาน / คาน all share the same lk82 code
10//! assert_eq!(soundex("กาน", SoundexAlgorithm::Lk82),
11//!            soundex("ขาน", SoundexAlgorithm::Lk82));
12//! assert_eq!(soundex("กาน", SoundexAlgorithm::Lk82), "1600");
13//! ```
14
15use alloc::string::String;
16use alloc::vec::Vec;
17
/// Selects the Thai phonetic encoding algorithm.
///
/// The enum is a plain `Copy` tag. Besides the equality traits it derives
/// `Hash`, so a value can be used as a key in hashed maps and sets (for
/// example to cache codes per algorithm).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SoundexAlgorithm {
    /// Lorchirachoonkul 1982 — most widely used; 4-char alphanumeric code.
    Lk82,
    /// Udompanich 1983 — finer distinctions for sibilants and liquids.
    Udom83,
    /// MetaSound (Snae & Brückner 2009) — per-syllable `[initial][vowel][final]` triple;
    /// variable-length output (3 chars per syllable).
    MetaSound,
}
29
30/// Encode a Thai word using the selected algorithm.
31///
32/// - `Lk82` / `Udom83` — always returns a 4-character ASCII code; `"0000"` if
33///   the word contains no Thai consonants.
34/// - `MetaSound` — returns 3 characters per syllable (variable length); `"000"`
35///   if the word contains no Thai consonants.
36///
37/// ```
38/// use kham_core::soundex::{soundex, SoundexAlgorithm};
39///
40/// assert_eq!(soundex("กาน", SoundexAlgorithm::Lk82),
41///            soundex("คาน", SoundexAlgorithm::Lk82));
42/// assert_ne!(soundex("กาน", SoundexAlgorithm::Lk82),
43///            soundex("บาน", SoundexAlgorithm::Lk82));
44/// assert_eq!(soundex("กาน", SoundexAlgorithm::MetaSound), "112");
45/// ```
46pub fn soundex(word: &str, algo: SoundexAlgorithm) -> String {
47    match algo {
48        SoundexAlgorithm::Lk82 => lk82(word),
49        SoundexAlgorithm::Udom83 => udom83(word),
50        SoundexAlgorithm::MetaSound => metasound(word),
51    }
52}
53
54/// Returns `true` if two words share the same phonetic code under the given algorithm.
55///
56/// Returns `false` if either word is empty or contains no recognisable Thai consonants.
57/// Works for all three algorithms — lk82, udom83, and MetaSound.
58///
59/// ```
60/// use kham_core::soundex::{sounds_like, SoundexAlgorithm};
61///
62/// assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::Lk82));
63/// assert!(!sounds_like("กิน", "มิน", SoundexAlgorithm::Lk82));
64/// assert!(!sounds_like("ลาน", "ราน", SoundexAlgorithm::Udom83)); // ล / ร split in udom83
65/// assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::MetaSound));
66/// ```
67pub fn sounds_like(a: &str, b: &str, algo: SoundexAlgorithm) -> bool {
68    if a.is_empty() || b.is_empty() {
69        return false;
70    }
71    let code_a = soundex(a, algo);
72    // All-zero codes mean no recognisable Thai consonants (lk82/udom83 → "0000", MetaSound → "000")
73    !code_a.chars().all(|c| c == '0') && code_a == soundex(b, algo)
74}
75
76// ── LK82 (Lorchirachoonkul 1982) ─────────────────────────────────────────────
77
78/// Encode a Thai word using the LK82 algorithm (Lorchirachoonkul 1982).
79///
80/// Maps consonants to 12 groups (`'0'`–`'9'`, `'A'`, `'B'`), removes adjacent
81/// duplicates, and pads to exactly 4 characters with `'0'`.
82///
83/// ```
84/// use kham_core::soundex::lk82;
85///
86/// assert_eq!(lk82("กาน"),      "1600");
87/// assert_eq!(lk82("ขาน"),      "1600"); // ก / ข in the same group
88/// assert_eq!(lk82("บ้าน"),     "4600");
89/// assert_eq!(lk82("กรุงเทพ"), "1873");
90/// ```
91pub fn lk82(word: &str) -> String {
92    encode(word, lk82_code)
93}
94
/// Maps one Thai consonant to its LK82 group code (an ASCII byte).
///
/// Groups are `'0'`–`'9'`, then `'A'` and `'B'`. The obsolete velars ฃ and ฅ
/// share group `'1'` with ข / ค, matching how the cross-language table in this
/// file treats them; previously they fell through to `'0'`. Any character
/// without an explicit group yields `b'0'`, the same code as อ.
fn lk82_code(c: char) -> u8 {
    match c {
        'อ' => b'0',
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => b'1',
        'จ' | 'ช' | 'ซ' | 'ศ' | 'ษ' | 'ส' | 'ฉ' | 'ฌ' | 'ญ' => b'2',
        'ต' | 'ถ' | 'ท' | 'ธ' | 'ฏ' | 'ฐ' | 'ฑ' | 'ฒ' | 'ด' | 'ฎ' => b'3',
        'บ' | 'ป' | 'พ' | 'ผ' | 'ภ' | 'ฝ' | 'ฟ' => b'4',
        'ม' => b'5',
        'น' | 'ณ' => b'6',
        'ง' => b'7',
        'ล' | 'ร' | 'ฬ' => b'8',
        'ว' => b'9',
        'ย' => b'A',
        'ห' | 'ฮ' => b'B',
        // Everything unlisted shares the null code with อ.
        _ => b'0',
    }
}
112
113// ── Udom83 (Udompanich 1983) ─────────────────────────────────────────────────
114
115/// Encode a Thai word using the Udom83 algorithm (Udompanich 1983).
116///
117/// Uses finer groupings than lk82: sibilants (ซ ศ ษ ส) are separate from
118/// affricates (จ ช ฉ ฌ), and the liquids ร and ล are in different groups.
119///
120/// ```
121/// use kham_core::soundex::udom83;
122///
123/// // ส (sibilant) and ช (affricate) are different groups in udom83
124/// assert_ne!(udom83("สาน"), udom83("ชาน"));
125/// // but ส and ซ share the same sibilant group
126/// assert_eq!(udom83("สาน"), udom83("ซาน"));
127/// // ล and ร are split
128/// assert_ne!(udom83("ลาน"), udom83("ราน"));
129/// ```
130pub fn udom83(word: &str) -> String {
131    encode(word, udom83_code)
132}
133
/// Maps one Thai consonant to its Udom83 group code (an ASCII byte).
///
/// Groups are `'0'`–`'9'`, then `'A'`–`'D'`. The obsolete velars ฃ and ฅ share
/// group `'1'` with ข / ค, matching how the cross-language table in this file
/// treats them; previously they fell through to `'0'`. Any character without
/// an explicit group yields `b'0'`, the same code as อ.
fn udom83_code(c: char) -> u8 {
    match c {
        'อ' => b'0',
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => b'1',
        'จ' | 'ช' | 'ฉ' | 'ฌ' => b'2',
        'ซ' | 'ศ' | 'ษ' | 'ส' => b'3',
        'ต' | 'ถ' | 'ท' | 'ธ' | 'ฏ' | 'ฐ' | 'ฑ' | 'ฒ' | 'ด' | 'ฎ' => b'4',
        'บ' | 'ป' | 'พ' | 'ผ' | 'ภ' | 'ฝ' | 'ฟ' => b'5',
        'ม' => b'6',
        'น' | 'ณ' | 'ญ' => b'7',
        'ง' => b'8',
        'ล' | 'ฬ' => b'9',
        'ร' => b'A',
        'ว' => b'B',
        'ย' => b'C',
        'ห' | 'ฮ' => b'D',
        // Everything unlisted shares the null code with อ.
        _ => b'0',
    }
}
153
154// ── MetaSound (Snae & Brückner 2009) ─────────────────────────────────────────
155
156/// Encode a Thai word using the MetaSound algorithm (Snae & Brückner 2009).
157///
158/// Returns a variable-length ASCII code: **3 characters per syllable**
159/// (`[initial][vowel][final]`). More discriminating than lk82/udom83 — it
160/// encodes vowel length and final consonant class in addition to the onset.
161///
162/// Returns `"000"` if `word` contains no Thai consonants.
163///
164/// **Note:** Consonant clusters (e.g. กร, กล) are parsed as separate units;
165/// this is an approximation of the full syllable-parser approach.
166///
167/// ```
168/// use kham_core::soundex::metasound;
169///
170/// assert_eq!(metasound("กาน"), "112"); // initial=ก(1) vowel=า(1) final=น(2)
171/// assert_eq!(metasound("ขาน"), "112"); // ข same initial group as ก
172/// assert_eq!(metasound("กาม"), "113"); // final=ม(3) differs from น(2)
173/// assert_ne!(metasound("กาน"), metasound("กาม"));
174/// ```
175pub fn metasound(word: &str) -> String {
176    let chars: Vec<char> = word.chars().collect();
177    let len = chars.len();
178    let mut result = String::new();
179    let mut i = 0;
180
181    while i < len {
182        // 1. Optional lead vowel (เ แ โ ไ ใ appear before the consonant in Unicode)
183        let lead = if is_ms_lead(chars[i]) {
184            let v = chars[i];
185            i += 1;
186            Some(v)
187        } else {
188            None
189        };
190
191        // 2. Initial consonant (required to emit a syllable code)
192        if i >= len || !is_thai_consonant(chars[i]) {
193            if lead.is_none() {
194                i += 1; // skip non-Thai char
195            }
196            continue;
197        }
198        let initial = chars[i];
199        i += 1;
200
201        // Thanthakat immediately after initial → silent consonant; skip syllable
202        if i < len && chars[i] == '\u{0E4C}' {
203            i += 1;
204            continue;
205        }
206
207        // 3. Upper vowel signs (ิ ี ึ ื ั ุ ู) and tone marks above/below the initial
208        let mut upper: Option<char> = None;
209        let mut nikhahit = false;
210        while i < len {
211            match chars[i] {
212                c if is_ms_upper(c) => {
213                    upper = Some(c);
214                    i += 1;
215                }
216                c if is_ms_tone(c) => {
217                    i += 1;
218                }
219                '\u{0E4D}' => {
220                    // Nikhahit อํ — upper component of sara am (–ำ)
221                    nikhahit = true;
222                    i += 1;
223                }
224                _ => break,
225            }
226        }
227
228        // 4. Follow vowel (า ะ ำ appear after the consonant spine)
229        let follow = if i < len && is_ms_follow(chars[i]) {
230            let v = chars[i];
231            i += 1;
232            Some(v)
233        } else {
234            None
235        };
236
237        // 5. Final consonant — present only when the next consonant is NOT followed
238        //    by a vowel mark (which would make it the initial of the next syllable).
239        let final_c = if i < len && is_thai_consonant(chars[i]) {
240            let next = i + 1;
241            if next < len && chars[next] == '\u{0E4C}' {
242                // Silent consonant (e.g. กรณ์): consume both and produce no final
243                i += 2;
244                None
245            } else if next < len
246                && (is_ms_upper(chars[next])
247                    || is_ms_follow(chars[next])
248                    || is_ms_lead(chars[next]))
249            {
250                // Consonant has its own vowel → next syllable's initial; don't consume
251                None
252            } else {
253                let fc = chars[i];
254                i += 1;
255                Some(fc)
256            }
257        } else {
258            None
259        };
260
261        // Emit [initial][vowel][final]
262        result.push(ms_initial_code(initial) as char);
263        result.push(ms_vowel_code(lead, upper, follow, nikhahit) as char);
264        result.push(ms_final_code(final_c) as char);
265    }
266
267    if result.is_empty() {
268        "000".into()
269    } else {
270        result
271    }
272}
273
/// Maps a syllable-initial consonant to its MetaSound onset code (`'1'`–`'9'`,
/// `'A'`–`'J'`).
///
/// The obsolete velars ฃ and ฅ share group `'1'` with ข / ค, matching how the
/// cross-language table in this file treats them; previously they fell into
/// the catch-all group. อ and any character without an explicit group map to
/// `'J'`.
fn ms_initial_code(c: char) -> u8 {
    match c {
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => b'1',
        'ง' => b'2',
        'จ' | 'ช' | 'ฉ' | 'ฌ' => b'3',
        'ซ' | 'ศ' | 'ษ' | 'ส' => b'4',
        'ญ' | 'ย' => b'5',
        'ฎ' | 'ด' => b'6',
        'ฏ' | 'ต' => b'7',
        'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => b'8',
        'น' | 'ณ' => b'9',
        'บ' => b'A',
        'ป' => b'B',
        'ผ' | 'พ' | 'ภ' => b'C',
        'ฝ' | 'ฟ' => b'D',
        'ม' => b'E',
        'ร' => b'F',
        'ล' | 'ฬ' => b'G',
        'ว' => b'H',
        'ห' | 'ฮ' => b'I',
        _ => b'J', // อ and unknowns → glottal / null onset
    }
}
297
/// Derives the MetaSound vowel code of one syllable from its vowel components.
///
/// Precedence, highest first:
/// 1. `nikhahit` set → `'D'` (sara am).
/// 2. A lead vowel: ไ / ใ → `'E'`; เ → `'F'` when followed by า, otherwise
///    `'8'`; แ → `'8'`; โ → `'9'`.
/// 3. An upper/lower vowel sign in `upper` (ั ิ ี ึ ื ุ ู → `'0'`, `'2'`–`'7'`).
/// 4. The follow vowel: า → `'1'`, ำ → `'D'`.
/// 5. Anything else (including ะ or no vowel at all) → `'0'`, short /a/.
fn ms_vowel_code(
    lead: Option<char>,
    upper: Option<char>,
    follow: Option<char>,
    nikhahit: bool,
) -> u8 {
    // (sign, code) pairs for the vowel signs written above or below the onset.
    const SIGN_CODES: [(char, u8); 7] = [
        ('\u{0E31}', b'0'), // ั (mai han akat) short /a/
        ('\u{0E34}', b'2'), // ิ short /i/
        ('\u{0E35}', b'3'), // ี long /iː/
        ('\u{0E36}', b'4'), // ึ short /ɯ/
        ('\u{0E37}', b'5'), // ื long /ɯː/
        ('\u{0E38}', b'6'), // ุ short /u/
        ('\u{0E39}', b'7'), // ู long /uː/
    ];
    const SARA_AA: char = '\u{0E32}';
    const SARA_AM: char = '\u{0E33}';

    if nikhahit {
        return b'D';
    }

    // A recognised lead vowel decides the class on its own.
    match lead {
        Some('ไ') | Some('ใ') => return b'E', // /ai/
        Some('แ') => return b'8',             // /ɛ/ class
        Some('โ') => return b'9',             // /o/ class
        Some('เ') => {
            // เ–า is /ao/; every other เ– shape is coded as /e/.
            return if follow == Some(SARA_AA) { b'F' } else { b'8' };
        }
        _ => {}
    }

    // Otherwise a vowel sign wins over the follow vowel.
    let from_sign = SIGN_CODES
        .iter()
        .find(|&&(sign, _)| upper == Some(sign))
        .map(|&(_, code)| code);
    if let Some(code) = from_sign {
        return code;
    }

    match follow {
        Some(SARA_AA) => b'1', // long /aː/
        Some(SARA_AM) => b'D', // /am/
        _ => b'0',             // ะ or unmarked → short /a/
    }
}
336
/// Maps a syllable-final consonant (or its absence) to the MetaSound final code.
///
/// The velar-stop class covers every consonant of the onset velar group
/// (ก ข ฃ ค ฅ ฆ), so that homophones such as นาค / นาก agree; previously only ก
/// was listed and the others were coded as an open syllable.
///
/// `None` and every consonant without an explicit class yield `'6'`.
// NOTE(review): labial and dental final stops (e.g. บ ป พ, ด ต ท) still fall
// into '6' together with open syllables — confirm whether that is intended.
fn ms_final_code(c: Option<char>) -> u8 {
    match c {
        Some('ก') | Some('ข') | Some('ฃ') | Some('ค') | Some('ฅ') | Some('ฆ') => b'1', // velar stop
        Some('น') | Some('ณ') | Some('ญ') | Some('ร') | Some('ล') | Some('ฬ') => b'2', // alveolar sonorant
        Some('ม') => b'3',             // bilabial nasal
        Some('ง') => b'4',             // velar nasal
        Some('ย') | Some('ว') => b'5', // glide
        _ => b'6',                     // open syllable / unclassified final
    }
}
347
348// Character class helpers used only by MetaSound (lk82/udom83 use is_thai_consonant only)
349
/// True for the five Thai vowels written before their consonant:
/// เ แ โ ใ ไ (U+0E40–U+0E44).
fn is_ms_lead(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}
353
/// True for vowel signs drawn above or below the consonant: mai han akat ั
/// (U+0E31) and the run U+0E34–U+0E3A (ิ ี ึ ื ุ ู plus phinthu).
fn is_ms_upper(c: char) -> bool {
    matches!(c, '\u{0E31}' | '\u{0E34}'..='\u{0E3A}')
}
358
/// True for the vowels written after the consonant: ะ (U+0E30), า (U+0E32)
/// and ำ (U+0E33).
fn is_ms_follow(c: char) -> bool {
    c == '\u{0E30}' || c == '\u{0E32}' || c == '\u{0E33}'
}
362
/// True for the four Thai tone marks ่ ้ ๊ ๋ (U+0E48–U+0E4B).
fn is_ms_tone(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
366
367// ── shared helpers ────────────────────────────────────────────────────────────
368
/// Strip silent units marked with ์ (thanthakat, U+0E4C).
///
/// Two shapes are removed, together with the thanthakat itself:
/// - any character immediately followed by thanthakat (e.g. `ณ์`);
/// - a Thai consonant followed by one or more above/below vowel signs
///   (U+0E31, U+0E34–U+0E3A) and then thanthakat (e.g. `ดิ์`, `ธุ์`).
///   Previously only the vowel sign was dropped in this shape, so the silent
///   consonant leaked into the code.
///
/// All other characters are copied through unchanged.
fn strip_silent(s: &str) -> String {
    const THANTHAKAT: char = '\u{0E4C}';

    let chars: Vec<char> = s.chars().collect();
    let mut out = String::with_capacity(s.len());
    let mut i = 0;
    while i < chars.len() {
        // `mark` is where a thanthakat silencing chars[i] would sit: directly
        // after it, or after the vowel signs stacked on a consonant.
        let mut mark = i + 1;
        if matches!(chars[i], '\u{0E01}'..='\u{0E2E}') {
            while mark < chars.len()
                && matches!(chars[mark], '\u{0E31}' | '\u{0E34}'..='\u{0E3A}')
            {
                mark += 1;
            }
        }
        if mark < chars.len() && chars[mark] == THANTHAKAT {
            // Drop the base character, its vowel signs and the thanthakat.
            i = mark + 1;
            continue;
        }
        out.push(chars[i]);
        i += 1;
    }
    out
}
384
/// True when `c` lies in the Thai consonant range ก–ฮ (U+0E01–U+0E2E).
fn is_thai_consonant(c: char) -> bool {
    matches!(c, '\u{0E01}'..='\u{0E2E}')
}
389
390/// Core encoder: strip silent consonants → map codes → dedup adjacent → pad to 4.
391fn encode(word: &str, code_fn: fn(char) -> u8) -> String {
392    const LEN: usize = 4;
393
394    let stripped = strip_silent(word);
395    let mut codes: Vec<u8> = Vec::with_capacity(LEN);
396    let mut last: Option<u8> = None;
397
398    for ch in stripped.chars() {
399        if !is_thai_consonant(ch) {
400            continue;
401        }
402        let code = code_fn(ch);
403        if Some(code) != last {
404            codes.push(code);
405            last = Some(code);
406        }
407        if codes.len() == LEN {
408            break;
409        }
410    }
411
412    while codes.len() < LEN {
413        codes.push(b'0');
414    }
415
416    String::from_utf8(codes).expect("soundex codes are ASCII")
417}
418
419// ── Thai–English cross-language Soundex (Suwanvisat & Prasitjutrakul 1998) ──
420//
421// Source: "Thai-English Cross-Language Transliterated Word Retrieval using
422// Soundex Technique", NECTEC Annual Conference 1998.
423//
424// The paper extends Odell & Russell's Soundex to a combined Thai+English table
425// so that both a Thai transliteration and its English source word encode to the
426// same (or prefix-matched) code — no romanization step needed.
427//
428// Key differences from standard Soundex:
429//   • First character encodes as a digit too (not kept as a letter)
430//   • Vowels in non-first position → '7' (not dropped)
431//   • H → '8', W → '1', Y → '9' in non-first position
432//   • Thai consonants map directly to the same 7 groups as English
433//   • ง (ng) → "52" (two digits: N-group then G/K-group)
434//   • Code is variable-length (unlimited); callers choose a minimum k for matching
435
436/// Encode a Thai or English word into a shared cross-language phonetic code.
437///
438/// Implements the Suwanvisat & Prasitjutrakul (1998) modified Soundex that
439/// extends the encoding table to cover both Thai consonants and English letters
440/// directly — **no romanization step required**. A Thai transliteration and its
441/// English source word produce codes that share a common prefix, enabling
442/// cross-language retrieval of transliterated proper nouns and loan words.
443///
444/// **Encoding rules:**
445/// - Every character (Thai consonant or English letter) is mapped to a digit; the
446///   first character is also encoded numerically (unlike standard Soundex).
447/// - English vowels A/E/I/O/U → `'7'` in non-first position (retained, not dropped).
448/// - H → `'8'`, W → `'1'`, Y → `'9'` in non-first position.
449/// - Thai vowel marks (sara, tone marks, leading vowels) are skipped entirely.
450/// - ง maps to `"52"` (N-group + G/K-group, representing the NG onset).
451/// - Adjacent identical digits collapse to one (standard deduplication).
452/// - Output is **variable length** — longer words produce longer codes.
453///
454/// Returns `""` if `word` contains no encodable characters.
455///
456/// ```
457/// use kham_core::soundex::thai_english_soundex;
458///
459/// // Same initial-group consonants produce a common prefix
460/// assert_eq!(&thai_english_soundex("McDonald")[..3],
461///            &thai_english_soundex("แมคโดนัลด์")[..3]);
462/// // English words encode to fully numeric codes
463/// assert_eq!(thai_english_soundex("Robert"), "671763");
464/// ```
465pub fn thai_english_soundex(word: &str) -> String {
466    let chars: Vec<char> = word.chars().collect();
467    let len = chars.len();
468    let mut result = String::new();
469    let mut last_digit: Option<char> = None;
470    let mut is_first = true;
471    let mut i = 0;
472
473    while i < len {
474        let c = chars[i];
475
476        // Thai vowel marks, tone marks, leading vowels — skip entirely
477        if is_cl_skip(c) {
478            i += 1;
479            continue;
480        }
481
482        // Silent Thai consonant: consonant immediately followed by ์ (thanthakat)
483        if is_thai_consonant(c) && i + 1 < len && chars[i + 1] == '\u{0E4C}' {
484            i += 2;
485            continue;
486        }
487
488        // Only encode ASCII alpha and Thai consonants
489        if !c.is_ascii_alphabetic() && !is_thai_consonant(c) {
490            i += 1;
491            continue;
492        }
493
494        let code = cl_code(c, is_first);
495        if !code.is_empty() {
496            is_first = false;
497        }
498        for digit in code.chars() {
499            if Some(digit) != last_digit {
500                result.push(digit);
501                last_digit = Some(digit);
502            }
503        }
504
505        i += 1;
506    }
507
508    result
509}
510
/// Encode an English (or romanized) word using standard Soundex (Odell & Russell).
///
/// Characters that are not ASCII letters are ignored and letters are
/// upper-cased. The first letter is retained; each following consonant is
/// replaced by a digit `1`–`6`, and a consonant whose digit equals the
/// previous one is dropped. A vowel or `Y` between two same-coded consonants
/// separates them, so the second is coded again; `H` and `W` are transparent,
/// so consonants with the same digit on either side of them are coded once
/// (`"Ashcraft"` → `"A261"`). The code is cut or padded with `'0'` to exactly
/// 4 characters. Returns `""` if `word` contains no ASCII alphabetic
/// characters.
///
/// | Digit | Letters                    |
/// |-------|----------------------------|
/// | `1`   | B F P V                    |
/// | `2`   | C G J K Q S X Z            |
/// | `3`   | D T                        |
/// | `4`   | L                          |
/// | `5`   | M N                        |
/// | `6`   | R                          |
/// | skip  | A E I O U Y (separators)   |
/// | skip  | H W (transparent)          |
///
/// ```
/// use kham_core::soundex::english_soundex;
///
/// assert_eq!(english_soundex("Robert"), "R163");
/// assert_eq!(english_soundex("Rupert"), "R163");
/// assert_eq!(english_soundex("McDonald"), "M235");
/// assert_eq!(english_soundex("Smith"),    "S530");
/// assert_eq!(english_soundex("Ashcraft"), "A261"); // H does not separate S and C
/// ```
pub fn english_soundex(word: &str) -> String {
    let mut letters = word
        .chars()
        .filter(char::is_ascii_alphabetic)
        .map(|c| c.to_ascii_uppercase());

    let first = match letters.next() {
        Some(c) => c,
        None => return String::new(),
    };

    let mut code = String::with_capacity(4);
    code.push(first);
    // Seed with the first letter's digit so an identical neighbour is dropped.
    let mut last = std_soundex_digit(first);

    for c in letters {
        if matches!(c, 'H' | 'W') {
            // Transparent: neither coded nor allowed to reset `last`.
            continue;
        }
        let d = std_soundex_digit(c);
        if d == '0' {
            last = '0'; // vowel / Y — acts as separator
        } else if d != last {
            code.push(d);
            last = d;
            if code.len() == 4 {
                break;
            }
        }
    }

    while code.len() < 4 {
        code.push('0');
    }
    code
}

/// Standard Soundex digit (Odell & Russell) for an upper-case ASCII letter.
/// Returns `'0'` for non-coded letters and for any other character.
fn std_soundex_digit(c: char) -> char {
    match c {
        'B' | 'F' | 'P' | 'V' => '1',
        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
        'D' | 'T' => '3',
        'L' => '4',
        'M' | 'N' => '5',
        'R' => '6',
        _ => '0',
    }
}
582
583/// Returns `true` if two words share the same cross-language phonetic code.
584///
585/// Accepts Thai, English, or mixed input — no romanizer required. Returns
586/// `false` if either word produces an empty code. For cross-language
587/// Thai↔English pairs (e.g. transliterated loan words), the codes share a
588/// common prefix even if not exactly equal; prefer comparing
589/// [`thai_english_soundex`] codes directly with a minimum-length threshold
590/// for that use case.
591///
592/// ```
593/// use kham_core::soundex::sounds_like_cross_lang;
594///
595/// assert!(sounds_like_cross_lang("Robert",  "Rupert"));  // same code: "671763"
596/// assert!(sounds_like_cross_lang("กาน", "คาน"));         // ก and ค → same group
597/// assert!(!sounds_like_cross_lang("Robert",  "Smith"));
598/// ```
599pub fn sounds_like_cross_lang(a: &str, b: &str) -> bool {
600    let code_a = thai_english_soundex(a);
601    !code_a.is_empty() && code_a == thai_english_soundex(b)
602}
603
/// Returns the cross-language Soundex code fragment for one character.
///
/// ASCII letters are matched case-insensitively. Most characters have the same
/// code in every position; a few depend on `is_first`:
///
/// | Characters        | first | rest  |
/// |-------------------|-------|-------|
/// | A E I O U         | `"0"` | `"7"` |
/// | H, ห ฮ            | `"0"` | `"8"` |
/// | W, ว              | `"0"` | `"1"` |
/// | Y, ญ ย            | `"0"` | `"9"` |
/// | อ                 | `"0"` | `""`  |
///
/// ง returns `"52"` (two digits) in both positions. Characters with no entry
/// return `""` and are meant to be skipped by the caller.
fn cl_code(c: char, is_first: bool) -> &'static str {
    // Chooses between the first-position and rest-position fragment.
    let by_position = |first: &'static str, rest: &'static str| -> &'static str {
        if is_first {
            first
        } else {
            rest
        }
    };

    // `to_ascii_uppercase` leaves non-ASCII characters (all Thai) untouched.
    match c.to_ascii_uppercase() {
        // Position-sensitive: vowels, vowel carrier, glides and h.
        'A' | 'E' | 'I' | 'O' | 'U' => by_position("0", "7"),
        'H' | 'ห' | 'ฮ' => by_position("0", "8"),
        'W' | 'ว' => by_position("0", "1"),
        'Y' | 'ญ' | 'ย' => by_position("0", "9"),
        'อ' => by_position("0", ""),

        // Group 1 (B/F/P/V): bilabials + labiodentals
        'B' | 'F' | 'P' | 'V' => "1",
        'บ' | 'ป' | 'ผ' | 'พ' | 'ภ' | 'ฝ' | 'ฟ' => "1",

        // Group 2 (C/G/J/K/Q/S/X/Z): velars, palatals, sibilants
        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2",
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "2",
        'จ' | 'ฉ' | 'ช' | 'ฌ' => "2",
        'ซ' | 'ศ' | 'ษ' | 'ส' => "2",

        // ง = NG/NK → N-group then G/K-group
        'ง' => "52",

        // Group 3 (D/T): dental/alveolar stops
        'D' | 'T' => "3",
        'ฎ' | 'ด' | 'ฏ' | 'ต' | 'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => "3",

        // Group 4 (L): laterals
        'L' | 'ล' | 'ฬ' => "4",

        // Group 5 (M/N): nasals
        'M' | 'N' | 'ม' | 'ณ' | 'น' => "5",

        // Group 6 (R): rhotic
        'R' | 'ร' => "6",

        _ => "",
    }
}
685
/// True for Thai characters the cross-language soundex ignores outright.
///
/// Three code-point runs qualify:
/// - U+0E30–U+0E3A — sara vowels (ะ า ิ ี ึ ื ุ ู ฺ) and mai han akat;
/// - U+0E40–U+0E44 — leading vowels (เ แ โ ใ ไ);
/// - U+0E47–U+0E4E — mai tai khu, tone marks, thanthakat, nikhahit, yamakkan.
fn is_cl_skip(c: char) -> bool {
    let vowel_sign = ('\u{0E30}'..='\u{0E3A}').contains(&c);
    let lead_vowel = ('\u{0E40}'..='\u{0E44}').contains(&c);
    let diacritic = ('\u{0E47}'..='\u{0E4E}').contains(&c);
    vowel_sign || lead_vowel || diacritic
}
696
697#[cfg(test)]
698mod tests {
699    use super::*;
700
701    // ── lk82 ─────────────────────────────────────────────────────────────────
702
703    #[test]
704    fn lk82_worked_examples() {
705        assert_eq!(lk82("กาน"), "1600");
706        assert_eq!(lk82("ขาน"), "1600");
707        assert_eq!(lk82("คาน"), "1600");
708        assert_eq!(lk82("บ้าน"), "4600");
709        assert_eq!(lk82("มาก"), "5100");
710        assert_eq!(lk82("นาค"), "6100");
711        assert_eq!(lk82("กรุงเทพ"), "1873");
712    }
713
714    #[test]
715    fn lk82_same_initial_velar() {
716        assert_eq!(lk82("กาน"), lk82("ขาน"));
717        assert_eq!(lk82("กาน"), lk82("คาน"));
718    }
719
720    #[test]
721    fn lk82_different_initials() {
722        assert_ne!(lk82("กาน"), lk82("ปาน"));
723        assert_ne!(lk82("มาน"), lk82("นาน"));
724    }
725
726    #[test]
727    fn lk82_always_four_chars() {
728        assert_eq!(lk82("ก").len(), 4);
729        assert_eq!(lk82("กรุงเทพมหานคร").len(), 4);
730    }
731
732    #[test]
733    fn lk82_empty_and_no_thai() {
734        assert_eq!(lk82(""), "0000");
735        assert_eq!(lk82("123"), "0000");
736        assert_eq!(lk82("hello"), "0000");
737    }
738
739    #[test]
740    fn lk82_strips_silent_consonant() {
741        // กรณ์ → กร (ณ is silent)
742        assert_eq!(lk82("กรณ์"), lk82("กร"));
743    }
744
745    #[test]
746    fn lk82_deduplicates_adjacent_same_group() {
747        // กข → both code '1' → deduplicated to a single '1'
748        assert_eq!(lk82("กข"), "1000");
749    }
750
751    // ── udom83 ───────────────────────────────────────────────────────────────
752
753    #[test]
754    fn udom83_always_four_chars() {
755        assert_eq!(udom83("ก").len(), 4);
756        assert_eq!(udom83("กรุงเทพมหานคร").len(), 4);
757    }
758
759    #[test]
760    fn udom83_separates_liquids() {
761        assert_ne!(udom83("ลาน"), udom83("ราน"));
762    }
763
764    #[test]
765    fn udom83_sibilant_separate_from_affricate() {
766        assert_ne!(udom83("สาน"), udom83("ชาน"));
767        assert_eq!(udom83("สาน"), udom83("ซาน"));
768    }
769
770    #[test]
771    fn udom83_empty_and_no_thai() {
772        assert_eq!(udom83(""), "0000");
773        assert_eq!(udom83("abc"), "0000");
774    }
775
776    // ── metasound ─────────────────────────────────────────────────────────────
777
778    #[test]
779    fn metasound_worked_examples() {
780        // กาน: initial=ก(1) vowel=า(1) final=น(2)
781        assert_eq!(metasound("กาน"), "112");
782        // ขาน: ข shares group '1' with ก
783        assert_eq!(metasound("ขาน"), "112");
784        // กาม: different final ม(3)
785        assert_eq!(metasound("กาม"), "113");
786    }
787
788    #[test]
789    fn metasound_same_initial_group() {
790        assert_eq!(metasound("กาน"), metasound("ขาน"));
791        assert_eq!(metasound("กาน"), metasound("คาน"));
792    }
793
794    #[test]
795    fn metasound_distinguishes_finals() {
796        assert_ne!(metasound("กาน"), metasound("กาม"));
797        assert_ne!(metasound("กาน"), metasound("กาง"));
798    }
799
800    #[test]
801    fn metasound_vowel_length() {
802        // า long /aː/ (code '1') vs ะ short /a/ (code '0')
803        assert_ne!(metasound("กาน"), metasound("กะ"));
804    }
805
806    #[test]
807    fn metasound_lead_vowel_classes() {
808        // เ– class → vowel code '8'
809        let e_code = metasound("เกน");
810        assert_eq!(&e_code[1..2], "8");
811        // ไ / ใ → vowel code 'E'
812        let ai_code = metasound("ไก");
813        assert_eq!(&ai_code[1..2], "E");
814    }
815
816    #[test]
817    fn metasound_empty_and_no_thai() {
818        assert_eq!(metasound(""), "000");
819        assert_eq!(metasound("abc"), "000");
820        assert_eq!(metasound("123"), "000");
821    }
822
823    #[test]
824    fn metasound_open_syllable() {
825        // กา: no final consonant → final code '6'
826        assert_eq!(metasound("กา"), "116");
827    }
828
829    #[test]
830    fn metasound_sara_am() {
831        // กำ: nikhahit → vowel code 'D'
832        let code = metasound("กำ");
833        assert_eq!(&code[1..2], "D");
834    }
835
836    // ── soundex() enum API ────────────────────────────────────────────────────
837
838    #[test]
839    fn soundex_dispatches_to_lk82() {
840        assert_eq!(soundex("กาน", SoundexAlgorithm::Lk82), lk82("กาน"));
841    }
842
843    #[test]
844    fn soundex_dispatches_to_udom83() {
845        assert_eq!(soundex("กาน", SoundexAlgorithm::Udom83), udom83("กาน"));
846    }
847
848    #[test]
849    fn soundex_dispatches_to_metasound() {
850        assert_eq!(
851            soundex("กาน", SoundexAlgorithm::MetaSound),
852            metasound("กาน")
853        );
854    }
855
856    // ── sounds_like ───────────────────────────────────────────────────────────
857
858    #[test]
859    fn sounds_like_lk82_positive() {
860        assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::Lk82));
861    }
862
863    #[test]
864    fn sounds_like_lk82_negative() {
865        assert!(!sounds_like("กิน", "มิน", SoundexAlgorithm::Lk82));
866    }
867
868    #[test]
869    fn sounds_like_udom83_splits_liquids() {
870        assert!(!sounds_like("ลาน", "ราน", SoundexAlgorithm::Udom83));
871    }
872
873    #[test]
874    fn sounds_like_metasound_positive() {
875        assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::MetaSound));
876    }
877
878    #[test]
879    fn sounds_like_metasound_negative() {
880        assert!(!sounds_like("กาน", "กาม", SoundexAlgorithm::MetaSound));
881    }
882
883    #[test]
884    fn sounds_like_empty_returns_false() {
885        assert!(!sounds_like("", "กาน", SoundexAlgorithm::Lk82));
886        assert!(!sounds_like("กาน", "", SoundexAlgorithm::Lk82));
887    }
888
889    // ── English Soundex ───────────────────────────────────────────────────────
890
891    #[test]
892    fn english_soundex_standard_examples() {
893        assert_eq!(english_soundex("Robert"), "R163");
894        assert_eq!(english_soundex("Rupert"), "R163"); // same code as Robert
895        assert_eq!(english_soundex("McDonald"), "M235");
896        assert_eq!(english_soundex("Smith"), "S530");
897        assert_eq!(english_soundex("Thompson"), "T512");
898    }
899
900    #[test]
901    fn english_soundex_always_four_chars() {
902        assert_eq!(english_soundex("A").len(), 4);
903        assert_eq!(english_soundex("Robert").len(), 4);
904    }
905
906    #[test]
907    fn english_soundex_empty_and_no_alpha() {
908        assert_eq!(english_soundex(""), "");
909        assert_eq!(english_soundex("123"), "");
910    }
911
912    #[test]
913    fn english_soundex_case_insensitive() {
914        assert_eq!(english_soundex("robert"), english_soundex("Robert"));
915        assert_eq!(english_soundex("ROBERT"), english_soundex("Robert"));
916    }
917
918    #[test]
919    fn english_soundex_vowel_separates_same_code() {
920        // B and P are both code '1'; with a vowel between them they must NOT collapse.
921        // "Abba" → A(keep) b→1 b→same,skip → A100
922        assert_eq!(english_soundex("Abba"), "A100");
923        // "Ababar" — b(1) a(sep) b(1 again after vowel sep) → distinct
924        assert_eq!(&english_soundex("Ababar")[..2], "A1");
925    }
926
927    #[test]
928    fn english_soundex_adjacent_same_code_collapsed() {
929        // CK → both code '2'; adjacent → only one digit
930        assert_eq!(english_soundex("Jack"), "J200");
931    }
932
933    // ── Thai–English cross-language (Suwanvisat & Prasitjutrakul 1998) ──────────
934
935    #[test]
936    fn thai_english_soundex_english_numeric_codes() {
937        // First character also encoded as a digit (unlike standard Soundex)
938        assert_eq!(thai_english_soundex("Robert"), "671763");
939        assert_eq!(thai_english_soundex("Rupert"), "671763"); // same code — same initial-sound group
940    }
941
942    #[test]
943    fn thai_english_soundex_thai_direct_encoding() {
944        // Thai consonants map directly to the shared table — no romanizer needed
945        assert_eq!(thai_english_soundex("กน"), "25"); // ก→2 (K group), น→5 (N group)
946        assert_eq!(thai_english_soundex("ร"), "6"); // ร→6 (R group)
947        assert_eq!(thai_english_soundex("ก"), "2"); // single consonant → single digit
948    }
949
950    #[test]
951    fn thai_english_soundex_ng_two_digits() {
952        // ง (ng onset) encodes as "52": N-group (5) then G/K-group (2)
953        assert_eq!(thai_english_soundex("ง"), "52");
954    }
955
956    #[test]
957    fn thai_english_soundex_thai_vowels_skipped_english_vowels_to_7() {
958        // Thai vowel diacritics are skipped entirely
959        assert_eq!(thai_english_soundex("กิน"), "25"); // ิ (U+0E34) is skipped
960                                                      // English vowels in non-first position → '7' (retained, not dropped)
961        assert!(thai_english_soundex("Robert").contains('7')); // 'o' and 'e' → '7'
962    }
963
964    #[test]
965    fn thai_english_soundex_cross_lang_prefix_match() {
966        // McDonald and แมคโดนัลด์ share the same 3-char prefix "523"
967        let en = thai_english_soundex("McDonald");
968        let th = thai_english_soundex("แมคโดนัลด์");
969        assert!(en.len() >= 3 && th.len() >= 3, "codes too short");
970        assert_eq!(&en[..3], &th[..3]);
971    }
972
973    #[test]
974    fn thai_english_soundex_variable_length_and_empty() {
975        assert_eq!(thai_english_soundex(""), "");
976        assert_eq!(thai_english_soundex("123"), "");
977        // longer words produce longer codes
978        let long = thai_english_soundex("กรุงเทพมหานคร");
979        assert!(long.len() > 2);
980    }
981
982    #[test]
983    fn sounds_like_cross_lang_same_english() {
984        assert!(sounds_like_cross_lang("Robert", "Rupert"));
985    }
986
987    #[test]
988    fn sounds_like_cross_lang_same_thai_initial_group() {
989        // ก and ค are both in the K/G group (→ "2"); กาน and คาน share the full code
990        assert!(sounds_like_cross_lang("กาน", "คาน"));
991    }
992
993    #[test]
994    fn sounds_like_cross_lang_different() {
995        assert!(!sounds_like_cross_lang("Robert", "Smith"));
996        assert!(!sounds_like_cross_lang("กาน", "บาน")); // ก→2 vs บ→1
997    }
998
999    #[test]
1000    fn sounds_like_cross_lang_empty_returns_false() {
1001        assert!(!sounds_like_cross_lang("", "Robert"));
1002        assert!(!sounds_like_cross_lang("Robert", ""));
1003    }
1004}
kham_core/soundex.rs

kham_core/
soundex.rs