kham_core/soundex.rs
1//! Thai phonetic encoding (Soundex) — lk82, udom83, MetaSound, and Thai–English cross-language.
2//!
3//! Groups Thai words by sound so that spelling variants and near-homophones
4//! share the same code, enabling fuzzy search and name matching.
5//!
6//! ```
7//! use kham_core::soundex::{soundex, SoundexAlgorithm};
8//!
9//! // กาน / ขาน / คาน all share the same lk82 code
10//! assert_eq!(soundex("กาน", SoundexAlgorithm::Lk82),
11//! soundex("ขาน", SoundexAlgorithm::Lk82));
12//! assert_eq!(soundex("กาน", SoundexAlgorithm::Lk82), "1600");
13//! ```
14
15use alloc::string::String;
16use alloc::vec::Vec;
17
/// Phonetic encoding scheme used by [`soundex`] and [`sounds_like`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SoundexAlgorithm {
    /// LK82 (Lorchirachoonkul 1982) — the most widely used scheme; produces a
    /// fixed 4-character alphanumeric code.
    Lk82,
    /// Udom83 (Udompanich 1983) — like LK82 but with finer distinctions for
    /// sibilants and liquids.
    Udom83,
    /// MetaSound (Snae & Brückner 2009) — one `[initial][vowel][final]` triple
    /// per syllable, so the output length varies (3 characters per syllable).
    MetaSound,
}
29
30/// Encode a Thai word using the selected algorithm.
31///
32/// - `Lk82` / `Udom83` — always returns a 4-character ASCII code; `"0000"` if
33/// the word contains no Thai consonants.
34/// - `MetaSound` — returns 3 characters per syllable (variable length); `"000"`
35/// if the word contains no Thai consonants.
36///
37/// ```
38/// use kham_core::soundex::{soundex, SoundexAlgorithm};
39///
40/// assert_eq!(soundex("กาน", SoundexAlgorithm::Lk82),
41/// soundex("คาน", SoundexAlgorithm::Lk82));
42/// assert_ne!(soundex("กาน", SoundexAlgorithm::Lk82),
43/// soundex("บาน", SoundexAlgorithm::Lk82));
44/// assert_eq!(soundex("กาน", SoundexAlgorithm::MetaSound), "112");
45/// ```
46pub fn soundex(word: &str, algo: SoundexAlgorithm) -> String {
47 match algo {
48 SoundexAlgorithm::Lk82 => lk82(word),
49 SoundexAlgorithm::Udom83 => udom83(word),
50 SoundexAlgorithm::MetaSound => metasound(word),
51 }
52}
53
54/// Returns `true` if two words share the same phonetic code under the given algorithm.
55///
56/// Returns `false` if either word is empty or contains no recognisable Thai consonants.
57/// Works for all three algorithms — lk82, udom83, and MetaSound.
58///
59/// ```
60/// use kham_core::soundex::{sounds_like, SoundexAlgorithm};
61///
62/// assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::Lk82));
63/// assert!(!sounds_like("กิน", "มิน", SoundexAlgorithm::Lk82));
64/// assert!(!sounds_like("ลาน", "ราน", SoundexAlgorithm::Udom83)); // ล / ร split in udom83
65/// assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::MetaSound));
66/// ```
67pub fn sounds_like(a: &str, b: &str, algo: SoundexAlgorithm) -> bool {
68 if a.is_empty() || b.is_empty() {
69 return false;
70 }
71 let code_a = soundex(a, algo);
72 // All-zero codes mean no recognisable Thai consonants (lk82/udom83 → "0000", MetaSound → "000")
73 !code_a.chars().all(|c| c == '0') && code_a == soundex(b, algo)
74}
75
76// ── LK82 (Lorchirachoonkul 1982) ─────────────────────────────────────────────
77
78/// Encode a Thai word using the LK82 algorithm (Lorchirachoonkul 1982).
79///
80/// Maps consonants to 12 groups (`'0'`–`'9'`, `'A'`, `'B'`), removes adjacent
81/// duplicates, and pads to exactly 4 characters with `'0'`.
82///
83/// ```
84/// use kham_core::soundex::lk82;
85///
86/// assert_eq!(lk82("กาน"), "1600");
87/// assert_eq!(lk82("ขาน"), "1600"); // ก / ข in the same group
88/// assert_eq!(lk82("บ้าน"), "4600");
89/// assert_eq!(lk82("กรุงเทพ"), "1873");
90/// ```
91pub fn lk82(word: &str) -> String {
92 encode(word, lk82_code)
93}
94
/// LK82 consonant-group table: maps one Thai consonant to its ASCII group code.
///
/// Returns `b'0'` for อ and for every character that is not listed.
/// The obsolete velars ฃ and ฅ are grouped with ข / ค, so archaic spellings
/// encode like their modern equivalents.
fn lk82_code(c: char) -> u8 {
    match c {
        'อ' => b'0',
        // velars, including the obsolete letters ฃ (U+0E03) and ฅ (U+0E05)
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => b'1',
        'จ' | 'ช' | 'ซ' | 'ศ' | 'ษ' | 'ส' | 'ฉ' | 'ฌ' | 'ญ' => b'2',
        'ต' | 'ถ' | 'ท' | 'ธ' | 'ฏ' | 'ฐ' | 'ฑ' | 'ฒ' | 'ด' | 'ฎ' => b'3',
        'บ' | 'ป' | 'พ' | 'ผ' | 'ภ' | 'ฝ' | 'ฟ' => b'4',
        'ม' => b'5',
        'น' | 'ณ' => b'6',
        'ง' => b'7',
        'ล' | 'ร' | 'ฬ' => b'8',
        'ว' => b'9',
        'ย' => b'A',
        'ห' | 'ฮ' => b'B',
        _ => b'0',
    }
}
112
113// ── Udom83 (Udompanich 1983) ─────────────────────────────────────────────────
114
115/// Encode a Thai word using the Udom83 algorithm (Udompanich 1983).
116///
117/// Uses finer groupings than lk82: sibilants (ซ ศ ษ ส) are separate from
118/// affricates (จ ช ฉ ฌ), and the liquids ร and ล are in different groups.
119///
120/// ```
121/// use kham_core::soundex::udom83;
122///
123/// // ส (sibilant) and ช (affricate) are different groups in udom83
124/// assert_ne!(udom83("สาน"), udom83("ชาน"));
125/// // but ส and ซ share the same sibilant group
126/// assert_eq!(udom83("สาน"), udom83("ซาน"));
127/// // ล and ร are split
128/// assert_ne!(udom83("ลาน"), udom83("ราน"));
129/// ```
130pub fn udom83(word: &str) -> String {
131 encode(word, udom83_code)
132}
133
/// Udom83 consonant-group table: maps one Thai consonant to its ASCII group code.
///
/// Returns `b'0'` for อ and for every character that is not listed.
/// The obsolete velars ฃ and ฅ are grouped with ข / ค, so archaic spellings
/// encode like their modern equivalents.
fn udom83_code(c: char) -> u8 {
    match c {
        'อ' => b'0',
        // velars, including the obsolete letters ฃ (U+0E03) and ฅ (U+0E05)
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => b'1',
        'จ' | 'ช' | 'ฉ' | 'ฌ' => b'2',
        'ซ' | 'ศ' | 'ษ' | 'ส' => b'3',
        'ต' | 'ถ' | 'ท' | 'ธ' | 'ฏ' | 'ฐ' | 'ฑ' | 'ฒ' | 'ด' | 'ฎ' => b'4',
        'บ' | 'ป' | 'พ' | 'ผ' | 'ภ' | 'ฝ' | 'ฟ' => b'5',
        'ม' => b'6',
        'น' | 'ณ' | 'ญ' => b'7',
        'ง' => b'8',
        'ล' | 'ฬ' => b'9',
        'ร' => b'A',
        'ว' => b'B',
        'ย' => b'C',
        'ห' | 'ฮ' => b'D',
        _ => b'0',
    }
}
153
154// ── MetaSound (Snae & Brückner 2009) ─────────────────────────────────────────
155
156/// Encode a Thai word using the MetaSound algorithm (Snae & Brückner 2009).
157///
158/// Returns a variable-length ASCII code: **3 characters per syllable**
159/// (`[initial][vowel][final]`). More discriminating than lk82/udom83 — it
160/// encodes vowel length and final consonant class in addition to the onset.
161///
162/// Returns `"000"` if `word` contains no Thai consonants.
163///
164/// **Note:** Consonant clusters (e.g. กร, กล) are parsed as separate units;
165/// this is an approximation of the full syllable-parser approach.
166///
167/// ```
168/// use kham_core::soundex::metasound;
169///
170/// assert_eq!(metasound("กาน"), "112"); // initial=ก(1) vowel=า(1) final=น(2)
171/// assert_eq!(metasound("ขาน"), "112"); // ข same initial group as ก
172/// assert_eq!(metasound("กาม"), "113"); // final=ม(3) differs from น(2)
173/// assert_ne!(metasound("กาน"), metasound("กาม"));
174/// ```
175pub fn metasound(word: &str) -> String {
176 let chars: Vec<char> = word.chars().collect();
177 let len = chars.len();
178 let mut result = String::new();
179 let mut i = 0;
180
181 while i < len {
182 // 1. Optional lead vowel (เ แ โ ไ ใ appear before the consonant in Unicode)
183 let lead = if is_ms_lead(chars[i]) {
184 let v = chars[i];
185 i += 1;
186 Some(v)
187 } else {
188 None
189 };
190
191 // 2. Initial consonant (required to emit a syllable code)
192 if i >= len || !is_thai_consonant(chars[i]) {
193 if lead.is_none() {
194 i += 1; // skip non-Thai char
195 }
196 continue;
197 }
198 let initial = chars[i];
199 i += 1;
200
201 // Thanthakat immediately after initial → silent consonant; skip syllable
202 if i < len && chars[i] == '\u{0E4C}' {
203 i += 1;
204 continue;
205 }
206
207 // 3. Upper vowel signs (ิ ี ึ ื ั ุ ู) and tone marks above/below the initial
208 let mut upper: Option<char> = None;
209 let mut nikhahit = false;
210 while i < len {
211 match chars[i] {
212 c if is_ms_upper(c) => {
213 upper = Some(c);
214 i += 1;
215 }
216 c if is_ms_tone(c) => {
217 i += 1;
218 }
219 '\u{0E4D}' => {
220 // Nikhahit อํ — upper component of sara am (–ำ)
221 nikhahit = true;
222 i += 1;
223 }
224 _ => break,
225 }
226 }
227
228 // 4. Follow vowel (า ะ ำ appear after the consonant spine)
229 let follow = if i < len && is_ms_follow(chars[i]) {
230 let v = chars[i];
231 i += 1;
232 Some(v)
233 } else {
234 None
235 };
236
237 // 5. Final consonant — present only when the next consonant is NOT followed
238 // by a vowel mark (which would make it the initial of the next syllable).
239 let final_c = if i < len && is_thai_consonant(chars[i]) {
240 let next = i + 1;
241 if next < len && chars[next] == '\u{0E4C}' {
242 // Silent consonant (e.g. กรณ์): consume both and produce no final
243 i += 2;
244 None
245 } else if next < len
246 && (is_ms_upper(chars[next])
247 || is_ms_follow(chars[next])
248 || is_ms_lead(chars[next]))
249 {
250 // Consonant has its own vowel → next syllable's initial; don't consume
251 None
252 } else {
253 let fc = chars[i];
254 i += 1;
255 Some(fc)
256 }
257 } else {
258 None
259 };
260
261 // Emit [initial][vowel][final]
262 result.push(ms_initial_code(initial) as char);
263 result.push(ms_vowel_code(lead, upper, follow, nikhahit) as char);
264 result.push(ms_final_code(final_c) as char);
265 }
266
267 if result.is_empty() {
268 "000".into()
269 } else {
270 result
271 }
272}
273
/// MetaSound onset table: maps a syllable-initial consonant to its code
/// (`'1'`–`'9'`, `'A'`–`'J'`).
///
/// อ and every character that is not listed map to `'J'` (glottal / null
/// onset). The obsolete velars ฃ and ฅ are grouped with ข / ค.
fn ms_initial_code(c: char) -> u8 {
    match c {
        // velars, including the obsolete letters ฃ (U+0E03) and ฅ (U+0E05)
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => b'1',
        'ง' => b'2',
        'จ' | 'ช' | 'ฉ' | 'ฌ' => b'3',
        'ซ' | 'ศ' | 'ษ' | 'ส' => b'4',
        'ญ' | 'ย' => b'5',
        'ฎ' | 'ด' => b'6',
        'ฏ' | 'ต' => b'7',
        'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => b'8',
        'น' | 'ณ' => b'9',
        'บ' => b'A',
        'ป' => b'B',
        'ผ' | 'พ' | 'ภ' => b'C',
        'ฝ' | 'ฟ' => b'D',
        'ม' => b'E',
        'ร' => b'F',
        'ล' | 'ฬ' => b'G',
        'ว' => b'H',
        'ห' | 'ฮ' => b'I',
        _ => b'J', // อ and unknowns → glottal / null onset
    }
}
297
/// MetaSound vowel table: derives the vowel code of one syllable from the
/// vowel pieces collected around its initial consonant.
///
/// Precedence, highest first:
/// 1. `nikhahit` set → `'D'` (sara am);
/// 2. a recognised leading vowel (`lead`: เ แ โ ไ ใ);
/// 3. a recognised sign above/below the consonant (`upper`);
/// 4. the following vowel (`follow`: ะ า ำ);
/// 5. otherwise `'0'` (implicit short /a/).
fn ms_vowel_code(
    lead: Option<char>,
    upper: Option<char>,
    follow: Option<char>,
    nikhahit: bool,
) -> u8 {
    // Sara am written with nikhahit wins over everything else.
    if nikhahit {
        return b'D';
    }

    // Leading vowels decide the class on their own; only เ looks at `follow`.
    if let Some(vowel) = lead {
        match vowel {
            'ไ' | 'ใ' => return b'E', // /ai/
            'เ' => {
                // เ–า is /ao/; every other เ– form is the /e/ class.
                return if follow == Some('\u{0E32}') { b'F' } else { b'8' };
            }
            'แ' => return b'8', // /ɛ/ class (short or long)
            'โ' => return b'9', // /o/ class (short or long)
            _ => {} // not a known lead vowel → fall through
        }
    }

    // Signs written above or below the consonant.
    if let Some(sign) = upper {
        match sign {
            '\u{0E31}' => return b'0', // ั (mai han akat) short /a/
            '\u{0E34}' => return b'2', // ิ short /i/
            '\u{0E35}' => return b'3', // ี long /iː/
            '\u{0E36}' => return b'4', // ึ short /ɯ/
            '\u{0E37}' => return b'5', // ื long /ɯː/
            '\u{0E38}' => return b'6', // ุ short /u/
            '\u{0E39}' => return b'7', // ู long /uː/
            _ => {} // any other sign carries no vowel class → fall through
        }
    }

    // Vowels written after the consonant; ะ and "nothing" both mean short /a/.
    match follow {
        Some('\u{0E32}') => b'1', // า long /aː/
        Some('\u{0E33}') => b'D', // ำ /am/
        _ => b'0',
    }
}
336
/// MetaSound coda table: maps the syllable-final consonant (if any) to its code.
///
/// All velar stops share `'1'`: in final position ข ค ฆ (and the obsolete
/// ฃ ฅ) sound the same as ก, so นาค and นาก receive the same final code.
///
/// `'6'` is returned when there is no final consonant and also for any
/// consonant not listed here (for example ด or บ).
fn ms_final_code(c: Option<char>) -> u8 {
    match c {
        Some('ก') | Some('ข') | Some('ฃ') | Some('ค') | Some('ฅ') | Some('ฆ') => b'1', // velar stop
        Some('น') | Some('ณ') | Some('ญ') | Some('ร') | Some('ล') | Some('ฬ') => b'2', // alveolar sonorant
        Some('ม') => b'3',             // bilabial nasal
        Some('ง') => b'4',             // velar nasal
        Some('ย') | Some('ว') => b'5', // glide
        _ => b'6',                     // no final, or a consonant without its own class
    }
}
347
348// Character class helpers used only by MetaSound (lk82/udom83 use is_thai_consonant only)
349
/// True for the leading vowels เ แ โ ใ ไ (U+0E40–U+0E44), which are stored
/// before the consonant they belong to.
fn is_ms_lead(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}
353
/// True for the signs written above or below a consonant: mai han akat ั
/// (U+0E31), sara ิ ี ึ ื ุ ู (U+0E34–U+0E39) and phinthu (U+0E3A).
fn is_ms_upper(c: char) -> bool {
    matches!(c, '\u{0E31}' | '\u{0E34}'..='\u{0E3A}')
}
358
/// True for the vowels written after the consonant: ะ (U+0E30), า (U+0E32)
/// and ำ (U+0E33).
fn is_ms_follow(c: char) -> bool {
    ['\u{0E30}', '\u{0E32}', '\u{0E33}'].contains(&c)
}
362
/// True for the four tone marks ่ ้ ๊ ๋ (U+0E48–U+0E4B).
fn is_ms_tone(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
366
367// ── shared helpers ────────────────────────────────────────────────────────────
368
/// Remove silent consonants — those marked with ์ (thanthakhat, U+0E4C).
///
/// Two spellings are recognised:
/// - `X` + ์ — the character directly before the mark is dropped together
///   with it (e.g. กรณ์ → กร);
/// - consonant + ิ / ุ / ู + ์ — the silent consonant may carry a vowel sign
///   under the mark (e.g. พันธุ์ → พัน, สิทธิ์ → สิท); all three characters go.
///
/// Everything else is copied through unchanged.
fn strip_silent(s: &str) -> String {
    const THANTHAKHAT: char = '\u{0E4C}';

    let chars: Vec<char> = s.chars().collect();
    let mut out = String::with_capacity(s.len());
    let mut i = 0;
    while i < chars.len() {
        // `X` + ์
        if chars.get(i + 1) == Some(&THANTHAKHAT) {
            i += 2;
            continue;
        }
        // consonant + (ิ | ุ | ู) + ์ — without this the vowel sign alone
        // would be removed and the silent consonant would survive.
        let is_consonant = ('\u{0E01}'..='\u{0E2E}').contains(&chars[i]);
        let has_vowel_sign = matches!(
            chars.get(i + 1).copied(),
            Some('\u{0E34}') | Some('\u{0E38}') | Some('\u{0E39}')
        );
        if is_consonant && has_vowel_sign && chars.get(i + 2) == Some(&THANTHAKHAT) {
            i += 3;
            continue;
        }
        out.push(chars[i]);
        i += 1;
    }
    out
}
384
/// True when `c` lies in the Thai consonant range ก–ฮ (U+0E01–U+0E2E).
///
/// The check is a plain range test, so every code point in that span counts.
fn is_thai_consonant(c: char) -> bool {
    matches!(c, '\u{0E01}'..='\u{0E2E}')
}
389
390/// Core encoder: strip silent consonants → map codes → dedup adjacent → pad to 4.
391fn encode(word: &str, code_fn: fn(char) -> u8) -> String {
392 const LEN: usize = 4;
393
394 let stripped = strip_silent(word);
395 let mut codes: Vec<u8> = Vec::with_capacity(LEN);
396 let mut last: Option<u8> = None;
397
398 for ch in stripped.chars() {
399 if !is_thai_consonant(ch) {
400 continue;
401 }
402 let code = code_fn(ch);
403 if Some(code) != last {
404 codes.push(code);
405 last = Some(code);
406 }
407 if codes.len() == LEN {
408 break;
409 }
410 }
411
412 while codes.len() < LEN {
413 codes.push(b'0');
414 }
415
416 String::from_utf8(codes).expect("soundex codes are ASCII")
417}
418
419// ── Thai–English cross-language Soundex (Suwanvisat & Prasitjutrakul 1998) ──
420//
421// Source: "Thai-English Cross-Language Transliterated Word Retrieval using
422// Soundex Technique", NECTEC Annual Conference 1998.
423//
424// The paper extends Odell & Russell's Soundex to a combined Thai+English table
425// so that both a Thai transliteration and its English source word encode to the
426// same (or prefix-matched) code — no romanization step needed.
427//
428// Key differences from standard Soundex:
429// • First character encodes as a digit too (not kept as a letter)
430// • Vowels in non-first position → '7' (not dropped)
431// • H → '8', W → '1', Y → '9' in non-first position
432// • Thai consonants map directly to the same 7 groups as English
433// • ง (ng) → "52" (two digits: N-group then G/K-group)
434// • Code is variable-length (unlimited); callers choose a minimum k for matching
435
436/// Encode a Thai or English word into a shared cross-language phonetic code.
437///
438/// Implements the Suwanvisat & Prasitjutrakul (1998) modified Soundex that
439/// extends the encoding table to cover both Thai consonants and English letters
440/// directly — **no romanization step required**. A Thai transliteration and its
441/// English source word produce codes that share a common prefix, enabling
442/// cross-language retrieval of transliterated proper nouns and loan words.
443///
444/// **Encoding rules:**
445/// - Every character (Thai consonant or English letter) is mapped to a digit; the
446/// first character is also encoded numerically (unlike standard Soundex).
447/// - English vowels A/E/I/O/U → `'7'` in non-first position (retained, not dropped).
448/// - H → `'8'`, W → `'1'`, Y → `'9'` in non-first position.
449/// - Thai vowel marks (sara, tone marks, leading vowels) are skipped entirely.
450/// - ง maps to `"52"` (N-group + G/K-group, representing the NG onset).
451/// - Adjacent identical digits collapse to one (standard deduplication).
452/// - Output is **variable length** — longer words produce longer codes.
453///
454/// Returns `""` if `word` contains no encodable characters.
455///
456/// ```
457/// use kham_core::soundex::thai_english_soundex;
458///
459/// // Same initial-group consonants produce a common prefix
460/// assert_eq!(&thai_english_soundex("McDonald")[..3],
461/// &thai_english_soundex("แมคโดนัลด์")[..3]);
462/// // English words encode to fully numeric codes
463/// assert_eq!(thai_english_soundex("Robert"), "671763");
464/// ```
465pub fn thai_english_soundex(word: &str) -> String {
466 let chars: Vec<char> = word.chars().collect();
467 let len = chars.len();
468 let mut result = String::new();
469 let mut last_digit: Option<char> = None;
470 let mut is_first = true;
471 let mut i = 0;
472
473 while i < len {
474 let c = chars[i];
475
476 // Thai vowel marks, tone marks, leading vowels — skip entirely
477 if is_cl_skip(c) {
478 i += 1;
479 continue;
480 }
481
482 // Silent Thai consonant: consonant immediately followed by ์ (thanthakat)
483 if is_thai_consonant(c) && i + 1 < len && chars[i + 1] == '\u{0E4C}' {
484 i += 2;
485 continue;
486 }
487
488 // Only encode ASCII alpha and Thai consonants
489 if !c.is_ascii_alphabetic() && !is_thai_consonant(c) {
490 i += 1;
491 continue;
492 }
493
494 let code = cl_code(c, is_first);
495 if !code.is_empty() {
496 is_first = false;
497 }
498 for digit in code.chars() {
499 if Some(digit) != last_digit {
500 result.push(digit);
501 last_digit = Some(digit);
502 }
503 }
504
505 i += 1;
506 }
507
508 result
509}
510
/// Encode an English (or romanized) word using standard Soundex (Odell & Russell).
///
/// Retains the first letter, replaces remaining consonants with digits `1`–`6`,
/// collapses repeated codes, drops vowels and H/W/Y, and pads to exactly 4
/// characters. Returns `""` if `word` contains no ASCII alphabetic characters.
///
/// Repeated codes collapse when the letters are adjacent **or separated only
/// by H or W** (so "Ashcraft" is `A261`, not `A226`). A vowel or Y between
/// them keeps both (so "Tymczak" is `T522`). The first letter takes part in
/// this rule too ("Pfister" is `P236`).
///
/// | Digit | Letters                                   |
/// |-------|-------------------------------------------|
/// | `1`   | B F P V                                   |
/// | `2`   | C G J K Q S X Z                           |
/// | `3`   | D T                                       |
/// | `4`   | L                                         |
/// | `5`   | M N                                       |
/// | `6`   | R                                         |
/// | skip  | A E I O U Y (separate repeated codes)     |
/// | skip  | H W (do not separate repeated codes)      |
///
/// ```
/// use kham_core::soundex::english_soundex;
///
/// assert_eq!(english_soundex("Robert"), "R163");
/// assert_eq!(english_soundex("Rupert"), "R163");
/// assert_eq!(english_soundex("McDonald"), "M235");
/// assert_eq!(english_soundex("Smith"), "S530");
/// assert_eq!(english_soundex("Ashcraft"), "A261");
/// ```
pub fn english_soundex(word: &str) -> String {
    let mut chars = word
        .chars()
        .filter(|c| c.is_ascii_alphabetic())
        .map(|c| c.to_ascii_uppercase());

    let first = match chars.next() {
        Some(c) => c,
        None => return String::new(),
    };

    let mut code = String::with_capacity(4);
    code.push(first);
    // Seed with the first letter's own digit so an immediate repeat is dropped.
    let mut last = std_soundex_digit(first);

    for c in chars {
        // H and W are transparent: they are not coded and they do not reset
        // `last`, so equal codes on either side still collapse.
        if matches!(c, 'H' | 'W') {
            continue;
        }
        let d = std_soundex_digit(c);
        if d == '0' {
            last = '0'; // vowel / Y — acts as separator
        } else if d != last {
            code.push(d);
            last = d;
            if code.len() == 4 {
                break;
            }
        }
    }

    while code.len() < 4 {
        code.push('0');
    }
    code
}

/// Standard Soundex digit (Odell & Russell) for an upper-case ASCII letter.
/// Returns `'0'` for letters that carry no code (vowels, H, W, Y) and for any
/// other character.
fn std_soundex_digit(c: char) -> char {
    match c {
        'B' | 'F' | 'P' | 'V' => '1',
        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
        'D' | 'T' => '3',
        'L' => '4',
        'M' | 'N' => '5',
        'R' => '6',
        _ => '0',
    }
}
582
583/// Returns `true` if two words share the same cross-language phonetic code.
584///
585/// Accepts Thai, English, or mixed input — no romanizer required. Returns
586/// `false` if either word produces an empty code. For cross-language
587/// Thai↔English pairs (e.g. transliterated loan words), the codes share a
588/// common prefix even if not exactly equal; prefer comparing
589/// [`thai_english_soundex`] codes directly with a minimum-length threshold
590/// for that use case.
591///
592/// ```
593/// use kham_core::soundex::sounds_like_cross_lang;
594///
595/// assert!(sounds_like_cross_lang("Robert", "Rupert")); // same code: "671763"
596/// assert!(sounds_like_cross_lang("กาน", "คาน")); // ก and ค → same group
597/// assert!(!sounds_like_cross_lang("Robert", "Smith"));
598/// ```
599pub fn sounds_like_cross_lang(a: &str, b: &str) -> bool {
600 let code_a = thai_english_soundex(a);
601 !code_a.is_empty() && code_a == thai_english_soundex(b)
602}
603
/// Cross-language Soundex fragment for a single character.
///
/// ASCII letters are matched case-insensitively. Most characters have the
/// same code in every position; only the "weak" sounds depend on `is_first`:
///
/// | Characters         | first position | later positions |
/// |--------------------|----------------|-----------------|
/// | A E I O U          | `"0"`          | `"7"`           |
/// | H, ห, ฮ            | `"0"`          | `"8"`           |
/// | W, ว               | `"0"`          | `"1"`           |
/// | Y, ญ, ย            | `"0"`          | `"9"`           |
/// | อ                  | `"0"`          | `""` (skipped)  |
///
/// ง yields the two-digit fragment `"52"` in both positions. Any character
/// that is neither an ASCII letter nor a listed Thai consonant yields `""`.
fn cl_code(c: char, is_first: bool) -> &'static str {
    // `to_ascii_uppercase` folds ASCII letters and leaves every other
    // character (including Thai) untouched, so one table serves both scripts.
    match c.to_ascii_uppercase() {
        // ── position-dependent sounds ────────────────────────────────────
        'A' | 'E' | 'I' | 'O' | 'U' => {
            if is_first {
                "0"
            } else {
                "7"
            }
        }
        'H' | 'ห' | 'ฮ' => {
            if is_first {
                "0"
            } else {
                "8"
            }
        }
        'W' | 'ว' => {
            if is_first {
                "0"
            } else {
                "1"
            }
        }
        'Y' | 'ญ' | 'ย' => {
            if is_first {
                "0"
            } else {
                "9"
            }
        }
        // อ is a pure vowel carrier: coded only when it opens the word.
        'อ' => {
            if is_first {
                "0"
            } else {
                ""
            }
        }

        // ── position-independent groups ──────────────────────────────────
        // Group 1 (B/F/P/V): bilabials + labiodentals
        'B' | 'F' | 'P' | 'V' | 'บ' | 'ป' | 'ผ' | 'พ' | 'ภ' | 'ฝ' | 'ฟ' => "1",
        // Group 2 (C/G/J/K/Q/S/X/Z): velars, palatals, sibilants
        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2",
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "2",
        'จ' | 'ฉ' | 'ช' | 'ฌ' => "2",
        'ซ' | 'ศ' | 'ษ' | 'ส' => "2",
        // ง = NG/NK → N-group then G/K-group
        'ง' => "52",
        // Group 3 (D/T): dental/alveolar stops
        'D' | 'T' | 'ฎ' | 'ด' | 'ฏ' | 'ต' | 'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => "3",
        // Group 4 (L): laterals
        'L' | 'ล' | 'ฬ' => "4",
        // Group 5 (M/N): nasals
        'M' | 'N' | 'ม' | 'ณ' | 'น' => "5",
        // Group 6 (R): rhotic
        'R' | 'ร' => "6",

        _ => "",
    }
}
685
/// True for Thai characters the cross-language encoder ignores: vowel signs,
/// leading vowels, tone marks, thanthakhat and nikhahit — the non-consonant
/// marks in the three ranges below.
fn is_cl_skip(c: char) -> bool {
    // sara vowels (ะ า ิ ี ึ ื ุ ู ฺ) and mai han akat
    let vowel_signs = '\u{0E30}'..='\u{0E3A}';
    // leading vowels (เ แ โ ไ ใ)
    let lead_vowels = '\u{0E40}'..='\u{0E44}';
    // mai tai khu, tone marks, ์, nikhahit, ๎
    let upper_marks = '\u{0E47}'..='\u{0E4E}';

    vowel_signs.contains(&c) || lead_vowels.contains(&c) || upper_marks.contains(&c)
}
696
// Unit tests for every encoder in this module. Expected codes are written out
// literally so that a change to any mapping table shows up as a failing test.
#[cfg(test)]
mod tests {
    use super::*;

    // ── lk82 ─────────────────────────────────────────────────────────────────

    #[test]
    fn lk82_worked_examples() {
        assert_eq!(lk82("กาน"), "1600");
        assert_eq!(lk82("ขาน"), "1600");
        assert_eq!(lk82("คาน"), "1600");
        assert_eq!(lk82("บ้าน"), "4600");
        assert_eq!(lk82("มาก"), "5100");
        assert_eq!(lk82("นาค"), "6100");
        assert_eq!(lk82("กรุงเทพ"), "1873");
    }

    #[test]
    fn lk82_same_initial_velar() {
        assert_eq!(lk82("กาน"), lk82("ขาน"));
        assert_eq!(lk82("กาน"), lk82("คาน"));
    }

    #[test]
    fn lk82_different_initials() {
        assert_ne!(lk82("กาน"), lk82("ปาน"));
        assert_ne!(lk82("มาน"), lk82("นาน"));
    }

    #[test]
    fn lk82_always_four_chars() {
        assert_eq!(lk82("ก").len(), 4);
        assert_eq!(lk82("กรุงเทพมหานคร").len(), 4);
    }

    #[test]
    fn lk82_empty_and_no_thai() {
        assert_eq!(lk82(""), "0000");
        assert_eq!(lk82("123"), "0000");
        assert_eq!(lk82("hello"), "0000");
    }

    #[test]
    fn lk82_strips_silent_consonant() {
        // กรณ์ → กร (ณ is silent)
        assert_eq!(lk82("กรณ์"), lk82("กร"));
    }

    #[test]
    fn lk82_deduplicates_adjacent_same_group() {
        // กข → both code '1' → deduplicated to a single '1'
        assert_eq!(lk82("กข"), "1000");
    }

    // ── udom83 ───────────────────────────────────────────────────────────────

    #[test]
    fn udom83_always_four_chars() {
        assert_eq!(udom83("ก").len(), 4);
        assert_eq!(udom83("กรุงเทพมหานคร").len(), 4);
    }

    #[test]
    fn udom83_separates_liquids() {
        assert_ne!(udom83("ลาน"), udom83("ราน"));
    }

    #[test]
    fn udom83_sibilant_separate_from_affricate() {
        assert_ne!(udom83("สาน"), udom83("ชาน"));
        assert_eq!(udom83("สาน"), udom83("ซาน"));
    }

    #[test]
    fn udom83_empty_and_no_thai() {
        assert_eq!(udom83(""), "0000");
        assert_eq!(udom83("abc"), "0000");
    }

    // ── metasound ─────────────────────────────────────────────────────────────

    #[test]
    fn metasound_worked_examples() {
        // กาน: initial=ก(1) vowel=า(1) final=น(2)
        assert_eq!(metasound("กาน"), "112");
        // ขาน: ข shares group '1' with ก
        assert_eq!(metasound("ขาน"), "112");
        // กาม: different final ม(3)
        assert_eq!(metasound("กาม"), "113");
    }

    #[test]
    fn metasound_same_initial_group() {
        assert_eq!(metasound("กาน"), metasound("ขาน"));
        assert_eq!(metasound("กาน"), metasound("คาน"));
    }

    #[test]
    fn metasound_distinguishes_finals() {
        assert_ne!(metasound("กาน"), metasound("กาม"));
        assert_ne!(metasound("กาน"), metasound("กาง"));
    }

    #[test]
    fn metasound_vowel_length() {
        // า long /aː/ (code '1') vs ะ short /a/ (code '0')
        assert_ne!(metasound("กาน"), metasound("กะ"));
    }

    #[test]
    fn metasound_lead_vowel_classes() {
        // เ– class → vowel code '8'
        let e_code = metasound("เกน");
        assert_eq!(&e_code[1..2], "8");
        // ไ / ใ → vowel code 'E'
        let ai_code = metasound("ไก");
        assert_eq!(&ai_code[1..2], "E");
    }

    #[test]
    fn metasound_empty_and_no_thai() {
        assert_eq!(metasound(""), "000");
        assert_eq!(metasound("abc"), "000");
        assert_eq!(metasound("123"), "000");
    }

    #[test]
    fn metasound_open_syllable() {
        // กา: no final consonant → final code '6'
        assert_eq!(metasound("กา"), "116");
    }

    #[test]
    fn metasound_sara_am() {
        // กำ: sara am → vowel code 'D'
        let code = metasound("กำ");
        assert_eq!(&code[1..2], "D");
    }

    // ── soundex() enum API ────────────────────────────────────────────────────

    #[test]
    fn soundex_dispatches_to_lk82() {
        assert_eq!(soundex("กาน", SoundexAlgorithm::Lk82), lk82("กาน"));
    }

    #[test]
    fn soundex_dispatches_to_udom83() {
        assert_eq!(soundex("กาน", SoundexAlgorithm::Udom83), udom83("กาน"));
    }

    #[test]
    fn soundex_dispatches_to_metasound() {
        assert_eq!(
            soundex("กาน", SoundexAlgorithm::MetaSound),
            metasound("กาน")
        );
    }

    // ── sounds_like ───────────────────────────────────────────────────────────

    #[test]
    fn sounds_like_lk82_positive() {
        assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::Lk82));
    }

    #[test]
    fn sounds_like_lk82_negative() {
        assert!(!sounds_like("กิน", "มิน", SoundexAlgorithm::Lk82));
    }

    #[test]
    fn sounds_like_udom83_splits_liquids() {
        assert!(!sounds_like("ลาน", "ราน", SoundexAlgorithm::Udom83));
    }

    #[test]
    fn sounds_like_metasound_positive() {
        assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::MetaSound));
    }

    #[test]
    fn sounds_like_metasound_negative() {
        assert!(!sounds_like("กาน", "กาม", SoundexAlgorithm::MetaSound));
    }

    #[test]
    fn sounds_like_empty_returns_false() {
        assert!(!sounds_like("", "กาน", SoundexAlgorithm::Lk82));
        assert!(!sounds_like("กาน", "", SoundexAlgorithm::Lk82));
    }

    // ── English Soundex ───────────────────────────────────────────────────────

    #[test]
    fn english_soundex_standard_examples() {
        assert_eq!(english_soundex("Robert"), "R163");
        assert_eq!(english_soundex("Rupert"), "R163"); // same code as Robert
        assert_eq!(english_soundex("McDonald"), "M235");
        assert_eq!(english_soundex("Smith"), "S530");
        assert_eq!(english_soundex("Thompson"), "T512");
    }

    #[test]
    fn english_soundex_always_four_chars() {
        assert_eq!(english_soundex("A").len(), 4);
        assert_eq!(english_soundex("Robert").len(), 4);
    }

    #[test]
    fn english_soundex_empty_and_no_alpha() {
        assert_eq!(english_soundex(""), "");
        assert_eq!(english_soundex("123"), "");
    }

    #[test]
    fn english_soundex_case_insensitive() {
        assert_eq!(english_soundex("robert"), english_soundex("Robert"));
        assert_eq!(english_soundex("ROBERT"), english_soundex("Robert"));
    }

    #[test]
    fn english_soundex_vowel_separates_same_code() {
        // Adjacent letters with the same code collapse:
        // "Abba" → A(keep) b→1 b→same,skip → A100
        assert_eq!(english_soundex("Abba"), "A100");
        // "Ababar" — the first b is coded right after the retained A → prefix "A1"
        assert_eq!(&english_soundex("Ababar")[..2], "A1");
    }

    #[test]
    fn english_soundex_adjacent_same_code_collapsed() {
        // CK → both code '2'; adjacent → only one digit
        assert_eq!(english_soundex("Jack"), "J200");
    }

    // ── Thai–English cross-language (Suwanvisat & Prasitjutrakul 1998) ──────────

    #[test]
    fn thai_english_soundex_english_numeric_codes() {
        // First character also encoded as a digit (unlike standard Soundex)
        assert_eq!(thai_english_soundex("Robert"), "671763");
        assert_eq!(thai_english_soundex("Rupert"), "671763"); // same code — same initial-sound group
    }

    #[test]
    fn thai_english_soundex_thai_direct_encoding() {
        // Thai consonants map directly to the shared table — no romanizer needed
        assert_eq!(thai_english_soundex("กน"), "25"); // ก→2 (K group), น→5 (N group)
        assert_eq!(thai_english_soundex("ร"), "6"); // ร→6 (R group)
        assert_eq!(thai_english_soundex("ก"), "2"); // single consonant → single digit
    }

    #[test]
    fn thai_english_soundex_ng_two_digits() {
        // ง (ng onset) encodes as "52": N-group (5) then G/K-group (2)
        assert_eq!(thai_english_soundex("ง"), "52");
    }

    #[test]
    fn thai_english_soundex_thai_vowels_skipped_english_vowels_to_7() {
        // Thai vowel diacritics are skipped entirely
        assert_eq!(thai_english_soundex("กิน"), "25"); // ิ (U+0E34) is skipped
        // English vowels in non-first position → '7' (retained, not dropped)
        assert!(thai_english_soundex("Robert").contains('7')); // 'o' and 'e' → '7'
    }

    #[test]
    fn thai_english_soundex_cross_lang_prefix_match() {
        // McDonald and แมคโดนัลด์ share the same 3-char prefix "523"
        let en = thai_english_soundex("McDonald");
        let th = thai_english_soundex("แมคโดนัลด์");
        assert!(en.len() >= 3 && th.len() >= 3, "codes too short");
        assert_eq!(&en[..3], &th[..3]);
    }

    #[test]
    fn thai_english_soundex_variable_length_and_empty() {
        assert_eq!(thai_english_soundex(""), "");
        assert_eq!(thai_english_soundex("123"), "");
        // longer words produce longer codes
        let long = thai_english_soundex("กรุงเทพมหานคร");
        assert!(long.len() > 2);
    }

    #[test]
    fn sounds_like_cross_lang_same_english() {
        assert!(sounds_like_cross_lang("Robert", "Rupert"));
    }

    #[test]
    fn sounds_like_cross_lang_same_thai_initial_group() {
        // ก and ค are both in the K/G group (→ "2"); กาน and คาน share the full code
        assert!(sounds_like_cross_lang("กาน", "คาน"));
    }

    #[test]
    fn sounds_like_cross_lang_different() {
        assert!(!sounds_like_cross_lang("Robert", "Smith"));
        assert!(!sounds_like_cross_lang("กาน", "บาน")); // ก→2 vs บ→1
    }

    #[test]
    fn sounds_like_cross_lang_empty_returns_false() {
        assert!(!sounds_like_cross_lang("", "Robert"));
        assert!(!sounds_like_cross_lang("Robert", ""));
    }
}