kham_core/
romanizer.rs

1//! RTGS romanization of segmented Thai words.
2//!
3//! [`RomanizationMap`] maps pre-segmented Thai words to their Roman (Latin)
4//! phonetic equivalents using the Royal Thai General System of Transcription
5//! (RTGS) — the Thai government standard used in road signs, passports, and
6//! official documents.
7//!
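//! [`RomanizationMap::romanize_owned`] and [`RomanizationMap::romanize_or_rule`]
//! check the hand-curated table first and fall back to the built-in rule engine
//! ([`romanize_word`]) for words that are not in the table. The remaining
//! lookup methods consult the table only.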
10//!
11//! # RTGS characteristics
12//!
13//! - Consonant-by-consonant transliteration (initial vs. final position differ)
14//! - No tone marks in output
15//! - No vowel-length distinction (อิ and อี both map to `i`)
16//! - Diphthongs and vowel clusters have explicit multi-character mappings
17//!
18//! # Data format
19//!
20//! Tab-separated text file, one entry per line:
21//!
22//! ```text
23//! # Thai word<TAB>RTGS romanization
24//! กิน<TAB>kin
25//! ข้าว<TAB>khao
26//! ปลา<TAB>pla
27//! ```
28//!
29//! Lines beginning with `#` and blank lines are ignored.
30//! Duplicate keys: last entry wins (allows override files).
31//!
32//! # Example
33//!
34//! ```rust
35//! use kham_core::romanizer::RomanizationMap;
36//!
37//! let map = RomanizationMap::builtin();
38//! assert_eq!(map.romanize("กิน"), Some("kin"));
39//! assert_eq!(map.romanize_or_raw("ข้าว"), "khao");
40//! assert_eq!(map.romanize_or_raw("xyz"), "xyz");
41//!
42//! let tokens = vec!["กิน", "ข้าว", "ปลา"];
43//! assert_eq!(map.romanize_tokens(&tokens), vec!["kin", "khao", "pla"]);
44//! ```
45
46use alloc::collections::BTreeMap;
47use alloc::string::String;
48use alloc::vec::Vec;
49
/// Raw text of `../data/romanization_th.tsv`, embedded into the binary at
/// compile time via `include_str!` and parsed by [`RomanizationMap::builtin`].
static BUILTIN_ROMANIZATION: &str = include_str!("../data/romanization_th.tsv");
51
/// A Thai-word → RTGS-romanization lookup table.
///
/// Built from tab-separated data via [`RomanizationMap::from_tsv`].
/// Lookup is O(log n) via [`BTreeMap`].
///
/// The type is `Debug`, `Clone`, comparable, and has an empty `Default`, so it
/// can be logged, stored in derived structs, and compared in tests.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RomanizationMap(BTreeMap<String, String>);
57
58impl RomanizationMap {
59    /// Load the built-in RTGS romanization table.
60    pub fn builtin() -> Self {
61        Self::from_tsv(BUILTIN_ROMANIZATION)
62    }
63
64    /// Parse a tab-separated romanization table.
65    ///
66    /// Format: `thai_word\trtgs_romanization` — one entry per line.
67    /// Lines beginning with `#` and blank lines are skipped.
68    /// For duplicate keys, the last entry wins.
69    pub fn from_tsv(data: &str) -> Self {
70        let mut map: BTreeMap<String, String> = BTreeMap::new();
71        for line in data.lines() {
72            let line = line.trim();
73            if line.is_empty() || line.starts_with('#') {
74                continue;
75            }
76            let mut parts = line.splitn(2, '\t');
77            let word = match parts.next() {
78                Some(w) if !w.is_empty() => String::from(w),
79                _ => continue,
80            };
81            let roman = match parts.next() {
82                Some(r) if !r.is_empty() => String::from(r.trim()),
83                _ => continue,
84            };
85            map.insert(word, roman);
86        }
87        RomanizationMap(map)
88    }
89
90    /// Look up the RTGS romanization for a pre-segmented Thai word.
91    ///
92    /// Returns the table hit if the word is in the hand-curated list, otherwise
93    /// applies the built-in rule engine. Returns `None` only when the word
94    /// contains no Thai characters (e.g. pure Latin or numbers).
95    ///
96    /// The returned `&str` borrows from the map for table hits; rule-engine
97    /// results are returned as an owned `String` via the `romanize_owned`
98    /// helper — callers that want a borrowed `&str` should use
99    /// [`romanize_or_raw`](Self::romanize_or_raw).
100    ///
101    /// # Example
102    ///
103    /// ```rust
104    /// use kham_core::romanizer::RomanizationMap;
105    ///
106    /// let map = RomanizationMap::builtin();
107    /// // Table hit
108    /// assert_eq!(map.romanize("กิน"), Some("kin"));
109    /// // OOV word — not in table; use romanize_owned() for rule-engine fallback
110    /// assert_eq!(map.romanize("เปปซี่"), None);
111    /// // Non-Thai input
112    /// assert_eq!(map.romanize("xyz"), None);
113    /// ```
114    pub fn romanize(&self, word: &str) -> Option<&str> {
115        self.0.get(word).map(String::as_str)
116    }
117
118    /// Romanize `word` to an owned `String`, using the table first, then the
119    /// rule engine for out-of-vocabulary Thai words.
120    ///
121    /// Returns `None` only when the word contains no Thai characters.
122    ///
123    /// # Example
124    ///
125    /// ```rust
126    /// use kham_core::romanizer::RomanizationMap;
127    ///
128    /// let map = RomanizationMap::builtin();
129    /// assert_eq!(map.romanize_owned("กิน").as_deref(), Some("kin"));
130    /// // OOV word gets rule-based approximation
131    /// assert!(map.romanize_owned("เปปซี่").is_some());
132    /// // Non-Thai returns None
133    /// assert_eq!(map.romanize_owned("hello"), None);
134    /// ```
135    pub fn romanize_owned(&self, word: &str) -> Option<String> {
136        if let Some(s) = self.0.get(word) {
137            return Some(s.clone());
138        }
139        if word.chars().any(is_thai_char) {
140            Some(romanize_word(word))
141        } else {
142            None
143        }
144    }
145
146    /// Return the RTGS romanization for `word`, or `word` unchanged if not in
147    /// the table. Only performs table lookup — no rule engine.
148    ///
149    /// For OOV Thai words that should fall back to the rule engine, use
150    /// [`romanize_or_rule`](Self::romanize_or_rule) instead.
151    ///
152    /// # Example
153    ///
154    /// ```rust
155    /// use kham_core::romanizer::RomanizationMap;
156    ///
157    /// let map = RomanizationMap::from_tsv("กิน\tkin\n");
158    /// assert_eq!(map.romanize_or_raw("กิน"), "kin");
159    /// assert_eq!(map.romanize_or_raw("xyz"), "xyz");
160    /// // OOV Thai is returned unchanged (raw passthrough)
161    /// assert_eq!(map.romanize_or_raw("เปปซี่"), "เปปซี่");
162    /// ```
163    pub fn romanize_or_raw<'a>(&'a self, word: &'a str) -> &'a str {
164        self.0.get(word).map(String::as_str).unwrap_or(word)
165    }
166
167    /// Return the RTGS romanization for `word`.
168    ///
169    /// Checks the table first; for OOV Thai words the built-in rule engine is
170    /// applied. Non-Thai input is returned unchanged. Always returns an owned
171    /// `String`.
172    ///
173    /// # Example
174    ///
175    /// ```rust
176    /// use kham_core::romanizer::RomanizationMap;
177    ///
178    /// let map = RomanizationMap::builtin();
179    /// // Table hit
180    /// assert_eq!(map.romanize_or_rule("กิน"), "kin");
181    /// // Non-Thai passes through
182    /// assert_eq!(map.romanize_or_rule("hello"), "hello");
183    /// // OOV Thai gets rule-based approximation
184    /// let oov = map.romanize_or_rule("เปปซี่");
185    /// assert!(!oov.is_empty());
186    /// assert!(!oov.chars().any(|c| ('\u{0E00}'..='\u{0E7F}').contains(&c)));
187    /// ```
188    pub fn romanize_or_rule(&self, word: &str) -> String {
189        if let Some(s) = self.0.get(word) {
190            return s.clone();
191        }
192        if word.chars().any(is_thai_char) {
193            romanize_word(word)
194        } else {
195            String::from(word)
196        }
197    }
198
199    /// Romanize a slice of pre-segmented token strings.
200    ///
201    /// Returns a `Vec<String>` aligned 1:1 with the input slice. Tokens not
202    /// found in the table are returned unchanged (same behaviour as
203    /// [`romanize_or_raw`](Self::romanize_or_raw)).
204    ///
205    /// # Example
206    ///
207    /// ```rust
208    /// use kham_core::romanizer::RomanizationMap;
209    ///
210    /// let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
211    /// let out = map.romanize_tokens(&["กิน", "ปลา"]);
212    /// assert_eq!(out, vec!["kin", "pla"]);
213    /// ```
214    pub fn romanize_tokens(&self, tokens: &[&str]) -> Vec<String> {
215        tokens
216            .iter()
217            .map(|t| String::from(self.romanize_or_raw(t)))
218            .collect()
219    }
220
221    /// Number of entries in the map.
222    #[inline]
223    pub fn len(&self) -> usize {
224        self.0.len()
225    }
226
227    /// Return `true` if the map has no entries.
228    #[inline]
229    pub fn is_empty(&self) -> bool {
230        self.0.is_empty()
231    }
232}
233
234// ---------------------------------------------------------------------------
235// Rule-based RTGS engine (fallback for OOV words)
236// ---------------------------------------------------------------------------
237
/// Return `true` when `c` lies in the Unicode Thai block (U+0E00–U+0E7F).
#[inline]
fn is_thai_char(c: char) -> bool {
    matches!(c, '\u{0E00}'..='\u{0E7F}')
}
242
/// RTGS initial-position consonant mapping.
///
/// Returns the Latin transcription of `c` when it opens a syllable. The
/// glottal carrier อ and any character that is not a mapped Thai consonant
/// yield the empty string.
fn initial_rtgs(c: char) -> &'static str {
    match c {
        'ก' => "k",
        // ฃ (kho khuat) is obsolete but still a valid letter; RTGS gives it `kh`.
        'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "kh",
        'ง' => "ng",
        'จ' | 'ฉ' | 'ช' | 'ฌ' => "ch",
        'ซ' | 'ศ' | 'ษ' | 'ส' => "s",
        'ญ' | 'ย' => "y",
        'ฎ' | 'ด' => "d",
        'ฏ' | 'ต' => "t",
        'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => "th",
        'น' | 'ณ' => "n",
        'บ' => "b",
        'ป' => "p",
        'ผ' | 'พ' | 'ภ' => "ph",
        'ฝ' | 'ฟ' => "f",
        'ม' => "m",
        'ร' => "r",
        'ล' | 'ฬ' => "l",
        'ว' => "w",
        'ห' | 'ฮ' => "h",
        // อ is a silent vowel carrier in initial position.
        'อ' => "",
        _ => "",
    }
}
269
/// RTGS final-position (coda) consonant mapping.
///
/// Returns the Latin transcription of `c` when it closes a syllable. Thai
/// neutralises most codas to one of `k`, `t`, `p`, `n`, `m`, `ng`; ย and ว act
/// as the off-glides `i` and `o`. ห, อ and unmapped characters yield the empty
/// string.
fn final_rtgs(c: char) -> &'static str {
    match c {
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "k",
        'ง' => "ng",
        // All stop/fricative codas of the dental/palatal series collapse to `t`,
        // including ฐ ฑ ฒ (e.g. รัฐ → rat).
        'จ' | 'ช' | 'ซ' | 'ฌ' | 'ฎ' | 'ด' | 'ฏ' | 'ต' | 'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ'
        | 'ศ' | 'ษ' | 'ส' => "t",
        // Final ญ is a nasal (`n`), unlike final ย which is the glide `i`.
        'น' | 'ณ' | 'ญ' => "n",
        'บ' | 'ป' | 'พ' | 'ภ' | 'ฝ' | 'ฟ' => "p",
        'ม' => "m",
        'ย' => "i",
        'ร' => "n",
        'ล' | 'ฬ' => "n",
        'ว' => "o",
        'ห' | 'อ' => "",
        _ => "",
    }
}
289
/// Return `true` when `c` falls in the contiguous range ก (U+0E01) through
/// ฮ (U+0E2E).
fn is_thai_consonant(c: char) -> bool {
    ('ก'..='ฮ').contains(&c)
}
293
/// Return `true` for the five vowels that are written before the consonant
/// they belong to: เ แ โ ใ ไ.
fn is_leading_vowel(c: char) -> bool {
    ['เ', 'แ', 'โ', 'ใ', 'ไ'].contains(&c)
}
297
/// Return `true` for the four Thai tone marks, U+0E48 (mai ek) through
/// U+0E4B (mai chattawa).
fn is_tone_mark(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
301
/// Return `true` for thanthakat (U+0E4C), the mark that silences the letter
/// it sits on.
fn is_silent_mark(c: char) -> bool {
    matches!(c, '\u{0E4C}')
}
305
306/// Apply RTGS rules to an OOV Thai word.
307///
308/// Processes the Unicode character sequence using a lightweight syllable
309/// state machine. Handles leading vowels (เ แ โ ใ ไ), above vowels
310/// (ิ ี ึ ื ั ็), below vowels (ุ ู), following vowels (า ะ ำ), tone marks
311/// (skipped), and the thanthakat silent marker (์). Unrecognised characters
312/// pass through unchanged.
313pub fn romanize_word(word: &str) -> String {
314    let chars: Vec<char> = word.chars().collect();
315    let n = chars.len();
316    let mut out = String::with_capacity(word.len());
317    let mut i = 0;
318
319    while i < n {
320        let c = chars[i];
321
322        if is_leading_vowel(c) {
323            let lead = c;
324            i += 1;
325            // Skip any stacked tone marks before the initial consonant
326            while i < n && is_tone_mark(chars[i]) {
327                i += 1;
328            }
329            if i < n && is_thai_consonant(chars[i]) {
330                let init = initial_rtgs(chars[i]);
331                i += 1;
332                // Skip tone marks and above/below vowels that follow the initial
333                while i < n
334                    && (is_tone_mark(chars[i])
335                        || matches!(
336                            chars[i],
337                            'ิ' | 'ี' | 'ึ' | 'ื' | 'ั' | '็' | 'ุ' | 'ู' | '\u{0E4D}' | '\u{0E3A}'
338                        ))
339                {
340                    i += 1;
341                }
342                // Detect compound patterns: เ_อ → oe, เ_า → ao, เ_็ already consumed above
343                let suffix = if lead == 'เ' && i < n && chars[i] == 'อ' {
344                    i += 1;
345                    "oe"
346                } else if lead == 'เ' && i < n && chars[i] == 'า' {
347                    i += 1;
348                    "ao" // เ_า pattern
349                } else {
350                    match lead {
351                        'เ' => "e",
352                        'แ' => "ae",
353                        'โ' => "o",
354                        'ใ' | 'ไ' => "ai",
355                        _ => "",
356                    }
357                };
358                out.push_str(init);
359                out.push_str(suffix);
360                // Final consonant
361                if i < n && is_thai_consonant(chars[i]) && !is_silent_mark(chars[i]) {
362                    // Check for thanthakat on next+1
363                    let fin_c = chars[i];
364                    i += 1;
365                    let silent = i < n && is_silent_mark(chars[i]);
366                    if silent {
367                        i += 1; // consume ์
368                    } else {
369                        out.push_str(final_rtgs(fin_c));
370                    }
371                }
372            } else {
373                // Lone leading vowel — just emit vowel sound
374                out.push_str(match lead {
375                    'เ' => "e",
376                    'แ' => "ae",
377                    'โ' => "o",
378                    'ใ' | 'ไ' => "ai",
379                    _ => "",
380                });
381            }
382        } else if is_thai_consonant(c) {
383            let init = initial_rtgs(c);
384            i += 1;
385
386            // Collect vowel diacritics and tone marks
387            let mut vowel = "";
388            let mut pending_silent = false;
389            while i < n {
390                match chars[i] {
391                    // Tone marks — skip
392                    ch if is_tone_mark(ch) => i += 1,
393                    // Thanthakat — this consonant is silent
394                    ch if is_silent_mark(ch) => {
395                        pending_silent = true;
396                        i += 1;
397                        break;
398                    }
399                    // Above vowels
400                    'ิ' | '็' => {
401                        vowel = "i";
402                        i += 1;
403                    }
404                    'ี' => {
405                        vowel = "i";
406                        i += 1;
407                    }
408                    'ึ' => {
409                        vowel = "ue";
410                        i += 1;
411                    }
412                    'ื' => {
413                        vowel = "ue";
414                        i += 1;
415                    }
416                    'ั' => {
417                        vowel = "a";
418                        i += 1;
419                    }
420                    // Below vowels
421                    'ุ' => {
422                        vowel = "u";
423                        i += 1;
424                    }
425                    'ู' => {
426                        vowel = "u";
427                        i += 1;
428                    }
429                    // Following vowels
430                    'า' => {
431                        vowel = "a";
432                        i += 1;
433                    }
434                    'ะ' => {
435                        vowel = "a";
436                        i += 1;
437                    }
438                    'ำ' => {
439                        vowel = "am";
440                        i += 1;
441                        break;
442                    } // am absorbs final
443                    // Nikhahit / phinthu — skip
444                    '\u{0E4D}' | '\u{0E3A}' => i += 1,
445                    _ => break,
446                }
447            }
448
449            if pending_silent {
450                // Consonant is silent (e.g. ห์ in loan words) — emit nothing
451                continue;
452            }
453
454            out.push_str(init);
455            out.push_str(vowel);
456
457            // ำ already encodes the final nasal — skip coda search
458            if vowel == "am" {
459                continue;
460            }
461
462            // Final consonant: next non-tone-mark consonant followed by end-of-word
463            // or another leading vowel / vowel diacritic
464            if i < n && is_thai_consonant(chars[i]) {
465                let fin_c = chars[i];
466                // Peek: if fin_c is followed by ์ it's silent
467                let next_is_silent = i + 1 < n && is_silent_mark(chars[i + 1]);
468                // If fin_c is followed by a vowel diacritic or leading vowel, it's
469                // an initial of the next syllable — don't consume as final
470                let next_is_vowel = i + 1 < n
471                    && (is_leading_vowel(chars[i + 1])
472                        || matches!(
473                            chars[i + 1],
474                            'ิ' | 'ี'
475                                | 'ึ'
476                                | 'ื'
477                                | 'ั'
478                                | '็'
479                                | 'ุ'
480                                | 'ู'
481                                | 'า'
482                                | 'ะ'
483                                | 'ำ'
484                        ));
485                if next_is_silent {
486                    i += 2; // consume consonant + ์
487                } else if next_is_vowel {
488                    // next char is an initial of a following syllable — leave it
489                } else {
490                    out.push_str(final_rtgs(fin_c));
491                    i += 1;
492                }
493            }
494        } else if is_tone_mark(c) || is_silent_mark(c) || matches!(c, '\u{0E4D}' | '\u{0E3A}') {
495            i += 1; // stray diacritic — skip
496        } else {
497            // Non-Thai character: pass through
498            out.push(c);
499            i += 1;
500        }
501    }
502
503    out
504}
505
506// ---------------------------------------------------------------------------
507// Tests
508// ---------------------------------------------------------------------------
509
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    // ── table lookup ─────────────────────────────────────────────────────────

    #[test]
    fn builtin_common_words() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize("กิน"), Some("kin"));
        assert_eq!(map.romanize("ข้าว"), Some("khao"));
        assert_eq!(map.romanize("น้ำ"), Some("nam"));
        assert_eq!(map.romanize("ปลา"), Some("pla"));
    }

    #[test]
    fn unknown_word_returns_none_for_non_thai() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize("hello"), None);
        assert_eq!(map.romanize("123"), None);
    }

    #[test]
    fn romanize_is_table_only_for_oov_thai() {
        // `romanize` never invokes the rule engine, even for Thai input.
        let map = RomanizationMap::from_tsv("กิน\tkin\n");
        assert_eq!(map.romanize("ทำ"), None);
    }

    #[test]
    fn romanize_owned_table_then_rules() {
        let map = RomanizationMap::from_tsv("กิน\tkin\n");
        // Table hit
        assert_eq!(map.romanize_owned("กิน").as_deref(), Some("kin"));
        // OOV Thai goes through the rule engine
        assert_eq!(map.romanize_owned("ทำ").as_deref(), Some("tham"));
        // Non-Thai has no romanization
        assert_eq!(map.romanize_owned("hello"), None);
    }

    #[test]
    fn romanize_or_raw_hit() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize_or_raw("กิน"), "kin");
    }

    #[test]
    fn romanize_or_raw_non_thai_passthrough() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize_or_raw("xyz"), "xyz");
    }

    #[test]
    fn romanize_or_raw_oov_thai_passthrough() {
        let map = RomanizationMap::from_tsv("กิน\tkin\n");
        assert_eq!(map.romanize_or_raw("ทำ"), "ทำ");
    }

    #[test]
    fn romanize_or_rule_oov_thai_non_empty() {
        let map = RomanizationMap::builtin();
        // OOV Thai words should get rule-based romanization, not empty string
        let result = map.romanize_or_rule("เปปซี่");
        assert!(
            !result.is_empty(),
            "rule engine should produce non-empty output"
        );
        assert!(
            !result.chars().any(is_thai_char),
            "output should be Latin, not Thai"
        );
    }

    #[test]
    fn romanize_or_rule_table_takes_priority() {
        let map = RomanizationMap::builtin();
        // Table has hand-curated "กิน" → "kin"
        assert_eq!(map.romanize_or_rule("กิน"), "kin");
    }

    #[test]
    fn romanize_or_rule_non_thai_passthrough() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize_or_rule("hello"), "hello");
    }

    // ── rule engine unit tests ────────────────────────────────────────────────

    #[test]
    fn rule_simple_consonant_vowel_final() {
        // กิน = ก(k) + ิ(i) + น(n) → "kin"
        assert_eq!(romanize_word("กิน"), "kin");
    }

    #[test]
    fn rule_leading_vowel_ae() {
        // แก้ว = แ(ae) + ก(k) + ้ (tone, dropped) + ว (final → o) → "kaeo"
        assert_eq!(romanize_word("แก้ว"), "kaeo");
    }

    #[test]
    fn rule_leading_vowel_o() {
        // โต = โ + ต → "to"
        assert_eq!(romanize_word("โต"), "to");
    }

    #[test]
    fn rule_leading_vowel_ai() {
        // ไป = ไ(ai) + ป(p, initial) → "pai"
        assert_eq!(romanize_word("ไป"), "pai");
    }

    #[test]
    fn rule_sara_am() {
        // ทำ = ท + ำ → "tham"
        assert_eq!(romanize_word("ทำ"), "tham");
    }

    #[test]
    fn rule_below_vowel_u() {
        // ดุ = ด + ุ → "du"
        assert_eq!(romanize_word("ดุ"), "du");
    }

    #[test]
    fn rule_non_thai_passthrough() {
        assert_eq!(romanize_word("hello"), "hello");
    }

    #[test]
    fn rule_empty_string() {
        assert_eq!(romanize_word(""), "");
    }

    // ── TSV parsing ──────────────────────────────────────────────────────────

    #[test]
    fn from_tsv_last_duplicate_wins() {
        let map = RomanizationMap::from_tsv("กิน\tkin\nกิน\tgin\n");
        assert_eq!(map.romanize("กิน"), Some("gin"));
    }

    #[test]
    fn comment_and_blank_lines_skipped() {
        let map = RomanizationMap::from_tsv("# comment\n\nกิน\tkin\n");
        assert_eq!(map.len(), 1);
        assert_eq!(map.romanize("กิน"), Some("kin"));
    }

    #[test]
    fn line_without_tab_skipped() {
        let map = RomanizationMap::from_tsv("กิน\n");
        assert!(map.is_empty());
    }

    #[test]
    fn whitespace_trimmed_from_romanization() {
        let map = RomanizationMap::from_tsv("กิน\t kin \n");
        assert_eq!(map.romanize("กิน"), Some("kin"));
    }

    #[test]
    fn empty_input_produces_empty_map() {
        assert!(RomanizationMap::from_tsv("").is_empty());
    }

    // ── token slices ─────────────────────────────────────────────────────────

    #[test]
    fn romanize_tokens_aligned() {
        let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
        let out = map.romanize_tokens(&["กิน", "ปลา"]);
        assert_eq!(out, vec!["kin", "pla"]);
    }

    #[test]
    fn romanize_tokens_unknown_passthrough() {
        let map = RomanizationMap::from_tsv("กิน\tkin\n");
        let out = map.romanize_tokens(&["กิน", "xyz"]);
        assert_eq!(out, vec!["kin", "xyz"]);
    }

    #[test]
    fn romanize_tokens_empty_slice() {
        let map = RomanizationMap::builtin();
        assert!(map.romanize_tokens(&[]).is_empty());
    }
}
kham_core/romanizer.rs

kham_core/
romanizer.rs