kham_core/
romanizer.rs

1//! RTGS romanization of segmented Thai words.
2//!
3//! [`RomanizationMap`] maps pre-segmented Thai words to their Roman (Latin)
4//! phonetic equivalents using the Royal Thai General System of Transcription
5//! (RTGS) — the Thai government standard used in road signs, passports, and
6//! official documents.
7//!
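//! Every lookup consults the hand-curated table first. Only
//! [`RomanizationMap::romanize_owned`], [`RomanizationMap::romanize_or_rule`] and
//! [`RomanizationMap::romanize_sentence`] fall back to the built-in rule engine
//! ([`romanize_word`]) for Thai words missing from the table; the other lookups
//! are table-only.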
10//!
11//! # RTGS characteristics
12//!
13//! - Consonant-by-consonant transliteration (initial vs. final position differ)
14//! - No tone marks in output
15//! - No vowel-length distinction (อิ and อี both map to `i`)
16//! - Diphthongs and vowel clusters have explicit multi-character mappings
17//!
18//! # Data format
19//!
20//! Tab-separated text file, one entry per line:
21//!
22//! ```text
23//! # Thai word<TAB>RTGS romanization
24//! กิน<TAB>kin
25//! ข้าว<TAB>khao
26//! ปลา<TAB>pla
27//! ```
28//!
29//! Lines beginning with `#` and blank lines are ignored.
30//! Duplicate keys: last entry wins (allows override files).
31//!
32//! # Example
33//!
34//! ```rust
35//! use kham_core::romanizer::RomanizationMap;
36//!
37//! let map = RomanizationMap::builtin();
38//! assert_eq!(map.romanize("กิน"), Some("kin"));
39//! assert_eq!(map.romanize_or_raw("ข้าว"), "khao");
40//! assert_eq!(map.romanize_or_raw("xyz"), "xyz");
41//!
42//! let tokens = vec!["กิน", "ข้าว", "ปลา"];
43//! assert_eq!(map.romanize_tokens(&tokens), vec!["kin", "khao", "pla"]);
44//! ```
45
46use alloc::collections::BTreeMap;
47use alloc::string::String;
48use alloc::vec::Vec;
49
50use crate::segmenter::Tokenizer;
51use crate::token::TokenKind;
52
/// Built-in RTGS table as raw TSV text.
///
/// `include_str!` embeds the file contents at compile time; the path is
/// resolved relative to this source file. Parsed by
/// [`RomanizationMap::builtin`].
static BUILTIN_ROMANIZATION: &str = include_str!("../data/romanization_th.tsv");
54
/// A Thai-word → RTGS-romanization lookup table.
///
/// Built from tab-separated data via [`RomanizationMap::from_tsv`].
/// Lookup is O(log n) via [`BTreeMap`].
///
/// The derived [`Default`] value is an empty table.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RomanizationMap(BTreeMap<String, String>);
60
61impl RomanizationMap {
62    /// Load the built-in RTGS romanization table.
63    pub fn builtin() -> Self {
64        Self::from_tsv(BUILTIN_ROMANIZATION)
65    }
66
67    /// Parse a tab-separated romanization table.
68    ///
69    /// Format: `thai_word\trtgs_romanization` — one entry per line.
70    /// Lines beginning with `#` and blank lines are skipped.
71    /// For duplicate keys, the last entry wins.
72    pub fn from_tsv(data: &str) -> Self {
73        let mut map: BTreeMap<String, String> = BTreeMap::new();
74        for line in data.lines() {
75            let line = line.trim();
76            if line.is_empty() || line.starts_with('#') {
77                continue;
78            }
79            let mut parts = line.splitn(2, '\t');
80            let word = match parts.next() {
81                Some(w) if !w.is_empty() => String::from(w),
82                _ => continue,
83            };
84            let roman = match parts.next() {
85                Some(r) if !r.is_empty() => String::from(r.trim()),
86                _ => continue,
87            };
88            map.insert(word, roman);
89        }
90        RomanizationMap(map)
91    }
92
93    /// Look up the RTGS romanization for a pre-segmented Thai word.
94    ///
95    /// Returns the table hit if the word is in the hand-curated list, otherwise
96    /// applies the built-in rule engine. Returns `None` only when the word
97    /// contains no Thai characters (e.g. pure Latin or numbers).
98    ///
99    /// The returned `&str` borrows from the map for table hits; rule-engine
100    /// results are returned as an owned `String` via the `romanize_owned`
101    /// helper — callers that want a borrowed `&str` should use
102    /// [`romanize_or_raw`](Self::romanize_or_raw).
103    ///
104    /// # Example
105    ///
106    /// ```rust
107    /// use kham_core::romanizer::RomanizationMap;
108    ///
109    /// let map = RomanizationMap::builtin();
110    /// // Table hit
111    /// assert_eq!(map.romanize("กิน"), Some("kin"));
112    /// // OOV word — not in table; use romanize_owned() for rule-engine fallback
113    /// assert_eq!(map.romanize("เปปซี่"), None);
114    /// // Non-Thai input
115    /// assert_eq!(map.romanize("xyz"), None);
116    /// ```
117    pub fn romanize(&self, word: &str) -> Option<&str> {
118        self.0.get(word).map(String::as_str)
119    }
120
121    /// Romanize `word` to an owned `String`, using the table first, then the
122    /// rule engine for out-of-vocabulary Thai words.
123    ///
124    /// Returns `None` only when the word contains no Thai characters.
125    ///
126    /// # Example
127    ///
128    /// ```rust
129    /// use kham_core::romanizer::RomanizationMap;
130    ///
131    /// let map = RomanizationMap::builtin();
132    /// assert_eq!(map.romanize_owned("กิน").as_deref(), Some("kin"));
133    /// // OOV word gets rule-based approximation
134    /// assert!(map.romanize_owned("เปปซี่").is_some());
135    /// // Non-Thai returns None
136    /// assert_eq!(map.romanize_owned("hello"), None);
137    /// ```
138    pub fn romanize_owned(&self, word: &str) -> Option<String> {
139        if let Some(s) = self.0.get(word) {
140            return Some(s.clone());
141        }
142        if word.chars().any(is_thai_char) {
143            Some(romanize_word(word))
144        } else {
145            None
146        }
147    }
148
149    /// Return the RTGS romanization for `word`, or `word` unchanged if not in
150    /// the table. Only performs table lookup — no rule engine.
151    ///
152    /// For OOV Thai words that should fall back to the rule engine, use
153    /// [`romanize_or_rule`](Self::romanize_or_rule) instead.
154    ///
155    /// # Example
156    ///
157    /// ```rust
158    /// use kham_core::romanizer::RomanizationMap;
159    ///
160    /// let map = RomanizationMap::from_tsv("กิน\tkin\n");
161    /// assert_eq!(map.romanize_or_raw("กิน"), "kin");
162    /// assert_eq!(map.romanize_or_raw("xyz"), "xyz");
163    /// // OOV Thai is returned unchanged (raw passthrough)
164    /// assert_eq!(map.romanize_or_raw("เปปซี่"), "เปปซี่");
165    /// ```
166    pub fn romanize_or_raw<'a>(&'a self, word: &'a str) -> &'a str {
167        self.0.get(word).map(String::as_str).unwrap_or(word)
168    }
169
170    /// Return the RTGS romanization for `word`.
171    ///
172    /// Checks the table first; for OOV Thai words the built-in rule engine is
173    /// applied. Non-Thai input is returned unchanged. Always returns an owned
174    /// `String`.
175    ///
176    /// # Example
177    ///
178    /// ```rust
179    /// use kham_core::romanizer::RomanizationMap;
180    ///
181    /// let map = RomanizationMap::builtin();
182    /// // Table hit
183    /// assert_eq!(map.romanize_or_rule("กิน"), "kin");
184    /// // Non-Thai passes through
185    /// assert_eq!(map.romanize_or_rule("hello"), "hello");
186    /// // OOV Thai gets rule-based approximation
187    /// let oov = map.romanize_or_rule("เปปซี่");
188    /// assert!(!oov.is_empty());
189    /// assert!(!oov.chars().any(|c| ('\u{0E00}'..='\u{0E7F}').contains(&c)));
190    /// ```
191    pub fn romanize_or_rule(&self, word: &str) -> String {
192        if let Some(s) = self.0.get(word) {
193            return s.clone();
194        }
195        if word.chars().any(is_thai_char) {
196            romanize_word(word)
197        } else {
198            String::from(word)
199        }
200    }
201
202    /// Romanize a slice of pre-segmented token strings.
203    ///
204    /// Returns a `Vec<String>` aligned 1:1 with the input slice. Tokens not
205    /// found in the table are returned unchanged (same behaviour as
206    /// [`romanize_or_raw`](Self::romanize_or_raw)).
207    ///
208    /// # Example
209    ///
210    /// ```rust
211    /// use kham_core::romanizer::RomanizationMap;
212    ///
213    /// let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
214    /// let out = map.romanize_tokens(&["กิน", "ปลา"]);
215    /// assert_eq!(out, vec!["kin", "pla"]);
216    /// ```
217    pub fn romanize_tokens(&self, tokens: &[&str]) -> Vec<String> {
218        tokens
219            .iter()
220            .map(|t| String::from(self.romanize_or_raw(t)))
221            .collect()
222    }
223
224    /// Segment `text` and romanize every Thai token using RTGS table-lookup with
225    /// rule-based fallback. Non-Thai tokens (Latin, numbers, punctuation,
226    /// whitespace) are passed through as-is.
227    ///
228    /// The result is a continuous string with no separator between tokens — the
229    /// original whitespace tokens (if any) are preserved as spaces.
230    ///
231    /// # Example
232    /// ```rust
233    /// use kham_core::romanizer::RomanizationMap;
234    ///
235    /// let map = RomanizationMap::builtin();
236    /// let out = map.romanize_sentence("กินข้าว");
237    /// // Should contain only ASCII / Latin characters for Thai input
238    /// assert!(!out.is_empty());
239    /// assert!(!out.chars().any(|c| ('\u{0E00}'..='\u{0E7F}').contains(&c)));
240    /// ```
241    pub fn romanize_sentence(&self, text: &str) -> String {
242        if text.is_empty() {
243            return String::new();
244        }
245        let tokenizer = Tokenizer::builder().keep_whitespace(true).build();
246        let tokens = tokenizer.segment(text);
247        let mut out = String::with_capacity(text.len() * 2);
248        for token in &tokens {
249            match token.kind {
250                TokenKind::Thai | TokenKind::Named(_) => {
251                    out.push_str(&self.romanize_or_rule(token.text));
252                }
253                _ => out.push_str(token.text),
254            }
255        }
256        out
257    }
258
259    /// Number of entries in the map.
260    #[inline]
261    pub fn len(&self) -> usize {
262        self.0.len()
263    }
264
265    /// Return `true` if the map has no entries.
266    #[inline]
267    pub fn is_empty(&self) -> bool {
268        self.0.is_empty()
269    }
270}
271
272// ---------------------------------------------------------------------------
273// Rule-based RTGS engine (fallback for OOV words)
274// ---------------------------------------------------------------------------
275
/// `true` when `c` lies inside the Unicode Thai block (U+0E00 ..= U+0E7F).
#[inline]
fn is_thai_char(c: char) -> bool {
    matches!(c, '\u{0E00}'..='\u{0E7F}')
}
280
/// RTGS initial-position consonant mapping.
///
/// Returns the Latin spelling of `c` when it opens a syllable, or `""` for
/// the silent carrier อ and for any character that is not a Thai consonant.
fn initial_rtgs(c: char) -> &'static str {
    match c {
        'ก' => "k",
        // ฃ and ฅ are obsolete letters but still valid input.
        'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "kh",
        'ง' => "ng",
        'จ' | 'ฉ' | 'ช' | 'ฌ' => "ch",
        'ซ' | 'ศ' | 'ษ' | 'ส' => "s",
        'ญ' | 'ย' => "y",
        'ฎ' | 'ด' => "d",
        'ฏ' | 'ต' => "t",
        'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => "th",
        'น' | 'ณ' => "n",
        'บ' => "b",
        'ป' => "p",
        'ผ' | 'พ' | 'ภ' => "ph",
        'ฝ' | 'ฟ' => "f",
        'ม' => "m",
        'ร' => "r",
        'ล' | 'ฬ' => "l",
        'ว' => "w",
        'ห' | 'ฮ' => "h",
        // ฤ / ฦ sit inside the consonant code-point range but are vocalic.
        // RTGS allows rue / ri / roe for ฤ depending on the word; "rue" is the
        // most common reading and beats silently dropping the letter.
        'ฤ' => "rue",
        'ฦ' => "lue",
        // อ is a silent vowel carrier in initial position.
        'อ' => "",
        _ => "",
    }
}
307
/// RTGS final-position (coda) consonant mapping.
///
/// Returns the Latin spelling of `c` when it closes a syllable. Letters that
/// have no coda value (ห, อ and anything that is not a Thai consonant) map
/// to `""`.
fn final_rtgs(c: char) -> &'static str {
    match c {
        // Velar stops all collapse to "k".
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "k",
        'ง' => "ng",
        // Palatal / dental / retroflex stops and sibilants all collapse to "t".
        'จ' | 'ช' | 'ซ' | 'ฌ' | 'ฎ' | 'ฏ' | 'ฐ' | 'ฑ' | 'ฒ' | 'ด' | 'ต' | 'ถ' | 'ท' | 'ธ'
        | 'ศ' | 'ษ' | 'ส' => "t",
        // ญ is "y" as an initial but "n" as a final (e.g. สัญญา → sanya).
        'น' | 'ณ' | 'ญ' | 'ร' | 'ล' | 'ฬ' => "n",
        'บ' | 'ป' | 'พ' | 'ภ' | 'ฝ' | 'ฟ' => "p",
        'ม' => "m",
        // Final ย / ว are the glide part of a diphthong (…ai, …ao).
        'ย' => "i",
        'ว' => "o",
        'ห' | 'อ' => "",
        _ => "",
    }
}
327
/// `true` for code points ก (U+0E01) through ฮ (U+0E2E).
///
/// The range is contiguous, so it also covers the vocalic letters ฤ and ฦ.
fn is_thai_consonant(c: char) -> bool {
    ('ก'..='ฮ').contains(&c)
}
331
/// `true` for the five vowels written BEFORE the consonant they belong to:
/// เ แ โ ใ ไ.
fn is_leading_vowel(c: char) -> bool {
    const LEADING: [char; 5] = ['เ', 'แ', 'โ', 'ใ', 'ไ'];
    LEADING.contains(&c)
}
335
/// `true` for the four tone marks U+0E48 ..= U+0E4B (mai ek, mai tho,
/// mai tri, mai chattawa).
fn is_tone_mark(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
339
/// `true` for thanthakat (U+0E4C), the mark that silences the consonant it
/// sits on.
fn is_silent_mark(c: char) -> bool {
    matches!(c, '\u{0E4C}')
}
343
344/// Apply RTGS rules to an OOV Thai word.
345///
346/// Processes the Unicode character sequence using a lightweight syllable
347/// state machine. Handles leading vowels (เ แ โ ใ ไ), above vowels
348/// (ิ ี ึ ื ั ็), below vowels (ุ ู), following vowels (า ะ ำ), tone marks
349/// (skipped), and the thanthakat silent marker (์). Unrecognised characters
350/// pass through unchanged.
351pub fn romanize_word(word: &str) -> String {
352    let chars: Vec<char> = word.chars().collect();
353    let n = chars.len();
354    let mut out = String::with_capacity(word.len());
355    let mut i = 0;
356
357    while i < n {
358        let c = chars[i];
359
360        if is_leading_vowel(c) {
361            let lead = c;
362            i += 1;
363            // Skip any stacked tone marks before the initial consonant
364            while i < n && is_tone_mark(chars[i]) {
365                i += 1;
366            }
367            if i < n && is_thai_consonant(chars[i]) {
368                let init = initial_rtgs(chars[i]);
369                i += 1;
370                // Skip tone marks and above/below vowels that follow the initial
371                while i < n
372                    && (is_tone_mark(chars[i])
373                        || matches!(
374                            chars[i],
375                            'ิ' | 'ี' | 'ึ' | 'ื' | 'ั' | '็' | 'ุ' | 'ู' | '\u{0E4D}' | '\u{0E3A}'
376                        ))
377                {
378                    i += 1;
379                }
380                // Detect compound patterns: เ_อ → oe, เ_า → ao, เ_็ already consumed above
381                let suffix = if lead == 'เ' && i < n && chars[i] == 'อ' {
382                    i += 1;
383                    "oe"
384                } else if lead == 'เ' && i < n && chars[i] == 'า' {
385                    i += 1;
386                    "ao" // เ_า pattern
387                } else {
388                    match lead {
389                        'เ' => "e",
390                        'แ' => "ae",
391                        'โ' => "o",
392                        'ใ' | 'ไ' => "ai",
393                        _ => "",
394                    }
395                };
396                out.push_str(init);
397                out.push_str(suffix);
398                // Final consonant
399                if i < n && is_thai_consonant(chars[i]) && !is_silent_mark(chars[i]) {
400                    // Check for thanthakat on next+1
401                    let fin_c = chars[i];
402                    i += 1;
403                    let silent = i < n && is_silent_mark(chars[i]);
404                    if silent {
405                        i += 1; // consume ์
406                    } else {
407                        out.push_str(final_rtgs(fin_c));
408                    }
409                }
410            } else {
411                // Lone leading vowel — just emit vowel sound
412                out.push_str(match lead {
413                    'เ' => "e",
414                    'แ' => "ae",
415                    'โ' => "o",
416                    'ใ' | 'ไ' => "ai",
417                    _ => "",
418                });
419            }
420        } else if is_thai_consonant(c) {
421            let init = initial_rtgs(c);
422            i += 1;
423
424            // Collect vowel diacritics and tone marks
425            let mut vowel = "";
426            let mut pending_silent = false;
427            while i < n {
428                match chars[i] {
429                    // Tone marks — skip
430                    ch if is_tone_mark(ch) => i += 1,
431                    // Thanthakat — this consonant is silent
432                    ch if is_silent_mark(ch) => {
433                        pending_silent = true;
434                        i += 1;
435                        break;
436                    }
437                    // Above vowels
438                    'ิ' | '็' => {
439                        vowel = "i";
440                        i += 1;
441                    }
442                    'ี' => {
443                        vowel = "i";
444                        i += 1;
445                    }
446                    'ึ' => {
447                        vowel = "ue";
448                        i += 1;
449                    }
450                    'ื' => {
451                        vowel = "ue";
452                        i += 1;
453                    }
454                    'ั' => {
455                        vowel = "a";
456                        i += 1;
457                    }
458                    // Below vowels
459                    'ุ' => {
460                        vowel = "u";
461                        i += 1;
462                    }
463                    'ู' => {
464                        vowel = "u";
465                        i += 1;
466                    }
467                    // Following vowels
468                    'า' => {
469                        vowel = "a";
470                        i += 1;
471                    }
472                    'ะ' => {
473                        vowel = "a";
474                        i += 1;
475                    }
476                    'ำ' => {
477                        vowel = "am";
478                        i += 1;
479                        break;
480                    } // am absorbs final
481                    // Nikhahit / phinthu — skip
482                    '\u{0E4D}' | '\u{0E3A}' => i += 1,
483                    _ => break,
484                }
485            }
486
487            if pending_silent {
488                // Consonant is silent (e.g. ห์ in loan words) — emit nothing
489                continue;
490            }
491
492            out.push_str(init);
493            out.push_str(vowel);
494
495            // ำ already encodes the final nasal — skip coda search
496            if vowel == "am" {
497                continue;
498            }
499
500            // Final consonant: next non-tone-mark consonant followed by end-of-word
501            // or another leading vowel / vowel diacritic
502            if i < n && is_thai_consonant(chars[i]) {
503                let fin_c = chars[i];
504                // Peek: if fin_c is followed by ์ it's silent
505                let next_is_silent = i + 1 < n && is_silent_mark(chars[i + 1]);
506                // If fin_c is followed by a vowel diacritic or leading vowel, it's
507                // an initial of the next syllable — don't consume as final
508                let next_is_vowel = i + 1 < n
509                    && (is_leading_vowel(chars[i + 1])
510                        || matches!(
511                            chars[i + 1],
512                            'ิ' | 'ี'
513                                | 'ึ'
514                                | 'ื'
515                                | 'ั'
516                                | '็'
517                                | 'ุ'
518                                | 'ู'
519                                | 'า'
520                                | 'ะ'
521                                | 'ำ'
522                        ));
523                if next_is_silent {
524                    i += 2; // consume consonant + ์
525                } else if next_is_vowel {
526                    // next char is an initial of a following syllable — leave it
527                } else {
528                    out.push_str(final_rtgs(fin_c));
529                    i += 1;
530                }
531            }
532        } else if is_tone_mark(c) || is_silent_mark(c) || matches!(c, '\u{0E4D}' | '\u{0E3A}') {
533            i += 1; // stray diacritic — skip
534        } else {
535            // Non-Thai character: pass through
536            out.push(c);
537            i += 1;
538        }
539    }
540
541    out
542}
543
544// ---------------------------------------------------------------------------
545// Tests
546// ---------------------------------------------------------------------------
547
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    // ── table lookups ─────────────────────────────────────────────────────────

    #[test]
    fn builtin_common_words() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize("กิน"), Some("kin"));
        assert_eq!(map.romanize("ข้าว"), Some("khao"));
        assert_eq!(map.romanize("น้ำ"), Some("nam"));
        assert_eq!(map.romanize("ปลา"), Some("pla"));
    }

    #[test]
    fn unknown_word_returns_none_for_non_thai() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize("hello"), None);
        assert_eq!(map.romanize("123"), None);
    }

    #[test]
    fn romanize_or_raw_hit() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize_or_raw("กิน"), "kin");
    }

    #[test]
    fn romanize_or_raw_non_thai_passthrough() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize_or_raw("xyz"), "xyz");
    }

    #[test]
    fn romanize_or_rule_oov_thai_non_empty() {
        let map = RomanizationMap::builtin();
        // OOV Thai words should get rule-based romanization, not empty string
        let result = map.romanize_or_rule("เปปซี่");
        assert!(
            !result.is_empty(),
            "rule engine should produce non-empty output"
        );
        assert!(
            !result.chars().any(is_thai_char),
            "output should be Latin, not Thai"
        );
    }

    // ── rule engine unit tests ────────────────────────────────────────────────

    #[test]
    fn rule_simple_consonant_vowel_final() {
        // กิน = ก(k) + ิ(i) + น(n) → "kin"
        assert_eq!(romanize_word("กิน"), "kin");
    }

    #[test]
    fn rule_leading_vowel_ae() {
        // แก้ว = แ(ae) + ก(k) + ้(tone, dropped) + ว(final → o) → "kaeo"
        let r = romanize_word("แก้ว");
        assert_eq!(r, "kaeo");
    }

    #[test]
    fn rule_leading_vowel_o() {
        // โต = โ + ต → "to"
        assert_eq!(romanize_word("โต"), "to");
    }

    #[test]
    fn rule_leading_vowel_ai() {
        // ไป = ไ(ai) + ป(initial p), no final → "pai"
        assert_eq!(romanize_word("ไป"), "pai");
    }

    #[test]
    fn rule_sara_am() {
        // ทำ = ท + ำ → "tham"
        assert_eq!(romanize_word("ทำ"), "tham");
    }

    #[test]
    fn rule_below_vowel_u() {
        // ดุ = ด + ุ → "du"
        assert_eq!(romanize_word("ดุ"), "du");
    }

    #[test]
    fn rule_non_thai_passthrough() {
        assert_eq!(romanize_word("hello"), "hello");
    }

    #[test]
    fn rule_empty_string() {
        assert_eq!(romanize_word(""), "");
    }

    #[test]
    fn romanize_or_rule_table_takes_priority() {
        let map = RomanizationMap::builtin();
        // Table has hand-curated "กิน" → "kin"
        assert_eq!(map.romanize_or_rule("กิน"), "kin");
    }

    #[test]
    fn romanize_or_rule_non_thai_passthrough() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize_or_rule("hello"), "hello");
    }

    // ── TSV parsing ───────────────────────────────────────────────────────────

    #[test]
    fn from_tsv_last_duplicate_wins() {
        let map = RomanizationMap::from_tsv("กิน\tkin\nกิน\tgin\n");
        assert_eq!(map.romanize("กิน"), Some("gin"));
    }

    #[test]
    fn romanize_tokens_aligned() {
        let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
        let out = map.romanize_tokens(&["กิน", "ปลา"]);
        assert_eq!(out, vec!["kin", "pla"]);
    }

    #[test]
    fn romanize_tokens_unknown_passthrough() {
        let map = RomanizationMap::from_tsv("กิน\tkin\n");
        let out = map.romanize_tokens(&["กิน", "xyz"]);
        assert_eq!(out, vec!["kin", "xyz"]);
    }

    #[test]
    fn comment_and_blank_lines_skipped() {
        let map = RomanizationMap::from_tsv("# comment\n\nกิน\tkin\n");
        assert_eq!(map.len(), 1);
        assert_eq!(map.romanize("กิน"), Some("kin"));
    }

    #[test]
    fn line_without_tab_skipped() {
        let map = RomanizationMap::from_tsv("กิน\n");
        assert!(map.is_empty());
    }

    #[test]
    fn whitespace_trimmed_from_romanization() {
        let map = RomanizationMap::from_tsv("กิน\t kin \n");
        assert_eq!(map.romanize("กิน"), Some("kin"));
    }

    #[test]
    fn empty_input_produces_empty_map() {
        assert!(RomanizationMap::from_tsv("").is_empty());
    }

    #[test]
    fn romanize_tokens_empty_slice() {
        let map = RomanizationMap::builtin();
        assert!(map.romanize_tokens(&[]).is_empty());
    }

    // ── romanize_sentence ─────────────────────────────────────────────────────

    #[test]
    fn romanize_sentence_thai_only() {
        let map = RomanizationMap::builtin();
        let out = map.romanize_sentence("กินข้าว");
        assert!(!out.is_empty(), "output should not be empty");
        assert!(
            !out.chars().any(|c| ('\u{0E00}'..='\u{0E7F}').contains(&c)),
            "output should contain no Thai characters; got: {out:?}"
        );
    }

    #[test]
    fn romanize_sentence_mixed() {
        let map = RomanizationMap::builtin();
        let out = map.romanize_sentence("กิน100บาท");
        assert!(
            out.contains("100"),
            "output should preserve '100'; got: {out:?}"
        );
        // "บาท" should be romanized — no Thai chars in the output
        assert!(
            !out.chars().any(|c| ('\u{0E00}'..='\u{0E7F}').contains(&c)),
            "output should contain no Thai characters; got: {out:?}"
        );
    }
}
kham_core/romanizer.rs

kham_core/
romanizer.rs