kham_core/romanizer.rs
1//! RTGS romanization of segmented Thai words.
2//!
3//! [`RomanizationMap`] maps pre-segmented Thai words to their Roman (Latin)
4//! phonetic equivalents using the Royal Thai General System of Transcription
5//! (RTGS) — the Thai government standard used in road signs, passports, and
6//! official documents.
7//!
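//! Table-only lookups ([`RomanizationMap::romanize`],
//! [`RomanizationMap::romanize_or_raw`], [`RomanizationMap::romanize_tokens`])
//! consult just the hand-curated table. [`RomanizationMap::romanize_owned`] and
//! [`RomanizationMap::romanize_or_rule`] check the table first and fall back to
//! the built-in rule engine ([`romanize_word`]) for Thai words that are not in it.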
10//!
11//! # RTGS characteristics
12//!
13//! - Consonant-by-consonant transliteration (initial vs. final position differ)
14//! - No tone marks in output
15//! - No vowel-length distinction (อิ and อี both map to `i`)
16//! - Diphthongs and vowel clusters have explicit multi-character mappings
17//!
18//! # Data format
19//!
20//! Tab-separated text file, one entry per line:
21//!
22//! ```text
23//! # Thai word<TAB>RTGS romanization
24//! กิน<TAB>kin
25//! ข้าว<TAB>khao
26//! ปลา<TAB>pla
27//! ```
28//!
29//! Lines beginning with `#` and blank lines are ignored.
30//! Duplicate keys: last entry wins (allows override files).
31//!
32//! # Example
33//!
34//! ```rust
35//! use kham_core::romanizer::RomanizationMap;
36//!
37//! let map = RomanizationMap::builtin();
38//! assert_eq!(map.romanize("กิน"), Some("kin"));
39//! assert_eq!(map.romanize_or_raw("ข้าว"), "khao");
40//! assert_eq!(map.romanize_or_raw("xyz"), "xyz");
41//!
42//! let tokens = vec!["กิน", "ข้าว", "ปลา"];
43//! assert_eq!(map.romanize_tokens(&tokens), vec!["kin", "khao", "pla"]);
44//! ```
45
46use alloc::collections::BTreeMap;
47use alloc::string::String;
48use alloc::vec::Vec;
49
50static BUILTIN_ROMANIZATION: &str = include_str!("../data/romanization_th.tsv");
51
/// A Thai-word → RTGS-romanization lookup table.
///
/// Built from tab-separated data via [`RomanizationMap::from_tsv`].
/// Lookup is O(log n) via [`BTreeMap`].
///
/// The type derives the common standard traits so it can be logged
/// (`Debug`), duplicated (`Clone`), compared (`PartialEq`/`Eq`), and created
/// empty (`Default`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RomanizationMap(BTreeMap<String, String>);
57
58impl RomanizationMap {
59 /// Load the built-in RTGS romanization table.
60 pub fn builtin() -> Self {
61 Self::from_tsv(BUILTIN_ROMANIZATION)
62 }
63
64 /// Parse a tab-separated romanization table.
65 ///
66 /// Format: `thai_word\trtgs_romanization` — one entry per line.
67 /// Lines beginning with `#` and blank lines are skipped.
68 /// For duplicate keys, the last entry wins.
69 pub fn from_tsv(data: &str) -> Self {
70 let mut map: BTreeMap<String, String> = BTreeMap::new();
71 for line in data.lines() {
72 let line = line.trim();
73 if line.is_empty() || line.starts_with('#') {
74 continue;
75 }
76 let mut parts = line.splitn(2, '\t');
77 let word = match parts.next() {
78 Some(w) if !w.is_empty() => String::from(w),
79 _ => continue,
80 };
81 let roman = match parts.next() {
82 Some(r) if !r.is_empty() => String::from(r.trim()),
83 _ => continue,
84 };
85 map.insert(word, roman);
86 }
87 RomanizationMap(map)
88 }
89
90 /// Look up the RTGS romanization for a pre-segmented Thai word.
91 ///
92 /// Returns the table hit if the word is in the hand-curated list, otherwise
93 /// applies the built-in rule engine. Returns `None` only when the word
94 /// contains no Thai characters (e.g. pure Latin or numbers).
95 ///
96 /// The returned `&str` borrows from the map for table hits; rule-engine
97 /// results are returned as an owned `String` via the `romanize_owned`
98 /// helper — callers that want a borrowed `&str` should use
99 /// [`romanize_or_raw`](Self::romanize_or_raw).
100 ///
101 /// # Example
102 ///
103 /// ```rust
104 /// use kham_core::romanizer::RomanizationMap;
105 ///
106 /// let map = RomanizationMap::builtin();
107 /// // Table hit
108 /// assert_eq!(map.romanize("กิน"), Some("kin"));
109 /// // OOV word — not in table; use romanize_owned() for rule-engine fallback
110 /// assert_eq!(map.romanize("เปปซี่"), None);
111 /// // Non-Thai input
112 /// assert_eq!(map.romanize("xyz"), None);
113 /// ```
114 pub fn romanize(&self, word: &str) -> Option<&str> {
115 self.0.get(word).map(String::as_str)
116 }
117
118 /// Romanize `word` to an owned `String`, using the table first, then the
119 /// rule engine for out-of-vocabulary Thai words.
120 ///
121 /// Returns `None` only when the word contains no Thai characters.
122 ///
123 /// # Example
124 ///
125 /// ```rust
126 /// use kham_core::romanizer::RomanizationMap;
127 ///
128 /// let map = RomanizationMap::builtin();
129 /// assert_eq!(map.romanize_owned("กิน").as_deref(), Some("kin"));
130 /// // OOV word gets rule-based approximation
131 /// assert!(map.romanize_owned("เปปซี่").is_some());
132 /// // Non-Thai returns None
133 /// assert_eq!(map.romanize_owned("hello"), None);
134 /// ```
135 pub fn romanize_owned(&self, word: &str) -> Option<String> {
136 if let Some(s) = self.0.get(word) {
137 return Some(s.clone());
138 }
139 if word.chars().any(is_thai_char) {
140 Some(romanize_word(word))
141 } else {
142 None
143 }
144 }
145
146 /// Return the RTGS romanization for `word`, or `word` unchanged if not in
147 /// the table. Only performs table lookup — no rule engine.
148 ///
149 /// For OOV Thai words that should fall back to the rule engine, use
150 /// [`romanize_or_rule`](Self::romanize_or_rule) instead.
151 ///
152 /// # Example
153 ///
154 /// ```rust
155 /// use kham_core::romanizer::RomanizationMap;
156 ///
157 /// let map = RomanizationMap::from_tsv("กิน\tkin\n");
158 /// assert_eq!(map.romanize_or_raw("กิน"), "kin");
159 /// assert_eq!(map.romanize_or_raw("xyz"), "xyz");
160 /// // OOV Thai is returned unchanged (raw passthrough)
161 /// assert_eq!(map.romanize_or_raw("เปปซี่"), "เปปซี่");
162 /// ```
163 pub fn romanize_or_raw<'a>(&'a self, word: &'a str) -> &'a str {
164 self.0.get(word).map(String::as_str).unwrap_or(word)
165 }
166
167 /// Return the RTGS romanization for `word`.
168 ///
169 /// Checks the table first; for OOV Thai words the built-in rule engine is
170 /// applied. Non-Thai input is returned unchanged. Always returns an owned
171 /// `String`.
172 ///
173 /// # Example
174 ///
175 /// ```rust
176 /// use kham_core::romanizer::RomanizationMap;
177 ///
178 /// let map = RomanizationMap::builtin();
179 /// // Table hit
180 /// assert_eq!(map.romanize_or_rule("กิน"), "kin");
181 /// // Non-Thai passes through
182 /// assert_eq!(map.romanize_or_rule("hello"), "hello");
183 /// // OOV Thai gets rule-based approximation
184 /// let oov = map.romanize_or_rule("เปปซี่");
185 /// assert!(!oov.is_empty());
186 /// assert!(!oov.chars().any(|c| ('\u{0E00}'..='\u{0E7F}').contains(&c)));
187 /// ```
188 pub fn romanize_or_rule(&self, word: &str) -> String {
189 if let Some(s) = self.0.get(word) {
190 return s.clone();
191 }
192 if word.chars().any(is_thai_char) {
193 romanize_word(word)
194 } else {
195 String::from(word)
196 }
197 }
198
199 /// Romanize a slice of pre-segmented token strings.
200 ///
201 /// Returns a `Vec<String>` aligned 1:1 with the input slice. Tokens not
202 /// found in the table are returned unchanged (same behaviour as
203 /// [`romanize_or_raw`](Self::romanize_or_raw)).
204 ///
205 /// # Example
206 ///
207 /// ```rust
208 /// use kham_core::romanizer::RomanizationMap;
209 ///
210 /// let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
211 /// let out = map.romanize_tokens(&["กิน", "ปลา"]);
212 /// assert_eq!(out, vec!["kin", "pla"]);
213 /// ```
214 pub fn romanize_tokens(&self, tokens: &[&str]) -> Vec<String> {
215 tokens
216 .iter()
217 .map(|t| String::from(self.romanize_or_raw(t)))
218 .collect()
219 }
220
221 /// Number of entries in the map.
222 #[inline]
223 pub fn len(&self) -> usize {
224 self.0.len()
225 }
226
227 /// Return `true` if the map has no entries.
228 #[inline]
229 pub fn is_empty(&self) -> bool {
230 self.0.is_empty()
231 }
232}
233
234// ---------------------------------------------------------------------------
235// Rule-based RTGS engine (fallback for OOV words)
236// ---------------------------------------------------------------------------
237
/// Returns `true` when `c` lies in the Unicode Thai block (U+0E00–U+0E7F).
#[inline]
fn is_thai_char(c: char) -> bool {
    matches!(c, '\u{0E00}'..='\u{0E7F}')
}
242
/// RTGS initial-position (onset) consonant mapping.
///
/// Returns the Latin spelling used when `c` starts a syllable. `อ` is a
/// silent vowel carrier in onset position and maps to the empty string, as
/// does any character that is not a Thai consonant.
///
/// The obsolete letter `ฃ` is included (`kh`, like `ข`) so that it does not
/// silently vanish from the output.
fn initial_rtgs(c: char) -> &'static str {
    match c {
        'ก' => "k",
        'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "kh",
        'ง' => "ng",
        'จ' | 'ฉ' | 'ช' | 'ฌ' => "ch",
        'ซ' | 'ศ' | 'ษ' | 'ส' => "s",
        'ญ' | 'ย' => "y",
        'ฎ' | 'ด' => "d",
        'ฏ' | 'ต' => "t",
        'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => "th",
        'น' | 'ณ' => "n",
        'บ' => "b",
        'ป' => "p",
        'ผ' | 'พ' | 'ภ' => "ph",
        'ฝ' | 'ฟ' => "f",
        'ม' => "m",
        'ร' => "r",
        'ล' | 'ฬ' => "l",
        'ว' => "w",
        'ห' | 'ฮ' => "h",
        // 'อ' (silent carrier) and every non-consonant contribute nothing.
        _ => "",
    }
}
269
/// RTGS final-position (coda) consonant mapping.
///
/// Returns the Latin spelling used when `c` closes a syllable. Thai codas
/// collapse to a small set of sounds, so many letters share one output:
/// stops become `k`/`t`/`p`, and `ญ`, `ร`, `ล`, `ฬ` all become `n`.
/// `ย` and `ว` are spelled as the glide vowels `i` and `o`.
///
/// Characters with no coda value here (`ห`, `อ`, and anything that is not
/// listed) map to the empty string.
fn final_rtgs(c: char) -> &'static str {
    match c {
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "k",
        'ง' => "ng",
        'จ' | 'ช' | 'ซ' | 'ฌ' | 'ฎ' | 'ด' | 'ฏ' | 'ต' | 'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ'
        | 'ศ' | 'ษ' | 'ส' => "t",
        // ญ is `y` as an onset but `n` as a coda (e.g. บุญ = "bun").
        'น' | 'ณ' | 'ญ' | 'ร' | 'ล' | 'ฬ' => "n",
        'บ' | 'ป' | 'พ' | 'ภ' | 'ฝ' | 'ฟ' => "p",
        'ม' => "m",
        'ย' => "i",
        'ว' => "o",
        // 'ห', 'อ' and every unlisted character are silent in coda position.
        _ => "",
    }
}
289
/// Returns `true` for the Thai consonant letters ก (U+0E01) through ฮ (U+0E2E).
fn is_thai_consonant(c: char) -> bool {
    ('\u{0E01}'..='\u{0E2E}').contains(&c)
}
293
/// Returns `true` for the five vowels written before their consonant:
/// เ แ โ ใ ไ (U+0E40–U+0E44).
fn is_leading_vowel(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}
297
/// Returns `true` for the four Thai tone marks ่ ้ ๊ ๋ (U+0E48–U+0E4B).
fn is_tone_mark(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
301
/// Returns `true` for thanthakat (์, U+0E4C), the mark that silences the
/// consonant it sits on.
fn is_silent_mark(c: char) -> bool {
    matches!(c, '\u{0E4C}')
}
305
306/// Apply RTGS rules to an OOV Thai word.
307///
308/// Processes the Unicode character sequence using a lightweight syllable
309/// state machine. Handles leading vowels (เ แ โ ใ ไ), above vowels
310/// (ิ ี ึ ื ั ็), below vowels (ุ ู), following vowels (า ะ ำ), tone marks
311/// (skipped), and the thanthakat silent marker (์). Unrecognised characters
312/// pass through unchanged.
313pub fn romanize_word(word: &str) -> String {
314 let chars: Vec<char> = word.chars().collect();
315 let n = chars.len();
316 let mut out = String::with_capacity(word.len());
317 let mut i = 0;
318
319 while i < n {
320 let c = chars[i];
321
322 if is_leading_vowel(c) {
323 let lead = c;
324 i += 1;
325 // Skip any stacked tone marks before the initial consonant
326 while i < n && is_tone_mark(chars[i]) {
327 i += 1;
328 }
329 if i < n && is_thai_consonant(chars[i]) {
330 let init = initial_rtgs(chars[i]);
331 i += 1;
332 // Skip tone marks and above/below vowels that follow the initial
333 while i < n
334 && (is_tone_mark(chars[i])
335 || matches!(
336 chars[i],
337 'ิ' | 'ี' | 'ึ' | 'ื' | 'ั' | '็' | 'ุ' | 'ู' | '\u{0E4D}' | '\u{0E3A}'
338 ))
339 {
340 i += 1;
341 }
342 // Detect compound patterns: เ_อ → oe, เ_า → ao, เ_็ already consumed above
343 let suffix = if lead == 'เ' && i < n && chars[i] == 'อ' {
344 i += 1;
345 "oe"
346 } else if lead == 'เ' && i < n && chars[i] == 'า' {
347 i += 1;
348 "ao" // เ_า pattern
349 } else {
350 match lead {
351 'เ' => "e",
352 'แ' => "ae",
353 'โ' => "o",
354 'ใ' | 'ไ' => "ai",
355 _ => "",
356 }
357 };
358 out.push_str(init);
359 out.push_str(suffix);
360 // Final consonant
361 if i < n && is_thai_consonant(chars[i]) && !is_silent_mark(chars[i]) {
362 // Check for thanthakat on next+1
363 let fin_c = chars[i];
364 i += 1;
365 let silent = i < n && is_silent_mark(chars[i]);
366 if silent {
367 i += 1; // consume ์
368 } else {
369 out.push_str(final_rtgs(fin_c));
370 }
371 }
372 } else {
373 // Lone leading vowel — just emit vowel sound
374 out.push_str(match lead {
375 'เ' => "e",
376 'แ' => "ae",
377 'โ' => "o",
378 'ใ' | 'ไ' => "ai",
379 _ => "",
380 });
381 }
382 } else if is_thai_consonant(c) {
383 let init = initial_rtgs(c);
384 i += 1;
385
386 // Collect vowel diacritics and tone marks
387 let mut vowel = "";
388 let mut pending_silent = false;
389 while i < n {
390 match chars[i] {
391 // Tone marks — skip
392 ch if is_tone_mark(ch) => i += 1,
393 // Thanthakat — this consonant is silent
394 ch if is_silent_mark(ch) => {
395 pending_silent = true;
396 i += 1;
397 break;
398 }
399 // Above vowels
400 'ิ' | '็' => {
401 vowel = "i";
402 i += 1;
403 }
404 'ี' => {
405 vowel = "i";
406 i += 1;
407 }
408 'ึ' => {
409 vowel = "ue";
410 i += 1;
411 }
412 'ื' => {
413 vowel = "ue";
414 i += 1;
415 }
416 'ั' => {
417 vowel = "a";
418 i += 1;
419 }
420 // Below vowels
421 'ุ' => {
422 vowel = "u";
423 i += 1;
424 }
425 'ู' => {
426 vowel = "u";
427 i += 1;
428 }
429 // Following vowels
430 'า' => {
431 vowel = "a";
432 i += 1;
433 }
434 'ะ' => {
435 vowel = "a";
436 i += 1;
437 }
438 'ำ' => {
439 vowel = "am";
440 i += 1;
441 break;
442 } // am absorbs final
443 // Nikhahit / phinthu — skip
444 '\u{0E4D}' | '\u{0E3A}' => i += 1,
445 _ => break,
446 }
447 }
448
449 if pending_silent {
450 // Consonant is silent (e.g. ห์ in loan words) — emit nothing
451 continue;
452 }
453
454 out.push_str(init);
455 out.push_str(vowel);
456
457 // ำ already encodes the final nasal — skip coda search
458 if vowel == "am" {
459 continue;
460 }
461
462 // Final consonant: next non-tone-mark consonant followed by end-of-word
463 // or another leading vowel / vowel diacritic
464 if i < n && is_thai_consonant(chars[i]) {
465 let fin_c = chars[i];
466 // Peek: if fin_c is followed by ์ it's silent
467 let next_is_silent = i + 1 < n && is_silent_mark(chars[i + 1]);
468 // If fin_c is followed by a vowel diacritic or leading vowel, it's
469 // an initial of the next syllable — don't consume as final
470 let next_is_vowel = i + 1 < n
471 && (is_leading_vowel(chars[i + 1])
472 || matches!(
473 chars[i + 1],
474 'ิ' | 'ี'
475 | 'ึ'
476 | 'ื'
477 | 'ั'
478 | '็'
479 | 'ุ'
480 | 'ู'
481 | 'า'
482 | 'ะ'
483 | 'ำ'
484 ));
485 if next_is_silent {
486 i += 2; // consume consonant + ์
487 } else if next_is_vowel {
488 // next char is an initial of a following syllable — leave it
489 } else {
490 out.push_str(final_rtgs(fin_c));
491 i += 1;
492 }
493 }
494 } else if is_tone_mark(c) || is_silent_mark(c) || matches!(c, '\u{0E4D}' | '\u{0E3A}') {
495 i += 1; // stray diacritic — skip
496 } else {
497 // Non-Thai character: pass through
498 out.push(c);
499 i += 1;
500 }
501 }
502
503 out
504}
505
506// ---------------------------------------------------------------------------
507// Tests
508// ---------------------------------------------------------------------------
509
/// Unit tests for the table loader, the lookup methods, and the rule engine.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    // ── table lookups against the built-in data ──────────────────────────────

    #[test]
    fn builtin_common_words() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize("กิน"), Some("kin"));
        assert_eq!(map.romanize("ข้าว"), Some("khao"));
        assert_eq!(map.romanize("น้ำ"), Some("nam"));
        assert_eq!(map.romanize("ปลา"), Some("pla"));
    }

    #[test]
    fn unknown_word_returns_none_for_non_thai() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize("hello"), None);
        assert_eq!(map.romanize("123"), None);
    }

    #[test]
    fn romanize_or_raw_hit() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize_or_raw("กิน"), "kin");
    }

    #[test]
    fn romanize_or_raw_non_thai_passthrough() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize_or_raw("xyz"), "xyz");
    }

    #[test]
    fn romanize_or_rule_oov_thai_non_empty() {
        let map = RomanizationMap::builtin();
        // OOV Thai words should get rule-based romanization, not empty string
        let result = map.romanize_or_rule("เปปซี่");
        assert!(
            !result.is_empty(),
            "rule engine should produce non-empty output"
        );
        assert!(
            !result.chars().any(is_thai_char),
            "output should be Latin, not Thai"
        );
    }

    // ── rule engine unit tests ────────────────────────────────────────────────

    #[test]
    fn rule_simple_consonant_vowel_final() {
        // กิน = ก (k) + ิ (i) + น (final n) → "kin"
        assert_eq!(romanize_word("กิน"), "kin");
    }

    #[test]
    fn rule_leading_vowel_ae() {
        // แก้ว = แ (ae) + ก (k) + ้ (tone mark, dropped) + ว (final → o) → "kaeo"
        let r = romanize_word("แก้ว");
        assert_eq!(r, "kaeo");
    }

    #[test]
    fn rule_leading_vowel_o() {
        // โต = โ (o) + ต (initial t) → "to"
        assert_eq!(romanize_word("โต"), "to");
    }

    #[test]
    fn rule_leading_vowel_ai() {
        // ไป = ไ (ai) + ป (initial p, written after the vowel) → "pai"
        assert_eq!(romanize_word("ไป"), "pai");
    }

    #[test]
    fn rule_sara_am() {
        // ทำ = ท (th) + ำ (am) → "tham"
        assert_eq!(romanize_word("ทำ"), "tham");
    }

    #[test]
    fn rule_below_vowel_u() {
        // ดุ = ด (d) + ุ (u) → "du"
        assert_eq!(romanize_word("ดุ"), "du");
    }

    #[test]
    fn rule_non_thai_passthrough() {
        assert_eq!(romanize_word("hello"), "hello");
    }

    #[test]
    fn rule_empty_string() {
        assert_eq!(romanize_word(""), "");
    }

    // ── table + rule engine combined ─────────────────────────────────────────

    #[test]
    fn romanize_or_rule_table_takes_priority() {
        let map = RomanizationMap::builtin();
        // Table has hand-curated "กิน" → "kin"
        assert_eq!(map.romanize_or_rule("กิน"), "kin");
    }

    #[test]
    fn romanize_or_rule_non_thai_passthrough() {
        let map = RomanizationMap::builtin();
        assert_eq!(map.romanize_or_rule("hello"), "hello");
    }

    // ── TSV parsing ──────────────────────────────────────────────────────────

    #[test]
    fn from_tsv_last_duplicate_wins() {
        let map = RomanizationMap::from_tsv("กิน\tkin\nกิน\tgin\n");
        assert_eq!(map.romanize("กิน"), Some("gin"));
    }

    #[test]
    fn romanize_tokens_aligned() {
        let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
        let out = map.romanize_tokens(&["กิน", "ปลา"]);
        assert_eq!(out, vec!["kin", "pla"]);
    }

    #[test]
    fn romanize_tokens_unknown_passthrough() {
        let map = RomanizationMap::from_tsv("กิน\tkin\n");
        let out = map.romanize_tokens(&["กิน", "xyz"]);
        assert_eq!(out, vec!["kin", "xyz"]);
    }

    #[test]
    fn comment_and_blank_lines_skipped() {
        let map = RomanizationMap::from_tsv("# comment\n\nกิน\tkin\n");
        assert_eq!(map.len(), 1);
        assert_eq!(map.romanize("กิน"), Some("kin"));
    }

    #[test]
    fn line_without_tab_skipped() {
        let map = RomanizationMap::from_tsv("กิน\n");
        assert!(map.is_empty());
    }

    #[test]
    fn whitespace_trimmed_from_romanization() {
        let map = RomanizationMap::from_tsv("กิน\t kin \n");
        assert_eq!(map.romanize("กิน"), Some("kin"));
    }

    #[test]
    fn empty_input_produces_empty_map() {
        assert!(RomanizationMap::from_tsv("").is_empty());
    }

    #[test]
    fn romanize_tokens_empty_slice() {
        let map = RomanizationMap::builtin();
        assert!(map.romanize_tokens(&[]).is_empty());
    }
}