kham_core/romanizer.rs
1//! RTGS romanization of segmented Thai words.
2//!
3//! [`RomanizationMap`] maps pre-segmented Thai words to their Roman (Latin)
4//! phonetic equivalents using the Royal Thai General System of Transcription
5//! (RTGS) — the Thai government standard used in road signs, passports, and
6//! official documents.
7//!
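//! [`RomanizationMap::romanize`], [`RomanizationMap::romanize_or_raw`] and
//! [`RomanizationMap::romanize_tokens`] consult only the hand-curated table.
//! [`RomanizationMap::romanize_owned`], [`RomanizationMap::romanize_or_rule`]
//! and [`RomanizationMap::romanize_sentence`] check the table first and then
//! fall back to the built-in rule engine ([`romanize_word`]) for Thai words
//! that are not in the table.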
10//!
11//! # RTGS characteristics
12//!
13//! - Consonant-by-consonant transliteration (initial vs. final position differ)
14//! - No tone marks in output
15//! - No vowel-length distinction (อิ and อี both map to `i`)
16//! - Diphthongs and vowel clusters have explicit multi-character mappings
17//!
18//! # Data format
19//!
20//! Tab-separated text file, one entry per line:
21//!
22//! ```text
23//! # Thai word<TAB>RTGS romanization
24//! กิน<TAB>kin
25//! ข้าว<TAB>khao
26//! ปลา<TAB>pla
27//! ```
28//!
29//! Lines beginning with `#` and blank lines are ignored.
30//! Duplicate keys: last entry wins (allows override files).
31//!
32//! # Example
33//!
34//! ```rust
35//! use kham_core::romanizer::RomanizationMap;
36//!
37//! let map = RomanizationMap::builtin();
38//! assert_eq!(map.romanize("กิน"), Some("kin"));
39//! assert_eq!(map.romanize_or_raw("ข้าว"), "khao");
40//! assert_eq!(map.romanize_or_raw("xyz"), "xyz");
41//!
42//! let tokens = vec!["กิน", "ข้าว", "ปลา"];
43//! assert_eq!(map.romanize_tokens(&tokens), vec!["kin", "khao", "pla"]);
44//! ```
45
46use alloc::collections::BTreeMap;
47use alloc::string::String;
48use alloc::vec::Vec;
49
50use crate::segmenter::Tokenizer;
51use crate::token::TokenKind;
52
/// Built-in RTGS table in the TSV format described in the module docs.
///
/// The file contents are embedded into the binary at compile time by
/// `include_str!` and parsed on demand by [`RomanizationMap::builtin`].
static BUILTIN_ROMANIZATION: &str = include_str!("../data/romanization_th.tsv");
54
/// A Thai-word → RTGS-romanization lookup table.
///
/// Built from tab-separated data via [`RomanizationMap::from_tsv`].
/// Lookup is O(log n) via [`BTreeMap`].
///
/// The map is `Clone` (so a parsed table can be duplicated without re-parsing)
/// and `Debug` (so it can appear in diagnostics and in structs that derive
/// `Debug`).
#[derive(Debug, Clone)]
pub struct RomanizationMap(BTreeMap<String, String>);
60
61impl RomanizationMap {
62 /// Load the built-in RTGS romanization table.
63 pub fn builtin() -> Self {
64 Self::from_tsv(BUILTIN_ROMANIZATION)
65 }
66
67 /// Parse a tab-separated romanization table.
68 ///
69 /// Format: `thai_word\trtgs_romanization` — one entry per line.
70 /// Lines beginning with `#` and blank lines are skipped.
71 /// For duplicate keys, the last entry wins.
72 pub fn from_tsv(data: &str) -> Self {
73 let mut map: BTreeMap<String, String> = BTreeMap::new();
74 for line in data.lines() {
75 let line = line.trim();
76 if line.is_empty() || line.starts_with('#') {
77 continue;
78 }
79 let mut parts = line.splitn(2, '\t');
80 let word = match parts.next() {
81 Some(w) if !w.is_empty() => String::from(w),
82 _ => continue,
83 };
84 let roman = match parts.next() {
85 Some(r) if !r.is_empty() => String::from(r.trim()),
86 _ => continue,
87 };
88 map.insert(word, roman);
89 }
90 RomanizationMap(map)
91 }
92
93 /// Look up the RTGS romanization for a pre-segmented Thai word.
94 ///
95 /// Returns the table hit if the word is in the hand-curated list, otherwise
96 /// applies the built-in rule engine. Returns `None` only when the word
97 /// contains no Thai characters (e.g. pure Latin or numbers).
98 ///
99 /// The returned `&str` borrows from the map for table hits; rule-engine
100 /// results are returned as an owned `String` via the `romanize_owned`
101 /// helper — callers that want a borrowed `&str` should use
102 /// [`romanize_or_raw`](Self::romanize_or_raw).
103 ///
104 /// # Example
105 ///
106 /// ```rust
107 /// use kham_core::romanizer::RomanizationMap;
108 ///
109 /// let map = RomanizationMap::builtin();
110 /// // Table hit
111 /// assert_eq!(map.romanize("กิน"), Some("kin"));
112 /// // OOV word — not in table; use romanize_owned() for rule-engine fallback
113 /// assert_eq!(map.romanize("เปปซี่"), None);
114 /// // Non-Thai input
115 /// assert_eq!(map.romanize("xyz"), None);
116 /// ```
117 pub fn romanize(&self, word: &str) -> Option<&str> {
118 self.0.get(word).map(String::as_str)
119 }
120
121 /// Romanize `word` to an owned `String`, using the table first, then the
122 /// rule engine for out-of-vocabulary Thai words.
123 ///
124 /// Returns `None` only when the word contains no Thai characters.
125 ///
126 /// # Example
127 ///
128 /// ```rust
129 /// use kham_core::romanizer::RomanizationMap;
130 ///
131 /// let map = RomanizationMap::builtin();
132 /// assert_eq!(map.romanize_owned("กิน").as_deref(), Some("kin"));
133 /// // OOV word gets rule-based approximation
134 /// assert!(map.romanize_owned("เปปซี่").is_some());
135 /// // Non-Thai returns None
136 /// assert_eq!(map.romanize_owned("hello"), None);
137 /// ```
138 pub fn romanize_owned(&self, word: &str) -> Option<String> {
139 if let Some(s) = self.0.get(word) {
140 return Some(s.clone());
141 }
142 if word.chars().any(is_thai_char) {
143 Some(romanize_word(word))
144 } else {
145 None
146 }
147 }
148
149 /// Return the RTGS romanization for `word`, or `word` unchanged if not in
150 /// the table. Only performs table lookup — no rule engine.
151 ///
152 /// For OOV Thai words that should fall back to the rule engine, use
153 /// [`romanize_or_rule`](Self::romanize_or_rule) instead.
154 ///
155 /// # Example
156 ///
157 /// ```rust
158 /// use kham_core::romanizer::RomanizationMap;
159 ///
160 /// let map = RomanizationMap::from_tsv("กิน\tkin\n");
161 /// assert_eq!(map.romanize_or_raw("กิน"), "kin");
162 /// assert_eq!(map.romanize_or_raw("xyz"), "xyz");
163 /// // OOV Thai is returned unchanged (raw passthrough)
164 /// assert_eq!(map.romanize_or_raw("เปปซี่"), "เปปซี่");
165 /// ```
166 pub fn romanize_or_raw<'a>(&'a self, word: &'a str) -> &'a str {
167 self.0.get(word).map(String::as_str).unwrap_or(word)
168 }
169
170 /// Return the RTGS romanization for `word`.
171 ///
172 /// Checks the table first; for OOV Thai words the built-in rule engine is
173 /// applied. Non-Thai input is returned unchanged. Always returns an owned
174 /// `String`.
175 ///
176 /// # Example
177 ///
178 /// ```rust
179 /// use kham_core::romanizer::RomanizationMap;
180 ///
181 /// let map = RomanizationMap::builtin();
182 /// // Table hit
183 /// assert_eq!(map.romanize_or_rule("กิน"), "kin");
184 /// // Non-Thai passes through
185 /// assert_eq!(map.romanize_or_rule("hello"), "hello");
186 /// // OOV Thai gets rule-based approximation
187 /// let oov = map.romanize_or_rule("เปปซี่");
188 /// assert!(!oov.is_empty());
189 /// assert!(!oov.chars().any(|c| ('\u{0E00}'..='\u{0E7F}').contains(&c)));
190 /// ```
191 pub fn romanize_or_rule(&self, word: &str) -> String {
192 if let Some(s) = self.0.get(word) {
193 return s.clone();
194 }
195 if word.chars().any(is_thai_char) {
196 romanize_word(word)
197 } else {
198 String::from(word)
199 }
200 }
201
202 /// Romanize a slice of pre-segmented token strings.
203 ///
204 /// Returns a `Vec<String>` aligned 1:1 with the input slice. Tokens not
205 /// found in the table are returned unchanged (same behaviour as
206 /// [`romanize_or_raw`](Self::romanize_or_raw)).
207 ///
208 /// # Example
209 ///
210 /// ```rust
211 /// use kham_core::romanizer::RomanizationMap;
212 ///
213 /// let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
214 /// let out = map.romanize_tokens(&["กิน", "ปลา"]);
215 /// assert_eq!(out, vec!["kin", "pla"]);
216 /// ```
217 pub fn romanize_tokens(&self, tokens: &[&str]) -> Vec<String> {
218 tokens
219 .iter()
220 .map(|t| String::from(self.romanize_or_raw(t)))
221 .collect()
222 }
223
224 /// Segment `text` and romanize every Thai token using RTGS table-lookup with
225 /// rule-based fallback. Non-Thai tokens (Latin, numbers, punctuation,
226 /// whitespace) are passed through as-is.
227 ///
228 /// The result is a continuous string with no separator between tokens — the
229 /// original whitespace tokens (if any) are preserved as spaces.
230 ///
231 /// # Example
232 /// ```rust
233 /// use kham_core::romanizer::RomanizationMap;
234 ///
235 /// let map = RomanizationMap::builtin();
236 /// let out = map.romanize_sentence("กินข้าว");
237 /// // Should contain only ASCII / Latin characters for Thai input
238 /// assert!(!out.is_empty());
239 /// assert!(!out.chars().any(|c| ('\u{0E00}'..='\u{0E7F}').contains(&c)));
240 /// ```
241 pub fn romanize_sentence(&self, text: &str) -> String {
242 if text.is_empty() {
243 return String::new();
244 }
245 let tokenizer = Tokenizer::builder().keep_whitespace(true).build();
246 let tokens = tokenizer.segment(text);
247 let mut out = String::with_capacity(text.len() * 2);
248 for token in &tokens {
249 match token.kind {
250 TokenKind::Thai | TokenKind::Named(_) => {
251 out.push_str(&self.romanize_or_rule(token.text));
252 }
253 _ => out.push_str(token.text),
254 }
255 }
256 out
257 }
258
259 /// Number of entries in the map.
260 #[inline]
261 pub fn len(&self) -> usize {
262 self.0.len()
263 }
264
265 /// Return `true` if the map has no entries.
266 #[inline]
267 pub fn is_empty(&self) -> bool {
268 self.0.is_empty()
269 }
270}
271
272// ---------------------------------------------------------------------------
273// Rule-based RTGS engine (fallback for OOV words)
274// ---------------------------------------------------------------------------
275
/// Return `true` when `c` lies in the Thai Unicode block (U+0E00..=U+0E7F).
#[inline]
fn is_thai_char(c: char) -> bool {
    matches!(c, '\u{0E00}'..='\u{0E7F}')
}
280
/// RTGS initial-position consonant mapping.
///
/// Returns the Latin spelling a consonant takes at the start of a syllable.
/// `อ` is a silent vowel carrier and maps to the empty string, as does any
/// character without an RTGS initial value.
fn initial_rtgs(c: char) -> &'static str {
    match c {
        'ก' => "k",
        // ข ฃ ค ฅ ฆ — the range also covers the obsolete ฃ (U+0E03).
        '\u{0E02}'..='\u{0E06}' => "kh",
        'ง' => "ng",
        'จ' | 'ฉ' | 'ช' | 'ฌ' => "ch",
        'ซ' | 'ศ' | 'ษ' | 'ส' => "s",
        'ญ' | 'ย' => "y",
        'ฎ' | 'ด' => "d",
        'ฏ' | 'ต' => "t",
        'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => "th",
        'น' | 'ณ' => "n",
        'บ' => "b",
        'ป' => "p",
        'ผ' | 'พ' | 'ภ' => "ph",
        'ฝ' | 'ฟ' => "f",
        'ม' => "m",
        'ร' => "r",
        'ล' | 'ฬ' => "l",
        'ว' => "w",
        'ห' | 'ฮ' => "h",
        // Silent carrier for a vowel-initial syllable.
        'อ' => "",
        _ => "",
    }
}
307
/// RTGS final-position (coda) consonant mapping.
///
/// Returns the Latin spelling a consonant takes at the end of a syllable.
/// `ย` and `ว` are rendered as the vowel glides `i` / `o`; `ห`, `อ` and any
/// character without an RTGS final value map to the empty string.
fn final_rtgs(c: char) -> &'static str {
    match c {
        // ก ข ฃ ค ฅ ฆ — the range also covers the obsolete ฃ (U+0E03).
        '\u{0E01}'..='\u{0E06}' => "k",
        'ง' => "ng",
        'จ' | 'ช' | 'ซ' | 'ฌ' | 'ฎ' | 'ด' | 'ฏ' | 'ต' | 'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ'
        | 'ศ' | 'ษ' | 'ส' => "t",
        // ญ is `y` as an initial but `n` as a final in RTGS.
        'น' | 'ณ' | 'ญ' | 'ร' | 'ล' | 'ฬ' => "n",
        'บ' | 'ป' | 'พ' | 'ภ' | 'ฝ' | 'ฟ' => "p",
        'ม' => "m",
        'ย' => "i",
        'ว' => "o",
        'ห' | 'อ' => "",
        _ => "",
    }
}
327
/// Return `true` when `c` falls in the consonant range ก (U+0E01) ..= ฮ (U+0E2E).
fn is_thai_consonant(c: char) -> bool {
    ('\u{0E01}'..='\u{0E2E}').contains(&c)
}
331
/// Return `true` for the five vowels written before their consonant:
/// เ แ โ ใ ไ (U+0E40..=U+0E44).
fn is_leading_vowel(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}
335
/// Return `true` for the four Thai tone marks (U+0E48..=U+0E4B).
fn is_tone_mark(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
339
/// Return `true` for thanthakat (U+0E4C), the mark that silences the
/// consonant it is written on.
fn is_silent_mark(c: char) -> bool {
    matches!(c, '\u{0E4C}')
}
343
344/// Apply RTGS rules to an OOV Thai word.
345///
346/// Processes the Unicode character sequence using a lightweight syllable
347/// state machine. Handles leading vowels (เ แ โ ใ ไ), above vowels
348/// (ิ ี ึ ื ั ็), below vowels (ุ ู), following vowels (า ะ ำ), tone marks
349/// (skipped), and the thanthakat silent marker (์). Unrecognised characters
350/// pass through unchanged.
351pub fn romanize_word(word: &str) -> String {
352 let chars: Vec<char> = word.chars().collect();
353 let n = chars.len();
354 let mut out = String::with_capacity(word.len());
355 let mut i = 0;
356
357 while i < n {
358 let c = chars[i];
359
360 if is_leading_vowel(c) {
361 let lead = c;
362 i += 1;
363 // Skip any stacked tone marks before the initial consonant
364 while i < n && is_tone_mark(chars[i]) {
365 i += 1;
366 }
367 if i < n && is_thai_consonant(chars[i]) {
368 let init = initial_rtgs(chars[i]);
369 i += 1;
370 // Skip tone marks and above/below vowels that follow the initial
371 while i < n
372 && (is_tone_mark(chars[i])
373 || matches!(
374 chars[i],
375 'ิ' | 'ี' | 'ึ' | 'ื' | 'ั' | '็' | 'ุ' | 'ู' | '\u{0E4D}' | '\u{0E3A}'
376 ))
377 {
378 i += 1;
379 }
380 // Detect compound patterns: เ_อ → oe, เ_า → ao, เ_็ already consumed above
381 let suffix = if lead == 'เ' && i < n && chars[i] == 'อ' {
382 i += 1;
383 "oe"
384 } else if lead == 'เ' && i < n && chars[i] == 'า' {
385 i += 1;
386 "ao" // เ_า pattern
387 } else {
388 match lead {
389 'เ' => "e",
390 'แ' => "ae",
391 'โ' => "o",
392 'ใ' | 'ไ' => "ai",
393 _ => "",
394 }
395 };
396 out.push_str(init);
397 out.push_str(suffix);
398 // Final consonant
399 if i < n && is_thai_consonant(chars[i]) && !is_silent_mark(chars[i]) {
400 // Check for thanthakat on next+1
401 let fin_c = chars[i];
402 i += 1;
403 let silent = i < n && is_silent_mark(chars[i]);
404 if silent {
405 i += 1; // consume ์
406 } else {
407 out.push_str(final_rtgs(fin_c));
408 }
409 }
410 } else {
411 // Lone leading vowel — just emit vowel sound
412 out.push_str(match lead {
413 'เ' => "e",
414 'แ' => "ae",
415 'โ' => "o",
416 'ใ' | 'ไ' => "ai",
417 _ => "",
418 });
419 }
420 } else if is_thai_consonant(c) {
421 let init = initial_rtgs(c);
422 i += 1;
423
424 // Collect vowel diacritics and tone marks
425 let mut vowel = "";
426 let mut pending_silent = false;
427 while i < n {
428 match chars[i] {
429 // Tone marks — skip
430 ch if is_tone_mark(ch) => i += 1,
431 // Thanthakat — this consonant is silent
432 ch if is_silent_mark(ch) => {
433 pending_silent = true;
434 i += 1;
435 break;
436 }
437 // Above vowels
438 'ิ' | '็' => {
439 vowel = "i";
440 i += 1;
441 }
442 'ี' => {
443 vowel = "i";
444 i += 1;
445 }
446 'ึ' => {
447 vowel = "ue";
448 i += 1;
449 }
450 'ื' => {
451 vowel = "ue";
452 i += 1;
453 }
454 'ั' => {
455 vowel = "a";
456 i += 1;
457 }
458 // Below vowels
459 'ุ' => {
460 vowel = "u";
461 i += 1;
462 }
463 'ู' => {
464 vowel = "u";
465 i += 1;
466 }
467 // Following vowels
468 'า' => {
469 vowel = "a";
470 i += 1;
471 }
472 'ะ' => {
473 vowel = "a";
474 i += 1;
475 }
476 'ำ' => {
477 vowel = "am";
478 i += 1;
479 break;
480 } // am absorbs final
481 // Nikhahit / phinthu — skip
482 '\u{0E4D}' | '\u{0E3A}' => i += 1,
483 _ => break,
484 }
485 }
486
487 if pending_silent {
488 // Consonant is silent (e.g. ห์ in loan words) — emit nothing
489 continue;
490 }
491
492 out.push_str(init);
493 out.push_str(vowel);
494
495 // ำ already encodes the final nasal — skip coda search
496 if vowel == "am" {
497 continue;
498 }
499
500 // Final consonant: next non-tone-mark consonant followed by end-of-word
501 // or another leading vowel / vowel diacritic
502 if i < n && is_thai_consonant(chars[i]) {
503 let fin_c = chars[i];
504 // Peek: if fin_c is followed by ์ it's silent
505 let next_is_silent = i + 1 < n && is_silent_mark(chars[i + 1]);
506 // If fin_c is followed by a vowel diacritic or leading vowel, it's
507 // an initial of the next syllable — don't consume as final
508 let next_is_vowel = i + 1 < n
509 && (is_leading_vowel(chars[i + 1])
510 || matches!(
511 chars[i + 1],
512 'ิ' | 'ี'
513 | 'ึ'
514 | 'ื'
515 | 'ั'
516 | '็'
517 | 'ุ'
518 | 'ู'
519 | 'า'
520 | 'ะ'
521 | 'ำ'
522 ));
523 if next_is_silent {
524 i += 2; // consume consonant + ์
525 } else if next_is_vowel {
526 // next char is an initial of a following syllable — leave it
527 } else {
528 out.push_str(final_rtgs(fin_c));
529 i += 1;
530 }
531 }
532 } else if is_tone_mark(c) || is_silent_mark(c) || matches!(c, '\u{0E4D}' | '\u{0E3A}') {
533 i += 1; // stray diacritic — skip
534 } else {
535 // Non-Thai character: pass through
536 out.push(c);
537 i += 1;
538 }
539 }
540
541 out
542}
543
544// ---------------------------------------------------------------------------
545// Tests
546// ---------------------------------------------------------------------------
547
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Shorthand for the built-in table used by most tests.
    fn builtin() -> RomanizationMap {
        RomanizationMap::builtin()
    }

    /// True when `s` still contains a code point from the Thai Unicode block.
    fn has_thai(s: &str) -> bool {
        s.chars().any(|c| ('\u{0E00}'..='\u{0E7F}').contains(&c))
    }

    // ── table lookup ─────────────────────────────────────────────────────────

    #[test]
    fn builtin_common_words() {
        let map = builtin();
        for &(thai, rtgs) in &[
            ("กิน", "kin"),
            ("ข้าว", "khao"),
            ("น้ำ", "nam"),
            ("ปลา", "pla"),
        ] {
            assert_eq!(map.romanize(thai), Some(rtgs));
        }
    }

    #[test]
    fn unknown_word_returns_none_for_non_thai() {
        let map = builtin();
        for &input in &["hello", "123"] {
            assert_eq!(map.romanize(input), None);
        }
    }

    #[test]
    fn romanize_or_raw_hit() {
        assert_eq!(builtin().romanize_or_raw("กิน"), "kin");
    }

    #[test]
    fn romanize_or_raw_non_thai_passthrough() {
        assert_eq!(builtin().romanize_or_raw("xyz"), "xyz");
    }

    #[test]
    fn romanize_or_rule_oov_thai_non_empty() {
        // An OOV Thai word must come back romanized by the rule engine:
        // neither empty nor still containing Thai code points.
        let result = builtin().romanize_or_rule("เปปซี่");
        assert!(
            !result.is_empty(),
            "rule engine should produce non-empty output"
        );
        assert!(!has_thai(&result), "output should be Latin, not Thai");
    }

    // ── rule engine unit tests ────────────────────────────────────────────────

    #[test]
    fn rule_simple_consonant_vowel_final() {
        // ก (k) + ิ (i) + น (final n)
        assert_eq!(romanize_word("กิน"), "kin");
    }

    #[test]
    fn rule_leading_vowel_ae() {
        // แ is written first but sounds after the initial ก; the tone mark is
        // dropped and the final ว becomes "o": k + ae + o.
        assert_eq!(romanize_word("แก้ว"), "kaeo");
    }

    #[test]
    fn rule_leading_vowel_o() {
        // โ + ต: initial t followed by the vowel o
        assert_eq!(romanize_word("โต"), "to");
    }

    #[test]
    fn rule_leading_vowel_ai() {
        // ไ + ป: ป is the initial consonant, ไ supplies "ai"
        let r = romanize_word("ไป");
        assert!(r.contains("ai"), "ไป should romanize with 'ai', got: {r}");
    }

    #[test]
    fn rule_sara_am() {
        // ท (th) + ำ (am)
        assert_eq!(romanize_word("ทำ"), "tham");
    }

    #[test]
    fn rule_below_vowel_u() {
        // ด (d) + ุ (u)
        assert_eq!(romanize_word("ดุ"), "du");
    }

    #[test]
    fn rule_non_thai_passthrough() {
        assert_eq!(romanize_word("hello"), "hello");
    }

    #[test]
    fn rule_empty_string() {
        assert_eq!(romanize_word(""), "");
    }

    #[test]
    fn romanize_or_rule_table_takes_priority() {
        // The hand-curated entry wins over whatever the rule engine would say.
        assert_eq!(builtin().romanize_or_rule("กิน"), "kin");
    }

    #[test]
    fn romanize_or_rule_non_thai_passthrough() {
        assert_eq!(builtin().romanize_or_rule("hello"), "hello");
    }

    // ── TSV parsing ──────────────────────────────────────────────────────────

    #[test]
    fn from_tsv_last_duplicate_wins() {
        let parsed = RomanizationMap::from_tsv("กิน\tkin\nกิน\tgin\n");
        assert_eq!(parsed.romanize("กิน"), Some("gin"));
    }

    #[test]
    fn romanize_tokens_aligned() {
        let parsed = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
        assert_eq!(parsed.romanize_tokens(&["กิน", "ปลา"]), vec!["kin", "pla"]);
    }

    #[test]
    fn romanize_tokens_unknown_passthrough() {
        let parsed = RomanizationMap::from_tsv("กิน\tkin\n");
        assert_eq!(parsed.romanize_tokens(&["กิน", "xyz"]), vec!["kin", "xyz"]);
    }

    #[test]
    fn comment_and_blank_lines_skipped() {
        let parsed = RomanizationMap::from_tsv("# comment\n\nกิน\tkin\n");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed.romanize("กิน"), Some("kin"));
    }

    #[test]
    fn line_without_tab_skipped() {
        assert!(RomanizationMap::from_tsv("กิน\n").is_empty());
    }

    #[test]
    fn whitespace_trimmed_from_romanization() {
        let parsed = RomanizationMap::from_tsv("กิน\t kin \n");
        assert_eq!(parsed.romanize("กิน"), Some("kin"));
    }

    #[test]
    fn empty_input_produces_empty_map() {
        let parsed = RomanizationMap::from_tsv("");
        assert!(parsed.is_empty());
    }

    #[test]
    fn romanize_tokens_empty_slice() {
        let out = builtin().romanize_tokens(&[]);
        assert!(out.is_empty());
    }

    // ── romanize_sentence ────────────────────────────────────────────────────

    #[test]
    fn romanize_sentence_thai_only() {
        let out = builtin().romanize_sentence("กินข้าว");
        assert!(!out.is_empty(), "output should not be empty");
        assert!(
            !has_thai(&out),
            "output should contain no Thai characters; got: {out:?}"
        );
    }

    #[test]
    fn romanize_sentence_mixed() {
        let out = builtin().romanize_sentence("กิน100บาท");
        assert!(
            out.contains("100"),
            "output should preserve '100'; got: {out:?}"
        );
        // The Thai tokens around the digits must be romanized as well.
        assert!(
            !has_thai(&out),
            "output should contain no Thai characters; got: {out:?}"
        );
    }
}