use alloc::collections::BTreeMap;
use alloc::string::String;
use alloc::vec::Vec;
/// Text of `../data/romanization_th.tsv`, embedded into the binary at compile
/// time by `include_str!` and parsed by `RomanizationMap::builtin`.
static BUILTIN_ROMANIZATION: &str = include_str!("../data/romanization_th.tsv");
/// Lookup table from Thai words to their romanized spellings.
///
/// Exact-match lookups are served from the table; the `*_owned` / `*_rule`
/// accessors additionally fall back to the rule-based [`romanize_word`] for
/// words that are not in the table but contain Thai characters.
#[derive(Debug, Clone)]
pub struct RomanizationMap(BTreeMap<String, String>);

impl RomanizationMap {
    /// Builds the map from the TSV table embedded in the binary.
    pub fn builtin() -> Self {
        Self::from_tsv(BUILTIN_ROMANIZATION)
    }

    /// Parses a `word<TAB>romanization` table, one entry per line.
    ///
    /// * Blank lines and lines starting with `#` are skipped.
    /// * Lines without a tab, or with an empty column, are skipped.
    /// * Both columns are trimmed, so stray spaces around the tab cannot
    ///   create keys that no lookup will ever match.
    /// * A leading UTF-8 byte-order mark is ignored.
    /// * When a word appears more than once, the last entry wins.
    pub fn from_tsv(data: &str) -> Self {
        // `str::trim` does not treat U+FEFF as whitespace, so without this a
        // BOM written by an editor would become part of the first key.
        let data = data.strip_prefix('\u{FEFF}').unwrap_or(data);

        let mut map: BTreeMap<String, String> = BTreeMap::new();
        for line in data.lines().map(str::trim) {
            if line.is_empty() || line.starts_with('#') {
                continue;
            }
            let (word, roman) = match line.split_once('\t') {
                Some((word, roman)) => (word.trim(), roman.trim()),
                None => continue,
            };
            if word.is_empty() || roman.is_empty() {
                continue;
            }
            // `insert` overwrites, which is what makes the last duplicate win.
            map.insert(String::from(word), String::from(roman));
        }
        RomanizationMap(map)
    }

    /// Returns the table entry for `word`, or `None` if it is not listed.
    pub fn romanize(&self, word: &str) -> Option<&str> {
        self.0.get(word).map(String::as_str)
    }

    /// Like [`romanize`](Self::romanize), but falls back to the rule engine.
    ///
    /// Returns the table entry when there is one; otherwise, if `word`
    /// contains at least one character from the Thai block, the result of
    /// [`romanize_word`]; otherwise `None`.
    pub fn romanize_owned(&self, word: &str) -> Option<String> {
        if let Some(hit) = self.0.get(word) {
            return Some(hit.clone());
        }
        if word.chars().any(is_thai_char) {
            Some(romanize_word(word))
        } else {
            None
        }
    }

    /// Returns the table entry for `word`, or `word` itself when unlisted.
    ///
    /// The rule engine is not consulted.
    pub fn romanize_or_raw<'a>(&'a self, word: &'a str) -> &'a str {
        self.romanize(word).unwrap_or(word)
    }

    /// Returns the table entry, else the rule-based romanization for words
    /// containing Thai characters, else a copy of `word` unchanged.
    pub fn romanize_or_rule(&self, word: &str) -> String {
        self.romanize_owned(word)
            .unwrap_or_else(|| String::from(word))
    }

    /// Romanizes each token with [`romanize_or_raw`](Self::romanize_or_raw),
    /// preserving order and length; unlisted tokens pass through unchanged.
    pub fn romanize_tokens(&self, tokens: &[&str]) -> Vec<String> {
        tokens
            .iter()
            .map(|token| String::from(self.romanize_or_raw(token)))
            .collect()
    }

    /// Number of entries in the table.
    #[inline]
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns `true` when the table has no entries.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
/// Returns `true` when `c` lies in the Unicode Thai block (U+0E00..=U+0E7F).
#[inline]
fn is_thai_char(c: char) -> bool {
    matches!(c, '\u{0E00}'..='\u{0E7F}')
}
/// Returns the RTGS romanization of consonant `c` in syllable-initial position.
///
/// Characters without an initial sound here — the vowel carrier อ and anything
/// that is not a listed consonant — yield the empty string.
fn initial_rtgs(c: char) -> &'static str {
    match c {
        'ก' => "k",
        // ฃ and ฅ are obsolete letters that share the sound of ข and ค.
        'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "kh",
        'ง' => "ng",
        'จ' | 'ฉ' | 'ช' | 'ฌ' => "ch",
        'ซ' | 'ศ' | 'ษ' | 'ส' => "s",
        'ญ' | 'ย' => "y",
        'ฎ' | 'ด' => "d",
        'ฏ' | 'ต' => "t",
        'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => "th",
        'น' | 'ณ' => "n",
        'บ' => "b",
        'ป' => "p",
        'ผ' | 'พ' | 'ภ' => "ph",
        'ฝ' | 'ฟ' => "f",
        'ม' => "m",
        'ร' => "r",
        'ล' | 'ฬ' => "l",
        'ว' => "w",
        'ห' | 'ฮ' => "h",
        // อ only carries a vowel; it contributes no consonant sound.
        'อ' => "",
        _ => "",
    }
}
/// Returns the RTGS romanization of consonant `c` in syllable-final position.
///
/// Thai finals collapse to a small set of sounds (k, ng, t, n, p, m), so many
/// letters share an output. ย and ว are rendered as the glides "i" and "o".
/// Letters with no final sound, and non-consonants, yield the empty string.
fn final_rtgs(c: char) -> &'static str {
    match c {
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "k",
        'ง' => "ng",
        // ฐ ฑ ฒ close a syllable as "t" just like the other stops (รัฐ is "rat").
        'จ' | 'ช' | 'ซ' | 'ฌ' | 'ฎ' | 'ฏ' | 'ฐ' | 'ฑ' | 'ฒ' | 'ด' | 'ต' | 'ถ' | 'ท' | 'ธ'
        | 'ศ' | 'ษ' | 'ส' => "t",
        // ญ is "y" as an initial but "n" as a final (สำคัญ is "samkhan").
        'ญ' | 'ณ' | 'น' | 'ร' | 'ล' | 'ฬ' => "n",
        'บ' | 'ป' | 'พ' | 'ภ' | 'ฝ' | 'ฟ' => "p",
        'ม' => "m",
        'ย' => "i",
        'ว' => "o",
        'ห' | 'อ' => "",
        _ => "",
    }
}
/// Returns `true` for code points U+0E01 (ก) through U+0E2E (ฮ) inclusive.
fn is_thai_consonant(c: char) -> bool {
    ('\u{0E01}'..='\u{0E2E}').contains(&c)
}
/// Returns `true` for the five vowels written before their consonant:
/// เ แ โ ใ ไ (U+0E40..=U+0E44).
fn is_leading_vowel(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}
/// Returns `true` for the four tone marks U+0E48..=U+0E4B.
fn is_tone_mark(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
/// Returns `true` for U+0E4C, the sign that silences the consonant it follows.
fn is_silent_mark(c: char) -> bool {
    matches!(c, '\u{0E4C}')
}
// Named code points for the signs the rule engine inspects; several are
// combining marks that are hard to tell apart as bare literals.
const O_ANG: char = '\u{0E2D}'; // อ
const SARA_A: char = '\u{0E30}'; // ะ
const MAI_HAN_AKAT: char = '\u{0E31}';
const SARA_AA: char = '\u{0E32}'; // า
const SARA_AM: char = '\u{0E33}'; // ำ
const PHINTHU: char = '\u{0E3A}';
const SARA_E: char = '\u{0E40}'; // เ
const MAITAIKHU: char = '\u{0E47}';
const NIKHAHIT: char = '\u{0E4D}';

/// Romanization of a leading vowel (เ แ โ ใ ไ) when no compound form applies.
fn leading_vowel_rtgs(lead: char) -> &'static str {
    match lead {
        '\u{0E40}' => "e",
        '\u{0E41}' => "ae",
        '\u{0E42}' => "o",
        '\u{0E43}' | '\u{0E44}' => "ai",
        _ => "",
    }
}

/// Romanization of a vowel sign written above, below or after its consonant,
/// or `None` when `c` is not one of those signs.
fn dependent_vowel_rtgs(c: char) -> Option<&'static str> {
    match c {
        SARA_A | MAI_HAN_AKAT | SARA_AA => Some("a"),
        SARA_AM => Some("am"),
        '\u{0E34}' | '\u{0E35}' => Some("i"),
        '\u{0E36}' | '\u{0E37}' => Some("ue"),
        '\u{0E38}' | '\u{0E39}' => Some("u"),
        _ => None,
    }
}

/// Marks that produce no output of their own when met outside a syllable.
fn is_unvoiced_mark(c: char) -> bool {
    is_tone_mark(c) || is_silent_mark(c) || matches!(c, NIKHAHIT | PHINTHU)
}

/// Decides whether the consonant at `pos` closes the current syllable and, if
/// so, appends its final-position sound. Returns how many chars were consumed.
///
/// * consonant + silencing mark: both are consumed, nothing is written;
/// * consonant + tone mark or dependent vowel: it opens the next syllable, so
///   nothing is consumed;
/// * otherwise it is a final. This includes a consonant followed by a leading
///   vowel, because that vowel belongs to the consonant written after it.
fn push_final_consonant(chars: &[char], pos: usize, out: &mut String) -> usize {
    let candidate = match chars.get(pos) {
        Some(&ch) if is_thai_consonant(ch) => ch,
        _ => return 0,
    };
    match chars.get(pos + 1) {
        Some(&next) if is_silent_mark(next) => 2,
        Some(&next)
            if is_tone_mark(next)
                || next == MAITAIKHU
                || dependent_vowel_rtgs(next).is_some() =>
        {
            0
        }
        _ => {
            out.push_str(final_rtgs(candidate));
            1
        }
    }
}

/// Romanizes the syllable that starts with the leading vowel at `start` and
/// returns the index of the first char after it.
fn romanize_leading_vowel_syllable(chars: &[char], start: usize, out: &mut String) -> usize {
    let lead = chars[start];
    let mut i = start + 1;
    while chars.get(i).map_or(false, |&ch| is_tone_mark(ch)) {
        i += 1;
    }

    // A leading vowel with no consonant to attach to is written on its own.
    let initial = match chars.get(i) {
        Some(&ch) if is_thai_consonant(ch) => ch,
        _ => {
            out.push_str(leading_vowel_rtgs(lead));
            return i;
        }
    };
    i += 1;

    // Marks stacked on the initial consonant are skipped without output here.
    while chars.get(i).map_or(false, |&ch| {
        is_tone_mark(ch)
            || matches!(ch, MAI_HAN_AKAT | '\u{0E34}'..='\u{0E3A}' | MAITAIKHU | NIKHAHIT)
    }) {
        i += 1;
    }

    let vowel = match (lead, chars.get(i).copied()) {
        // เ-อ
        (SARA_E, Some(O_ANG)) => {
            i += 1;
            "oe"
        }
        // เ-า is "ao"; the short form เ-าะ is "o".
        (SARA_E, Some(SARA_AA)) => {
            i += 1;
            if chars.get(i) == Some(&SARA_A) {
                "o"
            } else {
                "ao"
            }
        }
        _ => leading_vowel_rtgs(lead),
    };
    out.push_str(initial_rtgs(initial));
    out.push_str(vowel);

    // A trailing ะ only shortens the vowel. It has no sound of its own, and a
    // syllable written with it never takes a final consonant.
    if chars.get(i) == Some(&SARA_A) {
        return i + 1;
    }
    i + push_final_consonant(chars, i, out)
}

/// Romanizes the syllable that starts with the consonant at `start` and
/// returns the index of the first char after it.
fn romanize_consonant_syllable(chars: &[char], start: usize, out: &mut String) -> usize {
    let initial = initial_rtgs(chars[start]);
    let mut i = start + 1;
    let mut vowel = "";
    let mut open_syllable = false;

    while let Some(&ch) = chars.get(i) {
        if is_silent_mark(ch) {
            // The silencing mark mutes the consonant and any vowel on it.
            return i + 1;
        }
        if is_tone_mark(ch) || matches!(ch, NIKHAHIT | PHINTHU) {
            i += 1;
        } else if ch == MAITAIKHU {
            // NOTE(review): rendering this shortening sign as "i" is kept from
            // the previous implementation; it looks questionable — confirm.
            vowel = "i";
            i += 1;
        } else if let Some(v) = dependent_vowel_rtgs(ch) {
            vowel = v;
            i += 1;
            if matches!(ch, SARA_A | SARA_AM) {
                // ะ ends an open syllable and ำ already carries its "m".
                open_syllable = true;
                break;
            }
        } else {
            break;
        }
    }

    out.push_str(initial);
    out.push_str(vowel);
    if open_syllable {
        return i;
    }
    i + push_final_consonant(chars, i, out)
}

/// Rule-based romanization of `word`, used when no table entry exists.
///
/// The input is scanned left to right as a sequence of syllables of the shape
/// *leading vowel + consonant (+ final)* or *consonant (+ vowel) (+ final)*.
/// Consonants are mapped with `initial_rtgs` / `final_rtgs`. Tone marks, the
/// silencing mark and its consonant, nikhahit and phinthu produce no output.
/// Any other character — including all non-Thai text — is copied through
/// unchanged, so an empty input yields an empty string.
///
/// This is a heuristic: inherent (unwritten) vowels and most compound vowels
/// are not reconstructed.
pub fn romanize_word(word: &str) -> String {
    let chars: Vec<char> = word.chars().collect();
    let mut out = String::with_capacity(word.len());
    let mut i = 0;
    while let Some(&c) = chars.get(i) {
        if is_leading_vowel(c) {
            i = romanize_leading_vowel_syllable(&chars, i, &mut out);
        } else if is_thai_consonant(c) {
            i = romanize_consonant_syllable(&chars, i, &mut out);
        } else {
            if !is_unvoiced_mark(c) {
                out.push(c);
            }
            i += 1;
        }
    }
    out
}
// Unit tests for the table lookups, the TSV parser and the rule engine.
#[cfg(test)]
mod tests {
use super::*;
use alloc::vec;
// --- Lookups against the embedded table ---------------------------------
// These expect the built-in TSV to contain the listed words.
#[test]
fn builtin_common_words() {
let map = RomanizationMap::builtin();
assert_eq!(map.romanize("กิน"), Some("kin"));
assert_eq!(map.romanize("ข้าว"), Some("khao"));
assert_eq!(map.romanize("น้ำ"), Some("nam"));
assert_eq!(map.romanize("ปลา"), Some("pla"));
}
// `romanize` is a pure table lookup, so unlisted input yields `None`.
#[test]
fn unknown_word_returns_none_for_non_thai() {
let map = RomanizationMap::builtin();
assert_eq!(map.romanize("hello"), None);
assert_eq!(map.romanize("123"), None);
}
#[test]
fn romanize_or_raw_hit() {
let map = RomanizationMap::builtin();
assert_eq!(map.romanize_or_raw("กิน"), "kin");
}
#[test]
fn romanize_or_raw_non_thai_passthrough() {
let map = RomanizationMap::builtin();
assert_eq!(map.romanize_or_raw("xyz"), "xyz");
}
// Passes whether the word is served from the table or by the rule engine:
// it only checks that the result is non-empty and free of Thai characters.
#[test]
fn romanize_or_rule_oov_thai_non_empty() {
let map = RomanizationMap::builtin();
let result = map.romanize_or_rule("เปปซี่");
assert!(
!result.is_empty(),
"rule engine should produce non-empty output"
);
assert!(
!result.chars().any(is_thai_char),
"output should be Latin, not Thai"
);
}
// --- Rule engine (`romanize_word`) called directly ----------------------
#[test]
fn rule_simple_consonant_vowel_final() {
assert_eq!(romanize_word("กิน"), "kin");
}
#[test]
fn rule_leading_vowel_ae() {
let r = romanize_word("แก้ว");
assert_eq!(r, "kaeo");
}
#[test]
fn rule_leading_vowel_o() {
assert_eq!(romanize_word("โต"), "to");
}
#[test]
fn rule_leading_vowel_ai() {
let r = romanize_word("ไป");
assert!(r.contains("ai"), "ไป should romanize with 'ai', got: {r}");
}
#[test]
fn rule_sara_am() {
assert_eq!(romanize_word("ทำ"), "tham");
}
#[test]
fn rule_below_vowel_u() {
assert_eq!(romanize_word("ดุ"), "du");
}
#[test]
fn rule_non_thai_passthrough() {
assert_eq!(romanize_word("hello"), "hello");
}
#[test]
fn rule_empty_string() {
assert_eq!(romanize_word(""), "");
}
// --- `romanize_or_rule` fallbacks ---------------------------------------
#[test]
fn romanize_or_rule_table_takes_priority() {
let map = RomanizationMap::builtin();
assert_eq!(map.romanize_or_rule("กิน"), "kin");
}
#[test]
fn romanize_or_rule_non_thai_passthrough() {
let map = RomanizationMap::builtin();
assert_eq!(map.romanize_or_rule("hello"), "hello");
}
// --- TSV parsing and token mapping on hand-written tables ---------------
#[test]
fn from_tsv_last_duplicate_wins() {
let map = RomanizationMap::from_tsv("กิน\tkin\nกิน\tgin\n");
assert_eq!(map.romanize("กิน"), Some("gin"));
}
#[test]
fn romanize_tokens_aligned() {
let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
let out = map.romanize_tokens(&["กิน", "ปลา"]);
assert_eq!(out, vec!["kin", "pla"]);
}
// Tokens missing from the table come back unchanged.
#[test]
fn romanize_tokens_unknown_passthrough() {
let map = RomanizationMap::from_tsv("กิน\tkin\n");
let out = map.romanize_tokens(&["กิน", "xyz"]);
assert_eq!(out, vec!["kin", "xyz"]);
}
#[test]
fn comment_and_blank_lines_skipped() {
let map = RomanizationMap::from_tsv("# comment\n\nกิน\tkin\n");
assert_eq!(map.len(), 1);
assert_eq!(map.romanize("กิน"), Some("kin"));
}
#[test]
fn line_without_tab_skipped() {
let map = RomanizationMap::from_tsv("กิน\n");
assert!(map.is_empty());
}
#[test]
fn whitespace_trimmed_from_romanization() {
let map = RomanizationMap::from_tsv("กิน\t kin \n");
assert_eq!(map.romanize("กิน"), Some("kin"));
}
#[test]
fn empty_input_produces_empty_map() {
assert!(RomanizationMap::from_tsv("").is_empty());
}
#[test]
fn romanize_tokens_empty_slice() {
let map = RomanizationMap::builtin();
assert!(map.romanize_tokens(&[]).is_empty());
}
}