Skip to main content

kham_core/
romanizer.rs

1//! RTGS romanization of segmented Thai words.
2//!
3//! [`RomanizationMap`] maps pre-segmented Thai words to their Roman (Latin)
4//! phonetic equivalents using the Royal Thai General System of Transcription
5//! (RTGS) — the Thai government standard used in road signs, passports, and
6//! official documents.
7//!
8//! This implementation is **table-driven only**. A rule-based phonetic engine
9//! is a future `#[cfg(feature = "phonetic")]` extension.
10//!
11//! # RTGS characteristics
12//!
13//! - Consonant-by-consonant transliteration (initial vs. final position differ)
14//! - No tone marks in output
15//! - No vowel-length distinction (อิ and อี both map to `i`)
16//! - Diphthongs and vowel clusters have explicit multi-character mappings
17//!
18//! # Data format
19//!
20//! Tab-separated text file, one entry per line:
21//!
22//! ```text
23//! # Thai word<TAB>RTGS romanization
24//! กิน<TAB>kin
25//! ข้าว<TAB>khao
26//! ปลา<TAB>pla
27//! ```
28//!
29//! Lines beginning with `#` and blank lines are ignored.
30//! Duplicate keys: last entry wins (allows override files).
31//!
32//! # Example
33//!
34//! ```rust
35//! use kham_core::romanizer::RomanizationMap;
36//!
37//! let map = RomanizationMap::builtin();
38//! assert_eq!(map.romanize("กิน"), Some("kin"));
39//! assert_eq!(map.romanize_or_raw("ข้าว"), "khao");
40//! assert_eq!(map.romanize_or_raw("xyz"), "xyz");
41//!
42//! let tokens = vec!["กิน", "ข้าว", "ปลา"];
43//! assert_eq!(map.romanize_tokens(&tokens), vec!["kin", "khao", "pla"]);
44//! ```
45
46use alloc::collections::BTreeMap;
47use alloc::string::String;
48use alloc::vec::Vec;
49
50static BUILTIN_ROMANIZATION: &str = include_str!("../data/romanization_th.tsv");
51
52/// A Thai-word → RTGS-romanization lookup table.
53///
54/// Built from tab-separated data via [`RomanizationMap::from_tsv`].
55/// Lookup is O(log n) via [`BTreeMap`].
56pub struct RomanizationMap(BTreeMap<String, String>);
57
58impl RomanizationMap {
59    /// Load the built-in RTGS romanization table.
60    pub fn builtin() -> Self {
61        Self::from_tsv(BUILTIN_ROMANIZATION)
62    }
63
64    /// Parse a tab-separated romanization table.
65    ///
66    /// Format: `thai_word\trtgs_romanization` — one entry per line.
67    /// Lines beginning with `#` and blank lines are skipped.
68    /// For duplicate keys, the last entry wins.
69    pub fn from_tsv(data: &str) -> Self {
70        let mut map: BTreeMap<String, String> = BTreeMap::new();
71        for line in data.lines() {
72            let line = line.trim();
73            if line.is_empty() || line.starts_with('#') {
74                continue;
75            }
76            let mut parts = line.splitn(2, '\t');
77            let word = match parts.next() {
78                Some(w) if !w.is_empty() => String::from(w),
79                _ => continue,
80            };
81            let roman = match parts.next() {
82                Some(r) if !r.is_empty() => String::from(r.trim()),
83                _ => continue,
84            };
85            map.insert(word, roman);
86        }
87        RomanizationMap(map)
88    }
89
90    /// Look up the RTGS romanization for a pre-segmented Thai word.
91    ///
92    /// Returns `None` if the word is not in the table.
93    /// The returned `&str` borrows from the map — zero-copy for hits.
94    ///
95    /// # Example
96    ///
97    /// ```rust
98    /// use kham_core::romanizer::RomanizationMap;
99    ///
100    /// let map = RomanizationMap::from_tsv("กิน\tkin\n");
101    /// assert_eq!(map.romanize("กิน"), Some("kin"));
102    /// assert_eq!(map.romanize("xyz"), None);
103    /// ```
104    pub fn romanize(&self, word: &str) -> Option<&str> {
105        self.0.get(word).map(String::as_str)
106    }
107
108    /// Return the RTGS romanization for `word`, or `word` unchanged if not found.
109    ///
110    /// # Example
111    ///
112    /// ```rust
113    /// use kham_core::romanizer::RomanizationMap;
114    ///
115    /// let map = RomanizationMap::from_tsv("กิน\tkin\n");
116    /// assert_eq!(map.romanize_or_raw("กิน"), "kin");
117    /// assert_eq!(map.romanize_or_raw("xyz"), "xyz");
118    /// ```
119    pub fn romanize_or_raw<'a>(&'a self, word: &'a str) -> &'a str {
120        self.0.get(word).map(String::as_str).unwrap_or(word)
121    }
122
123    /// Romanize a slice of pre-segmented token strings.
124    ///
125    /// Returns a `Vec<String>` aligned 1:1 with the input slice. Tokens not
126    /// found in the table are returned unchanged (same behaviour as
127    /// [`romanize_or_raw`](Self::romanize_or_raw)).
128    ///
129    /// # Example
130    ///
131    /// ```rust
132    /// use kham_core::romanizer::RomanizationMap;
133    ///
134    /// let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
135    /// let out = map.romanize_tokens(&["กิน", "ปลา"]);
136    /// assert_eq!(out, vec!["kin", "pla"]);
137    /// ```
138    pub fn romanize_tokens(&self, tokens: &[&str]) -> Vec<String> {
139        tokens
140            .iter()
141            .map(|t| String::from(self.romanize_or_raw(t)))
142            .collect()
143    }
144
145    /// Number of entries in the map.
146    #[inline]
147    pub fn len(&self) -> usize {
148        self.0.len()
149    }
150
151    /// Return `true` if the map has no entries.
152    #[inline]
153    pub fn is_empty(&self) -> bool {
154        self.0.is_empty()
155    }
156}
157
158// ---------------------------------------------------------------------------
159// Tests
160// ---------------------------------------------------------------------------
161
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Everyday words shipped in the bundled table resolve to their RTGS form.
    #[test]
    fn builtin_common_words() {
        let table = RomanizationMap::builtin();
        let expected = [("กิน", "kin"), ("ข้าว", "khao"), ("น้ำ", "nam"), ("ปลา", "pla")];
        for &(thai, rtgs) in expected.iter() {
            assert_eq!(table.romanize(thai), Some(rtgs));
        }
    }

    /// A word absent from the bundled table yields `None`.
    #[test]
    fn unknown_word_returns_none() {
        let table = RomanizationMap::builtin();
        let looked_up = table.romanize("เปปซี่");
        assert_eq!(looked_up, None);
    }

    /// On a miss, `romanize_or_raw` hands back the input unchanged.
    #[test]
    fn romanize_or_raw_fallback() {
        let table = RomanizationMap::builtin();
        let missing = "เปปซี่";
        assert_eq!(table.romanize_or_raw(missing), missing);
    }

    /// On a hit, `romanize_or_raw` returns the table's romanization.
    #[test]
    fn romanize_or_raw_hit() {
        let table = RomanizationMap::builtin();
        let romanized = table.romanize_or_raw("กิน");
        assert_eq!(romanized, "kin");
    }

    /// A later line for the same key replaces the earlier one.
    #[test]
    fn from_tsv_last_duplicate_wins() {
        let tsv = "กิน\tkin\nกิน\tgin\n";
        let table = RomanizationMap::from_tsv(tsv);
        assert_eq!(table.romanize("กิน"), Some("gin"));
    }

    /// Output of `romanize_tokens` lines up 1:1 with the input tokens.
    #[test]
    fn romanize_tokens_aligned() {
        let table = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
        let tokens = ["กิน", "ปลา"];
        let romanized = table.romanize_tokens(&tokens);
        assert_eq!(romanized, vec!["kin", "pla"]);
    }

    /// Tokens missing from the table pass through `romanize_tokens` as-is.
    #[test]
    fn romanize_tokens_unknown_passthrough() {
        let table = RomanizationMap::from_tsv("กิน\tkin\n");
        let tokens = ["กิน", "xyz"];
        let romanized = table.romanize_tokens(&tokens);
        assert_eq!(romanized, vec!["kin", "xyz"]);
    }

    /// `#` comment lines and blank lines contribute no entries.
    #[test]
    fn comment_and_blank_lines_skipped() {
        let tsv = "# comment\n\nกิน\tkin\n";
        let table = RomanizationMap::from_tsv(tsv);
        assert_eq!(table.len(), 1);
        assert_eq!(table.romanize("กิน"), Some("kin"));
    }

    /// A line with no tab separator is not an entry.
    #[test]
    fn line_without_tab_skipped() {
        let table = RomanizationMap::from_tsv("กิน\n");
        assert!(table.is_empty());
    }

    /// Padding around the romanization column is stripped.
    #[test]
    fn whitespace_trimmed_from_romanization() {
        let table = RomanizationMap::from_tsv("กิน\t kin \n");
        assert_eq!(table.romanize("กิน"), Some("kin"));
    }

    /// Parsing the empty string gives an empty table.
    #[test]
    fn empty_input_produces_empty_map() {
        let table = RomanizationMap::from_tsv("");
        assert!(table.is_empty());
    }

    /// Romanizing zero tokens gives zero results.
    #[test]
    fn romanize_tokens_empty_slice() {
        let table = RomanizationMap::builtin();
        let romanized = table.romanize_tokens(&[]);
        assert!(romanized.is_empty());
    }
}