kham_core/romanizer.rs
1//! RTGS romanization of segmented Thai words.
2//!
3//! [`RomanizationMap`] maps pre-segmented Thai words to their Roman (Latin)
4//! phonetic equivalents using the Royal Thai General System of Transcription
5//! (RTGS) — the Thai government standard used in road signs, passports, and
6//! official documents.
7//!
8//! This implementation is **table-driven only**. A rule-based phonetic engine
9//! is a future `#[cfg(feature = "phonetic")]` extension.
10//!
11//! # RTGS characteristics
12//!
13//! - Consonant-by-consonant transliteration (initial vs. final position differ)
14//! - No tone marks in output
15//! - No vowel-length distinction (อิ and อี both map to `i`)
16//! - Diphthongs and vowel clusters have explicit multi-character mappings
17//!
18//! # Data format
19//!
20//! Tab-separated text file, one entry per line:
21//!
22//! ```text
23//! # Thai word<TAB>RTGS romanization
24//! กิน<TAB>kin
25//! ข้าว<TAB>khao
26//! ปลา<TAB>pla
27//! ```
28//!
29//! Lines beginning with `#` and blank lines are ignored.
30//! Duplicate keys: last entry wins (allows override files).
31//!
32//! # Example
33//!
34//! ```rust
35//! use kham_core::romanizer::RomanizationMap;
36//!
37//! let map = RomanizationMap::builtin();
38//! assert_eq!(map.romanize("กิน"), Some("kin"));
39//! assert_eq!(map.romanize_or_raw("ข้าว"), "khao");
40//! assert_eq!(map.romanize_or_raw("xyz"), "xyz");
41//!
42//! let tokens = vec!["กิน", "ข้าว", "ปลา"];
43//! assert_eq!(map.romanize_tokens(&tokens), vec!["kin", "khao", "pla"]);
44//! ```
45
46use alloc::collections::BTreeMap;
47use alloc::string::String;
48use alloc::vec::Vec;
49
// Raw text of the bundled romanization table, embedded into the binary at
// compile time from `../data/romanization_th.tsv` (path relative to this
// source file). It is parsed by `RomanizationMap::from_tsv` whenever
// `RomanizationMap::builtin` is called.
static BUILTIN_ROMANIZATION: &str = include_str!("../data/romanization_th.tsv");
51
/// A Thai-word → RTGS-romanization lookup table.
///
/// Thin wrapper around a [`BTreeMap`] whose keys are pre-segmented Thai words
/// and whose values are their romanizations. Built from tab-separated data via
/// [`RomanizationMap::from_tsv`]; lookup is O(log n) in the number of entries.
///
/// The type derives [`Debug`], [`Clone`], [`PartialEq`] and [`Eq`] so a table
/// can be logged, duplicated, and compared against another table (two maps
/// are equal when they hold exactly the same word → romanization pairs).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RomanizationMap(BTreeMap<String, String>);
57
58impl RomanizationMap {
59 /// Load the built-in RTGS romanization table.
60 pub fn builtin() -> Self {
61 Self::from_tsv(BUILTIN_ROMANIZATION)
62 }
63
64 /// Parse a tab-separated romanization table.
65 ///
66 /// Format: `thai_word\trtgs_romanization` — one entry per line.
67 /// Lines beginning with `#` and blank lines are skipped.
68 /// For duplicate keys, the last entry wins.
69 pub fn from_tsv(data: &str) -> Self {
70 let mut map: BTreeMap<String, String> = BTreeMap::new();
71 for line in data.lines() {
72 let line = line.trim();
73 if line.is_empty() || line.starts_with('#') {
74 continue;
75 }
76 let mut parts = line.splitn(2, '\t');
77 let word = match parts.next() {
78 Some(w) if !w.is_empty() => String::from(w),
79 _ => continue,
80 };
81 let roman = match parts.next() {
82 Some(r) if !r.is_empty() => String::from(r.trim()),
83 _ => continue,
84 };
85 map.insert(word, roman);
86 }
87 RomanizationMap(map)
88 }
89
90 /// Look up the RTGS romanization for a pre-segmented Thai word.
91 ///
92 /// Returns `None` if the word is not in the table.
93 /// The returned `&str` borrows from the map — zero-copy for hits.
94 ///
95 /// # Example
96 ///
97 /// ```rust
98 /// use kham_core::romanizer::RomanizationMap;
99 ///
100 /// let map = RomanizationMap::from_tsv("กิน\tkin\n");
101 /// assert_eq!(map.romanize("กิน"), Some("kin"));
102 /// assert_eq!(map.romanize("xyz"), None);
103 /// ```
104 pub fn romanize(&self, word: &str) -> Option<&str> {
105 self.0.get(word).map(String::as_str)
106 }
107
108 /// Return the RTGS romanization for `word`, or `word` unchanged if not found.
109 ///
110 /// # Example
111 ///
112 /// ```rust
113 /// use kham_core::romanizer::RomanizationMap;
114 ///
115 /// let map = RomanizationMap::from_tsv("กิน\tkin\n");
116 /// assert_eq!(map.romanize_or_raw("กิน"), "kin");
117 /// assert_eq!(map.romanize_or_raw("xyz"), "xyz");
118 /// ```
119 pub fn romanize_or_raw<'a>(&'a self, word: &'a str) -> &'a str {
120 self.0.get(word).map(String::as_str).unwrap_or(word)
121 }
122
123 /// Romanize a slice of pre-segmented token strings.
124 ///
125 /// Returns a `Vec<String>` aligned 1:1 with the input slice. Tokens not
126 /// found in the table are returned unchanged (same behaviour as
127 /// [`romanize_or_raw`](Self::romanize_or_raw)).
128 ///
129 /// # Example
130 ///
131 /// ```rust
132 /// use kham_core::romanizer::RomanizationMap;
133 ///
134 /// let map = RomanizationMap::from_tsv("กิน\tkin\nปลา\tpla\n");
135 /// let out = map.romanize_tokens(&["กิน", "ปลา"]);
136 /// assert_eq!(out, vec!["kin", "pla"]);
137 /// ```
138 pub fn romanize_tokens(&self, tokens: &[&str]) -> Vec<String> {
139 tokens
140 .iter()
141 .map(|t| String::from(self.romanize_or_raw(t)))
142 .collect()
143 }
144
145 /// Number of entries in the map.
146 #[inline]
147 pub fn len(&self) -> usize {
148 self.0.len()
149 }
150
151 /// Return `true` if the map has no entries.
152 #[inline]
153 pub fn is_empty(&self) -> bool {
154 self.0.is_empty()
155 }
156}
157
158// ---------------------------------------------------------------------------
159// Tests
160// ---------------------------------------------------------------------------
161
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Two-entry fixture shared by the token tests.
    const TWO_WORDS: &str = "กิน\tkin\nปลา\tpla\n";

    // --- built-in table -----------------------------------------------------

    /// A handful of everyday words must be present in the bundled table.
    #[test]
    fn builtin_common_words() {
        let table = RomanizationMap::builtin();
        let expected = [
            ("กิน", "kin"),
            ("ข้าว", "khao"),
            ("น้ำ", "nam"),
            ("ปลา", "pla"),
        ];
        for &(thai, roman) in &expected {
            assert_eq!(table.romanize(thai), Some(roman));
        }
    }

    /// A word absent from the table yields `None` from `romanize`.
    #[test]
    fn unknown_word_returns_none() {
        let table = RomanizationMap::builtin();
        assert!(table.romanize("เปปซี่").is_none());
    }

    /// `romanize_or_raw` hands back its input when there is no entry.
    #[test]
    fn romanize_or_raw_fallback() {
        let table = RomanizationMap::builtin();
        let unknown = "เปปซี่";
        assert_eq!(table.romanize_or_raw(unknown), unknown);
    }

    /// `romanize_or_raw` returns the table value when there is an entry.
    #[test]
    fn romanize_or_raw_hit() {
        let table = RomanizationMap::builtin();
        let roman = table.romanize_or_raw("กิน");
        assert_eq!(roman, "kin");
    }

    // --- TSV parsing --------------------------------------------------------

    /// When a key appears twice, the later line overrides the earlier one.
    #[test]
    fn from_tsv_last_duplicate_wins() {
        let table = RomanizationMap::from_tsv("กิน\tkin\nกิน\tgin\n");
        assert_eq!(table.romanize("กิน"), Some("gin"));
    }

    /// Output of `romanize_tokens` lines up 1:1 with its input.
    #[test]
    fn romanize_tokens_aligned() {
        let table = RomanizationMap::from_tsv(TWO_WORDS);
        let input = ["กิน", "ปลา"];
        assert_eq!(table.romanize_tokens(&input), vec!["kin", "pla"]);
    }

    /// Tokens without an entry pass through `romanize_tokens` untouched.
    #[test]
    fn romanize_tokens_unknown_passthrough() {
        let table = RomanizationMap::from_tsv("กิน\tkin\n");
        let input = ["กิน", "xyz"];
        assert_eq!(table.romanize_tokens(&input), vec!["kin", "xyz"]);
    }

    /// `#` comment lines and blank lines contribute no entries.
    #[test]
    fn comment_and_blank_lines_skipped() {
        let table = RomanizationMap::from_tsv("# comment\n\nกิน\tkin\n");
        assert_eq!(table.len(), 1);
        assert_eq!(table.romanize("กิน"), Some("kin"));
    }

    /// A line with no tab separator is ignored.
    #[test]
    fn line_without_tab_skipped() {
        let table = RomanizationMap::from_tsv("กิน\n");
        assert!(table.is_empty());
    }

    /// Spaces around the romanization column are stripped.
    #[test]
    fn whitespace_trimmed_from_romanization() {
        let table = RomanizationMap::from_tsv("กิน\t kin \n");
        assert_eq!(table.romanize("กิน"), Some("kin"));
    }

    /// Parsing the empty string gives an empty table.
    #[test]
    fn empty_input_produces_empty_map() {
        let table = RomanizationMap::from_tsv("");
        assert!(table.is_empty());
    }

    /// An empty token slice produces an empty result vector.
    #[test]
    fn romanize_tokens_empty_slice() {
        let table = RomanizationMap::builtin();
        let out = table.romanize_tokens(&[]);
        assert!(out.is_empty());
    }
}