kham_core/normalizer.rs
1//! Thai text normalizer.
2//!
3//! Applies two transformations in order:
4//!
5//! 1. **วรรณยุกต์ dedup** — consecutive tone marks on the same consonant are
6//! collapsed to the last one. This handles accidental double-keystrokes
7//! (e.g. อ่ อ้ → อ้) as well as identical repetitions (อ่ อ่ → อ่).
8//!
9//! 2. **Sara Am composition** — the two-character sequence nikhahit (อํ
10//! U+0E4D) + sara aa (อา U+0E32) is composed into the single sara am
11//! character (อำ U+0E33), as Unicode intends.
12//!
13//! ## Why สระลอย reorder is not included
14//!
15//! Reordering a misplaced leading vowel (เ แ โ ใ ไ) requires knowing whether
16//! that vowel belongs to the consonant *before* it or the consonant *after* it
17//! in the code stream. In correctly encoded Thai text the sequence
18//! `consonant + lead_vowel` is common at word boundaries (e.g. ว + โ in
19//! "ชาวโลก"), and a simple look-ahead cannot distinguish that from a truly
20//! misplaced vowel without full TCC-level analysis. Correct TCC analysis
21//! requires the same character predicates used here, creating a dependency
22//! cycle. Applications that need สระลอย correction should pre-process the
23//! text with a dedicated TCC-aware utility before calling [`normalize`].
24//!
25//! ## NFC note
26//!
//! Full Unicode NFC normalisation is not applied. No Thai character has a
//! canonical decomposition (sara am U+0E33 decomposes only under the
//! compatibility mappings), so on Thai text NFC can do no more than reorder
//! adjacent combining marks: the tone marks, sara u / sara uu and phinthu
//! carry non-zero canonical combining classes, the rest are class 0.
//! The two rules above cover all practically observed Thai normalisation
//! issues. Mixed-script Latin text is passed through unchanged; callers
//! that require full NFC on Latin portions should pre-process the text
//! with a Unicode normalisation library before calling [`normalize`].
33
34use alloc::string::String;
35use alloc::vec::Vec;
36
37// ---------------------------------------------------------------------------
38// Thai character predicates (local — avoids coupling to tcc module)
39// ---------------------------------------------------------------------------
40
/// Reports whether `c` is one of the four Thai tone marks — mai ek, mai tho,
/// mai tri or mai chattawa — which occupy the contiguous range
/// U+0E48 through U+0E4B.
#[inline]
fn is_tone(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
46
47// ---------------------------------------------------------------------------
48// Public API
49// ---------------------------------------------------------------------------
50
51/// Normalise Thai text into canonical form.
52///
53/// Returns an owned [`String`] with both transformations applied.
54/// ASCII and non-Thai characters are passed through unchanged.
55///
56/// # Examples
57///
58/// ```rust
59/// use kham_core::normalizer::normalize;
60///
61/// // 1. วรรณยุกต์ dedup: double tone mark → single (keep last)
62/// let doubled = "\u{0E01}\u{0E48}\u{0E49}"; // ก + อ่ + อ้
63/// let fixed = normalize(doubled);
64/// assert_eq!(fixed, "\u{0E01}\u{0E49}"); // ก้ only
65/// ```
66///
67/// ```rust
68/// use kham_core::normalizer::normalize;
69///
70/// // 2. Sara Am composition: nikhahit + sara aa → sara am
71/// let decomposed = "\u{0E01}\u{0E4D}\u{0E32}"; // ก + อํ + อา
72/// let fixed = normalize(decomposed);
73/// assert_eq!(fixed, "\u{0E01}\u{0E33}"); // กำ
74/// ```
75pub fn normalize(text: &str) -> String {
76 if text.is_empty() {
77 return String::new();
78 }
79
80 // Collect into a char vec for O(1) indexed look-ahead.
81 let chars: Vec<char> = text.chars().collect();
82 let n = chars.len();
83 let mut out = String::with_capacity(text.len());
84 let mut i = 0;
85
86 while i < n {
87 let c = chars[i];
88
89 // ── Rule 1: วรรณยุกต์ dedup ───────────────────────────────────────
90 // Consume a run of consecutive tone marks, keep only the last one.
91 // Rationale: the last key pressed is the most likely intended mark.
92 if is_tone(c) {
93 let mut last = c;
94 while i + 1 < n && is_tone(chars[i + 1]) {
95 i += 1;
96 last = chars[i];
97 }
98 out.push(last);
99 i += 1;
100 continue;
101 }
102
103 // ── Rule 2: Sara Am composition ───────────────────────────────────
104 // Nikhahit (อํ U+0E4D) + Sara Aa (อา U+0E32) → Sara Am (อำ U+0E33)
105 if c == '\u{0E4D}' && i + 1 < n && chars[i + 1] == '\u{0E32}' {
106 out.push('\u{0E33}'); // อำ
107 i += 2;
108 continue;
109 }
110
111 out.push(c);
112 i += 1;
113 }
114
115 out
116}
117
118// ---------------------------------------------------------------------------
119// Tests
120// ---------------------------------------------------------------------------
121
#[cfg(test)]
mod tests {
    use super::*;

    // ── Inputs that must come back untouched ─────────────────────────────────

    #[test]
    fn empty_string() {
        let empty = "";
        assert_eq!(normalize(empty), empty);
    }

    #[test]
    fn ascii_passthrough() {
        let ascii = "hello 123";
        assert_eq!(normalize(ascii), ascii);
    }

    #[test]
    fn correctly_encoded_thai_unchanged() {
        // Well-formed Thai with no repeated tone marks and no decomposed
        // sara am must be returned byte-for-byte identical.
        for word in ["กินข้าว", "สวัสดี", "สวัสดีชาวโลก", "ธนาคารแห่งนั้น"] {
            assert_eq!(normalize(word), word);
        }
    }

    #[test]
    fn mixed_script_passthrough() {
        let mixed = "ธนาคาร100แห่ง";
        assert_eq!(normalize(mixed), mixed);
    }

    // ── Rule 1: tone-mark dedup ──────────────────────────────────────────────

    #[test]
    fn duplicate_same_tone_removed() {
        // ko kai + mai ek + mai ek → ko kai + mai ek
        assert_eq!(normalize("\u{0E01}\u{0E48}\u{0E48}"), "\u{0E01}\u{0E48}");
    }

    #[test]
    fn different_tone_keeps_last() {
        // ko kai + mai ek + mai tho → ko kai + mai tho (the later mark wins)
        assert_eq!(normalize("\u{0E01}\u{0E48}\u{0E49}"), "\u{0E01}\u{0E49}");
    }

    #[test]
    fn triple_tone_keeps_last() {
        // A run of three different tone marks collapses to the third.
        assert_eq!(
            normalize("\u{0E01}\u{0E48}\u{0E49}\u{0E4A}"),
            "\u{0E01}\u{0E4A}"
        );
    }

    #[test]
    fn single_tone_unchanged() {
        // ko kai + mai tho: nothing to collapse.
        let single = "\u{0E01}\u{0E49}";
        assert_eq!(normalize(single), single);
    }

    #[test]
    fn tone_in_real_word() {
        // kho khai + mai tho + mai tho + sara aa + wo waen
        //   → kho khai + mai tho + sara aa + wo waen
        let typo = "\u{0E02}\u{0E49}\u{0E49}\u{0E32}\u{0E27}";
        let word = "\u{0E02}\u{0E49}\u{0E32}\u{0E27}";
        assert_eq!(normalize(typo), word);
    }

    // ── Rule 2: sara am composition ──────────────────────────────────────────

    #[test]
    fn nikhahit_plus_sara_aa_composed() {
        // nikhahit (U+0E4D) + sara aa (U+0E32) → sara am (U+0E33)
        assert_eq!(normalize("\u{0E4D}\u{0E32}"), "\u{0E33}");
    }

    #[test]
    fn sara_am_in_word_context() {
        // ko kai + nikhahit + sara aa → ko kai + sara am
        assert_eq!(normalize("\u{0E01}\u{0E4D}\u{0E32}"), "\u{0E01}\u{0E33}");
    }

    #[test]
    fn nikhahit_without_sara_aa_unchanged() {
        // A nikhahit that is not followed by sara aa is left alone.
        let lone = "\u{0E01}\u{0E4D}\u{0E01}";
        assert_eq!(normalize(lone), lone);
    }

    #[test]
    fn already_sara_am_unchanged() {
        // An already composed sara am (U+0E33) must not be rewritten.
        let composed = "\u{0E01}\u{0E33}";
        assert_eq!(normalize(composed), composed);
    }

    #[test]
    fn nam_decomposed_composes_to_nam() {
        // no nu + mai tho + nikhahit + sara aa → no nu + mai tho + sara am
        assert_eq!(
            normalize("\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"),
            "\u{0E19}\u{0E49}\u{0E33}"
        );
    }

    // ── Output length invariant ──────────────────────────────────────────────

    #[test]
    fn output_length_never_exceeds_input_length() {
        // Both rules only drop or merge characters, so the character count
        // of the output may shrink or stay equal but must never increase.
        let inputs = [
            "กินข้าว",
            "สวัสดีชาวโลก",
            "\u{0E01}\u{0E48}\u{0E48}", // shrinks through dedup
            "\u{0E4D}\u{0E32}",         // shrinks through composition
        ];
        for s in inputs {
            let out = normalize(s);
            let before = s.chars().count();
            let after = out.chars().count();
            assert!(
                after <= before,
                "normalize grew the string: {s:?} → {out:?}"
            );
        }
    }
}
253}