// kham_core/normalizer.rs
//! Thai text normalizer.
//!
//! Applies two transformations in order:
//!
//! 1. **วรรณยุกต์ dedup** — consecutive tone marks on the same consonant are
//!    collapsed to the last one. This handles accidental double-keystrokes
//!    (e.g. อ่ อ้ → อ้) as well as identical repetitions (อ่ อ่ → อ่).
//!
//! 2. **Sara Am composition** — the two-character sequence nikhahit (อํ
//!    U+0E4D) + sara aa (อา U+0E32) is composed into the single sara am
//!    character (อำ U+0E33), as Unicode intends.
//!
//! ## Why สระลอย reorder is not included
//!
//! Reordering a misplaced leading vowel (เ แ โ ใ ไ) requires knowing whether
//! that vowel belongs to the consonant *before* it or the consonant *after* it
//! in the code stream. In correctly encoded Thai text the sequence
//! `consonant + lead_vowel` is common at word boundaries (e.g. ว + โ in
//! "ชาวโลก"), and a simple look-ahead cannot distinguish that from a truly
//! misplaced vowel without full TCC-level analysis. Correct TCC analysis
//! requires the same character predicates used here, creating a dependency
//! cycle. Applications that need สระลอย correction should pre-process the
//! text with a dedicated TCC-aware utility before calling [`normalize`].
//!
//! ## NFC note
//!
//! Full Unicode NFC normalisation is not applied because Thai characters
//! have combining class 0 and do not participate in canonical decomposition.
//! The two rules above cover all practically observed Thai normalisation
//! issues. Mixed-script Latin text is passed through unchanged; callers
//! that require full NFC on Latin portions should pre-process the text
//! with a Unicode normalisation library before calling [`normalize`].

use alloc::string::String;
use alloc::vec::Vec;

// ---------------------------------------------------------------------------
// Thai character predicates (local — avoids coupling to tcc module)
// ---------------------------------------------------------------------------

/// Tone marks: อ่ อ้ อ๊ อ๋ (U+0E48–U+0E4B).
#[inline]
fn is_tone(c: char) -> bool {
    matches!(c, '\u{0E48}'..='\u{0E4B}')
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/// Normalise Thai text into canonical form.
///
/// Applies, in order:
///
/// 1. **วรรณยุกต์ dedup** — a run of consecutive tone marks collapses to its
///    last mark (the last key pressed is the most likely intended one).
/// 2. **Sara Am composition** — nikhahit (อํ U+0E4D) followed by sara aa
///    (อา U+0E32) becomes the single sara am character (อำ U+0E33).
///
/// Returns an owned [`String`] with both transformations applied.
/// ASCII and non-Thai characters are passed through unchanged; the output
/// never has more chars than the input.
///
/// Implementation note: a single streaming pass over a peekable char
/// iterator — one look-ahead char is all either rule needs, so no
/// intermediate `Vec<char>` buffer is allocated.
///
/// # Examples
///
/// ```rust
/// use kham_core::normalizer::normalize;
///
/// // 1. วรรณยุกต์ dedup: double tone mark → single (keep last)
/// let doubled = "\u{0E01}\u{0E48}\u{0E49}"; // ก + อ่ + อ้
/// assert_eq!(normalize(doubled), "\u{0E01}\u{0E49}"); // ก้ only
/// ```
///
/// ```rust
/// use kham_core::normalizer::normalize;
///
/// // 2. Sara Am composition: nikhahit + sara aa → sara am
/// let decomposed = "\u{0E01}\u{0E4D}\u{0E32}"; // ก + อํ + อา
/// assert_eq!(normalize(decomposed), "\u{0E01}\u{0E33}"); // กำ
/// ```
pub fn normalize(text: &str) -> String {
    // Worst case the output is the same length as the input (both rules
    // only shrink); reserving up-front avoids regrow-and-copy.
    let mut out = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();

    while let Some(c) = chars.next() {
        // ── Rule 1: วรรณยุกต์ dedup ───────────────────────────────────────
        // Consume a run of consecutive tone marks, keep only the last one.
        // Rationale: the last key pressed is the most likely intended mark.
        if is_tone(c) {
            let mut last = c;
            while let Some(&next) = chars.peek() {
                if !is_tone(next) {
                    break;
                }
                last = next;
                chars.next();
            }
            out.push(last);
            continue;
        }

        // ── Rule 2: Sara Am composition ───────────────────────────────────
        // Nikhahit (อํ U+0E4D) + Sara Aa (อา U+0E32)  →  Sara Am (อำ U+0E33)
        if c == '\u{0E4D}' && chars.peek() == Some(&'\u{0E32}') {
            chars.next(); // consume the sara aa
            out.push('\u{0E33}'); // อำ
            continue;
        }

        out.push(c);
    }

    out
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `normalize` leaves an already-canonical string untouched.
    fn assert_unchanged(s: &str) {
        assert_eq!(normalize(s), s);
    }

    // ── passthrough ───────────────────────────────────────────────────────────

    #[test]
    fn empty_string() {
        assert_unchanged("");
    }

    #[test]
    fn ascii_passthrough() {
        assert_unchanged("hello 123");
    }

    #[test]
    fn correctly_encoded_thai_unchanged() {
        // Well-formed Thai — no tone duplicates or decomposed Sara Am —
        // must round-trip byte-for-byte.
        for s in ["กินข้าว", "สวัสดี", "สวัสดีชาวโลก", "ธนาคารแห่งนั้น"] {
            assert_unchanged(s);
        }
    }

    #[test]
    fn mixed_script_passthrough() {
        assert_unchanged("ธนาคาร100แห่ง");
    }

    // ── Rule 1: วรรณยุกต์ dedup ──────────────────────────────────────────────

    #[test]
    fn duplicate_same_tone_removed() {
        // ก + อ่ + อ่ → ก + อ่
        assert_eq!(normalize("\u{0E01}\u{0E48}\u{0E48}"), "\u{0E01}\u{0E48}");
    }

    #[test]
    fn different_tone_keeps_last() {
        // ก + อ่ (low) + อ้ (falling) → ก + อ้  (last keystroke wins)
        assert_eq!(normalize("\u{0E01}\u{0E48}\u{0E49}"), "\u{0E01}\u{0E49}");
    }

    #[test]
    fn triple_tone_keeps_last() {
        // Three consecutive tone marks → keep the last
        assert_eq!(
            normalize("\u{0E01}\u{0E48}\u{0E49}\u{0E4A}"),
            "\u{0E01}\u{0E4A}"
        );
    }

    #[test]
    fn single_tone_unchanged() {
        // ก้ — a lone tone mark is not a duplicate
        assert_unchanged("\u{0E01}\u{0E49}");
    }

    #[test]
    fn tone_in_real_word() {
        // ข + อ้ + อ้ + า + ว (ข้าว with doubled อ้) → ข้าว
        assert_eq!(
            normalize("\u{0E02}\u{0E49}\u{0E49}\u{0E32}\u{0E27}"),
            "\u{0E02}\u{0E49}\u{0E32}\u{0E27}"
        );
    }

    // ── Rule 2: Sara Am composition ───────────────────────────────────────────

    #[test]
    fn nikhahit_plus_sara_aa_composed() {
        // อํ (U+0E4D) + อา (U+0E32) → อำ (U+0E33)
        assert_eq!(normalize("\u{0E4D}\u{0E32}"), "\u{0E33}");
    }

    #[test]
    fn sara_am_in_word_context() {
        // ก + อํ + อา → กำ
        assert_eq!(normalize("\u{0E01}\u{0E4D}\u{0E32}"), "\u{0E01}\u{0E33}");
    }

    #[test]
    fn nikhahit_without_sara_aa_unchanged() {
        // Lone nikhahit (อํ) not followed by sara aa — no composition.
        assert_unchanged("\u{0E01}\u{0E4D}\u{0E01}");
    }

    #[test]
    fn already_sara_am_unchanged() {
        // Precomposed อำ (U+0E33) must not be double-processed.
        assert_unchanged("\u{0E01}\u{0E33}");
    }

    #[test]
    fn nam_decomposed_composes_to_nam() {
        // น้ำ decomposed: น + ้ + อํ + อา → น้ำ (น + ้ + อำ)
        assert_eq!(
            normalize("\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"),
            "\u{0E19}\u{0E49}\u{0E33}"
        );
    }

    // ── output length invariant ────────────────────────────────────────────────

    #[test]
    fn output_length_never_exceeds_input_length() {
        // Both rules delete or merge characters; neither ever inserts.
        for s in [
            "กินข้าว",
            "สวัสดีชาวโลก",
            "\u{0E01}\u{0E48}\u{0E48}", // dedup (shrinks)
            "\u{0E4D}\u{0E32}",         // composition (shrinks)
        ] {
            let out = normalize(s);
            assert!(
                out.chars().count() <= s.chars().count(),
                "normalize grew the string: {s:?} → {out:?}"
            );
        }
    }
}
253}