kham_core/
tcc.rs

//! Thai Character Cluster (TCC) boundary detection.
//!
//! Implements the TCC rules from Theeramunkong et al. (2000).
//! A TCC is the smallest indivisible Thai orthographic unit — roughly
//! one leading vowel + one consonant + its upper vowels + tone mark + trailing vowel.
//!
//! ## Pattern (simplified)
//! ```text
//! TCC = LEAD? CONSONANT UPPER* TONE* (THANTHAKAT | FOLLOW | NIKHAHIT)?
//!     | THAI_OTHER
//!     | NON_THAI+
//! ```
//!
//! `THAI_OTHER` is any single Thai-block character that cannot start a
//! consonant-based cluster (for example a Thai digit, or a leading vowel that
//! is not followed by a consonant); it forms a one-character TCC.
//!
//! TCC segmentation is used as a pre-pass by the main segmenter to ensure
//! that word boundaries always fall on TCC boundaries.
15
16use alloc::vec;
17use alloc::vec::Vec;
18
19// ---------------------------------------------------------------------------
20// Unicode character classification
21// ---------------------------------------------------------------------------
22
/// Returns `true` for the Thai consonants ก–ฮ (U+0E01–U+0E2E).
///
/// The range also covers the vowel-consonants ฤ (U+0E24) and ฦ (U+0E26),
/// so both are reported as consonants.
#[inline]
fn is_consonant(c: char) -> bool {
    ('\u{0E01}'..='\u{0E2E}').contains(&c)
}
28
/// Returns `true` for the leading vowels, which are written *before* the
/// consonant they belong to: เ แ โ ใ ไ (U+0E40–U+0E44).
#[inline]
fn is_lead_vowel(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}
34
/// Returns `true` for combining vowels and signs that attach directly to the
/// base consonant and are consumed right after it:
///
/// * U+0E31 (อั),
/// * U+0E34–U+0E3A (อิ อี อึ อื อุ อู อฺ),
/// * U+0E47 (อ็, mai taikhu).
///
/// Mai taikhu is included so that a word such as "เป็น" does not get a
/// cluster boundary between the consonant and its combining mark.
#[inline]
fn is_upper_vowel(c: char) -> bool {
    matches!(c, '\u{0E31}' | '\u{0E34}'..='\u{0E3A}' | '\u{0E47}')
}
41
/// Returns `true` for the four tone marks อ่ อ้ อ๊ อ๋ (U+0E48–U+0E4B).
#[inline]
fn is_tone(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
47
/// Returns `true` for thanthakat ์ (U+0E4C), the mark that silences a consonant.
#[inline]
fn is_thanthakat(c: char) -> bool {
    matches!(c, '\u{0E4C}')
}
53
/// Returns `true` for nikhahit อํ (U+0E4D), the upper component of Sara Am อำ.
#[inline]
fn is_nikhahit(c: char) -> bool {
    matches!(c, '\u{0E4D}')
}
59
/// Returns `true` for the follow (trailing) vowels written after the
/// consonant: อะ (U+0E30), อา (U+0E32) and อำ (U+0E33).
///
/// U+0E31 sits between them in the block but is an upper vowel, so it is
/// deliberately excluded here.
#[inline]
fn is_follow_vowel(c: char) -> bool {
    matches!(c, '\u{0E30}' | '\u{0E32}' | '\u{0E33}')
}
66
/// Returns `true` for any code point in the Thai Unicode block
/// (U+0E00–U+0E7F), whether or not it is an assigned character.
#[inline]
fn is_thai(c: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&c)
}
72
73// ---------------------------------------------------------------------------
74// Cursor — encapsulates offset arithmetic for the scanner
75// ---------------------------------------------------------------------------
76
/// A forward-only cursor over the characters of a `&str` slice.
///
/// The cursor walks `text[pos..]` but reports its position as a byte offset
/// into the *whole* string, so `end` can be used directly to slice the
/// original text.
struct Cursor<'a> {
    /// Remaining characters, paired with byte offsets relative to `base`.
    chars: core::iter::Peekable<core::str::CharIndices<'a>>,
    /// Byte offset in the original string at which scanning started.
    base: usize,
    /// Byte offset of the first character **not yet consumed**, relative to
    /// the original string. Updated by every call to [`Cursor::advance`].
    end: usize,
}

impl<'a> Cursor<'a> {
    /// Create a cursor positioned at byte offset `pos` of `text`.
    ///
    /// # Panics
    ///
    /// Panics if `pos` is greater than `text.len()` or does not lie on a
    /// UTF-8 character boundary, because `text[pos..]` is sliced directly.
    fn new(text: &'a str, pos: usize) -> Self {
        let tail = &text[pos..];
        Cursor {
            chars: tail.char_indices().peekable(),
            base: pos,
            end: pos,
        }
    }

    /// Peek at the next character without consuming it.
    #[inline]
    fn peek(&mut self) -> Option<char> {
        let &(_, ch) = self.chars.peek()?;
        Some(ch)
    }

    /// Consume the next character, move `end` just past it, and return it.
    ///
    /// Returns `None` (leaving `end` untouched) once the input is exhausted.
    #[inline]
    fn advance(&mut self) -> Option<char> {
        match self.chars.next() {
            Some((rel, ch)) => {
                // `rel` is relative to the sub-slice; rebase it onto the original string.
                self.end = self.base + rel + ch.len_utf8();
                Some(ch)
            }
            None => None,
        }
    }

    /// Consume the next character only if `pred` returns `true` for it.
    ///
    /// Returns whether a character was consumed.
    #[inline]
    fn advance_if(&mut self, pred: impl Fn(char) -> bool) -> bool {
        let accept = match self.peek() {
            Some(ch) => pred(ch),
            None => false,
        };
        if accept {
            self.advance();
        }
        accept
    }

    /// Consume characters for as long as `pred` holds.
    #[inline]
    fn advance_while(&mut self, pred: impl Fn(char) -> bool) {
        while self.advance_if(&pred) {}
    }
}
130
131// ---------------------------------------------------------------------------
132// Thai TCC sub-scanners
133// ---------------------------------------------------------------------------
134
135/// Consume a maximal run of non-Thai characters (one non-Thai TCC).
136fn scan_non_thai(cur: &mut Cursor<'_>) {
137    cur.advance_while(|c| !is_thai(c));
138}
139
140/// Consume the TCC "head": optional leading vowel + required consonant.
141///
142/// `first` is the character already consumed from `cur`.
143/// Returns the base consonant, or `None` if `first` starts no valid Thai TCC
144/// (lone leading vowel with nothing after it, or a lone non-consonant Thai char).
145fn scan_head(cur: &mut Cursor<'_>, first: char) -> Option<char> {
146    if is_lead_vowel(first) {
147        // Leading vowel must be immediately followed by a consonant.
148        match cur.peek() {
149            Some(c) if is_consonant(c) => {
150                cur.advance();
151                Some(c)
152            }
153            // Lone leading vowel — ends the TCC right here.
154            _ => None,
155        }
156    } else if is_consonant(first) {
157        Some(first)
158    } else {
159        // Lone Thai non-consonant (digit, punctuation …) — single-char TCC.
160        None
161    }
162}
163
164/// Consume zero or more upper vowels / diacritic signs above the consonant.
165fn scan_upper_vowels(cur: &mut Cursor<'_>) {
166    cur.advance_while(is_upper_vowel);
167}
168
169/// Consume tone marks. Swallows duplicates that appear in malformed input.
170fn scan_tone_marks(cur: &mut Cursor<'_>) {
171    cur.advance_while(is_tone);
172}
173
174/// Consume the optional trailing diacritic: ์, อะ, อา, อำ, or อํ.
175fn scan_trailing(cur: &mut Cursor<'_>) {
176    cur.advance_if(|c| is_thanthakat(c) || is_follow_vowel(c) || is_nikhahit(c));
177}
178
179// ---------------------------------------------------------------------------
180// Core TCC scanner
181// ---------------------------------------------------------------------------
182
183/// Scan one TCC starting at `pos` in `text` and return the byte offset of
184/// the first character *after* the TCC.
185///
186/// Returns `None` only when `pos >= text.len()`.
187fn scan_one_tcc(text: &str, pos: usize) -> Option<usize> {
188    let mut cur = Cursor::new(text, pos);
189    let first = cur.advance()?;
190
191    // Non-Thai run → one flat TCC.
192    if !is_thai(first) {
193        scan_non_thai(&mut cur);
194        return Some(cur.end);
195    }
196
197    // Thai TCC: LEAD? CONSONANT UPPER* TONE? TRAIL?
198    let consonant = match scan_head(&mut cur, first) {
199        Some(c) => c,
200        // Lone leading vowel or non-consonant Thai char — TCC ends here.
201        None => return Some(cur.end),
202    };
203
204    // ฤ (U+0E24) and ฦ (U+0E26) are standalone vowel-consonants; nothing attaches.
205    if !matches!(consonant, '\u{0E24}' | '\u{0E26}') {
206        scan_upper_vowels(&mut cur);
207        scan_tone_marks(&mut cur);
208        scan_trailing(&mut cur);
209    }
210
211    Some(cur.end)
212}
213
214// ---------------------------------------------------------------------------
215// Public API
216// ---------------------------------------------------------------------------
217
218/// Return the byte offsets of every TCC boundary in `text`.
219///
220/// The returned slice always starts with `0` and ends with `text.len()`.
221/// Slicing `text` with consecutive pairs of offsets gives the individual TCCs.
222///
223/// # Examples
224///
225/// ```rust
226/// use kham_core::tcc::tcc_boundaries;
227///
228/// // "กิน" — กิ is one TCC (ก + อิ), น is another
229/// let bounds = tcc_boundaries("กิน");
230/// assert_eq!(bounds, vec![0, 6, 9]); // กิ = 6 bytes, น = 3 bytes
231/// assert_eq!(*bounds.first().unwrap(), 0);
232/// assert_eq!(*bounds.last().unwrap(), "กิน".len());
233/// ```
234///
235/// Consecutive boundaries slice directly into the original string:
236///
237/// ```rust
238/// use kham_core::tcc::tcc_boundaries;
239///
240/// let text = "กินข้าว";
241/// let bounds = tcc_boundaries(text);
242/// let tccs: Vec<&str> = bounds.windows(2).map(|w| &text[w[0]..w[1]]).collect();
243/// assert_eq!(tccs.join(""), text); // round-trip is lossless
244/// assert!(tccs.len() >= 2);        // at least กิ and นข้ / า / ว
245/// ```
246///
247/// Mixed script: a Latin run is one non-Thai TCC; Thai chars each follow TCC rules:
248///
249/// ```rust
250/// use kham_core::tcc::tcc_boundaries;
251///
252/// let bounds = tcc_boundaries("hiสวัสดี");
253/// assert_eq!(bounds[0], 0);
254/// assert_eq!(bounds[1], 2); // "hi" = 2 ASCII bytes
255/// assert_eq!(*bounds.last().unwrap(), "hiสวัสดี".len());
256/// ```
257///
258/// Tone marks, upper vowels, and trailing vowels group with their consonant:
259///
260/// ```rust
261/// use kham_core::tcc::tcc_boundaries;
262///
263/// // "เก้" — lead vowel เ + ก + tone ้ → one TCC
264/// assert_eq!(tcc_boundaries("เก้").len(), 2); // [0, 9] → 1 TCC of 9 bytes
265///
266/// // "กำ" — ก + Sara Am อำ → one TCC
267/// assert_eq!(tcc_boundaries("กำ").len(), 2);
268/// ```
269pub fn tcc_boundaries(text: &str) -> Vec<usize> {
270    if text.is_empty() {
271        return vec![0];
272    }
273
274    let mut bounds = Vec::with_capacity(text.len() / 3 + 2);
275    bounds.push(0);
276
277    let mut pos = 0;
278    while pos < text.len() {
279        match scan_one_tcc(text, pos) {
280            Some(next) if next > pos => {
281                bounds.push(next);
282                pos = next;
283            }
284            // Safety net: advance by one UTF-8 char to avoid infinite loop.
285            _ => {
286                let next = text[pos..]
287                    .char_indices()
288                    .nth(1)
289                    .map(|(i, _)| pos + i)
290                    .unwrap_or(text.len());
291                bounds.push(next);
292                pos = next;
293            }
294        }
295    }
296
297    bounds
298}
299
300/// Iterate over the TCCs in `text` as `&str` slices.
301///
302/// # Examples
303///
304/// ```rust
305/// use kham_core::tcc::tcc_iter;
306///
307/// // "เกม": เก (lead vowel เ + consonant ก) is TCC 1, ม is TCC 2
308/// let tccs: Vec<&str> = tcc_iter("เกม").collect();
309/// assert_eq!(tccs, vec!["เก", "ม"]);
310/// ```
311///
312/// All TCCs joined reconstruct the original string:
313///
314/// ```rust
315/// use kham_core::tcc::tcc_iter;
316///
317/// let text = "สวัสดีชาวโลก";
318/// let joined: String = tcc_iter(text).collect();
319/// assert_eq!(joined, text);
320/// ```
321///
322/// Counts give the segmenter its candidate split-point count before the DP:
323///
324/// ```rust
325/// use kham_core::tcc::tcc_iter;
326///
327/// // "กิน" has 2 TCCs; "กินข้าว" has more
328/// assert_eq!(tcc_iter("กิน").count(), 2);
329/// assert!(tcc_iter("กินข้าว").count() >= 4);
330/// ```
331pub fn tcc_iter(text: &str) -> impl Iterator<Item = &str> {
332    TccIter { text, pos: 0 }
333}
334
335struct TccIter<'a> {
336    text: &'a str,
337    pos: usize,
338}
339
340impl<'a> Iterator for TccIter<'a> {
341    type Item = &'a str;
342
343    fn next(&mut self) -> Option<Self::Item> {
344        if self.pos >= self.text.len() {
345            return None;
346        }
347        let end = scan_one_tcc(self.text, self.pos)?;
348        let slice = &self.text[self.pos..end];
349        self.pos = end;
350        Some(slice)
351    }
352}
353
354// ---------------------------------------------------------------------------
355// Tests
356// ---------------------------------------------------------------------------
357
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Collect the TCCs of `text` into a vector for easy comparison.
    fn tccs(text: &str) -> Vec<&str> {
        tcc_iter(text).collect()
    }

    #[test]
    fn empty() {
        assert_eq!(tcc_boundaries(""), vec![0]);
        assert_eq!(tccs(""), Vec::<&str>::new());
    }

    #[test]
    fn single_consonant() {
        // ก = U+0E01, 3 bytes
        assert_eq!(tccs("ก"), vec!["ก"]);
    }

    #[test]
    fn consonant_upper_vowel() {
        // กิ = ก (U+0E01) + อิ (U+0E34) = 6 bytes → 1 TCC
        assert_eq!(tccs("กิ"), vec!["กิ"]);
    }

    #[test]
    fn consonant_upper_tone() {
        // กิ้ = ก + อิ + ้ = 9 bytes → 1 TCC
        assert_eq!(tccs("กิ้"), vec!["กิ้"]);
    }

    #[test]
    fn two_consonants() {
        // กน → 2 TCCs
        assert_eq!(tccs("กน"), vec!["ก", "น"]);
    }

    #[test]
    fn gin_two_tccs() {
        // กิน → กิ (TCC1) + น (TCC2)
        assert_eq!(tccs("กิน"), vec!["กิ", "น"]);
        let b = tcc_boundaries("กิน");
        assert_eq!(b, vec![0, 6, 9]);
    }

    #[test]
    fn lead_vowel() {
        // เก = เ + ก → 1 TCC (lead vowel attaches to following consonant)
        assert_eq!(tccs("เก"), vec!["เก"]);
    }

    #[test]
    fn lead_vowel_with_tone() {
        // เก้ = เ + ก + ้
        assert_eq!(tccs("เก้"), vec!["เก้"]);
    }

    #[test]
    fn follow_vowel_aa() {
        // กา = ก + อา → 1 TCC
        assert_eq!(tccs("กา"), vec!["กา"]);
    }

    #[test]
    fn follow_vowel_sara_am() {
        // กำ = ก + อำ → 1 TCC
        assert_eq!(tccs("กำ"), vec!["กำ"]);
    }

    #[test]
    fn thanthakat() {
        // กร์ = ก + ร + ์ → but ก and ร are separate consonants so:
        // ก (TCC1), ร์ (TCC2 — ร + thanthakat)
        assert_eq!(tccs("กร์"), vec!["ก", "ร์"]);
    }

    #[test]
    fn non_thai_run() {
        // "hello" → single non-Thai TCC
        assert_eq!(tccs("hello"), vec!["hello"]);
    }

    #[test]
    fn mixed_script() {
        // "hi" + กิน → ["hi", "กิ", "น"]
        assert_eq!(tccs("hiกิน"), vec!["hi", "กิ", "น"]);
    }

    #[test]
    fn thai_digit() {
        // ๑ (U+0E51) is a Thai digit — standalone TCC
        assert_eq!(tccs("๑"), vec!["๑"]);
    }

    #[test]
    fn sawasdee() {
        // สวัสดี — six code points:
        // ส (U+0E2A), ว (U+0E27), ั (U+0E31), ส (U+0E2A), ด (U+0E14), ี (U+0E35).
        // Each upper vowel attaches to the consonant before it, giving four TCCs:
        // ส | วั | ส | ดี
        let result = tccs("สวัสดี");
        assert_eq!(result, vec!["ส", "วั", "ส", "ดี"]);
        // Coverage: joining all TCCs gives back the original.
        assert_eq!(result.join(""), "สวัสดี");
    }

    #[test]
    fn boundary_coverage() {
        // Every boundary pair must be a valid UTF-8 slice of the original.
        let text = "ธนาคาร100แห่ง";
        let bounds = tcc_boundaries(text);
        // First and last are correct
        assert_eq!(bounds[0], 0);
        assert_eq!(*bounds.last().unwrap(), text.len());
        // All boundaries are valid char boundaries
        for &b in &bounds {
            assert!(
                text.is_char_boundary(b),
                "offset {b} is not a char boundary"
            );
        }
        // Joining the slices reconstructs the original
        let rebuilt: alloc::string::String = bounds.windows(2).map(|w| &text[w[0]..w[1]]).collect();
        assert_eq!(rebuilt, text);
    }
}
kham_core/tcc.rs

kham_core/
tcc.rs