kham_core/tcc.rs
1//! Thai Character Cluster (TCC) boundary detection.
2//!
3//! Implements the TCC rules from Theeramunkong et al. (2000).
4//! A TCC is the smallest indivisible Thai orthographic unit — roughly
5//! one leading vowel + one consonant + its upper vowels + tone mark + trailing vowel.
6//!
7//! ## Pattern (simplified)
8//! ```text
9//! TCC = LEAD? CONSONANT UPPER* TONE? (THANTHAKAT | FOLLOW | NIKHAHIT)?
10//! | NON_THAI+
11//! ```
12//!
13//! TCC segmentation is used as a pre-pass by the main segmenter to ensure
14//! that word boundaries always fall on TCC boundaries.
15
16use alloc::vec;
17use alloc::vec::Vec;
18
19// ---------------------------------------------------------------------------
20// Unicode character classification
21// ---------------------------------------------------------------------------
22
/// Returns `true` when `c` is a Thai consonant, ก through ฮ (U+0E01–U+0E2E).
///
/// The range already contains the vowel-consonants ฤ (U+0E24) and ฦ (U+0E26).
#[inline]
fn is_consonant(c: char) -> bool {
    ('\u{0E01}'..='\u{0E2E}').contains(&c)
}
28
/// Returns `true` for the vowels written *before* their consonant:
/// เ แ โ ใ ไ (U+0E40–U+0E44).
#[inline]
fn is_lead_vowel(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}
34
/// Combining vowels and signs that stack on the base consonant, above or below it.
///
/// Covers:
/// - อั (U+0E31) and อิ อี อึ อื (U+0E34–U+0E37), written above the consonant;
/// - อุ อู and phinthu อฺ (U+0E38–U+0E3A), written below the consonant;
/// - maitaikhu อ็ (U+0E47), the vowel-shortening sign written above.
///
/// Maitaikhu is a combining mark like the others. If it were not accepted
/// here, no scanner stage would attach it, and text such as "ก็" or "เป็น"
/// would be cut between the consonant and its mark, leaving the bare
/// diacritic as a cluster of its own.
#[inline]
fn is_upper_vowel(c: char) -> bool {
    matches!(c, '\u{0E31}' | '\u{0E34}'..='\u{0E3A}' | '\u{0E47}')
}
41
/// Returns `true` for the four tone marks อ่ อ้ อ๊ อ๋ (U+0E48–U+0E4B).
#[inline]
fn is_tone(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
47
/// Returns `true` for thanthakat ์ (U+0E4C), the mark that silences a consonant.
#[inline]
fn is_thanthakat(c: char) -> bool {
    matches!(c, '\u{0E4C}')
}
53
/// Returns `true` for nikhahit อํ (U+0E4D), the upper component of Sara Am อำ.
#[inline]
fn is_nikhahit(c: char) -> bool {
    matches!(c, '\u{0E4D}')
}
59
/// Returns `true` for the vowels written *after* their consonant:
/// อะ (U+0E30), อา (U+0E32) and อำ (U+0E33).
#[inline]
fn is_follow_vowel(c: char) -> bool {
    matches!(c, '\u{0E30}' | '\u{0E32}' | '\u{0E33}')
}
66
/// Returns `true` when `c` lies anywhere in the Thai Unicode block
/// (U+0E00–U+0E7F), whether or not the code point is assigned.
#[inline]
fn is_thai(c: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&c)
}
72
73// ---------------------------------------------------------------------------
74// Cursor — encapsulates offset arithmetic for the scanner
75// ---------------------------------------------------------------------------
76
/// Forward-only character cursor over the tail of a string.
///
/// The cursor walks `text[pos..]` but reports its position as a byte offset
/// into the full `text`, so callers never have to re-base offsets themselves.
struct Cursor<'a> {
    /// Remaining characters, each paired with its byte offset relative to `base`.
    chars: core::iter::Peekable<core::str::CharIndices<'a>>,
    /// Byte offset in the original string at which iteration started.
    base: usize,
    /// Byte offset in the original string just past the last consumed
    /// character. Equals `base` until [`Cursor::advance`] first succeeds.
    end: usize,
}

impl<'a> Cursor<'a> {
    /// Build a cursor positioned at byte offset `pos` of `text`.
    ///
    /// Slicing panics if `pos` is past the end of `text` or not on a char
    /// boundary.
    fn new(text: &'a str, pos: usize) -> Self {
        let tail = &text[pos..];
        Self {
            chars: tail.char_indices().peekable(),
            base: pos,
            end: pos,
        }
    }

    /// Look at the upcoming character without consuming it.
    #[inline]
    fn peek(&mut self) -> Option<char> {
        let &(_, c) = self.chars.peek()?;
        Some(c)
    }

    /// Consume the upcoming character, move `end` past it, and return it.
    /// Returns `None` (leaving `end` untouched) once the input is exhausted.
    #[inline]
    fn advance(&mut self) -> Option<char> {
        self.chars.next().map(|(rel, c)| {
            // `rel` is relative to the slice start, so re-base it.
            self.end = self.base + rel + c.len_utf8();
            c
        })
    }

    /// Consume the upcoming character only when `pred` accepts it.
    /// Returns whether a character was consumed.
    #[inline]
    fn advance_if(&mut self, pred: impl Fn(char) -> bool) -> bool {
        let accepted = self.peek().map_or(false, |c| pred(c));
        if accepted {
            self.advance();
        }
        accepted
    }

    /// Keep consuming characters until `pred` rejects one or input runs out.
    #[inline]
    fn advance_while(&mut self, pred: impl Fn(char) -> bool) {
        loop {
            if !self.advance_if(&pred) {
                break;
            }
        }
    }
}
130
131// ---------------------------------------------------------------------------
132// Thai TCC sub-scanners
133// ---------------------------------------------------------------------------
134
135/// Consume a maximal run of non-Thai characters (one non-Thai TCC).
136fn scan_non_thai(cur: &mut Cursor<'_>) {
137 cur.advance_while(|c| !is_thai(c));
138}
139
140/// Consume the TCC "head": optional leading vowel + required consonant.
141///
142/// `first` is the character already consumed from `cur`.
143/// Returns the base consonant, or `None` if `first` starts no valid Thai TCC
144/// (lone leading vowel with nothing after it, or a lone non-consonant Thai char).
145fn scan_head(cur: &mut Cursor<'_>, first: char) -> Option<char> {
146 if is_lead_vowel(first) {
147 // Leading vowel must be immediately followed by a consonant.
148 match cur.peek() {
149 Some(c) if is_consonant(c) => {
150 cur.advance();
151 Some(c)
152 }
153 // Lone leading vowel — ends the TCC right here.
154 _ => None,
155 }
156 } else if is_consonant(first) {
157 Some(first)
158 } else {
159 // Lone Thai non-consonant (digit, punctuation …) — single-char TCC.
160 None
161 }
162}
163
164/// Consume zero or more upper vowels / diacritic signs above the consonant.
165fn scan_upper_vowels(cur: &mut Cursor<'_>) {
166 cur.advance_while(is_upper_vowel);
167}
168
169/// Consume tone marks. Swallows duplicates that appear in malformed input.
170fn scan_tone_marks(cur: &mut Cursor<'_>) {
171 cur.advance_while(is_tone);
172}
173
174/// Consume the optional trailing diacritic: ์, อะ, อา, อำ, or อํ.
175fn scan_trailing(cur: &mut Cursor<'_>) {
176 cur.advance_if(|c| is_thanthakat(c) || is_follow_vowel(c) || is_nikhahit(c));
177}
178
179// ---------------------------------------------------------------------------
180// Core TCC scanner
181// ---------------------------------------------------------------------------
182
183/// Scan one TCC starting at `pos` in `text` and return the byte offset of
184/// the first character *after* the TCC.
185///
186/// Returns `None` only when `pos >= text.len()`.
187fn scan_one_tcc(text: &str, pos: usize) -> Option<usize> {
188 let mut cur = Cursor::new(text, pos);
189 let first = cur.advance()?;
190
191 // Non-Thai run → one flat TCC.
192 if !is_thai(first) {
193 scan_non_thai(&mut cur);
194 return Some(cur.end);
195 }
196
197 // Thai TCC: LEAD? CONSONANT UPPER* TONE? TRAIL?
198 let consonant = match scan_head(&mut cur, first) {
199 Some(c) => c,
200 // Lone leading vowel or non-consonant Thai char — TCC ends here.
201 None => return Some(cur.end),
202 };
203
204 // ฤ (U+0E24) and ฦ (U+0E26) are standalone vowel-consonants; nothing attaches.
205 if !matches!(consonant, '\u{0E24}' | '\u{0E26}') {
206 scan_upper_vowels(&mut cur);
207 scan_tone_marks(&mut cur);
208 scan_trailing(&mut cur);
209 }
210
211 Some(cur.end)
212}
213
214// ---------------------------------------------------------------------------
215// Public API
216// ---------------------------------------------------------------------------
217
218/// Return the byte offsets of every TCC boundary in `text`.
219///
220/// The returned slice always starts with `0` and ends with `text.len()`.
221/// Slicing `text` with consecutive pairs of offsets gives the individual TCCs.
222///
223/// # Examples
224///
225/// ```rust
226/// use kham_core::tcc::tcc_boundaries;
227///
228/// // "กิน" — กิ is one TCC (ก + อิ), น is another
229/// let bounds = tcc_boundaries("กิน");
230/// assert_eq!(bounds, vec![0, 6, 9]); // กิ = 6 bytes, น = 3 bytes
231/// assert_eq!(*bounds.first().unwrap(), 0);
232/// assert_eq!(*bounds.last().unwrap(), "กิน".len());
233/// ```
234///
235/// Consecutive boundaries slice directly into the original string:
236///
237/// ```rust
238/// use kham_core::tcc::tcc_boundaries;
239///
240/// let text = "กินข้าว";
241/// let bounds = tcc_boundaries(text);
242/// let tccs: Vec<&str> = bounds.windows(2).map(|w| &text[w[0]..w[1]]).collect();
243/// assert_eq!(tccs.join(""), text); // round-trip is lossless
244/// assert!(tccs.len() >= 2); // at least กิ and นข้ / า / ว
245/// ```
246///
247/// Mixed script: a Latin run is one non-Thai TCC; Thai chars each follow TCC rules:
248///
249/// ```rust
250/// use kham_core::tcc::tcc_boundaries;
251///
252/// let bounds = tcc_boundaries("hiสวัสดี");
253/// assert_eq!(bounds[0], 0);
254/// assert_eq!(bounds[1], 2); // "hi" = 2 ASCII bytes
255/// assert_eq!(*bounds.last().unwrap(), "hiสวัสดี".len());
256/// ```
257///
258/// Tone marks, upper vowels, and trailing vowels group with their consonant:
259///
260/// ```rust
261/// use kham_core::tcc::tcc_boundaries;
262///
263/// // "เก้" — lead vowel เ + ก + tone ้ → one TCC
264/// assert_eq!(tcc_boundaries("เก้").len(), 2); // [0, 9] → 1 TCC of 9 bytes
265///
266/// // "กำ" — ก + Sara Am อำ → one TCC
267/// assert_eq!(tcc_boundaries("กำ").len(), 2);
268/// ```
269pub fn tcc_boundaries(text: &str) -> Vec<usize> {
270 if text.is_empty() {
271 return vec![0];
272 }
273
274 let mut bounds = Vec::with_capacity(text.len() / 3 + 2);
275 bounds.push(0);
276
277 let mut pos = 0;
278 while pos < text.len() {
279 match scan_one_tcc(text, pos) {
280 Some(next) if next > pos => {
281 bounds.push(next);
282 pos = next;
283 }
284 // Safety net: advance by one UTF-8 char to avoid infinite loop.
285 _ => {
286 let next = text[pos..]
287 .char_indices()
288 .nth(1)
289 .map(|(i, _)| pos + i)
290 .unwrap_or(text.len());
291 bounds.push(next);
292 pos = next;
293 }
294 }
295 }
296
297 bounds
298}
299
300/// Iterate over the TCCs in `text` as `&str` slices.
301///
302/// # Examples
303///
304/// ```rust
305/// use kham_core::tcc::tcc_iter;
306///
307/// // "เกม": เก (lead vowel เ + consonant ก) is TCC 1, ม is TCC 2
308/// let tccs: Vec<&str> = tcc_iter("เกม").collect();
309/// assert_eq!(tccs, vec!["เก", "ม"]);
310/// ```
311///
312/// All TCCs joined reconstruct the original string:
313///
314/// ```rust
315/// use kham_core::tcc::tcc_iter;
316///
317/// let text = "สวัสดีชาวโลก";
318/// let joined: String = tcc_iter(text).collect();
319/// assert_eq!(joined, text);
320/// ```
321///
322/// Counts give the segmenter its candidate split-point count before the DP:
323///
324/// ```rust
325/// use kham_core::tcc::tcc_iter;
326///
327/// // "กิน" has 2 TCCs; "กินข้าว" has more
328/// assert_eq!(tcc_iter("กิน").count(), 2);
329/// assert!(tcc_iter("กินข้าว").count() >= 4);
330/// ```
331pub fn tcc_iter(text: &str) -> impl Iterator<Item = &str> {
332 TccIter { text, pos: 0 }
333}
334
335struct TccIter<'a> {
336 text: &'a str,
337 pos: usize,
338}
339
340impl<'a> Iterator for TccIter<'a> {
341 type Item = &'a str;
342
343 fn next(&mut self) -> Option<Self::Item> {
344 if self.pos >= self.text.len() {
345 return None;
346 }
347 let end = scan_one_tcc(self.text, self.pos)?;
348 let slice = &self.text[self.pos..end];
349 self.pos = end;
350 Some(slice)
351 }
352}
353
354// ---------------------------------------------------------------------------
355// Tests
356// ---------------------------------------------------------------------------
357
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Collect the TCCs of `input` into a vector for easy comparison.
    fn split(input: &str) -> Vec<&str> {
        tcc_iter(input).collect()
    }

    /// Empty input: a single boundary at 0 and no clusters.
    #[test]
    fn empty() {
        assert_eq!(tcc_boundaries(""), vec![0]);
        assert_eq!(split(""), Vec::<&str>::new());
    }

    /// A bare consonant (ก, U+0E01, 3 bytes) is its own cluster.
    #[test]
    fn single_consonant() {
        let clusters = split("ก");
        assert_eq!(clusters, vec!["ก"]);
    }

    /// ก (U+0E01) + อิ (U+0E34) = 6 bytes, one cluster.
    #[test]
    fn consonant_upper_vowel() {
        let clusters = split("กิ");
        assert_eq!(clusters, vec!["กิ"]);
    }

    /// ก + อิ + ้ = 9 bytes, one cluster.
    #[test]
    fn consonant_upper_tone() {
        let clusters = split("กิ้");
        assert_eq!(clusters, vec!["กิ้"]);
    }

    /// Two adjacent consonants never merge.
    #[test]
    fn two_consonants() {
        let clusters = split("กน");
        assert_eq!(clusters, vec!["ก", "น"]);
    }

    /// กิน splits into กิ and น, with boundaries at bytes 0, 6 and 9.
    #[test]
    fn gin_two_tccs() {
        let word = "กิน";
        assert_eq!(split(word), vec!["กิ", "น"]);
        let offsets = tcc_boundaries(word);
        assert_eq!(offsets, vec![0, 6, 9]);
    }

    /// A leading vowel attaches to the consonant that follows it.
    #[test]
    fn lead_vowel() {
        let clusters = split("เก");
        assert_eq!(clusters, vec!["เก"]);
    }

    /// เ + ก + ้ stays together.
    #[test]
    fn lead_vowel_with_tone() {
        let clusters = split("เก้");
        assert_eq!(clusters, vec!["เก้"]);
    }

    /// ก + อา is one cluster.
    #[test]
    fn follow_vowel_aa() {
        let clusters = split("กา");
        assert_eq!(clusters, vec!["กา"]);
    }

    /// ก + อำ (Sara Am) is one cluster.
    #[test]
    fn follow_vowel_sara_am() {
        let clusters = split("กำ");
        assert_eq!(clusters, vec!["กำ"]);
    }

    /// In กร์ the thanthakat belongs to ร, so ก stands alone: ก | ร์.
    #[test]
    fn thanthakat() {
        let clusters = split("กร์");
        assert_eq!(clusters, vec!["ก", "ร์"]);
    }

    /// A run of Latin letters is one non-Thai cluster.
    #[test]
    fn non_thai_run() {
        let clusters = split("hello");
        assert_eq!(clusters, vec!["hello"]);
    }

    /// Latin run followed by Thai: "hi" | กิ | น.
    #[test]
    fn mixed_script() {
        let clusters = split("hiกิน");
        assert_eq!(clusters, vec!["hi", "กิ", "น"]);
    }

    /// A Thai digit (๑, U+0E51) is a standalone cluster.
    #[test]
    fn thai_digit() {
        let clusters = split("๑");
        assert_eq!(clusters, vec!["๑"]);
    }

    /// สวัสดี is six code points:
    /// ส (U+0E2A), ว (U+0E27), ั (U+0E31), ส (U+0E2A), ด (U+0E14), ี (U+0E35).
    /// Each upper vowel attaches to the consonant before it, giving four
    /// clusters: ส | วั | ส | ดี.
    #[test]
    fn sawasdee() {
        let greeting = "สวัสดี";
        let clusters = split(greeting);
        // Nothing is lost: the clusters concatenate back to the input.
        assert_eq!(clusters.join(""), greeting);
        // Four clusters in total.
        assert_eq!(clusters.len(), 4);
    }

    /// Boundaries over mixed Thai/ASCII text start at 0, end at `len()`, all
    /// fall on char boundaries, and the slices they delimit rebuild the input.
    #[test]
    fn boundary_coverage() {
        let text = "ธนาคาร100แห่ง";
        let bounds = tcc_boundaries(text);

        assert_eq!(bounds[0], 0);
        assert_eq!(*bounds.last().unwrap(), text.len());

        for b in bounds.iter().copied() {
            assert!(
                text.is_char_boundary(b),
                "offset {b} is not a char boundary"
            );
        }

        let rebuilt: alloc::string::String = bounds
            .windows(2)
            .map(|pair| &text[pair[0]..pair[1]])
            .collect();
        assert_eq!(rebuilt, text);
    }
}
486}