kham-core 0.5.1

Pure Rust Thai word segmentation engine — no_std compatible
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
//! Thai Character Cluster (TCC) boundary detection.
//!
//! Implements the TCC rules from Theeramunkong et al. (2000).
//! A TCC is the smallest indivisible Thai orthographic unit — roughly
//! one leading vowel + one consonant + its upper vowels + tone mark + trailing vowel.
//!
//! ## Pattern (simplified)
//! ```text
//! TCC = LEAD? CONSONANT UPPER* TONE* (THANTHAKAT | FOLLOW | NIKHAHIT)?
//!     | NON_THAI+
//! ```
//!
//! TCC segmentation is used as a pre-pass by the main segmenter to ensure
//! that word boundaries always fall on TCC boundaries.

use alloc::vec;
use alloc::vec::Vec;

// ---------------------------------------------------------------------------
// Unicode character classification
// ---------------------------------------------------------------------------

/// Returns `true` for a Thai consonant, ก–ฮ (U+0E01–U+0E2E).
///
/// The range includes the vowel-consonants ฤ (U+0E24) and ฦ (U+0E26).
#[inline]
fn is_consonant(c: char) -> bool {
    ('\u{0E01}'..='\u{0E2E}').contains(&c)
}

/// Returns `true` for a leading vowel — one written *before* its consonant:
/// เ แ โ ใ ไ (U+0E40–U+0E44).
#[inline]
fn is_lead_vowel(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}

/// Vowels and signs that are stacked on (above or below) the consonant:
/// อั อิ อี อึ อื อุ อู อฺ (U+0E31, U+0E34–U+0E3A) and Maitaikhu อ็ (U+0E47).
///
/// Maitaikhu is a combining mark that sits directly on its consonant
/// (e.g. in "เป็น" or "ก็"). It must be accepted here; otherwise the scanner
/// would cut a boundary between the consonant and the mark and emit a
/// cluster that begins with a bare combining character.
#[inline]
fn is_upper_vowel(c: char) -> bool {
    matches!(c, '\u{0E31}' | '\u{0E34}'..='\u{0E3A}' | '\u{0E47}')
}

/// Returns `true` for one of the four tone marks อ่ อ้ อ๊ อ๋ (U+0E48–U+0E4B).
#[inline]
fn is_tone(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}

/// Returns `true` for Thanthakat ์ (U+0E4C), the mark that silences a consonant.
#[inline]
fn is_thanthakat(c: char) -> bool {
    matches!(c, '\u{0E4C}')
}

/// Returns `true` for Nikhahit อํ (U+0E4D), the upper component of Sara Am อำ.
#[inline]
fn is_nikhahit(c: char) -> bool {
    matches!(c, '\u{0E4D}')
}

/// Returns `true` for a follow (trailing) vowel — one written after the
/// consonant: อะ (U+0E30), อา (U+0E32) or อำ (U+0E33).
#[inline]
fn is_follow_vowel(c: char) -> bool {
    matches!(c, '\u{0E30}' | '\u{0E32}' | '\u{0E33}')
}

/// Returns `true` when `c` lies anywhere in the Thai Unicode block
/// (U+0E00–U+0E7F), assigned or not.
#[inline]
fn is_thai(c: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&c)
}

// ---------------------------------------------------------------------------
// Cursor — encapsulates offset arithmetic for the scanner
// ---------------------------------------------------------------------------

/// Forward-only reader over the tail of a `&str`, starting at a byte offset.
///
/// All offsets it reports are expressed relative to the *original* string,
/// not to the tail being iterated, so `end` can be used directly to slice
/// the string the cursor was created from.
struct Cursor<'a> {
    /// Remaining characters of the tail, with offsets relative to `base`.
    chars: core::iter::Peekable<core::str::CharIndices<'a>>,
    /// Byte offset in the original string at which the tail starts.
    base: usize,
    /// Byte offset (in the original string) just past the last consumed
    /// character; equals `base` until the first successful `advance`.
    end: usize,
}

impl<'a> Cursor<'a> {
    /// Create a cursor over `text[pos..]`.
    ///
    /// Slicing panics if `pos` is past the end of `text` or is not on a
    /// character boundary.
    fn new(text: &'a str, pos: usize) -> Self {
        let tail = &text[pos..];
        Cursor {
            chars: tail.char_indices().peekable(),
            base: pos,
            end: pos,
        }
    }

    /// Look at the upcoming character without consuming it.
    #[inline]
    fn peek(&mut self) -> Option<char> {
        let &(_, ch) = self.chars.peek()?;
        Some(ch)
    }

    /// Consume one character and move `end` just past it.
    ///
    /// Returns `None` (leaving `end` untouched) once the input is exhausted.
    #[inline]
    fn advance(&mut self) -> Option<char> {
        let (rel, ch) = self.chars.next()?;
        // `rel` is relative to the tail, so re-anchor it on the original string.
        self.end = self.base + rel + ch.len_utf8();
        Some(ch)
    }

    /// Consume the upcoming character if, and only if, `pred` accepts it.
    ///
    /// Returns whether a character was consumed.
    #[inline]
    fn advance_if(&mut self, pred: impl Fn(char) -> bool) -> bool {
        let accepted = self.peek().map_or(false, pred);
        if accepted {
            self.advance();
        }
        accepted
    }

    /// Keep consuming characters until `pred` rejects one or input runs out.
    #[inline]
    fn advance_while(&mut self, pred: impl Fn(char) -> bool) {
        loop {
            if !self.advance_if(&pred) {
                break;
            }
        }
    }
}

// ---------------------------------------------------------------------------
// Thai TCC sub-scanners
// ---------------------------------------------------------------------------

/// Swallow every following character that is outside the Thai block, so the
/// whole non-Thai run becomes a single TCC.
fn scan_non_thai(cur: &mut Cursor<'_>) {
    while cur.advance_if(|c| !is_thai(c)) {}
}

/// Resolve the "head" of a Thai TCC: an optional leading vowel followed by
/// the mandatory base consonant.
///
/// `first` has already been consumed from `cur` by the caller. When `first`
/// is a leading vowel, the consonant right after it (if any) is consumed too.
///
/// Returns the base consonant, or `None` when no cluster can be built on
/// `first`: a leading vowel not followed by a consonant, or any other Thai
/// character that is not a consonant (digit, punctuation, stray mark …).
/// In the `None` case nothing further is consumed.
fn scan_head(cur: &mut Cursor<'_>, first: char) -> Option<char> {
    // Plain consonant: it is the base itself.
    if is_consonant(first) {
        return Some(first);
    }
    // Neither consonant nor leading vowel: a single-character TCC.
    if !is_lead_vowel(first) {
        return None;
    }
    // Leading vowel: the base must be the very next character.
    let base = cur.peek().filter(|&c| is_consonant(c))?;
    cur.advance();
    Some(base)
}

/// Consume zero or more upper vowels / diacritic signs above the consonant.
fn scan_upper_vowels(cur: &mut Cursor<'_>) {
    cur.advance_while(is_upper_vowel);
}

/// Consume tone marks. Swallows duplicates that appear in malformed input.
fn scan_tone_marks(cur: &mut Cursor<'_>) {
    cur.advance_while(is_tone);
}

/// Swallow at most one trailing element that closes the cluster:
/// Thanthakat ์, a follow vowel (อะ, อา, อำ), or Nikhahit อํ.
fn scan_trailing(cur: &mut Cursor<'_>) {
    let closes_cluster = |c: char| is_thanthakat(c) || is_follow_vowel(c) || is_nikhahit(c);
    if cur.peek().map_or(false, closes_cluster) {
        cur.advance();
    }
}

// ---------------------------------------------------------------------------
// Core TCC scanner
// ---------------------------------------------------------------------------

/// Scan a single TCC that begins at byte offset `pos` of `text`.
///
/// Returns the byte offset just past the cluster, or `None` when there is
/// no character left to read at `pos` (i.e. `pos == text.len()`).
/// `pos` must lie on a character boundary inside `text`; the cursor slices
/// `text[pos..]` and would panic otherwise.
fn scan_one_tcc(text: &str, pos: usize) -> Option<usize> {
    let mut cur = Cursor::new(text, pos);
    let first = cur.advance()?;

    if is_thai(first) {
        // Thai cluster: LEAD? CONSONANT UPPER* TONE* TRAIL?
        // With no usable head (lone leading vowel, digit, stray mark …) the
        // cluster is just whatever has been consumed so far.
        if let Some(base) = scan_head(&mut cur, first) {
            // ฤ (U+0E24) and ฦ (U+0E26) stand alone: no marks attach to them.
            let standalone = base == '\u{0E24}' || base == '\u{0E26}';
            if !standalone {
                scan_upper_vowels(&mut cur);
                scan_tone_marks(&mut cur);
                scan_trailing(&mut cur);
            }
        }
    } else {
        // A maximal run of non-Thai characters forms one flat TCC.
        scan_non_thai(&mut cur);
    }

    Some(cur.end)
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/// Compute the byte offset of every TCC boundary in `text`.
///
/// The returned vector always begins with `0` and ends with `text.len()`
/// (for empty input it is just `[0]`). Each pair of consecutive offsets
/// delimits one TCC, so slicing `text` with them yields the clusters in order.
///
/// # Examples
///
/// ```rust
/// use kham_core::tcc::tcc_boundaries;
///
/// // "กิน" — กิ is one TCC (ก + อิ), น is another
/// let bounds = tcc_boundaries("กิน");
/// assert_eq!(bounds, vec![0, 6, 9]); // กิ = 6 bytes, น = 3 bytes
/// assert_eq!(*bounds.first().unwrap(), 0);
/// assert_eq!(*bounds.last().unwrap(), "กิน".len());
/// ```
///
/// Consecutive boundaries slice directly into the original string:
///
/// ```rust
/// use kham_core::tcc::tcc_boundaries;
///
/// let text = "กินข้าว";
/// let bounds = tcc_boundaries(text);
/// let tccs: Vec<&str> = bounds.windows(2).map(|w| &text[w[0]..w[1]]).collect();
/// assert_eq!(tccs.join(""), text); // round-trip is lossless
/// assert!(tccs.len() >= 2);        // กิ / น / ข้า / ว
/// ```
///
/// Mixed script: a Latin run is one non-Thai TCC; Thai chars each follow TCC rules:
///
/// ```rust
/// use kham_core::tcc::tcc_boundaries;
///
/// let bounds = tcc_boundaries("hiสวัสดี");
/// assert_eq!(bounds[0], 0);
/// assert_eq!(bounds[1], 2); // "hi" = 2 ASCII bytes
/// assert_eq!(*bounds.last().unwrap(), "hiสวัสดี".len());
/// ```
///
/// Tone marks, upper vowels, and trailing vowels group with their consonant:
///
/// ```rust
/// use kham_core::tcc::tcc_boundaries;
///
/// // "เก้" — lead vowel เ + ก + tone ้ → one TCC
/// assert_eq!(tcc_boundaries("เก้").len(), 2); // [0, 9] → 1 TCC of 9 bytes
///
/// // "กำ" — ก + Sara Am อำ → one TCC
/// assert_eq!(tcc_boundaries("กำ").len(), 2);
/// ```
pub fn tcc_boundaries(text: &str) -> Vec<usize> {
    let total = text.len();
    if total == 0 {
        return vec![0];
    }

    // A Thai character is 3 bytes in UTF-8, so `total / 3` approximates the
    // character count; `+ 2` leaves room for the leading 0 and rounding.
    let mut offsets = Vec::with_capacity(total / 3 + 2);
    offsets.push(0);

    let mut cursor = 0;
    while cursor < total {
        let scanned = scan_one_tcc(text, cursor).filter(|&stop| stop > cursor);
        let stop = scanned.unwrap_or_else(|| {
            // Safety net: if the scanner made no progress, step over exactly
            // one character so the loop is guaranteed to terminate.
            text[cursor..]
                .char_indices()
                .nth(1)
                .map_or(total, |(rel, _)| cursor + rel)
        });
        offsets.push(stop);
        cursor = stop;
    }

    offsets
}

/// Lazily walk `text` one TCC at a time, yielding each cluster as a `&str`
/// slice borrowed from `text`.
///
/// # Examples
///
/// ```rust
/// use kham_core::tcc::tcc_iter;
///
/// // "เกม": เก (lead vowel เ + consonant ก) is TCC 1, ม is TCC 2
/// let tccs: Vec<&str> = tcc_iter("เกม").collect();
/// assert_eq!(tccs, vec!["เก", "ม"]);
/// ```
///
/// Concatenating the yielded clusters gives back the input unchanged:
///
/// ```rust
/// use kham_core::tcc::tcc_iter;
///
/// let text = "สวัสดีชาวโลก";
/// let joined: String = tcc_iter(text).collect();
/// assert_eq!(joined, text);
/// ```
///
/// Counting the clusters:
///
/// ```rust
/// use kham_core::tcc::tcc_iter;
///
/// // "กิน" has 2 TCCs; "กินข้าว" splits into กิ / น / ข้า / ว
/// assert_eq!(tcc_iter("กิน").count(), 2);
/// assert!(tcc_iter("กินข้าว").count() >= 4);
/// ```
pub fn tcc_iter(text: &str) -> impl Iterator<Item = &str> {
    // Scanning starts at the beginning of the string.
    TccIter { pos: 0, text }
}

/// Iterator state behind `tcc_iter`: the string being split and the byte
/// offset at which the next TCC starts.
struct TccIter<'a> {
    /// The full input string; yielded slices borrow from it.
    text: &'a str,
    /// Byte offset of the first not-yet-yielded TCC.
    pos: usize,
}

impl<'a> Iterator for TccIter<'a> {
    type Item = &'a str;

    /// Yield the TCC starting at the current offset and step past it, or
    /// `None` once the whole string has been consumed.
    fn next(&mut self) -> Option<Self::Item> {
        let (text, start) = (self.text, self.pos);
        if start >= text.len() {
            return None;
        }
        let stop = scan_one_tcc(text, start)?;
        self.pos = stop;
        Some(&text[start..stop])
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    //! Unit tests for TCC boundary detection.
    //!
    //! Each test feeds a short string through `tcc_iter` / `tcc_boundaries`
    //! and checks the exact clusters or byte offsets produced.

    use super::*;
    use alloc::vec;

    /// Collect the TCCs of `text` into a vector for easy comparison.
    fn tccs(text: &str) -> Vec<&str> {
        tcc_iter(text).collect()
    }

    #[test]
    fn empty() {
        // Empty input: a single boundary at 0 and no clusters at all.
        assert_eq!(tcc_boundaries(""), vec![0]);
        assert_eq!(tccs(""), Vec::<&str>::new());
    }

    #[test]
    fn single_consonant() {
        // ก = U+0E01, 3 bytes
        assert_eq!(tccs("ก"), vec!["ก"]);
    }

    #[test]
    fn consonant_upper_vowel() {
        // กิ = ก (U+0E01) + อิ (U+0E34) = 6 bytes → 1 TCC
        assert_eq!(tccs("กิ"), vec!["กิ"]);
    }

    #[test]
    fn consonant_upper_tone() {
        // กิ้ = ก + อิ + ้ = 9 bytes → 1 TCC
        assert_eq!(tccs("กิ้"), vec!["กิ้"]);
    }

    #[test]
    fn two_consonants() {
        // กน → 2 TCCs
        assert_eq!(tccs("กน"), vec!["ก", "น"]);
    }

    #[test]
    fn gin_two_tccs() {
        // กิน → กิ (TCC1) + น (TCC2)
        assert_eq!(tccs("กิน"), vec!["กิ", "น"]);
        let b = tcc_boundaries("กิน");
        assert_eq!(b, vec![0, 6, 9]);
    }

    #[test]
    fn lead_vowel() {
        // เก = เ + ก → 1 TCC (lead vowel attaches to following consonant)
        assert_eq!(tccs("เก"), vec!["เก"]);
    }

    #[test]
    fn lead_vowel_with_tone() {
        // เก้ = เ + ก + ้
        assert_eq!(tccs("เก้"), vec!["เก้"]);
    }

    #[test]
    fn follow_vowel_aa() {
        // กา = ก + อา → 1 TCC
        assert_eq!(tccs("กา"), vec!["กา"]);
    }

    #[test]
    fn follow_vowel_sara_am() {
        // กำ = ก + อำ → 1 TCC
        assert_eq!(tccs("กำ"), vec!["กำ"]);
    }

    #[test]
    fn thanthakat() {
        // กร์ = ก + ร + ์ → ก and ร are separate consonants, so:
        // ก (TCC1), ร์ (TCC2 — ร + thanthakat)
        assert_eq!(tccs("กร์"), vec!["ก", "ร์"]);
    }

    #[test]
    fn non_thai_run() {
        // "hello" → single non-Thai TCC
        assert_eq!(tccs("hello"), vec!["hello"]);
    }

    #[test]
    fn mixed_script() {
        // "hi" + กิน → ["hi", "กิ", "น"]
        assert_eq!(tccs("hiกิน"), vec!["hi", "กิ", "น"]);
    }

    #[test]
    fn thai_digit() {
        // ๑ (U+0E51) is a Thai digit — standalone TCC
        assert_eq!(tccs("๑"), vec!["๑"]);
    }

    #[test]
    fn sawasdee() {
        // สวัสดี — classic greeting, 6 code points:
        // ส (U+0E2A), ว (U+0E27), ั (U+0E31), ส (U+0E2A), ด (U+0E14), ี (U+0E35)
        // The upper vowel ั attaches to the preceding consonant ว, and ี to ด,
        // giving 4 TCCs: ส, วั, ส, ดี.
        let result = tccs("สวัสดี");
        // Verify coverage: joining all TCCs gives back original
        assert_eq!(result.join(""), "สวัสดี");
        // Verify count (4 TCCs for สวัสดี)
        assert_eq!(result.len(), 4);
        // Verify the exact clusters
        assert_eq!(result, vec!["ส", "วั", "ส", "ดี"]);
    }

    #[test]
    fn boundary_coverage() {
        // Every boundary pair must be valid UTF-8 slice of original
        let text = "ธนาคาร100แห่ง";
        let bounds = tcc_boundaries(text);
        // First and last are correct
        assert_eq!(bounds[0], 0);
        assert_eq!(*bounds.last().unwrap(), text.len());
        // All intermediate boundaries are valid char boundaries
        for &b in &bounds {
            assert!(
                text.is_char_boundary(b),
                "offset {b} is not a char boundary"
            );
        }
        // Joining the slices reconstructs the original
        let rebuilt: alloc::string::String = bounds.windows(2).map(|w| &text[w[0]..w[1]]).collect();
        assert_eq!(rebuilt, text);
    }
}