// kham_core/pre_tokenizer.rs

//! Unicode script classifier and pre-tokenizer.
//!
//! Splits raw input into coarse, script-homogeneous [`Token`] spans before
//! the main segmenter runs. The segmenter only needs to apply the expensive
//! DAG algorithm to Thai spans; all other spans pass through unchanged.
//!
//! ## Pipeline position
//!
//! ```text
//! raw text
//!    │
//!    ▼
//! pre_tokenize()   ← this module
//!    │  splits into [Thai | Latin | Number | Whitespace | Emoji | Punctuation | Unknown]
//!    ▼
//! segmenter        ← processes Thai spans with tcc + dict
//!    │
//!    ▼
//! Vec<Token<'_>>
//! ```
//!
//! ## Example
//!
//! ```rust
//! use kham_core::pre_tokenizer::pre_tokenize;
//! use kham_core::TokenKind;
//!
//! let spans = pre_tokenize("ธนาคาร100แห่ง");
//! assert_eq!(spans[0].kind, TokenKind::Thai);   // "ธนาคาร"
//! assert_eq!(spans[1].kind, TokenKind::Number); // "100"
//! assert_eq!(spans[2].kind, TokenKind::Thai);   // "แห่ง"
//! ```

use alloc::vec::Vec;

use crate::token::{Token, TokenKind};

// ---------------------------------------------------------------------------
// Character classification
// ---------------------------------------------------------------------------

/// Classify a single Unicode scalar value into a [`TokenKind`].
///
/// Classification is purely codepoint-based — no context is used. The rules
/// are applied in priority order so that sub-ranges override their parent
/// block (e.g. Thai digits are checked before the broader Thai block).
///
/// ## Classification table
///
/// | Range / set | Kind |
/// |---|---|
/// | U+0E50–U+0E59 (Thai digits ๐–๙) | `Number` |
/// | U+0E00–U+0E7F (Thai block) | `Thai` |
/// | `0`–`9` (ASCII digits) | `Number` |
/// | U+FF10–U+FF19 (fullwidth digits) | `Number` |
/// | `A`–`Z`, `a`–`z` (ASCII letters) | `Latin` |
/// | U+FF21–U+FF3A, U+FF41–U+FF5A (fullwidth Latin letters) | `Latin` |
/// | Space, tab, newline, CR, NBSP, ideographic space | `Whitespace` |
/// | Major emoji blocks (U+1F300–U+1FAFF, U+2600–U+27BF, …) | `Emoji` |
/// | ASCII punctuation (`!`–`/`, `:`–`@`, …) | `Punctuation` |
/// | U+2000–U+206F (Unicode general punctuation) | `Punctuation` |
/// | Everything else | `Unknown` |
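///
/// ## Example
///
/// A small illustration of the priority order above. The `kham_core` paths
/// follow the module-level example and assume the same public re-exports.
///
/// ```rust
/// use kham_core::pre_tokenizer::classify_char;
/// use kham_core::TokenKind;
///
/// // A Thai digit wins over the surrounding Thai block...
/// assert_eq!(classify_char('๑'), TokenKind::Number);
/// // ...while other Thai codepoints stay Thai.
/// assert_eq!(classify_char('ก'), TokenKind::Thai);
/// // Codepoints outside every listed range fall through to Unknown.
/// assert_eq!(classify_char('ж'), TokenKind::Unknown);
/// ```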
#[inline]
pub fn classify_char(c: char) -> TokenKind {
    match c {
        // Thai digits sit inside the Thai block — check them first so they
        // are not misclassified as Thai script.
        '\u{0E50}'..='\u{0E59}' => TokenKind::Number,

        // Remaining Thai Unicode block: consonants, vowels, tone marks, etc.
        '\u{0E00}'..='\u{0E7F}' => TokenKind::Thai,

        // ASCII decimal digits.
        '0'..='9' => TokenKind::Number,

        // Fullwidth digit forms (U+FF10 0 – U+FF19 9).
        '\u{FF10}'..='\u{FF19}' => TokenKind::Number,

        // ASCII basic Latin letters (a–z, A–Z).
        'A'..='Z' | 'a'..='z' => TokenKind::Latin,

        // Fullwidth Latin capital (U+FF21 A – U+FF3A Z) and
        // small (U+FF41 a – U+FF5A z) letter forms.
        '\u{FF21}'..='\u{FF3A}' | '\u{FF41}'..='\u{FF5A}' => TokenKind::Latin,

        // Common whitespace: regular space, horizontal tab, newline, carriage
        // return, non-breaking space (U+00A0), and ideographic space (U+3000).
        ' ' | '\t' | '\n' | '\r' | '\u{00A0}' | '\u{3000}' => TokenKind::Whitespace,

        // Emoji — covers the core emoji blocks in the Supplementary Multilingual
        // Plane and the Miscellaneous Symbols / Dingbats blocks in the BMP.
        // ZWJ (U+200D) and the emoji variation selector (U+FE0F) are also
        // included so that ZWJ emoji sequences stay in one span.
        c if is_emoji(c) => TokenKind::Emoji,

        // ASCII punctuation is split into four non-contiguous ranges:
        //   U+0021–U+002F  ! " # $ % & ' ( ) * + , - . /
        //   U+003A–U+0040  : ; < = > ? @
        //   U+005B–U+0060  [ \ ] ^ _ `
        //   U+007B–U+007E  { | } ~
        '!'..='/' | ':'..='@' | '['..='`' | '{'..='~' => TokenKind::Punctuation,

        // Unicode General Punctuation block (U+2000–U+206F):
        // em-dash, en-dash, ellipsis, quotation marks, etc.
        '\u{2000}'..='\u{206F}' => TokenKind::Punctuation,

        // All other codepoints (Hangul, Arabic, Cyrillic, CJK, etc.).
        _ => TokenKind::Unknown,
    }
}

/// Returns `true` if `c` belongs to one of the major Unicode emoji blocks.
///
/// This function is intentionally conservative: it matches codepoints that
/// are nearly always emoji (Emoticons, Miscellaneous Symbols and Pictographs,
/// Transport and Map Symbols, supplemental emoji blocks), plus the two glue
/// codepoints used to build emoji sequences — ZWJ (U+200D) and the emoji
/// variation selector (U+FE0F).
///
/// Full ZWJ-sequence detection (e.g. 👨‍👩‍👧) requires multi-codepoint
/// lookahead and is left to a dedicated Unicode segmenter; this function
/// ensures that the individual codepoints in such sequences are at least
/// classified as `Emoji` so they land in the same pre-token span.
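///
/// ## Example
///
/// A brief sketch of the intended behaviour, mirroring the ranges matched
/// below; the `kham_core` path assumes the same re-export as the module
/// example.
///
/// ```rust
/// use kham_core::pre_tokenizer::is_emoji;
///
/// assert!(is_emoji('😀'));       // Emoticons block (SMP)
/// assert!(is_emoji('❤'));        // Dingbats block (BMP)
/// assert!(is_emoji('\u{200D}')); // ZWJ glue codepoint
/// assert!(!is_emoji('a'));       // plain ASCII letter
/// ```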
#[inline]
pub fn is_emoji(c: char) -> bool {
    matches!(c,
        // Zero-width joiner — glue used in multi-person / flag emoji sequences.
        '\u{200D}'
        // Variation Selector-16: forces emoji (graphic) presentation.
        | '\u{FE0F}'
        // Miscellaneous Symbols and Dingbats (BMP).
        | '\u{2600}'..='\u{27BF}'
        // The contiguous run of SMP emoji blocks: Miscellaneous Symbols and
        // Pictographs (1F300), Emoticons (1F600), Transport and Map Symbols
        // (1F680), Supplemental Symbols and Pictographs (1F900), and the
        // blocks in between.
        | '\u{1F300}'..='\u{1F9FF}'
        // Chess Symbols plus Symbols and Pictographs Extended-A
        // (chess pieces, medical symbols, …).
        | '\u{1FA00}'..='\u{1FAFF}'
    )
}

// ---------------------------------------------------------------------------
// Pre-tokenizer
// ---------------------------------------------------------------------------

/// Split `text` into a sequence of script-homogeneous [`Token`] spans.
///
/// Each span groups consecutive characters that share the same [`TokenKind`]
/// as determined by [`classify_char`]. Spans never overlap and their union
/// is exactly `text` — i.e. joining `token.text` values reconstructs the
/// original string.
///
/// The function is O(n) in the number of Unicode scalar values in `text`.
/// No allocation beyond the output `Vec` is performed.
///
/// # Returns
///
/// An empty `Vec` when `text` is empty.
///
/// # Example
///
/// ```rust
/// use kham_core::pre_tokenizer::pre_tokenize;
/// use kham_core::TokenKind;
///
/// // Mixed Thai / number / Thai
/// let tokens = pre_tokenize("ธนาคาร100แห่ง");
/// assert_eq!(tokens.len(), 3);
/// assert_eq!(tokens[0].text, "ธนาคาร");
/// assert_eq!(tokens[0].kind, TokenKind::Thai);
/// assert_eq!(tokens[1].text, "100");
/// assert_eq!(tokens[1].kind, TokenKind::Number);
/// assert_eq!(tokens[2].text, "แห่ง");
/// assert_eq!(tokens[2].kind, TokenKind::Thai);
/// ```
pub fn pre_tokenize(text: &str) -> Vec<Token<'_>> {
    if text.is_empty() {
        return Vec::new();
    }

    // Capacity hint: spans in real text average well over 4 bytes each, so
    // `text.len() / 4` avoids most reallocations without over-allocating.
    let mut tokens: Vec<Token<'_>> = Vec::with_capacity(text.len() / 4 + 1);

    // `span_start`/`char_span_start` track the byte/char offset where the
    // current span began. `span_kind` is `None` only before the first char.
    let mut span_start = 0usize;
    let mut char_span_start = 0usize;
    let mut span_kind: Option<TokenKind> = None;
    let mut char_pos = 0usize;

    for (byte_pos, c) in text.char_indices() {
        let kind = classify_char(c);

        match span_kind {
            // No span open yet — start the first one.
            None => {
                span_start = byte_pos;
                char_span_start = char_pos;
                span_kind = Some(kind);
            }

            // Same kind as the running span — extend it silently.
            Some(k) if k == kind => {}

            // Different kind — flush the completed span and open a new one.
            Some(k) => {
                push_token(
                    &mut tokens,
                    text,
                    span_start,
                    byte_pos,
                    char_span_start,
                    char_pos,
                    k,
                );
                span_start = byte_pos;
                char_span_start = char_pos;
                span_kind = Some(kind);
            }
        }

        char_pos += 1;
    }

    // Flush the final span (always non-empty because text is non-empty).
    if let Some(k) = span_kind {
        push_token(
            &mut tokens,
            text,
            span_start,
            text.len(),
            char_span_start,
            char_pos,
            k,
        );
    }

    tokens
}

/// Construct a [`Token`] from byte and char ranges of `text` and push it onto `out`.
#[inline]
fn push_token<'t>(
    out: &mut Vec<Token<'t>>,
    text: &'t str,
    start: usize,
    end: usize,
    char_start: usize,
    char_end: usize,
    kind: TokenKind,
) {
    out.push(Token::new(
        &text[start..end],
        start..end,
        char_start..char_end,
        kind,
    ));
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::string::{String, ToString};

    // ── helpers ──────────────────────────────────────────────────────────────

    /// Assert that `pre_tokenize(text)` produces tokens with the given
    /// `(text, kind)` pairs, in order.
    fn assert_tokens(text: &str, expected: &[(&str, TokenKind)]) {
        let tokens = pre_tokenize(text);
        assert_eq!(
            tokens.len(),
            expected.len(),
            "token count mismatch for {text:?}\ngot: {tokens:?}"
        );
        for (i, (tok, &(exp_text, exp_kind))) in tokens.iter().zip(expected.iter()).enumerate() {
            assert_eq!(tok.text, exp_text, "token[{i}].text");
            assert_eq!(tok.kind, exp_kind, "token[{i}].kind");
        }
    }

    // ── edge cases ───────────────────────────────────────────────────────────

    #[test]
    fn empty_input_returns_empty_vec() {
        assert!(pre_tokenize("").is_empty());
    }

    #[test]
    fn single_char_each_kind() {
        assert_tokens("ก", &[("ก", TokenKind::Thai)]);
        assert_tokens("A", &[("A", TokenKind::Latin)]);
        assert_tokens("1", &[("1", TokenKind::Number)]);
        assert_tokens(" ", &[(" ", TokenKind::Whitespace)]);
        assert_tokens("!", &[("!", TokenKind::Punctuation)]);
        assert_tokens("😀", &[("😀", TokenKind::Emoji)]);
    }

    // ── Thai ─────────────────────────────────────────────────────────────────

    #[test]
    fn thai_run_stays_one_span() {
        assert_tokens("สวัสดี", &[("สวัสดี", TokenKind::Thai)]);
    }

    #[test]
    fn thai_digits_split_from_thai_script() {
        // Thai digits ๑๒๓ are Number, not Thai.
        assert_tokens("ก๑", &[("ก", TokenKind::Thai), ("๑", TokenKind::Number)]);
    }

    #[test]
    fn thai_digits_grouped_as_number() {
        assert_tokens("๑๒๓", &[("๑๒๓", TokenKind::Number)]);
    }

    // ── Latin ─────────────────────────────────────────────────────────────────

    #[test]
    fn latin_run_stays_one_span() {
        assert_tokens("hello", &[("hello", TokenKind::Latin)]);
    }

    #[test]
    fn latin_case_mixed_stays_one_span() {
        assert_tokens("Hello", &[("Hello", TokenKind::Latin)]);
    }

    #[test]
    fn fullwidth_latin_classified_as_latin() {
        // A = U+FF21, a = U+FF41
        assert_tokens("Ａａ", &[("Ａａ", TokenKind::Latin)]);
    }

    // ── Number ───────────────────────────────────────────────────────────────

    #[test]
    fn ascii_digits_grouped() {
        assert_tokens("100", &[("100", TokenKind::Number)]);
    }

    #[test]
    fn fullwidth_digits_classified_as_number() {
        // 0 = U+FF10
        assert_tokens("１２３", &[("１２３", TokenKind::Number)]);
    }

    // ── Whitespace ────────────────────────────────────────────────────────────

    #[test]
    fn space_tab_newline_grouped() {
        assert_tokens(" \t\n", &[(" \t\n", TokenKind::Whitespace)]);
    }

    #[test]
    fn nbsp_classified_as_whitespace() {
        // U+00A0 non-breaking space
        let nbsp = "\u{00A0}";
        assert_tokens(nbsp, &[(nbsp, TokenKind::Whitespace)]);
    }

    #[test]
    fn ideographic_space_classified_as_whitespace() {
        // U+3000 ideographic space
        let is = "\u{3000}";
        assert_tokens(is, &[(is, TokenKind::Whitespace)]);
    }

    // ── Punctuation ───────────────────────────────────────────────────────────

    #[test]
    fn ascii_punctuation_classified() {
        for ch in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars() {
            let s = ch.to_string();
            let tokens = pre_tokenize(&s);
            assert_eq!(tokens.len(), 1, "expected 1 token for {ch:?}");
            assert_eq!(
                tokens[0].kind,
                TokenKind::Punctuation,
                "wrong kind for {ch:?}"
            );
        }
    }

    #[test]
    fn unicode_punctuation_em_dash() {
        // U+2014 EM DASH is in the General Punctuation block.
        assert_tokens("—", &[("—", TokenKind::Punctuation)]);
    }

    #[test]
    fn unicode_punctuation_ellipsis() {
        assert_tokens("…", &[("…", TokenKind::Punctuation)]);
    }

    // ── Emoji ─────────────────────────────────────────────────────────────────

    #[test]
    fn basic_emoji_span() {
        assert_tokens("😀", &[("😀", TokenKind::Emoji)]);
    }

    #[test]
    fn emoji_run_stays_one_span() {
        assert_tokens("😀🎉", &[("😀🎉", TokenKind::Emoji)]);
    }

    #[test]
    fn misc_symbol_emoji() {
        // U+2764 ❤ is in the Dingbats block.
        assert_tokens("❤", &[("❤", TokenKind::Emoji)]);
    }

    // ── Mixed script ──────────────────────────────────────────────────────────

    #[test]
    fn bank_example() {
        // Classic mixed-script Thai example from CLAUDE.md.
        assert_tokens(
            "ธนาคาร100แห่ง",
            &[
                ("ธนาคาร", TokenKind::Thai),
                ("100", TokenKind::Number),
                ("แห่ง", TokenKind::Thai),
            ],
        );
    }

    #[test]
    fn thai_space_latin() {
        assert_tokens(
            "สวัสดี hello",
            &[
                ("สวัสดี", TokenKind::Thai),
                (" ", TokenKind::Whitespace),
                ("hello", TokenKind::Latin),
            ],
        );
    }

    #[test]
    fn latin_number_thai() {
        assert_tokens(
            "hello123สวัสดี",
            &[
                ("hello", TokenKind::Latin),
                ("123", TokenKind::Number),
                ("สวัสดี", TokenKind::Thai),
            ],
        );
    }

    #[test]
    fn all_kinds_in_sequence() {
        assert_tokens(
            "กิน 1 A!😀",
            &[
                ("กิน", TokenKind::Thai),
                (" ", TokenKind::Whitespace),
                ("1", TokenKind::Number),
                (" ", TokenKind::Whitespace),
                ("A", TokenKind::Latin),
                ("!", TokenKind::Punctuation),
                ("😀", TokenKind::Emoji),
            ],
        );
    }

    // ── Structural invariants ─────────────────────────────────────────────────

    #[test]
    fn spans_cover_full_input() {
        // Joining all token texts must reconstruct the original string exactly.
        let inputs = [
            "ธนาคาร100แห่ง",
            "hello world",
            "สวัสดี 😀 123!",
            "กิน\tข้าว\n",
            "",
        ];
        for input in inputs {
            let rebuilt: String = pre_tokenize(input).iter().map(|t| t.text).collect();
            assert_eq!(rebuilt, input, "coverage failed for {input:?}");
        }
    }

    #[test]
    fn span_byte_offsets_are_correct() {
        // Every span's byte range must match the string it refers to.
        let text = "ธนาคาร100แห่ง";
        for tok in pre_tokenize(text) {
            assert_eq!(
                &text[tok.span.clone()],
                tok.text,
                "span mismatch: {:?}",
                tok
            );
            assert!(
                text.is_char_boundary(tok.span.start),
                "span.start is not a char boundary"
            );
            assert!(
                text.is_char_boundary(tok.span.end),
                "span.end is not a char boundary"
            );
        }
    }

    #[test]
    fn no_empty_tokens() {
        // The pre-tokenizer must never emit a zero-length token.
        let text = "กิน hello 123";
        for tok in pre_tokenize(text) {
            assert!(!tok.text.is_empty(), "empty token: {tok:?}");
        }
    }

    #[test]
    fn adjacent_spans_are_contiguous() {
        // The end of span[i] must equal the start of span[i+1].
        let text = "กิน hello 123!😀";
        let tokens = pre_tokenize(text);
        for pair in tokens.windows(2) {
            assert_eq!(
                pair[0].span.end, pair[1].span.start,
                "gap between {:?} and {:?}",
                pair[0], pair[1]
            );
        }
    }

    #[test]
    fn char_spans_are_contiguous() {
        let text = "กิน hello 123!😀";
        let tokens = pre_tokenize(text);
        for pair in tokens.windows(2) {
            assert_eq!(
                pair[0].char_span.end, pair[1].char_span.start,
                "char_span gap between {:?} and {:?}",
                pair[0].text, pair[1].text
            );
        }
    }

    #[test]
    fn char_span_len_matches_char_count() {
        let text = "ธนาคาร100แห่ง";
        for tok in pre_tokenize(text) {
            assert_eq!(
                tok.char_span.end - tok.char_span.start,
                tok.text.chars().count(),
                "char_span mismatch for {:?}",
                tok.text
            );
        }
    }

    #[test]
    fn char_span_mixed_script_offsets() {
        // "ธนาคาร100แห่ง": ธนาคาร=6 chars, 100=3 chars, แห่ง=4 chars
        let tokens = pre_tokenize("ธนาคาร100แห่ง");
        assert_eq!(tokens[0].char_span, 0..6);
        assert_eq!(tokens[1].char_span, 6..9);
        assert_eq!(tokens[2].char_span, 9..13);
    }

    #[test]
    fn char_span_emoji_counts_as_one_char() {
        // 😀 is 4 bytes but 1 Unicode scalar value.
        let tokens = pre_tokenize("😀");
        assert_eq!(tokens[0].char_span, 0..1);
        assert_eq!(tokens[0].span, 0..4);
    }

    // ── classify_char direct tests ────────────────────────────────────────────

    #[test]
    fn classify_char_spot_checks() {
        assert_eq!(classify_char('ก'), TokenKind::Thai);
        assert_eq!(classify_char('๑'), TokenKind::Number); // Thai digit
        assert_eq!(classify_char('a'), TokenKind::Latin);
        assert_eq!(classify_char('Z'), TokenKind::Latin);
        assert_eq!(classify_char('5'), TokenKind::Number);
        assert_eq!(classify_char(' '), TokenKind::Whitespace);
        assert_eq!(classify_char('\n'), TokenKind::Whitespace);
        assert_eq!(classify_char('!'), TokenKind::Punctuation);
        assert_eq!(classify_char('.'), TokenKind::Punctuation);
        assert_eq!(classify_char('😀'), TokenKind::Emoji);
        assert_eq!(classify_char('❤'), TokenKind::Emoji);
    }
}