kham_core/pre_tokenizer.rs
//! Unicode script classifier and pre-tokenizer.
//!
//! Splits raw input into coarse, script-homogeneous [`Token`] spans before
//! the main segmenter runs. The segmenter only needs to apply the expensive
//! DAG algorithm to Thai spans; all other spans pass through unchanged.
//!
//! ## Pipeline position
//!
//! ```text
//! raw text
//!    │
//!    ▼
//! pre_tokenize()   ← this module
//!    │  splits into [Thai | Latin | Number | Whitespace | Emoji | Punctuation | Unknown]
//!    ▼
//! segmenter        ← processes Thai spans with tcc + dict
//!    │
//!    ▼
//! Vec<Token<'_>>
//! ```
//!
//! ## Example
//!
//! ```rust
//! use kham_core::pre_tokenizer::pre_tokenize;
//! use kham_core::TokenKind;
//!
//! let spans = pre_tokenize("ธนาคาร100แห่ง");
//! assert_eq!(spans[0].kind, TokenKind::Thai);   // "ธนาคาร"
//! assert_eq!(spans[1].kind, TokenKind::Number); // "100"
//! assert_eq!(spans[2].kind, TokenKind::Thai);   // "แห่ง"
//! ```

use alloc::vec::Vec;

use crate::token::{Token, TokenKind};

// ---------------------------------------------------------------------------
// Character classification
// ---------------------------------------------------------------------------

/// Classify a single Unicode scalar value into a [`TokenKind`].
///
/// Classification is purely codepoint-based — no context is used. The rules
/// are applied in priority order so that sub-ranges override their parent
/// block (e.g. Thai digits are checked before the broader Thai block).
///
/// ## Classification table
///
/// | Range / set | Kind |
/// |---|---|
/// | U+0E50–U+0E59 (Thai digits ๐–๙) | `Number` |
/// | U+0E00–U+0E7F (Thai block) | `Thai` |
/// | `0`–`9` (ASCII digits) | `Number` |
/// | U+FF10–U+FF19 (fullwidth digits) | `Number` |
/// | `A`–`Z`, `a`–`z` (ASCII letters) | `Latin` |
/// | U+FF21–U+FF3A, U+FF41–U+FF5A (fullwidth Latin) | `Latin` |
/// | Space, tab, newline, CR, NBSP, ideographic space | `Whitespace` |
/// | Major emoji blocks (U+1F300–U+1FAFF, U+2600–U+27BF, …) | `Emoji` |
/// | ASCII punctuation (`!`–`/`, `:`–`@`, …) | `Punctuation` |
/// | U+2000–U+206F (Unicode general punctuation) | `Punctuation` |
/// | Everything else | `Unknown` |
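///
/// ## Example
///
/// A few spot checks of the priority ordering described by the table above:
///
/// ```rust
/// use kham_core::pre_tokenizer::classify_char;
/// use kham_core::TokenKind;
///
/// assert_eq!(classify_char('๕'), TokenKind::Number);  // Thai digit wins over the Thai block
/// assert_eq!(classify_char('ฅ'), TokenKind::Thai);
/// assert_eq!(classify_char('中'), TokenKind::Unknown); // CJK falls through every range
/// ```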
#[inline]
pub fn classify_char(c: char) -> TokenKind {
    match c {
        // Thai digits sit inside the Thai block — check them first so they
        // are not misclassified as Thai script.
        '\u{0E50}'..='\u{0E59}' => TokenKind::Number,

        // Remaining Thai Unicode block: consonants, vowels, tone marks, etc.
        '\u{0E00}'..='\u{0E7F}' => TokenKind::Thai,

        // ASCII decimal digits.
        '0'..='9' => TokenKind::Number,

        // Fullwidth digit forms (U+FF10 ０ – U+FF19 ９).
        '\u{FF10}'..='\u{FF19}' => TokenKind::Number,

        // ASCII basic Latin letters (a–z, A–Z).
        'A'..='Z' | 'a'..='z' => TokenKind::Latin,

        // Fullwidth Latin capital (U+FF21 Ａ – U+FF3A Ｚ) and
        // small (U+FF41 ａ – U+FF5A ｚ) letter forms.
        '\u{FF21}'..='\u{FF3A}' | '\u{FF41}'..='\u{FF5A}' => TokenKind::Latin,

        // Common whitespace: regular space, horizontal tab, newline, carriage
        // return, non-breaking space (U+00A0), and ideographic space (U+3000).
        ' ' | '\t' | '\n' | '\r' | '\u{00A0}' | '\u{3000}' => TokenKind::Whitespace,

        // Emoji — covers the core emoji blocks in the Supplementary Multilingual
        // Plane and the Miscellaneous Symbols / Dingbats blocks in the BMP.
        // ZWJ (U+200D) and the emoji variation selector (U+FE0F) are also
        // included so that ZWJ emoji sequences stay in one span. This arm must
        // come before the General Punctuation arm below, because ZWJ (U+200D)
        // lies inside U+2000–U+206F.
        c if is_emoji(c) => TokenKind::Emoji,

        // ASCII punctuation is split into four non-contiguous ranges:
        //   U+0021–U+002F  ! " # $ % & ' ( ) * + , - . /
        //   U+003A–U+0040  : ; < = > ? @
        //   U+005B–U+0060  [ \ ] ^ _ `
        //   U+007B–U+007E  { | } ~
        '!'..='/' | ':'..='@' | '['..='`' | '{'..='~' => TokenKind::Punctuation,

        // Unicode General Punctuation block (U+2000–U+206F):
        // em-dash, en-dash, ellipsis, quotation marks, etc.
        '\u{2000}'..='\u{206F}' => TokenKind::Punctuation,

        // All other codepoints (Hangul, Arabic, Cyrillic, CJK, etc.).
        _ => TokenKind::Unknown,
    }
}

/// Returns `true` if `c` belongs to one of the major Unicode emoji blocks.
///
/// This function is intentionally conservative: it matches codepoints that
/// are nearly always emoji (Emoticons, Miscellaneous Symbols and Pictographs,
/// Transport and Map Symbols, supplemental emoji blocks), plus the two glue
/// codepoints used to build emoji sequences — ZWJ (U+200D) and the emoji
/// variation selector (U+FE0F).
///
/// Full ZWJ-sequence detection (e.g. 👨‍👩‍👧) requires multi-codepoint
/// lookahead and is left to a dedicated Unicode segmenter; this function
/// ensures that the individual codepoints in such sequences are at least
/// classified as `Emoji` so they land in the same pre-token span.
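///
/// ## Example
///
/// ```rust
/// use kham_core::pre_tokenizer::is_emoji;
///
/// assert!(is_emoji('😀'));       // U+1F600, Emoticons block
/// assert!(is_emoji('\u{200D}')); // ZWJ glue codepoint
/// assert!(!is_emoji('ก'));       // Thai letter, not an emoji
/// ```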
#[inline]
pub fn is_emoji(c: char) -> bool {
    matches!(c,
        // Zero-width joiner — glue used in multi-person / flag emoji sequences.
        '\u{200D}'
        // Variation Selector-16: forces emoji (graphic) presentation.
        | '\u{FE0F}'
        // Miscellaneous Symbols (U+2600) and Dingbats (U+2700) in the BMP.
        | '\u{2600}'..='\u{27BF}'
        // The main SMP emoji range — spans several blocks: Miscellaneous
        // Symbols and Pictographs (1F300), Emoticons (1F600), Transport and
        // Map Symbols (1F680), up through Supplemental Symbols and
        // Pictographs (1F900).
        | '\u{1F300}'..='\u{1F9FF}'
        // Chess Symbols (1FA00) and Symbols and Pictographs Extended-A
        // (1FA70): chess pieces, medical symbols, …
        | '\u{1FA00}'..='\u{1FAFF}'
    )
}

// ---------------------------------------------------------------------------
// Pre-tokenizer
// ---------------------------------------------------------------------------

/// Split `text` into a sequence of script-homogeneous [`Token`] spans.
///
/// Each span groups consecutive characters that share the same [`TokenKind`]
/// as determined by [`classify_char`]. Spans never overlap and their union
/// is exactly `text` — i.e. joining `token.text` values reconstructs the
/// original string.
///
/// The function is O(n) in the number of Unicode scalar values in `text`.
/// No allocation beyond the output `Vec` is performed.
///
/// # Returns
///
/// An empty `Vec` when `text` is empty.
///
/// # Example
///
/// ```rust
/// use kham_core::pre_tokenizer::pre_tokenize;
/// use kham_core::TokenKind;
///
/// // Mixed Thai / number / Thai
/// let tokens = pre_tokenize("ธนาคาร100แห่ง");
/// assert_eq!(tokens.len(), 3);
/// assert_eq!(tokens[0].text, "ธนาคาร");
/// assert_eq!(tokens[0].kind, TokenKind::Thai);
/// assert_eq!(tokens[1].text, "100");
/// assert_eq!(tokens[1].kind, TokenKind::Number);
/// assert_eq!(tokens[2].text, "แห่ง");
/// assert_eq!(tokens[2].kind, TokenKind::Thai);
/// ```
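///
/// Because the spans cover the input exactly, joining them reconstructs the
/// original string:
///
/// ```rust
/// use kham_core::pre_tokenizer::pre_tokenize;
///
/// let text = "สวัสดี hello 123!";
/// let rebuilt: String = pre_tokenize(text).iter().map(|t| t.text).collect();
/// assert_eq!(rebuilt, text);
/// ```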
pub fn pre_tokenize(text: &str) -> Vec<Token<'_>> {
    if text.is_empty() {
        return Vec::new();
    }

    // Capacity hint: most real text averages > 3 bytes per token, so
    // `text.len() / 4` avoids most reallocations without over-allocating.
    let mut tokens: Vec<Token<'_>> = Vec::with_capacity(text.len() / 4 + 1);

    // `span_start`/`char_span_start` track the byte/char offset where the
    // current span began. `span_kind` is `None` only before the first char.
    let mut span_start = 0usize;
    let mut char_span_start = 0usize;
    let mut span_kind: Option<TokenKind> = None;
    let mut char_pos = 0usize;

    for (byte_pos, c) in text.char_indices() {
        let kind = classify_char(c);

        match span_kind {
            // No span open yet — start the first one.
            None => {
                span_start = byte_pos;
                char_span_start = char_pos;
                span_kind = Some(kind);
            }

            // Same kind as the running span — extend it silently.
            Some(k) if k == kind => {}

            // Different kind — flush the completed span and open a new one.
            Some(k) => {
                push_token(
                    &mut tokens,
                    text,
                    span_start,
                    byte_pos,
                    char_span_start,
                    char_pos,
                    k,
                );
                span_start = byte_pos;
                char_span_start = char_pos;
                span_kind = Some(kind);
            }
        }

        char_pos += 1;
    }

    // Flush the final span (always non-empty because text is non-empty).
    if let Some(k) = span_kind {
        push_token(
            &mut tokens,
            text,
            span_start,
            text.len(),
            char_span_start,
            char_pos,
            k,
        );
    }

    tokens
}

/// Construct a [`Token`] from byte and char ranges of `text` and push it onto `out`.
#[inline]
fn push_token<'t>(
    out: &mut Vec<Token<'t>>,
    text: &'t str,
    start: usize,
    end: usize,
    char_start: usize,
    char_end: usize,
    kind: TokenKind,
) {
    out.push(Token::new(
        &text[start..end],
        start..end,
        char_start..char_end,
        kind,
    ));
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::string::{String, ToString};

    // ── helpers ──────────────────────────────────────────────────────────────

    /// Assert that `pre_tokenize(text)` produces tokens with the given
    /// `(text, kind)` pairs, in order.
    fn assert_tokens(text: &str, expected: &[(&str, TokenKind)]) {
        let tokens = pre_tokenize(text);
        assert_eq!(
            tokens.len(),
            expected.len(),
            "token count mismatch for {text:?}\ngot: {tokens:?}"
        );
        for (i, (tok, &(exp_text, exp_kind))) in tokens.iter().zip(expected.iter()).enumerate() {
            assert_eq!(tok.text, exp_text, "token[{i}].text");
            assert_eq!(tok.kind, exp_kind, "token[{i}].kind");
        }
    }

    // ── edge cases ───────────────────────────────────────────────────────────

    #[test]
    fn empty_input_returns_empty_vec() {
        assert!(pre_tokenize("").is_empty());
    }

    #[test]
    fn single_char_each_kind() {
        assert_tokens("ก", &[("ก", TokenKind::Thai)]);
        assert_tokens("A", &[("A", TokenKind::Latin)]);
        assert_tokens("1", &[("1", TokenKind::Number)]);
        assert_tokens(" ", &[(" ", TokenKind::Whitespace)]);
        assert_tokens("!", &[("!", TokenKind::Punctuation)]);
        assert_tokens("😀", &[("😀", TokenKind::Emoji)]);
    }
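
    // Scripts outside every listed range fall through to `Unknown`; check
    // that they still group into single spans like any other kind.
    #[test]
    fn unlisted_scripts_classified_as_unknown() {
        assert_tokens("中文", &[("中文", TokenKind::Unknown)]);
        assert_tokens("привет", &[("привет", TokenKind::Unknown)]);
    }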

    // ── Thai ─────────────────────────────────────────────────────────────────

    #[test]
    fn thai_run_stays_one_span() {
        assert_tokens("สวัสดี", &[("สวัสดี", TokenKind::Thai)]);
    }

    #[test]
    fn thai_digits_split_from_thai_script() {
        // Thai digits ๑๒๓ are Number, not Thai.
        assert_tokens("ก๑", &[("ก", TokenKind::Thai), ("๑", TokenKind::Number)]);
    }

    #[test]
    fn thai_digits_grouped_as_number() {
        assert_tokens("๑๒๓", &[("๑๒๓", TokenKind::Number)]);
    }

    // ── Latin ─────────────────────────────────────────────────────────────────

    #[test]
    fn latin_run_stays_one_span() {
        assert_tokens("hello", &[("hello", TokenKind::Latin)]);
    }

    #[test]
    fn latin_case_mixed_stays_one_span() {
        assert_tokens("Hello", &[("Hello", TokenKind::Latin)]);
    }

    #[test]
    fn fullwidth_latin_classified_as_latin() {
        // Ａ = U+FF21, ａ = U+FF41
        assert_tokens("Ａａ", &[("Ａａ", TokenKind::Latin)]);
    }

    // ── Number ───────────────────────────────────────────────────────────────

    #[test]
    fn ascii_digits_grouped() {
        assert_tokens("100", &[("100", TokenKind::Number)]);
    }

    #[test]
    fn fullwidth_digits_classified_as_number() {
        // ０ = U+FF10
        assert_tokens("１２３", &[("１２３", TokenKind::Number)]);
    }

    // ── Whitespace ────────────────────────────────────────────────────────────

    #[test]
    fn space_tab_newline_grouped() {
        assert_tokens(" \t\n", &[(" \t\n", TokenKind::Whitespace)]);
    }

    #[test]
    fn nbsp_classified_as_whitespace() {
        // U+00A0 non-breaking space
        let nbsp = "\u{00A0}";
        assert_tokens(nbsp, &[(nbsp, TokenKind::Whitespace)]);
    }

    #[test]
    fn ideographic_space_classified_as_whitespace() {
        // U+3000 ideographic space
        let is = "\u{3000}";
        assert_tokens(is, &[(is, TokenKind::Whitespace)]);
    }

    // ── Punctuation ───────────────────────────────────────────────────────────

    #[test]
    fn ascii_punctuation_classified() {
        for ch in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars() {
            let s = ch.to_string();
            let tokens = pre_tokenize(&s);
            assert_eq!(tokens.len(), 1, "expected 1 token for {ch:?}");
            assert_eq!(
                tokens[0].kind,
                TokenKind::Punctuation,
                "wrong kind for {ch:?}"
            );
        }
    }

    #[test]
    fn unicode_punctuation_em_dash() {
        // U+2014 EM DASH is in the General Punctuation block.
        assert_tokens("—", &[("—", TokenKind::Punctuation)]);
    }

    #[test]
    fn unicode_punctuation_ellipsis() {
        assert_tokens("…", &[("…", TokenKind::Punctuation)]);
    }

    // ── Emoji ─────────────────────────────────────────────────────────────────

    #[test]
    fn basic_emoji_span() {
        assert_tokens("😀", &[("😀", TokenKind::Emoji)]);
    }

    #[test]
    fn emoji_run_stays_one_span() {
        assert_tokens("😀🎉", &[("😀🎉", TokenKind::Emoji)]);
    }

    #[test]
    fn misc_symbol_emoji() {
        // U+2764 ❤ is in the Dingbats block (U+2700–U+27BF).
        assert_tokens("❤", &[("❤", TokenKind::Emoji)]);
    }
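
    // The `is_emoji` docs promise that every codepoint of a ZWJ sequence
    // classifies as `Emoji`, so the whole sequence shares one span. Check
    // that, plus a VS16 (U+FE0F) presentation sequence.
    #[test]
    fn zwj_and_vs16_sequences_stay_one_span() {
        // 👨‍👩‍👧 = MAN + ZWJ + WOMAN + ZWJ + GIRL.
        let family = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}";
        assert_tokens(family, &[(family, TokenKind::Emoji)]);

        // ❤️ = HEAVY BLACK HEART + VARIATION SELECTOR-16.
        let heart = "\u{2764}\u{FE0F}";
        assert_tokens(heart, &[(heart, TokenKind::Emoji)]);
    }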

    // ── Mixed script ──────────────────────────────────────────────────────────

    #[test]
    fn bank_example() {
        // Classic mixed-script Thai example from CLAUDE.md.
        assert_tokens(
            "ธนาคาร100แห่ง",
            &[
                ("ธนาคาร", TokenKind::Thai),
                ("100", TokenKind::Number),
                ("แห่ง", TokenKind::Thai),
            ],
        );
    }

    #[test]
    fn thai_space_latin() {
        assert_tokens(
            "สวัสดี hello",
            &[
                ("สวัสดี", TokenKind::Thai),
                (" ", TokenKind::Whitespace),
                ("hello", TokenKind::Latin),
            ],
        );
    }

    #[test]
    fn latin_number_thai() {
        assert_tokens(
            "hello123สวัสดี",
            &[
                ("hello", TokenKind::Latin),
                ("123", TokenKind::Number),
                ("สวัสดี", TokenKind::Thai),
            ],
        );
    }

    #[test]
    fn all_kinds_in_sequence() {
        assert_tokens(
            "กิน 1 A!😀",
            &[
                ("กิน", TokenKind::Thai),
                (" ", TokenKind::Whitespace),
                ("1", TokenKind::Number),
                (" ", TokenKind::Whitespace),
                ("A", TokenKind::Latin),
                ("!", TokenKind::Punctuation),
                ("😀", TokenKind::Emoji),
            ],
        );
    }

    // ── Structural invariants ─────────────────────────────────────────────────

    #[test]
    fn spans_cover_full_input() {
        // Joining all token texts must reconstruct the original string exactly.
        let inputs = [
            "ธนาคาร100แห่ง",
            "hello world",
            "สวัสดี 😀 123!",
            "กิน\tข้าว\n",
            "",
        ];
        for input in inputs {
            let rebuilt: String = pre_tokenize(input).iter().map(|t| t.text).collect();
            assert_eq!(rebuilt, input, "coverage failed for {input:?}");
        }
    }

    #[test]
    fn span_byte_offsets_are_correct() {
        // Every span's byte range must match the string it refers to.
        let text = "ธนาคาร100แห่ง";
        for tok in pre_tokenize(text) {
            assert_eq!(
                &text[tok.span.clone()],
                tok.text,
                "span mismatch: {:?}",
                tok
            );
            assert!(
                text.is_char_boundary(tok.span.start),
                "span.start is not a char boundary"
            );
            assert!(
                text.is_char_boundary(tok.span.end),
                "span.end is not a char boundary"
            );
        }
    }

    #[test]
    fn no_empty_tokens() {
        // The pre-tokenizer must never emit a zero-length token.
        let text = "กิน hello 123";
        for tok in pre_tokenize(text) {
            assert!(!tok.text.is_empty(), "empty token: {tok:?}");
        }
    }

    #[test]
    fn adjacent_spans_are_contiguous() {
        // The end of span[i] must equal the start of span[i+1].
        let text = "กิน hello 123!😀";
        let tokens = pre_tokenize(text);
        for pair in tokens.windows(2) {
            assert_eq!(
                pair[0].span.end, pair[1].span.start,
                "gap between {:?} and {:?}",
                pair[0], pair[1]
            );
        }
    }

    #[test]
    fn char_spans_are_contiguous() {
        let text = "กิน hello 123!😀";
        let tokens = pre_tokenize(text);
        for pair in tokens.windows(2) {
            assert_eq!(
                pair[0].char_span.end, pair[1].char_span.start,
                "char_span gap between {:?} and {:?}",
                pair[0].text, pair[1].text
            );
        }
    }

    #[test]
    fn char_span_len_matches_char_count() {
        let text = "ธนาคาร100แห่ง";
        for tok in pre_tokenize(text) {
            assert_eq!(
                tok.char_span.end - tok.char_span.start,
                tok.text.chars().count(),
                "char_span mismatch for {:?}",
                tok.text
            );
        }
    }

    #[test]
    fn char_span_mixed_script_offsets() {
        // "ธนาคาร100แห่ง": ธนาคาร = 6 chars, 100 = 3 chars, แห่ง = 4 chars.
        let tokens = pre_tokenize("ธนาคาร100แห่ง");
        assert_eq!(tokens[0].char_span, 0..6);
        assert_eq!(tokens[1].char_span, 6..9);
        assert_eq!(tokens[2].char_span, 9..13);
    }

    #[test]
    fn char_span_emoji_counts_as_one_char() {
        // 😀 is 4 bytes but 1 Unicode scalar value.
        let tokens = pre_tokenize("😀");
        assert_eq!(tokens[0].char_span, 0..1);
        assert_eq!(tokens[0].span, 0..4);
    }

    // ── classify_char direct tests ────────────────────────────────────────────

    #[test]
    fn classify_char_spot_checks() {
        assert_eq!(classify_char('ก'), TokenKind::Thai);
        assert_eq!(classify_char('๑'), TokenKind::Number); // Thai digit
        assert_eq!(classify_char('a'), TokenKind::Latin);
        assert_eq!(classify_char('Z'), TokenKind::Latin);
        assert_eq!(classify_char('5'), TokenKind::Number);
        assert_eq!(classify_char(' '), TokenKind::Whitespace);
        assert_eq!(classify_char('\n'), TokenKind::Whitespace);
        assert_eq!(classify_char('!'), TokenKind::Punctuation);
        assert_eq!(classify_char('.'), TokenKind::Punctuation);
        assert_eq!(classify_char('😀'), TokenKind::Emoji);
        assert_eq!(classify_char('❤'), TokenKind::Emoji);
    }
}