use alloc::vec::Vec;
use crate::token::{Token, TokenKind};
/// Classify a single character into the coarse [`TokenKind`] buckets used
/// by the pre-tokenizer.
///
/// Match-arm order is significant:
/// - Thai digits (U+0E50..=U+0E59) lie inside the Thai block
///   (U+0E00..=U+0E7F), so the digit arm must come first.
/// - Emoji components such as ZWJ (U+200D) and VS-16 (U+FE0F) lie inside
///   the General Punctuation block, so the `is_emoji` guard must run
///   before the U+2000..=U+206F punctuation arm.
#[inline]
pub fn classify_char(c: char) -> TokenKind {
    match c {
        // Digits: Thai, ASCII, and fullwidth forms.
        '\u{0E50}'..='\u{0E59}' => TokenKind::Number,
        '\u{0E00}'..='\u{0E7F}' => TokenKind::Thai,
        '0'..='9' => TokenKind::Number,
        '\u{FF10}'..='\u{FF19}' => TokenKind::Number,
        // Latin letters: ASCII and fullwidth forms.
        'A'..='Z' | 'a'..='z' => TokenKind::Latin,
        '\u{FF21}'..='\u{FF3A}' | '\u{FF41}'..='\u{FF5A}' => TokenKind::Latin,
        // Whitespace: ASCII, NBSP, and ideographic space...
        ' ' | '\t' | '\n' | '\r' | '\u{00A0}' | '\u{3000}' => TokenKind::Whitespace,
        // ...plus the Unicode space separators of the General Punctuation
        // block (en/em-space family, line/paragraph separators, NNBSP,
        // MMSP). Without this arm they would be mis-labelled Punctuation
        // by the catch-all U+2000..=U+206F arm below, inconsistent with
        // the NBSP / U+3000 handling above. Note U+200D (ZWJ) is NOT in
        // these ranges and still reaches the emoji guard.
        '\u{2000}'..='\u{200A}' | '\u{2028}' | '\u{2029}' | '\u{202F}' | '\u{205F}' => {
            TokenKind::Whitespace
        }
        c if is_emoji(c) => TokenKind::Emoji,
        // ASCII punctuation: the four runs surrounding digits and letters.
        '!'..='/' | ':'..='@' | '['..='`' | '{'..='~' => TokenKind::Punctuation,
        // Remaining General Punctuation (dashes, ellipsis, quotes, ...).
        '\u{2000}'..='\u{206F}' => TokenKind::Punctuation,
        _ => TokenKind::Unknown,
    }
}
/// Return `true` for characters that participate in emoji sequences:
/// the zero-width joiner, variation selector-16, the Miscellaneous
/// Symbols / Dingbats range, and the two main emoji planes-1 blocks.
#[inline]
pub fn is_emoji(c: char) -> bool {
    let cp = c as u32;
    cp == 0x200D
        || cp == 0xFE0F
        || (0x2600..=0x27BF).contains(&cp)
        || (0x1F300..=0x1F9FF).contains(&cp)
        || (0x1FA00..=0x1FAFF).contains(&cp)
}
/// Split `text` into maximal runs of consecutive characters that share the
/// same [`TokenKind`].
///
/// The returned tokens tile the input exactly: byte spans are contiguous,
/// cover the whole string, and no token is empty. `span` is measured in
/// bytes, `char_span` in Unicode scalar values.
pub fn pre_tokenize(text: &str) -> Vec<Token<'_>> {
    if text.is_empty() {
        return Vec::new();
    }
    // Rough heuristic: assume ~4 bytes per token to size the output once.
    let mut tokens: Vec<Token<'_>> = Vec::with_capacity(text.len() / 4 + 1);
    // (byte offset, char offset) where the current run began.
    let mut run = (0usize, 0usize);
    // Kind of the current run; `None` only before the first character.
    let mut current: Option<TokenKind> = None;
    let mut chars_seen = 0usize;
    for (byte_pos, ch) in text.char_indices() {
        let kind = classify_char(ch);
        if current != Some(kind) {
            // Kind changed: flush the finished run (if any) and open a new one.
            if let Some(k) = current {
                push_token(&mut tokens, text, run.0, byte_pos, run.1, chars_seen, k);
            }
            run = (byte_pos, chars_seen);
            current = Some(kind);
        }
        chars_seen += 1;
    }
    // Flush the trailing run; non-empty input guarantees `current` is set.
    if let Some(k) = current {
        push_token(&mut tokens, text, run.0, text.len(), run.1, chars_seen, k);
    }
    tokens
}
/// Append one token covering `text[start..end]` (byte offsets) with the
/// matching char-index span and classified kind.
#[inline]
fn push_token<'t>(
    out: &mut Vec<Token<'t>>,
    text: &'t str,
    start: usize,
    end: usize,
    char_start: usize,
    char_end: usize,
    kind: TokenKind,
) {
    let slice = &text[start..end];
    let token = Token::new(slice, start..end, char_start..char_end, kind);
    out.push(token);
}
#[cfg(test)]
mod tests {
use super::*;
use alloc::string::{String, ToString};
// Helper: run the pre-tokenizer and compare the resulting (text, kind)
// pairs against `expected`, in order, after a token-count check.
fn assert_tokens(text: &str, expected: &[(&str, TokenKind)]) {
let tokens = pre_tokenize(text);
assert_eq!(
tokens.len(),
expected.len(),
"token count mismatch for {text:?}\ngot: {tokens:?}"
);
for (i, (tok, &(exp_text, exp_kind))) in tokens.iter().zip(expected.iter()).enumerate() {
assert_eq!(tok.text, exp_text, "token[{i}].text");
assert_eq!(tok.kind, exp_kind, "token[{i}].kind");
}
}
#[test]
fn empty_input_returns_empty_vec() {
assert!(pre_tokenize("").is_empty());
}
// One-character inputs, one per kind.
#[test]
fn single_char_each_kind() {
assert_tokens("ก", &[("ก", TokenKind::Thai)]);
assert_tokens("A", &[("A", TokenKind::Latin)]);
assert_tokens("1", &[("1", TokenKind::Number)]);
assert_tokens(" ", &[(" ", TokenKind::Whitespace)]);
assert_tokens("!", &[("!", TokenKind::Punctuation)]);
assert_tokens("😀", &[("😀", TokenKind::Emoji)]);
}
#[test]
fn thai_run_stays_one_span() {
assert_tokens("สวัสดี", &[("สวัสดี", TokenKind::Thai)]);
}
// Thai digits are Number, not Thai — they must split from Thai letters.
#[test]
fn thai_digits_split_from_thai_script() {
assert_tokens("ก๑", &[("ก", TokenKind::Thai), ("๑", TokenKind::Number)]);
}
#[test]
fn thai_digits_grouped_as_number() {
assert_tokens("๑๒๓", &[("๑๒๓", TokenKind::Number)]);
}
#[test]
fn latin_run_stays_one_span() {
assert_tokens("hello", &[("hello", TokenKind::Latin)]);
}
#[test]
fn latin_case_mixed_stays_one_span() {
assert_tokens("Hello", &[("Hello", TokenKind::Latin)]);
}
// Fullwidth forms (U+FF21.. / U+FF41..) classify the same as ASCII Latin.
#[test]
fn fullwidth_latin_classified_as_latin() {
assert_tokens("Aa", &[("Aa", TokenKind::Latin)]);
}
#[test]
fn ascii_digits_grouped() {
assert_tokens("100", &[("100", TokenKind::Number)]);
}
// Fullwidth digits (U+FF10..=U+FF19) classify as Number.
#[test]
fn fullwidth_digits_classified_as_number() {
assert_tokens("123", &[("123", TokenKind::Number)]);
}
#[test]
fn space_tab_newline_grouped() {
assert_tokens(" \t\n", &[(" \t\n", TokenKind::Whitespace)]);
}
#[test]
fn nbsp_classified_as_whitespace() {
let nbsp = "\u{00A0}";
assert_tokens(nbsp, &[(nbsp, TokenKind::Whitespace)]);
}
#[test]
fn ideographic_space_classified_as_whitespace() {
let is = "\u{3000}";
assert_tokens(is, &[(is, TokenKind::Whitespace)]);
}
// Every ASCII punctuation character forms a one-char Punctuation token.
#[test]
fn ascii_punctuation_classified() {
for ch in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars() {
let s = ch.to_string();
let tokens = pre_tokenize(&s);
assert_eq!(tokens.len(), 1, "expected 1 token for {ch:?}");
assert_eq!(
tokens[0].kind,
TokenKind::Punctuation,
"wrong kind for {ch:?}"
);
}
}
#[test]
fn unicode_punctuation_em_dash() {
assert_tokens("—", &[("—", TokenKind::Punctuation)]);
}
#[test]
fn unicode_punctuation_ellipsis() {
assert_tokens("…", &[("…", TokenKind::Punctuation)]);
}
#[test]
fn basic_emoji_span() {
assert_tokens("😀", &[("😀", TokenKind::Emoji)]);
}
#[test]
fn emoji_run_stays_one_span() {
assert_tokens("😀🎉", &[("😀🎉", TokenKind::Emoji)]);
}
// U+2764 (heavy black heart) lies in the Misc Symbols/Dingbats range.
#[test]
fn misc_symbol_emoji() {
assert_tokens("❤", &[("❤", TokenKind::Emoji)]);
}
// Mixed-script realistic input: Thai / number / Thai.
#[test]
fn bank_example() {
assert_tokens(
"ธนาคาร100แห่ง",
&[
("ธนาคาร", TokenKind::Thai),
("100", TokenKind::Number),
("แห่ง", TokenKind::Thai),
],
);
}
#[test]
fn thai_space_latin() {
assert_tokens(
"สวัสดี hello",
&[
("สวัสดี", TokenKind::Thai),
(" ", TokenKind::Whitespace),
("hello", TokenKind::Latin),
],
);
}
#[test]
fn latin_number_thai() {
assert_tokens(
"hello123สวัสดี",
&[
("hello", TokenKind::Latin),
("123", TokenKind::Number),
("สวัสดี", TokenKind::Thai),
],
);
}
#[test]
fn all_kinds_in_sequence() {
assert_tokens(
"กิน 1 A!😀",
&[
("กิน", TokenKind::Thai),
(" ", TokenKind::Whitespace),
("1", TokenKind::Number),
(" ", TokenKind::Whitespace),
("A", TokenKind::Latin),
("!", TokenKind::Punctuation),
("😀", TokenKind::Emoji),
],
);
}
// Concatenating token texts must rebuild the original input exactly.
#[test]
fn spans_cover_full_input() {
let inputs = [
"ธนาคาร100แห่ง",
"hello world",
"สวัสดี 😀 123!",
"กิน\tข้าว\n",
"",
];
for input in inputs {
let rebuilt: String = pre_tokenize(input).iter().map(|t| t.text).collect();
assert_eq!(rebuilt, input, "coverage failed for {input:?}");
}
}
// Byte spans must slice back to the token text and land on char boundaries.
#[test]
fn span_byte_offsets_are_correct() {
let text = "ธนาคาร100แห่ง";
for tok in pre_tokenize(text) {
assert_eq!(
&text[tok.span.clone()],
tok.text,
"span mismatch: {:?}",
tok
);
assert!(
text.is_char_boundary(tok.span.start),
"span.start is not a char boundary"
);
assert!(
text.is_char_boundary(tok.span.end),
"span.end is not a char boundary"
);
}
}
#[test]
fn no_empty_tokens() {
let text = "กิน hello 123";
for tok in pre_tokenize(text) {
assert!(!tok.text.is_empty(), "empty token: {tok:?}");
}
}
// Byte spans of neighbouring tokens must touch (no gaps, no overlaps).
#[test]
fn adjacent_spans_are_contiguous() {
let text = "กิน hello 123!😀";
let tokens = pre_tokenize(text);
for pair in tokens.windows(2) {
assert_eq!(
pair[0].span.end, pair[1].span.start,
"gap between {:?} and {:?}",
pair[0], pair[1]
);
}
}
// Same contiguity invariant for the char-index spans.
#[test]
fn char_spans_are_contiguous() {
let text = "กิน hello 123!😀";
let tokens = pre_tokenize(text);
for pair in tokens.windows(2) {
assert_eq!(
pair[0].char_span.end, pair[1].char_span.start,
"char_span gap between {:?} and {:?}",
pair[0].text, pair[1].text
);
}
}
// char_span length must equal the number of scalar values in the text.
#[test]
fn char_span_len_matches_char_count() {
let text = "ธนาคาร100แห่ง";
for tok in pre_tokenize(text) {
assert_eq!(
tok.char_span.end - tok.char_span.start,
tok.text.chars().count(),
"char_span mismatch for {:?}",
tok.text
);
}
}
#[test]
fn char_span_mixed_script_offsets() {
let tokens = pre_tokenize("ธนาคาร100แห่ง");
assert_eq!(tokens[0].char_span, 0..6);
assert_eq!(tokens[1].char_span, 6..9);
assert_eq!(tokens[2].char_span, 9..13);
}
// An astral-plane emoji is 1 char but 4 UTF-8 bytes.
#[test]
fn char_span_emoji_counts_as_one_char() {
let tokens = pre_tokenize("😀");
assert_eq!(tokens[0].char_span, 0..1);
assert_eq!(tokens[0].span, 0..4);
}
// Direct spot checks of the classifier, one per kind.
#[test]
fn classify_char_spot_checks() {
assert_eq!(classify_char('ก'), TokenKind::Thai);
assert_eq!(classify_char('๑'), TokenKind::Number); assert_eq!(classify_char('a'), TokenKind::Latin);
assert_eq!(classify_char('Z'), TokenKind::Latin);
assert_eq!(classify_char('5'), TokenKind::Number);
assert_eq!(classify_char(' '), TokenKind::Whitespace);
assert_eq!(classify_char('\n'), TokenKind::Whitespace);
assert_eq!(classify_char('!'), TokenKind::Punctuation);
assert_eq!(classify_char('.'), TokenKind::Punctuation);
assert_eq!(classify_char('😀'), TokenKind::Emoji);
assert_eq!(classify_char('❤'), TokenKind::Emoji);
}
}