use kham_core::{TokenKind, Tokenizer};
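
/// Loads data-driven cases from a `testdata/*.txt` file: one case per line,
/// in the form `<input>|<expected token 1>|<expected token 2>|...`, e.g. a
/// line like `กินข้าว|กิน|ข้าว`. Blank lines and `#`-comment lines are skipped.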
fn load_cases(path: &str) -> Vec<(String, Vec<String>)> {
    let content =
        std::fs::read_to_string(path).unwrap_or_else(|e| panic!("failed to read {path}: {e}"));
    content
        .lines()
        .filter(|l| !l.trim().is_empty() && !l.trim_start().starts_with('#'))
        .map(|line| {
            let mut parts = line.splitn(2, '|');
            let input = parts.next().unwrap_or("").to_string();
            let rest = parts.next().unwrap_or("");
            let expected: Vec<String> = rest.split('|').map(|s| s.to_string()).collect();
            (input, expected)
        })
        .collect()
}
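
/// Segments `input` and returns the text of every non-whitespace token.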
fn segment_texts(tok: &Tokenizer, input: &str) -> Vec<String> {
    tok.segment(input)
        .into_iter()
        .filter(|t| t.kind != TokenKind::Whitespace)
        .map(|t| t.text.to_string())
        .collect()
}
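
/// Builds the path of a file in this crate's `testdata/` directory.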
fn testdata(name: &str) -> String {
    let dir = env!("CARGO_MANIFEST_DIR");
    format!("{dir}/testdata/{name}")
}
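
// --- basic.txt: pure-Thai sentences ---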
#[test]
fn basic_sentences_split_correctly() {
    let tok = Tokenizer::new();
    for (input, expected) in load_cases(&testdata("basic.txt")) {
        let got = segment_texts(&tok, &input);
        assert_eq!(
            got, expected,
            "basic.txt: wrong split for {input:?}\n got: {got:?}\n expected: {expected:?}"
        );
    }
}

#[test]
fn basic_sentences_reconstruct_input() {
    let tok = Tokenizer::new();
    for (input, _) in load_cases(&testdata("basic.txt")) {
        let rebuilt: String = tok.segment(&input).iter().map(|t| t.text).collect();
        assert_eq!(
            rebuilt, input,
            "basic.txt: reconstruction failed for {input:?}"
        );
    }
}

#[test]
fn basic_sentences_all_tokens_are_thai_kind() {
    let tok = Tokenizer::new();
    for (input, _) in load_cases(&testdata("basic.txt")) {
        for token in tok.segment(&input) {
            assert_eq!(
                token.kind,
                TokenKind::Thai,
                "basic.txt: expected Thai kind for {token:?} in {input:?}"
            );
        }
    }
}
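
// --- mixed_script.txt: Thai mixed with Latin text and digits ---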
#[test]
fn mixed_script_split_correctly() {
    let tok = Tokenizer::new();
    for (input, expected) in load_cases(&testdata("mixed_script.txt")) {
        let got = segment_texts(&tok, &input);
        assert_eq!(
            got, expected,
            "mixed_script.txt: wrong split for {input:?}\n got: {got:?}\n expected: {expected:?}"
        );
    }
}

#[test]
fn mixed_script_reconstructs_without_whitespace() {
    let tok = Tokenizer::new();
    for (input, _) in load_cases(&testdata("mixed_script.txt")) {
        // Tokenizer::new() drops whitespace tokens (see edge_all_whitespace),
        // so the concatenated tokens should equal the input minus whitespace.
        let expected_rebuilt: String = input.chars().filter(|c| !c.is_whitespace()).collect();
        let rebuilt: String = tok.segment(&input).iter().map(|t| t.text).collect();
        assert_eq!(
            rebuilt, expected_rebuilt,
            "mixed_script.txt: reconstruction failed for {input:?}"
        );
    }
}

#[test]
fn mixed_script_number_spans_are_number_kind() {
    let tok = Tokenizer::new();
    let tokens = tok.segment("ธนาคาร100แห่ง");
    let num = tokens.iter().find(|t| t.text == "100");
    assert!(num.is_some(), "no Number token found");
    assert_eq!(num.unwrap().kind, TokenKind::Number);
}

#[test]
fn mixed_script_latin_spans_are_latin_kind() {
    let tok = Tokenizer::new();
    let tokens = tok.segment("สวัสดีhello");
    let lat = tokens.iter().find(|t| t.text == "hello");
    assert!(lat.is_some(), "no Latin token found");
    assert_eq!(lat.unwrap().kind, TokenKind::Latin);
}
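
// --- normalization.txt and hand-written normalization cases ---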
#[test]
fn normalization_idempotent_on_canonical_input() {
    let tok = Tokenizer::new();
    for (input, expected) in load_cases(&testdata("normalization.txt")) {
        let normalized = tok.normalize(&input);
        assert_eq!(
            normalized, input,
            "normalization.txt: normalize() changed canonical input {input:?}"
        );
        let got = segment_texts(&tok, &normalized);
        assert_eq!(
            got, expected,
            "normalization.txt: wrong split after normalize() for {input:?}"
        );
    }
}

#[test]
fn normalize_then_segment_deduplicates_tone_marks() {
    let tok = Tokenizer::new();
    // SARA AI MAIMALAI + PO PLA with the MAI EK tone mark (U+0E48) doubled;
    // normalize() drops the duplicate mark.
    let doubled_tone = "\u{0E44}\u{0E1B}\u{0E48}\u{0E48}";
    let normalized = tok.normalize(doubled_tone);
    assert_eq!(normalized, "\u{0E44}\u{0E1B}\u{0E48}");
    let tokens = tok.segment(&normalized);
    let rebuilt: String = tokens.iter().map(|t| t.text).collect();
    assert_eq!(rebuilt, normalized);
}

#[test]
fn normalize_then_segment_composes_sara_am() {
    let tok = Tokenizer::new();
    // Decomposed "น้ำ": NO NU + MAI THO + NIKHAHIT (U+0E4D) + SARA AA (U+0E32);
    // normalize() composes the trailing pair into SARA AM (U+0E33).
    let decomposed = "\u{0E19}\u{0E49}\u{0E4D}\u{0E32}";
    let normalized = tok.normalize(decomposed);
    assert_eq!(normalized, "\u{0E19}\u{0E49}\u{0E33}");
    let tokens = tok.segment(&normalized);
    assert_eq!(tokens.len(), 1);
    assert_eq!(tokens[0].text, "\u{0E19}\u{0E49}\u{0E33}");
    assert_eq!(tokens[0].kind, TokenKind::Thai);
}
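
// --- edge cases ---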
#[test]
fn edge_empty_string() {
    let tok = Tokenizer::new();
    assert!(tok.segment("").is_empty());
    assert_eq!(tok.normalize(""), "");
}

#[test]
fn edge_single_thai_char() {
    let tok = Tokenizer::new();
    let tokens = tok.segment("ก");
    assert_eq!(tokens.len(), 1);
    assert_eq!(tokens[0].text, "ก");
}

#[test]
fn edge_single_latin_char() {
    let tok = Tokenizer::new();
    let tokens = tok.segment("A");
    assert_eq!(tokens.len(), 1);
    assert_eq!(tokens[0].kind, TokenKind::Latin);
}

#[test]
fn edge_zero_width_chars_do_not_panic() {
    let tok = Tokenizer::new();
    // A ZERO WIDTH JOINER (U+200D) embedded between two Thai words.
    let input = "กิน\u{200D}ข้าว";
    let tokens = tok.segment(input);
    let rebuilt: String = tokens.iter().map(|t| t.text).collect();
    assert_eq!(rebuilt, input);
}

#[test]
fn edge_sara_e_floating_vowel() {
    let tok = Tokenizer::new();
    // U+0E40 THAI CHARACTER SARA E: a leading vowel with no following consonant.
    let input = "\u{0E40}";
    let tokens = tok.segment(input);
    assert!(!tokens.is_empty());
    let rebuilt: String = tokens.iter().map(|t| t.text).collect();
    assert_eq!(rebuilt, input);
}

#[test]
fn edge_repeated_tone_marks_only() {
    let tok = Tokenizer::new();
    // MAI EK, MAI THO, MAI TRI (U+0E48..U+0E4A) with no base consonant.
    let input = "\u{0E48}\u{0E49}\u{0E4A}";
    let tokens = tok.segment(input);
    let rebuilt: String = tokens.iter().map(|t| t.text).collect();
    assert_eq!(rebuilt, input);
}

#[test]
fn edge_all_whitespace() {
    let tok = Tokenizer::new();
    assert!(tok.segment(" \t\n").is_empty());
    let tokens = Tokenizer::builder()
        .keep_whitespace(true)
        .build()
        .segment(" ");
    assert_eq!(tokens.len(), 1);
    assert_eq!(tokens[0].kind, TokenKind::Whitespace);
}

#[test]
fn edge_mixed_thai_number_thai_no_spaces() {
    let tok = Tokenizer::new();
    let input = "ธนาคาร100แห่ง";
    let rebuilt: String = tok.segment(input).iter().map(|t| t.text).collect();
    assert_eq!(rebuilt, input);
}
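
// --- cross-cutting invariants ---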
#[test]
fn all_spans_valid_utf8_boundaries() {
    let tok = Tokenizer::new();
    let cases = ["กินข้าวกับปลา", "ธนาคาร100แห่ง", "สวัสดีhello123", "คนที่นี่ไปมา"];
    for input in cases {
        for token in tok.segment(input) {
            assert!(
                input.is_char_boundary(token.span.start),
                "span.start not a char boundary in {input:?}: {token:?}"
            );
            assert!(
                input.is_char_boundary(token.span.end),
                "span.end not a char boundary in {input:?}: {token:?}"
            );
            assert_eq!(
                &input[token.span.clone()],
                token.text,
                "span/text mismatch in {input:?}: {token:?}"
            );
        }
    }
}

#[test]
fn all_testdata_files() {
    let dir = format!("{}/testdata", env!("CARGO_MANIFEST_DIR"));
    let tok = Tokenizer::new();
    let mut entries: Vec<_> = std::fs::read_dir(&dir)
        .unwrap_or_else(|e| panic!("cannot read testdata dir {dir}: {e}"))
        .filter_map(|e| e.ok())
        .filter(|e| e.path().extension().is_some_and(|x| x == "txt"))
        .collect();
    entries.sort_by_key(|e| e.path());
    assert!(
        !entries.is_empty(),
        "testdata/ contains no .txt files; expected at least basic.txt"
    );
    let mut total_cases = 0usize;
    let mut failures: Vec<String> = Vec::new();
    for entry in &entries {
        let path = entry.path();
        let file_name = path.file_name().unwrap().to_string_lossy().into_owned();
        for (input, expected) in load_cases(&path.to_string_lossy()) {
            total_cases += 1;
            let got = segment_texts(&tok, &input);
            if got != expected {
                failures.push(format!(
                    "{file_name}: {input:?}\n got: {got:?}\n expected: {expected:?}"
                ));
            }
        }
    }
    assert!(
        failures.is_empty(),
        "{} / {} case(s) failed:\n{}",
        failures.len(),
        total_cases,
        failures.join("\n")
    );
}

#[test]
fn keep_whitespace_spans_are_contiguous() {
    let tok = Tokenizer::builder().keep_whitespace(true).build();
    let input = "กิน ข้าว 100 hello";
    let tokens = tok.segment(input);
    for w in tokens.windows(2) {
        assert_eq!(
            w[0].span.end, w[1].span.start,
            "gap between tokens in {input:?}: {:?} and {:?}",
            w[0], w[1]
        );
    }
}