kham_core/segmenter.rs
//! DAG-based maximal matching segmenter (newmm algorithm).
//!
//! The segmenter builds a directed acyclic graph (DAG) over the input text
//! using TCC boundaries as candidate split points, then finds the path that
//! maximises the number of dictionary matches (fewest unknown tokens).
//!
//! ## Pipeline
//!
//! ```text
//! raw text
//!    │
//!    ▼ (optional) Tokenizer::normalize()  ← fixes tone dedup + Sara Am composition
//!    │
//!    ▼ pre_tokenize()
//! [Thai span] [Number span] [Latin span] …
//!    │
//!    ▼ (Thai spans only) tcc_boundaries()
//! TCC boundary positions: [0, b1, b2, …, len]
//!    │
//!    ▼ DP over boundary indices
//! path of (start, end) pairs that maximises dict matches
//!    │
//!    ▼
//! Vec<Token<'_>>
//! ```
//!
//! ## Normalization and zero-copy
//!
//! [`Tokenizer::segment`] is zero-copy: every [`Token`] borrows directly from
//! the `&str` you pass in. This means `segment()` cannot internally normalize
//! the text: normalization may reorder or remove characters, producing a new
//! allocation with different byte offsets.
//!
//! For input that may contain misordered leading vowels (สระลอย), stacked
//! tone marks, or decomposed Sara Am, use the two-step pattern:
//! ```rust
//! use kham_core::Tokenizer;
//!
//! let tok = Tokenizer::new();
//! let normalized = tok.normalize("กเินข้าว"); // fix any encoding issues
//! let tokens = tok.segment(&normalized);      // tokens borrow `normalized`
//! ```

use alloc::vec;
use alloc::vec::Vec;

use crate::dict::{builtin_dict, Dict, BUILTIN_WORDS};
#[cfg(feature = "std")]
use crate::error::KhamError;
use crate::freq::FreqMap;
use crate::normalizer;
use crate::pre_tokenizer::pre_tokenize;
use crate::tcc::tcc_boundaries;
use crate::token::{Token, TokenKind};

/// High-level tokenizer. Holds a compiled dictionary and segmentation options.
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::new();
/// let tokens = tok.segment("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct Tokenizer {
    dict: Dict,
    freq: FreqMap,
    keep_whitespace: bool,
}

impl Tokenizer {
    /// Create a tokenizer with the built-in dictionary and TNC frequency table.
    pub fn new() -> Self {
        Self {
            dict: builtin_dict(),
            freq: FreqMap::builtin(),
            keep_whitespace: false,
        }
    }

    /// Normalise Thai text into canonical form.
    ///
    /// This is a convenience wrapper around [`normalizer::normalize`].
    /// Because [`segment`] is zero-copy, normalization must happen **before**
    /// segmentation. The caller owns the returned [`alloc::string::String`]
    /// and can then borrow it for [`segment`]:
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::new();
    /// // Input with a doubled tone mark and decomposed Sara Am
    /// let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ้ + อํ + อา
    /// let normalized = tok.normalize(raw);   // น้ำ composed, no dedup needed here
    /// let tokens = tok.segment(&normalized); // tokens borrow `normalized`
    /// assert!(!tokens.is_empty());
    /// ```
    ///
    /// [`segment`]: Tokenizer::segment
    pub fn normalize(&self, text: &str) -> alloc::string::String {
        normalizer::normalize(text)
    }

    /// Return a [`TokenizerBuilder`] for custom configuration.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// // Use built-in dict (no extra words needed here)
    /// let tok = Tokenizer::builder().build();
    /// let tokens = tok.segment("สวัสดีชาวโลก");
    /// assert!(!tokens.is_empty());
    /// ```
    pub fn builder() -> TokenizerBuilder {
        TokenizerBuilder::default()
    }

    /// Segment `text` into tokens.
    ///
    /// Returns a `Vec<Token<'_>>` where every token's `text` is a
    /// zero-copy sub-slice of `text`.
    ///
    /// Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass
    /// through unchanged. Thai spans are segmented with the newmm DAG
    /// algorithm constrained to TCC boundaries.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::new();
    /// // Mixed Thai + number + Thai
    /// let tokens = tok.segment("ธนาคาร100แห่ง");
    /// assert_eq!(tokens[1].text, "100");
    /// assert_eq!(tokens[1].kind, TokenKind::Number);
    /// ```
    pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>> {
        if text.is_empty() {
            return Vec::new();
        }

        // Split into script-homogeneous spans. Non-Thai spans pass through;
        // Thai spans go through the newmm DAG segmenter.
        // Call normalize() first if the input may contain misordered leading
        // vowels (สระลอย), stacked tone marks, or decomposed Sara Am.
        let pre_tokens = pre_tokenize(text);

        let mut result: Vec<Token<'t>> = Vec::with_capacity(pre_tokens.len() * 2);

        for token in pre_tokens {
            match token.kind {
                TokenKind::Thai => {
                    segment_thai(&self.dict, &self.freq, text, token.span, &mut result);
                }
                TokenKind::Whitespace if !self.keep_whitespace => {
                    // Discard whitespace tokens unless keep_whitespace is set.
                }
                _ => {
                    result.push(token);
                }
            }
        }

        result
    }
}

// ---------------------------------------------------------------------------
// newmm DAG segmentation — Thai spans only
// ---------------------------------------------------------------------------

/// Lexicographic DP score for a TCC boundary position.
///
/// Fields are ordered so that `Ord` naturally expresses the newmm preference:
/// 1. Minimise unknowns (fewer unknowns → `neg_unknowns` less negative → greater).
/// 2. Minimise total token count (prefer longer compounds over split components).
/// 3. Maximise dictionary matches.
/// 4. Maximise cumulative TNC frequency as the final tiebreaker.
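///
/// For example (cf. the `DpScore` tests below): segmenting เดินทาง as one
/// compound token scores `(0, -1, 1, f)` against `(0, -2, 2, f')` for
/// เดิน + ทาง, so the compound wins on the second field regardless of
/// frequency.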
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DpScore {
    neg_unknowns: i32,
    neg_tokens: i32,
    dict_words: i32,
    freq_score: u64,
}

impl DpScore {
    const ZERO: Self = Self {
        neg_unknowns: 0,
        neg_tokens: 0,
        dict_words: 0,
        freq_score: 0,
    };

    fn dict_edge(self, freq: u32) -> Self {
        Self {
            dict_words: self.dict_words + 1,
            freq_score: self.freq_score + freq as u64,
            neg_tokens: self.neg_tokens - 1,
            ..self
        }
    }

    fn unknown_edge(self) -> Self {
        Self {
            neg_unknowns: self.neg_unknowns - 1,
            neg_tokens: self.neg_tokens - 1,
            ..self
        }
    }
}

/// Output of the forward DP pass.
struct DpTable {
    /// Predecessor boundary index for backtracking.
    from: Vec<usize>,
    /// Whether the incoming edge at index `i` was a dictionary match.
    is_dict: Vec<bool>,
}

/// Forward DP over TCC boundary indices for a single Thai slice.
///
/// `bounds` must be the output of [`tcc_boundaries`] for `slice`.
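///
/// A sketch of the recurrence (illustrative pseudocode, not the literal body):
///
/// ```text
/// best[j] = max of:
///     best[i].dict_edge(freq(w))  for each dict word w spanning bounds[i]..bounds[j]
///     best[i].unknown_edge()      for j == i + 1 (one-TCC fallback)
/// ```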
fn forward_dp(dict: &Dict, freqs: &FreqMap, slice: &str, bounds: &[usize]) -> DpTable {
    let nb = bounds.len();
    let mut best: Vec<Option<DpScore>> = vec![None; nb];
    let mut from = vec![0usize; nb];
    let mut is_dict = vec![false; nb];

    best[0] = Some(DpScore::ZERO);

    for i in 0..nb - 1 {
        let score = match best[i] {
            Some(s) => s,
            None => continue,
        };
        let pos = bounds[i];
        let remaining = &slice[pos..];

        // Dictionary edges — all prefixes, not just the longest, so the DP
        // can make a globally optimal choice rather than a greedy one.
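        // e.g. (illustrative) with remaining = "ข้าวกับปลา", every dictionary
        // word that is a prefix of the remaining text is tried, such as "ข้าว".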
        for prefix in dict.prefixes(remaining) {
            let end_pos = pos + prefix.len();
            if let Ok(j) = bounds.binary_search(&end_pos) {
                let freq = freqs.get(prefix);
                let candidate = Some(score.dict_edge(freq));
                if candidate > best[j] {
                    best[j] = candidate;
                    from[j] = i;
                    is_dict[j] = true;
                }
            }
        }

        // Fallback edge: advance one TCC as an unknown token.
        let j = i + 1;
        let candidate = Some(score.unknown_edge());
        if candidate > best[j] {
            best[j] = candidate;
            from[j] = i;
            is_dict[j] = false;
        }
    }

    DpTable { from, is_dict }
}

/// Reconstruct the winning boundary-index path by following `from` pointers
/// from the last index back to 0, then reversing.
fn backtrack_path(from: &[usize]) -> Vec<usize> {
    let nb = from.len();
    let mut path = Vec::with_capacity(nb);
    let mut cur = nb - 1;
    loop {
        path.push(cur);
        if cur == 0 {
            break;
        }
        cur = from[cur];
    }
    path.reverse();
    path
}

/// Segment a single Thai span using the newmm DAG algorithm and append tokens
/// to `out`.
///
/// Steps: TCC boundaries → forward DP → backtrack → emit tokens.
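///
/// For example (illustrative numbers): with `bounds = [0, 3, 9, 12]` and a
/// winning path `[0, 2, 3]`, the emitted tokens cover byte ranges `0..9` and
/// `9..12` of `slice` (offset by `span.start` within `text`).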
fn segment_thai<'t>(
    dict: &Dict,
    freqs: &FreqMap,
    text: &'t str,
    span: core::ops::Range<usize>,
    out: &mut Vec<Token<'t>>,
) {
    let slice = &text[span.start..span.end];
    let bounds = tcc_boundaries(slice);

    if bounds.len() <= 1 {
        return;
    }

    let dp = forward_dp(dict, freqs, slice, &bounds);
    let path = backtrack_path(&dp.from);

    // Char offset of span.start — computed once, then incremented per token.
    let mut char_cursor = text[..span.start].chars().count();

    for w in path.windows(2) {
        let start_byte = span.start + bounds[w[0]];
        let end_byte = span.start + bounds[w[1]];
        let token_text = &text[start_byte..end_byte];
        let char_start = char_cursor;
        char_cursor += token_text.chars().count();
        let kind = if dp.is_dict[w[1]] {
            TokenKind::Thai
        } else {
            TokenKind::Unknown
        };
        out.push(Token::new(
            token_text,
            start_byte..end_byte,
            char_start..char_cursor,
            kind,
        ));
    }
}

// ---------------------------------------------------------------------------
// Tokenizer trait impls
// ---------------------------------------------------------------------------

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// TokenizerBuilder
// ---------------------------------------------------------------------------

/// Builder for [`Tokenizer`].
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::builder()
///     .keep_whitespace(true)
///     .build();
/// ```
#[derive(Debug, Default)]
pub struct TokenizerBuilder {
    dict_words: Option<alloc::string::String>,
    keep_whitespace: bool,
}

impl TokenizerBuilder {
    /// Load an additional word list from a string (newline-separated words).
    ///
    /// Words are merged with the built-in dictionary.
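    ///
    /// # Example
    ///
    /// A minimal sketch using a nonsense word (the same trick as the unit
    /// tests below), so the match can only come from the custom list:
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
    /// let tokens = tok.segment("กขคงจฉ");
    /// assert!(tokens
    ///     .iter()
    ///     .any(|t| t.kind == TokenKind::Thai && t.text == "กขคงจฉ"));
    /// ```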
    pub fn dict_words(mut self, words: &str) -> Self {
        self.dict_words = Some(alloc::string::String::from(words));
        self
    }

    /// Configure whether whitespace tokens are included in the output.
    ///
    /// Default: `false` (whitespace is discarded).
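    ///
    /// # Example
    ///
    /// A small sketch mirroring the whitespace tests below:
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::builder().keep_whitespace(true).build();
    /// let tokens = tok.segment("กิน ข้าว");
    /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    /// ```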
    pub fn keep_whitespace(mut self, keep: bool) -> Self {
        self.keep_whitespace = keep;
        self
    }

    /// Consume the builder and return a configured [`Tokenizer`].
    pub fn build(self) -> Tokenizer {
        let dict = if let Some(extra) = &self.dict_words {
            // Custom words: merge with built-in word list and rebuild.
            let mut combined = alloc::string::String::from(BUILTIN_WORDS);
            combined.push('\n');
            combined.push_str(extra);
            Dict::from_word_list(&combined)
        } else {
            // Default path: load from pre-compiled binary — O(S) copy.
            builtin_dict()
        };
        Tokenizer {
            dict,
            freq: FreqMap::builtin(),
            keep_whitespace: self.keep_whitespace,
        }
    }

    /// Try to load a custom word list from a file path.
    ///
    /// Only available when the `std` feature is enabled.
    ///
    /// # Errors
    ///
    /// Returns [`KhamError::DictLoadError`] if the file cannot be read.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::builder()
    ///     .dict_file("my_words.txt")
    ///     .expect("failed to load dict")
    ///     .build();
    /// ```
    #[cfg(feature = "std")]
    pub fn dict_file(self, path: &str) -> Result<Self, KhamError> {
        extern crate std;
        let content = std::fs::read_to_string(path)
            .map_err(|e| KhamError::DictLoadError(alloc::format!("{path}: {e}")))?;
        Ok(self.dict_words(&content))
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    fn tok() -> Tokenizer {
        Tokenizer::new()
    }

    // ── basic smoke tests ────────────────────────────────────────────────────

    #[test]
    fn empty_input() {
        assert!(tok().segment("").is_empty());
    }

    #[test]
    fn pure_latin_passthrough() {
        let tokens = tok().segment("hello");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].kind, TokenKind::Latin);
    }

    #[test]
    fn pure_number_passthrough() {
        let tokens = tok().segment("12345");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "12345");
        assert_eq!(tokens[0].kind, TokenKind::Number);
    }

    #[test]
    fn whitespace_dropped_by_default() {
        let tokens = tok().segment("กิน ข้าว");
        for t in &tokens {
            assert_ne!(t.kind, TokenKind::Whitespace);
        }
    }

    #[test]
    fn whitespace_kept_when_requested() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กิน ข้าว");
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    }

    // ── Thai segmentation ────────────────────────────────────────────────────

    #[test]
    fn gin_khao_gap_pla() {
        // "กินข้าวกับปลา" — all words must be in the built-in dict
        let tokens = tok().segment("กินข้าวกับปลา");
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        // Must segment into at least 2 tokens (dict has กิน, ข้าว, กับ, ปลา)
        assert!(words.len() >= 2, "expected multiple words, got {words:?}");
        // Reconstructing must yield the original string
        assert_eq!(words.join(""), "กินข้าวกับปลา");
    }

    #[test]
    fn mixed_thai_number_thai() {
        // Classic CLAUDE.md example
        let tokens = tok().segment("ธนาคาร100แห่ง");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "ธนาคาร100แห่ง");
        // "100" must survive as a Number token
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert_eq!(num.unwrap().text, "100");
    }

    #[test]
    fn mixed_thai_latin() {
        let tokens = tok().segment("สวัสดี hello");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        // Whitespace dropped by default
        assert_eq!(rebuilt, "สวัสดีhello");
        assert!(tokens
            .iter()
            .any(|t| t.kind == TokenKind::Latin && t.text == "hello"));
    }

    // ── span / byte-offset invariants ────────────────────────────────────────

    #[test]
    fn spans_are_valid_slices_of_input() {
        let text = "กินข้าว123hello";
        let tokens = tok().segment(text);
        // Every span must be a valid UTF-8 slice of `text`.
        for t in &tokens {
            assert_eq!(&text[t.span.clone()], t.text);
            assert!(text.is_char_boundary(t.span.start));
            assert!(text.is_char_boundary(t.span.end));
        }
    }

    #[test]
    fn adjacent_spans_are_contiguous() {
        let text = "กินข้าวกับปลา";
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment(text);
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].span.end, w[1].span.start,
                "gap between {:?} and {:?}",
                w[0], w[1]
            );
        }
    }

    #[test]
    fn no_empty_tokens() {
        let tokens = tok().segment("กินข้าวกับปลา 100 hello!");
        for t in &tokens {
            assert!(!t.text.is_empty());
        }
    }

    // ── custom dictionary ─────────────────────────────────────────────────────

    #[test]
    fn custom_dict_word_is_matched() {
        // Use a nonsense word that is not in the built-in dictionary and cannot
        // be decomposed into subwords — ensures the custom dict is actually used.
        let tok = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
        let tokens = tok.segment("กขคงจฉ");
        let thai: Vec<&str> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Thai)
            .map(|t| t.text)
            .collect();
        assert!(thai.contains(&"กขคงจฉ"), "got: {thai:?}");
    }

    // ── normalize then segment ────────────────────────────────────────────────

    #[test]
    fn normalize_deduplicates_tone_before_segment() {
        // กินข้าว with a doubled tone mark on ข้ — normalize fixes it, segment proceeds.
        let t = tok();
        // Insert a doubled tone on ข: ข + อ้ + อ้ (ข้้)
        let raw = "กิน\u{0E02}\u{0E49}\u{0E49}าว"; // กิน + ข้้ + าว
        let normalized = t.normalize(raw);
        let tokens = t.segment(&normalized);
        assert!(!tokens.is_empty());
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn normalize_clean_input_is_identity() {
        // normalize() on already-clean text should not change it.
        let t = tok();
        let clean = "กินข้าวกับปลา";
        assert_eq!(t.normalize(clean), clean);
    }

    #[test]
    fn segment_without_normalize_on_clean_input() {
        // segment() alone is sufficient when input is already canonical.
        let tokens = tok().segment("กินข้าวกับปลา");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "กินข้าวกับปลา");
    }

    // ── DpScore ordering ──────────────────────────────────────────────────────
    //
    // The score is a 4-field lexicographic key:
    // 1. neg_unknowns — fewer unknowns is strictly better
    // 2. neg_tokens — fewer tokens (prefer longer compounds over split components)
    // 3. dict_words — more dictionary matches breaks token-count ties
    // 4. freq_score — higher cumulative TNC frequency as the final tiebreaker

    #[test]
    fn dp_score_fewer_unknowns_is_primary() {
        // A path with no unknowns beats one with unknowns regardless of other fields.
        let no_unknown = DpScore::ZERO;
        let one_unknown = DpScore::ZERO.unknown_edge();
        assert!(no_unknown > one_unknown);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_more_dict_words() {
        // Fewer tokens wins over more dict matches: เดินทาง (1 token, 1 match)
        // beats เดิน+ทาง (2 tokens, 2 matches).
        let compound = DpScore::ZERO.dict_edge(0); // 1 token, 1 dict
        let split = DpScore::ZERO.dict_edge(0).dict_edge(0); // 2 tokens, 2 dict
        assert!(compound > split);
    }

    #[test]
    fn dp_score_higher_freq_breaks_token_tie() {
        // Same unknowns and token count; higher TNC freq wins.
        let low_freq = DpScore::ZERO.dict_edge(10);
        let high_freq = DpScore::ZERO.dict_edge(100);
        assert!(high_freq > low_freq);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_higher_freq() {
        // Fewer tokens wins even when the competing path has higher TNC frequency.
        let high_freq_more_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 200,
        };
        let low_freq_fewer_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -1,
            dict_words: 1,
            freq_score: 100,
        };
        assert!(low_freq_fewer_tokens > high_freq_more_tokens);
    }

    #[test]
    fn dp_score_more_dict_words_breaks_token_tie() {
        // Same unknowns and token count; more dict matches wins.
        let fewer_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 0,
        };
        let more_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 2,
            freq_score: 0,
        };
        assert!(more_dict > fewer_dict);
    }

    #[test]
    fn dict_edge_accumulates_freq_score() {
        let after_one = DpScore::ZERO.dict_edge(50);
        let after_two = after_one.dict_edge(30);
        assert_eq!(after_one.freq_score, 50);
        assert_eq!(after_two.freq_score, 80);
    }

    #[test]
    fn dict_edge_increments_dict_words_and_neg_tokens() {
        let s = DpScore::ZERO.dict_edge(0);
        assert_eq!(s.dict_words, 1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.neg_unknowns, 0);
    }

    #[test]
    fn unknown_edge_increments_neg_unknowns_and_neg_tokens() {
        let s = DpScore::ZERO.unknown_edge();
        assert_eq!(s.neg_unknowns, -1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.dict_words, 0);
        assert_eq!(s.freq_score, 0);
    }

    #[test]
    fn unknown_edge_does_not_contribute_freq() {
        let s = DpScore::ZERO.unknown_edge().unknown_edge();
        assert_eq!(s.freq_score, 0);
    }

    // ── char_span invariants ──────────────────────────────────────────────────

    #[test]
    fn char_span_len_equals_char_count() {
        let tokens = tok().segment("กินข้าวกับปลา");
        for t in &tokens {
            assert_eq!(
                t.char_span.end - t.char_span.start,
                t.text.chars().count(),
                "char_span length mismatch for {:?}",
                t.text
            );
        }
    }

    #[test]
    fn char_spans_are_contiguous() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กินข้าว 100 hello");
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].char_span.end, w[1].char_span.start,
                "char_span gap between {:?} and {:?}",
                w[0].text, w[1].text
            );
        }
    }

    #[test]
    fn char_span_for_mixed_script() {
        // "ธนาคาร100แห่ง": ธนาคาร=6 chars, 100=3 chars, แห่ง=4 chars
        let tokens = tok().segment("ธนาคาร100แห่ง");
        assert_eq!(tokens[0].char_span, 0..6);
        assert_eq!(tokens[1].char_span, 6..9);
        assert_eq!(tokens[2].char_span, 9..13);
    }

    #[test]
    fn char_span_accounts_for_multibyte_chars() {
        // Each Thai codepoint is 3 bytes but 1 char.
        // "กิน" = 3 chars (9 bytes); char_span should be 0..3, span 0..9.
        let tokens = tok().segment("กิน");
        assert_eq!(tokens[0].span, 0..9);
        assert_eq!(tokens[0].char_span, 0..3);
    }

    #[test]
    fn char_span_emoji_is_single_char() {
        // 😀 = 1 char, 4 bytes — verify char_span counts it as 1.
        let tokens = tok().segment("😀");
        assert_eq!(tokens[0].char_len(), 1);
        assert_eq!(tokens[0].byte_len(), 4);
    }

    // ── edge cases ────────────────────────────────────────────────────────────

    #[test]
    fn single_thai_char() {
        let tokens = tok().segment("ก");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "ก");
    }

    #[test]
    fn sawasdee_chao_lok() {
        let tokens = tok().segment("สวัสดีชาวโลก");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "สวัสดีชาวโลก");
    }
}