kham_core/segmenter.rs
//! DAG-based maximal matching segmenter (newmm algorithm).
//!
//! The segmenter builds a directed acyclic graph (DAG) of candidate words
//! over the input text, using TCC boundaries as the candidate split points,
//! then finds the path with the fewest unknown tokens, breaking ties by
//! fewer tokens, more dictionary matches, and higher cumulative frequency.
//!
//! ## Pipeline
//!
//! ```text
//! raw text
//!   │
//!   ▼ (optional) Tokenizer::normalize() ← fixes tone-mark dedup + Sara Am composition
//!   │
//!   ▼ pre_tokenize()
//! [Thai span] [Number span] [Latin span] …
//!   │
//!   ▼ (Thai spans only) tcc_boundaries()
//! TCC boundary positions: [0, b1, b2, …, len]
//!   │
//!   ▼ DP over boundary indices
//! path of (start, end) pairs with the fewest unknown tokens
//!   │
//!   ▼
//! Vec<Token<'_>>
//! ```
//!
//! ## Normalization and zero-copy
//!
//! [`Tokenizer::segment`] is zero-copy: every [`Token`] borrows directly from
//! the `&str` you pass in. This means `segment` cannot normalize the text
//! internally: normalization may reorder or remove characters, producing a
//! new allocation whose byte offsets no longer line up with the input.
//!
//! For input that may contain out-of-order vowels (สระลอย), stacked tone
//! marks, or decomposed Sara Am, use the two-step pattern:
//!
//! ```rust
//! use kham_core::Tokenizer;
//!
//! let tok = Tokenizer::new();
//! let normalized = tok.normalize("กเินข้าว"); // fix any encoding issues
//! let tokens = tok.segment(&normalized);      // tokens borrow `normalized`
//! ```

use alloc::vec;
use alloc::vec::Vec;

use crate::dict::{builtin_dict, Dict, BUILTIN_WORDS};
#[cfg(feature = "std")]
use crate::error::KhamError;
use crate::freq::FreqMap;
use crate::normalizer;
use crate::pre_tokenizer::pre_tokenize;
use crate::tcc::tcc_boundaries;
use crate::token::{Token, TokenKind};

/// High-level tokenizer. Holds a compiled dictionary and segmentation options.
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::new();
/// let tokens = tok.segment("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct Tokenizer {
    dict: Dict,
    freq: FreqMap,
    keep_whitespace: bool,
}

impl Tokenizer {
    /// Create a tokenizer with the built-in dictionary and TNC frequency table.
    pub fn new() -> Self {
        Self {
            dict: builtin_dict(),
            freq: FreqMap::builtin(),
            keep_whitespace: false,
        }
    }

    /// Normalize Thai text into canonical form.
    ///
    /// This is a convenience wrapper around [`normalizer::normalize`].
    /// Because [`segment`] is zero-copy, normalization must happen **before**
    /// segmentation. The caller owns the returned [`alloc::string::String`]
    /// and can then borrow it for [`segment`]:
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::new();
    /// // Input with decomposed Sara Am (nikhahit + sara aa)
    /// let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ้ + อํ + อา
    /// let normalized = tok.normalize(raw);   // น้ำ composed, no dedup needed here
    /// let tokens = tok.segment(&normalized); // tokens borrow `normalized`
    /// assert!(!tokens.is_empty());
    /// ```
    ///
    /// [`segment`]: Tokenizer::segment
    pub fn normalize(&self, text: &str) -> alloc::string::String {
        normalizer::normalize(text)
    }

    /// Return a [`TokenizerBuilder`] for custom configuration.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// // Use built-in dict (no extra words needed here)
    /// let tok = Tokenizer::builder().build();
    /// let tokens = tok.segment("สวัสดีชาวโลก");
    /// assert!(!tokens.is_empty());
    /// ```
    pub fn builder() -> TokenizerBuilder {
        TokenizerBuilder::default()
    }

    /// Segment `text` into tokens.
    ///
    /// Returns a `Vec<Token<'_>>` where every token's `text` is a
    /// zero-copy sub-slice of `text`.
    ///
    /// Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass
    /// through unchanged. Thai spans are segmented with the newmm DAG
    /// algorithm constrained to TCC boundaries.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::new();
    /// // Mixed Thai + number + Thai — number token lands at index 1
    /// let tokens = tok.segment("ธนาคาร100แห่ง");
    /// assert_eq!(tokens[1].text, "100");
    /// assert_eq!(tokens[1].kind, TokenKind::Number);
    /// ```
    ///
    /// For input without whitespace, joining all token texts reconstructs the
    /// original string (whitespace tokens are dropped by default, so any
    /// whitespace in the input would be missing from the joined result):
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::new();
    /// let text = "กินข้าวกับปลา";
    /// let tokens = tok.segment(text);
    /// let rebuilt: String = tokens.iter().map(|t| t.text).collect();
    /// assert_eq!(rebuilt, text);
    /// ```
    ///
    /// Every token carries both byte and char offsets into the original string:
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::new();
    /// let text = "ธนาคาร100แห่ง";
    /// let tokens = tok.segment(text);
    /// for t in &tokens {
    ///     // Byte span: valid UTF-8 slice
    ///     assert_eq!(&text[t.span.clone()], t.text);
    ///     // Char span: length matches Unicode scalar count
    ///     assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
    /// }
    /// ```
    pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>> {
        if text.is_empty() {
            return Vec::new();
        }

        // Split into script-homogeneous spans. Non-Thai spans pass through;
        // Thai spans go through the newmm DAG segmenter.
        // Call normalize() first if the input may contain out-of-order
        // vowels, stacked tone marks, or decomposed Sara Am.
        let pre_tokens = pre_tokenize(text);

        let mut result: Vec<Token<'t>> = Vec::with_capacity(pre_tokens.len() * 2);

        for token in pre_tokens {
            match token.kind {
                TokenKind::Thai => {
                    segment_thai(&self.dict, &self.freq, text, token.span, &mut result);
                }
                TokenKind::Whitespace if !self.keep_whitespace => {
                    // Discard whitespace tokens unless keep_whitespace is set.
                }
                _ => {
                    result.push(token);
                }
            }
        }

        result
    }
}

// ---------------------------------------------------------------------------
// newmm DAG segmentation — Thai spans only
// ---------------------------------------------------------------------------

/// Lexicographic DP score for a TCC boundary position.
///
/// Fields are ordered so that the derived `Ord` naturally expresses the
/// newmm preference:
/// 1. Minimise unknowns (fewer unknowns → `neg_unknowns` less negative → greater).
/// 2. Minimise total token count (prefer longer compounds over split components).
/// 3. Maximise dictionary matches.
/// 4. Maximise cumulative TNC frequency as the final tiebreaker.
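///
/// Since `Ord` is derived, comparison is lexicographic in field declaration
/// order. Two illustrative comparisons, written as
/// `(neg_unknowns, neg_tokens, dict_words, freq_score)` tuples:
///
/// ```text
/// (0, -3, 3, 10) > (-1, -1, 1, 999)   // fewer unknowns wins outright
/// (0, -1, 1, 10) > (0, -2, 2, 999)    // on a tie, fewer tokens wins
/// ```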
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DpScore {
    neg_unknowns: i32,
    neg_tokens: i32,
    dict_words: i32,
    freq_score: u64,
}

impl DpScore {
    const ZERO: Self = Self {
        neg_unknowns: 0,
        neg_tokens: 0,
        dict_words: 0,
        freq_score: 0,
    };

    fn dict_edge(self, freq: u32) -> Self {
        Self {
            dict_words: self.dict_words + 1,
            freq_score: self.freq_score + freq as u64,
            neg_tokens: self.neg_tokens - 1,
            ..self
        }
    }

    fn unknown_edge(self) -> Self {
        Self {
            neg_unknowns: self.neg_unknowns - 1,
            neg_tokens: self.neg_tokens - 1,
            ..self
        }
    }
}

/// Output of the forward DP pass.
struct DpTable {
    /// Predecessor boundary index for backtracking.
    from: Vec<usize>,
    /// Whether the incoming edge at index `i` was a dictionary match.
    is_dict: Vec<bool>,
}

/// Forward DP over TCC boundary indices for a single Thai slice.
///
/// `bounds` must be the output of [`tcc_boundaries`] for `slice`.
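///
/// Schematically, the relaxation performed at each reachable boundary `i`
/// (a sketch of the loop below, not extra API):
///
/// ```text
/// for every dict prefix p of slice[bounds[i]..] ending on a boundary j:
///     best[j] = max(best[j], best[i].dict_edge(freq(p)))
/// best[i + 1] = max(best[i + 1], best[i].unknown_edge())   // one-TCC fallback
/// ```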
fn forward_dp(dict: &Dict, freqs: &FreqMap, slice: &str, bounds: &[usize]) -> DpTable {
    let nb = bounds.len();
    let mut best: Vec<Option<DpScore>> = vec![None; nb];
    let mut from = vec![0usize; nb];
    let mut is_dict = vec![false; nb];

    best[0] = Some(DpScore::ZERO);

    for i in 0..nb - 1 {
        let score = match best[i] {
            Some(s) => s,
            None => continue,
        };
        let pos = bounds[i];
        let remaining = &slice[pos..];

        // Dictionary edges — all prefixes, not just the longest, so the DP
        // can make a globally optimal choice rather than a greedy one.
        for prefix in dict.prefixes(remaining) {
            let end_pos = pos + prefix.len();
            if let Ok(j) = bounds.binary_search(&end_pos) {
                let freq = freqs.get(prefix);
                let candidate = Some(score.dict_edge(freq));
                if candidate > best[j] {
                    best[j] = candidate;
                    from[j] = i;
                    is_dict[j] = true;
                }
            }
        }

        // Fallback edge: advance one TCC as an unknown token.
        let j = i + 1;
        let candidate = Some(score.unknown_edge());
        if candidate > best[j] {
            best[j] = candidate;
            from[j] = i;
            is_dict[j] = false;
        }
    }

    DpTable { from, is_dict }
}

/// Reconstruct the winning boundary-index path by following `from` pointers
/// from the last index back to 0, then reversing.
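///
/// For example, `from = [0, 0, 0, 1]` (a dictionary edge from boundary 1 to
/// boundary 3 won at index 3) backtracks 3 → 1 → 0 and yields `[0, 1, 3]`.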
fn backtrack_path(from: &[usize]) -> Vec<usize> {
    let nb = from.len();
    let mut path = Vec::with_capacity(nb);
    let mut cur = nb - 1;
    loop {
        path.push(cur);
        if cur == 0 {
            break;
        }
        cur = from[cur];
    }
    path.reverse();
    path
}

/// Segment a single Thai span using the newmm DAG algorithm and append tokens
/// to `out`.
///
/// Steps: TCC boundaries → forward DP → backtrack → emit tokens.
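///
/// Each consecutive pair on the winning path becomes one token: a path over
/// boundary indices `[0, 2, 3]` emits `bounds[0]..bounds[2]` and
/// `bounds[2]..bounds[3]`, shifted by `span.start` into `text` coordinates.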
fn segment_thai<'t>(
    dict: &Dict,
    freqs: &FreqMap,
    text: &'t str,
    span: core::ops::Range<usize>,
    out: &mut Vec<Token<'t>>,
) {
    let slice = &text[span.start..span.end];
    let bounds = tcc_boundaries(slice);

    if bounds.len() <= 1 {
        return;
    }

    let dp = forward_dp(dict, freqs, slice, &bounds);
    let path = backtrack_path(&dp.from);

    // Char offset of span.start — computed once, then incremented per token.
    let mut char_cursor = text[..span.start].chars().count();

    for w in path.windows(2) {
        let start_byte = span.start + bounds[w[0]];
        let end_byte = span.start + bounds[w[1]];
        let token_text = &text[start_byte..end_byte];
        let char_start = char_cursor;
        char_cursor += token_text.chars().count();
        let kind = if dp.is_dict[w[1]] {
            TokenKind::Thai
        } else {
            TokenKind::Unknown
        };
        out.push(Token::new(
            token_text,
            start_byte..end_byte,
            char_start..char_cursor,
            kind,
        ));
    }
}

// ---------------------------------------------------------------------------
// Tokenizer trait impls
// ---------------------------------------------------------------------------

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// TokenizerBuilder
// ---------------------------------------------------------------------------

/// Builder for [`Tokenizer`].
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::builder()
///     .keep_whitespace(true)
///     .build();
/// ```
#[derive(Debug, Default)]
pub struct TokenizerBuilder {
    dict_words: Option<alloc::string::String>,
    keep_whitespace: bool,
}

impl TokenizerBuilder {
    /// Load an additional word list from a string (newline-separated words).
    ///
    /// Words are merged with the built-in dictionary.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::builder()
    ///     .dict_words("ปัญญาประดิษฐ์\n")
    ///     .build();
    /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
    /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
    /// ```
    pub fn dict_words(mut self, words: &str) -> Self {
        self.dict_words = Some(alloc::string::String::from(words));
        self
    }

    /// Configure whether whitespace tokens are included in the output.
    ///
    /// Default: `false` (whitespace is discarded).
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::builder().keep_whitespace(true).build();
    /// let tokens = tok.segment("กิน ข้าว");
    /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    /// // Byte spans are contiguous when whitespace is kept
    /// for w in tokens.windows(2) {
    ///     assert_eq!(w[0].span.end, w[1].span.start);
    /// }
    /// ```
    pub fn keep_whitespace(mut self, keep: bool) -> Self {
        self.keep_whitespace = keep;
        self
    }

    /// Consume the builder and return a configured [`Tokenizer`].
    pub fn build(self) -> Tokenizer {
        let dict = if let Some(extra) = &self.dict_words {
            // Custom words: merge with built-in word list and rebuild.
            let mut combined = alloc::string::String::from(BUILTIN_WORDS);
            combined.push('\n');
            combined.push_str(extra);
            Dict::from_word_list(&combined)
        } else {
            // Default path: load from pre-compiled binary — O(S) copy.
            builtin_dict()
        };
        Tokenizer {
            dict,
            freq: FreqMap::builtin(),
            keep_whitespace: self.keep_whitespace,
        }
    }

    /// Try to load a custom word list from a file path.
    ///
    /// Only available when the `std` feature is enabled.
    ///
    /// # Errors
    ///
    /// Returns [`KhamError::DictLoadError`] if the file cannot be read.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::builder()
    ///     .dict_file("my_words.txt")
    ///     .expect("failed to load dict")
    ///     .build();
    /// ```
    #[cfg(feature = "std")]
    pub fn dict_file(self, path: &str) -> Result<Self, KhamError> {
        extern crate std;
        let content = std::fs::read_to_string(path)
            .map_err(|e| KhamError::DictLoadError(alloc::format!("{path}: {e}")))?;
        Ok(self.dict_words(&content))
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    fn tok() -> Tokenizer {
        Tokenizer::new()
    }

    // ── basic smoke tests ────────────────────────────────────────────────────

    #[test]
    fn empty_input() {
        assert!(tok().segment("").is_empty());
    }

    #[test]
    fn pure_latin_passthrough() {
        let tokens = tok().segment("hello");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].kind, TokenKind::Latin);
    }

    #[test]
    fn pure_number_passthrough() {
        let tokens = tok().segment("12345");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "12345");
        assert_eq!(tokens[0].kind, TokenKind::Number);
    }

    #[test]
    fn whitespace_dropped_by_default() {
        let tokens = tok().segment("กิน ข้าว");
        for t in &tokens {
            assert_ne!(t.kind, TokenKind::Whitespace);
        }
    }

    #[test]
    fn whitespace_kept_when_requested() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กิน ข้าว");
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    }

    // ── Thai segmentation ────────────────────────────────────────────────────

    #[test]
    fn gin_khao_gap_pla() {
        // "กินข้าวกับปลา" — all words must be in the built-in dict
        let tokens = tok().segment("กินข้าวกับปลา");
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        // Must segment into at least 2 tokens (dict has กิน, ข้าว, กับ, ปลา)
        assert!(words.len() >= 2, "expected multiple words, got {words:?}");
        // Reconstructing must yield the original string
        assert_eq!(words.join(""), "กินข้าวกับปลา");
    }

    #[test]
    fn mixed_thai_number_thai() {
        // Mixed-script input: Thai + digits + Thai
        let tokens = tok().segment("ธนาคาร100แห่ง");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "ธนาคาร100แห่ง");
        // "100" must survive as a Number token
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert_eq!(num.unwrap().text, "100");
    }

    #[test]
    fn mixed_thai_latin() {
        let tokens = tok().segment("สวัสดี hello");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        // Whitespace dropped by default
        assert_eq!(rebuilt, "สวัสดีhello");
        assert!(tokens
            .iter()
            .any(|t| t.kind == TokenKind::Latin && t.text == "hello"));
    }

    // ── span / byte-offset invariants ────────────────────────────────────────

    #[test]
    fn spans_cover_input_excluding_whitespace() {
        let text = "กินข้าว123hello";
        let tokens = tok().segment(text);
        // Every span must be a valid UTF-8 slice of `text`.
        for t in &tokens {
            assert_eq!(&text[t.span.clone()], t.text);
            assert!(text.is_char_boundary(t.span.start));
            assert!(text.is_char_boundary(t.span.end));
        }
    }

    #[test]
    fn adjacent_spans_are_contiguous() {
        let text = "กินข้าวกับปลา";
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment(text);
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].span.end, w[1].span.start,
                "gap between {:?} and {:?}",
                w[0], w[1]
            );
        }
    }

    #[test]
    fn no_empty_tokens() {
        let tokens = tok().segment("กินข้าวกับปลา 100 hello!");
        for t in &tokens {
            assert!(!t.text.is_empty());
        }
    }

    // ── custom dictionary ─────────────────────────────────────────────────────

    #[test]
    fn custom_dict_word_is_matched() {
        // Use a nonsense word that is not in the built-in dictionary and cannot
        // be decomposed into subwords — ensures the custom dict is actually used.
        let tok = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
        let tokens = tok.segment("กขคงจฉ");
        let thai: Vec<&str> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Thai)
            .map(|t| t.text)
            .collect();
        assert!(thai.contains(&"กขคงจฉ"), "got: {thai:?}");
    }

    // ── normalize then segment ────────────────────────────────────────────────

    #[test]
    fn normalize_deduplicates_tone_before_segment() {
        // กินข้าว with a doubled tone mark on ข — normalize fixes it, segment proceeds.
        let t = tok();
        // Insert a doubled tone on ข: ข + อ้ + อ้ (ข้้)
        let raw = "กิน\u{0E02}\u{0E49}\u{0E49}าว"; // กิน + ข้้ + าว
        let normalized = t.normalize(raw);
        let tokens = t.segment(&normalized);
        assert!(!tokens.is_empty());
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn normalize_clean_input_is_identity() {
        // normalize() on already-clean text should not change it.
        let t = tok();
        let clean = "กินข้าวกับปลา";
        assert_eq!(t.normalize(clean), clean);
    }

    #[test]
    fn segment_without_normalize_on_clean_input() {
        // segment() alone is sufficient when input is already canonical.
        let tokens = tok().segment("กินข้าวกับปลา");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "กินข้าวกับปลา");
    }

    // ── DpScore ordering ──────────────────────────────────────────────────────
    //
    // The score is a 4-field lexicographic key:
    //   1. neg_unknowns — fewer unknowns is strictly better
    //   2. neg_tokens   — fewer tokens (prefer longer compounds over split components)
    //   3. dict_words   — more dictionary matches breaks token-count ties
    //   4. freq_score   — higher cumulative TNC frequency as the final tiebreaker

    #[test]
    fn dp_score_fewer_unknowns_is_primary() {
        // An unknown edge lowers the primary field, so the unknown-free path wins.
        let no_unknown = DpScore::ZERO;
        let one_unknown = DpScore::ZERO.unknown_edge();
        assert!(no_unknown > one_unknown);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_more_dict_words() {
        // Fewer tokens wins over more dict matches: เดินทาง (1 token, 1 match)
        // beats เดิน+ทาง (2 tokens, 2 matches).
        let compound = DpScore::ZERO.dict_edge(0); // 1 token, 1 dict
        let split = DpScore::ZERO.dict_edge(0).dict_edge(0); // 2 tokens, 2 dict
        assert!(compound > split);
    }

    #[test]
    fn dp_score_higher_freq_breaks_token_tie() {
        // Same unknowns and token count; higher TNC freq wins.
        let low_freq = DpScore::ZERO.dict_edge(10);
        let high_freq = DpScore::ZERO.dict_edge(100);
        assert!(high_freq > low_freq);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_higher_freq() {
        // Fewer tokens wins even when the competing path has higher TNC frequency.
        let high_freq_more_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 200,
        };
        let low_freq_fewer_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -1,
            dict_words: 1,
            freq_score: 100,
        };
        assert!(low_freq_fewer_tokens > high_freq_more_tokens);
    }

    #[test]
    fn dp_score_more_dict_words_breaks_token_tie() {
        // Same unknowns and token count; more dict matches wins.
        let fewer_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 0,
        };
        let more_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 2,
            freq_score: 0,
        };
        assert!(more_dict > fewer_dict);
    }

    #[test]
    fn dict_edge_accumulates_freq_score() {
        let after_one = DpScore::ZERO.dict_edge(50);
        let after_two = after_one.dict_edge(30);
        assert_eq!(after_one.freq_score, 50);
        assert_eq!(after_two.freq_score, 80);
    }

    #[test]
    fn dict_edge_updates_dict_words_and_neg_tokens() {
        let s = DpScore::ZERO.dict_edge(0);
        assert_eq!(s.dict_words, 1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.neg_unknowns, 0);
    }

    #[test]
    fn unknown_edge_updates_neg_unknowns_and_neg_tokens() {
        let s = DpScore::ZERO.unknown_edge();
        assert_eq!(s.neg_unknowns, -1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.dict_words, 0);
        assert_eq!(s.freq_score, 0);
    }

    #[test]
    fn unknown_edge_does_not_contribute_freq() {
        let s = DpScore::ZERO.unknown_edge().unknown_edge();
        assert_eq!(s.freq_score, 0);
    }
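
    // The one-TCC fallback edge makes every boundary index reachable during
    // the forward pass, so backtracking must always recover a path from
    // boundary 0 to the last boundary, whatever the dictionary matched.
    // A sketch-level check of that invariant, reusing the nonsense string
    // from custom_dict_word_is_matched so the fallback edges are exercised.
    #[test]
    fn backtrack_path_spans_all_boundaries() {
        let dict = builtin_dict();
        let freqs = FreqMap::builtin();
        let slice = "กขคงจฉ";
        let bounds = tcc_boundaries(slice);
        let dp = forward_dp(&dict, &freqs, slice, &bounds);
        let path = backtrack_path(&dp.from);
        assert_eq!(*path.first().unwrap(), 0);
        assert_eq!(*path.last().unwrap(), bounds.len() - 1);
    }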

    // ── char_span invariants ──────────────────────────────────────────────────

    #[test]
    fn char_span_len_equals_char_count() {
        let tokens = tok().segment("กินข้าวกับปลา");
        for t in &tokens {
            assert_eq!(
                t.char_span.end - t.char_span.start,
                t.text.chars().count(),
                "char_span length mismatch for {:?}",
                t.text
            );
        }
    }

    #[test]
    fn char_spans_are_contiguous() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กินข้าว 100 hello");
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].char_span.end, w[1].char_span.start,
                "char_span gap between {:?} and {:?}",
                w[0].text, w[1].text
            );
        }
    }

    #[test]
    fn char_span_for_mixed_script() {
        // "ธนาคาร100แห่ง": ธนาคาร = 6 chars, 100 = 3 chars, แห่ง = 4 chars
        let tokens = tok().segment("ธนาคาร100แห่ง");
        assert_eq!(tokens[0].char_span, 0..6);
        assert_eq!(tokens[1].char_span, 6..9);
        assert_eq!(tokens[2].char_span, 9..13);
    }

    #[test]
    fn char_span_accounts_for_multibyte_chars() {
        // Each Thai codepoint is 3 bytes but 1 char.
        // "กิน" = 3 chars (9 bytes); char_span should be 0..3, span 0..9.
        let tokens = tok().segment("กิน");
        assert_eq!(tokens[0].span, 0..9);
        assert_eq!(tokens[0].char_span, 0..3);
    }

    #[test]
    fn char_span_emoji_is_single_char() {
        // 😀 = 1 char, 4 bytes — verify char_span counts it as 1.
        let tokens = tok().segment("😀");
        assert_eq!(tokens[0].char_len(), 1);
        assert_eq!(tokens[0].byte_len(), 4);
    }

    // ── edge cases ────────────────────────────────────────────────────────────

    #[test]
    fn single_thai_char() {
        let tokens = tok().segment("ก");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "ก");
    }

    #[test]
    fn sawasdee_chao_lok() {
        let tokens = tok().segment("สวัสดีชาวโลก");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "สวัสดีชาวโลก");
    }
}
827}