kham_core/segmenter.rs
1//! DAG-based maximal matching segmenter (newmm algorithm).
2//!
//! The segmenter builds a directed acyclic graph (DAG) of candidate words over
//! the input text, using TCC boundaries as the only allowed split points, then
//! picks the path with the fewest unknown tokens (ties are broken by fewer
//! tokens, then more dictionary matches, then higher cumulative frequency).
6//!
7//! ## Pipeline
8//!
9//! ```text
10//! raw text
11//! │
12//! ▼ (optional) Tokenizer::normalize() ← fixes tone dedup + Sara Am composition
13//! │
14//! ▼ pre_tokenize()
15//! [Thai span] [Number span] [Latin span] …
16//! │
17//! ▼ (Thai spans only) tcc_boundaries()
18//! TCC boundary positions: [0, b1, b2, …, len]
19//! │
20//! ▼ DP over boundary indices
21//! path of (start, end) pairs that maximises dict matches
22//! │
23//! ▼
24//! Vec<Token<'_>>
25//! ```
26//!
27//! ## Normalization and zero-copy
28//!
29//! [`Tokenizer::segment`] is zero-copy: every [`Token`] borrows directly from
30//! the `&str` you pass in. This means segment() cannot internally normalize
31//! the text (normalization may reorder/remove characters, producing a new
32//! allocation with different byte offsets).
33//!
34//! For input that may contain สระลอย in wrong order, stacked tone marks, or
35//! decomposed Sara Am, use the two-step pattern:
36//!
37//! ```rust
38//! use kham_core::Tokenizer;
39//!
40//! let tok = Tokenizer::new();
41//! let normalized = tok.normalize("กเินข้าว"); // fix any encoding issues
42//! let tokens = tok.segment(&normalized); // tokens borrow `normalized`
43//! ```
44
45use alloc::vec;
46use alloc::vec::Vec;
47
48use crate::dict::{builtin_dict, Dict, BUILTIN_WORDS};
49use crate::error::KhamError;
50use crate::freq::FreqMap;
51use crate::normalizer;
52use crate::pre_tokenizer::pre_tokenize;
53use crate::tcc::tcc_boundaries;
54use crate::token::{Token, TokenKind};
55
56/// High-level tokenizer. Holds a compiled dictionary and segmentation options.
57///
58/// # Example
59///
60/// ```rust
61/// use kham_core::Tokenizer;
62///
63/// let tok = Tokenizer::new();
64/// let tokens = tok.segment("กินข้าวกับปลา");
65/// assert!(!tokens.is_empty());
66/// ```
67pub struct Tokenizer {
68 dict: Dict,
69 freq: FreqMap,
70 keep_whitespace: bool,
71}
72
73impl Tokenizer {
74 /// Create a tokenizer with the built-in dictionary and TNC frequency table.
75 pub fn new() -> Self {
76 Self {
77 dict: builtin_dict(),
78 freq: FreqMap::builtin(),
79 keep_whitespace: false,
80 }
81 }
82
83 /// Normalise Thai text into canonical form.
84 ///
85 /// This is a convenience wrapper around [`normalizer::normalize`].
86 /// Because [`segment`] is zero-copy, normalization must happen **before**
87 /// segmentation. The caller owns the returned [`alloc::string::String`] and can then
88 /// borrow it for [`segment`]:
89 ///
90 /// ```rust
91 /// use kham_core::Tokenizer;
92 ///
93 /// let tok = Tokenizer::new();
94 /// // Input with a doubled tone mark and decomposed Sara Am
95 /// let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ้ + อํ + อา
96 /// let normalized = tok.normalize(raw); // น้ำ composed, no dedup needed here
97 /// let tokens = tok.segment(&normalized); // tokens borrow `normalized`
98 /// assert!(!tokens.is_empty());
99 /// ```
100 ///
101 /// [`segment`]: Tokenizer::segment
102 pub fn normalize(&self, text: &str) -> alloc::string::String {
103 normalizer::normalize(text)
104 }
105
106 /// Return a [`TokenizerBuilder`] for custom configuration.
107 ///
108 /// # Example
109 ///
110 /// ```rust
111 /// use kham_core::Tokenizer;
112 ///
113 /// // Use built-in dict (no extra words needed here)
114 /// let tok = Tokenizer::builder().build();
115 /// let tokens = tok.segment("สวัสดีชาวโลก");
116 /// assert!(!tokens.is_empty());
117 /// ```
118 pub fn builder() -> TokenizerBuilder {
119 TokenizerBuilder::default()
120 }
121
122 /// Segment `text` into tokens.
123 ///
124 /// Returns a `Vec<Token<'_>>` where every token's `text` is a
125 /// zero-copy sub-slice of `text`.
126 ///
127 /// Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass
128 /// through unchanged. Thai spans are segmented with the newmm DAG
129 /// algorithm constrained to TCC boundaries.
130 ///
131 /// # Examples
132 ///
133 /// ```rust
134 /// use kham_core::{Tokenizer, TokenKind};
135 ///
136 /// let tok = Tokenizer::new();
137 /// // Mixed Thai + number + Thai — number token lands at index 1
138 /// let tokens = tok.segment("ธนาคาร100แห่ง");
139 /// assert_eq!(tokens[1].text, "100");
140 /// assert_eq!(tokens[1].kind, TokenKind::Number);
141 /// ```
142 ///
143 /// Joining all token texts reconstructs the original string (whitespace
144 /// is dropped by default, so the joined result omits whitespace):
145 ///
146 /// ```rust
147 /// use kham_core::Tokenizer;
148 ///
149 /// let tok = Tokenizer::new();
150 /// let text = "กินข้าวกับปลา";
151 /// let tokens = tok.segment(text);
152 /// let rebuilt: String = tokens.iter().map(|t| t.text).collect();
153 /// assert_eq!(rebuilt, text);
154 /// ```
155 ///
156 /// Every token carries both byte and char offsets into the original string:
157 ///
158 /// ```rust
159 /// use kham_core::Tokenizer;
160 ///
161 /// let tok = Tokenizer::new();
162 /// let text = "ธนาคาร100แห่ง";
163 /// let tokens = tok.segment(text);
164 /// for t in &tokens {
165 /// // Byte span: valid UTF-8 slice
166 /// assert_eq!(&text[t.span.clone()], t.text);
167 /// // Char span: length matches Unicode scalar count
168 /// assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
169 /// }
170 /// ```
171 pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>> {
172 if text.is_empty() {
173 return Vec::new();
174 }
175
176 // Split into script-homogeneous spans. Non-Thai spans pass through;
177 // Thai spans go through the newmm DAG segmenter.
178 // Call normalize() first if the input may contain สระลอย in wrong
179 // order, stacked tone marks, or decomposed Sara Am.
180 let pre_tokens = pre_tokenize(text);
181
182 let mut result: Vec<Token<'t>> = Vec::with_capacity(pre_tokens.len() * 2);
183
184 for token in pre_tokens {
185 match token.kind {
186 TokenKind::Thai => {
187 segment_thai(&self.dict, &self.freq, text, token.span, &mut result);
188 }
189 TokenKind::Whitespace if !self.keep_whitespace => {
190 // Discard whitespace tokens unless keep_whitespace is set.
191 }
192 _ => {
193 result.push(token);
194 }
195 }
196 }
197
198 result
199 }
200}
201
202// ---------------------------------------------------------------------------
203// newmm DAG segmentation — Thai spans only
204// ---------------------------------------------------------------------------
205
/// Score of a partial segmentation ending at some TCC boundary.
///
/// The derived `Ord` compares fields in declaration order, so the field order
/// below *is* the newmm preference and must not be rearranged:
/// 1. `neg_unknowns` — fewer unknown tokens wins (stored negated, so a value
///    closer to zero compares greater).
/// 2. `neg_tokens` — then fewer tokens overall (longer compounds beat their
///    split components).
/// 3. `dict_words` — then more dictionary matches.
/// 4. `freq_score` — finally the higher cumulative TNC frequency.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DpScore {
    neg_unknowns: i32,
    neg_tokens: i32,
    dict_words: i32,
    freq_score: u64,
}

impl DpScore {
    /// Score of the empty path (nothing consumed yet).
    const ZERO: Self = Self {
        neg_unknowns: 0,
        neg_tokens: 0,
        dict_words: 0,
        freq_score: 0,
    };

    /// Score after appending one dictionary word whose frequency is `freq`.
    fn dict_edge(mut self, freq: u32) -> Self {
        self.neg_tokens -= 1;
        self.dict_words += 1;
        self.freq_score += u64::from(freq);
        self
    }

    /// Score after appending one unknown (out-of-dictionary) token.
    fn unknown_edge(mut self) -> Self {
        self.neg_unknowns -= 1;
        self.neg_tokens -= 1;
        self
    }
}
246
/// Output of the forward DP pass.
///
/// Both vectors hold one entry per TCC boundary index. Entry `j` describes
/// the best edge found *into* boundary `j`. Index 0 is the start of the span
/// and has no incoming edge, so its entries keep their initial values
/// (`0` / `false`).
struct DpTable {
    /// Predecessor boundary index for backtracking: the best path reaching
    /// boundary `j` arrives from boundary `from[j]`.
    from: Vec<usize>,
    /// Whether the incoming edge at index `i` was a dictionary match
    /// (`false` means the edge is a single-TCC unknown token).
    is_dict: Vec<bool>,
}
254
255/// Forward DP over TCC boundary indices for a single Thai slice.
256///
257/// `bounds` must be the output of [`tcc_boundaries`] for `slice`.
258fn forward_dp(dict: &Dict, freqs: &FreqMap, slice: &str, bounds: &[usize]) -> DpTable {
259 let nb = bounds.len();
260 let mut best: Vec<Option<DpScore>> = vec![None; nb];
261 let mut from = vec![0usize; nb];
262 let mut is_dict = vec![false; nb];
263
264 best[0] = Some(DpScore::ZERO);
265
266 for i in 0..nb - 1 {
267 let score = match best[i] {
268 Some(s) => s,
269 None => continue,
270 };
271 let pos = bounds[i];
272 let remaining = &slice[pos..];
273
274 // Dictionary edges — all prefixes, not just the longest, so the DP
275 // can make a globally optimal choice rather than a greedy one.
276 for prefix in dict.prefixes(remaining) {
277 let end_pos = pos + prefix.len();
278 if let Ok(j) = bounds.binary_search(&end_pos) {
279 let freq = freqs.get(prefix);
280 let candidate = Some(score.dict_edge(freq));
281 if candidate > best[j] {
282 best[j] = candidate;
283 from[j] = i;
284 is_dict[j] = true;
285 }
286 }
287 }
288
289 // Fallback edge: advance one TCC as an unknown token.
290 let j = i + 1;
291 let candidate = Some(score.unknown_edge());
292 if candidate > best[j] {
293 best[j] = candidate;
294 from[j] = i;
295 is_dict[j] = false;
296 }
297 }
298
299 DpTable { from, is_dict }
300}
301
/// Reconstruct the winning boundary-index path by following `from` pointers
/// from the last index back to 0, then reversing.
///
/// `from[j]` is the predecessor of boundary `j`. The returned path is in
/// ascending order, starting at `0` and ending at `from.len() - 1`.
///
/// An empty `from` table yields an empty path instead of underflowing on
/// `len - 1`.
///
/// The walk terminates as long as every non-zero index points to a smaller
/// index, which is what the forward pass produces.
fn backtrack_path(from: &[usize]) -> Vec<usize> {
    let mut cur = match from.len().checked_sub(1) {
        Some(last) => last,
        // No boundaries at all: there is nothing to walk.
        None => return Vec::new(),
    };

    let mut path = Vec::with_capacity(from.len());
    path.push(cur);
    while cur != 0 {
        cur = from[cur];
        path.push(cur);
    }

    // Collected end-to-start; callers want start-to-end.
    path.reverse();
    path
}
318
319/// Segment a single Thai span using the newmm DAG algorithm and append tokens
320/// to `out`.
321///
322/// Steps: TCC boundaries → forward DP → backtrack → emit tokens.
323fn segment_thai<'t>(
324 dict: &Dict,
325 freqs: &FreqMap,
326 text: &'t str,
327 span: core::ops::Range<usize>,
328 out: &mut Vec<Token<'t>>,
329) {
330 let slice = &text[span.start..span.end];
331 let bounds = tcc_boundaries(slice);
332
333 if bounds.len() <= 1 {
334 return;
335 }
336
337 let dp = forward_dp(dict, freqs, slice, &bounds);
338 let path = backtrack_path(&dp.from);
339
340 // Char offset of span.start — computed once, then incremented per token.
341 let mut char_cursor = text[..span.start].chars().count();
342
343 for w in path.windows(2) {
344 let start_byte = span.start + bounds[w[0]];
345 let end_byte = span.start + bounds[w[1]];
346 let token_text = &text[start_byte..end_byte];
347 let char_start = char_cursor;
348 char_cursor += token_text.chars().count();
349 let kind = if dp.is_dict[w[1]] {
350 TokenKind::Thai
351 } else {
352 TokenKind::Unknown
353 };
354 out.push(Token::new(
355 token_text,
356 start_byte..end_byte,
357 char_start..char_cursor,
358 kind,
359 ));
360 }
361}
362
363// ---------------------------------------------------------------------------
364// Tokenizer trait impls
365// ---------------------------------------------------------------------------
366
367impl Default for Tokenizer {
368 fn default() -> Self {
369 Self::new()
370 }
371}
372
373// ---------------------------------------------------------------------------
374// TokenizerBuilder
375// ---------------------------------------------------------------------------
376
377/// Builder for [`Tokenizer`].
378///
379/// # Example
380///
381/// ```rust
382/// use kham_core::Tokenizer;
383///
384/// let tok = Tokenizer::builder()
385/// .keep_whitespace(true)
386/// .build();
387/// ```
388#[derive(Debug, Default)]
389pub struct TokenizerBuilder {
390 dict_words: Option<alloc::string::String>,
391 dict_merge: Option<alloc::string::String>,
392 keep_whitespace: bool,
393}
394
395impl TokenizerBuilder {
396 /// Load an additional word list from a string (newline-separated words).
397 ///
398 /// Words are merged with the built-in dictionary.
399 ///
400 /// # Example
401 ///
402 /// ```rust
403 /// use kham_core::{Tokenizer, TokenKind};
404 ///
405 /// let tok = Tokenizer::builder()
406 /// .dict_words("ปัญญาประดิษฐ์\n")
407 /// .build();
408 /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
409 /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
410 /// ```
411 pub fn dict_words(mut self, words: &str) -> Self {
412 self.dict_words = Some(alloc::string::String::from(words));
413 self
414 }
415
416 /// Configure whether whitespace tokens are included in the output.
417 ///
418 /// Default: `false` (whitespace is discarded).
419 ///
420 /// # Example
421 ///
422 /// ```rust
423 /// use kham_core::{Tokenizer, TokenKind};
424 ///
425 /// let tok = Tokenizer::builder().keep_whitespace(true).build();
426 /// let tokens = tok.segment("กิน ข้าว");
427 /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
428 /// // Byte spans are contiguous when whitespace is kept
429 /// for w in tokens.windows(2) {
430 /// assert_eq!(w[0].span.end, w[1].span.start);
431 /// }
432 /// ```
433 /// Add extra words via a lightweight overlay — no trie rebuild.
434 ///
435 /// Words are stored in a sorted list alongside the pre-compiled trie.
436 /// This is O(k log k) in the number of custom words and avoids the O(N)
437 /// full trie rebuild that [`dict_words`](Self::dict_words) performs.
438 ///
439 /// Prefer `dict_merge` over `dict_words` when adding a small custom
440 /// vocabulary (e.g. domain-specific terms, product names).
441 ///
442 /// If both `dict_merge` and `dict_words` are called, `dict_words` takes
443 /// precedence (it performs a full rebuild that subsumes any overlay).
444 ///
445 /// # Example
446 ///
447 /// ```rust
448 /// use kham_core::{Tokenizer, TokenKind};
449 ///
450 /// let tok = Tokenizer::builder()
451 /// .dict_merge("ปัญญาประดิษฐ์\nโปรแกรมเมอร์\n")
452 /// .build();
453 /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
454 /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
455 /// ```
456 pub fn dict_merge(mut self, words: &str) -> Self {
457 self.dict_merge = Some(alloc::string::String::from(words));
458 self
459 }
460
461 /// Configure whether whitespace tokens are included in the output.
462 ///
463 /// Default: `false` (whitespace is discarded).
464 ///
465 /// # Example
466 ///
467 /// ```rust
468 /// use kham_core::{Tokenizer, TokenKind};
469 ///
470 /// let tok = Tokenizer::builder().keep_whitespace(true).build();
471 /// let tokens = tok.segment("กิน ข้าว");
472 /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
473 /// // Byte spans are contiguous when whitespace is kept
474 /// for w in tokens.windows(2) {
475 /// assert_eq!(w[0].span.end, w[1].span.start);
476 /// }
477 /// ```
478 pub fn keep_whitespace(mut self, keep: bool) -> Self {
479 self.keep_whitespace = keep;
480 self
481 }
482
483 /// Consume the builder and return a configured [`Tokenizer`].
484 pub fn build(self) -> Tokenizer {
485 let dict = if let Some(extra) = &self.dict_words {
486 // Full rebuild path: merges BUILTIN_WORDS + custom words into a new trie.
487 let mut combined = alloc::string::String::from(BUILTIN_WORDS);
488 combined.push('\n');
489 combined.push_str(extra);
490 Dict::from_word_list(&combined)
491 } else if let Some(overlay) = &self.dict_merge {
492 // Fast overlay path: load pre-compiled binary, attach small sorted list.
493 builtin_dict().with_overlay(overlay)
494 } else {
495 // Default path: load from pre-compiled binary — O(S) copy.
496 builtin_dict()
497 };
498 Tokenizer {
499 dict,
500 freq: FreqMap::builtin(),
501 keep_whitespace: self.keep_whitespace,
502 }
503 }
504
505 /// Try to load a custom word list from a file path.
506 ///
507 /// Only available when the `std` feature is enabled.
508 ///
509 /// # Errors
510 ///
511 /// Returns [`KhamError::DictLoadError`] if the file cannot be read.
512 ///
513 /// # Example
514 ///
515 /// ```rust,no_run
516 /// use kham_core::Tokenizer;
517 ///
518 /// let tok = Tokenizer::builder()
519 /// .dict_file("my_words.txt")
520 /// .expect("failed to load dict")
521 /// .build();
522 /// ```
523 #[cfg(feature = "std")]
524 pub fn dict_file(self, path: &str) -> Result<Self, KhamError> {
525 extern crate std;
526 let content = std::fs::read_to_string(path)
527 .map_err(|e| KhamError::DictLoadError(alloc::format!("{path}: {e}")))?;
528 Ok(self.dict_words(&content))
529 }
530}
531
532// ---------------------------------------------------------------------------
533// Tests
534// ---------------------------------------------------------------------------
535
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizer with default settings (built-in dictionary, whitespace dropped).
    fn tok() -> Tokenizer {
        Tokenizer::new()
    }

    /// Tokenizer that keeps whitespace tokens in its output.
    fn keep_ws_tok() -> Tokenizer {
        Tokenizer::builder().keep_whitespace(true).build()
    }

    /// Concatenate the text of every token, in order.
    fn joined(tokens: &[Token<'_>]) -> alloc::string::String {
        tokens.iter().map(|t| t.text).collect()
    }

    // ── basic smoke tests ────────────────────────────────────────────────────

    #[test]
    fn empty_input() {
        let tokens = tok().segment("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn pure_latin_passthrough() {
        let tokens = tok().segment("hello");
        assert_eq!(tokens.len(), 1);
        let only = &tokens[0];
        assert_eq!(only.text, "hello");
        assert_eq!(only.kind, TokenKind::Latin);
    }

    #[test]
    fn pure_number_passthrough() {
        let tokens = tok().segment("12345");
        assert_eq!(tokens.len(), 1);
        let only = &tokens[0];
        assert_eq!(only.text, "12345");
        assert_eq!(only.kind, TokenKind::Number);
    }

    #[test]
    fn whitespace_dropped_by_default() {
        let tokens = tok().segment("กิน ข้าว");
        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
    }

    #[test]
    fn whitespace_kept_when_requested() {
        let tokens = keep_ws_tok().segment("กิน ข้าว");
        let has_whitespace = tokens.iter().any(|t| t.kind == TokenKind::Whitespace);
        assert!(has_whitespace);
    }

    // ── Thai segmentation ────────────────────────────────────────────────────

    #[test]
    fn gin_khao_gap_pla() {
        // The built-in dictionary is expected to contain กิน, ข้าว, กับ and ปลา,
        // so the sentence must split into more than one token.
        let input = "กินข้าวกับปลา";
        let tokens = tok().segment(input);
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        assert!(words.len() >= 2, "expected multiple words, got {words:?}");
        // Gluing the pieces back together must give the input again.
        assert_eq!(words.concat(), input);
    }

    #[test]
    fn mixed_thai_number_thai() {
        // Thai / digits / Thai: the digits must come out as one Number token.
        let input = "ธนาคาร100แห่ง";
        let tokens = tok().segment(input);
        assert_eq!(joined(&tokens), input);
        let first_number = tokens
            .iter()
            .find(|t| t.kind == TokenKind::Number)
            .map(|t| t.text);
        assert_eq!(first_number, Some("100"));
    }

    #[test]
    fn mixed_thai_latin() {
        let tokens = tok().segment("สวัสดี hello");
        // The space between the two words is dropped by default.
        assert_eq!(joined(&tokens), "สวัสดีhello");
        let has_latin_hello = tokens
            .iter()
            .any(|t| t.text == "hello" && t.kind == TokenKind::Latin);
        assert!(has_latin_hello);
    }

    // ── span / byte-offset invariants ────────────────────────────────────────

    #[test]
    fn spans_cover_input_excluding_whitespace() {
        // Checks that each span is a valid UTF-8 slice of the input that
        // reproduces the token text (coverage itself is not asserted here).
        let text = "กินข้าว123hello";
        for t in tok().segment(text).iter() {
            let (start, end) = (t.span.start, t.span.end);
            assert_eq!(&text[t.span.clone()], t.text);
            assert!(text.is_char_boundary(start));
            assert!(text.is_char_boundary(end));
        }
    }

    #[test]
    fn adjacent_spans_are_contiguous() {
        let text = "กินข้าวกับปลา";
        let tokens = keep_ws_tok().segment(text);
        for pair in tokens.windows(2) {
            let (left, right) = (&pair[0], &pair[1]);
            assert_eq!(
                left.span.end, right.span.start,
                "gap between {:?} and {:?}",
                left, right
            );
        }
    }

    #[test]
    fn no_empty_tokens() {
        let tokens = tok().segment("กินข้าวกับปลา 100 hello!");
        assert!(tokens.iter().all(|t| !t.text.is_empty()));
    }

    // ── custom dictionary ─────────────────────────────────────────────────────

    #[test]
    fn custom_dict_word_is_matched() {
        // A nonsense word that the built-in dictionary does not contain and
        // that cannot be assembled from sub-words: it can only come out as a
        // Thai token if the custom word list is really consulted.
        let custom = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
        let tokens = custom.segment("กขคงจฉ");
        let thai: Vec<&str> = tokens
            .iter()
            .filter_map(|t| {
                if t.kind == TokenKind::Thai {
                    Some(t.text)
                } else {
                    None
                }
            })
            .collect();
        assert!(thai.contains(&"กขคงจฉ"), "got: {thai:?}");
    }

    // ── normalize then segment ────────────────────────────────────────────────

    #[test]
    fn normalize_deduplicates_tone_before_segment() {
        // กินข้าว with the tone mark on ข typed twice (ข + ◌้ + ◌้): normalize
        // first, then segment the normalized string.
        let tokenizer = tok();
        let raw = "กิน\u{0E02}\u{0E49}\u{0E49}าว"; // กิน + ข้้ + าว
        let normalized = tokenizer.normalize(raw);
        let tokens = tokenizer.segment(&normalized);
        assert!(!tokens.is_empty());
        assert_eq!(joined(&tokens), normalized);
    }

    #[test]
    fn normalize_clean_input_is_identity() {
        // Text that is already canonical must come back unchanged.
        let clean = "กินข้าวกับปลา";
        let normalized = tok().normalize(clean);
        assert_eq!(normalized, clean);
    }

    #[test]
    fn segment_without_normalize_on_clean_input() {
        // For canonical input, segment() on its own is enough.
        let input = "กินข้าวกับปลา";
        let tokens = tok().segment(input);
        assert_eq!(joined(&tokens), input);
    }

    // ── DpScore ordering ──────────────────────────────────────────────────────
    //
    // The score is a 4-field lexicographic key:
    //   1. neg_unknowns — fewer unknowns is strictly better
    //   2. neg_tokens   — fewer tokens (prefer longer compounds over split components)
    //   3. dict_words   — more dictionary matches breaks token-count ties
    //   4. freq_score   — higher cumulative TNC frequency as the final tiebreaker

    /// Spell out a score field by field.
    fn score(neg_unknowns: i32, neg_tokens: i32, dict_words: i32, freq_score: u64) -> DpScore {
        DpScore {
            neg_unknowns,
            neg_tokens,
            dict_words,
            freq_score,
        }
    }

    #[test]
    fn dp_score_fewer_unknowns_is_primary() {
        // A path without unknowns outranks a path with one.
        let clean = DpScore::ZERO;
        let with_unknown = clean.unknown_edge();
        assert!(clean > with_unknown);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_more_dict_words() {
        // เดินทาง as one token (1 match) must beat เดิน + ทาง (2 tokens, 2 matches).
        let compound = DpScore::ZERO.dict_edge(0);
        let split = compound.dict_edge(0);
        assert!(compound > split);
    }

    #[test]
    fn dp_score_higher_freq_breaks_token_tie() {
        // Equal unknowns, tokens and matches: the more frequent word wins.
        let rare = DpScore::ZERO.dict_edge(10);
        let common = DpScore::ZERO.dict_edge(100);
        assert!(common > rare);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_higher_freq() {
        // Token count is compared before frequency.
        let high_freq_more_tokens = score(0, -2, 1, 200);
        let low_freq_fewer_tokens = score(0, -1, 1, 100);
        assert!(low_freq_fewer_tokens > high_freq_more_tokens);
    }

    #[test]
    fn dp_score_more_dict_words_breaks_token_tie() {
        // Equal unknowns and token count: more dictionary matches wins.
        let fewer_dict = score(0, -2, 1, 0);
        let more_dict = score(0, -2, 2, 0);
        assert!(more_dict > fewer_dict);
    }

    #[test]
    fn dict_edge_accumulates_freq_score() {
        let first = DpScore::ZERO.dict_edge(50);
        assert_eq!(first.freq_score, 50);
        let second = first.dict_edge(30);
        assert_eq!(second.freq_score, 80);
    }

    #[test]
    fn dict_edge_increments_dict_words_and_neg_tokens() {
        // One dictionary edge: +1 match, one more token (stored negated),
        // unknown count untouched.
        let s = DpScore::ZERO.dict_edge(0);
        assert_eq!((s.dict_words, s.neg_tokens, s.neg_unknowns), (1, -1, 0));
    }

    #[test]
    fn unknown_edge_increments_neg_unknowns_only() {
        // Despite the name, an unknown edge also counts as a token; only the
        // dictionary-match count and the frequency stay at zero.
        let s = DpScore::ZERO.unknown_edge();
        assert_eq!((s.neg_unknowns, s.neg_tokens), (-1, -1));
        assert_eq!(s.dict_words, 0);
        assert_eq!(s.freq_score, 0);
    }

    #[test]
    fn unknown_edge_does_not_contribute_freq() {
        let twice = DpScore::ZERO.unknown_edge().unknown_edge();
        assert_eq!(twice.freq_score, 0);
    }

    // ── char_span invariants ──────────────────────────────────────────────────

    #[test]
    fn char_span_len_equals_char_count() {
        for t in tok().segment("กินข้าวกับปลา").iter() {
            let span_len = t.char_span.end - t.char_span.start;
            assert_eq!(
                span_len,
                t.text.chars().count(),
                "char_span length mismatch for {:?}",
                t.text
            );
        }
    }

    #[test]
    fn char_spans_are_contiguous() {
        let tokens = keep_ws_tok().segment("กินข้าว 100 hello");
        for pair in tokens.windows(2) {
            let (left, right) = (&pair[0], &pair[1]);
            assert_eq!(
                left.char_span.end, right.char_span.start,
                "char_span gap between {:?} and {:?}",
                left.text, right.text
            );
        }
    }

    #[test]
    fn char_span_for_mixed_script() {
        // ธนาคาร is 6 chars, 100 is 3 chars, แห่ง is 4 chars.
        let tokens = tok().segment("ธนาคาร100แห่ง");
        let (bank, number, places) = (&tokens[0], &tokens[1], &tokens[2]);
        assert_eq!(bank.char_span, 0..6);
        assert_eq!(number.char_span, 6..9);
        assert_eq!(places.char_span, 9..13);
    }

    #[test]
    fn char_span_accounts_for_multibyte_chars() {
        // Every Thai code point takes 3 bytes but counts as 1 char, so กิน
        // spans bytes 0..9 and chars 0..3.
        let tokens = tok().segment("กิน");
        let first = &tokens[0];
        assert_eq!(first.span, 0..9);
        assert_eq!(first.char_span, 0..3);
    }

    #[test]
    fn char_span_emoji_is_single_char() {
        // 😀 is a single char encoded in 4 bytes.
        let tokens = tok().segment("😀");
        let first = &tokens[0];
        assert_eq!(first.char_len(), 1);
        assert_eq!(first.byte_len(), 4);
    }

    // ── edge cases ────────────────────────────────────────────────────────────

    #[test]
    fn single_thai_char() {
        let tokens = tok().segment("ก");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "ก");
    }

    #[test]
    fn sawasdee_khao_lok() {
        let input = "สวัสดีชาวโลก";
        let tokens = tok().segment(input);
        assert_eq!(joined(&tokens), input);
    }
}
876}