kham_core/segmenter.rs
//! DAG-based maximal matching segmenter (newmm algorithm).
//!
//! The segmenter builds a directed acyclic graph (DAG) of candidate words
//! over the input text, using TCC boundaries as the candidate split points,
//! then finds the path with the fewest unknown tokens, breaking ties by
//! fewer tokens, more dictionary matches, and higher cumulative frequency.
//!
//! ## Pipeline
//!
//! ```text
//! raw text
//!   │
//!   ▼ (optional) Tokenizer::normalize() ← fixes tone-mark dedup + Sara Am composition
//!   │
//!   ▼ pre_tokenize()
//! [Thai span] [Number span] [Latin span] …
//!   │
//!   ▼ (Thai spans only) tcc_boundaries()
//! TCC boundary positions: [0, b1, b2, …, len]
//!   │
//!   ▼ DP over boundary indices
//! path of (start, end) pairs with the fewest unknown tokens
//!   │
//!   ▼
//! Vec<Token<'_>>
//! ```
//!
//! ## Normalization and zero-copy
//!
//! [`Tokenizer::segment`] is zero-copy: every [`Token`] borrows directly from
//! the `&str` you pass in. This means `segment` cannot normalize the text
//! internally: normalization may reorder or remove characters, producing a
//! new allocation whose byte offsets no longer line up with the input.
//!
//! For input that may contain out-of-order vowels (สระลอย), stacked tone
//! marks, or decomposed Sara Am, use the two-step pattern:
//!
//! ```rust
//! use kham_core::Tokenizer;
//!
//! let tok = Tokenizer::new();
//! let normalized = tok.normalize("กเินข้าว"); // fix any encoding issues
//! let tokens = tok.segment(&normalized);      // tokens borrow `normalized`
//! ```

use alloc::vec;
use alloc::vec::Vec;

use crate::dict::{builtin_dict, Dict, BUILTIN_WORDS};
#[cfg(feature = "std")]
use crate::error::KhamError;
use crate::freq::FreqMap;
use crate::normalizer;
use crate::pre_tokenizer::pre_tokenize;
use crate::tcc::tcc_boundaries;
use crate::token::{Token, TokenKind};

/// High-level tokenizer. Holds a compiled dictionary and segmentation options.
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::new();
/// let tokens = tok.segment("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct Tokenizer {
    dict: Dict,
    freq: FreqMap,
    keep_whitespace: bool,
}

impl Tokenizer {
    /// Create a tokenizer with the built-in dictionary and TNC frequency table.
    pub fn new() -> Self {
        Self {
            dict: builtin_dict(),
            freq: FreqMap::builtin(),
            keep_whitespace: false,
        }
    }

    /// Normalize Thai text into canonical form.
    ///
    /// This is a convenience wrapper around [`normalizer::normalize`].
    /// Because [`segment`] is zero-copy, normalization must happen **before**
    /// segmentation. The caller owns the returned [`alloc::string::String`]
    /// and can then borrow it for [`segment`]:
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::new();
    /// // Input with decomposed Sara Am (nikhahit + sara aa)
    /// let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ้ + อํ + อา
    /// let normalized = tok.normalize(raw);   // น้ำ composed, no dedup needed here
    /// let tokens = tok.segment(&normalized); // tokens borrow `normalized`
    /// assert!(!tokens.is_empty());
    /// ```
    ///
    /// [`segment`]: Tokenizer::segment
    pub fn normalize(&self, text: &str) -> alloc::string::String {
        normalizer::normalize(text)
    }

    /// Return a [`TokenizerBuilder`] for custom configuration.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// // Use built-in dict (no extra words needed here)
    /// let tok = Tokenizer::builder().build();
    /// let tokens = tok.segment("สวัสดีชาวโลก");
    /// assert!(!tokens.is_empty());
    /// ```
    pub fn builder() -> TokenizerBuilder {
        TokenizerBuilder::default()
    }

    /// Segment `text` into tokens.
    ///
    /// Returns a `Vec<Token<'_>>` where every token's `text` is a
    /// zero-copy sub-slice of `text`.
    ///
    /// Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass
    /// through unchanged. Thai spans are segmented with the newmm DAG
    /// algorithm constrained to TCC boundaries.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::new();
    /// // Mixed Thai + number + Thai — number token lands at index 1
    /// let tokens = tok.segment("ธนาคาร100แห่ง");
    /// assert_eq!(tokens[1].text, "100");
    /// assert_eq!(tokens[1].kind, TokenKind::Number);
    /// ```
    ///
    /// For input without whitespace, joining all token texts reconstructs the
    /// original string (whitespace tokens are dropped by default, so any
    /// whitespace in the input would be missing from the joined result):
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::new();
    /// let text = "กินข้าวกับปลา";
    /// let tokens = tok.segment(text);
    /// let rebuilt: String = tokens.iter().map(|t| t.text).collect();
    /// assert_eq!(rebuilt, text);
    /// ```
    ///
    /// Every token carries both byte and char offsets into the original string:
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::new();
    /// let text = "ธนาคาร100แห่ง";
    /// let tokens = tok.segment(text);
    /// for t in &tokens {
    ///     // Byte span: valid UTF-8 slice
    ///     assert_eq!(&text[t.span.clone()], t.text);
    ///     // Char span: length matches Unicode scalar count
    ///     assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
    /// }
    /// ```
    pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>> {
        if text.is_empty() {
            return Vec::new();
        }

        // Split into script-homogeneous spans. Non-Thai spans pass through;
        // Thai spans go through the newmm DAG segmenter.
        // Call normalize() first if the input may contain out-of-order
        // vowels, stacked tone marks, or decomposed Sara Am.
        let pre_tokens = pre_tokenize(text);

        let mut result: Vec<Token<'t>> = Vec::with_capacity(pre_tokens.len() * 2);

        for token in pre_tokens {
            match token.kind {
                TokenKind::Thai => {
                    segment_thai(&self.dict, &self.freq, text, token.span, &mut result);
                }
                TokenKind::Whitespace if !self.keep_whitespace => {
                    // Discard whitespace tokens unless keep_whitespace is set.
                }
                _ => {
                    result.push(token);
                }
            }
        }

        result
    }
}

// ---------------------------------------------------------------------------
// newmm DAG segmentation — Thai spans only
// ---------------------------------------------------------------------------

/// Lexicographic DP score for a TCC boundary position.
///
/// Fields are ordered so that the derived `Ord` naturally expresses the
/// newmm preference:
/// 1. Minimise unknowns (fewer unknowns → `neg_unknowns` less negative → greater).
/// 2. Minimise total token count (prefer longer compounds over split components).
/// 3. Maximise dictionary matches.
/// 4. Maximise cumulative TNC frequency as the final tiebreaker.
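///
/// Since `Ord` is derived, comparison is lexicographic in field declaration
/// order. Two illustrative comparisons, written as
/// `(neg_unknowns, neg_tokens, dict_words, freq_score)` tuples:
///
/// ```text
/// (0, -3, 3, 10) > (-1, -1, 1, 999)   // fewer unknowns wins outright
/// (0, -1, 1, 10) > (0, -2, 2, 999)    // on a tie, fewer tokens wins
/// ```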
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DpScore {
    neg_unknowns: i32,
    neg_tokens: i32,
    dict_words: i32,
    freq_score: u64,
}

impl DpScore {
    const ZERO: Self = Self {
        neg_unknowns: 0,
        neg_tokens: 0,
        dict_words: 0,
        freq_score: 0,
    };

    fn dict_edge(self, freq: u32) -> Self {
        Self {
            dict_words: self.dict_words + 1,
            freq_score: self.freq_score + freq as u64,
            neg_tokens: self.neg_tokens - 1,
            ..self
        }
    }

    fn unknown_edge(self) -> Self {
        Self {
            neg_unknowns: self.neg_unknowns - 1,
            neg_tokens: self.neg_tokens - 1,
            ..self
        }
    }
}

/// Output of the forward DP pass.
struct DpTable {
    /// Predecessor boundary index for backtracking.
    from: Vec<usize>,
    /// Whether the incoming edge at index `i` was a dictionary match.
    is_dict: Vec<bool>,
}

/// Forward DP over TCC boundary indices for a single Thai slice.
///
/// `bounds` must be the output of [`tcc_boundaries`] for `slice`.
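///
/// Schematically, the relaxation performed at each reachable boundary `i`
/// (a sketch of the loop below, not extra API):
///
/// ```text
/// for every dict prefix p of slice[bounds[i]..] ending on a boundary j:
///     best[j] = max(best[j], best[i].dict_edge(freq(p)))
/// best[i + 1] = max(best[i + 1], best[i].unknown_edge())   // one-TCC fallback
/// ```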
fn forward_dp(dict: &Dict, freqs: &FreqMap, slice: &str, bounds: &[usize]) -> DpTable {
    let nb = bounds.len();
    let mut best: Vec<Option<DpScore>> = vec![None; nb];
    let mut from = vec![0usize; nb];
    let mut is_dict = vec![false; nb];

    best[0] = Some(DpScore::ZERO);

    for i in 0..nb - 1 {
        let score = match best[i] {
            Some(s) => s,
            None => continue,
        };
        let pos = bounds[i];
        let remaining = &slice[pos..];

        // Dictionary edges — all prefixes, not just the longest, so the DP
        // can make a globally optimal choice rather than a greedy one.
        for prefix in dict.prefixes(remaining) {
            let end_pos = pos + prefix.len();
            if let Ok(j) = bounds.binary_search(&end_pos) {
                let freq = freqs.get(prefix);
                let candidate = Some(score.dict_edge(freq));
                if candidate > best[j] {
                    best[j] = candidate;
                    from[j] = i;
                    is_dict[j] = true;
                }
            }
        }

        // Fallback edge: advance one TCC as an unknown token.
        let j = i + 1;
        let candidate = Some(score.unknown_edge());
        if candidate > best[j] {
            best[j] = candidate;
            from[j] = i;
            is_dict[j] = false;
        }
    }

    DpTable { from, is_dict }
}

/// Reconstruct the winning boundary-index path by following `from` pointers
/// from the last index back to 0, then reversing.
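///
/// For example, `from = [0, 0, 0, 1]` (a dictionary edge from boundary 1 to
/// boundary 3 won at index 3) backtracks 3 → 1 → 0 and yields `[0, 1, 3]`.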
fn backtrack_path(from: &[usize]) -> Vec<usize> {
    let nb = from.len();
    let mut path = Vec::with_capacity(nb);
    let mut cur = nb - 1;
    loop {
        path.push(cur);
        if cur == 0 {
            break;
        }
        cur = from[cur];
    }
    path.reverse();
    path
}

/// Segment a single Thai span using the newmm DAG algorithm and append tokens
/// to `out`.
///
/// Steps: TCC boundaries → forward DP → backtrack → emit tokens.
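///
/// Each consecutive pair on the winning path becomes one token: a path over
/// boundary indices `[0, 2, 3]` emits `bounds[0]..bounds[2]` and
/// `bounds[2]..bounds[3]`, shifted by `span.start` into `text` coordinates.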
fn segment_thai<'t>(
    dict: &Dict,
    freqs: &FreqMap,
    text: &'t str,
    span: core::ops::Range<usize>,
    out: &mut Vec<Token<'t>>,
) {
    let slice = &text[span.start..span.end];
    let bounds = tcc_boundaries(slice);

    if bounds.len() <= 1 {
        return;
    }

    let dp = forward_dp(dict, freqs, slice, &bounds);
    let path = backtrack_path(&dp.from);

    // Char offset of span.start — computed once, then incremented per token.
    let mut char_cursor = text[..span.start].chars().count();

    for w in path.windows(2) {
        let start_byte = span.start + bounds[w[0]];
        let end_byte = span.start + bounds[w[1]];
        let token_text = &text[start_byte..end_byte];
        let char_start = char_cursor;
        char_cursor += token_text.chars().count();
        let kind = if dp.is_dict[w[1]] {
            TokenKind::Thai
        } else {
            TokenKind::Unknown
        };
        out.push(Token::new(
            token_text,
            start_byte..end_byte,
            char_start..char_cursor,
            kind,
        ));
    }
}

// ---------------------------------------------------------------------------
// Tokenizer trait impls
// ---------------------------------------------------------------------------

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// TokenizerBuilder
// ---------------------------------------------------------------------------

/// Builder for [`Tokenizer`].
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::builder()
///     .keep_whitespace(true)
///     .build();
/// ```
#[derive(Debug, Default)]
pub struct TokenizerBuilder {
    dict_words: Option<alloc::string::String>,
    keep_whitespace: bool,
}

impl TokenizerBuilder {
    /// Load an additional word list from a string (newline-separated words).
    ///
    /// Words are merged with the built-in dictionary.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::builder()
    ///     .dict_words("ปัญญาประดิษฐ์\n")
    ///     .build();
    /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
    /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
    /// ```
    pub fn dict_words(mut self, words: &str) -> Self {
        self.dict_words = Some(alloc::string::String::from(words));
        self
    }

    /// Configure whether whitespace tokens are included in the output.
    ///
    /// Default: `false` (whitespace is discarded).
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::builder().keep_whitespace(true).build();
    /// let tokens = tok.segment("กิน ข้าว");
    /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    /// // Byte spans are contiguous when whitespace is kept
    /// for w in tokens.windows(2) {
    ///     assert_eq!(w[0].span.end, w[1].span.start);
    /// }
    /// ```
    pub fn keep_whitespace(mut self, keep: bool) -> Self {
        self.keep_whitespace = keep;
        self
    }

    /// Consume the builder and return a configured [`Tokenizer`].
    pub fn build(self) -> Tokenizer {
        let dict = if let Some(extra) = &self.dict_words {
            // Custom words: merge with built-in word list and rebuild.
            let mut combined = alloc::string::String::from(BUILTIN_WORDS);
            combined.push('\n');
            combined.push_str(extra);
            Dict::from_word_list(&combined)
        } else {
            // Default path: load from pre-compiled binary — O(S) copy.
            builtin_dict()
        };
        Tokenizer {
            dict,
            freq: FreqMap::builtin(),
            keep_whitespace: self.keep_whitespace,
        }
    }

    /// Try to load a custom word list from a file path.
    ///
    /// Only available when the `std` feature is enabled.
    ///
    /// # Errors
    ///
    /// Returns [`KhamError::DictLoadError`] if the file cannot be read.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::builder()
    ///     .dict_file("my_words.txt")
    ///     .expect("failed to load dict")
    ///     .build();
    /// ```
    #[cfg(feature = "std")]
    pub fn dict_file(self, path: &str) -> Result<Self, KhamError> {
        extern crate std;
        let content = std::fs::read_to_string(path)
            .map_err(|e| KhamError::DictLoadError(alloc::format!("{path}: {e}")))?;
        Ok(self.dict_words(&content))
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    fn tok() -> Tokenizer {
        Tokenizer::new()
    }

    // ── basic smoke tests ────────────────────────────────────────────────────

    #[test]
    fn empty_input() {
        assert!(tok().segment("").is_empty());
    }

    #[test]
    fn pure_latin_passthrough() {
        let tokens = tok().segment("hello");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].kind, TokenKind::Latin);
    }

    #[test]
    fn pure_number_passthrough() {
        let tokens = tok().segment("12345");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "12345");
        assert_eq!(tokens[0].kind, TokenKind::Number);
    }

    #[test]
    fn whitespace_dropped_by_default() {
        let tokens = tok().segment("กิน ข้าว");
        for t in &tokens {
            assert_ne!(t.kind, TokenKind::Whitespace);
        }
    }

    #[test]
    fn whitespace_kept_when_requested() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กิน ข้าว");
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    }

    // ── Thai segmentation ────────────────────────────────────────────────────

    #[test]
    fn gin_khao_gap_pla() {
        // "กินข้าวกับปลา" — all words must be in the built-in dict
        let tokens = tok().segment("กินข้าวกับปลา");
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        // Must segment into at least 2 tokens (dict has กิน, ข้าว, กับ, ปลา)
        assert!(words.len() >= 2, "expected multiple words, got {words:?}");
        // Reconstructing must yield the original string
        assert_eq!(words.join(""), "กินข้าวกับปลา");
    }

    #[test]
    fn mixed_thai_number_thai() {
        // Mixed-script input: Thai + digits + Thai
        let tokens = tok().segment("ธนาคาร100แห่ง");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "ธนาคาร100แห่ง");
        // "100" must survive as a Number token
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert_eq!(num.unwrap().text, "100");
    }

    #[test]
    fn mixed_thai_latin() {
        let tokens = tok().segment("สวัสดี hello");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        // Whitespace dropped by default
        assert_eq!(rebuilt, "สวัสดีhello");
        assert!(tokens
            .iter()
            .any(|t| t.kind == TokenKind::Latin && t.text == "hello"));
    }

    // ── span / byte-offset invariants ────────────────────────────────────────

    #[test]
    fn spans_cover_input_excluding_whitespace() {
        let text = "กินข้าว123hello";
        let tokens = tok().segment(text);
        // Every span must be a valid UTF-8 slice of `text`.
        for t in &tokens {
            assert_eq!(&text[t.span.clone()], t.text);
            assert!(text.is_char_boundary(t.span.start));
            assert!(text.is_char_boundary(t.span.end));
        }
    }

    #[test]
    fn adjacent_spans_are_contiguous() {
        let text = "กินข้าวกับปลา";
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment(text);
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].span.end, w[1].span.start,
                "gap between {:?} and {:?}",
                w[0], w[1]
            );
        }
    }

    #[test]
    fn no_empty_tokens() {
        let tokens = tok().segment("กินข้าวกับปลา 100 hello!");
        for t in &tokens {
            assert!(!t.text.is_empty());
        }
    }

    // ── custom dictionary ─────────────────────────────────────────────────────

    #[test]
    fn custom_dict_word_is_matched() {
        // Use a nonsense word that is not in the built-in dictionary and cannot
        // be decomposed into subwords — ensures the custom dict is actually used.
        let tok = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
        let tokens = tok.segment("กขคงจฉ");
        let thai: Vec<&str> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Thai)
            .map(|t| t.text)
            .collect();
        assert!(thai.contains(&"กขคงจฉ"), "got: {thai:?}");
    }

    // ── normalize then segment ────────────────────────────────────────────────

    #[test]
    fn normalize_deduplicates_tone_before_segment() {
        // กินข้าว with a doubled tone mark on ข — normalize fixes it, segment proceeds.
        let t = tok();
        // Insert a doubled tone on ข: ข + อ้ + อ้ (ข้้)
        let raw = "กิน\u{0E02}\u{0E49}\u{0E49}าว"; // กิน + ข้้ + าว
        let normalized = t.normalize(raw);
        let tokens = t.segment(&normalized);
        assert!(!tokens.is_empty());
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn normalize_clean_input_is_identity() {
        // normalize() on already-clean text should not change it.
        let t = tok();
        let clean = "กินข้าวกับปลา";
        assert_eq!(t.normalize(clean), clean);
    }

    #[test]
    fn segment_without_normalize_on_clean_input() {
        // segment() alone is sufficient when input is already canonical.
        let tokens = tok().segment("กินข้าวกับปลา");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "กินข้าวกับปลา");
    }

    // ── DpScore ordering ──────────────────────────────────────────────────────
    //
    // The score is a 4-field lexicographic key:
    //   1. neg_unknowns — fewer unknowns is strictly better
    //   2. neg_tokens   — fewer tokens (prefer longer compounds over split components)
    //   3. dict_words   — more dictionary matches breaks token-count ties
    //   4. freq_score   — higher cumulative TNC frequency as the final tiebreaker

    #[test]
    fn dp_score_fewer_unknowns_is_primary() {
        // An unknown edge lowers the primary field, so the unknown-free path wins.
        let no_unknown = DpScore::ZERO;
        let one_unknown = DpScore::ZERO.unknown_edge();
        assert!(no_unknown > one_unknown);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_more_dict_words() {
        // Fewer tokens wins over more dict matches: เดินทาง (1 token, 1 match)
        // beats เดิน+ทาง (2 tokens, 2 matches).
        let compound = DpScore::ZERO.dict_edge(0); // 1 token, 1 dict
        let split = DpScore::ZERO.dict_edge(0).dict_edge(0); // 2 tokens, 2 dict
        assert!(compound > split);
    }

    #[test]
    fn dp_score_higher_freq_breaks_token_tie() {
        // Same unknowns and token count; higher TNC freq wins.
        let low_freq = DpScore::ZERO.dict_edge(10);
        let high_freq = DpScore::ZERO.dict_edge(100);
        assert!(high_freq > low_freq);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_higher_freq() {
        // Fewer tokens wins even when the competing path has higher TNC frequency.
        let high_freq_more_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 200,
        };
        let low_freq_fewer_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -1,
            dict_words: 1,
            freq_score: 100,
        };
        assert!(low_freq_fewer_tokens > high_freq_more_tokens);
    }

    #[test]
    fn dp_score_more_dict_words_breaks_token_tie() {
        // Same unknowns and token count; more dict matches wins.
        let fewer_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 0,
        };
        let more_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 2,
            freq_score: 0,
        };
        assert!(more_dict > fewer_dict);
    }

    #[test]
    fn dict_edge_accumulates_freq_score() {
        let after_one = DpScore::ZERO.dict_edge(50);
        let after_two = after_one.dict_edge(30);
        assert_eq!(after_one.freq_score, 50);
        assert_eq!(after_two.freq_score, 80);
    }

    #[test]
    fn dict_edge_updates_dict_words_and_neg_tokens() {
        let s = DpScore::ZERO.dict_edge(0);
        assert_eq!(s.dict_words, 1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.neg_unknowns, 0);
    }

    #[test]
    fn unknown_edge_updates_neg_unknowns_and_neg_tokens() {
        let s = DpScore::ZERO.unknown_edge();
        assert_eq!(s.neg_unknowns, -1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.dict_words, 0);
        assert_eq!(s.freq_score, 0);
    }

    #[test]
    fn unknown_edge_does_not_contribute_freq() {
        let s = DpScore::ZERO.unknown_edge().unknown_edge();
        assert_eq!(s.freq_score, 0);
    }
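
    // The one-TCC fallback edge makes every boundary index reachable during
    // the forward pass, so backtracking must always recover a path from
    // boundary 0 to the last boundary, whatever the dictionary matched.
    // A sketch-level check of that invariant, reusing the nonsense string
    // from custom_dict_word_is_matched so the fallback edges are exercised.
    #[test]
    fn backtrack_path_spans_all_boundaries() {
        let dict = builtin_dict();
        let freqs = FreqMap::builtin();
        let slice = "กขคงจฉ";
        let bounds = tcc_boundaries(slice);
        let dp = forward_dp(&dict, &freqs, slice, &bounds);
        let path = backtrack_path(&dp.from);
        assert_eq!(*path.first().unwrap(), 0);
        assert_eq!(*path.last().unwrap(), bounds.len() - 1);
    }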

    // ── char_span invariants ──────────────────────────────────────────────────

    #[test]
    fn char_span_len_equals_char_count() {
        let tokens = tok().segment("กินข้าวกับปลา");
        for t in &tokens {
            assert_eq!(
                t.char_span.end - t.char_span.start,
                t.text.chars().count(),
                "char_span length mismatch for {:?}",
                t.text
            );
        }
    }

    #[test]
    fn char_spans_are_contiguous() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กินข้าว 100 hello");
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].char_span.end, w[1].char_span.start,
                "char_span gap between {:?} and {:?}",
                w[0].text, w[1].text
            );
        }
    }

    #[test]
    fn char_span_for_mixed_script() {
        // "ธนาคาร100แห่ง": ธนาคาร = 6 chars, 100 = 3 chars, แห่ง = 4 chars
        let tokens = tok().segment("ธนาคาร100แห่ง");
        assert_eq!(tokens[0].char_span, 0..6);
        assert_eq!(tokens[1].char_span, 6..9);
        assert_eq!(tokens[2].char_span, 9..13);
    }

    #[test]
    fn char_span_accounts_for_multibyte_chars() {
        // Each Thai codepoint is 3 bytes but 1 char.
        // "กิน" = 3 chars (9 bytes); char_span should be 0..3, span 0..9.
        let tokens = tok().segment("กิน");
        assert_eq!(tokens[0].span, 0..9);
        assert_eq!(tokens[0].char_span, 0..3);
    }

    #[test]
    fn char_span_emoji_is_single_char() {
        // 😀 = 1 char, 4 bytes — verify char_span counts it as 1.
        let tokens = tok().segment("😀");
        assert_eq!(tokens[0].char_len(), 1);
        assert_eq!(tokens[0].byte_len(), 4);
    }

    // ── edge cases ────────────────────────────────────────────────────────────

    #[test]
    fn single_thai_char() {
        let tokens = tok().segment("ก");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "ก");
    }

    #[test]
    fn sawasdee_chao_lok() {
        let tokens = tok().segment("สวัสดีชาวโลก");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "สวัสดีชาวโลก");
    }
}
827}