kham_core/
dict.rs

1//! Dictionary backed by a Double-Array Trie (DARTS).
2//!
3//! ## Structure
4//!
5//! A DARTS encodes a trie in two parallel arrays:
6//!
7//! ```text
8//! base[s]  — offset used to compute the next state from state s
9//! check[t] — parent state of state t; -1 means "unused"
10//!
11//! Transition:  next = base[s] + byte + 1
12//! Valid iff:   check[next] == s
13//! ```
14//!
15//! Adding 1 to every byte value ensures that even byte 0 (used as the
16//! end-of-word sentinel) maps to index ≥ 1, so index 0 is never a child
17//! and `check[0]` can serve as the root sentinel.
18//!
19//! ## Construction
20//!
21//! 1. Parse the word list into a regular trie (adjacency via `BTreeMap`).
22//! 2. BFS over trie nodes. For each node, find the smallest `base` offset
23//!    such that `base + byte + 1` is free for every child byte.
24//! 3. Stamp `check[base + byte + 1] = current_darts_state` and record the
25//!    mapping from trie-node-id → darts-state-id for children.
26//!
27//! ## Lookup
28//!
29//! - `contains(word)`: follow bytes then follow the NUL sentinel.
30//! - `prefixes(text)`: walk bytes, emit a match whenever NUL sentinel is
31//!   reachable at the current state.
32
33use alloc::collections::VecDeque;
34use alloc::vec;
35use alloc::vec::Vec;
36
37// ---------------------------------------------------------------------------
38// Constants
39// ---------------------------------------------------------------------------
40
41/// Marks an unused slot in the `check` array.
42const UNUSED: i32 = -1;
43
44/// Sentinel byte appended to every inserted word.
45/// NUL (0x00) never appears in valid UTF-8 text, so it's a safe sentinel.
46const TERM: u8 = 0;
47
48// ---------------------------------------------------------------------------
49// Intermediate trie (used only during construction)
50// ---------------------------------------------------------------------------
51
52struct TrieNode {
53    /// Sorted (byte, child-node-index) pairs.
54    ///
55    /// A sorted `Vec` is faster than `BTreeMap` for the small child-counts
56    /// typical of trie nodes: no heap allocation per node, better cache
57    /// locality, and linear scan outperforms tree traversal for ≤ ~16 items.
58    children: Vec<(u8, usize)>,
59}
60
61impl TrieNode {
62    fn new() -> Self {
63        Self {
64            children: Vec::new(),
65        }
66    }
67
68    /// Find the index into `children` whose byte equals `b`, or `Err(pos)`
69    /// where `pos` is the insertion point to keep `children` sorted.
70    #[inline]
71    fn find(&self, b: u8) -> Result<usize, usize> {
72        self.children.binary_search_by_key(&b, |&(k, _)| k)
73    }
74
75    /// Return the child node index for byte `b`, or `None`.
76    #[inline]
77    fn get(&self, b: u8) -> Option<usize> {
78        self.find(b).ok().map(|i| self.children[i].1)
79    }
80
81    /// Insert `(b, child_id)` maintaining sorted order.
82    /// Panics in debug mode if `b` is already present.
83    #[inline]
84    fn insert(&mut self, b: u8, child_id: usize) {
85        match self.find(b) {
86            Ok(_) => debug_assert!(false, "duplicate byte in trie node"),
87            Err(pos) => self.children.insert(pos, (b, child_id)),
88        }
89    }
90}
91
92/// Build a plain trie from a word list and return the nodes.
93fn build_trie(text: &str) -> Vec<TrieNode> {
94    let mut nodes: Vec<TrieNode> = vec![TrieNode::new()]; // nodes[0] = root
95
96    for line in text.lines() {
97        let word = line.trim();
98        if word.is_empty() || word.starts_with('#') {
99            continue;
100        }
101        let mut state = 0usize;
102        // Append TERM so the terminal is just another trie node.
103        for b in word.bytes().chain(core::iter::once(TERM)) {
104            if let Some(next) = nodes[state].get(b) {
105                state = next;
106            } else {
107                let next_id = nodes.len();
108                nodes.push(TrieNode::new());
109                nodes[state].insert(b, next_id);
110                state = next_id;
111            }
112        }
113    }
114
115    nodes
116}
117
118// ---------------------------------------------------------------------------
119// Bitmap free-list
120// ---------------------------------------------------------------------------
121
122/// Bitmap tracking which slots in `check` are free (bit = 1 → free).
123///
124/// Provides `next_free_from(pos)` in O(n/64) by scanning 64 bits at a time
125/// with `u64::trailing_zeros`. This is the critical speedup for large word
126/// lists whose trie nodes have wide child-byte spreads (e.g., ASCII 0x20
127/// mixed with Thai 0xE0), where the naive one-slot-at-a-time scan is O(n²).
128struct FreeBitmap {
129    words: Vec<u64>,
130    /// Number of real slots tracked. Bits at indices `cap..` are always 0.
131    cap: usize,
132}
133
134impl FreeBitmap {
135    /// Create a bitmap for slots `0..cap`. Slot 0 is occupied (root sentinel);
136    /// slots `1..cap` are free. Bits above `cap` in the last word are 0.
137    fn new(cap: usize) -> Self {
138        let n_words = cap.div_ceil(64);
139        let mut words = vec![!0u64; n_words];
140        // Clear phantom bits in the last word.
141        let used = cap % 64;
142        if used > 0 && n_words > 0 {
143            words[n_words - 1] = (1u64 << used) - 1;
144        }
145        // Slot 0 occupied (root sentinel).
146        if n_words > 0 {
147            words[0] &= !1u64;
148        }
149        FreeBitmap { words, cap }
150    }
151
152    /// Mark `slot` as occupied (clear its bit).
153    #[inline]
154    fn occupy(&mut self, slot: usize) {
155        debug_assert!(slot < self.cap);
156        self.words[slot / 64] &= !(1u64 << (slot % 64));
157    }
158
159    /// Extend the bitmap to cover `new_cap` slots.
160    ///
161    /// - Old phantom bits (slots `old_cap .. old_n_words*64`) become real free slots.
162    /// - New phantom bits (slots `new_cap .. new_n_words*64`) are cleared.
163    fn grow(&mut self, new_cap: usize) {
164        debug_assert!(new_cap > self.cap);
165        let old_cap = self.cap;
166        let old_words = self.words.len();
167        let new_words = new_cap.div_ceil(64);
168
169        // Un-clear old phantom bits (they're now real free slots).
170        let old_used = old_cap % 64;
171        if old_used > 0 && old_words > 0 {
172            self.words[old_words - 1] |= !((1u64 << old_used) - 1);
173        }
174
175        // Extend with all-free words.
176        self.words.resize(new_words, !0u64);
177
178        // Clear phantom bits in the new last word.
179        let new_used = new_cap % 64;
180        if new_used > 0 && new_words > 0 {
181            self.words[new_words - 1] = (1u64 << new_used) - 1;
182        }
183
184        self.cap = new_cap;
185    }
186
187    /// Return the smallest free slot index `≥ start`, or `self.cap` if none.
188    /// Never returns an index ≥ `self.cap` (phantom bits are never set).
189    fn next_free_from(&self, start: usize) -> usize {
190        if start >= self.cap {
191            return self.cap;
192        }
193        let cap_word = self.cap.div_ceil(64);
194        let word_idx = start / 64;
195        let bit_idx = start % 64;
196
197        // Partial first word.
198        let partial = self.words[word_idx] >> bit_idx;
199        if partial != 0 {
200            return (word_idx * 64 + bit_idx + partial.trailing_zeros() as usize).min(self.cap);
201        }
202
203        // Full remaining words.
204        for i in (word_idx + 1)..cap_word {
205            if self.words[i] != 0 {
206                return (i * 64 + self.words[i].trailing_zeros() as usize).min(self.cap);
207            }
208        }
209
210        self.cap // all occupied
211    }
212}
213
214// ---------------------------------------------------------------------------
215// Base-allocation helper
216// ---------------------------------------------------------------------------
217
218/// Find the smallest offset `b ≥ 1` such that `check[b + byte + 1]` is free
219/// (either `UNUSED` or out-of-bounds) for every byte in `children`.
220///
221/// Uses the bitmap to jump over occupied regions in O(n/64) per jump instead
222/// of O(n). For nodes with wide child-byte spreads (ASCII 0x20 mixed with
223/// Thai 0xE0), this turns an O(n²) construction into near-linear time.
224fn find_base(check: &[i32], children: &[u8], bitmap: &FreeBitmap, min_free: usize) -> usize {
225    debug_assert!(!children.is_empty());
226    let min_child = children[0] as usize; // sorted ascending
227
228    // Start b so min_child lands at or after min_free.
229    let mut b = min_free.saturating_sub(min_child + 1).max(1);
230
231    'search: loop {
232        for &c in children {
233            let pos = b + c as usize + 1;
234            if pos < check.len() && check[pos] != UNUSED {
235                // Jump: advance b so this child lands on the next free slot.
236                let nf = bitmap.next_free_from(pos + 1);
237                b = nf.saturating_sub(c as usize + 1).max(b + 1);
238                continue 'search;
239            }
240        }
241        return b;
242    }
243}
244
245// ---------------------------------------------------------------------------
246// Public Dict type
247// ---------------------------------------------------------------------------
248
249/// Double-Array Trie dictionary.
250///
251/// Provides O(k) lookup where k is the number of **bytes** in the query.
252/// Memory layout is two `Vec<i32>` (base + check), cache-friendly.
253pub struct Dict {
254    base: Vec<i32>,
255    check: Vec<i32>,
256}
257
258impl Dict {
259    // ── Construction ─────────────────────────────────────────────────────────
260
261    /// Build a [`Dict`] from a newline-separated word list.
262    ///
263    /// Lines starting with `#` are treated as comments and skipped.
264    /// Blank lines are skipped. Words are stored as raw UTF-8 bytes.
265    ///
266    /// # Example
267    ///
268    /// ```rust
269    /// use kham_core::dict::Dict;
270    ///
271    /// let dict = Dict::from_word_list("กิน\nข้าว\nปลา\n");
272    /// assert!(dict.contains("กิน"));
273    /// assert!(dict.contains("ข้าว"));
274    /// assert!(!dict.contains("xyz"));
275    /// ```
276    pub fn from_word_list(text: &str) -> Self {
277        let trie = build_trie(text);
278        Self::from_trie(trie)
279    }
280
281    /// Deserialise a [`Dict`] from a pre-compiled DARTS binary blob.
282    ///
283    /// The binary blob is normally produced by `build.rs` at compile time and
284    /// embedded via [`include_bytes!`]. Loading from bytes bypasses the full
285    /// trie-construction pipeline, making it the fastest way to obtain a
286    /// ready-to-use dictionary.
287    ///
288    /// # Binary Format
289    ///
290    /// The blob begins with a fixed 16-byte header followed by the raw array
291    /// data. All multi-byte integers are **little-endian**.
292    ///
293    /// ```text
294    /// Offset  Size  Field        Description
295    /// ──────  ────  ───────────  ───────────────────────────────────────────
296    ///      0     4  magic        ASCII b"KDAM" — identifies the file type
297    ///      4     1  version      Format version; currently 0x01
298    ///      5     3  reserved     Must be zero; reserved for future flags
299    ///      8     4  base_len     Number of i32 elements in the base array
300    ///     12     4  check_len    Number of i32 elements in the check array
301    ///     16     —  base[]       base_len × 4 bytes, little-endian i32
302    ///     16 + base_len*4  —  check[]  check_len × 4 bytes, little-endian i32
303    /// ```
304    ///
305    /// # Performance
306    ///
307    /// `O(S)` where `S` is the total byte length of `data`. The function
308    /// performs one pass of `chunks_exact(4).map(i32::from_le_bytes)` over
309    /// each array, allocating exactly `base_len + check_len` i32 values.
310    /// Compare with [`from_word_list`], which is `O(W × K)` — proportional to
311    /// total word bytes — due to trie construction and base-offset search.
312    ///
313    /// | Method            | Complexity | Allocation   | Use when                       |
314    /// |-------------------|------------|--------------|--------------------------------|
315    /// | `from_bytes`      | O(S)       | 2 × `Vec<i32>` | loading pre-built binary blob  |
316    /// | `from_word_list`  | O(W × K)   | trie + DARTS   | building from a raw word list  |
317    ///
318    /// # Errors / Panics
319    ///
320    /// This function panics — rather than returning a `Result` — because
321    /// failures indicate a corrupted or stale build artifact, not a runtime
322    /// condition the caller can meaningfully recover from. A clean
323    /// `cargo build` always regenerates a valid `dict.bin`.
324    ///
325    /// | Condition                                | Panic message                    |
326    /// |------------------------------------------|----------------------------------|
327    /// | `data.len() < 16`                        | `"dict.bin too short"`           |
328    /// | First 4 bytes ≠ `b"KDAM"`               | `"dict.bin: bad magic"`          |
329    /// | Byte 4 ≠ `0x01`                          | `"dict.bin: unsupported version"`|
330    ///
331    /// # Example
332    ///
333    /// ```rust
334    /// use kham_core::dict::Dict;
335    ///
336    /// // Typically you embed a pre-built blob with include_bytes! and pass it here.
337    /// // This example constructs a minimal valid blob by hand for illustration.
338    /// let dict = kham_core::dict::builtin_dict();
339    /// assert!(dict.contains("กิน"));
340    /// assert!(dict.contains("ธนาคาร"));
341    /// ```
342    ///
343    /// For the common case of the built-in word list, prefer [`builtin_dict()`],
344    /// which calls this function with the compile-time-embedded blob.
345    ///
346    /// [`from_word_list`]: Dict::from_word_list
347    /// [`builtin_dict()`]: crate::dict::builtin_dict
348    pub fn from_bytes(data: &[u8]) -> Self {
349        const HDR: usize = 16; // 4 magic + 1 version + 3 reserved + 4 base_len + 4 check_len
350        assert!(data.len() >= HDR, "dict.bin too short");
351        assert_eq!(&data[0..4], b"KDAM", "dict.bin: bad magic");
352        assert_eq!(data[4], 0x01, "dict.bin: unsupported version");
353
354        let base_len = u32::from_le_bytes([data[8], data[9], data[10], data[11]]) as usize;
355        let check_len = u32::from_le_bytes([data[12], data[13], data[14], data[15]]) as usize;
356
357        let base_end = HDR + base_len * 4;
358        let check_end = base_end + check_len * 4;
359
360        let base: Vec<i32> = data[HDR..base_end]
361            .chunks_exact(4)
362            .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
363            .collect();
364        let check: Vec<i32> = data[base_end..check_end]
365            .chunks_exact(4)
366            .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
367            .collect();
368
369        Self { base, check }
370    }
371
372    fn from_trie(nodes: Vec<TrieNode>) -> Self {
373        let n = nodes.len();
374        // Initial capacity: trie nodes * average fanout, plus breathing room.
375        let cap = n * 2 + 512;
376        let mut base: Vec<i32> = vec![0; cap];
377        let mut check: Vec<i32> = vec![UNUSED; cap];
378
379        // check[0] = 0 marks the root as "in use" (its own parent = itself).
380        check[0] = 0;
381
382        // trie_to_darts[trie_node_id] = darts_state_index
383        let mut trie_to_darts: Vec<usize> = vec![0; n];
384        // Root trie node (0) → root darts state (0).
385
386        // Bitmap mirrors `check`: bit i is set iff check[i] == UNUSED.
387        // Kept in sync so find_base can jump over occupied runs in O(n/64).
388        let mut bitmap = FreeBitmap::new(cap);
389
390        // Smallest slot known to be free — advances monotonically.
391        let mut min_free: usize = 1;
392
393        let mut queue: VecDeque<usize> = VecDeque::new();
394        queue.push_back(0); // start BFS from root trie node
395
396        while let Some(trie_id) = queue.pop_front() {
397            let darts_id = trie_to_darts[trie_id];
398            let node = &nodes[trie_id];
399
400            if node.children.is_empty() {
401                continue; // leaf node — nothing to allocate
402            }
403
404            // Advance min_free to the next genuinely free slot.
405            while min_free < check.len() && check[min_free] != UNUSED {
406                min_free += 1;
407            }
408
409            // Collect sorted child bytes for base search.
410            // children is already sorted ascending by byte value.
411            let child_bytes: Vec<u8> = node.children.iter().map(|&(b, _)| b).collect();
412
413            // Find a collision-free base offset using the bitmap for fast jumps.
414            let b = find_base(&check, &child_bytes, &bitmap, min_free);
415            base[darts_id] = b as i32;
416
417            // Stamp each child into the arrays.
418            for &(byte, child_trie_id) in &node.children {
419                let child_darts = b + byte as usize + 1;
420
421                // Grow arrays and bitmap if needed.
422                if child_darts + 1 > check.len() {
423                    let new_cap = child_darts + 512;
424                    base.resize(new_cap, 0);
425                    check.resize(new_cap, UNUSED);
426                    bitmap.grow(new_cap);
427                }
428
429                check[child_darts] = darts_id as i32;
430                bitmap.occupy(child_darts);
431                trie_to_darts[child_trie_id] = child_darts;
432                queue.push_back(child_trie_id);
433            }
434        }
435
436        // Trim trailing unused slots to reduce memory footprint.
437        let last = check.iter().rposition(|&c| c != UNUSED).unwrap_or(0);
438        base.truncate(last + 1);
439        check.truncate(last + 1);
440
441        Self { base, check }
442    }
443
444    // ── Lookup primitives ────────────────────────────────────────────────────
445
446    /// Follow one transition from `state` on `byte`.
447    /// Returns `Some(next_state)` if valid, `None` otherwise.
448    #[inline]
449    fn goto(&self, state: usize, byte: u8) -> Option<usize> {
450        let next = (*self.base.get(state)?) + byte as i32 + 1;
451        if next <= 0 {
452            return None;
453        }
454        let next = next as usize;
455        if self.check.get(next).copied()? == state as i32 {
456            Some(next)
457        } else {
458            None
459        }
460    }
461
462    // ── Public API ───────────────────────────────────────────────────────────
463
464    /// Returns `true` if `word` is present in the dictionary.
465    ///
466    /// Lookup is O(n) where n is `word.len()` in bytes.
467    ///
468    /// # Example
469    ///
470    /// ```rust
471    /// use kham_core::dict::Dict;
472    ///
473    /// let dict = Dict::from_word_list("สวัสดี\nโลก\n");
474    /// assert!(dict.contains("สวัสดี"));
475    /// assert!(dict.contains("โลก"));
476    /// assert!(!dict.contains("สวัส")); // prefix only — not a word
477    /// assert!(!dict.contains(""));
478    /// ```
479    pub fn contains(&self, word: &str) -> bool {
480        if word.is_empty() {
481            return false;
482        }
483        let mut state = 0usize;
484        for b in word.bytes() {
485            match self.goto(state, b) {
486                Some(s) => state = s,
487                None => return false,
488            }
489        }
490        self.goto(state, TERM).is_some()
491    }
492
493    /// Return all substrings of `text` (anchored at byte 0) that are present
494    /// in the dictionary, ordered **longest first**.
495    ///
496    /// Only returns slices that are valid UTF-8 boundaries (which is always
497    /// guaranteed when the dictionary contains valid UTF-8 words).
498    ///
499    /// # Example
500    ///
501    /// ```rust
502    /// use kham_core::dict::Dict;
503    ///
504    /// let dict = Dict::from_word_list("กิน\nกิน\nกิน\nกินข้าว\n");
505    /// let p = dict.prefixes("กินข้าวกับปลา");
506    /// // Longest prefix is "กินข้าว", then "กิน"
507    /// assert_eq!(p[0], "กินข้าว");
508    /// assert_eq!(p[1], "กิน");
509    /// ```
510    pub fn prefixes<'t>(&self, text: &'t str) -> Vec<&'t str> {
511        let mut result = Vec::new();
512        let mut state = 0usize;
513        let bytes = text.as_bytes();
514
515        for i in 0..bytes.len() {
516            match self.goto(state, bytes[i]) {
517                Some(s) => {
518                    state = s;
519                    if self.goto(state, TERM).is_some() {
520                        // i is the index of the last byte; the prefix is text[0..i+1].
521                        // Since dictionary words are valid UTF-8, i+1 is always a
522                        // char boundary.
523                        debug_assert!(text.is_char_boundary(i + 1));
524                        result.push(&text[..i + 1]);
525                    }
526                }
527                None => break,
528            }
529        }
530
531        // Reverse so the longest match comes first.
532        result.reverse();
533        result
534    }
535
536    /// Number of allocated DARTS states (informational / testing).
537    #[inline]
538    pub fn state_count(&self) -> usize {
539        self.check.len()
540    }
541}
542
543// ---------------------------------------------------------------------------
544// Built-in word list and pre-compiled binary dictionary
545// ---------------------------------------------------------------------------
546
547/// The built-in CC0 Thai word list, embedded at compile time.
548///
549/// Pass to [`Dict::from_word_list`] to build the default dictionary, or use
550/// [`builtin_dict`] to load the pre-compiled binary form in O(S) time.
551pub static BUILTIN_WORDS: &str = include_str!("../data/words_th.txt");
552
553/// Pre-compiled binary DARTS blob produced by `build.rs` at compile time.
554///
555/// Format: 16-byte header (`b"KDAM"` magic, version, lengths) followed by
556/// `base[]` then `check[]` as little-endian `i32` values.
557static BUILTIN_DICT_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/dict.bin"));
558
559/// Return the built-in dictionary loaded from the pre-compiled binary blob.
560///
561/// This is O(S) — a simple byte copy of the pre-built DARTS arrays — versus
562/// O(W×K) for [`Dict::from_word_list`] which reconstructs the trie at runtime.
563/// Use this in [`crate::segmenter::Tokenizer`] and wherever the default dictionary is needed.
564///
565/// # Example
566///
567/// ```rust
568/// use kham_core::dict::builtin_dict;
569///
570/// let dict = builtin_dict();
571/// assert!(dict.contains("กิน"));
572/// assert!(dict.contains("ธนาคาร"));
573/// ```
574pub fn builtin_dict() -> Dict {
575    Dict::from_bytes(BUILTIN_DICT_BYTES)
576}
577
578// ---------------------------------------------------------------------------
579// Tests
580// ---------------------------------------------------------------------------
581
582#[cfg(test)]
583mod tests {
584    use super::*;
585    use alloc::vec;
586
587    fn small_dict() -> Dict {
588        Dict::from_word_list("กิน\nข้าว\nปลา\nน้ำ\nกินข้าว\n")
589    }
590
591    // ── contains ─────────────────────────────────────────────────────────────
592
593    #[test]
594    fn contains_exact_words() {
595        let d = small_dict();
596        assert!(d.contains("กิน"));
597        assert!(d.contains("ข้าว"));
598        assert!(d.contains("ปลา"));
599        assert!(d.contains("น้ำ"));
600        assert!(d.contains("กินข้าว"));
601    }
602
603    #[test]
604    fn contains_rejects_prefix_only() {
605        let d = small_dict();
606        // "กิ" is a prefix of "กิน" but not itself a word
607        assert!(!d.contains("กิ"));
608    }
609
610    #[test]
611    fn contains_rejects_empty() {
612        let d = small_dict();
613        assert!(!d.contains(""));
614    }
615
616    #[test]
617    fn contains_rejects_unknown() {
618        let d = small_dict();
619        assert!(!d.contains("xyz"));
620        assert!(!d.contains("hello"));
621    }
622
623    #[test]
624    fn contains_ascii() {
625        let d = Dict::from_word_list("hello\nworld\n");
626        assert!(d.contains("hello"));
627        assert!(d.contains("world"));
628        assert!(!d.contains("hell"));
629        assert!(!d.contains("worlds"));
630    }
631
632    #[test]
633    fn contains_comments_and_blanks_skipped() {
634        let d = Dict::from_word_list("# comment\n\nกิน\n   \nข้าว\n");
635        assert!(d.contains("กิน"));
636        assert!(d.contains("ข้าว"));
637        assert!(!d.contains("# comment"));
638    }
639
640    #[test]
641    fn contains_single_char() {
642        let d = Dict::from_word_list("ก\n");
643        assert!(d.contains("ก"));
644        assert!(!d.contains("กก"));
645    }
646
647    // ── prefixes ──────────────────────────────────────────────────────────────
648
649    #[test]
650    fn prefixes_longest_first() {
651        let d = small_dict();
652        let p = d.prefixes("กินข้าวกับปลา");
653        // "กินข้าว" and "กิน" are both prefixes of the input
654        assert!(p.contains(&"กินข้าว"));
655        assert!(p.contains(&"กิน"));
656        // Longest must come first
657        assert_eq!(p[0], "กินข้าว");
658    }
659
660    #[test]
661    fn prefixes_single_match() {
662        let d = small_dict();
663        let p = d.prefixes("ปลาทู");
664        assert_eq!(p, vec!["ปลา"]);
665    }
666
667    #[test]
668    fn prefixes_no_match() {
669        let d = small_dict();
670        assert!(d.prefixes("xyz").is_empty());
671        assert!(d.prefixes("").is_empty());
672    }
673
674    #[test]
675    fn prefixes_exact_match() {
676        let d = small_dict();
677        // "กิน" is exactly a word; text doesn't extend further
678        let p = d.prefixes("กิน");
679        assert_eq!(p, vec!["กิน"]);
680    }
681
682    // ── full-round-trip / structural ──────────────────────────────────────────
683
684    #[test]
685    fn builtin_words_parse() {
686        // Built-in word list must parse without panic and contain a few known words
687        let d = Dict::from_word_list(BUILTIN_WORDS);
688        assert!(d.contains("กิน"));
689        assert!(d.contains("ข้าว"));
690    }
691
692    #[test]
693    fn large_word_list() {
694        // Insert many words and verify all are retrievable
695        let words = [
696            "กิน",
697            "ข้าว",
698            "ปลา",
699            "น้ำ",
700            "คน",
701            "ไป",
702            "มา",
703            "ที่",
704            "นี่",
705            "นั้น",
706            "สวัสดี",
707            "ธนาคาร",
708            "แห่ง",
709            "ชาวโลก",
710            "ประเทศ",
711            "ภาษา",
712            "เมือง",
713            "บ้าน",
714        ];
715        let list: alloc::string::String = words.iter().map(|w| alloc::format!("{w}\n")).collect();
716        let d = Dict::from_word_list(&list);
717        for &w in &words {
718            assert!(d.contains(w), "missing word: {w}");
719        }
720        assert!(!d.contains("notaword"));
721    }
722
723    #[test]
724    fn builtin_contains_common_words() {
725        let d = Dict::from_word_list(BUILTIN_WORDS);
726        let words = ["สวัสดี", "ธนาคาร", "ชาวโลก", "ไป", "มา", "กิน", "ข้าว"];
727        for w in words {
728            assert!(d.contains(w), "builtin dict missing: {w}");
729        }
730        let p = d.prefixes("สวัสดีชาวโลก");
731        assert!(p.contains(&"สวัสดี"), "prefixes missing สวัสดี, got: {p:?}");
732        let p2 = d.prefixes("ธนาคารแห่งนั้น");
733        assert!(
734            p2.contains(&"ธนาคาร"),
735            "prefixes missing ธนาคาร, got: {p2:?}"
736        );
737    }
738
739    #[test]
740    fn check_array_validity() {
741        // Every non-UNUSED entry in check must point to a valid state index.
742        let d = small_dict();
743        let len = d.check.len() as i32;
744        for &c in &d.check {
745            if c != UNUSED {
746                assert!(c >= 0 && c < len, "check entry {c} out of range");
747            }
748        }
749    }
750}
kham_core/dict.rs

kham_core/
dict.rs