kham_core/
dict.rs

1//! Dictionary backed by a Double-Array Trie (DARTS).
2//!
3//! ## Structure
4//!
5//! A DARTS encodes a trie in two parallel arrays:
6//!
7//! ```text
8//! base[s]  — offset used to compute the next state from state s
9//! check[t] — parent state of state t; -1 means "unused"
10//!
11//! Transition:  next = base[s] + byte + 1
12//! Valid iff:   check[next] == s
13//! ```
14//!
15//! Adding 1 to every byte value ensures that even byte 0 (used as the
16//! end-of-word sentinel) maps to index ≥ 1, so index 0 is never a child
17//! and `check[0]` can serve as the root sentinel.
18//!
19//! ## Construction
20//!
21//! 1. Parse the word list into a regular trie (adjacency via `BTreeMap`).
22//! 2. BFS over trie nodes. For each node, find the smallest `base` offset
23//!    such that `base + byte + 1` is free for every child byte.
24//! 3. Stamp `check[base + byte + 1] = current_darts_state` and record the
25//!    mapping from trie-node-id → darts-state-id for children.
26//!
27//! ## Lookup
28//!
29//! - `contains(word)`: follow bytes then follow the NUL sentinel.
30//! - `prefixes(text)`: walk bytes, emit a match whenever NUL sentinel is
31//!   reachable at the current state.
32
33use alloc::collections::VecDeque;
34use alloc::string::String;
35use alloc::vec;
36use alloc::vec::Vec;
37
38// ---------------------------------------------------------------------------
39// Constants
40// ---------------------------------------------------------------------------
41
42/// Marks an unused slot in the `check` array.
43const UNUSED: i32 = -1;
44
45/// Sentinel byte appended to every inserted word.
46/// NUL (0x00) never appears in valid UTF-8 text, so it's a safe sentinel.
47const TERM: u8 = 0;
48
49// ---------------------------------------------------------------------------
50// Intermediate trie (used only during construction)
51// ---------------------------------------------------------------------------
52
53struct TrieNode {
54    /// Sorted (byte, child-node-index) pairs.
55    ///
56    /// A sorted `Vec` is faster than `BTreeMap` for the small child-counts
57    /// typical of trie nodes: no heap allocation per node, better cache
58    /// locality, and linear scan outperforms tree traversal for ≤ ~16 items.
59    children: Vec<(u8, usize)>,
60}
61
62impl TrieNode {
63    fn new() -> Self {
64        Self {
65            children: Vec::new(),
66        }
67    }
68
69    /// Find the index into `children` whose byte equals `b`, or `Err(pos)`
70    /// where `pos` is the insertion point to keep `children` sorted.
71    #[inline]
72    fn find(&self, b: u8) -> Result<usize, usize> {
73        self.children.binary_search_by_key(&b, |&(k, _)| k)
74    }
75
76    /// Return the child node index for byte `b`, or `None`.
77    #[inline]
78    fn get(&self, b: u8) -> Option<usize> {
79        self.find(b).ok().map(|i| self.children[i].1)
80    }
81
82    /// Insert `(b, child_id)` maintaining sorted order.
83    /// Panics in debug mode if `b` is already present.
84    #[inline]
85    fn insert(&mut self, b: u8, child_id: usize) {
86        match self.find(b) {
87            Ok(_) => debug_assert!(false, "duplicate byte in trie node"),
88            Err(pos) => self.children.insert(pos, (b, child_id)),
89        }
90    }
91}
92
93/// Build a plain trie from a word list and return the nodes.
94fn build_trie(text: &str) -> Vec<TrieNode> {
95    let mut nodes: Vec<TrieNode> = vec![TrieNode::new()]; // nodes[0] = root
96
97    for line in text.lines() {
98        let word = line.trim();
99        if word.is_empty() || word.starts_with('#') {
100            continue;
101        }
102        let mut state = 0usize;
103        // Append TERM so the terminal is just another trie node.
104        for b in word.bytes().chain(core::iter::once(TERM)) {
105            if let Some(next) = nodes[state].get(b) {
106                state = next;
107            } else {
108                let next_id = nodes.len();
109                nodes.push(TrieNode::new());
110                nodes[state].insert(b, next_id);
111                state = next_id;
112            }
113        }
114    }
115
116    nodes
117}
118
119// ---------------------------------------------------------------------------
120// Bitmap free-list
121// ---------------------------------------------------------------------------
122
123/// Bitmap tracking which slots in `check` are free (bit = 1 → free).
124///
125/// Provides `next_free_from(pos)` in O(n/64) by scanning 64 bits at a time
126/// with `u64::trailing_zeros`. This is the critical speedup for large word
127/// lists whose trie nodes have wide child-byte spreads (e.g., ASCII 0x20
128/// mixed with Thai 0xE0), where the naive one-slot-at-a-time scan is O(n²).
129struct FreeBitmap {
130    words: Vec<u64>,
131    /// Number of real slots tracked. Bits at indices `cap..` are always 0.
132    cap: usize,
133}
134
135impl FreeBitmap {
136    /// Create a bitmap for slots `0..cap`. Slot 0 is occupied (root sentinel);
137    /// slots `1..cap` are free. Bits above `cap` in the last word are 0.
138    fn new(cap: usize) -> Self {
139        let n_words = cap.div_ceil(64);
140        let mut words = vec![!0u64; n_words];
141        // Clear phantom bits in the last word.
142        let used = cap % 64;
143        if used > 0 && n_words > 0 {
144            words[n_words - 1] = (1u64 << used) - 1;
145        }
146        // Slot 0 occupied (root sentinel).
147        if n_words > 0 {
148            words[0] &= !1u64;
149        }
150        FreeBitmap { words, cap }
151    }
152
153    /// Mark `slot` as occupied (clear its bit).
154    #[inline]
155    fn occupy(&mut self, slot: usize) {
156        debug_assert!(slot < self.cap);
157        self.words[slot / 64] &= !(1u64 << (slot % 64));
158    }
159
160    /// Extend the bitmap to cover `new_cap` slots.
161    ///
162    /// - Old phantom bits (slots `old_cap .. old_n_words*64`) become real free slots.
163    /// - New phantom bits (slots `new_cap .. new_n_words*64`) are cleared.
164    fn grow(&mut self, new_cap: usize) {
165        debug_assert!(new_cap > self.cap);
166        let old_cap = self.cap;
167        let old_words = self.words.len();
168        let new_words = new_cap.div_ceil(64);
169
170        // Un-clear old phantom bits (they're now real free slots).
171        let old_used = old_cap % 64;
172        if old_used > 0 && old_words > 0 {
173            self.words[old_words - 1] |= !((1u64 << old_used) - 1);
174        }
175
176        // Extend with all-free words.
177        self.words.resize(new_words, !0u64);
178
179        // Clear phantom bits in the new last word.
180        let new_used = new_cap % 64;
181        if new_used > 0 && new_words > 0 {
182            self.words[new_words - 1] = (1u64 << new_used) - 1;
183        }
184
185        self.cap = new_cap;
186    }
187
188    /// Return the smallest free slot index `≥ start`, or `self.cap` if none.
189    /// Never returns an index ≥ `self.cap` (phantom bits are never set).
190    fn next_free_from(&self, start: usize) -> usize {
191        if start >= self.cap {
192            return self.cap;
193        }
194        let cap_word = self.cap.div_ceil(64);
195        let word_idx = start / 64;
196        let bit_idx = start % 64;
197
198        // Partial first word.
199        let partial = self.words[word_idx] >> bit_idx;
200        if partial != 0 {
201            return (word_idx * 64 + bit_idx + partial.trailing_zeros() as usize).min(self.cap);
202        }
203
204        // Full remaining words.
205        for i in (word_idx + 1)..cap_word {
206            if self.words[i] != 0 {
207                return (i * 64 + self.words[i].trailing_zeros() as usize).min(self.cap);
208            }
209        }
210
211        self.cap // all occupied
212    }
213}
214
215// ---------------------------------------------------------------------------
216// Base-allocation helper
217// ---------------------------------------------------------------------------
218
219/// Find the smallest offset `b ≥ 1` such that `check[b + byte + 1]` is free
220/// (either `UNUSED` or out-of-bounds) for every byte in `children`.
221///
222/// Uses the bitmap to jump over occupied regions in O(n/64) per jump instead
223/// of O(n). For nodes with wide child-byte spreads (ASCII 0x20 mixed with
224/// Thai 0xE0), this turns an O(n²) construction into near-linear time.
225fn find_base(check: &[i32], children: &[u8], bitmap: &FreeBitmap, min_free: usize) -> usize {
226    debug_assert!(!children.is_empty());
227    let min_child = children[0] as usize; // sorted ascending
228
229    // Start b so min_child lands at or after min_free.
230    let mut b = min_free.saturating_sub(min_child + 1).max(1);
231
232    'search: loop {
233        for &c in children {
234            let pos = b + c as usize + 1;
235            if pos < check.len() && check[pos] != UNUSED {
236                // Jump: advance b so this child lands on the next free slot.
237                let nf = bitmap.next_free_from(pos + 1);
238                b = nf.saturating_sub(c as usize + 1).max(b + 1);
239                continue 'search;
240            }
241        }
242        return b;
243    }
244}
245
246// ---------------------------------------------------------------------------
247// Public Dict type
248// ---------------------------------------------------------------------------
249
250/// Double-Array Trie dictionary.
251///
252/// Provides O(k) lookup where k is the number of **bytes** in the query.
253/// Memory layout is two `Vec<i32>` (base + check), cache-friendly.
254pub struct Dict {
255    base: Vec<i32>,
256    check: Vec<i32>,
257    /// Extra words added via [`Dict::with_overlay`], sorted alphabetically for
258    /// O(log k) `contains` lookup. Scanned linearly in `prefixes`.
259    overlay: Vec<String>,
260}
261
262impl Dict {
263    // ── Construction ─────────────────────────────────────────────────────────
264
265    /// Build a [`Dict`] from a newline-separated word list.
266    ///
267    /// Lines starting with `#` are treated as comments and skipped.
268    /// Blank lines are skipped. Words are stored as raw UTF-8 bytes.
269    ///
270    /// # Example
271    ///
272    /// ```rust
273    /// use kham_core::dict::Dict;
274    ///
275    /// let dict = Dict::from_word_list("กิน\nข้าว\nปลา\n");
276    /// assert!(dict.contains("กิน"));
277    /// assert!(dict.contains("ข้าว"));
278    /// assert!(!dict.contains("xyz"));
279    /// ```
280    pub fn from_word_list(text: &str) -> Self {
281        let trie = build_trie(text);
282        Self::from_trie(trie)
283    }
284
285    /// Deserialise a [`Dict`] from a pre-compiled DARTS binary blob.
286    ///
287    /// The binary blob is normally produced by `build.rs` at compile time and
288    /// embedded via [`include_bytes!`]. Loading from bytes bypasses the full
289    /// trie-construction pipeline, making it the fastest way to obtain a
290    /// ready-to-use dictionary.
291    ///
292    /// # Binary Format
293    ///
294    /// The blob begins with a fixed 16-byte header followed by the raw array
295    /// data. All multi-byte integers are **little-endian**.
296    ///
297    /// ```text
298    /// Offset  Size  Field        Description
299    /// ──────  ────  ───────────  ───────────────────────────────────────────
300    ///      0     4  magic        ASCII b"KDAM" — identifies the file type
301    ///      4     1  version      Format version; currently 0x01
302    ///      5     3  reserved     Must be zero; reserved for future flags
303    ///      8     4  base_len     Number of i32 elements in the base array
304    ///     12     4  check_len    Number of i32 elements in the check array
305    ///     16     —  base[]       base_len × 4 bytes, little-endian i32
306    ///     16 + base_len*4  —  check[]  check_len × 4 bytes, little-endian i32
307    /// ```
308    ///
309    /// # Performance
310    ///
311    /// `O(S)` where `S` is the total byte length of `data`. The function
312    /// performs one pass of `chunks_exact(4).map(i32::from_le_bytes)` over
313    /// each array, allocating exactly `base_len + check_len` i32 values.
314    /// Compare with [`from_word_list`], which is `O(W × K)` — proportional to
315    /// total word bytes — due to trie construction and base-offset search.
316    ///
317    /// | Method            | Complexity | Allocation   | Use when                       |
318    /// |-------------------|------------|--------------|--------------------------------|
319    /// | `from_bytes`      | O(S)       | 2 × `Vec<i32>` | loading pre-built binary blob  |
320    /// | `from_word_list`  | O(W × K)   | trie + DARTS   | building from a raw word list  |
321    ///
322    /// # Errors / Panics
323    ///
324    /// This function panics — rather than returning a `Result` — because
325    /// failures indicate a corrupted or stale build artifact, not a runtime
326    /// condition the caller can meaningfully recover from. A clean
327    /// `cargo build` always regenerates a valid `dict.bin`.
328    ///
329    /// | Condition                                | Panic message                    |
330    /// |------------------------------------------|----------------------------------|
331    /// | `data.len() < 16`                        | `"dict.bin too short"`           |
332    /// | First 4 bytes ≠ `b"KDAM"`               | `"dict.bin: bad magic"`          |
333    /// | Byte 4 ≠ `0x01`                          | `"dict.bin: unsupported version"`|
334    ///
335    /// # Example
336    ///
337    /// ```rust
338    /// use kham_core::dict::Dict;
339    ///
340    /// // Typically you embed a pre-built blob with include_bytes! and pass it here.
341    /// // This example constructs a minimal valid blob by hand for illustration.
342    /// let dict = kham_core::dict::builtin_dict();
343    /// assert!(dict.contains("กิน"));
344    /// assert!(dict.contains("ธนาคาร"));
345    /// ```
346    ///
347    /// For the common case of the built-in word list, prefer [`builtin_dict()`],
348    /// which calls this function with the compile-time-embedded blob.
349    ///
350    /// [`from_word_list`]: Dict::from_word_list
351    /// [`builtin_dict()`]: crate::dict::builtin_dict
352    pub fn from_bytes(data: &[u8]) -> Self {
353        const HDR: usize = 16; // 4 magic + 1 version + 3 reserved + 4 base_len + 4 check_len
354        assert!(data.len() >= HDR, "dict.bin too short");
355        assert_eq!(&data[0..4], b"KDAM", "dict.bin: bad magic");
356        assert_eq!(data[4], 0x01, "dict.bin: unsupported version");
357
358        let base_len = u32::from_le_bytes([data[8], data[9], data[10], data[11]]) as usize;
359        let check_len = u32::from_le_bytes([data[12], data[13], data[14], data[15]]) as usize;
360
361        let base_end = HDR + base_len * 4;
362        let check_end = base_end + check_len * 4;
363
364        let base: Vec<i32> = data[HDR..base_end]
365            .chunks_exact(4)
366            .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
367            .collect();
368        let check: Vec<i32> = data[base_end..check_end]
369            .chunks_exact(4)
370            .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
371            .collect();
372
373        Self {
374            base,
375            check,
376            overlay: Vec::new(),
377        }
378    }
379
380    fn from_trie(nodes: Vec<TrieNode>) -> Self {
381        let n = nodes.len();
382        // Initial capacity: trie nodes * average fanout, plus breathing room.
383        let cap = n * 2 + 512;
384        let mut base: Vec<i32> = vec![0; cap];
385        let mut check: Vec<i32> = vec![UNUSED; cap];
386
387        // check[0] = 0 marks the root as "in use" (its own parent = itself).
388        check[0] = 0;
389
390        // trie_to_darts[trie_node_id] = darts_state_index
391        let mut trie_to_darts: Vec<usize> = vec![0; n];
392        // Root trie node (0) → root darts state (0).
393
394        // Bitmap mirrors `check`: bit i is set iff check[i] == UNUSED.
395        // Kept in sync so find_base can jump over occupied runs in O(n/64).
396        let mut bitmap = FreeBitmap::new(cap);
397
398        // Smallest slot known to be free — advances monotonically.
399        let mut min_free: usize = 1;
400
401        let mut queue: VecDeque<usize> = VecDeque::new();
402        queue.push_back(0); // start BFS from root trie node
403
404        while let Some(trie_id) = queue.pop_front() {
405            let darts_id = trie_to_darts[trie_id];
406            let node = &nodes[trie_id];
407
408            if node.children.is_empty() {
409                continue; // leaf node — nothing to allocate
410            }
411
412            // Advance min_free to the next genuinely free slot.
413            while min_free < check.len() && check[min_free] != UNUSED {
414                min_free += 1;
415            }
416
417            // Collect sorted child bytes for base search.
418            // children is already sorted ascending by byte value.
419            let child_bytes: Vec<u8> = node.children.iter().map(|&(b, _)| b).collect();
420
421            // Find a collision-free base offset using the bitmap for fast jumps.
422            let b = find_base(&check, &child_bytes, &bitmap, min_free);
423            base[darts_id] = b as i32;
424
425            // Stamp each child into the arrays.
426            for &(byte, child_trie_id) in &node.children {
427                let child_darts = b + byte as usize + 1;
428
429                // Grow arrays and bitmap if needed.
430                if child_darts + 1 > check.len() {
431                    let new_cap = child_darts + 512;
432                    base.resize(new_cap, 0);
433                    check.resize(new_cap, UNUSED);
434                    bitmap.grow(new_cap);
435                }
436
437                check[child_darts] = darts_id as i32;
438                bitmap.occupy(child_darts);
439                trie_to_darts[child_trie_id] = child_darts;
440                queue.push_back(child_trie_id);
441            }
442        }
443
444        // Trim trailing unused slots to reduce memory footprint.
445        let last = check.iter().rposition(|&c| c != UNUSED).unwrap_or(0);
446        base.truncate(last + 1);
447        check.truncate(last + 1);
448
449        Self {
450            base,
451            check,
452            overlay: Vec::new(),
453        }
454    }
455
456    // ── Lookup primitives ────────────────────────────────────────────────────
457
458    /// Follow one transition from `state` on `byte`.
459    /// Returns `Some(next_state)` if valid, `None` otherwise.
460    #[inline]
461    fn goto(&self, state: usize, byte: u8) -> Option<usize> {
462        let next = (*self.base.get(state)?) + byte as i32 + 1;
463        if next <= 0 {
464            return None;
465        }
466        let next = next as usize;
467        if self.check.get(next).copied()? == state as i32 {
468            Some(next)
469        } else {
470            None
471        }
472    }
473
474    // ── Public API ───────────────────────────────────────────────────────────
475
476    /// Extend this dictionary with additional words without rebuilding the trie.
477    ///
478    /// Custom words are stored in a sorted overlay and checked alongside the
479    /// pre-compiled trie in [`contains`](Dict::contains) and
480    /// [`prefixes`](Dict::prefixes). This is O(k log k) in the number of
481    /// extra words — much cheaper than a full trie rebuild via
482    /// [`from_word_list`](Dict::from_word_list).
483    ///
484    /// Lines starting with `#` and blank lines are skipped.
485    ///
486    /// # Example
487    ///
488    /// ```rust
489    /// use kham_core::dict::builtin_dict;
490    ///
491    /// let dict = builtin_dict().with_overlay("ปัญญาประดิษฐ์\nโปรแกรมเมอร์\n");
492    /// assert!(dict.contains("กิน"));           // built-in word
493    /// assert!(dict.contains("ปัญญาประดิษฐ์")); // overlay word
494    /// ```
495    pub fn with_overlay(mut self, words: &str) -> Self {
496        for line in words.lines() {
497            let w = line.trim();
498            if w.is_empty() || w.starts_with('#') {
499                continue;
500            }
501            if let Err(pos) = self.overlay.binary_search_by(|e| e.as_str().cmp(w)) {
502                self.overlay.insert(pos, String::from(w));
503            }
504        }
505        self
506    }
507
508    /// Returns `true` if `word` is present in the dictionary or the overlay.
509    ///
510    /// Lookup is O(n + log k) where n is `word.len()` in bytes and k is the
511    /// overlay size.
512    ///
513    /// # Example
514    ///
515    /// ```rust
516    /// use kham_core::dict::Dict;
517    ///
518    /// let dict = Dict::from_word_list("สวัสดี\nโลก\n");
519    /// assert!(dict.contains("สวัสดี"));
520    /// assert!(dict.contains("โลก"));
521    /// assert!(!dict.contains("สวัส")); // prefix only — not a word
522    /// assert!(!dict.contains(""));
523    /// ```
524    pub fn contains(&self, word: &str) -> bool {
525        if word.is_empty() {
526            return false;
527        }
528        // Check overlay first (sorted Vec — O(log k))
529        if self
530            .overlay
531            .binary_search_by(|e| e.as_str().cmp(word))
532            .is_ok()
533        {
534            return true;
535        }
536        let mut state = 0usize;
537        for b in word.bytes() {
538            match self.goto(state, b) {
539                Some(s) => state = s,
540                None => return false,
541            }
542        }
543        self.goto(state, TERM).is_some()
544    }
545
546    /// Return all substrings of `text` (anchored at byte 0) that are present
547    /// in the dictionary, ordered **longest first**.
548    ///
549    /// Only returns slices that are valid UTF-8 boundaries (which is always
550    /// guaranteed when the dictionary contains valid UTF-8 words).
551    ///
552    /// # Example
553    ///
554    /// ```rust
555    /// use kham_core::dict::Dict;
556    ///
557    /// let dict = Dict::from_word_list("กิน\nกิน\nกิน\nกินข้าว\n");
558    /// let p = dict.prefixes("กินข้าวกับปลา");
559    /// // Longest prefix is "กินข้าว", then "กิน"
560    /// assert_eq!(p[0], "กินข้าว");
561    /// assert_eq!(p[1], "กิน");
562    /// ```
563    pub fn prefixes<'t>(&self, text: &'t str) -> Vec<&'t str> {
564        let mut result = Vec::new();
565        let mut state = 0usize;
566        let bytes = text.as_bytes();
567
568        for i in 0..bytes.len() {
569            match self.goto(state, bytes[i]) {
570                Some(s) => {
571                    state = s;
572                    if self.goto(state, TERM).is_some() {
573                        // i is the index of the last byte; the prefix is text[0..i+1].
574                        // Since dictionary words are valid UTF-8, i+1 is always a
575                        // char boundary.
576                        debug_assert!(text.is_char_boundary(i + 1));
577                        result.push(&text[..i + 1]);
578                    }
579                }
580                None => break,
581            }
582        }
583
584        // Reverse so the longest match comes first.
585        result.reverse();
586
587        // Add any overlay words that are prefixes of text.
588        for w in &self.overlay {
589            if text.starts_with(w.as_str()) && !result.contains(&&text[..w.len()]) {
590                result.push(&text[..w.len()]);
591            }
592        }
593
594        // Re-sort by length descending so overlay insertions maintain the convention.
595        result.sort_unstable_by_key(|s| core::cmp::Reverse(s.len()));
596        result
597    }
598
599    /// Number of allocated DARTS states (informational / testing).
600    #[inline]
601    pub fn state_count(&self) -> usize {
602        self.check.len()
603    }
604}
605
606// ---------------------------------------------------------------------------
607// Built-in word list and pre-compiled binary dictionary
608// ---------------------------------------------------------------------------
609
610/// The built-in CC0 Thai word list, embedded at compile time.
611///
612/// Pass to [`Dict::from_word_list`] to build the default dictionary, or use
613/// [`builtin_dict`] to load the pre-compiled binary form in O(S) time.
614pub static BUILTIN_WORDS: &str = include_str!("../data/words_th.txt");
615
616/// Pre-compiled binary DARTS blob produced by `build.rs` at compile time.
617///
618/// Format: 16-byte header (`b"KDAM"` magic, version, lengths) followed by
619/// `base[]` then `check[]` as little-endian `i32` values.
620static BUILTIN_DICT_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/dict.bin"));
621
622/// Return the built-in dictionary loaded from the pre-compiled binary blob.
623///
624/// This is O(S) — a simple byte copy of the pre-built DARTS arrays — versus
625/// O(W×K) for [`Dict::from_word_list`] which reconstructs the trie at runtime.
626/// Use this in [`crate::segmenter::Tokenizer`] and wherever the default dictionary is needed.
627///
628/// # Example
629///
630/// ```rust
631/// use kham_core::dict::builtin_dict;
632///
633/// let dict = builtin_dict();
634/// assert!(dict.contains("กิน"));
635/// assert!(dict.contains("ธนาคาร"));
636/// ```
637pub fn builtin_dict() -> Dict {
638    Dict::from_bytes(BUILTIN_DICT_BYTES)
639}
640
641// ---------------------------------------------------------------------------
642// Tests
643// ---------------------------------------------------------------------------
644
645#[cfg(test)]
646mod tests {
647    use super::*;
648    use alloc::vec;
649
650    fn small_dict() -> Dict {
651        Dict::from_word_list("กิน\nข้าว\nปลา\nน้ำ\nกินข้าว\n")
652    }
653
654    // ── contains ─────────────────────────────────────────────────────────────
655
656    #[test]
657    fn contains_exact_words() {
658        let d = small_dict();
659        assert!(d.contains("กิน"));
660        assert!(d.contains("ข้าว"));
661        assert!(d.contains("ปลา"));
662        assert!(d.contains("น้ำ"));
663        assert!(d.contains("กินข้าว"));
664    }
665
666    #[test]
667    fn contains_rejects_prefix_only() {
668        let d = small_dict();
669        // "กิ" is a prefix of "กิน" but not itself a word
670        assert!(!d.contains("กิ"));
671    }
672
673    #[test]
674    fn contains_rejects_empty() {
675        let d = small_dict();
676        assert!(!d.contains(""));
677    }
678
679    #[test]
680    fn contains_rejects_unknown() {
681        let d = small_dict();
682        assert!(!d.contains("xyz"));
683        assert!(!d.contains("hello"));
684    }
685
686    #[test]
687    fn contains_ascii() {
688        let d = Dict::from_word_list("hello\nworld\n");
689        assert!(d.contains("hello"));
690        assert!(d.contains("world"));
691        assert!(!d.contains("hell"));
692        assert!(!d.contains("worlds"));
693    }
694
695    #[test]
696    fn contains_comments_and_blanks_skipped() {
697        let d = Dict::from_word_list("# comment\n\nกิน\n   \nข้าว\n");
698        assert!(d.contains("กิน"));
699        assert!(d.contains("ข้าว"));
700        assert!(!d.contains("# comment"));
701    }
702
703    #[test]
704    fn contains_single_char() {
705        let d = Dict::from_word_list("ก\n");
706        assert!(d.contains("ก"));
707        assert!(!d.contains("กก"));
708    }
709
710    // ── prefixes ──────────────────────────────────────────────────────────────
711
712    #[test]
713    fn prefixes_longest_first() {
714        let d = small_dict();
715        let p = d.prefixes("กินข้าวกับปลา");
716        // "กินข้าว" and "กิน" are both prefixes of the input
717        assert!(p.contains(&"กินข้าว"));
718        assert!(p.contains(&"กิน"));
719        // Longest must come first
720        assert_eq!(p[0], "กินข้าว");
721    }
722
723    #[test]
724    fn prefixes_single_match() {
725        let d = small_dict();
726        let p = d.prefixes("ปลาทู");
727        assert_eq!(p, vec!["ปลา"]);
728    }
729
730    #[test]
731    fn prefixes_no_match() {
732        let d = small_dict();
733        assert!(d.prefixes("xyz").is_empty());
734        assert!(d.prefixes("").is_empty());
735    }
736
737    #[test]
738    fn prefixes_exact_match() {
739        let d = small_dict();
740        // "กิน" is exactly a word; text doesn't extend further
741        let p = d.prefixes("กิน");
742        assert_eq!(p, vec!["กิน"]);
743    }
744
745    // ── full-round-trip / structural ──────────────────────────────────────────
746
747    #[test]
748    fn builtin_words_parse() {
749        // Built-in word list must parse without panic and contain a few known words
750        let d = Dict::from_word_list(BUILTIN_WORDS);
751        assert!(d.contains("กิน"));
752        assert!(d.contains("ข้าว"));
753    }
754
755    #[test]
756    fn large_word_list() {
757        // Insert many words and verify all are retrievable
758        let words = [
759            "กิน",
760            "ข้าว",
761            "ปลา",
762            "น้ำ",
763            "คน",
764            "ไป",
765            "มา",
766            "ที่",
767            "นี่",
768            "นั้น",
769            "สวัสดี",
770            "ธนาคาร",
771            "แห่ง",
772            "ชาวโลก",
773            "ประเทศ",
774            "ภาษา",
775            "เมือง",
776            "บ้าน",
777        ];
778        let list: alloc::string::String = words.iter().map(|w| alloc::format!("{w}\n")).collect();
779        let d = Dict::from_word_list(&list);
780        for &w in &words {
781            assert!(d.contains(w), "missing word: {w}");
782        }
783        assert!(!d.contains("notaword"));
784    }
785
786    #[test]
787    fn builtin_contains_common_words() {
788        let d = Dict::from_word_list(BUILTIN_WORDS);
789        let words = ["สวัสดี", "ธนาคาร", "ชาวโลก", "ไป", "มา", "กิน", "ข้าว"];
790        for w in words {
791            assert!(d.contains(w), "builtin dict missing: {w}");
792        }
793        let p = d.prefixes("สวัสดีชาวโลก");
794        assert!(p.contains(&"สวัสดี"), "prefixes missing สวัสดี, got: {p:?}");
795        let p2 = d.prefixes("ธนาคารแห่งนั้น");
796        assert!(
797            p2.contains(&"ธนาคาร"),
798            "prefixes missing ธนาคาร, got: {p2:?}"
799        );
800    }
801
802    #[test]
803    fn check_array_validity() {
804        // Every non-UNUSED entry in check must point to a valid state index.
805        let d = small_dict();
806        let len = d.check.len() as i32;
807        for &c in &d.check {
808            if c != UNUSED {
809                assert!(c >= 0 && c < len, "check entry {c} out of range");
810            }
811        }
812    }
813}
kham_core/dict.rs

kham_core/
dict.rs