kham_core/dict.rs
1//! Dictionary backed by a Double-Array Trie (DARTS).
2//!
3//! ## Structure
4//!
5//! A DARTS encodes a trie in two parallel arrays:
6//!
7//! ```text
8//! base[s] — offset used to compute the next state from state s
9//! check[t] — parent state of state t; -1 means "unused"
10//!
11//! Transition: next = base[s] + byte + 1
12//! Valid iff: check[next] == s
13//! ```
14//!
15//! Adding 1 to every byte value ensures that even byte 0 (used as the
16//! end-of-word sentinel) maps to index ≥ 1, so index 0 is never a child
17//! and `check[0]` can serve as the root sentinel.
18//!
19//! ## Construction
20//!
21//! 1. Parse the word list into a regular trie (adjacency via `BTreeMap`).
22//! 2. BFS over trie nodes. For each node, find the smallest `base` offset
23//! such that `base + byte + 1` is free for every child byte.
24//! 3. Stamp `check[base + byte + 1] = current_darts_state` and record the
25//! mapping from trie-node-id → darts-state-id for children.
26//!
27//! ## Lookup
28//!
29//! - `contains(word)`: follow bytes then follow the NUL sentinel.
30//! - `prefixes(text)`: walk bytes, emit a match whenever NUL sentinel is
31//! reachable at the current state.
32
33use alloc::collections::VecDeque;
34use alloc::vec;
35use alloc::vec::Vec;
36
37// ---------------------------------------------------------------------------
38// Constants
39// ---------------------------------------------------------------------------
40
41/// Marks an unused slot in the `check` array.
42const UNUSED: i32 = -1;
43
44/// Sentinel byte appended to every inserted word.
45/// NUL (0x00) never appears in valid UTF-8 text, so it's a safe sentinel.
46const TERM: u8 = 0;
47
48// ---------------------------------------------------------------------------
49// Intermediate trie (used only during construction)
50// ---------------------------------------------------------------------------
51
52struct TrieNode {
53 /// Sorted (byte, child-node-index) pairs.
54 ///
55 /// A sorted `Vec` is faster than `BTreeMap` for the small child-counts
56 /// typical of trie nodes: no heap allocation per node, better cache
57 /// locality, and linear scan outperforms tree traversal for ≤ ~16 items.
58 children: Vec<(u8, usize)>,
59}
60
61impl TrieNode {
62 fn new() -> Self {
63 Self {
64 children: Vec::new(),
65 }
66 }
67
68 /// Find the index into `children` whose byte equals `b`, or `Err(pos)`
69 /// where `pos` is the insertion point to keep `children` sorted.
70 #[inline]
71 fn find(&self, b: u8) -> Result<usize, usize> {
72 self.children.binary_search_by_key(&b, |&(k, _)| k)
73 }
74
75 /// Return the child node index for byte `b`, or `None`.
76 #[inline]
77 fn get(&self, b: u8) -> Option<usize> {
78 self.find(b).ok().map(|i| self.children[i].1)
79 }
80
81 /// Insert `(b, child_id)` maintaining sorted order.
82 /// Panics in debug mode if `b` is already present.
83 #[inline]
84 fn insert(&mut self, b: u8, child_id: usize) {
85 match self.find(b) {
86 Ok(_) => debug_assert!(false, "duplicate byte in trie node"),
87 Err(pos) => self.children.insert(pos, (b, child_id)),
88 }
89 }
90}
91
92/// Build a plain trie from a word list and return the nodes.
93fn build_trie(text: &str) -> Vec<TrieNode> {
94 let mut nodes: Vec<TrieNode> = vec![TrieNode::new()]; // nodes[0] = root
95
96 for line in text.lines() {
97 let word = line.trim();
98 if word.is_empty() || word.starts_with('#') {
99 continue;
100 }
101 let mut state = 0usize;
102 // Append TERM so the terminal is just another trie node.
103 for b in word.bytes().chain(core::iter::once(TERM)) {
104 if let Some(next) = nodes[state].get(b) {
105 state = next;
106 } else {
107 let next_id = nodes.len();
108 nodes.push(TrieNode::new());
109 nodes[state].insert(b, next_id);
110 state = next_id;
111 }
112 }
113 }
114
115 nodes
116}
117
118// ---------------------------------------------------------------------------
119// Bitmap free-list
120// ---------------------------------------------------------------------------
121
122/// Bitmap tracking which slots in `check` are free (bit = 1 → free).
123///
124/// Provides `next_free_from(pos)` in O(n/64) by scanning 64 bits at a time
125/// with `u64::trailing_zeros`. This is the critical speedup for large word
126/// lists whose trie nodes have wide child-byte spreads (e.g., ASCII 0x20
127/// mixed with Thai 0xE0), where the naive one-slot-at-a-time scan is O(n²).
128struct FreeBitmap {
129 words: Vec<u64>,
130 /// Number of real slots tracked. Bits at indices `cap..` are always 0.
131 cap: usize,
132}
133
134impl FreeBitmap {
135 /// Create a bitmap for slots `0..cap`. Slot 0 is occupied (root sentinel);
136 /// slots `1..cap` are free. Bits above `cap` in the last word are 0.
137 fn new(cap: usize) -> Self {
138 let n_words = cap.div_ceil(64);
139 let mut words = vec![!0u64; n_words];
140 // Clear phantom bits in the last word.
141 let used = cap % 64;
142 if used > 0 && n_words > 0 {
143 words[n_words - 1] = (1u64 << used) - 1;
144 }
145 // Slot 0 occupied (root sentinel).
146 if n_words > 0 {
147 words[0] &= !1u64;
148 }
149 FreeBitmap { words, cap }
150 }
151
152 /// Mark `slot` as occupied (clear its bit).
153 #[inline]
154 fn occupy(&mut self, slot: usize) {
155 debug_assert!(slot < self.cap);
156 self.words[slot / 64] &= !(1u64 << (slot % 64));
157 }
158
159 /// Extend the bitmap to cover `new_cap` slots.
160 ///
161 /// - Old phantom bits (slots `old_cap .. old_n_words*64`) become real free slots.
162 /// - New phantom bits (slots `new_cap .. new_n_words*64`) are cleared.
163 fn grow(&mut self, new_cap: usize) {
164 debug_assert!(new_cap > self.cap);
165 let old_cap = self.cap;
166 let old_words = self.words.len();
167 let new_words = new_cap.div_ceil(64);
168
169 // Un-clear old phantom bits (they're now real free slots).
170 let old_used = old_cap % 64;
171 if old_used > 0 && old_words > 0 {
172 self.words[old_words - 1] |= !((1u64 << old_used) - 1);
173 }
174
175 // Extend with all-free words.
176 self.words.resize(new_words, !0u64);
177
178 // Clear phantom bits in the new last word.
179 let new_used = new_cap % 64;
180 if new_used > 0 && new_words > 0 {
181 self.words[new_words - 1] = (1u64 << new_used) - 1;
182 }
183
184 self.cap = new_cap;
185 }
186
187 /// Return the smallest free slot index `≥ start`, or `self.cap` if none.
188 /// Never returns an index ≥ `self.cap` (phantom bits are never set).
189 fn next_free_from(&self, start: usize) -> usize {
190 if start >= self.cap {
191 return self.cap;
192 }
193 let cap_word = self.cap.div_ceil(64);
194 let word_idx = start / 64;
195 let bit_idx = start % 64;
196
197 // Partial first word.
198 let partial = self.words[word_idx] >> bit_idx;
199 if partial != 0 {
200 return (word_idx * 64 + bit_idx + partial.trailing_zeros() as usize).min(self.cap);
201 }
202
203 // Full remaining words.
204 for i in (word_idx + 1)..cap_word {
205 if self.words[i] != 0 {
206 return (i * 64 + self.words[i].trailing_zeros() as usize).min(self.cap);
207 }
208 }
209
210 self.cap // all occupied
211 }
212}
213
214// ---------------------------------------------------------------------------
215// Base-allocation helper
216// ---------------------------------------------------------------------------
217
218/// Find the smallest offset `b ≥ 1` such that `check[b + byte + 1]` is free
219/// (either `UNUSED` or out-of-bounds) for every byte in `children`.
220///
221/// Uses the bitmap to jump over occupied regions in O(n/64) per jump instead
222/// of O(n). For nodes with wide child-byte spreads (ASCII 0x20 mixed with
223/// Thai 0xE0), this turns an O(n²) construction into near-linear time.
224fn find_base(check: &[i32], children: &[u8], bitmap: &FreeBitmap, min_free: usize) -> usize {
225 debug_assert!(!children.is_empty());
226 let min_child = children[0] as usize; // sorted ascending
227
228 // Start b so min_child lands at or after min_free.
229 let mut b = min_free.saturating_sub(min_child + 1).max(1);
230
231 'search: loop {
232 for &c in children {
233 let pos = b + c as usize + 1;
234 if pos < check.len() && check[pos] != UNUSED {
235 // Jump: advance b so this child lands on the next free slot.
236 let nf = bitmap.next_free_from(pos + 1);
237 b = nf.saturating_sub(c as usize + 1).max(b + 1);
238 continue 'search;
239 }
240 }
241 return b;
242 }
243}
244
245// ---------------------------------------------------------------------------
246// Public Dict type
247// ---------------------------------------------------------------------------
248
249/// Double-Array Trie dictionary.
250///
251/// Provides O(k) lookup where k is the number of **bytes** in the query.
252/// Memory layout is two `Vec<i32>` (base + check), cache-friendly.
253pub struct Dict {
254 base: Vec<i32>,
255 check: Vec<i32>,
256}
257
258impl Dict {
259 // ── Construction ─────────────────────────────────────────────────────────
260
261 /// Build a [`Dict`] from a newline-separated word list.
262 ///
263 /// Lines starting with `#` are treated as comments and skipped.
264 /// Blank lines are skipped. Words are stored as raw UTF-8 bytes.
265 ///
266 /// # Example
267 ///
268 /// ```rust
269 /// use kham_core::dict::Dict;
270 ///
271 /// let dict = Dict::from_word_list("กิน\nข้าว\nปลา\n");
272 /// assert!(dict.contains("กิน"));
273 /// assert!(dict.contains("ข้าว"));
274 /// assert!(!dict.contains("xyz"));
275 /// ```
276 pub fn from_word_list(text: &str) -> Self {
277 let trie = build_trie(text);
278 Self::from_trie(trie)
279 }
280
281 /// Deserialise a [`Dict`] from a pre-compiled DARTS binary blob.
282 ///
283 /// The binary blob is normally produced by `build.rs` at compile time and
284 /// embedded via [`include_bytes!`]. Loading from bytes bypasses the full
285 /// trie-construction pipeline, making it the fastest way to obtain a
286 /// ready-to-use dictionary.
287 ///
288 /// # Binary Format
289 ///
290 /// The blob begins with a fixed 16-byte header followed by the raw array
291 /// data. All multi-byte integers are **little-endian**.
292 ///
293 /// ```text
294 /// Offset Size Field Description
295 /// ────── ──── ─────────── ───────────────────────────────────────────
296 /// 0 4 magic ASCII b"KDAM" — identifies the file type
297 /// 4 1 version Format version; currently 0x01
298 /// 5 3 reserved Must be zero; reserved for future flags
299 /// 8 4 base_len Number of i32 elements in the base array
300 /// 12 4 check_len Number of i32 elements in the check array
301 /// 16 — base[] base_len × 4 bytes, little-endian i32
302 /// 16 + base_len*4 — check[] check_len × 4 bytes, little-endian i32
303 /// ```
304 ///
305 /// # Performance
306 ///
307 /// `O(S)` where `S` is the total byte length of `data`. The function
308 /// performs one pass of `chunks_exact(4).map(i32::from_le_bytes)` over
309 /// each array, allocating exactly `base_len + check_len` i32 values.
310 /// Compare with [`from_word_list`], which is `O(W × K)` — proportional to
311 /// total word bytes — due to trie construction and base-offset search.
312 ///
313 /// | Method | Complexity | Allocation | Use when |
314 /// |-------------------|------------|--------------|--------------------------------|
315 /// | `from_bytes` | O(S) | 2 × `Vec<i32>` | loading pre-built binary blob |
316 /// | `from_word_list` | O(W × K) | trie + DARTS | building from a raw word list |
317 ///
318 /// # Errors / Panics
319 ///
320 /// This function panics — rather than returning a `Result` — because
321 /// failures indicate a corrupted or stale build artifact, not a runtime
322 /// condition the caller can meaningfully recover from. A clean
323 /// `cargo build` always regenerates a valid `dict.bin`.
324 ///
325 /// | Condition | Panic message |
326 /// |------------------------------------------|----------------------------------|
327 /// | `data.len() < 16` | `"dict.bin too short"` |
328 /// | First 4 bytes ≠ `b"KDAM"` | `"dict.bin: bad magic"` |
329 /// | Byte 4 ≠ `0x01` | `"dict.bin: unsupported version"`|
330 ///
331 /// # Example
332 ///
333 /// ```rust
334 /// use kham_core::dict::Dict;
335 ///
336 /// // Typically you embed a pre-built blob with include_bytes! and pass it here.
337 /// // This example constructs a minimal valid blob by hand for illustration.
338 /// let dict = kham_core::dict::builtin_dict();
339 /// assert!(dict.contains("กิน"));
340 /// assert!(dict.contains("ธนาคาร"));
341 /// ```
342 ///
343 /// For the common case of the built-in word list, prefer [`builtin_dict()`],
344 /// which calls this function with the compile-time-embedded blob.
345 ///
346 /// [`from_word_list`]: Dict::from_word_list
347 /// [`builtin_dict()`]: crate::dict::builtin_dict
348 pub fn from_bytes(data: &[u8]) -> Self {
349 const HDR: usize = 16; // 4 magic + 1 version + 3 reserved + 4 base_len + 4 check_len
350 assert!(data.len() >= HDR, "dict.bin too short");
351 assert_eq!(&data[0..4], b"KDAM", "dict.bin: bad magic");
352 assert_eq!(data[4], 0x01, "dict.bin: unsupported version");
353
354 let base_len = u32::from_le_bytes([data[8], data[9], data[10], data[11]]) as usize;
355 let check_len = u32::from_le_bytes([data[12], data[13], data[14], data[15]]) as usize;
356
357 let base_end = HDR + base_len * 4;
358 let check_end = base_end + check_len * 4;
359
360 let base: Vec<i32> = data[HDR..base_end]
361 .chunks_exact(4)
362 .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
363 .collect();
364 let check: Vec<i32> = data[base_end..check_end]
365 .chunks_exact(4)
366 .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
367 .collect();
368
369 Self { base, check }
370 }
371
372 fn from_trie(nodes: Vec<TrieNode>) -> Self {
373 let n = nodes.len();
374 // Initial capacity: trie nodes * average fanout, plus breathing room.
375 let cap = n * 2 + 512;
376 let mut base: Vec<i32> = vec![0; cap];
377 let mut check: Vec<i32> = vec![UNUSED; cap];
378
379 // check[0] = 0 marks the root as "in use" (its own parent = itself).
380 check[0] = 0;
381
382 // trie_to_darts[trie_node_id] = darts_state_index
383 let mut trie_to_darts: Vec<usize> = vec![0; n];
384 // Root trie node (0) → root darts state (0).
385
386 // Bitmap mirrors `check`: bit i is set iff check[i] == UNUSED.
387 // Kept in sync so find_base can jump over occupied runs in O(n/64).
388 let mut bitmap = FreeBitmap::new(cap);
389
390 // Smallest slot known to be free — advances monotonically.
391 let mut min_free: usize = 1;
392
393 let mut queue: VecDeque<usize> = VecDeque::new();
394 queue.push_back(0); // start BFS from root trie node
395
396 while let Some(trie_id) = queue.pop_front() {
397 let darts_id = trie_to_darts[trie_id];
398 let node = &nodes[trie_id];
399
400 if node.children.is_empty() {
401 continue; // leaf node — nothing to allocate
402 }
403
404 // Advance min_free to the next genuinely free slot.
405 while min_free < check.len() && check[min_free] != UNUSED {
406 min_free += 1;
407 }
408
409 // Collect sorted child bytes for base search.
410 // children is already sorted ascending by byte value.
411 let child_bytes: Vec<u8> = node.children.iter().map(|&(b, _)| b).collect();
412
413 // Find a collision-free base offset using the bitmap for fast jumps.
414 let b = find_base(&check, &child_bytes, &bitmap, min_free);
415 base[darts_id] = b as i32;
416
417 // Stamp each child into the arrays.
418 for &(byte, child_trie_id) in &node.children {
419 let child_darts = b + byte as usize + 1;
420
421 // Grow arrays and bitmap if needed.
422 if child_darts + 1 > check.len() {
423 let new_cap = child_darts + 512;
424 base.resize(new_cap, 0);
425 check.resize(new_cap, UNUSED);
426 bitmap.grow(new_cap);
427 }
428
429 check[child_darts] = darts_id as i32;
430 bitmap.occupy(child_darts);
431 trie_to_darts[child_trie_id] = child_darts;
432 queue.push_back(child_trie_id);
433 }
434 }
435
436 // Trim trailing unused slots to reduce memory footprint.
437 let last = check.iter().rposition(|&c| c != UNUSED).unwrap_or(0);
438 base.truncate(last + 1);
439 check.truncate(last + 1);
440
441 Self { base, check }
442 }
443
444 // ── Lookup primitives ────────────────────────────────────────────────────
445
446 /// Follow one transition from `state` on `byte`.
447 /// Returns `Some(next_state)` if valid, `None` otherwise.
448 #[inline]
449 fn goto(&self, state: usize, byte: u8) -> Option<usize> {
450 let next = (*self.base.get(state)?) + byte as i32 + 1;
451 if next <= 0 {
452 return None;
453 }
454 let next = next as usize;
455 if self.check.get(next).copied()? == state as i32 {
456 Some(next)
457 } else {
458 None
459 }
460 }
461
462 // ── Public API ───────────────────────────────────────────────────────────
463
464 /// Returns `true` if `word` is present in the dictionary.
465 ///
466 /// Lookup is O(n) where n is `word.len()` in bytes.
467 ///
468 /// # Example
469 ///
470 /// ```rust
471 /// use kham_core::dict::Dict;
472 ///
473 /// let dict = Dict::from_word_list("สวัสดี\nโลก\n");
474 /// assert!(dict.contains("สวัสดี"));
475 /// assert!(dict.contains("โลก"));
476 /// assert!(!dict.contains("สวัส")); // prefix only — not a word
477 /// assert!(!dict.contains(""));
478 /// ```
479 pub fn contains(&self, word: &str) -> bool {
480 if word.is_empty() {
481 return false;
482 }
483 let mut state = 0usize;
484 for b in word.bytes() {
485 match self.goto(state, b) {
486 Some(s) => state = s,
487 None => return false,
488 }
489 }
490 self.goto(state, TERM).is_some()
491 }
492
493 /// Return all substrings of `text` (anchored at byte 0) that are present
494 /// in the dictionary, ordered **longest first**.
495 ///
496 /// Only returns slices that are valid UTF-8 boundaries (which is always
497 /// guaranteed when the dictionary contains valid UTF-8 words).
498 ///
499 /// # Example
500 ///
501 /// ```rust
502 /// use kham_core::dict::Dict;
503 ///
504 /// let dict = Dict::from_word_list("กิน\nกิน\nกิน\nกินข้าว\n");
505 /// let p = dict.prefixes("กินข้าวกับปลา");
506 /// // Longest prefix is "กินข้าว", then "กิน"
507 /// assert_eq!(p[0], "กินข้าว");
508 /// assert_eq!(p[1], "กิน");
509 /// ```
510 pub fn prefixes<'t>(&self, text: &'t str) -> Vec<&'t str> {
511 let mut result = Vec::new();
512 let mut state = 0usize;
513 let bytes = text.as_bytes();
514
515 for i in 0..bytes.len() {
516 match self.goto(state, bytes[i]) {
517 Some(s) => {
518 state = s;
519 if self.goto(state, TERM).is_some() {
520 // i is the index of the last byte; the prefix is text[0..i+1].
521 // Since dictionary words are valid UTF-8, i+1 is always a
522 // char boundary.
523 debug_assert!(text.is_char_boundary(i + 1));
524 result.push(&text[..i + 1]);
525 }
526 }
527 None => break,
528 }
529 }
530
531 // Reverse so the longest match comes first.
532 result.reverse();
533 result
534 }
535
536 /// Number of allocated DARTS states (informational / testing).
537 #[inline]
538 pub fn state_count(&self) -> usize {
539 self.check.len()
540 }
541}
542
543// ---------------------------------------------------------------------------
544// Built-in word list and pre-compiled binary dictionary
545// ---------------------------------------------------------------------------
546
547/// The built-in CC0 Thai word list, embedded at compile time.
548///
549/// Pass to [`Dict::from_word_list`] to build the default dictionary, or use
550/// [`builtin_dict`] to load the pre-compiled binary form in O(S) time.
551pub static BUILTIN_WORDS: &str = include_str!("../data/words_th.txt");
552
553/// Pre-compiled binary DARTS blob produced by `build.rs` at compile time.
554///
555/// Format: 16-byte header (`b"KDAM"` magic, version, lengths) followed by
556/// `base[]` then `check[]` as little-endian `i32` values.
557static BUILTIN_DICT_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/dict.bin"));
558
559/// Return the built-in dictionary loaded from the pre-compiled binary blob.
560///
561/// This is O(S) — a simple byte copy of the pre-built DARTS arrays — versus
562/// O(W×K) for [`Dict::from_word_list`] which reconstructs the trie at runtime.
563/// Use this in [`crate::segmenter::Tokenizer`] and wherever the default dictionary is needed.
564///
565/// # Example
566///
567/// ```rust
568/// use kham_core::dict::builtin_dict;
569///
570/// let dict = builtin_dict();
571/// assert!(dict.contains("กิน"));
572/// assert!(dict.contains("ธนาคาร"));
573/// ```
574pub fn builtin_dict() -> Dict {
575 Dict::from_bytes(BUILTIN_DICT_BYTES)
576}
577
578// ---------------------------------------------------------------------------
579// Tests
580// ---------------------------------------------------------------------------
581
582#[cfg(test)]
583mod tests {
584 use super::*;
585 use alloc::vec;
586
587 fn small_dict() -> Dict {
588 Dict::from_word_list("กิน\nข้าว\nปลา\nน้ำ\nกินข้าว\n")
589 }
590
591 // ── contains ─────────────────────────────────────────────────────────────
592
593 #[test]
594 fn contains_exact_words() {
595 let d = small_dict();
596 assert!(d.contains("กิน"));
597 assert!(d.contains("ข้าว"));
598 assert!(d.contains("ปลา"));
599 assert!(d.contains("น้ำ"));
600 assert!(d.contains("กินข้าว"));
601 }
602
603 #[test]
604 fn contains_rejects_prefix_only() {
605 let d = small_dict();
606 // "กิ" is a prefix of "กิน" but not itself a word
607 assert!(!d.contains("กิ"));
608 }
609
610 #[test]
611 fn contains_rejects_empty() {
612 let d = small_dict();
613 assert!(!d.contains(""));
614 }
615
616 #[test]
617 fn contains_rejects_unknown() {
618 let d = small_dict();
619 assert!(!d.contains("xyz"));
620 assert!(!d.contains("hello"));
621 }
622
623 #[test]
624 fn contains_ascii() {
625 let d = Dict::from_word_list("hello\nworld\n");
626 assert!(d.contains("hello"));
627 assert!(d.contains("world"));
628 assert!(!d.contains("hell"));
629 assert!(!d.contains("worlds"));
630 }
631
632 #[test]
633 fn contains_comments_and_blanks_skipped() {
634 let d = Dict::from_word_list("# comment\n\nกิน\n \nข้าว\n");
635 assert!(d.contains("กิน"));
636 assert!(d.contains("ข้าว"));
637 assert!(!d.contains("# comment"));
638 }
639
640 #[test]
641 fn contains_single_char() {
642 let d = Dict::from_word_list("ก\n");
643 assert!(d.contains("ก"));
644 assert!(!d.contains("กก"));
645 }
646
647 // ── prefixes ──────────────────────────────────────────────────────────────
648
649 #[test]
650 fn prefixes_longest_first() {
651 let d = small_dict();
652 let p = d.prefixes("กินข้าวกับปลา");
653 // "กินข้าว" and "กิน" are both prefixes of the input
654 assert!(p.contains(&"กินข้าว"));
655 assert!(p.contains(&"กิน"));
656 // Longest must come first
657 assert_eq!(p[0], "กินข้าว");
658 }
659
660 #[test]
661 fn prefixes_single_match() {
662 let d = small_dict();
663 let p = d.prefixes("ปลาทู");
664 assert_eq!(p, vec!["ปลา"]);
665 }
666
667 #[test]
668 fn prefixes_no_match() {
669 let d = small_dict();
670 assert!(d.prefixes("xyz").is_empty());
671 assert!(d.prefixes("").is_empty());
672 }
673
674 #[test]
675 fn prefixes_exact_match() {
676 let d = small_dict();
677 // "กิน" is exactly a word; text doesn't extend further
678 let p = d.prefixes("กิน");
679 assert_eq!(p, vec!["กิน"]);
680 }
681
682 // ── full-round-trip / structural ──────────────────────────────────────────
683
684 #[test]
685 fn builtin_words_parse() {
686 // Built-in word list must parse without panic and contain a few known words
687 let d = Dict::from_word_list(BUILTIN_WORDS);
688 assert!(d.contains("กิน"));
689 assert!(d.contains("ข้าว"));
690 }
691
692 #[test]
693 fn large_word_list() {
694 // Insert many words and verify all are retrievable
695 let words = [
696 "กิน",
697 "ข้าว",
698 "ปลา",
699 "น้ำ",
700 "คน",
701 "ไป",
702 "มา",
703 "ที่",
704 "นี่",
705 "นั้น",
706 "สวัสดี",
707 "ธนาคาร",
708 "แห่ง",
709 "ชาวโลก",
710 "ประเทศ",
711 "ภาษา",
712 "เมือง",
713 "บ้าน",
714 ];
715 let list: alloc::string::String = words.iter().map(|w| alloc::format!("{w}\n")).collect();
716 let d = Dict::from_word_list(&list);
717 for &w in &words {
718 assert!(d.contains(w), "missing word: {w}");
719 }
720 assert!(!d.contains("notaword"));
721 }
722
723 #[test]
724 fn builtin_contains_common_words() {
725 let d = Dict::from_word_list(BUILTIN_WORDS);
726 let words = ["สวัสดี", "ธนาคาร", "ชาวโลก", "ไป", "มา", "กิน", "ข้าว"];
727 for w in words {
728 assert!(d.contains(w), "builtin dict missing: {w}");
729 }
730 let p = d.prefixes("สวัสดีชาวโลก");
731 assert!(p.contains(&"สวัสดี"), "prefixes missing สวัสดี, got: {p:?}");
732 let p2 = d.prefixes("ธนาคารแห่งนั้น");
733 assert!(
734 p2.contains(&"ธนาคาร"),
735 "prefixes missing ธนาคาร, got: {p2:?}"
736 );
737 }
738
739 #[test]
740 fn check_array_validity() {
741 // Every non-UNUSED entry in check must point to a valid state index.
742 let d = small_dict();
743 let len = d.check.len() as i32;
744 for &c in &d.check {
745 if c != UNUSED {
746 assert!(c >= 0 && c < len, "check entry {c} out of range");
747 }
748 }
749 }
750}