kham_core/dict.rs
1//! Dictionary backed by a Double-Array Trie (DARTS).
2//!
3//! ## Structure
4//!
5//! A DARTS encodes a trie in two parallel arrays:
6//!
7//! ```text
8//! base[s] — offset used to compute the next state from state s
9//! check[t] — parent state of state t; -1 means "unused"
10//!
11//! Transition: next = base[s] + byte + 1
12//! Valid iff: check[next] == s
13//! ```
14//!
15//! Adding 1 to every byte value ensures that even byte 0 (used as the
16//! end-of-word sentinel) maps to index ≥ 1, so index 0 is never a child
17//! and `check[0]` can serve as the root sentinel.
18//!
19//! ## Construction
20//!
21//! 1. Parse the word list into a regular trie (adjacency via `BTreeMap`).
22//! 2. BFS over trie nodes. For each node, find the smallest `base` offset
23//! such that `base + byte + 1` is free for every child byte.
24//! 3. Stamp `check[base + byte + 1] = current_darts_state` and record the
25//! mapping from trie-node-id → darts-state-id for children.
26//!
27//! ## Lookup
28//!
29//! - `contains(word)`: follow bytes then follow the NUL sentinel.
30//! - `prefixes(text)`: walk bytes, emit a match whenever NUL sentinel is
31//! reachable at the current state.
32
33use alloc::collections::VecDeque;
34use alloc::string::String;
35use alloc::vec;
36use alloc::vec::Vec;
37
38// ---------------------------------------------------------------------------
39// Constants
40// ---------------------------------------------------------------------------
41
42/// Marks an unused slot in the `check` array.
43const UNUSED: i32 = -1;
44
45/// Sentinel byte appended to every inserted word.
46/// NUL (0x00) never appears in valid UTF-8 text, so it's a safe sentinel.
47const TERM: u8 = 0;
48
49// ---------------------------------------------------------------------------
50// Intermediate trie (used only during construction)
51// ---------------------------------------------------------------------------
52
53struct TrieNode {
54 /// Sorted (byte, child-node-index) pairs.
55 ///
56 /// A sorted `Vec` is faster than `BTreeMap` for the small child-counts
57 /// typical of trie nodes: no heap allocation per node, better cache
58 /// locality, and linear scan outperforms tree traversal for ≤ ~16 items.
59 children: Vec<(u8, usize)>,
60}
61
62impl TrieNode {
63 fn new() -> Self {
64 Self {
65 children: Vec::new(),
66 }
67 }
68
69 /// Find the index into `children` whose byte equals `b`, or `Err(pos)`
70 /// where `pos` is the insertion point to keep `children` sorted.
71 #[inline]
72 fn find(&self, b: u8) -> Result<usize, usize> {
73 self.children.binary_search_by_key(&b, |&(k, _)| k)
74 }
75
76 /// Return the child node index for byte `b`, or `None`.
77 #[inline]
78 fn get(&self, b: u8) -> Option<usize> {
79 self.find(b).ok().map(|i| self.children[i].1)
80 }
81
82 /// Insert `(b, child_id)` maintaining sorted order.
83 /// Panics in debug mode if `b` is already present.
84 #[inline]
85 fn insert(&mut self, b: u8, child_id: usize) {
86 match self.find(b) {
87 Ok(_) => debug_assert!(false, "duplicate byte in trie node"),
88 Err(pos) => self.children.insert(pos, (b, child_id)),
89 }
90 }
91}
92
93/// Build a plain trie from a word list and return the nodes.
94fn build_trie(text: &str) -> Vec<TrieNode> {
95 let mut nodes: Vec<TrieNode> = vec![TrieNode::new()]; // nodes[0] = root
96
97 for line in text.lines() {
98 let word = line.trim();
99 if word.is_empty() || word.starts_with('#') {
100 continue;
101 }
102 let mut state = 0usize;
103 // Append TERM so the terminal is just another trie node.
104 for b in word.bytes().chain(core::iter::once(TERM)) {
105 if let Some(next) = nodes[state].get(b) {
106 state = next;
107 } else {
108 let next_id = nodes.len();
109 nodes.push(TrieNode::new());
110 nodes[state].insert(b, next_id);
111 state = next_id;
112 }
113 }
114 }
115
116 nodes
117}
118
119// ---------------------------------------------------------------------------
120// Bitmap free-list
121// ---------------------------------------------------------------------------
122
123/// Bitmap tracking which slots in `check` are free (bit = 1 → free).
124///
125/// Provides `next_free_from(pos)` in O(n/64) by scanning 64 bits at a time
126/// with `u64::trailing_zeros`. This is the critical speedup for large word
127/// lists whose trie nodes have wide child-byte spreads (e.g., ASCII 0x20
128/// mixed with Thai 0xE0), where the naive one-slot-at-a-time scan is O(n²).
129struct FreeBitmap {
130 words: Vec<u64>,
131 /// Number of real slots tracked. Bits at indices `cap..` are always 0.
132 cap: usize,
133}
134
135impl FreeBitmap {
136 /// Create a bitmap for slots `0..cap`. Slot 0 is occupied (root sentinel);
137 /// slots `1..cap` are free. Bits above `cap` in the last word are 0.
138 fn new(cap: usize) -> Self {
139 let n_words = cap.div_ceil(64);
140 let mut words = vec![!0u64; n_words];
141 // Clear phantom bits in the last word.
142 let used = cap % 64;
143 if used > 0 && n_words > 0 {
144 words[n_words - 1] = (1u64 << used) - 1;
145 }
146 // Slot 0 occupied (root sentinel).
147 if n_words > 0 {
148 words[0] &= !1u64;
149 }
150 FreeBitmap { words, cap }
151 }
152
153 /// Mark `slot` as occupied (clear its bit).
154 #[inline]
155 fn occupy(&mut self, slot: usize) {
156 debug_assert!(slot < self.cap);
157 self.words[slot / 64] &= !(1u64 << (slot % 64));
158 }
159
160 /// Extend the bitmap to cover `new_cap` slots.
161 ///
162 /// - Old phantom bits (slots `old_cap .. old_n_words*64`) become real free slots.
163 /// - New phantom bits (slots `new_cap .. new_n_words*64`) are cleared.
164 fn grow(&mut self, new_cap: usize) {
165 debug_assert!(new_cap > self.cap);
166 let old_cap = self.cap;
167 let old_words = self.words.len();
168 let new_words = new_cap.div_ceil(64);
169
170 // Un-clear old phantom bits (they're now real free slots).
171 let old_used = old_cap % 64;
172 if old_used > 0 && old_words > 0 {
173 self.words[old_words - 1] |= !((1u64 << old_used) - 1);
174 }
175
176 // Extend with all-free words.
177 self.words.resize(new_words, !0u64);
178
179 // Clear phantom bits in the new last word.
180 let new_used = new_cap % 64;
181 if new_used > 0 && new_words > 0 {
182 self.words[new_words - 1] = (1u64 << new_used) - 1;
183 }
184
185 self.cap = new_cap;
186 }
187
188 /// Return the smallest free slot index `≥ start`, or `self.cap` if none.
189 /// Never returns an index ≥ `self.cap` (phantom bits are never set).
190 fn next_free_from(&self, start: usize) -> usize {
191 if start >= self.cap {
192 return self.cap;
193 }
194 let cap_word = self.cap.div_ceil(64);
195 let word_idx = start / 64;
196 let bit_idx = start % 64;
197
198 // Partial first word.
199 let partial = self.words[word_idx] >> bit_idx;
200 if partial != 0 {
201 return (word_idx * 64 + bit_idx + partial.trailing_zeros() as usize).min(self.cap);
202 }
203
204 // Full remaining words.
205 for i in (word_idx + 1)..cap_word {
206 if self.words[i] != 0 {
207 return (i * 64 + self.words[i].trailing_zeros() as usize).min(self.cap);
208 }
209 }
210
211 self.cap // all occupied
212 }
213}
214
215// ---------------------------------------------------------------------------
216// Base-allocation helper
217// ---------------------------------------------------------------------------
218
219/// Find the smallest offset `b ≥ 1` such that `check[b + byte + 1]` is free
220/// (either `UNUSED` or out-of-bounds) for every byte in `children`.
221///
222/// Uses the bitmap to jump over occupied regions in O(n/64) per jump instead
223/// of O(n). For nodes with wide child-byte spreads (ASCII 0x20 mixed with
224/// Thai 0xE0), this turns an O(n²) construction into near-linear time.
225fn find_base(check: &[i32], children: &[u8], bitmap: &FreeBitmap, min_free: usize) -> usize {
226 debug_assert!(!children.is_empty());
227 let min_child = children[0] as usize; // sorted ascending
228
229 // Start b so min_child lands at or after min_free.
230 let mut b = min_free.saturating_sub(min_child + 1).max(1);
231
232 'search: loop {
233 for &c in children {
234 let pos = b + c as usize + 1;
235 if pos < check.len() && check[pos] != UNUSED {
236 // Jump: advance b so this child lands on the next free slot.
237 let nf = bitmap.next_free_from(pos + 1);
238 b = nf.saturating_sub(c as usize + 1).max(b + 1);
239 continue 'search;
240 }
241 }
242 return b;
243 }
244}
245
246// ---------------------------------------------------------------------------
247// Public Dict type
248// ---------------------------------------------------------------------------
249
250/// Double-Array Trie dictionary.
251///
252/// Provides O(k) lookup where k is the number of **bytes** in the query.
253/// Memory layout is two `Vec<i32>` (base + check), cache-friendly.
254pub struct Dict {
255 base: Vec<i32>,
256 check: Vec<i32>,
257 /// Extra words added via [`Dict::with_overlay`], sorted alphabetically for
258 /// O(log k) `contains` lookup. Scanned linearly in `prefixes`.
259 overlay: Vec<String>,
260}
261
262impl Dict {
263 // ── Construction ─────────────────────────────────────────────────────────
264
265 /// Build a [`Dict`] from a newline-separated word list.
266 ///
267 /// Lines starting with `#` are treated as comments and skipped.
268 /// Blank lines are skipped. Words are stored as raw UTF-8 bytes.
269 ///
270 /// # Example
271 ///
272 /// ```rust
273 /// use kham_core::dict::Dict;
274 ///
275 /// let dict = Dict::from_word_list("กิน\nข้าว\nปลา\n");
276 /// assert!(dict.contains("กิน"));
277 /// assert!(dict.contains("ข้าว"));
278 /// assert!(!dict.contains("xyz"));
279 /// ```
280 pub fn from_word_list(text: &str) -> Self {
281 let trie = build_trie(text);
282 Self::from_trie(trie)
283 }
284
285 /// Deserialise a [`Dict`] from a pre-compiled DARTS binary blob.
286 ///
287 /// The binary blob is normally produced by `build.rs` at compile time and
288 /// embedded via [`include_bytes!`]. Loading from bytes bypasses the full
289 /// trie-construction pipeline, making it the fastest way to obtain a
290 /// ready-to-use dictionary.
291 ///
292 /// # Binary Format
293 ///
294 /// The blob begins with a fixed 16-byte header followed by the raw array
295 /// data. All multi-byte integers are **little-endian**.
296 ///
297 /// ```text
298 /// Offset Size Field Description
299 /// ────── ──── ─────────── ───────────────────────────────────────────
300 /// 0 4 magic ASCII b"KDAM" — identifies the file type
301 /// 4 1 version Format version; currently 0x01
302 /// 5 3 reserved Must be zero; reserved for future flags
303 /// 8 4 base_len Number of i32 elements in the base array
304 /// 12 4 check_len Number of i32 elements in the check array
305 /// 16 — base[] base_len × 4 bytes, little-endian i32
306 /// 16 + base_len*4 — check[] check_len × 4 bytes, little-endian i32
307 /// ```
308 ///
309 /// # Performance
310 ///
311 /// `O(S)` where `S` is the total byte length of `data`. The function
312 /// performs one pass of `chunks_exact(4).map(i32::from_le_bytes)` over
313 /// each array, allocating exactly `base_len + check_len` i32 values.
314 /// Compare with [`from_word_list`], which is `O(W × K)` — proportional to
315 /// total word bytes — due to trie construction and base-offset search.
316 ///
317 /// | Method | Complexity | Allocation | Use when |
318 /// |-------------------|------------|--------------|--------------------------------|
319 /// | `from_bytes` | O(S) | 2 × `Vec<i32>` | loading pre-built binary blob |
320 /// | `from_word_list` | O(W × K) | trie + DARTS | building from a raw word list |
321 ///
322 /// # Errors / Panics
323 ///
324 /// This function panics — rather than returning a `Result` — because
325 /// failures indicate a corrupted or stale build artifact, not a runtime
326 /// condition the caller can meaningfully recover from. A clean
327 /// `cargo build` always regenerates a valid `dict.bin`.
328 ///
329 /// | Condition | Panic message |
330 /// |------------------------------------------|----------------------------------|
331 /// | `data.len() < 16` | `"dict.bin too short"` |
332 /// | First 4 bytes ≠ `b"KDAM"` | `"dict.bin: bad magic"` |
333 /// | Byte 4 ≠ `0x01` | `"dict.bin: unsupported version"`|
334 ///
335 /// # Example
336 ///
337 /// ```rust
338 /// use kham_core::dict::Dict;
339 ///
340 /// // Typically you embed a pre-built blob with include_bytes! and pass it here.
341 /// // This example constructs a minimal valid blob by hand for illustration.
342 /// let dict = kham_core::dict::builtin_dict();
343 /// assert!(dict.contains("กิน"));
344 /// assert!(dict.contains("ธนาคาร"));
345 /// ```
346 ///
347 /// For the common case of the built-in word list, prefer [`builtin_dict()`],
348 /// which calls this function with the compile-time-embedded blob.
349 ///
350 /// [`from_word_list`]: Dict::from_word_list
351 /// [`builtin_dict()`]: crate::dict::builtin_dict
352 pub fn from_bytes(data: &[u8]) -> Self {
353 const HDR: usize = 16; // 4 magic + 1 version + 3 reserved + 4 base_len + 4 check_len
354 assert!(data.len() >= HDR, "dict.bin too short");
355 assert_eq!(&data[0..4], b"KDAM", "dict.bin: bad magic");
356 assert_eq!(data[4], 0x01, "dict.bin: unsupported version");
357
358 let base_len = u32::from_le_bytes([data[8], data[9], data[10], data[11]]) as usize;
359 let check_len = u32::from_le_bytes([data[12], data[13], data[14], data[15]]) as usize;
360
361 let base_end = HDR + base_len * 4;
362 let check_end = base_end + check_len * 4;
363
364 let base: Vec<i32> = data[HDR..base_end]
365 .chunks_exact(4)
366 .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
367 .collect();
368 let check: Vec<i32> = data[base_end..check_end]
369 .chunks_exact(4)
370 .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
371 .collect();
372
373 Self {
374 base,
375 check,
376 overlay: Vec::new(),
377 }
378 }
379
380 fn from_trie(nodes: Vec<TrieNode>) -> Self {
381 let n = nodes.len();
382 // Initial capacity: trie nodes * average fanout, plus breathing room.
383 let cap = n * 2 + 512;
384 let mut base: Vec<i32> = vec![0; cap];
385 let mut check: Vec<i32> = vec![UNUSED; cap];
386
387 // check[0] = 0 marks the root as "in use" (its own parent = itself).
388 check[0] = 0;
389
390 // trie_to_darts[trie_node_id] = darts_state_index
391 let mut trie_to_darts: Vec<usize> = vec![0; n];
392 // Root trie node (0) → root darts state (0).
393
394 // Bitmap mirrors `check`: bit i is set iff check[i] == UNUSED.
395 // Kept in sync so find_base can jump over occupied runs in O(n/64).
396 let mut bitmap = FreeBitmap::new(cap);
397
398 // Smallest slot known to be free — advances monotonically.
399 let mut min_free: usize = 1;
400
401 let mut queue: VecDeque<usize> = VecDeque::new();
402 queue.push_back(0); // start BFS from root trie node
403
404 while let Some(trie_id) = queue.pop_front() {
405 let darts_id = trie_to_darts[trie_id];
406 let node = &nodes[trie_id];
407
408 if node.children.is_empty() {
409 continue; // leaf node — nothing to allocate
410 }
411
412 // Advance min_free to the next genuinely free slot.
413 while min_free < check.len() && check[min_free] != UNUSED {
414 min_free += 1;
415 }
416
417 // Collect sorted child bytes for base search.
418 // children is already sorted ascending by byte value.
419 let child_bytes: Vec<u8> = node.children.iter().map(|&(b, _)| b).collect();
420
421 // Find a collision-free base offset using the bitmap for fast jumps.
422 let b = find_base(&check, &child_bytes, &bitmap, min_free);
423 base[darts_id] = b as i32;
424
425 // Stamp each child into the arrays.
426 for &(byte, child_trie_id) in &node.children {
427 let child_darts = b + byte as usize + 1;
428
429 // Grow arrays and bitmap if needed.
430 if child_darts + 1 > check.len() {
431 let new_cap = child_darts + 512;
432 base.resize(new_cap, 0);
433 check.resize(new_cap, UNUSED);
434 bitmap.grow(new_cap);
435 }
436
437 check[child_darts] = darts_id as i32;
438 bitmap.occupy(child_darts);
439 trie_to_darts[child_trie_id] = child_darts;
440 queue.push_back(child_trie_id);
441 }
442 }
443
444 // Trim trailing unused slots to reduce memory footprint.
445 let last = check.iter().rposition(|&c| c != UNUSED).unwrap_or(0);
446 base.truncate(last + 1);
447 check.truncate(last + 1);
448
449 Self {
450 base,
451 check,
452 overlay: Vec::new(),
453 }
454 }
455
456 // ── Lookup primitives ────────────────────────────────────────────────────
457
458 /// Follow one transition from `state` on `byte`.
459 /// Returns `Some(next_state)` if valid, `None` otherwise.
460 #[inline]
461 fn goto(&self, state: usize, byte: u8) -> Option<usize> {
462 let next = (*self.base.get(state)?) + byte as i32 + 1;
463 if next <= 0 {
464 return None;
465 }
466 let next = next as usize;
467 if self.check.get(next).copied()? == state as i32 {
468 Some(next)
469 } else {
470 None
471 }
472 }
473
474 // ── Public API ───────────────────────────────────────────────────────────
475
476 /// Extend this dictionary with additional words without rebuilding the trie.
477 ///
478 /// Custom words are stored in a sorted overlay and checked alongside the
479 /// pre-compiled trie in [`contains`](Dict::contains) and
480 /// [`prefixes`](Dict::prefixes). This is O(k log k) in the number of
481 /// extra words — much cheaper than a full trie rebuild via
482 /// [`from_word_list`](Dict::from_word_list).
483 ///
484 /// Lines starting with `#` and blank lines are skipped.
485 ///
486 /// # Example
487 ///
488 /// ```rust
489 /// use kham_core::dict::builtin_dict;
490 ///
491 /// let dict = builtin_dict().with_overlay("ปัญญาประดิษฐ์\nโปรแกรมเมอร์\n");
492 /// assert!(dict.contains("กิน")); // built-in word
493 /// assert!(dict.contains("ปัญญาประดิษฐ์")); // overlay word
494 /// ```
495 pub fn with_overlay(mut self, words: &str) -> Self {
496 for line in words.lines() {
497 let w = line.trim();
498 if w.is_empty() || w.starts_with('#') {
499 continue;
500 }
501 if let Err(pos) = self.overlay.binary_search_by(|e| e.as_str().cmp(w)) {
502 self.overlay.insert(pos, String::from(w));
503 }
504 }
505 self
506 }
507
508 /// Returns `true` if `word` is present in the dictionary or the overlay.
509 ///
510 /// Lookup is O(n + log k) where n is `word.len()` in bytes and k is the
511 /// overlay size.
512 ///
513 /// # Example
514 ///
515 /// ```rust
516 /// use kham_core::dict::Dict;
517 ///
518 /// let dict = Dict::from_word_list("สวัสดี\nโลก\n");
519 /// assert!(dict.contains("สวัสดี"));
520 /// assert!(dict.contains("โลก"));
521 /// assert!(!dict.contains("สวัส")); // prefix only — not a word
522 /// assert!(!dict.contains(""));
523 /// ```
524 pub fn contains(&self, word: &str) -> bool {
525 if word.is_empty() {
526 return false;
527 }
528 // Check overlay first (sorted Vec — O(log k))
529 if self
530 .overlay
531 .binary_search_by(|e| e.as_str().cmp(word))
532 .is_ok()
533 {
534 return true;
535 }
536 let mut state = 0usize;
537 for b in word.bytes() {
538 match self.goto(state, b) {
539 Some(s) => state = s,
540 None => return false,
541 }
542 }
543 self.goto(state, TERM).is_some()
544 }
545
546 /// Return all substrings of `text` (anchored at byte 0) that are present
547 /// in the dictionary, ordered **longest first**.
548 ///
549 /// Only returns slices that are valid UTF-8 boundaries (which is always
550 /// guaranteed when the dictionary contains valid UTF-8 words).
551 ///
552 /// # Example
553 ///
554 /// ```rust
555 /// use kham_core::dict::Dict;
556 ///
557 /// let dict = Dict::from_word_list("กิน\nกิน\nกิน\nกินข้าว\n");
558 /// let p = dict.prefixes("กินข้าวกับปลา");
559 /// // Longest prefix is "กินข้าว", then "กิน"
560 /// assert_eq!(p[0], "กินข้าว");
561 /// assert_eq!(p[1], "กิน");
562 /// ```
563 pub fn prefixes<'t>(&self, text: &'t str) -> Vec<&'t str> {
564 let mut result = Vec::new();
565 let mut state = 0usize;
566 let bytes = text.as_bytes();
567
568 for i in 0..bytes.len() {
569 match self.goto(state, bytes[i]) {
570 Some(s) => {
571 state = s;
572 if self.goto(state, TERM).is_some() {
573 // i is the index of the last byte; the prefix is text[0..i+1].
574 // Since dictionary words are valid UTF-8, i+1 is always a
575 // char boundary.
576 debug_assert!(text.is_char_boundary(i + 1));
577 result.push(&text[..i + 1]);
578 }
579 }
580 None => break,
581 }
582 }
583
584 // Reverse so the longest match comes first.
585 result.reverse();
586
587 // Add any overlay words that are prefixes of text.
588 for w in &self.overlay {
589 if text.starts_with(w.as_str()) && !result.contains(&&text[..w.len()]) {
590 result.push(&text[..w.len()]);
591 }
592 }
593
594 // Re-sort by length descending so overlay insertions maintain the convention.
595 result.sort_unstable_by_key(|s| core::cmp::Reverse(s.len()));
596 result
597 }
598
599 /// Number of allocated DARTS states (informational / testing).
600 #[inline]
601 pub fn state_count(&self) -> usize {
602 self.check.len()
603 }
604}
605
606// ---------------------------------------------------------------------------
607// Built-in word list and pre-compiled binary dictionary
608// ---------------------------------------------------------------------------
609
610/// The built-in CC0 Thai word list, embedded at compile time.
611///
612/// Pass to [`Dict::from_word_list`] to build the default dictionary, or use
613/// [`builtin_dict`] to load the pre-compiled binary form in O(S) time.
614pub static BUILTIN_WORDS: &str = include_str!("../data/words_th.txt");
615
616/// Pre-compiled binary DARTS blob produced by `build.rs` at compile time.
617///
618/// Format: 16-byte header (`b"KDAM"` magic, version, lengths) followed by
619/// `base[]` then `check[]` as little-endian `i32` values.
620static BUILTIN_DICT_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/dict.bin"));
621
622/// Return the built-in dictionary loaded from the pre-compiled binary blob.
623///
624/// This is O(S) — a simple byte copy of the pre-built DARTS arrays — versus
625/// O(W×K) for [`Dict::from_word_list`] which reconstructs the trie at runtime.
626/// Use this in [`crate::segmenter::Tokenizer`] and wherever the default dictionary is needed.
627///
628/// # Example
629///
630/// ```rust
631/// use kham_core::dict::builtin_dict;
632///
633/// let dict = builtin_dict();
634/// assert!(dict.contains("กิน"));
635/// assert!(dict.contains("ธนาคาร"));
636/// ```
637pub fn builtin_dict() -> Dict {
638 Dict::from_bytes(BUILTIN_DICT_BYTES)
639}
640
641// ---------------------------------------------------------------------------
642// Tests
643// ---------------------------------------------------------------------------
644
645#[cfg(test)]
646mod tests {
647 use super::*;
648 use alloc::vec;
649
650 fn small_dict() -> Dict {
651 Dict::from_word_list("กิน\nข้าว\nปลา\nน้ำ\nกินข้าว\n")
652 }
653
654 // ── contains ─────────────────────────────────────────────────────────────
655
656 #[test]
657 fn contains_exact_words() {
658 let d = small_dict();
659 assert!(d.contains("กิน"));
660 assert!(d.contains("ข้าว"));
661 assert!(d.contains("ปลา"));
662 assert!(d.contains("น้ำ"));
663 assert!(d.contains("กินข้าว"));
664 }
665
666 #[test]
667 fn contains_rejects_prefix_only() {
668 let d = small_dict();
669 // "กิ" is a prefix of "กิน" but not itself a word
670 assert!(!d.contains("กิ"));
671 }
672
673 #[test]
674 fn contains_rejects_empty() {
675 let d = small_dict();
676 assert!(!d.contains(""));
677 }
678
679 #[test]
680 fn contains_rejects_unknown() {
681 let d = small_dict();
682 assert!(!d.contains("xyz"));
683 assert!(!d.contains("hello"));
684 }
685
686 #[test]
687 fn contains_ascii() {
688 let d = Dict::from_word_list("hello\nworld\n");
689 assert!(d.contains("hello"));
690 assert!(d.contains("world"));
691 assert!(!d.contains("hell"));
692 assert!(!d.contains("worlds"));
693 }
694
695 #[test]
696 fn contains_comments_and_blanks_skipped() {
697 let d = Dict::from_word_list("# comment\n\nกิน\n \nข้าว\n");
698 assert!(d.contains("กิน"));
699 assert!(d.contains("ข้าว"));
700 assert!(!d.contains("# comment"));
701 }
702
703 #[test]
704 fn contains_single_char() {
705 let d = Dict::from_word_list("ก\n");
706 assert!(d.contains("ก"));
707 assert!(!d.contains("กก"));
708 }
709
710 // ── prefixes ──────────────────────────────────────────────────────────────
711
712 #[test]
713 fn prefixes_longest_first() {
714 let d = small_dict();
715 let p = d.prefixes("กินข้าวกับปลา");
716 // "กินข้าว" and "กิน" are both prefixes of the input
717 assert!(p.contains(&"กินข้าว"));
718 assert!(p.contains(&"กิน"));
719 // Longest must come first
720 assert_eq!(p[0], "กินข้าว");
721 }
722
723 #[test]
724 fn prefixes_single_match() {
725 let d = small_dict();
726 let p = d.prefixes("ปลาทู");
727 assert_eq!(p, vec!["ปลา"]);
728 }
729
730 #[test]
731 fn prefixes_no_match() {
732 let d = small_dict();
733 assert!(d.prefixes("xyz").is_empty());
734 assert!(d.prefixes("").is_empty());
735 }
736
737 #[test]
738 fn prefixes_exact_match() {
739 let d = small_dict();
740 // "กิน" is exactly a word; text doesn't extend further
741 let p = d.prefixes("กิน");
742 assert_eq!(p, vec!["กิน"]);
743 }
744
745 // ── full-round-trip / structural ──────────────────────────────────────────
746
747 #[test]
748 fn builtin_words_parse() {
749 // Built-in word list must parse without panic and contain a few known words
750 let d = Dict::from_word_list(BUILTIN_WORDS);
751 assert!(d.contains("กิน"));
752 assert!(d.contains("ข้าว"));
753 }
754
755 #[test]
756 fn large_word_list() {
757 // Insert many words and verify all are retrievable
758 let words = [
759 "กิน",
760 "ข้าว",
761 "ปลา",
762 "น้ำ",
763 "คน",
764 "ไป",
765 "มา",
766 "ที่",
767 "นี่",
768 "นั้น",
769 "สวัสดี",
770 "ธนาคาร",
771 "แห่ง",
772 "ชาวโลก",
773 "ประเทศ",
774 "ภาษา",
775 "เมือง",
776 "บ้าน",
777 ];
778 let list: alloc::string::String = words.iter().map(|w| alloc::format!("{w}\n")).collect();
779 let d = Dict::from_word_list(&list);
780 for &w in &words {
781 assert!(d.contains(w), "missing word: {w}");
782 }
783 assert!(!d.contains("notaword"));
784 }
785
786 #[test]
787 fn builtin_contains_common_words() {
788 let d = Dict::from_word_list(BUILTIN_WORDS);
789 let words = ["สวัสดี", "ธนาคาร", "ชาวโลก", "ไป", "มา", "กิน", "ข้าว"];
790 for w in words {
791 assert!(d.contains(w), "builtin dict missing: {w}");
792 }
793 let p = d.prefixes("สวัสดีชาวโลก");
794 assert!(p.contains(&"สวัสดี"), "prefixes missing สวัสดี, got: {p:?}");
795 let p2 = d.prefixes("ธนาคารแห่งนั้น");
796 assert!(
797 p2.contains(&"ธนาคาร"),
798 "prefixes missing ธนาคาร, got: {p2:?}"
799 );
800 }
801
802 #[test]
803 fn check_array_validity() {
804 // Every non-UNUSED entry in check must point to a valid state index.
805 let d = small_dict();
806 let len = d.check.len() as i32;
807 for &c in &d.check {
808 if c != UNUSED {
809 assert!(c >= 0 && c < len, "check entry {c} out of range");
810 }
811 }
812 }
813}