kham-core 0.4.0

Pure Rust Thai word segmentation engine — no_std compatible
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
//! Dictionary backed by a Double-Array Trie (DARTS).
//!
//! ## Structure
//!
//! A DARTS encodes a trie in two parallel arrays:
//!
//! ```text
//! base[s]  — offset used to compute the next state from state s
//! check[t] — parent state of state t; -1 means "unused"
//!
//! Transition:  next = base[s] + byte + 1
//! Valid iff:   check[next] == s
//! ```
//!
//! Adding 1 to every byte value ensures that even byte 0 (used as the
//! end-of-word sentinel) maps to index ≥ 1, so index 0 is never a child
//! and `check[0]` can serve as the root sentinel.
//!
//! ## Construction
//!
//! 1. Parse the word list into a regular trie (adjacency via sorted
//!    `Vec<(u8, usize)>` child lists — see `TrieNode`).
//! 2. BFS over trie nodes. For each node, find the smallest `base` offset
//!    such that `base + byte + 1` is free for every child byte.
//! 3. Stamp `check[base + byte + 1] = current_darts_state` and record the
//!    mapping from trie-node-id → darts-state-id for children.
//!
//! ## Lookup
//!
//! - `contains(word)`: follow bytes then follow the NUL sentinel.
//! - `prefixes(text)`: walk bytes, emit a match whenever NUL sentinel is
//!   reachable at the current state.

use alloc::collections::VecDeque;
use alloc::vec;
use alloc::vec::Vec;

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/// Marks an unused slot in the `check` array.
///
/// Real `check` entries are parent state indices, which are always ≥ 0,
/// so -1 can never collide with a valid parent.
const UNUSED: i32 = -1;

/// Sentinel byte appended to every inserted word.
/// NUL (0x00) never appears in valid UTF-8 text, so it's a safe sentinel.
const TERM: u8 = 0;

// ---------------------------------------------------------------------------
// Intermediate trie (used only during construction)
// ---------------------------------------------------------------------------

/// A node of the intermediate (pre-DARTS) trie.
///
/// Exists only during construction; discarded once the double-array
/// form has been built.
struct TrieNode {
    /// Sorted (byte, child-node-index) pairs.
    ///
    /// A sorted `Vec` is faster than `BTreeMap` for the small child-counts
    /// typical of trie nodes: no heap allocation per node, better cache
    /// locality, and linear scan outperforms tree traversal for ≤ ~16 items.
    children: Vec<(u8, usize)>,
}

impl TrieNode {
    fn new() -> Self {
        Self {
            children: Vec::new(),
        }
    }

    /// Find the index into `children` whose byte equals `b`, or `Err(pos)`
    /// where `pos` is the insertion point to keep `children` sorted.
    #[inline]
    fn find(&self, b: u8) -> Result<usize, usize> {
        self.children.binary_search_by_key(&b, |&(k, _)| k)
    }

    /// Return the child node index for byte `b`, or `None`.
    #[inline]
    fn get(&self, b: u8) -> Option<usize> {
        self.find(b).ok().map(|i| self.children[i].1)
    }

    /// Insert `(b, child_id)` maintaining sorted order.
    /// Panics in debug mode if `b` is already present.
    #[inline]
    fn insert(&mut self, b: u8, child_id: usize) {
        match self.find(b) {
            Ok(_) => debug_assert!(false, "duplicate byte in trie node"),
            Err(pos) => self.children.insert(pos, (b, child_id)),
        }
    }
}

/// Build a plain trie from a word list and return the nodes.
///
/// Blank lines and `#`-prefixed comment lines are skipped. Each accepted
/// word is walked byte-by-byte, creating nodes on demand; `nodes[0]` is
/// always the root.
fn build_trie(text: &str) -> Vec<TrieNode> {
    let mut nodes = vec![TrieNode::new()]; // nodes[0] = root

    for raw in text.lines() {
        let word = raw.trim();
        if word.is_empty() || word.starts_with('#') {
            continue;
        }
        // Chain TERM onto the bytes so the terminal is just another node.
        let mut cur = 0usize;
        for byte in word.bytes().chain(core::iter::once(TERM)) {
            cur = match nodes[cur].get(byte) {
                Some(existing) => existing,
                None => {
                    let fresh = nodes.len();
                    nodes.push(TrieNode::new());
                    nodes[cur].insert(byte, fresh);
                    fresh
                }
            };
        }
    }

    nodes
}

// ---------------------------------------------------------------------------
// Bitmap free-list
// ---------------------------------------------------------------------------

/// Bitmap tracking which slots in `check` are free (bit = 1 → free).
///
/// Provides `next_free_from(pos)` in O(n/64) by scanning 64 bits at a time
/// with `u64::trailing_zeros`. This is the critical speedup for large word
/// lists whose trie nodes have wide child-byte spreads (e.g., ASCII 0x20
/// mixed with Thai 0xE0), where the naive one-slot-at-a-time scan is O(n²).
struct FreeBitmap {
    /// Packed free/occupied bits, 64 slots per word.
    words: Vec<u64>,
    /// Number of real slots tracked. Bits at indices `cap..` are always 0.
    cap: usize,
}

impl FreeBitmap {
    /// Create a bitmap for slots `0..cap`. Slot 0 is occupied (root sentinel);
    /// slots `1..cap` are free. Bits above `cap` in the last word are 0.
    fn new(cap: usize) -> Self {
        let n_words = cap.div_ceil(64);
        let mut words = vec![!0u64; n_words];
        if n_words > 0 {
            // Clear phantom bits in the last word (slots cap.. don't exist).
            let used = cap % 64;
            if used > 0 {
                words[n_words - 1] = (1u64 << used) - 1;
            }
            // Slot 0 occupied (root sentinel).
            words[0] &= !1u64;
        }
        FreeBitmap { words, cap }
    }

    /// Mark `slot` as occupied (clear its bit).
    #[inline]
    fn occupy(&mut self, slot: usize) {
        debug_assert!(slot < self.cap);
        self.words[slot / 64] &= !(1u64 << (slot % 64));
    }

    /// Extend the bitmap to cover `new_cap` slots.
    ///
    /// - Old phantom bits (slots `old_cap .. old_n_words*64`) become real free slots.
    /// - New phantom bits (slots `new_cap .. new_n_words*64`) are cleared.
    fn grow(&mut self, new_cap: usize) {
        debug_assert!(new_cap > self.cap);
        let old_cap = self.cap;
        let old_words = self.words.len();
        let new_words = new_cap.div_ceil(64);

        // Un-clear old phantom bits (they're now real free slots).
        let old_used = old_cap % 64;
        if old_used > 0 && old_words > 0 {
            self.words[old_words - 1] |= !((1u64 << old_used) - 1);
        }

        // Extend with all-free words.
        self.words.resize(new_words, !0u64);

        // Clear phantom bits in the new last word. Note `&=`, not `=`:
        // when the growth stays within the same 64-bit word, a plain
        // assignment would resurrect occupied slots (including the root
        // sentinel at slot 0) as free. For a freshly appended all-free
        // word the two are equivalent.
        let new_used = new_cap % 64;
        if new_used > 0 && new_words > 0 {
            self.words[new_words - 1] &= (1u64 << new_used) - 1;
        }

        self.cap = new_cap;
    }

    /// Return the smallest free slot index `≥ start`, or `self.cap` if none.
    /// Never returns an index ≥ `self.cap` (phantom bits are never set).
    fn next_free_from(&self, start: usize) -> usize {
        if start >= self.cap {
            return self.cap;
        }
        let cap_word = self.cap.div_ceil(64);
        let word_idx = start / 64;
        let bit_idx = start % 64;

        // Partial first word.
        let partial = self.words[word_idx] >> bit_idx;
        if partial != 0 {
            return (word_idx * 64 + bit_idx + partial.trailing_zeros() as usize).min(self.cap);
        }

        // Full remaining words.
        for i in (word_idx + 1)..cap_word {
            if self.words[i] != 0 {
                return (i * 64 + self.words[i].trailing_zeros() as usize).min(self.cap);
            }
        }

        self.cap // all occupied
    }
}

// ---------------------------------------------------------------------------
// Base-allocation helper
// ---------------------------------------------------------------------------

/// Find the smallest offset `b ≥ 1` such that `check[b + byte + 1]` is free
/// (either `UNUSED` or out-of-bounds) for every byte in `children`.
///
/// The bitmap lets the search leap over occupied runs in O(n/64) per leap
/// instead of probing one slot at a time. For nodes with wide child-byte
/// spreads (ASCII 0x20 mixed with Thai 0xE0), this turns an O(n²)
/// construction into near-linear time.
fn find_base(check: &[i32], children: &[u8], bitmap: &FreeBitmap, min_free: usize) -> usize {
    debug_assert!(!children.is_empty());
    // `children` is sorted ascending, so the first entry is the smallest.
    let lowest = children[0] as usize;

    // Start the base so the lowest child lands at or after min_free.
    let mut base = min_free.saturating_sub(lowest + 1).max(1);

    loop {
        // First child byte (if any) whose slot is already taken.
        let collision = children.iter().find_map(|&c| {
            let slot = base + c as usize + 1;
            (slot < check.len() && check[slot] != UNUSED).then_some((c, slot))
        });

        match collision {
            // Every child slot is free (or past the arrays) — done.
            None => return base,
            Some((c, slot)) => {
                // Shift the base so this child lands on the next free slot,
                // advancing by at least one to guarantee progress.
                let free = bitmap.next_free_from(slot + 1);
                base = free.saturating_sub(c as usize + 1).max(base + 1);
            }
        }
    }
}

// ---------------------------------------------------------------------------
// Public Dict type
// ---------------------------------------------------------------------------

/// Double-Array Trie dictionary.
///
/// Provides O(k) lookup where k is the number of **bytes** in the query.
/// Memory layout is two `Vec<i32>` (base + check), cache-friendly.
pub struct Dict {
    // base[s] — offset used to derive a child index from state s.
    base: Vec<i32>,
    // check[t] — parent state of t; UNUSED (-1) marks a free slot.
    check: Vec<i32>,
}

impl Dict {
    // ── Construction ─────────────────────────────────────────────────────────

    /// Build a [`Dict`] from a newline-separated word list.
    ///
    /// Lines starting with `#` are treated as comments and skipped.
    /// Blank lines are skipped. Words are stored as raw UTF-8 bytes.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::dict::Dict;
    ///
    /// let dict = Dict::from_word_list("กิน\nข้าว\nปลา\n");
    /// assert!(dict.contains("กิน"));
    /// assert!(dict.contains("ข้าว"));
    /// assert!(!dict.contains("xyz"));
    /// ```
    pub fn from_word_list(text: &str) -> Self {
        let trie = build_trie(text);
        Self::from_trie(trie)
    }

    /// Deserialise a [`Dict`] from a pre-compiled DARTS binary blob.
    ///
    /// The binary blob is normally produced by `build.rs` at compile time and
    /// embedded via [`include_bytes!`]. Loading from bytes bypasses the full
    /// trie-construction pipeline, making it the fastest way to obtain a
    /// ready-to-use dictionary.
    ///
    /// # Binary Format
    ///
    /// The blob begins with a fixed 16-byte header followed by the raw array
    /// data. All multi-byte integers are **little-endian**.
    ///
    /// ```text
    /// Offset  Size  Field        Description
    /// ──────  ────  ───────────  ───────────────────────────────────────────
    ///      0     4  magic        ASCII b"KDAM" — identifies the file type
    ///      4     1  version      Format version; currently 0x01
    ///      5     3  reserved     Must be zero; reserved for future flags
    ///      8     4  base_len     Number of i32 elements in the base array
    ///     12     4  check_len    Number of i32 elements in the check array
    ///     16     —  base[]       base_len × 4 bytes, little-endian i32
    ///     16 + base_len*4  —  check[]  check_len × 4 bytes, little-endian i32
    /// ```
    ///
    /// # Performance
    ///
    /// `O(S)` where `S` is the total byte length of `data`. The function
    /// performs one pass of `chunks_exact(4).map(i32::from_le_bytes)` over
    /// each array, allocating exactly `base_len + check_len` i32 values.
    /// Compare with [`from_word_list`], which is `O(W × K)` — proportional to
    /// total word bytes — due to trie construction and base-offset search.
    ///
    /// | Method            | Complexity | Allocation   | Use when                       |
    /// |-------------------|------------|--------------|--------------------------------|
    /// | `from_bytes`      | O(S)       | 2 × `Vec<i32>` | loading pre-built binary blob  |
    /// | `from_word_list`  | O(W × K)   | trie + DARTS   | building from a raw word list  |
    ///
    /// # Errors / Panics
    ///
    /// This function panics — rather than returning a `Result` — because
    /// failures indicate a corrupted or stale build artifact, not a runtime
    /// condition the caller can meaningfully recover from. A clean
    /// `cargo build` always regenerates a valid `dict.bin`.
    ///
    /// | Condition                                | Panic message                    |
    /// |------------------------------------------|----------------------------------|
    /// | `data.len() < 16`                        | `"dict.bin too short"`           |
    /// | First 4 bytes ≠ `b"KDAM"`               | `"dict.bin: bad magic"`          |
    /// | Byte 4 ≠ `0x01`                          | `"dict.bin: unsupported version"`|
    /// | Payload shorter than header promises     | `"dict.bin: truncated payload"`  |
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::dict::Dict;
    ///
    /// // Typically you embed a pre-built blob with include_bytes! and pass it here.
    /// // This example constructs a minimal valid blob by hand for illustration.
    /// let dict = kham_core::dict::builtin_dict();
    /// assert!(dict.contains("กิน"));
    /// assert!(dict.contains("ธนาคาร"));
    /// ```
    ///
    /// For the common case of the built-in word list, prefer [`builtin_dict()`],
    /// which calls this function with the compile-time-embedded blob.
    ///
    /// [`from_word_list`]: Dict::from_word_list
    /// [`builtin_dict()`]: crate::dict::builtin_dict
    pub fn from_bytes(data: &[u8]) -> Self {
        const HDR: usize = 16; // 4 magic + 1 version + 3 reserved + 4 base_len + 4 check_len
        assert!(data.len() >= HDR, "dict.bin too short");
        assert_eq!(&data[0..4], b"KDAM", "dict.bin: bad magic");
        assert_eq!(data[4], 0x01, "dict.bin: unsupported version");

        let base_len = u32::from_le_bytes([data[8], data[9], data[10], data[11]]) as usize;
        let check_len = u32::from_le_bytes([data[12], data[13], data[14], data[15]]) as usize;

        let base_end = HDR + base_len * 4;
        let check_end = base_end + check_len * 4;
        // Fail with a diagnosable message rather than a generic slice-index
        // panic when the header promises more payload than is present.
        assert!(data.len() >= check_end, "dict.bin: truncated payload");

        let base: Vec<i32> = data[HDR..base_end]
            .chunks_exact(4)
            .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
            .collect();
        let check: Vec<i32> = data[base_end..check_end]
            .chunks_exact(4)
            .map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
            .collect();

        Self { base, check }
    }

    /// Convert the intermediate trie into double-array form via BFS.
    fn from_trie(nodes: Vec<TrieNode>) -> Self {
        let n = nodes.len();
        // Initial capacity: trie nodes * average fanout, plus breathing room.
        let cap = n * 2 + 512;
        let mut base: Vec<i32> = vec![0; cap];
        let mut check: Vec<i32> = vec![UNUSED; cap];

        // check[0] = 0 marks the root as "in use" (its own parent = itself).
        check[0] = 0;

        // trie_to_darts[trie_node_id] = darts_state_index
        let mut trie_to_darts: Vec<usize> = vec![0; n];
        // Root trie node (0) → root darts state (0).

        // Bitmap mirrors `check`: bit i is set iff check[i] == UNUSED.
        // Kept in sync so find_base can jump over occupied runs in O(n/64).
        let mut bitmap = FreeBitmap::new(cap);

        // Smallest slot known to be free — advances monotonically.
        let mut min_free: usize = 1;

        let mut queue: VecDeque<usize> = VecDeque::new();
        queue.push_back(0); // start BFS from root trie node

        while let Some(trie_id) = queue.pop_front() {
            let darts_id = trie_to_darts[trie_id];
            let node = &nodes[trie_id];

            if node.children.is_empty() {
                continue; // leaf node — nothing to allocate
            }

            // Advance min_free to the next genuinely free slot.
            while min_free < check.len() && check[min_free] != UNUSED {
                min_free += 1;
            }

            // Collect sorted child bytes for base search.
            // children is already sorted ascending by byte value.
            let child_bytes: Vec<u8> = node.children.iter().map(|&(b, _)| b).collect();

            // Find a collision-free base offset using the bitmap for fast jumps.
            let b = find_base(&check, &child_bytes, &bitmap, min_free);
            base[darts_id] = b as i32;

            // Stamp each child into the arrays.
            for &(byte, child_trie_id) in &node.children {
                let child_darts = b + byte as usize + 1;

                // Grow arrays and bitmap if needed.
                if child_darts + 1 > check.len() {
                    let new_cap = child_darts + 512;
                    base.resize(new_cap, 0);
                    check.resize(new_cap, UNUSED);
                    bitmap.grow(new_cap);
                }

                check[child_darts] = darts_id as i32;
                bitmap.occupy(child_darts);
                trie_to_darts[child_trie_id] = child_darts;
                queue.push_back(child_trie_id);
            }
        }

        // Trim trailing unused slots to reduce memory footprint.
        let last = check.iter().rposition(|&c| c != UNUSED).unwrap_or(0);
        base.truncate(last + 1);
        check.truncate(last + 1);

        Self { base, check }
    }

    // ── Lookup primitives ────────────────────────────────────────────────────

    /// Follow one transition from `state` on `byte`.
    /// Returns `Some(next_state)` if valid, `None` otherwise.
    ///
    /// Out-of-range states (possible after array trimming) and negative
    /// indices are rejected via checked `get` rather than panicking.
    #[inline]
    fn goto(&self, state: usize, byte: u8) -> Option<usize> {
        let next = (*self.base.get(state)?) + byte as i32 + 1;
        if next <= 0 {
            return None;
        }
        let next = next as usize;
        if self.check.get(next).copied()? == state as i32 {
            Some(next)
        } else {
            None
        }
    }

    // ── Public API ───────────────────────────────────────────────────────────

    /// Returns `true` if `word` is present in the dictionary.
    ///
    /// Lookup is O(n) where n is `word.len()` in bytes.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::dict::Dict;
    ///
    /// let dict = Dict::from_word_list("สวัสดี\nโลก\n");
    /// assert!(dict.contains("สวัสดี"));
    /// assert!(dict.contains("โลก"));
    /// assert!(!dict.contains("สวัส")); // prefix only — not a word
    /// assert!(!dict.contains(""));
    /// ```
    pub fn contains(&self, word: &str) -> bool {
        if word.is_empty() {
            return false;
        }
        let mut state = 0usize;
        for b in word.bytes() {
            match self.goto(state, b) {
                Some(s) => state = s,
                None => return false,
            }
        }
        // Word bytes matched a path; it is a word iff the TERM sentinel
        // transition also exists (distinguishes words from mere prefixes).
        self.goto(state, TERM).is_some()
    }

    /// Return all substrings of `text` (anchored at byte 0) that are present
    /// in the dictionary, ordered **longest first**.
    ///
    /// Only returns slices that are valid UTF-8 boundaries (which is always
    /// guaranteed when the dictionary contains valid UTF-8 words).
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::dict::Dict;
    ///
    /// let dict = Dict::from_word_list("กิน\nกินข้าว\n");
    /// let p = dict.prefixes("กินข้าวกับปลา");
    /// // Longest prefix is "กินข้าว", then "กิน"
    /// assert_eq!(p[0], "กินข้าว");
    /// assert_eq!(p[1], "กิน");
    /// ```
    pub fn prefixes<'t>(&self, text: &'t str) -> Vec<&'t str> {
        let mut result = Vec::new();
        let mut state = 0usize;
        let bytes = text.as_bytes();

        for i in 0..bytes.len() {
            match self.goto(state, bytes[i]) {
                Some(s) => {
                    state = s;
                    if self.goto(state, TERM).is_some() {
                        // i is the index of the last byte; the prefix is text[0..i+1].
                        // Since dictionary words are valid UTF-8, i+1 is always a
                        // char boundary.
                        debug_assert!(text.is_char_boundary(i + 1));
                        result.push(&text[..i + 1]);
                    }
                }
                None => break,
            }
        }

        // Reverse so the longest match comes first.
        result.reverse();
        result
    }

    /// Number of allocated DARTS states (informational / testing).
    #[inline]
    pub fn state_count(&self) -> usize {
        self.check.len()
    }
}

// ---------------------------------------------------------------------------
// Built-in word list and pre-compiled binary dictionary
// ---------------------------------------------------------------------------

/// The built-in CC0 Thai word list, embedded at compile time.
///
/// One word per line; `#` lines are comments (see [`Dict::from_word_list`]).
///
/// Pass to [`Dict::from_word_list`] to build the default dictionary, or use
/// [`builtin_dict`] to load the pre-compiled binary form in O(S) time.
pub static BUILTIN_WORDS: &str = include_str!("../data/words_th.txt");

/// Pre-compiled binary DARTS blob produced by `build.rs` at compile time.
///
/// Format: 16-byte header (`b"KDAM"` magic, version, lengths) followed by
/// `base[]` then `check[]` as little-endian `i32` values.
/// See [`Dict::from_bytes`] for the full layout.
static BUILTIN_DICT_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/dict.bin"));

/// Return the built-in dictionary loaded from the pre-compiled binary blob.
///
/// This is O(S) — a simple byte copy of the pre-built DARTS arrays — versus
/// O(W×K) for [`Dict::from_word_list`] which reconstructs the trie at runtime.
/// Use this in [`crate::segmenter::Tokenizer`] and wherever the default dictionary is needed.
///
/// # Panics
///
/// Panics if the embedded blob fails [`Dict::from_bytes`] validation — this
/// indicates a stale or corrupt build artifact; a clean `cargo build`
/// regenerates a valid blob.
///
/// # Example
///
/// ```rust
/// use kham_core::dict::builtin_dict;
///
/// let dict = builtin_dict();
/// assert!(dict.contains("กิน"));
/// assert!(dict.contains("ธนาคาร"));
/// ```
pub fn builtin_dict() -> Dict {
    Dict::from_bytes(BUILTIN_DICT_BYTES)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    fn small_dict() -> Dict {
        Dict::from_word_list("กิน\nข้าว\nปลา\nน้ำ\nกินข้าว\n")
    }

    // ── contains ─────────────────────────────────────────────────────────────

    #[test]
    fn contains_exact_words() {
        let d = small_dict();
        assert!(d.contains("กิน"));
        assert!(d.contains("ข้าว"));
        assert!(d.contains("ปลา"));
        assert!(d.contains("น้ำ"));
        assert!(d.contains("กินข้าว"));
    }

    #[test]
    fn contains_rejects_prefix_only() {
        let d = small_dict();
        // "กิ" is a prefix of "กิน" but not itself a word
        assert!(!d.contains("กิ"));
    }

    #[test]
    fn contains_rejects_empty() {
        let d = small_dict();
        assert!(!d.contains(""));
    }

    #[test]
    fn contains_rejects_unknown() {
        let d = small_dict();
        assert!(!d.contains("xyz"));
        assert!(!d.contains("hello"));
    }

    #[test]
    fn contains_ascii() {
        let d = Dict::from_word_list("hello\nworld\n");
        assert!(d.contains("hello"));
        assert!(d.contains("world"));
        assert!(!d.contains("hell"));
        assert!(!d.contains("worlds"));
    }

    #[test]
    fn contains_comments_and_blanks_skipped() {
        let d = Dict::from_word_list("# comment\n\nกิน\n   \nข้าว\n");
        assert!(d.contains("กิน"));
        assert!(d.contains("ข้าว"));
        assert!(!d.contains("# comment"));
    }

    #[test]
    fn contains_single_char() {
        // A one-character word is retrievable; its doubling is not.
        // (Previous version built an empty dict from "\n" and asserted
        // contains(""), contradicting contains_rejects_empty.)
        let d = Dict::from_word_list("ก\n");
        assert!(d.contains("ก"));
        assert!(!d.contains("กก"));
    }

    // ── prefixes ──────────────────────────────────────────────────────────────

    #[test]
    fn prefixes_longest_first() {
        let d = small_dict();
        let p = d.prefixes("กินข้าวกับปลา");
        // "กินข้าว" and "กิน" are both prefixes of the input
        assert!(p.contains(&"กินข้าว"));
        assert!(p.contains(&"กิน"));
        // Longest must come first
        assert_eq!(p[0], "กินข้าว");
    }

    #[test]
    fn prefixes_single_match() {
        let d = small_dict();
        let p = d.prefixes("ปลาทู");
        assert_eq!(p, vec!["ปลา"]);
    }

    #[test]
    fn prefixes_no_match() {
        let d = small_dict();
        assert!(d.prefixes("xyz").is_empty());
        assert!(d.prefixes("").is_empty());
    }

    #[test]
    fn prefixes_exact_match() {
        let d = small_dict();
        // "กิน" is exactly a word; text doesn't extend further
        let p = d.prefixes("กิน");
        assert_eq!(p, vec!["กิน"]);
    }

    // ── full-round-trip / structural ──────────────────────────────────────────

    #[test]
    fn builtin_words_parse() {
        // Built-in word list must parse without panic and contain a few known words
        let d = Dict::from_word_list(BUILTIN_WORDS);
        assert!(d.contains("กิน"));
        assert!(d.contains("ข้าว"));
    }

    #[test]
    fn large_word_list() {
        // Insert many words and verify all are retrievable
        let words = [
            "กิน",
            "ข้าว",
            "ปลา",
            "น้ำ",
            "คน",
            "ไป",
            "มา",
            "ที่",
            "นี่",
            "นั้น",
            "สวัสดี",
            "ธนาคาร",
            "แห่ง",
            "ชาวโลก",
            "ประเทศ",
            "ภาษา",
            "เมือง",
            "บ้าน",
        ];
        let list: alloc::string::String = words.iter().map(|w| alloc::format!("{w}\n")).collect();
        let d = Dict::from_word_list(&list);
        for &w in &words {
            assert!(d.contains(w), "missing word: {w}");
        }
        assert!(!d.contains("notaword"));
    }

    #[test]
    fn builtin_contains_common_words() {
        let d = Dict::from_word_list(BUILTIN_WORDS);
        let words = ["สวัสดี", "ธนาคาร", "ชาวโลก", "ไป", "มา", "กิน", "ข้าว"];
        for w in words {
            assert!(d.contains(w), "builtin dict missing: {w}");
        }
        let p = d.prefixes("สวัสดีชาวโลก");
        assert!(p.contains(&"สวัสดี"), "prefixes missing สวัสดี, got: {p:?}");
        let p2 = d.prefixes("ธนาคารแห่งนั้น");
        assert!(
            p2.contains(&"ธนาคาร"),
            "prefixes missing ธนาคาร, got: {p2:?}"
        );
    }

    #[test]
    fn check_array_validity() {
        // Every non-UNUSED entry in check must point to a valid state index.
        let d = small_dict();
        let len = d.check.len() as i32;
        for &c in &d.check {
            if c != UNUSED {
                assert!(c >= 0 && c < len, "check entry {c} out of range");
            }
        }
    }
}
}