use alloc::collections::VecDeque;
use alloc::vec;
use alloc::vec::Vec;
const UNUSED: i32 = -1;
const TERM: u8 = 0;
struct TrieNode {
children: Vec<(u8, usize)>,
}
impl TrieNode {
fn new() -> Self {
Self {
children: Vec::new(),
}
}
#[inline]
fn find(&self, b: u8) -> Result<usize, usize> {
self.children.binary_search_by_key(&b, |&(k, _)| k)
}
#[inline]
fn get(&self, b: u8) -> Option<usize> {
self.find(b).ok().map(|i| self.children[i].1)
}
#[inline]
fn insert(&mut self, b: u8, child_id: usize) {
match self.find(b) {
Ok(_) => debug_assert!(false, "duplicate byte in trie node"),
Err(pos) => self.children.insert(pos, (b, child_id)),
}
}
}
fn build_trie(text: &str) -> Vec<TrieNode> {
let mut nodes: Vec<TrieNode> = vec![TrieNode::new()];
for line in text.lines() {
let word = line.trim();
if word.is_empty() || word.starts_with('#') {
continue;
}
let mut state = 0usize;
for b in word.bytes().chain(core::iter::once(TERM)) {
if let Some(next) = nodes[state].get(b) {
state = next;
} else {
let next_id = nodes.len();
nodes.push(TrieNode::new());
nodes[state].insert(b, next_id);
state = next_id;
}
}
}
nodes
}
struct FreeBitmap {
words: Vec<u64>,
cap: usize,
}
impl FreeBitmap {
fn new(cap: usize) -> Self {
let n_words = cap.div_ceil(64);
let mut words = vec![!0u64; n_words];
let used = cap % 64;
if used > 0 && n_words > 0 {
words[n_words - 1] = (1u64 << used) - 1;
}
if n_words > 0 {
words[0] &= !1u64;
}
FreeBitmap { words, cap }
}
#[inline]
fn occupy(&mut self, slot: usize) {
debug_assert!(slot < self.cap);
self.words[slot / 64] &= !(1u64 << (slot % 64));
}
fn grow(&mut self, new_cap: usize) {
debug_assert!(new_cap > self.cap);
let old_cap = self.cap;
let old_words = self.words.len();
let new_words = new_cap.div_ceil(64);
let old_used = old_cap % 64;
if old_used > 0 && old_words > 0 {
self.words[old_words - 1] |= !((1u64 << old_used) - 1);
}
self.words.resize(new_words, !0u64);
let new_used = new_cap % 64;
if new_used > 0 && new_words > 0 {
self.words[new_words - 1] = (1u64 << new_used) - 1;
}
self.cap = new_cap;
}
fn next_free_from(&self, start: usize) -> usize {
if start >= self.cap {
return self.cap;
}
let cap_word = self.cap.div_ceil(64);
let word_idx = start / 64;
let bit_idx = start % 64;
let partial = self.words[word_idx] >> bit_idx;
if partial != 0 {
return (word_idx * 64 + bit_idx + partial.trailing_zeros() as usize).min(self.cap);
}
for i in (word_idx + 1)..cap_word {
if self.words[i] != 0 {
return (i * 64 + self.words[i].trailing_zeros() as usize).min(self.cap);
}
}
self.cap }
}
fn find_base(check: &[i32], children: &[u8], bitmap: &FreeBitmap, min_free: usize) -> usize {
debug_assert!(!children.is_empty());
let min_child = children[0] as usize;
let mut b = min_free.saturating_sub(min_child + 1).max(1);
'search: loop {
for &c in children {
let pos = b + c as usize + 1;
if pos < check.len() && check[pos] != UNUSED {
let nf = bitmap.next_free_from(pos + 1);
b = nf.saturating_sub(c as usize + 1).max(b + 1);
continue 'search;
}
}
return b;
}
}
pub struct Dict {
base: Vec<i32>,
check: Vec<i32>,
}
impl Dict {
pub fn from_word_list(text: &str) -> Self {
let trie = build_trie(text);
Self::from_trie(trie)
}
pub fn from_bytes(data: &[u8]) -> Self {
const HDR: usize = 16; assert!(data.len() >= HDR, "dict.bin too short");
assert_eq!(&data[0..4], b"KDAM", "dict.bin: bad magic");
assert_eq!(data[4], 0x01, "dict.bin: unsupported version");
let base_len = u32::from_le_bytes([data[8], data[9], data[10], data[11]]) as usize;
let check_len = u32::from_le_bytes([data[12], data[13], data[14], data[15]]) as usize;
let base_end = HDR + base_len * 4;
let check_end = base_end + check_len * 4;
let base: Vec<i32> = data[HDR..base_end]
.chunks_exact(4)
.map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
.collect();
let check: Vec<i32> = data[base_end..check_end]
.chunks_exact(4)
.map(|c| i32::from_le_bytes([c[0], c[1], c[2], c[3]]))
.collect();
Self { base, check }
}
fn from_trie(nodes: Vec<TrieNode>) -> Self {
let n = nodes.len();
let cap = n * 2 + 512;
let mut base: Vec<i32> = vec![0; cap];
let mut check: Vec<i32> = vec![UNUSED; cap];
check[0] = 0;
let mut trie_to_darts: Vec<usize> = vec![0; n];
let mut bitmap = FreeBitmap::new(cap);
let mut min_free: usize = 1;
let mut queue: VecDeque<usize> = VecDeque::new();
queue.push_back(0);
while let Some(trie_id) = queue.pop_front() {
let darts_id = trie_to_darts[trie_id];
let node = &nodes[trie_id];
if node.children.is_empty() {
continue; }
while min_free < check.len() && check[min_free] != UNUSED {
min_free += 1;
}
let child_bytes: Vec<u8> = node.children.iter().map(|&(b, _)| b).collect();
let b = find_base(&check, &child_bytes, &bitmap, min_free);
base[darts_id] = b as i32;
for &(byte, child_trie_id) in &node.children {
let child_darts = b + byte as usize + 1;
if child_darts + 1 > check.len() {
let new_cap = child_darts + 512;
base.resize(new_cap, 0);
check.resize(new_cap, UNUSED);
bitmap.grow(new_cap);
}
check[child_darts] = darts_id as i32;
bitmap.occupy(child_darts);
trie_to_darts[child_trie_id] = child_darts;
queue.push_back(child_trie_id);
}
}
let last = check.iter().rposition(|&c| c != UNUSED).unwrap_or(0);
base.truncate(last + 1);
check.truncate(last + 1);
Self { base, check }
}
#[inline]
fn goto(&self, state: usize, byte: u8) -> Option<usize> {
let next = (*self.base.get(state)?) + byte as i32 + 1;
if next <= 0 {
return None;
}
let next = next as usize;
if self.check.get(next).copied()? == state as i32 {
Some(next)
} else {
None
}
}
pub fn contains(&self, word: &str) -> bool {
if word.is_empty() {
return false;
}
let mut state = 0usize;
for b in word.bytes() {
match self.goto(state, b) {
Some(s) => state = s,
None => return false,
}
}
self.goto(state, TERM).is_some()
}
pub fn prefixes<'t>(&self, text: &'t str) -> Vec<&'t str> {
let mut result = Vec::new();
let mut state = 0usize;
let bytes = text.as_bytes();
for i in 0..bytes.len() {
match self.goto(state, bytes[i]) {
Some(s) => {
state = s;
if self.goto(state, TERM).is_some() {
debug_assert!(text.is_char_boundary(i + 1));
result.push(&text[..i + 1]);
}
}
None => break,
}
}
result.reverse();
result
}
#[inline]
pub fn state_count(&self) -> usize {
self.check.len()
}
}
pub static BUILTIN_WORDS: &str = include_str!("../data/words_th.txt");
static BUILTIN_DICT_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/dict.bin"));
pub fn builtin_dict() -> Dict {
Dict::from_bytes(BUILTIN_DICT_BYTES)
}
#[cfg(test)]
mod tests {
use super::*;
use alloc::vec;
fn small_dict() -> Dict {
Dict::from_word_list("กิน\nข้าว\nปลา\nน้ำ\nกินข้าว\n")
}
#[test]
fn contains_exact_words() {
let d = small_dict();
assert!(d.contains("กิน"));
assert!(d.contains("ข้าว"));
assert!(d.contains("ปลา"));
assert!(d.contains("น้ำ"));
assert!(d.contains("กินข้าว"));
}
#[test]
fn contains_rejects_prefix_only() {
let d = small_dict();
assert!(!d.contains("กิ"));
}
#[test]
fn contains_rejects_empty() {
let d = small_dict();
assert!(!d.contains(""));
}
#[test]
fn contains_rejects_unknown() {
let d = small_dict();
assert!(!d.contains("xyz"));
assert!(!d.contains("hello"));
}
#[test]
fn contains_ascii() {
let d = Dict::from_word_list("hello\nworld\n");
assert!(d.contains("hello"));
assert!(d.contains("world"));
assert!(!d.contains("hell"));
assert!(!d.contains("worlds"));
}
#[test]
fn contains_comments_and_blanks_skipped() {
let d = Dict::from_word_list("# comment\n\nกิน\n \nข้าว\n");
assert!(d.contains("กิน"));
assert!(d.contains("ข้าว"));
assert!(!d.contains("# comment"));
}
#[test]
fn contains_single_char() {
let d = Dict::from_word_list("ก\n");
assert!(d.contains("ก"));
assert!(!d.contains("กก"));
}
#[test]
fn prefixes_longest_first() {
let d = small_dict();
let p = d.prefixes("กินข้าวกับปลา");
assert!(p.contains(&"กินข้าว"));
assert!(p.contains(&"กิน"));
assert_eq!(p[0], "กินข้าว");
}
#[test]
fn prefixes_single_match() {
let d = small_dict();
let p = d.prefixes("ปลาทู");
assert_eq!(p, vec!["ปลา"]);
}
#[test]
fn prefixes_no_match() {
let d = small_dict();
assert!(d.prefixes("xyz").is_empty());
assert!(d.prefixes("").is_empty());
}
#[test]
fn prefixes_exact_match() {
let d = small_dict();
let p = d.prefixes("กิน");
assert_eq!(p, vec!["กิน"]);
}
#[test]
fn builtin_words_parse() {
let d = Dict::from_word_list(BUILTIN_WORDS);
assert!(d.contains("กิน"));
assert!(d.contains("ข้าว"));
}
#[test]
fn large_word_list() {
let words = [
"กิน",
"ข้าว",
"ปลา",
"น้ำ",
"คน",
"ไป",
"มา",
"ที่",
"นี่",
"นั้น",
"สวัสดี",
"ธนาคาร",
"แห่ง",
"ชาวโลก",
"ประเทศ",
"ภาษา",
"เมือง",
"บ้าน",
];
let list: alloc::string::String = words.iter().map(|w| alloc::format!("{w}\n")).collect();
let d = Dict::from_word_list(&list);
for &w in &words {
assert!(d.contains(w), "missing word: {w}");
}
assert!(!d.contains("notaword"));
}
#[test]
fn builtin_contains_common_words() {
let d = Dict::from_word_list(BUILTIN_WORDS);
let words = ["สวัสดี", "ธนาคาร", "ชาวโลก", "ไป", "มา", "กิน", "ข้าว"];
for w in words {
assert!(d.contains(w), "builtin dict missing: {w}");
}
let p = d.prefixes("สวัสดีชาวโลก");
assert!(p.contains(&"สวัสดี"), "prefixes missing สวัสดี, got: {p:?}");
let p2 = d.prefixes("ธนาคารแห่งนั้น");
assert!(
p2.contains(&"ธนาคาร"),
"prefixes missing ธนาคาร, got: {p2:?}"
);
}
#[test]
fn check_array_validity() {
let d = small_dict();
let len = d.check.len() as i32;
for &c in &d.check {
if c != UNUSED {
assert!(c >= 0 && c < len, "check entry {c} out of range");
}
}
}
}