use std::collections::VecDeque;
use std::fs;
use std::path::Path;
const UNUSED: i32 = -1;
const TERM: u8 = 0;
struct TrieNode {
children: Vec<(u8, usize)>,
}
impl TrieNode {
fn new() -> Self {
Self {
children: Vec::new(),
}
}
fn get(&self, b: u8) -> Option<usize> {
self.children
.binary_search_by_key(&b, |&(k, _)| k)
.ok()
.map(|i| self.children[i].1)
}
fn insert(&mut self, b: u8, child_id: usize) {
match self.children.binary_search_by_key(&b, |&(k, _)| k) {
Ok(_) => {}
Err(pos) => self.children.insert(pos, (b, child_id)),
}
}
}
fn build_trie(text: &str) -> Vec<TrieNode> {
let mut nodes: Vec<TrieNode> = vec![TrieNode::new()];
for line in text.lines() {
let word = line.trim();
if word.is_empty() || word.starts_with('#') {
continue;
}
let mut state = 0usize;
for b in word.bytes().chain(std::iter::once(TERM)) {
if let Some(next) = nodes[state].get(b) {
state = next;
} else {
let next_id = nodes.len();
nodes.push(TrieNode::new());
nodes[state].insert(b, next_id);
state = next_id;
}
}
}
nodes
}
struct FreeBitmap {
words: Vec<u64>,
cap: usize,
}
impl FreeBitmap {
fn new(cap: usize) -> Self {
let n_words = cap.div_ceil(64);
let mut words = vec![!0u64; n_words];
let used = cap % 64;
if used > 0 && n_words > 0 {
words[n_words - 1] = (1u64 << used) - 1;
}
if n_words > 0 {
words[0] &= !1u64;
}
FreeBitmap { words, cap }
}
fn occupy(&mut self, slot: usize) {
self.words[slot / 64] &= !(1u64 << (slot % 64));
}
fn grow(&mut self, new_cap: usize) {
let old_cap = self.cap;
let old_words = self.words.len();
let new_words = new_cap.div_ceil(64);
let old_used = old_cap % 64;
if old_used > 0 && old_words > 0 {
self.words[old_words - 1] |= !((1u64 << old_used) - 1);
}
self.words.resize(new_words, !0u64);
let new_used = new_cap % 64;
if new_used > 0 && new_words > 0 {
self.words[new_words - 1] = (1u64 << new_used) - 1;
}
self.cap = new_cap;
}
fn next_free_from(&self, start: usize) -> usize {
if start >= self.cap {
return self.cap;
}
let cap_word = self.cap.div_ceil(64);
let word_idx = start / 64;
let bit_idx = start % 64;
let partial = self.words[word_idx] >> bit_idx;
if partial != 0 {
return (word_idx * 64 + bit_idx + partial.trailing_zeros() as usize).min(self.cap);
}
for i in (word_idx + 1)..cap_word {
if self.words[i] != 0 {
return (i * 64 + self.words[i].trailing_zeros() as usize).min(self.cap);
}
}
self.cap
}
}
fn find_base(check: &[i32], children: &[u8], bitmap: &FreeBitmap, min_free: usize) -> usize {
let min_child = children[0] as usize;
let mut b = min_free.saturating_sub(min_child + 1).max(1);
'search: loop {
for &c in children {
let pos = b + c as usize + 1;
if pos < check.len() && check[pos] != UNUSED {
let nf = bitmap.next_free_from(pos + 1);
b = nf.saturating_sub(c as usize + 1).max(b + 1);
continue 'search;
}
}
return b;
}
}
fn build_darts(nodes: Vec<TrieNode>) -> (Vec<i32>, Vec<i32>) {
let n = nodes.len();
let cap = n * 2 + 512;
let mut base: Vec<i32> = vec![0; cap];
let mut check: Vec<i32> = vec![UNUSED; cap];
check[0] = 0;
let mut trie_to_darts: Vec<usize> = vec![0; n];
let mut bitmap = FreeBitmap::new(cap);
let mut min_free: usize = 1;
let mut queue: VecDeque<usize> = VecDeque::new();
queue.push_back(0);
while let Some(trie_id) = queue.pop_front() {
let darts_id = trie_to_darts[trie_id];
let node = &nodes[trie_id];
if node.children.is_empty() {
continue;
}
while min_free < check.len() && check[min_free] != UNUSED {
min_free += 1;
}
let child_bytes: Vec<u8> = node.children.iter().map(|&(b, _)| b).collect();
let b = find_base(&check, &child_bytes, &bitmap, min_free);
base[darts_id] = b as i32;
for &(byte, child_trie_id) in &node.children {
let child_darts = b + byte as usize + 1;
if child_darts + 1 > check.len() {
let new_cap = child_darts + 512;
base.resize(new_cap, 0);
check.resize(new_cap, UNUSED);
bitmap.grow(new_cap);
}
check[child_darts] = darts_id as i32;
bitmap.occupy(child_darts);
trie_to_darts[child_trie_id] = child_darts;
queue.push_back(child_trie_id);
}
}
let last = check.iter().rposition(|&c| c != UNUSED).unwrap_or(0);
base.truncate(last + 1);
check.truncate(last + 1);
(base, check)
}
fn serialize_darts(base: &[i32], check: &[i32]) -> Vec<u8> {
let base_len = base.len();
let check_len = check.len();
let total = 16 + base_len * 4 + check_len * 4;
let mut out = Vec::with_capacity(total);
out.extend_from_slice(b"KDAM"); out.push(0x01); out.extend_from_slice(&[0u8; 3]); out.extend_from_slice(&(base_len as u32).to_le_bytes());
out.extend_from_slice(&(check_len as u32).to_le_bytes());
for &v in base {
out.extend_from_slice(&v.to_le_bytes());
}
for &v in check {
out.extend_from_slice(&v.to_le_bytes());
}
out
}
fn compress_data_file(manifest: &str, out_dir: &str, src: &str, dst: &str) {
let src_path = Path::new(manifest).join(src);
let dst_path = Path::new(out_dir).join(dst);
println!("cargo:rerun-if-changed={}", src_path.display());
let data = fs::read(&src_path)
.unwrap_or_else(|e| panic!("failed to read {}: {e}", src_path.display()));
let compressed = miniz_oxide::deflate::compress_to_vec_zlib(&data, 6);
println!(
"cargo:warning={}: {} -> {} bytes ({:.0}%)",
dst,
data.len(),
compressed.len(),
compressed.len() as f64 / data.len() as f64 * 100.0
);
fs::write(&dst_path, &compressed)
.unwrap_or_else(|e| panic!("failed to write {}: {e}", dst_path.display()));
}
fn main() {
let manifest = std::env::var("CARGO_MANIFEST_DIR").unwrap();
let words_path = Path::new(&manifest).join("data/words_th.txt");
let out_dir = std::env::var("OUT_DIR").unwrap();
let out_path = Path::new(&out_dir).join("dict.bin");
println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed={}", words_path.display());
let text = fs::read_to_string(&words_path)
.unwrap_or_else(|e| panic!("failed to read {}: {e}", words_path.display()));
let trie = build_trie(&text);
let (base, check) = build_darts(trie);
let blob = serialize_darts(&base, &check);
fs::write(&out_path, &blob)
.unwrap_or_else(|e| panic!("failed to write {}: {e}", out_path.display()));
println!(
"cargo:warning=dict.bin: {} states, {} bytes",
check.len(),
blob.len()
);
compress_data_file(&manifest, &out_dir, "data/tnc_freq.txt", "tnc_freq.bin");
compress_data_file(&manifest, &out_dir, "data/ne_th.tsv", "ne_th.bin");
compress_data_file(&manifest, &out_dir, "data/pos_th.tsv", "pos_th.bin");
}