use std::{
    collections::HashMap,
    env, fs, io,
    path::{Path, PathBuf},
};
/// Build-script entry point: regenerates the compressed dictionary blobs
/// in OUT_DIR whenever one of the source files (or this script) changes.
fn main() {
    // Tell Cargo exactly which inputs should trigger a rebuild.
    for tracked in ["dict/dict.txt.big", "dict/cedict_ts.u8", "build.rs"] {
        println!("cargo:rerun-if-changed={tracked}");
    }
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    compress_jieba_dict(&out_dir);
    build_pinyin_dat(&out_dir);
}
fn compress_jieba_dict(out_dir: &PathBuf) {
let src = PathBuf::from("dict/dict.txt.big");
require_file(&src,
" curl -L https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big \\\n -o dict/dict.txt.big");
let raw = fs::read(&src).expect("failed to read dict/dict.txt.big");
let dst = out_dir.join("dict.dat");
let mut out = io::BufWriter::new(fs::File::create(&dst).unwrap());
zstd::stream::copy_encode(raw.as_slice(), &mut out, 19).unwrap();
println!("cargo:warning=dict.dat: {} KB → {} KB (zstd level 19)",
raw.len() / 1024, fs::metadata(&dst).unwrap().len() / 1024);
}
fn build_pinyin_dat(out_dir: &PathBuf) {
let src = PathBuf::from("dict/cedict_ts.u8");
require_file(&src,
" wget -O - 'https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz' \\\n | gunzip > dict/cedict_ts.u8");
let text = fs::read_to_string(&src).expect("failed to read dict/cedict_ts.u8");
let entries = parse_cedict(&text);
let blob = encode_table(&entries);
let dst = out_dir.join("pinyin.dat");
let mut out = io::BufWriter::new(fs::File::create(&dst).unwrap());
zstd::stream::copy_encode(blob.as_slice(), &mut out, 19).unwrap();
println!("cargo:warning=pinyin.dat: {} KB → {} KB (zstd level 19)",
blob.len() / 1024, fs::metadata(&dst).unwrap().len() / 1024);
}
/// One dictionary headword with its reading.
struct Entry {
    hanzi: String,          // traditional or simplified form
    pinyin_numbers: String, // numbered-tone pinyin, e.g. "Zhong1 guo2"
}

/// Parses CC-CEDICT text (`trad simp [pin1 yin1] /gloss/…` per line) into a
/// deduplicated entry list sorted by hanzi.
///
/// Both the traditional and (when different) simplified forms get an entry;
/// the first reading seen for a headword wins. Comment lines (`#`), blank
/// lines, and malformed lines are skipped.
fn parse_cedict(text: &str) -> Vec<Entry> {
    let mut map: HashMap<String, String> = HashMap::new();
    for line in text.lines() {
        let line = line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        let Some((trad, rest)) = line.split_once(' ') else { continue };
        let Some((simp, rest)) = rest.split_once(' ') else { continue };
        let Some(bs) = rest.find('[') else { continue };
        // Search for ']' only AFTER the '[': a bare `rest.find(']')` would
        // panic (slice start > end) on a malformed line where ']' precedes
        // '[' — skip such lines instead.
        let Some(be) = rest[bs + 1..].find(']') else { continue };
        let raw = rest[bs + 1..bs + 1 + be].trim();
        // CC-CEDICT writes ü as "u:"; also collapse whitespace between
        // syllables to single spaces.
        let numbers = raw.replace("u:", "ü");
        let numbers = numbers.split_whitespace().collect::<Vec<_>>().join(" ");
        map.entry(trad.to_string()).or_insert_with(|| numbers.clone());
        if simp != trad {
            map.entry(simp.to_string()).or_insert(numbers);
        }
    }
    let mut entries: Vec<Entry> = map
        .into_iter()
        .map(|(h, n)| Entry { hanzi: h, pinyin_numbers: n })
        .collect();
    // HashMap iteration order is random — sort for deterministic output.
    entries.sort_by(|a, b| a.hanzi.cmp(&b.hanzi));
    entries
}
/// Serializes `entries` into the on-disk pinyin table layout:
///
/// `[u32 n] [n × (u64 fnv1a(hanzi), u32 heap_offset), sorted by hash]
///  [heap: per-entry u8 length + pinyin bytes]`
///
/// All integers are little-endian. Panics (failing the build) when a
/// reading exceeds 255 bytes or when two keys collide under FNV-1a.
fn encode_table(entries: &[Entry]) -> Vec<u8> {
    let n = entries.len() as u32;
    let mut heap: Vec<u8> = Vec::new();
    let mut index: Vec<(u64, u32)> = Vec::with_capacity(entries.len());
    for e in entries {
        let offset = heap.len() as u32;
        let nb = e.pinyin_numbers.as_bytes();
        // Heap records are length-prefixed with a single byte.
        assert!(nb.len() <= 255, "pinyin_numbers too long for: {}", e.hanzi);
        heap.push(nb.len() as u8);
        heap.extend_from_slice(nb);
        index.push((fnv1a_64(e.hanzi.as_bytes()), offset));
    }
    // Stability is irrelevant for unique keys; unstable sort avoids the
    // allocation a stable sort makes.
    index.sort_unstable_by_key(|&(h, _)| h);
    // A hash collision would make the runtime hash lookup return the wrong
    // pinyin for one of the colliding keys — fail the build loudly instead
    // of shipping a silently corrupt table.
    assert!(
        index.windows(2).all(|w| w[0].0 != w[1].0),
        "FNV-1a collision between two hanzi keys"
    );
    let mut out = Vec::with_capacity(4 + index.len() * 12 + heap.len());
    out.extend_from_slice(&n.to_le_bytes());
    for (hash, offset) in &index {
        out.extend_from_slice(&hash.to_le_bytes());
        out.extend_from_slice(&offset.to_le_bytes());
    }
    out.extend_from_slice(&heap);
    out
}
/// 64-bit FNV-1a hash of `bytes` (standard offset basis and prime).
fn fnv1a_64(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;
    bytes
        .iter()
        .fold(OFFSET_BASIS, |acc, &b| (acc ^ u64::from(b)).wrapping_mul(PRIME))
}
/// Aborts the build with a download hint when `path` does not exist.
///
/// Each line of `hint` is echoed as a separate `cargo:warning` so the shell
/// command stays copy-pasteable in Cargo's output. Takes `&Path` rather
/// than `&PathBuf` so any path-like borrow coerces in.
fn require_file(path: &Path, hint: &str) {
    if !path.exists() {
        println!("cargo:warning={} not found — download it with:", path.display());
        for line in hint.lines() { println!("cargo:warning={line}"); }
        panic!("{} missing", path.display());
    }
}