// rust-mando 0.1.2
//
// Convert Chinese characters to pinyin with jieba word segmentation
// Documentation
//! Build script:
//!
//! 1. Compresses `dict/dict.txt.big` → `$OUT_DIR/dict.dat`  (jieba segmenter)
//! 2. Parses `dict/cedict_ts.u8`     → `$OUT_DIR/pinyin.dat` (pinyin lookup)
//!
//! Both source files must be present. See dict/README.md for downloads.
//!
//! Compression: C-linked `zstd` crate (level 19, host-only).
//! Decompression at runtime: `ruzstd` (pure Rust, WASM-safe).
//!
//! # pinyin.dat binary layout (little-endian, before zstd)
//!
//! ```text
//! u32 N                                   ← entry count
//! [u64 key_hash | u32 heap_offset] × N    ← index sorted by hash
//! [u8 len, len×UTF-8 bytes] × N           ← heap: pinyin_numbers only
//! ```
//!
//! key_hash = fnv1a_64(hanzi.as_bytes())
//! Both traditional and simplified forms are indexed; they share the same
//! heap entry when they differ, and are deduplicated when identical.
//! pinyin_numbers stores raw CC-CEDICT syllables e.g. "bei3 jing1"
//! (with u: replaced by ü). Marks are derived at runtime.

use std::{collections::HashMap, env, fs, io, path::{Path, PathBuf}};

/// Build entry point: re-run when any input changes, then produce both
/// compressed artifacts in `$OUT_DIR`.
fn main() {
    // Cargo re-runs this script whenever one of the tracked inputs changes.
    for tracked in ["dict/dict.txt.big", "dict/cedict_ts.u8", "build.rs"] {
        println!("cargo:rerun-if-changed={tracked}");
    }

    // OUT_DIR is always set by Cargo for build scripts.
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    compress_jieba_dict(&out_dir);
    build_pinyin_dat(&out_dir);
}

// ── 1. Jieba dict ─────────────────────────────────────────────────────────────

fn compress_jieba_dict(out_dir: &PathBuf) {
    let src = PathBuf::from("dict/dict.txt.big");
    require_file(&src,
        "  curl -L https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big \\\n       -o dict/dict.txt.big");

    let raw      = fs::read(&src).expect("failed to read dict/dict.txt.big");
    let dst      = out_dir.join("dict.dat");
    let mut out  = io::BufWriter::new(fs::File::create(&dst).unwrap());
    zstd::stream::copy_encode(raw.as_slice(), &mut out, 19).unwrap();

    println!("cargo:warning=dict.dat:   {} KB → {} KB (zstd level 19)",
        raw.len() / 1024, fs::metadata(&dst).unwrap().len() / 1024);
}

// ── 2. Pinyin table ───────────────────────────────────────────────────────────

fn build_pinyin_dat(out_dir: &PathBuf) {
    let src = PathBuf::from("dict/cedict_ts.u8");
    require_file(&src,
        "  wget -O - 'https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz' \\\n       | gunzip > dict/cedict_ts.u8");

    let text    = fs::read_to_string(&src).expect("failed to read dict/cedict_ts.u8");
    let entries = parse_cedict(&text);
    let blob    = encode_table(&entries);

    let dst     = out_dir.join("pinyin.dat");
    let mut out = io::BufWriter::new(fs::File::create(&dst).unwrap());
    zstd::stream::copy_encode(blob.as_slice(), &mut out, 19).unwrap();

    println!("cargo:warning=pinyin.dat: {} KB → {} KB (zstd level 19)",
        blob.len() / 1024, fs::metadata(&dst).unwrap().len() / 1024);
}

// ── CC-CEDICT parser ──────────────────────────────────────────────────────────

/// One entry: a hanzi string (traditional or simplified) → tone-number syllables.
/// e.g. "北京" → "bei3 jing1"
struct Entry {
    hanzi:          String,
    pinyin_numbers: String,
}

/// Parses CC-CEDICT text into deduplicated entries sorted by hanzi.
///
/// Line format: `Traditional Simplified [pin1 yin1] /gloss/.../`
/// Only the first CC-CEDICT reading per hanzi form is kept (first = most
/// common). Both traditional and simplified forms are indexed; when they are
/// identical only one entry is inserted. Comment lines (`#`), blank lines,
/// and malformed lines are skipped rather than panicking.
fn parse_cedict(text: &str) -> Vec<Entry> {
    let mut map: HashMap<String, String> = HashMap::new();

    for line in text.lines() {
        let line = line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        // Format: Traditional Simplified [pin1 yin1] /defs.../
        let Some((trad, rest)) = line.split_once(' ') else { continue };
        let Some((simp, rest)) = rest.split_once(' ') else { continue };
        // Extract the bracketed pinyin with two split_once calls. Unlike
        // independent find('[')/find(']'), this cannot panic when a stray
        // ']' precedes the '[' on a malformed line — it just skips it.
        let Some((_, after_open)) = rest.split_once('[') else { continue };
        let Some((raw, _))        = after_open.split_once(']') else { continue };

        // Normalise u: → ü (CC-CEDICT uses u: for ü)
        let numbers = raw.trim().replace("u:", "ü");
        // Normalise whitespace
        let numbers = numbers.split_whitespace().collect::<Vec<_>>().join(" ");

        // First reading wins; entry() makes the dedup a single lookup.
        map.entry(trad.to_string()).or_insert_with(|| numbers.clone());
        // Index simplified form only when it differs from traditional
        if simp != trad {
            map.entry(simp.to_string()).or_insert(numbers);
        }
    }

    let mut entries: Vec<Entry> = map
        .into_iter()
        .map(|(h, n)| Entry { hanzi: h, pinyin_numbers: n })
        .collect();
    entries.sort_by(|a, b| a.hanzi.cmp(&b.hanzi));
    entries
}

// ── Binary encoder ────────────────────────────────────────────────────────────

/// Serialises entries into the pinyin.dat layout documented at the file top:
/// `u32 N`, then N hash-sorted `(u64 key_hash, u32 heap_offset)` index rows,
/// then a heap of length-prefixed UTF-8 pinyin strings.
fn encode_table(entries: &[Entry]) -> Vec<u8> {
    let mut heap:  Vec<u8>         = Vec::new();
    let mut index: Vec<(u64, u32)> = Vec::with_capacity(entries.len());

    // Single pass: append each pinyin string to the heap and record where it
    // starts, keyed by the hash of the hanzi form.
    for entry in entries {
        let syllables = entry.pinyin_numbers.as_bytes();
        // Heap entries carry a one-byte length prefix, so cap at 255.
        assert!(syllables.len() <= 255, "pinyin_numbers too long for: {}", entry.hanzi);
        index.push((fnv1a_64(entry.hanzi.as_bytes()), heap.len() as u32));
        heap.push(syllables.len() as u8);
        heap.extend_from_slice(syllables);
    }

    // Index must be sorted by hash so the runtime can binary-search it.
    index.sort_by(|a, b| a.0.cmp(&b.0));

    // Assemble: count, index rows (12 bytes each, little-endian), heap.
    let mut out = Vec::with_capacity(4 + 12 * index.len() + heap.len());
    out.extend_from_slice(&(entries.len() as u32).to_le_bytes());
    for &(hash, offset) in &index {
        out.extend_from_slice(&hash.to_le_bytes());
        out.extend_from_slice(&offset.to_le_bytes());
    }
    out.extend_from_slice(&heap);
    out
}

// ── FNV-1a-64 (must match src/pinyin_dict.rs exactly) ────────────────────────

/// 64-bit FNV-1a hash (must match src/pinyin_dict.rs exactly).
fn fnv1a_64(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0100_0000_01b3;
    bytes
        .iter()
        .fold(OFFSET_BASIS, |acc, &b| (acc ^ u64::from(b)).wrapping_mul(PRIME))
}

// ── helpers ───────────────────────────────────────────────────────────────────

/// Verifies that `path` exists; otherwise prints the download `hint`
/// (one cargo:warning per line) and panics to abort the build.
///
/// Takes `&Path` rather than `&PathBuf` (clippy `ptr_arg`); existing callers
/// passing `&PathBuf` still work via deref coercion.
fn require_file(path: &Path, hint: &str) {
    if !path.exists() {
        println!("cargo:warning={} not found — download it with:", path.display());
        for line in hint.lines() { println!("cargo:warning={line}"); }
        panic!("{} missing", path.display());
    }
}