// wubi 0.3.0
//
// Self-developed Wubi 86 encoder, dictionary, and dataset (PHF + FST, WASM-ready).
// Tool documentation follows in the module doc comment below.
//! Auto-decompose tool — given rime's reference dictionary, find a 字根
//! decomposition for each character that, when fed through our encoder,
//! produces the expected code.
//!
//!     cargo run --release --bin golia-auto-decomp [--max=N] > new_seed.txt
//!
//! Outputs seed.txt-format lines (`char\tzigen-seq\tstroke-seq\tshape`) for
//! every character we can correctly decompose using only the 字根 currently
//! in our table.
//!
//! Algorithm:
//! - For each (code, char) pair in rime, length 3 or 4:
//!   - try **rule 2** (2 字根 + identifier) for length-3 codes
//!   - try **rule 3** (3 字根 + identifier) for length-4 codes
//!   - try **rule 4** (4 字根, no identifier) for length-4 codes
//! - Pick the FIRST decomposition (in 字根-table iteration order) that works.
//! - If multiple zigen pairs/triples/quads produce the same code, only one
//!   is emitted; the others are valid too. Any of them produces correct codes.
//!
//! What this tool does NOT do (yet):
//! - 1-字根 cases (键名/单笔画/成字字根): hand-curate or detect via char-in-zigen-table
//! - 2-letter codes (一/二级简码): handled by separate jianma data files
//! - Rule 4 with 4+ 字根 (only checks exactly 4 字根 in candidate code positions)

use std::collections::{HashMap, HashSet};

use wubi::{Decomp, Shape, Stroke, embedded_seed, encode, iter_jianma1, iter_zigen};

fn main() {
    let args: Vec<String> = std::env::args().skip(1).collect();

    // Cap on emitted entries. Accept both `--max N` and `--max=N`;
    // default is "no limit". (Previously only `--max=N` parsed, silently
    // ignoring the space-separated form.)
    let max: usize = args
        .iter()
        .enumerate()
        .find_map(|(i, a)| {
            if let Some(s) = a.strip_prefix("--max=") {
                s.parse().ok()
            } else if a == "--max" {
                args.get(i + 1).and_then(|s| s.parse().ok())
            } else {
                None
            }
        })
        .unwrap_or(usize::MAX);
    let basic_cjk_only = !args.iter().any(|a| a == "--all-blocks");

    // Rime's reference dictionary lives two directories above this crate.
    let manifest = env!("CARGO_MANIFEST_DIR");
    let ref_path = format!("{manifest}/../../data/wubi86_full.txt");
    let reference = std::fs::read_to_string(&ref_path).expect("read rime");

    // Inverse index: letter → list of 字根 assigned to that key.
    let mut by_letter: HashMap<u8, Vec<char>> = HashMap::new();
    for (z, l) in iter_zigen() {
        by_letter.entry(l).or_default().push(z);
    }
    for v in by_letter.values_mut() {
        // Stable iteration order so output is deterministic across runs.
        v.sort();
    }

    // Skip chars whose decomp is already in hand-curated seed (those win on
    // conflict). 一级简码 chars are NOT skipped — they need both their
    // 1-letter 简码 entry (auto-emitted from jianma1.txt at build time) AND
    // their full algorithmic code (e.g., 国 → "l" AND "lgyi").
    let mut already: HashSet<char> = HashSet::new();
    for (ch, _) in embedded_seed() {
        already.insert(ch);
    }
    let _ = iter_jianma1; // intentionally unused; keep import documented

    // Group rime entries by char; pick the LONGEST code per char as the
    // canonical full-code (that's what our algorithm produces).
    let mut longest_code: HashMap<char, String> = HashMap::new();
    for raw in reference.lines() {
        let line = raw.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        // rime dict format: `<code>\t<word>` per line.
        let mut parts = line.splitn(2, '\t');
        let (Some(code), Some(word)) = (parts.next(), parts.next()) else {
            continue;
        };
        let code = code.trim();
        let word = word.trim();
        if word.chars().count() != 1 {
            continue; // skip phrases for now
        }
        let ch = word.chars().next().unwrap();
        // Only pure-ASCII-lowercase codes are valid Wubi codes.
        if !code.bytes().all(|b| b.is_ascii_lowercase()) {
            continue;
        }
        let entry = longest_code.entry(ch).or_default();
        if code.len() > entry.len() {
            *entry = code.to_string();
        }
    }

    println!(
        "# Auto-generated by golia-auto-decomp.\n\
         # Format: <char>\\t<zigen-seq>\\t<strokes>\\t<shape>\n\
         # Each entry's encoded code matches rime's longest code for the char.\n\
         # The 字根 sequence picked is the FIRST that the brute-force search\n\
         # found; for chars with multiple valid decomps it may not match the\n\
         # canonical visual decomposition. Manual review encouraged."
    );

    let mut emitted = 0usize;
    // Sort by (code, char) so output order is deterministic.
    let mut sorted: Vec<(char, String)> = longest_code.into_iter().collect();
    sorted.sort_by(|a, b| a.1.cmp(&b.1).then(a.0.cmp(&b.0)));

    for (ch, code) in sorted {
        if emitted >= max {
            break;
        }
        if already.contains(&ch) {
            continue;
        }
        // Filter to CJK Unified Ideographs (U+4E00..=U+9FFF) by default —
        // skips Extension A/B/C/D/etc. and compatibility blocks unless
        // `--all-blocks` is passed.
        let cp = ch as u32;
        if basic_cjk_only && !(0x4E00..=0x9FFF).contains(&cp) {
            continue;
        }
        if let Some(decomp) = try_decompose(&code, &by_letter) {
            // Sanity: round-trip the candidate decomposition through the
            // real encoder; emit only exact matches.
            if let Ok(encoded) = encode(&decomp) {
                if encoded.as_str() == code {
                    let zg: String = decomp
                        .zigen
                        .iter()
                        .map(|c| c.to_string())
                        .collect::<Vec<_>>()
                        .join(" ");
                    let st: String = decomp
                        .strokes
                        .iter()
                        .map(|s| (*s as u8).to_string())
                        .collect::<Vec<_>>()
                        .join(" ");
                    let sh = decomp.shape as u8;
                    println!("{ch}\t{zg}\t{st}\t{sh}");
                    emitted += 1;
                }
            }
        }
    }

    eprintln!("[auto-decomp] emitted {emitted} entries");
}

/// Map a 识别码 (identifier-code) key to its (last stroke, shape) pair.
///
/// The 15 identifier keys form a 5×3 grid: the row encodes the character's
/// final stroke, the column its overall structure. Any other key yields
/// `None`.
fn identifier(letter: u8) -> Option<(Stroke, Shape)> {
    // Row: which final-stroke class this key belongs to.
    let stroke = match letter {
        b'g' | b'f' | b'd' => Stroke::Heng,
        b'h' | b'j' | b'k' => Stroke::Shu,
        b't' | b'r' | b'e' => Stroke::Pie,
        b'y' | b'u' | b'i' => Stroke::Na,
        b'n' | b'b' | b'v' => Stroke::Zhe,
        _ => return None,
    };
    // Column: within each row the three keys map, in order, to
    // left-right, top-bottom, and whole/miscellaneous structure.
    let shape = match letter {
        b'g' | b'h' | b't' | b'y' | b'n' => Shape::LeftRight,
        b'f' | b'j' | b'r' | b'u' | b'b' => Shape::TopBottom,
        _ => Shape::Whole,
    };
    Some((stroke, shape))
}

/// Dispatch a candidate code to the encoding rule matching its length:
/// 3 letters → rule 2; 4 letters → rule 3, falling back to rule 4.
/// Codes of any other length are not handled and yield `None`.
fn try_decompose(code: &str, by_letter: &HashMap<u8, Vec<char>>) -> Option<Decomp> {
    let bytes = code.as_bytes();
    if bytes.len() == 3 {
        try_rule_2(bytes, by_letter)
    } else if bytes.len() == 4 {
        try_rule_3(bytes, by_letter).or_else(|| try_rule_4(bytes, by_letter))
    } else {
        None
    }
}

/// Rule 2: 2 字根 + identifier (3-letter code).
///
/// Picks the first 字根 in each letter's list. Any combination produces the
/// same code by construction; quality of decomp is a separate concern.
/// Caller guarantees `bytes.len() == 3`.
fn try_rule_2(bytes: &[u8], by_letter: &HashMap<u8, Vec<char>>) -> Option<Decomp> {
    // Last letter is the 识别码; the first two each select a 字根.
    let (stroke, shape) = identifier(bytes[2])?;
    let pick = |idx: usize| by_letter.get(&bytes[idx]).and_then(|v| v.first().copied());
    Some(Decomp {
        zigen: vec![pick(0)?, pick(1)?],
        strokes: vec![stroke],
        shape,
    })
}

/// Rule 3: 3 字根 + identifier (4-letter code).
///
/// Like rule 2, takes the first 字根 listed under each letter; all choices
/// encode identically. Caller guarantees `bytes.len() == 4`.
fn try_rule_3(bytes: &[u8], by_letter: &HashMap<u8, Vec<char>>) -> Option<Decomp> {
    // Last letter is the 识别码; the first three each select a 字根.
    let (stroke, shape) = identifier(bytes[3])?;
    let pick = |idx: usize| by_letter.get(&bytes[idx]).and_then(|v| v.first().copied());
    Some(Decomp {
        zigen: vec![pick(0)?, pick(1)?, pick(2)?],
        strokes: vec![stroke],
        shape,
    })
}

/// Rule 4: 4 字根, no identifier (4-letter code).
///
/// Every letter selects a 字根; there is no 识别码 position. Caller
/// guarantees `bytes.len() == 4`.
fn try_rule_4(bytes: &[u8], by_letter: &HashMap<u8, Vec<char>>) -> Option<Decomp> {
    let pick = |idx: usize| by_letter.get(&bytes[idx]).and_then(|v| v.first().copied());
    // Bail out (None) if any letter has no 字根 at all.
    let zigen = (0..4).map(pick).collect::<Option<Vec<char>>>()?;
    Some(Decomp {
        zigen,
        // Rule 4's strokes don't affect output; one entry satisfies
        // downstream parser invariants.
        strokes: vec![Stroke::Heng],
        shape: Shape::Whole,
    })
}