use std::collections::{HashMap, HashSet};
use wubi::{Decomp, Shape, Stroke, embedded_seed, encode, iter_jianma1, iter_zigen};
/// Entry point of the auto-decomposition generator.
///
/// Reads the reference code table at
/// `<CARGO_MANIFEST_DIR>/../../data/wubi86_full.txt`, keeps the longest
/// all-lowercase code listed for every single-character entry, and prints one
/// `<char>\t<zigen-seq>\t<strokes>\t<shape>` line to stdout for each character
/// whose candidate decomposition (from `try_decompose`) re-encodes to exactly
/// that code. Characters already returned by `embedded_seed()` are skipped.
/// The number of emitted entries is reported on stderr.
///
/// Command-line flags:
/// * `--max=N` — stop after `N` emitted entries (default: no limit).
/// * `--all-blocks` — do not restrict output to U+4E00..=U+9FFF.
///
/// # Panics
///
/// Panics if the reference table cannot be read.
fn main() {
    let args: Vec<String> = std::env::args().skip(1).collect();
    let max = parse_max(&args);
    let basic_cjk_only = !args.iter().any(|a| a == "--all-blocks");

    let manifest = env!("CARGO_MANIFEST_DIR");
    let ref_path = format!("{manifest}/../../data/wubi86_full.txt");
    // Name the path in the failure message so a missing data file is easy to
    // diagnose.
    let reference = std::fs::read_to_string(&ref_path)
        .unwrap_or_else(|err| panic!("failed to read reference table {ref_path}: {err}"));

    // Group the zigen by their key letter; each list is sorted so that the
    // "first" zigen picked by the rule functions is deterministic.
    let mut by_letter: HashMap<u8, Vec<char>> = HashMap::new();
    for (z, l) in iter_zigen() {
        by_letter.entry(l).or_default().push(z);
    }
    for v in by_letter.values_mut() {
        v.sort_unstable();
    }

    let mut already: HashSet<char> = HashSet::new();
    for (ch, _) in embedded_seed() {
        already.insert(ch);
    }

    // NOTE(review): presumably only here to keep the `iter_jianma1` import
    // from being reported as unused — confirm before removing.
    let _ = iter_jianma1;

    let longest_code = collect_longest_codes(&reference);

    println!(
        "# Auto-generated by golia-auto-decomp.\n\
         # Format: <char>\\t<zigen-seq>\\t<strokes>\\t<shape>\n\
         # Each entry's encoded code matches rime's longest code for the char.\n\
         # The 字根 sequence picked is the FIRST that the brute-force search\n\
         # found; for chars with multiple valid decomps it may not match the\n\
         # canonical visual decomposition. Manual review encouraged."
    );

    // Order by code, then by character, so the output is stable across runs
    // (HashMap iteration order is not). Keys are unique, so an unstable sort
    // gives the same result as a stable one.
    let mut sorted: Vec<(char, String)> = longest_code.into_iter().collect();
    sorted.sort_unstable_by(|a, b| a.1.cmp(&b.1).then_with(|| a.0.cmp(&b.0)));

    let mut emitted = 0usize;
    for (ch, code) in sorted {
        if emitted >= max {
            break;
        }
        if already.contains(&ch) {
            continue;
        }
        if basic_cjk_only && !('\u{4E00}'..='\u{9FFF}').contains(&ch) {
            continue;
        }
        let Some(decomp) = try_decompose(&code, &by_letter) else {
            continue;
        };
        let Ok(encoded) = encode(&decomp) else {
            continue;
        };
        // Only keep candidates that round-trip to the reference code.
        if encoded.as_str() != code {
            continue;
        }

        let zg: String = decomp
            .zigen
            .iter()
            .map(|c| c.to_string())
            .collect::<Vec<_>>()
            .join(" ");
        let st: String = decomp
            .strokes
            .iter()
            .map(|s| (*s as u8).to_string())
            .collect::<Vec<_>>()
            .join(" ");
        let sh = decomp.shape as u8;
        println!("{ch}\t{zg}\t{st}\t{sh}");
        emitted += 1;
    }
    eprintln!("[auto-decomp] emitted {emitted} entries");
}

/// Returns the value of the first well-formed `--max=N` argument, or
/// `usize::MAX` when there is none.
///
/// A `--max=` argument whose value does not parse as `usize` is reported on
/// stderr and skipped, rather than being silently ignored.
fn parse_max(args: &[String]) -> usize {
    args.iter()
        .filter_map(|arg| arg.strip_prefix("--max="))
        .find_map(|value| match value.parse::<usize>() {
            Ok(n) => Some(n),
            Err(err) => {
                eprintln!("[auto-decomp] ignoring invalid --max value {value:?}: {err}");
                None
            }
        })
        .unwrap_or(usize::MAX)
}

/// Maps every single-character word in `reference` to the longest code listed
/// for it.
///
/// Each useful line has the form `<code>\t<word>`. Skipped lines: blank lines,
/// lines starting with `#`, lines without a tab, words that are not exactly
/// one `char` after trimming, and codes containing anything other than ASCII
/// lowercase letters. When several codes of the same length exist, the one
/// that appears first is kept.
fn collect_longest_codes(reference: &str) -> HashMap<char, String> {
    let mut longest: HashMap<char, String> = HashMap::new();
    for raw in reference.lines() {
        let line = raw.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        let Some((code, word)) = line.split_once('\t') else {
            continue;
        };
        let (code, word) = (code.trim(), word.trim());

        // Accept the word only if it consists of exactly one char.
        let mut chars = word.chars();
        let (Some(ch), None) = (chars.next(), chars.next()) else {
            continue;
        };
        if !code.bytes().all(|b| b.is_ascii_lowercase()) {
            continue;
        }

        let entry = longest.entry(ch).or_default();
        if code.len() > entry.len() {
            entry.clear();
            entry.push_str(code);
        }
    }
    longest
}
/// Translates an identifier letter into its `(Stroke, Shape)` pair.
///
/// Fifteen letters are recognised, arranged as five stroke rows
/// (`Heng`, `Shu`, `Pie`, `Na`, `Zhe`) by three shape columns
/// (`LeftRight`, `TopBottom`, `Whole`). Any other byte yields `None`.
fn identifier(letter: u8) -> Option<(Stroke, Shape)> {
    // One row per stroke; within a row the columns follow the shape order
    // LeftRight, TopBottom, Whole.
    const ROWS: [&[u8; 3]; 5] = [b"gfd", b"hjk", b"tre", b"yui", b"nbv"];

    for (row, letters) in ROWS.iter().enumerate() {
        let Some(col) = letters.iter().position(|&b| b == letter) else {
            continue;
        };
        let stroke = match row {
            0 => Stroke::Heng,
            1 => Stroke::Shu,
            2 => Stroke::Pie,
            3 => Stroke::Na,
            _ => Stroke::Zhe,
        };
        let shape = match col {
            0 => Shape::LeftRight,
            1 => Shape::TopBottom,
            _ => Shape::Whole,
        };
        return Some((stroke, shape));
    }
    None
}
/// Picks a candidate decomposition for `code` based on its byte length.
///
/// * 3 bytes: delegates to `try_rule_2`.
/// * 4 bytes: tries `try_rule_3` first and falls back to `try_rule_4` when
///   that yields nothing.
/// * any other length: `None`.
fn try_decompose(code: &str, by_letter: &HashMap<u8, Vec<char>>) -> Option<Decomp> {
    let letters = code.as_bytes();
    if letters.len() == 3 {
        return try_rule_2(letters, by_letter);
    }
    if letters.len() != 4 {
        return None;
    }
    match try_rule_3(letters, by_letter) {
        Some(found) => Some(found),
        None => try_rule_4(letters, by_letter),
    }
}
/// Builds a two-zigen candidate from a three-letter code.
///
/// The first two bytes are looked up in `by_letter` and the first zigen listed
/// for each is used; the third byte is resolved through [`identifier`] to
/// supply the single stroke and the shape. Bytes past the third are ignored.
///
/// Returns `None` when `bytes` holds fewer than three bytes, when the third
/// byte is not an identifier letter, or when either of the first two letters
/// has no zigen in `by_letter`.
fn try_rule_2(bytes: &[u8], by_letter: &HashMap<u8, Vec<char>>) -> Option<Decomp> {
    // Slice pattern instead of indexing: a too-short slice produces `None`
    // rather than a panic.
    let &[l1, l2, lid, ..] = bytes else {
        return None;
    };
    let (last_stroke, shape) = identifier(lid)?;
    let first_zigen =
        |letter: u8| -> Option<char> { by_letter.get(&letter)?.first().copied() };
    Some(Decomp {
        zigen: vec![first_zigen(l1)?, first_zigen(l2)?],
        strokes: vec![last_stroke],
        shape,
    })
}
/// Builds a three-zigen candidate from a four-letter code.
///
/// The first three bytes are looked up in `by_letter` and the first zigen
/// listed for each is used; the fourth byte is resolved through
/// [`identifier`] to supply the single stroke and the shape. Bytes past the
/// fourth are ignored.
///
/// Returns `None` when `bytes` holds fewer than four bytes, when the fourth
/// byte is not an identifier letter, or when any of the first three letters
/// has no zigen in `by_letter`.
fn try_rule_3(bytes: &[u8], by_letter: &HashMap<u8, Vec<char>>) -> Option<Decomp> {
    // Slice pattern instead of indexing: a too-short slice produces `None`
    // rather than a panic.
    let &[l1, l2, l3, lid, ..] = bytes else {
        return None;
    };
    let (last_stroke, shape) = identifier(lid)?;
    let first_zigen =
        |letter: u8| -> Option<char> { by_letter.get(&letter)?.first().copied() };
    Some(Decomp {
        zigen: vec![first_zigen(l1)?, first_zigen(l2)?, first_zigen(l3)?],
        strokes: vec![last_stroke],
        shape,
    })
}
/// Builds a four-zigen candidate from a four-letter code.
///
/// Each of the first four bytes is looked up in `by_letter` and the first
/// zigen listed for it is used. Bytes past the fourth are ignored. The
/// candidate always carries `Stroke::Heng` and `Shape::Whole`.
///
/// Returns `None` when `bytes` holds fewer than four bytes or when any of the
/// four letters has no zigen in `by_letter`.
fn try_rule_4(bytes: &[u8], by_letter: &HashMap<u8, Vec<char>>) -> Option<Decomp> {
    // Slice pattern instead of indexing: a too-short slice produces `None`
    // rather than a panic.
    let &[l1, l2, l3, l4, ..] = bytes else {
        return None;
    };
    let first_zigen =
        |letter: u8| -> Option<char> { by_letter.get(&letter)?.first().copied() };
    Some(Decomp {
        zigen: vec![
            first_zigen(l1)?,
            first_zigen(l2)?,
            first_zigen(l3)?,
            first_zigen(l4)?,
        ],
        // NOTE(review): fixed stroke/shape — presumably placeholders that the
        // encoder does not consult for four-zigen entries; confirm against
        // `encode`.
        strokes: vec![Stroke::Heng],
        shape: Shape::Whole,
    })
}