wubi 0.2.0

Self-developed Wubi 86 encoder, dictionary, and dataset (PHF + FST, WASM-ready).
Documentation
//! `wubi-fetch-corpus` — download every corpus declared in
//! `data/corpus/manifest.toml` into `data/corpus/cache/`, validating the
//! SHA-256 against the manifest. Re-runs are idempotent: cached files whose
//! SHA still matches are skipped.
//!
//! Usage:
//!     cargo run --features tools --release --bin wubi-fetch-corpus
//!
//! Exit codes:
//!     0 — all corpora present and SHA-valid
//!     1 — at least one download failed or SHA mismatched

use std::collections::BTreeMap;
use std::fs;
use std::io::{self, Read, Write};
use std::path::{Path, PathBuf};
use std::process::ExitCode;

use serde::Deserialize;
use sha2::{Digest, Sha256};

/// Deserialized form of `data/corpus/manifest.toml`.
#[derive(Debug, Deserialize)]
struct Manifest {
    /// Corpus entries keyed by id (the `[corpus.*]` tables). The id doubles
    /// as the file name inside the cache directory. A manifest without a
    /// `corpus` key deserializes to an empty map.
    #[serde(default)]
    corpus: BTreeMap<String, CorpusSpec>,
}

/// One corpus entry of the manifest. This binary only reads `url` and
/// `sha256`; the remaining fields are parsed but otherwise unused here.
#[derive(Debug, Deserialize)]
struct CorpusSpec {
    /// Location the corpus is downloaded from.
    url: String,
    /// Expected SHA-256 digest as hex; compared case-insensitively.
    sha256: String,
    /// Not read by this binary.
    #[allow(dead_code)] // metadata used by build_weights
    weight: f64,
    /// Not read by this binary.
    #[allow(dead_code)]
    license: String,
    /// Not read by this binary.
    #[allow(dead_code)]
    description: String,
    /// Not read by this binary; falls back to `default_format()` when the
    /// manifest omits it.
    #[allow(dead_code)]
    #[serde(default = "default_format")]
    format: String,
}

/// Serde default for `CorpusSpec::format`: the literal `"plain"`.
fn default_format() -> String {
    String::from("plain")
}

/// Entry point: load the manifest, make sure the cache directory exists, then
/// fetch/verify every declared corpus in turn.
///
/// Returns exit code 1 if the manifest cannot be read or parsed, if the cache
/// directory cannot be created, or if any corpus fails; otherwise success.
/// An empty manifest is reported on stderr and treated as success.
fn main() -> ExitCode {
    let root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let cache_dir = root.join("data/corpus/cache");
    let manifest_path = root.join("data/corpus/manifest.toml");

    let manifest_text = match fs::read_to_string(&manifest_path) {
        Err(e) => {
            eprintln!("error: cannot read {}: {e}", manifest_path.display());
            return ExitCode::from(1);
        }
        Ok(text) => text,
    };

    let manifest: Manifest = match toml::from_str(&manifest_text) {
        Err(e) => {
            eprintln!("error: cannot parse manifest.toml: {e}");
            return ExitCode::from(1);
        }
        Ok(parsed) => parsed,
    };

    // Nothing declared: not an error, but tell the user where the schema lives.
    if manifest.corpus.is_empty() {
        eprintln!(
            "manifest has no [corpus.*] entries — nothing to fetch.\n\
             see {} for the schema.",
            manifest_path.display()
        );
        return ExitCode::SUCCESS;
    }

    if let Err(e) = fs::create_dir_all(&cache_dir) {
        eprintln!("error: cannot create {}: {e}", cache_dir.display());
        return ExitCode::from(1);
    }

    // Keep going after a failure so every corpus gets its own report line.
    let mut failures = 0usize;
    for (id, spec) in &manifest.corpus {
        let target = cache_dir.join(id);
        match ensure_corpus(&target, spec) {
            Ok(FetchOutcome::Fetched(n)) => println!("{id}: fetched {n} bytes, sha256 ok"),
            Ok(FetchOutcome::Cached) => println!("{id}: already cached + verified"),
            Err(e) => {
                eprintln!("{id}: {e}");
                failures += 1;
            }
        }
    }

    if failures > 0 {
        return ExitCode::from(1);
    }
    ExitCode::SUCCESS
}

/// Successful result of `ensure_corpus`.
enum FetchOutcome {
    /// The file was already on disk and its SHA-256 matched the manifest.
    Cached,
    /// The file was downloaded and verified; carries the byte count
    /// reported by `download`.
    Fetched(u64),
}

/// Ensure the corpus described by `spec` exists at `path` with the expected
/// SHA-256.
///
/// If `path` already exists and its digest equals `spec.sha256`
/// (ASCII-case-insensitively), nothing is downloaded and
/// `FetchOutcome::Cached` is returned. Otherwise `spec.url` is downloaded to
/// `path` with its extension replaced by `part`, hashed, and renamed onto
/// `path` only when the digest matches.
///
/// # Errors
/// Returns a human-readable message when hashing, downloading, the digest
/// comparison, or the final rename fails. On any such failure the `.part`
/// file is removed (best effort) so no partial download is left behind.
fn ensure_corpus(path: &Path, spec: &CorpusSpec) -> Result<FetchOutcome, String> {
    if path.exists() {
        let sha = sha256_file(path).map_err(|e| format!("hashing {}: {e}", path.display()))?;
        if sha.eq_ignore_ascii_case(&spec.sha256) {
            return Ok(FetchOutcome::Cached);
        }
        eprintln!(
            "  cache mismatch for {}: re-downloading (expected {}, got {})",
            path.display(),
            &spec.sha256,
            &sha,
        );
    }

    let tmp = path.with_extension("part");

    // Download → verify → publish. Grouped so a single cleanup covers every
    // failure path below.
    let attempt = || -> Result<u64, String> {
        let bytes = download(&spec.url, &tmp)?;
        let actual = sha256_file(&tmp).map_err(|e| format!("hashing {}: {e}", tmp.display()))?;
        if !actual.eq_ignore_ascii_case(&spec.sha256) {
            return Err(format!(
                "sha256 mismatch — expected {}, got {} (download discarded)",
                spec.sha256, actual
            ));
        }
        fs::rename(&tmp, path)
            .map_err(|e| format!("rename {} → {}: {e}", tmp.display(), path.display()))?;
        Ok(bytes)
    };

    let outcome = attempt();
    if outcome.is_err() {
        // Best effort: the file may not exist if the request itself failed.
        let _ = fs::remove_file(&tmp);
    }
    outcome.map(FetchOutcome::Fetched)
}

/// Fetch `url` with `ureq` and stream the response body into `dest`.
///
/// The request is issued first; only after it succeeds is `dest`'s parent
/// directory created and the file created. Returns the number of bytes
/// copied.
///
/// # Errors
/// Returns a message prefixed with `GET`, `mkdir`, `create`, or `copy →`
/// naming the step that failed. A failing final flush is ignored.
fn download(url: &str, dest: &Path) -> Result<u64, String> {
    let response = match ureq::get(url).call() {
        Ok(r) => r,
        Err(e) => return Err(format!("GET {url}: {e}")),
    };

    if let Some(dir) = dest.parent() {
        if let Err(e) = fs::create_dir_all(dir) {
            return Err(format!("mkdir {}: {e}", dir.display()));
        }
    }

    let mut out = match fs::File::create(dest) {
        Ok(f) => f,
        Err(e) => return Err(format!("create {}: {e}", dest.display())),
    };

    let mut body = response.into_reader();
    let written = match io::copy(&mut body, &mut out) {
        Ok(count) => count,
        Err(e) => return Err(format!("copy → {}: {e}", dest.display())),
    };

    // Flush result intentionally discarded, as before.
    let _ = out.flush();
    Ok(written)
}

/// Compute the SHA-256 of the file at `path`, streaming it through a 64 KiB
/// buffer, and return the digest as a hex string produced by `hex`.
///
/// # Errors
/// Returns the underlying I/O error if the file cannot be opened or read.
/// `ErrorKind::Interrupted` reads are retried rather than reported, as the
/// `Read::read` contract describes them as non-fatal.
fn sha256_file(path: &Path) -> io::Result<String> {
    let mut file = fs::File::open(path)?;
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 64 * 1024];
    loop {
        match file.read(&mut buf) {
            // Zero bytes read means end of file.
            Ok(0) => break,
            Ok(n) => hasher.update(&buf[..n]),
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(hex(hasher.finalize().as_slice()))
}

/// Render `bytes` as a hex string, two digits per byte with the high nibble
/// first. Digits come from `nibble`. An empty slice yields an empty string.
fn hex(bytes: &[u8]) -> String {
    let mut out = String::with_capacity(2 * bytes.len());
    out.extend(
        bytes
            .iter()
            .flat_map(|&byte| [nibble((byte >> 4) & 0xF), nibble(byte & 0xF)]),
    );
    out
}

/// Map a 4-bit value to its lowercase hex digit: `0..=9` → `'0'..='9'`,
/// `10..=15` → `'a'..='f'`. Callers are expected to pass values below 16.
fn nibble(n: u8) -> char {
    if n <= 9 {
        char::from(b'0' + n)
    } else {
        char::from(b'a' + n - 10)
    }
}