use std::collections::BTreeMap;
use std::fs;
use std::io::{self, Read, Write};
use std::path::{Path, PathBuf};
use std::process::ExitCode;
use serde::Deserialize;
use sha2::{Digest, Sha256};
/// Deserialized form of the corpus manifest file.
#[derive(Debug, Deserialize)]
struct Manifest {
    /// Corpus entries keyed by id; the id doubles as the cache file name.
    /// An absent `corpus` section deserializes to an empty map.
    #[serde(default)]
    corpus: BTreeMap<String, CorpusSpec>,
}
/// One corpus entry of the manifest.
///
/// Only `url` and `sha256` are read by the code in this file. The remaining
/// fields are still deserialized, so a manifest entry that omits one of them
/// (other than `format`, which has a default) fails to parse.
#[derive(Debug, Deserialize)]
struct CorpusSpec {
    /// Location the corpus file is downloaded from.
    url: String,
    /// Expected SHA-256 digest as hex; compared ASCII-case-insensitively.
    sha256: String,
    // Not read in this file; presumably consumed by other tooling.
    #[allow(dead_code)]
    weight: f64,
    // Not read in this file.
    #[allow(dead_code)]
    license: String,
    // Not read in this file.
    #[allow(dead_code)]
    description: String,
    // Not read in this file; falls back to `default_format()` when omitted.
    #[allow(dead_code)]
    #[serde(default = "default_format")]
    format: String,
}
/// Serde fallback for `CorpusSpec::format` when the manifest omits the key.
fn default_format() -> String {
    String::from("plain")
}
fn main() -> ExitCode {
let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
let manifest_path = crate_dir.join("data/corpus/manifest.toml");
let cache_dir = crate_dir.join("data/corpus/cache");
let manifest_src = match fs::read_to_string(&manifest_path) {
Ok(s) => s,
Err(e) => {
eprintln!("error: cannot read {}: {e}", manifest_path.display());
return ExitCode::from(1);
}
};
let manifest: Manifest = match toml::from_str(&manifest_src) {
Ok(m) => m,
Err(e) => {
eprintln!("error: cannot parse manifest.toml: {e}");
return ExitCode::from(1);
}
};
if manifest.corpus.is_empty() {
eprintln!(
"manifest has no [corpus.*] entries — nothing to fetch.\n\
see {} for the schema.",
manifest_path.display()
);
return ExitCode::SUCCESS;
}
if let Err(e) = fs::create_dir_all(&cache_dir) {
eprintln!("error: cannot create {}: {e}", cache_dir.display());
return ExitCode::from(1);
}
let mut had_error = false;
for (id, spec) in &manifest.corpus {
let path = cache_dir.join(id);
match ensure_corpus(&path, spec) {
Ok(FetchOutcome::Cached) => println!("✓ {id}: already cached + verified"),
Ok(FetchOutcome::Fetched(n)) => println!("✓ {id}: fetched {n} bytes, sha256 ok"),
Err(e) => {
eprintln!("✗ {id}: {e}");
had_error = true;
}
}
}
if had_error {
ExitCode::from(1)
} else {
ExitCode::SUCCESS
}
}
/// How `ensure_corpus` satisfied a request for one corpus file.
enum FetchOutcome {
    /// The file already on disk matched the expected digest; nothing was downloaded.
    Cached,
    /// The file was downloaded and verified; carries the number of bytes copied.
    Fetched(u64),
}
fn ensure_corpus(path: &Path, spec: &CorpusSpec) -> Result<FetchOutcome, String> {
if path.exists() {
let sha = sha256_file(path).map_err(|e| format!("hashing {}: {e}", path.display()))?;
if sha.eq_ignore_ascii_case(&spec.sha256) {
return Ok(FetchOutcome::Cached);
}
eprintln!(
" cache mismatch for {}: re-downloading (expected {}, got {})",
path.display(),
&spec.sha256,
&sha,
);
}
let tmp = path.with_extension("part");
let bytes = download(&spec.url, &tmp)?;
let actual = sha256_file(&tmp).map_err(|e| format!("hashing {}: {e}", tmp.display()))?;
if !actual.eq_ignore_ascii_case(&spec.sha256) {
let _ = fs::remove_file(&tmp);
return Err(format!(
"sha256 mismatch — expected {}, got {} (download discarded)",
spec.sha256, actual
));
}
fs::rename(&tmp, path).map_err(|e| format!("rename {} → {}: {e}", tmp.display(), path.display()))?;
Ok(FetchOutcome::Fetched(bytes))
}
/// Downloads `url` into the file at `dest`, creating parent directories as needed.
///
/// Returns the number of bytes copied from the response body.
///
/// # Errors
///
/// Returns a human-readable message if the request fails, the destination
/// cannot be created, or the body cannot be fully written. When writing
/// fails, the truncated file at `dest` is removed on a best-effort basis.
fn download(url: &str, dest: &Path) -> Result<u64, String> {
    let resp = ureq::get(url)
        .call()
        .map_err(|e| format!("GET {url}: {e}"))?;

    if let Some(parent) = dest.parent() {
        fs::create_dir_all(parent).map_err(|e| format!("mkdir {}: {e}", parent.display()))?;
    }

    let mut file =
        fs::File::create(dest).map_err(|e| format!("create {}: {e}", dest.display()))?;
    let mut reader = resp.into_reader();

    let copied = io::copy(&mut reader, &mut file)
        .map_err(|e| format!("copy → {}: {e}", dest.display()))
        .and_then(|n| {
            // Report a flush failure instead of silently discarding it.
            file.flush()
                .map(|()| n)
                .map_err(|e| format!("flush {}: {e}", dest.display()))
        });

    if copied.is_err() {
        // Close the handle first, then drop the truncated output.
        drop(file);
        let _ = fs::remove_file(dest);
    }
    copied
}
/// Computes the SHA-256 digest of the file at `path` as a lowercase hex string.
///
/// The file is streamed through a fixed 64 KiB buffer, so memory use does not
/// depend on the file size.
///
/// # Errors
///
/// Returns any I/O error from opening or reading the file, except
/// `ErrorKind::Interrupted`, which is retried.
fn sha256_file(path: &Path) -> io::Result<String> {
    let mut file = fs::File::open(path)?;
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 64 * 1024];
    loop {
        let n = match file.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            // An interrupted read is non-fatal per the `Read::read` contract; retry it.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        hasher.update(&buf[..n]);
    }
    Ok(hex(hasher.finalize().as_slice()))
}
/// Encodes `bytes` as lowercase hexadecimal, two characters per byte,
/// high nibble first.
fn hex(bytes: &[u8]) -> String {
    bytes
        .iter()
        .flat_map(|&byte| [nibble(byte >> 4), nibble(byte & 0x0F)])
        .collect()
}
/// Maps a value to its lowercase hex digit: `0..=9` become `'0'..='9'`,
/// and `10..=15` become `'a'..='f'`.
fn nibble(n: u8) -> char {
    if n <= 9 {
        char::from(b'0' + n)
    } else {
        char::from(b'a' + n - 10)
    }
}