use anyhow::{anyhow, bail, Context, Result};
use clap::{Parser, Subcommand};
use serde::{Deserialize, Serialize};
use serde_yaml::Value;
use std::collections::BTreeMap;
use std::fs;
use std::path::{Path, PathBuf};
/// Contract file read by the `plan` subcommand when `--contract` is not given.
const DEFAULT_CONTRACT_PATH: &str = "contracts/dataset-thestack-python-v1.yaml";
/// Directory the `plan` subcommand writes into when `--output-dir` is not given.
const DEFAULT_OUTPUT_DIR: &str = "output";
/// Minimum number of `invariants` entries a contract must declare.
const MIN_INVARIANTS: usize = 7;
/// Minimum number of `falsification` entries a contract must declare.
const MIN_FALSIFICATIONS: usize = 5;
/// Minimum number of `gates` entries a contract must declare.
const MIN_GATES: usize = 5;
/// Top-level mapping keys that `validate_contract` requires to be present.
const REQUIRED_TOP_KEYS: &[&str] = &[
"source",
"license_whitelist",
"pii_scrub",
"deduplication",
"split",
"budget",
];
// Top-level command-line interface; clap derives the argument parser from it.
// NOTE: plain `//` comments are used on purpose. clap's derive macros pick up
// `///` doc comments as help text, which would change the program's output.
#[derive(Parser)]
#[command(
name = "apr-corpus-ingest",
version,
about = "Dry-run scaffold for SHIP-TWO-001 MODEL-2 corpus ingest (C-DATA-THESTACK-PYTHON)"
)]
struct Cli {
// The subcommand selected on the command line; `main` dispatches on it.
#[command(subcommand)]
cmd: Command,
}
// Subcommands of the CLI.
// NOTE: plain `//` comments are used on purpose. clap's derive macros pick up
// `///` doc comments as help text, which would change the program's output.
#[derive(Subcommand)]
enum Command {
// Print the ingest plan for a contract and write a dry-run manifest
// (handled by `run_plan`).
Plan {
// Contract YAML to read; defaults to `DEFAULT_CONTRACT_PATH`.
#[arg(long, default_value = DEFAULT_CONTRACT_PATH)]
contract: PathBuf,
// Directory the manifest is written into; defaults to `DEFAULT_OUTPUT_DIR`.
#[arg(long, default_value = DEFAULT_OUTPUT_DIR)]
output_dir: PathBuf,
},
// Validate a contract file and print a report (handled by
// `validate_contract` and `print_validation_report`).
ValidateContract {
// Contract YAML to validate. It carries no `#[arg(long)]`, so clap
// treats it as a positional argument.
path: PathBuf,
},
}
/// Typed view of a corpus contract YAML document.
///
/// Only the identification fields and the three item lists are interpreted by
/// this file. The section fields (`source` .. `budget`) are held as untyped
/// YAML values and are never read here, hence their `#[allow(dead_code)]`.
/// They carry no `#[serde(default)]`, so presumably deserialization fails when
/// one of them is absent (confirm against serde_yaml's missing-field handling).
#[derive(Debug, Deserialize)]
struct CorpusContract {
/// Contract identifier, echoed in reports and in the dry-run manifest.
contract_id: String,
/// Contract version string, echoed in reports and in the dry-run manifest.
version: String,
/// Optional status label; printed only when present.
#[serde(default)]
status: Option<String>,
// Untyped section: deserialized but not inspected by this file.
#[allow(dead_code)]
source: Value,
// Untyped section: deserialized but not inspected by this file.
#[allow(dead_code)]
license_whitelist: Value,
// Untyped section: deserialized but not inspected by this file.
#[allow(dead_code)]
pii_scrub: Value,
// Untyped section: deserialized but not inspected by this file.
#[allow(dead_code)]
deduplication: Value,
// Untyped section: deserialized but not inspected by this file.
#[allow(dead_code)]
split: Value,
// Untyped section: deserialized but not inspected by this file.
#[allow(dead_code)]
budget: Value,
/// Declared invariants; an absent key deserializes to an empty list.
#[serde(default)]
invariants: Vec<ContractItem>,
/// Declared falsification tests; an absent key deserializes to an empty list.
#[serde(default)]
falsification: Vec<ContractItem>,
/// Declared gates; an absent key deserializes to an empty list.
#[serde(default)]
gates: Vec<ContractItem>,
}
/// One entry of a contract's `invariants`, `falsification` or `gates` list.
#[derive(Debug, Deserialize)]
struct ContractItem {
/// Item identifier; `assert_structural_minimums` rejects blank ids.
id: String,
/// Optional display name, printed next to the id in the plan summary for
/// falsification tests and gates.
#[serde(default)]
name: Option<String>,
// Deserialized but never read in this file, hence the lint allowance.
#[allow(dead_code)]
#[serde(default)]
description: Option<String>,
}
/// Summary returned by `validate_contract` for a contract that passed all checks.
#[derive(Debug)]
struct ValidationReport {
/// Identifier copied from the contract.
contract_id: String,
/// Version string copied from the contract.
version: String,
/// Status copied from the contract, if it declared one.
status: Option<String>,
/// Number of entries in the contract's `invariants` list.
invariants: usize,
/// Number of entries in the contract's `falsification` list.
falsification: usize,
/// Number of entries in the contract's `gates` list.
gates: usize,
/// Required top-level keys found in the contract, in `REQUIRED_TOP_KEYS` order.
present_top_keys: Vec<String>,
}
fn main() -> Result<()> {
let cli = Cli::parse();
match cli.cmd {
Command::Plan {
contract,
output_dir,
} => run_plan(&contract, &output_dir),
Command::ValidateContract { path } => {
let report = validate_contract(&path)?;
print_validation_report(&report);
Ok(())
}
}
}
/// Runs the `plan` subcommand: loads the contract, checks its structural
/// minimums, prints the plan summary, and writes `dry-run-manifest.yaml` into
/// `output_dir` (creating the directory if needed).
///
/// # Errors
/// Fails when the contract cannot be read or deserialized, when it violates
/// the structural minimums, or when the output directory or manifest cannot
/// be written.
fn run_plan(contract_path: &Path, output_dir: &Path) -> Result<()> {
    let contract_text = fs::read_to_string(contract_path).with_context(|| {
        format!(
            "failed to read corpus contract at {}",
            contract_path.display()
        )
    })?;
    let contract = serde_yaml::from_str::<CorpusContract>(&contract_text)
        .context("contract YAML failed to deserialize")?;
    assert_structural_minimums(&contract)?;
    print_plan_summary(&contract, contract_path);

    fs::create_dir_all(output_dir)
        .with_context(|| format!("failed to create output directory {}", output_dir.display()))?;

    let manifest_yaml = serde_yaml::to_string(&build_dry_run_manifest(&contract, contract_path))
        .context("failed to serialize dry-run manifest")?;
    let manifest_file = output_dir.join("dry-run-manifest.yaml");
    fs::write(&manifest_file, manifest_yaml)
        .with_context(|| format!("failed to write {}", manifest_file.display()))?;

    // The leading newline reproduces the blank separator line before the notice.
    println!("\nwrote dry-run manifest: {}", manifest_file.display());
    println!("(no network calls, no downloads — this is scaffolding only)");
    Ok(())
}
/// Validates the contract file at `path` and summarizes it.
///
/// The document is first checked as untyped YAML (root must be a mapping that
/// contains every key in `REQUIRED_TOP_KEYS`), then deserialized into
/// `CorpusContract` and checked against the structural minimums.
///
/// # Errors
/// Fails when the file cannot be read, is not valid YAML, has a non-mapping
/// root, lacks a required top-level key (the first missing one is reported),
/// does not fit the typed schema, or violates the structural minimums.
fn validate_contract(path: &Path) -> Result<ValidationReport> {
    let text = fs::read_to_string(path)
        .with_context(|| format!("failed to read contract at {}", path.display()))?;

    let document =
        serde_yaml::from_str::<Value>(&text).context("contract YAML is not valid YAML")?;
    let root = match document.as_mapping() {
        Some(mapping) => mapping,
        None => return Err(anyhow!("contract root must be a YAML mapping")),
    };

    // Stop at the first required key that is absent, in declaration order.
    let has_key = |key: &str| root.contains_key(Value::String(key.to_string()));
    if let Some(key) = REQUIRED_TOP_KEYS.iter().copied().find(|key| !has_key(key)) {
        bail!("contract missing required top-level key: {key}");
    }
    // Past this point every required key is known to be present.
    let present_top_keys: Vec<String> = REQUIRED_TOP_KEYS
        .iter()
        .map(|&key| key.to_owned())
        .collect();

    let contract = serde_yaml::from_str::<CorpusContract>(&text)
        .context("contract YAML failed to deserialize into typed schema")?;
    assert_structural_minimums(&contract)?;

    let invariants = contract.invariants.len();
    let falsification = contract.falsification.len();
    let gates = contract.gates.len();
    Ok(ValidationReport {
        contract_id: contract.contract_id,
        version: contract.version,
        status: contract.status,
        invariants,
        falsification,
        gates,
        present_top_keys,
    })
}
/// Checks that the contract declares enough invariants, falsification tests
/// and gates, and that none of those items has a blank id.
///
/// # Errors
/// Reports the first list (in the order invariants, falsification, gates) that
/// is shorter than its minimum; if all lists are long enough, reports an error
/// when any item id is empty after trimming whitespace.
fn assert_structural_minimums(contract: &CorpusContract) -> Result<()> {
    // (items, noun used in the error message, required minimum)
    let sections = [
        (&contract.invariants, "invariants", MIN_INVARIANTS),
        (
            &contract.falsification,
            "falsification tests",
            MIN_FALSIFICATIONS,
        ),
        (&contract.gates, "gates", MIN_GATES),
    ];

    for &(items, noun, minimum) in sections.iter() {
        let declared = items.len();
        if declared < minimum {
            bail!("contract declares {declared} {noun}; spec requires at least {minimum}");
        }
    }

    let any_blank_id = sections
        .iter()
        .flat_map(|&(items, _, _)| items.iter())
        .any(|item| item.id.trim().is_empty());
    if any_blank_id {
        bail!("contract has an item with empty id");
    }
    Ok(())
}
fn print_plan_summary(contract: &CorpusContract, contract_path: &Path) {
println!("== C-DATA-THESTACK-PYTHON ingest plan (dry-run) ==");
println!("contract file : {}", contract_path.display());
println!("contract_id : {}", contract.contract_id);
println!("version : {}", contract.version);
if let Some(status) = &contract.status {
println!("status : {status}");
}
println!("invariants : {}", contract.invariants.len());
for inv in &contract.invariants {
println!(" - {}", inv.id);
}
println!("falsification : {}", contract.falsification.len());
for f in &contract.falsification {
let name = f.name.as_deref().unwrap_or("");
println!(" - {} {}", f.id, name);
}
println!("gates : {}", contract.gates.len());
for g in &contract.gates {
let name = g.name.as_deref().unwrap_or("");
println!(" - {} {}", g.id, name);
}
println!();
println!("planned steps (none executed in dry-run):");
println!(" 1. pin HF dataset revision_sha + record raw_tar_sha256");
println!(" 2. license filter via go-license-detector (SPDX whitelist)");
println!(" 3. PII scrub (file-level rejection on pattern match)");
println!(" 4. MinHash-LSH dedup, shingle=5, perms=128, seed=42");
println!(" 5. hash-by-file-sha256 deterministic train/val split");
println!(" 6. emit shards + manifest + provenance + corpus_sha256");
}
fn print_validation_report(report: &ValidationReport) {
println!("contract_id : {}", report.contract_id);
println!("version : {}", report.version);
if let Some(status) = &report.status {
println!("status : {status}");
}
println!(
"top-level : {} keys present",
report.present_top_keys.len()
);
for key in &report.present_top_keys {
println!(" - {key}");
}
println!("invariants : {}", report.invariants);
println!("falsification : {}", report.falsification);
println!("gates : {}", report.gates);
println!("VALIDATION: PASS");
}
/// Document serialized to `dry-run-manifest.yaml` by `run_plan`.
///
/// Values that only a real ingest could know are filled with placeholder
/// strings by `build_dry_run_manifest`.
#[derive(Serialize)]
struct DryRunManifest {
/// Marks the manifest as a dry run; `build_dry_run_manifest` always sets `true`.
dry_run: bool,
/// Timestamp produced by `now_utc_iso8601` when the manifest was built.
generated_at_utc: String,
/// Tool name followed by the crate version.
generator: &'static str,
/// Path of the contract file the manifest was derived from.
contract_path: String,
/// Identifier copied from the contract.
contract_id: String,
/// Version string copied from the contract.
contract_version: String,
/// Number of invariants the contract declares.
invariant_count: usize,
/// Number of falsification tests the contract declares.
falsification_count: usize,
/// Number of gates the contract declares.
gate_count: usize,
/// Source provenance fields; all values are placeholders in a dry run.
source: BTreeMap<&'static str, String>,
/// Token, file and rejection counters; all values are placeholders in a dry run.
counts: BTreeMap<&'static str, String>,
/// Corpus digest; a placeholder in a dry run.
corpus_sha256: String,
/// Ids of the contract's gates, in declaration order.
gates: Vec<String>,
}
/// Builds the dry-run manifest for `contract`.
///
/// Identification and item counts come from the contract; every value that a
/// real ingest would measure is a placeholder string naming the field.
fn build_dry_run_manifest(contract: &CorpusContract, contract_path: &Path) -> DryRunManifest {
    /// Maps each field name to a placeholder for `<section>.<field>`.
    fn placeholder_section(
        section: &str,
        fields: &[&'static str],
    ) -> BTreeMap<&'static str, String> {
        fields
            .iter()
            .map(|&field| (field, placeholder(&format!("{section}.{field}"))))
            .collect()
    }

    let source = placeholder_section(
        "source",
        &["revision_sha", "raw_tar_sha256", "ingest_date_utc"],
    );
    let counts = placeholder_section(
        "counts",
        &[
            "train_token_count",
            "val_token_count",
            "train_file_count",
            "val_file_count",
            "pii_rejected_count",
            "license_rejected_count",
            "dedup_rejected_count",
        ],
    );
    let gate_ids: Vec<String> = contract.gates.iter().map(|gate| gate.id.clone()).collect();

    DryRunManifest {
        dry_run: true,
        generated_at_utc: now_utc_iso8601(),
        generator: concat!("apr-corpus-ingest ", env!("CARGO_PKG_VERSION")),
        contract_path: contract_path.display().to_string(),
        contract_id: contract.contract_id.clone(),
        contract_version: contract.version.clone(),
        invariant_count: contract.invariants.len(),
        falsification_count: contract.falsification.len(),
        gate_count: contract.gates.len(),
        source,
        counts,
        corpus_sha256: placeholder("corpus_sha256"),
        gates: gate_ids,
    }
}
/// Returns the marker text stored in the manifest for a value that only a
/// real ingest can supply; `field` names the value being stood in for.
fn placeholder(field: &str) -> String {
    ["TODO: ", field, " — fill in on real ingest"].concat()
}
/// Formats the current system time as `YYYY-MM-DDTHH:MM:SSZ`.
///
/// If the system clock reads earlier than the Unix epoch, the epoch itself is
/// formatted instead of returning an error.
fn now_utc_iso8601() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};

    const SECS_PER_DAY: u64 = 86_400;

    let epoch_secs = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    };

    let (year, month, day) = civil_from_days((epoch_secs / SECS_PER_DAY) as i64);
    let secs_of_day = epoch_secs % SECS_PER_DAY;
    let hour = secs_of_day / 3600;
    let minute = secs_of_day / 60 % 60;
    let second = secs_of_day % 60;

    format!("{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}Z")
}
/// Converts a day count relative to 1970-01-01 into a `(year, month, day)`
/// triple, with month in `1..=12` and day in `1..=31`. Negative counts yield
/// dates before 1970.
///
/// The arithmetic works in 400-year eras of 146 097 days, counting years from
/// 1 March so that the leap day falls at the end of the counted year.
fn civil_from_days(z: i64) -> (i32, u32, u32) {
    const DAYS_PER_ERA: i64 = 146_097;
    const EPOCH_SHIFT_DAYS: i64 = 719_468;

    let shifted = z + EPOCH_SHIFT_DAYS;
    // Bias negative values so the truncating division rounds toward -infinity.
    let floor_numerator = if shifted >= 0 {
        shifted
    } else {
        shifted - (DAYS_PER_ERA - 1)
    };
    let era = floor_numerator / DAYS_PER_ERA;

    let day_of_era = (shifted - era * DAYS_PER_ERA) as u64;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);

    // Month index counted from March (0 = March .. 11 = February).
    let march_based_month = (5 * day_of_year + 2) / 153;
    let day = (day_of_year - (153 * march_based_month + 2) / 5 + 1) as u32;
    let calendar_month = if march_based_month < 10 {
        march_based_month + 3
    } else {
        march_based_month - 9
    };
    let month = calendar_month as u32;

    // January and February belong to the following calendar year.
    let year = year_of_era as i64 + era * 400 + i64::from(month <= 2);
    (year as i32, month, day)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Path of the checked-in contract: two directories above this crate's
    /// manifest directory, then `contracts/dataset-thestack-python-v1.yaml`.
    fn real_contract_path() -> PathBuf {
        let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
        let workspace_root = manifest_dir
            .parent()
            .and_then(Path::parent)
            .expect("workspace root resolvable from CARGO_MANIFEST_DIR");
        workspace_root.join("contracts/dataset-thestack-python-v1.yaml")
    }

    /// The real contract deserializes, meets the structural minimums, and
    /// uses the expected id prefix in each of its three item lists.
    #[test]
    fn parses_real_contract_with_structural_invariants() {
        let contract_file = real_contract_path();
        assert!(
            contract_file.exists(),
            "real contract missing at {}",
            contract_file.display()
        );
        let text = fs::read_to_string(&contract_file).expect("read real contract");
        let parsed: CorpusContract = serde_yaml::from_str(&text).expect("contract deserializes");
        assert_structural_minimums(&parsed).expect("structural minimums hold");

        let invariant_total = parsed.invariants.len();
        assert!(
            invariant_total >= MIN_INVARIANTS,
            "expected >= {} invariants, got {}",
            MIN_INVARIANTS,
            invariant_total
        );
        let falsification_total = parsed.falsification.len();
        assert!(
            falsification_total >= MIN_FALSIFICATIONS,
            "expected >= {} falsification tests, got {}",
            MIN_FALSIFICATIONS,
            falsification_total
        );
        let gate_total = parsed.gates.len();
        assert!(
            gate_total >= MIN_GATES,
            "expected >= {} gates, got {}",
            MIN_GATES,
            gate_total
        );

        // (items, required id prefix, noun used in the failure message)
        let prefix_rules = [
            (&parsed.invariants, "INV-DATA-", "invariant"),
            (&parsed.falsification, "FALSIFY-DATA-", "falsification"),
            (&parsed.gates, "GATE-DATA-", "gate"),
        ];
        for &(items, prefix, noun) in prefix_rules.iter() {
            for item in items {
                assert!(item.id.starts_with(prefix), "bad {} id: {}", noun, item.id);
            }
        }
    }

    /// `validate_contract` accepts the real contract and reports its id, its
    /// item counts, and exactly the required top-level keys.
    #[test]
    fn validate_contract_passes_on_real_contract_and_reports_top_keys() {
        let report = validate_contract(&real_contract_path()).expect("validation passes");
        assert_eq!(report.contract_id, "C-DATA-THESTACK-PYTHON");
        assert_eq!(report.invariants, 7);
        assert_eq!(report.falsification, 5);
        assert_eq!(report.gates, 5);

        let is_reported = |wanted: &str| report.present_top_keys.iter().any(|k| k == wanted);
        for key in REQUIRED_TOP_KEYS {
            assert!(is_reported(key), "missing required top-level key: {key}");
        }
        assert_eq!(report.present_top_keys.len(), REQUIRED_TOP_KEYS.len());
    }
}