use std::io::Write as IoWrite;
use std::path::Path;
use std::time::Instant;
use clap::{Arg, ArgMatches, Command};
use crate::cmd::{resolve_db_path, resolve_ror_db_path};
pub fn command() -> Command {
Command::new("validate")
.about("Validate records in the local commonmeta database against the v1.0 schema")
.long_about(
"Reads records from the commonmeta SQLite database and validates each one \
against the commonmeta v1.0 JSON schema. Invalid records are reported with \
their DOI and a description of each schema violation.\n\n\
Each record has a `valid` boolean column (default false). Records that pass \
validation are marked valid = true. Use `--recheck` to skip already-valid \
records and only process those still marked false.\n\n\
Use `--fill` to enrich affiliation and organization identifiers: records \
whose affiliations carry a Crossref Funder ID or ISNI are looked up in the \
ROR organizations database and updated with the matching ROR URL and display \
name (asserted_by = \"Commonmeta\"). Affiliations that already hold a ROR id \
but are missing a name are filled in the same way.\n\n\
Examples:\n\n\
commonmeta validate\n\
commonmeta validate --from datacite\n\
commonmeta validate --from datacite --type Dataset\n\
commonmeta validate --number 1000\n\
commonmeta validate --recheck --fix\n\
commonmeta validate --report errors.jsonl\n\
commonmeta validate /path/to/other.sqlite3\n\
commonmeta validate --fill --organizations /path/to/ror.sqlite3\n\
commonmeta validate --fill --from crossref --number 10000",
)
.arg(
Arg::new("database")
.help("SQLite database path (default: platform commonmeta.sqlite3)")
.required(false)
.index(1),
)
.arg(
Arg::new("from")
.long("from")
.short('f')
.help("Filter by DOI registration agency (lowercase): crossref, datacite, medra, jalc, cnki, kisti, op, airiti"),
)
.arg(
Arg::new("type")
.long("type")
.help("Filter by work type, e.g. Dataset, JournalArticle"),
)
.arg(
Arg::new("number")
.long("number")
.short('n')
.help("Maximum number of records to validate (0 = all)")
.value_parser(clap::value_parser!(usize))
.default_value("0"),
)
.arg(
Arg::new("report")
.long("report")
.help("Write validation errors as JSONL to this file instead of stderr"),
)
.arg(
Arg::new("fix")
.long("fix")
.help("Repair invalid records in-place by re-applying schema normalization")
.action(clap::ArgAction::SetTrue),
)
.arg(
Arg::new("recheck")
.long("recheck")
.help("Only validate records not yet marked valid (valid = false); skips records already confirmed valid")
.action(clap::ArgAction::SetTrue),
)
.arg(
Arg::new("id")
.long("id")
.help("Validate (and fill) a single record by DOI or full https://doi.org/... URL"),
)
.arg(
Arg::new("has-ror-id")
.long("has-ror-id")
.help("Only process records that have at least one ROR affiliation ID")
.action(clap::ArgAction::SetTrue),
)
.arg(
Arg::new("fill")
.long("fill")
.help("Enrich affiliations: replace Crossref Funder IDs and ISNIs with ROR URLs, fill missing names for bare ROR ids")
.action(clap::ArgAction::SetTrue),
)
.arg(
Arg::new("organizations")
.long("organizations")
.help("Path to the ROR organizations SQLite database used by --fill (default: platform ror.sqlite3, env: ROR_DB)"),
)
}
/// Runs the `validate` subcommand with the parsed command-line arguments.
///
/// Steps, in order:
/// 1. Resolve the database paths and read all filters and flags from `matches`.
/// 2. If `--fill` is set, run `crate::fill_sqlite` first. When the
///    organizations database is missing and was not given explicitly via
///    `--organizations`, the fill step is skipped with a warning.
/// 3. Run `crate::validate_sqlite` and print progress to stderr.
/// 4. Report every schema violation, either as JSONL to the `--report` file
///    or line by line to stderr.
/// 5. Print the summary line to stdout.
///
/// # Errors
///
/// Returns `Err` with a human-readable message when:
/// - the commonmeta database file does not exist;
/// - `--fill` is used with an explicit `--organizations` path that does not exist;
/// - `crate::fill_sqlite` or `crate::validate_sqlite` fails;
/// - `--id` was given but no record was checked;
/// - the report file cannot be created, written, or flushed;
/// - at least one record failed schema validation.
pub fn execute(matches: &ArgMatches) -> Result<(), String> {
    let db_path = resolve_db_path(matches.get_one::<String>("database"));
    let from_raw = matches.get_one::<String>("from").map(String::as_str);
    let work_type = matches.get_one::<String>("type").map(String::as_str);
    // Prefer the normalized DOI; fall back to the raw input when
    // normalization yields an empty string.
    let record_id = matches.get_one::<String>("id").map(|s| {
        let n = crate::doi_utils::normalize_doi(s);
        if n.is_empty() { s.clone() } else { n }
    });
    let number = *matches.get_one::<usize>("number").unwrap_or(&0);
    let error_report = matches.get_one::<String>("report");
    let fix = matches.get_flag("fix");
    let recheck = matches.get_flag("recheck");
    let has_ror_id = matches.get_flag("has-ror-id");
    let fill = matches.get_flag("fill");
    let ror_db_path = resolve_ror_db_path(matches.get_one::<String>("organizations"));

    // Map the case-insensitive agency name to its canonical spelling; unknown
    // values are passed through unchanged.
    let provider = from_raw.map(|s| match s.to_lowercase().as_str() {
        "crossref" => "Crossref",
        "datacite" => "DataCite",
        "medra" => "mEDRA",
        "jalc" => "JaLC",
        "cnki" => "CNKI",
        "kisti" => "KISTI",
        "op" => "OP",
        "airiti" => "Airiti",
        _ => s,
    });

    if !Path::new(&db_path).exists() {
        return Err(format!("database not found: {}", db_path));
    }

    if fill {
        let ror_explicit = matches.get_one::<String>("organizations").is_some();
        if !Path::new(&ror_db_path).exists() {
            // A missing database is fatal only when the user named it
            // explicitly; a missing default merely disables the fill step.
            if ror_explicit {
                return Err(format!(
                    "organizations database not found: {}",
                    ror_db_path
                ));
            }
            eprintln!(
                "validate: --fill skipped — organizations database not found at {} \
                 (use --organizations to specify a path)",
                ror_db_path
            );
        } else {
            eprintln!("validate: --fill from {}", ror_db_path);
            let start = Instant::now();
            let report = crate::fill_sqlite(
                Path::new(&db_path),
                Path::new(&ror_db_path),
                provider,
                work_type,
                record_id.as_deref(),
                has_ror_id,
                number,
            )
            .map_err(|e| e.to_string())?;
            eprintln!(
                "validate: fill — {} records checked, {} changed, {} affiliations filled in {:.2?}",
                report.total, report.changed, report.affiliations_filled, start.elapsed()
            );
        }
    }

    eprintln!("validate: reading from {}", db_path);
    if recheck {
        eprintln!("validate: --recheck — only records not yet marked valid");
    }
    if let Some(p) = provider {
        eprintln!("validate: provider filter = {}", p);
    }
    if let Some(t) = work_type {
        eprintln!("validate: type filter = {}", t);
    }
    if let Some(f) = error_report {
        eprintln!("validate: errors → {}", f);
    }
    if fix {
        eprintln!("validate: --fix enabled, repairing invalid records in-place");
    }

    let start = Instant::now();
    let report = crate::validate_sqlite(
        Path::new(&db_path),
        provider,
        work_type,
        record_id.as_deref(),
        has_ror_id,
        number,
        fix,
        recheck,
    )
    .map_err(|e| e.to_string())?;

    // A single-record request that checked nothing is reported as not found.
    if let Some(id) = record_id.as_deref() {
        if report.total == 0 {
            return Err(format!("record not found: {}", id));
        }
    }

    eprintln!(
        "validate: checked {} records in {:.2?}",
        report.total,
        start.elapsed()
    );

    if report.invalid > 0 {
        match error_report {
            Some(path) => {
                let file = std::fs::File::create(path)
                    .map_err(|e| format!("cannot create '{}': {}", path, e))?;
                // Buffer the output so each record does not cost a syscall.
                let mut out = std::io::BufWriter::new(file);
                for ve in &report.errors {
                    let line = serde_json::json!({
                        "id": ve.id,
                        "errors": ve.errors,
                    });
                    writeln!(out, "{}", line)
                        .map_err(|e| format!("write error: {}", e))?;
                }
                // Flush explicitly: dropping a BufWriter discards flush errors.
                out.flush().map_err(|e| format!("write error: {}", e))?;
            }
            None => {
                for ve in &report.errors {
                    for err in &ve.errors {
                        eprintln!("  INVALID {}: {}", ve.id, err);
                    }
                }
            }
        }
    }

    if report.fixed > 0 {
        eprintln!("validate: {} record(s) repaired in-place", report.fixed);
    }

    println!(
        "validate: {} records — {} valid, {} invalid{}",
        report.total,
        report.valid,
        report.invalid,
        if report.fixed > 0 { format!(" ({} fixed)", report.fixed) } else { String::new() },
    );

    // Signal failure to the caller whenever any record is invalid.
    if report.invalid > 0 {
        Err(format!("{} record(s) failed schema validation", report.invalid))
    } else {
        Ok(())
    }
}