use std::io::Write as IoWrite;
use std::path::Path;
use std::time::Instant;
use clap::{Arg, ArgMatches, Command};
use commonmeta;
use crate::cmd::resolve_db_path;
/// Builds the clap definition of the `validate` subcommand.
///
/// Declares one optional positional argument (`database`), the value options
/// `--from`/`-f`, `--type`, `--number`/`-n` (parsed as `usize`, default `0`)
/// and `--report`, plus the boolean flags `--fix` and `--recheck`.
pub fn command() -> Command {
    Command::new("validate")
        .about("Validate records in the local commonmeta database against the v1.0 schema")
        .long_about(
            // Each line ends in a `\` continuation, which also swallows the
            // leading whitespace of the next source line, so the indentation
            // here never reaches the rendered help text.
            "Reads records from the commonmeta SQLite database and validates each one \
             against the commonmeta v1.0 JSON schema. Invalid records are reported with \
             their DOI and a description of each schema violation.\n\n\
             Errors are persisted to a `validation_errors` table in the database so that \
             `--recheck` can re-validate only the records that failed last time.\n\n\
             Examples:\n\n\
             commonmeta validate\n\
             commonmeta validate --from datacite\n\
             commonmeta validate --from datacite --type Dataset\n\
             commonmeta validate --number 1000\n\
             commonmeta validate --recheck --fix\n\
             commonmeta validate --report errors.jsonl\n\
             commonmeta validate /path/to/other.sqlite3",
        )
        .arg(
            Arg::new("database")
                .help("SQLite database path (default: platform commonmeta.sqlite3)")
                .required(false)
                .index(1),
        )
        .arg(
            Arg::new("from")
                .long("from")
                .short('f')
                // Lists every provider name that `execute` recognises and
                // maps to its canonical spelling.
                .help("Filter by provider: crossref, datacite, openalex, inveniordm"),
        )
        .arg(
            Arg::new("type")
                .long("type")
                .help("Filter by work type, e.g. Dataset, JournalArticle"),
        )
        .arg(
            Arg::new("number")
                .long("number")
                .short('n')
                .help("Maximum number of records to validate (0 = all)")
                .value_parser(clap::value_parser!(usize))
                .default_value("0"),
        )
        .arg(
            Arg::new("report")
                .long("report")
                .help("Write validation errors as JSONL to this file instead of stderr"),
        )
        .arg(
            Arg::new("fix")
                .long("fix")
                .help("Repair invalid records in-place by re-applying schema normalization")
                .action(clap::ArgAction::SetTrue),
        )
        .arg(
            Arg::new("recheck")
                .long("recheck")
                .help("Only re-validate records that failed in the previous run (reads from the validation_errors table)")
                .action(clap::ArgAction::SetTrue),
        )
}
/// Runs the `validate` subcommand.
///
/// Reads the database path, the provider/type filters, the record limit and
/// the `--fix`/`--recheck`/`--report` settings from `matches`, then calls
/// `commonmeta::validate_sqlite` with them. Progress messages go to stderr.
/// Per-record errors go to stderr too, or, when `--report` is given, are
/// written as one JSON object per line (`id`, `errors`) to that file. A
/// one-line summary is printed to stdout.
///
/// # Errors
///
/// Returns an error message when the database file does not exist, when
/// `validate_sqlite` fails, when the report file cannot be created, written or
/// flushed, or when at least one record is reported as invalid.
pub fn execute(matches: &ArgMatches) -> Result<(), String> {
    let db_path = resolve_db_path(matches.get_one::<String>("database"));
    let from = matches.get_one::<String>("from").map(String::as_str);
    let work_type = matches.get_one::<String>("type").map(String::as_str);
    // `number` is declared with a default value, so the fallback is only defensive.
    let number = matches.get_one::<usize>("number").copied().unwrap_or(0);
    let error_report = matches.get_one::<String>("report");
    let fix = matches.get_flag("fix");
    let recheck = matches.get_flag("recheck");
    let provider = from.map(canonical_provider);

    if !Path::new(&db_path).exists() {
        return Err(format!("database not found: {}", db_path));
    }

    // Echo the effective settings before the validation run starts.
    eprintln!("validate: reading from {}", db_path);
    if recheck {
        eprintln!("validate: --recheck — only records with errors from the previous run");
    }
    if let Some(p) = provider {
        eprintln!("validate: provider filter = {}", p);
    }
    if let Some(t) = work_type {
        eprintln!("validate: type filter = {}", t);
    }
    if let Some(f) = error_report {
        eprintln!("validate: errors → {}", f);
    }
    if fix {
        eprintln!("validate: --fix enabled, repairing invalid records in-place");
    }

    let start = Instant::now();
    let report = commonmeta::validate_sqlite(
        Path::new(&db_path),
        provider,
        work_type,
        number,
        fix,
        recheck,
    )
    .map_err(|e| e.to_string())?;
    eprintln!(
        "validate: checked {} records in {:.2?}",
        report.total,
        start.elapsed()
    );

    if report.invalid > 0 {
        match error_report {
            Some(path) => {
                let file = std::fs::File::create(path)
                    .map_err(|e| format!("cannot create '{}': {}", path, e))?;
                // Buffer the output so each record does not cost a separate
                // write to the file.
                let mut out = std::io::BufWriter::new(file);
                for ve in &report.errors {
                    let line = serde_json::json!({
                        "id": ve.id,
                        "errors": ve.errors,
                    });
                    writeln!(out, "{}", line).map_err(|e| format!("write error: {}", e))?;
                }
                // Flush explicitly: dropping a `BufWriter` ignores any error
                // from the final write, which would hide a truncated report.
                out.flush().map_err(|e| format!("write error: {}", e))?;
            }
            None => {
                for ve in &report.errors {
                    for err in &ve.errors {
                        eprintln!("  INVALID {}: {}", ve.id, err);
                    }
                }
            }
        }
    }

    if report.fixed > 0 {
        eprintln!("validate: {} record(s) repaired in-place", report.fixed);
    }
    println!(
        "validate: {} records — {} valid, {} invalid{}",
        report.total,
        report.valid,
        report.invalid,
        if report.fixed > 0 {
            format!(" ({} fixed)", report.fixed)
        } else {
            String::new()
        },
    );

    // Any invalid record turns the whole run into an error for the caller.
    if report.invalid > 0 {
        Err(format!("{} record(s) failed schema validation", report.invalid))
    } else {
        Ok(())
    }
}

/// Maps a provider name given on the command line to its canonical spelling.
///
/// The comparison is case-insensitive. Names other than the four known
/// providers are returned unchanged.
fn canonical_provider(name: &str) -> &str {
    match name.to_lowercase().as_str() {
        "datacite" => "DataCite",
        "crossref" => "Crossref",
        "openalex" => "OpenAlex",
        "inveniordm" => "InvenioRDM",
        _ => name,
    }
}