// commonmeta 0.9.4
//
// Library for conversions to/from the Commonmeta scholarly metadata format
// Documentation
use std::io::Write as IoWrite;
use std::path::Path;
use std::time::Instant;

use clap::{Arg, ArgMatches, Command};

use crate::cmd::{resolve_db_path, resolve_ror_db_path};

/// Builds the clap definition of the `validate` subcommand.
///
/// The subcommand takes one optional positional argument (`database`), the
/// value options `--from`/`-f`, `--type`, `--number`/`-n` (parsed as `usize`,
/// default `0`), `--report`, `--id` and `--organizations`, and the boolean
/// switches `--fix`, `--recheck`, `--has-ror-id` and `--fill`.
pub fn command() -> Command {
    const ABOUT: &str =
        "Validate records in the local commonmeta database against the v1.0 schema";
    const LONG_ABOUT: &str = "Reads records from the commonmeta SQLite database and validates each one \
        against the commonmeta v1.0 JSON schema. Invalid records are reported with \
        their DOI and a description of each schema violation.\n\n\
        Each record has a `valid` boolean column (default false). Records that pass \
        validation are marked valid = true. Use `--recheck` to skip already-valid \
        records and only process those still marked false.\n\n\
        Use `--fill` to enrich affiliation and organization identifiers: records \
        whose affiliations carry a Crossref Funder ID or ISNI are looked up in the \
        ROR organizations database and updated with the matching ROR URL and display \
        name (asserted_by = \"Commonmeta\"). Affiliations that already hold a ROR id \
        but are missing a name are filled in the same way.\n\n\
        Examples:\n\n\
        commonmeta validate\n\
        commonmeta validate --from datacite\n\
        commonmeta validate --from datacite --type Dataset\n\
        commonmeta validate --number 1000\n\
        commonmeta validate --recheck --fix\n\
        commonmeta validate --report errors.jsonl\n\
        commonmeta validate /path/to/other.sqlite3\n\
        commonmeta validate --fill --organizations /path/to/ror.sqlite3\n\
        commonmeta validate --fill --from crossref --number 10000";

    // Every boolean switch has the same shape: `--<name>`, a help text, SetTrue.
    let switch = |name: &'static str, help: &'static str| {
        Arg::new(name)
            .long(name)
            .help(help)
            .action(clap::ArgAction::SetTrue)
    };

    // Optional positional argument: path of the database to validate.
    let database = Arg::new("database")
        .help("SQLite database path (default: platform commonmeta.sqlite3)")
        .required(false)
        .index(1);

    // Record filters.
    let from = Arg::new("from")
        .long("from")
        .short('f')
        .help("Filter by DOI registration agency (lowercase): crossref, datacite, medra, jalc, cnki, kisti, op, airiti");
    let work_type = Arg::new("type")
        .long("type")
        .help("Filter by work type, e.g. Dataset, JournalArticle");
    let number = Arg::new("number")
        .long("number")
        .short('n')
        .help("Maximum number of records to validate (0 = all)")
        .value_parser(clap::value_parser!(usize))
        .default_value("0");
    let id = Arg::new("id")
        .long("id")
        .help("Validate (and fill) a single record by DOI or full https://doi.org/... URL");

    // Output and auxiliary database locations.
    let report = Arg::new("report")
        .long("report")
        .help("Write validation errors as JSONL to this file instead of stderr");
    let organizations = Arg::new("organizations")
        .long("organizations")
        .help("Path to the ROR organizations SQLite database used by --fill (default: platform ror.sqlite3, env: ROR_DB)");

    // Boolean switches.
    let fix = switch(
        "fix",
        "Repair invalid records in-place by re-applying schema normalization",
    );
    let recheck = switch(
        "recheck",
        "Only validate records not yet marked valid (valid = false); skips records already confirmed valid",
    );
    let has_ror_id = switch(
        "has-ror-id",
        "Only process records that have at least one ROR affiliation ID",
    );
    let fill = switch(
        "fill",
        "Enrich affiliations: replace Crossref Funder IDs and ISNIs with ROR URLs, fill missing names for bare ROR ids",
    );

    // Registration order is kept stable so generated help output is unchanged.
    Command::new("validate")
        .about(ABOUT)
        .long_about(LONG_ABOUT)
        .arg(database)
        .arg(from)
        .arg(work_type)
        .arg(number)
        .arg(report)
        .arg(fix)
        .arg(recheck)
        .arg(id)
        .arg(has_ror_id)
        .arg(fill)
        .arg(organizations)
}

/// Runs the `validate` subcommand with the parsed command-line `matches`.
///
/// Steps, in order:
/// 1. Resolve the database paths and read the filters/flags from `matches`.
/// 2. If `--fill` is set, call `crate::fill_sqlite` before validating.
/// 3. Call `crate::validate_sqlite` with the same filters.
/// 4. Report per-record errors (to the `--report` file as JSONL, otherwise to
///    stderr) and print a one-line summary to stdout.
///
/// # Errors
///
/// Returns `Err` with a human-readable message when:
/// - the commonmeta database file does not exist;
/// - `--fill` is set, `--organizations` was given explicitly, and that file
///   does not exist (a missing *default* organizations database only prints a
///   warning and skips the fill step);
/// - `crate::fill_sqlite` or `crate::validate_sqlite` fails;
/// - `--id` was given but no record was checked;
/// - the report file cannot be created or written;
/// - at least one record failed schema validation.
pub fn execute(matches: &ArgMatches) -> Result<(), String> {
    let db_path = resolve_db_path(matches.get_one::<String>("database"));
    let from_raw = matches.get_one::<String>("from").map(String::as_str);
    let work_type = matches.get_one::<String>("type").map(String::as_str);
    let record_id = matches.get_one::<String>("id").map(|s| {
        // Accept both bare DOIs and https://doi.org/... URLs; works.id stores full URLs.
        // If normalization yields nothing, fall back to the value as typed.
        let n = crate::doi_utils::normalize_doi(s);
        if n.is_empty() { s.clone() } else { n }
    });
    let number = *matches.get_one::<usize>("number").unwrap_or(&0);
    let error_report = matches.get_one::<String>("report");
    let fix = matches.get_flag("fix");
    let recheck = matches.get_flag("recheck");
    let has_ror_id = matches.get_flag("has-ror-id");
    let fill = matches.get_flag("fill");
    let ror_db_path = resolve_ror_db_path(matches.get_one::<String>("organizations"));

    // Map the case-insensitive agency keyword to the spelling passed on as the
    // provider filter; unrecognized values are passed through unchanged.
    let provider = from_raw.map(|s| match s.to_lowercase().as_str() {
        "crossref" => "Crossref",
        "datacite" => "DataCite",
        "medra" => "mEDRA",
        "jalc" => "JaLC",
        "cnki" => "CNKI",
        "kisti" => "KISTI",
        "op" => "OP",
        "airiti" => "Airiti",
        _ => s,
    });

    if !Path::new(&db_path).exists() {
        return Err(format!("database not found: {}", db_path));
    }

    // --fill: run before validation so repaired affiliations are visible to the schema check.
    // If --organizations was not given and the default path doesn't exist, warn and skip.
    if fill {
        let ror_explicit = matches.get_one::<String>("organizations").is_some();
        if !Path::new(&ror_db_path).exists() {
            if ror_explicit {
                return Err(format!(
                    "organizations database not found: {}",
                    ror_db_path
                ));
            }
            eprintln!(
                "validate: --fill skipped — organizations database not found at {} \
                 (use --organizations to specify a path)",
                ror_db_path
            );
        } else {
            eprintln!("validate: --fill from {}", ror_db_path);
            let start = Instant::now();
            let report = crate::fill_sqlite(
                Path::new(&db_path),
                Path::new(&ror_db_path),
                provider,
                work_type,
                record_id.as_deref(),
                has_ror_id,
                number,
            )
            .map_err(|e| e.to_string())?;
            eprintln!(
                "validate: fill — {} records checked, {} changed, {} affiliations filled in {:.2?}",
                report.total, report.changed, report.affiliations_filled, start.elapsed()
            );
        }
    }

    // Echo the effective settings to stderr before the (possibly long) run.
    eprintln!("validate: reading from {}", db_path);
    if recheck {
        eprintln!("validate: --recheck — only records not yet marked valid");
    }
    if let Some(p) = provider {
        eprintln!("validate: provider filter = {}", p);
    }
    if let Some(t) = work_type {
        eprintln!("validate: type filter    = {}", t);
    }
    if let Some(f) = error_report {
        eprintln!("validate: errors → {}", f);
    }
    if fix {
        eprintln!("validate: --fix enabled, repairing invalid records in-place");
    }

    let start = Instant::now();
    let report = crate::validate_sqlite(
        Path::new(&db_path),
        provider,
        work_type,
        record_id.as_deref(),
        has_ror_id,
        number,
        fix,
        recheck,
    )
    .map_err(|e| e.to_string())?;

    // A single-record request that checked nothing is reported as "not found".
    if report.total == 0 {
        if let Some(id) = record_id.as_deref() {
            return Err(format!("record not found: {}", id));
        }
    }

    eprintln!(
        "validate: checked {} records in {:.2?}",
        report.total,
        start.elapsed()
    );

    // Note: the report file is only created when at least one record is
    // invalid; a run with no invalid records leaves any existing file untouched.
    if report.invalid > 0 {
        match error_report {
            Some(path) => {
                let file = std::fs::File::create(path)
                    .map_err(|e| format!("cannot create '{}': {}", path, e))?;
                // Buffer the output: one JSON line per record would otherwise
                // be a separate write call on the file for every record.
                let mut out = std::io::BufWriter::new(file);
                for ve in &report.errors {
                    let line = serde_json::json!({
                        "id": ve.id,
                        "errors": ve.errors,
                    });
                    writeln!(out, "{}", line)
                        .map_err(|e| format!("write error: {}", e))?;
                }
                // Flush explicitly so a failing final write is reported
                // instead of being silently dropped when the writer goes away.
                out.flush().map_err(|e| format!("write error: {}", e))?;
            }
            None => {
                for ve in &report.errors {
                    for err in &ve.errors {
                        eprintln!("  INVALID {}: {}", ve.id, err);
                    }
                }
            }
        }
    }

    if report.fixed > 0 {
        eprintln!("validate: {} record(s) repaired in-place", report.fixed);
    }
    println!(
        "validate: {} records — {} valid, {} invalid{}",
        report.total,
        report.valid,
        report.invalid,
        if report.fixed > 0 { format!(" ({} fixed)", report.fixed) } else { String::new() },
    );

    // A non-zero invalid count makes the whole command fail.
    if report.invalid > 0 {
        Err(format!("{} record(s) failed schema validation", report.invalid))
    } else {
        Ok(())
    }
}