commonmeta 0.8.18

Library for conversions to/from the Commonmeta scholarly metadata format
Documentation
use std::io::Write as IoWrite;
use std::path::Path;
use std::time::Instant;

use clap::{Arg, ArgMatches, Command};

use commonmeta;

use crate::cmd::resolve_db_path;

/// Builds the clap definition of the `validate` subcommand.
///
/// The command takes one optional positional argument (`database`) and the
/// options `--from`/`-f`, `--type`, `--number`/`-n` (parsed as `usize`,
/// default `0`), `--report`, and the boolean flags `--fix` and `--recheck`.
/// The parsed matches are meant to be handed to [`execute`].
pub fn command() -> Command {
    Command::new("validate")
        .about("Validate records in the local commonmeta database against the v1.0 schema")
        .long_about(
            "Reads records from the commonmeta SQLite database and validates each one \
            against the commonmeta v1.0 JSON schema. Invalid records are reported with \
            their DOI and a description of each schema violation.\n\n\
            Each record has a `valid` boolean column (default false). Records that pass \
            validation are marked valid = true. Use `--recheck` to skip already-valid \
            records and only process those still marked false.\n\n\
            Examples:\n\n\
            commonmeta validate\n\
            commonmeta validate --from datacite\n\
            commonmeta validate --from datacite --type Dataset\n\
            commonmeta validate --number 1000\n\
            commonmeta validate --recheck --fix\n\
            commonmeta validate --report errors.jsonl\n\
            commonmeta validate /path/to/other.sqlite3",
        )
        .arg(
            Arg::new("database")
                .help("SQLite database path (default: platform commonmeta.sqlite3)")
                .required(false)
                .index(1),
        )
        .arg(
            Arg::new("from")
                .long("from")
                .short('f')
                // Keep this list in sync with the provider names that `execute`
                // normalizes (it also recognizes `inveniordm`).
                .help("Filter by provider: crossref, datacite, openalex, inveniordm"),
        )
        .arg(
            Arg::new("type")
                .long("type")
                .help("Filter by work type, e.g. Dataset, JournalArticle"),
        )
        .arg(
            Arg::new("number")
                .long("number")
                .short('n')
                .help("Maximum number of records to validate (0 = all)")
                .value_parser(clap::value_parser!(usize))
                .default_value("0"),
        )
        .arg(
            Arg::new("report")
                .long("report")
                .help("Write validation errors as JSONL to this file instead of stderr"),
        )
        .arg(
            Arg::new("fix")
                .long("fix")
                .help("Repair invalid records in-place by re-applying schema normalization")
                .action(clap::ArgAction::SetTrue),
        )
        .arg(
            Arg::new("recheck")
                .long("recheck")
                .help("Only validate records not yet marked valid (valid = false); skips records already confirmed valid")
                .action(clap::ArgAction::SetTrue),
        )
}

/// Runs the `validate` subcommand with the parsed command-line `matches`.
///
/// Resolves the database path and the optional provider/type filters, calls
/// `commonmeta::validate_sqlite`, and reports the outcome. Progress and
/// diagnostics go to stderr; the final one-line summary goes to stdout. When
/// invalid records were found, their errors are written as JSONL (one object
/// with `id` and `errors` per line) to the `--report` file if one was given,
/// otherwise they are printed to stderr.
///
/// # Errors
///
/// Returns `Err` with a human-readable message when:
/// - the database file does not exist,
/// - `commonmeta::validate_sqlite` fails,
/// - the report file cannot be created or written,
/// - the validation report counts at least one invalid record.
pub fn execute(matches: &ArgMatches) -> Result<(), String> {
    let db_path = resolve_db_path(matches.get_one::<String>("database"));
    let from = matches.get_one::<String>("from").map(String::as_str);
    let work_type = matches.get_one::<String>("type").map(String::as_str);
    let number = *matches.get_one::<usize>("number").unwrap_or(&0);
    let error_report = matches.get_one::<String>("report");
    let fix = matches.get_flag("fix");
    let recheck = matches.get_flag("recheck");

    // Map well-known provider names (matched case-insensitively) to a fixed
    // spelling; any other value is passed through unchanged.
    let provider = from.map(|s| match s.to_lowercase().as_str() {
        "datacite"   => "DataCite",
        "crossref"   => "Crossref",
        "openalex"   => "OpenAlex",
        "inveniordm" => "InvenioRDM",
        _            => s,
    });

    if !Path::new(&db_path).exists() {
        return Err(format!("database not found: {}", db_path));
    }

    eprintln!("validate: reading from {}", db_path);
    if recheck {
        eprintln!("validate: --recheck — only records not yet marked valid");
    }
    if let Some(p) = provider {
        eprintln!("validate: provider filter = {}", p);
    }
    if let Some(t) = work_type {
        eprintln!("validate: type filter    = {}", t);
    }
    if let Some(f) = error_report {
        eprintln!("validate: errors → {}", f);
    }
    if fix {
        eprintln!("validate: --fix enabled, repairing invalid records in-place");
    }

    let start = Instant::now();
    let report = commonmeta::validate_sqlite(
        Path::new(&db_path), provider, work_type, number, fix, recheck,
    ).map_err(|e| e.to_string())?;

    eprintln!(
        "validate: checked {} records in {:.2?}",
        report.total,
        start.elapsed()
    );

    if report.invalid > 0 {
        match error_report {
            Some(path) => {
                let file = std::fs::File::create(path)
                    .map_err(|e| format!("cannot create '{}': {}", path, e))?;
                // Buffer the output so each record does not cost a separate
                // write syscall on the underlying file.
                let mut out = std::io::BufWriter::new(file);
                for ve in &report.errors {
                    let line = serde_json::json!({
                        "id": ve.id,
                        "errors": ve.errors,
                    });
                    writeln!(out, "{}", line)
                        .map_err(|e| format!("write error: {}", e))?;
                }
                // Flush explicitly: dropping a BufWriter discards any error
                // from the final write, which would hide a truncated report.
                out.flush()
                    .map_err(|e| format!("write error: {}", e))?;
            }
            None => {
                for ve in &report.errors {
                    for err in &ve.errors {
                        eprintln!("  INVALID {}: {}", ve.id, err);
                    }
                }
            }
        }
    }

    if report.fixed > 0 {
        eprintln!("validate: {} record(s) repaired in-place", report.fixed);
    }
    println!(
        "validate: {} records — {} valid, {} invalid{}",
        report.total,
        report.valid,
        report.invalid,
        if report.fixed > 0 { format!(" ({} fixed)", report.fixed) } else { String::new() },
    );

    // A non-zero invalid count is surfaced as an error so the caller can
    // turn it into a failing exit status.
    if report.invalid > 0 {
        Err(format!("{} record(s) failed schema validation", report.invalid))
    } else {
        Ok(())
    }
}