commonmeta 0.9.6

Library for conversions to/from the Commonmeta scholarly metadata format
Documentation
use clap::{Arg, ArgAction, ArgMatches, Command};
use std::path::Path;
use std::time::Instant;

use crate::JunctionTable;

use crate::cmd::resolve_db_path;

/// Builds the clap definition of the `migrate` subcommand.
///
/// The subcommand declares one value-taking option, `--file`, and seven
/// boolean switches: `--backfill`, `--orcid`, `--ror`, `--references`,
/// `--crossref`, `--datacite` and `--rebuild-fts`. Arguments are registered
/// in that order.
pub fn command() -> Command {
    // All switches share one shape: the long name equals the argument id and
    // the presence of the flag sets it to `true`.
    let switch = |id: &'static str, help: &'static str| {
        Arg::new(id).long(id).help(help).action(ArgAction::SetTrue)
    };

    // Extended help shown by `--help`. The trailing backslashes join the
    // source lines, so only the explicit `\n` escapes produce line breaks.
    let long_about = "Open the local commonmeta database and apply any pending schema \
         migrations, then exit. Safe to run on databases built with older \
         releases: migrations are idempotent and only alter the schema \
         (no existing records are modified or removed).\n\n\
         Each step is printed to stderr with elapsed time. On large databases \
         (hundreds of millions of records) index creation can take 30–90 minutes; \
         the command will not hang silently.\n\n\
         Junction-table backfills populate works_orcid, works_ror, or \
         works_references for existing records. Required once on databases \
         built before the junction tables were introduced. Each flag tracks \
         its own resume cursor in the settings table so runs can be \
         interrupted and restarted safely. --backfill runs all three in a \
         single streaming pass (most efficient for large corpora).\n\n\
         Examples:\n\n\
         commonmeta migrate\n\
         commonmeta migrate --backfill\n\
         commonmeta migrate --references\n\
         commonmeta migrate --orcid --ror\n\
         commonmeta migrate --file /var/lib/commonmeta/commonmeta.sqlite3";

    // The only argument that carries a value.
    let file_option = Arg::new("file")
        .long("file")
        .help("Path to the SQLite database (overrides COMMONMETA_DB and platform default)");

    Command::new("migrate")
        .about("Apply any pending database schema migrations")
        .long_about(long_about)
        .arg(file_option)
        .arg(switch(
            "backfill",
            "Populate all junction tables (works_orcid, works_ror, works_references) in one pass",
        ))
        .arg(switch("orcid", "Populate works_orcid for all existing works"))
        .arg(switch("ror", "Populate works_ror for all existing works"))
        .arg(switch(
            "references",
            "Populate works_references for all existing works",
        ))
        .arg(switch("crossref", "Restrict backfill to Crossref works only"))
        .arg(switch("datacite", "Restrict backfill to DataCite works only"))
        .arg(switch(
            "rebuild-fts",
            "Drop and rebuild all three FTS5 indexes (works_fts, people_fts, organizations_fts)",
        ))
}

/// Runs the `migrate` subcommand.
///
/// Steps, in order:
/// 1. Resolve the database path from `--file` (via `resolve_db_path`) and
///    fail if nothing exists at that path.
/// 2. Apply pending schema migrations with `crate::run_migrations`.
/// 3. If any junction table was selected (`--backfill`, `--orcid`, `--ror`,
///    `--references`), backfill it, optionally restricted to the providers
///    chosen with `--crossref` / `--datacite`.
/// 4. If `--rebuild-fts` was given, rebuild the FTS indexes.
///
/// Progress is written to stderr; a one-line summary per phase is written to
/// stdout.
///
/// # Errors
///
/// Returns the stringified error when the database file is missing, or when
/// the migrations, the backfill, or the `works_fts` rebuild fail. Failures
/// while rebuilding `organizations_fts` or `people_fts` are only logged.
pub fn execute(matches: &ArgMatches) -> Result<(), String> {
    let db_path_str = resolve_db_path(matches.get_one::<String>("file"));
    let db_path = Path::new(&db_path_str);

    if !db_path.exists() {
        return Err(format!(
            "database not found at '{}'; run 'commonmeta import' first",
            db_path_str
        ));
    }

    eprintln!("migrate: {}", db_path_str);
    let total_start = Instant::now();

    let (applied, version) = crate::run_migrations(db_path).map_err(|e| e.to_string())?;

    if applied == 0 {
        eprintln!("migrate: already at schema version {version}, nothing to do");
    } else {
        eprintln!(
            "migrate: applied {} step(s), schema version {} ({:.1?})",
            applied,
            version,
            total_start.elapsed()
        );
    }

    let backfill_all = matches.get_flag("backfill");
    let tables = selected_tables(matches, backfill_all);
    let providers = selected_providers(matches);

    if tables.is_empty() {
        // A provider filter only narrows a backfill; without a selected table
        // it does nothing, so tell the user instead of ignoring it silently.
        if !providers.is_empty() {
            eprintln!(
                "migrate: warning: --crossref/--datacite have no effect without \
                 --backfill, --orcid, --ror or --references"
            );
        }
        println!("migrated {} (schema version {})", db_path_str, version);
    } else {
        let label = backfill_label(&tables, backfill_all, &providers);
        eprintln!("migrate: backfilling {label}");
        let bf_start = Instant::now();
        let (scanned, inserted) = crate::backfill_junction_tables(db_path, &tables, &providers)
            .map_err(|e| e.to_string())?;
        eprintln!(
            "migrate: backfill complete — {scanned} works scanned, {inserted} rows indexed ({:.1?})",
            bf_start.elapsed()
        );
        println!(
            "backfilled {label} in {} (schema version {version}, {inserted} rows)",
            db_path_str
        );
    }

    if matches.get_flag("rebuild-fts") {
        rebuild_fts_indexes(db_path, total_start)?;
    }

    Ok(())
}

/// Returns the junction tables selected on the command line, in the fixed
/// order orcid, ror, references. `backfill_all` selects all three.
fn selected_tables(matches: &ArgMatches, backfill_all: bool) -> Vec<JunctionTable> {
    if backfill_all {
        return vec![
            JunctionTable::Orcid,
            JunctionTable::Ror,
            JunctionTable::References,
        ];
    }

    let mut tables = Vec::new();
    if matches.get_flag("orcid") {
        tables.push(JunctionTable::Orcid);
    }
    if matches.get_flag("ror") {
        tables.push(JunctionTable::Ror);
    }
    if matches.get_flag("references") {
        tables.push(JunctionTable::References);
    }
    tables
}

/// Returns the provider names selected with `--crossref` / `--datacite`.
/// An empty list means no provider restriction was requested.
fn selected_providers(matches: &ArgMatches) -> Vec<&'static str> {
    let mut providers = Vec::new();
    if matches.get_flag("crossref") {
        providers.push("Crossref");
    }
    if matches.get_flag("datacite") {
        providers.push("DataCite");
    }
    providers
}

/// Maps a junction table to the suffix of its `works_*` table name.
fn junction_table_suffix(table: &JunctionTable) -> &'static str {
    match table {
        JunctionTable::Orcid => "orcid",
        JunctionTable::Ror => "ror",
        JunctionTable::References => "references",
    }
}

/// Builds the human-readable description of a backfill run, e.g.
/// `works_orcid, works_ror (Crossref)` or `all junction tables`.
fn backfill_label(tables: &[JunctionTable], backfill_all: bool, providers: &[&str]) -> String {
    let table_label = if backfill_all {
        "all junction tables".to_string()
    } else {
        tables
            .iter()
            .map(|t| format!("works_{}", junction_table_suffix(t)))
            .collect::<Vec<_>>()
            .join(", ")
    };

    if providers.is_empty() {
        table_label
    } else {
        format!("{table_label} ({})", providers.join(", "))
    }
}

/// Rebuilds `works_fts` (failure is fatal), then `organizations_fts` and
/// `people_fts` (failure is logged and skipped). The final stdout line
/// reports the time elapsed since `total_start`.
fn rebuild_fts_indexes(db_path: &Path, total_start: Instant) -> Result<(), String> {
    let fts_start = Instant::now();
    eprintln!("migrate: rebuilding works_fts …");
    crate::rebuild_works_fts(db_path).map_err(|e| e.to_string())?;
    eprintln!("migrate: works_fts done ({:.1?})", fts_start.elapsed());

    // organizations_fts and people_fts live in the same file when ROR/ORCID
    // data was imported into the same database. Log and skip if absent.
    rebuild_optional_fts("organizations_fts", || {
        crate::rebuild_organizations_fts(db_path)
    });
    rebuild_optional_fts("people_fts", || crate::rebuild_people_fts(db_path));

    println!("rebuild-fts complete ({:.1?})", total_start.elapsed());
    Ok(())
}

/// Runs one best-effort FTS rebuild, logging its start and then either its
/// duration or the reason it was skipped. Never propagates the error.
fn rebuild_optional_fts<E, F>(index: &str, rebuild: F)
where
    E: std::fmt::Display,
    F: FnOnce() -> Result<(), E>,
{
    let started = Instant::now();
    eprintln!("migrate: rebuilding {index} …");
    match rebuild() {
        Ok(()) => eprintln!("migrate: {index} done ({:.1?})", started.elapsed()),
        Err(e) => eprintln!("migrate: {index} skipped — {e}"),
    }
}