commonmeta 0.9.4

Library for conversions to/from the Commonmeta scholarly metadata format
Documentation
use clap::{Arg, ArgMatches, Command};
use std::path::Path;

use crate::cmd::resolve_db_path;

const STATS_TABLES: &[&str] = &[
    "works",
    "organizations",
    "people",
    "prefixes",
    "works_ror",
    "works_orcid",
    "works_references",
];

pub fn command() -> Command {
    Command::new("settings")
        .about("Show key/value settings stored in the local SQLite database")
        .long_about(
            "Read and display all rows from the `settings` table of the local \
             commonmeta SQLite database. Settings record installed vocabulary \
             versions and bulk-import dates.\n\n\
             Use --stats to show record counts and sizes for the main tables instead. \
             Counts are returned instantly even on 300M-row tables because MAX(rowid) \
             is used rather than COUNT(*). Add --providers to also show the works \
             breakdown by provider (requires a full COUNT(*) scan — slow on large tables).\n\n\
             Examples:\n\n\
             commonmeta settings\n\
             commonmeta settings --stats\n\
             commonmeta settings --stats --providers\n\
             commonmeta settings --file /data/commonmeta.sqlite3\n\
             commonmeta settings --stats --file /data/commonmeta.sqlite3",
        )
        .arg(
            Arg::new("file")
                .long("file")
                .help("Path to the works SQLite database (overrides COMMONMETA_DB and platform default)"),
        )
        .arg(
            Arg::new("people-db")
                .long("people-db")
                .help("Path to the people SQLite database (shows ORCID Public Data File version)"),
        )
        .arg(
            Arg::new("stats")
                .long("stats")
                .help("Show record counts and sizes for all main tables instead of settings values")
                .action(clap::ArgAction::SetTrue),
        )
        .arg(
            Arg::new("providers")
                .long("providers")
                .help(
                    "With --stats: also show works count broken down by provider. \
                    Requires a full COUNT(*) scan — may take minutes on large tables.",
                )
                .action(clap::ArgAction::SetTrue),
        )
}

pub fn execute(matches: &ArgMatches) -> Result<(), String> {
    let db_path_str = resolve_db_path(matches.get_one::<String>("file"));
    let db_path = Path::new(&db_path_str);
    let stats = matches.get_flag("stats");
    let providers = matches.get_flag("providers");

    if stats {
        print_stats(db_path, &db_path_str, providers)?;
        if let Some(people_str) = matches.get_one::<String>("people-db") {
            let people_path = Path::new(people_str);
            if people_path != db_path {
                println!();
                print_stats(people_path, people_str, providers)?;
            }
        }
        return Ok(());
    }

    print_settings_table(db_path, &db_path_str)?;

    if let Some(people_str) = matches.get_one::<String>("people-db") {
        let people_path = Path::new(people_str);
        if people_path != db_path {
            println!();
            print_settings_table(people_path, people_str)?;
        }
    }

    Ok(())
}

fn fmt_count(n: i64) -> String {
    let s = n.to_string();
    let mut out = String::with_capacity(s.len() + s.len() / 3);
    for (i, c) in s.chars().rev().enumerate() {
        if i > 0 && i % 3 == 0 {
            out.push(',');
        }
        out.push(c);
    }
    out.chars().rev().collect()
}

fn print_stats(path: &Path, label: &str, show_providers: bool) -> Result<(), String> {
    if !path.exists() {
        return Err(format!("database not found at '{}'", label));
    }

    let conn = rusqlite::Connection::open(path)
        .map_err(|e| format!("cannot open '{}': {}", label, e))?;

    // Ensure lazily-created tables exist so they show 0 instead of "(table not found)".
    let _ = conn.execute_batch(
        r#"CREATE TABLE IF NOT EXISTS prefixes (
               "prefix"       TEXT PRIMARY KEY NOT NULL,
               "ra"           TEXT NOT NULL DEFAULT '',
               "date_created" TEXT NOT NULL DEFAULT '',
               "date_updated" TEXT NOT NULL DEFAULT ''
           );
           CREATE TABLE IF NOT EXISTS people (
               "id"                   TEXT PRIMARY KEY NOT NULL,
               "orcid"                TEXT NOT NULL DEFAULT '',
               "given_name"           TEXT NOT NULL DEFAULT '',
               "family_name"          TEXT NOT NULL DEFAULT '',
               "credit_name"          TEXT NOT NULL DEFAULT '',
               "name"                 TEXT NOT NULL DEFAULT '',
               "country"              TEXT NOT NULL DEFAULT '',
               "keywords"             TEXT NOT NULL DEFAULT '',
               "other_names"          TEXT NOT NULL DEFAULT '',
               "external_identifiers" TEXT NOT NULL DEFAULT '',
               "date_updated"         TEXT NOT NULL DEFAULT '',
               "metadata"             BLOB NOT NULL DEFAULT x''
           );"#,
    );

    println!("{}:", label);
    let name_width = STATS_TABLES.iter().map(|t| t.len()).max().unwrap_or(0);

    for &table in STATS_TABLES {
        let exists: bool = conn
            .query_row(
                "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?1",
                rusqlite::params![table],
                |row| row.get::<_, i64>(0),
            )
            .unwrap_or(0)
            > 0;

        if !exists {
            println!("  {:<width$}  (table not found)", table, width = name_width);
            continue;
        }

        // MAX(rowid) is O(log n) via the B-tree index; COUNT(*) scans all pages.
        let count: i64 = conn
            .query_row(
                &format!("SELECT COALESCE(MAX(rowid), 0) FROM \"{}\"", table),
                [],
                |row| row.get(0),
            )
            .unwrap_or(0);

        // Per-table size via the dbstat virtual table (bundled-full enables SQLITE_ENABLE_DBSTAT_VTAB).
        let size_bytes: i64 = conn
            .query_row(
                "SELECT COALESCE(SUM(pgsize), 0) FROM dbstat WHERE name=?1",
                rusqlite::params![table],
                |row| row.get(0),
            )
            .unwrap_or(0);
        let size_gb = size_bytes as f64 / 1_073_741_824.0;

        println!(
            "  {:<width$}  {:>15}    {:.3} GB",
            table, fmt_count(count), size_gb,
            width = name_width,
        );

        // Provider breakdown — requires a full COUNT(*) scan; opt-in via --providers.
        if table == "works" && show_providers && count > 0 {
            if let Ok(mut stmt) = conn.prepare(
                r#"SELECT COALESCE("provider", '(none)'), COUNT(*) AS n
                   FROM "works" GROUP BY "provider" ORDER BY n DESC"#,
            ) {
                let rows: Vec<(String, i64)> = stmt
                    .query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, i64>(1)?)))
                    .map(|it| it.filter_map(|r| r.ok()).collect())
                    .unwrap_or_default();

                // Provider rows are indented 4 spaces; subtract 2 from name_width so the
                // count column stays aligned with the table rows (indented 2 spaces).
                let prov_width = name_width.saturating_sub(2);
                for (provider, n) in &rows {
                    println!("    {:<width$}  {:>15}", provider, fmt_count(*n), width = prov_width);
                }
            }
        }
    }

    Ok(())
}

fn print_settings_table(path: &Path, label: &str) -> Result<(), String> {
    if !path.exists() {
        return Err(format!("database not found at '{}'", label));
    }

    let rows = crate::get_all_sqlite_settings(path).map_err(|e| e.to_string())?;

    println!("{}:", label);
    if rows.is_empty() {
        println!("  (no settings)");
        return Ok(());
    }

    let key_width = rows.iter().map(|(k, _)| k.len()).max().unwrap_or(0);
    for (key, value) in &rows {
        println!("  {:<width$}  {}", key, value, width = key_width);
    }
    Ok(())
}