commonmeta 0.9.6

//! commonmeta — a Rust port of front-matter/commonmeta.
//!
//! Convert scholarly metadata between formats. The native model is [`Data`];
//! format modules read into it and write out of it.

pub mod author_utils;
pub mod cmd;
pub mod constants;
pub mod crockford;
pub mod date_utils;
pub mod data;
pub mod doi_utils;
pub mod error;
pub mod io_utils;
mod formats;
pub mod progress;
pub mod schema_utils;
pub mod geonames;
pub mod ror_countries;
pub mod spdx;
pub mod utils;
pub mod vocabularies;

pub use data::{Citation, Data};
pub use error::{Error, Result};
pub use schema_utils::SCHEMA_JSON;
pub use formats::crossref;
pub use formats::pubmed;
pub use formats::inveniordm::PushResult;
pub use formats::ror::AffiliationMatch;
pub use formats::ror::Ror;
pub use formats::ror::RorRelease;

/// Crate version string, captured from `CARGO_PKG_VERSION` at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// Read a single record from `from` format, without writing it back out.
///
/// `from` names the input format and `input` is the value to read; both are
/// handed unchanged to `formats::read`.
///
/// # Errors
/// Propagates any error returned by `formats::read`.
pub fn read(from: &str, input: &str) -> Result<Data> {
    formats::read(from, input)
}

/// Convert `input` from the `from` format to the `to` format in one step.
///
/// The input is first parsed by `formats::read`; the resulting record is then
/// serialized by `formats::write`.
///
/// # Errors
/// Returns the error of whichever step fails first; the write step is not
/// attempted when reading fails.
pub fn convert(from: &str, to: &str, input: &str) -> Result<Vec<u8>> {
    formats::read(from, input).and_then(|record| formats::write(to, &record))
}

/// Write an already-loaded record to `to` format.
///
/// Returns the serialized bytes produced by `formats::write`.
///
/// # Errors
/// Propagates any error returned by `formats::write`.
pub fn write(to: &str, data: &Data) -> Result<Vec<u8>> {
    formats::write(to, data)
}

/// Like [`write`], but forwards `style` and `locale` to the citation writer.
/// For non-`"citation"` formats both parameters are ignored.
///
/// All four arguments are passed through unchanged to
/// `formats::write_citation`.
///
/// # Errors
/// Propagates any error returned by `formats::write_citation`.
pub fn write_with_style(
    to: &str,
    data: &Data,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<u8>> {
    formats::write_citation(to, data, style, locale)
}

/// Write a ROR-derived record as raw ROR-shaped JSON (as opposed to
/// `write("ror", data)`, which produces InvenioRDM vocabulary YAML).
///
/// # Errors
/// Propagates any error returned by `formats::ror::write_json`.
pub fn write_ror_json(data: &Data) -> Result<Vec<u8>> {
    formats::ror::write_json(data)
}

/// Fetch a ROR organization by its ROR URL or other organization identifier
/// from the ROR API. Returns the record converted to the commonmeta `Data` model.
///
/// # Errors
/// Propagates any error returned by `formats::ror::fetch`.
pub fn fetch_ror(id: &str) -> Result<Data> {
    formats::ror::fetch(id)
}

/// Serialize a ROR organization `Data` as a v1.0-compliant commonmeta JSON array.
///
/// # Errors
/// Propagates any error returned by `formats::ror::write_commonmeta_org`.
pub fn write_ror_commonmeta(data: &Data) -> Result<Vec<u8>> {
    formats::ror::write_commonmeta_org(data)
}

/// Fetch metadata for the latest ROR data release from Zenodo (InvenioRDM)
/// without downloading the full archive. Returns the version tag, release date,
/// Zenodo record ID, zip filename, and direct download URL.
///
/// # Errors
/// Propagates any error returned by `formats::ror::fetch_latest_ror_release`.
pub fn fetch_latest_ror_release() -> Result<RorRelease> {
    formats::ror::fetch_latest_ror_release()
}

/// Download and parse the zip archive described by `release`. The zip is
/// cached locally for 30 days so repeat installs of the same version skip the
/// network round-trip. Returns `(records, from_cache)`.
///
/// # Errors
/// Propagates any error returned by `formats::ror::download_release`.
pub fn download_ror_release(release: &RorRelease) -> Result<(Vec<formats::ror::Ror>, bool)> {
    formats::ror::download_release(release)
}

/// Convenience: fetch the latest release metadata then immediately download
/// and parse the dump. Returns `(RorRelease, Vec<Ror>, from_cache)`.
///
/// # Errors
/// Propagates any error returned by `formats::ror::download_all`.
pub fn download_ror_all() -> Result<(RorRelease, Vec<formats::ror::Ror>, bool)> {
    formats::ror::download_all()
}

/// Look up a ROR organization by its full URL (e.g. `https://ror.org/012xzy7a9`)
/// from a local SQLite database written by [`write_ror_sqlite`]. Returns the
/// record converted to the commonmeta `Data` model, or an error when not found.
///
/// # Errors
/// Propagates any error returned by `formats::ror::fetch_sqlite`.
pub fn fetch_ror_sqlite(
    id: &str,
    db_path: &std::path::Path,
) -> Result<Data> {
    formats::ror::fetch_sqlite(id, db_path)
}

/// Read a page of ROR organizations from the local SQLite database as `Data` records.
/// `limit` caps records returned; `offset` is the zero-based row offset.
/// `country_code` filters by ISO 3166-1 alpha-2 code; `query` applies FTS.
///
/// # Errors
/// Propagates any error returned by `formats::ror::read_sqlite`.
pub fn read_ror_sqlite(
    db_path: &std::path::Path,
    limit: Option<usize>,
    offset: usize,
    country_code: Option<&str>,
    query: Option<&str>,
) -> Result<Vec<Data>> {
    formats::ror::read_sqlite(db_path, limit, offset, country_code, query)
}

/// Read a page of ROR organizations from the local SQLite database as raw
/// `Ror` structs instead of `Data` records.
///
/// Takes the same arguments as [`read_ror_sqlite`] and forwards them unchanged
/// to `formats::ror::read_sqlite_raw`.
///
/// # Errors
/// Propagates any error returned by `formats::ror::read_sqlite_raw`.
pub fn read_ror_sqlite_raw(
    db_path: &std::path::Path,
    limit: Option<usize>,
    offset: usize,
    country_code: Option<&str>,
    query: Option<&str>,
) -> Result<Vec<formats::ror::Ror>> {
    formats::ror::read_sqlite_raw(db_path, limit, offset, country_code, query)
}

/// Return a random sample of ROR organizations from the local SQLite database.
///
/// `limit` and `country_code` are forwarded unchanged to
/// `formats::ror::sample_sqlite` — NOTE(review): presumably the sample size and
/// a country filter, as in [`read_ror_sqlite`]; confirm there.
///
/// # Errors
/// Propagates any error returned by `formats::ror::sample_sqlite`.
pub fn sample_ror_sqlite(
    db_path: &std::path::Path,
    limit: Option<usize>,
    country_code: Option<&str>,
) -> Result<Vec<Data>> {
    formats::ror::sample_sqlite(db_path, limit, country_code)
}

/// Variant of [`sample_ror_sqlite`] that returns raw `Ror` structs instead of
/// `Data` records.
///
/// Arguments are forwarded unchanged to `formats::ror::sample_sqlite_raw`.
///
/// # Errors
/// Propagates any error returned by `formats::ror::sample_sqlite_raw`.
pub fn sample_ror_sqlite_raw(
    db_path: &std::path::Path,
    limit: Option<usize>,
    country_code: Option<&str>,
) -> Result<Vec<formats::ror::Ror>> {
    formats::ror::sample_sqlite_raw(db_path, limit, country_code)
}

/// Write a list of ROR records to a SQLite3 database at `path` with an
/// `organizations` table. Existing file is deleted first. JSON array columns
/// (`types`, `locations`, `names`, `external_ids`) are queryable via SQLite's
/// `json_each()`. The `metadata` column stores the full ROR JSON as a
/// zstd-compressed BLOB for lossless round-trips.
///
/// Pass `version` and `date` (e.g. `"v2.9"`, `"2026-06-23"`) to record the
/// installed release in the `settings` table; pass `None` for both when writing
/// a standalone file where version tracking is not needed.
///
/// # Errors
/// Propagates any error returned by `formats::ror::write_sqlite`.
pub fn write_ror_sqlite(
    list: &[formats::ror::Ror],
    path: &std::path::Path,
    version: Option<&str>,
    date: Option<&str>,
) -> Result<()> {
    formats::ror::write_sqlite(list, path, version, date)
}

/// Return the ROR version string stored in the local database's `settings`
/// table, or `None` when the database does not exist or no version has been
/// recorded yet.
///
/// # Errors
/// Propagates any error returned by `formats::ror::fetch_installed_ror_version`.
pub fn fetch_installed_ror_version(db_path: &std::path::Path) -> Result<Option<String>> {
    formats::ror::fetch_installed_ror_version(db_path)
}

/// Drop and rebuild the `organizations_fts` FTS5 virtual table.
///
/// `path` is the SQLite database file handed to
/// `formats::ror::rebuild_organizations_fts`.
///
/// # Errors
/// Propagates any error returned by `formats::ror::rebuild_organizations_fts`.
pub fn rebuild_organizations_fts(path: &std::path::Path) -> Result<()> {
    formats::ror::rebuild_organizations_fts(path)
}

/// Return the `vraix_date` (pidbox install date, `YYYY-MM-DD`) stored in the
/// local works database's `settings` table, or `None` when the database does
/// not exist or no date has been recorded yet.
///
/// # Errors
/// Propagates any error returned by `formats::vraix::fetch_installed_vraix_date`.
pub fn fetch_installed_vraix_date(db_path: &std::path::Path) -> Result<Option<String>> {
    formats::vraix::fetch_installed_vraix_date(db_path)
}

/// Install GeoNames data into the SQLite database at `path`.
///
/// Obtains the place, admin1 and country lists from `geonames::download_all`
/// and hands all three, together with `path` and `date`, to
/// `geonames::write_sqlite`.
///
/// Returns `(record_count, from_cache)`, where `record_count` is the length of
/// the place list and `from_cache` is the flag reported by the download step.
///
/// # Errors
/// Fails if either the download or the database write fails; nothing is
/// written when the download fails.
pub fn install_geonames_sqlite(path: &std::path::Path, date: Option<&str>) -> Result<(usize, bool)> {
    let (places, admin1_codes, countries, from_cache) = geonames::download_all()?;
    geonames::write_sqlite(&places, &admin1_codes, &countries, path, date)?;
    // The lists are only borrowed by the writer, so the length is still the
    // number of downloaded place records.
    Ok((places.len(), from_cache))
}

/// Look up a GeoNames place by its integer `id` from the local SQLite database.
///
/// # Errors
/// Propagates any error returned by `geonames::fetch_sqlite`.
pub fn fetch_geonames_sqlite(id: i64, db_path: &std::path::Path) -> Result<Data> {
    geonames::fetch_sqlite(id, db_path)
}

/// Return the GeoNames install date stored in the local database's `settings`
/// table, or `None` when the database does not exist or no date has been
/// recorded yet.
///
/// # Errors
/// Propagates any error returned by `geonames::fetch_installed_geonames_date`.
pub fn fetch_installed_geonames_date(db_path: &std::path::Path) -> Result<Option<String>> {
    geonames::fetch_installed_geonames_date(db_path)
}

/// Return the raw `Ror` struct for a given ROR URL from the local SQLite
/// database, bypassing the lossy `Data` conversion.
///
/// # Errors
/// Propagates any error returned by `formats::ror::fetch_raw_sqlite`.
pub fn fetch_ror_raw_sqlite(id: &str, db_path: &std::path::Path) -> Result<formats::ror::Ror> {
    formats::ror::fetch_raw_sqlite(id, db_path)
}

/// Fetch the raw `Ror` struct from the ROR v2 API, bypassing the lossy
/// `Data` conversion.
///
/// `input` is the organization identifier handed to `formats::ror::fetch_raw`.
///
/// # Errors
/// Propagates any error returned by `formats::ror::fetch_raw`.
pub fn fetch_ror_raw(input: &str) -> Result<formats::ror::Ror> {
    formats::ror::fetch_raw(input)
}

/// Enrich missing `geonames_details` fields for each location in a `Ror` record
/// using the locally installed GeoNames SQLite database. Only fills empty fields.
///
/// `ror` is modified in place. This function has no error channel: it returns
/// whatever `formats::ror::enrich_locations` returns, which is `()`.
pub fn enrich_ror_locations(ror: &mut formats::ror::Ror, geonames_db: &std::path::Path) {
    formats::ror::enrich_locations(ror, geonames_db)
}

/// Serialize a `Ror` record as ROR v2-compatible JSON, converting empty-string
/// `lang` and `preferred` fields to JSON `null` to match the canonical API output.
///
/// # Errors
/// Propagates any error returned by `formats::ror::write_v2_json`.
pub fn write_ror_v2_json(ror: &formats::ror::Ror) -> Result<Vec<u8>> {
    formats::ror::write_v2_json(ror)
}

/// Match a free-text affiliation string against ROR organizations using the
/// ROR v2 affiliation endpoint.
///
/// # Errors
/// Propagates any error returned by `formats::ror::match_affiliation`.
pub fn match_ror_affiliation(affiliation: &str) -> Result<Vec<AffiliationMatch>> {
    formats::ror::match_affiliation(affiliation)
}

/// Match a free-text affiliation string against a local ROR SQLite database
/// written by [`write_ror_sqlite`]. Uses Turso's Tantivy-backed FTS index for
/// full-text search across all organization name variants. Returns results in
/// relevance order with `chosen` set on the top result.
///
/// NOTE(review): other docs in this file describe FTS5 virtual tables —
/// confirm which search backend `formats::ror::match_affiliation_sqlite`
/// actually uses.
///
/// # Errors
/// Propagates any error returned by `formats::ror::match_affiliation_sqlite`.
pub fn match_ror_affiliation_sqlite(
    affiliation: &str,
    db_path: &std::path::Path,
) -> Result<Vec<AffiliationMatch>> {
    formats::ror::match_affiliation_sqlite(affiliation, db_path)
}

/// Read `input` in the `from` format and render it as a formatted citation.
///
/// Works like [`convert`] with the output format fixed to `"citation"`, and
/// additionally hands the CSL `style` and `locale` to the citation writer.
///
/// # Errors
/// Returns the error of whichever step fails first; the citation writer is
/// not invoked when reading fails.
pub fn convert_citation(
    from: &str,
    input: &str,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<u8>> {
    formats::read(from, input)
        .and_then(|record| formats::write_citation("citation", &record, style, locale))
}

/// Write a list of commonmeta records as a single Parquet file. Alongside a
/// flattened tabular projection of each record's fields (for filtering in
/// tools like DuckDB without parsing JSON), every row also carries a `json`
/// column with the record's complete serialization, so [`read_parquet`]
/// round-trips losslessly.
///
/// Returns the Parquet file contents as bytes.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::write_parquet_all`.
pub fn write_parquet(list: &[Data]) -> Result<Vec<u8>> {
    formats::commonmeta::write_parquet_all(list)
}

/// Read a list of commonmeta records back from the Parquet schema written by
/// [`write_parquet`]. Lossless: each record is restored from its `json`
/// column, the complete original serialization.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_parquet_all`.
pub fn read_parquet(bytes: &[u8]) -> Result<Vec<Data>> {
    formats::commonmeta::read_parquet_all(bytes)
}

/// Write `list` as a SQLite3 database with a `works` table whose columns
/// mirror the commonmeta v1.0 schema. Simple string fields are stored as
/// TEXT; complex fields are stored as compact JSON TEXT.
/// Any existing file at `path` is deleted first.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::write_sqlite`.
pub fn write_sqlite(list: &[Data], path: &std::path::Path) -> Result<()> {
    formats::commonmeta::write_sqlite(list, path)
}

/// Like [`write_sqlite`] but opens an existing database instead of recreating
/// it. Rows whose `id` already exists are replaced; new rows are inserted.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::upsert_sqlite`.
pub fn upsert_sqlite(list: &[Data], path: &std::path::Path) -> Result<()> {
    formats::commonmeta::upsert_sqlite(list, path)
}

/// Return the total number of rows in the `works` table of a commonmeta SQLite
/// database — useful for reporting the cumulative count after an upsert.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::count_sqlite_works`.
pub fn count_sqlite_works(path: &std::path::Path) -> Result<usize> {
    formats::commonmeta::count_sqlite_works(path)
}

/// Write a key/value pair into the `settings` table of a commonmeta SQLite database.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::set_sqlite_setting`.
pub fn set_sqlite_setting(path: &std::path::Path, key: &str, value: &str) -> Result<()> {
    formats::commonmeta::set_sqlite_setting(path, key, value)
}

/// Read a value from the `settings` table. Returns `None` when the key is absent.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::get_sqlite_setting`.
pub fn get_sqlite_setting(path: &std::path::Path, key: &str) -> Result<Option<String>> {
    formats::commonmeta::get_sqlite_setting(path, key)
}

/// Return all rows from the `settings` table, sorted by key.
///
/// Each element of the returned vector is a `(key, value)` pair.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::get_all_sqlite_settings`.
pub fn get_all_sqlite_settings(path: &std::path::Path) -> Result<Vec<(String, String)>> {
    formats::commonmeta::get_all_sqlite_settings(path)
}

/// Apply any pending schema migrations to an existing database, printing
/// per-step progress and timing to stderr. Returns `(steps_applied, version)`.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::run_migrations`.
pub fn run_migrations(path: &std::path::Path) -> Result<(usize, u32)> {
    formats::commonmeta::run_migrations(path)
}

/// Read records from a commonmeta SQLite database written by [`write_sqlite`].
///
/// `limit` and `offset` are forwarded unchanged to
/// `formats::commonmeta::read_sqlite_commonmeta` — NOTE(review): presumably
/// `None` means "no cap" and `offset` is a zero-based row offset, as in
/// [`read_ror_sqlite`]; confirm there.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_commonmeta`.
pub fn read_sqlite_commonmeta(
    path: &std::path::Path,
    limit: Option<usize>,
    offset: usize,
) -> Result<Vec<Data>> {
    formats::commonmeta::read_sqlite_commonmeta(path, limit, offset)
}

/// Look up a single record by its `id` (DOI URL) in a commonmeta SQLite database.
/// Returns `None` when the record is not present.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_by_id`.
pub fn read_sqlite_by_id(id: &str, path: &std::path::Path) -> Result<Option<Data>> {
    formats::commonmeta::read_sqlite_by_id(id, path)
}

/// Fetch all works whose DOI matches any entry in `dois` in a single SQL query.
/// DOIs are normalised before lookup; records not found are silently omitted.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_by_dois`.
pub fn read_sqlite_by_dois(dois: &[String], path: &std::path::Path) -> Result<Vec<Data>> {
    formats::commonmeta::read_sqlite_by_dois(dois, path)
}

/// Fetch all works with a contributor whose ORCID matches `orcid_url`,
/// ordered by `date_published` descending.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_by_orcid`.
pub fn read_sqlite_by_orcid(orcid_url: &str, path: &std::path::Path) -> Result<Vec<Data>> {
    formats::commonmeta::read_sqlite_by_orcid(orcid_url, path)
}

/// Fetch all works with a contributor affiliated with `ror_url`,
/// ordered by `date_published` descending.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_by_ror`.
pub fn read_sqlite_by_ror(ror_url: &str, path: &std::path::Path) -> Result<Vec<Data>> {
    formats::commonmeta::read_sqlite_by_ror(ror_url, path)
}

/// Fetch all works that cite `doi` (i.e. have it in their reference list),
/// ordered by `date_published` descending.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_by_citation`.
pub fn read_sqlite_by_citation(doi: &str, path: &std::path::Path) -> Result<Vec<Data>> {
    formats::commonmeta::read_sqlite_by_citation(doi, path)
}

pub use formats::commonmeta::JunctionTable;

/// Backfill one or more junction tables (`works_orcid`, `works_ror`,
/// `works_references`) for every row in `works`. `providers` restricts to
/// specific provider values (e.g. `["Crossref"]`); empty = all providers.
/// Reads blobs in 50 k-row streaming batches; uses `INSERT OR IGNORE` so it
/// is safe to re-run or interrupt and resume. Returns `(works_scanned, rows_inserted)`.
///
/// `tables` selects which junction tables to backfill.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::backfill_junction_tables`.
pub fn backfill_junction_tables(path: &std::path::Path, tables: &[JunctionTable], providers: &[&str]) -> Result<(usize, usize)> {
    formats::commonmeta::backfill_junction_tables(path, tables, providers)
}

/// Convenience wrapper: backfill only `works_references`.
///
/// Returns the pair produced by `formats::commonmeta::backfill_works_references`
/// — NOTE(review): presumably `(works_scanned, rows_inserted)` as for
/// [`backfill_junction_tables`]; confirm there.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::backfill_works_references`.
pub fn backfill_works_references(path: &std::path::Path) -> Result<(usize, usize)> {
    formats::commonmeta::backfill_works_references(path)
}

/// Drop and rebuild the `works_fts` FTS5 virtual table from the content in `works`.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::rebuild_works_fts`.
pub fn rebuild_works_fts(path: &std::path::Path) -> Result<()> {
    formats::commonmeta::rebuild_works_fts(path)
}

/// Add to `data.citations` every work in the local database that cites
/// `data.id`, as reported by `formats::commonmeta::read_sqlite_by_citation`.
///
/// Citations already present on the record are kept, and a citing work is
/// only appended when no existing citation carries the same `id`. Each new
/// entry records the citing work's `id` and uses its `provider` as
/// `asserted_by`; all other fields take their default values.
///
/// Best-effort: returns without changing `data` when `db_path` does not exist
/// or the lookup fails.
pub fn enrich_citations(data: &mut Data, db_path: &std::path::Path) {
    if !db_path.exists() {
        return;
    }

    let citing_works = match formats::commonmeta::read_sqlite_by_citation(&data.id, db_path) {
        Ok(works) => works,
        // A failed lookup is deliberately ignored; enrichment is optional.
        Err(_) => return,
    };

    for citing in citing_works {
        let already_listed = data
            .citations
            .iter()
            .any(|existing| existing.id == citing.id);
        if already_listed {
            continue;
        }
        data.citations.push(Citation {
            id: citing.id,
            asserted_by: citing.provider,
            ..Default::default()
        });
    }
}

/// Look up a single record by `pmid` in the commonmeta SQLite database at `path`.
///
/// Thin wrapper around `formats::commonmeta::read_sqlite_by_pmid`.
/// NOTE(review): presumably `pmid` is a PubMed ID and `None` means "no
/// matching record", as for [`read_sqlite_by_id`]; confirm there.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_by_pmid`.
pub fn read_sqlite_by_pmid(pmid: &str, path: &std::path::Path) -> Result<Option<Data>> {
    formats::commonmeta::read_sqlite_by_pmid(pmid, path)
}

/// Prepare a `Data` record for commonmeta v1.0 JSON serialization: normalises
/// IDs, strips schema-private reference fields, clears invalid ROR/ORCID ids, etc.
///
/// The input is only borrowed; the prepared record is returned as a new
/// `Data` value produced by `formats::commonmeta::prepare`.
pub fn prepare_commonmeta(data: &Data) -> Data {
    formats::commonmeta::prepare(data)
}

/// Look up a single record by `pmcid` in the commonmeta SQLite database at `path`.
///
/// Thin wrapper around `formats::commonmeta::read_sqlite_by_pmcid`.
/// NOTE(review): presumably `pmcid` is a PubMed Central ID and `None` means
/// "no matching record", as for [`read_sqlite_by_id`]; confirm there.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_by_pmcid`.
pub fn read_sqlite_by_pmcid(pmcid: &str, path: &std::path::Path) -> Result<Option<Data>> {
    formats::commonmeta::read_sqlite_by_pmcid(pmcid, path)
}

/// Fetch the referenced works of `data` that have a DOI.
///
/// Only references whose `id` starts with `https://doi.org/` are considered.
/// Performs a single batch SQLite lookup first (when `db_path` is given and
/// exists), then fetches any DOIs still missing from the network (Crossref
/// first, DataCite fallback) unless `no_network` is true.
///
/// Results are returned in the order in which each DOI *first* appears in
/// `data.references`. A DOI that is listed more than once is looked up and
/// returned only once. DOI names are case-insensitive, so DOIs are compared
/// without regard to ASCII case; this keeps a record whose stored `id` differs
/// from the reference only in case from being re-fetched or mis-ordered.
///
/// Best-effort: a failing SQLite lookup is treated as "nothing found" and
/// DOIs that cannot be fetched from either network source are omitted.
pub fn fetch_reference_works(
    data: &Data,
    db_path: Option<&std::path::Path>,
    no_network: bool,
) -> Vec<Data> {
    // Collect the distinct reference DOIs in first-occurrence order. `order`
    // maps the case-folded DOI to its position and doubles as the "seen" set.
    let mut order: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
    let mut ref_dois: Vec<String> = Vec::new();
    for reference in data.references.iter() {
        if !reference.id.starts_with("https://doi.org/") {
            continue;
        }
        let key = reference.id.to_ascii_lowercase();
        if let std::collections::hash_map::Entry::Vacant(slot) = order.entry(key) {
            slot.insert(ref_dois.len());
            ref_dois.push(reference.id.clone());
        }
    }

    if ref_dois.is_empty() {
        return Vec::new();
    }

    // Batch SQLite lookup.
    let mut works: Vec<Data> = match db_path {
        Some(path) if path.exists() => {
            formats::commonmeta::read_sqlite_by_dois(&ref_dois, path).unwrap_or_default()
        }
        _ => Vec::new(),
    };

    // Network fallback: Crossref first, then DataCite.
    if !no_network {
        // Owned keys, so `works` can grow while the set is still in use.
        let found_locally: std::collections::HashSet<String> =
            works.iter().map(|w| w.id.to_ascii_lowercase()).collect();
        for doi in &ref_dois {
            if found_locally.contains(&doi.to_ascii_lowercase()) {
                continue;
            }
            let fetched = formats::crossref::fetch(doi)
                .or_else(|_| formats::datacite::fetch(doi));
            if let Ok(work) = fetched {
                works.push(work);
            }
        }
    }

    // Stable sort by reference position; works whose id matches no reference
    // DOI keep their relative order at the end.
    works.sort_by_cached_key(|w| {
        order
            .get(&w.id.to_ascii_lowercase())
            .copied()
            .unwrap_or(usize::MAX)
    });
    works
}

/// Look up a single record by `openalex` identifier in the commonmeta SQLite
/// database at `path`.
///
/// Thin wrapper around `formats::commonmeta::read_sqlite_by_openalex`.
/// NOTE(review): presumably `None` means "no matching record", as for
/// [`read_sqlite_by_id`]; confirm there.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_by_openalex`.
pub fn read_sqlite_by_openalex(openalex: &str, path: &std::path::Path) -> Result<Option<Data>> {
    formats::commonmeta::read_sqlite_by_openalex(openalex, path)
}

/// Look up a single record by `arxiv` identifier in the commonmeta SQLite
/// database at `path`.
///
/// Thin wrapper around `formats::commonmeta::read_sqlite_by_arxiv`.
/// NOTE(review): presumably `None` means "no matching record", as for
/// [`read_sqlite_by_id`]; confirm there.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::read_sqlite_by_arxiv`.
pub fn read_sqlite_by_arxiv(arxiv: &str, path: &std::path::Path) -> Result<Option<Data>> {
    formats::commonmeta::read_sqlite_by_arxiv(arxiv, path)
}

pub use formats::commonmeta::{FillReport, ValidationError, ValidationReport};

/// Validate records in a commonmeta SQLite database against the v1.0 JSON schema.
///
/// The JSON schema validator is compiled once and reused. Records are streamed in
/// batches to keep memory usage constant regardless of database size.
///
/// - `path` — the SQLite database to validate
/// - `provider` — optional filter, e.g. `"DataCite"` or `"Crossref"`
/// - `work_type` — optional filter, e.g. `"Dataset"` or `"JournalArticle"`
/// - `id` — optional filter; NOTE(review): presumably restricts validation to
///   the single record with this id — confirm in `formats::commonmeta`
/// - `has_ror_id` — NOTE(review): presumably restricts validation to records
///   that carry a ROR id — confirm in `formats::commonmeta`
/// - `limit` — maximum records to check (`0` = all)
/// - `fix`, `recheck` — forwarded unchanged; NOTE(review): semantics are
///   defined by `formats::commonmeta::validate_sqlite` and not visible here
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::validate_sqlite`.
// Flat pass-through of the underlying function's parameters, matching
// `crossref_fetch_page_with_cursor`; grouping them would change the public API.
#[allow(clippy::too_many_arguments)]
pub fn validate_sqlite(
    path: &std::path::Path,
    provider: Option<&str>,
    work_type: Option<&str>,
    id: Option<&str>,
    has_ror_id: bool,
    limit: usize,
    fix: bool,
    recheck: bool,
) -> Result<ValidationReport> {
    formats::commonmeta::validate_sqlite(path, provider, work_type, id, has_ror_id, limit, fix, recheck)
}

/// Bulk-resolve all distinct DOI prefixes in the works database against the DOI RA API
/// and populate the `prefixes` table.
///
/// Prefixes already cached within the last 30 days are skipped. Remaining prefixes are
/// resolved in batches of 100 against `https://doi.org/doiRA/` and written back.
/// Returns the number of prefixes successfully resolved.
pub fn import_prefixes(path: &std::path::Path) -> Result<usize> {
    use doi_utils::{
        collect_work_prefixes, ensure_prefixes_table, fetch_doi_ra_batch, store_prefix_cache,
    };

    let conn = rusqlite::Connection::open(path)
        .map_err(|e| Error::Parse(format!("cannot open '{}': {}", path.display(), e)))?;
    let _ = conn.execute_batch("PRAGMA journal_mode=WAL; PRAGMA cache_size=-65536;");
    ensure_prefixes_table(&conn);

    let prefixes = collect_work_prefixes(&conn);
    if prefixes.is_empty() {
        return Ok(0);
    }

    // Skip prefixes that already have a fresh cache entry (< 30 days old).
    let cutoff = (chrono::Utc::now() - chrono::TimeDelta::days(30)).to_rfc3339();
    let to_resolve: Vec<String> = prefixes
        .into_iter()
        .filter(|p| {
            conn.query_row(
                r#"SELECT "date_updated" FROM prefixes WHERE "prefix" = ?1"#,
                rusqlite::params![p],
                |r| r.get::<_, String>(0),
            )
            .ok()
            .filter(|d| d.as_str() > cutoff.as_str())
            .is_none()
        })
        .collect();

    if to_resolve.is_empty() {
        return Ok(0);
    }

    let bar = progress::count_bar("prefixes", to_resolve.len() as u64);

    let client = reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(30))
        .user_agent(format!(
            "commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs)",
            env!("CARGO_PKG_VERSION")
        ))
        .build()
        .map_err(|e| Error::Http(e.to_string()))?;

    const BATCH: usize = 100;
    let mut resolved = 0usize;

    for chunk in to_resolve.chunks(BATCH) {
        let refs: Vec<&str> = chunk.iter().map(String::as_str).collect();
        let pairs = fetch_doi_ra_batch(&client, &refs);

        if !pairs.is_empty() {
            let tx = conn
                .unchecked_transaction()
                .map_err(|e| Error::Parse(e.to_string()))?;
            for (prefix, ra) in &pairs {
                store_prefix_cache(&conn, prefix, ra);
                resolved += 1;
            }
            tx.commit().map_err(|e| Error::Parse(e.to_string()))?;
        }

        bar.inc(chunk.len() as u64);
    }
    bar.finish_and_clear();

    Ok(resolved)
}

/// Fill missing or convertible affiliation/organization identifiers in the works database.
///
/// `path` is the works database and `ror_db_path` the ROR database; the
/// remaining arguments are forwarded unchanged to
/// `formats::commonmeta::fill_sqlite`. NOTE(review): `provider`, `work_type`,
/// `id`, `has_ror_id` and `limit` presumably select records the same way as
/// in [`validate_sqlite`]; confirm there.
///
/// # Errors
/// Propagates any error returned by `formats::commonmeta::fill_sqlite`.
pub fn fill_sqlite(
    path: &std::path::Path,
    ror_db_path: &std::path::Path,
    provider: Option<&str>,
    work_type: Option<&str>,
    id: Option<&str>,
    has_ror_id: bool,
    limit: usize,
) -> Result<FillReport> {
    formats::commonmeta::fill_sqlite(path, ror_db_path, provider, work_type, id, has_ror_id, limit)
}

/// Fetch one page of Crossref works using cursor-based pagination.
///
/// Pass `cursor = "*"` for the first page; use the `next-cursor` value from
/// the previous response for every subsequent page. Cursor pagination is
/// required for result sets beyond 10,000 records (the Crossref API limit for
/// offset pagination).
///
/// Every argument is forwarded, in the same order, to
/// `formats::crossref::fetch_page_with_cursor`.
///
/// Returns `(records, next_cursor)`. Stop when `next_cursor` is `None` or
/// when the records slice is shorter than `number`.
///
/// # Errors
/// Propagates any error returned by `formats::crossref::fetch_page_with_cursor`.
// The flat parameter list mirrors the wrapped function one-to-one.
#[allow(clippy::too_many_arguments)]
pub fn crossref_fetch_page_with_cursor(
    cursor: &str,
    number: usize,
    member: &str,
    type_: &str,
    year: &str,
    orcid: &str,
    ror: &str,
    has_orcid: bool,
    has_ror: bool,
    has_references: bool,
    has_relation: bool,
    has_abstract: bool,
    has_award: bool,
    has_license: bool,
    has_archive: bool,
    match_ror: bool,
) -> Result<(Vec<Data>, Option<String>)> {
    formats::crossref::fetch_page_with_cursor(
        cursor, number, member, type_, year, orcid, ror,
        has_orcid, has_ror, has_references, has_relation, has_abstract, has_award, has_license, has_archive,
        match_ror,
    )
}

/// Stream a VRAIX daily dump at `input_path` directly to a commonmeta SQLite
/// database at `output_path` in batches of 10 000 rows, converting with
/// `from`-specific parser and writing each batch in a single transaction.
/// `limit` caps total records written; pass `0` for all rows.
/// When `update` is false the output file is deleted and recreated (default).
/// When `update` is true the existing file is kept and rows are upserted by
/// their `id` primary key — new rows are inserted, existing rows are replaced.
/// Returns the number of records written. No `Vec<Data>` is held for the
/// whole file — peak memory is proportional to one batch, not the whole dump.
///
/// # Errors
/// Propagates any error returned by `formats::vraix::stream_dump_to_sqlite`.
pub fn stream_vraix_to_sqlite(
    input_path: &std::path::Path,
    from: &str,
    output_path: &std::path::Path,
    limit: usize,
    update: bool,
) -> Result<usize> {
    // The wrapped function receives the negation of `update` as its last argument.
    formats::vraix::stream_dump_to_sqlite(input_path, from, output_path, limit, !update)
}

/// Stream the pidbox dump (a mixed-source VRAIX SQLite file containing crossref,
/// datacite, and ROR rows) directly to a commonmeta SQLite database. Each row
/// is routed to the appropriate parser by its `source_id`; ROR rows are
/// skipped. When `update` is false the output file is recreated; when true
/// rows are upserted by `id`. Returns the number of records written.
///
/// `limit` is forwarded unchanged — NOTE(review): presumably `0` means "all
/// rows", as for [`stream_vraix_to_sqlite`]; confirm in `formats::vraix`.
///
/// # Errors
/// Propagates any error returned by `formats::vraix::stream_pidbox_to_sqlite`.
pub fn stream_pidbox_to_sqlite(
    input_path: &std::path::Path,
    output_path: &std::path::Path,
    limit: usize,
    update: bool,
) -> Result<usize> {
    // The wrapped function receives the negation of `update` as its last argument.
    formats::vraix::stream_pidbox_to_sqlite(input_path, output_path, limit, !update)
}

/// Delete all rows from the VRAIX-schema transport table in the dragoman
/// cache at `path` and VACUUM to reclaim disk space.  Call this after a
/// successful [`stream_pidbox_to_sqlite`] import to prevent re-importing the
/// same records on the next run.  Returns the number of rows deleted.
///
/// # Errors
/// Propagates any error returned by `formats::vraix::flush_transport_table`.
pub fn flush_dragoman_cache(path: &std::path::Path) -> Result<usize> {
    formats::vraix::flush_transport_table(path)
}

/// Fetch a person from the ORCID public API and return their record as [`Data`].
/// Accepts a bare ORCID iD (`0000-0003-1419-2405`) or a full ORCID URL.
///
/// # Errors
/// Propagates any error returned by `formats::orcid::fetch_orcid`.
pub fn fetch_orcid(id: &str) -> Result<Data> {
    formats::orcid::fetch_orcid(id)
}

/// Fetch a person from the ORCID public API and return both the parsed [`Data`]
/// and the raw ORCID 3.0 person JSON in a single HTTP request.
///
/// Use this instead of calling [`fetch_orcid`] + [`fetch_orcid_person_json`]
/// separately when both are needed (e.g. building a response while also caching
/// the raw JSON for later import into the `people` table).
///
/// # Errors
/// Propagates any error returned by `formats::orcid::fetch_orcid_with_json`.
pub fn fetch_orcid_with_json(id: &str) -> Result<(Data, serde_json::Value)> {
    formats::orcid::fetch_orcid_with_json(id)
}

/// Read ORCID person rows written by dragoman into `cache.sqlite3` and upsert
/// them into the `people` table at `people_path`.
///
/// Handles both commonmeta JSON (dragoman < 0.3.16) and raw ORCID 3.0 person
/// JSON (dragoman >= 0.3.16).  Call this as part of
/// `commonmeta import --from cache` after [`stream_pidbox_to_sqlite`].
///
/// Returns the count reported by
/// `formats::orcid::stream_cache_orcid_to_people_sqlite` — NOTE(review):
/// presumably the number of people upserted; confirm there.
///
/// # Errors
/// Propagates any error returned by that function.
pub fn stream_cache_orcid_to_people_sqlite(
    cache_path: &std::path::Path,
    people_path: &std::path::Path,
) -> Result<usize> {
    formats::orcid::stream_cache_orcid_to_people_sqlite(cache_path, people_path)
}

/// Fetch a person from the ORCID public API and return the raw ORCID 3.0 person
/// JSON conforming to `orcid_schema_v3.0.json`.
///
/// # Errors
/// Propagates any error returned by `formats::orcid::fetch_person_json`.
pub fn fetch_orcid_person_json(id: &str) -> Result<serde_json::Value> {
    formats::orcid::fetch_person_json(id)
}

/// Serialize an ORCID 3.0 person JSON value (from [`fetch_orcid_person_json`] or
/// [`fetch_orcid_person_json_sqlite`]) to bytes.
///
/// # Errors
/// Propagates any error returned by `formats::orcid::write_orcid_json`.
pub fn write_orcid_json(value: &serde_json::Value) -> Result<Vec<u8>> {
    formats::orcid::write_orcid_json(value)
}

pub use formats::orcid::PersonAffiliation;

/// Fetch employment records from the ORCID public API for the given ORCID URL.
/// Returns affiliations sorted by start date. When `db_path` is provided, non-ROR
/// organization identifiers (GRID, ISNI, FundRef, Wikidata) are resolved to ROR
/// IDs via the local `organizations` SQLite table.
///
/// # Errors
/// Propagates any error returned by `formats::orcid::fetch_person_employments`.
pub fn fetch_orcid_employments(
    orcid_url: &str,
    db_path: Option<&std::path::Path>,
) -> Result<Vec<PersonAffiliation>> {
    formats::orcid::fetch_person_employments(orcid_url, db_path)
}

/// Fetch employment **and** education records from the ORCID public API, returning
/// them as a combined list sorted by start date. Supersedes [`fetch_orcid_employments`]
/// when both affiliation types are needed.
///
/// `db_path` is forwarded unchanged — NOTE(review): presumably used for ROR
/// resolution as in [`fetch_orcid_employments`]; confirm in `formats::orcid`.
///
/// # Errors
/// Propagates any error returned by `formats::orcid::fetch_person_affiliations`.
pub fn fetch_orcid_affiliations(
    orcid_url: &str,
    db_path: Option<&std::path::Path>,
) -> Result<Vec<PersonAffiliation>> {
    formats::orcid::fetch_person_affiliations(orcid_url, db_path)
}

/// Read affiliations stored in the `affiliations` column of the `people` SQLite table.
/// Returns an empty vec when the record is absent or the column is empty.
///
/// Unlike most lookups in this module, this function returns a plain `Vec`
/// and therefore has no way to report an error to the caller.
pub fn fetch_orcid_affiliations_sqlite(
    orcid_url: &str,
    db_path: &std::path::Path,
) -> Vec<PersonAffiliation> {
    formats::orcid::fetch_person_affiliations_sqlite(orcid_url, db_path)
}

/// Fetch the DOIs of all works listed on an ORCID profile, returned as
/// normalised `https://doi.org/…` URLs in response order.
///
/// # Errors
/// Propagates any error returned by `formats::orcid::fetch_orcid_work_dois`.
pub fn fetch_orcid_work_dois(orcid_url: &str) -> Result<Vec<String>> {
    formats::orcid::fetch_orcid_work_dois(orcid_url)
}

/// Fetch works by ORCID from Crossref, sorted by date descending.
/// `page` is 1-based; Crossref offset is computed as `(page-1) * limit`.
///
/// # Errors
/// Propagates any error returned by `formats::crossref::fetch_by_orcid`.
pub fn fetch_crossref_by_orcid(orcid_url: &str, limit: usize, page: usize) -> Result<Vec<Data>> {
    formats::crossref::fetch_by_orcid(orcid_url, limit, page)
}

/// Fetch all works by ORCID from Crossref using cursor-based pagination.
///
/// # Errors
/// Propagates any error returned by `formats::crossref::fetch_all_by_orcid`.
pub fn fetch_all_crossref_by_orcid(orcid_url: &str) -> Result<Vec<Data>> {
    formats::crossref::fetch_all_by_orcid(orcid_url)
}

/// Fetch works by ORCID from DataCite, sorted by date descending.
/// `page` is 1-based and maps directly to DataCite's `page[number]` parameter.
///
/// # Errors
/// Propagates any error returned by `formats::datacite::fetch_by_orcid`.
pub fn fetch_datacite_by_orcid(orcid_url: &str, limit: usize, page: usize) -> Result<Vec<Data>> {
    formats::datacite::fetch_by_orcid(orcid_url, limit, page)
}

/// Fetch all works by ORCID from DataCite, iterating pages until exhausted.
///
/// # Errors
/// Propagates any error returned by `formats::datacite::fetch_all_by_orcid`.
pub fn fetch_all_datacite_by_orcid(orcid_url: &str) -> Result<Vec<Data>> {
    formats::datacite::fetch_all_by_orcid(orcid_url)
}

/// Fetch works by ROR from Crossref, sorted by date descending.
/// `page` is 1-based; Crossref offset is computed as `(page-1) * limit`.
///
/// # Errors
/// Propagates any error returned by `formats::crossref::fetch_by_ror`.
pub fn fetch_crossref_by_ror(ror_url: &str, limit: usize, page: usize) -> Result<Vec<Data>> {
    formats::crossref::fetch_by_ror(ror_url, limit, page)
}

/// Fetch works by ROR from DataCite, sorted by date descending.
/// `page` is 1-based and maps directly to DataCite's `page[number]` parameter.
///
/// # Errors
/// Propagates any error returned by `formats::datacite::fetch_by_ror`.
pub fn fetch_datacite_by_ror(ror_url: &str, limit: usize, page: usize) -> Result<Vec<Data>> {
    formats::datacite::fetch_by_ror(ror_url, limit, page)
}

/// Convert ORCID 3.0 person JSON + resolved affiliations + works to a commonmeta
/// array validated against the commonmeta v1.0 schema. `works` may be empty.
///
/// Returns the serialized bytes produced by
/// `formats::orcid::write_commonmeta_person`.
///
/// # Errors
/// Propagates any error returned by `formats::orcid::write_commonmeta_person`.
pub fn write_orcid_commonmeta(
    person_json: &serde_json::Value,
    affiliations: &[PersonAffiliation],
    works: &[Data],
) -> Result<Vec<u8>> {
    formats::orcid::write_commonmeta_person(person_json, affiliations, works)
}

/// Serialize a person to InvenioRDM names YAML (list form). `person_json` is the
/// ORCID 3.0 `/person` response; `affiliations` from [`fetch_orcid_employments`].
///
/// # Errors
/// Propagates any error returned by `formats::orcid::write_inveniordm_person_yaml`.
pub fn write_orcid_inveniordm_yaml(
    person_json: &serde_json::Value,
    affiliations: &[PersonAffiliation],
) -> Result<Vec<u8>> {
    formats::orcid::write_inveniordm_person_yaml(person_json, affiliations)
}

/// Look up a person from a local `people` SQLite table and return their record as [`Data`].
/// Accepts a bare ORCID iD or a full ORCID URL.
/// Handles both XML blobs (bulk import) and JSON blobs (single-record API import).
///
/// # Errors
/// Propagates any error returned by `formats::orcid::fetch_sqlite`.
pub fn fetch_orcid_sqlite(id: &str, db_path: &std::path::Path) -> Result<Data> {
    formats::orcid::fetch_sqlite(id, db_path)
}

/// Look up a person from a local `people` SQLite table and return the raw ORCID 3.0
/// person JSON conforming to `orcid_schema_v3.0.json`.
///
/// # Errors
/// Propagates any error returned by `formats::orcid::fetch_person_json_sqlite`.
pub fn fetch_orcid_person_json_sqlite(id: &str, db_path: &std::path::Path) -> Result<serde_json::Value> {
    formats::orcid::fetch_person_json_sqlite(id, db_path)
}

/// Fetch a single person record from the ORCID public API, upsert the person
/// into `people_db`, and fetch their works from Crossref and DataCite and
/// upsert them into `works_db` (may be the same path as `people_db`).
/// Accepts a bare ORCID iD or a full ORCID URL.
/// Returns the number of works written.
///
/// Delegates to `formats::orcid::import_person`; all arguments are forwarded
/// unchanged.
///
/// # Errors
///
/// Returns whatever error the underlying import reports.
pub fn import_orcid_person(
    id: &str,
    people_db: &std::path::Path,
    works_db: &std::path::Path,
) -> Result<usize> {
    formats::orcid::import_person(id, people_db, works_db)
}

/// Fetch the latest ORCID Public Data File release metadata from figshare.
pub use formats::orcid::fetch_latest_orcid_release;

/// Read the installed ORCID Public Data File version from the `settings` table.
pub use formats::orcid::fetch_installed_orcid_public_data_version;

/// Drop and rebuild the `people_fts` FTS5 virtual table.
///
/// `path` is the SQLite database file to operate on. Delegates to
/// `formats::orcid::rebuild_people_fts`.
///
/// # Errors
///
/// Returns whatever error the underlying rebuild reports.
pub fn rebuild_people_fts(path: &std::path::Path) -> Result<()> {
    formats::orcid::rebuild_people_fts(path)
}

/// Download and import the ORCID Public Data File summaries into the `people`
/// table at `output_path`. Skips the download if the current version is already
/// installed; resumes partial downloads automatically.
///
/// `source` is `None` (auto-discover), a local file path, or a direct URL.
/// `no_network` and `sample` are forwarded unchanged to
/// `formats::orcid::import_orcid_public_data`; their exact effect is defined
/// there and is not visible from this wrapper.
///
/// Returns the count reported by the underlying importer (presumably the
/// number of imported records — confirm against `formats::orcid`).
///
/// # Errors
///
/// Returns whatever error the underlying import reports.
pub fn import_orcid_public_data(
    output_path: &std::path::Path,
    source: Option<&str>,
    no_network: bool,
    sample: bool,
) -> Result<usize> {
    formats::orcid::import_orcid_public_data(output_path, source, no_network, sample)
}

/// Like [`stream_pidbox_to_sqlite`] but reads directly from the
/// zstd-compressed pidbox file without decompressing it to disk first.
/// Requires the database to be well-organised (VACUUM'd or sequential bulk
/// inserts) so that pages appear in DFS pre-order.
///
/// `zst_path` is the compressed input, `output_path` the target database and
/// `limit` is forwarded unchanged. Delegates to
/// `formats::sqlite_stream::stream_zst_pidbox_to_sqlite`.
///
/// # Errors
///
/// Returns whatever error the underlying streaming import reports.
pub fn stream_zst_pidbox_to_sqlite(
    zst_path: &std::path::Path,
    output_path: &std::path::Path,
    limit: usize,
) -> Result<usize> {
    // NOTE(review): the trailing `true` is a fixed flag for the inner
    // function's fourth parameter; its meaning is not visible here — confirm
    // against `formats::sqlite_stream` before changing it.
    formats::sqlite_stream::stream_zst_pidbox_to_sqlite(zst_path, output_path, limit, true)
}

/// Stream a gzip-compressed PMC-ids CSV file into the commonmeta SQLite
/// database at `output_path`, upserting rows that have a DOI. Pass `limit = 0`
/// to process all rows. Returns the number of records written.
///
/// `no_network` is forwarded unchanged to
/// `formats::pubmed::stream_pmc_ids_to_sqlite`; its exact effect is defined
/// there and is not visible from this wrapper.
///
/// # Errors
///
/// Returns whatever error the underlying streaming import reports.
pub fn stream_pmc_ids_to_sqlite(
    gz_path: &std::path::Path,
    output_path: &std::path::Path,
    limit: usize,
    no_network: bool,
) -> Result<usize> {
    formats::pubmed::stream_pmc_ids_to_sqlite(gz_path, output_path, limit, no_network)
}

/// Render a list of records to `to` format as a single buffer.
///
/// The formats `commonmeta`, `csl`, `datacite`, `inveniordm`, `schemaorg`,
/// `ror`, `citation`, `crossref_xml` and `datacite_xml` are handed to the
/// batch writer in one call (for example a JSON array for `commonmeta`, or a
/// single `<doi_batch>` document for `crossref_xml`). Every other format
/// (e.g. `bibtex`, `ris`) is rendered record by record and the results are
/// joined with newlines.
///
/// Equivalent to [`write_list_citation`] with no `style` and no `locale`.
///
/// # Errors
///
/// Returns the first error reported by the underlying writer.
pub fn write_list(list: &[Data], to: &str) -> Result<Vec<u8>> {
    write_list_citation(list, to, None, None)
}

/// Like [`write_list`], but forwards the CSL `style`/`locale` to the writer.
///
/// Formats listed in `BATCH_FORMATS` are rendered by a single call to the
/// batch writer over the whole `list`. Any other format is rendered one
/// record at a time; each rendered record is decoded lossily as UTF-8 and the
/// pieces are joined with a single `\n` (no trailing newline).
///
/// A "rendering" progress bar sized to `list.len()` is shown while working;
/// it advances once per record on the record-by-record path and is cleared
/// on success.
///
/// # Errors
///
/// Returns the first error reported by the underlying writer; rendering stops
/// at that point.
pub fn write_list_citation(
    list: &[Data],
    to: &str,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<u8>> {
    // Formats whose writer consumes the whole list in one call.
    const BATCH_FORMATS: [&str; 9] = [
        "commonmeta",
        "csl",
        "datacite",
        "inveniordm",
        "schemaorg",
        "ror",
        "citation",
        "crossref_xml",
        "datacite_xml",
    ];

    let bar = progress::count_bar("rendering", list.len() as u64);

    let bytes = if BATCH_FORMATS.contains(&to) {
        formats::write_all_citation(to, list, style, locale)?
    } else {
        let mut pieces: Vec<String> = Vec::with_capacity(list.len());
        for record in list {
            let rendered = formats::write_citation(to, record, style, locale)?;
            pieces.push(String::from_utf8_lossy(&rendered).into_owned());
            bar.inc(1);
        }
        pieces.join("\n").into_bytes()
    };

    bar.finish_and_clear();
    Ok(bytes)
}

/// Render `list` to `to` format, split into entries of at most `batch_size`
/// records each — suitable for packing into an archive via
/// [`io_utils::write_zip_archive`]/[`io_utils::write_tar_gz_archive`].
/// `base_name` (e.g. `"out.json"`) names the single entry directly when
/// there's only one batch, or gets a numbered suffix (`"out-00000.json"`,
/// `"out-00001.json"`, ...) when there are several.
///
/// Equivalent to [`write_archive_citation`] with no `style` and no `locale`.
/// Returns `(entry_name, bytes)` pairs in batch order.
///
/// # Errors
///
/// Returns an error when `list` is empty, or the first error reported while
/// rendering a batch.
pub fn write_archive(
    list: &[Data],
    to: &str,
    base_name: &str,
    batch_size: usize,
) -> Result<Vec<(String, Vec<u8>)>> {
    write_archive_citation(list, to, base_name, batch_size, None, None)
}

/// Like [`write_archive`], but forwards the CSL `style`/`locale` to
/// [`write_list_citation`] for every batch.
///
/// `list` is cut into consecutive batches of at most `batch_size` records (a
/// `batch_size` of `0` is treated as `1`). Each batch is rendered on its own
/// and paired with an entry name: `base_name` itself when everything fits in
/// one batch, otherwise a numbered variant of it.
///
/// # Errors
///
/// Returns `Error::Serialize` when `list` is empty, or the first error
/// reported while rendering a batch.
pub fn write_archive_citation(
    list: &[Data],
    to: &str,
    base_name: &str,
    batch_size: usize,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<(String, Vec<u8>)>> {
    if list.is_empty() {
        return Err(Error::Serialize("no records to write".to_string()));
    }

    let per_entry = batch_size.max(1);
    // More than one batch exists exactly when the list outgrows one batch.
    let numbered = list.len() > per_entry;

    list.chunks(per_entry)
        .enumerate()
        .map(|(n, records)| -> Result<(String, Vec<u8>)> {
            let payload = write_list_citation(records, to, style, locale)?;
            let suffix = if numbered { Some(n) } else { None };
            Ok((batch_entry_name(base_name, suffix), payload))
        })
        .collect()
}

/// Build the entry name for a batch.
///
/// * `idx == None` — returns `base_name` unchanged.
/// * `idx == Some(i)` — inserts a zero-padded, five-digit index between the
///   file stem and the extension: `{dir}{stem}-{i:05}.{ext}`, or
///   `{dir}{stem}-{i:05}` when `base_name` has no (or an empty) extension.
///
/// Any directory prefix in `base_name` (e.g. `"dir/"` in `"dir/out.json"`) is
/// kept, so numbered entries land in the same place inside the archive as the
/// single-batch entry would.
fn batch_entry_name(base_name: &str, idx: Option<usize>) -> String {
    let i = match idx {
        Some(i) => i,
        None => return base_name.to_string(),
    };

    let path = std::path::Path::new(base_name);

    // Everything in front of the final path component. Falls back to "" when
    // there is no final component or `base_name` does not literally end with
    // it (e.g. a trailing separator), which reproduces the prefix-less name.
    let dir = path
        .file_name()
        .and_then(|name| name.to_str())
        .and_then(|name| base_name.strip_suffix(name))
        .unwrap_or("");
    let stem = path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or_default();

    match path.extension().and_then(|e| e.to_str()) {
        Some(ext) if !ext.is_empty() => format!("{}{}-{:05}.{}", dir, stem, i, ext),
        _ => format!("{}{}-{:05}", dir, stem, i),
    }
}

/// Read commonmeta records from a VRAIX daily dump SQLite file already on
/// disk at `sqlite_path`, e.g. an already-downloaded `crossref-2026-06-14.sqlite3`.
///
/// `from` ("crossref" or "datacite") picks how every row is parsed — VRAIX
/// dumps are single-source per file, so this isn't read from the data
/// itself. `limit: None` reads every row; `Some(n)` reads `n` rows starting
/// at `offset`.
///
/// Delegates to `formats::vraix::read_dump`; all arguments are forwarded
/// unchanged.
///
/// # Errors
///
/// Returns whatever error the underlying dump reader reports.
pub fn read_vraix_sqlite(
    sqlite_path: &str,
    from: &str,
    limit: Option<usize>,
    offset: usize,
) -> Result<Vec<Data>> {
    formats::vraix::read_dump(sqlite_path, from, limit, offset)
}

/// Write a VRAIX dump's transport table (e.g. `pid_records`) to a single
/// Parquet file's bytes, using its raw columns (`pid`, `source_id`,
/// `raw_metadata`, ...) as-is — *not* converted to commonmeta `Data` the way
/// [`read_vraix_sqlite`] is. For analytics over the dump itself (e.g. via
/// DataFusion/Polars/DuckDB), not for ingesting it as commonmeta records.
/// `batch_size` controls how many rows land in each internal Parquet row
/// group (see [`formats::commonmeta::write_parquet_all`]'s analogous
/// `ROW_GROUP_SIZE` for why this matters for large dumps).
///
/// Delegates to `formats::vraix::write_table_parquet`.
///
/// # Errors
///
/// Returns whatever error the underlying Parquet writer reports.
pub fn write_vraix_table_parquet(sqlite_path: &str, batch_size: usize) -> Result<Vec<u8>> {
    formats::vraix::write_table_parquet(sqlite_path, batch_size)
}

/// Fetch commonmeta records from a VRAIX daily dump for `from` ("crossref"
/// or "datacite") and `date` (YYYY-MM-DD).
///
/// With `input_path`, the local SQLite file at that path is read directly
/// via [`read_vraix_sqlite`] (e.g. an already-downloaded dump); otherwise
/// `{from}-{date}.sqlite3.zst` is downloaded from metadata.vraix.org via
/// [`io_utils::ensure_cached_path`] and decompressed into a temp file.
///
/// `limit`/`offset` window the rows read from the dump; `limit: None` reads
/// every row.
pub fn fetch_vraix_dump(
    from: &str,
    date: &str,
    input_path: Option<&str>,
    limit: Option<usize>,
    offset: usize,
    cache_ttl: std::time::Duration,
) -> Result<Vec<Data>> {
    if let Some(path) = input_path {
        return read_vraix_sqlite(path, from, limit, offset);
    }

    let url = format!("https://metadata.vraix.org/{}-{}.sqlite3.zst", from, date);
    let cache_key = format!("{}-{}.sqlite3.zst", from, date);
    let (zst_path, _from_cache) =
        io_utils::ensure_cached_path(&url, "vraix", &cache_key, cache_ttl)
            .map_err(|e| Error::Http(format!("failed to download '{}': {}", url, e)))?;

    let tmp_path = std::env::temp_dir().join(format!(
        "commonmeta-vraix-{}-{}-{}.sqlite3",
        from,
        date,
        std::process::id()
    ));
    io_utils::decompress_zst_file(&zst_path, &tmp_path)
        .map_err(|e| Error::Parse(format!("failed to decompress '{}': {}", url, e)))?;

    let result = read_vraix_sqlite(tmp_path.to_str().unwrap(), from, limit, offset);
    std::fs::remove_file(&tmp_path).ok();
    result
}

/// Create-or-update, then publish, a list of records in InvenioRDM.
///
/// This performs real, network-visible writes against `host` (a live record
/// is created/updated and published) using `token` for Bearer authentication.
/// Registration with other services (Crossref, DataCite) is not yet supported.
///
/// Delegates to `formats::inveniordm::upsert_all`. The function itself does
/// not return a `Result`; outcomes are carried in the returned [`PushResult`]
/// values (presumably one per input record — confirm against
/// `formats::inveniordm`).
pub fn push_inveniordm(list: &[Data], host: &str, token: &str) -> Vec<PushResult> {
    formats::inveniordm::upsert_all(list, host, token)
}

/// Create-or-update, then publish, a single record in InvenioRDM.
///
/// This performs a real, network-visible write against `host` (a live record
/// is created/updated and published) using `token` for Bearer authentication.
/// Registration with other services (Crossref, DataCite) is not yet supported.
///
/// Delegates to `formats::inveniordm::upsert`. The function itself does not
/// return a `Result`; the outcome is carried in the returned [`PushResult`].
pub fn put_inveniordm(data: &Data, host: &str, token: &str) -> PushResult {
    formats::inveniordm::upsert(data, host, token)
}

/// Run any commonmeta CLI subcommand from a list of arguments.
///
/// `args[0]` should be the program name (e.g. `"commonmeta"`), followed by
/// the subcommand and its flags — exactly what `std::env::args()` produces.
/// Returns `Err(message)` on command failure so callers (including Python via
/// PyO3) can propagate the error without calling `process::exit`.
pub fn run_cli(args: Vec<String>) -> std::result::Result<(), String> {
    let matches = clap::Command::new("commonmeta")
        .version(env!("CARGO_PKG_VERSION"))
        .author("Front Matter <info@front-matter.de>")
        .about("Commonmeta")
        .subcommand(cmd::convert::command())
        .subcommand(cmd::decode::command())
        .subcommand(cmd::dump::command())
        .subcommand(cmd::encode::command())
        .subcommand(cmd::import::command())
        .subcommand(cmd::list::command())
        .subcommand(cmd::r#match::command())
        .subcommand(cmd::migrate::command())
        .subcommand(cmd::push::command())
        .subcommand(cmd::put::command())
        .subcommand(cmd::settings::command())
        .subcommand(cmd::validate::command())
        .get_matches_from(args);

    match matches.subcommand() {
        Some(("convert",  sub)) => cmd::convert::execute(sub),
        Some(("decode",   sub)) => cmd::decode::execute(sub),
        Some(("package",  sub)) => cmd::dump::execute(sub),
        Some(("encode",   sub)) => cmd::encode::execute(sub),
        Some(("import",   sub)) => cmd::import::execute(sub),
        Some(("list",     sub)) => cmd::list::execute(sub),
        Some(("match",    sub)) => cmd::r#match::execute(sub),
        Some(("migrate",  sub)) => cmd::migrate::execute(sub),
        Some(("push",     sub)) => cmd::push::execute(sub),
        Some(("put",      sub)) => cmd::put::execute(sub),
        Some(("settings", sub)) => cmd::settings::execute(sub),
        Some(("validate", sub)) => cmd::validate::execute(sub),
        _ => Ok(()),
    }
}

// Unit tests for the list/archive rendering helpers and the local-file path
// of `fetch_vraix_dump`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a minimal `JournalArticle` record with the given `id`; every
    /// other field keeps its `Data::default()` value.
    fn sample_data(id: &str) -> Data {
        Data {
            id: id.to_string(),
            type_: "JournalArticle".to_string(),
            ..Data::default()
        }
    }

    // `commonmeta` output for a list must parse as a JSON array with one
    // element per record.
    #[test]
    fn test_write_list_json_array_formats() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
        ];
        let bytes = write_list(&list, "commonmeta").unwrap();
        let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(value.as_array().unwrap().len(), 2);
    }

    // `ris` is rendered record by record; each record contributes one
    // `TY  -` line.
    #[test]
    fn test_write_list_newline_joined_formats() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
        ];
        let bytes = write_list(&list, "ris").unwrap();
        let text = String::from_utf8(bytes).unwrap();
        // Two records, newline-joined rather than a JSON array.
        assert_eq!(text.lines().filter(|l| l.starts_with("TY  -")).count(), 2);
    }

    // `crossref_xml` must produce a single `<doi_batch>` wrapping both
    // articles rather than one document per record.
    #[test]
    fn test_write_list_crossref_xml_batches_into_one_doi_batch() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
        ];
        let bytes = write_list(&list, "crossref_xml").unwrap();
        let text = String::from_utf8(bytes).unwrap();
        assert_eq!(text.matches("<doi_batch xmlns=").count(), 1);
        assert_eq!(text.matches("<journal_article").count(), 2);
    }

    // `ror` output for a list must parse as a JSON array with one element per
    // record.
    #[test]
    fn test_write_list_ror_uses_json_array_batch_writer() {
        let mut a = sample_data("https://ror.org/0342dzm54");
        a.title = "Org A".to_string();
        let mut b = sample_data("https://ror.org/0521rfr06");
        b.title = "Org B".to_string();

        let bytes = write_list(&[a, b], "ror").unwrap();
        let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(value.as_array().unwrap().len(), 2);
    }

    // `citation` output must contain one line per record, in input order.
    #[test]
    fn test_write_list_citation_renders_each_record() {
        let mut a = sample_data("https://doi.org/10.1/a");
        a.title = "Title A".to_string();
        a.date_published = "2020".to_string();
        let mut b = sample_data("https://doi.org/10.1/b");
        b.title = "Title B".to_string();
        b.date_published = "2021".to_string();

        let bytes = write_list(&[a, b], "citation").unwrap();
        let text = String::from_utf8(bytes).unwrap();
        let lines: Vec<&str> = text.lines().collect();
        assert_eq!(lines.len(), 2);
        assert!(lines[0].contains("Title A"));
        assert!(lines[1].contains("Title B"));
    }

    // Rendering with no style and with an explicit `chicago-author-date`
    // style must yield different bytes, i.e. `style` reaches the writer.
    #[test]
    fn test_write_list_citation_respects_style() {
        let mut a = sample_data("https://doi.org/10.1/a");
        a.title = "Title A".to_string();
        a.date_published = "2020".to_string();

        let apa = write_list_citation(&[a.clone()], "citation", None, None).unwrap();
        let chicago =
            write_list_citation(&[a], "citation", Some("chicago-author-date"), None).unwrap();
        assert_ne!(apa, chicago);
    }

    // A list that fits in one batch yields a single entry named exactly
    // `base_name`.
    #[test]
    fn test_write_archive_single_batch_uses_base_name() {
        let list = vec![sample_data("https://doi.org/10.1/a")];
        let entries = write_archive(&list, "commonmeta", "out.json", 100_000).unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].0, "out.json");
    }

    // With `batch_size = 1`, three records yield three entries numbered from
    // zero with a five-digit suffix before the extension.
    #[test]
    fn test_write_archive_numbered_batches() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
            sample_data("https://doi.org/10.1/c"),
        ];
        let entries = write_archive(&list, "commonmeta", "out.json", 1).unwrap();
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].0, "out-00000.json");
        assert_eq!(entries[1].0, "out-00001.json");
        assert_eq!(entries[2].0, "out-00002.json");
    }

    // A `base_name` without an extension gets the numeric suffix only, with
    // no trailing dot.
    #[test]
    fn test_write_archive_no_extension_base_name() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
        ];
        let entries = write_archive(&list, "commonmeta", "out", 1).unwrap();
        assert_eq!(entries[0].0, "out-00000");
        assert_eq!(entries[1].0, "out-00001");
    }

    // An empty list is rejected instead of producing an empty archive.
    #[test]
    fn test_write_archive_empty_list_errors() {
        assert!(write_archive(&[], "commonmeta", "out.json", 100_000).is_err());
    }

    // With `input_path` set, `fetch_vraix_dump` must read the given SQLite
    // file; the fixture is a one-row `works` table created on the fly.
    #[test]
    fn test_fetch_vraix_dump_uses_local_input_path_without_network() {
        let dir = std::env::temp_dir().join("commonmeta_lib_fetch_vraix_dump");
        std::fs::create_dir_all(&dir).unwrap();
        let path = dir.join("datacite.sqlite3");
        // Start from a clean file in case an earlier run left one behind.
        std::fs::remove_file(&path).ok();

        // Scope the connection so it is closed before the dump is read.
        {
            let conn = rusqlite::Connection::open(&path).unwrap();
            conn.execute_batch("CREATE TABLE works (pid TEXT, source_id INTEGER, raw_metadata TEXT);")
                .unwrap();
            conn.execute(
                "INSERT INTO works (pid, source_id, raw_metadata) VALUES (?1, ?2, ?3)",
                rusqlite::params![
                    "pid-0",
                    1i64,
                    r#"{"data":{"id":"10.5678/b","attributes":{"doi":"10.5678/b"}}}"#
                ],
            )
            .unwrap();
        }

        let data = fetch_vraix_dump(
            "datacite",
            "2026-06-14",
            Some(path.to_str().unwrap()),
            None,
            0,
            std::time::Duration::from_secs(30 * 24 * 60 * 60),
        )
        .unwrap();
        assert_eq!(data.len(), 1);
        assert_eq!(data[0].id, "https://doi.org/10.5678/b");

        std::fs::remove_dir_all(&dir).ok();
    }
}