// commonmeta 0.8.2

/*
 * Copyright © 2026 Front Matter <info@front-matter.de>
 */

use std::path::Path;
use std::time::Instant;

use clap::{Arg, ArgAction, ArgMatches, Command};

use commonmeta::{self, io_utils};

use crate::cmd::{
    resolve_db_path, CROSSREF_ANNUAL_TORRENT_URL,
    DATACITE_ANNUAL_HOST,
    PIDBOX_CACHE_KEY, PIDBOX_URL, VRAIX_CACHE_TTL,
};
use crate::cmd::convert::detect_format;
use crate::cmd::list::{fetch_list_from_api, fmt_wrote_sqlite};

/// Build the clap definition of the `import` subcommand.
///
/// The returned [`Command`] declares the optional positional `input`, the
/// `--from` source selector, the API filter options, and the boolean switches
/// (`--sample`, `--s3`, `--no-network`, `--has-*`, …) that `execute` reads.
pub fn command() -> Command {
    Command::new("import")
        .about("Import scholarly metadata into the local commonmeta database")
        .long_about(
            "Download and import scholarly metadata into the local commonmeta SQLite \
            database (always upserts — existing records are updated, not replaced).\n\n\
            The output path defaults to the COMMONMETA_DB environment variable or \
            the platform default (~/Library/Application Support/commonmeta/commonmeta.sqlite3 \
            on macOS, /var/lib/commonmeta/commonmeta.sqlite3 on Linux).\n\n\
            Single record:\n\
            commonmeta import 10.7554/elife.01561\n\
            commonmeta import https://doi.org/10.7554/elife.01561\n\n\
            Annual public data files:\n\
            commonmeta import --from crossref           # March 2026 corpus (~223 GB) via Academic Torrents\n\
            commonmeta import --from crossref --sample  # first 5 torrent files (~40 MB)\n\
            commonmeta import --from crossref --s3      # same corpus from S3 requester-pays bucket (~$18)\n\
            commonmeta import \"https://datafiles.datacite.org/datafiles/public-2025/download?token=<TOKEN>\"\n\
                                                        # DataCite 2025 (108 M records, 33 GB); token from\n\
                                                        # https://datafiles.datacite.org/datafiles/public-2025\n\
                                                        # TAR cached at ~/Library/Caches/commonmeta/datacite/public-2025.tar\n\
            commonmeta import --from datacite           # re-import from cached TAR (no token needed)\n\
            commonmeta import --from datacite --sample  # random sample via the DataCite API (max 1000)\n\n\
            Daily VRAIX dumps:\n\
            commonmeta import --from crossref --date 2026-06-15\n\
            commonmeta import --from datacite --date 2026-06-15\n\
            commonmeta import crossref-2026-06-15.sqlite3\n\n\
            API fetch:\n\
            commonmeta import --from crossref --number 100 --member 78\n\
            commonmeta import --from datacite --number 100 --client cern.zenodo\n\
            commonmeta import --from openalex --number 100 --type journal-article\n\n\
            Vocabulary installs:\n\
            commonmeta import --from ror\n\
            commonmeta import --from pidbox",
        )
        .arg(
            Arg::new("input")
                .help("DOI, VRAIX SQLite path, or DataCite annual download URL (auto-detected)")
                .required(false)
                .index(1),
        )
        .arg(
            Arg::new("from")
                .long("from")
                .short('f')
                .help("Source format: crossref, datacite, openalex, pidbox, ror")
                .default_value("commonmeta"),
        )
        .arg(
            Arg::new("number")
                .long("number")
                .help("Number of records to fetch via API (file and date inputs always import all)")
                .value_parser(clap::value_parser!(usize))
                .default_value("0"),
        )
        .arg(
            Arg::new("page")
                .long("page")
                .help("Page number for API fetches (1-based)")
                .value_parser(clap::value_parser!(usize))
                .default_value("1"),
        )
        .arg(Arg::new("member").long("member").help("Crossref member ID"))
        .arg(Arg::new("client").long("client").help("DataCite client ID"))
        .arg(Arg::new("type").long("type").help("Work type filter"))
        .arg(Arg::new("year").long("year").help("Publication year"))
        .arg(Arg::new("language").long("language").help("Language filter"))
        .arg(Arg::new("orcid").long("orcid").help("Filter by ORCID"))
        .arg(Arg::new("ror").long("ror").help("Filter by ROR"))
        .arg(Arg::new("affiliation").long("affiliation").help("Affiliation name filter"))
        .arg(Arg::new("country").long("country").help("Country code filter"))
        .arg(Arg::new("date-updated").long("date-updated").help("Filter by date updated (YYYY-MM-DD)"))
        .arg(Arg::new("from-host").long("from-host").help("InvenioRDM source host"))
        .arg(Arg::new("from-token").long("from-token").help("InvenioRDM source API token"))
        .arg(Arg::new("community").long("community").help("InvenioRDM community slug"))
        .arg(Arg::new("subject").long("subject").help("Subject area filter"))
        .arg(Arg::new("depositor").long("depositor").help("Crossref depositor name"))
        .arg(Arg::new("registrant").long("registrant").help("Crossref registrant name"))
        .arg(
            Arg::new("email")
                .long("email")
                .help("Email for OpenAlex mailto parameter"),
        )
        .arg(
            Arg::new("sample")
                .long("sample")
                .help(
                    "Crossref annual torrent: downloads first 5 torrent files (~40 MB). \
                    Crossref --s3: processes first 1000 records from cached TAR (cache must exist). \
                    DataCite annual URL: streams TAR and stops after 1000 records. \
                    --from datacite (no URL): uses DataCite API random sample (max 1000) instead of the cached TAR. \
                    Other API fetches: return random works (crossref: max 100, openalex: max 200).",
                )
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("has-orcid")
                .long("has-orcid")
                .help("Filter for records with ORCID")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("has-ror-id")
                .long("has-ror-id")
                .help("Filter for records with ROR")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("has-references")
                .long("has-references")
                .help("Filter for records with references")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("has-relation")
                .long("has-relation")
                .help("Filter for records with relation")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("has-abstract")
                .long("has-abstract")
                .help("Filter for records with abstract")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("has-award")
                .long("has-award")
                .help("Filter for records with award")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("has-license")
                .long("has-license")
                .help("Filter for records with license")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("has-archive")
                .long("has-archive")
                .help("Filter for records with archive")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("is-archived")
                .long("is-archived")
                .help("Filter for archived records")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("vocabulary")
                .long("vocabulary")
                .help("Output as vocabulary (e.g. InvenioRDM affiliations YAML)")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("match")
                .long("match")
                .help("Enable ROR affiliation matching when reading crossref and datacite records")
                .default_value("true")
                .value_parser(clap::value_parser!(bool)),
        )
        .arg(Arg::new("date").long("date").help(
            "Date (YYYY-MM-DD) of a VRAIX daily dump; downloads \
            {from}-{date}.sqlite3.zst from metadata.vraix.org when no input \
            file path is given",
        ))
        .arg(
            Arg::new("s3")
                .long("s3")
                .help(
                    "Crossref annual: download from the Crossref S3 bucket \
                    (s3://api-snapshots-reqpays-crossref) instead of Academic Torrents. \
                    Requires the AWS CLI to be installed and configured with credentials; \
                    the requester pays bandwidth costs (~$18 for the full 2026 file). \
                    With --sample, processes the first 1000 records from the cached TAR \
                    (cache must exist; run without --sample first to download).",
                )
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("no-network")
                .long("no-network")
                .help("Disable all outbound network requests; only local .sqlite3 file imports are allowed")
                .action(ArgAction::SetTrue),
        )
}

/// Run the `import` subcommand by dispatching to the first import strategy
/// that matches the parsed arguments.
///
/// Strategies are tried in this order:
/// 1. positional `.torrent` file,
/// 2. positional DataCite annual download URL,
/// 3. `ror` / `pidbox` installs,
/// 4. VRAIX SQLite dump (local file, or `--date` download),
/// 5. Crossref annual data file (Academic Torrents, or S3 with `--s3`),
/// 6. DataCite annual data file from the cached TAR,
/// 7. commonmeta SQLite → commonmeta SQLite merge,
/// 8. single record by identifier,
/// 9. API fetch followed by an upsert.
///
/// # Errors
///
/// Returns a human-readable message when the source is unsupported, when
/// `--no-network` is combined with anything other than a local `.sqlite3`
/// input, when the cached DataCite TAR is missing, or when the selected
/// strategy itself fails.
pub fn execute(matches: &ArgMatches) -> Result<(), String> {
    let input_path = matches.get_one::<String>("input").map(String::as_str);
    let date = matches.get_one::<String>("date").map(String::as_str);
    let no_network = matches.get_flag("no-network");
    let sample = matches.get_flag("sample");
    let from_explicit = matches
        .get_one::<String>("from")
        .map(String::as_str)
        .unwrap_or("commonmeta");

    // True when an optional string argument was supplied with a non-empty value.
    let has_value = |name: &str| {
        matches
            .get_one::<String>(name)
            .map_or(false, |s| !s.is_empty())
    };

    // Torrent path: download files via aria2c, then import each .jsonl.gz.
    if let Some(torrent) = input_path.filter(|p| p.ends_with(".torrent")) {
        return import_torrent(Path::new(torrent), from_explicit, no_network, None, None);
    }

    // DataCite annual data file: the positional input is the time-limited download URL
    // (obtained by submitting email at https://datafiles.datacite.org/datafiles/public-2025).
    let datacite_annual_url = input_path.filter(|p| {
        url::Url::parse(p)
            .ok()
            .and_then(|u| u.host_str().map(|h| h == DATACITE_ANNUAL_HOST))
            .unwrap_or(false)
    });
    if let Some(url) = datacite_annual_url {
        return import_datacite_annual(url, sample, no_network);
    }

    // Auto-detect source from VRAIX filename pattern ({source}-{date}.sqlite3).
    let is_sqlite_input = input_path
        .map(|p| io_utils::get_extension(p, ".json").1 == ".sqlite3")
        .unwrap_or(false);
    let filename_source: Option<&'static str> = if is_sqlite_input {
        input_path
            .and_then(|p| Path::new(p).file_stem()?.to_str())
            .and_then(|stem| {
                if stem.starts_with("crossref-") {
                    Some("crossref")
                } else if stem.starts_with("datacite-") {
                    Some("datacite")
                } else {
                    None
                }
            })
    } else {
        None
    };
    let from_flag: &str = filename_source.unwrap_or(from_explicit);

    // When a source-specific flag is provided but --from is not set, auto-select
    // the matching source (e.g. `import --member 78` implies --from crossref).
    let from_flag = if from_flag == "commonmeta" && input_path.is_none() {
        if has_value("member") {
            "crossref"
        } else if has_value("client") {
            "datacite"
        } else {
            from_flag
        }
    } else {
        from_flag
    };

    // Positional shorthand: `import ror` / `import pidbox` / `import crossref` /
    // `import datacite` etc. is treated as `import --from <source>` when --from was
    // not explicitly given and the positional arg is a known source name rather
    // than a DOI or file path.
    let (from, input_path) = match input_path {
        Some(s)
            if from_flag == "commonmeta"
                && !is_sqlite_input
                && matches!(s, "crossref" | "datacite" | "openalex" | "pidbox" | "ror") =>
        {
            (s, None)
        }
        _ => (from_flag, input_path),
    };

    if !matches!(from, "crossref" | "datacite" | "openalex" | "pidbox" | "ror" | "commonmeta") {
        return Err(format!(
            "import: unsupported --from value '{}' (supported: crossref, datacite, openalex, pidbox, ror)",
            from
        ));
    }

    // When --no-network is set, only a local VRAIX .sqlite3 file is accepted.
    // Everything else (DOI lookups, API fetches, date downloads, ror/pidbox installs)
    // requires outbound network access.
    if no_network && !(is_sqlite_input && input_path.is_some()) {
        return Err(
            "--no-network requires a local .sqlite3 input file; \
            provide a VRAIX dump path or remove --no-network"
                .to_string(),
        );
    }

    // ROR is a vocabulary install, not a metadata records import.
    if from == "ror" {
        let out_path = resolve_db_path(None);
        return install_ror(&out_path);
    }

    // pidbox is a full VRAIX dump installed directly into commonmeta.sqlite3.
    if from == "pidbox" {
        let out_path = resolve_db_path(None);
        return install_pidbox(&out_path);
    }

    let out_path = resolve_db_path(None);
    let is_vraix_sqlite = is_sqlite_input && matches!(from, "crossref" | "datacite");
    let is_date_download =
        date.is_some() && input_path.is_none() && matches!(from, "crossref" | "datacite");

    // Fast path: stream VRAIX SQLite → commonmeta SQLite without loading all
    // records into RAM. Always imports every row (limit=0). Always upserts.
    if is_vraix_sqlite || is_date_download {
        return import_vraix_fast(from, input_path, date, &out_path);
    }

    // Annual torrent/S3 path: `import --from crossref` with no date, no input file,
    // and no API-specific filters. Without --s3 uses Academic Torrents (aria2c);
    // with --s3 downloads from the Crossref S3 requester-pays bucket.
    let has_api_filters = has_value("member")
        || has_value("client")
        || has_value("orcid")
        || has_value("ror")
        || *matches.get_one::<usize>("number").unwrap_or(&0) > 0;
    let is_annual_crossref =
        from == "crossref" && input_path.is_none() && date.is_none() && !has_api_filters;
    if is_annual_crossref {
        if matches.get_flag("s3") {
            return import_crossref_s3(sample, no_network);
        }
        return import_annual_torrent(from, sample, no_network);
    }

    // Annual DataCite path: `import --from datacite` (or `import datacite`) with no
    // date and no API filters.  Uses the cached TAR if present; otherwise the user
    // must supply the download URL as the positional argument (handled earlier via
    // the DataCite annual URL check).  With --sample this path is skipped so the
    // request falls through to the API fetch below.
    let is_datacite_annual_cmd = from == "datacite"
        && input_path.is_none()
        && date.is_none()
        && !has_api_filters
        && !sample;
    if is_datacite_annual_cmd {
        let cache_path = io_utils::cache_dir("datacite").join("public-2025.tar");
        if !cache_path.exists() || cache_path.metadata().map(|m| m.len()).unwrap_or(0) == 0 {
            return Err(format!(
                "import: no cached DataCite 2025 TAR found at {}.\n\
                 Obtain a download URL from https://datafiles.datacite.org/datafiles/public-2025\n\
                 and run: commonmeta import \"<URL>\"",
                cache_path.display()
            ));
        }
        // `sample` is necessarily false here (see the condition above).
        return import_datacite_annual("", false, no_network);
    }

    // Commonmeta sqlite → commonmeta sqlite (merge/upsert).
    if is_sqlite_input && from == "commonmeta" {
        if let Some(src) = input_path {
            return import_commonmeta_sqlite(src, &out_path);
        }
    }

    // Single-record path: DOI, URL, or any identifier that isn't a file path.
    // Auto-detect the source format from the identifier when --from is not given.
    if let Some(identifier) = input_path {
        if !is_sqlite_input {
            let effective_from = if from_explicit == "commonmeta" {
                detect_format(identifier)
            } else {
                from_explicit.to_string()
            };
            return import_single(identifier, &effective_from, &out_path);
        }
    }

    // API fetch path: fetch records, then upsert into commonmeta SQLite.
    if from == "commonmeta" {
        return Err(
            "import: --from commonmeta requires an input .sqlite3 file path".to_string()
        );
    }
    let fetch_start = Instant::now();
    let data = fetch_list_from_api(matches, from)?;
    eprintln!(
        "import: fetch took {:.2?} ({} records)",
        fetch_start.elapsed(),
        data.len()
    );

    let out_sqlite = Path::new(&out_path);
    let write_start = Instant::now();
    commonmeta::upsert_sqlite(&data, out_sqlite).map_err(|e| e.to_string())?;
    let total = commonmeta::count_sqlite_works(out_sqlite).ok();
    eprintln!(
        "import: upsert took {:.2?} ({} records)",
        write_start.elapsed(),
        data.len()
    );
    println!("{}", fmt_wrote_sqlite(&out_path, data.len(), total));
    Ok(())
}

/// Number of files downloaded from the annual torrent when `--sample` is set.
///
/// The full annual file has ~28 700 files; 5 gives ~40 MB for a quick smoke-test.
const SAMPLE_FILE_COUNT: usize = 5;

/// Import the DataCite annual public data file into the local database.
///
/// The download is a plain TAR archive containing `.jsonl.gz` files.  It is
/// cached as `public-2025.tar` in the `datacite` cache directory
/// (`~/Library/Caches/commonmeta/datacite/` on macOS) so later imports do not
/// need a new download token.
///
/// * `url` — the time-limited download URL; only needed when the cache is
///   absent, and may be empty otherwise.
/// * `sample` — stop after `DATACITE_SAMPLE_LINES` records.  Without a cache
///   the archive is streamed straight from `url` and nothing is written to disk.
/// * `no_network` — refuse to download; only an existing cache can be used.
///
/// A full download is first written to a `.part` file next to the cache path
/// and renamed into place only after it completes, so an interrupted download
/// is never mistaken for a valid cache on the next run.
///
/// # Errors
///
/// Returns a message when the cache is missing and `url` is empty or
/// `no_network` is set, when the download or a cache file operation fails, or
/// when processing the archive fails.
fn import_datacite_annual(url: &str, sample: bool, no_network: bool) -> Result<(), String> {
    use std::io::BufReader;
    use tar::Archive;

    const DATACITE_SAMPLE_LINES: usize = 1_000;

    /// Send the GET request for the annual TAR and return the open response.
    fn start_download(url: &str) -> Result<reqwest::blocking::Response, String> {
        let client = reqwest::blocking::Client::builder()
            .user_agent(io_utils::commonmeta_user_agent())
            .timeout(std::time::Duration::from_secs(24 * 60 * 60))
            .build()
            .map_err(|e| format!("HTTP client error: {}", e))?;
        client
            .get(url)
            .send()
            .and_then(|r| r.error_for_status())
            .map_err(|e| format!("download failed: {}", e))
    }

    let limit = if sample { DATACITE_SAMPLE_LINES } else { usize::MAX };
    let cache_path = io_utils::cache_dir("datacite").join("public-2025.tar");
    let cached = cache_path.exists() && cache_path.metadata().map(|m| m.len()).unwrap_or(0) > 0;

    if cached {
        eprintln!("import: using cached DataCite 2025 TAR at {}", cache_path.display());
    } else {
        if url.is_empty() {
            return Err(format!(
                "import: no cached DataCite 2025 TAR at {}; \
                 obtain a download URL from https://datafiles.datacite.org/datafiles/public-2025",
                cache_path.display()
            ));
        }
        if no_network {
            return Err(if sample {
                "--no-network: cannot stream DataCite annual data file".to_string()
            } else {
                format!(
                    "--no-network: cached TAR not found at {}; provide a download URL to cache it first",
                    cache_path.display()
                )
            });
        }

        // --sample without cache: stream directly from URL without downloading 33 GB to disk.
        if sample {
            eprintln!("import: streaming DataCite 2025 (first {} records, no cache)", limit);
            let resp = start_download(url)?;
            return process_datacite_archive(Archive::new(BufReader::new(resp)), limit);
        }

        // Full import: ensure the TAR is on disk before processing it.
        eprintln!("import: downloading DataCite 2025 annual data file to {}", cache_path.display());
        let mut resp = start_download(url)?;
        if let Some(parent) = cache_path.parent() {
            std::fs::create_dir_all(parent).map_err(|e| format!("mkdir: {}", e))?;
        }

        // Write to a sibling `.part` file so a failed or interrupted download
        // never leaves a truncated TAR at the cache path.
        let partial_path = cache_path.with_extension("tar.part");
        let mut file = std::fs::File::create(&partial_path)
            .map_err(|e| format!("create cache file: {}", e))?;
        let bytes = match std::io::copy(&mut resp, &mut file) {
            Ok(n) => n,
            Err(e) => {
                drop(file);
                // Best-effort cleanup; the original error is what matters.
                std::fs::remove_file(&partial_path).ok();
                return Err(format!("download write: {}", e));
            }
        };
        drop(file);
        std::fs::rename(&partial_path, &cache_path)
            .map_err(|e| format!("finalize cache file: {}", e))?;
        eprintln!("import: cached {} GB at {}", bytes / 1_073_741_824, cache_path.display());
    }

    eprintln!(
        "import: processing DataCite 2025 annual data file{}",
        if sample { format!(" (first {} records)", limit) } else { String::new() },
    );
    let file = std::fs::File::open(&cache_path)
        .map_err(|e| format!("open cache: {}", e))?;
    // Buffer the file so the many small TAR header reads do not each hit the OS.
    process_datacite_archive(Archive::new(BufReader::new(file)), limit)
}

/// Read the `.jsonl.gz` members of a DataCite annual TAR and upsert their
/// records into the commonmeta database resolved by `resolve_db_path(None)`.
///
/// Members with any other suffix, members that fail to decompress, and members
/// that decompress to nothing are skipped.  Lines that cannot be parsed as
/// DataCite records are dropped.  Processing stops once `limit` records have
/// been imported; progress goes to stderr and the final summary to stdout.
///
/// # Errors
///
/// Returns a message when the TAR cannot be read or an upsert fails.
fn process_datacite_archive<R: std::io::Read>(mut archive: tar::Archive<R>, limit: usize) -> Result<(), String> {
    use flate2::read::GzDecoder;
    use std::io::{BufRead, BufReader, Read as _, Cursor};
    use rayon::prelude::*;

    let db_path = resolve_db_path(None);
    let db = Path::new(&db_path);
    let started = Instant::now();
    let mut imported = 0usize;
    let mut files_seen = 0usize;

    let members = archive.entries().map_err(|e| format!("TAR read: {}", e))?;
    for next in members {
        let mut member = next.map_err(|e| format!("TAR entry: {}", e))?;
        let name = match member.path() {
            Ok(path) => path.to_string_lossy().into_owned(),
            Err(_) => String::new(),
        };
        if !name.ends_with(".jsonl.gz") {
            continue;
        }

        let mut gz_bytes: Vec<u8> = Vec::new();
        member
            .read_to_end(&mut gz_bytes)
            .map_err(|e| format!("read entry {}: {}", name, e))?;

        let mut plain: Vec<u8> = Vec::new();
        match GzDecoder::new(gz_bytes.as_slice()).read_to_end(&mut plain) {
            Ok(_) => {}
            Err(e) => {
                eprintln!("import: decompress error in {}: {}", name, e);
                continue;
            }
        }
        if plain.is_empty() {
            continue;
        }
        files_seen += 1;

        // Never read more lines than are still needed to reach `limit`.
        let remaining = limit.saturating_sub(imported);
        let records: Vec<String> = BufReader::new(Cursor::new(plain))
            .lines()
            .filter_map(Result::ok)
            .filter_map(|line| {
                let trimmed = line.trim();
                if trimmed.is_empty() {
                    None
                } else {
                    Some(trimmed.to_string())
                }
            })
            .take(remaining)
            .collect();
        if records.is_empty() {
            continue;
        }

        // Each line is wrapped in {"data": …} before being handed to the
        // DataCite reader.
        let parsed: Vec<commonmeta::Data> = records
            .par_iter()
            .filter_map(|record| {
                let wrapped = format!("{{\"data\":{}}}", record);
                commonmeta::read("datacite", &wrapped).ok()
            })
            .collect();

        let count = parsed.len();
        if count > 0 {
            commonmeta::upsert_sqlite(&parsed, db)
                .map_err(|e| format!("upsert failed after {}: {}", name, e))?;
            imported += count;
        }
        eprintln!("import: {} — {} records ({} total in {:.0?})", name, count, imported, started.elapsed());

        if imported >= limit {
            break;
        }
    }

    let db_total = commonmeta::count_sqlite_works(db).ok();
    eprintln!("import: {} files, {} records in {:.2?}", files_seen, imported, started.elapsed());
    println!("{}", fmt_wrote_sqlite(&db_path, imported, db_total));
    Ok(())
}

/// Name of the Crossref requester-pays S3 bucket the annual TAR is copied from.
const CROSSREF_S3_BUCKET: &str = "api-snapshots-reqpays-crossref";
/// Default object key of the annual TAR inside [`CROSSREF_S3_BUCKET`]; the
/// `CROSSREF_S3_KEY` environment variable overrides it at download time.
const CROSSREF_S3_KEY: &str = "March_2026_Public_Data_File_from_Crossref.tar";

/// Download the Crossref annual public data file from the S3 requester-pays bucket
/// and import it. Requires the `aws` CLI configured with valid AWS credentials;
/// the requester pays bandwidth costs (~$18 for the ~223 GB 2026 file).
///
/// The TAR is cached as `crossref-annual-s3.tar` in the `crossref` cache directory.
/// The object key defaults to `CROSSREF_S3_KEY` and can be overridden through the
/// environment variable of the same name.
///
/// * `sample` — when true and the cache exists, processing stops after 1 000
///   records; when true and no cache exists, an error is returned — use the torrent
///   path (`commonmeta import --from crossref --sample`) for quick smoke-tests.
/// * `no_network` — refuse to download; only an existing cache can be used.
///
/// # Errors
///
/// Returns a message when the cache is missing and cannot be downloaded, when
/// the `aws` CLI cannot be started or exits unsuccessfully, or when opening or
/// processing the cached TAR fails.
fn import_crossref_s3(sample: bool, no_network: bool) -> Result<(), String> {
    const CROSSREF_S3_SAMPLE_LINES: usize = 1_000;
    let limit = if sample { CROSSREF_S3_SAMPLE_LINES } else { usize::MAX };
    let cache_path = io_utils::cache_dir("crossref").join("crossref-annual-s3.tar");
    let cached = cache_path.exists() && cache_path.metadata().map(|m| m.len()).unwrap_or(0) > 0;

    if sample && !cached {
        return Err(format!(
            "import: --s3 --sample requires the cached TAR at {}.\n\
             Run without --sample first to download the full archive, then use --sample \
             to test against the cached data.\n\
             For a quick smoke-test without downloading, use:\n\
             commonmeta import --from crossref --sample   (torrent, first 5 files)",
            cache_path.display()
        ));
    }

    if cached {
        eprintln!("import: using cached Crossref annual TAR at {}", cache_path.display());
    } else {
        if no_network {
            return Err(format!(
                "--no-network: no cached Crossref S3 TAR at {}",
                cache_path.display()
            ));
        }

        // Verify aws CLI is available (only a failure to start the process is
        // treated as "not found").
        if std::process::Command::new("aws").arg("--version").output().is_err() {
            return Err(
                "import: aws CLI not found — install it from https://aws.amazon.com/cli/ \
                and configure credentials before using --s3"
                    .to_string(),
            );
        }

        let key = std::env::var("CROSSREF_S3_KEY")
            .unwrap_or_else(|_| CROSSREF_S3_KEY.to_string());

        if let Some(parent) = cache_path.parent() {
            std::fs::create_dir_all(parent).map_err(|e| format!("mkdir: {}", e))?;
        }
        eprintln!(
            "import: downloading s3://{}/{} (~223 GB) to {}\n\
             Note: this is a requester-pays bucket; bandwidth costs (~$18) are charged to your AWS account.",
            CROSSREF_S3_BUCKET, key, cache_path.display()
        );
        let source = format!("s3://{}/{}", CROSSREF_S3_BUCKET, key);
        let status = std::process::Command::new("aws")
            .args(["s3", "cp", "--request-payer", "requester"])
            .arg(&source)
            // Pass the destination as an OS string so a non-UTF-8 cache path
            // reaches `aws` unchanged instead of being lossily converted.
            .arg(&cache_path)
            .status()
            .map_err(|e| format!("aws s3 cp failed: {}", e))?;
        if !status.success() {
            return Err(
                "aws s3 cp failed — check AWS credentials, IAM permissions, \
                and that you accept requester-pays charges for this bucket"
                    .to_string(),
            );
        }
        eprintln!("import: cached Crossref annual TAR at {}", cache_path.display());
    }

    eprintln!(
        "import: processing Crossref annual data file{}",
        if sample { format!(" (first {} records)", limit) } else { String::new() },
    );
    let file = std::fs::File::open(&cache_path)
        .map_err(|e| format!("open cache: {}", e))?;
    // Buffer the file so the many small TAR header reads do not each hit the OS.
    process_crossref_s3_archive(tar::Archive::new(std::io::BufReader::new(file)), limit)
}

/// Read the JSON-lines members of a Crossref annual TAR and upsert their
/// records into the commonmeta database resolved by `resolve_db_path(None)`.
///
/// Members ending in `.jsonl.gz` / `.json.gz` are gunzipped first; members
/// ending in `.jsonl` / `.json` are read as-is; everything else is skipped, as
/// are empty members and members that fail to decompress.  Lines that cannot
/// be parsed as Crossref records are dropped.  Processing stops once `limit`
/// records have been imported; progress goes to stderr and the final summary
/// to stdout.
///
/// # Errors
///
/// Returns a message when the TAR cannot be read or an upsert fails.
fn process_crossref_s3_archive<R: std::io::Read>(mut archive: tar::Archive<R>, limit: usize) -> Result<(), String> {
    use flate2::read::GzDecoder;
    use std::io::{BufRead, BufReader, Read as _, Cursor};
    use rayon::prelude::*;

    let db_path = resolve_db_path(None);
    let db = Path::new(&db_path);
    let started = Instant::now();
    let mut imported = 0usize;
    let mut files_seen = 0usize;

    let members = archive.entries().map_err(|e| format!("TAR read: {}", e))?;
    for next in members {
        let mut member = next.map_err(|e| format!("TAR entry: {}", e))?;
        let name = match member.path() {
            Ok(path) => path.to_string_lossy().into_owned(),
            Err(_) => String::new(),
        };

        // Support both compressed (.jsonl.gz) and plain (.jsonl / .json) entries.
        let gzipped = name.ends_with(".jsonl.gz") || name.ends_with(".json.gz");
        let plain_json = name.ends_with(".jsonl") || name.ends_with(".json");
        if !(gzipped || plain_json) {
            continue;
        }

        let mut payload: Vec<u8> = Vec::new();
        member
            .read_to_end(&mut payload)
            .map_err(|e| format!("read entry {}: {}", name, e))?;
        if payload.is_empty() {
            continue;
        }

        let text_bytes: Vec<u8> = if gzipped {
            let mut inflated = Vec::new();
            match GzDecoder::new(payload.as_slice()).read_to_end(&mut inflated) {
                Ok(_) => inflated,
                Err(e) => {
                    eprintln!("import: decompress error in {}: {}", name, e);
                    continue;
                }
            }
        } else {
            payload
        };
        if text_bytes.is_empty() {
            continue;
        }
        files_seen += 1;

        // Never read more lines than are still needed to reach `limit`.
        let remaining = limit.saturating_sub(imported);
        let records: Vec<String> = BufReader::new(Cursor::new(text_bytes))
            .lines()
            .filter_map(Result::ok)
            .filter_map(|line| {
                let trimmed = line.trim();
                if trimmed.is_empty() {
                    None
                } else {
                    Some(trimmed.to_string())
                }
            })
            .take(remaining)
            .collect();
        if records.is_empty() {
            continue;
        }

        let parsed: Vec<commonmeta::Data> = records
            .par_iter()
            .filter_map(|record| commonmeta::read("crossref", record).ok())
            .collect();

        let count = parsed.len();
        if count > 0 {
            commonmeta::upsert_sqlite(&parsed, db)
                .map_err(|e| format!("upsert failed after {}: {}", name, e))?;
            imported += count;
        }
        eprintln!("import: {} — {} records ({} total in {:.0?})", name, count, imported, started.elapsed());

        if imported >= limit {
            break;
        }
    }

    let db_total = commonmeta::count_sqlite_works(db).ok();
    eprintln!("import: {} files, {} records in {:.2?}", files_seen, imported, started.elapsed());
    println!("{}", fmt_wrote_sqlite(&db_path, imported, db_total));
    Ok(())
}

/// Import an annual public data file that is distributed as a torrent.
///
/// Resolves the `.torrent` metafile for `from` (fetching and caching it under
/// the "torrents" cache namespace when no non-empty copy is cached yet) and
/// then delegates the download and import to [`import_torrent`].
///
/// * `from` — data source; only `"crossref"` is supported at the moment.
/// * `sample` — when `true`, only the first `SAMPLE_FILE_COUNT` files of the
///   torrent are selected.
/// * `no_network` — forwarded to [`import_torrent`]; it also forbids fetching
///   the metafile, so a missing cache entry is reported as an error.
///
/// # Errors
///
/// Returns an error when `from` is unsupported, when the metafile is not
/// cached and `no_network` is set, when fetching or caching the metafile
/// fails, or when [`import_torrent`] fails.
fn import_annual_torrent(from: &str, sample: bool, no_network: bool) -> Result<(), String> {
    let torrent_url = match from {
        "crossref" => CROSSREF_ANNUAL_TORRENT_URL,
        other => return Err(format!(
            "import: annual torrent download not yet supported for '{}'", other
        )),
    };

    eprintln!(
        "import: downloading {} Crossref March 2026 via Academic Torrents (~223 GB{})",
        if sample { "sample of" } else { "full" },
        if sample { format!(", first {} files", SAMPLE_FILE_COUNT) } else { String::new() },
    );

    // Cache the .torrent metafile under a dedicated "torrents" namespace.
    // "vraix" is for VRAIX dump files; torrent metafiles are a different provenance.
    // Use a descriptive per-source name rather than the raw infohash from the URL.
    let cache_root = io_utils::cache_dir("torrents");
    let torrent_cache_path = cache_root.join(format!("{}-annual.torrent", from));

    // A missing file and a zero-byte file both count as "not cached".
    let cached_size = std::fs::metadata(&torrent_cache_path).map(|m| m.len()).unwrap_or(0);
    if cached_size > 0 {
        eprintln!("import: torrent metafile cached at {}", torrent_cache_path.display());
    } else {
        // Drop a zero-byte leftover from an earlier failed attempt (best effort).
        std::fs::remove_file(&torrent_cache_path).ok();

        // Honour --no-network: never reach out to Academic Torrents when the
        // caller asked for an offline run.
        if no_network {
            return Err(format!(
                "import: --no-network was given but the torrent metafile is not cached at {}",
                torrent_cache_path.display()
            ));
        }

        eprintln!("import: fetching .torrent metafile from Academic Torrents…");
        let client = reqwest::blocking::Client::builder()
            .user_agent(io_utils::commonmeta_user_agent())
            .timeout(std::time::Duration::from_secs(30))
            .build()
            .map_err(|e| format!("failed to build HTTP client: {}", e))?;
        let bytes = client
            .get(torrent_url)
            .send()
            .and_then(|r| r.error_for_status())
            .and_then(|r| r.bytes())
            .map_err(|e| format!("failed to download torrent metafile: {}", e))?;
        if bytes.is_empty() {
            return Err("torrent metafile download returned 0 bytes — Academic Torrents may require authentication".to_string());
        }
        if let Some(parent) = torrent_cache_path.parent() {
            std::fs::create_dir_all(parent)
                .map_err(|e| format!("failed to create cache dir: {}", e))?;
        }
        std::fs::write(&torrent_cache_path, &bytes)
            .map_err(|e| format!("failed to cache torrent metafile: {}", e))?;
        eprintln!("import: cached .torrent ({} bytes) at {}", bytes.len(), torrent_cache_path.display());
    }

    // Place downloaded data alongside the metafile in the torrents cache dir.
    let output_dir = cache_root.join(format!("{}-annual", from));
    let select_files = if sample { Some(SAMPLE_FILE_COUNT) } else { None };
    import_torrent(&torrent_cache_path, from, no_network, Some(&output_dir), select_files)
}

/// Download a torrent's files via aria2c and import every downloaded `.gz`
/// file as gzip-compressed JSON Lines.
///
/// * `torrent_path` — path to the `.torrent` metafile.
/// * `from` — must be "crossref" or "datacite" (determines which reader is used).
/// * `no_network` — forwarded to `io_utils::download_torrent`.
/// * `output_dir_override` — download directory; when `None`, a directory
///   named after the torrent is used next to the `.torrent` file.
/// * `select_files` — limits the download to the first N files (for --sample).
///
/// Each file is decompressed as a stream, so neither the compressed nor the
/// decompressed file has to be held in memory as a whole. Every non-empty
/// line is wrapped in a `{"message": …}` envelope and converted to commonmeta.
/// Lines that cannot be converted, or whose converted output cannot be
/// deserialized, are reported on stderr and skipped.
///
/// # Errors
///
/// Returns an error when `from` is unsupported, when the torrent cannot be
/// parsed or downloaded, when a file cannot be opened or decompressed, when a
/// line is not valid JSON, or when the SQLite upsert fails.
fn import_torrent(torrent_path: &Path, from: &str, no_network: bool, output_dir_override: Option<&Path>, select_files: Option<usize>) -> Result<(), String> {
    use flate2::read::GzDecoder;
    use std::io::{BufRead as _, BufReader};

    let source = match from {
        "crossref" | "datacite" => from,
        _ => return Err(format!(
            "import: .torrent import requires --from crossref or --from datacite (got '{}')",
            from
        )),
    };

    // Parse the torrent to show a summary before downloading.
    let info = io_utils::parse_torrent(torrent_path)
        .map_err(|e| format!("failed to parse torrent: {}", e))?;
    let jsonl_gz_count = info.files.iter().filter(|f| f.path.ends_with(".jsonl.gz")).count();
    eprintln!(
        "import: torrent '{}' — {} file(s), {} .jsonl.gz",
        info.name, info.files.len(), jsonl_gz_count,
    );

    // Download all files — use override dir when called from import_annual_torrent,
    // otherwise place next to the .torrent file.
    let default_dir = torrent_path
        .parent()
        .unwrap_or_else(|| Path::new("."))
        .join(&info.name);
    let output_dir = output_dir_override.unwrap_or(&default_dir);
    let downloaded = io_utils::download_torrent(torrent_path, output_dir, no_network, select_files)
        .map_err(|e| format!("torrent download failed: {}", e))?;

    let out_path = resolve_db_path(None);
    let out_sqlite = Path::new(&out_path);
    let mut total_records = 0usize;
    let start = Instant::now();

    for file_path in &downloaded {
        // Only gzip-compressed files are imported; everything else is ignored.
        if !file_path.extension().map(|e| e == "gz").unwrap_or(false) {
            continue;
        }

        // Stream the file through the gzip decoder line by line.
        let file = std::fs::File::open(file_path)
            .map_err(|e| format!("failed to read '{}': {}", file_path.display(), e))?;
        let reader = BufReader::new(GzDecoder::new(file));

        let mut records = Vec::new();
        for (lineno, line) in reader.lines().enumerate() {
            let line = line
                .map_err(|e| format!("failed to decompress '{}': {}", file_path.display(), e))?;
            let trimmed = line.trim();
            if trimmed.is_empty() { continue; }
            let value: serde_json::Value = serde_json::from_str(trimmed)
                .map_err(|e| format!("{}: line {}: {}", file_path.display(), lineno + 1, e))?;
            let envelope = serde_json::json!({ "message": value });
            let input = serde_json::to_string(&envelope).map_err(|e| e.to_string())?;
            match commonmeta::convert(source, "commonmeta", &input) {
                Ok(bytes) => match serde_json::from_slice::<commonmeta::Data>(&bytes) {
                    Ok(data) => records.push(data),
                    // Previously dropped without a trace; make the loss visible.
                    Err(e) => eprintln!(
                        "import: skipping line {} (converted record could not be deserialized): {}",
                        lineno + 1,
                        e
                    ),
                },
                Err(e) => eprintln!("import: skipping line {}: {}", lineno + 1, e),
            }
        }

        if !records.is_empty() {
            commonmeta::upsert_sqlite(&records, out_sqlite)
                .map_err(|e| format!("upsert failed for '{}': {}", file_path.display(), e))?;
            total_records += records.len();
            eprintln!("import: {} → {} records", file_path.display(), records.len());
        }
    }

    let db_total = commonmeta::count_sqlite_works(out_sqlite).ok();
    eprintln!("import: {} total records in {:.2?}", total_records, start.elapsed());
    println!("{}", fmt_wrote_sqlite(&out_path, total_records, db_total));
    Ok(())
}

/// Upsert the records of a commonmeta-format SQLite file into the local database.
///
/// Reads `src_path` with `commonmeta::read_sqlite_commonmeta`, upserts the
/// result into `out_path`, prints timings to stderr and a summary line to
/// stdout.
///
/// # Errors
///
/// Returns an error when the source file cannot be read or the upsert fails.
fn import_commonmeta_sqlite(src_path: &str, out_path: &str) -> Result<(), String> {
    let overall = Instant::now();
    let (source_db, target_db) = (Path::new(src_path), Path::new(out_path));

    // Phase 1: load the records from the source database.
    let read_timer = Instant::now();
    let records = match commonmeta::read_sqlite_commonmeta(source_db, None, 0) {
        Ok(records) => records,
        Err(e) => return Err(format!("failed to read '{}': {}", src_path, e)),
    };
    eprintln!("import: read {} records in {:.2?}", records.len(), read_timer.elapsed());

    // Phase 2: upsert into the target database and count what it now holds.
    let write_timer = Instant::now();
    if let Err(e) = commonmeta::upsert_sqlite(&records, target_db) {
        return Err(e.to_string());
    }
    let db_total = commonmeta::count_sqlite_works(target_db).ok();
    eprintln!("import: upsert took {:.2?}", write_timer.elapsed());
    eprintln!("import: total {:.2?}", overall.elapsed());

    println!("{}", fmt_wrote_sqlite(out_path, records.len(), db_total));
    Ok(())
}

/// Read one record identified by `identifier` (DOI, URL, or other identifier)
/// using the `from` reader and upsert it into the database at `out_path`.
///
/// Timings go to stderr; the summary line goes to stdout.
///
/// # Errors
///
/// Returns the stringified error when reading the record or the upsert fails.
fn import_single(identifier: &str, from: &str, out_path: &str) -> Result<(), String> {
    let fetch_timer = Instant::now();
    let record = match commonmeta::read(from, identifier) {
        Ok(record) => record,
        Err(e) => return Err(e.to_string()),
    };
    eprintln!("import: fetch took {:.2?}", fetch_timer.elapsed());

    let db = Path::new(out_path);
    let write_timer = Instant::now();
    // `upsert_sqlite` takes a slice, so borrow the single record as one.
    if let Err(e) = commonmeta::upsert_sqlite(std::slice::from_ref(&record), db) {
        return Err(e.to_string());
    }
    let db_total = commonmeta::count_sqlite_works(db).ok();
    eprintln!("import: upsert took {:.2?}", write_timer.elapsed());

    println!("{}", fmt_wrote_sqlite(out_path, 1, db_total));
    Ok(())
}

/// Stream a VRAIX SQLite dump (local file or downloaded daily dump) directly
/// into the commonmeta database with upsert semantics. Always imports all rows.
///
/// * `from` — source name; used in the dump URL and cache key, and passed to
///   `commonmeta::stream_vraix_to_sqlite`.
/// * `input_path` — local `.sqlite3` dump; takes precedence over `date`.
/// * `date` — date of the daily dump to download when no `input_path` is given.
/// * `out_path` — target commonmeta SQLite database.
///
/// When the dump is downloaded, it is decompressed into a temporary file next
/// to `out_path`; that file is removed again whether the import succeeds or
/// fails.
///
/// # Errors
///
/// Returns an error when neither `input_path` nor `date` is given, when the
/// download or decompression fails, or when streaming into the database fails.
fn import_vraix_fast(
    from: &str,
    input_path: Option<&str>,
    date: Option<&str>,
    out_path: &str,
) -> Result<(), String> {
    let total_start = Instant::now();
    let out_sqlite = std::path::PathBuf::from(out_path);

    // Resolve the VRAIX input to a local .sqlite3 path, downloading and
    // decompressing on demand when only --date was given.
    let (in_sqlite, tmp_to_clean) = match (input_path, date) {
        (Some(path), _) => (std::path::PathBuf::from(path), None),
        (None, Some(date)) => {
            let url = format!("https://metadata.vraix.org/{}-{}.sqlite3.zst", from, date);
            let cache_key = format!("{}-{}.sqlite3.zst", from, date);
            let dl_start = Instant::now();
            let (cache_path, from_cache) =
                io_utils::ensure_cached_path(&url, "vraix", &cache_key, VRAIX_CACHE_TTL)
                    .map_err(|e| format!("failed to download '{}': {}", url, e))?;
            let size = cache_path.metadata().map(|m| m.len()).unwrap_or(0);
            eprintln!(
                "import: download took {:.2?} ({} bytes{})",
                dl_start.elapsed(),
                size,
                if from_cache { ", from cache" } else { "" }
            );
            let dc_start = Instant::now();
            // The process id keeps concurrent imports from sharing a temp file.
            let tmp = out_sqlite.with_extension(format!("sqlite3.vraix-{}.tmp", std::process::id()));
            let dc_bytes = match io_utils::decompress_zst_file(&cache_path, &tmp) {
                Ok(bytes) => bytes,
                Err(e) => {
                    // Do not leave a partially written temp file behind.
                    std::fs::remove_file(&tmp).ok();
                    return Err(format!("failed to decompress '{}': {}", url, e));
                }
            };
            eprintln!(
                "import: decompress took {:.2?} ({} bytes)",
                dc_start.elapsed(),
                dc_bytes
            );
            (tmp.clone(), Some(tmp))
        }
        // Previously a panic via `unwrap()`; report it as a regular error.
        (None, None) => {
            return Err(
                "import: VRAIX import requires either an input .sqlite3 path or --date".to_string(),
            );
        }
    };

    let convert_start = Instant::now();
    let result = commonmeta::stream_vraix_to_sqlite(&in_sqlite, from, &out_sqlite, 0, true)
        .map_err(|e| e.to_string());
    // Clean up before propagating the result so a failed import does not leak
    // the temporary file either.
    if let Some(tmp) = tmp_to_clean {
        std::fs::remove_file(&tmp).ok();
    }
    let n = result?;
    let total = commonmeta::count_sqlite_works(&out_sqlite).ok();
    eprintln!(
        "import: convert+write took {:.2?} ({} records)",
        convert_start.elapsed(),
        n
    );
    eprintln!("import: total took {:.2?}", total_start.elapsed());
    println!("{}", fmt_wrote_sqlite(out_path, n, total));
    Ok(())
}

/// Install the latest ROR release into the SQLite database at `out_path`.
///
/// Fetches the release metadata, returns early when the database already
/// records the same version, and otherwise downloads (or loads from cache)
/// the release and writes it with `commonmeta::write_ror_sqlite`. Progress
/// and timings go to stderr; the final status line goes to stdout.
///
/// # Errors
///
/// Returns the stringified error of whichever step fails: fetching the
/// release metadata, reading the installed version, downloading the release,
/// or writing the database.
pub(crate) fn install_ror(out_path: &str) -> Result<(), String> {
    let overall = Instant::now();

    eprintln!("Fetching latest ROR release metadata from Zenodo...");
    let step = Instant::now();
    let release = commonmeta::fetch_latest_ror_release().map_err(|e| e.to_string())?;
    eprintln!("  metadata fetched in {:.2}s", step.elapsed().as_secs_f64());

    let db_path = Path::new(out_path);
    let installed_version =
        commonmeta::fetch_installed_ror_version(db_path).map_err(|e| e.to_string())?;
    if let Some(ref installed) = installed_version {
        // Nothing to do when the database already holds this release.
        if installed == &release.version {
            println!(
                "ROR {} ({}) is already installed at {}",
                release.version, release.date, out_path
            );
            return Ok(());
        }
        eprintln!("Upgrading ROR {} → {}...", installed, release.version);
    }

    let step = Instant::now();
    let (organizations, from_cache) =
        commonmeta::download_ror_release(&release).map_err(|e| e.to_string())?;
    let verb = if from_cache { "loaded" } else { "downloaded" };
    eprintln!(
        "  {} and parsed {} organizations in {:.2}s",
        verb,
        organizations.len(),
        step.elapsed().as_secs_f64()
    );

    eprintln!("Writing to {}...", out_path);
    let step = Instant::now();
    commonmeta::write_ror_sqlite(
        &organizations,
        db_path,
        Some(&release.version),
        Some(&release.date),
    )
    .map_err(|e| e.to_string())?;
    eprintln!("  SQLite written in {:.2}s", step.elapsed().as_secs_f64());
    eprintln!("  total: {:.2}s", overall.elapsed().as_secs_f64());

    println!(
        "Installed ROR {} ({}) → {} ({} organizations)",
        release.version,
        release.date,
        out_path,
        organizations.len(),
    );
    Ok(())
}

/// Download the pidbox dump (or reuse the cached copy) and convert it into
/// the SQLite database at `out_path`.
///
/// Progress and timings go to stderr; the final status line, including the
/// installed VRAIX date when one can be read back, goes to stdout.
///
/// # Errors
///
/// Returns an error when the download or the conversion fails.
pub(crate) fn install_pidbox(out_path: &str) -> Result<(), String> {
    let overall = Instant::now();

    eprintln!("Downloading pidbox from {}...", PIDBOX_URL);
    let dl_timer = Instant::now();
    let (cache_path, from_cache) =
        match io_utils::ensure_cached_path(PIDBOX_URL, "vraix", PIDBOX_CACHE_KEY, VRAIX_CACHE_TTL) {
            Ok(cached) => cached,
            Err(e) => return Err(format!("failed to download pidbox: {}", e)),
        };
    if !from_cache {
        eprintln!("  downloaded in {:.2}s", dl_timer.elapsed().as_secs_f64());
    } else {
        eprintln!("  pidbox download skipped (cached at {})", cache_path.display());
    }

    // The pidbox SQLite database is not VACUUM'd, so overflow pages for large
    // records appear in reverse page-number order.  stream_zst_pidbox_to_sqlite
    // uses a sliding window buffer (default 32 GiB RAM + 500 GiB disk) to
    // resolve backward chain links without extra full-file scans.
    // Tune with COMMONMETA_SCAN_WINDOW_GIB and COMMONMETA_SCAN_DISK_GIB.
    let db = Path::new(out_path);
    eprintln!("Converting (streaming decompress + convert) → {}…", out_path);
    let convert_timer = Instant::now();
    let n = match commonmeta::stream_zst_pidbox_to_sqlite(&cache_path, db, 0) {
        Ok(count) => count,
        Err(e) => return Err(format!("failed to convert pidbox: {}", e)),
    };
    eprintln!(
        "  converted and wrote {} records in {:.0}s",
        n,
        convert_timer.elapsed().as_secs_f64()
    );
    eprintln!("  total: {:.0}s", overall.elapsed().as_secs_f64());

    // The date is optional decoration: any failure to read it yields an empty suffix.
    let date = match commonmeta::fetch_installed_vraix_date(db) {
        Ok(Some(d)) => format!(", vraix_date: {d}"),
        _ => String::new(),
    };
    println!("Installed pidbox → {} ({} records{})", out_path, n, date);
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `args` with the `import` command definition, panicking when
    /// clap rejects them.
    fn matches_for(args: &[&str]) -> clap::ArgMatches {
        command().try_get_matches_from(args).expect("arg parse failed")
    }

    /// A bare DOI needs the network, so `--no-network` must be named in the error.
    #[test]
    fn test_no_network_with_doi_errors() {
        let matches = matches_for(&["import", "--no-network", "10.7554/elife.01567"]);
        let err = execute(&matches).unwrap_err();
        assert!(
            err.contains("--no-network"),
            "expected --no-network in error, got: {err}"
        );
    }

    /// An API-backed list fetch (`--from crossref --ror …`) must also be
    /// rejected with an error naming `--no-network`.
    #[test]
    fn test_no_network_with_api_fetch_errors() {
        let argv = ["import", "--no-network", "--from", "crossref", "--ror", "00pd74e08"];
        let matches = matches_for(&argv);
        let err = execute(&matches).unwrap_err();
        assert!(
            err.contains("--no-network"),
            "expected --no-network in error, got: {err}"
        );
    }

    /// A local `.sqlite3` input must not be stopped by the network guard.
    #[test]
    fn test_no_network_with_local_sqlite_passes_guard() {
        // Use a generic .sqlite3 name (not the crossref-/datacite- VRAIX pattern)
        // so the guard passes and the command fails at the "from commonmeta requires
        // a .sqlite3 path" check rather than entering the slow streaming path.
        let matches = matches_for(&["import", "--no-network", "local.sqlite3"]);
        let err = execute(&matches).unwrap_err();
        let blocked_by_guard = err.contains("--no-network");
        assert!(
            !blocked_by_guard,
            "should not fail at network guard for local sqlite, got: {err}"
        );
    }
}