commonmeta 0.9.4

#![allow(dead_code)]

use std::io::Read;
use std::path::Path;
use serde::{Deserialize, Serialize};

use crate::data::{Data, Identifier, PersonUrl};
use crate::error::{Error, Result};

// ── Figshare release ──────────────────────────────────────────────────────────

/// Numeric figshare article ID used for ORCID release lookups.
/// NOTE(review): presumably the figshare article that hosts the ORCID Public
/// Data File — confirm against the code that queries figshare.
const ORCID_FIGSHARE_ARTICLE_ID: u64 = 30375589;

/// Metadata about an ORCID Public Data File release on figshare.
///
/// The release is identified by year and batch number (the second numeric
/// component in the filename, e.g. `ORCID_2025_10_summaries.tar.gz` → year=2025,
/// batch=10). Download the summaries archive (~46 GB) manually and pass its
/// local path to [`stream_summaries_to_sqlite`].
///
/// The struct derives `Serialize` / `Deserialize`, so it can be round-tripped
/// through any serde format using the field names below.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OrcidRelease {
    /// Snapshot year, e.g. 2025.
    pub year: u16,
    /// Batch number within the year, e.g. 10.
    pub batch: u16,
    /// Filename of the summaries archive, e.g. `ORCID_2025_10_summaries.tar.gz`.
    pub filename: String,
    /// Direct download URL for the summaries archive.
    pub download_url: String,
    /// Compressed size in bytes (~46 GB for the 2025 release).
    pub size_bytes: u64,
}

// ── XML deserialization structs ───────────────────────────────────────────────
// quick_xml 0.40 non-namespace-aware deserialization strips namespace prefixes
// from element names before matching struct fields. `<common:uri>` matches the
// field rename `"uri"`, not `"common:uri"`. All renames use local names only.

/// Root of an ORCID summary XML record, as deserialized by `parse_orcid_xml`.
///
/// `#[serde(default)]` lets every missing element fall back to its `Default`
/// value instead of failing deserialization.
#[derive(Deserialize, Default)]
#[serde(default)]
struct OrcidXml {
    /// `orcid-identifier` element: the record's URI and path.
    #[serde(rename = "orcid-identifier")]
    orcid_identifier: XmlOrcidId,
    /// `history` element (carries the last-modified timestamp).
    history: XmlHistory,
    /// `person` element: names, addresses and external identifiers.
    person: XmlPerson,
}

/// Contents of the `orcid-identifier` element.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlOrcidId {
    /// `uri` child; used as the record ID. An empty value makes
    /// `parse_orcid_xml` reject the record.
    uri: String,
    /// `path` child; stored in the `orcid` column of the `people` table.
    path: String,
}

/// Contents of the `history` element.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlHistory {
    /// `last-modified-date` child, kept as the raw string from the XML
    /// (parsed as RFC 3339 where an epoch value is needed).
    #[serde(rename = "last-modified-date")]
    last_modified_date: String,
}

/// Contents of the `person` element.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlPerson {
    /// `name` child: given, family and credit names.
    name: XmlPersonName,
    /// `other-names` child: wrapper around the list of alternative names.
    #[serde(rename = "other-names")]
    other_names: XmlOtherNames,
    /// `addresses` child: wrapper around the list of addresses.
    addresses: XmlAddresses,
    /// `external-identifiers` child: wrapper around person-level identifiers.
    #[serde(rename = "external-identifiers")]
    external_identifiers: XmlExternalIdentifiers,
}

/// Contents of the person's `name` element. Absent parts are empty strings.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlPersonName {
    /// `given-names` child.
    #[serde(rename = "given-names")]
    given_names: String,
    /// `family-name` child.
    #[serde(rename = "family-name")]
    family_name: String,
    /// `credit-name` child; takes precedence as the display name when set.
    #[serde(rename = "credit-name")]
    credit_name: String,
}

/// Wrapper for the `addresses` element and its repeated `address` children.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlAddresses {
    /// All `address` children in document order; only the first one is used
    /// when a single country is needed.
    #[serde(rename = "address")]
    addresses: Vec<XmlAddress>,
}

/// A single `address` element.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlAddress {
    /// `country` child, kept as the raw string from the XML.
    country: String,
}

/// Wrapper for the `external-identifiers` element and its repeated
/// `external-identifier` children.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlExternalIdentifiers {
    /// All `external-identifier` children in document order.
    #[serde(rename = "external-identifier")]
    identifiers: Vec<XmlExternalIdentifier>,
}

/// Person-level external identifier (ResearcherID, Scopus Author ID, etc.).
/// Distinct from work-level external IDs in `<activities:activities-summary>`.
///
/// Entries with an empty type or value are filtered out by the consumers that
/// build `Data` and `people` rows.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlExternalIdentifier {
    /// `external-id-type` child: the identifier scheme name as given by ORCID.
    #[serde(rename = "external-id-type")]
    type_: String,
    /// `external-id-value` child: the identifier itself.
    #[serde(rename = "external-id-value")]
    value: String,
}

/// Wrapper for the `other-names` element and its repeated `other-name` children.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlOtherNames {
    /// All `other-name` children in document order.
    #[serde(rename = "other-name")]
    names: Vec<XmlOtherName>,
}

/// A single `other-name` element.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlOtherName {
    /// `content` child: the alternative name text (empty when absent).
    content: String,
}

// ── Parser ────────────────────────────────────────────────────────────────────

/// Deserialize one ORCID summary XML document from raw bytes.
///
/// Yields `None` when:
/// - the bytes are not valid UTF-8,
/// - the text contains `<error:error` (locked / deprecated records use that
///   element as their root),
/// - deserialization fails, or
/// - the resulting record has an empty ORCID URI.
fn parse_orcid_xml(xml_bytes: &[u8]) -> Option<OrcidXml> {
    let text = match std::str::from_utf8(xml_bytes) {
        Ok(decoded) => decoded,
        Err(_) => return None,
    };
    // Locked / deprecated records are not person records; skip them up front.
    if text.contains("<error:error") {
        return None;
    }
    quick_xml::de::from_str::<OrcidXml>(text)
        .ok()
        .filter(|parsed| !parsed.orcid_identifier.uri.is_empty())
}

// ── Data conversion ───────────────────────────────────────────────────────────

fn display_name(record: &OrcidXml) -> String {
    let name = &record.person.name;
    if !name.credit_name.is_empty() {
        return name.credit_name.clone();
    }
    match (name.given_names.is_empty(), name.family_name.is_empty()) {
        (false, false) => format!("{} {}", name.given_names, name.family_name),
        (true, false) => name.family_name.clone(),
        (false, true) => name.given_names.clone(),
        (true, true) => String::new(),
    }
}

/// Convert a parsed ORCID record to a commonmeta `Data` struct.
fn from_orcid(record: OrcidXml) -> Data {
    let id = record.orcid_identifier.uri.clone();
    let name_obj = &record.person.name;
    let given_name = name_obj.given_names.clone();
    let family_name = name_obj.family_name.clone();
    let name = name_obj.credit_name.clone();
    let title = display_name(&record);

    let additional_names: Vec<String> = record.person.other_names.names
        .iter()
        .map(|n| n.content.clone())
        .filter(|s| !s.is_empty())
        .collect();

    let identifiers: Vec<Identifier> = record
        .person
        .external_identifiers
        .identifiers
        .iter()
        .filter(|e| !e.type_.is_empty() && !e.value.is_empty())
        .map(|e| {
            let (cm_type, _) = map_orcid_ext_id_type(&e.type_);
            Identifier {
                identifier: e.value.clone(),
                identifier_type: cm_type.to_string(),
                ..Default::default()
            }
        })
        .collect();

    let country = record.person.addresses.addresses.first()
        .map(|a| a.country.clone())
        .unwrap_or_default();

    Data {
        id,
        type_: "Person".to_string(),
        given_name,
        family_name,
        name,
        additional_names,
        title,
        identifiers,
        country,
        asserted_by: "Author".to_string(),
        date_updated: record.history.last_modified_date.clone(),
        provider: "ORCID".to_string(),
        ..Data::default()
    }
}

// ── SQLite schema ─────────────────────────────────────────────────────────────

/// Base schema batch: sets `synchronous=NORMAL`, then creates the `settings`
/// and `people` tables plus the `country` / `date_updated` indexes on `people`
/// when they do not exist yet. Every statement is idempotent.
const PEOPLE_DDL: &str = r#"PRAGMA synchronous=NORMAL;
CREATE TABLE IF NOT EXISTS settings (
    "key"   TEXT PRIMARY KEY NOT NULL,
    "value" TEXT NOT NULL DEFAULT ''
);
CREATE TABLE IF NOT EXISTS people (
    "id"                   TEXT PRIMARY KEY NOT NULL,
    "orcid"                TEXT NOT NULL DEFAULT '',
    "given_name"           TEXT NOT NULL DEFAULT '',
    "family_name"          TEXT NOT NULL DEFAULT '',
    "credit_name"          TEXT NOT NULL DEFAULT '',
    "name"                 TEXT NOT NULL DEFAULT '',
    "country"              TEXT NOT NULL DEFAULT '',
    "other_names"          TEXT NOT NULL DEFAULT '',
    "external_identifiers" TEXT NOT NULL DEFAULT '',
    "date_updated"         TEXT NOT NULL DEFAULT '',
    "metadata"             BLOB NOT NULL DEFAULT x''
);
CREATE INDEX IF NOT EXISTS people_country ON people("country");
CREATE INDEX IF NOT EXISTS people_date_updated ON people("date_updated");"#;

// Migrate databases created before other_names / external_identifiers were added.
// SQLite does not support ADD COLUMN IF NOT EXISTS, so each statement is run
// on its own and a failure (the column already exists) is ignored by the
// caller; ignoring errors is idiomatic here.
const PEOPLE_MIGRATE_DDL: &[&str] = &[
    "ALTER TABLE people ADD COLUMN other_names TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE people ADD COLUMN external_identifiers TEXT NOT NULL DEFAULT ''",
];

/// Creates the `people_fts` FTS5 virtual table over the `name` and
/// `other_names` columns, using `people` as its content table (keyed by
/// `rowid`) and the `unicode61` tokenizer with `remove_diacritics 1`.
///
/// The statement has no `IF NOT EXISTS`, so it fails when `people_fts` is
/// already present; callers drop the table first.
const PEOPLE_FTS5_DDL: &str = "CREATE VIRTUAL TABLE people_fts USING fts5(\
    name, other_names, \
    content=\"people\", \
    content_rowid=\"rowid\", \
    tokenize=\"unicode61 remove_diacritics 1\"\
)";

/// Upsert statement for one `people` row. Parameters `?1`..`?11` follow the
/// column order listed in the statement, which is also the field order of
/// `PeopleRow`. `INSERT OR REPLACE` overwrites an existing row with the same `id`.
const PEOPLE_INSERT: &str = r#"INSERT OR REPLACE INTO people (
    "id", "orcid", "given_name", "family_name", "credit_name", "name",
    "country", "other_names", "external_identifiers", "date_updated", "metadata"
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)"#;

/// One row of the `people` table, with fields in the same order as the
/// parameters of `PEOPLE_INSERT`.
struct PeopleRow {
    /// Primary key: the ORCID URI.
    id: String,
    /// ORCID path component (the bare identifier).
    orcid: String,
    /// Given names, empty when not set.
    given_name: String,
    /// Family name, empty when not set.
    family_name: String,
    /// Credit name, empty when not set.
    credit_name: String,
    /// Display name (credit name, else given + family name).
    name: String,
    /// Country of the first address, empty when there is none.
    country: String,
    /// Non-empty alternative names joined with a single space.
    other_names: String,
    /// JSON array of `{"type", "value"}` objects, serialized as text.
    external_identifiers: String,
    /// Last-modified timestamp as a string.
    date_updated: String,
    /// Source document (XML or JSON), normally zstd-compressed.
    metadata: Vec<u8>,
}

/// Flatten a parsed XML record into a `PeopleRow` ready for insertion.
///
/// `xml_bytes` is the original document; it is stored zstd-compressed in
/// `metadata` (level 0, i.e. the library default). If compression fails the raw
/// bytes are stored instead. Alternative names are space-joined and external
/// identifiers are serialized as a JSON array of `{"type", "value"}` objects;
/// empty names and identifiers lacking a type or value are left out.
fn serialize_to_people_row(record: &OrcidXml, xml_bytes: &[u8]) -> PeopleRow {
    let person = &record.person;

    let mut alias_parts: Vec<&str> = Vec::new();
    for alias in &person.other_names.names {
        if !alias.content.is_empty() {
            alias_parts.push(alias.content.as_str());
        }
    }

    let mut ext_id_json: Vec<serde_json::Value> = Vec::new();
    for ext in &person.external_identifiers.identifiers {
        if ext.type_.is_empty() || ext.value.is_empty() {
            continue;
        }
        ext_id_json.push(serde_json::json!({"type": ext.type_, "value": ext.value}));
    }

    let country = match person.addresses.addresses.first() {
        Some(first) => first.country.clone(),
        None => String::new(),
    };

    let metadata = match zstd::encode_all(xml_bytes, 0) {
        Ok(compressed) => compressed,
        Err(_) => xml_bytes.to_vec(),
    };

    PeopleRow {
        id: record.orcid_identifier.uri.clone(),
        orcid: record.orcid_identifier.path.clone(),
        given_name: person.name.given_names.clone(),
        family_name: person.name.family_name.clone(),
        credit_name: person.name.credit_name.clone(),
        name: display_name(record),
        country,
        other_names: alias_parts.join(" "),
        external_identifiers: serde_json::to_string(&ext_id_json).unwrap_or_default(),
        date_updated: record.history.last_modified_date.clone(),
        metadata,
    }
}

// ── Bulk import ───────────────────────────────────────────────────────────────

/// Maximum number of rows buffered in memory before they are written to SQLite
/// in a single transaction during bulk import.
const BATCH_SIZE: usize = 50_000;

/// Iterate XML entries from a gzip-compressed tar archive (any `Read` source)
/// and upsert them into an already-open, already-schema'd SQLite connection.
///
/// Entries whose path does not have an `xml` extension are ignored, and
/// documents rejected by `parse_orcid_xml` (e.g. `<error:error>` roots of
/// locked or deprecated ORCIDs) are skipped silently. Rows are written in
/// transactions of up to `BATCH_SIZE` records; existing rows with the same ID
/// are replaced. This function does not touch the FTS index.
///
/// `limit = 0` means no limit; any other value stops after that many valid records.
///
/// Archive entries that cannot be read are reported on stderr and skipped, so
/// a damaged or truncated archive is visible in the log instead of silently
/// producing a partial import.
///
/// Returns the total number of rows inserted or replaced.
///
/// # Errors
///
/// Fails when the tar entry iterator cannot be created or when a batch cannot
/// be written to the database.
fn drain_tar_into_conn<R: Read>(
    conn: &rusqlite::Connection,
    reader: R,
    limit: usize,
) -> Result<usize> {
    use flate2::read::GzDecoder;
    use tar::Archive;

    let mut archive = Archive::new(GzDecoder::new(reader));

    // Never reserve more than one batch, and less when the limit is smaller.
    let capacity = if limit > 0 { limit.min(BATCH_SIZE) } else { BATCH_SIZE };
    let mut batch: Vec<PeopleRow> = Vec::with_capacity(capacity);
    let mut total = 0usize;
    let entries = archive
        .entries()
        .map_err(|e| Error::Parse(format!("read tar entries: {}", e)))?;

    for entry in entries {
        let mut entry = match entry {
            Ok(e) => e,
            Err(e) => {
                eprintln!("orcid: skipping unreadable tar entry: {}", e);
                continue;
            }
        };

        let is_xml = entry
            .path()
            .ok()
            .and_then(|p| p.extension().map(|ext| ext == "xml"))
            .unwrap_or(false);
        if !is_xml {
            continue;
        }

        let mut xml_bytes = Vec::new();
        if let Err(e) = entry.read_to_end(&mut xml_bytes) {
            eprintln!("orcid: skipping tar entry after read error: {}", e);
            continue;
        }

        if let Some(record) = parse_orcid_xml(&xml_bytes) {
            batch.push(serialize_to_people_row(&record, &xml_bytes));
        }

        // `total` counts rows already written; the pending batch counts too.
        if limit > 0 && total + batch.len() >= limit {
            batch.truncate(limit - total);
            break;
        }

        if batch.len() >= BATCH_SIZE {
            let count = write_people_batch(conn, &batch)?;
            total += count;
            eprintln!("orcid: {} inserted in batch ({} total)", count, total);
            batch.clear();
        }
    }

    // Flush whatever is left after the loop (final partial batch or limit hit).
    if !batch.is_empty() {
        let count = write_people_batch(conn, &batch)?;
        total += count;
    }

    eprintln!("orcid: {} people imported; building FTS index…", total);
    Ok(total)
}

fn open_people_conn(output_path: &Path) -> Result<rusqlite::Connection> {
    use rusqlite::Connection;
    if let Some(parent) = output_path.parent() {
        if !parent.as_os_str().is_empty() && !parent.exists() {
            std::fs::create_dir_all(parent)
                .map_err(|e| Error::Parse(format!("create output dir: {}", e)))?;
        }
    }
    let conn = Connection::open(output_path)
        .map_err(|e| Error::Parse(format!("open sqlite '{}': {}", output_path.display(), e)))?;
    conn.query_row("PRAGMA journal_mode=WAL", [], |r: &rusqlite::Row<'_>| r.get::<_, String>(0))
        .map_err(|e| Error::Parse(format!("WAL mode: {}", e)))?;
    ensure_people_schema(&conn)?;
    // Drop FTS before bulk import; rebuild at the end for throughput.
    let _ = conn.execute("DROP TABLE IF EXISTS people_fts", []);
    Ok(conn)
}

/// Finish a bulk import: create the `people_fts` table, rebuild its content
/// from `people`, and attempt a passive WAL checkpoint.
///
/// `total` is passed through unchanged as the return value so callers can
/// chain this after the import step.
///
/// # Errors
///
/// Fails when the FTS5 table cannot be created (it must not exist yet) or when
/// the rebuild fails. The checkpoint is best-effort and never fails the import.
fn finish_people_import(conn: &rusqlite::Connection, total: usize) -> Result<usize> {
    conn.execute_batch(PEOPLE_FTS5_DDL)
        .map_err(|e| Error::Parse(format!("FTS5 DDL: {}", e)))?;
    conn.execute("INSERT INTO people_fts(people_fts) VALUES('rebuild')", [])
        .map_err(|e| Error::Parse(format!("FTS5 rebuild: {}", e)))?;
    // `PRAGMA wal_checkpoint` returns a result row, so it must be run as a
    // query: `execute` rejects row-returning statements. Best-effort only.
    let _ = conn.query_row("PRAGMA wal_checkpoint(PASSIVE)", [], |_| Ok(()));
    Ok(total)
}

/// Import a local tar.gz summaries file into the `people` SQLite table.
/// `limit = 0` imports all records; any other value stops after that many valid records.
pub fn stream_summaries_to_sqlite(tar_gz_path: &Path, output_path: &Path, limit: usize) -> Result<usize> {
    let conn = open_people_conn(output_path)?;
    let file = std::fs::File::open(tar_gz_path)
        .map_err(|e| Error::Parse(format!("open '{}': {}", tar_gz_path.display(), e)))?;
    let total = drain_tar_into_conn(&conn, file, limit)?;
    finish_people_import(&conn, total)
}

/// Download the ORCID Public Data File summaries from an HTTP URL to a local
/// cache file using a single sequential GET request (no parallel Range
/// requests), then return the path. Creates parent directories as needed.
/// Skips the download if a non-empty file already exists at `cache_path`.
fn download_summaries_to_cache(url: &str, cache_path: &std::path::Path) -> Result<()> {
    if cache_path.exists() && cache_path.metadata().map(|m| m.len()).unwrap_or(0) > 0 {
        eprintln!("orcid: using cached {}", cache_path.display());
        return Ok(());
    }
    if let Some(parent) = cache_path.parent() {
        std::fs::create_dir_all(parent)
            .map_err(|e| Error::Parse(format!("mkdir: {}", e)))?;
    }
    eprintln!("orcid: downloading to {} …", cache_path.display());
    let client = reqwest::blocking::Client::builder()
        // A 46 GB file at 10 MB/s takes ~78 min; allow 6 h for slow links.
        .timeout(std::time::Duration::from_secs(6 * 3600))
        .build()
        .map_err(|e| Error::Http(e.to_string()))?;
    let mut resp = client
        .get(url)
        .send()
        .map_err(|e| Error::Http(format!("GET {url}: {e}")))?
        .error_for_status()
        .map_err(|e| Error::Http(format!("HTTP error: {e}")))?;
    let mut file = std::fs::File::create(cache_path)
        .map_err(|e| Error::Parse(format!("create {}: {}", cache_path.display(), e)))?;
    let bytes = std::io::copy(&mut resp, &mut file)
        .map_err(|e| Error::Parse(format!("download write: {}", e)))?;
    eprintln!(
        "orcid: {:.1} GB saved to {}",
        bytes as f64 / 1_073_741_824.0,
        cache_path.display()
    );
    Ok(())
}

fn write_people_batch(conn: &rusqlite::Connection, batch: &[PeopleRow]) -> Result<usize> {
    use rusqlite::params;
    let tx = conn
        .unchecked_transaction()
        .map_err(|e| Error::Parse(format!("begin transaction: {}", e)))?;
    {
        let mut stmt = tx
            .prepare(PEOPLE_INSERT)
            .map_err(|e| Error::Parse(format!("prepare insert: {}", e)))?;
        for row in batch {
            stmt.execute(params![
                row.id,
                row.orcid,
                row.given_name,
                row.family_name,
                row.credit_name,
                row.name,
                row.country,
                row.other_names,
                row.external_identifiers,
                row.date_updated,
                row.metadata,
            ])
            .map_err(|e| Error::Parse(format!("insert '{}': {}", row.id, e)))?;
        }
    }
    tx.commit()
        .map_err(|e| Error::Parse(format!("commit transaction: {}", e)))?;
    Ok(batch.len())
}

// ── ORCID public API (JSON) ───────────────────────────────────────────────────

// Serde structs for the ORCID member/public API `/v3.0/{orcid}/person` endpoint.
// The JSON uses a mix of plain strings and `{value: "..."}` wrappers.

/// Top-level object of the ORCID `/v3.0/{orcid}/person` JSON response.
///
/// `#[serde(default)]` makes every absent key fall back to its `Default` value.
#[derive(Deserialize, Default)]
#[serde(default)]
struct PersonJson {
    /// `last-modified-date`: epoch-milliseconds wrapper; `None` when null or absent.
    #[serde(rename = "last-modified-date")]
    last_modified_date: Option<EpochMs>,
    /// `name`: given / family / credit names.
    name: NameJson,
    /// `biography`: `None` when null or absent.
    biography: Option<BiographyJson>,
    /// `other-names`: wrapper around the list of alternative names.
    #[serde(rename = "other-names")]
    other_names: OtherNamesJson,
    /// `addresses`: wrapper around the list of addresses.
    addresses: AddressesJson,
    /// `researcher-urls`: wrapper around the list of personal URLs.
    #[serde(rename = "researcher-urls")]
    researcher_urls: ResearcherUrlsJson,
    /// `external-identifiers`: wrapper around person-level identifiers.
    #[serde(rename = "external-identifiers")]
    external_identifiers: ExtIdsJson,
    path: String, // "/0000-0003-1419-2405/person"
}

/// The `biography` object of a person response.
#[derive(Deserialize, Default)]
#[serde(default)]
struct BiographyJson {
    /// Biography text; `None` when null or absent.
    content: Option<String>,
}

/// The `other-names` object: a wrapper around the `other-name` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct OtherNamesJson {
    /// Items of the `other-name` array, in response order.
    #[serde(rename = "other-name")]
    other_names: Vec<OtherNameItemJson>,
}

/// One item of the `other-name` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct OtherNameItemJson {
    /// The alternative name text (empty when absent).
    content: String,
}

/// `{ "value": <integer> }` wrapper used for timestamps; the value is
/// interpreted as milliseconds since the Unix epoch by `epoch_ms_to_iso`.
#[derive(Deserialize, Default)]
#[serde(default)]
struct EpochMs {
    /// Timestamp in epoch milliseconds; defaults to 0 when absent.
    value: i64,
}

/// The `name` object of a person response.
#[derive(Deserialize, Default)]
#[serde(default)]
struct NameJson {
    // These can be `null` (not just absent) when the person has not set them,
    // hence `Option` around the `{value: "..."}` wrapper.
    /// `given-names` wrapper.
    #[serde(rename = "given-names")]
    given_names: Option<StringValue>,
    /// `family-name` wrapper.
    #[serde(rename = "family-name")]
    family_name: Option<StringValue>,
    /// `credit-name` wrapper; takes precedence as the display name when set.
    #[serde(rename = "credit-name")]
    credit_name: Option<StringValue>,
}

/// `{ "value": "..." }` wrapper the API uses around many plain strings.
#[derive(Deserialize, Default)]
#[serde(default)]
struct StringValue {
    /// The wrapped string; empty when the key is absent.
    value: String,
}

/// The `addresses` object: a wrapper around the `address` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct AddressesJson {
    /// Items of the `address` array; only the first is used for the country.
    address: Vec<AddressItemJson>,
}

/// One item of the `address` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct AddressItemJson {
    /// Country, wrapped as `{ "value": "..." }`.
    country: StringValue,
}

/// The `researcher-urls` object: a wrapper around the `researcher-url` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct ResearcherUrlsJson {
    /// Items of the `researcher-url` array, in response order.
    #[serde(rename = "researcher-url")]
    urls: Vec<ResearcherUrlItemJson>,
}

/// One item of the `researcher-url` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct ResearcherUrlItemJson {
    /// `url-name`: a plain string label; `None` when null or absent.
    #[serde(rename = "url-name")]
    url_name: Option<String>,
    /// The URL, wrapped as `{ "value": "..." }`; items with an empty URL are
    /// skipped when building `Data`.
    url: StringValue,
}

/// The `external-identifiers` object: a wrapper around the
/// `external-identifier` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct ExtIdsJson {
    /// Items of the `external-identifier` array, in response order.
    #[serde(rename = "external-identifier")]
    identifiers: Vec<ExtIdItemJson>,
}

/// One item of the `external-identifier` array. Both fields are plain strings
/// here (no `{value}` wrapper); items with an empty type or value are skipped
/// by consumers.
#[derive(Deserialize, Default)]
#[serde(default)]
struct ExtIdItemJson {
    /// `external-id-type`: the identifier scheme name as given by ORCID.
    #[serde(rename = "external-id-type")]
    type_: String,
    /// `external-id-value`: the identifier itself.
    #[serde(rename = "external-id-value")]
    value: String,
}

/// Format epoch milliseconds as an RFC 3339 UTC timestamp with second
/// precision and a `Z` suffix.
///
/// Returns an empty string for `0` (treated as "no timestamp") and for values
/// that do not map to a single valid instant.
fn epoch_ms_to_iso(ms: i64) -> String {
    use chrono::{TimeZone, Utc};

    if ms == 0 {
        return String::new();
    }
    match Utc.timestamp_millis_opt(ms).single() {
        Some(instant) => instant.to_rfc3339_opts(chrono::SecondsFormat::Secs, true),
        None => String::new(),
    }
}

/// Build a `Data` struct from a parsed ORCID person JSON response.
fn person_json_to_data(person: &PersonJson, orcid_url: &str) -> Data {
    let name_obj = &person.name;
    let given_name = name_obj.given_names.as_ref().map(|s| s.value.clone()).unwrap_or_default();
    let family_name = name_obj.family_name.as_ref().map(|s| s.value.clone()).unwrap_or_default();
    let name = name_obj.credit_name.as_ref().map(|s| s.value.clone()).unwrap_or_default();
    let title = if !name.is_empty() {
        name.clone()
    } else {
        match (given_name.is_empty(), family_name.is_empty()) {
            (false, false) => format!("{} {}", given_name, family_name),
            (true, false) => family_name.clone(),
            (false, true) => given_name.clone(),
            (true, true) => String::new(),
        }
    };
    let additional_names: Vec<String> = person.other_names.other_names
        .iter()
        .map(|n| n.content.clone())
        .filter(|s| !s.is_empty())
        .collect();
    let description = person
        .biography
        .as_ref()
        .and_then(|b| b.content.as_deref())
        .filter(|s| !s.is_empty())
        .unwrap_or_default()
        .to_string();
    let identifiers: Vec<Identifier> = person
        .external_identifiers
        .identifiers
        .iter()
        .filter(|e| !e.type_.is_empty() && !e.value.is_empty())
        .map(|e| {
            let (cm_type, _) = map_orcid_ext_id_type(&e.type_);
            Identifier {
                identifier: e.value.clone(),
                identifier_type: cm_type.to_string(),
                ..Default::default()
            }
        })
        .collect();
    let urls: Vec<PersonUrl> = person.researcher_urls.urls
        .iter()
        .filter(|r| !r.url.value.is_empty())
        .map(|r| PersonUrl {
            name: r.url_name.clone().unwrap_or_default(),
            url: r.url.value.clone(),
        })
        .collect();
    let country = person.addresses.address.first()
        .map(|a| a.country.value.clone())
        .unwrap_or_default();
    Data {
        id: orcid_url.to_string(),
        type_: "Person".to_string(),
        given_name,
        family_name,
        name,
        additional_names,
        title,
        description,
        identifiers,
        urls,
        country,
        asserted_by: "Author".to_string(),
        date_updated: epoch_ms_to_iso(person.last_modified_date.as_ref().map(|d| d.value).unwrap_or(0)),
        provider: "ORCID".to_string(),
        ..Data::default()
    }
}

/// Fetch a person record from the ORCID public API and return only the
/// `PeopleRow` ready for SQLite insertion, discarding the raw JSON.
/// `orcid_url` must be a canonical ORCID URL.
fn fetch_person_api(orcid_url: &str) -> Result<PeopleRow> {
    fetch_person_api_with_json(orcid_url).map(|(row, _raw_json)| row)
}

/// Fetch `/v3.0/{orcid}/person` from the ORCID public API and return both a
/// `PeopleRow` ready for SQLite insertion and the raw response as JSON.
///
/// `orcid_url` must be a canonical ORCID URL (`https://orcid.org/<id>`); the
/// prefix is removed once to obtain the bare identifier. The row's `metadata`
/// holds the response body, zstd-compressed (raw bytes if compression fails).
///
/// # Errors
///
/// Returns `Error::Http` when the client cannot be built, the request fails,
/// the server answers with an error status, or the body cannot be read, and
/// `Error::Parse` when the body is not valid JSON of the expected shape.
fn fetch_person_api_with_json(orcid_url: &str) -> Result<(PeopleRow, serde_json::Value)> {
    // `strip_prefix` removes the prefix exactly once; `trim_start_matches`
    // would keep stripping repeated occurrences.
    let orcid = orcid_url
        .strip_prefix("https://orcid.org/")
        .unwrap_or(orcid_url);
    let api_url = format!("https://pub.orcid.org/v3.0/{}/person", orcid);
    let client = reqwest::blocking::Client::builder()
        .user_agent(crate::io_utils::commonmeta_user_agent())
        .timeout(std::time::Duration::from_secs(30))
        .build()
        .map_err(|e| Error::Http(e.to_string()))?;
    let bytes = client
        .get(&api_url)
        .header("Accept", "application/vnd.orcid+json")
        .send()
        .map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
        .error_for_status()
        .map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
        .bytes()
        .map_err(|e| Error::Http(e.to_string()))?;

    // The body is parsed twice: once untyped (returned to the caller as-is)
    // and once into the typed view used to build the row.
    let json_value: serde_json::Value =
        serde_json::from_slice(&bytes).map_err(|e| Error::Parse(e.to_string()))?;
    let person: PersonJson =
        serde_json::from_slice(&bytes).map_err(|e| Error::Parse(e.to_string()))?;

    let country = person
        .addresses
        .address
        .first()
        .map(|a| a.country.value.clone())
        .unwrap_or_default();
    let other_names = person
        .other_names
        .other_names
        .iter()
        .map(|n| n.content.as_str())
        .filter(|s| !s.is_empty())
        .collect::<Vec<_>>()
        .join(" ");
    let ext_ids: Vec<serde_json::Value> = person
        .external_identifiers
        .identifiers
        .iter()
        .filter(|e| !e.type_.is_empty() && !e.value.is_empty())
        .map(|e| serde_json::json!({"type": e.type_, "value": e.value}))
        .collect();
    let external_identifiers = serde_json::to_string(&ext_ids).unwrap_or_default();
    let metadata = zstd::encode_all(bytes.as_ref(), 0).unwrap_or_else(|_| bytes.to_vec());

    // `person_json_to_data` already unwraps the name fields, builds the display
    // title and formats the timestamp; reuse its output instead of repeating
    // the same unwrapping here.
    let data = person_json_to_data(&person, orcid_url);

    let row = PeopleRow {
        id: data.id,
        orcid: orcid.to_string(),
        given_name: data.given_name,
        family_name: data.family_name,
        credit_name: data.name,
        name: data.title,
        country,
        other_names,
        external_identifiers,
        date_updated: data.date_updated,
        metadata,
    };
    Ok((row, json_value))
}

/// Fetch a person from the ORCID public API and return their record as `Data`.
/// Accepts a bare ORCID iD (`0000-0003-1419-2405`) or a full ORCID URL.
pub fn fetch_orcid(id: &str) -> Result<Data> {
    let url = crate::utils::normalize_orcid(id);
    if url.is_empty() {
        return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
    }
    let (_, json) = fetch_person_api_with_json(&url)?;
    let person: PersonJson =
        serde_json::from_value(json).map_err(|e| Error::Parse(e.to_string()))?;
    Ok(person_json_to_data(&person, &url))
}

/// Fetch a person from the ORCID public API and return both the parsed [`Data`]
/// and the raw ORCID 3.0 person JSON in a single HTTP request.
///
/// Prefer this over calling [`fetch_orcid`] and [`fetch_person_json`] separately
/// when you need both — e.g. when caching the raw JSON for later import into the
/// `people` table while also assembling the response.
pub fn fetch_orcid_with_json(id: &str) -> Result<(Data, serde_json::Value)> {
    let url = crate::utils::normalize_orcid(id);
    if url.is_empty() {
        return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
    }
    let (_, json) = fetch_person_api_with_json(&url)?;
    let person: PersonJson =
        serde_json::from_value(json.clone()).map_err(|e| Error::Parse(e.to_string()))?;
    Ok((person_json_to_data(&person, &url), json))
}

/// Look up a person via the ORCID public API and return the raw JSON response
/// conforming to the ORCID 3.0 person schema (`orcid_schema_v3.0.json`).
///
/// `id` is normalized first; an identifier that normalizes to an empty string
/// is rejected with `Error::Parse`.
pub fn fetch_person_json(id: &str) -> Result<serde_json::Value> {
    let orcid_url = crate::utils::normalize_orcid(id);
    if orcid_url.is_empty() {
        return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
    }
    fetch_person_api_with_json(&orcid_url).map(|(_row, raw)| raw)
}

/// Render a parsed XML summary record in the shape of an ORCID 3.0 person
/// JSON document.
///
/// The summaries archive carries only part of the full person record (no
/// biography, researcher URLs or emails), so those sections are emitted as
/// `null` or as empty containers that keep the schema shape. Name parts that
/// are empty in the XML become `null`. The last-modified date is parsed as
/// RFC 3339 and emitted as epoch milliseconds (0 when it cannot be parsed).
fn xml_record_to_person_json(record: &OrcidXml) -> serde_json::Value {
    let orcid = &record.orcid_identifier.path;
    let person = &record.person;

    // Convert ISO-8601 string (from XML) to epoch ms (JSON schema format).
    let last_modified_ms =
        match chrono::DateTime::parse_from_rfc3339(&record.history.last_modified_date) {
            Ok(stamp) => stamp.timestamp_millis(),
            Err(_) => 0,
        };

    // Name parts use `{"value": ...}` when set and `null` otherwise.
    let wrap_or_null = |text: &str| -> serde_json::Value {
        if text.is_empty() {
            serde_json::Value::Null
        } else {
            serde_json::json!({"value": text})
        }
    };
    let given = wrap_or_null(&person.name.given_names);
    let family = wrap_or_null(&person.name.family_name);
    let credit = wrap_or_null(&person.name.credit_name);

    let mut other_name_items: Vec<serde_json::Value> = Vec::new();
    for alias in &person.other_names.names {
        other_name_items.push(serde_json::json!({"content": alias.content, "visibility": "public"}));
    }

    let mut address_items: Vec<serde_json::Value> = Vec::new();
    for addr in &person.addresses.addresses {
        address_items.push(serde_json::json!({"country": {"value": addr.country}, "visibility": "public"}));
    }

    // Keywords are not available from the XML structs; always empty.
    let keyword_items: Vec<serde_json::Value> = Vec::new();

    let mut ext_id_items: Vec<serde_json::Value> = Vec::new();
    for ext in &person.external_identifiers.identifiers {
        ext_id_items.push(serde_json::json!({
            "external-id-type": ext.type_,
            "external-id-value": ext.value,
            "external-id-relationship": "self",
            "visibility": "public"
        }));
    }

    let other_names_path = format!("/{}/other-names", orcid);
    let researcher_urls_path = format!("/{}/researcher-urls", orcid);
    let email_path = format!("/{}/email", orcid);
    let address_path = format!("/{}/address", orcid);
    let keywords_path = format!("/{}/keywords", orcid);
    let ext_ids_path = format!("/{}/external-identifiers", orcid);
    let person_path = format!("/{}/person", orcid);

    serde_json::json!({
        "last-modified-date": {"value": last_modified_ms},
        "name": {
            "given-names": given,
            "family-name": family,
            "credit-name": credit,
            "visibility": "public",
            "path": orcid
        },
        "other-names": {
            "other-name": other_name_items,
            "path": other_names_path
        },
        "biography": null,
        "researcher-urls": {
            "researcher-url": [],
            "path": researcher_urls_path
        },
        "emails": {
            "email": [],
            "path": email_path
        },
        "addresses": {
            "address": address_items,
            "path": address_path
        },
        "keywords": {
            "keyword": keyword_items,
            "path": keywords_path
        },
        "external-identifiers": {
            "external-identifier": ext_id_items,
            "path": ext_ids_path
        },
        "path": person_path
    })
}

/// Look up a person in the local SQLite database and return the raw ORCID 3.0
/// person JSON conforming to `orcid_schema_v3.0.json`.
///
/// JSON metadata blobs (from [`import_person`]) are decompressed and returned
/// as-is. XML blobs (from [`stream_summaries_to_sqlite`]) are reconstructed
/// into the equivalent JSON shape.
pub fn fetch_person_json_sqlite(id: &str, db_path: &Path) -> Result<serde_json::Value> {
    use rusqlite::{params, Connection};

    let url = crate::utils::normalize_orcid(id);
    if url.is_empty() {
        return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
    }

    let conn = Connection::open(db_path)
        .map_err(|e| Error::Parse(format!("open sqlite '{}': {}", db_path.display(), e)))?;
    let blob: Vec<u8> = conn
        .query_row(
            "SELECT metadata FROM people WHERE id = ?1 LIMIT 1",
            params![url],
            |row| row.get(0),
        )
        .map_err(|e| {
            if matches!(e, rusqlite::Error::QueryReturnedNoRows) {
                Error::Parse(format!("person '{}' not found in local database", id))
            } else {
                Error::Parse(format!("sqlite query failed: {}", e))
            }
        })?;
    let raw = zstd::decode_all(std::io::Cursor::new(&blob))
        .map_err(|e| Error::Parse(format!("decompress metadata for '{}': {}", id, e)))?;

    if raw.iter().find(|&&b| b != b' ' && b != b'\n' && b != b'\r' && b != b'\t') == Some(&b'<') {
        // XML blob from bulk import — reconstruct JSON shape.
        let record = parse_orcid_xml(&raw)
            .ok_or_else(|| Error::Parse(format!("re-parse XML for '{}'", id)))?;
        Ok(xml_record_to_person_json(&record))
    } else {
        // JSON blob from API import — return directly.
        serde_json::from_slice(&raw).map_err(|e| Error::Parse(format!("re-parse JSON for '{}': {}", id, e)))
    }
}

/// Bring the `people` table and its `people_fts` FTS5 index up to the current
/// schema.
///
/// Safe to call on both new and existing databases:
/// - New databases: `PEOPLE_DDL` and `PEOPLE_FTS5_DDL` create everything.
/// - Existing databases: missing columns are added, the legacy `keywords`
///   column is dropped, and the FTS5 index is recreated (with a content
///   rebuild when `people` has rows) if its definition lacks `other_names`.
///
/// # Errors
///
/// Returns [`Error::Parse`] when the base DDL, the FTS5 drop/create, or the
/// FTS5 rebuild fails. Column migrations and the `keywords` cleanup are
/// best-effort; their errors are ignored.
fn ensure_people_schema(conn: &rusqlite::Connection) -> Result<()> {
    // Stored CREATE statement of `people_fts`; empty when the table is missing
    // or the lookup fails.
    let fts_definition = || -> String {
        conn.query_row(
            "SELECT COALESCE(sql, '') FROM sqlite_master WHERE name='people_fts'",
            [],
            |row| row.get(0),
        )
        .unwrap_or_default()
    };

    conn.execute_batch(PEOPLE_DDL)
        .map_err(|e| Error::Parse(format!("people DDL: {}", e)))?;

    // SQLite has no ADD COLUMN IF NOT EXISTS, so "duplicate column" failures
    // on already-migrated databases are expected and deliberately ignored.
    for stmt in PEOPLE_MIGRATE_DDL {
        let _ = conn.execute_batch(stmt);
    }

    // The keywords column was removed from the v1.0 schema. The FTS table
    // references the content table, so it has to go first. ALTER TABLE …
    // DROP COLUMN needs SQLite ≥ 3.35; failures are ignored.
    if fts_definition().contains("keywords") {
        let _ = conn.execute_batch("DROP TABLE IF EXISTS people_fts");
        let _ = conn.execute_batch("ALTER TABLE people DROP COLUMN keywords");
    }

    // Re-read the definition: it is now empty if the table was just dropped.
    if fts_definition().contains("other_names") {
        return Ok(());
    }

    // FTS5 is missing or predates other_names — recreate it.
    conn.execute_batch("DROP TABLE IF EXISTS people_fts")
        .map_err(|e| Error::Parse(format!("drop FTS5: {}", e)))?;
    conn.execute_batch(PEOPLE_FTS5_DDL)
        .map_err(|e| Error::Parse(format!("FTS5 DDL: {}", e)))?;

    // Only rebuild the index when there is content to index.
    let has_rows = conn
        .query_row("SELECT COUNT(*) FROM people", [], |r| r.get::<_, i64>(0))
        .map(|count| count > 0)
        .unwrap_or(false);
    if has_rows {
        conn.execute("INSERT INTO people_fts(people_fts) VALUES('rebuild')", [])
            .map_err(|e| Error::Parse(format!("FTS5 rebuild: {}", e)))?;
    }

    Ok(())
}

/// Fetch a single person from the ORCID public API and upsert them into the
/// `people` table of the SQLite database at `people_db`.
///
/// Accepts a bare ORCID iD (`0000-0003-1419-2405`) or a full ORCID URL.
/// Creates the table and FTS5 index if they don't already exist, and rebuilds
/// the FTS5 index after the upsert.
///
/// Also fetches the person's works from Crossref and DataCite, deduplicates
/// them by `id`, sorts them by `date_published` descending, keeps at most 50,
/// and upserts them into the `works` table at `works_db` (may be the same
/// path as `people_db`). A failed Crossref or DataCite fetch is treated as an
/// empty result rather than an error.
///
/// Returns the number of works written.
///
/// # Errors
///
/// Returns [`Error::Parse`] for an invalid ORCID identifier or a SQLite
/// failure, and propagates errors from the ORCID API fetch, the people
/// upsert, and the works upsert.
pub fn import_person(id: &str, people_db: &Path, works_db: &Path) -> Result<usize> {
    use rusqlite::Connection;

    // Per-source fetch limit and cap on the number of works written.
    const MAX_WORKS: usize = 50;

    let orcid_url = crate::utils::normalize_orcid(id);
    if orcid_url.is_empty() {
        return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
    }

    let conn = Connection::open(people_db)
        .map_err(|e| Error::Parse(format!("open sqlite '{}': {}", people_db.display(), e)))?;
    conn.query_row("PRAGMA journal_mode=WAL", [], |r| r.get::<_, String>(0))
        .map_err(|e| Error::Parse(format!("WAL mode: {}", e)))?;
    ensure_people_schema(&conn)?;

    let person = fetch_person_api(&orcid_url)?;
    write_people_batch(&conn, std::slice::from_ref(&person))?;

    conn.execute("INSERT INTO people_fts(people_fts) VALUES('rebuild')", [])
        .map_err(|e| Error::Parse(format!("FTS5 rebuild: {}", e)))?;

    // Crossref first, then DataCite; fetch failures degrade to "no works".
    let mut works: Vec<crate::Data> =
        crate::formats::crossref::fetch_by_orcid(&orcid_url, MAX_WORKS, 1).unwrap_or_default();
    works.extend(
        crate::formats::datacite::fetch_by_orcid(&orcid_url, MAX_WORKS, 1).unwrap_or_default(),
    );
    if works.is_empty() {
        return Ok(0);
    }

    // Keep the first occurrence of each id, newest publications first.
    let mut seen_ids = std::collections::HashSet::new();
    works.retain(|work| seen_ids.insert(work.id.clone()));
    works.sort_by(|a, b| b.date_published.cmp(&a.date_published));
    works.truncate(MAX_WORKS);

    crate::formats::commonmeta::upsert_sqlite(&works, works_db)?;
    Ok(works.len())
}

// ── Cache import ─────────────────────────────────────────────────────────────

/// Read ORCID person rows from a dragoman `cache.sqlite3` (`pid_records` table,
/// `raw_metadata_type = 'orcid'`) and upsert them into the `people` table at
/// `people_path` (which may be the same file as the main `commonmeta.sqlite3`).
///
/// Two storage formats are accepted transparently:
/// - **Commonmeta JSON** (written by dragoman when the raw ORCID JSON is not
///   available): top-level `"type": "Person"` field. Keywords are not available
///   in this format.
/// - **Raw ORCID 3.0 person JSON** (written by dragoman when it uses
///   [`fetch_orcid_with_json`]): full fidelity including keywords.
///
/// Returns the number of rows upserted.  FTS5 is rebuilt once at the end.
pub fn stream_cache_orcid_to_people_sqlite(
    cache_path: &Path,
    people_path: &Path,
) -> Result<usize> {
    use rusqlite::Connection;

    let cache_conn = Connection::open(cache_path)
        .map_err(|e| Error::Parse(format!("open cache '{}': {}", cache_path.display(), e)))?;

    let mut stmt = cache_conn
        .prepare(
            "SELECT pid, raw_metadata FROM pid_records WHERE raw_metadata_type = 'orcid'",
        )
        .map_err(|e| Error::Parse(format!("prepare cache query: {}", e)))?;

    let rows: Vec<(String, String)> = stmt
        .query_map([], |row| Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)))
        .map_err(|e| Error::Parse(format!("query cache: {}", e)))?
        .filter_map(|r| r.ok())
        .collect();

    if rows.is_empty() {
        return Ok(0);
    }

    let people_conn = Connection::open(people_path)
        .map_err(|e| Error::Parse(format!("open people db '{}': {}", people_path.display(), e)))?;
    let _: String = people_conn
        .query_row("PRAGMA journal_mode=WAL", [], |r| r.get(0))
        .map_err(|e| Error::Parse(format!("WAL mode: {}", e)))?;
    ensure_people_schema(&people_conn)?;

    let mut batch: Vec<PeopleRow> = Vec::with_capacity(rows.len());
    for (pid, raw) in &rows {
        let raw_bytes = raw.as_bytes();
        let val: serde_json::Value = match serde_json::from_str(raw) {
            Ok(v) => v,
            Err(_) => {
                eprintln!("orcid cache: skipping '{}': JSON parse error", pid);
                continue;
            }
        };

        let row = if val.get("type").and_then(|v| v.as_str()) == Some("Person") {
            // Commonmeta JSON — limited fidelity (no keywords).
            let data: crate::Data = match serde_json::from_value(val) {
                Ok(d) => d,
                Err(e) => {
                    eprintln!("orcid cache: skipping '{}': {}", pid, e);
                    continue;
                }
            };
            let orcid = pid.trim_start_matches("https://orcid.org/").to_string();
            let other_names = data.additional_names.join(" ");
            let ext_ids: Vec<serde_json::Value> = data
                .identifiers
                .iter()
                .map(|i| serde_json::json!({"type": i.identifier_type, "value": i.identifier}))
                .collect();
            let external_identifiers = serde_json::to_string(&ext_ids).unwrap_or_default();
            let metadata = zstd::encode_all(raw_bytes, 0).unwrap_or_else(|_| raw_bytes.to_vec());
            PeopleRow {
                id: data.id,
                orcid,
                given_name: data.given_name,
                family_name: data.family_name,
                credit_name: data.name.clone(),
                name: if !data.title.is_empty() { data.title } else { data.name },
                country: data.country,
                other_names,
                external_identifiers,
                date_updated: data.date_updated,
                metadata,
            }
        } else {
            // Raw ORCID 3.0 person JSON — full fidelity.
            let person: PersonJson = match serde_json::from_value(val) {
                Ok(p) => p,
                Err(e) => {
                    eprintln!("orcid cache: skipping '{}': {}", pid, e);
                    continue;
                }
            };
            let data = person_json_to_data(&person, pid);
            let other_names = person
                .other_names
                .other_names
                .iter()
                .map(|n| n.content.as_str())
                .filter(|s| !s.is_empty())
                .collect::<Vec<_>>()
                .join(" ");
            let ext_ids: Vec<serde_json::Value> = person
                .external_identifiers
                .identifiers
                .iter()
                .filter(|e| !e.type_.is_empty() && !e.value.is_empty())
                .map(|e| serde_json::json!({"type": e.type_, "value": e.value}))
                .collect();
            let external_identifiers = serde_json::to_string(&ext_ids).unwrap_or_default();
            let metadata = zstd::encode_all(raw_bytes, 0).unwrap_or_else(|_| raw_bytes.to_vec());
            let orcid = pid.trim_start_matches("https://orcid.org/").to_string();
            PeopleRow {
                id: data.id,
                orcid,
                given_name: data.given_name,
                family_name: data.family_name,
                credit_name: data.name.clone(),
                name: if !data.title.is_empty() { data.title } else { data.name },
                country: data.country,
                other_names,
                external_identifiers,
                date_updated: data.date_updated,
                metadata,
            }
        };

        batch.push(row);
    }

    if batch.is_empty() {
        return Ok(0);
    }

    let n = write_people_batch(&people_conn, &batch)?;
    people_conn
        .execute("INSERT INTO people_fts(people_fts) VALUES('rebuild')", [])
        .map_err(|e| Error::Parse(format!("FTS5 rebuild: {}", e)))?;
    Ok(n)
}

// ── Lookup ────────────────────────────────────────────────────────────────────

/// Look up a person by their ORCID iD or URL from a local SQLite database.
/// Accepts a bare ORCID iD or a full ORCID URL.
/// Handles both XML blobs (from [`stream_summaries_to_sqlite`]) and JSON blobs
/// (from [`import_person`]).
pub fn fetch_sqlite(id: &str, db_path: &Path) -> Result<Data> {
    use rusqlite::{params, Connection};

    let url = crate::utils::normalize_orcid(id);
    if url.is_empty() {
        return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
    }

    let conn = Connection::open(db_path)
        .map_err(|e| Error::Parse(format!("open sqlite '{}': {}", db_path.display(), e)))?;
    let blob: Vec<u8> = conn
        .query_row(
            "SELECT metadata FROM people WHERE id = ?1 LIMIT 1",
            params![url],
            |row| row.get(0),
        )
        .map_err(|e| {
            if matches!(e, rusqlite::Error::QueryReturnedNoRows) {
                Error::Parse(format!("person '{}' not found in local database", id))
            } else {
                Error::Parse(format!("sqlite query failed: {}", e))
            }
        })?;
    let raw = zstd::decode_all(std::io::Cursor::new(&blob))
        .map_err(|e| Error::Parse(format!("decompress metadata for '{}': {}", id, e)))?;

    // Records from stream_summaries_to_sqlite store XML; records from
    // import_person store JSON. Detect by the first non-whitespace byte.
    if raw.iter().find(|&&b| b != b' ' && b != b'\n' && b != b'\r' && b != b'\t') == Some(&b'<') {
        let record = parse_orcid_xml(&raw)
            .ok_or_else(|| Error::Parse(format!("re-parse XML for '{}'", id)))?;
        Ok(from_orcid(record))
    } else {
        let person: PersonJson = serde_json::from_slice(&raw)
            .map_err(|e| Error::Parse(format!("re-parse JSON for '{}': {}", id, e)))?;
        Ok(person_json_to_data(&person, &url))
    }
}

// ── Figshare release helpers ──────────────────────────────────────────────────

/// One entry of the `files` array in a figshare article response, as read by
/// [`fetch_orcid_release`].
#[derive(Deserialize)]
struct FigshareFile {
    /// File name; the summaries archive is recognized by its
    /// `_summaries.tar.gz` suffix.
    name: String,
    /// File size; copied into `OrcidRelease::size_bytes`.
    size: u64,
    /// Download URL; copied into `OrcidRelease::download_url`.
    download_url: String,
}

/// The subset of a figshare article (version) response that this module
/// reads: only the list of attached files.
#[derive(Deserialize)]
struct FigshareArticle {
    /// Files attached to the article version.
    files: Vec<FigshareFile>,
}

/// Fetch metadata for the ORCID summaries file from the figshare article API
/// (no download). Pass the figshare article `version` (1-indexed; 1 for the
/// 2025 dataset).
///
/// The returned [`OrcidRelease`] gives you the filename, download URL, and
/// size so you can download the ~46 GB file yourself and pass its local path
/// to [`stream_summaries_to_sqlite`].
pub fn fetch_orcid_release(version: u32) -> Result<OrcidRelease> {
    let url = format!(
        "https://api.figshare.com/v2/articles/{}/versions/{}",
        ORCID_FIGSHARE_ARTICLE_ID, version
    );
    let client = reqwest::blocking::Client::builder()
        .user_agent(format!(
            "commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
            env!("CARGO_PKG_VERSION")
        ))
        .build()
        .map_err(|e| Error::Http(e.to_string()))?;
    let text = client
        .get(&url)
        .send()
        .map_err(|e| Error::Http(format!("figshare API: {}", e)))?
        .error_for_status()
        .map_err(|e| Error::Http(format!("figshare API status: {}", e)))?
        .text()
        .map_err(|e| Error::Http(e.to_string()))?;
    let article: FigshareArticle =
        serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
    let summaries = article
        .files
        .into_iter()
        .find(|f| f.name.ends_with("_summaries.tar.gz"))
        .ok_or_else(|| Error::Parse("no summaries tar.gz found in figshare article".into()))?;
    let (year, batch) = parse_filename_date(&summaries.name);
    Ok(OrcidRelease {
        year,
        batch,
        filename: summaries.name,
        download_url: summaries.download_url,
        size_bytes: summaries.size,
    })
}

/// Extract year and batch number from `ORCID_{year}_{batch}_summaries.tar.gz`.
///
/// The name is split on `_`; the second field is the year and the third the
/// batch. A field that is missing or does not parse as `u16` yields `0`, so
/// an unrecognized name returns `(0, 0)`.
fn parse_filename_date(name: &str) -> (u16, u16) {
    // Skip the leading "ORCID" field, then read the next two as numbers.
    let mut fields = name.split('_').skip(1);
    let mut next_number = || -> u16 {
        fields
            .next()
            .and_then(|field| field.parse().ok())
            .unwrap_or(0)
    };
    let year = next_number();
    let batch = next_number();
    (year, batch)
}

/// Fetch the latest ORCID Public Data File release from figshare.
///
/// Uses `GET /v2/articles/{id}/versions` (public, no auth) to find the highest
/// version number, then calls [`fetch_orcid_release`] for the file metadata.
pub fn fetch_latest_orcid_release() -> Result<OrcidRelease> {
    #[derive(Deserialize)]
    struct VersionEntry {
        version: u32,
    }
    let url = format!(
        "https://api.figshare.com/v2/articles/{}/versions",
        ORCID_FIGSHARE_ARTICLE_ID
    );
    let client = reqwest::blocking::Client::builder()
        .user_agent(format!(
            "commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
            env!("CARGO_PKG_VERSION")
        ))
        .build()
        .map_err(|e| Error::Http(e.to_string()))?;
    let text = client
        .get(&url)
        .send()
        .map_err(|e| Error::Http(format!("figshare versions API: {}", e)))?
        .error_for_status()
        .map_err(|e| Error::Http(format!("figshare versions API status: {}", e)))?
        .text()
        .map_err(|e| Error::Http(e.to_string()))?;
    let entries: Vec<VersionEntry> =
        serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
    let latest = entries
        .into_iter()
        .map(|e| e.version)
        .max()
        .ok_or_else(|| Error::Parse("figshare returned empty versions list".into()))?;
    fetch_orcid_release(latest)
}

/// Drop and rebuild the `people_fts` FTS5 virtual table.
pub fn rebuild_people_fts(path: &Path) -> Result<()> {
    let conn = rusqlite::Connection::open(path)
        .map_err(|e| Error::Parse(format!("open sqlite '{}': {}", path.display(), e)))?;
    let _ = conn.execute("DROP TABLE IF EXISTS people_fts", []);
    conn.execute_batch(PEOPLE_FTS5_DDL)
        .map_err(|e| Error::Parse(format!("people_fts DDL: {}", e)))?;
    conn.execute("INSERT INTO people_fts(people_fts) VALUES('rebuild')", [])
        .map_err(|e| Error::Parse(format!("people_fts rebuild: {}", e)))?;
    Ok(())
}

/// Read the installed ORCID Public Data File version from the `settings` table.
/// Returns `None` when no version has been recorded yet.
pub fn fetch_installed_orcid_public_data_version(db_path: &Path) -> Result<Option<String>> {
    use rusqlite::Connection;
    use rusqlite::Error as SqliteError;
    let conn = Connection::open(db_path)
        .map_err(|e| Error::Parse(format!("failed to open sqlite: {}", e)))?;
    match conn.query_row(
        "SELECT value FROM settings WHERE key = 'orcid_public_data_version' LIMIT 1",
        [],
        |row| row.get::<_, String>(0),
    ) {
        Ok(v) => Ok(Some(v)),
        Err(SqliteError::QueryReturnedNoRows) => Ok(None),
        Err(_) => Ok(None),
    }
}

/// Probe `url` with a HEAD request (redirects are not followed, 30 s timeout)
/// and return `(filename, size_bytes)`.
///
/// The filename comes from the `filename=` parameter of `Content-Disposition`
/// and is accepted only when it ends in `.tar.gz`; otherwise
/// `orcid_summaries.tar.gz` is used. The size comes from `Content-Length` and
/// defaults to `0`. If the client cannot be built or the request fails, the
/// result is `("orcid_summaries.tar.gz", 0)`.
fn probe_download_url(url: &str) -> (String, u64) {
    const FALLBACK_NAME: &str = "orcid_summaries.tar.gz";
    let fallback = || (FALLBACK_NAME.to_string(), 0u64);

    let builder = reqwest::blocking::Client::builder()
        .user_agent(format!(
            "commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
            env!("CARGO_PKG_VERSION")
        ))
        .redirect(reqwest::redirect::Policy::none())
        .timeout(std::time::Duration::from_secs(30));
    let Ok(client) = builder.build() else {
        return fallback();
    };
    let Ok(resp) = client.head(url).send() else {
        return fallback();
    };

    let size = resp.content_length().unwrap_or(0);

    let disposition = resp
        .headers()
        .get("content-disposition")
        .and_then(|value| value.to_str().ok());
    let filename = disposition
        .and_then(|header| {
            header
                .split(';')
                .map(str::trim)
                .find_map(|param| param.strip_prefix("filename="))
        })
        .map(|quoted| quoted.trim_matches('"').to_string())
        .filter(|candidate| candidate.ends_with(".tar.gz"))
        .unwrap_or_else(|| FALLBACK_NAME.to_string());

    (filename, size)
}

/// Record limit passed to the import when [`import_orcid_public_data`] is
/// called with `sample = true`.
const ORCID_SAMPLE_SIZE: usize = 1_000;

/// Download or load the ORCID Public Data File summaries and stream them into
/// the `people` table at `output_path`.
///
/// `source` can be:
/// - `None` — auto-discover the latest release from the figshare API
/// - `Some(local_path)` — stream directly from a local `.tar.gz` file (no network)
/// - `Some(url)` starting with `http` — download from that URL, bypassing the
///   figshare API (get the URL via `--list-releases` on a machine that can reach
///   `api.figshare.com`)
///
/// When `no_network` is true, only a local-file `source` is accepted; the URL
/// and auto-discover paths return an error instead of touching the network.
///
/// When `sample` is true, stops after [`ORCID_SAMPLE_SIZE`] valid records.
/// For a URL source with no cache, samples without downloading the full file.
/// Sample imports never record the installed version.
///
/// Returns the number of people imported, or `0` when the release identified
/// by the file name is already recorded as installed in `output_path`.
///
/// # Errors
///
/// Returns [`Error::Parse`] for a missing local file, for `no_network` without
/// a local file, for an article-bundle URL, and for failures creating the
/// cache directory or writing the `settings` row; [`Error::Http`] for request
/// failures. Errors from the streaming/import helpers are propagated.
pub fn import_orcid_public_data(
    output_path: &Path,
    source: Option<&str>,
    no_network: bool,
    sample: bool,
) -> Result<usize> {
    use rusqlite::params;
    use rusqlite::Connection;

    // 0 is passed when not sampling — presumably "no limit"; confirm against
    // stream_summaries_to_sqlite / drain_tar_into_conn.
    let limit = if sample { ORCID_SAMPLE_SIZE } else { 0 };

    // Record `version_key` ("{year}_{batch}") as the installed release in the
    // `settings` table. Keys starting with "0_" (year could not be parsed from
    // the file name) are not recorded.
    fn write_version(output_path: &Path, version_key: &str) -> Result<()> {
        if version_key.starts_with("0_") {
            return Ok(());
        }
        let conn = Connection::open(output_path)
            .map_err(|e| Error::Parse(format!("open sqlite: {}", e)))?;
        conn.execute(
            "INSERT OR REPLACE INTO settings (key, value) VALUES ('orcid_public_data_version', ?1)",
            params![version_key],
        )
        .map_err(|e| Error::Parse(format!("write settings: {}", e)))?;
        Ok(())
    }

    // ── Local file: stream directly, no network ───────────────────────────────
    // Any source that does not start with "http" is treated as a local path.
    if let Some(s) = source {
        if !s.starts_with("http") {
            let local = std::path::Path::new(s);
            if !local.exists() {
                return Err(Error::Parse(format!("file not found: {}", s)));
            }
            let filename = local
                .file_name()
                .and_then(|n| n.to_str())
                .unwrap_or("orcid_summaries.tar.gz")
                .to_string();
            let (year, batch) = parse_filename_date(&filename);
            let version_key = format!("{}_{}", year, batch);
            eprintln!("orcid: importing {} …", local.display());
            let count = stream_summaries_to_sqlite(local, output_path, limit)?;
            if !sample {
                write_version(output_path, &version_key)?;
            }
            eprintln!("orcid: import complete — {} people", count);
            return Ok(count);
        }
    }

    // Everything below needs the network.
    if no_network {
        return Err(Error::Parse(
            "--no-network: pass a local file path or remove --no-network".to_string(),
        ));
    }

    // ── HTTP URL provided directly → download to cache (or sample-stream) ────
    // Full import: single sequential GET → cache file → stream_summaries_to_sqlite.
    // Sample without cache: stream the first N records directly without writing
    // the full ~46 GB to disk (same pattern as DataCite --sample without cache).
    if let Some(url) = source.filter(|s| s.starts_with("http")) {
        // Reject the whole-article bundle URL with instructions for obtaining
        // the summaries-only file.
        if url.contains("/ndownloader/articles/") {
            return Err(Error::Parse(
                "that URL downloads the full article bundle (~221 GB, all files).\n\
                Pass the summaries-only file (~46 GB) instead.\n\
                On a machine where figshare is reachable:\n  \
                commonmeta import --from orcid --list-releases\n\
                Or download the file there and copy it:\n  \
                scp ORCID_2025_10_summaries.tar.gz root@server:/data/\n  \
                commonmeta import --from orcid /data/ORCID_2025_10_summaries.tar.gz"
                    .to_string(),
            ));
        }
        // Derive version from the filename embedded in the URL path
        // (e.g. ORCID_2025_10_summaries.tar.gz). No HEAD request needed.
        let url_filename = url
            .rsplit('/')
            .next()
            .filter(|f| f.ends_with(".tar.gz"))
            .unwrap_or("orcid_summaries.tar.gz")
            .to_string();
        let (year, batch) = parse_filename_date(&url_filename);
        let version_key = format!("{}_{}", year, batch);
        let installed = if output_path.exists() {
            fetch_installed_orcid_public_data_version(output_path)?
        } else {
            None
        };
        // Skip a full import when this exact release is already recorded;
        // unparseable versions ("0_…") and sample runs always proceed.
        if !sample && !version_key.starts_with("0_") && installed.as_deref() == Some(&version_key) {
            eprintln!("orcid: {} already installed", version_key);
            return Ok(0);
        }
        let cache_path = crate::io_utils::cache_dir("orcid").join(&url_filename);
        // The expected size is unknown here, so any non-empty file counts as cached.
        let cached = cache_path.exists() && cache_path.metadata().map(|m| m.len()).unwrap_or(0) > 0;
        if sample && !cached {
            // Sample without cache: stream first N records directly, skip disk write.
            eprintln!("orcid: streaming first {} records from {} (no cache)", ORCID_SAMPLE_SIZE, url);
            let client = reqwest::blocking::Client::builder()
                .timeout(std::time::Duration::from_secs(6 * 3600))
                .build()
                .map_err(|e| Error::Http(e.to_string()))?;
            let resp = client
                .get(url)
                .send()
                .map_err(|e| Error::Http(format!("GET {url}: {e}")))?
                .error_for_status()
                .map_err(|e| Error::Http(format!("HTTP error: {e}")))?;
            let conn = open_people_conn(output_path)?;
            let buffered = std::io::BufReader::with_capacity(256 * 1024, resp);
            let total = drain_tar_into_conn(&conn, buffered, limit)?;
            return finish_people_import(&conn, total);
        }
        // NOTE(review): called even when `cached` is true, unlike the
        // auto-discover path below, which skips the download for a cached
        // file — presumably the helper copes with an existing file; confirm.
        download_summaries_to_cache(url, &cache_path)?;
        eprintln!("orcid: importing {} into {} …", url_filename, output_path.display());
        let count = stream_summaries_to_sqlite(&cache_path, output_path, limit)?;
        if !sample {
            write_version(output_path, &version_key)?;
        }
        eprintln!("orcid: import complete — {} people", count);
        return Ok(count);
    }

    // ── Resolve download URL and filename (figshare auto-discover) ────────────
    let (download_url, filename, size_bytes) = match source {
        // Both `Some` cases (local path, http URL) returned above.
        Some(_) => unreachable!("non-http source handled above as local file"),
        None => {
            eprintln!("orcid: checking latest release on figshare …");
            // An error whose message contains "403" is extended with
            // workarounds; any other error is passed through unchanged.
            let release = fetch_latest_orcid_release().map_err(|e| {
                let msg = e.to_string();
                if msg.contains("403") {
                    Error::Http(format!(
                        "{msg}\n\
                        figshare is blocked on this host. Options:\n\
                        A) download the file on another machine and copy it here:\n  \
                           commonmeta import --from orcid --list-releases   # get URL\n  \
                           wget <SUMMARIES_URL>   # ~43 GB\n  \
                           scp ORCID_*_summaries.tar.gz root@server:/data/\n  \
                           commonmeta import --from orcid /data/ORCID_*_summaries.tar.gz\n\
                        B) pass the URL directly (from --list-releases on another machine):\n  \
                           commonmeta import --from orcid \"<SUMMARIES_URL>\""
                    ))
                } else {
                    e
                }
            })?;
            eprintln!(
                "orcid: {} ({}_{}) {:.1} GB",
                release.filename, release.year, release.batch,
                release.size_bytes as f64 / 1_073_741_824.0,
            );
            (release.download_url, release.filename, release.size_bytes)
        }
    };

    let (year, batch) = parse_filename_date(&filename);
    let version_key = format!("{}_{}", year, batch);

    let installed = if output_path.exists() {
        fetch_installed_orcid_public_data_version(output_path)?
    } else {
        None
    };
    // Same "already installed" short-circuit as in the URL path.
    if !sample && !version_key.starts_with("0_") && installed.as_deref() == Some(&version_key) {
        eprintln!("orcid: {} already installed", version_key);
        return Ok(0);
    }

    let cache_path = crate::io_utils::cache_dir("orcid").join(&filename);
    // With a known size from figshare, only a file of exactly that size counts
    // as cached.
    let cached = cache_path.exists()
        && size_bytes > 0
        && cache_path.metadata().map(|m| m.len()).unwrap_or(0) == size_bytes;

    if sample && !cached {
        // Sample without cache: read records straight from the HTTP response.
        eprintln!(
            "orcid: streaming first {} records from {} (no cache) …",
            ORCID_SAMPLE_SIZE, download_url
        );
        let client = reqwest::blocking::Client::builder()
            .timeout(std::time::Duration::from_secs(6 * 3600))
            .build()
            .map_err(|e| Error::Http(e.to_string()))?;
        let resp = client
            .get(&download_url)
            .send()
            .map_err(|e| Error::Http(format!("GET: {e}")))?
            .error_for_status()
            .map_err(|e| Error::Http(format!("HTTP error: {e}")))?;
        let conn = open_people_conn(output_path)?;
        let buffered = std::io::BufReader::with_capacity(256 * 1024, resp);
        let total = drain_tar_into_conn(&conn, buffered, limit)?;
        return finish_people_import(&conn, total);
    }

    if !cached {
        eprintln!("orcid: downloading to {} …", cache_path.display());
        if let Some(parent) = cache_path.parent() {
            std::fs::create_dir_all(parent)
                .map_err(|e| Error::Parse(format!("mkdir: {}", e)))?;
        }
        // Note: a download failure is re-wrapped as Error::Parse here.
        download_summaries_to_cache(&download_url, &cache_path)
            .map_err(|e| Error::Parse(format!("download: {}", e)))?;
    } else {
        eprintln!("orcid: using cached {}", cache_path.display());
    }

    eprintln!("orcid: importing {} into {} …", filename, output_path.display());
    let count = stream_summaries_to_sqlite(&cache_path, output_path, limit)?;
    if !sample {
        write_version(output_path, &version_key)?;
    }
    eprintln!("orcid: import complete — {} people", count);
    Ok(count)
}

// ── Employment / affiliations ─────────────────────────────────────────────────

/// Top-level employments JSON payload: a list of `affiliation-group` entries.
/// Presumably the ORCID API employments response consumed by
/// `fetch_person_employments` — confirm against that function.
/// Missing fields fall back to their defaults (`#[serde(default)]`).
#[derive(Deserialize, Default)]
#[serde(default)]
struct EmploymentsResponse {
    /// Contents of the `affiliation-group` JSON key.
    #[serde(rename = "affiliation-group")]
    affiliation_groups: Vec<AffiliationGroupJson>,
}

/// One element of `affiliation-group`: a list of summary wrappers.
#[derive(Deserialize, Default)]
#[serde(default)]
struct AffiliationGroupJson {
    /// Contents of the `summaries` JSON key.
    summaries: Vec<AffiliationSummaryWrapper>,
}

/// Wrapper object around a single employment summary.
#[derive(Deserialize, Default)]
#[serde(default)]
struct AffiliationSummaryWrapper {
    /// Contents of the `employment-summary` JSON key; `None` when absent.
    #[serde(rename = "employment-summary")]
    employment_summary: Option<EmploymentSummaryJson>,
}

/// A single employment summary: organization, role, and optional date range.
#[derive(Deserialize, Default)]
#[serde(default)]
struct EmploymentSummaryJson {
    /// Employing organization.
    organization: OrganizationJson,
    /// Contents of the `role-title` JSON key; empty when absent.
    #[serde(rename = "role-title")]
    role_title: String,
    /// Contents of the `start-date` JSON key.
    #[serde(rename = "start-date")]
    start_date: Option<OrcidDateJson>,
    /// Contents of the `end-date` JSON key.
    #[serde(rename = "end-date")]
    end_date: Option<OrcidDateJson>,
}

/// Organization attached to an employment summary.
#[derive(Deserialize, Default)]
#[serde(default)]
struct OrganizationJson {
    /// Organization name; empty when absent.
    name: String,
    /// Contents of the `disambiguated-organization` JSON key, if present.
    #[serde(rename = "disambiguated-organization")]
    disambiguated: Option<DisambiguatedOrgJson>,
}

/// Disambiguated organization identifier together with the registry it comes
/// from. Consumed by `resolve_ror_id`.
#[derive(Deserialize, Default)]
#[serde(default)]
struct DisambiguatedOrgJson {
    /// Contents of `disambiguated-organization-identifier`; for a ROR source
    /// this may carry a `https://ror.org/` or `http://ror.org/` prefix, which
    /// `resolve_ror_id` strips.
    #[serde(rename = "disambiguated-organization-identifier")]
    identifier: String,
    /// Contents of `disambiguation-source`; `resolve_ror_id` compares it
    /// case-insensitively against ROR, GRID, ISNI, FundRef and Wikidata.
    #[serde(rename = "disambiguation-source")]
    source: String,
}

/// A date split into optional year / month / day components, each wrapped in
/// a `StringValue` object. Converted to a string by `orcid_date_to_iso`.
#[derive(Deserialize, Default)]
#[serde(default)]
struct OrcidDateJson {
    /// Year component; without a non-empty year the date is treated as absent.
    year: Option<StringValue>,
    /// Month component.
    month: Option<StringValue>,
    /// Day component; only used when the month is also present.
    day: Option<StringValue>,
}

/// One affiliation of a person, as returned by `fetch_person_employments`.
pub struct PersonAffiliation {
    /// ROR identifier of the organization, when available. Presumably the
    /// bare ID produced by `resolve_ror_id` — confirm at the construction site.
    pub ror_id: Option<String>,
    /// Organization name.
    pub name: String,
    /// Role or title held at the organization, when available.
    pub role: Option<String>,
    /// Start date, when available. Presumably in the `YYYY[-MM[-DD]]` form
    /// produced by `orcid_date_to_iso` — confirm.
    pub start_date: Option<String>,
    /// End date, when available; same presumed format as `start_date`.
    pub end_date: Option<String>,
}

/// Format an ORCID date as `YYYY`, `YYYY-MM` or `YYYY-MM-DD`.
///
/// Returns `None` when the year is missing or empty. Month and day are
/// left-padded with `0` to two characters. The day is only used when the
/// month is also present and non-empty; an empty component counts as missing.
fn orcid_date_to_iso(date: &OrcidDateJson) -> Option<String> {
    // Treat an absent component and an empty string the same way.
    let year = date
        .year
        .as_ref()
        .map(|y| y.value.as_str())
        .filter(|y| !y.is_empty())?;
    let month = date
        .month
        .as_ref()
        .map(|m| m.value.as_str())
        .filter(|m| !m.is_empty());
    let day = date
        .day
        .as_ref()
        .map(|d| d.value.as_str())
        .filter(|d| !d.is_empty());

    let iso = match (month, day) {
        (Some(m), Some(d)) => format!("{}-{:0>2}-{:0>2}", year, m, d),
        (Some(m), None) => format!("{}-{:0>2}", year, m),
        (None, _) => year.to_string(),
    };
    Some(iso)
}

/// Find the organization whose `external_ids` contain the given external
/// identifier and return its `id` from the `organizations` SQLite table.
///
/// `ext_type` must be lowercase (e.g. "grid", "isni", "fundref", "wikidata");
/// it is compared with the lowercased `type` of each external-id entry.
/// `ext_value` matches either the entry's `preferred` value or any element of
/// its `all` array.
///
/// Returns the full ROR URL (e.g. `https://ror.org/04wxnsj81`), or `None` when
/// the database cannot be opened, the query fails, or nothing matches.
fn ror_id_from_sqlite(ext_type: &str, ext_value: &str, db_path: &Path) -> Option<String> {
    use rusqlite::{Connection, params};

    const LOOKUP_SQL: &str =
        "SELECT o.id FROM organizations o, json_each(o.external_ids) ext \
         WHERE LOWER(json_extract(ext.value, '$.type')) = ?1 \
           AND (json_extract(ext.value, '$.preferred') = ?2 \
                OR EXISTS ( \
                    SELECT 1 FROM json_each(json_extract(ext.value, '$.all')) a \
                    WHERE a.value = ?2 \
                )) \
         LIMIT 1";

    let conn = Connection::open(db_path).ok()?;
    let found = conn.query_row(LOOKUP_SQL, params![ext_type, ext_value], |row| {
        row.get::<_, String>(0)
    });
    found.ok()
}

/// Resolve a bare ROR ID (no `https://ror.org/` prefix) from an ORCID
/// disambiguation entry.
///
/// * Source `ROR` (any case): the identifier itself is used, with the
///   `https://ror.org/` or `http://ror.org/` prefix removed.
/// * Source GRID / ISNI / FundRef / Wikidata (any case): the identifier is
///   looked up in the `organizations` SQLite at `db_path`. For FundRef, when
///   the verbatim identifier does not match, the last path segment is tried as
///   well, so funder-DOI URLs such as `http://dx.doi.org/10.13039/100000002`
///   resolve against the bare funder ID.
///
/// Returns `None` when the identifier is empty, the source is unknown (e.g.
/// RINGGOLD), no database path is supplied for a non-ROR source, no match is
/// found, or nothing is left after stripping the ROR prefix.
fn resolve_ror_id(d: &DisambiguatedOrgJson, db_path: Option<&Path>) -> Option<String> {
    /// Strip the ROR URL prefix (either scheme); `None` when nothing remains.
    fn bare_ror_id(value: &str) -> Option<String> {
        let bare = value
            .trim_start_matches("https://ror.org/")
            .trim_start_matches("http://ror.org/");
        if bare.is_empty() {
            None
        } else {
            Some(bare.to_string())
        }
    }

    if d.identifier.is_empty() {
        return None;
    }
    if d.source.eq_ignore_ascii_case("ROR") {
        return bare_ror_id(&d.identifier);
    }

    let db = db_path?;
    let ext_type = d.source.to_lowercase();
    if !matches!(ext_type.as_str(), "grid" | "isni" | "fundref" | "wikidata") {
        return None;
    }

    // Try the identifier verbatim first so existing matches are unaffected.
    let found = ror_id_from_sqlite(&ext_type, &d.identifier, db).or_else(|| {
        if ext_type != "fundref" {
            return None;
        }
        // Funder identifiers may be given as a DOI URL; retry with the bare ID.
        let funder_id = d.identifier.trim_end_matches('/').rsplit('/').next()?;
        if funder_id.is_empty() || funder_id == d.identifier {
            return None;
        }
        ror_id_from_sqlite(&ext_type, funder_id, db)
    });
    found.as_deref().and_then(bare_ror_id)
}

/// Fetch employment records from the ORCID public API and return them as a list
/// of affiliations sorted by start date ascending (entries without a start date
/// come last; ties keep response order).
///
/// `orcid_url` may be a bare ORCID iD or a full ORCID URL with either the
/// `https://orcid.org/` or `http://orcid.org/` prefix; surrounding whitespace
/// and a trailing `/` are ignored. When `db_path` points to the local
/// `organizations` SQLite, non-ROR identifiers (GRID, ISNI, FundRef, Wikidata)
/// are resolved to ROR IDs. Employments with an empty organization name are
/// skipped.
///
/// # Errors
///
/// Returns `Error::Http` when the HTTP client cannot be built, the request
/// fails, or the API answers with an error status, and `Error::Parse` when the
/// response body cannot be decoded.
pub fn fetch_person_employments(orcid_url: &str, db_path: Option<&Path>) -> Result<Vec<PersonAffiliation>> {
    // Reduce the input to the bare iD so the API path is well-formed for every
    // accepted spelling.
    let orcid = orcid_url
        .trim()
        .trim_start_matches("https://orcid.org/")
        .trim_start_matches("http://orcid.org/")
        .trim_end_matches('/');
    let api_url = format!("https://pub.orcid.org/v3.0/{}/employments", orcid);
    let client = reqwest::blocking::Client::builder()
        .user_agent(crate::io_utils::commonmeta_user_agent())
        .timeout(std::time::Duration::from_secs(30))
        .build()
        .map_err(|e| Error::Http(e.to_string()))?;
    let resp: EmploymentsResponse = client
        .get(&api_url)
        .header("Accept", "application/vnd.orcid+json")
        .send()
        .map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
        .error_for_status()
        .map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
        .json()
        .map_err(|e| Error::Parse(e.to_string()))?;

    let mut affiliations: Vec<PersonAffiliation> = resp
        .affiliation_groups
        .iter()
        .flat_map(|g| g.summaries.iter())
        .filter_map(|w| w.employment_summary.as_ref())
        .filter(|emp| !emp.organization.name.is_empty())
        .map(|emp| {
            let ror_id = emp
                .organization
                .disambiguated
                .as_ref()
                .and_then(|d| resolve_ror_id(d, db_path));
            PersonAffiliation {
                ror_id,
                name: emp.organization.name.clone(),
                role: if emp.role_title.is_empty() { None } else { Some(emp.role_title.clone()) },
                start_date: emp.start_date.as_ref().and_then(orcid_date_to_iso),
                end_date: emp.end_date.as_ref().and_then(orcid_date_to_iso),
            }
        })
        .collect();

    // Dated entries first (lexicographic order of the ISO-style strings),
    // undated ones at the end.
    affiliations.sort_by(|a, b| match (&a.start_date, &b.start_date) {
        (Some(da), Some(db)) => da.cmp(db),
        (Some(_), None) => std::cmp::Ordering::Less,
        (None, Some(_)) => std::cmp::Ordering::Greater,
        (None, None) => std::cmp::Ordering::Equal,
    });
    Ok(affiliations)
}

// ── Works / publications ──────────────────────────────────────────────────────

/// Response body deserialized by `fetch_orcid_work_dois`.
///
/// A missing `group` key deserializes to an empty list (`#[serde(default)]`).
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorksResponse {
    /// Work groups, in response order.
    group: Vec<WorkGroup>,
}

/// One entry of the `group` list of a works response.
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorkGroup {
    /// Summaries listed under `work-summary`; `fetch_orcid_work_dois` reads
    /// only the first one.
    #[serde(rename = "work-summary")]
    work_summaries: Vec<WorkSummaryJson>,
}

/// One entry of a work group's `work-summary` list; only the external
/// identifiers are retained.
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorkSummaryJson {
    /// Identifiers from `external-ids`; empty when the key is missing.
    // NOTE(review): a missing key is covered by `#[serde(default)]`, but an
    // explicit `null` value would be a deserialization error for this
    // non-`Option` field. Confirm the API never sends `"external-ids": null`.
    #[serde(rename = "external-ids")]
    external_ids: WorkExternalIds,
}

/// Wrapper object around the `external-id` list of a work summary.
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorkExternalIds {
    /// Individual identifiers, in response order; empty when the key is missing.
    #[serde(rename = "external-id")]
    external_id: Vec<WorkExternalId>,
}

/// A single external identifier of a work.
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorkExternalId {
    /// Identifier scheme from `external-id-type`; `fetch_orcid_work_dois`
    /// selects entries whose type is exactly `"doi"`.
    #[serde(rename = "external-id-type")]
    id_type: String,
    /// Identifier value from `external-id-value`.
    #[serde(rename = "external-id-value")]
    id_value: String,
}

/// Fetch the DOIs of all works listed on an ORCID profile.
///
/// Calls `GET https://pub.orcid.org/v3.0/{orcid}/works` and, for each work
/// group, takes the first `doi` identifier of the group's first summary. The
/// values are passed through `normalize_doi`; empty results are dropped and
/// duplicates removed, keeping the first occurrence in response order.
///
/// `orcid_url` may be a bare ORCID iD or a full ORCID URL with either the
/// `https://orcid.org/` or `http://orcid.org/` prefix; surrounding whitespace
/// and a trailing `/` are ignored.
///
/// # Errors
///
/// Returns `Error::Http` when the HTTP client cannot be built, the request
/// fails, or the API answers with an error status, and `Error::Parse` when the
/// response body cannot be decoded.
pub fn fetch_orcid_work_dois(orcid_url: &str) -> Result<Vec<String>> {
    use std::collections::HashSet;

    // Reduce the input to the bare iD so the API path is well-formed for every
    // accepted spelling.
    let orcid = orcid_url
        .trim()
        .trim_start_matches("https://orcid.org/")
        .trim_start_matches("http://orcid.org/")
        .trim_end_matches('/');
    let api_url = format!("https://pub.orcid.org/v3.0/{}/works", orcid);
    let client = reqwest::blocking::Client::builder()
        .user_agent(crate::io_utils::commonmeta_user_agent())
        .timeout(std::time::Duration::from_secs(30))
        .build()
        .map_err(|e| Error::Http(e.to_string()))?;
    let resp: WorksResponse = client
        .get(&api_url)
        .header("Accept", "application/vnd.orcid+json")
        .send()
        .map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
        .error_for_status()
        .map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
        .json()
        .map_err(|e| Error::Parse(e.to_string()))?;

    // Each group is one logical work; take the first summary's identifiers.
    let candidates = resp
        .group
        .iter()
        .filter_map(|group| group.work_summaries.first())
        .filter_map(|summary| {
            summary
                .external_ids
                .external_id
                .iter()
                .find(|id| id.id_type == "doi")
        })
        .map(|id| crate::doi_utils::normalize_doi(&id.id_value))
        .filter(|doi| !doi.is_empty());

    // Order-preserving dedup; the set keeps membership checks constant-time
    // for profiles with many works.
    let mut seen: HashSet<String> = HashSet::new();
    let mut dois: Vec<String> = Vec::new();
    for doi in candidates {
        if seen.insert(doi.clone()) {
            dois.push(doi);
        }
    }
    Ok(dois)
}

// ── InvenioRDM YAML output ────────────────────────────────────────────────────

/// Double-quote a YAML scalar.
///
/// Backslashes and double quotes are backslash-escaped, newlines become `\n`
/// and tabs `\t`. Carriage returns are dropped (so CRLF collapses to `\n`).
/// Any other control character is written as a `\xNN` escape, and the Unicode
/// line/paragraph separators as `\u2028` / `\u2029`, so that the result stays
/// a valid single-line double-quoted scalar.
fn yaml_dq(s: &str) -> String {
    let mut out = String::with_capacity(s.len() + 2);
    out.push('"');
    for c in s.chars() {
        match c {
            '\\' => out.push_str("\\\\"),
            '"' => out.push_str("\\\""),
            '\n' => out.push_str("\\n"),
            // Deliberately dropped rather than escaped.
            '\r' => {}
            '\t' => out.push_str("\\t"),
            // Remaining control characters all fit in two hex digits.
            c if c.is_control() => out.push_str(&format!("\\x{:02X}", c as u32)),
            '\u{2028}' | '\u{2029}' => out.push_str(&format!("\\u{:04X}", c as u32)),
            c => out.push(c),
        }
    }
    out.push('"');
    out
}

/// Return `s` as a plain YAML scalar when safe; otherwise double-quote it.
///
/// A value is quoted when a YAML loader could read the plain form as anything
/// other than the same string:
///
/// * it is empty, or starts with a YAML indicator character;
/// * it starts with a digit, `+` or `.` (could be read as a number);
/// * it is, ignoring ASCII case, one of `true`, `false`, `null`, `~`, `yes`,
///   `no`, `on`, `off`, `y`, `n` (booleans / null in YAML 1.1 or 1.2);
/// * it has leading or trailing whitespace (a plain scalar would be trimmed);
/// * it ends with `:` or contains `": "` (mapping syntax) or `" #"` (comment);
/// * it contains a newline, tab or any other control character.
fn yaml_scalar(s: &str) -> String {
    const INDICATORS: &str = ":-?|>!'\"#&*{}[],%@`";

    let first = match s.chars().next() {
        Some(c) => c,
        None => return yaml_dq(s),
    };

    let lowered = s.to_ascii_lowercase();
    let reserved_word = matches!(
        lowered.as_str(),
        "true" | "false" | "null" | "~" | "yes" | "no" | "on" | "off" | "y" | "n"
    );
    // Conservative: quoting a non-number that merely starts like one is harmless.
    let numeric_like = first.is_ascii_digit() || first == '+' || first == '.';

    let needs_q = INDICATORS.contains(first)
        || numeric_like
        || reserved_word
        || s != s.trim()
        || s.ends_with(':')
        || s.contains(": ")
        || s.contains(" #")
        || s.chars().any(|c| c.is_control());
    if needs_q { yaml_dq(s) } else { s.to_string() }
}

/// Render a person as a single-entry InvenioRDM names YAML list.
///
/// `person_json` must be the ORCID 3.0 `/person` endpoint response;
/// `affiliations` is the employment list from [`fetch_person_employments`]
/// (may be empty).
///
/// The entry contains `id`, `name` (credit name, else "given family"),
/// `given_name`, `family_name` and one ORCID URL under `identifiers`. An
/// `affiliations` list is added when `affiliations` is non-empty, and a
/// `description` when the biography is non-empty. Missing JSON values are
/// treated as empty strings.
pub fn write_inveniordm_person_yaml(
    person_json: &serde_json::Value,
    affiliations: &[PersonAffiliation],
) -> Result<Vec<u8>> {
    // Reads `name.<key>.value`, yielding "" when any level is missing.
    let name_part = |key: &str| -> String {
        person_json
            .get("name")
            .and_then(|name| name.get(key))
            .and_then(|part| part.get("value"))
            .and_then(|value| value.as_str())
            .unwrap_or("")
            .to_string()
    };
    let given_name = name_part("given-names");
    let family_name = name_part("family-name");
    let credit_name = name_part("credit-name");
    let display_name = if credit_name.is_empty() {
        format!("{} {}", given_name, family_name).trim().to_string()
    } else {
        credit_name
    };

    // The iD is the first segment of `path`, ignoring leading slashes.
    let orcid_id = person_json
        .get("path")
        .and_then(|p| p.as_str())
        .unwrap_or("")
        .trim_start_matches('/')
        .split('/')
        .next()
        .unwrap_or("");

    let biography = person_json
        .get("biography")
        .and_then(|b| b.get("content"))
        .and_then(|c| c.as_str())
        .unwrap_or("");

    let mut yaml = String::new();
    yaml += &format!("- id: {}\n", orcid_id);
    yaml += &format!("  name: {}\n", yaml_scalar(&display_name));
    yaml += &format!("  given_name: {}\n", yaml_scalar(&given_name));
    yaml += &format!("  family_name: {}\n", yaml_scalar(&family_name));
    yaml += "  identifiers:\n";
    yaml += &format!("    - identifier: https://orcid.org/{}\n", orcid_id);

    if !affiliations.is_empty() {
        yaml += "  affiliations:\n";
    }
    for aff in affiliations {
        // The list marker goes on `id` when a ROR ID exists, else on `name`.
        match aff.ror_id.as_ref() {
            Some(ror_id) => {
                yaml += &format!("    - id: {}\n", ror_id);
                yaml += &format!("      name: {}\n", yaml_scalar(&aff.name));
            }
            None => {
                yaml += &format!("    - name: {}\n", yaml_scalar(&aff.name));
            }
        }
        if let Some(start) = aff.start_date.as_ref() {
            yaml += &format!("      start_date: '{}'\n", start);
        }
        if let Some(end) = aff.end_date.as_ref() {
            yaml += &format!("      end_date: '{}'\n", end);
        }
    }

    if !biography.is_empty() {
        yaml += &format!("  description: {}\n", yaml_dq(biography));
    }

    Ok(yaml.into_bytes())
}

// ── JSON output ───────────────────────────────────────────────────────────────

/// Serialize an ORCID 3.0 person JSON value to bytes.
///
/// The value should conform to `orcid_schema_v3.0.json` — i.e., the shape
/// returned by [`fetch_person_json`] or [`fetch_person_json_sqlite`].
pub fn write_orcid_json(value: &serde_json::Value) -> Result<Vec<u8>> {
    serde_json::to_vec(value).map_err(|e| Error::Serialize(e.to_string()))
}

/// Convert ORCID 3.0 person JSON + resolved affiliations to a commonmeta person entity.
///
/// Keys are only emitted when a non-empty value is available:
///
/// * `id` — `https://orcid.org/<iD>`, the iD being the first segment of `path`;
/// * `given_name`, `family_name`;
/// * `name` — the credit name, falling back to "given family" (trimmed);
/// * `description` — the biography content;
/// * `additional_names` — from `other-names`;
/// * `affiliations` — one object per entry of `affiliations`, with the ROR ID
///   expanded to a `https://ror.org/` URL;
/// * `identifiers` — from `external-identifiers`, types mapped by
///   `map_orcid_ext_id_type`;
/// * `urls` — from `researcher-urls`;
/// * `country` — from the first address.
///
/// `asserted_by` is always set to `"Author"`.
pub fn orcid_person_to_commonmeta(
    person_json: &serde_json::Value,
    affiliations: &[PersonAffiliation],
) -> serde_json::Value {
    let mut obj = serde_json::Map::new();

    // id — path may be "/0000-0003-1419-2405/person" (API) or "0000-0003-1419-2405" (SQLite XML)
    if let Some(path) = person_json.get("path").and_then(|v| v.as_str()) {
        let orcid_id = path.trim_start_matches('/').split('/').next().unwrap_or("");
        if !orcid_id.is_empty() {
            obj.insert(
                "id".to_string(),
                serde_json::Value::String(format!("https://orcid.org/{orcid_id}")),
            );
        }
    }

    // given_name / family_name / name (display_name: credit-name, else given + family)
    if let Some(name_obj) = person_json.get("name") {
        let given = name_obj
            .get("given-names")
            .and_then(|n| n.get("value"))
            .and_then(|v| v.as_str())
            .unwrap_or("");
        let family = name_obj
            .get("family-name")
            .and_then(|n| n.get("value"))
            .and_then(|v| v.as_str())
            .unwrap_or("");
        let credit = name_obj
            .get("credit-name")
            .and_then(|n| n.get("value"))
            .and_then(|v| v.as_str())
            .unwrap_or("");
        if !given.is_empty() {
            obj.insert("given_name".to_string(), serde_json::Value::String(given.to_string()));
        }
        if !family.is_empty() {
            obj.insert("family_name".to_string(), serde_json::Value::String(family.to_string()));
        }
        // Same display-name rule as the InvenioRDM YAML writer: prefer the
        // credit name, otherwise join given and family name.
        let display = if !credit.is_empty() {
            credit.to_string()
        } else {
            format!("{} {}", given, family).trim().to_string()
        };
        if !display.is_empty() {
            obj.insert("name".to_string(), serde_json::Value::String(display));
        }
    }

    // description (biography)
    if let Some(bio) = person_json
        .get("biography")
        .and_then(|b| b.get("content"))
        .and_then(|v| v.as_str())
        .filter(|s| !s.is_empty())
    {
        obj.insert("description".to_string(), serde_json::Value::String(bio.to_string()));
    }

    // additional_names (other-names)
    if let Some(others) = person_json
        .get("other-names")
        .and_then(|o| o.get("other-name"))
        .and_then(|v| v.as_array())
    {
        let names: Vec<serde_json::Value> = others
            .iter()
            .filter_map(|n| n.get("content").and_then(|v| v.as_str()))
            .filter(|s| !s.is_empty())
            .map(|s| serde_json::Value::String(s.to_string()))
            .collect();
        if !names.is_empty() {
            obj.insert("additional_names".to_string(), serde_json::Value::Array(names));
        }
    }

    // affiliations (from resolved employments)
    if !affiliations.is_empty() {
        let affs: Vec<serde_json::Value> = affiliations
            .iter()
            .map(|a| {
                let mut aff = serde_json::Map::new();
                if let Some(ref ror) = a.ror_id {
                    aff.insert(
                        "id".to_string(),
                        serde_json::Value::String(format!("https://ror.org/{ror}")),
                    );
                }
                aff.insert("name".to_string(), serde_json::Value::String(a.name.clone()));
                if let Some(ref role) = a.role {
                    aff.insert("role".to_string(), serde_json::Value::String(role.clone()));
                }
                if let Some(ref date) = a.start_date {
                    aff.insert("start_date".to_string(), serde_json::Value::String(date.clone()));
                }
                if let Some(ref date) = a.end_date {
                    aff.insert("end_date".to_string(), serde_json::Value::String(date.clone()));
                }
                serde_json::Value::Object(aff)
            })
            .collect();
        obj.insert("affiliations".to_string(), serde_json::Value::Array(affs));
    }

    // identifiers (external-identifiers); entries lacking a string type or a
    // non-empty string value are skipped
    if let Some(ext_ids) = person_json
        .get("external-identifiers")
        .and_then(|e| e.get("external-identifier"))
        .and_then(|v| v.as_array())
    {
        let ids: Vec<serde_json::Value> = ext_ids
            .iter()
            .filter_map(|e| {
                let id_type = e.get("external-id-type")?.as_str()?;
                let id_value = e.get("external-id-value")?.as_str()?;
                if id_value.is_empty() {
                    return None;
                }
                let (cm_type, scheme) = map_orcid_ext_id_type(id_type);
                let mut id_obj = serde_json::Map::new();
                id_obj.insert("identifier".to_string(), serde_json::Value::String(id_value.to_string()));
                id_obj.insert("identifier_type".to_string(), serde_json::Value::String(cm_type.to_string()));
                if let Some(s) = scheme {
                    id_obj.insert("scheme".to_string(), serde_json::Value::String(s.to_string()));
                }
                Some(serde_json::Value::Object(id_obj))
            })
            .collect();
        if !ids.is_empty() {
            obj.insert("identifiers".to_string(), serde_json::Value::Array(ids));
        }
    }

    // urls (researcher-urls); entries without a non-empty `url.value` are skipped
    if let Some(ru) = person_json
        .get("researcher-urls")
        .and_then(|r| r.get("researcher-url"))
        .and_then(|v| v.as_array())
    {
        let urls: Vec<serde_json::Value> = ru
            .iter()
            .filter_map(|r| {
                let url = r.get("url")?.get("value")?.as_str()?;
                if url.is_empty() {
                    return None;
                }
                let mut url_obj = serde_json::Map::new();
                if let Some(name) = r.get("url-name").and_then(|v| v.as_str()).filter(|s| !s.is_empty()) {
                    url_obj.insert("name".to_string(), serde_json::Value::String(name.to_string()));
                }
                url_obj.insert("url".to_string(), serde_json::Value::String(url.to_string()));
                Some(serde_json::Value::Object(url_obj))
            })
            .collect();
        if !urls.is_empty() {
            obj.insert("urls".to_string(), serde_json::Value::Array(urls));
        }
    }

    // country (first address)
    if let Some(country) = person_json
        .get("addresses")
        .and_then(|a| a.get("address"))
        .and_then(|v| v.as_array())
        .and_then(|arr| arr.first())
        .and_then(|a| a.get("country"))
        .and_then(|c| c.get("value"))
        .and_then(|v| v.as_str())
        .filter(|s| !s.is_empty())
    {
        obj.insert("country".to_string(), serde_json::Value::String(country.to_string()));
    }

    obj.insert("asserted_by".to_string(), serde_json::Value::String("Author".to_string()));

    serde_json::Value::Object(obj)
}

/// Map an ORCID `external-id-type` to a `(identifier_type, scheme)` pair.
///
/// Recognised types (matched case-sensitively) map to their commonmeta name
/// with no scheme; anything else maps to `("Other", Some("Other"))`.
fn map_orcid_ext_id_type(id_type: &str) -> (&'static str, Option<&'static str>) {
    // (ORCID type, commonmeta identifier type)
    const KNOWN: [(&str, &str); 5] = [
        ("ResearcherID", "ResearcherID"),
        ("rid", "ResearcherID"),
        ("Scopus Author ID", "ScopusID"),
        ("ISNI", "ISNI"),
        ("Wikidata", "Wikidata"),
    ];

    KNOWN
        .iter()
        .find(|(orcid_type, _)| *orcid_type == id_type)
        .map_or(("Other", Some("Other")), |&(_, cm_type)| (cm_type, None))
}

/// Serialize a commonmeta person entity optionally followed by works as a JSON
/// array validated against the commonmeta v1.0 schema.
pub fn write_commonmeta_person(
    person_json: &serde_json::Value,
    affiliations: &[PersonAffiliation],
    works: &[Data],
) -> Result<Vec<u8>> {
    let entity = orcid_person_to_commonmeta(person_json, affiliations);
    let mut items = vec![entity];
    for work in works {
        let prepared = crate::formats::commonmeta::prepare(work);
        let v = serde_json::to_value(&prepared).map_err(|e| Error::Serialize(e.to_string()))?;
        items.push(v);
    }
    let arr = serde_json::Value::Array(items);
    let bytes = serde_json::to_vec_pretty(&arr).map_err(|e| Error::Serialize(e.to_string()))?;
    crate::schema_utils::json_schema_errors(&bytes, Some("commonmeta"))?;
    Ok(bytes)
}

// ── Tests ─────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    const MINIMAL_RECORD: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<record:record xmlns:common="http://www.orcid.org/ns/common"
               xmlns:history="http://www.orcid.org/ns/history"
               xmlns:person="http://www.orcid.org/ns/person"
               xmlns:personal-details="http://www.orcid.org/ns/personal-details"
               xmlns:address="http://www.orcid.org/ns/address"
               xmlns:keyword="http://www.orcid.org/ns/keyword"
               xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
               xmlns:record="http://www.orcid.org/ns/record"
               path="/0009-0007-0779-1000">
    <common:orcid-identifier>
        <common:uri>https://orcid.org/0009-0007-0779-1000</common:uri>
        <common:path>0009-0007-0779-1000</common:path>
        <common:host>orcid.org</common:host>
    </common:orcid-identifier>
    <history:history>
        <common:last-modified-date>2023-08-20T05:55:33.757Z</common:last-modified-date>
    </history:history>
    <person:person path="/0009-0007-0779-1000/person">
        <person:name visibility="public" path="0009-0007-0779-1000">
            <personal-details:given-names>Yumi</personal-details:given-names>
            <personal-details:family-name>Shin</personal-details:family-name>
        </person:name>
        <address:addresses path="/0009-0007-0779-1000/address"/>
        <keyword:keywords path="/0009-0007-0779-1000/keywords"/>
        <external-identifier:external-identifiers path="/0009-0007-0779-1000/external-identifiers"/>
    </person:person>
</record:record>"#;

    const RICH_RECORD: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<record:record xmlns:common="http://www.orcid.org/ns/common"
               xmlns:history="http://www.orcid.org/ns/history"
               xmlns:person="http://www.orcid.org/ns/person"
               xmlns:personal-details="http://www.orcid.org/ns/personal-details"
               xmlns:address="http://www.orcid.org/ns/address"
               xmlns:keyword="http://www.orcid.org/ns/keyword"
               xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
               xmlns:record="http://www.orcid.org/ns/record"
               path="/0000-0001-8188-0000">
    <common:orcid-identifier>
        <common:uri>https://orcid.org/0000-0001-8188-0000</common:uri>
        <common:path>0000-0001-8188-0000</common:path>
        <common:host>orcid.org</common:host>
    </common:orcid-identifier>
    <history:history>
        <common:last-modified-date>2022-05-26T07:06:31.543Z</common:last-modified-date>
    </history:history>
    <person:person path="/0000-0001-8188-0000/person">
        <person:name visibility="public" path="0000-0001-8188-0000">
            <personal-details:given-names>Ana</personal-details:given-names>
            <personal-details:family-name>Souza</personal-details:family-name>
        </person:name>
        <address:addresses path="/0000-0001-8188-0000/address">
            <address:address put-code="12345" visibility="public" path="/0000-0001-8188-0000/address/12345">
                <address:country>BR</address:country>
            </address:address>
        </address:addresses>
        <keyword:keywords path="/0000-0001-8188-0000/keywords">
            <keyword:keyword put-code="111" visibility="public">
                <keyword:content>bioinformatics</keyword:content>
            </keyword:keyword>
            <keyword:keyword put-code="222" visibility="public">
                <keyword:content>genomics</keyword:content>
            </keyword:keyword>
        </keyword:keywords>
        <external-identifier:external-identifiers path="/0000-0001-8188-0000/external-identifiers">
            <external-identifier:external-identifier put-code="848742" visibility="public">
                <common:external-id-type>ResearcherID</common:external-id-type>
                <common:external-id-value>D-1073-2012</common:external-id-value>
                <common:external-id-url>http://www.researcherid.com/rid/D-1073-2012</common:external-id-url>
            </external-identifier:external-identifier>
            <external-identifier:external-identifier put-code="848755" visibility="public">
                <common:external-id-type>Scopus Author ID</common:external-id-type>
                <common:external-id-value>36852681700</common:external-id-value>
            </external-identifier:external-identifier>
        </external-identifier:external-identifiers>
    </person:person>
</record:record>"#;

    const ERROR_RECORD: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<error:error xmlns:error="http://www.orcid.org/ns/error">
    <error:response-code>409</error:response-code>
    <error:developer-message>409 Conflict: The ORCID record is locked.</error:developer-message>
    <error:user-message>The ORCID record is locked.</error:user-message>
    <error:error-code>9018</error:error-code>
</error:error>"#;

    // ── helpers ───────────────────────────────────────────────────────────────

    /// Build an in-memory tar.gz containing a single XML entry at `path`.
    fn make_tar_gz(path: &str, xml: &str) -> Vec<u8> {
        use flate2::write::GzEncoder;
        use flate2::Compression;

        let mut gz_data: Vec<u8> = Vec::new();
        let enc = GzEncoder::new(&mut gz_data, Compression::default());
        let mut builder = tar::Builder::new(enc);
        let content = xml.as_bytes();
        let mut header = tar::Header::new_gnu();
        header.set_size(content.len() as u64);
        header.set_mode(0o644);
        header.set_mtime(0);
        header.set_cksum();
        builder.append_data(&mut header, path, content).unwrap();
        let enc = builder.into_inner().unwrap();
        enc.finish().unwrap();
        gz_data
    }

    /// Create a minimal cache SQLite (pid_records schema) with one ORCID row.
    fn make_cache_db(dir: &std::path::Path, pid: &str, json: &str) -> std::path::PathBuf {
        let path = dir.join("cache.sqlite3");
        let conn = rusqlite::Connection::open(&path).unwrap();
        conn.execute_batch(
            "CREATE TABLE pid_records (
                pid TEXT PRIMARY KEY, source_id INTEGER NOT NULL,
                resource_url TEXT NOT NULL DEFAULT '',
                raw_metadata TEXT NOT NULL, raw_metadata_type TEXT NOT NULL
            );",
        )
        .unwrap();
        conn.execute(
            "INSERT INTO pid_records (pid, source_id, raw_metadata, raw_metadata_type)
             VALUES (?1, 4, ?2, 'orcid')",
            rusqlite::params![pid, json],
        )
        .unwrap();
        path
    }

    // ── XML fixture data ──────────────────────────────────────────────────────

    #[test]
    fn test_parse_minimal_record() {
        let record = parse_orcid_xml(MINIMAL_RECORD.as_bytes()).unwrap();
        assert_eq!(record.orcid_identifier.uri, "https://orcid.org/0009-0007-0779-1000");
        assert_eq!(record.orcid_identifier.path, "0009-0007-0779-1000");
        assert_eq!(record.history.last_modified_date, "2023-08-20T05:55:33.757Z");
        assert_eq!(record.person.name.given_names, "Yumi");
        assert_eq!(record.person.name.family_name, "Shin");
        assert!(record.person.name.credit_name.is_empty());
        assert!(record.person.addresses.addresses.is_empty());
        assert!(record.person.external_identifiers.identifiers.is_empty());
    }

    #[test]
    fn test_parse_rich_record() {
        let record = parse_orcid_xml(RICH_RECORD.as_bytes()).unwrap();
        assert_eq!(record.orcid_identifier.uri, "https://orcid.org/0000-0001-8188-0000");
        assert_eq!(record.person.name.given_names, "Ana");
        assert_eq!(record.person.name.family_name, "Souza");
        assert_eq!(record.person.addresses.addresses.len(), 1);
        assert_eq!(record.person.addresses.addresses[0].country, "BR");
        assert_eq!(record.person.external_identifiers.identifiers.len(), 2);
        let ext = &record.person.external_identifiers.identifiers[0];
        assert_eq!(ext.type_, "ResearcherID");
        assert_eq!(ext.value, "D-1073-2012");
    }

    #[test]
    fn test_error_record_returns_none() {
        assert!(parse_orcid_xml(ERROR_RECORD.as_bytes()).is_none());
    }

    #[test]
    fn test_display_name_given_family() {
        let record = parse_orcid_xml(MINIMAL_RECORD.as_bytes()).unwrap();
        assert_eq!(display_name(&record), "Yumi Shin");
    }

    #[test]
    fn test_display_name_family_only() {
        let xml = MINIMAL_RECORD.replace(
            "<personal-details:given-names>Yumi</personal-details:given-names>",
            "",
        );
        let record = parse_orcid_xml(xml.as_bytes()).unwrap();
        assert_eq!(display_name(&record), "Shin");
    }

    #[test]
    fn test_from_orcid_data() {
        let record = parse_orcid_xml(RICH_RECORD.as_bytes()).unwrap();
        let data = from_orcid(record);
        assert_eq!(data.id, "https://orcid.org/0000-0001-8188-0000");
        assert_eq!(data.type_, "Person");
        assert_eq!(data.title, "Ana Souza");
        assert_eq!(data.provider, "ORCID");
        assert_eq!(data.identifiers.len(), 2);
        assert!(data.identifiers.iter().any(|i| i.identifier_type == "ResearcherID"));
    }

    #[test]
    fn test_serialize_to_people_row() {
        let record = parse_orcid_xml(RICH_RECORD.as_bytes()).unwrap();
        let row = serialize_to_people_row(&record, RICH_RECORD.as_bytes());
        assert_eq!(row.id, "https://orcid.org/0000-0001-8188-0000");
        assert_eq!(row.orcid, "0000-0001-8188-0000");
        assert_eq!(row.given_name, "Ana");
        assert_eq!(row.family_name, "Souza");
        assert_eq!(row.name, "Ana Souza");
        assert_eq!(row.country, "BR");
        let ext: Vec<serde_json::Value> =
            serde_json::from_str(&row.external_identifiers).unwrap();
        assert_eq!(ext.len(), 2);
        assert!(!row.metadata.is_empty());
    }

    #[test]
    fn test_parse_filename_date() {
        assert_eq!(parse_filename_date("ORCID_2025_10_summaries.tar.gz"), (2025, 10));
        assert_eq!(parse_filename_date("bad_name.tar.gz"), (0, 0));
    }

    #[test]
    fn test_person_validates_against_orcid_schema() {
        let bytes = include_bytes!("../../tests/fixtures/orcid/person_0000-0003-1419-2405.json");
        let result = crate::schema_utils::json_schema_errors(bytes, Some("orcid"));
        assert!(result.is_ok(), "ORCID person fixture should validate: {result:?}");
    }

    // ── bulk import (tar.gz → SQLite) ─────────────────────────────────────────

    #[test]
    fn test_stream_summaries_to_sqlite_inserts_record() {
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("people.sqlite3");
        let tar_gz = make_tar_gz("000/0009-0007-0779-1000.xml", MINIMAL_RECORD);
        let tar_gz_path = dir.path().join("summaries.tar.gz");
        std::fs::write(&tar_gz_path, &tar_gz).unwrap();

        let count = stream_summaries_to_sqlite(&tar_gz_path, &db_path, 0).unwrap();
        assert_eq!(count, 1);

        let conn = rusqlite::Connection::open(&db_path).unwrap();
        let name: String = conn
            .query_row(
                "SELECT name FROM people WHERE id = ?1",
                rusqlite::params!["https://orcid.org/0009-0007-0779-1000"],
                |r| r.get(0),
            )
            .unwrap();
        assert_eq!(name, "Yumi Shin");
    }

    #[test]
    fn test_stream_summaries_skips_error_records() {
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("people.sqlite3");
        let tar_gz = make_tar_gz("000/locked.xml", ERROR_RECORD);
        let tar_gz_path = dir.path().join("summaries.tar.gz");
        std::fs::write(&tar_gz_path, &tar_gz).unwrap();

        let count = stream_summaries_to_sqlite(&tar_gz_path, &db_path, 0).unwrap();
        assert_eq!(count, 0);
    }

    #[test]
    fn test_stream_summaries_people_table_has_no_keywords_column() {
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("people.sqlite3");
        let tar_gz = make_tar_gz("000/0009-0007-0779-1000.xml", MINIMAL_RECORD);
        std::fs::write(dir.path().join("summaries.tar.gz"), &tar_gz).unwrap();
        stream_summaries_to_sqlite(&dir.path().join("summaries.tar.gz"), &db_path, 0).unwrap();

        let conn = rusqlite::Connection::open(&db_path).unwrap();
        let cols: Vec<String> = conn
            .prepare("PRAGMA table_info(people)")
            .unwrap()
            .query_map([], |r| r.get::<_, String>(1))
            .unwrap()
            .filter_map(|r| r.ok())
            .collect();
        assert!(!cols.iter().any(|c| c == "keywords"), "keywords column must not exist");
    }

    // ── SQLite lookup ─────────────────────────────────────────────────────────

    /// After a bulk import of `RICH_RECORD`, `fetch_sqlite` called with the bare
    /// ORCID iD returns the person with the expected names, country, provider
    /// and identifiers.
    #[test]
    fn test_fetch_sqlite_after_bulk_import() {
        let tmp = tempfile::tempdir().unwrap();
        let people_db = tmp.path().join("people.sqlite3");
        let archive_path = tmp.path().join("summaries.tar.gz");

        // Build the one-record archive and import it.
        let archive_bytes = make_tar_gz("000/0000-0001-8188-0000.xml", RICH_RECORD);
        std::fs::write(&archive_path, &archive_bytes).unwrap();
        stream_summaries_to_sqlite(&archive_path, &people_db, 0).unwrap();

        let person = fetch_sqlite("0000-0001-8188-0000", &people_db).unwrap();

        // Scalar fields.
        assert_eq!(person.id, "https://orcid.org/0000-0001-8188-0000");
        assert_eq!(person.type_, "Person");
        assert_eq!(person.given_name, "Ana");
        assert_eq!(person.family_name, "Souza");
        assert_eq!(person.country, "BR");
        assert_eq!(person.provider, "ORCID");

        // Identifiers: two in total, one of which is a ResearcherID.
        assert_eq!(person.identifiers.len(), 2);
        let has_researcher_id = person
            .identifiers
            .iter()
            .any(|ident| ident.identifier_type == "ResearcherID");
        assert!(has_researcher_id);
    }

    /// After a bulk import of `RICH_RECORD`, `fetch_person_json_sqlite` returns
    /// JSON that exposes the given and family names at the expected pointer
    /// paths; a keywords array, if present, must be empty.
    #[test]
    fn test_fetch_person_json_sqlite_reconstructs_json_from_xml_blob() {
        let tmp = tempfile::tempdir().unwrap();
        let people_db = tmp.path().join("people.sqlite3");
        let archive_path = tmp.path().join("summaries.tar.gz");

        let archive_bytes = make_tar_gz("000/0000-0001-8188-0000.xml", RICH_RECORD);
        std::fs::write(&archive_path, &archive_bytes).unwrap();
        stream_summaries_to_sqlite(&archive_path, &people_db, 0).unwrap();

        let json = fetch_person_json_sqlite("0000-0001-8188-0000", &people_db).unwrap();

        let given = json.pointer("/name/given-names/value").and_then(|v| v.as_str());
        let family = json.pointer("/name/family-name/value").and_then(|v| v.as_str());
        assert_eq!(given, Some("Ana"));
        assert_eq!(family, Some("Souza"));

        // keywords not stored; reconstructed blob has empty array (or no array at all)
        if let Some(keywords) = json.pointer("/keywords/keyword").and_then(|v| v.as_array()) {
            assert!(keywords.is_empty());
        }
    }

    // ── cache integration ─────────────────────────────────────────────────────

    // Minimal ORCID 3.0 person JSON as stored by dragoman (fetch_orcid_with_json).
    //
    // Shared fixture for the cache-integration and commonmeta tests in this module.
    // It describes one person (given name "Ana", family name "Souza") with a single
    // public address (country "BR") and a single `ResearcherID` external identifier.
    // `credit-name` and `biography` are null; the other-names, researcher-urls and
    // emails lists are empty.
    const PERSON_JSON: &str = r#"{
        "last-modified-date": {"value": 1748000000000},
        "name": {
            "given-names": {"value": "Ana"},
            "family-name": {"value": "Souza"},
            "credit-name": null,
            "visibility": "public",
            "path": "0000-0001-8188-0000"
        },
        "other-names": {"other-name": [], "path": "/0000-0001-8188-0000/other-names"},
        "biography": null,
        "researcher-urls": {"researcher-url": [], "path": "/0000-0001-8188-0000/researcher-urls"},
        "emails": {"email": [], "path": "/0000-0001-8188-0000/email"},
        "addresses": {
            "address": [{"country": {"value": "BR"}, "visibility": "public"}],
            "path": "/0000-0001-8188-0000/address"
        },
        "external-identifiers": {
            "external-identifier": [
                {"external-id-type": "ResearcherID", "external-id-value": "D-1073-2012",
                 "external-id-relationship": "self", "visibility": "public"}
            ],
            "path": "/0000-0001-8188-0000/external-identifiers"
        },
        "path": "/0000-0001-8188-0000/person"
    }"#;

    /// A cache database whose `pid_records` table exists but holds no rows must
    /// yield a count of zero.
    #[test]
    fn test_stream_cache_empty_cache_returns_zero() {
        let tmp = tempfile::tempdir().unwrap();
        let cache_db = tmp.path().join("cache.sqlite3");
        let people_db = tmp.path().join("people.sqlite3");

        // Create the schema only. The connection is a temporary, so it is closed
        // at the end of this statement, before the cache is read back.
        // The literal's continuation line keeps its original leading whitespace.
        rusqlite::Connection::open(&cache_db)
            .unwrap()
            .execute_batch(
                "CREATE TABLE pid_records (pid TEXT PRIMARY KEY, source_id INTEGER,
             resource_url TEXT, raw_metadata TEXT, raw_metadata_type TEXT);",
            )
            .unwrap();

        let imported = stream_cache_orcid_to_people_sqlite(&cache_db, &people_db).unwrap();
        assert_eq!(imported, 0);
    }

    /// A cache holding `PERSON_JSON` under an ORCID URL is imported as exactly one
    /// person, which `fetch_sqlite` then returns with the fixture's names,
    /// country and single `ResearcherID` identifier.
    #[test]
    fn test_stream_cache_orcid_json_into_people() {
        let orcid_url = "https://orcid.org/0000-0001-8188-0000";

        let tmp = tempfile::tempdir().unwrap();
        let people_db = tmp.path().join("people.sqlite3");
        let cache_db = make_cache_db(tmp.path(), orcid_url, PERSON_JSON);

        let imported = stream_cache_orcid_to_people_sqlite(&cache_db, &people_db).unwrap();
        assert_eq!(imported, 1);

        let person = fetch_sqlite("0000-0001-8188-0000", &people_db).unwrap();
        assert_eq!(person.id, orcid_url);
        assert_eq!(person.given_name, "Ana");
        assert_eq!(person.family_name, "Souza");
        assert_eq!(person.country, "BR");

        // The fixture carries exactly one external identifier.
        assert_eq!(person.identifiers.len(), 1);
        assert_eq!(person.identifiers[0].identifier_type, "ResearcherID");
    }

    /// After importing `PERSON_JSON` from the cache, `fetch_person_json_sqlite`
    /// returns JSON with the fixture's given and family names at their original
    /// pointer paths.
    #[test]
    fn test_stream_cache_person_json_sqlite_roundtrip() {
        let tmp = tempfile::tempdir().unwrap();
        let people_db = tmp.path().join("people.sqlite3");
        let cache_db = make_cache_db(
            tmp.path(),
            "https://orcid.org/0000-0001-8188-0000",
            PERSON_JSON,
        );
        stream_cache_orcid_to_people_sqlite(&cache_db, &people_db).unwrap();

        // fetch_person_json_sqlite returns the stored JSON blob without reparse
        let json = fetch_person_json_sqlite("0000-0001-8188-0000", &people_db).unwrap();

        let expected_names = [
            ("/name/given-names/value", "Ana"),
            ("/name/family-name/value", "Souza"),
        ];
        for (pointer, expected) in expected_names {
            assert_eq!(json.pointer(pointer).and_then(|v| v.as_str()), Some(expected));
        }
    }

    // ── schema migration ──────────────────────────────────────────────────────

    /// Migrating a database created with the old schema must drop the `keywords`
    /// column, leave `people_fts` defined without `keywords` (but still with
    /// `other_names`), and keep existing rows.
    ///
    /// The old layout is created by hand: a `keywords` column on `people` plus an
    /// FTS5 table over `name, keywords, other_names`. One row is inserted, then
    /// `ensure_people_schema` is run on a fresh connection. Query errors are
    /// surfaced rather than swallowed, so a missing table fails with the real
    /// cause instead of a misleading assertion on an empty value.
    #[test]
    fn test_migration_drops_keywords_column_and_rebuilds_fts() {
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("people_old.sqlite3");

        // Simulate a database created with the old schema (keywords column + FTS with keywords).
        {
            let conn = rusqlite::Connection::open(&db_path).unwrap();
            conn.execute_batch(
                "CREATE TABLE settings (key TEXT PRIMARY KEY, value TEXT DEFAULT '');
                 CREATE TABLE people (
                     id TEXT PRIMARY KEY, orcid TEXT DEFAULT '',
                     given_name TEXT DEFAULT '', family_name TEXT DEFAULT '',
                     credit_name TEXT DEFAULT '', name TEXT DEFAULT '',
                     country TEXT DEFAULT '', keywords TEXT DEFAULT '',
                     other_names TEXT DEFAULT '', external_identifiers TEXT DEFAULT '',
                     date_updated TEXT DEFAULT '', metadata BLOB DEFAULT x''
                 );
                 CREATE VIRTUAL TABLE people_fts USING fts5(
                     name, keywords, other_names,
                     content='people', content_rowid='rowid',
                     tokenize='unicode61 remove_diacritics 1'
                 );",
            )
            .unwrap();
            conn.execute(
                "INSERT INTO people (id, orcid, name, keywords, other_names)
                 VALUES (?1, ?2, ?3, ?4, ?5)",
                rusqlite::params![
                    "https://orcid.org/0000-0001-8188-0000",
                    "0000-0001-8188-0000",
                    "Ana Souza",
                    "bioinformatics",
                    ""
                ],
            )
            .unwrap();
        }

        // Run migration on its own connection, closed before verification starts.
        {
            let conn = rusqlite::Connection::open(&db_path).unwrap();
            ensure_people_schema(&conn).unwrap();
        }

        let conn = rusqlite::Connection::open(&db_path).unwrap();

        // keywords column must be gone. Row errors fail the test instead of being
        // filtered out, and the table must still report columns at all.
        let cols: Vec<String> = conn
            .prepare("PRAGMA table_info(people)")
            .unwrap()
            .query_map([], |r| r.get::<_, String>(1))
            .unwrap()
            .collect::<rusqlite::Result<Vec<String>>>()
            .unwrap();
        assert!(!cols.is_empty(), "people table must still exist after migration");
        assert!(!cols.iter().any(|c| c == "keywords"), "keywords column should be dropped");

        // FTS must not reference keywords. A missing `people_fts` entry is reported
        // directly rather than being turned into an empty string.
        let fts_sql: String = conn
            .query_row(
                "SELECT COALESCE(sql,'') FROM sqlite_master WHERE name='people_fts'",
                [],
                |r| r.get(0),
            )
            .expect("people_fts must be present in sqlite_master after migration");
        assert!(!fts_sql.contains("keywords"), "FTS should not reference keywords: {fts_sql}");
        assert!(fts_sql.contains("other_names"), "FTS should index other_names: {fts_sql}");

        // Data row is intact.
        let name: String = conn
            .query_row(
                "SELECT name FROM people WHERE id = ?1",
                rusqlite::params!["https://orcid.org/0000-0001-8188-0000"],
                |r| r.get(0),
            )
            .unwrap();
        assert_eq!(name, "Ana Souza");
    }

    // ── commonmeta output ─────────────────────────────────────────────────────

    /// `orcid_person_to_commonmeta` applied to `PERSON_JSON` (with an empty
    /// second argument) must emit the expected string fields and no `keywords`
    /// entry.
    #[test]
    fn test_orcid_person_to_commonmeta_fields() {
        let person: serde_json::Value = serde_json::from_str(PERSON_JSON).unwrap();
        let out = orcid_person_to_commonmeta(&person, &[]);

        // Each (key, expected string value) pair is checked in turn.
        let expected_fields = [
            ("id", "https://orcid.org/0000-0001-8188-0000"),
            ("given_name", "Ana"),
            ("family_name", "Souza"),
            ("country", "BR"),
            ("asserted_by", "Author"),
        ];
        for (key, expected) in expected_fields {
            assert_eq!(out.get(key).and_then(|v| v.as_str()), Some(expected));
        }

        assert!(out.get("keywords").is_none(), "keywords must not appear in commonmeta output");
    }

    /// Network smoke test for `fetch_orcid_release`.
    ///
    /// Marked `#[ignore]`, so it only runs when ignored tests are requested
    /// explicitly (e.g. `cargo test -- --ignored`).
    #[test]
    #[ignore = "network"]
    fn test_fetch_orcid_release() {
        let rel = fetch_orcid_release(1).unwrap();

        // Release identity.
        assert_eq!(rel.year, 2025);
        assert!(rel.batch > 0);

        // Archive location and size (more than 40 GB compressed).
        let is_summaries_archive = rel.filename.ends_with("_summaries.tar.gz");
        assert!(is_summaries_archive);
        let hosted_on_figshare = rel.download_url.contains("figshare.com");
        assert!(hosted_on_figshare);
        assert!(rel.size_bytes > 40_000_000_000);
    }
}