#![allow(dead_code)]
use std::io::Read;
use std::path::Path;
use serde::{Deserialize, Serialize};
use crate::data::{Data, Identifier, PersonUrl};
use crate::error::{Error, Result};
/// Figshare article id interpolated into the `api.figshare.com/v2/articles/{id}/versions` URLs
/// built by `fetch_orcid_release` and `fetch_latest_orcid_release`.
const ORCID_FIGSHARE_ARTICLE_ID: u64 = 30375589;
/// Description of one `*_summaries.tar.gz` file of a Figshare article version,
/// as produced by `fetch_orcid_release`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OrcidRelease {
/// Year parsed from the file name by `parse_filename_date` (0 when it cannot be parsed).
pub year: u16,
/// Batch number parsed from the file name by `parse_filename_date` (0 when it cannot be parsed).
pub batch: u16,
/// File name as reported by the Figshare API.
pub filename: String,
/// Download URL as reported by the Figshare API.
pub download_url: String,
/// File size as reported by the Figshare API (`size` field).
pub size_bytes: u64,
}
/// Subset of an ORCID record XML document that `parse_orcid_xml` deserializes.
///
/// `#[serde(default)]` lets elements that are missing from the document fall
/// back to their `Default` value instead of failing deserialization.
#[derive(Deserialize, Default)]
#[serde(default)]
struct OrcidXml {
/// The `orcid-identifier` element (URI and path of the ORCID iD).
#[serde(rename = "orcid-identifier")]
orcid_identifier: XmlOrcidId,
/// The `history` element; only the last-modified date is read.
history: XmlHistory,
/// The `person` element (names, addresses, keywords, external identifiers).
person: XmlPerson,
}
/// Contents of the `orcid-identifier` element.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlOrcidId {
/// Stored as the `id` column of the `people` table and as `Data::id`.
uri: String,
/// Stored as the `orcid` column of the `people` table.
path: String,
}
/// Contents of the `history` element that this module reads.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlHistory {
/// Text of `last-modified-date`; copied verbatim into `date_updated` and
/// parsed as RFC 3339 by `xml_record_to_person_json`.
#[serde(rename = "last-modified-date")]
last_modified_date: String,
}
/// Contents of the `person` element that this module reads.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlPerson {
/// Given, family and credit name.
name: XmlPersonName,
/// Container for `other-name` entries.
#[serde(rename = "other-names")]
other_names: XmlOtherNames,
/// Container for `address` entries; only the first entry's country is used.
addresses: XmlAddresses,
/// Container for `keyword` entries.
keywords: XmlKeywords,
/// Container for `external-identifier` entries.
#[serde(rename = "external-identifiers")]
external_identifiers: XmlExternalIdentifiers,
}
/// Name parts of a person; each part is an empty string when absent.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlPersonName {
/// Text of `given-names`.
#[serde(rename = "given-names")]
given_names: String,
/// Text of `family-name`.
#[serde(rename = "family-name")]
family_name: String,
/// Text of `credit-name`; preferred by `display_name` when non-empty.
#[serde(rename = "credit-name")]
credit_name: String,
}
/// Container element holding zero or more `address` children.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlAddresses {
/// The repeated `address` elements, in document order.
#[serde(rename = "address")]
addresses: Vec<XmlAddress>,
}
/// A single `address` element.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlAddress {
/// Text of the `country` child (empty when absent).
country: String,
}
/// Container element holding zero or more `keyword` children.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlKeywords {
/// The repeated `keyword` elements, in document order.
#[serde(rename = "keyword")]
keywords: Vec<XmlKeyword>,
}
/// A single `keyword` element.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlKeyword {
/// Text of the `content` child (empty when absent).
content: String,
}
/// Container element holding zero or more `external-identifier` children.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlExternalIdentifiers {
/// The repeated `external-identifier` elements, in document order.
#[serde(rename = "external-identifier")]
identifiers: Vec<XmlExternalIdentifier>,
}
/// A single `external-identifier` element.
///
/// Consumers in this file skip entries whose type or value is empty.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlExternalIdentifier {
/// Text of `external-id-type`.
#[serde(rename = "external-id-type")]
type_: String,
/// Text of `external-id-value`.
#[serde(rename = "external-id-value")]
value: String,
}
/// Container element holding zero or more `other-name` children.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlOtherNames {
/// The repeated `other-name` elements, in document order.
#[serde(rename = "other-name")]
names: Vec<XmlOtherName>,
}
/// A single `other-name` element.
#[derive(Deserialize, Default)]
#[serde(default)]
struct XmlOtherName {
/// Text of the `content` child (empty when absent).
content: String,
}
/// Parses the bytes of one ORCID record XML document into an [`OrcidXml`].
///
/// Returns `None` when the bytes are not valid UTF-8, when the text contains
/// an `<error:error` tag, when deserialization fails, or when the parsed
/// record has an empty `orcid-identifier` URI.
fn parse_orcid_xml(xml_bytes: &[u8]) -> Option<OrcidXml> {
    let text = match std::str::from_utf8(xml_bytes) {
        Ok(text) => text,
        Err(_) => return None,
    };
    // Documents carrying an error element are rejected before deserializing.
    if text.contains("<error:error") {
        return None;
    }
    let parsed: OrcidXml = match quick_xml::de::from_str(text) {
        Ok(parsed) => parsed,
        Err(_) => return None,
    };
    // A record without an identifier URI cannot be keyed, so it is dropped.
    Some(parsed).filter(|rec| !rec.orcid_identifier.uri.is_empty())
}
/// Builds the display name of a record.
///
/// The credit name wins when it is non-empty. Otherwise the given and family
/// names are joined with a single space, falling back to whichever one is
/// present, or to an empty string when both are missing.
fn display_name(record: &OrcidXml) -> String {
    let parts = &record.person.name;
    if !parts.credit_name.is_empty() {
        return parts.credit_name.clone();
    }
    let given = parts.given_names.as_str();
    let family = parts.family_name.as_str();
    if given.is_empty() {
        // Also covers "both empty": the family name is then the empty string.
        family.to_string()
    } else if family.is_empty() {
        given.to_string()
    } else {
        format!("{} {}", given, family)
    }
}
/// Converts a parsed ORCID XML record into a `Data` value of type `"Person"`.
///
/// Empty other-names are dropped, external identifiers with an empty type or
/// value are skipped, and the country is taken from the first address entry
/// (empty when there is none). Fields not set here keep `Data::default()`.
fn from_orcid(record: OrcidXml) -> Data {
    // Computed first because the record is taken apart field by field below.
    let title = display_name(&record);
    let person = record.person;

    let mut identifiers: Vec<Identifier> = Vec::new();
    for ext in &person.external_identifiers.identifiers {
        if ext.type_.is_empty() || ext.value.is_empty() {
            continue;
        }
        let (cm_type, _) = map_orcid_ext_id_type(&ext.type_);
        identifiers.push(Identifier {
            identifier: ext.value.clone(),
            identifier_type: cm_type.to_string(),
            ..Default::default()
        });
    }

    let additional_names: Vec<String> = person
        .other_names
        .names
        .into_iter()
        .map(|other| other.content)
        .filter(|text| !text.is_empty())
        .collect();

    let country = match person.addresses.addresses.into_iter().next() {
        Some(address) => address.country,
        None => String::new(),
    };

    Data {
        id: record.orcid_identifier.uri,
        type_: "Person".to_string(),
        given_name: person.name.given_names,
        family_name: person.name.family_name,
        name: person.name.credit_name,
        additional_names,
        title,
        identifiers,
        country,
        asserted_by: "Author".to_string(),
        date_updated: record.history.last_modified_date,
        provider: "ORCID".to_string(),
        ..Data::default()
    }
}
/// SQL batch run by `ensure_people_schema`: sets `synchronous=NORMAL` and
/// creates the `settings` and `people` tables plus the `people` indexes on
/// `country` and `date_updated`, each only if it does not exist yet.
const PEOPLE_DDL: &str = r#"PRAGMA synchronous=NORMAL;
CREATE TABLE IF NOT EXISTS settings (
"key" TEXT PRIMARY KEY NOT NULL,
"value" TEXT NOT NULL DEFAULT ''
);
CREATE TABLE IF NOT EXISTS people (
"id" TEXT PRIMARY KEY NOT NULL,
"orcid" TEXT NOT NULL DEFAULT '',
"given_name" TEXT NOT NULL DEFAULT '',
"family_name" TEXT NOT NULL DEFAULT '',
"credit_name" TEXT NOT NULL DEFAULT '',
"name" TEXT NOT NULL DEFAULT '',
"country" TEXT NOT NULL DEFAULT '',
"keywords" TEXT NOT NULL DEFAULT '',
"other_names" TEXT NOT NULL DEFAULT '',
"external_identifiers" TEXT NOT NULL DEFAULT '',
"date_updated" TEXT NOT NULL DEFAULT '',
"metadata" BLOB NOT NULL DEFAULT x''
);
CREATE INDEX IF NOT EXISTS people_country ON people("country");
CREATE INDEX IF NOT EXISTS people_date_updated ON people("date_updated");"#;
/// `ALTER TABLE` statements that add the `other_names` and
/// `external_identifiers` columns to `people`. `ensure_people_schema` runs
/// each one and ignores the result, so a failing statement is not an error.
const PEOPLE_MIGRATE_DDL: &[&str] = &[
"ALTER TABLE people ADD COLUMN other_names TEXT NOT NULL DEFAULT ''",
"ALTER TABLE people ADD COLUMN external_identifiers TEXT NOT NULL DEFAULT ''",
];
/// Statement creating the `people_fts` FTS5 virtual table over the `name`,
/// `keywords` and `other_names` columns, declared with `content="people"`
/// and `content_rowid="rowid"`. Callers in this file populate it with the
/// FTS5 `'rebuild'` command after creating it.
const PEOPLE_FTS5_DDL: &str = "CREATE VIRTUAL TABLE people_fts USING fts5(\
name, keywords, other_names, \
content=\"people\", \
content_rowid=\"rowid\", \
tokenize=\"unicode61 remove_diacritics 1\"\
)";
/// Upsert (`INSERT OR REPLACE`) of one `people` row. The twelve positional
/// parameters follow the column list below; `write_people_batch` binds them
/// from a `PeopleRow` in the same order.
const PEOPLE_INSERT: &str = r#"INSERT OR REPLACE INTO people (
"id", "orcid", "given_name", "family_name", "credit_name", "name",
"country", "keywords", "other_names", "external_identifiers", "date_updated", "metadata"
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)"#;
/// In-memory image of one row of the `people` table, bound to
/// `PEOPLE_INSERT` by `write_people_batch`.
struct PeopleRow {
/// Primary key: the ORCID URI.
id: String,
/// The ORCID path (the identifier without the URL prefix).
orcid: String,
given_name: String,
family_name: String,
credit_name: String,
/// Display name (credit name, or given and family names combined).
name: String,
/// Country of the first address entry, or empty.
country: String,
/// Non-empty keywords joined with a single space.
keywords: String,
/// Non-empty other names joined with a single space.
other_names: String,
/// JSON array of `{"type", "value"}` objects, serialized to a string.
external_identifiers: String,
date_updated: String,
/// Source document (XML or JSON), zstd-compressed; the producers in this
/// file fall back to the raw bytes when compression fails.
metadata: Vec<u8>,
}
/// Flattens a parsed XML record into a [`PeopleRow`] ready for insertion.
///
/// `xml_bytes` is the source document the record was parsed from; it is
/// stored zstd-compressed in `metadata` (uncompressed if compression fails).
/// Keywords and other names are space-joined after dropping empty entries,
/// and external identifiers with an empty type or value are omitted from the
/// JSON array.
fn serialize_to_people_row(record: &OrcidXml, xml_bytes: &[u8]) -> PeopleRow {
    // Joins the non-empty items with a single space.
    fn join_non_empty<'a>(items: impl Iterator<Item = &'a str>) -> String {
        let kept: Vec<&str> = items.filter(|text| !text.is_empty()).collect();
        kept.join(" ")
    }

    let person = &record.person;

    let country = match person.addresses.addresses.first() {
        Some(address) => address.country.clone(),
        None => String::new(),
    };
    let keywords = join_non_empty(person.keywords.keywords.iter().map(|k| k.content.as_str()));
    let other_names = join_non_empty(person.other_names.names.iter().map(|n| n.content.as_str()));

    let mut ext_ids: Vec<serde_json::Value> = Vec::new();
    for ext in &person.external_identifiers.identifiers {
        if ext.type_.is_empty() || ext.value.is_empty() {
            continue;
        }
        ext_ids.push(serde_json::json!({"type": ext.type_, "value": ext.value}));
    }
    let external_identifiers = serde_json::to_string(&ext_ids).unwrap_or_default();

    let metadata = match zstd::encode_all(xml_bytes, 0) {
        Ok(compressed) => compressed,
        Err(_) => xml_bytes.to_vec(),
    };

    PeopleRow {
        id: record.orcid_identifier.uri.clone(),
        orcid: record.orcid_identifier.path.clone(),
        given_name: person.name.given_names.clone(),
        family_name: person.name.family_name.clone(),
        credit_name: person.name.credit_name.clone(),
        name: display_name(record),
        country,
        keywords,
        other_names,
        external_identifiers,
        date_updated: record.history.last_modified_date.clone(),
        metadata,
    }
}
/// Number of rows `stream_summaries_to_sqlite` buffers before writing them
/// to SQLite in a single transaction.
const BATCH_SIZE: usize = 50_000;
pub fn stream_summaries_to_sqlite(tar_gz_path: &Path, output_path: &Path) -> Result<usize> {
use flate2::read::GzDecoder;
use rusqlite::Connection;
use tar::Archive;
if let Some(parent) = output_path.parent() {
if !parent.as_os_str().is_empty() && !parent.exists() {
std::fs::create_dir_all(parent)
.map_err(|e| Error::Parse(format!("create output dir: {}", e)))?;
}
}
let conn = Connection::open(output_path)
.map_err(|e| Error::Parse(format!("open sqlite '{}': {}", output_path.display(), e)))?;
let _: String = conn
.query_row("PRAGMA journal_mode=WAL", [], |r| r.get(0))
.map_err(|e| Error::Parse(format!("WAL mode: {}", e)))?;
ensure_people_schema(&conn)?;
let _ = conn.execute("DROP TABLE IF EXISTS people_fts", []);
let file = std::fs::File::open(tar_gz_path)
.map_err(|e| Error::Parse(format!("open '{}': {}", tar_gz_path.display(), e)))?;
let gz = GzDecoder::new(file);
let mut archive = Archive::new(gz);
let mut batch: Vec<PeopleRow> = Vec::with_capacity(BATCH_SIZE);
let mut total = 0usize;
let entries = archive
.entries()
.map_err(|e| Error::Parse(format!("read tar entries: {}", e)))?;
for entry in entries {
let mut entry = match entry {
Ok(e) => e,
Err(_) => continue,
};
let is_xml = entry
.path()
.ok()
.and_then(|p| p.extension().map(|e| e == "xml"))
.unwrap_or(false);
if !is_xml {
continue;
}
let mut xml_bytes = Vec::new();
if entry.read_to_end(&mut xml_bytes).is_err() {
continue;
}
if let Some(record) = parse_orcid_xml(&xml_bytes) {
batch.push(serialize_to_people_row(&record, &xml_bytes));
}
if batch.len() >= BATCH_SIZE {
let count = write_people_batch(&conn, &batch)?;
total += count;
eprintln!("orcid: {} inserted in batch ({} total)", count, total);
batch.clear();
}
}
if !batch.is_empty() {
let count = write_people_batch(&conn, &batch)?;
total += count;
}
eprintln!("orcid: {} people imported; building FTS index…", total);
conn.execute_batch(PEOPLE_FTS5_DDL)
.map_err(|e| Error::Parse(format!("FTS5 DDL: {}", e)))?;
conn.execute(
"INSERT INTO people_fts(people_fts) VALUES('rebuild')",
[],
)
.map_err(|e| Error::Parse(format!("FTS5 rebuild: {}", e)))?;
let _ = conn.execute("PRAGMA wal_checkpoint(PASSIVE)", []);
Ok(total)
}
fn write_people_batch(conn: &rusqlite::Connection, batch: &[PeopleRow]) -> Result<usize> {
use rusqlite::params;
let tx = conn
.unchecked_transaction()
.map_err(|e| Error::Parse(format!("begin transaction: {}", e)))?;
{
let mut stmt = tx
.prepare(PEOPLE_INSERT)
.map_err(|e| Error::Parse(format!("prepare insert: {}", e)))?;
for row in batch {
stmt.execute(params![
row.id,
row.orcid,
row.given_name,
row.family_name,
row.credit_name,
row.name,
row.country,
row.keywords,
row.other_names,
row.external_identifiers,
row.date_updated,
row.metadata,
])
.map_err(|e| Error::Parse(format!("insert '{}': {}", row.id, e)))?;
}
}
tx.commit()
.map_err(|e| Error::Parse(format!("commit transaction: {}", e)))?;
Ok(batch.len())
}
/// Subset of the JSON returned by the ORCID `/v3.0/{orcid}/person` endpoint
/// that this module reads. Missing keys fall back to `Default` values.
#[derive(Deserialize, Default)]
#[serde(default)]
struct PersonJson {
/// Last-modified timestamp wrapper; converted with `epoch_ms_to_iso`.
#[serde(rename = "last-modified-date")]
last_modified_date: Option<EpochMs>,
/// Given, family and credit name wrappers.
name: NameJson,
/// Biography, when present; its content becomes `Data::description`.
biography: Option<BiographyJson>,
#[serde(rename = "other-names")]
other_names: OtherNamesJson,
keywords: KeywordsJson,
/// Addresses; only the first entry's country is used.
addresses: AddressesJson,
#[serde(rename = "researcher-urls")]
researcher_urls: ResearcherUrlsJson,
#[serde(rename = "external-identifiers")]
external_identifiers: ExtIdsJson,
/// The `path` key of the response; not read elsewhere in the visible code.
path: String, }
/// The `biography` object of a person response.
#[derive(Deserialize, Default)]
#[serde(default)]
struct BiographyJson {
/// Biography text; `None` or empty results in an empty description.
content: Option<String>,
}
/// The `other-names` object of a person response.
#[derive(Deserialize, Default)]
#[serde(default)]
struct OtherNamesJson {
/// Items of the `other-name` array.
#[serde(rename = "other-name")]
other_names: Vec<OtherNameItemJson>,
}
/// One item of the `other-name` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct OtherNameItemJson {
/// The alternative name text (empty when absent).
content: String,
}
/// Wrapper object `{"value": <integer>}` used for timestamps.
#[derive(Deserialize, Default)]
#[serde(default)]
struct EpochMs {
/// Passed to `epoch_ms_to_iso`, which interprets it as milliseconds since
/// the Unix epoch.
value: i64,
}
/// The `name` object of a person response; each part is an optional
/// `{"value": ...}` wrapper.
#[derive(Deserialize, Default)]
#[serde(default)]
struct NameJson {
#[serde(rename = "given-names")]
given_names: Option<StringValue>,
#[serde(rename = "family-name")]
family_name: Option<StringValue>,
/// Preferred as the display title when non-empty.
#[serde(rename = "credit-name")]
credit_name: Option<StringValue>,
}
/// Wrapper object `{"value": <string>}`.
#[derive(Deserialize, Default)]
#[serde(default)]
struct StringValue {
/// The wrapped text (empty when the key is absent).
value: String,
}
/// The `keywords` object of a person response.
#[derive(Deserialize, Default)]
#[serde(default)]
struct KeywordsJson {
/// Items of the `keyword` array.
keyword: Vec<KeywordItemJson>,
}
/// One item of the `keyword` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct KeywordItemJson {
/// The keyword text (empty when absent).
content: String, }
/// The `addresses` object of a person response.
#[derive(Deserialize, Default)]
#[serde(default)]
struct AddressesJson {
/// Items of the `address` array; consumers use only the first one.
address: Vec<AddressItemJson>,
}
/// One item of the `address` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct AddressItemJson {
/// The country as a `{"value": ...}` wrapper.
country: StringValue,
}
/// The `researcher-urls` object of a person response.
#[derive(Deserialize, Default)]
#[serde(default)]
struct ResearcherUrlsJson {
/// Items of the `researcher-url` array.
#[serde(rename = "researcher-url")]
urls: Vec<ResearcherUrlItemJson>,
}
/// One item of the `researcher-url` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct ResearcherUrlItemJson {
/// Optional label of the link; becomes `PersonUrl::name` (empty when `None`).
#[serde(rename = "url-name")]
url_name: Option<String>,
/// The link itself; items with an empty value are skipped by
/// `person_json_to_data`.
url: StringValue,
}
/// The `external-identifiers` object of a person response.
#[derive(Deserialize, Default)]
#[serde(default)]
struct ExtIdsJson {
/// Items of the `external-identifier` array.
#[serde(rename = "external-identifier")]
identifiers: Vec<ExtIdItemJson>,
}
/// One item of the `external-identifier` array. Consumers in this file skip
/// items whose type or value is empty.
#[derive(Deserialize, Default)]
#[serde(default)]
struct ExtIdItemJson {
#[serde(rename = "external-id-type")]
type_: String,
#[serde(rename = "external-id-value")]
value: String,
}
fn epoch_ms_to_iso(ms: i64) -> String {
use chrono::{TimeZone, Utc};
if ms == 0 {
return String::new();
}
Utc.timestamp_millis_opt(ms)
.single()
.map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true))
.unwrap_or_default()
}
fn person_json_to_data(person: &PersonJson, orcid_url: &str) -> Data {
let name_obj = &person.name;
let given_name = name_obj.given_names.as_ref().map(|s| s.value.clone()).unwrap_or_default();
let family_name = name_obj.family_name.as_ref().map(|s| s.value.clone()).unwrap_or_default();
let name = name_obj.credit_name.as_ref().map(|s| s.value.clone()).unwrap_or_default();
let title = if !name.is_empty() {
name.clone()
} else {
match (given_name.is_empty(), family_name.is_empty()) {
(false, false) => format!("{} {}", given_name, family_name),
(true, false) => family_name.clone(),
(false, true) => given_name.clone(),
(true, true) => String::new(),
}
};
let additional_names: Vec<String> = person.other_names.other_names
.iter()
.map(|n| n.content.clone())
.filter(|s| !s.is_empty())
.collect();
let description = person
.biography
.as_ref()
.and_then(|b| b.content.as_deref())
.filter(|s| !s.is_empty())
.unwrap_or_default()
.to_string();
let identifiers: Vec<Identifier> = person
.external_identifiers
.identifiers
.iter()
.filter(|e| !e.type_.is_empty() && !e.value.is_empty())
.map(|e| {
let (cm_type, _) = map_orcid_ext_id_type(&e.type_);
Identifier {
identifier: e.value.clone(),
identifier_type: cm_type.to_string(),
..Default::default()
}
})
.collect();
let urls: Vec<PersonUrl> = person.researcher_urls.urls
.iter()
.filter(|r| !r.url.value.is_empty())
.map(|r| PersonUrl {
name: r.url_name.clone().unwrap_or_default(),
url: r.url.value.clone(),
})
.collect();
let country = person.addresses.address.first()
.map(|a| a.country.value.clone())
.unwrap_or_default();
Data {
id: orcid_url.to_string(),
type_: "Person".to_string(),
given_name,
family_name,
name,
additional_names,
title,
description,
identifiers,
urls,
country,
asserted_by: "Author".to_string(),
date_updated: epoch_ms_to_iso(person.last_modified_date.as_ref().map(|d| d.value).unwrap_or(0)),
provider: "ORCID".to_string(),
..Data::default()
}
}
/// Fetches a person from the ORCID public API and returns only the database
/// row, discarding the raw JSON that `fetch_person_api_with_json` also yields.
fn fetch_person_api(orcid_url: &str) -> Result<PeopleRow> {
    fetch_person_api_with_json(orcid_url).map(|(row, _json)| row)
}
fn fetch_person_api_with_json(orcid_url: &str) -> Result<(PeopleRow, serde_json::Value)> {
let orcid = orcid_url.trim_start_matches("https://orcid.org/");
let api_url = format!("https://pub.orcid.org/v3.0/{}/person", orcid);
let client = reqwest::blocking::Client::builder()
.user_agent(crate::io_utils::commonmeta_user_agent())
.timeout(std::time::Duration::from_secs(30))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let bytes = client
.get(&api_url)
.header("Accept", "application/vnd.orcid+json")
.send()
.map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
.error_for_status()
.map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
.bytes()
.map_err(|e| Error::Http(e.to_string()))?;
let json_value: serde_json::Value =
serde_json::from_slice(&bytes).map_err(|e| Error::Parse(e.to_string()))?;
let person: PersonJson =
serde_json::from_slice(&bytes).map_err(|e| Error::Parse(e.to_string()))?;
let country = person
.addresses
.address
.first()
.map(|a| a.country.value.clone())
.unwrap_or_default();
let keywords = person
.keywords
.keyword
.iter()
.map(|k| k.content.as_str())
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
let other_names = person
.other_names
.other_names
.iter()
.map(|n| n.content.as_str())
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
let ext_ids: Vec<serde_json::Value> = person
.external_identifiers
.identifiers
.iter()
.filter(|e| !e.type_.is_empty() && !e.value.is_empty())
.map(|e| serde_json::json!({"type": e.type_, "value": e.value}))
.collect();
let external_identifiers = serde_json::to_string(&ext_ids).unwrap_or_default();
let metadata = zstd::encode_all(bytes.as_ref(), 0).unwrap_or_else(|_| bytes.to_vec());
let data = person_json_to_data(&person, orcid_url);
let row = PeopleRow {
id: data.id.clone(),
orcid: orcid.to_string(),
given_name: person.name.given_names.as_ref().map(|s| s.value.clone()).unwrap_or_default(),
family_name: person.name.family_name.as_ref().map(|s| s.value.clone()).unwrap_or_default(),
credit_name: person.name.credit_name.as_ref().map(|s| s.value.clone()).unwrap_or_default(),
name: data.title,
country,
keywords,
other_names,
external_identifiers,
date_updated: data.date_updated,
metadata,
};
Ok((row, json_value))
}
pub fn fetch_orcid(id: &str) -> Result<Data> {
let url = crate::utils::normalize_orcid(id);
if url.is_empty() {
return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
}
let (_, json) = fetch_person_api_with_json(&url)?;
let person: PersonJson =
serde_json::from_value(json).map_err(|e| Error::Parse(e.to_string()))?;
Ok(person_json_to_data(&person, &url))
}
/// Fetches a person from the ORCID public API and returns the response as
/// untyped JSON.
///
/// # Errors
///
/// Returns `Error::Parse` when `id` does not normalize to an ORCID URL and
/// passes on any error from the API request.
pub fn fetch_person_json(id: &str) -> Result<serde_json::Value> {
    let url = crate::utils::normalize_orcid(id);
    if url.is_empty() {
        return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
    }
    fetch_person_api_with_json(&url).map(|(_row, json)| json)
}
fn xml_record_to_person_json(record: &OrcidXml) -> serde_json::Value {
let orcid = &record.orcid_identifier.path;
let last_modified_ms = chrono::DateTime::parse_from_rfc3339(&record.history.last_modified_date)
.map(|dt| dt.timestamp_millis())
.unwrap_or(0);
let given = if record.person.name.given_names.is_empty() {
serde_json::Value::Null
} else {
serde_json::json!({"value": record.person.name.given_names})
};
let family = if record.person.name.family_name.is_empty() {
serde_json::Value::Null
} else {
serde_json::json!({"value": record.person.name.family_name})
};
let credit = if record.person.name.credit_name.is_empty() {
serde_json::Value::Null
} else {
serde_json::json!({"value": record.person.name.credit_name})
};
let other_name_items: Vec<serde_json::Value> = record
.person
.other_names
.names
.iter()
.map(|n| serde_json::json!({"content": n.content, "visibility": "public"}))
.collect();
let address_items: Vec<serde_json::Value> = record
.person
.addresses
.addresses
.iter()
.map(|a| serde_json::json!({"country": {"value": a.country}, "visibility": "public"}))
.collect();
let keyword_items: Vec<serde_json::Value> = record
.person
.keywords
.keywords
.iter()
.map(|k| serde_json::json!({"content": k.content, "visibility": "public"}))
.collect();
let ext_id_items: Vec<serde_json::Value> = record
.person
.external_identifiers
.identifiers
.iter()
.map(|e| serde_json::json!({
"external-id-type": e.type_,
"external-id-value": e.value,
"external-id-relationship": "self",
"visibility": "public"
}))
.collect();
serde_json::json!({
"last-modified-date": {"value": last_modified_ms},
"name": {
"given-names": given,
"family-name": family,
"credit-name": credit,
"visibility": "public",
"path": orcid
},
"other-names": {
"other-name": other_name_items,
"path": format!("/{}/other-names", orcid)
},
"biography": null,
"researcher-urls": {
"researcher-url": [],
"path": format!("/{}/researcher-urls", orcid)
},
"emails": {
"email": [],
"path": format!("/{}/email", orcid)
},
"addresses": {
"address": address_items,
"path": format!("/{}/address", orcid)
},
"keywords": {
"keyword": keyword_items,
"path": format!("/{}/keywords", orcid)
},
"external-identifiers": {
"external-identifier": ext_id_items,
"path": format!("/{}/external-identifiers", orcid)
},
"path": format!("/{}/person", orcid)
})
}
pub fn fetch_person_json_sqlite(id: &str, db_path: &Path) -> Result<serde_json::Value> {
use rusqlite::{params, Connection};
let url = crate::utils::normalize_orcid(id);
if url.is_empty() {
return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
}
let conn = Connection::open(db_path)
.map_err(|e| Error::Parse(format!("open sqlite '{}': {}", db_path.display(), e)))?;
let blob: Vec<u8> = conn
.query_row(
"SELECT metadata FROM people WHERE id = ?1 LIMIT 1",
params![url],
|row| row.get(0),
)
.map_err(|e| {
if matches!(e, rusqlite::Error::QueryReturnedNoRows) {
Error::Parse(format!("person '{}' not found in local database", id))
} else {
Error::Parse(format!("sqlite query failed: {}", e))
}
})?;
let raw = zstd::decode_all(std::io::Cursor::new(&blob))
.map_err(|e| Error::Parse(format!("decompress metadata for '{}': {}", id, e)))?;
if raw.iter().find(|&&b| b != b' ' && b != b'\n' && b != b'\r' && b != b'\t') == Some(&b'<') {
let record = parse_orcid_xml(&raw)
.ok_or_else(|| Error::Parse(format!("re-parse XML for '{}'", id)))?;
Ok(xml_record_to_person_json(&record))
} else {
serde_json::from_slice(&raw).map_err(|e| Error::Parse(format!("re-parse JSON for '{}': {}", id, e)))
}
}
fn ensure_people_schema(conn: &rusqlite::Connection) -> Result<()> {
conn.execute_batch(PEOPLE_DDL)
.map_err(|e| Error::Parse(format!("people DDL: {}", e)))?;
for stmt in PEOPLE_MIGRATE_DDL {
let _ = conn.execute_batch(stmt);
}
let fts_sql: String = conn
.query_row(
"SELECT COALESCE(sql, '') FROM sqlite_master WHERE name='people_fts'",
[],
|r| r.get(0),
)
.unwrap_or_default();
if !fts_sql.contains("other_names") {
conn.execute_batch("DROP TABLE IF EXISTS people_fts")
.map_err(|e| Error::Parse(format!("drop FTS5: {}", e)))?;
conn.execute_batch(PEOPLE_FTS5_DDL)
.map_err(|e| Error::Parse(format!("FTS5 DDL: {}", e)))?;
let row_count: i64 = conn
.query_row("SELECT COUNT(*) FROM people", [], |r| r.get(0))
.unwrap_or(0);
if row_count > 0 {
conn.execute("INSERT INTO people_fts(people_fts) VALUES('rebuild')", [])
.map_err(|e| Error::Parse(format!("FTS5 rebuild: {}", e)))?;
}
}
Ok(())
}
/// Imports one person and up to `MAX_WORKS` of their works.
///
/// The person is fetched from the ORCID API, upserted into `people_db`, and
/// the FTS index is rebuilt. Works are then fetched from Crossref and
/// DataCite (a failing source contributes nothing), de-duplicated by id,
/// sorted by `date_published` descending, truncated to `MAX_WORKS`, and
/// upserted into `works_db`.
///
/// Returns the number of works written (0 when none were found).
///
/// # Errors
///
/// Returns `Error::Parse` when `id` is not a valid ORCID identifier or a
/// database step fails, and passes on errors from the ORCID request and from
/// the works upsert.
pub fn import_person(id: &str, people_db: &Path, works_db: &Path) -> Result<usize> {
    use rusqlite::Connection;
    const MAX_WORKS: usize = 50;

    let orcid_url = crate::utils::normalize_orcid(id);
    if orcid_url.is_empty() {
        return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
    }

    let conn = Connection::open(people_db)
        .map_err(|e| Error::Parse(format!("open sqlite '{}': {}", people_db.display(), e)))?;
    let _: String = conn
        .query_row("PRAGMA journal_mode=WAL", [], |r| r.get(0))
        .map_err(|e| Error::Parse(format!("WAL mode: {}", e)))?;
    ensure_people_schema(&conn)?;

    let person_row = fetch_person_api(&orcid_url)?;
    write_people_batch(&conn, std::slice::from_ref(&person_row))?;
    conn.execute("INSERT INTO people_fts(people_fts) VALUES('rebuild')", [])
        .map_err(|e| Error::Parse(format!("FTS5 rebuild: {}", e)))?;

    // Either source may fail independently; a failure counts as "no works".
    let crossref_works =
        crate::formats::crossref::fetch_by_orcid(&orcid_url, MAX_WORKS, 1).unwrap_or_default();
    let datacite_works =
        crate::formats::datacite::fetch_by_orcid(&orcid_url, MAX_WORKS, 1).unwrap_or_default();
    let mut works: Vec<crate::Data> = crossref_works;
    works.extend(datacite_works);

    if works.is_empty() {
        return Ok(0);
    }

    // Keep the first occurrence of each id, newest publications first.
    let mut seen_ids = std::collections::HashSet::new();
    works.retain(|work| seen_ids.insert(work.id.clone()));
    works.sort_by(|a, b| b.date_published.cmp(&a.date_published));
    works.truncate(MAX_WORKS);

    crate::formats::commonmeta::upsert_sqlite(&works, works_db)?;
    Ok(works.len())
}
pub fn fetch_sqlite(id: &str, db_path: &Path) -> Result<Data> {
use rusqlite::{params, Connection};
let url = crate::utils::normalize_orcid(id);
if url.is_empty() {
return Err(Error::Parse(format!("'{}' is not a valid ORCID identifier", id)));
}
let conn = Connection::open(db_path)
.map_err(|e| Error::Parse(format!("open sqlite '{}': {}", db_path.display(), e)))?;
let blob: Vec<u8> = conn
.query_row(
"SELECT metadata FROM people WHERE id = ?1 LIMIT 1",
params![url],
|row| row.get(0),
)
.map_err(|e| {
if matches!(e, rusqlite::Error::QueryReturnedNoRows) {
Error::Parse(format!("person '{}' not found in local database", id))
} else {
Error::Parse(format!("sqlite query failed: {}", e))
}
})?;
let raw = zstd::decode_all(std::io::Cursor::new(&blob))
.map_err(|e| Error::Parse(format!("decompress metadata for '{}': {}", id, e)))?;
if raw.iter().find(|&&b| b != b' ' && b != b'\n' && b != b'\r' && b != b'\t') == Some(&b'<') {
let record = parse_orcid_xml(&raw)
.ok_or_else(|| Error::Parse(format!("re-parse XML for '{}'", id)))?;
Ok(from_orcid(record))
} else {
let person: PersonJson = serde_json::from_slice(&raw)
.map_err(|e| Error::Parse(format!("re-parse JSON for '{}': {}", id, e)))?;
Ok(person_json_to_data(&person, &url))
}
}
/// One entry of the `files` array in a Figshare article response; only the
/// keys read by `fetch_orcid_release` are declared.
#[derive(Deserialize)]
struct FigshareFile {
/// File name; matched against the `_summaries.tar.gz` suffix.
name: String,
/// File size; copied into `OrcidRelease::size_bytes`.
size: u64,
/// Download URL; copied into `OrcidRelease::download_url`.
download_url: String,
}
/// Subset of a Figshare article-version response: just its file list.
#[derive(Deserialize)]
struct FigshareArticle {
/// The files attached to the article version.
files: Vec<FigshareFile>,
}
pub fn fetch_orcid_release(version: u32) -> Result<OrcidRelease> {
let url = format!(
"https://api.figshare.com/v2/articles/{}/versions/{}",
ORCID_FIGSHARE_ARTICLE_ID, version
);
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let text = client
.get(&url)
.send()
.map_err(|e| Error::Http(format!("figshare API: {}", e)))?
.error_for_status()
.map_err(|e| Error::Http(format!("figshare API status: {}", e)))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
let article: FigshareArticle =
serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
let summaries = article
.files
.into_iter()
.find(|f| f.name.ends_with("_summaries.tar.gz"))
.ok_or_else(|| Error::Parse("no summaries tar.gz found in figshare article".into()))?;
let (year, batch) = parse_filename_date(&summaries.name);
Ok(OrcidRelease {
year,
batch,
filename: summaries.name,
download_url: summaries.download_url,
size_bytes: summaries.size,
})
}
/// Extracts `(year, batch)` from an archive name such as
/// `ORCID_2025_10_summaries.tar.gz`.
///
/// The name is split on `_`; the second field is the year and the third the
/// batch. A field that is missing or does not parse as `u16` yields `0`.
fn parse_filename_date(name: &str) -> (u16, u16) {
    // Skip the leading prefix field; the next two fields carry the numbers.
    let mut fields = name.split('_').skip(1);
    let year: u16 = match fields.next() {
        Some(text) => text.parse().unwrap_or(0),
        None => 0,
    };
    let batch: u16 = match fields.next() {
        Some(text) => text.parse().unwrap_or(0),
        None => 0,
    };
    (year, batch)
}
pub fn fetch_latest_orcid_release() -> Result<OrcidRelease> {
#[derive(Deserialize)]
struct VersionEntry {
version: u32,
}
let url = format!(
"https://api.figshare.com/v2/articles/{}/versions",
ORCID_FIGSHARE_ARTICLE_ID
);
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let text = client
.get(&url)
.send()
.map_err(|e| Error::Http(format!("figshare versions API: {}", e)))?
.error_for_status()
.map_err(|e| Error::Http(format!("figshare versions API status: {}", e)))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
let entries: Vec<VersionEntry> =
serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
let latest = entries
.into_iter()
.map(|e| e.version)
.max()
.ok_or_else(|| Error::Parse("figshare returned empty versions list".into()))?;
fetch_orcid_release(latest)
}
pub fn rebuild_people_fts(path: &Path) -> Result<()> {
let conn = rusqlite::Connection::open(path)
.map_err(|e| Error::Parse(format!("open sqlite '{}': {}", path.display(), e)))?;
let _ = conn.execute("DROP TABLE IF EXISTS people_fts", []);
conn.execute_batch(PEOPLE_FTS5_DDL)
.map_err(|e| Error::Parse(format!("people_fts DDL: {}", e)))?;
conn.execute("INSERT INTO people_fts(people_fts) VALUES('rebuild')", [])
.map_err(|e| Error::Parse(format!("people_fts rebuild: {}", e)))?;
Ok(())
}
pub fn fetch_installed_orcid_public_data_version(db_path: &Path) -> Result<Option<String>> {
use rusqlite::Connection;
use rusqlite::Error as SqliteError;
let conn = Connection::open(db_path)
.map_err(|e| Error::Parse(format!("failed to open sqlite: {}", e)))?;
match conn.query_row(
"SELECT value FROM settings WHERE key = 'orcid_public_data_version' LIMIT 1",
[],
|row| row.get::<_, String>(0),
) {
Ok(v) => Ok(Some(v)),
Err(SqliteError::QueryReturnedNoRows) => Ok(None),
Err(_) => Ok(None),
}
}
/// Sends a `HEAD` request to `url` (without following redirects) and returns
/// the advertised file name and size.
///
/// The file name comes from the `filename=` part of the
/// `Content-Disposition` header and is accepted only when it ends with
/// `.tar.gz`. The size comes from the `Content-Length` header. On any
/// failure the result falls back to `("orcid_summaries.tar.gz", 0)`, and a
/// missing or unparsable part falls back individually.
fn probe_download_url(url: &str) -> (String, u64) {
    const FALLBACK_NAME: &str = "orcid_summaries.tar.gz";

    let client = match reqwest::blocking::Client::builder()
        .user_agent(format!(
            "commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
            env!("CARGO_PKG_VERSION")
        ))
        .redirect(reqwest::redirect::Policy::none())
        .timeout(std::time::Duration::from_secs(30))
        .build()
    {
        Ok(c) => c,
        Err(_) => return (FALLBACK_NAME.to_string(), 0),
    };
    let resp = match client.head(url).send() {
        Ok(r) => r,
        Err(_) => return (FALLBACK_NAME.to_string(), 0),
    };

    // `Response::content_length()` describes the size of the response body,
    // and a HEAD response has none, so it cannot be relied on for the
    // advertised size. Read the header itself and keep the old call only as
    // a fallback.
    let size = resp
        .headers()
        .get("content-length")
        .and_then(|v| v.to_str().ok())
        .and_then(|s| s.trim().parse::<u64>().ok())
        .or_else(|| resp.content_length())
        .unwrap_or(0);

    let filename = resp
        .headers()
        .get("content-disposition")
        .and_then(|v| v.to_str().ok())
        .and_then(|s| {
            s.split(';').find_map(|part| {
                part.trim()
                    .strip_prefix("filename=")
                    .map(|f| f.trim_matches('"').to_string())
            })
        })
        // Anything that is not a tarball name is ignored in favour of the
        // fallback.
        .filter(|f| f.ends_with(".tar.gz"))
        .unwrap_or_else(|| FALLBACK_NAME.to_string());

    (filename, size)
}
/// Imports an ORCID public-data summaries archive into the SQLite database
/// at `output_path` and records the imported version in `settings`.
///
/// `source` selects where the archive comes from:
/// - a value not starting with `http` is treated as a local file path and
///   imported directly;
/// - a value starting with `http` is probed for its file name and size,
///   downloaded into the `orcid` cache directory unless a cached file of the
///   same size exists, and then imported;
/// - `None` looks up the latest release on Figshare and proceeds as for a URL.
///
/// With `no_network` set, only the local-file form is allowed.
///
/// Returns the number of people imported, or `0` when the version derived
/// from the file name is already recorded as installed.
///
/// # Errors
///
/// Returns `Error::Parse` for a missing local file, for `no_network` without
/// a local file, for an article-bundle URL, and for download, directory or
/// database failures; errors from the Figshare lookup and from
/// `stream_summaries_to_sqlite` are passed on.
pub fn import_orcid_public_data(
output_path: &Path,
source: Option<&str>,
no_network: bool,
) -> Result<usize> {
use rusqlite::params;
use rusqlite::Connection;
// Stores `version_key` under `orcid_public_data_version`. Keys starting with
// "0_" (year could not be parsed from the file name) are not recorded.
fn write_version(output_path: &Path, version_key: &str) -> Result<()> {
if version_key.starts_with("0_") {
return Ok(());
}
let conn = Connection::open(output_path)
.map_err(|e| Error::Parse(format!("open sqlite: {}", e)))?;
conn.execute(
"INSERT OR REPLACE INTO settings (key, value) VALUES ('orcid_public_data_version', ?1)",
params![version_key],
)
.map_err(|e| Error::Parse(format!("write settings: {}", e)))?;
Ok(())
}
// Local-file path: anything that does not look like a URL is imported
// directly, without the installed-version check and without the cache.
if let Some(s) = source {
if !s.starts_with("http") {
let local = std::path::Path::new(s);
if !local.exists() {
return Err(Error::Parse(format!("file not found: {}", s)));
}
let filename = local
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("orcid_summaries.tar.gz")
.to_string();
let (year, batch) = parse_filename_date(&filename);
let version_key = format!("{}_{}", year, batch);
eprintln!("orcid: importing {} …", local.display());
let count = stream_summaries_to_sqlite(local, output_path)?;
write_version(output_path, &version_key)?;
eprintln!("orcid: import complete — {} people", count);
return Ok(count);
}
}
// Everything below needs network access.
if no_network {
return Err(Error::Parse(
"--no-network: pass a local file path or remove --no-network".to_string(),
));
}
// Resolve the download URL, file name and expected size.
let (download_url, filename, size_bytes) = match source {
Some(url) => {
// Reject the whole-article download URL before starting a transfer.
if url.contains("/ndownloader/articles/") {
return Err(Error::Parse(
"that URL downloads the full article bundle (~221 GB, all files).\n\
Pass the summaries-only file (~46 GB) instead.\n\
On a machine where figshare is reachable:\n \
commonmeta import --from orcid --list-releases\n\
Or download the file there and copy it:\n \
scp ORCID_2025_10_summaries.tar.gz root@server:/data/\n \
commonmeta import --from orcid /data/ORCID_2025_10_summaries.tar.gz"
.to_string(),
));
}
eprintln!("orcid: probing URL for filename and size …");
let (filename, size) = probe_download_url(url);
eprintln!("orcid: {} ({:.1} GB)", filename, size as f64 / 1_073_741_824.0);
(url.to_string(), filename, size)
}
None => {
eprintln!("orcid: checking latest release on figshare …");
// An error whose text contains "403" is extended with workaround
// instructions; other errors are passed through unchanged.
let release = fetch_latest_orcid_release().map_err(|e| {
let msg = e.to_string();
if msg.contains("403") {
Error::Http(format!(
"{msg}\n\
figshare is blocked on this host. Options:\n\
A) download the file on another machine and copy it here:\n \
commonmeta import --from orcid --list-releases # get URL\n \
wget <SUMMARIES_URL> # ~43 GB\n \
scp ORCID_*_summaries.tar.gz root@server:/data/\n \
commonmeta import --from orcid /data/ORCID_*_summaries.tar.gz\n\
B) pass the URL directly (from --list-releases on another machine):\n \
commonmeta import --from orcid \"<SUMMARIES_URL>\""
))
} else {
e
}
})?;
eprintln!(
"orcid: {} ({}_{}) {:.1} GB",
release.filename, release.year, release.batch,
release.size_bytes as f64 / 1_073_741_824.0,
);
(release.download_url, release.filename, release.size_bytes)
}
};
let (year, batch) = parse_filename_date(&filename);
let version_key = format!("{}_{}", year, batch);
// Skip the import when this exact version is already recorded; keys with
// an unparsed year ("0_...") never count as installed.
let installed = if output_path.exists() {
fetch_installed_orcid_public_data_version(output_path)?
} else {
None
};
if !version_key.starts_with("0_") && installed.as_deref() == Some(&version_key) {
eprintln!("orcid: {} already installed", version_key);
return Ok(0);
}
// A cached file is reused only when the expected size is known (> 0) and
// matches the size of the file on disk.
let cache_path = crate::io_utils::cache_dir("orcid").join(&filename);
let cached = cache_path.exists()
&& size_bytes > 0
&& cache_path.metadata().map(|m| m.len()).unwrap_or(0) == size_bytes;
if !cached {
eprintln!("orcid: downloading to {} …", cache_path.display());
if let Some(parent) = cache_path.parent() {
std::fs::create_dir_all(parent)
.map_err(|e| Error::Parse(format!("mkdir: {}", e)))?;
}
crate::io_utils::download_file_to_path(&download_url, &cache_path)
.map_err(|e| Error::Parse(format!("download: {}", e)))?;
} else {
eprintln!("orcid: using cached {}", cache_path.display());
}
eprintln!("orcid: importing {} into {} …", filename, output_path.display());
let count = stream_summaries_to_sqlite(&cache_path, output_path)?;
write_version(output_path, &version_key)?;
eprintln!("orcid: import complete — {} people", count);
Ok(count)
}
/// Top-level JSON body that `fetch_person_employments` deserializes from the
/// ORCID `/employments` endpoint.
///
/// Because of the container-level `#[serde(default)]`, a missing
/// `affiliation-group` key yields an empty list rather than a parse error.
#[derive(Deserialize, Default)]
#[serde(default)]
struct EmploymentsResponse {
/// Entries of the `affiliation-group` array.
#[serde(rename = "affiliation-group")]
affiliation_groups: Vec<AffiliationGroupJson>,
}
/// One element of `affiliation-group`; it only carries a list of summary wrappers.
#[derive(Deserialize, Default)]
#[serde(default)]
struct AffiliationGroupJson {
/// Entries of the `summaries` array (empty when the key is absent).
summaries: Vec<AffiliationSummaryWrapper>,
}
/// Wrapper object inside `summaries`; the payload sits under `employment-summary`.
#[derive(Deserialize, Default)]
#[serde(default)]
struct AffiliationSummaryWrapper {
/// The employment record, or `None` when the key is missing or `null`
/// (such wrappers are skipped by `fetch_person_employments`).
#[serde(rename = "employment-summary")]
employment_summary: Option<EmploymentSummaryJson>,
}
#[derive(Deserialize, Default)]
#[serde(default)]
struct EmploymentSummaryJson {
organization: OrganizationJson,
#[serde(rename = "role-title")]
role_title: String,
#[serde(rename = "start-date")]
start_date: Option<OrcidDateJson>,
#[serde(rename = "end-date")]
end_date: Option<OrcidDateJson>,
}
/// The `organization` object of an employment summary.
#[derive(Deserialize, Default)]
#[serde(default)]
struct OrganizationJson {
/// Organization name; employments with an empty name are dropped by
/// `fetch_person_employments`.
name: String,
/// Optional `disambiguated-organization` object used to resolve a ROR id.
#[serde(rename = "disambiguated-organization")]
disambiguated: Option<DisambiguatedOrgJson>,
}
/// The `disambiguated-organization` object: an identifier plus the registry it
/// comes from. Consumed by `resolve_ror_id`.
#[derive(Deserialize, Default)]
#[serde(default)]
struct DisambiguatedOrgJson {
/// Value of `disambiguated-organization-identifier`.
#[serde(rename = "disambiguated-organization-identifier")]
identifier: String,
/// Value of `disambiguation-source`; `resolve_ror_id` compares it
/// case-insensitively against ROR, GRID, ISNI, FUNDREF and Wikidata.
#[serde(rename = "disambiguation-source")]
source: String,
}
/// A partial date as delivered by ORCID: every component is optional and is
/// wrapped in a `StringValue` (its `value` is read by `orcid_date_to_iso`).
#[derive(Deserialize, Default)]
#[serde(default)]
struct OrcidDateJson {
/// Year component; without it `orcid_date_to_iso` returns `None`.
year: Option<StringValue>,
/// Month component.
month: Option<StringValue>,
/// Day component; only used when the month is present as well.
day: Option<StringValue>,
}
/// One employment of a person, as produced by `fetch_person_employments` and
/// consumed by the YAML and commonmeta writers in this file.
pub struct PersonAffiliation {
/// ROR id with the `https://ror.org/` prefix removed by `resolve_ror_id`
/// (the commonmeta writer adds the prefix back), or `None` when the
/// organization could not be resolved.
pub ror_id: Option<String>,
/// Organization name (never empty for values built by `fetch_person_employments`).
pub name: String,
/// Role title, `None` when ORCID supplied an empty one.
pub role: Option<String>,
/// Start date formatted by `orcid_date_to_iso` (`YYYY`, `YYYY-MM` or `YYYY-MM-DD`).
pub start_date: Option<String>,
/// End date in the same format as `start_date`.
pub end_date: Option<String>,
}
/// Formats an ORCID partial date as `YYYY`, `YYYY-MM` or `YYYY-MM-DD`.
///
/// Returns `None` when the year is missing or empty. Month and day are
/// left-padded with `0` to two characters. The day is only used when a
/// non-empty month is present too; a day without a month is ignored.
fn orcid_date_to_iso(date: &OrcidDateJson) -> Option<String> {
    let year = date
        .year
        .as_ref()
        .map(|y| y.value.as_str())
        .filter(|y| !y.is_empty())?;
    let month = date
        .month
        .as_ref()
        .map(|m| m.value.as_str())
        .filter(|m| !m.is_empty());
    let day = date
        .day
        .as_ref()
        .map(|d| d.value.as_str())
        .filter(|d| !d.is_empty());

    let iso = match (month, day) {
        (Some(m), Some(d)) => format!("{}-{:0>2}-{:0>2}", year, m, d),
        (Some(m), None) => format!("{}-{:0>2}", year, m),
        (None, _) => year.to_string(),
    };
    Some(iso)
}
/// Looks up the `organizations.id` whose `external_ids` JSON contains an entry
/// of type `ext_type` with `ext_value` as its `preferred` value or among its
/// `all` values.
///
/// `ext_type` is compared against the lower-cased stored type, so callers must
/// pass it in lower case. Any failure (database cannot be opened, query error,
/// no matching row) is collapsed into `None`.
fn ror_id_from_sqlite(ext_type: &str, ext_value: &str, db_path: &Path) -> Option<String> {
    use rusqlite::{params, Connection};

    const LOOKUP_SQL: &str = "SELECT o.id FROM organizations o, json_each(o.external_ids) ext \
        WHERE LOWER(json_extract(ext.value, '$.type')) = ?1 \
        AND (json_extract(ext.value, '$.preferred') = ?2 \
        OR EXISTS ( \
        SELECT 1 FROM json_each(json_extract(ext.value, '$.all')) a \
        WHERE a.value = ?2 \
        )) \
        LIMIT 1";

    let conn = match Connection::open(db_path) {
        Ok(conn) => conn,
        Err(_) => return None,
    };
    let lookup = conn.query_row(LOOKUP_SQL, params![ext_type, ext_value], |row| {
        row.get::<_, String>(0)
    });
    lookup.ok()
}
fn resolve_ror_id(d: &DisambiguatedOrgJson, db_path: Option<&Path>) -> Option<String> {
if d.identifier.is_empty() {
return None;
}
if d.source.eq_ignore_ascii_case("ROR") {
return Some(
d.identifier
.trim_start_matches("https://ror.org/")
.trim_start_matches("http://ror.org/")
.to_string(),
);
}
let db = db_path?;
let ext_type = d.source.to_lowercase();
if matches!(ext_type.as_str(), "grid" | "isni" | "fundref" | "wikidata") {
return ror_id_from_sqlite(&ext_type, &d.identifier, db)
.map(|url| url.trim_start_matches("https://ror.org/").to_string());
}
None
}
pub fn fetch_person_employments(orcid_url: &str, db_path: Option<&Path>) -> Result<Vec<PersonAffiliation>> {
let orcid = orcid_url.trim_start_matches("https://orcid.org/");
let api_url = format!("https://pub.orcid.org/v3.0/{}/employments", orcid);
let client = reqwest::blocking::Client::builder()
.user_agent(crate::io_utils::commonmeta_user_agent())
.timeout(std::time::Duration::from_secs(30))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let resp: EmploymentsResponse = client
.get(&api_url)
.header("Accept", "application/vnd.orcid+json")
.send()
.map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
.error_for_status()
.map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
.json()
.map_err(|e| Error::Parse(e.to_string()))?;
let mut affiliations: Vec<PersonAffiliation> = resp
.affiliation_groups
.iter()
.flat_map(|g| g.summaries.iter())
.filter_map(|w| w.employment_summary.as_ref())
.filter(|emp| !emp.organization.name.is_empty())
.map(|emp| {
let ror_id = emp
.organization
.disambiguated
.as_ref()
.and_then(|d| resolve_ror_id(d, db_path));
PersonAffiliation {
ror_id,
name: emp.organization.name.clone(),
role: if emp.role_title.is_empty() { None } else { Some(emp.role_title.clone()) },
start_date: emp.start_date.as_ref().and_then(orcid_date_to_iso),
end_date: emp.end_date.as_ref().and_then(orcid_date_to_iso),
}
})
.collect();
affiliations.sort_by(|a, b| match (&a.start_date, &b.start_date) {
(Some(da), Some(db)) => da.cmp(db),
(Some(_), None) => std::cmp::Ordering::Less,
(None, Some(_)) => std::cmp::Ordering::Greater,
(None, None) => std::cmp::Ordering::Equal,
});
Ok(affiliations)
}
/// Top-level JSON body that `fetch_orcid_work_dois` deserializes from the ORCID
/// `/works` endpoint. A missing `group` key yields an empty list.
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorksResponse {
/// Entries of the `group` array.
group: Vec<WorkGroup>,
}
/// One element of `group`, holding the summaries listed under `work-summary`.
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorkGroup {
/// Entries of the `work-summary` array; `fetch_orcid_work_dois` only reads
/// the first one.
#[serde(rename = "work-summary")]
work_summaries: Vec<WorkSummaryJson>,
}
/// A single work summary; only its external identifiers are deserialized.
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorkSummaryJson {
/// The `external-ids` object (defaults to an empty list when absent).
#[serde(rename = "external-ids")]
external_ids: WorkExternalIds,
}
/// The `external-ids` object wrapping the `external-id` array.
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorkExternalIds {
/// Entries of the `external-id` array.
#[serde(rename = "external-id")]
external_id: Vec<WorkExternalId>,
}
/// One external identifier attached to a work.
#[derive(Deserialize, Default)]
#[serde(default)]
struct WorkExternalId {
/// Value of `external-id-type`; `fetch_orcid_work_dois` selects entries
/// whose type is exactly `"doi"`.
#[serde(rename = "external-id-type")]
id_type: String,
/// Value of `external-id-value`.
#[serde(rename = "external-id-value")]
id_value: String,
}
pub fn fetch_orcid_work_dois(orcid_url: &str) -> Result<Vec<String>> {
let orcid = orcid_url.trim_start_matches("https://orcid.org/");
let api_url = format!("https://pub.orcid.org/v3.0/{}/works", orcid);
let client = reqwest::blocking::Client::builder()
.user_agent(crate::io_utils::commonmeta_user_agent())
.timeout(std::time::Duration::from_secs(30))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let resp: WorksResponse = client
.get(&api_url)
.header("Accept", "application/vnd.orcid+json")
.send()
.map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
.error_for_status()
.map_err(|e| Error::Http(format!("ORCID API: {}", e)))?
.json()
.map_err(|e| Error::Parse(e.to_string()))?;
let mut dois: Vec<String> = Vec::new();
for group in &resp.group {
if let Some(summary) = group.work_summaries.first() {
if let Some(doi) = summary
.external_ids
.external_id
.iter()
.find(|id| id.id_type == "doi")
.map(|id| crate::doi_utils::normalize_doi(&id.id_value))
.filter(|s| !s.is_empty())
{
if !dois.contains(&doi) {
dois.push(doi);
}
}
}
}
Ok(dois)
}
/// Renders `s` as a YAML double-quoted scalar.
///
/// Backslash, double quote, newline and tab are written as escape sequences.
/// Carriage returns are dropped. Any other control character is written as a
/// `\xNN` escape, because raw control characters are not permitted inside a
/// YAML document.
fn yaml_dq(s: &str) -> String {
    let mut out = String::with_capacity(s.len() + 2);
    out.push('"');
    for ch in s.chars() {
        match ch {
            '\\' => out.push_str("\\\\"),
            '"' => out.push_str("\\\""),
            '\n' => out.push_str("\\n"),
            '\t' => out.push_str("\\t"),
            // Carriage returns are deliberately removed rather than escaped.
            '\r' => {}
            // `char::is_control` only matches U+0000..=U+001F and
            // U+007F..=U+009F, so two hex digits always suffice.
            c if c.is_control() => out.push_str(&format!("\\x{:02X}", c as u32)),
            c => out.push(c),
        }
    }
    out.push('"');
    out
}

/// Renders `s` as a YAML scalar, using the plain (unquoted) form only when a
/// YAML loader would read it back as the very same string.
///
/// The value is double-quoted via `yaml_dq` when it
/// * is empty,
/// * starts with a YAML indicator character, a digit, `+` or `.` (the latter
///   three would otherwise risk being read as a number or a date),
/// * has leading or trailing whitespace (a plain scalar would lose it),
/// * ends with `:` or contains `": "` / `" #"` (mapping / comment syntax),
/// * contains a control character such as a newline or tab, or
/// * is one of the YAML 1.1 boolean/null words in any letter case
///   (`true`, `No`, `NULL`, `on`, `~`, ...) — e.g. the surname "No" must not
///   turn into `false`.
///
/// Quoting more than strictly necessary is always safe.
fn yaml_scalar(s: &str) -> String {
    const INDICATORS: &str = ":-?|>!'\"#&*{}[],%@`";
    const RESERVED_WORDS: [&str; 10] = [
        "true", "false", "null", "~", "yes", "no", "on", "off", "y", "n",
    ];

    let first = match s.chars().next() {
        Some(first) => first,
        None => return yaml_dq(s),
    };
    let needs_quotes = INDICATORS.contains(first)
        || first.is_ascii_digit()
        || matches!(first, '+' | '.')
        || s != s.trim()
        || s.ends_with(':')
        || s.contains(": ")
        || s.contains(" #")
        || s.chars().any(char::is_control)
        || RESERVED_WORDS.iter().any(|word| s.eq_ignore_ascii_case(word));

    if needs_quotes {
        yaml_dq(s)
    } else {
        s.to_string()
    }
}
/// Serializes an ORCID person record as one entry of an InvenioRDM-style names
/// YAML list and returns the UTF-8 bytes.
///
/// * `person_json` — ORCID person JSON; the function reads `path`,
///   `name.{given-names,family-name,credit-name}.value` and `biography.content`.
///   Missing values are treated as empty strings.
/// * `affiliations` — employments to list under `affiliations`; the key is
///   omitted when the slice is empty.
///
/// The display name is the credit name when present, otherwise
/// "given family" with surrounding whitespace trimmed. `description` is only
/// written for a non-empty biography.
///
/// Nested keys are indented so that they line up with the first key of their
/// list entry; without that alignment the output is not parseable YAML.
///
/// The current implementation has no failure path and always returns `Ok`.
pub fn write_inveniordm_person_yaml(
    person_json: &serde_json::Value,
    affiliations: &[PersonAffiliation],
) -> Result<Vec<u8>> {
    /// Reads `<name>.<key>.value` as a string, defaulting to `""`.
    fn name_part<'a>(name: Option<&'a serde_json::Value>, key: &str) -> &'a str {
        name.and_then(|n| n.get(key))
            .and_then(|part| part.get("value"))
            .and_then(|v| v.as_str())
            .unwrap_or("")
    }

    let name_obj = person_json.get("name");
    let given_name = name_part(name_obj, "given-names");
    let family_name = name_part(name_obj, "family-name");
    let credit_name = name_part(name_obj, "credit-name");
    let display_name = if credit_name.is_empty() {
        format!("{} {}", given_name, family_name).trim().to_string()
    } else {
        credit_name.to_string()
    };

    // `path` looks like "/0000-0000-0000-0000/person"; the id is its first segment.
    let orcid_id = person_json
        .get("path")
        .and_then(|p| p.as_str())
        .unwrap_or("")
        .trim_start_matches('/')
        .split('/')
        .next()
        .unwrap_or("");
    let biography = person_json
        .get("biography")
        .and_then(|b| b.get("content"))
        .and_then(|c| c.as_str())
        .unwrap_or("");

    let mut out = String::new();
    out.push_str(&format!("- id: {}\n", orcid_id));
    out.push_str(&format!("  name: {}\n", yaml_scalar(&display_name)));
    out.push_str(&format!("  given_name: {}\n", yaml_scalar(given_name)));
    out.push_str(&format!("  family_name: {}\n", yaml_scalar(family_name)));
    out.push_str("  identifiers:\n");
    out.push_str(&format!("    - identifier: https://orcid.org/{}\n", orcid_id));

    if !affiliations.is_empty() {
        out.push_str("  affiliations:\n");
        for aff in affiliations {
            // The first key of each entry carries the "- " marker; all further
            // keys of the same entry are aligned with it.
            match &aff.ror_id {
                Some(ror_id) => {
                    out.push_str(&format!("    - id: {}\n", ror_id));
                    out.push_str(&format!("      name: {}\n", yaml_scalar(&aff.name)));
                }
                None => {
                    out.push_str(&format!("    - name: {}\n", yaml_scalar(&aff.name)));
                }
            }
            if let Some(date) = &aff.start_date {
                // Single quotes keep a bare year or ISO date a string.
                out.push_str(&format!("      start_date: '{}'\n", date));
            }
            if let Some(date) = &aff.end_date {
                out.push_str(&format!("      end_date: '{}'\n", date));
            }
        }
    }

    if !biography.is_empty() {
        // Biographies are free text, so they are always double-quoted.
        out.push_str(&format!("  description: {}\n", yaml_dq(biography)));
    }
    Ok(out.into_bytes())
}
pub fn write_orcid_json(value: &serde_json::Value) -> Result<Vec<u8>> {
serde_json::to_vec(value).map_err(|e| Error::Serialize(e.to_string()))
}
pub fn orcid_person_to_commonmeta(
person_json: &serde_json::Value,
affiliations: &[PersonAffiliation],
) -> serde_json::Value {
let mut obj = serde_json::Map::new();
if let Some(path) = person_json.get("path").and_then(|v| v.as_str()) {
let orcid_id = path.trim_start_matches('/').split('/').next().unwrap_or("");
if !orcid_id.is_empty() {
obj.insert(
"id".to_string(),
serde_json::Value::String(format!("https://orcid.org/{orcid_id}")),
);
}
}
if let Some(name_obj) = person_json.get("name") {
let given = name_obj
.get("given-names")
.and_then(|n| n.get("value"))
.and_then(|v| v.as_str())
.unwrap_or("");
let family = name_obj
.get("family-name")
.and_then(|n| n.get("value"))
.and_then(|v| v.as_str())
.unwrap_or("");
let credit = name_obj
.get("credit-name")
.and_then(|n| n.get("value"))
.and_then(|v| v.as_str())
.unwrap_or("");
if !given.is_empty() {
obj.insert("given_name".to_string(), serde_json::Value::String(given.to_string()));
}
if !family.is_empty() {
obj.insert("family_name".to_string(), serde_json::Value::String(family.to_string()));
}
if !credit.is_empty() {
obj.insert("name".to_string(), serde_json::Value::String(credit.to_string()));
}
}
if let Some(bio) = person_json
.get("biography")
.and_then(|b| b.get("content"))
.and_then(|v| v.as_str())
.filter(|s| !s.is_empty())
{
obj.insert("description".to_string(), serde_json::Value::String(bio.to_string()));
}
if let Some(others) = person_json
.get("other-names")
.and_then(|o| o.get("other-name"))
.and_then(|v| v.as_array())
{
let names: Vec<serde_json::Value> = others
.iter()
.filter_map(|n| n.get("content").and_then(|v| v.as_str()))
.filter(|s| !s.is_empty())
.map(|s| serde_json::Value::String(s.to_string()))
.collect();
if !names.is_empty() {
obj.insert("additional_names".to_string(), serde_json::Value::Array(names));
}
}
if !affiliations.is_empty() {
let affs: Vec<serde_json::Value> = affiliations
.iter()
.map(|a| {
let mut aff = serde_json::Map::new();
if let Some(ref ror) = a.ror_id {
aff.insert(
"id".to_string(),
serde_json::Value::String(format!("https://ror.org/{ror}")),
);
}
aff.insert("name".to_string(), serde_json::Value::String(a.name.clone()));
if let Some(ref role) = a.role {
aff.insert("role".to_string(), serde_json::Value::String(role.clone()));
}
if let Some(ref date) = a.start_date {
aff.insert("start_date".to_string(), serde_json::Value::String(date.clone()));
}
if let Some(ref date) = a.end_date {
aff.insert("end_date".to_string(), serde_json::Value::String(date.clone()));
}
serde_json::Value::Object(aff)
})
.collect();
obj.insert("affiliations".to_string(), serde_json::Value::Array(affs));
}
if let Some(ext_ids) = person_json
.get("external-identifiers")
.and_then(|e| e.get("external-identifier"))
.and_then(|v| v.as_array())
{
let ids: Vec<serde_json::Value> = ext_ids
.iter()
.filter_map(|e| {
let id_type = e.get("external-id-type")?.as_str()?;
let id_value = e.get("external-id-value")?.as_str()?;
if id_value.is_empty() {
return None;
}
let (cm_type, scheme) = map_orcid_ext_id_type(id_type);
let mut id_obj = serde_json::Map::new();
id_obj.insert("identifier".to_string(), serde_json::Value::String(id_value.to_string()));
id_obj.insert("identifier_type".to_string(), serde_json::Value::String(cm_type.to_string()));
if let Some(s) = scheme {
id_obj.insert("scheme".to_string(), serde_json::Value::String(s.to_string()));
}
Some(serde_json::Value::Object(id_obj))
})
.collect();
if !ids.is_empty() {
obj.insert("identifiers".to_string(), serde_json::Value::Array(ids));
}
}
if let Some(ru) = person_json
.get("researcher-urls")
.and_then(|r| r.get("researcher-url"))
.and_then(|v| v.as_array())
{
let urls: Vec<serde_json::Value> = ru
.iter()
.filter_map(|r| {
let url = r.get("url")?.get("value")?.as_str()?;
if url.is_empty() {
return None;
}
let mut url_obj = serde_json::Map::new();
if let Some(name) = r.get("url-name").and_then(|v| v.as_str()).filter(|s| !s.is_empty()) {
url_obj.insert("name".to_string(), serde_json::Value::String(name.to_string()));
}
url_obj.insert("url".to_string(), serde_json::Value::String(url.to_string()));
Some(serde_json::Value::Object(url_obj))
})
.collect();
if !urls.is_empty() {
obj.insert("urls".to_string(), serde_json::Value::Array(urls));
}
}
if let Some(country) = person_json
.get("addresses")
.and_then(|a| a.get("address"))
.and_then(|v| v.as_array())
.and_then(|arr| arr.first())
.and_then(|a| a.get("country"))
.and_then(|c| c.get("value"))
.and_then(|v| v.as_str())
.filter(|s| !s.is_empty())
{
obj.insert("country".to_string(), serde_json::Value::String(country.to_string()));
}
obj.insert("asserted_by".to_string(), serde_json::Value::String("Author".to_string()));
serde_json::Value::Object(obj)
}
/// Maps an ORCID `external-id-type` label to a commonmeta
/// `(identifier_type, scheme)` pair.
///
/// Matching is exact and case-sensitive. Known labels map to their commonmeta
/// type with no scheme; every other label becomes `("Other", Some("Other"))`.
fn map_orcid_ext_id_type(id_type: &str) -> (&'static str, Option<&'static str>) {
    // (ORCID label, commonmeta identifier type)
    const KNOWN_TYPES: [(&str, &str); 5] = [
        ("ResearcherID", "ResearcherID"),
        ("rid", "ResearcherID"),
        ("Scopus Author ID", "ScopusID"),
        ("ISNI", "ISNI"),
        ("Wikidata", "Wikidata"),
    ];

    KNOWN_TYPES
        .iter()
        .find(|(orcid_label, _)| *orcid_label == id_type)
        .map_or(("Other", Some("Other")), |&(_, commonmeta_type)| {
            (commonmeta_type, None)
        })
}
pub fn write_commonmeta_person(
person_json: &serde_json::Value,
affiliations: &[PersonAffiliation],
works: &[Data],
) -> Result<Vec<u8>> {
let entity = orcid_person_to_commonmeta(person_json, affiliations);
let mut items = vec![entity];
for work in works {
let prepared = crate::formats::commonmeta::prepare(work);
let v = serde_json::to_value(&prepared).map_err(|e| Error::Serialize(e.to_string()))?;
items.push(v);
}
let arr = serde_json::Value::Array(items);
let bytes = serde_json::to_vec_pretty(&arr).map_err(|e| Error::Serialize(e.to_string()))?;
crate::schema_utils::json_schema_errors(&bytes, Some("commonmeta"))?;
Ok(bytes)
}
// Unit tests for the ORCID XML parsing, conversion and release helpers.
#[cfg(test)]
mod tests {
use super::*;
// Fixture: a record with only given and family names; the address, keyword
// and external-identifier containers are present but empty.
const MINIMAL_RECORD: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<record:record xmlns:common="http://www.orcid.org/ns/common"
xmlns:history="http://www.orcid.org/ns/history"
xmlns:person="http://www.orcid.org/ns/person"
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
xmlns:address="http://www.orcid.org/ns/address"
xmlns:keyword="http://www.orcid.org/ns/keyword"
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
xmlns:record="http://www.orcid.org/ns/record"
path="/0009-0007-0779-1000">
<common:orcid-identifier>
<common:uri>https://orcid.org/0009-0007-0779-1000</common:uri>
<common:path>0009-0007-0779-1000</common:path>
<common:host>orcid.org</common:host>
</common:orcid-identifier>
<history:history>
<common:last-modified-date>2023-08-20T05:55:33.757Z</common:last-modified-date>
</history:history>
<person:person path="/0009-0007-0779-1000/person">
<person:name visibility="public" path="0009-0007-0779-1000">
<personal-details:given-names>Yumi</personal-details:given-names>
<personal-details:family-name>Shin</personal-details:family-name>
</person:name>
<address:addresses path="/0009-0007-0779-1000/address"/>
<keyword:keywords path="/0009-0007-0779-1000/keywords"/>
<external-identifier:external-identifiers path="/0009-0007-0779-1000/external-identifiers"/>
</person:person>
</record:record>"#;
// Fixture: a record with one address (country BR), two keywords and two
// external identifiers (ResearcherID and Scopus Author ID).
const RICH_RECORD: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<record:record xmlns:common="http://www.orcid.org/ns/common"
xmlns:history="http://www.orcid.org/ns/history"
xmlns:person="http://www.orcid.org/ns/person"
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
xmlns:address="http://www.orcid.org/ns/address"
xmlns:keyword="http://www.orcid.org/ns/keyword"
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
xmlns:record="http://www.orcid.org/ns/record"
path="/0000-0001-8188-0000">
<common:orcid-identifier>
<common:uri>https://orcid.org/0000-0001-8188-0000</common:uri>
<common:path>0000-0001-8188-0000</common:path>
<common:host>orcid.org</common:host>
</common:orcid-identifier>
<history:history>
<common:last-modified-date>2022-05-26T07:06:31.543Z</common:last-modified-date>
</history:history>
<person:person path="/0000-0001-8188-0000/person">
<person:name visibility="public" path="0000-0001-8188-0000">
<personal-details:given-names>Ana</personal-details:given-names>
<personal-details:family-name>Souza</personal-details:family-name>
</person:name>
<address:addresses path="/0000-0001-8188-0000/address">
<address:address put-code="12345" visibility="public" path="/0000-0001-8188-0000/address/12345">
<address:country>BR</address:country>
</address:address>
</address:addresses>
<keyword:keywords path="/0000-0001-8188-0000/keywords">
<keyword:keyword put-code="111" visibility="public">
<keyword:content>bioinformatics</keyword:content>
</keyword:keyword>
<keyword:keyword put-code="222" visibility="public">
<keyword:content>genomics</keyword:content>
</keyword:keyword>
</keyword:keywords>
<external-identifier:external-identifiers path="/0000-0001-8188-0000/external-identifiers">
<external-identifier:external-identifier put-code="848742" visibility="public">
<common:external-id-type>ResearcherID</common:external-id-type>
<common:external-id-value>D-1073-2012</common:external-id-value>
<common:external-id-url>http://www.researcherid.com/rid/D-1073-2012</common:external-id-url>
</external-identifier:external-identifier>
<external-identifier:external-identifier put-code="848755" visibility="public">
<common:external-id-type>Scopus Author ID</common:external-id-type>
<common:external-id-value>36852681700</common:external-id-value>
</external-identifier:external-identifier>
</external-identifier:external-identifiers>
</person:person>
</record:record>"#;
// Fixture: an ORCID error document (locked record) instead of a person record.
const ERROR_RECORD: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<error:error xmlns:error="http://www.orcid.org/ns/error">
<error:response-code>409</error:response-code>
<error:developer-message>409 Conflict: The ORCID record is locked.</error:developer-message>
<error:user-message>The ORCID record is locked.</error:user-message>
<error:error-code>9018</error:error-code>
</error:error>"#;
// The minimal record parses; identifier, history and name are populated and
// every optional collection is empty.
#[test]
fn test_parse_minimal_record() {
let record = parse_orcid_xml(MINIMAL_RECORD.as_bytes()).unwrap();
assert_eq!(record.orcid_identifier.uri, "https://orcid.org/0009-0007-0779-1000");
assert_eq!(record.orcid_identifier.path, "0009-0007-0779-1000");
assert_eq!(record.history.last_modified_date, "2023-08-20T05:55:33.757Z");
assert_eq!(record.person.name.given_names, "Yumi");
assert_eq!(record.person.name.family_name, "Shin");
assert!(record.person.name.credit_name.is_empty());
assert!(record.person.addresses.addresses.is_empty());
assert!(record.person.keywords.keywords.is_empty());
assert!(record.person.external_identifiers.identifiers.is_empty());
}
// The rich record yields its address, both keywords (in document order) and
// both external identifiers.
#[test]
fn test_parse_rich_record() {
let record = parse_orcid_xml(RICH_RECORD.as_bytes()).unwrap();
assert_eq!(record.orcid_identifier.uri, "https://orcid.org/0000-0001-8188-0000");
assert_eq!(record.person.name.given_names, "Ana");
assert_eq!(record.person.name.family_name, "Souza");
assert_eq!(record.person.addresses.addresses.len(), 1);
assert_eq!(record.person.addresses.addresses[0].country, "BR");
assert_eq!(record.person.keywords.keywords.len(), 2);
assert_eq!(record.person.keywords.keywords[0].content, "bioinformatics");
assert_eq!(record.person.keywords.keywords[1].content, "genomics");
assert_eq!(record.person.external_identifiers.identifiers.len(), 2);
let ext = &record.person.external_identifiers.identifiers[0];
assert_eq!(ext.type_, "ResearcherID");
assert_eq!(ext.value, "D-1073-2012");
}
// An ORCID error document must not be turned into a record.
#[test]
fn test_error_record_returns_none() {
assert!(parse_orcid_xml(ERROR_RECORD.as_bytes()).is_none());
}
// Without a credit name the display name is "given family".
#[test]
fn test_display_name_given_family() {
let record = parse_orcid_xml(MINIMAL_RECORD.as_bytes()).unwrap();
assert_eq!(display_name(&record), "Yumi Shin");
}
// With the given-names element removed, only the family name remains and no
// stray whitespace is left around it.
#[test]
fn test_display_name_family_only() {
let xml = MINIMAL_RECORD.replace(
"<personal-details:given-names>Yumi</personal-details:given-names>",
"",
);
let record = parse_orcid_xml(xml.as_bytes()).unwrap();
assert_eq!(display_name(&record), "Shin");
}
// Conversion of a parsed record into the crate's `Data` representation.
#[test]
fn test_from_orcid_data() {
let record = parse_orcid_xml(RICH_RECORD.as_bytes()).unwrap();
let data = from_orcid(record);
assert_eq!(data.id, "https://orcid.org/0000-0001-8188-0000");
assert_eq!(data.type_, "Person");
assert_eq!(data.title, "Ana Souza");
assert_eq!(data.provider, "ORCID");
assert_eq!(data.identifiers.len(), 2);
assert!(data.identifiers.iter().any(|i| i.identifier_type == "ResearcherID"));
}
// Flattening of a parsed record into a people-table row; keywords are joined
// with a single space.
#[test]
fn test_serialize_to_people_row() {
let record = parse_orcid_xml(RICH_RECORD.as_bytes()).unwrap();
let row = serialize_to_people_row(&record, RICH_RECORD.as_bytes());
assert_eq!(row.id, "https://orcid.org/0000-0001-8188-0000");
assert_eq!(row.orcid, "0000-0001-8188-0000");
assert_eq!(row.given_name, "Ana");
assert_eq!(row.family_name, "Souza");
assert_eq!(row.name, "Ana Souza");
assert_eq!(row.country, "BR");
assert_eq!(row.keywords, "bioinformatics genomics");
assert!(!row.metadata.is_empty());
}
// Year and batch are taken from the release file name; an unrecognized name
// yields (0, 0).
#[test]
fn test_parse_filename_date() {
assert_eq!(
parse_filename_date("ORCID_2025_10_summaries.tar.gz"),
(2025, 10)
);
assert_eq!(parse_filename_date("bad_name.tar.gz"), (0, 0));
}
// The checked-in person JSON fixture must validate against the "orcid" schema.
#[test]
fn test_person_validates_against_orcid_schema() {
let bytes = include_bytes!("../../tests/fixtures/orcid/person_0000-0003-1419-2405.json");
let result = crate::schema_utils::json_schema_errors(bytes, Some("orcid"));
assert!(result.is_ok(), "ORCID person fixture should validate: {result:?}");
}
// Network test, skipped by default (run with `cargo test -- --ignored`).
// NOTE(review): the expected year is hard-coded to 2025 — presumably tied to
// the release requested with argument `1`; confirm this stays stable.
#[test]
#[ignore = "network"]
fn test_fetch_orcid_release() {
let release = fetch_orcid_release(1).unwrap();
assert_eq!(release.year, 2025);
assert!(release.batch > 0);
assert!(release.filename.ends_with("_summaries.tar.gz"));
assert!(release.download_url.contains("figshare.com"));
assert!(release.size_bytes > 40_000_000_000);
}
}