pub mod author_utils;
pub mod cmd;
pub mod constants;
pub mod crockford;
pub mod date_utils;
pub mod data;
pub mod doi_utils;
pub mod error;
pub mod io_utils;
mod formats;
pub mod progress;
pub mod schema_utils;
pub mod geonames;
pub mod ror_countries;
pub mod spdx;
pub mod utils;
pub mod vocabularies;
pub use data::{Citation, Data};
pub use error::{Error, Result};
pub use schema_utils::SCHEMA_JSON;
pub use formats::crossref;
pub use formats::pubmed;
pub use formats::inveniordm::PushResult;
pub use formats::ror::AffiliationMatch;
pub use formats::ror::Ror;
pub use formats::ror::RorRelease;
/// Crate version string, taken from the `CARGO_PKG_VERSION` environment
/// variable at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Forwards `from` and `input` to `formats::read` and returns its result
/// unchanged.
///
/// # Errors
/// Returns whatever error `formats::read` reports.
pub fn read(from: &str, input: &str) -> Result<Data> {
formats::read(from, input)
}
/// Reads `input` via `formats::read(from, ..)` and, on success, hands the
/// resulting record to `formats::write(to, ..)`.
///
/// # Errors
/// Returns the first error produced by either the read or the write step.
pub fn convert(from: &str, to: &str, input: &str) -> Result<Vec<u8>> {
    formats::read(from, input).and_then(|record| formats::write(to, &record))
}
/// Forwards `to` and `data` to `formats::write` and returns the produced
/// bytes.
///
/// # Errors
/// Returns whatever error `formats::write` reports.
pub fn write(to: &str, data: &Data) -> Result<Vec<u8>> {
formats::write(to, data)
}
/// Forwards all arguments to `formats::write_citation`, including the
/// optional `style` and `locale`.
///
/// # Errors
/// Returns whatever error `formats::write_citation` reports.
pub fn write_with_style(
to: &str,
data: &Data,
style: Option<&str>,
locale: Option<&str>,
) -> Result<Vec<u8>> {
formats::write_citation(to, data, style, locale)
}
/// Delegates to `formats::ror::write_json` for `data`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_ror_json(data: &Data) -> Result<Vec<u8>> {
formats::ror::write_json(data)
}
/// Delegates to `formats::ror::fetch` with `id`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_ror(id: &str) -> Result<Data> {
formats::ror::fetch(id)
}
/// Delegates to `formats::ror::write_commonmeta_org` for `data`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_ror_commonmeta(data: &Data) -> Result<Vec<u8>> {
formats::ror::write_commonmeta_org(data)
}
/// Delegates to `formats::ror::fetch_latest_ror_release` and returns the
/// `RorRelease` it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_latest_ror_release() -> Result<RorRelease> {
formats::ror::fetch_latest_ror_release()
}
/// Delegates to `formats::ror::download_release` for the given `release`.
///
/// Returns the downloaded `Ror` records together with a `bool` supplied by
/// the delegate (NOTE(review): presumably a "served from cache" flag — confirm
/// against `formats::ror`).
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn download_ror_release(release: &RorRelease) -> Result<(Vec<formats::ror::Ror>, bool)> {
formats::ror::download_release(release)
}
/// Delegates to `formats::ror::download_all`.
///
/// Returns the release descriptor, the `Ror` records and a `bool` supplied by
/// the delegate (NOTE(review): presumably a "served from cache" flag — confirm
/// against `formats::ror`).
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn download_ror_all() -> Result<(RorRelease, Vec<formats::ror::Ror>, bool)> {
formats::ror::download_all()
}
/// Delegates to `formats::ror::fetch_sqlite` with `id` and the database at
/// `db_path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_ror_sqlite(
id: &str,
db_path: &std::path::Path,
) -> Result<Data> {
formats::ror::fetch_sqlite(id, db_path)
}
/// Delegates to `formats::ror::read_sqlite`, passing the paging arguments
/// (`limit`, `offset`) and the optional `country_code` / `query` filters
/// through unchanged.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_ror_sqlite(
db_path: &std::path::Path,
limit: Option<usize>,
offset: usize,
country_code: Option<&str>,
query: Option<&str>,
) -> Result<Vec<Data>> {
formats::ror::read_sqlite(db_path, limit, offset, country_code, query)
}
/// Delegates to `formats::ror::read_sqlite_raw`; takes the same arguments as
/// [`read_ror_sqlite`] but returns `Ror` values instead of `Data`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_ror_sqlite_raw(
db_path: &std::path::Path,
limit: Option<usize>,
offset: usize,
country_code: Option<&str>,
query: Option<&str>,
) -> Result<Vec<formats::ror::Ror>> {
formats::ror::read_sqlite_raw(db_path, limit, offset, country_code, query)
}
/// Delegates to `formats::ror::sample_sqlite` with the optional `limit` and
/// `country_code`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn sample_ror_sqlite(
db_path: &std::path::Path,
limit: Option<usize>,
country_code: Option<&str>,
) -> Result<Vec<Data>> {
formats::ror::sample_sqlite(db_path, limit, country_code)
}
/// Delegates to `formats::ror::sample_sqlite_raw`; takes the same arguments
/// as [`sample_ror_sqlite`] but returns `Ror` values instead of `Data`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn sample_ror_sqlite_raw(
db_path: &std::path::Path,
limit: Option<usize>,
country_code: Option<&str>,
) -> Result<Vec<formats::ror::Ror>> {
formats::ror::sample_sqlite_raw(db_path, limit, country_code)
}
/// Delegates to `formats::ror::write_sqlite`, passing the records, the target
/// `path` and the optional `version` / `date` strings through unchanged.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_ror_sqlite(
list: &[formats::ror::Ror],
path: &std::path::Path,
version: Option<&str>,
date: Option<&str>,
) -> Result<()> {
formats::ror::write_sqlite(list, path, version, date)
}
/// Delegates to `formats::ror::fetch_installed_ror_version` for the database
/// at `db_path`; the delegate may return `None`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_installed_ror_version(db_path: &std::path::Path) -> Result<Option<String>> {
formats::ror::fetch_installed_ror_version(db_path)
}
/// Delegates to `formats::ror::rebuild_organizations_fts` for the database at
/// `path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn rebuild_organizations_fts(path: &std::path::Path) -> Result<()> {
formats::ror::rebuild_organizations_fts(path)
}
/// Delegates to `formats::vraix::fetch_installed_vraix_date` for the database
/// at `db_path`; the delegate may return `None`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_installed_vraix_date(db_path: &std::path::Path) -> Result<Option<String>> {
formats::vraix::fetch_installed_vraix_date(db_path)
}
/// Downloads the GeoNames data via `geonames::download_all` and writes it to
/// the SQLite file at `path` with `geonames::write_sqlite`.
///
/// Returns the length of the first downloaded list together with the flag
/// that `download_all` yields as its fourth tuple element.
///
/// # Errors
/// Propagates any error from the download or the write step.
pub fn install_geonames_sqlite(path: &std::path::Path, date: Option<&str>) -> Result<(usize, bool)> {
    let (places, admin1, countries, cached) = geonames::download_all()?;
    geonames::write_sqlite(&places, &admin1, &countries, path, date)?;
    Ok((places.len(), cached))
}
/// Delegates to `geonames::fetch_sqlite` with the numeric `id` and the
/// database at `db_path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_geonames_sqlite(id: i64, db_path: &std::path::Path) -> Result<Data> {
geonames::fetch_sqlite(id, db_path)
}
/// Delegates to `geonames::fetch_installed_geonames_date` for the database at
/// `db_path`; the delegate may return `None`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_installed_geonames_date(db_path: &std::path::Path) -> Result<Option<String>> {
geonames::fetch_installed_geonames_date(db_path)
}
/// Delegates to `formats::ror::fetch_raw_sqlite` and returns the `Ror` value
/// it yields for `id`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_ror_raw_sqlite(id: &str, db_path: &std::path::Path) -> Result<formats::ror::Ror> {
formats::ror::fetch_raw_sqlite(id, db_path)
}
/// Delegates to `formats::ror::fetch_raw` with `input` and returns the `Ror`
/// value it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_ror_raw(input: &str) -> Result<formats::ror::Ror> {
formats::ror::fetch_raw(input)
}
/// Delegates to `formats::ror::enrich_locations`, which receives `ror`
/// mutably along with the path to a GeoNames database.
///
/// Has no return value, so any failure inside the delegate is not visible to
/// the caller.
pub fn enrich_ror_locations(ror: &mut formats::ror::Ror, geonames_db: &std::path::Path) {
formats::ror::enrich_locations(ror, geonames_db)
}
/// Delegates to `formats::ror::write_v2_json` for `ror`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_ror_v2_json(ror: &formats::ror::Ror) -> Result<Vec<u8>> {
formats::ror::write_v2_json(ror)
}
/// Delegates to `formats::ror::match_affiliation` with the `affiliation`
/// string and returns the matches it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn match_ror_affiliation(affiliation: &str) -> Result<Vec<AffiliationMatch>> {
formats::ror::match_affiliation(affiliation)
}
/// Delegates to `formats::ror::match_affiliation_sqlite` with the
/// `affiliation` string and the database at `db_path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn match_ror_affiliation_sqlite(
affiliation: &str,
db_path: &std::path::Path,
) -> Result<Vec<AffiliationMatch>> {
formats::ror::match_affiliation_sqlite(affiliation, db_path)
}
/// Reads `input` via `formats::read(from, ..)` and renders the record with
/// `formats::write_citation`, always using the `"citation"` output format
/// together with the optional `style` and `locale`.
///
/// # Errors
/// Returns the first error produced by either the read or the render step.
pub fn convert_citation(
    from: &str,
    input: &str,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<u8>> {
    formats::read(from, input)
        .and_then(|record| formats::write_citation("citation", &record, style, locale))
}
/// Delegates to `formats::commonmeta::write_parquet_all` for `list`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_parquet(list: &[Data]) -> Result<Vec<u8>> {
formats::commonmeta::write_parquet_all(list)
}
/// Delegates to `formats::commonmeta::read_parquet_all` for `bytes`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_parquet(bytes: &[u8]) -> Result<Vec<Data>> {
formats::commonmeta::read_parquet_all(bytes)
}
/// Delegates to `formats::commonmeta::write_sqlite` with `list` and the
/// database `path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_sqlite(list: &[Data], path: &std::path::Path) -> Result<()> {
formats::commonmeta::write_sqlite(list, path)
}
/// Delegates to `formats::commonmeta::upsert_sqlite` with `list` and the
/// database `path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn upsert_sqlite(list: &[Data], path: &std::path::Path) -> Result<()> {
formats::commonmeta::upsert_sqlite(list, path)
}
/// Delegates to `formats::commonmeta::count_sqlite_works` for the database at
/// `path` and returns the count it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn count_sqlite_works(path: &std::path::Path) -> Result<usize> {
formats::commonmeta::count_sqlite_works(path)
}
/// Delegates to `formats::commonmeta::set_sqlite_setting` with the `key` /
/// `value` pair for the database at `path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn set_sqlite_setting(path: &std::path::Path, key: &str, value: &str) -> Result<()> {
formats::commonmeta::set_sqlite_setting(path, key, value)
}
/// Delegates to `formats::commonmeta::get_sqlite_setting` for `key`; the
/// delegate may return `None`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn get_sqlite_setting(path: &std::path::Path, key: &str) -> Result<Option<String>> {
formats::commonmeta::get_sqlite_setting(path, key)
}
/// Delegates to `formats::commonmeta::get_all_sqlite_settings` and returns
/// the list of string pairs it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn get_all_sqlite_settings(path: &std::path::Path) -> Result<Vec<(String, String)>> {
formats::commonmeta::get_all_sqlite_settings(path)
}
/// Delegates to `formats::commonmeta::run_migrations` for the database at
/// `path`.
///
/// The meaning of the returned `(usize, u32)` pair is defined by the delegate
/// and is not visible in this file.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn run_migrations(path: &std::path::Path) -> Result<(usize, u32)> {
formats::commonmeta::run_migrations(path)
}
/// Delegates to `formats::commonmeta::read_sqlite_commonmeta`, passing
/// `limit` and `offset` through unchanged.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_commonmeta(
path: &std::path::Path,
limit: Option<usize>,
offset: usize,
) -> Result<Vec<Data>> {
formats::commonmeta::read_sqlite_commonmeta(path, limit, offset)
}
/// Delegates to `formats::commonmeta::read_sqlite_by_id`; the delegate may
/// return `None`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_by_id(id: &str, path: &std::path::Path) -> Result<Option<Data>> {
formats::commonmeta::read_sqlite_by_id(id, path)
}
/// Delegates to `formats::commonmeta::read_sqlite_by_dois` with the list of
/// DOI strings.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_by_dois(dois: &[String], path: &std::path::Path) -> Result<Vec<Data>> {
formats::commonmeta::read_sqlite_by_dois(dois, path)
}
/// Delegates to `formats::commonmeta::read_sqlite_by_orcid` with `orcid_url`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_by_orcid(orcid_url: &str, path: &std::path::Path) -> Result<Vec<Data>> {
formats::commonmeta::read_sqlite_by_orcid(orcid_url, path)
}
/// Delegates to `formats::commonmeta::read_sqlite_by_ror` with `ror_url`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_by_ror(ror_url: &str, path: &std::path::Path) -> Result<Vec<Data>> {
formats::commonmeta::read_sqlite_by_ror(ror_url, path)
}
/// Delegates to `formats::commonmeta::read_sqlite_by_citation` with `doi`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_by_citation(doi: &str, path: &std::path::Path) -> Result<Vec<Data>> {
formats::commonmeta::read_sqlite_by_citation(doi, path)
}
pub use formats::commonmeta::JunctionTable;
/// Delegates to `formats::commonmeta::backfill_junction_tables` with the
/// selected `tables` and `providers`.
///
/// The meaning of the returned `(usize, usize)` pair is defined by the
/// delegate and is not visible in this file.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn backfill_junction_tables(path: &std::path::Path, tables: &[JunctionTable], providers: &[&str]) -> Result<(usize, usize)> {
formats::commonmeta::backfill_junction_tables(path, tables, providers)
}
/// Delegates to `formats::commonmeta::backfill_works_references` for the
/// database at `path`.
///
/// The meaning of the returned `(usize, usize)` pair is defined by the
/// delegate and is not visible in this file.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn backfill_works_references(path: &std::path::Path) -> Result<(usize, usize)> {
formats::commonmeta::backfill_works_references(path)
}
/// Delegates to `formats::commonmeta::rebuild_works_fts` for the database at
/// `path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn rebuild_works_fts(path: &std::path::Path) -> Result<()> {
formats::commonmeta::rebuild_works_fts(path)
}
/// Appends to `data.citations` one entry for every work that
/// `formats::commonmeta::read_sqlite_by_citation` returns for `data.id`.
///
/// Best-effort: the function returns silently, leaving `data` untouched, when
/// `db_path` does not exist or when the lookup fails. A work whose id already
/// appears in `data.citations` is skipped; new entries carry the work's id,
/// its `provider` as `asserted_by`, and default values for all other fields.
pub fn enrich_citations(data: &mut Data, db_path: &std::path::Path) {
    if !db_path.exists() {
        return;
    }
    let citing = match formats::commonmeta::read_sqlite_by_citation(&data.id, db_path) {
        Ok(works) => works,
        Err(_) => return,
    };
    for work in citing {
        let already_listed = data.citations.iter().any(|existing| existing.id == work.id);
        if already_listed {
            continue;
        }
        data.citations.push(Citation {
            id: work.id,
            asserted_by: work.provider,
            ..Default::default()
        });
    }
}
/// Delegates to `formats::commonmeta::read_sqlite_by_pmid`; the delegate may
/// return `None`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_by_pmid(pmid: &str, path: &std::path::Path) -> Result<Option<Data>> {
formats::commonmeta::read_sqlite_by_pmid(pmid, path)
}
/// Delegates to `formats::commonmeta::prepare` and returns the new `Data`
/// value it produces; `data` itself is only borrowed.
pub fn prepare_commonmeta(data: &Data) -> Data {
formats::commonmeta::prepare(data)
}
/// Delegates to `formats::commonmeta::read_sqlite_by_pmcid`; the delegate may
/// return `None`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_by_pmcid(pmcid: &str, path: &std::path::Path) -> Result<Option<Data>> {
formats::commonmeta::read_sqlite_by_pmcid(pmcid, path)
}
/// Resolves the DOI references of `data` into full records.
///
/// Only references whose id starts with `https://doi.org/` are considered, and
/// each distinct DOI is resolved once even if the reference list repeats it.
/// Records are looked up in this order:
///
/// 1. the SQLite database at `db_path`, when a path is given and it exists
///    (a failed lookup is treated as "nothing found");
/// 2. unless `no_network` is set, `formats::crossref::fetch` and, if that
///    fails, `formats::datacite::fetch`, for every DOI whose id was not among
///    the SQLite results.
///
/// DOIs that cannot be resolved are silently skipped. The returned records
/// are sorted by the position of their id in the reference list; records
/// whose id matches no reference DOI sort last.
pub fn fetch_reference_works(
    data: &Data,
    db_path: Option<&std::path::Path>,
    no_network: bool,
) -> Vec<Data> {
    // Deduplicate while keeping first-occurrence order, so a DOI cited twice
    // is neither fetched twice nor returned twice.
    let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
    let ref_dois: Vec<String> = data
        .references
        .iter()
        .map(|r| r.id.as_str())
        .filter(|id| id.starts_with("https://doi.org/") && seen.insert(*id))
        .map(str::to_owned)
        .collect();
    if ref_dois.is_empty() {
        return Vec::new();
    }

    // Local lookup is best-effort: a read error degrades to an empty result.
    let sqlite_works = match db_path {
        Some(path) if path.exists() => {
            formats::commonmeta::read_sqlite_by_dois(&ref_dois, path).unwrap_or_default()
        }
        _ => Vec::new(),
    };
    let sqlite_ids: std::collections::HashSet<&str> =
        sqlite_works.iter().map(|d| d.id.as_str()).collect();

    let mut network_works: Vec<Data> = Vec::new();
    if !no_network {
        for doi in &ref_dois {
            if sqlite_ids.contains(doi.as_str()) {
                continue;
            }
            let work = formats::crossref::fetch(doi).or_else(|_| formats::datacite::fetch(doi));
            if let Ok(w) = work {
                network_works.push(w);
            }
        }
    }

    // Restore reference-list order across both sources.
    let order: std::collections::HashMap<&str, usize> = ref_dois
        .iter()
        .enumerate()
        .map(|(i, doi)| (doi.as_str(), i))
        .collect();
    let mut all: Vec<Data> = sqlite_works;
    all.extend(network_works);
    all.sort_by_key(|d| order.get(d.id.as_str()).copied().unwrap_or(usize::MAX));
    all
}
/// Delegates to `formats::commonmeta::read_sqlite_by_openalex`; the delegate
/// may return `None`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_by_openalex(openalex: &str, path: &std::path::Path) -> Result<Option<Data>> {
formats::commonmeta::read_sqlite_by_openalex(openalex, path)
}
/// Delegates to `formats::commonmeta::read_sqlite_by_arxiv`; the delegate may
/// return `None`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_sqlite_by_arxiv(arxiv: &str, path: &std::path::Path) -> Result<Option<Data>> {
formats::commonmeta::read_sqlite_by_arxiv(arxiv, path)
}
pub use formats::commonmeta::{FillReport, ValidationError, ValidationReport};
/// Delegates to `formats::commonmeta::validate_sqlite`, passing every
/// argument through unchanged and returning the `ValidationReport` it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
// The wrapper mirrors the eight-parameter signature of the function it
// forwards to, which exceeds clippy's default argument-count threshold.
#[allow(clippy::too_many_arguments)]
pub fn validate_sqlite(
    path: &std::path::Path,
    provider: Option<&str>,
    work_type: Option<&str>,
    id: Option<&str>,
    has_ror_id: bool,
    limit: usize,
    fix: bool,
    recheck: bool,
) -> Result<ValidationReport> {
    formats::commonmeta::validate_sqlite(path, provider, work_type, id, has_ror_id, limit, fix, recheck)
}
/// Resolves DOI prefixes found in the SQLite database at `path` and stores
/// the results through `doi_utils::store_prefix_cache`.
///
/// Prefixes come from `collect_work_prefixes`; only those without a stored
/// `date_updated` newer than 30 days ago are looked up, in batches of 100,
/// via `fetch_doi_ra_batch`. Returns the number of (prefix, value) pairs that
/// were stored, or `0` when there is nothing to resolve.
///
/// # Errors
/// Returns `Error::Parse` when the database cannot be opened or a transaction
/// cannot be started or committed, and `Error::Http` when the HTTP client
/// cannot be built.
pub fn import_prefixes(path: &std::path::Path) -> Result<usize> {
use doi_utils::{
collect_work_prefixes, ensure_prefixes_table, fetch_doi_ra_batch, store_prefix_cache,
};
let conn = rusqlite::Connection::open(path)
.map_err(|e| Error::Parse(format!("cannot open '{}': {}", path.display(), e)))?;
// Pragmas are a tuning aid only; a failure to apply them is deliberately ignored.
let _ = conn.execute_batch("PRAGMA journal_mode=WAL; PRAGMA cache_size=-65536;");
ensure_prefixes_table(&conn);
let prefixes = collect_work_prefixes(&conn);
if prefixes.is_empty() {
return Ok(0);
}
// Entries updated after this instant are considered fresh and are skipped.
let cutoff = (chrono::Utc::now() - chrono::TimeDelta::days(30)).to_rfc3339();
let to_resolve: Vec<String> = prefixes
.into_iter()
.filter(|p| {
// Keep the prefix when the lookup fails (including "no such row") or when
// the stored timestamp is not greater than the cutoff. The comparison is a
// plain string comparison — NOTE(review): assumes stored values use the same
// RFC 3339 layout and offset as `cutoff`; confirm against `store_prefix_cache`.
conn.query_row(
r#"SELECT "date_updated" FROM prefixes WHERE "prefix" = ?1"#,
rusqlite::params![p],
|r| r.get::<_, String>(0),
)
.ok()
.filter(|d| d.as_str() > cutoff.as_str())
.is_none()
})
.collect();
if to_resolve.is_empty() {
return Ok(0);
}
let bar = progress::count_bar("prefixes", to_resolve.len() as u64);
let client = reqwest::blocking::Client::builder()
.timeout(std::time::Duration::from_secs(30))
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
const BATCH: usize = 100;
let mut resolved = 0usize;
for chunk in to_resolve.chunks(BATCH) {
let refs: Vec<&str> = chunk.iter().map(String::as_str).collect();
let pairs = fetch_doi_ra_batch(&client, &refs);
if !pairs.is_empty() {
// One transaction per batch, so all pairs of a batch are committed together.
let tx = conn
.unchecked_transaction()
.map_err(|e| Error::Parse(e.to_string()))?;
for (prefix, ra) in &pairs {
store_prefix_cache(&conn, prefix, ra);
resolved += 1;
}
tx.commit().map_err(|e| Error::Parse(e.to_string()))?;
}
// Progress advances by the batch size, regardless of how many pairs came back.
bar.inc(chunk.len() as u64);
}
bar.finish_and_clear();
Ok(resolved)
}
/// Delegates to `formats::commonmeta::fill_sqlite`, passing every argument
/// through unchanged and returning the `FillReport` it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fill_sqlite(
path: &std::path::Path,
ror_db_path: &std::path::Path,
provider: Option<&str>,
work_type: Option<&str>,
id: Option<&str>,
has_ror_id: bool,
limit: usize,
) -> Result<FillReport> {
formats::commonmeta::fill_sqlite(path, ror_db_path, provider, work_type, id, has_ror_id, limit)
}
/// Delegates to `formats::crossref::fetch_page_with_cursor`, passing every
/// argument through in the same order.
///
/// Returns the fetched records together with an optional `String`
/// (NOTE(review): presumably the cursor for the next page — confirm against
/// `formats::crossref`).
///
/// # Errors
/// Returns whatever error the delegate reports.
#[allow(clippy::too_many_arguments)]
pub fn crossref_fetch_page_with_cursor(
cursor: &str,
number: usize,
member: &str,
type_: &str,
year: &str,
orcid: &str,
ror: &str,
has_orcid: bool,
has_ror: bool,
has_references: bool,
has_relation: bool,
has_abstract: bool,
has_award: bool,
has_license: bool,
has_archive: bool,
match_ror: bool,
) -> Result<(Vec<Data>, Option<String>)> {
formats::crossref::fetch_page_with_cursor(
cursor, number, member, type_, year, orcid, ror,
has_orcid, has_ror, has_references, has_relation, has_abstract, has_award, has_license, has_archive,
match_ror,
)
}
/// Delegates to `formats::vraix::stream_dump_to_sqlite`.
///
/// Note that the `update` flag is negated before it is passed on: the
/// delegate receives `!update` as its last argument. What that last parameter
/// means is defined by the delegate and is not visible in this file.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn stream_vraix_to_sqlite(
input_path: &std::path::Path,
from: &str,
output_path: &std::path::Path,
limit: usize,
update: bool,
) -> Result<usize> {
formats::vraix::stream_dump_to_sqlite(input_path, from, output_path, limit, !update)
}
/// Delegates to `formats::vraix::stream_pidbox_to_sqlite`.
///
/// Note that the `update` flag is negated before it is passed on: the
/// delegate receives `!update` as its last argument. What that last parameter
/// means is defined by the delegate and is not visible in this file.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn stream_pidbox_to_sqlite(
input_path: &std::path::Path,
output_path: &std::path::Path,
limit: usize,
update: bool,
) -> Result<usize> {
formats::vraix::stream_pidbox_to_sqlite(input_path, output_path, limit, !update)
}
/// Delegates to `formats::vraix::flush_transport_table` for the database at
/// `path` and returns the `usize` it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn flush_dragoman_cache(path: &std::path::Path) -> Result<usize> {
formats::vraix::flush_transport_table(path)
}
/// Delegates to `formats::orcid::fetch_orcid` with `id`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_orcid(id: &str) -> Result<Data> {
formats::orcid::fetch_orcid(id)
}
/// Delegates to `formats::orcid::fetch_orcid_with_json`, which yields both a
/// `Data` record and a `serde_json::Value` for `id`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_orcid_with_json(id: &str) -> Result<(Data, serde_json::Value)> {
formats::orcid::fetch_orcid_with_json(id)
}
/// Delegates to `formats::orcid::stream_cache_orcid_to_people_sqlite` with
/// the two database paths and returns the `usize` it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn stream_cache_orcid_to_people_sqlite(
cache_path: &std::path::Path,
people_path: &std::path::Path,
) -> Result<usize> {
formats::orcid::stream_cache_orcid_to_people_sqlite(cache_path, people_path)
}
/// Delegates to `formats::orcid::fetch_person_json` with `id`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_orcid_person_json(id: &str) -> Result<serde_json::Value> {
formats::orcid::fetch_person_json(id)
}
/// Delegates to `formats::orcid::write_orcid_json` for `value`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_orcid_json(value: &serde_json::Value) -> Result<Vec<u8>> {
formats::orcid::write_orcid_json(value)
}
pub use formats::orcid::PersonAffiliation;
/// Delegates to `formats::orcid::fetch_person_employments` with `orcid_url`
/// and the optional database path.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_orcid_employments(
orcid_url: &str,
db_path: Option<&std::path::Path>,
) -> Result<Vec<PersonAffiliation>> {
formats::orcid::fetch_person_employments(orcid_url, db_path)
}
/// Delegates to `formats::orcid::fetch_person_affiliations` with `orcid_url`
/// and the optional database path.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_orcid_affiliations(
orcid_url: &str,
db_path: Option<&std::path::Path>,
) -> Result<Vec<PersonAffiliation>> {
formats::orcid::fetch_person_affiliations(orcid_url, db_path)
}
/// Delegates to `formats::orcid::fetch_person_affiliations_sqlite`.
///
/// Unlike the neighbouring wrappers this one returns a plain `Vec` rather
/// than a `Result`, so lookup failures are not reported to the caller.
pub fn fetch_orcid_affiliations_sqlite(
orcid_url: &str,
db_path: &std::path::Path,
) -> Vec<PersonAffiliation> {
formats::orcid::fetch_person_affiliations_sqlite(orcid_url, db_path)
}
/// Delegates to `formats::orcid::fetch_orcid_work_dois` with `orcid_url` and
/// returns the list of strings it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_orcid_work_dois(orcid_url: &str) -> Result<Vec<String>> {
formats::orcid::fetch_orcid_work_dois(orcid_url)
}
/// Delegates to `formats::crossref::fetch_by_orcid`, passing `limit` and
/// `page` through unchanged.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_crossref_by_orcid(orcid_url: &str, limit: usize, page: usize) -> Result<Vec<Data>> {
formats::crossref::fetch_by_orcid(orcid_url, limit, page)
}
/// Delegates to `formats::crossref::fetch_all_by_orcid` with `orcid_url`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_all_crossref_by_orcid(orcid_url: &str) -> Result<Vec<Data>> {
formats::crossref::fetch_all_by_orcid(orcid_url)
}
/// Delegates to `formats::datacite::fetch_by_orcid`, passing `limit` and
/// `page` through unchanged.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_datacite_by_orcid(orcid_url: &str, limit: usize, page: usize) -> Result<Vec<Data>> {
formats::datacite::fetch_by_orcid(orcid_url, limit, page)
}
/// Delegates to `formats::datacite::fetch_all_by_orcid` with `orcid_url`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_all_datacite_by_orcid(orcid_url: &str) -> Result<Vec<Data>> {
formats::datacite::fetch_all_by_orcid(orcid_url)
}
/// Delegates to `formats::crossref::fetch_by_ror`, passing `limit` and `page`
/// through unchanged.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_crossref_by_ror(ror_url: &str, limit: usize, page: usize) -> Result<Vec<Data>> {
formats::crossref::fetch_by_ror(ror_url, limit, page)
}
/// Delegates to `formats::datacite::fetch_by_ror`, passing `limit` and `page`
/// through unchanged.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_datacite_by_ror(ror_url: &str, limit: usize, page: usize) -> Result<Vec<Data>> {
formats::datacite::fetch_by_ror(ror_url, limit, page)
}
/// Delegates to `formats::orcid::write_commonmeta_person` with the person
/// JSON, the affiliations and the works.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_orcid_commonmeta(
person_json: &serde_json::Value,
affiliations: &[PersonAffiliation],
works: &[Data],
) -> Result<Vec<u8>> {
formats::orcid::write_commonmeta_person(person_json, affiliations, works)
}
/// Delegates to `formats::orcid::write_inveniordm_person_yaml` with the
/// person JSON and the affiliations.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_orcid_inveniordm_yaml(
person_json: &serde_json::Value,
affiliations: &[PersonAffiliation],
) -> Result<Vec<u8>> {
formats::orcid::write_inveniordm_person_yaml(person_json, affiliations)
}
/// Delegates to `formats::orcid::fetch_sqlite` with `id` and the database at
/// `db_path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_orcid_sqlite(id: &str, db_path: &std::path::Path) -> Result<Data> {
formats::orcid::fetch_sqlite(id, db_path)
}
/// Delegates to `formats::orcid::fetch_person_json_sqlite` with `id` and the
/// database at `db_path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn fetch_orcid_person_json_sqlite(id: &str, db_path: &std::path::Path) -> Result<serde_json::Value> {
formats::orcid::fetch_person_json_sqlite(id, db_path)
}
/// Delegates to `formats::orcid::import_person` with `id` and the two
/// database paths, returning the `usize` it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn import_orcid_person(
id: &str,
people_db: &std::path::Path,
works_db: &std::path::Path,
) -> Result<usize> {
formats::orcid::import_person(id, people_db, works_db)
}
pub use formats::orcid::fetch_latest_orcid_release;
pub use formats::orcid::fetch_installed_orcid_public_data_version;
/// Delegates to `formats::orcid::rebuild_people_fts` for the database at
/// `path`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn rebuild_people_fts(path: &std::path::Path) -> Result<()> {
formats::orcid::rebuild_people_fts(path)
}
/// Delegates to `formats::orcid::import_orcid_public_data`, passing every
/// argument through unchanged and returning the `usize` it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn import_orcid_public_data(
output_path: &std::path::Path,
source: Option<&str>,
no_network: bool,
sample: bool,
) -> Result<usize> {
formats::orcid::import_orcid_public_data(output_path, source, no_network, sample)
}
/// Delegates to `formats::sqlite_stream::stream_zst_pidbox_to_sqlite`.
///
/// The delegate's fourth argument is always passed as `true`; its meaning is
/// defined by the delegate and is not visible in this file.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn stream_zst_pidbox_to_sqlite(
zst_path: &std::path::Path,
output_path: &std::path::Path,
limit: usize,
) -> Result<usize> {
formats::sqlite_stream::stream_zst_pidbox_to_sqlite(zst_path, output_path, limit, true)
}
/// Delegates to `formats::pubmed::stream_pmc_ids_to_sqlite`, passing every
/// argument through unchanged and returning the `usize` it yields.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn stream_pmc_ids_to_sqlite(
gz_path: &std::path::Path,
output_path: &std::path::Path,
limit: usize,
no_network: bool,
) -> Result<usize> {
formats::pubmed::stream_pmc_ids_to_sqlite(gz_path, output_path, limit, no_network)
}
/// Renders `list` in the `to` format by calling [`write_list_citation`] with
/// no style and no locale.
///
/// # Errors
/// Returns whatever error [`write_list_citation`] reports.
pub fn write_list(list: &[Data], to: &str) -> Result<Vec<u8>> {
write_list_citation(list, to, None, None)
}
/// Renders every record in `list` in the `to` format and returns the combined
/// output.
///
/// For the formats `commonmeta`, `csl`, `datacite`, `inveniordm`, `schemaorg`,
/// `ror`, `citation`, `crossref_xml` and `datacite_xml` the whole list is
/// handed to `formats::write_all_citation` in a single call. Every other
/// format is rendered record by record with `formats::write_citation`, and
/// the per-record outputs are joined with a single `\n` (no trailing newline).
///
/// A "rendering" progress bar is shown while the function runs and is cleared
/// before returning, on success and on failure alike.
///
/// # Errors
/// Returns the first error reported by the underlying writer.
pub fn write_list_citation(
    list: &[Data],
    to: &str,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<u8>> {
    let bar = progress::count_bar("rendering", list.len() as u64);

    // Rendering lives in a closure so that every exit path, including `?`,
    // falls through to the single `finish_and_clear` below.
    let render = || -> Result<Vec<u8>> {
        if matches!(
            to,
            "commonmeta"
                | "csl"
                | "datacite"
                | "inveniordm"
                | "schemaorg"
                | "ror"
                | "citation"
                | "crossref_xml"
                | "datacite_xml"
        ) {
            return formats::write_all_citation(to, list, style, locale);
        }

        // Concatenate the raw bytes: no lossy UTF-8 round trip, so writer
        // output is passed through exactly as produced.
        let mut output: Vec<u8> = Vec::new();
        for (idx, item) in list.iter().enumerate() {
            let rendered = formats::write_citation(to, item, style, locale)?;
            if idx > 0 {
                output.push(b'\n');
            }
            output.extend_from_slice(&rendered);
            bar.inc(1);
        }
        Ok(output)
    };

    let result = render();
    bar.finish_and_clear();
    result
}
/// Splits `list` into named batches by calling [`write_archive_citation`]
/// with no style and no locale.
///
/// # Errors
/// Returns whatever error [`write_archive_citation`] reports.
pub fn write_archive(
list: &[Data],
to: &str,
base_name: &str,
batch_size: usize,
) -> Result<Vec<(String, Vec<u8>)>> {
write_archive_citation(list, to, base_name, batch_size, None, None)
}
/// Renders `list` in batches of at most `batch_size` records (a size of `0`
/// is treated as `1`) and returns one `(entry name, bytes)` pair per batch.
///
/// Each batch is rendered with [`write_list_citation`]. When everything fits
/// in one batch the entry is named `base_name`; otherwise each entry gets a
/// zero-based index suffix produced by `batch_entry_name`.
///
/// # Errors
/// Returns `Error::Serialize` when `list` is empty, or the first error
/// reported while rendering a batch.
pub fn write_archive_citation(
    list: &[Data],
    to: &str,
    base_name: &str,
    batch_size: usize,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<(String, Vec<u8>)>> {
    if list.is_empty() {
        return Err(Error::Serialize("no records to write".to_string()));
    }
    let size = batch_size.max(1);
    // More than one batch exists exactly when the list is longer than a batch.
    let numbered = list.len() > size;
    list.chunks(size)
        .enumerate()
        .map(|(idx, batch)| -> Result<(String, Vec<u8>)> {
            let bytes = write_list_citation(batch, to, style, locale)?;
            let suffix = if numbered { Some(idx) } else { None };
            Ok((batch_entry_name(base_name, suffix), bytes))
        })
        .collect()
}
/// Builds the entry name for one batch of an archive.
///
/// With `idx == None` the name is `base_name` unchanged. With `Some(i)` the
/// index is inserted before the extension as a five-digit, zero-padded suffix
/// (`out.json` → `out-00003.json`, `out` → `out-00003`). Any directory part of
/// `base_name` is kept, so numbered and un-numbered entries land in the same
/// place (`dir/out.json` → `dir/out-00003.json`).
fn batch_entry_name(base_name: &str, idx: Option<usize>) -> String {
    let Some(i) = idx else {
        return base_name.to_string();
    };
    let path = std::path::Path::new(base_name);
    let stem = path.file_stem().unwrap_or_default().to_string_lossy();
    let file_name = match path.extension() {
        // An empty extension ("out.") is treated like no extension at all.
        Some(ext) if !ext.is_empty() => {
            format!("{}-{:05}.{}", stem, i, ext.to_string_lossy())
        }
        _ => format!("{}-{:05}", stem, i),
    };
    match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => {
            dir.join(file_name).to_string_lossy().into_owned()
        }
        _ => file_name,
    }
}
/// Delegates to `formats::vraix::read_dump`, passing the SQLite path, the
/// `from` value and the paging arguments through unchanged.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn read_vraix_sqlite(
sqlite_path: &str,
from: &str,
limit: Option<usize>,
offset: usize,
) -> Result<Vec<Data>> {
formats::vraix::read_dump(sqlite_path, from, limit, offset)
}
/// Delegates to `formats::vraix::write_table_parquet` with the SQLite path
/// and `batch_size`.
///
/// # Errors
/// Returns whatever error the delegate reports.
pub fn write_vraix_table_parquet(sqlite_path: &str, batch_size: usize) -> Result<Vec<u8>> {
formats::vraix::write_table_parquet(sqlite_path, batch_size)
}
pub fn fetch_vraix_dump(
from: &str,
date: &str,
input_path: Option<&str>,
limit: Option<usize>,
offset: usize,
cache_ttl: std::time::Duration,
) -> Result<Vec<Data>> {
if let Some(path) = input_path {
return read_vraix_sqlite(path, from, limit, offset);
}
let url = format!("https://metadata.vraix.org/{}-{}.sqlite3.zst", from, date);
let cache_key = format!("{}-{}.sqlite3.zst", from, date);
let (zst_path, _from_cache) =
io_utils::ensure_cached_path(&url, "vraix", &cache_key, cache_ttl)
.map_err(|e| Error::Http(format!("failed to download '{}': {}", url, e)))?;
let tmp_path = std::env::temp_dir().join(format!(
"commonmeta-vraix-{}-{}-{}.sqlite3",
from,
date,
std::process::id()
));
io_utils::decompress_zst_file(&zst_path, &tmp_path)
.map_err(|e| Error::Parse(format!("failed to decompress '{}': {}", url, e)))?;
let result = read_vraix_sqlite(tmp_path.to_str().unwrap(), from, limit, offset);
std::fs::remove_file(&tmp_path).ok();
result
}
/// Delegates to `formats::inveniordm::upsert_all` with the records, `host`
/// and `token`, returning the list of `PushResult` values it yields.
pub fn push_inveniordm(list: &[Data], host: &str, token: &str) -> Vec<PushResult> {
formats::inveniordm::upsert_all(list, host, token)
}
/// Delegates to `formats::inveniordm::upsert` with a single record, `host`
/// and `token`, returning the `PushResult` it yields.
pub fn put_inveniordm(data: &Data, host: &str, token: &str) -> PushResult {
formats::inveniordm::upsert(data, host, token)
}
/// Parses `args` as the `commonmeta` command line and runs the selected
/// subcommand.
///
/// The subcommands are registered from the `cmd::*` modules. Parsing goes
/// through clap's `get_matches_from`. When no subcommand is given, or its name
/// matches none of the arms below, the function returns `Ok(())` without
/// doing anything.
///
/// # Errors
/// Returns the error string produced by the executed subcommand.
pub fn run_cli(args: Vec<String>) -> std::result::Result<(), String> {
    let app = clap::Command::new("commonmeta")
        .version(env!("CARGO_PKG_VERSION"))
        .author("Front Matter <info@front-matter.de>")
        .about("Commonmeta")
        .subcommand(cmd::convert::command())
        .subcommand(cmd::decode::command())
        .subcommand(cmd::dump::command())
        .subcommand(cmd::encode::command())
        .subcommand(cmd::import::command())
        .subcommand(cmd::list::command())
        .subcommand(cmd::r#match::command())
        .subcommand(cmd::migrate::command())
        .subcommand(cmd::push::command())
        .subcommand(cmd::put::command())
        .subcommand(cmd::settings::command())
        .subcommand(cmd::validate::command());
    let matches = app.get_matches_from(args);

    let Some((name, sub)) = matches.subcommand() else {
        return Ok(());
    };
    match name {
        "convert" => cmd::convert::execute(sub),
        "decode" => cmd::decode::execute(sub),
        // NOTE(review): the `cmd::dump` module is dispatched under the name
        // "package" — confirm this matches the name set in `cmd::dump::command()`.
        "package" => cmd::dump::execute(sub),
        "encode" => cmd::encode::execute(sub),
        "import" => cmd::import::execute(sub),
        "list" => cmd::list::execute(sub),
        "match" => cmd::r#match::execute(sub),
        "migrate" => cmd::migrate::execute(sub),
        "push" => cmd::push::execute(sub),
        "put" => cmd::put::execute(sub),
        "settings" => cmd::settings::execute(sub),
        "validate" => cmd::validate::execute(sub),
        _ => Ok(()),
    }
}
// Unit tests for the list/archive writers and the local-file path of
// `fetch_vraix_dump`.
#[cfg(test)]
mod tests {
use super::*;
// Builds a minimal `JournalArticle` record with the given id; all other
// fields keep their defaults.
fn sample_data(id: &str) -> Data {
Data {
id: id.to_string(),
type_: "JournalArticle".to_string(),
..Data::default()
}
}
// "commonmeta" output for a list must parse as a JSON array with one element
// per record.
#[test]
fn test_write_list_json_array_formats() {
let list = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
let bytes = write_list(&list, "commonmeta").unwrap();
let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
assert_eq!(value.as_array().unwrap().len(), 2);
}
// "ris" is rendered record by record; each record contributes one "TY -" line.
#[test]
fn test_write_list_newline_joined_formats() {
let list = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
let bytes = write_list(&list, "ris").unwrap();
let text = String::from_utf8(bytes).unwrap();
assert_eq!(text.lines().filter(|l| l.starts_with("TY -")).count(), 2);
}
// "crossref_xml" must wrap all records in a single <doi_batch> element.
#[test]
fn test_write_list_crossref_xml_batches_into_one_doi_batch() {
let list = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
let bytes = write_list(&list, "crossref_xml").unwrap();
let text = String::from_utf8(bytes).unwrap();
assert_eq!(text.matches("<doi_batch xmlns=").count(), 1);
assert_eq!(text.matches("<journal_article").count(), 2);
}
// "ror" output for a list must parse as a JSON array with one element per
// record.
#[test]
fn test_write_list_ror_uses_json_array_batch_writer() {
let mut a = sample_data("https://ror.org/0342dzm54");
a.title = "Org A".to_string();
let mut b = sample_data("https://ror.org/0521rfr06");
b.title = "Org B".to_string();
let bytes = write_list(&[a, b], "ror").unwrap();
let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
assert_eq!(value.as_array().unwrap().len(), 2);
}
// "citation" output has one line per record, in input order.
#[test]
fn test_write_list_citation_renders_each_record() {
let mut a = sample_data("https://doi.org/10.1/a");
a.title = "Title A".to_string();
a.date_published = "2020".to_string();
let mut b = sample_data("https://doi.org/10.1/b");
b.title = "Title B".to_string();
b.date_published = "2021".to_string();
let bytes = write_list(&[a, b], "citation").unwrap();
let text = String::from_utf8(bytes).unwrap();
let lines: Vec<&str> = text.lines().collect();
assert_eq!(lines.len(), 2);
assert!(lines[0].contains("Title A"));
assert!(lines[1].contains("Title B"));
}
// Passing an explicit style must change the rendered citation compared with
// passing no style.
#[test]
fn test_write_list_citation_respects_style() {
let mut a = sample_data("https://doi.org/10.1/a");
a.title = "Title A".to_string();
a.date_published = "2020".to_string();
let apa = write_list_citation(&[a.clone()], "citation", None, None).unwrap();
let chicago =
write_list_citation(&[a], "citation", Some("chicago-author-date"), None).unwrap();
assert_ne!(apa, chicago);
}
// A list that fits into one batch yields a single entry named exactly like
// the base name.
#[test]
fn test_write_archive_single_batch_uses_base_name() {
let list = vec![sample_data("https://doi.org/10.1/a")];
let entries = write_archive(&list, "commonmeta", "out.json", 100_000).unwrap();
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].0, "out.json");
}
// Multiple batches get a zero-based, five-digit index inserted before the
// extension.
#[test]
fn test_write_archive_numbered_batches() {
let list = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
sample_data("https://doi.org/10.1/c"),
];
let entries = write_archive(&list, "commonmeta", "out.json", 1).unwrap();
assert_eq!(entries.len(), 3);
assert_eq!(entries[0].0, "out-00000.json");
assert_eq!(entries[1].0, "out-00001.json");
assert_eq!(entries[2].0, "out-00002.json");
}
// Without an extension the index is simply appended to the base name.
#[test]
fn test_write_archive_no_extension_base_name() {
let list = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
let entries = write_archive(&list, "commonmeta", "out", 1).unwrap();
assert_eq!(entries[0].0, "out-00000");
assert_eq!(entries[1].0, "out-00001");
}
// An empty list is rejected with an error.
#[test]
fn test_write_archive_empty_list_errors() {
assert!(write_archive(&[], "commonmeta", "out.json", 100_000).is_err());
}
// With `input_path` set, `fetch_vraix_dump` reads the local SQLite file; the
// fixture below is a one-row `works` table created on the fly.
#[test]
fn test_fetch_vraix_dump_uses_local_input_path_without_network() {
let dir = std::env::temp_dir().join("commonmeta_lib_fetch_vraix_dump");
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("datacite.sqlite3");
// Remove any leftover from a previous run so the table can be created afresh.
std::fs::remove_file(&path).ok();
{
// Scoped so the connection is dropped before the file is read back.
let conn = rusqlite::Connection::open(&path).unwrap();
conn.execute_batch("CREATE TABLE works (pid TEXT, source_id INTEGER, raw_metadata TEXT);")
.unwrap();
conn.execute(
"INSERT INTO works (pid, source_id, raw_metadata) VALUES (?1, ?2, ?3)",
rusqlite::params![
"pid-0",
1i64,
r#"{"data":{"id":"10.5678/b","attributes":{"doi":"10.5678/b"}}}"#
],
)
.unwrap();
}
let data = fetch_vraix_dump(
"datacite",
"2026-06-14",
Some(path.to_str().unwrap()),
None,
0,
std::time::Duration::from_secs(30 * 24 * 60 * 60),
)
.unwrap();
assert_eq!(data.len(), 1);
assert_eq!(data[0].id, "https://doi.org/10.5678/b");
std::fs::remove_dir_all(&dir).ok();
}
}