pub mod crockford;
pub mod data;
pub mod doi_utils;
pub mod error;
pub mod file_utils;
pub mod progress;
pub mod schema_utils;
mod formats;
pub mod traits;
pub mod utils;
pub mod vocab;
pub use data::Data;
pub use error::{Error, Result};
pub use formats::crossref;
pub use formats::inveniordm::PushResult;
pub use formats::ror::AffiliationMatch;
/// Version of this crate, captured at compile time from Cargo's `CARGO_PKG_VERSION`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Reads a single record into a [`Data`] value.
///
/// This is a thin public entry point: `from` and `input` are handed unchanged to
/// `formats::read`, and its result (including any error) is returned as-is.
/// NOTE(review): `from` presumably names the source format and `input` is the
/// payload or identifier to read — confirm against `formats::read`.
pub fn read(from: &str, input: &str) -> Result<Data> {
formats::read(from, input)
}
/// Converts `input` from the `from` format to the `to` format in one call.
///
/// The input is first read with `formats::read`; the resulting record is then
/// serialized with `formats::write`. A failure in either stage is returned
/// unchanged, and the write stage is skipped when reading fails.
pub fn convert(from: &str, to: &str, input: &str) -> Result<Vec<u8>> {
    formats::read(from, input).and_then(|record| formats::write(to, &record))
}
/// Serializes one [`Data`] record to bytes.
///
/// Thin public entry point: `to` and `data` are passed unchanged to
/// `formats::write`, whose result (bytes or error) is returned as-is.
/// NOTE(review): `to` presumably names the target format — confirm against
/// `formats::write`.
pub fn write(to: &str, data: &Data) -> Result<Vec<u8>> {
formats::write(to, data)
}
/// Serializes one [`Data`] record through the ROR writer.
///
/// Delegates directly to `formats::ror::write_json` and returns its result
/// (bytes or error) unchanged.
pub fn write_ror_json(data: &Data) -> Result<Vec<u8>> {
formats::ror::write_json(data)
}
/// Looks up ROR matches for an affiliation string.
///
/// Delegates directly to `formats::ror::match_affiliation` and returns its
/// list of [`AffiliationMatch`] values (or its error) unchanged.
/// NOTE(review): whether this performs network access or uses local data is
/// not visible here — confirm in `formats::ror`.
pub fn match_ror_affiliation(affiliation: &str) -> Result<Vec<AffiliationMatch>> {
formats::ror::match_affiliation(affiliation)
}
/// Reads `input` in the `from` format and renders it as a formatted citation.
///
/// The record is read with `formats::read` and then passed to
/// `formats::write_citation` with the fixed target name `"citation"`, forwarding
/// the optional `style` and `locale` unchanged. A failure in either stage is
/// returned as-is.
pub fn convert_citation(
    from: &str,
    input: &str,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<u8>> {
    formats::read(from, input)
        .and_then(|record| formats::write_citation("citation", &record, style, locale))
}
/// Serializes a slice of [`Data`] records to Parquet bytes.
///
/// Delegates directly to `formats::commonmeta::write_parquet_all` and returns
/// its result unchanged.
pub fn write_parquet(list: &[Data]) -> Result<Vec<u8>> {
formats::commonmeta::write_parquet_all(list)
}
/// Reads [`Data`] records back from Parquet bytes.
///
/// Delegates directly to `formats::commonmeta::read_parquet_all` and returns
/// its result unchanged.
pub fn read_parquet(bytes: &[u8]) -> Result<Vec<Data>> {
formats::commonmeta::read_parquet_all(bytes)
}
/// Serializes a list of records to the `to` format using default citation settings.
///
/// Equivalent to calling [`write_list_citation`] with `style` and `locale`
/// both set to `None`.
pub fn write_list(list: &[Data], to: &str) -> Result<Vec<u8>> {
write_list_citation(list, to, None, None)
}
/// Serializes a list of records to the `to` format, one rendered item per record.
///
/// Each record is rendered with `formats::write_citation(to, item, style, locale)`.
/// How the rendered items are combined depends on `to`:
///
/// * `"commonmeta"`, `"csl"`, `"datacite"`, `"inveniordm"`, `"schemaorg"`, `"ror"`:
///   every rendered item is parsed back as JSON and the items are emitted as
///   one pretty-printed JSON array (`[]` for an empty list).
/// * any other value: the rendered items are decoded as UTF-8 (lossily) and
///   joined with a single `\n` between consecutive items.
///
/// A `"rendering"` progress bar from `progress::count_bar` is advanced once per
/// record and cleared before returning, on success and on error alike.
///
/// # Errors
///
/// Returns the first error produced by `formats::write_citation`, or
/// `Error::Serialize` when a rendered item is not valid JSON for one of the
/// JSON formats or when the final array cannot be serialized.
pub fn write_list_citation(
    list: &[Data],
    to: &str,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<u8>> {
    let bar = progress::count_bar("rendering", list.len() as u64);
    let as_json_array =
        matches!(to, "commonmeta" | "csl" | "datacite" | "inveniordm" | "schemaorg" | "ror");

    // All fallible work runs inside this closure so that an early `?` return
    // cannot skip the progress-bar cleanup below.
    let render_all = || -> Result<Vec<u8>> {
        if as_json_array {
            let mut items: Vec<serde_json::Value> = Vec::with_capacity(list.len());
            for item in list {
                let rendered = formats::write_citation(to, item, style, locale)?;
                let value: serde_json::Value = serde_json::from_slice(&rendered).map_err(|e| {
                    Error::Serialize(format!("failed to parse {} output as JSON: {}", to, e))
                })?;
                items.push(value);
                bar.inc(1);
            }
            return serde_json::to_vec_pretty(&items).map_err(|e| Error::Serialize(e.to_string()));
        }

        let mut output = String::new();
        for (idx, item) in list.iter().enumerate() {
            let rendered = formats::write_citation(to, item, style, locale)?;
            // Separator goes between items only, never before the first or after the last.
            if idx > 0 {
                output.push('\n');
            }
            output.push_str(&String::from_utf8_lossy(&rendered));
            bar.inc(1);
        }
        Ok(output.into_bytes())
    };

    let result = render_all();
    // Clear the bar on every exit path; previously a failed record left it uncleared.
    bar.finish_and_clear();
    result
}
/// Splits `list` into batches and serializes each batch as a named archive entry,
/// using default citation settings.
///
/// Equivalent to calling [`write_archive_citation`] with `style` and `locale`
/// both set to `None`; see that function for batching and naming rules.
pub fn write_archive(
list: &[Data],
to: &str,
base_name: &str,
batch_size: usize,
) -> Result<Vec<(String, Vec<u8>)>> {
write_archive_citation(list, to, base_name, batch_size, None, None)
}
/// Splits `list` into batches and serializes each batch as a named archive entry.
///
/// The list is cut into consecutive chunks of at most `batch_size` records
/// (a `batch_size` of `0` is treated as `1`). Each chunk is serialized with
/// [`write_list_citation`] and paired with an entry name from
/// `batch_entry_name`: when everything fits into one chunk the name is
/// `base_name` itself, otherwise each chunk gets a zero-based numbered name.
///
/// # Errors
///
/// Returns `Error::Serialize` when `list` is empty, or the first error
/// reported while serializing a chunk.
pub fn write_archive_citation(
    list: &[Data],
    to: &str,
    base_name: &str,
    batch_size: usize,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<(String, Vec<u8>)>> {
    if list.is_empty() {
        return Err(Error::Serialize("no records to write".to_string()));
    }

    let size = batch_size.max(1);
    // More than one chunk exists exactly when the list is longer than one chunk.
    let numbered = list.len() > size;

    list.chunks(size)
        .enumerate()
        .map(|(position, batch)| -> Result<(String, Vec<u8>)> {
            let payload = write_list_citation(batch, to, style, locale)?;
            let suffix = if numbered { Some(position) } else { None };
            Ok((batch_entry_name(base_name, suffix), payload))
        })
        .collect()
}
/// Builds the archive entry name for one batch.
///
/// * `idx == None` (single batch): `base_name` is returned unchanged.
/// * `idx == Some(i)`: `-{i}` zero-padded to five digits is inserted between
///   the file stem and the extension, e.g. `out.json` -> `out-00003.json` and
///   `out` -> `out-00003`.
///
/// Any directory prefix of `base_name` is kept verbatim
/// (`dir/out.json` -> `dir/out-00003.json`), so numbered entries land in the
/// same location as the un-numbered single-batch entry would.
fn batch_entry_name(base_name: &str, idx: Option<usize>) -> String {
    let i = match idx {
        None => return base_name.to_string(),
        Some(i) => i,
    };

    let path = std::path::Path::new(base_name);
    let stem = path.file_stem().unwrap_or_default().to_string_lossy();
    // An empty extension (e.g. `out.`) is treated like no extension at all.
    let ext = path
        .extension()
        .map(|e| e.to_string_lossy())
        .filter(|e| !e.is_empty());

    // Everything in `base_name` before its final path component. Slicing the
    // original string (rather than re-joining path components) keeps the
    // caller's separators exactly as given.
    let dir_prefix = path
        .file_name()
        .and_then(|name| name.to_str())
        .and_then(|name| base_name.rfind(name))
        .map_or("", |start| &base_name[..start]);

    match ext {
        Some(ext) => format!("{}{}-{:05}.{}", dir_prefix, stem, i, ext),
        None => format!("{}{}-{:05}", dir_prefix, stem, i),
    }
}
/// Reads records from a local VRAIX SQLite dump file.
///
/// Delegates directly to `formats::vraix::read_dump`, forwarding all four
/// arguments unchanged, and returns its result as-is.
/// NOTE(review): `limit`/`offset` presumably page through the dump's rows and
/// `from` presumably selects how each row is interpreted — confirm against
/// `formats::vraix::read_dump`.
pub fn read_vraix_sqlite(sqlite_path: &str, from: &str, limit: Option<usize>, offset: usize) -> Result<Vec<Data>> {
formats::vraix::read_dump(sqlite_path, from, limit, offset)
}
pub fn fetch_vraix_dump(
from: &str,
date: &str,
input_path: Option<&str>,
limit: Option<usize>,
offset: usize,
cache_ttl: std::time::Duration,
) -> Result<Vec<Data>> {
if let Some(path) = input_path {
return read_vraix_sqlite(path, from, limit, offset);
}
let url = format!("https://metadata.vraix.org/{}-{}.sqlite3.zst", from, date);
let cache_key = format!("{}-{}.sqlite3.zst", from, date);
let (compressed, _from_cache) = file_utils::download_file_cached(&url, "vraix", &cache_key, cache_ttl)
.map_err(|e| Error::Http(format!("failed to download '{}': {}", url, e)))?;
let decompressed = file_utils::unzst_content(&compressed)
.map_err(|e| Error::Parse(format!("failed to decompress '{}': {}", url, e)))?;
let tmp_path = std::env::temp_dir()
.join(format!("commonmeta-vraix-{}-{}-{}.sqlite3", from, date, std::process::id()));
file_utils::write_file(&tmp_path, &decompressed)
.map_err(|e| Error::Parse(format!("failed to write temp file '{}': {}", tmp_path.display(), e)))?;
let result = read_vraix_sqlite(tmp_path.to_str().unwrap(), from, limit, offset);
std::fs::remove_file(&tmp_path).ok();
result
}
/// Pushes a list of records to an InvenioRDM instance.
///
/// Delegates directly to `formats::inveniordm::upsert_all`, forwarding `list`,
/// `host` and `token` unchanged, and returns its [`PushResult`] values as-is.
/// Failures are not surfaced through a `Result` here.
/// NOTE(review): presumably one `PushResult` per input record, each carrying
/// its own success/failure state — confirm in `formats::inveniordm`.
pub fn push_inveniordm(list: &[Data], host: &str, token: &str) -> Vec<PushResult> {
formats::inveniordm::upsert_all(list, host, token)
}
/// Pushes a single record to an InvenioRDM instance.
///
/// Delegates directly to `formats::inveniordm::upsert`, forwarding `data`,
/// `host` and `token` unchanged, and returns its [`PushResult`] as-is.
/// Failures are not surfaced through a `Result` here.
pub fn put_inveniordm(data: &Data, host: &str, token: &str) -> PushResult {
formats::inveniordm::upsert(data, host, token)
}
// Unit tests for the list/archive writers and the local-file path of
// `fetch_vraix_dump`, exercised through the crate's public functions.
#[cfg(test)]
mod tests {
use super::*;
// Minimal record: only `id` and `type_` are set, everything else is default.
fn sample_data(id: &str) -> Data {
Data { id: id.to_string(), type_: "JournalArticle".to_string(), ..Data::default() }
}
// "commonmeta" is one of the JSON formats: two records must come back as a
// JSON array with two elements.
#[test]
fn test_write_list_json_array_formats() {
let list = vec![sample_data("https://doi.org/10.1/a"), sample_data("https://doi.org/10.1/b")];
let bytes = write_list(&list, "commonmeta").unwrap();
let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
assert_eq!(value.as_array().unwrap().len(), 2);
}
// "ris" is not a JSON format: the output is text in which each record
// contributes exactly one line starting with "TY -".
#[test]
fn test_write_list_newline_joined_formats() {
let list = vec![sample_data("https://doi.org/10.1/a"), sample_data("https://doi.org/10.1/b")];
let bytes = write_list(&list, "ris").unwrap();
let text = String::from_utf8(bytes).unwrap();
assert_eq!(text.lines().filter(|l| l.starts_with("TY -")).count(), 2);
}
// "citation" output is expected to be one line per record, in input order,
// each line containing that record's title.
#[test]
fn test_write_list_citation_renders_each_record() {
let mut a = sample_data("https://doi.org/10.1/a");
a.titles.push(crate::data::Title { title: "Title A".to_string(), ..Default::default() });
a.date.published = "2020".to_string();
let mut b = sample_data("https://doi.org/10.1/b");
b.titles.push(crate::data::Title { title: "Title B".to_string(), ..Default::default() });
b.date.published = "2021".to_string();
let bytes = write_list(&[a, b], "citation").unwrap();
let text = String::from_utf8(bytes).unwrap();
let lines: Vec<&str> = text.lines().collect();
assert_eq!(lines.len(), 2);
assert!(lines[0].contains("Title A"));
assert!(lines[1].contains("Title B"));
}
// Rendering the same record with no style and with "chicago-author-date"
// must produce different bytes, i.e. the `style` argument is honoured.
#[test]
fn test_write_list_citation_respects_style() {
let mut a = sample_data("https://doi.org/10.1/a");
a.titles.push(crate::data::Title { title: "Title A".to_string(), ..Default::default() });
a.date.published = "2020".to_string();
let apa = write_list_citation(&[a.clone()], "citation", None, None).unwrap();
let chicago = write_list_citation(&[a], "citation", Some("chicago-author-date"), None).unwrap();
assert_ne!(apa, chicago);
}
// When all records fit into one batch, the single entry keeps `base_name`.
#[test]
fn test_write_archive_single_batch_uses_base_name() {
let list = vec![sample_data("https://doi.org/10.1/a")];
let entries = write_archive(&list, "commonmeta", "out.json", 100_000).unwrap();
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].0, "out.json");
}
// With a batch size of 1, three records yield three entries numbered from
// zero, with the number inserted before the extension.
#[test]
fn test_write_archive_numbered_batches() {
let list = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
sample_data("https://doi.org/10.1/c"),
];
let entries = write_archive(&list, "commonmeta", "out.json", 1).unwrap();
assert_eq!(entries.len(), 3);
assert_eq!(entries[0].0, "out-00000.json");
assert_eq!(entries[1].0, "out-00001.json");
assert_eq!(entries[2].0, "out-00002.json");
}
// A base name without an extension gets the number appended with no
// trailing dot.
#[test]
fn test_write_archive_no_extension_base_name() {
let list = vec![sample_data("https://doi.org/10.1/a"), sample_data("https://doi.org/10.1/b")];
let entries = write_archive(&list, "commonmeta", "out", 1).unwrap();
assert_eq!(entries[0].0, "out-00000");
assert_eq!(entries[1].0, "out-00001");
}
// An empty record list is rejected with an error rather than producing an
// empty archive.
#[test]
fn test_write_archive_empty_list_errors() {
assert!(write_archive(&[], "commonmeta", "out.json", 100_000).is_err());
}
// Supplying `input_path` makes `fetch_vraix_dump` read the given SQLite file
// directly; the download branch is never reached.
#[test]
fn test_fetch_vraix_dump_uses_local_input_path_without_network() {
// Build a throwaway SQLite file with a `works` table holding one row.
let dir = std::env::temp_dir().join("commonmeta_lib_fetch_vraix_dump");
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("datacite.sqlite3");
// Remove any leftover file from a previous run so CREATE TABLE succeeds.
std::fs::remove_file(&path).ok();
let connection = rusqlite::Connection::open(&path).unwrap();
connection
.execute_batch("CREATE TABLE works (pid TEXT, source_id INTEGER, raw_metadata TEXT);")
.unwrap();
connection
.execute(
"INSERT INTO works (pid, source_id, raw_metadata) VALUES (?1, ?2, ?3)",
rusqlite::params![
"pid-0",
1i64,
r#"{"data":{"id":"10.5678/b","attributes":{"doi":"10.5678/b"}}}"#
],
)
.unwrap();
let data = fetch_vraix_dump(
"datacite",
"2026-06-14",
Some(path.to_str().unwrap()),
None,
0,
std::time::Duration::from_secs(30 * 24 * 60 * 60),
)
.unwrap();
// The single row is returned, with its DOI expanded to a full URL.
assert_eq!(data.len(), 1);
assert_eq!(data[0].id, "https://doi.org/10.5678/b");
// Best-effort cleanup of the fixture directory.
std::fs::remove_dir_all(&dir).ok();
}
}