use std::path::{Path, PathBuf};
use std::time::Instant;
use clap::{Arg, ArgAction, ArgMatches, Command};
use serde_json::json;
use url::Url;
use crate::{Data};
use crate::constants::{cm_to_cr_filter_type, cm_to_dc, lookup_work_type};
use crate::io_utils;
/// Batch size handed to `crate::write_archive_citation` by `write_archive_batches`
/// when records are packed into a `.zip`/`.tgz` archive.
const BATCH_SIZE: usize = 100_000;
/// Returns the directory to use for temporary files.
///
/// The `COMMONMETA_TEMP_DIR` environment variable takes precedence when it is
/// set; otherwise the platform default from [`std::env::temp_dir`] is used.
fn temp_dir() -> PathBuf {
    match std::env::var_os("COMMONMETA_TEMP_DIR") {
        Some(configured) => PathBuf::from(configured),
        None => std::env::temp_dir(),
    }
}
/// Formats the "wrote ..." status line printed after a SQLite write.
///
/// When `total` is known and differs from `written`, the line reports both the
/// number of upserted rows and the resulting total; in every other case it
/// reports `written` as a plain record count.
pub(crate) fn fmt_wrote_sqlite(path: &str, written: usize, total: Option<usize>) -> String {
    if let Some(overall) = total.filter(|&count| count != written) {
        return format!("wrote {} ({} upserted, {} total)", path, written, overall);
    }
    format!("wrote {} ({} records)", path, written)
}
use crate::cmd::{resolve_db_path, VRAIX_CACHE_TTL};
pub fn command() -> Command {
Command::new("list")
.about("A list of scholarly metadata")
.long_about(
"A list of scholarly metadata retrieved via file or API.\n\n\
Examples:\n\n\
commonmeta list --number 10 --member 78 --type JournalArticle --from crossref\n\
commonmeta list --number 10 --client cern.zenodo --type Dataset --from datacite\n\
commonmeta list --number 10 --from openalex --type JournalArticle\n\
commonmeta list --from crossref --file out.json\n\
commonmeta list --from crossref --to citation --style chicago-author-date\n\
(--to accepts every format --to accepts in convert: commonmeta, csl, datacite,\n\
inveniordm, schemaorg, ror, bibtex, ris, crossref_xml, citation; --style/--locale\n\
only affect --to citation, same as convert)\n\
commonmeta list --from crossref --number 1000 --file out.parquet\n\
(a .parquet --file extension selects Parquet output and is only supported for\n\
--to commonmeta, the default; output is always zstd-compressed, with records\n\
split into batches of 100,000 written in parallel, e.g. out-00000.parquet.zst, ...)\n\
commonmeta list --from crossref --number 1000 --file out.parquet.zip\n\
(combining .parquet with .zip/.tgz packs the same zstd-compressed batches\n\
into a single archive instead of writing them loose to disk)\n\
commonmeta list batch-commonmeta-00000.parquet.zst --to csl\n\
(a .parquet/.parquet.zst input is auto-detected as a commonmeta Parquet dump\n\
written by --file *.parquet, regardless of --from, and read back into a list)\n\
commonmeta list --from crossref --date 2026-06-14\n\
commonmeta list --from datacite --date 2026-06-14\n\
commonmeta list datacite-2026-06-14.sqlite3.zst --from datacite\n\
(--date pairs with --from crossref or --from datacite to read a VRAIX daily\n\
dump SQLite database; --from picks both the dump file, {from}-{date}.sqlite3.zst,\n\
and how its rows are parsed. With no input path, the file is downloaded from\n\
metadata.vraix.org and decompressed. A .sqlite3/.sqlite3.zst input path is\n\
auto-detected the same way --file *.parquet is, so --date can be omitted when\n\
pointing directly at an already-downloaded dump; it's only needed to name the\n\
file to download when no input path is given)\n\
commonmeta list --from crossref --date 2026-06-14 --number 0 --file out.zip\n\
commonmeta list --from datacite --date 2026-06-14 --number 0 --file out.tgz\n\
(a .zip/.tgz --file extension archives the records as multiple batched\n\
entries of 100,000 records each instead of one in-memory buffer, useful with\n\
--number 0, which fetches every row in a VRAIX dump)",
)
.arg(
Arg::new("input")
.help("Optional input file path (JSON/JSONL, Parquet, or SQLite with --date)")
.required(false)
.index(1),
)
.arg(
Arg::new("from")
.long("from")
.short('f')
.help("Input source format")
.default_value("commonmeta"),
)
.arg(
Arg::new("to")
.long("to")
.short('t')
.help("Output format")
.default_value("commonmeta"),
)
.arg(
Arg::new("number")
.long("number")
.help("Number of records to fetch")
.value_parser(clap::value_parser!(usize))
.default_value("10"),
)
.arg(
Arg::new("page")
.long("page")
.help("Page number (1-based)")
.value_parser(clap::value_parser!(usize))
.default_value("1"),
)
.arg(Arg::new("member").long("member").help("Crossref member ID"))
.arg(Arg::new("client").long("client").help("DataCite client ID"))
.arg(Arg::new("type").long("type").help("Work type filter"))
.arg(Arg::new("year").long("year").help("Publication year"))
.arg(
Arg::new("language")
.long("language")
.help("Language filter"),
)
.arg(Arg::new("orcid").long("orcid").help("Filter by ORCID"))
.arg(Arg::new("ror").long("ror").help("Filter by ROR"))
.arg(Arg::new("cited-by").long("cited-by").help("Return works that cite the given DOI (requires commonmeta SQLite input)"))
.arg(Arg::new("affiliation").long("affiliation").help("Affiliation name filter"))
.arg(Arg::new("country").long("country").help("Country code filter"))
.arg(Arg::new("date-updated").long("date-updated").help("Filter by date updated (YYYY-MM-DD)"))
.arg(Arg::new("from-host").long("from-host").help("InvenioRDM source host"))
.arg(Arg::new("from-token").long("from-token").help("InvenioRDM source API token"))
.arg(Arg::new("community").long("community").help("InvenioRDM community slug"))
.arg(Arg::new("subject").long("subject").help("Subject area filter"))
.arg(Arg::new("depositor").long("depositor").help("Crossref depositor name"))
.arg(Arg::new("registrant").long("registrant").help("Crossref registrant name"))
.arg(
Arg::new("email")
.long("email")
.help("Email for OpenAlex mailto parameter"),
)
.arg(
Arg::new("sample")
.long("sample")
.help("Return random works (crossref: max 100, datacite: max 1000, openalex: max 200)")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("has-orcid")
.long("has-orcid")
.help("Filter for records with ORCID")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("has-ror-id")
.long("has-ror-id")
.help("Filter for records with ROR")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("has-references")
.long("has-references")
.help("Filter for records with references")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("has-relation")
.long("has-relation")
.help("Filter for records with relation")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("has-abstract")
.long("has-abstract")
.help("Filter for records with abstract")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("has-award")
.long("has-award")
.help("Filter for records with award")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("has-license")
.long("has-license")
.help("Filter for records with license")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("has-archive")
.long("has-archive")
.help("Filter for records with archive")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("is-archived")
.long("is-archived")
.help("Filter for archived records")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("vocabulary")
.long("vocabulary")
.help("Output as vocabulary (e.g. InvenioRDM affiliations YAML)")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("match")
.long("match")
.help("Enable ROR affiliation matching when reading crossref and datacite records")
.default_value("true")
.value_parser(clap::value_parser!(bool)),
)
.arg(
Arg::new("file")
.long("file")
.help("Write output to file instead of stdout"),
)
.arg(Arg::new("date").long("date").help(
"Date (YYYY-MM-DD) of a VRAIX daily dump, used with --from crossref or \
--from datacite; downloads {from}-{date}.sqlite3.zst from \
metadata.vraix.org unless an input file path is also given",
))
.arg(
Arg::new("timers")
.long("timers")
.help("Print timing for download/parse/write phases to stderr")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("style")
.long("style")
.help("CSL style name for --to citation output (default: apa)"),
)
.arg(
Arg::new("locale")
.long("locale")
.help("BCP 47 locale for --to citation output (e.g. de-DE)"),
)
.arg(
Arg::new("no-network")
.long("no-network")
.help("Disable all outbound network requests; fails if no local input file is provided")
.action(ArgAction::SetTrue),
)
}
/// Runs the `list` subcommand.
///
/// Resolves the effective source (`--from`) and target (`--to`) formats, loads
/// records from a VRAIX SQLite dump, a local file or a remote API, and writes
/// them to stdout or to the path given with `--file`.
///
/// # Errors
///
/// Returns a human-readable message when the source or target format is not
/// supported, when a required local file is missing, or when loading or
/// writing the records fails.
pub fn execute(matches: &ArgMatches) -> Result<(), String> {
let from_explicit = matches
.get_one::<String>("from")
.map(String::as_str)
.unwrap_or("commonmeta");
// With --from left at its "commonmeta" default, a non-empty --member selects
// crossref and a non-empty --client selects datacite.
let from = if from_explicit == "commonmeta" {
let has_member = matches.get_one::<String>("member").map(|s| !s.is_empty()).unwrap_or(false);
let has_client = matches.get_one::<String>("client").map(|s| !s.is_empty()).unwrap_or(false);
if has_member { "crossref" }
else if has_client { "datacite" }
else { from_explicit }
} else {
from_explicit
};
let to = matches
.get_one::<String>("to")
.map(String::as_str)
.unwrap_or("commonmeta");
let out_file = matches.get_one::<String>("file");
let date = matches.get_one::<String>("date").map(String::as_str);
let timers = matches.get_flag("timers");
// `update` is true when the --file target already exists; the SQLite writers
// below receive it to choose between a fresh write and an upsert.
let update = out_file.map(|p| std::path::Path::new(p).exists()).unwrap_or(false);
let style = matches.get_one::<String>("style").map(String::as_str);
let locale = matches.get_one::<String>("locale").map(String::as_str);
if !matches!(from, "crossref" | "datacite" | "openalex" | "pubmed" | "commonmeta" | "ror") {
return Err(format!(
"list: --from {} is not implemented yet (supported: crossref, datacite, openalex, pubmed, commonmeta, ror)",
from
));
}
// ROR lists are handled by a dedicated code path.
if from == "ror" {
return execute_ror_list(matches, to, out_file, style, locale);
}
if !is_supported_output_format(to) {
return Err(format!("list: unsupported --to format: {}", to));
}
let no_network = matches.get_flag("no-network");
let number = *matches.get_one::<usize>("number").unwrap_or(&10);
let cited_by = matches.get_one::<String>("cited-by").map(String::as_str);
let explicit_input = matches.get_one::<String>("input").map(String::as_str);
// With --no-network and no input path, fall back to the local database
// located by `resolve_db_path(None)`; fail early if that file is missing.
let db_fallback: Option<String> = if no_network && explicit_input.is_none() {
let path = resolve_db_path(None);
if !std::path::Path::new(&path).exists() {
return Err(format!(
"local database not found at '{}'; \
import records first with 'commonmeta import' or remove --no-network",
path
));
}
Some(path)
} else {
None
};
// The database fallback takes precedence over the positional input (the two
// are mutually exclusive by construction above).
let input_path: Option<&str> = db_fallback.as_deref().or(explicit_input);
let is_sqlite_input = input_path
.map(|p| io_utils::get_extension(p, ".json").1 == ".sqlite3")
.unwrap_or(false);
// For a SQLite input whose source is not already crossref/datacite, infer
// the source from a "crossref-" / "datacite-" file-stem prefix.
let filename_source: Option<&'static str> = if is_sqlite_input && !matches!(from, "crossref" | "datacite") {
input_path
.and_then(|p| std::path::Path::new(p).file_stem()?.to_str())
.and_then(|stem| {
if stem.starts_with("crossref-") { Some("crossref") }
else if stem.starts_with("datacite-") { Some("datacite") }
else { None }
})
} else {
None
};
// The local database fallback is always read as commonmeta.
let from: &str = if db_fallback.is_some() {
"commonmeta"
} else {
filename_source.unwrap_or(from)
};
let is_vraix_sqlite_input = is_sqlite_input && matches!(from, "crossref" | "datacite");
// Streaming path: a crossref/datacite VRAIX dump converted to a commonmeta
// SQLite file (optionally zstd-compressed) is handled by
// `crate::stream_vraix_to_sqlite`, and the function returns before the
// `data` list below is ever built.
if let Some(out_path) = out_file {
let (_base, out_ext, out_compress) = io_utils::get_extension(out_path, ".json");
let compress_supported = out_compress.is_empty() || out_compress == "zst";
let is_date_download = date.is_some() && input_path.is_none();
if out_ext == ".sqlite3"
&& compress_supported
&& to == "commonmeta"
&& matches!(from, "crossref" | "datacite")
&& (is_vraix_sqlite_input || is_date_download)
{
let write_start = Instant::now();
let (base_out, _ext, _c) = io_utils::get_extension(out_path, ".sqlite3");
// When the output is compressed, SQLite is written to a sibling
// ".sqlite3.tmp" file first and compressed into `out_path` afterwards.
let sqlite_out = if out_compress.is_empty() {
base_out.clone()
} else {
base_out.with_extension("sqlite3.tmp")
};
// `in_sqlite` is the dump to read; `tmp_to_clean` is set only when that
// dump is a temporary decompressed download that must be deleted.
let (in_sqlite, tmp_to_clean) = if is_date_download {
// `date.unwrap()` cannot panic: `is_date_download` requires `date.is_some()`.
let url = format!(
"https://metadata.vraix.org/{}-{}.sqlite3.zst",
from,
date.unwrap()
);
let cache_key = format!("{}-{}.sqlite3.zst", from, date.unwrap());
let download_start = Instant::now();
let (cache_path, from_cache) =
io_utils::ensure_cached_path(&url, "vraix", &cache_key, VRAIX_CACHE_TTL)
.map_err(|e| format!("failed to download '{}': {}", url, e))?;
if timers {
let size = cache_path.metadata().map(|m| m.len()).unwrap_or(0);
if from_cache {
eprintln!(
"list: download took {:.2?} ({} bytes, from local cache)",
download_start.elapsed(),
size
);
} else {
eprintln!(
"list: download took {:.2?} ({} bytes)",
download_start.elapsed(),
size
);
}
}
let decompress_start = Instant::now();
// The process id is part of the temp file name.
let tmp_path = sqlite_out.with_extension(format!(
"sqlite3.vraix-{}.tmp",
std::process::id()
));
let decompressed_bytes = io_utils::decompress_zst_file(&cache_path, &tmp_path)
.map_err(|e| format!("failed to decompress '{}': {}", url, e))?;
if timers {
eprintln!(
"list: decompress + write temp took {:.2?} ({} bytes)",
decompress_start.elapsed(),
decompressed_bytes
);
}
(tmp_path.clone(), Some(tmp_path))
} else {
// Not a date download, so the branch condition guarantees
// `is_vraix_sqlite_input`, which implies an input path is present.
(std::path::PathBuf::from(input_path.unwrap()), None)
};
let convert_start = Instant::now();
let result = crate::stream_vraix_to_sqlite(&in_sqlite, from, &sqlite_out, number, update)
.map_err(|e| e.to_string());
// Remove the decompressed temp dump before propagating any conversion
// error, so a failed conversion does not leave it behind.
if let Some(tmp) = tmp_to_clean {
std::fs::remove_file(&tmp).ok();
}
let n = result?;
if timers {
eprintln!(
"list: stream convert+write took {:.2?} ({} records)",
convert_start.elapsed(),
n
);
}
// The total row count is only reported for updates of an existing file.
let total = if update {
crate::count_sqlite_works(&sqlite_out).ok()
} else {
None
};
if !out_compress.is_empty() {
let compress_start = Instant::now();
{
let out_file = std::fs::File::create(out_path)
.map_err(|e| format!("failed to create '{}': {}", out_path, e))?;
// Level 0 is passed to the zstd encoder.
let mut encoder = zstd::Encoder::new(out_file, 0)
.map_err(|e| format!("failed to create zstd encoder: {}", e))?;
let mut in_file = std::fs::File::open(&sqlite_out)
.map_err(|e| format!("failed to open '{}': {}", sqlite_out.display(), e))?;
std::io::copy(&mut in_file, &mut encoder)
.map_err(|e| format!("zstd compression failed: {}", e))?;
encoder
.finish()
.map_err(|e| format!("failed to finish zstd encoder: {}", e))?;
}
// NOTE(review): the uncompressed temp file is only removed on success;
// the `?` returns above leave it on disk.
std::fs::remove_file(&sqlite_out).ok();
if timers {
eprintln!("list: zstd compression took {:.2?}", compress_start.elapsed());
}
}
if timers {
eprintln!(
"list: total took {:.2?} ({} records)",
write_start.elapsed(),
n
);
}
println!("{}", fmt_wrote_sqlite(out_path, n, total));
return Ok(());
}
}
// General path: load all records into memory from one of three places.
let data = if date.is_some() || is_vraix_sqlite_input {
// 1. A VRAIX daily dump (downloaded by date or given as a file).
if !matches!(from, "crossref" | "datacite") {
return Err(
"list: reading a VRAIX SQLite dump requires --from crossref or --from datacite"
.to_string(),
);
}
// An empty date string is passed when only an input path was given.
load_vraix_list_for_date(date.unwrap_or(""), input_path, from, matches, timers)?
} else if let Some(input_path) = input_path {
// 2. A local file; --number 0 means "no limit".
let read_start = Instant::now();
let page = *matches.get_one::<usize>("page").unwrap_or(&1);
let file_limit = if number == 0 { None } else { Some(number) };
// Offset of the first record of the requested 1-based page.
let file_offset = page.saturating_sub(1).saturating_mul(number);
let d = if let Some(doi) = cited_by {
if !is_sqlite_input || from != "commonmeta" {
return Err("list: --cited-by requires a commonmeta SQLite input file".to_string());
}
crate::read_sqlite_by_citation(doi, Path::new(input_path))
.map_err(|e| e.to_string())?
} else {
load_list_from_file(input_path, from, file_limit, file_offset)?
};
if timers {
eprintln!(
"list: read from file took {:.2?} ({} records)",
read_start.elapsed(),
d.len()
);
}
d
} else {
// 3. A remote API; commonmeta has no API and therefore needs a file.
if from == "commonmeta" {
return Err("list: --from commonmeta requires an input .parquet or .sqlite3 file path".to_string());
}
fetch_list_from_api(matches, from)?
};
// Output formats that have their own writer and return early: Parquet,
// SQLite, and zip/tgz archives.
if let Some(path) = out_file {
let (_base, extension, compress) = io_utils::get_extension(path, ".json");
if extension == ".parquet" {
if to != "commonmeta" {
return Err(format!(
"list: --file *.parquet output is only supported for --to commonmeta (got --to {}), until other flattened formats are added",
to
));
}
let write_start = Instant::now();
let result = if compress == "zip" || compress == "tgz" {
write_parquet_archive(&data, path, &compress)
} else {
write_parquet_batches(&data, path)
};
if timers {
eprintln!(
"list: write {} took {:.2?} ({} records)",
to,
write_start.elapsed(),
data.len()
);
}
return result;
}
if extension == ".sqlite3" {
if to != "commonmeta" {
return Err(format!(
"list: --file *.sqlite3 output is only supported for --to commonmeta (got --to {})",
to
));
}
let write_start = Instant::now();
let result = write_sqlite_output(&data, path, &compress, update);
if timers {
eprintln!(
"list: write sqlite3 took {:.2?} ({} records)",
write_start.elapsed(),
data.len()
);
}
return result;
}
if compress == "zip" || compress == "tgz" {
let write_start = Instant::now();
let result = write_archive_batches(&data, to, path, &compress, style, locale);
if timers {
eprintln!(
"list: write {} took {:.2?} ({} records)",
to,
write_start.elapsed(),
data.len()
);
}
return result;
}
}
// Everything else: serialize the whole list into one buffer, then write it
// to the file (plain, gzip or zstd) or print it to stdout.
let write_start = Instant::now();
let output = write_output(&data, to, style, locale)?;
if timers {
eprintln!(
"list: write {} took {:.2?} ({} records)",
to,
write_start.elapsed(),
data.len()
);
}
match out_file {
Some(path) => {
let (file, _extension, compress) = io_utils::get_extension(path, ".json");
match compress.as_str() {
"gz" => io_utils::write_gz_file(&file, &output)
.map_err(|e| format!("failed to write gzip '{}': {}", path, e)),
"zst" => io_utils::write_zst_file(&file, &output)
.map_err(|e| format!("failed to write zst '{}': {}", path, e)),
_ => io_utils::write_file(&file, &output)
.map_err(|e| format!("failed to write '{}': {}", path, e)),
}
}
None => {
// Invalid UTF-8 in the output is replaced rather than rejected.
println!("{}", String::from_utf8_lossy(&output));
Ok(())
}
}
}
/// Reports whether `to` names an output format that `list` can produce.
///
/// The comparison is exact and case-sensitive.
fn is_supported_output_format(to: &str) -> bool {
    const SUPPORTED: [&str; 10] = [
        "commonmeta",
        "csl",
        "datacite",
        "inveniordm",
        "schemaorg",
        "ror",
        "bibtex",
        "ris",
        "crossref_xml",
        "citation",
    ];
    SUPPORTED.iter().any(|format| *format == to)
}
/// Writes `data` as zstd-compressed Parquet next to `out_path`.
///
/// The target path is derived from `out_path` via `io_utils::get_extension`
/// and all records are handed to `write_parquet_batch` in a single call.
///
/// # Errors
///
/// Fails with "list: no records to write" for an empty list, or with whatever
/// `write_parquet_batch` reports.
fn write_parquet_batches(data: &[Data], out_path: &str) -> Result<(), String> {
    match data {
        [] => Err("list: no records to write".to_string()),
        records => {
            let (target_base, _, _) = io_utils::get_extension(out_path, ".parquet");
            write_parquet_batch(records, &target_base)
        }
    }
}
/// Serializes `data` to Parquet, zstd-compresses the bytes and writes them to
/// the path returned by `parquet_batch_path(base_path)`.
///
/// Prints a "wrote ..." line with the record count on success.
fn write_parquet_batch(data: &[Data], base_path: &Path) -> Result<(), String> {
    let parquet = match crate::write_parquet(data) {
        Ok(bytes) => bytes,
        Err(err) => return Err(err.to_string()),
    };
    let packed = zstd::stream::encode_all(std::io::Cursor::new(parquet), 0)
        .map_err(|e| format!("failed to zstd-compress parquet: {}", e))?;
    let target = parquet_batch_path(base_path);
    if let Err(e) = io_utils::write_file(&target, &packed) {
        return Err(format!("failed to write '{}': {}", target.display(), e));
    }
    println!("wrote {} ({} records)", target.display(), data.len());
    Ok(())
}
/// Returns `base_path` with ".zst" appended to its final component.
///
/// The file name is converted lossily to UTF-8 before the suffix is added; a
/// path without a final component yields a file named ".zst".
fn parquet_batch_path(base_path: &Path) -> PathBuf {
    let file_name = base_path.file_name().unwrap_or_default().to_string_lossy();
    base_path.with_file_name(format!("{}.zst", file_name))
}
/// Packs `data` as one zstd-compressed Parquet entry into a `.zip` or `.tgz`
/// archive at `out_path`.
///
/// `compress` selects the archive type ("zip" or "tgz"). The entry is built
/// before the archive type is checked, so an unsupported `compress` value is
/// reported only after serialization succeeded.
fn write_parquet_archive(data: &[Data], out_path: &str, compress: &str) -> Result<(), String> {
    if data.is_empty() {
        return Err("list: no records to write".to_string());
    }
    let (parquet_path, _, _) = io_utils::get_extension(out_path, ".parquet");
    let entry_base = parquet_path
        .file_name()
        .unwrap_or_default()
        .to_string_lossy()
        .to_string();
    let entry = parquet_archive_entry(data, &entry_base)?;
    let entries = std::slice::from_ref(&entry);
    let outcome = match compress {
        "zip" => io_utils::write_zip_archive(out_path, entries)
            .map_err(|e| format!("failed to write zip '{}': {}", out_path, e)),
        "tgz" => io_utils::write_tar_gz_archive(out_path, entries)
            .map_err(|e| format!("failed to write tgz '{}': {}", out_path, e)),
        other => Err(format!("list: unsupported archive compression: {}", other)),
    };
    outcome?;
    println!("wrote {} ({} records)", out_path, data.len());
    Ok(())
}
/// Writes `data` to a SQLite database at `out_path`, optionally compressed.
///
/// * `compress` — "" writes the database directly; "zst", "zip" and "tgz"
///   write it to a sibling `.sqlite3.tmp` file first and then pack that file
///   into `out_path`.
/// * `update` — selects `crate::upsert_sqlite` instead of `crate::write_sqlite`
///   and adds the database's total row count to the printed status line.
///
/// # Errors
///
/// Fails for an empty list, for an unsupported `compress` value (checked
/// before any file is written), and when writing, reading back or packing the
/// database fails. The temporary file is removed on every failure path after
/// it has been created.
fn write_sqlite_output(data: &[Data], out_path: &str, compress: &str, update: bool) -> Result<(), String> {
    if data.is_empty() {
        return Err("list: no records to write".to_string());
    }
    // Reject unknown suffixes up front instead of after the whole database has
    // been written to (and read back from) a temp file.
    if !matches!(compress, "" | "zst" | "zip" | "tgz") {
        return Err(format!("list: unsupported compression for sqlite3: {}", compress));
    }
    let (base_path, _extension, _) = io_utils::get_extension(out_path, ".sqlite3");
    let write_fn = if update {
        crate::upsert_sqlite as fn(&[Data], &Path) -> crate::Result<()>
    } else {
        crate::write_sqlite as fn(&[Data], &Path) -> crate::Result<()>
    };
    if compress.is_empty() {
        write_fn(data, &base_path).map_err(|e| e.to_string())?;
        let total = if update {
            crate::count_sqlite_works(&base_path).ok()
        } else {
            None
        };
        println!("{}", fmt_wrote_sqlite(&base_path.to_string_lossy(), data.len(), total));
        return Ok(());
    }

    let tmp_path = base_path.with_extension("sqlite3.tmp");
    if let Err(e) = write_fn(data, &tmp_path) {
        // Do not leave a partially written temp database behind.
        std::fs::remove_file(&tmp_path).ok();
        return Err(e.to_string());
    }
    let total = if update {
        crate::count_sqlite_works(&tmp_path).ok()
    } else {
        None
    };
    // Read first, delete unconditionally, and only then surface a read error,
    // so the temp file is cleaned up whether or not the read succeeded.
    let read_back = std::fs::read(&tmp_path);
    std::fs::remove_file(&tmp_path).ok();
    let bytes = read_back
        .map_err(|e| format!("failed to read temp sqlite '{}': {}", tmp_path.display(), e))?;

    let entry_name = base_path
        .file_name()
        .unwrap_or_default()
        .to_string_lossy()
        .to_string();
    match compress {
        "zst" => {
            let compressed = zstd::stream::encode_all(std::io::Cursor::new(&bytes), 0)
                .map_err(|e| format!("failed to zstd-compress sqlite: {}", e))?;
            io_utils::write_file(Path::new(out_path), &compressed)
                .map_err(|e| format!("failed to write zst '{}': {}", out_path, e))?;
        }
        "zip" => {
            io_utils::write_zip_archive(out_path, std::slice::from_ref(&(entry_name, bytes)))
                .map_err(|e| format!("failed to write zip '{}': {}", out_path, e))?;
        }
        "tgz" => {
            io_utils::write_tar_gz_archive(out_path, std::slice::from_ref(&(entry_name, bytes)))
                .map_err(|e| format!("failed to write tgz '{}': {}", out_path, e))?;
        }
        // Kept as a safety net; the up-front check makes this arm unreachable.
        other => return Err(format!("list: unsupported compression for sqlite3: {}", other)),
    }
    println!("{}", fmt_wrote_sqlite(out_path, data.len(), total));
    Ok(())
}
/// Builds one archive entry holding `data` as zstd-compressed Parquet.
///
/// Returns the entry name (the file name produced by `parquet_batch_path` for
/// `base_name`) together with the compressed bytes.
fn parquet_archive_entry(data: &[Data], base_name: &str) -> Result<(String, Vec<u8>), String> {
    let parquet = crate::write_parquet(data).map_err(|e| e.to_string())?;
    let packed = match zstd::stream::encode_all(std::io::Cursor::new(parquet), 0) {
        Ok(bytes) => bytes,
        Err(e) => return Err(format!("failed to zstd-compress parquet: {}", e)),
    };
    let entry_path = parquet_batch_path(Path::new(base_name));
    let entry_name = entry_path
        .file_name()
        .unwrap_or_default()
        .to_string_lossy()
        .to_string();
    Ok((entry_name, packed))
}
/// Serializes `data` in the `to` format as batched entries and packs them into
/// a `.zip` or `.tgz` archive at `out_path`.
///
/// Entry names are derived from the file name of `out_path` without its
/// archive suffix; when that name has no extension, the inner extension
/// reported by `io_utils::get_extension` (or ".json" if that is empty) is
/// appended. `style` and `locale` are forwarded to
/// `crate::write_archive_citation`.
fn write_archive_batches(
    data: &[Data],
    to: &str,
    out_path: &str,
    compress: &str,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<(), String> {
    let (mut entry_base, detected_ext, _) = io_utils::get_extension(out_path, ".json");
    if entry_base.extension().is_none() {
        let suffix: &str = if detected_ext.is_empty() {
            ".json"
        } else {
            &detected_ext
        };
        entry_base.set_extension(suffix.trim_start_matches('.'));
    }
    let base_name = entry_base
        .file_name()
        .unwrap_or_default()
        .to_string_lossy()
        .to_string();

    let entries = match crate::write_archive_citation(data, to, &base_name, BATCH_SIZE, style, locale) {
        Ok(entries) => entries,
        Err(e) => return Err(e.to_string()),
    };

    let outcome = match compress {
        "zip" => io_utils::write_zip_archive(out_path, &entries)
            .map_err(|e| format!("failed to write zip '{}': {}", out_path, e)),
        "tgz" => io_utils::write_tar_gz_archive(out_path, &entries)
            .map_err(|e| format!("failed to write tgz '{}': {}", out_path, e)),
        other => Err(format!("list: unsupported archive compression: {}", other)),
    };
    outcome?;

    println!(
        "wrote {} ({} records in {} batch(es))",
        out_path,
        data.len(),
        entries.len()
    );
    Ok(())
}
/// Serializes `data` in the `to` format into a single byte buffer.
///
/// Thin adapter around `crate::write_list_citation` that turns its error into
/// a `String`.
fn write_output(
    data: &[Data],
    to: &str,
    style: Option<&str>,
    locale: Option<&str>,
) -> Result<Vec<u8>, String> {
    match crate::write_list_citation(data, to, style, locale) {
        Ok(bytes) => Ok(bytes),
        Err(err) => Err(err.to_string()),
    }
}
/// Upper bound on the number of records requested with `--sample` for the
/// given source: 100 for crossref, 200 for openalex, 1000 for everything else.
fn sample_max(from: &str) -> usize {
    if from == "crossref" {
        100
    } else if from == "openalex" {
        200
    } else {
        1000
    }
}
/// Fetches records for `from` through its remote API, driven by the parsed
/// `--number`, `--page` and `--sample` arguments.
///
/// * With `--sample`, the requested size is capped at `sample_max(from)` (a
///   request of 0 also means the cap) and only crossref, datacite and openalex
///   are accepted.
/// * With `--number 0`, every page is fetched via `fetch_all_pages`.
/// * Otherwise a single page of `--number` records is fetched.
pub(crate) fn fetch_list_from_api(matches: &ArgMatches, from: &str) -> Result<Vec<Data>, String> {
    let requested = matches.get_one::<usize>("number").copied().unwrap_or(10);
    let page = matches.get_one::<usize>("page").copied().unwrap_or(1);

    if matches.get_flag("sample") {
        let cap = sample_max(from);
        let size = if (1..=cap).contains(&requested) {
            requested
        } else {
            cap
        };
        return match from {
            "crossref" => fetch_crossref_page(matches, size, page),
            "datacite" => fetch_datacite_list(matches, size, page),
            "openalex" => fetch_openalex_list(matches, size, page),
            other => Err(format!("--sample is not supported for --from {other}")),
        };
    }

    match (requested, from) {
        (0, _) => fetch_all_pages(matches, from, page),
        (_, "crossref") => fetch_crossref_page(matches, requested, page),
        (_, "datacite") => fetch_datacite_list(matches, requested, page),
        (_, "openalex") => fetch_openalex_list(matches, requested, page),
        (_, "pubmed") => fetch_pubmed_page(matches, requested),
        _ => Err(format!("unsupported source: {from}")),
    }
}
/// Page size used when fetching every page from an API: 200 for openalex and
/// 1000 for every other source.
fn api_batch_size(from: &str) -> usize {
    if from == "openalex" {
        200
    } else {
        1000
    }
}
/// Pause, in milliseconds, that `fetch_all_pages` sleeps between two
/// consecutive page requests.
const API_RATE_DELAY_MS: u64 = 500;
/// Fetches every page of results for `from` using cursor-based pagination.
///
/// Pages of `api_batch_size(from)` records are requested until a page comes
/// back short or without a follow-up cursor. Progress is reported on stderr
/// after each page, and the loop sleeps `API_RATE_DELAY_MS` between requests.
/// `_start_page` is accepted for signature compatibility but not used.
fn fetch_all_pages(matches: &ArgMatches, from: &str, _start_page: usize) -> Result<Vec<Data>, String> {
    // Shared cursor loop. `fetch` receives `None` for the first page and the
    // previously returned cursor afterwards.
    fn drain<F>(source: &str, batch: usize, mut fetch: F) -> Result<Vec<Data>, String>
    where
        F: FnMut(Option<&str>) -> Result<(Vec<Data>, Option<String>), String>,
    {
        let pause = std::time::Duration::from_millis(API_RATE_DELAY_MS);
        let mut collected: Vec<Data> = Vec::new();
        let mut cursor: Option<String> = None;
        let mut page = 1usize;
        loop {
            let (records, next) = fetch(cursor.as_deref())?;
            let received = records.len();
            collected.extend(records);
            eprintln!(
                "fetched {} records from {} (page {}, {} total)",
                received,
                source,
                page,
                collected.len()
            );
            match next {
                // A full page with a follow-up cursor: keep going.
                Some(token) if received >= batch => cursor = Some(token),
                _ => return Ok(collected),
            }
            page += 1;
            std::thread::sleep(pause);
        }
    }

    let batch = api_batch_size(from);
    match from {
        "crossref" => drain(from, batch, |cursor| {
            fetch_crossref_page_with_cursor(matches, batch, cursor.unwrap_or("*"))
        }),
        "datacite" => drain(from, batch, |cursor| {
            fetch_datacite_page_with_cursor(matches, batch, cursor)
        }),
        "openalex" => drain(from, batch, |cursor| {
            fetch_openalex_page_with_cursor(matches, batch, cursor.unwrap_or("*"))
        }),
        "pubmed" => {
            let query = build_epmc_query(matches);
            drain(from, batch, |cursor| {
                crate::pubmed::fetch_page(&query, batch, cursor.unwrap_or("*"))
                    .map_err(|e| e.to_string())
            })
        }
        _ => Err(format!("unsupported source: {from}")),
    }
}
/// Fetches one cursor-addressed page of Crossref records.
///
/// Forwards the filter options from `matches` to
/// `crate::crossref_fetch_page_with_cursor`; unset string options are passed
/// as empty strings. Returns the records and the cursor for the next page.
fn fetch_crossref_page_with_cursor(
    matches: &ArgMatches,
    number: usize,
    cursor: &str,
) -> Result<(Vec<Data>, Option<String>), String> {
    // String option value, or "" when the option was not given.
    fn text<'a>(m: &'a ArgMatches, key: &str) -> &'a str {
        m.get_one::<String>(key).map_or("", String::as_str)
    }

    let ror_matching = matches.get_one::<bool>("match").copied().unwrap_or(true);
    let page = crate::crossref_fetch_page_with_cursor(
        cursor,
        number,
        text(matches, "member"),
        text(matches, "type"),
        text(matches, "year"),
        text(matches, "ror"),
        text(matches, "orcid"),
        matches.get_flag("has-orcid"),
        matches.get_flag("has-ror-id"),
        matches.get_flag("has-references"),
        matches.get_flag("has-relation"),
        matches.get_flag("has-abstract"),
        matches.get_flag("has-award"),
        matches.get_flag("has-license"),
        matches.get_flag("has-archive"),
        ror_matching,
    );
    page.map_err(|e| e.to_string())
}
/// Fetches one cursor-addressed page of DataCite DOIs and converts each item.
///
/// * `number` — requested page size, clamped to 1..=1000.
/// * `cursor` — cursor of the page to fetch; `None` requests the first page by
///   sending `page[cursor]=1`.
///
/// `--client` is sent as `client-id`; `--type`, `--year`, `--language`,
/// `--orcid` and `--ror` are combined into a space-separated `query`.
///
/// Returns the converted records together with the cursor of the next page.
/// The cursor is taken from `meta.next` when present and otherwise extracted
/// from the `page[cursor]` parameter of the `links.next` URL, which is where
/// DataCite's cursor pagination publishes the next page.
///
/// # Errors
///
/// Fails when the request cannot be sent, returns an error status, is not
/// valid JSON, lacks a `data` array, or when an item cannot be converted.
fn fetch_datacite_page_with_cursor(
    matches: &ArgMatches,
    number: usize,
    cursor: Option<&str>,
) -> Result<(Vec<Data>, Option<String>), String> {
    let mut url =
        Url::parse("https://api.datacite.org/dois").map_err(|e| format!("invalid URL: {}", e))?;
    {
        // Scoped so the mutable borrow of `url` ends before the request is built.
        let mut query = url.query_pairs_mut();
        query.append_pair("page[size]", &number.clamp(1, 1000).to_string());
        query.append_pair("page[cursor]", cursor.unwrap_or("1"));
        query.append_pair("affiliation", "true");
        if let Some(client_id) = matches.get_one::<String>("client")
            && !client_id.is_empty()
        {
            query.append_pair("client-id", client_id);
        }
        let mut search_terms: Vec<String> = Vec::new();
        if let Some(type_) = matches.get_one::<String>("type")
            && !type_.is_empty()
        {
            let dc_type = cm_to_dc(lookup_work_type(type_));
            // An empty or "Other" mapping adds no type restriction.
            if !dc_type.is_empty() && dc_type != "Other" {
                search_terms.push(format!("types.resourceTypeGeneral:{}", dc_type));
            }
        }
        if let Some(year) = matches.get_one::<String>("year")
            && !year.is_empty()
        {
            search_terms.push(format!("publicationYear:{}", year));
        }
        if let Some(language) = matches.get_one::<String>("language")
            && !language.is_empty()
        {
            search_terms.push(format!("language:{}", language));
        }
        if let Some(orcid) = matches.get_one::<String>("orcid")
            && !orcid.is_empty()
        {
            search_terms.push(format!("creators.nameIdentifiers.nameIdentifier:{}", orcid));
        }
        if let Some(ror) = matches.get_one::<String>("ror")
            && !ror.is_empty()
        {
            search_terms.push(format!(
                "creators.affiliation.affiliationIdentifier:{}",
                ror
            ));
        }
        if !search_terms.is_empty() {
            query.append_pair("query", &search_terms.join(" "));
        }
    }
    let client = reqwest::blocking::Client::builder()
        .user_agent(format!(
            "commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
            env!("CARGO_PKG_VERSION")
        ))
        .build()
        .map_err(|e| format!("http client build failed: {}", e))?;
    let text = client
        .get(url.as_str())
        .send()
        .map_err(|e| format!("http request failed: {}", e))?
        .error_for_status()
        .map_err(|e| format!("http status error: {}", e))?
        .text()
        .map_err(|e| format!("failed to read response: {}", e))?;
    let value: serde_json::Value =
        serde_json::from_str(&text).map_err(|e| format!("invalid DataCite response: {}", e))?;
    let next_cursor = value
        .get("meta")
        .and_then(|m| m.get("next"))
        .and_then(|v| v.as_str())
        .map(String::from)
        .or_else(|| {
            // Fallback: pull the cursor token out of the `links.next` URL.
            value
                .get("links")
                .and_then(|l| l.get("next"))
                .and_then(|v| v.as_str())
                .and_then(|next_url| Url::parse(next_url).ok())
                .and_then(|next_url| {
                    next_url
                        .query_pairs()
                        .find(|(key, _)| key == "page[cursor]")
                        .map(|(_, token)| token.into_owned())
                })
        });
    let items = value
        .get("data")
        .and_then(serde_json::Value::as_array)
        .ok_or_else(|| "DataCite response missing data array".to_string())?;
    let mut out: Vec<Data> = Vec::with_capacity(items.len());
    for item in items {
        out.push(convert_datacite_item(item)?);
    }
    Ok((out, next_cursor))
}
/// Fetches one cursor-addressed page of OpenAlex works and converts each item.
///
/// * `number` — requested page size, clamped to 1..=200.
/// * `cursor` — cursor of the page to fetch.
///
/// `--email` is sent as `mailto`; `--type`, `--year`, `--orcid`, `--ror`,
/// `--has-abstract` and `--has-references` are combined into a comma-separated
/// `filter`. Returns the converted records and `meta.next_cursor`, if any.
fn fetch_openalex_page_with_cursor(
    matches: &ArgMatches,
    number: usize,
    cursor: &str,
) -> Result<(Vec<Data>, Option<String>), String> {
    // Value of a string option, but only when it was given and is non-empty.
    fn given<'a>(m: &'a ArgMatches, key: &str) -> Option<&'a String> {
        m.get_one::<String>(key).filter(|value| !value.is_empty())
    }

    let mut url =
        Url::parse("https://api.openalex.org/works").map_err(|e| format!("invalid URL: {}", e))?;

    let mut filters: Vec<String> = Vec::new();
    if let Some(work_type) = given(matches, "type") {
        let cr_type = cm_to_cr_filter_type(lookup_work_type(work_type));
        if !cr_type.is_empty() {
            filters.push(format!("type_crossref:{}", cr_type));
        }
    }
    if let Some(year) = given(matches, "year") {
        // A single year is expressed as a closed publication-date range.
        filters.push(format!("from_publication_date:{}-01-01", year));
        filters.push(format!("to_publication_date:{}-12-31", year));
    }
    if let Some(orcid) = given(matches, "orcid") {
        filters.push(format!("author.orcid:{}", orcid));
    }
    if let Some(ror) = given(matches, "ror") {
        filters.push(format!("institutions.ror:{}", ror));
    }
    if matches.get_flag("has-abstract") {
        filters.push("has_abstract:true".to_string());
    }
    if matches.get_flag("has-references") {
        filters.push("referenced_works_count:>0".to_string());
    }

    {
        // Scoped so the mutable borrow of `url` ends before the request is built.
        let mut params = url.query_pairs_mut();
        params.append_pair("per-page", &number.clamp(1, 200).to_string());
        params.append_pair("cursor", cursor);
        if let Some(email) = given(matches, "email") {
            params.append_pair("mailto", email);
        }
        if !filters.is_empty() {
            params.append_pair("filter", &filters.join(","));
        }
    }

    let user_agent = format!(
        "commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
        env!("CARGO_PKG_VERSION")
    );
    let http = reqwest::blocking::Client::builder()
        .user_agent(user_agent)
        .build()
        .map_err(|e| format!("http client build failed: {}", e))?;
    let response = http
        .get(url.as_str())
        .send()
        .map_err(|e| format!("http request failed: {}", e))?
        .error_for_status()
        .map_err(|e| format!("http status error: {}", e))?;
    let body = response
        .text()
        .map_err(|e| format!("failed to read response: {}", e))?;
    let payload: serde_json::Value =
        serde_json::from_str(&body).map_err(|e| format!("invalid OpenAlex response: {}", e))?;

    let next_cursor = payload
        .get("meta")
        .and_then(|meta| meta.get("next_cursor"))
        .and_then(|token| token.as_str())
        .map(String::from);
    let results = match payload.get("results").and_then(serde_json::Value::as_array) {
        Some(results) => results,
        None => return Err("OpenAlex response missing results array".to_string()),
    };
    let mut works: Vec<Data> = Vec::with_capacity(results.len());
    for raw in results {
        works.push(convert_openalex_item(raw)?);
    }
    Ok((works, next_cursor))
}
/// Fetches one page of Crossref works as commonmeta records.
///
/// Every filter is read from the parsed CLI arguments and handed to
/// `crate::crossref::fetch_all` in the positional order used below. String
/// options that were not supplied are passed as `""`, and the `match` option
/// falls back to `true` when absent. A fetch error is returned as its string
/// form.
fn fetch_crossref_page(matches: &ArgMatches, number: usize, page: usize) -> Result<Vec<Data>, String> {
    /// Value of a string option, or `""` when it was not given.
    fn text<'a>(matches: &'a ArgMatches, key: &str) -> &'a str {
        matches.get_one::<String>(key).map_or("", String::as_str)
    }

    let member = text(matches, "member");
    let work_type = text(matches, "type");
    let sample = matches.get_flag("sample");
    let year = text(matches, "year");
    let ror = text(matches, "ror");
    let orcid = text(matches, "orcid");
    let has_orcid = matches.get_flag("has-orcid");
    let has_ror_id = matches.get_flag("has-ror-id");
    let has_references = matches.get_flag("has-references");
    let has_relation = matches.get_flag("has-relation");
    let has_abstract = matches.get_flag("has-abstract");
    let has_award = matches.get_flag("has-award");
    let has_license = matches.get_flag("has-license");
    let has_archive = matches.get_flag("has-archive");
    let match_enabled = matches.get_one::<bool>("match").copied().unwrap_or(true);

    let fetched = crate::crossref::fetch_all(
        number,
        page,
        member,
        work_type,
        sample,
        year,
        ror,
        orcid,
        has_orcid,
        has_ror_id,
        has_references,
        has_relation,
        has_abstract,
        has_award,
        has_license,
        has_archive,
        match_enabled,
    );
    fetched.map_err(|e| e.to_string())
}
fn fetch_datacite_list(
matches: &ArgMatches,
number: usize,
page: usize,
) -> Result<Vec<Data>, String> {
let sample = matches.get_flag("sample");
let mut url =
Url::parse("https://api.datacite.org/dois").map_err(|e| format!("invalid URL: {}", e))?;
{
let mut query = url.query_pairs_mut();
query.append_pair("page[size]", &number.clamp(1, 1000).to_string());
query.append_pair("page[number]", &page.max(1).to_string());
query.append_pair("affiliation", "true");
if sample {
query.append_pair("random", "true");
}
if let Some(client_id) = matches.get_one::<String>("client")
&& !client_id.is_empty()
{
query.append_pair("client-id", client_id);
}
let mut search_terms: Vec<String> = Vec::new();
if let Some(type_) = matches.get_one::<String>("type")
&& !type_.is_empty()
{
let dc_type = cm_to_dc(lookup_work_type(type_));
if !dc_type.is_empty() && dc_type != "Other" {
search_terms.push(format!("types.resourceTypeGeneral:{}", dc_type));
}
}
if let Some(year) = matches.get_one::<String>("year")
&& !year.is_empty()
{
search_terms.push(format!("publicationYear:{}", year));
}
if let Some(language) = matches.get_one::<String>("language")
&& !language.is_empty()
{
search_terms.push(format!("language:{}", language));
}
if let Some(orcid) = matches.get_one::<String>("orcid")
&& !orcid.is_empty()
{
search_terms.push(format!("creators.nameIdentifiers.nameIdentifier:{}", orcid));
}
if let Some(ror) = matches.get_one::<String>("ror")
&& !ror.is_empty()
{
search_terms.push(format!(
"creators.affiliation.affiliationIdentifier:{}",
ror
));
}
if !search_terms.is_empty() {
query.append_pair("query", &search_terms.join(" "));
}
}
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| format!("http client build failed: {}", e))?;
let text = client
.get(url.as_str())
.send()
.map_err(|e| format!("http request failed: {}", e))?
.error_for_status()
.map_err(|e| format!("http status error: {}", e))?
.text()
.map_err(|e| format!("failed to read response: {}", e))?;
let value: serde_json::Value =
serde_json::from_str(&text).map_err(|e| format!("invalid DataCite response: {}", e))?;
let items = value
.get("data")
.and_then(serde_json::Value::as_array)
.ok_or_else(|| "DataCite response missing data array".to_string())?;
let mut out: Vec<Data> = Vec::with_capacity(items.len());
for item in items {
out.push(convert_datacite_item(item)?);
}
Ok(out)
}
fn fetch_openalex_list(
matches: &ArgMatches,
number: usize,
page: usize,
) -> Result<Vec<Data>, String> {
let sample = matches.get_flag("sample");
let mut url =
Url::parse("https://api.openalex.org/works").map_err(|e| format!("invalid URL: {}", e))?;
{
let mut query = url.query_pairs_mut();
if sample {
query.append_pair("sample", &number.clamp(1, 200).to_string());
} else {
query.append_pair("per-page", &number.clamp(1, 200).to_string());
query.append_pair("page", &page.max(1).to_string());
}
if let Some(email) = matches.get_one::<String>("email")
&& !email.is_empty()
{
query.append_pair("mailto", email);
}
let mut filters: Vec<String> = Vec::new();
if let Some(type_) = matches.get_one::<String>("type")
&& !type_.is_empty()
{
let cr_type = cm_to_cr_filter_type(lookup_work_type(type_));
if !cr_type.is_empty() {
filters.push(format!("type_crossref:{}", cr_type));
}
}
if let Some(year) = matches.get_one::<String>("year")
&& !year.is_empty()
{
filters.push(format!("from_publication_date:{}-01-01", year));
filters.push(format!("to_publication_date:{}-12-31", year));
}
if let Some(orcid) = matches.get_one::<String>("orcid")
&& !orcid.is_empty()
{
filters.push(format!("author.orcid:{}", orcid));
}
if let Some(ror) = matches.get_one::<String>("ror")
&& !ror.is_empty()
{
filters.push(format!("institutions.ror:{}", ror));
}
if matches.get_flag("has-abstract") {
filters.push("has_abstract:true".to_string());
}
if matches.get_flag("has-references") {
filters.push("referenced_works_count:>0".to_string());
}
if !filters.is_empty() {
query.append_pair("filter", &filters.join(","));
}
}
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| format!("http client build failed: {}", e))?;
let text = client
.get(url.as_str())
.send()
.map_err(|e| format!("http request failed: {}", e))?
.error_for_status()
.map_err(|e| format!("http status error: {}", e))?
.text()
.map_err(|e| format!("failed to read response: {}", e))?;
let value: serde_json::Value =
serde_json::from_str(&text).map_err(|e| format!("invalid OpenAlex response: {}", e))?;
let items = value
.get("results")
.and_then(serde_json::Value::as_array)
.ok_or_else(|| "OpenAlex response missing results array".to_string())?;
let mut out: Vec<Data> = Vec::with_capacity(items.len());
for item in items {
out.push(convert_openalex_item(item)?);
}
Ok(out)
}
fn build_epmc_query(matches: &ArgMatches) -> String {
let mut terms = vec!["SRC:MED".to_string()];
if let Some(orcid) = matches.get_one::<String>("orcid")
&& !orcid.is_empty()
{
terms.push(format!("AUTHORID:{}", orcid));
}
if let Some(year) = matches.get_one::<String>("year")
&& !year.is_empty()
{
terms.push(format!(
"FIRST_PDATE:[{year}-01-01 TO {year}-12-31]"
));
}
if let Some(type_) = matches.get_one::<String>("type")
&& !type_.is_empty()
{
let type_lower = type_.to_lowercase();
let epmc_type = match type_lower.as_str() {
"journal-article" | "journalarticle" => "research-article",
"book-chapter" | "bookchapter" => "book-chapter",
"review" => "review-article",
other => other,
};
terms.push(format!("ARTICLE_TYPE:{}", epmc_type));
}
if matches.get_flag("has-abstract") {
terms.push("HAS_ABSTRACT:Y".to_string());
}
if matches.get_flag("has-orcid") {
terms.push("HAS_ORCID:Y".to_string());
}
terms.join(" AND ")
}
/// Fetches up to `number` records for the query built from the CLI filters.
///
/// `"*"` is always passed as the third argument of `crate::pubmed::fetch_page`
/// and the second element of its returned pair is discarded, so only the
/// records are returned.
fn fetch_pubmed_page(matches: &ArgMatches, number: usize) -> Result<Vec<Data>, String> {
    let query = build_epmc_query(matches);
    match crate::pubmed::fetch_page(&query, number, "*") {
        Ok((records, _)) => Ok(records),
        Err(e) => Err(e.to_string()),
    }
}
pub(crate) fn load_list_from_file(
path: &str,
from: &str,
limit: Option<usize>,
offset: usize,
) -> Result<Vec<Data>, String> {
let (_base, extension, compress) = io_utils::get_extension(path, ".json");
if extension == ".parquet" {
return load_commonmeta_list_from_parquet(path);
}
if extension == ".sqlite3" {
return load_commonmeta_list_from_sqlite(path, &compress, limit, offset);
}
match from {
"crossref" => load_crossref_list_from_file(path),
"datacite" => load_datacite_list_from_file(path),
"openalex" => load_openalex_list_from_file(path),
"commonmeta" => Err(format!(
"list: --from commonmeta expects a .parquet or .sqlite3 input file, got '{}'",
path
)),
_ => Err(format!("unsupported source: {from}")),
}
}
/// Reads commonmeta records from a SQLite database file.
///
/// When `compress` is `"zst"` the file at `path` is first decompressed into a
/// temporary `.sqlite3` file (under [`temp_dir`], named after the process id)
/// which is deleted again before returning, whether or not decompression or
/// reading succeeded. For any other `compress` value `path` is read directly.
///
/// `limit` and `offset` are forwarded unchanged to
/// `crate::read_sqlite_commonmeta`.
///
/// # Errors
///
/// Returns a message when the input cannot be opened, the zstd stream cannot
/// be decoded, the temporary file cannot be written, or the database cannot
/// be read.
fn load_commonmeta_list_from_sqlite(
    path: &str,
    compress: &str,
    limit: Option<usize>,
    offset: usize,
) -> Result<Vec<Data>, String> {
    /// Streams the zstd-compressed file at `src` into a new file at `dst`.
    fn decompress_to(src: &str, dst: &std::path::Path) -> Result<(), String> {
        let input =
            std::fs::File::open(src).map_err(|e| format!("failed to open '{}': {}", src, e))?;
        let mut decoder = zstd::Decoder::new(input)
            .map_err(|e| format!("failed to create zstd decoder: {}", e))?;
        let mut output = std::fs::File::create(dst)
            .map_err(|e| format!("failed to create temp file: {}", e))?;
        std::io::copy(&mut decoder, &mut output)
            .map_err(|e| format!("failed to decompress '{}': {}", src, e))?;
        Ok(())
    }

    let is_zst = compress == "zst";
    let sqlite_path = if is_zst {
        let tmp = temp_dir().join(format!(
            "commonmeta-sqlite-{}.sqlite3",
            std::process::id()
        ));
        if let Err(err) = decompress_to(path, &tmp) {
            // Do not leave a partially written database behind; the file may
            // not exist yet, so a removal failure is ignored.
            std::fs::remove_file(&tmp).ok();
            return Err(err);
        }
        tmp
    } else {
        std::path::PathBuf::from(path)
    };

    let result = crate::read_sqlite_commonmeta(&sqlite_path, limit, offset)
        .map_err(|e| e.to_string());
    if is_zst {
        // Best-effort cleanup of the decompressed copy.
        std::fs::remove_file(&sqlite_path).ok();
    }
    result
}
/// Reads commonmeta records from a Parquet file or an archive of Parquet
/// batches.
///
/// The compression suffix reported by `io_utils::get_extension` selects how
/// the Parquet bytes are obtained:
///
/// * `zip` / `tgz` — every archive entry is passed through
///   `io_utils::unzst_content` before being parsed;
/// * `zst` — the file is read with `io_utils::read_zst_file`;
/// * anything else — the file is read as-is.
///
/// Records from all batches are concatenated in batch order.
///
/// # Errors
///
/// Returns a message when the extension is not `.parquet`, the file or an
/// archive entry cannot be read or decompressed, or a batch fails to parse.
fn load_commonmeta_list_from_parquet(path: &str) -> Result<Vec<Data>, String> {
    let (_, extension, compress) = io_utils::get_extension(path, ".parquet");
    if extension != ".parquet" {
        return Err(format!(
            "list: --from commonmeta expects a .parquet (optionally .zst/.zip/.tgz) input file, got '{}'",
            path
        ));
    }

    // Whole-file read shared by the zip, tgz and uncompressed cases.
    let read_raw =
        || io_utils::read_file(path).map_err(|e| format!("failed to read '{}': {}", path, e));
    // Archive entries are themselves zstd-compressed Parquet batches.
    let unzst_entries = |entries: Vec<Vec<u8>>| -> Result<Vec<Vec<u8>>, String> {
        entries
            .into_iter()
            .map(|entry| {
                io_utils::unzst_content(&entry)
                    .map_err(|e| format!("failed to decompress entry in '{}': {}", path, e))
            })
            .collect()
    };

    let parquet_batches: Vec<Vec<u8>> = match compress.as_str() {
        "zip" => {
            let raw = read_raw()?;
            let entries = io_utils::read_zip_entries(&raw)
                .map_err(|e| format!("failed to read zip '{}': {}", path, e))?;
            unzst_entries(entries)?
        }
        "tgz" => {
            let raw = read_raw()?;
            let entries = io_utils::read_tar_gz_entries(&raw)
                .map_err(|e| format!("failed to read tgz '{}': {}", path, e))?;
            unzst_entries(entries)?
        }
        "zst" => {
            let bytes = io_utils::read_zst_file(path)
                .map_err(|e| format!("failed to read zstd-compressed '{}': {}", path, e))?;
            vec![bytes]
        }
        _ => vec![read_raw()?],
    };

    let mut records = Vec::new();
    for batch in &parquet_batches {
        let parsed = crate::read_parquet(batch)
            .map_err(|e| format!("failed to parse Parquet '{}': {}", path, e))?;
        records.extend(parsed);
    }
    Ok(records)
}
fn load_crossref_list_from_file(path: &str) -> Result<Vec<Data>, String> {
let content =
std::fs::read_to_string(path).map_err(|e| format!("failed to read '{}': {}", path, e))?;
if path.ends_with(".jsonl") || path.ends_with(".jsonlines") {
return parse_crossref_jsonlines(&content);
}
let value: serde_json::Value =
serde_json::from_str(&content).map_err(|e| format!("invalid JSON in '{}': {}", path, e))?;
let mut out: Vec<Data> = Vec::new();
if let Some(items) = value
.get("items")
.and_then(serde_json::Value::as_array)
.or_else(|| {
value
.get("message")
.and_then(|m| m.get("items"))
.and_then(serde_json::Value::as_array)
})
{
for item in items {
out.push(convert_crossref_item(item)?);
}
return Ok(out);
}
if let Some(items) = value.as_array() {
for item in items {
out.push(convert_crossref_item(item)?);
}
return Ok(out);
}
Err(
"unsupported Crossref list file format; expected JSON array, {items:[...]}, or JSON Lines"
.to_string(),
)
}
/// Converts JSON Lines content, one Crossref record per line, to commonmeta.
///
/// Lines that are empty after trimming are skipped. A parse error reports the
/// 1-based line number, counting skipped lines too.
fn parse_crossref_jsonlines(content: &str) -> Result<Vec<Data>, String> {
    content
        .lines()
        .enumerate()
        .map(|(index, line)| (index + 1, line.trim()))
        .filter(|(_, record)| !record.is_empty())
        .map(|(line_number, record)| -> Result<Data, String> {
            let value: serde_json::Value = serde_json::from_str(record)
                .map_err(|e| format!("invalid JSON at line {}: {}", line_number, e))?;
            convert_crossref_item(&value)
        })
        .collect()
}
/// Converts a single Crossref work to a commonmeta record.
///
/// The item is wrapped in a `{"message": ...}` envelope, serialized, run
/// through `crate::convert("crossref", "commonmeta", ...)`, and the resulting
/// bytes are deserialized into [`Data`].
fn convert_crossref_item(item: &serde_json::Value) -> Result<Data, String> {
    let input =
        serde_json::to_string(&json!({ "message": item })).map_err(|e| e.to_string())?;
    crate::convert("crossref", "commonmeta", &input)
        .map_err(|e| format!("crossref conversion failed: {}", e))
        .and_then(|bytes| {
            serde_json::from_slice::<Data>(&bytes)
                .map_err(|e| format!("failed to parse output JSON: {}", e))
        })
}
fn load_datacite_list_from_file(path: &str) -> Result<Vec<Data>, String> {
let content =
std::fs::read_to_string(path).map_err(|e| format!("failed to read '{}': {}", path, e))?;
if path.ends_with(".jsonl") || path.ends_with(".jsonlines") {
let mut out: Vec<Data> = Vec::new();
for (index, line) in content.lines().enumerate() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let value: serde_json::Value = serde_json::from_str(trimmed)
.map_err(|e| format!("invalid JSON at line {}: {}", index + 1, e))?;
out.push(convert_datacite_item(&value)?);
}
return Ok(out);
}
let value: serde_json::Value =
serde_json::from_str(&content).map_err(|e| format!("invalid JSON in '{}': {}", path, e))?;
let mut out: Vec<Data> = Vec::new();
if let Some(items) = value.get("data").and_then(serde_json::Value::as_array) {
for item in items {
out.push(convert_datacite_item(item)?);
}
return Ok(out);
}
if let Some(items) = value.as_array() {
for item in items {
out.push(convert_datacite_item(item)?);
}
return Ok(out);
}
Err(
"unsupported DataCite list file format; expected JSON array, {data:[...]}, or JSON Lines"
.to_string(),
)
}
/// Converts a single DataCite record to a commonmeta record.
///
/// An item that already has a `data` key is used as-is; any other item is
/// wrapped in a `{"data": ...}` envelope. The envelope is serialized, run
/// through `crate::convert("datacite", "commonmeta", ...)`, and the resulting
/// bytes are deserialized into [`Data`].
fn convert_datacite_item(item: &serde_json::Value) -> Result<Data, String> {
    let envelope = match item.get("data") {
        Some(_) => item.clone(),
        None => json!({ "data": item }),
    };
    let input = serde_json::to_string(&envelope).map_err(|e| e.to_string())?;
    crate::convert("datacite", "commonmeta", &input)
        .map_err(|e| format!("datacite conversion failed: {}", e))
        .and_then(|bytes| {
            serde_json::from_slice::<Data>(&bytes)
                .map_err(|e| format!("failed to parse output JSON: {}", e))
        })
}
fn load_openalex_list_from_file(path: &str) -> Result<Vec<Data>, String> {
let content =
std::fs::read_to_string(path).map_err(|e| format!("failed to read '{}': {}", path, e))?;
if path.ends_with(".jsonl") || path.ends_with(".jsonlines") {
let mut out: Vec<Data> = Vec::new();
for (index, line) in content.lines().enumerate() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let value: serde_json::Value = serde_json::from_str(trimmed)
.map_err(|e| format!("invalid JSON at line {}: {}", index + 1, e))?;
out.push(convert_openalex_item(&value)?);
}
return Ok(out);
}
let value: serde_json::Value =
serde_json::from_str(&content).map_err(|e| format!("invalid JSON in '{}': {}", path, e))?;
let mut out: Vec<Data> = Vec::new();
if let Some(items) = value
.get("results")
.and_then(serde_json::Value::as_array)
.or_else(|| value.as_array())
{
for item in items {
out.push(convert_openalex_item(item)?);
}
return Ok(out);
}
Err("unsupported OpenAlex list file format; expected JSON array, {results:[...]}, or JSON Lines".to_string())
}
/// Converts a single OpenAlex work to a commonmeta record.
///
/// The item is serialized without any envelope, run through
/// `crate::convert("openalex", "commonmeta", ...)`, and the resulting bytes
/// are deserialized into [`Data`].
fn convert_openalex_item(item: &serde_json::Value) -> Result<Data, String> {
    let input = serde_json::to_string(item).map_err(|e| e.to_string())?;
    crate::convert("openalex", "commonmeta", &input)
        .map_err(|e| format!("openalex conversion failed: {}", e))
        .and_then(|bytes| {
            serde_json::from_slice::<Data>(&bytes)
                .map_err(|e| format!("failed to parse output JSON: {}", e))
        })
}
/// Loads records from a VRAIX SQLite dump, either a local file or the dated
/// dump downloaded from `metadata.vraix.org`.
///
/// * `date` – dump date used to build the download URL and cache key; it is
///   not used when `input_path` is given.
/// * `input_path` – optional local dump. A `zst` compression suffix causes
///   the file to be decompressed into a temporary `.sqlite3` file first.
/// * `from` – source name passed to `crate::read_vraix_sqlite` and used in the
///   remote file name.
/// * `matches` – supplies `--number` (default 10; `0` means no limit) and
///   `--page` (default 1), from which the row limit and offset are derived.
/// * `timers` – when true, download and read durations are printed to stderr.
///
/// Temporary files are removed before returning, including when writing,
/// decompressing or reading fails.
///
/// # Errors
///
/// Returns a message when the input cannot be read, downloaded, decompressed
/// or written, when the temporary path is not valid UTF-8, or when the
/// database cannot be read.
fn load_vraix_list_for_date(
    date: &str,
    input_path: Option<&str>,
    from: &str,
    matches: &ArgMatches,
    timers: bool,
) -> Result<Vec<Data>, String> {
    /// `read_vraix_sqlite` takes a `&str`, so the temp path must be UTF-8.
    /// The temp directory can come from an environment variable, so this is
    /// reported as an error rather than treated as an invariant.
    fn utf8_path(path: &std::path::Path) -> Result<&str, String> {
        path.to_str()
            .ok_or_else(|| format!("temp path '{}' is not valid UTF-8", path.display()))
    }

    let number = *matches.get_one::<usize>("number").unwrap_or(&10);
    let page = *matches.get_one::<usize>("page").unwrap_or(&1);
    let offset = page.saturating_sub(1).saturating_mul(number);
    let limit = if number == 0 { None } else { Some(number) };

    let report_read = |started: Instant, count: usize| {
        if timers {
            eprintln!(
                "list: read to commonmeta took {:.2?} ({} records)",
                started.elapsed(),
                count
            );
        }
    };

    if let Some(path) = input_path {
        let convert_start = Instant::now();
        let (_base, _extension, compress) = io_utils::get_extension(path, ".json");
        let data = if compress == "zst" {
            let decompressed = io_utils::read_zst_file(path)
                .map_err(|e| format!("failed to read zstd-compressed '{}': {}", path, e))?;
            let tmp_path = temp_dir().join(format!(
                "commonmeta-vraix-local-{}-{}.sqlite3",
                from,
                std::process::id()
            ));
            // Validate the path before anything is written to disk.
            let tmp_str = utf8_path(&tmp_path)?;
            if let Err(e) = io_utils::write_file(&tmp_path, &decompressed) {
                // A failed write may leave a partial file behind.
                std::fs::remove_file(&tmp_path).ok();
                return Err(format!(
                    "failed to write temp file '{}': {}",
                    tmp_path.display(),
                    e
                ));
            }
            let result = crate::read_vraix_sqlite(tmp_str, from, limit, offset);
            std::fs::remove_file(&tmp_path).ok();
            result.map_err(|e| e.to_string())?
        } else {
            crate::read_vraix_sqlite(path, from, limit, offset).map_err(|e| e.to_string())?
        };
        report_read(convert_start, data.len());
        return Ok(data);
    }

    let url = format!("https://metadata.vraix.org/{}-{}.sqlite3.zst", from, date);
    let cache_key = format!("{}-{}.sqlite3.zst", from, date);
    let download_start = Instant::now();
    let (zst_path, from_cache) =
        io_utils::ensure_cached_path(&url, "vraix", &cache_key, VRAIX_CACHE_TTL)
            .map_err(|e| format!("failed to download '{}': {}", url, e))?;
    if timers {
        let label = if from_cache { ", from local cache" } else { "" };
        eprintln!(
            "list: download took {:.2?}{}",
            download_start.elapsed(),
            label,
        );
    }

    let convert_start = Instant::now();
    let tmp_path = temp_dir().join(format!(
        "commonmeta-vraix-{}-{}-{}.sqlite3",
        from,
        date,
        std::process::id()
    ));
    // Validate the path before anything is written to disk.
    let tmp_str = utf8_path(&tmp_path)?;
    if let Err(e) = io_utils::decompress_zst_file(&zst_path, &tmp_path) {
        // A failed decompression may leave a partial file behind.
        std::fs::remove_file(&tmp_path).ok();
        return Err(format!("failed to decompress '{}': {}", url, e));
    }
    let result = crate::read_vraix_sqlite(tmp_str, from, limit, offset);
    std::fs::remove_file(&tmp_path).ok();
    let data = result.map_err(|e| e.to_string())?;
    report_read(convert_start, data.len());
    Ok(data)
}
// Unit tests for the `list` subcommand: the --no-network guard, the Parquet
// and archive writers, and the Parquet / SQLite / VRAIX loaders.
// Each filesystem test works in its own fixed-name directory under temp_dir()
// and removes it at the end on a best-effort basis.
#[cfg(test)]
mod tests {
use super::*;
// Parses `args` with the `list` command definition; panics if parsing fails.
fn parse_args(args: &[&str]) -> clap::ArgMatches {
command().try_get_matches_from(args).expect("arg parse failed")
}
// With --no-network and no input file, execute() may succeed or fail, but an
// error must not be the "--no-network requires" guard message.
#[test]
fn test_no_network_without_input_falls_back_to_local_db() {
let m = parse_args(&["list", "--no-network", "--from", "crossref"]);
match execute(&m) {
Ok(()) => {}
Err(e) => assert!(
!e.contains("--no-network requires"),
"should fall back to local DB rather than refusing; got: {e}"
),
}
}
// A nonexistent local input file must fail, but not with the
// "--no-network requires" guard message.
#[test]
fn test_no_network_with_local_file_passes_guard() {
let m = parse_args(&["list", "--no-network", "nonexistent.sqlite3"]);
let err = execute(&m).unwrap_err();
assert!(
!err.contains("--no-network requires"),
"should not fail at network guard when input file is given, got: {err}"
);
}
// Builds a JournalArticle record with the given id; every other field takes
// its Data::default() value.
fn sample_data(id: &str) -> Data {
Data {
id: id.to_string(),
type_: "JournalArticle".to_string(),
..Data::default()
}
}
// parquet_batch_path appends ".zst" to a ".parquet" path.
#[test]
fn test_parquet_batch_path_single() {
let path = parquet_batch_path(Path::new("/tmp/out.parquet"));
assert_eq!(path, PathBuf::from("/tmp/out.parquet.zst"));
}
// parquet_batch_path appends ".zst" even when the path has no extension.
#[test]
fn test_parquet_batch_path_no_extension() {
let path = parquet_batch_path(Path::new("/tmp/out"));
assert_eq!(path, PathBuf::from("/tmp/out.zst"));
}
// A zip archive written from two records holds a single entry, "out.json".
#[test]
fn test_write_archive_batches_zip() {
let dir = temp_dir().join("commonmeta_list_archive_zip");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.zip");
let data = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
write_archive_batches(
&data,
"commonmeta",
out_path.to_str().unwrap(),
"zip",
None,
None,
)
.unwrap();
assert!(out_path.exists());
let mut archive = zip::ZipArchive::new(std::fs::File::open(&out_path).unwrap()).unwrap();
assert_eq!(archive.len(), 1);
assert_eq!(archive.by_index(0).unwrap().name(), "out.json");
std::fs::remove_dir_all(&dir).ok();
}
// For an "out.tgz" target (no inner format extension) the single tar entry is
// named "out.json".
#[test]
fn test_write_archive_batches_tgz_no_inner_extension() {
let dir = temp_dir().join("commonmeta_list_archive_tgz");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.tgz");
let data = vec![sample_data("https://doi.org/10.1/a")];
write_archive_batches(
&data,
"commonmeta",
out_path.to_str().unwrap(),
"tgz",
None,
None,
)
.unwrap();
assert!(out_path.exists());
let decoder = flate2::read::GzDecoder::new(std::fs::File::open(&out_path).unwrap());
let mut archive = tar::Archive::new(decoder);
let entries: Vec<String> = archive
.entries()
.unwrap()
.map(|e| e.unwrap().path().unwrap().to_string_lossy().to_string())
.collect();
assert_eq!(entries, vec!["out.json"]);
std::fs::remove_dir_all(&dir).ok();
}
// Writing an archive from an empty record slice is an error.
#[test]
fn test_write_archive_batches_empty_data_errors() {
let result =
write_archive_batches(&[], "commonmeta", "/tmp/whatever.zip", "zip", None, None);
assert!(result.is_err());
}
// Two records produce "out.parquet.zst" and no numbered batch file.
#[test]
fn test_write_parquet_batches_single_batch() {
let dir = temp_dir().join("commonmeta_list_parquet_single");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.parquet");
let data = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
write_parquet_batches(&data, out_path.to_str().unwrap()).unwrap();
let zst_path = dir.join("out.parquet.zst");
assert!(zst_path.exists());
assert!(!dir.join("out-00000.parquet.zst").exists());
std::fs::remove_dir_all(&dir).ok();
}
// Five records likewise produce one "out.parquet.zst" and no numbered batch
// file.
#[test]
fn test_write_parquet_batches_large_list_still_one_file() {
let dir = temp_dir().join("commonmeta_list_parquet_large");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.parquet");
let data: Vec<Data> = (0..5)
.map(|i| sample_data(&format!("https://doi.org/10.1/{i}")))
.collect();
write_parquet_batches(&data, out_path.to_str().unwrap()).unwrap();
assert!(dir.join("out.parquet.zst").exists());
assert!(!dir.join("out-00000.parquet.zst").exists());
std::fs::remove_dir_all(&dir).ok();
}
// A ".parquet.zip" archive of two records holds one entry, "out.parquet.zst".
#[test]
fn test_write_parquet_archive_zip_single_batch() {
let dir = temp_dir().join("commonmeta_list_parquet_archive_zip");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.parquet.zip");
let data = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
write_parquet_archive(&data, out_path.to_str().unwrap(), "zip").unwrap();
assert!(out_path.exists());
let mut archive = zip::ZipArchive::new(std::fs::File::open(&out_path).unwrap()).unwrap();
assert_eq!(archive.len(), 1);
assert_eq!(archive.by_index(0).unwrap().name(), "out.parquet.zst");
std::fs::remove_dir_all(&dir).ok();
}
// Checks parquet_archive_entry's entry name and non-empty payload, then that a
// ".parquet.tgz" archive holds a single "out.parquet.zst" entry.
#[test]
fn test_write_parquet_archive_tgz_single_entry() {
let (name, bytes) =
parquet_archive_entry(&[sample_data("https://doi.org/10.1/a")], "out.parquet").unwrap();
assert_eq!(name, "out.parquet.zst");
assert!(!bytes.is_empty());
let dir = temp_dir().join("commonmeta_list_parquet_archive_tgz");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.parquet.tgz");
let data = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
write_parquet_archive(&data, out_path.to_str().unwrap(), "tgz").unwrap();
let decoder = flate2::read::GzDecoder::new(std::fs::File::open(&out_path).unwrap());
let mut archive = tar::Archive::new(decoder);
let entries: Vec<String> = archive
.entries()
.unwrap()
.map(|e| e.unwrap().path().unwrap().to_string_lossy().to_string())
.collect();
assert_eq!(entries, vec!["out.parquet.zst"]);
std::fs::remove_dir_all(&dir).ok();
}
// Writing a Parquet archive from an empty record slice is an error.
#[test]
fn test_write_parquet_archive_empty_data_errors() {
let result = write_parquet_archive(&[], "/tmp/whatever.parquet.zip", "zip");
assert!(result.is_err());
}
// Round trip: records written by write_parquet_batches are read back from the
// resulting ".parquet.zst" file in the same order.
#[test]
fn test_load_commonmeta_list_from_parquet_zst() {
let dir = temp_dir().join("commonmeta_list_load_parquet_zst");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("batch-commonmeta.parquet");
let data = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
write_parquet_batches(&data, out_path.to_str().unwrap()).unwrap();
let zst_path = dir.join("batch-commonmeta.parquet.zst");
let loaded = load_commonmeta_list_from_parquet(zst_path.to_str().unwrap()).unwrap();
assert_eq!(loaded.len(), 2);
assert_eq!(loaded[0].id, "https://doi.org/10.1/a");
assert_eq!(loaded[1].id, "https://doi.org/10.1/b");
std::fs::remove_dir_all(&dir).ok();
}
// Round trip through a ".parquet.zip" archive preserves count and order.
#[test]
fn test_load_commonmeta_list_from_parquet_zip_round_trip() {
let dir = temp_dir().join("commonmeta_list_load_parquet_zip");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.parquet.zip");
let data = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
write_parquet_archive(&data, out_path.to_str().unwrap(), "zip").unwrap();
let loaded = load_commonmeta_list_from_parquet(out_path.to_str().unwrap()).unwrap();
assert_eq!(loaded.len(), 2);
assert_eq!(loaded[0].id, "https://doi.org/10.1/a");
assert_eq!(loaded[1].id, "https://doi.org/10.1/b");
std::fs::remove_dir_all(&dir).ok();
}
// Round trip through a ".parquet.tgz" archive preserves count and order.
#[test]
fn test_load_commonmeta_list_from_parquet_tgz_round_trip() {
let dir = temp_dir().join("commonmeta_list_load_parquet_tgz");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.parquet.tgz");
let data = vec![
sample_data("https://doi.org/10.1/a"),
sample_data("https://doi.org/10.1/b"),
];
write_parquet_archive(&data, out_path.to_str().unwrap(), "tgz").unwrap();
let loaded = load_commonmeta_list_from_parquet(out_path.to_str().unwrap()).unwrap();
assert_eq!(loaded.len(), 2);
assert_eq!(loaded[0].id, "https://doi.org/10.1/a");
assert_eq!(loaded[1].id, "https://doi.org/10.1/b");
std::fs::remove_dir_all(&dir).ok();
}
// A zip holding two hand-built, numbered batch entries is loaded as the
// concatenation of both batches, in entry order.
#[test]
fn test_load_commonmeta_list_from_parquet_zip_multi_batch_round_trip() {
let dir = temp_dir().join("commonmeta_list_load_parquet_zip_multi");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.parquet.zip");
let chunk_a = vec![sample_data("https://doi.org/10.1/a")];
let chunk_b = vec![sample_data("https://doi.org/10.1/b")];
let mut entry_a = parquet_archive_entry(&chunk_a, "out.parquet").unwrap();
let entry_b = parquet_archive_entry(&chunk_b, "out.parquet").unwrap();
// Rename the entries to the numbered multi-batch names.
entry_a.0 = "out-00000.parquet.zst".to_string();
let entry_b = ("out-00001.parquet.zst".to_string(), entry_b.1);
io_utils::write_zip_archive(&out_path, &[entry_a, entry_b]).unwrap();
let loaded = load_commonmeta_list_from_parquet(out_path.to_str().unwrap()).unwrap();
assert_eq!(loaded.len(), 2);
assert_eq!(loaded[0].id, "https://doi.org/10.1/a");
assert_eq!(loaded[1].id, "https://doi.org/10.1/b");
std::fs::remove_dir_all(&dir).ok();
}
// A plain ".parquet" file written with crate::write_parquet is readable
// without any decompression step.
#[test]
fn test_load_commonmeta_list_from_parquet_uncompressed() {
let dir = temp_dir().join("commonmeta_list_load_parquet_plain");
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("data.parquet");
let bytes = crate::write_parquet(&[sample_data("https://doi.org/10.1/a")]).unwrap();
std::fs::write(&path, bytes).unwrap();
let loaded = load_commonmeta_list_from_parquet(path.to_str().unwrap()).unwrap();
assert_eq!(loaded.len(), 1);
assert_eq!(loaded[0].id, "https://doi.org/10.1/a");
std::fs::remove_dir_all(&dir).ok();
}
// A ".json" path is rejected by the Parquet loader.
#[test]
fn test_load_commonmeta_list_from_parquet_wrong_extension() {
let result = load_commonmeta_list_from_parquet("/tmp/whatever.json");
assert!(result.is_err());
}
// Writing Parquet batches from an empty record slice is an error.
#[test]
fn test_write_parquet_batches_empty_data_errors() {
let result = write_parquet_batches(&[], "/tmp/whatever.parquet");
assert!(result.is_err());
}
// The written ".parquet.zst" decompresses to a Parquet file whose metadata
// reports exactly one row.
#[test]
fn test_write_parquet_batch_roundtrip_readable() {
use parquet::file::reader::{FileReader, SerializedFileReader};
let dir = temp_dir().join("commonmeta_list_parquet_readable");
std::fs::create_dir_all(&dir).unwrap();
let out_path = dir.join("out.parquet");
let data = vec![sample_data("https://doi.org/10.1/a")];
write_parquet_batches(&data, out_path.to_str().unwrap()).unwrap();
let zst_path = dir.join("out.parquet.zst");
let compressed = std::fs::read(&zst_path).unwrap();
let decompressed = zstd::stream::decode_all(std::io::Cursor::new(compressed)).unwrap();
let reader = SerializedFileReader::new(bytes::Bytes::from(decompressed)).unwrap();
assert_eq!(reader.metadata().file_metadata().num_rows(), 1);
std::fs::remove_dir_all(&dir).ok();
}
// Creates a fresh SQLite file at `path` (removing any existing one) with a
// `works` table and one row per (pid, raw_metadata) pair; source_id is always 1.
fn write_vraix_sqlite(path: &Path, rows: &[(&str, &str)]) {
std::fs::remove_file(path).ok();
let conn = rusqlite::Connection::open(path).unwrap();
conn.execute_batch("CREATE TABLE works (pid TEXT, source_id INTEGER, raw_metadata TEXT);")
.unwrap();
for (pid, raw_metadata) in rows {
conn.execute(
"INSERT INTO works (pid, source_id, raw_metadata) VALUES (?1, ?2, ?3)",
rusqlite::params![*pid, 1i64, *raw_metadata],
)
.unwrap();
}
// Best-effort checkpoint; the result is deliberately ignored.
let _ = conn.execute("PRAGMA wal_checkpoint(TRUNCATE)", []);
}
// When a local input path is supplied, records come from that file and the
// DataCite row is converted to a record with a doi.org id.
#[test]
fn test_load_vraix_list_for_date_uses_local_input_path() {
let dir = temp_dir().join("commonmeta_list_vraix_local_date");
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("datacite.sqlite3");
write_vraix_sqlite(
&path,
&[(
"10.5678/b",
r#"{"data":{"id":"10.5678/b","attributes":{"doi":"10.5678/b","types":{"resourceTypeGeneral":"Dataset"},"titles":[{"title":"Test"}]}}}"#,
)],
);
let matches = command().get_matches_from(vec!["list", "--number", "10", "--page", "1"]);
let data = load_vraix_list_for_date(
"2026-06-14",
Some(path.to_str().unwrap()),
"datacite",
&matches,
false,
)
.unwrap();
assert_eq!(data.len(), 1);
assert_eq!(data[0].id, "https://doi.org/10.5678/b");
std::fs::remove_dir_all(&dir).ok();
}
// --date combined with --from openalex is rejected with a message naming the
// supported sources.
#[test]
fn test_execute_date_requires_crossref_or_datacite_from() {
let matches =
command().get_matches_from(vec!["list", "--from", "openalex", "--date", "2026-06-14"]);
let err = execute(&matches).unwrap_err();
assert!(err.contains("requires --from crossref or --from datacite"));
}
// "citation" is an accepted output format.
#[test]
fn test_is_supported_output_format_includes_citation() {
assert!(is_supported_output_format("citation"));
}
// Citation output rendered with no style differs from the output rendered
// with "chicago-author-date", so the style argument takes effect.
#[test]
fn test_write_output_renders_citation_with_style() {
let mut data = sample_data("https://doi.org/10.1/a");
data.title = "A Title".to_string();
data.date_published = "2020".to_string();
let apa = write_output(&[data.clone()], "citation", None, None).unwrap();
let chicago = write_output(&[data], "citation", Some("chicago-author-date"), None).unwrap();
assert_ne!(apa, chicago);
}
// A local ".sqlite3.zst" input is decompressed and then read like a plain
// SQLite dump.
#[test]
fn test_load_vraix_list_for_date_decompresses_local_zst_input() {
let dir = temp_dir().join("commonmeta_list_vraix_local_zst");
std::fs::create_dir_all(&dir).unwrap();
let sqlite_path = dir.join("datacite.sqlite3");
write_vraix_sqlite(
&sqlite_path,
&[(
"10.5678/b",
r#"{"data":{"id":"10.5678/b","attributes":{"doi":"10.5678/b","types":{"resourceTypeGeneral":"Dataset"},"titles":[{"title":"Test"}]}}}"#,
)],
);
let raw = std::fs::read(&sqlite_path).unwrap();
let compressed = zstd::stream::encode_all(std::io::Cursor::new(raw), 0).unwrap();
let zst_path = dir.join("datacite.sqlite3.zst");
std::fs::write(&zst_path, compressed).unwrap();
let matches = command().get_matches_from(vec!["list", "--number", "10", "--page", "1"]);
let data = load_vraix_list_for_date(
"2026-06-14",
Some(zst_path.to_str().unwrap()),
"datacite",
&matches,
false,
)
.unwrap();
assert_eq!(data.len(), 1);
assert_eq!(data[0].id, "https://doi.org/10.5678/b");
std::fs::remove_dir_all(&dir).ok();
}
// execute() succeeds on a ".sqlite3" positional input even though no --date
// is given.
#[test]
fn test_execute_auto_detects_sqlite_input_without_date() {
let dir = temp_dir().join("commonmeta_list_execute_sqlite_no_date");
std::fs::create_dir_all(&dir).unwrap();
let sqlite_path = dir.join("datacite.sqlite3");
write_vraix_sqlite(
&sqlite_path,
&[(
"10.5678/b",
r#"{"data":{"id":"10.5678/b","attributes":{"doi":"10.5678/b","types":{"resourceTypeGeneral":"Dataset"},"titles":[{"title":"Test"}]}}}"#,
)],
);
let matches = command().get_matches_from(vec![
"list",
sqlite_path.to_str().unwrap(),
"--from",
"datacite",
]);
let result = execute(&matches);
assert!(result.is_ok(), "expected success, got: {:?}", result);
std::fs::remove_dir_all(&dir).ok();
}
// Same as above, but for a zstd-compressed ".sqlite3.zst" positional input.
#[test]
fn test_execute_auto_detects_sqlite_zst_input_without_date() {
let dir = temp_dir().join("commonmeta_list_execute_sqlite_zst_no_date");
std::fs::create_dir_all(&dir).unwrap();
let sqlite_path = dir.join("datacite.sqlite3");
write_vraix_sqlite(
&sqlite_path,
&[(
"10.5678/b",
r#"{"data":{"id":"10.5678/b","attributes":{"doi":"10.5678/b","types":{"resourceTypeGeneral":"Dataset"},"titles":[{"title":"Test"}]}}}"#,
)],
);
let raw = std::fs::read(&sqlite_path).unwrap();
let compressed = zstd::stream::encode_all(std::io::Cursor::new(raw), 0).unwrap();
let zst_path = dir.join("datacite.sqlite3.zst");
std::fs::write(&zst_path, compressed).unwrap();
let matches = command().get_matches_from(vec![
"list",
zst_path.to_str().unwrap(),
"--from",
"datacite",
]);
let result = execute(&matches);
assert!(result.is_ok(), "expected success, got: {:?}", result);
std::fs::remove_dir_all(&dir).ok();
}
}
fn execute_ror_list(
matches: &ArgMatches,
to: &str,
out_file: Option<&String>,
style: Option<&str>,
locale: Option<&str>,
) -> Result<(), String> {
if !matches!(to, "commonmeta" | "ror") {
return Err(format!(
"list --from ror: --to {} is not supported (use commonmeta or ror)",
to
));
}
let number = *matches.get_one::<usize>("number").unwrap_or(&10);
let page = *matches.get_one::<usize>("page").unwrap_or(&1);
let sample = matches.get_flag("sample");
let country = matches.get_one::<String>("country").map(String::as_str);
let query = matches.get_one::<String>("input").map(String::as_str);
let db_path_str = crate::cmd::resolve_db_path(None);
let db_path = std::path::Path::new(&db_path_str);
if !db_path.exists() {
return Err(format!(
"list --from ror: no local ROR database found at '{}'; \
run 'commonmeta import --from ror' first",
db_path_str
));
}
let limit = if number == 0 { None } else { Some(number) };
let offset = if sample {
0
} else {
page.saturating_sub(1).saturating_mul(number)
};
let output: Vec<u8> = if to == "ror" {
let raws: Vec<crate::Ror> = if sample {
crate::sample_ror_sqlite_raw(db_path, limit, country)
.map_err(|e| e.to_string())?
} else {
crate::read_ror_sqlite_raw(db_path, limit, offset, country, query)
.map_err(|e| e.to_string())?
};
if raws.is_empty() {
eprintln!("list --from ror: no records found");
return Ok(());
}
let items: Vec<serde_json::Value> = raws
.into_iter()
.map(|mut ror| -> Result<serde_json::Value, String> {
crate::enrich_ror_locations(&mut ror, db_path);
let bytes = crate::write_ror_v2_json(&ror).map_err(|e| e.to_string())?;
serde_json::from_slice(&bytes).map_err(|e| e.to_string())
})
.collect::<Result<Vec<_>, _>>()?;
serde_json::to_vec_pretty(&items).map_err(|e| e.to_string())?
} else {
let data: Vec<Data> = if sample {
crate::sample_ror_sqlite(db_path, limit, country)
.map_err(|e| e.to_string())?
} else {
crate::read_ror_sqlite(db_path, limit, offset, country, query)
.map_err(|e| e.to_string())?
};
if data.is_empty() {
eprintln!("list --from ror: no records found");
return Ok(());
}
crate::write_list_citation(&data, to, style, locale).map_err(|e| e.to_string())?
};
match out_file {
Some(path) => {
let file = std::path::Path::new(path);
let (_base, _ext, compress) = io_utils::get_extension(path, ".json");
match compress.as_str() {
"zst" => io_utils::write_zst_file(file, &output)
.map_err(|e| format!("failed to write '{}': {}", path, e)),
_ => io_utils::write_file(file, &output)
.map_err(|e| format!("failed to write '{}': {}", path, e)),
}
}
None => {
println!("{}", String::from_utf8_lossy(&output));
Ok(())
}
}
}