1pub mod author_utils;
7pub mod constants;
8pub mod crockford;
9pub mod data;
10pub mod doi_utils;
11pub mod error;
12pub mod file_utils;
13mod formats;
14pub mod progress;
15pub mod schema_utils;
16pub mod spdx;
17pub mod utils;
18pub mod vocabularies;
19
20pub use data::Data;
21pub use error::{Error, Result};
22pub use formats::crossref;
23pub use formats::inveniordm::PushResult;
24pub use formats::ror::AffiliationMatch;
25
26pub const VERSION: &str = env!("CARGO_PKG_VERSION");
27
28pub fn read(from: &str, input: &str) -> Result<Data> {
30 formats::read(from, input)
31}
32
33pub fn convert(from: &str, to: &str, input: &str) -> Result<Vec<u8>> {
35 let data = formats::read(from, input)?;
36 formats::write(to, &data)
37}
38
39pub fn write(to: &str, data: &Data) -> Result<Vec<u8>> {
41 formats::write(to, data)
42}
43
44pub fn write_with_style(
47 to: &str,
48 data: &Data,
49 style: Option<&str>,
50 locale: Option<&str>,
51) -> Result<Vec<u8>> {
52 formats::write_citation(to, data, style, locale)
53}
54
55pub fn write_ror_json(data: &Data) -> Result<Vec<u8>> {
58 formats::ror::write_json(data)
59}
60
61pub fn match_ror_affiliation(affiliation: &str) -> Result<Vec<AffiliationMatch>> {
64 formats::ror::match_affiliation(affiliation)
65}
66
67pub fn convert_citation(
69 from: &str,
70 input: &str,
71 style: Option<&str>,
72 locale: Option<&str>,
73) -> Result<Vec<u8>> {
74 let data = formats::read(from, input)?;
75 formats::write_citation("citation", &data, style, locale)
76}
77
78pub fn write_parquet(list: &[Data]) -> Result<Vec<u8>> {
84 formats::commonmeta::write_parquet_all(list)
85}
86
87pub fn read_parquet(bytes: &[u8]) -> Result<Vec<Data>> {
91 formats::commonmeta::read_parquet_all(bytes)
92}
93
94pub fn write_sqlite(list: &[Data], path: &std::path::Path) -> Result<()> {
99 formats::commonmeta::write_sqlite(list, path)
100}
101
102pub fn upsert_sqlite(list: &[Data], path: &std::path::Path) -> Result<()> {
105 formats::commonmeta::upsert_sqlite(list, path)
106}
107
108pub fn count_sqlite_works(path: &std::path::Path) -> Result<usize> {
111 formats::commonmeta::count_sqlite_works(path)
112}
113
114pub fn read_sqlite_commonmeta(
116 path: &std::path::Path,
117 limit: Option<usize>,
118 offset: usize,
119) -> Result<Vec<Data>> {
120 formats::commonmeta::read_sqlite_commonmeta(path, limit, offset)
121}
122
123pub fn stream_vraix_to_sqlite(
133 input_path: &std::path::Path,
134 from: &str,
135 output_path: &std::path::Path,
136 limit: usize,
137 update: bool,
138) -> Result<usize> {
139 formats::vraix::stream_dump_to_sqlite(input_path, from, output_path, limit, !update)
140}
141
142pub fn stream_pidbox_to_sqlite(
148 input_path: &std::path::Path,
149 output_path: &std::path::Path,
150 limit: usize,
151 update: bool,
152) -> Result<usize> {
153 formats::vraix::stream_pidbox_to_sqlite(input_path, output_path, limit, !update)
154}
155
156pub fn write_list(list: &[Data], to: &str) -> Result<Vec<u8>> {
161 write_list_citation(list, to, None, None)
162}
163
164pub fn write_list_citation(
168 list: &[Data],
169 to: &str,
170 style: Option<&str>,
171 locale: Option<&str>,
172) -> Result<Vec<u8>> {
173 let bar = progress::count_bar("rendering", list.len() as u64);
174
175 if matches!(
176 to,
177 "commonmeta"
178 | "csl"
179 | "datacite"
180 | "inveniordm"
181 | "schemaorg"
182 | "ror"
183 | "citation"
184 | "crossref_xml"
185 ) {
186 let bytes = formats::write_all_citation(to, list, style, locale)?;
187 bar.finish_and_clear();
188 return Ok(bytes);
189 }
190
191 let mut output = String::new();
192 for (idx, item) in list.iter().enumerate() {
193 let rendered = formats::write_citation(to, item, style, locale)?;
194 if idx > 0 {
195 output.push('\n');
196 }
197 output.push_str(&String::from_utf8_lossy(&rendered));
198 bar.inc(1);
199 }
200 bar.finish_and_clear();
201 Ok(output.into_bytes())
202}
203
204pub fn write_archive(
211 list: &[Data],
212 to: &str,
213 base_name: &str,
214 batch_size: usize,
215) -> Result<Vec<(String, Vec<u8>)>> {
216 write_archive_citation(list, to, base_name, batch_size, None, None)
217}
218
219pub fn write_archive_citation(
222 list: &[Data],
223 to: &str,
224 base_name: &str,
225 batch_size: usize,
226 style: Option<&str>,
227 locale: Option<&str>,
228) -> Result<Vec<(String, Vec<u8>)>> {
229 if list.is_empty() {
230 return Err(Error::Serialize("no records to write".to_string()));
231 }
232 let chunks: Vec<&[Data]> = list.chunks(batch_size.max(1)).collect();
233 let multi = chunks.len() > 1;
234
235 let mut entries = Vec::with_capacity(chunks.len());
236 for (idx, chunk) in chunks.into_iter().enumerate() {
237 let bytes = write_list_citation(chunk, to, style, locale)?;
238 let name = batch_entry_name(base_name, if multi { Some(idx) } else { None });
239 entries.push((name, bytes));
240 }
241 Ok(entries)
242}
243
/// Builds the archive entry name for one batch.
///
/// * `idx == None` — returns `base_name` unchanged.
/// * `idx == Some(i)` — inserts `-{i:05}` between the file stem and the
///   extension of the final path component, e.g. `out.json` becomes
///   `out-00002.json` and `out` becomes `out-00002`.
///
/// Any directory prefix in `base_name` is preserved (`dir/out.json` becomes
/// `dir/out-00002.json`), so numbered and unnumbered entries land in the same
/// place.
fn batch_entry_name(base_name: &str, idx: Option<usize>) -> String {
    let i = match idx {
        None => return base_name.to_string(),
        Some(i) => i,
    };

    let path = std::path::Path::new(base_name);

    // Everything in front of the final component. Falls back to no prefix when
    // `base_name` has no file name or does not literally end with it (for
    // example because of a trailing separator).
    let prefix = path
        .file_name()
        .and_then(|name| name.to_str())
        .and_then(|name| base_name.strip_suffix(name))
        .unwrap_or("");

    let stem = path.file_stem().unwrap_or_default().to_string_lossy();

    match path.extension() {
        Some(ext) if !ext.is_empty() => {
            format!("{}{}-{:05}.{}", prefix, stem, i, ext.to_string_lossy())
        }
        _ => format!("{}{}-{:05}", prefix, stem, i),
    }
}
268
269pub fn read_vraix_sqlite(
277 sqlite_path: &str,
278 from: &str,
279 limit: Option<usize>,
280 offset: usize,
281) -> Result<Vec<Data>> {
282 formats::vraix::read_dump(sqlite_path, from, limit, offset)
283}
284
285pub fn write_vraix_table_parquet(sqlite_path: &str, batch_size: usize) -> Result<Vec<u8>> {
294 formats::vraix::write_table_parquet(sqlite_path, batch_size)
295}
296
297pub fn fetch_vraix_dump(
309 from: &str,
310 date: &str,
311 input_path: Option<&str>,
312 limit: Option<usize>,
313 offset: usize,
314 cache_ttl: std::time::Duration,
315) -> Result<Vec<Data>> {
316 if let Some(path) = input_path {
317 return read_vraix_sqlite(path, from, limit, offset);
318 }
319
320 let url = format!("https://metadata.vraix.org/{}-{}.sqlite3.zst", from, date);
321 let cache_key = format!("{}-{}.sqlite3.zst", from, date);
322 let (compressed, _from_cache) =
323 file_utils::download_file_cached(&url, "vraix", &cache_key, cache_ttl)
324 .map_err(|e| Error::Http(format!("failed to download '{}': {}", url, e)))?;
325 let decompressed = file_utils::unzst_content(&compressed)
326 .map_err(|e| Error::Parse(format!("failed to decompress '{}': {}", url, e)))?;
327
328 let tmp_path = std::env::temp_dir().join(format!(
329 "commonmeta-vraix-{}-{}-{}.sqlite3",
330 from,
331 date,
332 std::process::id()
333 ));
334 file_utils::write_file(&tmp_path, &decompressed).map_err(|e| {
335 Error::Parse(format!(
336 "failed to write temp file '{}': {}",
337 tmp_path.display(),
338 e
339 ))
340 })?;
341
342 let result = read_vraix_sqlite(tmp_path.to_str().unwrap(), from, limit, offset);
343 std::fs::remove_file(&tmp_path).ok();
344 result
345}
346
347pub fn push_inveniordm(list: &[Data], host: &str, token: &str) -> Vec<PushResult> {
353 formats::inveniordm::upsert_all(list, host, token)
354}
355
356pub fn put_inveniordm(data: &Data, host: &str, token: &str) -> PushResult {
362 formats::inveniordm::upsert(data, host, token)
363}
364
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal `JournalArticle` record with the given identifier;
    /// every other field keeps its `Default` value.
    fn sample_data(id: &str) -> Data {
        Data {
            id: id.to_string(),
            type_: "JournalArticle".to_string(),
            ..Data::default()
        }
    }

    /// Batch formats such as `commonmeta` come back as one JSON array.
    #[test]
    fn test_write_list_json_array_formats() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
        ];
        let bytes = write_list(&list, "commonmeta").unwrap();
        let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(value.as_array().unwrap().len(), 2);
    }

    /// Non-batch formats are rendered per record and joined with newlines.
    #[test]
    fn test_write_list_newline_joined_formats() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
        ];
        let bytes = write_list(&list, "ris").unwrap();
        let text = String::from_utf8(bytes).unwrap();
        // RIS tags are two letters followed by TWO spaces and a hyphen
        // ("TY  - JOUR"); each record contributes exactly one TY line.
        assert_eq!(text.lines().filter(|l| l.starts_with("TY  -")).count(), 2);
    }

    /// Crossref XML puts all records into a single `<doi_batch>` envelope.
    #[test]
    fn test_write_list_crossref_xml_batches_into_one_doi_batch() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
        ];
        let bytes = write_list(&list, "crossref_xml").unwrap();
        let text = String::from_utf8(bytes).unwrap();
        assert_eq!(text.matches("<doi_batch xmlns=").count(), 1);
        assert_eq!(text.matches("<journal_article").count(), 2);
    }

    /// `ror` goes through the batch writer and yields one JSON array.
    #[test]
    fn test_write_list_ror_uses_json_array_batch_writer() {
        let mut a = sample_data("https://ror.org/0342dzm54");
        a.title = "Org A".to_string();
        let mut b = sample_data("https://ror.org/0521rfr06");
        b.title = "Org B".to_string();

        let bytes = write_list(&[a, b], "ror").unwrap();
        let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(value.as_array().unwrap().len(), 2);
    }

    /// The `citation` target renders one line per record, in input order.
    #[test]
    fn test_write_list_citation_renders_each_record() {
        let mut a = sample_data("https://doi.org/10.1/a");
        a.title = "Title A".to_string();
        a.date_published = "2020".to_string();
        let mut b = sample_data("https://doi.org/10.1/b");
        b.title = "Title B".to_string();
        b.date_published = "2021".to_string();

        let bytes = write_list(&[a, b], "citation").unwrap();
        let text = String::from_utf8(bytes).unwrap();
        let lines: Vec<&str> = text.lines().collect();
        assert_eq!(lines.len(), 2);
        assert!(lines[0].contains("Title A"));
        assert!(lines[1].contains("Title B"));
    }

    /// An explicit citation style must change the output relative to the
    /// default style.
    #[test]
    fn test_write_list_citation_respects_style() {
        let mut a = sample_data("https://doi.org/10.1/a");
        a.title = "Title A".to_string();
        a.date_published = "2020".to_string();

        let apa = write_list_citation(&[a.clone()], "citation", None, None).unwrap();
        let chicago =
            write_list_citation(&[a], "citation", Some("chicago-author-date"), None).unwrap();
        assert_ne!(apa, chicago);
    }

    /// A list that fits in one batch keeps the base name untouched.
    #[test]
    fn test_write_archive_single_batch_uses_base_name() {
        let list = vec![sample_data("https://doi.org/10.1/a")];
        let entries = write_archive(&list, "commonmeta", "out.json", 100_000).unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].0, "out.json");
    }

    /// Multiple batches get a zero-padded index before the extension.
    #[test]
    fn test_write_archive_numbered_batches() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
            sample_data("https://doi.org/10.1/c"),
        ];
        let entries = write_archive(&list, "commonmeta", "out.json", 1).unwrap();
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].0, "out-00000.json");
        assert_eq!(entries[1].0, "out-00001.json");
        assert_eq!(entries[2].0, "out-00002.json");
    }

    /// Base names without an extension get the index appended directly.
    #[test]
    fn test_write_archive_no_extension_base_name() {
        let list = vec![
            sample_data("https://doi.org/10.1/a"),
            sample_data("https://doi.org/10.1/b"),
        ];
        let entries = write_archive(&list, "commonmeta", "out", 1).unwrap();
        assert_eq!(entries[0].0, "out-00000");
        assert_eq!(entries[1].0, "out-00001");
    }

    /// An empty record list is rejected instead of producing an empty archive.
    #[test]
    fn test_write_archive_empty_list_errors() {
        assert!(write_archive(&[], "commonmeta", "out.json", 100_000).is_err());
    }

    /// With `input_path` set, the dump is read from the local SQLite file and
    /// no download is attempted.
    #[test]
    fn test_fetch_vraix_dump_uses_local_input_path_without_network() {
        let dir = std::env::temp_dir().join("commonmeta_lib_fetch_vraix_dump");
        std::fs::create_dir_all(&dir).unwrap();
        let path = dir.join("datacite.sqlite3");
        // Start from a clean slate in case an earlier run left the file behind.
        std::fs::remove_file(&path).ok();

        // Build a one-row fixture database using the async libsql client.
        tokio::runtime::Builder::new_multi_thread()
            .enable_all()
            .build()
            .unwrap()
            .block_on(async {
                let db = libsql::Builder::new_local(&path).build().await.unwrap();
                let conn = db.connect().unwrap();
                conn.execute_batch(
                    "CREATE TABLE works (pid TEXT, source_id INTEGER, raw_metadata TEXT);",
                )
                .await
                .unwrap();
                conn.execute(
                    "INSERT INTO works (pid, source_id, raw_metadata) VALUES (?1, ?2, ?3)",
                    libsql::params![
                        "pid-0",
                        1i64,
                        r#"{"data":{"id":"10.5678/b","attributes":{"doi":"10.5678/b"}}}"#
                    ],
                )
                .await
                .unwrap();
            });

        let data = fetch_vraix_dump(
            "datacite",
            "2026-06-14",
            Some(path.to_str().unwrap()),
            None,
            0,
            std::time::Duration::from_secs(30 * 24 * 60 * 60),
        )
        .unwrap();
        assert_eq!(data.len(), 1);
        assert_eq!(data[0].id, "https://doi.org/10.5678/b");

        std::fs::remove_dir_all(&dir).ok();
    }
}