Skip to main content

commonmeta/
lib.rs

1//! commonmeta — a Rust port of front-matter/commonmeta.
2//!
3//! Convert scholarly metadata between formats. The native model is [`Data`];
4//! format modules read into it and write out of it.
5
6pub mod author_utils;
7pub mod constants;
8pub mod crockford;
9pub mod data;
10pub mod doi_utils;
11pub mod error;
12pub mod file_utils;
13mod formats;
14pub mod progress;
15pub mod schema_utils;
16pub mod spdx;
17pub mod utils;
18pub mod vocabularies;
19
20pub use data::Data;
21pub use error::{Error, Result};
22pub use formats::crossref;
23pub use formats::inveniordm::PushResult;
24pub use formats::ror::AffiliationMatch;
25
26pub const VERSION: &str = env!("CARGO_PKG_VERSION");
27
28/// Read a single record from `from` format, without writing it back out.
29pub fn read(from: &str, input: &str) -> Result<Data> {
30    formats::read(from, input)
31}
32
33/// Read from one format and write to another in a single call.
34pub fn convert(from: &str, to: &str, input: &str) -> Result<Vec<u8>> {
35    let data = formats::read(from, input)?;
36    formats::write(to, &data)
37}
38
39/// Write an already-loaded record to `to` format.
40pub fn write(to: &str, data: &Data) -> Result<Vec<u8>> {
41    formats::write(to, data)
42}
43
44/// Like [`write`], but forwards `style` and `locale` to the citation writer.
45/// For non-`"citation"` formats both parameters are ignored.
46pub fn write_with_style(
47    to: &str,
48    data: &Data,
49    style: Option<&str>,
50    locale: Option<&str>,
51) -> Result<Vec<u8>> {
52    formats::write_citation(to, data, style, locale)
53}
54
55/// Write a ROR-derived record as raw ROR-shaped JSON (as opposed to
56/// `write("ror", data)`, which produces InvenioRDM vocabulary YAML).
57pub fn write_ror_json(data: &Data) -> Result<Vec<u8>> {
58    formats::ror::write_json(data)
59}
60
61/// Match a free-text affiliation string against ROR organizations using the
62/// ROR v2 affiliation endpoint.
63pub fn match_ror_affiliation(affiliation: &str) -> Result<Vec<AffiliationMatch>> {
64    formats::ror::match_affiliation(affiliation)
65}
66
67/// Like `convert`, but passes CSL `style` and `locale` through to the citation writer.
68pub fn convert_citation(
69    from: &str,
70    input: &str,
71    style: Option<&str>,
72    locale: Option<&str>,
73) -> Result<Vec<u8>> {
74    let data = formats::read(from, input)?;
75    formats::write_citation("citation", &data, style, locale)
76}
77
78/// Write a list of commonmeta records as a single Parquet file. Alongside a
79/// flattened tabular projection of each record's fields (for filtering in
80/// tools like DuckDB without parsing JSON), every row also carries a `json`
81/// column with the record's complete serialization, so [`read_parquet`]
82/// round-trips losslessly.
83pub fn write_parquet(list: &[Data]) -> Result<Vec<u8>> {
84    formats::commonmeta::write_parquet_all(list)
85}
86
87/// Read a list of commonmeta records back from the Parquet schema written by
88/// [`write_parquet`]. Lossless: each record is restored from its `json`
89/// column, the complete original serialization.
90pub fn read_parquet(bytes: &[u8]) -> Result<Vec<Data>> {
91    formats::commonmeta::read_parquet_all(bytes)
92}
93
94/// Write `list` as a SQLite3 database with a `works` table whose columns
95/// mirror the commonmeta v1.0 schema. Simple string fields are stored as
96/// TEXT; complex fields are stored as compact JSON TEXT.
97/// Any existing file at `path` is deleted first.
98pub fn write_sqlite(list: &[Data], path: &std::path::Path) -> Result<()> {
99    formats::commonmeta::write_sqlite(list, path)
100}
101
102/// Like [`write_sqlite`] but opens an existing database instead of recreating
103/// it. Rows whose `id` already exists are replaced; new rows are inserted.
104pub fn upsert_sqlite(list: &[Data], path: &std::path::Path) -> Result<()> {
105    formats::commonmeta::upsert_sqlite(list, path)
106}
107
108/// Return the total number of rows in the `works` table of a commonmeta SQLite
109/// database — useful for reporting the cumulative count after an upsert.
110pub fn count_sqlite_works(path: &std::path::Path) -> Result<usize> {
111    formats::commonmeta::count_sqlite_works(path)
112}
113
114/// Read records from a commonmeta SQLite database written by [`write_sqlite`].
115pub fn read_sqlite_commonmeta(
116    path: &std::path::Path,
117    limit: Option<usize>,
118    offset: usize,
119) -> Result<Vec<Data>> {
120    formats::commonmeta::read_sqlite_commonmeta(path, limit, offset)
121}
122
123/// Stream a VRAIX daily dump at `input_path` directly to a commonmeta SQLite
124/// database at `output_path` in batches of 10 000 rows, converting with
125/// `from`-specific parser and writing each batch in a single transaction.
126/// `limit` caps total records written; pass `0` for all rows.
127/// When `update` is false the output file is deleted and recreated (default).
128/// When `update` is true the existing file is kept and rows are upserted by
129/// their `id` primary key — new rows are inserted, existing rows are replaced.
130/// Returns the number of records written. No `Vec<Data>` is held for the
131/// whole file — peak memory is proportional to one batch, not the whole dump.
132pub fn stream_vraix_to_sqlite(
133    input_path: &std::path::Path,
134    from: &str,
135    output_path: &std::path::Path,
136    limit: usize,
137    update: bool,
138) -> Result<usize> {
139    formats::vraix::stream_dump_to_sqlite(input_path, from, output_path, limit, !update)
140}
141
142/// Stream the pidbox dump (a mixed-source VRAIX SQLite file containing crossref,
143/// datacite, and ROR rows) directly to a commonmeta SQLite database. Each row
144/// is routed to the appropriate parser by its `source_id`; ROR rows are
145/// skipped. When `update` is false the output file is recreated; when true
146/// rows are upserted by `id`. Returns the number of records written.
147pub fn stream_pidbox_to_sqlite(
148    input_path: &std::path::Path,
149    output_path: &std::path::Path,
150    limit: usize,
151    update: bool,
152) -> Result<usize> {
153    formats::vraix::stream_pidbox_to_sqlite(input_path, output_path, limit, !update)
154}
155
156/// Render a list of records to `to` format as a single buffer: a JSON array
157/// for object-shaped formats (`commonmeta`, `csl`, `datacite`, `inveniordm`,
158/// `schemaorg`, `ror`), or newline-joined output for line/document-shaped
159/// formats (e.g. `bibtex`, `ris`, `crossref_xml`).
160pub fn write_list(list: &[Data], to: &str) -> Result<Vec<u8>> {
161    write_list_citation(list, to, None, None)
162}
163
164/// Like `write_list`, but passes CSL `style`/`locale` through to the
165/// citation writer when `to == "citation"` (ignored for every other format,
166/// same as `convert_citation`/`write_citation`).
167pub fn write_list_citation(
168    list: &[Data],
169    to: &str,
170    style: Option<&str>,
171    locale: Option<&str>,
172) -> Result<Vec<u8>> {
173    let bar = progress::count_bar("rendering", list.len() as u64);
174
175    if matches!(
176        to,
177        "commonmeta"
178            | "csl"
179            | "datacite"
180            | "inveniordm"
181            | "schemaorg"
182            | "ror"
183            | "citation"
184            | "crossref_xml"
185    ) {
186        let bytes = formats::write_all_citation(to, list, style, locale)?;
187        bar.finish_and_clear();
188        return Ok(bytes);
189    }
190
191    let mut output = String::new();
192    for (idx, item) in list.iter().enumerate() {
193        let rendered = formats::write_citation(to, item, style, locale)?;
194        if idx > 0 {
195            output.push('\n');
196        }
197        output.push_str(&String::from_utf8_lossy(&rendered));
198        bar.inc(1);
199    }
200    bar.finish_and_clear();
201    Ok(output.into_bytes())
202}
203
204/// Render `list` to `to` format, split into entries of at most `batch_size`
205/// records each — suitable for packing into an archive via
206/// [`file_utils::write_zip_archive`]/[`file_utils::write_tar_gz_archive`].
207/// `base_name` (e.g. `"out.json"`) names the single entry directly when
208/// there's only one batch, or gets a numbered suffix (`"out-00000.json"`,
209/// `"out-00001.json"`, ...) when there are several.
210pub fn write_archive(
211    list: &[Data],
212    to: &str,
213    base_name: &str,
214    batch_size: usize,
215) -> Result<Vec<(String, Vec<u8>)>> {
216    write_archive_citation(list, to, base_name, batch_size, None, None)
217}
218
219/// Like `write_archive`, but passes CSL `style`/`locale` through to the
220/// citation writer when `to == "citation"`.
221pub fn write_archive_citation(
222    list: &[Data],
223    to: &str,
224    base_name: &str,
225    batch_size: usize,
226    style: Option<&str>,
227    locale: Option<&str>,
228) -> Result<Vec<(String, Vec<u8>)>> {
229    if list.is_empty() {
230        return Err(Error::Serialize("no records to write".to_string()));
231    }
232    let chunks: Vec<&[Data]> = list.chunks(batch_size.max(1)).collect();
233    let multi = chunks.len() > 1;
234
235    let mut entries = Vec::with_capacity(chunks.len());
236    for (idx, chunk) in chunks.into_iter().enumerate() {
237        let bytes = write_list_citation(chunk, to, style, locale)?;
238        let name = batch_entry_name(base_name, if multi { Some(idx) } else { None });
239        entries.push((name, bytes));
240    }
241    Ok(entries)
242}
243
/// Build the archive entry name for one batch.
///
/// * `idx == None` — the entry is the only one; `base_name` is returned
///   unchanged.
/// * `idx == Some(i)` — the entry is numbered: `{stem}-{i:05}.{ext}`, or
///   `{stem}-{i:05}` when `base_name` has no extension.
///
/// A `/`-separated directory prefix in `base_name` is kept on numbered
/// names too (`"dir/out.json"` → `"dir/out-00000.json"`), so numbered and
/// unnumbered entries land in the same archive folder.
fn batch_entry_name(base_name: &str, idx: Option<usize>) -> String {
    let i = match idx {
        Some(i) => i,
        None => return base_name.to_string(),
    };

    // Split off the directory prefix textually so the original separator is
    // preserved. A trailing `/` is not treated as a directory split; that
    // case falls through to `Path`, which ignores the trailing separator.
    let (dir, file) = match base_name.rfind('/') {
        Some(pos) if pos + 1 < base_name.len() => base_name.split_at(pos + 1),
        _ => ("", base_name),
    };

    let path = std::path::Path::new(file);
    let stem = path
        .file_stem()
        .map(|s| s.to_string_lossy())
        .unwrap_or_default();

    match path.extension() {
        // `extension()` can be `Some("")` for names like `"out."`; treat
        // that the same as having no extension.
        Some(ext) if !ext.is_empty() => {
            format!("{}{}-{:05}.{}", dir, stem, i, ext.to_string_lossy())
        }
        _ => format!("{}{}-{:05}", dir, stem, i),
    }
}
268
269/// Read commonmeta records from a VRAIX daily dump SQLite file already on
270/// disk at `sqlite_path`, e.g. an already-downloaded `crossref-2026-06-14.sqlite3`.
271///
272/// `from` ("crossref" or "datacite") picks how every row is parsed — VRAIX
273/// dumps are single-source per file, so this isn't read from the data
274/// itself. `limit: None` reads every row; `Some(n)` reads `n` rows starting
275/// at `offset`.
276pub fn read_vraix_sqlite(
277    sqlite_path: &str,
278    from: &str,
279    limit: Option<usize>,
280    offset: usize,
281) -> Result<Vec<Data>> {
282    formats::vraix::read_dump(sqlite_path, from, limit, offset)
283}
284
285/// Write a VRAIX dump's transport table (e.g. `pid_records`) to a single
286/// Parquet file's bytes, using its raw columns (`pid`, `source_id`,
287/// `raw_metadata`, ...) as-is — *not* converted to commonmeta `Data` the way
288/// [`read_vraix_sqlite`] is. For analytics over the dump itself (e.g. via
289/// DataFusion/Polars/DuckDB), not for ingesting it as commonmeta records.
290/// `batch_size` controls how many rows land in each internal Parquet row
291/// group (see [`formats::commonmeta::write_parquet_all`]'s analogous
292/// `ROW_GROUP_SIZE` for why this matters for large dumps).
293pub fn write_vraix_table_parquet(sqlite_path: &str, batch_size: usize) -> Result<Vec<u8>> {
294    formats::vraix::write_table_parquet(sqlite_path, batch_size)
295}
296
297/// Fetch commonmeta records from a VRAIX daily dump for `from` ("crossref"
298/// or "datacite") and `date` (YYYY-MM-DD).
299///
300/// With `input_path`, the local SQLite file at that path is read directly
301/// via [`read_vraix_sqlite`] (e.g. an already-downloaded dump); otherwise
302/// `{from}-{date}.sqlite3.zst` is downloaded from metadata.vraix.org —
303/// cached locally for `cache_ttl` via [`file_utils::download_file_cached`]
304/// — and decompressed into a temp file first.
305///
306/// `limit`/`offset` window the rows read from the dump; `limit: None` reads
307/// every row.
308pub fn fetch_vraix_dump(
309    from: &str,
310    date: &str,
311    input_path: Option<&str>,
312    limit: Option<usize>,
313    offset: usize,
314    cache_ttl: std::time::Duration,
315) -> Result<Vec<Data>> {
316    if let Some(path) = input_path {
317        return read_vraix_sqlite(path, from, limit, offset);
318    }
319
320    let url = format!("https://metadata.vraix.org/{}-{}.sqlite3.zst", from, date);
321    let cache_key = format!("{}-{}.sqlite3.zst", from, date);
322    let (compressed, _from_cache) =
323        file_utils::download_file_cached(&url, "vraix", &cache_key, cache_ttl)
324            .map_err(|e| Error::Http(format!("failed to download '{}': {}", url, e)))?;
325    let decompressed = file_utils::unzst_content(&compressed)
326        .map_err(|e| Error::Parse(format!("failed to decompress '{}': {}", url, e)))?;
327
328    let tmp_path = std::env::temp_dir().join(format!(
329        "commonmeta-vraix-{}-{}-{}.sqlite3",
330        from,
331        date,
332        std::process::id()
333    ));
334    file_utils::write_file(&tmp_path, &decompressed).map_err(|e| {
335        Error::Parse(format!(
336            "failed to write temp file '{}': {}",
337            tmp_path.display(),
338            e
339        ))
340    })?;
341
342    let result = read_vraix_sqlite(tmp_path.to_str().unwrap(), from, limit, offset);
343    std::fs::remove_file(&tmp_path).ok();
344    result
345}
346
347/// Create-or-update, then publish, a list of records in InvenioRDM.
348///
349/// This performs real, network-visible writes against `host` (a live record
350/// is created/updated and published) using `token` for Bearer authentication.
351/// Registration with other services (Crossref, DataCite) is not yet supported.
352pub fn push_inveniordm(list: &[Data], host: &str, token: &str) -> Vec<PushResult> {
353    formats::inveniordm::upsert_all(list, host, token)
354}
355
356/// Create-or-update, then publish, a single record in InvenioRDM.
357///
358/// This performs a real, network-visible write against `host` (a live record
359/// is created/updated and published) using `token` for Bearer authentication.
360/// Registration with other services (Crossref, DataCite) is not yet supported.
361pub fn put_inveniordm(data: &Data, host: &str, token: &str) -> PushResult {
362    formats::inveniordm::upsert(data, host, token)
363}
364
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal journal-article record: only `id` and `type_` are set.
    fn sample_data(id: &str) -> Data {
        Data {
            id: id.to_string(),
            type_: "JournalArticle".to_string(),
            ..Data::default()
        }
    }

    /// One sample record per suffix, with id `https://doi.org/10.1/{suffix}`.
    fn sample_list(suffixes: &[&str]) -> Vec<Data> {
        suffixes
            .iter()
            .map(|suffix| sample_data(&format!("https://doi.org/10.1/{}", suffix)))
            .collect()
    }

    /// Sample record that additionally carries a title.
    fn titled(id: &str, title: &str) -> Data {
        Data {
            title: title.to_string(),
            ..sample_data(id)
        }
    }

    /// Sample record carrying a title and a publication date.
    fn dated(id: &str, title: &str, date_published: &str) -> Data {
        Data {
            date_published: date_published.to_string(),
            ..titled(id, title)
        }
    }

    #[test]
    fn test_write_list_json_array_formats() {
        let records = sample_list(&["a", "b"]);
        let rendered = write_list(&records, "commonmeta").unwrap();
        let parsed: serde_json::Value = serde_json::from_slice(&rendered).unwrap();
        assert_eq!(parsed.as_array().unwrap().len(), 2);
    }

    #[test]
    fn test_write_list_newline_joined_formats() {
        let records = sample_list(&["a", "b"]);
        let rendered = write_list(&records, "ris").unwrap();
        let text = String::from_utf8(rendered).unwrap();
        // Two records, newline-joined rather than a JSON array.
        let type_lines = text.lines().filter(|l| l.starts_with("TY  -")).count();
        assert_eq!(type_lines, 2);
    }

    #[test]
    fn test_write_list_crossref_xml_batches_into_one_doi_batch() {
        let records = sample_list(&["a", "b"]);
        let rendered = write_list(&records, "crossref_xml").unwrap();
        let text = String::from_utf8(rendered).unwrap();
        assert_eq!(text.matches("<doi_batch xmlns=").count(), 1);
        assert_eq!(text.matches("<journal_article").count(), 2);
    }

    #[test]
    fn test_write_list_ror_uses_json_array_batch_writer() {
        let orgs = [
            titled("https://ror.org/0342dzm54", "Org A"),
            titled("https://ror.org/0521rfr06", "Org B"),
        ];

        let rendered = write_list(&orgs, "ror").unwrap();
        let parsed: serde_json::Value = serde_json::from_slice(&rendered).unwrap();
        assert_eq!(parsed.as_array().unwrap().len(), 2);
    }

    #[test]
    fn test_write_list_citation_renders_each_record() {
        let records = [
            dated("https://doi.org/10.1/a", "Title A", "2020"),
            dated("https://doi.org/10.1/b", "Title B", "2021"),
        ];

        let rendered = write_list(&records, "citation").unwrap();
        let text = String::from_utf8(rendered).unwrap();
        let lines: Vec<&str> = text.lines().collect();
        assert_eq!(lines.len(), 2);
        assert!(lines[0].contains("Title A"));
        assert!(lines[1].contains("Title B"));
    }

    #[test]
    fn test_write_list_citation_respects_style() {
        let records = [dated("https://doi.org/10.1/a", "Title A", "2020")];

        let apa = write_list_citation(&records, "citation", None, None).unwrap();
        let chicago =
            write_list_citation(&records, "citation", Some("chicago-author-date"), None).unwrap();
        assert_ne!(apa, chicago);
    }

    #[test]
    fn test_write_archive_single_batch_uses_base_name() {
        let records = sample_list(&["a"]);
        let entries = write_archive(&records, "commonmeta", "out.json", 100_000).unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].0, "out.json");
    }

    #[test]
    fn test_write_archive_numbered_batches() {
        let records = sample_list(&["a", "b", "c"]);
        let entries = write_archive(&records, "commonmeta", "out.json", 1).unwrap();
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].0, "out-00000.json");
        assert_eq!(entries[1].0, "out-00001.json");
        assert_eq!(entries[2].0, "out-00002.json");
    }

    #[test]
    fn test_write_archive_no_extension_base_name() {
        let records = sample_list(&["a", "b"]);
        let entries = write_archive(&records, "commonmeta", "out", 1).unwrap();
        assert_eq!(entries[0].0, "out-00000");
        assert_eq!(entries[1].0, "out-00001");
    }

    #[test]
    fn test_write_archive_empty_list_errors() {
        let outcome = write_archive(&[], "commonmeta", "out.json", 100_000);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_fetch_vraix_dump_uses_local_input_path_without_network() {
        // Build a one-row dump on disk; a stale file from an earlier run is
        // removed first.
        let scratch_dir = std::env::temp_dir().join("commonmeta_lib_fetch_vraix_dump");
        std::fs::create_dir_all(&scratch_dir).unwrap();
        let db_path = scratch_dir.join("datacite.sqlite3");
        std::fs::remove_file(&db_path).ok();

        let runtime = tokio::runtime::Builder::new_multi_thread()
            .enable_all()
            .build()
            .unwrap();
        runtime.block_on(async {
            let db = libsql::Builder::new_local(&db_path).build().await.unwrap();
            let conn = db.connect().unwrap();
            conn.execute_batch(
                "CREATE TABLE works (pid TEXT, source_id INTEGER, raw_metadata TEXT);",
            )
            .await
            .unwrap();
            conn.execute(
                "INSERT INTO works (pid, source_id, raw_metadata) VALUES (?1, ?2, ?3)",
                libsql::params![
                    "pid-0",
                    1i64,
                    r#"{"data":{"id":"10.5678/b","attributes":{"doi":"10.5678/b"}}}"#
                ],
            )
            .await
            .unwrap();
        });

        let thirty_days = std::time::Duration::from_secs(30 * 24 * 60 * 60);
        let records = fetch_vraix_dump(
            "datacite",
            "2026-06-14",
            Some(db_path.to_str().unwrap()),
            None,
            0,
            thirty_days,
        )
        .unwrap();
        assert_eq!(records.len(), 1);
        assert_eq!(records[0].id, "https://doi.org/10.5678/b");

        std::fs::remove_dir_all(&scratch_dir).ok();
    }
}