commonmeta 0.9.4

Library for conversions to/from the Commonmeta scholarly metadata format
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
//! Utilities for working with DOIs
//!
//! This module provides functionality for:
//! - Validating, normalizing and escaping DOIs
//! - Encoding and decoding DOI identifiers
//! - Checking DOI registration status
//! - Working with DOI prefixes and registration agencies
//! - Generating DOIs for specific blogging platforms like WordPress and Substack
use lazy_static::lazy_static;
use regex::Regex;
use reqwest::Client;
use std::error::Error;
use std::string::ToString;
use std::time::Duration;
use url::Url;

/// Extracts the DOI prefix (e.g. `10.7554`) from a `doi.org` URL.
///
/// Returns an empty string when the URL's host is not exactly `doi.org` or
/// when its path does not begin with `/10.`.
///
/// # Errors
///
/// Propagates the parse error when `s` cannot be parsed as a URL.
pub fn prefix_from_url(s: &str) -> Result<String, Box<dyn Error>> {
    let parsed = Url::parse(s)?;

    let looks_like_doi =
        parsed.host_str() == Some("doi.org") && parsed.path().starts_with("/10.");
    if !looks_like_doi {
        return Ok(String::new());
    }

    // The path starts with '/', so segment 0 is empty and segment 1 holds the prefix.
    let prefix = parsed.path().split('/').nth(1).unwrap_or_default();
    Ok(prefix.to_owned())
}

/// Percent-encodes the characters `[`, `]`, `<` and `>` in `doi_str`.
///
/// These characters are valid in DOI suffixes but may not appear unencoded in
/// an RFC 3986 URI path. They occur in legacy schemes such as SICI (Serial
/// Item and Contribution Identifier, NISO Z39.56), for example
/// `10.1206/0003-0090(2003)277<0001:TSSAAA>2.0.CO;2`.
///
/// All other characters are copied through unchanged.
pub fn encode_doi_suffix(doi_str: &str) -> String {
    let mut encoded = String::with_capacity(doi_str.len());
    for ch in doi_str.chars() {
        match ch {
            '[' => encoded.push_str("%5B"),
            ']' => encoded.push_str("%5D"),
            '<' => encoded.push_str("%3C"),
            '>' => encoded.push_str("%3E"),
            other => encoded.push(other),
        }
    }
    encoded
}

/// Normalizes a DOI into a lowercase resolver URL.
///
/// The input is validated with `validate_doi`; the bare DOI is lowercased,
/// passed through `encode_doi_suffix`, and appended to the resolver base
/// returned by `doi_resolver` (never forcing the sandbox).
///
/// Returns an empty string when the input is not a valid DOI.
pub fn normalize_doi(doi: &str) -> String {
    validate_doi(doi)
        .map(|bare| {
            let base = doi_resolver(doi, false);
            let path = encode_doi_suffix(&bare.to_lowercase());
            format!("{}{}", base, path)
        })
        .unwrap_or_default()
}

/// Validates a DOI and returns it in bare form (`10.xxxx/suffix`).
///
/// Accepted inputs are a bare DOI, a DOI with a `doi:` scheme prefix, or an
/// `http`/`https` URL on `doi.org`, `handle.stage.datacite.org` or
/// `handle.test.datacite.org` (each optionally prefixed with `dx.`). The DOI
/// prefix must be `10.` followed by four or five digits, and the suffix must
/// be non-empty and free of whitespace.
///
/// Returns `None` when the input does not match.
pub fn validate_doi(doi: &str) -> Option<String> {
    lazy_static! {
        static ref DOI_REGEX: Regex = Regex::new(
            r"^(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5}/[^\s]+)$"
        ).unwrap();
    }

    let caps = DOI_REGEX.captures(doi)?;
    // Group 6 is the bare DOI; the earlier groups cover the resolver URL and `doi:` prefix.
    let bare = caps.get(6)?;
    Some(bare.as_str().to_owned())
}

/// Escapes a DOI by replacing every `/` in its bare form with `%2F`.
///
/// The input is first validated with `validate_doi`, so resolver URLs are
/// reduced to the bare DOI before escaping. Returns an empty string when the
/// input is not a valid DOI.
pub fn escape_doi(doi: &str) -> String {
    validate_doi(doi)
        .map(|bare| bare.replace('/', "%2F"))
        .unwrap_or_default()
}

/// Builds a `https://doi.org/` URL for `prefix` with a freshly generated suffix.
///
/// The suffix is produced by `crate::crockford::generate`; `prefix` is used
/// as given and is not validated here.
pub fn encode_doi(prefix: &str) -> String {
    format!(
        "https://doi.org/{}/{}",
        prefix,
        crate::crockford::generate(10, 5, true)
    )
}

/// Decodes the suffix of a DOI into an integer.
///
/// The input is validated with `validate_doi`; the path segment directly
/// after the DOI prefix is then handed to `crate::crockford::decode`.
///
/// Returns `0` when the input is not a valid DOI or the suffix cannot be
/// decoded. In the latter case the error is also written to stderr.
pub fn decode_doi(doi: &str) -> i64 {
    let Some(bare) = validate_doi(doi) else {
        return 0;
    };

    // Only the segment right after the prefix is decoded; anything after a
    // further '/' is ignored.
    let Some(suffix) = bare.split('/').nth(1) else {
        return 0;
    };

    crate::crockford::decode(suffix, true).unwrap_or_else(|e| {
        eprintln!("Error decoding DOI suffix: {}", e);
        0
    })
}

/// Checks if a DOI resolves (i.e. redirects) via the DOI handle servers
pub async fn is_registered_doi(doi: &str) -> bool {
    let url = normalize_doi(doi);
    if url.is_empty() {
        return false;
    }

    let client = Client::builder()
        .timeout(Duration::from_secs(10))
        .build()
        .unwrap_or_default();

    match client.head(&url).send().await {
        Ok(resp) => resp.status().as_u16() <= 308,
        Err(_) => false,
    }
}

/// Extracts the DOI prefix (`10.` followed by four or five digits) from a DOI.
///
/// Accepts the same leading forms as `validate_doi`: a bare DOI, a `doi:`
/// scheme prefix, or an `http`/`https` resolver URL. The pattern is anchored
/// only at the start, so a prefix with no suffix (e.g. `10.7554`) also matches.
///
/// Returns `None` when no prefix is found at the start of the input.
pub fn validate_prefix(doi: &str) -> Option<String> {
    lazy_static! {
        static ref PREFIX_REGEX: Regex = Regex::new(
            r"^(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5})"
        ).unwrap();
    }

    let caps = PREFIX_REGEX.captures(doi)?;
    // Group 6 is the prefix itself; the earlier groups cover the resolver URL and `doi:`.
    let prefix = caps.get(6)?;
    Some(prefix.as_str().to_owned())
}

/// Returns a DOI resolver for a given DOI
pub fn doi_resolver(doi: &str, sandbox: bool) -> String {
    if let Ok(d) = Url::parse(doi)
        && (d.host_str() == Some("stage.datacite.org") || sandbox)
    {
        return "https://handle.stage.datacite.org/".to_string();
    }
    "https://doi.org/".to_string()
}

/// DDL for the `prefixes` table, which caches the registration agency (`ra`)
/// for each DOI prefix along with text timestamps for creation and last update.
///
/// Both statements use `IF NOT EXISTS`, so the batch can be executed
/// repeatedly against the same database; it is run by
/// `ensure_prefixes_table` and `open_prefixes_db`.
const PREFIXES_DDL: &str = r#"
CREATE TABLE IF NOT EXISTS prefixes (
    "prefix"       TEXT PRIMARY KEY NOT NULL,
    "ra"           TEXT NOT NULL DEFAULT '',
    "date_created" TEXT NOT NULL DEFAULT '',
    "date_updated" TEXT NOT NULL DEFAULT ''
);
CREATE UNIQUE INDEX IF NOT EXISTS prefixes_prefix ON prefixes("prefix");
"#;

/// Creates the `prefixes` table (and its index) in `conn` if it is missing.
///
/// Best-effort: any error from executing the DDL is discarded. Calling this
/// more than once is harmless because the DDL uses `IF NOT EXISTS`.
#[allow(dead_code)]
pub(crate) fn ensure_prefixes_table(conn: &rusqlite::Connection) {
    conn.execute_batch(PREFIXES_DDL).ok();
}

/// Resolves the location of the commonmeta SQLite database.
///
/// Precedence (mirroring the CLI's `resolve_db_path`):
/// 1. the `COMMONMETA_DB` environment variable, when set to valid Unicode;
/// 2. the platform default:
///    - macOS: `$HOME/Library/Application Support/commonmeta/commonmeta.sqlite3`
///      (an unset `HOME` is treated as the empty string),
///    - Linux: `/var/lib/commonmeta/commonmeta.sqlite3`,
///    - elsewhere: `commonmeta.sqlite3`, relative to the working directory.
fn default_db_path() -> std::path::PathBuf {
    use std::path::PathBuf;

    std::env::var("COMMONMETA_DB")
        .map(PathBuf::from)
        .unwrap_or_else(|_| {
            #[cfg(target_os = "macos")]
            let fallback = {
                let home_dir = std::env::var("HOME").unwrap_or_default();
                PathBuf::from(format!(
                    "{}/Library/Application Support/commonmeta/commonmeta.sqlite3",
                    home_dir
                ))
            };
            #[cfg(target_os = "linux")]
            let fallback = PathBuf::from("/var/lib/commonmeta/commonmeta.sqlite3");
            #[cfg(not(any(target_os = "macos", target_os = "linux")))]
            let fallback = PathBuf::from("commonmeta.sqlite3");

            fallback
        })
}

/// Opens the commonmeta SQLite database and prepares it for prefix caching.
///
/// Steps, in order: create the parent directory of the database path if
/// needed, open the connection, switch the journal mode to WAL, and run the
/// `prefixes` DDL. Returns `None` as soon as any of these steps fails.
fn open_prefixes_db() -> Option<rusqlite::Connection> {
    let db_file = default_db_path();

    // `transpose` turns "no parent" into success and a failed mkdir into `None`.
    db_file
        .parent()
        .map(std::fs::create_dir_all)
        .transpose()
        .ok()?;

    let connection = rusqlite::Connection::open(&db_file).ok()?;
    // The pragma returns the resulting journal mode as a row, so it is read and discarded.
    connection
        .query_row("PRAGMA journal_mode=WAL", [], |row| row.get::<_, String>(0))
        .ok()?;
    connection.execute_batch(PREFIXES_DDL).ok()?;

    Some(connection)
}

/// Queries `https://doi.org/doiRA/{prefix}` for the registration agency of `prefix`.
///
/// Performs a blocking HTTP GET with a 10-second timeout and does not touch
/// the local database. Only the first entry of the JSON array in the response
/// is considered.
///
/// Returns `None` when the client cannot be built, the request or JSON
/// decoding fails, the response array is empty, or the first entry has no
/// (or an empty) `RA` field.
pub(crate) fn fetch_doi_ra(prefix: &str) -> Option<String> {
    #[derive(serde::Deserialize)]
    struct RaEntry {
        #[serde(rename = "RA", default)]
        ra: String,
    }

    let http = reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(10))
        .build()
        .ok()?;
    let endpoint = format!("https://doi.org/doiRA/{}", prefix);
    let response = http.get(&endpoint).send().ok()?;
    let entries: Vec<RaEntry> = response.json().ok()?;

    entries
        .into_iter()
        .next()
        .map(|first| first.ra)
        .filter(|ra| !ra.is_empty())
}

/// Looks up the cached registration agency for `prefix` in the `prefixes` table.
///
/// Returns `None` when there is no row for `prefix` (or the query fails), the
/// stored RA is empty, `date_updated` is not a valid RFC 3339 timestamp, or
/// the entry is more than 30 days old.
pub(crate) fn lookup_prefix_cache(conn: &rusqlite::Connection, prefix: &str) -> Option<String> {
    let (ra, date_updated): (String, String) = conn
        .query_row(
            r#"SELECT "ra", "date_updated" FROM prefixes WHERE "prefix" = ?1"#,
            rusqlite::params![prefix],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .ok()?;

    if ra.is_empty() {
        return None;
    }

    let updated_at = chrono::DateTime::parse_from_rfc3339(&date_updated)
        .ok()?
        .with_timezone(&chrono::Utc);
    let age = chrono::Utc::now().signed_duration_since(updated_at);

    // Entries are served up to and including the 30-day mark.
    (age <= chrono::TimeDelta::days(30)).then_some(ra)
}

/// Inserts or refreshes the cached `prefix → ra` mapping in the `prefixes` table.
///
/// A new row gets the current UTC time (RFC 3339) as both `date_created` and
/// `date_updated`. For an existing row only `ra` and `date_updated` are
/// overwritten. Best-effort: any database error is discarded.
pub(crate) fn store_prefix_cache(conn: &rusqlite::Connection, prefix: &str, ra: &str) {
    let timestamp = chrono::Utc::now().to_rfc3339();
    conn.execute(
        r#"INSERT INTO prefixes ("prefix", "ra", "date_created", "date_updated")
           VALUES (?1, ?2, ?3, ?3)
           ON CONFLICT("prefix") DO UPDATE SET
               "ra"           = excluded."ra",
               "date_updated" = excluded."date_updated""#,
        rusqlite::params![prefix, ra, timestamp],
    )
    .ok();
}

/// Collect every distinct DOI prefix currently stored in the `works` table of `conn`.
///
/// Extracts the `10.XXXX` part from `works.id` using a SQLite string expression
/// (the text from the first `10.` up to the next `/`) and filters out values
/// that do not start with `10.` or are four characters or shorter.
///
/// The returned list is deduplicated and sorted in ascending order. An empty
/// list is returned when the statement cannot be prepared or executed (for
/// example when the `works` table does not exist); rows that fail to decode
/// are skipped.
#[allow(dead_code)]
pub(crate) fn collect_work_prefixes(conn: &rusqlite::Connection) -> Vec<String> {
    let sql = "SELECT DISTINCT \
        SUBSTR(id, INSTR(id, '10.'), INSTR(SUBSTR(id, INSTR(id, '10.')), '/') - 1) \
        FROM works WHERE id GLOB '*10.*/*'";

    let Ok(mut stmt) = conn.prepare(sql) else {
        return Vec::new();
    };
    let Ok(rows) = stmt.query_map([], |r| r.get::<_, String>(0)) else {
        return Vec::new();
    };

    let mut prefixes: Vec<String> = rows
        .filter_map(Result::ok)
        .filter(|p| p.starts_with("10.") && p.len() > 4)
        .collect();

    // `SELECT DISTINCT` does not guarantee any ordering, so sort here to
    // honor the documented contract; `dedup` is then a cheap safety net.
    prefixes.sort_unstable();
    prefixes.dedup();
    prefixes
}

/// Resolves all of `prefixes` against the DOI RA API in one request.
///
/// The prefixes are joined with commas into a single URL, e.g.
/// `https://doi.org/doiRA/10.1000,10.1001`, and fetched with `client`.
///
/// Returns one `(doi, ra)` pair per response entry whose `DOI` and `RA` fields
/// are both non-empty. Returns an empty list when `prefixes` is empty (no
/// request is made) or when the request or JSON decoding fails.
#[allow(dead_code)]
pub(crate) fn fetch_doi_ra_batch(
    client: &reqwest::blocking::Client,
    prefixes: &[&str],
) -> Vec<(String, String)> {
    #[derive(serde::Deserialize)]
    struct Entry {
        #[serde(rename = "DOI", default)]
        doi: String,
        #[serde(rename = "RA", default)]
        ra: String,
    }

    if prefixes.is_empty() {
        return Vec::new();
    }

    let endpoint = format!("https://doi.org/doiRA/{}", prefixes.join(","));
    let Ok(entries) = client
        .get(&endpoint)
        .send()
        .and_then(|resp| resp.json::<Vec<Entry>>())
    else {
        return Vec::new();
    };

    entries
        .into_iter()
        .filter_map(|Entry { doi, ra }| (!doi.is_empty() && !ra.is_empty()).then_some((doi, ra)))
        .collect()
}

/// Looks up the registration agency for the prefix of `doi` (blocking).
///
/// Resolution order:
/// 1. The local `prefixes` cache in the commonmeta SQLite database; a fresh
///    entry (see `lookup_prefix_cache`) is returned immediately.
/// 2. Unless `no_network` is `true`, the DOI RA API at
///    `https://doi.org/doiRA/{prefix}`; a successful answer is written back to
///    the cache when the database could be opened.
///
/// Returns `None` when `doi` has no valid prefix, when the cache misses and
/// `no_network` is `true`, or when the API lookup yields no agency. Failure
/// to open the database only disables caching; it does not abort the lookup.
pub fn get_doi_ra_sync(doi: &str, no_network: bool) -> Option<String> {
    let prefix = validate_prefix(doi)?;
    let cache = open_prefixes_db();

    // Serve from the cache while the entry is still fresh.
    let cached = cache
        .as_ref()
        .and_then(|db| lookup_prefix_cache(db, &prefix));
    if cached.is_some() {
        return cached;
    }

    if no_network {
        return None;
    }

    // Cache miss or stale entry: ask the DOI RA API and remember the answer.
    let ra = fetch_doi_ra(&prefix)?;
    if let Some(db) = cache.as_ref() {
        store_prefix_cache(db, &prefix, &ra);
    }

    Some(ra)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `validate_doi` yields the bare DOI for valid input and `None` otherwise.
    #[test]
    fn test_validate_doi_parity_cases() {
        let check = |input: &str, expected: Option<&str>| {
            assert_eq!(validate_doi(input).as_deref(), expected, "input: {input}");
        };

        check("10.7554/elife.01567", Some("10.7554/elife.01567"));
        check(
            "https://doi.org/10.7554/elife.01567",
            Some("10.7554/elife.01567"),
        );
        // A prefix without a suffix is not a DOI.
        check("https://doi.org/10.7554", None);
        check("10.7554", None);
        // Whitespace-separated DOIs must not be accepted as one.
        check("10.3201/eid1503.081203 10.1083/jcb.1843iti1", None);
        check("", None);
    }

    /// `validate_prefix` extracts the prefix even when no suffix is present.
    #[test]
    fn test_validate_prefix_parity_cases() {
        let check = |input: &str, expected: Option<&str>| {
            assert_eq!(
                validate_prefix(input).as_deref(),
                expected,
                "input: {input}"
            );
        };

        check("10.7554/elife.01567", Some("10.7554"));
        check("https://doi.org/10.7554/elife.01567", Some("10.7554"));
        check("https://doi.org/10.7554", Some("10.7554"));
        check("10.7554", Some("10.7554"));
        check("", None);
    }

    /// Normalization lowercases and adds the resolver; escaping encodes `/`.
    #[test]
    fn test_normalize_and_escape_doi() {
        let normalized = normalize_doi("10.7554/eLife.01567");
        assert_eq!(normalized, "https://doi.org/10.7554/elife.01567");

        let escaped = escape_doi("https://doi.org/10.7554/elife.01567");
        assert_eq!(escaped, "10.7554%2Felife.01567");

        // Invalid input yields the empty string from both helpers.
        assert_eq!(normalize_doi("not-a-doi"), "");
        assert_eq!(escape_doi("not-a-doi"), "");
    }

    /// Only `doi.org` URLs yield a prefix; other hosts yield an empty string.
    #[test]
    fn test_prefix_from_url() {
        let from_doi_org = prefix_from_url("https://doi.org/10.7554/elife.01567").ok();
        assert_eq!(from_doi_org, Some("10.7554".to_string()));

        let from_other_host = prefix_from_url("https://example.org/10.7554/elife.01567").ok();
        assert_eq!(from_other_host, Some("".to_string()));
    }

    #[test]
    fn test_encode_doi_suffix_sici() {
        // SICI (NISO Z39.56) DOIs use angle brackets and square brackets as
        // structural delimiters that must be percent-encoded in RFC 3986 URIs.
        let angle = normalize_doi("10.1206/0003-0090(2003)277<0001:TSSAAA>2.0.CO;2");
        assert_eq!(
            angle,
            "https://doi.org/10.1206/0003-0090(2003)277%3C0001:tssaaa%3E2.0.co;2"
        );

        let square = normalize_doi("10.1663/0006-8101(2002)068[0270:AAAROW]2.0.CO;2");
        assert_eq!(
            square,
            "https://doi.org/10.1663/0006-8101(2002)068%5B0270:aaarow%5D2.0.co;2"
        );
    }
}