// commonmeta 0.8.26
//
// Library for conversions to/from the Commonmeta scholarly metadata format
// Documentation
//! Utilities for working with DOIs
//!
//! This module provides functionality for:
//! - Validating, normalizing and escaping DOIs
//! - Encoding and decoding DOI identifiers
//! - Checking DOI registration status
//! - Working with DOI prefixes and registration agencies
//! - Generating DOIs for specific blogging platforms like WordPress and Substack
use lazy_static::lazy_static;
use regex::Regex;
use reqwest::Client;
use std::error::Error;
use std::string::ToString;
use std::time::Duration;
use url::Url;

/// Pulls the DOI prefix (e.g. `10.7554`) out of a `doi.org` URL.
///
/// Returns an empty string when the URL parses but is not hosted on
/// `doi.org` or its path does not start with `/10.`.
///
/// # Errors
///
/// Propagates the parse error when `s` is not a valid absolute URL.
pub fn prefix_from_url(s: &str) -> Result<String, Box<dyn Error>> {
    let parsed = Url::parse(s)?;

    let is_doi_url =
        parsed.host_str() == Some("doi.org") && parsed.path().starts_with("/10.");

    // The path begins with '/', so segment 0 is empty and segment 1 is the prefix.
    let prefix = if is_doi_url {
        parsed.path().split('/').nth(1).unwrap_or_default()
    } else {
        ""
    };

    Ok(prefix.to_owned())
}

/// Percent-encodes the bracket characters that may appear in DOI suffixes
/// but are not allowed unencoded in RFC 3986 URI paths.
///
/// Square brackets and angle brackets show up in legacy SICI-style suffixes
/// (Serial Item and Contribution Identifier, NISO Z39.56), for example
/// `10.1206/0003-0090(2003)277<0001:TSSAAA>2.0.CO;2`.
///
/// Only `[`, `]`, `<` and `>` are rewritten; every other character,
/// including an existing `%`, is copied through unchanged.
pub fn encode_doi_suffix(doi_str: &str) -> String {
    let mut encoded = String::with_capacity(doi_str.len());
    for ch in doi_str.chars() {
        match ch {
            '[' => encoded.push_str("%5B"),
            ']' => encoded.push_str("%5D"),
            '<' => encoded.push_str("%3C"),
            '>' => encoded.push_str("%3E"),
            other => encoded.push(other),
        }
    }
    encoded
}

/// Normalizes a DOI into its canonical resolver URL.
///
/// The input is validated with `validate_doi`; the bare DOI is lowercased,
/// passed through `encode_doi_suffix`, and appended to the resolver base
/// chosen by `doi_resolver` (called with `sandbox = false`).
///
/// Returns an empty string when the input is not a valid DOI.
pub fn normalize_doi(doi: &str) -> String {
    validate_doi(doi)
        .map(|bare| {
            let base = doi_resolver(doi, false);
            let path = encode_doi_suffix(&bare.to_lowercase());
            format!("{}{}", base, path)
        })
        .unwrap_or_default()
}

/// Validates a DOI and returns it in bare form (`10.<prefix>/<suffix>`).
///
/// Accepts a bare DOI, a `doi:`-prefixed DOI, or an `http(s)` URL on
/// `doi.org` / `dx.doi.org` or one of the DataCite handle test hosts.
/// The prefix must be `10.` followed by four or five digits, and the suffix
/// must be non-empty and free of whitespace.
///
/// Returns `None` when the input does not match.
pub fn validate_doi(doi: &str) -> Option<String> {
    lazy_static! {
        static ref DOI_REGEX: Regex = Regex::new(
            r"^(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5}/[^\s]+)$"
        ).unwrap();
    }

    // Capture group 6 is the bare DOI without resolver or `doi:` scheme.
    let bare = DOI_REGEX.captures(doi)?.get(6)?;
    Some(bare.as_str().to_owned())
}

/// Escapes a DOI by replacing every `/` in the bare DOI with `%2F`.
///
/// The input may be in any form accepted by `validate_doi`; the result is
/// always based on the bare DOI. Returns an empty string for invalid input.
pub fn escape_doi(doi: &str) -> String {
    match validate_doi(doi) {
        Some(bare) => bare.replace('/', "%2F"),
        None => String::new(),
    }
}

/// Builds a new `https://doi.org/` URL for `prefix` with a generated suffix.
///
/// The suffix comes from `crate::crockford::generate(10, 5, true)`.
/// NOTE(review): presumably a 10-character Crockford base32 string split
/// every 5 characters with a checksum — confirm against the `crockford` module.
///
/// `prefix` is inserted as given; it is not validated here.
pub fn encode_doi(prefix: &str) -> String {
    let random_suffix = crate::crockford::generate(10, 5, true);
    format!("https://doi.org/{}/{}", prefix, random_suffix)
}

/// Decodes the suffix of a DOI into an integer.
///
/// The DOI is validated first; the segment between the first and second `/`
/// of the bare DOI is then handed to `crate::crockford::decode`.
///
/// Returns `0` when the DOI is invalid or the suffix cannot be decoded. In
/// the latter case the decode error is also written to stderr.
pub fn decode_doi(doi: &str) -> i64 {
    let Some(bare) = validate_doi(doi) else {
        return 0;
    };

    // Segment 0 is the prefix, segment 1 the (first part of the) suffix.
    let Some(suffix) = bare.split('/').nth(1) else {
        return 0;
    };

    crate::crockford::decode(suffix, true).unwrap_or_else(|e| {
        eprintln!("Error decoding DOI suffix: {}", e);
        0
    })
}

/// Checks if a DOI resolves (i.e. redirects) via the DOI handle servers
pub async fn is_registered_doi(doi: &str) -> bool {
    let url = normalize_doi(doi);
    if url.is_empty() {
        return false;
    }

    let client = Client::builder()
        .timeout(Duration::from_secs(10))
        .build()
        .unwrap_or_default();

    match client.head(&url).send().await {
        Ok(resp) => resp.status().as_u16() <= 308,
        Err(_) => false,
    }
}

/// Extracts the DOI prefix (`10.` followed by four or five digits) from a DOI.
///
/// Accepts the same leading forms as `validate_doi` (bare, `doi:`-prefixed,
/// or a resolver URL), but only requires the prefix to be present: the
/// pattern is not anchored at the end, so a bare prefix such as `10.7554`
/// also matches.
///
/// Returns `None` when no prefix is found at the start of the input.
pub fn validate_prefix(doi: &str) -> Option<String> {
    lazy_static! {
        static ref PREFIX_REGEX: Regex = Regex::new(
            r"^(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5})"
        ).unwrap();
    }

    // Capture group 6 is the prefix itself.
    let prefix = PREFIX_REGEX.captures(doi)?.get(6)?;
    Some(prefix.as_str().to_owned())
}

/// Returns a DOI resolver for a given DOI
pub fn doi_resolver(doi: &str, sandbox: bool) -> String {
    if let Ok(d) = Url::parse(doi)
        && (d.host_str() == Some("stage.datacite.org") || sandbox)
    {
        return "https://handle.stage.datacite.org/".to_string();
    }
    "https://doi.org/".to_string()
}

/// DDL for the `prefixes` table, which caches the registration agency (RA)
/// of each DOI prefix.
///
/// Executed by `open_prefixes_db` every time the database is opened; the
/// `IF NOT EXISTS` clauses make repeated execution a no-op.
///
/// Columns:
/// - `prefix`: the DOI prefix, primary key.
/// - `ra`: registration agency name; defaults to the empty string.
/// - `date_created` / `date_updated`: timestamps stored as text (written as
///   RFC 3339 strings by `store_prefix_cache`).
///
/// NOTE(review): the explicit unique index on `prefix` looks redundant with
/// the PRIMARY KEY constraint — confirm before removing.
const PREFIXES_DDL: &str = r#"
CREATE TABLE IF NOT EXISTS prefixes (
    "prefix"       TEXT PRIMARY KEY NOT NULL,
    "ra"           TEXT NOT NULL DEFAULT '',
    "date_created" TEXT NOT NULL DEFAULT '',
    "date_updated" TEXT NOT NULL DEFAULT ''
);
CREATE UNIQUE INDEX IF NOT EXISTS prefixes_prefix ON prefixes("prefix");
"#;

/// Resolves the location of the commonmeta SQLite database.
///
/// Precedence (documented as matching the CLI's `resolve_db_path`):
/// 1. the `COMMONMETA_DB` environment variable, used verbatim when set to
///    valid Unicode;
/// 2. a platform default:
///    - macOS: `$HOME/Library/Application Support/commonmeta/commonmeta.sqlite3`
///      (`HOME` falls back to the empty string when unset),
///    - Linux: `/var/lib/commonmeta/commonmeta.sqlite3`,
///    - anything else: `commonmeta.sqlite3` relative to the working directory.
fn default_db_path() -> std::path::PathBuf {
    use std::path::PathBuf;

    match std::env::var("COMMONMETA_DB") {
        Ok(explicit) => PathBuf::from(explicit),
        Err(_) if cfg!(target_os = "macos") => {
            let home = std::env::var("HOME").unwrap_or_default();
            PathBuf::from(format!(
                "{}/Library/Application Support/commonmeta/commonmeta.sqlite3",
                home
            ))
        }
        Err(_) if cfg!(target_os = "linux") => {
            PathBuf::from("/var/lib/commonmeta/commonmeta.sqlite3")
        }
        Err(_) => PathBuf::from("commonmeta.sqlite3"),
    }
}

/// Opens the commonmeta SQLite database and makes sure the `prefixes` table
/// exists.
///
/// The database location comes from `default_db_path`; its parent directory
/// is created if necessary. After opening, the journal mode is switched to
/// WAL and `PREFIXES_DDL` is executed.
///
/// Returns `None` if any step fails (directory creation, opening the file,
/// the pragma, or the DDL); the underlying error is discarded.
fn open_prefixes_db() -> Option<rusqlite::Connection> {
    let db_file = default_db_path();

    db_file
        .parent()
        .map(|dir| std::fs::create_dir_all(dir))
        .transpose()
        .ok()?;

    let db = rusqlite::Connection::open(&db_file).ok()?;

    // The pragma reports the resulting journal mode as a row; read and drop it.
    db.query_row::<String, _, _>("PRAGMA journal_mode=WAL", [], |row| row.get(0))
        .ok()?;
    db.execute_batch(PREFIXES_DDL).ok()?;

    Some(db)
}

/// Fetch the RA for `prefix` from the DOI RA API (no DB access).
pub(crate) fn fetch_doi_ra(prefix: &str) -> Option<String> {
    #[derive(serde::Deserialize)]
    struct RaEntry {
        #[serde(rename = "RA", default)]
        ra: String,
    }
    let url = format!("https://doi.org/doiRA/{}", prefix);
    let client = reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(10))
        .build()
        .ok()?;
    let entries: Vec<RaEntry> = client.get(&url).send().ok()?.json().ok()?;
    let ra = entries.into_iter().next()?.ra;
    if ra.is_empty() { None } else { Some(ra) }
}

/// Looks up `prefix` in the `prefixes` table and returns its registration
/// agency if the cached entry is still fresh.
///
/// An entry is fresh when its `date_updated` value parses as RFC 3339 and is
/// at most 30 days in the past.
///
/// Returns `None` when there is no row, the query fails, the stored RA is
/// empty, the timestamp cannot be parsed, or the entry is older than 30 days.
pub(crate) fn lookup_prefix_cache(conn: &rusqlite::Connection, prefix: &str) -> Option<String> {
    let (ra, date_updated): (String, String) = conn
        .query_row(
            r#"SELECT "ra", "date_updated" FROM prefixes WHERE "prefix" = ?1"#,
            rusqlite::params![prefix],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .ok()?;

    if ra.is_empty() {
        return None;
    }

    let updated_at = chrono::DateTime::parse_from_rfc3339(&date_updated)
        .ok()?
        .with_timezone(&chrono::Utc);
    let is_fresh =
        chrono::Utc::now().signed_duration_since(updated_at) <= chrono::TimeDelta::days(30);

    is_fresh.then_some(ra)
}

/// Records `prefix → ra` in the `prefixes` table, inserting a new row or
/// updating the existing one.
///
/// On insert, `date_created` and `date_updated` are both set to the current
/// UTC time (RFC 3339). On conflict only `ra` and `date_updated` are
/// overwritten, so the original `date_created` is kept.
///
/// This is best-effort: a failing statement is ignored.
pub(crate) fn store_prefix_cache(conn: &rusqlite::Connection, prefix: &str, ra: &str) {
    const UPSERT_SQL: &str = r#"INSERT INTO prefixes ("prefix", "ra", "date_created", "date_updated")
           VALUES (?1, ?2, ?3, ?3)
           ON CONFLICT("prefix") DO UPDATE SET
               "ra"           = excluded."ra",
               "date_updated" = excluded."date_updated""#;

    let timestamp = chrono::Utc::now().to_rfc3339();
    let params = rusqlite::params![prefix, ra, timestamp];

    // The cache is optional, so a write failure is deliberately dropped.
    let _ = conn.execute(UPSERT_SQL, params);
}

/// Looks up the registration agency for the prefix of `doi` (blocking).
///
/// Resolution order:
/// 1. The local `prefixes` cache in the commonmeta SQLite database; a hit
///    that is still fresh (see `lookup_prefix_cache`) is returned at once.
/// 2. Unless `no_network` is `true`, the DOI RA API at
///    `https://doi.org/doiRA/{prefix}`. A successful answer is written back
///    to the cache for later calls.
///
/// If the database cannot be opened the cache is simply skipped.
///
/// Returns `None` when `doi` has no valid prefix, when `no_network` is set
/// and the cache has no fresh entry, or when the API lookup fails.
pub fn get_doi_ra_sync(doi: &str, no_network: bool) -> Option<String> {
    let prefix = validate_prefix(doi)?;
    let cache = open_prefixes_db();

    // Fresh cache hit: no network needed.
    let cached = cache
        .as_ref()
        .and_then(|db| lookup_prefix_cache(db, &prefix));
    if cached.is_some() {
        return cached;
    }

    if no_network {
        return None;
    }

    // Missing or stale entry: ask the DOI RA API and remember the answer.
    let agency = fetch_doi_ra(&prefix)?;
    if let Some(db) = cache.as_ref() {
        store_prefix_cache(db, &prefix, &agency);
    }

    Some(agency)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `validate_doi` returns the bare DOI for valid input and `None` otherwise.
    #[test]
    fn test_validate_doi_parity_cases() {
        let table: [(&str, Option<&str>); 6] = [
            ("10.7554/elife.01567", Some("10.7554/elife.01567")),
            (
                "https://doi.org/10.7554/elife.01567",
                Some("10.7554/elife.01567"),
            ),
            // A prefix without a suffix is not a DOI.
            ("https://doi.org/10.7554", None),
            ("10.7554", None),
            // Two DOIs separated by whitespace are rejected as a whole.
            ("10.3201/eid1503.081203 10.1083/jcb.1843iti1", None),
            ("", None),
        ];

        for &(input, expected) in &table {
            let actual = validate_doi(input);
            assert_eq!(actual.as_deref(), expected, "input: {input}");
        }
    }

    /// `validate_prefix` only needs the prefix to be present.
    #[test]
    fn test_validate_prefix_parity_cases() {
        let table: [(&str, Option<&str>); 5] = [
            ("10.7554/elife.01567", Some("10.7554")),
            ("https://doi.org/10.7554/elife.01567", Some("10.7554")),
            ("https://doi.org/10.7554", Some("10.7554")),
            ("10.7554", Some("10.7554")),
            ("", None),
        ];

        for &(input, expected) in &table {
            let actual = validate_prefix(input);
            assert_eq!(actual.as_deref(), expected, "input: {input}");
        }
    }

    /// Normalization lowercases and adds the resolver; escaping encodes `/`.
    /// Both yield an empty string for input that is not a DOI.
    #[test]
    fn test_normalize_and_escape_doi() {
        let normalized = normalize_doi("10.7554/eLife.01567");
        assert_eq!(normalized, "https://doi.org/10.7554/elife.01567");

        let escaped = escape_doi("https://doi.org/10.7554/elife.01567");
        assert_eq!(escaped, "10.7554%2Felife.01567");

        let invalid = "not-a-doi";
        assert_eq!(normalize_doi(invalid), "");
        assert_eq!(escape_doi(invalid), "");
    }

    /// Only `doi.org` URLs yield a prefix; other hosts give an empty string.
    #[test]
    fn test_prefix_from_url() {
        let table = [
            ("https://doi.org/10.7554/elife.01567", "10.7554"),
            ("https://example.org/10.7554/elife.01567", ""),
        ];

        for &(input, expected) in &table {
            let actual = prefix_from_url(input).ok();
            assert_eq!(actual, Some(expected.to_string()));
        }
    }

    /// SICI (NISO Z39.56) DOIs use angle brackets and square brackets as
    /// structural delimiters that must be percent-encoded in RFC 3986 URIs.
    #[test]
    fn test_encode_doi_suffix_sici() {
        let table = [
            (
                "10.1206/0003-0090(2003)277<0001:TSSAAA>2.0.CO;2",
                "https://doi.org/10.1206/0003-0090(2003)277%3C0001:tssaaa%3E2.0.co;2",
            ),
            (
                "10.1663/0006-8101(2002)068[0270:AAAROW]2.0.CO;2",
                "https://doi.org/10.1663/0006-8101(2002)068%5B0270:aaarow%5D2.0.co;2",
            ),
        ];

        for &(input, expected) in &table {
            assert_eq!(normalize_doi(input), expected);
        }
    }
}