use lazy_static::lazy_static;
use regex::Regex;
use reqwest::Client;
use std::error::Error;
use std::string::ToString;
use std::time::Duration;
use url::Url;
/// Extracts the DOI prefix (e.g. `10.7554`) from a `doi.org` URL.
///
/// Returns an empty string when the URL's host is not exactly `doi.org` or
/// its path does not start with `/10.`.
///
/// # Errors
///
/// Returns the parse error when `s` is not a valid absolute URL.
pub fn prefix_from_url(s: &str) -> Result<String, Box<dyn Error>> {
    let parsed = Url::parse(s)?;

    let is_doi_url = parsed.host_str() == Some("doi.org") && parsed.path().starts_with("/10.");
    if !is_doi_url {
        return Ok(String::new());
    }

    // The path starts with '/', so segment 0 is empty and segment 1 is the prefix.
    let prefix = parsed.path().split('/').nth(1).unwrap_or_default();
    Ok(prefix.to_owned())
}
/// Percent-encodes the characters `[`, `]`, `<` and `>` in a DOI string.
///
/// All other characters, including existing `%` escapes, are copied through
/// unchanged.
pub fn encode_doi_suffix(doi_str: &str) -> String {
    let mut encoded = String::with_capacity(doi_str.len());
    for ch in doi_str.chars() {
        match ch {
            '[' => encoded.push_str("%5B"),
            ']' => encoded.push_str("%5D"),
            '<' => encoded.push_str("%3C"),
            '>' => encoded.push_str("%3E"),
            other => encoded.push(other),
        }
    }
    encoded
}
/// Normalizes a DOI (bare, `doi:`-prefixed or resolver URL) into a full
/// resolver URL.
///
/// The DOI is lowercased, passed through [`encode_doi_suffix`] and appended
/// to the resolver chosen by [`doi_resolver`] (sandbox disabled). Returns an
/// empty string when [`validate_doi`] rejects the input.
pub fn normalize_doi(doi: &str) -> String {
    match validate_doi(doi) {
        Some(bare) => {
            let mut normalized = doi_resolver(doi, false);
            normalized.push_str(&encode_doi_suffix(&bare.to_lowercase()));
            normalized
        }
        None => String::new(),
    }
}
pub fn validate_doi(doi: &str) -> Option<String> {
lazy_static! {
static ref DOI_REGEX: Regex = Regex::new(
r"^(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5}/[^\s]+)$"
).unwrap();
}
if let Some(captures) = DOI_REGEX.captures(doi) {
return captures.get(6).map(|m| m.as_str().to_string());
}
None
}
/// Returns the bare DOI with every `/` replaced by `%2F`.
///
/// Returns an empty string when [`validate_doi`] rejects the input.
pub fn escape_doi(doi: &str) -> String {
    validate_doi(doi)
        .map(|bare| bare.replace('/', "%2F"))
        .unwrap_or_default()
}
/// Builds a new `https://doi.org/<prefix>/<suffix>` URL with a freshly
/// generated suffix.
///
/// The suffix comes from `crate::crockford::generate(10, 5, true)`.
// NOTE(review): presumably length 10, split every 5 characters, with a
// checksum — confirm against the crockford module.
pub fn encode_doi(prefix: &str) -> String {
    format!(
        "https://doi.org/{}/{}",
        prefix,
        crate::crockford::generate(10, 5, true)
    )
}
/// Decodes the suffix of a DOI into a number via `crate::crockford::decode`.
///
/// Only the path segment directly after the prefix is decoded. Returns `0`
/// when the DOI is invalid or the suffix cannot be decoded; a decoding
/// failure is also reported on stderr.
pub fn decode_doi(doi: &str) -> i64 {
    let Some(bare) = validate_doi(doi) else {
        return 0;
    };
    // Segment 0 is the prefix, segment 1 is the suffix to decode.
    let Some(suffix) = bare.split('/').nth(1) else {
        return 0;
    };

    crate::crockford::decode(suffix, true).unwrap_or_else(|e| {
        eprintln!("Error decoding DOI suffix: {}", e);
        0
    })
}
pub async fn is_registered_doi(doi: &str) -> bool {
let url = normalize_doi(doi);
if url.is_empty() {
return false;
}
let client = Client::builder()
.timeout(Duration::from_secs(10))
.build()
.unwrap_or_default();
match client.head(&url).send().await {
Ok(resp) => resp.status().as_u16() <= 308,
Err(_) => false,
}
}
pub fn validate_prefix(doi: &str) -> Option<String> {
lazy_static! {
static ref PREFIX_REGEX: Regex = Regex::new(
r"^(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5})"
).unwrap();
}
if let Some(captures) = PREFIX_REGEX.captures(doi) {
return captures.get(6).map(|m| m.as_str().to_string());
}
None
}
pub fn doi_resolver(doi: &str, sandbox: bool) -> String {
if let Ok(d) = Url::parse(doi)
&& (d.host_str() == Some("stage.datacite.org") || sandbox)
{
return "https://handle.stage.datacite.org/".to_string();
}
"https://doi.org/".to_string()
}
/// Schema for the `prefixes` cache table, executed by
/// `ensure_prefixes_table` and `open_prefixes_db`.
///
/// Each row maps a DOI prefix (primary key) to its registration agency
/// (`ra`) together with creation and update timestamps stored as text.
/// Both statements use `IF NOT EXISTS`, so running the batch repeatedly is
/// safe. The unique index covers the same column as the primary key.
const PREFIXES_DDL: &str = r#"
CREATE TABLE IF NOT EXISTS prefixes (
"prefix" TEXT PRIMARY KEY NOT NULL,
"ra" TEXT NOT NULL DEFAULT '',
"date_created" TEXT NOT NULL DEFAULT '',
"date_updated" TEXT NOT NULL DEFAULT ''
);
CREATE UNIQUE INDEX IF NOT EXISTS prefixes_prefix ON prefixes("prefix");
"#;
/// Creates the `prefixes` table and its index on `conn` if they are missing.
///
/// Best effort: any error from executing the DDL batch is discarded.
#[allow(dead_code)]
pub(crate) fn ensure_prefixes_table(conn: &rusqlite::Connection) {
    // Failure is deliberately ignored; callers only lose caching.
    drop(conn.execute_batch(PREFIXES_DDL));
}
/// Resolves the location of the commonmeta SQLite database.
///
/// Order of precedence:
/// 1. the `COMMONMETA_DB` environment variable, used verbatim;
/// 2. on macOS, `$HOME/Library/Application Support/commonmeta/commonmeta.sqlite3`
///    (an unset `HOME` is treated as an empty string);
/// 3. on Linux, `/var/lib/commonmeta/commonmeta.sqlite3`;
/// 4. on any other platform, `commonmeta.sqlite3` relative to the working
///    directory.
fn default_db_path() -> std::path::PathBuf {
    use std::path::PathBuf;

    match std::env::var("COMMONMETA_DB") {
        Ok(custom) => PathBuf::from(custom),
        Err(_) if cfg!(target_os = "macos") => {
            let home = std::env::var("HOME").unwrap_or_default();
            PathBuf::from(format!(
                "{}/Library/Application Support/commonmeta/commonmeta.sqlite3",
                home
            ))
        }
        Err(_) if cfg!(target_os = "linux") => {
            PathBuf::from("/var/lib/commonmeta/commonmeta.sqlite3")
        }
        Err(_) => PathBuf::from("commonmeta.sqlite3"),
    }
}
/// Opens the prefix cache database at [`default_db_path`], preparing it for
/// use.
///
/// Creates the parent directory if needed, switches the journal mode to WAL
/// and runs the `prefixes` DDL. Returns `None` as soon as any of these steps
/// fails.
fn open_prefixes_db() -> Option<rusqlite::Connection> {
    let db_path = default_db_path();

    // Only attempt directory creation when the path has a parent component.
    db_path
        .parent()
        .map(std::fs::create_dir_all)
        .transpose()
        .ok()?;

    let conn = rusqlite::Connection::open(&db_path).ok()?;
    // The pragma returns the resulting journal mode as a row; it is read
    // only so that a failure is detected.
    conn.query_row("PRAGMA journal_mode=WAL", [], |row| row.get::<_, String>(0))
        .ok()?;
    conn.execute_batch(PREFIXES_DDL).ok()?;
    Some(conn)
}
/// Asks `https://doi.org/doiRA/<prefix>` which registration agency owns
/// `prefix`.
///
/// Uses a blocking HTTP client with a 10 second timeout. Returns the `RA`
/// field of the first entry in the JSON array, or `None` when the request or
/// decoding fails, the array is empty, or the field is missing or empty.
pub(crate) fn fetch_doi_ra(prefix: &str) -> Option<String> {
    #[derive(serde::Deserialize)]
    struct RaEntry {
        #[serde(rename = "RA", default)]
        ra: String,
    }

    let client = reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(10))
        .build()
        .ok()?;

    let url = format!("https://doi.org/doiRA/{}", prefix);
    let response = client.get(&url).send().ok()?;
    let entries: Vec<RaEntry> = response.json().ok()?;

    entries
        .into_iter()
        .next()
        .map(|entry| entry.ra)
        .filter(|ra| !ra.is_empty())
}
/// Looks up the cached registration agency for `prefix`.
///
/// Returns `None` when there is no row (or the query fails), the stored
/// agency is empty, `date_updated` is not a valid RFC 3339 timestamp, or the
/// entry is more than 30 days old.
pub(crate) fn lookup_prefix_cache(conn: &rusqlite::Connection, prefix: &str) -> Option<String> {
    let (ra, date_updated): (String, String) = conn
        .query_row(
            r#"SELECT "ra", "date_updated" FROM prefixes WHERE "prefix" = ?1"#,
            rusqlite::params![prefix],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .ok()?;

    if ra.is_empty() {
        return None;
    }

    let stored_at = chrono::DateTime::parse_from_rfc3339(&date_updated)
        .ok()?
        .with_timezone(&chrono::Utc);
    let age = chrono::Utc::now().signed_duration_since(stored_at);

    // Entries older than 30 days are treated as a cache miss.
    (age <= chrono::TimeDelta::days(30)).then_some(ra)
}
/// Inserts or refreshes the cached registration agency for `prefix`.
///
/// A new row gets the current UTC time (RFC 3339) as both `date_created` and
/// `date_updated`; an existing row keeps its `date_created` and has `ra` and
/// `date_updated` overwritten. Best effort: database errors are discarded.
pub(crate) fn store_prefix_cache(conn: &rusqlite::Connection, prefix: &str, ra: &str) {
    const UPSERT_SQL: &str = r#"INSERT INTO prefixes ("prefix", "ra", "date_created", "date_updated")
VALUES (?1, ?2, ?3, ?3)
ON CONFLICT("prefix") DO UPDATE SET
"ra" = excluded."ra",
"date_updated" = excluded."date_updated""#;

    let timestamp = chrono::Utc::now().to_rfc3339();
    // A failed write only costs a future cache hit, so the result is dropped.
    drop(conn.execute(UPSERT_SQL, rusqlite::params![prefix, ra, timestamp]));
}
/// Collects the distinct DOI prefixes found in the `id` column of the
/// `works` table.
///
/// The SQL takes the text from the first `10.` up to (not including) the
/// next `/` for every id matching `*10.*/*`. Rows that fail to decode are
/// skipped, and only values that start with `10.` and are longer than four
/// bytes are kept. Returns an empty vector when the statement cannot be
/// prepared or run.
#[allow(dead_code)]
pub(crate) fn collect_work_prefixes(conn: &rusqlite::Connection) -> Vec<String> {
    let sql = "SELECT DISTINCT \
               SUBSTR(id, INSTR(id, '10.'), INSTR(SUBSTR(id, INSTR(id, '10.')), '/') - 1) \
               FROM works WHERE id GLOB '*10.*/*'";

    let Ok(mut stmt) = conn.prepare(sql) else {
        return Vec::new();
    };
    let Ok(rows) = stmt.query_map([], |row| row.get::<_, String>(0)) else {
        return Vec::new();
    };

    let mut prefixes = Vec::new();
    for row in rows {
        // Rows that fail to decode are skipped rather than aborting the scan.
        if let Ok(candidate) = row {
            if candidate.starts_with("10.") && candidate.len() > 4 {
                prefixes.push(candidate);
            }
        }
    }
    prefixes
}
/// Resolves the registration agencies of several prefixes with a single
/// request to `https://doi.org/doiRA/<p1>,<p2>,...`.
///
/// Returns `(DOI, RA)` pairs taken from the response, skipping entries where
/// either field is missing or empty. Returns an empty vector when `prefixes`
/// is empty (no request is made) or when the request or JSON decoding fails.
#[allow(dead_code)]
pub(crate) fn fetch_doi_ra_batch(
    client: &reqwest::blocking::Client,
    prefixes: &[&str],
) -> Vec<(String, String)> {
    #[derive(serde::Deserialize)]
    struct Entry {
        #[serde(rename = "DOI", default)]
        doi: String,
        #[serde(rename = "RA", default)]
        ra: String,
    }

    if prefixes.is_empty() {
        return Vec::new();
    }

    let url = format!("https://doi.org/doiRA/{}", prefixes.join(","));
    let Ok(response) = client.get(&url).send() else {
        return Vec::new();
    };
    let Ok(entries) = response.json::<Vec<Entry>>() else {
        return Vec::new();
    };

    let mut pairs = Vec::with_capacity(entries.len());
    for Entry { doi, ra } in entries {
        if doi.is_empty() || ra.is_empty() {
            continue;
        }
        pairs.push((doi, ra));
    }
    pairs
}
/// Returns the registration agency for the prefix of `doi`, consulting the
/// local cache before the network.
///
/// Steps:
/// 1. extract the prefix with [`validate_prefix`] (`None` if that fails);
/// 2. return a cached value if the cache database opens and has a usable
///    entry;
/// 3. when `no_network` is `true`, stop and return `None`;
/// 4. otherwise query the doiRA service and, if the cache database is
///    available, store the result.
pub fn get_doi_ra_sync(doi: &str, no_network: bool) -> Option<String> {
    let prefix = validate_prefix(doi)?;
    // The cache is optional: when it cannot be opened we go straight on.
    let conn = open_prefixes_db();

    let cached = conn
        .as_ref()
        .and_then(|db| lookup_prefix_cache(db, &prefix));
    if cached.is_some() {
        return cached;
    }

    if no_network {
        return None;
    }

    let ra = fetch_doi_ra(&prefix)?;
    if let Some(db) = conn.as_ref() {
        store_prefix_cache(db, &prefix, &ra);
    }
    Some(ra)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Valid inputs yield the bare DOI; prefix-only, multi-DOI and empty
    /// inputs are rejected.
    #[test]
    fn test_validate_doi_parity_cases() {
        let accepted = [
            "10.7554/elife.01567",
            "https://doi.org/10.7554/elife.01567",
        ];
        for input in accepted {
            assert_eq!(
                validate_doi(input).as_deref(),
                Some("10.7554/elife.01567"),
                "input: {input}"
            );
        }

        let rejected = [
            "https://doi.org/10.7554",
            "10.7554",
            "10.3201/eid1503.081203 10.1083/jcb.1843iti1",
            "",
        ];
        for input in rejected {
            assert_eq!(validate_doi(input).as_deref(), None, "input: {input}");
        }
    }

    /// The prefix is extracted with or without a suffix and with or without
    /// a resolver URL; an empty input is rejected.
    #[test]
    fn test_validate_prefix_parity_cases() {
        let accepted = [
            "10.7554/elife.01567",
            "https://doi.org/10.7554/elife.01567",
            "https://doi.org/10.7554",
            "10.7554",
        ];
        for input in accepted {
            assert_eq!(
                validate_prefix(input).as_deref(),
                Some("10.7554"),
                "input: {input}"
            );
        }

        let input = "";
        assert_eq!(validate_prefix(input).as_deref(), None, "input: {input}");
    }

    /// Normalization lowercases and prepends the resolver; escaping encodes
    /// the slash; invalid input yields an empty string from both.
    #[test]
    fn test_normalize_and_escape_doi() {
        let normalized = normalize_doi("10.7554/eLife.01567");
        assert_eq!(normalized, "https://doi.org/10.7554/elife.01567");

        let escaped = escape_doi("https://doi.org/10.7554/elife.01567");
        assert_eq!(escaped, "10.7554%2Felife.01567");

        assert_eq!(normalize_doi("not-a-doi"), "");
        assert_eq!(escape_doi("not-a-doi"), "");
    }

    /// Only `doi.org` URLs yield a prefix; other hosts yield an empty string.
    #[test]
    fn test_prefix_from_url() {
        let from_doi_org = prefix_from_url("https://doi.org/10.7554/elife.01567").ok();
        assert_eq!(from_doi_org, Some("10.7554".to_string()));

        let from_other_host = prefix_from_url("https://example.org/10.7554/elife.01567").ok();
        assert_eq!(from_other_host, Some("".to_string()));
    }

    /// SICI-style DOIs have their angle and square brackets percent-encoded.
    #[test]
    fn test_encode_doi_suffix_sici() {
        let cases = [
            (
                "10.1206/0003-0090(2003)277<0001:TSSAAA>2.0.CO;2",
                "https://doi.org/10.1206/0003-0090(2003)277%3C0001:tssaaa%3E2.0.co;2",
            ),
            (
                "10.1663/0006-8101(2002)068[0270:AAAROW]2.0.CO;2",
                "https://doi.org/10.1663/0006-8101(2002)068%5B0270:aaarow%5D2.0.co;2",
            ),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_doi(input), expected);
        }
    }
}