use lazy_static::lazy_static;
use regex::Regex;
use unicode_normalization::UnicodeNormalization;
use url::Url;
use crate::crockford::decode;
use crate::doi_utils::{normalize_doi, validate_doi};
fn validate_mod11_2(input: &str) -> Result<(), String> {
if !input.chars().all(|c| c.is_ascii_digit() || c == 'X') {
return Err("Invalid characters in input".to_string());
}
let checksum_char = input.chars().last().unwrap();
let body = &input[..input.len() - 1];
let mut m = 0;
for c in body.chars() {
let d = c.to_digit(10).unwrap() as i32;
m = ((m + d) * 2) % 11;
}
let check_value = (12 - m) % 11;
let expected_char = if check_value == 10 {
'X'
} else {
char::from_digit(check_value as u32, 10).unwrap()
};
if checksum_char == expected_char {
Ok(())
} else {
Err("Invalid checksum".to_string())
}
}
pub fn decode_id(id: &str) -> Result<i64, String> {
let (identifier, identifier_type) = validate_id(id);
match identifier_type {
"DOI" => {
let parts: Vec<&str> = identifier.split('/').collect();
if parts.len() < 2 {
return Err(format!("Invalid DOI format: {}", id));
}
let suffix = parts[1];
decode(suffix, true).map_err(|e| e.to_string())
}
"ROR" => {
decode(&identifier, true).map_err(|e| e.to_string())
}
"RID" => {
decode(&identifier, true).map_err(|e| e.to_string())
}
"ORCID" => {
let cleaned = identifier.replace("-", "");
if let Err(e) = validate_mod11_2(&cleaned) {
return Err(format!("Invalid checksum for ORCID {}: {}", identifier, e));
}
let number_str = &cleaned[..cleaned.len() - 1];
match number_str.parse::<i64>() {
Ok(n) => Ok(n),
Err(e) => Err(format!("Failed to parse ORCID: {}", e)),
}
}
_ => Err(format!("identifier {} not recognized", id)),
}
}
pub fn validate_id(id: &str) -> (String, &'static str) {
if let Some(fundref) = validate_crossref_funder_id(id) {
return (fundref, "Crossref Funder ID");
}
if let Some(doi) = validate_doi(id) {
return (doi, "DOI");
}
if let Some(uuid) = validate_uuid(id) {
return (uuid, "UUID");
}
if let Some(pmid) = validate_pmid(id) {
return (pmid, "PMID");
}
if let Some(pmcid) = validate_pmcid(id) {
return (pmcid, "PMCID");
}
if let Some(openalex) = validate_openalex(id) {
return (openalex, "OpenAlex");
}
if let Some(orcid) = validate_orcid(id) {
return (orcid, "ORCID");
}
if let Some(ror) = validate_ror(id) {
return (ror, "ROR");
}
if let Some(grid) = validate_grid(id) {
return (grid, "GRID");
}
if let Some(rid) = validate_rid(id) {
return (rid, "RID");
}
if let Some(wikidata) = validate_wikidata(id) {
return (wikidata, "Wikidata");
}
if let Some(isni) = validate_isni(id) {
return (isni, "ISNI");
}
if let Some(issn) = validate_issn(id) {
return (issn, "ISSN");
}
let url = validate_url(id);
if !url.is_empty() {
return (id.to_string(), "URL");
}
(String::new(), "")
}
pub fn validate_id_category(id: &str) -> (String, &'static str, &'static str) {
let (pid, type_) = validate_id(id);
let category = match type_ {
"ROR" | "Crossref Funder ID" | "GRID" => "Organization",
"ORCID" => "Person",
"ISNI" => "Contributor",
"DOI" | "PMID" | "PMCID" => "Work",
"Wikidata" | "OpenAlex" | "URL" | "UUID" => "All",
_ => "",
};
(pid, type_, category)
}
pub fn validate_crossref_funder_id(fundref: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex =
Regex::new(r"^(?:https?://doi\.org/)?(?:10\.13039/)?((501)?1000[0-9]{5})$").unwrap();
}
RE.captures(fundref)
.and_then(|captures| captures.get(1))
.map(|m| m.as_str().to_string())
}
pub fn validate_grid(grid: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^(?:(?:http|https)://(?:(?:www)?\.)?grid\.ac/)?(?:institutes/)?(grid\.[0-9]+\.[a-f0-9]{1,2})$").unwrap();
}
RE.captures(grid)
.and_then(|captures| captures.get(1))
.map(|m| m.as_str().to_string())
}
pub fn validate_isni(isni: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^(?:(?:http|https)://(?:(?:www)?\.)?isni\.org/)?(?:isni/)?(0000[ -]?00\d{2}[ -]?\d{4}[ -]?\d{3}[0-9X]+)$").unwrap();
}
RE.captures(isni)
.and_then(|captures| captures.get(1))
.and_then(|m| {
let clean_match = m.as_str().replace(" ", "").replace("-", "");
if !check_orcid_number_range(&clean_match) {
Some(clean_match)
} else {
None
}
})
}
pub fn validate_issn(issn: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex =
Regex::new(r"^(?:https://portal\.issn\.org/resource/ISSN/)?(\d{4}\-\d{3}(\d|x|X))$")
.unwrap();
}
RE.captures(issn)
.and_then(|captures| captures.get(1))
.map(|m| m.as_str().to_string())
}
pub fn validate_orcid(orcid: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^(?:(?:http|https)://(?:(?:www|sandbox)?\.)?orcid\.org/)?(000[09][ -]000[123][ -]\d{4}[ -]\d{3}[0-9X]+)$").unwrap();
}
RE.captures(orcid)
.and_then(|captures| captures.get(1))
.filter(|m| check_orcid_number_range(m.as_str()))
.map(|m| m.as_str().to_string())
}
fn check_orcid_number_range(orcid: &str) -> bool {
const RANGE1_START: &str = "0000000150000007";
const RANGE1_END: &str = "0000000350000001";
const RANGE2_START: &str = "0009000000000000";
const RANGE2_END: &str = "0009001000000000";
let number = orcid.replace('-', "").replace(" ", "");
is_in_range(&number, RANGE1_START, RANGE1_END) || is_in_range(&number, RANGE2_START, RANGE2_END)
}
fn is_in_range(value: &str, start: &str, end: &str) -> bool {
value >= start && value <= end
}
pub fn validate_rid(rid: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^[0-9A-Z]{5}-[0-9A-Z]{3}[0-9]{2}$").unwrap();
}
if RE.is_match(rid) {
Some(rid.to_string())
} else {
None
}
}
pub fn validate_ror(ror: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex =
Regex::new(r"^(?:(?:http|https)://ror\.org/)?(0[0-9a-z]{6}\d{2})$").unwrap();
}
RE.captures(ror)
.and_then(|captures| captures.get(1))
.map(|m| m.as_str().to_string())
}
pub fn validate_url(str: &str) -> String {
if validate_doi(str).is_some() {
return "DOI".to_string();
}
match url::Url::parse(str) {
Err(_) => String::new(),
Ok(url) => {
if has_disallowed_fragments(&url) {
return String::new();
}
if is_rogue_scholar_url(&url) {
let path_segments: Vec<&str> = url.path().split('/').collect();
if is_valid_rogue_scholar_post(&path_segments) {
return "JSONFEEDID".to_string();
}
}
else if url.scheme() == "http" || url.scheme() == "https" {
return "URL".to_string();
}
String::new()
}
}
}
fn has_disallowed_fragments(url: &Url) -> bool {
let disallowed_fragments = [";origin=", ";jsessionid="];
for fragment in &disallowed_fragments {
if url.as_str().contains(fragment) {
return true;
}
}
false
}
fn is_rogue_scholar_url(url: &Url) -> bool {
url.scheme() == "https" && url.host_str() == Some("api.rogue-scholar.org")
}
fn is_valid_rogue_scholar_post(path_segments: &[&str]) -> bool {
if path_segments.len() >= 2 && path_segments[1] == "posts" {
if path_segments.len() == 3 {
return validate_uuid(path_segments[2]).is_some();
}
else if path_segments.len() == 4 {
let doi = format!("{}/{}", path_segments[2], path_segments[3]);
return validate_doi(&doi).is_some();
}
}
false
}
pub fn validate_uuid(uuid: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex = Regex::new(
r"^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-4[a-fA-F0-9]{3}-[89aAbB][a-fA-F0-9]{3}-[a-fA-F0-9]{12}$"
)
.unwrap();
}
if RE.is_match(uuid) {
Some(uuid.to_string())
} else {
None
}
}
pub fn validate_wikidata(wikidata: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex =
Regex::new(r"^(?:(?:http|https)://(?:(?:www)?\.)?wikidata\.org/wiki/)?(Q\d+)$")
.unwrap();
}
RE.captures(wikidata)
.and_then(|captures| captures.get(1))
.map(|m| m.as_str().to_string())
}
pub fn validate_openalex(openalex: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex =
Regex::new(r"^(?:(?:http|https)://openalex\.org/)?([AFIPSW]\d{8,10})$").unwrap();
}
RE.captures(openalex)
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
}
pub fn validate_pmid(pmid: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex =
Regex::new(r"^(?:(?:http|https)://pubmed\.ncbi\.nlm\.nih\.gov/)?(\d{4,8})$").unwrap();
}
RE.captures(pmid)
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
}
pub fn validate_pmcid(pmcid: &str) -> Option<String> {
lazy_static! {
static ref RE: Regex =
Regex::new(r"^(?:(?:http|https)://www\.ncbi\.nlm\.nih\.gov/pmc/articles/)?(\d{4,8})$")
.unwrap();
}
RE.captures(pmcid)
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
}
pub fn normalize_id(pid: &str) -> String {
let doi = normalize_doi(pid);
if !doi.is_empty() {
return doi;
}
if let Some(uuid) = validate_uuid(pid) {
return uuid;
}
if let Some(wikidata) = validate_wikidata(pid) {
return format!("https://www.wikidata.org/wiki/{}", wikidata);
}
match Url::parse(pid) {
Err(_) => String::new(),
Ok(mut u) => {
if u.scheme().is_empty() {
return String::new();
}
if u.scheme() == "http" {
let _ = u.set_scheme("https");
}
let s = u.to_string();
if s.ends_with('/') {
s[..s.len() - 1].to_string()
} else {
s
}
}
}
}
pub fn normalize_work_id(id: &str) -> String {
let (pid, type_, category) = validate_id_category(id);
if !["Work", "All"].contains(&category) {
return String::new();
}
match type_ {
"DOI" => normalize_doi(&pid),
"UUID" | "URL" => pid,
"Wikidata" => format!("https://www.wikidata.org/wiki/{}", pid),
_ => String::new(),
}
}
pub fn normalize_organization_id(id: &str) -> String {
let (pid, type_, category) = validate_id_category(id);
if !["Organization", "Contributor", "All"].contains(&category) {
return String::new();
}
match type_ {
"ROR" => format!("https://ror.org/{}", pid),
"Crossref Funder ID" => format!("https://doi.org/{}", pid),
"GRID" => format!("https://grid.ac/institutes/{}", pid),
"Wikidata" => format!("https://www.wikidata.org/wiki/{}", pid),
"ISNI" => format!("https://isni.org/isni/{}", pid),
_ => String::new(),
}
}
pub fn normalize_person_id(id: &str) -> String {
let (pid, type_, category) = validate_id_category(id);
if !["Person", "Contributor", "All"].contains(&category) {
return String::new();
}
match type_ {
"ORCID" => format!("https://orcid.org/{}", pid),
"ISNI" => format!("https://isni.org/isni/{}", pid),
"Wikidata" => format!("https://www.wikidata.org/wiki/{}", pid),
_ => String::new(),
}
}
pub fn normalize_orcid(orcid: &str) -> String {
match validate_orcid(orcid) {
Some(id) => format!("https://orcid.org/{}", id),
None => String::new(),
}
}
pub fn normalize_ror(ror: &str) -> String {
match validate_ror(ror) {
Some(id) => format!("https://ror.org/{}", id),
None => String::new(),
}
}
pub fn normalize_url(s: &str, secure: bool, lower: bool) -> Option<String> {
let mut u = Url::parse(s).ok()?;
u.host_str()?;
if secure && u.scheme() == "http" {
let _ = u.set_scheme("https");
}
let result = u.to_string();
Some(if lower {
result.to_lowercase()
} else {
result
})
}
pub fn normalize_cc_url(url_: &str) -> (String, bool) {
lazy_static! {
static ref CC_MAP: std::collections::HashMap<&'static str, &'static str> = {
let mut m = std::collections::HashMap::new();
m.insert("https://creativecommons.org/licenses/by/1.0", "https://creativecommons.org/licenses/by/1.0/legalcode");
m.insert("https://creativecommons.org/licenses/by/2.0", "https://creativecommons.org/licenses/by/2.0/legalcode");
m.insert("https://creativecommons.org/licenses/by/2.5", "https://creativecommons.org/licenses/by/2.5/legalcode");
m.insert("https://creativecommons.org/licenses/by/3.0", "https://creativecommons.org/licenses/by/3.0/legalcode");
m.insert("https://creativecommons.org/licenses/by/3.0/us", "https://creativecommons.org/licenses/by/3.0/legalcode");
m.insert("https://creativecommons.org/licenses/by/4.0", "https://creativecommons.org/licenses/by/4.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc/1.0", "https://creativecommons.org/licenses/by-nc/1.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc/2.0", "https://creativecommons.org/licenses/by-nc/2.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc/2.5", "https://creativecommons.org/licenses/by-nc/2.5/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc/3.0", "https://creativecommons.org/licenses/by-nc/3.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc/4.0", "https://creativecommons.org/licenses/by-nc/4.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd-nc/1.0", "https://creativecommons.org/licenses/by-nd-nc/1.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd-nc/2.0", "https://creativecommons.org/licenses/by-nd-nc/2.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd-nc/2.5", "https://creativecommons.org/licenses/by-nd-nc/2.5/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd-nc/3.0", "https://creativecommons.org/licenses/by-nd-nc/3.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd-nc/4.0", "https://creativecommons.org/licenses/by-nd-nc/4.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-sa/1.0", "https://creativecommons.org/licenses/by-nc-sa/1.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-sa/2.0", "https://creativecommons.org/licenses/by-nc-sa/2.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-sa/2.5", "https://creativecommons.org/licenses/by-nc-sa/2.5/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-sa/3.0", "https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-sa/3.0/us", "https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-sa/4.0", "https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd/1.0", "https://creativecommons.org/licenses/by-nd/1.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd/2.0", "https://creativecommons.org/licenses/by-nd/2.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd/2.5", "https://creativecommons.org/licenses/by-nd/2.5/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd/3.0", "https://creativecommons.org/licenses/by-nd/3.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nd/4.0", "https://creativecommons.org/licenses/by-nd/2.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-sa/1.0", "https://creativecommons.org/licenses/by-sa/1.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-sa/2.0", "https://creativecommons.org/licenses/by-sa/2.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-sa/2.5", "https://creativecommons.org/licenses/by-sa/2.5/legalcode");
m.insert("https://creativecommons.org/licenses/by-sa/3.0", "https://creativecommons.org/licenses/by-sa/3.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-sa/4.0", "https://creativecommons.org/licenses/by-sa/4.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-nd/1.0", "https://creativecommons.org/licenses/by-nc-nd/1.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-nd/2.0", "https://creativecommons.org/licenses/by-nc-nd/2.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-nd/2.5", "https://creativecommons.org/licenses/by-nc-nd/2.5/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-nd/3.0", "https://creativecommons.org/licenses/by-nc-nd/3.0/legalcode");
m.insert("https://creativecommons.org/licenses/by-nc-nd/4.0", "https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode");
m.insert("https://creativecommons.org/licenses/publicdomain", "https://creativecommons.org/licenses/publicdomain/");
m.insert("https://creativecommons.org/publicdomain/zero/1.0", "https://creativecommons.org/publicdomain/zero/1.0/legalcode");
m
};
}
if url_.is_empty() {
return (String::new(), false);
}
let normalized = match normalize_url(url_, true, false) {
Some(u) => u,
None => return (String::new(), false),
};
let mut u = match Url::parse(&normalized) {
Ok(u) => u,
Err(_) => return (String::new(), false),
};
if u.query().is_none() {
let path = u.path().to_string();
if path.len() > 1 && path.ends_with('/') {
u.set_path(&path[..path.len() - 1]);
}
}
let key = u.to_string();
if let Some(v) = CC_MAP.get(key.as_str()) {
return (v.to_string(), true);
}
let stripped = key
.strip_suffix("/legalcode")
.unwrap_or(key.as_str());
match CC_MAP.get(stripped) {
Some(v) => (v.to_string(), true),
None => (String::new(), false),
}
}
pub fn url_to_spdx(url: &str) -> String {
let (canonical, _) = normalize_cc_url(url);
if canonical.is_empty() {
return String::new();
}
match canonical.as_str() {
"https://creativecommons.org/licenses/by/1.0/legalcode" => "CC-BY-1.0",
"https://creativecommons.org/licenses/by/2.0/legalcode" => "CC-BY-2.0",
"https://creativecommons.org/licenses/by/2.5/legalcode" => "CC-BY-2.5",
"https://creativecommons.org/licenses/by/3.0/legalcode" => "CC-BY-3.0",
"https://creativecommons.org/licenses/by/4.0/legalcode" => "CC-BY-4.0",
"https://creativecommons.org/licenses/by-nc/1.0/legalcode" => "CC-BY-NC-1.0",
"https://creativecommons.org/licenses/by-nc/2.0/legalcode" => "CC-BY-NC-2.0",
"https://creativecommons.org/licenses/by-nc/2.5/legalcode" => "CC-BY-NC-2.5",
"https://creativecommons.org/licenses/by-nc/3.0/legalcode" => "CC-BY-NC-3.0",
"https://creativecommons.org/licenses/by-nc/4.0/legalcode" => "CC-BY-NC-4.0",
"https://creativecommons.org/licenses/by-nc-nd/1.0/legalcode" => "CC-BY-NC-ND-1.0",
"https://creativecommons.org/licenses/by-nc-nd/2.0/legalcode" => "CC-BY-NC-ND-2.0",
"https://creativecommons.org/licenses/by-nc-nd/2.5/legalcode" => "CC-BY-NC-ND-2.5",
"https://creativecommons.org/licenses/by-nc-nd/3.0/legalcode" => "CC-BY-NC-ND-3.0",
"https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode" => "CC-BY-NC-ND-4.0",
"https://creativecommons.org/licenses/by-nc-sa/1.0/legalcode" => "CC-BY-NC-SA-1.0",
"https://creativecommons.org/licenses/by-nc-sa/2.0/legalcode" => "CC-BY-NC-SA-2.0",
"https://creativecommons.org/licenses/by-nc-sa/2.5/legalcode" => "CC-BY-NC-SA-2.5",
"https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode" => "CC-BY-NC-SA-3.0",
"https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode" => "CC-BY-NC-SA-4.0",
"https://creativecommons.org/licenses/by-nd/1.0/legalcode" => "CC-BY-ND-1.0",
"https://creativecommons.org/licenses/by-nd/2.0/legalcode" => "CC-BY-ND-2.0",
"https://creativecommons.org/licenses/by-nd/2.5/legalcode" => "CC-BY-ND-2.5",
"https://creativecommons.org/licenses/by-nd/3.0/legalcode" => "CC-BY-ND-3.0",
"https://creativecommons.org/licenses/by-nd/4.0/legalcode" => "CC-BY-ND-4.0",
"https://creativecommons.org/licenses/by-sa/1.0/legalcode" => "CC-BY-SA-1.0",
"https://creativecommons.org/licenses/by-sa/2.0/legalcode" => "CC-BY-SA-2.0",
"https://creativecommons.org/licenses/by-sa/2.5/legalcode" => "CC-BY-SA-2.5",
"https://creativecommons.org/licenses/by-sa/3.0/legalcode" => "CC-BY-SA-3.0",
"https://creativecommons.org/licenses/by-sa/4.0/legalcode" => "CC-BY-SA-4.0",
"https://creativecommons.org/publicdomain/zero/1.0/legalcode" => "CC0-1.0",
_ => "",
}
.to_string()
}
pub fn issn_as_url(issn: &str) -> String {
if issn.is_empty() {
return String::new();
}
format!("https://portal.issn.org/resource/ISSN/{}", issn)
}
pub fn community_slug_as_url(slug: &str, host: &str) -> String {
if slug.is_empty() {
return String::new();
}
let h = if host.is_empty() { "rogue-scholar.org" } else { host };
format!("https://{}/api/communities/{}", h, slug)
}
pub fn sanitize(html: &str) -> String {
let allowed: std::collections::HashSet<&str> =
["b", "br", "code", "em", "i", "sub", "sup", "strong"]
.iter()
.copied()
.collect();
let clean = ammonia::Builder::new()
.tags(allowed)
.clean(html)
.to_string();
clean.trim_matches('\n').to_string()
}
pub fn title_case(s: &str) -> String {
let mut c = s.chars();
match c.next() {
None => String::new(),
Some(f) => f.to_uppercase().collect::<String>() + c.as_str(),
}
}
pub fn dedupe_slice<T: Eq + std::hash::Hash + Clone>(v: Vec<T>) -> Vec<T> {
let mut seen = std::collections::HashSet::new();
v.into_iter().filter(|x| seen.insert(x.clone())).collect()
}
pub fn camel_case_to_words(s: &str) -> String {
lazy_static! {
static ref RE1: Regex = Regex::new("(.)([A-Z][a-z]+)").unwrap();
static ref RE2: Regex = Regex::new("([a-z0-9])([A-Z])").unwrap();
}
let words = RE1.replace_all(s, "${1} ${2}");
let words = RE2.replace_all(&words, "${1} ${2}");
title_case(&words.to_lowercase())
}
pub fn words_to_camel_case(s: &str) -> String {
lazy_static! {
static ref RE1: Regex = Regex::new("(.)([A-Z][a-z]+)").unwrap();
static ref RE2: Regex = Regex::new("([a-z0-9])([A-Z])").unwrap();
}
let words = RE1.replace_all(s, "${1} ${2}");
let words = RE2.replace_all(&words, "${1} ${2}");
let pascal: String = words
.split_whitespace()
.map(title_case)
.collect::<String>();
let pascal = pascal.replace([' ', '-'], "");
if pascal.is_empty() {
return pascal;
}
let mut chars = pascal.chars();
match chars.next() {
None => String::new(),
Some(c) => c.to_lowercase().collect::<String>() + chars.as_str(),
}
}
pub fn camel_case_string(s: &str) -> String {
let mut chars = s.chars();
match chars.next() {
None => String::new(),
Some(c) => c.to_lowercase().collect::<String>() + chars.as_str(),
}
}
pub fn kebab_case_to_camel_case(s: &str) -> String {
lazy_static! {
static ref RE: Regex = Regex::new("-([a-z])").unwrap();
}
RE.replace_all(s, |caps: ®ex::Captures| caps[1].to_uppercase())
.to_string()
}
pub fn kebab_case_to_pascal_case(s: &str) -> String {
let camel = kebab_case_to_camel_case(s);
title_case(&camel)
}
pub fn normalize_string(s: &str) -> String {
s.nfd()
.filter(|c| !('\u{0300}'..='\u{036F}').contains(c))
.nfc()
.collect()
}
pub fn string_to_slug(s: &str) -> String {
normalize_string(s)
.chars()
.filter_map(|c| {
if c.is_alphanumeric() {
Some(c.to_lowercase().next().unwrap_or(c))
} else {
None
}
})
.collect()
}
pub fn split_string(s: &str, n: usize, sep: &str) -> String {
if n == 0 {
return s.to_string();
}
s.as_bytes()
.chunks(n)
.map(|chunk| std::str::from_utf8(chunk).unwrap_or(""))
.collect::<Vec<_>>()
.join(sep)
}
pub fn get_language(lang: &str, format: &str) -> String {
const LANGS: &[(&str, &str, &str)] = &[
("af", "afr", "Afrikaans"),
("ar", "ara", "Arabic"),
("bg", "bul", "Bulgarian"),
("cs", "ces", "Czech"),
("cy", "cym", "Welsh"),
("da", "dan", "Danish"),
("de", "deu", "German"),
("el", "ell", "Greek"),
("en", "eng", "English"),
("es", "spa", "Spanish"),
("et", "est", "Estonian"),
("fi", "fin", "Finnish"),
("fr", "fra", "French"),
("ga", "gle", "Irish"),
("gl", "glg", "Galician"),
("hr", "hrv", "Croatian"),
("hu", "hun", "Hungarian"),
("id", "ind", "Indonesian"),
("it", "ita", "Italian"),
("ja", "jpn", "Japanese"),
("ko", "kor", "Korean"),
("lt", "lit", "Lithuanian"),
("lv", "lav", "Latvian"),
("mk", "mkd", "Macedonian"),
("ms", "msa", "Malay"),
("mt", "mlt", "Maltese"),
("nl", "nld", "Dutch"),
("no", "nor", "Norwegian"),
("pl", "pol", "Polish"),
("pt", "por", "Portuguese"),
("ro", "ron", "Romanian"),
("ru", "rus", "Russian"),
("sk", "slk", "Slovak"),
("sl", "slv", "Slovenian"),
("sq", "sqi", "Albanian"),
("sr", "srp", "Serbian"),
("sv", "swe", "Swedish"),
("tr", "tur", "Turkish"),
("uk", "ukr", "Ukrainian"),
("vi", "vie", "Vietnamese"),
("zh", "zho", "Chinese"),
];
let lc = lang.to_lowercase();
for &(part1, part3, name) in LANGS {
if lc == part1 || lc == part3 || lc.eq_ignore_ascii_case(name) {
return match format {
"iso639-3" => part3.to_string(),
"name" => name.to_string(),
_ => part1.to_string(),
};
}
}
String::new()
}
pub fn find_from_format(
pid: Option<&str>,
str_: Option<&str>,
ext: Option<&str>,
filename: Option<&str>,
) -> &'static str {
if let Some(p) = pid
&& !p.is_empty() {
return find_from_format_by_id(p);
}
if let (Some(s), Some(e)) = (str_, ext)
&& !s.is_empty() && !e.is_empty() {
return find_from_format_by_ext(e);
}
if let Some(s) = str_
&& !s.is_empty() {
return find_from_format_by_string(s);
}
if let Some(f) = filename
&& !f.is_empty() {
return find_from_format_by_filename(f);
}
"datacite"
}
pub fn find_from_format_by_id(id: &str) -> &'static str {
if validate_doi(id).is_some() {
return "crossref";
}
if id.ends_with("codemeta.json") {
return "codemeta";
}
if id.ends_with("CITATION.cff") || id.contains("github.com") {
return "cff";
}
if id.contains("jsonfeed") {
return "jsonfeed";
}
lazy_static! {
static ref RE_ROGUE: Regex =
Regex::new(r"^https:/(/)?api\.rogue-scholar\.org/posts/(.+)$").unwrap();
static ref RE_INVENIO: Regex =
Regex::new(r"^https:/(/)(.+)/(api/)?records/(.+)$").unwrap();
}
if RE_ROGUE.is_match(id) {
return "jsonfeed";
}
if RE_INVENIO.is_match(id) {
return "inveniordm";
}
"schemaorg"
}
pub fn find_from_format_by_ext(ext: &str) -> &'static str {
match ext {
".bib" => "bibtex",
".ris" => "ris",
_ => "",
}
}
pub fn find_from_format_by_string(s: &str) -> &'static str {
let data: serde_json::Value = match serde_json::from_str(s) {
Ok(v) => v,
Err(_) => return "",
};
if let Some(v) = data.get("schema_version").and_then(|v| v.as_str())
&& v.starts_with("https://commonmeta.org") {
return "commonmeta";
}
if let Some(v) = data.get("@context").and_then(|v| v.as_str()) {
if v == "http://schema.org" {
return "schemaorg";
}
if v.contains("codemeta") {
return "codemeta";
}
}
if data.get("guid").is_some() {
return "jsonfeed";
}
if let Some(v) = data.get("schemaVersion").and_then(|v| v.as_str())
&& v.starts_with("http://datacite.org/schema/kernel") {
return "datacite";
}
if data.get("source").and_then(|v| v.as_str()) == Some("Crossref") {
return "crossref";
}
if data.get("conceptdoi").is_some() {
return "inveniordm";
}
if data.get("credit_metadata").is_some() {
return "kbase";
}
""
}
pub fn find_from_format_by_filename(filename: &str) -> &'static str {
if filename == "CITATION.cff" {
return "cff";
}
""
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_validate_openalex() {
assert_eq!(validate_openalex("W1234567890"), Some("W1234567890".into()));
assert_eq!(validate_openalex("https://openalex.org/W1234567890"), Some("W1234567890".into()));
assert_eq!(validate_openalex("X123"), None);
}
#[test]
fn test_validate_pmid() {
assert_eq!(validate_pmid("12345678"), Some("12345678".into()));
assert_eq!(validate_pmid("https://pubmed.ncbi.nlm.nih.gov/12345678"), Some("12345678".into()));
assert_eq!(validate_pmid("123"), None); }
#[test]
fn test_normalize_orcid() {
assert_eq!(
normalize_orcid("0000-0001-5000-0007"),
"https://orcid.org/0000-0001-5000-0007"
);
assert_eq!(normalize_orcid("not-an-orcid"), "");
}
#[test]
fn test_normalize_ror() {
assert_eq!(
normalize_ror("https://ror.org/0521rfr06"),
"https://ror.org/0521rfr06"
);
}
#[test]
fn test_issn_as_url() {
assert_eq!(
issn_as_url("1234-5678"),
"https://portal.issn.org/resource/ISSN/1234-5678"
);
assert_eq!(issn_as_url(""), "");
}
#[test]
fn test_community_slug_as_url() {
assert_eq!(
community_slug_as_url("my-blog", ""),
"https://rogue-scholar.org/api/communities/my-blog"
);
assert_eq!(
community_slug_as_url("blog", "example.org"),
"https://example.org/api/communities/blog"
);
}
#[test]
fn test_camel_case_string() {
assert_eq!(camel_case_string("IsVersionOf"), "isVersionOf");
assert_eq!(camel_case_string("HasPreprint"), "hasPreprint");
assert_eq!(camel_case_string(""), "");
}
#[test]
fn test_kebab_case_to_camel_case() {
assert_eq!(kebab_case_to_camel_case("foo-bar-baz"), "fooBarBaz");
assert_eq!(kebab_case_to_pascal_case("foo-bar"), "FooBar");
}
#[test]
fn test_normalize_string() {
assert_eq!(normalize_string("Héllo Wörld"), "Hello World");
assert_eq!(normalize_string("café"), "cafe");
}
#[test]
fn test_string_to_slug() {
assert_eq!(string_to_slug("Héllo Wörld!"), "helloworld");
assert_eq!(string_to_slug("café au lait"), "cafeaulait");
}
#[test]
fn test_split_string() {
assert_eq!(split_string("1234567890", 4, "-"), "1234-5678-90");
assert_eq!(split_string("abcdef", 2, "_"), "ab_cd_ef");
}
#[test]
fn test_get_language() {
assert_eq!(get_language("en", "iso639-3"), "eng");
assert_eq!(get_language("deu", ""), "de");
assert_eq!(get_language("French", "iso639-3"), "fra");
assert_eq!(get_language("xyz", ""), "");
}
#[test]
fn test_normalize_cc_url() {
let (url, ok) = normalize_cc_url("https://creativecommons.org/licenses/by/4.0/");
assert!(ok);
assert_eq!(url, "https://creativecommons.org/licenses/by/4.0/legalcode");
let (_, ok) = normalize_cc_url("https://example.com/license");
assert!(!ok);
}
#[test]
fn test_dedupe_slice() {
assert_eq!(dedupe_slice(vec![1, 2, 2, 3, 1]), vec![1, 2, 3]);
assert_eq!(
dedupe_slice(vec!["a", "b", "a"]),
vec!["a", "b"]
);
}
#[test]
fn test_find_from_format_by_string() {
let json = r#"{"schema_version":"https://commonmeta.org/commonmeta_v0.14","id":"x"}"#;
assert_eq!(find_from_format_by_string(json), "commonmeta");
let json = r#"{"guid":"abc-123","url":"https://example.com"}"#;
assert_eq!(find_from_format_by_string(json), "jsonfeed");
}
#[test]
fn test_validate_id_category() {
let (id, type_, cat) = validate_id_category("https://ror.org/0521rfr06");
assert_eq!(type_, "ROR");
assert_eq!(cat, "Organization");
assert_eq!(id, "0521rfr06");
let (_, type_, cat) = validate_id_category("https://orcid.org/0000-0001-5000-0007");
assert_eq!(type_, "ORCID");
assert_eq!(cat, "Person");
}
#[test]
fn test_validate_orcid_parity_cases() {
let cases = [
("http://orcid.org/0000-0002-2590-225X", Some("0000-0002-2590-225X")),
(
"https://orcid.org/0000-0002-1825-0097",
Some("0000-0002-1825-0097"),
),
("0000-0002-1825-0097", Some("0000-0002-1825-0097")),
(
"https://sandbox.orcid.org/0000-0002-1825-0097",
Some("0000-0002-1825-0097"),
),
("0000-0002-1825-009", None),
];
for (input, expected) in cases {
assert_eq!(validate_orcid(input).as_deref(), expected, "input: {input}");
}
}
#[test]
fn test_validate_isni_parity_cases() {
let cases = [
(
"https://isni.org/isni/0000000121122291",
Some("0000000121122291"),
),
(
"https://isni.org/isni/0000 0001 2112 2291",
Some("0000000121122291"),
),
("0000-0001-2112-2291", Some("0000000121122291")),
("https://isni.org/isni/000000021825009", None),
];
for (input, expected) in cases {
assert_eq!(validate_isni(input).as_deref(), expected, "input: {input}");
}
}
#[test]
fn test_validate_wikidata_parity_cases() {
let cases = [
("https://www.wikidata.org/wiki/Q7186", Some("Q7186")),
("https://www.wikidata.org/wiki/Q251061", Some("Q251061")),
("Q251061", Some("Q251061")),
("https://www.wikidata.org/wiki/Property:P610", None),
];
for (input, expected) in cases {
assert_eq!(validate_wikidata(input).as_deref(), expected, "input: {input}");
}
}
#[test]
fn test_validate_ror_parity_cases() {
let cases = [
("https://ror.org/0342dzm54", Some("0342dzm54")),
("0342dzm54", Some("0342dzm54")),
("invalid", None),
];
for (input, expected) in cases {
assert_eq!(validate_ror(input).as_deref(), expected, "input: {input}");
}
}
#[test]
fn test_validate_crossref_funder_id_parity_cases() {
let cases = [
("https://doi.org/10.13039/501100000155", Some("501100000155")),
("10.13039/501100000155", Some("501100000155")),
("100010540", Some("100010540")),
("not-a-funder-id", None),
];
for (input, expected) in cases {
assert_eq!(
validate_crossref_funder_id(input).as_deref(),
expected,
"input: {input}"
);
}
}
#[test]
fn test_validate_url_and_id_parity_cases() {
assert_eq!(validate_url("https://elifesciences.org/articles/91729"), "URL");
assert_eq!(validate_url("https://doi.org/10.7554/eLife.91729.3"), "DOI");
assert_eq!(validate_url("10.7554/eLife.91729.3"), "DOI");
assert_eq!(validate_url("https://doi.org/10.1101"), "URL");
assert_eq!(validate_url("10.1101"), "");
let (_, id_type) = validate_id("https://isni.org/isni/0000000121122291");
assert_eq!(id_type, "ISNI");
let (_, id_type) = validate_id("https://orcid.org/0000-0002-1825-0097");
assert_eq!(id_type, "ORCID");
let (_, id_type) = validate_id("https://datadryad.org/stash/dataset/doi:10.5061/dryad.8515");
assert_eq!(id_type, "URL");
}
#[test]
fn test_find_from_format_helpers_parity_cases() {
assert_eq!(find_from_format_by_ext(".bib"), "bibtex");
assert_eq!(find_from_format_by_ext(".ris"), "ris");
assert_eq!(find_from_format_by_ext(".json"), "");
assert_eq!(find_from_format_by_filename("CITATION.cff"), "cff");
assert_eq!(find_from_format_by_filename("citation.cff"), "");
}
}