use serde_yaml::Value;
use crate::author_utils::{
cleanup_author, infer_contributor_type, normalize_contributor_roles, parse_affiliation_value,
split_person_name,
};
use crate::data::{
Affiliation, Contributor, Data, License, Organization, Person, Publisher, Reference, Subject,
};
use crate::doi_utils::normalize_doi;
use crate::error::{Error, Result};
use crate::utils::{normalize_id, normalize_orcid, sanitize};
/// Borrows the text of a YAML string scalar.
///
/// Anything that is not a `Value::String` (numbers, sequences, mappings,
/// null, …) yields the empty string.
fn val_str(v: &Value) -> &str {
    if let Value::String(text) = v {
        text.as_str()
    } else {
        ""
    }
}
/// Renders a YAML scalar as an owned `String`.
///
/// Strings are copied verbatim and numbers are formatted with their
/// `to_string()` representation; every other kind of value produces an
/// empty string.
fn val_str_owned(v: &Value) -> String {
    if let Value::String(text) = v {
        return text.to_owned();
    }
    if let Value::Number(number) = v {
        return number.to_string();
    }
    String::new()
}
/// Borrows the items of a YAML sequence.
///
/// Returns an empty slice when `v` is not a `Value::Sequence`.
fn val_seq(v: &Value) -> &[Value] {
    let Value::Sequence(items) = v else {
        return &[];
    };
    items.as_slice()
}
/// Lazily initialised `Value::Null` shared by every failed lookup.
static NULL_VAL: std::sync::OnceLock<Value> = std::sync::OnceLock::new();

/// Returns a `'static` reference to a YAML null value.
///
/// The value is created on first use and reused afterwards, which lets
/// lookup helpers hand out a reference instead of an `Option`.
fn null_val() -> &'static Value {
    let make_null = || Value::Null;
    NULL_VAL.get_or_init(make_null)
}
/// Looks up `key` in a YAML mapping.
///
/// Returns the stored value when `v` is a `Value::Mapping` that contains
/// `key`. In every other case (key absent, or `v` is not a mapping) the
/// shared null value from `null_val()` is returned, so callers can chain
/// lookups without handling `Option`.
fn get<'a>(v: &'a Value, key: &str) -> &'a Value {
    match v {
        // `Mapping::get` accepts `&str` directly, so no temporary
        // `Value::String` has to be allocated for each lookup.
        Value::Mapping(m) => m.get(key).unwrap_or(null_val()),
        _ => null_val(),
    }
}
/// Components extracted from a GitHub URL.
#[derive(Debug, Clone, PartialEq, Eq)]
struct GithubParts {
    /// First path segment: the account or organisation that owns the repository.
    owner: String,
    /// Second path segment: the repository name.
    repo: String,
    /// Ref segment (branch, tag, …) parsed from the URL; `main` when the URL has none.
    release: String,
    /// Remaining path segments after the ref, joined with `/`; empty when absent.
    path: String,
}
/// Splits a GitHub URL into owner, repository, ref and file path.
///
/// Two URL layouts are understood:
///
/// * `github.com/<owner>/<repo>/<tree|blob>/<ref>/<path…>`
/// * `raw.githubusercontent.com/<owner>/<repo>/<ref>/<path…>`
///
/// When the URL carries no ref, `release` defaults to `"main"`; when there
/// is nothing after the ref, `path` is empty.
///
/// Returns `None` when the URL cannot be parsed, has no host, is not hosted
/// on `github.com` / `githubusercontent.com` (or one of their subdomains),
/// or lacks a non-empty owner or repository segment.
fn github_from_url(url: &str) -> Option<GithubParts> {
    let parsed = url::Url::parse(url).ok()?;
    let host = parsed.host_str()?;

    // Match on a domain boundary: a plain suffix test would also accept
    // look-alike hosts such as `evilgithub.com`.
    let on_domain = |domain: &str| {
        host == domain
            || host
                .strip_suffix(domain)
                .is_some_and(|prefix| prefix.ends_with('.'))
    };
    let is_raw_host = on_domain("githubusercontent.com");
    if !is_raw_host && !on_domain("github.com") {
        return None;
    }

    let words: Vec<&str> = parsed.path().trim_start_matches('/').split('/').collect();
    let segment = |idx: usize| words.get(idx).copied().filter(|s| !s.is_empty());

    let owner = segment(0)?.to_string();
    let repo = segment(1)?.to_string();

    // Raw-content URLs have no `tree`/`blob` segment, so the ref sits one
    // position earlier than on github.com.
    let ref_idx = if is_raw_host { 2 } else { 3 };
    let release = segment(ref_idx).map_or_else(|| "main".to_string(), str::to_string);
    let path = words
        .get(ref_idx + 1..)
        .map(|rest| rest.join("/"))
        .unwrap_or_default();

    Some(GithubParts {
        owner,
        repo,
        release,
        path,
    })
}
/// Derives the raw-content URL of a repository's `CITATION.cff`.
///
/// If the input URL already points at a file whose path ends in
/// `CITATION.cff`, the ref and path from that URL are kept. Otherwise the
/// file is assumed to live at the repository root on the `main` branch.
///
/// Returns `None` when `github_from_url` cannot interpret the URL.
fn github_as_cff_url(url: &str) -> Option<String> {
    let parts = github_from_url(url)?;
    let points_at_cff = !parts.path.is_empty() && parts.path.ends_with("CITATION.cff");

    if !points_at_cff {
        return Some(format!(
            "https://raw.githubusercontent.com/{}/{}/main/CITATION.cff",
            parts.owner, parts.repo
        ));
    }

    Some(format!(
        "https://raw.githubusercontent.com/{}/{}/{}/{}",
        parts.owner, parts.repo, parts.release, parts.path
    ))
}
/// Reduces any recognised GitHub URL to its repository home page,
/// `https://github.com/<owner>/<repo>`.
///
/// Returns `None` when `github_from_url` cannot interpret the URL.
fn github_as_repo_url(url: &str) -> Option<String> {
    github_from_url(url)
        .map(|parts| format!("https://github.com/{}/{}", parts.owner, parts.repo))
}
fn parse_cff_contributors(authors: &[Value]) -> Vec<Contributor> {
authors
.iter()
.map(|author| {
let mut family_name = val_str(get(author, "family-names")).to_string();
let mut given_name = val_str(get(author, "given-names")).to_string();
let orcid_raw = val_str(get(author, "orcid")).to_string();
let orcid = if !orcid_raw.is_empty() {
let n = normalize_orcid(&orcid_raw);
if n.is_empty() { None } else { Some(n) }
} else {
None
};
let cleaned_name = cleanup_author(Some(val_str(get(author, "name")))).unwrap_or_default();
let has_person_fields = !family_name.is_empty() || !given_name.is_empty() || orcid.is_some();
if has_person_fields
&& (family_name.is_empty() || given_name.is_empty())
&& !cleaned_name.is_empty()
{
let (g, f, _) = split_person_name(&cleaned_name);
if given_name.is_empty() {
given_name = g;
}
if family_name.is_empty() {
family_name = f;
}
}
let mut type_ = if has_person_fields {
infer_contributor_type(
"",
orcid.as_deref().unwrap_or(""),
&given_name,
&family_name,
&cleaned_name,
None,
)
} else {
"Organization".to_string()
};
if type_.is_empty() {
type_ = "Organization".to_string();
}
let roles = normalize_contributor_roles(&["Author".to_string()], "Author");
if type_ == "Person" {
let affiliations = match get(author, "affiliation") {
Value::String(s) if !s.is_empty() => vec![Affiliation {
name: s.clone(),
..Default::default()
}],
Value::Sequence(seq) => seq
.iter()
.filter_map(|a| {
let json_val = serde_json::Value::String(val_str(a).to_string());
parse_affiliation_value(&json_val)
})
.collect(),
_ => vec![],
};
Contributor::person(
Person {
id: orcid.unwrap_or_default(),
given_name,
family_name,
affiliations,
asserted_by: String::new(),
},
roles,
)
} else {
Contributor::organization(
Organization {
name: cleaned_name,
..Default::default()
},
roles,
)
}
})
.collect()
}
/// Extracts DOI-identified references from the CFF `references` list.
///
/// For each reference the first entry of `identifiers` whose `type` is
/// exactly `doi` is used. References without such an entry, with an empty
/// `value`, or whose value normalizes to an empty string are skipped. Only
/// the `id` field of the resulting `Reference` is populated.
fn parse_cff_references(references: &[Value]) -> Vec<Reference> {
    let mut parsed = Vec::new();

    for reference in references {
        let doi_entry = val_seq(get(reference, "identifiers"))
            .iter()
            .find(|ident| val_str(get(ident, "type")) == "doi");
        let Some(doi_entry) = doi_entry else {
            continue;
        };

        let raw = val_str(get(doi_entry, "value"));
        if raw.is_empty() {
            continue;
        }

        let id = normalize_doi(raw);
        if id.is_empty() {
            continue;
        }

        parsed.push(Reference {
            id,
            ..Default::default()
        });
    }

    parsed
}
/// Maps a parsed CITATION.cff document onto the crate's `Data` record.
///
/// Field mapping, as implemented below:
///
/// * `doi` → `id` (normalized), `repository-code` → `url` (normalized)
/// * `title`, `version`, `date-released` → copied as text
/// * `abstract` → `description` (sanitized)
/// * `license` → first license id, resolved through `crate::spdx::from_id`
/// * `keywords` → `subjects`, `authors` → `contributors`,
///   `references` → `references`
///
/// The record type is always `Software`, and the publisher is set to
/// `GitHub` when the repository URL contains `github.com`. Missing fields
/// are left at their defaults.
fn from_value(doc: &Value) -> Data {
    let doi = val_str_owned(get(doc, "doi"));
    let id = if doi.is_empty() {
        String::new()
    } else {
        normalize_doi(&doi)
    };

    let repository = val_str(get(doc, "repository-code")).to_string();
    let url = if repository.is_empty() {
        String::new()
    } else {
        normalize_id(&repository)
    };

    let mut publisher = Publisher::default();
    if url.contains("github.com") {
        publisher.name = "GitHub".to_string();
    }

    let title = val_str(get(doc, "title")).to_string();
    let contributors = parse_cff_contributors(val_seq(get(doc, "authors")));
    let date_published = val_str_owned(get(doc, "date-released"));

    let summary = val_str(get(doc, "abstract"));
    let description = if summary.is_empty() {
        String::new()
    } else {
        sanitize(summary)
    };

    // `license` may be a single id or a list; only the first list entry is used.
    let license_id = match get(doc, "license") {
        Value::String(single) => single.clone(),
        Value::Sequence(list) => match list.first() {
            Some(Value::String(first)) => first.clone(),
            _ => String::new(),
        },
        _ => String::new(),
    };
    let license = if license_id.is_empty() {
        License::default()
    } else {
        crate::spdx::from_id(&license_id)
    };

    let version = val_str_owned(get(doc, "version"));

    // Non-string and empty keywords are dropped.
    let subjects: Vec<Subject> = val_seq(get(doc, "keywords"))
        .iter()
        .filter_map(|keyword| {
            let text = val_str(keyword);
            (!text.is_empty()).then(|| Subject {
                subject: text.to_string(),
                ..Default::default()
            })
        })
        .collect();

    let references = parse_cff_references(val_seq(get(doc, "references")));

    Data {
        id,
        type_: "Software".to_string(),
        url,
        title,
        contributors,
        date_published,
        description,
        license,
        version,
        subjects,
        references,
        publisher,
        ..Data::default()
    }
}
pub fn read_yaml(input: &str) -> Result<Data> {
let doc: Value = serde_yaml::from_str(input).map_err(|e| Error::Parse(e.to_string()))?;
Ok(from_value(&doc))
}
pub fn fetch(url: &str) -> Result<Data> {
let cff_url = github_as_cff_url(url)
.ok_or_else(|| Error::Parse(format!("cannot derive CITATION.cff URL from: {}", url)))?;
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let text = client
.get(&cff_url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
let mut doc: Value = serde_yaml::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
if get(&doc, "repository-code") == null_val()
&& let Some(repo_url) = github_as_repo_url(&cff_url)
&& let Value::Mapping(ref mut m) = doc
{
m.insert(
Value::String("repository-code".to_string()),
Value::String(repo_url),
);
}
Ok(from_value(&doc))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture with one person author, one organization author and the
    /// common top-level fields. The nesting of the list items is
    /// significant: YAML is indentation-sensitive.
    const CFF_SOFTWARE: &str = r#"
cff-version: 1.2.0
title: My Research Software
authors:
  - family-names: Smith
    given-names: John
    orcid: https://orcid.org/0000-0002-1825-0097
    affiliation: University of Example
  - name: ACME Research Group
doi: 10.5281/zenodo.1234567
version: 2.1.0
date-released: 2024-03-15
abstract: A software tool for research.
license: MIT
keywords:
  - research
  - data science
repository-code: https://github.com/example/my-software
"#;

    /// Top-level scalar fields are mapped onto the record.
    #[test]
    fn test_read_cff_basic() {
        let data = read_yaml(CFF_SOFTWARE).unwrap();
        assert_eq!(data.type_, "Software");
        assert_eq!(data.id, "https://doi.org/10.5281/zenodo.1234567");
        assert_eq!(data.url, "https://github.com/example/my-software");
        assert_eq!(data.title, "My Research Software");
        assert_eq!(data.version, "2.1.0");
        assert_eq!(data.date_published, "2024-03-15");
        assert_eq!(data.license.id, "MIT");
        assert_eq!(data.publisher.name, "GitHub");
    }

    /// Person and organization authors are told apart.
    #[test]
    fn test_cff_contributors() {
        let data = read_yaml(CFF_SOFTWARE).unwrap();
        assert_eq!(data.contributors.len(), 2);
        let person = &data.contributors[0];
        assert_eq!(person.type_, "Person");
        assert_eq!(person.family_name(), "Smith");
        assert_eq!(person.given_name(), "John");
        assert_eq!(person.id(), "https://orcid.org/0000-0002-1825-0097");
        assert_eq!(person.affiliations()[0].name, "University of Example");
        let org = &data.contributors[1];
        assert_eq!(org.type_, "Organization");
        assert_eq!(org.name(), "ACME Research Group");
    }

    /// Keywords become subjects, in order.
    #[test]
    fn test_cff_subjects() {
        let data = read_yaml(CFF_SOFTWARE).unwrap();
        assert_eq!(data.subjects.len(), 2);
        assert_eq!(data.subjects[0].subject, "research");
        assert_eq!(data.subjects[1].subject, "data science");
    }

    /// The abstract is used as the description.
    #[test]
    fn test_cff_description() {
        let data = read_yaml(CFF_SOFTWARE).unwrap();
        assert_eq!(data.description, "A software tool for research.");
    }

    /// Repository and file URLs are rewritten to raw-content URLs.
    #[test]
    fn test_github_as_cff_url() {
        assert_eq!(
            github_as_cff_url("https://github.com/owner/repo"),
            Some("https://raw.githubusercontent.com/owner/repo/main/CITATION.cff".to_string())
        );
        assert_eq!(
            github_as_cff_url("https://github.com/owner/repo/tree/v1.0/CITATION.cff"),
            Some("https://raw.githubusercontent.com/owner/repo/v1.0/CITATION.cff".to_string())
        );
    }

    /// Only references that carry a DOI identifier are kept.
    #[test]
    fn test_cff_references() {
        let cff = r#"
cff-version: 1.2.0
title: Test
authors:
  - name: Author
references:
  - type: article
    title: Related paper
    identifiers:
      - type: doi
        value: 10.1000/ref.2024
  - type: book
    title: No DOI book
"#;
        let data = read_yaml(cff).unwrap();
        assert_eq!(data.references.len(), 1);
        assert_eq!(data.references[0].id, "https://doi.org/10.1000/ref.2024");
    }

    /// When `license` is a list, the first entry wins.
    #[test]
    fn test_cff_license_list() {
        let cff = r#"
cff-version: 1.2.0
title: Test
authors:
  - name: Author
license:
  - Apache-2.0
  - MIT
"#;
        let data = read_yaml(cff).unwrap();
        assert_eq!(data.license.id, "Apache-2.0");
    }
}