#![allow(dead_code)]
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::{HashMap, HashSet};
use crate::data::{Data, Date, Identifier, Relation, Title};
use crate::error::{Error, Result};
use crate::utils::{normalize_ror, validate_id, validate_ror};
use crate::formats::ror_countries::ROR_COUNTRIES;
fn null_as_empty<'de, D>(d: D) -> std::result::Result<String, D::Error>
where
D: Deserializer<'de>,
{
Ok(Option::<String>::deserialize(d)?.unwrap_or_default())
}
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Ror {
#[serde(default, deserialize_with = "null_as_empty")]
pub id: String,
#[serde(default)]
pub established: Option<i32>,
#[serde(default)]
pub external_ids: Vec<ExternalId>,
#[serde(default)]
pub links: Vec<Link>,
#[serde(default)]
pub locations: Vec<Location>,
#[serde(default)]
pub names: Vec<Name>,
#[serde(default)]
pub relationships: Vec<Relationship>,
#[serde(default, deserialize_with = "null_as_empty")]
pub status: String,
#[serde(rename = "types", default)]
pub types: Vec<String>,
}
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct ExternalId {
#[serde(rename = "type", default, deserialize_with = "null_as_empty")]
pub type_: String,
#[serde(default)]
pub all: Vec<String>,
#[serde(default, deserialize_with = "null_as_empty")]
pub preferred: String,
}
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Link {
#[serde(rename = "type", default, deserialize_with = "null_as_empty")]
pub type_: String,
#[serde(default, deserialize_with = "null_as_empty")]
pub value: String,
}
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Location {
#[serde(default)]
pub geonames_id: i64,
#[serde(default)]
pub geonames_details: GeonamesDetails,
}
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct GeonamesDetails {
#[serde(default, deserialize_with = "null_as_empty")]
pub country_code: String,
#[serde(default, deserialize_with = "null_as_empty")]
pub country_name: String,
#[serde(default, deserialize_with = "null_as_empty")]
pub country_subdivision_code: String,
#[serde(default, deserialize_with = "null_as_empty")]
pub country_subdivision_name: String,
#[serde(default)]
pub lat: f64,
#[serde(default)]
pub lng: f64,
#[serde(default, deserialize_with = "null_as_empty")]
pub name: String,
}
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Name {
#[serde(default, deserialize_with = "null_as_empty")]
pub value: String,
#[serde(rename = "types", default)]
pub types: Vec<String>,
#[serde(default, deserialize_with = "null_as_empty")]
pub lang: String,
}
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Relationship {
#[serde(rename = "type", default, deserialize_with = "null_as_empty")]
pub type_: String,
#[serde(default, deserialize_with = "null_as_empty")]
pub label: String,
#[serde(default, deserialize_with = "null_as_empty")]
pub id: String,
}
#[derive(Debug, Deserialize)]
struct RorListResponse {
#[serde(default)]
number_of_results: i32,
#[serde(default)]
items: Vec<Ror>,
}
pub fn get_display_name(ror: &Ror) -> String {
ror.names
.iter()
.find(|n| n.types.iter().any(|t| t == "ror_display"))
.map(|n| n.value.clone())
.unwrap_or_default()
}
fn get_website_url(ror: &Ror) -> String {
ror.links
.iter()
.find(|l| l.type_ == "website")
.map(|l| l.value.clone())
.unwrap_or_default()
}
fn external_id_type(ror_type: &str) -> &'static str {
match ror_type {
"GRID" => "GRID",
"Wikidata" => "Wikidata",
"FundRef" => "Crossref Funder ID",
"ISNI" => "ISNI",
_ => "",
}
}
fn from_ror(ror: Ror) -> Data {
let id = normalize_ror(&ror.id);
let type_ = "Organization".to_string();
let display_name = get_display_name(&ror);
let titles = if !display_name.is_empty() {
vec![Title { title: display_name, ..Default::default() }]
} else {
vec![]
};
let url = get_website_url(&ror);
let date = if let Some(year) = ror.established {
Date { published: year.to_string(), ..Default::default() }
} else {
Date::default()
};
let identifiers: Vec<Identifier> = ror
.external_ids
.iter()
.filter_map(|ext| {
let id_type = external_id_type(&ext.type_);
if id_type.is_empty() {
return None;
}
let value = if !ext.preferred.is_empty() {
ext.preferred.clone()
} else {
ext.all.first().cloned().unwrap_or_default()
};
if value.is_empty() {
return None;
}
Some(Identifier {
identifier: value,
identifier_type: id_type.to_string(),
})
})
.collect();
let relations: Vec<Relation> = ror
.relationships
.iter()
.filter(|r| !r.id.is_empty())
.map(|r| {
let rel_type = match r.type_.as_str() {
"parent" => "IsPartOf",
"child" => "HasPart",
"related" => "References",
other => other,
};
Relation {
id: normalize_ror(&r.id),
type_: rel_type.to_string(),
}
})
.collect();
Data {
id,
type_,
url,
titles,
date,
identifiers,
relations,
provider: "ROR".to_string(),
..Data::default()
}
}
#[derive(Serialize)]
struct OutInvenioRdm {
id: String,
name: String,
#[serde(skip_serializing_if = "String::is_empty")]
acronym: String,
#[serde(skip_serializing_if = "String::is_empty")]
country: String,
identifiers: Vec<OutIdentifier>,
title: HashMap<String, String>,
}
#[derive(Serialize)]
struct OutIdentifier {
identifier: String,
scheme: String,
}
fn cm_to_inveniordm_scheme(cm_type: &str) -> &'static str {
match cm_type {
"GRID" => "grid",
"Wikidata" => "wikidata",
"ISNI" => "isni",
"Crossref Funder ID" => "fundref",
_ => "",
}
}
fn cm_to_ror_ext_type(cm_type: &str) -> &'static str {
match cm_type {
"GRID" => "GRID",
"Wikidata" => "Wikidata",
"ISNI" => "ISNI",
"Crossref Funder ID" => "FundRef",
_ => "",
}
}
pub fn write(data: &Data) -> Result<Vec<u8>> {
let bare_id = validate_ror(&data.id).ok_or_else(|| {
Error::InvalidId(format!("not a valid ROR ID: {}", data.id))
})?;
let mut title: HashMap<String, String> = HashMap::new();
for t in &data.titles {
let lang = if t.language.is_empty() { "en".to_string() } else { t.language.clone() };
title.insert(lang, t.title.clone());
}
if title.is_empty() {
if let Some(t) = data.titles.first() {
title.insert("en".to_string(), t.title.clone());
}
}
let name = data.titles.first().map(|t| t.title.clone()).unwrap_or_default();
let mut identifiers = vec![OutIdentifier {
identifier: bare_id.clone(),
scheme: "ror".to_string(),
}];
for id in &data.identifiers {
let scheme = cm_to_inveniordm_scheme(&id.identifier_type);
if !scheme.is_empty() {
identifiers.push(OutIdentifier {
identifier: id.identifier.clone(),
scheme: scheme.to_string(),
});
}
}
let out = OutInvenioRdm {
id: bare_id,
name,
acronym: String::new(),
country: String::new(),
identifiers,
title,
};
serde_yaml::to_string(&out)
.map(|s| s.into_bytes())
.map_err(|e| Error::Serialize(e.to_string()))
}
pub fn write_json(data: &Data) -> Result<Vec<u8>> {
use serde_json::{json, Map, Value};
let ror_id = normalize_ror(&data.id);
let name = data.titles.first().map(|t| t.title.as_str()).unwrap_or("");
let mut names: Vec<Value> = Vec::new();
if !name.is_empty() {
let lang = data.titles.first()
.and_then(|t| if t.language.is_empty() { None } else { Some(t.language.as_str()) })
.unwrap_or("en");
names.push(json!({
"value": name,
"types": ["ror_display"],
"lang": lang
}));
}
let links: Vec<Value> = if !data.url.is_empty() {
vec![json!({"type": "website", "value": data.url})]
} else {
vec![]
};
let external_ids: Vec<Value> = data
.identifiers
.iter()
.filter_map(|id| {
let ext_type = cm_to_ror_ext_type(&id.identifier_type);
if ext_type.is_empty() {
return None;
}
Some(json!({
"type": ext_type,
"all": [id.identifier],
"preferred": id.identifier
}))
})
.collect();
let relationships: Vec<Value> = data
.relations
.iter()
.map(|r| {
let rel_type = match r.type_.as_str() {
"IsPartOf" => "parent",
"HasPart" => "child",
_ => "related",
};
json!({"type": rel_type, "id": r.id, "label": ""})
})
.collect();
let mut obj = Map::new();
obj.insert("id".to_string(), Value::String(ror_id));
obj.insert("names".to_string(), Value::Array(names));
obj.insert("links".to_string(), Value::Array(links));
obj.insert("external_ids".to_string(), Value::Array(external_ids));
obj.insert("relationships".to_string(), Value::Array(relationships));
obj.insert("status".to_string(), Value::String("active".to_string()));
obj.insert("types".to_string(), Value::Array(vec![]));
if let Ok(year) = data.date.published.parse::<i64>() {
obj.insert("established".to_string(), Value::Number(year.into()));
}
serde_json::to_vec_pretty(&Value::Object(obj))
.map_err(|e| Error::Serialize(e.to_string()))
}
#[derive(Debug, Default, Clone, Serialize, Deserialize, parquet_derive::ParquetRecordWriter)]
pub struct RorCsv {
pub id: String,
pub name: String,
pub types: String,
pub status: String,
pub links: String,
pub aliases: String,
pub labels: String,
pub acronyms: String,
pub wikipedia_url: String,
pub established: String,
pub latitude: String,
pub longitude: String,
pub place: String,
pub geonames_id: String,
pub country_subdivision_name: String,
pub country_subdivision_code: String,
pub country_code: String,
pub country_name: String,
pub external_ids_grid_preferred: String,
pub external_ids_grid_all: String,
pub external_ids_isni_preferred: String,
pub external_ids_isni_all: String,
pub external_ids_fundref_preferred: String,
pub external_ids_fundref_all: String,
pub external_ids_wikidata_preferred: String,
pub external_ids_wikidata_all: String,
pub relationships: String,
}
pub fn convert_ror_csv(ror: &Ror) -> RorCsv {
let mut out = RorCsv { id: ror.id.clone(), status: ror.status.clone(), ..Default::default() };
let mut acronyms = Vec::new();
let mut aliases = Vec::new();
let mut labels = Vec::new();
for name in &ror.names {
if name.types.iter().any(|t| t == "ror_display") {
out.name = name.value.clone();
} else if name.types.iter().any(|t| t == "acronym") && !name.value.is_empty() {
acronyms.push(name.value.clone());
} else if name.types.iter().any(|t| t == "alias") {
aliases.push(name.value.clone());
} else if name.types.iter().any(|t| t == "label") {
if !name.lang.is_empty() {
labels.push(format!("{}: {}", name.lang, name.value));
} else {
labels.push(name.value.clone());
}
}
}
let mut compacted_types: Vec<&str> = Vec::new();
for t in &ror.types {
if compacted_types.last().map(|last| *last != t.as_str()).unwrap_or(true) {
compacted_types.push(t.as_str());
}
}
out.types = compacted_types.join("; ");
for link in &ror.links {
if link.type_ == "website" {
out.links = link.value.clone();
} else if link.type_ == "wikipedia" {
out.wikipedia_url = link.value.clone();
}
}
out.aliases = aliases.join("; ");
out.labels = labels.join("; ");
out.acronyms = acronyms.join("; ");
if let Some(year) = ror.established {
if year != 0 {
out.established = year.to_string();
}
}
if let Some(loc) = ror.locations.first() {
out.latitude = format!("{:.6}", loc.geonames_details.lat);
out.longitude = format!("{:.6}", loc.geonames_details.lng);
out.place = loc.geonames_details.name.clone();
out.geonames_id = loc.geonames_id.to_string();
out.country_subdivision_name = loc.geonames_details.country_subdivision_name.clone();
out.country_subdivision_code = loc.geonames_details.country_subdivision_code.clone();
out.country_code = loc.geonames_details.country_code.clone();
out.country_name = loc.geonames_details.country_name.clone();
}
for ext in &ror.external_ids {
match ext.type_.to_lowercase().as_str() {
"grid" => {
out.external_ids_grid_preferred = ext.preferred.clone();
out.external_ids_grid_all = ext.all.join(";");
}
"isni" => {
out.external_ids_isni_preferred = ext.preferred.clone();
out.external_ids_isni_all = ext.all.join(";");
}
"fundref" => {
out.external_ids_fundref_preferred = ext.preferred.clone();
out.external_ids_fundref_all = ext.all.join(";");
}
"wikidata" => {
out.external_ids_wikidata_preferred = ext.preferred.clone();
out.external_ids_wikidata_all = ext.all.join(";");
}
_ => {}
}
}
let mut child = Vec::new();
let mut parent = Vec::new();
let mut related = Vec::new();
for rel in &ror.relationships {
match rel.type_.as_str() {
"child" => child.push(rel.id.clone()),
"parent" => parent.push(rel.id.clone()),
"related" => related.push(rel.id.clone()),
_ => {}
}
}
let mut groups = Vec::new();
if !child.is_empty() {
groups.push(format!("Child: {}", child.join(", ")));
}
if !parent.is_empty() {
groups.push(format!("Parent: {}", parent.join(", ")));
}
if !related.is_empty() {
groups.push(format!("Related: {}", related.join(", ")));
}
out.relationships = groups.join("; ");
out
}
pub fn write_csv(list: &[Ror]) -> Result<Vec<u8>> {
let mut writer = csv::Writer::from_writer(Vec::new());
for ror in list {
writer
.serialize(convert_ror_csv(ror))
.map_err(|e| Error::Serialize(e.to_string()))?;
}
writer
.into_inner()
.map_err(|e| Error::Serialize(e.to_string()))
}
pub fn write_parquet(list: &[Ror]) -> Result<Vec<u8>> {
use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;
use parquet::record::RecordWriter;
let rows: Vec<RorCsv> = list.iter().map(convert_ror_csv).collect();
let schema = rows.as_slice().schema().map_err(|e| Error::Serialize(e.to_string()))?;
let props = std::sync::Arc::new(WriterProperties::builder().build());
let buffer: Vec<u8> = Vec::new();
let mut writer = SerializedFileWriter::new(buffer, schema, props)
.map_err(|e| Error::Serialize(e.to_string()))?;
let mut row_group = writer.next_row_group().map_err(|e| Error::Serialize(e.to_string()))?;
rows.as_slice()
.write_to_row_group(&mut row_group)
.map_err(|e| Error::Serialize(e.to_string()))?;
row_group.close().map_err(|e| Error::Serialize(e.to_string()))?;
writer.into_inner().map_err(|e| Error::Serialize(e.to_string()))
}
pub fn write_all(list: &[Ror], extension: &str) -> Result<Vec<u8>> {
match extension {
".yaml" => serde_yaml::to_string(list)
.map(|s| s.into_bytes())
.map_err(|e| Error::Serialize(e.to_string())),
".json" => serde_json::to_vec(list).map_err(|e| Error::Serialize(e.to_string())),
".jsonl" => {
let mut out = Vec::new();
for item in list {
serde_json::to_writer(&mut out, item)
.map_err(|e| Error::Serialize(e.to_string()))?;
out.push(b'\n');
}
Ok(out)
}
".csv" => write_csv(list),
".parquet" => write_parquet(list),
other => Err(Error::UnsupportedFormat(other.to_string())),
}
}
const SPECIAL_CHARS: &str = r"[+\-=|><!()\\\{\}\[\]^~*?:/.,;]";
pub fn clean_search_string(s: &str) -> String {
let mut out = String::with_capacity(s.len());
for ch in s.chars() {
if matches!(ch, '+' | '-' | '=' | '|' | '>' | '<' | '!' | '(' | ')' |
'\\' | '{' | '}' | '[' | ']' | '^' | '"' | '~' | '*' |
'?' | ':' | '/' | '.' | ',' | ';') {
out.push(' ');
} else {
out.push(ch);
}
}
let mut result = String::with_capacity(out.len());
let mut chars = out.chars().peekable();
while let Some(ch) = chars.next() {
if ch.is_ascii_digit() {
let mut digits = String::new();
digits.push(ch);
while let Some(&next) = chars.peek() {
if next.is_ascii_digit() {
digits.push(next);
chars.next();
} else {
break;
}
}
if digits.len() != 5 {
result.push_str(&digits);
} else {
result.push(' ');
}
} else {
result.push(ch);
}
}
result.split_whitespace().collect::<Vec<_>>().join(" ")
}
pub fn to_region(code: &str) -> &'static str {
match code {
"GB" | "UK" => "GB-UK",
"CN" | "HK" | "TW" => "CN-HK-TW",
"PR" | "US" => "US-PR",
_ => {
""
}
}
}
pub fn get_country_codes(s: &str) -> Vec<String> {
let lower: String = s.to_lowercase();
let alpha_lower: String = lower
.chars()
.map(|c| if c.is_ascii_alphabetic() { c } else { ' ' })
.collect();
let alpha_lower = alpha_lower.split_whitespace().collect::<Vec<_>>().join(" ");
let alpha_tokens: Vec<&str> = alpha_lower.split_whitespace().collect();
let orig_tokens: Vec<String> = s
.split_whitespace()
.map(|t| t.chars().filter(|c| c.is_ascii_alphabetic()).collect::<String>().to_uppercase())
.filter(|t| !t.is_empty())
.collect();
let mut codes: HashSet<String> = HashSet::new();
'outer: for &(code, names) in ROR_COUNTRIES {
for &name in names {
let name_chars: Vec<char> = name.chars().collect();
let name_len = name_chars.len();
let matched = if name_len == 2 && name.chars().all(|c| c.is_ascii_alphabetic()) {
orig_tokens.iter().any(|t| t == &name.to_uppercase())
} else {
let has_special = name.chars().any(|c| !c.is_ascii_alphabetic() && c != ' ');
if has_special {
lower.contains(name)
} else if name.contains(' ') {
alpha_lower.contains(name)
} else {
alpha_tokens.iter().any(|&t| t == name)
}
};
if matched {
codes.insert(code.to_string());
continue 'outer;
}
}
}
let mut result: Vec<String> = codes.into_iter().collect();
result.sort();
result
}
pub fn get_countries(s: &str) -> Vec<String> {
let codes = get_country_codes(s);
let mut regions: HashSet<String> = HashSet::new();
for code in &codes {
let region = to_region(code);
if region.is_empty() {
regions.insert(code.clone());
} else {
regions.insert(region.to_string());
}
}
let mut result: Vec<String> = regions.into_iter().collect();
result.sort();
result
}
#[derive(Debug, Deserialize)]
pub struct AffiliationMatch {
pub substring: String,
pub score: f64,
pub matching_type: String,
pub chosen: bool,
#[serde(skip)]
pub organization: Data,
#[serde(rename = "organization")]
organization_raw: Ror,
}
#[derive(Debug, Deserialize)]
struct AffiliationResponse {
#[serde(default)]
number_of_results: i32,
#[serde(default)]
items: Vec<AffiliationMatch>,
}
pub fn match_affiliation(affiliation: &str) -> Result<Vec<AffiliationMatch>> {
let cleaned = clean_search_string(affiliation);
if cleaned.is_empty() {
return Ok(vec![]);
}
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let encoded: String = url::form_urlencoded::byte_serialize(cleaned.as_bytes()).collect();
let api_url = format!("https://api.ror.org/v2/organizations?affiliation={}", encoded);
let text = client
.get(&api_url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
let mut resp: AffiliationResponse =
serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
for item in &mut resp.items {
let ror = std::mem::take(&mut item.organization_raw);
item.organization = from_ror(ror);
}
Ok(resp.items)
}
pub fn read_json(input: &str) -> Result<Data> {
let ror: Ror = serde_json::from_str(input).map_err(|e| Error::Parse(e.to_string()))?;
Ok(from_ror(ror))
}
pub fn fetch(input: &str) -> Result<Data> {
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let (id, id_type) = validate_id(input);
let ror = if id_type == "ROR" {
let ror_id = validate_ror(&id).unwrap_or(id.clone());
let api_url = format!("https://api.ror.org/v2/organizations/{}", ror_id);
let text = client
.get(&api_url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
serde_json::from_str::<Ror>(&text).map_err(|e| Error::Parse(e.to_string()))?
} else {
let org_types = ["ROR", "Crossref Funder ID", "GRID", "Wikidata", "ISNI"];
if !org_types.contains(&id_type) {
return Err(Error::Parse(format!(
"Not a supported organization identifier: {}",
input
)));
}
let encoded: String = url::form_urlencoded::byte_serialize(id.as_bytes()).collect();
let api_url = format!("https://api.ror.org/v2/organizations?query={}", encoded);
let text = client
.get(&api_url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
let list: RorListResponse =
serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
if list.number_of_results != 1 {
return Err(Error::Parse(format!(
"Expected 1 result from ROR query, got {}",
list.number_of_results
)));
}
list.items.into_iter().next().ok_or_else(|| {
Error::Parse("No items in ROR query response".to_string())
})?
};
Ok(from_ror(ror))
}
#[cfg(test)]
mod tests {
use super::*;
const ROR_ORG: &str = r#"{
"id": "https://ror.org/02nr0ka47",
"established": 2013,
"external_ids": [
{
"type": "Wikidata",
"all": ["Q19341888"],
"preferred": ""
},
{
"type": "GRID",
"all": ["grid.465570.2"],
"preferred": "grid.465570.2"
},
{
"type": "FundRef",
"all": ["100012611"],
"preferred": "100012611"
}
],
"links": [
{"type": "website", "value": "https://impactstory.org"}
],
"locations": [
{
"geonames_id": 4774183,
"geonames_details": {
"country_code": "US",
"country_name": "United States",
"name": "Williamsburg"
}
}
],
"names": [
{"value": "Impactstory", "types": ["ror_display", "label"], "lang": "en"},
{"value": "IS", "types": ["acronym"], "lang": ""}
],
"relationships": [
{"type": "related", "label": "Our Society", "id": "https://ror.org/045gyfv07"}
],
"status": "active",
"types": ["nonprofit"]
}"#;
#[test]
fn test_read_ror_basic() {
let data = read_json(ROR_ORG).unwrap();
assert_eq!(data.id, "https://ror.org/02nr0ka47");
assert_eq!(data.type_, "Organization");
assert_eq!(data.provider, "ROR");
assert_eq!(data.url, "https://impactstory.org");
}
#[test]
fn test_ror_display_name() {
let data = read_json(ROR_ORG).unwrap();
assert_eq!(data.titles[0].title, "Impactstory");
}
#[test]
fn test_ror_date() {
let data = read_json(ROR_ORG).unwrap();
assert_eq!(data.date.published, "2013");
}
#[test]
fn test_ror_identifiers() {
let data = read_json(ROR_ORG).unwrap();
let wikidata = data.identifiers.iter().find(|i| i.identifier_type == "Wikidata").unwrap();
assert_eq!(wikidata.identifier, "Q19341888");
let grid = data.identifiers.iter().find(|i| i.identifier_type == "GRID").unwrap();
assert_eq!(grid.identifier, "grid.465570.2");
let fundref = data.identifiers.iter().find(|i| i.identifier_type == "Crossref Funder ID").unwrap();
assert_eq!(fundref.identifier, "100012611");
}
#[test]
fn test_ror_relations() {
let data = read_json(ROR_ORG).unwrap();
assert_eq!(data.relations.len(), 1);
assert_eq!(data.relations[0].id, "https://ror.org/045gyfv07");
assert_eq!(data.relations[0].type_, "References");
}
#[test]
fn test_ror_no_established() {
let json = r#"{
"id": "https://ror.org/01234abc",
"names": [{"value": "Test Org", "types": ["ror_display"]}],
"links": [],
"external_ids": [],
"relationships": [],
"status": "active",
"types": ["education"]
}"#;
let data = read_json(json).unwrap();
assert!(data.date.published.is_empty());
}
#[test]
fn test_get_display_name() {
let ror: Ror = serde_json::from_str(ROR_ORG).unwrap();
assert_eq!(get_display_name(&ror), "Impactstory");
}
#[test]
fn test_ror_no_display_name() {
let json = r#"{
"id": "https://ror.org/abc123",
"names": [{"value": "Other Name", "types": ["label"]}],
"links": [],
"external_ids": [],
"relationships": [],
"status": "active",
"types": []
}"#;
let data = read_json(json).unwrap();
assert!(data.titles.is_empty());
}
#[test]
fn test_write_inveniordm_yaml() {
let data = read_json(ROR_ORG).unwrap();
let bytes = write(&data).unwrap();
let yaml = String::from_utf8(bytes).unwrap();
assert!(yaml.contains("id: 02nr0ka47"), "expected bare ROR id, got:\n{yaml}");
assert!(yaml.contains("name: Impactstory"));
assert!(yaml.contains("scheme: ror"));
assert!(yaml.contains("scheme: wikidata"));
assert!(yaml.contains("scheme: grid"));
assert!(yaml.contains("scheme: fundref"));
assert!(yaml.contains("en: Impactstory"));
}
#[test]
fn test_write_ror_json() {
let data = read_json(ROR_ORG).unwrap();
let bytes = write_json(&data).unwrap();
let v: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
assert_eq!(v["id"].as_str().unwrap(), "https://ror.org/02nr0ka47");
assert_eq!(v["names"][0]["value"].as_str().unwrap(), "Impactstory");
assert_eq!(v["links"][0]["type"].as_str().unwrap(), "website");
assert_eq!(v["links"][0]["value"].as_str().unwrap(), "https://impactstory.org");
assert_eq!(v["established"].as_i64().unwrap(), 2013);
assert_eq!(v["status"].as_str().unwrap(), "active");
let ext_ids = v["external_ids"].as_array().unwrap();
assert!(ext_ids.iter().any(|e| e["type"] == "Wikidata"));
assert!(ext_ids.iter().any(|e| e["type"] == "GRID"));
assert!(ext_ids.iter().any(|e| e["type"] == "FundRef"));
let rels = v["relationships"].as_array().unwrap();
assert_eq!(rels[0]["type"].as_str().unwrap(), "related");
assert_eq!(rels[0]["id"].as_str().unwrap(), "https://ror.org/045gyfv07");
}
#[test]
fn test_write_invalid_ror_id() {
use crate::data::Data;
let data = Data { id: "https://doi.org/10.1234/not-a-ror".to_string(), ..Data::default() };
assert!(write(&data).is_err());
}
#[test]
fn test_clean_search_string_special_chars() {
assert_eq!(
clean_search_string("Dept. of Biology, MIT; USA"),
"Dept of Biology MIT USA"
);
}
#[test]
fn test_clean_search_string_postal_code() {
assert_eq!(
clean_search_string("Stanford University 94305 CA"),
"Stanford University CA"
);
assert_eq!(
clean_search_string("Lab 2024 report"),
"Lab 2024 report"
);
}
#[test]
fn test_clean_search_string_collapses_whitespace() {
assert_eq!(
clean_search_string(" University of Cambridge "),
"University of Cambridge"
);
}
#[test]
fn test_clean_search_string_empty() {
assert_eq!(clean_search_string(""), "");
assert_eq!(clean_search_string("...;;;"), "");
}
#[test]
fn test_get_country_codes_full_name() {
let codes = get_country_codes("University of California, United States");
assert!(codes.contains(&"US".to_string()), "expected US in {:?}", codes);
}
#[test]
fn test_get_country_codes_abbreviation() {
let codes = get_country_codes("Max Planck Institute, Germany");
assert!(codes.contains(&"DE".to_string()), "expected DE in {:?}", codes);
}
#[test]
fn test_get_country_codes_uk() {
let codes = get_country_codes("University of Oxford, United Kingdom");
assert!(codes.contains(&"UK".to_string()), "expected UK in {:?}", codes);
}
#[test]
fn test_get_country_codes_no_match() {
let codes = get_country_codes("Department of Zoology");
let _ = codes;
}
#[test]
fn test_get_countries_region_mapping() {
let regions = get_countries("Harvard University, USA");
assert!(regions.contains(&"US-PR".to_string()), "expected US-PR in {:?}", regions);
}
#[test]
fn test_get_countries_uk_region() {
let regions = get_countries("Imperial College London, UK");
assert!(regions.contains(&"GB-UK".to_string()), "expected GB-UK in {:?}", regions);
}
#[test]
fn test_to_region_special() {
assert_eq!(to_region("US"), "US-PR");
assert_eq!(to_region("PR"), "US-PR");
assert_eq!(to_region("GB"), "GB-UK");
assert_eq!(to_region("UK"), "GB-UK");
assert_eq!(to_region("CN"), "CN-HK-TW");
assert_eq!(to_region("HK"), "CN-HK-TW");
assert_eq!(to_region("TW"), "CN-HK-TW");
}
#[test]
fn test_to_region_passthrough() {
assert_eq!(to_region("DE"), "");
assert_eq!(to_region("FR"), "");
}
#[test]
fn test_get_country_codes_france() {
let codes = get_country_codes("CNRS, France");
assert!(codes.contains(&"FR".to_string()), "expected FR in {:?}", codes);
}
#[test]
fn test_get_country_codes_japan() {
let codes = get_country_codes("University of Tokyo, Japan");
assert!(codes.contains(&"JP".to_string()), "expected JP in {:?}", codes);
}
fn sample_ror() -> Ror {
serde_json::from_str(ROR_ORG).unwrap()
}
#[test]
fn test_convert_ror_csv_basic() {
let ror = sample_ror();
let row = convert_ror_csv(&ror);
assert_eq!(row.id, "https://ror.org/02nr0ka47");
assert_eq!(row.name, "Impactstory");
assert_eq!(row.types, "nonprofit");
assert_eq!(row.status, "active");
assert_eq!(row.links, "https://impactstory.org");
assert_eq!(row.established, "2013");
assert_eq!(row.country_code, "US");
assert_eq!(row.place, "Williamsburg");
}
#[test]
fn test_convert_ror_csv_external_ids() {
let ror = sample_ror();
let row = convert_ror_csv(&ror);
assert_eq!(row.external_ids_wikidata_all, "Q19341888");
assert_eq!(row.external_ids_grid_preferred, "grid.465570.2");
assert_eq!(row.external_ids_fundref_preferred, "100012611");
}
#[test]
fn test_convert_ror_csv_relationships() {
let ror = sample_ror();
let row = convert_ror_csv(&ror);
assert_eq!(row.relationships, "Related: https://ror.org/045gyfv07");
}
#[test]
fn test_convert_ror_csv_no_established() {
let mut ror = sample_ror();
ror.established = None;
let row = convert_ror_csv(&ror);
assert!(row.established.is_empty());
ror.established = Some(0);
let row = convert_ror_csv(&ror);
assert!(row.established.is_empty());
}
#[test]
fn test_convert_ror_csv_no_locations() {
let mut ror = sample_ror();
ror.locations.clear();
let row = convert_ror_csv(&ror);
assert!(row.country_code.is_empty());
assert!(row.place.is_empty());
}
#[test]
fn test_write_csv_roundtrip() {
let ror = sample_ror();
let bytes = write_csv(std::slice::from_ref(&ror)).unwrap();
let text = String::from_utf8(bytes).unwrap();
let mut reader = csv::Reader::from_reader(text.as_bytes());
let records: Vec<RorCsv> = reader.deserialize().map(|r| r.unwrap()).collect();
assert_eq!(records.len(), 1);
assert_eq!(records[0].name, "Impactstory");
assert_eq!(records[0].id, "https://ror.org/02nr0ka47");
}
#[test]
fn test_write_parquet_roundtrip() {
let ror = sample_ror();
let bytes = write_parquet(std::slice::from_ref(&ror)).unwrap();
assert!(!bytes.is_empty());
assert_eq!(&bytes[0..4], b"PAR1");
assert_eq!(&bytes[bytes.len() - 4..], b"PAR1");
}
#[test]
fn test_write_all_dispatch() {
let ror = sample_ror();
let list = vec![ror];
let json = write_all(&list, ".json").unwrap();
assert!(String::from_utf8(json).unwrap().contains("Impactstory"));
let yaml = write_all(&list, ".yaml").unwrap();
assert!(String::from_utf8(yaml).unwrap().contains("Impactstory"));
let jsonl = write_all(&list, ".jsonl").unwrap();
let jsonl_text = String::from_utf8(jsonl).unwrap();
assert_eq!(jsonl_text.lines().count(), 1);
let csv_bytes = write_all(&list, ".csv").unwrap();
assert!(String::from_utf8(csv_bytes).unwrap().contains("Impactstory"));
let parquet_bytes = write_all(&list, ".parquet").unwrap();
assert_eq!(&parquet_bytes[0..4], b"PAR1");
assert!(write_all(&list, ".sql").is_err());
}
#[test]
fn test_write_all_empty_list() {
let list: Vec<Ror> = vec![];
assert!(write_all(&list, ".json").unwrap() == b"[]");
let csv_bytes = write_all(&list, ".csv").unwrap();
assert!(csv_bytes.is_empty());
}
}