#![allow(dead_code)]
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::{HashMap, HashSet};
use std::path::Path;
use crate::data::{Data, Identifier, Relation};
use crate::error::{Error, Result};
use crate::utils::{normalize_ror, validate_id, validate_ror};
use crate::ror_countries::ROR_COUNTRIES;
/// Serde `deserialize_with` helper for string fields.
///
/// Reads an optional string and turns an explicit `null` into the empty
/// string, so callers never have to deal with `Option<String>`.
fn null_as_empty<'de, D>(d: D) -> std::result::Result<String, D::Error>
where
    D: Deserializer<'de>,
{
    let maybe_text = <Option<String>>::deserialize(d)?;
    Ok(match maybe_text {
        Some(text) => text,
        None => String::new(),
    })
}
/// Serde `serialize_with` helper for string fields.
///
/// Writes the empty string as a null/none value and any other string as-is.
fn serialize_empty_as_null<S>(v: &str, s: S) -> std::result::Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    match v {
        "" => s.serialize_none(),
        text => s.serialize_str(text),
    }
}
/// One administrative timestamp entry of a ROR record (used for both
/// `created` and `last_modified` in [`Admin`]).
///
/// Missing or `null` values deserialize to empty strings.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct AdminEntry {
/// Date string as delivered in the record; empty when absent or null.
#[serde(default, deserialize_with = "null_as_empty")]
pub date: String,
/// Schema version string as delivered in the record; empty when absent or null.
#[serde(default, deserialize_with = "null_as_empty")]
pub schema_version: String,
}
/// Administrative metadata of a ROR record: creation and last-modification
/// entries. Either entry falls back to its default when missing.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Admin {
/// Entry describing when the record was created.
#[serde(default)]
pub created: AdminEntry,
/// Entry describing the most recent modification; its `date` is used as
/// the "date updated" value elsewhere in this module.
#[serde(default)]
pub last_modified: AdminEntry,
}
/// A ROR organization record as (de)serialized by this module.
///
/// Every field is optional on input: missing collections become empty
/// vectors and missing or `null` strings become empty strings.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Ror {
/// Administrative metadata; `None` when the record has no `admin` object.
#[serde(default)]
pub admin: Option<Admin>,
/// Domain names associated with the organization.
#[serde(default)]
pub domains: Vec<String>,
/// ROR identifier of the organization.
#[serde(default, deserialize_with = "null_as_empty")]
pub id: String,
/// Year of establishment, if known.
#[serde(default)]
pub established: Option<i32>,
/// Identifiers from other registries (see [`ExternalId`]).
#[serde(default)]
pub external_ids: Vec<ExternalId>,
/// Typed links (the code in this file looks for `website` and `wikipedia`).
#[serde(default)]
pub links: Vec<Link>,
/// Locations; the first one is treated as the primary location.
#[serde(default)]
pub locations: Vec<Location>,
/// All names of the organization (display name, aliases, labels, acronyms).
#[serde(default)]
pub names: Vec<Name>,
/// Relationships to other organizations.
#[serde(default)]
pub relationships: Vec<Relationship>,
/// Record status string (e.g. `active`, which local FTS queries filter on).
#[serde(default, deserialize_with = "null_as_empty")]
pub status: String,
/// Organization type strings.
#[serde(rename = "types", default)]
pub types: Vec<String>,
}
/// An identifier of the organization in another registry.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct ExternalId {
/// Registry type; serialized as `type`. This file recognizes `grid`,
/// `wikidata`, `fundref` and `isni` (compared case-insensitively).
#[serde(rename = "type", default, deserialize_with = "null_as_empty")]
pub type_: String,
/// All known identifier values for this registry.
#[serde(default)]
pub all: Vec<String>,
/// Preferred identifier value; empty when absent or null, and written
/// back as null when empty.
#[serde(default, deserialize_with = "null_as_empty", serialize_with = "serialize_empty_as_null")]
pub preferred: String,
}
/// A typed link belonging to an organization.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Link {
/// Link type; serialized as `type` (e.g. `website`, `wikipedia`).
#[serde(rename = "type", default, deserialize_with = "null_as_empty")]
pub type_: String,
/// The link target; empty when absent or null.
#[serde(default, deserialize_with = "null_as_empty")]
pub value: String,
}
/// A location of an organization, keyed by a GeoNames identifier.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Location {
/// GeoNames identifier; `0` when missing (treated as "no id" by `from_ror`).
#[serde(default)]
pub geonames_id: i64,
/// Place details associated with the GeoNames identifier.
#[serde(default)]
pub geonames_details: GeonamesDetails,
}
/// Place details attached to a [`Location`].
///
/// String fields are empty when absent or null; coordinates default to `0.0`.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct GeonamesDetails {
/// Continent code.
#[serde(default, deserialize_with = "null_as_empty")]
pub continent_code: String,
/// Continent name.
#[serde(default, deserialize_with = "null_as_empty")]
pub continent_name: String,
/// Country code; used as the organization's country elsewhere in this file.
#[serde(default, deserialize_with = "null_as_empty")]
pub country_code: String,
/// Country name.
#[serde(default, deserialize_with = "null_as_empty")]
pub country_name: String,
/// Code of the country subdivision (state, province, ...).
#[serde(default, deserialize_with = "null_as_empty")]
pub country_subdivision_code: String,
/// Name of the country subdivision.
#[serde(default, deserialize_with = "null_as_empty")]
pub country_subdivision_name: String,
/// Latitude; `0.0` when missing.
#[serde(default)]
pub lat: f64,
/// Longitude; `0.0` when missing.
#[serde(default)]
pub lng: f64,
/// Name of the place itself.
#[serde(default, deserialize_with = "null_as_empty")]
pub name: String,
}
/// One name of an organization together with its roles and language.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Name {
/// The name text.
#[serde(default, deserialize_with = "null_as_empty")]
pub value: String,
/// Roles of this name; this file checks for `ror_display`, `acronym`,
/// `alias` and `label`.
#[serde(rename = "types", default)]
pub types: Vec<String>,
/// Language tag; empty when absent or null, and written back as null
/// when empty.
#[serde(default, deserialize_with = "null_as_empty", serialize_with = "serialize_empty_as_null")]
pub lang: String,
}
/// A relationship from one organization to another.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Relationship {
/// Relationship type; serialized as `type`. This file handles `parent`,
/// `child` and `related` explicitly.
#[serde(rename = "type", default, deserialize_with = "null_as_empty")]
pub type_: String,
/// Human-readable label of the related organization.
#[serde(default, deserialize_with = "null_as_empty")]
pub label: String,
/// Identifier of the related organization.
#[serde(default, deserialize_with = "null_as_empty")]
pub id: String,
}
/// Envelope for a list of ROR records: a result count plus the records.
// NOTE(review): not referenced anywhere in the visible part of this file.
#[derive(Debug, Deserialize)]
struct RorListResponse {
/// Number of results reported by the envelope; `0` when missing.
#[serde(default)]
number_of_results: i32,
/// The organization records; empty when missing.
#[serde(default)]
items: Vec<Ror>,
}
/// Returns the value of the first name tagged `ror_display`.
///
/// Yields an empty string when the record has no such name.
pub fn get_display_name(ror: &Ror) -> String {
    for entry in &ror.names {
        if entry.types.iter().any(|kind| kind == "ror_display") {
            return entry.value.clone();
        }
    }
    String::new()
}
/// Returns the value of the first link whose type is `website`.
///
/// Yields an empty string when the record has no website link.
fn get_website_url(ror: &Ror) -> String {
    for link in &ror.links {
        if link.type_ == "website" {
            return link.value.clone();
        }
    }
    String::new()
}
/// Maps a ROR external-id type to the identifier type name used in `Data`.
///
/// The comparison ignores case (the input is lowercased first). Unknown
/// types map to the empty string.
fn external_id_type(ror_type: &str) -> &'static str {
    const KNOWN: [(&str, &str); 4] = [
        ("grid", "GRID"),
        ("wikidata", "Wikidata"),
        ("fundref", "Crossref Funder ID"),
        ("isni", "ISNI"),
    ];
    let wanted = ror_type.to_lowercase();
    for &(key, label) in KNOWN.iter() {
        if key == wanted {
            return label;
        }
    }
    ""
}
fn from_ror(ror: Ror) -> Data {
let id = normalize_ror(&ror.id);
let name = get_display_name(&ror);
let title = name.clone();
let acronym = ror.names.iter()
.find(|n| n.types.iter().any(|t| t == "acronym"))
.map(|n| n.value.clone())
.unwrap_or_default();
let additional_names: Vec<String> = ror.names.iter()
.filter(|n| {
n.types.iter().any(|t| t == "alias" || t == "label")
&& !n.types.iter().any(|t| t == "ror_display")
})
.map(|n| n.value.clone())
.filter(|s| !s.is_empty() && s != &name)
.collect();
let types: Vec<String> = ror.types.iter()
.map(|t| t.to_lowercase())
.collect();
let status = ror.status.clone();
let established = ror.established;
let url = get_website_url(&ror);
let urls: Vec<crate::data::PersonUrl> = ror.links.iter()
.filter(|l| !l.value.is_empty())
.map(|l| crate::data::PersonUrl {
name: l.type_.clone(),
url: l.value.clone(),
})
.collect();
let first_loc = ror.locations.first();
let country = first_loc
.map(|l| l.geonames_details.country_code.clone())
.unwrap_or_default();
let geo_locations: Vec<crate::data::GeoLocation> = ror.locations.iter()
.filter(|l| l.geonames_details.lat != 0.0 || l.geonames_details.lng != 0.0)
.map(|l| {
let d = &l.geonames_details;
let place = if d.country_subdivision_name == d.name || d.country_subdivision_name.is_empty() {
format!("{}, {}", d.name, d.country_name)
} else {
format!("{}, {}, {}", d.name, d.country_subdivision_name, d.country_name)
};
let id = if l.geonames_id != 0 {
format!("https://www.geonames.org/{}", l.geonames_id)
} else {
String::new()
};
crate::data::GeoLocation {
id,
place: place,
point_latitude: Some(d.lat),
point_longitude: Some(d.lng),
..Default::default()
}
})
.collect();
let date_updated = ror.admin.as_ref()
.map(|a| a.last_modified.date.clone())
.unwrap_or_default();
let identifiers: Vec<Identifier> = ror
.external_ids
.iter()
.filter_map(|ext| {
let id_type = external_id_type(&ext.type_);
if id_type.is_empty() {
return None;
}
let value = if !ext.preferred.is_empty() {
ext.preferred.clone()
} else {
ext.all.first().cloned().unwrap_or_default()
};
if value.is_empty() {
return None;
}
Some(Identifier {
identifier: value,
identifier_type: id_type.to_string(),
..Default::default()
})
})
.collect();
let relations: Vec<Relation> = ror
.relationships
.iter()
.filter(|r| !r.id.is_empty())
.map(|r| {
let rel_type = match r.type_.as_str() {
"parent" => "IsPartOf",
"child" => "HasPart",
"related" => "IsRelatedTo",
_ => "Other",
};
Relation {
id: normalize_ror(&r.id),
type_: rel_type.to_string(),
..Default::default()
}
})
.collect();
Data {
id,
type_: "Organization".to_string(),
name,
title,
acronym,
additional_names,
types,
status,
established,
url,
urls,
country,
geo_locations,
date_updated,
identifiers,
relations,
asserted_by: "ROR".to_string(),
provider: "ROR".to_string(),
..Data::default()
}
}
/// Output record serialized to YAML by `write`.
// NOTE(review): the name suggests the InvenioRDM affiliations vocabulary
// format — confirm against the consumer.
#[derive(Serialize)]
struct OutInvenioRdm {
/// Bare ROR identifier as returned by `validate_ror`.
id: String,
/// Organization name.
name: String,
/// Acronym; omitted from the output when empty.
#[serde(skip_serializing_if = "String::is_empty")]
acronym: String,
/// Country; omitted from the output when empty.
#[serde(skip_serializing_if = "String::is_empty")]
country: String,
/// Identifiers of the organization, each with a scheme.
identifiers: Vec<OutIdentifier>,
/// Titles keyed by language code.
title: HashMap<String, String>,
}
/// One identifier entry of an [`OutInvenioRdm`] record.
#[derive(Serialize)]
struct OutIdentifier {
/// The identifier value.
identifier: String,
/// Scheme name such as `ror`, `grid`, `wikidata`, `isni` or `fundref`.
scheme: String,
}
/// Maps an identifier type name from `Data` to the lowercase scheme name
/// used in the YAML output of `write`.
///
/// The match is case-sensitive; unknown types map to the empty string.
fn cm_to_inveniordm_scheme(cm_type: &str) -> &'static str {
    const SCHEMES: [(&str, &str); 4] = [
        ("GRID", "grid"),
        ("Wikidata", "wikidata"),
        ("ISNI", "isni"),
        ("Crossref Funder ID", "fundref"),
    ];
    for &(known, scheme) in SCHEMES.iter() {
        if known == cm_type {
            return scheme;
        }
    }
    ""
}
/// Maps an identifier type name from `Data` to the external-id type string
/// written by `convert_json`.
///
/// The match is case-sensitive; unknown types map to the empty string.
fn cm_to_ror_ext_type(cm_type: &str) -> &'static str {
    const EXT_TYPES: [(&str, &str); 4] = [
        ("GRID", "GRID"),
        ("Wikidata", "Wikidata"),
        ("ISNI", "ISNI"),
        ("Crossref Funder ID", "FundRef"),
    ];
    for &(known, ext_type) in EXT_TYPES.iter() {
        if known == cm_type {
            return ext_type;
        }
    }
    ""
}
/// Serializes an organization as a YAML [`OutInvenioRdm`] record.
///
/// The output always contains the bare ROR id, the name (taken from
/// `data.title`), a `ror` identifier plus every identifier whose type has a
/// known scheme, and a language-keyed title map (language defaults to `en`).
/// `acronym` and `country` are copied from `data` and are omitted from the
/// YAML when empty.
///
/// # Errors
///
/// * [`Error::InvalidId`] when `data.id` is not accepted by `validate_ror`.
/// * [`Error::Serialize`] when YAML serialization fails.
pub fn write(data: &Data) -> Result<Vec<u8>> {
    let bare_id = validate_ror(&data.id)
        .ok_or_else(|| Error::InvalidId(format!("not a valid ROR ID: {}", data.id)))?;

    let mut title: HashMap<String, String> = HashMap::new();
    if !data.title.is_empty() {
        let lang = if data.language.is_empty() {
            "en".to_string()
        } else {
            data.language.clone()
        };
        title.insert(lang, data.title.clone());
    }
    let name = data.title.clone();

    // The ROR id itself always comes first, followed by mapped identifiers.
    let mut identifiers = vec![OutIdentifier {
        identifier: bare_id.clone(),
        scheme: "ror".to_string(),
    }];
    for id in &data.identifiers {
        let scheme = cm_to_inveniordm_scheme(&id.identifier_type);
        if !scheme.is_empty() {
            identifiers.push(OutIdentifier {
                identifier: id.identifier.clone(),
                scheme: scheme.to_string(),
            });
        }
    }

    let out = OutInvenioRdm {
        id: bare_id,
        name,
        // Previously hard-coded to empty strings, which made these two
        // optional output fields unreachable.
        acronym: data.acronym.clone(),
        country: data.country.clone(),
        identifiers,
        title,
    };
    serde_yaml::to_string(&out)
        .map(|s| s.into_bytes())
        .map_err(|e| Error::Serialize(e.to_string()))
}
/// Builds a ROR-style JSON object from a `Data` record.
///
/// The object contains `id`, `names`, `links`, `external_ids`,
/// `relationships`, `status` (defaulting to `active`), `types` (first ASCII
/// letter upper-cased) and, when known, `established`.
fn convert_json(data: &Data) -> serde_json::Value {
    use serde_json::{json, Map, Value};

    // Names: display name first, then acronym, then aliases.
    let display_name = if data.name.is_empty() { &data.title } else { &data.name };
    let mut names: Vec<Value> = Vec::new();
    if !display_name.is_empty() {
        let lang = if data.language.is_empty() { "en" } else { data.language.as_str() };
        names.push(json!({
            "value": display_name,
            "types": ["ror_display"],
            "lang": lang
        }));
    }
    if !data.acronym.is_empty() {
        names.push(json!({"value": data.acronym, "types": ["acronym"], "lang": ""}));
    }
    names.extend(
        data.additional_names
            .iter()
            .map(|alt| json!({"value": alt, "types": ["alias"], "lang": ""})),
    );

    // Links: the typed url list wins over the single `url` field.
    let links: Vec<Value> = match (data.urls.is_empty(), data.url.is_empty()) {
        (false, _) => data
            .urls
            .iter()
            .map(|u| json!({"type": u.name, "value": u.url}))
            .collect(),
        (true, false) => vec![json!({"type": "website", "value": data.url})],
        (true, true) => Vec::new(),
    };

    // External ids: only identifier types with a known mapping are kept.
    let mut external_ids: Vec<Value> = Vec::new();
    for id in &data.identifiers {
        let ext_type = cm_to_ror_ext_type(&id.identifier_type);
        if ext_type.is_empty() {
            continue;
        }
        external_ids.push(json!({
            "type": ext_type,
            "all": [id.identifier],
            "preferred": id.identifier
        }));
    }

    // Relationships: anything that is not part-of/has-part becomes "related".
    let mut relationships: Vec<Value> = Vec::with_capacity(data.relations.len());
    for relation in &data.relations {
        let rel_type = match relation.type_.as_str() {
            "IsPartOf" => "parent",
            "HasPart" => "child",
            _ => "related",
        };
        relationships.push(json!({"type": rel_type, "id": relation.id, "label": ""}));
    }

    let status: &str = if data.status.is_empty() { "active" } else { &data.status };

    let types: Vec<Value> = data
        .types
        .iter()
        .map(|kind| {
            let mut label = kind.clone();
            // Only touches the first byte when it is a complete character.
            if let Some(first) = label.get_mut(0..1) {
                first.make_ascii_uppercase();
            }
            Value::String(label)
        })
        .collect();

    let mut obj = Map::new();
    obj.insert("id".to_string(), Value::String(normalize_ror(&data.id)));
    obj.insert("names".to_string(), Value::Array(names));
    obj.insert("links".to_string(), Value::Array(links));
    obj.insert("external_ids".to_string(), Value::Array(external_ids));
    obj.insert("relationships".to_string(), Value::Array(relationships));
    obj.insert("status".to_string(), Value::String(status.to_string()));
    obj.insert("types".to_string(), Value::Array(types));
    if let Some(year) = data.established {
        obj.insert("established".to_string(), Value::Number(year.into()));
    }
    Value::Object(obj)
}
pub fn write_json(data: &Data) -> Result<Vec<u8>> {
serde_json::to_vec_pretty(&convert_json(data)).map_err(|e| Error::Serialize(e.to_string()))
}
pub fn write_json_all(list: &[Data]) -> Result<Vec<u8>> {
let values: Vec<serde_json::Value> = list.iter().map(convert_json).collect();
serde_json::to_vec_pretty(&values).map_err(|e| Error::Serialize(e.to_string()))
}
pub fn write_commonmeta_org(data: &Data) -> Result<Vec<u8>> {
use serde_json::{Map, Value};
let mut obj = Map::new();
obj.insert("id".to_string(), Value::String(data.id.clone()));
let display_name = if !data.name.is_empty() { &data.name } else { &data.title };
if !display_name.is_empty() {
obj.insert("name".to_string(), Value::String(display_name.clone()));
}
if !data.acronym.is_empty() {
obj.insert("acronym".to_string(), Value::String(data.acronym.clone()));
}
if !data.additional_names.is_empty() {
obj.insert("additional_names".to_string(), serde_json::json!(data.additional_names));
}
if !data.types.is_empty() {
obj.insert("types".to_string(), serde_json::json!(data.types));
}
if !data.status.is_empty() {
obj.insert("status".to_string(), Value::String(data.status.clone()));
}
if let Some(year) = data.established {
obj.insert("established".to_string(), Value::Number(year.into()));
}
if !data.date_updated.is_empty() {
obj.insert("date_updated".to_string(), Value::String(data.date_updated.clone()));
}
if !data.country.is_empty() {
obj.insert("country".to_string(), Value::String(data.country.clone()));
}
let urls_arr: Vec<Value> = if !data.urls.is_empty() {
data.urls.iter().map(|u| {
let mut m = Map::new();
if !u.name.is_empty() { m.insert("name".to_string(), Value::String(u.name.clone())); }
m.insert("url".to_string(), Value::String(u.url.clone()));
Value::Object(m)
}).collect()
} else if !data.url.is_empty() {
vec![serde_json::json!({"url": data.url})]
} else {
vec![]
};
if !urls_arr.is_empty() {
obj.insert("urls".to_string(), Value::Array(urls_arr));
}
const VALID_ID_TYPES: &[&str] = &[
"ARK", "arXiv", "article_id", "Bibcode", "DOI", "FundRef", "GRID", "Handle",
"ISBN", "ISNI", "ISSN", "OpenAlex", "PMID", "PMCID", "PURL", "RAiD",
"ResearcherID", "ROR", "ScopusID", "SWHID", "URL", "URN", "UUID", "GUID",
"Wikidata", "Other",
];
let ids: Vec<Value> = data
.identifiers
.iter()
.filter(|id| !id.identifier.is_empty())
.filter_map(|id| {
let id_type = match id.identifier_type.as_str() {
"Crossref Funder ID" => "FundRef",
t if VALID_ID_TYPES.contains(&t) => t,
_ => "Other",
};
let mut m = Map::new();
m.insert("identifier".to_string(), Value::String(id.identifier.clone()));
m.insert("identifier_type".to_string(), Value::String(id_type.to_string()));
if id_type == "Other" && !id.scheme.is_empty() {
m.insert("scheme".to_string(), Value::String(id.scheme.clone()));
}
Some(Value::Object(m))
})
.collect();
if !ids.is_empty() {
obj.insert("identifiers".to_string(), Value::Array(ids));
}
let rels: Vec<Value> = data
.relations
.iter()
.filter(|r| !r.id.is_empty())
.map(|r| {
let mut m = Map::new();
m.insert("id".to_string(), Value::String(r.id.clone()));
m.insert("type".to_string(), Value::String(r.type_.clone()));
Value::Object(m)
})
.collect();
if !rels.is_empty() {
obj.insert("relations".to_string(), Value::Array(rels));
}
let arr = Value::Array(vec![Value::Object(obj)]);
let bytes = serde_json::to_vec_pretty(&arr).map_err(|e| Error::Serialize(e.to_string()))?;
crate::schema_utils::json_schema_errors(&bytes, Some("commonmeta"))?;
Ok(bytes)
}
/// Flat, all-string representation of a ROR record, produced by
/// `convert_ror_csv` and written by the CSV and Parquet writers.
///
/// Multi-valued fields are joined into a single string (see
/// `convert_ror_csv` for the separators). Location columns describe the
/// first location only.
// NOTE(review): field order presumably determines the column order of the
// CSV/Parquet output — do not reorder without checking consumers.
#[derive(Debug, Default, Clone, Serialize, Deserialize, parquet_derive::ParquetRecordWriter)]
pub struct RorCsv {
/// ROR identifier.
pub id: String,
/// Display name (`ror_display`).
pub name: String,
/// Organization types joined with `"; "`.
pub types: String,
/// Website link value.
pub links: String,
/// Record status.
pub status: String,
/// Alias names joined with `"; "`.
pub aliases: String,
/// Label names joined with `"; "`, each prefixed with `lang: ` when a language is set.
pub labels: String,
/// Acronyms joined with `"; "`.
pub acronyms: String,
/// Wikipedia link value.
pub wikipedia_url: String,
/// Year of establishment; empty when unknown or zero.
pub established: String,
/// Latitude of the first location, six decimal places.
pub latitude: String,
/// Longitude of the first location, six decimal places.
pub longitude: String,
/// Place name of the first location.
pub place: String,
/// GeoNames id of the first location.
pub geonames_id: String,
/// Subdivision name of the first location.
pub country_subdivision_name: String,
/// Subdivision code of the first location.
pub country_subdivision_code: String,
/// Country code of the first location.
pub country_code: String,
/// Country name of the first location.
pub country_name: String,
/// Preferred GRID id.
pub external_ids_grid_preferred: String,
/// All GRID ids joined with `";"`.
pub external_ids_grid_all: String,
/// Preferred ISNI id.
pub external_ids_isni_preferred: String,
/// All ISNI ids joined with `";"`.
pub external_ids_isni_all: String,
/// Preferred FundRef id.
pub external_ids_fundref_preferred: String,
/// All FundRef ids joined with `";"`.
pub external_ids_fundref_all: String,
/// Preferred Wikidata id.
pub external_ids_wikidata_preferred: String,
/// All Wikidata ids joined with `";"`.
pub external_ids_wikidata_all: String,
/// Related organization ids grouped as `Child: …; Parent: …; Related: …`.
pub relationships: String,
}
/// Flattens a ROR record into a [`RorCsv`] row.
///
/// * Each name is assigned to exactly one bucket, checked in the order
///   display name, acronym (non-empty only), alias, label.
/// * Consecutive duplicate types are collapsed.
/// * When several `website`/`wikipedia` links or external ids of the same
///   type exist, the last one wins.
/// * Location columns come from the first location only.
pub fn convert_ror_csv(ror: &Ror) -> RorCsv {
    let mut out = RorCsv {
        id: ror.id.clone(),
        status: ror.status.clone(),
        ..Default::default()
    };

    // Names
    let mut acronyms: Vec<String> = Vec::new();
    let mut aliases: Vec<String> = Vec::new();
    let mut labels: Vec<String> = Vec::new();
    for entry in &ror.names {
        let is = |wanted: &str| entry.types.iter().any(|t| t == wanted);
        if is("ror_display") {
            out.name = entry.value.clone();
        } else if is("acronym") && !entry.value.is_empty() {
            acronyms.push(entry.value.clone());
        } else if is("alias") {
            aliases.push(entry.value.clone());
        } else if is("label") {
            labels.push(if entry.lang.is_empty() {
                entry.value.clone()
            } else {
                format!("{}: {}", entry.lang, entry.value)
            });
        }
    }
    out.aliases = aliases.join("; ");
    out.labels = labels.join("; ");
    out.acronyms = acronyms.join("; ");

    // Types, with runs of identical neighbours collapsed
    let mut compacted_types: Vec<&str> = ror.types.iter().map(String::as_str).collect();
    compacted_types.dedup();
    out.types = compacted_types.join("; ");

    // Links
    for link in &ror.links {
        match link.type_.as_str() {
            "website" => out.links = link.value.clone(),
            "wikipedia" => out.wikipedia_url = link.value.clone(),
            _ => {}
        }
    }

    // Year of establishment (zero counts as unknown)
    if let Some(year) = ror.established.filter(|year| *year != 0) {
        out.established = year.to_string();
    }

    // Primary location
    if let Some(primary) = ror.locations.first() {
        let details = &primary.geonames_details;
        out.latitude = format!("{:.6}", details.lat);
        out.longitude = format!("{:.6}", details.lng);
        out.place = details.name.clone();
        out.geonames_id = primary.geonames_id.to_string();
        out.country_subdivision_name = details.country_subdivision_name.clone();
        out.country_subdivision_code = details.country_subdivision_code.clone();
        out.country_code = details.country_code.clone();
        out.country_name = details.country_name.clone();
    }

    // External ids
    for external in &ror.external_ids {
        let slots = match external.type_.to_lowercase().as_str() {
            "grid" => Some((
                &mut out.external_ids_grid_preferred,
                &mut out.external_ids_grid_all,
            )),
            "isni" => Some((
                &mut out.external_ids_isni_preferred,
                &mut out.external_ids_isni_all,
            )),
            "fundref" => Some((
                &mut out.external_ids_fundref_preferred,
                &mut out.external_ids_fundref_all,
            )),
            "wikidata" => Some((
                &mut out.external_ids_wikidata_preferred,
                &mut out.external_ids_wikidata_all,
            )),
            _ => None,
        };
        if let Some((preferred, all)) = slots {
            *preferred = external.preferred.clone();
            *all = external.all.join(";");
        }
    }

    // Relationships, grouped by type in the fixed order child/parent/related
    let mut children: Vec<String> = Vec::new();
    let mut parents: Vec<String> = Vec::new();
    let mut related: Vec<String> = Vec::new();
    for relationship in &ror.relationships {
        let bucket = match relationship.type_.as_str() {
            "child" => &mut children,
            "parent" => &mut parents,
            "related" => &mut related,
            _ => continue,
        };
        bucket.push(relationship.id.clone());
    }
    let mut groups: Vec<String> = Vec::new();
    for (heading, ids) in [("Child", &children), ("Parent", &parents), ("Related", &related)].iter() {
        if !ids.is_empty() {
            groups.push(format!("{}: {}", heading, ids.join(", ")));
        }
    }
    out.relationships = groups.join("; ");

    out
}
pub fn write_csv(list: &[Ror]) -> Result<Vec<u8>> {
let mut writer = csv::Writer::from_writer(Vec::new());
for ror in list {
writer
.serialize(convert_ror_csv(ror))
.map_err(|e| Error::Serialize(e.to_string()))?;
}
writer
.into_inner()
.map_err(|e| Error::Serialize(e.to_string()))
}
pub fn write_parquet(list: &[Ror]) -> Result<Vec<u8>> {
use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;
use parquet::record::RecordWriter;
let rows: Vec<RorCsv> = list.iter().map(convert_ror_csv).collect();
let schema = rows
.as_slice()
.schema()
.map_err(|e| Error::Serialize(e.to_string()))?;
let props = std::sync::Arc::new(WriterProperties::builder().build());
let buffer: Vec<u8> = Vec::new();
let mut writer = SerializedFileWriter::new(buffer, schema, props)
.map_err(|e| Error::Serialize(e.to_string()))?;
let mut row_group = writer
.next_row_group()
.map_err(|e| Error::Serialize(e.to_string()))?;
rows.as_slice()
.write_to_row_group(&mut row_group)
.map_err(|e| Error::Serialize(e.to_string()))?;
row_group
.close()
.map_err(|e| Error::Serialize(e.to_string()))?;
writer
.into_inner()
.map_err(|e| Error::Serialize(e.to_string()))
}
/// Schema for the local ROR SQLite database, executed as one batch by
/// `write_sqlite`.
///
/// Creates (if missing) a key/value `settings` table and the
/// `organizations` table, plus indexes on `status` and `date_updated`.
/// `metadata` is a BLOB column; `serialize_ror_to_row` fills it with the
/// zstd-compressed JSON of the full record.
const ROR_SQLITE_DDL: &str = r#"PRAGMA synchronous=NORMAL;
CREATE TABLE IF NOT EXISTS settings (
"key" TEXT PRIMARY KEY NOT NULL,
"value" TEXT NOT NULL DEFAULT ''
);
CREATE TABLE IF NOT EXISTS organizations (
"id" TEXT PRIMARY KEY NOT NULL,
"name" TEXT NOT NULL DEFAULT '',
"status" TEXT NOT NULL DEFAULT 'active',
"types" TEXT NOT NULL DEFAULT '[]',
"locations" TEXT NOT NULL DEFAULT '[]',
"names" TEXT NOT NULL DEFAULT '[]',
"external_ids" TEXT NOT NULL DEFAULT '[]',
"date_updated" TEXT NOT NULL DEFAULT '',
"names_flat" TEXT NOT NULL DEFAULT '',
"country_code" TEXT NOT NULL DEFAULT '',
"metadata" BLOB NOT NULL DEFAULT x''
);
CREATE INDEX IF NOT EXISTS organizations_status ON organizations("status");
CREATE INDEX IF NOT EXISTS organizations_date_updated ON organizations("date_updated");"#;
/// DDL for the FTS5 full-text index over `organizations.name` and
/// `organizations.names_flat`.
///
/// Declared as an external-content table on `organizations` (keyed by
/// `rowid`), so callers issue a `'rebuild'` command after creating it.
/// There is no `IF NOT EXISTS`: callers drop the table first.
const ROR_SQLITE_FTS5_DDL: &str =
"CREATE VIRTUAL TABLE organizations_fts USING fts5(\
name, names_flat, \
content=\"organizations\", \
content_rowid=\"rowid\", \
tokenize=\"unicode61 remove_diacritics 1\"\
)";
/// Upsert statement for one `organizations` row.
///
/// Parameter order (`?1`..`?11`): id, name, status, types, locations, names,
/// external_ids, date_updated, names_flat, country_code, metadata.
const ROR_SQLITE_INSERT: &str = r#"INSERT OR REPLACE INTO organizations (
"id", "name", "status", "types", "locations", "names", "external_ids",
"date_updated", "names_flat", "country_code", "metadata"
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)"#;
/// One pre-serialized row of the `organizations` SQLite table, produced by
/// `serialize_ror_to_row`.
struct RorRow {
/// ROR identifier (primary key).
id: String,
/// Display name.
name: String,
/// All name values joined with single spaces, for full-text search.
names_flat: String,
/// Record status.
status: String,
/// JSON text of the record's `types`.
types: String,
/// JSON text of the record's `locations`.
locations: String,
/// JSON text of the record's `names`.
names: String,
/// JSON text of the record's `external_ids`.
external_ids: String,
/// Last-modified date; empty when the record has no admin block.
date_updated: String,
/// Country code of the first location; empty when there is none.
country_code: String,
/// Full record as JSON, zstd-compressed.
metadata: Vec<u8>,
}
fn serialize_ror_to_row(ror: &Ror) -> RorRow {
let name = get_display_name(ror);
let names_flat = ror
.names
.iter()
.map(|n| n.value.as_str())
.collect::<Vec<_>>()
.join(" ");
let types = serde_json::to_string(&ror.types).unwrap_or_default();
let locations = serde_json::to_string(&ror.locations).unwrap_or_default();
let names = serde_json::to_string(&ror.names).unwrap_or_default();
let external_ids = serde_json::to_string(&ror.external_ids).unwrap_or_default();
let date_updated = ror
.admin
.as_ref()
.map(|a| a.last_modified.date.clone())
.unwrap_or_default();
let country_code = ror
.locations
.first()
.map(|l| l.geonames_details.country_code.clone())
.unwrap_or_default();
let json = serde_json::to_string(ror).unwrap_or_default();
let metadata =
zstd::encode_all(json.as_bytes(), 0).unwrap_or_else(|_| json.into_bytes());
RorRow {
id: ror.id.clone(),
name,
names_flat,
status: ror.status.clone(),
types,
locations,
names,
external_ids,
date_updated,
country_code,
metadata,
}
}
/// Writes all records into a SQLite database at `path`, replacing any
/// organizations already stored there, and rebuilds the full-text index.
///
/// Steps, in order: create the parent directory if needed, open the
/// database in WAL mode, ensure the schema exists, clear previous data,
/// insert every record (plus the optional `ror_version` / `ror_date`
/// settings) in one transaction, then recreate and rebuild the FTS5 index.
///
/// * `version` / `date` - stored in the `settings` table when provided.
///
/// # Errors
///
/// Every failure (directory creation, SQLite calls) is reported as
/// [`Error::Parse`] with a descriptive message.
pub fn write_sqlite(list: &[Ror], path: &Path, version: Option<&str>, date: Option<&str>) -> Result<()> {
use rusqlite::{params, Connection};
// Make sure the directory that will hold the database file exists.
if let Some(parent) = path.parent() {
if !parent.as_os_str().is_empty() && !parent.exists() {
std::fs::create_dir_all(parent).map_err(|e| {
Error::Parse(format!(
"failed to create directory '{}': {}",
parent.display(),
e
))
})?;
}
}
let conn = Connection::open(path)
.map_err(|e| Error::Parse(format!("failed to open sqlite '{}': {}", path.display(), e)))?;
// The journal_mode pragma returns the resulting mode as a row, hence query_row.
let _: String = conn.query_row("PRAGMA journal_mode=WAL", [], |r| r.get(0))
.map_err(|e| Error::Parse(format!("failed to set WAL mode: {}", e)))?;
conn.execute_batch(ROR_SQLITE_DDL)
.map_err(|e| Error::Parse(format!("failed to create organizations table: {}", e)))?;
// Best-effort column additions; the results are deliberately discarded.
// NOTE(review): presumably upgrades databases created before these columns
// existed, failing harmlessly when the column is already present - confirm.
let _ = conn.execute(
"ALTER TABLE organizations ADD COLUMN \"names_flat\" TEXT NOT NULL DEFAULT ''",
[],
);
let _ = conn.execute(
"ALTER TABLE organizations ADD COLUMN \"country_code\" TEXT NOT NULL DEFAULT ''",
[],
);
// Remove previous version markers and the old FTS table (errors ignored),
// then clear all organizations (errors reported).
let _ = conn.execute(
"DELETE FROM settings WHERE key IN ('ror_version', 'ror_date')",
[],
);
let _ = conn.execute("DROP TABLE IF EXISTS organizations_fts", []);
conn.execute("DELETE FROM organizations", [])
.map_err(|e| Error::Parse(format!("failed to clear organizations: {}", e)))?;
// Serialize every record up front so the transaction only does inserts.
let rows: Vec<RorRow> = list.iter().map(serialize_ror_to_row).collect();
let bar = crate::progress::count_bar("writing", rows.len() as u64);
{
let tx = conn.unchecked_transaction()
.map_err(|e| Error::Parse(format!("failed to begin transaction: {}", e)))?;
{
// Inner scope so the prepared statement is dropped before the commit.
let mut stmt = tx.prepare(ROR_SQLITE_INSERT)
.map_err(|e| Error::Parse(format!("failed to prepare insert: {}", e)))?;
for row in &rows {
// Parameter order must match ROR_SQLITE_INSERT.
stmt.execute(params![
row.id,
row.name,
row.status,
row.types,
row.locations,
row.names,
row.external_ids,
row.date_updated,
row.names_flat,
row.country_code,
row.metadata,
])
.map_err(|e| Error::Parse(format!("failed to insert organization: {}", e)))?;
bar.inc(1);
}
}
// Record the dataset version and date in the same transaction.
if let Some(v) = version {
tx.execute(
"INSERT OR REPLACE INTO settings (key, value) VALUES ('ror_version', ?1)",
params![v],
)
.map_err(|e| Error::Parse(format!("failed to store ror_version: {}", e)))?;
}
if let Some(d) = date {
tx.execute(
"INSERT OR REPLACE INTO settings (key, value) VALUES ('ror_date', ?1)",
params![d],
)
.map_err(|e| Error::Parse(format!("failed to store ror_date: {}", e)))?;
}
tx.commit()
.map_err(|e| Error::Parse(format!("failed to commit transaction: {}", e)))?;
}
bar.finish_and_clear();
// The FTS table was dropped above; recreate it and index the committed rows.
eprintln!("Building FTS index...");
conn.execute_batch(ROR_SQLITE_FTS5_DDL)
.map_err(|e| Error::Parse(format!("failed to create FTS5 table: {}", e)))?;
conn.execute(
"INSERT INTO organizations_fts(organizations_fts) VALUES('rebuild')",
[],
)
.map_err(|e| Error::Parse(format!("failed to rebuild FTS5 index: {}", e)))?;
// Best-effort checkpoint; the result is discarded.
// NOTE(review): this pragma returns a row, so `execute` may report an error
// here even though the statement ran - verify against rusqlite docs.
let _ = conn.execute("PRAGMA wal_checkpoint(PASSIVE)", []);
Ok(())
}
pub fn rebuild_organizations_fts(path: &Path) -> Result<()> {
let conn = rusqlite::Connection::open(path)
.map_err(|e| Error::Parse(format!("open sqlite '{}': {}", path.display(), e)))?;
let _ = conn.execute("DROP TABLE IF EXISTS organizations_fts", []);
conn.execute_batch(ROR_SQLITE_FTS5_DDL)
.map_err(|e| Error::Parse(format!("organizations_fts DDL: {}", e)))?;
conn.execute("INSERT INTO organizations_fts(organizations_fts) VALUES('rebuild')", [])
.map_err(|e| Error::Parse(format!("organizations_fts rebuild: {}", e)))?;
Ok(())
}
pub fn fetch_installed_ror_version(db_path: &Path) -> Result<Option<String>> {
use rusqlite::{Connection, Error as SqliteError};
if !db_path.exists() {
return Ok(None);
}
let conn = Connection::open(db_path)
.map_err(|e| Error::Parse(format!("failed to open sqlite: {}", e)))?;
match conn.query_row(
"SELECT value FROM settings WHERE key = 'ror_version' LIMIT 1",
[],
|row| row.get::<_, String>(0),
) {
Ok(v) => Ok(Some(v)),
Err(SqliteError::QueryReturnedNoRows) => Ok(None),
Err(_) => Ok(None),
}
}
pub fn write_all(list: &[Ror], extension: &str) -> Result<Vec<u8>> {
match extension {
".yaml" => serde_yaml::to_string(list)
.map(|s| s.into_bytes())
.map_err(|e| Error::Serialize(e.to_string())),
".json" => serde_json::to_vec(list).map_err(|e| Error::Serialize(e.to_string())),
".jsonl" => {
let mut out = Vec::new();
for item in list {
serde_json::to_writer(&mut out, item)
.map_err(|e| Error::Serialize(e.to_string()))?;
out.push(b'\n');
}
Ok(out)
}
".csv" => write_csv(list),
".parquet" => write_parquet(list),
".sqlite3" => {
let tmp = std::env::temp_dir()
.join(format!("ror-{}.sqlite3", std::process::id()));
write_sqlite(list, &tmp, None, None)?;
let bytes = std::fs::read(&tmp)
.map_err(|e| Error::Serialize(format!("failed to read temp sqlite: {}", e)))?;
let _ = std::fs::remove_file(&tmp);
Ok(bytes)
}
other => Err(Error::UnsupportedFormat(other.to_string())),
}
}
/// Regex-style character class listing characters treated as special in
/// search strings.
// NOTE(review): not referenced in the visible part of this file;
// `clean_search_string` hard-codes its own list, which additionally
// contains the double quote.
const SPECIAL_CHARS: &str = r"[+\-=|><!()\\\{\}\[\]^~*?:/.,;]";
/// Cleans a free-text affiliation string for use as a search query.
///
/// 1. Every search-special punctuation character is replaced by a space.
/// 2. Every run of exactly five ASCII digits is replaced by a space
///    (presumably to drop postal codes); other digit runs are kept.
/// 3. Whitespace is collapsed and trimmed.
pub fn clean_search_string(s: &str) -> String {
    const SPECIAL: &str = "+-=|><!()\\{}[]^\"~*?:/.,;";

    // Emits a pending digit run, or a blank when it is exactly five long.
    fn flush_digits(run: &mut String, target: &mut String) {
        if run.is_empty() {
            return;
        }
        if run.len() == 5 {
            target.push(' ');
        } else {
            target.push_str(run);
        }
        run.clear();
    }

    let blanked: String = s
        .chars()
        .map(|c| if SPECIAL.contains(c) { ' ' } else { c })
        .collect();

    let mut kept = String::with_capacity(blanked.len());
    let mut digit_run = String::new();
    for c in blanked.chars() {
        if c.is_ascii_digit() {
            digit_run.push(c);
        } else {
            flush_digits(&mut digit_run, &mut kept);
            kept.push(c);
        }
    }
    flush_digits(&mut digit_run, &mut kept);

    let words: Vec<&str> = kept.split_whitespace().collect();
    words.join(" ")
}
/// Maps a country code to the combined region it belongs to.
///
/// Returns `GB-UK`, `CN-HK-TW` or `US-PR` for the member codes of those
/// groups and an empty string for every other input. Matching is
/// case-sensitive.
pub fn to_region(code: &str) -> &'static str {
    if matches!(code, "GB" | "UK") {
        return "GB-UK";
    }
    if matches!(code, "CN" | "HK" | "TW") {
        return "CN-HK-TW";
    }
    if matches!(code, "PR" | "US") {
        return "US-PR";
    }
    ""
}
pub fn get_country_codes(s: &str) -> Vec<String> {
let lower: String = s.to_lowercase();
let alpha_lower: String = lower
.chars()
.map(|c| if c.is_ascii_alphabetic() { c } else { ' ' })
.collect();
let alpha_lower = alpha_lower.split_whitespace().collect::<Vec<_>>().join(" ");
let alpha_tokens: Vec<&str> = alpha_lower.split_whitespace().collect();
let orig_tokens: Vec<String> = s
.split_whitespace()
.map(|t| {
t.chars()
.filter(|c| c.is_ascii_alphabetic())
.collect::<String>()
.to_uppercase()
})
.filter(|t| !t.is_empty())
.collect();
let mut codes: HashSet<String> = HashSet::new();
'outer: for &(code, names) in ROR_COUNTRIES {
for &name in names {
let name_chars: Vec<char> = name.chars().collect();
let name_len = name_chars.len();
let matched = if name_len == 2 && name.chars().all(|c| c.is_ascii_alphabetic()) {
orig_tokens.iter().any(|t| t == &name.to_uppercase())
} else {
let has_special = name.chars().any(|c| !c.is_ascii_alphabetic() && c != ' ');
if has_special {
lower.contains(name)
} else if name.contains(' ') {
alpha_lower.contains(name)
} else {
alpha_tokens.contains(&name)
}
};
if matched {
codes.insert(code.to_string());
continue 'outer;
}
}
}
let mut result: Vec<String> = codes.into_iter().collect();
result.sort();
result
}
pub fn get_countries(s: &str) -> Vec<String> {
let codes = get_country_codes(s);
let mut regions: HashSet<String> = HashSet::new();
for code in &codes {
let region = to_region(code);
if region.is_empty() {
regions.insert(code.clone());
} else {
regions.insert(region.to_string());
}
}
let mut result: Vec<String> = regions.into_iter().collect();
result.sort();
result
}
/// One candidate organization for an affiliation string.
///
/// Deserialized from the ROR affiliation API response, or built locally
/// from a full-text search hit.
#[derive(Debug, Deserialize)]
pub struct AffiliationMatch {
/// The part of the affiliation string this match refers to.
pub substring: String,
/// Match score; local full-text hits are created with `0.0`.
pub score: f64,
/// Matching strategy; local full-text hits use `LOCAL`.
pub matching_type: String,
/// Whether this candidate was selected as the match.
pub chosen: bool,
/// The matched organization in the crate's data model. Not read from the
/// response directly; filled from `organization_raw` after parsing.
#[serde(skip)]
pub organization: Data,
/// Raw ROR record read from the response's `organization` key.
#[serde(rename = "organization")]
organization_raw: Ror,
}
/// Envelope of the ROR affiliation API response parsed by `match_affiliation`.
#[derive(Debug, Deserialize)]
struct AffiliationResponse {
/// Number of results reported by the response; `0` when missing.
#[serde(default)]
number_of_results: i32,
/// The candidate matches; empty when missing.
#[serde(default)]
items: Vec<AffiliationMatch>,
}
pub fn match_affiliation(affiliation: &str) -> Result<Vec<AffiliationMatch>> {
let cleaned = clean_search_string(affiliation);
if cleaned.is_empty() {
return Ok(vec![]);
}
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let encoded: String = url::form_urlencoded::byte_serialize(cleaned.as_bytes()).collect();
let api_url = format!(
"https://api.ror.org/v2/organizations?affiliation={}",
encoded
);
let text = client
.get(&api_url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
let mut resp: AffiliationResponse =
serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
for item in &mut resp.items {
let ror = std::mem::take(&mut item.organization_raw);
item.organization = from_ror(ror);
}
Ok(resp.items)
}
/// Normalizes an affiliation string before matching.
///
/// * `&` becomes ` and `.
/// * A leading `the ` (any ASCII case) is removed.
/// * Common abbreviations (`univ`, `lab`/`labs`, `inst`, `tech`, `dept`,
///   `div`, `natl`, `intl`) are expanded; the comparison ignores case and
///   any non-alphabetic characters of the word, which are dropped from the
///   expanded form.
/// * Whitespace is collapsed to single spaces.
fn normalize_affiliation(s: &str) -> String {
    let joined = s.replace(" & ", " and ").replace('&', " and ");

    let has_article = joined
        .get(..4)
        .map_or(false, |head| head.eq_ignore_ascii_case("the "));
    let body: &str = if has_article { &joined[4..] } else { &joined };

    let mut words: Vec<&str> = Vec::new();
    for word in body.split_whitespace() {
        let letters = word
            .chars()
            .filter(|c| c.is_alphabetic())
            .collect::<String>()
            .to_lowercase();
        let expanded = match letters.as_str() {
            "univ" => "university",
            "lab" | "labs" => "laboratory",
            "inst" => "institute",
            "tech" => "technology",
            "dept" => "department",
            "div" => "division",
            "natl" => "national",
            "intl" => "international",
            _ => word,
        };
        words.push(expanded);
    }
    words.join(" ")
}
/// Splits an affiliation string at commas and semicolons and keeps the
/// trimmed parts that consist of at least two words.
fn split_affiliation(s: &str) -> Vec<&str> {
    let mut parts = Vec::new();
    for piece in s.split(|c: char| matches!(c, ',' | ';')) {
        let piece = piece.trim();
        // A second whitespace-separated word exists <=> word count >= 2.
        if piece.split_whitespace().nth(1).is_some() {
            parts.push(piece);
        }
    }
    parts
}
/// Runs a full-text query against the local ROR SQLite database.
///
/// Only organizations with status `active` are considered, ordered by FTS
/// rank and limited to ten hits. When `country_codes` is non-empty, the
/// query is first restricted to those countries; if that yields nothing, it
/// is repeated without the restriction.
///
/// * `fts_query` - the FTS5 `MATCH` expression.
/// * `substring` - copied into every returned match.
/// * `country_codes` - optional country filter, bound as parameters.
///
/// Every hit is returned with score `0.0`, matching type `LOCAL` and
/// `chosen == false`.
///
/// # Errors
///
/// Returns [`Error::Parse`] when the statement cannot be prepared or run,
/// a column cannot be read, or a stored record cannot be decompressed or
/// deserialized.
fn fts_query_sqlite(
    conn: &rusqlite::Connection,
    fts_query: &str,
    substring: &str,
    country_codes: &[String],
) -> Result<Vec<AffiliationMatch>> {
    let sql_base = "SELECT o.id, o.name, o.country_code, o.metadata \
                    FROM organizations_fts \
                    JOIN organizations AS o ON o.rowid = organizations_fts.rowid \
                    WHERE organizations_fts MATCH ?1 \
                    AND o.status = 'active'";

    let run_query = |sql: &str, params: Vec<String>| -> Result<Vec<AffiliationMatch>> {
        let mut stmt = conn
            .prepare(sql)
            .map_err(|e| Error::Parse(format!("prepare FTS: {}", e)))?;
        let mut rows = stmt
            .query(rusqlite::params_from_iter(params.iter()))
            .map_err(|e| Error::Parse(format!("FTS query: {}", e)))?;
        let mut out = Vec::new();
        while let Some(row) = rows.next().map_err(|e| Error::Parse(e.to_string()))? {
            // Column 0 is the id, column 3 the zstd-compressed JSON record.
            let id: String = row.get(0).map_err(|e| Error::Parse(e.to_string()))?;
            let blob: Vec<u8> = row.get(3).map_err(|e| Error::Parse(e.to_string()))?;
            let decompressed = zstd::decode_all(std::io::Cursor::new(&blob))
                .map_err(|e| Error::Parse(format!("decompress '{}': {}", id, e)))?;
            let ror: Ror = serde_json::from_slice(&decompressed)
                .map_err(|e| Error::Parse(format!("deserialize '{}': {}", id, e)))?;
            out.push(AffiliationMatch {
                substring: substring.to_string(),
                score: 0.0,
                matching_type: "LOCAL".to_string(),
                chosen: false,
                organization: from_ror(ror),
                organization_raw: Ror::default(),
            });
        }
        Ok(out)
    };

    if !country_codes.is_empty() {
        // ?1 is the MATCH expression, so country placeholders start at ?2.
        let placeholders = (0..country_codes.len())
            .map(|i| format!("?{}", i + 2))
            .collect::<Vec<_>>()
            .join(", ");
        let sql = format!(
            "{} AND o.country_code IN ({}) ORDER BY organizations_fts.rank LIMIT 10",
            sql_base, placeholders
        );
        let mut params = Vec::with_capacity(country_codes.len() + 1);
        params.push(fts_query.to_string());
        params.extend(country_codes.iter().cloned());
        let filtered = run_query(&sql, params)?;
        if !filtered.is_empty() {
            return Ok(filtered);
        }
    }

    let sql = format!("{} ORDER BY organizations_fts.rank LIMIT 10", sql_base);
    run_query(&sql, vec![fts_query.to_string()])
}
pub fn match_affiliation_sqlite(affiliation: &str, db_path: &Path) -> Result<Vec<AffiliationMatch>> {
if affiliation.trim().is_empty() {
return Ok(vec![]);
}
let country_codes = get_country_codes(affiliation);
let conn = rusqlite::Connection::open(db_path)
.map_err(|e| Error::Parse(format!("failed to open sqlite: {}", e)))?;
let substrings = split_affiliation(affiliation);
let parts: Vec<&str> = if substrings.is_empty() {
vec![affiliation.trim()]
} else {
substrings
};
let mut seen_ids: std::collections::HashSet<String> = std::collections::HashSet::new();
let mut matches: Vec<AffiliationMatch> = Vec::new();
for part in parts {
let normalized = normalize_affiliation(part);
let cleaned = clean_search_string(&normalized);
let fts_query = cleaned
.split_whitespace()
.map(|w| w.trim_matches(|c: char| !c.is_alphanumeric()))
.filter(|w| !w.is_empty())
.collect::<Vec<_>>()
.join(" ");
if fts_query.is_empty() {
continue;
}
for m in fts_query_sqlite(&conn, &fts_query, part, &country_codes)? {
if seen_ids.insert(m.organization.id.clone()) {
matches.push(m);
}
}
}
let n = matches.len();
for (i, m) in matches.iter_mut().enumerate() {
m.score = if n <= 1 { 1.0 } else { 1.0 - (i as f64 / n as f64) };
m.chosen = i == 0;
}
Ok(matches)
}
pub fn read_json(input: &str) -> Result<Data> {
let ror: Ror = serde_json::from_str(input).map_err(|e| Error::Parse(e.to_string()))?;
Ok(from_ror(ror))
}
pub fn fetch(input: &str) -> Result<Data> {
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let (id, id_type) = validate_id(input);
let ror = if id_type == "ROR" {
let ror_id = validate_ror(&id).unwrap_or(id.clone());
let api_url = format!("https://api.ror.org/v2/organizations/{}", ror_id);
let text = client
.get(&api_url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
serde_json::from_str::<Ror>(&text).map_err(|e| Error::Parse(e.to_string()))?
} else {
let org_types = ["ROR", "Crossref Funder ID", "GRID", "Wikidata", "ISNI"];
if !org_types.contains(&id_type) {
return Err(Error::Parse(format!(
"Not a supported organization identifier: {}",
input
)));
}
let encoded: String = url::form_urlencoded::byte_serialize(id.as_bytes()).collect();
let api_url = format!("https://api.ror.org/v2/organizations?query={}", encoded);
let text = client
.get(&api_url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
let list: RorListResponse =
serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
if list.number_of_results != 1 {
return Err(Error::Parse(format!(
"Expected 1 result from ROR query, got {}",
list.number_of_results
)));
}
list.items
.into_iter()
.next()
.ok_or_else(|| Error::Parse("No items in ROR query response".to_string()))?
};
Ok(from_ror(ror))
}
pub fn fetch_sqlite(id: &str, db_path: &Path) -> Result<Data> {
use rusqlite::{params, Connection};
let conn = Connection::open(db_path)
.map_err(|e| Error::Parse(format!("failed to open sqlite '{}': {}", db_path.display(), e)))?;
let blob: Vec<u8> = conn
.query_row(
"SELECT metadata FROM organizations WHERE id = ?1 LIMIT 1",
params![id],
|row| row.get(0),
)
.map_err(|e| {
if matches!(e, rusqlite::Error::QueryReturnedNoRows) {
Error::Parse(format!("organization '{}' not found in sqlite", id))
} else {
Error::Parse(format!("sqlite query failed: {}", e))
}
})?;
let decompressed = zstd::decode_all(std::io::Cursor::new(&blob))
.map_err(|e| Error::Parse(format!("decompress metadata: {}", e)))?;
let ror: Ror = serde_json::from_slice(&decompressed)
.map_err(|e| Error::Parse(format!("deserialize: {}", e)))?;
Ok(from_ror(ror))
}
/// Resolves an organization identifier to `(ror_id, name)` using a local ROR
/// SQLite database.
///
/// Supported identifier types (as classified by `validate_id`):
/// * `"ROR"` — exact match on `organizations.id` after prefixing the
///   normalized value with `https://ror.org/`;
/// * `"Crossref Funder ID"` / `"ISNI"` — `LIKE` search in the `external_ids`
///   column of active organizations. A 16-character ASCII ISNI is searched in
///   its space-separated `dddd dddd dddd dddd` form.
///
/// Returns `None` for empty or unsupported identifiers, when the database
/// cannot be opened, or when no row matches. This function never returns an
/// error; every failure collapses to `None`.
pub fn lookup_org_sqlite(id: &str, db_path: &Path) -> Option<(String, String)> {
    use crate::utils::validate_id;

    let (normalized, id_type) = validate_id(id);
    if normalized.is_empty() {
        return None;
    }
    let conn = rusqlite::Connection::open(db_path).ok()?;
    let id_and_name = |row: &rusqlite::Row<'_>| -> rusqlite::Result<(String, String)> {
        Ok((row.get(0)?, row.get(1)?))
    };

    match id_type {
        "ROR" => {
            let ror_url = format!("https://ror.org/{}", normalized);
            conn.query_row(
                "SELECT id, name FROM organizations WHERE id = ?1 LIMIT 1",
                rusqlite::params![ror_url],
                id_and_name,
            )
            .ok()
        }
        "Crossref Funder ID" | "ISNI" => {
            let (ext_type, search_value) = if id_type == "Crossref Funder ID" {
                ("FundRef", normalized.clone())
            } else {
                // Group into four blocks of four. The ASCII check guarantees
                // that every byte offset is a char boundary, so a malformed
                // value falls through unchanged instead of panicking.
                let spaced = if normalized.len() == 16 && normalized.is_ascii() {
                    format!(
                        "{} {} {} {}",
                        &normalized[0..4],
                        &normalized[4..8],
                        &normalized[8..12],
                        &normalized[12..16]
                    )
                } else {
                    normalized.clone()
                };
                ("ISNI", spaced)
            };
            // Two independent substring tests on the `external_ids` text: one
            // for the type tag, one for the quoted value.
            let type_pattern = format!("%\"type\":\"{}\"%", ext_type);
            let value_pattern = format!("%\"{}\"%", search_value);
            conn.query_row(
                r#"SELECT id, name FROM organizations
WHERE external_ids LIKE ?1 AND external_ids LIKE ?2
AND status = 'active' LIMIT 1"#,
                rusqlite::params![type_pattern, value_pattern],
                id_and_name,
            )
            .ok()
        }
        _ => None,
    }
}
/// Reads organizations from a local ROR SQLite database as raw `Ror` records.
///
/// All arguments are forwarded unchanged to `fetch_ror_blobs`; each returned
/// blob is then decoded with `parse_ror_blob`.
///
/// # Errors
///
/// Propagates the first error from `fetch_ror_blobs` or `parse_ror_blob`.
pub fn read_sqlite_raw(
    db_path: &Path,
    limit: Option<usize>,
    offset: usize,
    country_code: Option<&str>,
    query: Option<&str>,
) -> Result<Vec<Ror>> {
    let blobs = fetch_ror_blobs(db_path, limit, offset, country_code, query)?;
    let mut records = Vec::with_capacity(blobs.len());
    for blob in &blobs {
        records.push(parse_ror_blob(blob)?);
    }
    Ok(records)
}
/// Reads organizations from a local ROR SQLite database and converts each one
/// to the common data model.
///
/// Thin wrapper around `read_sqlite_raw` followed by `from_ror`.
///
/// # Errors
///
/// Propagates any error from `read_sqlite_raw`.
pub fn read_sqlite(
    db_path: &Path,
    limit: Option<usize>,
    offset: usize,
    country_code: Option<&str>,
    query: Option<&str>,
) -> Result<Vec<Data>> {
    let records = read_sqlite_raw(db_path, limit, offset, country_code, query)?;
    Ok(records.into_iter().map(from_ror).collect())
}
fn fetch_ror_blobs(
db_path: &Path,
limit: Option<usize>,
offset: usize,
country_code: Option<&str>,
query: Option<&str>,
) -> Result<Vec<Vec<u8>>> {
use rusqlite::Connection;
let conn = Connection::open(db_path)
.map_err(|e| Error::Parse(format!("failed to open sqlite '{}': {}", db_path.display(), e)))?;
if let Some(q) = query {
let fts_query = clean_search_string(q)
.split_whitespace()
.map(|w| w.trim_matches(|c: char| !c.is_alphanumeric()))
.filter(|w| !w.is_empty())
.collect::<Vec<_>>()
.join(" ");
if fts_query.is_empty() {
return Ok(vec![]);
}
let (sql, params_vec) = if let Some(cc) = country_code {
(
"SELECT o.metadata FROM organizations_fts \
JOIN organizations AS o ON o.rowid = organizations_fts.rowid \
WHERE organizations_fts MATCH ?1 AND o.status = 'active' \
AND o.country_code = ?2 \
ORDER BY organizations_fts.rank LIMIT ?3 OFFSET ?4"
.to_string(),
vec![fts_query, cc.to_string(), limit.map(|n| n.to_string()).unwrap_or("-1".to_string()), offset.to_string()],
)
} else {
(
"SELECT o.metadata FROM organizations_fts \
JOIN organizations AS o ON o.rowid = organizations_fts.rowid \
WHERE organizations_fts MATCH ?1 AND o.status = 'active' \
ORDER BY organizations_fts.rank LIMIT ?2 OFFSET ?3"
.to_string(),
vec![fts_query, limit.map(|n| n.to_string()).unwrap_or("-1".to_string()), offset.to_string()],
)
};
let mut stmt = conn.prepare(&sql).map_err(|e| Error::Parse(e.to_string()))?;
Ok(stmt.query_map(rusqlite::params_from_iter(params_vec.iter()), |row| row.get(0))
.map_err(|e| Error::Parse(e.to_string()))?
.filter_map(|r| r.ok())
.collect())
} else if let Some(cc) = country_code {
let mut stmt = conn
.prepare(
"SELECT metadata FROM organizations \
WHERE status = 'active' AND country_code = ?1 \
ORDER BY name LIMIT ?2 OFFSET ?3",
)
.map_err(|e| Error::Parse(e.to_string()))?;
Ok(stmt.query_map(
rusqlite::params![cc, limit.map(|n| n as i64).unwrap_or(-1), offset as i64],
|row| row.get(0),
)
.map_err(|e| Error::Parse(e.to_string()))?
.filter_map(|r| r.ok())
.collect())
} else {
let mut stmt = conn
.prepare(
"SELECT metadata FROM organizations \
WHERE status = 'active' \
ORDER BY name LIMIT ?1 OFFSET ?2",
)
.map_err(|e| Error::Parse(e.to_string()))?;
Ok(stmt.query_map(
rusqlite::params![limit.map(|n| n as i64).unwrap_or(-1), offset as i64],
|row| row.get(0),
)
.map_err(|e| Error::Parse(e.to_string()))?
.filter_map(|r| r.ok())
.collect())
}
}
fn parse_ror_blob(blob: &[u8]) -> Result<Ror> {
let decompressed = zstd::decode_all(std::io::Cursor::new(blob))
.map_err(|e| Error::Parse(format!("decompress metadata: {}", e)))?;
serde_json::from_slice(&decompressed)
.map_err(|e| Error::Parse(format!("deserialize: {}", e)))
}
/// Returns a random sample of organizations from a local ROR SQLite database
/// as raw `Ror` records.
///
/// Blobs are obtained from `fetch_ror_sample_blobs` and decoded with
/// `parse_ror_blob`.
///
/// # Errors
///
/// Propagates the first error from either helper.
pub fn sample_sqlite_raw(
    db_path: &Path,
    limit: Option<usize>,
    country_code: Option<&str>,
) -> Result<Vec<Ror>> {
    let blobs = fetch_ror_sample_blobs(db_path, limit, country_code)?;
    let mut records = Vec::with_capacity(blobs.len());
    for blob in &blobs {
        records.push(parse_ror_blob(blob)?);
    }
    Ok(records)
}
/// Returns a random sample of organizations from a local ROR SQLite database,
/// converted to the common data model.
///
/// Thin wrapper around `sample_sqlite_raw` followed by `from_ror`.
///
/// # Errors
///
/// Propagates any error from `sample_sqlite_raw`.
pub fn sample_sqlite(
    db_path: &Path,
    limit: Option<usize>,
    country_code: Option<&str>,
) -> Result<Vec<Data>> {
    let records = sample_sqlite_raw(db_path, limit, country_code)?;
    Ok(records.into_iter().map(from_ror).collect())
}
fn fetch_ror_sample_blobs(
db_path: &Path,
limit: Option<usize>,
country_code: Option<&str>,
) -> Result<Vec<Vec<u8>>> {
use rusqlite::Connection;
let conn = Connection::open(db_path)
.map_err(|e| Error::Parse(format!("failed to open sqlite '{}': {}", db_path.display(), e)))?;
let n = limit.map(|n| n as i64).unwrap_or(-1);
if let Some(cc) = country_code {
let mut stmt = conn
.prepare(
"SELECT metadata FROM organizations \
WHERE status = 'active' AND country_code = ?1 \
ORDER BY RANDOM() LIMIT ?2",
)
.map_err(|e| Error::Parse(e.to_string()))?;
Ok(stmt.query_map(rusqlite::params![cc, n], |row| row.get(0))
.map_err(|e| Error::Parse(e.to_string()))?
.filter_map(|r| r.ok())
.collect())
} else {
let mut stmt = conn
.prepare(
"SELECT metadata FROM organizations \
WHERE status = 'active' \
ORDER BY RANDOM() LIMIT ?1",
)
.map_err(|e| Error::Parse(e.to_string()))?;
Ok(stmt.query_map(rusqlite::params![n], |row| row.get(0))
.map_err(|e| Error::Parse(e.to_string()))?
.filter_map(|r| r.ok())
.collect())
}
}
pub fn fetch_raw_sqlite(id: &str, db_path: &Path) -> Result<Ror> {
use rusqlite::{params, Connection};
let conn = Connection::open(db_path)
.map_err(|e| Error::Parse(format!("failed to open sqlite: {}", e)))?;
let blob: Vec<u8> = conn.query_row(
"SELECT metadata FROM organizations WHERE id = ?1 LIMIT 1",
params![id],
|row| row.get(0),
).map_err(|e| {
if matches!(e, rusqlite::Error::QueryReturnedNoRows) {
Error::Parse(format!("organization '{}' not found in sqlite", id))
} else {
Error::Parse(format!("sqlite query failed: {}", e))
}
})?;
let decompressed = zstd::decode_all(std::io::Cursor::new(&blob))
.map_err(|e| Error::Parse(format!("decompress metadata: {}", e)))?;
serde_json::from_slice(&decompressed).map_err(|e| Error::Parse(format!("deserialize: {}", e)))
}
pub fn fetch_raw(input: &str) -> Result<Ror> {
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let (id, id_type) = validate_id(input);
let ror = if id_type == "ROR" {
let ror_id = validate_ror(&id).unwrap_or(id.clone());
let api_url = format!("https://api.ror.org/v2/organizations/{}", ror_id);
let text = client
.get(&api_url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
serde_json::from_str::<Ror>(&text).map_err(|e| Error::Parse(e.to_string()))?
} else {
let org_types = ["ROR", "Crossref Funder ID", "GRID", "Wikidata", "ISNI"];
if !org_types.contains(&id_type) {
return Err(Error::Parse(format!(
"Not a supported organization identifier: {}",
input
)));
}
let encoded: String = url::form_urlencoded::byte_serialize(id.as_bytes()).collect();
let api_url = format!("https://api.ror.org/v2/organizations?query={}", encoded);
let text = client
.get(&api_url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
let list: RorListResponse =
serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
if list.number_of_results != 1 {
return Err(Error::Parse(format!(
"Expected 1 result from ROR query, got {}",
list.number_of_results
)));
}
list.items
.into_iter()
.next()
.ok_or_else(|| Error::Parse("No items in ROR query response".to_string()))?
};
Ok(ror)
}
/// Fills in missing `geonames_details` fields of every location in `ror`
/// using a local GeoNames database.
///
/// Only empty fields are touched (empty strings, or `0.0` for `lat`/`lng`);
/// values already present on the record are never overwritten. Locations with
/// a `geonames_id` of `0`, and locations whose GeoNames lookup fails, are left
/// unchanged. Lookup errors are ignored: enrichment is best-effort.
pub fn enrich_locations(ror: &mut Ror, geonames_db: &Path) {
    use crate::geonames as geo;

    for loc in &mut ror.locations {
        if loc.geonames_id == 0 {
            continue;
        }
        let Ok(g) = geo::fetch_geoname_raw(loc.geonames_id, geonames_db) else {
            continue;
        };
        let details = &mut loc.geonames_details;

        if details.name.is_empty() {
            details.name = g.name.clone();
        }
        if details.lat == 0.0 {
            details.lat = g.latitude;
        }
        if details.lng == 0.0 {
            details.lng = g.longitude;
        }
        if details.country_code.is_empty() {
            details.country_code = g.country_code.clone();
        }
        if details.country_subdivision_code.is_empty() {
            details.country_subdivision_code = g.admin1_code.clone();
        }

        // The country lookup supplies three fields, so it must run when any
        // of them is missing. In particular an empty `continent_name` has to
        // trigger it even when the other two are already present.
        let needs_country_info = details.country_name.is_empty()
            || details.continent_code.is_empty()
            || details.continent_name.is_empty();
        if needs_country_info {
            if let Ok(Some(ci)) = geo::lookup_country(&g.country_code, geonames_db) {
                if details.country_name.is_empty() {
                    details.country_name = ci.0;
                }
                if details.continent_code.is_empty() {
                    details.continent_code = ci.1;
                }
                if details.continent_name.is_empty() {
                    details.continent_name = ci.2;
                }
            }
        }

        if details.country_subdivision_name.is_empty() && !g.admin1_code.is_empty() {
            if let Ok(Some(name)) =
                geo::lookup_admin1(&g.country_code, &g.admin1_code, geonames_db)
            {
                details.country_subdivision_name = name;
            }
        }
    }
}
pub fn write_v2_json(ror: &Ror) -> Result<Vec<u8>> {
let mut val = serde_json::to_value(ror).map_err(|e| Error::Serialize(e.to_string()))?;
if let Some(types) = val.get_mut("types").and_then(|v| v.as_array_mut()) {
types.sort_by(|a, b| {
a.as_str().unwrap_or("").cmp(b.as_str().unwrap_or(""))
});
}
serde_json::to_vec_pretty(&val).map_err(|e| Error::Serialize(e.to_string()))
}
/// Zenodo record identifier that `fetch_latest_ror_release` interpolates into
/// `https://zenodo.org/api/records/{id}/versions/latest`.
/// NOTE(review): presumably the "concept" (all-versions) record of the ROR
/// data dump, as the name suggests — confirm against Zenodo.
const ROR_ZENODO_CONCEPT_ID: &str = "6347574";
/// Description of one ROR data-dump release, as assembled by
/// `fetch_latest_ror_release` from a Zenodo record.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RorRelease {
/// Version string copied from the Zenodo record's `metadata.version`.
pub version: String,
/// Publication date copied from the record's `metadata.publication_date`.
pub date: String,
/// Numeric Zenodo record id, rendered as a string.
pub zenodo_id: String,
/// Key (file name) of the first `.zip` file attached to the record.
pub filename: String,
/// The `self` link of that file; used as the download URL.
pub download_url: String,
}
/// Subset of a Zenodo record response that `fetch_latest_ror_release` reads.
#[derive(Deserialize)]
struct ZenodoRecord {
// Numeric record id; becomes `RorRelease::zenodo_id`.
id: u64,
// Version and publication date of the record.
metadata: ZenodoMeta,
// Files attached to the record; searched for the first `.zip` entry.
files: Vec<ZenodoFile>,
}
/// `metadata` object of a Zenodo record; only the fields used here.
#[derive(Deserialize)]
struct ZenodoMeta {
// Becomes `RorRelease::version`.
version: String,
// Becomes `RorRelease::date`.
publication_date: String,
}
/// One entry of a Zenodo record's `files` array.
#[derive(Deserialize)]
struct ZenodoFile {
// File key; checked for a `.zip` suffix and becomes `RorRelease::filename`.
key: String,
// Link collection of the file; its `self` link is used for downloading.
links: ZenodoFileLinks,
}
/// `links` object of a Zenodo file entry.
#[derive(Deserialize)]
struct ZenodoFileLinks {
// JSON key is `self`, a Rust keyword, hence the rename to `self_`.
#[serde(rename = "self")]
self_: String,
}
pub fn fetch_latest_ror_release() -> Result<RorRelease> {
let url = format!(
"https://zenodo.org/api/records/{}/versions/latest",
ROR_ZENODO_CONCEPT_ID
);
let client = reqwest::blocking::Client::builder()
.user_agent(format!(
"commonmeta-rs/{} (https://github.com/front-matter/commonmeta-rs; mailto:info@front-matter.de)",
env!("CARGO_PKG_VERSION")
))
.build()
.map_err(|e| Error::Http(e.to_string()))?;
let text = client
.get(&url)
.send()
.map_err(|e| Error::Http(e.to_string()))?
.error_for_status()
.map_err(|e| Error::Http(e.to_string()))?
.text()
.map_err(|e| Error::Http(e.to_string()))?;
let record: ZenodoRecord =
serde_json::from_str(&text).map_err(|e| Error::Parse(e.to_string()))?;
let zip = record
.files
.into_iter()
.find(|f| f.key.ends_with(".zip"))
.ok_or_else(|| Error::Parse("no zip file found in Zenodo release".into()))?;
Ok(RorRelease {
version: record.metadata.version,
date: record.metadata.publication_date,
zenodo_id: record.id.to_string(),
filename: zip.key,
download_url: zip.links.self_,
})
}
pub fn download_release(release: &RorRelease) -> Result<(Vec<Ror>, bool)> {
let ttl = std::time::Duration::from_secs(30 * 24 * 60 * 60);
let (zip_path, from_cache) = crate::io_utils::ensure_cached_path(
&release.download_url,
"ror",
&release.filename,
ttl,
)
.map_err(|e| Error::Http(e.to_string()))?;
let zip_bytes = std::fs::read(&zip_path)
.map_err(|e| Error::Http(format!("reading cached zip: {}", e)))?;
let json_bytes = crate::io_utils::unzip_first_json(&zip_bytes)
.map_err(|e| Error::Parse(e.to_string()))?;
let list: Vec<Ror> = serde_json::from_slice(&json_bytes)
.map_err(|e| Error::Parse(format!("parsing ROR JSON: {}", e)))?;
Ok((list, from_cache))
}
/// Looks up the latest ROR release and downloads (or loads from cache) all of
/// its records.
///
/// Returns the release description, the parsed records and the `from_cache`
/// flag reported by `download_release`.
///
/// # Errors
///
/// Propagates any error from `fetch_latest_ror_release` or `download_release`.
pub fn download_all() -> Result<(RorRelease, Vec<Ror>, bool)> {
    fetch_latest_ror_release().and_then(|release| {
        let (records, from_cache) = download_release(&release)?;
        Ok((release, records, from_cache))
    })
}
/// Unit tests for the ROR reader/writer and the affiliation-matching helpers.
///
/// Everything except `test_fetch_latest_ror_release` runs offline against the
/// embedded `ROR_ORG` fixture.
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal ROR v2 record used as the shared fixture. The JSON text is kept
    /// verbatim.
    const ROR_ORG: &str = r#"{
"id": "https://ror.org/02nr0ka47",
"established": 2013,
"external_ids": [
{
"type": "Wikidata",
"all": ["Q19341888"],
"preferred": ""
},
{
"type": "GRID",
"all": ["grid.465570.2"],
"preferred": "grid.465570.2"
},
{
"type": "FundRef",
"all": ["100012611"],
"preferred": "100012611"
}
],
"links": [
{"type": "website", "value": "https://impactstory.org"}
],
"locations": [
{
"geonames_id": 4774183,
"geonames_details": {
"country_code": "US",
"country_name": "United States",
"name": "Williamsburg"
}
}
],
"names": [
{"value": "Impactstory", "types": ["ror_display", "label"], "lang": "en"},
{"value": "IS", "types": ["acronym"], "lang": ""}
],
"relationships": [
{"type": "related", "label": "Our Society", "id": "https://ror.org/045gyfv07"}
],
"status": "active",
"types": ["nonprofit"]
}"#;

    #[test]
    fn test_read_ror_basic() {
        let data = read_json(ROR_ORG).unwrap();
        assert_eq!(data.id, "https://ror.org/02nr0ka47");
        assert_eq!(data.type_, "Organization");
        assert_eq!(data.provider, "ROR");
        assert_eq!(data.url, "https://impactstory.org");
    }

    #[test]
    fn test_ror_display_name() {
        let data = read_json(ROR_ORG).unwrap();
        assert_eq!(data.title, "Impactstory");
    }

    #[test]
    fn test_ror_date() {
        let data = read_json(ROR_ORG).unwrap();
        assert_eq!(data.established, Some(2013));
    }

    #[test]
    fn test_ror_identifiers() {
        let data = read_json(ROR_ORG).unwrap();
        let wikidata = data
            .identifiers
            .iter()
            .find(|i| i.identifier_type == "Wikidata")
            .unwrap();
        assert_eq!(wikidata.identifier, "Q19341888");
        let grid = data
            .identifiers
            .iter()
            .find(|i| i.identifier_type == "GRID")
            .unwrap();
        assert_eq!(grid.identifier, "grid.465570.2");
        // ROR's "FundRef" external id is exposed as "Crossref Funder ID".
        let fundref = data
            .identifiers
            .iter()
            .find(|i| i.identifier_type == "Crossref Funder ID")
            .unwrap();
        assert_eq!(fundref.identifier, "100012611");
    }

    #[test]
    fn test_ror_relations() {
        let data = read_json(ROR_ORG).unwrap();
        assert_eq!(data.relations.len(), 1);
        assert_eq!(data.relations[0].id, "https://ror.org/045gyfv07");
        assert_eq!(data.relations[0].type_, "IsRelatedTo");
    }

    #[test]
    fn test_ror_no_established() {
        let json = r#"{
"id": "https://ror.org/01234abc",
"names": [{"value": "Test Org", "types": ["ror_display"]}],
"links": [],
"external_ids": [],
"relationships": [],
"status": "active",
"types": ["education"]
}"#;
        let data = read_json(json).unwrap();
        assert_eq!(data.established, None);
    }

    #[test]
    fn test_get_display_name() {
        let ror: Ror = serde_json::from_str(ROR_ORG).unwrap();
        assert_eq!(get_display_name(&ror), "Impactstory");
    }

    #[test]
    fn test_ror_no_display_name() {
        let json = r#"{
"id": "https://ror.org/abc123",
"names": [{"value": "Other Name", "types": ["label"]}],
"links": [],
"external_ids": [],
"relationships": [],
"status": "active",
"types": []
}"#;
        let data = read_json(json).unwrap();
        assert!(data.title.is_empty());
    }

    #[test]
    fn test_write_inveniordm_yaml() {
        let data = read_json(ROR_ORG).unwrap();
        let bytes = write(&data).unwrap();
        let yaml = String::from_utf8(bytes).unwrap();
        assert!(
            yaml.contains("id: 02nr0ka47"),
            "expected bare ROR id, got:\n{yaml}"
        );
        assert!(yaml.contains("name: Impactstory"));
        assert!(yaml.contains("scheme: ror"));
        assert!(yaml.contains("scheme: wikidata"));
        assert!(yaml.contains("scheme: grid"));
        assert!(yaml.contains("scheme: fundref"));
        assert!(yaml.contains("en: Impactstory"));
    }

    #[test]
    fn test_write_ror_json() {
        let data = read_json(ROR_ORG).unwrap();
        let bytes = write_json(&data).unwrap();
        let v: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(v["id"].as_str().unwrap(), "https://ror.org/02nr0ka47");
        assert_eq!(v["names"][0]["value"].as_str().unwrap(), "Impactstory");
        assert_eq!(v["links"][0]["type"].as_str().unwrap(), "website");
        assert_eq!(
            v["links"][0]["value"].as_str().unwrap(),
            "https://impactstory.org"
        );
        assert_eq!(v["established"].as_i64().unwrap(), 2013);
        assert_eq!(v["status"].as_str().unwrap(), "active");
        let ext_ids = v["external_ids"].as_array().unwrap();
        assert!(ext_ids.iter().any(|e| e["type"] == "Wikidata"));
        assert!(ext_ids.iter().any(|e| e["type"] == "GRID"));
        assert!(ext_ids.iter().any(|e| e["type"] == "FundRef"));
        let rels = v["relationships"].as_array().unwrap();
        assert_eq!(rels[0]["type"].as_str().unwrap(), "related");
        assert_eq!(rels[0]["id"].as_str().unwrap(), "https://ror.org/045gyfv07");
    }

    #[test]
    fn test_write_invalid_ror_id() {
        use crate::data::Data;
        let data = Data {
            id: "https://doi.org/10.1234/not-a-ror".to_string(),
            ..Data::default()
        };
        assert!(write(&data).is_err());
    }

    #[test]
    fn test_clean_search_string_special_chars() {
        assert_eq!(
            clean_search_string("Dept. of Biology, MIT; USA"),
            "Dept of Biology MIT USA"
        );
    }

    #[test]
    fn test_clean_search_string_postal_code() {
        assert_eq!(
            clean_search_string("Stanford University 94305 CA"),
            "Stanford University CA"
        );
        // A four-digit number is kept.
        assert_eq!(clean_search_string("Lab 2024 report"), "Lab 2024 report");
    }

    #[test]
    fn test_clean_search_string_collapses_whitespace() {
        assert_eq!(
            clean_search_string(" University of Cambridge "),
            "University of Cambridge"
        );
    }

    #[test]
    fn test_clean_search_string_empty() {
        assert_eq!(clean_search_string(""), "");
        assert_eq!(clean_search_string("...;;;"), "");
    }

    #[test]
    fn test_get_country_codes_full_name() {
        let codes = get_country_codes("University of California, United States");
        assert!(
            codes.contains(&"US".to_string()),
            "expected US in {:?}",
            codes
        );
    }

    // NOTE: despite its name this exercises a full country name ("Germany").
    #[test]
    fn test_get_country_codes_abbreviation() {
        let codes = get_country_codes("Max Planck Institute, Germany");
        assert!(
            codes.contains(&"DE".to_string()),
            "expected DE in {:?}",
            codes
        );
    }

    #[test]
    fn test_get_country_codes_uk() {
        let codes = get_country_codes("University of Oxford, United Kingdom");
        assert!(
            codes.contains(&"UK".to_string()),
            "expected UK in {:?}",
            codes
        );
    }

    // Smoke test only: checks that the call does not panic. The result is
    // deliberately not asserted.
    #[test]
    fn test_get_country_codes_no_match() {
        let codes = get_country_codes("Department of Zoology");
        let _ = codes;
    }

    #[test]
    fn test_normalize_affiliation_abbreviations() {
        assert_eq!(
            normalize_affiliation("Dept. of Physics, Univ. of Vienna"),
            "department of Physics, university of Vienna"
        );
        assert_eq!(
            normalize_affiliation("Max Planck Inst. for Biology"),
            "Max Planck institute for Biology"
        );
        assert_eq!(
            normalize_affiliation("MIT Lab for AI & Tech"),
            "MIT laboratory for AI and technology"
        );
    }

    #[test]
    fn test_normalize_affiliation_leading_the() {
        assert_eq!(
            normalize_affiliation("The University of Tokyo"),
            "University of Tokyo"
        );
        assert_eq!(normalize_affiliation("the Royal Society"), "Royal Society");
    }

    #[test]
    fn test_normalize_affiliation_ampersand() {
        assert_eq!(
            normalize_affiliation("Science & Technology"),
            "Science and Technology"
        );
        assert_eq!(normalize_affiliation("Arts&Sciences"), "Arts and Sciences");
    }

    #[test]
    fn test_split_affiliation_basic() {
        let parts = split_affiliation("Dept of Physics, University of Vienna, Austria");
        assert!(parts.contains(&"University of Vienna"), "got {:?}", parts);
        assert!(!parts.contains(&"Austria"), "single-word parts should be excluded");
    }

    #[test]
    fn test_split_affiliation_semicolon() {
        let parts = split_affiliation("Harvard Medical School; Boston, USA");
        assert!(parts.iter().any(|p| p.contains("Harvard")));
    }

    #[test]
    fn test_get_countries_region_mapping() {
        let regions = get_countries("Harvard University, USA");
        assert!(
            regions.contains(&"US-PR".to_string()),
            "expected US-PR in {:?}",
            regions
        );
    }

    #[test]
    fn test_get_countries_uk_region() {
        let regions = get_countries("Imperial College London, UK");
        assert!(
            regions.contains(&"GB-UK".to_string()),
            "expected GB-UK in {:?}",
            regions
        );
    }

    #[test]
    fn test_to_region_special() {
        assert_eq!(to_region("US"), "US-PR");
        assert_eq!(to_region("PR"), "US-PR");
        assert_eq!(to_region("GB"), "GB-UK");
        assert_eq!(to_region("UK"), "GB-UK");
        assert_eq!(to_region("CN"), "CN-HK-TW");
        assert_eq!(to_region("HK"), "CN-HK-TW");
        assert_eq!(to_region("TW"), "CN-HK-TW");
    }

    // NOTE: despite the name, codes without a special region map to the empty
    // string, not to themselves.
    #[test]
    fn test_to_region_passthrough() {
        assert_eq!(to_region("DE"), "");
        assert_eq!(to_region("FR"), "");
    }

    #[test]
    fn test_get_country_codes_france() {
        let codes = get_country_codes("CNRS, France");
        assert!(
            codes.contains(&"FR".to_string()),
            "expected FR in {:?}",
            codes
        );
    }

    #[test]
    fn test_get_country_codes_japan() {
        let codes = get_country_codes("University of Tokyo, Japan");
        assert!(
            codes.contains(&"JP".to_string()),
            "expected JP in {:?}",
            codes
        );
    }

    /// Parses the shared fixture into a raw `Ror` record.
    fn sample_ror() -> Ror {
        serde_json::from_str(ROR_ORG).unwrap()
    }

    #[test]
    fn test_convert_ror_csv_basic() {
        let ror = sample_ror();
        let row = convert_ror_csv(&ror);
        assert_eq!(row.id, "https://ror.org/02nr0ka47");
        assert_eq!(row.name, "Impactstory");
        assert_eq!(row.types, "nonprofit");
        assert_eq!(row.status, "active");
        assert_eq!(row.links, "https://impactstory.org");
        assert_eq!(row.established, "2013");
        assert_eq!(row.country_code, "US");
        assert_eq!(row.place, "Williamsburg");
    }

    #[test]
    fn test_convert_ror_csv_external_ids() {
        let ror = sample_ror();
        let row = convert_ror_csv(&ror);
        assert_eq!(row.external_ids_wikidata_all, "Q19341888");
        assert_eq!(row.external_ids_grid_preferred, "grid.465570.2");
        assert_eq!(row.external_ids_fundref_preferred, "100012611");
    }

    #[test]
    fn test_convert_ror_csv_relationships() {
        let ror = sample_ror();
        let row = convert_ror_csv(&ror);
        assert_eq!(row.relationships, "Related: https://ror.org/045gyfv07");
    }

    #[test]
    fn test_convert_ror_csv_no_established() {
        let mut ror = sample_ror();
        ror.established = None;
        let row = convert_ror_csv(&ror);
        assert!(row.established.is_empty());
        // A year of 0 is also rendered as an empty cell.
        ror.established = Some(0);
        let row = convert_ror_csv(&ror);
        assert!(row.established.is_empty());
    }

    #[test]
    fn test_convert_ror_csv_no_locations() {
        let mut ror = sample_ror();
        ror.locations.clear();
        let row = convert_ror_csv(&ror);
        assert!(row.country_code.is_empty());
        assert!(row.place.is_empty());
    }

    #[test]
    fn test_write_csv_roundtrip() {
        let ror = sample_ror();
        let bytes = write_csv(std::slice::from_ref(&ror)).unwrap();
        let text = String::from_utf8(bytes).unwrap();
        let mut reader = csv::Reader::from_reader(text.as_bytes());
        let records: Vec<RorCsv> = reader.deserialize().map(|r| r.unwrap()).collect();
        assert_eq!(records.len(), 1);
        assert_eq!(records[0].name, "Impactstory");
        assert_eq!(records[0].id, "https://ror.org/02nr0ka47");
    }

    #[test]
    fn test_write_parquet_roundtrip() {
        let ror = sample_ror();
        let bytes = write_parquet(std::slice::from_ref(&ror)).unwrap();
        assert!(!bytes.is_empty());
        // Parquet files start and end with the magic bytes "PAR1".
        assert_eq!(&bytes[0..4], b"PAR1");
        assert_eq!(&bytes[bytes.len() - 4..], b"PAR1");
    }

    #[test]
    fn test_write_all_dispatch() {
        let ror = sample_ror();
        let list = vec![ror];
        let json = write_all(&list, ".json").unwrap();
        assert!(String::from_utf8(json).unwrap().contains("Impactstory"));
        let yaml = write_all(&list, ".yaml").unwrap();
        assert!(String::from_utf8(yaml).unwrap().contains("Impactstory"));
        let jsonl = write_all(&list, ".jsonl").unwrap();
        let jsonl_text = String::from_utf8(jsonl).unwrap();
        assert_eq!(jsonl_text.lines().count(), 1);
        let csv_bytes = write_all(&list, ".csv").unwrap();
        assert!(
            String::from_utf8(csv_bytes)
                .unwrap()
                .contains("Impactstory")
        );
        let parquet_bytes = write_all(&list, ".parquet").unwrap();
        assert_eq!(&parquet_bytes[0..4], b"PAR1");
        // Unknown extensions are rejected.
        assert!(write_all(&list, ".sql").is_err());
    }

    #[test]
    fn test_write_all_empty_list() {
        let list: Vec<Ror> = vec![];
        // `assert_eq!` prints both sides on failure.
        assert_eq!(write_all(&list, ".json").unwrap(), b"[]");
        let csv_bytes = write_all(&list, ".csv").unwrap();
        assert!(csv_bytes.is_empty());
    }

    // Network test: only the shape of the answer is checked. Pinning the exact
    // version or date of the "latest" release would make the test fail as soon
    // as ROR publishes a new data dump.
    #[test]
    #[ignore = "network"]
    fn test_fetch_latest_ror_release() {
        let release = fetch_latest_ror_release().unwrap();
        assert!(
            release.version.starts_with('v'),
            "version should start with 'v': {}",
            release.version
        );
        assert_eq!(
            release.date.len(),
            10,
            "date should be YYYY-MM-DD: {}",
            release.date
        );
        assert!(!release.zenodo_id.is_empty());
        assert!(release.filename.ends_with(".zip"));
        assert!(release.download_url.contains("zenodo.org"));
    }
}