Skip to main content

commonmeta/
author_utils.rs

1use serde_json::Value;
2
3use crate::data::Affiliation;
4use crate::utils::{normalize_id, validate_orcid, validate_ror};
5use crate::constants::CONTRIBUTOR_ROLES;
6
/// Substrings that mark a name as belonging to an organization rather than a person.
///
/// [`is_personal_name`] rejects any name that contains one of these entries. The check is a
/// case-sensitive substring test, so an entry also matches inside longer words (for example
/// `"Lab"` matches `"Laboratory"`), and the lowercase `"area"` entry matches anywhere in a name.
const ORG_HINT_WORDS: &[&str] = &[
    "University",
    "College",
    "Institute",
    "School",
    "Center",
    "Department",
    "Laboratory",
    "Library",
    "Museum",
    "Foundation",
    "Society",
    "Association",
    "Company",
    "Corporation",
    "Collaboration",
    "Consortium",
    "Incorporated",
    "Inc.",
    "Institut",
    "Research",
    "Science",
    "Team",
    "Ministry",
    "Government",
    "Count",
    "Reviewers",
    "Staff",
    "Lab",
    "Redaktion",
    "Group",
    "area",
];
40
/// Normalizes a raw author string.
///
/// Returns `None` when `author` is `None`, is blank, or starts with a comma once surrounding
/// whitespace has been trimmed. Otherwise returns the name with every run of whitespace
/// collapsed to a single space and the spaces around free-standing hyphens removed
/// (`"Jean - Paul"` becomes `"Jean-Paul"`).
pub fn cleanup_author(author: Option<&str>) -> Option<String> {
    let trimmed = author?.trim();
    if trimmed.is_empty() || trimmed.starts_with(',') {
        return None;
    }

    // Collapse whitespace *before* joining hyphens: otherwise a hyphen padded with tabs,
    // non-breaking spaces or repeated spaces never matches the literal " - " pattern.
    let collapsed = trimmed.split_whitespace().collect::<Vec<_>>().join(" ");

    // `trimmed` holds at least one non-whitespace character, so the result is never empty.
    Some(collapsed.replace(" - ", "-"))
}
61
62pub fn to_ror_id(id: Option<&str>) -> Option<String> {
63    let Some(id) = id else {
64        return None;
65    };
66    validate_ror(id).map(|ror| format!("https://ror.org/{}", ror))
67}
68
69pub fn is_personal_name(name: &str) -> bool {
70    if name.contains(';') {
71        return false;
72    }
73
74    if name.split_whitespace().count() == 1 && !name.contains(',') {
75        return false;
76    }
77
78    if ORG_HINT_WORDS.iter().any(|word| name.contains(word)) {
79        return false;
80    }
81
82    if let Some(last) = name.rsplit(", ").next()
83        && matches!(last, "MD" | "PhD" | "BS")
84    {
85        return true;
86    }
87
88    name.contains(',') || name.split_whitespace().count() >= 2
89}
90
/// Splits a personal name into `(given_name, family_name, name)`.
///
/// * `"Family, Given"` is split at the first comma.
/// * `"Given Family"` is split at the last space, so everything before the final word is the
///   given name.
/// * Anything else (a single word) is returned unsplit in the third slot, with the first two
///   slots empty.
///
/// Trailing credential suffixes (`", MD"`, `", PhD"`, `", BS"`) are dropped before splitting,
/// so `"John Smith, MD"` yields `("John", "Smith", "")` instead of treating `"MD"` as the
/// given name. These are the same suffixes `is_personal_name` accepts as marking a person.
/// A blank input yields three empty strings.
pub fn split_person_name(name: &str) -> (String, String, String) {
    const CREDENTIAL_SUFFIXES: [&str; 3] = ["MD", "PhD", "BS"];

    let mut name = name.trim();

    // Peel off credentials from the right; several may be chained ("…, MD, PhD").
    while let Some((head, tail)) = name.rsplit_once(", ") {
        if !CREDENTIAL_SUFFIXES.contains(&tail.trim()) {
            break;
        }
        name = head.trim_end();
    }

    if name.is_empty() {
        return (String::new(), String::new(), String::new());
    }

    if let Some((family, given)) = name.split_once(',') {
        return (
            given.trim().to_string(),
            family.trim().to_string(),
            String::new(),
        );
    }

    if let Some((given, family)) = name.rsplit_once(' ') {
        let (given, family) = (given.trim(), family.trim());
        if !given.is_empty() && !family.is_empty() {
            return (given.to_string(), family.to_string(), String::new());
        }
    }

    (String::new(), String::new(), name.to_string())
}
113
114pub fn infer_contributor_type(
115    raw_type: &str,
116    id: &str,
117    given_name: &str,
118    family_name: &str,
119    name: &str,
120    via: Option<&str>,
121) -> String {
122    let mut type_ = raw_type.to_string();
123    if type_.ends_with("al") {
124        type_.truncate(type_.len() - 2);
125    }
126
127    if type_.is_empty() && validate_ror(id).is_some() {
128        return "Organization".to_string();
129    }
130    if type_.is_empty() && validate_orcid(id).is_some() {
131        return "Person".to_string();
132    }
133    if type_.is_empty() && (!given_name.is_empty() || !family_name.is_empty()) {
134        return "Person".to_string();
135    }
136    if type_.is_empty() && !name.is_empty() && via == Some("crossref") {
137        return "Organization".to_string();
138    }
139    if type_.is_empty() && is_personal_name(name) {
140        return "Person".to_string();
141    }
142    if type_.is_empty() && !name.is_empty() {
143        return "Organization".to_string();
144    }
145    type_
146}
147
148pub fn normalize_contributor_roles(raw_roles: &[String], default_role: &str) -> Vec<String> {
149    let filtered: Vec<String> = raw_roles
150        .iter()
151        .filter(|r| CONTRIBUTOR_ROLES.contains(&r.as_str()))
152        .cloned()
153        .collect();
154    if filtered.is_empty() {
155        vec![default_role.to_string()]
156    } else {
157        filtered
158    }
159}
160
161pub fn parse_affiliation_value(v: &Value) -> Option<Affiliation> {
162    if let Some(name) = v.as_str() {
163        if name.is_empty() {
164            return None;
165        }
166        return Some(Affiliation {
167            name: name.to_string(),
168            ..Default::default()
169        });
170    }
171
172    let obj = v.as_object()?;
173    let mut affiliation_identifier = String::new();
174    let name = obj
175        .get("name")
176        .and_then(Value::as_str)
177        .or_else(|| obj.get("#text").and_then(Value::as_str))
178        .unwrap_or("")
179        .to_string();
180
181    if let Some(raw_aff_id) = obj.get("affiliationIdentifier").and_then(Value::as_str) {
182        let normalized = if !raw_aff_id.starts_with("https://") {
183            if let Some(scheme_uri) = obj.get("schemeURI").and_then(Value::as_str) {
184                let normalized_scheme = if scheme_uri.ends_with('/') {
185                    scheme_uri.to_string()
186                } else {
187                    format!("{}/", scheme_uri)
188                };
189                normalize_id(&format!("{}{}", normalized_scheme, raw_aff_id))
190            } else {
191                normalize_id(raw_aff_id)
192            }
193        } else {
194            normalize_id(raw_aff_id)
195        };
196        affiliation_identifier = normalized;
197    } else if let Some(id_val) = obj
198        .get("id")
199        .and_then(Value::as_str)
200        .or_else(|| obj.get("@id").and_then(Value::as_str))
201    {
202        if id_val.starts_with("http://") || id_val.starts_with("https://") {
203            affiliation_identifier = id_val.to_string();
204        } else if let Some(ror) = to_ror_id(Some(id_val)) {
205            // bare ROR identifier (e.g. "03y3e3s17" from InvenioRDM)
206            affiliation_identifier = ror;
207        }
208    } else if let Some(same_as) = obj.get("sameAs").and_then(Value::as_str)
209        && (same_as.starts_with("http://") || same_as.starts_with("https://"))
210    {
211        affiliation_identifier = same_as.to_string();
212    }
213
214    if name.is_empty() && affiliation_identifier.is_empty() {
215        return None;
216    }
217
218    let id = to_ror_id(Some(&affiliation_identifier)).unwrap_or_default();
219    if id.is_empty() && name.is_empty() {
220        return None;
221    }
222    Some(Affiliation {
223        id,
224        name,
225        ..Default::default()
226    })
227}
228
229pub fn parse_affiliations(values: &[Value]) -> Vec<Affiliation> {
230    let mut out = Vec::new();
231    for value in values {
232        if let Some(aff) = parse_affiliation_value(value) {
233            let duplicate = out
234                .iter()
235                .any(|existing: &Affiliation| existing.id == aff.id && existing.name == aff.name);
236            if !duplicate {
237                out.push(aff);
238            }
239        }
240    }
241    out
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Comma-separated and space-separated names are people; organization hint words are not.
    #[test]
    fn detects_personal_name() {
        for person in ["Doe, Jane", "Jane Doe"] {
            assert!(is_personal_name(person));
        }
        assert!(!is_personal_name("Big Science Collaboration"));
    }

    /// With no explicit type and no names, the identifier alone decides the type.
    #[test]
    fn infers_type_from_orcid_and_ror() {
        let from_orcid =
            infer_contributor_type("", "https://orcid.org/0000-0001-5000-0007", "", "", "", None);
        assert_eq!(from_orcid, "Person");

        let from_ror = infer_contributor_type("", "https://ror.org/05dxps055", "", "", "", None);
        assert_eq!(from_ror, "Organization");
    }

    /// A plain string and an object with the same name but a ROR id are distinct affiliations.
    #[test]
    fn parses_affiliation_from_string_and_object() {
        let as_string = Value::String("Example University".to_string());
        let as_object =
            serde_json::json!({"id": "https://ror.org/05dxps055", "name": "Example University"});

        let affiliations = parse_affiliations(&[as_string, as_object]);

        assert_eq!(affiliations.len(), 2);
        assert_eq!(affiliations[0].name, "Example University");
        assert_eq!(affiliations[1].id, "https://ror.org/05dxps055");
    }
}
278}