use crate::pubmed::tags::PubmedTag;
use compact_str::CompactString;
use std::borrow::Cow;
/// An author name taken from a PubMed record, stored as raw text together with
/// a flag telling which of the two layouts the text uses.
///
/// Equality is derived, so two values are equal only when both the text and
/// the layout flag match.
#[derive(PartialEq)]
pub(crate) struct AuthorName {
/// Raw name text exactly as it was handed to the `au` / `fau` constructor.
name: String,
/// `true` when built via `fau` (comma-separated layout, e.g. `"Watson, James D"`);
/// `false` when built via `au` (space-separated layout, e.g. `"Watson JD"`).
full: bool,
}
impl AuthorName {
    /// Wraps a short-form name such as `"Watson JD"`.
    pub fn au(name: String) -> Self {
        Self { name, full: false }
    }

    /// Wraps a full-form name such as `"Watson, James D"`.
    pub fn fau(name: String) -> Self {
        Self { name, full: true }
    }

    /// Splits the raw text into `(family, remainder)`.
    ///
    /// Full names are cut at the first `", "`; short names at the last space.
    /// Returns `None` when the relevant separator is absent.
    fn name_parts(&self) -> Option<(&str, &str)> {
        if self.full {
            self.name.split_once(", ")
        } else {
            self.name.rsplit_once(' ')
        }
    }

    /// Returns the family-name portion, or the whole text when the name
    /// contains no separator.
    pub fn last_name(&self) -> &str {
        self.name_parts()
            .map_or(self.name.as_str(), |(family, _)| family)
    }

    /// Returns the initials of the given names.
    ///
    /// Full names derive them via `fau_initials`, short names via `au_initials`.
    pub fn first_initials(&self) -> CompactString {
        if self.full {
            return fau_initials(&self.name);
        }
        au_initials(&self.name)
    }

    /// Returns everything after the separator, or `None` when there is none.
    ///
    /// For a short name this is the trailing token, i.e. the initials.
    pub(crate) fn given_name(&self) -> Option<&str> {
        self.name_parts().map(|(_, remainder)| remainder)
    }

    /// Renders the name in the short `"<last name> <initials>"` layout.
    ///
    /// Short names are borrowed unchanged. Full names borrow the family name
    /// when no initials can be derived and allocate only otherwise.
    pub fn as_au(&self) -> Cow<'_, str> {
        if !self.full {
            return Cow::Borrowed(self.name.as_str());
        }
        let initials = self.first_initials();
        let family = self.last_name();
        if initials.is_empty() {
            Cow::Borrowed(family)
        } else {
            Cow::Owned(format!("{} {}", family, initials))
        }
    }

    /// Checks whether the short-form name `au` is compatible with this name.
    ///
    /// `au` is cut at its last space into family name and initials (no space
    /// means no initials). The family names must be identical, and this name's
    /// initials must start with the initials found in `au`.
    pub fn au_equals(&self, au: &str) -> bool {
        let (family, initials) = match au.rsplit_once(' ') {
            Some(parts) => parts,
            None => (au, ""),
        };
        if self.last_name() != family {
            return false;
        }
        self.first_initials().starts_with(initials)
    }
}
impl std::fmt::Debug for AuthorName {
    /// Writes `FAU(<name>)` for full names and `AU(<name>)` for short ones.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = if self.full { "FAU" } else { "AU" };
        write!(f, "{}({})", label, self.name)
    }
}
/// Extracts the initials from a short-form name such as `"Watson JD"`.
///
/// The initials are whatever follows the last space; a name without any space
/// yields an empty string.
fn au_initials(au: &str) -> CompactString {
    au.rsplit_once(' ').map_or_else(
        || CompactString::const_new(""),
        |(_, tail)| CompactString::new(tail),
    )
}
/// Derives the initials from a full-form name such as `"Watson, James Dewey"`.
///
/// The text after the first `", "` is split on single spaces and the first
/// character of every non-empty token is collected, so the example yields
/// `"JD"`. A name without `", "` yields an empty string.
fn fau_initials(fau: &str) -> CompactString {
    let Some((_, given)) = fau.split_once(", ") else {
        return CompactString::const_new("");
    };
    given
        .split(' ')
        // `filter_map` rather than `map_while`: a doubled or leading space
        // produces an empty token, and stopping there would silently drop
        // every initial that follows it.
        .filter_map(|token| token.chars().next())
        .collect()
}
/// The kinds of record entries that `resolve_authors` consumes.
///
/// `Debug` is derived so that tagged input such as
/// `Vec<(ConsecutiveTag, String)>` can be printed and compared with
/// `assert_eq!`.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub(crate) enum ConsecutiveTag {
    /// A short-form author name, e.g. `"Watson JD"`.
    Author,
    /// A full-form author name, e.g. `"Watson, James D"`.
    FullAuthorName,
    /// An affiliation string attached to the most recent author.
    Affiliation,
}
impl ConsecutiveTag {
    /// Converts a [`PubmedTag`] into the matching [`ConsecutiveTag`].
    ///
    /// Only the author, full-author-name and affiliation tags have a
    /// counterpart; every other tag yields `None`.
    pub(crate) fn from_tag(tag: PubmedTag) -> Option<Self> {
        let kind = match tag {
            PubmedTag::Author => Self::Author,
            PubmedTag::FullAuthorName => Self::FullAuthorName,
            PubmedTag::Affiliation => Self::Affiliation,
            _ => return None,
        };
        Some(kind)
    }
}
/// One author produced by `resolve_authors`, together with the affiliation
/// strings that were attached to that author.
#[derive(Debug, PartialEq)]
pub(crate) struct PubmedAuthor {
/// The author's name, in whichever form (short or full) it was first seen.
pub(crate) name: AuthorName,
/// Affiliation strings, in the order they were attached.
pub(crate) affiliations: Vec<String>,
}
impl PubmedAuthor {
    /// Creates an author with the given name and no affiliations yet.
    ///
    /// Room for one affiliation is reserved up front.
    fn new(name: AuthorName) -> Self {
        let affiliations = Vec::with_capacity(1);
        Self { name, affiliations }
    }

    /// Creates an author from a short-form name.
    fn from_au(au: String) -> Self {
        let name = AuthorName::au(au);
        Self::new(name)
    }

    /// Creates an author from a full-form name.
    fn from_fau(au: String) -> Self {
        let name = AuthorName::fau(au);
        Self::new(name)
    }
}
/// Folds a sequence of tagged values into a list of authors.
///
/// * A full author name always starts a new author.
/// * A short author name starts a new author unless the most recent author
///   carries a full name that the short name matches (`AuthorName::au_equals`),
///   in which case the short name is dropped as a duplicate.
/// * An affiliation is appended to the most recent author.
///
/// Returns the authors in input order, plus the affiliations that appeared
/// before any author and therefore could not be attached to one.
pub(crate) fn resolve_authors(
    data: Vec<(ConsecutiveTag, String)>,
) -> (Vec<PubmedAuthor>, Vec<String>) {
    let mut resolved: Vec<PubmedAuthor> = Vec::with_capacity(data.len() / 2 + 1);
    let mut orphans = Vec::new();

    for (tag, text) in data {
        match tag {
            ConsecutiveTag::FullAuthorName => resolved.push(PubmedAuthor::from_fau(text)),
            ConsecutiveTag::Author => {
                // Only a preceding *full* name can absorb a short one.
                let repeats_previous = match resolved.last() {
                    Some(previous) => previous.name.full && previous.name.au_equals(&text),
                    None => false,
                };
                if !repeats_previous {
                    resolved.push(PubmedAuthor::from_au(text));
                }
            }
            ConsecutiveTag::Affiliation => match resolved.last_mut() {
                Some(owner) => owner.affiliations.push(text),
                None => orphans.push(text),
            },
        }
    }

    (resolved, orphans)
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use rstest::*;

    // Each case lists the short form, the full form, and the last name,
    // initials and given name expected from them.
    #[rstest]
    #[case("", "", "", "", None)]
    #[case("Archimedes", "Archimedes", "Archimedes", "", None)]
    #[case("Einstein A", "Einstein, Albert", "Einstein", "A", Some("Albert"))]
    #[case("Newton I", "Newton, Isaac", "Newton", "I", Some("Isaac"))]
    #[case("Watson JD", "Watson, James D", "Watson", "JD", Some("James D"))]
    #[case(
        "Watson JD",
        "Watson, James Dewey",
        "Watson",
        "JD",
        Some("James Dewey")
    )]
    #[case("Crick FH", "Crick, Francis H", "Crick", "FH", Some("Francis H"))]
    #[case(
        "Crick FHC",
        "Crick, Francis Harry Compton",
        "Crick",
        "FHC",
        Some("Francis Harry Compton")
    )]
    #[case(
        "van der Valk JPM",
        "van der Valk, J P M",
        "van der Valk",
        "JPM",
        Some("J P M")
    )]
    fn test_author_name(
        #[case] au: &str,
        #[case] fau: &str,
        #[case] last_name: &str,
        #[case] initials: &str,
        #[case] given_name: Option<&str>,
    ) {
        let full = AuthorName::fau(fau.to_string());
        assert_eq!(full.last_name(), last_name);
        assert_eq!(full.first_initials(), initials);
        assert_eq!(full.as_au(), au);
        assert_eq!(full.given_name(), given_name);

        let short = AuthorName::au(au.to_string());
        assert_eq!(short.last_name(), last_name);
        assert_eq!(short.first_initials(), initials);
        assert_eq!(short.as_au(), au);
        // For a short name the "given name" is the initials token.
        if given_name.is_some() {
            assert_eq!(short.given_name(), Some(initials));
        }
    }

    // Consecutive short names are never merged, even when identical.
    #[rstest]
    #[case(&["Watson JD", "Crick FH"])]
    #[case(&["Watson JD", "Watson JD"])]
    fn test_resolve_author_consecutive_au(#[case] names: &[&str]) {
        let data = names
            .iter()
            .map(|s| (ConsecutiveTag::Author, s.to_string()))
            .collect();
        let (authors, _) = resolve_authors(data);
        let actual: Vec<_> = authors.iter().map(|a| a.name.as_au()).collect();
        assert_eq!(&actual, names);
    }

    #[rstest]
    fn test_resolve_author_typical() {
        let data = vec![
            (ConsecutiveTag::FullAuthorName, "Lerch, Jason P".to_string()),
            (ConsecutiveTag::Author, "Lerch JP".to_string()),
            (ConsecutiveTag::Affiliation, "Program in Neuroscience and Mental Health, The Hospital for Sick Children, Toronto, Canada.".to_string()),
            (ConsecutiveTag::Affiliation, "Department of Medical Biophysics, University of Toronto, Toronto, Canada.".to_string()),
            (ConsecutiveTag::FullAuthorName, "van der Kouwe, André J W".to_string()),
            (ConsecutiveTag::Author, "van der Kouwe AJ".to_string()),
            (ConsecutiveTag::Affiliation, "Athinoula A. Martinos Center for Biomedical Research, Department of Radiology, Massachusetts General Hospital and Harvard Medical School, Charlestown, Massachusetts, USA.".to_string()),
            (ConsecutiveTag::Affiliation, "Department of Radiology, Massachusetts General Hospital and Harvard Medical School, Boston, Massachusetts, USA.".to_string()),
            (ConsecutiveTag::FullAuthorName, "Fischl, Bruce".to_string()),
            (ConsecutiveTag::Author, "Fischl B".to_string()),
            (ConsecutiveTag::Affiliation, "Athinoula A. Martinos Center for Biomedical Research, Department of Radiology, Massachusetts General Hospital and Harvard Medical School, Charlestown, Massachusetts, USA.".to_string()),
            (ConsecutiveTag::Affiliation, "Department of Radiology, Massachusetts General Hospital and Harvard Medical School, Boston, Massachusetts, USA.".to_string()),
            (ConsecutiveTag::Affiliation, "Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts, USA.".to_string()),
        ];
        let (actual, leading_affiliations) = resolve_authors(data);
        assert!(leading_affiliations.is_empty());
        let expected = vec![
            PubmedAuthor {
                name: AuthorName::fau("Lerch, Jason P".to_string()),
                affiliations: vec![
                    "Program in Neuroscience and Mental Health, The Hospital for Sick Children, Toronto, Canada.".to_string(),
                    "Department of Medical Biophysics, University of Toronto, Toronto, Canada.".to_string()
                ],
            },
            PubmedAuthor {
                name: AuthorName::fau("van der Kouwe, André J W".to_string()),
                affiliations: vec![
                    "Athinoula A. Martinos Center for Biomedical Research, Department of Radiology, Massachusetts General Hospital and Harvard Medical School, Charlestown, Massachusetts, USA.".to_string(),
                    "Department of Radiology, Massachusetts General Hospital and Harvard Medical School, Boston, Massachusetts, USA.".to_string()
                ],
            },
            PubmedAuthor {
                name: AuthorName::fau("Fischl, Bruce".to_string()),
                affiliations: vec![
                    "Athinoula A. Martinos Center for Biomedical Research, Department of Radiology, Massachusetts General Hospital and Harvard Medical School, Charlestown, Massachusetts, USA.".to_string(),
                    "Department of Radiology, Massachusetts General Hospital and Harvard Medical School, Boston, Massachusetts, USA.".to_string(),
                    "Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts, USA.".to_string()
                ],
            },
        ];
        assert_eq!(actual, expected);
    }

    // Every input below must resolve to the same two authors.
    #[rstest]
    #[case(&[
        (ConsecutiveTag::FullAuthorName, "Bose, Satyendra N"),
        (ConsecutiveTag::Author, "Bose SN"),
        (ConsecutiveTag::FullAuthorName, "Einstein, Albert"),
        (ConsecutiveTag::Author, "Einstein A"),
    ])]
    #[case(&[
        (ConsecutiveTag::FullAuthorName, "Bose, Satyendra N"),
        (ConsecutiveTag::FullAuthorName, "Einstein, Albert"),
        (ConsecutiveTag::Author, "Einstein A"),
    ])]
    #[case(&[
        (ConsecutiveTag::Author, "Bose SN"),
        (ConsecutiveTag::Author, "Einstein A"),
    ])]
    fn test_resolve_author_deduplication(#[case] names: &[(ConsecutiveTag, &str)]) {
        let data = names.iter().map(|(t, n)| (*t, n.to_string())).collect();
        let (authors, _) = resolve_authors(data);
        let actual: Vec<_> = authors.iter().map(|a| a.name.as_au()).collect();
        assert_eq!(&actual, &["Bose SN", "Einstein A"]);
    }

    // Affiliations seen before any author are returned separately.
    #[rstest]
    fn test_resolve_author_leading_affiliations() {
        let data = vec![
            (
                ConsecutiveTag::Affiliation,
                "Lab of Unknown Stuff".to_string(),
            ),
            (
                ConsecutiveTag::Affiliation,
                "Mysterious Basement".to_string(),
            ),
            (
                ConsecutiveTag::FullAuthorName,
                "Einstein, Albert".to_string(),
            ),
            (ConsecutiveTag::Author, "Einstein A".to_string()),
            (
                ConsecutiveTag::Affiliation,
                "University of Zurich".to_string(),
            ),
            (
                ConsecutiveTag::Affiliation,
                "University of Bern".to_string(),
            ),
        ];
        let (authors, leading_affiliations) = resolve_authors(data);
        let expected = [
            "Lab of Unknown Stuff".to_string(),
            "Mysterious Basement".to_string(),
        ];
        assert_eq!(leading_affiliations, &expected);
        assert_eq!(authors.len(), 1);
        let author = &authors[0].name;
        assert_eq!(author.name, "Einstein, Albert");
        assert!(author.full);
        let affiliations = &authors[0].affiliations;
        let expected = [
            "University of Zurich".to_string(),
            "University of Bern".to_string(),
        ];
        assert_eq!(affiliations, &expected);
    }
}