use super::case::is_mixed_case;
use super::namepart::{Location, NamePart};
use super::suffix;
use super::surname;
use super::title;
use crate::Cow;
use smallvec::SmallVec;
use std::num::NonZeroU8;
/// The result of a successful call to [`parse`].
///
/// The name's words are exposed through [`Name::words`]; honorific prefixes
/// and suffixes are kept separately and rendered on demand by
/// [`Name::honorific_prefix`] and [`Name::honorific_suffix`].
pub struct Name<'a> {
/// The words that make up the name itself, as returned by `words()`.
parts: SmallVec<[NamePart<'a>; 7]>,
/// Index into `words()` at which the surname begins; the words before it
/// are the given/middle names or initials (e.g. `1` for "John Doe").
pub surname_index: usize,
/// Generation number taken from a generational suffix, if one was found
/// (e.g. `3` for "John Doe III", `2` for "Griffey, Jr., Ken").
pub generation: Option<NonZeroU8>,
/// Honorific prefix words, stored in the reverse of the order in which
/// they are rendered by `honorific_prefix()`.
reversed_prefixes: Vec<NamePart<'a>>,
/// Honorific suffix words, stored in the order in which they are rendered
/// by `honorific_suffix()`.
honorific_suffixes: Vec<NamePart<'a>>,
}
impl<'a> Name<'a> {
    /// Returns the words of the name itself as a slice.
    ///
    /// Honorific prefixes and suffixes are stored separately and are not
    /// part of this slice.
    pub fn words(&self) -> &[NamePart<'a>] {
        &self.parts
    }

    /// Returns the canonicalized honorific prefix, or `None` when the name
    /// has no prefix words.
    ///
    /// A single prefix word is returned as produced by
    /// `title::canonicalize_prefix`; several prefix words are canonicalized
    /// individually and joined with single spaces into an owned string.
    pub fn honorific_prefix(&self) -> Option<Cow<str>> {
        match self.reversed_prefixes.as_slice() {
            [] => None,
            [only] => Some(title::canonicalize_prefix(only)),
            several => {
                // The prefixes are stored back-to-front, so walk them in
                // reverse to restore the rendering order.
                let canonical: SmallVec<[Cow<str>; 4]> = several
                    .iter()
                    .rev()
                    .map(title::canonicalize_prefix)
                    .collect();
                Some(Cow::Owned(canonical.join(" ")))
            }
        }
    }

    /// Returns the canonicalized honorific suffix, or `None` when the name
    /// has no suffix words.
    ///
    /// A single suffix word is returned as produced by
    /// `title::canonicalize_suffix`; several suffix words are canonicalized
    /// individually and joined with single spaces into an owned string.
    pub fn honorific_suffix(&self) -> Option<Cow<str>> {
        match self.honorific_suffixes.as_slice() {
            [] => None,
            [only] => Some(title::canonicalize_suffix(only)),
            several => {
                let canonical: SmallVec<[Cow<str>; 4]> = several
                    .iter()
                    .map(title::canonicalize_suffix)
                    .collect();
                Some(Cow::Owned(canonical.join(" ")))
            }
        }
    }
}
/// Mutable working state for a single call to [`parse`].
///
/// On success `parse` moves `words`, `surname_index`,
/// `generation_from_suffix`, `reversed_prefixes` and `honorific_suffixes`
/// into the resulting [`Name`].
#[derive(Debug)]
struct ParseOp<'a> {
/// Words currently believed to belong to the name itself.
words: SmallVec<[NamePart<'a>; 7]>,
/// Index into `words` of the first surname word; `0` is also used to mean
/// "not determined yet" (see `run`).
surname_index: usize,
/// Generation number from the first suffix for which
/// `suffix::generation_from_suffix` returned a value.
generation_from_suffix: Option<NonZeroU8>,
/// Honorific prefix words; `Name::honorific_prefix` renders them in
/// reverse of their storage order.
reversed_prefixes: Vec<NamePart<'a>>,
/// Honorific suffix words, rendered by `Name::honorific_suffix` in
/// storage order.
honorific_suffixes: Vec<NamePart<'a>>,
/// Result of `is_mixed_case` on the raw input; forwarded to
/// `NamePart::all_from_text` whenever text is split into words.
use_capitalization: bool,
}
/// Largest number of words a name may contain and still be considered valid
/// by the parser (the maximum value of a `u8`).
pub const MAX_WORDS: usize = u8::MAX as usize;
/// Parses `name` into a [`Name`].
///
/// Returns `None` when the parser cannot arrive at a valid name, i.e. when
/// `ParseOp::run` reports failure for this input.
pub fn parse(name: &str) -> Option<Name> {
    let mut op = ParseOp {
        words: SmallVec::new(),
        surname_index: 0,
        generation_from_suffix: None,
        reversed_prefixes: Vec::new(),
        honorific_suffixes: Vec::new(),
        use_capitalization: is_mixed_case(name),
    };

    if !op.run(name) {
        return None;
    }

    // Hand the accumulated state over to the public result type.
    let ParseOp {
        words,
        surname_index,
        generation_from_suffix,
        reversed_prefixes,
        honorific_suffixes,
        ..
    } = op;
    Some(Name {
        parts: words,
        surname_index,
        generation: generation_from_suffix,
        reversed_prefixes,
        honorific_suffixes,
    })
}
impl<'a> ParseOp<'a> {
/// Drives the whole parse: splits `name` on commas, hands each part to the
/// matching `handle_*` method, applies two repair passes, and returns
/// whether the final state is `valid()`.
fn run(&mut self, name: &'a str) -> bool {
let mut parts = name.split(',').peekable();
while let Some(part) = parts.next() {
// "First" means no words have been collected yet, so a leading part that
// produced no words leaves the next part to be treated as first as well.
let first_part = self.words.is_empty();
let last_part = parts.peek().is_none();
if first_part && last_part {
// No (meaningful) comma: the whole name is in this part.
self.handle_no_comma(part);
} else if first_part {
self.handle_before_comma(part);
} else if self.surname_index == 0 {
// Words were collected but no surname position beyond index 0 has been
// settled, so this part may still contribute words that go in front.
// When this is the final part and only one word exists so far, at least
// one word of this part must be kept as part of the name.
let must_include_given = last_part && self.words.len() == 1;
self.handle_after_comma(part, must_include_given);
} else {
// The name is already laid out; the remainder is suffix material.
self.handle_after_surname(part);
}
}
// Repair pass 1: if the result is not valid, move back one stored suffix
// (appended to the words) or, failing that, one stored prefix (inserted at
// the front) that looks like a name or initials.
if !self.valid() {
if let Some(i) = self.possible_false_postfix() {
self.words.push(self.honorific_suffixes.remove(i));
} else if let Some(i) = self.possible_false_prefix() {
self.words.insert(0, self.reversed_prefixes.remove(i));
}
}
// Repair pass 2: drop trailing words that are not name-like. Every removal
// resets `surname_index` to 0 so that it is recomputed below.
while self.words.last().filter(|w| !w.is_namelike()).is_some() {
let removed = self.words.pop().unwrap();
self.surname_index = 0;
if self.use_capitalization && !self.valid() {
// Removing the word broke validity: rebuild it with `false` as the second
// argument of `from_word` (presumably the capitalization flag — TODO
// confirm) and keep it if it is name-like under that reading.
let word = NamePart::from_word(removed.word, false, Location::End);
if word.is_namelike() {
self.words.push(word);
break;
}
}
}
// Locate the surname if it is still unset; the first word is never
// considered as the start of the surname here.
if self.surname_index == 0 && self.words.len() > 1 {
self.surname_index = surname::find_surname_index(&self.words[1..]) + 1;
}
self.valid()
}
/// Reports whether the current words form an acceptable name.
///
/// All of the following must hold: there are between 2 and `MAX_WORDS`
/// words, every word is name-like or initials, and at least one word from
/// `surname_index` onwards is name-like.
fn valid(&self) -> bool {
    let count = self.words.len();
    if !(2..=MAX_WORDS).contains(&count) {
        return false;
    }

    let every_word_ok = self
        .words
        .iter()
        .all(|word| word.is_namelike() || word.is_initials());
    if !every_word_ok {
        return false;
    }

    // Only reached once the checks above have passed, as in the original
    // short-circuit chain.
    self.words[self.surname_index..]
        .iter()
        .any(|word| word.is_namelike())
}
/// Handles input in which the whole name is contained in a single part.
///
/// Leading words seen before the first name-like/initials word go straight
/// to the prefixes; the remaining words are collected, and then recognized
/// prefix titles and trailing postfixes are stripped and the surname
/// position is computed.
fn handle_no_comma(&mut self, name: &'a str) {
    debug_assert!(
        self.words.is_empty()
            && self.surname_index == 0
            && self.possible_false_prefix().is_none()
            && self.possible_false_postfix().is_none(),
        "Invalid state for handle_no_comma!"
    );

    let mut seen_name_word = false;
    for word in NamePart::all_from_text(name, self.use_capitalization, Location::Start) {
        // Once a name-like word or initials have been seen, everything
        // that follows belongs to the word list.
        seen_name_word = seen_name_word || word.is_namelike() || word.is_initials();
        if seen_name_word {
            self.words.push(word);
        } else {
            self.reversed_prefixes.insert(0, word);
        }
    }
    if self.words.is_empty() {
        return;
    }

    // Prefix titles are only looked for when more than two words exist.
    if self.words.len() > 2 {
        let prefix_title_len = title::find_prefix_len(&self.words);
        self.strip_prefix(prefix_title_len);
    }

    // A stored prefix that might really be a name counts as one more word
    // when deciding whether postfixes may be stripped.
    let has_false_prefix = self.possible_false_prefix().is_some();
    if self.words.len() + usize::from(has_false_prefix) > 2 {
        let first_postfix_index = title::find_postfix_index(&self.words[1..], false) + 1;
        self.strip_postfix(first_postfix_index);
    }

    self.surname_index = surname::find_surname_index(&self.words[1..]) + 1;
}
/// Handles the first comma-separated part when more parts follow.
///
/// Collects the part's words, strips recognized prefix titles and trailing
/// postfixes, and sets `surname_index`: when a prefix title was found the
/// first remaining word is excluded from the surname search, otherwise the
/// search covers all words (and may yield `0`).
fn handle_before_comma(&mut self, part: &'a str) {
    debug_assert!(
        self.words.is_empty()
            && self.surname_index == 0
            && self.possible_false_prefix().is_none()
            && self.possible_false_postfix().is_none(),
        "Invalid state for handle_before_comma!"
    );

    let parsed = NamePart::all_from_text(part, self.use_capitalization, Location::End);
    self.words.extend(parsed);
    if self.words.is_empty() {
        return;
    }

    let prefix_title_len = title::find_prefix_len(&self.words);
    self.strip_prefix(prefix_title_len);

    let first_postfix_index = title::find_postfix_index(&self.words[1..], false) + 1;
    self.strip_postfix(first_postfix_index);

    let found_prefix_title = prefix_title_len > 0;
    self.surname_index = if found_prefix_title {
        surname::find_surname_index(&self.words[1..]) + 1
    } else {
        surname::find_surname_index(&self.words)
    };
}
/// Handles a part that follows a comma while the surname position is still
/// `0`, i.e. the part may hold words that belong in front of the words
/// collected so far.
///
/// Prefix titles and postfixes are stripped from the part first. Whatever
/// remains is inserted at the front of `self.words`, and `surname_index` is
/// set to the number of inserted words. When `must_include_given` is true,
/// the first word of the part is never treated as a postfix.
fn handle_after_comma(&mut self, part: &'a str, must_include_given: bool) {
    debug_assert!(
        !self.words.is_empty() && self.surname_index == 0,
        "Invalid state for handle_after_comma!"
    );

    let mut candidates: SmallVec<[NamePart<'a>; 5]> =
        NamePart::all_from_text(part, self.use_capitalization, Location::Start).collect();
    match candidates.len() {
        0 => return,
        // A lone word is never examined as a prefix title.
        1 => {}
        _ => {
            let prefix_len = title::find_prefix_len(&candidates);
            self.strip_unsaved_prefix(&mut candidates, prefix_len);
        }
    }

    // Skip the first word in the postfix search when it has to be kept.
    let search_from = usize::from(must_include_given);
    let first_postfix_index =
        title::find_postfix_index(&candidates[search_from..], true) + search_from;
    self.strip_unsaved_postfix(&mut candidates, first_postfix_index);

    if candidates.is_empty() {
        return;
    }
    self.surname_index = candidates.len();
    self.words.reserve(candidates.len());
    self.words.insert_many(0, candidates);
}
/// Handles a comma-separated part that comes after the surname position has
/// been settled: every word in it is recorded as a generational suffix or
/// an honorific suffix.
fn handle_after_surname(&mut self, part: &'a str) {
    debug_assert!(
        self.surname_index > 0,
        "Invalid state for handle_after_surname!"
    );

    NamePart::all_from_text(part, self.use_capitalization, Location::End)
        .for_each(|word| self.found_suffix_or_postfix(word, false));
}
/// Moves the first `len` entries of `self.words` into
/// `self.reversed_prefixes`.
///
/// `reversed_prefixes` is kept in reverse text order: `handle_no_comma`
/// inserts leading tokens at the front, and `Name::honorific_prefix`
/// reverses the list when rendering. Entries already stored came earlier in
/// the input than the words stripped here, so each stripped word is
/// inserted at the front instead of being appended. Appending would render
/// the newly stripped titles before the earlier ones. When the list starts
/// out empty the result is the same as with appending in reverse.
///
/// # Panics
///
/// Panics if `len` is greater than `self.words.len()`.
fn strip_prefix(&mut self, len: usize) {
    // Draining front-to-back and inserting at index 0 leaves the last
    // stripped word (the one closest to the name) at the front.
    for word in self.words.drain(..len) {
        self.reversed_prefixes.insert(0, word);
    }
}
/// Moves the first `len` entries of `words` (a list that has not been
/// merged into `self.words` yet) into `self.reversed_prefixes`.
///
/// `reversed_prefixes` is kept in reverse text order, since
/// `Name::honorific_prefix` reverses it when rendering. Entries already
/// stored came from earlier in the input, so each stripped word is inserted
/// at the front instead of being appended; appending would render the new
/// titles before the earlier ones. When the list starts out empty the
/// result is the same as with appending in reverse.
///
/// # Panics
///
/// Panics if `len` is greater than `words.len()`.
fn strip_unsaved_prefix(&mut self, words: &mut SmallVec<[NamePart<'a>; 5]>, len: usize) {
    // Draining front-to-back and inserting at index 0 leaves the last
    // stripped word (the one closest to the name) at the front.
    for word in words.drain(..len) {
        self.reversed_prefixes.insert(0, word);
    }
}
fn possible_false_prefix(&self) -> Option<usize> {
self.reversed_prefixes
.iter()
.position(|p| p.is_namelike() || p.is_initials())
}
fn possible_false_postfix(&self) -> Option<usize> {
self.honorific_suffixes
.iter()
.position(|p| p.is_namelike() || p.is_initials())
}
/// Removes every word from `index` onwards from `self.words` and records
/// each one as a generational or honorific suffix. Does nothing when
/// `index` is not inside the word list.
fn strip_postfix(&mut self, index: usize) {
    if index >= self.words.len() {
        return;
    }

    // The drained words are collected first because recording a suffix
    // needs `&mut self` while `drain` still borrows `self.words`. Once the
    // drain has been consumed the list already ends at `index`.
    let trailing: SmallVec<[NamePart<'a>; 5]> = self.words.drain(index..).collect();
    for part in trailing {
        self.found_suffix_or_postfix(part, false);
    }
}
/// Removes every entry from `index` onwards from `words` (a list that has
/// not been merged into `self.words` yet) and records each one as a
/// generational or honorific suffix. Does nothing when `index` is not
/// inside `words`.
fn strip_unsaved_postfix(&mut self, words: &mut SmallVec<[NamePart<'a>; 5]>, index: usize) {
    if index >= words.len() {
        return;
    }
    // NOTE(review): `false` is passed as `expect_initials` here although
    // `handle_after_comma` computes `index` with `true` — confirm intended.
    words
        .drain(index..)
        .for_each(|part| self.found_suffix_or_postfix(part, false));
}
/// Records a word that was found after the name.
///
/// While no generation has been stored yet, the word is first offered to
/// `suffix::generation_from_suffix` (with `expect_initials` forwarded); if
/// that yields a generation, it is stored and the word is consumed. In every
/// other case the word is appended to `honorific_suffixes`.
fn found_suffix_or_postfix(&mut self, postfix: NamePart<'a>, expect_initials: bool) {
    let detected = if self.generation_from_suffix.is_none() {
        suffix::generation_from_suffix(&postfix, expect_initials)
    } else {
        None
    };

    match detected {
        Some(generation) => self.generation_from_suffix = Some(generation),
        None => self.honorific_suffixes.push(postfix),
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(feature = "bench")]
    use test::{black_box, Bencher};

    /// Parses `input` and checks that the first two words are `given` and
    /// `family`, that the surname starts at index 1, and that the generation
    /// equals `expected_generation`.
    fn assert_given_family(
        input: &str,
        given: &str,
        family: &str,
        expected_generation: Option<NonZeroU8>,
    ) {
        let name = parse(input).unwrap();
        assert_eq!(given, name.parts[0].word);
        assert_eq!(family, name.parts[1].word);
        assert_eq!(1, name.surname_index);
        assert_eq!(expected_generation, name.generation);
    }

    #[test]
    fn first_last() {
        assert_given_family("John Doe", "John", "Doe", None);
    }

    #[test]
    fn initial_last() {
        assert_given_family("J. Doe", "J.", "Doe", None);
    }

    #[test]
    fn last_first() {
        assert_given_family("Doe, John", "John", "Doe", None);
    }

    #[test]
    fn last_initial() {
        assert_given_family("Doe, J.", "J.", "Doe", None);
    }

    #[test]
    fn suffix() {
        assert_given_family("John Doe III", "John", "Doe", NonZeroU8::new(3));
    }

    #[test]
    fn suffix_comma() {
        assert_given_family("Doe, John III", "John", "Doe", NonZeroU8::new(3));
    }

    #[test]
    fn intermediate_suffix() {
        assert_given_family("Doe, II, John", "John", "Doe", NonZeroU8::new(2));
        assert_given_family("Griffey, Jr., Ken", "Ken", "Griffey", NonZeroU8::new(2));
    }

    #[test]
    fn honorifics() {
        // (input, expected honorific prefix, expected honorific suffix)
        let cases = [
            ("Lt Col Sir John Doe, X, YY, ZZZ", "Lt. Col. Sir", "X YY ZZZ"),
            ("Doe, Lt Col Sir John, X, YY, ZZZ", "Lt. Col. Sir", "X YY ZZZ"),
            (
                "Air Chief Marshal Sir Stuart William Peach, GBE, KCB, ADC, DL",
                "Air Chief Marshal Sir",
                "GBE KCB ADC DL",
            ),
            (
                "Air Chief Marshal Sir Stuart William Peach GBE KCB ADC DL",
                "Air Chief Marshal Sir",
                "GBE KCB ADC DL",
            ),
            (
                "Peach, Air Chief Marshal Sir Stuart William, GBE KCB ADC DL",
                "Air Chief Marshal Sir",
                "GBE KCB ADC DL",
            ),
        ];

        for &(input, expected_prefix, expected_suffix) in cases.iter() {
            let name = parse(input).unwrap();
            assert_eq!(expected_prefix, name.honorific_prefix().unwrap());
            assert_eq!(expected_suffix, name.honorific_suffix().unwrap());
        }
    }

    #[test]
    fn et_al() {
        for input in ["Dr. Jane Doe, et al", "DR JANE DOE ET AL"].iter() {
            let name = parse(input).unwrap();
            assert_eq!("et al.", name.honorific_suffix().unwrap());
        }
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn parse_simple(b: &mut Bencher) {
        b.iter(|| black_box(parse("John Doe").is_some()))
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn parse_nonascii(b: &mut Bencher) {
        b.iter(|| black_box(parse("이용희").is_some()))
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn parse_comma(b: &mut Bencher) {
        b.iter(|| black_box(parse("Doe, John").is_some()))
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn parse_all_caps(b: &mut Bencher) {
        b.iter(|| black_box(parse("JOHN DOE").is_some()))
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn parse_complex(b: &mut Bencher) {
        b.iter(|| black_box(parse("James S. Brown MD, FRCS, FDSRCS").is_some()))
    }
}