use super::case::*;
use super::nickname::have_matching_variants;
use super::transliterate;
use super::{Location, Name};
use std::borrow::Cow;
use std::convert::TryInto;
use std::iter;
use std::ops::Range;
use unicode_segmentation::UnicodeSegmentation;
pub const MIN_SURNAME_CHAR_MATCH: usize = 4;
pub const MIN_GIVEN_NAME_CHAR_MATCH: usize = 3;
impl Name {
/// Might this name represent the same person as another name?
///
/// # Examples
/// ```
/// use human_name::Name;
///
/// let j_doe = Name::parse("J. Doe").unwrap();
/// let jane_doe = Name::parse("Jane Doe").unwrap();
/// let john_m_doe = Name::parse("John M. Doe").unwrap();
/// let john_l_doe = Name::parse("John L. Doe").unwrap();
///
/// assert!(j_doe.consistent_with(&john_m_doe));
/// assert!(j_doe.consistent_with(&john_l_doe));
/// assert!(j_doe.consistent_with(&jane_doe));
/// assert!(j_doe.consistent_with(&j_doe));
/// assert!(!john_m_doe.consistent_with(&john_l_doe));
/// assert!(!jane_doe.consistent_with(&john_l_doe));
///
/// let zheng_he = Name::parse("Zheng He").unwrap();
/// let han_chars = Name::parse("鄭和").unwrap();
/// assert!(han_chars.consistent_with(&zheng_he));
/// ```
///
/// # Defining "consistency"
///
/// Requires that all known parts are consistent, which means at minimum,
/// the final words of the surnames match, and one ordered set of first
/// and middle initials is a superset of the other. If given and/or middle
/// names and/or suffixes are present in both names, they must match as well.
///
/// Transliterates everything to ASCII before comparison using the naive
/// algorithm of [unidecode](https://github.com/chowdhurya/rust-unidecode/)
/// (which ignores context), and ignores case, accents and combining marks.
///
/// In the case of given and middle names, allows one name to be a prefix of
/// the other, without requiring the prefix end at a word boundary as we do
/// with surname suffix matches, and supports matching a small number of
/// common nicknames and nickname patterns based on the root name.
///
/// # Limitations
///
/// There will be false positives ("Jan Doe" is probably not "Jane Doe"),
/// and false negatives ("James Hanson" might be "James Hansen"). And, of
/// course, even identical names do not necessarily represent the same person.
///
/// Given limited information, we err on the side of false positives. This
/// kind of matching will be most useful in cases where we already have
/// reason to believe that a single individual's name appears twice, and we
/// are trying to figure out exactly where, e.g. a particular author's index
/// in the list of authors of a co-authored paper.
pub fn consistent_with(&self, other: &Name) -> bool {
// Fast path
if self.surname_hash() != other.surname_hash() {
return false;
}
// Check given name(s) first because if we got this far, we know that
// at least the last characters of the surnames are consistent
self.given_and_middle_names_consistent(other)
&& self.surname_consistent(other)
&& self.suffix_consistent(other)
}
// Not clear why we have to `always` here but the performance difference is detectable
// and there's only one caller (though we call this twice)
#[inline(always)]
fn split_initials(&self) -> (char, usize) {
let mut initials = self.initials().chars();
let first = initials.next().unwrap();
let rest_count = initials.count();
(first, rest_count)
}
#[inline]
fn given_and_middle_names_consistent(&self, other: &Name) -> bool {
let (my_first, my_middle_count) = self.split_initials();
let (their_first, their_middle_count) = other.split_initials();
// Handle simple cases first, where we only have to worry about one name
// and/or initial.
if my_middle_count == 0 && their_middle_count == 0 {
match (self.given_name(), other.given_name()) {
(Some(my_name), Some(their_name)) => have_matching_variants(my_name, their_name),
_ => {
transliterate::to_ascii_initial(my_first)
== transliterate::to_ascii_initial(their_first)
}
}
}
// For the more complicated cases, we'll simplify things a bit by
// letting ourselves assume `self` has the more complete name.
else if my_middle_count >= their_middle_count {
self.given_and_middle_names_consistent_with_less_complete(other)
} else {
other.given_and_middle_names_consistent_with_less_complete(self)
}
}
#[inline]
fn given_names_or_initials(
&self,
) -> GivenNamesOrInitials<
impl Iterator<Item = (usize, char)> + '_,
impl Iterator<Item = (&str, Location)> + '_,
> {
GivenNamesOrInitials {
initials: self.initials().chars().enumerate(),
names_and_locations: self
.given_iter()
.zip(self.given_names_in_initials().iter().copied())
.peekable(),
}
}
#[inline(never)]
fn given_and_middle_names_consistent_with_less_complete(&self, other: &Name) -> bool {
// Check initials first
if !self.initials_consistent_with_less_complete(other) {
return false;
}
// Unless both versions of the name have given or middle names, we're done
if self.given_name_words == 0 || other.given_name_words == 0 {
return true;
}
// In the case where we have to compare multiple given or middle names,
// align words using their initials, and for each initial where we know
// the word for both names, transliterate to ASCII lowercase, then require
// that the words are an exact match, or that one is a prefix of the other,
// or that one is a recognized nickname or spelling variant of the other.
let missing_any_names = self.missing_any_name() || other.missing_any_name();
let mut their_parts = other.given_names_or_initials();
let mut suffix_for_prior_prefix_match: Option<String> = None;
let mut looked_up_nicknames = false;
let mut their_part_if_any = their_parts.next();
for my_part in self.given_names_or_initials() {
if let Some(ref their_part) = their_part_if_any {
let result = my_part.check_consistency(their_part, !looked_up_nicknames);
match result {
ComparisonResult::Inconsistent => {
// The names are inconsistent
return false;
}
ComparisonResult::DifferentInitials => {
// They don't have a word for this initial, so we don't
// advance the iterator for their words/initials
continue;
}
ComparisonResult::NicknameMatch => {
looked_up_nicknames = true;
}
ComparisonResult::PrefixOfOther(remaining_chars) => {
suffix_for_prior_prefix_match = Some(remaining_chars);
}
_ => {
// Any other kind of match; no-op, just continue
}
}
} else if missing_any_names {
// We've matched everything available, and will skip the check
// in the next block
return true;
} else if let Some(suffix) = suffix_for_prior_prefix_match {
// We've matched everything available, but we're not quite done.
//
// It's not uncommon for representations of a name to be inconsistent
// in whether two parts of a given name are separated by a space,
// a hyphen, or nothing (especially with transliterated names).
//
// This is one of the reasons we accept prefix-only matches for
// given & middle names. However, doing so potentially opens us up
// to more false positives, and we want to mitigate that.
//
// When it looks like we have a name part following a space or hyphen
// which might reasonably be treated as part of the same actual name
// as the preceding part, we don't want to just accept a prefix-only
// match on the preceding part as sufficient for a full match.
// Instead we'll continue after the prefix match by comparing the
// following part to the suffix, just as if the two parts hadn't
// been separated by a space or hyphen.
//
// This separates, e.g., "Jinli" from "Jin Yi", or "Xiaofeng" from
// "Xiao Peng".
//
// We don't try to do this in the presence of middle initials
// without corresponding names, because the logic would have to
// be even more complicated.
if let NameWordOrInitial::Word(word, _) = my_part {
return eq_casefolded_alpha_prefix(&suffix, word);
} else {
return true;
}
} else {
// We've matched everything available
return true;
}
let mut advance_by = my_part.initials_count();
while advance_by > 0 {
their_part_if_any = their_parts.next();
if let Some(ref their_part) = their_part_if_any {
advance_by -= their_part.initials_count();
} else {
break;
}
}
}
their_part_if_any.is_none()
}
#[inline]
fn initials_consistent_with_less_complete(&self, other: &Name) -> bool {
let (my_first, my_initials) = self.transliterated_initials();
let (their_first, their_initials) = other.transliterated_initials();
// Unless we have both given names, we require that the first initials
// match (if we do have the names, we'll check for nicknames, etc,
// which might violate this requirement.)
//
// If we do have both, we skip this requirement because we might find
// a nickname match.
if self.missing_given_name() || other.missing_given_name() {
if self.goes_by_middle_name() {
// Edge case: if we go by a middle name, another version of the
// name might use the middle name's initial as the first initial.
//
// However, only check in this direction because we know `self`
// has more complete initials, so their initials won't contain
// ours unless they're equal.
if !my_initials.contains(&*their_initials) {
return false;
}
} else {
// Normal case, given the absence of (true) given names
//
// Using byte offsets is ok because we already converted to ASCII
if my_first != their_first {
return false;
}
}
}
// If we have middle initials, check their consistency alone before
// looking at names (& we know "self" has the more complete initials).
my_initials[my_first.len_utf8()..].contains(&their_initials[their_first.len_utf8()..])
}
fn transliterated_initials(&self) -> (char, Cow<str>) {
let initials = self.initials();
if initials.is_ascii() {
(initials.as_bytes()[0].into(), Cow::Borrowed(initials))
} else {
let transliterated = self
.initials()
.chars()
.filter_map(transliterate::to_ascii_initial)
.collect::<String>();
if transliterated.is_empty() {
(initials.chars().next().unwrap(), Cow::Borrowed(initials))
} else {
(
transliterated.as_bytes()[0].into(),
Cow::Owned(transliterated),
)
}
}
}
fn missing_given_name(&self) -> bool {
if let Some(loc) = self.given_names_in_initials().get(0) {
loc.range().start > 0
} else {
true
}
}
fn missing_any_name(&self) -> bool {
let mut prev = 0;
for Range { start, end } in self.given_names_in_initials().iter().map(|loc| loc.range()) {
if start > prev {
return true;
} else {
prev = end;
}
}
// TODO: This is incorrect and should be `self.initials().len() > prev`
// but fixing this actually breaks a test. Related to our failure to handle
// PrefixOfSelf below?
usize::from(self.given_name_words) > prev
}
#[inline]
fn surname_consistent(&self, other: &Name) -> bool {
let mine = self.surname();
let theirs = other.surname();
if mine.is_ascii() && theirs.is_ascii() {
if self.surname_words == 1
&& other.surname_words == 1
&& mine.bytes().all(|b| b.is_ascii_alphabetic())
&& theirs.bytes().all(|b| b.is_ascii_alphabetic())
{
return mine.eq_ignore_ascii_case(theirs);
}
let filter = { |c: char| c.is_ascii_alphanumeric() };
Self::surname_consistent_slow(mine.rmatches(filter), theirs.rmatches(filter))
} else {
Self::surname_consistent_slow(mine.unicode_words().rev(), theirs.unicode_words().rev())
}
}
fn surname_consistent_slow<'a, I>(mut my_words: I, mut their_words: I) -> bool
where
I: Iterator<Item = &'a str>,
{
let mut my_word = my_words.next();
let mut their_word = their_words.next();
let mut matching_chars = 0;
// Require either an exact match (ignoring case etc), or a partial match
// of len >= MIN_SURNAME_CHAR_MATCH and breaking on a word boundary
loop {
// No words remaining for some surname - that's ok if it's true of
// both, or if the components that match are long enough
if my_word.is_none() && their_word.is_none() {
return true;
}
let my_chars = my_word.and_then(transliterate::to_ascii_casefolded_reversed);
let their_chars = their_word.and_then(transliterate::to_ascii_casefolded_reversed);
if my_chars.is_none() || their_chars.is_none() {
return matching_chars >= MIN_SURNAME_CHAR_MATCH || my_word == their_word;
}
let mut my_chars = my_chars.unwrap();
let mut their_chars = their_chars.unwrap();
let mut my_char = my_chars.next();
let mut their_char = their_chars.next();
loop {
if my_char.is_none() && their_char.is_none() {
// The words matched exactly, try the next word
my_word = my_words.next();
their_word = their_words.next();
break;
} else if my_char.is_none() {
// My word is a suffix of their word, check my next word
// against the rest of their word
my_word = my_words.next();
if let Some(chars) =
my_word.and_then(transliterate::to_ascii_casefolded_reversed)
{
// Continue the inner loop but incrementing through my
// next word
my_chars = chars;
my_char = my_chars.next();
} else {
// There is no next word, so this is a suffix-only match,
// and we don't allow those
return false;
}
} else if their_char.is_none() {
// Their word is a suffix of my word, check their next word
// against the rest of my_words
their_word = their_words.next();
if let Some(chars) =
their_word.and_then(transliterate::to_ascii_casefolded_reversed)
{
// Continue the inner loop but incrementing through their
// next word
their_chars = chars;
their_char = their_chars.next();
} else {
// There is no next word, so this is a suffix-only match,
// and we don't allow those
return false;
}
} else if my_char != their_char {
// We found a conflict and can short-circuit
return false;
} else {
// Characters matched, continue the inner loop
matching_chars += 1;
my_char = my_chars.next();
their_char = their_chars.next();
}
}
}
}
#[inline]
fn suffix_consistent(&self, other: &Name) -> bool {
match (self.generational_suffix(), other.generational_suffix()) {
(Some(mine), Some(theirs)) => mine == theirs,
_ => true,
}
}
}
#[derive(Eq, PartialEq, Debug)]
enum ComparisonResult {
Inconsistent,
DifferentInitials,
InitialsOnlyMatch,
ExactMatch,
PrefixOfOther(String),
PrefixOfSelf(String),
NicknameMatch,
}
impl<'a> NameWordOrInitial<'a> {
#[inline]
fn check_consistency(
&self,
other: &NameWordOrInitial,
allow_nicknames: bool,
) -> ComparisonResult {
#[inline]
fn fold_initial(c: char) -> char {
transliterate::to_ascii_initial(c).unwrap_or(c)
}
#[inline]
fn fold_rest(c: char, word: Option<&str>) -> Option<impl Iterator<Item = char> + '_> {
word.and_then(|w| transliterate::to_ascii_casefolded(&w[c.len_utf8()..]))
}
let (my_initial, their_initial) = (self.initial(), other.initial());
if fold_initial(my_initial) != fold_initial(their_initial) {
if self.word().is_some() && other.word().is_some() {
return ComparisonResult::Inconsistent;
} else {
return ComparisonResult::DifferentInitials;
}
}
let my_word = self.word();
let their_word = other.word();
// We just checked the initial matches, so skip ahead
let mut matched = 1;
let (my_chars, their_chars) = (
fold_rest(my_initial, my_word),
fold_rest(their_initial, their_word),
);
if my_chars.is_none() || their_chars.is_none() {
if my_word.is_some() && my_word == their_word {
return ComparisonResult::ExactMatch;
} else {
return ComparisonResult::InitialsOnlyMatch;
}
}
let (mut my_chars, mut their_chars) = (my_chars.unwrap(), their_chars.unwrap());
loop {
let my_char = my_chars.next();
let their_char = their_chars.next();
if my_char.is_none() && their_char.is_none() {
return ComparisonResult::ExactMatch;
} else if (my_char.is_none() || their_char.is_none())
&& matched >= MIN_GIVEN_NAME_CHAR_MATCH
{
if their_char.is_some() {
return ComparisonResult::PrefixOfOther(format!(
"{}{}",
their_char.unwrap(),
their_chars.collect::<String>()
));
} else {
return ComparisonResult::PrefixOfSelf(format!(
"{}{}",
my_char.unwrap(),
my_chars.collect::<String>()
));
}
} else if my_char != their_char {
// Failed match; abort, but first, maybe try nickname db
if allow_nicknames && have_matching_variants(my_word.unwrap(), their_word.unwrap())
{
return ComparisonResult::NicknameMatch;
} else {
return ComparisonResult::Inconsistent;
}
} else {
matched += 1;
}
}
}
#[inline]
fn initial(&self) -> char {
match *self {
NameWordOrInitial::Word(word, _) => word.chars().next().unwrap(),
NameWordOrInitial::Initial(initial) => initial,
}
}
#[inline]
fn word(&self) -> Option<&str> {
match *self {
NameWordOrInitial::Word(word, _) => Some(word),
NameWordOrInitial::Initial(_) => None,
}
}
#[inline]
fn initials_count(&self) -> i32 {
match *self {
NameWordOrInitial::Word(_, count) => count.try_into().unwrap(),
NameWordOrInitial::Initial(_) => 1,
}
}
}
struct GivenNamesOrInitials<'a, I, L>
where
I: Iterator<Item = (usize, char)>,
L: Iterator<Item = (&'a str, Location)>,
{
initials: I,
names_and_locations: iter::Peekable<L>,
}
#[derive(Debug)]
enum NameWordOrInitial<'a> {
Word(&'a str, usize),
Initial(char),
}
impl<'a, I, L> Iterator for GivenNamesOrInitials<'a, I, L>
where
I: Iterator<Item = (usize, char)>,
L: Iterator<Item = (&'a str, Location)>,
{
type Item = NameWordOrInitial<'a>;
fn next(&mut self) -> Option<NameWordOrInitial<'a>> {
self.initials.next().map(|(i, initial)| {
match self.names_and_locations.peek().map(|(_, loc)| loc.range()) {
Some(Range { start, end }) if start == i => {
let (word, _) = self.names_and_locations.next().unwrap();
// Handle case of hyphenated name for which we have 2+ initials
//
// When stabilized use `advance_by`: https://github.com/rust-lang/rust/issues/77404
let initials_for_word = end - start;
for _ in 1..initials_for_word {
self.initials.next();
}
NameWordOrInitial::Word(word, initials_for_word)
}
_ => NameWordOrInitial::Initial(initial),
}
})
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.initials.size_hint()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn non_bmp_alphas_simple() {
let a = NameWordOrInitial::Word("𐒴𐓘", 1);
assert_eq!(ComparisonResult::ExactMatch, a.check_consistency(&a, false));
assert_eq!(ComparisonResult::ExactMatch, a.check_consistency(&a, true));
let b = NameWordOrInitial::Word("𐓊𐓙", 1);
assert_eq!(
ComparisonResult::Inconsistent,
a.check_consistency(&b, false)
);
assert_eq!(
ComparisonResult::Inconsistent,
a.check_consistency(&b, true)
);
assert!(Name::surname_consistent_slow(
"𐒴𐓘".unicode_words().rev(),
"𐒴𐓘".unicode_words().rev()
));
assert!(!Name::surname_consistent_slow(
"𐒴𐓘".unicode_words().rev(),
"𐓊𐓙".unicode_words().rev()
));
let a = Name::parse("𐒴𐓘 𐓊𐓙").unwrap();
let b = Name::parse("𐒴𐓘 𐓍𐓙").unwrap();
assert!(a.consistent_with(&a));
assert!(!a.consistent_with(&b));
}
#[test]
fn non_bmp_alphas_with_middle_initial() {
let a = NameWordOrInitial::Initial('𐒴');
assert_eq!(
ComparisonResult::InitialsOnlyMatch,
a.check_consistency(&a, false)
);
assert_eq!(
ComparisonResult::InitialsOnlyMatch,
a.check_consistency(&a, true)
);
let b = NameWordOrInitial::Initial('𐒵');
assert_eq!(
ComparisonResult::DifferentInitials,
a.check_consistency(&b, true)
);
assert_eq!(
ComparisonResult::DifferentInitials,
a.check_consistency(&b, false)
);
let a = Name::parse("𐒴𐓘 𐓊𐓙").unwrap();
let b = Name::parse("𐒴𐓘 𐒵 𐓊𐓙").unwrap();
let c = Name::parse("𐒴𐓘 𐓍 𐓊𐓙").unwrap();
assert!(a.consistent_with(&b));
assert!(a.consistent_with(&c));
assert!(!b.consistent_with(&c));
}
#[test]
fn bug() {
let a = Name::parse("Peter Martin-Le Bore").unwrap();
let b = Name::parse("Peter Martin-Le Bore").unwrap();
assert!(a.consistent_with(&b));
}
}