1#![doc(html_root_url = "https://djudd.github.io/human-name/")]
6#![cfg_attr(feature = "bench", feature(test))]
7
8extern crate crossbeam_utils;
9extern crate smallvec;
10extern crate unicode_normalization;
11extern crate unicode_segmentation;
12extern crate unidecode;
13
14#[cfg(test)]
15#[cfg(feature = "bench")]
16extern crate test;
17
18#[cfg(test)]
19extern crate alloc_counter;
20
21mod case;
22mod comparison;
23mod decomposition;
24mod features;
25mod namecase;
26mod namepart;
27mod nickname;
28mod parse;
29mod segment;
30mod suffix;
31mod surname;
32mod title;
33mod transliterate;
34mod word;
35
36#[cfg(feature = "ffi")]
37pub mod external;
38
39#[cfg(feature = "name_eq_hash")]
40mod eq_hash;
41
42#[cfg(feature = "serialization")]
43mod serialization;
44
45use crate::decomposition::normalize_nfkd_whitespace;
46use crate::word::{Location, Words};
47use compact_str::CompactString;
48use crossbeam_utils::atomic::AtomicCell;
49use smallvec::SmallVec;
50use std::borrow::Cow;
51use std::collections::hash_map::DefaultHasher;
52use std::convert::TryInto;
53use std::hash::{Hash, Hasher};
54use std::num::NonZeroU8;
55
56#[cfg(test)]
57use alloc_counter::AllocCounterSystem;
58
// Test builds install `alloc_counter`'s allocator as the global allocator so
// the tests in this file can wrap code in `alloc_counter::deny_alloc`.
#[cfg(test)]
#[global_allocator]
static A: AllocCounterSystem = AllocCounterSystem;
62
/// Input size limit: `Name::parse` returns `None` for any input whose byte
/// length is greater than or equal to this value.
pub const MAX_NAME_LEN: usize = 1024;
/// Public alias for `segment::MAX_LEN`.
pub const MAX_SEGMENT_LEN: usize = segment::MAX_LEN;
/// Public alias for `parse::MAX_WORDS`.
pub const MAX_SEGMENTS: usize = parse::MAX_WORDS;
66
/// A parsed personal name, stored compactly.
///
/// All name text lives in a single buffer; individual words are addressed
/// through `Location` ranges into that buffer.
#[derive(Debug)]
pub struct Name {
    /// The formatted name (given-name words and initials, then surname words)
    /// immediately followed by the initials string; the last `initials_len`
    /// bytes are the initials.
    text: CompactString,
    /// Word locations in three consecutive groups: `given_name_words` ranges
    /// of given names in `text`, `surname_words` ranges of surname words in
    /// `text`, then the ranges of the given names within the initials string.
    locations: SmallVec<[Location; 6]>,
    /// Number of given-name words stored in `locations` (words that were
    /// parsed as initials are not counted).
    given_name_words: u8,
    /// Number of surname words stored in `locations`.
    surname_words: u8,
    /// Byte length of the initials string appended to `text`.
    initials_len: u8,
    /// Generational suffix code as produced by the parser, if any; rendered
    /// through `suffix::display_generational_suffix`.
    generation: Option<NonZeroU8>,
    /// Honorific prefix/suffix; only allocated when at least one is present.
    honorifics: Option<Box<Honorifics>>,
    /// Lazily computed cache used by `surname_hash`.
    surname_hash: AtomicCell<Option<u32>>,
}
95
/// Honorific text attached to a `Name`; boxed inside `Name` and only created
/// when a prefix or a suffix was parsed.
#[derive(Clone, Debug)]
struct Honorifics {
    /// Honorific placed before the name when displaying it with honorifics.
    prefix: Option<Box<str>>,
    /// Honorific placed after the name when displaying it with honorifics.
    suffix: Option<Box<str>>,
}
101
102impl Clone for Name {
103 fn clone(&self) -> Self {
104 Name {
105 text: self.text.clone(),
106 locations: self.locations.clone(),
107 given_name_words: self.given_name_words,
108 surname_words: self.surname_words,
109 initials_len: self.initials_len,
110 generation: self.generation,
111 honorifics: self.honorifics.clone(),
112 surname_hash: Default::default(),
113 }
114 }
115}
116
117impl Name {
118 pub fn parse(name: &str) -> Option<Name> {
179 if name.len() >= MAX_NAME_LEN {
180 return None;
181 }
182
183 let name = normalize_nfkd_whitespace(name);
184 let name = nickname::strip_nickname(&name);
185 let parsed = parse::parse(&name)?;
186
187 Name::initialize_struct(&parsed, name.len())
188 }
189
/// Builds the compact `Name` representation from parser output.
///
/// `name_len` is only used as a capacity hint for the text buffer.
///
/// Returns `None` if any `Location::new` call returns `None`, or if a word
/// count or the initials length does not fit in a `u8`.
fn initialize_struct(parsed: &parse::Name, name_len: usize) -> Option<Name> {
    let words = parsed.words();
    let surname_index = parsed.surname_index;

    let mut text = CompactString::with_capacity(name_len + surname_index);
    let mut initials = CompactString::with_capacity(surname_index);

    let mut locations = SmallVec::with_capacity(words.len() + surname_index);
    let mut locations_in_initials: SmallVec<[Location; 4]> =
        SmallVec::with_capacity(surname_index);

    // Words before `surname_index` are given names or initials.
    for word in &words[..surname_index] {
        if word.is_initials() {
            // Initials are written to `text` as "X. " and appended to
            // `initials`, but get no entry in `locations`.
            word.with_initials(|c| {
                text.push(c);
                text.push_str(". ");

                initials.push(c);
            });
        } else {
            // Record where the name-cased word lands in `text`...
            let prior_len = text.len();
            word.with_namecased(|s| text.push_str(s));
            locations.push(Location::new(prior_len..text.len())?);

            // ...and where its initial(s) land in `initials`.
            let prior_len = initials.len();
            word.with_initials(|c| initials.push(c));
            locations_in_initials.push(Location::new(prior_len..initials.len())?);

            text.push(' ');
        }
    }

    // Surname words follow, separated by single spaces with no trailing space.
    let surname_words = &words[surname_index..];
    for (i, word) in surname_words.iter().enumerate() {
        let prior_len = text.len();
        word.with_namecased(|s| text.push_str(s));
        locations.push(Location::new(prior_len..text.len())?);

        if i < surname_words.len() - 1 {
            text.push(' ');
        }
    }

    debug_assert!(!text.is_empty(), "Names are empty!");
    debug_assert!(!initials.is_empty(), "Initials are empty!");

    let generation = parsed.generation;
    // Only allocate the honorifics box when there is something to store.
    let honorifics = {
        let prefix = parsed
            .honorific_prefix()
            .map(|s| s.into_owned().into_boxed_str());
        let suffix = parsed
            .honorific_suffix()
            .map(|s| s.into_owned().into_boxed_str());

        if prefix.is_some() || suffix.is_some() {
            Some(Box::new(Honorifics { prefix, suffix }))
        } else {
            None
        }
    };

    // At this point `locations` holds given names then surnames, and
    // `locations_in_initials` has one entry per given name.
    let surname_words = (locations.len() - locations_in_initials.len())
        .try_into()
        .ok()?;
    let given_name_words = locations_in_initials.len().try_into().ok()?;
    let initials_len = initials.len().try_into().ok()?;

    // The initials string is stored at the tail of `text`.
    text.push_str(&initials);
    text.shrink_to_fit();

    // The locations within the initials are stored at the tail of `locations`.
    locations.extend_from_slice(&locations_in_initials);
    locations.shrink_to_fit();

    Some(Name {
        text,
        locations,
        given_name_words,
        surname_words,
        initials_len,
        generation,
        honorifics,
        surname_hash: Default::default(),
    })
}
275
276 pub fn first_initial(&self) -> char {
278 self.initials().chars().next().unwrap()
279 }
280
281 pub fn given_name(&self) -> Option<&str> {
293 self.given_iter().next()
294 }
295
296 pub fn goes_by_middle_name(&self) -> bool {
311 if let Some(loc) = self.given_names_in_initials().first() {
312 loc.range().start > 0
313 } else {
314 false
315 }
316 }
317
318 #[inline]
330 pub fn initials(&self) -> &str {
331 &self.text[self.name_bytes()..]
332 }
333
334 pub fn middle_names(&self) -> Option<SmallVec<[&str; 3]>> {
336 self.middle_name_iter().map(|i| i.collect())
337 }
338
339 pub fn middle_name(&self) -> Option<Cow<str>> {
357 self.middle_name_iter().map(|i| i.join())
358 }
359
360 pub fn middle_initials(&self) -> Option<&str> {
378 self.initials()
379 .char_indices()
380 .nth(1)
381 .map(|(i, _)| &self.text[self.name_bytes() + i..])
382 }
383
384 pub fn surnames(&self) -> SmallVec<[&str; 3]> {
386 self.surname_iter().collect()
387 }
388
389 pub fn surname(&self) -> &str {
401 let start = self.surname_locations()[0].range().start;
402 let end = self.surname_end_in_text();
403 &self.text[start..end]
404 }
405
406 pub fn generational_suffix(&self) -> Option<&str> {
415 self.generation.map(suffix::display_generational_suffix)
416 }
417
418 pub fn honorific_prefix(&self) -> Option<&str> {
427 self.honorifics
428 .as_ref()
429 .and_then(|h| h.prefix.as_ref())
430 .map(|p| p.as_ref())
431 }
432
433 pub fn honorific_suffix(&self) -> Option<&str> {
442 self.honorifics
443 .as_ref()
444 .and_then(|h| h.suffix.as_ref())
445 .map(|s| s.as_ref())
446 }
447
448 pub fn display_initial_surname(&self) -> Cow<str> {
463 if self.given_name_words == 0 && self.initials_len == 1 {
464 Cow::Borrowed(&self.text[..self.surname_end_in_text()])
465 } else {
466 Cow::Owned(format!("{}. {}", self.first_initial(), self.surname()))
467 }
468 }
469
470 pub fn display_first_last(&self) -> Cow<str> {
489 if self.given_name_words <= 1 && self.initials_len == 1 {
490 Cow::Borrowed(&self.text[..self.surname_end_in_text()])
491 } else if let Some(ref name) = self.given_name() {
492 Cow::Owned(format!("{} {}", name, self.surname()))
493 } else {
494 self.display_initial_surname()
495 }
496 }
497
498 #[inline]
510 pub fn byte_len(&self) -> usize {
511 const SEPARATOR_LEN: usize = ", ".len();
512
513 self.name_bytes()
514 + self
515 .generational_suffix()
516 .map(|g| g.len() + SEPARATOR_LEN)
517 .unwrap_or(0)
518 }
519
520 #[inline]
521 fn name_bytes(&self) -> usize {
522 self.text.len() - usize::from(self.initials_len)
523 }
524
525 #[inline]
540 pub fn display_full(&self) -> Cow<str> {
541 let name = &self.text[..self.name_bytes()];
542 if let Some(suffix) = self.generational_suffix() {
543 let mut result = name.to_string();
544 result.push_str(", ");
545 result.push_str(suffix);
546 Cow::Owned(result)
547 } else {
548 Cow::Borrowed(name)
549 }
550 }
551
552 pub fn display_full_with_honorifics(&self) -> Cow<str> {
566 if let Some(honorifics) = self.honorifics.as_ref() {
567 let mut result = String::with_capacity(
568 honorifics.prefix.as_ref().map(|t| t.len() + 1).unwrap_or(0)
569 + self.byte_len()
570 + honorifics.suffix.as_ref().map(|t| t.len() + 1).unwrap_or(0),
571 );
572 if let Some(prefix) = &honorifics.prefix {
573 result.push_str(prefix);
574 result.push(' ');
575 }
576 result.push_str(&self.display_full());
577 if let Some(suffix) = &honorifics.suffix {
578 result.push(' ');
579 result.push_str(suffix);
580 }
581 Cow::Owned(result)
582 } else {
583 self.display_full()
584 }
585 }
586
587 pub fn surname_hash(&self) -> u64 {
610 if let Some(hash) = self.surname_hash.load() {
611 return hash.into();
612 }
613
614 let mut s = DefaultHasher::new();
615 self.hash_surname(&mut s);
616
617 let hash = s.finish() as u32;
620 self.surname_hash.store(Some(hash));
621 hash.into()
622 }
623
624 fn hash_surname<H: Hasher>(&self, state: &mut H) {
625 for c in self
626 .surname_iter()
627 .rev()
628 .flat_map(|word| {
629 transliterate::to_ascii_casefolded_reversed(word)
630 .into_iter()
631 .flatten()
632 })
633 .take(comparison::MIN_SURNAME_CHAR_MATCH)
634 {
635 c.hash(state);
636 }
637 }
638
639 #[inline]
640 fn surname_end_in_text(&self) -> usize {
641 self.surname_locations()[usize::from(self.surname_words) - 1]
642 .range()
643 .end
644 }
645
646 #[inline]
647 fn surname_iter(
648 &self,
649 ) -> Words<impl Iterator<Item = Location> + DoubleEndedIterator + ExactSizeIterator + '_> {
650 self.word_iter(self.surname_locations())
651 }
652
653 #[inline]
654 fn middle_name_iter(
655 &self,
656 ) -> Option<Words<impl Iterator<Item = Location> + DoubleEndedIterator + ExactSizeIterator + '_>>
657 {
658 if self.given_name_words > 1 {
659 Some(self.word_iter(&self.given_name_locations()[1..]))
660 } else {
661 None
662 }
663 }
664
665 #[inline]
666 fn given_iter(
667 &self,
668 ) -> Words<impl Iterator<Item = Location> + DoubleEndedIterator + ExactSizeIterator + '_> {
669 self.word_iter(self.given_name_locations())
670 }
671
672 #[inline]
673 fn word_iter<'a>(
674 &'a self,
675 locations: &'a [Location],
676 ) -> Words<'_, impl Iterator<Item = Location> + DoubleEndedIterator + ExactSizeIterator + '_>
677 {
678 Words::new(&self.text, locations.iter().copied())
679 }
680
681 #[inline]
682 fn given_name_locations(&self) -> &[Location] {
683 &self.locations[..self.given_name_words.into()]
684 }
685
686 #[inline]
687 fn surname_locations(&self) -> &[Location] {
688 &self.locations
689 [self.given_name_words.into()..(self.given_name_words + self.surname_words).into()]
690 }
691
692 #[inline]
693 fn given_names_in_initials(&self) -> &[Location] {
694 &self.locations[(self.given_name_words + self.surname_words).into()..]
695 }
696}
697
698#[cfg(test)]
699mod tests {
700 use super::*;
701 use alloc_counter::deny_alloc;
702
703 #[cfg(feature = "bench")]
704 use test::{black_box, Bencher};
705
// Pins the in-memory size of the core types on 64-bit targets so that an
// accidental layout regression is caught.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
#[test]
fn struct_size() {
    use std::mem::size_of;

    assert_eq!(80, size_of::<Name>());
    assert_eq!(32, size_of::<Honorifics>());
}
712
// Parsing these simple names must run to completion inside `deny_alloc`.
#[test]
fn fast_path_parse_does_not_allocate() {
    for input in ["Jane Doe", "J. Doe"].iter() {
        deny_alloc(|| Name::parse(input).unwrap());
    }
}
718
// Comparing simple names must run to completion inside `deny_alloc`,
// both for a mismatch and for a match.
#[test]
fn fast_path_eq_does_not_allocate() {
    let jane = Name::parse("Jane Doe").unwrap();
    let john = Name::parse("John Doe").unwrap();
    let initial_only = Name::parse("J. Doe").unwrap();

    deny_alloc(|| {
        assert!(!jane.consistent_with(&john));
        assert!(jane.consistent_with(&initial_only));
    });
}
729
// An input made up mostly of combining characters must be rejected
// rather than parsed.
#[test]
fn parse_high_proportion_of_combining_chars() {
    assert!(Name::parse(".ΰ\u{330}\u{610}`").is_none());
}
735
// Regression input with one extremely long word: parsing must succeed and
// `display_full_with_honorifics` must produce the expected text.
#[test]
fn parse_very_long_honorific_prefix() {
    let name = Name::parse("%%%%%hLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLe pl Puc");
    assert_eq!("H. Lllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllll. E. P. L. Puc", name.unwrap().display_full_with_honorifics());
}
742
// Two names sharing a surname but with different first words, one of
// which contains a digit, must not be considered consistent.
#[test]
fn eq_non_alphanumeric_initials() {
    let with_control_chars = Name::parse("\u{3}\n\u{4}\u{19}Joo\n'lA").unwrap();
    let with_digit = Name::parse("H8\n'lA/").unwrap();

    assert!(!with_control_chars.consistent_with(&with_digit));
}
750
// A name whose first word is a lone combining mark must still parse,
// and must not be considered consistent with an ordinary name that
// shares its surname.
#[test]
fn eq_empty_transliterated_initials() {
    let ordinary = Name::parse("Ng\nmac").unwrap();
    let combining_mark_only = Name::parse("\u{65c}\nmac\n").unwrap();

    assert!(!ordinary.consistent_with(&combining_mark_only));
}
758
// Digits alone do not make a name; digits attached to or next to a given
// name do not affect consistency, while digits attached to the surname do.
#[test]
fn digits() {
    assert!(Name::parse("111 222").is_none());

    // (left input, right input, expected `consistent_with` result)
    let cases: [(&str, &str, bool); 5] = [
        ("One-1 Ones", "One-2 Ones", true),
        ("One Ones-1", "One Ones-2", false),
        ("One Ones1", "One Ones2", false),
        ("One1 Ones", "One2 Ones", true),
        ("One 1 Ones", "One 2 Ones", true),
    ];

    for &(left, right, expected) in cases.iter() {
        let a = Name::parse(left).unwrap();
        let b = Name::parse(right).unwrap();
        assert_eq!(
            expected,
            a.consistent_with(&b),
            "{:?} vs {:?}",
            left,
            right
        );
    }
}
784
// Exercises parsing, display and comparison with alphabetic characters
// outside the Basic Multilingual Plane.
#[test]
fn non_bmp_alphas() {
    // A single-character middle word is left out of `display_first_last`
    // and does not prevent consistency.
    let a = Name::parse("𐒴𐓘 𐓊𐓙").unwrap();
    let b = Name::parse("𐒴𐓘 𐒵 𐓊𐓙").unwrap();
    assert_eq!("𐒴𐓘 𐓊𐓙", a.display_first_last());
    assert_eq!("𐒴𐓘 𐓊𐓙", b.display_first_last());
    assert!(a.consistent_with(&b));

    // A different last word is not consistent.
    let c = Name::parse("𐒴𐓘 𐒵𐓙").unwrap();
    assert_eq!("𐒴𐓘 𐒵𐓙", c.display_first_last());
    assert!(!a.consistent_with(&c));

    // A different single-character middle word: consistent with the name
    // that has no middle word, but not with the one whose middle word differs.
    let d = Name::parse("𐒴𐓘 𐓍 𐓊𐓙").unwrap();
    assert_eq!("𐒴𐓘 𐓊𐓙", d.display_first_last());
    assert!(a.consistent_with(&d));
    assert!(!b.consistent_with(&d));

    // Three spellings of the same three parts (hyphen after the first part,
    // hyphen after the second part, spaces only) all display the same way
    // and are consistent with themselves and with each other.
    let a = Name::parse("𐒴𐓘-𐓊𐓙 𐓍𐓙").unwrap();
    assert_eq!("𐒴𐓘 𐓍𐓙", a.display_first_last());
    assert!(a.consistent_with(&a));
    let b = Name::parse("𐒴𐓘 𐓊𐓙-𐓍𐓙").unwrap();
    assert_eq!("𐒴𐓘 𐓍𐓙", b.display_first_last());
    assert!(b.consistent_with(&b));
    let c = Name::parse("𐒴𐓘 𐓊𐓙 𐓍𐓙").unwrap();
    assert_eq!("𐒴𐓘 𐓍𐓙", c.display_first_last());
    assert!(c.consistent_with(&c));

    assert!(a.consistent_with(&b));
    assert!(a.consistent_with(&c));
    assert!(b.consistent_with(&c));
}
816
// Regression input mixing combining marks, parentheses and a NUL
// character: it must be rejected without panicking.
#[test]
fn stops_being_nfkd() {
    let parsed = Name::parse("\u{5c4}((\0)\u{64f}()()\u{5c4}\u{64f}\u{612}");
    assert!(parsed.is_none());
}
824
// Emoji alone do not make a name, and emoji attached to or placed between
// name words do not affect consistency.
#[test]
fn emojis() {
    assert!(Name::parse("😃 😃").is_none());

    // Each pair differs only in which emoji appears; every pair must be
    // consistent. (The first pair used to be asserted twice; the exact
    // duplicate has been dropped.)
    let pairs: [(&str, &str); 5] = [
        ("smile-😃 smiley", "smile-😰 smiley"),
        ("smile smiley-😃", "smile smiley-😰"),
        ("smile 😃 smiley", "smile 😰 smiley"),
        ("smile😃 smiley", "smile😰 smiley"),
        ("smile smiley😃", "smile smiley😰"),
    ];

    for &(left, right) in pairs.iter() {
        let a = Name::parse(left).unwrap();
        let b = Name::parse(right).unwrap();
        assert!(
            a.consistent_with(&b),
            "{:?} should be consistent with {:?}",
            left,
            right
        );
    }
}
854
// Benchmarks `Name::initialize_struct` alone (parsing happens once, outside
// the timed closure) for an initial-plus-surname input.
#[cfg(feature = "bench")]
#[bench]
fn initialize_struct_initial_surname(b: &mut Bencher) {
    let input = "J. Doe";
    let parsed = parse::parse(input).unwrap();
    b.iter(|| {
        let name = Name::initialize_struct(&parsed, input.len()).unwrap();
        black_box(name.byte_len())
    })
}
868
// Benchmarks `Name::initialize_struct` alone (parsing happens once, outside
// the timed closure) for a first-name-plus-surname input.
#[cfg(feature = "bench")]
#[bench]
fn initialize_struct_first_last(b: &mut Bencher) {
    let input = "John Doe";
    let parsed = parse::parse(input).unwrap();
    b.iter(|| {
        let name = Name::initialize_struct(&parsed, input.len()).unwrap();
        black_box(name.byte_len())
    })
}
882
// Benchmarks `Name::initialize_struct` alone (parsing happens once, outside
// the timed closure) for an input with several given names, initials, a
// multi-word surname and a suffix.
#[cfg(feature = "bench")]
#[bench]
fn initialize_struct_complex(b: &mut Bencher) {
    let input = "John Allen Q.R. de la MacDonald Jr.";
    let parsed = parse::parse(input).unwrap();
    b.iter(|| {
        let name = Name::initialize_struct(&parsed, input.len()).unwrap();
        black_box(name.byte_len())
    })
}
896}
897
898#[cfg(feature = "bench")]
899#[cfg(test)]
900mod bench {
901 use super::Name;
902 use std::fs::File;
903 use std::io::prelude::*;
904 use std::io::BufReader;
905
906 #[cfg(feature = "bench")]
907 use test::{black_box, Bencher};
908
909 #[bench]
910 fn bench_parsing_first_last(b: &mut Bencher) {
911 b.iter(|| {
912 let parsed = Name::parse("Juan Garcia");
913 black_box(parsed.is_none())
914 })
915 }
916
917 #[bench]
918 fn bench_parsing_sort_order(b: &mut Bencher) {
919 b.iter(|| {
920 let parsed = Name::parse("Garcia, J.Q.");
921 black_box(parsed.is_none())
922 })
923 }
924
925 #[bench]
926 fn bench_parsing_needs_namecase(b: &mut Bencher) {
927 b.iter(|| {
928 let parsed = Name::parse("JAIME GARCIA");
929 black_box(parsed.is_none())
930 })
931 }
932
933 #[bench]
934 fn bench_parsing_unparseable(b: &mut Bencher) {
935 b.iter(|| {
936 let parsed = Name::parse("foo@bar.com");
937 black_box(parsed.is_none())
938 })
939 }
940
941 #[bench]
942 fn bench_parsing_complex(b: &mut Bencher) {
943 let name = "鈴木 Velasquez y Garcia, Dr. Juan Q. 'Don Juan' Xavier III";
944 b.iter(|| {
945 let parsed = Name::parse(name);
946 black_box(parsed.is_none())
947 })
948 }
949
950 #[bench]
951 fn bench_equality_equal(b: &mut Bencher) {
952 let x = Name::parse("Jane Doe").unwrap();
953 let y = Name::parse("Jane H. Doe").unwrap();
954
955 b.iter(|| black_box(x.consistent_with(&y)))
956 }
957
958 #[bench]
959 fn bench_equality_not_equal(b: &mut Bencher) {
960 let x = Name::parse("Jane Doe").unwrap();
961 let y = Name::parse("Foo Bar").unwrap();
962
963 b.iter(|| black_box(x.consistent_with(&y)))
964 }
965
966 #[bench]
967 fn bench_equality_close_to_equal(b: &mut Bencher) {
968 let x = Name::parse("Jane Doe").unwrap();
969 let y = Name::parse("John Doe").unwrap();
970
971 b.iter(|| black_box(x.consistent_with(&y)))
972 }
973
974 #[bench]
975 fn bench_parsing_many(b: &mut Bencher) {
976 let f = File::open("tests/benchmark-names.txt").ok().unwrap();
977 let reader = BufReader::new(f);
978 let names: Vec<String> = reader.lines().map(|l| l.ok().unwrap()).collect();
979
980 b.iter(move || {
981 let mut valid = 0;
982 let mut invalid = 0;
983
984 for name in names.iter() {
985 let parsed = Name::parse(&name);
986 if parsed.is_none() {
987 invalid += 1;
988 } else {
989 valid += 1;
990 }
991 }
992
993 black_box(valid);
994 black_box(invalid);
995 })
996 }
997
998 #[bench]
999 fn bench_equality_many(b: &mut Bencher) {
1000 let f = File::open("tests/benchmark-names.txt").ok().unwrap();
1001 let reader = BufReader::new(f);
1002 let names: Vec<Name> = reader
1003 .lines()
1004 .filter_map(|l| Name::parse(&l.ok().unwrap()))
1005 .collect();
1006
1007 b.iter(|| {
1008 let mut matches = 0;
1009
1010 for a in &names[..64] {
1011 for b in &names {
1012 if a.consistent_with(&b) {
1013 matches += 1;
1014 }
1015 }
1016 }
1017
1018 black_box(matches);
1019 })
1020 }
1021}