1#![cfg_attr(feature = "no_std", no_std)]
71#![cfg_attr(test, feature(test))]
72#![deny(missing_docs, unsafe_code)]
73
74#[cfg(all(test, feature = "no_std"))]
75#[macro_use]
76extern crate std;
77
78use core::{char, fmt};
79use generated::{
80 LONGEST_NAME_LEN, PHRASEBOOK_OFFSETS1, PHRASEBOOK_OFFSETS2, PHRASEBOOK_OFFSET_SHIFT,
81};
82
83#[allow(dead_code)]
84#[rustfmt::skip]
85#[allow(clippy::all)]
86mod generated {
87 include!(concat!(env!("OUT_DIR"), "/generated.rs"));
88}
89#[allow(dead_code)]
90#[rustfmt::skip]
91#[allow(clippy::all)]
92mod generated_phf {
93 include!(concat!(env!("OUT_DIR"), "/generated_phf.rs"));
94}
95#[allow(dead_code)]
96mod jamo;
97
98#[allow(dead_code)]
101static ALIASES: phf::Map<&'static [u8], char> =
102 include!(concat!(env!("OUT_DIR"), "/generated_alias.rs"));
103
104mod iter_str;
105
// Prefix emitted in front of the jamo short names of a Hangul syllable.
static HANGUL_SYLLABLE_PREFIX: &str = "HANGUL SYLLABLE ";
// The same prefix as it appears in a search name after `normalise_name`.
static NORMALISED_HANGUL_SYLLABLE_PREFIX: &str = "HANGULSYLLABLE";
// Prefix emitted in front of the hexadecimal digits of a CJK unified ideograph.
static CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJK UNIFIED IDEOGRAPH-";
// The same prefix as it appears in a search name after `normalise_name`.
static NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJKUNIFIEDIDEOGRAPH";
110
111fn is_cjk_unified_ideograph(ch: char) -> bool {
112 generated::CJK_IDEOGRAPH_RANGES
113 .iter()
114 .any(|&(lo, hi)| lo <= ch && ch <= hi)
115}
116
117#[derive(Clone)]
129pub struct Name {
130 data: Name_,
131}
132#[allow(clippy::upper_case_acronyms)]
133#[derive(Clone)]
134enum Name_ {
135 Plain(iter_str::IterStr),
136 CJK(CJK),
137 Hangul(Hangul),
138}
139
/// Iteration state for the name of a CJK unified ideograph.
#[allow(clippy::upper_case_acronyms)]
#[derive(Clone, Copy)]
struct CJK {
    /// `true` until the fixed prefix has been yielded.
    emit_prefix: bool,
    /// Index into `data` of the next hexadecimal digit to yield.
    idx: u8,
    /// Hexadecimal digit values, right-aligned in the array.
    data: [u8; 6],
}

/// Iteration state for the name of a Hangul syllable.
#[derive(Clone, Copy)]
struct Hangul {
    /// `true` until the fixed prefix has been yielded.
    emit_prefix: bool,
    /// Index into `data` of the next jamo to yield.
    idx: u8,
    /// Indices into the choseong, jungseong and jongseong tables, in that order.
    data: [u8; 3],
}
166
167#[allow(clippy::len_without_is_empty)]
168impl Name {
169 pub fn len(&self) -> usize {
174 let counted = self.clone();
175 counted.fold(0, |a, s| a + s.len())
176 }
177}
178
179impl Iterator for Name {
180 type Item = &'static str;
181
182 fn next(&mut self) -> Option<&'static str> {
183 match self.data {
184 Name_::Plain(ref mut s) => s.next(),
185 Name_::CJK(ref mut state) => {
186 if state.emit_prefix {
188 state.emit_prefix = false;
189 return Some(CJK_UNIFIED_IDEOGRAPH_PREFIX);
190 }
191 state
195 .data
196 .get(state.idx as usize)
197 .map(|digit| *digit as usize)
199 .map(|d| {
200 state.idx += 1;
201 static DIGITS: &str = "0123456789ABCDEF";
202 &DIGITS[d..d + 1]
203 })
204 }
205 Name_::Hangul(ref mut state) => {
206 if state.emit_prefix {
207 state.emit_prefix = false;
208 return Some(HANGUL_SYLLABLE_PREFIX);
209 }
210
211 let idx = state.idx as usize;
212 state.data.get(idx).map(|x| *x as usize).map(|x| {
213 state.idx += 1;
215 [jamo::CHOSEONG, jamo::JUNGSEONG, jamo::JONGSEONG][idx][x]
216 })
217 }
218 }
219 }
220
221 fn size_hint(&self) -> (usize, Option<usize>) {
222 let counted = self.clone();
224 let n = counted.count();
225 (n, Some(n))
226 }
227}
228
229impl fmt::Debug for Name {
230 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
231 fmt::Display::fmt(self, fmtr)
232 }
233}
234impl fmt::Display for Name {
235 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
236 let printed = self.clone();
237 for s in printed {
238 write!(fmtr, "{}", s)?
239 }
240 Ok(())
241 }
242}
243
244pub fn name(c: char) -> Option<Name> {
263 let cc = c as usize;
264 let offset =
265 (PHRASEBOOK_OFFSETS1[cc >> PHRASEBOOK_OFFSET_SHIFT] as usize) << PHRASEBOOK_OFFSET_SHIFT;
266
267 let mask = (1 << PHRASEBOOK_OFFSET_SHIFT) - 1;
268 let offset = PHRASEBOOK_OFFSETS2[offset + (cc & mask)];
269 if offset == 0 {
270 if is_cjk_unified_ideograph(c) {
271 let mut data = [b'0'; 6];
273 let mut number = c as u32;
274 let mut data_start = 6;
275 for place in data.iter_mut().rev() {
276 if number == 0 {
279 break;
280 }
281 *place = (number % 16) as u8;
282 number /= 16;
283 data_start -= 1;
284 }
285 Some(Name {
286 data: Name_::CJK(CJK {
287 emit_prefix: true,
288 idx: data_start,
289 data,
290 }),
291 })
292 } else {
293 jamo::syllable_decomposition(c).map(|(ch, ju, jo)| Name {
295 data: Name_::Hangul(Hangul {
296 emit_prefix: true,
297 idx: 0,
298 data: [ch, ju, jo],
299 }),
300 })
301 }
302 } else {
303 Some(Name {
304 data: Name_::Plain(iter_str::IterStr::new(offset as usize)),
305 })
306 }
307}
308
309fn fnv_hash<I: Iterator<Item = u8>>(x: I) -> u64 {
310 let mut g = 0xcbf29ce484222325 ^ generated_phf::NAME2CODE_N;
311 for b in x {
312 g ^= b as u64;
313 g = g.wrapping_mul(0x100000001b3);
314 }
315 g
316}
/// Combines the two hash parts with a displacement pair as
/// `d2 + f1 * d1 + f2`, using wrapping `u32` arithmetic throughout.
fn displace(f1: u32, f2: u32, d1: u32, d2: u32) -> u32 {
    let scaled = f1.wrapping_mul(d1);
    d2.wrapping_add(scaled).wrapping_add(f2)
}
/// Cuts the low 63 bits of `hash` into three 21-bit values, least
/// significant group first.
fn split(hash: u64) -> (u32, u32, u32) {
    const BITS: u32 = 21;
    const MASK: u64 = (1 << BITS) - 1;

    let low = hash & MASK;
    let middle = (hash >> BITS) & MASK;
    let high = (hash >> (2 * BITS)) & MASK;
    (low as u32, middle as u32, high as u32)
}
329
330fn character_by_alias(name: &[u8]) -> Option<char> {
332 ALIASES.get(name).copied()
333}
334
335pub fn character(search_name: &str) -> Option<char> {
355 let original_name = search_name;
356 let mut buf = [0; LONGEST_NAME_LEN];
357 let len = normalise_name(search_name, &mut buf);
358 let search_name = &buf[..len];
359
360 if search_name.starts_with(NORMALISED_HANGUL_SYLLABLE_PREFIX.as_bytes()) {
362 let remaining = &search_name[NORMALISED_HANGUL_SYLLABLE_PREFIX.len()..];
363 let (choseong, remaining) = jamo::slice_shift_choseong(remaining);
364 let (jungseong, remaining) = jamo::slice_shift_jungseong(remaining);
365 let (jongseong, remaining) = jamo::slice_shift_jongseong(remaining);
366 match (choseong, jungseong, jongseong, remaining) {
367 (Some(choseong), Some(jungseong), Some(jongseong), b"") => {
368 let c = 0xac00 + (choseong * 21 + jungseong) * 28 + jongseong;
369 return char::from_u32(c);
370 }
371 (_, _, _, _) => {
372 return None;
375 }
376 }
377 }
378
379 if search_name.starts_with(NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.as_bytes()) {
381 let remaining = &search_name[NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.len()..];
382 if remaining.len() > 5 {
383 return None;
384 } let mut v = 0u32;
387 for &c in remaining {
388 v = match c {
389 b'0'..=b'9' => (v << 4) | (c - b'0') as u32,
390 b'A'..=b'F' => (v << 4) | (c - b'A' + 10) as u32,
391 _ => return None,
392 }
393 }
394 let ch = char::from_u32(v)?;
395
396 if is_cjk_unified_ideograph(ch) {
398 return Some(ch);
399 } else {
400 return None;
403 }
404 }
405
406 let (g, f1, f2) = split(fnv_hash(search_name.iter().copied()));
408 let (d1, d2) = generated_phf::NAME2CODE_DISP[g as usize % generated_phf::NAME2CODE_DISP.len()];
410
411 let idx = displace(f1, f2, d1 as u32, d2 as u32) as usize;
413 let codepoint = generated_phf::NAME2CODE_CODE[idx % generated_phf::NAME2CODE_CODE.len()];
415
416 let maybe_name = match name(codepoint) {
422 None => {
423 if true {
424 debug_assert!(false) }
426 return character_by_alias(search_name);
427 }
428 Some(name) => name,
429 };
430
431 let mut cmp_name = search_name;
435 for part in maybe_name {
436 let part = match part {
437 "" => "-", " " => continue, "-" if codepoint != '\u{1180}' => continue, word => word,
441 };
442
443 if let Some(rest) = cmp_name.strip_prefix(part.as_bytes()) {
444 cmp_name = rest;
445 } else {
446 return character_by_alias(search_name);
447 }
448 }
449
450 if codepoint == '\u{116C}'
454 && original_name
455 .trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '_')
456 .bytes()
457 .nth_back(1)
458 == Some(b'-')
459 {
460 return Some('\u{1180}');
461 }
462
463 Some(codepoint)
464}
465
466fn normalise_name(search_name: &str, buf: &mut [u8; LONGEST_NAME_LEN]) -> usize {
475 let mut cursor = 0;
476 let bytes = search_name.as_bytes();
477
478 for (i, c) in bytes.iter().map(u8::to_ascii_uppercase).enumerate() {
479 if c.is_ascii_whitespace() || c == b'_' {
481 continue;
482 }
483
484 if c == b'-'
487 && bytes.get(i - 1).map_or(false, u8::is_ascii_alphanumeric)
488 && bytes.get(i + 1).map_or(false, u8::is_ascii_alphanumeric)
489 {
490 continue;
491 }
492
493 if !c.is_ascii_alphanumeric() && c != b'-' {
494 return 0;
497 }
498
499 if cursor >= buf.len() {
500 return 0;
502 }
503 buf[cursor] = c;
504 cursor += 1;
505 }
506
507 cursor
508}
509
#[cfg(test)]
mod tests {
    use super::*;
    use rand::{
        distributions::{Distribution, Standard},
        prelude::{SeedableRng, StdRng},
    };
    use std::char;
    use std::prelude::v1::*;

    extern crate test;

    use test::bench::Bencher;

    static DATA: &str = include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/data/UnicodeData.txt"));

    /// The name of `c` rendered to a `String`, if it has one.
    fn name_string(c: char) -> Option<String> {
        name(c).map(|n| n.to_string())
    }

    #[test]
    fn exhaustive() {
        // Checks that no character in `from..to` has a name, leaving out the
        // algorithmically named CJK ideographs and Hangul syllables.
        fn negative_range(from: u32, to: u32) {
            let candidates = (from..to)
                .filter_map(char::from_u32)
                .filter(|&c| !is_cjk_unified_ideograph(c) && !jamo::is_hangul_syllable(c));
            for c in candidates {
                let n = name(c);
                assert!(
                    n.is_none(),
                    "{} ({}) shouldn't have a name but is called {}",
                    c,
                    c as u32,
                    n.unwrap()
                );
            }
        }

        let mut last = 0;
        for line in DATA.lines() {
            let mut fields = line.split(';');

            let code_point = fields
                .next()
                .and_then(|s| u32::from_str_radix(s, 16).ok())
                .unwrap();
            let c = match char::from_u32(code_point) {
                Some(c) => c,
                None => continue,
            };

            let n = fields.next().unwrap();
            if n.starts_with('<') {
                continue;
            }

            // Forward direction: character -> name.
            let computed_n = name(c).unwrap();
            let n_str = computed_n.to_string();
            assert_eq!(n_str, n);
            assert_eq!(computed_n.len(), n_str.len());

            let (hint_low, hint_high) = computed_n.size_hint();
            let number_of_parts = computed_n.count();
            assert_eq!(hint_low, number_of_parts);
            assert_eq!(hint_high, Some(number_of_parts));

            // Reverse direction: name -> character, in either case.
            assert_eq!(character(n), Some(c));
            assert_eq!(character(&n.to_ascii_lowercase()), Some(c));

            // Everything between the previous entry and this one is unnamed.
            negative_range(last, c as u32);
            last = c as u32 + 1;
        }
        negative_range(last, 0x10FFFF + 1)
    }

    #[test]
    fn name_to_string() {
        assert_eq!(name_string('a'), Some("LATIN SMALL LETTER A".to_string()));
        assert_eq!(
            name_string('🁣'),
            Some("DOMINO TILE VERTICAL-00-00".to_string())
        );
    }

    #[test]
    fn character_negative() {
        let long_name = "x".repeat(generated::LONGEST_NAME_LEN + 1);
        let prefix = format!("{}x", generated::LONGEST_NAME);
        let names = ["", "x", "öäå", "SPAACE", &long_name, &prefix];
        for &n in names.iter() {
            assert_eq!(character(n), None);
        }
    }

    #[test]
    fn name_hangul_syllable() {
        let expected = [
            ('\u{ac00}', "HANGUL SYLLABLE GA"),
            ('\u{bdc1}', "HANGUL SYLLABLE BWELG"),
            ('\u{d7a3}', "HANGUL SYLLABLE HIH"),
        ];
        for &(c, n) in expected.iter() {
            assert_eq!(name_string(c), Some(n.to_string()));
        }
    }

    #[test]
    fn character_hangul_syllable() {
        assert_eq!(character("HANGUL SYLLABLE GA"), Some('\u{ac00}'));
        assert_eq!(character("HANGUL SYLLABLE BWELG"), Some('\u{bdc1}'));
        assert_eq!(character("HANGUL SYLLABLE HIH"), Some('\u{d7a3}'));
        assert_eq!(character("HANGUL SYLLABLE BLAH"), None);
    }

    #[test]
    fn cjk_unified_ideograph_exhaustive() {
        for &(lo, hi) in generated::CJK_IDEOGRAPH_RANGES.iter() {
            for x in lo as u32..=hi as u32 {
                let c = char::from_u32(x).unwrap();

                let real_name = format!("CJK UNIFIED IDEOGRAPH-{:X}", x);
                let lower_real_name = format!("CJK UNIFIED IDEOGRAPH-{:x}", x);
                assert_eq!(character(&real_name), Some(c));
                assert_eq!(character(&lower_real_name), Some(c));

                assert_eq!(name_string(c), Some(real_name));
            }
        }
    }

    #[test]
    fn name_cjk_unified_ideograph() {
        let expected = [
            ('\u{4e00}', "CJK UNIFIED IDEOGRAPH-4E00"),
            ('\u{9fcc}', "CJK UNIFIED IDEOGRAPH-9FCC"),
            ('\u{20000}', "CJK UNIFIED IDEOGRAPH-20000"),
            ('\u{2a6d6}', "CJK UNIFIED IDEOGRAPH-2A6D6"),
            ('\u{2a700}', "CJK UNIFIED IDEOGRAPH-2A700"),
            ('\u{2b81d}', "CJK UNIFIED IDEOGRAPH-2B81D"),
        ];
        for &(c, n) in expected.iter() {
            assert_eq!(name_string(c), Some(n.to_string()));
        }
    }

    #[test]
    fn character_cjk_unified_ideograph() {
        let valid = [
            ("CJK UNIFIED IDEOGRAPH-4E00", '\u{4e00}'),
            ("CJK UNIFIED IDEOGRAPH-9FCC", '\u{9fcc}'),
            ("CJK UNIFIED IDEOGRAPH-20000", '\u{20000}'),
            ("CJK UNIFIED IDEOGRAPH-2A6D6", '\u{2a6d6}'),
            ("CJK UNIFIED IDEOGRAPH-2A700", '\u{2a700}'),
            ("CJK UNIFIED IDEOGRAPH-2B81D", '\u{2b81d}'),
        ];
        for &(n, c) in valid.iter() {
            assert_eq!(character(n), Some(c));
        }

        let invalid = [
            "CJK UNIFIED IDEOGRAPH-",
            "CJK UNIFIED IDEOGRAPH-!@#$",
            "CJK UNIFIED IDEOGRAPH-1234",
            "CJK UNIFIED IDEOGRAPH-EFGH",
            "CJK UNIFIED IDEOGRAPH-12345",
            "CJK UNIFIED IDEOGRAPH-2A6FF",
        ];
        for &n in invalid.iter() {
            assert_eq!(character(n), None);
        }
    }

    #[test]
    fn character_by_alias() {
        assert_eq!(super::character_by_alias(b"NEW LINE"), Some('\n'));
        assert_eq!(super::character_by_alias(b"BACKSPACE"), Some('\u{8}'));
        assert_eq!(super::character_by_alias(b"NOT AN ALIAS"), None);
    }

    #[test]
    fn test_uax44() {
        assert_eq!(character(" L_O_W l_i_n_e"), Some('_'));
        assert_eq!(character("space \x09\x0a\x0c\x0d"), Some(' '));
        assert_eq!(character("FULL S-T-O-P"), Some('.'));
        assert_eq!(character("tibetan letter -a"), Some('\u{F60}'));
        assert_eq!(character("tibetan letter- a"), Some('\u{F60}'));
        assert_eq!(character("tibetan letter - a"), Some('\u{F60}'));
        assert_eq!(character("tibetan letter_-_a"), Some('\u{F60}'));
        assert_eq!(character("latinSMALLletterA"), Some('a'));

        // Without a hyphen the name resolves to U+116C...
        let jungseong_oe = Some('\u{116C}');
        for &n in [
            "HANGUL JUNGSEONG OE",
            "HANGUL JUNGSEONG O_E",
            "HANGUL JUNGSEONG O E",
        ]
        .iter()
        {
            assert_eq!(character(n), jungseong_oe);
        }

        // ...and with one, however it is spaced, to U+1180.
        let jungseong_o_e = Some('\u{1180}');
        for &n in [
            "HANGUL JUNGSEONG O-E",
            "HANGUL JUNGSEONG O-E\n",
            "HANGUL JUNGSEONG O-E__",
            "HANGUL JUNGSEONG O- E",
            "HANGUL JUNGSEONG O -E",
            "HANGUL JUNGSEONG O_-_E",
        ]
        .iter()
        {
            assert_eq!(character(n), jungseong_o_e);
        }
    }

    #[bench]
    fn name_basic(b: &mut Bencher) {
        b.iter(|| {
            for s in name('ö').unwrap() {
                test::black_box(s);
            }
        })
    }

    #[bench]
    fn character_basic(b: &mut Bencher) {
        b.iter(|| character("LATIN SMALL LETTER O WITH DIAERESIS"));
    }

    #[bench]
    fn name_10000_invalid(b: &mut Bencher) {
        let mut rng = StdRng::seed_from_u64(0x12345678);
        // Sample 10000 characters, then keep the ones without a name.
        let chars: Vec<char> = Standard
            .sample_iter(&mut rng)
            .take(10000)
            .filter(|&c| name(c).is_none())
            .collect();

        b.iter(|| {
            for &c in chars.iter() {
                assert!(name(c).is_none());
            }
        })
    }

    #[bench]
    fn name_all_valid(b: &mut Bencher) {
        let chars = (0u32..0x10FFFF)
            .filter_map(char::from_u32)
            .filter(|&c| name(c).is_some())
            .collect::<Vec<char>>();

        b.iter(|| {
            for c in chars.iter() {
                for s in name(*c).unwrap() {
                    test::black_box(s);
                }
            }
        });
    }

    #[bench]
    fn character_10000(b: &mut Bencher) {
        let mut rng = StdRng::seed_from_u64(0x12345678);

        // Sample 10000 characters and keep the names of those that have one.
        let names: Vec<_> = Standard
            .sample_iter(&mut rng)
            .take(10000)
            .filter_map(name)
            .map(|name| name.to_string())
            .collect();

        b.iter(|| {
            for n in names.iter() {
                test::black_box(character(n));
            }
        })
    }
}
787
// In `no_std` builds outside of tests, expose the `core` modules below under
// a local `std` path.
#[cfg(all(feature = "no_std", not(test)))]
mod std {
    pub use core::clone;
    pub use core::fmt;
    pub use core::marker;
}