unicode_names2/
lib.rs

1//! Convert between characters and their standard names.
2//!
//! This crate provides two functions for mapping between a `char` and the
//! name given by the Unicode standard (16.0). There are no runtime
5//! requirements so this is usable with only `core` (this requires
6//! specifying the `no_std` cargo feature). The tables are heavily
7//! compressed, but still large (500KB), and still offer efficient
8//! `O(1)` look-ups in both directions (more precisely, `O(length of
9//! name)`).
10//!
11//! ```rust
12//!     println!("☃ is called {:?}", unicode_names2::name('☃')); // SNOWMAN
13//!     println!("{:?} is happy", unicode_names2::character("white smiling face")); // ☺
14//!     // (NB. case insensitivity)
15//! ```
16//!
17//! [**Source**](https://github.com/ProgVal/unicode_names2).
18//!
19//! # Macros
20//!
21//! The associated `unicode_names2_macros` crate provides two macros
22//! for converting at compile-time, giving named literals similar to
23//! Python's `"\N{...}"`.
24//!
25//! - `named_char!(name)` takes a single string `name` and creates a
26//!   `char` literal.
//! - `named!(string)` takes a string and replaces any `\\N{name}`
//!   sequences with the character with that name. NB. String escape
//!   sequences cannot be customised, so either the extra backslash or a
//!   raw string is required.
31//!
32//! ```rust
33//! #![feature(proc_macro_hygiene)]
34//!
35//! #[macro_use]
36//! extern crate unicode_names2_macros;
37//!
38//! fn main() {
39//!     let x: char = named_char!("snowman");
40//!     assert_eq!(x, '☃');
41//!
42//!     let y: &str = named!("foo bar \\N{BLACK STAR} baz qux");
43//!     assert_eq!(y, "foo bar ★ baz qux");
44//!
45//!     let y: &str = named!(r"foo bar \N{BLACK STAR} baz qux");
46//!     assert_eq!(y, "foo bar ★ baz qux");
47//! }
48//! ```
49//!
50//! # Loose Matching
51//! For name->char retrieval (the `character` function and macros) this crate uses loose matching,
52//! as defined in Unicode Standard Annex #44[^1].
53//! In general, this means case, whitespace and underscore characters are ignored, as well as
54//! _medial hyphens_, which are hyphens (`-`) that come between two alphanumeric characters[^1].
55//!
56//! Under this scheme, the query `Low_Line` will find `U+005F LOW LINE`, as well as `l o w L-I-N-E`,
57//! `lowline`, and `low\nL-I-N-E`, but not `low- line`.
58//! Similarly, `tibetan letter -a` will find `U+0F60 TIBETAN LETTER -A`, as well as
59//! `tibetanletter - a` and `TIBETAN L_ETTE_R-  __a__`, but not `tibetan letter-a` or
60//! `TIBETAN LETTER A`.
61//!
62//! In the implementation of this crate, 'whitespace' is determined by the [`is_ascii_whitespace`]
63//! method on `u8` and `char`. See its documentation for more info.
64//!
65//! [^1]: See [UAX44-LM2] for precise details.
66//!
67//! [UAX44-LM2]: https://www.unicode.org/reports/tr44/tr44-34.html#UAX44-LM2
68//! [`is_ascii_whitespace`]: char::is_ascii_whitespace
69
70#![cfg_attr(feature = "no_std", no_std)]
71#![cfg_attr(test, feature(test))]
72#![deny(missing_docs, unsafe_code)]
73
74#[cfg(all(test, feature = "no_std"))]
75#[macro_use]
76extern crate std;
77
78use core::{char, fmt};
79use generated::{
80    LONGEST_NAME_LEN, PHRASEBOOK_OFFSETS1, PHRASEBOOK_OFFSETS2, PHRASEBOOK_OFFSET_SHIFT,
81};
82
83#[allow(dead_code)]
84#[rustfmt::skip]
85#[allow(clippy::all)]
86mod generated {
87    include!(concat!(env!("OUT_DIR"), "/generated.rs"));
88}
89#[allow(dead_code)]
90#[rustfmt::skip]
91#[allow(clippy::all)]
92mod generated_phf {
93    include!(concat!(env!("OUT_DIR"), "/generated_phf.rs"));
94}
95#[allow(dead_code)]
96mod jamo;
97
/// A map of unicode aliases to their corresponding values.
///
/// Keys are alias names as byte strings; the table body is generated by the generator and
/// read from `OUT_DIR`. It is consulted by `character_by_alias`.
#[allow(dead_code)]
static ALIASES: phf::Map<&'static [u8], char> =
    include!(concat!(env!("OUT_DIR"), "/generated_alias.rs"));

mod iter_str;

// Prefix emitted before the jamo parts when naming a Hangul syllable.
static HANGUL_SYLLABLE_PREFIX: &str = "HANGUL SYLLABLE ";
// The same prefix as it appears in a query after `normalise_name` (no whitespace).
static NORMALISED_HANGUL_SYLLABLE_PREFIX: &str = "HANGULSYLLABLE";
// Prefix emitted before the hex digits when naming a CJK unified ideograph.
static CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJK UNIFIED IDEOGRAPH-";
// The same prefix as it appears in a query after `normalise_name` (no whitespace, and the
// medial hyphen before the digits removed).
static NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJKUNIFIEDIDEOGRAPH";
110
111fn is_cjk_unified_ideograph(ch: char) -> bool {
112    generated::CJK_IDEOGRAPH_RANGES
113        .iter()
114        .any(|&(lo, hi)| lo <= ch && ch <= hi)
115}
116
117/// An iterator over the components of a code point's name. Notably implements `Display`.
118///
119/// To reconstruct the full Unicode name from this iterator, you can concatenate every string slice
120/// yielded from it. Each such slice is either a word matching `[A-Z0-9]*`, a space `" "`, or a
121/// hyphen `"-"`. (In particular, words can be the empty string `""`).
122///
123/// The [size hint] returns an exact size, by cloning the iterator and iterating it fully.
124/// Cloning and iteration are cheap, and all names are relatively short, so this should not have a
125/// high impact.
126///
127/// [size hint]: std::iter::Iterator::size_hint
128#[derive(Clone)]
129pub struct Name {
130    data: Name_,
131}
132#[allow(clippy::upper_case_acronyms)]
133#[derive(Clone)]
134enum Name_ {
135    Plain(iter_str::IterStr),
136    CJK(CJK),
137    Hangul(Hangul),
138}
139
140#[allow(clippy::upper_case_acronyms)]
141#[derive(Copy)]
142struct CJK {
143    emit_prefix: bool,
144    idx: u8,
145    // the longest character is 0x10FFFF
146    data: [u8; 6],
147}
148#[derive(Copy)]
149struct Hangul {
150    emit_prefix: bool,
151    idx: u8,
152    // stores the choseong, jungseong, jongseong syllable numbers (in
153    // that order)
154    data: [u8; 3],
155}
156impl Clone for CJK {
157    fn clone(&self) -> CJK {
158        *self
159    }
160}
161impl Clone for Hangul {
162    fn clone(&self) -> Hangul {
163        *self
164    }
165}
166
167#[allow(clippy::len_without_is_empty)]
168impl Name {
169    /// The number of bytes in the name.
170    ///
171    /// All names are plain ASCII, so this is also the number of
172    /// Unicode codepoints and the number of graphemes.
173    pub fn len(&self) -> usize {
174        let counted = self.clone();
175        counted.fold(0, |a, s| a + s.len())
176    }
177}
178
179impl Iterator for Name {
180    type Item = &'static str;
181
182    fn next(&mut self) -> Option<&'static str> {
183        match self.data {
184            Name_::Plain(ref mut s) => s.next(),
185            Name_::CJK(ref mut state) => {
186                // we're a CJK unified ideograph
187                if state.emit_prefix {
188                    state.emit_prefix = false;
189                    return Some(CJK_UNIFIED_IDEOGRAPH_PREFIX);
190                }
191                // run until we've run out of array: the construction
192                // of the data means this is exactly when we have
193                // finished emitting the number.
194                state
195                    .data
196                    .get(state.idx as usize)
197                    // (avoid conflicting mutable borrow problems)
198                    .map(|digit| *digit as usize)
199                    .map(|d| {
200                        state.idx += 1;
201                        static DIGITS: &str = "0123456789ABCDEF";
202                        &DIGITS[d..d + 1]
203                    })
204            }
205            Name_::Hangul(ref mut state) => {
206                if state.emit_prefix {
207                    state.emit_prefix = false;
208                    return Some(HANGUL_SYLLABLE_PREFIX);
209                }
210
211                let idx = state.idx as usize;
212                state.data.get(idx).map(|x| *x as usize).map(|x| {
213                    // progressively walk through the syllables
214                    state.idx += 1;
215                    [jamo::CHOSEONG, jamo::JUNGSEONG, jamo::JONGSEONG][idx][x]
216                })
217            }
218        }
219    }
220
221    fn size_hint(&self) -> (usize, Option<usize>) {
222        // we can estimate exactly by just iterating and summing up.
223        let counted = self.clone();
224        let n = counted.count();
225        (n, Some(n))
226    }
227}
228
229impl fmt::Debug for Name {
230    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
231        fmt::Display::fmt(self, fmtr)
232    }
233}
234impl fmt::Display for Name {
235    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
236        let printed = self.clone();
237        for s in printed {
238            write!(fmtr, "{}", s)?
239        }
240        Ok(())
241    }
242}
243
244/// Find the name of `c`, or `None` if `c` has no name.
245///
246/// The return value is an iterator that yields `&'static str` components of the name successively
247/// (including spaces and hyphens). It implements `Display`, so can be used naturally to build
248/// `String`s or be printed. See also the [type-level docs][Name].
249///
250/// # Example
251///
252/// ```rust
253/// assert_eq!(unicode_names2::name('a').unwrap().to_string(), "LATIN SMALL LETTER A");
254/// assert_eq!(unicode_names2::name('\u{2605}').unwrap().to_string(), "BLACK STAR");
255/// assert_eq!(unicode_names2::name('☃').unwrap().to_string(), "SNOWMAN");
256///
257/// // control code
258/// assert!(unicode_names2::name('\x00').is_none());
259/// // unassigned
260/// assert!(unicode_names2::name('\u{10FFFF}').is_none());
261/// ```
262pub fn name(c: char) -> Option<Name> {
263    let cc = c as usize;
264    let offset =
265        (PHRASEBOOK_OFFSETS1[cc >> PHRASEBOOK_OFFSET_SHIFT] as usize) << PHRASEBOOK_OFFSET_SHIFT;
266
267    let mask = (1 << PHRASEBOOK_OFFSET_SHIFT) - 1;
268    let offset = PHRASEBOOK_OFFSETS2[offset + (cc & mask)];
269    if offset == 0 {
270        if is_cjk_unified_ideograph(c) {
271            // write the hex number out right aligned in this array.
272            let mut data = [b'0'; 6];
273            let mut number = c as u32;
274            let mut data_start = 6;
275            for place in data.iter_mut().rev() {
276                // this would be incorrect if U+0000 was CJK unified
277                // ideograph, but it's not, so it's fine.
278                if number == 0 {
279                    break;
280                }
281                *place = (number % 16) as u8;
282                number /= 16;
283                data_start -= 1;
284            }
285            Some(Name {
286                data: Name_::CJK(CJK {
287                    emit_prefix: true,
288                    idx: data_start,
289                    data,
290                }),
291            })
292        } else {
293            // maybe it is a hangul syllable?
294            jamo::syllable_decomposition(c).map(|(ch, ju, jo)| Name {
295                data: Name_::Hangul(Hangul {
296                    emit_prefix: true,
297                    idx: 0,
298                    data: [ch, ju, jo],
299                }),
300            })
301        }
302    } else {
303        Some(Name {
304            data: Name_::Plain(iter_str::IterStr::new(offset as usize)),
305        })
306    }
307}
308
309fn fnv_hash<I: Iterator<Item = u8>>(x: I) -> u64 {
310    let mut g = 0xcbf29ce484222325 ^ generated_phf::NAME2CODE_N;
311    for b in x {
312        g ^= b as u64;
313        g = g.wrapping_mul(0x100000001b3);
314    }
315    g
316}
/// Combine the two hash parts with the two displacements: `d2 + f1 * d1 + f2`, where every
/// operation wraps around on overflow.
fn displace(f1: u32, f2: u32, d1: u32, d2: u32) -> u32 {
    let scaled = f1.wrapping_mul(d1);
    d2.wrapping_add(scaled).wrapping_add(f2)
}
/// Cut the low 63 bits of `hash` into three 21-bit fields, least significant field first.
fn split(hash: u64) -> (u32, u32, u32) {
    const BITS: u32 = 21;
    const MASK: u64 = (1 << BITS) - 1;

    let field = |index: u32| ((hash >> (index * BITS)) & MASK) as u32;
    (field(0), field(1), field(2))
}
329
330/// Get alias value from alias name, returns `None` if the alias is not found.
331fn character_by_alias(name: &[u8]) -> Option<char> {
332    ALIASES.get(name).copied()
333}
334
335/// Find the character called `name`, or `None` if no such character
336/// exists.
337///
338/// This function uses the [UAX44-LM2] loose matching scheme for lookup. For more information, see
339/// the [crate-level docs][self].
340///
341/// [UAX44-LM2]: https://www.unicode.org/reports/tr44/tr44-34.html#UAX44-LM2
342///
343/// # Example
344///
345/// ```rust
346/// assert_eq!(unicode_names2::character("LATIN SMALL LETTER A"), Some('a'));
347/// assert_eq!(unicode_names2::character("latinsmalllettera"), Some('a'));
348/// assert_eq!(unicode_names2::character("Black_Star"), Some('★'));
349/// assert_eq!(unicode_names2::character("SNOWMAN"), Some('☃'));
350/// assert_eq!(unicode_names2::character("BACKSPACE"), Some('\x08'));
351///
352/// assert_eq!(unicode_names2::character("nonsense"), None);
353/// ```
354pub fn character(search_name: &str) -> Option<char> {
355    let original_name = search_name;
356    let mut buf = [0; LONGEST_NAME_LEN];
357    let len = normalise_name(search_name, &mut buf);
358    let search_name = &buf[..len];
359
360    // try `HANGUL SYLLABLE <choseong><jungseong><jongseong>`
361    if search_name.starts_with(NORMALISED_HANGUL_SYLLABLE_PREFIX.as_bytes()) {
362        let remaining = &search_name[NORMALISED_HANGUL_SYLLABLE_PREFIX.len()..];
363        let (choseong, remaining) = jamo::slice_shift_choseong(remaining);
364        let (jungseong, remaining) = jamo::slice_shift_jungseong(remaining);
365        let (jongseong, remaining) = jamo::slice_shift_jongseong(remaining);
366        match (choseong, jungseong, jongseong, remaining) {
367            (Some(choseong), Some(jungseong), Some(jongseong), b"") => {
368                let c = 0xac00 + (choseong * 21 + jungseong) * 28 + jongseong;
369                return char::from_u32(c);
370            }
371            (_, _, _, _) => {
372                // there are no other names starting with `HANGUL SYLLABLE `
373                // (verified by `generator/...`).
374                return None;
375            }
376        }
377    }
378
379    // try `CJK UNIFIED IDEOGRAPH-<digits>`
380    if search_name.starts_with(NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.as_bytes()) {
381        let remaining = &search_name[NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.len()..];
382        if remaining.len() > 5 {
383            return None;
384        } // avoid overflow
385
386        let mut v = 0u32;
387        for &c in remaining {
388            v = match c {
389                b'0'..=b'9' => (v << 4) | (c - b'0') as u32,
390                b'A'..=b'F' => (v << 4) | (c - b'A' + 10) as u32,
391                _ => return None,
392            }
393        }
394        let ch = char::from_u32(v)?;
395
396        // check if the resulting code is indeed in the known ranges
397        if is_cjk_unified_ideograph(ch) {
398            return Some(ch);
399        } else {
400            // there are no other names starting with `CJK UNIFIED IDEOGRAPH-`
401            // (verified by `src/generate.py`).
402            return None;
403        }
404    }
405
406    // get the parts of the hash...
407    let (g, f1, f2) = split(fnv_hash(search_name.iter().copied()));
408    // ...and the appropriate displacements...
409    let (d1, d2) = generated_phf::NAME2CODE_DISP[g as usize % generated_phf::NAME2CODE_DISP.len()];
410
411    // ...to find the right index...
412    let idx = displace(f1, f2, d1 as u32, d2 as u32) as usize;
413    // ...for looking up the codepoint.
414    let codepoint = generated_phf::NAME2CODE_CODE[idx % generated_phf::NAME2CODE_CODE.len()];
415
416    // Now check that this is actually correct. Since this is a
417    // perfect hash table, valid names map precisely to their code
418    // point (and invalid names map to anything), so we only need to
419    // check the name for this codepoint matches the input and we know
420    // everything. (i.e. no need for probing)
421    let maybe_name = match name(codepoint) {
422        None => {
423            if true {
424                debug_assert!(false) // what?
425            }
426            return character_by_alias(search_name);
427        }
428        Some(name) => name,
429    };
430
431    // `name(codepoint)` returns an iterator yielding words separated by spaces or hyphens.
432    // That means whenever a name contains a non-medial hyphen, it must be emulated by inserting an
433    // artificial empty word (`""`) between the space and the hyphen.
434    let mut cmp_name = search_name;
435    for part in maybe_name {
436        let part = match part {
437            "" => "-",       // Non-medial hyphens are preserved by `normalise_name`, check them.
438            " " => continue, // Spaces and medial hyphens are removed, ignore them.
439            "-" if codepoint != '\u{1180}' => continue, // But the hyphen in U+1180 is preserved.
440            word => word,
441        };
442
443        if let Some(rest) = cmp_name.strip_prefix(part.as_bytes()) {
444            cmp_name = rest;
445        } else {
446            return character_by_alias(search_name);
447        }
448    }
449
450    // "HANGUL JUNGSEONG O-E" is ambiguous, returning U+116C HANGUL JUNGSEONG OE instead.
451    // All other ways of spelling U+1180 will get properly detected, so it's enough to just check
452    // if the hyphen is in the right place.
453    if codepoint == '\u{116C}'
454        && original_name
455            .trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '_')
456            .bytes()
457            .nth_back(1)
458            == Some(b'-')
459    {
460        return Some('\u{1180}');
461    }
462
463    Some(codepoint)
464}
465
466/// Convert a Unicode name to a form that can be used for loose matching, as per
467/// [UAX#44](https://www.unicode.org/reports/tr44/tr44-34.html#Matching_Names).
468///
469/// This function matches `unicode_names2_generator::normalise_name` in implementation, except that
470/// the special case of U+1180 HANGUL JUNGSEONG O-E isn't handled here, because we don't yet know
471/// which character is being queried and a string comparison would be expensive to inspect each
472/// query with given it only matches for one character. Thus the case of U+1180 is handled at the
473/// end of [`character`].
474fn normalise_name(search_name: &str, buf: &mut [u8; LONGEST_NAME_LEN]) -> usize {
475    let mut cursor = 0;
476    let bytes = search_name.as_bytes();
477
478    for (i, c) in bytes.iter().map(u8::to_ascii_uppercase).enumerate() {
479        // "Ignore case, whitespace, underscore ('_'), [...]"
480        if c.is_ascii_whitespace() || c == b'_' {
481            continue;
482        }
483
484        // "[...] and all medial hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E."
485        // See doc comment for why U+1180 isn't handled
486        if c == b'-'
487            && bytes.get(i - 1).map_or(false, u8::is_ascii_alphanumeric)
488            && bytes.get(i + 1).map_or(false, u8::is_ascii_alphanumeric)
489        {
490            continue;
491        }
492
493        if !c.is_ascii_alphanumeric() && c != b'-' {
494            // All unicode names comprise only of alphanumeric characters and hyphens after
495            // stripping spaces and underscores. Returning 0 effectively serves as returning `None`.
496            return 0;
497        }
498
499        if cursor >= buf.len() {
500            // No Unicode character has this long a name.
501            return 0;
502        }
503        buf[cursor] = c;
504        cursor += 1;
505    }
506
507    cursor
508}
509
510#[cfg(test)]
511mod tests {
512    use super::*;
513    use rand::{
514        distributions::{Distribution, Standard},
515        prelude::{SeedableRng, StdRng},
516    };
517    use std::char;
518    use std::prelude::v1::*;
519
520    extern crate test;
521
522    use test::bench::Bencher;
523
524    static DATA: &'static str =
525        include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/data/UnicodeData.txt"));
526
527    #[test]
528    fn exhaustive() {
529        // check that gaps have no names (these are unassigned/control
530        // codes).
531        fn negative_range(from: u32, to: u32) {
532            for c in (from..to).filter_map(char::from_u32) {
533                if !is_cjk_unified_ideograph(c) && !jamo::is_hangul_syllable(c) {
534                    let n = name(c);
535                    assert!(
536                        n.is_none(),
537                        "{} ({}) shouldn't have a name but is called {}",
538                        c,
539                        c as u32,
540                        n.unwrap()
541                    );
542                }
543            }
544        }
545
546        let mut last = 0;
547        for line in DATA.lines() {
548            let mut it = line.split(';');
549
550            let raw_c = it.next();
551            let c = match char::from_u32(
552                raw_c.and_then(|s| u32::from_str_radix(s, 16).ok()).unwrap(),
553            ) {
554                Some(c) => c,
555                None => continue,
556            };
557
558            let n = it.next().unwrap();
559            if n.starts_with("<") {
560                continue;
561            }
562
563            let computed_n = name(c).unwrap();
564            let n_str = computed_n.to_string();
565            assert_eq!(n_str, n.to_string());
566            assert_eq!(computed_n.len(), n_str.len());
567
568            let (hint_low, hint_high) = computed_n.size_hint();
569            let number_of_parts = computed_n.count();
570            assert_eq!(hint_low, number_of_parts);
571            assert_eq!(hint_high, Some(number_of_parts));
572
573            assert_eq!(character(n), Some(c));
574            assert_eq!(character(&n.to_ascii_lowercase()), Some(c));
575
576            negative_range(last, c as u32);
577            last = c as u32 + 1;
578        }
579        negative_range(last, 0x10FFFF + 1)
580    }
581
582    #[test]
583    fn name_to_string() {
584        let n = name('a').unwrap();
585        assert_eq!(n.to_string(), "LATIN SMALL LETTER A".to_string());
586        let n = name('🁣').unwrap();
587        assert_eq!(n.to_string(), "DOMINO TILE VERTICAL-00-00".to_string());
588    }
589
590    #[test]
591    fn character_negative() {
592        let long_name = "x".repeat(generated::LONGEST_NAME_LEN + 1);
593        let prefix = format!("{}x", generated::LONGEST_NAME); // This name would appear valid if truncated
594        let names = ["", "x", "öäå", "SPAACE", &long_name, &prefix];
595        for &n in names.iter() {
596            assert_eq!(character(n), None);
597        }
598    }
599
600    #[test]
601    fn name_hangul_syllable() {
602        assert_eq!(
603            name('\u{ac00}').map(|s| s.to_string()),
604            Some("HANGUL SYLLABLE GA".to_string())
605        ); // first
606        assert_eq!(
607            name('\u{bdc1}').map(|s| s.to_string()),
608            Some("HANGUL SYLLABLE BWELG".to_string())
609        );
610        assert_eq!(
611            name('\u{d7a3}').map(|s| s.to_string()),
612            Some("HANGUL SYLLABLE HIH".to_string())
613        ); // last
614    }
615
616    #[test]
617    fn character_hangul_syllable() {
618        assert_eq!(character("HANGUL SYLLABLE GA"), Some('\u{ac00}'));
619        assert_eq!(character("HANGUL SYLLABLE BWELG"), Some('\u{bdc1}'));
620        assert_eq!(character("HANGUL SYLLABLE HIH"), Some('\u{d7a3}'));
621        assert_eq!(character("HANGUL SYLLABLE BLAH"), None);
622    }
623
624    #[test]
625    fn cjk_unified_ideograph_exhaustive() {
626        for &(lo, hi) in generated::CJK_IDEOGRAPH_RANGES.iter() {
627            for x in lo as u32..=hi as u32 {
628                let c = char::from_u32(x).unwrap();
629
630                let real_name = format!("CJK UNIFIED IDEOGRAPH-{:X}", x);
631                let lower_real_name = format!("CJK UNIFIED IDEOGRAPH-{:x}", x);
632                assert_eq!(character(&real_name), Some(c));
633                assert_eq!(character(&lower_real_name), Some(c));
634
635                assert_eq!(name(c).map(|s| s.to_string()), Some(real_name));
636            }
637        }
638    }
639    #[test]
640    fn name_cjk_unified_ideograph() {
641        assert_eq!(
642            name('\u{4e00}').map(|s| s.to_string()),
643            Some("CJK UNIFIED IDEOGRAPH-4E00".to_string())
644        ); // first in BMP
645        assert_eq!(
646            name('\u{9fcc}').map(|s| s.to_string()),
647            Some("CJK UNIFIED IDEOGRAPH-9FCC".to_string())
648        ); // last in BMP (as of 6.1)
649        assert_eq!(
650            name('\u{20000}').map(|s| s.to_string()),
651            Some("CJK UNIFIED IDEOGRAPH-20000".to_string())
652        ); // first in SIP
653        assert_eq!(
654            name('\u{2a6d6}').map(|s| s.to_string()),
655            Some("CJK UNIFIED IDEOGRAPH-2A6D6".to_string())
656        );
657        assert_eq!(
658            name('\u{2a700}').map(|s| s.to_string()),
659            Some("CJK UNIFIED IDEOGRAPH-2A700".to_string())
660        );
661        assert_eq!(
662            name('\u{2b81d}').map(|s| s.to_string()),
663            Some("CJK UNIFIED IDEOGRAPH-2B81D".to_string())
664        ); // last in SIP (as of 6.0)
665    }
666
667    #[test]
668    fn character_cjk_unified_ideograph() {
669        assert_eq!(character("CJK UNIFIED IDEOGRAPH-4E00"), Some('\u{4e00}'));
670        assert_eq!(character("CJK UNIFIED IDEOGRAPH-9FCC"), Some('\u{9fcc}'));
671        assert_eq!(character("CJK UNIFIED IDEOGRAPH-20000"), Some('\u{20000}'));
672        assert_eq!(character("CJK UNIFIED IDEOGRAPH-2A6D6"), Some('\u{2a6d6}'));
673        assert_eq!(character("CJK UNIFIED IDEOGRAPH-2A700"), Some('\u{2a700}'));
674        assert_eq!(character("CJK UNIFIED IDEOGRAPH-2B81D"), Some('\u{2b81d}'));
675        assert_eq!(character("CJK UNIFIED IDEOGRAPH-"), None);
676        assert_eq!(character("CJK UNIFIED IDEOGRAPH-!@#$"), None);
677        assert_eq!(character("CJK UNIFIED IDEOGRAPH-1234"), None);
678        assert_eq!(character("CJK UNIFIED IDEOGRAPH-EFGH"), None);
679        assert_eq!(character("CJK UNIFIED IDEOGRAPH-12345"), None);
680        assert_eq!(character("CJK UNIFIED IDEOGRAPH-2A6FF"), None); // between Ext B and Ext C
681        assert_eq!(character("CJK UNIFIED IDEOGRAPH-2A6FF"), None);
682    }
683
684    #[test]
685    fn character_by_alias() {
686        assert_eq!(super::character_by_alias(b"NEW LINE"), Some('\n'));
687        assert_eq!(super::character_by_alias(b"BACKSPACE"), Some('\u{8}'));
688        assert_eq!(super::character_by_alias(b"NOT AN ALIAS"), None);
689    }
690
691    #[test]
692    fn test_uax44() {
693        assert_eq!(character(" L_O_W l_i_n_e"), Some('_'));
694        assert_eq!(character("space \x09\x0a\x0c\x0d"), Some(' '));
695        assert_eq!(character("FULL S-T-O-P"), Some('.'));
696        assert_eq!(character("tibetan letter -a"), Some('\u{F60}'));
697        assert_eq!(character("tibetan letter- a"), Some('\u{F60}'));
698        assert_eq!(character("tibetan letter  -   a"), Some('\u{F60}'));
699        assert_eq!(character("tibetan letter_-_a"), Some('\u{F60}'));
700        assert_eq!(character("latinSMALLletterA"), Some('a'));
701
702        // Test exceptions related to U+1180
703        let jungseong_oe = Some('\u{116C}');
704        let jungseong_o_e = Some('\u{1180}');
705        assert_eq!(character("HANGUL JUNGSEONG OE"), jungseong_oe);
706        assert_eq!(character("HANGUL JUNGSEONG O_E"), jungseong_oe);
707        assert_eq!(character("HANGUL JUNGSEONG O E"), jungseong_oe);
708        assert_eq!(character("HANGUL JUNGSEONG O-E"), jungseong_o_e);
709        assert_eq!(character("HANGUL JUNGSEONG O-E\n"), jungseong_o_e);
710        assert_eq!(character("HANGUL JUNGSEONG O-E__"), jungseong_o_e);
711        assert_eq!(character("HANGUL JUNGSEONG O- E"), jungseong_o_e);
712        assert_eq!(character("HANGUL JUNGSEONG O -E"), jungseong_o_e);
713        assert_eq!(character("HANGUL JUNGSEONG O_-_E"), jungseong_o_e);
714    }
715
716    #[bench]
717    fn name_basic(b: &mut Bencher) {
718        b.iter(|| {
719            for s in name('ö').unwrap() {
720                test::black_box(s);
721            }
722        })
723    }
724
725    #[bench]
726    fn character_basic(b: &mut Bencher) {
727        b.iter(|| character("LATIN SMALL LETTER O WITH DIAERESIS"));
728    }
729
730    #[bench]
731    fn name_10000_invalid(b: &mut Bencher) {
732        // be consistent across runs, but avoid sequential/caching.
733        let mut rng = StdRng::seed_from_u64(0x12345678);
734        let chars: Vec<char> = Standard
735            .sample_iter(&mut rng)
736            .take(10000)
737            .filter_map(|c| match c {
738                c if name(c).is_none() => Some(c),
739                _ => None,
740            })
741            .collect();
742
743        b.iter(|| {
744            for &c in chars.iter() {
745                assert!(name(c).is_none());
746            }
747        })
748    }
749
750    #[bench]
751    fn name_all_valid(b: &mut Bencher) {
752        let chars = (0u32..0x10FFFF)
753            .filter_map(|x| match char::from_u32(x) {
754                Some(c) if name(c).is_some() => Some(c),
755                _ => None,
756            })
757            .collect::<Vec<char>>();
758
759        b.iter(|| {
760            for c in chars.iter() {
761                for s in name(*c).unwrap() {
762                    test::black_box(s);
763                }
764            }
765        });
766    }
767
768    #[bench]
769    fn character_10000(b: &mut Bencher) {
770        // be consistent across runs, but avoid sequential/caching.
771        let mut rng = StdRng::seed_from_u64(0x12345678);
772
773        let names: Vec<_> = Standard
774            .sample_iter(&mut rng)
775            .take(10000)
776            .filter_map(name)
777            .map(|name| name.to_string())
778            .collect();
779
780        b.iter(|| {
781            for n in names.iter() {
782                test::black_box(character(&n));
783            }
784        })
785    }
786}
787
// With the `no_std` feature (and outside of tests, where the real `std` is linked in
// explicitly) there is no `std` crate in scope; this stand-in re-exports the `core` modules of
// the same names under `std::`.
#[cfg(all(feature = "no_std", not(test)))]
mod std {
    pub use core::clone;
    pub use core::fmt;
    pub use core::marker;
}
unicode_names2/lib.rs

unicode_names2/
lib.rs