unicode_charname/
lib.rs

1// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Unicode character name properties
12//! as described in
13//! [Unicode Standard Annex #44](http://www.unicode.org/reports/tr44/).
14//!
15//! ```rust
16//! extern crate unicode_charname;
17//!
18//! use unicode_charname::CharName;
19//!
20//! fn main() {
21//!     assert_eq!('A'.char_name().unwrap_or_default().to_string(),
22//!                "LATIN CAPITAL LETTER A");
23//! }
24//! ```
25//!
26//! ## crates.io
27//!
28//! You can use this package in your project by adding the following
29//! to your `Cargo.toml`:
30//!
31//! ```toml
32//! [dependencies]
33//! unicode-charname = "0.1"
34//! ```
35//!
36//! ## `no_std` + `alloc` support
37//!
//! This crate is completely `no_std` + `alloc` compatible. This can be enabled by disabling the `std` feature, i.e. specifying `default-features = false` for this crate in your `Cargo.toml`.
39
40#![deny(missing_docs, unsafe_code)]
41#![doc(
42    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
43    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
44)]
45#![cfg_attr(not(feature = "std"), no_std)]
46
47extern crate alloc;
48
49#[cfg(feature = "std")]
50extern crate core;
51
52#[cfg(not(feature = "std"))]
53use alloc::string::String;
54use core::fmt;
55
56#[rustfmt::skip]
57mod tables;
58
59mod jamo;
60mod reserved;
61
62pub use tables::UNICODE_VERSION;
63
/// Methods for retrieving character name for a code point.
pub trait CharName {
    /// Retrieve the character name for a code point.
    ///
    /// # Examples
    ///
    /// ```
    /// # use unicode_charname::CharName;
    /// assert_eq!('\u{1F402}'.char_name().unwrap_or_default().to_string(), "OX");
    /// ```
    ///
    /// Note that for all code points having a property value
    /// of na = "" for the Name property, the return value
    /// is a code point label: the hex representation of the code point
    /// (at least four uppercase digits), prefixed with a special label
    /// and a dash indicating its code point type (see D10a Code point type
    /// within the Unicode standard), all within angle brackets.
    ///
    /// The following special labels are used: `control`, `private-use`,
    /// `surrogate`, `noncharacter` and `reserved`.
    ///
    /// ```
    /// # use unicode_charname::CharName;
    /// assert_eq!('\u{81}'.char_name().unwrap_or_default().to_string(), "<control-0081>");
    /// ```
    ///
    /// This function never returns `None` for valid
    /// Unicode code points, but always returns `None` for
    /// other integers.
    ///
    /// ```
    /// # use unicode_charname::CharName;
    /// assert!(0x200000.char_name().is_none());
    /// ```
    fn char_name(self) -> Option<Name>;

    /// Retrieve the Unicode Name property value for a code point.
    ///
    /// Similar to the `char_name` function, but also returns `None` for
    /// all code points having a property value of na = "" for the Name property
    /// (that is, no code point label is produced for them).
    ///
    /// # Examples
    ///
    /// ```
    /// # use unicode_charname::CharName;
    /// assert!('\u{81}'.property_name().is_none());
    /// ```
    fn property_name(self) -> Option<Name>;
}
113
114impl CharName for char {
115    fn char_name(self) -> Option<Name> {
116        CharName::char_name(self as u32)
117    }
118    fn property_name(self) -> Option<Name> {
119        CharName::property_name(self as u32)
120    }
121}
122
123impl CharName for u32 {
124    fn char_name(self) -> Option<Name> {
125        if let Some(slice) = tables::find_in_enumerate_names(self) {
126            let name = Name(NameInner::Enumeration {
127                encoded_slice: slice,
128                codepoint_repr: alloc::format!("{:04X}", self),
129            });
130            return Some(name);
131        }
132        if let Some(special_group) = tables::find_in_special_groups(self) {
133            return name_for_special_group_char(
134                self,
135                special_group,
136                CodePointLabelMode::Label {
137                    use_angle_bracket: true,
138                },
139            );
140        }
141        if reserved::is_code_point(self) {
142            if reserved::is_noncharacter(self) {
143                return Some(code_point_label("noncharacter-", self, true));
144            } else {
145                return Some(code_point_label("reserved-", self, true));
146            }
147        }
148        None
149    }
150
151    fn property_name(self) -> Option<Name> {
152        if let Some(slice) = tables::find_in_enumerate_names(self) {
153            let name = Name(NameInner::Enumeration {
154                encoded_slice: slice,
155                codepoint_repr: alloc::format!("{:04X}", self),
156            });
157            return Some(name);
158        }
159        if let Some(special_group) = tables::find_in_special_groups(self) {
160            return name_for_special_group_char(self, special_group, CodePointLabelMode::None);
161        }
162        None
163    }
164}
165
166fn nr1_name(_prefix: &str, v: u32) -> Name {
167    // ignore prefix here, because hangul_name will provide one.
168    let str = jamo::hangul_name(v);
169    Name(NameInner::Generated(str))
170}
171
172fn nr2_name(prefix: &str, v: u32) -> Name {
173    Name(NameInner::Generated(alloc::format!("{}{:04X}", prefix, v)))
174}
175
176fn code_point_label(prefix: &str, v: u32, use_angle_bracket: bool) -> Name {
177    let str = if use_angle_bracket {
178        alloc::format!("<{}{:04X}>", prefix, v)
179    } else {
180        alloc::format!("{}{:04X}", prefix, v)
181    };
182    Name(NameInner::Generated(str))
183}
184
/// Controls what `name_for_special_group_char` returns for groups that
/// only have a code point label (control, surrogate, private-use).
enum CodePointLabelMode {
    /// Produce no label; such code points yield `None`.
    None,
    /// Produce a label, wrapped in angle brackets when `use_angle_bracket` is set.
    Label { use_angle_bracket: bool },
}
189
190fn name_for_special_group_char(
191    v: u32,
192    special_group: tables::SpecialGroup,
193    code_point_label_mode: CodePointLabelMode,
194) -> Option<Name> {
195    use tables::SpecialGroup;
196    match special_group {
197        SpecialGroup::HangulSyllable => {
198            // NR1
199            Some(nr1_name("HANGUL SYLLABLE ", v))
200        }
201        SpecialGroup::CJKIdeographExtensionA
202        | SpecialGroup::CJKIdeograph
203        | SpecialGroup::CJKIdeographExtensionB
204        | SpecialGroup::CJKIdeographExtensionC
205        | SpecialGroup::CJKIdeographExtensionD
206        | SpecialGroup::CJKIdeographExtensionE
207        | SpecialGroup::CJKIdeographExtensionF
208        | SpecialGroup::CJKIdeographExtensionG
209        | SpecialGroup::CJKIdeographExtensionH
210        | SpecialGroup::CJKIdeographExtensionI
211        | SpecialGroup::CJKIdeographExtensionJ => {
212            // NR2
213            Some(nr2_name("CJK UNIFIED IDEOGRAPH-", v))
214        }
215        SpecialGroup::TangutIdeograph | SpecialGroup::TangutIdeographSupplement => {
216            // NR2
217            Some(nr2_name("TANGUT IDEOGRAPH-", v))
218        }
219        /* other NR2 cases already covered in UnicodeData.txt */
220        SpecialGroup::control => {
221            if let CodePointLabelMode::Label { use_angle_bracket } = code_point_label_mode {
222                Some(code_point_label("control-", v, use_angle_bracket))
223            } else {
224                None
225            }
226        }
227        SpecialGroup::NonPrivateUseHighSurrogate
228        | SpecialGroup::PrivateUseHighSurrogate
229        | SpecialGroup::LowSurrogate => {
230            if let CodePointLabelMode::Label { use_angle_bracket } = code_point_label_mode {
231                Some(code_point_label("surrogate-", v, use_angle_bracket))
232            } else {
233                None
234            }
235        }
236        SpecialGroup::PrivateUse
237        | SpecialGroup::Plane15PrivateUse
238        | SpecialGroup::Plane16PrivateUse => {
239            if let CodePointLabelMode::Label { use_angle_bracket } = code_point_label_mode {
240                Some(code_point_label("private-use-", v, use_angle_bracket))
241            } else {
242                None
243            }
244        }
245    }
246}
247
/// Internal representation behind `Name`.
#[derive(Clone)]
enum NameInner {
    /// A name stored as a sequence of word indexes.
    Enumeration {
        /// Indexes into `tables::ENUMERATION_WORD_TABLE`, one per name fragment.
        encoded_slice: &'static [u16],
        /// Hex form of the code point (`{:04X}`), emitted wherever the slice
        /// contains `tables::WORD_TABLE_INDEX_CODEPOINT`.
        codepoint_repr: String,
    },
    /// A name (or code point label) held as one complete string.
    Generated(String),
}
256
257/// Represents retrieved Unicode character name.
258///
259/// It implements the `Display` trait and can also
260/// be converted to a `String` value with `to_string`
261/// method.
262#[derive(Clone)]
263pub struct Name(NameInner);
264
265impl Default for Name {
266    fn default() -> Self {
267        Name(NameInner::Generated(Default::default()))
268    }
269}
270
271impl Name {
272    fn iter(&self) -> NameIter<'_> {
273        NameIter {
274            name: &self.0,
275            offset: 0,
276            state: NameIterState::Initial,
277        }
278    }
279}
280
281impl fmt::Display for Name {
282    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
283        for s in self.iter() {
284            write!(f, "{}", s)?;
285        }
286        Ok(())
287    }
288}
289
/// Iterator over the `&str` fragments that make up a `Name`.
#[derive(Clone)]
#[non_exhaustive]
struct NameIter<'a> {
    /// The name being rendered.
    name: &'a NameInner,
    /// Index of the next `encoded_slice` entry to read (used only for
    /// `NameInner::Enumeration`).
    offset: usize,
    /// Where the iteration currently stands.
    state: NameIterState,
}
297
/// Progress of a `NameIter`.
///
/// `cur_special` records whether the word at the iterator's current offset
/// is a special word according to `tables::is_special_word_index`.
#[derive(Copy, Clone)]
enum NameIterState {
    /// Nothing has been yielded yet.
    Initial,
    /// The next item is the space word; afterwards iteration continues in `Middle`.
    InsertSpace { cur_special: bool },
    /// The next item is the word at the current offset.
    Middle { cur_special: bool },
    /// The iterator is exhausted.
    Finished,
}
305
306impl<'a> Iterator for NameIter<'a> {
307    type Item = &'a str;
308    fn next(&mut self) -> Option<&'a str> {
309        match self.name {
310            NameInner::Enumeration {
311                encoded_slice,
312                codepoint_repr,
313            } => match self.state {
314                NameIterState::Finished => None,
315                _ if self.offset >= encoded_slice.len() => {
316                    self.state = NameIterState::Finished;
317                    None
318                }
319                NameIterState::InsertSpace { cur_special } => {
320                    self.state = NameIterState::Middle { cur_special };
321                    Some(tables::ENUMERATION_WORD_TABLE[tables::WORD_TABLE_INDEX_SPACE as usize])
322                }
323                _ => {
324                    /* NameIterState::Initial | NameIterState::Middle {..} */
325                    let cur_word_idx = encoded_slice[self.offset];
326                    self.offset += 1;
327                    if let Some(&next_word_idx) = encoded_slice.get(self.offset) {
328                        let cur_special = match self.state {
329                            NameIterState::Initial => tables::is_special_word_index(cur_word_idx),
330                            NameIterState::Middle { cur_special } => cur_special,
331                            _ => unreachable!(),
332                        };
333                        let next_special = tables::is_special_word_index(next_word_idx);
334                        if !cur_special && !next_special {
335                            self.state = NameIterState::InsertSpace {
336                                cur_special: next_special,
337                            };
338                        } else {
339                            self.state = NameIterState::Middle {
340                                cur_special: next_special,
341                            };
342                        }
343                    } else {
344                        self.state = NameIterState::Finished;
345                    }
346                    if cur_word_idx == tables::WORD_TABLE_INDEX_CODEPOINT {
347                        Some(codepoint_repr)
348                    } else {
349                        Some(tables::ENUMERATION_WORD_TABLE[cur_word_idx as usize])
350                    }
351                }
352            },
353            NameInner::Generated(s) => match self.state {
354                NameIterState::Initial => {
355                    self.state = NameIterState::Finished;
356                    Some(&s)
357                }
358                NameIterState::Finished => None,
359                _ => unreachable!(),
360            },
361        }
362    }
363}