// unic_ucd_category/category.rs

// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
10
11use unic_char_property::TotalCharProperty;
12
13char_property! {
14    /// Represents the Unicode Character
15    /// [`General_Category`](http://unicode.org/reports/tr44/#General_Category) property.
16    ///
17    /// This is a useful breakdown into various character types which can be used as a default
18    /// categorization in implementations. For the property values, see
19    /// [`General_Category Values`](http://unicode.org/reports/tr44/#General_Category_Values).
20    pub enum GeneralCategory {
21        abbr => "gc";
22        long => "General_Category";
23        human => "General Category";
24
25        /// An uppercase letter
26        UppercaseLetter {
27            abbr => Lu,
28            long => Uppercase_Letter,
29            human => "Uppercase Letter",
30        }
31
32        /// A lowercase letter
33        LowercaseLetter {
34            abbr => Ll,
35            long => Lowercase_Letter,
36            human => "Lowercase Letter",
37        }
38
39        /// A digraphic character, with first part uppercase
40        TitlecaseLetter {
41            abbr => Lt,
42            long => Titlecase_Letter,
43            human => "Titlecase Letter",
44        }
45
46        /// A modifier letter
47        ModifierLetter {
48            abbr => Lm,
49            long => Modifier_Letter,
50            human => "Modifier Letter",
51        }
52
53        /// Other letters, including syllables and ideographs
54        OtherLetter {
55            abbr => Lo,
56            long => Other_Letter,
57            human => "Other Letter",
58        }
59
60        /// A nonspacing combining mark (zero advance width)
61        NonspacingMark {
62            abbr => Mn,
63            long => Nonspacing_Mark,
64            human => "Nonspacing Mark",
65        }
66
67        /// A spacing combining mark (positive advance width)
68        SpacingMark {
69            abbr => Mc,
70            long => Spacing_Mark,
71            human => "Spacing Mark",
72        }
73
74        /// An enclosing combining mark
75        EnclosingMark {
76            abbr => Me,
77            long => Enclosing_Mark,
78            human => "Enclosing Mark",
79        }
80
81        /// A decimal digit
82        DecimalNumber {
83            abbr => Nd,
84            long => Decimal_Number,
85            human => "Decimal Digit",
86        }
87
88        /// A letterlike numeric character
89        LetterNumber {
90            abbr => Nl,
91            long => Letter_Number,
92            human => "Letterlike Number",
93        }
94
95        /// A numeric character of other type
96        OtherNumber {
97            abbr => No,
98            long => Other_Number,
99            human => "Other Numeric",
100        }
101
102        /// A connecting punctuation mark, like a tie
103        ConnectorPunctuation {
104            abbr => Pc,
105            long => Connector_Punctuation,
106            human => "Connecting Punctuation",
107        }
108
109        /// A dash or hyphen punctuation mark
110        DashPunctuation {
111            abbr => Pd,
112            long => Dash_Punctuation,
113            human => "Dash Punctuation",
114        }
115
116        /// An opening punctuation mark (of a pair)
117        OpenPunctuation {
118            abbr => Ps,
119            long => Open_Punctuation,
120            human => "Opening Punctuation",
121        }
122
123        /// A closing punctuation mark (of a pair)
124        ClosePunctuation {
125            abbr => Pe,
126            long => Close_Punctuation,
127            human => "Closing Punctuation",
128        }
129
130        /// An initial quotation mark
131        InitialPunctuation {
132            abbr => Pi,
133            long => Initial_Punctuation,
134            human => "Initial Quotation",
135        }
136
137        /// A final quotation mark
138        FinalPunctuation {
139            abbr => Pf,
140            long => Final_Punctuation,
141            human => "Final Quotation",
142        }
143
144        /// A punctuation mark of other type
145        OtherPunctuation {
146            abbr => Po,
147            long => Other_Punctuation,
148            human => "Other Punctuation",
149        }
150
151        /// A symbol of mathematical use
152        MathSymbol {
153            abbr => Sm,
154            long => Math_Symbol,
155            human => "Math Symbol",
156        }
157
158        /// A currency sign
159        CurrencySymbol {
160            abbr => Sc,
161            long => Currency_Symbol,
162            human => "Currency Symbol",
163        }
164
165        /// A non-letterlike modifier symbol
166        ModifierSymbol {
167            abbr => Sk,
168            long => Modifier_Symbol,
169            human => "Modifier Symbol",
170        }
171
172        /// A symbol of other type
173        OtherSymbol {
174            abbr => So,
175            long => Other_Symbol,
176            human => "Other Symbol",
177        }
178
179        /// A space character (of various non-zero widths)
180        SpaceSeparator {
181            abbr => Zs,
182            long => Space_Separator,
183            human => "Space",
184        }
185
186        /// U+2028 LINE SEPARATOR only
187        LineSeparator {
188            abbr => Zl,
189            long => Line_Separator,
190            human => "Line Separator",
191        }
192
193        /// U+2029 PARAGRAPH SEPARATOR only
194        ParagraphSeparator {
195            abbr => Zp,
196            long => Paragraph_Separator,
197            human => "Paragraph Separator",
198        }
199
200        /// A C0 or C1 control code
201        Control {
202            abbr => Cc,
203            long => Control,
204            human => "Control",
205        }
206
207        /// A format control character
208        Format {
209            abbr => Cf,
210            long => Format,
211            human => "Formatting",
212        }
213
214        /// A surrogate code point
215        Surrogate {
216            abbr => Cs,
217            long => Surrogate,
218            human => "Surrogate",
219        }
220
221        /// A private-use character
222        PrivateUse {
223            abbr => Co,
224            long => Private_Use,
225            human => "Private-Use",
226        }
227
228        /// Unassigned
229        Unassigned {
230            abbr => Cn,
231            long => Unassigned,
232            human => "Unassigned",
233        }
234    }
235
236    pub mod abbr_names for abbr;
237    pub mod long_names for long;
238}
239
240impl TotalCharProperty for GeneralCategory {
241    fn of(ch: char) -> Self {
242        Self::of(ch)
243    }
244}
245
246impl Default for GeneralCategory {
247    fn default() -> Self {
248        GeneralCategory::Unassigned
249    }
250}
251
252mod data {
253    use super::abbr_names::*;
254    use unic_char_property::tables::CharDataTable;
255    pub const GENERAL_CATEGORY_TABLE: CharDataTable<super::GeneralCategory> =
256        include!("../tables/general_category.rsv");
257}
258
259impl GeneralCategory {
260    /// Find the `GeneralCategory` of a single char.
261    pub fn of(ch: char) -> GeneralCategory {
262        data::GENERAL_CATEGORY_TABLE.find_or_default(ch)
263    }
264}
265
266impl GeneralCategory {
267    /// `Lu` | `Ll` | `Lt`  (Short form: `LC`)
268    pub fn is_cased_letter(&self) -> bool {
269        use self::abbr_names::*;
270        matches!(*self, Lu | Ll | Lt)
271    }
272
273    /// `Lu` | `Ll` | `Lt` | `Lm` | `Lo`  (Short form: `L`)
274    pub fn is_letter(&self) -> bool {
275        use self::abbr_names::*;
276        matches!(*self, Lu | Ll | Lt | Lm | Lo)
277    }
278
279    /// `Mn` | `Mc` | `Me`  (Short form: `M`)
280    pub fn is_mark(&self) -> bool {
281        use self::abbr_names::*;
282        matches!(*self, Mn | Mc | Me)
283    }
284
285    /// `Nd` | `Nl` | `No`  (Short form: `N`)
286    pub fn is_number(&self) -> bool {
287        use self::abbr_names::*;
288        matches!(*self, Nd | Nl | No)
289    }
290
291    /// `Pc` | `Pd` | `Ps` | `Pe` | `Pi` | `Pf` | `Po`  (Short form: `P`)
292    pub fn is_punctuation(&self) -> bool {
293        use self::abbr_names::*;
294        matches!(*self, Pc | Pd | Ps | Pe | Pi | Pf | Po)
295    }
296
297    /// `Sm` | `Sc` | `Sk` | `So`  (Short form: `S`)
298    pub fn is_symbol(&self) -> bool {
299        use self::abbr_names::*;
300        matches!(*self, Sm | Sc | Sk | So)
301    }
302
303    /// `Zs` | `Zl` | `Zp`  (Short form: `Z`)
304    pub fn is_separator(&self) -> bool {
305        use self::abbr_names::*;
306        matches!(*self, Zs | Zl | Zp)
307    }
308
309    /// `Cc` | `Cf` | `Cs` | `Co` | `Cn`  (Short form: `C`)
310    pub fn is_other(&self) -> bool {
311        use self::abbr_names::*;
312        matches!(*self, Cc | Cf | Cs | Co | Cn)
313    }
314}
315
#[cfg(test)]
mod tests {
    use super::GeneralCategory as GC;
    use core::char;
    use unic_char_property::EnumeratedCharProperty;

    /// Walks the ASCII range U+0000..=U+007E in code point order.
    #[test]
    fn test_ascii() {
        for c in 0x00..=0x1F {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::Control);
        }

        assert_eq!(GC::of(' '), GC::SpaceSeparator);
        assert_eq!(GC::of('!'), GC::OtherPunctuation);
        assert_eq!(GC::of('"'), GC::OtherPunctuation);
        assert_eq!(GC::of('#'), GC::OtherPunctuation);
        assert_eq!(GC::of('$'), GC::CurrencySymbol);
        assert_eq!(GC::of('%'), GC::OtherPunctuation);
        assert_eq!(GC::of('&'), GC::OtherPunctuation);
        assert_eq!(GC::of('\''), GC::OtherPunctuation);
        assert_eq!(GC::of('('), GC::OpenPunctuation);
        assert_eq!(GC::of(')'), GC::ClosePunctuation);
        assert_eq!(GC::of('*'), GC::OtherPunctuation);
        assert_eq!(GC::of('+'), GC::MathSymbol);
        assert_eq!(GC::of(','), GC::OtherPunctuation);
        assert_eq!(GC::of('-'), GC::DashPunctuation);
        assert_eq!(GC::of('.'), GC::OtherPunctuation);
        assert_eq!(GC::of('/'), GC::OtherPunctuation);

        for c in ('0' as u32)..=('9' as u32) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::DecimalNumber);
        }

        assert_eq!(GC::of(':'), GC::OtherPunctuation);
        assert_eq!(GC::of(';'), GC::OtherPunctuation);
        assert_eq!(GC::of('<'), GC::MathSymbol);
        assert_eq!(GC::of('='), GC::MathSymbol);
        assert_eq!(GC::of('>'), GC::MathSymbol);
        assert_eq!(GC::of('?'), GC::OtherPunctuation);
        assert_eq!(GC::of('@'), GC::OtherPunctuation);

        for c in ('A' as u32)..=('Z' as u32) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::UppercaseLetter);
        }

        assert_eq!(GC::of('['), GC::OpenPunctuation);
        assert_eq!(GC::of('\\'), GC::OtherPunctuation);
        assert_eq!(GC::of(']'), GC::ClosePunctuation);
        assert_eq!(GC::of('^'), GC::ModifierSymbol);
        assert_eq!(GC::of('_'), GC::ConnectorPunctuation);
        assert_eq!(GC::of('`'), GC::ModifierSymbol);

        for c in ('a' as u32)..=('z' as u32) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::LowercaseLetter);
        }

        assert_eq!(GC::of('{'), GC::OpenPunctuation);
        assert_eq!(GC::of('|'), GC::MathSymbol);
        assert_eq!(GC::of('}'), GC::ClosePunctuation);
        assert_eq!(GC::of('~'), GC::MathSymbol);
    }

    /// Code points near the end of the Basic Multilingual Plane.
    #[test]
    fn test_bmp_edge() {
        // The characters below are written as escapes: they are invisible or easily
        // mangled when the file passes through editors and text extraction.

        // 0xFEFF ZERO WIDTH NO-BREAK SPACE (or) BYTE ORDER MARK
        let bom = '\u{FEFF}';
        assert_eq!(GC::of(bom), GC::Format);
        // 0xFFFC OBJECT REPLACEMENT CHARACTER
        assert_eq!(GC::of('\u{FFFC}'), GC::OtherSymbol);
        // 0xFFFD REPLACEMENT CHARACTER
        assert_eq!(GC::of('\u{FFFD}'), GC::OtherSymbol);

        for &c in &[0xFFEF, 0xFFFE, 0xFFFF] {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::Unassigned);
        }
    }

    /// Planes 15 and 16 are private use, except for their last two code points.
    #[test]
    fn test_private_use() {
        for c in 0xF_0000..=0xF_FFFD {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::PrivateUse);
        }

        for c in 0x10_0000..=0x10_FFFD {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::PrivateUse);
        }

        for &c in &[0xF_FFFE, 0xF_FFFF, 0x10_FFFE, 0x10_FFFF] {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::Unassigned);
        }
    }

    /// One representative per group predicate, using categories pinned by `test_ascii`.
    #[test]
    fn test_category_groups() {
        assert!(GC::of('A').is_cased_letter());
        assert!(GC::of('a').is_letter());
        assert!(GC::of('7').is_number());
        assert!(GC::of('!').is_punctuation());
        assert!(GC::of('+').is_symbol());
        assert!(GC::of(' ').is_separator());
        assert!(GC::of('\u{0}').is_other());

        assert!(!GC::of('7').is_letter());
        assert!(!GC::of('a').is_other());
        assert!(!GC::NonspacingMark.is_cased_letter());
        assert!(GC::NonspacingMark.is_mark());
    }

    #[test]
    fn test_abbr_name() {
        assert_eq!(GC::UppercaseLetter.abbr_name(), "Lu");
        assert_eq!(GC::Unassigned.abbr_name(), "Cn");
    }

    #[test]
    fn test_long_name() {
        assert_eq!(GC::UppercaseLetter.long_name(), "Uppercase_Letter");
        assert_eq!(GC::Unassigned.long_name(), "Unassigned");
    }

    #[test]
    fn test_human_name() {
        assert_eq!(GC::UppercaseLetter.human_name(), "Uppercase Letter");
        assert_eq!(GC::Unassigned.human_name(), "Unassigned");
    }
}