unicode_intervals/
categories.rs

1use crate::{constants::ALL_CATEGORIES, error};
2use core::{
3    fmt,
4    ops::{BitOr, BitOrAssign},
5    str::FromStr,
6};
7use UnicodeCategory::*;
8
/// Unicode general category, named by its two-letter abbreviation.
///
/// Variants are numbered consecutively starting at zero. The number of a variant is
/// the index of the bit that represents it inside a [`UnicodeCategorySet`], so the
/// declaration order must not change.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum UnicodeCategory {
    /// Close Punctuation.
    Pe = 0,
    /// Connector Punctuation.
    Pc = 1,
    /// Control.
    Cc = 2,
    /// Currency Symbol.
    Sc = 3,
    /// Dash Punctuation.
    Pd = 4,
    /// Decimal Number.
    Nd = 5,
    /// Enclosing Mark.
    Me = 6,
    /// Final Punctuation.
    Pf = 7,
    /// Format.
    Cf = 8,
    /// Initial Punctuation.
    Pi = 9,
    /// Letter Number.
    Nl = 10,
    /// Line Separator.
    Zl = 11,
    /// Lowercase Letter.
    Ll = 12,
    /// Math Symbol.
    Sm = 13,
    /// Modifier Letter.
    Lm = 14,
    /// Modifier Symbol.
    Sk = 15,
    /// Nonspacing Mark.
    Mn = 16,
    /// Open Punctuation.
    Ps = 17,
    /// Other Letter.
    Lo = 18,
    /// Other Number.
    No = 19,
    /// Other Punctuation.
    Po = 20,
    /// Other Symbol.
    So = 21,
    /// Paragraph Separator.
    Zp = 22,
    /// Private Use.
    Co = 23,
    /// Space Separator.
    Zs = 24,
    /// Spacing Mark.
    Mc = 25,
    /// Surrogate.
    Cs = 26,
    /// Titlecase Letter.
    Lt = 27,
    /// Unassigned.
    Cn = 28,
    /// Uppercase Letter.
    Lu = 29,
}
73
74impl FromStr for UnicodeCategory {
75    type Err = error::Error;
76
77    fn from_str(s: &str) -> Result<Self, Self::Err> {
78        Ok(match s {
79            "Pe" => Pe,
80            "Pc" => Pc,
81            "Cc" => Cc,
82            "Sc" => Sc,
83            "Pd" => Pd,
84            "Nd" => Nd,
85            "Me" => Me,
86            "Pf" => Pf,
87            "Cf" => Cf,
88            "Pi" => Pi,
89            "Nl" => Nl,
90            "Zl" => Zl,
91            "Ll" => Ll,
92            "Sm" => Sm,
93            "Lm" => Lm,
94            "Sk" => Sk,
95            "Mn" => Mn,
96            "Ps" => Ps,
97            "Lo" => Lo,
98            "No" => No,
99            "Po" => Po,
100            "So" => So,
101            "Zp" => Zp,
102            "Co" => Co,
103            "Zs" => Zs,
104            "Mc" => Mc,
105            "Cs" => Cs,
106            "Lt" => Lt,
107            "Cn" => Cn,
108            "Lu" => Lu,
109            _ => return Err(Self::Err::InvalidCategory(s.to_owned().into_boxed_str())),
110        })
111    }
112}
113
114impl UnicodeCategory {
115    /// Letters.
116    pub const L: UnicodeCategorySet = UnicodeCategorySet(
117        1 << Ll as u32 | 1 << Lm as u32 | 1 << Lo as u32 | 1 << Lt as u32 | 1 << Lu as u32,
118    );
119    /// Marks.
120    pub const M: UnicodeCategorySet =
121        UnicodeCategorySet(1 << Mc as u32 | 1 << Me as u32 | 1 << Mn as u32);
122    /// Numbers.
123    pub const N: UnicodeCategorySet =
124        UnicodeCategorySet(1 << Nd as u32 | 1 << Nl as u32 | 1 << No as u32);
125    /// Punctuation.
126    pub const P: UnicodeCategorySet = UnicodeCategorySet(
127        1 << Pc as u32
128            | 1 << Pd as u32
129            | 1 << Pe as u32
130            | 1 << Pf as u32
131            | 1 << Pi as u32
132            | 1 << Po as u32
133            | 1 << Ps as u32,
134    );
135    /// Symbols.
136    pub const S: UnicodeCategorySet =
137        UnicodeCategorySet(1 << Sc as u32 | 1 << Sk as u32 | 1 << Sm as u32 | 1 << So as u32);
138    /// Separators.
139    pub const Z: UnicodeCategorySet =
140        UnicodeCategorySet(1 << Zp as u32 | 1 << Zs as u32 | 1 << Zl as u32);
141    /// Control, format, private, unassigned and surrogates.
142    pub const C: UnicodeCategorySet = UnicodeCategorySet(
143        1 << Cc as u32 | 1 << Cf as u32 | 1 << Cn as u32 | 1 << Co as u32 | 1 << Cs as u32,
144    );
145    // Full category names
146    /// Close Punctuation (alias).
147    pub const CLOSE_PUNCTUATION: UnicodeCategory = Pe;
148    /// Connector Punctuation (alias).
149    pub const CONNECTOR_PUNCTUATION: UnicodeCategory = Pc;
150    /// Control (alias).
151    pub const CONTROL: UnicodeCategory = Cc;
152    /// Currency Symbol (alias).
153    pub const CURRENCY_SYMBOL: UnicodeCategory = Sc;
154    /// Dash Punctuation (alias).
155    pub const DASH_PUNCTUATION: UnicodeCategory = Pd;
156    /// Decimal Number (alias).
157    pub const DECIMAL_NUMBER: UnicodeCategory = Nd;
158    /// Enclosing Mark (alias).
159    pub const ENCLOSING_MARK: UnicodeCategory = Me;
160    /// Final Punctuation (alias).
161    pub const FINAL_PUNCTUATION: UnicodeCategory = Pf;
162    /// Format (alias).
163    pub const FORMAT: UnicodeCategory = Cf;
164    /// Initial Punctuation (alias).
165    pub const INITIAL_PUNCTUATION: UnicodeCategory = Pi;
166    /// Letter Number (alias).
167    pub const LETTER_NUMBER: UnicodeCategory = Nl;
168    /// Line Separator (alias).
169    pub const LINE_SEPARATOR: UnicodeCategory = Zl;
170    /// Lowercase Letter (alias).
171    pub const LOWERCASE_LETTER: UnicodeCategory = Ll;
172    /// Math Symbol (alias).
173    pub const MATH_SYMBOL: UnicodeCategory = Sm;
174    /// Modifier Letter (alias).
175    pub const MODIFIER_LETTER: UnicodeCategory = Lm;
176    /// Modifier Symbol (alias).
177    pub const MODIFIER_SYMBOL: UnicodeCategory = Sk;
178    /// Nonspacing Mark (alias).
179    pub const NONSPACING_MARK: UnicodeCategory = Mn;
180    /// Open Punctuation (alias).
181    pub const OPEN_PUNCTUATION: UnicodeCategory = Ps;
182    /// Other Letter (alias).
183    pub const OTHER_LETTER: UnicodeCategory = Lo;
184    /// Other Number (alias).
185    pub const OTHER_NUMBER: UnicodeCategory = No;
186    /// Other Punctuation (alias).
187    pub const OTHER_PUNCTUATION: UnicodeCategory = Po;
188    /// Other Symbol (alias).
189    pub const OTHER_SYMBOL: UnicodeCategory = So;
190    /// Paragraph Separator (alias).
191    pub const PARAGRAPH_SEPARATOR: UnicodeCategory = Zp;
192    /// Private Use (alias).
193    pub const PRIVATE_USE: UnicodeCategory = Co;
194    /// Space Separator (alias).
195    pub const SPACE_SEPARATOR: UnicodeCategory = Zs;
196    /// Spacing Mark (alias).
197    pub const SPACING_MARK: UnicodeCategory = Mc;
198    /// Surrogate (alias).
199    pub const SURROGATE: UnicodeCategory = Cs;
200    /// Titlecase Letter (alias).
201    pub const TITLECASE_LETTER: UnicodeCategory = Lt;
202    /// Unassigned (alias).
203    pub const UNASSIGNED: UnicodeCategory = Cn;
204    /// Uppercase Letter (alias).
205    pub const UPPERCASE_LETTER: UnicodeCategory = Lu;
206
207    /// Abbreviation as a string.
208    #[must_use]
209    pub const fn as_str(self) -> &'static str {
210        match self {
211            Pe => "Pe",
212            Pc => "Pc",
213            Cc => "Cc",
214            Sc => "Sc",
215            Pd => "Pd",
216            Nd => "Nd",
217            Me => "Me",
218            Pf => "Pf",
219            Cf => "Cf",
220            Pi => "Pi",
221            Nl => "Nl",
222            Zl => "Zl",
223            Ll => "Ll",
224            Sm => "Sm",
225            Lm => "Lm",
226            Sk => "Sk",
227            Mn => "Mn",
228            Ps => "Ps",
229            Lo => "Lo",
230            No => "No",
231            Po => "Po",
232            So => "So",
233            Zp => "Zp",
234            Co => "Co",
235            Zs => "Zs",
236            Mc => "Mc",
237            Cs => "Cs",
238            Lt => "Lt",
239            Cn => "Cn",
240            Lu => "Lu",
241        }
242    }
243}
244
245impl fmt::Display for UnicodeCategory {
246    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
247        f.write_str(self.as_str())
248    }
249}
250
/// A set of [`UnicodeCategory`] values, stored as a `u32` bit mask.
///
/// Bit `n` is set when the category whose discriminant is `n` is a member of the set.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct UnicodeCategorySet(u32);
254
255impl UnicodeCategorySet {
256    /// Empty set of Unicode categories.
257    #[inline]
258    #[must_use]
259    pub const fn new() -> Self {
260        Self(0)
261    }
262    /// All Unicode categories.
263    #[inline]
264    #[must_use]
265    pub const fn all() -> Self {
266        Self(ALL_CATEGORIES)
267    }
268    /// Create a category set, but do not check whether the input value is valid.
269    #[inline]
270    #[must_use]
271    pub(crate) const fn from_value_unchecked(value: u32) -> Self {
272        Self(value)
273    }
274    /// Add a new Unicode category to the set.
275    #[inline]
276    pub fn add(&mut self, category: UnicodeCategory) {
277        self.set(category as u8);
278    }
279    /// Remove a Unicode category from the set.
280    #[inline]
281    pub fn remove(&mut self, category: UnicodeCategory) {
282        self.unset(category as u8);
283    }
284    /// Whether the set contains `category`.
285    #[inline]
286    #[must_use]
287    pub const fn contains(self, category: UnicodeCategory) -> bool {
288        self.is_set(category as u8)
289    }
290    /// The size of the set.
291    #[inline]
292    #[must_use]
293    pub const fn len(self) -> usize {
294        self.0.count_ones() as usize
295    }
296    /// Whether the set is empty.
297    #[inline]
298    #[must_use]
299    pub const fn is_empty(self) -> bool {
300        self.0 == 0
301    }
302    /// Extract the inner storage value.
303    #[inline]
304    #[must_use]
305    pub const fn into_value(self) -> u32 {
306        self.0
307    }
308    /// Iterate over included Unicode categories.
309    #[inline]
310    #[must_use]
311    pub const fn iter(self) -> Iter {
312        Iter { data: self }
313    }
314    // `index` is always < 30 and can't overflow
315    #[inline]
316    #[allow(clippy::integer_arithmetic)]
317    pub(crate) fn set(&mut self, index: u8) {
318        self.0 |= 1 << index;
319    }
320    // `index` is always < 30 and can't overflow
321    #[inline]
322    #[allow(clippy::integer_arithmetic)]
323    pub(crate) fn unset(&mut self, index: u8) {
324        self.0 &= !(1 << index);
325    }
326    // `index`` is always < 30 and can't overflow
327    #[inline]
328    #[allow(clippy::integer_arithmetic)]
329    const fn is_set(self, index: u8) -> bool {
330        self.0 & (1 << index) != 0
331    }
332}
333
334impl Default for UnicodeCategorySet {
335    #[inline]
336    fn default() -> Self {
337        UnicodeCategorySet::new()
338    }
339}
340
341impl fmt::Display for UnicodeCategorySet {
342    // `idx` can't overflow as the maximum possible size of `iter` is 30 < usize::MAX
343    #[allow(clippy::integer_arithmetic)]
344    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
345        let len = self.len();
346        for (idx, category) in self.iter().enumerate() {
347            f.write_str(category.as_str())?;
348            if idx + 1 != len {
349                f.write_str(", ")?;
350            }
351        }
352        Ok(())
353    }
354}
355
356impl BitOr for UnicodeCategory {
357    type Output = UnicodeCategorySet;
358
359    // `self` and `rhs` are both < 30; Therefore shift won't overflow
360    #[inline]
361    #[allow(clippy::integer_arithmetic)]
362    fn bitor(self, rhs: Self) -> Self::Output {
363        UnicodeCategorySet(1 << self as u8 | 1 << rhs as u8)
364    }
365}
366impl BitOr<UnicodeCategorySet> for UnicodeCategory {
367    type Output = UnicodeCategorySet;
368
369    #[inline]
370    fn bitor(self, rhs: UnicodeCategorySet) -> Self::Output {
371        // Reusing existing `BitOr<UnicodeCategory> for UnicodeCategorySet`
372        rhs | self
373    }
374}
375
376impl BitOr<UnicodeCategory> for UnicodeCategorySet {
377    type Output = Self;
378
379    // `rhs as u8` can't overflow as it has only 30 elements
380    #[inline]
381    #[allow(clippy::integer_arithmetic)]
382    fn bitor(self, rhs: UnicodeCategory) -> Self::Output {
383        Self(self.into_value() | 1 << rhs as u8)
384    }
385}
386
387impl BitOr<UnicodeCategorySet> for UnicodeCategorySet {
388    type Output = Self;
389
390    #[inline]
391    fn bitor(self, rhs: UnicodeCategorySet) -> Self::Output {
392        Self(self.into_value() | rhs.into_value())
393    }
394}
395
396impl BitOrAssign<UnicodeCategorySet> for UnicodeCategorySet {
397    #[inline]
398    fn bitor_assign(&mut self, rhs: UnicodeCategorySet) {
399        self.0 |= rhs.into_value();
400    }
401}
402
403impl BitOrAssign<UnicodeCategory> for UnicodeCategorySet {
404    #[inline]
405    fn bitor_assign(&mut self, rhs: UnicodeCategory) {
406        self.add(rhs);
407    }
408}
409
410#[derive(Debug)]
411pub struct Iter {
412    data: UnicodeCategorySet,
413}
414
415impl Iterator for Iter {
416    type Item = UnicodeCategory;
417
418    fn next(&mut self) -> Option<Self::Item> {
419        // INVARIANT: The number of trailing zeros for `u32` is 32 at most which is less than `u8::MAX`
420        #[allow(clippy::cast_possible_truncation)]
421        let index = self.data.0.trailing_zeros() as u8;
422        let category = match index {
423            0 => Pe,
424            1 => Pc,
425            2 => Cc,
426            3 => Sc,
427            4 => Pd,
428            5 => Nd,
429            6 => Me,
430            7 => Pf,
431            8 => Cf,
432            9 => Pi,
433            10 => Nl,
434            11 => Zl,
435            12 => Ll,
436            13 => Sm,
437            14 => Lm,
438            15 => Sk,
439            16 => Mn,
440            17 => Ps,
441            18 => Lo,
442            19 => No,
443            20 => Po,
444            21 => So,
445            22 => Zp,
446            23 => Co,
447            24 => Zs,
448            25 => Mc,
449            26 => Cs,
450            27 => Lt,
451            28 => Cn,
452            29 => Lu,
453            _ => return None,
454        };
455        self.data.unset(index);
456        Some(category)
457    }
458}
459
460impl ExactSizeIterator for Iter {
461    #[inline]
462    fn len(&self) -> usize {
463        self.data.len()
464    }
465}
466
467impl From<UnicodeCategory> for UnicodeCategorySet {
468    // `category as u8` can't overflow as it has only 30 elements
469    #[inline]
470    #[allow(clippy::integer_arithmetic)]
471    fn from(category: UnicodeCategory) -> Self {
472        Self::from_value_unchecked(1 << category as u8)
473    }
474}
475
476impl From<UnicodeCategory> for Option<UnicodeCategorySet> {
477    #[inline]
478    fn from(category: UnicodeCategory) -> Self {
479        Some(category.into())
480    }
481}
482
483/// Return all Unicode categories that are in `include`, but not in `exclude`.
484#[inline]
485#[must_use]
486pub const fn merge(
487    include: Option<UnicodeCategorySet>,
488    exclude: UnicodeCategorySet,
489) -> UnicodeCategorySet {
490    if let Some(include) = include {
491        if include.is_empty() {
492            // include no categories
493            include
494        } else {
495            UnicodeCategorySet::from_value_unchecked(
496                (ALL_CATEGORIES ^ exclude.into_value()) & include.into_value(),
497            )
498        }
499    } else {
500        UnicodeCategorySet::from_value_unchecked(ALL_CATEGORIES ^ exclude.into_value())
501    }
502}
503
#[cfg(test)]
mod tests {
    use super::*;
    use std::{
        collections::hash_map::DefaultHasher,
        hash::{Hash, Hasher},
    };
    use test_case::test_case;

    /// Run `value` through a fresh `DefaultHasher` to exercise its `Hash` impl.
    fn hash_of<T: Hash>(value: &T) -> u64 {
        let mut hasher = DefaultHasher::new();
        value.hash(&mut hasher);
        hasher.finish()
    }

    /// An unknown abbreviation is rejected with a message that quotes the input.
    #[test]
    fn test_category_from_str_error() {
        let err = "wrong"
            .parse::<UnicodeCategory>()
            .expect_err("Should fail");
        assert_eq!(err.to_string(), "'wrong' is not a valid Unicode category");
    }

    /// `Hash`, `Clone` and `Debug` are usable on a single category.
    #[test]
    #[allow(clippy::clone_on_copy)]
    fn test_category_traits() {
        let _ = hash_of(&Ll);
        let _ = Ll.clone();
        assert_eq!(format!("{:?}", Ll), "Ll");
    }

    /// The `L` group constant equals the union of the five letter categories.
    #[test]
    fn test_single_letter_categories() {
        let letters = Ll | Lm | Lo | Lt | Lu;
        assert_eq!(UnicodeCategory::L, letters);
    }

    /// A set is displayed as its members separated by ", ".
    #[test]
    fn test_set_display() {
        let rendered = UnicodeCategory::L.to_string();
        assert_eq!(rendered, "Ll, Lm, Lo, Lt, Lu");
    }

    /// Adding to an empty set makes the category a member and the length one.
    #[test]
    fn test_set_add() {
        let mut set = UnicodeCategorySet::new();
        assert!(set.is_empty());
        set.add(Ll);
        assert!(set.contains(Ll));
        assert_eq!(set.len(), 1);
    }

    /// Removing a member from the full set makes it absent.
    #[test]
    fn test_set_remove() {
        let mut set = UnicodeCategorySet::all();
        assert!(set.contains(Ll));
        set.remove(Ll);
        assert!(!set.contains(Ll));
    }

    /// `Hash`, `Clone` and `Debug` are usable on a set.
    #[test]
    #[allow(clippy::clone_on_copy)]
    fn test_category_set_traits() {
        let set = UnicodeCategory::L;
        let _ = hash_of(&set);
        let _ = set.clone();
        assert_eq!(format!("{:?}", set), "UnicodeCategorySet(671371264)");
    }

    /// The iterator's `Debug` output shows the set it wraps.
    #[test]
    fn test_iter_traits() {
        let iter = UnicodeCategory::L.iter();
        assert_eq!(
            format!("{:?}", iter),
            "Iter { data: UnicodeCategorySet(671371264) }"
        );
    }

    /// `|` and `|=` work for every category / set operand combination used here.
    #[test]
    fn test_bit_or() {
        let singleton: UnicodeCategorySet = Ll.into();
        assert_eq!(Ll | UnicodeCategorySet::new(), singleton);

        let letters_and_other = Ll | Lm | Lo | Lt | Lu | Cs | Cc | Cf | Cn | Co;
        assert_eq!(UnicodeCategory::L | UnicodeCategory::C, letters_and_other);

        let mut set = UnicodeCategorySet::new();
        set |= Ll;
        set |= UnicodeCategory::C;
        assert_eq!(set, Ll | Cs | Cc | Cf | Cn | Co);
    }

    /// Iterating the full set yields every category, each of which survives a
    /// round trip through its string form.
    #[test]
    fn test_set_iter() {
        let all_categories = UnicodeCategorySet::all();
        assert_eq!(all_categories.iter().len(), all_categories.len());

        let mut rebuilt = UnicodeCategorySet::new();
        for category in all_categories.iter() {
            let name = category.to_string();
            let parsed = UnicodeCategory::from_str(&name).expect("Invalid category");
            assert_eq!(parsed, category);
            rebuilt.add(category);
        }
        assert_eq!(all_categories, rebuilt);
    }

    /// `Default` gives the same value as `new`.
    #[test]
    fn test_set_default() {
        let default_set = UnicodeCategorySet::default();
        assert_eq!(default_set, UnicodeCategorySet::new());
    }

    /// A category converts into `Some` single-member set.
    #[test]
    fn test_set_option_from_category() {
        let set: Option<UnicodeCategorySet> = Ll.into();
        assert!(set.is_some());
        let expected: UnicodeCategorySet = Ll.into();
        assert_eq!(set.expect("Unexpected `None`"), expected);
    }

    /// `merge` keeps the included categories that are not excluded.
    #[test_case(Some(Lu | Me | Cs | So), So.into(), Lu | Me | Cs)]
    #[test_case(
        None,
        UnicodeCategory::L
            | UnicodeCategory::M
            | UnicodeCategory::N
            | UnicodeCategory::P
            | UnicodeCategory::S,
        UnicodeCategory::Z | UnicodeCategory::C
    )]
    #[test_case(
        Some(UnicodeCategorySet::new()),
        UnicodeCategorySet::new(),
        UnicodeCategorySet::new()
    )]
    fn test_category_merge(
        include: Option<UnicodeCategorySet>,
        exclude: UnicodeCategorySet,
        expected: UnicodeCategorySet,
    ) {
        let merged = merge(include, exclude);
        assert_eq!(merged, expected);
    }
}