1use crate::{constants::ALL_CATEGORIES, error};
2use core::{
3 fmt,
4 ops::{BitOr, BitOrAssign},
5 str::FromStr,
6};
7use UnicodeCategory::*;
8
/// A Unicode General Category, named by its two-letter abbreviation.
///
/// The discriminant of every variant doubles as the index of the bit that stands for it inside
/// a [`UnicodeCategorySet`], and the set iterator maps bit indexes back to variants, so the
/// numbering is spelled out explicitly and must not be changed.
#[derive(Debug, Eq, PartialEq, Copy, Clone, Hash)]
pub enum UnicodeCategory {
    /// Close punctuation.
    Pe = 0,
    /// Connector punctuation.
    Pc = 1,
    /// Control.
    Cc = 2,
    /// Currency symbol.
    Sc = 3,
    /// Dash punctuation.
    Pd = 4,
    /// Decimal number.
    Nd = 5,
    /// Enclosing mark.
    Me = 6,
    /// Final punctuation.
    Pf = 7,
    /// Format.
    Cf = 8,
    /// Initial punctuation.
    Pi = 9,
    /// Letter number.
    Nl = 10,
    /// Line separator.
    Zl = 11,
    /// Lowercase letter.
    Ll = 12,
    /// Math symbol.
    Sm = 13,
    /// Modifier letter.
    Lm = 14,
    /// Modifier symbol.
    Sk = 15,
    /// Nonspacing mark.
    Mn = 16,
    /// Open punctuation.
    Ps = 17,
    /// Other letter.
    Lo = 18,
    /// Other number.
    No = 19,
    /// Other punctuation.
    Po = 20,
    /// Other symbol.
    So = 21,
    /// Paragraph separator.
    Zp = 22,
    /// Private use.
    Co = 23,
    /// Space separator.
    Zs = 24,
    /// Spacing mark.
    Mc = 25,
    /// Surrogate.
    Cs = 26,
    /// Titlecase letter.
    Lt = 27,
    /// Unassigned.
    Cn = 28,
    /// Uppercase letter.
    Lu = 29,
}
73
74impl FromStr for UnicodeCategory {
75 type Err = error::Error;
76
77 fn from_str(s: &str) -> Result<Self, Self::Err> {
78 Ok(match s {
79 "Pe" => Pe,
80 "Pc" => Pc,
81 "Cc" => Cc,
82 "Sc" => Sc,
83 "Pd" => Pd,
84 "Nd" => Nd,
85 "Me" => Me,
86 "Pf" => Pf,
87 "Cf" => Cf,
88 "Pi" => Pi,
89 "Nl" => Nl,
90 "Zl" => Zl,
91 "Ll" => Ll,
92 "Sm" => Sm,
93 "Lm" => Lm,
94 "Sk" => Sk,
95 "Mn" => Mn,
96 "Ps" => Ps,
97 "Lo" => Lo,
98 "No" => No,
99 "Po" => Po,
100 "So" => So,
101 "Zp" => Zp,
102 "Co" => Co,
103 "Zs" => Zs,
104 "Mc" => Mc,
105 "Cs" => Cs,
106 "Lt" => Lt,
107 "Cn" => Cn,
108 "Lu" => Lu,
109 _ => return Err(Self::Err::InvalidCategory(s.to_owned().into_boxed_str())),
110 })
111 }
112}
113
114impl UnicodeCategory {
115 pub const L: UnicodeCategorySet = UnicodeCategorySet(
117 1 << Ll as u32 | 1 << Lm as u32 | 1 << Lo as u32 | 1 << Lt as u32 | 1 << Lu as u32,
118 );
119 pub const M: UnicodeCategorySet =
121 UnicodeCategorySet(1 << Mc as u32 | 1 << Me as u32 | 1 << Mn as u32);
122 pub const N: UnicodeCategorySet =
124 UnicodeCategorySet(1 << Nd as u32 | 1 << Nl as u32 | 1 << No as u32);
125 pub const P: UnicodeCategorySet = UnicodeCategorySet(
127 1 << Pc as u32
128 | 1 << Pd as u32
129 | 1 << Pe as u32
130 | 1 << Pf as u32
131 | 1 << Pi as u32
132 | 1 << Po as u32
133 | 1 << Ps as u32,
134 );
135 pub const S: UnicodeCategorySet =
137 UnicodeCategorySet(1 << Sc as u32 | 1 << Sk as u32 | 1 << Sm as u32 | 1 << So as u32);
138 pub const Z: UnicodeCategorySet =
140 UnicodeCategorySet(1 << Zp as u32 | 1 << Zs as u32 | 1 << Zl as u32);
141 pub const C: UnicodeCategorySet = UnicodeCategorySet(
143 1 << Cc as u32 | 1 << Cf as u32 | 1 << Cn as u32 | 1 << Co as u32 | 1 << Cs as u32,
144 );
145 pub const CLOSE_PUNCTUATION: UnicodeCategory = Pe;
148 pub const CONNECTOR_PUNCTUATION: UnicodeCategory = Pc;
150 pub const CONTROL: UnicodeCategory = Cc;
152 pub const CURRENCY_SYMBOL: UnicodeCategory = Sc;
154 pub const DASH_PUNCTUATION: UnicodeCategory = Pd;
156 pub const DECIMAL_NUMBER: UnicodeCategory = Nd;
158 pub const ENCLOSING_MARK: UnicodeCategory = Me;
160 pub const FINAL_PUNCTUATION: UnicodeCategory = Pf;
162 pub const FORMAT: UnicodeCategory = Cf;
164 pub const INITIAL_PUNCTUATION: UnicodeCategory = Pi;
166 pub const LETTER_NUMBER: UnicodeCategory = Nl;
168 pub const LINE_SEPARATOR: UnicodeCategory = Zl;
170 pub const LOWERCASE_LETTER: UnicodeCategory = Ll;
172 pub const MATH_SYMBOL: UnicodeCategory = Sm;
174 pub const MODIFIER_LETTER: UnicodeCategory = Lm;
176 pub const MODIFIER_SYMBOL: UnicodeCategory = Sk;
178 pub const NONSPACING_MARK: UnicodeCategory = Mn;
180 pub const OPEN_PUNCTUATION: UnicodeCategory = Ps;
182 pub const OTHER_LETTER: UnicodeCategory = Lo;
184 pub const OTHER_NUMBER: UnicodeCategory = No;
186 pub const OTHER_PUNCTUATION: UnicodeCategory = Po;
188 pub const OTHER_SYMBOL: UnicodeCategory = So;
190 pub const PARAGRAPH_SEPARATOR: UnicodeCategory = Zp;
192 pub const PRIVATE_USE: UnicodeCategory = Co;
194 pub const SPACE_SEPARATOR: UnicodeCategory = Zs;
196 pub const SPACING_MARK: UnicodeCategory = Mc;
198 pub const SURROGATE: UnicodeCategory = Cs;
200 pub const TITLECASE_LETTER: UnicodeCategory = Lt;
202 pub const UNASSIGNED: UnicodeCategory = Cn;
204 pub const UPPERCASE_LETTER: UnicodeCategory = Lu;
206
207 #[must_use]
209 pub const fn as_str(self) -> &'static str {
210 match self {
211 Pe => "Pe",
212 Pc => "Pc",
213 Cc => "Cc",
214 Sc => "Sc",
215 Pd => "Pd",
216 Nd => "Nd",
217 Me => "Me",
218 Pf => "Pf",
219 Cf => "Cf",
220 Pi => "Pi",
221 Nl => "Nl",
222 Zl => "Zl",
223 Ll => "Ll",
224 Sm => "Sm",
225 Lm => "Lm",
226 Sk => "Sk",
227 Mn => "Mn",
228 Ps => "Ps",
229 Lo => "Lo",
230 No => "No",
231 Po => "Po",
232 So => "So",
233 Zp => "Zp",
234 Co => "Co",
235 Zs => "Zs",
236 Mc => "Mc",
237 Cs => "Cs",
238 Lt => "Lt",
239 Cn => "Cn",
240 Lu => "Lu",
241 }
242 }
243}
244
245impl fmt::Display for UnicodeCategory {
246 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
247 f.write_str(self.as_str())
248 }
249}
250
/// A set of [`UnicodeCategory`] values stored as a bit mask in a single `u32`.
///
/// Bit `n` is set when the category whose discriminant is `n` belongs to the set. The type is
/// `Copy`, so methods that take `self` by value do not consume the caller's set.
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
pub struct UnicodeCategorySet(u32);
254
255impl UnicodeCategorySet {
256 #[inline]
258 #[must_use]
259 pub const fn new() -> Self {
260 Self(0)
261 }
262 #[inline]
264 #[must_use]
265 pub const fn all() -> Self {
266 Self(ALL_CATEGORIES)
267 }
268 #[inline]
270 #[must_use]
271 pub(crate) const fn from_value_unchecked(value: u32) -> Self {
272 Self(value)
273 }
274 #[inline]
276 pub fn add(&mut self, category: UnicodeCategory) {
277 self.set(category as u8);
278 }
279 #[inline]
281 pub fn remove(&mut self, category: UnicodeCategory) {
282 self.unset(category as u8);
283 }
284 #[inline]
286 #[must_use]
287 pub const fn contains(self, category: UnicodeCategory) -> bool {
288 self.is_set(category as u8)
289 }
290 #[inline]
292 #[must_use]
293 pub const fn len(self) -> usize {
294 self.0.count_ones() as usize
295 }
296 #[inline]
298 #[must_use]
299 pub const fn is_empty(self) -> bool {
300 self.0 == 0
301 }
302 #[inline]
304 #[must_use]
305 pub const fn into_value(self) -> u32 {
306 self.0
307 }
308 #[inline]
310 #[must_use]
311 pub const fn iter(self) -> Iter {
312 Iter { data: self }
313 }
314 #[inline]
316 #[allow(clippy::integer_arithmetic)]
317 pub(crate) fn set(&mut self, index: u8) {
318 self.0 |= 1 << index;
319 }
320 #[inline]
322 #[allow(clippy::integer_arithmetic)]
323 pub(crate) fn unset(&mut self, index: u8) {
324 self.0 &= !(1 << index);
325 }
326 #[inline]
328 #[allow(clippy::integer_arithmetic)]
329 const fn is_set(self, index: u8) -> bool {
330 self.0 & (1 << index) != 0
331 }
332}
333
334impl Default for UnicodeCategorySet {
335 #[inline]
336 fn default() -> Self {
337 UnicodeCategorySet::new()
338 }
339}
340
341impl fmt::Display for UnicodeCategorySet {
342 #[allow(clippy::integer_arithmetic)]
344 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
345 let len = self.len();
346 for (idx, category) in self.iter().enumerate() {
347 f.write_str(category.as_str())?;
348 if idx + 1 != len {
349 f.write_str(", ")?;
350 }
351 }
352 Ok(())
353 }
354}
355
356impl BitOr for UnicodeCategory {
357 type Output = UnicodeCategorySet;
358
359 #[inline]
361 #[allow(clippy::integer_arithmetic)]
362 fn bitor(self, rhs: Self) -> Self::Output {
363 UnicodeCategorySet(1 << self as u8 | 1 << rhs as u8)
364 }
365}
366impl BitOr<UnicodeCategorySet> for UnicodeCategory {
367 type Output = UnicodeCategorySet;
368
369 #[inline]
370 fn bitor(self, rhs: UnicodeCategorySet) -> Self::Output {
371 rhs | self
373 }
374}
375
376impl BitOr<UnicodeCategory> for UnicodeCategorySet {
377 type Output = Self;
378
379 #[inline]
381 #[allow(clippy::integer_arithmetic)]
382 fn bitor(self, rhs: UnicodeCategory) -> Self::Output {
383 Self(self.into_value() | 1 << rhs as u8)
384 }
385}
386
387impl BitOr<UnicodeCategorySet> for UnicodeCategorySet {
388 type Output = Self;
389
390 #[inline]
391 fn bitor(self, rhs: UnicodeCategorySet) -> Self::Output {
392 Self(self.into_value() | rhs.into_value())
393 }
394}
395
396impl BitOrAssign<UnicodeCategorySet> for UnicodeCategorySet {
397 #[inline]
398 fn bitor_assign(&mut self, rhs: UnicodeCategorySet) {
399 self.0 |= rhs.into_value();
400 }
401}
402
403impl BitOrAssign<UnicodeCategory> for UnicodeCategorySet {
404 #[inline]
405 fn bitor_assign(&mut self, rhs: UnicodeCategory) {
406 self.add(rhs);
407 }
408}
409
/// Iterator over the categories contained in a [`UnicodeCategorySet`].
///
/// It owns a copy of the set and clears each bit as the matching category is yielded, so the
/// set it was created from is left untouched.
#[derive(Debug)]
pub struct Iter {
    // Categories that have not been yielded yet.
    data: UnicodeCategorySet,
}
414
415impl Iterator for Iter {
416 type Item = UnicodeCategory;
417
418 fn next(&mut self) -> Option<Self::Item> {
419 #[allow(clippy::cast_possible_truncation)]
421 let index = self.data.0.trailing_zeros() as u8;
422 let category = match index {
423 0 => Pe,
424 1 => Pc,
425 2 => Cc,
426 3 => Sc,
427 4 => Pd,
428 5 => Nd,
429 6 => Me,
430 7 => Pf,
431 8 => Cf,
432 9 => Pi,
433 10 => Nl,
434 11 => Zl,
435 12 => Ll,
436 13 => Sm,
437 14 => Lm,
438 15 => Sk,
439 16 => Mn,
440 17 => Ps,
441 18 => Lo,
442 19 => No,
443 20 => Po,
444 21 => So,
445 22 => Zp,
446 23 => Co,
447 24 => Zs,
448 25 => Mc,
449 26 => Cs,
450 27 => Lt,
451 28 => Cn,
452 29 => Lu,
453 _ => return None,
454 };
455 self.data.unset(index);
456 Some(category)
457 }
458}
459
460impl ExactSizeIterator for Iter {
461 #[inline]
462 fn len(&self) -> usize {
463 self.data.len()
464 }
465}
466
467impl From<UnicodeCategory> for UnicodeCategorySet {
468 #[inline]
470 #[allow(clippy::integer_arithmetic)]
471 fn from(category: UnicodeCategory) -> Self {
472 Self::from_value_unchecked(1 << category as u8)
473 }
474}
475
476impl From<UnicodeCategory> for Option<UnicodeCategorySet> {
477 #[inline]
478 fn from(category: UnicodeCategory) -> Self {
479 Some(category.into())
480 }
481}
482
483#[inline]
485#[must_use]
486pub const fn merge(
487 include: Option<UnicodeCategorySet>,
488 exclude: UnicodeCategorySet,
489) -> UnicodeCategorySet {
490 if let Some(include) = include {
491 if include.is_empty() {
492 include
494 } else {
495 UnicodeCategorySet::from_value_unchecked(
496 (ALL_CATEGORIES ^ exclude.into_value()) & include.into_value(),
497 )
498 }
499 } else {
500 UnicodeCategorySet::from_value_unchecked(ALL_CATEGORIES ^ exclude.into_value())
501 }
502}
503
// Unit tests for category parsing, the derived traits, set operations, iteration and `merge`.
#[cfg(test)]
mod tests {
    use super::*;
    use std::{
        collections::hash_map::DefaultHasher,
        hash::{Hash, Hasher},
    };
    use test_case::test_case;

    // An unknown abbreviation is rejected and the error message quotes the input.
    #[test]
    fn test_category_from_str_error() {
        assert_eq!(
            UnicodeCategory::from_str("wrong")
                .expect_err("Should fail")
                .to_string(),
            "'wrong' is not a valid Unicode category"
        );
    }

    // Exercises the derived `Hash`, `Clone` and `Debug` impls of `UnicodeCategory`.
    #[test]
    #[allow(clippy::clone_on_copy)]
    fn test_category_traits() {
        let mut hasher = DefaultHasher::new();
        Ll.hash(&mut hasher);
        hasher.finish();
        let _ = Ll.clone();
        assert_eq!(format!("{Ll:?}"), "Ll");
    }

    // The `L` group constant equals the union of the five letter categories.
    #[test]
    fn test_single_letter_categories() {
        assert_eq!(UnicodeCategory::L, Ll | Lm | Lo | Lt | Lu);
    }

    // A set is displayed as comma-separated abbreviations in ascending bit order.
    #[test]
    fn test_set_display() {
        assert_eq!(UnicodeCategory::L.to_string(), "Ll, Lm, Lo, Lt, Lu");
    }

    // Adding a category to an empty set makes it a one-element set.
    #[test]
    fn test_set_add() {
        let mut set = UnicodeCategorySet::new();
        assert!(set.is_empty());
        set.add(Ll);
        assert!(set.contains(Ll));
        assert_eq!(set.len(), 1);
    }

    // Removing a category from the full set makes `contains` report it as absent.
    #[test]
    fn test_set_remove() {
        let mut set = UnicodeCategorySet::all();
        assert!(set.contains(Ll));
        set.remove(Ll);
        assert!(!set.contains(Ll));
    }

    // Exercises the derived `Hash`, `Clone` and `Debug` impls of `UnicodeCategorySet`.
    #[test]
    #[allow(clippy::clone_on_copy)]
    fn test_category_set_traits() {
        let set = UnicodeCategory::L;
        let mut hasher = DefaultHasher::new();
        set.hash(&mut hasher);
        hasher.finish();
        let _ = set.clone();
        // 671371264 is the raw mask of `L`: bits 12, 14, 18, 27 and 29.
        assert_eq!(format!("{set:?}"), "UnicodeCategorySet(671371264)");
    }

    // The derived `Debug` of `Iter` shows the wrapped set.
    #[test]
    fn test_iter_traits() {
        let set = UnicodeCategory::L;
        let iter = set.iter();
        assert_eq!(
            format!("{iter:?}"),
            "Iter { data: UnicodeCategorySet(671371264) }"
        );
    }

    // Covers `category | set`, `set | set` and both `|=` forms.
    #[test]
    fn test_bit_or() {
        assert_eq!(Ll | UnicodeCategorySet::new(), Ll.into());
        assert_eq!(
            UnicodeCategory::L | UnicodeCategory::C,
            Ll | Lm | Lo | Lt | Lu | Cs | Cc | Cf | Cn | Co
        );
        let mut set = UnicodeCategorySet::new();
        set |= Ll;
        set |= UnicodeCategory::C;
        assert_eq!(set, Ll | Cs | Cc | Cf | Cn | Co);
    }

    // Iterating the full set yields `len()` items; each one round-trips through
    // `Display`/`FromStr`, and re-adding them all rebuilds the full set.
    #[test]
    fn test_set_iter() {
        let all_categories = UnicodeCategorySet::all();
        assert_eq!(all_categories.iter().len(), all_categories.len());
        let mut set = UnicodeCategorySet::new();
        for category in all_categories.iter() {
            let name = format!("{category}");
            assert_eq!(
                UnicodeCategory::from_str(&name).expect("Invalid category"),
                category
            );
            set.add(category);
        }
        assert_eq!(all_categories, set);
    }

    // `Default` produces the same value as `new`.
    #[test]
    fn test_set_default() {
        assert_eq!(UnicodeCategorySet::default(), UnicodeCategorySet::new());
    }

    // A single category converts into `Some` one-element set.
    #[test]
    fn test_set_option_from_category() {
        let set: Option<UnicodeCategorySet> = Ll.into();
        assert!(set.is_some());
        assert_eq!(set.expect("Unexpected `None`"), Ll.into());
    }

    // `merge` cases, in order: include minus exclude; no include (everything minus exclude);
    // empty include (always empty).
    #[test_case(Some(Lu | Me | Cs | So), So.into(), Lu | Me | Cs)]
    #[test_case(None, UnicodeCategory::L | UnicodeCategory::M | UnicodeCategory::N | UnicodeCategory::P | UnicodeCategory::S, UnicodeCategory::Z | UnicodeCategory::C)]
    #[test_case(
        Some(UnicodeCategorySet::new()),
        UnicodeCategorySet::new(),
        UnicodeCategorySet::new()
    )]
    fn test_category_merge(
        include: Option<UnicodeCategorySet>,
        exclude: UnicodeCategorySet,
        expected: UnicodeCategorySet,
    ) {
        assert_eq!(merge(include, exclude), expected);
    }
}