1#![warn(
88 clippy::cast_possible_truncation,
89 clippy::doc_markdown,
90 clippy::explicit_iter_loop,
91 clippy::map_unwrap_or,
92 clippy::match_same_arms,
93 clippy::needless_borrow,
94 clippy::needless_pass_by_value,
95 clippy::print_stdout,
96 clippy::redundant_closure,
97 clippy::trivially_copy_pass_by_ref,
98 missing_debug_implementations,
99 missing_docs,
100 trivial_casts,
101 trivial_numeric_casts,
102 unused_extern_crates,
103 unused_import_braces,
104 variant_size_differences,
105 clippy::integer_arithmetic,
106 clippy::unwrap_used,
107 clippy::semicolon_if_nothing_returned,
108 clippy::cargo
109)]
110#![allow(clippy::redundant_static_lifetimes)]
111use crate::constants::MAX_CODEPOINT;
112use core::fmt;
113use std::str::FromStr;
114
115mod categories;
116mod constants;
117mod error;
118mod intervals;
119mod intervalset;
120mod query;
121mod tables;
122pub use crate::{
123 categories::{UnicodeCategory, UnicodeCategorySet},
124 error::Error,
125 intervalset::IntervalSet,
126};
127
/// Re-exports of otherwise private helpers.
///
/// Only compiled when the `__benchmark_internals` feature is enabled.
#[cfg(feature = "__benchmark_internals")]
pub mod internals {
    /// Items re-exported from the private `categories` module.
    pub mod categories {
        pub use crate::categories::merge;
    }

    /// Items re-exported from the private `intervals` module.
    pub mod intervals {
        pub use crate::intervals::from_str;
        pub use crate::intervals::merge;
        pub use crate::intervals::subtract;
    }

    /// Items re-exported from the private `query` module.
    pub mod query {
        pub use crate::query::intervals_for_set;
        pub use crate::query::query;
    }
}
146
/// A range of codepoints stored as a `(start, end)` pair; both bounds are inclusive.
pub type Interval = (u32, u32);
149
/// Unicode versions for which this crate ships interval tables.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum UnicodeVersion {
    /// Unicode 9.0.0
    V9_0_0,
    /// Unicode 10.0.0
    V10_0_0,
    /// Unicode 11.0.0
    V11_0_0,
    /// Unicode 12.0.0
    V12_0_0,
    /// Unicode 12.1.0
    V12_1_0,
    /// Unicode 13.0.0
    V13_0_0,
    /// Unicode 14.0.0
    V14_0_0,
    /// Unicode 15.0.0
    V15_0_0,
}
170
171impl fmt::Display for UnicodeVersion {
172 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
173 f.write_str(self.as_str())
174 }
175}
176
177impl FromStr for UnicodeVersion {
178 type Err = Error;
179
180 fn from_str(s: &str) -> Result<Self, Self::Err> {
181 match s {
182 "9.0.0" => Ok(UnicodeVersion::V9_0_0),
183 "10.0.0" => Ok(UnicodeVersion::V10_0_0),
184 "11.0.0" => Ok(UnicodeVersion::V11_0_0),
185 "12.0.0" => Ok(UnicodeVersion::V12_0_0),
186 "12.1.0" => Ok(UnicodeVersion::V12_1_0),
187 "13.0.0" => Ok(UnicodeVersion::V13_0_0),
188 "14.0.0" => Ok(UnicodeVersion::V14_0_0),
189 "15.0.0" => Ok(UnicodeVersion::V15_0_0),
190 _ => Err(Error::InvalidVersion(s.to_string().into_boxed_str())),
191 }
192 }
193}
194
195impl UnicodeVersion {
196 #[must_use]
198 pub const fn as_str(self) -> &'static str {
199 match self {
200 UnicodeVersion::V9_0_0 => "9.0.0",
201 UnicodeVersion::V10_0_0 => "10.0.0",
202 UnicodeVersion::V11_0_0 => "11.0.0",
203 UnicodeVersion::V12_0_0 => "12.0.0",
204 UnicodeVersion::V12_1_0 => "12.1.0",
205 UnicodeVersion::V13_0_0 => "13.0.0",
206 UnicodeVersion::V14_0_0 => "14.0.0",
207 UnicodeVersion::V15_0_0 => "15.0.0",
208 }
209 }
210 #[must_use]
212 pub const fn latest() -> UnicodeVersion {
213 UnicodeVersion::V15_0_0
214 }
215 #[inline]
218 #[must_use]
219 pub const fn table(self) -> &'static [&'static [Interval]] {
220 match self {
221 UnicodeVersion::V9_0_0 => tables::v9_0_0::BY_NAME,
222 UnicodeVersion::V10_0_0 => tables::v10_0_0::BY_NAME,
223 UnicodeVersion::V11_0_0 => tables::v11_0_0::BY_NAME,
224 UnicodeVersion::V12_0_0 => tables::v12_0_0::BY_NAME,
225 UnicodeVersion::V12_1_0 => tables::v12_1_0::BY_NAME,
226 UnicodeVersion::V13_0_0 => tables::v13_0_0::BY_NAME,
227 UnicodeVersion::V14_0_0 => tables::v14_0_0::BY_NAME,
228 UnicodeVersion::V15_0_0 => tables::v15_0_0::BY_NAME,
229 }
230 }
231
232 #[inline]
234 #[must_use]
235 pub const fn intervals_for(self, category: UnicodeCategory) -> &'static [Interval] {
236 self.table()[category as usize]
237 }
238
239 #[inline]
241 #[must_use]
242 pub const fn normalized_categories(self) -> [UnicodeCategory; 30] {
243 let mut lengths: [(UnicodeCategory, usize); 30] = [(UnicodeCategory::Cc, 0); 30];
245 let mut idx = 0;
246 let table = self.table();
247 let categories = [
248 UnicodeCategory::Pe,
249 UnicodeCategory::Pc,
250 UnicodeCategory::Cc,
251 UnicodeCategory::Sc,
252 UnicodeCategory::Pd,
253 UnicodeCategory::Nd,
254 UnicodeCategory::Me,
255 UnicodeCategory::Pf,
256 UnicodeCategory::Cf,
257 UnicodeCategory::Pi,
258 UnicodeCategory::Nl,
259 UnicodeCategory::Zl,
260 UnicodeCategory::Ll,
261 UnicodeCategory::Sm,
262 UnicodeCategory::Lm,
263 UnicodeCategory::Sk,
264 UnicodeCategory::Mn,
265 UnicodeCategory::Ps,
266 UnicodeCategory::Lo,
267 UnicodeCategory::No,
268 UnicodeCategory::Po,
269 UnicodeCategory::So,
270 UnicodeCategory::Zp,
271 UnicodeCategory::Co,
272 UnicodeCategory::Zs,
273 UnicodeCategory::Mc,
274 UnicodeCategory::Cs,
275 UnicodeCategory::Lt,
276 UnicodeCategory::Cn,
277 UnicodeCategory::Lu,
278 ];
279 #[allow(clippy::integer_arithmetic)]
281 while idx < table.len() {
282 lengths[idx] = (categories[idx], table[idx].len());
283 idx += 1;
284 }
285 loop {
289 let mut swapped = false;
290 let mut idx = 1;
291 #[allow(clippy::integer_arithmetic)]
293 while idx < lengths.len() {
294 if lengths[idx - 1].1 > lengths[idx].1 {
295 let left = lengths[idx - 1];
296 let right = lengths[idx];
297 lengths[idx - 1] = right;
298 lengths[idx] = left;
299 swapped = true;
300 }
301 idx += 1;
302 }
303 if !swapped {
304 break;
305 }
306 }
307
308 let mut output: [UnicodeCategory; 30] = [
310 UnicodeCategory::Cc,
311 UnicodeCategory::Cc,
312 UnicodeCategory::Cc,
313 UnicodeCategory::Cc,
314 UnicodeCategory::Cc,
315 UnicodeCategory::Cc,
316 UnicodeCategory::Cc,
317 UnicodeCategory::Cc,
318 UnicodeCategory::Cc,
319 UnicodeCategory::Cc,
320 UnicodeCategory::Cc,
321 UnicodeCategory::Cc,
322 UnicodeCategory::Cc,
323 UnicodeCategory::Cc,
324 UnicodeCategory::Cc,
325 UnicodeCategory::Cc,
326 UnicodeCategory::Cc,
327 UnicodeCategory::Cc,
328 UnicodeCategory::Cc,
329 UnicodeCategory::Cc,
330 UnicodeCategory::Cc,
331 UnicodeCategory::Cc,
332 UnicodeCategory::Cc,
333 UnicodeCategory::Cc,
334 UnicodeCategory::Cc,
335 UnicodeCategory::Cc,
336 UnicodeCategory::Cc,
337 UnicodeCategory::Cc,
338 UnicodeCategory::Cc,
339 UnicodeCategory::Cs,
340 ];
341 let mut idx = 0;
342 let mut ptr = 0;
343
344 while idx < lengths.len() {
345 let (category, _) = lengths[idx];
346 #[allow(clippy::integer_arithmetic)]
348 if category as u8 == UnicodeCategory::Cc as u8
349 || category as u8 == UnicodeCategory::Cs as u8
350 {
351 idx += 1;
352 } else {
353 output[ptr] = category;
354 ptr += 1;
355 idx += 1;
356 }
357 }
358 output
359 }
360
361 #[must_use]
363 #[inline]
364 pub fn query<'a>(self) -> IntervalQuery<'a> {
365 IntervalQuery::new(self)
366 }
367
368 pub fn intervals<'a>(
375 self,
376 include_categories: impl Into<Option<UnicodeCategorySet>>,
377 exclude_categories: impl Into<Option<UnicodeCategorySet>>,
378 include_characters: impl Into<Option<&'a str>>,
379 exclude_characters: impl Into<Option<&'a str>>,
380 min_codepoint: impl Into<Option<u32>>,
381 max_codepoint: impl Into<Option<u32>>,
382 ) -> Result<Vec<Interval>, Error> {
383 let exclude_categories: UnicodeCategorySet = exclude_categories
384 .into()
385 .unwrap_or_else(UnicodeCategorySet::new);
386 let min_codepoint = min_codepoint.into().unwrap_or(0);
387 let max_codepoint = max_codepoint.into().unwrap_or(MAX_CODEPOINT);
388 self.intervals_impl(
389 include_categories.into(),
390 exclude_categories,
391 include_characters.into(),
392 exclude_characters.into(),
393 min_codepoint,
394 max_codepoint,
395 )
396 }
397
398 fn intervals_impl(
399 self,
400 include_categories: Option<UnicodeCategorySet>,
401 exclude_categories: UnicodeCategorySet,
402 include_characters: Option<&str>,
403 exclude_characters: Option<&str>,
404 min_codepoint: u32,
405 max_codepoint: u32,
406 ) -> Result<Vec<Interval>, Error> {
407 if min_codepoint > MAX_CODEPOINT || max_codepoint > MAX_CODEPOINT {
408 return Err(Error::CodepointNotInRange(min_codepoint, max_codepoint));
409 }
410 if min_codepoint > max_codepoint {
411 return Err(Error::InvalidCodepoints(min_codepoint, max_codepoint));
412 }
413 Ok(query::query(
414 self,
415 include_categories,
416 exclude_categories,
417 include_characters.unwrap_or(""),
418 exclude_characters.unwrap_or(""),
419 min_codepoint,
420 max_codepoint,
421 ))
422 }
423
424 pub fn interval_set<'a>(
431 self,
432 include_categories: impl Into<Option<UnicodeCategorySet>>,
433 exclude_categories: impl Into<Option<UnicodeCategorySet>>,
434 include_characters: impl Into<Option<&'a str>>,
435 exclude_characters: impl Into<Option<&'a str>>,
436 min_codepoint: impl Into<Option<u32>>,
437 max_codepoint: impl Into<Option<u32>>,
438 ) -> Result<IntervalSet, Error> {
439 let intervals = self.intervals(
440 include_categories,
441 exclude_categories,
442 include_characters,
443 exclude_characters,
444 min_codepoint,
445 max_codepoint,
446 )?;
447 Ok(IntervalSet::new(intervals))
448 }
449}
450
451#[derive(Debug, Clone, PartialEq)]
470pub struct IntervalQuery<'a> {
471 version: UnicodeVersion,
472 include_categories: Option<UnicodeCategorySet>,
473 exclude_categories: Option<UnicodeCategorySet>,
474 include_characters: Option<&'a str>,
475 exclude_characters: Option<&'a str>,
476 min_codepoint: u32,
477 max_codepoint: u32,
478}
479
480impl<'a> IntervalQuery<'a> {
481 fn new(version: UnicodeVersion) -> IntervalQuery<'a> {
482 IntervalQuery {
483 version,
484 include_categories: None,
485 exclude_categories: None,
486 include_characters: None,
487 exclude_characters: None,
488 min_codepoint: 0,
489 max_codepoint: MAX_CODEPOINT,
490 }
491 }
492 #[must_use]
494 pub fn include_categories(
495 mut self,
496 include_categories: impl Into<Option<UnicodeCategorySet>>,
497 ) -> IntervalQuery<'a> {
498 self.include_categories = include_categories.into();
499 self
500 }
501 #[must_use]
503 pub fn exclude_categories(
504 mut self,
505 exclude_categories: impl Into<Option<UnicodeCategorySet>>,
506 ) -> IntervalQuery<'a> {
507 self.exclude_categories = exclude_categories.into();
508 self
509 }
510 #[must_use]
512 pub fn include_characters(mut self, include_characters: &'a str) -> IntervalQuery<'a> {
513 self.include_characters = Some(include_characters);
514 self
515 }
516 #[must_use]
518 pub fn exclude_characters(mut self, exclude_characters: &'a str) -> IntervalQuery<'a> {
519 self.exclude_characters = Some(exclude_characters);
520 self
521 }
522 #[must_use]
524 pub fn min_codepoint(mut self, min_codepoint: u32) -> IntervalQuery<'a> {
525 self.min_codepoint = min_codepoint;
526 self
527 }
528 #[must_use]
530 pub fn max_codepoint(mut self, max_codepoint: u32) -> IntervalQuery<'a> {
531 self.max_codepoint = max_codepoint;
532 self
533 }
534 pub fn intervals(&self) -> Result<Vec<Interval>, Error> {
541 self.version.intervals(
542 self.include_categories,
543 self.exclude_categories,
544 self.include_characters,
545 self.exclude_characters,
546 self.min_codepoint,
547 self.max_codepoint,
548 )
549 }
550 pub fn interval_set(&self) -> Result<IntervalSet, Error> {
557 self.version.interval_set(
558 self.include_categories,
559 self.exclude_categories,
560 self.include_characters,
561 self.exclude_characters,
562 self.min_codepoint,
563 self.max_codepoint,
564 )
565 }
566}
567
568pub fn query<'a>() -> IntervalQuery<'a> {
572 UnicodeVersion::latest().query()
573}
574
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    use test_case::test_case;

    /// `Pc` intervals are clipped to the optional codepoint bounds.
    #[test_case(None, None, &[(95, 95), (8255, 8256), (8276, 8276), (65075, 65076), (65101, 65103), (65343, 65343)])]
    #[test_case(None, Some(128), &[(95, 95)])]
    #[test_case(Some(65077), None, &[(65101, 65103), (65343, 65343)])]
    #[test_case(Some(65076), Some(65102), &[(65076, 65076), (65101, 65102)])]
    fn test_intervals(
        min_codepoint: Option<u32>,
        max_codepoint: Option<u32>,
        expected: &[Interval],
    ) {
        let version = UnicodeVersion::V15_0_0;
        let actual = version
            .intervals(
                UnicodeCategory::Pc,
                None,
                None,
                None,
                min_codepoint,
                max_codepoint,
            )
            .expect("Invalid query");
        assert_eq!(actual, expected);
    }

    /// 'A' is the first codepoint of the uppercase letters below 128.
    #[test]
    fn test_interval_set() {
        let uppercase = UnicodeVersion::V15_0_0
            .interval_set(UnicodeCategory::Lu, None, None, None, None, 128)
            .expect("Invalid query");
        assert_eq!(uppercase.index_of('A'), Some(0));
    }

    /// An unfiltered top-level query covers the whole codepoint range.
    #[test]
    fn test_top_level_query() {
        let everything = query().intervals().expect("Invalid query");
        assert_eq!(everything, vec![(0, MAX_CODEPOINT)]);
    }

    #[test]
    fn test_query_include_only_characters() {
        let builder = UnicodeVersion::V15_0_0
            .query()
            .include_categories(UnicodeCategory::Pc)
            .min_codepoint(0)
            .max_codepoint(50)
            .include_characters("abc");
        let actual = builder.intervals().expect("Invalid query");
        assert_eq!(actual, &[(97, 99)]);
    }

    #[test]
    fn test_query_exclude_only_characters() {
        let builder = UnicodeVersion::V15_0_0
            .query()
            .include_categories(UnicodeCategory::UPPERCASE_LETTER)
            .max_codepoint(90)
            .exclude_characters("ABC");
        let actual = builder.intervals().expect("Invalid query");
        assert_eq!(actual, &[(68, 90)]);
    }

    #[test]
    fn test_query_exclude_categories() {
        let builder = UnicodeVersion::V15_0_0
            .query()
            .exclude_categories(UnicodeCategory::UPPERCASE_LETTER)
            .max_codepoint(90);
        let actual = builder.intervals().expect("Invalid query");
        assert_eq!(actual, &[(0, 64)]);
    }

    /// Explicitly included characters are added to the category's intervals.
    #[test]
    fn test_query_include_category_and_characters() {
        let actual = UnicodeVersion::V15_0_0
            .intervals(UnicodeCategory::Pc, None, "abc", None, None, None)
            .expect("Invalid query");
        let expected = &[
            (95, 95),
            (97, 99),
            (8255, 8256),
            (8276, 8276),
            (65075, 65076),
            (65101, 65103),
            (65343, 65343),
        ];
        assert_eq!(actual, expected);
    }

    /// Both evaluation methods report the same error for invalid bounds.
    #[test_case(
        1073741824,
        2147483648,
        "Codepoints should be in [0; 1114111] range. Got: [1073741824; 2147483648]"
    )]
    #[test_case(
        0,
        2147483648,
        "Codepoints should be in [0; 1114111] range. Got: [0; 2147483648]"
    )]
    #[test_case(
        5,
        1,
        "Minimum codepoint should be less or equal than maximum codepoint. Got 5 < 1"
    )]
    fn test_query_invalid_codepoints(min_codepoint: u32, max_codepoint: u32, expected: &str) {
        let invalid = UnicodeVersion::V15_0_0
            .query()
            .min_codepoint(min_codepoint)
            .max_codepoint(max_codepoint);

        let from_intervals = invalid.intervals().expect_err("Should error");
        assert_eq!(from_intervals.to_string(), expected);

        let from_interval_set = invalid.interval_set().expect_err("Should error");
        assert_eq!(from_interval_set.to_string(), expected);
    }

    #[test]
    fn test_intervals_for() {
        let connectors = UnicodeVersion::V15_0_0.intervals_for(UnicodeCategory::Pc);
        assert_eq!(
            connectors,
            &[
                (95, 95),
                (8255, 8256),
                (8276, 8276),
                (65075, 65076),
                (65101, 65103),
                (65343, 65343),
            ]
        );
    }

    /// Categories come out ordered by interval count, with `Cc` and `Cs` last.
    #[test]
    fn test_normalized_categories() {
        let expected = [
            UnicodeCategory::Zl,
            UnicodeCategory::Zp,
            UnicodeCategory::Co,
            UnicodeCategory::Me,
            UnicodeCategory::Pc,
            UnicodeCategory::Zs,
            UnicodeCategory::Pf,
            UnicodeCategory::Lt,
            UnicodeCategory::Pi,
            UnicodeCategory::Nl,
            UnicodeCategory::Pd,
            UnicodeCategory::Sc,
            UnicodeCategory::Cf,
            UnicodeCategory::Sk,
            UnicodeCategory::Nd,
            UnicodeCategory::Sm,
            UnicodeCategory::Lm,
            UnicodeCategory::No,
            UnicodeCategory::Pe,
            UnicodeCategory::Ps,
            UnicodeCategory::Mc,
            UnicodeCategory::So,
            UnicodeCategory::Po,
            UnicodeCategory::Mn,
            UnicodeCategory::Lo,
            UnicodeCategory::Lu,
            UnicodeCategory::Ll,
            UnicodeCategory::Cn,
            UnicodeCategory::Cc,
            UnicodeCategory::Cs,
        ];
        assert_eq!(UnicodeVersion::V15_0_0.normalized_categories(), expected);
    }

    /// Merging every category of a version must cover the full codepoint range.
    #[test_case(UnicodeVersion::V9_0_0)]
    #[test_case(UnicodeVersion::V10_0_0)]
    #[test_case(UnicodeVersion::V11_0_0)]
    #[test_case(UnicodeVersion::V12_0_0)]
    #[test_case(UnicodeVersion::V12_1_0)]
    #[test_case(UnicodeVersion::V13_0_0)]
    #[test_case(UnicodeVersion::V14_0_0)]
    #[test_case(UnicodeVersion::V15_0_0)]
    fn test_successive_union(version: UnicodeVersion) {
        let mut combined: Vec<Interval> = version
            .table()
            .iter()
            .flat_map(|category| category.iter().copied())
            .collect();
        intervals::merge(&mut combined);
        assert_eq!(combined, vec![(0, MAX_CODEPOINT)]);
    }

    /// `Display` output matches the expected string and parses back to the same version.
    #[test_case(UnicodeVersion::V9_0_0, "9.0.0")]
    #[test_case(UnicodeVersion::V10_0_0, "10.0.0")]
    #[test_case(UnicodeVersion::V11_0_0, "11.0.0")]
    #[test_case(UnicodeVersion::V12_0_0, "12.0.0")]
    #[test_case(UnicodeVersion::V12_1_0, "12.1.0")]
    #[test_case(UnicodeVersion::V13_0_0, "13.0.0")]
    #[test_case(UnicodeVersion::V14_0_0, "14.0.0")]
    #[test_case(UnicodeVersion::V15_0_0, "15.0.0")]
    fn test_display(version: UnicodeVersion, expected: &str) {
        let rendered = version.to_string();
        assert_eq!(rendered, expected);
        let reparsed = UnicodeVersion::from_str(&rendered).expect("Invalid version");
        assert_eq!(reparsed, version);
    }

    #[test_case("9.0.0", UnicodeVersion::V9_0_0)]
    #[test_case("10.0.0", UnicodeVersion::V10_0_0)]
    #[test_case("11.0.0", UnicodeVersion::V11_0_0)]
    #[test_case("12.0.0", UnicodeVersion::V12_0_0)]
    #[test_case("12.1.0", UnicodeVersion::V12_1_0)]
    #[test_case("13.0.0", UnicodeVersion::V13_0_0)]
    #[test_case("14.0.0", UnicodeVersion::V14_0_0)]
    #[test_case("15.0.0", UnicodeVersion::V15_0_0)]
    fn test_version_from_str(version: &str, expected: UnicodeVersion) {
        let parsed = UnicodeVersion::from_str(version).expect("Invalid version");
        assert_eq!(parsed, expected);
    }

    #[test]
    fn test_version_from_str_error() {
        let error = UnicodeVersion::from_str("invalid").expect_err("Should fail");
        assert_eq!(
            error.to_string(),
            "'invalid' is not a valid Unicode version"
        );
    }

    /// Exercises the derived `Hash`, `Clone` and `Debug` implementations.
    #[test]
    #[allow(clippy::clone_on_copy)]
    fn test_unicode_version_traits() {
        let version = UnicodeVersion::V15_0_0;
        let mut hasher = DefaultHasher::new();
        version.hash(&mut hasher);
        let _digest = hasher.finish();
        let _ = version.clone();
        assert_eq!(format!("{version:?}"), "V15_0_0");
    }

    /// Exercises the derived `Clone`, `Debug` and `PartialEq` implementations.
    #[test]
    fn test_interval_query_traits() {
        let query = UnicodeVersion::V15_0_0.query();
        let _ = query.clone();
        let rendered = format!("{query:?}");
        assert_eq!(
            rendered,
            "IntervalQuery { version: V15_0_0, include_categories: None, exclude_categories: None, include_characters: None, exclude_characters: None, min_codepoint: 0, max_codepoint: 1114111 }"
        );
        assert_eq!(query, query);
    }
}