pomsky_syntax/exprs/char_class/mod.rs
//! Implements _character classes_. The analogues in the regex world are
2//! [character classes](https://www.regular-expressions.info/charclass.html),
3//! [shorthand character classes](https://www.regular-expressions.info/shorthand.html),
4//! [non-printable characters](https://www.regular-expressions.info/nonprint.html),
5//! [Unicode categories/scripts/blocks](https://www.regular-expressions.info/unicode.html#category),
6//! [POSIX classes](https://www.regular-expressions.info/posixbrackets.html#class) and the
7//! [dot](https://www.regular-expressions.info/dot.html).
8//!
//! All kinds of character classes mentioned above require `[` square brackets
//! `]` in Pomsky. A character class can be negated by putting a `!` in front
//! of the opening bracket. For example, `![.]` compiles to `\n`.
12//!
13//! ## Items
14//!
15//! A character class can contain multiple _items_, which can be
16//!
17//! - A __code point__, e.g. `['a']` or `[U+107]`
18//!
19//! - This includes [non-printable characters](https://www.regular-expressions.info/nonprint.html).\
20//! Supported are `[n]`, `[r]`, `[t]`, `[a]`, `[e]` and `[f]`.
21//!
22//! - A __range of code points__. For example, `[U+10 - U+200]` matches any code
23//! point P where `U+10 ≤ P ≤ U+200`
24//!
25//! - A __named character class__, which can be one of
26//!
27//! - a [shorthand character class](https://www.regular-expressions.info/shorthand.html).\
28//! Supported are `[w]`, `[d]`, `[s]`, `[h]`, `[v]` and `[R]`.
29//!
30//! - a [POSIX class](https://www.regular-expressions.info/posixbrackets.html#class).\
//! Supported are `[ascii_alnum]`, `[ascii_alpha]`, `[ascii]`,
//! `[ascii_blank]`, `[ascii_cntrl]`, `[ascii_digit]`, `[ascii_graph]`,
//! `[ascii_lower]`, `[ascii_print]`, `[ascii_punct]`, `[ascii_space]`,
//! `[ascii_upper]`, `[ascii_word]` and `[ascii_xdigit]`.\
//! _Note_: POSIX classes are not Unicode aware!\
//! _Note_: They're converted to ranges, e.g. `[ascii_alpha]` = `[a-zA-Z]`.
37//!
38//! - a [Unicode category, script or block](https://www.regular-expressions.info/unicode.html#category).\
//! For example: `[Letter]` compiles to `\p{Letter}`. Pomsky currently
//! treats any uppercase identifier except `R` as a Unicode class.
41//!
42//! ## Compilation
43//!
44//! When a character class contains only a single item (e.g. `[w]`), the
45//! character class is "flattened":
46//!
47//! - `['a']` = `a`
48//! - `[w]` = `\w`
49//! - `[Letter]` = `\p{Letter}`
50//!
51//! When there is more than one item or a range (e.g. `['a'-'z' '!']`), a regex
52//! character class is created:
53//!
54//! - `['a'-'z' '!']` = `[a-z!]`
55//! - `[w e Punctuation]` = `[\w\e\p{Punctuation}]`
56//!
57//! ### Negation
58//!
59//! Negation is implemented as follows:
60//!
61//! - Ranges and chars such as `!['a'-'z' '!' e]` are wrapped in a negative
62//! character class, e.g. `[^a-z!\e]`.
63//!
64//! - The `h`, `v` and `R` shorthands are also wrapped in a negative character
65//! class.
66//!
67//! - The `w`, `d` and `s` shorthands are negated by making them uppercase
68//! (`![w]` = `\W`), except when there is more than one item in the class
69//! (`![w '-']` = `[^\w\-]`)
70//!
71//! - `w`, `s`, `d` and Unicode categories/scripts/blocks can be negated
72//! individually _within a character class_, e.g. `[s !s]` = `[\s\S]`,
73//! `![!Latin 'a']` = `[^\P{Latin}a]`.
74//!
75//! When a negated character class only contains 1 item, which is also
76//! negated, the class is removed and the negations cancel each other out:
77//! `![!w]` = `\w`, `![!L]` = `\p{L}`.
78
79use crate::Span;
80
81pub use char_group::{CharGroup, GroupItem, GroupName, ScriptExtension};
82pub use unicode::{Category, CodeBlock, OtherProperties, Script};
83
84mod ascii;
85pub(crate) mod char_group;
86pub(crate) mod unicode;
87
88pub use unicode::{blocks_supported_in_dotnet, list_shorthands, props_supported_in_java};
89
/// A _character class_. Refer to the [module-level documentation](self) for
/// details.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CharClass {
    /// The items that make up this character class, in the order they are
    /// stored.
    pub inner: Vec<GroupItem>,
    /// The span associated with this character class.
    // NOTE(review): presumably the location of the class in the source text;
    // confirm against the parser.
    pub span: Span,
    /// Flag stating whether this class is Unicode-aware.
    // NOTE(review): how this flag influences compilation is not visible in
    // this module; confirm against the compiler.
    pub unicode_aware: bool,
}
98
99impl CharClass {
100 pub fn new(inner: Vec<GroupItem>, span: Span, unicode_aware: bool) -> Self {
101 CharClass { inner, span, unicode_aware }
102 }
103
104 #[cfg(feature = "dbg")]
105 pub(super) fn pretty_print(&self, buf: &mut crate::PrettyPrinter) {
106 buf.push('[');
107 for (i, item) in self.inner.iter().enumerate() {
108 if i > 0 {
109 buf.push(' ');
110 }
111 item.pretty_print(buf);
112 }
113 buf.push(']');
114 }
115}
116
#[cfg(feature = "arbitrary")]
impl arbitrary::Arbitrary<'_> for CharClass {
    /// Generates an arbitrary character class containing at least one item.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by the underlying `Unstructured` while
    /// generating the length, the items, the span or the flag.
    fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result<Self> {
        // The length is clamped to at least 1, so an empty class is never
        // generated.
        let len = u.arbitrary_len::<GroupItem>()?.max(1);
        let mut inner = Vec::with_capacity(len);
        for _ in 0..len {
            inner.push(u.arbitrary()?);
        }

        // The span and the flag are drawn after the items, in this order.
        Ok(CharClass { inner, span: Span::arbitrary(u)?, unicode_aware: bool::arbitrary(u)? })
    }

    /// Estimates how many bytes are needed to generate a `CharClass`.
    ///
    /// When `GroupItem` reports an upper bound `s`, the hint is
    /// `(s + 1, Some(s * 10 + 1))`. When `GroupItem` reports no upper bound,
    /// the hint is unbounded as well instead of panicking.
    fn size_hint(depth: usize) -> (usize, Option<usize>) {
        match GroupItem::size_hint(depth) {
            (_, Some(item_max)) => {
                // Saturating arithmetic: a hint must never overflow or panic.
                let lower = item_max.saturating_add(1);
                let upper = item_max.saturating_mul(10).saturating_add(1);
                (lower, Some(upper))
            }
            // No upper bound for a single item means no upper bound for the
            // whole class; only the item's lower bound is known.
            (item_min, None) => (item_min.saturating_add(1), None),
        }
    }
}
133}