pomsky_syntax/exprs/char_class/
mod.rs

//! Implements _character classes_. The analogues in the regex world are
2//! [character classes](https://www.regular-expressions.info/charclass.html),
3//! [shorthand character classes](https://www.regular-expressions.info/shorthand.html),
4//! [non-printable characters](https://www.regular-expressions.info/nonprint.html),
5//! [Unicode categories/scripts/blocks](https://www.regular-expressions.info/unicode.html#category),
6//! [POSIX classes](https://www.regular-expressions.info/posixbrackets.html#class) and the
7//! [dot](https://www.regular-expressions.info/dot.html).
8//!
//! All kinds of character classes mentioned above require `[` square brackets
//! `]` in Pomsky. A character class can be negated by putting a `!` in front of
//! the opening bracket. For example, `![.]` compiles to `\n`.
12//!
13//! ## Items
14//!
15//! A character class can contain multiple _items_, which can be
16//!
17//! - A __code point__, e.g. `['a']` or `[U+107]`
18//!
19//!   - This includes [non-printable characters](https://www.regular-expressions.info/nonprint.html).\
20//!     Supported are `[n]`, `[r]`, `[t]`, `[a]`, `[e]` and `[f]`.
21//!
22//! - A __range of code points__. For example, `[U+10 - U+200]` matches any code
23//!   point P where `U+10 ≤ P ≤ U+200`
24//!
25//! - A __named character class__, which can be one of
26//!
27//!   - a [shorthand character class](https://www.regular-expressions.info/shorthand.html).\
28//!     Supported are `[w]`, `[d]`, `[s]`, `[h]`, `[v]` and `[R]`.
29//!
30//!   - a [POSIX class](https://www.regular-expressions.info/posixbrackets.html#class).\
31//!     Supported are `[ascii_alnum]`, `[ascii_alpha]`, `[ascii]`,
32//!     `[ascii_blank]`, `[ascii_cntrl]`, `[ascii_digit]`, `[ascii_graph]`,
//!     `[ascii_lower]`, `[ascii_print]`, `[ascii_punct]`, `[ascii_space]`,
//!     `[ascii_upper]`, `[ascii_word]` and `[ascii_xdigit]`.\
//!     _Note_: POSIX classes are not Unicode aware!\
//!     _Note_: They're converted to ranges, e.g. `[ascii_alpha]` = `[a-zA-Z]`.
37//!
38//!   - a [Unicode category, script or block](https://www.regular-expressions.info/unicode.html#category).\
39//!     For example: `[Letter]` compiles to `\p{Letter}`. Pomsky currently
//!     treats any uppercase identifier except `R` as a Unicode class.
41//!
42//! ## Compilation
43//!
44//! When a character class contains only a single item (e.g. `[w]`), the
45//! character class is "flattened":
46//!
47//! - `['a']` = `a`
48//! - `[w]` = `\w`
49//! - `[Letter]` = `\p{Letter}`
50//!
51//! When there is more than one item or a range (e.g. `['a'-'z' '!']`), a regex
52//! character class is created:
53//!
54//! - `['a'-'z' '!']` = `[a-z!]`
55//! - `[w e Punctuation]` = `[\w\e\p{Punctuation}]`
56//!
57//! ### Negation
58//!
59//! Negation is implemented as follows:
60//!
61//! - Ranges and chars such as `!['a'-'z' '!' e]` are wrapped in a negative
62//!   character class, e.g. `[^a-z!\e]`.
63//!
64//! - The `h`, `v` and `R` shorthands are also wrapped in a negative character
65//!   class.
66//!
67//! - The `w`, `d` and `s` shorthands are negated by making them uppercase
68//!   (`![w]` = `\W`), except when there is more than one item in the class
69//!   (`![w '-']` = `[^\w\-]`)
70//!
71//! - `w`, `s`, `d` and Unicode categories/scripts/blocks can be negated
72//!   individually _within a character class_, e.g. `[s !s]` = `[\s\S]`,
73//!   `![!Latin 'a']` = `[^\P{Latin}a]`.
74//!
75//!   When a negated character class only contains 1 item, which is also
//!   negated, the class is removed and the negations cancel each other out:
77//!   `![!w]` = `\w`, `![!L]` = `\p{L}`.
78
79use crate::Span;
80
81pub use char_group::{CharGroup, GroupItem, GroupName, ScriptExtension};
82pub use unicode::{Category, CodeBlock, OtherProperties, Script};
83
84mod ascii;
85pub(crate) mod char_group;
86pub(crate) mod unicode;
87
88pub use unicode::{blocks_supported_in_dotnet, list_shorthands, props_supported_in_java};
89
90/// A _character class_. Refer to the [module-level documentation](self) for
91/// details.
92#[derive(Debug, Clone, PartialEq, Eq)]
93pub struct CharClass {
94    pub inner: Vec<GroupItem>,
95    pub span: Span,
96    pub unicode_aware: bool,
97}
98
99impl CharClass {
100    pub fn new(inner: Vec<GroupItem>, span: Span, unicode_aware: bool) -> Self {
101        CharClass { inner, span, unicode_aware }
102    }
103
104    #[cfg(feature = "dbg")]
105    pub(super) fn pretty_print(&self, buf: &mut crate::PrettyPrinter) {
106        buf.push('[');
107        for (i, item) in self.inner.iter().enumerate() {
108            if i > 0 {
109                buf.push(' ');
110            }
111            item.pretty_print(buf);
112        }
113        buf.push(']');
114    }
115}
116
#[cfg(feature = "arbitrary")]
impl arbitrary::Arbitrary<'_> for CharClass {
    /// Builds an arbitrary `CharClass` from unstructured input.
    ///
    /// The generated class always contains at least one item. Data is drawn
    /// in a fixed order: first the items, then the span, then the
    /// `unicode_aware` flag.
    ///
    /// # Errors
    ///
    /// Propagates any error returned while drawing the length, an item, the
    /// span or the flag from `u`.
    fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result<Self> {
        // `max(1)` rules out an empty class.
        let item_count = u.arbitrary_len::<GroupItem>()?.max(1);
        let mut inner = Vec::with_capacity(item_count);
        for _ in 0..item_count {
            inner.push(u.arbitrary()?);
        }

        Ok(CharClass { inner, span: Span::arbitrary(u)?, unicode_aware: bool::arbitrary(u)? })
    }

    /// Derives a size hint from the size hint of a single [`GroupItem`].
    ///
    /// When the item size has an upper bound `m`, the hint is
    /// `(m + 1, Some(m * 10 + 1))`. When the item size is unbounded, the hint
    /// is unbounded as well instead of panicking.
    fn size_hint(depth: usize) -> (usize, Option<usize>) {
        match GroupItem::size_hint(depth) {
            (_, Some(item_max)) => (item_max + 1, Some(item_max * 10 + 1)),
            // No upper bound for a single item means no upper bound for the
            // whole class; fall back to the item's lower bound.
            (item_min, None) => (item_min + 1, None),
        }
    }
}
133}