pomsky_syntax/exprs/char_class/
char_group.rs

1//! Contains the [`CharGroup`] type, which is the contents of a
2//! [`CharClass`](crate::char_class::CharClass).
3//!
//! However, a `CharGroup` does not store whether the character class is
//! negated.
6//!
7//! Refer to the [`char_class` module](crate::char_class) for more information.
8
9use crate::{Span, error::ParseErrorKind};
10
11use super::unicode::{Category, CodeBlock, OtherProperties, Script};
12
/// The contents of a [`CharClass`](crate::char_class::CharClass).
///
/// A `CharGroup` is a flat list of [`GroupItem`]s. Whether the enclosing
/// character class is negated is not stored here.
///
/// Refer to the [`char_class` module](crate::char_class) for more information.
#[derive(Clone, PartialEq, Eq)]
pub struct CharGroup {
    /// The items (single code points, ranges and named classes) that make up
    /// this group.
    pub items: Vec<GroupItem>,
}
21
22impl CharGroup {
23    /// Tries to create a `CharGroup` from a range of characters (inclusive).
24    /// Returns `None` if `last` is lower than `first`.
25    pub(crate) fn try_from_range(first: char, last: char) -> Option<Vec<GroupItem>> {
26        if first < last { Some(vec![GroupItem::Range { first, last }]) } else { None }
27    }
28
29    /// Try to create a `CharGroup` from the name of a character class. Fails if
30    /// the name is lowercase and not known, or if it matches a keyword.
31    ///
32    /// POSIX classes (e.g. `alnum` or `blank`) are converted to ranges (e.g.
33    /// `[0-9a-zA-Z]`). This is relatively simple and maximizes
34    /// compatibility.
35    ///
36    /// If the name is uppercase (and not `R`), we just assume that it is a
37    /// Unicode category, script or block. This needs to be fixed at one
38    /// point!
39    pub(crate) fn try_from_group_name(
40        kind: Option<&str>,
41        name: &str,
42        negative: bool,
43        span: Span,
44    ) -> Result<Vec<GroupItem>, ParseErrorKind> {
45        Ok(match name {
46            _ if name == "ascii" || name.starts_with("ascii_") => {
47                super::ascii::parse_ascii_group(name, negative)?
48            }
49            _ => {
50                let name = super::unicode::parse_group_name(kind, name)?;
51                vec![GroupItem::Named { name, negative, span }]
52            }
53        })
54    }
55}
56
/// One item in a character class.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GroupItem {
    /// A Unicode code point. It can be denoted in quotes (e.g. `'a'`) or in
    /// hexadecimal notation (`U+201`).
    ///
    /// Some non-printable ASCII characters are also parsed to a
    /// [`GroupItem::Char`]: `[n]`, `[t]`, `[r]`, `[a]`, `[e]` and `[f]`.
    Char(char),
    /// A range of Unicode code points. It is denoted as `A-B`, where `A` and
    /// `B` are Unicode code points, allowing the same notation as for
    /// [`GroupItem::Char`]. Both `A` and `B` are included in the range.
    ///
    /// `first` is the lower bound and `last` the upper bound. The variant
    /// itself does not enforce any ordering of the two; `range_unchecked`
    /// builds it without a check, whereas `CharGroup::try_from_range` only
    /// builds it when `first < last`.
    Range { first: char, last: char },
    /// A named character class, i.e. a shorthand or a Unicode
    /// category/script/block. Shorthands are `[w]`, `[s]`, `[d]`, `[v]`,
    /// `[h]` and `[R]`.
    ///
    /// Some of them (`w`, `d`, `s` and Unicode) can be negated.
    ///
    /// `name` identifies the class (see [`GroupName`]), `negative` records
    /// whether this item is negated, and `span` is the [`Span`] that was
    /// passed in when the item was created.
    // NOTE(review): `[R]` has no matching `GroupName` variant in this file;
    // confirm whether it is still a supported shorthand.
    Named { name: GroupName, negative: bool, span: Span },
}
77
78impl GroupItem {
79    pub(crate) fn range_unchecked(first: char, last: char) -> Self {
80        GroupItem::Range { first, last }
81    }
82
83    #[cfg(feature = "dbg")]
84    pub(crate) fn pretty_print(&self, buf: &mut crate::PrettyPrinter) {
85        fn print_char(c: char, buf: &mut crate::PrettyPrinter) {
86            match c {
87                '\n' => buf.push('n'),
88                '\r' => buf.push('r'),
89                '\t' => buf.push('t'),
90                '\u{07}' => buf.push('a'),
91                '\u{1b}' => buf.push('e'),
92                '\u{0c}' => buf.push('f'),
93                _ => buf.pretty_print_char(c),
94            }
95        }
96
97        match *self {
98            Self::Char(c) => print_char(c, buf),
99            Self::Range { first, last } => {
100                print_char(first, buf);
101                buf.push('-');
102                print_char(last, buf);
103            }
104            Self::Named { name, negative, .. } => {
105                if negative {
106                    buf.push('!');
107                }
108                let name = match name {
109                    GroupName::Word => "word",
110                    GroupName::Digit => "digit",
111                    GroupName::Space => "space",
112                    GroupName::HorizSpace => "horiz_space",
113                    GroupName::VertSpace => "vert_space",
114                    GroupName::Category(c) => c.as_str(),
115                    GroupName::Script(s, e) => {
116                        buf.push_str(e.as_str());
117                        buf.push_str(s.as_str());
118                        return;
119                    }
120                    GroupName::CodeBlock(b) => b.as_str(),
121                    GroupName::OtherProperties(b) => b.as_str(),
122                };
123                buf.push_str(name);
124            }
125        }
126    }
127}
128
#[cfg(feature = "arbitrary")]
impl arbitrary::Arbitrary<'_> for GroupItem {
    /// Generates an arbitrary item.
    ///
    /// One selector value in `0..=2` chooses the variant. For a range, two
    /// characters are drawn and the input is rejected with
    /// `Error::IncorrectFormat` unless the first is strictly lower than the
    /// second, which is the same condition `CharGroup::try_from_range` checks.
    fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result<Self> {
        Ok(match u.int_in_range(0u8..=2)? {
            0 => GroupItem::Char(u.arbitrary()?),
            1 => {
                let first: char = u.arbitrary()?;
                let last: char = u.arbitrary()?;
                if first >= last {
                    return Err(arbitrary::Error::IncorrectFormat);
                }
                GroupItem::Range { first, last }
            }
            _ => GroupItem::Named {
                name: GroupName::arbitrary(u)?,
                negative: bool::arbitrary(u)?,
                span: Span::arbitrary(u)?,
            },
        })
    }

    /// Size hint mirroring `arbitrary` above: the selector, followed by one
    /// of the three variant payloads.
    fn size_hint(depth: usize) -> (usize, Option<usize>) {
        arbitrary::size_hint::and(
            u8::size_hint(depth),
            arbitrary::size_hint::or_all(&[
                // `Char`
                char::size_hint(depth),
                // `Range`
                arbitrary::size_hint::and(char::size_hint(depth), char::size_hint(depth)),
                // `Named`: the span is consumed as well, so it has to be part
                // of the hint; otherwise the upper bound could be too small.
                arbitrary::size_hint::and_all(&[
                    GroupName::size_hint(depth),
                    bool::size_hint(depth),
                    Span::size_hint(depth),
                ]),
            ]),
        )
    }
}
161
/// The name of a named character class, as stored in [`GroupItem::Named`].
///
/// [`GroupName::kind`] groups the variants into shorthands, categories,
/// scripts, blocks and properties.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum GroupName {
    /// The `word` shorthand.
    Word,
    /// The `digit` shorthand.
    Digit,
    /// The `space` shorthand.
    Space,
    /// The `horiz_space` shorthand.
    HorizSpace,
    /// The `vert_space` shorthand.
    VertSpace,
    /// A Unicode category.
    Category(Category),
    /// A Unicode script, together with the [`ScriptExtension`] setting that
    /// was given for it.
    Script(Script, ScriptExtension),
    /// A Unicode block.
    CodeBlock(CodeBlock),
    /// Another Unicode property.
    OtherProperties(OtherProperties),
}
175
/// The prefix setting attached to a [`GroupName::Script`].
///
/// [`ScriptExtension::as_str`] gives the textual prefix of each variant.
// NOTE(review): presumably this selects between the Unicode `Script` and
// `Script_Extensions` properties; confirm against the code generator.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ScriptExtension {
    /// Corresponds to the `scx:` prefix.
    Yes,
    /// Corresponds to the `sc:` prefix.
    No,
    /// No prefix was given; corresponds to the empty string.
    Unspecified,
}
183
184impl GroupName {
185    pub fn kind(self) -> &'static str {
186        match self {
187            GroupName::Word
188            | GroupName::Digit
189            | GroupName::Space
190            | GroupName::HorizSpace
191            | GroupName::VertSpace => "shorthand",
192            GroupName::Category(_) => "category",
193            GroupName::Script(..) => "script",
194            GroupName::CodeBlock(_) => "block",
195            GroupName::OtherProperties(_) => "property",
196        }
197    }
198}
199
200impl ScriptExtension {
201    pub fn as_str(self) -> &'static str {
202        match self {
203            ScriptExtension::Yes => "scx:",
204            ScriptExtension::No => "sc:",
205            ScriptExtension::Unspecified => "",
206        }
207    }
208}