icu_experimental/unicodeset_parse/
parse.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::borrow::Cow;
6use alloc::collections::{BTreeMap, BTreeSet};
7use alloc::fmt::Display;
8use alloc::format;
9use alloc::string::{String, ToString};
10use alloc::vec::Vec;
11use core::{iter::Peekable, str::CharIndices};
12
13use icu_collections::{
14    codepointinvlist::{CodePointInversionList, CodePointInversionListBuilder},
15    codepointinvliststringlist::CodePointInversionListAndStringList,
16};
17use icu_properties::script::ScriptWithExtensions;
18use icu_properties::{
19    props::{
20        CanonicalCombiningClass, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
21        GraphemeClusterBreak, LineBreak, Script, SentenceBreak, WordBreak,
22    },
23    CodePointMapData,
24};
25use icu_properties::{
26    props::{PatternWhiteSpace, XidContinue, XidStart},
27    CodePointSetData,
28};
29use icu_properties::{provider::*, PropertyParser};
30use icu_provider::prelude::*;
31
// NOTE: the `displaydoc::Display` derive uses a variant's doc comment as its `Display` text
// unless a `#[displaydoc(...)]` attribute overrides it. Editing the doc comment of a variant
// without such an attribute therefore changes user-visible output.
/// The kind of error that occurred.
///
/// Obtained from a [`ParseError`] via [`ParseError::kind`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, displaydoc::Display)]
#[non_exhaustive]
pub enum ParseErrorKind {
    /// An unexpected character was encountered. The offending character is the payload.
    ///
    /// This variant implies the other variants
    /// (notably `UnknownProperty` and `Unimplemented`) do not apply.
    #[displaydoc("An unexpected character was encountered")]
    UnexpectedChar(char),
    /// The property name or value is unknown.
    ///
    /// For property names, make sure you use the spelling
    /// defined in [ECMA-262](https://tc39.es/ecma262/#table-nonbinary-unicode-properties).
    #[displaydoc("The property name or value is unknown")]
    UnknownProperty,
    /// A reference to an unknown variable.
    UnknownVariable,
    /// A variable of a certain type occurring in an unexpected context.
    UnexpectedVariable,
    /// The source is an incomplete unicode set.
    Eof,
    /// Something unexpected went wrong with our code. Please file a bug report on GitHub.
    Internal,
    /// The provided syntax is not supported by us.
    ///
    /// Note that unknown properties will return the
    /// `UnknownProperty` variant, not this one.
    #[displaydoc("The provided syntax is not supported by us.")]
    Unimplemented,
    /// The provided escape sequence is not a valid Unicode code point or represents too many code points.
    InvalidEscape,
}
65use zerovec::VarZeroVec;
66use ParseErrorKind as PEK;
67
68impl ParseErrorKind {
69    fn with_offset(self, offset: usize) -> ParseError {
70        ParseError {
71            offset: Some(offset),
72            kind: self,
73        }
74    }
75}
76
77impl From<ParseErrorKind> for ParseError {
78    fn from(kind: ParseErrorKind) -> Self {
79        ParseError { offset: None, kind }
80    }
81}
82
/// The error type returned by the `parse` functions in this crate.
///
/// See [`ParseError::fmt_with_source`] for pretty-printing and [`ParseErrorKind`] for the
/// different types of errors represented by this struct.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ParseError {
    // offset is the index to an arbitrary byte in the last character in the source that makes sense
    // to display as location for the error, e.g., the unexpected character itself or
    // for an unknown property name the last character of the name.
    // `None` means no location has been attached (see the `From<ParseErrorKind>` impl).
    offset: Option<usize>,
    // the category of the failure, exposed through `ParseError::kind`
    kind: ParseErrorKind,
}
95
// Module-local shorthand: a `Result` whose error type defaults to `ParseError`.
type Result<T, E = ParseError> = core::result::Result<T, E>;
97
98impl ParseError {
99    /// Pretty-prints this error and if applicable, shows where the error occurred in the source.
100    ///
101    /// Must be called with the same source that was used to parse the set.
102    ///
103    /// # Examples
104    ///
105    /// ```
106    /// use icu::experimental::unicodeset_parse::*;
107    ///
108    /// let source = "[[abc]-x]";
109    /// let set = parse(source);
110    /// assert!(set.is_err());
111    /// let err = set.unwrap_err();
112    /// assert_eq!(
113    ///     err.fmt_with_source(source).to_string(),
114    ///     "[[abc]-x← error: unexpected character 'x'"
115    /// );
116    /// ```
117    ///
118    /// ```
119    /// use icu::experimental::unicodeset_parse::*;
120    ///
121    /// let source = r"[\N{LATIN CAPITAL LETTER A}]";
122    /// let set = parse(source);
123    /// assert!(set.is_err());
124    /// let err = set.unwrap_err();
125    /// assert_eq!(
126    ///     err.fmt_with_source(source).to_string(),
127    ///     r"[\N← error: unimplemented"
128    /// );
129    /// ```
130    pub fn fmt_with_source(&self, source: &str) -> impl Display {
131        let ParseError { offset, kind } = *self;
132
133        if kind == ParseErrorKind::Eof {
134            return format!("{source}← error: unexpected end of input");
135        }
136        let mut s = String::new();
137        if let Some(offset) = offset {
138            if offset < source.len() {
139                // offset points to any byte of the last character we want to display.
140                // in the case of ASCII, this is easy - we just display bytes [..=offset].
141                // however, if the last character is more than one byte in UTF-8
142                // we cannot use ..=offset, because that would potentially include only partial
143                // bytes of last character in our string. hence we must find the start of the
144                // following character and use that as the (exclusive) end of our string.
145
146                // offset points into the last character we want to include, hence the start of the
147                // first character we want to exclude is at least offset + 1.
148                let mut exclusive_end = offset + 1;
149                // TODO: replace this loop with str::ceil_char_boundary once stable
150                for _ in 0..3 {
151                    // is_char_boundary returns true at the latest once exclusive_end == source.len()
152                    if source.is_char_boundary(exclusive_end) {
153                        break;
154                    }
155                    exclusive_end += 1;
156                }
157
158                // exclusive_end is at most source.len() due to str::is_char_boundary and at least 0 by type
159                #[allow(clippy::indexing_slicing)]
160                s.push_str(&source[..exclusive_end]);
161                s.push_str("← ");
162            }
163        }
164        s.push_str("error: ");
165        match kind {
166            ParseErrorKind::UnexpectedChar(c) => {
167                s.push_str(&format!("unexpected character '{}'", c.escape_debug()));
168            }
169            ParseErrorKind::UnknownProperty => {
170                s.push_str("unknown property");
171            }
172            ParseErrorKind::UnknownVariable => {
173                s.push_str("unknown variable");
174            }
175            ParseErrorKind::UnexpectedVariable => {
176                s.push_str("unexpected variable");
177            }
178            ParseErrorKind::Eof => {
179                s.push_str("unexpected end of input");
180            }
181            ParseErrorKind::Internal => {
182                s.push_str("internal error");
183            }
184            ParseErrorKind::Unimplemented => {
185                s.push_str("unimplemented");
186            }
187            ParseErrorKind::InvalidEscape => {
188                s.push_str("invalid escape sequence");
189            }
190        }
191
192        s
193    }
194
195    /// Returns the [`ParseErrorKind`] of this error.
196    pub fn kind(&self) -> ParseErrorKind {
197        self.kind
198    }
199
200    /// Returns the offset of this error in the source string, if it was specified.
201    pub fn offset(&self) -> Option<usize> {
202        self.offset
203    }
204
205    fn or_with_offset(self, offset: usize) -> Self {
206        match self.offset {
207            Some(_) => self,
208            None => ParseError {
209                offset: Some(offset),
210                ..self
211            },
212        }
213    }
214}
215
/// The value of a variable in a UnicodeSet. Used as value type in [`VariableMap`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum VariableValue<'a> {
    /// A UnicodeSet, represented as a [`CodePointInversionListAndStringList`](CodePointInversionListAndStringList).
    UnicodeSet(CodePointInversionListAndStringList<'a>),
    // in theory, a one-code-point string is always the same as a char, but we might want to keep
    // this variant for efficiency?
    /// A single code point.
    Char(char),
    /// A string. It is guaranteed that when returned from a `VariableMap`, this variant never
    /// contains exactly one code point (such strings are stored as [`VariableValue::Char`]).
    String(Cow<'a, str>),
}
229
/// The map used for parsing UnicodeSets with variable support. See [`parse_with_variables`].
///
/// Associates variable names with their [`VariableValue`]. The `insert*` methods never
/// overwrite an existing entry; [`VariableMap::remove`] it first to replace a value.
#[derive(Debug, Clone, Default)]
pub struct VariableMap<'a>(BTreeMap<String, VariableValue<'a>>);
233
234impl<'a> VariableMap<'a> {
235    /// Creates a new empty map.
236    pub fn new() -> Self {
237        Self::default()
238    }
239
240    /// Removes a key from the map, returning the value at the key if the key
241    /// was previously in the map.
242    pub fn remove(&mut self, key: &str) -> Option<VariableValue<'a>> {
243        self.0.remove(key)
244    }
245
246    /// Get a reference to the value associated with this key, if it exists.
247    pub fn get(&self, key: &str) -> Option<&VariableValue<'a>> {
248        self.0.get(key)
249    }
250
251    /// Insert a `VariableValue` into the `VariableMap`.
252    ///
253    /// Returns `Err` with the old value, if it exists, and does not update the map.
254    pub fn insert(&mut self, key: String, value: VariableValue<'a>) -> Result<(), &VariableValue> {
255        // borrow-checker shenanigans, otherwise we could use if let
256        if self.0.contains_key(&key) {
257            // we just checked that this key exists
258            #[allow(clippy::indexing_slicing)]
259            return Err(&self.0[&key]);
260        }
261
262        if let VariableValue::String(s) = &value {
263            let mut chars = s.chars();
264            if let (Some(c), None) = (chars.next(), chars.next()) {
265                self.0.insert(key, VariableValue::Char(c));
266                return Ok(());
267            };
268        }
269
270        self.0.insert(key, value);
271        Ok(())
272    }
273
274    /// Insert a `char` into the `VariableMap`.    
275    ///
276    /// Returns `Err` with the old value, if it exists, and does not update the map.
277    pub fn insert_char(&mut self, key: String, c: char) -> Result<(), &VariableValue> {
278        // borrow-checker shenanigans, otherwise we could use if let
279        if self.0.contains_key(&key) {
280            // we just checked that this key exists
281            #[allow(clippy::indexing_slicing)]
282            return Err(&self.0[&key]);
283        }
284
285        self.0.insert(key, VariableValue::Char(c));
286        Ok(())
287    }
288
289    /// Insert a `String` of any length into the `VariableMap`.
290    ///
291    /// Returns `Err` with the old value, if it exists, and does not update the map.
292    pub fn insert_string(&mut self, key: String, s: String) -> Result<(), &VariableValue> {
293        // borrow-checker shenanigans, otherwise we could use if let
294        if self.0.contains_key(&key) {
295            // we just checked that this key exists
296            #[allow(clippy::indexing_slicing)]
297            return Err(&self.0[&key]);
298        }
299
300        let mut chars = s.chars();
301        let val = match (chars.next(), chars.next()) {
302            (Some(c), None) => VariableValue::Char(c),
303            _ => VariableValue::String(Cow::Owned(s)),
304        };
305
306        self.0.insert(key, val);
307        Ok(())
308    }
309
310    /// Insert a `&str` of any length into the `VariableMap`.
311    ///
312    /// Returns `Err` with the old value, if it exists, and does not update the map.
313    pub fn insert_str(&mut self, key: String, s: &'a str) -> Result<(), &VariableValue> {
314        // borrow-checker shenanigans, otherwise we could use if let
315        if self.0.contains_key(&key) {
316            // we just checked that this key exists
317            #[allow(clippy::indexing_slicing)]
318            return Err(&self.0[&key]);
319        }
320
321        let mut chars = s.chars();
322        let val = match (chars.next(), chars.next()) {
323            (Some(c), None) => VariableValue::Char(c),
324            _ => VariableValue::String(Cow::Borrowed(s)),
325        };
326
327        self.0.insert(key, val);
328        Ok(())
329    }
330
331    /// Insert a [`CodePointInversionListAndStringList`](CodePointInversionListAndStringList) into the `VariableMap`.
332    ///
333    /// Returns `Err` with the old value, if it exists, and does not update the map.
334    pub fn insert_set(
335        &mut self,
336        key: String,
337        set: CodePointInversionListAndStringList<'a>,
338    ) -> Result<(), &VariableValue> {
339        // borrow-checker shenanigans, otherwise we could use if let
340        if self.0.contains_key(&key) {
341            // we just checked that this key exists
342            #[allow(clippy::indexing_slicing)]
343            return Err(&self.0[&key]);
344        }
345        self.0.insert(key, VariableValue::UnicodeSet(set));
346        Ok(())
347    }
348}
349
// Tells whether `c` can begin a literal char, be it raw or escaped (so `\` counts as a legal
// start). The ambiguity between `\`-escapes and `\p{}` perl properties is ignored here: the
// caller is assumed to be in a context where `\p` is just 'p'.
// important: assumes c is not pattern_white_space
fn legal_char_start(c: char) -> bool {
    // these characters all have syntactic meaning inside a set
    !matches!(c, '&' | '-' | '$' | '^' | '[' | ']' | '{')
}
356
// Counterpart of `legal_char_start` for the charInString nonterminal: everything except the
// closing `}` may start a char inside a string. `\` is allowed due to escapes.
// important: assumes c is not pattern_white_space
fn legal_char_in_string_start(c: char) -> bool {
    !matches!(c, '}')
}
362
// A char-like token: either one complete char or the start of a multi-escape sequence.
#[derive(Debug)]
enum SingleOrMultiChar {
    // exactly one char, fully parsed
    Single(char),
    // Multi is a marker that indicates parsing was paused and needs to be resumed using parse_multi_escape* when
    // this token is consumed. The contained char is the first char of the multi sequence.
    Multi(char),
}
370
// A char or a string. Multi-escapes (the 2+ case) are represented by `SingleOrMultiChar::Multi`.
// NOTE(review): this comment used to mention a `Vec<char>`; no such field exists in this type.
// invariant: a String is either zero or 2+ chars long, a one-char-string is equivalent to a single char.
// invariant: a char is 1+ chars long
#[derive(Debug)]
enum Literal {
    // a `{string}` literal or a string-valued variable
    String(String),
    // a single char or the beginning of a multi-escape sequence
    CharKind(SingleOrMultiChar),
}
379
// The tokens the main loop of a bracketed set (`parse_unicode_set_inner`) operates on.
#[derive(Debug)]
enum MainToken<'data> {
    // to be interpreted as value
    Literal(Literal),
    // inner set
    UnicodeSet(CodePointInversionListAndStringList<'data>),
    // anchor, only at the end of a set ([... $])
    DollarSign,
    // intersection operator, only in between two sets ([[...] & [...]])
    Ampersand,
    // difference operator, only in between two sets ([[...] - [...]])
    // or
    // range operator, only in between two chars ([a-z], [a-{z}])
    Minus,
    // ] to indicate the end of a set
    ClosingBracket,
}
397
398impl<'data> MainToken<'data> {
399    fn from_variable_value(val: VariableValue<'data>) -> Self {
400        match val {
401            VariableValue::Char(c) => {
402                MainToken::Literal(Literal::CharKind(SingleOrMultiChar::Single(c)))
403            }
404            VariableValue::String(s) => {
405                // we know that the VariableMap only contains non-length-1 Strings.
406                MainToken::Literal(Literal::String(s.into_owned()))
407            }
408            VariableValue::UnicodeSet(set) => MainToken::UnicodeSet(set),
409        }
410    }
411}
412
// How the next inner set is combined with what has been accumulated so far.
#[derive(Debug, Clone, Copy)]
enum Operation {
    // the default when no operator precedes the set
    Union,
    // selected by `-` after a set
    Difference,
    // selected by `&` after a set
    Intersection,
}
419
// this builds the set on-the-fly while parsing it
//
// Nested sets are parsed by a fresh builder that shares the same `iter` (see `parse_main_token`).
struct UnicodeSetBuilder<'a, 'b, P: ?Sized> {
    // code points collected for the set being parsed
    single_set: CodePointInversionListBuilder,
    // strings collected for the set being parsed
    string_set: BTreeSet<String>,
    // cursor into the input, yielding (byte offset, char) pairs
    // NOTE(review): presumably always iterates over `source` - confirm at the construction site
    iter: &'a mut Peekable<CharIndices<'b>>,
    // the input string
    source: &'b str,
    // set when the set starts with `[^`
    inverted: bool,
    // values available for `$variable` references
    variable_map: &'a VariableMap<'a>,
    // NOTE(review): presumably the XID_Start, XID_Continue and Pattern_White_Space sets,
    // judging by the names - their use is not visible in this part of the file
    xid_start: &'a CodePointInversionList<'a>,
    xid_continue: &'a CodePointInversionList<'a>,
    pat_ws: &'a CodePointInversionList<'a>,
    // data provider; the impl bounds require it to serve the property data markers
    property_provider: &'a P,
}
433
434impl<'a, 'b, P> UnicodeSetBuilder<'a, 'b, P>
435where
436    P: ?Sized
437        + DataProvider<PropertyBinaryAlphabeticV1>
438        + DataProvider<PropertyBinaryAsciiHexDigitV1>
439        + DataProvider<PropertyBinaryBidiControlV1>
440        + DataProvider<PropertyBinaryBidiMirroredV1>
441        + DataProvider<PropertyBinaryCasedV1>
442        + DataProvider<PropertyBinaryCaseIgnorableV1>
443        + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
444        + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
445        + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
446        + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
447        + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
448        + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
449        + DataProvider<PropertyBinaryDashV1>
450        + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
451        + DataProvider<PropertyBinaryDeprecatedV1>
452        + DataProvider<PropertyBinaryDiacriticV1>
453        + DataProvider<PropertyBinaryEmojiComponentV1>
454        + DataProvider<PropertyBinaryEmojiModifierBaseV1>
455        + DataProvider<PropertyBinaryEmojiModifierV1>
456        + DataProvider<PropertyBinaryEmojiPresentationV1>
457        + DataProvider<PropertyBinaryEmojiV1>
458        + DataProvider<PropertyBinaryExtendedPictographicV1>
459        + DataProvider<PropertyBinaryExtenderV1>
460        + DataProvider<PropertyBinaryGraphemeBaseV1>
461        + DataProvider<PropertyBinaryGraphemeExtendV1>
462        + DataProvider<PropertyBinaryHexDigitV1>
463        + DataProvider<PropertyBinaryIdContinueV1>
464        + DataProvider<PropertyBinaryIdeographicV1>
465        + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
466        + DataProvider<PropertyBinaryIdStartV1>
467        + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
468        + DataProvider<PropertyBinaryJoinControlV1>
469        + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
470        + DataProvider<PropertyBinaryLowercaseV1>
471        + DataProvider<PropertyBinaryMathV1>
472        + DataProvider<PropertyBinaryNoncharacterCodePointV1>
473        + DataProvider<PropertyBinaryPatternSyntaxV1>
474        + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
475        + DataProvider<PropertyBinaryQuotationMarkV1>
476        + DataProvider<PropertyBinaryRadicalV1>
477        + DataProvider<PropertyBinaryRegionalIndicatorV1>
478        + DataProvider<PropertyBinarySentenceTerminalV1>
479        + DataProvider<PropertyBinarySoftDottedV1>
480        + DataProvider<PropertyBinaryTerminalPunctuationV1>
481        + DataProvider<PropertyBinaryUnifiedIdeographV1>
482        + DataProvider<PropertyBinaryUppercaseV1>
483        + DataProvider<PropertyBinaryVariationSelectorV1>
484        + DataProvider<PropertyBinaryWhiteSpaceV1>
485        + DataProvider<PropertyBinaryXidContinueV1>
486        + DataProvider<PropertyBinaryXidStartV1>
487        + DataProvider<PropertyEnumCanonicalCombiningClassV1>
488        + DataProvider<PropertyEnumGeneralCategoryV1>
489        + DataProvider<PropertyEnumGraphemeClusterBreakV1>
490        + DataProvider<PropertyEnumLineBreakV1>
491        + DataProvider<PropertyEnumScriptV1>
492        + DataProvider<PropertyEnumSentenceBreakV1>
493        + DataProvider<PropertyEnumWordBreakV1>
494        + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
495        + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
496        + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
497        + DataProvider<PropertyNameParseLineBreakV1>
498        + DataProvider<PropertyNameParseScriptV1>
499        + DataProvider<PropertyNameParseSentenceBreakV1>
500        + DataProvider<PropertyNameParseWordBreakV1>
501        + DataProvider<PropertyScriptWithExtensionsV1>,
502{
503    fn new_internal(
504        iter: &'a mut Peekable<CharIndices<'b>>,
505        source: &'b str,
506        variable_map: &'a VariableMap<'a>,
507        xid_start: &'a CodePointInversionList<'a>,
508        xid_continue: &'a CodePointInversionList<'a>,
509        pat_ws: &'a CodePointInversionList<'a>,
510        provider: &'a P,
511    ) -> Self {
512        UnicodeSetBuilder {
513            single_set: CodePointInversionListBuilder::new(),
514            string_set: Default::default(),
515            iter,
516            source,
517            inverted: false,
518            variable_map,
519            xid_start,
520            xid_continue,
521            pat_ws,
522            property_provider: provider,
523        }
524    }
525
526    // the entry point, parses a full UnicodeSet. ignores remaining input
527    fn parse_unicode_set(&mut self) -> Result<()> {
528        match self.must_peek_char()? {
529            '\\' => self.parse_property_perl(),
530            '[' => {
531                self.iter.next();
532                if let Some(':') = self.peek_char() {
533                    self.parse_property_posix()
534                } else {
535                    self.parse_unicode_set_inner()
536                }
537            }
538            '$' => {
539                // must be variable ref to a UnicodeSet
540                let (offset, v) = self.parse_variable()?;
541                match v {
542                    Some(VariableValue::UnicodeSet(s)) => {
543                        self.single_set.add_set(s.code_points());
544                        self.string_set
545                            .extend(s.strings().iter().map(ToString::to_string));
546                        Ok(())
547                    }
548                    Some(_) => Err(PEK::UnexpectedVariable.with_offset(offset)),
549                    None => Err(PEK::UnexpectedChar('$').with_offset(offset)),
550                }
551            }
552            c => self.error_here(PEK::UnexpectedChar(c)),
553        }
554    }
555
    // Parses the body of a bracketed set up to and including its closing `]`, adding the
    // elements to `self.single_set` and `self.string_set` as it goes.
    // beginning [ is already consumed
    //
    // Implemented as a small state machine (see `State` below) driven by the tokens returned
    // from `parse_main_token`. Token/state combinations without a dedicated arm are errors:
    // `UnexpectedVariable` if the token came from a variable, `UnexpectedChar` otherwise.
    fn parse_unicode_set_inner(&mut self) -> Result<()> {
        // special cases for the first chars after [
        if self.must_peek_char()? == '^' {
            self.iter.next();
            self.inverted = true;
        }
        // whitespace allowed between ^ and - in `[^ - ....]`
        self.skip_whitespace();
        // a leading `-` is a literal minus
        if self.must_peek_char()? == '-' {
            self.iter.next();
            self.single_set.add_char('-');
        }

        // repeatedly parse the following:
        // char
        // char-char
        // {string}
        // unicodeset
        // & and - operators, but only between unicodesets
        // $variables in place of strings, chars, or unicodesets

        #[derive(Debug, Clone, Copy)]
        enum State {
            // a state equivalent to the beginning
            Begin,
            // a state after a char. implies `prev_char` is Some(_), because we need to buffer it
            // in case it is part of a range, e.g., a-z
            Char,
            // in the middle of parsing a range. implies `prev_char` is Some(_), and the next
            // element must be a char as well
            CharMinus,
            // state directly after parsing a recursive unicode set. operators are only allowed
            // in this state
            AfterUnicodeSet,
            // state directly after parsing an operator. forces the next element to be a recursive
            // unicode set
            AfterOp,
            // state after parsing a $ (that was not a variable reference)
            // the only valid next option is a closing bracket
            AfterDollar,
            // state after parsing a - in an otherwise invalid position
            // the only valid next option is a closing bracket
            AfterMinus,
        }
        use State::*;

        // the operation applied to an inner set that is not preceded by an operator
        const DEFAULT_OP: Operation = Operation::Union;

        let mut state = Begin;
        // the most recent single char, buffered until we know whether it starts a range
        let mut prev_char = None;
        // how the next inner set is combined with the accumulated set
        let mut operation = Operation::Union;

        loop {
            self.skip_whitespace();

            // for error messages
            let (immediate_offset, immediate_char) = self.must_peek()?;

            let (tok_offset, from_var, tok) = self.parse_main_token()?;
            // warning: self.iter should not be advanced any more after this point on any path to
            // MT::Literal(Literal::CharKind(SingleOrMultiChar::Multi)), because that variant
            // expects a certain self.iter state

            use MainToken as MT;
            use SingleOrMultiChar as SMC;
            match (state, tok) {
                // the end of this unicode set
                (
                    Begin | Char | CharMinus | AfterUnicodeSet | AfterDollar | AfterMinus,
                    MT::ClosingBracket,
                ) => {
                    // flush the buffered char, if any
                    if let Some(prev) = prev_char.take() {
                        self.single_set.add_char(prev);
                    }
                    // a trailing `-` after a char (e.g. `[a-]`) is a literal minus
                    if matches!(state, CharMinus) {
                        self.single_set.add_char('-');
                    }

                    return Ok(());
                }
                // special case ends for -
                // [[a-z]-]
                (AfterOp, MT::ClosingBracket) if matches!(operation, Operation::Difference) => {
                    self.single_set.add_char('-');
                    return Ok(());
                }
                // a `-` where neither a range nor a difference is possible is a literal minus
                (Begin, MT::Minus) => {
                    self.single_set.add_char('-');
                    state = AfterMinus;
                }
                // inner unicode set
                (Begin | Char | AfterUnicodeSet | AfterOp, MT::UnicodeSet(set)) => {
                    if let Some(prev) = prev_char.take() {
                        self.single_set.add_char(prev);
                    }

                    self.process_chars(operation, set.code_points().clone());
                    self.process_strings(
                        operation,
                        set.strings().iter().map(ToString::to_string).collect(),
                    );

                    // an operator only applies to the set directly following it
                    operation = DEFAULT_OP;
                    state = AfterUnicodeSet;
                }
                // a literal char (either individually or as the start of a range if char)
                (
                    Begin | Char | AfterUnicodeSet,
                    MT::Literal(Literal::CharKind(SMC::Single(c))),
                ) => {
                    if let Some(prev) = prev_char.take() {
                        self.single_set.add_char(prev);
                    }
                    prev_char = Some(c);
                    state = Char;
                }
                // a bunch of literal chars as part of a multi-escape sequence
                (
                    Begin | Char | AfterUnicodeSet,
                    MT::Literal(Literal::CharKind(SMC::Multi(first_c))),
                ) => {
                    if let Some(prev) = prev_char.take() {
                        self.single_set.add_char(prev);
                    }
                    self.single_set.add_char(first_c);
                    self.parse_multi_escape_into_set()?;

                    // Note we cannot go to the Char state, because a multi-escape sequence of
                    // length > 1 cannot initiate a range
                    state = Begin;
                }
                // a literal string (length != 1, by `Literal` invariant)
                (Begin | Char | AfterUnicodeSet, MT::Literal(Literal::String(s))) => {
                    if let Some(prev) = prev_char.take() {
                        self.single_set.add_char(prev);
                    }

                    self.string_set.insert(s);
                    state = Begin;
                }
                // parse a literal char as the end of a range
                (CharMinus, MT::Literal(Literal::CharKind(SMC::Single(c)))) => {
                    // the CharMinus state implies a buffered range start
                    let start = prev_char.ok_or(PEK::Internal.with_offset(tok_offset))?;
                    let end = c;
                    if start > end {
                        // TODO(#3558): Better error message (e.g., "start greater than end in range")?
                        return Err(PEK::UnexpectedChar(end).with_offset(tok_offset));
                    }

                    self.single_set.add_range(start..=end);
                    prev_char = None;
                    state = Begin;
                }
                // start parsing a char range
                (Char, MT::Minus) => {
                    state = CharMinus;
                }
                // start parsing a unicode set difference
                (AfterUnicodeSet, MT::Minus) => {
                    operation = Operation::Difference;
                    state = AfterOp;
                }
                // start parsing a unicode set intersection
                (AfterUnicodeSet, MT::Ampersand) => {
                    operation = Operation::Intersection;
                    state = AfterOp;
                }
                // a `$` that is not a variable reference is an anchor, stored as U+FFFF
                (Begin | Char | AfterUnicodeSet, MT::DollarSign) => {
                    if let Some(prev) = prev_char.take() {
                        self.single_set.add_char(prev);
                    }
                    self.single_set.add_char('\u{FFFF}');
                    state = AfterDollar;
                }
                // every remaining state/token combination is a syntax error
                _ => {
                    // TODO(#3558): We have precise knowledge about the following MainToken here,
                    //  should we make use of that?

                    if from_var {
                        // otherwise we get error messages such as
                        // [$a-$← error: unexpected character '$'
                        // for input [$a-$b], $a = 'a', $b = "string" ;
                        return Err(PEK::UnexpectedVariable.with_offset(tok_offset));
                    }
                    return Err(PEK::UnexpectedChar(immediate_char).with_offset(immediate_offset));
                }
            }
        }
    }
746
747    fn parse_main_token(&mut self) -> Result<(usize, bool, MainToken<'a>)> {
748        let (initial_offset, first) = self.must_peek()?;
749        if first == ']' {
750            self.iter.next();
751            return Ok((initial_offset, false, MainToken::ClosingBracket));
752        }
753        let (_, second) = self.must_peek_double()?;
754        match (first, second) {
755            // variable or anchor
756            ('$', _) => {
757                let (offset, var_or_anchor) = self.parse_variable()?;
758                match var_or_anchor {
759                    None => Ok((offset, false, MainToken::DollarSign)),
760                    Some(v) => Ok((offset, true, MainToken::from_variable_value(v.clone()))),
761                }
762            }
763            // string
764            ('{', _) => self
765                .parse_string()
766                .map(|(offset, l)| (offset, false, MainToken::Literal(l))),
767            // inner set
768            ('\\', 'p' | 'P') | ('[', _) => {
769                let mut inner_builder = UnicodeSetBuilder::new_internal(
770                    self.iter,
771                    self.source,
772                    self.variable_map,
773                    self.xid_start,
774                    self.xid_continue,
775                    self.pat_ws,
776                    self.property_provider,
777                );
778                inner_builder.parse_unicode_set()?;
779                let (single, string_set) = inner_builder.finalize();
780                // note: offset - 1, because we already consumed full set
781                let offset = self.must_peek_index()? - 1;
782                let mut strings = string_set.into_iter().collect::<Vec<_>>();
783                strings.sort();
784                let cpilasl = CodePointInversionListAndStringList::try_from(
785                    single.build(),
786                    VarZeroVec::from(&strings),
787                )
788                .map_err(|_| PEK::Internal.with_offset(offset))?;
789                Ok((offset, false, MainToken::UnicodeSet(cpilasl)))
790            }
791            // note: c cannot be a whitespace, because we called skip_whitespace just before
792            // (in the main parse loop), so it's safe to call this guard function
793            (c, _) if legal_char_start(c) => self
794                .parse_char()
795                .map(|(offset, c)| (offset, false, MainToken::Literal(Literal::CharKind(c)))),
796            ('-', _) => {
797                self.iter.next();
798                Ok((initial_offset, false, MainToken::Minus))
799            }
800            ('&', _) => {
801                self.iter.next();
802                Ok((initial_offset, false, MainToken::Ampersand))
803            }
804            (c, _) => Err(PEK::UnexpectedChar(c).with_offset(initial_offset)),
805        }
806    }
807
808    // parses a variable or an anchor. expects '$' as next token.
809    // if this is a single $ (eg `[... $ ]` or the invalid `$ a`), then this function returns Ok(None),
810    // otherwise Ok(Some(variable_value)).
811    fn parse_variable(&mut self) -> Result<(usize, Option<&'a VariableValue<'a>>)> {
812        self.consume('$')?;
813
814        let mut res = String::new();
815        let (mut var_offset, first_c) = self.must_peek()?;
816
817        if !self.xid_start.contains(first_c) {
818            // -1 because we already consumed the '$'
819            return Ok((var_offset - 1, None));
820        }
821
822        res.push(first_c);
823        self.iter.next();
824        // important: if we are parsing a root unicodeset as a variable, we might reach EOF as
825        // a valid end of the variable name, so we cannot use must_peek here.
826        while let Some(&(offset, c)) = self.iter.peek() {
827            if !self.xid_continue.contains(c) {
828                break;
829            }
830            // only update the offset if we're adding a new char to our variable
831            var_offset = offset;
832            self.iter.next();
833            res.push(c);
834        }
835
836        if let Some(v) = self.variable_map.0.get(&res) {
837            return Ok((var_offset, Some(v)));
838        }
839
840        Err(PEK::UnknownVariable.with_offset(var_offset))
841    }
842
843    // parses and consumes: '{' (s charInString)* s '}'
844    fn parse_string(&mut self) -> Result<(usize, Literal)> {
845        self.consume('{')?;
846
847        let mut buffer = String::new();
848        let mut last_offset;
849
850        loop {
851            self.skip_whitespace();
852            last_offset = self.must_peek_index()?;
853            match self.must_peek_char()? {
854                '}' => {
855                    self.iter.next();
856                    break;
857                }
858                // note: c cannot be a whitespace, because we called skip_whitespace just before,
859                // so it's safe to call this guard function
860                c if legal_char_in_string_start(c) => {
861                    // don't need the offset, because '}' will always be the last char
862                    let (_, c) = self.parse_char()?;
863                    match c {
864                        SingleOrMultiChar::Single(c) => buffer.push(c),
865                        SingleOrMultiChar::Multi(first) => {
866                            buffer.push(first);
867                            self.parse_multi_escape_into_string(&mut buffer)?;
868                        }
869                    }
870                }
871                c => return self.error_here(PEK::UnexpectedChar(c)),
872            }
873        }
874
875        let mut chars = buffer.chars();
876        let literal = match (chars.next(), chars.next()) {
877            (Some(c), None) => Literal::CharKind(SingleOrMultiChar::Single(c)),
878            _ => Literal::String(buffer),
879        };
880        Ok((last_offset, literal))
881    }
882
883    // finishes a partial multi escape parse. in case of a parse error, self.single_set
884    // may be left in an inconsistent state
885    fn parse_multi_escape_into_set(&mut self) -> Result<()> {
886        // note: would be good to somehow merge the two multi_escape methods. splitting up the UnicodeSetBuilder into a more
887        // conventional parser + lexer combo might allow this.
888        // issue is that we cannot pass this method an argument that somehow mutates `self` in the current architecture.
889        // self.lexer.parse_multi_into_charappendable(&mut self.single_set) should work because the lifetimes are separate
890
891        // whitespace before first char of this loop (ie, second char in this multi_escape) must be
892        // enforced when creating the SingleOrMultiChar::Multi.
893        let mut first = true;
894        loop {
895            let skipped = self.skip_whitespace();
896            match self.must_peek_char()? {
897                '}' => {
898                    self.iter.next();
899                    return Ok(());
900                }
901                initial_c => {
902                    if skipped == 0 && !first {
903                        // bracketed hex code points must be separated by whitespace
904                        return self.error_here(PEK::UnexpectedChar(initial_c));
905                    }
906                    first = false;
907
908                    let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
909                    self.single_set.add_char(c);
910                }
911            }
912        }
913    }
914
915    // finishes a partial multi escape parse. in case of a parse error, the caller must clean up the
916    // string if necessary.
917    fn parse_multi_escape_into_string(&mut self, s: &mut String) -> Result<()> {
918        // whitespace before first char of this loop (ie, second char in this multi_escape) must be
919        // enforced when creating the SingleOrMultiChar::Multi.
920        let mut first = true;
921        loop {
922            let skipped = self.skip_whitespace();
923            match self.must_peek_char()? {
924                '}' => {
925                    self.iter.next();
926                    return Ok(());
927                }
928                initial_c => {
929                    if skipped == 0 && !first {
930                        // bracketed hex code points must be separated by whitespace
931                        return self.error_here(PEK::UnexpectedChar(initial_c));
932                    }
933                    first = false;
934
935                    let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
936                    s.push(c);
937                }
938            }
939        }
940    }
941
942    // starts with \ and consumes the whole escape sequence if a single
943    // char is escaped, otherwise pauses the parse after the first char
944    fn parse_escaped_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
945        self.consume('\\')?;
946
947        let (offset, next_char) = self.must_next()?;
948
949        match next_char {
950            'u' | 'x' if self.peek_char() == Some('{') => {
951                // bracketedHex
952                self.iter.next();
953
954                self.skip_whitespace();
955                let (_, first_c) = self.parse_hex_digits_into_char(1, 6)?;
956                let skipped = self.skip_whitespace();
957
958                match self.must_peek()? {
959                    (offset, '}') => {
960                        self.iter.next();
961                        Ok((offset, SingleOrMultiChar::Single(first_c)))
962                    }
963                    // note: enforcing whitespace after the first char here, because the parse_multi_escape functions
964                    // won't have access to this information anymore
965                    (offset, c) if c.is_ascii_hexdigit() && skipped > 0 => {
966                        Ok((offset, SingleOrMultiChar::Multi(first_c)))
967                    }
968                    (_, c) => self.error_here(PEK::UnexpectedChar(c)),
969                }
970            }
971            'u' => {
972                // 'u' hex{4}
973                self.parse_hex_digits_into_char(4, 4)
974                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
975            }
976            'x' => {
977                // 'x' hex{2}
978                self.parse_hex_digits_into_char(2, 2)
979                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
980            }
981            'U' => {
982                // 'U00' ('0' hex{5} | '10' hex{4})
983                self.consume('0')?;
984                self.consume('0')?;
985                self.parse_hex_digits_into_char(6, 6)
986                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
987            }
988            'N' => {
989                // parse code point with name in {}
990                // tracking issue: https://github.com/unicode-org/icu4x/issues/1397
991                Err(PEK::Unimplemented.with_offset(offset))
992            }
993            'a' => Ok((offset, SingleOrMultiChar::Single('\u{0007}'))),
994            'b' => Ok((offset, SingleOrMultiChar::Single('\u{0008}'))),
995            't' => Ok((offset, SingleOrMultiChar::Single('\u{0009}'))),
996            'n' => Ok((offset, SingleOrMultiChar::Single('\u{000A}'))),
997            'v' => Ok((offset, SingleOrMultiChar::Single('\u{000B}'))),
998            'f' => Ok((offset, SingleOrMultiChar::Single('\u{000C}'))),
999            'r' => Ok((offset, SingleOrMultiChar::Single('\u{000D}'))),
1000            _ => Ok((offset, SingleOrMultiChar::Single(next_char))),
1001        }
1002    }
1003
1004    // starts with :, consumes the trailing :]
1005    fn parse_property_posix(&mut self) -> Result<()> {
1006        self.consume(':')?;
1007        if self.must_peek_char()? == '^' {
1008            self.inverted = true;
1009            self.iter.next();
1010        }
1011
1012        self.parse_property_inner(':')?;
1013
1014        self.consume(']')?;
1015
1016        Ok(())
1017    }
1018
1019    // starts with \p{ or \P{, consumes the trailing }
1020    fn parse_property_perl(&mut self) -> Result<()> {
1021        self.consume('\\')?;
1022        match self.must_next()? {
1023            (_, 'p') => {}
1024            (_, 'P') => self.inverted = true,
1025            (offset, c) => return Err(PEK::UnexpectedChar(c).with_offset(offset)),
1026        }
1027        self.consume('{')?;
1028
1029        self.parse_property_inner('}')?;
1030
1031        Ok(())
1032    }
1033
1034    fn parse_property_inner(&mut self, end: char) -> Result<()> {
1035        // UnicodeSet spec ignores whitespace, '-', and '_',
1036        // but ECMA-262 requires '_', so we'll allow that.
1037        // TODO(#3559): support loose matching on property names (e.g., "AS  -_-  CII_Hex_ D-igit")
1038        // TODO(#3559): support more properties than ECMA-262
1039
1040        let property_offset;
1041
1042        let mut key_buffer = String::new();
1043        let mut value_buffer = String::new();
1044
1045        enum State {
1046            // initial state, nothing parsed yet
1047            Begin,
1048            // non-empty property name
1049            PropertyName,
1050            // property name parsed, '=' or '≠' parsed, no value parsed yet
1051            PropertyValueBegin,
1052            // non-empty property name, non-empty property value
1053            PropertyValue,
1054        }
1055        use State::*;
1056
1057        let mut state = Begin;
1058        // whether '=' (true) or '≠' (false) was parsed
1059        let mut equality = true;
1060
1061        loop {
1062            self.skip_whitespace();
1063            match (state, self.must_peek_char()?) {
1064                // parse the end of the property expression
1065                (PropertyName | PropertyValue, c) if c == end => {
1066                    // byte index of (full) property name/value is one back
1067                    property_offset = self.must_peek_index()? - 1;
1068                    self.iter.next();
1069                    break;
1070                }
1071                // parse the property name
1072                // NOTE: this might be too strict, because in the case of e.g. [:value:], we might want to
1073                // allow [:lower-case-letter:] ([:gc=lower-case-letter:] works)
1074                (Begin | PropertyName, c) if c.is_ascii_alphanumeric() || c == '_' => {
1075                    key_buffer.push(c);
1076                    self.iter.next();
1077                    state = PropertyName;
1078                }
1079                // parse the name-value separator
1080                (PropertyName, c @ ('=' | '≠')) => {
1081                    equality = c == '=';
1082                    self.iter.next();
1083                    state = PropertyValueBegin;
1084                }
1085                // parse the property value
1086                (PropertyValue | PropertyValueBegin, c) if c != end => {
1087                    value_buffer.push(c);
1088                    self.iter.next();
1089                    state = PropertyValue;
1090                }
1091                (_, c) => return self.error_here(PEK::UnexpectedChar(c)),
1092            }
1093        }
1094
1095        if !equality {
1096            self.inverted = !self.inverted;
1097        }
1098
1099        let inverted = self
1100            .load_property_codepoints(&key_buffer, &value_buffer)
1101            // any error that does not already have an offset should use the appropriate property offset
1102            .map_err(|e| e.or_with_offset(property_offset))?;
1103        if inverted {
1104            self.inverted = !self.inverted;
1105        }
1106
1107        Ok(())
1108    }
1109
1110    // returns whether the set needs to be inverted or not
1111    fn load_property_codepoints(&mut self, key: &str, value: &str) -> Result<bool> {
1112        // we support:
1113        // [:gc = value:]
1114        // [:sc = value:]
1115        // [:scx = value:]
1116        // [:Grapheme_Cluster_Break = value:]
1117        // [:Sentence_Break = value:]
1118        // [:Word_Break = value:]
1119        // [:value:] - looks up value in gc, sc
1120        // [:prop:] - binary property, returns codepoints that have the property
1121        // [:prop = truthy/falsy:] - same as above
1122
1123        let mut inverted = false;
1124
1125        // contains a value for the General_Category property that needs to be tried
1126        let mut try_gc = Err(PEK::UnknownProperty.into());
1127        // contains a value for the Script property that needs to be tried
1128        let mut try_sc = Err(PEK::UnknownProperty.into());
1129        // contains a value for the Script_Extensions property that needs to be tried
1130        let mut try_scx = Err(PEK::UnknownProperty.into());
1131        // contains a value for the Grapheme_Cluster_Break property that needs to be tried
1132        let mut try_gcb = Err(PEK::UnknownProperty.into());
1133        // contains a value for the Line_Break property that needs to be tried
1134        let mut try_lb = Err(PEK::UnknownProperty.into());
1135        // contains a value for the Sentence_Break property that needs to be tried
1136        let mut try_sb = Err(PEK::UnknownProperty.into());
1137        // contains a value for the Word_Break property that needs to be tried
1138        let mut try_wb = Err(PEK::UnknownProperty.into());
1139        // contains a supposed binary property name that needs to be tried
1140        let mut try_binary = Err(PEK::UnknownProperty.into());
1141        // contains a supposed canonical combining class property name that needs to be tried
1142        let mut try_ccc: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
1143        // contains a supposed block property name that needs to be tried
1144        let mut try_block: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
1145
1146        if !value.is_empty() {
1147            // key is gc, sc, scx, grapheme cluster break, sentence break, word break
1148            // value is a property value
1149            // OR
1150            // key is a binary property and value is a truthy/falsy value
1151
1152            match key.as_bytes() {
1153                GeneralCategory::NAME | GeneralCategory::SHORT_NAME => try_gc = Ok(value),
1154                GraphemeClusterBreak::NAME | GraphemeClusterBreak::SHORT_NAME => {
1155                    try_gcb = Ok(value)
1156                }
1157                LineBreak::NAME | LineBreak::SHORT_NAME => try_lb = Ok(value),
1158                Script::NAME | Script::SHORT_NAME => try_sc = Ok(value),
1159                SentenceBreak::NAME | SentenceBreak::SHORT_NAME => try_sb = Ok(value),
1160                WordBreak::NAME | WordBreak::SHORT_NAME => try_wb = Ok(value),
1161                CanonicalCombiningClass::NAME | CanonicalCombiningClass::SHORT_NAME => {
1162                    try_ccc = Ok(value)
1163                }
1164                b"Script_Extensions" | b"scx" => try_scx = Ok(value),
1165                b"Block" | b"blk" => try_block = Ok(value),
1166                _ => {
1167                    let normalized_value = value.to_ascii_lowercase();
1168                    let truthy = matches!(normalized_value.as_str(), "true" | "t" | "yes" | "y");
1169                    let falsy = matches!(normalized_value.as_str(), "false" | "f" | "no" | "n");
1170                    // value must either match truthy or falsy
1171                    if truthy == falsy {
1172                        return Err(PEK::UnknownProperty.into());
1173                    }
1174                    // correctness: if we reach this point, only `try_binary` can be Ok, hence
1175                    // it does not matter that further down we unconditionally return `inverted`,
1176                    // because only `try_binary` can enter that code path.
1177                    inverted = falsy;
1178                    try_binary = Ok(key);
1179                }
1180            }
1181        } else {
1182            // key is binary property
1183            // OR a value of gc, sc (only gc or sc are supported as implicit keys by UTS35!)
1184            try_gc = Ok(key);
1185            try_sc = Ok(key);
1186            try_binary = Ok(key);
1187        }
1188
1189        try_gc
1190            .and_then(|value| self.try_load_general_category_set(value))
1191            .or_else(|_| try_sc.and_then(|value| self.try_load_script_set(value)))
1192            .or_else(|_| try_scx.and_then(|value| self.try_load_script_extensions_set(value)))
1193            .or_else(|_| try_binary.and_then(|value| self.try_load_ecma262_binary_set(value)))
1194            .or_else(|_| try_gcb.and_then(|value| self.try_load_grapheme_cluster_break_set(value)))
1195            .or_else(|_| try_lb.and_then(|value| self.try_load_line_break_set(value)))
1196            .or_else(|_| try_sb.and_then(|value| self.try_load_sentence_break_set(value)))
1197            .or_else(|_| try_wb.and_then(|value| self.try_load_word_break_set(value)))
1198            .or_else(|_| try_ccc.and_then(|value| self.try_load_ccc_set(value)))
1199            .or_else(|_| try_block.and_then(|value| self.try_load_block_set(value)))?;
1200        Ok(inverted)
1201    }
1202
1203    fn finalize(mut self) -> (CodePointInversionListBuilder, BTreeSet<String>) {
1204        if self.inverted {
1205            // code point inversion; removes all strings
1206            #[cfg(feature = "log")]
1207            if !self.string_set.is_empty() {
1208                log::info!(
1209                    "Inverting a unicode set with strings. This removes all strings entirely."
1210                );
1211            }
1212            self.string_set.clear();
1213            self.single_set.complement();
1214        }
1215
1216        (self.single_set, self.string_set)
1217    }
1218
1219    // parses either a raw char or an escaped char. all chars are allowed, the caller must make sure to handle
1220    // cases where some characters are not allowed
1221    fn parse_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
1222        let (offset, c) = self.must_peek()?;
1223        match c {
1224            '\\' => self.parse_escaped_char(),
1225            _ => {
1226                self.iter.next();
1227                Ok((offset, SingleOrMultiChar::Single(c)))
1228            }
1229        }
1230    }
1231
1232    // note: could turn this from the current two-pass approach into a one-pass approach
1233    // by manually parsing the digits instead of using u32::from_str_radix.
1234    fn parse_hex_digits_into_char(&mut self, min: usize, max: usize) -> Result<(usize, char)> {
1235        let first_offset = self.must_peek_index()?;
1236        let end_offset = self.validate_hex_digits(min, max)?;
1237
1238        // validate_hex_digits ensures that chars (including the last one) are ascii hex digits,
1239        // which are all exactly one UTF-8 byte long, so slicing on these offsets always respects char boundaries
1240        #[allow(clippy::indexing_slicing)]
1241        let hex_source = &self.source[first_offset..=end_offset];
1242        let num = u32::from_str_radix(hex_source, 16).map_err(|_| PEK::Internal)?;
1243        char::try_from(num)
1244            .map(|c| (end_offset, c))
1245            .map_err(|_| PEK::InvalidEscape.with_offset(end_offset))
1246    }
1247
1248    // validates [0-9a-fA-F]{min,max}, returns the offset of the last digit, consuming everything in the process
1249    fn validate_hex_digits(&mut self, min: usize, max: usize) -> Result<usize> {
1250        let mut last_offset = 0;
1251        for count in 0..max {
1252            let (offset, c) = self.must_peek()?;
1253            if !c.is_ascii_hexdigit() {
1254                if count < min {
1255                    return Err(PEK::UnexpectedChar(c).with_offset(offset));
1256                } else {
1257                    break;
1258                }
1259            }
1260            self.iter.next();
1261            last_offset = offset;
1262        }
1263        Ok(last_offset)
1264    }
1265
1266    // returns the number of skipped whitespace chars
1267    fn skip_whitespace(&mut self) -> usize {
1268        let mut num = 0;
1269        while let Some(c) = self.peek_char() {
1270            if !self.pat_ws.contains(c) {
1271                break;
1272            }
1273            self.iter.next();
1274            num += 1;
1275        }
1276        num
1277    }
1278
1279    fn consume(&mut self, expected: char) -> Result<()> {
1280        match self.must_next()? {
1281            (offset, c) if c != expected => Err(PEK::UnexpectedChar(c).with_offset(offset)),
1282            _ => Ok(()),
1283        }
1284    }
1285
1286    // use this whenever an empty iterator would imply an Eof error
1287    fn must_next(&mut self) -> Result<(usize, char)> {
1288        self.iter.next().ok_or(PEK::Eof.into())
1289    }
1290
1291    // use this whenever an empty iterator would imply an Eof error
1292    fn must_peek(&mut self) -> Result<(usize, char)> {
1293        self.iter.peek().copied().ok_or(PEK::Eof.into())
1294    }
1295
1296    // must_peek, but looks two chars ahead. use sparingly
1297    fn must_peek_double(&mut self) -> Result<(usize, char)> {
1298        let mut copy = self.iter.clone();
1299        copy.next();
1300        copy.next().ok_or(PEK::Eof.into())
1301    }
1302
1303    // see must_peek
1304    fn must_peek_char(&mut self) -> Result<char> {
1305        self.must_peek().map(|(_, c)| c)
1306    }
1307
1308    // see must_peek
1309    fn must_peek_index(&mut self) -> Result<usize> {
1310        self.must_peek().map(|(idx, _)| idx)
1311    }
1312
1313    fn peek_char(&mut self) -> Option<char> {
1314        self.iter.peek().map(|&(_, c)| c)
1315    }
1316
1317    // TODO: return Result<!> once ! is stable
1318    #[inline]
1319    fn error_here<T>(&mut self, kind: ParseErrorKind) -> Result<T> {
1320        match self.iter.peek() {
1321            None => Err(kind.into()),
1322            Some(&(offset, _)) => Err(kind.with_offset(offset)),
1323        }
1324    }
1325
1326    fn process_strings(&mut self, op: Operation, other_strings: BTreeSet<String>) {
1327        match op {
1328            Operation::Union => self.string_set.extend(other_strings),
1329            Operation::Difference => {
1330                self.string_set = self
1331                    .string_set
1332                    .difference(&other_strings)
1333                    .cloned()
1334                    .collect()
1335            }
1336            Operation::Intersection => {
1337                self.string_set = self
1338                    .string_set
1339                    .intersection(&other_strings)
1340                    .cloned()
1341                    .collect()
1342            }
1343        }
1344    }
1345
1346    fn process_chars(&mut self, op: Operation, other_chars: CodePointInversionList) {
1347        match op {
1348            Operation::Union => self.single_set.add_set(&other_chars),
1349            Operation::Difference => self.single_set.remove_set(&other_chars),
1350            Operation::Intersection => self.single_set.retain_set(&other_chars),
1351        }
1352    }
1353
1354    fn try_load_general_category_set(&mut self, name: &str) -> Result<()> {
1355        // TODO(#3550): This could be cached; does not depend on name.
1356        let name_map =
1357            PropertyParser::<GeneralCategoryGroup>::try_new_unstable(self.property_provider)
1358                .map_err(|_| PEK::Internal)?;
1359        let gc_value = name_map
1360            .as_borrowed()
1361            .get_loose(name)
1362            .ok_or(PEK::UnknownProperty)?;
1363        // TODO(#3550): This could be cached; does not depend on name.
1364        let set = CodePointMapData::<GeneralCategory>::try_new_unstable(self.property_provider)
1365            .map_err(|_| PEK::Internal)?
1366            .as_borrowed()
1367            .get_set_for_value_group(gc_value);
1368        self.single_set.add_set(&set.to_code_point_inversion_list());
1369        Ok(())
1370    }
1371
1372    fn try_get_script(&self, name: &str) -> Result<Script> {
1373        // TODO(#3550): This could be cached; does not depend on name.
1374        let name_map = PropertyParser::<Script>::try_new_unstable(self.property_provider)
1375            .map_err(|_| PEK::Internal)?;
1376        name_map
1377            .as_borrowed()
1378            .get_loose(name)
1379            .ok_or(PEK::UnknownProperty.into())
1380    }
1381
1382    fn try_load_script_set(&mut self, name: &str) -> Result<()> {
1383        let sc_value = self.try_get_script(name)?;
1384        // TODO(#3550): This could be cached; does not depend on name.
1385        let property_map = CodePointMapData::<Script>::try_new_unstable(self.property_provider)
1386            .map_err(|_| PEK::Internal)?;
1387        let set = property_map.as_borrowed().get_set_for_value(sc_value);
1388        self.single_set.add_set(&set.to_code_point_inversion_list());
1389        Ok(())
1390    }
1391
1392    fn try_load_script_extensions_set(&mut self, name: &str) -> Result<()> {
1393        // TODO(#3550): This could be cached; does not depend on name.
1394        let scx = ScriptWithExtensions::try_new_unstable(self.property_provider)
1395            .map_err(|_| PEK::Internal)?;
1396        let sc_value = self.try_get_script(name)?;
1397        let set = scx.as_borrowed().get_script_extensions_set(sc_value);
1398        self.single_set.add_set(&set);
1399        Ok(())
1400    }
1401
1402    fn try_load_ecma262_binary_set(&mut self, name: &str) -> Result<()> {
1403        let set =
1404            CodePointSetData::try_new_for_ecma262_unstable(self.property_provider, name.as_bytes())
1405                .ok_or(PEK::UnknownProperty)?
1406                .map_err(|_data_error| PEK::Internal)?;
1407        self.single_set.add_set(&set.to_code_point_inversion_list());
1408        Ok(())
1409    }
1410
1411    fn try_load_grapheme_cluster_break_set(&mut self, name: &str) -> Result<()> {
1412        let parser =
1413            PropertyParser::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
1414                .map_err(|_| PEK::Internal)?;
1415        let gcb_value = parser
1416            .as_borrowed()
1417            .get_loose(name)
1418            .ok_or(PEK::UnknownProperty)?;
1419        // TODO(#3550): This could be cached; does not depend on name.
1420        let property_map =
1421            CodePointMapData::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
1422                .map_err(|_| PEK::Internal)?;
1423        let set = property_map.as_borrowed().get_set_for_value(gcb_value);
1424        self.single_set.add_set(&set.to_code_point_inversion_list());
1425        Ok(())
1426    }
1427
1428    fn try_load_line_break_set(&mut self, name: &str) -> Result<()> {
1429        let parser = PropertyParser::<LineBreak>::try_new_unstable(self.property_provider)
1430            .map_err(|_| PEK::Internal)?;
1431        let lb_value = parser
1432            .as_borrowed()
1433            .get_loose(name)
1434            .ok_or(PEK::UnknownProperty)?;
1435        // TODO(#3550): This could be cached; does not depend on name.
1436        let property_map = CodePointMapData::<LineBreak>::try_new_unstable(self.property_provider)
1437            .map_err(|_| PEK::Internal)?;
1438        let set = property_map.as_borrowed().get_set_for_value(lb_value);
1439        self.single_set.add_set(&set.to_code_point_inversion_list());
1440        Ok(())
1441    }
1442
1443    fn try_load_sentence_break_set(&mut self, name: &str) -> Result<()> {
1444        let parser = PropertyParser::<SentenceBreak>::try_new_unstable(self.property_provider)
1445            .map_err(|_| PEK::Internal)?;
1446        let sb_value = parser
1447            .as_borrowed()
1448            .get_loose(name)
1449            .ok_or(PEK::UnknownProperty)?;
1450        // TODO(#3550): This could be cached; does not depend on name.
1451        let property_map =
1452            CodePointMapData::<SentenceBreak>::try_new_unstable(self.property_provider)
1453                .map_err(|_| PEK::Internal)?;
1454        let set = property_map.as_borrowed().get_set_for_value(sb_value);
1455        self.single_set.add_set(&set.to_code_point_inversion_list());
1456        Ok(())
1457    }
1458
1459    fn try_load_word_break_set(&mut self, name: &str) -> Result<()> {
1460        let parser = PropertyParser::<WordBreak>::try_new_unstable(self.property_provider)
1461            .map_err(|_| PEK::Internal)?;
1462        let wb_value = parser
1463            .as_borrowed()
1464            .get_loose(name)
1465            .ok_or(PEK::UnknownProperty)?;
1466        // TODO(#3550): This could be cached; does not depend on name.
1467        let property_map = CodePointMapData::<WordBreak>::try_new_unstable(self.property_provider)
1468            .map_err(|_| PEK::Internal)?;
1469        let set = property_map.as_borrowed().get_set_for_value(wb_value);
1470        self.single_set.add_set(&set.to_code_point_inversion_list());
1471        Ok(())
1472    }
1473
1474    fn try_load_ccc_set(&mut self, name: &str) -> Result<()> {
1475        let parser =
1476            PropertyParser::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
1477                .map_err(|_| PEK::Internal)?;
1478        let value = parser
1479            .as_borrowed()
1480            .get_loose(name)
1481            // TODO: make the property parser do this
1482            .or_else(|| {
1483                name.parse()
1484                    .ok()
1485                    .map(CanonicalCombiningClass::from_icu4c_value)
1486            })
1487            .ok_or(PEK::UnknownProperty)?;
1488        // TODO(#3550): This could be cached; does not depend on name.
1489        let property_map =
1490            CodePointMapData::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
1491                .map_err(|_| PEK::Internal)?;
1492        let set = property_map.as_borrowed().get_set_for_value(value);
1493        self.single_set.add_set(&set.to_code_point_inversion_list());
1494        Ok(())
1495    }
1496
1497    fn try_load_block_set(&mut self, name: &str) -> Result<()> {
1498        // TODO: source these from properties
1499        self.single_set
1500            .add_range(match name.to_ascii_lowercase().as_str() {
1501                "arabic" => '\u{0600}'..'\u{06FF}',
1502                "thaana" => '\u{0780}'..'\u{07BF}',
1503                _ => {
1504                    #[cfg(feature = "log")]
1505                    log::warn!("Skipping :block={name}:");
1506                    return Err(PEK::Unimplemented.into());
1507                }
1508            });
1509        Ok(())
1510    }
1511}
1512
/// Parses a UnicodeSet pattern and returns a UnicodeSet in the form of a [`CodePointInversionListAndStringList`],
/// as well as the number of bytes consumed from the source string.
///
/// Supports UnicodeSets as described in [UTS #35 - Unicode Sets](https://unicode.org/reports/tr35/#Unicode_Sets).
///
/// The error type of the returned Result can be pretty-printed with [`ParseError::fmt_with_source`].
///
/// # Return value
///
/// On success, the tuple holds the parsed set and the number of bytes of `source` that were consumed.
/// Only the first UnicodeSet in `source` is parsed, so the consumed count can be smaller than
/// `source.len()` (see the "Supports partial parses" example below).
///
/// # Variables
///
/// If you need support for variables inside UnicodeSets (e.g., `[$start-$end]`), use [`parse_with_variables`].
///
/// # Limitations
///
/// * Currently, we only support the [ECMA-262 properties](https://tc39.es/ecma262/#table-nonbinary-unicode-properties).
///   The property names must match the exact spelling listed in ECMA-262. Note that we do support UTS35 syntax for elided `General_Category`
///   and `Script` property names, i.e., `[:Latn:]` and `[:Ll:]` are both valid, with the former implying the `Script` property, and the latter the
///   `General_Category` property.
/// * We do not support `\N{Unicode code point name}` character escaping. Use any other escape method described in UTS35.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
///
/// # Examples
///
/// Parse ranges
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let source = "[a-zA-Z0-9]";
/// let (set, consumed) = parse(source).unwrap();
/// let code_points = set.code_points();
///
/// assert!(code_points.contains_range('a'..='z'));
/// assert!(code_points.contains_range('A'..='Z'));
/// assert!(code_points.contains_range('0'..='9'));
/// assert_eq!(consumed, source.len());
/// ```
///
/// Parse properties, set operations, inner sets
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) =
///     parse("[[:^ll:]-[^][:gc = Lowercase Letter:]&[^[[^]-[a-z]]]]").unwrap();
/// assert!(set.code_points().contains_range('a'..='z'));
/// assert_eq!(('a'..='z').count(), set.size());
/// ```
///
/// Inversions remove strings
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) =
///     parse(r"[[a-z{hello\ world}]&[^a-y{hello\ world}]]").unwrap();
/// assert!(set.contains('z'));
/// assert_eq!(set.size(), 1);
/// assert!(!set.has_strings());
/// ```
///
/// Set operators (including the implicit union) have the same precedence and are left-associative
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) = parse("[[ace][bdf] - [abc][def]]").unwrap();
/// assert!(set.code_points().contains_range('d'..='f'));
/// assert_eq!(set.size(), ('d'..='f').count());
/// ```
///
/// Supports partial parses
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, consumed) = parse("[a-c][x-z]").unwrap();
/// let code_points = set.code_points();
/// assert!(code_points.contains_range('a'..='c'));
/// assert!(!code_points.contains_range('x'..='z'));
/// assert_eq!(set.size(), ('a'..='c').count());
/// // only the first UnicodeSet is parsed
/// assert_eq!(consumed, "[a-c]".len());
/// ```
#[cfg(feature = "compiled_data")]
pub fn parse(source: &str) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    // Delegates to the provider-generic entry point, using the baked property data.
    parse_unstable(source, &icu_properties::provider::Baked)
}
1598
/// Parses a UnicodeSet pattern with support for variables enabled.
///
/// Variables referenced in `source` (e.g., `$start`) are looked up in `variable_map`.
/// As with [`parse`], the returned tuple holds the parsed set and the number of bytes
/// consumed from `source`.
///
/// See [`parse`] for more information.
///
/// # Examples
///
/// ```
/// use icu::experimental::unicodeset_parse::*;
///
/// let (my_set, _) = parse("[abc]").unwrap();
///
/// let mut variable_map = VariableMap::new();
/// variable_map.insert_char("start".into(), 'a').unwrap();
/// variable_map.insert_char("end".into(), 'z').unwrap();
/// variable_map.insert_string("str".into(), "Hello World".into()).unwrap();
/// variable_map.insert_set("the_set".into(), my_set).unwrap();
///
/// // If a variable already exists, `Err` is returned, and the map is not updated.
/// variable_map.insert_char("end".into(), 'Ω').unwrap_err();
///
/// let source = "[[$start-$end]-$the_set $str]";
/// let (set, consumed) = parse_with_variables(source, &variable_map).unwrap();
/// assert_eq!(consumed, source.len());
/// assert!(set.code_points().contains_range('d'..='z'));
/// assert!(set.contains_str("Hello World"));
/// assert_eq!(set.size(), 1 + ('d'..='z').count());
/// ```
#[cfg(feature = "compiled_data")]
pub fn parse_with_variables(
    source: &str,
    variable_map: &VariableMap<'_>,
) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    // Delegates to the provider-generic entry point, using the baked property data.
    parse_unstable_with_variables(source, variable_map, &icu_properties::provider::Baked)
}
1632
1633#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, parse_with_variables)]
1634pub fn parse_unstable_with_variables<P>(
1635    source: &str,
1636    variable_map: &VariableMap<'_>,
1637    provider: &P,
1638) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1639where
1640    P: ?Sized
1641        + DataProvider<PropertyBinaryAlphabeticV1>
1642        + DataProvider<PropertyBinaryAsciiHexDigitV1>
1643        + DataProvider<PropertyBinaryBidiControlV1>
1644        + DataProvider<PropertyBinaryBidiMirroredV1>
1645        + DataProvider<PropertyBinaryCasedV1>
1646        + DataProvider<PropertyBinaryCaseIgnorableV1>
1647        + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
1648        + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
1649        + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
1650        + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
1651        + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
1652        + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
1653        + DataProvider<PropertyBinaryDashV1>
1654        + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
1655        + DataProvider<PropertyBinaryDeprecatedV1>
1656        + DataProvider<PropertyBinaryDiacriticV1>
1657        + DataProvider<PropertyBinaryEmojiComponentV1>
1658        + DataProvider<PropertyBinaryEmojiModifierBaseV1>
1659        + DataProvider<PropertyBinaryEmojiModifierV1>
1660        + DataProvider<PropertyBinaryEmojiPresentationV1>
1661        + DataProvider<PropertyBinaryEmojiV1>
1662        + DataProvider<PropertyBinaryExtendedPictographicV1>
1663        + DataProvider<PropertyBinaryExtenderV1>
1664        + DataProvider<PropertyBinaryGraphemeBaseV1>
1665        + DataProvider<PropertyBinaryGraphemeExtendV1>
1666        + DataProvider<PropertyBinaryHexDigitV1>
1667        + DataProvider<PropertyBinaryIdContinueV1>
1668        + DataProvider<PropertyBinaryIdeographicV1>
1669        + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
1670        + DataProvider<PropertyBinaryIdStartV1>
1671        + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
1672        + DataProvider<PropertyBinaryJoinControlV1>
1673        + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
1674        + DataProvider<PropertyBinaryLowercaseV1>
1675        + DataProvider<PropertyBinaryMathV1>
1676        + DataProvider<PropertyBinaryNoncharacterCodePointV1>
1677        + DataProvider<PropertyBinaryPatternSyntaxV1>
1678        + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
1679        + DataProvider<PropertyBinaryQuotationMarkV1>
1680        + DataProvider<PropertyBinaryRadicalV1>
1681        + DataProvider<PropertyBinaryRegionalIndicatorV1>
1682        + DataProvider<PropertyBinarySentenceTerminalV1>
1683        + DataProvider<PropertyBinarySoftDottedV1>
1684        + DataProvider<PropertyBinaryTerminalPunctuationV1>
1685        + DataProvider<PropertyBinaryUnifiedIdeographV1>
1686        + DataProvider<PropertyBinaryUppercaseV1>
1687        + DataProvider<PropertyBinaryVariationSelectorV1>
1688        + DataProvider<PropertyBinaryWhiteSpaceV1>
1689        + DataProvider<PropertyBinaryXidContinueV1>
1690        + DataProvider<PropertyBinaryXidStartV1>
1691        + DataProvider<PropertyEnumCanonicalCombiningClassV1>
1692        + DataProvider<PropertyEnumGeneralCategoryV1>
1693        + DataProvider<PropertyEnumGraphemeClusterBreakV1>
1694        + DataProvider<PropertyEnumLineBreakV1>
1695        + DataProvider<PropertyEnumScriptV1>
1696        + DataProvider<PropertyEnumSentenceBreakV1>
1697        + DataProvider<PropertyEnumWordBreakV1>
1698        + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
1699        + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
1700        + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
1701        + DataProvider<PropertyNameParseLineBreakV1>
1702        + DataProvider<PropertyNameParseScriptV1>
1703        + DataProvider<PropertyNameParseSentenceBreakV1>
1704        + DataProvider<PropertyNameParseWordBreakV1>
1705        + DataProvider<PropertyScriptWithExtensionsV1>,
1706{
1707    // TODO(#3550): Add function "parse_overescaped" that uses a custom iterator to de-overescape (i.e., maps \\ to \) on-the-fly?
1708    // ^ will likely need a different iterator type on UnicodeSetBuilder
1709
1710    let mut iter = source.char_indices().peekable();
1711
1712    let xid_start =
1713        CodePointSetData::try_new_unstable::<XidStart>(provider).map_err(|_| PEK::Internal)?;
1714    let xid_start_list = xid_start.to_code_point_inversion_list();
1715    let xid_continue =
1716        CodePointSetData::try_new_unstable::<XidContinue>(provider).map_err(|_| PEK::Internal)?;
1717    let xid_continue_list = xid_continue.to_code_point_inversion_list();
1718
1719    let pat_ws = CodePointSetData::try_new_unstable::<PatternWhiteSpace>(provider)
1720        .map_err(|_| PEK::Internal)?;
1721    let pat_ws_list = pat_ws.to_code_point_inversion_list();
1722
1723    let mut builder = UnicodeSetBuilder::new_internal(
1724        &mut iter,
1725        source,
1726        variable_map,
1727        &xid_start_list,
1728        &xid_continue_list,
1729        &pat_ws_list,
1730        provider,
1731    );
1732
1733    builder.parse_unicode_set()?;
1734    let (single, string_set) = builder.finalize();
1735    let built_single = single.build();
1736
1737    let mut strings = string_set.into_iter().collect::<Vec<_>>();
1738    strings.sort();
1739    let zerovec = (&strings).into();
1740
1741    let cpinvlistandstrlist = CodePointInversionListAndStringList::try_from(built_single, zerovec)
1742        .map_err(|_| PEK::Internal)?;
1743
1744    let parsed_bytes = match iter.peek().copied() {
1745        None => source.len(),
1746        Some((offset, _)) => offset,
1747    };
1748
1749    Ok((cpinvlistandstrlist, parsed_bytes))
1750}
1751
1752#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, parse)]
1753pub fn parse_unstable<P>(
1754    source: &str,
1755    provider: &P,
1756) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1757where
1758    P: ?Sized
1759        + DataProvider<PropertyBinaryAlphabeticV1>
1760        + DataProvider<PropertyBinaryAsciiHexDigitV1>
1761        + DataProvider<PropertyBinaryBidiControlV1>
1762        + DataProvider<PropertyBinaryBidiMirroredV1>
1763        + DataProvider<PropertyBinaryCasedV1>
1764        + DataProvider<PropertyBinaryCaseIgnorableV1>
1765        + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
1766        + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
1767        + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
1768        + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
1769        + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
1770        + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
1771        + DataProvider<PropertyBinaryDashV1>
1772        + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
1773        + DataProvider<PropertyBinaryDeprecatedV1>
1774        + DataProvider<PropertyBinaryDiacriticV1>
1775        + DataProvider<PropertyBinaryEmojiComponentV1>
1776        + DataProvider<PropertyBinaryEmojiModifierBaseV1>
1777        + DataProvider<PropertyBinaryEmojiModifierV1>
1778        + DataProvider<PropertyBinaryEmojiPresentationV1>
1779        + DataProvider<PropertyBinaryEmojiV1>
1780        + DataProvider<PropertyBinaryExtendedPictographicV1>
1781        + DataProvider<PropertyBinaryExtenderV1>
1782        + DataProvider<PropertyBinaryGraphemeBaseV1>
1783        + DataProvider<PropertyBinaryGraphemeExtendV1>
1784        + DataProvider<PropertyBinaryHexDigitV1>
1785        + DataProvider<PropertyBinaryIdContinueV1>
1786        + DataProvider<PropertyBinaryIdeographicV1>
1787        + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
1788        + DataProvider<PropertyBinaryIdStartV1>
1789        + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
1790        + DataProvider<PropertyBinaryJoinControlV1>
1791        + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
1792        + DataProvider<PropertyBinaryLowercaseV1>
1793        + DataProvider<PropertyBinaryMathV1>
1794        + DataProvider<PropertyBinaryNoncharacterCodePointV1>
1795        + DataProvider<PropertyBinaryPatternSyntaxV1>
1796        + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
1797        + DataProvider<PropertyBinaryQuotationMarkV1>
1798        + DataProvider<PropertyBinaryRadicalV1>
1799        + DataProvider<PropertyBinaryRegionalIndicatorV1>
1800        + DataProvider<PropertyBinarySentenceTerminalV1>
1801        + DataProvider<PropertyBinarySoftDottedV1>
1802        + DataProvider<PropertyBinaryTerminalPunctuationV1>
1803        + DataProvider<PropertyBinaryUnifiedIdeographV1>
1804        + DataProvider<PropertyBinaryUppercaseV1>
1805        + DataProvider<PropertyBinaryVariationSelectorV1>
1806        + DataProvider<PropertyBinaryWhiteSpaceV1>
1807        + DataProvider<PropertyBinaryXidContinueV1>
1808        + DataProvider<PropertyBinaryXidStartV1>
1809        + DataProvider<PropertyEnumCanonicalCombiningClassV1>
1810        + DataProvider<PropertyEnumGeneralCategoryV1>
1811        + DataProvider<PropertyEnumGraphemeClusterBreakV1>
1812        + DataProvider<PropertyEnumLineBreakV1>
1813        + DataProvider<PropertyEnumScriptV1>
1814        + DataProvider<PropertyEnumSentenceBreakV1>
1815        + DataProvider<PropertyEnumWordBreakV1>
1816        + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
1817        + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
1818        + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
1819        + DataProvider<PropertyNameParseLineBreakV1>
1820        + DataProvider<PropertyNameParseScriptV1>
1821        + DataProvider<PropertyNameParseSentenceBreakV1>
1822        + DataProvider<PropertyNameParseWordBreakV1>
1823        + DataProvider<PropertyScriptWithExtensionsV1>,
1824{
1825    let dummy = Default::default();
1826    parse_unstable_with_variables(source, &dummy, provider)
1827}
1828
1829#[cfg(test)]
1830mod tests {
1831    use core::ops::RangeInclusive;
1832    use std::collections::HashSet;
1833
1834    use super::*;
1835
1836    // "aabxzz" => [a..=a, b..=x, z..=z]
1837    fn range_iter_from_str(s: &str) -> impl Iterator<Item = RangeInclusive<u32>> {
1838        debug_assert_eq!(
1839            s.chars().count() % 2,
1840            0,
1841            "string \"{}\" does not contain an even number of code points",
1842            s.escape_debug()
1843        );
1844        let mut res = vec![];
1845        let mut skip = false;
1846        for (a, b) in s.chars().zip(s.chars().skip(1)) {
1847            if skip {
1848                skip = false;
1849                continue;
1850            }
1851            let a = a as u32;
1852            let b = b as u32;
1853            res.push(a..=b);
1854            skip = true;
1855        }
1856
1857        res.into_iter()
1858    }
1859
1860    fn assert_set_equality<'a>(
1861        source: &str,
1862        cpinvlistandstrlist: &CodePointInversionListAndStringList,
1863        single: impl Iterator<Item = RangeInclusive<u32>>,
1864        strings: impl Iterator<Item = &'a str>,
1865    ) {
1866        let expected_ranges: HashSet<_> = single.collect();
1867        let actual_ranges: HashSet<_> = cpinvlistandstrlist.code_points().iter_ranges().collect();
1868        assert_eq!(
1869            actual_ranges,
1870            expected_ranges,
1871            "got unexpected ranges {:?}, expected {:?} for parsed set \"{}\"",
1872            actual_ranges,
1873            expected_ranges,
1874            source.escape_debug()
1875        );
1876        let mut expected_size = cpinvlistandstrlist.code_points().size();
1877        for s in strings {
1878            expected_size += 1;
1879            assert!(
1880                cpinvlistandstrlist.contains_str(s),
1881                "missing string \"{}\" from parsed set \"{}\"",
1882                s.escape_debug(),
1883                source.escape_debug()
1884            );
1885        }
1886        let actual_size = cpinvlistandstrlist.size();
1887        assert_eq!(
1888            actual_size,
1889            expected_size,
1890            "got unexpected size {}, expected {} for parsed set \"{}\"",
1891            actual_size,
1892            expected_size,
1893            source.escape_debug()
1894        );
1895    }
1896
1897    fn assert_is_error_and_message_eq(source: &str, expected_err: &str, vm: &VariableMap<'_>) {
1898        let result = parse_with_variables(source, vm);
1899        assert!(result.is_err(), "{source} does not cause an error!");
1900        let err = result.unwrap_err();
1901        assert_eq!(err.fmt_with_source(source).to_string(), expected_err);
1902    }
1903
1904    #[test]
1905    fn test_semantics_with_variables() {
1906        let mut map_char_char = VariableMap::default();
1907        map_char_char.insert_char("a".to_string(), 'a').unwrap();
1908        map_char_char.insert_char("var2".to_string(), 'z').unwrap();
1909
1910        let mut map_headache = VariableMap::default();
1911        map_headache.insert_char("hehe".to_string(), '-').unwrap();
1912
1913        let mut map_char_string = VariableMap::default();
1914        map_char_string.insert_char("a".to_string(), 'a').unwrap();
1915        map_char_string
1916            .insert_string("var2".to_string(), "abc".to_string())
1917            .unwrap();
1918
1919        let (set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
1920        let mut map_char_set = VariableMap::default();
1921        map_char_set.insert_char("a".to_string(), 'a').unwrap();
1922        map_char_set.insert_set("set".to_string(), set).unwrap();
1923
1924        let cases: Vec<(_, _, _, Vec<&str>)> = vec![
1925            // simple
1926            (&map_char_char, "[$a]", "aa", vec![]),
1927            (&map_char_char, "[ $a ]", "aa", vec![]),
1928            (&map_char_char, "[$a$]", "aa\u{ffff}\u{ffff}", vec![]),
1929            (&map_char_char, "[$a$ ]", "aa\u{ffff}\u{ffff}", vec![]),
1930            (&map_char_char, "[$a$var2]", "aazz", vec![]),
1931            (&map_char_char, "[$a - $var2]", "az", vec![]),
1932            (&map_char_char, "[$a-$var2]", "az", vec![]),
1933            (&map_headache, "[a $hehe z]", "aazz--", vec![]),
1934            (
1935                &map_char_char,
1936                "[[$]var2]",
1937                "\u{ffff}\u{ffff}vvaarr22",
1938                vec![],
1939            ),
1940            // variable prefix escaping
1941            (&map_char_char, r"[\$var2]", "$$vvaarr22", vec![]),
1942            (&map_char_char, r"[\\$var2]", r"\\zz", vec![]),
1943            // no variable dereferencing in strings
1944            (&map_char_char, "[{$a}]", "", vec!["$a"]),
1945            // set operations
1946            (&map_char_set, "[$set & [b-z]]", "bz", vec![]),
1947            (&map_char_set, "[[a-z]-[b-z]]", "aa", vec![]),
1948            (&map_char_set, "[$set-[b-z]]", "aa", vec!["Hello, World!"]),
1949            (&map_char_set, "[$set-$set]", "", vec![]),
1950            (&map_char_set, "[[a-zA]-$set]", "AA", vec![]),
1951            (&map_char_set, "[$set[b-z]]", "az", vec!["Hello, World!"]),
1952            (&map_char_set, "[[a-a]$set]", "az", vec!["Hello, World!"]),
1953            (&map_char_set, "$set", "az", vec!["Hello, World!"]),
1954            // strings
1955            (&map_char_string, "[$var2]", "", vec!["abc"]),
1956        ];
1957        for (variable_map, source, single, strings) in cases {
1958            let parsed = parse_with_variables(source, variable_map);
1959            if let Err(err) = parsed {
1960                panic!(
1961                    "{source} results in an error: {}",
1962                    err.fmt_with_source(source)
1963                );
1964            }
1965            let (set, consumed) = parsed.unwrap();
1966            assert_eq!(consumed, source.len(), "{source:?} is not fully consumed");
1967            assert_set_equality(
1968                source,
1969                &set,
1970                range_iter_from_str(single),
1971                strings.into_iter(),
1972            );
1973        }
1974    }
1975
1976    #[test]
1977    fn test_semantics() {
1978        const ALL_CHARS: &str = "\x00\u{10FFFF}";
1979        let cases: Vec<(_, _, Vec<&str>)> = vec![
1980            // simple
1981            ("[a]", "aa", vec![]),
1982            ("[]", "", vec![]),
1983            ("[qax]", "aaqqxx", vec![]),
1984            ("[a-z]", "az", vec![]),
1985            ("[--]", "--", vec![]),
1986            ("[a-b-]", "ab--", vec![]),
1987            ("[[a-b]-]", "ab--", vec![]),
1988            ("[{ab}-]", "--", vec!["ab"]),
1989            ("[-a-b]", "ab--", vec![]),
1990            ("[-a]", "--aa", vec![]),
1991            // whitespace escaping
1992            (r"[\n]", "\n\n", vec![]),
1993            ("[\\\n]", "\n\n", vec![]),
1994            // empty - whitespace is skipped
1995            ("[\n]", "", vec![]),
1996            ("[\u{9}]", "", vec![]),
1997            ("[\u{A}]", "", vec![]),
1998            ("[\u{B}]", "", vec![]),
1999            ("[\u{C}]", "", vec![]),
2000            ("[\u{D}]", "", vec![]),
2001            ("[\u{20}]", "", vec![]),
2002            ("[\u{85}]", "", vec![]),
2003            ("[\u{200E}]", "", vec![]),
2004            ("[\u{200F}]", "", vec![]),
2005            ("[\u{2028}]", "", vec![]),
2006            ("[\u{2029}]", "", vec![]),
2007            // whitespace significance:
2008            ("[^[^$]]", "\u{ffff}\u{ffff}", vec![]),
2009            ("[^[^ $]]", "\u{ffff}\u{ffff}", vec![]),
2010            ("[^[^ $ ]]", "\u{ffff}\u{ffff}", vec![]),
2011            ("[^[^a$]]", "aa\u{ffff}\u{ffff}", vec![]),
2012            ("[^[^a$ ]]", "aa\u{ffff}\u{ffff}", vec![]),
2013            ("[-]", "--", vec![]),
2014            ("[  -  ]", "--", vec![]),
2015            ("[  - -  ]", "--", vec![]),
2016            ("[ a-b -  ]", "ab--", vec![]),
2017            ("[ -a]", "--aa", vec![]),
2018            ("[a-]", "--aa", vec![]),
2019            ("[a- ]", "--aa", vec![]),
2020            ("[ :]", "::", vec![]),
2021            ("[ :L:]", "::LL", vec![]),
2022            // but not all "whitespace", only Pattern_White_Space:
2023            ("[\u{A0}]", "\u{A0}\u{A0}", vec![]), // non-breaking space
2024            // anchor
2025            ("[$]", "\u{ffff}\u{ffff}", vec![]),
2026            (r"[\$]", "$$", vec![]),
2027            ("[{$}]", "$$", vec![]),
2028            // set operations
2029            ("[[a-z]&[b-z]]", "bz", vec![]),
2030            ("[[a-z]-[b-z]]", "aa", vec![]),
2031            ("[[a-z][b-z]]", "az", vec![]),
2032            ("[[a-a][b-z]]", "az", vec![]),
2033            ("[[a-z{abc}]&[b-z{abc}{abx}]]", "bz", vec!["abc"]),
2034            ("[[{abx}a-z{abc}]&[b-z{abc}]]", "bz", vec!["abc"]),
2035            ("[[a-z{abx}]-[{abx}b-z{abc}]]", "aa", vec![]),
2036            ("[[a-z{abx}{abc}]-[{abx}b-z]]", "aa", vec!["abc"]),
2037            ("[[a-z{abc}][b-z{abx}]]", "az", vec!["abc", "abx"]),
2038            // strings
2039            ("[{this is a minus -}]", "", vec!["thisisaminus-"]),
2040            // associativity
2041            ("[[a-a][b-z] - [a-d][e-z]]", "ez", vec![]),
2042            ("[[a-a][b-z] - [a-d]&[e-z]]", "ez", vec![]),
2043            ("[[a-a][b-z] - [a-z][]]", "", vec![]),
2044            ("[[a-a][b-z] - [a-z]&[]]", "", vec![]),
2045            ("[[a-a][b-z] & [a-z]-[]]", "az", vec![]),
2046            ("[[a-a][b-z] & []-[a-z]]", "", vec![]),
2047            ("[[a-a][b-z] & [a-b][x-z]]", "abxz", vec![]),
2048            ("[[a-z]-[a-b]-[y-z]]", "cx", vec![]),
2049            // escape tests
2050            (r"[\x61-\x63]", "ac", vec![]),
2051            (r"[a-\x63]", "ac", vec![]),
2052            (r"[\x61-c]", "ac", vec![]),
2053            (r"[\u0061-\x63]", "ac", vec![]),
2054            (r"[\U00000061-\x63]", "ac", vec![]),
2055            (r"[\x{61}-\x63]", "ac", vec![]),
2056            (r"[\u{61}-\x63]", "ac", vec![]),
2057            (r"[\u{61}{hello\ world}]", "aa", vec!["hello world"]),
2058            (r"[{hello\ world}\u{61}]", "aa", vec!["hello world"]),
2059            (r"[{h\u{65}llo\ world}]", "", vec!["hello world"]),
2060            // complement tests
2061            (r"[^]", ALL_CHARS, vec![]),
2062            (r"[[^]-[^a-z]]", "az", vec![]),
2063            (r"[^{h\u{65}llo\ world}]", ALL_CHARS, vec![]),
2064            (
2065                r"[^[{h\u{65}llo\ world}]-[{hello\ world}]]",
2066                ALL_CHARS,
2067                vec![],
2068            ),
2069            (
2070                r"[^[\x00-\U0010FFFF]-[\u0100-\U0010FFFF]]",
2071                "\u{100}\u{10FFFF}",
2072                vec![],
2073            ),
2074            (r"[^[^a-z]]", "az", vec![]),
2075            (r"[^[^\^]]", "^^", vec![]),
2076            (r"[{\x{61 0062   063}}]", "", vec!["abc"]),
2077            (r"[\x{61 0062   063}]", "ac", vec![]),
2078            // binary properties
2079            (r"[:AHex:]", "09afAF", vec![]),
2080            (r"[:AHex=True:]", "09afAF", vec![]),
2081            (r"[:AHex=T:]", "09afAF", vec![]),
2082            (r"[:AHex=Yes:]", "09afAF", vec![]),
2083            (r"[:AHex=Y:]", "09afAF", vec![]),
2084            (r"[:^AHex≠True:]", "09afAF", vec![]),
2085            (r"[:AHex≠False:]", "09afAF", vec![]),
2086            (r"[[:^AHex≠False:]&[\x00-\x10]]", "\0\x10", vec![]),
2087            (r"\p{AHex}", "09afAF", vec![]),
2088            (r"\p{AHex=True}", "09afAF", vec![]),
2089            (r"\p{AHex=T}", "09afAF", vec![]),
2090            (r"\p{AHex=Yes}", "09afAF", vec![]),
2091            (r"\p{AHex=Y}", "09afAF", vec![]),
2092            (r"\P{AHex≠True}", "09afAF", vec![]),
2093            (r"\p{AHex≠False}", "09afAF", vec![]),
2094            // general category
2095            (r"[[:gc=lower-case-letter:]&[a-zA-Z]]", "az", vec![]),
2096            (r"[[:lower case letter:]&[a-zA-Z]]", "az", vec![]),
2097            // general category groups
2098            // equivalence between L and the union of all the L* categories
2099            (
2100                r"[[[:L:]-[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]][[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]-[:L:]]]",
2101                "",
2102                vec![],
2103            ),
2104            // script
2105            (r"[[:sc=latn:]&[a-zA-Z]]", "azAZ", vec![]),
2106            (r"[[:sc=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
2107            (r"[[:Latin:]&[a-zA-Z]]", "azAZ", vec![]),
2108            (r"[[:latn:]&[a-zA-Z]]", "azAZ", vec![]),
2109            // script extensions
2110            (r"[[:scx=latn:]&[a-zA-Z]]", "azAZ", vec![]),
2111            (r"[[:scx=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
2112            (r"[[:scx=Hira:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
2113            (r"[[:sc=Hira:]&[\u30FC]]", "", vec![]),
2114            (r"[[:scx=Kana:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
2115            (r"[[:sc=Kana:]&[\u30FC]]", "", vec![]),
2116            (r"[[:sc=Common:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
2117            // grapheme cluster break
2118            (
2119                r"\p{Grapheme_Cluster_Break=ZWJ}",
2120                "\u{200D}\u{200D}",
2121                vec![],
2122            ),
2123            // sentence break
2124            (
2125                r"\p{Sentence_Break=ATerm}",
2126                "\u{002E}\u{002E}\u{2024}\u{2024}\u{FE52}\u{FE52}\u{FF0E}\u{FF0E}",
2127                vec![],
2128            ),
2129            // word break
2130            (r"\p{Word_Break=Single_Quote}", "\u{0027}\u{0027}", vec![]),
2131            // more syntax edge cases from UTS35 directly
2132            (r"[\^a]", "^^aa", vec![]),
2133            (r"[{{}]", "{{", vec![]),
2134            (r"[{}}]", "}}", vec![""]),
2135            (r"[}]", "}}", vec![]),
2136            (r"[{$var}]", "", vec!["$var"]),
2137            (r"[{[a-z}]", "", vec!["[a-z"]),
2138            (r"[ { [ a - z } ]", "", vec!["[a-z"]),
2139            // TODO(#3556): Add more tests (specifically conformance tests if they exist)
2140        ];
2141        for (source, single, strings) in cases {
2142            let parsed = parse(source);
2143            if let Err(err) = parsed {
2144                panic!(
2145                    "{source} results in an error: {}",
2146                    err.fmt_with_source(source)
2147                );
2148            }
2149            let (set, consumed) = parsed.unwrap();
2150            assert_eq!(consumed, source.len());
2151            assert_set_equality(
2152                source,
2153                &set,
2154                range_iter_from_str(single),
2155                strings.into_iter(),
2156            );
2157        }
2158    }
2159
2160    #[test]
2161    fn test_error_messages_with_variables() {
2162        let mut map_char_char = VariableMap::default();
2163        map_char_char.insert_char("a".to_string(), 'a').unwrap();
2164        map_char_char.insert_char("var2".to_string(), 'z').unwrap();
2165
2166        let mut map_char_string = VariableMap::default();
2167        map_char_string.insert_char("a".to_string(), 'a').unwrap();
2168        map_char_string
2169            .insert_string("var2".to_string(), "abc".to_string())
2170            .unwrap();
2171
2172        let (set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
2173        let mut map_char_set = VariableMap::default();
2174        map_char_set.insert_char("a".to_string(), 'a').unwrap();
2175        map_char_set.insert_set("set".to_string(), set).unwrap();
2176
2177        let cases = [
2178            (&map_char_char, "[$$a]", r"[$$a← error: unexpected variable"),
2179            (
2180                &map_char_char,
2181                "[$ a]",
2182                r"[$ a← error: unexpected character 'a'",
2183            ),
2184            (&map_char_char, "$a", r"$a← error: unexpected variable"),
2185            (&map_char_char, "$", r"$← error: unexpected end of input"),
2186            (
2187                &map_char_string,
2188                "[$var2-$a]",
2189                r"[$var2-$a← error: unexpected variable",
2190            ),
2191            (
2192                &map_char_string,
2193                "[$a-$var2]",
2194                r"[$a-$var2← error: unexpected variable",
2195            ),
2196            (
2197                &map_char_set,
2198                "[$a-$set]",
2199                r"[$a-$set← error: unexpected variable",
2200            ),
2201            (
2202                &map_char_set,
2203                "[$set-$a]",
2204                r"[$set-$a← error: unexpected variable",
2205            ),
2206            (
2207                &map_char_set,
2208                "[$=]",
2209                "[$=← error: unexpected character '='",
2210            ),
2211        ];
2212        for (variable_map, source, expected_err) in cases {
2213            assert_is_error_and_message_eq(source, expected_err, variable_map);
2214        }
2215    }
2216
2217    #[test]
2218    fn test_error_messages() {
2219        let cases = [
2220            (r"[a-z[\]]", r"[a-z[\]]← error: unexpected end of input"),
2221            (r"", r"← error: unexpected end of input"),
2222            (r"[{]", r"[{]← error: unexpected end of input"),
2223            // we match ECMA-262 strictly, so case matters
2224            (
2225                r"[:general_category:]",
2226                r"[:general_category← error: unknown property",
2227            ),
2228            (r"[:ll=true:]", r"[:ll=true← error: unknown property"),
2229            (r"[:=", r"[:=← error: unexpected character '='"),
2230            // property names may not be empty
2231            (r"[::]", r"[::← error: unexpected character ':'"),
2232            (r"[:=hello:]", r"[:=← error: unexpected character '='"),
2233            // property values may not be empty
2234            (r"[:gc=:]", r"[:gc=:← error: unexpected character ':'"),
2235            (r"[\xag]", r"[\xag← error: unexpected character 'g'"),
2236            (r"[a-b-z]", r"[a-b-z← error: unexpected character 'z'"),
2237            // TODO(#3558): Might be better as "[a-\p← error: unexpected character 'p'"?
2238            (r"[a-\p{ll}]", r"[a-\← error: unexpected character '\\'"),
2239            (r"[a-&]", r"[a-&← error: unexpected character '&'"),
2240            (r"[a&b]", r"[a&← error: unexpected character '&'"),
2241            (r"[[set]&b]", r"[[set]&b← error: unexpected character 'b'"),
2242            (r"[[set]&]", r"[[set]&]← error: unexpected character ']'"),
2243            (r"[a-\x60]", r"[a-\x60← error: unexpected character '`'"),
2244            (r"[a-`]", r"[a-`← error: unexpected character '`'"),
2245            (r"[\x{6g}]", r"[\x{6g← error: unexpected character 'g'"),
2246            (r"[\x{g}]", r"[\x{g← error: unexpected character 'g'"),
2247            (r"[\x{}]", r"[\x{}← error: unexpected character '}'"),
2248            (
2249                r"[\x{dabeef}]",
2250                r"[\x{dabeef← error: invalid escape sequence",
2251            ),
2252            (
2253                r"[\x{10ffff0}]",
2254                r"[\x{10ffff0← error: unexpected character '0'",
2255            ),
2256            (
2257                r"[\x{11ffff}]",
2258                r"[\x{11ffff← error: invalid escape sequence",
2259            ),
2260            (
2261                r"[\x{10ffff 1 10ffff0}]",
2262                r"[\x{10ffff 1 10ffff0← error: unexpected character '0'",
2263            ),
2264            // > 1 byte in UTF-8 edge case
2265            (r"ä", r"ä← error: unexpected character 'ä'"),
2266            (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
2267            (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
2268            (
2269                r"[\xe5-\xe4]",
2270                r"[\xe5-\xe4← error: unexpected character 'ä'",
2271            ),
2272            (r"[\xe5-ä]", r"[\xe5-ä← error: unexpected character 'ä'"),
2273            // whitespace significance
2274            (r"[ ^]", r"[ ^← error: unexpected character '^'"),
2275            (r"[:]", r"[:]← error: unexpected character ']'"),
2276            (r"[:L]", r"[:L]← error: unexpected character ']'"),
2277            (r"\p {L}", r"\p ← error: unexpected character ' '"),
2278            // multi-escapes are not allowed in ranges
2279            (
2280                r"[\x{61 62}-d]",
2281                r"[\x{61 62}-d← error: unexpected character 'd'",
2282            ),
2283            (
2284                r"[\x{61 63}-\x{62 64}]",
2285                r"[\x{61 63}-\← error: unexpected character '\\'",
2286            ),
2287            // TODO(#3558): This is a bad error message.
2288            (r"[a-\x{62 64}]", r"[a-\← error: unexpected character '\\'"),
2289        ];
2290        let vm = Default::default();
2291        for (source, expected_err) in cases {
2292            assert_is_error_and_message_eq(source, expected_err, &vm);
2293        }
2294    }
2295
2296    #[test]
2297    fn test_consumed() {
2298        let cases = [
2299            (r"[a-z\]{[}]".len(), r"[a-z\]{[}][]"),
2300            (r"[a-z\]{[}]".len(), r"[a-z\]{[}] []"),
2301            (r"[a-z\]{[}]".len(), r"[a-z\]{]}] []"),
2302            (r"[a-z\]{{[}]".len(), r"[a-z\]{{]}] []"),
2303            (r"[a-z\]{[}]".len(), r"[a-z\]{]}]\p{L}"),
2304            (r"[a-z\]{[}]".len(), r"[a-z\]{]}]$var"),
2305        ];
2306
2307        let vm = Default::default();
2308        for (expected_consumed, source) in cases {
2309            let (_, consumed) = parse(source).unwrap();
2310            assert_eq!(expected_consumed, consumed);
2311            let (_, consumed) = parse_with_variables(source, &vm).unwrap();
2312            assert_eq!(expected_consumed, consumed);
2313        }
2314    }
2315}
icu_experimental/unicodeset_parse/parse.rs

icu_experimental/unicodeset_parse/
parse.rs