1use alloc::borrow::Cow;
6use alloc::collections::{BTreeMap, BTreeSet};
7use alloc::fmt::Display;
8use alloc::format;
9use alloc::string::{String, ToString};
10use alloc::vec::Vec;
11use core::{iter::Peekable, str::CharIndices};
12
13use icu_collections::{
14 codepointinvlist::{CodePointInversionList, CodePointInversionListBuilder},
15 codepointinvliststringlist::CodePointInversionListAndStringList,
16};
17use icu_properties::script::ScriptWithExtensions;
18use icu_properties::{
19 props::{
20 CanonicalCombiningClass, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
21 GraphemeClusterBreak, LineBreak, Script, SentenceBreak, WordBreak,
22 },
23 CodePointMapData,
24};
25use icu_properties::{
26 props::{PatternWhiteSpace, XidContinue, XidStart},
27 CodePointSetData,
28};
29use icu_properties::{provider::*, PropertyParser};
30use icu_provider::prelude::*;
31
32#[derive(Debug, Clone, Copy, PartialEq, Eq, displaydoc::Display)]
34#[non_exhaustive]
35pub enum ParseErrorKind {
36 #[displaydoc("An unexpected character was encountered")]
41 UnexpectedChar(char),
42 #[displaydoc("The property name or value is unknown")]
47 UnknownProperty,
48 UnknownVariable,
50 UnexpectedVariable,
52 Eof,
54 Internal,
56 #[displaydoc("The provided syntax is not supported by us.")]
61 Unimplemented,
62 InvalidEscape,
64}
65use zerovec::VarZeroVec;
66use ParseErrorKind as PEK;
67
68impl ParseErrorKind {
69 fn with_offset(self, offset: usize) -> ParseError {
70 ParseError {
71 offset: Some(offset),
72 kind: self,
73 }
74 }
75}
76
77impl From<ParseErrorKind> for ParseError {
78 fn from(kind: ParseErrorKind) -> Self {
79 ParseError { offset: None, kind }
80 }
81}
82
/// An error produced while parsing a UnicodeSet: a [`ParseErrorKind`] plus, when known, the
/// position in the source at which it was detected.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ParseError {
    // Byte index into the source (offsets come from a `CharIndices` iterator). `None` when the
    // error was created without position information.
    offset: Option<usize>,
    // What went wrong.
    kind: ParseErrorKind,
}
95
// Module-local `Result` alias whose error type defaults to `ParseError`.
type Result<T, E = ParseError> = core::result::Result<T, E>;
97
98impl ParseError {
99 pub fn fmt_with_source(&self, source: &str) -> impl Display {
131 let ParseError { offset, kind } = *self;
132
133 if kind == ParseErrorKind::Eof {
134 return format!("{source}← error: unexpected end of input");
135 }
136 let mut s = String::new();
137 if let Some(offset) = offset {
138 if offset < source.len() {
139 let mut exclusive_end = offset + 1;
149 for _ in 0..3 {
151 if source.is_char_boundary(exclusive_end) {
153 break;
154 }
155 exclusive_end += 1;
156 }
157
158 #[allow(clippy::indexing_slicing)]
160 s.push_str(&source[..exclusive_end]);
161 s.push_str("← ");
162 }
163 }
164 s.push_str("error: ");
165 match kind {
166 ParseErrorKind::UnexpectedChar(c) => {
167 s.push_str(&format!("unexpected character '{}'", c.escape_debug()));
168 }
169 ParseErrorKind::UnknownProperty => {
170 s.push_str("unknown property");
171 }
172 ParseErrorKind::UnknownVariable => {
173 s.push_str("unknown variable");
174 }
175 ParseErrorKind::UnexpectedVariable => {
176 s.push_str("unexpected variable");
177 }
178 ParseErrorKind::Eof => {
179 s.push_str("unexpected end of input");
180 }
181 ParseErrorKind::Internal => {
182 s.push_str("internal error");
183 }
184 ParseErrorKind::Unimplemented => {
185 s.push_str("unimplemented");
186 }
187 ParseErrorKind::InvalidEscape => {
188 s.push_str("invalid escape sequence");
189 }
190 }
191
192 s
193 }
194
195 pub fn kind(&self) -> ParseErrorKind {
197 self.kind
198 }
199
200 pub fn offset(&self) -> Option<usize> {
202 self.offset
203 }
204
205 fn or_with_offset(self, offset: usize) -> Self {
206 match self.offset {
207 Some(_) => self,
208 None => ParseError {
209 offset: Some(offset),
210 ..self
211 },
212 }
213 }
214}
215
/// The value a variable in a [`VariableMap`] can hold.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum VariableValue<'a> {
    /// A set of code points and strings.
    UnicodeSet(CodePointInversionListAndStringList<'a>),
    /// A single character.
    Char(char),
    /// A string. `VariableMap`'s string-accepting insert methods store strings that consist of
    /// exactly one character as [`VariableValue::Char`] instead.
    String(Cow<'a, str>),
}
229
/// A map from variable names to their [`VariableValue`]s, backed by a `BTreeMap`.
#[derive(Debug, Clone, Default)]
pub struct VariableMap<'a>(BTreeMap<String, VariableValue<'a>>);
233
234impl<'a> VariableMap<'a> {
235 pub fn new() -> Self {
237 Self::default()
238 }
239
240 pub fn remove(&mut self, key: &str) -> Option<VariableValue<'a>> {
243 self.0.remove(key)
244 }
245
246 pub fn get(&self, key: &str) -> Option<&VariableValue<'a>> {
248 self.0.get(key)
249 }
250
251 pub fn insert(&mut self, key: String, value: VariableValue<'a>) -> Result<(), &VariableValue> {
255 if self.0.contains_key(&key) {
257 #[allow(clippy::indexing_slicing)]
259 return Err(&self.0[&key]);
260 }
261
262 if let VariableValue::String(s) = &value {
263 let mut chars = s.chars();
264 if let (Some(c), None) = (chars.next(), chars.next()) {
265 self.0.insert(key, VariableValue::Char(c));
266 return Ok(());
267 };
268 }
269
270 self.0.insert(key, value);
271 Ok(())
272 }
273
274 pub fn insert_char(&mut self, key: String, c: char) -> Result<(), &VariableValue> {
278 if self.0.contains_key(&key) {
280 #[allow(clippy::indexing_slicing)]
282 return Err(&self.0[&key]);
283 }
284
285 self.0.insert(key, VariableValue::Char(c));
286 Ok(())
287 }
288
289 pub fn insert_string(&mut self, key: String, s: String) -> Result<(), &VariableValue> {
293 if self.0.contains_key(&key) {
295 #[allow(clippy::indexing_slicing)]
297 return Err(&self.0[&key]);
298 }
299
300 let mut chars = s.chars();
301 let val = match (chars.next(), chars.next()) {
302 (Some(c), None) => VariableValue::Char(c),
303 _ => VariableValue::String(Cow::Owned(s)),
304 };
305
306 self.0.insert(key, val);
307 Ok(())
308 }
309
310 pub fn insert_str(&mut self, key: String, s: &'a str) -> Result<(), &VariableValue> {
314 if self.0.contains_key(&key) {
316 #[allow(clippy::indexing_slicing)]
318 return Err(&self.0[&key]);
319 }
320
321 let mut chars = s.chars();
322 let val = match (chars.next(), chars.next()) {
323 (Some(c), None) => VariableValue::Char(c),
324 _ => VariableValue::String(Cow::Borrowed(s)),
325 };
326
327 self.0.insert(key, val);
328 Ok(())
329 }
330
331 pub fn insert_set(
335 &mut self,
336 key: String,
337 set: CodePointInversionListAndStringList<'a>,
338 ) -> Result<(), &VariableValue> {
339 if self.0.contains_key(&key) {
341 #[allow(clippy::indexing_slicing)]
343 return Err(&self.0[&key]);
344 }
345 self.0.insert(key, VariableValue::UnicodeSet(set));
346 Ok(())
347 }
348}
349
/// Returns whether `c` may begin a literal character inside a set, i.e. whether it is none of
/// the syntax characters `&`, `-`, `$`, `^`, `[`, `]` and `{`.
fn legal_char_start(c: char) -> bool {
    !matches!(c, '&' | '-' | '$' | '^' | '[' | ']' | '{')
}
356
/// Returns whether `c` may begin a character inside a `{...}` string, i.e. whether it is
/// anything other than the closing `}`.
fn legal_char_in_string_start(c: char) -> bool {
    !matches!(c, '}')
}
362
// Result of parsing one (possibly escaped) character.
#[derive(Debug)]
enum SingleOrMultiChar {
    // Exactly one character was parsed.
    Single(char),
    // The first character of a bracketed multi-code-point escape. The parser stops before the
    // remaining code points; callers read them with `parse_multi_escape_into_set` or
    // `parse_multi_escape_into_string`.
    Multi(char),
}
370
// A literal element of a set: either a string or a single/multi character.
#[derive(Debug)]
enum Literal {
    // A string literal (added to the builder's string set by the set parser).
    String(String),
    // A character, see `SingleOrMultiChar`.
    CharKind(SingleOrMultiChar),
}
379
// The tokens the bracketed-set state machine operates on, as produced by `parse_main_token`.
#[derive(Debug)]
enum MainToken<'data> {
    // A character or string literal (possibly obtained from a variable).
    Literal(Literal),
    // A complete nested set, property set, or set-valued variable.
    UnicodeSet(CodePointInversionListAndStringList<'data>),
    // A `$` that is not followed by a variable name.
    DollarSign,
    // The `&` operator.
    Ampersand,
    // A `-`, which is either a range/difference operator or a literal depending on state.
    Minus,
    // The `]` that ends the current set.
    ClosingBracket,
}
397
398impl<'data> MainToken<'data> {
399 fn from_variable_value(val: VariableValue<'data>) -> Self {
400 match val {
401 VariableValue::Char(c) => {
402 MainToken::Literal(Literal::CharKind(SingleOrMultiChar::Single(c)))
403 }
404 VariableValue::String(s) => {
405 MainToken::Literal(Literal::String(s.into_owned()))
407 }
408 VariableValue::UnicodeSet(set) => MainToken::UnicodeSet(set),
409 }
410 }
411}
412
// How the next set operand is combined with what has been accumulated so far.
#[derive(Debug, Clone, Copy)]
enum Operation {
    // Add the operand's elements (the default between adjacent elements).
    Union,
    // Remove the operand's elements (selected by `-` after a set).
    Difference,
    // Keep only elements also present in the operand (selected by `&` after a set).
    Intersection,
}
419
// Parser state for one (possibly nested) UnicodeSet. Nested sets get their own builder that
// shares the same character iterator.
struct UnicodeSetBuilder<'a, 'b, P: ?Sized> {
    // Code points collected so far.
    single_set: CodePointInversionListBuilder,
    // Strings collected so far.
    string_set: BTreeSet<String>,
    // Shared cursor over the source; yields (byte offset, char) pairs.
    iter: &'a mut Peekable<CharIndices<'b>>,
    // The complete source text; sliced by offset when decoding hex escapes.
    source: &'b str,
    // Whether the collected set must be complemented in `finalize`.
    inverted: bool,
    // Variables available to `$name` references.
    variable_map: &'a VariableMap<'a>,
    // Sets used to recognize the first and subsequent characters of variable names.
    xid_start: &'a CodePointInversionList<'a>,
    xid_continue: &'a CodePointInversionList<'a>,
    // Characters skipped by `skip_whitespace`.
    pat_ws: &'a CodePointInversionList<'a>,
    // Data provider used to load property data.
    property_provider: &'a P,
}
433
434impl<'a, 'b, P> UnicodeSetBuilder<'a, 'b, P>
435where
436 P: ?Sized
437 + DataProvider<PropertyBinaryAlphabeticV1>
438 + DataProvider<PropertyBinaryAsciiHexDigitV1>
439 + DataProvider<PropertyBinaryBidiControlV1>
440 + DataProvider<PropertyBinaryBidiMirroredV1>
441 + DataProvider<PropertyBinaryCasedV1>
442 + DataProvider<PropertyBinaryCaseIgnorableV1>
443 + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
444 + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
445 + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
446 + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
447 + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
448 + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
449 + DataProvider<PropertyBinaryDashV1>
450 + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
451 + DataProvider<PropertyBinaryDeprecatedV1>
452 + DataProvider<PropertyBinaryDiacriticV1>
453 + DataProvider<PropertyBinaryEmojiComponentV1>
454 + DataProvider<PropertyBinaryEmojiModifierBaseV1>
455 + DataProvider<PropertyBinaryEmojiModifierV1>
456 + DataProvider<PropertyBinaryEmojiPresentationV1>
457 + DataProvider<PropertyBinaryEmojiV1>
458 + DataProvider<PropertyBinaryExtendedPictographicV1>
459 + DataProvider<PropertyBinaryExtenderV1>
460 + DataProvider<PropertyBinaryGraphemeBaseV1>
461 + DataProvider<PropertyBinaryGraphemeExtendV1>
462 + DataProvider<PropertyBinaryHexDigitV1>
463 + DataProvider<PropertyBinaryIdContinueV1>
464 + DataProvider<PropertyBinaryIdeographicV1>
465 + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
466 + DataProvider<PropertyBinaryIdStartV1>
467 + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
468 + DataProvider<PropertyBinaryJoinControlV1>
469 + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
470 + DataProvider<PropertyBinaryLowercaseV1>
471 + DataProvider<PropertyBinaryMathV1>
472 + DataProvider<PropertyBinaryNoncharacterCodePointV1>
473 + DataProvider<PropertyBinaryPatternSyntaxV1>
474 + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
475 + DataProvider<PropertyBinaryQuotationMarkV1>
476 + DataProvider<PropertyBinaryRadicalV1>
477 + DataProvider<PropertyBinaryRegionalIndicatorV1>
478 + DataProvider<PropertyBinarySentenceTerminalV1>
479 + DataProvider<PropertyBinarySoftDottedV1>
480 + DataProvider<PropertyBinaryTerminalPunctuationV1>
481 + DataProvider<PropertyBinaryUnifiedIdeographV1>
482 + DataProvider<PropertyBinaryUppercaseV1>
483 + DataProvider<PropertyBinaryVariationSelectorV1>
484 + DataProvider<PropertyBinaryWhiteSpaceV1>
485 + DataProvider<PropertyBinaryXidContinueV1>
486 + DataProvider<PropertyBinaryXidStartV1>
487 + DataProvider<PropertyEnumCanonicalCombiningClassV1>
488 + DataProvider<PropertyEnumGeneralCategoryV1>
489 + DataProvider<PropertyEnumGraphemeClusterBreakV1>
490 + DataProvider<PropertyEnumLineBreakV1>
491 + DataProvider<PropertyEnumScriptV1>
492 + DataProvider<PropertyEnumSentenceBreakV1>
493 + DataProvider<PropertyEnumWordBreakV1>
494 + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
495 + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
496 + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
497 + DataProvider<PropertyNameParseLineBreakV1>
498 + DataProvider<PropertyNameParseScriptV1>
499 + DataProvider<PropertyNameParseSentenceBreakV1>
500 + DataProvider<PropertyNameParseWordBreakV1>
501 + DataProvider<PropertyScriptWithExtensionsV1>,
502{
503 fn new_internal(
504 iter: &'a mut Peekable<CharIndices<'b>>,
505 source: &'b str,
506 variable_map: &'a VariableMap<'a>,
507 xid_start: &'a CodePointInversionList<'a>,
508 xid_continue: &'a CodePointInversionList<'a>,
509 pat_ws: &'a CodePointInversionList<'a>,
510 provider: &'a P,
511 ) -> Self {
512 UnicodeSetBuilder {
513 single_set: CodePointInversionListBuilder::new(),
514 string_set: Default::default(),
515 iter,
516 source,
517 inverted: false,
518 variable_map,
519 xid_start,
520 xid_continue,
521 pat_ws,
522 property_provider: provider,
523 }
524 }
525
526 fn parse_unicode_set(&mut self) -> Result<()> {
528 match self.must_peek_char()? {
529 '\\' => self.parse_property_perl(),
530 '[' => {
531 self.iter.next();
532 if let Some(':') = self.peek_char() {
533 self.parse_property_posix()
534 } else {
535 self.parse_unicode_set_inner()
536 }
537 }
538 '$' => {
539 let (offset, v) = self.parse_variable()?;
541 match v {
542 Some(VariableValue::UnicodeSet(s)) => {
543 self.single_set.add_set(s.code_points());
544 self.string_set
545 .extend(s.strings().iter().map(ToString::to_string));
546 Ok(())
547 }
548 Some(_) => Err(PEK::UnexpectedVariable.with_offset(offset)),
549 None => Err(PEK::UnexpectedChar('$').with_offset(offset)),
550 }
551 }
552 c => self.error_here(PEK::UnexpectedChar(c)),
553 }
554 }
555
556 fn parse_unicode_set_inner(&mut self) -> Result<()> {
558 if self.must_peek_char()? == '^' {
560 self.iter.next();
561 self.inverted = true;
562 }
563 self.skip_whitespace();
565 if self.must_peek_char()? == '-' {
566 self.iter.next();
567 self.single_set.add_char('-');
568 }
569
570 #[derive(Debug, Clone, Copy)]
579 enum State {
580 Begin,
582 Char,
585 CharMinus,
588 AfterUnicodeSet,
591 AfterOp,
594 AfterDollar,
597 AfterMinus,
600 }
601 use State::*;
602
603 const DEFAULT_OP: Operation = Operation::Union;
604
605 let mut state = Begin;
606 let mut prev_char = None;
607 let mut operation = Operation::Union;
608
609 loop {
610 self.skip_whitespace();
611
612 let (immediate_offset, immediate_char) = self.must_peek()?;
614
615 let (tok_offset, from_var, tok) = self.parse_main_token()?;
616 use MainToken as MT;
621 use SingleOrMultiChar as SMC;
622 match (state, tok) {
623 (
625 Begin | Char | CharMinus | AfterUnicodeSet | AfterDollar | AfterMinus,
626 MT::ClosingBracket,
627 ) => {
628 if let Some(prev) = prev_char.take() {
629 self.single_set.add_char(prev);
630 }
631 if matches!(state, CharMinus) {
632 self.single_set.add_char('-');
633 }
634
635 return Ok(());
636 }
637 (AfterOp, MT::ClosingBracket) if matches!(operation, Operation::Difference) => {
640 self.single_set.add_char('-');
641 return Ok(());
642 }
643 (Begin, MT::Minus) => {
644 self.single_set.add_char('-');
645 state = AfterMinus;
646 }
647 (Begin | Char | AfterUnicodeSet | AfterOp, MT::UnicodeSet(set)) => {
649 if let Some(prev) = prev_char.take() {
650 self.single_set.add_char(prev);
651 }
652
653 self.process_chars(operation, set.code_points().clone());
654 self.process_strings(
655 operation,
656 set.strings().iter().map(ToString::to_string).collect(),
657 );
658
659 operation = DEFAULT_OP;
660 state = AfterUnicodeSet;
661 }
662 (
664 Begin | Char | AfterUnicodeSet,
665 MT::Literal(Literal::CharKind(SMC::Single(c))),
666 ) => {
667 if let Some(prev) = prev_char.take() {
668 self.single_set.add_char(prev);
669 }
670 prev_char = Some(c);
671 state = Char;
672 }
673 (
675 Begin | Char | AfterUnicodeSet,
676 MT::Literal(Literal::CharKind(SMC::Multi(first_c))),
677 ) => {
678 if let Some(prev) = prev_char.take() {
679 self.single_set.add_char(prev);
680 }
681 self.single_set.add_char(first_c);
682 self.parse_multi_escape_into_set()?;
683
684 state = Begin;
687 }
688 (Begin | Char | AfterUnicodeSet, MT::Literal(Literal::String(s))) => {
690 if let Some(prev) = prev_char.take() {
691 self.single_set.add_char(prev);
692 }
693
694 self.string_set.insert(s);
695 state = Begin;
696 }
697 (CharMinus, MT::Literal(Literal::CharKind(SMC::Single(c)))) => {
699 let start = prev_char.ok_or(PEK::Internal.with_offset(tok_offset))?;
700 let end = c;
701 if start > end {
702 return Err(PEK::UnexpectedChar(end).with_offset(tok_offset));
704 }
705
706 self.single_set.add_range(start..=end);
707 prev_char = None;
708 state = Begin;
709 }
710 (Char, MT::Minus) => {
712 state = CharMinus;
713 }
714 (AfterUnicodeSet, MT::Minus) => {
716 operation = Operation::Difference;
717 state = AfterOp;
718 }
719 (AfterUnicodeSet, MT::Ampersand) => {
721 operation = Operation::Intersection;
722 state = AfterOp;
723 }
724 (Begin | Char | AfterUnicodeSet, MT::DollarSign) => {
725 if let Some(prev) = prev_char.take() {
726 self.single_set.add_char(prev);
727 }
728 self.single_set.add_char('\u{FFFF}');
729 state = AfterDollar;
730 }
731 _ => {
732 if from_var {
736 return Err(PEK::UnexpectedVariable.with_offset(tok_offset));
740 }
741 return Err(PEK::UnexpectedChar(immediate_char).with_offset(immediate_offset));
742 }
743 }
744 }
745 }
746
747 fn parse_main_token(&mut self) -> Result<(usize, bool, MainToken<'a>)> {
748 let (initial_offset, first) = self.must_peek()?;
749 if first == ']' {
750 self.iter.next();
751 return Ok((initial_offset, false, MainToken::ClosingBracket));
752 }
753 let (_, second) = self.must_peek_double()?;
754 match (first, second) {
755 ('$', _) => {
757 let (offset, var_or_anchor) = self.parse_variable()?;
758 match var_or_anchor {
759 None => Ok((offset, false, MainToken::DollarSign)),
760 Some(v) => Ok((offset, true, MainToken::from_variable_value(v.clone()))),
761 }
762 }
763 ('{', _) => self
765 .parse_string()
766 .map(|(offset, l)| (offset, false, MainToken::Literal(l))),
767 ('\\', 'p' | 'P') | ('[', _) => {
769 let mut inner_builder = UnicodeSetBuilder::new_internal(
770 self.iter,
771 self.source,
772 self.variable_map,
773 self.xid_start,
774 self.xid_continue,
775 self.pat_ws,
776 self.property_provider,
777 );
778 inner_builder.parse_unicode_set()?;
779 let (single, string_set) = inner_builder.finalize();
780 let offset = self.must_peek_index()? - 1;
782 let mut strings = string_set.into_iter().collect::<Vec<_>>();
783 strings.sort();
784 let cpilasl = CodePointInversionListAndStringList::try_from(
785 single.build(),
786 VarZeroVec::from(&strings),
787 )
788 .map_err(|_| PEK::Internal.with_offset(offset))?;
789 Ok((offset, false, MainToken::UnicodeSet(cpilasl)))
790 }
791 (c, _) if legal_char_start(c) => self
794 .parse_char()
795 .map(|(offset, c)| (offset, false, MainToken::Literal(Literal::CharKind(c)))),
796 ('-', _) => {
797 self.iter.next();
798 Ok((initial_offset, false, MainToken::Minus))
799 }
800 ('&', _) => {
801 self.iter.next();
802 Ok((initial_offset, false, MainToken::Ampersand))
803 }
804 (c, _) => Err(PEK::UnexpectedChar(c).with_offset(initial_offset)),
805 }
806 }
807
808 fn parse_variable(&mut self) -> Result<(usize, Option<&'a VariableValue<'a>>)> {
812 self.consume('$')?;
813
814 let mut res = String::new();
815 let (mut var_offset, first_c) = self.must_peek()?;
816
817 if !self.xid_start.contains(first_c) {
818 return Ok((var_offset - 1, None));
820 }
821
822 res.push(first_c);
823 self.iter.next();
824 while let Some(&(offset, c)) = self.iter.peek() {
827 if !self.xid_continue.contains(c) {
828 break;
829 }
830 var_offset = offset;
832 self.iter.next();
833 res.push(c);
834 }
835
836 if let Some(v) = self.variable_map.0.get(&res) {
837 return Ok((var_offset, Some(v)));
838 }
839
840 Err(PEK::UnknownVariable.with_offset(var_offset))
841 }
842
843 fn parse_string(&mut self) -> Result<(usize, Literal)> {
845 self.consume('{')?;
846
847 let mut buffer = String::new();
848 let mut last_offset;
849
850 loop {
851 self.skip_whitespace();
852 last_offset = self.must_peek_index()?;
853 match self.must_peek_char()? {
854 '}' => {
855 self.iter.next();
856 break;
857 }
858 c if legal_char_in_string_start(c) => {
861 let (_, c) = self.parse_char()?;
863 match c {
864 SingleOrMultiChar::Single(c) => buffer.push(c),
865 SingleOrMultiChar::Multi(first) => {
866 buffer.push(first);
867 self.parse_multi_escape_into_string(&mut buffer)?;
868 }
869 }
870 }
871 c => return self.error_here(PEK::UnexpectedChar(c)),
872 }
873 }
874
875 let mut chars = buffer.chars();
876 let literal = match (chars.next(), chars.next()) {
877 (Some(c), None) => Literal::CharKind(SingleOrMultiChar::Single(c)),
878 _ => Literal::String(buffer),
879 };
880 Ok((last_offset, literal))
881 }
882
883 fn parse_multi_escape_into_set(&mut self) -> Result<()> {
886 let mut first = true;
894 loop {
895 let skipped = self.skip_whitespace();
896 match self.must_peek_char()? {
897 '}' => {
898 self.iter.next();
899 return Ok(());
900 }
901 initial_c => {
902 if skipped == 0 && !first {
903 return self.error_here(PEK::UnexpectedChar(initial_c));
905 }
906 first = false;
907
908 let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
909 self.single_set.add_char(c);
910 }
911 }
912 }
913 }
914
915 fn parse_multi_escape_into_string(&mut self, s: &mut String) -> Result<()> {
918 let mut first = true;
921 loop {
922 let skipped = self.skip_whitespace();
923 match self.must_peek_char()? {
924 '}' => {
925 self.iter.next();
926 return Ok(());
927 }
928 initial_c => {
929 if skipped == 0 && !first {
930 return self.error_here(PEK::UnexpectedChar(initial_c));
932 }
933 first = false;
934
935 let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
936 s.push(c);
937 }
938 }
939 }
940 }
941
942 fn parse_escaped_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
945 self.consume('\\')?;
946
947 let (offset, next_char) = self.must_next()?;
948
949 match next_char {
950 'u' | 'x' if self.peek_char() == Some('{') => {
951 self.iter.next();
953
954 self.skip_whitespace();
955 let (_, first_c) = self.parse_hex_digits_into_char(1, 6)?;
956 let skipped = self.skip_whitespace();
957
958 match self.must_peek()? {
959 (offset, '}') => {
960 self.iter.next();
961 Ok((offset, SingleOrMultiChar::Single(first_c)))
962 }
963 (offset, c) if c.is_ascii_hexdigit() && skipped > 0 => {
966 Ok((offset, SingleOrMultiChar::Multi(first_c)))
967 }
968 (_, c) => self.error_here(PEK::UnexpectedChar(c)),
969 }
970 }
971 'u' => {
972 self.parse_hex_digits_into_char(4, 4)
974 .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
975 }
976 'x' => {
977 self.parse_hex_digits_into_char(2, 2)
979 .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
980 }
981 'U' => {
982 self.consume('0')?;
984 self.consume('0')?;
985 self.parse_hex_digits_into_char(6, 6)
986 .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
987 }
988 'N' => {
989 Err(PEK::Unimplemented.with_offset(offset))
992 }
993 'a' => Ok((offset, SingleOrMultiChar::Single('\u{0007}'))),
994 'b' => Ok((offset, SingleOrMultiChar::Single('\u{0008}'))),
995 't' => Ok((offset, SingleOrMultiChar::Single('\u{0009}'))),
996 'n' => Ok((offset, SingleOrMultiChar::Single('\u{000A}'))),
997 'v' => Ok((offset, SingleOrMultiChar::Single('\u{000B}'))),
998 'f' => Ok((offset, SingleOrMultiChar::Single('\u{000C}'))),
999 'r' => Ok((offset, SingleOrMultiChar::Single('\u{000D}'))),
1000 _ => Ok((offset, SingleOrMultiChar::Single(next_char))),
1001 }
1002 }
1003
1004 fn parse_property_posix(&mut self) -> Result<()> {
1006 self.consume(':')?;
1007 if self.must_peek_char()? == '^' {
1008 self.inverted = true;
1009 self.iter.next();
1010 }
1011
1012 self.parse_property_inner(':')?;
1013
1014 self.consume(']')?;
1015
1016 Ok(())
1017 }
1018
1019 fn parse_property_perl(&mut self) -> Result<()> {
1021 self.consume('\\')?;
1022 match self.must_next()? {
1023 (_, 'p') => {}
1024 (_, 'P') => self.inverted = true,
1025 (offset, c) => return Err(PEK::UnexpectedChar(c).with_offset(offset)),
1026 }
1027 self.consume('{')?;
1028
1029 self.parse_property_inner('}')?;
1030
1031 Ok(())
1032 }
1033
1034 fn parse_property_inner(&mut self, end: char) -> Result<()> {
1035 let property_offset;
1041
1042 let mut key_buffer = String::new();
1043 let mut value_buffer = String::new();
1044
1045 enum State {
1046 Begin,
1048 PropertyName,
1050 PropertyValueBegin,
1052 PropertyValue,
1054 }
1055 use State::*;
1056
1057 let mut state = Begin;
1058 let mut equality = true;
1060
1061 loop {
1062 self.skip_whitespace();
1063 match (state, self.must_peek_char()?) {
1064 (PropertyName | PropertyValue, c) if c == end => {
1066 property_offset = self.must_peek_index()? - 1;
1068 self.iter.next();
1069 break;
1070 }
1071 (Begin | PropertyName, c) if c.is_ascii_alphanumeric() || c == '_' => {
1075 key_buffer.push(c);
1076 self.iter.next();
1077 state = PropertyName;
1078 }
1079 (PropertyName, c @ ('=' | '≠')) => {
1081 equality = c == '=';
1082 self.iter.next();
1083 state = PropertyValueBegin;
1084 }
1085 (PropertyValue | PropertyValueBegin, c) if c != end => {
1087 value_buffer.push(c);
1088 self.iter.next();
1089 state = PropertyValue;
1090 }
1091 (_, c) => return self.error_here(PEK::UnexpectedChar(c)),
1092 }
1093 }
1094
1095 if !equality {
1096 self.inverted = !self.inverted;
1097 }
1098
1099 let inverted = self
1100 .load_property_codepoints(&key_buffer, &value_buffer)
1101 .map_err(|e| e.or_with_offset(property_offset))?;
1103 if inverted {
1104 self.inverted = !self.inverted;
1105 }
1106
1107 Ok(())
1108 }
1109
1110 fn load_property_codepoints(&mut self, key: &str, value: &str) -> Result<bool> {
1112 let mut inverted = false;
1124
1125 let mut try_gc = Err(PEK::UnknownProperty.into());
1127 let mut try_sc = Err(PEK::UnknownProperty.into());
1129 let mut try_scx = Err(PEK::UnknownProperty.into());
1131 let mut try_gcb = Err(PEK::UnknownProperty.into());
1133 let mut try_lb = Err(PEK::UnknownProperty.into());
1135 let mut try_sb = Err(PEK::UnknownProperty.into());
1137 let mut try_wb = Err(PEK::UnknownProperty.into());
1139 let mut try_binary = Err(PEK::UnknownProperty.into());
1141 let mut try_ccc: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
1143 let mut try_block: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
1145
1146 if !value.is_empty() {
1147 match key.as_bytes() {
1153 GeneralCategory::NAME | GeneralCategory::SHORT_NAME => try_gc = Ok(value),
1154 GraphemeClusterBreak::NAME | GraphemeClusterBreak::SHORT_NAME => {
1155 try_gcb = Ok(value)
1156 }
1157 LineBreak::NAME | LineBreak::SHORT_NAME => try_lb = Ok(value),
1158 Script::NAME | Script::SHORT_NAME => try_sc = Ok(value),
1159 SentenceBreak::NAME | SentenceBreak::SHORT_NAME => try_sb = Ok(value),
1160 WordBreak::NAME | WordBreak::SHORT_NAME => try_wb = Ok(value),
1161 CanonicalCombiningClass::NAME | CanonicalCombiningClass::SHORT_NAME => {
1162 try_ccc = Ok(value)
1163 }
1164 b"Script_Extensions" | b"scx" => try_scx = Ok(value),
1165 b"Block" | b"blk" => try_block = Ok(value),
1166 _ => {
1167 let normalized_value = value.to_ascii_lowercase();
1168 let truthy = matches!(normalized_value.as_str(), "true" | "t" | "yes" | "y");
1169 let falsy = matches!(normalized_value.as_str(), "false" | "f" | "no" | "n");
1170 if truthy == falsy {
1172 return Err(PEK::UnknownProperty.into());
1173 }
1174 inverted = falsy;
1178 try_binary = Ok(key);
1179 }
1180 }
1181 } else {
1182 try_gc = Ok(key);
1185 try_sc = Ok(key);
1186 try_binary = Ok(key);
1187 }
1188
1189 try_gc
1190 .and_then(|value| self.try_load_general_category_set(value))
1191 .or_else(|_| try_sc.and_then(|value| self.try_load_script_set(value)))
1192 .or_else(|_| try_scx.and_then(|value| self.try_load_script_extensions_set(value)))
1193 .or_else(|_| try_binary.and_then(|value| self.try_load_ecma262_binary_set(value)))
1194 .or_else(|_| try_gcb.and_then(|value| self.try_load_grapheme_cluster_break_set(value)))
1195 .or_else(|_| try_lb.and_then(|value| self.try_load_line_break_set(value)))
1196 .or_else(|_| try_sb.and_then(|value| self.try_load_sentence_break_set(value)))
1197 .or_else(|_| try_wb.and_then(|value| self.try_load_word_break_set(value)))
1198 .or_else(|_| try_ccc.and_then(|value| self.try_load_ccc_set(value)))
1199 .or_else(|_| try_block.and_then(|value| self.try_load_block_set(value)))?;
1200 Ok(inverted)
1201 }
1202
1203 fn finalize(mut self) -> (CodePointInversionListBuilder, BTreeSet<String>) {
1204 if self.inverted {
1205 #[cfg(feature = "log")]
1207 if !self.string_set.is_empty() {
1208 log::info!(
1209 "Inverting a unicode set with strings. This removes all strings entirely."
1210 );
1211 }
1212 self.string_set.clear();
1213 self.single_set.complement();
1214 }
1215
1216 (self.single_set, self.string_set)
1217 }
1218
1219 fn parse_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
1222 let (offset, c) = self.must_peek()?;
1223 match c {
1224 '\\' => self.parse_escaped_char(),
1225 _ => {
1226 self.iter.next();
1227 Ok((offset, SingleOrMultiChar::Single(c)))
1228 }
1229 }
1230 }
1231
1232 fn parse_hex_digits_into_char(&mut self, min: usize, max: usize) -> Result<(usize, char)> {
1235 let first_offset = self.must_peek_index()?;
1236 let end_offset = self.validate_hex_digits(min, max)?;
1237
1238 #[allow(clippy::indexing_slicing)]
1241 let hex_source = &self.source[first_offset..=end_offset];
1242 let num = u32::from_str_radix(hex_source, 16).map_err(|_| PEK::Internal)?;
1243 char::try_from(num)
1244 .map(|c| (end_offset, c))
1245 .map_err(|_| PEK::InvalidEscape.with_offset(end_offset))
1246 }
1247
1248 fn validate_hex_digits(&mut self, min: usize, max: usize) -> Result<usize> {
1250 let mut last_offset = 0;
1251 for count in 0..max {
1252 let (offset, c) = self.must_peek()?;
1253 if !c.is_ascii_hexdigit() {
1254 if count < min {
1255 return Err(PEK::UnexpectedChar(c).with_offset(offset));
1256 } else {
1257 break;
1258 }
1259 }
1260 self.iter.next();
1261 last_offset = offset;
1262 }
1263 Ok(last_offset)
1264 }
1265
1266 fn skip_whitespace(&mut self) -> usize {
1268 let mut num = 0;
1269 while let Some(c) = self.peek_char() {
1270 if !self.pat_ws.contains(c) {
1271 break;
1272 }
1273 self.iter.next();
1274 num += 1;
1275 }
1276 num
1277 }
1278
1279 fn consume(&mut self, expected: char) -> Result<()> {
1280 match self.must_next()? {
1281 (offset, c) if c != expected => Err(PEK::UnexpectedChar(c).with_offset(offset)),
1282 _ => Ok(()),
1283 }
1284 }
1285
1286 fn must_next(&mut self) -> Result<(usize, char)> {
1288 self.iter.next().ok_or(PEK::Eof.into())
1289 }
1290
1291 fn must_peek(&mut self) -> Result<(usize, char)> {
1293 self.iter.peek().copied().ok_or(PEK::Eof.into())
1294 }
1295
1296 fn must_peek_double(&mut self) -> Result<(usize, char)> {
1298 let mut copy = self.iter.clone();
1299 copy.next();
1300 copy.next().ok_or(PEK::Eof.into())
1301 }
1302
1303 fn must_peek_char(&mut self) -> Result<char> {
1305 self.must_peek().map(|(_, c)| c)
1306 }
1307
1308 fn must_peek_index(&mut self) -> Result<usize> {
1310 self.must_peek().map(|(idx, _)| idx)
1311 }
1312
1313 fn peek_char(&mut self) -> Option<char> {
1314 self.iter.peek().map(|&(_, c)| c)
1315 }
1316
1317 #[inline]
1319 fn error_here<T>(&mut self, kind: ParseErrorKind) -> Result<T> {
1320 match self.iter.peek() {
1321 None => Err(kind.into()),
1322 Some(&(offset, _)) => Err(kind.with_offset(offset)),
1323 }
1324 }
1325
1326 fn process_strings(&mut self, op: Operation, other_strings: BTreeSet<String>) {
1327 match op {
1328 Operation::Union => self.string_set.extend(other_strings),
1329 Operation::Difference => {
1330 self.string_set = self
1331 .string_set
1332 .difference(&other_strings)
1333 .cloned()
1334 .collect()
1335 }
1336 Operation::Intersection => {
1337 self.string_set = self
1338 .string_set
1339 .intersection(&other_strings)
1340 .cloned()
1341 .collect()
1342 }
1343 }
1344 }
1345
1346 fn process_chars(&mut self, op: Operation, other_chars: CodePointInversionList) {
1347 match op {
1348 Operation::Union => self.single_set.add_set(&other_chars),
1349 Operation::Difference => self.single_set.remove_set(&other_chars),
1350 Operation::Intersection => self.single_set.retain_set(&other_chars),
1351 }
1352 }
1353
1354 fn try_load_general_category_set(&mut self, name: &str) -> Result<()> {
1355 let name_map =
1357 PropertyParser::<GeneralCategoryGroup>::try_new_unstable(self.property_provider)
1358 .map_err(|_| PEK::Internal)?;
1359 let gc_value = name_map
1360 .as_borrowed()
1361 .get_loose(name)
1362 .ok_or(PEK::UnknownProperty)?;
1363 let set = CodePointMapData::<GeneralCategory>::try_new_unstable(self.property_provider)
1365 .map_err(|_| PEK::Internal)?
1366 .as_borrowed()
1367 .get_set_for_value_group(gc_value);
1368 self.single_set.add_set(&set.to_code_point_inversion_list());
1369 Ok(())
1370 }
1371
1372 fn try_get_script(&self, name: &str) -> Result<Script> {
1373 let name_map = PropertyParser::<Script>::try_new_unstable(self.property_provider)
1375 .map_err(|_| PEK::Internal)?;
1376 name_map
1377 .as_borrowed()
1378 .get_loose(name)
1379 .ok_or(PEK::UnknownProperty.into())
1380 }
1381
1382 fn try_load_script_set(&mut self, name: &str) -> Result<()> {
1383 let sc_value = self.try_get_script(name)?;
1384 let property_map = CodePointMapData::<Script>::try_new_unstable(self.property_provider)
1386 .map_err(|_| PEK::Internal)?;
1387 let set = property_map.as_borrowed().get_set_for_value(sc_value);
1388 self.single_set.add_set(&set.to_code_point_inversion_list());
1389 Ok(())
1390 }
1391
1392 fn try_load_script_extensions_set(&mut self, name: &str) -> Result<()> {
1393 let scx = ScriptWithExtensions::try_new_unstable(self.property_provider)
1395 .map_err(|_| PEK::Internal)?;
1396 let sc_value = self.try_get_script(name)?;
1397 let set = scx.as_borrowed().get_script_extensions_set(sc_value);
1398 self.single_set.add_set(&set);
1399 Ok(())
1400 }
1401
1402 fn try_load_ecma262_binary_set(&mut self, name: &str) -> Result<()> {
1403 let set =
1404 CodePointSetData::try_new_for_ecma262_unstable(self.property_provider, name.as_bytes())
1405 .ok_or(PEK::UnknownProperty)?
1406 .map_err(|_data_error| PEK::Internal)?;
1407 self.single_set.add_set(&set.to_code_point_inversion_list());
1408 Ok(())
1409 }
1410
1411 fn try_load_grapheme_cluster_break_set(&mut self, name: &str) -> Result<()> {
1412 let parser =
1413 PropertyParser::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
1414 .map_err(|_| PEK::Internal)?;
1415 let gcb_value = parser
1416 .as_borrowed()
1417 .get_loose(name)
1418 .ok_or(PEK::UnknownProperty)?;
1419 let property_map =
1421 CodePointMapData::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
1422 .map_err(|_| PEK::Internal)?;
1423 let set = property_map.as_borrowed().get_set_for_value(gcb_value);
1424 self.single_set.add_set(&set.to_code_point_inversion_list());
1425 Ok(())
1426 }
1427
1428 fn try_load_line_break_set(&mut self, name: &str) -> Result<()> {
1429 let parser = PropertyParser::<LineBreak>::try_new_unstable(self.property_provider)
1430 .map_err(|_| PEK::Internal)?;
1431 let lb_value = parser
1432 .as_borrowed()
1433 .get_loose(name)
1434 .ok_or(PEK::UnknownProperty)?;
1435 let property_map = CodePointMapData::<LineBreak>::try_new_unstable(self.property_provider)
1437 .map_err(|_| PEK::Internal)?;
1438 let set = property_map.as_borrowed().get_set_for_value(lb_value);
1439 self.single_set.add_set(&set.to_code_point_inversion_list());
1440 Ok(())
1441 }
1442
1443 fn try_load_sentence_break_set(&mut self, name: &str) -> Result<()> {
1444 let parser = PropertyParser::<SentenceBreak>::try_new_unstable(self.property_provider)
1445 .map_err(|_| PEK::Internal)?;
1446 let sb_value = parser
1447 .as_borrowed()
1448 .get_loose(name)
1449 .ok_or(PEK::UnknownProperty)?;
1450 let property_map =
1452 CodePointMapData::<SentenceBreak>::try_new_unstable(self.property_provider)
1453 .map_err(|_| PEK::Internal)?;
1454 let set = property_map.as_borrowed().get_set_for_value(sb_value);
1455 self.single_set.add_set(&set.to_code_point_inversion_list());
1456 Ok(())
1457 }
1458
1459 fn try_load_word_break_set(&mut self, name: &str) -> Result<()> {
1460 let parser = PropertyParser::<WordBreak>::try_new_unstable(self.property_provider)
1461 .map_err(|_| PEK::Internal)?;
1462 let wb_value = parser
1463 .as_borrowed()
1464 .get_loose(name)
1465 .ok_or(PEK::UnknownProperty)?;
1466 let property_map = CodePointMapData::<WordBreak>::try_new_unstable(self.property_provider)
1468 .map_err(|_| PEK::Internal)?;
1469 let set = property_map.as_borrowed().get_set_for_value(wb_value);
1470 self.single_set.add_set(&set.to_code_point_inversion_list());
1471 Ok(())
1472 }
1473
1474 fn try_load_ccc_set(&mut self, name: &str) -> Result<()> {
1475 let parser =
1476 PropertyParser::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
1477 .map_err(|_| PEK::Internal)?;
1478 let value = parser
1479 .as_borrowed()
1480 .get_loose(name)
1481 .or_else(|| {
1483 name.parse()
1484 .ok()
1485 .map(CanonicalCombiningClass::from_icu4c_value)
1486 })
1487 .ok_or(PEK::UnknownProperty)?;
1488 let property_map =
1490 CodePointMapData::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
1491 .map_err(|_| PEK::Internal)?;
1492 let set = property_map.as_borrowed().get_set_for_value(value);
1493 self.single_set.add_set(&set.to_code_point_inversion_list());
1494 Ok(())
1495 }
1496
1497 fn try_load_block_set(&mut self, name: &str) -> Result<()> {
1498 self.single_set
1500 .add_range(match name.to_ascii_lowercase().as_str() {
1501 "arabic" => '\u{0600}'..'\u{06FF}',
1502 "thaana" => '\u{0780}'..'\u{07BF}',
1503 _ => {
1504 #[cfg(feature = "log")]
1505 log::warn!("Skipping :block={name}:");
1506 return Err(PEK::Unimplemented.into());
1507 }
1508 });
1509 Ok(())
1510 }
1511}
1512
/// Parses the UnicodeSet pattern at the start of `source` using compiled property data.
///
/// On success returns the parsed set together with the number of bytes of `source` that
/// were consumed.
#[cfg(feature = "compiled_data")]
pub fn parse(source: &str) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    let baked = icu_properties::provider::Baked;
    parse_unstable(source, &baked)
}
1598
/// Parses the UnicodeSet pattern at the start of `source` using compiled property data,
/// resolving `$name` references through `variable_map`.
///
/// On success returns the parsed set together with the number of bytes of `source` that
/// were consumed.
#[cfg(feature = "compiled_data")]
pub fn parse_with_variables(
    source: &str,
    variable_map: &VariableMap<'_>,
) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    let baked = icu_properties::provider::Baked;
    parse_unstable_with_variables(source, variable_map, &baked)
}
1632
1633#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, parse_with_variables)]
1634pub fn parse_unstable_with_variables<P>(
1635 source: &str,
1636 variable_map: &VariableMap<'_>,
1637 provider: &P,
1638) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1639where
1640 P: ?Sized
1641 + DataProvider<PropertyBinaryAlphabeticV1>
1642 + DataProvider<PropertyBinaryAsciiHexDigitV1>
1643 + DataProvider<PropertyBinaryBidiControlV1>
1644 + DataProvider<PropertyBinaryBidiMirroredV1>
1645 + DataProvider<PropertyBinaryCasedV1>
1646 + DataProvider<PropertyBinaryCaseIgnorableV1>
1647 + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
1648 + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
1649 + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
1650 + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
1651 + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
1652 + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
1653 + DataProvider<PropertyBinaryDashV1>
1654 + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
1655 + DataProvider<PropertyBinaryDeprecatedV1>
1656 + DataProvider<PropertyBinaryDiacriticV1>
1657 + DataProvider<PropertyBinaryEmojiComponentV1>
1658 + DataProvider<PropertyBinaryEmojiModifierBaseV1>
1659 + DataProvider<PropertyBinaryEmojiModifierV1>
1660 + DataProvider<PropertyBinaryEmojiPresentationV1>
1661 + DataProvider<PropertyBinaryEmojiV1>
1662 + DataProvider<PropertyBinaryExtendedPictographicV1>
1663 + DataProvider<PropertyBinaryExtenderV1>
1664 + DataProvider<PropertyBinaryGraphemeBaseV1>
1665 + DataProvider<PropertyBinaryGraphemeExtendV1>
1666 + DataProvider<PropertyBinaryHexDigitV1>
1667 + DataProvider<PropertyBinaryIdContinueV1>
1668 + DataProvider<PropertyBinaryIdeographicV1>
1669 + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
1670 + DataProvider<PropertyBinaryIdStartV1>
1671 + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
1672 + DataProvider<PropertyBinaryJoinControlV1>
1673 + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
1674 + DataProvider<PropertyBinaryLowercaseV1>
1675 + DataProvider<PropertyBinaryMathV1>
1676 + DataProvider<PropertyBinaryNoncharacterCodePointV1>
1677 + DataProvider<PropertyBinaryPatternSyntaxV1>
1678 + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
1679 + DataProvider<PropertyBinaryQuotationMarkV1>
1680 + DataProvider<PropertyBinaryRadicalV1>
1681 + DataProvider<PropertyBinaryRegionalIndicatorV1>
1682 + DataProvider<PropertyBinarySentenceTerminalV1>
1683 + DataProvider<PropertyBinarySoftDottedV1>
1684 + DataProvider<PropertyBinaryTerminalPunctuationV1>
1685 + DataProvider<PropertyBinaryUnifiedIdeographV1>
1686 + DataProvider<PropertyBinaryUppercaseV1>
1687 + DataProvider<PropertyBinaryVariationSelectorV1>
1688 + DataProvider<PropertyBinaryWhiteSpaceV1>
1689 + DataProvider<PropertyBinaryXidContinueV1>
1690 + DataProvider<PropertyBinaryXidStartV1>
1691 + DataProvider<PropertyEnumCanonicalCombiningClassV1>
1692 + DataProvider<PropertyEnumGeneralCategoryV1>
1693 + DataProvider<PropertyEnumGraphemeClusterBreakV1>
1694 + DataProvider<PropertyEnumLineBreakV1>
1695 + DataProvider<PropertyEnumScriptV1>
1696 + DataProvider<PropertyEnumSentenceBreakV1>
1697 + DataProvider<PropertyEnumWordBreakV1>
1698 + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
1699 + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
1700 + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
1701 + DataProvider<PropertyNameParseLineBreakV1>
1702 + DataProvider<PropertyNameParseScriptV1>
1703 + DataProvider<PropertyNameParseSentenceBreakV1>
1704 + DataProvider<PropertyNameParseWordBreakV1>
1705 + DataProvider<PropertyScriptWithExtensionsV1>,
1706{
1707 let mut iter = source.char_indices().peekable();
1711
1712 let xid_start =
1713 CodePointSetData::try_new_unstable::<XidStart>(provider).map_err(|_| PEK::Internal)?;
1714 let xid_start_list = xid_start.to_code_point_inversion_list();
1715 let xid_continue =
1716 CodePointSetData::try_new_unstable::<XidContinue>(provider).map_err(|_| PEK::Internal)?;
1717 let xid_continue_list = xid_continue.to_code_point_inversion_list();
1718
1719 let pat_ws = CodePointSetData::try_new_unstable::<PatternWhiteSpace>(provider)
1720 .map_err(|_| PEK::Internal)?;
1721 let pat_ws_list = pat_ws.to_code_point_inversion_list();
1722
1723 let mut builder = UnicodeSetBuilder::new_internal(
1724 &mut iter,
1725 source,
1726 variable_map,
1727 &xid_start_list,
1728 &xid_continue_list,
1729 &pat_ws_list,
1730 provider,
1731 );
1732
1733 builder.parse_unicode_set()?;
1734 let (single, string_set) = builder.finalize();
1735 let built_single = single.build();
1736
1737 let mut strings = string_set.into_iter().collect::<Vec<_>>();
1738 strings.sort();
1739 let zerovec = (&strings).into();
1740
1741 let cpinvlistandstrlist = CodePointInversionListAndStringList::try_from(built_single, zerovec)
1742 .map_err(|_| PEK::Internal)?;
1743
1744 let parsed_bytes = match iter.peek().copied() {
1745 None => source.len(),
1746 Some((offset, _)) => offset,
1747 };
1748
1749 Ok((cpinvlistandstrlist, parsed_bytes))
1750}
1751
1752#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, parse)]
1753pub fn parse_unstable<P>(
1754 source: &str,
1755 provider: &P,
1756) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1757where
1758 P: ?Sized
1759 + DataProvider<PropertyBinaryAlphabeticV1>
1760 + DataProvider<PropertyBinaryAsciiHexDigitV1>
1761 + DataProvider<PropertyBinaryBidiControlV1>
1762 + DataProvider<PropertyBinaryBidiMirroredV1>
1763 + DataProvider<PropertyBinaryCasedV1>
1764 + DataProvider<PropertyBinaryCaseIgnorableV1>
1765 + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
1766 + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
1767 + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
1768 + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
1769 + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
1770 + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
1771 + DataProvider<PropertyBinaryDashV1>
1772 + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
1773 + DataProvider<PropertyBinaryDeprecatedV1>
1774 + DataProvider<PropertyBinaryDiacriticV1>
1775 + DataProvider<PropertyBinaryEmojiComponentV1>
1776 + DataProvider<PropertyBinaryEmojiModifierBaseV1>
1777 + DataProvider<PropertyBinaryEmojiModifierV1>
1778 + DataProvider<PropertyBinaryEmojiPresentationV1>
1779 + DataProvider<PropertyBinaryEmojiV1>
1780 + DataProvider<PropertyBinaryExtendedPictographicV1>
1781 + DataProvider<PropertyBinaryExtenderV1>
1782 + DataProvider<PropertyBinaryGraphemeBaseV1>
1783 + DataProvider<PropertyBinaryGraphemeExtendV1>
1784 + DataProvider<PropertyBinaryHexDigitV1>
1785 + DataProvider<PropertyBinaryIdContinueV1>
1786 + DataProvider<PropertyBinaryIdeographicV1>
1787 + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
1788 + DataProvider<PropertyBinaryIdStartV1>
1789 + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
1790 + DataProvider<PropertyBinaryJoinControlV1>
1791 + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
1792 + DataProvider<PropertyBinaryLowercaseV1>
1793 + DataProvider<PropertyBinaryMathV1>
1794 + DataProvider<PropertyBinaryNoncharacterCodePointV1>
1795 + DataProvider<PropertyBinaryPatternSyntaxV1>
1796 + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
1797 + DataProvider<PropertyBinaryQuotationMarkV1>
1798 + DataProvider<PropertyBinaryRadicalV1>
1799 + DataProvider<PropertyBinaryRegionalIndicatorV1>
1800 + DataProvider<PropertyBinarySentenceTerminalV1>
1801 + DataProvider<PropertyBinarySoftDottedV1>
1802 + DataProvider<PropertyBinaryTerminalPunctuationV1>
1803 + DataProvider<PropertyBinaryUnifiedIdeographV1>
1804 + DataProvider<PropertyBinaryUppercaseV1>
1805 + DataProvider<PropertyBinaryVariationSelectorV1>
1806 + DataProvider<PropertyBinaryWhiteSpaceV1>
1807 + DataProvider<PropertyBinaryXidContinueV1>
1808 + DataProvider<PropertyBinaryXidStartV1>
1809 + DataProvider<PropertyEnumCanonicalCombiningClassV1>
1810 + DataProvider<PropertyEnumGeneralCategoryV1>
1811 + DataProvider<PropertyEnumGraphemeClusterBreakV1>
1812 + DataProvider<PropertyEnumLineBreakV1>
1813 + DataProvider<PropertyEnumScriptV1>
1814 + DataProvider<PropertyEnumSentenceBreakV1>
1815 + DataProvider<PropertyEnumWordBreakV1>
1816 + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
1817 + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
1818 + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
1819 + DataProvider<PropertyNameParseLineBreakV1>
1820 + DataProvider<PropertyNameParseScriptV1>
1821 + DataProvider<PropertyNameParseSentenceBreakV1>
1822 + DataProvider<PropertyNameParseWordBreakV1>
1823 + DataProvider<PropertyScriptWithExtensionsV1>,
1824{
1825 let dummy = Default::default();
1826 parse_unstable_with_variables(source, &dummy, provider)
1827}
1828
1829#[cfg(test)]
1830mod tests {
1831 use core::ops::RangeInclusive;
1832 use std::collections::HashSet;
1833
1834 use super::*;
1835
/// Test helper: reads `s` as consecutive pairs of code points and yields one inclusive range
/// per pair, e.g. `"azAZ"` becomes `'a'..='z'` and `'A'..='Z'` (as `u32` values).
///
/// In debug builds an odd number of code points trips the assertion; otherwise the unpaired
/// trailing code point is ignored.
fn range_iter_from_str(s: &str) -> impl Iterator<Item = RangeInclusive<u32>> {
    let code_points: Vec<u32> = s.chars().map(u32::from).collect();
    debug_assert_eq!(
        code_points.len() % 2,
        0,
        "string \"{}\" does not contain an even number of code points",
        s.escape_debug()
    );

    let ranges: Vec<RangeInclusive<u32>> = code_points
        .chunks_exact(2)
        .map(|pair| pair[0]..=pair[1])
        .collect();

    ranges.into_iter()
}
1859
1860 fn assert_set_equality<'a>(
1861 source: &str,
1862 cpinvlistandstrlist: &CodePointInversionListAndStringList,
1863 single: impl Iterator<Item = RangeInclusive<u32>>,
1864 strings: impl Iterator<Item = &'a str>,
1865 ) {
1866 let expected_ranges: HashSet<_> = single.collect();
1867 let actual_ranges: HashSet<_> = cpinvlistandstrlist.code_points().iter_ranges().collect();
1868 assert_eq!(
1869 actual_ranges,
1870 expected_ranges,
1871 "got unexpected ranges {:?}, expected {:?} for parsed set \"{}\"",
1872 actual_ranges,
1873 expected_ranges,
1874 source.escape_debug()
1875 );
1876 let mut expected_size = cpinvlistandstrlist.code_points().size();
1877 for s in strings {
1878 expected_size += 1;
1879 assert!(
1880 cpinvlistandstrlist.contains_str(s),
1881 "missing string \"{}\" from parsed set \"{}\"",
1882 s.escape_debug(),
1883 source.escape_debug()
1884 );
1885 }
1886 let actual_size = cpinvlistandstrlist.size();
1887 assert_eq!(
1888 actual_size,
1889 expected_size,
1890 "got unexpected size {}, expected {} for parsed set \"{}\"",
1891 actual_size,
1892 expected_size,
1893 source.escape_debug()
1894 );
1895 }
1896
1897 fn assert_is_error_and_message_eq(source: &str, expected_err: &str, vm: &VariableMap<'_>) {
1898 let result = parse_with_variables(source, vm);
1899 assert!(result.is_err(), "{source} does not cause an error!");
1900 let err = result.unwrap_err();
1901 assert_eq!(err.fmt_with_source(source).to_string(), expected_err);
1902 }
1903
// Checks the parsed result of patterns that reference variables.
//
// Each case is `(variable map, pattern, expected code points, expected strings)`. The
// expected code points are written as consecutive pairs that `range_iter_from_str` turns
// into inclusive ranges, e.g. "az" is a..=z and "aa" is the single code point a.
#[test]
fn test_semantics_with_variables() {
    // $a -> 'a', $var2 -> 'z'
    let mut map_char_char = VariableMap::default();
    map_char_char.insert_char("a".to_string(), 'a').unwrap();
    map_char_char.insert_char("var2".to_string(), 'z').unwrap();

    // $hehe -> '-', a character that otherwise acts as syntax inside a set
    let mut map_headache = VariableMap::default();
    map_headache.insert_char("hehe".to_string(), '-').unwrap();

    // $a -> 'a', $var2 -> the string "abc"
    let mut map_char_string = VariableMap::default();
    map_char_string.insert_char("a".to_string(), 'a').unwrap();
    map_char_string
        .insert_string("var2".to_string(), "abc".to_string())
        .unwrap();

    // $a -> 'a', $set -> the set parsed from the pattern below
    let (set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
    let mut map_char_set = VariableMap::default();
    map_char_set.insert_char("a".to_string(), 'a').unwrap();
    map_char_set.insert_set("set".to_string(), set).unwrap();

    let cases: Vec<(_, _, _, Vec<&str>)> = vec![
        // character variables
        (&map_char_char, "[$a]", "aa", vec![]),
        (&map_char_char, "[ $a ]", "aa", vec![]),
        // a `$` that does not start a variable name is expected to contribute U+FFFF
        (&map_char_char, "[$a$]", "aa\u{ffff}\u{ffff}", vec![]),
        (&map_char_char, "[$a$ ]", "aa\u{ffff}\u{ffff}", vec![]),
        (&map_char_char, "[$a$var2]", "aazz", vec![]),
        // character variables as range endpoints
        (&map_char_char, "[$a - $var2]", "az", vec![]),
        (&map_char_char, "[$a-$var2]", "az", vec![]),
        // a variable holding '-' yields the literal character, not a range operator
        (&map_headache, "[a $hehe z]", "aazz--", vec![]),
        (
            &map_char_char,
            "[[$]var2]",
            "\u{ffff}\u{ffff}vvaarr22",
            vec![],
        ),
        // escaping `$` suppresses variable lookup; an escaped backslash does not
        (&map_char_char, r"[\$var2]", "$$vvaarr22", vec![]),
        (&map_char_char, r"[\\$var2]", r"\\zz", vec![]),
        // no variable lookup inside string literals
        (&map_char_char, "[{$a}]", "", vec!["$a"]),
        // set variables combined with set operators
        (&map_char_set, "[$set & [b-z]]", "bz", vec![]),
        (&map_char_set, "[[a-z]-[b-z]]", "aa", vec![]),
        (&map_char_set, "[$set-[b-z]]", "aa", vec!["Hello, World!"]),
        (&map_char_set, "[$set-$set]", "", vec![]),
        (&map_char_set, "[[a-zA]-$set]", "AA", vec![]),
        (&map_char_set, "[$set[b-z]]", "az", vec!["Hello, World!"]),
        (&map_char_set, "[[a-a]$set]", "az", vec!["Hello, World!"]),
        // a set variable on its own is a complete pattern
        (&map_char_set, "$set", "az", vec!["Hello, World!"]),
        // string variables
        (&map_char_string, "[$var2]", "", vec!["abc"]),
    ];
    for (variable_map, source, single, strings) in cases {
        let parsed = parse_with_variables(source, variable_map);
        if let Err(err) = parsed {
            panic!(
                "{source} results in an error: {}",
                err.fmt_with_source(source)
            );
        }
        let (set, consumed) = parsed.unwrap();
        // every pattern in this table must be consumed completely
        assert_eq!(consumed, source.len(), "{source:?} is not fully consumed");
        assert_set_equality(
            source,
            &set,
            range_iter_from_str(single),
            strings.into_iter(),
        );
    }
}
1975
// Checks the parsed result of patterns that use no variables.
//
// Each case is `(pattern, expected code points, expected strings)`. The expected code
// points are written as consecutive pairs that `range_iter_from_str` turns into inclusive
// ranges, e.g. "az" is a..=z and "aa" is the single code point a.
#[test]
fn test_semantics() {
    // the full code point range, in the pair encoding described above
    const ALL_CHARS: &str = "\x00\u{10FFFF}";
    let cases: Vec<(_, _, Vec<&str>)> = vec![
        // simple sets, ranges, and literal '-'
        ("[a]", "aa", vec![]),
        ("[]", "", vec![]),
        ("[qax]", "aaqqxx", vec![]),
        ("[a-z]", "az", vec![]),
        ("[--]", "--", vec![]),
        ("[a-b-]", "ab--", vec![]),
        ("[[a-b]-]", "ab--", vec![]),
        ("[{ab}-]", "--", vec!["ab"]),
        ("[-a-b]", "ab--", vec![]),
        ("[-a]", "--aa", vec![]),
        // escaped newline: both the `\n` escape and a backslash before a raw newline
        (r"[\n]", "\n\n", vec![]),
        ("[\\\n]", "\n\n", vec![]),
        // unescaped whitespace is skipped, so these sets are empty
        ("[\n]", "", vec![]),
        ("[\u{9}]", "", vec![]),
        ("[\u{A}]", "", vec![]),
        ("[\u{B}]", "", vec![]),
        ("[\u{C}]", "", vec![]),
        ("[\u{D}]", "", vec![]),
        ("[\u{20}]", "", vec![]),
        ("[\u{85}]", "", vec![]),
        ("[\u{200E}]", "", vec![]),
        ("[\u{200F}]", "", vec![]),
        ("[\u{2028}]", "", vec![]),
        ("[\u{2029}]", "", vec![]),
        // a bare `$` is expected to contribute U+FFFF
        ("[^[^$]]", "\u{ffff}\u{ffff}", vec![]),
        ("[^[^ $]]", "\u{ffff}\u{ffff}", vec![]),
        ("[^[^ $ ]]", "\u{ffff}\u{ffff}", vec![]),
        ("[^[^a$]]", "aa\u{ffff}\u{ffff}", vec![]),
        ("[^[^a$ ]]", "aa\u{ffff}\u{ffff}", vec![]),
        // whitespace around '-' and ':'
        ("[-]", "--", vec![]),
        ("[ - ]", "--", vec![]),
        ("[ - - ]", "--", vec![]),
        ("[ a-b - ]", "ab--", vec![]),
        ("[ -a]", "--aa", vec![]),
        ("[a-]", "--aa", vec![]),
        ("[a- ]", "--aa", vec![]),
        ("[ :]", "::", vec![]),
        ("[ :L:]", "::LL", vec![]),
        // U+00A0 is kept, unlike the whitespace characters above
        ("[\u{A0}]", "\u{A0}\u{A0}", vec![]),
        // `$` bare, escaped, and inside a string literal
        ("[$]", "\u{ffff}\u{ffff}", vec![]),
        (r"[\$]", "$$", vec![]),
        ("[{$}]", "$$", vec![]),
        // set operators
        ("[[a-z]&[b-z]]", "bz", vec![]),
        ("[[a-z]-[b-z]]", "aa", vec![]),
        ("[[a-z][b-z]]", "az", vec![]),
        ("[[a-a][b-z]]", "az", vec![]),
        ("[[a-z{abc}]&[b-z{abc}{abx}]]", "bz", vec!["abc"]),
        ("[[{abx}a-z{abc}]&[b-z{abc}]]", "bz", vec!["abc"]),
        ("[[a-z{abx}]-[{abx}b-z{abc}]]", "aa", vec![]),
        ("[[a-z{abx}{abc}]-[{abx}b-z]]", "aa", vec!["abc"]),
        ("[[a-z{abc}][b-z{abx}]]", "az", vec!["abc", "abx"]),
        ("[{this is a minus -}]", "", vec!["thisisaminus-"]),
        // chains of operators
        ("[[a-a][b-z] - [a-d][e-z]]", "ez", vec![]),
        ("[[a-a][b-z] - [a-d]&[e-z]]", "ez", vec![]),
        ("[[a-a][b-z] - [a-z][]]", "", vec![]),
        ("[[a-a][b-z] - [a-z]&[]]", "", vec![]),
        ("[[a-a][b-z] & [a-z]-[]]", "az", vec![]),
        ("[[a-a][b-z] & []-[a-z]]", "", vec![]),
        ("[[a-a][b-z] & [a-b][x-z]]", "abxz", vec![]),
        ("[[a-z]-[a-b]-[y-z]]", "cx", vec![]),
        // numeric escapes
        (r"[\x61-\x63]", "ac", vec![]),
        (r"[a-\x63]", "ac", vec![]),
        (r"[\x61-c]", "ac", vec![]),
        (r"[\u0061-\x63]", "ac", vec![]),
        (r"[\U00000061-\x63]", "ac", vec![]),
        (r"[\x{61}-\x63]", "ac", vec![]),
        (r"[\u{61}-\x63]", "ac", vec![]),
        (r"[\u{61}{hello\ world}]", "aa", vec!["hello world"]),
        (r"[{hello\ world}\u{61}]", "aa", vec!["hello world"]),
        (r"[{h\u{65}llo\ world}]", "", vec!["hello world"]),
        // complement
        (r"[^]", ALL_CHARS, vec![]),
        (r"[[^]-[^a-z]]", "az", vec![]),
        (r"[^{h\u{65}llo\ world}]", ALL_CHARS, vec![]),
        (
            r"[^[{h\u{65}llo\ world}]-[{hello\ world}]]",
            ALL_CHARS,
            vec![],
        ),
        (
            r"[^[\x00-\U0010FFFF]-[\u0100-\U0010FFFF]]",
            "\u{100}\u{10FFFF}",
            vec![],
        ),
        (r"[^[^a-z]]", "az", vec![]),
        (r"[^[^\^]]", "^^", vec![]),
        // several code points inside one bracketed escape
        (r"[{\x{61 0062 063}}]", "", vec!["abc"]),
        (r"[\x{61 0062 063}]", "ac", vec![]),
        // binary properties, in `[:…:]` and `\p{…}` syntax
        (r"[:AHex:]", "09afAF", vec![]),
        (r"[:AHex=True:]", "09afAF", vec![]),
        (r"[:AHex=T:]", "09afAF", vec![]),
        (r"[:AHex=Yes:]", "09afAF", vec![]),
        (r"[:AHex=Y:]", "09afAF", vec![]),
        (r"[:^AHex≠True:]", "09afAF", vec![]),
        (r"[:AHex≠False:]", "09afAF", vec![]),
        (r"[[:^AHex≠False:]&[\x00-\x10]]", "\0\x10", vec![]),
        (r"\p{AHex}", "09afAF", vec![]),
        (r"\p{AHex=True}", "09afAF", vec![]),
        (r"\p{AHex=T}", "09afAF", vec![]),
        (r"\p{AHex=Yes}", "09afAF", vec![]),
        (r"\p{AHex=Y}", "09afAF", vec![]),
        (r"\P{AHex≠True}", "09afAF", vec![]),
        (r"\p{AHex≠False}", "09afAF", vec![]),
        // general category
        (r"[[:gc=lower-case-letter:]&[a-zA-Z]]", "az", vec![]),
        (r"[[:lower case letter:]&[a-zA-Z]]", "az", vec![]),
        (
            r"[[[:L:]-[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]][[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]-[:L:]]]",
            "",
            vec![],
        ),
        // script
        (r"[[:sc=latn:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:sc=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:Latin:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:latn:]&[a-zA-Z]]", "azAZ", vec![]),
        // script extensions
        (r"[[:scx=latn:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:scx=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:scx=Hira:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
        (r"[[:sc=Hira:]&[\u30FC]]", "", vec![]),
        (r"[[:scx=Kana:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
        (r"[[:sc=Kana:]&[\u30FC]]", "", vec![]),
        (r"[[:sc=Common:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
        // grapheme cluster break
        (
            r"\p{Grapheme_Cluster_Break=ZWJ}",
            "\u{200D}\u{200D}",
            vec![],
        ),
        // sentence break
        (
            r"\p{Sentence_Break=ATerm}",
            "\u{002E}\u{002E}\u{2024}\u{2024}\u{FE52}\u{FE52}\u{FF0E}\u{FF0E}",
            vec![],
        ),
        // word break
        (r"\p{Word_Break=Single_Quote}", "\u{0027}\u{0027}", vec![]),
        // more syntax edge cases
        (r"[\^a]", "^^aa", vec![]),
        (r"[{{}]", "{{", vec![]),
        (r"[{}}]", "}}", vec![""]),
        (r"[}]", "}}", vec![]),
        (r"[{$var}]", "", vec!["$var"]),
        (r"[{[a-z}]", "", vec!["[a-z"]),
        (r"[ { [ a - z } ]", "", vec!["[a-z"]),
    ];
    for (source, single, strings) in cases {
        let parsed = parse(source);
        if let Err(err) = parsed {
            panic!(
                "{source} results in an error: {}",
                err.fmt_with_source(source)
            );
        }
        let (set, consumed) = parsed.unwrap();
        // every pattern in this table must be consumed completely
        assert_eq!(consumed, source.len());
        assert_set_equality(
            source,
            &set,
            range_iter_from_str(single),
            strings.into_iter(),
        );
    }
}
2159
// Checks the rendered error for invalid patterns that involve variables.
//
// Each case is `(variable map, pattern, expected message)`. The expected message repeats the
// pattern up to the reported position, followed by `← error: …`.
#[test]
fn test_error_messages_with_variables() {
    // $a -> 'a', $var2 -> 'z'
    let mut map_char_char = VariableMap::default();
    map_char_char.insert_char("a".to_string(), 'a').unwrap();
    map_char_char.insert_char("var2".to_string(), 'z').unwrap();

    // $a -> 'a', $var2 -> the string "abc"
    let mut map_char_string = VariableMap::default();
    map_char_string.insert_char("a".to_string(), 'a').unwrap();
    map_char_string
        .insert_string("var2".to_string(), "abc".to_string())
        .unwrap();

    // $a -> 'a', $set -> the set parsed from the pattern below
    let (set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
    let mut map_char_set = VariableMap::default();
    map_char_set.insert_char("a".to_string(), 'a').unwrap();
    map_char_set.insert_set("set".to_string(), set).unwrap();

    let cases = [
        (&map_char_char, "[$$a]", r"[$$a← error: unexpected variable"),
        (
            &map_char_char,
            "[$ a]",
            r"[$ a← error: unexpected character 'a'",
        ),
        // a character variable on its own is not a complete pattern
        (&map_char_char, "$a", r"$a← error: unexpected variable"),
        (&map_char_char, "$", r"$← error: unexpected end of input"),
        // string and set variables are rejected as range endpoints
        (
            &map_char_string,
            "[$var2-$a]",
            r"[$var2-$a← error: unexpected variable",
        ),
        (
            &map_char_string,
            "[$a-$var2]",
            r"[$a-$var2← error: unexpected variable",
        ),
        (
            &map_char_set,
            "[$a-$set]",
            r"[$a-$set← error: unexpected variable",
        ),
        (
            &map_char_set,
            "[$set-$a]",
            r"[$set-$a← error: unexpected variable",
        ),
        (
            &map_char_set,
            "[$=]",
            "[$=← error: unexpected character '='",
        ),
    ];
    for (variable_map, source, expected_err) in cases {
        assert_is_error_and_message_eq(source, expected_err, variable_map);
    }
}
2216
// Checks the rendered error for invalid patterns parsed without any variables.
//
// Each case is `(pattern, expected message)`. The expected message repeats the pattern up to
// the reported position, followed by `← error: …`.
#[test]
fn test_error_messages() {
    let cases = [
        // input ends before the set is closed
        (r"[a-z[\]]", r"[a-z[\]]← error: unexpected end of input"),
        (r"", r"← error: unexpected end of input"),
        (r"[{]", r"[{]← error: unexpected end of input"),
        // property names and values
        (
            r"[:general_category:]",
            r"[:general_category← error: unknown property",
        ),
        (r"[:ll=true:]", r"[:ll=true← error: unknown property"),
        (r"[:=", r"[:=← error: unexpected character '='"),
        (r"[::]", r"[::← error: unexpected character ':'"),
        (r"[:=hello:]", r"[:=← error: unexpected character '='"),
        (r"[:gc=:]", r"[:gc=:← error: unexpected character ':'"),
        // malformed escapes, ranges, and operators
        (r"[\xag]", r"[\xag← error: unexpected character 'g'"),
        (r"[a-b-z]", r"[a-b-z← error: unexpected character 'z'"),
        (r"[a-\p{ll}]", r"[a-\← error: unexpected character '\\'"),
        (r"[a-&]", r"[a-&← error: unexpected character '&'"),
        (r"[a&b]", r"[a&← error: unexpected character '&'"),
        (r"[[set]&b]", r"[[set]&b← error: unexpected character 'b'"),
        (r"[[set]&]", r"[[set]&]← error: unexpected character ']'"),
        // range whose end precedes its start
        (r"[a-\x60]", r"[a-\x60← error: unexpected character '`'"),
        (r"[a-`]", r"[a-`← error: unexpected character '`'"),
        // bracketed escapes
        (r"[\x{6g}]", r"[\x{6g← error: unexpected character 'g'"),
        (r"[\x{g}]", r"[\x{g← error: unexpected character 'g'"),
        (r"[\x{}]", r"[\x{}← error: unexpected character '}'"),
        (
            r"[\x{dabeef}]",
            r"[\x{dabeef← error: invalid escape sequence",
        ),
        (
            r"[\x{10ffff0}]",
            r"[\x{10ffff0← error: unexpected character '0'",
        ),
        (
            r"[\x{11ffff}]",
            r"[\x{11ffff← error: invalid escape sequence",
        ),
        (
            r"[\x{10ffff 1 10ffff0}]",
            r"[\x{10ffff 1 10ffff0← error: unexpected character '0'",
        ),
        // characters that take more than one byte in UTF-8
        (r"ä", r"ä← error: unexpected character 'ä'"),
        (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
        (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
        (
            r"[\xe5-\xe4]",
            r"[\xe5-\xe4← error: unexpected character 'ä'",
        ),
        (r"[\xe5-ä]", r"[\xe5-ä← error: unexpected character 'ä'"),
        // whitespace in positions where it changes the meaning
        (r"[ ^]", r"[ ^← error: unexpected character '^'"),
        (r"[:]", r"[:]← error: unexpected character ']'"),
        (r"[:L]", r"[:L]← error: unexpected character ']'"),
        (r"\p {L}", r"\p ← error: unexpected character ' '"),
        // bracketed escapes with several code points are rejected as range endpoints
        (
            r"[\x{61 62}-d]",
            r"[\x{61 62}-d← error: unexpected character 'd'",
        ),
        (
            r"[\x{61 63}-\x{62 64}]",
            r"[\x{61 63}-\← error: unexpected character '\\'",
        ),
        (r"[a-\x{62 64}]", r"[a-\← error: unexpected character '\\'"),
    ];
    // no variables are defined for these cases
    let vm = Default::default();
    for (source, expected_err) in cases {
        assert_is_error_and_message_eq(source, expected_err, &vm);
    }
}
2295
// Checks that parsing stops after the first complete set and reports how many bytes it
// consumed, for both `parse` and `parse_with_variables` (with an empty variable map).
//
// Each case is `(expected consumed byte count, pattern)`; the count is written as the length
// of a literal that is as long as the leading set of the pattern.
#[test]
fn test_consumed() {
    let cases = [
        (r"[a-z\]{[}]".len(), r"[a-z\]{[}][]"),
        (r"[a-z\]{[}]".len(), r"[a-z\]{[}] []"),
        (r"[a-z\]{[}]".len(), r"[a-z\]{]}] []"),
        (r"[a-z\]{{[}]".len(), r"[a-z\]{{]}] []"),
        (r"[a-z\]{[}]".len(), r"[a-z\]{]}]\p{L}"),
        (r"[a-z\]{[}]".len(), r"[a-z\]{]}]$var"),
    ];

    let no_variables = VariableMap::default();
    for (prefix_len, pattern) in cases {
        let plain = parse(pattern).map(|(_, consumed)| consumed).unwrap();
        assert_eq!(prefix_len, plain);

        let with_variables = parse_with_variables(pattern, &no_variables)
            .map(|(_, consumed)| consumed)
            .unwrap();
        assert_eq!(prefix_len, with_variables);
    }
}
2315}