use alloc::borrow::Cow;
use alloc::collections::{BTreeMap, BTreeSet};
use alloc::format;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::fmt::Display;
use core::{iter::Peekable, str::CharIndices};
use icu_collections::{
codepointinvlist::{CodePointInversionList, CodePointInversionListBuilder},
codepointinvliststringlist::CodePointInversionListAndStringList,
};
use icu_properties::script::ScriptWithExtensions;
use icu_properties::{
props::{
CanonicalCombiningClass, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
GraphemeClusterBreak, LineBreak, Script, SentenceBreak, WordBreak,
},
CodePointMapData,
};
use icu_properties::{
props::{PatternWhiteSpace, XidContinue, XidStart},
CodePointSetData,
};
use icu_properties::{provider::*, PropertyParser};
use icu_provider::prelude::*;
/// The kind of error that occurred while parsing.
///
/// `displaydoc::Display` derives each variant's `Display` text from its
/// `#[displaydoc(...)]` attribute or, when that is absent, from its doc comment,
/// so every variant must carry one of the two.
#[derive(Debug, Clone, Copy, PartialEq, Eq, displaydoc::Display)]
#[non_exhaustive]
pub enum ParseErrorKind {
    /// An unexpected character was encountered; the payload is that character.
    #[displaydoc("An unexpected character was encountered")]
    UnexpectedChar(char),
    /// The property name or value is unknown.
    #[displaydoc("The property name or value is unknown")]
    UnknownProperty,
    /// A reference to an unknown variable.
    UnknownVariable,
    /// A variable of a certain type occurring in an unexpected context.
    UnexpectedVariable,
    /// The source ended before parsing was complete.
    Eof,
    /// Something unexpected went wrong internally.
    Internal,
    /// The provided syntax is not supported.
    #[displaydoc("The provided syntax is not supported by us.")]
    Unimplemented,
    /// The provided escape sequence is not a valid Unicode code point.
    InvalidEscape,
}
use zerovec::VarZeroVec;
use ParseErrorKind as PEK;
impl ParseErrorKind {
    /// Pairs this kind with `offset`, producing a positioned [`ParseError`].
    fn with_offset(self, offset: usize) -> ParseError {
        let kind = self;
        let offset = Some(offset);
        ParseError { offset, kind }
    }
}
impl From<ParseErrorKind> for ParseError {
fn from(kind: ParseErrorKind) -> Self {
ParseError { offset: None, kind }
}
}
/// A parse failure: a [`ParseErrorKind`] plus, when known, an offset into the source.
///
/// [`ParseError::fmt_with_source`] uses the offset as a byte index into the source string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ParseError {
// `None` when the error was created without position information.
offset: Option<usize>,
// What went wrong.
kind: ParseErrorKind,
}
/// Module-local `Result` alias whose error type defaults to [`ParseError`].
type Result<T, E = ParseError> = core::result::Result<T, E>;
impl ParseError {
pub fn fmt_with_source(&self, source: &str) -> impl Display {
let ParseError { offset, kind } = *self;
if kind == ParseErrorKind::Eof {
return format!("{source}← error: unexpected end of input");
}
let mut s = String::new();
if let Some(offset) = offset {
if offset < source.len() {
let mut exclusive_end = offset + 1;
for _ in 0..3 {
if source.is_char_boundary(exclusive_end) {
break;
}
exclusive_end += 1;
}
s.push_str(&source[..exclusive_end]);
s.push_str("← ");
}
}
s.push_str("error: ");
match kind {
ParseErrorKind::UnexpectedChar(c) => {
s.push_str(&format!("unexpected character '{}'", c.escape_debug()));
}
ParseErrorKind::UnknownProperty => {
s.push_str("unknown property");
}
ParseErrorKind::UnknownVariable => {
s.push_str("unknown variable");
}
ParseErrorKind::UnexpectedVariable => {
s.push_str("unexpected variable");
}
ParseErrorKind::Eof => {
s.push_str("unexpected end of input");
}
ParseErrorKind::Internal => {
s.push_str("internal error");
}
ParseErrorKind::Unimplemented => {
s.push_str("unimplemented");
}
ParseErrorKind::InvalidEscape => {
s.push_str("invalid escape sequence");
}
}
s
}
pub fn kind(&self) -> ParseErrorKind {
self.kind
}
pub fn offset(&self) -> Option<usize> {
self.offset
}
fn or_with_offset(self, offset: usize) -> Self {
match self.offset {
Some(_) => self,
None => ParseError {
offset: Some(offset),
..self
},
}
}
}
/// The value a variable stored in a [`VariableMap`] can hold.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum VariableValue<'a> {
/// A set of code points and strings.
UnicodeSet(CodePointInversionListAndStringList<'a>),
/// A single character.
Char(char),
/// A string. The `VariableMap` insert methods store one-character strings as `Char` instead.
String(Cow<'a, str>),
}
/// A map from variable names to [`VariableValue`]s, backed by a `BTreeMap`.
#[derive(Debug, Clone, Default)]
pub struct VariableMap<'a>(BTreeMap<String, VariableValue<'a>>);
impl<'a> VariableMap<'a> {
pub fn new() -> Self {
Self::default()
}
pub fn remove(&mut self, key: &str) -> Option<VariableValue<'a>> {
self.0.remove(key)
}
pub fn get(&self, key: &str) -> Option<&VariableValue<'a>> {
self.0.get(key)
}
pub fn insert(
&mut self,
key: String,
value: VariableValue<'a>,
) -> Result<(), &VariableValue<'_>> {
if self.0.contains_key(&key) {
#[expect(clippy::indexing_slicing)]
return Err(&self.0[&key]);
}
if let VariableValue::String(s) = &value {
let mut chars = s.chars();
if let (Some(c), None) = (chars.next(), chars.next()) {
self.0.insert(key, VariableValue::Char(c));
return Ok(());
};
}
self.0.insert(key, value);
Ok(())
}
pub fn insert_char(&mut self, key: String, c: char) -> Result<(), &VariableValue<'_>> {
if self.0.contains_key(&key) {
#[expect(clippy::indexing_slicing)]
return Err(&self.0[&key]);
}
self.0.insert(key, VariableValue::Char(c));
Ok(())
}
pub fn insert_string(&mut self, key: String, s: String) -> Result<(), &VariableValue<'_>> {
if self.0.contains_key(&key) {
#[expect(clippy::indexing_slicing)]
return Err(&self.0[&key]);
}
let mut chars = s.chars();
let val = match (chars.next(), chars.next()) {
(Some(c), None) => VariableValue::Char(c),
_ => VariableValue::String(Cow::Owned(s)),
};
self.0.insert(key, val);
Ok(())
}
pub fn insert_str(&mut self, key: String, s: &'a str) -> Result<(), &VariableValue<'_>> {
if self.0.contains_key(&key) {
#[expect(clippy::indexing_slicing)]
return Err(&self.0[&key]);
}
let mut chars = s.chars();
let val = match (chars.next(), chars.next()) {
(Some(c), None) => VariableValue::Char(c),
_ => VariableValue::String(Cow::Borrowed(s)),
};
self.0.insert(key, val);
Ok(())
}
pub fn insert_set(
&mut self,
key: String,
set: CodePointInversionListAndStringList<'a>,
) -> Result<(), &VariableValue<'_>> {
if self.0.contains_key(&key) {
#[expect(clippy::indexing_slicing)]
return Err(&self.0[&key]);
}
self.0.insert(key, VariableValue::UnicodeSet(set));
Ok(())
}
}
/// Returns `true` unless `c` is one of `&`, `-`, `$`, `^`, `[`, `]` or `{`.
fn legal_char_start(c: char) -> bool {
    !matches!(c, '&' | '-' | '$' | '^' | '[' | ']' | '{')
}
/// Returns `true` for every character except `}`.
fn legal_char_in_string_start(c: char) -> bool {
    !matches!(c, '}')
}
/// Result of parsing one (possibly escaped) character.
#[derive(Debug)]
enum SingleOrMultiChar {
/// A complete single character.
Single(char),
/// The first character of a braced multi-code-point escape; `parse_escaped_char`
/// returns this with the remaining code points and the closing `}` still unconsumed.
Multi(char),
}
/// A literal element: either a string or a character (see [`SingleOrMultiChar`]).
#[derive(Debug)]
enum Literal {
String(String),
CharKind(SingleOrMultiChar),
}
/// A token produced by `parse_main_token` while inside a bracketed set.
#[derive(Debug)]
enum MainToken<'data> {
/// A character or string literal.
Literal(Literal),
/// An already-evaluated set (a nested set, a property, or a set-valued variable).
UnicodeSet(CodePointInversionListAndStringList<'data>),
/// A `$` that is not followed by a variable name.
DollarSign,
/// `&`
Ampersand,
/// `-`
Minus,
/// `]`
ClosingBracket,
}
impl<'data> MainToken<'data> {
fn from_variable_value(val: VariableValue<'data>) -> Self {
match val {
VariableValue::Char(c) => {
MainToken::Literal(Literal::CharKind(SingleOrMultiChar::Single(c)))
}
VariableValue::String(s) => {
MainToken::Literal(Literal::String(s.into_owned()))
}
VariableValue::UnicodeSet(set) => MainToken::UnicodeSet(set),
}
}
}
/// How a set operand is combined into the set built so far
/// (see `process_chars` / `process_strings`).
#[derive(Debug, Clone, Copy)]
enum Operation {
Union,
Difference,
Intersection,
}
/// Parser state for one set: accumulates code points and strings while consuming
/// characters from a shared iterator over the source.
struct UnicodeSetBuilder<'a, 'b, P: ?Sized> {
// Code points collected so far.
single_set: CodePointInversionListBuilder,
// Strings collected so far.
string_set: BTreeSet<String>,
// Shared cursor over `source`; nested builders borrow the same iterator.
iter: &'a mut Peekable<CharIndices<'b>>,
// The text being parsed; sliced by offset when decoding hex escapes.
source: &'b str,
// Whether the result must be complemented in `finalize`.
inverted: bool,
// Variables available for `$name` references.
variable_map: &'a VariableMap<'a>,
// Used to recognise the first character of a variable name.
xid_start: &'a CodePointInversionList<'a>,
// Used to recognise the remaining characters of a variable name.
xid_continue: &'a CodePointInversionList<'a>,
// Characters skipped by `skip_whitespace`.
pat_ws: &'a CodePointInversionList<'a>,
// Data provider used to load property data.
property_provider: &'a P,
}
impl<'a, 'b, P> UnicodeSetBuilder<'a, 'b, P>
where
P: ?Sized
+ DataProvider<PropertyBinaryAlphabeticV1>
+ DataProvider<PropertyBinaryAsciiHexDigitV1>
+ DataProvider<PropertyBinaryBidiControlV1>
+ DataProvider<PropertyBinaryBidiMirroredV1>
+ DataProvider<PropertyBinaryCasedV1>
+ DataProvider<PropertyBinaryCaseIgnorableV1>
+ DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
+ DataProvider<PropertyBinaryChangesWhenCasemappedV1>
+ DataProvider<PropertyBinaryChangesWhenLowercasedV1>
+ DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
+ DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
+ DataProvider<PropertyBinaryChangesWhenUppercasedV1>
+ DataProvider<PropertyBinaryDashV1>
+ DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
+ DataProvider<PropertyBinaryDeprecatedV1>
+ DataProvider<PropertyBinaryDiacriticV1>
+ DataProvider<PropertyBinaryEmojiComponentV1>
+ DataProvider<PropertyBinaryEmojiModifierBaseV1>
+ DataProvider<PropertyBinaryEmojiModifierV1>
+ DataProvider<PropertyBinaryEmojiPresentationV1>
+ DataProvider<PropertyBinaryEmojiV1>
+ DataProvider<PropertyBinaryExtendedPictographicV1>
+ DataProvider<PropertyBinaryExtenderV1>
+ DataProvider<PropertyBinaryGraphemeBaseV1>
+ DataProvider<PropertyBinaryGraphemeExtendV1>
+ DataProvider<PropertyBinaryHexDigitV1>
+ DataProvider<PropertyBinaryIdContinueV1>
+ DataProvider<PropertyBinaryIdeographicV1>
+ DataProvider<PropertyBinaryIdsBinaryOperatorV1>
+ DataProvider<PropertyBinaryIdStartV1>
+ DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
+ DataProvider<PropertyBinaryJoinControlV1>
+ DataProvider<PropertyBinaryLogicalOrderExceptionV1>
+ DataProvider<PropertyBinaryLowercaseV1>
+ DataProvider<PropertyBinaryMathV1>
+ DataProvider<PropertyBinaryNoncharacterCodePointV1>
+ DataProvider<PropertyBinaryPatternSyntaxV1>
+ DataProvider<PropertyBinaryPatternWhiteSpaceV1>
+ DataProvider<PropertyBinaryQuotationMarkV1>
+ DataProvider<PropertyBinaryRadicalV1>
+ DataProvider<PropertyBinaryRegionalIndicatorV1>
+ DataProvider<PropertyBinarySentenceTerminalV1>
+ DataProvider<PropertyBinarySoftDottedV1>
+ DataProvider<PropertyBinaryTerminalPunctuationV1>
+ DataProvider<PropertyBinaryUnifiedIdeographV1>
+ DataProvider<PropertyBinaryUppercaseV1>
+ DataProvider<PropertyBinaryVariationSelectorV1>
+ DataProvider<PropertyBinaryWhiteSpaceV1>
+ DataProvider<PropertyBinaryXidContinueV1>
+ DataProvider<PropertyBinaryXidStartV1>
+ DataProvider<PropertyEnumCanonicalCombiningClassV1>
+ DataProvider<PropertyEnumGeneralCategoryV1>
+ DataProvider<PropertyEnumGraphemeClusterBreakV1>
+ DataProvider<PropertyEnumLineBreakV1>
+ DataProvider<PropertyEnumScriptV1>
+ DataProvider<PropertyEnumSentenceBreakV1>
+ DataProvider<PropertyEnumWordBreakV1>
+ DataProvider<PropertyNameParseCanonicalCombiningClassV1>
+ DataProvider<PropertyNameParseGeneralCategoryMaskV1>
+ DataProvider<PropertyNameParseGraphemeClusterBreakV1>
+ DataProvider<PropertyNameParseLineBreakV1>
+ DataProvider<PropertyNameParseScriptV1>
+ DataProvider<PropertyNameParseSentenceBreakV1>
+ DataProvider<PropertyNameParseWordBreakV1>
+ DataProvider<PropertyScriptWithExtensionsV1>,
{
fn new_internal(
iter: &'a mut Peekable<CharIndices<'b>>,
source: &'b str,
variable_map: &'a VariableMap<'a>,
xid_start: &'a CodePointInversionList<'a>,
xid_continue: &'a CodePointInversionList<'a>,
pat_ws: &'a CodePointInversionList<'a>,
provider: &'a P,
) -> Self {
UnicodeSetBuilder {
single_set: CodePointInversionListBuilder::new(),
string_set: Default::default(),
iter,
source,
inverted: false,
variable_map,
xid_start,
xid_continue,
pat_ws,
property_provider: provider,
}
}
/// Parses one complete set at the cursor into this builder.
///
/// Accepts a backslash-introduced property, a `[`-introduced set (POSIX property when
/// followed by `:`), or a `$variable` bound to a set. Anything else is an
/// `UnexpectedChar` error.
fn parse_unicode_set(&mut self) -> Result<()> {
    let lead = self.must_peek_char()?;
    if lead == '\\' {
        return self.parse_property_perl();
    }
    if lead == '[' {
        self.iter.next();
        return match self.peek_char() {
            Some(':') => self.parse_property_posix(),
            _ => self.parse_unicode_set_inner(),
        };
    }
    if lead != '$' {
        return self.error_here(PEK::UnexpectedChar(lead));
    }

    let (offset, resolved) = self.parse_variable()?;
    // A `$` without a variable name is not a set.
    let Some(value) = resolved else {
        return Err(PEK::UnexpectedChar('$').with_offset(offset));
    };
    // Only set-valued variables may stand in for a whole set.
    let VariableValue::UnicodeSet(set) = value else {
        return Err(PEK::UnexpectedVariable.with_offset(offset));
    };
    self.single_set.add_set(set.code_points());
    self.string_set
        .extend(set.strings().iter().map(ToString::to_string));
    Ok(())
}
/// Parses the body of a bracketed set; the opening `[` has already been consumed.
///
/// Handles an optional leading `^` (inversion) and an optional leading literal `-`,
/// then runs a small state machine over the tokens produced by `parse_main_token`
/// until the closing `]`. Returns an error for any token that is not valid in the
/// current state.
fn parse_unicode_set_inner(&mut self) -> Result<()> {
if self.must_peek_char()? == '^' {
self.iter.next();
self.inverted = true;
}
self.skip_whitespace();
// A `-` directly after the (optional) `^` is a literal minus.
if self.must_peek_char()? == '-' {
self.iter.next();
self.single_set.add_char('-');
}
// What was seen last; decides which tokens may follow.
#[derive(Debug, Clone, Copy)]
enum State {
Begin,
// A single character is pending in `prev_char`.
Char,
// A pending character followed by `-`: a range end is expected.
CharMinus,
AfterUnicodeSet,
// After `-` or `&` following a set: a set operand is expected.
AfterOp,
AfterDollar,
AfterMinus,
}
use State::*;
const DEFAULT_OP: Operation = Operation::Union;
let mut state = Begin;
// Last single character seen; not yet added because it may start a range.
let mut prev_char = None;
let mut operation = Operation::Union;
loop {
self.skip_whitespace();
// Remember where this token starts for error reporting in the fallback arm.
let (immediate_offset, immediate_char) = self.must_peek()?;
let (tok_offset, from_var, tok) = self.parse_main_token()?;
use MainToken as MT;
use SingleOrMultiChar as SMC;
match (state, tok) {
// `]` ends the set: flush the pending character and a trailing literal `-`.
(
Begin | Char | CharMinus | AfterUnicodeSet | AfterDollar | AfterMinus,
MT::ClosingBracket,
) => {
if let Some(prev) = prev_char.take() {
self.single_set.add_char(prev);
}
if matches!(state, CharMinus) {
self.single_set.add_char('-');
}
return Ok(());
}
// `[...]-]`: the `-` after a set turned out to be a literal minus.
(AfterOp, MT::ClosingBracket) if matches!(operation, Operation::Difference) => {
self.single_set.add_char('-');
return Ok(());
}
(Begin, MT::Minus) => {
self.single_set.add_char('-');
state = AfterMinus;
}
// A set operand: combine it using the pending operation, then reset to union.
(Begin | Char | AfterUnicodeSet | AfterOp, MT::UnicodeSet(set)) => {
if let Some(prev) = prev_char.take() {
self.single_set.add_char(prev);
}
self.process_chars(operation, set.code_points().clone());
self.process_strings(
operation,
set.strings().iter().map(ToString::to_string).collect(),
);
operation = DEFAULT_OP;
state = AfterUnicodeSet;
}
// A single character: hold it back in case it is the start of a range.
(
Begin | Char | AfterUnicodeSet,
MT::Literal(Literal::CharKind(SMC::Single(c))),
) => {
if let Some(prev) = prev_char.take() {
self.single_set.add_char(prev);
}
prev_char = Some(c);
state = Char;
}
// A multi-code-point escape: add the first character, then parse the rest.
(
Begin | Char | AfterUnicodeSet,
MT::Literal(Literal::CharKind(SMC::Multi(first_c))),
) => {
if let Some(prev) = prev_char.take() {
self.single_set.add_char(prev);
}
self.single_set.add_char(first_c);
self.parse_multi_escape_into_set()?;
state = Begin;
}
(Begin | Char | AfterUnicodeSet, MT::Literal(Literal::String(s))) => {
if let Some(prev) = prev_char.take() {
self.single_set.add_char(prev);
}
self.string_set.insert(s);
state = Begin;
}
// Range end: `prev_char` must hold the range start.
(CharMinus, MT::Literal(Literal::CharKind(SMC::Single(c)))) => {
let start = prev_char.ok_or(ParseError {
offset: Some(tok_offset),
kind: PEK::Internal,
})?;
let end = c;
// Descending ranges are rejected.
if start > end {
return Err(PEK::UnexpectedChar(end).with_offset(tok_offset));
}
self.single_set.add_range(start..=end);
prev_char = None;
state = Begin;
}
(Char, MT::Minus) => {
state = CharMinus;
}
(AfterUnicodeSet, MT::Minus) => {
operation = Operation::Difference;
state = AfterOp;
}
(AfterUnicodeSet, MT::Ampersand) => {
operation = Operation::Intersection;
state = AfterOp;
}
// A bare `$` is recorded as U+FFFF.
(Begin | Char | AfterUnicodeSet, MT::DollarSign) => {
if let Some(prev) = prev_char.take() {
self.single_set.add_char(prev);
}
self.single_set.add_char('\u{FFFF}');
state = AfterDollar;
}
// Any other combination is an error, attributed to the variable if the token
// came from one, otherwise to the first character of the token.
_ => {
if from_var {
return Err(PEK::UnexpectedVariable.with_offset(tok_offset));
}
return Err(PEK::UnexpectedChar(immediate_char).with_offset(immediate_offset));
}
}
}
}
/// Parses the next token inside a bracketed set.
///
/// Returns `(offset, from_variable, token)`, where `from_variable` is `true` only when
/// the token was produced by resolving a `$variable`.
fn parse_main_token(&mut self) -> Result<(usize, bool, MainToken<'a>)> {
let (initial_offset, first) = self.must_peek()?;
// `]` is handled before the two-character lookahead below, which fails with `Eof`
// when fewer than two characters remain.
if first == ']' {
self.iter.next();
return Ok((initial_offset, false, MainToken::ClosingBracket));
}
let (_, second) = self.must_peek_double()?;
match (first, second) {
// `$` is either a variable reference or a bare dollar sign.
('$', _) => {
let (offset, var_or_anchor) = self.parse_variable()?;
match var_or_anchor {
None => Ok((offset, false, MainToken::DollarSign)),
Some(v) => Ok((offset, true, MainToken::from_variable_value(v.clone()))),
}
}
('{', _) => self
.parse_string()
.map(|(offset, l)| (offset, false, MainToken::Literal(l))),
// A property (`\p`/`\P`) or a nested `[`: parse it with a fresh builder that
// shares this builder's cursor, then package the result as a set token.
('\\', 'p' | 'P') | ('[', _) => {
let mut inner_builder = UnicodeSetBuilder::new_internal(
self.iter,
self.source,
self.variable_map,
self.xid_start,
self.xid_continue,
self.pat_ws,
self.property_provider,
);
inner_builder.parse_unicode_set()?;
let (single, string_set) = inner_builder.finalize();
// Offset of the character just before the cursor.
let offset = self.must_peek_index()? - 1;
let mut strings = string_set.into_iter().collect::<Vec<_>>();
strings.sort();
let cpilasl = CodePointInversionListAndStringList::try_from(
single.build(),
VarZeroVec::from(&strings),
)
.map_err(|_| PEK::Internal.with_offset(offset))?;
Ok((offset, false, MainToken::UnicodeSet(cpilasl)))
}
// This arm must come after the `\p`/`[`/`$`/`{` arms and before the operator arms:
// `legal_char_start` rejects `-` and `&`, which fall through to the arms below.
(c, _) if legal_char_start(c) => self
.parse_char()
.map(|(offset, c)| (offset, false, MainToken::Literal(Literal::CharKind(c)))),
('-', _) => {
self.iter.next();
Ok((initial_offset, false, MainToken::Minus))
}
('&', _) => {
self.iter.next();
Ok((initial_offset, false, MainToken::Ampersand))
}
(c, _) => Err(PEK::UnexpectedChar(c).with_offset(initial_offset)),
}
}
/// Parses `$` followed by an optional variable name and resolves the name.
///
/// Returns `(offset, None)` for a `$` that is not followed by a name-start character
/// (the offset is then that of the `$`), or `(offset, Some(value))` with the offset of
/// the name's last character.
///
/// # Errors
///
/// `UnknownVariable` if the name is not in the variable map; `Eof` if the input ends
/// right after the `$`.
fn parse_variable(&mut self) -> Result<(usize, Option<&'a VariableValue<'a>>)> {
    self.consume('$')?;
    let (start, head) = self.must_peek()?;
    if !self.xid_start.contains(head) {
        // Step back over the one-byte `$` that was just consumed.
        return Ok((start - 1, None));
    }
    self.iter.next();

    let mut name = String::from(head);
    let mut last = start;
    while let Some((at, c)) = self
        .iter
        .peek()
        .copied()
        .filter(|&(_, c)| self.xid_continue.contains(c))
    {
        self.iter.next();
        name.push(c);
        last = at;
    }

    match self.variable_map.0.get(&name) {
        Some(value) => Ok((last, Some(value))),
        None => Err(PEK::UnknownVariable.with_offset(last)),
    }
}
fn parse_string(&mut self) -> Result<(usize, Literal)> {
self.consume('{')?;
let mut buffer = String::new();
let mut last_offset;
loop {
self.skip_whitespace();
last_offset = self.must_peek_index()?;
match self.must_peek_char()? {
'}' => {
self.iter.next();
break;
}
c if legal_char_in_string_start(c) => {
let (_, c) = self.parse_char()?;
match c {
SingleOrMultiChar::Single(c) => buffer.push(c),
SingleOrMultiChar::Multi(first) => {
buffer.push(first);
self.parse_multi_escape_into_string(&mut buffer)?;
}
}
}
c => return self.error_here(PEK::UnexpectedChar(c)),
}
}
let mut chars = buffer.chars();
let literal = match (chars.next(), chars.next()) {
(Some(c), None) => Literal::CharKind(SingleOrMultiChar::Single(c)),
_ => Literal::String(buffer),
};
Ok((last_offset, literal))
}
/// Parses the remaining code points of a braced multi-code-point escape, up to and
/// including the closing `}`, adding each one to the code point set.
///
/// Every code point after the first one parsed here must be preceded by whitespace.
fn parse_multi_escape_into_set(&mut self) -> Result<()> {
    let mut separator_required = false;
    loop {
        let gap = self.skip_whitespace();
        let upcoming = self.must_peek_char()?;
        if upcoming == '}' {
            self.iter.next();
            break;
        }
        if separator_required && gap == 0 {
            return self.error_here(PEK::UnexpectedChar(upcoming));
        }
        separator_required = true;
        let (_, decoded) = self.parse_hex_digits_into_char(1, 6)?;
        self.single_set.add_char(decoded);
    }
    Ok(())
}
/// Parses the remaining code points of a braced multi-code-point escape, up to and
/// including the closing `}`, appending each one to `s`.
///
/// Every code point after the first one parsed here must be preceded by whitespace.
fn parse_multi_escape_into_string(&mut self, s: &mut String) -> Result<()> {
    let mut separator_required = false;
    loop {
        let gap = self.skip_whitespace();
        let upcoming = self.must_peek_char()?;
        if upcoming == '}' {
            self.iter.next();
            break;
        }
        if separator_required && gap == 0 {
            return self.error_here(PEK::UnexpectedChar(upcoming));
        }
        separator_required = true;
        let (_, decoded) = self.parse_hex_digits_into_char(1, 6)?;
        s.push(decoded);
    }
    Ok(())
}
fn parse_escaped_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
self.consume('\\')?;
let (offset, next_char) = self.must_next()?;
match next_char {
'u' | 'x' if self.peek_char() == Some('{') => {
self.iter.next();
self.skip_whitespace();
let (_, first_c) = self.parse_hex_digits_into_char(1, 6)?;
let skipped = self.skip_whitespace();
match self.must_peek()? {
(offset, '}') => {
self.iter.next();
Ok((offset, SingleOrMultiChar::Single(first_c)))
}
(offset, c) if c.is_ascii_hexdigit() && skipped > 0 => {
Ok((offset, SingleOrMultiChar::Multi(first_c)))
}
(_, c) => self.error_here(PEK::UnexpectedChar(c)),
}
}
'u' => {
self.parse_hex_digits_into_char(4, 4)
.map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
}
'x' => {
self.parse_hex_digits_into_char(2, 2)
.map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
}
'U' => {
self.consume('0')?;
self.consume('0')?;
self.parse_hex_digits_into_char(6, 6)
.map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
}
'N' => {
Err(PEK::Unimplemented.with_offset(offset))
}
'a' => Ok((offset, SingleOrMultiChar::Single('\u{0007}'))),
'b' => Ok((offset, SingleOrMultiChar::Single('\u{0008}'))),
't' => Ok((offset, SingleOrMultiChar::Single('\u{0009}'))),
'n' => Ok((offset, SingleOrMultiChar::Single('\u{000A}'))),
'v' => Ok((offset, SingleOrMultiChar::Single('\u{000B}'))),
'f' => Ok((offset, SingleOrMultiChar::Single('\u{000C}'))),
'r' => Ok((offset, SingleOrMultiChar::Single('\u{000D}'))),
_ => Ok((offset, SingleOrMultiChar::Single(next_char))),
}
}
/// Parses a POSIX-style property `:name:]` or `:^name:]`; the opening `[` has already
/// been consumed. A leading `^` marks the set as inverted.
fn parse_property_posix(&mut self) -> Result<()> {
    self.consume(':')?;
    if let '^' = self.must_peek_char()? {
        self.inverted = true;
        self.iter.next();
    }
    self.parse_property_inner(':')?;
    self.consume(']')
}
/// Parses a Perl-style property `\p{...}` or `\P{...}`; the uppercase form marks the
/// set as inverted.
fn parse_property_perl(&mut self) -> Result<()> {
    self.consume('\\')?;
    let (offset, marker) = self.must_next()?;
    if marker == 'P' {
        self.inverted = true;
    } else if marker != 'p' {
        return Err(PEK::UnexpectedChar(marker).with_offset(offset));
    }
    self.consume('{')?;
    self.parse_property_inner('}')
}
/// Parses `name`, `name=value` or `name≠value` up to and including the terminator `end`,
/// then loads the matching code points into this builder.
///
/// `≠`, and a falsy value as reported by `load_property_codepoints`, each toggle
/// `self.inverted`.
fn parse_property_inner(&mut self, end: char) -> Result<()> {
// Offset used for errors raised while loading the property; assigned when `end` is found.
let property_offset;
let mut key_buffer = String::new();
let mut value_buffer = String::new();
enum State {
Begin,
PropertyName,
// Just after `=` / `≠`: at least one value character is required.
PropertyValueBegin,
PropertyValue,
}
use State::*;
let mut state = Begin;
// `false` after `≠`.
let mut equality = true;
loop {
// Whitespace is skipped before every character, including inside names and values.
self.skip_whitespace();
match (state, self.must_peek_char()?) {
// The terminator is only accepted after a non-empty name or value.
(PropertyName | PropertyValue, c) if c == end => {
property_offset = self.must_peek_index()? - 1;
self.iter.next();
break;
}
(Begin | PropertyName, c) if c.is_ascii_alphanumeric() || c == '_' => {
key_buffer.push(c);
self.iter.next();
state = PropertyName;
}
(PropertyName, c @ ('=' | '≠')) => {
equality = c == '=';
self.iter.next();
state = PropertyValueBegin;
}
// Any character other than the terminator is part of the value.
(PropertyValue | PropertyValueBegin, c) if c != end => {
value_buffer.push(c);
self.iter.next();
state = PropertyValue;
}
(_, c) => return self.error_here(PEK::UnexpectedChar(c)),
}
}
if !equality {
self.inverted = !self.inverted;
}
let inverted = self
.load_property_codepoints(&key_buffer, &value_buffer)
.map_err(|e| e.or_with_offset(property_offset))?;
if inverted {
self.inverted = !self.inverted;
}
Ok(())
}
/// Adds the code points selected by property `key` (and `value`, if non-empty) to the set.
///
/// Returns `true` when the property was a binary property given with a falsy value
/// (`false`/`f`/`no`/`n`, case-insensitive), i.e. the caller should invert the result.
///
/// # Errors
///
/// Returns the error of the last loader attempted when no loader succeeds, or
/// `UnknownProperty` when an unrecognised key is given a value that is neither
/// truthy nor falsy.
fn load_property_codepoints(&mut self, key: &str, value: &str) -> Result<bool> {
let mut inverted = false;
// One slot per loader. Each starts as an error and is set to `Ok(name)` when that
// loader should be attempted with `name`.
let mut try_gc = Err(PEK::UnknownProperty.into());
let mut try_sc = Err(PEK::UnknownProperty.into());
let mut try_scx = Err(PEK::UnknownProperty.into());
let mut try_gcb = Err(PEK::UnknownProperty.into());
let mut try_lb = Err(PEK::UnknownProperty.into());
let mut try_sb = Err(PEK::UnknownProperty.into());
let mut try_wb = Err(PEK::UnknownProperty.into());
let mut try_binary = Err(PEK::UnknownProperty.into());
let mut try_ccc: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
let mut try_block: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
if !value.is_empty() {
// `key=value` form: the key selects exactly one loader.
match key.as_bytes() {
GeneralCategory::NAME | GeneralCategory::SHORT_NAME => try_gc = Ok(value),
GraphemeClusterBreak::NAME | GraphemeClusterBreak::SHORT_NAME => {
try_gcb = Ok(value)
}
LineBreak::NAME | LineBreak::SHORT_NAME => try_lb = Ok(value),
Script::NAME | Script::SHORT_NAME => try_sc = Ok(value),
SentenceBreak::NAME | SentenceBreak::SHORT_NAME => try_sb = Ok(value),
WordBreak::NAME | WordBreak::SHORT_NAME => try_wb = Ok(value),
CanonicalCombiningClass::NAME | CanonicalCombiningClass::SHORT_NAME => {
try_ccc = Ok(value)
}
b"Script_Extensions" | b"scx" => try_scx = Ok(value),
b"Block" | b"blk" => try_block = Ok(value),
// Any other key is treated as a binary property with a boolean value.
_ => {
let normalized_value = value.to_ascii_lowercase();
let truthy = matches!(normalized_value.as_str(), "true" | "t" | "yes" | "y");
let falsy = matches!(normalized_value.as_str(), "false" | "f" | "no" | "n");
// Both are `false` here exactly when the value is not a recognised boolean.
if truthy == falsy {
return Err(PEK::UnknownProperty.into());
}
inverted = falsy;
try_binary = Ok(key);
}
}
} else {
// Bare `name` form: try it as a general category, a script, then a binary property.
try_gc = Ok(key);
try_sc = Ok(key);
try_binary = Ok(key);
}
// Attempt the loaders in a fixed order; the first success wins.
try_gc
.and_then(|value| self.try_load_general_category_set(value))
.or_else(|_| try_sc.and_then(|value| self.try_load_script_set(value)))
.or_else(|_| try_scx.and_then(|value| self.try_load_script_extensions_set(value)))
.or_else(|_| try_binary.and_then(|value| self.try_load_ecma262_binary_set(value)))
.or_else(|_| try_gcb.and_then(|value| self.try_load_grapheme_cluster_break_set(value)))
.or_else(|_| try_lb.and_then(|value| self.try_load_line_break_set(value)))
.or_else(|_| try_sb.and_then(|value| self.try_load_sentence_break_set(value)))
.or_else(|_| try_wb.and_then(|value| self.try_load_word_break_set(value)))
.or_else(|_| try_ccc.and_then(|value| self.try_load_ccc_set(value)))
.or_else(|_| try_block.and_then(|value| self.try_load_block_set(value)))?;
Ok(inverted)
}
/// Consumes the builder and returns the collected code points and strings.
///
/// If the set is inverted, all strings are dropped and the code points are complemented.
fn finalize(mut self) -> (CodePointInversionListBuilder, BTreeSet<String>) {
    if !self.inverted {
        return (self.single_set, self.string_set);
    }

    #[cfg(feature = "log")]
    {
        let code_points = self.single_set.clone().build();
        let starts_in_set =
            |s: &String| s.chars().next().is_some_and(|c| code_points.contains(c));
        if !self.string_set.iter().all(starts_in_set) {
            log::info!(
                "Inverting a unicode set with strings. This removes all strings entirely."
            );
        }
    }

    self.string_set.clear();
    self.single_set.complement();
    (self.single_set, self.string_set)
}
/// Parses one character at the cursor, decoding it if it starts with a backslash.
fn parse_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
    let (offset, c) = self.must_peek()?;
    if c == '\\' {
        return self.parse_escaped_char();
    }
    self.iter.next();
    Ok((offset, SingleOrMultiChar::Single(c)))
}
/// Consumes between `min` and `max` hex digits and decodes them as a code point.
///
/// Returns the offset of the last digit and the decoded character.
///
/// # Errors
///
/// `InvalidEscape` if the number is not a valid `char`; errors from
/// `validate_hex_digits` are passed through.
fn parse_hex_digits_into_char(&mut self, min: usize, max: usize) -> Result<(usize, char)> {
    let start = self.must_peek_index()?;
    let end = self.validate_hex_digits(min, max)?;
    let digits = &self.source[start..=end];
    let value = u32::from_str_radix(digits, 16).map_err(|_| PEK::Internal)?;
    match char::try_from(value) {
        Ok(decoded) => Ok((end, decoded)),
        Err(_) => Err(PEK::InvalidEscape.with_offset(end)),
    }
}
/// Consumes up to `max` ASCII hex digits and returns the offset of the last one.
///
/// # Errors
///
/// `UnexpectedChar` if a non-hex character appears before `min` digits were read;
/// `Eof` if the input ends before `max` digits or a non-hex character is seen.
fn validate_hex_digits(&mut self, min: usize, max: usize) -> Result<usize> {
    let mut last = 0;
    let mut seen = 0;
    while seen < max {
        let (at, c) = self.must_peek()?;
        if !c.is_ascii_hexdigit() {
            if seen < min {
                return Err(PEK::UnexpectedChar(c).with_offset(at));
            }
            break;
        }
        self.iter.next();
        last = at;
        seen += 1;
    }
    Ok(last)
}
/// Advances past every character contained in `pat_ws` and returns how many were skipped.
fn skip_whitespace(&mut self) -> usize {
    let mut skipped = 0;
    while self.peek_char().is_some_and(|c| self.pat_ws.contains(c)) {
        self.iter.next();
        skipped += 1;
    }
    skipped
}
/// Consumes the next character and checks that it is `expected`.
///
/// # Errors
///
/// `UnexpectedChar` (with offset) on a mismatch; `Eof` if the input is exhausted.
fn consume(&mut self, expected: char) -> Result<()> {
    let (offset, found) = self.must_next()?;
    if found == expected {
        Ok(())
    } else {
        Err(PEK::UnexpectedChar(found).with_offset(offset))
    }
}
fn must_next(&mut self) -> Result<(usize, char)> {
self.iter.next().ok_or(ParseError {
offset: None,
kind: PEK::Eof,
})
}
fn must_peek(&mut self) -> Result<(usize, char)> {
self.iter.peek().copied().ok_or(ParseError {
offset: None,
kind: PEK::Eof,
})
}
fn must_peek_double(&mut self) -> Result<(usize, char)> {
let mut copy = self.iter.clone();
copy.next();
copy.next().ok_or(ParseError {
offset: None,
kind: PEK::Eof,
})
}
/// Returns the next character without consuming it, or an `Eof` error.
fn must_peek_char(&mut self) -> Result<char> {
    let (_, c) = self.must_peek()?;
    Ok(c)
}
/// Returns the offset of the next character without consuming it, or an `Eof` error.
fn must_peek_index(&mut self) -> Result<usize> {
    let (idx, _) = self.must_peek()?;
    Ok(idx)
}
/// Returns the next character without consuming it, or `None` at the end of input.
fn peek_char(&mut self) -> Option<char> {
    let &(_, c) = self.iter.peek()?;
    Some(c)
}
#[inline]
fn error_here<T>(&mut self, kind: ParseErrorKind) -> Result<T> {
match self.iter.peek() {
None => Err(kind.into()),
Some(&(offset, _)) => Err(kind.with_offset(offset)),
}
}
/// Combines `other_strings` into the collected strings according to `op`.
fn process_strings(&mut self, op: Operation, other_strings: BTreeSet<String>) {
    let collected = &mut self.string_set;
    match op {
        Operation::Union => collected.extend(other_strings),
        // Keep only the strings that are absent from the operand.
        Operation::Difference => collected.retain(|s| !other_strings.contains(s)),
        // Keep only the strings that are also in the operand.
        Operation::Intersection => collected.retain(|s| other_strings.contains(s)),
    }
}
/// Combines `other_chars` into the collected code points according to `op`.
fn process_chars(&mut self, op: Operation, other_chars: CodePointInversionList) {
    let collected = &mut self.single_set;
    let operand = &other_chars;
    match op {
        Operation::Intersection => collected.retain_set(operand),
        Operation::Difference => collected.remove_set(operand),
        Operation::Union => collected.add_set(operand),
    }
}
/// Adds all code points of the general category group called `name` (loose match).
///
/// # Errors
///
/// `UnknownProperty` if the name is not recognised; `Internal` if data cannot be loaded.
fn try_load_general_category_set(&mut self, name: &str) -> Result<()> {
    let parser = PropertyParser::<GeneralCategoryGroup>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let Some(group) = parser.as_borrowed().get_loose(name) else {
        return Err(PEK::UnknownProperty.into());
    };
    let map = CodePointMapData::<GeneralCategory>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let members = map.as_borrowed().get_set_for_value_group(group);
    self.single_set
        .add_set(&members.to_code_point_inversion_list());
    Ok(())
}
fn try_get_script(&self, name: &str) -> Result<Script> {
let name_map = PropertyParser::<Script>::try_new_unstable(self.property_provider)
.map_err(|_| PEK::Internal)?;
name_map.as_borrowed().get_loose(name).ok_or(ParseError {
offset: None,
kind: PEK::UnknownProperty,
})
}
/// Adds all code points whose Script property is the script called `name`.
fn try_load_script_set(&mut self, name: &str) -> Result<()> {
    let script = self.try_get_script(name)?;
    let map = CodePointMapData::<Script>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let members = map.as_borrowed().get_set_for_value(script);
    self.single_set
        .add_set(&members.to_code_point_inversion_list());
    Ok(())
}
/// Adds all code points whose Script_Extensions include the script called `name`.
fn try_load_script_extensions_set(&mut self, name: &str) -> Result<()> {
    // Load the data first so that a data failure is reported before an unknown name.
    let extensions = ScriptWithExtensions::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let script = self.try_get_script(name)?;
    let members = extensions.as_borrowed().get_script_extensions_set(script);
    self.single_set.add_set(&members);
    Ok(())
}
/// Adds all code points of the binary property `name`, looked up via the ECMA-262 loader.
///
/// # Errors
///
/// `UnknownProperty` if the loader does not know the name; `Internal` on a data error.
fn try_load_ecma262_binary_set(&mut self, name: &str) -> Result<()> {
    let lookup =
        CodePointSetData::try_new_for_ecma262_unstable(self.property_provider, name.as_bytes());
    let members = match lookup {
        None => return Err(PEK::UnknownProperty.into()),
        Some(Err(_data_error)) => return Err(PEK::Internal.into()),
        Some(Ok(data)) => data,
    };
    self.single_set
        .add_set(&members.to_code_point_inversion_list());
    Ok(())
}
/// Adds all code points whose Grapheme_Cluster_Break value is called `name` (loose match).
///
/// # Errors
///
/// `UnknownProperty` if the name is not recognised; `Internal` if data cannot be loaded.
fn try_load_grapheme_cluster_break_set(&mut self, name: &str) -> Result<()> {
    let names = PropertyParser::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let Some(wanted) = names.as_borrowed().get_loose(name) else {
        return Err(PEK::UnknownProperty.into());
    };
    let map = CodePointMapData::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let members = map.as_borrowed().get_set_for_value(wanted);
    self.single_set
        .add_set(&members.to_code_point_inversion_list());
    Ok(())
}
/// Adds all code points whose Line_Break value is called `name` (loose match).
///
/// # Errors
///
/// `UnknownProperty` if the name is not recognised; `Internal` if data cannot be loaded.
fn try_load_line_break_set(&mut self, name: &str) -> Result<()> {
    let names = PropertyParser::<LineBreak>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let Some(wanted) = names.as_borrowed().get_loose(name) else {
        return Err(PEK::UnknownProperty.into());
    };
    let map = CodePointMapData::<LineBreak>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let members = map.as_borrowed().get_set_for_value(wanted);
    self.single_set
        .add_set(&members.to_code_point_inversion_list());
    Ok(())
}
/// Adds all code points whose Sentence_Break value is called `name` (loose match).
///
/// # Errors
///
/// `UnknownProperty` if the name is not recognised; `Internal` if data cannot be loaded.
fn try_load_sentence_break_set(&mut self, name: &str) -> Result<()> {
    let names = PropertyParser::<SentenceBreak>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let Some(wanted) = names.as_borrowed().get_loose(name) else {
        return Err(PEK::UnknownProperty.into());
    };
    let map = CodePointMapData::<SentenceBreak>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let members = map.as_borrowed().get_set_for_value(wanted);
    self.single_set
        .add_set(&members.to_code_point_inversion_list());
    Ok(())
}
/// Adds all code points whose Word_Break value is called `name` (loose match).
///
/// # Errors
///
/// `UnknownProperty` if the name is not recognised; `Internal` if data cannot be loaded.
fn try_load_word_break_set(&mut self, name: &str) -> Result<()> {
    let names = PropertyParser::<WordBreak>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let Some(wanted) = names.as_borrowed().get_loose(name) else {
        return Err(PEK::UnknownProperty.into());
    };
    let map = CodePointMapData::<WordBreak>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let members = map.as_borrowed().get_set_for_value(wanted);
    self.single_set
        .add_set(&members.to_code_point_inversion_list());
    Ok(())
}
/// Adds all code points whose Canonical_Combining_Class is `name`, given either as a
/// value name (loose match) or as a number.
///
/// # Errors
///
/// `UnknownProperty` if `name` is neither a known name nor a parsable number;
/// `Internal` if data cannot be loaded.
fn try_load_ccc_set(&mut self, name: &str) -> Result<()> {
    let names = PropertyParser::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    // Fallback used only when the name lookup fails: interpret `name` as a number.
    let numeric = || {
        name.parse()
            .ok()
            .map(CanonicalCombiningClass::from_icu4c_value)
    };
    let wanted = names
        .as_borrowed()
        .get_loose(name)
        .or_else(numeric)
        .ok_or(PEK::UnknownProperty)?;
    let map = CodePointMapData::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
        .map_err(|_| PEK::Internal)?;
    let members = map.as_borrowed().get_set_for_value(wanted);
    self.single_set
        .add_set(&members.to_code_point_inversion_list());
    Ok(())
}
/// Adds the code points of the Unicode block called `name` to `self.single_set`.
///
/// Only a hard-coded subset of blocks is supported; `name` is compared
/// ASCII-case-insensitively against that list.
///
/// # Errors
///
/// Returns `PEK::Unimplemented` for any block that is not in the hard-coded list
/// (and logs a warning when the `log` feature is enabled).
fn try_load_block_set(&mut self, name: &str) -> Result<()> {
    // Block boundaries are inclusive on both ends (Arabic is U+0600..=U+06FF, Thaana is
    // U+0780..=U+07BF), so the ranges must be `..=`; a half-open `..` range would
    // silently drop the last code point of each block.
    let block = match name.to_ascii_lowercase().as_str() {
        "arabic" => '\u{0600}'..='\u{06FF}',
        "thaana" => '\u{0780}'..='\u{07BF}',
        _ => {
            #[cfg(feature = "log")]
            log::warn!("Skipping :block={name}:");
            return Err(PEK::Unimplemented.into());
        }
    };
    self.single_set.add_range(block);
    Ok(())
}
}
/// Parses a UnicodeSet from the start of `source`, using the compiled-in (`Baked`)
/// property data and no variables.
///
/// On success returns the parsed set together with the number of bytes of `source`
/// that were consumed.
#[cfg(feature = "compiled_data")]
pub fn parse(source: &str) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    let provider = &Baked;
    parse_unstable(source, provider)
}
/// Parses a UnicodeSet from the start of `source`, using the compiled-in (`Baked`)
/// property data and resolving variables through `variable_map`.
///
/// On success returns the parsed set together with the number of bytes of `source`
/// that were consumed.
#[cfg(feature = "compiled_data")]
pub fn parse_with_variables(
    source: &str,
    variable_map: &VariableMap<'_>,
) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    let provider = &Baked;
    parse_unstable_with_variables(source, variable_map, provider)
}
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, parse_with_variables)]
pub fn parse_unstable_with_variables<P>(
    source: &str,
    variable_map: &VariableMap<'_>,
    provider: &P,
) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
where
    P: ?Sized
        + DataProvider<PropertyBinaryAlphabeticV1>
        + DataProvider<PropertyBinaryAsciiHexDigitV1>
        + DataProvider<PropertyBinaryBidiControlV1>
        + DataProvider<PropertyBinaryBidiMirroredV1>
        + DataProvider<PropertyBinaryCasedV1>
        + DataProvider<PropertyBinaryCaseIgnorableV1>
        + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
        + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
        + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
        + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
        + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
        + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
        + DataProvider<PropertyBinaryDashV1>
        + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
        + DataProvider<PropertyBinaryDeprecatedV1>
        + DataProvider<PropertyBinaryDiacriticV1>
        + DataProvider<PropertyBinaryEmojiComponentV1>
        + DataProvider<PropertyBinaryEmojiModifierBaseV1>
        + DataProvider<PropertyBinaryEmojiModifierV1>
        + DataProvider<PropertyBinaryEmojiPresentationV1>
        + DataProvider<PropertyBinaryEmojiV1>
        + DataProvider<PropertyBinaryExtendedPictographicV1>
        + DataProvider<PropertyBinaryExtenderV1>
        + DataProvider<PropertyBinaryGraphemeBaseV1>
        + DataProvider<PropertyBinaryGraphemeExtendV1>
        + DataProvider<PropertyBinaryHexDigitV1>
        + DataProvider<PropertyBinaryIdContinueV1>
        + DataProvider<PropertyBinaryIdeographicV1>
        + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
        + DataProvider<PropertyBinaryIdStartV1>
        + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
        + DataProvider<PropertyBinaryJoinControlV1>
        + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
        + DataProvider<PropertyBinaryLowercaseV1>
        + DataProvider<PropertyBinaryMathV1>
        + DataProvider<PropertyBinaryNoncharacterCodePointV1>
        + DataProvider<PropertyBinaryPatternSyntaxV1>
        + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
        + DataProvider<PropertyBinaryQuotationMarkV1>
        + DataProvider<PropertyBinaryRadicalV1>
        + DataProvider<PropertyBinaryRegionalIndicatorV1>
        + DataProvider<PropertyBinarySentenceTerminalV1>
        + DataProvider<PropertyBinarySoftDottedV1>
        + DataProvider<PropertyBinaryTerminalPunctuationV1>
        + DataProvider<PropertyBinaryUnifiedIdeographV1>
        + DataProvider<PropertyBinaryUppercaseV1>
        + DataProvider<PropertyBinaryVariationSelectorV1>
        + DataProvider<PropertyBinaryWhiteSpaceV1>
        + DataProvider<PropertyBinaryXidContinueV1>
        + DataProvider<PropertyBinaryXidStartV1>
        + DataProvider<PropertyEnumCanonicalCombiningClassV1>
        + DataProvider<PropertyEnumGeneralCategoryV1>
        + DataProvider<PropertyEnumGraphemeClusterBreakV1>
        + DataProvider<PropertyEnumLineBreakV1>
        + DataProvider<PropertyEnumScriptV1>
        + DataProvider<PropertyEnumSentenceBreakV1>
        + DataProvider<PropertyEnumWordBreakV1>
        + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
        + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
        + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
        + DataProvider<PropertyNameParseLineBreakV1>
        + DataProvider<PropertyNameParseScriptV1>
        + DataProvider<PropertyNameParseSentenceBreakV1>
        + DataProvider<PropertyNameParseWordBreakV1>
        + DataProvider<PropertyScriptWithExtensionsV1>,
{
    // The builder advances this iterator; whatever it leaves unconsumed tells us how
    // many bytes of `source` belonged to the parsed set.
    let mut chars = source.char_indices().peekable();

    // The builder is handed three binary properties as inversion lists. Each data
    // handle is kept in its own binding because the list borrows from it.
    let xid_start_data =
        CodePointSetData::try_new_unstable::<XidStart>(provider).map_err(|_| PEK::Internal)?;
    let xid_start_list = xid_start_data.to_code_point_inversion_list();
    let xid_continue_data =
        CodePointSetData::try_new_unstable::<XidContinue>(provider).map_err(|_| PEK::Internal)?;
    let xid_continue_list = xid_continue_data.to_code_point_inversion_list();
    let pat_ws_data = CodePointSetData::try_new_unstable::<PatternWhiteSpace>(provider)
        .map_err(|_| PEK::Internal)?;
    let pat_ws_list = pat_ws_data.to_code_point_inversion_list();

    let mut set_builder = UnicodeSetBuilder::new_internal(
        &mut chars,
        source,
        variable_map,
        &xid_start_list,
        &xid_continue_list,
        &pat_ws_list,
        provider,
    );
    set_builder.parse_unicode_set()?;
    let (code_point_builder, string_set) = set_builder.finalize();
    let code_points = code_point_builder.build();

    // The strings are sorted before being handed to the combined list type.
    let mut strings: Vec<_> = string_set.into_iter().collect();
    strings.sort();
    let string_list = (&strings).into();

    let parsed_set = CodePointInversionListAndStringList::try_from(code_points, string_list)
        .map_err(|_| PEK::Internal)?;

    // Offset of the first character the parser did not consume, or the full length if
    // the whole input was used.
    let consumed = chars.peek().map_or(source.len(), |&(offset, _)| offset);

    Ok((parsed_set, consumed))
}
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, parse)]
pub fn parse_unstable<P>(
source: &str,
provider: &P,
) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
where
P: ?Sized
+ DataProvider<PropertyBinaryAlphabeticV1>
+ DataProvider<PropertyBinaryAsciiHexDigitV1>
+ DataProvider<PropertyBinaryBidiControlV1>
+ DataProvider<PropertyBinaryBidiMirroredV1>
+ DataProvider<PropertyBinaryCasedV1>
+ DataProvider<PropertyBinaryCaseIgnorableV1>
+ DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
+ DataProvider<PropertyBinaryChangesWhenCasemappedV1>
+ DataProvider<PropertyBinaryChangesWhenLowercasedV1>
+ DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
+ DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
+ DataProvider<PropertyBinaryChangesWhenUppercasedV1>
+ DataProvider<PropertyBinaryDashV1>
+ DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
+ DataProvider<PropertyBinaryDeprecatedV1>
+ DataProvider<PropertyBinaryDiacriticV1>
+ DataProvider<PropertyBinaryEmojiComponentV1>
+ DataProvider<PropertyBinaryEmojiModifierBaseV1>
+ DataProvider<PropertyBinaryEmojiModifierV1>
+ DataProvider<PropertyBinaryEmojiPresentationV1>
+ DataProvider<PropertyBinaryEmojiV1>
+ DataProvider<PropertyBinaryExtendedPictographicV1>
+ DataProvider<PropertyBinaryExtenderV1>
+ DataProvider<PropertyBinaryGraphemeBaseV1>
+ DataProvider<PropertyBinaryGraphemeExtendV1>
+ DataProvider<PropertyBinaryHexDigitV1>
+ DataProvider<PropertyBinaryIdContinueV1>
+ DataProvider<PropertyBinaryIdeographicV1>
+ DataProvider<PropertyBinaryIdsBinaryOperatorV1>
+ DataProvider<PropertyBinaryIdStartV1>
+ DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
+ DataProvider<PropertyBinaryJoinControlV1>
+ DataProvider<PropertyBinaryLogicalOrderExceptionV1>
+ DataProvider<PropertyBinaryLowercaseV1>
+ DataProvider<PropertyBinaryMathV1>
+ DataProvider<PropertyBinaryNoncharacterCodePointV1>
+ DataProvider<PropertyBinaryPatternSyntaxV1>
+ DataProvider<PropertyBinaryPatternWhiteSpaceV1>
+ DataProvider<PropertyBinaryQuotationMarkV1>
+ DataProvider<PropertyBinaryRadicalV1>
+ DataProvider<PropertyBinaryRegionalIndicatorV1>
+ DataProvider<PropertyBinarySentenceTerminalV1>
+ DataProvider<PropertyBinarySoftDottedV1>
+ DataProvider<PropertyBinaryTerminalPunctuationV1>
+ DataProvider<PropertyBinaryUnifiedIdeographV1>
+ DataProvider<PropertyBinaryUppercaseV1>
+ DataProvider<PropertyBinaryVariationSelectorV1>
+ DataProvider<PropertyBinaryWhiteSpaceV1>
+ DataProvider<PropertyBinaryXidContinueV1>
+ DataProvider<PropertyBinaryXidStartV1>
+ DataProvider<PropertyEnumCanonicalCombiningClassV1>
+ DataProvider<PropertyEnumGeneralCategoryV1>
+ DataProvider<PropertyEnumGraphemeClusterBreakV1>
+ DataProvider<PropertyEnumLineBreakV1>
+ DataProvider<PropertyEnumScriptV1>
+ DataProvider<PropertyEnumSentenceBreakV1>
+ DataProvider<PropertyEnumWordBreakV1>
+ DataProvider<PropertyNameParseCanonicalCombiningClassV1>
+ DataProvider<PropertyNameParseGeneralCategoryMaskV1>
+ DataProvider<PropertyNameParseGraphemeClusterBreakV1>
+ DataProvider<PropertyNameParseLineBreakV1>
+ DataProvider<PropertyNameParseScriptV1>
+ DataProvider<PropertyNameParseSentenceBreakV1>
+ DataProvider<PropertyNameParseWordBreakV1>
+ DataProvider<PropertyScriptWithExtensionsV1>,
{
let dummy = Default::default();
parse_unstable_with_variables(source, &dummy, provider)
}
#[cfg(test)]
mod tests {
use core::ops::RangeInclusive;
use std::collections::HashSet;
use super::*;
/// Test helper: reads `s` as consecutive (start, end) pairs of code points and yields
/// one inclusive `u32` range per pair.
///
/// For example, `"azAZ"` yields `'a'..='z'` followed by `'A'..='Z'`. In debug builds
/// the helper asserts that `s` holds an even number of code points.
fn range_iter_from_str(s: &str) -> impl Iterator<Item = RangeInclusive<u32>> {
    debug_assert_eq!(
        s.chars().count() % 2,
        0,
        "string \"{}\" does not contain an even number of code points",
        s.escape_debug()
    );
    let code_points: Vec<u32> = s.chars().map(u32::from).collect();
    // The ranges are collected eagerly so the returned iterator does not borrow `s`.
    let ranges: Vec<RangeInclusive<u32>> = code_points
        .chunks_exact(2)
        .map(|pair| match *pair {
            [lower, upper] => lower..=upper,
            _ => unreachable!("chunks_exact(2) only yields two-element slices"),
        })
        .collect();
    ranges.into_iter()
}
/// Test helper: asserts that a parsed set holds exactly the expected contents.
///
/// * `source` is only used in failure messages.
/// * `single` lists the expected code point ranges; they are compared with the actual
///   ranges as unordered sets.
/// * `strings` lists the expected strings. Each must be contained in the set, and the
///   set's total size must equal its code point count plus the number of strings.
fn assert_set_equality<'a>(
    source: &str,
    cpinvlistandstrlist: &CodePointInversionListAndStringList,
    single: impl Iterator<Item = RangeInclusive<u32>>,
    strings: impl Iterator<Item = &'a str>,
) {
    let code_points = cpinvlistandstrlist.code_points();

    let expected_ranges: HashSet<_> = single.collect();
    let actual_ranges: HashSet<_> = code_points.iter_ranges().collect();
    assert_eq!(
        actual_ranges,
        expected_ranges,
        "got unexpected ranges {:?}, expected {:?} for parsed set \"{}\"",
        actual_ranges,
        expected_ranges,
        source.escape_debug()
    );

    // Count the expected strings while checking membership; the count is what lets the
    // size comparison below catch strings that are in the set but were not expected.
    let mut expected_string_count = 0;
    for expected_string in strings {
        assert!(
            cpinvlistandstrlist.contains_str(expected_string),
            "missing string \"{}\" from parsed set \"{}\"",
            expected_string.escape_debug(),
            source.escape_debug()
        );
        expected_string_count += 1;
    }

    let expected_size = code_points.size() + expected_string_count;
    let actual_size = cpinvlistandstrlist.size();
    assert_eq!(
        actual_size,
        expected_size,
        "got unexpected size {}, expected {} for parsed set \"{}\"",
        actual_size,
        expected_size,
        source.escape_debug()
    );
}
/// Test helper: asserts that parsing `source` with the variables in `vm` fails, and
/// that the error rendered with `fmt_with_source` equals `expected_err`.
fn assert_is_error_and_message_eq(source: &str, expected_err: &str, vm: &VariableMap<'_>) {
    let err = match parse_with_variables(source, vm) {
        Err(err) => err,
        Ok(_) => panic!("{source} does not cause an error!"),
    };
    let rendered = err.fmt_with_source(source).to_string();
    assert_eq!(rendered, expected_err);
}
/// Checks the contents of sets parsed with variable maps holding chars, a string, or a set.
///
/// In each case the expected code points are written as a string of (start, end) pairs,
/// as understood by `range_iter_from_str`.
#[test]
fn test_semantics_with_variables() {
    // $a = 'a', $var2 = 'z'
    let mut map_char_char = VariableMap::default();
    map_char_char.insert_char("a".to_string(), 'a').unwrap();
    map_char_char.insert_char("var2".to_string(), 'z').unwrap();

    // $hehe = '-'
    let mut map_headache = VariableMap::default();
    map_headache.insert_char("hehe".to_string(), '-').unwrap();

    // $a = 'a', $var2 = "abc"
    let mut map_char_string = VariableMap::default();
    map_char_string.insert_char("a".to_string(), 'a').unwrap();
    map_char_string
        .insert_string("var2".to_string(), "abc".to_string())
        .unwrap();

    // $a = 'a', $set = [a-z {Hello,\ World!}]
    let (set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
    let mut map_char_set = VariableMap::default();
    map_char_set.insert_char("a".to_string(), 'a').unwrap();
    map_char_set.insert_set("set".to_string(), set).unwrap();

    // (variable map, source, expected code point ranges, expected strings)
    let cases: Vec<(_, _, _, Vec<&str>)> = vec![
        (&map_char_char, "[$a]", "aa", vec![]),
        (&map_char_char, "[ $a ]", "aa", vec![]),
        (&map_char_char, "[$a$]", "aa\u{ffff}\u{ffff}", vec![]),
        (&map_char_char, "[$a$ ]", "aa\u{ffff}\u{ffff}", vec![]),
        (&map_char_char, "[$a$var2]", "aazz", vec![]),
        (&map_char_char, "[$a - $var2]", "az", vec![]),
        (&map_char_char, "[$a-$var2]", "az", vec![]),
        (&map_headache, "[a $hehe z]", "aazz--", vec![]),
        (
            &map_char_char,
            "[[$]var2]",
            "\u{ffff}\u{ffff}vvaarr22",
            vec![],
        ),
        (&map_char_char, r"[\$var2]", "$$vvaarr22", vec![]),
        (&map_char_char, r"[\\$var2]", r"\\zz", vec![]),
        (&map_char_char, "[{$a}]", "", vec!["$a"]),
        (&map_char_set, "[$set & [b-z]]", "bz", vec![]),
        (&map_char_set, "[[a-z]-[b-z]]", "aa", vec![]),
        (&map_char_set, "[$set-[b-z]]", "aa", vec!["Hello, World!"]),
        (&map_char_set, "[$set-$set]", "", vec![]),
        (&map_char_set, "[[a-zA]-$set]", "AA", vec![]),
        (&map_char_set, "[$set[b-z]]", "az", vec!["Hello, World!"]),
        (&map_char_set, "[[a-a]$set]", "az", vec!["Hello, World!"]),
        (&map_char_set, "$set", "az", vec!["Hello, World!"]),
        (&map_char_string, "[$var2]", "", vec!["abc"]),
    ];

    for (variable_map, source, expected_ranges, expected_strings) in cases {
        let (parsed_set, consumed) = match parse_with_variables(source, variable_map) {
            Ok(parsed) => parsed,
            Err(err) => panic!(
                "{source} results in an error: {}",
                err.fmt_with_source(source)
            ),
        };
        assert_eq!(consumed, source.len(), "{source:?} is not fully consumed");
        assert_set_equality(
            source,
            &parsed_set,
            range_iter_from_str(expected_ranges),
            expected_strings.into_iter(),
        );
    }
}
/// Checks the contents of sets parsed without variables.
///
/// In each case the expected code points are written as a string of (start, end) pairs,
/// as understood by `range_iter_from_str`, and every source must be consumed completely.
#[test]
fn test_semantics() {
    const ALL_CHARS: &str = "\x00\u{10FFFF}";

    // (source, expected code point ranges, expected strings)
    let cases: Vec<(_, _, Vec<&str>)> = vec![
        // simple elements and ranges
        ("[a]", "aa", vec![]),
        ("[]", "", vec![]),
        ("[qax]", "aaqqxx", vec![]),
        ("[a-z]", "az", vec![]),
        // literal '-' in various positions
        ("[--]", "--", vec![]),
        ("[a-b-]", "ab--", vec![]),
        ("[[a-b]-]", "ab--", vec![]),
        ("[{ab}-]", "--", vec!["ab"]),
        ("[-a-b]", "ab--", vec![]),
        ("[-a]", "--aa", vec![]),
        // an escaped newline is an element ...
        (r"[\n]", "\n\n", vec![]),
        ("[\\\n]", "\n\n", vec![]),
        // ... whereas each of these unescaped characters contributes nothing to the set
        ("[\n]", "", vec![]),
        ("[\u{9}]", "", vec![]),
        ("[\u{A}]", "", vec![]),
        ("[\u{B}]", "", vec![]),
        ("[\u{C}]", "", vec![]),
        ("[\u{D}]", "", vec![]),
        ("[\u{20}]", "", vec![]),
        ("[\u{85}]", "", vec![]),
        ("[\u{200E}]", "", vec![]),
        ("[\u{200F}]", "", vec![]),
        ("[\u{2028}]", "", vec![]),
        ("[\u{2029}]", "", vec![]),
        // a '$' directly before the closing bracket yields U+FFFF
        ("[^[^$]]", "\u{ffff}\u{ffff}", vec![]),
        ("[^[^ $]]", "\u{ffff}\u{ffff}", vec![]),
        ("[^[^ $ ]]", "\u{ffff}\u{ffff}", vec![]),
        ("[^[^a$]]", "aa\u{ffff}\u{ffff}", vec![]),
        ("[^[^a$ ]]", "aa\u{ffff}\u{ffff}", vec![]),
        ("[-]", "--", vec![]),
        ("[ - ]", "--", vec![]),
        ("[ - - ]", "--", vec![]),
        ("[ a-b - ]", "ab--", vec![]),
        ("[ -a]", "--aa", vec![]),
        ("[a-]", "--aa", vec![]),
        ("[a- ]", "--aa", vec![]),
        ("[ :]", "::", vec![]),
        ("[ :L:]", "::LL", vec![]),
        // U+00A0 is kept as an element, unlike the skipped characters above
        ("[\u{A0}]", "\u{A0}\u{A0}", vec![]),
        ("[$]", "\u{ffff}\u{ffff}", vec![]),
        (r"[\$]", "$$", vec![]),
        ("[{$}]", "$$", vec![]),
        // set operations
        ("[[a-z]&[b-z]]", "bz", vec![]),
        ("[[a-z]-[b-z]]", "aa", vec![]),
        ("[[a-z][b-z]]", "az", vec![]),
        ("[[a-a][b-z]]", "az", vec![]),
        ("[[a-z{abc}]&[b-z{abc}{abx}]]", "bz", vec!["abc"]),
        ("[[{abx}a-z{abc}]&[b-z{abc}]]", "bz", vec!["abc"]),
        ("[[a-z{abx}]-[{abx}b-z{abc}]]", "aa", vec![]),
        ("[[a-z{abx}{abc}]-[{abx}b-z]]", "aa", vec!["abc"]),
        ("[[a-z{abc}][b-z{abx}]]", "az", vec!["abc", "abx"]),
        ("[{this is a minus -}]", "", vec!["thisisaminus-"]),
        ("[[a-a][b-z] - [a-d][e-z]]", "ez", vec![]),
        ("[[a-a][b-z] - [a-d]&[e-z]]", "ez", vec![]),
        ("[[a-a][b-z] - [a-z][]]", "", vec![]),
        ("[[a-a][b-z] - [a-z]&[]]", "", vec![]),
        ("[[a-a][b-z] & [a-z]-[]]", "az", vec![]),
        ("[[a-a][b-z] & []-[a-z]]", "", vec![]),
        ("[[a-a][b-z] & [a-b][x-z]]", "abxz", vec![]),
        ("[[a-z]-[a-b]-[y-z]]", "cx", vec![]),
        // escapes
        (r"[\x61-\x63]", "ac", vec![]),
        (r"[a-\x63]", "ac", vec![]),
        (r"[\x61-c]", "ac", vec![]),
        (r"[\u0061-\x63]", "ac", vec![]),
        (r"[\U00000061-\x63]", "ac", vec![]),
        (r"[\x{61}-\x63]", "ac", vec![]),
        (r"[\u{61}-\x63]", "ac", vec![]),
        (r"[\u{61}{hello\ world}]", "aa", vec!["hello world"]),
        (r"[{hello\ world}\u{61}]", "aa", vec!["hello world"]),
        (r"[{h\u{65}llo\ world}]", "", vec!["hello world"]),
        // complement
        (r"[^]", ALL_CHARS, vec![]),
        (r"[[^]-[^a-z]]", "az", vec![]),
        (r"[^{h\u{65}llo\ world}]", ALL_CHARS, vec![]),
        (
            r"[^[{h\u{65}llo\ world}]-[{hello\ world}]]",
            ALL_CHARS,
            vec![],
        ),
        (
            r"[^[\x00-\U0010FFFF]-[\u0100-\U0010FFFF]]",
            "\u{100}\u{10FFFF}",
            vec![],
        ),
        (r"[^[^a-z]]", "az", vec![]),
        (r"[^[^\^]]", "^^", vec![]),
        // multi-code-point escapes
        (r"[{\x{61 0062 063}}]", "", vec!["abc"]),
        (r"[\x{61 0062 063}]", "ac", vec![]),
        // binary properties
        (r"[:AHex:]", "09afAF", vec![]),
        (r"[:AHex=True:]", "09afAF", vec![]),
        (r"[:AHex=T:]", "09afAF", vec![]),
        (r"[:AHex=Yes:]", "09afAF", vec![]),
        (r"[:AHex=Y:]", "09afAF", vec![]),
        (r"[:^AHex≠True:]", "09afAF", vec![]),
        (r"[:AHex≠False:]", "09afAF", vec![]),
        (r"[[:^AHex≠False:]&[\x00-\x10]]", "\0\x10", vec![]),
        (r"\p{AHex}", "09afAF", vec![]),
        (r"\p{AHex=True}", "09afAF", vec![]),
        (r"\p{AHex=T}", "09afAF", vec![]),
        (r"\p{AHex=Yes}", "09afAF", vec![]),
        (r"\p{AHex=Y}", "09afAF", vec![]),
        (r"\P{AHex≠True}", "09afAF", vec![]),
        (r"\p{AHex≠False}", "09afAF", vec![]),
        // general category
        (r"[[:gc=lower-case-letter:]&[a-zA-Z]]", "az", vec![]),
        (r"[[:lower case letter:]&[a-zA-Z]]", "az", vec![]),
        (
            r"[[[:L:]-[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]][[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]-[:L:]]]",
            "",
            vec![],
        ),
        // script and script extensions
        (r"[[:sc=latn:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:sc=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:Latin:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:latn:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:scx=latn:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:scx=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
        (r"[[:scx=Hira:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
        (r"[[:sc=Hira:]&[\u30FC]]", "", vec![]),
        (r"[[:scx=Kana:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
        (r"[[:sc=Kana:]&[\u30FC]]", "", vec![]),
        (r"[[:sc=Common:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
        // segmentation properties
        (
            r"\p{Grapheme_Cluster_Break=ZWJ}",
            "\u{200D}\u{200D}",
            vec![],
        ),
        (
            r"\p{Sentence_Break=ATerm}",
            "\u{002E}\u{002E}\u{2024}\u{2024}\u{FE52}\u{FE52}\u{FF0E}\u{FF0E}",
            vec![],
        ),
        (r"\p{Word_Break=Single_Quote}", "\u{0027}\u{0027}", vec![]),
        // syntax characters in unusual places
        (r"[\^a]", "^^aa", vec![]),
        (r"[{{}]", "{{", vec![]),
        (r"[{}}]", "}}", vec![""]),
        (r"[}]", "}}", vec![]),
        (r"[{$var}]", "", vec!["$var"]),
        (r"[{[a-z}]", "", vec!["[a-z"]),
        (r"[ { [ a - z } ]", "", vec!["[a-z"]),
    ];

    for (source, expected_ranges, expected_strings) in cases {
        let (parsed_set, consumed) = match parse(source) {
            Ok(parsed) => parsed,
            Err(err) => panic!(
                "{source} results in an error: {}",
                err.fmt_with_source(source)
            ),
        };
        assert_eq!(consumed, source.len());
        assert_set_equality(
            source,
            &parsed_set,
            range_iter_from_str(expected_ranges),
            expected_strings.into_iter(),
        );
    }
}
/// Checks the rendered error message for inputs that must fail to parse when variables
/// are in play.
#[test]
fn test_error_messages_with_variables() {
    // $a = 'a', $var2 = 'z'
    let mut map_char_char = VariableMap::default();
    map_char_char.insert_char("a".to_string(), 'a').unwrap();
    map_char_char.insert_char("var2".to_string(), 'z').unwrap();

    // $a = 'a', $var2 = "abc"
    let mut map_char_string = VariableMap::default();
    map_char_string.insert_char("a".to_string(), 'a').unwrap();
    map_char_string
        .insert_string("var2".to_string(), "abc".to_string())
        .unwrap();

    // $a = 'a', $set = [a-z {Hello,\ World!}]
    let (set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
    let mut map_char_set = VariableMap::default();
    map_char_set.insert_char("a".to_string(), 'a').unwrap();
    map_char_set.insert_set("set".to_string(), set).unwrap();

    // (variable map, source, expected rendering of the error)
    let cases = [
        (&map_char_char, "[$$a]", r"[$$a← error: unexpected variable"),
        (
            &map_char_char,
            "[$ a]",
            r"[$ a← error: unexpected character 'a'",
        ),
        (&map_char_char, "$a", r"$a← error: unexpected variable"),
        (&map_char_char, "$", r"$← error: unexpected end of input"),
        (
            &map_char_string,
            "[$var2-$a]",
            r"[$var2-$a← error: unexpected variable",
        ),
        (
            &map_char_string,
            "[$a-$var2]",
            r"[$a-$var2← error: unexpected variable",
        ),
        (
            &map_char_set,
            "[$a-$set]",
            r"[$a-$set← error: unexpected variable",
        ),
        (
            &map_char_set,
            "[$set-$a]",
            r"[$set-$a← error: unexpected variable",
        ),
        (
            &map_char_set,
            "[$=]",
            "[$=← error: unexpected character '='",
        ),
    ];

    for (variables, input, expected_message) in cases {
        assert_is_error_and_message_eq(input, expected_message, variables);
    }
}
/// Checks the rendered error message for inputs that must fail to parse without any
/// variables defined.
#[test]
fn test_error_messages() {
    // (source, expected rendering of the error)
    let cases = [
        (r"[a-z[\]]", r"[a-z[\]]← error: unexpected end of input"),
        (r"", r"← error: unexpected end of input"),
        (r"[{]", r"[{]← error: unexpected end of input"),
        (
            r"[:general_category:]",
            r"[:general_category← error: unknown property",
        ),
        (r"[:ll=true:]", r"[:ll=true← error: unknown property"),
        (r"[:=", r"[:=← error: unexpected character '='"),
        (r"[::]", r"[::← error: unexpected character ':'"),
        (r"[:=hello:]", r"[:=← error: unexpected character '='"),
        (r"[:gc=:]", r"[:gc=:← error: unexpected character ':'"),
        (r"[\xag]", r"[\xag← error: unexpected character 'g'"),
        (r"[a-b-z]", r"[a-b-z← error: unexpected character 'z'"),
        (r"[a-\p{ll}]", r"[a-\← error: unexpected character '\\'"),
        (r"[a-&]", r"[a-&← error: unexpected character '&'"),
        (r"[a&b]", r"[a&← error: unexpected character '&'"),
        (r"[[set]&b]", r"[[set]&b← error: unexpected character 'b'"),
        (r"[[set]&]", r"[[set]&]← error: unexpected character ']'"),
        (r"[a-\x60]", r"[a-\x60← error: unexpected character '`'"),
        (r"[a-`]", r"[a-`← error: unexpected character '`'"),
        (r"[\x{6g}]", r"[\x{6g← error: unexpected character 'g'"),
        (r"[\x{g}]", r"[\x{g← error: unexpected character 'g'"),
        (r"[\x{}]", r"[\x{}← error: unexpected character '}'"),
        (
            r"[\x{dabeef}]",
            r"[\x{dabeef← error: invalid escape sequence",
        ),
        (
            r"[\x{10ffff0}]",
            r"[\x{10ffff0← error: unexpected character '0'",
        ),
        (
            r"[\x{11ffff}]",
            r"[\x{11ffff← error: invalid escape sequence",
        ),
        (
            r"[\x{10ffff 1 10ffff0}]",
            r"[\x{10ffff 1 10ffff0← error: unexpected character '0'",
        ),
        (r"ä", r"ä← error: unexpected character 'ä'"),
        // NOTE(review): the next two cases are identical; possibly one of them was
        // meant to exercise a different input.
        (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
        (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
        (
            r"[\xe5-\xe4]",
            r"[\xe5-\xe4← error: unexpected character 'ä'",
        ),
        (r"[\xe5-ä]", r"[\xe5-ä← error: unexpected character 'ä'"),
        (r"[ ^]", r"[ ^← error: unexpected character '^'"),
        (r"[:]", r"[:]← error: unexpected character ']'"),
        (r"[:L]", r"[:L]← error: unexpected character ']'"),
        (r"\p {L}", r"\p ← error: unexpected character ' '"),
        (
            r"[\x{61 62}-d]",
            r"[\x{61 62}-d← error: unexpected character 'd'",
        ),
        (
            r"[\x{61 63}-\x{62 64}]",
            r"[\x{61 63}-\← error: unexpected character '\\'",
        ),
        (r"[a-\x{62 64}]", r"[a-\← error: unexpected character '\\'"),
    ];

    let no_variables = VariableMap::default();
    for (input, expected_message) in cases {
        assert_is_error_and_message_eq(input, expected_message, &no_variables);
    }
}
/// Checks that parsing stops after the first complete set and reports how many bytes
/// it consumed, both with and without an (empty) variable map.
#[test]
fn test_consumed() {
    // (expected number of consumed bytes, source with trailing content)
    let cases = [
        (r"[a-z\]{[}]".len(), r"[a-z\]{[}][]"),
        (r"[a-z\]{[}]".len(), r"[a-z\]{[}] []"),
        (r"[a-z\]{[}]".len(), r"[a-z\]{]}] []"),
        (r"[a-z\]{{[}]".len(), r"[a-z\]{{]}] []"),
        (r"[a-z\]{[}]".len(), r"[a-z\]{]}]\p{L}"),
        (r"[a-z\]{[}]".len(), r"[a-z\]{]}]$var"),
    ];

    let no_variables = VariableMap::default();
    for (expected_consumed, source) in cases {
        let (_, consumed_plain) = parse(source).unwrap();
        assert_eq!(expected_consumed, consumed_plain);

        let (_, consumed_with_map) = parse_with_variables(source, &no_variables).unwrap();
        assert_eq!(expected_consumed, consumed_with_map);
    }
}
}