use std::borrow::Cow;
use crate::{
compile::CompileResult,
error::{CompileError, CompileErrorKind, Feature, ParseErrorKind},
literal,
options::{CompileOptions, RegexFlavor},
regex::{Regex, RegexProperty, RegexShorthand},
span::Span,
};
pub(crate) use char_group::{CharGroup, GroupItem};
use self::{
char_group::GroupName,
unicode::{Category, OtherProperties},
};
mod ascii;
pub(crate) mod char_group;
pub(crate) mod unicode;
/// A character class prior to compilation: a [`CharGroup`] plus a negation
/// flag and the source span it was written at.
#[derive(Clone, PartialEq, Eq)]
pub(crate) struct CharClass {
/// Whether the class is negated. Starts out `false` and is set by `negate`.
negative: bool,
/// The contents of the class (dot, code point, or a list of items).
inner: CharGroup,
/// Source location; `compile` attaches it to the errors it reports.
pub(crate) span: Span,
}
impl CharClass {
    /// Creates a non-negated character class from `inner`, located at `span`.
    pub(crate) fn new(inner: CharGroup, span: Span) -> Self {
        Self { negative: false, inner, span }
    }

    /// Negates this character class.
    ///
    /// # Errors
    ///
    /// Returns [`ParseErrorKind::UnallowedDoubleNot`] when the class is already
    /// negated; the class is left untouched in that case.
    pub(crate) fn negate(&mut self) -> Result<(), ParseErrorKind> {
        if self.negative {
            return Err(ParseErrorKind::UnallowedDoubleNot);
        }
        self.negative = true;
        Ok(())
    }

    /// Lowers this character class to a [`Regex`] node for `options.flavor`.
    ///
    /// # Errors
    ///
    /// * `EmptyClass` if the class is an item list without any items.
    /// * `EmptyClassNegated` if the class is a negated code point.
    /// * Any error produced while translating a named item for the flavor.
    pub(crate) fn compile(&self, options: CompileOptions) -> CompileResult<'static> {
        let span = self.span;
        let flavor = options.flavor;
        let negated = self.negative;

        match &self.inner {
            // A negated dot is emitted as the literal regex text `\n`.
            CharGroup::Dot if negated => Ok(Regex::Literal(Cow::Borrowed("\\n"))),
            CharGroup::Dot => Ok(Regex::Dot),

            CharGroup::CodePoint if negated => {
                Err(CompileErrorKind::EmptyClassNegated.at(span))
            }
            // A class holding both the space and the not-space shorthand.
            CharGroup::CodePoint => Ok(Regex::CharClass(RegexCharClass {
                negative: false,
                items: vec![
                    RegexClassItem::Shorthand(RegexShorthand::Space),
                    RegexClassItem::Shorthand(RegexShorthand::NotSpace),
                ],
            })),

            CharGroup::Items(items) => match items.len() {
                0 => Err(CompileErrorKind::EmptyClass.at(span)),

                // A lone item can often be emitted without surrounding brackets.
                1 => match items[0] {
                    GroupItem::Char(c) => Ok(if negated {
                        Regex::CharClass(RegexCharClass {
                            negative: true,
                            items: vec![RegexClassItem::Char(c)],
                        })
                    } else {
                        Regex::Char(c)
                    }),
                    GroupItem::Range { first, last } => Ok(Regex::CharClass(RegexCharClass {
                        negative: negated,
                        items: vec![RegexClassItem::Range { first, last }],
                    })),
                    GroupItem::Named { name, negative } => {
                        // Negating the class flips the polarity of its only member.
                        named_class_to_regex(name, negative ^ negated, flavor, span)
                    }
                },

                _ => {
                    let mut class_items = Vec::new();
                    for item in items.iter() {
                        let translated = match *item {
                            GroupItem::Char(c) => RegexClassItem::Char(c),
                            GroupItem::Range { first, last } => {
                                RegexClassItem::Range { first, last }
                            }
                            GroupItem::Named { name, negative } => {
                                // Named items may expand to several class items,
                                // so they are appended by the helper directly.
                                named_class_to_regex_class_items(
                                    name,
                                    negative,
                                    flavor,
                                    span,
                                    &mut class_items,
                                )?;
                                continue;
                            }
                        };
                        class_items.push(translated);
                    }
                    Ok(Regex::CharClass(RegexCharClass { negative: negated, items: class_items }))
                }
            },
        }
    }
}
fn named_class_to_regex(
group: GroupName,
negative: bool,
flavor: RegexFlavor,
span: Span,
) -> CompileResult<'static> {
Ok(match group {
GroupName::Word => {
if flavor == RegexFlavor::JavaScript {
Regex::CharClass(RegexCharClass {
negative,
items: vec![
RegexProperty::Other(OtherProperties::Alphabetic).negative_item(false),
RegexProperty::Category(Category::Mark).negative_item(false),
RegexProperty::Category(Category::Decimal_Number).negative_item(false),
RegexProperty::Category(Category::Connector_Punctuation)
.negative_item(false),
],
})
} else {
Regex::Shorthand(if negative {
RegexShorthand::NotWord
} else {
RegexShorthand::Word
})
}
}
GroupName::Digit => {
if flavor == RegexFlavor::JavaScript {
RegexProperty::Category(Category::Decimal_Number).negative(negative)
} else {
Regex::Shorthand(if negative {
RegexShorthand::NotDigit
} else {
RegexShorthand::Digit
})
}
}
GroupName::Space if negative => Regex::Shorthand(RegexShorthand::NotSpace),
GroupName::Space => Regex::Shorthand(RegexShorthand::Space),
GroupName::HorizSpace | GroupName::VertSpace
if matches!(flavor, RegexFlavor::Pcre | RegexFlavor::Java) =>
{
let shorthand = if group == GroupName::HorizSpace {
RegexShorthand::HorizSpace
} else {
RegexShorthand::VertSpace
};
if negative {
Regex::CharClass(RegexCharClass {
negative: true,
items: vec![RegexClassItem::Shorthand(shorthand)],
})
} else {
Regex::Shorthand(shorthand)
}
}
GroupName::HorizSpace => Regex::CharClass(RegexCharClass {
negative,
items: vec![
RegexClassItem::Char('\t'),
RegexProperty::Category(Category::Space_Separator).negative_item(false),
],
}),
GroupName::VertSpace => Regex::CharClass(RegexCharClass {
negative,
items: vec![
RegexClassItem::Range { first: '\x0A', last: '\x0D' },
RegexClassItem::Char('\u{85}'),
RegexClassItem::Char('\u{2028}'),
RegexClassItem::Char('\u{2029}'),
],
}),
GroupName::Category(c) => RegexProperty::Category(c).negative(negative),
GroupName::Script(s) => RegexProperty::Script(s).negative(negative),
GroupName::CodeBlock(b) => match flavor {
RegexFlavor::DotNet | RegexFlavor::Java | RegexFlavor::Ruby => {
RegexProperty::Block(b).negative(negative)
}
_ => return Err(CompileErrorKind::Unsupported(Feature::UnicodeBlock, flavor).at(span)),
},
GroupName::OtherProperties(o) => {
if flavor == RegexFlavor::Pcre {
return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
}
RegexProperty::Other(o).negative(negative)
}
})
}
fn named_class_to_regex_class_items(
group: GroupName,
negative: bool,
flavor: RegexFlavor,
span: Span,
buf: &mut Vec<RegexClassItem>,
) -> Result<(), CompileError> {
match group {
GroupName::Word => {
if let RegexFlavor::JavaScript = flavor {
if negative {
return Err(
CompileErrorKind::Unsupported(Feature::NegativeShorthandW, flavor).at(span)
);
}
buf.push(RegexProperty::Other(OtherProperties::Alphabetic).negative_item(false));
buf.push(RegexProperty::Category(Category::Mark).negative_item(false));
buf.push(RegexProperty::Category(Category::Decimal_Number).negative_item(false));
buf.push(
RegexProperty::Category(Category::Connector_Punctuation).negative_item(false),
);
} else {
buf.push(RegexClassItem::Shorthand(if negative {
RegexShorthand::NotWord
} else {
RegexShorthand::Word
}))
}
}
GroupName::Digit => {
if flavor == RegexFlavor::JavaScript {
buf.push(RegexProperty::Category(Category::Decimal_Number).negative_item(negative));
} else if negative {
buf.push(RegexClassItem::Shorthand(RegexShorthand::NotDigit));
} else {
buf.push(RegexClassItem::Shorthand(RegexShorthand::Digit));
}
}
GroupName::Space => buf.push(RegexClassItem::Shorthand(if negative {
RegexShorthand::NotSpace
} else {
RegexShorthand::Space
})),
GroupName::HorizSpace | GroupName::VertSpace if negative => {
return Err(CompileErrorKind::Other(
"horiz_space and vert_space can't be negated within a character class",
)
.at(span));
}
GroupName::HorizSpace | GroupName::VertSpace
if matches!(flavor, RegexFlavor::Pcre | RegexFlavor::Java) =>
{
buf.push(RegexClassItem::Shorthand(if group == GroupName::HorizSpace {
RegexShorthand::HorizSpace
} else {
RegexShorthand::VertSpace
}));
}
GroupName::HorizSpace => {
buf.push(RegexClassItem::Char('\t'));
buf.push(RegexProperty::Category(Category::Space_Separator).negative_item(false));
}
GroupName::VertSpace => {
buf.push(RegexClassItem::Range { first: '\x0A', last: '\x0D' });
buf.push(RegexClassItem::Char('\u{85}'));
buf.push(RegexClassItem::Char('\u{2028}'));
buf.push(RegexClassItem::Char('\u{2029}'));
}
GroupName::Category(c) => buf.push(RegexProperty::Category(c).negative_item(negative)),
GroupName::Script(s) => buf.push(RegexProperty::Script(s).negative_item(negative)),
GroupName::CodeBlock(b) => match flavor {
RegexFlavor::DotNet | RegexFlavor::Java | RegexFlavor::Ruby => {
buf.push(RegexProperty::Block(b).negative_item(negative));
}
_ => return Err(CompileErrorKind::Unsupported(Feature::UnicodeBlock, flavor).at(span)),
},
GroupName::OtherProperties(o) => {
if flavor == RegexFlavor::Pcre {
return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
}
buf.push(RegexProperty::Other(o).negative_item(negative));
}
}
Ok(())
}
/// Compact debug output: `CharClass(` followed by an optional `not `, then the
/// contents (`.`, `codepoint`, or the space-separated items), and a closing `)`.
#[cfg(feature = "dbg")]
impl core::fmt::Debug for CharClass {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        use std::fmt::Write;

        let prefix = if self.negative { "CharClass(not " } else { "CharClass(" };
        f.write_str(prefix)?;

        match &self.inner {
            CharGroup::Dot => f.write_char('.')?,
            CharGroup::CodePoint => f.write_str("codepoint")?,
            CharGroup::Items(items) => {
                // Emit a single space before every item except the first one.
                let mut separator_needed = false;
                for item in items.iter() {
                    if separator_needed {
                        f.write_char(' ')?;
                    }
                    separator_needed = true;
                    item.fmt(f)?;
                }
            }
        }

        f.write_str(")")
    }
}
/// A bracketed character class in its compiled form: a list of
/// [`RegexClassItem`]s that may be negated as a whole.
#[cfg_attr(feature = "dbg", derive(Debug))]
pub(crate) struct RegexCharClass {
/// When set, `codegen` opens the class with `[^` rather than `[`.
negative: bool,
/// Members of the class; `codegen` emits them in order between the brackets.
items: Vec<RegexClassItem>,
}
impl RegexCharClass {
pub(crate) fn new(items: Vec<RegexClassItem>) -> Self {
Self { negative: false, items }
}
pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) {
if self.negative {
buf.push_str("[^");
} else {
buf.push('[');
}
for item in &self.items {
match *item {
RegexClassItem::Char(c) => {
literal::compile_char_esc_in_class(c, buf, flavor);
}
RegexClassItem::Range { first, last } => {
literal::compile_char_esc_in_class(first, buf, flavor);
buf.push('-');
literal::compile_char_esc_in_class(last, buf, flavor);
}
RegexClassItem::Shorthand(s) => s.codegen(buf),
RegexClassItem::Property { negative, value } => {
value.codegen(buf, negative, flavor)
}
}
}
buf.push(']');
}
}
/// A single member of a [`RegexCharClass`].
#[derive(Clone, Copy)]
#[cfg_attr(feature = "dbg", derive(Debug))]
pub(crate) enum RegexClassItem {
/// A single character.
Char(char),
/// A character range; `RegexCharClass::codegen` emits it as `first-last`.
Range { first: char, last: char },
/// A shorthand class.
Shorthand(RegexShorthand),
/// A property given by `value`; `negative` is forwarded to its code generation.
Property { negative: bool, value: RegexProperty },
}
impl RegexClassItem {
pub(crate) fn range_unchecked(first: char, last: char) -> Self {
Self::Range { first, last }
}
}