use pomsky_syntax::Span;
use pomsky_syntax::exprs::{
Category, CharClass, CodeBlock, GroupItem, GroupName, OtherProperties, Script, ScriptExtension,
};
use crate::{
compile::{CompileResult, CompileState},
diagnose::{CompileError, CompileErrorKind, Feature},
exprs::literal,
options::{CompileOptions, RegexFlavor},
regex::{Regex, RegexProperty, RegexShorthand},
unicode_set::UnicodeSet,
};
pub(crate) use char_set_item::{RegexCharSet, RegexCharSetItem, RegexCompoundCharSet};
use super::Compile;
mod char_set_item;
impl Compile for CharClass {
    /// Compiles this character class for the flavor given in `options`.
    ///
    /// Every item of the class is collected into one [`UnicodeSet`]. If
    /// `try_into_char` reports that the set is just one character, that character
    /// is returned as a [`Regex::Literal`]; otherwise a [`Regex::CharSet`] is returned.
    ///
    /// # Errors
    ///
    /// Propagates the error of `validate_char_in_class` and of the
    /// `named_class_to_regex_*` helpers.
    fn compile(&self, options: CompileOptions, _state: &mut CompileState<'_>) -> CompileResult {
        let flavor = options.flavor;
        let class_span = self.span;
        let is_single = self.inner.len() == 1;

        // Set by the named-class helper when the class has to be negated as a whole.
        let mut group_negative = false;
        let mut set = UnicodeSet::new();

        for item in &self.inner {
            match *item {
                // A lone character is deliberately not validated.
                GroupItem::Char(c) if is_single => set.add_char(c),
                GroupItem::Char(c) => {
                    validate_char_in_class(c, flavor, class_span)?;
                    set.add_char(c);
                }
                GroupItem::Range { first, last } => {
                    validate_char_in_class(first, flavor, class_span)?;
                    validate_char_in_class(last, flavor, class_span)?;
                    set.add_range(first..=last);
                }
                GroupItem::Named { name, negative, span } => {
                    if !self.unicode_aware {
                        named_class_to_regex_ascii(name, negative, flavor, span, &mut set)?;
                    } else {
                        named_class_to_regex_unicode(
                            name,
                            negative,
                            &mut group_negative,
                            is_single,
                            flavor,
                            span,
                            &mut set,
                        )?;
                    }
                }
            }
        }

        match set.try_into_char() {
            Some(only_char) => Ok(Regex::Literal(only_char.to_string())),
            None => Ok(Regex::CharSet(RegexCharSet { negative: group_negative, set })),
        }
    }
}
/// Checks whether `char` may appear inside a character class in `flavor`.
///
/// # Errors
///
/// For the .NET flavor, code points above U+FFFF are rejected with an
/// `Unsupported(LargeCodePointInCharClass)` error located at `span`.
/// Every other combination is accepted.
fn validate_char_in_class(char: char, flavor: RegexFlavor, span: Span) -> Result<(), CompileError> {
    match flavor {
        RegexFlavor::DotNet if char > '\u{FFFF}' => {
            let kind = CompileErrorKind::Unsupported(Feature::LargeCodePointInCharClass(char), flavor);
            Err(kind.at(span))
        }
        _ => Ok(()),
    }
}
/// Reports a negated character set for which `full_props()` yields a pair of groups.
///
/// NOTE(review): `full_props` presumably returns two groups that together match every
/// code point, so that the negated set matches nothing — confirm against `UnicodeSet`.
///
/// # Errors
///
/// Returns `EmptyClassNegated { group1, group2 }` located at `span` when `char_set` is
/// negated and `full_props()` returns `Some`.
pub(crate) fn check_char_class_empty(
    char_set: &RegexCharSet,
    span: Span,
) -> Result<(), CompileError> {
    // Only negated sets are inspected.
    if !char_set.negative {
        return Ok(());
    }

    match char_set.set.full_props() {
        Some((group1, group2)) => {
            Err(CompileErrorKind::EmptyClassNegated { group1, group2 }.at(span))
        }
        None => Ok(()),
    }
}
/// Returns `true` for `Word` and `Digit` in the JavaScript and RE2 flavors, and
/// additionally for `Space` in RE2; `false` for everything else.
///
/// These are the combinations for which `named_class_to_regex_ascii` emits the
/// flavor's shorthand and permits negation.
pub fn is_ascii_only_in_flavor(group: GroupName, flavor: RegexFlavor) -> bool {
    matches!(
        (flavor, group),
        (RegexFlavor::JavaScript | RegexFlavor::RE2, GroupName::Word | GroupName::Digit)
            | (RegexFlavor::RE2, GroupName::Space)
    )
}
/// Adds the ASCII-only translation of the named class `group` to `set`.
///
/// Where `is_ascii_only_in_flavor` holds, the flavor's shorthand (or its negated
/// counterpart) is added; otherwise the class is spelled out as explicit ASCII
/// characters and ranges.
///
/// # Errors
///
/// * `NegativeShorthandInAsciiMode` if `negative` is set and `is_ascii_only_in_flavor`
///   is `false` for this group and flavor.
/// * `UnicodeInAsciiMode` for any class other than word, digit, space, horizontal
///   space and vertical space.
///
/// Both errors are located at `span`.
fn named_class_to_regex_ascii(
    group: GroupName,
    negative: bool,
    flavor: RegexFlavor,
    span: Span,
    set: &mut UnicodeSet,
) -> Result<(), CompileError> {
    if negative && !is_ascii_only_in_flavor(group, flavor) {
        return Err(CompileErrorKind::NegativeShorthandInAsciiMode.at(span));
    }

    // Past the check above, `negative` can only be set in the shorthand arms.
    let word_digit_shorthand = matches!(flavor, RegexFlavor::JavaScript | RegexFlavor::RE2);
    let space_shorthand = matches!(flavor, RegexFlavor::RE2);

    match group {
        GroupName::Word if word_digit_shorthand => {
            let shorthand = match negative {
                true => RegexShorthand::NotWord,
                false => RegexShorthand::Word,
            };
            set.add_prop(RegexCharSetItem::Shorthand(shorthand));
        }
        GroupName::Word => {
            for range in ['a'..='z', 'A'..='Z', '0'..='9'] {
                set.add_range(range);
            }
            set.add_char('_');
        }

        GroupName::Digit if word_digit_shorthand => {
            let shorthand = match negative {
                true => RegexShorthand::NotDigit,
                false => RegexShorthand::Digit,
            };
            set.add_prop(RegexCharSetItem::Shorthand(shorthand));
        }
        GroupName::Digit => set.add_range('0'..='9'),

        GroupName::Space if space_shorthand => {
            let shorthand = match negative {
                true => RegexShorthand::NotSpace,
                false => RegexShorthand::Space,
            };
            set.add_prop(RegexCharSetItem::Shorthand(shorthand));
        }
        GroupName::Space => {
            set.add_char(' ');
            // U+0009..=U+000D: tab, line feed, vertical tab, form feed, carriage return.
            set.add_range('\x09'..='\x0D');
        }

        GroupName::HorizSpace => set.add_char('\t'),
        GroupName::VertSpace => set.add_range('\x0A'..='\x0D'),

        _ => return Err(CompileErrorKind::UnicodeInAsciiMode.at(span)),
    }

    Ok(())
}
fn named_class_to_regex_unicode(
group: GroupName,
negative: bool,
group_negative: &mut bool,
is_single: bool,
flavor: RegexFlavor,
span: Span,
set: &mut UnicodeSet,
) -> Result<(), CompileError> {
match group {
GroupName::Word => {
if flavor == RegexFlavor::RE2 {
return Err(CompileErrorKind::Unsupported(Feature::ShorthandW, flavor).at(span));
} else if flavor == RegexFlavor::JavaScript {
if negative {
if is_single {
*group_negative ^= true;
} else {
return Err(CompileErrorKind::Unsupported(
Feature::NegativeShorthandW,
flavor,
)
.at(span));
}
}
set.add_prop(
RegexProperty::Other(OtherProperties::Alphabetic).negative_item(false),
);
set.add_prop(RegexProperty::Category(Category::Mark).negative_item(false));
set.add_prop(
RegexProperty::Category(Category::Decimal_Number).negative_item(false),
);
set.add_prop(
RegexProperty::Category(Category::Connector_Punctuation).negative_item(false),
);
} else {
let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };
set.add_prop(RegexCharSetItem::Shorthand(s));
}
}
GroupName::Digit => {
if matches!(flavor, RegexFlavor::JavaScript | RegexFlavor::RE2) {
set.add_prop(
RegexProperty::Category(Category::Decimal_Number).negative_item(negative),
);
} else {
let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };
set.add_prop(RegexCharSetItem::Shorthand(s));
}
}
GroupName::Space => {
if flavor == RegexFlavor::RE2 {
if negative {
if is_single {
*group_negative ^= true;
} else {
return Err(CompileErrorKind::Unsupported(
Feature::NegativeShorthandS,
flavor,
)
.at(span));
}
}
set.add_prop(RegexCharSetItem::Shorthand(RegexShorthand::Space));
set.add_char('\x0b');
set.add_char('\u{a0}');
set.add_char('\u{1680}');
set.add_range('\u{2000}'..='\u{200a}');
set.add_range('\u{2028}'..='\u{2029}');
set.add_char('\u{202f}');
set.add_char('\u{205f}');
set.add_char('\u{3000}');
set.add_char('\u{feff}');
} else {
set.add_prop(RegexCharSetItem::Shorthand(if negative {
RegexShorthand::NotSpace
} else {
RegexShorthand::Space
}))
}
}
GroupName::HorizSpace | GroupName::VertSpace if negative => {
return Err(CompileErrorKind::NegatedHorizVertSpace.at(span));
}
GroupName::HorizSpace | GroupName::VertSpace
if matches!(flavor, RegexFlavor::Pcre | RegexFlavor::Java) =>
{
set.add_prop(RegexCharSetItem::Shorthand(if group == GroupName::HorizSpace {
RegexShorthand::HorizSpace
} else {
RegexShorthand::VertSpace
}));
}
GroupName::HorizSpace => {
set.add_char('\t');
if flavor == RegexFlavor::Python {
return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
} else {
set.add_prop(
RegexProperty::Category(Category::Space_Separator).negative_item(false),
);
}
}
GroupName::VertSpace => {
set.add_range('\x0A'..='\x0D');
set.add_char('\u{85}');
set.add_char('\u{2028}');
set.add_char('\u{2029}');
}
_ if flavor == RegexFlavor::Python => {
return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
}
GroupName::Category(c) => {
if let (RegexFlavor::Rust, Category::Surrogate)
| (RegexFlavor::DotNet | RegexFlavor::RE2, Category::Cased_Letter) = (flavor, c)
{
return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
}
set.add_prop(RegexProperty::Category(c).negative_item(negative));
}
GroupName::Script(s, e) => {
if flavor == RegexFlavor::DotNet {
return Err(CompileErrorKind::Unsupported(Feature::UnicodeScript, flavor).at(span));
}
if let (RegexFlavor::Rust, Script::Unknown) = (flavor, s) {
return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
}
let set_extensions = match e {
ScriptExtension::Yes => match flavor {
RegexFlavor::Rust | RegexFlavor::Pcre | RegexFlavor::JavaScript => {
ScriptExtension::Yes
}
RegexFlavor::Java
| RegexFlavor::DotNet
| RegexFlavor::Ruby
| RegexFlavor::Python
| RegexFlavor::RE2 => {
return Err(CompileErrorKind::Unsupported(
Feature::ScriptExtensions,
flavor,
)
.at(span));
}
},
ScriptExtension::No => match flavor {
RegexFlavor::Pcre => ScriptExtension::No,
_ => ScriptExtension::Unspecified,
},
_ => ScriptExtension::Unspecified,
};
set.add_prop(RegexProperty::Script(s, set_extensions).negative_item(negative));
}
GroupName::CodeBlock(b) => match flavor {
RegexFlavor::DotNet | RegexFlavor::Java | RegexFlavor::Ruby => {
match (flavor, b) {
(RegexFlavor::Java, CodeBlock::No_Block)
| (
RegexFlavor::Ruby,
CodeBlock::Arabic_Extended_C
| CodeBlock::CJK_Unified_Ideographs_Extension_H
| CodeBlock::Cyrillic_Extended_D
| CodeBlock::Devanagari_Extended_A
| CodeBlock::Kaktovik_Numerals,
) => {
return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
}
(RegexFlavor::DotNet, _) => {
let dotnet_name = b.as_str().replace("_And_", "_and_").replace('_', "");
if pomsky_syntax::blocks_supported_in_dotnet()
.binary_search(&dotnet_name.as_str())
.is_err()
{
return Err(
CompileErrorKind::unsupported_specific_prop_in(flavor).at(span)
);
}
}
_ => {}
}
set.add_prop(RegexProperty::Block(b).negative_item(negative));
}
_ => return Err(CompileErrorKind::Unsupported(Feature::UnicodeBlock, flavor).at(span)),
},
GroupName::OtherProperties(o) => {
use OtherProperties as OP;
use RegexFlavor as RF;
if let RF::JavaScript | RF::Rust | RF::Pcre | RF::Ruby = flavor {
match (flavor, o) {
(RF::JavaScript, _) => {}
(_, OP::Changes_When_NFKC_Casefolded)
| (RF::Pcre, OP::Assigned)
| (RF::Ruby, OP::Bidi_Mirrored) => {
return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
}
_ => {}
}
set.add_prop(RegexProperty::Other(o).negative_item(negative));
} else if flavor == RF::Java {
if pomsky_syntax::props_supported_in_java().binary_search(&o.as_str()).is_ok() {
set.add_prop(RegexProperty::Other(o).negative_item(negative));
} else {
return Err(CompileErrorKind::Unsupported(
Feature::SpecificUnicodeProp,
flavor,
)
.at(span));
}
} else {
return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
}
}
}
Ok(())
}