use std::borrow::Borrow;
use pomsky_syntax::{
Span,
exprs::{
BoundaryKind, Category, CodeBlock, LookaroundKind, OtherProperties, RepetitionKind, Script,
ScriptExtension,
},
};
use crate::{
compile::CompileResult,
diagnose::{CompileErrorKind, Feature, IllegalNegationKind},
exprs::{
alternation::RegexAlternation,
boundary::boundary_kind_codegen,
char_class::{RegexCharSet, RegexCharSetItem, RegexCompoundCharSet},
group::{RegexGroup, RegexGroupKind},
literal,
lookaround::RegexLookaround,
recursion,
reference::RegexReference,
repetition::RegexRepetition,
},
options::RegexFlavor,
};
mod optimize;
pub(super) use optimize::Count;
/// A node of the regex tree that is validated, negated and finally written to the output
/// buffer by [`Regex::codegen`].
#[cfg_attr(feature = "dbg", derive(Debug))]
pub(crate) enum Regex {
/// Literal text; `codegen` writes it character by character through
/// `literal::codegen_char_esc`.
Literal(String),
/// Text that `codegen` appends to the output verbatim.
Unescaped(String),
/// A character set; can be negated.
CharSet(RegexCharSet),
/// A compound character set; can be negated.
CompoundCharSet(RegexCompoundCharSet),
/// Emitted as `\X`.
Grapheme,
/// Emitted as `.`.
Dot,
/// A group made of a sequence of parts.
Group(RegexGroup),
/// A list of alternatives.
Alternation(RegexAlternation),
/// A repeated expression together with its lower and optional upper bound.
Repetition(Box<RegexRepetition>),
/// A boundary assertion.
Boundary(BoundaryKind),
/// A lookahead or lookbehind, possibly negative.
Lookaround(Box<RegexLookaround>),
/// A reference (reported as `Feature::Backreference` by the Java lookbehind validation).
Reference(RegexReference),
/// Recursion; emitted through `recursion::codegen`.
Recursion,
}
impl Regex {
/// Validates this expression as the content of a Python lookbehind and computes its width.
///
/// Python's `re` module only accepts lookbehinds of fixed width, so every sub-expression
/// is measured in code points:
///
/// - `Ok(Some(n))`: the expression always matches exactly `n` code points.
/// - `Ok(None)`: the width is not known here (unescaped regex text, references, or a width
///   that does not fit into a `u32`).
///
/// # Errors
///
/// Returns [`CompileErrorKind::UnsupportedInLookbehind`] for graphemes, and
/// [`CompileErrorKind::LookbehindNotConstantLength`] when two alternatives have different
/// known widths or when a repetition's lower and upper bound differ.
///
/// # Panics
///
/// Panics when it encounters [`Regex::Recursion`].
pub(super) fn validate_in_lookbehind_py(&self) -> Result<Option<u32>, CompileErrorKind> {
    match self {
        Regex::Literal(str) => Ok(Some(str.chars().count() as u32)),
        Regex::Unescaped(_) => Ok(None),
        Regex::CharSet(_) | Regex::CompoundCharSet(_) => Ok(Some(1)),
        Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind {
            flavor: RegexFlavor::Python,
            feature: Feature::Grapheme,
        }),
        Regex::Dot => Ok(Some(1)),
        // The width of a sequence is the sum of its parts. Every part is validated even
        // after the sum became unknown, so that errors in later parts are still reported.
        Regex::Group(g) => g.parts.iter().try_fold(Some(0u32), |acc, part| {
            let width = part.validate_in_lookbehind_py()?;
            Ok(acc.zip(width).and_then(|(a, b)| a.checked_add(b)))
        }),
        Regex::Alternation(alt) => {
            let mut width = None;
            for part in &alt.parts {
                width = match (width, part.validate_in_lookbehind_py()?) {
                    (Some(a), Some(b)) if a != b => {
                        return Err(CompileErrorKind::LookbehindNotConstantLength {
                            flavor: RegexFlavor::Python,
                        });
                    }
                    // Unknown widths are skipped; the first known width is kept.
                    (known, other) => known.or(other),
                };
            }
            Ok(width)
        }
        Regex::Repetition(r) => {
            let RepetitionKind { lower_bound, upper_bound } = r.kind;
            if upper_bound != Some(lower_bound) {
                return Err(CompileErrorKind::LookbehindNotConstantLength {
                    flavor: RegexFlavor::Python,
                });
            }
            // The repeated content must itself be valid, and the total width is the
            // content's width multiplied by the repeat count, not the count alone.
            let content_width = r.content.validate_in_lookbehind_py()?;
            Ok(if lower_bound == 0 {
                Some(0)
            } else {
                content_width.and_then(|w| w.checked_mul(lower_bound))
            })
        }
        Regex::Boundary(_) => Ok(Some(0)),
        Regex::Lookaround(_) => Ok(Some(0)),
        Regex::Reference(_) => Ok(None),
        Regex::Recursion => unreachable!("not supported in python"),
    }
}
/// Validates this expression as the content of a PCRE lookbehind.
///
/// Groups, alternations and repetitions are checked recursively, so a problem nested
/// inside a repeated expression is reported as well.
///
/// # Errors
///
/// Returns [`CompileErrorKind::UnsupportedInLookbehind`] for graphemes and recursion, and
/// [`CompileErrorKind::LookbehindNotConstantLength`] for a repetition without an upper
/// bound.
pub(super) fn validate_in_lookbehind_pcre(&self) -> Result<(), CompileErrorKind> {
    match self {
        Regex::Literal(_)
        | Regex::Unescaped(_)
        | Regex::CharSet(_)
        | Regex::CompoundCharSet(_)
        | Regex::Dot
        | Regex::Boundary(_)
        | Regex::Lookaround(_)
        | Regex::Reference(_) => Ok(()),
        Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind {
            flavor: RegexFlavor::Pcre,
            feature: Feature::Grapheme,
        }),
        Regex::Recursion => Err(CompileErrorKind::UnsupportedInLookbehind {
            flavor: RegexFlavor::Pcre,
            feature: Feature::Recursion,
        }),
        Regex::Group(g) => g.parts.iter().try_for_each(Regex::validate_in_lookbehind_pcre),
        Regex::Alternation(alt) => {
            alt.parts.iter().try_for_each(Regex::validate_in_lookbehind_pcre)
        }
        Regex::Repetition(r) => {
            if r.kind.upper_bound.is_none() {
                return Err(CompileErrorKind::LookbehindNotConstantLength {
                    flavor: RegexFlavor::Pcre,
                });
            }
            // A bounded repetition is only acceptable if what it repeats is acceptable.
            r.content.validate_in_lookbehind_pcre()
        }
    }
}
/// Validates this expression as the content of a Java lookbehind.
///
/// Groups, alternations and repetitions are checked recursively, so a problem nested
/// inside a repeated expression is reported as well.
///
/// # Errors
///
/// Returns [`CompileErrorKind::UnsupportedInLookbehind`] for graphemes and references, and
/// [`CompileErrorKind::LookbehindNotConstantLength`] for a repetition without an upper
/// bound.
pub(super) fn validate_in_lookbehind_java(&self) -> Result<(), CompileErrorKind> {
    match self {
        Regex::Group(g) => g.parts.iter().try_for_each(Regex::validate_in_lookbehind_java),
        Regex::Alternation(alt) => {
            alt.parts.iter().try_for_each(Regex::validate_in_lookbehind_java)
        }
        Regex::Repetition(r) => {
            if r.kind.upper_bound.is_none() {
                return Err(CompileErrorKind::LookbehindNotConstantLength {
                    flavor: RegexFlavor::Java,
                });
            }
            // A bounded repetition is only acceptable if what it repeats is acceptable.
            r.content.validate_in_lookbehind_java()
        }
        Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind {
            flavor: RegexFlavor::Java,
            feature: Feature::Grapheme,
        }),
        Regex::Reference(_) => Err(CompileErrorKind::UnsupportedInLookbehind {
            flavor: RegexFlavor::Java,
            feature: Feature::Backreference,
        }),
        // Listed explicitly so that a new variant forces a decision here.
        Regex::Literal(_)
        | Regex::Unescaped(_)
        | Regex::CharSet(_)
        | Regex::CompoundCharSet(_)
        | Regex::Dot
        | Regex::Boundary(_)
        | Regex::Lookaround(_)
        | Regex::Recursion => Ok(()),
    }
}
/// Returns `true` for a literal consisting of exactly one code point and for any
/// [`Regex::CharSet`]; every other expression yields `false`.
pub(super) fn is_single_char(&self) -> bool {
    match self {
        Regex::Literal(text) => {
            let mut chars = text.chars();
            // Exactly one item: a first one exists and a second one does not.
            chars.next().is_some() && chars.next().is_none()
        }
        Regex::CharSet(_) => true,
        _ => false,
    }
}
/// Reports whether this expression "terminates".
///
/// [`Regex::Recursion`] never terminates. A repetition terminates if it may repeat zero
/// times or if its content terminates, a group if all of its parts do, an alternation if
/// at least one alternative does, and a lookaround if its content does. Everything else
/// terminates.
///
/// NOTE(review): presumably used to reject expressions that can only recurse forever;
/// confirm against the caller.
pub(super) fn terminates(&self) -> bool {
    match self {
        Regex::Recursion => false,
        Regex::Lookaround(look) => look.content.terminates(),
        Regex::Alternation(alt) => alt.parts.iter().any(Regex::terminates),
        Regex::Group(grp) => grp.parts.iter().all(Regex::terminates),
        Regex::Repetition(rep) => {
            let may_be_skipped = rep.kind.lower_bound == 0;
            may_be_skipped || rep.content.terminates()
        }
        _ => true,
    }
}
}
impl Default for Regex {
fn default() -> Self {
Regex::Literal("".into())
}
}
/// A shorthand character class. The comment on each variant shows the escape written by
/// `RegexShorthand::codegen`.
#[derive(Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "dbg", derive(Debug))]
pub(crate) enum RegexShorthand {
/// `\w`
Word,
/// `\d`
Digit,
/// `\s`
Space,
/// `\W`, the negation of `Word`
NotWord,
/// `\D`, the negation of `Digit`
NotDigit,
/// `\S`, the negation of `Space`
NotSpace,
/// `\v`; has no negated counterpart
VertSpace,
/// `\h`; has no negated counterpart
HorizSpace,
}
impl RegexShorthand {
pub(crate) fn negate(&self) -> Option<RegexShorthand> {
Some(match self {
RegexShorthand::Word => RegexShorthand::NotWord,
RegexShorthand::Digit => RegexShorthand::NotDigit,
RegexShorthand::Space => RegexShorthand::NotSpace,
RegexShorthand::NotWord => RegexShorthand::Word,
RegexShorthand::NotDigit => RegexShorthand::Digit,
RegexShorthand::NotSpace => RegexShorthand::Space,
RegexShorthand::VertSpace => return None,
RegexShorthand::HorizSpace => return None,
})
}
pub(crate) fn as_str(&self) -> &'static str {
match self {
RegexShorthand::Word => "word",
RegexShorthand::Digit => "digit",
RegexShorthand::Space => "space",
RegexShorthand::NotWord => "!word",
RegexShorthand::NotDigit => "!digit",
RegexShorthand::NotSpace => "!space",
RegexShorthand::VertSpace => "vert_space",
RegexShorthand::HorizSpace => "horiz_space",
}
}
}
/// A Unicode property that `RegexProperty::codegen` writes as a `\p`/`\P` escape.
#[derive(Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "dbg", derive(Debug))]
pub(crate) enum RegexProperty {
/// A category.
Category(Category),
/// A script, together with the information whether script extensions were requested
/// (`prefix_as_str` reports `scx:` for `ScriptExtension::Yes` and `sc:` for
/// `ScriptExtension::No`).
Script(Script, ScriptExtension),
/// A code block; `codegen` panics for flavors other than .NET, Java and Ruby.
Block(CodeBlock),
/// Any other property.
Other(OtherProperties),
}
impl RegexProperty {
    /// Returns the name of the wrapped category, script, block or other property.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Category(category) => category.as_str(),
            Self::Script(script, _) => script.as_str(),
            Self::Block(block) => block.as_str(),
            Self::Other(other) => other.as_str(),
        }
    }

    /// Returns `"sc:"` or `"scx:"` for a script whose extension mode is `No` or `Yes`
    /// respectively, and the empty string for everything else.
    pub fn prefix_as_str(&self) -> &'static str {
        if let Self::Script(_, extension) = self {
            match extension {
                ScriptExtension::No => "sc:",
                ScriptExtension::Yes => "scx:",
                _ => "",
            }
        } else {
            ""
        }
    }

    /// Wraps this property in a character set item with the given polarity.
    pub(crate) fn negative_item(self, negative: bool) -> RegexCharSetItem {
        let value = self;
        RegexCharSetItem::Property { negative, value }
    }
}
impl Regex {
pub(crate) fn negate(self, not_span: Span, flavor: RegexFlavor) -> CompileResult {
match self {
Regex::Literal(l) => {
let mut iter = l.chars();
let Some(c) = iter.next().and_then(|c| iter.next().is_none().then_some(c)) else {
return Err(CompileErrorKind::IllegalNegation {
kind: IllegalNegationKind::Literal(l.to_string()),
}
.at(not_span));
};
if flavor == RegexFlavor::DotNet && c.len_utf16() > 1 {
return Err(CompileErrorKind::IllegalNegation {
kind: IllegalNegationKind::DotNetChar(c),
}
.at(not_span));
}
Ok(Regex::CharSet(RegexCharSet::new(c.into()).negate()))
}
Regex::CharSet(s) => Ok(Regex::CharSet(s.negate())),
Regex::CompoundCharSet(s) => Ok(Regex::CompoundCharSet(s.negate())),
Regex::Boundary(b) => match b {
BoundaryKind::Word => Ok(Regex::Boundary(BoundaryKind::NotWord)),
BoundaryKind::NotWord => Ok(Regex::Boundary(BoundaryKind::Word)),
_ => Err(CompileErrorKind::IllegalNegation { kind: IllegalNegationKind::Boundary }
.at(not_span)),
},
Regex::Lookaround(mut l) => {
l.kind = match l.kind {
LookaroundKind::Ahead => LookaroundKind::AheadNegative,
LookaroundKind::Behind => LookaroundKind::BehindNegative,
LookaroundKind::AheadNegative => LookaroundKind::Ahead,
LookaroundKind::BehindNegative => LookaroundKind::Behind,
};
Ok(Regex::Lookaround(l))
}
Regex::Group(mut g)
if matches!(g.kind, RegexGroupKind::Normal) && g.parts.len() == 1 =>
{
g.parts.pop().unwrap().negate(not_span, flavor)
}
Regex::Unescaped(_)
| Regex::Grapheme
| Regex::Dot
| Regex::Group(_)
| Regex::Alternation(_)
| Regex::Repetition(_)
| Regex::Reference(_)
| Regex::Recursion => Err(CompileErrorKind::IllegalNegation {
kind: match self {
Regex::Unescaped(_) => IllegalNegationKind::Unescaped,
Regex::Grapheme => IllegalNegationKind::Grapheme,
Regex::Dot => IllegalNegationKind::Dot,
Regex::Group(_) => IllegalNegationKind::Group,
Regex::Alternation(_) => IllegalNegationKind::Alternation,
Regex::Repetition(_) => IllegalNegationKind::Repetition,
Regex::Reference(_) => IllegalNegationKind::Reference,
Regex::Recursion => IllegalNegationKind::Recursion,
_ => unreachable!(),
},
}
.at(not_span)),
}
}
/// Appends the regex for this expression, in the syntax of `flavor`, to `buf`.
///
/// Line breaks in literals are normalized: every carriage return, whether or not it is
/// followed by a line feed, is emitted as a single line feed.
pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) {
    match self {
        Regex::Literal(l) => {
            let mut chars = l.chars().peekable();
            while let Some(c) = chars.next() {
                if c == '\r' {
                    // Swallow the line feed of a CRLF pair. Only peeking at the next
                    // character guarantees that a following carriage return is normalized
                    // by the next iteration instead of being emitted as is.
                    chars.next_if_eq(&'\n');
                    literal::codegen_char_esc('\n', buf, flavor);
                } else {
                    literal::codegen_char_esc(c, buf, flavor);
                }
            }
        }
        Regex::Unescaped(u) => buf.push_str(u),
        Regex::CharSet(c) => c.codegen(buf, flavor, false),
        Regex::CompoundCharSet(c) => c.codegen(buf, flavor),
        Regex::Grapheme => buf.push_str("\\X"),
        Regex::Dot => buf.push('.'),
        Regex::Group(g) => g.codegen(buf, flavor),
        Regex::Alternation(a) => a.codegen(buf, flavor),
        Regex::Repetition(r) => r.codegen(buf, flavor),
        Regex::Boundary(b) => boundary_kind_codegen(*b, buf, flavor),
        Regex::Lookaround(l) => l.codegen(buf, flavor),
        Regex::Reference(r) => r.codegen(buf),
        Regex::Recursion => recursion::codegen(buf, flavor),
    }
}
/// Returns `true` only for alternations; every other expression needs no parentheses
/// when it is placed in a sequence.
pub(crate) fn needs_parens_in_sequence(&self) -> bool {
    match self {
        // Kept exhaustive so that a new variant forces a decision here.
        Regex::Literal(_)
        | Regex::Unescaped(_)
        | Regex::CharSet(_)
        | Regex::CompoundCharSet(_)
        | Regex::Grapheme
        | Regex::Dot
        | Regex::Group(_)
        | Regex::Repetition(_)
        | Regex::Boundary(_)
        | Regex::Lookaround(_)
        | Regex::Reference(_)
        | Regex::Recursion => false,
        Regex::Alternation(_) => true,
    }
}
/// Reports whether this expression has to be parenthesized before a repetition is
/// attached to it.
///
/// Literals and groups delegate the decision to their own helpers, lookarounds need
/// parentheses only in the JavaScript flavor, repetitions, alternations, boundaries and
/// unescaped text always need them, and the remaining expressions never do.
pub(crate) fn needs_parens_before_repetition(&self, flavor: RegexFlavor) -> bool {
    match self {
        Regex::CharSet(_)
        | Regex::CompoundCharSet(_)
        | Regex::Grapheme
        | Regex::Dot
        | Regex::Reference(_)
        | Regex::Recursion => false,
        Regex::Unescaped(_)
        | Regex::Boundary(_)
        | Regex::Alternation(_)
        | Regex::Repetition(_) => true,
        Regex::Lookaround(_) => flavor == RegexFlavor::JavaScript,
        Regex::Group(group) => group.needs_parens_before_repetition(flavor),
        Regex::Literal(text) => literal::needs_parens_before_repetition(text.borrow()),
    }
}
/// Reports whether this expression is empty: literal or unescaped text that is empty, a
/// group whose parts are all empty, or a repetition of empty content. Every other
/// expression counts as non-empty.
pub(crate) fn result_is_empty(&self) -> bool {
    match self {
        Regex::Literal(text) | Regex::Unescaped(text) => text.is_empty(),
        Regex::Repetition(rep) => rep.content.result_is_empty(),
        Regex::Group(group) => group.parts.iter().all(|part| part.result_is_empty()),
        _ => false,
    }
}
/// Reports whether this expression is an assertion.
///
/// Lookarounds and boundaries are assertions. A normal group is one if, ignoring empty
/// parts, it consists of exactly one part that is an assertion. An alternation is one if
/// any of its alternatives is.
pub(crate) fn is_assertion(&self) -> bool {
    match self {
        Regex::Boundary(_) | Regex::Lookaround(_) => true,
        Regex::Alternation(alt) => alt.parts.iter().any(|part| part.is_assertion()),
        Regex::Group(group) if matches!(group.kind, RegexGroupKind::Normal) => {
            let mut non_empty = group.parts.iter().filter(|part| !part.result_is_empty());
            match (non_empty.next(), non_empty.next()) {
                (Some(only), None) => only.is_assertion(),
                _ => false,
            }
        }
        _ => false,
    }
}
}
impl RegexShorthand {
    /// Appends the backslash escape of this shorthand (for example `\w`) to `buf`.
    pub(crate) fn codegen(self, buf: &mut String) {
        let escape = match self {
            RegexShorthand::Word => "\\w",
            RegexShorthand::NotWord => "\\W",
            RegexShorthand::Digit => "\\d",
            RegexShorthand::NotDigit => "\\D",
            RegexShorthand::Space => "\\s",
            RegexShorthand::NotSpace => "\\S",
            RegexShorthand::VertSpace => "\\v",
            RegexShorthand::HorizSpace => "\\h",
        };
        buf.push_str(escape);
    }
}
impl RegexProperty {
    /// Appends the escape for this Unicode property to `buf`: `\p…`, or `\P…` when
    /// `negative` is set.
    ///
    /// The property name is wrapped in braces unless it is one of the categories listed
    /// below and the flavor is Java, PCRE, Rust or Ruby (presumably because these names are
    /// a single letter; verify against `Category::as_str`).
    ///
    /// # Panics
    ///
    /// Panics for a code block in a flavor other than .NET, Java and Ruby.
    pub(crate) fn codegen(self, buf: &mut String, negative: bool, flavor: RegexFlavor) {
        let short_category = matches!(
            self,
            RegexProperty::Category(
                Category::Letter
                    | Category::Mark
                    | Category::Number
                    | Category::Punctuation
                    | Category::Symbol
                    | Category::Separator
                    | Category::Other
            )
        );
        let flavor_allows_short = matches!(
            flavor,
            RegexFlavor::Java | RegexFlavor::Pcre | RegexFlavor::Rust | RegexFlavor::Ruby
        );
        let braced = !(short_category && flavor_allows_short);

        buf.push_str(if negative { "\\P" } else { "\\p" });
        if braced {
            buf.push('{');
        }

        match self {
            RegexProperty::Category(category) => buf.push_str(category.as_str()),
            RegexProperty::Script(script, extension) => {
                // JavaScript and Java always get an explicit key; other flavors only when
                // the extension mode was specified.
                let wants_key = matches!(flavor, RegexFlavor::JavaScript | RegexFlavor::Java)
                    || extension != ScriptExtension::Unspecified;
                if wants_key {
                    buf.push_str(match extension {
                        ScriptExtension::Yes => "scx=",
                        _ => "sc=",
                    });
                }
                buf.push_str(script.as_str());
            }
            RegexProperty::Block(block) => match flavor {
                RegexFlavor::DotNet => {
                    let name = block.as_str().replace("_And_", "_and_").replace('_', "");
                    buf.push_str("Is");
                    buf.push_str(&name);
                }
                RegexFlavor::Java => {
                    buf.push_str("In");
                    // Two blocks are written under a different name for Java.
                    match block {
                        CodeBlock::Cyrillic_Supplement => {
                            buf.push_str("Cyrillic_Supplementary")
                        }
                        CodeBlock::Combining_Diacritical_Marks_For_Symbols => {
                            buf.push_str("Combining_Marks_For_Symbols")
                        }
                        other => buf.push_str(&other.as_str().replace('-', "_")),
                    }
                }
                RegexFlavor::Ruby => {
                    buf.push_str("In");
                    buf.push_str(block.as_str());
                }
                _ => panic!("No other flavors support Unicode blocks"),
            },
            RegexProperty::Other(other) => {
                if flavor == RegexFlavor::Java {
                    buf.push_str("Is");
                }
                buf.push_str(other.as_str());
            }
        }

        if braced {
            buf.push('}');
        }
    }
}