use crate::bitmaps::Bitmap;
use boreal_parser::hex_string::{Mask, Token};
use boreal_parser::regex::{
AssertionKind, BracketedClass, BracketedClassItem, ClassKind, Literal, LiteralChar, Node,
PerlClass, PerlClassKind, RepetitionKind, RepetitionRange,
};
use std::ops::Range;
/// Intermediate representation shared by regexes and hex strings.
///
/// Values are built either from a parsed regex AST (`regex_ast_to_hir`) or from hex-string
/// tokens (the `From<Token>` / `From<Vec<Token>>` implementations in this file).
#[derive(Clone, Debug)]
pub enum Hir {
/// Alternation between several sub-expressions.
Alternation(Vec<Hir>),
/// Assertion, carried over unchanged from the regex AST.
Assertion(AssertionKind),
/// Class of bytes, see [`Class`].
Class(Class),
/// Masked byte, produced from masked hex-string tokens.
///
/// NOTE(review): presumably an input byte `b` matches when `(b & mask) == value`, with the
/// result flipped when `negated` is set -- confirm against the matching code.
Mask {
/// Value associated with the mask.
value: u8,
/// Bit mask: `0x0F` or `0xF0` when built by `masked_byte_to_hir`.
mask: u8,
/// True when built from a `NotMaskedByte` token.
negated: bool,
},
/// Sequence of sub-expressions.
Concat(Vec<Hir>),
/// The `.` regex node. Also used for fully masked hex bytes and as the repeated element of
/// hex-string jumps.
Dot,
/// Empty expression.
Empty,
/// Single literal byte.
Literal(u8),
/// Group wrapping a sub-expression.
Group(Box<Hir>),
/// Repetition of a sub-expression.
Repetition {
/// Expression being repeated.
hir: Box<Hir>,
/// Kind of repetition.
kind: RepetitionKind,
/// Greediness flag; hex-string jumps are always built with `false`.
greedy: bool,
},
}
/// Class of bytes, stored both as its parsed definition and as a bitmap.
#[derive(Clone, Debug)]
pub struct Class {
/// Definition of the class, as produced by the parser (or synthesized for hex-string
/// `NotByte` tokens).
pub definition: ClassKind,
/// Bitmap form of the class: bytes of the definition are set in it, and the whole bitmap is
/// inverted when the definition is negated.
pub bitmap: Bitmap,
}
pub(crate) fn regex_ast_to_hir(node: Node, warnings: &mut Vec<RegexAstError>) -> Hir {
match node {
Node::Alternation(v) => Hir::Alternation(
v.into_iter()
.map(|n| regex_ast_to_hir(n, warnings))
.collect(),
),
Node::Assertion(v) => Hir::Assertion(v),
Node::Class(definition) => Hir::Class(Class {
bitmap: class_to_bitmap(&definition, warnings),
definition,
}),
Node::Concat(v) => Hir::Concat(
v.into_iter()
.map(|n| regex_ast_to_hir(n, warnings))
.collect(),
),
Node::Dot => Hir::Dot,
Node::Empty => Hir::Empty,
Node::Literal(lit) => {
let byte = unwrap_literal(&lit, warnings);
Hir::Literal(byte)
}
Node::Group(v) => Hir::Group(Box::new(regex_ast_to_hir(*v, warnings))),
Node::Repetition { node, kind, greedy } => {
match *node {
Node::Char(LiteralChar { c, span, escaped }) => {
if escaped {
warnings.push(RegexAstError::UnknownEscape {
span: span.clone(),
c,
});
}
warnings.push(RegexAstError::NonAsciiChar { span });
let mut enc = vec![0; 4];
let _r = c.encode_utf8(&mut enc);
let len = c.len_utf8();
let mut concat = Vec::with_capacity(len);
for b in &enc[0..(len - 1)] {
concat.push(Hir::Literal(*b));
}
concat.push(Hir::Repetition {
hir: Box::new(Hir::Literal(enc[len - 1])),
kind,
greedy,
});
Hir::Concat(concat)
}
v => Hir::Repetition {
hir: Box::new(regex_ast_to_hir(v, warnings)),
kind,
greedy,
},
}
}
Node::Char(LiteralChar { c, span, escaped }) => {
if escaped {
warnings.push(RegexAstError::UnknownEscape {
span: span.clone(),
c,
});
}
warnings.push(RegexAstError::NonAsciiChar { span });
let mut enc = vec![0; 4];
let res = c.encode_utf8(&mut enc);
Hir::Concat(res.as_bytes().iter().map(|v| Hir::Literal(*v)).collect())
}
}
}
/// Return the byte carried by `lit`, recording a warning if it carries an unknown escape.
///
/// A [`RegexAstError::UnknownEscape`] is appended to `warnings` when the literal is flagged as
/// escaped while its byte is not accepted by `is_meta_character`.
fn unwrap_literal(lit: &Literal, warnings: &mut Vec<RegexAstError>) -> u8 {
    let value = lit.byte;
    let unknown_escape = lit.escaped && !is_meta_character(value);

    if unknown_escape {
        warnings.push(RegexAstError::UnknownEscape {
            span: lit.span.clone(),
            c: char::from(value),
        });
    }

    value
}
/// Return true if `byte` is one of the characters for which an escape is not reported as
/// unknown: `\ / . + * ? ( ) | [ ] { } ^ $ -`.
fn is_meta_character(byte: u8) -> bool {
    // Raw byte string: the leading backslash is a literal backslash, not an escape.
    const META_CHARACTERS: &[u8] = br"\/.+*?()|[]{}^$-";

    META_CHARACTERS.contains(&byte)
}
/// Build the bitmap describing a class definition.
///
/// A perl class is delegated to `perl_class_to_bitmap`. For a bracketed class, the bitmaps of
/// all items are accumulated (perl classes, single literals and inclusive literal ranges),
/// and the result is inverted if the class is negated. Warnings raised while unwrapping the
/// literals are appended to `warnings`.
fn class_to_bitmap(class_kind: &ClassKind, warnings: &mut Vec<RegexAstError>) -> Bitmap {
    let (items, negated) = match class_kind {
        ClassKind::Perl(perl) => return perl_class_to_bitmap(perl),
        ClassKind::Bracketed(BracketedClass { items, negated }) => (items, *negated),
    };

    let mut bitmap = Bitmap::new();
    for item in items {
        match item {
            BracketedClassItem::Perl(perl) => {
                bitmap |= perl_class_to_bitmap(perl);
            }
            BracketedClassItem::Literal(lit) => {
                bitmap.set(unwrap_literal(lit, warnings));
            }
            BracketedClassItem::Range(start, end) => {
                // Both bounds are unwrapped before iterating, start first, so that warnings
                // are reported in source order.
                let first = unwrap_literal(start, warnings);
                let last = unwrap_literal(end, warnings);
                (first..=last).for_each(|byte| {
                    bitmap.set(byte);
                });
            }
        }
    }

    if negated {
        bitmap.invert();
    }
    bitmap
}
/// Build the bitmap describing a perl class.
///
/// - Word: `0-9`, `A-Z`, `_` and `a-z`.
/// - Space: tab, line feed, vertical tab, form feed, carriage return and space.
/// - Digit: `0-9`.
///
/// The bitmap is inverted if the class is negated.
fn perl_class_to_bitmap(cls: &PerlClass) -> Bitmap {
    const SPACE_BYTES: [u8; 6] = [b'\t', b'\n', b'\x0B', b'\x0C', b'\r', b' '];

    let mut bitmap = Bitmap::new();

    match cls.kind {
        PerlClassKind::Word => {
            let word_bytes = (b'0'..=b'9')
                .chain(b'A'..=b'Z')
                .chain(std::iter::once(b'_'))
                .chain(b'a'..=b'z');
            for byte in word_bytes {
                bitmap.set(byte);
            }
        }
        PerlClassKind::Space => {
            for byte in SPACE_BYTES {
                bitmap.set(byte);
            }
        }
        PerlClassKind::Digit => {
            for byte in b'0'..=b'9' {
                bitmap.set(byte);
            }
        }
    }

    if cls.negated {
        bitmap.invert();
    }
    bitmap
}
/// A list of hex-string tokens is converted into the concatenation of the converted tokens.
impl From<Vec<Token>> for Hir {
    fn from(tokens: Vec<Token>) -> Self {
        let parts: Vec<Hir> = tokens.into_iter().map(Hir::from).collect();
        Self::Concat(parts)
    }
}
impl From<Token> for Hir {
fn from(token: Token) -> Self {
match token {
Token::Byte(b) => Hir::Literal(b),
Token::NotByte(b) => {
let mut bitmap = Bitmap::new();
bitmap.set(b);
bitmap.invert();
Hir::Class(Class {
definition: ClassKind::Bracketed(BracketedClass {
items: vec![BracketedClassItem::Literal(Literal {
byte: b,
span: 0..1,
escaped: false,
})],
negated: true,
}),
bitmap,
})
}
Token::MaskedByte(b, mask) => masked_byte_to_hir(b, &mask, false),
Token::NotMaskedByte(b, mask) => masked_byte_to_hir(b, &mask, true),
Token::Jump(jump) => {
let kind = match (jump.from, jump.to) {
(from, None) => RepetitionKind::Range(RepetitionRange::AtLeast(from)),
(from, Some(to)) => RepetitionKind::Range(RepetitionRange::Bounded(from, to)),
};
Hir::Repetition {
hir: Box::new(Hir::Dot),
kind,
greedy: false,
}
}
Token::Alternatives(elems) => Hir::Group(Box::new(Hir::Alternation(
elems.into_iter().map(Into::into).collect(),
))),
}
}
}
/// Convert a masked hex-string byte into its [`Hir`] form.
///
/// - `Mask::Left`: `byte` is kept as is, with the mask `0x0F`.
/// - `Mask::Right`: `byte` is shifted into the high nibble, with the mask `0xF0`.
/// - `Mask::All`: the result is [`Hir::Dot`], and both `byte` and `negated` are ignored.
fn masked_byte_to_hir(byte: u8, mask: &Mask, negated: bool) -> Hir {
    let (value, mask) = match mask {
        Mask::All => return Hir::Dot,
        Mask::Left => (byte, 0x0F),
        Mask::Right => (byte << 4, 0xF0),
    };

    Hir::Mask {
        value,
        mask,
        negated,
    }
}
/// Issues found while converting a regex AST into a [`Hir`].
///
/// In this file, values of this type are only ever pushed into a list of warnings: they do not
/// interrupt the conversion.
#[derive(Clone, Debug)]
pub enum RegexAstError {
/// A `Node::Char` was found in the regex; it is converted into its UTF-8 bytes.
NonAsciiChar {
/// Span of the character, as provided by the parser.
span: Range<usize>,
},
/// An escape was used on a character that is not a known meta character.
UnknownEscape {
/// Span of the escaped element, as provided by the parser.
span: Range<usize>,
/// Character that was escaped.
c: char,
},
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_helpers::test_type_traits;

    /// Run the crate's `test_type_traits` helper on a value of each type declared here.
    #[test]
    fn test_types_traits() {
        test_type_traits(Hir::Empty);

        let word = PerlClass {
            kind: PerlClassKind::Word,
            negated: false,
        };
        let class = Class {
            definition: ClassKind::Perl(word),
            bitmap: Bitmap::new(),
        };
        test_type_traits(class);

        let error = RegexAstError::NonAsciiChar { span: 0..1 };
        test_type_traits(error);
    }
}