use std::error::Error as StdError;
use std::fmt::Write;
use std::ops::Range;
use regex_automata::{meta, util::syntax, Input};
use boreal_parser::regex::{
AssertionKind, BracketedClass, BracketedClassItem, ClassKind, Literal, PerlClass,
PerlClassKind, RepetitionKind, RepetitionRange,
};
mod hir;
pub use hir::*;
mod visitor;
pub(crate) use visitor::{visit, VisitAction, Visitor};
/// A compiled regular expression together with the expression text it was built from.
#[derive(Clone, Debug)]
pub struct Regex {
/// Compiled matcher used for every search.
meta: meta::Regex,
/// Expression text passed to `from_string`; exposed through `as_str`.
expr: String,
/// Case-insensitivity flag the regex was built with (only kept with the `serialize` feature).
#[cfg(feature = "serialize")]
case_insensitive: bool,
/// Dot-matches-newline flag the regex was built with (only kept with the `serialize` feature).
#[cfg(feature = "serialize")]
dot_all: bool,
}
/// Two regexes are equal when they were built from the same expression text and the same
/// flags; the compiled matcher itself is not compared.
#[cfg(feature = "serialize")]
impl PartialEq for Regex {
    fn eq(&self, other: &Self) -> bool {
        // Cheap flag comparisons first, then the expression text.
        if self.case_insensitive != other.case_insensitive {
            return false;
        }
        if self.dot_all != other.dot_all {
            return false;
        }
        self.expr == other.expr
    }
}
impl Regex {
    /// Compiles `expr` into a [`Regex`].
    ///
    /// `case_insensitive` and `dot_all` are forwarded to [`Regex::builder`].
    ///
    /// # Errors
    ///
    /// Returns an [`Error`] when the underlying builder rejects the expression.
    pub(crate) fn from_string(
        expr: String,
        case_insensitive: bool,
        dot_all: bool,
    ) -> Result<Self, Error> {
        // `?` converts the `meta::BuildError` through `From<meta::BuildError> for Error`.
        let meta = Self::builder(case_insensitive, dot_all).build(&expr)?;

        Ok(Self {
            meta,
            expr,
            #[cfg(feature = "serialize")]
            case_insensitive,
            #[cfg(feature = "serialize")]
            dot_all,
        })
    }

    /// Returns a `meta::Builder` configured for byte-oriented matching.
    ///
    /// The syntax configuration turns off octal escapes, Unicode mode, the UTF-8 requirement
    /// and multi-line mode; `case_insensitive` and `dot_all` are applied as given. The matcher
    /// configuration sets `utf8_empty(false)`.
    pub(crate) fn builder(case_insensitive: bool, dot_all: bool) -> meta::Builder {
        let syntax_config = syntax::Config::new()
            .octal(false)
            .unicode(false)
            .utf8(false)
            .multi_line(false)
            .case_insensitive(case_insensitive)
            .dot_matches_new_line(dot_all);

        let mut builder = meta::Builder::new();
        // The configuration calls hand back `&mut Builder`; only the side effect is needed.
        let _b = builder
            .configure(meta::Config::new().utf8_empty(false))
            .syntax(syntax_config);
        builder
    }

    /// Searches the whole `haystack` and returns the byte range of the match found, if any.
    #[must_use]
    pub fn find(&self, haystack: &[u8]) -> Option<Range<usize>> {
        self.find_in_input(Input::new(haystack))
    }

    /// Searches `haystack` with the search span restricted to `offset..haystack.len()`.
    ///
    /// Returns `None` when there is no match, including when `offset` lies beyond the end of
    /// `haystack`.
    #[must_use]
    pub fn find_at(&self, haystack: &[u8], offset: usize) -> Option<Range<usize>> {
        // `Input::span` panics when the bounds do not fit the haystack. Nothing can match
        // past the end, so an out-of-range offset is reported as "no match" instead.
        if offset > haystack.len() {
            return None;
        }
        self.find_in_input(Input::new(haystack).span(offset..haystack.len()))
    }

    /// Runs the compiled matcher on a prepared `Input` and converts the match into a range.
    #[must_use]
    fn find_in_input(&self, input: Input) -> Option<Range<usize>> {
        self.meta.find(input).map(|m| m.range())
    }

    /// Returns whether the regex matches anywhere in `mem`.
    #[must_use]
    pub fn is_match(&self, mem: &[u8]) -> bool {
        self.meta.is_match(mem)
    }

    /// Returns the expression text this regex was built from.
    #[must_use]
    pub fn as_str(&self) -> &str {
        &self.expr
    }
}
pub(crate) fn regex_hir_to_string(hir: &Hir) -> String {
visit(hir, AstPrinter::default())
}
/// Visitor that renders a `Hir` back into regex source text.
#[derive(Default)]
struct AstPrinter {
/// Output text accumulated while the tree is walked.
res: String,
}
impl Visitor for AstPrinter {
    type Output = String;

    /// Emits everything that is printed before a node's children: assertions, masks,
    /// classes, dots, literals and the opening parenthesis of a group.
    fn visit_pre(&mut self, node: &Hir) -> VisitAction {
        match node {
            Hir::Assertion(kind) => self.res.push_str(match kind {
                AssertionKind::StartLine => "^",
                AssertionKind::EndLine => "$",
                AssertionKind::WordBoundary => r"\b",
                AssertionKind::NonWordBoundary => r"\B",
            }),
            Hir::Mask {
                value,
                mask,
                negated,
            } => {
                // Both mask shapes are printed as a bracketed class.
                self.res.push('[');
                if *negated {
                    self.res.push('^');
                }
                if *mask == 0xF0 {
                    // High nibble fixed: a single range covering every low nibble.
                    self.push_literal(*value);
                    self.res.push('-');
                    self.push_literal(*value | 0x0F);
                } else {
                    // Otherwise list `value` combined with each of the 16 high nibbles.
                    for high in 0u8..16 {
                        self.push_literal((high << 4) | *value);
                    }
                }
                self.res.push(']');
            }
            Hir::Class(Class { definition, .. }) => match definition {
                ClassKind::Perl(perl) => self.push_perl_class(perl),
                ClassKind::Bracketed(bracketed) => self.push_bracketed_class(bracketed),
            },
            Hir::Dot => self.res.push('.'),
            Hir::Literal(byte) => self.push_literal(*byte),
            Hir::Group(_) => self.res.push('('),
            // Nothing to print ahead of the children for these nodes.
            Hir::Alternation(_) | Hir::Concat(_) | Hir::Empty | Hir::Repetition { .. } => (),
        }

        VisitAction::Continue
    }

    /// Emits everything that is printed after a node's children: the closing parenthesis of
    /// a group and the repetition operator with its optional lazy marker.
    fn visit_post(&mut self, node: &Hir) {
        match node {
            Hir::Group(_) => self.res.push(')'),
            Hir::Repetition { kind, greedy, .. } => {
                match kind {
                    RepetitionKind::ZeroOrOne => self.res.push('?'),
                    RepetitionKind::ZeroOrMore => self.res.push('*'),
                    RepetitionKind::OneOrMore => self.res.push('+'),
                    // The result of formatting into the output string is deliberately ignored.
                    RepetitionKind::Range(RepetitionRange::Exactly(n)) => {
                        let _r = write!(self.res, "{{{n}}}");
                    }
                    RepetitionKind::Range(RepetitionRange::AtLeast(n)) => {
                        let _r = write!(self.res, "{{{n},}}");
                    }
                    RepetitionKind::Range(RepetitionRange::Bounded(n, m)) => {
                        let _r = write!(self.res, "{{{n},{m}}}");
                    }
                }
                // A non-greedy repetition carries a trailing `?`.
                if !*greedy {
                    self.res.push('?');
                }
            }
            Hir::Alternation(_)
            | Hir::Assertion(_)
            | Hir::Mask { .. }
            | Hir::Class(_)
            | Hir::Concat(_)
            | Hir::Dot
            | Hir::Empty
            | Hir::Literal(_) => (),
        }
    }

    /// Emits the `|` separator (presumably invoked by the visitor between the branches of an
    /// alternation; confirm against the `Visitor` trait).
    fn visit_alternation_in(&mut self) {
        self.res.push('|');
    }

    /// Hands back the accumulated text.
    fn finish(self) -> Self::Output {
        self.res
    }
}
impl AstPrinter {
    /// Appends a single byte, either verbatim or as a `\xNN` escape.
    ///
    /// A byte is written verbatim only when it is a space or a graphic ASCII character and
    /// `regex_syntax` does not consider it a meta character.
    fn push_literal(&mut self, lit: u8) {
        let ch = char::from(lit);
        // `is_ascii_graphic` already covers ASCII alphanumerics and punctuation.
        let printable = lit == b' ' || lit.is_ascii_graphic();

        if printable && !regex_syntax::is_meta_character(ch) {
            self.res.push(ch);
        } else {
            // The result of formatting into the output string is deliberately ignored.
            let _r = write!(self.res, r"\x{lit:02x}");
        }
    }

    /// Appends the escape sequence of a perl class, upper-cased when the class is negated.
    fn push_perl_class(&mut self, cls: &PerlClass) {
        let escape = match (&cls.kind, cls.negated) {
            (PerlClassKind::Word, false) => r"\w",
            (PerlClassKind::Word, true) => r"\W",
            (PerlClassKind::Space, false) => r"\s",
            (PerlClassKind::Space, true) => r"\S",
            (PerlClassKind::Digit, false) => r"\d",
            (PerlClassKind::Digit, true) => r"\D",
        };
        self.res.push_str(escape);
    }

    /// Appends a bracketed class: `[`, an optional `^`, every item in order, then `]`.
    fn push_bracketed_class(&mut self, cls: &BracketedClass) {
        self.res.push_str(if cls.negated { "[^" } else { "[" });

        for item in cls.items.iter() {
            match item {
                BracketedClassItem::Perl(perl) => self.push_perl_class(perl),
                BracketedClassItem::Literal(single) => self.push_literal(single.byte),
                BracketedClassItem::Range(from, to) => {
                    self.push_literal(from.byte);
                    self.res.push('-');
                    self.push_literal(to.byte);
                }
            }
        }

        self.res.push(']');
    }
}
/// Error produced when a regex fails to build; wraps a human-readable message.
#[derive(Clone, Debug)]
pub struct Error(String);
/// Converts a `meta::BuildError`, using a dedicated message when the error reports a size
/// limit and the error's own text otherwise.
impl From<meta::BuildError> for Error {
    fn from(err: meta::BuildError) -> Self {
        let message = match err.size_limit() {
            Some(size_limit) => {
                format!("Compiled regex exceeds size limit of {size_limit} bytes.",)
            }
            None => err.to_string(),
        };
        Self(message)
    }
}
/// Converts a `hybrid::BuildError`. When its source is an NFA build error that reports a size
/// limit, a dedicated message is used; otherwise the error's own text is kept.
impl From<regex_automata::hybrid::BuildError> for Error {
    fn from(err: regex_automata::hybrid::BuildError) -> Self {
        // Dig the size limit out of the underlying NFA error, if there is one.
        let size_limit = err
            .source()
            .and_then(|source| source.downcast_ref::<regex_automata::nfa::thompson::BuildError>())
            .and_then(|nfa_err| nfa_err.size_limit());

        match size_limit {
            Some(size_limit) => Self(format!(
                "Compiled regex exceeds size limit of {size_limit} bytes.",
            )),
            None => Self(err.to_string()),
        }
    }
}
/// Displays the wrapped message as-is.
impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Fully qualified on purpose: method-call syntax (`self.0.fmt(f)`) only resolves when
        // `Display` is imported, and becomes ambiguous as soon as `Debug` is in scope too.
        std::fmt::Display::fmt(&self.0, f)
    }
}
// Empty impl: `Error` implements the standard error trait with the trait's default methods.
impl std::error::Error for Error {}
/// Wire (de)serialization of [`Regex`]: the two flags followed by the expression text.
#[cfg(feature = "serialize")]
mod wire {
    use std::io;

    use crate::wire::{Deserialize, Serialize};

    use super::Regex;

    impl Serialize for Regex {
        /// Writes `case_insensitive`, then `dot_all`, then the expression text.
        fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
            let Regex {
                expr,
                case_insensitive,
                dot_all,
                ..
            } = self;

            case_insensitive.serialize(writer)?;
            dot_all.serialize(writer)?;
            expr.serialize(writer)
        }
    }

    impl Deserialize for Regex {
        /// Reads the fields in the order they were written and recompiles the regex.
        ///
        /// A compilation failure is reported as `io::ErrorKind::InvalidData`.
        fn deserialize_reader<R: io::Read>(reader: &mut R) -> io::Result<Self> {
            let case_insensitive = bool::deserialize_reader(reader)?;
            let dot_all = bool::deserialize_reader(reader)?;
            let expr = String::deserialize_reader(reader)?;

            match Regex::from_string(expr, case_insensitive, dot_all) {
                Ok(regex) => Ok(regex),
                Err(err) => Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!("invalid regex expression: {err:?}"),
                )),
            }
        }
    }

    #[cfg(test)]
    mod tests {
        use super::*;
        use crate::wire::tests::{test_invalid_deserialization, test_round_trip};

        #[test]
        fn test_wire_regex() {
            let regex = Regex::from_string("abc".to_owned(), false, true).unwrap();
            test_round_trip(&regex, &[0, 1, 2]);

            // Both flags false, then a one-byte expression "[" that does not compile.
            test_invalid_deserialization::<Regex>(b"\x00\x00\x01\x00\x00\x00[");
        }
    }
}
/// Unit tests for the regex printer and the `Regex` wrapper.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_helpers::{expr_to_hir, test_type_traits};

    /// Parses regex expressions to a `Hir` and checks the text printed back from it.
    #[test]
    fn test_regex_conversion() {
        // `None` means the printed form must be identical to the input expression.
        #[track_caller]
        fn test(expr: &str, expected_res: Option<&str>) {
            let hir = expr_to_hir(expr);
            assert_eq!(regex_hir_to_string(&hir), expected_res.unwrap_or(expr));
        }

        // Expressions that print back unchanged.
        test("^a.d+$", None);
        test(r"\s?\S??\w*(\W*?\d+?\D\b)+", None);
        test(r"(\ba\B[a\w]|a(b|cd)t[^a-z])", None);

        // Bracketed classes: meta characters are printed as hex escapes.
        test(
            r"[]] [^].[^] [!---]",
            Some(r"[\x5d] [^\x2e\x5b\x5e\x5d] [!-\x2d\x2d]"),
        );
        test(
            r"[|\\.+*?()\]{}^$#&\-~]",
            Some(r"[\x7c\x5c\x2e\x2b\x2a\x3f\x28\x29\x5d\x7b\x7d\x5e\x24\x23\x26\x2d\x7e]"),
        );
        test(
            r"[\|\\\.\+\*\?\(\)\]\{\}\^\$\#\&\-\~]",
            Some(r"[\x7c\x5c\x2e\x2b\x2a\x3f\x28\x29\x5d\x7b\x7d\x5e\x24\x23\x26\x2d\x7e]"),
        );

        // Escaped meta characters outside of a class.
        test(
            r"\|\\\.\+\*\?\(\)\]\{\}\^\$\#\&\-\~\[",
            Some(r"\x7c\x5c\x2e\x2b\x2a\x3f\x28\x29\x5d\x7b\x7d\x5e\x24\x23\x26\x2d\x7e\x5b"),
        );

        // Escapes of ordinary characters print as the bare character.
        test(r#"\k\i\z\p\P\"\A\z"#, Some(r#"kizpP"Az"#));

        // Repetition ranges, plus brace sequences that are not repetitions.
        test(
            r"a{0} b{1,} c{,2} d{3,4} e{} f{*} g{1,h}",
            Some(r"a{0} b{1,} c{0,2} d{3,4} e\x7b\x7d f\x7b*\x7d g\x7b1,h\x7d"),
        );
        test(
            r#" {"Hosts":\[".{10,512}"\],"Proxy":".{0,512}","Version":".{1,32}","Guid":""#,
            Some(
                r#" \x7b"Hosts":\x5b".{10,512}"\x5d,"Proxy":".{0,512}","Version":".{1,32}","Guid":""#,
            ),
        );
    }

    /// Parses hex strings to a `Hir` and checks the regex text printed from it.
    #[test]
    fn test_hex_string_to_regex() {
        #[track_caller]
        fn test(expr: &str, expected_regex: &str) {
            let hir = expr_to_hir(expr);
            assert_eq!(&regex_hir_to_string(&hir), expected_regex);
        }

        test(
            "{ AB ?D 01 }",
            r"\xab[\x0d\x1d\x2d=M\x5dm\x7d\x8d\x9d\xad\xbd\xcd\xdd\xed\xfd]\x01",
        );
        test("{ C7 [-] ?? }", r"\xc7.{0,}?.");
        test(
            "{ C7 [3-] 5? 03 [-6] C7 ( FF 15 | E8 ) [4] 6A ( FF D? | E8 [2-4] ??) }",
            r"\xc7.{3,}?[P-_]\x03.{0,6}?\xc7(\xff\x15|\xe8).{4,4}?j(\xff[\xd0-\xdf]|\xe8.{2,4}?.)",
        );
    }

    /// `as_str` returns the expression text the regex was built from.
    #[test]
    fn test_regex_as_str() {
        let expr = r"^a+b\wc";
        let regex = Regex::from_string(expr.to_owned(), false, false).unwrap();
        assert_eq!(regex.as_str(), expr);
    }

    /// Runs the shared trait checks on the public types of this module.
    #[test]
    fn test_types_traits() {
        test_type_traits(Regex::from_string("a".to_owned(), false, false));
        test_type_traits(Error("a".to_owned()));
    }
}