use alloc::{boxed::Box, string::String, vec, vec::Vec};
use crate::{error::Error, utf8};
mod parse;
/// Returns a copy of `pattern` in which every regex meta character is
/// preceded by a backslash.
///
/// Characters for which `is_meta_character` returns `false` are copied
/// through unchanged.
pub fn escape(pattern: &str) -> String {
    // The output is at least as long as the input, so size the buffer for
    // that up front.
    let escaped = String::with_capacity(pattern.len());
    pattern.chars().fold(escaped, |mut out, ch| {
        if is_meta_character(ch) {
            out.push('\\');
        }
        out.push(ch);
        out
    })
}
/// Returns true if `c` is one of the characters this module treats as a
/// regex meta character (the set that `escape` prefixes with a backslash).
fn is_meta_character(c: char) -> bool {
    matches!(
        c,
        '\\' | '.'
            | '+'
            | '*'
            | '?'
            | '('
            | ')'
            | '|'
            | '['
            | ']'
            | '{'
            | '}'
            | '^'
            | '$'
            | '#'
            | '&'
            | '-'
            | '~'
    )
}
/// Returns true if `c` may be written after a backslash.
///
/// Every meta character qualifies. Beyond those, only ASCII characters
/// qualify, and among ASCII characters the letters, the digits and the two
/// angle brackets `<` / `>` are excluded.
fn is_escapable_character(c: char) -> bool {
    if is_meta_character(c) {
        return true;
    }
    c.is_ascii() && !c.is_ascii_alphanumeric() && c != '<' && c != '>'
}
/// Configuration passed to `Hir::parse` alongside the pattern.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Config {
/// Nesting limit; the `Default` impl in this file sets it to 50.
/// NOTE(review): presumably the maximum nesting depth the parser accepts
/// before reporting an error - confirm against the `parse` module.
pub(crate) nest_limit: u32,
/// The set of boolean syntax flags in effect.
pub(crate) flags: Flags,
}
impl Default for Config {
    /// Builds the default configuration: a nesting limit of 50 and every
    /// flag at its own default value.
    fn default() -> Config {
        let flags = Flags::default();
        Config { flags, nest_limit: 50 }
    }
}
/// Boolean switches carried inside `Config`.
///
/// `Default` is derived, so every flag starts out `false`.
///
/// NOTE(review): this file only stores the flags; the per-field notes below
/// are inferred from the field names and should be confirmed against the
/// `parse` module, which is where they are presumably interpreted.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct Flags {
/// Presumably enables case-insensitive matching - confirm.
pub(crate) case_insensitive: bool,
/// Presumably makes line anchors match at line boundaries - confirm.
pub(crate) multi_line: bool,
/// Presumably lets `.` match a line terminator - confirm.
pub(crate) dot_matches_new_line: bool,
/// Presumably inverts the default greediness of repetitions - confirm.
pub(crate) swap_greed: bool,
/// Presumably selects CRLF-aware line anchors - confirm.
pub(crate) crlf: bool,
/// Presumably enables whitespace-insensitive pattern syntax - confirm.
pub(crate) ignore_whitespace: bool,
}
/// A regex expression tree node together with a few properties that are
/// computed once, when the node is constructed by the `impl Hir` builders.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Hir {
/// What kind of expression this is, including any sub-expressions it owns.
kind: HirKind,
/// True for a `Look::Start` assertion; the builders propagate it through
/// captures, through repetitions with `min > 0`, from the first element of
/// a concatenation, and through an alternation only when every branch has
/// it set.
is_start_anchored: bool,
/// True when the builders determined that this expression can match the
/// empty string (e.g. `Empty`, any look-around, a repetition with
/// `min == 0`, a concatenation whose parts all can, or an alternation
/// where any branch can).
is_match_empty: bool,
/// The number of capture groups contained in this expression when that
/// number is the same for every possible match, and `None` when the
/// builders could not establish a single fixed count.
static_explicit_captures_len: Option<usize>,
}
/// The different kinds of expression an `Hir` node can be.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) enum HirKind {
/// The empty expression; it has no sub-expressions and is built with
/// `is_match_empty` set.
Empty,
/// A single literal character.
Char(char),
/// A character class, i.e. a set of character ranges. A class with no
/// ranges is what `Hir::fail` produces.
Class(Class),
/// A look-around assertion (see `Look`).
Look(Look),
/// A repetition of one sub-expression.
Repetition(Repetition),
/// A capture group wrapping one sub-expression.
Capture(Capture),
/// A concatenation of sub-expressions.
Concat(Vec<Hir>),
/// An alternation between sub-expressions.
Alternation(Vec<Hir>),
}
impl Hir {
pub(crate) fn parse(config: Config, pattern: &str) -> Result<Hir, Error> {
self::parse::Parser::new(config, pattern).parse()
}
pub(crate) fn kind(&self) -> &HirKind {
&self.kind
}
pub(crate) fn is_start_anchored(&self) -> bool {
self.is_start_anchored
}
pub(crate) fn is_match_empty(&self) -> bool {
self.is_match_empty
}
pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> {
self.static_explicit_captures_len
}
fn fail() -> Hir {
let kind = HirKind::Class(Class { ranges: vec![] });
Hir {
kind,
is_start_anchored: false,
is_match_empty: false,
static_explicit_captures_len: Some(0),
}
}
fn empty() -> Hir {
let kind = HirKind::Empty;
Hir {
kind,
is_start_anchored: false,
is_match_empty: true,
static_explicit_captures_len: Some(0),
}
}
fn char(ch: char) -> Hir {
let kind = HirKind::Char(ch);
Hir {
kind,
is_start_anchored: false,
is_match_empty: false,
static_explicit_captures_len: Some(0),
}
}
fn class(class: Class) -> Hir {
let kind = HirKind::Class(class);
Hir {
kind,
is_start_anchored: false,
is_match_empty: false,
static_explicit_captures_len: Some(0),
}
}
fn look(look: Look) -> Hir {
let kind = HirKind::Look(look);
Hir {
kind,
is_start_anchored: matches!(look, Look::Start),
is_match_empty: true,
static_explicit_captures_len: Some(0),
}
}
fn repetition(rep: Repetition) -> Hir {
if rep.min == 0 && rep.max == Some(0) {
return Hir::empty();
} else if rep.min == 1 && rep.max == Some(1) {
return *rep.sub;
}
let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored;
let is_match_empty = rep.min == 0 || rep.sub.is_match_empty;
let mut static_explicit_captures_len =
rep.sub.static_explicit_captures_len;
if rep.min == 0
&& static_explicit_captures_len.map_or(false, |len| len > 0)
{
if rep.max == Some(0) {
static_explicit_captures_len = Some(0);
} else {
static_explicit_captures_len = None;
}
}
Hir {
kind: HirKind::Repetition(rep),
is_start_anchored,
is_match_empty,
static_explicit_captures_len,
}
}
fn capture(cap: Capture) -> Hir {
let is_start_anchored = cap.sub.is_start_anchored;
let is_match_empty = cap.sub.is_match_empty;
let static_explicit_captures_len = cap
.sub
.static_explicit_captures_len
.map(|len| len.saturating_add(1));
let kind = HirKind::Capture(cap);
Hir {
kind,
is_start_anchored,
is_match_empty,
static_explicit_captures_len,
}
}
fn concat(mut subs: Vec<Hir>) -> Hir {
if subs.is_empty() {
Hir::empty()
} else if subs.len() == 1 {
subs.pop().unwrap()
} else {
let is_start_anchored = subs[0].is_start_anchored;
let mut is_match_empty = true;
let mut static_explicit_captures_len = Some(0usize);
for sub in subs.iter() {
is_match_empty = is_match_empty && sub.is_match_empty;
static_explicit_captures_len = static_explicit_captures_len
.and_then(|len1| {
Some((len1, sub.static_explicit_captures_len?))
})
.and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
}
Hir {
kind: HirKind::Concat(subs),
is_start_anchored,
is_match_empty,
static_explicit_captures_len,
}
}
}
fn alternation(mut subs: Vec<Hir>) -> Hir {
if subs.is_empty() {
Hir::fail()
} else if subs.len() == 1 {
subs.pop().unwrap()
} else {
let mut it = subs.iter().peekable();
let mut is_start_anchored =
it.peek().map_or(false, |sub| sub.is_start_anchored);
let mut is_match_empty =
it.peek().map_or(false, |sub| sub.is_match_empty);
let mut static_explicit_captures_len =
it.peek().and_then(|sub| sub.static_explicit_captures_len);
for sub in it {
is_start_anchored = is_start_anchored && sub.is_start_anchored;
is_match_empty = is_match_empty || sub.is_match_empty;
if static_explicit_captures_len
!= sub.static_explicit_captures_len
{
static_explicit_captures_len = None;
}
}
Hir {
kind: HirKind::Alternation(subs),
is_start_anchored,
is_match_empty,
static_explicit_captures_len,
}
}
}
}
impl HirKind {
    /// Returns the direct sub-expressions of this kind as a slice.
    ///
    /// Kinds without children yield an empty slice, repetitions and captures
    /// yield a one-element slice, and concatenations and alternations yield
    /// all of their elements.
    fn subs(&self) -> &[Hir] {
        match self {
            HirKind::Empty
            | HirKind::Char(_)
            | HirKind::Class(_)
            | HirKind::Look(_) => &[],
            HirKind::Repetition(rep) => core::slice::from_ref(&*rep.sub),
            HirKind::Capture(cap) => core::slice::from_ref(&*cap.sub),
            HirKind::Concat(children) | HirKind::Alternation(children) => {
                children.as_slice()
            }
        }
    }
}
/// A set of characters, stored as a list of inclusive character ranges.
///
/// `Class::new` and `Class::ascii_case_fold` leave the ranges in canonical
/// form: sorted, and with no two neighbouring ranges overlapping or adjacent.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Class {
    /// The ranges making up this class.
    pub(crate) ranges: Vec<ClassRange>,
}

impl Class {
    /// Builds a class from the given ranges and canonicalizes it.
    fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
        let mut class = Class { ranges: ranges.into_iter().collect() };
        class.canonicalize();
        class
    }

    /// Extends this class so that every ASCII letter it contains is present
    /// in both cases, then re-canonicalizes.
    ///
    /// `ClassRange::ascii_case_fold` yields at most one folded range and
    /// stops at the first letter block it overlaps. A range such as `A-f`
    /// overlaps both `a-z` and `A-Z`, so each range is first clamped to each
    /// letter block and every non-empty piece is folded on its own. Without
    /// that, the uppercase part of a spanning range would never contribute
    /// its lowercase counterparts.
    fn ascii_case_fold(&mut self) {
        const LETTER_BLOCKS: [(char, char); 2] = [('a', 'z'), ('A', 'Z')];

        // Only the ranges present on entry are examined; folded ranges are
        // appended behind them and merged by `canonicalize` below.
        let len = self.ranges.len();
        for i in 0..len {
            let range = self.ranges[i];
            for &(lo, hi) in LETTER_BLOCKS.iter() {
                let start = range.start.max(lo);
                let end = range.end.min(hi);
                if start > end {
                    // No overlap with this letter block.
                    continue;
                }
                let piece = ClassRange { start, end };
                if let Some(folded) = piece.ascii_case_fold() {
                    self.ranges.push(folded);
                }
            }
        }
        self.canonicalize();
    }

    /// Replaces this class with its complement over all `char` values.
    ///
    /// The gap computation walks neighbouring ranges in order, so it relies
    /// on the ranges already being canonical.
    fn negate(&mut self) {
        const MIN: char = '\x00';
        const MAX: char = char::MAX;

        if self.ranges.is_empty() {
            self.ranges.push(ClassRange { start: MIN, end: MAX });
            return;
        }

        // The complement is appended after the existing ranges, which are
        // drained away at the end.
        let drain_end = self.ranges.len();

        // Everything below the first range.
        if self.ranges[0].start > MIN {
            self.ranges.push(ClassRange {
                start: MIN,
                // `start > MIN`, so a previous char exists.
                end: prev_char(self.ranges[0].start).unwrap(),
            });
        }
        // The gaps between consecutive ranges.
        for i in 1..drain_end {
            self.ranges.push(ClassRange {
                // Range `i - 1` is followed by another range, so its end is
                // below `char::MAX` and has a successor.
                start: next_char(self.ranges[i - 1].end).unwrap(),
                // Range `i` is preceded by another range, so its start is
                // above `'\0'` and has a predecessor.
                end: prev_char(self.ranges[i].start).unwrap(),
            });
        }
        // Everything above the last range.
        if self.ranges[drain_end - 1].end < MAX {
            self.ranges.push(ClassRange {
                start: next_char(self.ranges[drain_end - 1].end).unwrap(),
                end: MAX,
            });
        }
        self.ranges.drain(..drain_end);
    }

    /// Sorts the ranges and merges every pair that overlaps or is adjacent.
    /// Does nothing when the class is already canonical.
    fn canonicalize(&mut self) {
        if self.is_canonical() {
            return;
        }
        // `ClassRange` is `Copy` and totally ordered, so equal elements are
        // indistinguishable and an unstable sort gives the same result.
        self.ranges.sort_unstable();
        // A non-canonical class has at least two ranges.
        assert!(!self.ranges.is_empty());

        // The merged ranges are appended after the existing ones, which are
        // drained away at the end.
        let drain_end = self.ranges.len();
        for oldi in 0..drain_end {
            // Once at least one merged range exists, try to fold the current
            // range into the most recently appended one.
            if self.ranges.len() > drain_end {
                let (last, rest) = self.ranges.split_last_mut().unwrap();
                if let Some(union) = last.union(&rest[oldi]) {
                    *last = union;
                    continue;
                }
            }
            let range = self.ranges[oldi];
            self.ranges.push(range);
        }
        self.ranges.drain(..drain_end);
    }

    /// Returns true if the ranges are strictly increasing and no two
    /// neighbouring ranges overlap or touch.
    fn is_canonical(&self) -> bool {
        for pair in self.ranges.windows(2) {
            if pair[0] >= pair[1] {
                return false;
            }
            if pair[0].is_contiguous(&pair[1]) {
                return false;
            }
        }
        true
    }
}
/// An inclusive range of characters, `start..=end`.
///
/// Ranges order by `start` first and then by `end` (derived ordering).
#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)]
pub(crate) struct ClassRange {
    /// First character in the range.
    pub(crate) start: char,
    /// Last character in the range (inclusive).
    pub(crate) end: char,
}

impl ClassRange {
    /// Returns the opposite-case counterpart of the ASCII letters in this
    /// range, or `None` if the range contains no ASCII letter.
    ///
    /// Only one range is produced: if the range overlaps `a-z`, the
    /// uppercase form of that overlap is returned and any overlap with `A-Z`
    /// is not considered. Otherwise the lowercase form of the overlap with
    /// `A-Z` is returned.
    fn ascii_case_fold(&self) -> Option<ClassRange> {
        const LOWER: ClassRange = ClassRange { start: 'a', end: 'z' };
        const UPPER: ClassRange = ClassRange { start: 'A', end: 'Z' };

        if !LOWER.is_intersection_empty(self) {
            let first = self.start.max(LOWER.start);
            let last = self.end.min(LOWER.end);
            Some(ClassRange {
                start: first.to_ascii_uppercase(),
                end: last.to_ascii_uppercase(),
            })
        } else if !UPPER.is_intersection_empty(self) {
            let first = self.start.max(UPPER.start);
            let last = self.end.min(UPPER.end);
            Some(ClassRange {
                start: first.to_ascii_lowercase(),
                end: last.to_ascii_lowercase(),
            })
        } else {
            None
        }
    }

    /// Returns the smallest range covering both `self` and `other` when the
    /// two overlap or are adjacent, and `None` otherwise.
    fn union(&self, other: &ClassRange) -> Option<ClassRange> {
        if self.is_contiguous(other) {
            Some(ClassRange {
                start: self.start.min(other.start),
                end: self.end.max(other.end),
            })
        } else {
            None
        }
    }

    /// Returns true if the two ranges overlap or sit directly next to each
    /// other by code point value.
    fn is_contiguous(&self, other: &ClassRange) -> bool {
        let highest_start = u32::from(self.start).max(u32::from(other.start));
        let lowest_end = u32::from(self.end).min(u32::from(other.end));
        highest_start <= lowest_end.saturating_add(1)
    }

    /// Returns true if the two ranges have no character in common.
    fn is_intersection_empty(&self, other: &ClassRange) -> bool {
        self.start.max(other.start) > self.end.min(other.end)
    }
}
/// A zero-width look-around assertion.
///
/// The per-variant descriptions below restate what `Look::is_match` checks
/// at a byte offset `at` in a haystack. Each variant has a distinct
/// single-bit discriminant.
/// NOTE(review): the single-bit values presumably allow variants to be
/// combined into a bit set elsewhere - confirm; nothing in this file does so.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum Look {
/// Holds only at offset 0.
Start = 1 << 0,
/// Holds only at the end of the haystack.
End = 1 << 1,
/// Holds at offset 0 or directly after a `\n` byte.
StartLF = 1 << 2,
/// Holds at the end of the haystack or directly before a `\n` byte.
EndLF = 1 << 3,
/// Holds at offset 0, directly after `\n`, or directly after a `\r` that
/// is not itself followed by `\n`.
StartCRLF = 1 << 4,
/// Holds at the end of the haystack, directly before `\r`, or directly
/// before a `\n` that is not itself preceded by `\r`.
EndCRLF = 1 << 5,
/// Holds when exactly one of the neighbouring bytes is a word byte.
Word = 1 << 6,
/// Holds when both or neither of the neighbouring bytes are word bytes.
WordNegate = 1 << 7,
/// Holds when the preceding byte is not a word byte and the following
/// byte is.
WordStart = 1 << 8,
/// Holds when the preceding byte is a word byte and the following byte
/// is not.
WordEnd = 1 << 9,
/// Holds when the preceding byte is not a word byte (or there is none).
WordStartHalf = 1 << 10,
/// Holds when the following byte is not a word byte (or there is none).
WordEndHalf = 1 << 11,
}
impl Look {
pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool {
use self::Look::*;
match *self {
Start => at == 0,
End => at == haystack.len(),
StartLF => at == 0 || haystack[at - 1] == b'\n',
EndLF => at == haystack.len() || haystack[at] == b'\n',
StartCRLF => {
at == 0
|| haystack[at - 1] == b'\n'
|| (haystack[at - 1] == b'\r'
&& (at >= haystack.len() || haystack[at] != b'\n'))
}
EndCRLF => {
at == haystack.len()
|| haystack[at] == b'\r'
|| (haystack[at] == b'\n'
&& (at == 0 || haystack[at - 1] != b'\r'))
}
Word => {
let word_before =
at > 0 && utf8::is_word_byte(haystack[at - 1]);
let word_after =
at < haystack.len() && utf8::is_word_byte(haystack[at]);
word_before != word_after
}
WordNegate => {
let word_before =
at > 0 && utf8::is_word_byte(haystack[at - 1]);
let word_after =
at < haystack.len() && utf8::is_word_byte(haystack[at]);
word_before == word_after
}
WordStart => {
let word_before =
at > 0 && utf8::is_word_byte(haystack[at - 1]);
let word_after =
at < haystack.len() && utf8::is_word_byte(haystack[at]);
!word_before && word_after
}
WordEnd => {
let word_before =
at > 0 && utf8::is_word_byte(haystack[at - 1]);
let word_after =
at < haystack.len() && utf8::is_word_byte(haystack[at]);
word_before && !word_after
}
WordStartHalf => {
let word_before =
at > 0 && utf8::is_word_byte(haystack[at - 1]);
!word_before
}
WordEndHalf => {
let word_after =
at < haystack.len() && utf8::is_word_byte(haystack[at]);
!word_after
}
}
}
}
/// A repetition of a single sub-expression.
///
/// `Hir::repetition` collapses the `{0,0}` case to the empty expression and
/// the `{1,1}` case to the sub-expression itself.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Repetition {
/// Minimum repeat count. When it is 0 the resulting `Hir` is treated as
/// able to match the empty string and is never start-anchored.
pub(crate) min: u32,
/// Maximum repeat count.
/// NOTE(review): `None` presumably means "no upper bound" - confirm.
pub(crate) max: Option<u32>,
/// NOTE(review): presumably whether the repetition prefers more matches
/// over fewer - confirm; this file only stores the value.
pub(crate) greedy: bool,
/// The expression being repeated.
pub(crate) sub: Box<Hir>,
}
/// A capture group wrapping a single sub-expression.
///
/// `Hir::capture` counts each of these as one additional capture on top of
/// whatever the sub-expression contains.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Capture {
/// NOTE(review): presumably the group's index as assigned by the parser -
/// confirm; this file only stores the value.
pub(crate) index: u32,
/// The group's name, if it has one.
pub(crate) name: Option<Box<str>>,
/// The expression inside the group.
pub(crate) sub: Box<Hir>,
}
/// Returns the character following `ch`, stepping over the surrogate gap,
/// or `None` when `ch` is `char::MAX`.
fn next_char(ch: char) -> Option<char> {
    match ch {
        // U+D800..=U+DFFF are not valid `char`s, so jump across them.
        '\u{D7FF}' => Some('\u{E000}'),
        _ => {
            // Cannot overflow: every `char` is far below `u32::MAX`.
            let successor = u32::from(ch).checked_add(1).unwrap();
            char::from_u32(successor)
        }
    }
}
/// Returns the character preceding `ch`, stepping over the surrogate gap,
/// or `None` when `ch` is `'\0'`.
fn prev_char(ch: char) -> Option<char> {
    match ch {
        // U+D800..=U+DFFF are not valid `char`s, so jump across them.
        '\u{E000}' => Some('\u{D7FF}'),
        _ => u32::from(ch)
            .checked_sub(1)
            // With U+E000 handled above, one less than a valid scalar value
            // is itself a valid scalar value.
            .map(|code_point| char::from_u32(code_point).unwrap()),
    }
}
/// Destructor that dismantles the expression tree with an explicit stack
/// held in a `Vec` rather than by nested destructor calls.
///
/// NOTE(review): presumably this exists so that dropping a very deeply
/// nested expression cannot exhaust the call stack - confirm.
impl Drop for Hir {
fn drop(&mut self) {
use core::mem;
// Fast path: return immediately when there is nothing nested to take
// apart, i.e. for kinds without children, for a capture or repetition
// whose child itself has no children, and for an empty concatenation or
// alternation. The fields are then dropped in the ordinary way. This
// path is also what ends the process for the `Hir::empty()` placeholders
// created below.
match *self.kind() {
HirKind::Empty
| HirKind::Char(_)
| HirKind::Class(_)
| HirKind::Look(_) => return,
HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
return
}
HirKind::Concat(ref x) if x.is_empty() => return,
HirKind::Alternation(ref x) if x.is_empty() => return,
_ => {}
}
// Move the whole tree out of `self`, leaving an empty expression behind,
// and use it to seed the work stack.
let mut stack = vec![mem::replace(self, Hir::empty())];
while let Some(mut expr) = stack.pop() {
// Detach the children of `expr` and push them onto the stack. When
// `expr` goes out of scope at the end of this iteration it only holds
// an empty placeholder or an emptied vector, so its own `drop` takes
// the fast path above.
match expr.kind {
HirKind::Empty
| HirKind::Char(_)
| HirKind::Class(_)
| HirKind::Look(_) => {}
HirKind::Capture(ref mut x) => {
stack.push(mem::replace(&mut x.sub, Hir::empty()));
}
HirKind::Repetition(ref mut x) => {
stack.push(mem::replace(&mut x.sub, Hir::empty()));
}
HirKind::Concat(ref mut x) => {
stack.extend(x.drain(..));
}
HirKind::Alternation(ref mut x) => {
stack.extend(x.drain(..));
}
}
}
}
}