use super::{
ast::{parse, Assertion},
compile::{compile_ast, optimise, CompiledOps, Inst, Op, Prog},
matches::{Match, MatchIter},
Error,
};
use crate::buffer::{Buffer, GapBuffer};
use std::{mem::swap, rc::Rc};
pub(super) const N_SLOTS: usize = 30;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Regex {
prog: Prog,
submatch_names: Rc<[String]>,
clist: Box<[Thread]>,
nlist: Box<[Thread]>,
sms: Box<[SubMatches]>,
free_sms: Vec<usize>,
track_submatches: bool,
gen: usize,
p: usize,
prev: Option<char>,
next: Option<char>,
}
impl Regex {
pub fn compile(re: &str) -> Result<Self, Error> {
let mut ast = parse(re)?;
ast.optimise();
let CompiledOps {
ops,
submatch_names,
} = compile_ast(ast, false);
Ok(Self::new(ops, submatch_names))
}
pub fn compile_reverse(re: &str) -> Result<Self, Error> {
let mut ast = parse(re)?;
ast.optimise();
let CompiledOps {
ops,
submatch_names,
} = compile_ast(ast, true);
Ok(Self::new(ops, submatch_names))
}
fn new(ops: Vec<Op>, submatch_names: Vec<String>) -> Self {
let prog: Prog = optimise(ops)
.into_iter()
.map(|op| Inst { op, gen: 0 })
.collect();
let clist = vec![Thread::default(); prog.len()].into_boxed_slice();
let nlist = vec![Thread::default(); prog.len()].into_boxed_slice();
let sms = vec![SubMatches::default(); prog.len()].into_boxed_slice();
let free_sms = (1..prog.len()).collect();
Self {
prog,
submatch_names: Rc::from(submatch_names.into_boxed_slice()),
clist,
nlist,
gen: 0,
p: 0,
prev: None,
next: None,
sms,
free_sms,
track_submatches: true,
}
}
pub fn match_str(&mut self, input: &str) -> Option<Match> {
self.track_submatches = true;
self.match_iter(&mut input.chars().enumerate(), 0)
}
pub fn match_str_all<'a, 'b>(&'a mut self, input: &'b str) -> MatchIter<'a, &'b str> {
self.track_submatches = true;
MatchIter {
it: input,
r: self,
from: 0,
}
}
pub fn match_buffer_all<'a, 'b>(&'a mut self, b: &'b Buffer) -> MatchIter<'a, &'b GapBuffer> {
self.track_submatches = true;
MatchIter {
it: &b.txt,
r: self,
from: 0,
}
}
pub fn match_iter<I>(&mut self, input: &mut I, sp: usize) -> Option<Match>
where
I: Iterator<Item = (usize, char)>,
{
self.track_submatches = true;
self._match_iter(input, sp)
}
pub fn matches_str(&mut self, input: &str) -> bool {
self.track_submatches = false;
self.matches_iter(&mut input.chars().enumerate(), 0)
}
pub fn matches_iter<I>(&mut self, input: &mut I, sp: usize) -> bool
where
I: Iterator<Item = (usize, char)>,
{
self.track_submatches = false;
self._match_iter(input, sp).is_some()
}
fn _match_iter<I>(&mut self, input: &mut I, mut sp: usize) -> Option<Match>
where
I: Iterator<Item = (usize, char)>,
{
let mut sub_matches = [0; N_SLOTS];
self.free_sms = (1..self.prog.len()).collect();
self.sms[0] = SubMatches {
refs: 1,
inner: [0; N_SLOTS],
};
self.gen += 1;
self.add_thread(Thread::default(), sp, '\0', true);
swap(&mut self.clist, &mut self.nlist);
self.gen += 1;
let mut n = self.p;
self.p = 0;
let mut matched = false;
let mut it = input.peekable();
self.prev = None;
self.next = None;
while let Some((i, ch)) = it.next() {
sp = i;
self.next = it.peek().map(|(_, c)| *c);
for i in 0..n {
if let Some(sm) = self.step_thread(i, sp, ch) {
if !self.track_submatches {
return Some(Match::synthetic(0, 0));
}
matched = true;
sub_matches = self.sms[sm].inner;
for j in i..n {
self.sm_dec_ref(self.clist[j].sm);
}
break;
}
}
swap(&mut self.clist, &mut self.nlist);
self.prev = Some(ch);
self.gen += 1;
n = self.p;
if self.p == 0 {
break;
}
self.p = 0;
}
self.prev = None;
self.next = None;
for t in self.clist.iter_mut().take(n) {
if self.prog[t.pc].op == Op::Match && self.sms[t.sm].inner[1] >= sub_matches[1] {
matched = true;
sub_matches = self.sms[t.sm].inner;
break;
}
}
if !matched {
return None;
}
Some(Match {
sub_matches,
submatch_names: self.submatch_names.clone(),
})
}
#[inline]
fn step_thread(&mut self, i: usize, sp: usize, ch: char) -> Option<usize> {
let t = &self.clist[i];
match &self.prog[t.pc].op {
Op::Comp(comp) if comp.matches(ch) => match t.assertion {
Some(a) if !a.holds_for(self.prev, ch, self.next) => {
self.sm_dec_ref(t.sm);
return None;
}
_ => self.add_thread(thread(t.pc + 1, t.sm), sp, ch, false),
},
Op::Match => return Some(t.sm),
_ => self.sm_dec_ref(t.sm),
}
None
}
#[inline]
fn add_thread(&mut self, t: Thread, sp: usize, ch: char, initial: bool) {
if self.prog[t.pc].gen == self.gen {
self.sm_dec_ref(t.sm);
return; }
self.prog[t.pc].gen = self.gen;
if let Op::Jump(l1) = self.prog[t.pc].op {
let th = match t.assertion {
Some(a) => assert_thread(l1, t.sm, a),
None => thread(l1, t.sm),
};
self.add_thread(th, sp, ch, initial);
} else if let Op::Split(l1, l2) = self.prog[t.pc].op {
self.sms[t.sm].refs += 1;
let (t1, t2) = match t.assertion {
Some(a) => (assert_thread(l1, t.sm, a), assert_thread(l2, t.sm, a)),
None => (thread(l1, t.sm), thread(l2, t.sm)),
};
self.add_thread(t1, sp, ch, initial);
self.add_thread(t2, sp, ch, initial);
} else if let Op::Assertion(a) = self.prog[t.pc].op {
self.add_thread(assert_thread(t.pc + 1, t.sm, a), sp, ch, initial);
} else if let Op::Save(s) = self.prog[t.pc].op {
self.handle_save(t, s, sp, ch, initial, false)
} else if let Op::RSave(s) = self.prog[t.pc].op {
self.handle_save(t, s, sp, ch, initial, true)
} else {
self.nlist[self.p] = t;
self.p += 1;
}
}
#[inline]
fn handle_save(&mut self, t: Thread, s: usize, sp: usize, ch: char, initial: bool, rev: bool) {
if (!rev && s % 2 == 0) || (rev && s % 2 == 1) {
let sm = self.sm_update(t.sm, s, sp, initial, rev);
let th = match t.assertion {
Some(a) => assert_thread(t.pc + 1, sm, a),
None => thread(t.pc + 1, sm),
};
self.add_thread(th, sp, ch, initial);
} else {
match t.assertion {
Some(a) if !a.holds_for(self.prev, ch, self.next) => self.sm_dec_ref(t.sm),
_ => {
let sm = self.sm_update(t.sm, s, sp, initial, rev);
self.add_thread(thread(t.pc + 1, sm), sp, ch, initial);
}
}
}
}
#[inline]
fn sm_dec_ref(&mut self, i: usize) {
if !self.track_submatches {
return;
}
self.sms[i].refs -= 1;
if self.sms[i].refs == 0 {
self.free_sms.push(i);
}
}
#[inline]
fn sm_update(&mut self, i: usize, s: usize, sp: usize, initial: bool, reverse: bool) -> usize {
if !self.track_submatches || s >= N_SLOTS {
return i;
}
let i = if self.sms[i].refs == 1 {
i
} else {
self.sm_dec_ref(i);
let j = self.free_sms.swap_remove(0);
self.sms[j].inner = self.sms[i].inner;
self.sms[j].refs = 1;
j
};
let mut val = if !initial { sp + 1 } else { sp };
if reverse && !initial {
val = val.saturating_sub(1);
}
self.sms[i].inner[s] = val;
i
}
}
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
struct SubMatches {
refs: usize,
inner: [usize; N_SLOTS],
}
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
struct Thread {
pc: usize,
assertion: Option<Assertion>,
sm: usize,
}
#[inline]
fn thread(pc: usize, sm: usize) -> Thread {
Thread {
pc,
sm,
assertion: None,
}
}
#[inline]
fn assert_thread(pc: usize, sm: usize, a: Assertion) -> Thread {
Thread {
pc,
sm,
assertion: Some(a),
}
}
#[cfg(test)]
mod tests {
use super::*;
use simple_test_case::test_case;
#[test_case("foo", "foo", Some("foo"); "literal full string")]
#[test_case("ba*", "baaaaa", Some("baaaaa"); "zero or more present")]
#[test_case("ba*", "b", Some("b"); "zero or more not present")]
#[test_case("ba+", "baaaaa", Some("baaaaa"); "one or more present")]
#[test_case("ba+", "b", None; "one or more not present")]
#[test_case("b?a", "ba", Some("ba"); "optional present")]
#[test_case("b?a", "a", Some("a"); "optional not present")]
#[test_case("a(bb)+a", "abbbba", Some("abbbba"); "article example matching")]
#[test_case("a(bb)+a", "abbba", None; "article example non matching")]
#[test_case(".*b", "123b", Some("123b"); "dot star prefix")]
#[test_case("1.*", "123b", Some("123b"); "dot star suffix")]
#[test_case("1.*b", "123b", Some("123b"); "dot star inner")]
#[test_case("(c|C)ase matters", "case matters", Some("case matters"); "alternation first")]
#[test_case("(c|C)ase matters", "Case matters", Some("Case matters"); "alternation second")]
#[test_case("(aa|bbb|c|dd)", "c", Some("c"); "chained alternation")]
#[test_case("this@*works", "this contains\nbut still works", Some("this contains\nbut still works"); "true any")]
#[test_case(r"literal\?", "literal?", Some("literal?"); "escape special char")]
#[test_case(r"literal\t", "literal\t", Some("literal\t"); "escape sequence")]
#[test_case("[abc] happy cow", "a happy cow", Some("a happy cow"); "character class")]
#[test_case("[^abc] happy cow", "a happy cow", None; "negated character class")]
#[test_case("[a-zA-Z]*", "camelCaseFtw", Some("camelCaseFtw"); "char class ranges matching")]
#[test_case("[a-zA-Z]*1", "kebab-case-not-so-much", None; "char class ranges non matching")]
#[test_case("[a-zA-Z ]*", "this should work", Some("this should work"); "char class mixed")]
#[test_case("[\\]5]*", "5]]5555]]", Some("5]]5555]]"); "char class escaped bracket")]
#[test_case("[0-9]+", "0123", Some("0123"); "digit range")]
#[test_case("[0-9]+", "0", Some("0"); "digit range range start only")]
#[test_case("25[0-5]", "255", Some("255"); "ipv4 element one")]
#[test_case("2[0-4][0-9]", "231", Some("231"); "ipv4 element two")]
#[test_case("1?[0-9]?[0-9]", "155", Some("155"); "ipv4 element three three digit")]
#[test_case("1?[0-9]?[0-9]", "72", Some("72"); "ipv4 element three two digit")]
#[test_case("1?[0-9]?[0-9]", "8", Some("8"); "ipv4 element three one digit")]
#[test_case("1?[0-9]?[0-9]", "0", Some("0"); "ipv4 element three zero")]
#[test_case("(25[0-5]|2[0-4][0-9])", "255", Some("255"); "ipv4 elements one and two matching one")]
#[test_case("(25[0-5]|2[0-4][0-9])", "219", Some("219"); "ipv4 elements one and two matching two")]
#[test_case("(25[0-5]|2[0-4][0-9])", "42", None; "ipv4 elements one and two not matching")]
#[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 elements two and three matching two")]
#[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 elements two and three matching three")]
#[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "251", Some("251"); "ipv4 all elements matching one")]
#[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 all elements matching two")]
#[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 all elements matching three")]
#[test_case(
r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
"127.0.0.1 ",
Some("127.0.0.1");
"ipv4 full"
)]
#[test_case("^foo", "foo at the start", Some("foo"); "SOL holding")]
#[test_case("^foo", "bar\nfoo at the start", Some("foo"); "SOL holding after newline")]
#[test_case("^foo", "we have foo but not at the start", None; "SOL not holding")]
#[test_case("foo$", "a line that ends with foo", Some("foo"); "BOL holding")]
#[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
#[test_case("foo$", "a line with foo in the middle", None; "BOL not holding")]
#[test_case("a{3}", "aaa", Some("aaa"); "counted repetition")]
#[test_case("a{3}", "aa", None; "counted repetition non matching")]
#[test_case("a{3,}", "aaaaaa", Some("aaaaaa"); "counted repetition at least")]
#[test_case("a{3,}", "aa", None; "counted repetition at least non matching")]
#[test_case("a{3,5}", "aaa", Some("aaa"); "counted repetition between lower")]
#[test_case("a{3,5}", "aaaaa", Some("aaaaa"); "counted repetition between upper")]
#[test_case("a{3,5}", "aaaa", Some("aaaa"); "counted repetition in range")]
#[test_case("a{3,5}", "aa", None; "counted repetition less")]
#[test_case("^a{3,5}$", "aaaaaa", None; "counted repetition more")]
#[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
#[test_case("\\bfor\\b", "forward", None; "word boundary for match at start of word")]
#[test_case("\\bfor\\b", "for ward", Some("for"); "word boundary for match not inside word")]
#[test_case("\\bfor\\b", "bob for", Some("for"); "word boundary match not at BOF")]
#[test_case("\\bfor\\b", "bob for bob", Some("for"); "word boundary match not at BOF or EOF")]
#[test_case("\\bin\\b", "min", None; "word boundary for match at end of word")]
#[test_case("\\b(in)\\b", "min", None; "word boundary for sub expression match at end of word")]
#[test_case("\\b(in|for)\\b", "min", None; "word boundary for alt match at end of word")]
#[test_case("\\b(in|for)\\b", "bob for", Some("for"); "word boundary for alt match not at BOF")]
#[test]
fn match_works(re: &str, s: &str, expected: Option<&str>) {
let mut r = Regex::compile(re).unwrap();
let m = r.match_str(s).map(|m| m.str_match_text(s));
assert_eq!(m.as_deref(), expected);
}
#[test_case("foo", "foo", Some("foo"); "literal full string")]
#[test_case("ba*", " baaaaa foo", Some("baaaaa"); "zero or more present")]
#[test_case("ba*", "b foo", Some("b"); "zero or more not present")]
#[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
#[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
#[test_case(
r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
"127.0.0.1 ",
Some("127.0.0.1");
"ipv4 full"
)]
#[test_case(
"his",
"this is a line\nand another\n- [ ] something to do\n",
Some("his");
"multiline intput"
)]
#[test]
fn rev_match_works(re: &str, s: &str, expected: Option<&str>) {
use crate::exec::IterBoundedChars;
let mut r = Regex::compile_reverse(re).unwrap();
let b = Buffer::new_unnamed(0, s);
let mut it = b.rev_iter_between(s.len(), 0);
let m = r
.match_iter(&mut it, s.len())
.map(|m| m.str_match_text(&b.txt.to_string()).to_string());
assert_eq!(m.as_deref(), expected);
}
#[test_case("[0-9]+", " 42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers")]
#[test_case("[0-9]+", " 42 3 127 9991", &["42", "3", "127", "9991"]; "integers to EOF")]
#[test_case("[0-9]+", "42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers from BOF")]
#[test_case("[0-9]+", "42 3 127 9991", &["42", "3", "127", "9991"]; "integers full input")]
#[test_case("foo|bar|baz", "baz bar foo bar", &["baz", "bar", "foo", "bar"]; "alts spaced in s")]
#[test_case("foo|bar|baz", "bazbarfoobar", &["baz", "bar", "foo", "bar"]; "alts back to back in s")]
#[test_case("(foo|bar|baz)", "foo foobar barfoo baz", &["foo", "foo", "bar", "bar", "foo", "baz"]; "alts in parens")]
#[test_case("\\b(foo|bar|baz)\\b", "foo foobar barfoo baz", &["foo", "baz"]; "alts with word boundaries")]
#[test]
fn match_all_works(re: &str, s: &str, expected: &[&str]) {
let mut r = Regex::compile(re).unwrap();
let matches: Vec<String> = r.match_str_all(s).map(|m| m.str_match_text(s)).collect();
assert_eq!(&matches, expected);
}
#[test]
fn dot_star_works() {
let mut r = Regex::compile(".*").unwrap();
let s = "\nthis is\na multiline\nfile";
let m1 = r.match_str(s).unwrap();
assert_eq!(m1.str_match_text(s), "");
let m2 = r.match_str(&s[1..]).unwrap();
assert_eq!(m2.str_match_text(&s[1..]), "this is");
}
#[test]
fn match_extraction_works() {
let re = "([0-9]+)-([0-9]+)-([0-9]+)";
let mut r = Regex::compile(re).unwrap();
let s = "this should work 123-456-789 other stuff";
let m = r.match_str(s).unwrap();
assert_eq!(m.str_submatch_text(1, s).as_deref(), Some("123"));
assert_eq!(m.str_submatch_text(2, s).as_deref(), Some("456"));
assert_eq!(m.str_submatch_text(3, s).as_deref(), Some("789"));
assert_eq!(m.str_match_text(s), "123-456-789");
}
#[test_case("(?<xy>X|Y)", "xy", "X"; "named match on its own")]
#[test_case("(?<xy>X|Y)(a|b)", "xy", "X"; "named match before unnamed")]
#[test_case("(e| )(?<xy>X|Y)", "xy", "X"; "named match after unnamed")]
#[test_case("(e| )(?<xy>X|Y)(a|b)", "xy", "X"; "named match inbetween unnamed")]
#[test]
fn named_submatch_works(re: &str, name: &str, expected: &str) {
let mut r = Regex::compile(re).unwrap();
let s = "text before Xanadu";
let m = r.match_str(s).unwrap();
assert_eq!(m.named_matches(), vec![name]);
assert_eq!(m.str_sub_loc_text_ref_by_name(name, s), Some(expected));
}
#[test]
fn multiline_input_match_dot_star_works() {
let mut r = Regex::compile(".*").unwrap();
let s = "this is\na multiline\nfile";
let m = r.match_str(s).unwrap();
assert_eq!(m.str_match_text(s), "this is");
}
#[test]
fn multiline_input_match_dot_star_works_with_non_zero_initial_sp() {
let mut r = Regex::compile(".*").unwrap();
let s = "this is\na multiline\nfile";
assert_eq!(s.chars().skip(7).collect::<String>(), "\na multiline\nfile");
let m1 = r.match_iter(&mut s.chars().enumerate().skip(7), 7).unwrap();
assert_eq!(m1.str_match_text(s), "");
let m2 = r.match_iter(&mut s.chars().enumerate().skip(8), 8).unwrap();
assert_eq!(m2.str_match_text(s), "a multiline");
}
#[test]
fn multiline_input_match_all_dot_star_works() {
let mut r = Regex::compile(".*").unwrap();
let s = "this is\na multiline\nfile";
let mut it = r.match_str_all(s);
let m1 = it.next().unwrap();
assert_eq!(m1.str_match_text(s), "this is");
let m2 = it.next().unwrap();
assert_eq!(m2.str_match_text(s), "");
let m3 = it.next().unwrap();
assert_eq!(m3.str_match_text(s), "a multiline");
let m4 = it.next().unwrap();
assert_eq!(m4.str_match_text(s), "");
let m5 = it.next().unwrap();
assert_eq!(m5.str_match_text(s), "file");
assert_eq!(it.next(), None);
}
#[test]
fn match_extraction_works_when_multibyte_characters_are_present() {
let s: &str = "const VLINE: char = '│';
impl Editor {
";
let re = r"impl (\w+) \{";
let mut r = Regex::compile(re).unwrap();
let m = r.match_str(s).unwrap();
assert_eq!(m.str_submatch_text(1, s).as_deref(), Some("Editor"));
assert_eq!(m.str_match_text(s), "impl Editor {");
}
#[test]
fn pathological_match_doesnt_explode() {
let s = "a".repeat(100);
let mut re = "a?".repeat(100);
re.push_str(&s);
let mut r = Regex::compile(&re).unwrap();
assert!(r.match_str(&s).is_some());
}
#[test]
fn repeated_match_works() {
let re = "a(bb)+a";
let mut r = Regex::compile(re).unwrap();
for _ in 0..10 {
assert!(r.match_str("abbbba").is_some());
assert!(r.match_str("foo").is_none());
}
}
}