use regex::Regex;
use std::collections::BTreeSet;
use std::usize;
use crate::error::RuntimeError;
use crate::prev_codepoint_ix;
use crate::Error;
use crate::Result;
use crate::{codepoint_len, RegexOptions};
/// Flag bit for the `option_flags` argument of `run`: when set, the VM prints
/// the executed instructions, the backtrack stack and the save slots to stdout.
const OPTION_TRACE: u32 = 1 << 0;
/// Flag bit for the `option_flags` argument of `run`: when set,
/// `Insn::ContinueFromPreviousMatchEnd` always fails.
// NOTE(review): presumably set by callers after they skipped an empty match — confirm.
pub(crate) const OPTION_SKIPPED_EMPTY_MATCH: u32 = 1 << 1;
/// Maximum number of branches `run` allows on the backtrack stack; a push
/// beyond this limit fails with `RuntimeError::StackOverflow`.
const MAX_STACK: usize = 1_000_000;
/// A single instruction of the program executed by `run`.
///
/// Unless stated otherwise, execution continues with the following
/// instruction after an instruction succeeds. "Fails" means that `run`
/// resumes from the most recently pushed backtrack branch (or reports no
/// match when there is none).
#[derive(Debug, Clone)]
pub enum Insn {
/// End of the program: `run` returns the save slots as the match result.
End,
/// Consume one code point; fails at the end of the input.
Any,
/// Consume one code point unless it starts with `\n`; fails at the end of the input.
AnyNoNL,
/// `Lit`: match the given string byte-for-byte at the current position.
// `Split(x, y)` (declared on the same line): continue at instruction `x` and
// push a backtrack branch that resumes at instruction `y` from the current position.
Lit(String), Split(usize, usize),
/// Continue at the given instruction index.
Jmp(usize),
/// Store the current position in the given save slot.
Save(usize),
/// Store `0` in the given save slot.
Save0(usize),
/// Set the current position to the value of the given save slot.
Restore(usize),
/// Greedy counted repeat. `repeat` is the save slot holding the iteration
/// count. When the count equals `hi`, jump to `next`. Otherwise increment
/// the count and fall through to the following instruction; if the count
/// (before incrementing) was at least `lo`, a branch to `next` is pushed first.
RepeatGr {
lo: usize,
hi: usize,
next: usize,
repeat: usize,
},
/// Non-greedy counted repeat. Like `RepeatGr`, but once the count is at
/// least `lo` it jumps to `next` and pushes a branch that resumes at the
/// following instruction instead.
RepeatNg {
lo: usize,
hi: usize,
next: usize,
repeat: usize,
},
/// Greedy repeat without an upper bound. `repeat` is the count slot and
/// `check` a slot that stores the position of the previous iteration; the
/// instruction fails when the count exceeds `lo` and the position has not
/// moved since `check` was stored.
// NOTE(review): presumably emitted for bodies that can match the empty string — confirm in the compiler.
RepeatEpsilonGr {
lo: usize,
next: usize,
repeat: usize,
check: usize,
},
/// Non-greedy counterpart of `RepeatEpsilonGr`: once the count is at least
/// `lo` it jumps to `next` and pushes a branch for the following instruction.
RepeatEpsilonNg {
lo: usize,
next: usize,
repeat: usize,
check: usize,
},
/// Pops backtrack branches until the one targeting the instruction after
/// this one has been removed, then fails.
FailNegativeLookAround,
/// Move the current position back by the given number of code points;
/// fails if the start of the input is reached first.
GoBack(usize),
/// Match the input text delimited by save slots `slot` and `slot + 1` at
/// the current position; fails if either slot is still `usize::MAX`.
Backref(usize),
/// Pushes the current backtrack stack depth onto the explicit stack.
BeginAtomic,
/// Pops a depth from the explicit stack and cuts the backtrack stack back to it.
EndAtomic,
/// Tests the inner regex against the rest of the input; on a match the
/// position advances by the given number of code points, otherwise it fails.
DelegateSized(Box<Regex>, usize),
/// Runs an inner regex on the rest of the input and advances to the end of
/// its match. When `inner1` is present and the position is not 0, the
/// position is first moved back one code point and `inner1` is used instead
/// of `inner`. If `start_group != end_group`, the inner capture groups
/// `1..=(end_group - start_group)` are copied to the save slots starting at
/// `start_group * 2`.
Delegate {
inner: Box<Regex>,
inner1: Option<Box<Regex>>,
start_group: usize,
end_group: usize,
},
/// Fails if the current position is past the start position given to `run`
/// or if `OPTION_SKIPPED_EMPTY_MATCH` is set.
ContinueFromPreviousMatchEnd,
/// Fails if save slot `group * 2` is still `usize::MAX`.
BackrefExistsCondition(usize),
}
/// A program for the VM: its instructions plus the number of save slots it needs.
#[derive(Debug, Clone)]
pub struct Prog {
/// The instructions; `run` starts executing at index 0.
pub body: Vec<Insn>,
// Number of save slots `run` allocates up front (passed to `State::new`).
n_saves: usize,
}
impl Prog {
pub(crate) fn new(body: Vec<Insn>, n_saves: usize) -> Prog {
Prog { body, n_saves }
}
#[doc(hidden)]
pub(crate) fn debug_print(&self) {
for (i, insn) in self.body.iter().enumerate() {
println!("{:3}: {:?}", i, insn);
}
}
}
/// An entry of the backtrack stack: where to resume when the current path fails.
#[derive(Debug)]
struct Branch {
// Instruction index to resume at.
pc: usize,
// Input position to resume from.
ix: usize,
// Value of `State::nsave` at the time the branch was pushed; it becomes
// the current `nsave` again when the branch is popped.
nsave: usize,
}
/// An undo record: the value a save slot had before it was overwritten.
#[derive(Debug)]
struct Save {
// Index of the overwritten slot in `State::saves`.
slot: usize,
// The slot's previous value, written back when the state is popped.
value: usize,
}
/// Mutable state of one `run` invocation: save slots, backtrack stack and the
/// undo log needed to roll the slots back on backtracking.
struct State {
// Save slot values, initialised to `usize::MAX`. The entries from index
// `explicit_sp` onwards are appended on demand and hold the explicit stack.
saves: Vec<usize>,
// Backtrack stack of pending branches.
stack: Vec<Branch>,
// Undo log of overwritten slot values, oldest first.
oldsave: Vec<Save>,
// Number of trailing `oldsave` entries recorded since the last push (or cut).
nsave: usize,
// Index in `saves` of the explicit stack pointer; equals the initial slot count.
explicit_sp: usize,
// Maximum number of entries allowed in `stack`.
max_stack: usize,
// `OPTION_*` flag bits; only `OPTION_TRACE` is consulted by `State` itself.
options: u32,
}
impl State {
fn new(n_saves: usize, max_stack: usize, options: u32) -> State {
State {
saves: vec![usize::MAX; n_saves],
stack: Vec::new(),
oldsave: Vec::new(),
nsave: 0,
explicit_sp: n_saves,
max_stack,
options,
}
}
fn push(&mut self, pc: usize, ix: usize) -> Result<()> {
if self.stack.len() < self.max_stack {
let nsave = self.nsave;
self.stack.push(Branch { pc, ix, nsave });
self.nsave = 0;
self.trace_stack("push");
Ok(())
} else {
Err(Error::RuntimeError(RuntimeError::StackOverflow))
}
}
fn pop(&mut self) -> (usize, usize) {
for _ in 0..self.nsave {
let Save { slot, value } = self.oldsave.pop().unwrap();
self.saves[slot] = value;
}
let Branch { pc, ix, nsave } = self.stack.pop().unwrap();
self.nsave = nsave;
self.trace_stack("pop");
(pc, ix)
}
fn save(&mut self, slot: usize, val: usize) {
for i in 0..self.nsave {
if self.oldsave[self.oldsave.len() - i - 1].slot == slot {
self.saves[slot] = val;
return;
}
}
self.oldsave.push(Save {
slot,
value: self.saves[slot],
});
self.nsave += 1;
self.saves[slot] = val;
if self.options & OPTION_TRACE != 0 {
println!("saves: {:?}", self.saves);
}
}
fn get(&self, slot: usize) -> usize {
self.saves[slot]
}
fn stack_push(&mut self, val: usize) {
if self.saves.len() == self.explicit_sp {
self.saves.push(self.explicit_sp + 1);
}
let explicit_sp = self.explicit_sp;
let sp = self.get(explicit_sp);
if self.saves.len() == sp {
self.saves.push(val);
} else {
self.save(sp, val);
}
self.save(explicit_sp, sp + 1);
}
fn stack_pop(&mut self) -> usize {
let explicit_sp = self.explicit_sp;
let sp = self.get(explicit_sp) - 1;
let result = self.get(sp);
self.save(explicit_sp, sp);
result
}
fn backtrack_count(&self) -> usize {
self.stack.len()
}
fn backtrack_cut(&mut self, count: usize) {
if self.stack.len() == count {
return;
}
let (oldsave_start, oldsave_end) = {
let mut end = self.oldsave.len() - self.nsave;
for &Branch { nsave, .. } in &self.stack[count + 1..] {
end -= nsave;
}
let start = end - self.stack[count].nsave;
(start, end)
};
let mut saved = BTreeSet::new();
for &Save { slot, .. } in &self.oldsave[oldsave_start..oldsave_end] {
saved.insert(slot);
}
let mut oldsave_ix = oldsave_end;
for ix in oldsave_end..self.oldsave.len() {
let Save { slot, .. } = self.oldsave[ix];
let new_slot = saved.insert(slot);
if new_slot {
self.oldsave.swap(oldsave_ix, ix);
oldsave_ix += 1;
}
}
self.stack.truncate(count);
self.oldsave.truncate(oldsave_ix);
self.nsave = oldsave_ix - oldsave_start;
}
#[inline]
fn trace_stack(&self, operation: &str) {
if self.options & OPTION_TRACE != 0 {
println!("stack after {}: {:?}", operation, self.stack);
}
}
}
/// Returns `codepoint_len` of the byte of `s` at byte index `ix`.
///
/// # Panics
/// Panics if `ix` is out of bounds for `s`.
fn codepoint_len_at(s: &str, ix: usize) -> usize {
    let lead_byte = s.as_bytes()[ix];
    codepoint_len(lead_byte)
}
/// Reports whether the bytes of `s` in the range `ix..end` equal the bytes of
/// `literal`.
///
/// Returns `false` when `end` lies past the end of `s`. The comparison is done
/// on raw bytes, so the range does not have to fall on char boundaries.
#[inline]
fn matches_literal(s: &str, ix: usize, end: usize, literal: &str) -> bool {
    let haystack = s.as_bytes();
    if end > haystack.len() {
        return false;
    }
    &haystack[ix..end] == literal.as_bytes()
}
/// Runs `prog` on `s` starting at byte offset `pos` with tracing output
/// enabled and default `RegexOptions`; see `run` for the result.
pub fn run_trace(prog: &Prog, s: &str, pos: usize) -> Result<Option<Vec<usize>>> {
    let default_options = RegexOptions::default();
    run(prog, s, pos, OPTION_TRACE, &default_options)
}
/// Runs `prog` on `s` starting at byte offset `pos` with no option flags set
/// and default `RegexOptions`; see `run` for the result.
pub fn run_default(prog: &Prog, s: &str, pos: usize) -> Result<Option<Vec<usize>>> {
    let default_options = RegexOptions::default();
    run(prog, s, pos, 0, &default_options)
}
/// Executes `prog` against `s`, starting at byte offset `pos`.
///
/// `option_flags` is a bit set of the `OPTION_*` constants; `options` supplies
/// the backtrack limit.
///
/// Returns `Ok(Some(saves))` with the save slots when the program reaches
/// `Insn::End`, and `Ok(None)` when an instruction fails while no backtrack
/// branch is left.
///
/// # Errors
/// Returns `RuntimeError::StackOverflow` when the backtrack stack would exceed
/// `MAX_STACK` entries and `RuntimeError::BacktrackLimitExceeded` when more
/// than `options.backtrack_limit` backtracks were needed.
#[allow(clippy::cognitive_complexity)]
pub(crate) fn run(
prog: &Prog,
s: &str,
pos: usize,
option_flags: u32,
options: &RegexOptions,
) -> Result<Option<Vec<usize>>> {
let mut state = State::new(prog.n_saves, MAX_STACK, option_flags);
if option_flags & OPTION_TRACE != 0 {
println!("pos\tinstruction");
}
// Number of times execution had to resume from a backtrack branch.
let mut backtrack_count = 0;
// Index of the instruction being executed.
let mut pc = 0;
// Current byte position in `s`.
let mut ix = pos;
loop {
// Inner loop: execute instructions until one fails (`break 'fail`). Arms that
// set `pc` themselves use `continue`; all others fall through to `pc += 1`.
'fail: loop {
if option_flags & OPTION_TRACE != 0 {
println!("{}\t{} {:?}", ix, pc, prog.body[pc]);
}
match prog.body[pc] {
Insn::End => {
if option_flags & OPTION_TRACE != 0 {
println!("saves: {:?}", state.saves);
}
// If slot 0 ended up greater than slot 1, cap it to slot 1.
// NOTE(review): slots 0 and 1 are presumably the overall match start and end — confirm.
if let Some(&slot1) = state.saves.get(1) {
if state.get(0) > slot1 {
state.save(0, slot1);
}
}
return Ok(Some(state.saves));
}
Insn::Any => {
if ix < s.len() {
ix += codepoint_len_at(s, ix);
} else {
break 'fail;
}
}
Insn::AnyNoNL => {
if ix < s.len() && s.as_bytes()[ix] != b'\n' {
ix += codepoint_len_at(s, ix);
} else {
break 'fail;
}
}
Insn::Lit(ref val) => {
let ix_end = ix + val.len();
if !matches_literal(s, ix, ix_end, val) {
break 'fail;
}
ix = ix_end;
}
Insn::Split(x, y) => {
// Try `x` first; `y` is explored if that path fails.
state.push(y, ix)?;
pc = x;
continue;
}
Insn::Jmp(target) => {
pc = target;
continue;
}
Insn::Save(slot) => state.save(slot, ix),
Insn::Save0(slot) => state.save(slot, 0),
Insn::Restore(slot) => ix = state.get(slot),
Insn::RepeatGr {
lo,
hi,
next,
repeat,
} => {
let repcount = state.get(repeat);
if repcount == hi {
// Upper bound reached: leave the loop.
pc = next;
continue;
}
state.save(repeat, repcount + 1);
if repcount >= lo {
// Minimum reached: leaving the loop becomes the fallback,
// another iteration (the following instruction) is tried first.
state.push(next, ix)?;
}
}
Insn::RepeatNg {
lo,
hi,
next,
repeat,
} => {
let repcount = state.get(repeat);
if repcount == hi {
pc = next;
continue;
}
state.save(repeat, repcount + 1);
if repcount >= lo {
// Minimum reached: leave the loop first, another iteration
// (the following instruction) becomes the fallback.
state.push(pc + 1, ix)?;
pc = next;
continue;
}
}
Insn::RepeatEpsilonGr {
lo,
next,
repeat,
check,
} => {
let repcount = state.get(repeat);
// Fail when the position has not moved since it was last stored in `check`.
if repcount > lo && state.get(check) == ix {
break 'fail;
}
state.save(repeat, repcount + 1);
if repcount >= lo {
state.save(check, ix);
state.push(next, ix)?;
}
}
Insn::RepeatEpsilonNg {
lo,
next,
repeat,
check,
} => {
let repcount = state.get(repeat);
// Fail when the position has not moved since it was last stored in `check`.
if repcount > lo && state.get(check) == ix {
break 'fail;
}
state.save(repeat, repcount + 1);
if repcount >= lo {
state.save(check, ix);
state.push(pc + 1, ix)?;
pc = next;
continue;
}
}
Insn::GoBack(count) => {
for _ in 0..count {
if ix == 0 {
break 'fail;
}
ix = prev_codepoint_ix(s, ix);
}
}
Insn::FailNegativeLookAround => {
// Discard branches up to and including the one that targets the
// instruction after this one, then fail.
loop {
let (popped_pc, _) = state.pop();
if popped_pc == pc + 1 {
break;
}
}
break 'fail;
}
Insn::Backref(slot) => {
// A slot still holding `usize::MAX` was never written, so the
// back-reference cannot match.
let lo = state.get(slot);
if lo == usize::MAX {
break 'fail;
}
let hi = state.get(slot + 1);
if hi == usize::MAX {
break 'fail;
}
let ref_text = &s[lo..hi];
let ix_end = ix + ref_text.len();
if !matches_literal(s, ix, ix_end, ref_text) {
break 'fail;
}
ix = ix_end;
}
Insn::BackrefExistsCondition(group) => {
let lo = state.get(group * 2);
if lo == usize::MAX {
break 'fail;
}
}
Insn::BeginAtomic => {
// Remember the backtrack stack depth so `EndAtomic` can cut back to it.
let count = state.backtrack_count();
state.stack_push(count);
}
Insn::EndAtomic => {
let count = state.stack_pop();
state.backtrack_cut(count);
}
Insn::DelegateSized(ref inner, size) => {
if inner.is_match(&s[ix..]) {
// Advance by `size` code points.
for _ in 0..size {
ix += codepoint_len_at(s, ix);
}
} else {
break 'fail;
}
}
Insn::Delegate {
ref inner,
ref inner1,
start_group,
end_group,
} => {
// When `inner1` exists and we are not at position 0, step back one
// code point and use `inner1`; otherwise use `inner` in place.
let re = match *inner1 {
Some(ref inner1) if ix > 0 => {
ix = prev_codepoint_ix(s, ix);
inner1
}
_ => inner,
};
if start_group == end_group {
// No groups to copy: only the end of the match is needed.
match re.find(&s[ix..]) {
Some(m) => ix += m.end(),
_ => break 'fail,
}
} else {
let mut locations = re.capture_locations();
if let Some(m) = re.captures_read(&mut locations, &s[ix..]) {
// Copy inner group `i + 1` into slots `(start_group + i) * 2` and
// the one after it, translating offsets (relative to `ix`) to
// positions in `s`; unmatched groups are stored as `usize::MAX`.
for i in 0..(end_group - start_group) {
let slot = (start_group + i) * 2;
if let Some((start, end)) = locations.get(i + 1) {
state.save(slot, ix + start);
state.save(slot + 1, ix + end);
} else {
state.save(slot, usize::MAX);
state.save(slot + 1, usize::MAX);
}
}
ix += m.end();
} else {
break 'fail;
}
}
}
Insn::ContinueFromPreviousMatchEnd => {
if ix > pos || option_flags & OPTION_SKIPPED_EMPTY_MATCH != 0 {
break 'fail;
}
}
}
pc += 1;
}
// An instruction failed: resume from the most recent backtrack branch, if any.
if option_flags & OPTION_TRACE != 0 {
println!("fail");
}
if state.stack.is_empty() {
return Ok(None);
}
backtrack_count += 1;
if backtrack_count > options.backtrack_limit {
return Err(Error::RuntimeError(RuntimeError::BacktrackLimitExceeded));
}
let (newpc, newix) = state.pop();
pc = newpc;
ix = newix;
}
}
// Unit tests for `State`: backtrack stack, save rollback, explicit stack and
// `backtrack_cut`, plus a quickcheck comparison against a simple model.
#[cfg(test)]
mod tests {
use super::*;
use quickcheck::{quickcheck, Arbitrary, Gen};
// Branches come back in LIFO order and the stack can be reused after emptying.
#[test]
fn state_push_pop() {
let mut state = State::new(1, MAX_STACK, 0);
state.push(0, 0).unwrap();
state.push(1, 1).unwrap();
assert_eq!(state.pop(), (1, 1));
assert_eq!(state.pop(), (0, 0));
assert!(state.stack.is_empty());
state.push(2, 2).unwrap();
assert_eq!(state.pop(), (2, 2));
assert!(state.stack.is_empty());
}
// A save made after a push is undone by the matching pop.
#[test]
fn state_save_override() {
let mut state = State::new(1, MAX_STACK, 0);
state.save(0, 10);
state.push(0, 0).unwrap();
state.save(0, 20);
assert_eq!(state.pop(), (0, 0));
assert_eq!(state.get(0), 10);
}
// Nested pushes each restore the value the slot had when they were pushed.
#[test]
fn state_save_override_twice() {
let mut state = State::new(1, MAX_STACK, 0);
state.save(0, 10);
state.push(0, 0).unwrap();
state.save(0, 20);
state.push(1, 1).unwrap();
state.save(0, 30);
assert_eq!(state.get(0), 30);
assert_eq!(state.pop(), (1, 1));
assert_eq!(state.get(0), 20);
assert_eq!(state.pop(), (0, 0));
assert_eq!(state.get(0), 10);
}
// Explicit stack changes made after a push are rolled back by the pop.
#[test]
fn state_explicit_stack() {
let mut state = State::new(1, MAX_STACK, 0);
state.stack_push(11);
state.stack_push(12);
state.push(100, 101).unwrap();
state.stack_push(13);
assert_eq!(state.stack_pop(), 13);
state.stack_push(14);
assert_eq!(state.pop(), (100, 101));
assert_eq!(state.stack_pop(), 12);
assert_eq!(state.stack_pop(), 11);
}
// Cutting removes the branch but keeps the values saved after it was pushed.
#[test]
fn state_backtrack_cut_simple() {
let mut state = State::new(2, MAX_STACK, 0);
state.save(0, 1);
state.save(1, 2);
let count = state.backtrack_count();
state.push(0, 0).unwrap();
state.save(0, 3);
assert_eq!(state.backtrack_count(), 1);
state.backtrack_cut(count);
assert_eq!(state.backtrack_count(), 0);
assert_eq!(state.get(0), 3);
assert_eq!(state.get(1), 2);
}
// Cutting several branches keeps the latest values, and popping the remaining
// branch still restores the values from before that branch was pushed.
#[test]
fn state_backtrack_cut_complex() {
let mut state = State::new(2, MAX_STACK, 0);
state.save(0, 1);
state.save(1, 2);
state.push(0, 0).unwrap();
state.save(0, 3);
let count = state.backtrack_count();
state.push(1, 1).unwrap();
state.save(0, 4);
state.push(2, 2).unwrap();
state.save(1, 5);
assert_eq!(state.backtrack_count(), 3);
state.backtrack_cut(count);
assert_eq!(state.backtrack_count(), 1);
assert_eq!(state.get(0), 4);
assert_eq!(state.get(1), 5);
state.pop();
assert_eq!(state.backtrack_count(), 0);
assert_eq!(state.get(0), 1);
assert_eq!(state.get(1), 2);
}
// A randomly generated operation applied to `State` by the quickcheck test.
#[derive(Clone, Debug)]
enum Operation {
Push,
Pop,
// Save(slot, value)
Save(usize, usize),
}
impl Arbitrary for Operation {
// Picks one of the three operations; saves target slots 0..=4 with an arbitrary value.
fn arbitrary(g: &mut Gen) -> Self {
match g.choose(&[0, 1, 2]) {
Some(0) => Operation::Push,
Some(1) => Operation::Pop,
_ => Operation::Save(
*g.choose(&[0usize, 1, 2, 3, 4]).unwrap(),
usize::arbitrary(g),
),
}
}
}
// Applies `operations` both to a `State` and to a simple model that snapshots
// all slots on every push, and reports whether the slot values agree after
// each step.
fn check_saves_for_operations(operations: Vec<Operation>) -> bool {
// Number of slots needed: highest saved slot + 1 (0 if nothing is saved).
let slots = operations
.iter()
.map(|o| match o {
&Operation::Save(slot, _) => slot + 1,
_ => 0,
})
.max()
.unwrap_or(0);
if slots == 0 {
return true;
}
// Model: a stack of full snapshots plus the current slot values.
let mut stack = Vec::new();
let mut saves = vec![usize::MAX; slots];
let mut state = State::new(slots, MAX_STACK, 0);
let mut expected = Vec::new();
let mut actual = Vec::new();
for operation in operations {
match operation {
Operation::Push => {
stack.push((0, 0, saves.clone()));
state.push(0, 0).unwrap();
}
Operation::Pop => {
// Pops on an empty stack are skipped for both the model and the state.
if let Some((_, _, previous_saves)) = stack.pop() {
saves = previous_saves;
state.pop();
}
}
Operation::Save(slot, value) => {
saves[slot] = value;
state.save(slot, value);
}
}
expected.push(saves.clone());
let mut actual_saves = vec![usize::MAX; slots];
for i in 0..slots {
actual_saves[i] = state.get(i);
}
actual.push(actual_saves);
}
expected == actual
}
// Property test: `State` agrees with the snapshot model for arbitrary operation sequences.
quickcheck! {
fn state_save_quickcheck(operations: Vec<Operation>) -> bool {
check_saves_for_operations(operations)
}
}
}