use std::collections::HashMap;
use std::fmt;
use std::mem;
use params::Params;
use prog::{Inst, Program};
use sparse::SparseSet;
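/// The approximate amount of memory, in bytes, that the DFA cache may use
/// before it is flushed (roughly 2 MB).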
const CACHE_LIMIT: usize = 2 * (1 << 20);
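/// Returns true if the given program can be executed by this DFA engine.
///
/// The program must be byte based (no `Char` or `Ranges` instructions),
/// must not use any word-boundary assertions and must contain at most
/// `u32::MAX` instructions, since instruction and state pointers are
/// stored as `u32`.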
pub fn can_exec(insts: &Program) -> bool {
use prog::Inst::*;
use prog::EmptyLook::*;
if insts.len() > ::std::u32::MAX as usize {
return false;
}
for inst in insts {
match *inst {
Char(_) | Ranges(_) => return false,
EmptyLook(ref inst) => {
match inst.look {
WordBoundary | NotWordBoundary => return false,
WordBoundaryAscii | NotWordBoundaryAscii => return false,
StartLine | EndLine | StartText | EndText => {}
}
}
Match(_) | Save(_) | Split(_) | Bytes(_) => {}
}
}
true
}
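/// A reusable cache of lazily built DFA states and transitions.
///
/// The cache is owned by the caller and passed to `Dfa::exec` on every
/// search, so states computed during one search can be reused by the next.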
#[derive(Debug)]
pub struct DfaCache {
compiled: HashMap<StateKey, StatePtr>,
states: Vec<State>,
start_states: Vec<StatePtr>,
stack: Vec<InstPtr>,
cache_flush_count: u64,
qcur: SparseSet,
qnext: SparseSet,
}
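/// A lazy DFA for a single search.
///
/// It borrows the compiled program, the caller's search parameters and the
/// mutable parts of a `DfaCache`. States are built on demand, one byte
/// class at a time, as the input is scanned.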
#[derive(Debug)]
pub struct Dfa<'a, 'p, 'c: 'p, 'm: 'p> {
prog: &'a Program,
start: StatePtr,
params: &'p mut Params<'c, 'm>,
at: usize,
last_match_si: StatePtr,
last_cache_flush: usize,
compiled: &'a mut HashMap<StateKey, StatePtr>,
states: &'a mut Vec<State>,
start_states: &'a mut Vec<StatePtr>,
stack: &'a mut Vec<InstPtr>,
cache_flush_count: &'a mut u64,
}
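/// The result of a DFA search. `Quit` means the DFA stopped without an
/// answer (for example, after its cache thrashed repeatedly).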
pub enum DfaResult {
Match,
NoMatch,
Quit,
}
impl DfaResult {
pub fn is_match(&self) -> bool {
match *self {
DfaResult::Match => true,
DfaResult::NoMatch | DfaResult::Quit => false,
}
}
}
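/// A single DFA state: its transition table (indexed by byte class), the
/// NFA instructions it was built from, whether it is a match state and
/// which empty-width assertions its instructions depend on.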
#[derive(Clone)]
struct State {
next: Vec<StatePtr>,
insts: Vec<InstPtr>,
is_match: bool,
inst_flags: Flags,
}
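/// The key used to deduplicate states in the `compiled` map.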
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
struct StateKey {
insts: Vec<InstPtr>,
is_match: bool,
}
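// Instruction and state pointers are stored as u32 to keep states small;
// `can_exec` rejects programs that are too big for this representation.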
type InstPtr = u32;
type StatePtr = u32;
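// STATE_UNKNOWN marks a transition that has not been computed yet, while
// STATE_DEAD marks a transition into a state from which no match can ever
// follow. Real state pointers therefore start at index 2.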
const STATE_UNKNOWN: StatePtr = 0;
const STATE_DEAD: StatePtr = 1;
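/// A single input byte, or the special EOF sentinel (value 256).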
#[derive(Copy, Clone, Debug)]
struct Byte(u16);
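/// A compact bitset of empty-width assertion flags (start/end of text and
/// start/end of line).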
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
struct Flags(u8);
impl DfaCache {
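/// Creates an empty cache. The first two state slots are reserved for the
/// `STATE_UNKNOWN` and `STATE_DEAD` sentinels.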
pub fn new() -> Self {
DfaCache {
compiled: HashMap::new(),
states: vec![State::invalid(), State::invalid()],
start_states: vec![STATE_UNKNOWN; 256],
stack: vec![],
cache_flush_count: 0,
qcur: SparseSet::new(0),
qnext: SparseSet::new(0),
}
}
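/// Resizes the sparse sets to hold one entry per instruction. This is a
/// no-op when the capacity already matches the program.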
fn resize(&mut self, num_insts: usize) {
if num_insts == self.qcur.capacity() {
return;
}
self.qcur = SparseSet::new(num_insts);
self.qnext = SparseSet::new(num_insts);
}
}
impl<'a, 'p, 'c, 'm> Dfa<'a, 'p, 'c, 'm> {
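/// Executes the DFA over `text` starting at `at`, using `cache` for state
/// storage and writing match offsets (and, for regex sets, match indices)
/// into `params`.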
pub fn exec(
prog: &'a Program,
cache: &mut DfaCache,
params: &'p mut Params<'c, 'm>,
text: &[u8],
at: usize,
) -> DfaResult {
cache.resize(prog.len());
let mut dfa = Dfa {
prog: prog,
start: STATE_UNKNOWN,
params: params,
at: at,
last_match_si: STATE_UNKNOWN,
last_cache_flush: at,
compiled: &mut cache.compiled,
states: &mut cache.states,
start_states: &mut cache.start_states,
stack: &mut cache.stack,
cache_flush_count: &mut cache.cache_flush_count,
};
dfa.start = match dfa.start_state(&mut cache.qcur, text) {
None => return DfaResult::Quit,
Some(STATE_DEAD) => return DfaResult::NoMatch,
Some(si) => si,
};
debug_assert!(dfa.start != STATE_UNKNOWN);
let result = if prog.is_reverse {
dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
} else {
dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
};
if result.is_match() {
if prog.matches.len() == 1 {
dfa.params.set_match(0);
} else {
debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
debug_assert!(dfa.last_match_si != STATE_DEAD);
for &ip in &dfa.states[dfa.last_match_si as usize].insts {
if let Inst::Match(slot) = dfa.prog[ip as usize] {
dfa.params.set_match(slot);
}
}
}
}
result
}
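/// Runs the forward DFA over `text`, recording the end offset of the most
/// recent match state seen. Returns `Quit` if the state cache had to be
/// abandoned.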
fn exec_at(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
text: &[u8],
) -> DfaResult {
debug_assert!(!self.prog.is_reverse);
let (mut si, mut result) = (self.start, DfaResult::NoMatch);
while self.at < text.len() {
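// When the regex is unanchored and has literal prefixes, skip ahead to
// the next occurrence of a prefix whenever we are back in the start state.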
if !self.prog.prefixes.is_empty()
&& si == self.start
&& !self.prog.is_anchored_start {
self.at = match self.prefix_at(text, self.at) {
None => return DfaResult::NoMatch,
Some(i) => i,
};
}
let cls = self.prog.byte_classes[text[self.at] as usize];
let mut next_si = self.states[si as usize].next[cls as usize];
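// A transition at or below STATE_DEAD is either unknown (compute it now)
// or dead (no match can follow, so stop).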
if next_si <= STATE_DEAD {
if next_si == STATE_DEAD {
return result;
}
let b = Byte::byte(text[self.at]);
next_si = match self.exec_byte(qcur, qnext, si, b) {
None => return DfaResult::Quit,
Some(next_si) => next_si,
};
debug_assert!(next_si != STATE_UNKNOWN);
if next_si == STATE_DEAD {
return result;
}
}
si = next_si;
if self.states[si as usize].is_match {
self.last_match_si = si;
result = DfaResult::Match;
self.params.set_end(Some(self.at));
if self.params.style().quit_after_first_match() {
return result;
}
}
self.at += 1;
}
si = match self.next_state(qcur, qnext, si, Byte::eof()) {
None => return DfaResult::Quit,
Some(si) => si,
};
debug_assert!(si != STATE_UNKNOWN);
if si == STATE_DEAD {
return result;
}
if self.states[si as usize].is_match {
self.last_match_si = si;
result = DfaResult::Match;
self.params.set_end(Some(text.len()));
if self.params.style().quit_after_first_match() {
return result;
}
}
result
}
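/// Runs the reverse DFA over `text`, scanning backwards from `at` and
/// recording the start offset of the most recent match state seen.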
fn exec_at_reverse(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
text: &[u8],
) -> DfaResult {
debug_assert!(self.prog.is_reverse);
let (mut si, mut result) = (self.start, DfaResult::NoMatch);
while self.at > 0 {
self.at -= 1;
let cls = self.prog.byte_classes[text[self.at] as usize];
let mut next_si = self.states[si as usize].next[cls as usize];
if next_si <= STATE_DEAD {
if next_si == STATE_DEAD {
return result;
}
let b = Byte::byte(text[self.at]);
next_si = match self.exec_byte(qcur, qnext, si, b) {
None => return DfaResult::Quit,
Some(next_si) => next_si,
};
debug_assert!(next_si != STATE_UNKNOWN);
if next_si == STATE_DEAD {
return result;
}
}
si = next_si;
if self.states[si as usize].is_match {
self.last_match_si = si;
result = DfaResult::Match;
self.params.set_start(Some(self.at+1));
if self.params.style().quit_after_first_match() {
return result;
}
}
}
si = match self.next_state(qcur, qnext, si, Byte::eof()) {
None => return DfaResult::Quit,
Some(si) => si,
};
debug_assert!(si != STATE_UNKNOWN);
if si == STATE_DEAD {
return result;
}
if self.states[si as usize].is_match {
self.last_match_si = si;
result = DfaResult::Match;
self.params.set_start(Some(0));
if self.params.style().quit_after_first_match() {
return result;
}
}
result
}
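/// Computes the transition out of state `si` on byte `b`, building and
/// caching the destination state from its NFA instructions if it has not
/// been seen before. Returns `None` when the cache was flushed and no
/// further progress should be made.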
fn exec_byte(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
mut si: StatePtr,
b: Byte,
) -> Option<StatePtr> {
use prog::Inst::*;
qcur.clear();
for &ip in &self.states[si as usize].insts {
qcur.add(ip as usize);
}
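// If this state depends on empty-width assertions, re-run the epsilon
// closure with the flags implied by the byte we are about to read.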
if self.states[si as usize].inst_flags.has_non_match_flags() {
let mut flags = Flags::new();
if b.is_eof() {
flags.set_end(true).set_end_line(true);
} else if b.as_byte().map_or(false, |b| b == b'\n') {
flags.set_end_line(true);
}
qnext.clear();
for &ip in &*qcur {
self.follow_epsilons(usize_to_u32(ip), qnext, flags);
}
mem::swap(qcur, qnext);
}
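// Compute the transition on `b`: byte instructions that match feed their
// targets (via epsilon closure) into the next state, while `Match`
// instructions mark it as a match state.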
let mut flags = Flags::new();
let mut is_match = false;
if b.as_byte().map_or(false, |b| b == b'\n') {
flags.set_start_line(true);
}
qnext.clear();
for &ip in &*qcur {
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Save(_) | Split(_) | EmptyLook(_) => {}
Match(_) => {
is_match = true;
if !self.continue_past_first_match() {
break;
} else if self.prog.matches.len() > 1
&& !qnext.contains_ip(ip as usize) {
qnext.add(ip);
}
}
Bytes(ref inst) => {
if b.as_byte().map_or(false, |b| inst.matches(b)) {
self.follow_epsilons(
inst.goto as InstPtr, qnext, flags);
}
}
}
}
let mut cache = true;
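// When processing EOF for a program with several match instructions,
// carry the current instruction set into the next state so the caller can
// read off every `Match`. Such a state is not worth caching.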
if b.is_eof() && self.prog.matches.len() > 1 {
mem::swap(qcur, qnext);
cache = false;
}
let next = match self.cached_state(qnext, is_match, Some(&mut si)) {
None => return None,
Some(next) => next,
};
debug_assert!(next != STATE_UNKNOWN);
let cls = self.byte_class(b);
if cache {
self.states[si as usize].next[cls] = next;
}
Some(next)
}
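/// Follows epsilon transitions (saves, splits and satisfied empty-width
/// assertions) from `ip`, adding every reachable instruction to `q`. Uses
/// an explicit stack instead of recursion.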
fn follow_epsilons(
&mut self,
ip: InstPtr,
q: &mut SparseSet,
flags: Flags,
) {
use prog::Inst::*;
use prog::EmptyLook::*;
self.stack.push(ip);
while let Some(ip) = self.stack.pop() {
if q.contains_ip(ip as usize) {
continue;
}
q.add(ip as usize);
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Match(_) | Bytes(_) => {}
EmptyLook(ref inst) => {
match inst.look {
StartLine if flags.is_start_line() => {
self.stack.push(inst.goto as InstPtr);
}
EndLine if flags.is_end_line() => {
self.stack.push(inst.goto as InstPtr);
}
StartText if flags.is_start() => {
self.stack.push(inst.goto as InstPtr);
}
EndText if flags.is_end() => {
self.stack.push(inst.goto as InstPtr);
}
StartLine | EndLine | StartText | EndText => {}
WordBoundary
| NotWordBoundary
| WordBoundaryAscii
| NotWordBoundaryAscii => unreachable!(),
}
}
Save(ref inst) => self.stack.push(inst.goto as InstPtr),
Split(ref inst) => {
self.stack.push(inst.goto2 as InstPtr);
self.stack.push(inst.goto1 as InstPtr);
}
}
}
}
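/// Returns the state pointer for the instruction set in `q`, creating and
/// caching a new state if necessary. Returns `Some(STATE_DEAD)` when the
/// set can never lead to a match, and `None` when the cache had to be
/// flushed but flushing was judged futile.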
fn cached_state(
&mut self,
q: &SparseSet,
is_match: bool,
current_state: Option<&mut StatePtr>,
) -> Option<StatePtr> {
let (key, inst_flags) = match self.cached_state_key(q, is_match) {
None => return Some(STATE_DEAD),
Some(v) => v,
};
if let Some(&si) = self.compiled.get(&key) {
return Some(si);
}
if self.approximate_size() > CACHE_LIMIT {
if !self.clear_cache_and_save(current_state) {
return None;
}
}
let next = vec![STATE_UNKNOWN; self.num_byte_classes()];
self.states.push(State {
next: next,
insts: key.insts.clone(),
is_match: is_match,
inst_flags: inst_flags,
});
let si = usize_to_u32(self.states.len().checked_sub(1).unwrap());
self.compiled.insert(key, si);
Some(si)
}
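/// Builds the deduplication key for the instructions in `q`, keeping only
/// the instructions that matter for future transitions (byte ranges,
/// matches and empty-width assertions). Returns `None` for a dead state.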
fn cached_state_key(
&mut self,
q: &SparseSet,
is_match: bool,
) -> Option<(StateKey, Flags)> {
use prog::Inst::*;
use prog::EmptyLook::*;
let mut inst_flags = Flags::new();
let mut insts = vec![];
for &ip in q {
let ip = usize_to_u32(ip);
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Save(_) => {}
Split(_) => {}
Bytes(_) => insts.push(ip),
EmptyLook(ref inst) => {
match inst.look {
StartLine => {
inst_flags.set_start_line(true);
insts.push(ip);
}
EndLine => {
inst_flags.set_end_line(true);
insts.push(ip);
}
StartText => {
inst_flags.set_start(true);
insts.push(ip);
}
EndText => {
inst_flags.set_end(true);
insts.push(ip);
}
WordBoundary
| NotWordBoundary
| WordBoundaryAscii
| NotWordBoundaryAscii => unreachable!(),
}
}
Match(_) => {
insts.push(ip);
if !self.continue_past_first_match() {
break;
}
}
}
}
if insts.is_empty() && !is_match {
None
} else {
let key = StateKey { insts: insts, is_match: is_match };
Some((key, inst_flags))
}
}
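/// Flushes the cache while preserving `current_state` (and the start
/// state) so the search can continue. Returns false if the DFA should
/// give up instead.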
fn clear_cache_and_save(
&mut self,
current_state: Option<&mut StatePtr>,
) -> bool {
if self.states.len() <= 2 {
return true;
}
match current_state {
None => self.clear_cache(),
Some(si) => {
let cur = self.copy_state(*si);
if !self.clear_cache() {
return false;
}
*si = self.restore_state(cur);
true
}
}
}
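/// Wipes all cached states and transitions, keeping only the start state.
/// Returns false if flushing appears futile, in which case the caller
/// should quit.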
fn clear_cache(&mut self) -> bool {
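// Give up if the cache has already been flushed at least three times and
// the last flush covered no more than ten bytes of input per cached state.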
if *self.cache_flush_count >= 3
&& self.at >= self.last_cache_flush
&& (self.at - self.last_cache_flush) <= 10 * self.states.len() {
return false;
}
self.last_cache_flush = self.at;
*self.cache_flush_count += 1;
let start = self.copy_state(self.start);
self.states.clear();
self.compiled.clear();
for start in self.start_states.iter_mut() {
*start = STATE_UNKNOWN;
}
self.states.push(State::invalid());
self.states.push(State::invalid());
self.start = self.restore_state(start);
true
}
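/// Copies a state so it can survive a cache flush, resetting its
/// transition table to all-unknown.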
fn copy_state(&self, si: StatePtr) -> State {
let mut state = self.states[si as usize].clone();
state.next = vec![STATE_UNKNOWN; self.num_byte_classes()];
state
}
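/// Re-inserts a previously copied state into the (possibly just flushed)
/// cache and returns its new pointer.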
fn restore_state(&mut self, state: State) -> StatePtr {
let key = StateKey {
insts: state.insts.clone(),
is_match: state.is_match,
};
if let Some(&si) = self.compiled.get(&key) {
return si;
}
let si = usize_to_u32(self.states.len());
self.states.push(state);
self.compiled.insert(key, si);
si
}
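/// Looks up the transition out of `si` on `b`, computing it on demand if
/// it is still unknown.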
fn next_state(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
si: StatePtr,
b: Byte,
) -> Option<StatePtr> {
let cls = self.byte_class(b);
match self.states[si as usize].next[cls] {
STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
STATE_DEAD => Some(STATE_DEAD),
nsi => Some(nsi),
}
}
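/// Returns the start state for the current position. Start states are
/// memoized in `start_states`, indexed by the empty-width flags that hold
/// at that position.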
fn start_state(
&mut self,
q: &mut SparseSet,
text: &[u8],
) -> Option<StatePtr> {
let start_flags = if self.prog.is_reverse {
self.start_flags_reverse(text, self.at)
} else {
self.start_flags(text, self.at)
};
let flagi = start_flags.0 as usize;
match self.start_states[flagi] {
STATE_UNKNOWN => {}
STATE_DEAD => return Some(STATE_DEAD),
si => return Some(si),
}
q.clear();
let start = usize_to_u32(self.prog.start);
self.follow_epsilons(start, q, start_flags);
let sp = match self.cached_state(q, false, None) {
None => return None,
Some(sp) => sp,
};
self.start_states[flagi] = sp;
Some(sp)
}
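/// Computes the empty-width flags that hold at position `at` for a
/// forward search.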
fn start_flags(&self, text: &[u8], at: usize) -> Flags {
let mut flags = Flags::new();
flags.set_start(at == 0).set_end(text.len() == 0);
flags.set_start_line(at == 0 || text[at - 1] == b'\n');
flags.set_end_line(text.len() == 0);
flags
}
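/// Computes the empty-width flags at position `at` for a reverse search,
/// where the text is scanned backwards from `at`.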
fn start_flags_reverse(&self, text: &[u8], at: usize) -> Flags {
let mut flags = Flags::new();
flags.set_start(at == text.len()).set_end(text.len() == 0);
flags.set_start_line(at == text.len() || text[at] == b'\n');
flags.set_end_line(text.len() == 0);
flags
}
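/// Uses the program's literal prefixes to find the next candidate match
/// position at or after `at`.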
fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s)
}
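/// The number of byte classes, plus one extra class reserved for EOF.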
fn num_byte_classes(&self) -> usize {
(self.prog.byte_classes[255] as usize + 1) + 1
}
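/// Maps a byte (or EOF) to its equivalence class index.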
fn byte_class(&self, b: Byte) -> usize {
if b.is_eof() {
self.num_byte_classes() - 1
} else {
self.prog.byte_classes[b.0 as usize] as usize
}
}
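/// True when the DFA must keep running after seeing a match: reverse
/// scans need the earliest start position, and regex sets need every
/// matching pattern.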
fn continue_past_first_match(&self) -> bool {
self.prog.is_reverse || self.prog.matches.len() > 1
}
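/// A rough estimate of the memory used by the cache, compared against
/// `CACHE_LIMIT` before new states are added. The extra 128 bytes per
/// entry are a rough allowance for heap data owned by each key and state.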
fn approximate_size(&self) -> usize {
use std::mem::size_of as size;
let compiled =
(self.compiled.len() * (size::<StateKey>() + 128))
+ (self.compiled.len() * size::<StatePtr>());
let states =
self.states.len()
* (size::<State>()
+ 128
+ (self.num_byte_classes() * size::<StatePtr>()));
let start_states = self.start_states.len() * size::<StatePtr>();
self.prog.approximate_size() + compiled + states + start_states
}
}
impl State {
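/// A placeholder state used to fill the reserved sentinel slots.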
fn invalid() -> State {
State {
next: vec![],
insts: vec![],
is_match: false,
inst_flags: Flags::new(),
}
}
}
impl Flags {
#[inline]
fn new() -> Self {
Flags(0)
}
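// Bit layout, from the most significant bit: bit 7 is reserved, bit 6 is
// start of text, bit 5 is end of text, bit 4 is start of line and bit 3
// is end of line. `has_non_match_flags` tests everything below bit 7.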
#[inline]
fn has_non_match_flags(&self) -> bool {
self.0 & 0b0_1111111 > 0
}
#[inline]
fn set(&mut self, yes: bool, bit: u8) {
if yes {
self.0 |= bit;
} else {
self.0 &= !bit;
}
}
#[inline]
fn is_start(&self) -> bool { self.0 & 0b0_1_000000 > 0 }
#[inline]
fn set_start(&mut self, yes: bool) -> &mut Self {
self.set(yes, 0b0_1_000000);
self
}
#[inline]
fn is_end(&self) -> bool { self.0 & 0b00_1_00000 > 0 }
#[inline]
fn set_end(&mut self, yes: bool) -> &mut Self {
self.set(yes, 0b00_1_00000);
self
}
#[inline]
fn is_start_line(&self) -> bool { self.0 & 0b000_1_0000 > 0 }
#[inline]
fn set_start_line(&mut self, yes: bool) -> &mut Self {
self.set(yes, 0b000_1_0000);
self
}
#[inline]
fn is_end_line(&self) -> bool { self.0 & 0b0000_1_000 > 0 }
#[inline]
fn set_end_line(&mut self, yes: bool) -> &mut Self {
self.set(yes, 0b0000_1_000);
self
}
}
impl Byte {
#[inline] fn byte(b: u8) -> Self { Byte(b as u16) }
#[inline] fn eof() -> Self { Byte(256) }
#[inline] fn is_eof(&self) -> bool { self.0 == 256 }
#[inline]
fn as_byte(&self) -> Option<u8> {
if self.is_eof() {
None
} else {
Some(self.0 as u8)
}
}
}
impl fmt::Debug for State {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut next = vec![];
for (b, next_sp) in self.next.iter().enumerate() {
match *next_sp {
STATE_UNKNOWN => {}
STATE_DEAD => next.push((vb(b), "DEAD".to_string())),
si => next.push((vb(b), si.to_string())),
}
}
f.debug_struct("State")
.field("is_match", &self.is_match)
.field("insts", &self.insts)
.field("next", &next)
.finish()
}
}
impl fmt::Debug for Flags {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("Flags")
.field("start", &if self.is_start() { 1 } else { 0 })
.field("end", &if self.is_end() { 1 } else { 0 })
.field("start_line", &if self.is_start_line() { 1 } else { 0 })
.field("end_line", &if self.is_end_line() { 1 } else { 0 })
.finish()
}
}
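/// Renders a byte value as a printable string for debug output, using
/// "EOF" for values above 255.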
fn vb(b: usize) -> String {
use std::ascii::escape_default;
if b > ::std::u8::MAX as usize {
"EOF".to_owned()
} else {
let escaped = escape_default(b as u8).collect::<Vec<u8>>();
String::from_utf8_lossy(&escaped).into_owned()
}
}
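/// Converts a usize to u32, panicking if the value does not fit.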
fn usize_to_u32(n: usize) -> u32 {
if (n as u64) > (::std::u32::MAX as u64) {
panic!("BUG: {} is too big to fit into u32", n)
}
n as u32
}