use std::collections::HashMap;
use std::fmt;
use std::mem;
use exec::ProgramCache;
use prog::{Inst, Program};
use sparse::SparseSet;
const CACHE_LIMIT: usize = 2 * (1<<20);
pub fn can_exec(insts: &Program) -> bool {
use prog::Inst::*;
use prog::EmptyLook::*;
if insts.len() > ::std::u32::MAX as usize {
return false;
}
for inst in insts {
match *inst {
Char(_) | Ranges(_) => return false,
EmptyLook(ref inst) => {
match inst.look {
WordBoundary | NotWordBoundary => return false,
WordBoundaryAscii | NotWordBoundaryAscii => {}
StartLine | EndLine | StartText | EndText => {}
}
}
Match(_) | Save(_) | Split(_) | Bytes(_) => {}
}
}
true
}
#[derive(Clone, Debug)]
pub struct Cache {
inner: CacheInner,
qcur: SparseSet,
qnext: SparseSet,
}
#[derive(Clone, Debug)]
struct CacheInner {
compiled: HashMap<StateKey, StatePtr>,
states: Vec<State>,
start_states: Vec<StatePtr>,
stack: Vec<InstPtr>,
flush_count: u64,
}
#[derive(Debug)]
pub struct Fsm<'a> {
prog: &'a Program,
start: StatePtr,
at: usize,
quit_after_match: bool,
last_match_si: StatePtr,
last_cache_flush: usize,
cache: &'a mut CacheInner,
}
#[derive(Clone, Debug)]
pub enum Result<T> {
Match(T),
NoMatch,
Quit,
}
impl<T> Result<T> {
pub fn is_match(&self) -> bool {
match *self {
Result::Match(_) => true,
Result::NoMatch | Result::Quit => false,
}
}
}
#[derive(Clone)]
struct State {
next: Box<[StatePtr]>,
insts: Box<[InstPtr]>,
flags: StateFlags,
}
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
struct StateKey {
insts: Box<[InstPtr]>,
flags: StateFlags,
}
type InstPtr = u32;
type StatePtr = u32;
const STATE_UNKNOWN: StatePtr = 0;
const STATE_DEAD: StatePtr = 1;
#[derive(Copy, Clone, Debug)]
struct Byte(u16);
#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
struct EmptyFlags {
start: bool,
end: bool,
start_line: bool,
end_line: bool,
word_boundary: bool,
not_word_boundary: bool,
}
#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
struct StateFlags(u8);
impl Cache {
pub fn new(prog: &Program) -> Self {
Cache {
inner: CacheInner {
compiled: HashMap::new(),
states: vec![State::invalid(), State::invalid()],
start_states: vec![STATE_UNKNOWN; 256],
stack: vec![],
flush_count: 0,
},
qcur: SparseSet::new(prog.insts.len()),
qnext: SparseSet::new(prog.insts.len()),
}
}
}
impl<'a> Fsm<'a> {
#[inline(always)] pub fn forward(
prog: &'a Program,
cache: &ProgramCache,
quit_after_match: bool,
text: &[u8],
at: usize,
) -> Result<usize> {
let mut cache = cache.borrow_mut();
let mut cache = &mut cache.dfa;
let mut dfa = Fsm {
prog: prog,
start: 0, at: at,
quit_after_match: quit_after_match,
last_match_si: STATE_UNKNOWN,
last_cache_flush: at,
cache: &mut cache.inner,
};
let (empty_flags, state_flags) = dfa.start_flags(text, at);
dfa.start = match dfa.start_state(
&mut cache.qcur,
empty_flags,
state_flags,
) {
None => return Result::Quit,
Some(STATE_DEAD) => return Result::NoMatch,
Some(si) => si,
};
debug_assert!(dfa.start != STATE_UNKNOWN);
dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
}
#[inline(always)] pub fn reverse(
prog: &'a Program,
cache: &ProgramCache,
quit_after_match: bool,
text: &[u8],
at: usize,
) -> Result<usize> {
let mut cache = cache.borrow_mut();
let mut cache = &mut cache.dfa_reverse;
let mut dfa = Fsm {
prog: prog,
start: 0, at: at,
quit_after_match: quit_after_match,
last_match_si: STATE_UNKNOWN,
last_cache_flush: at,
cache: &mut cache.inner,
};
let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
dfa.start = match dfa.start_state(
&mut cache.qcur,
empty_flags,
state_flags,
) {
None => return Result::Quit,
Some(STATE_DEAD) => return Result::NoMatch,
Some(si) => si,
};
debug_assert!(dfa.start != STATE_UNKNOWN);
dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
}
#[inline(always)] pub fn forward_many(
prog: &'a Program,
cache: &ProgramCache,
matches: &mut [bool],
text: &[u8],
at: usize,
) -> Result<usize> {
debug_assert!(matches.len() == prog.matches.len());
let mut cache = cache.borrow_mut();
let mut cache = &mut cache.dfa;
let mut dfa = Fsm {
prog: prog,
start: 0, at: at,
quit_after_match: false,
last_match_si: STATE_UNKNOWN,
last_cache_flush: at,
cache: &mut cache.inner,
};
let (empty_flags, state_flags) = dfa.start_flags(text, at);
dfa.start = match dfa.start_state(
&mut cache.qcur,
empty_flags,
state_flags,
) {
None => return Result::Quit,
Some(STATE_DEAD) => return Result::NoMatch,
Some(si) => si,
};
debug_assert!(dfa.start != STATE_UNKNOWN);
let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
if result.is_match() {
if matches.len() == 1 {
matches[0] = true;
} else {
debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
debug_assert!(dfa.last_match_si != STATE_DEAD);
for &ip in dfa.cache.state(dfa.last_match_si).insts.iter() {
if let Inst::Match(slot) = dfa.prog[ip as usize] {
matches[slot] = true;
}
}
}
}
result
}
#[inline(always)] fn exec_at(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
text: &[u8],
) -> Result<usize> {
debug_assert!(!self.prog.is_reverse);
let mut result = Result::NoMatch;
let mut si = self.start;
let mut next_si;
let mut b: u8;
while self.at < text.len() {
if !self.prog.prefixes.is_empty()
&& si == self.start
&& !self.prog.is_anchored_start {
self.at = match self.prefix_at(text, self.at) {
None => return Result::NoMatch,
Some(i) => i,
};
}
b = text[self.at];
next_si = {
let s = self.cache.state(si);
if s.flags.is_match() {
result = Result::Match(self.at - 1);
if self.quit_after_match {
return result;
}
self.last_match_si = si;
}
s.next[self.u8_class(b)]
};
if next_si <= STATE_DEAD {
let byte = Byte::byte(b);
next_si = match self.next_state(qcur, qnext, si, byte) {
None => return Result::Quit,
Some(STATE_DEAD) => return result,
Some(si) => si,
};
debug_assert!(next_si != STATE_UNKNOWN);
}
si = next_si;
self.at += 1;
}
if self.cache.state(si).flags.is_match() {
result = Result::Match(self.at - 1);
if self.quit_after_match {
return result;
}
self.last_match_si = si;
}
si = match self.next_state(qcur, qnext, si, Byte::eof()) {
None => return Result::Quit,
Some(STATE_DEAD) => return result,
Some(si) => si,
};
debug_assert!(si != STATE_UNKNOWN);
if self.cache.state(si).flags.is_match() {
self.last_match_si = si;
result = Result::Match(text.len());
}
result
}
#[inline(always)] fn exec_at_reverse(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
text: &[u8],
) -> Result<usize> {
debug_assert!(self.prog.is_reverse);
let mut result = Result::NoMatch;
let mut si = self.start;
let mut next_si;
let mut b: u8;
while self.at > 0 {
self.at -= 1;
b = text[self.at];
next_si = {
let s = self.cache.state(si);
if s.flags.is_match() {
result = Result::Match(self.at + 2);
if self.quit_after_match {
return result;
}
self.last_match_si = si;
}
s.next[self.u8_class(b)]
};
if next_si <= STATE_DEAD {
let byte = Byte::byte(b);
next_si = match self.next_state(qcur, qnext, si, byte) {
None => return Result::Quit,
Some(STATE_DEAD) => return result,
Some(si) => si,
};
debug_assert!(next_si != STATE_UNKNOWN);
}
si = next_si;
}
if self.cache.state(si).flags.is_match() {
result = Result::Match(self.at + 1);
if self.quit_after_match {
return result;
}
self.last_match_si = si;
}
si = match self.next_state(qcur, qnext, si, Byte::eof()) {
None => return Result::Quit,
Some(STATE_DEAD) => return result,
Some(si) => si,
};
debug_assert!(si != STATE_UNKNOWN);
if self.cache.state(si).flags.is_match() {
self.last_match_si = si;
result = Result::Match(0);
}
result
}
fn exec_byte(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
mut si: StatePtr,
b: Byte,
) -> Option<StatePtr> {
use prog::Inst::*;
qcur.clear();
for &ip in self.cache.state(si).insts.iter() {
qcur.add(ip as usize);
}
let is_word_last = self.cache.state(si).flags.is_word();
let is_word = b.is_ascii_word();
if self.cache.state(si).flags.has_empty() {
let mut flags = EmptyFlags::default();
if b.is_eof() {
flags.end = true;
flags.end_line = true;
} else if b.as_byte().map_or(false, |b| b == b'\n') {
flags.end_line = true;
}
if is_word_last == is_word {
flags.not_word_boundary = true;
} else {
flags.word_boundary = true;
}
qnext.clear();
for &ip in &*qcur {
self.follow_epsilons(usize_to_u32(ip), qnext, flags);
}
mem::swap(qcur, qnext);
}
let mut empty_flags = EmptyFlags::default();
let mut state_flags = StateFlags::default();
empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
if b.is_ascii_word() {
state_flags.set_word();
}
qnext.clear();
for &ip in &*qcur {
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Save(_) | Split(_) | EmptyLook(_) => {}
Match(_) => {
state_flags.set_match();
if !self.continue_past_first_match() {
break;
} else if self.prog.matches.len() > 1
&& !qnext.contains_ip(ip as usize) {
qnext.add(ip);
}
}
Bytes(ref inst) => {
if b.as_byte().map_or(false, |b| inst.matches(b)) {
self.follow_epsilons(
inst.goto as InstPtr, qnext, empty_flags);
}
}
}
}
let mut cache = true;
if b.is_eof() && self.prog.matches.len() > 1 {
mem::swap(qcur, qnext);
cache = false;
}
let next = match self.cached_state(qnext, state_flags, Some(&mut si)) {
None => return None,
Some(next) => next,
};
debug_assert!(next != STATE_UNKNOWN);
let cls = self.byte_class(b);
if cache {
self.cache.state_mut(si).next[cls] = next;
}
Some(next)
}
fn follow_epsilons(
&mut self,
ip: InstPtr,
q: &mut SparseSet,
flags: EmptyFlags,
) {
use prog::Inst::*;
use prog::EmptyLook::*;
self.cache.stack.push(ip);
while let Some(ip) = self.cache.stack.pop() {
if q.contains_ip(ip as usize) {
continue;
}
q.add(ip as usize);
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Match(_) | Bytes(_) => {}
EmptyLook(ref inst) => {
match inst.look {
StartLine if flags.start_line => {
self.cache.stack.push(inst.goto as InstPtr);
}
EndLine if flags.end_line => {
self.cache.stack.push(inst.goto as InstPtr);
}
StartText if flags.start => {
self.cache.stack.push(inst.goto as InstPtr);
}
EndText if flags.end => {
self.cache.stack.push(inst.goto as InstPtr);
}
WordBoundaryAscii if flags.word_boundary => {
self.cache.stack.push(inst.goto as InstPtr);
}
NotWordBoundaryAscii if flags.not_word_boundary => {
self.cache.stack.push(inst.goto as InstPtr);
}
StartLine | EndLine | StartText | EndText => {}
WordBoundaryAscii | NotWordBoundaryAscii => {}
WordBoundary | NotWordBoundary => unreachable!(),
}
}
Save(ref inst) => self.cache.stack.push(inst.goto as InstPtr),
Split(ref inst) => {
self.cache.stack.push(inst.goto2 as InstPtr);
self.cache.stack.push(inst.goto1 as InstPtr);
}
}
}
}
fn cached_state(
&mut self,
q: &SparseSet,
mut state_flags: StateFlags,
current_state: Option<&mut StatePtr>,
) -> Option<StatePtr> {
let key = match self.cached_state_key(q, &mut state_flags) {
None => return Some(STATE_DEAD),
Some(v) => v,
};
if let Some(&si) = self.cache.compiled.get(&key) {
return Some(si);
}
if self.approximate_size() > CACHE_LIMIT {
if !self.clear_cache_and_save(current_state) {
return None;
}
}
let next = vec![STATE_UNKNOWN; self.num_byte_classes()];
self.cache.states.push(State {
next: next.into_boxed_slice(),
insts: key.insts.clone(),
flags: state_flags,
});
let si = usize_to_u32(self.cache.states.len().checked_sub(1).unwrap());
self.cache.compiled.insert(key, si);
Some(si)
}
fn cached_state_key(
&mut self,
q: &SparseSet,
state_flags: &mut StateFlags,
) -> Option<StateKey> {
use prog::Inst::*;
use prog::EmptyLook::*;
let mut insts = vec![];
for &ip in q {
let ip = usize_to_u32(ip);
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Save(_) => {}
Split(_) => {}
Bytes(_) => insts.push(ip),
EmptyLook(ref inst) => {
match inst.look {
StartLine => {
state_flags.set_empty();
insts.push(ip);
}
EndLine => {
state_flags.set_empty();
insts.push(ip);
}
StartText => {
state_flags.set_empty();
insts.push(ip);
}
EndText => {
state_flags.set_empty();
insts.push(ip);
}
WordBoundaryAscii => {
state_flags.set_empty();
insts.push(ip);
}
NotWordBoundaryAscii => {
state_flags.set_empty();
insts.push(ip);
}
WordBoundary | NotWordBoundary => unreachable!(),
}
}
Match(_) => {
insts.push(ip);
if !self.continue_past_first_match() {
break;
}
}
}
}
if insts.len() == 0 && !state_flags.is_match() {
None
} else {
Some(StateKey {
insts: insts.into_boxed_slice(),
flags: *state_flags,
})
}
}
fn clear_cache_and_save(
&mut self,
current_state: Option<&mut StatePtr>,
) -> bool {
if self.cache.states.len() <= 2 {
return true;
}
match current_state {
None => self.clear_cache(),
Some(si) => {
let cur = self.copy_state(*si);
if !self.clear_cache() {
return false;
}
*si = self.restore_state(cur);
true
}
}
}
fn clear_cache(&mut self) -> bool {
let nstates = self.cache.states.len();
if self.cache.flush_count >= 3
&& self.at >= self.last_cache_flush
&& (self.at - self.last_cache_flush) <= 10 * nstates {
return false;
}
self.last_cache_flush = self.at;
self.cache.flush_count += 1;
let start = self.copy_state(self.start);
self.cache.states.clear();
self.cache.compiled.clear();
for start in self.cache.start_states.iter_mut() {
*start = STATE_UNKNOWN;
}
self.cache.states.push(State::invalid());
self.cache.states.push(State::invalid());
self.start = self.restore_state(start);
true
}
fn copy_state(&self, si: StatePtr) -> State {
let mut state = self.cache.state(si).clone();
let unknowns = vec![STATE_UNKNOWN; self.num_byte_classes()];
state.next = unknowns.into_boxed_slice();
state
}
fn restore_state(&mut self, state: State) -> StatePtr {
let key = StateKey {
insts: state.insts.clone(),
flags: state.flags,
};
if let Some(&si) = self.cache.compiled.get(&key) {
return si;
}
let si = usize_to_u32(self.cache.states.len());
self.cache.states.push(state);
self.cache.compiled.insert(key, si);
si
}
fn next_state(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
si: StatePtr,
b: Byte,
) -> Option<StatePtr> {
if si == STATE_DEAD {
return Some(STATE_DEAD);
}
let cls = self.byte_class(b);
match self.cache.state(si).next[cls] {
STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
STATE_DEAD => return Some(STATE_DEAD),
nsi => return Some(nsi),
}
}
#[inline(always)] fn start_state(
&mut self,
q: &mut SparseSet,
empty_flags: EmptyFlags,
state_flags: StateFlags,
) -> Option<StatePtr> {
let flagi = empty_flags.as_index();
match self.cache.start_states[flagi] {
STATE_UNKNOWN => {}
STATE_DEAD => return Some(STATE_DEAD),
si => return Some(si),
}
q.clear();
let start = usize_to_u32(self.prog.start);
self.follow_epsilons(start, q, empty_flags);
let sp = match self.cached_state(q, state_flags, None) {
None => return None,
Some(sp) => sp,
};
self.cache.start_states[flagi] = sp;
Some(sp)
}
fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
let mut empty_flags = EmptyFlags::default();
let mut state_flags = StateFlags::default();
empty_flags.start = at == 0;
empty_flags.end = text.len() == 0;
empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
empty_flags.end_line = text.len() == 0;
if at > 0 && Byte::byte(text[at - 1]).is_ascii_word() {
state_flags.set_word();
}
(empty_flags, state_flags)
}
fn start_flags_reverse(
&self,
text: &[u8],
at: usize,
) -> (EmptyFlags, StateFlags) {
let mut empty_flags = EmptyFlags::default();
let mut state_flags = StateFlags::default();
empty_flags.start = at == text.len();
empty_flags.end = text.len() == 0;
empty_flags.start_line = at == text.len() || text[at] == b'\n';
empty_flags.end_line = text.len() == 0;
if at < text.len() && Byte::byte(text[at]).is_ascii_word() {
state_flags.set_word();
}
(empty_flags, state_flags)
}
fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s)
}
fn num_byte_classes(&self) -> usize {
(self.prog.byte_classes[255] as usize + 1) + 1
}
fn byte_class(&self, b: Byte) -> usize {
if b.is_eof() {
self.num_byte_classes() - 1
} else {
self.prog.byte_classes[b.0 as usize] as usize
}
}
fn u8_class(&self, b: u8) -> usize {
self.prog.byte_classes[b as usize] as usize
}
fn continue_past_first_match(&self) -> bool {
self.prog.is_reverse || self.prog.matches.len() > 1
}
fn approximate_size(&self) -> usize {
use std::mem::size_of as size;
let compiled =
(self.cache.compiled.len() * (size::<StateKey>() + 64))
+ (self.cache.compiled.len() * size::<StatePtr>());
let states =
self.cache.states.len()
* (size::<State>()
+ 64
+ (self.num_byte_classes() * size::<StatePtr>()));
let start_states = self.cache.start_states.len() * size::<StatePtr>();
self.prog.approximate_size() + compiled + states + start_states
}
#[allow(dead_code)] fn actual_size(&self) -> usize {
let mut compiled = 0;
for k in self.cache.compiled.keys() {
compiled += mem::size_of::<StateKey>();
compiled += mem::size_of::<StatePtr>();
compiled += k.insts.len() * mem::size_of::<InstPtr>();
}
let mut states = 0;
for s in &self.cache.states {
states += mem::size_of::<State>();
states += s.next.len() * mem::size_of::<StatePtr>();
states += s.insts.len() * mem::size_of::<InstPtr>();
}
compiled
+ states
+ (self.cache.start_states.len() * mem::size_of::<StatePtr>())
+ (self.cache.stack.len() * mem::size_of::<InstPtr>())
+ mem::size_of::<Fsm>()
+ mem::size_of::<CacheInner>()
+ self.prog.approximate_size() }
}
impl State {
fn invalid() -> State {
State {
next: vec![].into_boxed_slice(),
insts: vec![].into_boxed_slice(),
flags: StateFlags::default(),
}
}
}
impl CacheInner {
fn state(&self, si: StatePtr) -> &State {
&self.states[si as usize]
}
fn state_mut(&mut self, si: StatePtr) -> &mut State {
&mut self.states[si as usize]
}
}
impl EmptyFlags {
fn as_index(&self) -> usize {
(((self.start as u8) << 0) |
((self.end as u8) << 1) |
((self.start_line as u8) << 2) |
((self.end_line as u8) << 3) |
((self.word_boundary as u8) << 4) |
((self.not_word_boundary as u8) << 5))
as usize
}
}
impl StateFlags {
fn is_match(&self) -> bool {
self.0 & 0b0000000_1 > 0
}
fn set_match(&mut self) {
self.0 |= 0b0000000_1;
}
fn is_word(&self) -> bool {
self.0 & 0b000000_1_0 > 0
}
fn set_word(&mut self) {
self.0 |= 0b000000_1_0;
}
fn has_empty(&self) -> bool {
self.0 & 0b00000_1_00 > 0
}
fn set_empty(&mut self) {
self.0 |= 0b00000_1_00;
}
}
impl Byte {
fn byte(b: u8) -> Self { Byte(b as u16) }
fn eof() -> Self { Byte(256) }
fn is_eof(&self) -> bool { self.0 == 256 }
fn is_ascii_word(&self) -> bool {
let b = match self.as_byte() {
None => return false,
Some(b) => b,
};
match b {
b'A'...b'Z' | b'a'...b'z' | b'0'...b'9' | b'_' => true,
_ => false,
}
}
fn as_byte(&self) -> Option<u8> {
if self.is_eof() {
None
} else {
Some(self.0 as u8)
}
}
}
impl fmt::Debug for State {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut next = vec![];
for (b, next_sp) in self.next.iter().enumerate() {
match *next_sp {
STATE_UNKNOWN => {}
STATE_DEAD => next.push((vb(b as usize), "DEAD".to_string())),
si => next.push((vb(b as usize), si.to_string())),
}
}
f.debug_struct("State")
.field("flags", &self.flags)
.field("insts", &self.insts)
.field("next", &next)
.finish()
}
}
impl fmt::Debug for StateFlags {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("StateFlags")
.field("is_match", &self.is_match())
.field("is_word", &self.is_word())
.field("has_empty", &self.has_empty())
.finish()
}
}
fn vb(b: usize) -> String {
use std::ascii::escape_default;
if b > ::std::u8::MAX as usize {
"EOF".to_owned()
} else {
let escaped = escape_default(b as u8).collect::<Vec<u8>>();
String::from_utf8_lossy(&escaped).into_owned()
}
}
fn usize_to_u32(n: usize) -> u32 {
if (n as u64) > (::std::u32::MAX as u64) {
panic!("BUG: {} is too big to fit into u32", n)
}
n as u32
}