use std::collections::HashMap;
use std::fmt;
use std::iter::repeat;
use std::mem;
use std::sync::Arc;
use exec::ProgramCache;
use prog::{Inst, Program};
use sparse::SparseSet;
/// Returns true if the given program can be executed by this DFA engine.
///
/// The DFA operates on byte-based programs only: any program that still
/// contains `Char` or `Ranges` instructions is rejected, as is a program
/// with a zero cache budget or one too long to address with `i32` deltas.
pub fn can_exec(insts: &Program) -> bool {
    use prog::Inst::*;
    // A zero size limit disables the DFA entirely; programs longer than
    // i32::MAX can't be encoded as signed instruction-pointer deltas.
    if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
        return false;
    }
    insts.into_iter().all(|inst| match *inst {
        Char(_) | Ranges(_) => false,
        EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => true,
    })
}
/// A reusable cache of DFA states and transitions, plus scratch space used
/// while computing new states.
#[derive(Debug)]
pub struct Cache {
    // Compiled states, the transition table and related bookkeeping.
    inner: CacheInner,
    // Scratch sparse sets holding the NFA instruction pointers that make up
    // the current/next DFA state during state construction.
    qcur: SparseSet,
    qnext: SparseSet,
}
/// The mutable cache state shared by all DFA executions using this cache.
#[derive(Debug)]
struct CacheInner {
    // Maps a compiled `State` to its pointer in the transition table.
    compiled: StateMap,
    // The transition table: one row of `num_byte_classes` entries per state.
    trans: Transitions,
    // Cached start states, indexed by packed start-flag bits
    // (see `Fsm::start_state`).
    start_states: Vec<StatePtr>,
    // Explicit stack used by `follow_epsilons` instead of recursion.
    stack: Vec<InstPtr>,
    // How many times this cache has been wiped; used by `clear_cache` to
    // detect thrashing.
    flush_count: u64,
    // Approximate heap usage of the cache, in bytes.
    size: usize,
    // Reusable buffer for encoding state keys (see `cached_state_key`).
    insts_scratch_space: Vec<u8>,
}
/// The transition table, laid out row-major: state `si` occupies entries
/// `si .. si + num_byte_classes`, one entry per input byte class.
#[derive(Clone)]
struct Transitions {
    table: Vec<StatePtr>,
    // Number of byte classes, including the extra sentinel class for EOF.
    num_byte_classes: usize,
}
/// A single DFA search in progress, borrowing the program and a cache.
#[derive(Debug)]
pub struct Fsm<'a> {
    prog: &'a Program,
    // The start state pointer, possibly tagged with STATE_START.
    start: StatePtr,
    // The current position in the input.
    at: usize,
    // When true, stop as soon as the first match is seen.
    quit_after_match: bool,
    // Pointer to the most recently seen match state (STATE_UNKNOWN if none).
    last_match_si: StatePtr,
    // Input position at the time of the last cache flush; used by
    // `clear_cache` to judge progress between flushes.
    last_cache_flush: usize,
    cache: &'a mut CacheInner,
}
/// The result of running the DFA.
#[derive(Clone, Debug)]
pub enum Result<T> {
    /// A match was found; the payload is engine-specific (typically an
    /// input position).
    Match(T),
    /// No match; the payload is the input position searched up to.
    NoMatch(usize),
    /// The DFA could not continue (e.g. an uncacheable or unsupported
    /// situation); the caller should fall back to another engine.
    Quit,
}
impl<T> Result<T> {
    /// True if and only if this result is a `Match`.
    pub fn is_match(&self) -> bool {
        if let Result::Match(_) = *self {
            true
        } else {
            false
        }
    }

    /// Maps the `Match` payload through `f`, leaving the other variants
    /// untouched.
    #[cfg(feature = "perf-literal")]
    pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
        use self::Result::*;
        match self {
            Match(t) => Match(f(t)),
            NoMatch(x) => NoMatch(x),
            Quit => Quit,
        }
    }

    /// Replaces the position carried by a `NoMatch` with `at`; other
    /// variants are returned unchanged.
    fn set_non_match(self, at: usize) -> Result<T> {
        if let Result::NoMatch(_) = self {
            Result::NoMatch(at)
        } else {
            self
        }
    }
}
/// A compiled DFA state: a set of NFA instruction pointers plus flags.
///
/// The encoding is one flags byte followed by delta-encoded varint
/// instruction pointers (see `push_inst_ptr` and `InstPtrs`). Using
/// `Arc<[u8]>` makes cloning a state cheap (refcount bump only).
#[derive(Clone, Eq, Hash, PartialEq)]
struct State {
    data: Arc<[u8]>,
}

/// An index into the NFA program's instruction list.
type InstPtr = u32;
/// Appends `ip` to `data` encoded as a signed varint delta from `*prev`,
/// then updates `*prev` to `ip`. Delta encoding keeps consecutive nearby
/// instruction pointers small on the wire.
fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
    let last = *prev as i32;
    write_vari32(data, (ip as i32) - last);
    *prev = ip;
}
/// An iterator that decodes the delta-encoded instruction pointers stored
/// in a `State`'s data (everything after the leading flags byte).
struct InstPtrs<'a> {
    // The running absolute instruction pointer.
    base: usize,
    // The remaining undecoded bytes.
    data: &'a [u8],
}
impl<'a> Iterator for InstPtrs<'a> {
    type Item = usize;

    /// Decodes the next signed delta and yields the absolute pointer.
    fn next(&mut self) -> Option<usize> {
        if self.data.is_empty() {
            return None;
        }
        let (delta, consumed) = read_vari32(self.data);
        debug_assert!(consumed > 0);
        let next_base = self.base as i32 + delta;
        debug_assert!(next_base >= 0);
        self.base = next_base as usize;
        self.data = &self.data[consumed..];
        Some(self.base)
    }
}
impl State {
    /// Returns the flags stored in the first byte of the encoding.
    fn flags(&self) -> StateFlags {
        let bits = self.data[0];
        StateFlags(bits)
    }

    /// Iterates over the instruction pointers encoded after the flags byte.
    fn inst_ptrs(&self) -> InstPtrs {
        let encoded = &self.data[1..];
        InstPtrs { base: 0, data: encoded }
    }
}
/// A pointer to a DFA state: a row offset into the transition table.
///
/// The top three bits are reserved for sentinels and tags:
///   * `1 << 31` — `STATE_UNKNOWN`: the transition has not been computed
///     yet; `STATE_DEAD` and `STATE_QUIT` are the next two values above it.
///   * `1 << 30` — `STATE_START` tag: the state is a start state, which
///     lets the search loop run the literal-prefix accelerator.
///   * `1 << 29` — `STATE_MATCH` tag: the state contains a match.
/// Real (untagged) state pointers must therefore be `<= STATE_MAX`.
type StatePtr = u32;
const STATE_UNKNOWN: StatePtr = 1 << 31;
const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1;
const STATE_QUIT: StatePtr = STATE_DEAD + 1;
const STATE_START: StatePtr = 1 << 30;
const STATE_MATCH: StatePtr = 1 << 29;
const STATE_MAX: StatePtr = STATE_MATCH - 1;
/// An input byte widened to `u16` so that the value 256 can represent EOF.
#[derive(Copy, Clone, Debug)]
struct Byte(u16);
/// The empty-width assertions satisfied at a particular input position.
#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
struct EmptyFlags {
    start: bool,
    end: bool,
    start_line: bool,
    end_line: bool,
    word_boundary: bool,
    not_word_boundary: bool,
}
/// Per-state flags packed into one byte; see the `StateFlags` accessor
/// impl for the bit layout (match / word / has-empty).
#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
struct StateFlags(u8);
impl Cache {
    /// Creates a new, empty cache sized for the given program.
    pub fn new(prog: &Program) -> Self {
        // One class per distinct byte class, plus one sentinel class for EOF.
        let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
        // Start states are indexed by packed start flags; 256 slots cover
        // every combination (only the low 7 bits are used by `start_state`).
        let starts = vec![STATE_UNKNOWN; 256];
        let mut cache = Cache {
            inner: CacheInner {
                compiled: StateMap::new(num_byte_classes),
                trans: Transitions::new(num_byte_classes),
                start_states: starts,
                stack: vec![],
                flush_count: 0,
                size: 0,
                insts_scratch_space: vec![],
            },
            qcur: SparseSet::new(prog.insts.len()),
            qnext: SparseSet::new(prog.insts.len()),
        };
        cache.inner.reset_size();
        cache
    }
}
impl CacheInner {
    /// Resets the approximate size to the fixed overhead of the start-state
    /// table and the epsilon-closure stack.
    fn reset_size(&mut self) {
        let starts_bytes = self.start_states.len() * mem::size_of::<StatePtr>();
        let stack_bytes = self.stack.len() * mem::size_of::<InstPtr>();
        self.size = starts_bytes + stack_bytes;
    }
}
impl<'a> Fsm<'a> {
/// Executes the DFA forwards over `text` starting at `at`, returning a
/// `Result` whose `Match` payload is an input position (see `exec_at`).
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn forward(
    prog: &'a Program,
    cache: &ProgramCache,
    quit_after_match: bool,
    text: &[u8],
    at: usize,
) -> Result<usize> {
    let mut cache = cache.borrow_mut();
    let cache = &mut cache.dfa;
    let mut dfa = Fsm {
        prog: prog,
        // `start` is filled in below once the start flags are known.
        start: 0,
        at: at,
        quit_after_match: quit_after_match,
        last_match_si: STATE_UNKNOWN,
        last_cache_flush: at,
        cache: &mut cache.inner,
    };
    let (empty_flags, state_flags) = dfa.start_flags(text, at);
    dfa.start =
        match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
            None => return Result::Quit,
            Some(STATE_DEAD) => return Result::NoMatch(at),
            Some(si) => si,
        };
    debug_assert!(dfa.start != STATE_UNKNOWN);
    dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
}
/// Executes the DFA in reverse over `text`, starting at `at` and moving
/// towards 0. Requires a reverse-compiled program (asserted in
/// `exec_at_reverse`).
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn reverse(
    prog: &'a Program,
    cache: &ProgramCache,
    quit_after_match: bool,
    text: &[u8],
    at: usize,
) -> Result<usize> {
    let mut cache = cache.borrow_mut();
    // Reverse execution has its own cache, separate from the forward one.
    let cache = &mut cache.dfa_reverse;
    let mut dfa = Fsm {
        prog: prog,
        // Filled in below once the start flags are known.
        start: 0,
        at: at,
        quit_after_match: quit_after_match,
        last_match_si: STATE_UNKNOWN,
        last_cache_flush: at,
        cache: &mut cache.inner,
    };
    let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
    dfa.start =
        match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
            None => return Result::Quit,
            Some(STATE_DEAD) => return Result::NoMatch(at),
            Some(si) => si,
        };
    debug_assert!(dfa.start != STATE_UNKNOWN);
    dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
}
/// Like `forward`, but for regex sets: on a match, records which regexes
/// matched by reading the `Match` instructions out of the final match
/// state.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn forward_many(
    prog: &'a Program,
    cache: &ProgramCache,
    matches: &mut [bool],
    text: &[u8],
    at: usize,
) -> Result<usize> {
    debug_assert!(matches.len() == prog.matches.len());
    let mut cache = cache.borrow_mut();
    let cache = &mut cache.dfa;
    let mut dfa = Fsm {
        prog: prog,
        // Filled in below once the start flags are known.
        start: 0,
        at: at,
        quit_after_match: false,
        last_match_si: STATE_UNKNOWN,
        last_cache_flush: at,
        cache: &mut cache.inner,
    };
    let (empty_flags, state_flags) = dfa.start_flags(text, at);
    dfa.start =
        match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
            None => return Result::Quit,
            Some(STATE_DEAD) => return Result::NoMatch(at),
            Some(si) => si,
        };
    debug_assert!(dfa.start != STATE_UNKNOWN);
    let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
    if result.is_match() {
        if matches.len() == 1 {
            // A single regex: a match is a match.
            matches[0] = true;
        } else {
            // Pull the set of Match instructions out of the last match
            // state seen to discover which regexes in the set matched.
            debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
            debug_assert!(dfa.last_match_si != STATE_DEAD);
            for ip in dfa.state(dfa.last_match_si).inst_ptrs() {
                if let Inst::Match(slot) = dfa.prog[ip] {
                    matches[slot] = true;
                }
            }
        }
    }
    result
}
/// The core forward search loop: steps through `text` using the cached
/// transition table, compiling missing transitions with the NFA on
/// demand, until a match is found, the input ends, a dead state is
/// reached, or the DFA must quit.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn exec_at(
    &mut self,
    qcur: &mut SparseSet,
    qnext: &mut SparseSet,
    text: &[u8],
) -> Result<usize> {
    debug_assert!(!self.prog.is_reverse);
    let mut result = Result::NoMatch(self.at);
    let (mut prev_si, mut next_si) = (self.start, self.start);
    let mut at = self.at;
    while at < text.len() {
        // Hot inner loop, unrolled 4x: while we stay on real (untagged,
        // <= STATE_MAX) states, ping-pong between prev_si/next_si so no
        // swap is needed per byte. The loop exits with the tagged/special
        // pointer in `next_si` (hence the swaps on the odd steps).
        // Safety for `next_si` calls: `at < text.len()` is checked and the
        // pointer used is known to be <= STATE_MAX at each call site.
        while next_si <= STATE_MAX && at < text.len() {
            prev_si = unsafe { self.next_si(next_si, text, at) };
            at += 1;
            if prev_si > STATE_MAX || at + 2 >= text.len() {
                mem::swap(&mut prev_si, &mut next_si);
                break;
            }
            next_si = unsafe { self.next_si(prev_si, text, at) };
            at += 1;
            if next_si > STATE_MAX {
                break;
            }
            prev_si = unsafe { self.next_si(next_si, text, at) };
            at += 1;
            if prev_si > STATE_MAX {
                mem::swap(&mut prev_si, &mut next_si);
                break;
            }
            next_si = unsafe { self.next_si(prev_si, text, at) };
            at += 1;
        }
        if next_si & STATE_MATCH > 0 {
            // Entered a match state: the byte that completed the match was
            // consumed at `at - 1`.
            next_si &= !STATE_MATCH;
            result = Result::Match(at - 1);
            if self.quit_after_match {
                return result;
            }
            self.last_match_si = next_si;
            prev_si = next_si;
            if self.prog.matches.len() > 1 {
                // For regex sets: if the state consists solely of match
                // instructions, continuing cannot change what matched.
                let state = self.state(next_si);
                let just_matches =
                    state.inst_ptrs().all(|ip| self.prog[ip].is_match());
                if just_matches {
                    return result;
                }
            }
            // Fast path for a match state that loops back to itself:
            // consume bytes while the transition stays in place.
            let cur = at;
            while (next_si & !STATE_MATCH) == prev_si
                && at + 2 < text.len()
            {
                next_si = unsafe {
                    self.next_si(next_si & !STATE_MATCH, text, at)
                };
                at += 1;
            }
            if at > cur {
                result = Result::Match(at - 2);
            }
        } else if next_si & STATE_START > 0 {
            // A start state with a literal prefix: jump ahead with the
            // prefix searcher instead of stepping byte by byte.
            debug_assert!(self.has_prefix());
            next_si &= !STATE_START;
            prev_si = next_si;
            at = match self.prefix_at(text, at) {
                None => return Result::NoMatch(text.len()),
                Some(i) => i,
            };
        } else if next_si >= STATE_UNKNOWN {
            if next_si == STATE_QUIT {
                return Result::Quit;
            }
            // The transition isn't compiled yet: compute it with the NFA
            // for the byte we just consumed (at `at - 1`).
            let byte = Byte::byte(text[at - 1]);
            prev_si &= STATE_MAX;
            self.at = at;
            next_si = match self.next_state(qcur, qnext, prev_si, byte) {
                None => return Result::Quit,
                Some(STATE_DEAD) => return result.set_non_match(at),
                Some(si) => si,
            };
            debug_assert!(next_si != STATE_UNKNOWN);
            if next_si & STATE_MATCH > 0 {
                next_si &= !STATE_MATCH;
                result = Result::Match(at - 1);
                if self.quit_after_match {
                    return result;
                }
                self.last_match_si = next_si;
            }
            prev_si = next_si;
        } else {
            prev_si = next_si;
        }
    }
    // Input exhausted: take the special EOF transition so that trailing
    // empty-width assertions (e.g. `$`) can still produce a match.
    prev_si &= STATE_MAX;
    prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
        None => return Result::Quit,
        Some(STATE_DEAD) => return result.set_non_match(text.len()),
        Some(si) => si & !STATE_START,
    };
    debug_assert!(prev_si != STATE_UNKNOWN);
    if prev_si & STATE_MATCH > 0 {
        prev_si &= !STATE_MATCH;
        self.last_match_si = prev_si;
        result = Result::Match(text.len());
    }
    result
}
/// The core reverse search loop: mirrors `exec_at` but walks `text`
/// backwards from `self.at` towards 0 with a reverse-compiled program.
/// Note there is no prefix (STATE_START) acceleration in reverse.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn exec_at_reverse(
    &mut self,
    qcur: &mut SparseSet,
    qnext: &mut SparseSet,
    text: &[u8],
) -> Result<usize> {
    debug_assert!(self.prog.is_reverse);
    let mut result = Result::NoMatch(self.at);
    let (mut prev_si, mut next_si) = (self.start, self.start);
    let mut at = self.at;
    while at > 0 {
        // Hot inner loop, unrolled 4x, decrementing `at` before each
        // lookup. As in `exec_at`, the loop exits with the tagged pointer
        // in `next_si`. Safety: `at > 0` guarantees `at - 1` is in
        // bounds, and the `at <= 4` early-out leaves room for the
        // remaining unrolled steps.
        while next_si <= STATE_MAX && at > 0 {
            at -= 1;
            prev_si = unsafe { self.next_si(next_si, text, at) };
            if prev_si > STATE_MAX || at <= 4 {
                mem::swap(&mut prev_si, &mut next_si);
                break;
            }
            at -= 1;
            next_si = unsafe { self.next_si(prev_si, text, at) };
            if next_si > STATE_MAX {
                break;
            }
            at -= 1;
            prev_si = unsafe { self.next_si(next_si, text, at) };
            if prev_si > STATE_MAX {
                mem::swap(&mut prev_si, &mut next_si);
                break;
            }
            at -= 1;
            next_si = unsafe { self.next_si(prev_si, text, at) };
        }
        if next_si & STATE_MATCH > 0 {
            // The byte consumed at `at` completed a match ending (in the
            // reversed sense) at `at + 1`.
            next_si &= !STATE_MATCH;
            result = Result::Match(at + 1);
            if self.quit_after_match {
                return result;
            }
            self.last_match_si = next_si;
            prev_si = next_si;
            // Fast path for a self-looping match state, walking backwards.
            let cur = at;
            while (next_si & !STATE_MATCH) == prev_si && at >= 2 {
                at -= 1;
                next_si = unsafe {
                    self.next_si(next_si & !STATE_MATCH, text, at)
                };
            }
            if at < cur {
                result = Result::Match(at + 2);
            }
        } else if next_si >= STATE_UNKNOWN {
            if next_si == STATE_QUIT {
                return Result::Quit;
            }
            // Uncompiled transition: compute it with the NFA for the byte
            // at the current position.
            let byte = Byte::byte(text[at]);
            prev_si &= STATE_MAX;
            self.at = at;
            next_si = match self.next_state(qcur, qnext, prev_si, byte) {
                None => return Result::Quit,
                Some(STATE_DEAD) => return result.set_non_match(at),
                Some(si) => si,
            };
            debug_assert!(next_si != STATE_UNKNOWN);
            if next_si & STATE_MATCH > 0 {
                next_si &= !STATE_MATCH;
                result = Result::Match(at + 1);
                if self.quit_after_match {
                    return result;
                }
                self.last_match_si = next_si;
            }
            prev_si = next_si;
        } else {
            prev_si = next_si;
        }
    }
    // Reached the beginning of the input: take the EOF transition so that
    // leading empty-width assertions can still produce a match at 0.
    prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
        None => return Result::Quit,
        Some(STATE_DEAD) => return result.set_non_match(0),
        Some(si) => si,
    };
    debug_assert!(prev_si != STATE_UNKNOWN);
    if prev_si & STATE_MATCH > 0 {
        prev_si &= !STATE_MATCH;
        self.last_match_si = prev_si;
        result = Result::Match(0);
    }
    result
}
/// Looks up the next state for the byte at `text[i]` directly in the
/// transition table, with no bounds checks.
///
/// Safety: the caller must guarantee `i < text.len()` and that `si` is a
/// valid, untagged state pointer (a row offset produced by
/// `Transitions::add`); both are verified only with `debug_assert!`.
#[cfg_attr(feature = "perf-inline", inline(always))]
unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
    debug_assert!(i < text.len());
    let b = *text.get_unchecked(i);
    debug_assert!((b as usize) < self.prog.byte_classes.len());
    let cls = *self.prog.byte_classes.get_unchecked(b as usize);
    self.cache.trans.next_unchecked(si, cls as usize)
}
/// Computes the transition out of state `si` on input `b` by simulating
/// the NFA, compiles the resulting state, caches the transition (when
/// appropriate) and returns the possibly tagged next-state pointer.
/// Returns `None` if a new state was needed but the cache was full and
/// could not be cleared.
fn exec_byte(
    &mut self,
    qcur: &mut SparseSet,
    qnext: &mut SparseSet,
    mut si: StatePtr,
    b: Byte,
) -> Option<StatePtr> {
    use prog::Inst::*;
    // Seed qcur with the NFA instructions that make up the DFA state.
    qcur.clear();
    for ip in self.state(si).inst_ptrs() {
        qcur.insert(ip);
    }
    let is_word_last = self.state(si).flags().is_word();
    let is_word = b.is_ascii_word();
    if self.state(si).flags().has_empty() {
        // The state contains empty-width assertions: resolve them now
        // using the flags in effect at the boundary just before `b`.
        let mut flags = EmptyFlags::default();
        if b.is_eof() {
            flags.end = true;
            flags.end_line = true;
        } else if b.as_byte().map_or(false, |b| b == b'\n') {
            flags.end_line = true;
        }
        if is_word_last == is_word {
            flags.not_word_boundary = true;
        } else {
            flags.word_boundary = true;
        }
        qnext.clear();
        for &ip in &*qcur {
            self.follow_epsilons(usize_to_u32(ip), qnext, flags);
        }
        mem::swap(qcur, qnext);
    }
    // Flags for the *next* state: a '\n' means the following position
    // starts a line; a word byte marks the next state as word-preceded.
    let mut empty_flags = EmptyFlags::default();
    let mut state_flags = StateFlags::default();
    empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
    if b.is_ascii_word() {
        state_flags.set_word();
    }
    // Advance every NFA state over the byte `b`.
    qnext.clear();
    for &ip in &*qcur {
        match self.prog[ip as usize] {
            // The DFA only runs bytes-based programs (see `can_exec`).
            Char(_) | Ranges(_) => unreachable!(),
            Save(_) | Split(_) | EmptyLook(_) => {}
            Match(_) => {
                state_flags.set_match();
                if !self.continue_past_first_match() {
                    break;
                } else if self.prog.matches.len() > 1
                    && !qnext.contains(ip as usize)
                {
                    // Keep match instructions in the next state for regex
                    // sets so every matching regex can be reported.
                    qnext.insert(ip);
                }
            }
            Bytes(ref inst) => {
                if b.as_byte().map_or(false, |b| inst.matches(b)) {
                    self.follow_epsilons(
                        inst.goto as InstPtr,
                        qnext,
                        empty_flags,
                    );
                }
            }
        }
    }
    // At EOF with a regex set, compile the *current* instruction set as
    // the next state so `forward_many` can read the match slots out of
    // it. Such a state is not a real transition, so it is not cached.
    let cache = if b.is_eof() && self.prog.matches.len() > 1 {
        mem::swap(qcur, qnext);
        false
    } else {
        true
    };
    // Compile (or look up) the next state. `si` may be remapped here if
    // the cache had to be flushed.
    let mut next =
        match self.cached_state(qnext, state_flags, Some(&mut si)) {
            None => return None,
            Some(next) => next,
        };
    if (self.start & !STATE_START) == next {
        // Transitioned back to the start state: re-apply the START tag so
        // the search loop can use the prefix accelerator again.
        debug_assert!(!self.state(next).flags().is_match());
        next = self.start_ptr(next);
    }
    if next <= STATE_MAX && self.state(next).flags().is_match() {
        next |= STATE_MATCH;
    }
    debug_assert!(next != STATE_UNKNOWN);
    if cache {
        let cls = self.byte_class(b);
        self.cache.trans.set_next(si, cls, next);
    }
    Some(next)
}
/// Adds `ip` and everything reachable from it through `Save`/`Split`
/// instructions and *satisfied* empty-width assertions to `q`, using the
/// cache's explicit stack instead of recursion.
fn follow_epsilons(
    &mut self,
    ip: InstPtr,
    q: &mut SparseSet,
    flags: EmptyFlags,
) {
    use prog::EmptyLook::*;
    use prog::Inst::*;
    self.cache.stack.push(ip);
    while let Some(mut ip) = self.cache.stack.pop() {
        // Follow single-successor instructions in place; only Split's
        // second branch goes onto the stack.
        loop {
            if q.contains(ip as usize) {
                break;
            }
            q.insert(ip as usize);
            match self.prog[ip as usize] {
                // Bytes-only programs never contain these (see `can_exec`).
                Char(_) | Ranges(_) => unreachable!(),
                // Non-epsilon instructions terminate this path.
                Match(_) | Bytes(_) => {
                    break;
                }
                EmptyLook(ref inst) => {
                    // Step through the assertion only when it is satisfied
                    // by `flags`; otherwise this path dies here.
                    match inst.look {
                        StartLine if flags.start_line => {
                            ip = inst.goto as InstPtr;
                        }
                        EndLine if flags.end_line => {
                            ip = inst.goto as InstPtr;
                        }
                        StartText if flags.start => {
                            ip = inst.goto as InstPtr;
                        }
                        EndText if flags.end => {
                            ip = inst.goto as InstPtr;
                        }
                        WordBoundaryAscii if flags.word_boundary => {
                            ip = inst.goto as InstPtr;
                        }
                        NotWordBoundaryAscii
                            if flags.not_word_boundary =>
                        {
                            ip = inst.goto as InstPtr;
                        }
                        WordBoundary if flags.word_boundary => {
                            ip = inst.goto as InstPtr;
                        }
                        NotWordBoundary if flags.not_word_boundary => {
                            ip = inst.goto as InstPtr;
                        }
                        // Unsatisfied assertion: dead end.
                        StartLine | EndLine | StartText | EndText
                        | WordBoundaryAscii | NotWordBoundaryAscii
                        | WordBoundary | NotWordBoundary => {
                            break;
                        }
                    }
                }
                Save(ref inst) => {
                    ip = inst.goto as InstPtr;
                }
                Split(ref inst) => {
                    self.cache.stack.push(inst.goto2 as InstPtr);
                    ip = inst.goto1 as InstPtr;
                }
            }
        }
    }
}
/// Returns the state pointer for the NFA set `q`, compiling and caching a
/// new state if necessary. Returns `Some(STATE_DEAD)` for a state that
/// can never match, and `None` if the cache was full and clearing it was
/// judged not worthwhile.
fn cached_state(
    &mut self,
    q: &SparseSet,
    mut state_flags: StateFlags,
    current_state: Option<&mut StatePtr>,
) -> Option<StatePtr> {
    // An empty, non-matching key means this state is dead.
    let key = match self.cached_state_key(q, &mut state_flags) {
        None => return Some(STATE_DEAD),
        Some(v) => v,
    };
    if let Some(si) = self.cache.compiled.get_ptr(&key) {
        return Some(si);
    }
    // Over budget: flush the cache, preserving `current_state` (if any)
    // so the caller's pointer stays valid after the flush.
    if self.approximate_size() > self.prog.dfa_size_limit
        && !self.clear_cache_and_save(current_state)
    {
        return None;
    }
    self.add_state(key)
}
/// Builds the canonical `State` key for `q`: one flags byte followed by
/// delta-encoded varint instruction pointers. `Save`/`Split` instructions
/// are dropped from the key. Returns `None` if the resulting state is
/// empty and non-matching, i.e. dead.
fn cached_state_key(
    &mut self,
    q: &SparseSet,
    state_flags: &mut StateFlags,
) -> Option<State> {
    use prog::Inst::*;
    // Reuse the scratch buffer to avoid an allocation per candidate state.
    let mut insts =
        mem::replace(&mut self.cache.insts_scratch_space, vec![]);
    insts.clear();
    // Slot 0 is reserved for the flags byte, written below.
    insts.push(0);
    let mut prev = 0;
    for &ip in q {
        let ip = usize_to_u32(ip);
        match self.prog[ip as usize] {
            Char(_) | Ranges(_) => unreachable!(),
            Save(_) | Split(_) => {}
            Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
            EmptyLook(_) => {
                state_flags.set_empty();
                push_inst_ptr(&mut insts, &mut prev, ip)
            }
            Match(_) => {
                push_inst_ptr(&mut insts, &mut prev, ip);
                // Instructions after the first match are irrelevant unless
                // we're in reverse mode or matching a set.
                if !self.continue_past_first_match() {
                    break;
                }
            }
        }
    }
    let opt_state = if insts.len() == 1 && !state_flags.is_match() {
        None
    } else {
        let StateFlags(f) = *state_flags;
        insts[0] = f;
        Some(State { data: Arc::from(&*insts) })
    };
    // Hand the buffer back for reuse on the next call.
    self.cache.insts_scratch_space = insts;
    opt_state
}
/// Clears the cache while preserving `current_state` (if given): after a
/// successful flush the pointer is rewritten to the state's new location.
/// Returns false when flushing was judged not worthwhile (see
/// `clear_cache`).
fn clear_cache_and_save(
    &mut self,
    current_state: Option<&mut StatePtr>,
) -> bool {
    if self.cache.compiled.is_empty() {
        // Nothing to clear.
        return true;
    }
    match current_state {
        None => self.clear_cache(),
        Some(si) => {
            // Keep a copy of the state so it can be re-added after the
            // flush; cloning a `State` is cheap (Arc).
            let cur = self.state(*si).clone();
            if !self.clear_cache() {
                return false;
            }
            *si = self.restore_state(cur).unwrap();
            true
        }
    }
}
/// Wipes all compiled states and transitions, re-adding the start state
/// and (if present) the last-match state. Returns false — giving up on
/// the DFA — when the cache has already been flushed repeatedly without
/// enough forward progress through the input.
fn clear_cache(&mut self) -> bool {
    // Thrash heuristic: after three flushes, require at least ~10 bytes
    // of input progress per compiled state since the previous flush.
    let nstates = self.cache.compiled.len();
    if self.cache.flush_count >= 3
        && self.at >= self.last_cache_flush
        && (self.at - self.last_cache_flush) <= 10 * nstates
    {
        return false;
    }
    self.last_cache_flush = self.at;
    self.cache.flush_count += 1;
    // Save the states that must survive the flush.
    let start = self.state(self.start & !STATE_START).clone();
    let last_match = if self.last_match_si <= STATE_MAX {
        Some(self.state(self.last_match_si).clone())
    } else {
        None
    };
    self.cache.reset_size();
    self.cache.trans.clear();
    self.cache.compiled.clear();
    for s in &mut self.cache.start_states {
        *s = STATE_UNKNOWN;
    }
    // Re-add the saved states and rewrite their pointers.
    let start_ptr = self.restore_state(start).unwrap();
    self.start = self.start_ptr(start_ptr);
    if let Some(last_match) = last_match {
        self.last_match_si = self.restore_state(last_match).unwrap();
    }
    true
}
/// Returns the pointer for `state`, compiling it into the cache if it is
/// not already present.
fn restore_state(&mut self, state: State) -> Option<StatePtr> {
    match self.cache.compiled.get_ptr(&state) {
        Some(si) => Some(si),
        None => self.add_state(state),
    }
}
/// Looks up the transition for `(si, b)`, falling back to `exec_byte` on
/// a cache miss (STATE_UNKNOWN). `None` means the DFA must quit; dead
/// states propagate as `Some(STATE_DEAD)`.
fn next_state(
    &mut self,
    qcur: &mut SparseSet,
    qnext: &mut SparseSet,
    si: StatePtr,
    b: Byte,
) -> Option<StatePtr> {
    if si == STATE_DEAD {
        return Some(STATE_DEAD);
    }
    match self.cache.trans.next(si, self.byte_class(b)) {
        STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
        STATE_QUIT => None,
        STATE_DEAD => Some(STATE_DEAD),
        nsi => Some(nsi),
    }
}
/// Returns the start state for the given flag combination, computing and
/// caching it on first use.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state(
    &mut self,
    q: &mut SparseSet,
    empty_flags: EmptyFlags,
    state_flags: StateFlags,
) -> Option<StatePtr> {
    // Pack the seven start-relevant flags into an index into the
    // start-state table.
    let flagi = {
        (((empty_flags.start as u8) << 0)
            | ((empty_flags.end as u8) << 1)
            | ((empty_flags.start_line as u8) << 2)
            | ((empty_flags.end_line as u8) << 3)
            | ((empty_flags.word_boundary as u8) << 4)
            | ((empty_flags.not_word_boundary as u8) << 5)
            | ((state_flags.is_word() as u8) << 6)) as usize
    };
    match self.cache.start_states[flagi] {
        STATE_UNKNOWN => {}
        STATE_DEAD => return Some(STATE_DEAD),
        si => return Some(si),
    }
    // Cache miss: take the epsilon closure of the program's start
    // instruction under these flags and compile it.
    q.clear();
    let start = usize_to_u32(self.prog.start);
    self.follow_epsilons(start, q, empty_flags);
    // `None` for the current state is fine here: no other state pointer
    // needs to survive a potential cache flush.
    let sp = match self.cached_state(q, state_flags, None) {
        None => return None,
        Some(sp) => self.start_ptr(sp),
    };
    self.cache.start_states[flagi] = sp;
    Some(sp)
}
fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
let mut empty_flags = EmptyFlags::default();
let mut state_flags = StateFlags::default();
empty_flags.start = at == 0;
empty_flags.end = text.is_empty();
empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
empty_flags.end_line = text.is_empty();
let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
if is_word_last {
state_flags.set_word();
}
if is_word == is_word_last {
empty_flags.not_word_boundary = true;
} else {
empty_flags.word_boundary = true;
}
(empty_flags, state_flags)
}
fn start_flags_reverse(
&self,
text: &[u8],
at: usize,
) -> (EmptyFlags, StateFlags) {
let mut empty_flags = EmptyFlags::default();
let mut state_flags = StateFlags::default();
empty_flags.start = at == text.len();
empty_flags.end = text.is_empty();
empty_flags.start_line = at == text.len() || text[at] == b'\n';
empty_flags.end_line = text.is_empty();
let is_word_last =
at < text.len() && Byte::byte(text[at]).is_ascii_word();
let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
if is_word_last {
state_flags.set_word();
}
if is_word == is_word_last {
empty_flags.not_word_boundary = true;
} else {
empty_flags.word_boundary = true;
}
(empty_flags, state_flags)
}
/// Returns the compiled state for `si`. Panics if `si` does not refer to
/// a state previously added to this cache.
fn state(&self, si: StatePtr) -> &State {
    let found = self.cache.compiled.get_state(si);
    found.unwrap()
}
/// Compiles `state` into a fresh row of the transition table and records
/// it in the state map, updating the size accounting. Returns `None` if
/// the table has run out of untagged pointer space.
fn add_state(&mut self, state: State) -> Option<StatePtr> {
    let si = match self.cache.trans.add() {
        None => return None,
        Some(si) => si,
    };
    // Programs with Unicode word boundaries can't be handled here for
    // non-ASCII input, so every transition on a byte >= 128 is marked
    // STATE_QUIT up front.
    if self.prog.has_unicode_word_boundary {
        for b in 128..256 {
            let cls = self.byte_class(Byte::byte(b as u8));
            self.cache.trans.set_next(si, cls, STATE_QUIT);
        }
    }
    // Account for the new row, the state's encoding, its two copies in
    // the state map (HashMap key + Vec entry) and the pointer itself.
    self.cache.size += self.cache.trans.state_heap_size()
        + state.data.len()
        + (2 * mem::size_of::<State>())
        + mem::size_of::<StatePtr>();
    self.cache.compiled.insert(state, si);
    debug_assert!(
        self.cache.compiled.len() == self.cache.trans.num_states()
    );
    Some(si)
}
/// Runs the literal-prefix searcher over `text[at..]`, returning the
/// absolute position where a prefix match begins.
fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
    let found = self.prog.prefixes.find(&text[at..]);
    found.map(|(start, _)| at + start)
}
/// Number of transition columns per state: one per byte class plus one
/// sentinel column for EOF.
fn num_byte_classes(&self) -> usize {
    let max_class = self.prog.byte_classes[255] as usize;
    (max_class + 1) + 1
}
/// Maps an input (byte or EOF) to its transition column; EOF always maps
/// to the last column.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn byte_class(&self, b: Byte) -> usize {
    if let Some(b) = b.as_byte() {
        self.u8_class(b)
    } else {
        self.num_byte_classes() - 1
    }
}
/// Maps a real byte to its equivalence-class index.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn u8_class(&self, b: u8) -> usize {
    let classes = &self.prog.byte_classes;
    classes[b as usize] as usize
}
/// True when the simulation must keep running after seeing a match:
/// reverse programs and regex sets both need the full picture.
fn continue_past_first_match(&self) -> bool {
    if self.prog.is_reverse {
        true
    } else {
        self.prog.matches.len() > 1
    }
}
/// True when the literal-prefix accelerator is usable: only in forward
/// mode, with non-empty prefixes, on an unanchored program.
fn has_prefix(&self) -> bool {
    let p = self.prog;
    // De Morgan of the original conjunction of negations.
    !(p.is_reverse || p.prefixes.is_empty() || p.is_anchored_start)
}
/// Tags `si` with STATE_START when prefix acceleration applies, so the
/// search loop knows to invoke the prefix searcher from this state.
fn start_ptr(&self, si: StatePtr) -> StatePtr {
    match self.has_prefix() {
        true => si | STATE_START,
        false => si,
    }
}
/// Approximate total heap usage: cache bookkeeping plus the program.
fn approximate_size(&self) -> usize {
    let prog_size = self.prog.approximate_size();
    self.cache.size + prog_size
}
}
/// A bidirectional mapping between compiled states and their pointers.
#[derive(Debug)]
struct StateMap {
    // State -> pointer lookup (used when compiling a candidate state).
    map: HashMap<State, StatePtr>,
    // Pointer -> state lookup: states in insertion order; a pointer is a
    // row offset, so index = si / num_byte_classes.
    states: Vec<State>,
    num_byte_classes: usize,
}
impl StateMap {
    /// Creates an empty map for states whose transition rows are
    /// `num_byte_classes` wide.
    fn new(num_byte_classes: usize) -> StateMap {
        StateMap {
            map: HashMap::new(),
            states: vec![],
            num_byte_classes: num_byte_classes,
        }
    }

    /// Number of compiled states.
    fn len(&self) -> usize {
        self.states.len()
    }

    /// True when no states have been compiled yet.
    fn is_empty(&self) -> bool {
        self.states.is_empty()
    }

    /// Looks up the pointer for a previously inserted state.
    fn get_ptr(&self, state: &State) -> Option<StatePtr> {
        self.map.get(state).map(|&si| si)
    }

    /// Maps a state pointer back to its state: pointers are row offsets,
    /// so dividing by the row width recovers the insertion index.
    fn get_state(&self, si: StatePtr) -> Option<&State> {
        let index = si as usize / self.num_byte_classes;
        self.states.get(index)
    }

    /// Records a state and its pointer in both directions.
    fn insert(&mut self, state: State, si: StatePtr) {
        self.states.push(state.clone());
        self.map.insert(state, si);
    }

    /// Removes all states.
    fn clear(&mut self) {
        self.map.clear();
        self.states.clear();
    }
}
impl Transitions {
    /// Creates an empty table whose rows are `num_byte_classes` wide.
    fn new(num_byte_classes: usize) -> Transitions {
        Transitions { table: vec![], num_byte_classes: num_byte_classes }
    }

    /// Number of states currently in the table.
    fn num_states(&self) -> usize {
        self.table.len() / self.num_byte_classes
    }

    /// Appends a fresh row of STATE_UNKNOWN transitions and returns its
    /// pointer (the row's starting offset), or `None` if the offset would
    /// collide with the tag bits.
    fn add(&mut self) -> Option<StatePtr> {
        let si = self.table.len();
        if si > STATE_MAX as usize {
            return None;
        }
        self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes));
        Some(usize_to_u32(si))
    }

    /// Drops all transitions (rows).
    fn clear(&mut self) {
        self.table.clear();
    }

    /// Records the transition out of `si` for byte class `cls`.
    fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) {
        self.table[si as usize + cls] = next;
    }

    /// Bounds-checked transition lookup.
    fn next(&self, si: StatePtr, cls: usize) -> StatePtr {
        self.table[si as usize + cls]
    }

    /// Heap bytes consumed by one state's row.
    fn state_heap_size(&self) -> usize {
        self.num_byte_classes * mem::size_of::<StatePtr>()
    }

    /// Unchecked transition lookup.
    ///
    /// Safety: `si` must be a row offset previously returned by `add` and
    /// `cls` must be `< num_byte_classes`; verified only via
    /// `debug_assert!` here.
    unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
        debug_assert!((si as usize) < self.table.len());
        debug_assert!(cls < self.num_byte_classes);
        *self.table.get_unchecked(si as usize + cls)
    }
}
impl StateFlags {
    /// Bit 0: the state contains a match instruction.
    fn is_match(&self) -> bool {
        self.0 & 0b1 != 0
    }

    fn set_match(&mut self) {
        self.0 |= 0b1;
    }

    /// Bit 1: the position before this state holds an ASCII word byte.
    fn is_word(&self) -> bool {
        self.0 & 0b10 != 0
    }

    fn set_word(&mut self) {
        self.0 |= 0b10;
    }

    /// Bit 2: the state contains empty-width assertion instructions.
    fn has_empty(&self) -> bool {
        self.0 & 0b100 != 0
    }

    fn set_empty(&mut self) {
        self.0 |= 0b100;
    }
}
impl Byte {
    /// Wraps a real input byte.
    fn byte(b: u8) -> Self {
        Byte(u16::from(b))
    }

    /// The sentinel value (256) representing end-of-input.
    fn eof() -> Self {
        Byte(256)
    }

    fn is_eof(&self) -> bool {
        self.0 == 256
    }

    /// True for ASCII word bytes: `[0-9A-Za-z_]`. EOF is never a word byte.
    fn is_ascii_word(&self) -> bool {
        match self.as_byte() {
            None => false,
            Some(b) => match b {
                b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z' => true,
                _ => false,
            },
        }
    }

    /// Returns the underlying byte, or `None` for EOF.
    fn as_byte(&self) -> Option<u8> {
        match self.is_eof() {
            true => None,
            false => Some(self.0 as u8),
        }
    }
}
impl fmt::Debug for State {
    /// Renders the decoded flags and instruction pointers rather than the
    /// raw byte encoding.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let insts: Vec<usize> = self.inst_ptrs().collect();
        let mut dbg = f.debug_struct("State");
        dbg.field("flags", &self.flags());
        dbg.field("insts", &insts);
        dbg.finish()
    }
}
impl fmt::Debug for Transitions {
    /// Renders the table as a map from state index to its transition row.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut map = f.debug_map();
        for si in 0..self.num_states() {
            let start = si * self.num_byte_classes;
            let row = &self.table[start..start + self.num_byte_classes];
            map.entry(&si.to_string(), &TransitionsRow(row));
        }
        map.finish()
    }
}
/// Helper for formatting a single row of the transition table.
struct TransitionsRow<'a>(&'a [StatePtr]);
impl<'a> fmt::Debug for TransitionsRow<'a> {
    /// Renders a row as a map from escaped byte to destination state,
    /// omitting uncomputed (STATE_UNKNOWN) entries.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut map = f.debug_map();
        for (b, &si) in self.0.iter().enumerate() {
            match si {
                STATE_UNKNOWN => {}
                STATE_DEAD => {
                    map.entry(&vb(b as usize), &"DEAD");
                }
                other => {
                    map.entry(&vb(b as usize), &other.to_string());
                }
            }
        }
        map.finish()
    }
}
impl fmt::Debug for StateFlags {
    /// Renders the packed flag bits as named booleans.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut dbg = f.debug_struct("StateFlags");
        dbg.field("is_match", &self.is_match());
        dbg.field("is_word", &self.is_word());
        dbg.field("has_empty", &self.has_empty());
        dbg.finish()
    }
}
/// Renders a transition-column index as a readable string: the escaped
/// byte for values <= 255, `"EOF"` for the sentinel EOF column.
fn vb(b: usize) -> String {
    use std::ascii::escape_default;

    if b <= ::std::u8::MAX as usize {
        let escaped: Vec<u8> = escape_default(b as u8).collect();
        String::from_utf8_lossy(&escaped).into_owned()
    } else {
        "EOF".to_owned()
    }
}
/// Narrows `n` to `u32`, panicking on overflow — callers rely on all
/// instruction/state indices fitting in 32 bits.
fn usize_to_u32(n: usize) -> u32 {
    let wide = n as u64;
    if wide > u64::from(::std::u32::MAX) {
        panic!("BUG: {} is too big to fit into u32", n)
    }
    n as u32
}
/// Debug helper: formats a state pointer with its sentinel/tag names.
#[allow(dead_code)]
fn show_state_ptr(si: StatePtr) -> String {
    let mut s = format!("{:?}", si & STATE_MAX);
    let tags: [(bool, &str); 5] = [
        (si == STATE_UNKNOWN, "unknown"),
        (si == STATE_DEAD, "dead"),
        (si == STATE_QUIT, "quit"),
        (si & STATE_START > 0, "start"),
        (si & STATE_MATCH > 0, "match"),
    ];
    for &(applies, label) in tags.iter() {
        if applies {
            s = format!("{} ({})", s, label);
        }
    }
    s
}
/// Writes `n` using zig-zag encoding followed by varint encoding, so
/// small magnitudes of either sign stay short.
fn write_vari32(data: &mut Vec<u8>, n: i32) {
    let zigzag = if n < 0 {
        !((n as u32) << 1)
    } else {
        (n as u32) << 1
    };
    write_varu32(data, zigzag)
}
/// Reads a zig-zag-encoded `i32`, returning the value and the number of
/// bytes consumed.
fn read_vari32(data: &[u8]) -> (i32, usize) {
    let (zigzag, nread) = read_varu32(data);
    let n = if zigzag & 1 != 0 {
        !((zigzag >> 1) as i32)
    } else {
        (zigzag >> 1) as i32
    };
    (n, nread)
}
/// Writes `n` as a little-endian base-128 varint: the high bit of each
/// byte marks a continuation.
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
    loop {
        if n < 0b1000_0000 {
            data.push(n as u8);
            return;
        }
        data.push((n as u8) | 0b1000_0000);
        n >>= 7;
    }
}
/// Reads a base-128 varint, returning the value and the number of bytes
/// consumed; a truncated encoding yields `(0, 0)`.
fn read_varu32(data: &[u8]) -> (u32, usize) {
    let (mut n, mut shift): (u32, u32) = (0, 0);
    let mut i = 0;
    while i < data.len() {
        let b = data[i];
        i += 1;
        if b < 0b1000_0000 {
            return (n | ((b as u32) << shift), i);
        }
        n |= ((b as u32) & 0b0111_1111) << shift;
        shift += 7;
    }
    (0, 0)
}
#[cfg(test)]
mod tests {
    extern crate rand;

    use super::{
        push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
        State, StateFlags,
    };
    use quickcheck::{quickcheck, QuickCheck, StdGen};
    use std::sync::Arc;

    // Round-trips an arbitrary state encoding: a flags byte followed by
    // delta-encoded instruction pointers must decode back exactly.
    #[test]
    fn prop_state_encode_decode() {
        fn p(ips: Vec<u32>, flags: u8) -> bool {
            let mut data = vec![flags];
            let mut prev = 0;
            for &ip in ips.iter() {
                push_inst_ptr(&mut data, &mut prev, ip);
            }
            let state = State { data: Arc::from(&data[..]) };
            let expected: Vec<usize> =
                ips.into_iter().map(|ip| ip as usize).collect();
            let got: Vec<usize> = state.inst_ptrs().collect();
            expected == got && state.flags() == StateFlags(flags)
        }
        QuickCheck::new()
            .gen(StdGen::new(self::rand::thread_rng(), 10_000))
            .quickcheck(p as fn(Vec<u32>, u8) -> bool);
    }

    // Round-trips the unsigned varint encoding for arbitrary u32 values.
    #[test]
    fn prop_read_write_u32() {
        fn p(n: u32) -> bool {
            let mut buf = vec![];
            write_varu32(&mut buf, n);
            let (got, nread) = read_varu32(&buf);
            nread == buf.len() && got == n
        }
        quickcheck(p as fn(u32) -> bool);
    }

    // Round-trips the signed (zig-zag) varint encoding for arbitrary i32s.
    #[test]
    fn prop_read_write_i32() {
        fn p(n: i32) -> bool {
            let mut buf = vec![];
            write_vari32(&mut buf, n);
            let (got, nread) = read_vari32(&buf);
            nread == buf.len() && got == n
        }
        quickcheck(p as fn(i32) -> bool);
    }
}