use std::cmp::{self, Ordering};
use syntax;
use Error;
use backtrack::{Backtrack, BackMachine};
use char::Char;
use compile::Compiler;
use nfa::{Nfa, NfaThreads};
use pool::Pool;
use prefix::Prefix;
use re::CaptureIdxs;
const NUM_PREFIX_LIMIT: usize = 30;
const PREFIX_LENGTH_LIMIT: usize = 15;
pub type InstIdx = usize;
#[derive(Clone, Debug)]
pub enum Inst {
Match,
Save(usize),
Jump(InstIdx),
Split(InstIdx, InstIdx),
EmptyLook(LookInst),
Char(char),
Ranges(CharRanges),
}
#[derive(Clone, Debug)]
pub struct CharRanges {
pub ranges: Vec<(char, char)>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum LookInst {
StartLine,
EndLine,
StartText,
EndText,
WordBoundary,
NotWordBoundary,
}
impl CharRanges {
pub fn any() -> CharRanges {
CharRanges { ranges: vec![('\x00', '\u{10ffff}')] }
}
pub fn any_nonl() -> CharRanges {
CharRanges { ranges: vec![('\x00', '\x09'), ('\x0B', '\u{10ffff}')] }
}
pub fn from_class(cls: syntax::CharClass) -> CharRanges {
CharRanges {
ranges: cls.into_iter().map(|r| (r.start, r.end)).collect(),
}
}
#[inline(always)] pub fn matches(&self, c: Char) -> bool {
for r in self.ranges.iter().take(4) {
if c < r.0 {
return false;
}
if c <= r.1 {
return true;
}
}
self.ranges.binary_search_by(|r| {
if r.1 < c {
Ordering::Less
} else if r.0 > c {
Ordering::Greater
} else {
Ordering::Equal
}
}).is_ok()
}
}
impl LookInst {
pub fn matches(&self, c1: Char, c2: Char) -> bool {
use self::LookInst::*;
match *self {
StartLine => c1.is_none() || c1 == '\n',
EndLine => c2.is_none() || c2 == '\n',
StartText => c1.is_none(),
EndText => c2.is_none(),
ref wbty => {
let (w1, w2) = (c1.is_word_char(), c2.is_word_char());
(*wbty == WordBoundary && w1 ^ w2)
|| (*wbty == NotWordBoundary && !(w1 ^ w2))
}
}
}
}
#[doc(hidden)]
#[derive(Clone, Copy, Debug)]
pub enum MatchEngine {
Backtrack,
Nfa,
Literals,
}
#[derive(Debug)]
pub struct Program {
pub original: String,
pub insts: Vec<Inst>,
pub cap_names: Vec<Option<String>>,
pub prefixes: Prefix,
pub prefixes_complete: bool,
pub anchored_begin: bool,
pub anchored_end: bool,
pub engine: Option<MatchEngine>,
pub nfa_threads: Pool<NfaThreads>,
pub backtrack: Pool<BackMachine>,
}
impl Program {
pub fn new(
engine: Option<MatchEngine>,
size_limit: usize,
re: &str,
) -> Result<Program, Error> {
let expr = try!(syntax::Expr::parse(re));
let (insts, cap_names) = try!(Compiler::new(size_limit).compile(expr));
let (insts_len, ncaps) = (insts.len(), num_captures(&insts));
let create_threads = move || NfaThreads::new(insts_len, ncaps);
let create_backtrack = move || BackMachine::new();
let mut prog = Program {
original: re.into(),
insts: insts,
cap_names: cap_names,
prefixes: Prefix::Empty,
prefixes_complete: false,
anchored_begin: false,
anchored_end: false,
engine: engine,
nfa_threads: Pool::new(Box::new(create_threads)),
backtrack: Pool::new(Box::new(create_backtrack)),
};
prog.find_prefixes();
prog.anchored_begin = match prog.insts[1] {
Inst::EmptyLook(LookInst::StartText) => true,
_ => false,
};
prog.anchored_end = match prog.insts[prog.insts.len() - 3] {
Inst::EmptyLook(LookInst::EndText) => true,
_ => false,
};
Ok(prog)
}
pub fn exec(
&self,
caps: &mut CaptureIdxs,
text: &str,
start: usize,
) -> bool {
match self.choose_engine(caps.len(), text) {
MatchEngine::Backtrack => Backtrack::exec(self, caps, text, start),
MatchEngine::Nfa => Nfa::exec(self, caps, text, start),
MatchEngine::Literals => {
match self.prefixes.find(&text[start..]) {
None => false,
Some((s, e)) => {
if caps.len() == 2 {
caps[0] = Some(start + s);
caps[1] = Some(start + e);
}
true
}
}
}
}
}
fn choose_engine(&self, cap_len: usize, text: &str) -> MatchEngine {
self.engine.unwrap_or_else(|| {
if cap_len <= 2
&& self.prefixes_complete
&& self.prefixes.preserves_priority() {
MatchEngine::Literals
} else if Backtrack::should_exec(self, text) {
MatchEngine::Backtrack
} else {
MatchEngine::Nfa
}
})
}
pub fn num_captures(&self) -> usize {
num_captures(&self.insts)
}
pub fn alloc_captures(&self) -> Vec<Option<usize>> {
vec![None; 2 * self.num_captures()]
}
pub fn find_prefixes(&mut self) {
let (ps, complete) = self.literals(self.skip(1));
if ps.len() > 0 {
self.prefixes = Prefix::new(ps);
self.prefixes_complete = complete;
return;
}
if let Some((pfxs, complete)) = self.alternate_prefixes() {
self.prefixes = Prefix::new(pfxs);
self.prefixes_complete = complete;
}
}
fn alternate_prefixes(&self) -> Option<(Vec<String>, bool)> {
let mut prefixes = vec![];
let mut pcomplete = true;
let mut stack = vec![self.skip(1)];
while let Some(mut pc) = stack.pop() {
pc = self.skip(pc);
match &self.insts[pc] {
&Inst::Split(x, y) => { stack.push(y); stack.push(x); }
_ => {
let (alt_prefixes, complete) = self.literals(pc);
if alt_prefixes.is_empty() {
return None;
}
if prefixes.len() + alt_prefixes.len() > NUM_PREFIX_LIMIT {
return None;
}
pcomplete = pcomplete && complete;
prefixes.extend(alt_prefixes);
}
}
}
if prefixes.is_empty() {
None
} else {
Some((prefixes, pcomplete))
}
}
fn literals(&self, mut pc: usize) -> (Vec<String>, bool) {
use self::Inst::*;
let mut complete = true;
let mut alts = vec![String::new()];
while pc < self.insts.len() {
let inst = &self.insts[pc];
if alts[0].len() > PREFIX_LENGTH_LIMIT {
complete = false;
break;
}
match *inst {
Save(_) => { pc += 1; continue }
Jump(pc2) => pc = pc2,
Char(c) => {
for alt in &mut alts {
alt.push(c);
}
pc += 1;
}
Ranges(CharRanges { ref ranges }) => {
let nchars = num_chars_in_ranges(ranges);
if alts.len() * nchars > NUM_PREFIX_LIMIT {
complete = false;
break;
}
let orig = alts;
alts = Vec::with_capacity(orig.len());
for &(s, e) in ranges {
for c in (s as u32)..(e as u32 + 1){
for alt in &orig {
let mut alt = alt.clone();
alt.push(::std::char::from_u32(c).unwrap());
alts.push(alt);
}
}
}
pc += 1;
}
_ => { complete = self.leads_to_match(pc); break }
}
}
if alts[0].len() == 0 {
(vec![], false)
} else {
(alts, complete)
}
}
fn leads_to_match(&self, pc: usize) -> bool {
match self.insts[self.skip(pc)] {
Inst::Match => true,
_ => false,
}
}
fn skip(&self, mut pc: usize) -> usize {
loop {
match self.insts[pc] {
Inst::Save(_) => pc += 1,
Inst::Jump(pc2) => pc = pc2,
_ => return pc,
}
}
}
}
impl Clone for Program {
fn clone(&self) -> Program {
let (insts_len, ncaps) = (self.insts.len(), self.num_captures());
let create_threads = move || NfaThreads::new(insts_len, ncaps);
let create_backtrack = move || BackMachine::new();
Program {
original: self.original.clone(),
insts: self.insts.clone(),
cap_names: self.cap_names.clone(),
prefixes: self.prefixes.clone(),
prefixes_complete: self.prefixes_complete,
anchored_begin: self.anchored_begin,
anchored_end: self.anchored_end,
engine: self.engine,
nfa_threads: Pool::new(Box::new(create_threads)),
backtrack: Pool::new(Box::new(create_backtrack)),
}
}
}
fn num_captures(insts: &[Inst]) -> usize {
let mut n = 0;
for inst in insts {
match *inst {
Inst::Save(c) => n = cmp::max(n, c+1),
_ => {}
}
}
n / 2
}
fn num_chars_in_ranges(ranges: &[(char, char)]) -> usize {
ranges.iter()
.map(|&(s, e)| 1 + (e as u32) - (s as u32))
.fold(0, |acc, len| acc + len) as usize
}
#[cfg(test)]
mod tests {
use super::Program;
macro_rules! prog {
($re:expr) => { Program::new(None, 1 << 30, $re).unwrap() }
}
macro_rules! prefixes {
($re:expr) => {{
let p = prog!($re);
assert!(!p.prefixes_complete);
p.prefixes.prefixes()
}}
}
macro_rules! prefixes_complete {
($re:expr) => {{
let p = prog!($re);
assert!(p.prefixes_complete);
p.prefixes.prefixes()
}}
}
#[test]
fn single() {
assert_eq!(prefixes_complete!("a"), vec!["a"]);
assert_eq!(prefixes_complete!("[a]"), vec!["a"]);
assert_eq!(prefixes!("a+"), vec!["a"]);
assert_eq!(prefixes!("(?:a)+"), vec!["a"]);
assert_eq!(prefixes!("(a)+"), vec!["a"]);
}
#[test]
fn single_alt() {
assert_eq!(prefixes_complete!("a|b"), vec!["a", "b"]);
assert_eq!(prefixes_complete!("b|a"), vec!["b", "a"]);
assert_eq!(prefixes_complete!("[a]|[b]"), vec!["a", "b"]);
assert_eq!(prefixes!("a+|b"), vec!["a", "b"]);
assert_eq!(prefixes!("a|b+"), vec!["a", "b"]);
assert_eq!(prefixes!("(?:a+)|b"), vec!["a", "b"]);
assert_eq!(prefixes!("(a+)|b"), vec!["a", "b"]);
}
#[test]
fn many() {
assert_eq!(prefixes_complete!("abcdef"), vec!["abcdef"]);
assert_eq!(prefixes!("abcdef+"), vec!["abcdef"]);
assert_eq!(prefixes!("(?:abcdef)+"), vec!["abcdef"]);
assert_eq!(prefixes!("(abcdef)+"), vec!["abcdef"]);
}
#[test]
fn many_alt() {
assert_eq!(prefixes_complete!("abc|def"), vec!["abc", "def"]);
assert_eq!(prefixes_complete!("def|abc"), vec!["def", "abc"]);
assert_eq!(prefixes!("abc+|def"), vec!["abc", "def"]);
assert_eq!(prefixes!("abc|def+"), vec!["abc", "def"]);
assert_eq!(prefixes!("(?:abc)+|def"), vec!["abc", "def"]);
assert_eq!(prefixes!("(abc)+|def"), vec!["abc", "def"]);
}
#[test]
fn class() {
assert_eq!(prefixes_complete!("[0-9]"), vec![
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
]);
assert_eq!(prefixes!("[0-9]+"), vec![
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
]);
}
#[test]
fn preceding_alt() {
assert_eq!(prefixes!("(?:a|b).+"), vec!["a", "b"]);
assert_eq!(prefixes!("(a|b).+"), vec!["a", "b"]);
}
#[test]
fn nested_alt() {
assert_eq!(prefixes_complete!("(a|b|c|d)"),
vec!["a", "b", "c", "d"]);
assert_eq!(prefixes_complete!("((a|b)|(c|d))"),
vec!["a", "b", "c", "d"]);
}
}