use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::Arc;
use mempool::Pool;
use syntax::{Expr, ExprBuilder, Literals};
use backtrack;
use compile::Compiler;
use dfa;
use error::Error;
use input::{ByteInput, CharInput};
use literals::LiteralSearcher;
use pikevm;
use prog::Program;
use re_bytes;
use re_trait::{RegularExpression, Slot};
use re_unicode;
use set;
#[derive(Debug)]
pub struct Exec {
ro: Arc<ExecReadOnly>,
cache: Pool<ProgramCache>,
}
#[derive(Debug)]
pub struct ExecNoSync<'c> {
ro: &'c Arc<ExecReadOnly>,
cache: &'c ProgramCache,
}
pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
#[derive(Debug)]
struct ExecReadOnly {
res: Vec<String>,
nfa: Program,
dfa: Program,
dfa_reverse: Program,
suffixes: LiteralSearcher,
match_type: MatchType,
}
pub struct ExecBuilder {
res: Vec<String>,
match_type: Option<MatchType>,
size_limit: usize,
bytes: bool,
only_utf8: bool,
}
impl ExecBuilder {
pub fn new(re: &str) -> Self {
Self::new_many(&[re])
}
pub fn new_many<I, S>(res: I) -> Self
where S: AsRef<str>, I: IntoIterator<Item=S> {
ExecBuilder {
res: res.into_iter().map(|s| s.as_ref().to_owned()).collect(),
match_type: None,
size_limit: 10 * (1 << 20),
bytes: false,
only_utf8: true,
}
}
pub fn automatic(mut self) -> Self {
self.match_type = None;
self
}
pub fn nfa(mut self) -> Self {
self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
self
}
pub fn bounded_backtracking(mut self) -> Self {
self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
self
}
pub fn size_limit(mut self, bytes: usize) -> Self {
self.size_limit = bytes;
self
}
pub fn bytes(mut self, yes: bool) -> Self {
self.bytes = yes;
self
}
pub fn only_utf8(mut self, yes: bool) -> Self {
self.only_utf8 = yes;
self
}
pub fn build(self) -> Result<Exec, Error> {
if self.res.is_empty() {
let ro = Arc::new(ExecReadOnly {
res: vec![],
nfa: Program::new(),
dfa: Program::new(),
dfa_reverse: Program::new(),
suffixes: LiteralSearcher::empty(),
match_type: MatchType::Nothing,
});
let ro_ = ro.clone();
return Ok(Exec { ro: ro, cache: create_pool(ro_) });
}
let parsed = try!(Parsed::parse(&self.res, self.only_utf8));
let mut nfa = try!(
Compiler::new()
.size_limit(self.size_limit)
.bytes(self.bytes)
.only_utf8(self.only_utf8)
.compile(&parsed.exprs));
let mut dfa = try!(
Compiler::new()
.size_limit(self.size_limit)
.dfa(true)
.only_utf8(self.only_utf8)
.compile(&parsed.exprs));
let dfa_reverse = try!(
Compiler::new()
.size_limit(self.size_limit)
.dfa(true)
.only_utf8(self.only_utf8)
.reverse(true)
.compile(&parsed.exprs));
let prefixes = parsed.prefixes.unambiguous_prefixes();
let suffixes = parsed.suffixes.unambiguous_suffixes();
nfa.prefixes = LiteralSearcher::prefixes(prefixes);
dfa.prefixes = nfa.prefixes.clone();
let mut ro = ExecReadOnly {
res: self.res,
nfa: nfa,
dfa: dfa,
dfa_reverse: dfa_reverse,
suffixes: LiteralSearcher::suffixes(suffixes),
match_type: MatchType::Nothing,
};
ro.match_type = ro.choose_match_type(self.match_type);
let ro = Arc::new(ro);
let ro_ = ro.clone();
Ok(Exec { ro: ro, cache: create_pool(ro_) })
}
}
impl<'c> RegularExpression for ExecNoSyncStr<'c> {
type Text = str;
fn slots_len(&self) -> usize { self.0.slots_len() }
fn next_after_empty(&self, text: &str, i: usize) -> usize {
i + text[i..].chars().next().unwrap().len_utf8()
}
#[inline(always)] fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
self.0.shortest_match_at(text.as_bytes(), start)
}
#[inline(always)] fn is_match_at(&self, text: &str, start: usize) -> bool {
self.0.is_match_at(text.as_bytes(), start)
}
#[inline(always)] fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
self.0.find_at(text.as_bytes(), start)
}
#[inline(always)] fn captures_at(
&self,
slots: &mut [Slot],
text: &str,
start: usize,
) -> Option<(usize, usize)> {
self.0.captures_at(slots, text.as_bytes(), start)
}
}
impl<'c> RegularExpression for ExecNoSync<'c> {
type Text = [u8];
fn slots_len(&self) -> usize {
self.ro.nfa.captures.len() * 2
}
fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
i + 1
}
#[inline(always)] fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
if !self.is_anchor_end_match(text) {
return None;
}
match self.ro.match_type {
MatchType::Literal(ty) => {
self.exec_literals(ty, text, start).map(|(_, e)| e)
}
MatchType::Dfa | MatchType::DfaMany => {
match dfa::Fsm::forward(
&self.ro.dfa,
&self.cache,
true,
text,
start,
) {
dfa::Result::Match(e) => Some(start + e),
dfa::Result::NoMatch => None,
dfa::Result::Quit => {
return self.shortest_match_nfa(
MatchNfaType::Auto, text, start);
}
}
}
MatchType::Nfa(ty) => self.shortest_match_nfa(ty, text, start),
MatchType::Nothing => None,
}
}
#[inline(always)] fn is_match_at(&self, text: &[u8], start: usize) -> bool {
use self::MatchType::*;
if !self.is_anchor_end_match(text) {
return false;
}
match self.ro.match_type {
Literal(_) | Dfa | DfaMany | Nothing => {
self.shortest_match_at(text, start).is_some()
}
Nfa(ty) => {
self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
}
}
}
#[inline(always)] fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
if !self.is_anchor_end_match(text) {
return None;
}
match self.ro.match_type {
MatchType::Literal(ty) => {
self.exec_literals(ty, text, start)
}
MatchType::Dfa => {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, e)) => Some((s, e)),
dfa::Result::NoMatch => None,
dfa::Result::Quit => {
self.find_nfa(MatchNfaType::Auto, text, start)
}
}
}
MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
MatchType::Nothing => None,
MatchType::DfaMany => {
unreachable!("BUG: RegexSet cannot be used with find")
}
}
}
fn captures_at(
&self,
slots: &mut [Slot],
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
if !self.is_anchor_end_match(text) {
return None;
}
match self.ro.match_type {
MatchType::Literal(ty) => {
self.exec_literals(ty, text, start).and_then(|(s, _)| {
self.captures_nfa(MatchNfaType::Auto, slots, text, s)
})
}
MatchType::Dfa => {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, _)) => {
self.captures_nfa(
MatchNfaType::Auto, slots, text, s)
}
dfa::Result::NoMatch => None,
dfa::Result::Quit => {
self.captures_nfa(
MatchNfaType::Auto, slots, text, start)
}
}
}
MatchType::Nfa(ty) => {
self.captures_nfa(ty, slots, text, start)
}
MatchType::Nothing => None,
MatchType::DfaMany => {
unreachable!("BUG: RegexSet cannot be used with captures")
}
}
}
}
impl<'c> ExecNoSync<'c> {
pub fn many_matches_at(
&self,
matches: &mut [bool],
text: &[u8],
start: usize,
) -> bool {
use self::MatchType::*;
if !self.is_anchor_end_match(text) {
return false;
}
match self.ro.match_type {
Literal(ty) => {
debug_assert!(matches.len() == 1);
matches[0] = self.exec_literals(ty, text, start).is_some();
matches[0]
}
Dfa | DfaMany => {
match dfa::Fsm::forward_many(
&self.ro.dfa,
&self.cache,
matches,
text,
start,
) {
dfa::Result::Match(_) => true,
dfa::Result::NoMatch => false,
dfa::Result::Quit => {
self.exec_nfa(
MatchNfaType::Auto,
matches,
&mut [],
false,
text,
start)
}
}
}
Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
Nothing => false,
}
}
fn shortest_match_nfa(
&self,
ty: MatchNfaType,
text: &[u8],
start: usize,
) -> Option<usize> {
let mut slots = [None, None];
if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
slots[1]
} else {
None
}
}
#[inline(always)] fn find_dfa_forward(
&self,
text: &[u8],
start: usize,
) -> dfa::Result<(usize, usize)> {
use dfa::Result::*;
let end = match dfa::Fsm::forward(
&self.ro.dfa,
&self.cache,
false,
text,
start,
) {
NoMatch => return NoMatch,
Quit => return Quit,
Match(end) if start == end => return Match((start, start)),
Match(end) => end,
};
match dfa::Fsm::reverse(
&self.ro.dfa_reverse,
&self.cache,
false,
&text[start..],
end - start,
) {
Match(s) => Match((start + s, end)),
NoMatch => NoMatch,
Quit => Quit,
}
}
fn find_nfa(
&self,
ty: MatchNfaType,
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
let mut slots = [None, None];
if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
match (slots[0], slots[1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
}
} else {
None
}
}
fn captures_nfa(
&self,
ty: MatchNfaType,
slots: &mut [Slot],
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
match (slots[0], slots[1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
}
} else {
None
}
}
#[inline(always)] fn exec_literals(
&self,
ty: MatchLiteralType,
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
use self::MatchLiteralType::*;
match ty {
Unanchored => {
let lits = &self.ro.nfa.prefixes;
lits.find(&text[start..])
.map(|(s, e)| (start + s, start + e))
}
AnchoredStart => {
let lits = &self.ro.nfa.prefixes;
lits.find_start(&text[start..])
.map(|(s, e)| (start + s, start + e))
}
AnchoredEnd => self.ro.suffixes.find_end(&text),
}
}
fn exec_nfa(
&self,
mut ty: MatchNfaType,
matches: &mut [bool],
slots: &mut [Slot],
quit_after_match: bool,
text: &[u8],
start: usize,
) -> bool {
use self::MatchNfaType::*;
if let Auto = ty {
if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
ty = Backtrack;
} else {
ty = PikeVM;
}
}
match ty {
Auto => unreachable!(),
Backtrack => self.exec_backtrack(matches, slots, text, start),
PikeVM => {
self.exec_pikevm(
matches, slots, quit_after_match, text, start)
}
}
}
fn exec_pikevm(
&self,
matches: &mut [bool],
slots: &mut [Slot],
quit_after_match: bool,
text: &[u8],
start: usize,
) -> bool {
if self.ro.nfa.uses_bytes() {
pikevm::Fsm::exec(
&self.ro.nfa,
&self.cache,
matches,
slots,
quit_after_match,
ByteInput::new(text),
start)
} else {
pikevm::Fsm::exec(
&self.ro.nfa,
&self.cache,
matches,
slots,
quit_after_match,
CharInput::new(text),
start)
}
}
fn exec_backtrack(
&self,
matches: &mut [bool],
slots: &mut [Slot],
text: &[u8],
start: usize,
) -> bool {
if self.ro.nfa.uses_bytes() {
backtrack::Bounded::exec(
&self.ro.nfa,
&self.cache,
matches,
slots,
ByteInput::new(text),
start)
} else {
backtrack::Bounded::exec(
&self.ro.nfa,
&self.cache,
matches,
slots,
CharInput::new(text),
start)
}
}
#[inline(always)] fn is_anchor_end_match(&self, text: &[u8]) -> bool {
if text.len() > (1<<20) && self.ro.nfa.is_anchored_end {
let lcs = self.ro.suffixes.lcs();
if lcs.len() >= 1 && !lcs.is_suffix(text) {
return false;
}
}
true
}
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
&self.ro.nfa.capture_name_idx
}
}
impl<'c> ExecNoSyncStr<'c> {
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
self.0.capture_name_idx()
}
}
impl Exec {
#[inline(always)] pub fn searcher(&self) -> ExecNoSync {
ExecNoSync {
ro: &self.ro, cache: self.cache.get(),
}
}
#[inline(always)] pub fn searcher_str(&self) -> ExecNoSyncStr {
ExecNoSyncStr(self.searcher())
}
pub fn into_regex(self) -> re_unicode::Regex {
re_unicode::Regex::from(self)
}
pub fn into_regex_set(self) -> set::RegexSet {
set::RegexSet::from(self)
}
pub fn into_byte_regex(self) -> re_bytes::Regex {
re_bytes::Regex::from(self)
}
pub fn into_byte_regex_set(self) -> re_bytes::RegexSet {
re_bytes::RegexSet::from(self)
}
pub fn regex_strings(&self) -> &[String] {
&self.ro.res
}
pub fn capture_names(&self) -> &[Option<String>] {
&self.ro.nfa.captures
}
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
&self.ro.nfa.capture_name_idx
}
}
impl Clone for Exec {
fn clone(&self) -> Exec {
Exec {
ro: self.ro.clone(),
cache: create_pool(self.ro.clone()),
}
}
}
impl ExecReadOnly {
fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
use self::MatchType::*;
if let Some(Nfa(_)) = hint {
return hint.unwrap();
}
if self.nfa.insts.is_empty() {
return Nothing;
}
if self.res.len() == 1 {
if self.nfa.prefixes.complete() {
return if self.nfa.is_anchored_start {
Literal(MatchLiteralType::AnchoredStart)
} else {
Literal(MatchLiteralType::Unanchored)
};
}
if self.suffixes.complete() {
return if self.nfa.is_anchored_end {
Literal(MatchLiteralType::AnchoredEnd)
} else {
Literal(MatchLiteralType::Unanchored)
};
}
}
if dfa::can_exec(&self.dfa) {
if self.res.len() >= 2 {
return DfaMany;
}
return Dfa;
}
Nfa(MatchNfaType::Auto)
}
}
#[derive(Clone, Copy, Debug)]
enum MatchType {
Literal(MatchLiteralType),
Dfa,
DfaMany,
Nfa(MatchNfaType),
Nothing,
}
#[derive(Clone, Copy, Debug)]
enum MatchLiteralType {
Unanchored,
AnchoredStart,
AnchoredEnd,
}
#[derive(Clone, Copy, Debug)]
enum MatchNfaType {
Auto,
Backtrack,
PikeVM,
}
pub type ProgramCache = RefCell<ProgramCacheInner>;
#[derive(Clone, Debug)]
pub struct ProgramCacheInner {
pub pikevm: pikevm::Cache,
pub backtrack: backtrack::Cache,
pub dfa: dfa::Cache,
pub dfa_reverse: dfa::Cache,
}
fn create_pool(ro: Arc<ExecReadOnly>) -> Pool<ProgramCache> {
Pool::new(Box::new(move || RefCell::new(ProgramCacheInner::new(&ro))))
}
impl ProgramCacheInner {
fn new(ro: &ExecReadOnly) -> Self {
ProgramCacheInner {
pikevm: pikevm::Cache::new(&ro.nfa),
backtrack: backtrack::Cache::new(&ro.nfa),
dfa: dfa::Cache::new(&ro.dfa),
dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
}
}
}
struct Parsed {
exprs: Vec<Expr>,
prefixes: Literals,
suffixes: Literals,
}
impl Parsed {
fn parse(res: &[String], only_utf8: bool) -> Result<Parsed, Error> {
let mut exprs = Vec::with_capacity(res.len());
let mut prefixes = Some(Literals::empty());
let mut suffixes = Some(Literals::empty());
for re in res {
let parser =
ExprBuilder::new()
.allow_bytes(!only_utf8)
.unicode(only_utf8);
let expr = try!(parser.parse(re));
prefixes = prefixes.and_then(|mut prefixes| {
if !prefixes.union_prefixes(&expr) {
None
} else {
Some(prefixes)
}
});
suffixes = suffixes.and_then(|mut suffixes| {
if !suffixes.union_suffixes(&expr) {
None
} else {
Some(suffixes)
}
});
exprs.push(expr);
}
Ok(Parsed {
exprs: exprs,
prefixes: prefixes.unwrap_or(Literals::empty()),
suffixes: suffixes.unwrap_or(Literals::empty()),
})
}
}