regex 0.1.43

An implementation of regular expressions for Rust.
// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::cmp::{self, Ordering};

use syntax;

use Error;
use backtrack::{Backtrack, BackMachine};
use char::Char;
use compile::Compiler;
use nfa::{Nfa, NfaThreads};
use pool::Pool;
use prefix::Prefix;
use re::CaptureIdxs;

const NUM_PREFIX_LIMIT: usize = 30;
const PREFIX_LENGTH_LIMIT: usize = 15;

pub type InstIdx = usize;

/// An instruction, the underlying unit of a compiled regular expression
#[derive(Clone, Debug)]
pub enum Inst {
    /// A match has occurred.
    /// This is always the last instruction and only occurs in a single spot.
    /// We could special case this in the code, but it is much clearer to
    /// handle it as a proper instruction.
    Match,
    /// Save the current location in the input into the given capture location.
    Save(usize),
    /// Jump to the instruction given.
    Jump(InstIdx),
    /// Match either instruction, preferring the first.
    Split(InstIdx, InstIdx),
    /// A zero-width instruction. When this instruction matches, the input
    /// is not advanced.
    EmptyLook(LookInst),
    /// Match a single possibly case insensitive character.
    Char(char),
    /// Match one or more possibly case insensitive character ranges.
    Ranges(CharRanges),
}

/// A multi-range character class instruction.
#[derive(Clone, Debug)]
pub struct CharRanges {
    /// Sorted sequence of non-overlapping ranges.
    pub ranges: Vec<(char, char)>,
}

/// The set of zero-width match instructions.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum LookInst {
    /// Start of line or input.
    StartLine,
    /// End of line or input.
    EndLine,
    /// Start of input.
    StartText,
    /// End of input.
    EndText,
    /// Word character on one side and non-word character on other.
    WordBoundary,
    /// Word character on both sides or non-word character on both sides.
    NotWordBoundary,
}

impl CharRanges {
    /// Emits a range specifically for the `(?s).` expression.
    pub fn any() -> CharRanges {
        CharRanges { ranges: vec![('\x00', '\u{10ffff}')] }
    }

    /// Emits a range specifically for the `.` expression.
    pub fn any_nonl() -> CharRanges {
        CharRanges { ranges: vec![('\x00', '\x09'), ('\x0B', '\u{10ffff}')] }
    }

    /// Emits a range from the AST character class.
    pub fn from_class(cls: syntax::CharClass) -> CharRanges {
        CharRanges {
            ranges: cls.into_iter().map(|r| (r.start, r.end)).collect(),
        }
    }

    /// Tests whether the given input character matches this instruction.
    #[inline(always)] // About ~5-15% more throughput then `#[inline]`
    pub fn matches(&self, c: Char) -> bool {
        // This speeds up the `match_class_unicode` benchmark by checking
        // some common cases quickly without binary search. e.g., Matching
        // a Unicode class on predominantly ASCII text.
        for r in self.ranges.iter().take(4) {
            if c < r.0 {
                return false;
            }
            if c <= r.1 {
                return true;
            }
        }
        self.ranges.binary_search_by(|r| {
            if r.1 < c {
                Ordering::Less
            } else if r.0 > c {
                Ordering::Greater
            } else {
                Ordering::Equal
            }
        }).is_ok()
    }
}

impl LookInst {
    /// Tests whether the pair of characters matches this zero-width
    /// instruction.
    pub fn matches(&self, c1: Char, c2: Char) -> bool {
        use self::LookInst::*;
        match *self {
            StartLine => c1.is_none() || c1 == '\n',
            EndLine => c2.is_none() || c2 == '\n',
            StartText => c1.is_none(),
            EndText => c2.is_none(),
            ref wbty => {
                let (w1, w2) = (c1.is_word_char(), c2.is_word_char());
                (*wbty == WordBoundary && w1 ^ w2)
                || (*wbty == NotWordBoundary && !(w1 ^ w2))
            }
        }
    }
}

/// The matching engines offered by this regex implementation.
///
/// N.B. This is exported for use in testing.
#[doc(hidden)]
#[derive(Clone, Copy, Debug)]
pub enum MatchEngine {
    /// A bounded backtracking implementation. About twice as fast as the
    /// NFA, but can only work on small regexes and small input.
    Backtrack,
    /// A full NFA simulation. Can always be employed but almost always the
    /// slowest choice.
    Nfa,
    /// If the entire regex is a literal and no capture groups have been
    /// requested, then we can degrade to a simple substring match.
    Literals,
}

/// Program represents a compiled regular expression. Once an expression is
/// compiled, its representation is immutable and will never change.
/// (Well, almost. In fact, the matching engines cache state that can be
/// reused on subsequent searches. But this is interior mutability that
/// shouldn't be observable by the caller.)
#[derive(Debug)]
pub struct Program {
    /// The original regular expression string.
    pub original: String,
    /// A sequence of instructions.
    pub insts: Vec<Inst>,
    /// The sequence of capture group names. There is an entry for each capture
    /// group index and a name exists only if the capture group is named.
    pub cap_names: Vec<Option<String>>,
    /// If the regular expression requires a literal prefix in order to have a
    /// match, that prefix is stored here as a DFA.
    pub prefixes: Prefix,
    /// True iff matching any literal prefix indicates a match.
    pub prefixes_complete: bool,
    /// True iff program is anchored at the beginning.
    pub anchored_begin: bool,
    /// True iff program is anchored at the end.
    pub anchored_end: bool,
    /// The type of matching engine to use.
    /// When `None` (the default), pick an engine automatically.
    pub engine: Option<MatchEngine>,
    /// Cached NFA threads.
    pub nfa_threads: Pool<NfaThreads>,
    /// Cached backtracking memory.
    pub backtrack: Pool<BackMachine>,
}

impl Program {
    /// Compiles a Regex.
    pub fn new(
        engine: Option<MatchEngine>,
        size_limit: usize,
        re: &str,
    ) -> Result<Program, Error> {
        let expr = try!(syntax::Expr::parse(re));
        let (insts, cap_names) = try!(Compiler::new(size_limit).compile(expr));
        let (insts_len, ncaps) = (insts.len(), num_captures(&insts));
        let create_threads = move || NfaThreads::new(insts_len, ncaps);
        let create_backtrack = move || BackMachine::new();
        let mut prog = Program {
            original: re.into(),
            insts: insts,
            cap_names: cap_names,
            prefixes: Prefix::Empty,
            prefixes_complete: false,
            anchored_begin: false,
            anchored_end: false,
            engine: engine,
            nfa_threads: Pool::new(Box::new(create_threads)),
            backtrack: Pool::new(Box::new(create_backtrack)),
        };

        prog.find_prefixes();
        prog.anchored_begin = match prog.insts[1] {
            Inst::EmptyLook(LookInst::StartText) => true,
            _ => false,
        };
        prog.anchored_end = match prog.insts[prog.insts.len() - 3] {
            Inst::EmptyLook(LookInst::EndText) => true,
            _ => false,
        };
        Ok(prog)
    }

    /// Executes a compiled regex program.
    pub fn exec(
        &self,
        caps: &mut CaptureIdxs,
        text: &str,
        start: usize,
    ) -> bool {
        match self.choose_engine(caps.len(), text) {
            MatchEngine::Backtrack => Backtrack::exec(self, caps, text, start),
            MatchEngine::Nfa => Nfa::exec(self, caps, text, start),
            MatchEngine::Literals => {
                match self.prefixes.find(&text[start..]) {
                    None => false,
                    Some((s, e)) => {
                        if caps.len() == 2 {
                            caps[0] = Some(start + s);
                            caps[1] = Some(start + e);
                        }
                        true
                    }
                }
            }
        }
    }

    fn choose_engine(&self, cap_len: usize, text: &str) -> MatchEngine {
        // If the engine is already chosen, then we use it.
        // But that might not be a good idea. e.g., What if `Literals` is
        // chosen and it can't work? I guess we should probably check whether
        // the chosen engine is appropriate or not.
        self.engine.unwrap_or_else(|| {
            if cap_len <= 2
               && self.prefixes_complete
               && self.prefixes.preserves_priority() {
                MatchEngine::Literals
            } else if Backtrack::should_exec(self, text) {
                // We're only here if the input and regex combined are small.
                MatchEngine::Backtrack
            } else {
                MatchEngine::Nfa
            }
        })
    }

    /// Returns the total number of capture groups in the regular expression.
    /// This includes the zeroth capture.
    pub fn num_captures(&self) -> usize {
        num_captures(&self.insts)
    }

    /// Allocate new capture groups.
    pub fn alloc_captures(&self) -> Vec<Option<usize>> {
        vec![None; 2 * self.num_captures()]
    }

    /// Find and store a prefix machine for the current program.
    pub fn find_prefixes(&mut self) {
        // First, look for a standard literal prefix---this includes things
        // like `a+` and `[0-9]+`, but not `a|b`.
        let (ps, complete) = self.literals(self.skip(1));
        if ps.len() > 0 {
            self.prefixes = Prefix::new(ps);
            self.prefixes_complete = complete;
            return;
        }
        // Ok, now look for alternate prefixes, e.g., `a|b`.
        if let Some((pfxs, complete)) = self.alternate_prefixes() {
            self.prefixes = Prefix::new(pfxs);
            self.prefixes_complete = complete;
        }
    }

    fn alternate_prefixes(&self) -> Option<(Vec<String>, bool)> {
        let mut prefixes = vec![];
        let mut pcomplete = true;
        let mut stack = vec![self.skip(1)];
        while let Some(mut pc) = stack.pop() {
            pc = self.skip(pc);
            match &self.insts[pc] {
                &Inst::Split(x, y) => { stack.push(y); stack.push(x); }
                _ => {
                    let (alt_prefixes, complete) = self.literals(pc);
                    if alt_prefixes.is_empty() {
                        // If no prefixes could be identified for this
                        // alternate, then we can't use a prefix machine to
                        // skip through the input. Thus, we fail and report
                        // nothing.
                        return None;
                    }
                    if prefixes.len() + alt_prefixes.len() > NUM_PREFIX_LIMIT {
                        // Arg. We've over-extended ourselves, quit with
                        // nothing to show for it.
                        //
                        // This could happen if the regex is `a|b|c|...`, where
                        // the number of alternates is too much for us to
                        // handle given an empirically defined threshold limit.
                        //
                        // When this happens, we can't capture all of the
                        // prefixes, so our prefix machine becomes useless.
                        // Thus, fail and report nothing.
                        return None;
                    }
                    pcomplete = pcomplete && complete;
                    prefixes.extend(alt_prefixes);
                }
            }
        }
        if prefixes.is_empty() {
            None
        } else {
            Some((prefixes, pcomplete))
        }
    }

    /// Find required literals starting at the given instruction.
    ///
    /// Returns `true` in the tuple if the end of the literal leads trivially
    /// to a match. (This may report false negatives, but being conservative
    /// is OK.)
    fn literals(&self, mut pc: usize) -> (Vec<String>, bool) {
        use self::Inst::*;

        let mut complete = true;
        let mut alts = vec![String::new()];
        while pc < self.insts.len() {
            let inst = &self.insts[pc];

            // Each iteration adds one character to every alternate prefix *or*
            // it stops. Thus, the prefix alternates grow in lock step, and it
            // suffices to check one of them to see if the prefix limit has
            // been exceeded.
            if alts[0].len() > PREFIX_LENGTH_LIMIT {
                complete = false;
                break;
            }
            match *inst {
                Save(_) => { pc += 1; continue }
                Jump(pc2) => pc = pc2,
                Char(c) => {
                    for alt in &mut alts {
                        alt.push(c);
                    }
                    pc += 1;
                }
                Ranges(CharRanges { ref ranges }) => {
                    // This adds a new literal for *each* character in this
                    // range. This has the potential to use way too much
                    // memory, so we bound it naively for now.
                    let nchars = num_chars_in_ranges(ranges);
                    if alts.len() * nchars > NUM_PREFIX_LIMIT {
                        complete = false;
                        break;
                    }

                    let orig = alts;
                    alts = Vec::with_capacity(orig.len());
                    for &(s, e) in ranges {
                        for c in (s as u32)..(e as u32 + 1){
                            for alt in &orig {
                                let mut alt = alt.clone();
                                alt.push(::std::char::from_u32(c).unwrap());
                                alts.push(alt);
                            }
                        }
                    }
                    pc += 1;
                }
                _ => { complete = self.leads_to_match(pc); break }
            }
        }
        if alts[0].len() == 0 {
            (vec![], false)
        } else {
            (alts, complete)
        }
    }

    fn leads_to_match(&self, pc: usize) -> bool {
        // I'm pretty sure this is conservative, so it might have some
        // false negatives.
        match self.insts[self.skip(pc)] {
            Inst::Match => true,
            _ => false,
        }
    }

    fn skip(&self, mut pc: usize) -> usize {
        loop {
            match self.insts[pc] {
                Inst::Save(_) => pc += 1,
                Inst::Jump(pc2) => pc = pc2,
                _ => return pc,
            }
        }
    }
}

impl Clone for Program {
    fn clone(&self) -> Program {
        let (insts_len, ncaps) = (self.insts.len(), self.num_captures());
        let create_threads = move || NfaThreads::new(insts_len, ncaps);
        let create_backtrack = move || BackMachine::new();
        Program {
            original: self.original.clone(),
            insts: self.insts.clone(),
            cap_names: self.cap_names.clone(),
            prefixes: self.prefixes.clone(),
            prefixes_complete: self.prefixes_complete,
            anchored_begin: self.anchored_begin,
            anchored_end: self.anchored_end,
            engine: self.engine,
            nfa_threads: Pool::new(Box::new(create_threads)),
            backtrack: Pool::new(Box::new(create_backtrack)),
        }
    }
}

/// Return the number of captures in the given sequence of instructions.
fn num_captures(insts: &[Inst]) -> usize {
    let mut n = 0;
    for inst in insts {
        match *inst {
            Inst::Save(c) => n = cmp::max(n, c+1),
            _ => {}
        }
    }
    // There's exactly 2 Save slots for every capture.
    n / 2
}

/// Count the number of characters in the given range.
///
/// This is useful for pre-emptively limiting the number of prefix literals
/// we extract from a regex program.
fn num_chars_in_ranges(ranges: &[(char, char)]) -> usize {
    ranges.iter()
          .map(|&(s, e)| 1 + (e as u32) - (s as u32))
          .fold(0, |acc, len| acc + len) as usize
}

#[cfg(test)]
mod tests {
    use super::Program;

    macro_rules! prog {
        ($re:expr) => { Program::new(None, 1 << 30, $re).unwrap() }
    }

    macro_rules! prefixes {
        ($re:expr) => {{
            let p = prog!($re);
            assert!(!p.prefixes_complete);
            p.prefixes.prefixes()
        }}
    }
    macro_rules! prefixes_complete {
        ($re:expr) => {{
            let p = prog!($re);
            assert!(p.prefixes_complete);
            p.prefixes.prefixes()
        }}
    }

    #[test]
    fn single() {
        assert_eq!(prefixes_complete!("a"), vec!["a"]);
        assert_eq!(prefixes_complete!("[a]"), vec!["a"]);
        assert_eq!(prefixes!("a+"), vec!["a"]);
        assert_eq!(prefixes!("(?:a)+"), vec!["a"]);
        assert_eq!(prefixes!("(a)+"), vec!["a"]);
    }

    #[test]
    fn single_alt() {
        assert_eq!(prefixes_complete!("a|b"), vec!["a", "b"]);
        assert_eq!(prefixes_complete!("b|a"), vec!["b", "a"]);
        assert_eq!(prefixes_complete!("[a]|[b]"), vec!["a", "b"]);
        assert_eq!(prefixes!("a+|b"), vec!["a", "b"]);
        assert_eq!(prefixes!("a|b+"), vec!["a", "b"]);
        assert_eq!(prefixes!("(?:a+)|b"), vec!["a", "b"]);
        assert_eq!(prefixes!("(a+)|b"), vec!["a", "b"]);
    }

    #[test]
    fn many() {
        assert_eq!(prefixes_complete!("abcdef"), vec!["abcdef"]);
        assert_eq!(prefixes!("abcdef+"), vec!["abcdef"]);
        assert_eq!(prefixes!("(?:abcdef)+"), vec!["abcdef"]);
        assert_eq!(prefixes!("(abcdef)+"), vec!["abcdef"]);
    }

    #[test]
    fn many_alt() {
        assert_eq!(prefixes_complete!("abc|def"), vec!["abc", "def"]);
        assert_eq!(prefixes_complete!("def|abc"), vec!["def", "abc"]);
        assert_eq!(prefixes!("abc+|def"), vec!["abc", "def"]);
        assert_eq!(prefixes!("abc|def+"), vec!["abc", "def"]);
        assert_eq!(prefixes!("(?:abc)+|def"), vec!["abc", "def"]);
        assert_eq!(prefixes!("(abc)+|def"), vec!["abc", "def"]);
    }

    #[test]
    fn class() {
        assert_eq!(prefixes_complete!("[0-9]"), vec![
            "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        ]);
        assert_eq!(prefixes!("[0-9]+"), vec![
            "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        ]);
    }

    #[test]
    fn preceding_alt() {
        assert_eq!(prefixes!("(?:a|b).+"), vec!["a", "b"]);
        assert_eq!(prefixes!("(a|b).+"), vec!["a", "b"]);
    }

    #[test]
    fn nested_alt() {
        assert_eq!(prefixes_complete!("(a|b|c|d)"),
                   vec!["a", "b", "c", "d"]);
        assert_eq!(prefixes_complete!("((a|b)|(c|d))"),
                   vec!["a", "b", "c", "d"]);
    }
}