regex 0.1.43

An implementation of regular expressions for Rust.
// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use syntax::{Expr, Repeater, CharClass, ClassRange};

use Error;
use program::{CharRanges, Inst, InstIdx};

type Compiled = (Vec<Inst>, Vec<Option<String>>);

/// A regex compiler.
///
/// A regex compiler is responsible for turning a regex's AST into a sequence
/// of instructions.
pub struct Compiler {
    size_limit: usize,
    insts: Vec<Inst>,
    cap_names: Vec<Option<String>>,
}

impl Compiler {
    /// Creates a new compiler that limits the size of the regex program
    /// to the size given (in bytes).
    pub fn new(size_limit: usize) -> Compiler {
        Compiler {
            size_limit: size_limit,
            insts: vec![],
            cap_names: vec![None],
        }
    }

    /// Compiles the given regex AST into a tuple of a sequence of
    /// instructions and a sequence of capture groups, optionally named.
    pub fn compile(mut self, ast: Expr) -> Result<Compiled, Error> {
        self.insts.push(Inst::Save(0));
        try!(self.c(ast));
        self.insts.push(Inst::Save(1));
        self.insts.push(Inst::Match);
        Ok((self.insts, self.cap_names))
    }

    fn c(&mut self, ast: Expr) -> Result<(), Error> {
        use program::Inst::*;
        use program::LookInst::*;

        match ast {
            Expr::Empty => {},
            Expr::Literal { chars, casei } => {
                for c in chars {
                    if casei {
                        try!(self.c(Expr::Class(CharClass::new(vec![
                            ClassRange { start: c, end: c },
                        ]).case_fold())));
                    } else {
                        self.push(Char(c));
                    }
                }
            }
            Expr::AnyChar => self.push(Ranges(CharRanges::any())),
            Expr::AnyCharNoNL => self.push(Ranges(CharRanges::any_nonl())),
            Expr::Class(cls) => {
                if cls.len() == 1 && cls[0].start == cls[0].end {
                    self.push(Char(cls[0].start));
                } else {
                    self.push(Ranges(CharRanges::from_class(cls)));
                }
            }
            Expr::StartLine => self.push(EmptyLook(StartLine)),
            Expr::EndLine => self.push(EmptyLook(EndLine)),
            Expr::StartText => self.push(EmptyLook(StartText)),
            Expr::EndText => self.push(EmptyLook(EndText)),
            Expr::WordBoundary => self.push(EmptyLook(WordBoundary)),
            Expr::NotWordBoundary => self.push(EmptyLook(NotWordBoundary)),
            Expr::Group { e, i: None, name: None } => try!(self.c(*e)),
            Expr::Group { e, i, name } => {
                let i = i.expect("capture index");
                self.cap_names.push(name);
                self.push(Save(2 * i));
                try!(self.c(*e));
                self.push(Save(2 * i + 1));
            }
            Expr::Concat(es) => {
                for e in es {
                    try!(self.c(e));
                }
            }
            Expr::Alternate(mut es) => {
                // TODO: Don't use recursion here. ---AG
                if es.len() == 0 {
                    return Ok(());
                }
                let e1 = es.remove(0);
                if es.len() == 0 {
                    try!(self.c(e1));
                    return Ok(());
                }
                let e2 = Expr::Alternate(es); // this causes recursion

                let split = self.empty_split();
                let j1 = self.insts.len();
                try!(self.c(e1));
                let jmp = self.empty_jump();
                let j2 = self.insts.len();
                try!(self.c(e2));
                let j3 = self.insts.len();

                self.set_split(split, j1, j2);
                self.set_jump(jmp, j3);
            }
            Expr::Repeat { e, r: Repeater::ZeroOrOne, greedy } => {
                let split = self.empty_split();
                let j1 = self.insts.len();
                try!(self.c(*e));
                let j2 = self.insts.len();

                if greedy {
                    self.set_split(split, j1, j2);
                } else {
                    self.set_split(split, j2, j1);
                }
            }
            Expr::Repeat { e, r: Repeater::ZeroOrMore, greedy } => {
                let j1 = self.insts.len();
                let split = self.empty_split();
                let j2 = self.insts.len();
                try!(self.c(*e));
                let jmp = self.empty_jump();
                let j3 = self.insts.len();

                self.set_jump(jmp, j1);
                if greedy {
                    self.set_split(split, j2, j3);
                } else {
                    self.set_split(split, j3, j2);
                }
            }
            Expr::Repeat { e, r: Repeater::OneOrMore, greedy } => {
                let j1 = self.insts.len();
                try!(self.c(*e));
                let split = self.empty_split();
                let j2 = self.insts.len();

                if greedy {
                    self.set_split(split, j1, j2);
                } else {
                    self.set_split(split, j2, j1);
                }
            }
            Expr::Repeat {
                e,
                r: Repeater::Range { min, max: None },
                greedy,
            } => {
                let e = *e;
                for _ in 0..min {
                    try!(self.c(e.clone()));
                }
                try!(self.c(Expr::Repeat {
                    e: Box::new(e),
                    r: Repeater::ZeroOrMore,
                    greedy: greedy,
                }));
            }
            Expr::Repeat {
                e,
                r: Repeater::Range { min, max: Some(max) },
                greedy,
            } => {
                let e = *e;
                for _ in 0..min {
                    try!(self.c(e.clone()));
                }
                // It is much simpler to compile, e.g., `a{2,5}` as:
                //
                //     aaa?a?a?
                //
                // But you end up with a sequence of instructions like this:
                //
                //     0: 'a'
                //     1: 'a',
                //     2: split(3, 4)
                //     3: 'a'
                //     4: split(5, 6)
                //     5: 'a'
                //     6: split(7, 8)
                //     7: 'a'
                //     8: MATCH
                //
                // This is *incredibly* inefficient because the splits end
                // up forming a chain. Given a much larger number than `5`,
                // it is easy cause perverse behavior in the matching engines
                // like stack overflows. We could fix the matching engine,
                // but instead, we should just make the program smarter.
                // Thus, we do a custom job here and instead of chaining the
                // splits together, we simply point them to the MATCH
                // instruction directly.
                let (mut splits, mut starts) = (vec![], vec![]);
                for _ in min..max {
                    splits.push(self.empty_split());
                    starts.push(self.insts.len());
                    try!(self.c(e.clone()));
                }
                let end = self.insts.len();
                for (split, start) in splits.into_iter().zip(starts) {
                    if greedy {
                        self.set_split(split, start, end);
                    } else {
                        self.set_split(split, end, start);
                    }
                }
            }
        }
        self.check_size()
    }

    fn check_size(&self) -> Result<(), Error> {
        use std::mem::size_of;

        if self.insts.len() * size_of::<Inst>() > self.size_limit {
            Err(Error::CompiledTooBig(self.size_limit))
        } else {
            Ok(())
        }
    }

    /// Appends the given instruction to the program.
    #[inline]
    fn push(&mut self, x: Inst) {
        self.insts.push(x)
    }

    /// Appends an *empty* `Split` instruction to the program and returns
    /// the index of that instruction. (The index can then be used to "patch"
    /// the actual locations of the split in later.)
    #[inline]
    fn empty_split(&mut self) -> InstIdx {
        self.insts.push(Inst::Split(0, 0));
        self.insts.len() - 1
    }

    /// Sets the left and right locations of a `Split` instruction at index
    /// `i` to `pc1` and `pc2`, respectively.
    /// If the instruction at index `i` isn't a `Split` instruction, then
    /// `panic!` is called.
    #[inline]
    fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) {
        let split = &mut self.insts[i];
        match *split {
            Inst::Split(_, _) => *split = Inst::Split(pc1, pc2),
            _ => panic!("BUG: Invalid split index."),
        }
    }

    /// Appends an *empty* `Jump` instruction to the program and returns the
    /// index of that instruction.
    #[inline]
    fn empty_jump(&mut self) -> InstIdx {
        self.insts.push(Inst::Jump(0));
        self.insts.len() - 1
    }

    /// Sets the location of a `Jump` instruction at index `i` to `pc`.
    /// If the instruction at index `i` isn't a `Jump` instruction, then
    /// `panic!` is called.
    #[inline]
    fn set_jump(&mut self, i: InstIdx, pc: InstIdx) {
        let jmp = &mut self.insts[i];
        match *jmp {
            Inst::Jump(_) => *jmp = Inst::Jump(pc),
            _ => panic!("BUG: Invalid jump index."),
        }
    }
}