use regex_syntax::{Expr, Repeater, CharClass, ClassRange};
use utf8_ranges::{Utf8Sequences, Utf8Sequence};
use regex::{Error, Inst};
use Result;
pub struct Compiler {
size_limit: usize,
insts: Vec<Inst>,
}
impl Compiler {
pub fn new(size_limit: usize) -> Compiler {
Compiler {
size_limit: size_limit,
insts: vec![],
}
}
pub fn compile(mut self, ast: &Expr) -> Result<Vec<Inst>> {
try!(self.c(ast));
self.insts.push(Inst::Match);
Ok(self.insts)
}
fn c(&mut self, ast: &Expr) -> Result<()> {
use regex::Inst::*;
match *ast {
Expr::StartLine
| Expr::EndLine
| Expr::StartText
| Expr::EndText => return Err(From::from(Error::NoEmpty)),
Expr::WordBoundary
| Expr::NotWordBoundary => {
return Err(From::from(Error::NoWordBoundary));
}
Expr::Empty => {},
Expr::Literal { ref chars, casei } => {
for &c in chars {
if casei {
try!(self.c(&Expr::Class(CharClass::new(vec![
ClassRange { start: c, end: c },
]).case_fold())));
} else {
for seq in Utf8Sequences::new(c, c) {
self.compile_utf8_ranges(&seq);
}
}
}
}
Expr::AnyChar => try!(self.c(&Expr::Class(CharClass::new(vec![
ClassRange { start: '\u{0}', end: '\u{10FFFF}' },
])))),
Expr::AnyCharNoNL => try!(self.c(&Expr::Class(CharClass::new(vec![
ClassRange { start: '\u{0}', end: '\u{09}' },
ClassRange { start: '\u{0B}', end: '\u{10FFFF}' },
])))),
Expr::Class(ref cls) => {
try!(self.compile_class(cls));
}
Expr::Group { ref e, .. } => try!(self.c(e)),
Expr::Concat(ref es) => {
for e in es {
try!(self.c(e));
}
}
Expr::Alternate(ref es) => {
if es.len() == 0 {
return Ok(());
}
let mut jmps_to_end = vec![];
for e in &es[0..es.len()-1] {
let split = self.empty_split();
let j1 = self.insts.len();
try!(self.c(e));
jmps_to_end.push(self.empty_jump());
let j2 = self.insts.len();
self.set_split(split, j1, j2);
}
try!(self.c(&es[es.len()-1]));
let end = self.insts.len();
for jmp_to_end in jmps_to_end {
self.set_jump(jmp_to_end, end);
}
}
Expr::Repeat { greedy: false, .. } => {
return Err(Error::NoLazy.into());
}
Expr::Repeat { ref e, r: Repeater::ZeroOrOne, .. } => {
let split = self.empty_split();
let j1 = self.insts.len();
try!(self.c(e));
let j2 = self.insts.len();
self.set_split(split, j1, j2);
}
Expr::Repeat { ref e, r: Repeater::ZeroOrMore, .. } => {
let j1 = self.insts.len();
let split = self.empty_split();
let j2 = self.insts.len();
try!(self.c(e));
let jmp = self.empty_jump();
let j3 = self.insts.len();
self.set_jump(jmp, j1);
self.set_split(split, j2, j3);
}
Expr::Repeat { ref e, r: Repeater::OneOrMore, .. } => {
let j1 = self.insts.len();
try!(self.c(e));
let split = self.empty_split();
let j2 = self.insts.len();
self.set_split(split, j1, j2);
}
Expr::Repeat {
ref e,
r: Repeater::Range { min, max: None },
..
} => {
for _ in 0..min {
try!(self.c(e));
}
try!(self.c(&Expr::Repeat {
e: e.clone(),
r: Repeater::ZeroOrMore,
greedy: true,
}));
}
Expr::Repeat {
ref e,
r: Repeater::Range { min, max: Some(max) },
..
} => {
for _ in 0..min {
try!(self.c(e));
}
let (mut splits, mut starts) = (vec![], vec![]);
for _ in min..max {
splits.push(self.empty_split());
starts.push(self.insts.len());
try!(self.c(e));
}
let end = self.insts.len();
for (split, start) in splits.into_iter().zip(starts) {
self.set_split(split, start, end);
}
}
}
self.check_size()
}
fn compile_class(&mut self, class: &CharClass) -> Result<()> {
if class.is_empty() {
return Ok(());
}
let mut jmps = vec![];
for r in &class[0..class.len()-1] {
let split = self.empty_split();
let j1 = self.insts.len();
try!(self.compile_class_range(r));
jmps.push(self.empty_jump());
let j2 = self.insts.len();
self.set_split(split, j1, j2);
}
try!(self.compile_class_range(&class[class.len()-1]));
let end = self.insts.len();
for jmp in jmps {
self.set_jump(jmp, end);
}
Ok(())
}
fn compile_class_range(&mut self, char_range: &ClassRange) -> Result<()> {
let mut it = Utf8Sequences::new(char_range.start, char_range.end)
.peekable();
let mut jmps = vec![];
let mut utf8_ranges = it.next().expect("non-empty char class");
while it.peek().is_some() {
let split = self.empty_split();
let j1 = self.insts.len();
self.compile_utf8_ranges(&utf8_ranges);
jmps.push(self.empty_jump());
let j2 = self.insts.len();
self.set_split(split, j1, j2);
utf8_ranges = it.next().unwrap(); }
self.compile_utf8_ranges(&utf8_ranges);
let end = self.insts.len();
for jmp in jmps {
self.set_jump(jmp, end);
}
Ok(())
}
fn compile_utf8_ranges(&mut self, utf8_ranges: &Utf8Sequence) {
for r in utf8_ranges {
self.push(Inst::Range(r.start, r.end));
}
}
fn check_size(&self) -> Result<()> {
use std::mem::size_of;
if self.insts.len() * size_of::<Inst>() > self.size_limit {
Err(Error::CompiledTooBig(self.size_limit).into())
} else {
Ok(())
}
}
#[inline]
fn push(&mut self, x: Inst) {
self.insts.push(x)
}
#[inline]
fn empty_split(&mut self) -> usize {
self.insts.push(Inst::Split(0, 0));
self.insts.len() - 1
}
#[inline]
fn set_split(&mut self, i: usize, pc1: usize, pc2: usize) {
let split = &mut self.insts[i];
match *split {
Inst::Split(_, _) => *split = Inst::Split(pc1, pc2),
_ => panic!("BUG: Invalid split index."),
}
}
#[inline]
fn empty_jump(&mut self) -> usize {
self.insts.push(Inst::Jump(0));
self.insts.len() - 1
}
#[inline]
fn set_jump(&mut self, i: usize, pc: usize) {
let jmp = &mut self.insts[i];
match *jmp {
Inst::Jump(_) => *jmp = Inst::Jump(pc),
_ => panic!("BUG: Invalid jump index."),
}
}
}