extern crate bit_set;
extern crate regex;
#[cfg(test)]
#[macro_use]
extern crate matches;
#[cfg(test)]
#[macro_use]
extern crate quickcheck;
#[cfg(test)]
extern crate rand;
use bit_set::BitSet;
use std::fmt;
use std::usize;
pub mod analyze;
pub mod compile;
pub mod parse;
pub mod vm;
use analyze::analyze;
use compile::compile;
use parse::Parser;
use vm::Prog;
const MAX_RECURSION: usize = 64;
pub type Result<T> = ::std::result::Result<T, Error>;
static DEFAULT_SIZE_LIMIT: usize = 10 * (1<<20);
#[derive(Debug, PartialEq)]
pub enum Error {
ParseError,
UnclosedOpenParen,
InvalidRepeat,
RecursionExceeded,
LookBehindNotConst,
TrailingBackslash,
InvalidEscape,
UnclosedUnicodeName,
InvalidHex,
InvalidCodepointValue,
InvalidClass,
UnknownFlag,
NonUnicodeUnsupported,
InvalidBackref,
InnerError(regex::Error),
StackOverflow,
}
impl fmt::Display for Error {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
Error::ParseError => write!(formatter, "failed to parse regular expression"),
Error::UnclosedOpenParen => write!(formatter, "unmatched `(` in regular expression"),
Error::InvalidRepeat => {
write!(formatter, "invalid repeat syntax in regular expression")
}
Error::RecursionExceeded => {
write!(formatter, "recursion limit exceeded in regular expression")
}
Error::LookBehindNotConst => {
write!(formatter, "invalid look-behind in regular expression")
}
Error::TrailingBackslash => {
write!(formatter, "trailing backslash in regular expression")
}
Error::InvalidEscape => {
write!(formatter, "invalid escape sequence in regular expression")
}
Error::UnclosedUnicodeName => {
write!(formatter, "unclosed unicode name in regular expression")
}
Error::InvalidHex => write!(formatter, "invalid hex code in regular expression"),
Error::InvalidCodepointValue => {
write!(formatter, "invalid codepoint in regular expression")
}
Error::InvalidClass => {
write!(formatter, "invalid character class in regular expression")
}
Error::UnknownFlag => write!(formatter, "invalid flag in regular expression"),
Error::NonUnicodeUnsupported => write!(
formatter,
"non-unicode regular expressions are not supported"
),
Error::InvalidBackref => {
write!(formatter, "invalid back-reference in regular expression")
}
Error::InnerError(ref inner) => inner.fmt(formatter),
Error::StackOverflow => {
write!(formatter, "stack overflow executing regular expression")
}
}
}
}
pub enum Regex {
Wrap {
inner: regex::Regex,
inner1: Option<Box<regex::Regex>>,
original: String,
},
Impl {
prog: Prog,
n_groups: usize,
original: String,
},
}
#[derive(Debug)]
pub struct RegexBuilder {
pattern: String,
case_insensitive: bool,
multi_line: bool,
dot_matches_new_line: bool,
unicode: bool,
has_flags: bool,
size_limit: usize,
}
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct Match<'t> {
text: &'t str,
start: usize,
end: usize,
}
#[derive(Debug)]
pub enum Captures<'t> {
Wrap {
text: &'t str,
inner: regex::Captures<'t>,
offset: usize,
enclosing_groups: usize,
},
Impl {
text: &'t str,
saves: Vec<usize>,
},
}
#[derive(Debug)]
pub struct SubCaptureMatches<'c, 't: 'c> {
caps: &'c Captures<'t>,
i: usize,
}
impl fmt::Debug for Regex {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.as_str())
}
}
impl Regex {
pub fn new(re: &str) -> Result<Regex> {
Regex::new_with_size_limit(re, DEFAULT_SIZE_LIMIT)
}
fn new_with_size_limit(re: &str, size_limit: usize) -> Result<Regex> {
let (raw_e, backrefs) = Expr::parse(re)?;
let e = Expr::Concat(vec![
Expr::Repeat {
child: Box::new(Expr::Any { newline: true }),
lo: 0,
hi: usize::MAX,
greedy: false,
},
Expr::Group(Box::new(raw_e)),
]);
let info = analyze(&e, &backrefs)?;
let inner_info = &info.children[1].children[0];
if !inner_info.hard {
let mut re_cooked = String::new();
let raw_e = match e {
Expr::Concat(ref v) => match v[1] {
Expr::Group(ref child) => child,
_ => unreachable!(),
},
_ => unreachable!(),
};
raw_e.to_str(&mut re_cooked, 0);
let inner = compile::compile_inner_with_size_limit(&re_cooked, size_limit)?;
let inner1 = if inner_info.looks_left {
let re1 = ["^(?s:.)+?(", re_cooked.as_str(), ")"].concat();
let compiled = compile::compile_inner_with_size_limit(&re1, size_limit)?;
Some(Box::new(compiled))
} else {
None
};
return Ok(Regex::Wrap {
inner: inner,
inner1: inner1,
original: re.to_string(),
});
}
let p = compile(&info)?;
Ok(Regex::Impl {
prog: p,
n_groups: info.end_group,
original: re.to_string(),
})
}
pub fn as_str(&self) -> &str {
match *self {
Regex::Wrap { ref original, .. } => &original,
Regex::Impl { ref original, .. } => &original,
}
}
pub fn is_match(&self, text: &str) -> Result<bool> {
match *self {
Regex::Wrap { ref inner, .. } => Ok(inner.is_match(text)),
Regex::Impl { ref prog, .. } => {
let result = vm::run(prog, text, 0, 0)?;
Ok(result.is_some())
}
}
}
pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
match *self {
Regex::Wrap { ref inner, .. } => Ok(inner
.find(text)
.map(|m| Match::new(text, m.start(), m.end()))),
Regex::Impl { ref prog, .. } => {
let result = vm::run(prog, text, 0, 0)?;
Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
}
}
}
pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
match *self {
Regex::Wrap { ref inner, .. } => Ok(inner.captures(text).map(|caps| Captures::Wrap {
text,
inner: caps,
offset: 0,
enclosing_groups: 0,
})),
Regex::Impl {
ref prog, n_groups, ..
} => {
let result = vm::run(prog, text, 0, 0)?;
Ok(result.map(|mut saves| {
saves.truncate(n_groups * 2);
Captures::Impl {
text,
saves: saves,
}
}))
}
}
}
pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
match *self {
Regex::Wrap {
ref inner,
ref inner1,
..
} => {
if inner1.is_none() || pos == 0 {
Ok(inner.captures(&text[pos..]).map(|caps| Captures::Wrap {
text,
inner: caps,
offset: pos,
enclosing_groups: 0,
}))
} else {
let ix = prev_codepoint_ix(text, pos);
let inner1 = inner1.as_ref().unwrap();
Ok(inner1.captures(&text[ix..]).map(|caps| Captures::Wrap {
text,
inner: caps,
offset: ix,
enclosing_groups: 1,
}))
}
}
Regex::Impl {
ref prog, n_groups, ..
} => {
let result = vm::run(prog, text, pos, 0)?;
Ok(result.map(|mut saves| {
saves.truncate(n_groups * 2);
Captures::Impl {
text,
saves,
}
}))
}
}
}
pub fn debug_print(&self) {
match *self {
Regex::Wrap { ref inner, .. } => println!("wrapped {:?}", inner),
Regex::Impl { ref prog, .. } => prog.debug_print(),
}
}
}
impl RegexBuilder {
pub fn new(pattern: impl Into<String>) -> Self {
Self {
pattern: pattern.into(),
case_insensitive: false,
multi_line: false,
dot_matches_new_line: false,
unicode: false,
has_flags: false,
size_limit: DEFAULT_SIZE_LIMIT,
}
}
pub fn case_insensitive(&mut self, value: bool) -> &mut Self {
self.case_insensitive = value;
if value {
self.has_flags = true;
}
self
}
pub fn multi_line(&mut self, value: bool) -> &mut Self {
self.multi_line = value;
if value {
self.has_flags = true;
}
self
}
pub fn dot_matches_new_line(&mut self, value: bool) -> &mut Self {
self.dot_matches_new_line = value;
if value {
self.has_flags = true;
}
self
}
pub fn unicode(&mut self, value: bool) -> &mut Self {
self.unicode = value;
if value {
self.has_flags = true;
}
self
}
pub fn size_limit(&mut self, value: usize) -> &mut Self {
self.size_limit = value;
self
}
pub fn build(&self) -> Result<Regex> {
if self.has_flags {
let flags = format!(
"{}{}{}{}",
if self.case_insensitive { "i" } else { "" },
if self.multi_line { "m" } else { "" },
if self.dot_matches_new_line { "s" } else { "" },
if self.unicode { "u" } else { "" },
);
let pattern = format!("(?{}){}", &flags, &self.pattern);
Regex::new_with_size_limit(&pattern, self.size_limit)
} else {
Regex::new_with_size_limit(&self.pattern, self.size_limit)
}
}
}
impl<'t> Match<'t> {
#[inline]
pub fn start(&self) -> usize {
self.start
}
#[inline]
pub fn end(&self) -> usize {
self.end
}
#[inline]
pub fn as_str(&self) -> &'t str {
&self.text[self.start..self.end]
}
fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
Match { text, start, end }
}
}
impl<'t> Captures<'t> {
pub fn get(&self, i: usize) -> Option<Match<'t>> {
match *self {
Captures::Wrap {
text,
ref inner,
ref offset,
enclosing_groups,
} => inner.get(i + enclosing_groups).map(|m| Match {
text,
start: m.start() + offset,
end: m.end() + offset,
}),
Captures::Impl { text, ref saves } => {
if i >= saves.len() {
return None;
}
let lo = saves[i * 2];
if lo == std::usize::MAX {
return None;
}
let hi = saves[i * 2 + 1];
Some(Match {
text,
start: lo,
end: hi,
})
}
}
}
pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
SubCaptureMatches { caps: self, i: 0 }
}
pub fn len(&self) -> usize {
match *self {
Captures::Wrap {
ref inner,
enclosing_groups,
..
} => inner.len() - enclosing_groups,
Captures::Impl { ref saves, .. } => saves.len() / 2,
}
}
}
impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
type Item = Option<Match<'t>>;
fn next(&mut self) -> Option<Option<Match<'t>>> {
if self.i < self.caps.len() {
let result = self.caps.get(self.i);
self.i += 1;
Some(result)
} else {
None
}
}
}
#[derive(Debug, PartialEq, Eq)]
pub enum Expr {
Empty,
Any {
newline: bool,
},
StartText,
EndText,
StartLine,
EndLine,
Literal {
val: String,
casei: bool,
},
Concat(Vec<Expr>),
Alt(Vec<Expr>),
Group(Box<Expr>),
LookAround(Box<Expr>, LookAround),
Repeat {
child: Box<Expr>,
lo: usize,
hi: usize,
greedy: bool,
},
Delegate {
inner: String,
size: usize,
casei: bool,
},
Backref(usize),
AtomicGroup(Box<Expr>),
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum LookAround {
LookAhead,
LookAheadNeg,
LookBehind,
LookBehindNeg,
}
fn push_usize(s: &mut String, x: usize) {
if x >= 10 {
push_usize(s, x / 10);
s.push((b'0' + (x % 10) as u8) as char);
} else {
s.push((b'0' + (x as u8)) as char);
}
}
fn push_quoted(buf: &mut String, s: &str) {
for c in s.chars() {
match c {
'\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
| '#' => buf.push('\\'),
_ => (),
}
buf.push(c);
}
}
impl Expr {
pub fn parse(re: &str) -> Result<(Expr, BitSet)> {
Parser::parse(re)
}
pub fn to_str(&self, buf: &mut String, precedence: u8) {
match *self {
Expr::Empty => (),
Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
Expr::Literal { ref val, casei } => {
if casei {
buf.push_str("(?i:");
}
push_quoted(buf, val);
if casei {
buf.push_str(")");
}
}
Expr::StartText => buf.push('^'),
Expr::EndText => buf.push('$'),
Expr::StartLine => buf.push_str("(?m:^)"),
Expr::EndLine => buf.push_str("(?m:$)"),
Expr::Concat(ref children) => {
if precedence > 1 {
buf.push_str("(?:");
}
for child in children {
child.to_str(buf, 2);
}
if precedence > 1 {
buf.push(')')
}
}
Expr::Alt(ref children) => {
if precedence > 0 {
buf.push_str("(?:");
}
let is_empty = |e| match e {
&Expr::Empty => true,
_ => false,
};
let contains_empty = children.iter().any(&is_empty);
if contains_empty {
buf.push_str("(?:");
}
for (i, child) in children.iter().filter(|&c| !is_empty(c)).enumerate() {
if i != 0 {
buf.push('|');
}
child.to_str(buf, 1);
}
if contains_empty {
buf.push_str(")?");
}
if precedence > 0 {
buf.push(')');
}
}
Expr::Group(ref child) => {
buf.push('(');
child.to_str(buf, 0);
buf.push(')');
}
Expr::Repeat {
ref child,
lo,
hi,
greedy,
} => {
if precedence > 2 {
buf.push_str("(?:");
}
child.to_str(buf, 3);
buf.push('{');
push_usize(buf, lo);
buf.push(',');
if hi != usize::MAX {
push_usize(buf, hi);
}
buf.push('}');
if !greedy {
buf.push('?');
}
if precedence > 2 {
buf.push(')');
}
}
Expr::Delegate {
ref inner, casei, ..
} => {
if casei {
buf.push_str("(?i:");
}
buf.push_str(inner);
if casei {
buf.push_str(")");
}
}
_ => panic!("attempting to format hard expr"),
}
}
}
fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
let bytes = s.as_bytes();
loop {
ix -= 1;
if (bytes[ix] as i8) >= -0x40 {
break;
}
}
ix
}
fn codepoint_len(b: u8) -> usize {
match b {
b if b < 0x80 => 1,
b if b < 0xe0 => 2,
b if b < 0xf0 => 3,
_ => 4,
}
}
#[cfg(test)]
mod tests {
use parse::make_literal;
use Expr;
use Regex;
#[test]
fn to_str_concat_alt() {
let mut s = String::new();
let e = Expr::Concat(vec![
Expr::Alt(vec![make_literal("a"), make_literal("b")]),
make_literal("c"),
]);
e.to_str(&mut s, 0);
assert_eq!(s, "(?:a|b)c");
}
#[test]
fn to_str_rep_concat() {
let mut s = String::new();
let e = Expr::Repeat {
child: Box::new(Expr::Concat(vec![make_literal("a"), make_literal("b")])),
lo: 2,
hi: 3,
greedy: true,
};
e.to_str(&mut s, 0);
assert_eq!(s, "(?:ab){2,3}");
}
#[test]
fn to_str_group_alt() {
let mut s = String::new();
let e = Expr::Group(Box::new(Expr::Alt(vec![
make_literal("a"),
make_literal("b"),
])));
e.to_str(&mut s, 0);
assert_eq!(s, "(a|b)");
}
#[test]
fn as_str_debug() {
let s = r"(a+)b\1";
let regex = Regex::new(s).unwrap();
assert_eq!(s, regex.as_str());
assert_eq!(s, format!("{:?}", regex));
}
}