use {
grep_matcher::{
ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,
NoError,
},
regex_automata::{
Input, PatternID, meta::Regex,
util::captures::Captures as AutomataCaptures,
},
};
use crate::{config::Config, error::Error, literal::InnerLiterals};
#[derive(Clone, Debug)]
pub struct RegexMatcherBuilder {
config: Config,
}
impl Default for RegexMatcherBuilder {
fn default() -> RegexMatcherBuilder {
RegexMatcherBuilder::new()
}
}
impl RegexMatcherBuilder {
pub fn new() -> RegexMatcherBuilder {
RegexMatcherBuilder { config: Config::default() }
}
pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
self.build_many(&[pattern])
}
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
) -> Result<RegexMatcher, Error> {
let mut chir = self.config.build_many(patterns)?;
if chir.config().whole_line {
chir = chir.into_whole_line();
} else if chir.config().word {
chir = chir.into_word();
}
let regex = chir.to_regex()?;
log::trace!("final regex: {:?}", chir.hir().to_string());
let non_matching_bytes = chir.non_matching_bytes();
let fast_line_regex = InnerLiterals::new(&chir, ®ex).one_regex()?;
let mut config = self.config.clone();
config.line_terminator = chir.line_terminator();
Ok(RegexMatcher { config, regex, fast_line_regex, non_matching_bytes })
}
pub fn build_literals<B: AsRef<str>>(
&self,
literals: &[B],
) -> Result<RegexMatcher, Error> {
self.build_many(literals)
}
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
self.config.case_insensitive = yes;
self
}
pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
self.config.case_smart = yes;
self
}
pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
self.config.multi_line = yes;
self
}
pub fn dot_matches_new_line(
&mut self,
yes: bool,
) -> &mut RegexMatcherBuilder {
self.config.dot_matches_new_line = yes;
self
}
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
self.config.swap_greed = yes;
self
}
pub fn ignore_whitespace(
&mut self,
yes: bool,
) -> &mut RegexMatcherBuilder {
self.config.ignore_whitespace = yes;
self
}
pub fn unicode(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
self.config.unicode = yes;
self
}
pub fn octal(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
self.config.octal = yes;
self
}
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexMatcherBuilder {
self.config.size_limit = bytes;
self
}
pub fn dfa_size_limit(
&mut self,
bytes: usize,
) -> &mut RegexMatcherBuilder {
self.config.dfa_size_limit = bytes;
self
}
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexMatcherBuilder {
self.config.nest_limit = limit;
self
}
pub fn line_terminator(
&mut self,
line_term: Option<u8>,
) -> &mut RegexMatcherBuilder {
self.config.line_terminator = line_term.map(LineTerminator::byte);
self
}
pub fn ban_byte(&mut self, byte: Option<u8>) -> &mut RegexMatcherBuilder {
self.config.ban = byte;
self
}
pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
if yes {
self.config.line_terminator = Some(LineTerminator::crlf());
} else {
self.config.line_terminator = None;
}
self.config.crlf = yes;
self
}
pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
self.config.word = yes;
self
}
pub fn fixed_strings(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
self.config.fixed_strings = yes;
self
}
pub fn whole_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
self.config.whole_line = yes;
self
}
}
#[derive(Clone, Debug)]
pub struct RegexMatcher {
config: Config,
regex: Regex,
fast_line_regex: Option<Regex>,
non_matching_bytes: ByteSet,
}
impl RegexMatcher {
pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
RegexMatcherBuilder::new().build(pattern)
}
pub fn new_line_matcher(pattern: &str) -> Result<RegexMatcher, Error> {
RegexMatcherBuilder::new().line_terminator(Some(b'\n')).build(pattern)
}
}
impl Matcher for RegexMatcher {
type Captures = RegexCaptures;
type Error = NoError;
#[inline]
fn find_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<Match>, NoError> {
let input = Input::new(haystack).span(at..haystack.len());
Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end())))
}
#[inline]
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
Ok(RegexCaptures::new(self.regex.create_captures()))
}
#[inline]
fn capture_count(&self) -> usize {
self.regex.captures_len()
}
#[inline]
fn capture_index(&self, name: &str) -> Option<usize> {
self.regex.group_info().to_index(PatternID::ZERO, name)
}
#[inline]
fn try_find_iter<F, E>(
&self,
haystack: &[u8],
mut matched: F,
) -> Result<Result<(), E>, NoError>
where
F: FnMut(Match) -> Result<bool, E>,
{
for m in self.regex.find_iter(haystack) {
match matched(Match::new(m.start(), m.end())) {
Ok(true) => continue,
Ok(false) => return Ok(Ok(())),
Err(err) => return Ok(Err(err)),
}
}
Ok(Ok(()))
}
#[inline]
fn captures_at(
&self,
haystack: &[u8],
at: usize,
caps: &mut RegexCaptures,
) -> Result<bool, NoError> {
let input = Input::new(haystack).span(at..haystack.len());
let caps = caps.captures_mut();
self.regex.search_captures(&input, caps);
Ok(caps.is_match())
}
#[inline]
fn shortest_match_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<usize>, NoError> {
let input = Input::new(haystack).span(at..haystack.len());
Ok(self.regex.search_half(&input).map(|hm| hm.offset()))
}
#[inline]
fn non_matching_bytes(&self) -> Option<&ByteSet> {
Some(&self.non_matching_bytes)
}
#[inline]
fn line_terminator(&self) -> Option<LineTerminator> {
self.config.line_terminator
}
#[inline]
fn find_candidate_line(
&self,
haystack: &[u8],
) -> Result<Option<LineMatchKind>, NoError> {
Ok(match self.fast_line_regex {
Some(ref regex) => {
let input = Input::new(haystack);
regex
.search_half(&input)
.map(|hm| LineMatchKind::Candidate(hm.offset()))
}
None => {
self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
}
})
}
}
#[derive(Clone, Debug)]
pub struct RegexCaptures {
caps: AutomataCaptures,
}
impl Captures for RegexCaptures {
#[inline]
fn len(&self) -> usize {
self.caps.group_info().all_group_len()
}
#[inline]
fn get(&self, i: usize) -> Option<Match> {
self.caps.get_group(i).map(|sp| Match::new(sp.start, sp.end))
}
}
impl RegexCaptures {
#[inline]
pub(crate) fn new(caps: AutomataCaptures) -> RegexCaptures {
RegexCaptures { caps }
}
#[inline]
pub(crate) fn captures_mut(&mut self) -> &mut AutomataCaptures {
&mut self.caps
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn word() {
let matcher =
RegexMatcherBuilder::new().word(true).build(r"-2").unwrap();
assert!(matcher.is_match(b"abc -2 foo").unwrap());
let matcher =
RegexMatcherBuilder::new().word(false).build(r"\b-2\b").unwrap();
assert!(!matcher.is_match(b"abc -2 foo").unwrap());
}
#[test]
fn line_terminator() {
let matcher = RegexMatcherBuilder::new().build(r"abc\sxyz").unwrap();
assert!(matcher.is_match(b"abc\nxyz").unwrap());
let matcher = RegexMatcherBuilder::new()
.line_terminator(Some(b'\n'))
.build(r"abc\sxyz")
.unwrap();
assert!(!matcher.is_match(b"abc\nxyz").unwrap());
}
#[test]
fn line_terminator_error() {
assert!(
RegexMatcherBuilder::new()
.line_terminator(Some(b'\n'))
.build(r"a\nz")
.is_err()
)
}
#[test]
fn line_terminator_crlf() {
let matcher = RegexMatcherBuilder::new()
.multi_line(true)
.build(r"abc$")
.unwrap();
assert!(matcher.is_match(b"abc\n").unwrap());
let matcher = RegexMatcherBuilder::new()
.multi_line(true)
.build(r"abc$")
.unwrap();
assert!(!matcher.is_match(b"abc\r\n").unwrap());
let matcher = RegexMatcherBuilder::new()
.multi_line(true)
.crlf(true)
.build(r"abc$")
.unwrap();
assert!(matcher.is_match(b"abc\r\n").unwrap());
}
#[test]
fn case_smart() {
let matcher =
RegexMatcherBuilder::new().case_smart(true).build(r"abc").unwrap();
assert!(matcher.is_match(b"ABC").unwrap());
let matcher =
RegexMatcherBuilder::new().case_smart(true).build(r"aBc").unwrap();
assert!(!matcher.is_match(b"ABC").unwrap());
}
#[test]
#[ignore]
fn candidate_lines() {
fn is_confirmed(m: LineMatchKind) -> bool {
match m {
LineMatchKind::Confirmed(_) => true,
_ => false,
}
}
fn is_candidate(m: LineMatchKind) -> bool {
match m {
LineMatchKind::Candidate(_) => true,
_ => false,
}
}
let matcher = RegexMatcherBuilder::new().build(r"\wfoo\s").unwrap();
let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
assert!(is_confirmed(m));
let matcher = RegexMatcherBuilder::new()
.line_terminator(Some(b'\n'))
.build(r"\wfoo\s")
.unwrap();
let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
assert!(is_candidate(m));
}
}