use std::collections::hash_map::DefaultHasher;
use std::hash::Hash;
use std::hash::Hasher;
use memchr::memmem::Finder;
use regex::bytes::Regex;
use crate::cli::ResolvedConfig;
/// A search pattern compiled from a `ResolvedConfig`: the byte-oriented regex plus
/// optional literal substring searchers derived from the same configuration.
pub struct CompiledPattern {
/// Regex built from all configured patterns (see `CompiledPattern::compile`).
pub regex: Regex,
/// 16-digit lowercase hex hash of the configuration, produced by `make_cache_key`.
pub cache_key: String,
// Substring searcher for the entire pattern. `compile` only sets it for a single,
// case-sensitive pattern without word/line wrapping that is a fixed string or
// contains no regex metacharacters; `is_match` then bypasses the regex.
literal: Option<Finder<'static>>,
// Substring searcher for the literal text at the start of a single case-sensitive
// pattern; `compile` only sets it when `literal` is absent and the text is at
// least two bytes long.
prefix: Option<Finder<'static>>,
}
impl CompiledPattern {
pub fn compile(config: &ResolvedConfig) -> Result<Self, regex::Error> {
let escaped: Vec<String> = if config.fixed_strings {
config.patterns.iter().map(|p| regex::escape(p)).collect()
} else {
config.patterns.clone()
};
let pattern = if escaped.len() == 1 {
escaped.into_iter().next().unwrap()
} else {
escaped.iter().map(|p| format!("(?:{p})")).collect::<Vec<_>>().join("|")
};
let pattern = if config.fixed_strings { pattern } else { sanitize_braces(&pattern) };
let pattern = if config.line_regexp {
format!("^(?:{pattern})$")
} else if config.word_regexp {
format!(r"\b(?:{pattern})\b")
} else {
pattern
};
let regex = regex::bytes::RegexBuilder::new(&pattern)
.case_insensitive(config.ignore_case)
.unicode(false)
.build()?;
let literal = if config.patterns.len() == 1
&& !config.ignore_case
&& !config.word_regexp
&& !config.line_regexp
&& (config.fixed_strings || is_literal(&config.patterns[0]))
{
Some(Finder::new(config.patterns[0].as_bytes()).into_owned())
} else {
None
};
let prefix = if literal.is_none() && config.patterns.len() == 1 && !config.ignore_case {
let raw = &config.patterns[0];
let pfx = extract_literal_prefix(raw);
if pfx.len() >= 2 { Some(Finder::new(pfx.as_bytes()).into_owned()) } else { None }
} else {
None
};
let cache_key = Self::make_cache_key(config);
Ok(Self { regex, cache_key, literal, prefix })
}
#[inline]
pub fn is_match(&self, haystack: &[u8]) -> bool {
if let Some(ref finder) = self.literal {
finder.find(haystack).is_some()
} else {
self.regex.is_match(haystack)
}
}
#[inline]
pub fn literal_finder(&self) -> Option<&Finder<'_>> {
self.literal.as_ref()
}
#[inline]
pub fn prefix_finder(&self) -> Option<&Finder<'_>> {
self.prefix.as_ref()
}
pub fn required_trigrams(&self) -> Vec<[u8; 3]> {
let needle = self
.literal
.as_ref()
.map(|f| f.needle())
.or_else(|| self.prefix.as_ref().map(|f| f.needle()));
match needle {
Some(bytes) if bytes.len() >= 3 => {
let mut seen = std::collections::HashSet::new();
for w in bytes.windows(3) {
seen.insert([w[0], w[1], w[2]]);
}
seen.into_iter().collect()
}
_ => Vec::new(),
}
}
fn make_cache_key(config: &ResolvedConfig) -> String {
let mut hasher = DefaultHasher::new();
config.patterns.hash(&mut hasher);
config.ignore_case.hash(&mut hasher);
config.invert_match.hash(&mut hasher);
config.word_regexp.hash(&mut hasher);
config.fixed_strings.hash(&mut hasher);
format!("{:016x}", hasher.finish())
}
}
/// Escapes every `{` that cannot start a counted repetition so the regex parser treats
/// it as a literal brace.
///
/// A `{` is left alone when it is preceded by a backslash, sits inside a bracketed
/// character class, or begins `{m}`, `{m,}` or `{m,n}` (as judged by
/// `is_valid_repetition`). Every other character is copied through unchanged, including
/// multi-byte UTF-8 characters.
fn sanitize_braces(pattern: &str) -> String {
    let mut out = String::with_capacity(pattern.len());
    let mut in_class = false;
    let mut chars = pattern.char_indices();

    while let Some((idx, c)) = chars.next() {
        match c {
            // Copy an escape together with the character it protects; a trailing lone
            // backslash is copied on its own.
            '\\' => {
                out.push(c);
                if let Some((_, escaped)) = chars.next() {
                    out.push(escaped);
                }
            }
            ']' if in_class => {
                in_class = false;
                out.push(c);
            }
            // Braces are already literal inside a character class.
            _ if in_class => out.push(c),
            '[' => {
                in_class = true;
                out.push(c);
            }
            '{' if !is_valid_repetition(&pattern.as_bytes()[idx..]) => out.push_str("\\{"),
            _ => out.push(c),
        }
    }
    out
}
/// Checks whether `bytes`, which must begin with `{`, opens a counted repetition of the
/// form `{m}`, `{m,}` or `{m,n}` where `m` and `n` are runs of ASCII digits.
///
/// The lower bound is mandatory, so `{,n}` and `{}` are rejected. Anything after the
/// closing `}` is ignored.
fn is_valid_repetition(bytes: &[u8]) -> bool {
    debug_assert_eq!(bytes[0], b'{');

    // Length of the run of ASCII digits at the front of a slice.
    let digit_run = |s: &[u8]| s.iter().take_while(|b| b.is_ascii_digit()).count();

    let after_brace = bytes.get(1..).unwrap_or(&[]);
    let min_len = digit_run(after_brace);
    if min_len == 0 {
        return false;
    }

    let mut rest = &after_brace[min_len..];
    if let Some((&b',', tail)) = rest.split_first() {
        // The upper bound is optional: skip however many digits follow the comma.
        rest = &tail[digit_run(tail)..];
    }
    rest.first() == Some(&b'}')
}
/// Returns `true` when `pattern` contains none of the regex metacharacters
/// `. * + ? ( ) [ ] { } | ^ $ \`, i.e. when it can be searched for as plain text.
fn is_literal(pattern: &str) -> bool {
    pattern.chars().all(|c| {
        !matches!(
            c,
            '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
        )
    })
}
/// Returns the literal text that every match of `pattern` must start with, or an empty
/// string when no such text can be determined.
///
/// Scanning stops at the first unescaped metacharacter or at a backslash escape that
/// is not an escaped metacharacter (such as `\d`). Escaped metacharacters (`\.`, `\\`, …)
/// contribute the character they stand for. A pattern with a top-level alternation has
/// no common prefix and yields an empty string.
///
/// When scanning stops at `*`, `?` or `{`, the quantifier may allow the preceding
/// character to occur zero times, so that character is removed from the result:
/// `abc*` yields `ab`, not `abc`. This is deliberately conservative for `{`, which may
/// also be a literal brace or a repetition with a non-zero minimum.
fn extract_literal_prefix(pattern: &str) -> String {
    if has_top_level_alternation(pattern) {
        return String::new();
    }

    let mut prefix = String::new();
    let mut chars = pattern.chars();
    while let Some(c) = chars.next() {
        match c {
            '*' | '?' | '{' => {
                // The quantified character is optional, hence not required text.
                prefix.pop();
                break;
            }
            // `+` keeps at least one occurrence, so the preceding character stays.
            '.' | '+' | '(' | ')' | '[' | ']' | '}' | '|' | '^' | '$' => break,
            '\\' => match chars.next() {
                Some(
                    escaped @ ('.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
                    | '^' | '$' | '\\'),
                ) => prefix.push(escaped),
                // Class escapes such as `\d`, or a trailing backslash, end the literal run.
                _ => break,
            },
            _ => prefix.push(c),
        }
    }
    prefix
}
/// Reports whether `pattern` contains a `|` that is outside every group and every
/// bracketed character class.
///
/// Groups and character classes are tracked separately: inside `[...]` the characters
/// `(`, `)` and `|` are ordinary class members and must not affect group depth. A `]`
/// placed first in a class (optionally after `^`) is a literal member rather than the
/// terminator, and a `[` inside a class opens a nested class. A backslash always skips
/// the character that follows it.
fn has_top_level_alternation(pattern: &str) -> bool {
    let mut group_depth: u32 = 0;
    let mut class_depth: u32 = 0;
    let mut chars = pattern.chars().peekable();

    while let Some(c) = chars.next() {
        match c {
            '\\' => {
                // Skip the escaped character, whatever it is.
                chars.next();
            }
            '[' => {
                class_depth += 1;
                // Consume an optional negation marker, then a leading `]`, which is a
                // literal class member in that position.
                if chars.peek() == Some(&'^') {
                    chars.next();
                }
                if chars.peek() == Some(&']') {
                    chars.next();
                }
            }
            ']' if class_depth > 0 => class_depth -= 1,
            // Everything else inside a class is a plain member.
            _ if class_depth > 0 => {}
            '(' => group_depth += 1,
            ')' => group_depth = group_depth.saturating_sub(1),
            '|' if group_depth == 0 => return true,
            _ => {}
        }
    }
    false
}