use std::borrow::Cow;
use std::sync::OnceLock;
#[derive(Debug)]
pub enum CompiledRegex {
Linear(regex::Regex),
Backtracking(fancy_regex::Regex),
}
impl CompiledRegex {
pub fn new(pattern: &str) -> Result<Self, String> {
if needs_backtracking_engine(pattern) {
fancy_regex::Regex::new(pattern)
.map(Self::Backtracking)
.map_err(|e| format!("fancy_regex compile error: {e}"))
} else {
match regex::Regex::new(pattern) {
Ok(re) => Ok(Self::Linear(re)),
Err(e) => {
fancy_regex::Regex::new(pattern)
.map(Self::Backtracking)
.map_err(|fancy_err| {
format!(
"regex compile error: {e}, fancy_regex compile error: {fancy_err}"
)
})
}
}
}
}
pub fn new_linear(pattern: &str) -> Result<Self, String> {
regex::Regex::new(pattern)
.map(Self::Linear)
.map_err(|e| format!("regex compile error: {e}"))
}
pub fn new_backtracking(pattern: &str) -> Result<Self, String> {
fancy_regex::Regex::new(pattern)
.map(Self::Backtracking)
.map_err(|e| format!("fancy_regex compile error: {e}"))
}
#[must_use]
pub fn is_match(&self, text: &str) -> bool {
match self {
Self::Linear(re) => re.is_match(text),
Self::Backtracking(re) => re.is_match(text).unwrap_or(false),
}
}
#[must_use]
pub fn find(&self, text: &str) -> Option<(usize, usize)> {
match self {
Self::Linear(re) => re.find(text).map(|m| (m.start(), m.end())),
Self::Backtracking(re) => re.find(text).ok().flatten().map(|m| (m.start(), m.end())),
}
}
#[must_use]
pub fn as_str(&self) -> &str {
match self {
Self::Linear(re) => re.as_str(),
Self::Backtracking(re) => re.as_str(),
}
}
#[must_use]
pub const fn uses_backtracking(&self) -> bool {
matches!(self, Self::Backtracking(_))
}
#[must_use]
pub fn replacen<'t>(&self, text: &'t str, limit: usize, rep: &str) -> Cow<'t, str> {
match self {
Self::Linear(re) => re.replacen(text, limit, rep),
Self::Backtracking(re) => re
.try_replacen(text, limit, rep)
.unwrap_or(Cow::Borrowed(text)),
}
}
}
#[must_use]
pub fn needs_backtracking_engine(pattern: &str) -> bool {
if pattern.contains("(?=")
|| pattern.contains("(?!")
|| pattern.contains("(?<=")
|| pattern.contains("(?<!")
|| pattern.contains("(?>")
{
return true;
}
if pattern.contains("*+")
|| pattern.contains("++")
|| pattern.contains("?+")
|| pattern.contains("}+")
{
return true;
}
let bytes = pattern.as_bytes();
for i in 0..bytes.len().saturating_sub(1) {
if bytes[i] == b'\\' {
let next = bytes[i + 1];
if next.is_ascii_digit() && next != b'0' {
return true;
}
}
}
false
}
#[derive(Debug)]
pub struct LazyCompiledRegex {
pattern: PatternText,
compiled: OnceLock<Result<CompiledRegex, String>>,
}
#[derive(Debug)]
enum PatternText {
Static(&'static str),
Owned(String),
}
impl PatternText {
fn as_str(&self) -> &str {
match self {
Self::Static(pattern) => pattern,
Self::Owned(pattern) => pattern.as_str(),
}
}
}
impl LazyCompiledRegex {
#[must_use]
pub const fn new(pattern: &'static str) -> Self {
Self {
pattern: PatternText::Static(pattern),
compiled: OnceLock::new(),
}
}
#[must_use]
#[allow(clippy::missing_const_for_fn)]
pub fn new_owned(pattern: String) -> Self {
Self {
pattern: PatternText::Owned(pattern),
compiled: OnceLock::new(),
}
}
fn get_compiled(&self) -> Option<&CompiledRegex> {
self.compiled
.get_or_init(|| CompiledRegex::new(self.pattern.as_str()))
.as_ref()
.ok()
}
#[must_use]
pub fn is_match(&self, haystack: &str) -> bool {
self.get_compiled()
.is_some_and(|compiled| compiled.is_match(haystack))
}
#[must_use]
pub fn find(&self, haystack: &str) -> Option<(usize, usize)> {
self.get_compiled()
.and_then(|compiled| compiled.find(haystack))
}
#[must_use]
pub fn as_str(&self) -> &str {
self.pattern.as_str()
}
#[must_use]
pub fn is_compiled(&self) -> bool {
matches!(self.compiled.get(), Some(Ok(_)))
}
}
impl std::fmt::Display for LazyCompiledRegex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.as_str())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_linear_engine_selection() {
let re = CompiledRegex::new(r"rm\s+-rf").unwrap();
assert!(!re.uses_backtracking());
assert!(re.is_match("rm -rf /"));
}
#[test]
fn test_backtracking_engine_selection() {
let re = CompiledRegex::new(r"git\s+push(?=.*--force)").unwrap();
assert!(re.uses_backtracking());
assert!(re.is_match("git push --force"));
assert!(!re.is_match("git push"));
}
#[test]
fn test_lookbehind() {
let re = CompiledRegex::new(r"(?<=drop\s)database").unwrap();
assert!(re.uses_backtracking());
assert!(re.is_match("drop database"));
}
#[test]
fn test_negative_lookahead() {
let re = CompiledRegex::new(r"rm(?!\s+--dry-run)").unwrap();
assert!(re.uses_backtracking());
assert!(re.is_match("rm -rf"));
assert!(!re.is_match("rm --dry-run"));
}
#[test]
fn test_needs_backtracking_detection() {
assert!(!needs_backtracking_engine(r"simple"));
assert!(!needs_backtracking_engine(r"git\s+status"));
assert!(!needs_backtracking_engine(r"\d+\.\d+")); assert!(!needs_backtracking_engine(r"foo\0bar"));
assert!(needs_backtracking_engine(r"(?=lookahead)"));
assert!(needs_backtracking_engine(r"(?!negative)"));
assert!(needs_backtracking_engine(r"(?<=lookbehind)"));
assert!(needs_backtracking_engine(r"(?<!negative-behind)"));
assert!(needs_backtracking_engine(r"(foo)\1"));
assert!(needs_backtracking_engine(r"(\w+)\s+\1"));
assert!(needs_backtracking_engine(r"(a)(b)\2\1"));
assert!(needs_backtracking_engine(
r"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.).\10"
));
}
#[test]
fn test_backreference_pattern() {
let re = CompiledRegex::new(r"(\w+)\s+\1").unwrap();
assert!(re.uses_backtracking());
assert!(re.is_match("hello hello"));
assert!(re.is_match("the the"));
assert!(!re.is_match("hello world"));
}
#[test]
fn test_find_linear() {
let re = CompiledRegex::new(r"rm").unwrap();
assert!(!re.uses_backtracking());
assert_eq!(re.find("test rm command"), Some((5, 7)));
assert_eq!(re.find("no match"), None);
}
#[test]
fn test_find_backtracking() {
let re = CompiledRegex::new(r"git(?=\s+push)").unwrap();
assert!(re.uses_backtracking());
assert_eq!(re.find("run git push"), Some((4, 7)));
assert_eq!(re.find("git status"), None); }
#[test]
fn test_replacen_linear() {
let re = CompiledRegex::new(r"foo").unwrap();
assert!(!re.uses_backtracking());
assert_eq!(re.replacen("foo bar foo", 1, "baz"), "baz bar foo");
assert_eq!(re.replacen("foo bar foo", 0, "baz"), "baz bar baz"); }
#[test]
fn test_replacen_backtracking() {
let re = CompiledRegex::new(r"(\w+)\s+\1").unwrap();
assert!(re.uses_backtracking());
assert_eq!(re.replacen("the the cat", 1, "DUPE"), "DUPE cat");
}
#[test]
fn test_worst_case_alternation() {
let re = CompiledRegex::new(r"(a|a)+").unwrap();
assert!(!re.uses_backtracking());
let input = "a".repeat(100);
assert!(re.is_match(&input));
}
#[test]
fn test_worst_case_nested_quantifiers() {
let re = CompiledRegex::new(r"(a+)+$").unwrap();
assert!(!re.uses_backtracking());
let mut input = "a".repeat(50);
input.push('!'); assert!(!re.is_match(&input)); }
#[test]
fn test_worst_case_star_quantifier() {
let re = CompiledRegex::new(r"a*a*a*a*a*b").unwrap();
assert!(!re.uses_backtracking());
let input = "a".repeat(100);
assert!(!re.is_match(&input)); }
#[test]
fn test_worst_case_dot_star() {
let re = CompiledRegex::new(r".*.*.*=").unwrap();
assert!(!re.uses_backtracking());
let input = "x".repeat(100);
assert!(!re.is_match(&input)); }
#[test]
fn test_linear_engine_long_input() {
let re = CompiledRegex::new(r"rm\s+-rf\s+/").unwrap();
assert!(!re.uses_backtracking());
let mut cmd = "echo ".to_string();
cmd.push_str(&"x".repeat(10_000));
cmd.push_str(" && rm -rf / && ");
cmd.push_str(&"y".repeat(10_000));
assert!(re.is_match(&cmd));
}
#[test]
fn test_backtracking_safety() {
let re = CompiledRegex::new(r"git\s+push(?=.*--force)").unwrap();
assert!(re.uses_backtracking());
let cmd = format!("git push {} --force", "branch".repeat(100));
assert!(re.is_match(&cmd));
}
#[test]
fn test_as_str_preserves_pattern() {
let pattern = r"test\s+pattern";
let re = CompiledRegex::new(pattern).unwrap();
assert_eq!(re.as_str(), pattern);
}
#[test]
fn test_as_str_backtracking() {
let pattern = r"foo(?=bar)";
let re = CompiledRegex::new(pattern).unwrap();
assert!(re.uses_backtracking());
assert_eq!(re.as_str(), pattern);
}
#[test]
fn test_empty_pattern() {
let re = CompiledRegex::new("").unwrap();
assert!(!re.uses_backtracking()); assert!(re.is_match("")); assert!(re.is_match("anything")); assert_eq!(re.find("test"), Some((0, 0))); }
#[test]
fn test_compile_error_linear() {
let result = CompiledRegex::new_linear(r"(unclosed");
assert!(result.is_err());
assert!(result.unwrap_err().contains("regex compile error"));
}
#[test]
fn test_compile_error_backtracking() {
let result = CompiledRegex::new_backtracking(r"(unclosed");
assert!(result.is_err());
assert!(result.unwrap_err().contains("fancy_regex compile error"));
}
#[test]
fn test_lazy_regex_not_compiled_initially() {
let lazy = LazyCompiledRegex::new(r"test\s+pattern");
assert!(!lazy.is_compiled());
}
#[test]
fn test_lazy_regex_compiles_on_first_use() {
let lazy = LazyCompiledRegex::new(r"test\s+pattern");
assert!(!lazy.is_compiled());
let _ = lazy.is_match("test pattern");
assert!(lazy.is_compiled());
}
#[test]
fn test_lazy_regex_is_match_same_as_eager() {
let pattern = r"git\s+reset\s+--hard";
let lazy = LazyCompiledRegex::new(pattern);
let eager = CompiledRegex::new(pattern).unwrap();
let inputs = [
"git reset --hard",
"git reset --hard HEAD",
" git reset --hard ",
"git status", "reset --hard", "",
];
for input in inputs {
assert_eq!(
lazy.is_match(input),
eager.is_match(input),
"Mismatch for input: {input:?}"
);
}
}
#[test]
fn test_lazy_regex_find_span_same_as_eager() {
let pattern = r"rm\s+-rf";
let lazy = LazyCompiledRegex::new(pattern);
let eager = CompiledRegex::new(pattern).unwrap();
let inputs = [
"rm -rf /",
"sudo rm -rf /tmp",
"echo rm -rf",
"rm command",
"",
];
for input in inputs {
let lazy_span = lazy.find(input);
let eager_span = eager.find(input);
assert_eq!(lazy_span, eager_span, "Span mismatch for input: {input:?}");
}
}
#[test]
fn test_lazy_regex_as_str() {
let pattern = r"test\s+pattern";
let lazy = LazyCompiledRegex::new(pattern);
assert_eq!(lazy.as_str(), pattern);
}
#[test]
fn test_lazy_regex_lookahead() {
let lazy = LazyCompiledRegex::new(r"git\s+push(?=.*--force)");
assert!(lazy.is_match("git push --force"));
assert!(lazy.is_match("git push origin main --force"));
assert!(!lazy.is_match("git push"));
assert!(!lazy.is_match("git push origin main"));
}
#[test]
fn test_lazy_regex_lookbehind() {
let lazy = LazyCompiledRegex::new(r"(?<=drop\s)database");
assert!(lazy.is_match("drop database"));
assert!(!lazy.is_match("database"));
}
#[test]
fn test_lazy_regex_span_with_lookahead() {
let lazy = LazyCompiledRegex::new(r"git(?=\s+push)");
assert_eq!(lazy.find("git push"), Some((0, 3)));
assert_eq!(lazy.find("run git push now"), Some((4, 7)));
assert_eq!(lazy.find("git status"), None);
}
#[test]
fn test_lazy_regex_static_usage() {
static PATTERN: LazyCompiledRegex = LazyCompiledRegex::new(r"hello\s+world");
assert!(PATTERN.is_match("hello world"));
assert!(!PATTERN.is_match("hello"));
}
#[test]
fn test_lazy_regex_empty_pattern() {
let lazy = LazyCompiledRegex::new("");
assert!(lazy.is_match(""));
assert!(lazy.is_match("anything"));
assert_eq!(lazy.find("test"), Some((0, 0)));
}
#[test]
fn test_lazy_regex_reuses_compiled() {
let lazy = LazyCompiledRegex::new(r"test");
assert!(lazy.is_match("test"));
assert!(lazy.is_match("test again"));
assert!(lazy.is_match("another test"));
assert!(lazy.is_compiled());
}
}