use regex::Regex;
use regex_lite as regex;
use std::fmt;
/// A single lookaround assertion that was split off from a pattern and is
/// checked separately against the text surrounding a candidate match.
///
/// Every variant keeps the assertion's source text (`pattern`) next to the
/// compiled `regex` that `verify` consults.
#[derive(Debug, Clone)]
pub enum Lookaround {
/// Positive lookahead: the text following the match must satisfy the assertion.
/// `pattern` is also inspected by `verify` to select hand-written fast paths.
Ahead { pattern: String, regex: Regex },
/// Positive lookbehind: the text preceding the match must satisfy the assertion.
Behind {
// `verify` does not read `pattern` for this variant; it is still stored
// (and printed by the derived `Debug`), hence the lint allowance.
#[allow(dead_code)]
pattern: String,
regex: Regex,
},
/// Negative lookahead: the text at the checked position must NOT satisfy the assertion.
NegAhead {
// Not read by `verify` for this variant.
#[allow(dead_code)]
pattern: String,
regex: Regex,
/// When `true`, `verify` tests the text at `match_start` instead of `match_end`.
check_at_start: bool,
},
/// Negative lookbehind: the text preceding the match must NOT satisfy the assertion.
NegBehind {
// Not read by `verify` for this variant.
#[allow(dead_code)]
pattern: String,
regex: Regex,
},
}
impl Lookaround {
pub fn ahead(pattern: &str) -> Result<Self, regex::Error> {
Ok(Lookaround::Ahead {
pattern: pattern.to_string(),
regex: Regex::new(pattern)?,
})
}
pub fn behind(pattern: &str) -> Result<Self, regex::Error> {
Ok(Lookaround::Behind {
pattern: pattern.to_string(),
regex: Regex::new(pattern)?,
})
}
pub fn neg_ahead(pattern: &str) -> Result<Self, regex::Error> {
Ok(Lookaround::NegAhead {
pattern: pattern.to_string(),
regex: Regex::new(pattern)?,
check_at_start: false,
})
}
pub fn neg_ahead_at_start(pattern: &str) -> Result<Self, regex::Error> {
Ok(Lookaround::NegAhead {
pattern: pattern.to_string(),
regex: Regex::new(pattern)?,
check_at_start: true,
})
}
pub fn neg_behind(pattern: &str) -> Result<Self, regex::Error> {
Ok(Lookaround::NegBehind {
pattern: pattern.to_string(),
regex: Regex::new(pattern)?,
})
}
pub fn verify(&self, text: &str, match_start: usize, match_end: usize) -> bool {
match self {
Lookaround::Ahead { regex, pattern } => {
match pattern.as_str() {
r"\s|$" | r"$|\s" => {
if match_end >= text.len() {
return true;
}
let ch = text.as_bytes()[match_end];
return ch == b' ' || ch == b'\t' || ch == b'\n' || ch == b'\r';
}
r"\s" => {
if match_end >= text.len() {
return false;
}
let ch = text.as_bytes()[match_end];
return ch == b' ' || ch == b'\t' || ch == b'\n' || ch == b'\r';
}
"$" => {
return match_end >= text.len();
}
r"\s[A-Z]" => {
if match_end + 1 >= text.len() {
return false;
}
let bytes = text.as_bytes();
let ch1 = bytes[match_end];
let ch2 = bytes[match_end + 1];
return (ch1 == b' ' || ch1 == b'\t' || ch1 == b'\n' || ch1 == b'\r')
&& ch2.is_ascii_uppercase();
}
r"\s[A-Z][a-z]{2}\s" => {
if match_end + 4 >= text.len() {
return false;
}
let bytes = text.as_bytes();
let ch0 = bytes[match_end];
let ch1 = bytes[match_end + 1];
let ch2 = bytes[match_end + 2];
let ch3 = bytes[match_end + 3];
let ch4 = bytes[match_end + 4];
return (ch0 == b' ' || ch0 == b'\t')
&& ch1.is_ascii_uppercase()
&& ch2.is_ascii_lowercase()
&& ch3.is_ascii_lowercase()
&& (ch4 == b' ' || ch4 == b'\t');
}
"[:/]" => {
if match_end >= text.len() {
return false;
}
let ch = text.as_bytes()[match_end];
return ch == b':' || ch == b'/';
}
r"\.\d+\.\d+\.\d+" => {
if match_end + 6 > text.len() {
return false;
}
let bytes = &text.as_bytes()[match_end..];
if bytes.is_empty() || bytes[0] != b'.' {
return false;
}
let remaining = &text[match_end..];
if let Some(mat) = regex.find(remaining) {
return mat.start() == 0;
}
return false;
}
r"[KMG]B?" => {
if match_end >= text.len() {
return false;
}
let bytes = text.as_bytes();
let ch1 = bytes[match_end];
if ch1 == b'K' || ch1 == b'M' || ch1 == b'G' {
if match_end + 1 < bytes.len() && bytes[match_end + 1] == b'B' {
return true;
}
return true;
}
return false;
}
"[KMGT]" => {
if match_end >= text.len() {
return false;
}
let ch = text.as_bytes()[match_end];
return ch == b'K' || ch == b'M' || ch == b'G' || ch == b'T';
}
_ => {
}
}
let remaining = &text[match_end..];
if let Some(mat) = regex.find(remaining) {
mat.start() == 0
} else {
false
}
}
Lookaround::Behind { regex, .. } => {
if match_start == 0 {
return regex.is_match("");
}
if let Some(last_match) = regex.find_iter(&text[..match_start]).last() {
last_match.end() == match_start
} else {
false
}
}
Lookaround::NegAhead {
regex,
check_at_start,
..
} => {
let check_pos = if *check_at_start {
match_start
} else {
match_end
};
let remaining = &text[check_pos..];
if let Some(mat) = regex.find(remaining) {
mat.start() != 0
} else {
true }
}
Lookaround::NegBehind { regex, .. } => {
if match_start == 0 {
return !regex.is_match("");
}
if let Some(last_match) = regex.find_iter(&text[..match_start]).last() {
last_match.end() != match_start
} else {
true }
}
}
}
}
/// A regex with lookaround support layered on top of the underlying engine:
/// the lookarounds are split off the pattern and verified separately around
/// each candidate match of the remaining pattern.
#[derive(Clone)]
pub struct EnhancedRegex {
// Compiled from the main pattern that `parse_pattern` returns.
main_regex: Regex,
// Assertions every candidate match has to satisfy (see `verify_lookarounds`).
lookarounds: Vec<Lookaround>,
// The pattern text exactly as it was handed to `new`; returned by `as_str`.
original_pattern: String,
}
impl fmt::Debug for EnhancedRegex {
    /// Prints the original pattern text and the number of attached
    /// lookarounds; the compiled regexes themselves are omitted.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let lookaround_count = self.lookarounds.len();
        let mut builder = f.debug_struct("EnhancedRegex");
        builder.field("pattern", &self.original_pattern);
        builder.field("lookarounds", &lookaround_count);
        builder.finish()
    }
}
impl EnhancedRegex {
    /// Compiles `pattern`, splitting its lookarounds off via `parse_pattern`.
    ///
    /// # Errors
    /// Returns the regex error if the main pattern or any lookaround fails
    /// to compile.
    pub fn new(pattern: &str) -> Result<Self, regex::Error> {
        let (main_pattern, lookarounds) = parse_pattern(pattern)?;
        Ok(EnhancedRegex {
            main_regex: Regex::new(&main_pattern)?,
            lookarounds,
            original_pattern: pattern.to_string(),
        })
    }

    /// Finds the first match at or after byte offset `start` whose
    /// lookarounds all hold.
    ///
    /// Without lookarounds this is a plain `find_at`. Otherwise each
    /// candidate of the main regex is verified; when a candidate fails, a
    /// limited set of shorter matches with the same start is tried before
    /// the search resumes one character after the candidate's start.
    pub fn find_from_pos<'t>(&self, text: &'t str, start: usize) -> Option<regex::Match<'t>> {
        if self.lookarounds.is_empty() {
            return self.main_regex.find_at(text, start);
        }
        let mut pos = start;
        while pos < text.len() {
            let mat = self.main_regex.find_at(text, pos)?;
            let (match_start, match_end) = (mat.start(), mat.end());
            if self.verify_lookarounds(text, match_start, match_end) {
                return Some(mat);
            }
            if let Some(shorter) = self.find_shorter_match(text, match_start, match_end) {
                return Some(shorter);
            }
            // Step by a whole character so the next search (and any slicing
            // derived from it) never starts inside a multi-byte sequence.
            pos = Self::next_char_boundary(text, match_start);
        }
        None
    }

    /// Returns an iterator over the successive matches in `text`.
    pub fn find_iter<'t>(&self, text: &'t str) -> EnhancedMatches<'_, 't> {
        EnhancedMatches {
            regex: self,
            text,
            last_pos: 0,
        }
    }

    /// Like [`Self::find_from_pos`] but returns the capture groups of the
    /// first verified match. No shorter-match retry is attempted here.
    pub fn captures_from_pos<'t>(
        &self,
        text: &'t str,
        start: usize,
    ) -> Option<regex::Captures<'t>> {
        let mut pos = start;
        while pos < text.len() {
            let caps = self.main_regex.captures_at(text, pos)?;
            let (match_start, match_end) = {
                let whole = caps
                    .get(0)
                    .expect("group 0 is always present in a successful match");
                (whole.start(), whole.end())
            };
            if self.verify_lookarounds(text, match_start, match_end) {
                return Some(caps);
            }
            pos = Self::next_char_boundary(text, match_start);
        }
        None
    }

    /// Returns `true` when every lookaround accepts the candidate span
    /// (trivially `true` when there are none).
    #[inline]
    fn verify_lookarounds(&self, text: &str, match_start: usize, match_end: usize) -> bool {
        self.lookarounds
            .iter()
            .all(|lookaround| lookaround.verify(text, match_start, match_end))
    }

    /// Returns `true` when `text` contains at least one verified match.
    pub fn is_match(&self, text: &str) -> bool {
        self.find_from_pos(text, 0).is_some()
    }

    /// Returns the pattern text exactly as it was passed to [`Self::new`].
    // Part of the public surface even though nothing in this file calls it.
    #[allow(dead_code)]
    pub fn as_str(&self) -> &str {
        &self.original_pattern
    }

    /// Tries shorter matches starting at `match_start` after the full
    /// candidate `match_start..match_end` failed its lookarounds.
    ///
    /// Candidate ends are tried from the largest down. For candidates longer
    /// than 10 bytes they range from `match_end - 5` down to
    /// `match_start + 1`; for shorter ones only `match_start + 1` is tried.
    // NOTE(review): that window is inherited unchanged; it looks like a
    // heuristic rather than full backtracking — confirm the intent.
    fn find_shorter_match<'t>(
        &self,
        text: &'t str,
        match_start: usize,
        match_end: usize,
    ) -> Option<regex::Match<'t>> {
        const MIN_LENGTH: usize = 1;
        let length = match_end - match_start;
        // An empty candidate has nothing to give back; saturating keeps the
        // subtraction from underflowing in that case.
        let backtrack_chars = if length > 10 {
            5
        } else {
            length.saturating_sub(MIN_LENGTH)
        };
        let backtrack_start = match_end.saturating_sub(backtrack_chars);
        if backtrack_start <= match_start {
            return None;
        }
        for try_end in (match_start + MIN_LENGTH..=backtrack_start).rev() {
            // Offsets inside a multi-byte character cannot end a match and
            // would make the slices below panic.
            if !text.is_char_boundary(try_end) {
                continue;
            }
            let substring = &text[match_start..try_end];
            let covers_substring = self
                .main_regex
                .find(substring)
                .is_some_and(|sub| sub.start() == 0 && sub.end() == substring.len());
            if !covers_substring || !self.verify_lookarounds(text, match_start, try_end) {
                continue;
            }
            // Re-run the search on the truncated text so the returned match
            // carries offsets relative to `text`.
            let restricted_text = &text[..try_end];
            if let Some(final_mat) = self.main_regex.find_at(restricted_text, match_start) {
                if final_mat.start() == match_start && final_mat.end() == try_end {
                    return Some(final_mat);
                }
            }
        }
        None
    }

    /// Returns the first character boundary strictly after `index`
    /// (possibly `text.len()` or beyond when `index` is at the end).
    fn next_char_boundary(text: &str, index: usize) -> usize {
        let mut next = index + 1;
        while next < text.len() && !text.is_char_boundary(next) {
            next += 1;
        }
        next
    }
}
/// Iterator over the matches of an `EnhancedRegex` in one piece of text,
/// as created by `EnhancedRegex::find_iter`.
pub struct EnhancedMatches<'r, 't> {
// The regex whose matches are being enumerated.
regex: &'r EnhancedRegex,
// The text being searched.
text: &'t str,
// Byte offset at which the next search starts.
last_pos: usize,
}
impl<'r, 't> Iterator for EnhancedMatches<'r, 't> {
    type Item = regex::Match<'t>;

    /// Yields the next match at or after the current search offset.
    ///
    /// After a non-empty match the search resumes at its end. After an empty
    /// match the offset is moved one character further, because resuming at
    /// the same offset would return the same empty match forever.
    fn next(&mut self) -> Option<Self::Item> {
        // The offset can end up one past the end after an empty match at the
        // very end of the text; searching from there is out of range.
        if self.last_pos > self.text.len() {
            return None;
        }
        let mat = self.regex.find_from_pos(self.text, self.last_pos)?;
        self.last_pos = if mat.end() > mat.start() {
            mat.end()
        } else {
            let step = self.text[mat.end()..]
                .chars()
                .next()
                .map_or(1, char::len_utf8);
            mat.end() + step
        };
        Some(mat)
    }
}
/// Extracts the body of the `(?...)` group that starts at character index
/// `start` of `pattern`.
///
/// Returns `(end, content)` where `end` is the character index just past the
/// group's closing parenthesis and `content` is the group body with the type
/// marker removed: an optional `<` followed by an optional `=` or `!`.
/// Only the marker itself is stripped, so a body that begins with `=`, `!`
/// or `<` (for example `(?==)`) keeps its first character.
///
/// Returns `None` when `start` does not point at `(?` or when the group is
/// never closed. Escaped characters and parentheses inside a `[...]` class
/// are not counted when balancing.
///
/// Both `start` and the returned `end` are CHARACTER indices, not byte offsets.
fn extract_lookaround_content(pattern: &str, start: usize) -> Option<(usize, String)> {
    let chars: Vec<char> = pattern.chars().collect();
    if start + 2 >= chars.len() || chars[start] != '(' || chars[start + 1] != '?' {
        return None;
    }

    let mut depth = 1usize;
    let mut in_class = false;
    let mut i = start + 2;
    while i < chars.len() && depth > 0 {
        match chars[i] {
            // Skip whatever the backslash escapes.
            '\\' => i += 1,
            '[' if !in_class => in_class = true,
            ']' if in_class => in_class = false,
            '(' if !in_class => depth += 1,
            ')' if !in_class => depth -= 1,
            _ => {}
        }
        i += 1;
    }
    if depth != 0 {
        return None;
    }

    // `chars[i - 1]` is the closing parenthesis, so the two marker probes
    // below can never run past it.
    let mut content_start = start + 2;
    if chars[content_start] == '<' {
        content_start += 1;
    }
    if matches!(chars[content_start], '=' | '!') {
        content_start += 1;
    }
    let content: String = chars[content_start..i - 1].iter().collect();
    Some((i, content))
}
fn preprocess_pattern(pattern: &str) -> String {
let mut result = pattern.to_string();
result = fix_invalid_escapes_outside_char_class(&result);
result = fix_character_class_escapes(&result);
result = fix_boundary_in_character_class(&result);
result = fix_variable_length_lookbehind(&result);
result
}
/// Rewrites `\>` and `\<` to plain `>` and `<` wherever they occur outside a
/// `[...]` character class; every other escape is copied through unchanged.
///
/// Outside a class an escape is consumed as a pair (backslash plus the
/// escaped character). That keeps an escaped backslash from pairing up with
/// the following character (`\\>` stays `\\>`) and keeps `\[` from being
/// mistaken for the start of a class.
///
/// Inside a class the text is copied through character by character; escapes
/// there are left for the character-class pass.
fn fix_invalid_escapes_outside_char_class(pattern: &str) -> String {
    let mut output = String::with_capacity(pattern.len());
    let mut in_char_class = false;
    let mut chars = pattern.chars();
    while let Some(ch) = chars.next() {
        match ch {
            '[' => in_char_class = true,
            ']' if in_char_class => in_char_class = false,
            '\\' if !in_char_class => {
                match chars.next() {
                    // Drop the backslash: `>` and `<` need no escaping.
                    Some(next @ ('>' | '<')) => output.push(next),
                    Some(next) => {
                        output.push('\\');
                        output.push(next);
                    }
                    // A trailing lone backslash is preserved as-is.
                    None => output.push('\\'),
                }
                continue;
            }
            _ => {}
        }
        output.push(ch);
    }
    output
}
/// Rewrites `\>` and `\<` to plain `>` and `<` inside `[...]` character
/// classes; every other escape inside a class is copied through unchanged.
///
/// Inside a class an escape is consumed as a pair, so an escaped backslash
/// cannot pair up with the following character (`[\\>]` is preserved) and an
/// escaped `]` does not close the class (`[\]\<]` becomes `[\]<]`).
/// A `]` that directly follows the opening `[` is kept as a literal member
/// of the class, and a `[` inside a class is a literal too.
///
/// Text outside a class is copied through character by character.
fn fix_character_class_escapes(pattern: &str) -> String {
    let mut output = String::with_capacity(pattern.len());
    let mut in_char_class = false;
    // True only while looking at the character right after the opening `[`.
    let mut at_class_start = false;
    let mut chars = pattern.chars();
    while let Some(ch) = chars.next() {
        let opened_here = ch == '[' && !in_char_class;
        match ch {
            '[' if opened_here => {
                in_char_class = true;
                output.push(ch);
            }
            ']' if in_char_class && !at_class_start => {
                in_char_class = false;
                output.push(ch);
            }
            '\\' if in_char_class => match chars.next() {
                // Drop the backslash: `>` and `<` need no escaping.
                Some(next @ ('>' | '<')) => output.push(next),
                Some(next) => {
                    output.push('\\');
                    output.push(next);
                }
                // A trailing lone backslash is preserved as-is.
                None => output.push('\\'),
            },
            _ => output.push(ch),
        }
        at_class_start = opened_here;
    }
    output
}
/// Rewrites a character class that contains `\b` into an alternation of the
/// class without it and a word boundary: `[:\b]` becomes `(?:[:]|\b)`.
///
/// Details:
/// - Escapes are read as pairs, both inside and outside classes, so `[\\b]`
///   (escaped backslash followed by `b`) is left alone and `\[` does not
///   open a class.
/// - Every `\b` in the class is removed, not just one.
/// - When nothing else is left in the class the result is `(?:\b)`.
/// - Classes without `\b` are copied through unchanged.
fn fix_boundary_in_character_class(pattern: &str) -> String {
    let chars: Vec<char> = pattern.chars().collect();
    let mut output = String::with_capacity(pattern.len());
    let mut i = 0;
    while i < chars.len() {
        match chars[i] {
            // Escape outside a class: copy the pair verbatim.
            '\\' if i + 1 < chars.len() => {
                output.push('\\');
                output.push(chars[i + 1]);
                i += 2;
            }
            '[' => {
                i += 1;
                let mut class_content = String::new();
                let mut has_boundary = false;
                while i < chars.len() && chars[i] != ']' {
                    if chars[i] == '\\' && i + 1 < chars.len() {
                        if chars[i + 1] == 'b' {
                            has_boundary = true;
                        } else {
                            class_content.push('\\');
                            class_content.push(chars[i + 1]);
                        }
                        i += 2;
                    } else {
                        class_content.push(chars[i]);
                        i += 1;
                    }
                }
                let terminated = i < chars.len();
                if terminated {
                    // Step over the closing bracket.
                    i += 1;
                }
                if has_boundary {
                    if class_content.is_empty() {
                        output.push_str("(?:\\b)");
                    } else {
                        output.push_str("(?:[");
                        output.push_str(&class_content);
                        output.push_str("]|\\b)");
                    }
                } else {
                    output.push('[');
                    output.push_str(&class_content);
                    if terminated {
                        output.push(']');
                    }
                }
            }
            other => {
                output.push(other);
                i += 1;
            }
        }
    }
    output
}
/// Simplifies lookbehinds (`(?<=...)` / `(?<!...)`) whose body contains `|`.
///
/// - A body without `|` is copied through unchanged.
/// - A body with `|` is reduced to its first alternative when that
///   alternative is at most two bytes long and contains neither `(` nor `[`.
/// - Any other body with `|` causes the whole lookbehind to be removed from
///   the output.
///
/// Only groups that really start with `(?<=` or `(?<!` are touched; other
/// `(?<` groups such as named captures `(?<name>...)` are copied verbatim.
// NOTE(review): the two-byte limit means a single non-ASCII first
// alternative (e.g. `─`) leads to the lookbehind being dropped — confirm
// whether a character count was intended.
fn fix_variable_length_lookbehind(pattern: &str) -> String {
    let chars: Vec<char> = pattern.chars().collect();
    let mut output = String::with_capacity(pattern.len());
    let mut i = 0;
    while i < chars.len() {
        let is_negative = match &chars[i..] {
            ['(', '?', '<', '=', ..] => false,
            ['(', '?', '<', '!', ..] => true,
            _ => {
                output.push(chars[i]);
                i += 1;
                continue;
            }
        };

        // Collect the body up to the matching close parenthesis.
        let mut depth = 1usize;
        let mut j = i + 4;
        let mut body = String::new();
        while j < chars.len() && depth > 0 {
            match chars[j] {
                '(' => {
                    depth += 1;
                    body.push('(');
                }
                ')' => {
                    depth -= 1;
                    if depth > 0 {
                        body.push(')');
                    }
                }
                '\\' => {
                    body.push('\\');
                    // Keep the escaped character attached to its backslash.
                    if j + 1 < chars.len() {
                        j += 1;
                        body.push(chars[j]);
                    }
                }
                other => body.push(other),
            }
            j += 1;
        }

        if body.contains('|') {
            let first_alt = body.split('|').next().unwrap_or("");
            let is_simple =
                first_alt.len() <= 2 && !first_alt.contains('(') && !first_alt.contains('[');
            if is_simple {
                output.push_str("(?<");
                output.push(if is_negative { '!' } else { '=' });
                output.push_str(first_alt);
                output.push(')');
            }
            // Otherwise the lookbehind is intentionally left out.
        } else {
            output.extend(&chars[i..j]);
        }
        i = j;
    }
    output
}
/// Splits `pattern` into a main pattern and the lookarounds it contained.
///
/// The pattern is preprocessed first; lookaround groups are then located in,
/// extracted from and removed from that SAME preprocessed text, so positions
/// stay consistent even when preprocessing changed the pattern's length.
/// Character indices are converted to byte offsets before slicing, which
/// keeps patterns containing multi-byte characters from being cut in the
/// wrong place.
///
/// A negative lookahead whose preceding text is just `(?:` (after optional
/// leading `^` and whitespace) is checked at the start of the match; all
/// other negative lookaheads are checked at its end.
///
/// When at least one lookaround was found, lazy quantifiers in the main
/// pattern are turned into greedy ones.
///
/// # Errors
/// Returns the regex error if a lookaround body fails to compile.
fn parse_pattern(pattern: &str) -> Result<(String, Vec<Lookaround>), regex::Error> {
    #[derive(Clone, Copy)]
    enum Kind {
        Ahead,
        NegAhead,
        Behind,
        NegBehind,
    }

    let mut main_pattern = preprocess_pattern(pattern);
    let chars: Vec<char> = main_pattern.chars().collect();
    // Byte offset of every character index, plus one entry for "past the end".
    let byte_offsets: Vec<usize> = main_pattern
        .char_indices()
        .map(|(offset, _)| offset)
        .chain(std::iter::once(main_pattern.len()))
        .collect();

    // (start, end, lookaround) with start/end as character indices.
    let mut found: Vec<(usize, usize, Lookaround)> = Vec::new();
    let mut i = 0;
    while i < chars.len() {
        if chars[i] == '\\' {
            // An escaped `(` is a literal, not the start of a group.
            i += 2;
            continue;
        }
        let kind = match &chars[i..] {
            ['(', '?', '<', '=', ..] => Some(Kind::Behind),
            ['(', '?', '<', '!', ..] => Some(Kind::NegBehind),
            ['(', '?', '=', ..] => Some(Kind::Ahead),
            ['(', '?', '!', ..] => Some(Kind::NegAhead),
            _ => None,
        };
        let extracted = kind.and_then(|kind| {
            extract_lookaround_content(&main_pattern, i).map(|(end, inner)| (kind, end, inner))
        });
        let Some((kind, end_pos, inner_pattern)) = extracted else {
            i += 1;
            continue;
        };

        let lookaround = match kind {
            Kind::Ahead => Lookaround::ahead(&inner_pattern)?,
            Kind::NegAhead => {
                let prefix = &main_pattern[..byte_offsets[i]];
                let is_at_start = prefix.trim_start_matches('^').trim_start() == "(?:";
                if is_at_start {
                    Lookaround::neg_ahead_at_start(&inner_pattern)?
                } else {
                    Lookaround::neg_ahead(&inner_pattern)?
                }
            }
            Kind::Behind => Lookaround::behind(&inner_pattern)?,
            Kind::NegBehind => Lookaround::neg_behind(&inner_pattern)?,
        };
        found.push((i, end_pos, lookaround));
        i = end_pos;
    }

    // Remove back to front so earlier offsets stay valid.
    for (start, end, _) in found.iter().rev() {
        main_pattern.replace_range(byte_offsets[*start]..byte_offsets[*end], "");
    }
    let lookarounds: Vec<Lookaround> = found
        .into_iter()
        .map(|(_, _, lookaround)| lookaround)
        .collect();

    if !lookarounds.is_empty() {
        // NOTE(review): plain text replacement; it also rewrites an escaped
        // quantifier followed by `?` (e.g. `\+?`) — confirm this is acceptable.
        main_pattern = main_pattern
            .replace("+?", "+")
            .replace("*?", "*")
            .replace("??", "?");
    }
    Ok((main_pattern, lookarounds))
}
// Unit tests for the lookaround emulation and the pattern preprocessing passes.
#[cfg(test)]
mod tests {
use super::*;
// Positive lookahead: digits must be followed by whitespace.
#[test]
fn test_simple_lookahead() {
let re = EnhancedRegex::new(r"\d+(?=\s)").unwrap();
assert!(re.is_match("123 "));
assert!(!re.is_match("123"));
assert!(!re.is_match("123a"));
}
// Positive lookbehind: digits must be preceded by whitespace.
#[test]
fn test_simple_lookbehind() {
let re = EnhancedRegex::new(r"(?<=\s)\d+").unwrap();
assert!(re.is_match(" 123"));
assert!(!re.is_match("123"));
assert!(!re.is_match("a123"));
}
// Negative lookahead: a shorter digit run can still satisfy the assertion
// unless the pattern is anchored on both sides.
#[test]
fn test_negative_lookahead() {
let re = EnhancedRegex::new(r"\d+(?!\s)").unwrap();
assert!(re.is_match("123"));
assert!(re.is_match("123a"));
assert!(re.is_match("123 "));
let re2 = EnhancedRegex::new(r"^\d+(?!\s)$").unwrap();
assert!(!re2.is_match("123 "));
}
// Negative lookbehind: the match may start later in the digit run, where the
// preceding character is no longer whitespace.
#[test]
fn test_negative_lookbehind() {
let re = EnhancedRegex::new(r"(?<!\s)\d+").unwrap();
assert!(re.is_match("123"));
assert!(re.is_match("a123"));
assert!(re.is_match(" 123"));
if let Some(m) = re.find_from_pos(" 123", 0) {
assert_eq!(m.as_str(), "23");
assert_eq!(m.start(), 2);
}
}
// Lookahead for "whitespace or end of text".
#[test]
fn test_common_pattern_end_boundary() {
let re = EnhancedRegex::new(r"\d+(?=\s|$)").unwrap();
assert!(re.is_match("123 "));
assert!(re.is_match("123"));
assert!(!re.is_match("123a"));
}
// Seven digits followed by a capitalised three-letter word between whitespace.
#[test]
fn test_common_pattern_ls_size() {
let re = EnhancedRegex::new(r"\d{7}(?=\s[A-Z][a-z]{2}\s)").unwrap();
assert!(re.is_match("1234567 Mar "));
assert!(re.is_match("9876543 Nov 30"));
assert!(!re.is_match("1234567 "));
assert!(!re.is_match("1234567 123"));
}
// Capture groups are available for a match that passed its lookahead.
#[test]
fn test_captures() {
let re = EnhancedRegex::new(r"(\d+)(?=\s)").unwrap();
if let Some(caps) = re.captures_from_pos("abc 123 def", 0) {
assert_eq!(caps.get(1).unwrap().as_str(), "123");
} else {
panic!("Expected to find match");
}
}
// Iteration yields only the digit runs that are followed by whitespace.
#[test]
fn test_find_iter() {
let re = EnhancedRegex::new(r"\d+(?=\s)").unwrap();
let matches: Vec<_> = re.find_iter("123 456 789").map(|m| m.as_str()).collect();
assert_eq!(matches, vec!["123", "456"]);
}
// A pattern without lookarounds behaves like a plain regex.
#[test]
fn test_no_lookaround() {
let re = EnhancedRegex::new(r"\d+").unwrap();
assert!(re.is_match("123"));
assert!(re.is_match("abc123def"));
}
// Lookbehind and lookahead on the same pattern must both hold.
#[test]
fn test_multiple_lookarounds() {
let re = EnhancedRegex::new(r"(?<=\s)\d+(?=\s)").unwrap();
assert!(re.is_match(" 123 "));
assert!(!re.is_match("123 "));
assert!(!re.is_match(" 123"));
assert!(!re.is_match("123"));
}
// `\>` and `\<` outside a class lose their backslash.
#[test]
fn test_preprocess_invalid_escapes_outside_char_class() {
let result = fix_invalid_escapes_outside_char_class(r"^\>");
assert_eq!(result, r"^>");
let result = fix_invalid_escapes_outside_char_class(r"^\<");
assert_eq!(result, r"^<");
}
// `\>` and `\<` inside a class lose their backslash.
#[test]
fn test_preprocess_character_class_escapes() {
let result = fix_character_class_escapes(r"[^\>]");
assert_eq!(result, r"[^>]");
let result = fix_character_class_escapes(r"[^\<]");
assert_eq!(result, r"[^<]");
}
// `\b` inside a class is moved out into an alternation.
#[test]
fn test_preprocess_boundary_in_character_class() {
let result = fix_boundary_in_character_class(r"[:\b]");
assert_eq!(result, r"(?:[:]|\b)");
let result = fix_boundary_in_character_class(r"[Ww]arning[:\b]");
assert_eq!(result, r"[Ww]arning(?:[:]|\b)");
}
// Full preprocessing of a diff-style pattern, then compilation and matching.
#[test]
fn test_preprocess_complex_pattern_diff() {
let result = preprocess_pattern(r"^\>([^\>].*|$)");
assert_eq!(result, r"^>([^>].*|$)");
let re = EnhancedRegex::new(r"^\>([^\>].*|$)").unwrap();
assert!(re.is_match(">test"));
assert!(re.is_match(">"));
}
// Full preprocessing of a compiler-warning pattern, then compilation and matching.
#[test]
fn test_preprocess_complex_pattern_gcc() {
let result = preprocess_pattern(r"[Ww]arning[:\b]");
assert_eq!(result, r"[Ww]arning(?:[:]|\b)");
let re = EnhancedRegex::new(r"[Ww]arning[:\b]").unwrap();
assert!(re.is_match("warning:"));
assert!(re.is_match("Warning:"));
assert!(re.is_match("warning"));
}
// Several escapes in one pattern; the lazy quantifier is left alone by preprocessing.
#[test]
fn test_preprocess_multiple_escapes() {
let result = preprocess_pattern(r"^\>.*?\<");
assert_eq!(result, r"^>.*?<");
}
// Two consecutive classes, each containing an escaped angle bracket.
#[test]
fn test_preprocess_nested_character_classes() {
let result = preprocess_pattern(r"[a\>b][c\<d]");
assert_eq!(result, r"[a>b][c<d]");
}
// Diff-style patterns compile and distinguish single from doubled markers.
#[test]
fn test_diff_pattern_compilation() {
let re1 = EnhancedRegex::new(r"^\>([^\>].*|$)").unwrap();
let re2 = EnhancedRegex::new(r"^\<([^\<].*|$)").unwrap();
assert!(re1.is_match(">old line"));
assert!(re1.is_match(">"));
assert!(!re1.is_match(">>"));
assert!(re2.is_match("<new line"));
assert!(re2.is_match("<"));
assert!(!re2.is_match("<<")); }
// Warning/error patterns with `\b` inside a class compile and match with or
// without the trailing colon.
#[test]
fn test_gcc_pattern_compilation() {
let re1 = EnhancedRegex::new(r"[Ww]arning[:\b]").unwrap();
let re2 = EnhancedRegex::new(r"[Ee]rror[:\b]").unwrap();
assert!(re1.is_match("warning:"));
assert!(re1.is_match("Warning:"));
assert!(re1.is_match("warning"));
assert!(re2.is_match("error:"));
assert!(re2.is_match("Error:"));
assert!(re2.is_match("error"));
}
// Only asserts that the alternation no longer appears in the output.
#[test]
fn test_preprocess_variable_length_lookbehind() {
let result = fix_variable_length_lookbehind(r"(?<=─|-)");
assert!(!result.contains("─|-"));
}
// Checks that preprocessing yields a non-empty pattern; the result of
// constructing the regex is deliberately discarded.
#[test]
fn test_findmnt_pattern_compilation() {
let result = preprocess_pattern(r"(?<=─|-)(?:\/([^\/ ]+))+");
assert!(!result.is_empty());
let re = EnhancedRegex::new(r"(?<=─|-)(?:\/([^\/ ]+))+");
let _ = re;
}
}