use crate::types::MIN_LITERAL_PREFIX_CHARS;
pub fn extract_literal_prefixes(pattern: &str) -> Vec<String> {
let pattern = strip_leading_inline_flags(pattern);
if let Some(rest) = strip_leading_boundary_guard(pattern) {
let inner = extract_literal_prefixes(rest);
if !inner.is_empty() {
return inner;
}
}
if pattern.starts_with('(') && pattern.contains('|') {
let mut depth = 0;
let mut end_idx = None;
for (i, ch) in pattern.char_indices() {
match ch {
'(' => depth += 1,
')' => {
depth -= 1;
if depth == 0 {
end_idx = Some(i);
break;
}
}
_ => {}
}
}
if let Some(end) = end_idx {
let mut inner = &pattern[1..end];
if inner.starts_with("?:") {
inner = &inner[2..];
} else if inner.starts_with("?i:")
|| inner.starts_with("?m:")
|| inner.starts_with("?s:")
{
inner = &inner[3..];
} else if inner.starts_with("?im:")
|| inner.starts_with("?is:")
|| inner.starts_with("?ms:")
{
inner = &inner[4..];
}
let mut parts = Vec::new();
let mut start = 0;
let mut d = 0;
for (i, ch) in inner.char_indices() {
match ch {
'(' => d += 1,
')' => d -= 1,
'|' if d == 0 => {
parts.push(&inner[start..i]);
start = i + 1;
}
_ => {}
}
}
parts.push(&inner[start..]);
let mut results = Vec::new();
for part in parts {
if let Some(p) = extract_literal_prefix(part) {
results.push(p);
}
}
if !results.is_empty() {
return results;
}
}
}
extract_literal_prefix(pattern).into_iter().collect()
}
/// Strips a leading "boundary guard" group from `pattern` and returns what follows it.
///
/// A boundary guard is a non-capturing group `(?:...)` at the very start of the pattern
/// whose top-level alternatives are each one of:
/// `^`, `$`, `\b`, `\B`, `\w`, `\W`, `\s`, `\S`, `\d`, `\D`, `\A`, `\z`,
/// or text that starts with `[`, ends with `]` and is at least three bytes long.
///
/// Returns `None` when the pattern does not start with `(?:`, the group is never closed,
/// the group or the remainder is empty, or any alternative is not one of the forms above.
///
/// Backslash escapes and bracket classes are honoured both when locating the closing `)`
/// and when splitting the group on `|`, so `[\]|]` stays a single alternative.
pub fn strip_leading_boundary_guard(pattern: &str) -> Option<&str> {
    let body = pattern.strip_prefix("(?:")?;
    let bytes = body.as_bytes();

    // Locate the `)` closing the leading group. Scanning bytes is safe: every delimiter
    // is ASCII and ASCII bytes never occur inside a multi-byte UTF-8 sequence.
    let mut depth = 0i32;
    let mut in_class = false;
    let mut i = 0;
    let mut end = None;
    while i < bytes.len() {
        match bytes[i] {
            // Skip the byte following a backslash.
            b'\\' => i += 1,
            b'[' if !in_class => in_class = true,
            b']' if in_class => in_class = false,
            b'(' if !in_class => depth += 1,
            b')' if !in_class => {
                if depth == 0 {
                    end = Some(i);
                    break;
                }
                depth -= 1;
            }
            _ => {}
        }
        i += 1;
    }

    let end = end?;
    let group = &body[..end];
    let rest = &body[end + 1..];
    if group.is_empty() || rest.is_empty() {
        return None;
    }

    // Split the group on top-level `|`, applying the same escape and class rules as above.
    let mut alts = Vec::new();
    let mut start = 0;
    let mut nesting = 0i32;
    let mut in_class = false;
    let mut escaped = false;
    for (j, ch) in group.char_indices() {
        if escaped {
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '[' if !in_class => in_class = true,
            ']' if in_class => in_class = false,
            '(' if !in_class => nesting += 1,
            ')' if !in_class => nesting -= 1,
            '|' if nesting == 0 && !in_class => {
                alts.push(&group[start..j]);
                start = j + 1;
            }
            _ => {}
        }
    }
    alts.push(&group[start..]);

    let all_boundary = alts.iter().all(|alt| {
        let alt = alt.trim();
        matches!(
            alt,
            "^" | "$"
                | r"\b"
                | r"\B"
                | r"\w"
                | r"\W"
                | r"\s"
                | r"\S"
                | r"\d"
                | r"\D"
                | r"\A"
                | r"\z"
        ) || (alt.starts_with('[') && alt.ends_with(']') && alt.len() >= 3)
    });

    all_boundary.then_some(rest)
}
/// Removes one leading inline-flag group such as `(?i)` or `(?im-s)` from `pattern`.
///
/// The group must start at the beginning of the pattern, consist only of the characters
/// `i`, `m`, `s`, `x`, `u`, `U` and `-` after `(?`, and be closed directly by `)`.
/// Patterns shorter than four bytes, and anything that does not have this shape
/// (for example the scoped form `(?i:...)`), are returned unchanged.
/// Only the first such group is removed.
pub fn strip_leading_inline_flags(pattern: &str) -> &str {
    const SHORTEST_CANDIDATE: usize = 4;

    let after_open = match pattern.strip_prefix("(?") {
        Some(tail) if pattern.len() >= SHORTEST_CANDIDATE => tail,
        _ => return pattern,
    };

    // Every accepted flag is a single ASCII byte, so the count is also a valid char boundary.
    let flag_count = after_open
        .bytes()
        .take_while(|&b| matches!(b, b'i' | b'm' | b's' | b'x' | b'u' | b'U' | b'-'))
        .count();

    after_open[flag_count..].strip_prefix(')').unwrap_or(pattern)
}
/// Extracts the literal text at the start of `pattern`, stopping at the first regex
/// construct that is not a plain literal.
///
/// * Ordinary characters are appended as-is.
/// * A backslash followed by a character accepted by [`is_escaped_literal`] appends that
///   character; any other escape (or a trailing backslash) ends the scan.
/// * `[`, `.`, `*`, `+`, `?`, `{`, `|`, `^` and `$` end the scan.
/// * `(` ends the scan too, but first the group's alternatives (as reported by
///   `extract_group_alternatives`) contribute their longest common prefix, if any.
///
/// Returns `Some(prefix)` when the collected text is at least `MIN_LITERAL_PREFIX_CHARS`
/// long (measured with `String::len`, i.e. in bytes), otherwise `None`.
pub fn extract_literal_prefix(pattern: &str) -> Option<String> {
    let mut literal = String::new();
    let mut remaining = pattern.chars();

    loop {
        let Some(current) = remaining.next() else {
            break;
        };
        match current {
            '\\' => match remaining.next() {
                Some(escaped) if is_escaped_literal(escaped) => literal.push(escaped),
                _ => break,
            },
            '(' => {
                // `remaining` now points just past the `(`, which is what the helper expects.
                if let Some(alternatives) = extract_group_alternatives(remaining.as_str()) {
                    if let Some((head, tail)) = alternatives.split_first() {
                        // Shrink `shared` to the part of `head` every other alternative repeats.
                        let mut shared = head.as_str();
                        for other in tail {
                            let matched: usize = shared
                                .chars()
                                .zip(other.chars())
                                .take_while(|(a, b)| a == b)
                                .map(|(a, _)| a.len_utf8())
                                .sum();
                            shared = &shared[..matched];
                        }
                        literal.push_str(shared);
                    }
                }
                break;
            }
            '[' | '.' | '*' | '+' | '?' | '{' | '|' | '^' | '$' => break,
            plain => literal.push(plain),
        }
    }

    (literal.len() >= MIN_LITERAL_PREFIX_CHARS).then_some(literal)
}
/// Returns the leading literal of every top-level alternative in a group.
///
/// `s` is the pattern text immediately after the group's opening `(`. A leading `?:`,
/// `?i:`, `?m:`, `?s:`, `?im:`, `?is:` or `?ms:` is skipped, then the text up to the
/// matching `)` is split on top-level `|`. Backslash-escaped characters are never treated
/// as delimiters, so `\)` does not close the group and `\|` does not split it.
///
/// For each alternative the leading run of ASCII letters, digits and `_ - . : =` or space
/// is taken as its literal. Returns `None` when the group is not closed or when any
/// alternative has an empty literal; otherwise one literal per alternative, in order.
fn extract_group_alternatives(s: &str) -> Option<Vec<String>> {
    const GROUP_PREFIXES: [&str; 7] = ["?:", "?i:", "?m:", "?s:", "?im:", "?is:", "?ms:"];
    let inner = GROUP_PREFIXES
        .iter()
        .find_map(|prefix| s.strip_prefix(*prefix))
        .unwrap_or(s);

    // Single pass: split on top-level `|` until the `)` that closes the group.
    let mut parts: Vec<&str> = Vec::new();
    let mut start = 0;
    let mut depth = 0i32;
    let mut escaped = false;
    let mut closed = false;
    for (i, ch) in inner.char_indices() {
        if escaped {
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '(' => depth += 1,
            ')' if depth == 0 => {
                parts.push(&inner[start..i]);
                closed = true;
                break;
            }
            ')' => depth -= 1,
            '|' if depth == 0 => {
                parts.push(&inner[start..i]);
                start = i + 1;
            }
            _ => {}
        }
    }
    if !closed {
        return None;
    }

    // All-or-nothing: one alternative without a leading literal voids the whole group.
    parts
        .iter()
        .map(|part| {
            let literal: String = part
                .chars()
                .take_while(|c| {
                    matches!(
                        c,
                        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-' | '.' | ':' | '=' | ' '
                    )
                })
                .collect();
            (!literal.is_empty()).then_some(literal)
        })
        .collect()
}
/// Reports whether `ch`, when written after a backslash in a pattern, is treated as a
/// literal character by the prefix extractor.
///
/// The accepted characters are `[ ] ( ) . * + ? { } \ | ^ $`.
pub fn is_escaped_literal(ch: char) -> bool {
    const ESCAPABLE: &str = r"[]().*+?{}\|^$";
    ESCAPABLE.contains(ch)
}
/// Minimum length a literal run must have to be reported by [`extract_inner_literals`].
///
/// The threshold is compared against `String::len`, so it is measured in UTF-8 bytes.
pub const MIN_INNER_LITERAL_CHARS: usize = 4;
/// Returns the distinct literal runs found anywhere inside `pattern`.
///
/// The pattern is parsed with `regex_syntax`'s AST parser and traversed by `walk_ast`.
/// Runs shorter than [`MIN_INNER_LITERAL_CHARS`] are dropped and duplicates are removed,
/// keeping the first occurrence so the original discovery order is preserved.
/// A pattern that fails to parse yields an empty vector.
pub fn extract_inner_literals(pattern: &str) -> Vec<String> {
    use regex_syntax::ast::parse::Parser;

    let ast = match Parser::new().parse(pattern) {
        Ok(ast) => ast,
        Err(_) => return Vec::new(),
    };

    let mut collected = Vec::new();
    walk_ast(&ast, &mut collected);

    // `insert` returns false for a repeat, which filters out later duplicates.
    let mut seen = std::collections::HashSet::new();
    collected
        .into_iter()
        .filter(|literal| literal.len() >= MIN_INNER_LITERAL_CHARS)
        .filter(|literal| seen.insert(literal.clone()))
        .collect()
}
/// Appends to `out` the literal runs found in `ast`.
///
/// * `Concat`: adjacent `Literal` nodes are joined into a run. A non-literal node ends the
///   current run, which is kept only if it is at least `MIN_INNER_LITERAL_CHARS` bytes long,
///   and is then traversed itself.
/// * `Group`: the wrapped expression is traversed.
/// * `Alternation`: every branch is traversed.
/// * A lone `Literal`: kept only if its UTF-8 encoding reaches the minimum length.
/// * Repetitions, classes, `.`, empty nodes, flags and assertions contribute nothing and are
///   not descended into.
fn walk_ast(ast: &regex_syntax::ast::Ast, out: &mut Vec<String>) {
    use regex_syntax::ast::Ast;
    match ast {
        Ast::Concat(concat) => {
            let mut run = String::new();
            for inner in concat.asts.iter() {
                match inner {
                    Ast::Literal(lit) => run.push(lit.c),
                    _ => {
                        // A non-literal node breaks the run: flush it if long enough.
                        if run.len() >= MIN_INNER_LITERAL_CHARS {
                            out.push(std::mem::take(&mut run));
                        } else {
                            run.clear();
                        }
                        walk_ast(inner, out);
                    }
                }
            }
            // Flush the run that reaches the end of the concatenation.
            if run.len() >= MIN_INNER_LITERAL_CHARS {
                out.push(run);
            }
        }
        Ast::Group(group) => walk_ast(&group.ast, out),
        Ast::Alternation(alt) => {
            for branch in alt.asts.iter() {
                walk_ast(branch, out);
            }
        }
        Ast::Literal(lit) => {
            let s = lit.c.to_string();
            if s.len() >= MIN_INNER_LITERAL_CHARS {
                out.push(s);
            }
        }
        Ast::Repetition(_)
        | Ast::ClassUnicode(_)
        | Ast::ClassPerl(_)
        | Ast::ClassBracketed(_)
        | Ast::Dot(_)
        | Ast::Empty(_)
        | Ast::Flags(_)
        | Ast::Assertion(_) => {}
    }
}