use std::collections::HashMap;
use std::sync::{LazyLock, RwLock};
use regex::Regex;
/// Process-wide cache mapping a pattern's source text to its compiled regex.
///
/// The map starts empty and is created on first access. Values are
/// `&'static Regex` references, so a cached entry can be handed out without
/// keeping the lock held; `lazy_re_compile` is the function in this file that
/// reads from and inserts into it.
static REGEX_CACHE: LazyLock<RwLock<HashMap<String, &'static Regex>>> =
LazyLock::new(|| RwLock::new(HashMap::new()));
/// One concrete expansion of a pattern: a template string plus the names of
/// the capture groups that contribute `%(name)s` placeholders to it.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
struct Variant {
    /// Template text; literals are kept as-is, groups appear as `%(name)s`.
    text: String,
    /// Group names collected for this variant, in the order they were added.
    groups: Vec<String>,
}

impl Variant {
    /// A variant with no text and no groups.
    fn empty() -> Self {
        Self {
            text: String::new(),
            groups: Vec::new(),
        }
    }

    /// A variant consisting only of literal text.
    fn literal(value: impl Into<String>) -> Self {
        let text = value.into();
        Self {
            text,
            groups: Vec::new(),
        }
    }

    /// A variant whose text is the `%(name)s` placeholder.
    ///
    /// `name` is recorded in `groups` only when `include_group` is true.
    fn placeholder(name: impl Into<String>, include_group: bool) -> Self {
        let name: String = name.into();
        let text = format!("%({name})s");
        let mut groups = Vec::new();
        if include_group {
            groups.push(name);
        }
        Self { text, groups }
    }
}
/// Recursive-descent parser that expands a regular expression into the list
/// of [`Variant`]s (template strings plus group names) it can produce.
struct Parser {
    /// The pattern split into `char`s so it can be indexed by position.
    chars: Vec<char>,
    /// Index in `chars` of the next unread character.
    pos: usize,
    /// Number used to name the next unnamed capturing group (`_0`, `_1`, ...).
    next_group: usize,
}

impl Parser {
    /// Creates a parser positioned at the start of `pattern`.
    fn new(pattern: &str) -> Self {
        Self {
            chars: pattern.chars().collect(),
            pos: 0,
            next_group: 0,
        }
    }

    /// Parses the whole pattern, consuming the parser.
    ///
    /// The result is never empty: if parsing yields nothing, a single empty
    /// variant is returned instead.
    fn parse(mut self) -> Vec<Variant> {
        let variants = self.parse_expression(None);
        if variants.is_empty() {
            vec![Variant::empty()]
        } else {
            variants
        }
    }

    /// Parses `|`-separated alternatives and concatenates their variant lists.
    ///
    /// Stops at `terminator` (left unconsumed for the caller) or end of input.
    fn parse_expression(&mut self, terminator: Option<char>) -> Vec<Variant> {
        let mut variants = self.parse_sequence(terminator);
        while self.peek() == Some('|') {
            self.pos += 1;
            variants.extend(self.parse_sequence(terminator));
        }
        variants
    }

    /// Parses one alternative: a run of atoms up to `|`, `terminator`, or end
    /// of input, combining each atom's variants with those gathered so far.
    fn parse_sequence(&mut self, terminator: Option<char>) -> Vec<Variant> {
        let mut variants = vec![Variant::empty()];
        while let Some(ch) = self.peek() {
            if Some(ch) == terminator || ch == '|' {
                break;
            }
            let atom = self.parse_atom();
            variants = combine(&variants, &atom);
        }
        variants
    }

    /// Parses a single atom together with any quantifier that follows it.
    fn parse_atom(&mut self) -> Vec<Variant> {
        let atom = match self.next() {
            // Anchors match positions, not text.
            Some('^' | '$') => vec![Variant::empty()],
            Some('.') => vec![Variant::literal(".")],
            Some('[') => vec![Variant::literal(self.parse_character_class())],
            Some('(') => self.parse_group(),
            Some('\\') => vec![Variant::literal(parse_escape(self.next()))],
            Some(ch) => vec![Variant::literal(ch.to_string())],
            None => vec![Variant::empty()],
        };
        self.apply_quantifier(atom)
    }

    /// Parses a group whose opening `(` has already been consumed.
    ///
    /// `(?...)` forms are delegated to [`Parser::parse_special_group`]. A plain
    /// capturing group is named `_N`; the counter advances for every such
    /// group, including ones whose alternatives are expanded inline rather
    /// than replaced by a placeholder.
    fn parse_group(&mut self) -> Vec<Variant> {
        match self.peek() {
            Some('?') => {
                self.pos += 1;
                self.parse_special_group()
            }
            _ => {
                let name = format!("_{}", self.next_group);
                self.next_group += 1;
                let variants = self.parse_expression(Some(')'));
                self.consume_if(')');
                if should_expand_positional_group(&variants) {
                    variants
                } else {
                    vec![Variant::placeholder(name, true)]
                }
            }
        }
    }

    /// Parses the remainder of a `(?...)` group; `(?` was already consumed.
    fn parse_special_group(&mut self) -> Vec<Variant> {
        match self.next() {
            // `(?:...)`: non-capturing, contents are expanded in place.
            Some(':') => {
                let variants = self.parse_expression(Some(')'));
                self.consume_if(')');
                variants
            }
            Some('P') => match self.next() {
                // `(?P<name>...)`: named group, body replaced by a placeholder.
                Some('<') => {
                    let name = self.read_until('>');
                    self.skip_group_body();
                    vec![Variant::placeholder(name, true)]
                }
                // `(?P=name)`: back-reference; reuses the placeholder without
                // registering the group a second time.
                Some('=') => {
                    let name = self.read_until(')');
                    vec![Variant::placeholder(name, false)]
                }
                _ => {
                    self.skip_group_body();
                    vec![Variant::empty()]
                }
            },
            // `(?=...)` / `(?!...)`: lookahead contributes no text.
            Some('=' | '!') => {
                self.skip_group_body();
                vec![Variant::empty()]
            }
            // `(?<=...)` / `(?<!...)`: lookbehind contributes no text. Any
            // other `(?<...` form is skipped the same way.
            Some('<') => {
                if matches!(self.peek(), Some('=' | '!')) {
                    self.pos += 1;
                }
                self.skip_group_body();
                vec![Variant::empty()]
            }
            // Everything else (e.g. flag groups) is skipped entirely.
            _ => {
                self.skip_group_body();
                vec![Variant::empty()]
            }
        }
    }

    /// Consumes a character class (the `[` was already read) through its
    /// closing `]` and returns one representative string for it.
    ///
    /// The representative is the first member listed, with escapes mapped
    /// through `parse_escape`; `"a"` is used when no member is found. A
    /// leading `^` is skipped, so negation is not reflected in the result.
    fn parse_character_class(&mut self) -> String {
        if self.peek() == Some('^') {
            self.pos += 1;
        }
        let mut representative = None;
        let mut escaped = false;
        while let Some(ch) = self.next() {
            if escaped {
                if representative.is_none() {
                    representative = Some(parse_escape(Some(ch)));
                }
                escaped = false;
                continue;
            }
            match ch {
                '\\' => escaped = true,
                ']' => break,
                // Range separator once a representative has been chosen.
                '-' if representative.is_some() => {}
                _ if representative.is_none() => representative = Some(ch.to_string()),
                _ => {}
            }
        }
        representative.unwrap_or_else(|| "a".to_string())
    }

    /// Applies the quantifier (if any) at the current position to `variants`.
    ///
    /// * `?` and `*` make the atom optional (an empty variant is prepended).
    /// * `+` keeps the atom once.
    /// * `{m,n}` drops the atom for `{0}`/`{0,0}`, makes it optional when the
    ///   minimum is zero, and otherwise repeats it `m` times.
    ///
    /// A `?` directly after the quantifier is the lazy modifier (`*?`, `+?`,
    /// `??`, `{m,n}?`). It is consumed here so it is not re-read as a literal
    /// `?` atom.
    fn apply_quantifier(&mut self, variants: Vec<Variant>) -> Vec<Variant> {
        match self.peek() {
            Some('?' | '*') => {
                self.pos += 1;
                // Swallow the lazy modifier, matching the `{m,n}?` handling.
                self.consume_if('?');
                optional(variants)
            }
            Some('+') => {
                self.pos += 1;
                self.consume_if('?');
                variants
            }
            Some('{') => {
                self.pos += 1;
                let (min, max) = self.parse_range_quantifier();
                match (min, max) {
                    (0, Some(0)) => vec![Variant::empty()],
                    (0, _) => optional(variants),
                    (count, _) => repeat(variants, count),
                }
            }
            _ => variants,
        }
    }

    /// Parses the body of a `{m}` / `{m,}` / `{m,n}` quantifier (the `{` was
    /// already consumed) and returns `(min, max)`.
    ///
    /// An unparsable minimum counts as `0`; a missing or unparsable maximum is
    /// `None`. A trailing lazy `?` is consumed.
    fn parse_range_quantifier(&mut self) -> (usize, Option<usize>) {
        let mut content = String::new();
        while let Some(ch) = self.next() {
            if ch == '}' {
                break;
            }
            content.push(ch);
        }
        let mut parts = content.splitn(2, ',');
        let min = parts
            .next()
            .and_then(|part| part.parse::<usize>().ok())
            .unwrap_or(0);
        let max = parts.next().and_then(|part| {
            if part.is_empty() {
                None
            } else {
                part.parse::<usize>().ok()
            }
        });
        if self.peek() == Some('?') {
            self.pos += 1;
        }
        (min, max)
    }

    /// Skips input up to and including the `)` that closes the current group.
    ///
    /// Nested groups are tracked by depth; parentheses that are escaped or
    /// inside a character class do not count.
    fn skip_group_body(&mut self) {
        let mut nested_groups = 0usize;
        let mut in_class = false;
        let mut escaped = false;
        while let Some(ch) = self.next() {
            if escaped {
                escaped = false;
                continue;
            }
            match ch {
                '\\' => escaped = true,
                '[' if !in_class => in_class = true,
                ']' if in_class => in_class = false,
                '(' if !in_class => nested_groups += 1,
                ')' if !in_class && nested_groups == 0 => break,
                // Only reached with `nested_groups > 0`, so this cannot underflow.
                ')' if !in_class => nested_groups -= 1,
                _ => {}
            }
        }
    }

    /// Reads characters up to `terminator`, consuming the terminator but not
    /// including it in the returned string. Stops at end of input.
    fn read_until(&mut self, terminator: char) -> String {
        let mut value = String::new();
        while let Some(ch) = self.next() {
            if ch == terminator {
                break;
            }
            value.push(ch);
        }
        value
    }

    /// Consumes the next character only if it equals `expected`.
    fn consume_if(&mut self, expected: char) {
        if self.peek() == Some(expected) {
            self.pos += 1;
        }
    }

    /// Returns the next character without consuming it.
    fn peek(&self) -> Option<char> {
        self.chars.get(self.pos).copied()
    }

    /// Consumes and returns the next character, or `None` at end of input.
    fn next(&mut self) -> Option<char> {
        let ch = self.peek()?;
        self.pos += 1;
        Some(ch)
    }
}
/// Maps the character following a backslash to representative literal text.
///
/// Zero-width escapes (`\A`, `\b`, `\B`, `\Z`) and a missing character give an
/// empty string; class escapes give one sample character (`\d` -> `0`,
/// `\D`/`\S`/`\w` -> `x`, `\s` -> space, `\W` -> `!`); anything else stands
/// for itself.
fn parse_escape(ch: Option<char>) -> String {
    let Some(escaped) = ch else {
        return String::new();
    };
    let sample = match escaped {
        'A' | 'b' | 'B' | 'Z' => "",
        'd' => "0",
        'D' | 'S' | 'w' => "x",
        's' => " ",
        'W' => "!",
        // An escaped ordinary character is just that character.
        literal => return literal.to_string(),
    };
    sample.to_owned()
}
/// Builds the cross product of `left` and `right`.
///
/// For every pair, texts are concatenated and group lists are appended, with
/// the `left` part first. Results are ordered by `left` element, then by
/// `right` element. If either slice is empty, so is the result.
fn combine(left: &[Variant], right: &[Variant]) -> Vec<Variant> {
    left.iter()
        .flat_map(|prefix| {
            right.iter().map(move |suffix| Variant {
                text: [prefix.text.as_str(), suffix.text.as_str()].concat(),
                groups: prefix
                    .groups
                    .iter()
                    .chain(&suffix.groups)
                    .cloned()
                    .collect(),
            })
        })
        .collect()
}
fn should_expand_positional_group(variants: &[Variant]) -> bool {
variants.len() > 1
&& variants.iter().all(|variant| {
!variant.text.is_empty() && variant.groups.is_empty() && !variant.text.contains("%(")
})
}
/// Makes a set of variants optional by putting an empty variant in front of
/// them. The given variants follow in their original order.
fn optional(variants: Vec<Variant>) -> Vec<Variant> {
    std::iter::once(Variant::empty()).chain(variants).collect()
}
/// Concatenates `variants` with itself `count` times.
///
/// Starts from a single empty variant and combines it with `variants` once
/// per repetition, so `count == 0` gives just the empty variant.
fn repeat(variants: Vec<Variant>, count: usize) -> Vec<Variant> {
    (0..count).fold(vec![Variant::empty()], |accumulated, _| {
        combine(&accumulated, &variants)
    })
}
/// Returns the compiled regex for `pattern`, compiling it at most once per
/// distinct pattern string and caching the result for the life of the process.
///
/// Each newly cached regex is intentionally leaked with `Box::leak` so the
/// returned reference is `'static`.
///
/// # Panics
///
/// Panics if `pattern` is not a valid regular expression, or if the cache
/// lock has been poisoned by a panic in another thread.
#[must_use]
pub fn lazy_re_compile(pattern: &str) -> &'static Regex {
    // Fast path: most calls hit the cache and only need the shared read lock.
    if let Some(compiled) = REGEX_CACHE
        .read()
        .expect("regex cache read lock poisoned")
        .get(pattern)
        .copied()
    {
        return compiled;
    }
    // Compile before taking the write lock. If the pattern is invalid, the
    // panic then happens with no guard held, so the lock is not poisoned and
    // later calls with valid patterns keep working.
    let compiled = Regex::new(pattern).expect("invalid regex pattern");
    let mut cache = REGEX_CACHE
        .write()
        .expect("regex cache write lock poisoned");
    // Another thread may have inserted the same pattern in the meantime; keep
    // its entry so every caller gets the same reference.
    if let Some(existing) = cache.get(pattern).copied() {
        return existing;
    }
    let leaked: &'static Regex = Box::leak(Box::new(compiled));
    cache.insert(pattern.to_owned(), leaked);
    leaked
}
/// Expands `regex` into every template it can be reversed into.
///
/// Each entry pairs a template string, in which capture groups appear as
/// `%(name)s` placeholders, with the group names recorded for that template.
/// The result always contains at least one entry.
#[must_use]
pub fn normalize(regex: &str) -> Vec<(String, Vec<String>)> {
    let mut normalized = Vec::new();
    for variant in Parser::new(regex).parse() {
        normalized.push((variant.text, variant.groups));
    }
    normalized
}
/// Expands to an expression that yields a reference to a lazily compiled regex.
///
/// Each expansion declares its own `static RE` (a `LazyLock<regex::Regex>`)
/// inside a block and evaluates to `&*RE`, so every call site has its own
/// compiled instance.
///
/// If compilation fails, the `expect` panics with a message built by
/// `concat!("invalid regex: ", $pattern)`. Because `$pattern` is passed to
/// `concat!`, it needs to be something `concat!` accepts (a literal).
#[macro_export]
macro_rules! lazy_regex {
($pattern:expr) => {{
// One static per expansion site; initialised on first dereference.
static RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
regex::Regex::new($pattern).expect(concat!("invalid regex: ", $pattern))
});
&*RE
}};
}
// Unit tests for `normalize`, `lazy_re_compile` and the `lazy_regex!` macro.
#[cfg(test)]
mod tests {
use super::{lazy_re_compile, normalize};
// Anchors (`^`, `$`) produce no text; ordinary characters are kept verbatim.
#[test]
fn normalize_strips_anchors_and_preserves_literals() {
assert_eq!(
normalize(r"^articles/2024/$"),
vec![("articles/2024/".to_string(), vec![])]
);
}
// A named group becomes a `%(name)s` placeholder; the optional trailing `/`
// yields one variant without it and one with it, in that order.
#[test]
fn normalize_expands_named_groups_and_optionals() {
assert_eq!(
normalize(r"^users/(?P<slug>[a-z]+)/?$"),
vec![
("users/%(slug)s".to_string(), vec!["slug".to_string()]),
("users/%(slug)s/".to_string(), vec!["slug".to_string()]),
]
);
}
// An unnamed group made only of plain-text alternatives is expanded into one
// variant per alternative instead of becoming a placeholder.
#[test]
fn normalize_expands_alternation() {
assert_eq!(
normalize(r"^(foo|bar)/(?P<id>\d+)$"),
vec![
("foo/%(id)s".to_string(), vec!["id".to_string()]),
("bar/%(id)s".to_string(), vec!["id".to_string()]),
]
);
}
// `(?P=name)` repeats the placeholder without adding a group; the unnamed
// group is named `_0`.
#[test]
fn normalize_supports_positional_groups_and_backreferences() {
assert_eq!(
normalize(r"^(?P<name>.*)-(?P=name)-(.+)$"),
vec![(
"%(name)s-%(name)s-%(_0)s".to_string(),
vec!["name".to_string(), "_0".to_string()],
)]
);
}
// An empty pattern still yields exactly one (empty) variant.
#[test]
fn normalize_handles_empty_patterns() {
assert_eq!(normalize(r""), vec![("".to_string(), vec![])]);
}
// Escaped metacharacters are emitted as the bare character.
#[test]
fn normalize_preserves_escaped_literals() {
assert_eq!(
normalize(r"\\\^\$\.\|\?\*\+\(\)\["),
vec![(r"\^$.|?*+()[".to_string(), vec![])]
);
}
// Unnamed groups are numbered in order of appearance: `_0`, `_1`.
#[test]
fn normalize_supports_multiple_positional_groups() {
assert_eq!(
normalize(r"(.*)-(.+)"),
vec![(
"%(_0)s-%(_1)s".to_string(),
vec!["_0".to_string(), "_1".to_string()],
)]
);
}
// `(?:...)` contributes its contents directly and records no group.
#[test]
fn normalize_flattens_noncapturing_groups() {
assert_eq!(
normalize(r"(?:non-capturing)"),
vec![("non-capturing".to_string(), vec![])]
);
}
// Several named groups are recorded in the order they appear.
#[test]
fn normalize_supports_multiple_named_groups() {
assert_eq!(
normalize(r"(?P<first_group_name>.*)-(?P<second_group_name>.*)"),
vec![(
"%(first_group_name)s-%(second_group_name)s".to_string(),
vec![
"first_group_name".to_string(),
"second_group_name".to_string(),
],
)]
);
}
// A back-reference reuses the name in the text but the group list holds it once.
#[test]
fn normalize_reuses_named_backreferences_without_new_groups() {
assert_eq!(
normalize(r"(?P<first_group_name>.*)-(?P=first_group_name)"),
vec![(
"%(first_group_name)s-%(first_group_name)s".to_string(),
vec!["first_group_name".to_string()],
)]
);
}
// Two calls with the same pattern must return the very same `Regex` (pointer
// equality), and that regex must behave as the pattern says.
#[test]
fn lazy_re_compile_caches_by_pattern() {
let first = lazy_re_compile(r"^foo$");
let second = lazy_re_compile(r"^foo$");
assert!(std::ptr::eq(first, second));
assert!(first.is_match("foo"));
assert!(!first.is_match("foobar"));
}
// The macro is wrapped in a function so both calls go through the same
// expansion site and therefore the same static.
#[test]
fn lazy_regex_macro_returns_static_regex() {
fn compiled() -> &'static regex::Regex {
crate::lazy_regex!(r"^bar$")
}
let first = compiled();
let second = compiled();
assert!(std::ptr::eq(first, second));
assert!(first.is_match("bar"));
assert!(!first.is_match("bars"));
}
}