use std::collections::HashMap;
use crate::helpers::{TrailError, consume_lexeme, format_error, get_env, is_escaped, peek};
enum SplitMarker {
GROUP { index: usize },
INTERVAL { index: usize },
CLASS { index: usize },
}
pub fn rex_split(regex: &str) -> Result<Vec<String>, TrailError> {
let mut lexemes: Vec<String> = Vec::new();
let mut is_char_class = false;
let mut is_intv_quant = false;
let mut accumulate: Vec<char> = Vec::new();
let mut markers: Vec<SplitMarker> = Vec::new();
let mut i = 0;
let chars: Vec<char> = regex.chars().collect();
while i < chars.len() {
let curr = chars[i];
if curr == '[' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else {
let next = peek(&chars, i, 1);
if next == ']' {
lexemes.push(String::new());
} else {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
markers.push(SplitMarker::CLASS { index: i });
is_char_class = true;
}
}
} else if curr == ']' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else {
if is_char_class {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
let marker = markers
.pop()
.expect("Expected a `SplitMarker`, found `None` instead.");
assert!(
matches!(marker, SplitMarker::CLASS { .. }),
"`MarkerKind.CLASS` was not found."
);
is_char_class = false;
} else {
let context: String = chars[..i].iter().collect();
return Err(TrailError(format_error(
"Unmatched closing bracket `]`.",
&context,
"]",
)));
}
}
}
else if curr == '{' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else {
let (prev, next) = (peek(&chars, i, -1), peek(&chars, i, 1));
if matches!(prev, '*' | '+' | '?' | '}' | '(') {
let context: String = chars[..i - 1].iter().collect();
let source = format!("{}{}", prev, "{");
return Err(TrailError(format_error(
"Invalid quantifier precedence.",
&context,
&source,
)));
} else if prev == '\0' {
return Err(TrailError(format_error(
"Interval quantifiers must be preceded by either an expression or a group.",
"",
"{",
)));
} else if next == '}' {
let context: String = chars[..i].iter().collect();
return Err(TrailError(format_error(
"Invalid interval quantifier.",
&context,
"{}",
)));
} else {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
markers.push(SplitMarker::INTERVAL { index: i });
is_intv_quant = true;
}
}
} else if curr == '}' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else {
if is_intv_quant {
consume_lexeme(&mut lexemes, &mut accumulate);
let length = lexemes.len();
let (bracket, quantifier) = (lexemes.get(length - 2), lexemes.get(length - 1));
if let (Some(bracket), Some(quantifier)) = (bracket, quantifier) {
let parts: Vec<&str> =
quantifier.split(",").map(|part| part.trim()).collect();
if bracket == "{"
&& parts.len() <= 2
&& parts
.iter()
.all(|x| x.is_empty() || x.parse::<u32>().is_ok())
{
if parts.len() == 2 && parts.iter().all(|x| x.parse::<u32>().is_ok()) {
let (low, up) = (
parts[0].parse::<u32>().unwrap(),
parts[1].parse::<u32>().unwrap(),
);
if low > up {
let context: String =
chars[..i - quantifier.len() - 1].iter().collect();
let source = format!("{}{}{}", "{", quantifier, "}");
return Err(TrailError(format_error(
"Interval quantifier bounds out of order.",
&context,
&source,
)));
}
}
lexemes.push(curr.to_string());
} else {
let context: String = chars[..i - bracket.len() - quantifier.len()]
.iter()
.collect();
let source = format!("{}{}{}", bracket, quantifier, "}");
return Err(TrailError(format_error(
"Invalid interval quantifier.",
&context,
&source,
)));
}
let marker = markers
.pop()
.expect("Expected a `SplitMarker`, found `None` instead.");
assert!(
matches!(marker, SplitMarker::INTERVAL { .. }),
"`MarkerKind.INTERVAL` was not found."
);
is_intv_quant = false;
} else {
let context: String = chars[..i].iter().collect();
return Err(TrailError(format_error(
"Invalid interval quantifier.",
&context,
"}",
)));
}
} else {
let context: String = chars[..i].iter().collect();
return Err(TrailError(format_error(
"Unmatched closing bracket `}`.",
&context,
"}",
)));
}
}
}
else if curr == '(' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
markers.push(SplitMarker::GROUP { index: i })
}
} else if curr == ')' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else if !markers.is_empty() {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
let marker = markers
.pop()
.expect("Expected a `SplitMarker`, found `None` instead.");
assert!(
matches!(marker, SplitMarker::GROUP { .. }),
"`MarkerKind.GROUP` was not found."
);
} else {
let context: String = chars[..i].iter().collect();
return Err(TrailError(format_error(
"Unmatched closing bracket `)`.",
&context,
")",
)));
}
}
else if curr.is_ascii_digit() {
if is_escaped(&chars, i) {
let context: String = chars[..i - 1].iter().collect();
let source = format!("\\{}", curr);
return Err(TrailError(format_error(
"Unsupported backreference format, use `(?<alphanumeric>...)` to capture, and `$<alphanumeric>` to load instead.",
&context,
&source,
)));
} else {
accumulate.push(curr);
}
} else if curr == '?' {
if is_escaped(&chars, i) {
lexemes.push(curr.to_string());
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else {
let (prev, next) = (peek(&chars, i, -1), peek(&chars, i, 1));
if prev == '\0' {
return Err(TrailError(format_error(
"Quantifiers must be preceded by either an expression or a group.",
"",
&curr.to_string(),
)));
} else if matches!(prev, '*' | '+' | '?' | '}') {
let context: String = chars[..i - 1].iter().collect();
return Err(TrailError(format_error(
"Invalid quantifier precedence.",
&context,
&curr.to_string(),
)));
} else if prev == '(' && !is_escaped(&chars, i) {
let skip = peek(&chars, i, 2);
if next == '<' && !matches!(skip, '=' | '!') {
consume_lexeme(&mut lexemes, &mut accumulate);
consume_reference(&mut lexemes, &mut i, &chars)?;
} else {
let context: String = chars[..i - 1].iter().collect();
let source = format!("{}{}{}", prev, curr, next);
return Err(TrailError(format_error(
"Unsupported question-mark construct.",
&context,
&source,
)));
}
} else {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
}
}
} else if curr == '$' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else {
let next = peek(&chars, i, 1);
if matches!(next, '\0' | '|') {
} else if next == '<' {
consume_lexeme(&mut lexemes, &mut accumulate);
consume_reference(&mut lexemes, &mut i, &chars)?;
} else {
let context: String = chars[..i].iter().collect();
return Err(TrailError(format_error(
"Misplaced anchor, escape with `\\` for literal dollar sign.",
&context,
"$",
)));
}
}
} else if curr == '^' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else {
let (prev, next) = (peek(&chars, i, -1), peek(&chars, i, 1));
if is_char_class {
if prev == '[' && !is_escaped(&chars, i - 1) {
if next == ']' {
lexemes.push(String::new());
}
assert!(accumulate.is_empty(), "`accumulate` expected to be empty.");
lexemes.push(curr.to_string());
} else {
accumulate.extend(['\\', curr]);
}
} else {
if matches!(prev, '\0' | '|') {
} else {
let context: String = chars[..i].iter().collect();
return Err(TrailError(format_error(
"Misplaced anchor, escape with `\\` for literal caret.",
&context,
"^",
)));
}
}
}
} else if curr == '-' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else {
if is_char_class {
let (prev, next) = (peek(&chars, i, -1), peek(&chars, i, 1));
if next == ']' {
accumulate.extend(['\\', curr]);
} else {
if !accumulate.is_empty() {
if prev.is_ascii_alphanumeric() && next.is_ascii_alphanumeric() {
let (start, end) = (prev as u32, next as u32);
if start <= end {
if next == ']' {
accumulate.extend(['\\', curr]);
} else {
accumulate.pop();
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.extend([
prev.to_string(),
curr.to_string(),
next.to_string(),
]);
i += 1;
}
} else {
let header = format!(
"Character range '{prev}-{next}' is invalid: start must be less than or equal to end."
);
let context: String = chars[..i - 1].iter().collect();
let source = format!("{}{}{}", prev, curr, next);
return Err(TrailError(format_error(
&header, &context, &source,
)));
}
} else {
let context: String = chars[..i - 1].iter().collect();
let source = format!("{}-{}", prev, next);
return Err(TrailError(format_error(
"Range contains non-alphanumeric characters.",
&context,
&source,
)));
}
} else {
accumulate.extend(['\\', curr]);
}
}
} else {
accumulate.extend(['\\', curr]);
}
}
} else if curr == '|' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
}
} else if curr == '.' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
}
}
else if matches!(curr, '+' | '*') {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else if is_char_class {
accumulate.extend(['\\', curr]);
} else {
let prev = peek(&chars, i, -1);
if matches!(prev, '*' | '+' | '?' | '}' | '(') {
let context: String = chars[..i - 1].iter().collect();
let source = format!("{}{}", prev, curr);
return Err(TrailError(format_error(
"Invalid quantifier precedence.",
&context,
&source,
)));
} else if prev == '\0' {
return Err(TrailError(format_error(
"Quantifiers must be preceded by either an expression or a group.",
"",
&curr.to_string(),
)));
} else {
consume_lexeme(&mut lexemes, &mut accumulate);
lexemes.push(curr.to_string());
}
}
}
else if matches!(curr, 'd' | 'D' | 'w' | 'W' | 's' | 'S') {
if is_escaped(&chars, i) {
accumulate.pop();
consume_lexeme(&mut lexemes, &mut accumulate);
accumulate.extend(['\\', curr]);
} else {
accumulate.push(curr);
}
}
else if matches!(curr, 'b' | 'B') {
if is_escaped(&chars, i) {
let context: String = chars[..i - 1].iter().collect();
let source = format!("\\{}", curr);
return Err(TrailError(format_error(
"Unsupported predefined escape character.",
&context,
&source,
)));
}
}
else if curr == '\\' {
if is_escaped(&chars, i) {
accumulate.push(curr);
} else {
let next = peek(&chars, i, 1);
if ESCAPABLE.contains(&next) {
accumulate.push(curr);
} else if next.is_ascii_digit() || next == '/' {
} else {
let context: String = chars[..i].iter().collect();
let source = format!("\\{}", next);
return Err(TrailError(format_error(
"Invalid escape character.",
&context,
&source,
)));
}
}
}
else if curr == '\0' {
let context: String = chars[..i - 1].iter().collect();
return Err(TrailError(format_error(
"Reserved null character `\\0`.",
&context,
&curr.to_string(),
)));
} else {
accumulate.push(curr);
}
i += 1;
}
consume_lexeme(&mut lexemes, &mut accumulate);
if let Some(marker) = markers.pop() {
match marker {
SplitMarker::GROUP { index } => {
let context: String = chars[..index].iter().collect();
return Err(TrailError(format_error(
"Unmatched '(' - expected a closing ')'.",
&context,
"(",
)));
}
SplitMarker::CLASS { index } => {
let context: String = chars[..index].iter().collect();
return Err(TrailError(format_error(
"Unmatched '[' - expected a closing ']'.",
&context,
"[",
)));
}
SplitMarker::INTERVAL { index } => {
let context: String = chars[..index].iter().collect();
return Err(TrailError(format_error(
"Unmatched '{' - expected a closing '}'.",
&context,
"{",
)));
}
}
}
return Ok(lexemes);
}
pub fn rex_expand(chunks: Vec<String>) -> Vec<String> {
let digits = expand_ascii('0', '9');
let non_digits = digits
.clone()
.pipe(|s| exclude_from_ascii(&s))
.pipe(|s| include_escapes(&s));
let word = format!(
"{}{}{}_",
expand_ascii('0', '9'),
expand_ascii('a', 'z'),
expand_ascii('A', 'Z')
);
let non_word = word
.clone()
.pipe(|s| exclude_from_ascii(&s))
.pipe(|s| include_escapes(&s));
let space = " \t\n\r\u{000C}\u{000B}".to_string();
let non_space = String::from(" ")
.pipe(|s| exclude_from_ascii(&s))
.pipe(|s| include_escapes(&s));
let wildcard = String::from("\n")
.pipe(|s| exclude_from_ascii(&s))
.pipe(|s| include_escapes(&s));
let mut lexemes: Vec<String> = Vec::new();
let mut i = 0;
let mut is_char_class = false;
let mut is_complement = false;
while i < chunks.len() {
let chunk = chunks[i].clone();
if chunk == "[" {
is_char_class = true;
lexemes.push(chunk);
} else if chunk == "]" {
is_char_class = false;
let mut assembled = Vec::new();
while let Some(lexeme) = lexemes.pop() {
if lexeme == "[" {
break;
}
assembled.push(lexeme);
}
assert!(!assembled.is_empty(), "Character class is empty.");
let mut chunk: String = assembled.into_iter().rev().collect();
if is_complement {
chunk = chunk
.pipe(|s| exclude_escapes(&s))
.pipe(|s| exclude_from_ascii(&s))
.pipe(|s| include_escapes(&s));
is_complement = false;
}
lexemes.extend(["[".to_string(), chunk, "]".to_string()]);
} else if chunk == "^" {
is_complement = true;
}
else if chunk == "-" {
let start = lexemes
.pop()
.expect("Range `-` must be preceded by a bound.")
.chars()
.collect::<Vec<char>>()[0];
let end: char = chunks
.get(i + 1)
.expect("Range `-` must be succeeded by a bound.")
.chars()
.collect::<Vec<char>>()[0];
lexemes.push(expand_ascii(start, end));
i += 1;
}
else if chunk == "\\d" {
if is_char_class {
lexemes.push(digits.clone());
} else {
lexemes.extend(["[".to_string(), digits.clone(), "]".to_string()]);
}
} else if chunk == "\\D" {
if is_char_class {
lexemes.push(non_digits.clone());
} else {
lexemes.extend(["[".to_string(), non_digits.clone(), "]".to_string()]);
}
} else if chunk == "\\w" {
if is_char_class {
lexemes.push(word.clone());
} else {
lexemes.extend(["[".to_string(), word.clone(), "]".to_string()]);
}
} else if chunk == "\\W" {
if is_char_class {
lexemes.push(non_word.clone());
} else {
lexemes.extend(["[".to_string(), non_word.clone(), "]".to_string()]);
}
} else if chunk == "\\s" {
if is_char_class {
lexemes.push(space.clone());
} else {
lexemes.extend(["[".to_string(), space.clone(), "]".to_string()]);
}
} else if chunk == "\\S" {
if is_char_class {
lexemes.push(non_space.clone());
} else {
lexemes.extend(["[".to_string(), non_space.clone(), "]".to_string()]);
}
} else if chunk == "." {
lexemes.extend(["[".to_string(), wildcard.clone(), "]".to_string()]);
} else {
lexemes.push(chunk);
}
i += 1;
}
return lexemes;
}
pub fn rex_norm(chunks: Vec<String>) -> Vec<String> {
let quantifiers: HashMap<&str, (Vec<&str>, Vec<&str>)> = HashMap::from([
("?", (vec!["["], vec!["]"])),
("+", (vec!["{"], vec!["}"])),
("*", (vec!["{", "["], vec!["]", "}"])),
]);
let mut lexemes: Vec<String> = Vec::new();
let mut i = 0;
while i < chunks.len() {
let chunk = chunks[i].clone();
if chunk == "[" {
let (next, skip) = (
chunks.get(i + 1).expect("Character class is empty."),
chunks.get(i + 2).expect("Character class is empty."),
);
assert_eq!(skip, "]", "Expected assembled character class.");
if get_env("TEST_MODE", false) {
lexemes.push("(".to_string());
let unions = exclude_escapes(next)
.chars()
.map(|c| format!("\"{}\"", c))
.collect::<Vec<_>>()
.join("|");
lexemes.push(unions);
lexemes.push(")".to_string());
} else {
lexemes.push("(".to_string());
let mut unions: Vec<String> = exclude_escapes(next)
.chars()
.flat_map(|c| [format!("\"{}\"", c), "|".to_string()])
.collect();
lexemes.extend(unions.drain(..unions.len() - 1));
lexemes.push(")".to_string());
}
i += 2
}
else if matches!(&*chunk, "*" | "+" | "?") {
assert!(
!lexemes.is_empty(),
"Quantifiers must be preceded by either a expression or a group."
);
let prev = lexemes
.pop()
.expect("Quantifiers must be preceded by either a expression or a group.");
let (open_br, close_br) = &quantifiers[&*chunk];
if prev == ")" {
let mut assembled = Vec::new();
let mut depth = 0;
while let Some(chunk) = lexemes.pop() {
if chunk == "(" && depth == 0 {
break;
}
depth += if chunk == ")" {
1
} else if chunk == "(" {
-1
} else {
0
};
assembled.push(chunk);
}
assembled = assembled.into_iter().rev().collect();
lexemes.extend(open_br.iter().map(|&s| s.to_string()));
lexemes.extend(assembled);
lexemes.extend(close_br.iter().map(|&s| s.to_string()));
} else {
assert!(
prev.starts_with("\"") && prev.len() == 3 && prev.ends_with("\""),
"Quantifier was not preceded by a terminal character."
);
lexemes.extend(open_br.iter().map(|&s| s.to_string()));
lexemes.push(prev);
lexemes.extend(close_br.iter().map(|&s| s.to_string()));
}
}
else if chunk == "{" {
let prev = lexemes
.pop()
.expect("Interval quantifiers must be preceded by either a expression or a group.");
let next: Vec<&str> = chunks
.get(i + 1)
.expect("Interval quantifiers must have a content.")
.split(",")
.collect();
let args = match next.len() {
2 => (next[0], next[1]),
_ => (next[0], next[0]),
};
let (req, max) = (
args.0.parse::<u32>().unwrap_or(0),
args.1.parse::<u32>().unwrap_or(0),
);
let opt: i32 = (max as i32) - (req as i32);
if prev == ")" {
let mut assembled = Vec::new();
let mut depth = 0;
while let Some(chunk) = lexemes.pop() {
if chunk == "(" && depth == 0 {
break;
}
depth += if chunk == ")" {
1
} else if chunk == "(" {
-1
} else {
0
};
assembled.push(chunk);
}
assembled = assembled.into_iter().rev().collect();
for _ in 0..req {
lexemes.push("(".to_string());
lexemes.extend(assembled.iter().cloned());
lexemes.push(")".to_string());
}
for _ in 0..opt {
lexemes.push("[".to_string());
lexemes.extend(assembled.iter().cloned());
lexemes.push("]".to_string());
}
if opt < 0 {
lexemes.extend(["{".to_string(), "[".to_string()]);
lexemes.extend(assembled.iter().cloned());
lexemes.extend(["]".to_string(), "}".to_string()]);
}
} else {
assert!(
prev.starts_with("\"") && prev.len() == 3 && prev.ends_with("\""),
"Quantifier was not preceded by a terminal character."
);
lexemes.extend(std::iter::repeat(prev.clone()).take(req as usize));
for _ in 0..opt {
lexemes.push("[".to_string());
lexemes.push(prev.clone());
lexemes.push("]".to_string());
}
if opt < 0 {
lexemes.extend(["{".to_string(), "[".to_string()]);
lexemes.push(prev.clone());
lexemes.extend(["]".to_string(), "}".to_string()]);
}
}
i += 2;
} else {
assert!(
!"[]{}.-^$+*?".contains(&chunk),
"Invalid symbols at normalization pass."
);
if matches!(&*chunk, "(" | ")" | "|") || is_reference(&chunk) {
lexemes.push(chunk);
} else {
let chunk = exclude_escapes(&chunk);
if matches!(chunks.get(i + 1), Some(s) if ["{", "*", "?", "+"].contains(&s.as_str()))
{
let size = chunk.len();
if size == 1 {
lexemes.push(format!("\"{}\"", chunk));
} else {
lexemes.extend([
format!("\"{}\"", &chunk[..size - 1]),
format!("\"{}\"", &chunk[size - 1..]),
]);
}
} else {
lexemes.push(format!("\"{}\"", chunk));
}
}
}
i += 1;
}
return lexemes;
}
pub fn rex_parse(regex: &str) -> Result<Vec<String>, TrailError> {
Ok(rex_norm(rex_expand(rex_split(regex)?)))
}
const ESCAPABLE: &[char] = &[
'(', ')', '[', ']', '{', '}', 'd', 'D', 'w', 'W', 's', 'S', '|', '+', '*', '?', '-', '^', '$',
'.',
];
trait Pipe: Sized {
fn pipe<F, R>(self, f: F) -> R
where
F: FnOnce(Self) -> R,
{
f(self)
}
}
impl Pipe for String {}
fn consume_reference(
lexemes: &mut Vec<String>,
index: &mut usize,
chars: &Vec<char>,
) -> Result<(), TrailError> {
let (prefix, bracket) = (chars[*index], chars[*index + 1]);
let mut reference: Vec<char> = Vec::new();
reference.extend([prefix, bracket]);
*index += 2;
let mut curr = chars[*index];
while curr != '>' {
if curr.is_ascii_alphanumeric() || curr == '_' {
reference.push(curr);
} else {
let context: String = chars[..*index].iter().collect();
let source = curr.to_string();
return Err(TrailError(format_error(
"Reference name must be a word.",
&context,
&source,
)));
}
*index += 1;
curr = chars[*index]
}
reference.push(chars[*index]);
consume_lexeme(lexemes, &mut reference);
return Ok(());
}
const CONTEXT: &str = "()[]{}|.*?+-";
fn include_escapes(exp: &str) -> String {
exp.chars()
.flat_map(|c| {
if CONTEXT.contains(c) {
vec!['\\', c]
} else {
vec![c]
}
})
.collect()
}
fn expand_ascii(start: char, end: char) -> String {
(start..=end).collect()
}
const PRINTABLE: &str = "0123456789\
abcdefghijklmnopqrstuvwxyz\
ABCDEFGHIJKLMNOPQRSTUVWXYZ\
!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \
\t\n\r\x0b\x0c";
fn exclude_from_ascii(exp: &str) -> String {
PRINTABLE.chars().filter(|&c| !exp.contains(c)).collect()
}
fn exclude_escapes(exp: &str) -> String {
let mut result = Vec::new();
for c in exp.chars() {
if CONTEXT.contains(c) && !result.is_empty() {
result.pop();
}
result.push(c);
}
result.into_iter().collect()
}
fn is_reference(reference: &String) -> bool {
(reference.starts_with("?<") || reference.starts_with("$<")) && reference.ends_with(">")
}