use std::cell::Cell;
use std::collections::HashMap;
use bitflags::bitflags;
use regress::{Flags as RegressFlags, Regex};
use crate::builtins::string::{decode_utf16, encode_utf16};
use crate::error::{StatorError, StatorResult};
bitflags! {
/// Set of regular-expression flags, stored as one bit per flag letter.
///
/// The letter that corresponds to each bit is fixed by `RegExpFlags::parse`
/// and `RegExpFlags::to_flags_string`.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub struct RegExpFlags: u8 {
/// `g` — together with `STICKY`, makes `exec` start at and update `last_index`.
const GLOBAL = 0b0000_0001;
/// `i` — forwarded to the engine as `icase`.
const IGNORE_CASE = 0b0000_0010;
/// `m` — forwarded to the engine as `multiline`.
const MULTILINE = 0b0000_0100;
/// `s` — forwarded to the engine as `dot_all`.
const DOT_ALL = 0b0000_1000;
/// `u` — forwarded to the engine as `unicode`; cannot be combined with `v`.
const UNICODE = 0b0001_0000;
/// `v` — forwarded to the engine as `unicode_sets` (and also turns `unicode` on).
const UNICODE_SETS = 0b0010_0000;
/// `y` — a match is only accepted when it begins exactly at `last_index`.
const STICKY = 0b0100_0000;
/// `d` — match results carry start/end offsets (`RegExpMatch::indices`).
const HAS_INDICES = 0b1000_0000;
}
}
impl RegExpFlags {
    /// Parses a flag string such as `"gi"` into a flag set.
    ///
    /// Letters may appear in any order.
    ///
    /// # Errors
    ///
    /// Returns [`StatorError::SyntaxError`] when `flags` contains a letter
    /// other than `d`, `g`, `i`, `m`, `s`, `u`, `v` or `y`, when a letter is
    /// repeated, or when both `u` and `v` are present.
    pub fn parse(flags: &str) -> StatorResult<Self> {
        let mut seen = Self::empty();
        for ch in flags.chars() {
            let recognised = match ch {
                'd' => Some(Self::HAS_INDICES),
                'g' => Some(Self::GLOBAL),
                'i' => Some(Self::IGNORE_CASE),
                'm' => Some(Self::MULTILINE),
                's' => Some(Self::DOT_ALL),
                'u' => Some(Self::UNICODE),
                'v' => Some(Self::UNICODE_SETS),
                'y' => Some(Self::STICKY),
                _ => None,
            };
            let Some(flag) = recognised else {
                return Err(StatorError::SyntaxError(format!(
                    "Invalid regular expression flags: '{ch}'"
                )));
            };
            if seen.contains(flag) {
                return Err(StatorError::SyntaxError(format!(
                    "Duplicate regular expression flag: '{ch}'"
                )));
            }
            seen |= flag;
        }

        // The two Unicode modes are mutually exclusive.
        let has_u = seen.contains(Self::UNICODE);
        let has_v = seen.contains(Self::UNICODE_SETS);
        if has_u && has_v {
            Err(StatorError::SyntaxError(
                "Regular expression flags 'u' and 'v' cannot be combined".to_string(),
            ))
        } else {
            Ok(seen)
        }
    }

    /// Renders the set as flag letters in the fixed order `dgimsuvy`,
    /// regardless of the order in which the flags were originally written.
    pub fn to_flags_string(self) -> String {
        let canonical = [
            (Self::HAS_INDICES, 'd'),
            (Self::GLOBAL, 'g'),
            (Self::IGNORE_CASE, 'i'),
            (Self::MULTILINE, 'm'),
            (Self::DOT_ALL, 's'),
            (Self::UNICODE, 'u'),
            (Self::UNICODE_SETS, 'v'),
            (Self::STICKY, 'y'),
        ];
        let mut letters = String::with_capacity(8);
        for &(flag, letter) in canonical.iter() {
            if self.contains(flag) {
                letters.push(letter);
            }
        }
        letters
    }
}
/// Details of one successful match.
///
/// Offsets stored here are byte offsets into `input` (they are used directly
/// to slice the `&str`).
#[derive(Debug, Clone, PartialEq)]
pub struct RegExpMatch {
/// Text of the whole match.
pub matched: String,
/// Text of each numbered capture group, in order; element 0 is group 1.
/// `None` marks a group that did not participate in the match.
pub captures: Vec<Option<String>>,
/// Text of each named capture group, keyed by group name.
/// `None` marks a group that did not participate in the match.
pub named_groups: HashMap<String, Option<String>>,
/// Byte offset in `input` at which the match starts.
pub index: usize,
/// Owned copy of the string that was searched.
pub input: String,
/// Start/end offsets of the match and its groups; `None` unless the match
/// was built with indices requested.
pub indices: Option<MatchIndices>,
}
/// Start/end offsets recorded for a match when indices are requested.
#[derive(Debug, Clone, PartialEq)]
pub struct MatchIndices {
/// `(start, end)` pairs: element 0 is the whole match, followed by one entry
/// per numbered capture group (`None` for a group that did not participate).
pub pairs: Vec<Option<(usize, usize)>>,
/// `(start, end)` per named capture group; groups that did not participate
/// have no entry.
pub groups: HashMap<String, (usize, usize)>,
}
/// A compiled regular expression together with its source text, flags and
/// mutable `last_index` cursor.
pub struct JsRegExp {
// Pattern text exactly as passed to `new`.
pattern: String,
// Parsed flag set.
flags: RegExpFlags,
// Pattern compiled by the `regress` engine.
compiled: Regex,
// Position at which the next global/sticky `exec` starts. Kept in a `Cell`
// so matching methods can update it through `&self`.
last_index: Cell<usize>,
}
/// Debug output shows the pattern, flags and current `last_index`; the
/// compiled engine state is intentionally left out.
impl std::fmt::Debug for JsRegExp {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let current_last_index = self.last_index.get();
        let mut out = f.debug_struct("JsRegExp");
        out.field("pattern", &self.pattern);
        out.field("flags", &self.flags);
        out.field("last_index", &current_last_index);
        out.finish()
    }
}
impl JsRegExp {
/// Compiles `pattern` with the given flag letters.
///
/// Compilation runs inside `stacker::maybe_grow`, so additional stack is
/// made available if the current stack is close to exhausted. The new
/// regex starts with `last_index` at 0.
///
/// # Errors
///
/// Returns [`StatorError::SyntaxError`] when `flags` is rejected by
/// [`RegExpFlags::parse`] or when the engine cannot compile `pattern`.
pub fn new(pattern: &str, flags: &str) -> StatorResult<Self> {
    let flags = RegExpFlags::parse(flags)?;
    let regress_flags = build_regress_flags(flags);
    let compile = || Regex::with_flags(pattern, regress_flags);
    match stacker::maybe_grow(256 * 1024, 4 * 1024 * 1024, compile) {
        Ok(compiled) => Ok(Self {
            pattern: pattern.to_string(),
            flags,
            compiled,
            last_index: Cell::new(0),
        }),
        Err(e) => Err(StatorError::SyntaxError(format!(
            "Invalid regular expression: /{pattern}/: {e}"
        ))),
    }
}
/// Returns the pattern text exactly as it was passed to `new`.
pub fn pattern(&self) -> &str {
    self.pattern.as_str()
}
/// Returns the pattern in a form that can be placed between the slashes
/// of a `/…/` regular-expression literal.
///
/// * An empty pattern becomes `(?:)`.
/// * An unescaped `/` becomes `\/`; a `/` that is already preceded by an
///   escaping backslash is left alone, so `a\/b` is not turned into
///   `a\\/b` (which would mean something different).
/// * Raw line terminators (LF, CR, U+2028, U+2029) are written as `\n`,
///   `\r`, `\u2028` and `\u2029`, because a literal cannot contain them.
pub fn source_text(&self) -> String {
    if self.pattern.is_empty() {
        return "(?:)".to_string();
    }
    let mut source = String::with_capacity(self.pattern.len());
    // True while the previous character was a backslash that escapes the
    // current one (a doubled backslash clears it again).
    let mut escaped = false;
    for ch in self.pattern.chars() {
        match ch {
            '\\' => {
                source.push('\\');
                escaped = !escaped;
            }
            '/' => {
                if !escaped {
                    source.push('\\');
                }
                source.push('/');
                escaped = false;
            }
            '\n' | '\r' | '\u{2028}' | '\u{2029}' => {
                // If a backslash was already emitted, reuse it as the
                // start of the escape sequence.
                if !escaped {
                    source.push('\\');
                }
                source.push_str(match ch {
                    '\n' => "n",
                    '\r' => "r",
                    '\u{2028}' => "u2028",
                    _ => "u2029",
                });
                escaped = false;
            }
            _ => {
                source.push(ch);
                escaped = false;
            }
        }
    }
    source
}
/// Returns the parsed flag set of this regex (a copy; `RegExpFlags` is `Copy`).
pub fn flags(&self) -> RegExpFlags {
self.flags
}
/// Returns the current `last_index`, the position at which the next
/// global or sticky `exec` starts matching.
pub fn last_index(&self) -> usize {
self.last_index.get()
}
/// Overwrites `last_index`. Takes `&self` because the value lives in a `Cell`.
/// No validation is done here; `exec` handles positions past the end of its input.
pub fn set_last_index(&self, index: usize) {
self.last_index.set(index);
}
/// Returns `true` when `exec` finds a match in `input`.
///
/// Because this goes through `exec`, a global or sticky regex has its
/// `last_index` updated exactly as `exec` would update it.
pub fn test(&self, input: &str) -> bool {
self.exec(input).is_some()
}
/// Runs the regex against `input` and returns the match found, if any.
///
/// The actual work is done by `exec_inner`; it is invoked through
/// `stacker::maybe_grow` with the arguments `256 * 1024` and
/// `4 * 1024 * 1024` — presumably so that deeply nested matching does not
/// overflow the stack.
pub fn exec(&self, input: &str) -> Option<RegExpMatch> {
stacker::maybe_grow(256 * 1024, 4 * 1024 * 1024, || self.exec_inner(input))
}
/// Core of `exec`.
///
/// For a global or sticky regex the search starts at `last_index`; on
/// success `last_index` moves to the end of the match, on failure it is
/// reset to 0. Other regexes always search from 0 and never touch
/// `last_index`. A sticky regex only accepts a match that begins exactly
/// at the start position. The pattern `.` is routed to
/// `exec_dot_pattern`.
fn exec_inner(&self, input: &str) -> Option<RegExpMatch> {
    let sticky = self.flags.contains(RegExpFlags::STICKY);
    let tracks_last_index = self
        .flags
        .intersects(RegExpFlags::GLOBAL | RegExpFlags::STICKY);

    let start = if tracks_last_index {
        self.last_index.get()
    } else {
        0
    };
    if start > input.len() {
        if tracks_last_index {
            self.last_index.set(0);
        }
        return None;
    }

    if self.pattern == "." {
        return self.exec_dot_pattern(input, start, tracks_last_index);
    }

    let found = self
        .compiled
        .find_from(input, start)
        .next()
        .filter(|m| !sticky || m.start() == start);
    let Some(mat) = found else {
        if tracks_last_index {
            self.last_index.set(0);
        }
        return None;
    };

    if tracks_last_index {
        self.last_index.set(mat.end());
    }
    let with_indices = self.flags.contains(RegExpFlags::HAS_INDICES);
    Some(build_match(input, &mat, with_indices))
}
/// Matches the single-character pattern `.` on the UTF-16 code units of
/// `input`.
///
/// * Without `DOT_ALL`, line terminators are not matched.
/// * A sticky regex only considers the unit at `start`; any other regex
///   scans forward from `start` to the first unit `.` can match.
/// * In a Unicode mode (`u` or `v`) a high surrogate followed by a low
///   surrogate is consumed as one two-unit match; otherwise exactly one
///   unit is consumed.
///
/// When `is_stateful` is true, `last_index` is advanced past the match on
/// success and reset to 0 on every failure, including when `start` is at
/// or beyond the end of the input.
///
/// NOTE(review): `start` is used here as a UTF-16 unit index, while
/// `exec_inner` bounds it by the byte length of `input` — confirm which
/// unit `last_index` is meant to use for non-ASCII input.
/// NOTE(review): `indices` is always `None` on this path, even when the
/// `d` flag is set — confirm whether that is intended.
fn exec_dot_pattern(
    &self,
    input: &str,
    start: usize,
    is_stateful: bool,
) -> Option<RegExpMatch> {
    let units = encode_utf16(input);
    let dot_all = self.flags.contains(RegExpFlags::DOT_ALL);
    let sticky = self.flags.contains(RegExpFlags::STICKY);

    // Locate the first unit at or after `start` that `.` can match.
    let mut found = None;
    let mut pos = start;
    while let Some(&unit) = units.get(pos) {
        if dot_all || !is_line_terminator_code_unit(unit) {
            found = Some((pos, unit));
            break;
        }
        if sticky {
            // Sticky matching may not skip ahead.
            break;
        }
        pos += 1;
    }
    let Some((pos, first)) = found else {
        if is_stateful {
            self.last_index.set(0);
        }
        return None;
    };

    let unicode = self
        .flags
        .intersects(RegExpFlags::UNICODE | RegExpFlags::UNICODE_SETS);
    let low_surrogate = if unicode && (0xD800..=0xDBFF).contains(&first) {
        units
            .get(pos + 1)
            .copied()
            .filter(|unit| (0xDC00..=0xDFFF).contains(unit))
    } else {
        None
    };
    let matched_units = match low_surrogate {
        Some(second) => vec![first, second],
        None => vec![first],
    };

    if is_stateful {
        self.last_index.set(pos + matched_units.len());
    }
    Some(RegExpMatch {
        matched: decode_utf16(&matched_units),
        captures: Vec::new(),
        named_groups: HashMap::new(),
        index: utf16_index_to_byte_floor(input, pos),
        input: input.to_string(),
        indices: None,
    })
}
}
/// Formats the regex as a literal: `/source/flags`, using `source_text`
/// for the body and the canonical flag string for the suffix.
impl std::fmt::Display for JsRegExp {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let source = self.source_text();
        let flag_letters = self.flags.to_flags_string();
        write!(f, "/{}/{}", source, flag_letters)
    }
}
impl JsRegExp {
/// Implements the match operation.
///
/// A non-global regex returns the result of a single `exec` as
/// [`SymbolMatchResult::Single`]. A global regex resets `last_index` to
/// 0, collects the text of every match as [`SymbolMatchResult::All`] and
/// leaves `last_index` at 0 again. Returns `None` when nothing matched.
pub fn symbol_match(&self, input: &str) -> Option<SymbolMatchResult> {
    if !self.flags.contains(RegExpFlags::GLOBAL) {
        return self.exec(input).map(SymbolMatchResult::Single);
    }

    let sticky = self.flags.contains(RegExpFlags::STICKY);
    let mut found: Vec<String> = Vec::new();
    self.last_index.set(0);
    loop {
        let from = self.last_index.get();
        if from > input.len() {
            break;
        }
        let hit = self
            .compiled
            .find_from(input, from)
            .next()
            .filter(|m| !sticky || m.start() == from);
        let Some(mat) = hit else {
            break;
        };
        found.push(input[mat.range()].to_string());
        // Step past empty matches so the loop always makes progress.
        self.last_index
            .set(advance_after_match(input, from, mat.end()));
    }
    // Every way out of the loop leaves the cursor rewound.
    self.last_index.set(0);

    if found.is_empty() {
        None
    } else {
        Some(SymbolMatchResult::All(found))
    }
}
/// Implements the replace operation with a replacement template.
///
/// A non-global regex replaces the match returned by `exec` (or returns
/// `input` unchanged when there is none). A global regex resets
/// `last_index` to 0, replaces every match and leaves `last_index` at 0.
/// `$`-sequences in `replacement` are expanded by `apply_replacement`.
pub fn symbol_replace(&self, input: &str, replacement: &str) -> String {
    if !self.flags.contains(RegExpFlags::GLOBAL) {
        return match self.exec(input) {
            None => input.to_string(),
            Some(hit) => {
                let tail_start = hit.index + hit.matched.len();
                let mut out = String::new();
                out.push_str(&input[..hit.index]);
                out.push_str(&apply_replacement(replacement, &hit, input));
                out.push_str(&input[tail_start..]);
                out
            }
        };
    }

    let sticky = self.flags.contains(RegExpFlags::STICKY);
    let mut out = String::new();
    // End of the input prefix that has already been copied into `out`.
    let mut copied_to = 0_usize;
    self.last_index.set(0);
    loop {
        let from = self.last_index.get();
        if from > input.len() {
            break;
        }
        let hit = self
            .compiled
            .find_from(input, from)
            .next()
            .filter(|m| !sticky || m.start() == from);
        let Some(mat) = hit else {
            break;
        };
        let details = build_match(input, &mat, false);
        out.push_str(&input[copied_to..details.index]);
        out.push_str(&apply_replacement(replacement, &details, input));
        copied_to = mat.end();
        self.last_index
            .set(advance_after_match(input, from, copied_to));
    }
    self.last_index.set(0);
    out.push_str(&input[copied_to..]);
    out
}
/// Implements the search operation: returns the `index` of the first
/// match found by `exec` with `last_index` temporarily set to 0, or `-1`
/// when there is no match. The previous `last_index` is restored
/// before returning.
pub fn symbol_search(&self, input: &str) -> i64 {
    let previous = self.last_index.replace(0);
    let position = match self.exec(input) {
        Some(hit) => hit.index as i64,
        None => -1,
    };
    self.last_index.set(previous);
    position
}
/// Implements the split operation.
///
/// Splits `input` at every position where the regex matches, emitting the
/// text between separators followed by the separator's capture groups
/// (`None` for groups that did not participate). At most `limit` entries
/// are returned; `None` means no limit. An empty `input` yields an empty
/// vector if the regex matches the empty string and `[""]` otherwise.
/// `last_index` is neither read nor written.
pub fn symbol_split(&self, input: &str, limit: Option<usize>) -> Vec<Option<String>> {
    let max_parts = limit.unwrap_or(usize::MAX);
    if max_parts == 0 {
        return Vec::new();
    }

    if input.is_empty() {
        let matches_empty = match self.compiled.find(input) {
            Some(m) => m.start() == 0 && m.end() == 0,
            None => false,
        };
        return if matches_empty {
            Vec::new()
        } else {
            vec![Some(String::new())]
        };
    }

    let mut pieces: Vec<Option<String>> = Vec::new();
    // Start of the piece currently being accumulated.
    let mut piece_start = 0usize;
    // Position at which a separator match is being attempted.
    let mut cursor = 0usize;
    while cursor < input.len() {
        // Only a match anchored exactly at `cursor` counts as a separator.
        let anchored = self
            .compiled
            .find_from(input, cursor)
            .next()
            .filter(|m| m.start() == cursor);
        let mat = match anchored {
            // A match ending where the current piece starts is empty at the
            // piece boundary and is skipped like a miss.
            Some(mat) if mat.end() != piece_start => mat,
            _ => {
                cursor = advance_string_index(input, cursor);
                continue;
            }
        };

        pieces.push(Some(input[piece_start..cursor].to_string()));
        if pieces.len() >= max_parts {
            return pieces;
        }
        for group in &mat.captures {
            pieces.push(group.as_ref().map(|r| input[r.clone()].to_string()));
            if pieces.len() >= max_parts {
                return pieces;
            }
        }
        piece_start = mat.end();
        cursor = piece_start;
    }

    if pieces.len() < max_parts {
        pieces.push(Some(input[piece_start..].to_string()));
    }
    pieces
}
/// Collects every match in `input`, starting from the current
/// `last_index`.
///
/// Each result is a full [`RegExpMatch`]; offsets are included when the
/// `d` flag is set. A sticky regex stops at the first position where no
/// match begins. `last_index` is read once and never written.
pub fn symbol_match_all(&self, input: &str) -> Vec<RegExpMatch> {
    let sticky = self.flags.contains(RegExpFlags::STICKY);
    let with_indices = self.flags.contains(RegExpFlags::HAS_INDICES);

    let mut found = Vec::new();
    let mut cursor = self.last_index.get();
    while cursor <= input.len() {
        let hit = self
            .compiled
            .find_from(input, cursor)
            .next()
            .filter(|m| !sticky || m.start() == cursor);
        let Some(mat) = hit else {
            break;
        };
        found.push(build_match(input, &mat, with_indices));
        // Step past empty matches so the scan always makes progress.
        cursor = advance_after_match(input, cursor, mat.end());
    }
    found
}
/// Builds an independent regex with the same pattern and flags by
/// compiling it again, and sets its `last_index` to `last_index`.
///
/// # Errors
///
/// Propagates any error returned by `new`.
pub fn clone_for_match_all(&self, last_index: usize) -> StatorResult<Self> {
    let flag_letters = self.flags.to_flags_string();
    Self::new(&self.pattern, &flag_letters).map(|copy| {
        copy.last_index.set(last_index);
        copy
    })
}
}
/// Result of `JsRegExp::symbol_match`.
#[derive(Debug, Clone, PartialEq)]
pub enum SymbolMatchResult {
/// Produced for a regex without the global flag: the single `exec` result.
Single(RegExpMatch),
/// Produced for a global regex: the text of every match, in order.
All(Vec<String>),
}
/// Translates [`RegExpFlags`] into the flag struct understood by the
/// `regress` engine.
///
/// `v` implies the engine's `unicode` mode as well as `unicode_sets`.
/// `g`, `y` and `d` have no engine counterpart here; they are handled by
/// the matching methods. Engine optimisations stay enabled
/// (`no_opt: false`).
fn build_regress_flags(f: RegExpFlags) -> RegressFlags {
    let unicode_sets = f.contains(RegExpFlags::UNICODE_SETS);
    let unicode = f.contains(RegExpFlags::UNICODE) || unicode_sets;
    let icase = f.contains(RegExpFlags::IGNORE_CASE);
    let multiline = f.contains(RegExpFlags::MULTILINE);
    let dot_all = f.contains(RegExpFlags::DOT_ALL);
    RegressFlags {
        icase,
        multiline,
        dot_all,
        unicode,
        unicode_sets,
        no_opt: false,
    }
}
fn build_match(input: &str, mat: ®ress::Match, has_indices: bool) -> RegExpMatch {
let matched = input[mat.range()].to_string();
let index = mat.start();
let captures: Vec<Option<String>> = mat
.captures
.iter()
.map(|cap| cap.as_ref().map(|r| input[r.clone()].to_string()))
.collect();
let mut named_groups = HashMap::new();
for (name, range) in mat.named_groups() {
let value = range.map(|r| input[r].to_string());
named_groups
.entry(name.to_string())
.and_modify(|existing: &mut Option<String>| {
if existing.is_none() && value.is_some() {
*existing = value.clone();
}
})
.or_insert(value);
}
let indices = if has_indices {
let mut pairs = Vec::with_capacity(1 + mat.captures.len());
pairs.push(Some((mat.start(), mat.end())));
for cap in &mat.captures {
pairs.push(cap.as_ref().map(|r| (r.start, r.end)));
}
let mut groups = HashMap::new();
for (name, range) in mat.named_groups() {
if let Some(r) = range {
groups.entry(name.to_string()).or_insert((r.start, r.end));
}
}
Some(MatchIndices { pairs, groups })
} else {
None
};
RegExpMatch {
matched,
captures,
named_groups,
index,
input: input.to_string(),
indices,
}
}
fn apply_replacement(replacement: &str, m: &RegExpMatch, input: &str) -> String {
let mut out = String::new();
let bytes = replacement.as_bytes();
let mut i = 0;
while i < replacement.len() {
if bytes[i] == b'$' && i + 1 < bytes.len() {
match bytes[i + 1] {
b'$' => {
out.push('$');
i += 2;
}
b'&' => {
out.push_str(&m.matched);
i += 2;
}
b'`' => {
out.push_str(&input[..m.index]);
i += 2;
}
b'\'' => {
let after_start = m.index + m.matched.len();
if after_start <= input.len() {
out.push_str(&input[after_start..]);
}
i += 2;
}
b'<' => {
if let Some(end) = replacement[i + 2..].find('>') {
let name = &replacement[i + 2..i + 2 + end];
if m.named_groups.is_empty() {
out.push_str(&replacement[i..i + 3 + end]);
} else if let Some(Some(val)) = m.named_groups.get(name) {
out.push_str(val);
}
i += 2 + end + 1; } else {
out.push('$');
i += 1;
}
}
b'0'..=b'9' => {
let mut num = (bytes[i + 1] - b'0') as usize;
let mut consumed = 2;
if i + 2 < bytes.len()
&& let Some(d) = (bytes[i + 2] as char).to_digit(10)
{
let two_digit = num * 10 + d as usize;
if two_digit > 0 && two_digit <= m.captures.len() {
num = two_digit;
consumed = 3;
}
}
if num > 0 {
if let Some(Some(cap)) = m.captures.get(num - 1) {
out.push_str(cap);
}
} else {
out.push('$');
out.push(bytes[i + 1] as char);
}
i += consumed;
}
_ => {
out.push('$');
i += 1;
}
}
} else {
if let Some(ch) = replacement[i..].chars().next() {
out.push(ch);
i += ch.len_utf8();
} else {
break;
}
}
}
out
}
/// Returns the position from which the next search should start after a
/// match that was searched from `start` and ended at `end`.
///
/// Normally that is `end`; when `end` is not past `start` (an empty match
/// at the search position) the position is moved one character forward
/// instead, so repeated searching cannot stall.
fn advance_after_match(input: &str, start: usize, end: usize) -> usize {
    if end <= start {
        return advance_string_index(input, start);
    }
    end
}
/// Moves a byte offset forward by one character of `input`.
///
/// Inside the string the offset advances by the UTF-8 length of the
/// character that starts at `index`; at or beyond the end it advances by
/// one (saturating at `usize::MAX`).
///
/// # Panics
///
/// Panics if `index` is inside `input` but not on a character boundary.
fn advance_string_index(input: &str, index: usize) -> usize {
    if index < input.len() {
        if let Some(ch) = input[index..].chars().next() {
            return index + ch.len_utf8();
        }
    }
    index.saturating_add(1)
}
/// Returns `true` for the four code units treated as line terminators:
/// LF (U+000A), CR (U+000D), LINE SEPARATOR (U+2028) and PARAGRAPH
/// SEPARATOR (U+2029).
fn is_line_terminator_code_unit(unit: u16) -> bool {
    const LINE_TERMINATORS: [u16; 4] = [0x000A, 0x000D, 0x2028, 0x2029];
    LINE_TERMINATORS.contains(&unit)
}
/// Converts an offset counted in UTF-16 code units into a byte offset
/// into `input`.
///
/// Only whole characters that fit entirely before `utf16_index` are
/// counted, so an index that falls in the middle of a surrogate pair is
/// rounded down to the start of that character, and an index past the end
/// yields `input.len()`.
fn utf16_index_to_byte_floor(input: &str, utf16_index: usize) -> usize {
    let mut units_covered = 0usize;
    input
        .chars()
        .take_while(|ch| {
            units_covered += ch.len_utf16();
            units_covered <= utf16_index
        })
        .map(char::len_utf8)
        .sum()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_empty_flags() {
let f = RegExpFlags::parse("").unwrap();
assert_eq!(f, RegExpFlags::empty());
}
#[test]
fn test_parse_all_flags() {
let f = RegExpFlags::parse("gimsyd").unwrap();
assert!(f.contains(RegExpFlags::GLOBAL));
assert!(f.contains(RegExpFlags::IGNORE_CASE));
assert!(f.contains(RegExpFlags::MULTILINE));
assert!(f.contains(RegExpFlags::DOT_ALL));
assert!(f.contains(RegExpFlags::STICKY));
assert!(f.contains(RegExpFlags::HAS_INDICES));
}
#[test]
fn test_parse_unknown_flag_errors() {
let err = RegExpFlags::parse("x").unwrap_err();
assert!(matches!(err, StatorError::SyntaxError(_)));
}
#[test]
fn test_parse_duplicate_flag_errors() {
let err = RegExpFlags::parse("gg").unwrap_err();
assert!(matches!(err, StatorError::SyntaxError(_)));
}
#[test]
fn test_parse_uv_combined_errors() {
let err = RegExpFlags::parse("uv").unwrap_err();
assert!(matches!(err, StatorError::SyntaxError(_)));
}
#[test]
fn test_flags_string_order() {
let f = RegExpFlags::parse("ymisgd").unwrap();
assert_eq!(f.to_flags_string(), "dgimsy");
}
#[test]
fn test_flags_string_single() {
assert_eq!(RegExpFlags::parse("g").unwrap().to_flags_string(), "g");
assert_eq!(RegExpFlags::parse("i").unwrap().to_flags_string(), "i");
assert_eq!(RegExpFlags::parse("m").unwrap().to_flags_string(), "m");
assert_eq!(RegExpFlags::parse("s").unwrap().to_flags_string(), "s");
assert_eq!(RegExpFlags::parse("u").unwrap().to_flags_string(), "u");
assert_eq!(RegExpFlags::parse("v").unwrap().to_flags_string(), "v");
assert_eq!(RegExpFlags::parse("y").unwrap().to_flags_string(), "y");
assert_eq!(RegExpFlags::parse("d").unwrap().to_flags_string(), "d");
}
#[test]
fn test_new_valid_pattern() {
let re = JsRegExp::new(r"\d+", "g").unwrap();
assert_eq!(re.pattern(), r"\d+");
assert!(re.flags().contains(RegExpFlags::GLOBAL));
}
#[test]
fn test_new_invalid_pattern_errors() {
let err = JsRegExp::new("[invalid", "").unwrap_err();
assert!(matches!(err, StatorError::SyntaxError(_)));
}
#[test]
fn test_new_invalid_flag_errors() {
let err = JsRegExp::new("a", "z").unwrap_err();
assert!(matches!(err, StatorError::SyntaxError(_)));
}
#[test]
fn test_to_string_no_flags() {
let re = JsRegExp::new("hello", "").unwrap();
assert_eq!(re.to_string(), "/hello/");
}
#[test]
fn test_to_string_with_flags() {
let re = JsRegExp::new("foo", "gi").unwrap();
assert_eq!(re.to_string(), "/foo/gi");
}
#[test]
fn test_test_match() {
let re = JsRegExp::new(r"\d+", "").unwrap();
assert!(re.test("foo 42 bar"));
assert!(!re.test("no numbers here"));
}
#[test]
fn test_test_case_insensitive() {
let re = JsRegExp::new("hello", "i").unwrap();
assert!(re.test("Say HELLO World"));
assert!(!re.test("say goodbye world"));
}
#[test]
fn test_test_multiline() {
let re = JsRegExp::new("^start", "m").unwrap();
assert!(re.test("first line\nstart of second"));
assert!(!re.test("first line\n start with space"));
}
#[test]
fn test_test_dot_all() {
let re = JsRegExp::new("a.b", "s").unwrap();
assert!(re.test("a\nb"));
let re_no_s = JsRegExp::new("a.b", "").unwrap();
assert!(!re_no_s.test("a\nb"));
}
#[test]
fn test_exec_no_match_returns_none() {
let re = JsRegExp::new("xyz", "").unwrap();
assert!(re.exec("hello world").is_none());
}
#[test]
fn test_exec_simple_match() {
let re = JsRegExp::new(r"(\d+)", "").unwrap();
let m = re.exec("price 42 dollars").unwrap();
assert_eq!(m.matched, "42");
assert_eq!(m.index, 6);
assert_eq!(m.captures, vec![Some("42".to_string())]);
}
#[test]
fn test_exec_named_captures() {
let re = JsRegExp::new(r"(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})", "u").unwrap();
let m = re.exec("today is 2024-07-15 ok").unwrap();
assert_eq!(m.matched, "2024-07-15");
assert_eq!(
m.named_groups.get("year").and_then(|v| v.as_deref()),
Some("2024")
);
assert_eq!(
m.named_groups.get("month").and_then(|v| v.as_deref()),
Some("07")
);
assert_eq!(
m.named_groups.get("day").and_then(|v| v.as_deref()),
Some("15")
);
}
#[test]
fn test_exec_lookbehind() {
let re = JsRegExp::new(r"(?<=\$)\d+", "").unwrap();
let m = re.exec("price $100").unwrap();
assert_eq!(m.matched, "100");
assert!(re.exec("price 100").is_none());
}
#[test]
fn test_exec_negative_lookbehind() {
let re = JsRegExp::new(r"(?<!\$)\d+", "").unwrap();
let m = re.exec("100 dollars").unwrap();
assert_eq!(m.matched, "100");
}
#[test]
fn test_exec_unicode_flag() {
let re = JsRegExp::new(r"\p{L}+", "u").unwrap();
let m = re.exec("hello 42").unwrap();
assert_eq!(m.matched, "hello");
}
#[test]
fn test_global_last_index_advances() {
let re = JsRegExp::new(r"\d+", "g").unwrap();
assert_eq!(re.last_index(), 0);
let m1 = re.exec("a1 b2 c3").unwrap();
assert_eq!(m1.matched, "1");
let m2 = re.exec("a1 b2 c3").unwrap();
assert_eq!(m2.matched, "2");
let m3 = re.exec("a1 b2 c3").unwrap();
assert_eq!(m3.matched, "3");
let m4 = re.exec("a1 b2 c3");
assert!(m4.is_none());
assert_eq!(re.last_index(), 0);
}
#[test]
fn test_sticky_only_matches_at_last_index() {
let re = JsRegExp::new(r"\d+", "y").unwrap();
let m = re.exec("5 apples").unwrap();
assert_eq!(m.matched, "5");
assert!(re.exec("5 apples").is_none());
assert_eq!(re.last_index(), 0);
}
#[test]
fn test_symbol_match_non_global() {
let re = JsRegExp::new(r"\d+", "").unwrap();
let result = re.symbol_match("price 42 and 7").unwrap();
if let SymbolMatchResult::Single(m) = result {
assert_eq!(m.matched, "42");
} else {
panic!("expected Single");
}
}
#[test]
fn test_symbol_match_global_all() {
let re = JsRegExp::new(r"\d+", "g").unwrap();
let result = re.symbol_match("a1 b22 c333").unwrap();
if let SymbolMatchResult::All(v) = result {
assert_eq!(v, vec!["1", "22", "333"]);
} else {
panic!("expected All");
}
}
#[test]
fn test_symbol_match_no_match_returns_none() {
let re = JsRegExp::new(r"\d+", "g").unwrap();
assert!(re.symbol_match("no numbers").is_none());
}
#[test]
fn test_symbol_replace_first_match() {
let re = JsRegExp::new(r"\d+", "").unwrap();
assert_eq!(re.symbol_replace("foo 42 bar 7", "NUM"), "foo NUM bar 7");
}
#[test]
fn test_symbol_replace_global() {
let re = JsRegExp::new(r"\d+", "g").unwrap();
assert_eq!(re.symbol_replace("a1 b2 c3", "N"), "aN bN cN");
}
#[test]
fn test_symbol_replace_dollar_amp() {
let re = JsRegExp::new(r"\d+", "").unwrap();
assert_eq!(re.symbol_replace("price 42", "[$&]"), "price [42]");
}
#[test]
fn test_symbol_replace_dollar_dollar() {
let re = JsRegExp::new("x", "").unwrap();
assert_eq!(re.symbol_replace("axb", "$$"), "a$b");
}
#[test]
fn test_symbol_replace_capture_group() {
let re = JsRegExp::new(r"(\d+)-(\d+)", "").unwrap();
assert_eq!(re.symbol_replace("2024-07", "$2/$1"), "07/2024");
}
#[test]
fn test_symbol_replace_named_capture() {
let re = JsRegExp::new(r"(?<y>\d{4})-(?<m>\d{2})", "u").unwrap();
assert_eq!(
re.symbol_replace("date 2024-07 end", "$<m>/$<y>"),
"date 07/2024 end"
);
}
#[test]
fn test_symbol_replace_before_after() {
let re = JsRegExp::new("b", "").unwrap();
assert_eq!(re.symbol_replace("abc", "$`|$'"), "aa|cc");
}
#[test]
fn test_symbol_search_found() {
let re = JsRegExp::new(r"\d+", "").unwrap();
assert_eq!(re.symbol_search("foo 42 bar"), 4);
}
#[test]
fn test_symbol_search_not_found() {
let re = JsRegExp::new(r"\d+", "").unwrap();
assert_eq!(re.symbol_search("no numbers"), -1);
}
#[test]
fn test_symbol_search_resets_last_index() {
let re = JsRegExp::new(r"\d+", "g").unwrap();
re.set_last_index(5);
let _ = re.symbol_search("foo 42 bar");
assert_eq!(re.last_index(), 5); }
#[test]
fn test_symbol_split_basic() {
let re = JsRegExp::new(",", "").unwrap();
assert_eq!(
re.symbol_split("a,b,c", None),
vec![Some("a".into()), Some("b".into()), Some("c".into())]
);
}
#[test]
fn test_symbol_split_with_limit() {
let re = JsRegExp::new(",", "").unwrap();
assert_eq!(
re.symbol_split("a,b,c,d", Some(2)),
vec![Some("a".into()), Some("b".into())]
);
}
#[test]
fn test_symbol_split_zero_limit() {
let re = JsRegExp::new(",", "").unwrap();
assert_eq!(
re.symbol_split("a,b,c", Some(0)),
Vec::<Option<String>>::new()
);
}
#[test]
fn test_symbol_split_captures_included() {
let re = JsRegExp::new(r"(\d+)", "").unwrap();
assert_eq!(
re.symbol_split("a1b2c", None),
vec![
Some("a".into()),
Some("1".into()),
Some("b".into()),
Some("2".into()),
Some("c".into())
]
);
}
#[test]
fn test_symbol_split_no_match_returns_whole_string() {
let re = JsRegExp::new(r"\d+", "").unwrap();
assert_eq!(re.symbol_split("abc", None), vec![Some("abc".into())]);
}
#[test]
fn test_symbol_split_nonparticipating_capture_is_none() {
let re = JsRegExp::new("-(x)?", "").unwrap();
assert_eq!(
re.symbol_split("a-b", None),
vec![Some("a".into()), None, Some("b".into())]
);
}
#[test]
fn test_symbol_split_zero_width_returns_characters() {
let re = JsRegExp::new(r"(?:)", "").unwrap();
assert_eq!(
re.symbol_split("ab", None),
vec![Some("a".into()), Some("b".into())]
);
}
#[test]
fn test_symbol_split_scans_forward_sticky_style() {
let re = JsRegExp::new("a", "").unwrap();
assert_eq!(
re.symbol_split("baab", None),
vec![Some("b".into()), Some(String::new()), Some("b".into())]
);
}
#[test]
fn test_apply_replacement_named_capture_is_literal_without_named_groups() {
let m = RegExpMatch {
matched: "a".into(),
captures: vec![],
named_groups: HashMap::new(),
index: 0,
input: "a".into(),
indices: None,
};
assert_eq!(apply_replacement("$<x>", &m, "a"), "$<x>");
}
#[test]
fn test_unicode_property_escape_letter() {
let re = JsRegExp::new(r"\p{L}", "u").unwrap();
assert!(re.test("hello"));
assert!(!re.test("123"));
}
#[test]
fn test_unicode_property_escape_digit() {
let re = JsRegExp::new(r"\p{N}", "u").unwrap();
assert!(re.test("42"));
assert!(!re.test("abc"));
}
#[test]
fn test_named_capture_single_group() {
let re = JsRegExp::new(r"(?<word>\w+)", "").unwrap();
let m = re.exec("hello world").unwrap();
assert_eq!(m.matched, "hello");
assert_eq!(
m.named_groups.get("word").and_then(|v| v.as_deref()),
Some("hello")
);
}
#[test]
fn test_named_capture_nonparticipating_group() {
let re = JsRegExp::new(r"(?<a>x)?(?<b>\d+)", "").unwrap();
let m = re.exec("42").unwrap();
assert_eq!(m.matched, "42");
assert_eq!(m.named_groups.get("a").and_then(|v| v.as_deref()), None);
assert_eq!(
m.named_groups.get("b").and_then(|v| v.as_deref()),
Some("42")
);
assert!(m.named_groups.contains_key("a"));
}
#[test]
fn test_named_capture_multiple_groups() {
let re = JsRegExp::new(r"(?<first>\w+)\s(?<last>\w+)", "").unwrap();
let m = re.exec("John Doe").unwrap();
assert_eq!(
m.named_groups.get("first").and_then(|v| v.as_deref()),
Some("John")
);
assert_eq!(
m.named_groups.get("last").and_then(|v| v.as_deref()),
Some("Doe")
);
}
#[test]
fn test_named_and_numbered_captures_coexist() {
let re = JsRegExp::new(r"(\d+)-(?<name>\w+)", "").unwrap();
let m = re.exec("42-foo").unwrap();
assert_eq!(m.captures[0], Some("42".to_string()));
assert_eq!(m.captures[1], Some("foo".to_string()));
assert_eq!(
m.named_groups.get("name").and_then(|v| v.as_deref()),
Some("foo")
);
}
#[test]
fn test_named_backreference_basic() {
let re = JsRegExp::new(r"(?<tag>\w+)=\k<tag>", "").unwrap();
let m = re.exec("abc=abc").unwrap();
assert_eq!(m.matched, "abc=abc");
}
#[test]
fn test_named_backreference_no_match_when_different() {
let re = JsRegExp::new(r"(?<tag>\w+)=\k<tag>", "").unwrap();
assert!(re.exec("abc=def").is_none());
}
#[test]
fn test_named_backreference_with_flag_u() {
let re = JsRegExp::new(r"(?<char>.)\k<char>", "u").unwrap();
let m = re.exec("aabbcc").unwrap();
assert_eq!(m.matched, "aa");
}
#[test]
fn test_named_backreference_html_tag() {
let re = JsRegExp::new(r"<(?<tag>\w+)>.*?</\k<tag>>", "").unwrap();
let m = re.exec("<div>hello</div>").unwrap();
assert_eq!(m.matched, "<div>hello</div>");
assert!(re.exec("<div>hello</span>").is_none());
}
#[test]
fn test_replace_named_capture_global() {
let re = JsRegExp::new(r"(?<d>\d+)", "gu").unwrap();
assert_eq!(re.symbol_replace("a1 b2 c3", "[$<d>]"), "a[1] b[2] c[3]");
}
#[test]
fn test_replace_named_capture_missing_name() {
let re = JsRegExp::new(r"(?<a>\d+)", "u").unwrap();
assert_eq!(re.symbol_replace("42", "$<b>"), "");
}
#[test]
fn test_replace_named_capture_nonparticipating() {
let re = JsRegExp::new(r"(?<a>x)?(?<b>\d+)", "").unwrap();
assert_eq!(re.symbol_replace("42", "$<a>-$<b>"), "-42");
}
#[test]
fn test_replace_named_capture_unclosed_angle() {
let re = JsRegExp::new(r"(?<a>\d+)", "").unwrap();
assert_eq!(re.symbol_replace("42", "$<a"), "$<a");
}
#[test]
fn test_has_indices_flag_parsed() {
let re = JsRegExp::new("a", "d").unwrap();
assert!(re.flags().contains(RegExpFlags::HAS_INDICES));
}
#[test]
fn test_has_indices_basic() {
let re = JsRegExp::new(r"\d+", "d").unwrap();
let m = re.exec("abc 42 end").unwrap();
let idx = m.indices.as_ref().unwrap();
assert_eq!(idx.pairs[0], Some((4, 6)));
}
#[test]
fn test_has_indices_capture_groups() {
let re = JsRegExp::new(r"(\d+)-(\d+)", "d").unwrap();
let m = re.exec("abc 12-34 end").unwrap();
let idx = m.indices.as_ref().unwrap();
assert_eq!(idx.pairs[0], Some((4, 9)));
assert_eq!(idx.pairs[1], Some((4, 6)));
assert_eq!(idx.pairs[2], Some((7, 9)));
}
#[test]
fn test_has_indices_nonparticipating_group() {
let re = JsRegExp::new(r"(x)?(\d+)", "d").unwrap();
let m = re.exec("42").unwrap();
let idx = m.indices.as_ref().unwrap();
assert_eq!(idx.pairs[0], Some((0, 2)));
assert_eq!(idx.pairs[1], None); assert_eq!(idx.pairs[2], Some((0, 2)));
}
#[test]
fn test_has_indices_named_groups() {
let re = JsRegExp::new(r"(?<year>\d{4})-(?<month>\d{2})", "du").unwrap();
let m = re.exec("2024-07").unwrap();
let idx = m.indices.as_ref().unwrap();
assert_eq!(idx.groups.get("year"), Some(&(0, 4)));
assert_eq!(idx.groups.get("month"), Some(&(5, 7)));
}
#[test]
fn test_no_indices_without_d_flag() {
let re = JsRegExp::new(r"\d+", "").unwrap();
let m = re.exec("42").unwrap();
assert!(m.indices.is_none());
}
#[test]
fn test_v_flag_parsed() {
let re = JsRegExp::new("a", "v").unwrap();
assert!(re.flags().contains(RegExpFlags::UNICODE_SETS));
}
#[test]
fn test_v_flag_in_flags_string() {
let re = JsRegExp::new("a", "v").unwrap();
assert_eq!(re.flags().to_flags_string(), "v");
}
#[test]
fn test_v_flag_enables_unicode_matching() {
let re = JsRegExp::new(r"\p{L}+", "v").unwrap();
assert!(re.test("hello"));
assert!(!re.test("123"));
}
#[test]
fn test_v_flag_unicode_accessor_is_true() {
let re = JsRegExp::new("a", "v").unwrap();
assert!(re.flags().contains(RegExpFlags::UNICODE_SETS));
assert!(!re.flags().contains(RegExpFlags::UNICODE));
assert!(build_regress_flags(re.flags()).unicode);
}
#[test]
fn test_v_and_u_cannot_combine() {
let err = RegExpFlags::parse("uv").unwrap_err();
assert!(matches!(err, StatorError::SyntaxError(_)));
}
#[test]
fn test_v_flag_in_to_string() {
let re = JsRegExp::new("abc", "gv").unwrap();
assert_eq!(re.to_string(), "/abc/gv");
}
#[test]
fn test_flags_canonical_order_all() {
let f = RegExpFlags::parse("ysmigd").unwrap();
assert_eq!(f.to_flags_string(), "dgimsy");
}
#[test]
fn test_flags_canonical_order_with_v() {
let f = RegExpFlags::parse("yvgd").unwrap();
assert_eq!(f.to_flags_string(), "dgvy");
}
#[test]
fn test_flags_empty() {
let f = RegExpFlags::parse("").unwrap();
assert_eq!(f.to_flags_string(), "");
}
#[test]
fn test_match_all_named_groups() {
let re = JsRegExp::new(r"(?<num>\d+)", "g").unwrap();
let results = re.symbol_match_all("a1 b22 c333");
assert_eq!(results.len(), 3);
assert_eq!(
results[0]
.named_groups
.get("num")
.and_then(|v| v.as_deref()),
Some("1")
);
assert_eq!(
results[2]
.named_groups
.get("num")
.and_then(|v| v.as_deref()),
Some("333")
);
}
#[test]
fn test_match_all_with_indices() {
let re = JsRegExp::new(r"\d+", "gd").unwrap();
let results = re.symbol_match_all("a1 b22");
assert_eq!(results.len(), 2);
let idx0 = results[0].indices.as_ref().unwrap();
assert_eq!(idx0.pairs[0], Some((1, 2)));
let idx1 = results[1].indices.as_ref().unwrap();
assert_eq!(idx1.pairs[0], Some((4, 6)));
}
#[test]
fn test_named_group_empty_match() {
let re = JsRegExp::new(r"(?<empty>)", "").unwrap();
let m = re.exec("abc").unwrap();
assert_eq!(
m.named_groups.get("empty").and_then(|v| v.as_deref()),
Some("")
);
}
#[test]
fn test_exec_no_named_groups_empty_map() {
let re = JsRegExp::new(r"\d+", "").unwrap();
let m = re.exec("42").unwrap();
assert!(m.named_groups.is_empty());
}
#[test]
fn test_symbol_replace_dollar_n_and_named_combined() {
let re = JsRegExp::new(r"(\d+)-(?<w>\w+)", "").unwrap();
assert_eq!(re.symbol_replace("42-foo", "$1=$<w>"), "42=foo");
}
#[test]
fn test_source_text_empty_pattern() {
let re = JsRegExp::new("", "").unwrap();
assert_eq!(re.source_text(), "(?:)");
}
#[test]
fn test_source_text_escapes_slash() {
let re = JsRegExp::new("a/b", "").unwrap();
assert_eq!(re.source_text(), r"a\/b");
}
#[test]
fn test_global_empty_pattern_advances() {
let re = JsRegExp::new("", "g").unwrap();
let results = re.symbol_match_all("ab");
assert_eq!(results.len(), 3);
}
#[test]
fn test_lookbehind_positive_dollar_sign() {
let re = JsRegExp::new(r"(?<=\$)\d+", "").unwrap();
let m = re.exec("price $100 and €200").unwrap();
assert_eq!(m.matched, "100");
}
#[test]
fn test_lookbehind_positive_word_boundary() {
let re = JsRegExp::new(r"(?<=\bfoo)\w+", "").unwrap();
let m = re.exec("foobar baz").unwrap();
assert_eq!(m.matched, "bar");
}
#[test]
fn test_lookbehind_positive_global_all() {
let re = JsRegExp::new(r"(?<=@)\w+", "g").unwrap();
let results = re.symbol_match_all("@alice and @bob");
assert_eq!(results.len(), 2);
assert_eq!(results[0].matched, "alice");
assert_eq!(results[1].matched, "bob");
}
#[test]
fn test_lookbehind_negative_no_dollar() {
let re = JsRegExp::new(r"(?<!\$)\d+", "").unwrap();
let m = re.exec("free 42 items").unwrap();
assert_eq!(m.matched, "42");
}
#[test]
fn test_lookbehind_negative_skips_prefixed() {
let re = JsRegExp::new(r"(?<!un)happy", "").unwrap();
assert!(re.test("happy day"));
assert!(!re.test("unhappy day"));
}
#[test]
fn test_lookbehind_negative_global() {
let re = JsRegExp::new(r"(?<!#)\b\w+", "g").unwrap();
let results = re.symbol_match_all("hello #world foo");
let matched: Vec<&str> = results.iter().map(|m| m.matched.as_str()).collect();
assert!(matched.contains(&"hello"));
assert!(matched.contains(&"foo"));
}
/// A capture group placed inside a lookbehind must still be recorded: the
/// match is `apples` and the first capture is `42`.
#[test]
fn test_lookbehind_with_capture_group() {
    let counted = JsRegExp::new(r"(?<=(\d+)\s)\w+", "").unwrap();
    let found = counted.exec("42 apples").unwrap();
    assert_eq!(found.matched, "apples");
    assert_eq!(found.captures[0].as_deref(), Some("42"));
}
/// Two capture groups inside one lookbehind keep their left-to-right
/// numbering: the first holds `a`, the second holds `b`.
#[test]
fn test_lookbehind_capture_numbering() {
    let two_groups = JsRegExp::new(r"(?<=(a)(b))cd", "").unwrap();
    let found = two_groups.exec("abcd").unwrap();
    assert_eq!(found.matched, "cd");
    assert_eq!(found.captures[0].as_deref(), Some("a"));
    assert_eq!(found.captures[1].as_deref(), Some("b"));
}
/// A lookbehind whose body has no fixed width (`\d+`) must be accepted and
/// matched.
#[test]
fn test_lookbehind_variable_length() {
    let after_digits = JsRegExp::new(r"(?<=\d+)\s\w+", "").unwrap();
    let found = after_digits.exec("123 abc").unwrap();
    assert_eq!(found.matched, " abc");
}
/// A lookbehind whose alternatives have different lengths (`cat|hello`) must
/// be accepted and matched.
#[test]
fn test_lookbehind_alternation_variable_length() {
    let after_word = JsRegExp::new(r"(?<=cat|hello)\s\w+", "").unwrap();
    let found = after_word.exec("hello world").unwrap();
    assert_eq!(found.matched, " world");
}
/// With the `s` flag, `.` must match `\n` and `\r` as well as an ordinary
/// character.
#[test]
fn test_dotall_dot_matches_newline() {
    let dot_all = JsRegExp::new("a.b", "s").unwrap();
    for input in ["a\nb", "a\rb", "axb"] {
        assert!(dot_all.test(input));
    }
}
/// Without the `s` flag, `.` must not match `\n` but still matches an
/// ordinary character.
#[test]
fn test_dotall_off_dot_rejects_newline() {
    let plain_dot = JsRegExp::new("a.b", "").unwrap();
    let across_newline = plain_dot.test("a\nb");
    assert!(!across_newline);
    let across_letter = plain_dot.test("axb");
    assert!(across_letter);
}
/// With `s` and `m` combined, `^.+$` must produce a match that spans the
/// line break.
#[test]
fn test_dotall_multiline_interaction() {
    let anchored = JsRegExp::new(r"^.+$", "sm").unwrap();
    let found = anchored.exec("line1\nline2").unwrap();
    let spans_newline = found.matched.contains('\n');
    assert!(spans_newline);
}
/// Constructing with `s` must set `RegExpFlags::DOT_ALL` in `flags()`.
#[test]
fn test_dotall_flag_accessor() {
    let dot_all = JsRegExp::new("a", "s").unwrap();
    let flags = dot_all.flags();
    assert!(flags.contains(RegExpFlags::DOT_ALL));
}
/// The flags `gs` must round-trip through `to_flags_string()` as `gs`.
#[test]
fn test_dotall_in_flags_string() {
    let global_dot_all = JsRegExp::new("a", "gs").unwrap();
    let rendered = global_dot_all.flags().to_flags_string();
    assert_eq!(rendered, "gs");
}
/// With `gs`, `.+` consumes the whole two-line input, so the replacement
/// leaves a single `x`.
#[test]
fn test_dotall_global_replaces_across_newlines() {
    let everything = JsRegExp::new(".+", "gs").unwrap();
    let replaced = everything.symbol_replace("a\nb", "x");
    assert_eq!(replaced, "x");
}
/// `\p{Letter}+` under the `u` flag must match the leading run of letters
/// and stop at the digits.
#[test]
fn test_unicode_property_letter_match() {
    let letters = JsRegExp::new(r"\p{Letter}+", "u").unwrap();
    let found = letters.exec("hello123").unwrap();
    assert_eq!(found.matched, "hello");
}
/// `\p{Number}+` under the `u` flag must pick the digits out from between
/// the letters.
#[test]
fn test_unicode_property_number_match() {
    let numbers = JsRegExp::new(r"\p{Number}+", "u").unwrap();
    let found = numbers.exec("abc42def").unwrap();
    assert_eq!(found.matched, "42");
}
/// The negated escape `\P{Number}+` must match the first run of non-number
/// characters.
#[test]
fn test_unicode_property_negated_number() {
    let non_numbers = JsRegExp::new(r"\P{Number}+", "u").unwrap();
    let found = non_numbers.exec("42abc99").unwrap();
    assert_eq!(found.matched, "abc");
}
/// `\p{Script=Greek}+` must accept Greek letters and reject Latin ones.
#[test]
fn test_unicode_property_script_greek() {
    let greek = JsRegExp::new(r"\p{Script=Greek}+", "u").unwrap();
    let on_greek = greek.test("αβγ");
    assert!(on_greek);
    let on_latin = greek.test("abc");
    assert!(!on_latin);
}
/// `\p{Script=Latin}+` must match only the Latin prefix of a mixed-script
/// input.
#[test]
fn test_unicode_property_script_latin() {
    let latin = JsRegExp::new(r"\p{Script=Latin}+", "u").unwrap();
    let found = latin.exec("hello世界").unwrap();
    assert_eq!(found.matched, "hello");
}
/// The long form `\p{General_Category=Uppercase_Letter}+` must match only
/// the upper-case run.
#[test]
fn test_unicode_property_general_category_uppercase() {
    let uppercase = JsRegExp::new(r"\p{General_Category=Uppercase_Letter}+", "u").unwrap();
    let found = uppercase.exec("helloWORLD").unwrap();
    assert_eq!(found.matched, "WORLD");
}
/// The short alias `\p{L}+` must match non-ASCII letters.
///
/// NOTE(review): despite the test name, the input is Han text, not emoji.
#[test]
fn test_unicode_property_emoji_like() {
    let letters = JsRegExp::new(r"\p{L}+", "u").unwrap();
    let matched_han = letters.test("你好");
    assert!(matched_han);
}
/// With `gu`, `\p{L}+` must return the two words and skip the number
/// between them.
#[test]
fn test_unicode_property_global_all_letters() {
    let letters = JsRegExp::new(r"\p{L}+", "gu").unwrap();
    let hits = letters.symbol_match_all("hello 42 world");
    let words: Vec<&str> = hits.iter().map(|hit| hit.matched.as_str()).collect();
    assert_eq!(words, ["hello", "world"]);
}
/// A named backreference `\k<ch>` must match the first doubled character.
#[test]
fn test_backreference_named_repeat() {
    let doubled = JsRegExp::new(r"(?<ch>.)\k<ch>", "").unwrap();
    let found = doubled.exec("aabbcc").unwrap();
    assert_eq!(found.matched, "aa");
}
/// A named backreference must fail when no two adjacent characters are
/// equal.
#[test]
fn test_backreference_named_no_match_different() {
    let doubled = JsRegExp::new(r"(?<ch>.)\k<ch>", "").unwrap();
    let outcome = doubled.exec("abcd");
    assert!(outcome.is_none());
}
/// A named backreference ties the closing tag to the opening tag: matching
/// tags succeed, mismatched tags yield no match.
#[test]
fn test_backreference_named_html_tags() {
    let element = JsRegExp::new(r"<(?<tag>\w+)>[^<]*</\k<tag>>", "").unwrap();
    let balanced = element.exec("<b>bold</b>").unwrap();
    assert_eq!(balanced.matched, "<b>bold</b>");
    let mismatched = element.exec("<b>bold</i>");
    assert!(mismatched.is_none());
}
/// Named backreferences must keep working when the `u` flag is set.
#[test]
fn test_backreference_named_with_unicode() {
    let repeated_word = JsRegExp::new(r"(?<w>\w+)\s\k<w>", "u").unwrap();
    let found = repeated_word.exec("the the dog").unwrap();
    assert_eq!(found.matched, "the the");
}
/// Global replace with a named backreference in the pattern and a `$<w>`
/// reference in the template: each doubled word collapses to `[word]`.
#[test]
fn test_backreference_named_global_replace() {
    let repeated_word = JsRegExp::new(r"(?<w>\w+)\s\k<w>", "g").unwrap();
    let collapsed = repeated_word.symbol_replace("the the is is ok", "[$<w>]");
    assert_eq!(collapsed, "[the] [is] ok");
}
/// Flags given as `gimus` must be rendered in the order `gimsu`, and each
/// of the five corresponding bits must be set.
#[test]
fn test_flags_gimus() {
    let re = JsRegExp::new(".", "gimus").unwrap();
    let flags = re.flags();
    assert_eq!(flags.to_flags_string(), "gimsu");
    for expected in [
        RegExpFlags::GLOBAL,
        RegExpFlags::IGNORE_CASE,
        RegExpFlags::MULTILINE,
        RegExpFlags::UNICODE,
        RegExpFlags::DOT_ALL,
    ] {
        assert!(flags.contains(expected));
    }
}
/// Flags given as `gimsuy` must round-trip unchanged, and each of the six
/// corresponding bits must be set.
#[test]
fn test_flags_gimsuy() {
    let re = JsRegExp::new(".", "gimsuy").unwrap();
    let flags = re.flags();
    assert_eq!(flags.to_flags_string(), "gimsuy");
    for expected in [
        RegExpFlags::GLOBAL,
        RegExpFlags::IGNORE_CASE,
        RegExpFlags::MULTILINE,
        RegExpFlags::DOT_ALL,
        RegExpFlags::UNICODE,
        RegExpFlags::STICKY,
    ] {
        assert!(flags.contains(expected));
    }
}
/// `d` combined with `u`: the flags render as `du` and a match carries
/// index information.
#[test]
fn test_flags_du_combined() {
    let indexed_letters = JsRegExp::new(r"\p{L}+", "du").unwrap();
    let rendered = indexed_letters.flags().to_flags_string();
    assert_eq!(rendered, "du");
    let found = indexed_letters.exec("hello").unwrap();
    assert!(found.indices.is_some());
}
/// `d`, `g` and `s` combined: a single match covers both lines and carries
/// index information.
#[test]
fn test_flags_dgs_combined() {
    let everything = JsRegExp::new(".+", "dgs").unwrap();
    let hits = everything.symbol_match_all("a\nb");
    assert_eq!(hits.len(), 1);
    let only = &hits[0];
    assert_eq!(only.matched, "a\nb");
    assert!(only.indices.is_some());
}
/// A lookbehind on `\n` under the `s` flag must match the word that starts
/// the second line.
#[test]
fn test_dotall_with_lookbehind() {
    let after_newline = JsRegExp::new(r"(?<=\n)\w+", "s").unwrap();
    let found = after_newline.exec("line1\nline2").unwrap();
    assert_eq!(found.matched, "line2");
}
/// A Unicode property escape inside a lookbehind: digits match after a
/// letter and do not match after a space.
#[test]
fn test_unicode_property_with_lookbehind() {
    let after_letter = JsRegExp::new(r"(?<=\p{L})\d+", "u").unwrap();
    let found = after_letter.exec("abc42").unwrap();
    assert_eq!(found.matched, "42");
    let after_space = after_letter.exec(" 42");
    assert!(after_space.is_none());
}
/// `s` and `u` together: `.+` between two `\p{L}` escapes may consume the
/// newline.
#[test]
fn test_unicode_dotall_combined() {
    let letter_to_letter = JsRegExp::new(r"\p{L}.+\p{L}", "su").unwrap();
    let found = letter_to_letter.exec("a\nb").unwrap();
    assert_eq!(found.matched, "a\nb");
}
/// With the `d` flag, the named group `word` must be reported with the span
/// `(0, 5)`, both by position in `pairs` and by name in `groups`.
#[test]
fn test_named_capture_with_indices() {
    let word = JsRegExp::new(r"(?<word>\w+)", "du").unwrap();
    let found = word.exec("hello").unwrap();
    let spans = found.indices.as_ref().unwrap();
    assert_eq!(spans.pairs[0], Some((0, 5)));
    assert_eq!(spans.groups.get("word"), Some(&(0, 5)));
}
/// Global replace with a lookbehind: the digits are replaced while the `$`
/// signs examined by the lookbehind stay in place.
#[test]
fn test_lookbehind_in_global_replace() {
    let after_dollar = JsRegExp::new(r"(?<=\$)\d+", "g").unwrap();
    let masked = after_dollar.symbol_replace("$100 and $200", "XXX");
    assert_eq!(masked, "$XXX and $XXX");
}
/// A positive lookbehind on `x` must yield no match when the input contains
/// no `x`.
#[test]
fn test_lookbehind_at_start_no_match() {
    let after_x = JsRegExp::new(r"(?<=x)\d+", "").unwrap();
    let outcome = after_x.exec("42");
    assert!(outcome.is_none());
}
/// Lookbehind and lookahead in one pattern: digits must follow `$` and be
/// followed by whitespace, otherwise there is no match.
#[test]
fn test_lookahead_and_lookbehind_combined() {
    let bounded = JsRegExp::new(r"(?<=\$)\d+(?=\s)", "").unwrap();
    let found = bounded.exec("$100 dollars").unwrap();
    assert_eq!(found.matched, "100");
    let without_space = bounded.exec("$100dollars");
    assert!(without_space.is_none());
}
}