mod nfa;
mod parser;
pub use nfa::{clear_fa_shell_cache, make_regexp_nfa_arena, regexp_has_plus_star};
pub use parser::{
Branch as RegexpBranch, Error as RegexpError, LookaroundType, QuantifiedAtom,
REGEXP_QUANTIFIER_MAX, RUNE_MAX, Root as RegexpRoot, RunePair, RuneRange, collect_lookarounds,
expand_word_boundaries, has_top_level_lookaround, has_word_boundary, parse as parse_regexp,
};
#[cfg(test)]
mod tests {
use std::sync::Arc;
use super::parser::{invert_rune_range, simplify_rune_range};
use super::*;
/// `abc` is expected to parse to a root with one entry that holds three elements.
#[test]
fn test_parse_simple() {
    let parsed = parse_regexp("abc").unwrap();
    // No `|` in the pattern, so a single top-level entry is expected.
    assert_eq!(parsed.len(), 1);
    let elements = &parsed[0];
    assert_eq!(elements.len(), 3);
}
/// `a|b` is expected to parse to a root with two entries, one per side of the `|`.
#[test]
fn test_parse_alternation() {
    let parsed = parse_regexp("a|b").unwrap();
    let alternative_count = parsed.len();
    assert_eq!(alternative_count, 2);
}
/// `[abc]` is expected to produce one element whose `runes` hold the single pair `a`..`c`.
#[test]
fn test_parse_char_class() {
    let parsed = parse_regexp("[abc]").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);
    let ranges = &parsed[0][0].runes;
    assert_eq!(ranges.len(), 1);
    // The three listed characters are adjacent, so one pair is expected to cover them.
    let only = &ranges[0];
    assert_eq!(only.lo, 'a');
    assert_eq!(only.hi, 'c');
}
/// `[a-z]` is expected to produce one element whose `runes` hold the single pair `a`..`z`.
#[test]
fn test_parse_char_range() {
    let parsed = parse_regexp("[a-z]").unwrap();
    assert_eq!(parsed.len(), 1);
    let ranges = &parsed[0][0].runes;
    assert_eq!(ranges.len(), 1);
    let span = &ranges[0];
    assert_eq!(span.lo, 'a');
    assert_eq!(span.hi, 'z');
}
/// In `a.b` the middle of the three parsed elements is expected to have `is_dot` set.
#[test]
fn test_parse_dot() {
    let parsed = parse_regexp("a.b").unwrap();
    assert_eq!(parsed.len(), 1);
    let elements = &parsed[0];
    assert_eq!(elements.len(), 3);
    let middle = &elements[1];
    assert!(middle.is_dot);
}
/// In `ab?c` the `b` element is expected to carry quantifier bounds 0..=1.
#[test]
fn test_parse_optional() {
    let parsed = parse_regexp("ab?c").unwrap();
    assert_eq!(parsed.len(), 1);
    let elements = &parsed[0];
    assert_eq!(elements.len(), 3);
    // Index 1 is the element followed by `?`.
    let optional = &elements[1];
    assert_eq!(optional.quant_min, 0);
    assert_eq!(optional.quant_max, 1);
}
/// `(a|b)c` is expected to parse to two elements, the first carrying a subtree with two
/// entries (one per alternative inside the parentheses).
#[test]
fn test_parse_group() {
    let parsed = parse_regexp("(a|b)c").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 2);
    let group = &parsed[0][0];
    assert!(group.subtree.is_some());
    let nested = group.subtree.as_ref().unwrap();
    assert_eq!(nested.len(), 2);
}
/// Two overlapping pairs (`a`..`c` and `b`..`d`) are expected to simplify to one pair `a`..`d`.
#[test]
fn test_simplify_rune_range() {
    let overlapping = vec![RunePair { lo: 'a', hi: 'c' }, RunePair { lo: 'b', hi: 'd' }];
    let merged = simplify_rune_range(overlapping);
    assert_eq!(merged.len(), 1);
    let span = &merged[0];
    assert_eq!(span.lo, 'a');
    assert_eq!(span.hi, 'd');
}
/// A character class that is never closed is expected to be a parse error.
#[test]
fn test_parse_invalid_unclosed_bracket() {
    let rejected = parse_regexp("[invalid").is_err();
    assert!(rejected, "Unclosed bracket should fail parsing");
}
/// `[a-z]+` is expected to parse to a single element for which `is_plus()` holds.
#[test]
fn test_parse_plus() {
    let parsed = parse_regexp("[a-z]+").unwrap();
    assert_eq!(parsed.len(), 1);
    let elements = &parsed[0];
    assert_eq!(elements.len(), 1);
    let repeated = &elements[0];
    assert!(repeated.is_plus(), "Should be recognized as plus quantifier");
}
/// `[a-z]*` is expected to parse to a single element for which `is_star()` holds.
#[test]
fn test_parse_star() {
    let parsed = parse_regexp("[a-z]*").unwrap();
    assert_eq!(parsed.len(), 1);
    let elements = &parsed[0];
    assert_eq!(elements.len(), 1);
    let repeated = &elements[0];
    assert!(repeated.is_star(), "Should be recognized as star quantifier");
}
/// `[^abc]` is expected to parse to a single element whose `runes` contain more than one pair.
#[test]
fn test_parse_negated_class() {
    let parsed = parse_regexp("[^abc]").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);
    let range_count = parsed[0][0].runes.len();
    assert!(
        range_count > 1,
        "Negated class should produce multiple ranges"
    );
}
/// In `a(?:b|c)d` the middle of three elements is expected to carry a subtree with two entries.
#[test]
fn test_parse_non_capturing_group() {
    let parsed = parse_regexp("a(?:b|c)d").unwrap();
    assert_eq!(parsed.len(), 1);
    let elements = &parsed[0];
    assert_eq!(elements.len(), 3);
    let group = &elements[1];
    assert!(
        group.subtree.is_some(),
        "Non-capturing group should have subtree"
    );
    let nested = group.subtree.as_ref().unwrap();
    assert_eq!(nested.len(), 2);
}
/// `(?:(?:a))` is expected to yield a subtree whose first element itself has a subtree.
#[test]
fn test_parse_non_capturing_nested() {
    let parsed = parse_regexp("(?:(?:a))").unwrap();
    assert_eq!(parsed.len(), 1);
    let outer_group = &parsed[0][0];
    assert!(outer_group.subtree.is_some());
    let outer_tree = outer_group.subtree.as_ref().unwrap();
    let inner_group = &outer_tree[0][0];
    assert!(inner_group.subtree.is_some());
}
/// A trailing `?` after `*`, `+`, `{2,5}` or `?` is expected to parse and to leave the
/// element reporting the same quantifier as it would without the trailing `?`.
#[test]
fn test_parse_lazy_quantifiers() {
    {
        let lazy_star = parse_regexp("a*?").unwrap();
        assert_eq!(lazy_star.len(), 1);
        assert!(lazy_star[0][0].is_star(), "Should be star quantifier");
    }
    {
        let lazy_plus = parse_regexp("a+?").unwrap();
        assert_eq!(lazy_plus.len(), 1);
        assert!(lazy_plus[0][0].is_plus(), "Should be plus quantifier");
    }
    {
        let lazy_range = parse_regexp("a{2,5}?").unwrap();
        assert_eq!(lazy_range.len(), 1);
        let bounded = &lazy_range[0][0];
        assert_eq!(bounded.quant_min, 2);
        assert_eq!(bounded.quant_max, 5);
    }
    {
        let lazy_optional = parse_regexp("a??").unwrap();
        assert_eq!(lazy_optional.len(), 1);
        assert!(lazy_optional[0][0].is_qm(), "Should be optional quantifier");
    }
}
/// Positive and negative lookahead and lookbehind are all expected to parse successfully.
#[test]
fn test_parse_lookaround_supported() {
    // (pattern, failure message) pairs, checked in order.
    let accepted = [
        ("a(?=b)", "Positive lookahead should parse"),
        ("a(?!b)", "Negative lookahead should parse"),
        ("(?<=a)b", "Positive lookbehind should parse"),
        ("(?<!a)b", "Negative lookbehind should parse"),
    ];
    for (pattern, why) in accepted {
        assert!(parse_regexp(pattern).is_ok(), "{why}");
    }
}
/// Atomic groups `(?>...)` and named groups `(?<name>...)` are expected to be parse errors.
#[test]
fn test_parse_unsupported_group_extension() {
    let atomic = parse_regexp("(?>a)");
    assert!(atomic.is_err(), "Atomic group should fail");
    let named = parse_regexp("(?<name>a)");
    assert!(named.is_err(), "Named group should fail");
}
/// A lookaround placed inside another lookaround is expected to be a parse error.
#[test]
fn test_parse_nested_lookaround_rejected() {
    // (pattern, failure message) pairs, checked in order.
    let rejected = [
        ("(?=(?=a)b)", "Nested lookahead should fail"),
        (
            "(?=a(?!b))",
            "Lookahead containing negative lookahead should fail",
        ),
        ("(?<=(?<=a)b)", "Nested lookbehind should fail"),
    ];
    for (pattern, why) in rejected {
        assert!(parse_regexp(pattern).is_err(), "{why}");
    }
}
/// Lookbehind bodies using `+`, `*` or `?` are expected to be rejected, while bodies made of
/// plain literals (`ab`, `abc`) are expected to parse.
#[test]
fn test_parse_variable_length_lookbehind_rejected() {
    let rejected = [
        ("(?<=a+)b", "Variable-length lookbehind (plus) should fail"),
        ("(?<=a*)b", "Variable-length lookbehind (star) should fail"),
        (
            "(?<=a?)b",
            "Variable-length lookbehind (optional) should fail",
        ),
    ];
    for (pattern, why) in rejected {
        assert!(parse_regexp(pattern).is_err(), "{why}");
    }

    let accepted = [
        ("(?<=ab)c", "Fixed-length lookbehind should parse"),
        (
            "(?<=abc)d",
            "Fixed-length lookbehind (3 chars) should parse",
        ),
    ];
    for (pattern, why) in accepted {
        assert!(parse_regexp(pattern).is_ok(), "{why}");
    }
}
/// Checks the `lookaround` tag on the parsed element for each of the four lookaround kinds:
/// lookaheads are inspected at index 3 (after `foo`), lookbehinds at index 0 (before `bar`).
#[test]
fn test_lookaround_atom_properties() {
    let positive_ahead = parse_regexp("foo(?=bar)").unwrap();
    assert_eq!(positive_ahead.len(), 1);
    // Three literal elements plus the lookaround itself.
    assert_eq!(positive_ahead[0].len(), 4);
    let assertion = &positive_ahead[0][3];
    assert_eq!(
        assertion.lookaround,
        Some(LookaroundType::PositiveLookahead)
    );
    assert!(assertion.subtree.is_some());

    let negative_ahead = parse_regexp("foo(?!bar)").unwrap();
    assert_eq!(
        negative_ahead[0][3].lookaround,
        Some(LookaroundType::NegativeLookahead)
    );

    let positive_behind = parse_regexp("(?<=foo)bar").unwrap();
    assert_eq!(positive_behind[0].len(), 4);
    assert_eq!(
        positive_behind[0][0].lookaround,
        Some(LookaroundType::PositiveLookbehind)
    );

    let negative_behind = parse_regexp("(?<!foo)bar").unwrap();
    assert_eq!(
        negative_behind[0][0].lookaround,
        Some(LookaroundType::NegativeLookbehind)
    );
}
/// The empty pattern is expected to parse successfully into a root with no entries.
#[test]
fn test_parse_empty() {
    let outcome = parse_regexp("");
    assert!(outcome.is_ok(), "Empty pattern should parse successfully");
    let entry_count = outcome.unwrap().len();
    assert_eq!(entry_count, 0);
}
/// Builds the arena NFA for the empty pattern and checks that the quoted empty value `""`
/// reaches the field matcher while the quoted value `"hi"` produces no transitions at all.
#[test]
fn test_nfa_empty_pattern() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let root = parse_regexp("").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    // Transitions are compared against the field matcher's address.
    let accept = Arc::as_ptr(&field_matcher) as usize;

    let empty_value = vec![b'"', b'"', ARENA_VALUE_TERMINATOR];
    let mut bufs = NfaBuffers::new();
    traverse_arena_nfa(&arena, start, &empty_value, &mut bufs);
    assert!(
        !bufs.transitions.is_empty(),
        "Empty regexp should match empty string"
    );
    // `assert_eq!` (rather than `assert!(a == b)`) so a failure prints both addresses.
    assert_eq!(
        bufs.transitions[0], accept,
        "Should transition to field_matcher"
    );

    let non_empty_value = vec![b'"', b'h', b'i', b'"', ARENA_VALUE_TERMINATOR];
    bufs.clear();
    traverse_arena_nfa(&arena, start, &non_empty_value, &mut bufs);
    assert!(
        bufs.transitions.is_empty(),
        "Empty regexp should NOT match non-empty string"
    );
}
/// The arena NFA for `[abc]` is expected to reach the field matcher on the quoted value `"a"`.
#[test]
fn test_nfa_simple_singleton() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let (arena, start, field_matcher) = make_regexp_nfa_arena(parse_regexp("[abc]").unwrap());
    // A match shows up as the field matcher's address among the transitions.
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    let quoted_a = vec![b'"', b'a', b'"', ARENA_VALUE_TERMINATOR];
    bufs.clear();
    traverse_arena_nfa(&arena, start, &quoted_a, &mut bufs);
    let matched = bufs.transitions.contains(&accept);
    assert!(matched, "Pattern [abc] should match 'a'");
}
/// Arena NFA for `[abc]+`: expected to accept `a` and `abc`, and to reject the empty string
/// and `x`.
#[test]
#[cfg_attr(miri, ignore)]
fn test_nfa_plus_quantifier() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let (arena, start, field_matcher) = make_regexp_nfa_arena(parse_regexp("[abc]+").unwrap());
    // A match shows up as the field matcher's address among the transitions.
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    // Clears the shared buffers, traverses `"<text>"` followed by the value terminator, and
    // reports whether the field matcher was reached.
    let mut reaches_matcher = |text: &str| {
        let mut value = vec![b'"'];
        value.extend_from_slice(text.as_bytes());
        value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        bufs.transitions.contains(&accept)
    };
    assert!(reaches_matcher("a"), "Pattern [abc]+ should match 'a'");
    assert!(reaches_matcher("abc"), "Pattern [abc]+ should match 'abc'");
    assert!(
        !reaches_matcher(""),
        "Pattern [abc]+ should NOT match empty string"
    );
    assert!(!reaches_matcher("x"), "Pattern [abc]+ should NOT match 'x'");
}
/// Arena NFA for `[abc]*`: expected to accept the empty string, `a` and `abc`.
#[test]
#[cfg_attr(miri, ignore)]
fn test_nfa_star_quantifier() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let (arena, start, field_matcher) = make_regexp_nfa_arena(parse_regexp("[abc]*").unwrap());
    // A match shows up as the field matcher's address among the transitions.
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    // Clears the shared buffers, traverses `"<text>"` followed by the value terminator, and
    // reports whether the field matcher was reached.
    let mut reaches_matcher = |text: &str| {
        let mut value = vec![b'"'];
        value.extend_from_slice(text.as_bytes());
        value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        bufs.transitions.contains(&accept)
    };
    assert!(
        reaches_matcher(""),
        "Pattern [abc]* should match empty string"
    );
    assert!(reaches_matcher("a"), "Pattern [abc]* should match 'a'");
    assert!(reaches_matcher("abc"), "Pattern [abc]* should match 'abc'");
}
/// `{n}`, `{n,m}` and `{n,}` are expected to parse into `quant_min`/`quant_max` bounds, with
/// an open upper bound reported as `REGEXP_QUANTIFIER_MAX`.
#[test]
fn test_parse_range_quantifier() {
    // (pattern, expected quant_min, expected quant_max)
    let expectations = [
        ("a{3}", 3, 3),
        ("a{2,5}", 2, 5),
        ("a{2,}", 2, REGEXP_QUANTIFIER_MAX),
    ];
    for (pattern, min, max) in expectations {
        let parsed = parse_regexp(pattern).unwrap();
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].len(), 1);
        let quantified = &parsed[0][0];
        assert_eq!(quantified.quant_min, min);
        assert_eq!(quantified.quant_max, max);
    }
}
/// Arena NFA for `a{3}`: expected to accept exactly `aaa` and to reject `aa` and `aaaa`.
#[test]
fn test_nfa_range_exact() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let (arena, start, field_matcher) = make_regexp_nfa_arena(parse_regexp("a{3}").unwrap());
    // A match shows up as the field matcher's address among the transitions.
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    // Clears the shared buffers, traverses `"<text>"` followed by the value terminator, and
    // reports whether the field matcher was reached.
    let mut reaches_matcher = |text: &str| {
        let mut value = vec![b'"'];
        value.extend_from_slice(text.as_bytes());
        value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        bufs.transitions.contains(&accept)
    };
    assert!(
        !reaches_matcher("aa"),
        "Pattern a{{3}} should NOT match 'aa'"
    );
    assert!(reaches_matcher("aaa"), "Pattern a{{3}} should match 'aaa'");
    assert!(
        !reaches_matcher("aaaa"),
        "Pattern a{{3}} should NOT match 'aaaa'"
    );
}
/// Arena NFA for `a{2,4}`: expected to accept runs of two, three and four `a`s and to reject
/// runs of one and five.
// Lint allowance kept as-is; the inputs below differ only in how many `a`s they contain.
#[allow(clippy::similar_names)]
#[test]
fn test_nfa_range_bounded() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let (arena, start, field_matcher) = make_regexp_nfa_arena(parse_regexp("a{2,4}").unwrap());
    // A match shows up as the field matcher's address among the transitions.
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    // Clears the shared buffers, traverses `"<text>"` followed by the value terminator, and
    // reports whether the field matcher was reached.
    let mut reaches_matcher = |text: &str| {
        let mut value = vec![b'"'];
        value.extend_from_slice(text.as_bytes());
        value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        bufs.transitions.contains(&accept)
    };
    assert!(
        !reaches_matcher("a"),
        "Pattern a{{2,4}} should NOT match 'a'"
    );
    assert!(reaches_matcher("aa"), "Pattern a{{2,4}} should match 'aa'");
    assert!(
        reaches_matcher("aaa"),
        "Pattern a{{2,4}} should match 'aaa'"
    );
    assert!(
        reaches_matcher("aaaa"),
        "Pattern a{{2,4}} should match 'aaaa'"
    );
    assert!(
        !reaches_matcher("aaaaa"),
        "Pattern a{{2,4}} should NOT match 'aaaaa'"
    );
}
/// Arena NFA for `[abc]{2,3}`: expected to accept `ab` and `abc`, and to reject `a` and `abcd`.
#[test]
fn test_nfa_range_with_class() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let (arena, start, field_matcher) =
        make_regexp_nfa_arena(parse_regexp("[abc]{2,3}").unwrap());
    // A match shows up as the field matcher's address among the transitions.
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    // Clears the shared buffers, traverses `"<text>"` followed by the value terminator, and
    // reports whether the field matcher was reached.
    let mut reaches_matcher = |text: &str| {
        let mut value = vec![b'"'];
        value.extend_from_slice(text.as_bytes());
        value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        bufs.transitions.contains(&accept)
    };
    assert!(
        !reaches_matcher("a"),
        "Pattern [abc]{{2,3}} should NOT match 'a'"
    );
    assert!(
        reaches_matcher("ab"),
        "Pattern [abc]{{2,3}} should match 'ab'"
    );
    assert!(
        reaches_matcher("abc"),
        "Pattern [abc]{{2,3}} should match 'abc'"
    );
    assert!(
        !reaches_matcher("abcd"),
        "Pattern [abc]{{2,3}} should NOT match 'abcd'"
    );
}
/// Arena NFA for `a{0,2}`: expected to accept the empty string, `a` and `aa`, and to reject
/// `aaa`.
#[test]
fn test_nfa_range_zero_min() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let (arena, start, field_matcher) = make_regexp_nfa_arena(parse_regexp("a{0,2}").unwrap());
    // A match shows up as the field matcher's address among the transitions.
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    // Clears the shared buffers, traverses `"<text>"` followed by the value terminator, and
    // reports whether the field matcher was reached.
    let mut reaches_matcher = |text: &str| {
        let mut value = vec![b'"'];
        value.extend_from_slice(text.as_bytes());
        value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        bufs.transitions.contains(&accept)
    };
    assert!(
        reaches_matcher(""),
        "Pattern a{{0,2}} should match empty string"
    );
    assert!(reaches_matcher("a"), "Pattern a{{0,2}} should match 'a'");
    assert!(reaches_matcher("aa"), "Pattern a{{0,2}} should match 'aa'");
    assert!(
        !reaches_matcher("aaa"),
        "Pattern a{{0,2}} should NOT match 'aaa'"
    );
}
/// Table-driven check of `invert_rune_range`: each case pairs an input list of rune pairs
/// with the complement the test expects. Every expected complement in this table skips
/// U+D800..=U+DFFF (one pair ends at U+D7FF and the next begins at U+E000), except the case
/// whose input already spans everything but `y`.
#[test]
fn test_invert_rune_range() {
    // Shorthand constructor to keep the table compact.
    let pair = |lo: char, hi: char| RunePair { lo, hi };
    let test_cases = vec![
        // Single character.
        (
            vec![pair('b', 'b')],
            vec![
                pair('\0', 'a'),
                pair('c', '\u{D7FF}'),
                pair('\u{E000}', RUNE_MAX),
            ],
        ),
        // Single contiguous range.
        (
            vec![pair('l', 'n')],
            vec![
                pair('\0', 'k'),
                pair('o', '\u{D7FF}'),
                pair('\u{E000}', RUNE_MAX),
            ],
        ),
        // Two ranges with a one-character gap between them.
        (
            vec![pair('b', 'n'), pair('p', 'q')],
            vec![
                pair('\0', 'a'),
                pair('o', 'o'),
                pair('r', '\u{D7FF}'),
                pair('\u{E000}', RUNE_MAX),
            ],
        ),
        // Input covering everything except `y`.
        (
            vec![pair('\0', 'x'), pair('z', RUNE_MAX)],
            vec![pair('y', 'y')],
        ),
        // Adjacent single characters supplied out of order.
        (
            vec![pair('d', 'd'), pair('b', 'b'), pair('c', 'c')],
            vec![
                pair('\0', 'a'),
                pair('e', '\u{D7FF}'),
                pair('\u{E000}', RUNE_MAX),
            ],
        ),
    ];
    for (i, (input, expected)) in test_cases.into_iter().enumerate() {
        let result = invert_rune_range(input);
        assert_eq!(
            result.len(),
            expected.len(),
            "Test case {i}: wrong number of ranges. Got {result:?}, expected {expected:?}"
        );
        // Lengths are equal at this point, so indexing `expected` by `j` is in bounds.
        for (j, got) in result.iter().enumerate() {
            let want = &expected[j];
            assert_eq!(
                got.lo, want.lo,
                "Test case {} range {}: wrong lo. Got {:?}, expected {:?}",
                i, j, got.lo, want.lo
            );
            assert_eq!(
                got.hi, want.hi,
                "Test case {} range {}: wrong hi. Got {:?}, expected {:?}",
                i, j, got.hi, want.hi
            );
        }
    }
}
/// A nested `(( ...?)*)+` pattern over `~`-escaped metacharacters is expected to parse, to be
/// reported by `regexp_has_plus_star`, and to accept a string repeating those characters.
#[test]
fn test_toxic_stack_arena() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let re = "(([~.~~~?~*~+~{~}~[~]~(~)~|]?)*)+";
    let root = parse_regexp(re).expect("Should parse toxic stack pattern");
    assert!(regexp_has_plus_star(&root), "Toxic pattern should have +/*");
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let accept = Arc::as_ptr(&field_matcher) as usize;

    // Quoted subject followed by the value terminator.
    let test_str = ".~?*+{}[]()|.~?*+{}[]()|.~?*+{}[]()|";
    let mut value: Vec<u8> = vec![b'"'];
    value.extend_from_slice(test_str.as_bytes());
    value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);

    let mut bufs = NfaBuffers::new();
    traverse_arena_nfa(&arena, start, &value, &mut bufs);
    let matched = bufs.transitions.contains(&accept);
    assert!(
        matched,
        "Toxic stack pattern should match test string via arena NFA"
    );
}
/// Single-character checks against two positive classes: `[a-z]` is expected to accept
/// `a`, `b`, `c` and reject `A`, `B`, `C`, `1`, `2`, `3`; `[a-zA-Z0-9]` is expected to accept
/// `a`, `Z` and `5`. One buffer set is reused across both patterns.
#[test]
fn test_nfa_positive_class_miri_friendly() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let mut bufs = NfaBuffers::new();

    let (lower_arena, lower_start, lower_matcher) =
        make_regexp_nfa_arena(parse_regexp("[a-z]").unwrap());
    let lower_accept = Arc::as_ptr(&lower_matcher) as usize;
    // Clears the buffers, traverses the quoted single byte, reports whether it was accepted.
    let mut lower_accepts = |byte: u8| {
        let value = vec![b'"', byte, b'"', ARENA_VALUE_TERMINATOR];
        bufs.clear();
        traverse_arena_nfa(&lower_arena, lower_start, &value, &mut bufs);
        bufs.transitions.contains(&lower_accept)
    };
    for &byte in b"abc" {
        assert!(
            lower_accepts(byte),
            "Pattern [a-z] should match '{}'",
            byte as char
        );
    }
    for &byte in b"ABC123" {
        assert!(
            !lower_accepts(byte),
            "Pattern [a-z] should NOT match '{}'",
            byte as char
        );
    }

    let (alnum_arena, alnum_start, alnum_matcher) =
        make_regexp_nfa_arena(parse_regexp("[a-zA-Z0-9]").unwrap());
    let alnum_accept = Arc::as_ptr(&alnum_matcher) as usize;
    let mut alnum_accepts = |byte: u8| {
        let value = vec![b'"', byte, b'"', ARENA_VALUE_TERMINATOR];
        bufs.clear();
        traverse_arena_nfa(&alnum_arena, alnum_start, &value, &mut bufs);
        bufs.transitions.contains(&alnum_accept)
    };
    for &byte in b"aZ5" {
        assert!(
            alnum_accepts(byte),
            "Pattern [a-zA-Z0-9] should match '{}'",
            byte as char
        );
    }
}
/// Arena NFA for `[^abc]`: expected to reject each of `a`, `b`, `c` and to accept each of
/// `x`, `y`, `z`.
#[test]
#[cfg_attr(miri, ignore)]
fn test_negated_class_nfa() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let (arena, start, field_matcher) = make_regexp_nfa_arena(parse_regexp("[^abc]").unwrap());
    // A match shows up as the field matcher's address among the transitions.
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    // Clears the buffers, traverses the quoted single byte, reports whether it was accepted.
    let mut accepts = |byte: u8| {
        let value = vec![b'"', byte, b'"', ARENA_VALUE_TERMINATOR];
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        bufs.transitions.contains(&accept)
    };
    for &byte in b"abc" {
        assert!(
            !accepts(byte),
            "Pattern [^abc] should NOT match '{}'",
            byte as char
        );
    }
    for &byte in b"xyz" {
        assert!(
            accepts(byte),
            "Pattern [^abc] should match '{}'",
            byte as char
        );
    }
}
/// Each listed star pattern is expected to accept the quoted empty value.
#[test]
#[cfg_attr(miri, ignore)]
fn test_star_matches_empty() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    for pattern in ["[a-z]*", "[0-9]*", ".*", "([abc]*)"] {
        let (arena, start, field_matcher) =
            make_regexp_nfa_arena(parse_regexp(pattern).unwrap());
        let accept = Arc::as_ptr(&field_matcher) as usize;
        // Fresh buffers per pattern.
        let mut bufs = NfaBuffers::new();
        let empty = vec![b'"', b'"', ARENA_VALUE_TERMINATOR];
        bufs.clear();
        traverse_arena_nfa(&arena, start, &empty, &mut bufs);
        let matched = bufs.transitions.contains(&accept);
        assert!(matched, "Pattern {pattern} should match empty string");
    }
}
/// Single-pattern variant of `test_star_matches_empty`: `[a-z]*` is expected to accept the
/// quoted empty value.
///
/// Deliberately NOT marked `#[cfg_attr(miri, ignore)]`. The multi-pattern sibling is skipped
/// under Miri, and an ignored "miri friendly" test would never run there, so this variant is
/// what keeps the behaviour covered in Miri runs (like the other `*_miri_friendly` tests).
#[test]
fn test_star_matches_empty_miri_friendly() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let root = parse_regexp("[a-z]*").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    // A match shows up as the field matcher's address among the transitions.
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    let empty = vec![b'"', b'"', ARENA_VALUE_TERMINATOR];
    bufs.clear();
    traverse_arena_nfa(&arena, start, &empty, &mut bufs);
    assert!(
        bufs.transitions.contains(&accept),
        "Pattern [a-z]* should match empty string"
    );
}
/// `[a-z]+@example~.com` is expected to be reported by `regexp_has_plus_star` and to accept
/// `alice@example.com`.
#[test]
fn test_arena_nfa_email_pattern() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let pattern = "[a-z]+@example~.com";
    let root = parse_regexp(pattern).unwrap();
    let has_repetition = regexp_has_plus_star(&root);
    assert!(
        has_repetition,
        "Pattern should be detected as having + quantifier"
    );
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::with_capacity();

    // Quoted subject followed by the value terminator.
    let mut value = vec![b'"'];
    value.extend_from_slice(b"alice@example.com");
    value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);

    traverse_arena_nfa(&arena, start, &value, &mut bufs);
    let matched = bufs.transitions.contains(&accept);
    assert!(
        matched,
        "Pattern {pattern} should match 'alice@example.com'"
    );
}
/// The arena NFA for `[a-z]+` is expected to accept `abc`.
#[test]
fn test_arena_nfa_plus_simple() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let pattern = "[a-z]+";
    let (arena, start, field_matcher) = make_regexp_nfa_arena(parse_regexp(pattern).unwrap());
    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::with_capacity();

    // Quoted subject followed by the value terminator.
    let mut value = vec![b'"'];
    value.extend_from_slice(b"abc");
    value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);

    traverse_arena_nfa(&arena, start, &value, &mut bufs);
    let matched = bufs.transitions.contains(&accept);
    assert!(matched, "Pattern {pattern} should match 'abc'");
}
/// Table of small `*`/`+` patterns checked end to end: each row parses the pattern, builds
/// its arena NFA, traverses the quoted and terminated input, and compares the outcome with
/// the expected accept/reject flag.
#[test]
fn test_arena_nfa_star_plus_miri_friendly() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    /// Returns whether traversing `"<input>"` plus the terminator reaches the field matcher
    /// of the NFA built for `pattern`. Panics if `pattern` does not parse.
    fn matches(pattern: &str, input: &str) -> bool {
        let root = match parse_regexp(pattern) {
            Ok(root) => root,
            Err(_) => panic!("Failed to parse: {pattern}"),
        };
        let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
        let mut bufs = NfaBuffers::with_capacity();
        let mut value: Vec<u8> = vec![b'"'];
        value.extend_from_slice(input.as_bytes());
        value.extend_from_slice(&[b'"', ARENA_VALUE_TERMINATOR]);
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        let accept = Arc::as_ptr(&field_matcher) as usize;
        bufs.transitions.contains(&accept)
    }

    // (pattern, input, expected outcome, failure message)
    let cases = [
        ("[abc]*", "", true, "[abc]* should match empty"),
        ("[abc]*", "a", true, "[abc]* should match 'a'"),
        ("[abc]*", "abc", true, "[abc]* should match 'abc'"),
        ("[abc]*", "aabbcc", true, "[abc]* should match 'aabbcc'"),
        ("[abc]*", "x", false, "[abc]* should not match 'x'"),
        ("[abc]*", "abx", false, "[abc]* should not match 'abx'"),
        ("[abc]+", "", false, "[abc]+ should not match empty"),
        ("[abc]+", "a", true, "[abc]+ should match 'a'"),
        ("[abc]+", "abc", true, "[abc]+ should match 'abc'"),
        ("[abc]+", "x", false, "[abc]+ should not match 'x'"),
        ("[a-z]*", "", true, "[a-z]* should match empty"),
        ("[a-z]*", "hello", true, "[a-z]* should match 'hello'"),
        ("[a-z]*", "Hello", false, "[a-z]* should not match 'Hello'"),
        ("[0-9]+", "", false, "[0-9]+ should not match empty"),
        ("[0-9]+", "123", true, "[0-9]+ should match '123'"),
        ("[0-9]+", "12a", false, "[0-9]+ should not match '12a'"),
        ("[a-z]+@[a-z]+", "foo@bar", true, "email-like should match"),
        (
            "[a-z]+@[a-z]+",
            "foo@",
            false,
            "incomplete email should not match",
        ),
    ];
    for (pattern, input, expected, why) in cases {
        assert!(matches(pattern, input) == expected, "{why}");
    }
}
/// Edge cases for `.`, `~P{C}` and `~p{Lo}` with and without `*`, using multi-byte inputs.
///
/// Note that `matches` traverses the quoted input WITHOUT appending
/// `ARENA_VALUE_TERMINATOR`, while `matches_with_vt` appends it; only the final assertion
/// uses the terminated form.
#[test]
#[cfg_attr(miri, ignore)]
fn test_negated_category_star_edge_cases() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    /// Shared driver: parses `pattern` (panicking on failure), builds its arena NFA, and
    /// traverses `"<input>"`, appending the value terminator only when `terminated` is set.
    /// Returns whether the field matcher was reached.
    fn reaches_matcher(pattern: &str, input: &str, terminated: bool) -> bool {
        let root = match parse_regexp(pattern) {
            Ok(root) => root,
            Err(_) => panic!("Failed to parse: {pattern}"),
        };
        let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
        let mut bufs = NfaBuffers::with_capacity();
        let mut value: Vec<u8> = vec![b'"'];
        value.extend_from_slice(input.as_bytes());
        value.push(b'"');
        if terminated {
            value.push(ARENA_VALUE_TERMINATOR);
        }
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        let accept = Arc::as_ptr(&field_matcher) as usize;
        bufs.transitions.contains(&accept)
    }
    /// Quoted input, no value terminator.
    fn matches(pattern: &str, input: &str) -> bool {
        reaches_matcher(pattern, input, false)
    }
    /// Quoted input followed by the value terminator.
    fn matches_with_vt(pattern: &str, input: &str) -> bool {
        reaches_matcher(pattern, input, true)
    }

    // Dot-star over ASCII, a multi-byte character, and the empty value.
    assert!(matches(".*", "a"), ".* should match 'a'");
    assert!(matches(".*", "₠"), ".* should match '₠'");
    assert!(matches(".*", ""), ".* should match empty");

    // Negated category `~P{C}`, bare and starred.
    assert!(matches("~P{C}", "₠"), "~P{{C}} should match '₠'");
    assert!(
        matches("~P{C}*", ""),
        "~P{{C}}* should match empty (zero chars)"
    );
    assert!(
        matches("~P{C}*", "₠"),
        "~P{{C}}* should match '₠' (single non-C char)"
    );

    // Category `~p{Lo}`, bare and starred, with 2-byte and 4-byte characters.
    assert!(matches("~p{Lo}", "א"), "~p{{Lo}} should match Hebrew Alef");
    assert!(
        matches("~p{Lo}", "𪘀"),
        "~p{{Lo}} should match CJK Extension B char"
    );
    assert!(
        matches("~p{Lo}*", "א"),
        "~p{{Lo}}* should match Hebrew Alef"
    );
    assert!(
        matches("~p{Lo}*", "𪘀"),
        "~p{{Lo}}* should match CJK Extension B char"
    );
    assert!(
        matches("~p{Lo}*", "א𪘀"),
        "~p{{Lo}}* should match Hebrew + CJK"
    );
    assert!(
        matches_with_vt("~p{Lo}*", "א𪘀"),
        "~p{{Lo}}* should match Hebrew + CJK (with VT)"
    );
}
/// For `[^x]+` the state reached from the start state on `"` is expected to carry an `accel`
/// entry with exactly one exit byte, `x`. The pattern is then exercised on ASCII, rejected,
/// empty, and multi-byte inputs (all traversed without a value terminator).
#[test]
#[cfg_attr(miri, ignore)]
fn test_negated_single_char_ascii_fast_path() {
    use crate::automaton::arena::{NfaBuffers, traverse_arena_nfa};
    let pattern = "[^x]+";
    let root = match parse_regexp(pattern) {
        Ok(root) => root,
        Err(_) => panic!("Failed to parse: {pattern}"),
    };
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);

    // Step over the opening quote to reach the state that handles the value's first byte.
    let after_quote = arena[start].table.dstep(b'"');
    assert!(!after_quote.is_none(), "start should transition on '\"'");
    let body_state = &arena[after_quote];
    assert!(
        body_state.table.accel.is_some(),
        "[^x]+ should have acceleration with ASCII fast path"
    );
    let fast_path = body_state.table.accel.as_ref().unwrap();
    assert_eq!(fast_path.len, 1, "Should have 1 exit byte");
    assert_eq!(fast_path.exit_bytes[0], b'x', "Exit byte should be 'x'");

    let accept = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::with_capacity();

    traverse_arena_nfa(&arena, start, b"\"abc\"", &mut bufs);
    assert!(
        bufs.transitions.contains(&accept),
        "[^x]+ should match 'abc'"
    );

    bufs.clear();
    traverse_arena_nfa(&arena, start, b"\"xabc\"", &mut bufs);
    assert!(bufs.transitions.is_empty(), "[^x]+ should not match 'xabc'");

    bufs.clear();
    traverse_arena_nfa(&arena, start, b"\"\"", &mut bufs);
    assert!(
        bufs.transitions.is_empty(),
        "[^x]+ should not match empty string"
    );

    bufs.clear();
    // Multi-byte input: quoted Greek letters.
    let mut greek = vec![b'"'];
    greek.extend_from_slice("αβγ".as_bytes());
    greek.push(b'"');
    traverse_arena_nfa(&arena, start, &greek, &mut bufs);
    assert!(
        bufs.transitions.contains(&accept),
        "[^x]+ should match Unicode 'αβγ'"
    );
}
/// For `[^ü]+` (a non-ASCII excluded character) the inspected state is expected to have no
/// `accel` entry.
// NOTE(review): this looks at `arena[start]` directly, whereas
// `test_negated_single_char_ascii_fast_path` first steps over the opening `"` with `dstep`
// and inspects the state reached there. If acceleration only ever lives on that later state,
// this assertion may hold trivially — confirm which state is meant to be checked.
#[test]
#[cfg_attr(miri, ignore)]
fn test_negated_unicode_char_no_ascii_fast_path() {
    let pattern = "[^ü]+";
    let root = match parse_regexp(pattern) {
        Ok(root) => root,
        Err(_) => panic!("Failed to parse: {pattern}"),
    };
    // The field matcher is kept bound (though unused) for the duration of the test.
    let (arena, start, _field_matcher) = make_regexp_nfa_arena(root);
    let accelerated = arena[start].table.accel.is_some();
    assert!(
        !accelerated,
        "[^ü]+ should NOT have ASCII fast path acceleration"
    );
}
/// Malformed `{...}` quantifiers are each expected to be a parse error; every row pairs the
/// pattern with a short description of what is wrong with it.
#[test]
fn test_range_quantifier_parse_errors() {
    let error_cases = [
        ("a{9999999999998,9999999999999}", "overflow in lo"),
        ("a{2x-3}", "invalid char after digits"),
        ("a{2,", "incomplete - no closing brace"),
        ("a{2,r}", "invalid char after comma"),
        ("a{2,4x", "invalid after complete range"),
        ("a{2,9999999999999}", "overflow in hi"),
        ("a{5,2}", "min > max"),
        ("a{,3}", "missing lo"),
        ("a{}", "empty braces"),
    ];
    for (pattern, desc) in error_cases {
        let rejected = parse_regexp(pattern).is_err();
        assert!(rejected, "Pattern '{pattern}' should fail: {desc}");
    }
}
/// `a{0,1}` and `a?` are expected to behave identically: both NFAs are run on the same
/// quoted, terminated inputs and must agree with each other and with the expected outcome.
#[test]
fn test_range_quantifier_equivalence_question() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let root_range = parse_regexp("a{0,1}").unwrap();
    let root_qm = parse_regexp("a?").unwrap();
    let (arena_range, start_range, fm_range) = make_regexp_nfa_arena(root_range);
    let (arena_qm, start_qm, fm_qm) = make_regexp_nfa_arena(root_qm);
    // Each NFA has its own field matcher; a match is its address among the transitions.
    let accept_range = Arc::as_ptr(&fm_range) as usize;
    let accept_qm = Arc::as_ptr(&fm_qm) as usize;
    let mut bufs = NfaBuffers::new();
    // (value bytes, expected outcome, human-readable input)
    let test_cases = vec![
        (vec![b'"', b'"', ARENA_VALUE_TERMINATOR], true, "empty"),
        (vec![b'"', b'a', b'"', ARENA_VALUE_TERMINATOR], true, "a"),
        (
            vec![b'"', b'a', b'a', b'"', ARENA_VALUE_TERMINATOR],
            false,
            "aa",
        ),
        (vec![b'"', b'b', b'"', ARENA_VALUE_TERMINATOR], false, "b"),
    ];
    for (value, should_match, desc) in test_cases {
        bufs.clear();
        traverse_arena_nfa(&arena_range, start_range, &value, &mut bufs);
        let range_matched = bufs.transitions.contains(&accept_range);
        bufs.clear();
        traverse_arena_nfa(&arena_qm, start_qm, &value, &mut bufs);
        let qm_matched = bufs.transitions.contains(&accept_qm);
        assert_eq!(
            range_matched, qm_matched,
            "a{{0,1}} and a? should agree on '{desc}': range={range_matched}, qm={qm_matched}"
        );
        // The optional word carries its own trailing space so the message reads
        // "should match" / "should NOT match" without a doubled space.
        assert_eq!(
            range_matched,
            should_match,
            "Pattern should {}match '{}'",
            if should_match { "" } else { "NOT " },
            desc
        );
    }
}
/// `a{1,}` and `a+` are expected to behave identically: both NFAs are run on the same
/// quoted, terminated inputs and must agree with each other and with the expected outcome.
#[test]
#[cfg_attr(miri, ignore)]
fn test_range_quantifier_equivalence_plus() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let root_range = parse_regexp("a{1,}").unwrap();
    let root_plus = parse_regexp("a+").unwrap();
    let (arena_range, start_range, fm_range) = make_regexp_nfa_arena(root_range);
    let (arena_plus, start_plus, fm_plus) = make_regexp_nfa_arena(root_plus);
    // Each NFA has its own field matcher; a match is its address among the transitions.
    let accept_range = Arc::as_ptr(&fm_range) as usize;
    let accept_plus = Arc::as_ptr(&fm_plus) as usize;
    let mut bufs = NfaBuffers::new();
    // (value bytes, expected outcome, human-readable input)
    let test_cases = vec![
        (vec![b'"', b'"', ARENA_VALUE_TERMINATOR], false, "empty"),
        (vec![b'"', b'a', b'"', ARENA_VALUE_TERMINATOR], true, "a"),
        (
            vec![b'"', b'a', b'a', b'"', ARENA_VALUE_TERMINATOR],
            true,
            "aa",
        ),
        (
            vec![b'"', b'a', b'a', b'a', b'"', ARENA_VALUE_TERMINATOR],
            true,
            "aaa",
        ),
        (vec![b'"', b'b', b'"', ARENA_VALUE_TERMINATOR], false, "b"),
    ];
    for (value, should_match, desc) in test_cases {
        bufs.clear();
        traverse_arena_nfa(&arena_range, start_range, &value, &mut bufs);
        let range_matched = bufs.transitions.contains(&accept_range);
        bufs.clear();
        traverse_arena_nfa(&arena_plus, start_plus, &value, &mut bufs);
        let plus_matched = bufs.transitions.contains(&accept_plus);
        assert_eq!(
            range_matched, plus_matched,
            "a{{1,}} and a+ should agree on '{desc}': range={range_matched}, plus={plus_matched}"
        );
        // The optional word carries its own trailing space so the message reads
        // "should match" / "should NOT match" without a doubled space.
        assert_eq!(
            range_matched,
            should_match,
            "Pattern should {}match '{}'",
            if should_match { "" } else { "NOT " },
            desc
        );
    }
}
/// Confirms that `a{0,}` and `a*` are interchangeable: both NFAs must give the
/// same verdict on every sample value, and that verdict must be the expected one.
#[test]
#[cfg_attr(miri, ignore)]
fn test_range_quantifier_equivalence_star() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let root_range = parse_regexp("a{0,}").unwrap();
    let root_star = parse_regexp("a*").unwrap();
    let (arena_range, start_range, fm_range) = make_regexp_nfa_arena(root_range);
    let (arena_star, start_star, fm_star) = make_regexp_nfa_arena(root_star);
    let range_key = Arc::as_ptr(&fm_range) as usize;
    let star_key = Arc::as_ptr(&fm_star) as usize;
    let mut bufs = NfaBuffers::new();

    // (content, expected outcome, description used in failure messages)
    for (content, should_match, desc) in [
        ("", true, "empty"),
        ("a", true, "a"),
        ("aa", true, "aa"),
        ("b", false, "b"),
    ] {
        // Quote the content and append the arena value terminator.
        let value: Vec<u8> = std::iter::once(b'"')
            .chain(content.bytes())
            .chain([b'"', ARENA_VALUE_TERMINATOR])
            .collect();

        bufs.clear();
        traverse_arena_nfa(&arena_range, start_range, &value, &mut bufs);
        let range_matched = bufs.transitions.contains(&range_key);

        bufs.clear();
        traverse_arena_nfa(&arena_star, start_star, &value, &mut bufs);
        let star_matched = bufs.transitions.contains(&star_key);

        assert_eq!(
            range_matched, star_matched,
            "a{{0,}} and a* should agree on '{desc}': range={range_matched}, star={star_matched}"
        );
        let negation = if should_match { "" } else { "NOT" };
        assert_eq!(
            range_matched, should_match,
            "Pattern should {} match '{}'",
            negation, desc
        );
    }
}
/// Reduced version of the `a{0,}`/`a*` and `a{1,}`/`a+` equivalence checks that
/// feeds only two short inputs to each pair of NFAs.
///
/// Unlike the larger equivalence tests, this one is deliberately NOT marked
/// `#[cfg_attr(miri, ignore)]`: its whole purpose is to keep some coverage of
/// the quantifier equivalences when the suite runs under Miri, so ignoring it
/// there would defeat the point.
#[test]
fn test_range_quantifier_equivalence_miri_friendly() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};
    let mut bufs = NfaBuffers::new();

    // `a{0,}` must agree with `a*`.
    let root_range = parse_regexp("a{0,}").unwrap();
    let root_star = parse_regexp("a*").unwrap();
    let (arena_range, start_range, fm_range) = make_regexp_nfa_arena(root_range);
    let (arena_star, start_star, fm_star) = make_regexp_nfa_arena(root_star);
    for (value, desc) in [
        (vec![b'"', b'"', ARENA_VALUE_TERMINATOR], "empty"),
        (vec![b'"', b'a', b'"', ARENA_VALUE_TERMINATOR], "a"),
    ] {
        bufs.clear();
        traverse_arena_nfa(&arena_range, start_range, &value, &mut bufs);
        // A match shows up as the matcher's address among the transitions.
        let range_matched = bufs
            .transitions
            .contains(&(Arc::as_ptr(&fm_range) as usize));
        bufs.clear();
        traverse_arena_nfa(&arena_star, start_star, &value, &mut bufs);
        let star_matched = bufs.transitions.contains(&(Arc::as_ptr(&fm_star) as usize));
        assert_eq!(
            range_matched, star_matched,
            "a{{0,}} and a* should agree on '{desc}'"
        );
    }

    // `a{1,}` must agree with `a+`.
    let root_range = parse_regexp("a{1,}").unwrap();
    let root_plus = parse_regexp("a+").unwrap();
    let (arena_range, start_range, fm_range) = make_regexp_nfa_arena(root_range);
    let (arena_plus, start_plus, fm_plus) = make_regexp_nfa_arena(root_plus);
    for (value, desc) in [
        (vec![b'"', b'"', ARENA_VALUE_TERMINATOR], "empty"),
        (vec![b'"', b'a', b'"', ARENA_VALUE_TERMINATOR], "a"),
    ] {
        bufs.clear();
        traverse_arena_nfa(&arena_range, start_range, &value, &mut bufs);
        let range_matched = bufs
            .transitions
            .contains(&(Arc::as_ptr(&fm_range) as usize));
        bufs.clear();
        traverse_arena_nfa(&arena_plus, start_plus, &value, &mut bufs);
        let plus_matched = bufs.transitions.contains(&(Arc::as_ptr(&fm_plus) as usize));
        assert_eq!(
            range_matched, plus_matched,
            "a{{1,}} and a+ should agree on '{desc}'"
        );
    }
}
/// Checks that `a{1}` accepts exactly one `a`: the empty value and `aa` are
/// rejected, `a` is accepted.
#[test]
fn test_range_quantifier_exact_one() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let root = parse_regexp("a{1}").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let matcher_key = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();

    // (content, expected outcome, description used in failure messages)
    for (content, should_match, desc) in [
        ("", false, "empty"),
        ("a", true, "a"),
        ("aa", false, "aa"),
    ] {
        // Quote the content and append the arena value terminator.
        let mut value = vec![b'"'];
        value.extend(content.bytes());
        value.extend([b'"', ARENA_VALUE_TERMINATOR]);

        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        let matched = bufs.transitions.contains(&matcher_key);

        let negation = if should_match { "" } else { "NOT" };
        assert_eq!(
            matched, should_match,
            "a{{1}} should {} match '{}'",
            negation, desc
        );
    }
}
/// Checks that `a{0,0}` accepts only the empty value and rejects `a` and `aa`.
#[test]
fn test_range_quantifier_exact_zero() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let root = parse_regexp("a{0,0}").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let matcher_key = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();

    // (content, expected outcome, description used in failure messages)
    let cases = [
        ("", true, "empty"),
        ("a", false, "a"),
        ("aa", false, "aa"),
    ];
    for (content, should_match, desc) in cases {
        // Quote the content and append the arena value terminator.
        let value: Vec<u8> = std::iter::once(b'"')
            .chain(content.bytes())
            .chain([b'"', ARENA_VALUE_TERMINATOR])
            .collect();

        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        let matched = bufs.transitions.contains(&matcher_key);

        let negation = if should_match { "" } else { "NOT" };
        assert_eq!(
            matched, should_match,
            "a{{0,0}} should {} match '{}'",
            negation, desc
        );
    }
}
/// Checks that `.{2,4}` accepts values of two to four characters and rejects
/// shorter and longer ones.
#[test]
fn test_range_quantifier_with_dot() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    // Wraps the content in double quotes and appends the arena value terminator.
    let quoted = |content: &str| -> Vec<u8> {
        let mut bytes = Vec::with_capacity(content.len() + 3);
        bytes.push(b'"');
        bytes.extend_from_slice(content.as_bytes());
        bytes.push(b'"');
        bytes.push(ARENA_VALUE_TERMINATOR);
        bytes
    };

    let root = parse_regexp(".{2,4}").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let matcher_key = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();

    // (content, expected outcome, description used in failure messages)
    let cases = [
        ("", false, "empty"),
        ("x", false, "x"),
        ("xy", true, "xy"),
        ("abc", true, "abc"),
        ("abcd", true, "abcd"),
        ("abcde", false, "abcde"),
    ];
    for (content, should_match, desc) in cases {
        let value = quoted(content);
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        let matched = bufs.transitions.contains(&matcher_key);

        let negation = if should_match { "" } else { "NOT" };
        assert_eq!(
            matched, should_match,
            ".{{2,4}} should {} match '{}'",
            negation, desc
        );
    }
}
/// Checks that the quantified group `(ab){2,3}` accepts two or three
/// repetitions of `ab` and rejects zero, one, or four.
#[test]
fn test_range_quantifier_with_group() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let root = parse_regexp("(ab){2,3}").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let matcher_key = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();

    // (content, expected outcome, description used in failure messages)
    let cases = [
        ("", false, "empty"),
        ("ab", false, "ab"),
        ("abab", true, "abab"),
        ("ababab", true, "ababab"),
        ("abababab", false, "abababab"),
    ];
    for (content, should_match, desc) in cases {
        // Quote the content and append the arena value terminator.
        let mut value = Vec::with_capacity(content.len() + 3);
        value.push(b'"');
        value.extend_from_slice(content.as_bytes());
        value.push(b'"');
        value.push(ARENA_VALUE_TERMINATOR);

        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        let matched = bufs.transitions.contains(&matcher_key);

        let negation = if should_match { "" } else { "NOT" };
        assert_eq!(
            matched, should_match,
            "(ab){{2,3}} should {} match '{}'",
            negation, desc
        );
    }
}
/// Checks the bounds of `a{5,10}` by feeding runs of `a` of several lengths
/// around the lower and upper limits.
#[test]
fn test_range_quantifier_larger_values() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let root = parse_regexp("a{5,10}").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let matcher_key = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();

    // (number of 'a' characters, expected outcome)
    let cases: [(usize, bool); 5] = [(4, false), (5, true), (7, true), (10, true), (11, false)];
    for (count, should_match) in cases {
        // Opening quote, `count` copies of 'a', closing quote, terminator.
        let value: Vec<u8> = std::iter::once(b'"')
            .chain(std::iter::repeat_n(b'a', count))
            .chain([b'"', ARENA_VALUE_TERMINATOR])
            .collect();

        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        let matched = bufs.transitions.contains(&matcher_key);

        let negation = if should_match { "" } else { "NOT" };
        assert_eq!(
            matched, should_match,
            "a{{5,10}} should {} match {} 'a's",
            negation, count
        );
    }
}
/// Parses each `~`-prefixed multi-character escape on its own and checks the
/// shape of the result: one branch, one atom, and the expected rune-pair count.
#[test]
fn test_multi_char_escapes_parse() {
    // Parses `pattern` and asserts it produced exactly one branch holding one atom.
    let parse_single_atom = |pattern: &str| {
        let parsed = parse_regexp(pattern).unwrap();
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].len(), 1);
        parsed
    };

    // `~d` is the single range '0'..'9'.
    let digits = parse_single_atom("~d");
    let digit_runes = &digits[0][0].runes;
    assert_eq!(digit_runes.len(), 1);
    assert_eq!(digit_runes[0].lo, '0');
    assert_eq!(digit_runes[0].hi, '9');

    for (pattern, pair_count) in [("~w", 4), ("~s", 4)] {
        let parsed = parse_single_atom(pattern);
        assert_eq!(parsed[0][0].runes.len(), pair_count);
    }

    // The negated digit class needs at least two ranges.
    let non_digits = parse_single_atom("~D");
    assert!(non_digits[0][0].runes.len() >= 2);
    assert!(parse_regexp("~W").is_ok());
    assert!(parse_regexp("~S").is_ok());

    for (pattern, pair_count) in [("~i", 16), ("~c", 22)] {
        let parsed = parse_single_atom(pattern);
        assert_eq!(parsed[0][0].runes.len(), pair_count);
    }

    assert!(parse_regexp("~I").is_ok());
    assert!(parse_regexp("~C").is_ok());
}
/// Runs the NFAs built for the `~i` and `~c` escapes against single-byte quoted
/// values and checks which bytes each one accepts.
#[test]
#[cfg_attr(miri, ignore)]
fn test_xml_escapes_nfa() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let root = parse_regexp("~i").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let matcher_key = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    // (byte under test, expected outcome, failure message)
    let initial_cases = [
        (b'a', true, "~i should match 'a'"),
        (b':', true, "~i should match ':'"),
        (b'_', true, "~i should match '_'"),
        (b'1', false, "~i should NOT match '1'"),
        (b'-', false, "~i should NOT match '-'"),
    ];
    for (byte, expected, message) in initial_cases {
        let value = vec![b'"', byte, b'"', ARENA_VALUE_TERMINATOR];
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        let matched = bufs.transitions.contains(&matcher_key);
        assert!(matched == expected, "{}", message);
    }

    let root = parse_regexp("~c").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let matcher_key = Arc::as_ptr(&field_matcher) as usize;
    // (byte under test, expected outcome, failure message)
    let name_char_cases = [
        (b'1', true, "~c should match '1'"),
        (b'-', true, "~c should match '-'"),
        (b'.', true, "~c should match '.'"),
        (b' ', false, "~c should NOT match ' '"),
    ];
    for (byte, expected, message) in name_char_cases {
        let value = vec![b'"', byte, b'"', ARENA_VALUE_TERMINATOR];
        bufs.clear();
        traverse_arena_nfa(&arena, start, &value, &mut bufs);
        let matched = bufs.transitions.contains(&matcher_key);
        assert!(matched == expected, "{}", message);
    }
}
/// Runs the NFAs built for `~d`, `~w` and `~s` against single-byte quoted
/// values, checking one or two accepted bytes and one rejected byte per escape.
#[test]
fn test_multi_char_escapes_nfa() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let mut bufs = NfaBuffers::new();
    // Per pattern: (byte under test, expected outcome, failure message).
    let suites = [
        (
            "~d",
            vec![
                (b'5', true, "~d should match '5'"),
                (b'a', false, "~d should NOT match 'a'"),
            ],
        ),
        (
            "~w",
            vec![
                (b'a', true, "~w should match 'a'"),
                (b'_', true, "~w should match '_'"),
                (b'-', false, "~w should NOT match '-'"),
            ],
        ),
        (
            "~s",
            vec![
                (b' ', true, "~s should match ' '"),
                (b'\t', true, "~s should match '\\t'"),
                (b'x', false, "~s should NOT match 'x'"),
            ],
        ),
    ];
    for (pattern, checks) in suites {
        let root = parse_regexp(pattern).unwrap();
        let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
        let matcher_key = Arc::as_ptr(&field_matcher) as usize;
        for (byte, expected, message) in checks {
            let value = vec![b'"', byte, b'"', ARENA_VALUE_TERMINATOR];
            bufs.clear();
            traverse_arena_nfa(&arena, start, &value, &mut bufs);
            let matched = bufs.transitions.contains(&matcher_key);
            assert!(matched == expected, "{}", message);
        }
    }
}
/// Checks that a multi-character escape may appear inside a bracketed class,
/// alone (`[~d]`) or next to an ordinary range (`[~da-z]`).
#[test]
fn test_multi_char_escapes_in_class() {
    let digits_only = parse_regexp("[~d]").unwrap();
    assert_eq!(digits_only.len(), 1);
    assert_eq!(digits_only[0].len(), 1);
    let digit_runes = &digits_only[0][0].runes;
    assert_eq!(digit_runes.len(), 1);
    assert_eq!(digit_runes[0].lo, '0');
    assert_eq!(digit_runes[0].hi, '9');

    // Digits plus lowercase letters need at least two ranges.
    let digits_and_letters = parse_regexp("[~da-z]").unwrap();
    assert_eq!(digits_and_letters.len(), 1);
    assert_eq!(digits_and_letters[0].len(), 1);
    assert!(digits_and_letters[0][0].runes.len() >= 2);
}
/// Checks that quantifiers apply to multi-character escapes: `~d+` on a digit
/// run, and `a~s{0,3}b` with zero and with two whitespace bytes in between.
#[test]
#[cfg_attr(miri, ignore)]
fn test_multi_char_escape_with_quantifier() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    // Appends the arena value terminator to an already-quoted byte string.
    let terminated = |quoted: &[u8]| -> Vec<u8> {
        let mut bytes = quoted.to_vec();
        bytes.push(ARENA_VALUE_TERMINATOR);
        bytes
    };

    let root = parse_regexp("~d+").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let digits_key = Arc::as_ptr(&field_matcher) as usize;
    let mut bufs = NfaBuffers::new();
    let digits = terminated(b"\"123\"");
    traverse_arena_nfa(&arena, start, &digits, &mut bufs);
    assert!(
        bufs.transitions.contains(&digits_key),
        "~d+ should match '123'"
    );

    let root = parse_regexp("a~s{0,3}b").unwrap();
    let (arena, start, field_matcher) = make_regexp_nfa_arena(root);
    let spaced_key = Arc::as_ptr(&field_matcher) as usize;

    bufs.clear();
    let no_gap = terminated(b"\"ab\"");
    traverse_arena_nfa(&arena, start, &no_gap, &mut bufs);
    assert!(
        bufs.transitions.contains(&spaced_key),
        "a~s{{0,3}}b should match 'ab'"
    );

    bufs.clear();
    // Two spaces between 'a' and 'b'.
    let two_spaces = terminated(b"\"a  b\"");
    traverse_arena_nfa(&arena, start, &two_spaces, &mut bufs);
    assert!(
        bufs.transitions.contains(&spaced_key),
        "a~s{{0,3}}b should match 'a b'"
    );
}
/// Miri-only smoke test: `~d{1}` must accept the single digit `5` and reject
/// the single letter `x`.
#[test]
#[cfg(miri)]
fn test_multi_char_escape_quantifier_miri_minimal() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let root = parse_regexp("~d{1}").unwrap();
    let (arena, start, fm) = make_regexp_nfa_arena(root);
    let matcher_key = Arc::as_ptr(&fm) as usize;
    let mut bufs = NfaBuffers::new();

    let digit_value = [b'"', b'5', b'"', ARENA_VALUE_TERMINATOR];
    bufs.clear();
    traverse_arena_nfa(&arena, start, &digit_value, &mut bufs);
    let digit_matched = bufs.transitions.contains(&matcher_key);
    assert!(digit_matched, "~d{{1}} should match '5'");

    let letter_value = [b'"', b'x', b'"', ARENA_VALUE_TERMINATOR];
    bufs.clear();
    traverse_arena_nfa(&arena, start, &letter_value, &mut bufs);
    let letter_matched = bufs.transitions.contains(&matcher_key);
    assert!(!letter_matched, "~d{{1}} should NOT match 'x'");
}
/// Checks the `cache_key` the parser attaches to an atom: the category name
/// for `~p{..}`, the name prefixed with `-` for `~P{..}`, and no key for
/// `~p{IsBasicLatin}` or a plain bracketed class.
#[test]
fn test_shell_caching_cache_key() {
    let letters = parse_regexp("~p{L}").unwrap();
    assert_eq!(letters.len(), 1);
    assert_eq!(letters[0].len(), 1);
    let letters_atom = &letters[0][0];
    assert_eq!(
        letters_atom.cache_key.as_deref(),
        Some("L"),
        "~p{{L}} should have cache_key 'L'"
    );

    let uppercase = parse_regexp("~p{Lu}").unwrap();
    let uppercase_atom = &uppercase[0][0];
    assert_eq!(
        uppercase_atom.cache_key.as_deref(),
        Some("Lu"),
        "~p{{Lu}} should have cache_key 'Lu'"
    );

    let non_letters = parse_regexp("~P{L}").unwrap();
    let non_letters_atom = &non_letters[0][0];
    assert_eq!(
        non_letters_atom.cache_key.as_deref(),
        Some("-L"),
        "~P{{L}} should have cache_key '-L'"
    );

    let non_digits = parse_regexp("~P{Nd}").unwrap();
    let non_digits_atom = &non_digits[0][0];
    assert_eq!(
        non_digits_atom.cache_key.as_deref(),
        Some("-Nd"),
        "~P{{Nd}} should have cache_key '-Nd'"
    );

    let block = parse_regexp("~p{IsBasicLatin}").unwrap();
    let block_atom = &block[0][0];
    assert_eq!(
        block_atom.cache_key, None,
        "~p{{IsBasicLatin}} should NOT have cache_key"
    );

    let plain_class = parse_regexp("[a-z]").unwrap();
    let plain_atom = &plain_class[0][0];
    assert_eq!(
        plain_atom.cache_key, None,
        "[a-z] should NOT have cache_key"
    );
}
/// Builds two NFAs from the same `~p{L}` pattern and checks that both accept
/// `A` and both reject `5`.
#[test]
#[cfg_attr(miri, ignore)]
fn test_shell_caching_nfa_correctness() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let root1 = parse_regexp("~p{L}").unwrap();
    let root2 = parse_regexp("~p{L}").unwrap();
    let (arena1, start1, fm1) = make_regexp_nfa_arena(root1);
    let (arena2, start2, fm2) = make_regexp_nfa_arena(root2);
    let first_key = Arc::as_ptr(&fm1) as usize;
    let second_key = Arc::as_ptr(&fm2) as usize;
    let mut bufs = NfaBuffers::new();

    // A letter must be accepted by both NFAs.
    let letter = [b'"', b'A', b'"', ARENA_VALUE_TERMINATOR];
    traverse_arena_nfa(&arena1, start1, &letter, &mut bufs);
    let first_accepts_letter = bufs.transitions.contains(&first_key);
    assert!(first_accepts_letter, "First ~p{{L}} should match 'A'");

    bufs.clear();
    traverse_arena_nfa(&arena2, start2, &letter, &mut bufs);
    let second_accepts_letter = bufs.transitions.contains(&second_key);
    assert!(
        second_accepts_letter,
        "Second ~p{{L}} should match 'A' (from cache)"
    );

    // A digit must be rejected by both NFAs.
    bufs.clear();
    let digit = [b'"', b'5', b'"', ARENA_VALUE_TERMINATOR];
    traverse_arena_nfa(&arena1, start1, &digit, &mut bufs);
    let first_accepts_digit = bufs.transitions.contains(&first_key);
    assert!(!first_accepts_digit, "First ~p{{L}} should NOT match '5'");

    bufs.clear();
    traverse_arena_nfa(&arena2, start2, &digit, &mut bufs);
    let second_accepts_digit = bufs.transitions.contains(&second_key);
    assert!(!second_accepts_digit, "Second ~p{{L}} should NOT match '5'");
}
/// After clearing the shell cache, builds NFAs for `~p{L}` and `~p{Nd}` and
/// checks that they stay distinct: `A` matches only the first and `5` only the
/// second.
#[test]
#[cfg_attr(miri, ignore)]
fn test_shell_caching_independent_categories() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    clear_fa_shell_cache();
    let root_l = parse_regexp("~p{L}").unwrap();
    let root_nd = parse_regexp("~p{Nd}").unwrap();
    let (arena_l, start_l, fm_l) = make_regexp_nfa_arena(root_l);
    let (arena_nd, start_nd, fm_nd) = make_regexp_nfa_arena(root_nd);
    let letter_key = Arc::as_ptr(&fm_l) as usize;
    let digit_key = Arc::as_ptr(&fm_nd) as usize;
    let mut bufs = NfaBuffers::new();

    let value_a = [b'"', b'A', b'"', ARENA_VALUE_TERMINATOR];
    traverse_arena_nfa(&arena_l, start_l, &value_a, &mut bufs);
    let letters_accept_a = bufs.transitions.contains(&letter_key);
    assert!(letters_accept_a, "~p{{L}} should match 'A'");

    bufs.clear();
    traverse_arena_nfa(&arena_nd, start_nd, &value_a, &mut bufs);
    let digits_accept_a = bufs.transitions.contains(&digit_key);
    assert!(!digits_accept_a, "~p{{Nd}} should NOT match 'A'");

    bufs.clear();
    let value_5 = [b'"', b'5', b'"', ARENA_VALUE_TERMINATOR];
    traverse_arena_nfa(&arena_l, start_l, &value_5, &mut bufs);
    let letters_accept_5 = bufs.transitions.contains(&letter_key);
    assert!(!letters_accept_5, "~p{{L}} should NOT match '5'");

    bufs.clear();
    traverse_arena_nfa(&arena_nd, start_nd, &value_5, &mut bufs);
    let digits_accept_5 = bufs.transitions.contains(&digit_key);
    assert!(digits_accept_5, "~p{{Nd}} should match '5'");
}
/// After clearing the shell cache, builds NFAs for `~p{L}` and its negation
/// `~P{L}` and checks that they give opposite verdicts on `A` and on `5`.
#[test]
#[cfg_attr(miri, ignore)]
fn test_shell_caching_negated_independent() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    clear_fa_shell_cache();
    let root_pos = parse_regexp("~p{L}").unwrap();
    let root_neg = parse_regexp("~P{L}").unwrap();
    let (arena_pos, start_pos, fm_pos) = make_regexp_nfa_arena(root_pos);
    let (arena_neg, start_neg, fm_neg) = make_regexp_nfa_arena(root_neg);
    let positive_key = Arc::as_ptr(&fm_pos) as usize;
    let negated_key = Arc::as_ptr(&fm_neg) as usize;
    let mut bufs = NfaBuffers::new();

    let value_a = [b'"', b'A', b'"', ARENA_VALUE_TERMINATOR];
    traverse_arena_nfa(&arena_pos, start_pos, &value_a, &mut bufs);
    let positive_accepts_a = bufs.transitions.contains(&positive_key);
    assert!(positive_accepts_a, "~p{{L}} should match 'A'");

    bufs.clear();
    traverse_arena_nfa(&arena_neg, start_neg, &value_a, &mut bufs);
    let negated_accepts_a = bufs.transitions.contains(&negated_key);
    assert!(!negated_accepts_a, "~P{{L}} should NOT match 'A'");

    bufs.clear();
    let value_5 = [b'"', b'5', b'"', ARENA_VALUE_TERMINATOR];
    traverse_arena_nfa(&arena_pos, start_pos, &value_5, &mut bufs);
    let positive_accepts_5 = bufs.transitions.contains(&positive_key);
    assert!(!positive_accepts_5, "~p{{L}} should NOT match '5'");

    bufs.clear();
    traverse_arena_nfa(&arena_neg, start_neg, &value_5, &mut bufs);
    let negated_accepts_5 = bufs.transitions.contains(&negated_key);
    assert!(negated_accepts_5, "~P{{L}} should match '5'");
}
/// Checks which single-letter escapes carry a `cache_key`: `~i`, `~I`, `~c` and
/// `~C` do (negated forms prefixed with `-`), while `~d`, `~w` and `~s` do not.
#[test]
fn test_shell_caching_xml_escape_cache_keys() {
    // (pattern, expected cache key, failure message)
    let keyed = [
        ("~i", "i", "~i should have cache_key 'i'"),
        ("~I", "-i", "~I should have cache_key '-i'"),
        ("~c", "c", "~c should have cache_key 'c'"),
        ("~C", "-c", "~C should have cache_key '-c'"),
    ];
    for (pattern, expected_key, message) in keyed {
        let root = parse_regexp(pattern).unwrap();
        assert_eq!(
            root[0][0].cache_key.as_deref(),
            Some(expected_key),
            "{}",
            message
        );
    }

    // (pattern, failure message)
    let unkeyed = [
        ("~d", "~d should NOT have cache_key"),
        ("~w", "~w should NOT have cache_key"),
        ("~s", "~s should NOT have cache_key"),
    ];
    for (pattern, message) in unkeyed {
        let root = parse_regexp(pattern).unwrap();
        assert_eq!(root[0][0].cache_key, None, "{}", message);
    }
}
/// Checks that every pattern using a `~N` backreference is rejected by the
/// parser with an error message that mentions "backreference".
#[test]
fn test_backreferences_not_supported() {
    for pattern in [
        "(.)~1",
        "([abc])~1",
        "x(.)~1y",
        "(.)~1~1",
        "~1",
        "(.)(.)~2",
        "(abc)~1",
        "(.)+~1",
        "~9",
    ] {
        // Parsing must fail; a successful parse aborts the test.
        let Err(err) = parse_regexp(pattern) else {
            panic!("Backreference pattern '{pattern}' should fail");
        };
        let mentions_backreference = err.message.contains("backreference");
        assert!(
            mentions_backreference,
            "Error for '{}' should mention backreference: {}",
            pattern, err.message
        );
    }
}
/// Removing the middle of a range (`a-d` minus `b-c`) leaves the two end
/// points as separate single-character ranges.
#[test]
fn test_subtract_rune_range_basic() {
    use super::parser::subtract_rune_range;

    let remaining = subtract_rune_range(
        vec![RunePair { lo: 'a', hi: 'd' }],
        vec![RunePair { lo: 'b', hi: 'c' }],
    );

    assert_eq!(remaining.len(), 2);
    let expected = [RunePair { lo: 'a', hi: 'a' }, RunePair { lo: 'd', hi: 'd' }];
    for (index, pair) in expected.into_iter().enumerate() {
        assert_eq!(remaining[index], pair);
    }
}
/// Subtracting a disjoint range (`x-z` from `a-c`) leaves the base untouched.
#[test]
fn test_subtract_rune_range_no_overlap() {
    use super::parser::subtract_rune_range;

    let minuend = vec![RunePair { lo: 'a', hi: 'c' }];
    let subtrahend = vec![RunePair { lo: 'x', hi: 'z' }];
    let remaining = subtract_rune_range(minuend, subtrahend);

    assert_eq!(remaining.len(), 1);
    assert_eq!(remaining[0], RunePair { lo: 'a', hi: 'c' });
}
/// Subtracting a range that fully covers the base (`a-d` from `b-c`) leaves
/// nothing behind.
#[test]
fn test_subtract_rune_range_complete_overlap() {
    use super::parser::subtract_rune_range;

    let remaining = subtract_rune_range(
        vec![RunePair { lo: 'b', hi: 'c' }],
        vec![RunePair { lo: 'a', hi: 'd' }],
    );

    assert!(remaining.is_empty());
}
/// Subtracting the leading part of a range (`a-c` from `a-f`) keeps only the
/// tail `d-f`.
#[test]
fn test_subtract_rune_range_prefix_removal() {
    use super::parser::subtract_rune_range;

    let minuend = vec![RunePair { lo: 'a', hi: 'f' }];
    let subtrahend = vec![RunePair { lo: 'a', hi: 'c' }];
    let remaining = subtract_rune_range(minuend, subtrahend);

    assert_eq!(remaining.len(), 1);
    assert_eq!(remaining[0], RunePair { lo: 'd', hi: 'f' });
}
/// Subtracting the trailing part of a range (`d-f` from `a-f`) keeps only the
/// head `a-c`.
#[test]
fn test_subtract_rune_range_suffix_removal() {
    use super::parser::subtract_rune_range;

    let remaining = subtract_rune_range(
        vec![RunePair { lo: 'a', hi: 'f' }],
        vec![RunePair { lo: 'd', hi: 'f' }],
    );

    assert_eq!(remaining.len(), 1);
    assert_eq!(remaining[0], RunePair { lo: 'a', hi: 'c' });
}
/// Subtracting an empty set of ranges returns the base unchanged.
#[test]
fn test_subtract_rune_range_empty_subtract() {
    use super::parser::subtract_rune_range;

    let minuend = vec![RunePair { lo: 'a', hi: 'z' }];
    let remaining = subtract_rune_range(minuend, Vec::new());

    assert_eq!(remaining.len(), 1);
    assert_eq!(remaining[0], RunePair { lo: 'a', hi: 'z' });
}
/// Punching two single-character holes (`c` and `m`) into `a-z` splits it into
/// three ranges.
#[test]
fn test_subtract_rune_range_multiple_holes() {
    use super::parser::subtract_rune_range;

    let minuend = vec![RunePair { lo: 'a', hi: 'z' }];
    let holes = vec![RunePair { lo: 'c', hi: 'c' }, RunePair { lo: 'm', hi: 'm' }];
    let remaining = subtract_rune_range(minuend, holes);

    assert_eq!(remaining.len(), 3);
    let expected = [
        RunePair { lo: 'a', hi: 'b' },
        RunePair { lo: 'd', hi: 'l' },
        RunePair { lo: 'n', hi: 'z' },
    ];
    for (index, pair) in expected.into_iter().enumerate() {
        assert_eq!(remaining[index], pair);
    }
}
/// Parses the class subtraction `[a-d-[b-c]]` and checks that only `a` and `d`
/// remain.
#[test]
fn test_parse_char_class_subtraction() {
    let parsed = parse_regexp("[a-d-[b-c]]").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);

    let remaining = &parsed[0][0].runes;
    assert_eq!(remaining.len(), 2);
    assert_eq!(remaining[0], RunePair { lo: 'a', hi: 'a' });
    assert_eq!(remaining[1], RunePair { lo: 'd', hi: 'd' });
}
/// Subtracting the negation of a class from itself (`[a-c-[^a-c]]`) removes
/// nothing, so `a-c` survives intact.
#[test]
fn test_parse_char_class_subtraction_negated_inner() {
    let parsed = parse_regexp("[a-c-[^a-c]]").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);

    let remaining = &parsed[0][0].runes;
    assert_eq!(remaining.len(), 1);
    assert_eq!(remaining[0], RunePair { lo: 'a', hi: 'c' });
}
/// Subtracting "everything but `a`" from `a-z` (`[a-z-[^a]]`) leaves just `a`.
#[test]
fn test_parse_char_class_subtraction_single_result() {
    let parsed = parse_regexp("[a-z-[^a]]").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);

    let remaining = &parsed[0][0].runes;
    assert_eq!(remaining.len(), 1);
    assert_eq!(remaining[0], RunePair { lo: 'a', hi: 'a' });
}
/// Subtracting a disjoint class (`[a-b-[0-9]]+`) leaves `a-b` unchanged, and
/// the trailing `+` still applies to the resulting atom.
#[test]
fn test_parse_char_class_subtraction_no_overlap() {
    let parsed = parse_regexp("[a-b-[0-9]]+").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);

    let atom = &parsed[0][0];
    assert_eq!(atom.runes.len(), 1);
    assert_eq!(atom.runes[0], RunePair { lo: 'a', hi: 'b' });
    assert!(atom.is_plus());
}
/// Parses `[^a-z-[aeiou]]` and checks the resulting class by membership: it
/// must contain `a` and must not contain `b`.
#[test]
fn test_parse_char_class_subtraction_with_negated_outer() {
    let parsed = parse_regexp("[^a-z-[aeiou]]").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);

    let runes = &parsed[0][0].runes;
    // True when some rune pair spans `ch`.
    let covers = |ch: char| runes.iter().any(|pair| pair.lo <= ch && ch <= pair.hi);

    let has_a = covers('a');
    assert!(has_a, "negated consonants should include 'a'");
    let has_b = covers('b');
    assert!(!has_b, "negated consonants should not include 'b'");
}
/// Parses the doubly nested subtraction `[0-9-[0-6-[0-3]]]` and checks that the
/// result is `0-3` plus `7-9`.
#[test]
fn test_parse_char_class_subtraction_nested() {
    let parsed = parse_regexp("[0-9-[0-6-[0-3]]]").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);

    let remaining = &parsed[0][0].runes;
    assert_eq!(remaining.len(), 2);
    let expected = [RunePair { lo: '0', hi: '3' }, RunePair { lo: '7', hi: '9' }];
    for (index, pair) in expected.into_iter().enumerate() {
        assert_eq!(remaining[index], pair);
    }
}
/// Parses `[abcd-[d]]+`, where the base class is written as individual
/// characters, and checks that the result is the single range `a-c` with a `+`
/// quantifier.
#[test]
fn test_parse_char_class_subtraction_single_chars_before_bracket() {
    let parsed = parse_regexp("[abcd-[d]]+").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);

    let atom = &parsed[0][0];
    assert_eq!(atom.runes.len(), 1);
    assert_eq!(atom.runes[0], RunePair { lo: 'a', hi: 'c' });
    assert!(atom.is_plus());
}
/// Parses a quantified group whose two alternatives are both class
/// subtractions and checks the tree shape: one branch, one atom, and a subtree
/// with two branches.
#[test]
fn test_parse_char_class_subtraction_alternation() {
    let parsed = parse_regexp("([0-9-[02468]]|[0-9-[13579]])+").unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].len(), 1);

    let group = &parsed[0][0];
    let alternatives = group.subtree.as_ref().unwrap();
    assert_eq!(alternatives.len(), 2);
}
/// Parses `[abcdef-[^bce]]+`: subtracting "everything but b, c, e" leaves
/// exactly `b-c` and `e`.
#[test]
fn test_parse_char_class_subtraction_negated_subtract() {
    let parsed = parse_regexp("[abcdef-[^bce]]+").unwrap();

    let remaining = &parsed[0][0].runes;
    assert_eq!(remaining.len(), 2);
    let expected = [RunePair { lo: 'b', hi: 'c' }, RunePair { lo: 'e', hi: 'e' }];
    for (index, pair) in expected.into_iter().enumerate() {
        assert_eq!(remaining[index], pair);
    }
}
/// Builds a class with ten nested `-[...]` subtractions and checks that the
/// parser rejects it with an error mentioning "nested too deeply".
#[test]
fn test_parse_char_class_subtraction_depth_limit() {
    // "[a-z" + nine "-[a-z" openers + innermost "-[a]" + ten closing brackets.
    let pattern = format!("[a-z{}-[a]{}", "-[a-z".repeat(9), "]".repeat(10));

    let Err(err) = parse_regexp(&pattern) else {
        panic!("deeply nested subtraction should be rejected");
    };
    let mentions_nesting = err.message.contains("nested too deeply");
    assert!(
        mentions_nesting,
        "error should mention nesting: {}",
        err.message
    );
}
/// Checks that `regexp_has_plus_star` reports `false` for patterns without `+`
/// or `*`: a plain class, a literal string, and a bounded range quantifier.
#[test]
fn test_regexp_has_plus_star_returns_false() {
    for pattern in ["[a-z]", "abc", "a{2,3}"] {
        let root = parse_regexp(pattern).unwrap();
        assert!(!regexp_has_plus_star(&root));
    }
}
/// Checks `a{0}b`: the value `b` produces transitions, the value `ab` produces
/// none.
#[test]
fn test_zero_quantifier() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    let root = parse_regexp("a{0}b").unwrap();
    let (arena, start, _fm) = make_regexp_nfa_arena(root);
    let mut bufs = NfaBuffers::with_capacity();

    // Quoted "b" followed by the arena value terminator.
    let mut only_b = b"\"b\"".to_vec();
    only_b.push(ARENA_VALUE_TERMINATOR);
    traverse_arena_nfa(&arena, start, &only_b, &mut bufs);
    assert!(
        !bufs.transitions.is_empty(),
        "{{0}} quantifier should allow skipping 'a'"
    );

    bufs.clear();
    // Quoted "ab" followed by the arena value terminator.
    let mut a_then_b = b"\"ab\"".to_vec();
    a_then_b.push(ARENA_VALUE_TERMINATOR);
    traverse_arena_nfa(&arena, start, &a_then_b, &mut bufs);
    assert!(bufs.transitions.is_empty(), "a{{0}}b should not match 'ab'");
}
/// Builds and runs an NFA for `[a-z]+`, clears the shell cache, then builds the
/// same pattern again and checks that the new NFA still matches.
#[test]
#[cfg_attr(miri, ignore)]
fn test_clear_fa_shell_cache_works() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    // Appends the arena value terminator to an already-quoted byte string.
    let terminated = |quoted: &[u8]| -> Vec<u8> {
        let mut bytes = quoted.to_vec();
        bytes.push(ARENA_VALUE_TERMINATOR);
        bytes
    };

    let root = parse_regexp("[a-z]+").unwrap();
    let (arena, start, _fm) = make_regexp_nfa_arena(root);
    let mut bufs = NfaBuffers::with_capacity();
    let value = terminated(b"\"hello\"");
    traverse_arena_nfa(&arena, start, &value, &mut bufs);
    assert!(!bufs.transitions.is_empty());

    clear_fa_shell_cache();

    let root2 = parse_regexp("[a-z]+").unwrap();
    let (arena2, start2, _fm2) = make_regexp_nfa_arena(root2);
    bufs.clear();
    let value2 = terminated(b"\"world\"");
    traverse_arena_nfa(&arena2, start2, &value2, &mut bufs);
    assert!(
        !bufs.transitions.is_empty(),
        "NFA should work after cache clear"
    );
}
/// Starting from a cleared shell cache, builds `~d+` twice and checks that both
/// NFAs match a digit string and that the second rejects letters.
#[test]
#[cfg_attr(miri, ignore)]
fn test_shell_instantiation_with_epsilons() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    // Appends the arena value terminator to an already-quoted byte string.
    let terminated = |quoted: &[u8]| -> Vec<u8> {
        let mut bytes = quoted.to_vec();
        bytes.push(ARENA_VALUE_TERMINATOR);
        bytes
    };

    clear_fa_shell_cache();
    let root1 = parse_regexp("~d+").unwrap();
    let (arena1, start1, _fm1) = make_regexp_nfa_arena(root1);
    let root2 = parse_regexp("~d+").unwrap();
    let (arena2, start2, _fm2) = make_regexp_nfa_arena(root2);
    let mut bufs = NfaBuffers::with_capacity();

    let digits = terminated(b"\"42\"");
    traverse_arena_nfa(&arena1, start1, &digits, &mut bufs);
    let first_matched = !bufs.transitions.is_empty();
    assert!(first_matched, "first NFA should match digits");

    bufs.clear();
    traverse_arena_nfa(&arena2, start2, &digits, &mut bufs);
    let second_matched = !bufs.transitions.is_empty();
    assert!(second_matched, "cached NFA should match digits");

    bufs.clear();
    let letters = terminated(b"\"abc\"");
    traverse_arena_nfa(&arena2, start2, &letters, &mut bufs);
    let letters_rejected = bufs.transitions.is_empty();
    assert!(letters_rejected, "digit pattern should not match letters");

    clear_fa_shell_cache();
}
/// Builds `~p{L}?x` twice from a cleared shell cache and checks that both NFAs
/// agree on a set of inputs, then inspects the second arena: the state reached
/// after the opening quote must carry epsilon transitions, and every epsilon
/// target must be a valid index into that arena.
#[test]
#[cfg_attr(miri, ignore)]
fn test_shell_cache_instantiate_epsilon_remap() {
    use crate::automaton::arena::{ARENA_VALUE_TERMINATOR, NfaBuffers, traverse_arena_nfa};

    clear_fa_shell_cache();
    let root1 = parse_regexp("~p{L}?x").unwrap();
    let (arena1, start1, fm1) = make_regexp_nfa_arena(root1);
    let root2 = parse_regexp("~p{L}?x").unwrap();
    let (arena2, start2, fm2) = make_regexp_nfa_arena(root2);
    let mut bufs = NfaBuffers::with_capacity();
    let fm1_ptr = Arc::as_ptr(&fm1) as usize;
    let fm2_ptr = Arc::as_ptr(&fm2) as usize;

    // Quotes and terminates `content`, runs it through both NFAs, and asserts
    // that each one reports the expected outcome.
    let mut check_both = |content: &str, expect_match: bool, label: &str| {
        let mut value = Vec::with_capacity(content.len() + 3);
        value.push(b'"');
        value.extend_from_slice(content.as_bytes());
        value.push(b'"');
        value.push(ARENA_VALUE_TERMINATOR);

        bufs.clear();
        traverse_arena_nfa(&arena1, start1, &value, &mut bufs);
        assert_eq!(
            bufs.transitions.contains(&fm1_ptr),
            expect_match,
            "first NFA: {label}"
        );

        bufs.clear();
        traverse_arena_nfa(&arena2, start2, &value, &mut bufs);
        assert_eq!(
            bufs.transitions.contains(&fm2_ptr),
            expect_match,
            "cached NFA: {label}"
        );
    };

    // Inputs that must match: the optional letter skipped, then letters whose
    // UTF-8 encodings are 1, 2, 3 and 4 bytes long.
    let accepted = [
        ("x", "'x' matches via epsilon skip"),
        ("Ax", "'Ax' matches via ~p{L} then 'x'"),
        ("éx", "'éx' matches with 2-byte UTF-8 letter"),
        ("中x", "'中x' matches with 3-byte UTF-8 letter"),
        ("\u{20000}x", "U+20000 x matches with 4-byte UTF-8 letter"),
    ];
    for (content, label) in accepted {
        check_both(content, true, label);
    }
    check_both("5x", false, "'5x' should not match (digit is not ~p{L})");

    // Step past the opening quote in the second arena and examine that state.
    let root_state = &arena2[start2];
    let inner = root_state.table.dstep(b'"');
    assert!(
        !inner.is_none(),
        "should have transition past opening quote"
    );
    let inner_state = &arena2[inner];
    assert!(
        !inner_state.table.epsilons.is_empty(),
        "the ~p{{L}}? atom root should carry epsilon transitions even from the shell cache"
    );
    let state_count = arena2.len();
    for &eps in &inner_state.table.epsilons {
        assert!(!eps.is_none(), "epsilon target should not be NONE");
        assert!(
            eps.index() < state_count,
            "epsilon target {} out of range (arena has {} states)",
            eps.index(),
            arena2.len()
        );
    }

    clear_fa_shell_cache();
}
}