use crate::regex::{
Error, Haystack,
ast::{Assertion, parse},
compile::{CompiledOps, Inst, Op, Prog, compile_ast, optimise},
matches::{Match, MatchIter},
};
use aho_corasick::AhoCorasick;
use std::{
collections::HashSet,
fmt,
mem::swap,
sync::{Arc, Mutex},
};
/// Number of save slots held in each per-thread submatch array.
///
/// Save operations whose slot index is `>= N_SLOTS` are silently not recorded
/// (see `RegexInner::sm_update`).
// NOTE(review): presumably two slots (start, end) are used per capture group — confirm
// against the compiler.
pub(super) const N_SLOTS: usize = 30;
/// A compiled regular expression.
///
/// `re` holds the source pattern text. `inner` holds the compiled program together with
/// the mutable VM state, wrapped in a `Mutex` so the matching methods can take `&self`.
pub struct Regex {
re: Arc<str>,
inner: Mutex<RegexInner>,
}
impl Clone for Regex {
fn clone(&self) -> Self {
let inner = self.inner.lock().unwrap().clone();
Self {
re: self.re.clone(),
inner: Mutex::new(inner),
}
}
}
impl PartialEq for Regex {
    /// Two regexes compare equal when their source pattern text is equal.
    ///
    /// The compiled program and VM state are not inspected.
    fn eq(&self, other: &Self) -> bool {
        *self.re == *other.re
    }
}
// Marker impl: equality is defined by `PartialEq`, which compares the pattern text only.
impl Eq for Regex {}
impl fmt::Debug for Regex {
    /// Render as a `Regex(..)` tuple containing only the source pattern text.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut tuple = f.debug_tuple("Regex");
        tuple.field(&self.re);
        tuple.finish()
    }
}
impl fmt::Display for Regex {
    /// Write the source pattern text verbatim.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `write_str` emits the text as-is, without applying any width / padding flags.
        f.write_str(&self.re)
    }
}
impl Regex {
pub fn compile(re: impl AsRef<str>) -> Result<Self, Error> {
let mut ast = parse(re.as_ref())?;
ast.optimise();
let lits = ast.leading_literals();
let CompiledOps {
ops,
n_submatches,
submatch_names,
} = compile_ast(ast, false);
Ok(Self::new(
re.as_ref(),
ops,
n_submatches,
submatch_names,
lits,
))
}
fn new(
re: &str,
ops: Vec<Op>,
n_submatches: usize,
submatch_names: Vec<String>,
leading_lits: HashSet<String>,
) -> Self {
let prog: Prog = optimise(ops)
.into_iter()
.map(|op| Inst { op, generation: 0 })
.collect();
let clist = vec![Thread::default(); prog.len()].into_boxed_slice();
let nlist = vec![Thread::default(); prog.len()].into_boxed_slice();
let sms = vec![SubMatches::default(); prog.len()].into_boxed_slice();
let free_sms = (1..prog.len()).collect();
let fast_start = if leading_lits.is_empty() {
None
} else {
Some(Box::new(
AhoCorasick::new(leading_lits).expect("using auto builder so no errors possible"),
))
};
Self {
re: Arc::from(re),
inner: Mutex::new(RegexInner {
prog,
fast_start,
n_submatches,
submatch_names: Arc::from(submatch_names.into_boxed_slice()),
clist,
nlist,
generation: 0,
p: 0,
prev: None,
next: None,
sms,
free_sms,
track_submatches: true,
}),
}
}
pub fn matches<H>(&self, haystack: &H) -> bool
where
H: Haystack,
{
let mut inner = self.inner.lock().unwrap();
inner.track_submatches = false;
inner.match_from_byte_offset(haystack, 0).is_some()
}
pub fn matches_from<H>(&self, haystack: &H, offset: usize) -> bool
where
H: Haystack,
{
let mut inner = self.inner.lock().unwrap();
inner.track_submatches = false;
inner.match_from_byte_offset(haystack, offset).is_some()
}
pub fn matches_between<H>(&self, haystack: &H, from: usize, to: usize) -> bool
where
H: Haystack,
{
let mut inner = self.inner.lock().unwrap();
inner.track_submatches = false;
inner
.match_between_byte_offsets(haystack, from, to)
.is_some()
}
pub fn find<H>(&self, haystack: &H) -> Option<Match>
where
H: Haystack,
{
let mut inner = self.inner.lock().unwrap();
inner.track_submatches = true;
inner.match_from_byte_offset(haystack, 0)
}
pub fn find_from<H>(&self, haystack: &H, offset: usize) -> Option<Match>
where
H: Haystack,
{
let mut inner = self.inner.lock().unwrap();
inner.track_submatches = true;
inner.match_from_byte_offset(haystack, offset)
}
pub fn find_between<H>(&self, haystack: &H, from: usize, to: usize) -> Option<Match>
where
H: Haystack,
{
let mut inner = self.inner.lock().unwrap();
inner.track_submatches = true;
inner.match_between_byte_offsets(haystack, from, to)
}
pub fn find_iter<'a, H>(&'a mut self, haystack: &'a H) -> MatchIter<'a, H>
where
H: Haystack,
{
self.inner.lock().unwrap().track_submatches = true;
MatchIter {
haystack,
r: self,
from: 0,
}
}
}
/// Newtype around [`Regex`] used for searching a haystack in reverse.
///
/// The wrapped regex is compiled by passing `true` as the second argument to `compile_ast`
/// and is given no leading literals, so it never has a literal prefilter.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RevRegex(Regex);
impl RevRegex {
    /// Parse and compile `re` for use with [`RevRegex::find_rev_from`].
    ///
    /// Unlike `Regex::compile`, `compile_ast` is called with `true` as its second argument
    /// and no leading literals are collected.
    ///
    /// # Errors
    /// Returns the [`Error`] produced by `parse` if `re` is not a valid expression.
    pub fn compile(re: impl AsRef<str>) -> Result<Self, Error> {
        let pattern = re.as_ref();
        let mut ast = parse(pattern)?;
        ast.optimise();
        let compiled = compile_ast(ast, true);

        let regex = Regex::new(
            pattern,
            compiled.ops,
            compiled.n_submatches,
            compiled.submatch_names,
            HashSet::new(),
        );

        Ok(Self(regex))
    }

    /// Run the VM over `haystack.rev_iter_between(0, offset)` with submatch tracking on,
    /// returning the [`Match`] if one is found.
    ///
    /// # Panics
    /// Panics if the internal mutex has been poisoned.
    pub fn find_rev_from<H>(&self, haystack: &H, offset: usize) -> Option<Match>
    where
        H: Haystack,
    {
        let mut vm = self.0.inner.lock().unwrap();
        vm.track_submatches = true;

        let mut input = haystack.rev_iter_between(0, offset);
        vm.run_vm(&mut input, offset)
    }
}
/// The compiled program plus all of the mutable state used while running the VM.
#[derive(Clone)]
struct RegexInner {
// The instructions to execute; each carries the generation in which it was last queued.
prog: Prog,
// Literal prefilter built from the pattern's leading literals (absent when there are none).
fast_start: Option<Box<AhoCorasick>>,
// Copied into every `Match` that is returned.
n_submatches: usize,
// Cloned (cheaply, via `Arc`) into every `Match` that is returned.
submatch_names: Arc<[String]>,
// Threads being stepped for the current character.
clist: Box<[Thread]>,
// Threads queued by `add_thread`; swapped with `clist` after each step.
nlist: Box<[Thread]>,
// Reference counted submatch slot arrays, indexed by `Thread::sm`.
sms: Box<[SubMatches]>,
// Indices into `sms` that are currently unused.
free_sms: Vec<usize>,
// When false the VM returns a synthetic match as soon as one is seen and skips
// submatch bookkeeping.
track_submatches: bool,
// Bumped for each list that is built; compared against `Inst::generation` so that a
// given instruction is queued at most once per list.
generation: usize,
// Number of threads written to `nlist` so far.
p: usize,
// The character before the current one (None before the first character is stepped).
prev: Option<char>,
// The character after the current one (None when the input is exhausted).
next: Option<char>,
}
impl RegexInner {
    /// Use the literal prefilter to move `byte_offset` forward to the start of the first
    /// leading literal found at or after it.
    ///
    /// Returns `None` when there is no prefilter or when `haystack.substr_from(byte_offset)`
    /// yields `None`. When the prefilter finds nothing the offset is returned unchanged.
    ///
    /// # Panics
    /// Panics if `haystack` is not contiguous.
    fn fast_update_byte_offset<H>(&self, haystack: &H, mut byte_offset: usize) -> Option<usize>
    where
        H: Haystack,
    {
        assert!(
            haystack.is_contiguous(),
            "fast_update_byte_offset called for discontiguous haystack"
        );

        let ac = self.fast_start.as_ref()?;
        if let Some(m) = ac.find(haystack.substr_from(byte_offset)?.as_ref()) {
            // `m.start()` is relative to the substring, so add rather than assign.
            byte_offset += m.start();
        }

        Some(byte_offset)
    }

    /// Run the VM over `haystack` starting at byte `offset`.
    ///
    /// For contiguous haystacks the literal prefilter is consulted first to skip ahead.
    /// Returns `None` if `haystack.iter_from` yields `None` or if no match is found.
    fn match_from_byte_offset<H>(&mut self, haystack: &H, mut offset: usize) -> Option<Match>
    where
        H: Haystack,
    {
        if haystack.is_contiguous() {
            offset = self
                .fast_update_byte_offset(haystack, offset)
                .unwrap_or(offset);
        }

        self.run_vm(&mut haystack.iter_from(offset)?, offset)
    }

    /// Run the VM over `haystack` between the byte offsets `from` and `to`.
    ///
    /// For contiguous haystacks the literal prefilter is consulted first: if it moves the
    /// start beyond `to` there can be no match and `None` is returned without running the VM.
    fn match_between_byte_offsets<H>(
        &mut self,
        haystack: &H,
        mut from: usize,
        to: usize,
    ) -> Option<Match>
    where
        H: Haystack,
    {
        if haystack.is_contiguous()
            && let Some(new_from) = self.fast_update_byte_offset(haystack, from)
        {
            if new_from > to {
                return None;
            }
            from = new_from;
        }

        self.run_vm(&mut haystack.iter_between(from, to), from)
    }

    /// Execute the program against `input`, returning the match that was found (if any).
    ///
    /// `input` yields `(position, char)` pairs and `sp` is the position the search starts
    /// from. When `track_submatches` is false the first match seen while stepping is
    /// reported as `Match::synthetic(0, 0)` without recording any positions.
    fn run_vm<I>(&mut self, input: &mut I, mut sp: usize) -> Option<Match>
    where
        I: Iterator<Item = (usize, char)>,
    {
        let mut sub_matches = [0; N_SLOTS];

        // Reset the submatch storage: entry 0 belongs to the initial thread and every
        // other entry is free. The existing allocation for the free list is reused.
        self.free_sms.clear();
        self.free_sms.extend(1..self.prog.len());
        self.sms[0] = SubMatches {
            refs: 1,
            inner: [0; N_SLOTS],
        };

        // A previous run that returned early (the `!track_submatches` path below) leaves
        // `p` counting threads it had already queued in `nlist`. Without this reset those
        // stale threads would be stepped as part of this run and could report a match
        // that belongs to the previous haystack.
        self.p = 0;

        // Bump the generation so that nothing queued here collides with marks left on the
        // program by a previous run.
        self.generation += 1;
        self.add_thread(Thread::default(), sp, '\0', true);
        swap(&mut self.clist, &mut self.nlist);
        self.generation += 1;

        // `n` is the number of live threads in `clist`; `p` restarts from 0 so that it
        // tracks the length of the list being built for the next step.
        let mut n = self.p;
        self.p = 0;
        let mut matched = false;

        let mut it = input.peekable();
        self.prev = None;
        self.next = None;

        while let Some((i, ch)) = it.next() {
            sp = i;
            self.next = it.peek().map(|(_, c)| *c);

            for i in 0..n {
                if let Some(sm) = self.step_thread(i, sp, ch) {
                    if !self.track_submatches {
                        return Some(Match::synthetic(0, 0));
                    }

                    matched = true;
                    sub_matches = self.sms[sm].inner;

                    // This thread and every thread after it in the list are dropped, so
                    // release the references they hold on their submatch storage.
                    for j in i..n {
                        self.sm_dec_ref(self.clist[j].sm);
                    }

                    break;
                }
            }

            swap(&mut self.clist, &mut self.nlist);
            self.prev = Some(ch);
            self.generation += 1;
            n = self.p;

            // No threads were queued for the next character: nothing left to run.
            if self.p == 0 {
                break;
            }

            self.p = 0;
        }

        self.prev = None;
        self.next = None;

        // Threads still alive once the input is exhausted may be sitting on a `Match`
        // instruction. Take the first one whose slot 1 is at least as large as that of
        // the match found so far.
        // NOTE(review): slot 1 is presumably the end position of the overall match — confirm.
        for t in self.clist.iter().take(n) {
            if self.prog[t.pc].op == Op::Match && self.sms[t.sm].inner[1] >= sub_matches[1] {
                matched = true;
                sub_matches = self.sms[t.sm].inner;
                break;
            }
        }

        if !matched {
            return None;
        }

        Some(Match {
            n_submatches: self.n_submatches,
            sub_matches,
            submatch_names: self.submatch_names.clone(),
        })
    }

    /// Step thread `i` of `clist` against the character `ch` found at position `sp`.
    ///
    /// Returns `Some(sm)` (the index of the thread's submatch storage) when the thread is
    /// sitting on a `Match` instruction and `None` otherwise.
    #[inline]
    fn step_thread(&mut self, i: usize, sp: usize, ch: char) -> Option<usize> {
        let t = &self.clist[i];
        match &self.prog[t.pc].op {
            // The comparison holds: the thread advances only if any pending assertion
            // holds as well, otherwise it dies and releases its submatch storage.
            Op::Comp(comp) if comp.matches(ch) => match t.assertion {
                Some(a) if !a.holds_for(self.prev, ch, self.next) => {
                    self.sm_dec_ref(t.sm);
                    return None;
                }
                _ => self.add_thread(thread(t.pc + 1, t.sm), sp, ch, false),
            },

            Op::Match => return Some(t.sm),

            // Every other case (including a comparison that does not match) kills the thread.
            _ => self.sm_dec_ref(t.sm),
        }

        None
    }

    /// Queue `t` on `nlist`, following jumps, splits, assertions and saves until an
    /// instruction that has to be stepped against a character is reached.
    ///
    /// An instruction already queued in the current generation is not queued again: the
    /// incoming thread's submatch reference is released instead.
    #[inline]
    fn add_thread(&mut self, t: Thread, sp: usize, ch: char, initial: bool) {
        if self.prog[t.pc].generation == self.generation {
            self.sm_dec_ref(t.sm);
            return; // already on the list that is currently being built
        }
        self.prog[t.pc].generation = self.generation;

        // Chained `if let` copies the operands out of `self.prog` so that `self` is free
        // to be borrowed mutably for the recursive calls.
        if let Op::Jump(l1) = self.prog[t.pc].op {
            let th = match t.assertion {
                Some(a) => assert_thread(l1, t.sm, a),
                None => thread(l1, t.sm),
            };
            self.add_thread(th, sp, ch, initial);
        } else if let Op::Split(l1, l2) = self.prog[t.pc].op {
            // Both branches share the same submatch storage, so it gains a reference.
            self.sms[t.sm].refs += 1;
            let (t1, t2) = match t.assertion {
                Some(a) => (assert_thread(l1, t.sm, a), assert_thread(l2, t.sm, a)),
                None => (thread(l1, t.sm), thread(l2, t.sm)),
            };
            self.add_thread(t1, sp, ch, initial);
            self.add_thread(t2, sp, ch, initial);
        } else if let Op::Assertion(a) = self.prog[t.pc].op {
            // The assertion is carried on the thread and checked later.
            self.add_thread(assert_thread(t.pc + 1, t.sm, a), sp, ch, initial);
        } else if let Op::Save(s) = self.prog[t.pc].op {
            self.handle_save(t, s, sp, ch, initial, false)
        } else if let Op::RSave(s) = self.prog[t.pc].op {
            self.handle_save(t, s, sp, ch, initial, true)
        } else {
            self.nlist[self.p] = t;
            self.p += 1;
        }
    }

    /// Record a position in save slot `s` for thread `t` and continue with the next
    /// instruction.
    ///
    /// For forward saves made after the initial thread setup the recorded position is
    /// `sp + ch.len_utf8()`; for initial or reverse (`rev`) saves it is `sp`.
    // NOTE(review): presumably even slots mark the start of a group and odd slots the end
    // (swapped when `rev` is set) — confirm against the compiler.
    #[inline]
    fn handle_save(&mut self, t: Thread, s: usize, sp: usize, ch: char, initial: bool, rev: bool) {
        let inc_bytes = if !initial && !rev { ch.len_utf8() } else { 0 };

        if (!rev && s.is_multiple_of(2)) || (rev && !s.is_multiple_of(2)) {
            // Record the position and keep any pending assertion attached to the thread.
            let sm = self.sm_update(t.sm, s, sp + inc_bytes);
            let th = match t.assertion {
                Some(a) => assert_thread(t.pc + 1, sm, a),
                None => thread(t.pc + 1, sm),
            };
            self.add_thread(th, sp, ch, initial);
        } else {
            // A pending assertion is resolved here: if it fails the thread dies, otherwise
            // the position is recorded and the thread continues without the assertion.
            match t.assertion {
                Some(a) if !a.holds_for(self.prev, ch, self.next) => self.sm_dec_ref(t.sm),
                _ => {
                    let sm = self.sm_update(t.sm, s, sp + inc_bytes);
                    self.add_thread(thread(t.pc + 1, sm), sp, ch, initial);
                }
            }
        }
    }

    /// Release one reference to submatch storage `i`, returning it to the free list when
    /// no references remain. Does nothing when submatches are not being tracked.
    #[inline]
    fn sm_dec_ref(&mut self, i: usize) {
        if !self.track_submatches {
            return;
        }

        self.sms[i].refs -= 1;
        if self.sms[i].refs == 0 {
            self.free_sms.push(i);
        }
    }

    /// Write `sp` into slot `s` of submatch storage `i`, returning the index of the
    /// storage that now holds the updated slots.
    ///
    /// Storage with a single reference is updated in place. Shared storage is copied into
    /// an entry taken from the free list first so that other threads are unaffected.
    /// Nothing is written when submatches are not being tracked or when `s >= N_SLOTS`.
    ///
    /// # Panics
    /// Panics if a copy is required and the free list is empty.
    #[inline]
    fn sm_update(&mut self, i: usize, s: usize, sp: usize) -> usize {
        if !self.track_submatches || s >= N_SLOTS {
            return i;
        }

        let i = if self.sms[i].refs == 1 {
            i
        } else {
            self.sm_dec_ref(i);
            let j = self.free_sms.swap_remove(0);
            self.sms[j].inner = self.sms[i].inner;
            self.sms[j].refs = 1;
            j
        };

        self.sms[i].inner[s] = sp;

        i
    }
}
/// Reference counted storage for the save slots recorded by one or more threads.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
struct SubMatches {
// Number of threads currently referring to this entry: incremented when a thread
// splits and decremented by `sm_dec_ref`.
refs: usize,
// The recorded position for each save slot.
inner: [usize; N_SLOTS],
}
/// A single thread of execution within the VM.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
struct Thread {
// Index of the instruction in the program that this thread is sitting on.
pc: usize,
// An assertion that was encountered but has not been checked yet.
assertion: Option<Assertion>,
// Index of this thread's entry in the submatch storage.
sm: usize,
}
/// Build a [`Thread`] at instruction `pc` using submatch storage `sm` with no pending
/// assertion.
#[inline]
fn thread(pc: usize, sm: usize) -> Thread {
    Thread {
        assertion: None,
        pc,
        sm,
    }
}
/// Build a [`Thread`] at instruction `pc` using submatch storage `sm` that carries the
/// pending assertion `a`.
#[inline]
fn assert_thread(pc: usize, sm: usize, a: Assertion) -> Thread {
    let assertion = Some(a);

    Thread { assertion, pc, sm }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::buffer::Buffer;
use simple_test_case::test_case;
#[test_case("foo", "foo", Some("foo"); "literal full string")]
#[test_case("ba*", "baaaaa", Some("baaaaa"); "zero or more present")]
#[test_case("ba*", "b", Some("b"); "zero or more not present")]
#[test_case("ba+", "baaaaa", Some("baaaaa"); "one or more present")]
#[test_case("ba+", "b", None; "one or more not present")]
#[test_case("b?a", "ba", Some("ba"); "optional present")]
#[test_case("b?a", "a", Some("a"); "optional not present")]
#[test_case("a(bb)+a", "abbbba", Some("abbbba"); "article example matching")]
#[test_case("a(bb)+a", "abbba", None; "article example non matching")]
#[test_case(".*b", "123b", Some("123b"); "dot star prefix")]
#[test_case("1.*", "123b", Some("123b"); "dot star suffix")]
#[test_case("1.*b", "123b", Some("123b"); "dot star inner")]
#[test_case("(c|C)ase matters", "case matters", Some("case matters"); "alternation first")]
#[test_case("(c|C)ase matters", "Case matters", Some("Case matters"); "alternation second")]
#[test_case("(aa|bbb|c|dd)", "c", Some("c"); "chained alternation")]
#[test_case("this@*works", "this contains\nbut still works", Some("this contains\nbut still works"); "true any")]
#[test_case(r"literal\?", "literal?", Some("literal?"); "escape special char")]
#[test_case(r"literal\t", "literal\t", Some("literal\t"); "escape sequence")]
#[test_case("[abc] happy cow", "a happy cow", Some("a happy cow"); "character class")]
#[test_case("[^abc] happy cow", "a happy cow", None; "negated character class")]
#[test_case("[a-zA-Z]*", "camelCaseFtw", Some("camelCaseFtw"); "char class ranges matching")]
#[test_case("[a-zA-Z]*1", "kebab-case-not-so-much", None; "char class ranges non matching")]
#[test_case("[a-zA-Z ]*", "this should work", Some("this should work"); "char class mixed")]
#[test_case("[\\]5]*", "5]]5555]]", Some("5]]5555]]"); "char class escaped bracket")]
#[test_case("[0-9]+", "0123", Some("0123"); "digit range")]
#[test_case("[0-9]+", "0", Some("0"); "digit range range start only")]
#[test_case("25[0-5]", "255", Some("255"); "ipv4 element one")]
#[test_case("2[0-4][0-9]", "231", Some("231"); "ipv4 element two")]
#[test_case("1?[0-9]?[0-9]", "155", Some("155"); "ipv4 element three three digit")]
#[test_case("1?[0-9]?[0-9]", "72", Some("72"); "ipv4 element three two digit")]
#[test_case("1?[0-9]?[0-9]", "8", Some("8"); "ipv4 element three one digit")]
#[test_case("1?[0-9]?[0-9]", "0", Some("0"); "ipv4 element three zero")]
#[test_case("(25[0-5]|2[0-4][0-9])", "255", Some("255"); "ipv4 elements one and two matching one")]
#[test_case("(25[0-5]|2[0-4][0-9])", "219", Some("219"); "ipv4 elements one and two matching two")]
#[test_case("(25[0-5]|2[0-4][0-9])", "42", None; "ipv4 elements one and two not matching")]
#[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 elements two and three matching two")]
#[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 elements two and three matching three")]
#[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "251", Some("251"); "ipv4 all elements matching one")]
#[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 all elements matching two")]
#[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 all elements matching three")]
#[test_case(
r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
"127.0.0.1 ",
Some("127.0.0.1");
"ipv4 full"
)]
#[test_case("^foo", "foo at the start", Some("foo"); "SOL holding")]
#[test_case("^foo", "bar\nfoo at the start", Some("foo"); "SOL holding after newline")]
#[test_case("^foo", "we have foo but not at the start", None; "SOL not holding")]
#[test_case("foo$", "a line that ends with foo", Some("foo"); "BOL holding")]
#[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
#[test_case("foo$", "a line with foo in the middle", None; "BOL not holding")]
#[test_case("foo", "│foo", Some("foo"); "after a multibyte char")]
#[test_case("a{3}", "aaa", Some("aaa"); "counted repetition")]
#[test_case("a{3}", "aa", None; "counted repetition non matching")]
#[test_case("a{3,}", "aaaaaa", Some("aaaaaa"); "counted repetition at least")]
#[test_case("a{3,}", "aa", None; "counted repetition at least non matching")]
#[test_case("a{3,5}", "aaa", Some("aaa"); "counted repetition between lower")]
#[test_case("a{3,5}", "aaaaa", Some("aaaaa"); "counted repetition between upper")]
#[test_case("a{3,5}", "aaaa", Some("aaaa"); "counted repetition in range")]
#[test_case("a{3,5}", "aa", None; "counted repetition less")]
#[test_case("^a{3,5}$", "aaaaaa", None; "counted repetition more")]
#[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
#[test_case("\\bfor\\b", "forward", None; "word boundary for match at start of word")]
#[test_case("\\bfor\\b", "for ward", Some("for"); "word boundary for match not inside word")]
#[test_case("\\bfor\\b", "bob for", Some("for"); "word boundary match not at BOF")]
#[test_case("\\bfor\\b", "bob for bob", Some("for"); "word boundary match not at BOF or EOF")]
#[test_case("\\bin\\b", "min", None; "word boundary for match at end of word")]
#[test_case("\\b(in)\\b", "min", None; "word boundary for sub expression match at end of word")]
#[test_case("\\b(in|for)\\b", "min", None; "word boundary for alt match at end of word")]
#[test_case("\\b(in|for)\\b", "bob for", Some("for"); "word boundary for alt match not at BOF")]
#[test_case("[a-zA-Z0-9_\\-./@]+\\.jpe?g", "glenda_space_medium.jpg", Some("glenda_space_medium.jpg"); "complex group")]
#[test_case("[a-zA-Z¡-0-9_\\-./@]+", "foo-bar_99.pdf", Some("foo-bar_99.pdf"); "multibyte group")]
// Each case compiles `re`, searches `s` from the start and compares the text of the match
// (if there was one) against `expected`.
#[test]
fn find_works(re: &str, s: &str, expected: Option<&str>) {
    let regex = Regex::compile(re).unwrap();
    let found = regex.find(&s).map(|m| m.match_text(&s));

    assert_eq!(found.as_deref(), expected);
}
#[test_case("foo", "foo", Some("foo"); "literal full string")]
#[test_case("ba*", " baaaaa foo", Some("baaaaa"); "zero or more present")]
#[test_case("ba*", "b foo", Some("b"); "zero or more not present")]
#[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
#[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
#[test_case(
    r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
    "127.0.0.1 ",
    Some("127.0.0.1");
    "ipv4 full"
)]
#[test_case(
    "his",
    "this is a line\nand another\n- [ ] something to do\n",
    Some("his");
    "multiline input"
)]
// Each case loads `s` into a buffer and searches backwards from the end of the input.
#[test]
fn find_rev_works(re: &str, s: &str, expected: Option<&str>) {
    let rev = RevRegex::compile(re).unwrap();
    let buf = Buffer::new_unnamed(0, s, Default::default());
    let found = rev.find_rev_from(&buf, s.len()).map(|m| m.match_text(&buf));

    assert_eq!(found.as_deref(), expected);
}
#[test_case("[0-9]+", " 42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers")]
#[test_case("[0-9]+", " 42 3 127 9991", &["42", "3", "127", "9991"]; "integers to EOF")]
#[test_case("[0-9]+", "42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers from BOF")]
#[test_case("[0-9]+", "42 3 127 9991", &["42", "3", "127", "9991"]; "integers full input")]
#[test_case("foo|bar|baz", "baz bar foo bar", &["baz", "bar", "foo", "bar"]; "alts spaced in s")]
#[test_case("foo|bar|baz", "bazbarfoobar", &["baz", "bar", "foo", "bar"]; "alts back to back in s")]
#[test_case("(foo|bar|baz)", "foo foobar barfoo baz", &["foo", "foo", "bar", "bar", "foo", "baz"]; "alts in parens")]
#[test_case("\\b(foo|bar|baz)\\b", "foo foobar barfoo baz", &["foo", "baz"]; "alts with word boundaries")]
// Each case collects the text of every match yielded by `find_iter` and compares the full
// sequence against `expected`.
#[test]
fn find_iter_works(re: &str, s: &str, expected: &[&str]) {
    let mut regex = Regex::compile(re).unwrap();
    let mut found: Vec<String> = Vec::new();
    for m in regex.find_iter(&s) {
        found.push(m.match_text(&s).into_owned());
    }

    assert_eq!(&found, expected);
}
// `.*` stops at a newline: with a leading newline the match is empty, and once that
// newline is sliced off the match is the first line.
#[test]
fn dot_star_works() {
    let regex = Regex::compile(".*").unwrap();
    let s = "\nthis is\na multiline\nfile";

    let at_newline = regex.find(&s).unwrap();
    assert_eq!(at_newline.match_text(&s), "");

    let tail = &s[1..];
    let first_line = regex.find(&tail).unwrap();
    assert_eq!(first_line.match_text(&tail), "this is");
}
// The full match and each of the three numbered submatches can be extracted from a match
// found in the middle of the input.
#[test]
fn match_extraction_works() {
    let s = "this should work 123-456-789 other stuff";
    let regex = Regex::compile("([0-9]+)-([0-9]+)-([0-9]+)").unwrap();
    let m = regex.find(&s).unwrap();

    assert_eq!(m.match_text(&s), "123-456-789");
    for (n, expected) in [(1, "123"), (2, "456"), (3, "789")] {
        assert_eq!(m.submatch_text(n, &s).as_deref(), Some(expected));
    }
}
#[test_case("(?<xy>X|Y)", "xy", "X"; "named match on its own")]
#[test_case("(?<xy>X|Y)(a|b)", "xy", "X"; "named match before unnamed")]
#[test_case("(e| )(?<xy>X|Y)", "xy", "X"; "named match after unnamed")]
#[test_case("(e| )(?<xy>X|Y)(a|b)", "xy", "X"; "named match inbetween unnamed")]
// Each case checks that the only named submatch reported is `name` and that its text is
// `expected`, regardless of where unnamed groups sit around it.
#[test]
fn named_submatch_works(re: &str, name: &str, expected: &str) {
    let haystack = "text before Xanadu";
    let m = Regex::compile(re).unwrap().find(&haystack).unwrap();

    assert_eq!(m.named_matches(), vec![name]);
    assert_eq!(
        m.submatch_text_by_name(name, &haystack).as_deref(),
        Some(expected)
    );
}
// `.*` against multiline input matches the first line only.
#[test]
fn multiline_input_match_dot_star_works() {
    let s = "this is\na multiline\nfile";
    let found = Regex::compile(".*").unwrap().find(&s).unwrap();

    assert_eq!(found.match_text(&s), "this is");
}
// Starting a `.*` search on a newline gives an empty match; starting just after it gives
// the whole of the following line.
#[test]
fn multiline_input_find_from_dot_star_works_with_non_zero_initial_sp() {
    let regex = Regex::compile(".*").unwrap();
    let s = "this is\na multiline\nfile";

    // Sanity check: offset 7 is the first newline.
    assert_eq!(s.chars().skip(7).collect::<String>(), "\na multiline\nfile");

    let on_newline = regex.find_from(&s, 7).unwrap();
    assert_eq!(on_newline.match_text(&s), "");

    let after_newline = regex.find_from(&s, 8).unwrap();
    assert_eq!(after_newline.match_text(&s), "a multiline");
}
// Iterating `.*` over multiline input alternates between each line and an empty match,
// then ends after the final line.
#[test]
fn multiline_input_find_iter_dot_star_works() {
    let mut regex = Regex::compile(".*").unwrap();
    let s = "this is\na multiline\nfile";
    let mut it = regex.find_iter(&s);

    for expected in ["this is", "", "a multiline", "", "file"] {
        let m = it.next().unwrap();
        assert_eq!(m.match_text(&s), expected);
    }

    assert_eq!(it.next(), None);
}
// Submatch and full-match extraction must still line up when a multibyte character (`│`)
// appears in the haystack before the match.
#[test]
fn match_extraction_works_when_multibyte_characters_are_present() {
let s: &str = "const VLINE: char = '│';
impl Editor {
";
let re = r"impl (\w+) \{";
let r = Regex::compile(re).unwrap();
let m = r.find(&s).unwrap();
assert_eq!(m.submatch_text(1, &s).as_deref(), Some("Editor"));
assert_eq!(m.match_text(&s), "impl Editor {");
}
// 100 optional `a`s followed by 100 required `a`s, matched against exactly 100 `a`s: the
// search must still complete and find a match.
#[test]
fn pathological_match_doesnt_explode() {
    let s = "a".repeat(100);
    let re = format!("{}{}", "a?".repeat(100), s);
    let regex = Regex::compile(&re).unwrap();

    assert!(regex.find(&s.as_str()).is_some());
}
// Re-using the same compiled regex for alternating hits and misses must keep giving the
// same answers.
#[test]
fn repeated_match_works() {
    let regex = Regex::compile("a(bb)+a").unwrap();

    (0..10).for_each(|_| {
        assert!(regex.find(&"abbbba").is_some());
        assert!(regex.find(&"foo").is_none());
    });
}
// Both inputs differ only in their first letter and each must be matched in full.
// NOTE(review): the character following `¡-` in the pattern looks like it may have been
// mis-encoded at some point — confirm against the intended upper bound of the range.
#[test_case("Tracing_Summit_2025_Perfet_RYQoyoF.pdf"; "tracing")]
#[test_case("Bracing_Summit_2025_Perfet_RYQoyoF.pdf"; "bracing")]
#[test]
fn leading_literal_truncation_doesnt_affect_matching(s: &str) {
let re = "([a-zA-Z¡-�0-9_\\-./@]+).[Pp][Dd][Ff]";
let r = Regex::compile(re).unwrap();
let m = r.find(&s).unwrap();
assert_eq!(m.match_text(&s), s);
}
}