1use crate::regex::{
12 Error, Haystack,
13 ast::{Assertion, parse},
14 compile::{CompiledOps, Inst, Op, Prog, compile_ast, optimise},
15 matches::{Match, MatchIter},
16};
17use aho_corasick::AhoCorasick;
18use std::{
19 collections::HashSet,
20 fmt,
21 mem::swap,
22 sync::{Arc, Mutex},
23};
24
25pub(super) const N_SLOTS: usize = 30;
26
27pub struct Regex {
34 re: Arc<str>,
35 inner: Mutex<RegexInner>,
36}
37
38impl Clone for Regex {
39 fn clone(&self) -> Self {
40 let inner = self.inner.lock().unwrap().clone();
41
42 Self {
43 re: self.re.clone(),
44 inner: Mutex::new(inner),
45 }
46 }
47}
48
49impl PartialEq for Regex {
50 fn eq(&self, other: &Self) -> bool {
51 self.re == other.re
52 }
53}
54
55impl Eq for Regex {}
56
57impl fmt::Debug for Regex {
58 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
59 f.debug_tuple("Regex").field(&self.re).finish()
60 }
61}
62
63impl fmt::Display for Regex {
64 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
65 write!(f, "{}", self.re)
66 }
67}
68
69impl Regex {
70 pub fn compile(re: impl AsRef<str>) -> Result<Self, Error> {
75 let mut ast = parse(re.as_ref())?;
76 ast.optimise();
77 let lits = ast.leading_literals();
78 let CompiledOps {
79 ops,
80 n_submatches,
81 submatch_names,
82 } = compile_ast(ast, false);
83
84 Ok(Self::new(
85 re.as_ref(),
86 ops,
87 n_submatches,
88 submatch_names,
89 lits,
90 ))
91 }
92
93 fn new(
94 re: &str,
95 ops: Vec<Op>,
96 n_submatches: usize,
97 submatch_names: Vec<String>,
98 leading_lits: HashSet<String>,
99 ) -> Self {
100 let prog: Prog = optimise(ops)
101 .into_iter()
102 .map(|op| Inst { op, generation: 0 })
103 .collect();
104
105 let clist = vec![Thread::default(); prog.len()].into_boxed_slice();
106 let nlist = vec![Thread::default(); prog.len()].into_boxed_slice();
107 let sms = vec![SubMatches::default(); prog.len()].into_boxed_slice();
108 let free_sms = (1..prog.len()).collect();
109
110 let fast_start = if leading_lits.is_empty() {
111 None
112 } else {
113 Some(Box::new(
114 AhoCorasick::new(leading_lits).expect("using auto builder so no errors possible"),
115 ))
116 };
117
118 Self {
119 re: Arc::from(re),
120 inner: Mutex::new(RegexInner {
121 prog,
122 fast_start,
123 n_submatches,
124 submatch_names: Arc::from(submatch_names.into_boxed_slice()),
125 clist,
126 nlist,
127 generation: 0,
128 p: 0,
129 prev: None,
130 next: None,
131 sms,
132 free_sms,
133 track_submatches: true,
134 }),
135 }
136 }
137
138 pub fn matches<H>(&self, haystack: &H) -> bool
141 where
142 H: Haystack,
143 {
144 let mut inner = self.inner.lock().unwrap();
145 inner.track_submatches = false;
146 inner.match_from_byte_offset(haystack, 0).is_some()
147 }
148
149 pub fn matches_from<H>(&self, haystack: &H, offset: usize) -> bool
152 where
153 H: Haystack,
154 {
155 let mut inner = self.inner.lock().unwrap();
156 inner.track_submatches = false;
157 inner.match_from_byte_offset(haystack, offset).is_some()
158 }
159
160 pub fn matches_between<H>(&self, haystack: &H, from: usize, to: usize) -> bool
163 where
164 H: Haystack,
165 {
166 let mut inner = self.inner.lock().unwrap();
167 inner.track_submatches = false;
168 inner
169 .match_between_byte_offsets(haystack, from, to)
170 .is_some()
171 }
172
173 pub fn find<H>(&self, haystack: &H) -> Option<Match>
179 where
180 H: Haystack,
181 {
182 let mut inner = self.inner.lock().unwrap();
183 inner.track_submatches = true;
184 inner.match_from_byte_offset(haystack, 0)
185 }
186
187 pub fn find_from<H>(&self, haystack: &H, offset: usize) -> Option<Match>
194 where
195 H: Haystack,
196 {
197 let mut inner = self.inner.lock().unwrap();
198 inner.track_submatches = true;
199 inner.match_from_byte_offset(haystack, offset)
200 }
201
202 pub fn find_between<H>(&self, haystack: &H, from: usize, to: usize) -> Option<Match>
209 where
210 H: Haystack,
211 {
212 let mut inner = self.inner.lock().unwrap();
213 inner.track_submatches = true;
214 inner.match_between_byte_offsets(haystack, from, to)
215 }
216
217 pub fn find_iter<'a, H>(&'a mut self, haystack: &'a H) -> MatchIter<'a, H>
220 where
221 H: Haystack,
222 {
223 self.inner.lock().unwrap().track_submatches = true;
224
225 MatchIter {
226 haystack,
227 r: self,
228 from: 0,
229 }
230 }
231}
232
233#[derive(Debug, Clone, PartialEq, Eq)]
234pub struct RevRegex(Regex);
235
236impl RevRegex {
237 pub fn compile(re: impl AsRef<str>) -> Result<Self, Error> {
243 let mut ast = parse(re.as_ref())?;
244 ast.optimise();
245 let CompiledOps {
246 ops,
247 n_submatches,
248 submatch_names,
249 } = compile_ast(ast, true);
250
251 Ok(Self(Regex::new(
252 re.as_ref(),
253 ops,
254 n_submatches,
255 submatch_names,
256 HashSet::new(),
257 )))
258 }
259
260 pub fn find_rev_from<H>(&self, haystack: &H, offset: usize) -> Option<Match>
264 where
265 H: Haystack,
266 {
267 let mut inner = self.0.inner.lock().unwrap();
268 inner.track_submatches = true;
269 inner.run_vm(&mut haystack.rev_iter_between(0, offset), offset)
270 }
271}
272
273#[derive(Clone)]
274struct RegexInner {
275 prog: Prog,
277 fast_start: Option<Box<AhoCorasick>>,
279 n_submatches: usize,
281 submatch_names: Arc<[String]>,
283 clist: Box<[Thread]>,
285 nlist: Box<[Thread]>,
287 sms: Box<[SubMatches]>,
289 free_sms: Vec<usize>,
291 track_submatches: bool,
292 generation: usize,
295 p: usize,
297 prev: Option<char>,
299 next: Option<char>,
301}
302
303impl RegexInner {
304 fn fast_update_byte_offset<H>(&self, haystack: &H, mut byte_offset: usize) -> Option<usize>
309 where
310 H: Haystack,
311 {
312 assert!(
313 haystack.is_contiguous(),
314 "fast_update_byte_offset called for discontiguous haystack"
315 );
316
317 let ac = self.fast_start.as_ref()?;
318 if let Some(m) = ac.find(haystack.substr_from(byte_offset)?.as_ref()) {
324 byte_offset += m.start();
327 }
328
329 Some(byte_offset)
330 }
331
332 fn match_from_byte_offset<H>(&mut self, haystack: &H, mut offset: usize) -> Option<Match>
337 where
338 H: Haystack,
339 {
340 if haystack.is_contiguous() {
341 offset = self
342 .fast_update_byte_offset(haystack, offset)
343 .unwrap_or(offset);
344 }
345
346 self.run_vm(&mut haystack.iter_from(offset)?, offset)
347 }
348
349 fn match_between_byte_offsets<H>(
350 &mut self,
351 haystack: &H,
352 mut from: usize,
353 to: usize,
354 ) -> Option<Match>
355 where
356 H: Haystack,
357 {
358 if haystack.is_contiguous()
359 && let Some(new_from) = self.fast_update_byte_offset(haystack, from)
360 {
361 if new_from > to {
362 return None; }
364 from = new_from;
365 }
366
367 self.run_vm(&mut haystack.iter_between(from, to), from)
368 }
369
370 fn run_vm<I>(&mut self, input: &mut I, mut sp: usize) -> Option<Match>
379 where
380 I: Iterator<Item = (usize, char)>,
381 {
382 let mut sub_matches = [0; N_SLOTS];
383 self.free_sms = (1..self.prog.len()).collect();
384 self.sms[0] = SubMatches {
385 refs: 1,
386 inner: [0; N_SLOTS],
387 };
388
389 self.generation += 1;
392 self.add_thread(Thread::default(), sp, '\0', true);
396 swap(&mut self.clist, &mut self.nlist);
397 self.generation += 1;
398
399 let mut n = self.p;
402 self.p = 0;
403 let mut matched = false;
404
405 let mut it = input.peekable();
406 self.prev = None;
407 self.next = None;
408
409 while let Some((i, ch)) = it.next() {
410 sp = i;
411 self.next = it.peek().map(|(_, c)| *c);
412
413 for i in 0..n {
414 if let Some(sm) = self.step_thread(i, sp, ch) {
415 if !self.track_submatches {
416 return Some(Match::synthetic(0, 0));
417 }
418
419 matched = true;
420 sub_matches = self.sms[sm].inner;
421
422 for j in i..n {
425 self.sm_dec_ref(self.clist[j].sm);
426 }
427
428 break;
429 }
430 }
431
432 swap(&mut self.clist, &mut self.nlist);
433 self.prev = Some(ch);
434 self.generation += 1;
435 n = self.p;
436
437 if self.p == 0 {
438 break;
439 }
440
441 self.p = 0;
442 }
443
444 self.prev = None;
445 self.next = None;
446
447 for t in self.clist.iter_mut().take(n) {
450 if self.prog[t.pc].op == Op::Match && self.sms[t.sm].inner[1] >= sub_matches[1] {
451 matched = true;
452 sub_matches = self.sms[t.sm].inner;
453 break;
454 }
455 }
456
457 if !matched {
458 return None;
459 }
460
461 Some(Match {
462 n_submatches: self.n_submatches,
463 sub_matches,
464 submatch_names: self.submatch_names.clone(),
465 })
466 }
467
468 #[inline]
469 fn step_thread(&mut self, i: usize, sp: usize, ch: char) -> Option<usize> {
470 let t = &self.clist[i];
471 match &self.prog[t.pc].op {
472 Op::Comp(comp) if comp.matches(ch) => match t.assertion {
474 Some(a) if !a.holds_for(self.prev, ch, self.next) => {
475 self.sm_dec_ref(t.sm);
476 return None;
477 }
478 _ => self.add_thread(thread(t.pc + 1, t.sm), sp, ch, false),
479 },
480
481 Op::Match => return Some(t.sm),
482
483 _ => self.sm_dec_ref(t.sm),
486 }
487
488 None
489 }
490
491 #[inline]
492 fn add_thread(&mut self, t: Thread, sp: usize, ch: char, initial: bool) {
493 if self.prog[t.pc].generation == self.generation {
494 self.sm_dec_ref(t.sm);
495 return; }
497 self.prog[t.pc].generation = self.generation;
498
499 if let Op::Jump(l1) = self.prog[t.pc].op {
504 let th = match t.assertion {
505 Some(a) => assert_thread(l1, t.sm, a),
506 None => thread(l1, t.sm),
507 };
508 self.add_thread(th, sp, ch, initial);
509 } else if let Op::Split(l1, l2) = self.prog[t.pc].op {
510 self.sms[t.sm].refs += 1;
511 let (t1, t2) = match t.assertion {
512 Some(a) => (assert_thread(l1, t.sm, a), assert_thread(l2, t.sm, a)),
513 None => (thread(l1, t.sm), thread(l2, t.sm)),
514 };
515 self.add_thread(t1, sp, ch, initial);
516 self.add_thread(t2, sp, ch, initial);
517 } else if let Op::Assertion(a) = self.prog[t.pc].op {
518 self.add_thread(assert_thread(t.pc + 1, t.sm, a), sp, ch, initial);
519 } else if let Op::Save(s) = self.prog[t.pc].op {
520 self.handle_save(t, s, sp, ch, initial, false)
521 } else if let Op::RSave(s) = self.prog[t.pc].op {
522 self.handle_save(t, s, sp, ch, initial, true)
523 } else {
524 self.nlist[self.p] = t;
525 self.p += 1;
526 }
527 }
528
529 #[inline]
530 fn handle_save(&mut self, t: Thread, s: usize, sp: usize, ch: char, initial: bool, rev: bool) {
531 let inc_bytes = if !initial && !rev { ch.len_utf8() } else { 0 };
535
536 if (!rev && s.is_multiple_of(2)) || (rev && !s.is_multiple_of(2)) {
537 let sm = self.sm_update(t.sm, s, sp + inc_bytes);
538 let th = match t.assertion {
539 Some(a) => assert_thread(t.pc + 1, sm, a),
540 None => thread(t.pc + 1, sm),
541 };
542 self.add_thread(th, sp, ch, initial);
543 } else {
544 match t.assertion {
545 Some(a) if !a.holds_for(self.prev, ch, self.next) => self.sm_dec_ref(t.sm),
546 _ => {
547 let sm = self.sm_update(t.sm, s, sp + inc_bytes);
548 self.add_thread(thread(t.pc + 1, sm), sp, ch, initial);
549 }
550 }
551 }
552 }
553
554 #[inline]
555 fn sm_dec_ref(&mut self, i: usize) {
556 if !self.track_submatches {
557 return;
558 }
559
560 self.sms[i].refs -= 1;
561 if self.sms[i].refs == 0 {
562 self.free_sms.push(i);
563 }
564 }
565
566 #[inline]
567 fn sm_update(&mut self, i: usize, s: usize, sp: usize) -> usize {
568 if !self.track_submatches || s >= N_SLOTS {
571 return i;
572 }
573
574 let i = if self.sms[i].refs == 1 {
575 i
576 } else {
577 self.sm_dec_ref(i);
578 let j = self.free_sms.swap_remove(0);
579 self.sms[j].inner = self.sms[i].inner;
580 self.sms[j].refs = 1;
581 j
582 };
583
584 self.sms[i].inner[s] = sp;
585
586 i
587 }
588}
589
590#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
591struct SubMatches {
592 refs: usize,
594 inner: [usize; N_SLOTS],
597}
598
599#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
600struct Thread {
601 pc: usize,
603 assertion: Option<Assertion>,
605 sm: usize,
607}
608
609#[inline]
610fn thread(pc: usize, sm: usize) -> Thread {
611 Thread {
612 pc,
613 sm,
614 assertion: None,
615 }
616}
617
618#[inline]
619fn assert_thread(pc: usize, sm: usize, a: Assertion) -> Thread {
620 Thread {
621 pc,
622 sm,
623 assertion: Some(a),
624 }
625}
626
627#[cfg(test)]
628mod tests {
629 use super::*;
630 use crate::buffer::Buffer;
631 use simple_test_case::test_case;
632
633 #[test_case("foo", "foo", Some("foo"); "literal full string")]
635 #[test_case("ba*", "baaaaa", Some("baaaaa"); "zero or more present")]
636 #[test_case("ba*", "b", Some("b"); "zero or more not present")]
637 #[test_case("ba+", "baaaaa", Some("baaaaa"); "one or more present")]
638 #[test_case("ba+", "b", None; "one or more not present")]
639 #[test_case("b?a", "ba", Some("ba"); "optional present")]
640 #[test_case("b?a", "a", Some("a"); "optional not present")]
641 #[test_case("a(bb)+a", "abbbba", Some("abbbba"); "article example matching")]
642 #[test_case("a(bb)+a", "abbba", None; "article example non matching")]
643 #[test_case(".*b", "123b", Some("123b"); "dot star prefix")]
644 #[test_case("1.*", "123b", Some("123b"); "dot star suffix")]
645 #[test_case("1.*b", "123b", Some("123b"); "dot star inner")]
646 #[test_case("(c|C)ase matters", "case matters", Some("case matters"); "alternation first")]
647 #[test_case("(c|C)ase matters", "Case matters", Some("Case matters"); "alternation second")]
648 #[test_case("(aa|bbb|c|dd)", "c", Some("c"); "chained alternation")]
649 #[test_case("this@*works", "this contains\nbut still works", Some("this contains\nbut still works"); "true any")]
650 #[test_case(r"literal\?", "literal?", Some("literal?"); "escape special char")]
651 #[test_case(r"literal\t", "literal\t", Some("literal\t"); "escape sequence")]
652 #[test_case("[abc] happy cow", "a happy cow", Some("a happy cow"); "character class")]
653 #[test_case("[^abc] happy cow", "a happy cow", None; "negated character class")]
654 #[test_case("[a-zA-Z]*", "camelCaseFtw", Some("camelCaseFtw"); "char class ranges matching")]
655 #[test_case("[a-zA-Z]*1", "kebab-case-not-so-much", None; "char class ranges non matching")]
656 #[test_case("[a-zA-Z ]*", "this should work", Some("this should work"); "char class mixed")]
657 #[test_case("[\\]5]*", "5]]5555]]", Some("5]]5555]]"); "char class escaped bracket")]
658 #[test_case("[0-9]+", "0123", Some("0123"); "digit range")]
659 #[test_case("[0-9]+", "0", Some("0"); "digit range range start only")]
660 #[test_case("25[0-5]", "255", Some("255"); "ipv4 element one")]
661 #[test_case("2[0-4][0-9]", "231", Some("231"); "ipv4 element two")]
662 #[test_case("1?[0-9]?[0-9]", "155", Some("155"); "ipv4 element three three digit")]
663 #[test_case("1?[0-9]?[0-9]", "72", Some("72"); "ipv4 element three two digit")]
664 #[test_case("1?[0-9]?[0-9]", "8", Some("8"); "ipv4 element three one digit")]
665 #[test_case("1?[0-9]?[0-9]", "0", Some("0"); "ipv4 element three zero")]
666 #[test_case("(25[0-5]|2[0-4][0-9])", "255", Some("255"); "ipv4 elements one and two matching one")]
667 #[test_case("(25[0-5]|2[0-4][0-9])", "219", Some("219"); "ipv4 elements one and two matching two")]
668 #[test_case("(25[0-5]|2[0-4][0-9])", "42", None; "ipv4 elements one and two not matching")]
669 #[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 elements two and three matching two")]
670 #[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 elements two and three matching three")]
671 #[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "251", Some("251"); "ipv4 all elements matching one")]
672 #[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 all elements matching two")]
673 #[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 all elements matching three")]
674 #[test_case(
675 r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
676 "127.0.0.1 ",
677 Some("127.0.0.1");
678 "ipv4 full"
679 )]
680 #[test_case("^foo", "foo at the start", Some("foo"); "SOL holding")]
681 #[test_case("^foo", "bar\nfoo at the start", Some("foo"); "SOL holding after newline")]
682 #[test_case("^foo", "we have foo but not at the start", None; "SOL not holding")]
683 #[test_case("foo$", "a line that ends with foo", Some("foo"); "BOL holding")]
684 #[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
685 #[test_case("foo$", "a line with foo in the middle", None; "BOL not holding")]
686 #[test_case("foo", "│foo", Some("foo"); "after a multibyte char")]
687 #[test_case("a{3}", "aaa", Some("aaa"); "counted repetition")]
688 #[test_case("a{3}", "aa", None; "counted repetition non matching")]
689 #[test_case("a{3,}", "aaaaaa", Some("aaaaaa"); "counted repetition at least")]
690 #[test_case("a{3,}", "aa", None; "counted repetition at least non matching")]
691 #[test_case("a{3,5}", "aaa", Some("aaa"); "counted repetition between lower")]
692 #[test_case("a{3,5}", "aaaaa", Some("aaaaa"); "counted repetition between upper")]
693 #[test_case("a{3,5}", "aaaa", Some("aaaa"); "counted repetition in range")]
694 #[test_case("a{3,5}", "aa", None; "counted repetition less")]
695 #[test_case("^a{3,5}$", "aaaaaa", None; "counted repetition more")]
696 #[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
697 #[test_case("\\bfor\\b", "forward", None; "word boundary for match at start of word")]
698 #[test_case("\\bfor\\b", "for ward", Some("for"); "word boundary for match not inside word")]
699 #[test_case("\\bfor\\b", "bob for", Some("for"); "word boundary match not at BOF")]
700 #[test_case("\\bfor\\b", "bob for bob", Some("for"); "word boundary match not at BOF or EOF")]
701 #[test_case("\\bin\\b", "min", None; "word boundary for match at end of word")]
702 #[test_case("\\b(in)\\b", "min", None; "word boundary for sub expression match at end of word")]
703 #[test_case("\\b(in|for)\\b", "min", None; "word boundary for alt match at end of word")]
704 #[test_case("\\b(in|for)\\b", "bob for", Some("for"); "word boundary for alt match not at BOF")]
705 #[test_case("[a-zA-Z0-9_\\-./@]+\\.jpe?g", "glenda_space_medium.jpg", Some("glenda_space_medium.jpg"); "complex group")]
706 #[test_case("[a-zA-Z¡-0-9_\\-./@]+", "foo-bar_99.pdf", Some("foo-bar_99.pdf"); "multibyte group")]
707 #[test]
709 fn find_works(re: &str, s: &str, expected: Option<&str>) {
710 let r = Regex::compile(re).unwrap();
711 let m = r.find(&s).map(|m| m.match_text(&s));
712 assert_eq!(m.as_deref(), expected);
713 }
714
715 #[test_case("foo", "foo", Some("foo"); "literal full string")]
716 #[test_case("ba*", " baaaaa foo", Some("baaaaa"); "zero or more present")] #[test_case("ba*", "b foo", Some("b"); "zero or more not present")] #[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
719 #[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
720 #[test_case(
721 r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
722 "127.0.0.1 ",
723 Some("127.0.0.1");
724 "ipv4 full"
725 )]
726 #[test_case(
727 "his",
728 "this is a line\nand another\n- [ ] something to do\n",
729 Some("his");
730 "multiline input"
731 )]
732 #[test]
733 fn find_rev_works(re: &str, s: &str, expected: Option<&str>) {
734 let r = RevRegex::compile(re).unwrap();
735 let b = Buffer::new_unnamed(0, s, Default::default());
736 let m = r.find_rev_from(&b, s.len()).map(|m| m.match_text(&b));
737
738 assert_eq!(m.as_deref(), expected);
739 }
740
741 #[test_case("[0-9]+", " 42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers")]
742 #[test_case("[0-9]+", " 42 3 127 9991", &["42", "3", "127", "9991"]; "integers to EOF")]
743 #[test_case("[0-9]+", "42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers from BOF")]
744 #[test_case("[0-9]+", "42 3 127 9991", &["42", "3", "127", "9991"]; "integers full input")]
745 #[test_case("foo|bar|baz", "baz bar foo bar", &["baz", "bar", "foo", "bar"]; "alts spaced in s")]
746 #[test_case("foo|bar|baz", "bazbarfoobar", &["baz", "bar", "foo", "bar"]; "alts back to back in s")]
747 #[test_case("(foo|bar|baz)", "foo foobar barfoo baz", &["foo", "foo", "bar", "bar", "foo", "baz"]; "alts in parens")]
748 #[test_case("\\b(foo|bar|baz)\\b", "foo foobar barfoo baz", &["foo", "baz"]; "alts with word boundaries")]
749 #[test]
750 fn find_iter_works(re: &str, s: &str, expected: &[&str]) {
751 let mut r = Regex::compile(re).unwrap();
752 let matches: Vec<String> = r
753 .find_iter(&s)
754 .map(|m| m.match_text(&s).into_owned())
755 .collect();
756
757 assert_eq!(&matches, expected);
758 }
759
760 #[test]
761 fn dot_star_works() {
762 let r = Regex::compile(".*").unwrap();
763 let s = "\nthis is\na multiline\nfile";
764 let m1 = r.find(&s).unwrap();
765 assert_eq!(m1.match_text(&s), "");
766
767 let m2 = r.find(&&s[1..]).unwrap();
769 assert_eq!(m2.match_text(&&s[1..]), "this is");
770 }
771
772 #[test]
773 fn match_extraction_works() {
774 let re = "([0-9]+)-([0-9]+)-([0-9]+)";
775 let r = Regex::compile(re).unwrap();
776 let s = "this should work 123-456-789 other stuff";
777 let m = r.find(&s).unwrap();
778
779 assert_eq!(m.match_text(&s), "123-456-789");
780 assert_eq!(m.submatch_text(1, &s).as_deref(), Some("123"));
781 assert_eq!(m.submatch_text(2, &s).as_deref(), Some("456"));
782 assert_eq!(m.submatch_text(3, &s).as_deref(), Some("789"));
783 }
784
785 #[test_case("(?<xy>X|Y)", "xy", "X"; "named match on its own")]
786 #[test_case("(?<xy>X|Y)(a|b)", "xy", "X"; "named match before unnamed")]
787 #[test_case("(e| )(?<xy>X|Y)", "xy", "X"; "named match after unnamed")]
788 #[test_case("(e| )(?<xy>X|Y)(a|b)", "xy", "X"; "named match inbetween unnamed")]
789 #[test]
790 fn named_submatch_works(re: &str, name: &str, expected: &str) {
791 let r = Regex::compile(re).unwrap();
792 let s = "text before Xanadu";
793 let m = r.find(&s).unwrap();
794
795 assert_eq!(m.named_matches(), vec![name]);
796 assert_eq!(m.submatch_text_by_name(name, &s).as_deref(), Some(expected));
797 }
798
799 #[test]
800 fn multiline_input_match_dot_star_works() {
801 let r = Regex::compile(".*").unwrap();
802 let s = "this is\na multiline\nfile";
803
804 let m = r.find(&s).unwrap();
805 assert_eq!(m.match_text(&s), "this is");
806 }
807
808 #[test]
809 fn multiline_input_find_from_dot_star_works_with_non_zero_initial_sp() {
810 let r = Regex::compile(".*").unwrap();
811 let s = "this is\na multiline\nfile";
812
813 assert_eq!(s.chars().skip(7).collect::<String>(), "\na multiline\nfile");
815
816 let m1 = r.find_from(&s, 7).unwrap();
817 assert_eq!(m1.match_text(&s), "");
818
819 let m2 = r.find_from(&s, 8).unwrap();
820 assert_eq!(m2.match_text(&s), "a multiline");
821 }
822
823 #[test]
824 fn multiline_input_find_iter_dot_star_works() {
825 let mut r = Regex::compile(".*").unwrap();
826 let s = "this is\na multiline\nfile";
827
828 let mut it = r.find_iter(&s);
829
830 let m1 = it.next().unwrap();
833 assert_eq!(m1.match_text(&s), "this is");
834
835 let m2 = it.next().unwrap();
836 assert_eq!(m2.match_text(&s), "");
837
838 let m3 = it.next().unwrap();
839 assert_eq!(m3.match_text(&s), "a multiline");
840
841 let m4 = it.next().unwrap();
842 assert_eq!(m4.match_text(&s), "");
843
844 let m5 = it.next().unwrap();
845 assert_eq!(m5.match_text(&s), "file");
846 assert_eq!(it.next(), None);
847 }
848
849 #[test]
850 fn match_extraction_works_when_multibyte_characters_are_present() {
851 let s: &str = "const VLINE: char = '│';
852
853impl Editor {
854";
855
856 let re = r"impl (\w+) \{";
857 let r = Regex::compile(re).unwrap();
858 let m = r.find(&s).unwrap();
859
860 assert_eq!(m.submatch_text(1, &s).as_deref(), Some("Editor"));
861 assert_eq!(m.match_text(&s), "impl Editor {");
862 }
863
864 #[test]
867 fn pathological_match_doesnt_explode() {
868 let s = "a".repeat(100);
869 let mut re = "a?".repeat(100);
870 re.push_str(&s);
871
872 let r = Regex::compile(&re).unwrap();
873 assert!(r.find(&s.as_str()).is_some());
874 }
875
876 #[test]
879 fn repeated_match_works() {
880 let re = "a(bb)+a";
881
882 let r = Regex::compile(re).unwrap();
883 for _ in 0..10 {
884 assert!(r.find(&"abbbba").is_some());
885 assert!(r.find(&"foo").is_none());
886 }
887 }
888
889 #[test_case("Tracing_Summit_2025_Perfet_RYQoyoF.pdf"; "tracing")]
891 #[test_case("Bracing_Summit_2025_Perfet_RYQoyoF.pdf"; "bracing")]
892 #[test]
893 fn leading_literal_truncation_doesnt_affect_matching(s: &str) {
894 let re = "([a-zA-Z¡-�0-9_\\-./@]+).[Pp][Dd][Ff]";
895 let r = Regex::compile(re).unwrap();
896 let m = r.find(&s).unwrap();
897
898 assert_eq!(m.match_text(&s), s);
899 }
900}