pcre2/bytes.rs
1use std::{
2 collections::HashMap,
3 panic::{RefUnwindSafe, UnwindSafe},
4 sync::Arc,
5};
6
7use pcre2_sys::{
8 PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MATCH_INVALID_UTF,
9 PCRE2_MULTILINE, PCRE2_NEWLINE_ANYCRLF, PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF,
10};
11
12use crate::{
13 error::Error,
14 ffi::{Code, CompileContext, MatchConfig, MatchData},
15 pool::{Pool, PoolGuard},
16};
17
/// A single match of a regex within a subject string.
///
/// The lifetime parameter `'s` is the lifetime of the subject string that
/// the match borrows from.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Match<'s> {
    /// The complete subject that was searched.
    subject: &'s [u8],
    /// Byte offset in `subject` at which the match begins.
    start: usize,
    /// Byte offset in `subject` at which the match ends (exclusive).
    end: usize,
}

impl<'s> Match<'s> {
    /// Builds a match covering the byte range `start..end` of `subject`.
    fn new(subject: &'s [u8], start: usize, end: usize) -> Self {
        Self { subject, start, end }
    }

    /// The byte offset in the subject at which this match begins.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// The byte offset in the subject at which this match ends.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }

    /// The bytes of the subject covered by this match.
    #[inline]
    pub fn as_bytes(&self) -> &'s [u8] {
        let haystack: &'s [u8] = self.subject;
        &haystack[self.start..self.end]
    }

    /// The `(start, end)` offsets as a tuple, for terse test assertions.
    #[cfg(test)]
    fn as_pair(&self) -> (usize, usize) {
        (self.start(), self.end())
    }
}
58
59#[derive(Clone, Debug)]
60struct Config {
61 /// PCRE2_CASELESS
62 caseless: bool,
63 /// PCRE2_DOTALL
64 dotall: bool,
65 /// PCRE2_EXTENDED
66 extended: bool,
67 /// PCRE2_MULTILINE
68 multi_line: bool,
69 /// PCRE2_NEWLINE_ANYCRLF
70 crlf: bool,
71 /// PCRE2_UCP
72 ucp: bool,
73 /// PCRE2_UTF
74 utf: bool,
75 /// use pcre2_jit_compile
76 jit: JITChoice,
77 /// Match-time specific configuration knobs.
78 match_config: MatchConfig,
79}
80
/// How the builder should treat PCRE2's JIT compiler.
#[derive(Debug, Clone)]
enum JITChoice {
    /// Skip JIT compilation entirely.
    Never,
    /// Require JIT compilation; a JIT failure is reported as an error.
    Always,
    /// Try JIT compilation, quietly using the non-JIT path if it fails.
    Attempt,
}
90
91impl Default for Config {
92 fn default() -> Config {
93 Config {
94 caseless: false,
95 dotall: false,
96 extended: false,
97 multi_line: false,
98 crlf: false,
99 ucp: false,
100 utf: false,
101 jit: JITChoice::Never,
102 match_config: MatchConfig::default(),
103 }
104 }
105}
106
107/// A builder for configuring the compilation of a PCRE2 regex.
108#[derive(Clone, Debug)]
109pub struct RegexBuilder {
110 config: Config,
111}
112
113impl RegexBuilder {
114 /// Create a new builder with a default configuration.
115 pub fn new() -> RegexBuilder {
116 RegexBuilder { config: Config::default() }
117 }
118
119 /// Compile the given pattern into a PCRE regex using the current
120 /// configuration.
121 ///
122 /// If there was a problem compiling the pattern, then an error is
123 /// returned.
124 pub fn build(&self, pattern: &str) -> Result<Regex, Error> {
125 let mut options = 0;
126 if self.config.caseless {
127 options |= PCRE2_CASELESS;
128 }
129 if self.config.dotall {
130 options |= PCRE2_DOTALL;
131 }
132 if self.config.extended {
133 options |= PCRE2_EXTENDED;
134 }
135 if self.config.multi_line {
136 options |= PCRE2_MULTILINE;
137 }
138 if self.config.ucp {
139 options |= PCRE2_UCP;
140 options |= PCRE2_UTF;
141 options |= PCRE2_MATCH_INVALID_UTF;
142 }
143 if self.config.utf {
144 options |= PCRE2_UTF;
145 }
146
147 let mut ctx = CompileContext::new();
148 if self.config.crlf {
149 ctx.set_newline(PCRE2_NEWLINE_ANYCRLF)
150 .expect("PCRE2_NEWLINE_ANYCRLF is a legal value");
151 }
152
153 let mut code = Code::new(pattern, options, ctx)?;
154 match self.config.jit {
155 JITChoice::Never => {} // fallthrough
156 JITChoice::Always => {
157 code.jit_compile()?;
158 }
159 JITChoice::Attempt => {
160 if let Err(err) = code.jit_compile() {
161 log::debug!("JIT compilation failed: {}", err);
162 }
163 }
164 }
165 let capture_names = code.capture_names()?;
166 let mut idx = HashMap::new();
167 for (i, group) in capture_names.iter().enumerate() {
168 if let Some(ref name) = *group {
169 idx.insert(name.to_string(), i);
170 }
171 }
172 let code = Arc::new(code);
173 let match_data = {
174 let config = self.config.match_config.clone();
175 let code = Arc::clone(&code);
176 let create: MatchDataPoolFn =
177 Box::new(move || MatchData::new(config.clone(), &code));
178 Pool::new(create)
179 };
180 Ok(Regex {
181 config: Arc::new(self.config.clone()),
182 pattern: pattern.to_string(),
183 code,
184 capture_names: Arc::new(capture_names),
185 capture_names_idx: Arc::new(idx),
186 match_data,
187 })
188 }
189
190 /// Enables case insensitive matching.
191 ///
192 /// If the `utf` option is also set, then Unicode case folding is used
193 /// to determine case insensitivity. When the `utf` option is not set,
194 /// then only standard ASCII case insensitivity is considered.
195 ///
196 /// This option corresponds to the `i` flag.
197 pub fn caseless(&mut self, yes: bool) -> &mut RegexBuilder {
198 self.config.caseless = yes;
199 self
200 }
201
202 /// Enables "dot all" matching.
203 ///
204 /// When enabled, the `.` metacharacter in the pattern matches any
205 /// character, include `\n`. When disabled (the default), `.` will match
206 /// any character except for `\n`.
207 ///
208 /// This option corresponds to the `s` flag.
209 pub fn dotall(&mut self, yes: bool) -> &mut RegexBuilder {
210 self.config.dotall = yes;
211 self
212 }
213
214 /// Enable "extended" mode in the pattern, where whitespace is ignored.
215 ///
216 /// This option corresponds to the `x` flag.
217 pub fn extended(&mut self, yes: bool) -> &mut RegexBuilder {
218 self.config.extended = yes;
219 self
220 }
221
222 /// Enable multiline matching mode.
223 ///
224 /// When enabled, the `^` and `$` anchors will match both at the beginning
225 /// and end of a subject string, in addition to matching at the start of
226 /// a line and the end of a line. When disabled, the `^` and `$` anchors
227 /// will only match at the beginning and end of a subject string.
228 ///
229 /// This option corresponds to the `m` flag.
230 pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
231 self.config.multi_line = yes;
232 self
233 }
234
235 /// Enable matching of CRLF as a line terminator.
236 ///
237 /// When enabled, anchors such as `^` and `$` will match any of the
238 /// following as a line terminator: `\r`, `\n` or `\r\n`.
239 ///
240 /// This is disabled by default, in which case, only `\n` is recognized as
241 /// a line terminator.
242 pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
243 self.config.crlf = yes;
244 self
245 }
246
247 /// Enable Unicode matching mode.
248 ///
249 /// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
250 /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
251 ///
252 /// When set, this implies UTF matching mode. It is not possible to enable
253 /// Unicode matching mode without enabling UTF matching mode.
254 ///
255 /// This is disabled by default.
256 pub fn ucp(&mut self, yes: bool) -> &mut RegexBuilder {
257 self.config.ucp = yes;
258 self
259 }
260
261 /// Enable UTF matching mode.
262 ///
263 /// When enabled, characters are treated as sequences of code units that
264 /// make up a single codepoint instead of as single bytes. For example,
265 /// this will cause `.` to match any single UTF-8 encoded codepoint, where
266 /// as when this is disabled, `.` will any single byte (except for `\n` in
267 /// both cases, unless "dot all" mode is enabled).
268 ///
269 /// This is disabled by default.
270 pub fn utf(&mut self, yes: bool) -> &mut RegexBuilder {
271 self.config.utf = yes;
272 self
273 }
274
275 /// This is now deprecated and is a no-op.
276 ///
277 /// Previously, this option permitted disabling PCRE2's UTF-8 validity
278 /// check, which could result in undefined behavior if the haystack was
279 /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`,
280 /// in 10.34 which this crate always sets. When this option is enabled,
281 /// PCRE2 claims to not have undefined behavior when the haystack is
282 /// invalid UTF-8.
283 ///
284 /// Therefore, disabling the UTF-8 check is not something that is exposed
285 /// by this crate.
286 #[deprecated(
287 since = "0.2.4",
288 note = "now a no-op due to new PCRE2 features"
289 )]
290 pub fn disable_utf_check(&mut self) -> &mut RegexBuilder {
291 self
292 }
293
294 /// Enable PCRE2's JIT and return an error if it's not available.
295 ///
296 /// This generally speeds up matching quite a bit. The downside is that it
297 /// can increase the time it takes to compile a pattern.
298 ///
299 /// If the JIT isn't available or if JIT compilation returns an error, then
300 /// regex compilation will fail with the corresponding error.
301 ///
302 /// This is disabled by default, and always overrides `jit_if_available`.
303 pub fn jit(&mut self, yes: bool) -> &mut RegexBuilder {
304 if yes {
305 self.config.jit = JITChoice::Always;
306 } else {
307 self.config.jit = JITChoice::Never;
308 }
309 self
310 }
311
312 /// Enable PCRE2's JIT if it's available.
313 ///
314 /// This generally speeds up matching quite a bit. The downside is that it
315 /// can increase the time it takes to compile a pattern.
316 ///
317 /// If the JIT isn't available or if JIT compilation returns an error,
318 /// then a debug message with the error will be emitted and the regex will
319 /// otherwise silently fall back to non-JIT matching.
320 ///
321 /// This is disabled by default, and always overrides `jit`.
322 pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexBuilder {
323 if yes {
324 self.config.jit = JITChoice::Attempt;
325 } else {
326 self.config.jit = JITChoice::Never;
327 }
328 self
329 }
330
331 /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is
332 /// not enabled, then this has no effect.
333 ///
334 /// When `None` is given, no custom JIT stack will be created, and instead,
335 /// the default JIT stack is used. When the default is used, its maximum
336 /// size is 32 KB.
337 ///
338 /// When this is set, then a new JIT stack will be created with the given
339 /// maximum size as its limit.
340 ///
341 /// Increasing the stack size can be useful for larger regular expressions.
342 ///
343 /// By default, this is set to `None`.
344 pub fn max_jit_stack_size(
345 &mut self,
346 bytes: Option<usize>,
347 ) -> &mut RegexBuilder {
348 self.config.match_config.max_jit_stack_size = bytes;
349 self
350 }
351}
352
353/// A compiled PCRE2 regular expression.
354///
355/// This regex is safe to use from multiple threads simultaneously. For top
356/// performance, it is better to clone a new regex for each thread.
357pub struct Regex {
358 /// The configuration used to build the regex.
359 config: Arc<Config>,
360 /// The original pattern string.
361 pattern: String,
362 /// The underlying compiled PCRE2 object.
363 code: Arc<Code>,
364 /// The capture group names for this regex.
365 capture_names: Arc<Vec<Option<String>>>,
366 /// A map from capture group name to capture group index.
367 capture_names_idx: Arc<HashMap<String, usize>>,
368 /// A pool of mutable scratch data used by PCRE2 during matching.
369 match_data: MatchDataPool,
370}
371
372impl Clone for Regex {
373 fn clone(&self) -> Regex {
374 let match_data = {
375 let config = self.config.match_config.clone();
376 let code = Arc::clone(&self.code);
377 let create: MatchDataPoolFn =
378 Box::new(move || MatchData::new(config.clone(), &code));
379 Pool::new(create)
380 };
381 Regex {
382 config: Arc::clone(&self.config),
383 pattern: self.pattern.clone(),
384 code: Arc::clone(&self.code),
385 capture_names: Arc::clone(&self.capture_names),
386 capture_names_idx: Arc::clone(&self.capture_names_idx),
387 match_data,
388 }
389 }
390}
391
392impl std::fmt::Debug for Regex {
393 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
394 write!(f, "Regex({:?})", self.pattern)
395 }
396}
397
398impl Regex {
399 /// Compiles a regular expression using the default configuration.
400 ///
401 /// Once compiled, it can be used repeatedly to search, split or replace
402 /// text in a string.
403 ///
404 /// If an invalid expression is given, then an error is returned.
405 ///
406 /// To configure compilation options for the regex, use the
407 /// [`RegexBuilder`](struct.RegexBuilder.html).
408 pub fn new(pattern: &str) -> Result<Regex, Error> {
409 RegexBuilder::new().build(pattern)
410 }
411
412 /// Returns true if and only if the regex matches the subject string given.
413 ///
414 /// # Example
415 ///
416 /// Test if some text contains at least one word with exactly 13 ASCII word
417 /// bytes:
418 ///
419 /// ```rust
420 /// # fn example() -> Result<(), ::pcre2::Error> {
421 /// use pcre2::bytes::Regex;
422 ///
423 /// let text = b"I categorically deny having triskaidekaphobia.";
424 /// assert!(Regex::new(r"\b\w{13}\b")?.is_match(text)?);
425 /// # Ok(()) }; example().unwrap()
426 /// ```
427 pub fn is_match(&self, subject: &[u8]) -> Result<bool, Error> {
428 self.is_match_at(subject, 0)
429 }
430
431 /// Returns the start and end byte range of the leftmost-first match in
432 /// `subject`. If no match exists, then `None` is returned.
433 ///
434 /// # Example
435 ///
436 /// Find the start and end location of the first word with exactly 13
437 /// ASCII word bytes:
438 ///
439 /// ```rust
440 /// # fn example() -> Result<(), ::pcre2::Error> {
441 /// use pcre2::bytes::Regex;
442 ///
443 /// let text = b"I categorically deny having triskaidekaphobia.";
444 /// let mat = Regex::new(r"\b\w{13}\b")?.find(text)?.unwrap();
445 /// assert_eq!((mat.start(), mat.end()), (2, 15));
446 /// # Ok(()) }; example().unwrap()
447 /// ```
448 pub fn find<'s>(
449 &self,
450 subject: &'s [u8],
451 ) -> Result<Option<Match<'s>>, Error> {
452 self.find_at(subject, 0)
453 }
454
455 /// Returns an iterator for each successive non-overlapping match in
456 /// `subject`, returning the start and end byte indices with respect to
457 /// `subject`.
458 ///
459 /// # Example
460 ///
461 /// Find the start and end location of every word with exactly 13 ASCII
462 /// word bytes:
463 ///
464 /// ```rust
465 /// # fn example() -> Result<(), ::pcre2::Error> {
466 /// use pcre2::bytes::Regex;
467 ///
468 /// let text = b"Retroactively relinquishing remunerations is reprehensible.";
469 /// for result in Regex::new(r"\b\w{13}\b")?.find_iter(text) {
470 /// let mat = result?;
471 /// println!("{:?}", mat);
472 /// }
473 /// # Ok(()) }; example().unwrap()
474 /// ```
475 pub fn find_iter<'r, 's>(&'r self, subject: &'s [u8]) -> Matches<'r, 's> {
476 Matches {
477 re: self,
478 match_data: self.match_data(),
479 subject,
480 last_end: 0,
481 last_match: None,
482 }
483 }
484
485 /// Returns the capture groups corresponding to the leftmost-first
486 /// match in `subject`. Capture group `0` always corresponds to the entire
487 /// match. If no match is found, then `None` is returned.
488 ///
489 /// # Examples
490 ///
491 /// Say you have some text with movie names and their release years,
492 /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
493 /// looking like that, while also extracting the movie name and its release
494 /// year separately.
495 ///
496 /// ```rust
497 /// # fn example() -> Result<(), ::pcre2::Error> {
498 /// use pcre2::bytes::Regex;
499 ///
500 /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")?;
501 /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
502 /// let caps = re.captures(text)?.unwrap();
503 /// assert_eq!(&caps[1], &b"Citizen Kane"[..]);
504 /// assert_eq!(&caps[2], &b"1941"[..]);
505 /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]);
506 /// // You can also access the groups by index using the Index notation.
507 /// // Note that this will panic on an invalid index.
508 /// assert_eq!(&caps[1], b"Citizen Kane");
509 /// assert_eq!(&caps[2], b"1941");
510 /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
511 /// # Ok(()) }; example().unwrap()
512 /// ```
513 ///
514 /// Note that the full match is at capture group `0`. Each subsequent
515 /// capture group is indexed by the order of its opening `(`.
516 ///
517 /// We can make this example a bit clearer by using *named* capture groups:
518 ///
519 /// ```rust
520 /// # fn example() -> Result<(), ::pcre2::Error> {
521 /// use pcre2::bytes::Regex;
522 ///
523 /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?;
524 /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
525 /// let caps = re.captures(text)?.unwrap();
526 /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]);
527 /// assert_eq!(&caps["year"], &b"1941"[..]);
528 /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]);
529 /// // You can also access the groups by name using the Index notation.
530 /// // Note that this will panic on an invalid group name.
531 /// assert_eq!(&caps["title"], b"Citizen Kane");
532 /// assert_eq!(&caps["year"], b"1941");
533 /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
534 /// # Ok(()) }; example().unwrap()
535 /// ```
536 ///
537 /// Here we name the capture groups, which we can access with the `name`
538 /// method or the `Index` notation with a `&str`. Note that the named
539 /// capture groups are still accessible with `get` or the `Index` notation
540 /// with a `usize`.
541 ///
542 /// The `0`th capture group is always unnamed, so it must always be
543 /// accessed with `get(0)` or `[0]`.
544 pub fn captures<'s>(
545 &self,
546 subject: &'s [u8],
547 ) -> Result<Option<Captures<'s>>, Error> {
548 let mut locs = self.capture_locations();
549 Ok(self.captures_read(&mut locs, subject)?.map(move |_| Captures {
550 subject,
551 locs,
552 idx: Arc::clone(&self.capture_names_idx),
553 }))
554 }
555
556 /// Returns an iterator over all the non-overlapping capture groups matched
557 /// in `subject`. This is operationally the same as `find_iter`, except it
558 /// yields information about capturing group matches.
559 ///
560 /// # Example
561 ///
562 /// We can use this to find all movie titles and their release years in
563 /// some text, where the movie is formatted like "'Title' (xxxx)":
564 ///
565 /// ```rust
566 /// # fn example() -> Result<(), ::pcre2::Error> {
567 /// use std::str;
568 ///
569 /// use pcre2::bytes::Regex;
570 ///
571 /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?;
572 /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
573 /// for result in re.captures_iter(text) {
574 /// let caps = result?;
575 /// let title = str::from_utf8(&caps["title"]).unwrap();
576 /// let year = str::from_utf8(&caps["year"]).unwrap();
577 /// println!("Movie: {:?}, Released: {:?}", title, year);
578 /// }
579 /// // Output:
580 /// // Movie: Citizen Kane, Released: 1941
581 /// // Movie: The Wizard of Oz, Released: 1939
582 /// // Movie: M, Released: 1931
583 /// # Ok(()) }; example().unwrap()
584 /// ```
585 pub fn captures_iter<'r, 's>(
586 &'r self,
587 subject: &'s [u8],
588 ) -> CaptureMatches<'r, 's> {
589 CaptureMatches { re: self, subject, last_end: 0, last_match: None }
590 }
591}
592
593/// Advanced or "lower level" search methods.
594impl Regex {
595 /// Returns the same as is_match, but starts the search at the given
596 /// offset.
597 ///
598 /// The significance of the starting point is that it takes the surrounding
599 /// context into consideration. For example, the `\A` anchor can only
600 /// match when `start == 0`.
601 pub fn is_match_at(
602 &self,
603 subject: &[u8],
604 start: usize,
605 ) -> Result<bool, Error> {
606 assert!(
607 start <= subject.len(),
608 "start ({}) must be <= subject.len() ({})",
609 start,
610 subject.len()
611 );
612
613 let options = 0;
614 let mut match_data = self.match_data();
615 // SAFETY: We don't use any dangerous PCRE2 options.
616 let res =
617 unsafe { match_data.find(&self.code, subject, start, options) };
618 PoolGuard::put(match_data);
619 res
620 }
621
622 /// Returns the same as find, but starts the search at the given
623 /// offset.
624 ///
625 /// The significance of the starting point is that it takes the surrounding
626 /// context into consideration. For example, the `\A` anchor can only
627 /// match when `start == 0`.
628 pub fn find_at<'s>(
629 &self,
630 subject: &'s [u8],
631 start: usize,
632 ) -> Result<Option<Match<'s>>, Error> {
633 let mut match_data = self.match_data();
634 let res =
635 self.find_at_with_match_data(&mut match_data, subject, start);
636 PoolGuard::put(match_data);
637 res
638 }
639
640 /// Like find_at, but accepts match data instead of acquiring one itself.
641 ///
642 /// This is useful for implementing the iterator, which permits avoiding
643 /// the synchronization overhead of acquiring the match data.
644 #[inline(always)]
645 fn find_at_with_match_data<'s>(
646 &self,
647 match_data: &mut MatchDataPoolGuard<'_>,
648 subject: &'s [u8],
649 start: usize,
650 ) -> Result<Option<Match<'s>>, Error> {
651 assert!(
652 start <= subject.len(),
653 "start ({}) must be <= subject.len() ({})",
654 start,
655 subject.len()
656 );
657
658 let options = 0;
659 // SAFETY: We don't use any dangerous PCRE2 options.
660 if unsafe { !match_data.find(&self.code, subject, start, options)? } {
661 return Ok(None);
662 }
663 let ovector = match_data.ovector();
664 let (s, e) = (ovector[0], ovector[1]);
665 Ok(Some(Match::new(&subject, s, e)))
666 }
667
668 /// This is like `captures`, but uses
669 /// [`CaptureLocations`](struct.CaptureLocations.html)
670 /// instead of
671 /// [`Captures`](struct.Captures.html) in order to amortize allocations.
672 ///
673 /// To create a `CaptureLocations` value, use the
674 /// `Regex::capture_locations` method.
675 ///
676 /// This returns the overall match if this was successful, which is always
677 /// equivalent to the `0`th capture group.
678 pub fn captures_read<'s>(
679 &self,
680 locs: &mut CaptureLocations,
681 subject: &'s [u8],
682 ) -> Result<Option<Match<'s>>, Error> {
683 self.captures_read_at(locs, subject, 0)
684 }
685
686 /// Returns the same as `captures_read`, but starts the search at the given
687 /// offset and populates the capture locations given.
688 ///
689 /// The significance of the starting point is that it takes the surrounding
690 /// context into consideration. For example, the `\A` anchor can only
691 /// match when `start == 0`.
692 pub fn captures_read_at<'s>(
693 &self,
694 locs: &mut CaptureLocations,
695 subject: &'s [u8],
696 start: usize,
697 ) -> Result<Option<Match<'s>>, Error> {
698 assert!(
699 start <= subject.len(),
700 "start ({}) must be <= subject.len() ({})",
701 start,
702 subject.len()
703 );
704
705 let options = 0;
706 // SAFETY: We don't use any dangerous PCRE2 options.
707 if unsafe { !locs.data.find(&self.code, subject, start, options)? } {
708 return Ok(None);
709 }
710 let ovector = locs.data.ovector();
711 let (s, e) = (ovector[0], ovector[1]);
712 Ok(Some(Match::new(&subject, s, e)))
713 }
714}
715
716/// Auxiliary methods.
717impl Regex {
718 /// Returns the original pattern string for this regex.
719 pub fn as_str(&self) -> &str {
720 &self.pattern
721 }
722
723 /// Returns a sequence of all capturing groups and their names, if present.
724 ///
725 /// The length of the slice returned is always equal to the result of
726 /// `captures_len`, which is the number of capturing groups (including the
727 /// capturing group for the entire pattern).
728 ///
729 /// Each entry in the slice is the name of the corresponding capturing
730 /// group, if one exists. The first capturing group (at index `0`) is
731 /// always unnamed.
732 ///
733 /// Capturing groups are indexed by the order of the opening parenthesis.
734 pub fn capture_names(&self) -> &[Option<String>] {
735 &self.capture_names
736 }
737
738 /// Returns the number of capturing groups in the pattern.
739 ///
740 /// This is always 1 more than the number of syntactic groups in the
741 /// pattern, since the first group always corresponds to the entire match.
742 pub fn captures_len(&self) -> usize {
743 self.code.capture_count().expect("a valid capture count from PCRE2")
744 }
745
746 /// Returns an empty set of capture locations that can be reused in
747 /// multiple calls to `captures_read` or `captures_read_at`.
748 pub fn capture_locations(&self) -> CaptureLocations {
749 CaptureLocations {
750 code: Arc::clone(&self.code),
751 data: self.new_match_data(),
752 }
753 }
754
755 fn match_data(&self) -> MatchDataPoolGuard<'_> {
756 self.match_data.get()
757 }
758
759 fn new_match_data(&self) -> MatchData {
760 MatchData::new(self.config.match_config.clone(), &self.code)
761 }
762}
763
764/// CaptureLocations is a low level representation of the raw offsets of each
765/// submatch.
766///
767/// Primarily, this type is useful when using `Regex` APIs such as
768/// `captures_read`, which permits amortizing the allocation in which capture
769/// match locations are stored.
770///
771/// In order to build a value of this type, you'll need to call the
772/// `capture_locations` method on the `Regex` being used to execute the search.
773/// The value returned can then be reused in subsequent searches.
774pub struct CaptureLocations {
775 code: Arc<Code>,
776 data: MatchData,
777}
778
779impl Clone for CaptureLocations {
780 fn clone(&self) -> CaptureLocations {
781 CaptureLocations {
782 code: Arc::clone(&self.code),
783 data: MatchData::new(self.data.config().clone(), &self.code),
784 }
785 }
786}
787
788impl std::fmt::Debug for CaptureLocations {
789 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
790 let mut offsets: Vec<Option<usize>> = vec![];
791 for &offset in self.data.ovector() {
792 if offset == PCRE2_UNSET {
793 offsets.push(None);
794 } else {
795 offsets.push(Some(offset));
796 }
797 }
798 write!(f, "CaptureLocations(")?;
799 f.debug_list().entries(offsets).finish()?;
800 write!(f, ")")
801 }
802}
803
804impl CaptureLocations {
805 /// Returns the start and end positions of the Nth capture group.
806 ///
807 /// This returns `None` if `i` is not a valid capture group or if the
808 /// capture group did not match anything.
809 ///
810 /// The positions returned are always byte indices with respect to the
811 /// original subject string matched.
812 #[inline]
813 pub fn get(&self, i: usize) -> Option<(usize, usize)> {
814 let start_index = i.checked_mul(2)?;
815 let end_index = start_index.checked_add(1)?;
816 let ovec = self.data.ovector();
817 let start = *ovec.get(start_index)?;
818 let end = *ovec.get(end_index)?;
819 if start == PCRE2_UNSET || end == PCRE2_UNSET {
820 return None;
821 }
822 Some((start, end))
823 }
824
825 /// Returns the total number of capturing groups.
826 ///
827 /// This is always at least `1` since every regex has at least `1`
828 /// capturing group that corresponds to the entire match.
829 #[inline]
830 pub fn len(&self) -> usize {
831 self.data.ovector().len() / 2
832 }
833}
834
835/// Captures represents a group of captured byte strings for a single match.
836///
837/// The 0th capture always corresponds to the entire match. Each subsequent
838/// index corresponds to the next capture group in the regex. If a capture
839/// group is named, then the matched byte string is *also* available via the
840/// `name` method. (Note that the 0th capture is always unnamed and so must be
841/// accessed with the `get` method.)
842///
843/// Positions returned from a capture group are always byte indices.
844///
845/// `'s` is the lifetime of the matched subject string.
846pub struct Captures<'s> {
847 subject: &'s [u8],
848 locs: CaptureLocations,
849 idx: Arc<HashMap<String, usize>>,
850}
851
852impl<'s> Captures<'s> {
853 /// Returns the match associated with the capture group at index `i`. If
854 /// `i` does not correspond to a capture group, or if the capture group
855 /// did not participate in the match, then `None` is returned.
856 ///
857 /// # Examples
858 ///
859 /// Get the text of the match with a default of an empty string if this
860 /// group didn't participate in the match:
861 ///
862 /// ```rust
863 /// # fn example() -> Result<(), ::pcre2::Error> {
864 /// use pcre2::bytes::Regex;
865 ///
866 /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))")?;
867 /// let caps = re.captures(b"abc123")?.unwrap();
868 ///
869 /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
870 /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
871 /// assert_eq!(text1, &b"123"[..]);
872 /// assert_eq!(text2, &b""[..]);
873 /// # Ok(()) }; example().unwrap()
874 /// ```
875 pub fn get(&self, i: usize) -> Option<Match<'s>> {
876 self.locs.get(i).map(|(s, e)| Match::new(self.subject, s, e))
877 }
878
879 /// Returns the match for the capture group named `name`. If `name` isn't a
880 /// valid capture group or didn't match anything, then `None` is returned.
881 pub fn name(&self, name: &str) -> Option<Match<'s>> {
882 self.idx.get(name).and_then(|&i| self.get(i))
883 }
884
885 /// Returns the number of captured groups.
886 ///
887 /// This is always at least `1`, since every regex has at least one capture
888 /// group that corresponds to the full match.
889 #[inline]
890 pub fn len(&self) -> usize {
891 self.locs.len()
892 }
893}
894
895impl<'s> std::fmt::Debug for Captures<'s> {
896 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
897 f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
898 }
899}
900
901struct CapturesDebug<'c, 's: 'c>(&'c Captures<'s>);
902
903impl<'c, 's> std::fmt::Debug for CapturesDebug<'c, 's> {
904 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
905 fn escape_bytes(bytes: &[u8]) -> String {
906 let mut s = String::new();
907 for &b in bytes {
908 s.push_str(&escape_byte(b));
909 }
910 s
911 }
912
913 fn escape_byte(byte: u8) -> String {
914 use std::ascii::escape_default;
915
916 let escaped: Vec<u8> = escape_default(byte).collect();
917 String::from_utf8_lossy(&escaped).into_owned()
918 }
919
920 // We'd like to show something nice here, even if it means an
921 // allocation to build a reverse index.
922 let slot_to_name: HashMap<&usize, &String> =
923 self.0.idx.iter().map(|(a, b)| (b, a)).collect();
924 let mut map = f.debug_map();
925 for slot in 0..self.0.len() {
926 let m = self
927 .0
928 .locs
929 .get(slot)
930 .map(|(s, e)| escape_bytes(&self.0.subject[s..e]));
931 if let Some(name) = slot_to_name.get(&slot) {
932 map.entry(&name, &m);
933 } else {
934 map.entry(&slot, &m);
935 }
936 }
937 map.finish()
938 }
939}
940
941/// Get a group by index.
942///
943/// `'s` is the lifetime of the matched subject string.
944///
945/// The subject can't outlive the `Captures` object if this method is
946/// used, because of how `Index` is defined (normally `a[i]` is part
947/// of `a` and can't outlive it); to do that, use `get()` instead.
948///
949/// # Panics
950///
951/// If there is no group at the given index.
952impl<'s> std::ops::Index<usize> for Captures<'s> {
953 type Output = [u8];
954
955 fn index(&self, i: usize) -> &[u8] {
956 self.get(i)
957 .map(|m| m.as_bytes())
958 .unwrap_or_else(|| panic!("no group at index '{}'", i))
959 }
960}
961
962/// Get a group by name.
963///
964/// `'s` is the lifetime of the matched subject string and `'i` is the lifetime
965/// of the group name (the index).
966///
967/// The text can't outlive the `Captures` object if this method is
968/// used, because of how `Index` is defined (normally `a[i]` is part
969/// of `a` and can't outlive it); to do that, use `name` instead.
970///
971/// # Panics
972///
973/// If there is no group named by the given value.
974impl<'s, 'i> std::ops::Index<&'i str> for Captures<'s> {
975 type Output = [u8];
976
977 fn index<'a>(&'a self, name: &'i str) -> &'a [u8] {
978 self.name(name)
979 .map(|m| m.as_bytes())
980 .unwrap_or_else(|| panic!("no group named '{}'", name))
981 }
982}
983
/// An iterator over all non-overlapping matches for a particular subject
/// string.
///
/// The iterator yields matches (if no error occurred while searching)
/// corresponding to the start and end of the match. The indices are byte
/// offsets. The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'s` is the
/// lifetime of the subject string.
pub struct Matches<'r, 's> {
    /// The regex used to run every search.
    re: &'r Regex,
    /// Match data handed mutably to each search; held through a pool guard
    /// for the lifetime of the iterator.
    match_data: MatchDataPoolGuard<'r>,
    /// The subject string being searched.
    subject: &'s [u8],
    /// Byte offset at which the next search starts. It is moved one byte
    /// past an empty match so that the iterator always makes progress, and
    /// iteration ends once it exceeds `subject.len()`.
    last_end: usize,
    /// End offset of the most recently yielded match, if any. Used to reject
    /// an empty match that immediately follows the previous match.
    last_match: Option<usize>,
}
1000
1001impl<'r, 's> Iterator for Matches<'r, 's> {
1002 type Item = Result<Match<'s>, Error>;
1003
1004 fn next(&mut self) -> Option<Result<Match<'s>, Error>> {
1005 if self.last_end > self.subject.len() {
1006 return None;
1007 }
1008 let res = self.re.find_at_with_match_data(
1009 &mut self.match_data,
1010 self.subject,
1011 self.last_end,
1012 );
1013 let m = match res {
1014 Err(err) => return Some(Err(err)),
1015 Ok(None) => return None,
1016 Ok(Some(m)) => m,
1017 };
1018 if m.start() == m.end() {
1019 // This is an empty match. To ensure we make progress, start
1020 // the next search at the smallest possible starting position
1021 // of the next match following this one.
1022 self.last_end = m.end() + 1;
1023 // Don't accept empty matches immediately following a match.
1024 // Just move on to the next match.
1025 if Some(m.end()) == self.last_match {
1026 return self.next();
1027 }
1028 } else {
1029 self.last_end = m.end();
1030 }
1031 self.last_match = Some(m.end());
1032 Some(Ok(m))
1033 }
1034}
1035
/// An iterator that yields all non-overlapping capture groups matching a
/// particular regular expression.
///
/// The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'s` is the
/// lifetime of the subject string.
pub struct CaptureMatches<'r, 's> {
    /// The regex used to run every search.
    re: &'r Regex,
    /// The subject string being searched.
    subject: &'s [u8],
    /// Byte offset at which the next search starts. It is moved one byte
    /// past an empty match so that the iterator always makes progress, and
    /// iteration ends once it exceeds `subject.len()`.
    last_end: usize,
    /// End offset of the most recently yielded match, if any. Used to reject
    /// an empty match that immediately follows the previous match.
    last_match: Option<usize>,
}
1049
1050impl<'r, 's> Iterator for CaptureMatches<'r, 's> {
1051 type Item = Result<Captures<'s>, Error>;
1052
1053 fn next(&mut self) -> Option<Result<Captures<'s>, Error>> {
1054 if self.last_end > self.subject.len() {
1055 return None;
1056 }
1057 let mut locs = self.re.capture_locations();
1058 let res =
1059 self.re.captures_read_at(&mut locs, self.subject, self.last_end);
1060 let m = match res {
1061 Err(err) => return Some(Err(err)),
1062 Ok(None) => return None,
1063 Ok(Some(m)) => m,
1064 };
1065 if m.start() == m.end() {
1066 // This is an empty match. To ensure we make progress, start
1067 // the next search at the smallest possible starting position
1068 // of the next match following this one.
1069 self.last_end = m.end() + 1;
1070 // Don't accept empty matches immediately following a match.
1071 // Just move on to the next match.
1072 if Some(m.end()) == self.last_match {
1073 return self.next();
1074 }
1075 } else {
1076 self.last_end = m.end();
1077 }
1078 self.last_match = Some(m.end());
1079 Some(Ok(Captures {
1080 subject: self.subject,
1081 locs,
1082 idx: Arc::clone(&self.re.capture_names_idx),
1083 }))
1084 }
1085}
1086
/// A type alias for our pool of `MatchData` that fixes the type parameters to
/// what we actually use in practice.
type MatchDataPool = Pool<MatchData, MatchDataPoolFn>;

/// The guard type returned by a `MatchDataPool`, with the same type
/// parameters fixed.
type MatchDataPoolGuard<'a> = PoolGuard<'a, MatchData, MatchDataPoolFn>;

/// The type of the boxed closure the pool uses to create new `MatchData`
/// values. We need to spell out all of the marker traits or else we risk
/// leaking !MARKER impls.
type MatchDataPoolFn =
    Box<dyn Fn() -> MatchData + Send + Sync + UnwindSafe + RefUnwindSafe>;
1098
#[cfg(test)]
mod tests {
    use super::{Regex, RegexBuilder};
    use crate::is_jit_available;

    /// Views a string as the byte slice expected by the regex search APIs.
    fn b(string: &str) -> &[u8] {
        string.as_bytes()
    }

    /// Collects the `(start, end)` byte offsets of every match yielded by
    /// `find_iter`, panicking if any search reports an error.
    fn find_iter_tuples(re: &Regex, subject: &[u8]) -> Vec<(usize, usize)> {
        re.find_iter(subject)
            .map(|result| {
                let m = result.unwrap();
                (m.start(), m.end())
            })
            .collect()
    }

    /// Collects the `(start, end)` byte offsets of group 0 for every set of
    /// captures yielded by `captures_iter`, panicking if any search reports
    /// an error.
    fn cap_iter_tuples(re: &Regex, subject: &[u8]) -> Vec<(usize, usize)> {
        re.captures_iter(subject)
            .map(|result| {
                let caps = result.unwrap();
                let m = caps.get(0).unwrap();
                (m.start(), m.end())
            })
            .collect()
    }

    #[test]
    fn caseless() {
        let re = RegexBuilder::new().caseless(true).build("a").unwrap();
        assert!(re.is_match(b("A")).unwrap());

        let re =
            RegexBuilder::new().caseless(true).ucp(true).build("β").unwrap();
        assert!(re.is_match(b("Β")).unwrap());
    }

    #[test]
    fn crlf() {
        let re = RegexBuilder::new().crlf(true).build("a$").unwrap();
        let m = re.find(b("a\r\n")).unwrap().unwrap();
        assert_eq!(m.as_pair(), (0, 1));
    }

    #[test]
    fn dotall() {
        let re = RegexBuilder::new().dotall(false).build(".").unwrap();
        assert!(!re.is_match(b("\n")).unwrap());

        let re = RegexBuilder::new().dotall(true).build(".").unwrap();
        assert!(re.is_match(b("\n")).unwrap());
    }

    #[test]
    fn extended() {
        let re = RegexBuilder::new().extended(true).build("a b c").unwrap();
        assert!(re.is_match(b("abc")).unwrap());
    }

    #[test]
    fn multi_line() {
        let re = RegexBuilder::new().multi_line(false).build("^abc$").unwrap();
        assert!(!re.is_match(b("foo\nabc\nbar")).unwrap());

        let re = RegexBuilder::new().multi_line(true).build("^abc$").unwrap();
        assert!(re.is_match(b("foo\nabc\nbar")).unwrap());
    }

    #[test]
    fn ucp() {
        let re = RegexBuilder::new().ucp(false).build(r"\w").unwrap();
        assert!(!re.is_match(b("β")).unwrap());

        let re = RegexBuilder::new().ucp(true).build(r"\w").unwrap();
        assert!(re.is_match(b("β")).unwrap());
    }

    #[test]
    fn utf() {
        let re = RegexBuilder::new().utf(false).build(".").unwrap();
        assert_eq!(re.find(b("β")).unwrap().unwrap().as_pair(), (0, 1));

        let re = RegexBuilder::new().utf(true).build(".").unwrap();
        assert_eq!(re.find(b("β")).unwrap().unwrap().as_pair(), (0, 2));
    }

    #[test]
    fn jit4lyfe() {
        if is_jit_available() {
            let re = RegexBuilder::new().jit(true).build(r"\w").unwrap();
            assert!(re.is_match(b("a")).unwrap());
        } else {
            // Check that if JIT isn't enabled, then we get an error if we
            // require JIT.
            RegexBuilder::new().jit(true).build(r"\w").unwrap_err();
        }
    }

    // Unlike jit4lyfe, this tests that everything works when requesting the
    // JIT only if it's available. In jit4lyfe, we require the JIT or fail.
    // If the JIT isn't available, then in this test, we simply don't use it.
    #[test]
    fn jit_if_available() {
        let re =
            RegexBuilder::new().jit_if_available(true).build(r"\w").unwrap();
        assert!(re.is_match(b("a")).unwrap());
    }

    // This tests a regression that caused a segfault in the pcre2 library
    // https://github.com/BurntSushi/rust-pcre2/issues/10
    #[test]
    fn jit_test_lazy_alloc_subject() {
        let subject: Vec<u8> = vec![];

        let re = RegexBuilder::new()
            .jit_if_available(true)
            .build(r"xxxx|xxxx|xxxx")
            .unwrap();
        assert!(!re.is_match(&subject).unwrap());
    }

    #[test]
    fn utf_with_invalid_data() {
        let re = RegexBuilder::new().build(r".").unwrap();
        assert_eq!(re.find(b"\xFF").unwrap().unwrap().as_pair(), (0, 1));

        let re = RegexBuilder::new().utf(true).build(r".").unwrap();
        assert!(re.find(b"\xFF").is_err());
    }

    #[test]
    fn capture_names() {
        let re = RegexBuilder::new()
            .build(r"(?P<foo>abc)|(def)|(?P<a>ghi)|(?P<springsteen>jkl)")
            .unwrap();
        assert_eq!(
            re.capture_names().to_vec(),
            vec![
                None,
                Some("foo".to_string()),
                None,
                Some("a".to_string()),
                Some("springsteen".to_string()),
            ]
        );

        // Test our internal map as well.
        assert_eq!(re.capture_names_idx.len(), 3);
        assert_eq!(re.capture_names_idx["foo"], 1);
        assert_eq!(re.capture_names_idx["a"], 3);
        assert_eq!(re.capture_names_idx["springsteen"], 4);
    }

    #[test]
    fn captures_get() {
        let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
        let caps = re.captures(b"abc123").unwrap().unwrap();

        let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
        let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
        assert_eq!(text1, &b"123"[..]);
        assert_eq!(text2, &b""[..]);
        assert_eq!(caps.get(usize::MAX), None);
    }

    #[test]
    fn find_iter_empty() {
        let re = Regex::new(r"(?m:^)").unwrap();
        assert_eq!(find_iter_tuples(&re, b""), vec![(0, 0)]);
        assert_eq!(find_iter_tuples(&re, b"\n"), vec![(0, 0)]);
        assert_eq!(find_iter_tuples(&re, b"\n\n"), vec![(0, 0), (1, 1)]);
        assert_eq!(find_iter_tuples(&re, b"\na\n"), vec![(0, 0), (1, 1)]);
        assert_eq!(
            find_iter_tuples(&re, b"\na\n\n"),
            vec![(0, 0), (1, 1), (3, 3),]
        );
    }

    #[test]
    fn captures_iter_empty() {
        let re = Regex::new(r"(?m:^)").unwrap();
        assert_eq!(cap_iter_tuples(&re, b""), vec![(0, 0)]);
        assert_eq!(cap_iter_tuples(&re, b"\n"), vec![(0, 0)]);
        assert_eq!(cap_iter_tuples(&re, b"\n\n"), vec![(0, 0), (1, 1)]);
        assert_eq!(cap_iter_tuples(&re, b"\na\n"), vec![(0, 0), (1, 1)]);
        assert_eq!(
            cap_iter_tuples(&re, b"\na\n\n"),
            vec![(0, 0), (1, 1), (3, 3),]
        );
    }

    #[test]
    fn max_jit_stack_size_does_something() {
        if !is_jit_available() {
            return;
        }

        let hundred = "\
            ABCDEFGHIJKLMNOPQRSTUVWXY\
            ABCDEFGHIJKLMNOPQRSTUVWXY\
            ABCDEFGHIJKLMNOPQRSTUVWXY\
            ABCDEFGHIJKLMNOPQRSTUVWXY\
        ";
        // `repeat` already returns an owned `String`.
        let hay = hundred.repeat(100);

        // First, try a regex that checks that we can blow the JIT stack limit.
        let re = RegexBuilder::new()
            .ucp(true)
            .jit(true)
            .max_jit_stack_size(Some(1))
            .build(r"((((\w{10})){100}))+")
            .unwrap();
        let err = match re.is_match(hay.as_bytes()) {
            // Skip this test, since for some reason we weren't able to blow
            // the stack limit.
            Ok(_) => return,
            Err(err) => err,
        };
        assert!(err.to_string().contains("JIT stack limit reached"));

        // Now bump up the JIT stack limit and check that it succeeds.
        let re = RegexBuilder::new()
            .ucp(true)
            .jit(true)
            .max_jit_stack_size(Some(1 << 20))
            .build(r"((((\w{10})){100}))+")
            .unwrap();
        assert!(re.is_match(hay.as_bytes()).unwrap());
    }

    #[test]
    fn find_start_end_and_as_bytes() {
        let hay =
            "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
        let pattern = r"
            (?x)     (?#: Allow comments and whitespace.)

            [a-z]    (?#: Lowercase letter.)
            +        (?#: One or more times.)
        ";
        let re = RegexBuilder::new()
            .extended(true)
            .utf(true)
            .jit_if_available(true)
            .build(pattern)
            .unwrap();
        let matched = re.find(hay.as_bytes()).unwrap().unwrap();
        assert_eq!(matched.start(), 10);
        assert_eq!(matched.end(), 10 + 26);
        assert_eq!(matched.as_bytes(), b"abcdefghijklmnopqrstuvwxyz");
    }

    #[test]
    fn find_utf_emoji_as_bytes() {
        let hay = "0123456789😀👍🏼🎉abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
        let pattern = r"(*UTF)
            (?x)     (?#: Allow comments and whitespace.)

            [^\N{U+0000}-\N{U+007F}]    (?#: Non-ascii code points.)
            +                           (?#: One or more times.)
        ";
        let re = RegexBuilder::new()
            .extended(true)
            .utf(true)
            .jit_if_available(true)
            .build(pattern)
            .unwrap();
        let matched = re.find(hay.as_bytes()).unwrap().unwrap();
        assert_eq!(matched.as_bytes(), "😀👍🏼🎉".as_bytes());
    }

    // See: https://github.com/BurntSushi/rust-pcre2/issues/50
    #[test]
    fn capture_get_does_not_panic() {
        let re = Regex::new("").unwrap();
        let caps = re.captures(b"abc").unwrap().unwrap();
        assert_eq!(Some((0, 0)), caps.get(0).map(|m| (m.start(), m.end())));
        assert_eq!(None, caps.get(1));
        assert_eq!(None, caps.get(usize::MAX - 1));
        assert_eq!(None, caps.get(usize::MAX));
    }
}