regress/
api.rs

1use crate::classicalbacktrack;
2use crate::emit;
3use crate::exec;
4use crate::indexing;
5use crate::insn::CompiledRegex;
6use crate::optimizer;
7use crate::parse;
8
9#[cfg(feature = "utf16")]
10use crate::{
11    classicalbacktrack::MatchAttempter,
12    indexing::{InputIndexer, Ucs2Input, Utf16Input},
13};
14
15#[cfg(feature = "backend-pikevm")]
16use crate::pikevm;
17use crate::util::to_char_sat;
18
19use core::{fmt, str::FromStr};
20#[cfg(feature = "std")]
21#[cfg(not(feature = "std"))]
22use {
23    alloc::{string::String, vec::Vec},
24    hashbrown::{hash_map::Iter, HashMap},
25};
26
27pub use parse::Error;
28
29/// Flags used to control regex parsing.
30/// The default flags are case-sensitive, not-multiline, and optimizing.
31#[derive(Debug, Copy, Clone, Default)]
32pub struct Flags {
33    /// If set, make the regex case-insensitive.
34    /// Equivalent to the 'i' flag in JavaScript.
35    pub icase: bool,
36
37    /// If set, ^ and $ match at line separators, not just the input boundaries.
38    /// Equivalent to the 'm' flag in JavaScript.
39    pub multiline: bool,
40
41    /// If set, . matches at line separators as well as any other character.
42    /// Equivalent to the 'm' flag in JavaScript.
43    pub dot_all: bool,
44
45    /// If set, disable regex IR passes.
46    pub no_opt: bool,
47
48    /// If set, the regex is interpreted as a Unicode regex.
49    /// Equivalent to the 'u' flag in JavaScript.
50    pub unicode: bool,
51
52    /// If set, the regex is interpreted as a UnicodeSets regex.
53    /// Equivalent to the 'v' flag in JavaScript.
54    pub unicode_sets: bool,
55}
56
57impl Flags {
58    /// Construct a Flags from a Unicode codepoints iterator, using JavaScript field names.
59    /// 'i' means to ignore case, 'm' means multiline, 'u' means unicode.
60    /// Note the 'g' flag implies a stateful regex and is not supported.
61    /// Other flags are not implemented and are ignored.
62    #[inline]
63    pub fn new<T: Iterator<Item = u32>>(chars: T) -> Self {
64        let mut result = Self::default();
65        for c in chars {
66            match to_char_sat(c) {
67                'm' => {
68                    result.multiline = true;
69                }
70                'i' => {
71                    result.icase = true;
72                }
73                's' => {
74                    result.dot_all = true;
75                }
76                'u' => {
77                    result.unicode = true;
78                }
79                'v' => {
80                    result.unicode_sets = true;
81                }
82                _ => {
83                    // Silently skip unsupported flags.
84                }
85            }
86        }
87        result
88    }
89}
90
91impl From<&str> for Flags {
92    /// Construct a Flags from a string, using JavaScript field names.
93    ///
94    /// See also: [`Flags::new`].
95    #[inline]
96    fn from(s: &str) -> Self {
97        Self::new(s.chars().map(u32::from))
98    }
99}
100
101impl fmt::Display for Flags {
102    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
103        if self.multiline {
104            f.write_str("m")?;
105        }
106        if self.icase {
107            f.write_str("i")?;
108        }
109        if self.dot_all {
110            f.write_str("s")?;
111        }
112        if self.unicode {
113            f.write_str("u")?;
114        }
115        Ok(())
116    }
117}
118
119/// Range is used to express the extent of a match, as indexes into the input
120/// string.
121pub type Range = core::ops::Range<usize>;
122
123/// An iterator type which yields `Match`es found in a string.
124pub type Matches<'r, 't> = exec::Matches<backends::DefaultExecutor<'r, 't>>;
125
126/// An iterator type which yields `Match`es found in a string, supporting ASCII
127/// only.
128pub type AsciiMatches<'r, 't> = exec::Matches<backends::DefaultAsciiExecutor<'r, 't>>;
129
130/// A Match represents a portion of a string which was found to match a Regex.
131#[derive(Debug, Clone)]
132pub struct Match {
133    /// The total range of the match. Note this may be empty, if the regex
134    /// matched an empty string.
135    pub range: Range,
136
137    /// The list of captures. This has length equal to the number of capturing
138    /// groups in the regex. For each capture, if the value is None, that group
139    /// did not match (for example, it was in a not-taken branch of an
140    /// alternation). If the value is Some, the group did match with the
141    /// enclosed range.
142    pub captures: Vec<Option<Range>>,
143
144    // A list of capture group names. This is either:
145    //   - Empty, if there were no named capture groups.
146    //   - A list of names with length `captures.len()`, corresponding to the
147    //     capture group names in order. Groups without names have an empty string.
148    pub(crate) group_names: Box<[Box<str>]>,
149}
150
151impl Match {
152    /// Access a group by index, using the convention of Python's group()
153    /// function. Index 0 is the total match, index 1 is the first capture
154    /// group.
155    #[inline]
156    pub fn group(&self, idx: usize) -> Option<Range> {
157        if idx == 0 {
158            Some(self.range.clone())
159        } else {
160            self.captures[idx - 1].clone()
161        }
162    }
163
164    /// Access a named group by name.
165    #[inline]
166    pub fn named_group(&self, name: &str) -> Option<Range> {
167        // Empty strings are used as sentinels to indicate unnamed group.
168        if name.is_empty() {
169            return None;
170        }
171        let pos = self.group_names.iter().position(|s| s.as_ref() == name)?;
172        self.captures[pos].clone()
173    }
174
175    /// Return an iterator over the named groups of a Match.
176    #[inline]
177    pub fn named_groups(&self) -> NamedGroups {
178        NamedGroups::new(self)
179    }
180
181    /// Returns the range over the starting and ending byte offsets of the match in the haystack.
182    ///
183    /// This is a convenience function to work around
184    /// the fact that Range does not support Copy.
185    #[inline]
186    pub fn range(&self) -> Range {
187        self.range.clone()
188    }
189
190    /// Returns the starting byte offset of the match in the haystack.
191    #[inline]
192    pub fn start(&self) -> usize {
193        self.range.start
194    }
195
196    /// Returns the ending byte offset of the match in the haystack.
197    #[inline]
198    pub fn end(&self) -> usize {
199        self.range.end
200    }
201
202    /// Return an iterator over a Match. The first returned value is the total
203    /// match, and subsequent values represent the capture groups.
204    #[inline]
205    pub fn groups(&self) -> Groups {
206        Groups::new(self)
207    }
208}
209
210/// An iterator over the capture groups of a [`Match`]
211///
212/// This struct is created by the [`groups`] method on [`Match`].
213///
214/// [`Match`]: ../struct.Match.html
215/// [`groups`]: ../struct.Match.html#method.groups
216#[derive(Clone)]
217pub struct Groups<'m> {
218    mat: &'m Match,
219    i: usize,
220    max: usize,
221}
222
223impl<'m> Groups<'m> {
224    #[inline]
225    fn new(mat: &'m Match) -> Self {
226        Self {
227            mat,
228            i: 0,
229            max: mat.captures.len() + 1,
230        }
231    }
232}
233
234impl Iterator for Groups<'_> {
235    type Item = Option<Range>;
236
237    #[inline]
238    fn next(&mut self) -> Option<Self::Item> {
239        let i = self.i;
240        if i < self.max {
241            self.i += 1;
242            Some(self.mat.group(i))
243        } else {
244            None
245        }
246    }
247}
248
249/// An iterator over the named capture groups of a [`Match`]
250///
251/// This struct is created by the [`named_groups`] method on [`Match`].
252///
253/// [`Match`]: ../struct.Match.html
254/// [`named_groups`]: ../struct.Match.html#method.named_groups
255#[derive(Clone)]
256pub struct NamedGroups<'m> {
257    mat: &'m Match,
258    next_group_name_idx: usize,
259}
260
261impl<'m> NamedGroups<'m> {
262    #[inline]
263    fn new(mat: &'m Match) -> Self {
264        Self {
265            mat,
266            next_group_name_idx: 0,
267        }
268    }
269}
270
271impl<'m> Iterator for NamedGroups<'m> {
272    type Item = (&'m str, Option<Range>);
273
274    #[inline]
275    fn next(&mut self) -> Option<Self::Item> {
276        // Increment next_group_name_idx until we find a non-empty name.
277        debug_assert!(self.next_group_name_idx <= self.mat.group_names.len());
278        let end = self.mat.group_names.len();
279        let mut idx = self.next_group_name_idx;
280        while idx < end && self.mat.group_names[idx].is_empty() {
281            idx += 1;
282        }
283        if idx == end {
284            return None;
285        }
286        let name = self.mat.group_names[idx].as_ref();
287        let range = self.mat.captures[idx].clone();
288        self.next_group_name_idx = idx + 1;
289        Some((name, range))
290    }
291}
292
293/// A Regex is the compiled version of a pattern.
294#[derive(Debug, Clone)]
295pub struct Regex {
296    cr: CompiledRegex,
297}
298
299impl From<CompiledRegex> for Regex {
300    fn from(cr: CompiledRegex) -> Self {
301        Self { cr }
302    }
303}
304
305impl Regex {
306    /// Construct a regex by parsing `pattern` using the default flags.
307    /// An Error may be returned if the syntax is invalid.
308    /// Note that this is rather expensive; prefer to cache a Regex which is
309    /// intended to be used more than once.
310    #[inline]
311    pub fn new(pattern: &str) -> Result<Regex, Error> {
312        Self::with_flags(pattern, Flags::default())
313    }
314
315    /// Construct a regex by parsing `pattern` with `flags`.
316    /// An Error may be returned if the syntax is invalid.
317    //
318    /// Note it is preferable to cache a Regex which is intended to be used more
319    /// than once, as the parse may be expensive. For example:
320    #[inline]
321    pub fn with_flags<F>(pattern: &str, flags: F) -> Result<Regex, Error>
322    where
323        F: Into<Flags>,
324    {
325        Self::from_unicode(pattern.chars().map(u32::from), flags)
326    }
327
328    /// Construct a regex by parsing `pattern` with `flags`, where
329    /// `pattern` is an iterator of `u32` Unicode codepoints.
330    /// An Error may be returned if the syntax is invalid.
331    /// This allows parsing regular expressions from exotic strings in
332    /// other encodings, such as UTF-16 or UTF-32.
333    pub fn from_unicode<I, F>(pattern: I, flags: F) -> Result<Regex, Error>
334    where
335        I: Iterator<Item = u32> + Clone,
336        F: Into<Flags>,
337    {
338        let flags = flags.into();
339        let mut ire = parse::try_parse(pattern, flags)?;
340        if !flags.no_opt {
341            optimizer::optimize(&mut ire);
342        }
343        let cr = emit::emit(&ire);
344        Ok(Regex { cr })
345    }
346
347    /// Searches `text` to find the first match.
348    #[inline]
349    pub fn find(&self, text: &str) -> Option<Match> {
350        self.find_iter(text).next()
351    }
352
353    /// Searches `text`, returning an iterator over non-overlapping matches.
354    /// Note that the resulting Iterator borrows both the regex `'r` and the
355    /// input string as `'t`.
356    #[inline]
357    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
358        self.find_from(text, 0)
359    }
360
361    /// Returns an iterator for matches found in 'text' starting at byte index
362    /// `start`. Note this may be different from passing a sliced `text` in
363    /// the case of lookbehind assertions.
364    /// Example:
365    ///
366    ///  ```rust
367    ///   use regress::Regex;
368    ///   let text = "xyxy";
369    ///   let re = Regex::new(r"(?<=x)y").unwrap();
370    ///   let t1 = re.find(&text[1..]).unwrap().range();
371    ///   assert!(t1 == (2..3));
372    ///   let t2 = re.find_from(text, 1).next().unwrap().range();
373    ///   assert!(t2 == (1..2));
374    ///   ```
375    #[inline]
376    pub fn find_from<'r, 't>(&'r self, text: &'t str, start: usize) -> Matches<'r, 't> {
377        backends::find(self, text, start)
378    }
379
380    /// Searches `text` to find the first match.
381    /// The input text is expected to be ascii-only: only ASCII case-folding is
382    /// supported.
383    #[inline]
384    pub fn find_ascii(&self, text: &str) -> Option<Match> {
385        self.find_iter_ascii(text).next()
386    }
387
388    /// Searches `text`, returning an iterator over non-overlapping matches.
389    /// The input text is expected to be ascii-only: only ASCII case-folding is
390    /// supported.
391    #[inline]
392    pub fn find_iter_ascii<'r, 't>(&'r self, text: &'t str) -> AsciiMatches<'r, 't> {
393        self.find_from_ascii(text, 0)
394    }
395
396    /// Returns an iterator for matches found in 'text' starting at byte index
397    /// `start`.
398    #[inline]
399    pub fn find_from_ascii<'r, 't>(&'r self, text: &'t str, start: usize) -> AsciiMatches<'r, 't> {
400        backends::find(self, text, start)
401    }
402
403    /// Returns an iterator for matches found in 'text' starting at index `start`.
404    #[cfg(feature = "utf16")]
405    pub fn find_from_utf16<'r, 't>(
406        &'r self,
407        text: &'t [u16],
408        start: usize,
409    ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf16Input<'t>>>
410    {
411        let input = Utf16Input::new(text, self.cr.flags.unicode);
412        exec::Matches::new(
413            super::classicalbacktrack::BacktrackExecutor::new(
414                input,
415                MatchAttempter::new(&self.cr, input.left_end()),
416            ),
417            start,
418        )
419    }
420
421    /// Returns an iterator for matches found in 'text' starting at index `start`.
422    #[cfg(feature = "utf16")]
423    pub fn find_from_ucs2<'r, 't>(
424        &'r self,
425        text: &'t [u16],
426        start: usize,
427    ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Ucs2Input<'t>>>
428    {
429        let input = Ucs2Input::new(text, self.cr.flags.unicode);
430        exec::Matches::new(
431            super::classicalbacktrack::BacktrackExecutor::new(
432                input,
433                MatchAttempter::new(&self.cr, input.left_end()),
434            ),
435            start,
436        )
437    }
438}
439
440impl FromStr for Regex {
441    type Err = Error;
442
443    /// Attempts to parse a string into a regular expression
444    #[inline]
445    fn from_str(s: &str) -> Result<Self, Error> {
446        Self::new(s)
447    }
448}
449
450// Support for using regress with different regex backends.
451// Currently there is only the classical backtracking, and PikeVM.
452#[doc(hidden)]
453pub mod backends {
454    use super::exec;
455    use super::indexing;
456    use super::Regex;
457    pub use crate::emit::emit;
458    pub use crate::optimizer::optimize;
459    pub use crate::parse::try_parse;
460
461    /// An Executor using the classical backtracking algorithm.
462    pub type BacktrackExecutor<'r, 't> =
463        super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf8Input<'t>>;
464
465    /// A Executor using the PikeVM executor.
466    #[cfg(feature = "backend-pikevm")]
467    pub type PikeVMExecutor<'r, 't> = super::pikevm::PikeVMExecutor<'r, indexing::Utf8Input<'t>>;
468
469    /// An alias type to the default Executor.
470    pub type DefaultExecutor<'r, 't> = BacktrackExecutor<'r, 't>;
471
472    /// An alias type to the default executor's ASCII form.
473    pub type DefaultAsciiExecutor<'r, 't> =
474        <DefaultExecutor<'r, 't> as exec::Executor<'r, 't>>::AsAscii;
475
476    /// Searches `text`, returning an iterator over non-overlapping matches.
477    pub fn find<'r, 't, Executor: exec::Executor<'r, 't>>(
478        re: &'r Regex,
479        text: &'t str,
480        start: usize,
481    ) -> exec::Matches<Executor> {
482        exec::Matches::new(Executor::new(&re.cr, text), start)
483    }
484
485    /// Searches `text`, returning an iterator over non-overlapping matches.
486    /// This is a convenience method to avoid E0223.
487    pub fn find_ascii<'r, 't, Executor: exec::Executor<'r, 't>>(
488        re: &'r Regex,
489        text: &'t str,
490        start: usize,
491    ) -> exec::Matches<Executor::AsAscii> {
492        find::<Executor::AsAscii>(re, text, start)
493    }
494}