regress/api.rs
1use crate::classicalbacktrack;
2use crate::emit;
3use crate::exec;
4use crate::indexing;
5use crate::insn::CompiledRegex;
6use crate::optimizer;
7use crate::parse;
8
9#[cfg(feature = "utf16")]
10use crate::{
11 classicalbacktrack::MatchAttempter,
12 indexing::{InputIndexer, Ucs2Input, Utf16Input},
13};
14
15#[cfg(feature = "backend-pikevm")]
16use crate::pikevm;
17use crate::util::to_char_sat;
18
19use core::{fmt, str::FromStr};
20#[cfg(feature = "std")]
21#[cfg(not(feature = "std"))]
22use {
23 alloc::{string::String, vec::Vec},
24 hashbrown::{hash_map::Iter, HashMap},
25};
26
27pub use parse::Error;
28
29/// Flags used to control regex parsing.
30/// The default flags are case-sensitive, not-multiline, and optimizing.
31#[derive(Debug, Copy, Clone, Default)]
32pub struct Flags {
33 /// If set, make the regex case-insensitive.
34 /// Equivalent to the 'i' flag in JavaScript.
35 pub icase: bool,
36
37 /// If set, ^ and $ match at line separators, not just the input boundaries.
38 /// Equivalent to the 'm' flag in JavaScript.
39 pub multiline: bool,
40
41 /// If set, . matches at line separators as well as any other character.
42 /// Equivalent to the 'm' flag in JavaScript.
43 pub dot_all: bool,
44
45 /// If set, disable regex IR passes.
46 pub no_opt: bool,
47
48 /// If set, the regex is interpreted as a Unicode regex.
49 /// Equivalent to the 'u' flag in JavaScript.
50 pub unicode: bool,
51
52 /// If set, the regex is interpreted as a UnicodeSets regex.
53 /// Equivalent to the 'v' flag in JavaScript.
54 pub unicode_sets: bool,
55}
56
57impl Flags {
58 /// Construct a Flags from a Unicode codepoints iterator, using JavaScript field names.
59 /// 'i' means to ignore case, 'm' means multiline, 'u' means unicode.
60 /// Note the 'g' flag implies a stateful regex and is not supported.
61 /// Other flags are not implemented and are ignored.
62 #[inline]
63 pub fn new<T: Iterator<Item = u32>>(chars: T) -> Self {
64 let mut result = Self::default();
65 for c in chars {
66 match to_char_sat(c) {
67 'm' => {
68 result.multiline = true;
69 }
70 'i' => {
71 result.icase = true;
72 }
73 's' => {
74 result.dot_all = true;
75 }
76 'u' => {
77 result.unicode = true;
78 }
79 'v' => {
80 result.unicode_sets = true;
81 }
82 _ => {
83 // Silently skip unsupported flags.
84 }
85 }
86 }
87 result
88 }
89}
90
91impl From<&str> for Flags {
92 /// Construct a Flags from a string, using JavaScript field names.
93 ///
94 /// See also: [`Flags::new`].
95 #[inline]
96 fn from(s: &str) -> Self {
97 Self::new(s.chars().map(u32::from))
98 }
99}
100
101impl fmt::Display for Flags {
102 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
103 if self.multiline {
104 f.write_str("m")?;
105 }
106 if self.icase {
107 f.write_str("i")?;
108 }
109 if self.dot_all {
110 f.write_str("s")?;
111 }
112 if self.unicode {
113 f.write_str("u")?;
114 }
115 Ok(())
116 }
117}
118
119/// Range is used to express the extent of a match, as indexes into the input
120/// string.
121pub type Range = core::ops::Range<usize>;
122
123/// An iterator type which yields `Match`es found in a string.
124pub type Matches<'r, 't> = exec::Matches<backends::DefaultExecutor<'r, 't>>;
125
126/// An iterator type which yields `Match`es found in a string, supporting ASCII
127/// only.
128pub type AsciiMatches<'r, 't> = exec::Matches<backends::DefaultAsciiExecutor<'r, 't>>;
129
130/// A Match represents a portion of a string which was found to match a Regex.
131#[derive(Debug, Clone)]
132pub struct Match {
133 /// The total range of the match. Note this may be empty, if the regex
134 /// matched an empty string.
135 pub range: Range,
136
137 /// The list of captures. This has length equal to the number of capturing
138 /// groups in the regex. For each capture, if the value is None, that group
139 /// did not match (for example, it was in a not-taken branch of an
140 /// alternation). If the value is Some, the group did match with the
141 /// enclosed range.
142 pub captures: Vec<Option<Range>>,
143
144 // A list of capture group names. This is either:
145 // - Empty, if there were no named capture groups.
146 // - A list of names with length `captures.len()`, corresponding to the
147 // capture group names in order. Groups without names have an empty string.
148 pub(crate) group_names: Box<[Box<str>]>,
149}
150
151impl Match {
152 /// Access a group by index, using the convention of Python's group()
153 /// function. Index 0 is the total match, index 1 is the first capture
154 /// group.
155 #[inline]
156 pub fn group(&self, idx: usize) -> Option<Range> {
157 if idx == 0 {
158 Some(self.range.clone())
159 } else {
160 self.captures[idx - 1].clone()
161 }
162 }
163
164 /// Access a named group by name.
165 #[inline]
166 pub fn named_group(&self, name: &str) -> Option<Range> {
167 // Empty strings are used as sentinels to indicate unnamed group.
168 if name.is_empty() {
169 return None;
170 }
171 let pos = self.group_names.iter().position(|s| s.as_ref() == name)?;
172 self.captures[pos].clone()
173 }
174
175 /// Return an iterator over the named groups of a Match.
176 #[inline]
177 pub fn named_groups(&self) -> NamedGroups {
178 NamedGroups::new(self)
179 }
180
181 /// Returns the range over the starting and ending byte offsets of the match in the haystack.
182 ///
183 /// This is a convenience function to work around
184 /// the fact that Range does not support Copy.
185 #[inline]
186 pub fn range(&self) -> Range {
187 self.range.clone()
188 }
189
190 /// Returns the starting byte offset of the match in the haystack.
191 #[inline]
192 pub fn start(&self) -> usize {
193 self.range.start
194 }
195
196 /// Returns the ending byte offset of the match in the haystack.
197 #[inline]
198 pub fn end(&self) -> usize {
199 self.range.end
200 }
201
202 /// Return an iterator over a Match. The first returned value is the total
203 /// match, and subsequent values represent the capture groups.
204 #[inline]
205 pub fn groups(&self) -> Groups {
206 Groups::new(self)
207 }
208}
209
210/// An iterator over the capture groups of a [`Match`]
211///
212/// This struct is created by the [`groups`] method on [`Match`].
213///
214/// [`Match`]: ../struct.Match.html
215/// [`groups`]: ../struct.Match.html#method.groups
216#[derive(Clone)]
217pub struct Groups<'m> {
218 mat: &'m Match,
219 i: usize,
220 max: usize,
221}
222
223impl<'m> Groups<'m> {
224 #[inline]
225 fn new(mat: &'m Match) -> Self {
226 Self {
227 mat,
228 i: 0,
229 max: mat.captures.len() + 1,
230 }
231 }
232}
233
234impl Iterator for Groups<'_> {
235 type Item = Option<Range>;
236
237 #[inline]
238 fn next(&mut self) -> Option<Self::Item> {
239 let i = self.i;
240 if i < self.max {
241 self.i += 1;
242 Some(self.mat.group(i))
243 } else {
244 None
245 }
246 }
247}
248
249/// An iterator over the named capture groups of a [`Match`]
250///
251/// This struct is created by the [`named_groups`] method on [`Match`].
252///
253/// [`Match`]: ../struct.Match.html
254/// [`named_groups`]: ../struct.Match.html#method.named_groups
255#[derive(Clone)]
256pub struct NamedGroups<'m> {
257 mat: &'m Match,
258 next_group_name_idx: usize,
259}
260
261impl<'m> NamedGroups<'m> {
262 #[inline]
263 fn new(mat: &'m Match) -> Self {
264 Self {
265 mat,
266 next_group_name_idx: 0,
267 }
268 }
269}
270
271impl<'m> Iterator for NamedGroups<'m> {
272 type Item = (&'m str, Option<Range>);
273
274 #[inline]
275 fn next(&mut self) -> Option<Self::Item> {
276 // Increment next_group_name_idx until we find a non-empty name.
277 debug_assert!(self.next_group_name_idx <= self.mat.group_names.len());
278 let end = self.mat.group_names.len();
279 let mut idx = self.next_group_name_idx;
280 while idx < end && self.mat.group_names[idx].is_empty() {
281 idx += 1;
282 }
283 if idx == end {
284 return None;
285 }
286 let name = self.mat.group_names[idx].as_ref();
287 let range = self.mat.captures[idx].clone();
288 self.next_group_name_idx = idx + 1;
289 Some((name, range))
290 }
291}
292
293/// A Regex is the compiled version of a pattern.
294#[derive(Debug, Clone)]
295pub struct Regex {
296 cr: CompiledRegex,
297}
298
299impl From<CompiledRegex> for Regex {
300 fn from(cr: CompiledRegex) -> Self {
301 Self { cr }
302 }
303}
304
305impl Regex {
306 /// Construct a regex by parsing `pattern` using the default flags.
307 /// An Error may be returned if the syntax is invalid.
308 /// Note that this is rather expensive; prefer to cache a Regex which is
309 /// intended to be used more than once.
310 #[inline]
311 pub fn new(pattern: &str) -> Result<Regex, Error> {
312 Self::with_flags(pattern, Flags::default())
313 }
314
315 /// Construct a regex by parsing `pattern` with `flags`.
316 /// An Error may be returned if the syntax is invalid.
317 //
318 /// Note it is preferable to cache a Regex which is intended to be used more
319 /// than once, as the parse may be expensive. For example:
320 #[inline]
321 pub fn with_flags<F>(pattern: &str, flags: F) -> Result<Regex, Error>
322 where
323 F: Into<Flags>,
324 {
325 Self::from_unicode(pattern.chars().map(u32::from), flags)
326 }
327
328 /// Construct a regex by parsing `pattern` with `flags`, where
329 /// `pattern` is an iterator of `u32` Unicode codepoints.
330 /// An Error may be returned if the syntax is invalid.
331 /// This allows parsing regular expressions from exotic strings in
332 /// other encodings, such as UTF-16 or UTF-32.
333 pub fn from_unicode<I, F>(pattern: I, flags: F) -> Result<Regex, Error>
334 where
335 I: Iterator<Item = u32> + Clone,
336 F: Into<Flags>,
337 {
338 let flags = flags.into();
339 let mut ire = parse::try_parse(pattern, flags)?;
340 if !flags.no_opt {
341 optimizer::optimize(&mut ire);
342 }
343 let cr = emit::emit(&ire);
344 Ok(Regex { cr })
345 }
346
347 /// Searches `text` to find the first match.
348 #[inline]
349 pub fn find(&self, text: &str) -> Option<Match> {
350 self.find_iter(text).next()
351 }
352
353 /// Searches `text`, returning an iterator over non-overlapping matches.
354 /// Note that the resulting Iterator borrows both the regex `'r` and the
355 /// input string as `'t`.
356 #[inline]
357 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
358 self.find_from(text, 0)
359 }
360
361 /// Returns an iterator for matches found in 'text' starting at byte index
362 /// `start`. Note this may be different from passing a sliced `text` in
363 /// the case of lookbehind assertions.
364 /// Example:
365 ///
366 /// ```rust
367 /// use regress::Regex;
368 /// let text = "xyxy";
369 /// let re = Regex::new(r"(?<=x)y").unwrap();
370 /// let t1 = re.find(&text[1..]).unwrap().range();
371 /// assert!(t1 == (2..3));
372 /// let t2 = re.find_from(text, 1).next().unwrap().range();
373 /// assert!(t2 == (1..2));
374 /// ```
375 #[inline]
376 pub fn find_from<'r, 't>(&'r self, text: &'t str, start: usize) -> Matches<'r, 't> {
377 backends::find(self, text, start)
378 }
379
380 /// Searches `text` to find the first match.
381 /// The input text is expected to be ascii-only: only ASCII case-folding is
382 /// supported.
383 #[inline]
384 pub fn find_ascii(&self, text: &str) -> Option<Match> {
385 self.find_iter_ascii(text).next()
386 }
387
388 /// Searches `text`, returning an iterator over non-overlapping matches.
389 /// The input text is expected to be ascii-only: only ASCII case-folding is
390 /// supported.
391 #[inline]
392 pub fn find_iter_ascii<'r, 't>(&'r self, text: &'t str) -> AsciiMatches<'r, 't> {
393 self.find_from_ascii(text, 0)
394 }
395
396 /// Returns an iterator for matches found in 'text' starting at byte index
397 /// `start`.
398 #[inline]
399 pub fn find_from_ascii<'r, 't>(&'r self, text: &'t str, start: usize) -> AsciiMatches<'r, 't> {
400 backends::find(self, text, start)
401 }
402
403 /// Returns an iterator for matches found in 'text' starting at index `start`.
404 #[cfg(feature = "utf16")]
405 pub fn find_from_utf16<'r, 't>(
406 &'r self,
407 text: &'t [u16],
408 start: usize,
409 ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf16Input<'t>>>
410 {
411 let input = Utf16Input::new(text, self.cr.flags.unicode);
412 exec::Matches::new(
413 super::classicalbacktrack::BacktrackExecutor::new(
414 input,
415 MatchAttempter::new(&self.cr, input.left_end()),
416 ),
417 start,
418 )
419 }
420
421 /// Returns an iterator for matches found in 'text' starting at index `start`.
422 #[cfg(feature = "utf16")]
423 pub fn find_from_ucs2<'r, 't>(
424 &'r self,
425 text: &'t [u16],
426 start: usize,
427 ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Ucs2Input<'t>>>
428 {
429 let input = Ucs2Input::new(text, self.cr.flags.unicode);
430 exec::Matches::new(
431 super::classicalbacktrack::BacktrackExecutor::new(
432 input,
433 MatchAttempter::new(&self.cr, input.left_end()),
434 ),
435 start,
436 )
437 }
438}
439
440impl FromStr for Regex {
441 type Err = Error;
442
443 /// Attempts to parse a string into a regular expression
444 #[inline]
445 fn from_str(s: &str) -> Result<Self, Error> {
446 Self::new(s)
447 }
448}
449
450// Support for using regress with different regex backends.
451// Currently there is only the classical backtracking, and PikeVM.
452#[doc(hidden)]
453pub mod backends {
454 use super::exec;
455 use super::indexing;
456 use super::Regex;
457 pub use crate::emit::emit;
458 pub use crate::optimizer::optimize;
459 pub use crate::parse::try_parse;
460
461 /// An Executor using the classical backtracking algorithm.
462 pub type BacktrackExecutor<'r, 't> =
463 super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf8Input<'t>>;
464
465 /// A Executor using the PikeVM executor.
466 #[cfg(feature = "backend-pikevm")]
467 pub type PikeVMExecutor<'r, 't> = super::pikevm::PikeVMExecutor<'r, indexing::Utf8Input<'t>>;
468
469 /// An alias type to the default Executor.
470 pub type DefaultExecutor<'r, 't> = BacktrackExecutor<'r, 't>;
471
472 /// An alias type to the default executor's ASCII form.
473 pub type DefaultAsciiExecutor<'r, 't> =
474 <DefaultExecutor<'r, 't> as exec::Executor<'r, 't>>::AsAscii;
475
476 /// Searches `text`, returning an iterator over non-overlapping matches.
477 pub fn find<'r, 't, Executor: exec::Executor<'r, 't>>(
478 re: &'r Regex,
479 text: &'t str,
480 start: usize,
481 ) -> exec::Matches<Executor> {
482 exec::Matches::new(Executor::new(&re.cr, text), start)
483 }
484
485 /// Searches `text`, returning an iterator over non-overlapping matches.
486 /// This is a convenience method to avoid E0223.
487 pub fn find_ascii<'r, 't, Executor: exec::Executor<'r, 't>>(
488 re: &'r Regex,
489 text: &'t str,
490 start: usize,
491 ) -> exec::Matches<Executor::AsAscii> {
492 find::<Executor::AsAscii>(re, text, start)
493 }
494}