regex_automata/dfa/onepass.rs
1/*!
2A DFA that can return spans for matching capturing groups.
3
4This module is the home of a [one-pass DFA](DFA).
5
6This module also contains a [`Builder`] and a [`Config`] for building and
7configuring a one-pass DFA.
8*/
9
10// A note on naming and credit:
11//
12// As far as I know, Russ Cox came up with the practical vision and
13// implementation of a "one-pass regex engine." He mentions and describes it
14// briefly in the third article of his regexp article series:
15// https://swtch.com/~rsc/regexp/regexp3.html
16//
17// Cox's implementation is in RE2, and the implementation below is most
18// heavily inspired by RE2's. The key thing they have in common is that
19// their transitions are defined over an alphabet of bytes. In contrast,
20// Go's regex engine also has a one-pass engine, but its transitions are
21// more firmly rooted on Unicode codepoints. The ideas are the same, but the
22// implementations are different.
23//
24// RE2 tends to call this a "one-pass NFA." Here, we call it a "one-pass DFA."
25// They're both true in their own ways:
26//
27// * The "one-pass" criterion is generally a property of the NFA itself. In
28// particular, it is said that an NFA is one-pass if, after each byte of input
29// during a search, there is at most one "VM thread" remaining to take for the
30// next byte of input. That is, there is never any ambiguity as to the path to
31// take through the NFA during a search.
32//
33// * On the other hand, once a one-pass NFA has its representation converted
34// to something where a constant number of instructions is used for each byte
35// of input, the implementation looks a lot more like a DFA. It's technically
36// more powerful than a DFA since it has side effects (storing offsets inside
37// of slots activated by a transition), but it is far closer to a DFA than an
38// NFA simulation.
39//
40// Thus, in this crate, we call it a one-pass DFA.
41
42use alloc::{vec, vec::Vec};
43
44use crate::{
45 dfa::{remapper::Remapper, DEAD},
46 nfa::thompson::{self, NFA},
47 util::{
48 alphabet::ByteClasses,
49 captures::Captures,
50 escape::DebugByte,
51 int::{Usize, U32, U64, U8},
52 look::{Look, LookSet, UnicodeWordBoundaryError},
53 primitives::{NonMaxUsize, PatternID, StateID},
54 search::{Anchored, Input, Match, MatchError, MatchKind, Span},
55 sparse_set::SparseSet,
56 },
57};
58
59/// The configuration used for building a [one-pass DFA](DFA).
60///
61/// A one-pass DFA configuration is a simple data object that is typically used
62/// with [`Builder::configure`]. It can be cheaply cloned.
63///
64/// A default configuration can be created either with `Config::new`, or
65/// perhaps more conveniently, with [`DFA::config`].
66#[derive(Clone, Debug, Default)]
67pub struct Config {
68 match_kind: Option<MatchKind>,
69 starts_for_each_pattern: Option<bool>,
70 byte_classes: Option<bool>,
71 size_limit: Option<Option<usize>>,
72}
73
74impl Config {
75 /// Return a new default one-pass DFA configuration.
76 pub fn new() -> Config {
77 Config::default()
78 }
79
80 /// Set the desired match semantics.
81 ///
82 /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
83 /// match semantics of Perl-like regex engines. That is, when multiple
84 /// patterns would match at the same leftmost position, the pattern that
85 /// appears first in the concrete syntax is chosen.
86 ///
87 /// Currently, the only other kind of match semantics supported is
88 /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
89 /// where all possible matches are visited.
90 ///
91 /// When it comes to the one-pass DFA, it is rarer for preference order and
92 /// "longest match" to actually disagree. Since if they did disagree, then
93 /// the regex typically isn't one-pass. For example, searching `Samwise`
94 /// for `Sam|Samwise` will report `Sam` for leftmost-first matching and
95 /// `Samwise` for "longest match" or "all" matching. However, this regex is
96 /// not one-pass if taken literally. The equivalent regex, `Sam(?:|wise)`
97 /// is one-pass and `Sam|Samwise` may be optimized to it.
98 ///
99 /// The other main difference is that "all" match semantics don't support
100 /// non-greedy matches. "All" match semantics always try to match as much
101 /// as possible.
102 pub fn match_kind(mut self, kind: MatchKind) -> Config {
103 self.match_kind = Some(kind);
104 self
105 }
106
107 /// Whether to compile a separate start state for each pattern in the
108 /// one-pass DFA.
109 ///
110 /// When enabled, a separate **anchored** start state is added for each
111 /// pattern in the DFA. When this start state is used, then the DFA will
112 /// only search for matches for the pattern specified, even if there are
113 /// other patterns in the DFA.
114 ///
115 /// The main downside of this option is that it can potentially increase
116 /// the size of the DFA and/or increase the time it takes to build the DFA.
117 ///
118 /// You might want to enable this option when you want to both search for
119 /// anchored matches of any pattern or to search for anchored matches of
120 /// one particular pattern while using the same DFA. (Otherwise, you would
121 /// need to compile a new DFA for each pattern.)
122 ///
123 /// By default this is disabled.
124 ///
125 /// # Example
126 ///
127 /// This example shows how to build a multi-regex and then search for
128 /// matches for a any of the patterns or matches for a specific pattern.
129 ///
130 /// ```
131 /// use regex_automata::{
132 /// dfa::onepass::DFA, Anchored, Input, Match, PatternID,
133 /// };
134 ///
135 /// let re = DFA::builder()
136 /// .configure(DFA::config().starts_for_each_pattern(true))
137 /// .build_many(&["[a-z]+", "[0-9]+"])?;
138 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
139 /// let haystack = "123abc";
140 /// let input = Input::new(haystack).anchored(Anchored::Yes);
141 ///
142 /// // A normal multi-pattern search will show pattern 1 matches.
143 /// re.try_search(&mut cache, &input, &mut caps)?;
144 /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
145 ///
146 /// // If we only want to report pattern 0 matches, then we'll get no
147 /// // match here.
148 /// let input = input.anchored(Anchored::Pattern(PatternID::must(0)));
149 /// re.try_search(&mut cache, &input, &mut caps)?;
150 /// assert_eq!(None, caps.get_match());
151 ///
152 /// # Ok::<(), Box<dyn std::error::Error>>(())
153 /// ```
154 pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
155 self.starts_for_each_pattern = Some(yes);
156 self
157 }
158
159 /// Whether to attempt to shrink the size of the DFA's alphabet or not.
160 ///
161 /// This option is enabled by default and should never be disabled unless
162 /// one is debugging a one-pass DFA.
163 ///
164 /// When enabled, the DFA will use a map from all possible bytes to their
165 /// corresponding equivalence class. Each equivalence class represents a
166 /// set of bytes that does not discriminate between a match and a non-match
167 /// in the DFA. For example, the pattern `[ab]+` has at least two
168 /// equivalence classes: a set containing `a` and `b` and a set containing
169 /// every byte except for `a` and `b`. `a` and `b` are in the same
170 /// equivalence class because they never discriminate between a match and a
171 /// non-match.
172 ///
173 /// The advantage of this map is that the size of the transition table
174 /// can be reduced drastically from (approximately) `#states * 256 *
175 /// sizeof(StateID)` to `#states * k * sizeof(StateID)` where `k` is the
176 /// number of equivalence classes (rounded up to the nearest power of 2).
177 /// As a result, total space usage can decrease substantially. Moreover,
178 /// since a smaller alphabet is used, DFA compilation becomes faster as
179 /// well.
180 ///
181 /// **WARNING:** This is only useful for debugging DFAs. Disabling this
182 /// does not yield any speed advantages. Namely, even when this is
183 /// disabled, a byte class map is still used while searching. The only
184 /// difference is that every byte will be forced into its own distinct
185 /// equivalence class. This is useful for debugging the actual generated
186 /// transitions because it lets one see the transitions defined on actual
187 /// bytes instead of the equivalence classes.
188 pub fn byte_classes(mut self, yes: bool) -> Config {
189 self.byte_classes = Some(yes);
190 self
191 }
192
193 /// Set a size limit on the total heap used by a one-pass DFA.
194 ///
195 /// This size limit is expressed in bytes and is applied during
196 /// construction of a one-pass DFA. If the DFA's heap usage exceeds
197 /// this configured limit, then construction is stopped and an error is
198 /// returned.
199 ///
200 /// The default is no limit.
201 ///
202 /// # Example
203 ///
204 /// This example shows a one-pass DFA that fails to build because of
205 /// a configured size limit. This particular example also serves as a
206 /// cautionary tale demonstrating just how big DFAs with large Unicode
207 /// character classes can get.
208 ///
209 /// ```
210 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
211 /// use regex_automata::{dfa::onepass::DFA, Match};
212 ///
213 /// // 6MB isn't enough!
214 /// DFA::builder()
215 /// .configure(DFA::config().size_limit(Some(6_000_000)))
216 /// .build(r"\w{20}")
217 /// .unwrap_err();
218 ///
219 /// // ... but 7MB probably is!
220 /// // (Note that DFA sizes aren't necessarily stable between releases.)
221 /// let re = DFA::builder()
222 /// .configure(DFA::config().size_limit(Some(7_000_000)))
223 /// .build(r"\w{20}")?;
224 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
225 /// let haystack = "A".repeat(20);
226 /// re.captures(&mut cache, &haystack, &mut caps);
227 /// assert_eq!(Some(Match::must(0, 0..20)), caps.get_match());
228 ///
229 /// # Ok::<(), Box<dyn std::error::Error>>(())
230 /// ```
231 ///
232 /// While one needs a little more than 3MB to represent `\w{20}`, it
233 /// turns out that you only need a little more than 4KB to represent
234 /// `(?-u:\w{20})`. So only use Unicode if you need it!
235 pub fn size_limit(mut self, limit: Option<usize>) -> Config {
236 self.size_limit = Some(limit);
237 self
238 }
239
240 /// Returns the match semantics set in this configuration.
241 pub fn get_match_kind(&self) -> MatchKind {
242 self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
243 }
244
245 /// Returns whether this configuration has enabled anchored starting states
246 /// for every pattern in the DFA.
247 pub fn get_starts_for_each_pattern(&self) -> bool {
248 self.starts_for_each_pattern.unwrap_or(false)
249 }
250
251 /// Returns whether this configuration has enabled byte classes or not.
252 /// This is typically a debugging oriented option, as disabling it confers
253 /// no speed benefit.
254 pub fn get_byte_classes(&self) -> bool {
255 self.byte_classes.unwrap_or(true)
256 }
257
258 /// Returns the DFA size limit of this configuration if one was set.
259 /// The size limit is total number of bytes on the heap that a DFA is
260 /// permitted to use. If the DFA exceeds this limit during construction,
261 /// then construction is stopped and an error is returned.
262 pub fn get_size_limit(&self) -> Option<usize> {
263 self.size_limit.unwrap_or(None)
264 }
265
266 /// Overwrite the default configuration such that the options in `o` are
267 /// always used. If an option in `o` is not set, then the corresponding
268 /// option in `self` is used. If it's not set in `self` either, then it
269 /// remains not set.
270 pub(crate) fn overwrite(&self, o: Config) -> Config {
271 Config {
272 match_kind: o.match_kind.or(self.match_kind),
273 starts_for_each_pattern: o
274 .starts_for_each_pattern
275 .or(self.starts_for_each_pattern),
276 byte_classes: o.byte_classes.or(self.byte_classes),
277 size_limit: o.size_limit.or(self.size_limit),
278 }
279 }
280}
281
282/// A builder for a [one-pass DFA](DFA).
283///
284/// This builder permits configuring options for the syntax of a pattern, the
285/// NFA construction and the DFA construction. This builder is different from a
286/// general purpose regex builder in that it permits fine grain configuration
287/// of the construction process. The trade off for this is complexity, and
288/// the possibility of setting a configuration that might not make sense. For
289/// example, there are two different UTF-8 modes:
290///
291/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
292/// whether the pattern itself can contain sub-expressions that match invalid
293/// UTF-8.
294/// * [`thompson::Config::utf8`] controls whether empty matches that split a
295/// Unicode codepoint are reported or not.
296///
297/// Generally speaking, callers will want to either enable all of these or
298/// disable all of these.
299///
300/// # Example
301///
302/// This example shows how to disable UTF-8 mode in the syntax and the NFA.
303/// This is generally what you want for matching on arbitrary bytes.
304///
305/// ```
306/// # if cfg!(miri) { return Ok(()); } // miri takes too long
307/// use regex_automata::{
308/// dfa::onepass::DFA,
309/// nfa::thompson,
310/// util::syntax,
311/// Match,
312/// };
313///
314/// let re = DFA::builder()
315/// .syntax(syntax::Config::new().utf8(false))
316/// .thompson(thompson::Config::new().utf8(false))
317/// .build(r"foo(?-u:[^b])ar.*")?;
318/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
319///
320/// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n";
321/// re.captures(&mut cache, haystack, &mut caps);
322/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
323/// // but the subsequent `.*` does not! Disabling UTF-8
324/// // on the syntax permits this.
325/// //
326/// // N.B. This example does not show the impact of
327/// // disabling UTF-8 mode on a one-pass DFA Config,
328/// // since that only impacts regexes that can
329/// // produce matches of length 0.
330/// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
331///
332/// # Ok::<(), Box<dyn std::error::Error>>(())
333/// ```
334#[derive(Clone, Debug)]
335pub struct Builder {
336 config: Config,
337 #[cfg(feature = "syntax")]
338 thompson: thompson::Compiler,
339}
340
341impl Builder {
342 /// Create a new one-pass DFA builder with the default configuration.
343 pub fn new() -> Builder {
344 Builder {
345 config: Config::default(),
346 #[cfg(feature = "syntax")]
347 thompson: thompson::Compiler::new(),
348 }
349 }
350
351 /// Build a one-pass DFA from the given pattern.
352 ///
353 /// If there was a problem parsing or compiling the pattern, then an error
354 /// is returned.
355 #[cfg(feature = "syntax")]
356 pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
357 self.build_many(&[pattern])
358 }
359
360 /// Build a one-pass DFA from the given patterns.
361 ///
362 /// When matches are returned, the pattern ID corresponds to the index of
363 /// the pattern in the slice given.
364 #[cfg(feature = "syntax")]
365 pub fn build_many<P: AsRef<str>>(
366 &self,
367 patterns: &[P],
368 ) -> Result<DFA, BuildError> {
369 let nfa =
370 self.thompson.build_many(patterns).map_err(BuildError::nfa)?;
371 self.build_from_nfa(nfa)
372 }
373
374 /// Build a DFA from the given NFA.
375 ///
376 /// # Example
377 ///
378 /// This example shows how to build a DFA if you already have an NFA in
379 /// hand.
380 ///
381 /// ```
382 /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Match};
383 ///
384 /// // This shows how to set non-default options for building an NFA.
385 /// let nfa = NFA::compiler()
386 /// .configure(NFA::config().shrink(true))
387 /// .build(r"[a-z0-9]+")?;
388 /// let re = DFA::builder().build_from_nfa(nfa)?;
389 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
390 /// re.captures(&mut cache, "foo123bar", &mut caps);
391 /// assert_eq!(Some(Match::must(0, 0..9)), caps.get_match());
392 ///
393 /// # Ok::<(), Box<dyn std::error::Error>>(())
394 /// ```
395 pub fn build_from_nfa(&self, nfa: NFA) -> Result<DFA, BuildError> {
396 // Why take ownership if we're just going to pass a reference to the
397 // NFA to our internal builder? Well, the first thing to note is that
398 // an NFA uses reference counting internally, so either choice is going
399 // to be cheap. So there isn't much cost either way.
400 //
401 // The real reason is that a one-pass DFA, semantically, shares
402 // ownership of an NFA. This is unlike other DFAs that don't share
403 // ownership of an NFA at all, primarily because they want to be
404 // self-contained in order to support cheap (de)serialization.
405 //
406 // But then why pass a '&nfa' below if we want to share ownership?
407 // Well, it turns out that using a '&NFA' in our internal builder
408 // separates its lifetime from the DFA we're building, and this turns
409 // out to make code a bit more composable. e.g., We can iterate over
410 // things inside the NFA while borrowing the builder as mutable because
411 // we know the NFA cannot be mutated. So TL;DR --- this weirdness is
412 // "because borrow checker."
413 InternalBuilder::new(self.config.clone(), &nfa).build()
414 }
415
416 /// Apply the given one-pass DFA configuration options to this builder.
417 pub fn configure(&mut self, config: Config) -> &mut Builder {
418 self.config = self.config.overwrite(config);
419 self
420 }
421
422 /// Set the syntax configuration for this builder using
423 /// [`syntax::Config`](crate::util::syntax::Config).
424 ///
425 /// This permits setting things like case insensitivity, Unicode and multi
426 /// line mode.
427 ///
428 /// These settings only apply when constructing a one-pass DFA directly
429 /// from a pattern.
430 #[cfg(feature = "syntax")]
431 pub fn syntax(
432 &mut self,
433 config: crate::util::syntax::Config,
434 ) -> &mut Builder {
435 self.thompson.syntax(config);
436 self
437 }
438
439 /// Set the Thompson NFA configuration for this builder using
440 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
441 ///
442 /// This permits setting things like whether additional time should be
443 /// spent shrinking the size of the NFA.
444 ///
445 /// These settings only apply when constructing a DFA directly from a
446 /// pattern.
447 #[cfg(feature = "syntax")]
448 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
449 self.thompson.configure(config);
450 self
451 }
452}
453
454/// An internal builder for encapsulating the state necessary to build a
455/// one-pass DFA. Typical use is just `InternalBuilder::new(..).build()`.
456///
457/// There is no separate pass for determining whether the NFA is one-pass or
458/// not. We just try to build the DFA. If during construction we discover that
459/// it is not one-pass, we bail out. This is likely to lead to some undesirable
460/// expense in some cases, so it might make sense to try an identify common
461/// patterns in the NFA that make it definitively not one-pass. That way, we
462/// can avoid ever trying to build a one-pass DFA in the first place. For
463/// example, '\w*\s' is not one-pass, and since '\w' is Unicode-aware by
464/// default, it's probably not a trivial cost to try and build a one-pass DFA
465/// for it and then fail.
466///
467/// Note that some (immutable) fields are duplicated here. For example, the
468/// 'nfa' and 'classes' fields are both in the 'DFA'. They are the same thing,
469/// but we duplicate them because it makes composition easier below. Otherwise,
470/// since the borrow checker can't see through method calls, the mutable borrow
471/// we use to mutate the DFA winds up preventing borrowing from any other part
472/// of the DFA, even though we aren't mutating those parts. We only do this
473/// because the duplication is cheap.
474#[derive(Debug)]
475struct InternalBuilder<'a> {
476 /// The DFA we're building.
477 dfa: DFA,
478 /// An unordered collection of NFA state IDs that we haven't yet tried to
479 /// build into a DFA state yet.
480 ///
481 /// This collection does not ultimately wind up including every NFA state
482 /// ID. Instead, each ID represents a "start" state for a sub-graph of the
483 /// NFA. The set of NFA states we then use to build a DFA state consists
484 /// of that "start" state and all states reachable from it via epsilon
485 /// transitions.
486 uncompiled_nfa_ids: Vec<StateID>,
487 /// A map from NFA state ID to DFA state ID. This is useful for easily
488 /// determining whether an NFA state has been used as a "starting" point
489 /// to build a DFA state yet. If it hasn't, then it is mapped to DEAD,
490 /// and since DEAD is specially added and never corresponds to any NFA
491 /// state, it follows that a mapping to DEAD implies the NFA state has
492 /// no corresponding DFA state yet.
493 nfa_to_dfa_id: Vec<StateID>,
494 /// A stack used to traverse the NFA states that make up a single DFA
495 /// state. Traversal occurs until the stack is empty, and we only push to
496 /// the stack when the state ID isn't in 'seen'. Actually, even more than
497 /// that, if we try to push something on to this stack that is already in
498 /// 'seen', then we bail out on construction completely, since it implies
499 /// that the NFA is not one-pass.
500 stack: Vec<(StateID, Epsilons)>,
501 /// The set of NFA states that we've visited via 'stack'.
502 seen: SparseSet,
503 /// Whether a match NFA state has been observed while constructing a
504 /// one-pass DFA state. Once a match state is seen, assuming we are using
505 /// leftmost-first match semantics, then we don't add any more transitions
506 /// to the DFA state we're building.
507 matched: bool,
508 /// The config passed to the builder.
509 ///
510 /// This is duplicated in dfa.config.
511 config: Config,
512 /// The NFA we're building a one-pass DFA from.
513 ///
514 /// This is duplicated in dfa.nfa.
515 nfa: &'a NFA,
516 /// The equivalence classes that make up the alphabet for this DFA>
517 ///
518 /// This is duplicated in dfa.classes.
519 classes: ByteClasses,
520}
521
522impl<'a> InternalBuilder<'a> {
523 /// Create a new builder with an initial empty DFA.
524 fn new(config: Config, nfa: &'a NFA) -> InternalBuilder<'a> {
525 let classes = if !config.get_byte_classes() {
526 // A one-pass DFA will always use the equivalence class map, but
527 // enabling this option is useful for debugging. Namely, this will
528 // cause all transitions to be defined over their actual bytes
529 // instead of an opaque equivalence class identifier. The former is
530 // much easier to grok as a human.
531 ByteClasses::singletons()
532 } else {
533 nfa.byte_classes().clone()
534 };
535 // Normally a DFA alphabet includes the EOI symbol, but we don't need
536 // that in the one-pass DFA since we handle look-around explicitly
537 // without encoding it into the DFA. Thus, we don't need to delay
538 // matches by 1 byte. However, we reuse the space that *would* be used
539 // by the EOI transition by putting match information there (like which
540 // pattern matches and which look-around assertions need to hold). So
541 // this means our real alphabet length is 1 fewer than what the byte
542 // classes report, since we don't use EOI.
543 let alphabet_len = classes.alphabet_len().checked_sub(1).unwrap();
544 let stride2 = classes.stride2();
545 let dfa = DFA {
546 config: config.clone(),
547 nfa: nfa.clone(),
548 table: vec![],
549 starts: vec![],
550 // Since one-pass DFAs have a smaller state ID max than
551 // StateID::MAX, it follows that StateID::MAX is a valid initial
552 // value for min_match_id since no state ID can ever be greater
553 // than it. In the case of a one-pass DFA with no match states, the
554 // min_match_id will keep this sentinel value.
555 min_match_id: StateID::MAX,
556 classes: classes.clone(),
557 alphabet_len,
558 stride2,
559 pateps_offset: alphabet_len,
560 // OK because PatternID::MAX*2 is guaranteed not to overflow.
561 explicit_slot_start: nfa.pattern_len().checked_mul(2).unwrap(),
562 };
563 InternalBuilder {
564 dfa,
565 uncompiled_nfa_ids: vec![],
566 nfa_to_dfa_id: vec![DEAD; nfa.states().len()],
567 stack: vec![],
568 seen: SparseSet::new(nfa.states().len()),
569 matched: false,
570 config,
571 nfa,
572 classes,
573 }
574 }
575
576 /// Build the DFA from the NFA given to this builder. If the NFA is not
577 /// one-pass, then return an error. An error may also be returned if a
578 /// particular limit is exceeded. (Some limits, like the total heap memory
579 /// used, are configurable. Others, like the total patterns or slots, are
580 /// hard-coded based on representational limitations.)
581 fn build(mut self) -> Result<DFA, BuildError> {
582 self.nfa.look_set_any().available().map_err(BuildError::word)?;
583 for look in self.nfa.look_set_any().iter() {
584 // This is a future incompatibility check where if we add any
585 // more look-around assertions, then the one-pass DFA either
586 // needs to reject them (what we do here) or it needs to have its
587 // Transition representation modified to be capable of storing the
588 // new assertions.
589 if look.as_repr() > Look::WordUnicodeNegate.as_repr() {
590 return Err(BuildError::unsupported_look(look));
591 }
592 }
593 if self.nfa.pattern_len().as_u64() > PatternEpsilons::PATTERN_ID_LIMIT
594 {
595 return Err(BuildError::too_many_patterns(
596 PatternEpsilons::PATTERN_ID_LIMIT,
597 ));
598 }
599 if self.nfa.group_info().explicit_slot_len() > Slots::LIMIT {
600 return Err(BuildError::not_one_pass(
601 "too many explicit capturing groups (max is 16)",
602 ));
603 }
604 assert_eq!(DEAD, self.add_empty_state()?);
605
606 // This is where the explicit slots start. We care about this because
607 // we only need to track explicit slots. The implicit slots---two for
608 // each pattern---are tracked as part of the search routine itself.
609 let explicit_slot_start = self.nfa.pattern_len() * 2;
610 self.add_start_state(None, self.nfa.start_anchored())?;
611 if self.config.get_starts_for_each_pattern() {
612 for pid in self.nfa.patterns() {
613 self.add_start_state(
614 Some(pid),
615 self.nfa.start_pattern(pid).unwrap(),
616 )?;
617 }
618 }
619 // NOTE: One wonders what the effects of treating 'uncompiled_nfa_ids'
620 // as a stack are. It is really an unordered *set* of NFA state IDs.
621 // If it, for example, in practice led to discovering whether a regex
622 // was or wasn't one-pass later than if we processed NFA state IDs in
623 // ascending order, then that would make this routine more costly in
624 // the somewhat common case of a regex that isn't one-pass.
625 while let Some(nfa_id) = self.uncompiled_nfa_ids.pop() {
626 let dfa_id = self.nfa_to_dfa_id[nfa_id];
627 // Once we see a match, we keep going, but don't add any new
628 // transitions. Normally we'd just stop, but we have to keep
629 // going in order to verify that our regex is actually one-pass.
630 self.matched = false;
631 // The NFA states we've already explored for this DFA state.
632 self.seen.clear();
633 // The NFA states to explore via epsilon transitions. If we ever
634 // try to push an NFA state that we've already seen, then the NFA
635 // is not one-pass because it implies there are multiple epsilon
636 // transition paths that lead to the same NFA state. In other
637 // words, there is ambiguity.
638 self.stack_push(nfa_id, Epsilons::empty())?;
639 while let Some((id, epsilons)) = self.stack.pop() {
640 match *self.nfa.state(id) {
641 thompson::State::ByteRange { ref trans } => {
642 self.compile_transition(dfa_id, trans, epsilons)?;
643 }
644 thompson::State::Sparse(ref sparse) => {
645 for trans in sparse.transitions.iter() {
646 self.compile_transition(dfa_id, trans, epsilons)?;
647 }
648 }
649 thompson::State::Dense(ref dense) => {
650 for trans in dense.iter() {
651 self.compile_transition(dfa_id, &trans, epsilons)?;
652 }
653 }
654 thompson::State::Look { look, next } => {
655 let looks = epsilons.looks().insert(look);
656 self.stack_push(next, epsilons.set_looks(looks))?;
657 }
658 thompson::State::Union { ref alternates } => {
659 for &sid in alternates.iter().rev() {
660 self.stack_push(sid, epsilons)?;
661 }
662 }
663 thompson::State::BinaryUnion { alt1, alt2 } => {
664 self.stack_push(alt2, epsilons)?;
665 self.stack_push(alt1, epsilons)?;
666 }
667 thompson::State::Capture { next, slot, .. } => {
668 let slot = slot.as_usize();
669 let epsilons = if slot < explicit_slot_start {
670 // If this is an implicit slot, we don't care
671 // about it, since we handle implicit slots in
672 // the search routine. We can get away with that
673 // because there are 2 implicit slots for every
674 // pattern.
675 epsilons
676 } else {
677 // Offset our explicit slots so that they start
678 // at index 0.
679 let offset = slot - explicit_slot_start;
680 epsilons.set_slots(epsilons.slots().insert(offset))
681 };
682 self.stack_push(next, epsilons)?;
683 }
684 thompson::State::Fail => {
685 continue;
686 }
687 thompson::State::Match { pattern_id } => {
688 // If we found two different paths to a match state
689 // for the same DFA state, then we have ambiguity.
690 // Thus, it's not one-pass.
691 if self.matched {
692 return Err(BuildError::not_one_pass(
693 "multiple epsilon transitions to match state",
694 ));
695 }
696 self.matched = true;
697 // Shove the matching pattern ID and the 'epsilons'
698 // into the current DFA state's pattern epsilons. The
699 // 'epsilons' includes the slots we need to capture
700 // before reporting the match and also the conditional
701 // epsilon transitions we need to check before we can
702 // report a match.
703 self.dfa.set_pattern_epsilons(
704 dfa_id,
705 PatternEpsilons::empty()
706 .set_pattern_id(pattern_id)
707 .set_epsilons(epsilons),
708 );
709 // N.B. It is tempting to just bail out here when
710 // compiling a leftmost-first DFA, since we will never
711 // compile any more transitions in that case. But we
712 // actually need to keep going in order to verify that
713 // we actually have a one-pass regex. e.g., We might
714 // see more Match states (e.g., for other patterns)
715 // that imply that we don't have a one-pass regex.
716 // So instead, we mark that we've found a match and
717 // continue on. When we go to compile a new DFA state,
718 // we just skip that part. But otherwise check that the
719 // one-pass property is upheld.
720 }
721 }
722 }
723 }
724 self.shuffle_states();
725 self.dfa.starts.shrink_to_fit();
726 self.dfa.table.shrink_to_fit();
727 Ok(self.dfa)
728 }
729
730 /// Shuffle all match states to the end of the transition table and set
731 /// 'min_match_id' to the ID of the first such match state.
732 ///
733 /// The point of this is to make it extremely cheap to determine whether
734 /// a state is a match state or not. We need to check on this on every
735 /// transition during a search, so it being cheap is important. This
736 /// permits us to check it by simply comparing two state identifiers, as
737 /// opposed to looking for the pattern ID in the state's `PatternEpsilons`.
738 /// (Which requires a memory load and some light arithmetic.)
739 fn shuffle_states(&mut self) {
740 let mut remapper = Remapper::new(&self.dfa);
741 let mut next_dest = self.dfa.last_state_id();
742 for i in (0..self.dfa.state_len()).rev() {
743 let id = StateID::must(i);
744 let is_match =
745 self.dfa.pattern_epsilons(id).pattern_id().is_some();
746 if !is_match {
747 continue;
748 }
749 remapper.swap(&mut self.dfa, next_dest, id);
750 self.dfa.min_match_id = next_dest;
751 next_dest = self.dfa.prev_state_id(next_dest).expect(
752 "match states should be a proper subset of all states",
753 );
754 }
755 remapper.remap(&mut self.dfa);
756 }
757
758 /// Compile the given NFA transition into the DFA state given.
759 ///
760 /// 'Epsilons' corresponds to any conditional epsilon transitions that need
761 /// to be satisfied to follow this transition, and any slots that need to
762 /// be saved if the transition is followed.
763 ///
764 /// If this transition indicates that the NFA is not one-pass, then
765 /// this returns an error. (This occurs, for example, if the DFA state
766 /// already has a transition defined for the same input symbols as the
767 /// given transition, *and* the result of the old and new transitions is
768 /// different.)
769 fn compile_transition(
770 &mut self,
771 dfa_id: StateID,
772 trans: &thompson::Transition,
773 epsilons: Epsilons,
774 ) -> Result<(), BuildError> {
775 let next_dfa_id = self.add_dfa_state_for_nfa_state(trans.next)?;
776 for byte in self
777 .classes
778 .representatives(trans.start..=trans.end)
779 .filter_map(|r| r.as_u8())
780 {
781 let oldtrans = self.dfa.transition(dfa_id, byte);
782 let newtrans =
783 Transition::new(self.matched, next_dfa_id, epsilons);
784 // If the old transition points to the DEAD state, then we know
785 // 'byte' has not been mapped to any transition for this DFA state
786 // yet. So set it unconditionally. Otherwise, we require that the
787 // old and new transitions are equivalent. Otherwise, there is
788 // ambiguity and thus the regex is not one-pass.
789 if oldtrans.state_id() == DEAD {
790 self.dfa.set_transition(dfa_id, byte, newtrans);
791 } else if oldtrans != newtrans {
792 return Err(BuildError::not_one_pass(
793 "conflicting transition",
794 ));
795 }
796 }
797 Ok(())
798 }
799
800 /// Add a start state to the DFA corresponding to the given NFA starting
801 /// state ID.
802 ///
803 /// If adding a state would blow any limits (configured or hard-coded),
804 /// then an error is returned.
805 ///
806 /// If the starting state is an anchored state for a particular pattern,
807 /// then callers must provide the pattern ID for that starting state.
808 /// Callers must also ensure that the first starting state added is the
809 /// start state for all patterns, and then each anchored starting state for
810 /// each pattern (if necessary) added in order. Otherwise, this panics.
811 fn add_start_state(
812 &mut self,
813 pid: Option<PatternID>,
814 nfa_id: StateID,
815 ) -> Result<StateID, BuildError> {
816 match pid {
817 // With no pid, this should be the start state for all patterns
818 // and thus be the first one.
819 None => assert!(self.dfa.starts.is_empty()),
820 // With a pid, we want it to be at self.dfa.starts[pid+1].
821 Some(pid) => assert!(self.dfa.starts.len() == pid.one_more()),
822 }
823 let dfa_id = self.add_dfa_state_for_nfa_state(nfa_id)?;
824 self.dfa.starts.push(dfa_id);
825 Ok(dfa_id)
826 }
827
828 /// Add a new DFA state corresponding to the given NFA state. If adding a
829 /// state would blow any limits (configured or hard-coded), then an error
830 /// is returned. If a DFA state already exists for the given NFA state,
831 /// then that DFA state's ID is returned and no new states are added.
832 ///
833 /// It is not expected that this routine is called for every NFA state.
834 /// Instead, an NFA state ID will usually correspond to the "start" state
835 /// for a sub-graph of the NFA, where all states in the sub-graph are
836 /// reachable via epsilon transitions (conditional or unconditional). That
837 /// sub-graph of NFA states is ultimately what produces a single DFA state.
838 fn add_dfa_state_for_nfa_state(
839 &mut self,
840 nfa_id: StateID,
841 ) -> Result<StateID, BuildError> {
842 // If we've already built a DFA state for the given NFA state, then
843 // just return that. We definitely do not want to have more than one
844 // DFA state in existence for the same NFA state, since all but one of
845 // them will likely become unreachable. And at least some of them are
846 // likely to wind up being incomplete.
847 let existing_dfa_id = self.nfa_to_dfa_id[nfa_id];
848 if existing_dfa_id != DEAD {
849 return Ok(existing_dfa_id);
850 }
851 // If we don't have any DFA state yet, add it and then add the given
852 // NFA state to the list of states to explore.
853 let dfa_id = self.add_empty_state()?;
854 self.nfa_to_dfa_id[nfa_id] = dfa_id;
855 self.uncompiled_nfa_ids.push(nfa_id);
856 Ok(dfa_id)
857 }
858
859 /// Unconditionally add a new empty DFA state. If adding it would exceed
860 /// any limits (configured or hard-coded), then an error is returned. The
861 /// ID of the new state is returned on success.
862 ///
863 /// The added state is *not* a match state.
864 fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
865 let state_limit = Transition::STATE_ID_LIMIT;
866 // Note that unlike dense and lazy DFAs, we specifically do NOT
867 // premultiply our state IDs here. The reason is that we want to pack
868 // our state IDs into 64-bit transitions with other info, so the fewer
869 // the bits we use for state IDs the better. If we premultiply, then
870 // our state ID space shrinks. We justify this by the assumption that
871 // a one-pass DFA is just already doing a fair bit more work than a
872 // normal DFA anyway, so an extra multiplication to compute a state
873 // transition doesn't seem like a huge deal.
874 let next_id = self.dfa.table.len() >> self.dfa.stride2();
875 let id = StateID::new(next_id)
876 .map_err(|_| BuildError::too_many_states(state_limit))?;
877 if id.as_u64() > Transition::STATE_ID_LIMIT {
878 return Err(BuildError::too_many_states(state_limit));
879 }
880 self.dfa
881 .table
882 .extend(core::iter::repeat(Transition(0)).take(self.dfa.stride()));
883 // The default empty value for 'PatternEpsilons' is sadly not all
884 // zeroes. Instead, a special sentinel is used to indicate that there
885 // is no pattern. So we need to explicitly set the pattern epsilons to
886 // the correct "empty" PatternEpsilons.
887 self.dfa.set_pattern_epsilons(id, PatternEpsilons::empty());
888 if let Some(size_limit) = self.config.get_size_limit() {
889 if self.dfa.memory_usage() > size_limit {
890 return Err(BuildError::exceeded_size_limit(size_limit));
891 }
892 }
893 Ok(id)
894 }
895
896 /// Push the given NFA state ID and its corresponding epsilons (slots and
897 /// conditional epsilon transitions) on to a stack for use in a depth first
898 /// traversal of a sub-graph of the NFA.
899 ///
900 /// If the given NFA state ID has already been pushed on to the stack, then
901 /// it indicates the regex is not one-pass and this correspondingly returns
902 /// an error.
903 fn stack_push(
904 &mut self,
905 nfa_id: StateID,
906 epsilons: Epsilons,
907 ) -> Result<(), BuildError> {
908 // If we already have seen a match and we are compiling a leftmost
909 // first DFA, then we shouldn't add any more states to look at. This is
910 // effectively how preference order and non-greediness is implemented.
911 // if !self.config.get_match_kind().continue_past_first_match()
912 // && self.matched
913 // {
914 // return Ok(());
915 // }
916 if !self.seen.insert(nfa_id) {
917 return Err(BuildError::not_one_pass(
918 "multiple epsilon transitions to same state",
919 ));
920 }
921 self.stack.push((nfa_id, epsilons));
922 Ok(())
923 }
924}
925
926/// A one-pass DFA for executing a subset of anchored regex searches while
927/// resolving capturing groups.
928///
929/// A one-pass DFA can be built from an NFA that is one-pass. An NFA is
930/// one-pass when there is never any ambiguity about how to continue a search.
931/// For example, `a*a` is not one-pass because during a search, it's not
932/// possible to know whether to continue matching the `a*` or to move on to
933/// the single `a`. However, `a*b` is one-pass, because for every byte in the
934/// input, it's always clear when to move on from `a*` to `b`.
935///
936/// # Only anchored searches are supported
937///
938/// In this crate, especially for DFAs, unanchored searches are implemented by
939/// treating the pattern as if it had a `(?s-u:.)*?` prefix. While the prefix
940/// is one-pass on its own, adding anything after it, e.g., `(?s-u:.)*?a` will
941/// make the overall pattern not one-pass. Why? Because the `(?s-u:.)` matches
942/// any byte, and there is therefore ambiguity as to when the prefix should
943/// stop matching and something else should start matching.
944///
945/// Therefore, one-pass DFAs do not support unanchored searches. In addition
946/// to many regexes simply not being one-pass, it implies that one-pass DFAs
947/// have limited utility. With that said, when a one-pass DFA can be used, it
948/// can potentially provide a dramatic speed up over alternatives like the
949/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
950/// and the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). In particular,
951/// a one-pass DFA is the only DFA capable of reporting the spans of matching
952/// capturing groups.
953///
954/// To clarify, when we say that unanchored searches are not supported, what
955/// that actually means is:
956///
957/// * The high level routines, [`DFA::is_match`] and [`DFA::captures`], always
958/// do anchored searches.
959/// * Since iterators are most useful in the context of unanchored searches,
960/// there is no `DFA::captures_iter` method.
961/// * For lower level routines like [`DFA::try_search`], an error will be
962/// returned if the given [`Input`] is configured to do an unanchored search or
963/// search for an invalid pattern ID. (Note that an [`Input`] is configured to
964/// do an unanchored search by default, so just giving a `Input::new` is
965/// guaranteed to return an error.)
966///
967/// # Other limitations
968///
969/// In addition to the [configurable heap limit](Config::size_limit) and
970/// the requirement that a regex pattern be one-pass, there are some other
971/// limitations:
972///
973/// * There is an internal limit on the total number of explicit capturing
974/// groups that appear across all patterns. It is somewhat small and there is
975/// no way to configure it. If your pattern(s) exceed this limit, then building
976/// a one-pass DFA will fail.
977/// * If the number of patterns exceeds an internal unconfigurable limit, then
978/// building a one-pass DFA will fail. This limit is quite large and you're
979/// unlikely to hit it.
980/// * If the total number of states exceeds an internal unconfigurable limit,
981/// then building a one-pass DFA will fail. This limit is quite large and
982/// you're unlikely to hit it.
983///
984/// # Other examples of regexes that aren't one-pass
985///
986/// One particularly unfortunate example is that enabling Unicode can cause
987/// regexes that were one-pass to no longer be one-pass. Consider the regex
988/// `(?-u)\w*\s` for example. It is one-pass because there is exactly no
989/// overlap between the ASCII definitions of `\w` and `\s`. But `\w*\s`
990/// (i.e., with Unicode enabled) is *not* one-pass because `\w` and `\s` get
991/// translated to UTF-8 automatons. And while the *codepoints* in `\w` and `\s`
992/// do not overlap, the underlying UTF-8 encodings do. Indeed, because of the
993/// overlap between UTF-8 automata, the use of Unicode character classes will
994/// tend to vastly increase the likelihood of a regex not being one-pass.
995///
996/// # How does one know if a regex is one-pass or not?
997///
998/// At the time of writing, the only way to know is to try and build a one-pass
999/// DFA. The one-pass property is checked while constructing the DFA.
1000///
1001/// This does mean that you might potentially waste some CPU cycles and memory
1002/// by optimistically trying to build a one-pass DFA. But this is currently the
1003/// only way. In the future, building a one-pass DFA might be able to use some
1004/// heuristics to detect common violations of the one-pass property and bail
1005/// more quickly.
1006///
1007/// # Resource usage
1008///
1009/// Unlike a general DFA, a one-pass DFA has stricter bounds on its resource
1010/// usage. Namely, construction of a one-pass DFA has a time and space
1011/// complexity of `O(n)`, where `n ~ nfa.states().len()`. (A general DFA's time
1012/// and space complexity is `O(2^n)`.) This smaller time bound is achieved
1013/// because there is at most one DFA state created for each NFA state. If
1014/// additional DFA states would be required, then the pattern is not one-pass
1015/// and construction will fail.
1016///
1017/// Note though that currently, this DFA uses a fully dense representation.
1018/// This means that while its space complexity is no worse than an NFA, it may
1019/// in practice use more memory because of higher constant factors. The reason
1020/// for this trade off is two-fold. Firstly, a dense representation makes the
1021/// search faster. Secondly, the bigger an NFA, the more unlikely it is to be
1022/// one-pass. Therefore, most one-pass DFAs are usually pretty small.
1023///
1024/// # Example
1025///
1026/// This example shows that the one-pass DFA implements Unicode word boundaries
1027/// correctly while simultaneously reporting spans for capturing groups that
1028/// participate in a match. (This is the only DFA that implements full support
1029/// for Unicode word boundaries.)
1030///
1031/// ```
1032/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1033/// use regex_automata::{dfa::onepass::DFA, Match, Span};
1034///
1035/// let re = DFA::new(r"\b(?P<first>\w+)[[:space:]]+(?P<last>\w+)\b")?;
1036/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1037///
1038/// re.captures(&mut cache, "Шерлок Холмс", &mut caps);
1039/// assert_eq!(Some(Match::must(0, 0..23)), caps.get_match());
1040/// assert_eq!(Some(Span::from(0..12)), caps.get_group_by_name("first"));
1041/// assert_eq!(Some(Span::from(13..23)), caps.get_group_by_name("last"));
1042/// # Ok::<(), Box<dyn std::error::Error>>(())
1043/// ```
1044///
1045/// # Example: iteration
1046///
1047/// Unlike other regex engines in this crate, this one does not provide
1048/// iterator search functions. This is because a one-pass DFA only supports
1049/// anchored searches, and so iterator functions are generally not applicable.
1050///
1051/// However, if you know that all of your matches are
1052/// directly adjacent, then an iterator can be used. The
1053/// [`util::iter::Searcher`](crate::util::iter::Searcher) type can be used for
1054/// this purpose:
1055///
1056/// ```
1057/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1058/// use regex_automata::{
1059/// dfa::onepass::DFA,
1060/// util::iter::Searcher,
1061/// Anchored, Input, Span,
1062/// };
1063///
1064/// let re = DFA::new(r"\w(\d)\w")?;
1065/// let (mut cache, caps) = (re.create_cache(), re.create_captures());
1066/// let input = Input::new("a1zb2yc3x").anchored(Anchored::Yes);
1067///
1068/// let mut it = Searcher::new(input).into_captures_iter(caps, |input, caps| {
1069/// Ok(re.try_search(&mut cache, input, caps)?)
1070/// }).infallible();
1071/// let caps0 = it.next().unwrap();
1072/// assert_eq!(Some(Span::from(1..2)), caps0.get_group(1));
1073///
1074/// # Ok::<(), Box<dyn std::error::Error>>(())
1075/// ```
1076#[derive(Clone)]
1077pub struct DFA {
1078 /// The configuration provided by the caller.
1079 config: Config,
1080 /// The NFA used to build this DFA.
1081 ///
1082 /// NOTE: We probably don't need to store the NFA here, but we use enough
1083 /// bits from it that it's convenient to do so. And there really isn't much
1084 /// cost to doing so either, since an NFA is reference counted internally.
1085 nfa: NFA,
1086 /// The transition table. Given a state ID 's' and a byte of haystack 'b',
1087 /// the next state is `table[sid + classes[byte]]`.
1088 ///
1089 /// The stride of this table (i.e., the number of columns) is always
1090 /// a power of 2, even if the alphabet length is smaller. This makes
1091 /// converting between state IDs and state indices very cheap.
1092 ///
1093 /// Note that the stride always includes room for one extra "transition"
1094 /// that isn't actually a transition. It is a 'PatternEpsilons' that is
1095 /// used for match states only. Because of this, the maximum number of
1096 /// active columns in the transition table is 257, which means the maximum
1097 /// stride is 512 (the next power of 2 greater than or equal to 257).
1098 table: Vec<Transition>,
1099 /// The DFA state IDs of the starting states.
1100 ///
1101 /// `starts[0]` is always present and corresponds to the starting state
1102 /// when searching for matches of any pattern in the DFA.
1103 ///
1104 /// `starts[i]` where i>0 corresponds to the starting state for the pattern
1105 /// ID 'i-1'. These starting states are optional.
1106 starts: Vec<StateID>,
1107 /// Every state ID >= this value corresponds to a match state.
1108 ///
1109 /// This is what a search uses to detect whether a state is a match state
1110 /// or not. It requires only a simple comparison instead of bit-unpacking
1111 /// the PatternEpsilons from every state.
1112 min_match_id: StateID,
1113 /// The alphabet of this DFA, split into equivalence classes. Bytes in the
1114 /// same equivalence class can never discriminate between a match and a
1115 /// non-match.
1116 classes: ByteClasses,
1117 /// The number of elements in each state in the transition table. This may
1118 /// be less than the stride, since the stride is always a power of 2 and
1119 /// the alphabet length can be anything up to and including 256.
1120 alphabet_len: usize,
1121 /// The number of columns in the transition table, expressed as a power of
1122 /// 2.
1123 stride2: usize,
1124 /// The offset at which the PatternEpsilons for a match state is stored in
1125 /// the transition table.
1126 ///
1127 /// PERF: One wonders whether it would be better to put this in a separate
1128 /// allocation, since only match states have a non-empty PatternEpsilons
1129 /// and the number of match states tends be dwarfed by the number of
1130 /// non-match states. So this would save '8*len(non_match_states)' for each
1131 /// DFA. The question is whether moving this to a different allocation will
1132 /// lead to a perf hit during searches. You might think dealing with match
1133 /// states is rare, but some regexes spend a lot of time in match states
1134 /// gobbling up input. But... match state handling is already somewhat
1135 /// expensive, so maybe this wouldn't do much? Either way, it's worth
1136 /// experimenting.
1137 pateps_offset: usize,
1138 /// The first explicit slot index. This refers to the first slot appearing
1139 /// immediately after the last implicit slot. It is always 'patterns.len()
1140 /// * 2'.
1141 ///
1142 /// We record this because we only store the explicit slots in our DFA
1143 /// transition table that need to be saved. Implicit slots are handled
1144 /// automatically as part of the search.
1145 explicit_slot_start: usize,
1146}
1147
1148impl DFA {
1149 /// Parse the given regular expression using the default configuration and
1150 /// return the corresponding one-pass DFA.
1151 ///
1152 /// If you want a non-default configuration, then use the [`Builder`] to
1153 /// set your own configuration.
1154 ///
1155 /// # Example
1156 ///
1157 /// ```
1158 /// use regex_automata::{dfa::onepass::DFA, Match};
1159 ///
1160 /// let re = DFA::new("foo[0-9]+bar")?;
1161 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1162 ///
1163 /// re.captures(&mut cache, "foo12345barzzz", &mut caps);
1164 /// assert_eq!(Some(Match::must(0, 0..11)), caps.get_match());
1165 /// # Ok::<(), Box<dyn std::error::Error>>(())
1166 /// ```
1167 #[cfg(feature = "syntax")]
1168 #[inline]
1169 pub fn new(pattern: &str) -> Result<DFA, BuildError> {
1170 DFA::builder().build(pattern)
1171 }
1172
1173 /// Like `new`, but parses multiple patterns into a single "multi regex."
1174 /// This similarly uses the default regex configuration.
1175 ///
1176 /// # Example
1177 ///
1178 /// ```
1179 /// use regex_automata::{dfa::onepass::DFA, Match};
1180 ///
1181 /// let re = DFA::new_many(&["[a-z]+", "[0-9]+"])?;
1182 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1183 ///
1184 /// re.captures(&mut cache, "abc123", &mut caps);
1185 /// assert_eq!(Some(Match::must(0, 0..3)), caps.get_match());
1186 ///
1187 /// re.captures(&mut cache, "123abc", &mut caps);
1188 /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
1189 ///
1190 /// # Ok::<(), Box<dyn std::error::Error>>(())
1191 /// ```
1192 #[cfg(feature = "syntax")]
1193 #[inline]
1194 pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
1195 DFA::builder().build_many(patterns)
1196 }
1197
1198 /// Like `new`, but builds a one-pass DFA directly from an NFA. This is
1199 /// useful if you already have an NFA, or even if you hand-assembled the
1200 /// NFA.
1201 ///
1202 /// # Example
1203 ///
1204 /// This shows how to hand assemble a regular expression via its HIR,
1205 /// compile an NFA from it and build a one-pass DFA from the NFA.
1206 ///
1207 /// ```
1208 /// use regex_automata::{
1209 /// dfa::onepass::DFA,
1210 /// nfa::thompson::NFA,
1211 /// Match,
1212 /// };
1213 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
1214 ///
1215 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
1216 /// ClassBytesRange::new(b'0', b'9'),
1217 /// ClassBytesRange::new(b'A', b'Z'),
1218 /// ClassBytesRange::new(b'_', b'_'),
1219 /// ClassBytesRange::new(b'a', b'z'),
1220 /// ])));
1221 ///
1222 /// let config = NFA::config().nfa_size_limit(Some(1_000));
1223 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
1224 ///
1225 /// let re = DFA::new_from_nfa(nfa)?;
1226 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1227 /// let expected = Some(Match::must(0, 0..1));
1228 /// re.captures(&mut cache, "A", &mut caps);
1229 /// assert_eq!(expected, caps.get_match());
1230 ///
1231 /// # Ok::<(), Box<dyn std::error::Error>>(())
1232 /// ```
1233 pub fn new_from_nfa(nfa: NFA) -> Result<DFA, BuildError> {
1234 DFA::builder().build_from_nfa(nfa)
1235 }
1236
1237 /// Create a new one-pass DFA that matches every input.
1238 ///
1239 /// # Example
1240 ///
1241 /// ```
1242 /// use regex_automata::{dfa::onepass::DFA, Match};
1243 ///
1244 /// let dfa = DFA::always_match()?;
1245 /// let mut cache = dfa.create_cache();
1246 /// let mut caps = dfa.create_captures();
1247 ///
1248 /// let expected = Match::must(0, 0..0);
1249 /// dfa.captures(&mut cache, "", &mut caps);
1250 /// assert_eq!(Some(expected), caps.get_match());
1251 /// dfa.captures(&mut cache, "foo", &mut caps);
1252 /// assert_eq!(Some(expected), caps.get_match());
1253 /// # Ok::<(), Box<dyn std::error::Error>>(())
1254 /// ```
1255 pub fn always_match() -> Result<DFA, BuildError> {
1256 let nfa = thompson::NFA::always_match();
1257 Builder::new().build_from_nfa(nfa)
1258 }
1259
1260 /// Create a new one-pass DFA that never matches any input.
1261 ///
1262 /// # Example
1263 ///
1264 /// ```
1265 /// use regex_automata::dfa::onepass::DFA;
1266 ///
1267 /// let dfa = DFA::never_match()?;
1268 /// let mut cache = dfa.create_cache();
1269 /// let mut caps = dfa.create_captures();
1270 ///
1271 /// dfa.captures(&mut cache, "", &mut caps);
1272 /// assert_eq!(None, caps.get_match());
1273 /// dfa.captures(&mut cache, "foo", &mut caps);
1274 /// assert_eq!(None, caps.get_match());
1275 /// # Ok::<(), Box<dyn std::error::Error>>(())
1276 /// ```
1277 pub fn never_match() -> Result<DFA, BuildError> {
1278 let nfa = thompson::NFA::never_match();
1279 Builder::new().build_from_nfa(nfa)
1280 }
1281
1282 /// Return a default configuration for a DFA.
1283 ///
1284 /// This is a convenience routine to avoid needing to import the `Config`
1285 /// type when customizing the construction of a DFA.
1286 ///
1287 /// # Example
1288 ///
1289 /// This example shows how to change the match semantics of this DFA from
1290 /// its default "leftmost first" to "all." When using "all," non-greediness
1291 /// doesn't apply and neither does preference order matching. Instead, the
1292 /// longest match possible is always returned. (Although, by construction,
1293 /// it's impossible for a one-pass DFA to have a different answer for
1294 /// "preference order" vs "longest match.")
1295 ///
1296 /// ```
1297 /// use regex_automata::{dfa::onepass::DFA, Match, MatchKind};
1298 ///
1299 /// let re = DFA::builder()
1300 /// .configure(DFA::config().match_kind(MatchKind::All))
1301 /// .build(r"(abc)+?")?;
1302 /// let mut cache = re.create_cache();
1303 /// let mut caps = re.create_captures();
1304 ///
1305 /// re.captures(&mut cache, "abcabc", &mut caps);
1306 /// // Normally, the non-greedy repetition would give us a 0..3 match.
1307 /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
1308 /// # Ok::<(), Box<dyn std::error::Error>>(())
1309 /// ```
1310 #[inline]
1311 pub fn config() -> Config {
1312 Config::new()
1313 }
1314
1315 /// Return a builder for configuring the construction of a DFA.
1316 ///
1317 /// This is a convenience routine to avoid needing to import the
1318 /// [`Builder`] type in common cases.
1319 ///
1320 /// # Example
1321 ///
1322 /// This example shows how to use the builder to disable UTF-8 mode.
1323 ///
1324 /// ```
1325 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1326 /// use regex_automata::{
1327 /// dfa::onepass::DFA,
1328 /// nfa::thompson,
1329 /// util::syntax,
1330 /// Match,
1331 /// };
1332 ///
1333 /// let re = DFA::builder()
1334 /// .syntax(syntax::Config::new().utf8(false))
1335 /// .thompson(thompson::Config::new().utf8(false))
1336 /// .build(r"foo(?-u:[^b])ar.*")?;
1337 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1338 ///
1339 /// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n";
1340 /// let expected = Some(Match::must(0, 0..8));
1341 /// re.captures(&mut cache, haystack, &mut caps);
1342 /// assert_eq!(expected, caps.get_match());
1343 ///
1344 /// # Ok::<(), Box<dyn std::error::Error>>(())
1345 /// ```
1346 #[inline]
1347 pub fn builder() -> Builder {
1348 Builder::new()
1349 }
1350
1351 /// Create a new empty set of capturing groups that is guaranteed to be
1352 /// valid for the search APIs on this DFA.
1353 ///
1354 /// A `Captures` value created for a specific DFA cannot be used with any
1355 /// other DFA.
1356 ///
1357 /// This is a convenience function for [`Captures::all`]. See the
1358 /// [`Captures`] documentation for an explanation of its alternative
1359 /// constructors that permit the DFA to do less work during a search, and
1360 /// thus might make it faster.
1361 #[inline]
1362 pub fn create_captures(&self) -> Captures {
1363 Captures::all(self.nfa.group_info().clone())
1364 }
1365
1366 /// Create a new cache for this DFA.
1367 ///
1368 /// The cache returned should only be used for searches for this
1369 /// DFA. If you want to reuse the cache for another DFA, then you
1370 /// must call [`Cache::reset`] with that DFA (or, equivalently,
1371 /// [`DFA::reset_cache`]).
1372 #[inline]
1373 pub fn create_cache(&self) -> Cache {
1374 Cache::new(self)
1375 }
1376
1377 /// Reset the given cache such that it can be used for searching with the
1378 /// this DFA (and only this DFA).
1379 ///
1380 /// A cache reset permits reusing memory already allocated in this cache
1381 /// with a different DFA.
1382 ///
1383 /// # Example
1384 ///
1385 /// This shows how to re-purpose a cache for use with a different DFA.
1386 ///
1387 /// ```
1388 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1389 /// use regex_automata::{dfa::onepass::DFA, Match};
1390 ///
1391 /// let re1 = DFA::new(r"\w")?;
1392 /// let re2 = DFA::new(r"\W")?;
1393 /// let mut caps1 = re1.create_captures();
1394 /// let mut caps2 = re2.create_captures();
1395 ///
1396 /// let mut cache = re1.create_cache();
1397 /// assert_eq!(
1398 /// Some(Match::must(0, 0..2)),
1399 /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
1400 /// );
1401 ///
1402 /// // Using 'cache' with re2 is not allowed. It may result in panics or
1403 /// // incorrect results. In order to re-purpose the cache, we must reset
1404 /// // it with the one-pass DFA we'd like to use it with.
1405 /// //
1406 /// // Similarly, after this reset, using the cache with 're1' is also not
1407 /// // allowed.
1408 /// re2.reset_cache(&mut cache);
1409 /// assert_eq!(
1410 /// Some(Match::must(0, 0..3)),
1411 /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
1412 /// );
1413 ///
1414 /// # Ok::<(), Box<dyn std::error::Error>>(())
1415 /// ```
1416 #[inline]
1417 pub fn reset_cache(&self, cache: &mut Cache) {
1418 cache.reset(self);
1419 }
1420
1421 /// Return the config for this one-pass DFA.
1422 #[inline]
1423 pub fn get_config(&self) -> &Config {
1424 &self.config
1425 }
1426
1427 /// Returns a reference to the underlying NFA.
1428 #[inline]
1429 pub fn get_nfa(&self) -> &NFA {
1430 &self.nfa
1431 }
1432
1433 /// Returns the total number of patterns compiled into this DFA.
1434 ///
1435 /// In the case of a DFA that contains no patterns, this returns `0`.
1436 #[inline]
1437 pub fn pattern_len(&self) -> usize {
1438 self.get_nfa().pattern_len()
1439 }
1440
1441 /// Returns the total number of states in this one-pass DFA.
1442 ///
1443 /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
1444 /// a low level DFA API. Therefore, this routine has little use other than
1445 /// being informational.
1446 #[inline]
1447 pub fn state_len(&self) -> usize {
1448 self.table.len() >> self.stride2()
1449 }
1450
1451 /// Returns the total number of elements in the alphabet for this DFA.
1452 ///
1453 /// That is, this returns the total number of transitions that each
1454 /// state in this DFA must have. The maximum alphabet size is 256, which
1455 /// corresponds to each possible byte value.
1456 ///
1457 /// The alphabet size may be less than 256 though, and unless
1458 /// [`Config::byte_classes`] is disabled, it is typically must less than
1459 /// 256. Namely, bytes are grouped into equivalence classes such that no
1460 /// two bytes in the same class can distinguish a match from a non-match.
1461 /// For example, in the regex `^[a-z]+$`, the ASCII bytes `a-z` could
1462 /// all be in the same equivalence class. This leads to a massive space
1463 /// savings.
1464 ///
1465 /// Note though that the alphabet length does _not_ necessarily equal the
1466 /// total stride space taken up by a single DFA state in the transition
1467 /// table. Namely, for performance reasons, the stride is always the
1468 /// smallest power of two that is greater than or equal to the alphabet
1469 /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
1470 /// often more useful. The alphabet length is typically useful only for
1471 /// informational purposes.
1472 ///
1473 /// Note also that unlike dense or sparse DFAs, a one-pass DFA does
1474 /// not have a special end-of-input (EOI) transition. This is because
1475 /// a one-pass DFA handles look-around assertions explicitly (like the
1476 /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)) and does not build
1477 /// them into the transitions of the DFA.
1478 #[inline]
1479 pub fn alphabet_len(&self) -> usize {
1480 self.alphabet_len
1481 }
1482
1483 /// Returns the total stride for every state in this DFA, expressed as the
1484 /// exponent of a power of 2. The stride is the amount of space each state
1485 /// takes up in the transition table, expressed as a number of transitions.
1486 /// (Unused transitions map to dead states.)
1487 ///
1488 /// The stride of a DFA is always equivalent to the smallest power of
1489 /// 2 that is greater than or equal to the DFA's alphabet length. This
1490 /// definition uses extra space, but possibly permits faster translation
1491 /// between state identifiers and their corresponding offsets in this DFA's
1492 /// transition table.
1493 ///
1494 /// For example, if the DFA's stride is 16 transitions, then its `stride2`
1495 /// is `4` since `2^4 = 16`.
1496 ///
1497 /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
1498 /// while the maximum `stride2` value is `9` (corresponding to a stride
1499 /// of `512`). The maximum in theory should be `8`, but because of some
1500 /// implementation quirks that may be relaxed in the future, it is one more
1501 /// than `8`. (Do note that a maximal stride is incredibly rare, as it
1502 /// would imply that there is almost no redundant in the regex pattern.)
1503 ///
1504 /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
1505 /// a low level DFA API. Therefore, this routine has little use other than
1506 /// being informational.
1507 #[inline]
1508 pub fn stride2(&self) -> usize {
1509 self.stride2
1510 }
1511
1512 /// Returns the total stride for every state in this DFA. This corresponds
1513 /// to the total number of transitions used by each state in this DFA's
1514 /// transition table.
1515 ///
1516 /// Please see [`DFA::stride2`] for more information. In particular, this
1517 /// returns the stride as the number of transitions, where as `stride2`
1518 /// returns it as the exponent of a power of 2.
1519 ///
1520 /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
1521 /// a low level DFA API. Therefore, this routine has little use other than
1522 /// being informational.
1523 #[inline]
1524 pub fn stride(&self) -> usize {
1525 1 << self.stride2()
1526 }
1527
1528 /// Returns the memory usage, in bytes, of this DFA.
1529 ///
1530 /// The memory usage is computed based on the number of bytes used to
1531 /// represent this DFA.
1532 ///
1533 /// This does **not** include the stack size used up by this DFA. To
1534 /// compute that, use `std::mem::size_of::<onepass::DFA>()`.
1535 #[inline]
1536 pub fn memory_usage(&self) -> usize {
1537 use core::mem::size_of;
1538
1539 self.table.len() * size_of::<Transition>()
1540 + self.starts.len() * size_of::<StateID>()
1541 }
1542}
1543
1544impl DFA {
1545 /// Executes an anchored leftmost forward search, and returns true if and
1546 /// only if this one-pass DFA matches the given haystack.
1547 ///
1548 /// This routine may short circuit if it knows that scanning future
1549 /// input will never lead to a different result. In particular, if the
1550 /// underlying DFA enters a match state, then this routine will return
1551 /// `true` immediately without inspecting any future input. (Consider how
1552 /// this might make a difference given the regex `a+` on the haystack
1553 /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
1554 /// but routines like `find` need to continue searching because `+` is
1555 /// greedy by default.)
1556 ///
1557 /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
1558 /// given configuration was [`Anchored::No`] (which is the default).
1559 ///
1560 /// # Panics
1561 ///
1562 /// This routine panics if the search could not complete. This can occur
1563 /// in the following circumstances:
1564 ///
1565 /// * When the provided `Input` configuration is not supported. For
1566 /// example, by providing an unsupported anchor mode. Concretely,
1567 /// this occurs when using [`Anchored::Pattern`] without enabling
1568 /// [`Config::starts_for_each_pattern`].
1569 ///
1570 /// When a search panics, callers cannot know whether a match exists or
1571 /// not.
1572 ///
1573 /// Use [`DFA::try_search`] if you want to handle these panics as error
1574 /// values instead.
1575 ///
1576 /// # Example
1577 ///
1578 /// This shows basic usage:
1579 ///
1580 /// ```
1581 /// use regex_automata::dfa::onepass::DFA;
1582 ///
1583 /// let re = DFA::new("foo[0-9]+bar")?;
1584 /// let mut cache = re.create_cache();
1585 ///
1586 /// assert!(re.is_match(&mut cache, "foo12345bar"));
1587 /// assert!(!re.is_match(&mut cache, "foobar"));
1588 /// # Ok::<(), Box<dyn std::error::Error>>(())
1589 /// ```
1590 ///
1591 /// # Example: consistency with search APIs
1592 ///
1593 /// `is_match` is guaranteed to return `true` whenever `captures` returns
1594 /// a match. This includes searches that are executed entirely within a
1595 /// codepoint:
1596 ///
1597 /// ```
1598 /// use regex_automata::{dfa::onepass::DFA, Input};
1599 ///
1600 /// let re = DFA::new("a*")?;
1601 /// let mut cache = re.create_cache();
1602 ///
1603 /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
1604 /// # Ok::<(), Box<dyn std::error::Error>>(())
1605 /// ```
1606 ///
1607 /// Notice that when UTF-8 mode is disabled, then the above reports a
1608 /// match because the restriction against zero-width matches that split a
1609 /// codepoint has been lifted:
1610 ///
1611 /// ```
1612 /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Input};
1613 ///
1614 /// let re = DFA::builder()
1615 /// .thompson(NFA::config().utf8(false))
1616 /// .build("a*")?;
1617 /// let mut cache = re.create_cache();
1618 ///
1619 /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
1620 /// # Ok::<(), Box<dyn std::error::Error>>(())
1621 /// ```
1622 #[inline]
1623 pub fn is_match<'h, I: Into<Input<'h>>>(
1624 &self,
1625 cache: &mut Cache,
1626 input: I,
1627 ) -> bool {
1628 let mut input = input.into().earliest(true);
1629 if matches!(input.get_anchored(), Anchored::No) {
1630 input.set_anchored(Anchored::Yes);
1631 }
1632 self.try_search_slots(cache, &input, &mut []).unwrap().is_some()
1633 }
1634
1635 /// Executes an anchored leftmost forward search, and returns a `Match` if
1636 /// and only if this one-pass DFA matches the given haystack.
1637 ///
1638 /// This routine only includes the overall match span. To get access to the
1639 /// individual spans of each capturing group, use [`DFA::captures`].
1640 ///
1641 /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
1642 /// given configuration was [`Anchored::No`] (which is the default).
1643 ///
1644 /// # Panics
1645 ///
1646 /// This routine panics if the search could not complete. This can occur
1647 /// in the following circumstances:
1648 ///
1649 /// * When the provided `Input` configuration is not supported. For
1650 /// example, by providing an unsupported anchor mode. Concretely,
1651 /// this occurs when using [`Anchored::Pattern`] without enabling
1652 /// [`Config::starts_for_each_pattern`].
1653 ///
1654 /// When a search panics, callers cannot know whether a match exists or
1655 /// not.
1656 ///
1657 /// Use [`DFA::try_search`] if you want to handle these panics as error
1658 /// values instead.
1659 ///
1660 /// # Example
1661 ///
1662 /// Leftmost first match semantics corresponds to the match with the
1663 /// smallest starting offset, but where the end offset is determined by
1664 /// preferring earlier branches in the original regular expression. For
1665 /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
1666 /// will match `Samwise` in `Samwise`.
1667 ///
1668 /// Generally speaking, the "leftmost first" match is how most backtracking
1669 /// regular expressions tend to work. This is in contrast to POSIX-style
1670 /// regular expressions that yield "leftmost longest" matches. Namely,
1671 /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
1672 /// leftmost longest semantics. (This crate does not currently support
1673 /// leftmost longest semantics.)
1674 ///
1675 /// ```
1676 /// use regex_automata::{dfa::onepass::DFA, Match};
1677 ///
1678 /// let re = DFA::new("foo[0-9]+")?;
1679 /// let mut cache = re.create_cache();
1680 /// let expected = Match::must(0, 0..8);
1681 /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
1682 ///
1683 /// // Even though a match is found after reading the first byte (`a`),
1684 /// // the leftmost first match semantics demand that we find the earliest
1685 /// // match that prefers earlier parts of the pattern over later parts.
1686 /// let re = DFA::new("abc|a")?;
1687 /// let mut cache = re.create_cache();
1688 /// let expected = Match::must(0, 0..3);
1689 /// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
1690 ///
1691 /// # Ok::<(), Box<dyn std::error::Error>>(())
1692 /// ```
1693 #[inline]
1694 pub fn find<'h, I: Into<Input<'h>>>(
1695 &self,
1696 cache: &mut Cache,
1697 input: I,
1698 ) -> Option<Match> {
1699 let mut input = input.into();
1700 if matches!(input.get_anchored(), Anchored::No) {
1701 input.set_anchored(Anchored::Yes);
1702 }
1703 if self.get_nfa().pattern_len() == 1 {
1704 let mut slots = [None, None];
1705 let pid =
1706 self.try_search_slots(cache, &input, &mut slots).unwrap()?;
1707 let start = slots[0].unwrap().get();
1708 let end = slots[1].unwrap().get();
1709 return Some(Match::new(pid, Span { start, end }));
1710 }
1711 let ginfo = self.get_nfa().group_info();
1712 let slots_len = ginfo.implicit_slot_len();
1713 let mut slots = vec![None; slots_len];
1714 let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?;
1715 let start = slots[pid.as_usize() * 2].unwrap().get();
1716 let end = slots[pid.as_usize() * 2 + 1].unwrap().get();
1717 Some(Match::new(pid, Span { start, end }))
1718 }
1719
1720 /// Executes an anchored leftmost forward search and writes the spans
1721 /// of capturing groups that participated in a match into the provided
1722 /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
1723 /// is guaranteed to return `false`.
1724 ///
1725 /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
1726 /// given configuration was [`Anchored::No`] (which is the default).
1727 ///
1728 /// # Panics
1729 ///
1730 /// This routine panics if the search could not complete. This can occur
1731 /// in the following circumstances:
1732 ///
1733 /// * When the provided `Input` configuration is not supported. For
1734 /// example, by providing an unsupported anchor mode. Concretely,
1735 /// this occurs when using [`Anchored::Pattern`] without enabling
1736 /// [`Config::starts_for_each_pattern`].
1737 ///
1738 /// When a search panics, callers cannot know whether a match exists or
1739 /// not.
1740 ///
1741 /// Use [`DFA::try_search`] if you want to handle these panics as error
1742 /// values instead.
1743 ///
1744 /// # Example
1745 ///
1746 /// This shows a simple example of a one-pass regex that extracts
1747 /// capturing group spans.
1748 ///
1749 /// ```
1750 /// use regex_automata::{dfa::onepass::DFA, Match, Span};
1751 ///
1752 /// let re = DFA::new(
1753 /// // Notice that we use ASCII here. The corresponding Unicode regex
1754 /// // is sadly not one-pass.
1755 /// "(?P<first>[[:alpha:]]+)[[:space:]]+(?P<last>[[:alpha:]]+)",
1756 /// )?;
1757 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1758 ///
1759 /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
1760 /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match());
1761 /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1));
1762 /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last"));
1763 ///
1764 /// # Ok::<(), Box<dyn std::error::Error>>(())
1765 /// ```
1766 #[inline]
1767 pub fn captures<'h, I: Into<Input<'h>>>(
1768 &self,
1769 cache: &mut Cache,
1770 input: I,
1771 caps: &mut Captures,
1772 ) {
1773 let mut input = input.into();
1774 if matches!(input.get_anchored(), Anchored::No) {
1775 input.set_anchored(Anchored::Yes);
1776 }
1777 self.try_search(cache, &input, caps).unwrap();
1778 }
1779
1780 /// Executes an anchored leftmost forward search and writes the spans
1781 /// of capturing groups that participated in a match into the provided
1782 /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
1783 /// is guaranteed to return `false`.
1784 ///
1785 /// The differences with [`DFA::captures`] are:
1786 ///
1787 /// 1. This returns an error instead of panicking if the search fails.
1788 /// 2. Accepts an `&Input` instead of a `Into<Input>`. This permits reusing
1789 /// the same input for multiple searches, which _may_ be important for
1790 /// latency.
1791 /// 3. This does not automatically change the [`Anchored`] mode from `No`
1792 /// to `Yes`. Instead, if [`Input::anchored`] is `Anchored::No`, then an
1793 /// error is returned.
1794 ///
1795 /// # Errors
1796 ///
1797 /// This routine errors if the search could not complete. This can occur
1798 /// in the following circumstances:
1799 ///
1800 /// * When the provided `Input` configuration is not supported. For
1801 /// example, by providing an unsupported anchor mode. Concretely,
1802 /// this occurs when using [`Anchored::Pattern`] without enabling
1803 /// [`Config::starts_for_each_pattern`].
1804 ///
1805 /// When a search returns an error, callers cannot know whether a match
1806 /// exists or not.
1807 ///
1808 /// # Example: specific pattern search
1809 ///
1810 /// This example shows how to build a multi-regex that permits searching
1811 /// for specific patterns. Note that this is somewhat less useful than
1812 /// in other regex engines, since a one-pass DFA by definition has no
1813 /// ambiguity about which pattern can match at a position. That is, if it
1814 /// were possible for two different patterns to match at the same starting
1815 /// position, then the multi-regex would not be one-pass and construction
1816 /// would have failed.
1817 ///
1818 /// Nevertheless, this can still be useful if you only care about matches
1819 /// for a specific pattern, and want the DFA to report "no match" even if
1820 /// some other pattern would have matched.
1821 ///
1822 /// Note that in order to make use of this functionality,
1823 /// [`Config::starts_for_each_pattern`] must be enabled. It is disabled
1824 /// by default since it may result in higher memory usage.
1825 ///
1826 /// ```
1827 /// use regex_automata::{
1828 /// dfa::onepass::DFA, Anchored, Input, Match, PatternID,
1829 /// };
1830 ///
1831 /// let re = DFA::builder()
1832 /// .configure(DFA::config().starts_for_each_pattern(true))
1833 /// .build_many(&["[a-z]+", "[0-9]+"])?;
1834 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1835 /// let haystack = "123abc";
1836 /// let input = Input::new(haystack).anchored(Anchored::Yes);
1837 ///
1838 /// // A normal multi-pattern search will show pattern 1 matches.
1839 /// re.try_search(&mut cache, &input, &mut caps)?;
1840 /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
1841 ///
1842 /// // If we only want to report pattern 0 matches, then we'll get no
1843 /// // match here.
1844 /// let input = input.anchored(Anchored::Pattern(PatternID::must(0)));
1845 /// re.try_search(&mut cache, &input, &mut caps)?;
1846 /// assert_eq!(None, caps.get_match());
1847 ///
1848 /// # Ok::<(), Box<dyn std::error::Error>>(())
1849 /// ```
1850 ///
1851 /// # Example: specifying the bounds of a search
1852 ///
1853 /// This example shows how providing the bounds of a search can produce
1854 /// different results than simply sub-slicing the haystack.
1855 ///
1856 /// ```
1857 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1858 /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, Match};
1859 ///
1860 /// // one-pass DFAs fully support Unicode word boundaries!
1861 /// // A sad joke is that a Unicode aware regex like \w+\s is not one-pass.
1862 /// // :-(
1863 /// let re = DFA::new(r"\b[0-9]{3}\b")?;
1864 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1865 /// let haystack = "foo123bar";
1866 ///
1867 /// // Since we sub-slice the haystack, the search doesn't know about
1868 /// // the larger context and assumes that `123` is surrounded by word
1869 /// // boundaries. And of course, the match position is reported relative
1870 /// // to the sub-slice as well, which means we get `0..3` instead of
1871 /// // `3..6`.
1872 /// let expected = Some(Match::must(0, 0..3));
1873 /// let input = Input::new(&haystack[3..6]).anchored(Anchored::Yes);
1874 /// re.try_search(&mut cache, &input, &mut caps)?;
1875 /// assert_eq!(expected, caps.get_match());
1876 ///
1877 /// // But if we provide the bounds of the search within the context of the
1878 /// // entire haystack, then the search can take the surrounding context
1879 /// // into account. (And if we did find a match, it would be reported
1880 /// // as a valid offset into `haystack` instead of its sub-slice.)
1881 /// let expected = None;
1882 /// let input = Input::new(haystack).range(3..6).anchored(Anchored::Yes);
1883 /// re.try_search(&mut cache, &input, &mut caps)?;
1884 /// assert_eq!(expected, caps.get_match());
1885 ///
1886 /// # Ok::<(), Box<dyn std::error::Error>>(())
1887 /// ```
1888 #[inline]
1889 pub fn try_search(
1890 &self,
1891 cache: &mut Cache,
1892 input: &Input<'_>,
1893 caps: &mut Captures,
1894 ) -> Result<(), MatchError> {
1895 let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
1896 caps.set_pattern(pid);
1897 Ok(())
1898 }
1899
1900 /// Executes an anchored leftmost forward search and writes the spans
1901 /// of capturing groups that participated in a match into the provided
1902 /// `slots`, and returns the matching pattern ID. The contents of the
1903 /// slots for patterns other than the matching pattern are unspecified. If
1904 /// no match was found, then `None` is returned and the contents of all
1905 /// `slots` is unspecified.
1906 ///
1907 /// This is like [`DFA::try_search`], but it accepts a raw slots slice
1908 /// instead of a `Captures` value. This is useful in contexts where you
1909 /// don't want or need to allocate a `Captures`.
1910 ///
1911 /// It is legal to pass _any_ number of slots to this routine. If the regex
1912 /// engine would otherwise write a slot offset that doesn't fit in the
1913 /// provided slice, then it is simply skipped. In general though, there are
1914 /// usually three slice lengths you might want to use:
1915 ///
1916 /// * An empty slice, if you only care about which pattern matched.
1917 /// * A slice with
1918 /// [`pattern_len() * 2`](crate::dfa::onepass::DFA::pattern_len)
1919 /// slots, if you only care about the overall match spans for each matching
1920 /// pattern.
1921 /// * A slice with
1922 /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1923 /// permits recording match offsets for every capturing group in every
1924 /// pattern.
1925 ///
1926 /// # Errors
1927 ///
1928 /// This routine errors if the search could not complete. This can occur
1929 /// in the following circumstances:
1930 ///
1931 /// * When the provided `Input` configuration is not supported. For
1932 /// example, by providing an unsupported anchor mode. Concretely,
1933 /// this occurs when using [`Anchored::Pattern`] without enabling
1934 /// [`Config::starts_for_each_pattern`].
1935 ///
1936 /// When a search returns an error, callers cannot know whether a match
1937 /// exists or not.
1938 ///
1939 /// # Example
1940 ///
1941 /// This example shows how to find the overall match offsets in a
1942 /// multi-pattern search without allocating a `Captures` value. Indeed, we
1943 /// can put our slots right on the stack.
1944 ///
1945 /// ```
1946 /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, PatternID};
1947 ///
1948 /// let re = DFA::new_many(&[
1949 /// r"[a-zA-Z]+",
1950 /// r"[0-9]+",
1951 /// ])?;
1952 /// let mut cache = re.create_cache();
1953 /// let input = Input::new("123").anchored(Anchored::Yes);
1954 ///
1955 /// // We only care about the overall match offsets here, so we just
1956 /// // allocate two slots for each pattern. Each slot records the start
1957 /// // and end of the match.
1958 /// let mut slots = [None; 4];
1959 /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
1960 /// assert_eq!(Some(PatternID::must(1)), pid);
1961 ///
1962 /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1963 /// // See 'GroupInfo' for more details on the mapping between groups and
1964 /// // slot indices.
1965 /// let slot_start = pid.unwrap().as_usize() * 2;
1966 /// let slot_end = slot_start + 1;
1967 /// assert_eq!(Some(0), slots[slot_start].map(|s| s.get()));
1968 /// assert_eq!(Some(3), slots[slot_end].map(|s| s.get()));
1969 ///
1970 /// # Ok::<(), Box<dyn std::error::Error>>(())
1971 /// ```
1972 #[inline]
1973 pub fn try_search_slots(
1974 &self,
1975 cache: &mut Cache,
1976 input: &Input<'_>,
1977 slots: &mut [Option<NonMaxUsize>],
1978 ) -> Result<Option<PatternID>, MatchError> {
1979 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1980 if !utf8empty {
1981 return self.try_search_slots_imp(cache, input, slots);
1982 }
1983 // See PikeVM::try_search_slots for why we do this.
1984 let min = self.get_nfa().group_info().implicit_slot_len();
1985 if slots.len() >= min {
1986 return self.try_search_slots_imp(cache, input, slots);
1987 }
1988 if self.get_nfa().pattern_len() == 1 {
1989 let mut enough = [None, None];
1990 let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1991 // This is OK because we know `enough_slots` is strictly bigger
1992 // than `slots`, otherwise this special case isn't reached.
1993 slots.copy_from_slice(&enough[..slots.len()]);
1994 return Ok(got);
1995 }
1996 let mut enough = vec![None; min];
1997 let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1998 // This is OK because we know `enough_slots` is strictly bigger than
1999 // `slots`, otherwise this special case isn't reached.
2000 slots.copy_from_slice(&enough[..slots.len()]);
2001 Ok(got)
2002 }
2003
2004 #[inline(never)]
2005 fn try_search_slots_imp(
2006 &self,
2007 cache: &mut Cache,
2008 input: &Input<'_>,
2009 slots: &mut [Option<NonMaxUsize>],
2010 ) -> Result<Option<PatternID>, MatchError> {
2011 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
2012 match self.search_imp(cache, input, slots)? {
2013 None => return Ok(None),
2014 Some(pid) if !utf8empty => return Ok(Some(pid)),
2015 Some(pid) => {
2016 // These slot indices are always correct because we know our
2017 // 'pid' is valid and thus we know that the slot indices for it
2018 // are valid.
2019 let slot_start = pid.as_usize().wrapping_mul(2);
2020 let slot_end = slot_start.wrapping_add(1);
2021 // OK because we know we have a match and we know our caller
2022 // provided slots are big enough (which we make true above if
2023 // the caller didn't). Namely, we're only here when 'utf8empty'
2024 // is true, and when that's true, we require slots for every
2025 // pattern.
2026 let start = slots[slot_start].unwrap().get();
2027 let end = slots[slot_end].unwrap().get();
2028 // If our match splits a codepoint, then we cannot report is
2029 // as a match. And since one-pass DFAs only support anchored
2030 // searches, we don't try to skip ahead to find the next match.
2031 // We can just quit with nothing.
2032 if start == end && !input.is_char_boundary(start) {
2033 return Ok(None);
2034 }
2035 Ok(Some(pid))
2036 }
2037 }
2038 }
2039}
2040
2041impl DFA {
2042 fn search_imp(
2043 &self,
2044 cache: &mut Cache,
2045 input: &Input<'_>,
2046 slots: &mut [Option<NonMaxUsize>],
2047 ) -> Result<Option<PatternID>, MatchError> {
2048 // PERF: Some ideas. I ran out of steam after my initial impl to try
2049 // many of these.
2050 //
2051 // 1) Try doing more state shuffling. Right now, all we do is push
2052 // match states to the end of the transition table so that we can do
2053 // 'if sid >= self.min_match_id' to know whether we're in a match
2054 // state or not. But what about doing something like dense DFAs and
2055 // pushing dead, match and states with captures/looks all toward the
2056 // beginning of the transition table. Then we could do 'if sid <=
2057 // self.max_special_id', in which case, we need to do some special
2058 // handling of some sort. Otherwise, we get the happy path, just
2059 // like in a DFA search. The main argument against this is that the
2060 // one-pass DFA is likely to be used most often with capturing groups
2061 // and if capturing groups are common, then this might wind up being a
2062 // pessimization.
2063 //
2064 // 2) Consider moving 'PatternEpsilons' out of the transition table.
2065 // It is only needed for match states and usually a small minority of
2066 // states are match states. Therefore, we're using an extra 'u64' for
2067 // most states.
2068 //
2069 // 3) I played around with the match state handling and it seems like
2070 // there is probably a lot left on the table for improvement. The
2071 // key tension is that the 'find_match' routine is a giant mess, but
2072 // splitting it out into a non-inlineable function is a non-starter
2073 // because the match state might consume input, so 'find_match' COULD
2074 // be called quite a lot, and a function call at that point would trash
2075 // perf. In theory, we could detect whether a match state consumes
2076 // input and then specialize our search routine based on that. In that
2077 // case, maybe an extra function call is OK, but even then, it might be
2078 // too much of a latency hit. Another idea is to just try and figure
2079 // out how to reduce the code size of 'find_match'. RE2 has a trick
2080 // here where the match handling isn't done if we know the next byte of
2081 // input yields a match too. Maybe we adopt that?
2082 //
2083 // This just might be a tricky DFA to optimize.
2084
2085 if input.is_done() {
2086 return Ok(None);
2087 }
2088 // We unfortunately have a bit of book-keeping to do to set things
2089 // up. We do have to setup our cache and clear all of our slots. In
2090 // particular, clearing the slots is necessary for the case where we
2091 // report a match, but one of the capturing groups didn't participate
2092 // in the match but had a span set from a previous search. That would
2093 // be bad. In theory, we could avoid all this slot clearing if we knew
2094 // that every slot was always activated for every match. Then we would
2095 // know they would always be overwritten when a match is found.
2096 let explicit_slots_len = core::cmp::min(
2097 Slots::LIMIT,
2098 slots.len().saturating_sub(self.explicit_slot_start),
2099 );
2100 cache.setup_search(explicit_slots_len);
2101 for slot in cache.explicit_slots() {
2102 *slot = None;
2103 }
2104 for slot in slots.iter_mut() {
2105 *slot = None;
2106 }
2107 // We set the starting slots for every pattern up front. This does
2108 // increase our latency somewhat, but it avoids having to do it every
2109 // time we see a match state (which could be many times in a single
2110 // search if the match state consumes input).
2111 for pid in self.nfa.patterns() {
2112 let i = pid.as_usize() * 2;
2113 if i >= slots.len() {
2114 break;
2115 }
2116 slots[i] = NonMaxUsize::new(input.start());
2117 }
2118 let mut pid = None;
2119 let mut next_sid = match input.get_anchored() {
2120 Anchored::Yes => self.start(),
2121 Anchored::Pattern(pid) => self.start_pattern(pid)?,
2122 Anchored::No => {
2123 // If the regex is itself always anchored, then we're fine,
2124 // even if the search is configured to be unanchored.
2125 if !self.nfa.is_always_start_anchored() {
2126 return Err(MatchError::unsupported_anchored(
2127 Anchored::No,
2128 ));
2129 }
2130 self.start()
2131 }
2132 };
2133 let leftmost_first =
2134 matches!(self.config.get_match_kind(), MatchKind::LeftmostFirst);
2135 for at in input.start()..input.end() {
2136 let sid = next_sid;
2137 let trans = self.transition(sid, input.haystack()[at]);
2138 next_sid = trans.state_id();
2139 let epsilons = trans.epsilons();
2140 if sid >= self.min_match_id {
2141 if self.find_match(cache, input, at, sid, slots, &mut pid) {
2142 if input.get_earliest()
2143 || (leftmost_first && trans.match_wins())
2144 {
2145 return Ok(pid);
2146 }
2147 }
2148 }
2149 if sid == DEAD
2150 || (!epsilons.looks().is_empty()
2151 && !self.nfa.look_matcher().matches_set_inline(
2152 epsilons.looks(),
2153 input.haystack(),
2154 at,
2155 ))
2156 {
2157 return Ok(pid);
2158 }
2159 epsilons.slots().apply(at, cache.explicit_slots());
2160 }
2161 if next_sid >= self.min_match_id {
2162 self.find_match(
2163 cache,
2164 input,
2165 input.end(),
2166 next_sid,
2167 slots,
2168 &mut pid,
2169 );
2170 }
2171 Ok(pid)
2172 }
2173
2174 /// Assumes 'sid' is a match state and looks for whether a match can
2175 /// be reported. If so, appropriate offsets are written to 'slots' and
2176 /// 'matched_pid' is set to the matching pattern ID.
2177 ///
2178 /// Even when 'sid' is a match state, it's possible that a match won't
2179 /// be reported. For example, when the conditional epsilon transitions
2180 /// leading to the match state aren't satisfied at the given position in
2181 /// the haystack.
2182 #[cfg_attr(feature = "perf-inline", inline(always))]
2183 fn find_match(
2184 &self,
2185 cache: &mut Cache,
2186 input: &Input<'_>,
2187 at: usize,
2188 sid: StateID,
2189 slots: &mut [Option<NonMaxUsize>],
2190 matched_pid: &mut Option<PatternID>,
2191 ) -> bool {
2192 debug_assert!(sid >= self.min_match_id);
2193 let pateps = self.pattern_epsilons(sid);
2194 let epsilons = pateps.epsilons();
2195 if !epsilons.looks().is_empty()
2196 && !self.nfa.look_matcher().matches_set_inline(
2197 epsilons.looks(),
2198 input.haystack(),
2199 at,
2200 )
2201 {
2202 return false;
2203 }
2204 let pid = pateps.pattern_id_unchecked();
2205 // This calculation is always correct because we know our 'pid' is
2206 // valid and thus we know that the slot indices for it are valid.
2207 let slot_end = pid.as_usize().wrapping_mul(2).wrapping_add(1);
2208 // Set the implicit 'end' slot for the matching pattern. (The 'start'
2209 // slot was set at the beginning of the search.)
2210 if slot_end < slots.len() {
2211 slots[slot_end] = NonMaxUsize::new(at);
2212 }
2213 // If the caller provided enough room, copy the previously recorded
2214 // explicit slots from our scratch space to the caller provided slots.
2215 // We *also* need to set any explicit slots that are active as part of
2216 // the path to the match state.
2217 if self.explicit_slot_start < slots.len() {
2218 // NOTE: The 'cache.explicit_slots()' slice is setup at the
2219 // beginning of every search such that it is guaranteed to return a
2220 // slice of length equivalent to 'slots[explicit_slot_start..]'.
2221 slots[self.explicit_slot_start..]
2222 .copy_from_slice(cache.explicit_slots());
2223 epsilons.slots().apply(at, &mut slots[self.explicit_slot_start..]);
2224 }
2225 *matched_pid = Some(pid);
2226 true
2227 }
2228}
2229
2230impl DFA {
2231 /// Returns the anchored start state for matching any pattern in this DFA.
2232 fn start(&self) -> StateID {
2233 self.starts[0]
2234 }
2235
2236 /// Returns the anchored start state for matching the given pattern. If
2237 /// 'starts_for_each_pattern'
2238 /// was not enabled, then this returns an error. If the given pattern is
2239 /// not in this DFA, then `Ok(None)` is returned.
2240 fn start_pattern(&self, pid: PatternID) -> Result<StateID, MatchError> {
2241 if !self.config.get_starts_for_each_pattern() {
2242 return Err(MatchError::unsupported_anchored(Anchored::Pattern(
2243 pid,
2244 )));
2245 }
2246 // 'starts' always has non-zero length. The first entry is always the
2247 // anchored starting state for all patterns, and the following entries
2248 // are optional and correspond to the anchored starting states for
2249 // patterns at pid+1. Thus, starts.len()-1 corresponds to the total
2250 // number of patterns that one can explicitly search for. (And it may
2251 // be zero.)
2252 Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD))
2253 }
2254
2255 /// Returns the transition from the given state ID and byte of input. The
2256 /// transition includes the next state ID, the slots that should be saved
2257 /// and any conditional epsilon transitions that must be satisfied in order
2258 /// to take this transition.
2259 fn transition(&self, sid: StateID, byte: u8) -> Transition {
2260 let offset = sid.as_usize() << self.stride2();
2261 let class = self.classes.get(byte).as_usize();
2262 self.table[offset + class]
2263 }
2264
2265 /// Set the transition from the given state ID and byte of input to the
2266 /// transition given.
2267 fn set_transition(&mut self, sid: StateID, byte: u8, to: Transition) {
2268 let offset = sid.as_usize() << self.stride2();
2269 let class = self.classes.get(byte).as_usize();
2270 self.table[offset + class] = to;
2271 }
2272
2273 /// Return an iterator of "sparse" transitions for the given state ID.
2274 /// "sparse" in this context means that consecutive transitions that are
2275 /// equivalent are returned as one group, and transitions to the DEAD state
2276 /// are ignored.
2277 ///
2278 /// This winds up being useful for debug printing, since it's much terser
2279 /// to display runs of equivalent transitions than the transition for every
2280 /// possible byte value. Indeed, in practice, it's very common for runs
2281 /// of equivalent transitions to appear.
2282 fn sparse_transitions(&self, sid: StateID) -> SparseTransitionIter<'_> {
2283 let start = sid.as_usize() << self.stride2();
2284 let end = start + self.alphabet_len();
2285 SparseTransitionIter {
2286 it: self.table[start..end].iter().enumerate(),
2287 cur: None,
2288 }
2289 }
2290
2291 /// Return the pattern epsilons for the given state ID.
2292 ///
2293 /// If the given state ID does not correspond to a match state ID, then the
2294 /// pattern epsilons returned is empty.
2295 fn pattern_epsilons(&self, sid: StateID) -> PatternEpsilons {
2296 let offset = sid.as_usize() << self.stride2();
2297 PatternEpsilons(self.table[offset + self.pateps_offset].0)
2298 }
2299
2300 /// Set the pattern epsilons for the given state ID.
2301 fn set_pattern_epsilons(&mut self, sid: StateID, pateps: PatternEpsilons) {
2302 let offset = sid.as_usize() << self.stride2();
2303 self.table[offset + self.pateps_offset] = Transition(pateps.0);
2304 }
2305
2306 /// Returns the state ID prior to the one given. This returns None if the
2307 /// given ID is the first DFA state.
2308 fn prev_state_id(&self, id: StateID) -> Option<StateID> {
2309 if id == DEAD {
2310 None
2311 } else {
2312 // CORRECTNESS: Since 'id' is not the first state, subtracting 1
2313 // is always valid.
2314 Some(StateID::new_unchecked(id.as_usize().checked_sub(1).unwrap()))
2315 }
2316 }
2317
2318 /// Returns the state ID of the last state in this DFA's transition table.
2319 /// "last" in this context means the last state to appear in memory, i.e.,
2320 /// the one with the greatest ID.
2321 fn last_state_id(&self) -> StateID {
2322 // CORRECTNESS: A DFA table is always non-empty since it always at
2323 // least contains a DEAD state. Since every state has the same stride,
2324 // we can just compute what the "next" state ID would have been and
2325 // then subtract 1 from it.
2326 StateID::new_unchecked(
2327 (self.table.len() >> self.stride2()).checked_sub(1).unwrap(),
2328 )
2329 }
2330
2331 /// Move the transitions from 'id1' to 'id2' and vice versa.
2332 ///
2333 /// WARNING: This does not update the rest of the transition table to have
2334 /// transitions to 'id1' changed to 'id2' and vice versa. This merely moves
2335 /// the states in memory.
2336 pub(super) fn swap_states(&mut self, id1: StateID, id2: StateID) {
2337 let o1 = id1.as_usize() << self.stride2();
2338 let o2 = id2.as_usize() << self.stride2();
2339 for b in 0..self.stride() {
2340 self.table.swap(o1 + b, o2 + b);
2341 }
2342 }
2343
2344 /// Map all state IDs in this DFA (transition table + start states)
2345 /// according to the closure given.
2346 pub(super) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
2347 for i in 0..self.state_len() {
2348 let offset = i << self.stride2();
2349 for b in 0..self.alphabet_len() {
2350 let next = self.table[offset + b].state_id();
2351 self.table[offset + b].set_state_id(map(next));
2352 }
2353 }
2354 for i in 0..self.starts.len() {
2355 self.starts[i] = map(self.starts[i]);
2356 }
2357 }
2358}
2359
2360impl core::fmt::Debug for DFA {
2361 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2362 fn debug_state_transitions(
2363 f: &mut core::fmt::Formatter,
2364 dfa: &DFA,
2365 sid: StateID,
2366 ) -> core::fmt::Result {
2367 for (i, (start, end, trans)) in
2368 dfa.sparse_transitions(sid).enumerate()
2369 {
2370 let next = trans.state_id();
2371 if i > 0 {
2372 write!(f, ", ")?;
2373 }
2374 if start == end {
2375 write!(
2376 f,
2377 "{:?} => {:?}",
2378 DebugByte(start),
2379 next.as_usize(),
2380 )?;
2381 } else {
2382 write!(
2383 f,
2384 "{:?}-{:?} => {:?}",
2385 DebugByte(start),
2386 DebugByte(end),
2387 next.as_usize(),
2388 )?;
2389 }
2390 if trans.match_wins() {
2391 write!(f, " (MW)")?;
2392 }
2393 if !trans.epsilons().is_empty() {
2394 write!(f, " ({:?})", trans.epsilons())?;
2395 }
2396 }
2397 Ok(())
2398 }
2399
2400 writeln!(f, "onepass::DFA(")?;
2401 for index in 0..self.state_len() {
2402 let sid = StateID::must(index);
2403 let pateps = self.pattern_epsilons(sid);
2404 if sid == DEAD {
2405 write!(f, "D ")?;
2406 } else if pateps.pattern_id().is_some() {
2407 write!(f, "* ")?;
2408 } else {
2409 write!(f, " ")?;
2410 }
2411 write!(f, "{:06?}", sid.as_usize())?;
2412 if !pateps.is_empty() {
2413 write!(f, " ({pateps:?})")?;
2414 }
2415 write!(f, ": ")?;
2416 debug_state_transitions(f, self, sid)?;
2417 write!(f, "\n")?;
2418 }
2419 writeln!(f, "")?;
2420 for (i, &sid) in self.starts.iter().enumerate() {
2421 if i == 0 {
2422 writeln!(f, "START(ALL): {:?}", sid.as_usize())?;
2423 } else {
2424 writeln!(
2425 f,
2426 "START(pattern: {:?}): {:?}",
2427 i - 1,
2428 sid.as_usize(),
2429 )?;
2430 }
2431 }
2432 writeln!(f, "state length: {:?}", self.state_len())?;
2433 writeln!(f, "pattern length: {:?}", self.pattern_len())?;
2434 writeln!(f, ")")?;
2435 Ok(())
2436 }
2437}
2438
2439/// An iterator over groups of consecutive equivalent transitions in a single
2440/// state.
2441#[derive(Debug)]
2442struct SparseTransitionIter<'a> {
2443 it: core::iter::Enumerate<core::slice::Iter<'a, Transition>>,
2444 cur: Option<(u8, u8, Transition)>,
2445}
2446
2447impl<'a> Iterator for SparseTransitionIter<'a> {
2448 type Item = (u8, u8, Transition);
2449
2450 fn next(&mut self) -> Option<(u8, u8, Transition)> {
2451 while let Some((b, &trans)) = self.it.next() {
2452 // Fine because we'll never have more than u8::MAX transitions in
2453 // one state.
2454 let b = b.as_u8();
2455 let (prev_start, prev_end, prev_trans) = match self.cur {
2456 Some(t) => t,
2457 None => {
2458 self.cur = Some((b, b, trans));
2459 continue;
2460 }
2461 };
2462 if prev_trans == trans {
2463 self.cur = Some((prev_start, b, prev_trans));
2464 } else {
2465 self.cur = Some((b, b, trans));
2466 if prev_trans.state_id() != DEAD {
2467 return Some((prev_start, prev_end, prev_trans));
2468 }
2469 }
2470 }
2471 if let Some((start, end, trans)) = self.cur.take() {
2472 if trans.state_id() != DEAD {
2473 return Some((start, end, trans));
2474 }
2475 }
2476 None
2477 }
2478}
2479
2480/// A cache represents mutable state that a one-pass [`DFA`] requires during a
2481/// search.
2482///
2483/// For a given one-pass DFA, its corresponding cache may be created either via
2484/// [`DFA::create_cache`], or via [`Cache::new`]. They are equivalent in every
2485/// way, except the former does not require explicitly importing `Cache`.
2486///
2487/// A particular `Cache` is coupled with the one-pass DFA from which it was
2488/// created. It may only be used with that one-pass DFA. A cache and its
2489/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
2490/// only be used with the new one-pass DFA (and not the old one).
2491#[derive(Clone, Debug)]
2492pub struct Cache {
2493 /// Scratch space used to store slots during a search. Basically, we use
2494 /// the caller provided slots to store slots known when a match occurs.
2495 /// But after a match occurs, we might continue a search but ultimately
2496 /// fail to extend the match. When continuing the search, we need some
2497 /// place to store candidate capture offsets without overwriting the slot
2498 /// offsets recorded for the most recently seen match.
2499 explicit_slots: Vec<Option<NonMaxUsize>>,
2500 /// The number of slots in the caller-provided 'Captures' value for the
2501 /// current search. This is always at most 'explicit_slots.len()', but
2502 /// might be less than it, if the caller provided fewer slots to fill.
2503 explicit_slot_len: usize,
2504}
2505
2506impl Cache {
2507 /// Create a new [`onepass::DFA`](DFA) cache.
2508 ///
2509 /// A potentially more convenient routine to create a cache is
2510 /// [`DFA::create_cache`], as it does not require also importing the
2511 /// `Cache` type.
2512 ///
2513 /// If you want to reuse the returned `Cache` with some other one-pass DFA,
2514 /// then you must call [`Cache::reset`] with the desired one-pass DFA.
2515 pub fn new(re: &DFA) -> Cache {
2516 let mut cache = Cache { explicit_slots: vec![], explicit_slot_len: 0 };
2517 cache.reset(re);
2518 cache
2519 }
2520
2521 /// Reset this cache such that it can be used for searching with a
2522 /// different [`onepass::DFA`](DFA).
2523 ///
2524 /// A cache reset permits reusing memory already allocated in this cache
2525 /// with a different one-pass DFA.
2526 ///
2527 /// # Example
2528 ///
2529 /// This shows how to re-purpose a cache for use with a different one-pass
2530 /// DFA.
2531 ///
2532 /// ```
2533 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
2534 /// use regex_automata::{dfa::onepass::DFA, Match};
2535 ///
2536 /// let re1 = DFA::new(r"\w")?;
2537 /// let re2 = DFA::new(r"\W")?;
2538 /// let mut caps1 = re1.create_captures();
2539 /// let mut caps2 = re2.create_captures();
2540 ///
2541 /// let mut cache = re1.create_cache();
2542 /// assert_eq!(
2543 /// Some(Match::must(0, 0..2)),
2544 /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
2545 /// );
2546 ///
2547 /// // Using 'cache' with re2 is not allowed. It may result in panics or
2548 /// // incorrect results. In order to re-purpose the cache, we must reset
2549 /// // it with the one-pass DFA we'd like to use it with.
2550 /// //
2551 /// // Similarly, after this reset, using the cache with 're1' is also not
2552 /// // allowed.
2553 /// re2.reset_cache(&mut cache);
2554 /// assert_eq!(
2555 /// Some(Match::must(0, 0..3)),
2556 /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
2557 /// );
2558 ///
2559 /// # Ok::<(), Box<dyn std::error::Error>>(())
2560 /// ```
2561 pub fn reset(&mut self, re: &DFA) {
2562 let explicit_slot_len = re.get_nfa().group_info().explicit_slot_len();
2563 self.explicit_slots.resize(explicit_slot_len, None);
2564 self.explicit_slot_len = explicit_slot_len;
2565 }
2566
2567 /// Returns the heap memory usage, in bytes, of this cache.
2568 ///
2569 /// This does **not** include the stack size used up by this cache. To
2570 /// compute that, use `std::mem::size_of::<Cache>()`.
2571 pub fn memory_usage(&self) -> usize {
2572 self.explicit_slots.len() * core::mem::size_of::<Option<NonMaxUsize>>()
2573 }
2574
2575 fn explicit_slots(&mut self) -> &mut [Option<NonMaxUsize>] {
2576 &mut self.explicit_slots[..self.explicit_slot_len]
2577 }
2578
2579 fn setup_search(&mut self, explicit_slot_len: usize) {
2580 self.explicit_slot_len = explicit_slot_len;
2581 }
2582}
2583
2584/// Represents a single transition in a one-pass DFA.
2585///
2586/// The high 21 bits corresponds to the state ID. The bit following corresponds
2587/// to the special "match wins" flag. The remaining low 42 bits corresponds to
2588/// the transition epsilons, which contains the slots that should be saved when
2589/// this transition is followed and the conditional epsilon transitions that
2590/// must be satisfied in order to follow this transition.
2591#[derive(Clone, Copy, Eq, PartialEq)]
2592struct Transition(u64);
2593
2594impl Transition {
2595 const STATE_ID_BITS: u64 = 21;
2596 const STATE_ID_SHIFT: u64 = 64 - Transition::STATE_ID_BITS;
2597 const STATE_ID_LIMIT: u64 = 1 << Transition::STATE_ID_BITS;
2598 const MATCH_WINS_SHIFT: u64 = 64 - (Transition::STATE_ID_BITS + 1);
2599 const INFO_MASK: u64 = 0x000003FF_FFFFFFFF;
2600
2601 /// Return a new transition to the given state ID with the given epsilons.
2602 fn new(match_wins: bool, sid: StateID, epsilons: Epsilons) -> Transition {
2603 let match_wins =
2604 if match_wins { 1 << Transition::MATCH_WINS_SHIFT } else { 0 };
2605 let sid = sid.as_u64() << Transition::STATE_ID_SHIFT;
2606 Transition(sid | match_wins | epsilons.0)
2607 }
2608
2609 /// Returns true if and only if this transition points to the DEAD state.
2610 fn is_dead(self) -> bool {
2611 self.state_id() == DEAD
2612 }
2613
2614 /// Return whether this transition has a "match wins" property.
2615 ///
2616 /// When a transition has this property, it means that if a match has been
2617 /// found and the search uses leftmost-first semantics, then that match
2618 /// should be returned immediately instead of continuing on.
2619 ///
2620 /// The "match wins" name comes from RE2, which uses a pretty much
2621 /// identical mechanism for implementing leftmost-first semantics.
2622 fn match_wins(&self) -> bool {
2623 (self.0 >> Transition::MATCH_WINS_SHIFT & 1) == 1
2624 }
2625
2626 /// Return the "next" state ID that this transition points to.
2627 fn state_id(&self) -> StateID {
2628 // OK because a Transition has a valid StateID in its upper bits by
2629 // construction. The cast to usize is also correct, even on 16-bit
2630 // targets because, again, we know the upper bits is a valid StateID,
2631 // which can never overflow usize on any supported target.
2632 StateID::new_unchecked(
2633 (self.0 >> Transition::STATE_ID_SHIFT).as_usize(),
2634 )
2635 }
2636
2637 /// Set the "next" state ID in this transition.
2638 fn set_state_id(&mut self, sid: StateID) {
2639 *self = Transition::new(self.match_wins(), sid, self.epsilons());
2640 }
2641
2642 /// Return the epsilons embedded in this transition.
2643 fn epsilons(&self) -> Epsilons {
2644 Epsilons(self.0 & Transition::INFO_MASK)
2645 }
2646}
2647
2648impl core::fmt::Debug for Transition {
2649 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2650 if self.is_dead() {
2651 return write!(f, "0");
2652 }
2653 write!(f, "{}", self.state_id().as_usize())?;
2654 if self.match_wins() {
2655 write!(f, "-MW")?;
2656 }
2657 if !self.epsilons().is_empty() {
2658 write!(f, "-{:?}", self.epsilons())?;
2659 }
2660 Ok(())
2661 }
2662}
2663
2664/// A representation of a match state's pattern ID along with the epsilons for
2665/// when a match occurs.
2666///
2667/// A match state in a one-pass DFA, unlike in a more general DFA, has exactly
2668/// one pattern ID. If it had more, then the original NFA would not have been
2669/// one-pass.
2670///
2671/// The "epsilons" part of this corresponds to what was found in the epsilon
2672/// transitions between the transition taken in the last byte of input and the
2673/// ultimate match state. This might include saving slots and/or conditional
2674/// epsilon transitions that must be satisfied before one can report the match.
2675///
2676/// Technically, every state has room for a 'PatternEpsilons', but it is only
2677/// ever non-empty for match states.
2678#[derive(Clone, Copy)]
2679struct PatternEpsilons(u64);
2680
2681impl PatternEpsilons {
2682 const PATTERN_ID_BITS: u64 = 22;
2683 const PATTERN_ID_SHIFT: u64 = 64 - PatternEpsilons::PATTERN_ID_BITS;
2684 // A sentinel value indicating that this is not a match state. We don't
2685 // use 0 since 0 is a valid pattern ID.
2686 const PATTERN_ID_NONE: u64 = 0x00000000_003FFFFF;
2687 const PATTERN_ID_LIMIT: u64 = PatternEpsilons::PATTERN_ID_NONE;
2688 const PATTERN_ID_MASK: u64 = 0xFFFFFC00_00000000;
2689 const EPSILONS_MASK: u64 = 0x000003FF_FFFFFFFF;
2690
2691 /// Return a new empty pattern epsilons that has no pattern ID and has no
2692 /// epsilons. This is suitable for non-match states.
2693 fn empty() -> PatternEpsilons {
2694 PatternEpsilons(
2695 PatternEpsilons::PATTERN_ID_NONE
2696 << PatternEpsilons::PATTERN_ID_SHIFT,
2697 )
2698 }
2699
2700 /// Whether this pattern epsilons is empty or not. It's empty when it has
2701 /// no pattern ID and an empty epsilons.
2702 fn is_empty(self) -> bool {
2703 self.pattern_id().is_none() && self.epsilons().is_empty()
2704 }
2705
2706 /// Return the pattern ID in this pattern epsilons if one exists.
2707 fn pattern_id(self) -> Option<PatternID> {
2708 let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
2709 if pid == PatternEpsilons::PATTERN_ID_LIMIT {
2710 None
2711 } else {
2712 Some(PatternID::new_unchecked(pid.as_usize()))
2713 }
2714 }
2715
2716 /// Returns the pattern ID without checking whether it's valid. If this is
2717 /// called and there is no pattern ID in this `PatternEpsilons`, then this
2718 /// will likely produce an incorrect result or possibly even a panic or
2719 /// an overflow. But safety will not be violated.
2720 ///
2721 /// This is useful when you know a particular state is a match state. If
2722 /// it's a match state, then it must have a pattern ID.
2723 fn pattern_id_unchecked(self) -> PatternID {
2724 let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
2725 PatternID::new_unchecked(pid.as_usize())
2726 }
2727
2728 /// Return a new pattern epsilons with the given pattern ID, but the same
2729 /// epsilons.
2730 fn set_pattern_id(self, pid: PatternID) -> PatternEpsilons {
2731 PatternEpsilons(
2732 (pid.as_u64() << PatternEpsilons::PATTERN_ID_SHIFT)
2733 | (self.0 & PatternEpsilons::EPSILONS_MASK),
2734 )
2735 }
2736
2737 /// Return the epsilons part of this pattern epsilons.
2738 fn epsilons(self) -> Epsilons {
2739 Epsilons(self.0 & PatternEpsilons::EPSILONS_MASK)
2740 }
2741
2742 /// Return a new pattern epsilons with the given epsilons, but the same
2743 /// pattern ID.
2744 fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons {
2745 PatternEpsilons(
2746 (self.0 & PatternEpsilons::PATTERN_ID_MASK)
2747 | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK),
2748 )
2749 }
2750}
2751
2752impl core::fmt::Debug for PatternEpsilons {
2753 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2754 if self.is_empty() {
2755 return write!(f, "N/A");
2756 }
2757 if let Some(pid) = self.pattern_id() {
2758 write!(f, "{}", pid.as_usize())?;
2759 }
2760 if !self.epsilons().is_empty() {
2761 if self.pattern_id().is_some() {
2762 write!(f, "/")?;
2763 }
2764 write!(f, "{:?}", self.epsilons())?;
2765 }
2766 Ok(())
2767 }
2768}
2769
2770/// Epsilons represents all of the NFA epsilons transitions that went into a
2771/// single transition in a single DFA state. In this case, it only represents
2772/// the epsilon transitions that have some kind of non-consuming side effect:
2773/// either the transition requires storing the current position of the search
2774/// into a slot, or the transition is conditional and requires the current
2775/// position in the input to satisfy an assertion before the transition may be
2776/// taken.
2777///
2778/// This folds the cumulative effect of a group of NFA states (all connected
2779/// by epsilon transitions) down into a single set of bits. While these bits
2780/// can represent all possible conditional epsilon transitions, it only permits
2781/// storing up to a somewhat small number of slots.
2782///
2783/// Epsilons is represented as a 42-bit integer. For example, it is packed into
2784/// the lower 42 bits of a `Transition`. (Where the high 22 bits contains a
2785/// `StateID` and a special "match wins" property.)
2786#[derive(Clone, Copy)]
2787struct Epsilons(u64);
2788
2789impl Epsilons {
2790 const SLOT_MASK: u64 = 0x000003FF_FFFFFC00;
2791 const SLOT_SHIFT: u64 = 10;
2792 const LOOK_MASK: u64 = 0x00000000_000003FF;
2793
2794 /// Create a new empty epsilons. It has no slots and no assertions that
2795 /// need to be satisfied.
2796 fn empty() -> Epsilons {
2797 Epsilons(0)
2798 }
2799
2800 /// Returns true if this epsilons contains no slots and no assertions.
2801 fn is_empty(self) -> bool {
2802 self.0 == 0
2803 }
2804
2805 /// Returns the slot epsilon transitions.
2806 fn slots(self) -> Slots {
2807 Slots((self.0 >> Epsilons::SLOT_SHIFT).low_u32())
2808 }
2809
2810 /// Set the slot epsilon transitions.
2811 fn set_slots(self, slots: Slots) -> Epsilons {
2812 Epsilons(
2813 (u64::from(slots.0) << Epsilons::SLOT_SHIFT)
2814 | (self.0 & Epsilons::LOOK_MASK),
2815 )
2816 }
2817
2818 /// Return the set of look-around assertions in these epsilon transitions.
2819 fn looks(self) -> LookSet {
2820 LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() }
2821 }
2822
2823 /// Set the look-around assertions on these epsilon transitions.
2824 fn set_looks(self, look_set: LookSet) -> Epsilons {
2825 Epsilons(
2826 (self.0 & Epsilons::SLOT_MASK)
2827 | (u64::from(look_set.bits) & Epsilons::LOOK_MASK),
2828 )
2829 }
2830}
2831
2832impl core::fmt::Debug for Epsilons {
2833 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2834 let mut wrote = false;
2835 if !self.slots().is_empty() {
2836 write!(f, "{:?}", self.slots())?;
2837 wrote = true;
2838 }
2839 if !self.looks().is_empty() {
2840 if wrote {
2841 write!(f, "/")?;
2842 }
2843 write!(f, "{:?}", self.looks())?;
2844 wrote = true;
2845 }
2846 if !wrote {
2847 write!(f, "N/A")?;
2848 }
2849 Ok(())
2850 }
2851}
2852
2853/// The set of epsilon transitions indicating that the current position in a
2854/// search should be saved to a slot.
2855///
2856/// This *only* represents explicit slots. So for example, the pattern
2857/// `[a-z]+([0-9]+)([a-z]+)` has:
2858///
2859/// * 3 capturing groups, thus 6 slots.
2860/// * 1 implicit capturing group, thus 2 implicit slots.
2861/// * 2 explicit capturing groups, thus 4 explicit slots.
2862///
2863/// While implicit slots are represented by epsilon transitions in an NFA, we
2864/// do not explicitly represent them here. Instead, implicit slots are assumed
2865/// to be present and handled automatically in the search code. Therefore,
2866/// that means we only need to represent explicit slots in our epsilon
2867/// transitions.
2868///
2869/// Its representation is a bit set. The bit 'i' is set if and only if there
2870/// exists an explicit slot at index 'c', where 'c = (#patterns * 2) + i'. That
2871/// is, the bit 'i' corresponds to the first explicit slot and the first
2872/// explicit slot appears immediately following the last implicit slot. (If
2873/// this is confusing, see `GroupInfo` for more details on how slots works.)
2874///
2875/// A single `Slots` represents all the active slots in a sub-graph of an NFA,
2876/// where all the states are connected by epsilon transitions. In effect, when
2877/// traversing the one-pass DFA during a search, all slots set in a particular
2878/// transition must be captured by recording the current search position.
2879///
2880/// The API of `Slots` requires the caller to handle the explicit slot offset.
2881/// That is, a `Slots` doesn't know where the explicit slots start for a
2882/// particular NFA. Thus, if the callers see's the bit 'i' is set, then they
2883/// need to do the arithmetic above to find 'c', which is the real actual slot
2884/// index in the corresponding NFA.
2885#[derive(Clone, Copy)]
2886struct Slots(u32);
2887
2888impl Slots {
2889 const LIMIT: usize = 32;
2890
2891 /// Insert the slot at the given bit index.
2892 fn insert(self, slot: usize) -> Slots {
2893 debug_assert!(slot < Slots::LIMIT);
2894 Slots(self.0 | (1 << slot.as_u32()))
2895 }
2896
2897 /// Remove the slot at the given bit index.
2898 fn remove(self, slot: usize) -> Slots {
2899 debug_assert!(slot < Slots::LIMIT);
2900 Slots(self.0 & !(1 << slot.as_u32()))
2901 }
2902
2903 /// Returns true if and only if this set contains no slots.
2904 fn is_empty(self) -> bool {
2905 self.0 == 0
2906 }
2907
2908 /// Returns an iterator over all of the set bits in this set.
2909 fn iter(self) -> SlotsIter {
2910 SlotsIter { slots: self }
2911 }
2912
2913 /// For the position `at` in the current haystack, copy it to
2914 /// `caller_explicit_slots` for all slots that are in this set.
2915 ///
2916 /// Callers may pass a slice of any length. Slots in this set bigger than
2917 /// the length of the given explicit slots are simply skipped.
2918 ///
2919 /// The slice *must* correspond only to the explicit slots and the first
2920 /// element of the slice must always correspond to the first explicit slot
2921 /// in the corresponding NFA.
2922 fn apply(
2923 self,
2924 at: usize,
2925 caller_explicit_slots: &mut [Option<NonMaxUsize>],
2926 ) {
2927 if self.is_empty() {
2928 return;
2929 }
2930 let at = NonMaxUsize::new(at);
2931 for slot in self.iter() {
2932 if slot >= caller_explicit_slots.len() {
2933 break;
2934 }
2935 caller_explicit_slots[slot] = at;
2936 }
2937 }
2938}
2939
2940impl core::fmt::Debug for Slots {
2941 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2942 write!(f, "S")?;
2943 for slot in self.iter() {
2944 write!(f, "-{slot:?}")?;
2945 }
2946 Ok(())
2947 }
2948}
2949
2950/// An iterator over all of the bits set in a slot set.
2951///
2952/// This returns the bit index that is set, so callers may need to offset it
2953/// to get the actual NFA slot index.
2954#[derive(Debug)]
2955struct SlotsIter {
2956 slots: Slots,
2957}
2958
2959impl Iterator for SlotsIter {
2960 type Item = usize;
2961
2962 fn next(&mut self) -> Option<usize> {
2963 // Number of zeroes here is always <= u8::MAX, and so fits in a usize.
2964 let slot = self.slots.0.trailing_zeros().as_usize();
2965 if slot >= Slots::LIMIT {
2966 return None;
2967 }
2968 self.slots = self.slots.remove(slot);
2969 Some(slot)
2970 }
2971}
2972
2973/// An error that occurred during the construction of a one-pass DFA.
2974///
2975/// This error does not provide many introspection capabilities. There are
2976/// generally only two things you can do with it:
2977///
2978/// * Obtain a human readable message via its `std::fmt::Display` impl.
2979/// * Access an underlying [`thompson::BuildError`] type from its `source`
2980/// method via the `std::error::Error` trait. This error only occurs when using
2981/// convenience routines for building a one-pass DFA directly from a pattern
2982/// string.
2983///
2984/// When the `std` feature is enabled, this implements the `std::error::Error`
2985/// trait.
2986#[derive(Clone, Debug)]
2987pub struct BuildError {
2988 kind: BuildErrorKind,
2989}
2990
2991/// The kind of error that occurred during the construction of a one-pass DFA.
2992#[derive(Clone, Debug)]
2993enum BuildErrorKind {
2994 NFA(crate::nfa::thompson::BuildError),
2995 Word(UnicodeWordBoundaryError),
2996 TooManyStates { limit: u64 },
2997 TooManyPatterns { limit: u64 },
2998 UnsupportedLook { look: Look },
2999 ExceededSizeLimit { limit: usize },
3000 NotOnePass { msg: &'static str },
3001}
3002
3003impl BuildError {
3004 fn nfa(err: crate::nfa::thompson::BuildError) -> BuildError {
3005 BuildError { kind: BuildErrorKind::NFA(err) }
3006 }
3007
3008 fn word(err: UnicodeWordBoundaryError) -> BuildError {
3009 BuildError { kind: BuildErrorKind::Word(err) }
3010 }
3011
3012 fn too_many_states(limit: u64) -> BuildError {
3013 BuildError { kind: BuildErrorKind::TooManyStates { limit } }
3014 }
3015
3016 fn too_many_patterns(limit: u64) -> BuildError {
3017 BuildError { kind: BuildErrorKind::TooManyPatterns { limit } }
3018 }
3019
3020 fn unsupported_look(look: Look) -> BuildError {
3021 BuildError { kind: BuildErrorKind::UnsupportedLook { look } }
3022 }
3023
3024 fn exceeded_size_limit(limit: usize) -> BuildError {
3025 BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } }
3026 }
3027
3028 fn not_one_pass(msg: &'static str) -> BuildError {
3029 BuildError { kind: BuildErrorKind::NotOnePass { msg } }
3030 }
3031}
3032
3033#[cfg(feature = "std")]
3034impl std::error::Error for BuildError {
3035 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
3036 use self::BuildErrorKind::*;
3037
3038 match self.kind {
3039 NFA(ref err) => Some(err),
3040 Word(ref err) => Some(err),
3041 _ => None,
3042 }
3043 }
3044}
3045
3046impl core::fmt::Display for BuildError {
3047 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
3048 use self::BuildErrorKind::*;
3049
3050 match self.kind {
3051 NFA(_) => write!(f, "error building NFA"),
3052 Word(_) => write!(f, "NFA contains Unicode word boundary"),
3053 TooManyStates { limit } => write!(
3054 f,
3055 "one-pass DFA exceeded a limit of {limit:?} \
3056 for number of states",
3057 ),
3058 TooManyPatterns { limit } => write!(
3059 f,
3060 "one-pass DFA exceeded a limit of {limit:?} \
3061 for number of patterns",
3062 ),
3063 UnsupportedLook { look } => write!(
3064 f,
3065 "one-pass DFA does not support the {look:?} assertion",
3066 ),
3067 ExceededSizeLimit { limit } => write!(
3068 f,
3069 "one-pass DFA exceeded size limit of {limit:?} during building",
3070 ),
3071 NotOnePass { msg } => write!(
3072 f,
3073 "one-pass DFA could not be built because \
3074 pattern is not one-pass: {}",
3075 msg,
3076 ),
3077 }
3078 }
3079}
3080
3081#[cfg(all(test, feature = "syntax"))]
3082mod tests {
3083 use alloc::string::ToString;
3084
3085 use super::*;
3086
3087 #[test]
3088 fn fail_conflicting_transition() {
3089 let predicate = |err: &str| err.contains("conflicting transition");
3090
3091 let err = DFA::new(r"a*[ab]").unwrap_err().to_string();
3092 assert!(predicate(&err), "{err}");
3093 }
3094
3095 #[test]
3096 fn fail_multiple_epsilon() {
3097 let predicate = |err: &str| {
3098 err.contains("multiple epsilon transitions to same state")
3099 };
3100
3101 let err = DFA::new(r"(^|$)a").unwrap_err().to_string();
3102 assert!(predicate(&err), "{err}");
3103 }
3104
3105 #[test]
3106 fn fail_multiple_match() {
3107 let predicate = |err: &str| {
3108 err.contains("multiple epsilon transitions to match state")
3109 };
3110
3111 let err = DFA::new_many(&[r"^", r"$"]).unwrap_err().to_string();
3112 assert!(predicate(&err), "{err}");
3113 }
3114
3115 // This test is meant to build a one-pass regex with the maximum number of
3116 // possible slots.
3117 //
3118 // NOTE: Remember that the slot limit only applies to explicit capturing
3119 // groups. Any number of implicit capturing groups is supported (up to the
3120 // maximum number of supported patterns), since implicit groups are handled
3121 // by the search loop itself.
3122 #[test]
3123 fn max_slots() {
3124 // One too many...
3125 let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)(q)";
3126 assert!(DFA::new(pat).is_err());
3127 // Just right.
3128 let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)";
3129 assert!(DFA::new(pat).is_ok());
3130 }
3131
3132 // This test ensures that the one-pass DFA works with all look-around
3133 // assertions that we expect it to work with.
3134 //
3135 // The utility of this test is that each one-pass transition has a small
3136 // amount of space to store look-around assertions. Currently, there is
3137 // logic in the one-pass constructor to ensure there aren't more than ten
3138 // possible assertions. And indeed, there are only ten possible assertions
3139 // (at time of writing), so this is okay. But conceivably, more assertions
3140 // could be added. So we check that things at least work with what we
3141 // expect them to work with.
3142 #[test]
3143 fn assertions() {
3144 // haystack anchors
3145 assert!(DFA::new(r"^").is_ok());
3146 assert!(DFA::new(r"$").is_ok());
3147
3148 // line anchors
3149 assert!(DFA::new(r"(?m)^").is_ok());
3150 assert!(DFA::new(r"(?m)$").is_ok());
3151 assert!(DFA::new(r"(?Rm)^").is_ok());
3152 assert!(DFA::new(r"(?Rm)$").is_ok());
3153
3154 // word boundaries
3155 if cfg!(feature = "unicode-word-boundary") {
3156 assert!(DFA::new(r"\b").is_ok());
3157 assert!(DFA::new(r"\B").is_ok());
3158 }
3159 assert!(DFA::new(r"(?-u)\b").is_ok());
3160 assert!(DFA::new(r"(?-u)\B").is_ok());
3161 }
3162
3163 #[cfg(not(miri))] // takes too long on miri
3164 #[test]
3165 fn is_one_pass() {
3166 use crate::util::syntax;
3167
3168 assert!(DFA::new(r"a*b").is_ok());
3169 if cfg!(feature = "unicode-perl") {
3170 assert!(DFA::new(r"\w").is_ok());
3171 }
3172 assert!(DFA::new(r"(?-u)\w*\s").is_ok());
3173 assert!(DFA::new(r"(?s:.)*?").is_ok());
3174 assert!(DFA::builder()
3175 .syntax(syntax::Config::new().utf8(false))
3176 .build(r"(?s-u:.)*?")
3177 .is_ok());
3178 }
3179
3180 #[test]
3181 fn is_not_one_pass() {
3182 assert!(DFA::new(r"a*a").is_err());
3183 assert!(DFA::new(r"(?s-u:.)*?").is_err());
3184 assert!(DFA::new(r"(?s:.)*?a").is_err());
3185 }
3186
3187 #[cfg(not(miri))]
3188 #[test]
3189 fn is_not_one_pass_bigger() {
3190 assert!(DFA::new(r"\w*\s").is_err());
3191 }
3192}