regex_automata/dfa/
sparse.rs

1/*!
2Types and routines specific to sparse DFAs.
3
4This module is the home of [`sparse::DFA`](DFA).
5
6Unlike the [`dense`] module, this module does not contain a builder or
7configuration specific for sparse DFAs. Instead, the intended way to build a
8sparse DFA is either by using a default configuration with its constructor
9[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a
10dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`].
11For example, this configures a sparse DFA to do an overlapping search:
12
13```
14use regex_automata::{
15    dfa::{Automaton, OverlappingState, dense},
16    HalfMatch, Input, MatchKind,
17};
18
19let dense_re = dense::Builder::new()
20    .configure(dense::Config::new().match_kind(MatchKind::All))
21    .build(r"Samwise|Sam")?;
22let sparse_re = dense_re.to_sparse()?;
23
24// Setup our haystack and initial start state.
25let input = Input::new("Samwise");
26let mut state = OverlappingState::start();
27
28// First, 'Sam' will match.
29sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
30assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match());
31
32// And now 'Samwise' will match.
33sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
34assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match());
35# Ok::<(), Box<dyn std::error::Error>>(())
36```
37*/
38
39#[cfg(feature = "dfa-build")]
40use core::iter;
41use core::{fmt, mem::size_of};
42
43#[cfg(feature = "dfa-build")]
44use alloc::{vec, vec::Vec};
45
46#[cfg(feature = "dfa-build")]
47use crate::dfa::dense::{self, BuildError};
48use crate::{
49    dfa::{
50        automaton::{fmt_state_indicator, Automaton, StartError},
51        dense::Flags,
52        special::Special,
53        StartKind, DEAD,
54    },
55    util::{
56        alphabet::{ByteClasses, ByteSet},
57        escape::DebugByte,
58        int::{Pointer, Usize, U16, U32},
59        prefilter::Prefilter,
60        primitives::{PatternID, StateID},
61        search::Anchored,
62        start::{self, Start, StartByteMap},
63        wire::{self, DeserializeError, Endian, SerializeError},
64    },
65};
66
// Label identifying the sparse DFA type.
// NOTE(review): presumably embedded in the serialized form so that
// deserialization can reject bytes that were not produced by a sparse DFA;
// the use site is outside this view -- confirm.
const LABEL: &str = "rust-regex-automata-dfa-sparse";
// NOTE(review): presumably the version of the serialized format, to be
// bumped on incompatible changes; the use site is outside this view --
// confirm.
const VERSION: u32 = 2;
69
/// A sparse deterministic finite automaton (DFA) with variable sized states.
///
/// In contrast to a [`dense::DFA`], a sparse DFA uses a more space efficient
/// representation for its transitions. Consequently, sparse DFAs may use much
/// less memory than dense DFAs, but this comes at a price. In particular,
/// reading the more space efficient transitions takes more work, and
/// consequently, searching using a sparse DFA is typically slower than a dense
/// DFA.
///
/// A sparse DFA can be built using the default configuration via the
/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a
/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse
/// DFA using [`dense::DFA::to_sparse`].
///
/// In general, a sparse DFA supports all the same search operations as a dense
/// DFA.
///
/// Making the choice between a dense and sparse DFA depends on your specific
/// work load. If you can sacrifice a bit of search time performance, then a
/// sparse DFA might be the best choice. In particular, while sparse DFAs are
/// probably always slower than dense DFAs, you may find that they are easily
/// fast enough for your purposes!
///
/// # Type parameters
///
/// A `DFA` has one type parameter, `T`, which is used to represent the parts
/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
///
/// # The `Automaton` trait
///
/// This type implements the [`Automaton`] trait, which means it can be used
/// for searching. For example:
///
/// ```
/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone)]
pub struct DFA<T> {
    // When compared to a dense DFA, a sparse DFA *looks* a lot simpler
    // representation-wise. In reality, it is perhaps more complicated. Namely,
    // in a dense DFA, all information needs to be very cheaply accessible
    // using only state IDs. In a sparse DFA however, each state uses a
    // variable amount of space because each state encodes more information
    // than just its transitions. Each state also includes an accelerator if
    // one exists, along with the matching pattern IDs if the state is a match
    // state.
    //
    // That is, a lot of the complexity is pushed down into how each state
    // itself is represented.

    // The encoded states. When converting from a dense DFA this holds the
    // raw state bytes together with the byte classes and the state and
    // pattern counts of the dense DFA.
    tt: Transitions<T>,
    // The start states. When converting from a dense DFA this is built by
    // `StartTable::from_dense_dfa` using the dense-to-sparse state ID map.
    st: StartTable<T>,
    // The dense DFA's `Special` with every state ID remapped to the
    // corresponding sparse state ID.
    special: Special,
    // An optional prefilter, cloned from the dense DFA when converting.
    pre: Option<Prefilter>,
    // Copied from the dense DFA's quit set when converting.
    // NOTE(review): presumably the bytes that make a search give up; confirm.
    quitset: ByteSet,
    // Copied from the dense DFA's flags when converting.
    flags: Flags,
}
131
#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Compile a single regular expression into a sparse DFA using the
    /// default configuration.
    ///
    /// The pattern is first compiled with a default [`dense::Builder`] and
    /// the resulting dense DFA is then converted to its sparse form. For a
    /// non-default configuration, drive [`dense::Builder`] yourself and
    /// finish with [`dense::DFA::to_sparse`].
    ///
    /// # Errors
    ///
    /// Any error from building the dense DFA or from converting it to a
    /// sparse DFA is returned unchanged.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
    ///
    /// let expected = Some(HalfMatch::must(0, 11));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> {
        let dense = dense::Builder::new().build(pattern)?;
        dense.to_sparse()
    }

    /// Compile several regular expressions into one sparse multi-DFA using
    /// the default configuration.
    ///
    /// The patterns are first compiled with a default [`dense::Builder`] and
    /// the resulting dense DFA is then converted to its sparse form. For a
    /// non-default configuration, drive [`dense::Builder`] yourself and
    /// finish with [`dense::DFA::to_sparse`].
    ///
    /// # Errors
    ///
    /// Any error from building the dense DFA or from converting it to a
    /// sparse DFA is returned unchanged.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// let expected = Some(HalfMatch::must(1, 3));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        let dense = dense::Builder::new().build_many(patterns)?;
        dense.to_sparse()
    }
}
185
#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Create a new sparse DFA that matches every input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{Automaton, sparse},
    ///     HalfMatch, Input,
    /// };
    ///
    /// let dfa = sparse::DFA::always_match()?;
    ///
    /// let expected = Some(HalfMatch::must(0, 0));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> {
        let dense = dense::DFA::always_match()?;
        dense.to_sparse()
    }

    /// Create a new sparse DFA that never matches any input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, Input};
    ///
    /// let dfa = sparse::DFA::never_match()?;
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> {
        let dense = dense::DFA::never_match()?;
        dense.to_sparse()
    }

    /// Build a sparse DFA from the given dense DFA.
    ///
    /// Returns `BuildError::too_many_states()` when the byte offset of a
    /// state in the sparse encoding is not a valid `StateID`, and forwards
    /// any error from constructing the start table.
    pub(crate) fn from_dense<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        // A sparse state ID is the byte offset at which that state is
        // encoded. The ID of a state is therefore unknown until every state
        // before it has been laid out, which is why construction needs two
        // passes:
        //
        // 1. Lay out every state: transition length, input byte ranges,
        //    zeroed slots for the "next" state IDs, pattern IDs for match
        //    states and the accelerator. While doing so, record the sparse
        //    ID of every dense state.
        // 2. Go back and fill the zeroed "next" slots using the recorded
        //    dense -> sparse mapping.

        let state_len = dfa.state_len();
        // Only a lower bound on the final size, but it should spare us a
        // few reallocations.
        let mut bytes: Vec<u8> =
            Vec::with_capacity(StateID::SIZE * state_len);
        // Dense state index -> sparse state ID. Filled in by pass one and
        // consumed by pass two.
        let mut remap: Vec<StateID> = vec![DEAD; state_len];

        for state in dfa.states() {
            let sid = state.id();
            let start = bytes.len();
            remap[dfa.to_index(sid)] = StateID::new(start)
                .map_err(|_| BuildError::too_many_states())?;
            let is_match = dfa.is_match_state(sid);

            // Two placeholder bytes for the transition length. The real
            // value is written once the transitions have been counted.
            bytes.extend_from_slice(&[0, 0]);

            // One inclusive byte range per non-EOI transition.
            let mut transition_len = 0;
            for (lo, hi, _) in state.sparse_transitions() {
                match (lo.as_u8(), hi.as_u8()) {
                    (Some(b1), Some(b2)) => {
                        bytes.extend_from_slice(&[b1, b2]);
                        transition_len += 1;
                    }
                    (None, None) => {}
                    // sparse_transitions never groups EOI together with
                    // any other transition, so a half-EOI range cannot be
                    // produced.
                    _ => unreachable!(),
                }
            }
            // Always append a dummy range for the EOI transition. It is
            // never read while searching, but it keeps the number of byte
            // ranges equal to the number of "next" slots. The loop above
            // may not yield an EOI transition at all (it may point to the
            // DEAD state), so adding it unconditionally here also makes
            // the transition length strictly positive and lets
            // Automaton::next_eoi_state simply use the last transition.
            transition_len += 1;
            bytes.extend_from_slice(&[0, 0]);

            // Sanity check the transition length.
            assert_ne!(
                transition_len, 0,
                "transition length should be non-zero",
            );
            assert!(
                transition_len <= 257,
                "expected transition length {} to be <= 257",
                transition_len,
            );

            // Write the transition length into the placeholder. Since it
            // never exceeds 257, the most significant bit is free to flag
            // match states.
            let header = if is_match {
                transition_len | (1 << 15)
            } else {
                transition_len
            };
            wire::NE::write_u16(header, &mut bytes[start..]);

            // Zeroed slots for the "next" state IDs; pass two fills them.
            // The unwraps cannot fail: the length is at most 257 and the
            // smallest supported usize is 16 bits wide.
            let next_bytes = usize::try_from(transition_len)
                .unwrap()
                .checked_mul(StateID::SIZE)
                .unwrap();
            bytes.extend(iter::repeat(0).take(next_bytes));

            // Match states additionally carry their pattern IDs, prefixed
            // by a u32 count.
            if is_match {
                let plen = dfa.match_pattern_len(sid);
                let mut at = bytes.len();
                // Reserve room for the count and the IDs first. The
                // unwraps cannot fail since plen <= PatternID::LIMIT,
                // which in turn fits into a u32.
                let reserved = size_of::<u32>()
                    .checked_mul(plen)
                    .unwrap()
                    .checked_add(size_of::<u32>())
                    .unwrap();
                bytes.extend(iter::repeat(0).take(reserved));

                // The count. u32::MAX is not a valid pattern ID, so the
                // number of pattern IDs is always representable.
                wire::NE::write_u32(
                    plen.try_into().expect("pattern ID length fits in u32"),
                    &mut bytes[at..],
                );
                at += size_of::<u32>();

                // The pattern IDs themselves.
                for &pid in dfa.pattern_id_slice(sid) {
                    at += wire::write_pattern_id::<wire::NE>(
                        pid,
                        &mut bytes[at..],
                    );
                }
            }

            // Finally the accelerator: one length byte N followed by N
            // bytes. When present, those bytes are exactly the bytes that
            // must be seen in order to leave this state.
            let accel = dfa.accelerator(sid);
            bytes.push(accel.len().try_into().unwrap());
            bytes.extend_from_slice(accel);
        }

        let tt = Transitions {
            sparse: bytes,
            classes: dfa.byte_classes().clone(),
            state_len,
            pattern_len: dfa.pattern_len(),
        };
        let st = StartTable::from_dense_dfa(dfa, &remap)?;
        let mut sparse_dfa = DFA {
            tt,
            st,
            special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
            pre: dfa.get_prefilter().map(|p| p.clone()),
            quitset: dfa.quitset().clone(),
            flags: dfa.flags().clone(),
        };

        // Pass two: revisit every dense state and write the remapped
        // target of each of its transitions into the sparse state.
        for dense_state in dfa.states() {
            let sparse_id = remap[dfa.to_index(dense_state.id())];
            let mut sparse_state = sparse_dfa.tt.state_mut(sparse_id);
            for (i, (_, _, next)) in
                dense_state.sparse_transitions().enumerate()
            {
                sparse_state.set_next_at(i, remap[dfa.to_index(next)]);
            }
        }
        debug!(
            "created sparse DFA, memory usage: {} (dense memory usage: {})",
            sparse_dfa.memory_usage(),
            dfa.memory_usage(),
        );
        Ok(sparse_dfa)
    }
}
405
406impl<T: AsRef<[u8]>> DFA<T> {
407    /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
408    /// DFA returned always uses `&[u8]` for its transitions.
409    pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
410        DFA {
411            tt: self.tt.as_ref(),
412            st: self.st.as_ref(),
413            special: self.special,
414            pre: self.pre.clone(),
415            quitset: self.quitset,
416            flags: self.flags,
417        }
418    }
419
420    /// Return an owned version of this sparse DFA. Specifically, the DFA
421    /// returned always uses `Vec<u8>` for its transitions.
422    ///
423    /// Effectively, this returns a sparse DFA whose transitions live on the
424    /// heap.
425    #[cfg(feature = "alloc")]
426    pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> {
427        DFA {
428            tt: self.tt.to_owned(),
429            st: self.st.to_owned(),
430            special: self.special,
431            pre: self.pre.clone(),
432            quitset: self.quitset,
433            flags: self.flags,
434        }
435    }
436
437    /// Returns the starting state configuration for this DFA.
438    ///
439    /// The default is [`StartKind::Both`], which means the DFA supports both
440    /// unanchored and anchored searches. However, this can generally lead to
441    /// bigger DFAs. Therefore, a DFA might be compiled with support for just
442    /// unanchored or anchored searches. In that case, running a search with
443    /// an unsupported configuration will panic.
444    pub fn start_kind(&self) -> StartKind {
445        self.st.kind
446    }
447
448    /// Returns true only if this DFA has starting states for each pattern.
449    ///
450    /// When a DFA has starting states for each pattern, then a search with the
451    /// DFA can be configured to only look for anchored matches of a specific
452    /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can
453    /// accept a [`Anchored::Pattern`] if and only if this method returns true.
454    /// Otherwise, an error will be returned.
455    ///
456    /// Note that if the DFA is empty, this always returns false.
457    pub fn starts_for_each_pattern(&self) -> bool {
458        self.st.pattern_len.is_some()
459    }
460
461    /// Returns the equivalence classes that make up the alphabet for this DFA.
462    ///
463    /// Unless [`dense::Config::byte_classes`] was disabled, it is possible
464    /// that multiple distinct bytes are grouped into the same equivalence
465    /// class if it is impossible for them to discriminate between a match and
466    /// a non-match. This has the effect of reducing the overall alphabet size
467    /// and in turn potentially substantially reducing the size of the DFA's
468    /// transition table.
469    ///
470    /// The downside of using equivalence classes like this is that every state
471    /// transition will automatically use this map to convert an arbitrary
472    /// byte to its corresponding equivalence class. In practice this has a
473    /// negligible impact on performance.
474    pub fn byte_classes(&self) -> &ByteClasses {
475        &self.tt.classes
476    }
477
478    /// Returns the memory usage, in bytes, of this DFA.
479    ///
480    /// The memory usage is computed based on the number of bytes used to
481    /// represent this DFA.
482    ///
483    /// This does **not** include the stack size used up by this DFA. To
484    /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
485    pub fn memory_usage(&self) -> usize {
486        self.tt.memory_usage() + self.st.memory_usage()
487    }
488}
489
490/// Routines for converting a sparse DFA to other representations, such as raw
491/// bytes suitable for persistent storage.
492impl<T: AsRef<[u8]>> DFA<T> {
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // N.B. We use native endianness here to make the example work, but
    /// // using to_bytes_little_endian would work on a little endian target.
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // No initial padding is added (see above), so the bytes can be
    /// // handed to DFA::from_bytes as they are.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_little_endian(&self) -> Vec<u8> {
        // Delegate to the endian-generic implementation.
        self.to_bytes::<wire::LE>()
    }
533
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // N.B. We use native endianness here to make the example work, but
    /// // using to_bytes_big_endian would work on a big endian target.
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // No initial padding is added (see above), so the bytes can be
    /// // handed to DFA::from_bytes as they are.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_big_endian(&self) -> Vec<u8> {
        // Delegate to the endian-generic implementation.
        self.to_bytes::<wire::BE>()
    }
574
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// Generally speaking, native endian format should only be used when
    /// you know that the target you're compiling the DFA for matches the
    /// endianness of the target on which you're compiling the DFA. For
    /// example, if serialization and deserialization happen in the same
    /// process or on the same machine. Otherwise, when serializing a DFA for
    /// use in a portable environment, you'll almost certainly want to
    /// serialize _both_ a little endian and a big endian version and then
    /// load the correct one based on the target's configuration.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // No initial padding is added (see above), so the bytes can be
    /// // handed to DFA::from_bytes as they are.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_native_endian(&self) -> Vec<u8> {
        // Delegate to the endian-generic implementation.
        self.to_bytes::<wire::NE>()
    }
622
623    /// The implementation of the public `to_bytes` serialization methods,
624    /// which is generic over endianness.
625    #[cfg(feature = "dfa-build")]
626    fn to_bytes<E: Endian>(&self) -> Vec<u8> {
627        let mut buf = vec![0; self.write_to_len()];
628        // This should always succeed since the only possible serialization
629        // error is providing a buffer that's too small, but we've ensured that
630        // `buf` is big enough here.
631        self.write_to::<E>(&mut buf).unwrap();
632        buf
633    }
634
    /// Serialize this DFA as raw bytes to the given slice, in little endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big enough
    /// to contain the full serialized DFA. If an error occurs, then nothing
    /// is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// // N.B. We use native endianness here to make the example work, but
    /// // using write_to_little_endian would work on a little endian target.
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_little_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Delegate to the endian-generic implementation.
        self.write_to::<wire::LE>(dst)
    }
681
    /// Serialize this DFA as raw bytes to the given slice, in big endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big enough
    /// to contain the full serialized DFA. If an error occurs, then nothing
    /// is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// // N.B. We use native endianness here to make the example work, but
    /// // using write_to_big_endian would work on a big endian target.
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_big_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Delegate to the endian-generic implementation.
        self.write_to::<wire::BE>(dst)
    }
728
    /// Serialize this DFA as raw bytes to the given slice, in native endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Generally speaking, native endian format should only be used when
    /// you know that the target you're compiling the DFA for matches the
    /// endianness of the target on which you're compiling the DFA. For
    /// example, if serialization and deserialization happen in the same
    /// process or on the same machine. Otherwise, when serializing a DFA for
    /// use in a portable environment, you'll almost certainly want to
    /// serialize _both_ a little endian and a big endian version and then
    /// load the correct one based on the target's configuration.
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big enough
    /// to contain the full serialized DFA. If an error occurs, then nothing
    /// is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_native_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Delegate to the endian-generic implementation.
        self.write_to::<wire::NE>(dst)
    }
782
783    /// The implementation of the public `write_to` serialization methods,
784    /// which is generic over endianness.
785    fn write_to<E: Endian>(
786        &self,
787        dst: &mut [u8],
788    ) -> Result<usize, SerializeError> {
789        let mut nw = 0;
790        nw += wire::write_label(LABEL, &mut dst[nw..])?;
791        nw += wire::write_endianness_check::<E>(&mut dst[nw..])?;
792        nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?;
793        nw += {
794            // Currently unused, intended for future flexibility
795            E::write_u32(0, &mut dst[nw..]);
796            size_of::<u32>()
797        };
798        nw += self.flags.write_to::<E>(&mut dst[nw..])?;
799        nw += self.tt.write_to::<E>(&mut dst[nw..])?;
800        nw += self.st.write_to::<E>(&mut dst[nw..])?;
801        nw += self.special.write_to::<E>(&mut dst[nw..])?;
802        nw += self.quitset.write_to::<E>(&mut dst[nw..])?;
803        Ok(nw)
804    }
805
806    /// Return the total number of bytes required to serialize this DFA.
807    ///
808    /// This is useful for determining the size of the buffer required to pass
809    /// to one of the serialization routines:
810    ///
811    /// * [`DFA::write_to_little_endian`]
812    /// * [`DFA::write_to_big_endian`]
813    /// * [`DFA::write_to_native_endian`]
814    ///
815    /// Passing a buffer smaller than the size returned by this method will
816    /// result in a serialization error.
817    ///
818    /// # Example
819    ///
820    /// This example shows how to dynamically allocate enough room to serialize
821    /// a sparse DFA.
822    ///
823    /// ```
824    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
825    ///
826    /// // Compile our original DFA.
827    /// let original_dfa = DFA::new("foo[0-9]+")?;
828    ///
829    /// let mut buf = vec![0; original_dfa.write_to_len()];
830    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
831    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
832    ///
833    /// let expected = Some(HalfMatch::must(0, 8));
834    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
835    /// # Ok::<(), Box<dyn std::error::Error>>(())
836    /// ```
837    pub fn write_to_len(&self) -> usize {
838        wire::write_label_len(LABEL)
839        + wire::write_endianness_check_len()
840        + wire::write_version_len()
841        + size_of::<u32>() // unused, intended for future flexibility
842        + self.flags.write_to_len()
843        + self.tt.write_to_len()
844        + self.st.write_to_len()
845        + self.special.write_to_len()
846        + self.quitset.write_to_len()
847    }
848}
849
850impl<'a> DFA<&'a [u8]> {
851    /// Safely deserialize a sparse DFA with a specific state identifier
852    /// representation. Upon success, this returns both the deserialized DFA
853    /// and the number of bytes read from the given slice. Namely, the contents
854    /// of the slice beyond the DFA are not read.
855    ///
856    /// Deserializing a DFA using this routine will never allocate heap memory.
857    /// For safety purposes, the DFA's transitions will be verified such that
858    /// every transition points to a valid state. If this verification is too
859    /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
860    /// will always execute in constant time.
861    ///
862    /// The bytes given must be generated by one of the serialization APIs
863    /// of a `DFA` using a semver compatible release of this crate. Those
864    /// include:
865    ///
866    /// * [`DFA::to_bytes_little_endian`]
867    /// * [`DFA::to_bytes_big_endian`]
868    /// * [`DFA::to_bytes_native_endian`]
869    /// * [`DFA::write_to_little_endian`]
870    /// * [`DFA::write_to_big_endian`]
871    /// * [`DFA::write_to_native_endian`]
872    ///
873    /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
874    /// `write_to` methods do not allocate and write to an existing slice
875    /// (which may be on the stack). Since deserialization always uses the
876    /// native endianness of the target platform, the serialization API you use
877    /// should match the endianness of the target platform. (It's often a good
878    /// idea to generate serialized DFAs for both forms of endianness and then
879    /// load the correct one based on endianness.)
880    ///
881    /// # Errors
882    ///
883    /// Generally speaking, it's easier to state the conditions in which an
884    /// error is _not_ returned. All of the following must be true:
885    ///
886    /// * The bytes given must be produced by one of the serialization APIs
887    ///   on this DFA, as mentioned above.
888    /// * The endianness of the target platform matches the endianness used to
889    ///   serialize the provided DFA.
890    ///
891    /// If any of the above are not true, then an error will be returned.
892    ///
893    /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse
894    /// DFA has no alignment requirements. That is, an alignment of `1` is
895    /// valid.
896    ///
897    /// # Panics
898    ///
899    /// This routine will never panic for any input.
900    ///
901    /// # Example
902    ///
903    /// This example shows how to serialize a DFA to raw bytes, deserialize it
904    /// and then use it for searching.
905    ///
906    /// ```
907    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
908    ///
909    /// let initial = DFA::new("foo[0-9]+")?;
910    /// let bytes = initial.to_bytes_native_endian();
911    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
912    ///
913    /// let expected = Some(HalfMatch::must(0, 8));
914    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
915    /// # Ok::<(), Box<dyn std::error::Error>>(())
916    /// ```
917    ///
918    /// # Example: loading a DFA from static memory
919    ///
920    /// One use case this library supports is the ability to serialize a
921    /// DFA to disk and then use `include_bytes!` to store it in a compiled
922    /// Rust program. Those bytes can then be cheaply deserialized into a
923    /// `DFA` structure at runtime and used for searching without having to
924    /// re-compile the DFA (which can be quite costly).
925    ///
926    /// We can show this in two parts. The first part is serializing the DFA to
927    /// a file:
928    ///
929    /// ```no_run
930    /// use regex_automata::dfa::sparse::DFA;
931    ///
932    /// let dfa = DFA::new("foo[0-9]+")?;
933    ///
934    /// // Write a big endian serialized version of this DFA to a file.
935    /// let bytes = dfa.to_bytes_big_endian();
936    /// std::fs::write("foo.bigendian.dfa", &bytes)?;
937    ///
938    /// // Do it again, but this time for little endian.
939    /// let bytes = dfa.to_bytes_little_endian();
940    /// std::fs::write("foo.littleendian.dfa", &bytes)?;
941    /// # Ok::<(), Box<dyn std::error::Error>>(())
942    /// ```
943    ///
944    /// And now the second part is embedding the DFA into the compiled program
945    /// and deserializing it at runtime on first use. We use conditional
946    /// compilation to choose the correct endianness. We do not need to employ
947    /// any special tricks to ensure a proper alignment, since a sparse DFA has
948    /// no alignment requirements.
949    ///
950    /// ```no_run
951    /// use regex_automata::{
952    ///     dfa::{Automaton, sparse::DFA},
953    ///     util::lazy::Lazy,
954    ///     HalfMatch, Input,
955    /// };
956    ///
957    /// // This crate provides its own "lazy" type, kind of like
958    /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
959    /// // no-std environments and lets us write this using completely
960    /// // safe code.
961    /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
962    ///     # const _: &str = stringify! {
963    ///     #[cfg(target_endian = "big")]
964    ///     static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
965    ///     #[cfg(target_endian = "little")]
966    ///     static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
967    ///     # };
968    ///     # static BYTES: &[u8] = b"";
969    ///
970    ///     let (dfa, _) = DFA::from_bytes(BYTES)
971    ///         .expect("serialized DFA should be valid");
972    ///     dfa
973    /// });
974    ///
975    /// let expected = Ok(Some(HalfMatch::must(0, 8)));
976    /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
977    /// ```
978    ///
979    /// Alternatively, consider using
980    /// [`lazy_static`](https://crates.io/crates/lazy_static)
981    /// or
982    /// [`once_cell`](https://crates.io/crates/once_cell),
983    /// which will guarantee safety for you.
984    pub fn from_bytes(
985        slice: &'a [u8],
986    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
987        // SAFETY: This is safe because we validate both the sparse transitions
988        // (by trying to decode every state) and start state ID list below. If
989        // either validation fails, then we return an error.
990        let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
991        let seen = dfa.tt.validate(&dfa.special)?;
992        dfa.st.validate(&dfa.special, &seen)?;
993        // N.B. dfa.special doesn't have a way to do unchecked deserialization,
994        // so it has already been validated.
995        Ok((dfa, nread))
996    }
997
998    /// Deserialize a DFA with a specific state identifier representation in
999    /// constant time by omitting the verification of the validity of the
1000    /// sparse transitions.
1001    ///
1002    /// This is just like [`DFA::from_bytes`], except it can potentially return
1003    /// a DFA that exhibits undefined behavior if its transitions contain
1004    /// invalid state identifiers.
1005    ///
1006    /// This routine is useful if you need to deserialize a DFA cheaply and
1007    /// cannot afford the transition validation performed by `from_bytes`.
1008    ///
1009    /// # Safety
1010    ///
1011    /// This routine is not safe because it permits callers to provide
1012    /// arbitrary transitions with possibly incorrect state identifiers. While
1013    /// the various serialization routines will never return an incorrect
1014    /// DFA, there is no guarantee that the bytes provided here are correct.
1015    /// While `from_bytes_unchecked` will still do several forms of basic
1016    /// validation, this routine does not check that the transitions themselves
1017    /// are correct. Given an incorrect transition table, it is possible for
1018    /// the search routines to access out-of-bounds memory because of explicit
1019    /// bounds check elision.
1020    ///
1021    /// # Example
1022    ///
1023    /// ```
1024    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
1025    ///
1026    /// let initial = DFA::new("foo[0-9]+")?;
1027    /// let bytes = initial.to_bytes_native_endian();
1028    /// // SAFETY: This is guaranteed to be safe since the bytes given come
1029    /// // directly from a compatible serialization routine.
1030    /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
1031    ///
1032    /// let expected = Some(HalfMatch::must(0, 8));
1033    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
1034    /// # Ok::<(), Box<dyn std::error::Error>>(())
1035    /// ```
1036    pub unsafe fn from_bytes_unchecked(
1037        slice: &'a [u8],
1038    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
1039        let mut nr = 0;
1040
1041        nr += wire::read_label(&slice[nr..], LABEL)?;
1042        nr += wire::read_endianness_check(&slice[nr..])?;
1043        nr += wire::read_version(&slice[nr..], VERSION)?;
1044
1045        let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
1046        nr += size_of::<u32>();
1047
1048        let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
1049        nr += nread;
1050
1051        let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
1052        nr += nread;
1053
1054        let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
1055        nr += nread;
1056
1057        let (special, nread) = Special::from_bytes(&slice[nr..])?;
1058        nr += nread;
1059        if special.max.as_usize() >= tt.sparse().len() {
1060            return Err(DeserializeError::generic(
1061                "max should not be greater than or equal to sparse bytes",
1062            ));
1063        }
1064
1065        let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
1066        nr += nread;
1067
1068        // Prefilters don't support serialization, so they're always absent.
1069        let pre = None;
1070        Ok((DFA { tt, st, special, pre, quitset, flags }, nr))
1071    }
1072}
1073
1074/// Other routines that work for all `T`.
1075impl<T> DFA<T> {
1076    /// Set or unset the prefilter attached to this DFA.
1077    ///
1078    /// This is useful when one has deserialized a DFA from `&[u8]`.
1079    /// Deserialization does not currently include prefilters, so if you
1080    /// want prefilter acceleration, you'll need to rebuild it and attach
1081    /// it here.
1082    pub fn set_prefilter(&mut self, prefilter: Option<Prefilter>) {
1083        self.pre = prefilter
1084    }
1085}
1086
1087impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
1088    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1089        writeln!(f, "sparse::DFA(")?;
1090        for state in self.tt.states() {
1091            fmt_state_indicator(f, self, state.id())?;
1092            writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?;
1093        }
1094        writeln!(f, "")?;
1095        for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() {
1096            if i % self.st.stride == 0 {
1097                match anchored {
1098                    Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
1099                    Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
1100                    Anchored::Pattern(pid) => writeln!(
1101                        f,
1102                        "START_GROUP(pattern: {:?})",
1103                        pid.as_usize()
1104                    )?,
1105                }
1106            }
1107            writeln!(f, "  {:?} => {:06?}", sty, start_id.as_usize())?;
1108        }
1109        writeln!(f, "state length: {:?}", self.tt.state_len)?;
1110        writeln!(f, "pattern length: {:?}", self.pattern_len())?;
1111        writeln!(f, "flags: {:?}", self.flags)?;
1112        writeln!(f, ")")?;
1113        Ok(())
1114    }
1115}
1116
1117// SAFETY: We assert that our implementation of each method is correct.
1118unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
1119    #[inline]
1120    fn is_special_state(&self, id: StateID) -> bool {
1121        self.special.is_special_state(id)
1122    }
1123
1124    #[inline]
1125    fn is_dead_state(&self, id: StateID) -> bool {
1126        self.special.is_dead_state(id)
1127    }
1128
1129    #[inline]
1130    fn is_quit_state(&self, id: StateID) -> bool {
1131        self.special.is_quit_state(id)
1132    }
1133
1134    #[inline]
1135    fn is_match_state(&self, id: StateID) -> bool {
1136        self.special.is_match_state(id)
1137    }
1138
1139    #[inline]
1140    fn is_start_state(&self, id: StateID) -> bool {
1141        self.special.is_start_state(id)
1142    }
1143
1144    #[inline]
1145    fn is_accel_state(&self, id: StateID) -> bool {
1146        self.special.is_accel_state(id)
1147    }
1148
1149    // This is marked as inline to help dramatically boost sparse searching,
1150    // which decodes each state it enters to follow the next transition.
1151    #[cfg_attr(feature = "perf-inline", inline(always))]
1152    fn next_state(&self, current: StateID, input: u8) -> StateID {
1153        let input = self.tt.classes.get(input);
1154        self.tt.state(current).next(input)
1155    }
1156
1157    #[inline]
1158    unsafe fn next_state_unchecked(
1159        &self,
1160        current: StateID,
1161        input: u8,
1162    ) -> StateID {
1163        self.next_state(current, input)
1164    }
1165
1166    #[inline]
1167    fn next_eoi_state(&self, current: StateID) -> StateID {
1168        self.tt.state(current).next_eoi()
1169    }
1170
1171    #[inline]
1172    fn pattern_len(&self) -> usize {
1173        self.tt.pattern_len
1174    }
1175
1176    #[inline]
1177    fn match_len(&self, id: StateID) -> usize {
1178        self.tt.state(id).pattern_len()
1179    }
1180
1181    #[inline]
1182    fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
1183        // This is an optimization for the very common case of a DFA with a
1184        // single pattern. This conditional avoids a somewhat more costly path
1185        // that finds the pattern ID from the state machine, which requires
1186        // a bit of slicing/pointer-chasing. This optimization tends to only
1187        // matter when matches are frequent.
1188        if self.tt.pattern_len == 1 {
1189            return PatternID::ZERO;
1190        }
1191        self.tt.state(id).pattern_id(match_index)
1192    }
1193
1194    #[inline]
1195    fn has_empty(&self) -> bool {
1196        self.flags.has_empty
1197    }
1198
1199    #[inline]
1200    fn is_utf8(&self) -> bool {
1201        self.flags.is_utf8
1202    }
1203
1204    #[inline]
1205    fn is_always_start_anchored(&self) -> bool {
1206        self.flags.is_always_start_anchored
1207    }
1208
1209    #[inline]
1210    fn start_state(
1211        &self,
1212        config: &start::Config,
1213    ) -> Result<StateID, StartError> {
1214        let anchored = config.get_anchored();
1215        let start = match config.get_look_behind() {
1216            None => Start::Text,
1217            Some(byte) => {
1218                if !self.quitset.is_empty() && self.quitset.contains(byte) {
1219                    return Err(StartError::quit(byte));
1220                }
1221                self.st.start_map.get(byte)
1222            }
1223        };
1224        self.st.start(anchored, start)
1225    }
1226
1227    #[inline]
1228    fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
1229        match mode {
1230            Anchored::No => self.st.universal_start_unanchored,
1231            Anchored::Yes => self.st.universal_start_anchored,
1232            Anchored::Pattern(_) => None,
1233        }
1234    }
1235
1236    #[inline]
1237    fn accelerator(&self, id: StateID) -> &[u8] {
1238        self.tt.state(id).accelerator()
1239    }
1240
1241    #[inline]
1242    fn get_prefilter(&self) -> Option<&Prefilter> {
1243        self.pre.as_ref()
1244    }
1245}
1246
1247/// The transition table portion of a sparse DFA.
1248///
1249/// The transition table is the core part of the DFA in that it describes how
1250/// to move from one state to another based on the input sequence observed.
1251///
1252/// Unlike a typical dense table based DFA, states in a sparse transition
1253/// table have variable size. That is, states with more transitions use more
1254/// space than states with fewer transitions. This means that finding the next
1255/// transition takes more work than with a dense DFA, but also typically uses
1256/// much less space.
1257#[derive(Clone)]
1258struct Transitions<T> {
1259    /// The raw encoding of each state in this DFA.
1260    ///
1261    /// Each state has the following information:
1262    ///
1263    /// * A set of transitions to subsequent states. Transitions to the dead
1264    ///   state are omitted.
1265    /// * If the state can be accelerated, then any additional accelerator
1266    ///   information.
1267    /// * If the state is a match state, then the state contains all pattern
1268    ///   IDs that match when in that state.
1269    ///
1270    /// To decode a state, use Transitions::state.
1271    ///
1272    /// In practice, T is either Vec<u8> or &[u8].
1273    sparse: T,
1274    /// A set of equivalence classes, where a single equivalence class
1275    /// represents a set of bytes that never discriminate between a match
1276    /// and a non-match in the DFA. Each equivalence class corresponds to a
1277    /// single character in this DFA's alphabet, where the maximum number of
1278    /// characters is 257 (each possible value of a byte plus the special
1279    /// EOI transition). Consequently, the number of equivalence classes
1280    /// corresponds to the number of transitions for each DFA state. Note
1281    /// though that the *space* used by each DFA state in the transition table
1282    /// may be larger. The total space used by each DFA state is known as the
1283    /// stride and is documented above.
1284    ///
1285    /// The only time the number of equivalence classes is fewer than 257 is
1286    /// if the DFA's kind uses byte classes which is the default. Equivalence
1287    /// classes should generally only be disabled when debugging, so that
1288    /// the transitions themselves aren't obscured. Disabling them has no
1289    /// other benefit, since the equivalence class map is always used while
1290    /// searching. In the vast majority of cases, the number of equivalence
1291    /// classes is substantially smaller than 257, particularly when large
1292    /// Unicode classes aren't used.
1293    ///
1294    /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
1295    /// in the current implementation, since equivalence classes generally tend
1296    /// to correspond to continuous ranges of bytes that map to the same
1297    /// transition. So in a sparse DFA, equivalence classes don't really lead
1298    /// to a space savings. In the future, it would be good to try and remove
1299    /// them from sparse DFAs entirely, but requires a bit of work since sparse
1300    /// DFAs are built from dense DFAs, which are in turn built on top of
1301    /// equivalence classes.
1302    classes: ByteClasses,
1303    /// The total number of states in this DFA. Note that a DFA always has at
1304    /// least one state---the dead state---even the empty DFA. In particular,
1305    /// the dead state always has ID 0 and is correspondingly always the first
1306    /// state. The dead state is never a match state.
1307    state_len: usize,
1308    /// The total number of unique patterns represented by these match states.
1309    pattern_len: usize,
1310}
1311
1312impl<'a> Transitions<&'a [u8]> {
1313    unsafe fn from_bytes_unchecked(
1314        mut slice: &'a [u8],
1315    ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
1316        let slice_start = slice.as_ptr().as_usize();
1317
1318        let (state_len, nr) =
1319            wire::try_read_u32_as_usize(&slice, "state length")?;
1320        slice = &slice[nr..];
1321
1322        let (pattern_len, nr) =
1323            wire::try_read_u32_as_usize(&slice, "pattern length")?;
1324        slice = &slice[nr..];
1325
1326        let (classes, nr) = ByteClasses::from_bytes(&slice)?;
1327        slice = &slice[nr..];
1328
1329        let (len, nr) =
1330            wire::try_read_u32_as_usize(&slice, "sparse transitions length")?;
1331        slice = &slice[nr..];
1332
1333        wire::check_slice_len(slice, len, "sparse states byte length")?;
1334        let sparse = &slice[..len];
1335        slice = &slice[len..];
1336
1337        let trans = Transitions { sparse, classes, state_len, pattern_len };
1338        Ok((trans, slice.as_ptr().as_usize() - slice_start))
1339    }
1340}
1341
1342impl<T: AsRef<[u8]>> Transitions<T> {
1343    /// Writes a serialized form of this transition table to the buffer given.
1344    /// If the buffer is too small, then an error is returned. To determine
1345    /// how big the buffer must be, use `write_to_len`.
1346    fn write_to<E: Endian>(
1347        &self,
1348        mut dst: &mut [u8],
1349    ) -> Result<usize, SerializeError> {
1350        let nwrite = self.write_to_len();
1351        if dst.len() < nwrite {
1352            return Err(SerializeError::buffer_too_small(
1353                "sparse transition table",
1354            ));
1355        }
1356        dst = &mut dst[..nwrite];
1357
1358        // write state length
1359        E::write_u32(u32::try_from(self.state_len).unwrap(), dst);
1360        dst = &mut dst[size_of::<u32>()..];
1361
1362        // write pattern length
1363        E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst);
1364        dst = &mut dst[size_of::<u32>()..];
1365
1366        // write byte class map
1367        let n = self.classes.write_to(dst)?;
1368        dst = &mut dst[n..];
1369
1370        // write number of bytes in sparse transitions
1371        E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
1372        dst = &mut dst[size_of::<u32>()..];
1373
1374        // write actual transitions
1375        let mut id = DEAD;
1376        while id.as_usize() < self.sparse().len() {
1377            let state = self.state(id);
1378            let n = state.write_to::<E>(&mut dst)?;
1379            dst = &mut dst[n..];
1380            // The next ID is the offset immediately following `state`.
1381            id = StateID::new(id.as_usize() + state.write_to_len()).unwrap();
1382        }
1383        Ok(nwrite)
1384    }
1385
1386    /// Returns the number of bytes the serialized form of this transition
1387    /// table will use.
1388    fn write_to_len(&self) -> usize {
1389        size_of::<u32>()   // state length
1390        + size_of::<u32>() // pattern length
1391        + self.classes.write_to_len()
1392        + size_of::<u32>() // sparse transitions length
1393        + self.sparse().len()
1394    }
1395
1396    /// Validates that every state ID in this transition table is valid.
1397    ///
1398    /// That is, every state ID can be used to correctly index a state in this
1399    /// table.
1400    fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> {
1401        let mut verified = Seen::new();
1402        // We need to make sure that we decode the correct number of states.
1403        // Otherwise, an empty set of transitions would validate even if the
1404        // recorded state length is non-empty.
1405        let mut len = 0;
1406        // We can't use the self.states() iterator because it assumes the state
1407        // encodings are valid. It could panic if they aren't.
1408        let mut id = DEAD;
1409        while id.as_usize() < self.sparse().len() {
1410            // Before we even decode the state, we check that the ID itself
1411            // is well formed. That is, if it's a special state then it must
1412            // actually be a quit, dead, accel, match or start state.
1413            if sp.is_special_state(id) {
1414                let is_actually_special = sp.is_dead_state(id)
1415                    || sp.is_quit_state(id)
1416                    || sp.is_match_state(id)
1417                    || sp.is_start_state(id)
1418                    || sp.is_accel_state(id);
1419                if !is_actually_special {
1420                    // This is kind of a cryptic error message...
1421                    return Err(DeserializeError::generic(
1422                        "found sparse state tagged as special but \
1423                         wasn't actually special",
1424                    ));
1425                }
1426            }
1427            let state = self.try_state(sp, id)?;
1428            verified.insert(id);
1429            // The next ID should be the offset immediately following `state`.
1430            id = StateID::new(wire::add(
1431                id.as_usize(),
1432                state.write_to_len(),
1433                "next state ID offset",
1434            )?)
1435            .map_err(|err| {
1436                DeserializeError::state_id_error(err, "next state ID offset")
1437            })?;
1438            len += 1;
1439        }
1440        // Now that we've checked that all top-level states are correct and
1441        // importantly, collected a set of valid state IDs, we have all the
1442        // information we need to check that all transitions are correct too.
1443        //
1444        // Note that we can't use `valid_ids` to iterate because it will
1445        // be empty in no-std no-alloc contexts. (And yes, that means our
1446        // verification isn't quite as good.) We can use `self.states()`
1447        // though at least, since we know that all states can at least be
1448        // decoded and traversed correctly.
1449        for state in self.states() {
1450            // Check that all transitions in this state are correct.
1451            for i in 0..state.ntrans {
1452                let to = state.next_at(i);
1453                // For no-alloc, we just check that the state can decode. It is
1454                // technically possible that the state ID could still point to
1455                // a non-existent state even if it decodes (fuzzing proved this
1456                // to be true), but it shouldn't result in any memory unsafety
1457                // or panics in non-debug mode.
1458                #[cfg(not(feature = "alloc"))]
1459                {
1460                    let _ = self.try_state(sp, to)?;
1461                }
1462                #[cfg(feature = "alloc")]
1463                {
1464                    if !verified.contains(&to) {
1465                        return Err(DeserializeError::generic(
1466                            "found transition that points to a \
1467                             non-existent state",
1468                        ));
1469                    }
1470                }
1471            }
1472        }
1473        if len != self.state_len {
1474            return Err(DeserializeError::generic(
1475                "mismatching sparse state length",
1476            ));
1477        }
1478        Ok(verified)
1479    }
1480
1481    /// Converts these transitions to a borrowed value.
1482    fn as_ref(&self) -> Transitions<&'_ [u8]> {
1483        Transitions {
1484            sparse: self.sparse(),
1485            classes: self.classes.clone(),
1486            state_len: self.state_len,
1487            pattern_len: self.pattern_len,
1488        }
1489    }
1490
1491    /// Converts these transitions to an owned value.
1492    #[cfg(feature = "alloc")]
1493    fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> {
1494        Transitions {
1495            sparse: self.sparse().to_vec(),
1496            classes: self.classes.clone(),
1497            state_len: self.state_len,
1498            pattern_len: self.pattern_len,
1499        }
1500    }
1501
1502    /// Return a convenient representation of the given state.
1503    ///
1504    /// This panics if the state is invalid.
1505    ///
1506    /// This is marked as inline to help dramatically boost sparse searching,
1507    /// which decodes each state it enters to follow the next transition. Other
1508    /// functions involved are also inlined, which should hopefully eliminate
1509    /// a lot of the extraneous decoding that is never needed just to follow
1510    /// the next transition.
1511    #[cfg_attr(feature = "perf-inline", inline(always))]
1512    fn state(&self, id: StateID) -> State<'_> {
1513        let mut state = &self.sparse()[id.as_usize()..];
1514        let mut ntrans = wire::read_u16(&state).as_usize();
1515        let is_match = (1 << 15) & ntrans != 0;
1516        ntrans &= !(1 << 15);
1517        state = &state[2..];
1518
1519        let (input_ranges, state) = state.split_at(ntrans * 2);
1520        let (next, state) = state.split_at(ntrans * StateID::SIZE);
1521        let (pattern_ids, state) = if is_match {
1522            let npats = wire::read_u32(&state).as_usize();
1523            state[4..].split_at(npats * 4)
1524        } else {
1525            (&[][..], state)
1526        };
1527
1528        let accel_len = usize::from(state[0]);
1529        let accel = &state[1..accel_len + 1];
1530        State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
1531    }
1532
1533    /// Like `state`, but will return an error if the state encoding is
1534    /// invalid. This is useful for verifying states after deserialization,
1535    /// which is required for a safe deserialization API.
1536    ///
1537    /// Note that this only verifies that this state is decodable and that
1538    /// all of its data is consistent. It does not verify that its state ID
1539    /// transitions point to valid states themselves, nor does it verify that
1540    /// every pattern ID is valid.
1541    fn try_state(
1542        &self,
1543        sp: &Special,
1544        id: StateID,
1545    ) -> Result<State<'_>, DeserializeError> {
1546        if id.as_usize() > self.sparse().len() {
1547            return Err(DeserializeError::generic(
1548                "invalid caller provided sparse state ID",
1549            ));
1550        }
1551        let mut state = &self.sparse()[id.as_usize()..];
1552        // Encoding format starts with a u16 that stores the total number of
1553        // transitions in this state.
1554        let (mut ntrans, _) =
1555            wire::try_read_u16_as_usize(state, "state transition length")?;
1556        let is_match = ((1 << 15) & ntrans) != 0;
1557        ntrans &= !(1 << 15);
1558        state = &state[2..];
1559        if ntrans > 257 || ntrans == 0 {
1560            return Err(DeserializeError::generic(
1561                "invalid transition length",
1562            ));
1563        }
1564        if is_match && !sp.is_match_state(id) {
1565            return Err(DeserializeError::generic(
1566                "state marked as match but not in match ID range",
1567            ));
1568        } else if !is_match && sp.is_match_state(id) {
1569            return Err(DeserializeError::generic(
1570                "state in match ID range but not marked as match state",
1571            ));
1572        }
1573
1574        // Each transition has two pieces: an inclusive range of bytes on which
1575        // it is defined, and the state ID that those bytes transition to. The
1576        // pairs come first, followed by a corresponding sequence of state IDs.
1577        let input_ranges_len = ntrans.checked_mul(2).unwrap();
1578        wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
1579        let (input_ranges, state) = state.split_at(input_ranges_len);
1580        // Every range should be of the form A-B, where A<=B.
1581        for pair in input_ranges.chunks(2) {
1582            let (start, end) = (pair[0], pair[1]);
1583            if start > end {
1584                return Err(DeserializeError::generic("invalid input range"));
1585            }
1586        }
1587
1588        // And now extract the corresponding sequence of state IDs. We leave
1589        // this sequence as a &[u8] instead of a &[S] because sparse DFAs do
1590        // not have any alignment requirements.
1591        let next_len = ntrans
1592            .checked_mul(self.id_len())
1593            .expect("state size * #trans should always fit in a usize");
1594        wire::check_slice_len(state, next_len, "sparse trans state IDs")?;
1595        let (next, state) = state.split_at(next_len);
1596        // We can at least verify that every state ID is in bounds.
1597        for idbytes in next.chunks(self.id_len()) {
1598            let (id, _) =
1599                wire::read_state_id(idbytes, "sparse state ID in try_state")?;
1600            wire::check_slice_len(
1601                self.sparse(),
1602                id.as_usize(),
1603                "invalid sparse state ID",
1604            )?;
1605        }
1606
1607        // If this is a match state, then read the pattern IDs for this state.
1608        // Pattern IDs is a u32-length prefixed sequence of native endian
1609        // encoded 32-bit integers.
1610        let (pattern_ids, state) = if is_match {
1611            let (npats, nr) =
1612                wire::try_read_u32_as_usize(state, "pattern ID length")?;
1613            let state = &state[nr..];
1614            if npats == 0 {
1615                return Err(DeserializeError::generic(
1616                    "state marked as a match, but pattern length is zero",
1617                ));
1618            }
1619
1620            let pattern_ids_len =
1621                wire::mul(npats, 4, "sparse pattern ID byte length")?;
1622            wire::check_slice_len(
1623                state,
1624                pattern_ids_len,
1625                "sparse pattern IDs",
1626            )?;
1627            let (pattern_ids, state) = state.split_at(pattern_ids_len);
1628            for patbytes in pattern_ids.chunks(PatternID::SIZE) {
1629                wire::read_pattern_id(
1630                    patbytes,
1631                    "sparse pattern ID in try_state",
1632                )?;
1633            }
1634            (pattern_ids, state)
1635        } else {
1636            (&[][..], state)
1637        };
1638        if is_match && pattern_ids.is_empty() {
1639            return Err(DeserializeError::generic(
1640                "state marked as a match, but has no pattern IDs",
1641            ));
1642        }
1643        if sp.is_match_state(id) && pattern_ids.is_empty() {
1644            return Err(DeserializeError::generic(
1645                "state marked special as a match, but has no pattern IDs",
1646            ));
1647        }
1648        if sp.is_match_state(id) != is_match {
1649            return Err(DeserializeError::generic(
1650                "whether state is a match or not is inconsistent",
1651            ));
1652        }
1653
1654        // Now read this state's accelerator info. The first byte is the length
1655        // of the accelerator, which is typically 0 (for no acceleration) but
1656        // is no bigger than 3. The length indicates the number of bytes that
1657        // follow, where each byte corresponds to a transition out of this
1658        // state.
1659        if state.is_empty() {
1660            return Err(DeserializeError::generic("no accelerator length"));
1661        }
1662        let (accel_len, state) = (usize::from(state[0]), &state[1..]);
1663
1664        if accel_len > 3 {
1665            return Err(DeserializeError::generic(
1666                "sparse invalid accelerator length",
1667            ));
1668        } else if accel_len == 0 && sp.is_accel_state(id) {
1669            return Err(DeserializeError::generic(
1670                "got no accelerators in state, but in accelerator ID range",
1671            ));
1672        } else if accel_len > 0 && !sp.is_accel_state(id) {
1673            return Err(DeserializeError::generic(
1674                "state in accelerator ID range, but has no accelerators",
1675            ));
1676        }
1677
1678        wire::check_slice_len(
1679            state,
1680            accel_len,
1681            "sparse corrupt accelerator length",
1682        )?;
1683        let (accel, _) = (&state[..accel_len], &state[accel_len..]);
1684
1685        let state = State {
1686            id,
1687            is_match,
1688            ntrans,
1689            input_ranges,
1690            next,
1691            pattern_ids,
1692            accel,
1693        };
1694        if sp.is_quit_state(state.next_at(state.ntrans - 1)) {
1695            return Err(DeserializeError::generic(
1696                "state with EOI transition to quit state is illegal",
1697            ));
1698        }
1699        Ok(state)
1700    }
1701
1702    /// Return an iterator over all of the states in this DFA.
1703    ///
1704    /// The iterator returned yields tuples, where the first element is the
1705    /// state ID and the second element is the state itself.
1706    fn states(&self) -> StateIter<'_, T> {
1707        StateIter { trans: self, id: DEAD.as_usize() }
1708    }
1709
1710    /// Returns the sparse transitions as raw bytes.
1711    fn sparse(&self) -> &[u8] {
1712        self.sparse.as_ref()
1713    }
1714
1715    /// Returns the number of bytes represented by a single state ID.
1716    fn id_len(&self) -> usize {
1717        StateID::SIZE
1718    }
1719
1720    /// Return the memory usage, in bytes, of these transitions.
1721    ///
1722    /// This does not include the size of a `Transitions` value itself.
1723    fn memory_usage(&self) -> usize {
1724        self.sparse().len()
1725    }
1726}
1727
#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> Transitions<T> {
    /// Decode the state found at `id` into a view whose pieces can be
    /// modified in place.
    ///
    /// No validation is performed, so this panics if the bytes at `id` do
    /// not form a complete state.
    fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
        // The high bit of the leading u16 flags a match state. The remaining
        // bits hold the number of transitions.
        const MATCH_BIT: usize = 1 << 15;

        let bytes = &mut self.sparse_mut()[id.as_usize()..];
        let header = wire::read_u16(&*bytes).as_usize();
        let is_match = (header & MATCH_BIT) != 0;
        let ntrans = header & !MATCH_BIT;
        let rest = &mut bytes[2..];

        // Two bytes per input range, then one state ID per transition.
        let (input_ranges, rest) = rest.split_at_mut(ntrans * 2);
        let (next, rest) = rest.split_at_mut(ntrans * StateID::SIZE);
        // Only match states carry a u32 length prefixed list of pattern IDs.
        let (pattern_ids, rest) = if is_match {
            let npats = wire::read_u32(&*rest).as_usize();
            rest[4..].split_at_mut(npats * 4)
        } else {
            (&mut [][..], rest)
        };

        // A single length byte, followed by that many accelerator bytes.
        let accel_len = usize::from(rest[0]);
        let accel = &mut rest[1..accel_len + 1];
        StateMut {
            id,
            is_match,
            ntrans,
            input_ranges,
            next,
            pattern_ids,
            accel,
        }
    }

    /// Exposes the encoded sparse transition table as a mutable byte slice.
    fn sparse_mut(&mut self) -> &mut [u8] {
        AsMut::<[u8]>::as_mut(&mut self.sparse)
    }
}
1766
/// The set of all possible starting states in a DFA.
///
/// See the eponymous type in the `dense` module for more details. This type
/// is very similar to `dense::StartTable`, except that its underlying
/// representation is raw bytes (`&[u8]` or `Vec<u8>`) instead of a slice of
/// state IDs. (The latter would require sparse DFAs to be aligned, which is
/// explicitly something we do not require because we don't really need it.)
#[derive(Clone)]
struct StartTable<T> {
    /// The initial start state IDs as a contiguous table of native endian
    /// encoded integers, each occupying `StateID::SIZE` bytes.
    ///
    /// In practice, T is either Vec<u8> or &[u8] and has no alignment
    /// requirements.
    ///
    /// The first `2 * stride` entries always correspond to the start states
    /// for the entire DFA, with the first `stride` entries being for
    /// unanchored searches and the next `stride` entries being for anchored
    /// searches. To keep things simple, both groups are always present even
    /// if the `StartKind` is not both.
    ///
    /// After that, there are `stride * patterns` state IDs, where `patterns`
    /// may be zero in the case of a DFA with no patterns or in the case where
    /// the DFA was built without enabling starting states for each pattern.
    table: T,
    /// The starting state configuration supported. When 'both', both
    /// unanchored and anchored searches work. When 'unanchored', asking for
    /// an anchored start state is reported as an error. When 'anchored',
    /// asking for an unanchored start state is reported as an error.
    kind: StartKind,
    /// The start state configuration for every possible byte.
    start_map: StartByteMap,
    /// The number of starting state IDs per pattern. The same number of
    /// entries is used for each of the unanchored and anchored groups.
    stride: usize,
    /// The total number of patterns for which starting states are encoded.
    /// This is `None` for DFAs that were built without start states for each
    /// pattern. Thus, one cannot use this field to say how many patterns
    /// are in the DFA in all cases. It is specific to how many patterns are
    /// represented in this start table.
    pattern_len: Option<usize>,
    /// The universal starting state for unanchored searches. This is only
    /// present when the DFA supports unanchored searches and when all starting
    /// state IDs for an unanchored search are equivalent.
    universal_start_unanchored: Option<StateID>,
    /// The universal starting state for anchored searches. This is only
    /// present when the DFA supports anchored searches and when all starting
    /// state IDs for an anchored search are equivalent.
    universal_start_anchored: Option<StateID>,
}
1815
#[cfg(feature = "dfa-build")]
impl StartTable<Vec<u8>> {
    /// Creates a zero-filled start table sized for the given dense DFA.
    ///
    /// The table has room for `2 * stride` start states covering the whole
    /// DFA plus `stride` start states for each of the `pattern_len` patterns
    /// (none when `pattern_len` is `None`). The start kind, start byte map
    /// and universal start states are copied from `dfa`.
    fn new<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        pattern_len: Option<usize>,
    ) -> StartTable<Vec<u8>> {
        let stride = Start::len();
        // This is OK since the only way we're here is if a dense DFA could be
        // constructed successfully, which uses the same space.
        let per_pattern =
            stride.checked_mul(pattern_len.unwrap_or(0)).unwrap();
        let whole_dfa = stride.checked_mul(2).unwrap();
        let id_count = per_pattern.checked_add(whole_dfa).unwrap();
        let byte_len = id_count.checked_mul(StateID::SIZE).unwrap();

        let table = vec![0; byte_len];
        let kind = dfa.start_kind();
        let start_map = dfa.start_map().clone();
        let universal_start_unanchored =
            dfa.universal_start_state(Anchored::No);
        let universal_start_anchored =
            dfa.universal_start_state(Anchored::Yes);
        StartTable {
            table,
            kind,
            start_map,
            stride,
            pattern_len,
            universal_start_unanchored,
            universal_start_anchored,
        }
    }

    /// Builds a sparse start table from the start states of a dense DFA.
    ///
    /// Each dense start state ID is translated through `remap`, which is
    /// indexed by the dense state's index, to obtain the corresponding sparse
    /// state ID.
    fn from_dense_dfa<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        remap: &[StateID],
    ) -> Result<StartTable<Vec<u8>>, BuildError> {
        // Unless the DFA has start states compiled for each pattern, then
        // as far as the starting state table is concerned, there are zero
        // patterns to account for. It will instead only store starting states
        // for the entire DFA.
        let start_pattern_len =
            dfa.starts_for_each_pattern().then(|| dfa.pattern_len());
        let mut table = Self::new(dfa, start_pattern_len);
        for (dense_id, anchored, start_type) in dfa.starts() {
            let sparse_id = remap[dfa.to_index(dense_id)];
            table.set_start(anchored, start_type, sparse_id);
        }
        Ok(table)
    }
}
1865
1866impl<'a> StartTable<&'a [u8]> {
1867    unsafe fn from_bytes_unchecked(
1868        mut slice: &'a [u8],
1869    ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
1870        let slice_start = slice.as_ptr().as_usize();
1871
1872        let (kind, nr) = StartKind::from_bytes(slice)?;
1873        slice = &slice[nr..];
1874
1875        let (start_map, nr) = StartByteMap::from_bytes(slice)?;
1876        slice = &slice[nr..];
1877
1878        let (stride, nr) =
1879            wire::try_read_u32_as_usize(slice, "sparse start table stride")?;
1880        slice = &slice[nr..];
1881        if stride != Start::len() {
1882            return Err(DeserializeError::generic(
1883                "invalid sparse starting table stride",
1884            ));
1885        }
1886
1887        let (maybe_pattern_len, nr) =
1888            wire::try_read_u32_as_usize(slice, "sparse start table patterns")?;
1889        slice = &slice[nr..];
1890        let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX {
1891            None
1892        } else {
1893            Some(maybe_pattern_len)
1894        };
1895        if pattern_len.map_or(false, |len| len > PatternID::LIMIT) {
1896            return Err(DeserializeError::generic(
1897                "sparse invalid number of patterns",
1898            ));
1899        }
1900
1901        let (universal_unanchored, nr) =
1902            wire::try_read_u32(slice, "universal unanchored start")?;
1903        slice = &slice[nr..];
1904        let universal_start_unanchored = if universal_unanchored == u32::MAX {
1905            None
1906        } else {
1907            Some(StateID::try_from(universal_unanchored).map_err(|e| {
1908                DeserializeError::state_id_error(
1909                    e,
1910                    "universal unanchored start",
1911                )
1912            })?)
1913        };
1914
1915        let (universal_anchored, nr) =
1916            wire::try_read_u32(slice, "universal anchored start")?;
1917        slice = &slice[nr..];
1918        let universal_start_anchored = if universal_anchored == u32::MAX {
1919            None
1920        } else {
1921            Some(StateID::try_from(universal_anchored).map_err(|e| {
1922                DeserializeError::state_id_error(e, "universal anchored start")
1923            })?)
1924        };
1925
1926        let pattern_table_size = wire::mul(
1927            stride,
1928            pattern_len.unwrap_or(0),
1929            "sparse invalid pattern length",
1930        )?;
1931        // Our start states always start with a single stride of start states
1932        // for the entire automaton which permit it to match any pattern. What
1933        // follows it are an optional set of start states for each pattern.
1934        let start_state_len = wire::add(
1935            wire::mul(2, stride, "start state stride too big")?,
1936            pattern_table_size,
1937            "sparse invalid 'any' pattern starts size",
1938        )?;
1939        let table_bytes_len = wire::mul(
1940            start_state_len,
1941            StateID::SIZE,
1942            "sparse pattern table bytes length",
1943        )?;
1944        wire::check_slice_len(
1945            slice,
1946            table_bytes_len,
1947            "sparse start ID table",
1948        )?;
1949        let table = &slice[..table_bytes_len];
1950        slice = &slice[table_bytes_len..];
1951
1952        let sl = StartTable {
1953            table,
1954            kind,
1955            start_map,
1956            stride,
1957            pattern_len,
1958            universal_start_unanchored,
1959            universal_start_anchored,
1960        };
1961        Ok((sl, slice.as_ptr().as_usize() - slice_start))
1962    }
1963}
1964
1965impl<T: AsRef<[u8]>> StartTable<T> {
1966    fn write_to<E: Endian>(
1967        &self,
1968        mut dst: &mut [u8],
1969    ) -> Result<usize, SerializeError> {
1970        let nwrite = self.write_to_len();
1971        if dst.len() < nwrite {
1972            return Err(SerializeError::buffer_too_small(
1973                "sparse starting table ids",
1974            ));
1975        }
1976        dst = &mut dst[..nwrite];
1977
1978        // write start kind
1979        let nw = self.kind.write_to::<E>(dst)?;
1980        dst = &mut dst[nw..];
1981        // write start byte map
1982        let nw = self.start_map.write_to(dst)?;
1983        dst = &mut dst[nw..];
1984        // write stride
1985        E::write_u32(u32::try_from(self.stride).unwrap(), dst);
1986        dst = &mut dst[size_of::<u32>()..];
1987        // write pattern length
1988        E::write_u32(
1989            u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(),
1990            dst,
1991        );
1992        dst = &mut dst[size_of::<u32>()..];
1993        // write universal start unanchored state id, u32::MAX if absent
1994        E::write_u32(
1995            self.universal_start_unanchored
1996                .map_or(u32::MAX, |sid| sid.as_u32()),
1997            dst,
1998        );
1999        dst = &mut dst[size_of::<u32>()..];
2000        // write universal start anchored state id, u32::MAX if absent
2001        E::write_u32(
2002            self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()),
2003            dst,
2004        );
2005        dst = &mut dst[size_of::<u32>()..];
2006        // write start IDs
2007        for (sid, _, _) in self.iter() {
2008            E::write_u32(sid.as_u32(), dst);
2009            dst = &mut dst[StateID::SIZE..];
2010        }
2011        Ok(nwrite)
2012    }
2013
2014    /// Returns the number of bytes the serialized form of this transition
2015    /// table will use.
2016    fn write_to_len(&self) -> usize {
2017        self.kind.write_to_len()
2018        + self.start_map.write_to_len()
2019        + size_of::<u32>() // stride
2020        + size_of::<u32>() // # patterns
2021        + size_of::<u32>() // universal unanchored start
2022        + size_of::<u32>() // universal anchored start
2023        + self.table().len()
2024    }
2025
2026    /// Validates that every starting state ID in this table is valid.
2027    ///
2028    /// That is, every starting state ID can be used to correctly decode a
2029    /// state in the DFA's sparse transitions.
2030    fn validate(
2031        &self,
2032        sp: &Special,
2033        seen: &Seen,
2034    ) -> Result<(), DeserializeError> {
2035        for (id, _, _) in self.iter() {
2036            if !seen.contains(&id) {
2037                return Err(DeserializeError::generic(
2038                    "found invalid start state ID",
2039                ));
2040            }
2041            if sp.is_match_state(id) {
2042                return Err(DeserializeError::generic(
2043                    "start states cannot be match states",
2044                ));
2045            }
2046        }
2047        Ok(())
2048    }
2049
2050    /// Converts this start list to a borrowed value.
2051    fn as_ref(&self) -> StartTable<&'_ [u8]> {
2052        StartTable {
2053            table: self.table(),
2054            kind: self.kind,
2055            start_map: self.start_map.clone(),
2056            stride: self.stride,
2057            pattern_len: self.pattern_len,
2058            universal_start_unanchored: self.universal_start_unanchored,
2059            universal_start_anchored: self.universal_start_anchored,
2060        }
2061    }
2062
2063    /// Converts this start list to an owned value.
2064    #[cfg(feature = "alloc")]
2065    fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> {
2066        StartTable {
2067            table: self.table().to_vec(),
2068            kind: self.kind,
2069            start_map: self.start_map.clone(),
2070            stride: self.stride,
2071            pattern_len: self.pattern_len,
2072            universal_start_unanchored: self.universal_start_unanchored,
2073            universal_start_anchored: self.universal_start_anchored,
2074        }
2075    }
2076
2077    /// Return the start state for the given index and pattern ID. If the
2078    /// pattern ID is None, then the corresponding start state for the entire
2079    /// DFA is returned. If the pattern ID is not None, then the corresponding
2080    /// starting state for the given pattern is returned. If this start table
2081    /// does not have individual starting states for each pattern, then this
2082    /// panics.
2083    fn start(
2084        &self,
2085        anchored: Anchored,
2086        start: Start,
2087    ) -> Result<StateID, StartError> {
2088        let start_index = start.as_usize();
2089        let index = match anchored {
2090            Anchored::No => {
2091                if !self.kind.has_unanchored() {
2092                    return Err(StartError::unsupported_anchored(anchored));
2093                }
2094                start_index
2095            }
2096            Anchored::Yes => {
2097                if !self.kind.has_anchored() {
2098                    return Err(StartError::unsupported_anchored(anchored));
2099                }
2100                self.stride + start_index
2101            }
2102            Anchored::Pattern(pid) => {
2103                let len = match self.pattern_len {
2104                    None => {
2105                        return Err(StartError::unsupported_anchored(anchored))
2106                    }
2107                    Some(len) => len,
2108                };
2109                if pid.as_usize() >= len {
2110                    return Ok(DEAD);
2111                }
2112                (2 * self.stride)
2113                    + (self.stride * pid.as_usize())
2114                    + start_index
2115            }
2116        };
2117        let start = index * StateID::SIZE;
2118        // This OK since we're allowed to assume that the start table contains
2119        // valid StateIDs.
2120        Ok(wire::read_state_id_unchecked(&self.table()[start..]).0)
2121    }
2122
2123    /// Return an iterator over all start IDs in this table.
2124    fn iter(&self) -> StartStateIter<'_, T> {
2125        StartStateIter { st: self, i: 0 }
2126    }
2127
2128    /// Returns the total number of start state IDs in this table.
2129    fn len(&self) -> usize {
2130        self.table().len() / StateID::SIZE
2131    }
2132
2133    /// Returns the table as a raw slice of bytes.
2134    fn table(&self) -> &[u8] {
2135        self.table.as_ref()
2136    }
2137
2138    /// Return the memory usage, in bytes, of this start list.
2139    ///
2140    /// This does not include the size of a `StartTable` value itself.
2141    fn memory_usage(&self) -> usize {
2142        self.table().len()
2143    }
2144}
2145
#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> StartTable<T> {
    /// Store `id` as the start state for the given anchored mode and start
    /// configuration. The ID is written in native endian.
    ///
    /// # Panics
    ///
    /// When `anchored` names a pattern, this panics if the table has no start
    /// states for individual patterns or if the pattern ID is not smaller
    /// than the number of patterns. This also panics if the computed entry
    /// lies outside of the table.
    fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
        let slot = start.as_usize();
        let index = match anchored {
            Anchored::No => slot,
            Anchored::Yes => self.stride + slot,
            Anchored::Pattern(pid) => {
                let pid = pid.as_usize();
                let len = self
                    .pattern_len
                    .expect("start states for each pattern enabled");
                assert!(pid < len, "invalid pattern ID {:?}", pid);
                // Skip over the two groups for the entire DFA, then over
                // the groups of all preceding patterns.
                let pattern_base = self.stride.checked_mul(pid).unwrap();
                let whole_dfa = self.stride.checked_mul(2).unwrap();
                let group = pattern_base.checked_add(whole_dfa).unwrap();
                group.checked_add(slot).unwrap()
            }
        };
        let lo = index * StateID::SIZE;
        let hi = lo + StateID::SIZE;
        let dst = &mut self.table.as_mut()[lo..hi];
        wire::write_state_id::<wire::NE>(id, dst);
    }
}
2179
/// An iterator over all start state IDs in a sparse DFA.
///
/// Each item is a start state ID paired with the anchored mode and start
/// configuration that it belongs to.
struct StartStateIter<'a, T> {
    /// The start table being iterated over.
    st: &'a StartTable<T>,
    /// The index of the next table entry to yield.
    i: usize,
}
2185
2186impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
2187    type Item = (StateID, Anchored, Start);
2188
2189    fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
2190        let i = self.i;
2191        if i >= self.st.len() {
2192            return None;
2193        }
2194        self.i += 1;
2195
2196        // This unwrap is okay since the stride of any DFA must always match
2197        // the number of start state types.
2198        let start_type = Start::from_usize(i % self.st.stride).unwrap();
2199        let anchored = if i < self.st.stride {
2200            Anchored::No
2201        } else if i < (2 * self.st.stride) {
2202            Anchored::Yes
2203        } else {
2204            let pid = (i - (2 * self.st.stride)) / self.st.stride;
2205            Anchored::Pattern(PatternID::new(pid).unwrap())
2206        };
2207        let start = i * StateID::SIZE;
2208        let end = start + StateID::SIZE;
2209        let bytes = self.st.table()[start..end].try_into().unwrap();
2210        // This is OK since we're allowed to assume that any IDs in this start
2211        // table are correct and valid for this DFA.
2212        let id = StateID::from_ne_bytes_unchecked(bytes);
2213        Some((id, anchored, start_type))
2214    }
2215}
2216
2217impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
2218    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2219        f.debug_struct("StartStateIter").field("i", &self.i).finish()
2220    }
2221}
2222
/// An iterator over all states in a sparse DFA.
///
/// This iterator yields each decoded state in the order it appears in the
/// transition table. The ID of each state is carried by the state itself.
struct StateIter<'a, T> {
    /// The transitions whose states are being iterated over.
    trans: &'a Transitions<T>,
    /// The offset into the transition table of the next state to yield.
    id: usize,
}
2231
2232impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
2233    type Item = State<'a>;
2234
2235    fn next(&mut self) -> Option<State<'a>> {
2236        if self.id >= self.trans.sparse().len() {
2237            return None;
2238        }
2239        let state = self.trans.state(StateID::new_unchecked(self.id));
2240        self.id = self.id + state.write_to_len();
2241        Some(state)
2242    }
2243}
2244
2245impl<'a, T> fmt::Debug for StateIter<'a, T> {
2246    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2247        f.debug_struct("StateIter").field("id", &self.id).finish()
2248    }
2249}
2250
/// A representation of a sparse DFA state that can be cheaply materialized
/// from a state identifier.
///
/// Every slice in this type borrows directly from the encoded transition
/// table.
#[derive(Clone)]
struct State<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition. Thus, there are `ntrans * 2` bytes in this
    /// slice.
    input_ranges: &'a [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers. Thus, there are `ntrans * StateID::SIZE`
    /// bytes in this slice.
    next: &'a [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state. Otherwise, this is empty.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a [u8],
}
2280
2281impl<'a> State<'a> {
2282    /// Searches for the next transition given an input byte. If no such
2283    /// transition could be found, then a dead state is returned.
2284    ///
2285    /// This is marked as inline to help dramatically boost sparse searching,
2286    /// which decodes each state it enters to follow the next transition.
2287    #[cfg_attr(feature = "perf-inline", inline(always))]
2288    fn next(&self, input: u8) -> StateID {
2289        // This straight linear search was observed to be much better than
2290        // binary search on ASCII haystacks, likely because a binary search
2291        // visits the ASCII case last but a linear search sees it first. A
2292        // binary search does do a little better on non-ASCII haystacks, but
2293        // not by much. There might be a better trade off lurking here.
2294        for i in 0..(self.ntrans - 1) {
2295            let (start, end) = self.range(i);
2296            if start <= input && input <= end {
2297                return self.next_at(i);
2298            }
2299            // We could bail early with an extra branch: if input < b1, then
2300            // we know we'll never find a matching transition. Interestingly,
2301            // this extra branch seems to not help performance, or will even
2302            // hurt it. It's likely very dependent on the DFA itself and what
2303            // is being searched.
2304        }
2305        DEAD
2306    }
2307
2308    /// Returns the next state ID for the special EOI transition.
2309    fn next_eoi(&self) -> StateID {
2310        self.next_at(self.ntrans - 1)
2311    }
2312
2313    /// Returns the identifier for this state.
2314    fn id(&self) -> StateID {
2315        self.id
2316    }
2317
2318    /// Returns the inclusive input byte range for the ith transition in this
2319    /// state.
2320    fn range(&self, i: usize) -> (u8, u8) {
2321        (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
2322    }
2323
2324    /// Returns the next state for the ith transition in this state.
2325    fn next_at(&self, i: usize) -> StateID {
2326        let start = i * StateID::SIZE;
2327        let end = start + StateID::SIZE;
2328        let bytes = self.next[start..end].try_into().unwrap();
2329        StateID::from_ne_bytes_unchecked(bytes)
2330    }
2331
2332    /// Returns the pattern ID for the given match index. If the match index
2333    /// is invalid, then this panics.
2334    fn pattern_id(&self, match_index: usize) -> PatternID {
2335        let start = match_index * PatternID::SIZE;
2336        wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
2337    }
2338
2339    /// Returns the total number of pattern IDs for this state. This is always
2340    /// zero when `is_match` is false.
2341    fn pattern_len(&self) -> usize {
2342        assert_eq!(0, self.pattern_ids.len() % 4);
2343        self.pattern_ids.len() / 4
2344    }
2345
2346    /// Return an accelerator for this state.
2347    fn accelerator(&self) -> &'a [u8] {
2348        self.accel
2349    }
2350
2351    /// Write the raw representation of this state to the given buffer using
2352    /// the given endianness.
2353    fn write_to<E: Endian>(
2354        &self,
2355        mut dst: &mut [u8],
2356    ) -> Result<usize, SerializeError> {
2357        let nwrite = self.write_to_len();
2358        if dst.len() < nwrite {
2359            return Err(SerializeError::buffer_too_small(
2360                "sparse state transitions",
2361            ));
2362        }
2363
2364        let ntrans =
2365            if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans };
2366        E::write_u16(u16::try_from(ntrans).unwrap(), dst);
2367        dst = &mut dst[size_of::<u16>()..];
2368
2369        dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges);
2370        dst = &mut dst[self.input_ranges.len()..];
2371
2372        for i in 0..self.ntrans {
2373            E::write_u32(self.next_at(i).as_u32(), dst);
2374            dst = &mut dst[StateID::SIZE..];
2375        }
2376
2377        if self.is_match {
2378            E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst);
2379            dst = &mut dst[size_of::<u32>()..];
2380            for i in 0..self.pattern_len() {
2381                let pid = self.pattern_id(i);
2382                E::write_u32(pid.as_u32(), dst);
2383                dst = &mut dst[PatternID::SIZE..];
2384            }
2385        }
2386
2387        dst[0] = u8::try_from(self.accel.len()).unwrap();
2388        dst[1..][..self.accel.len()].copy_from_slice(self.accel);
2389
2390        Ok(nwrite)
2391    }
2392
2393    /// Return the total number of bytes that this state consumes in its
2394    /// encoded form.
2395    fn write_to_len(&self) -> usize {
2396        let mut len = 2
2397            + (self.ntrans * 2)
2398            + (self.ntrans * StateID::SIZE)
2399            + (1 + self.accel.len());
2400        if self.is_match {
2401            len += size_of::<u32>() + self.pattern_ids.len();
2402        }
2403        len
2404    }
2405}
2406
2407impl<'a> fmt::Debug for State<'a> {
2408    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2409        let mut printed = false;
2410        for i in 0..(self.ntrans - 1) {
2411            let next = self.next_at(i);
2412            if next == DEAD {
2413                continue;
2414            }
2415
2416            if printed {
2417                write!(f, ", ")?;
2418            }
2419            let (start, end) = self.range(i);
2420            if start == end {
2421                write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?;
2422            } else {
2423                write!(
2424                    f,
2425                    "{:?}-{:?} => {:?}",
2426                    DebugByte(start),
2427                    DebugByte(end),
2428                    next.as_usize(),
2429                )?;
2430            }
2431            printed = true;
2432        }
2433        let eoi = self.next_at(self.ntrans - 1);
2434        if eoi != DEAD {
2435            if printed {
2436                write!(f, ", ")?;
2437            }
2438            write!(f, "EOI => {:?}", eoi.as_usize())?;
2439        }
2440        Ok(())
2441    }
2442}
2443
/// A representation of a mutable sparse DFA state that can be cheaply
/// materialized from a state identifier.
#[cfg(feature = "dfa-build")]
struct StateMut<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a mut [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers, each occupying `StateID::SIZE` bytes. Thus,
    /// there are `ntrans * StateID::SIZE` bytes in this slice.
    next: &'a mut [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a mut [u8],
}
2473
#[cfg(feature = "dfa-build")]
impl<'a> StateMut<'a> {
    /// Overwrites the target of the ith transition with the given state,
    /// encoding it in native endian order.
    fn set_next_at(&mut self, i: usize, next: StateID) {
        let offset = i * StateID::SIZE;
        let slot = &mut self.next[offset..offset + StateID::SIZE];
        wire::write_state_id::<wire::NE>(next, slot);
    }
}
2483
#[cfg(feature = "dfa-build")]
impl<'a> fmt::Debug for StateMut<'a> {
    /// Formats this state exactly as the equivalent immutable `State` would
    /// be formatted, by building a read-only view over the same data.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let view = State {
            id: self.id,
            is_match: self.is_match,
            ntrans: self.ntrans,
            input_ranges: &*self.input_ranges,
            next: &*self.next,
            pattern_ids: self.pattern_ids,
            accel: &*self.accel,
        };
        fmt::Debug::fmt(&view, f)
    }
}
2499
// In order to validate everything, we not only need to make sure we
// can decode every state, but that every transition in every state
// points to a valid state. There are many duplicative transitions, so
// we record state IDs that we've verified so that we don't redo the
// decoding work.
//
// Except, when the 'alloc' feature is disabled, we don't have dynamic
// memory allocation available to us, so we skip this optimization. It's
// not clear whether doing something more clever is worth it just yet. If
// you're profiling this code and need it to run faster, please file an
// issue.
//
// OK, so we also use this to record the set of valid state IDs, since
// it is possible for a transition to point to an invalid state ID that
// still (somehow) deserializes to a valid state. So we need to make
// sure our transitions are limited to actually correct state IDs.
// The problem is, I'm not sure how to do this verification step in
// no-std no-alloc mode. I think we'd *have* to store the set of valid
// state IDs in the DFA itself. For now, we don't do this verification
// in no-std no-alloc mode. The worst thing that can happen is an
// incorrect result. But no panics or memory safety problems should
// result. Because we still do validate that the state itself is
// "valid" in the sense that everything it points to actually exists.
//
// ---AG
#[derive(Debug)]
struct Seen {
    // With 'alloc' enabled, this is the actual set of recorded state IDs.
    #[cfg(feature = "alloc")]
    set: alloc::collections::BTreeSet<StateID>,
    // Without 'alloc', nothing can be stored, so this is only a zero-sized
    // placeholder.
    #[cfg(not(feature = "alloc"))]
    set: core::marker::PhantomData<StateID>,
}
2531
#[cfg(feature = "alloc")]
impl Seen {
    /// Creates an empty set of state identifiers.
    fn new() -> Seen {
        let set = alloc::collections::BTreeSet::new();
        Seen { set }
    }
    /// Records the given state identifier in the set.
    fn insert(&mut self, id: StateID) {
        let _ = self.set.insert(id);
    }
    /// Returns true if the given state identifier has been recorded.
    fn contains(&self, id: &StateID) -> bool {
        self.set.get(id).is_some()
    }
}
2544
2545#[cfg(not(feature = "alloc"))]
2546impl Seen {
2547    fn new() -> Seen {
2548        Seen { set: core::marker::PhantomData }
2549    }
2550    fn insert(&mut self, _id: StateID) {}
2551    fn contains(&self, _id: &StateID) -> bool {
2552        true
2553    }
2554}
2555
2556/*
2557/// A binary search routine specialized specifically to a sparse DFA state's
2558/// transitions. Specifically, the transitions are defined as a set of pairs
2559/// of input bytes that delineate an inclusive range of bytes. If the input
2560/// byte is in the range, then the corresponding transition is a match.
2561///
2562/// This binary search accepts a slice of these pairs and returns the position
2563/// of the matching pair (the ith transition), or None if no matching pair
2564/// could be found.
2565///
2566/// Note that this routine is not currently used since it was observed to
2567/// either decrease performance when searching ASCII, or did not provide enough
2568/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
2569/// for posterity in case we can find a way to use it.
2570///
2571/// In theory, we could use the standard library's search routine if we could
2572/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
2573/// guaranteed to be safe and is thus UB (since I don't think the in-memory
2574/// representation of `(u8, u8)` has been nailed down). One could define a
2575/// repr(C) type, but the casting doesn't seem justified.
2576#[cfg_attr(feature = "perf-inline", inline(always))]
2577fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
2578    debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
2579    debug_assert!(ranges.len() <= 512, "ranges should be short");
2580
2581    let (mut left, mut right) = (0, ranges.len() / 2);
2582    while left < right {
2583        let mid = (left + right) / 2;
2584        let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
2585        if needle < b1 {
2586            right = mid;
2587        } else if needle > b2 {
2588            left = mid + 1;
2589        } else {
2590            return Some(mid);
2591        }
2592    }
2593    None
2594}
2595*/
2596
#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
    use crate::{
        dfa::{dense::DFA, Automaton},
        nfa::thompson,
        Input, MatchError,
    };

    /// Builds the sparse DFA shared by the tests below: a dense DFA for
    /// `\b[0-9]+\b` with the Unicode word boundary heuristic enabled, which
    /// is then converted to its sparse form.
    fn sparse_word_digits() -> impl Automaton {
        let dense = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .thompson(thompson::Config::new().reverse(true))
            .build(r"\b[0-9]+\b")
            .unwrap();
        dense.to_sparse().unwrap()
    }

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    #[test]
    fn heuristic_unicode_forward() {
        let dfa = sparse_word_digits();

        // The byte just before the search range is non-ASCII, so the search
        // must quit on it.
        let haystack = Input::new("β123").range(2..);
        let want = MatchError::quit(0xB2, 1);
        assert_eq!(Err(want), dfa.try_search_fwd(&haystack));

        // Likewise for a non-ASCII byte just after the search range.
        let haystack = Input::new("123β").range(..3);
        let want = MatchError::quit(0xCE, 3);
        assert_eq!(Err(want), dfa.try_search_fwd(&haystack));
    }

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    #[test]
    fn heuristic_unicode_reverse() {
        let dfa = sparse_word_digits();

        // The byte just before the search range is non-ASCII, so the search
        // must quit on it.
        let haystack = Input::new("β123").range(2..);
        let want = MatchError::quit(0xB2, 1);
        assert_eq!(Err(want), dfa.try_search_rev(&haystack));

        // Likewise for a non-ASCII byte just after the search range.
        let haystack = Input::new("123β").range(..3);
        let want = MatchError::quit(0xCE, 3);
        assert_eq!(Err(want), dfa.try_search_rev(&haystack));
    }
}