Skip to main content

resharp/
lib.rs

1//! regex engine with intersection, complement, and lookarounds
2//!
3//! # quick start
4//!
5//! ```
6//! let re = resharp::Regex::new(r"\d{3}-\d{4}").unwrap();
7//! let matches = re.find_all(b"call 555-1234 or 555-5678").unwrap();
8//! assert_eq!(matches.len(), 2);
9//! ```
10//!
11//! # options
12//!
13//! use [`EngineOptions`] with [`Regex::with_options`] for non-default settings:
14//!
15//! ```
16//! use resharp::{Regex, EngineOptions};
17//!
18//! let re = Regex::with_options(
19//!     r"hello world",
20//!     EngineOptions::default()
21//!         .case_insensitive(true)
22//!         .dot_matches_new_line(true),
23//! ).unwrap();
24//! assert!(re.is_match(b"Hello World").unwrap());
25//! ```
26//!
27//! # escaping user input
28//!
29//! use [`escape`] to safely embed literal strings in patterns:
30//!
31//! ```
32//! let user_input = "file (1).txt";
33//! let pattern = format!(r"^{}$", resharp::escape(user_input));
34//! let re = resharp::Regex::new(&pattern).unwrap();
35//! assert!(re.is_match(b"file (1).txt").unwrap());
36//! ```
37
38#![deny(missing_docs)]
39
40pub(crate) mod accel;
41pub(crate) mod engine;
42pub(crate) mod prefix;
43pub(crate) mod simd;
44
45#[cfg(feature = "diag")]
46pub use engine::BDFA;
47#[cfg(feature = "diag")]
48pub use prefix::calc_potential_start;
49#[cfg(feature = "diag")]
50pub use prefix::calc_potential_start_prune;
51#[cfg(feature = "diag")]
52pub use prefix::calc_prefix_sets;
53#[cfg(feature = "diag")]
54pub use prefix::PrefixSets;
55#[cfg(feature = "diag")]
56pub use resharp_algebra::solver::TSetId;
57use resharp_algebra::Kind;
58
59// bdfa_scan / bdfa_inner const-generic PREFIX modes
60const PREFIX_NONE: u8 = 0;
61const PREFIX_SEARCH: u8 = 1;
62const PREFIX_LITERAL: u8 = 2;
63
64pub use resharp_algebra::nulls::Nullability;
65pub use resharp_algebra::NodeId;
66pub use resharp_algebra::RegexBuilder;
67/// escape all resharp meta characters in `text`, returning a pattern
68/// that matches the literal string.
69///
70/// ```
71/// assert_eq!(resharp::escape("a+b"), r"a\+b");
72/// ```
73pub use resharp_parser::escape;
74/// like [`escape`] but appends to an existing buffer.
75pub use resharp_parser::escape_into;
76
77use std::sync::Mutex;
78
79/// error from compiling or matching a regex.
80#[derive(Debug)]
81#[non_exhaustive]
82pub enum Error {
83    /// parse failure.
84    Parse(Box<resharp_parser::ResharpError>),
85    /// algebra error (unsupported pattern, anchor limit).
86    Algebra(resharp_algebra::AlgebraError),
87    /// DFA state cache exceeded `max_dfa_capacity`.
88    CapacityExceeded,
89    /// serialization or deserialization failure.
90    Serialize(String),
91}
92
93impl std::fmt::Display for Error {
94    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
95        match self {
96            Error::Parse(e) => write!(f, "parse error: {}", e),
97            Error::Algebra(e) => write!(f, "{}", e),
98            Error::CapacityExceeded => write!(f, "DFA state capacity exceeded"),
99            Error::Serialize(ref s) => write!(f, "serialization error: {}", s),
100        }
101    }
102}
103
104impl std::error::Error for Error {
105    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
106        match self {
107            Error::Parse(e) => Some(e),
108            Error::Algebra(e) => Some(e),
109            Error::CapacityExceeded => None,
110            Error::Serialize(_) => None,
111        }
112    }
113}
114
115impl From<resharp_parser::ResharpError> for Error {
116    fn from(e: resharp_parser::ResharpError) -> Self {
117        Error::Parse(Box::new(e))
118    }
119}
120
121impl From<resharp_algebra::AlgebraError> for Error {
122    fn from(e: resharp_algebra::AlgebraError) -> Self {
123        Error::Algebra(e)
124    }
125}
126
127/// configuration for pattern compilation and engine behavior.
128///
129/// all options have sensible defaults via [`Default`]. use the builder
130/// methods to override:
131///
132/// ```
133/// use resharp::EngineOptions;
134///
135/// let opts = EngineOptions::default()
136///     .unicode(false)           // ASCII-only \w, \d, \s
137///     .case_insensitive(true)   // global (?i)
138///     .dot_matches_new_line(true); // . matches \n
139/// ```
140/// Controls which Unicode character tables `\w` and `\d` use.
141#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
142pub enum UnicodeMode {
143    /// ASCII only: `\w` = `[a-zA-Z0-9_]`, `\d` = `[0-9]`.
144    Ascii,
145    /// Default: covers major scripts up through U+07FF (Latin, Greek, Cyrillic,
146    /// Hebrew, Arabic, ...). All encoded as 1- or 2-byte UTF-8 sequences.
147    #[default]
148    Unicode,
149    /// All Unicode word/digit characters, including CJK, historic scripts,
150    /// and any code points requiring 3- or 4-byte UTF-8 sequences.
151    Full,
152}
153
154/// Engine configuration, passed to [`Regex::with_options`].
155pub struct EngineOptions {
156    /// states to eagerly precompile (0 = fully lazy).
157    pub dfa_threshold: usize,
158    /// max cached DFA states; clamped to `u16::MAX`.
159    pub max_dfa_capacity: usize,
160    /// max lookahead context distance (default: 800).
161    pub lookahead_context_max: u32,
162    /// Unicode coverage for `\w` and `\d` (default: `UnicodeMode::Unicode`).
163    pub unicode: UnicodeMode,
164    /// global case-insensitive matching (default: false).
165    pub case_insensitive: bool,
166    /// `.` matches `\n` (default: false). `_` always matches any byte.
167    pub dot_matches_new_line: bool,
168    /// allow whitespace and `#` comments in the pattern (default: false).
169    pub ignore_whitespace: bool,
170    /// use O(N·S) hardened forward scan (default: false).
171    /// prevents quadratic blowup on adversarial pattern+input combinations.
172    pub hardened: bool,
173}
174
175impl Default for EngineOptions {
176    fn default() -> Self {
177        Self {
178            dfa_threshold: 0,
179            max_dfa_capacity: u16::MAX as usize,
180            lookahead_context_max: 800,
181            unicode: UnicodeMode::Unicode,
182            case_insensitive: false,
183            dot_matches_new_line: false,
184            ignore_whitespace: false,
185            hardened: false,
186        }
187    }
188}
189
190impl EngineOptions {
191    /// set Unicode coverage for `\w` and `\d`.
192    pub fn unicode(mut self, mode: UnicodeMode) -> Self {
193        self.unicode = mode;
194        self
195    }
196    /// set case-insensitive mode.
197    pub fn case_insensitive(mut self, yes: bool) -> Self {
198        self.case_insensitive = yes;
199        self
200    }
201    /// set dot-matches-newline mode.
202    pub fn dot_matches_new_line(mut self, yes: bool) -> Self {
203        self.dot_matches_new_line = yes;
204        self
205    }
206    /// set ignore-whitespace (verbose) mode.
207    pub fn ignore_whitespace(mut self, yes: bool) -> Self {
208        self.ignore_whitespace = yes;
209        self
210    }
211    /// enable hardened mode for untrusted patterns: uses only O(N·S) forward scan (~5-20x constant overhead).
212    pub fn hardened(mut self, yes: bool) -> Self {
213        self.hardened = yes;
214        self
215    }
216}
217
218/// byte-offset range `[start, end)`.
219#[derive(Debug, Clone, Copy, PartialEq, Eq)]
220#[repr(C)]
221pub struct Match {
222    /// inclusive start.
223    pub start: usize,
224    /// exclusive end.
225    pub end: usize,
226}
227
228pub(crate) struct RegexInner {
229    pub(crate) b: RegexBuilder,
230    pub(crate) fwd: engine::LDFA,
231    pub(crate) rev_ts: engine::LDFA,
232    pub(crate) rev_bare: Option<engine::LDFA>,
233    pub(crate) nulls: Vec<usize>,
234    pub(crate) matches: Vec<Match>,
235    pub(crate) bounded: Option<engine::BDFA>,
236}
237
238/// Lazily compiled regex instance.
239/// Uses Mutex for interior mutability.
240pub struct Regex {
241    pub(crate) inner: Mutex<RegexInner>,
242    pub(crate) prefix: Option<prefix::PrefixKind>,
243    pub(crate) fixed_length: Option<u32>,
244    pub(crate) max_length: Option<u32>,
245    pub(crate) empty_nullable: bool,
246    pub(crate) fwd_end_nullable: bool,
247    pub(crate) hardened: bool,
248    pub(crate) has_bounded_prefix: bool,
249    pub(crate) has_bounded: bool,
250    /// Number of lb bytes baked into the AnchoredFwdLb SIMD prefix.
251    /// match.start = simd_candidate + lb_check_bytes.
252    pub(crate) lb_check_bytes: u8,
253    /// True when the lb contains \A: a match at position 0 is possible
254    /// and must be tried via begin_table before the SIMD scan.
255    pub(crate) fwd_lb_begin_nullable: bool,
256}
257
258#[inline(never)]
259fn bdfa_inner<const PREFIX: u8>(
260    table: *const u32,
261    ml: *const u8,
262    data: *const u8,
263    mt_log: u32,
264    initial: u16,
265    match_end_off: *const u32,
266    mut state: u16,
267    mut pos: usize,
268    len: usize,
269    match_buf: *mut Match,
270    match_cap: usize,
271) -> (u16, usize, usize) {
272    let mut mc: usize = 0;
273    unsafe {
274        while pos < len {
275            if PREFIX != PREFIX_NONE && state == initial {
276                return (state, pos, mc);
277            }
278            let mt = *ml.add(*data.add(pos) as usize) as usize;
279            let delta = (state as usize) << mt_log | mt;
280            let entry = *table.add(delta);
281            if entry == 0 {
282                return (state, pos, mc);
283            }
284            let rel = entry >> 16;
285            state = (entry & 0xFFFF) as u16;
286            if rel > 0 {
287                if mc >= match_cap {
288                    return (state, pos, mc);
289                }
290                let end_off = *match_end_off.add(state as usize);
291                *match_buf.add(mc) = Match {
292                    start: pos + 1 - rel as usize,
293                    end: pos + 1 - end_off as usize,
294                };
295                mc += 1;
296                state = initial;
297                continue;
298            }
299            pos += 1;
300        }
301        (state, pos, mc)
302    }
303}
304
305impl Regex {
306    /// compile a pattern with default options.
307    ///
308    /// ```
309    /// let re = resharp::Regex::new(r"\b\w+\b").unwrap();
310    /// ```
311    pub fn new(pattern: &str) -> Result<Regex, Error> {
312        Self::with_options(pattern, EngineOptions::default())
313    }
314
315    /// compile a pattern with custom [`EngineOptions`].
316    ///
317    /// ```
318    /// use resharp::{Regex, EngineOptions};
319    ///
320    /// let re = Regex::with_options(
321    ///     r"hello",
322    ///     EngineOptions::default().case_insensitive(true),
323    /// ).unwrap();
324    /// assert!(re.is_match(b"HELLO").unwrap());
325    /// ```
326    pub fn with_options(pattern: &str, opts: EngineOptions) -> Result<Regex, Error> {
327        let mut b = RegexBuilder::new();
328        b.lookahead_context_max = opts.lookahead_context_max;
329        let pflags = resharp_parser::PatternFlags {
330            unicode: opts.unicode != UnicodeMode::Ascii,
331            full_unicode: opts.unicode == UnicodeMode::Full,
332            case_insensitive: opts.case_insensitive,
333            dot_matches_new_line: opts.dot_matches_new_line,
334            ignore_whitespace: opts.ignore_whitespace,
335        };
336        let node = resharp_parser::parse_ast_with(&mut b, pattern, &pflags)?;
337        Self::from_node_inner(b, node, opts, pattern.len())
338    }
339
340    /// build from a pre-constructed AST node.
341    pub fn from_node(b: RegexBuilder, node: NodeId, opts: EngineOptions) -> Result<Regex, Error> {
342        Self::from_node_inner(b, node, opts, 0)
343    }
344
345    fn from_node_inner(
346        mut b: RegexBuilder,
347        node: NodeId,
348        opts: EngineOptions,
349        pattern_len: usize,
350    ) -> Result<Regex, Error> {
351        let empty_nullable = b
352            .nullability_emptystring(node)
353            .has(Nullability::EMPTYSTRING);
354
355        let fwd_start = b.strip_lb(node)?;
356        let fwd_end_nullable = b.nullability(fwd_start).has(Nullability::END);
357        let rev_start = b.reverse(node)?;
358        let rev_start = b.normalize_rev(rev_start)?;
359        let ts_rev_start =
360            if b.get_kind(rev_start) == Kind::Concat && rev_start.left(&b) == NodeId::BEGIN {
361                rev_start
362            } else {
363                b.mk_concat(NodeId::TS, rev_start)
364            };
365
366        let fixed_length = b.get_fixed_length(node);
367        let (min_len, max_len) = b.get_min_max_length(node);
368        let max_length = if max_len != u32::MAX {
369            Some(max_len)
370        } else {
371            None
372        };
373        let has_look = b.contains_look(node);
374
375        let max_cap = opts.max_dfa_capacity.min(u16::MAX as usize);
376
377        let (selected, rev_skip) =
378            prefix::select_prefix(&mut b, node, rev_start, has_look, min_len)?;
379
380        let has_fwd_prefix = matches!(
381            selected,
382            Some(
383                prefix::PrefixKind::AnchoredFwd(_)
384                    | prefix::PrefixKind::UnanchoredFwd(_)
385                    | prefix::PrefixKind::AnchoredFwdLb(_)
386            )
387        );
388        let fwd_prefix_stripped = matches!(selected, Some(prefix::PrefixKind::UnanchoredFwd(_)));
389
390        let mut fwd = engine::LDFA::new(&mut b, fwd_start, max_cap)?;
391        let mut rev = engine::LDFA::new(&mut b, ts_rev_start, max_cap)?;
392        rev.prefix_skip = rev_skip;
393
394        let (fwd_lb_begin_nullable, lb_check_bytes) =
395            if matches!(selected, Some(prefix::PrefixKind::AnchoredFwdLb(_))) {
396                let lb = node.left(&b);
397                let lb_inner = b.get_lookbehind_inner(lb);
398                let lb_nonbegin = b.nonbegins(lb_inner);
399                let lb_stripped = b.strip_prefix_safe(lb_nonbegin);
400                let (_, lb_max) = b.get_min_max_length(lb_stripped);
401                let begin_nullable = b.nullability(lb_inner).has(Nullability::BEGIN);
402                (begin_nullable, lb_max.min(4) as u8)
403            } else {
404                (false, 0)
405            };
406
407        if opts.dfa_threshold > 0 {
408            fwd.precompile(&mut b, opts.dfa_threshold);
409            if !has_fwd_prefix {
410                rev.precompile(&mut b, opts.dfa_threshold);
411            }
412        }
413
414        let rev_bare = if fwd_prefix_stripped {
415            Some(engine::LDFA::new(&mut b, rev_start, max_cap)?)
416        } else {
417            None
418        };
419        if fwd_prefix_stripped {
420            fwd.compute_fwd_skip(&mut b);
421        } else if !opts.hardened && pattern_len <= 150 {
422            fwd.compute_fwd_skip_inner(&mut b, 10);
423        }
424
425        let use_bounded = !has_fwd_prefix
426            && max_length.is_some()
427            && max_len <= 100
428            && fixed_length.is_none()
429            && !has_look
430            && !b.contains_anchors(node)
431            && pattern_len <= 150;
432
433        if cfg!(feature = "debug-nulls") {
434            eprintln!(
435                "  [bounded-check] max_length={:?} fixed_length={:?} has_look={} anchors={} fwd_prefix={} -> use={}",
436                max_length, fixed_length, has_look, b.contains_anchors(node), selected.is_some(), use_bounded
437            );
438        }
439
440        let bounded = if use_bounded {
441            Some(engine::BDFA::new(&mut b, fwd_start)?)
442        } else {
443            None
444        };
445
446        let has_bounded = bounded.is_some();
447        let has_bounded_prefix = bounded
448            .as_ref()
449            .is_some_and(|bd: &crate::engine::BDFA| bd.prefix.is_some());
450
451        let hardened = if opts.hardened && !has_bounded && fixed_length.is_none() && max_cap >= 64 {
452            fwd.has_nonnullable_cycle(&mut b, 256)
453        } else {
454            false
455        };
456
457        Ok(Regex {
458            inner: Mutex::new(RegexInner {
459                b,
460                fwd,
461                rev_ts: rev,
462                rev_bare,
463                nulls: Vec::new(),
464                matches: Vec::new(),
465                bounded,
466            }),
467            prefix: selected,
468            fixed_length,
469            max_length,
470            empty_nullable,
471            fwd_end_nullable,
472            hardened,
473            has_bounded_prefix,
474            has_bounded,
475            lb_check_bytes,
476            fwd_lb_begin_nullable,
477        })
478    }
479
480    #[cfg(feature = "diag")]
481    #[allow(missing_docs)]
482    pub fn node_count(&self) -> u32 {
483        self.inner.lock().unwrap().b.num_nodes()
484    }
485
486    #[cfg(feature = "diag")]
487    #[allow(missing_docs)]
488    pub fn dfa_stats(&self) -> (usize, usize) {
489        let inner = self.inner.lock().unwrap();
490        (inner.fwd.state_nodes.len(), inner.rev_ts.state_nodes.len())
491    }
492
493    /// whether hardened linear-scan mode is enabled
494    pub fn is_hardened(&self) -> bool {
495        self.hardened
496    }
497
498    #[cfg(feature = "diag")]
499    #[allow(missing_docs)]
500    pub fn bdfa_stats(&self) -> Option<(usize, usize, usize)> {
501        let inner = self.inner.lock().unwrap();
502        inner
503            .bounded
504            .as_ref()
505            .map(|b| (b.states.len(), 1usize << b.mt_log, b.prefix_len))
506    }
507
508    #[cfg(feature = "diag")]
509    #[allow(missing_docs)]
510    pub fn dump_fwd_dfa(&self) {
511        let inner = self.inner.lock().unwrap();
512        let fwd = &inner.fwd;
513        let stride = 1usize << fwd.mt_log;
514        eprintln!(
515            "fwd: minterms={} states={}",
516            fwd.mt_num,
517            fwd.state_nodes.len()
518        );
519        eprintln!(
520            "  mt['A']={} mt['a']={} mt['\\n']={} mt[0]={}",
521            fwd.mt_lookup[b'A' as usize],
522            fwd.mt_lookup[b'a' as usize],
523            fwd.mt_lookup[b'\n' as usize],
524            fwd.mt_lookup[0]
525        );
526        for sid in 2..fwd.state_nodes.len() {
527            let base = sid * stride;
528            if base + stride > fwd.center_table.len() {
529                continue;
530            }
531            let row: Vec<u16> = (0..fwd.mt_num as usize)
532                .map(|mt| fwd.center_table[base + mt])
533                .collect();
534            let nullable = sid < fwd.effects_id.len() && fwd.effects_id[sid] != 0;
535            let node = inner.b.pp(fwd.state_nodes[sid]);
536            let skip = fwd.skip_ids.get(sid).copied().unwrap_or(0);
537            eprintln!(
538                "  s{}: null={} skip={} node={} tr={:?}",
539                sid, nullable, skip, node, row
540            );
541        }
542    }
543
544    #[cfg(feature = "diag")]
545    #[allow(missing_docs)]
546    pub fn prefix_kind_name(&self) -> Option<&'static str> {
547        match &self.prefix {
548            None => None,
549            Some(prefix::PrefixKind::AnchoredFwd(_)) => Some("AnchoredFwd"),
550            Some(prefix::PrefixKind::UnanchoredFwd(_)) => Some("UnanchoredFwd"),
551            Some(prefix::PrefixKind::AnchoredFwdLb(_)) => Some("AnchoredFwdLb"),
552            Some(prefix::PrefixKind::AnchoredRev) => Some("AnchoredRev"),
553            Some(prefix::PrefixKind::PotentialStart) => Some("PotentialStart"),
554        }
555    }
556
557    #[cfg(feature = "diag")]
558    #[allow(missing_docs)]
559    pub fn max_length(&self) -> Option<u32> {
560        self.max_length
561    }
562
563    #[cfg(feature = "diag")]
564    #[allow(missing_docs)]
565    pub fn fwd_prefix_kind(&self) -> Option<(&'static str, usize)> {
566        match &self.prefix {
567            Some(prefix::PrefixKind::AnchoredFwd(fp))
568            | Some(prefix::PrefixKind::UnanchoredFwd(fp))
569            | Some(prefix::PrefixKind::AnchoredFwdLb(fp)) => Some((fp.variant_name(), fp.len())),
570            _ => None,
571        }
572    }
573
574    #[cfg(feature = "diag")]
575    #[allow(missing_docs)]
576    pub fn has_accel(&self) -> (bool, bool) {
577        let inner = self.inner.lock().unwrap();
578        let fwd = self.prefix.as_ref().is_some_and(|p| p.is_fwd());
579        let rev = self.prefix.as_ref().is_some_and(|p| p.is_rev())
580            || inner.rev_ts.prefix_skip.is_some()
581            || inner.rev_ts.can_skip();
582        (fwd, rev)
583    }
584
585    /// all non-overlapping leftmost-first matches as `[start, end)` byte ranges.
586    ///
587    /// ```
588    /// let re = resharp::Regex::new(r"\d+").unwrap();
589    /// let m = re.find_all(b"abc 123 def 456").unwrap();
590    /// assert_eq!(m.len(), 2);
591    /// assert_eq!((m[0].start, m[0].end), (4, 7));
592    /// ```
593    pub fn find_all(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
594        #[cfg(all(feature = "debug-nulls", debug_assertions))]
595        {
596            // eprintln!("prefix: '{:?}'", &self.prefix);
597        }
598
599        if input.is_empty() {
600            return if self.empty_nullable {
601                Ok(vec![Match { start: 0, end: 0 }])
602            } else {
603                Ok(vec![])
604            };
605        }
606        if self.hardened {
607            if self.has_bounded_prefix || self.has_bounded {
608                return self.find_all_fwd_bounded(input);
609            }
610            return self.find_all_dfa(input);
611        }
612        if self.has_bounded_prefix {
613            return self.find_all_fwd_bounded(input);
614        }
615        match &self.prefix {
616            Some(prefix::PrefixKind::UnanchoredFwd(_)) => {
617                return self.find_all_fwd_prefix_stripped(input);
618            }
619            Some(prefix::PrefixKind::AnchoredFwd(_)) => {
620                return self.find_all_fwd_prefix(input);
621            }
622            Some(prefix::PrefixKind::AnchoredFwdLb(_)) => {
623                return self.find_all_fwd_lb_prefix(input);
624            }
625            _ => {}
626        }
627        if self.has_bounded {
628            return self.find_all_fwd_bounded(input);
629        }
630        self.find_all_dfa(input)
631    }
632
633    #[cfg(feature = "diag")]
634    #[allow(missing_docs)]
635    pub fn effects_debug(&self) -> String {
636        let inner = self.inner.lock().unwrap();
637        let rev = &inner.rev_ts;
638        let mut out = String::new();
639        for (i, &eid) in rev.effects_id.iter().enumerate() {
640            if eid != 0 {
641                let nulls: Vec<String> = rev.effects[eid as usize]
642                    .iter()
643                    .map(|n| format!("(mask={},rel={})", n.mask.0, n.rel))
644                    .collect();
645                out += &format!("  state[{}] eid={} nulls=[{}]\n", i, eid, nulls.join(", "));
646            }
647        }
648        out
649    }
650
651    #[cfg(feature = "diag")]
652    #[allow(missing_docs)]
653    pub fn collect_rev_nulls_debug(&self, input: &[u8]) -> Vec<usize> {
654        let inner = &mut *self.inner.lock().unwrap();
655        inner.nulls.clear();
656        inner
657            .rev_ts
658            .collect_rev(&mut inner.b, input.len() - 1, input, &mut inner.nulls)
659            .unwrap();
660        inner.nulls.clone()
661    }
662
663    fn find_all_dfa(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
664        if self.fwd_end_nullable {
665            self.find_all_dfa_inner::<true>(input)
666        } else {
667            self.find_all_dfa_inner::<false>(input)
668        }
669    }
670
671    fn find_all_dfa_inner<const FWD_NULL: bool>(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
672        let inner = &mut *self.inner.lock().unwrap();
673        #[cfg(feature = "debug-nulls")]
674        {
675            eprintln!("find_all_dfa_inner:");
676            eprintln!(
677                "rev0: {}",
678                inner
679                    .b
680                    .pp(inner.rev.state_nodes[engine::DFA_INITIAL as usize])
681            );
682        }
683
684        let rev_initial_nullable = inner.rev_ts.effects_id[engine::DFA_INITIAL as usize] != 0;
685        if rev_initial_nullable {
686            inner.matches.clear();
687            Self::find_all_nullable_slow(&mut inner.fwd, &mut inner.b, input, &mut inner.matches)?;
688            return Ok(inner.matches.clone());
689        }
690
691        inner.nulls.clear();
692
693        if rev_initial_nullable {
694            inner.nulls.push(input.len());
695        }
696        inner
697            .rev_ts
698            .collect_rev(&mut inner.b, input.len() - 1, input, &mut inner.nulls)?;
699
700        // #[cfg(feature = "diag")]
701        // eprintln!("  [diag] dfa nulls={} input={}", inner.nulls.len(), input.len());
702
703        inner.matches.clear();
704        let matches = &mut inner.matches;
705        #[cfg(feature = "debug-nulls")]
706        {
707            eprintln!("nulls_buf={:?}", inner.nulls);
708        }
709
710        if let Some(fl) = self.fixed_length {
711            let fl = fl as usize;
712            let mut last_end = 0;
713            for &start in inner.nulls.iter().rev() {
714                if start >= last_end && start + fl <= input.len() {
715                    matches.push(Match {
716                        start,
717                        end: start + fl,
718                    });
719                    last_end = start + fl;
720                }
721            }
722        } else if self.hardened {
723            if cfg!(feature = "debug-nulls") {
724                eprintln!("  [dispatch] scan_fwd_ordered");
725            }
726            inner.fwd.scan_fwd_ordered(
727                &mut inner.b,
728                &inner.nulls,
729                input,
730                self.max_length,
731                matches,
732            )?;
733        } else {
734            inner
735                .fwd
736                .scan_fwd_all(&mut inner.b, &inner.nulls, input, self.max_length, matches)?;
737        }
738
739        if FWD_NULL
740            && inner.nulls.first() == Some(&input.len())
741            && matches.last().map_or(true, |m| m.end <= input.len())
742        {
743            matches.push(Match {
744                start: input.len(),
745                end: input.len(),
746            });
747        }
748
749        Ok(inner.matches.clone())
750    }
751
752    fn find_all_nullable_slow(
753        fwd: &mut engine::LDFA,
754        b: &mut RegexBuilder,
755        input: &[u8],
756        matches: &mut Vec<Match>,
757    ) -> Result<(), Error> {
758        let mut pos = 0;
759        fwd.build_skip_all(b);
760        while pos < input.len() {
761            let max_end = fwd.scan_fwd_slow(b, pos, input)?;
762            if max_end != engine::NO_MATCH && max_end > pos {
763                matches.push(Match {
764                    start: pos,
765                    end: max_end,
766                });
767                pos = max_end;
768            } else if max_end != engine::NO_MATCH {
769                matches.push(Match {
770                    start: pos,
771                    end: pos,
772                });
773                pos += 1;
774            } else {
775                pos += 1;
776            }
777        }
778        // trailing empty at end-of-input
779        let end_null = engine::has_any_null(
780            &fwd.effects_id,
781            &fwd.effects,
782            engine::DFA_INITIAL as u32,
783            Nullability::END,
784        );
785        if end_null {
786            matches.push(Match {
787                start: input.len(),
788                end: input.len(),
789            });
790        }
791        Ok(())
792    }
793
794    fn find_all_fwd_bounded(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
795        let RegexInner {
796            b,
797            bounded,
798            matches: matches_buf,
799            ..
800        } = &mut *self.inner.lock().unwrap();
801        let bounded = bounded.as_mut().unwrap();
802        matches_buf.clear();
803        match &bounded.prefix {
804            Some(p) if p.is_literal() => {
805                Self::bdfa_scan::<{ PREFIX_LITERAL }, false>(bounded, b, input, matches_buf)?;
806            }
807            Some(_) => {
808                Self::bdfa_scan::<{ PREFIX_SEARCH }, false>(bounded, b, input, matches_buf)?;
809            }
810            None => {
811                Self::bdfa_scan::<{ PREFIX_NONE }, false>(bounded, b, input, matches_buf)?;
812            }
813        }
814        Ok(matches_buf.clone())
815    }
816
817    fn is_match_fwd_bounded(&self, input: &[u8]) -> Result<bool, Error> {
818        let RegexInner {
819            b,
820            bounded,
821            matches: matches_buf,
822            ..
823        } = &mut *self.inner.lock().unwrap();
824        let bounded = bounded.as_mut().unwrap();
825        matches_buf.clear();
826        let found = match &bounded.prefix {
827            Some(p) if p.is_literal() => {
828                Self::bdfa_scan::<{ PREFIX_LITERAL }, true>(bounded, b, input, matches_buf)?
829            }
830            Some(_) => Self::bdfa_scan::<{ PREFIX_SEARCH }, true>(bounded, b, input, matches_buf)?,
831            None => Self::bdfa_scan::<{ PREFIX_NONE }, true>(bounded, b, input, matches_buf)?,
832        };
833        Ok(found)
834    }
835
836    fn bdfa_scan<const PREFIX: u8, const ISMATCH: bool>(
837        bounded: &mut engine::BDFA,
838        b: &mut RegexBuilder,
839        input: &[u8],
840        matches: &mut Vec<Match>,
841    ) -> Result<bool, Error> {
842        let initial = bounded.initial;
843        let mt_log = bounded.mt_log;
844        let ml = bounded.minterms_lookup;
845        let len = input.len();
846        let mut state = initial;
847        let mut pos: usize = 0;
848
849        if PREFIX == PREFIX_NONE {
850            let data = input.as_ptr();
851            if !ISMATCH {
852                matches.reserve(2048);
853            }
854            loop {
855                if !ISMATCH && matches.len() == matches.capacity() {
856                    matches.reserve(matches.capacity().max(256));
857                }
858                let spare = if ISMATCH {
859                    1
860                } else {
861                    matches.capacity() - matches.len()
862                };
863                let buf_ptr = unsafe { matches.as_mut_ptr().add(matches.len()) };
864                let table = bounded.table.as_ptr();
865                let meo = bounded.match_end_off.as_ptr();
866                let (s, p, mc) = bdfa_inner::<{ PREFIX_NONE }>(
867                    table,
868                    ml.as_ptr(),
869                    data,
870                    mt_log,
871                    initial,
872                    meo,
873                    state,
874                    pos,
875                    len,
876                    buf_ptr,
877                    spare,
878                );
879                state = s;
880                pos = p;
881                if ISMATCH && mc > 0 {
882                    return Ok(true);
883                }
884                unsafe { matches.set_len(matches.len() + mc) };
885                if pos >= len {
886                    break;
887                }
888                let mt = ml[input[pos] as usize] as usize;
889                let entry = bounded.transition(b, state, mt)?;
890                state = (entry & 0xFFFF) as u16;
891                let rel = entry >> 16;
892                if rel > 0 {
893                    if ISMATCH {
894                        return Ok(true);
895                    }
896                    let end_off = bounded.match_end_off[state as usize];
897                    matches.push(Match {
898                        start: pos + 1 - rel as usize,
899                        end: pos + 1 - end_off as usize,
900                    });
901                    state = initial;
902                } else {
903                    pos += 1;
904                }
905            }
906        } else {
907            // PREFIX_SEARCH / PREFIX_LITERAL
908            'main: loop {
909                if pos >= len {
910                    break;
911                }
912
913                if state == initial {
914                    let found = bounded.prefix.as_ref().unwrap().find_fwd(input, pos);
915                    match found {
916                        Some(p) => {
917                            if PREFIX == PREFIX_LITERAL {
918                                pos = p + bounded.prefix_len;
919                                state = bounded.after_prefix;
920                            } else {
921                                pos = p;
922                                for _ in 0..bounded.prefix_len {
923                                    if pos >= len {
924                                        break;
925                                    }
926                                    let mt = ml[input[pos] as usize] as usize;
927                                    let delta = (state as usize) << mt_log | mt;
928                                    let entry = bounded.table[delta];
929                                    let entry = if entry != 0 {
930                                        entry
931                                    } else {
932                                        bounded.transition(b, state, mt)?
933                                    };
934                                    state = (entry & 0xFFFF) as u16;
935                                    if state == initial {
936                                        break;
937                                    }
938                                    pos += 1;
939                                }
940                            }
941                            let rel = bounded.match_rel[state as usize];
942                            if rel > 0 {
943                                if ISMATCH {
944                                    return Ok(true);
945                                }
946                                let end_off = bounded.match_end_off[state as usize];
947                                matches.push(Match {
948                                    start: pos - rel as usize + 1,
949                                    end: pos - end_off as usize + 1,
950                                });
951                                state = initial;
952                            }
953                            continue 'main;
954                        }
955                        None => break 'main,
956                    }
957                }
958
959                unsafe {
960                    let table = bounded.table.as_ptr();
961                    let data = input.as_ptr();
962                    let ml_ptr = ml.as_ptr();
963                    let meo = bounded.match_end_off.as_ptr();
964
965                    while pos < len {
966                        let mt = *ml_ptr.add(*data.add(pos) as usize) as usize;
967                        let delta = (state as usize) << mt_log | mt;
968                        let entry = *table.add(delta);
969                        if entry == 0 {
970                            break;
971                        }
972                        let rel = entry >> 16;
973                        state = (entry & 0xFFFF) as u16;
974                        if state == initial {
975                            continue 'main;
976                        }
977                        if rel > 0 {
978                            if ISMATCH {
979                                return Ok(true);
980                            }
981                            let end_off = *meo.add(state as usize);
982                            matches.push(Match {
983                                start: pos + 1 - rel as usize,
984                                end: pos + 1 - end_off as usize,
985                            });
986                            state = initial;
987                            continue 'main;
988                        }
989                        pos += 1;
990                    }
991                }
992
993                if pos >= len {
994                    break;
995                }
996                let mt = ml[input[pos] as usize] as usize;
997                let entry = bounded.transition(b, state, mt)?;
998                state = (entry & 0xFFFF) as u16;
999                let rel = entry >> 16;
1000                if rel > 0 {
1001                    if ISMATCH {
1002                        return Ok(true);
1003                    }
1004                    let end_off = bounded.match_end_off[state as usize];
1005                    matches.push(Match {
1006                        start: pos + 1 - rel as usize,
1007                        end: pos + 1 - end_off as usize,
1008                    });
1009                    state = initial;
1010                } else {
1011                    pos += 1;
1012                }
1013            }
1014        }
1015
1016        if state != initial {
1017            let node = bounded.states[state as usize];
1018            if node != NodeId::MISSING {
1019                // walk chain: find best match among all chain nodes
1020                let mut best_val = 0u32;
1021                let mut best_step = 0u32;
1022                let mut cur = node;
1023                while cur.0 > NodeId::BOT.0 {
1024                    let packed = b.get_extra(cur);
1025                    let step = packed & 0xFFFF;
1026                    let best = packed >> 16;
1027                    if best > best_val {
1028                        best_val = best;
1029                        best_step = step;
1030                    }
1031                    cur = cur.right(b);
1032                }
1033                if best_val > 0 {
1034                    if ISMATCH {
1035                        return Ok(true);
1036                    }
1037                    matches.push(Match {
1038                        start: len - best_step as usize,
1039                        end: len - best_step as usize + best_val as usize,
1040                    });
1041                }
1042            }
1043        }
1044
1045        Ok(false)
1046    }
1047
1048    fn find_all_fwd_prefix(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
1049        let fwd_prefix = self.prefix.as_ref().and_then(|p| p.fwd_search()).unwrap();
1050        let inner = &mut *self.inner.lock().unwrap();
1051        let matches = &mut inner.matches;
1052        matches.clear();
1053        let mut search_start = 0;
1054
1055        if self.fixed_length == Some(fwd_prefix.len() as u32)
1056            && fwd_prefix.find_all_literal(input, matches)
1057        {
1058        } else {
1059            let prefix_len = fwd_prefix.len();
1060            while let Some(candidate) = fwd_prefix.find_fwd(input, search_start) {
1061                let state = inner
1062                    .fwd
1063                    .walk_input(&mut inner.b, candidate, prefix_len, input)?;
1064                if state != 0 {
1065                    let max_end = inner.fwd.scan_fwd_from(
1066                        &mut inner.b,
1067                        state,
1068                        candidate + prefix_len,
1069                        input,
1070                    )?;
1071                    if max_end != engine::NO_MATCH && max_end > candidate {
1072                        matches.push(Match {
1073                            start: candidate,
1074                            end: max_end,
1075                        });
1076                        search_start = max_end;
1077                        continue;
1078                    }
1079                }
1080                search_start = candidate + 1;
1081            }
1082            #[cfg(feature = "debug-nulls")]
1083            eprintln!(
1084                "  [debug-nulls] fwd_prefix candidates={} confirmed={} false_positives={}",
1085                n_candidates,
1086                n_confirmed,
1087                n_candidates.saturating_sub(n_confirmed)
1088            );
1089        }
1090
1091        Ok(matches.clone())
1092    }
1093
1094    fn find_all_fwd_prefix_stripped(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
1095        let fwd_prefix = self.prefix.as_ref().and_then(|p| p.fwd_search()).unwrap();
1096        let inner = &mut *self.inner.lock().unwrap();
1097        let prefix_len = fwd_prefix.len();
1098        inner.fwd.create_state(&mut inner.b, engine::DFA_INITIAL)?;
1099        inner.matches.clear();
1100        let mut search_start = 0;
1101
1102        while let Some(candidate) = fwd_prefix.find_fwd(input, search_start) {
1103            // 1. confirm match end via fwd DFA
1104            let mut state = engine::DFA_INITIAL;
1105            for i in 0..prefix_len {
1106                let mt = inner.fwd.mt_lookup[input[candidate + i] as usize] as u32;
1107                state = inner.fwd.lazy_transition(&mut inner.b, state, mt)?;
1108                if state == engine::DFA_DEAD {
1109                    break;
1110                }
1111            }
1112            if state == engine::DFA_DEAD {
1113                search_start = candidate + 1;
1114                continue;
1115            }
1116            let max_end = inner.fwd.scan_fwd_from(
1117                &mut inner.b,
1118                state as u32,
1119                candidate + prefix_len,
1120                input,
1121            )?;
1122            if max_end == engine::NO_MATCH {
1123                search_start = candidate + 1;
1124                continue;
1125            }
1126
1127            // 2. find match start via bare rev DFA from confirmed end
1128            let rev_bare = inner.rev_bare.as_mut().unwrap();
1129            let match_start = rev_bare.scan_rev_from(&mut inner.b, max_end, search_start, input)?;
1130            if match_start != engine::NO_MATCH {
1131                inner.matches.push(Match {
1132                    start: match_start,
1133                    end: max_end,
1134                });
1135                search_start = max_end;
1136            } else {
1137                search_start = candidate + 1;
1138            }
1139        }
1140
1141        Ok(inner.matches.clone())
1142    }
1143
1144    fn find_all_fwd_lb_prefix(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
1145        let fwd_prefix = self.prefix.as_ref().and_then(|p| p.fwd_search()).unwrap();
1146        let inner = &mut *self.inner.lock().unwrap();
1147        inner.matches.clear();
1148        let lb_len = self.lb_check_bytes as usize;
1149        let mut search_start = 0usize;
1150
1151        // \A: lb contains \A so a match is possible at position 0 without any preceding lb
1152        // byte. Handle this explicitly via begin_table before the SIMD scan.
1153        if self.fwd_lb_begin_nullable && !input.is_empty() {
1154            let state = inner.fwd.walk_input(&mut inner.b, 0, 1, input)?;
1155            if state != 0 {
1156                let max_end = inner.fwd.scan_fwd_from(&mut inner.b, state, 1, input)?;
1157                if max_end != engine::NO_MATCH {
1158                    inner.matches.push(Match {
1159                        start: 0,
1160                        end: max_end,
1161                    });
1162                    search_start = max_end;
1163                }
1164            }
1165        }
1166
1167        // SIMD scan for lb bytes. Only valid when in the initial/pruned DFA state,
1168        // which is always the case here (scan_fwd_from fully processes each candidate).
1169        while let Some(candidate) = fwd_prefix.find_fwd(input, search_start) {
1170            let body_start = candidate + lb_len;
1171            let max_end = inner.fwd.scan_fwd_from(
1172                &mut inner.b,
1173                engine::DFA_INITIAL as u32,
1174                body_start,
1175                input,
1176            )?;
1177            if max_end != engine::NO_MATCH {
1178                inner.matches.push(Match {
1179                    start: body_start,
1180                    end: max_end,
1181                });
1182                search_start = max_end;
1183            } else {
1184                search_start = body_start;
1185            }
1186        }
1187
1188        Ok(inner.matches.clone())
1189    }
1190
1191    /// longest match anchored at position 0.
1192    ///
1193    /// returns `None` if the pattern does not match at position 0.
1194    pub fn find_anchored(&self, input: &[u8]) -> Result<Option<Match>, Error> {
1195        if input.is_empty() {
1196            return if self.empty_nullable {
1197                Ok(Some(Match { start: 0, end: 0 }))
1198            } else {
1199                Ok(None)
1200            };
1201        }
1202        let inner = &mut *self.inner.lock().unwrap();
1203        let max_end = inner.fwd.scan_fwd_slow(&mut inner.b, 0, input)?;
1204        if max_end != engine::NO_MATCH {
1205            Ok(Some(Match {
1206                start: 0,
1207                end: max_end,
1208            }))
1209        } else {
1210            Ok(None)
1211        }
1212    }
1213
1214    /// whether the pattern matches anywhere in the input.
1215    ///
1216    /// faster than `find_all` when you only need a yes/no answer.
1217    pub fn is_match(&self, input: &[u8]) -> Result<bool, Error> {
1218        if input.is_empty() {
1219            return Ok(self.empty_nullable);
1220        }
1221        if self.has_bounded {
1222            return self.is_match_fwd_bounded(input);
1223        }
1224        // TODO: very critical of this, likely unnecessary special case, need to measure if it makes any difference
1225        if let Some(fwd_prefix) = self.prefix.as_ref().and_then(|p| p.fwd_search()) {
1226            if let Some(fl) = self.fixed_length {
1227                if fl as usize == fwd_prefix.len() {
1228                    if let Some(candidate) = fwd_prefix.find_fwd(input, 0) {
1229                        return Ok(candidate + fl as usize <= input.len());
1230                    }
1231                    return Ok(false);
1232                }
1233            }
1234            let inner = &mut *self.inner.lock().unwrap();
1235            let prefix_len = fwd_prefix.len();
1236            let mut search_start = 0;
1237            while let Some(candidate) = fwd_prefix.find_fwd(input, search_start) {
1238                let state = inner
1239                    .fwd
1240                    .walk_input(&mut inner.b, candidate, prefix_len, input)?;
1241                if state != 0 {
1242                    let max_end = inner.fwd.scan_fwd_from(
1243                        &mut inner.b,
1244                        state,
1245                        candidate + prefix_len,
1246                        input,
1247                    )?;
1248                    if max_end != engine::NO_MATCH && max_end > candidate {
1249                        return Ok(true);
1250                    }
1251                }
1252                search_start = candidate + 1;
1253            }
1254            return Ok(false);
1255        }
1256        let inner = &mut *self.inner.lock().unwrap();
1257        if inner.rev_ts.effects_id[engine::DFA_INITIAL as usize] != 0 {
1258            return Ok(true);
1259        }
1260        inner.nulls.clear();
1261        inner
1262            .rev_ts
1263            .collect_rev_first(&mut inner.b, input.len() - 1, input, &mut inner.nulls)?;
1264        Ok(!inner.nulls.is_empty())
1265    }
1266}