Skip to main content

virtual_frame/
regex_engine.rs

1//! NFA-based regex engine — zero-dependency, deterministic, linear-time.
2//!
3//! Compiles regex patterns into NFA states, then executes via Thompson NFA
4//! simulation. No backtracking, no per-match heap allocation beyond the
5//! initial state-set swap buffers. Guarantees O(n×m) worst-case matching.
6//!
7//! # Public API
8//!
9//! | Function     | Purpose                                          |
10//! |--------------|--------------------------------------------------|
11//! | [`is_match`] | Test whether the pattern matches anywhere         |
12//! | [`find`]     | Return the first match span `(start, end)`       |
13//! | [`find_all`] | Return all non-overlapping match spans            |
14//! | [`split`]    | Split input by pattern, returning segment spans   |
15//!
16//! # Supported syntax (Perl-spirit subset)
17//!
18//! ```text
19//!   .         any byte (or any byte except \n without `s` flag)
20//!   \d        ASCII digit [0-9]
21//!   \w        ASCII word  [a-zA-Z0-9_]
22//!   \s        ASCII whitespace [\t\n\r\x0C\x20]
23//!   \D \W \S  negated classes
24//!   [abc]     character class
25//!   [^abc]    negated character class
26//!   [a-z]     character range
27//!   a|b       alternation
28//!   (...)     grouping
29//!   *         zero or more (greedy)
30//!   +         one or more (greedy)
31//!   ?         zero or one (greedy)
32//!   *? +? ??  non-greedy (lazy) variants
33//!   ^         start of input (or line in `m` mode)
34//!   $         end of input (or line in `m` mode)
35//!   \b        word boundary
36//!   \\        literal backslash
37//!   \xNN      hex byte
38//! ```
39//!
40//! # Flags
41//!
42//! | Flag | Meaning                                          |
43//! |------|--------------------------------------------------|
44//! | `i`  | Case-insensitive (ASCII only)                    |
45//! | `m`  | Multiline (`^`/`$` match line boundaries)        |
46//! | `s`  | Dotall (`.` matches `\n`)                        |
47//! | `x`  | Extended (whitespace ignored, `#` comments)      |
48
49// ---------------------------------------------------------------------------
50// Public API
51// ---------------------------------------------------------------------------
52
53/// Test whether `pattern` matches anywhere inside `haystack`.
54pub fn is_match(pattern: &str, flags: &str, haystack: &[u8]) -> bool {
55    let opts = Flags::parse(flags);
56    let nfa = match compile(pattern, &opts) {
57        Ok(nfa) => nfa,
58        Err(_) => return false,
59    };
60    nfa_search(&nfa, haystack, &opts).is_some()
61}
62
63/// Find the byte-offset span of the first match in `haystack`.
64pub fn find(pattern: &str, flags: &str, haystack: &[u8]) -> Option<(usize, usize)> {
65    let opts = Flags::parse(flags);
66    let nfa = compile(pattern, &opts).ok()?;
67    nfa_search(&nfa, haystack, &opts)
68}
69
70/// Find all non-overlapping match spans in `haystack`.
71pub fn find_all(pattern: &str, flags: &str, haystack: &[u8]) -> Vec<(usize, usize)> {
72    let opts = Flags::parse(flags);
73    let nfa = match compile(pattern, &opts) {
74        Ok(nfa) => nfa,
75        Err(_) => return Vec::new(),
76    };
77    let mut results = Vec::new();
78    let mut start = 0;
79    while start <= haystack.len() {
80        if let Some((ms, me)) = nfa_search_from(&nfa, haystack, &opts, start) {
81            results.push((ms, me));
82            start = if me == ms { me + 1 } else { me };
83        } else {
84            break;
85        }
86    }
87    results
88}
89
90/// Split `haystack` by a regex pattern, returning byte-ranges of non-matching segments.
91pub fn split(pattern: &str, flags: &str, haystack: &[u8]) -> Vec<(usize, usize)> {
92    let matches = find_all(pattern, flags, haystack);
93    let mut segments = Vec::new();
94    let mut pos = 0;
95    for (ms, me) in &matches {
96        segments.push((pos, *ms));
97        pos = *me;
98    }
99    segments.push((pos, haystack.len()));
100    segments
101}
102
103// ---------------------------------------------------------------------------
104// Flags
105// ---------------------------------------------------------------------------
106
107#[derive(Clone, Debug)]
108struct Flags {
109    case_insensitive: bool,
110    multiline: bool,
111    dotall: bool,
112    extended: bool,
113}
114
115impl Flags {
116    fn parse(s: &str) -> Self {
117        Self {
118            case_insensitive: s.contains('i'),
119            multiline: s.contains('m'),
120            dotall: s.contains('s'),
121            extended: s.contains('x'),
122        }
123    }
124}
125
126// ---------------------------------------------------------------------------
127// NFA representation
128// ---------------------------------------------------------------------------
129
130#[derive(Clone, Debug)]
131enum NfaNode {
132    Byte(u8),
133    Class(ByteClass),
134    AnyByte,
135    AnyByteNoNl,
136    Split(usize, usize),
137    Epsilon(usize),
138    Accept,
139    AnchorStart,
140    AnchorEnd,
141    WordBoundary,
142}
143
144#[derive(Clone, Debug)]
145struct ByteClass {
146    bits: [u64; 4], // 256 bits
147}
148
149impl ByteClass {
150    fn new() -> Self {
151        Self { bits: [0; 4] }
152    }
153
154    fn set(&mut self, b: u8) {
155        let idx = (b / 64) as usize;
156        let bit = b % 64;
157        self.bits[idx] |= 1u64 << bit;
158    }
159
160    fn contains(&self, b: u8) -> bool {
161        let idx = (b / 64) as usize;
162        let bit = b % 64;
163        (self.bits[idx] >> bit) & 1 == 1
164    }
165
166    fn negate(&mut self) {
167        for b in &mut self.bits {
168            *b = !*b;
169        }
170    }
171
172    fn set_range(&mut self, lo: u8, hi: u8) {
173        for b in lo..=hi {
174            self.set(b);
175        }
176    }
177}
178
179struct Nfa {
180    nodes: Vec<NfaNode>,
181    start: usize,
182    has_lazy: bool,
183}
184
185// ---------------------------------------------------------------------------
186// Compiler: pattern string → NFA
187// ---------------------------------------------------------------------------
188
189struct Compiler<'a> {
190    pattern: &'a [u8],
191    pos: usize,
192    nodes: Vec<NfaNode>,
193    flags: Flags,
194    has_lazy: bool,
195}
196
197impl<'a> Compiler<'a> {
198    fn new(pattern: &'a str, flags: &Flags) -> Self {
199        Self {
200            pattern: pattern.as_bytes(),
201            pos: 0,
202            nodes: Vec::new(),
203            flags: flags.clone(),
204            has_lazy: false,
205        }
206    }
207
208    fn peek(&self) -> Option<u8> {
209        if self.pos < self.pattern.len() {
210            Some(self.pattern[self.pos])
211        } else {
212            None
213        }
214    }
215
216    fn advance(&mut self) -> Option<u8> {
217        let ch = self.peek()?;
218        self.pos += 1;
219        Some(ch)
220    }
221
222    fn emit(&mut self, node: NfaNode) -> usize {
223        let id = self.nodes.len();
224        self.nodes.push(node);
225        id
226    }
227
228    fn compile(mut self) -> Result<Nfa, String> {
229        let (start, end) = self.parse_alternation()?;
230        let accept = self.emit(NfaNode::Accept);
231        self.patch(end, accept);
232        Ok(Nfa {
233            nodes: self.nodes,
234            start,
235            has_lazy: self.has_lazy,
236        })
237    }
238
239    fn patch(&mut self, placeholder: usize, target: usize) {
240        match &mut self.nodes[placeholder] {
241            NfaNode::Epsilon(ref mut t) if *t == usize::MAX => *t = target,
242            NfaNode::Split(ref mut a, ref mut b) => {
243                if *a == usize::MAX {
244                    *a = target;
245                }
246                if *b == usize::MAX {
247                    *b = target;
248                }
249            }
250            _ => {
251                let _next = self.nodes.len();
252                self.nodes.push(NfaNode::Epsilon(target));
253            }
254        }
255    }
256
257    fn parse_alternation(&mut self) -> Result<(usize, usize), String> {
258        let (mut start, mut end) = self.parse_sequence()?;
259        while self.peek() == Some(b'|') {
260            self.advance();
261            let (s2, e2) = self.parse_sequence()?;
262            let split = self.emit(NfaNode::Split(start, s2));
263            let join = self.emit(NfaNode::Epsilon(usize::MAX));
264            self.patch(end, join);
265            self.patch(e2, join);
266            start = split;
267            end = join;
268        }
269        Ok((start, end))
270    }
271
272    fn parse_sequence(&mut self) -> Result<(usize, usize), String> {
273        let mut fragments: Vec<(usize, usize)> = Vec::new();
274        loop {
275            match self.peek() {
276                None | Some(b'|') | Some(b')') => break,
277                _ => {
278                    let frag = self.parse_quantified()?;
279                    fragments.push(frag);
280                }
281            }
282        }
283        if fragments.is_empty() {
284            let e = self.emit(NfaNode::Epsilon(usize::MAX));
285            return Ok((e, e));
286        }
287        for i in 0..fragments.len() - 1 {
288            let next_start = fragments[i + 1].0;
289            self.patch(fragments[i].1, next_start);
290        }
291        Ok((fragments[0].0, fragments[fragments.len() - 1].1))
292    }
293
294    fn parse_quantified(&mut self) -> Result<(usize, usize), String> {
295        let (s, e) = self.parse_atom()?;
296        match self.peek() {
297            Some(b'*') => {
298                self.advance();
299                let lazy = self.peek() == Some(b'?');
300                if lazy {
301                    self.advance();
302                    self.has_lazy = true;
303                }
304                let split = self.emit(NfaNode::Split(s, usize::MAX));
305                self.patch(e, split);
306                let out = self.emit(NfaNode::Epsilon(usize::MAX));
307                if lazy {
308                    if let NfaNode::Split(ref mut a, ref mut b) = self.nodes[split] {
309                        *a = out;
310                        *b = s;
311                    }
312                } else if let NfaNode::Split(_, ref mut b) = self.nodes[split] {
313                    *b = out;
314                }
315                Ok((split, out))
316            }
317            Some(b'+') => {
318                self.advance();
319                let lazy = self.peek() == Some(b'?');
320                if lazy {
321                    self.advance();
322                    self.has_lazy = true;
323                }
324                let split = self.emit(NfaNode::Split(s, usize::MAX));
325                self.patch(e, split);
326                let out = self.emit(NfaNode::Epsilon(usize::MAX));
327                if lazy {
328                    if let NfaNode::Split(ref mut a, ref mut b) = self.nodes[split] {
329                        *a = out;
330                        *b = s;
331                    }
332                } else if let NfaNode::Split(_, ref mut b) = self.nodes[split] {
333                    *b = out;
334                }
335                Ok((s, out))
336            }
337            Some(b'?') => {
338                self.advance();
339                let lazy = self.peek() == Some(b'?');
340                if lazy {
341                    self.advance();
342                    self.has_lazy = true;
343                }
344                let out = self.emit(NfaNode::Epsilon(usize::MAX));
345                self.patch(e, out);
346                let split = if lazy {
347                    self.emit(NfaNode::Split(out, s))
348                } else {
349                    self.emit(NfaNode::Split(s, out))
350                };
351                Ok((split, out))
352            }
353            _ => Ok((s, e)),
354        }
355    }
356
357    fn parse_atom(&mut self) -> Result<(usize, usize), String> {
358        if self.flags.extended {
359            self.skip_extended_ws();
360        }
361
362        match self.peek() {
363            Some(b'(') => {
364                self.advance();
365                let inner = self.parse_alternation()?;
366                if self.peek() == Some(b')') {
367                    self.advance();
368                } else {
369                    return Err("unclosed group `(`".into());
370                }
371                Ok(inner)
372            }
373            Some(b'[') => self.parse_char_class(),
374            Some(b'.') => {
375                self.advance();
376                let node = if self.flags.dotall {
377                    NfaNode::AnyByte
378                } else {
379                    NfaNode::AnyByteNoNl
380                };
381                let s = self.emit(node);
382                let e = self.emit(NfaNode::Epsilon(usize::MAX));
383                Ok((s, e))
384            }
385            Some(b'^') => {
386                self.advance();
387                let s = self.emit(NfaNode::AnchorStart);
388                let e = self.emit(NfaNode::Epsilon(usize::MAX));
389                Ok((s, e))
390            }
391            Some(b'$') => {
392                self.advance();
393                let s = self.emit(NfaNode::AnchorEnd);
394                let e = self.emit(NfaNode::Epsilon(usize::MAX));
395                Ok((s, e))
396            }
397            Some(b'\\') => {
398                self.advance();
399                let (class_or_byte, negate) = self.parse_escape()?;
400                match class_or_byte {
401                    EscapeResult::Byte(b) => {
402                        let actual_byte = if self.flags.case_insensitive {
403                            b.to_ascii_lowercase()
404                        } else {
405                            b
406                        };
407                        let s =
408                            if self.flags.case_insensitive && actual_byte.is_ascii_alphabetic() {
409                                let mut cls = ByteClass::new();
410                                cls.set(actual_byte.to_ascii_lowercase());
411                                cls.set(actual_byte.to_ascii_uppercase());
412                                self.emit(NfaNode::Class(cls))
413                            } else {
414                                self.emit(NfaNode::Byte(actual_byte))
415                            };
416                        let e = self.emit(NfaNode::Epsilon(usize::MAX));
417                        Ok((s, e))
418                    }
419                    EscapeResult::Class(mut cls) => {
420                        if negate {
421                            cls.negate();
422                        }
423                        let s = self.emit(NfaNode::Class(cls));
424                        let e = self.emit(NfaNode::Epsilon(usize::MAX));
425                        Ok((s, e))
426                    }
427                    EscapeResult::WordBoundary => {
428                        let s = self.emit(NfaNode::WordBoundary);
429                        let e = self.emit(NfaNode::Epsilon(usize::MAX));
430                        Ok((s, e))
431                    }
432                }
433            }
434            Some(ch) if ch != b'|' && ch != b')' && ch != b'*' && ch != b'+' && ch != b'?' => {
435                self.advance();
436                let s = if self.flags.case_insensitive && ch.is_ascii_alphabetic() {
437                    let mut cls = ByteClass::new();
438                    cls.set(ch.to_ascii_lowercase());
439                    cls.set(ch.to_ascii_uppercase());
440                    self.emit(NfaNode::Class(cls))
441                } else {
442                    self.emit(NfaNode::Byte(ch))
443                };
444                let e = self.emit(NfaNode::Epsilon(usize::MAX));
445                Ok((s, e))
446            }
447            _ => Err(format!(
448                "unexpected character in regex at position {}",
449                self.pos
450            )),
451        }
452    }
453
454    fn parse_char_class(&mut self) -> Result<(usize, usize), String> {
455        self.advance(); // consume '['
456        let negated = self.peek() == Some(b'^');
457        if negated {
458            self.advance();
459        }
460        let mut cls = ByteClass::new();
461        let mut first = true;
462        loop {
463            match self.peek() {
464                None => return Err("unclosed character class `[`".into()),
465                Some(b']') if !first => {
466                    self.advance();
467                    break;
468                }
469                _ => {
470                    first = false;
471                    let lo = self.parse_class_atom()?;
472                    if self.peek() == Some(b'-')
473                        && self.pos + 1 < self.pattern.len()
474                        && self.pattern[self.pos + 1] != b']'
475                    {
476                        self.advance(); // consume '-'
477                        let hi = self.parse_class_atom()?;
478                        if self.flags.case_insensitive {
479                            for b in lo..=hi {
480                                cls.set(b.to_ascii_lowercase());
481                                cls.set(b.to_ascii_uppercase());
482                            }
483                        } else {
484                            cls.set_range(lo, hi);
485                        }
486                    } else if self.flags.case_insensitive && lo.is_ascii_alphabetic() {
487                        cls.set(lo.to_ascii_lowercase());
488                        cls.set(lo.to_ascii_uppercase());
489                    } else {
490                        cls.set(lo);
491                    }
492                }
493            }
494        }
495        if negated {
496            cls.negate();
497        }
498        let s = self.emit(NfaNode::Class(cls));
499        let e = self.emit(NfaNode::Epsilon(usize::MAX));
500        Ok((s, e))
501    }
502
503    fn parse_class_atom(&mut self) -> Result<u8, String> {
504        match self.advance() {
505            Some(b'\\') => match self.advance() {
506                Some(b'n') => Ok(b'\n'),
507                Some(b't') => Ok(b'\t'),
508                Some(b'r') => Ok(b'\r'),
509                Some(b'\\') => Ok(b'\\'),
510                Some(b']') => Ok(b']'),
511                Some(b'-') => Ok(b'-'),
512                Some(b'x') => self.parse_hex_byte(),
513                Some(ch) => Ok(ch),
514                None => Err("unexpected end of escape in character class".into()),
515            },
516            Some(ch) => Ok(ch),
517            None => Err("unexpected end of character class".into()),
518        }
519    }
520
521    fn parse_hex_byte(&mut self) -> Result<u8, String> {
522        let hi = self.advance().ok_or("incomplete hex escape")?;
523        let lo = self.advance().ok_or("incomplete hex escape")?;
524        let h = hex_val(hi).ok_or_else(|| format!("invalid hex digit `{}`", hi as char))?;
525        let l = hex_val(lo).ok_or_else(|| format!("invalid hex digit `{}`", lo as char))?;
526        Ok(h * 16 + l)
527    }
528
529    fn parse_escape(&mut self) -> Result<(EscapeResult, bool), String> {
530        match self.advance() {
531            Some(b'd') => {
532                let mut cls = ByteClass::new();
533                cls.set_range(b'0', b'9');
534                Ok((EscapeResult::Class(cls), false))
535            }
536            Some(b'D') => {
537                let mut cls = ByteClass::new();
538                cls.set_range(b'0', b'9');
539                Ok((EscapeResult::Class(cls), true))
540            }
541            Some(b'w') => {
542                let mut cls = ByteClass::new();
543                cls.set_range(b'a', b'z');
544                cls.set_range(b'A', b'Z');
545                cls.set_range(b'0', b'9');
546                cls.set(b'_');
547                Ok((EscapeResult::Class(cls), false))
548            }
549            Some(b'W') => {
550                let mut cls = ByteClass::new();
551                cls.set_range(b'a', b'z');
552                cls.set_range(b'A', b'Z');
553                cls.set_range(b'0', b'9');
554                cls.set(b'_');
555                Ok((EscapeResult::Class(cls), true))
556            }
557            Some(b's') => {
558                let mut cls = ByteClass::new();
559                cls.set(b' ');
560                cls.set(b'\t');
561                cls.set(b'\n');
562                cls.set(b'\r');
563                cls.set(0x0C);
564                Ok((EscapeResult::Class(cls), false))
565            }
566            Some(b'S') => {
567                let mut cls = ByteClass::new();
568                cls.set(b' ');
569                cls.set(b'\t');
570                cls.set(b'\n');
571                cls.set(b'\r');
572                cls.set(0x0C);
573                Ok((EscapeResult::Class(cls), true))
574            }
575            Some(b'b') => Ok((EscapeResult::WordBoundary, false)),
576            Some(b'n') => Ok((EscapeResult::Byte(b'\n'), false)),
577            Some(b't') => Ok((EscapeResult::Byte(b'\t'), false)),
578            Some(b'r') => Ok((EscapeResult::Byte(b'\r'), false)),
579            Some(b'0') => Ok((EscapeResult::Byte(0), false)),
580            Some(b'x') => {
581                let b = self.parse_hex_byte()?;
582                Ok((EscapeResult::Byte(b), false))
583            }
584            Some(ch) => Ok((EscapeResult::Byte(ch), false)),
585            None => Err("unexpected end of escape sequence".into()),
586        }
587    }
588
589    fn skip_extended_ws(&mut self) {
590        while let Some(ch) = self.peek() {
591            if ch == b' ' || ch == b'\t' || ch == b'\n' || ch == b'\r' {
592                self.advance();
593            } else if ch == b'#' {
594                while let Some(c) = self.peek() {
595                    self.advance();
596                    if c == b'\n' {
597                        break;
598                    }
599                }
600            } else {
601                break;
602            }
603        }
604    }
605}
606
607enum EscapeResult {
608    Byte(u8),
609    Class(ByteClass),
610    WordBoundary,
611}
612
613fn compile(pattern: &str, flags: &Flags) -> Result<Nfa, String> {
614    Compiler::new(pattern, flags).compile()
615}
616
617fn hex_val(b: u8) -> Option<u8> {
618    match b {
619        b'0'..=b'9' => Some(b - b'0'),
620        b'a'..=b'f' => Some(b - b'a' + 10),
621        b'A'..=b'F' => Some(b - b'A' + 10),
622        _ => None,
623    }
624}
625
626// ---------------------------------------------------------------------------
627// NFA executor: Thompson NFA simulation
628// ---------------------------------------------------------------------------
629
630fn nfa_search(nfa: &Nfa, haystack: &[u8], opts: &Flags) -> Option<(usize, usize)> {
631    for start in 0..=haystack.len() {
632        if let Some(end) = nfa_match_at(nfa, haystack, opts, start) {
633            return Some((start, end));
634        }
635    }
636    None
637}
638
639fn nfa_search_from(
640    nfa: &Nfa,
641    haystack: &[u8],
642    opts: &Flags,
643    from: usize,
644) -> Option<(usize, usize)> {
645    for start in from..=haystack.len() {
646        if let Some(end) = nfa_match_at(nfa, haystack, opts, start) {
647            return Some((start, end));
648        }
649    }
650    None
651}
652
653/// Try to match the NFA starting at position `start` in the haystack.
654fn nfa_match_at(nfa: &Nfa, haystack: &[u8], opts: &Flags, start: usize) -> Option<usize> {
655    let mut current = Vec::new();
656    let mut next = Vec::new();
657    let mut in_current = vec![false; nfa.nodes.len()];
658    let mut in_next = vec![false; nfa.nodes.len()];
659    let mut last_match: Option<usize> = None;
660
661    add_state(
662        nfa,
663        nfa.start,
664        &mut current,
665        &mut in_current,
666        haystack,
667        opts,
668        start,
669    );
670
671    for &state in &current {
672        if matches!(nfa.nodes[state], NfaNode::Accept) {
673            last_match = Some(start);
674            if nfa.has_lazy {
675                return last_match;
676            }
677        }
678    }
679
680    let mut pos = start;
681    while pos < haystack.len() && !current.is_empty() {
682        let byte = haystack[pos];
683
684        for &state in &current {
685            match &nfa.nodes[state] {
686                NfaNode::Byte(b) => {
687                    if byte == *b {
688                        add_state(nfa, state + 1, &mut next, &mut in_next, haystack, opts, pos + 1);
689                    }
690                }
691                NfaNode::Class(cls) => {
692                    if cls.contains(byte) {
693                        add_state(nfa, state + 1, &mut next, &mut in_next, haystack, opts, pos + 1);
694                    }
695                }
696                NfaNode::AnyByte => {
697                    add_state(nfa, state + 1, &mut next, &mut in_next, haystack, opts, pos + 1);
698                }
699                NfaNode::AnyByteNoNl => {
700                    if byte != b'\n' {
701                        add_state(nfa, state + 1, &mut next, &mut in_next, haystack, opts, pos + 1);
702                    }
703                }
704                _ => {}
705            }
706        }
707
708        for &state in &next {
709            if matches!(nfa.nodes[state], NfaNode::Accept) {
710                last_match = Some(pos + 1);
711                if nfa.has_lazy {
712                    return last_match;
713                }
714            }
715        }
716
717        std::mem::swap(&mut current, &mut next);
718        std::mem::swap(&mut in_current, &mut in_next);
719        next.clear();
720        for b in in_next.iter_mut() {
721            *b = false;
722        }
723
724        pos += 1;
725    }
726
727    last_match
728}
729
730/// Add a state to the set, following epsilon transitions (closure).
731fn add_state(
732    nfa: &Nfa,
733    state: usize,
734    set: &mut Vec<usize>,
735    in_set: &mut [bool],
736    haystack: &[u8],
737    opts: &Flags,
738    pos: usize,
739) {
740    if state >= nfa.nodes.len() || in_set[state] {
741        return;
742    }
743    in_set[state] = true;
744
745    match &nfa.nodes[state] {
746        NfaNode::Epsilon(target) => {
747            if *target != usize::MAX {
748                add_state(nfa, *target, set, in_set, haystack, opts, pos);
749            }
750        }
751        NfaNode::Split(a, b) => {
752            add_state(nfa, *a, set, in_set, haystack, opts, pos);
753            add_state(nfa, *b, set, in_set, haystack, opts, pos);
754        }
755        NfaNode::AnchorStart => {
756            let ok = if opts.multiline {
757                pos == 0 || (pos > 0 && haystack[pos - 1] == b'\n')
758            } else {
759                pos == 0
760            };
761            if ok {
762                add_state(nfa, state + 1, set, in_set, haystack, opts, pos);
763            }
764        }
765        NfaNode::AnchorEnd => {
766            let ok = if opts.multiline {
767                pos == haystack.len() || haystack[pos] == b'\n'
768            } else {
769                pos == haystack.len()
770            };
771            if ok {
772                add_state(nfa, state + 1, set, in_set, haystack, opts, pos);
773            }
774        }
775        NfaNode::WordBoundary => {
776            let before_word = pos > 0 && is_word_byte(haystack[pos - 1]);
777            let after_word = pos < haystack.len() && is_word_byte(haystack[pos]);
778            if before_word != after_word {
779                add_state(nfa, state + 1, set, in_set, haystack, opts, pos);
780            }
781        }
782        _ => {
783            set.push(state);
784        }
785    }
786}
787
788fn is_word_byte(b: u8) -> bool {
789    b.is_ascii_alphanumeric() || b == b'_'
790}
791
792// ---------------------------------------------------------------------------
793// Tests
794// ---------------------------------------------------------------------------
795
796#[cfg(test)]
797mod tests {
798    use super::*;
799
800    #[test]
801    fn test_simple_literal() {
802        assert!(is_match("hello", "", b"say hello world"));
803        assert!(!is_match("hello", "", b"say helo world"));
804    }
805
806    #[test]
807    fn test_dot() {
808        assert!(is_match("h.llo", "", b"hello"));
809        assert!(is_match("h.llo", "", b"hxllo"));
810        assert!(!is_match("h.llo", "", b"hlo"));
811    }
812
813    #[test]
814    fn test_star() {
815        assert!(is_match("ab*c", "", b"ac"));
816        assert!(is_match("ab*c", "", b"abc"));
817        assert!(is_match("ab*c", "", b"abbc"));
818        assert!(is_match("ab*c", "", b"abbbbc"));
819    }
820
821    #[test]
822    fn test_plus() {
823        assert!(!is_match("ab+c", "", b"ac"));
824        assert!(is_match("ab+c", "", b"abc"));
825        assert!(is_match("ab+c", "", b"abbc"));
826    }
827
828    #[test]
829    fn test_question() {
830        assert!(is_match("ab?c", "", b"ac"));
831        assert!(is_match("ab?c", "", b"abc"));
832        assert!(!is_match("ab?c", "", b"abbc"));
833    }
834
835    #[test]
836    fn test_alternation() {
837        assert!(is_match("cat|dog", "", b"I have a cat"));
838        assert!(is_match("cat|dog", "", b"I have a dog"));
839        assert!(!is_match("cat|dog", "", b"I have a bird"));
840    }
841
842    #[test]
843    fn test_char_class() {
844        assert!(is_match("[abc]", "", b"a"));
845        assert!(is_match("[abc]", "", b"b"));
846        assert!(!is_match("[abc]", "", b"d"));
847    }
848
849    #[test]
850    fn test_char_class_range() {
851        assert!(is_match("[a-z]", "", b"m"));
852        assert!(!is_match("[a-z]", "", b"M"));
853    }
854
855    #[test]
856    fn test_negated_class() {
857        assert!(!is_match("[^abc]", "", b"a"));
858        assert!(is_match("[^abc]", "", b"d"));
859    }
860
861    #[test]
862    fn test_digit_class() {
863        assert!(is_match("\\d+", "", b"abc123def"));
864        assert!(!is_match("^\\d+$", "", b"abc123def"));
865        assert!(is_match("^\\d+$", "", b"123"));
866    }
867
868    #[test]
869    fn test_word_class() {
870        assert!(is_match("\\w+", "", b"hello_world"));
871        assert!(!is_match("^\\w+$", "", b"hello world"));
872    }
873
874    #[test]
875    fn test_anchors() {
876        assert!(is_match("^hello", "", b"hello world"));
877        assert!(!is_match("^hello", "", b"say hello"));
878        assert!(is_match("world$", "", b"hello world"));
879        assert!(!is_match("world$", "", b"world hello"));
880    }
881
882    #[test]
883    fn test_case_insensitive() {
884        assert!(is_match("hello", "i", b"HELLO"));
885        assert!(is_match("hello", "i", b"HeLLo"));
886    }
887
888    #[test]
889    fn test_dotall() {
890        assert!(!is_match("a.b", "", b"a\nb"));
891        assert!(is_match("a.b", "s", b"a\nb"));
892    }
893
894    #[test]
895    fn test_multiline() {
896        assert!(is_match("^world", "m", b"hello\nworld"));
897        assert!(!is_match("^world", "", b"hello\nworld"));
898    }
899
900    #[test]
901    fn test_groups() {
902        assert!(is_match("(abc)+", "", b"abcabc"));
903        assert!(!is_match("(abc)+", "", b"abab"));
904    }
905
906    #[test]
907    fn test_hex_escape() {
908        assert!(is_match("\\x41\\x42", "", b"AB"));
909    }
910
911    #[test]
912    fn test_word_boundary() {
913        assert!(is_match("\\bhello\\b", "", b"say hello world"));
914        assert!(!is_match("\\bhello\\b", "", b"sayhelloworld"));
915    }
916
917    #[test]
918    fn test_find_span() {
919        let span = find("\\d+", "", b"abc123def");
920        assert_eq!(span, Some((3, 6)));
921    }
922
923    #[test]
924    fn test_find_all() {
925        let spans = find_all("\\d+", "", b"a1b22c333");
926        assert_eq!(spans, vec![(1, 2), (3, 5), (6, 9)]);
927    }
928
929    #[test]
930    fn test_split() {
931        let segs = split(",", "", b"a,b,c");
932        assert_eq!(segs, vec![(0, 1), (2, 3), (4, 5)]);
933    }
934
935    #[test]
936    fn test_split_whitespace() {
937        let segs = split("\\s+", "", b"hello  world");
938        assert_eq!(segs, vec![(0, 5), (7, 12)]);
939    }
940
941    #[test]
942    fn test_empty_pattern() {
943        assert!(is_match("", "", b"anything"));
944    }
945
946    #[test]
947    fn test_empty_haystack() {
948        assert!(!is_match("a", "", b""));
949        assert!(is_match("", "", b""));
950        assert!(is_match("a*", "", b""));
951    }
952
953    #[test]
954    fn test_determinism() {
955        for _ in 0..10 {
956            let r = find_all("\\d+", "", b"x1y22z333");
957            assert_eq!(r, vec![(1, 2), (3, 5), (6, 9)]);
958        }
959    }
960
961    #[test]
962    fn test_whitespace_class() {
963        assert!(is_match("\\s", "", b" "));
964        assert!(is_match("\\s", "", b"\t"));
965        assert!(is_match("\\s", "", b"\n"));
966        assert!(!is_match("\\s", "", b"a"));
967    }
968
969    #[test]
970    fn test_negated_digit() {
971        assert!(is_match("\\D", "", b"a"));
972        assert!(!is_match("\\D", "", b"5"));
973    }
974
975    #[test]
976    fn test_lazy_star() {
977        let greedy = find("a.*b", "", b"aXbYb");
978        assert_eq!(greedy, Some((0, 5)));
979        let lazy = find("a.*?b", "", b"aXbYb");
980        assert_eq!(lazy, Some((0, 3)));
981    }
982
983    #[test]
984    fn test_lazy_plus() {
985        let greedy = find("a.+b", "", b"aXYZb");
986        assert_eq!(greedy, Some((0, 5)));
987        let lazy = find("a.+?b", "", b"aXbYb");
988        assert_eq!(lazy, Some((0, 3)));
989    }
990
991    #[test]
992    fn test_lazy_star_find_all() {
993        let greedy = find_all("<.+>", "", b"<a><b>");
994        assert_eq!(greedy, vec![(0, 6)]);
995        let lazy = find_all("<.+?>", "", b"<a><b>");
996        assert_eq!(lazy, vec![(0, 3), (3, 6)]);
997    }
998
999    #[test]
1000    fn test_lazy_star_zero_length() {
1001        let lazy = find("a*?", "", b"aaa");
1002        assert_eq!(lazy, Some((0, 0)));
1003    }
1004
1005    #[test]
1006    fn test_lazy_plus_minimum() {
1007        let lazy = find("a+?", "", b"aaa");
1008        assert_eq!(lazy, Some((0, 1)));
1009    }
1010
1011    #[test]
1012    fn test_greedy_unchanged() {
1013        let greedy = find("a+", "", b"aaa");
1014        assert_eq!(greedy, Some((0, 3)));
1015        let greedy2 = find("a*", "", b"aaa");
1016        assert_eq!(greedy2, Some((0, 3)));
1017        let greedy3 = find("<.+>", "", b"<a><b>");
1018        assert_eq!(greedy3, Some((0, 6)));
1019    }
1020}