fancy_regex/
compile.rs

1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Compilation of regexes to VM.
22
23use alloc::boxed::Box;
24use alloc::string::{String, ToString};
25#[cfg(feature = "variable-lookbehinds")]
26use alloc::sync::Arc;
27use alloc::vec::Vec;
28use regex_automata::meta::Regex as RaRegex;
29use regex_automata::meta::{Builder as RaBuilder, Config as RaConfig};
30#[cfg(feature = "variable-lookbehinds")]
31use regex_automata::util::pool::Pool;
32#[cfg(all(test, feature = "std"))]
33use std::{collections::BTreeMap, sync::RwLock};
34
35use crate::analyze::Info;
36#[cfg(feature = "variable-lookbehinds")]
37use crate::vm::{CachePoolFn, ReverseBackwardsDelegate};
38use crate::vm::{CaptureGroupRange, Delegate, Insn, Prog};
39use crate::LookAround::*;
40use crate::{CompileError, Error, Expr, LookAround, RegexOptions, Result};
41
42// I'm thinking it probably doesn't make a lot of sense having this split
43// out from Compiler.
44struct VMBuilder {
45    prog: Vec<Insn>,
46    n_saves: usize,
47}
48
49impl VMBuilder {
50    fn new(max_group: usize) -> VMBuilder {
51        VMBuilder {
52            prog: Vec::new(),
53            n_saves: max_group * 2,
54        }
55    }
56
57    fn build(self) -> Prog {
58        Prog::new(self.prog, self.n_saves)
59    }
60
61    fn newsave(&mut self) -> usize {
62        let result = self.n_saves;
63        self.n_saves += 1;
64        result
65    }
66
67    fn pc(&self) -> usize {
68        self.prog.len()
69    }
70
71    // would "emit" be a better name?
72    fn add(&mut self, insn: Insn) {
73        self.prog.push(insn);
74    }
75
76    fn set_jmp_target(&mut self, jmp_pc: usize, target: usize) {
77        match self.prog[jmp_pc] {
78            Insn::Jmp(ref mut next) => *next = target,
79            _ => panic!("mutating instruction other than Jmp"),
80        }
81    }
82
83    fn set_split_target(&mut self, split_pc: usize, target: usize, second: bool) {
84        match self.prog[split_pc] {
85            Insn::Split(_, ref mut y) if second => *y = target,
86            Insn::Split(ref mut x, _) => *x = target,
87            _ => panic!("mutating instruction other than Split"),
88        }
89    }
90
91    fn set_repeat_target(&mut self, repeat_pc: usize, target: usize) {
92        match self.prog[repeat_pc] {
93            Insn::RepeatGr { ref mut next, .. }
94            | Insn::RepeatNg { ref mut next, .. }
95            | Insn::RepeatEpsilonGr { ref mut next, .. }
96            | Insn::RepeatEpsilonNg { ref mut next, .. } => *next = target,
97            _ => panic!("mutating instruction other than Repeat"),
98        }
99    }
100}
101
102struct Compiler {
103    b: VMBuilder,
104    options: RegexOptions,
105    inside_alternation: bool,
106}
107
108impl Compiler {
109    fn new(max_group: usize) -> Compiler {
110        Compiler {
111            b: VMBuilder::new(max_group),
112            options: Default::default(),
113            inside_alternation: false,
114        }
115    }
116
117    fn visit(&mut self, info: &Info<'_>, hard: bool) -> Result<()> {
118        if !hard && !info.hard {
119            // easy case, delegate entire subexpr
120            return self.compile_delegate(info);
121        }
122        match *info.expr {
123            Expr::Empty => (),
124            Expr::Literal { ref val, casei } => {
125                if !casei {
126                    self.b.add(Insn::Lit(val.clone()));
127                } else {
128                    self.compile_delegate(info)?;
129                }
130            }
131            Expr::Any { newline: true } => {
132                self.b.add(Insn::Any);
133            }
134            Expr::Any { newline: false } => {
135                self.b.add(Insn::AnyNoNL);
136            }
137            Expr::Concat(_) => {
138                self.compile_concat(info, hard)?;
139            }
140            Expr::Alt(_) => {
141                let count = info.children.len();
142                let inside_alternation = self.inside_alternation;
143                self.inside_alternation = true;
144                self.compile_alt(count, |compiler, i| compiler.visit(&info.children[i], hard))?;
145                self.inside_alternation = inside_alternation;
146            }
147            Expr::Group(_) => {
148                let group = info.start_group();
149                self.b.add(Insn::Save(group * 2));
150                self.visit(&info.children[0], hard)?;
151                self.b.add(Insn::Save(group * 2 + 1));
152            }
153            Expr::Repeat { lo, hi, greedy, .. } => {
154                self.compile_repeat(info, lo, hi, greedy, hard)?;
155            }
156            Expr::LookAround(_, la) => {
157                self.compile_lookaround(info, la)?;
158            }
159            Expr::Backref { group, casei } => {
160                self.b.add(Insn::Backref {
161                    slot: group * 2,
162                    casei,
163                });
164            }
165            Expr::BackrefExistsCondition(group) => {
166                self.b.add(Insn::BackrefExistsCondition(group));
167            }
168            Expr::AtomicGroup(_) => {
169                // TODO optimization: atomic insns are not needed if the
170                // child doesn't do any backtracking.
171                self.b.add(Insn::BeginAtomic);
172                self.visit(&info.children[0], false)?;
173                self.b.add(Insn::EndAtomic);
174            }
175            Expr::Delegate { .. } => {
176                // TODO: might want to have more specialized impls
177                self.compile_delegate(info)?;
178            }
179            Expr::Assertion(assertion) => {
180                self.b.add(Insn::Assertion(assertion));
181            }
182            Expr::KeepOut => {
183                self.b.add(Insn::Save(0));
184            }
185            Expr::ContinueFromPreviousMatchEnd => {
186                self.b.add(Insn::ContinueFromPreviousMatchEnd {
187                    at_start: info.start_group() == 1
188                        && info.min_pos_in_group == 0
189                        && !self.inside_alternation,
190                });
191            }
192            Expr::Conditional { .. } => {
193                self.compile_conditional(|compiler, i| compiler.visit(&info.children[i], hard))?;
194            }
195            Expr::SubroutineCall(_) => {
196                return Err(Error::CompileError(Box::new(
197                    CompileError::FeatureNotYetSupported("Subroutine Call".to_string()),
198                )));
199            }
200            Expr::UnresolvedNamedSubroutineCall { .. } => unreachable!(),
201            Expr::BackrefWithRelativeRecursionLevel { .. } => unreachable!(),
202        }
203        Ok(())
204    }
205
206    fn compile_alt<F>(&mut self, count: usize, mut handle_alternative: F) -> Result<()>
207    where
208        F: FnMut(&mut Compiler, usize) -> Result<()>,
209    {
210        let mut jmps = Vec::new();
211        let mut last_pc = usize::MAX;
212        for i in 0..count {
213            let has_next = i != count - 1;
214            let pc = self.b.pc();
215            if has_next {
216                self.b.add(Insn::Split(pc + 1, usize::MAX));
217            }
218            if last_pc != usize::MAX {
219                self.b.set_split_target(last_pc, pc, true);
220            }
221            last_pc = pc;
222
223            handle_alternative(self, i)?;
224
225            if has_next {
226                // All except the last branch need to jump over instructions of
227                // other branches. The last branch can just continue to the next
228                // instruction.
229                let pc = self.b.pc();
230                jmps.push(pc);
231                self.b.add(Insn::Jmp(0));
232            }
233        }
234        let next_pc = self.b.pc();
235        for jmp_pc in jmps {
236            self.b.set_jmp_target(jmp_pc, next_pc);
237        }
238        Ok(())
239    }
240
241    fn compile_conditional<F>(&mut self, mut handle_child: F) -> Result<()>
242    where
243        F: FnMut(&mut Compiler, usize) -> Result<()>,
244    {
245        // here we use atomic group functionality to be able to remove the program counter
246        // relating to the split instruction's second position if the conditional succeeds
247        // This is to ensure that if the condition succeeds, but the "true" branch from the
248        // conditional fails, that it wouldn't jump to the "false" branch.
249        self.b.add(Insn::BeginAtomic);
250
251        let split_pc = self.b.pc();
252        // add the split instruction - we will update it's second pc later
253        self.b.add(Insn::Split(split_pc + 1, usize::MAX));
254
255        // add the conditional expression
256        handle_child(self, 0)?;
257
258        // mark it as successful to remove the state we added as a split earlier
259        self.b.add(Insn::EndAtomic);
260
261        // add the truth branch
262        handle_child(self, 1)?;
263        // add an instruction to jump over the false branch - we will update the jump target later
264        let jump_over_false_pc = self.b.pc();
265        self.b.add(Insn::Jmp(0));
266
267        // add the false branch, update the split target
268        self.b.set_split_target(split_pc, self.b.pc(), true);
269        handle_child(self, 2)?;
270
271        // update the jump target for jumping over the false branch
272        self.b.set_jmp_target(jump_over_false_pc, self.b.pc());
273
274        Ok(())
275    }
276
277    fn compile_concat(&mut self, info: &Info<'_>, hard: bool) -> Result<()> {
278        // First: determine a prefix which is constant size and not hard.
279        let prefix_end = info
280            .children
281            .iter()
282            .take_while(|c| c.const_size && !c.hard)
283            .count();
284
285        // If incoming difficulty is not hard, the suffix after the last
286        // hard child can be done with NFA.
287        let suffix_len = if !hard {
288            info.children[prefix_end..]
289                .iter()
290                .rev()
291                .take_while(|c| !c.hard)
292                .count()
293        } else {
294            // Even for hard, we can delegate a const-sized suffix
295            info.children[prefix_end..]
296                .iter()
297                .rev()
298                .take_while(|c| c.const_size && !c.hard)
299                .count()
300        };
301        let suffix_begin = info.children.len() - suffix_len;
302
303        self.compile_delegates(&info.children[..prefix_end])?;
304
305        for child in info.children[prefix_end..suffix_begin].iter() {
306            self.visit(child, true)?;
307        }
308
309        self.compile_delegates(&info.children[suffix_begin..])
310    }
311
312    fn compile_repeat(
313        &mut self,
314        info: &Info<'_>,
315        lo: usize,
316        hi: usize,
317        greedy: bool,
318        hard: bool,
319    ) -> Result<()> {
320        let child = &info.children[0];
321        if lo == 0 && hi == 1 {
322            // e?
323            let pc = self.b.pc();
324            self.b.add(Insn::Split(pc + 1, pc + 1));
325            // TODO: do we want to do an epsilon check here? If we do
326            // it here and in Alt, we might be able to make a good
327            // bound on stack depth
328            self.visit(child, hard)?;
329            let next_pc = self.b.pc();
330            self.b.set_split_target(pc, next_pc, greedy);
331            return Ok(());
332        }
333        let hard = hard | info.hard;
334        if hi == usize::MAX && child.min_size == 0 {
335            // Use RepeatEpsilon instructions to prevent empty repeat
336            let repeat = self.b.newsave();
337            let check = self.b.newsave();
338            self.b.add(Insn::Save0(repeat));
339            let pc = self.b.pc();
340            if greedy {
341                self.b.add(Insn::RepeatEpsilonGr {
342                    lo,
343                    next: usize::MAX,
344                    repeat,
345                    check,
346                });
347            } else {
348                self.b.add(Insn::RepeatEpsilonNg {
349                    lo,
350                    next: usize::MAX,
351                    repeat,
352                    check,
353                });
354            }
355            self.visit(child, hard)?;
356            self.b.add(Insn::Jmp(pc));
357            let next_pc = self.b.pc();
358            self.b.set_repeat_target(pc, next_pc);
359        } else if lo == 0 && hi == usize::MAX {
360            // e*
361            let pc = self.b.pc();
362            self.b.add(Insn::Split(pc + 1, pc + 1));
363            self.visit(child, hard)?;
364            self.b.add(Insn::Jmp(pc));
365            let next_pc = self.b.pc();
366            self.b.set_split_target(pc, next_pc, greedy);
367        } else if lo == 1 && hi == usize::MAX {
368            // e+
369            let pc = self.b.pc();
370            self.visit(child, hard)?;
371            let next = self.b.pc() + 1;
372            let (x, y) = if greedy { (pc, next) } else { (next, pc) };
373            self.b.add(Insn::Split(x, y));
374        } else {
375            let repeat = self.b.newsave();
376            self.b.add(Insn::Save0(repeat));
377            let pc = self.b.pc();
378            if greedy {
379                self.b.add(Insn::RepeatGr {
380                    lo,
381                    hi,
382                    next: usize::MAX,
383                    repeat,
384                });
385            } else {
386                self.b.add(Insn::RepeatNg {
387                    lo,
388                    hi,
389                    next: usize::MAX,
390                    repeat,
391                });
392            }
393            self.visit(child, hard)?;
394            self.b.add(Insn::Jmp(pc));
395            let next_pc = self.b.pc();
396            self.b.set_repeat_target(pc, next_pc);
397        }
398        Ok(())
399    }
400
401    fn compile_lookaround(&mut self, info: &Info<'_>, la: LookAround) -> Result<()> {
402        let inner = &info.children[0];
403        match la {
404            LookBehind => {
405                if let &Info {
406                    const_size: false,
407                    expr: &Expr::Alt(_),
408                    ..
409                } = inner
410                {
411                    // Make const size by transforming `(?<=a|bb)` to `(?<=a)|(?<=bb)`
412                    let alternatives = &inner.children;
413                    self.compile_alt(alternatives.len(), |compiler, i| {
414                        let alternative = &alternatives[i];
415                        compiler.compile_positive_lookaround(alternative, la)
416                    })
417                } else {
418                    self.compile_positive_lookaround(inner, la)
419                }
420            }
421            LookBehindNeg => {
422                if let &Info {
423                    const_size: false,
424                    expr: &Expr::Alt(_),
425                    ..
426                } = inner
427                {
428                    // Make const size by transforming `(?<!a|bb)` to `(?<!a)(?<!bb)`
429                    let alternatives = &inner.children;
430                    for alternative in alternatives {
431                        self.compile_negative_lookaround(alternative, la)?;
432                    }
433                    Ok(())
434                } else {
435                    self.compile_negative_lookaround(inner, la)
436                }
437            }
438            LookAhead => self.compile_positive_lookaround(inner, la),
439            LookAheadNeg => self.compile_negative_lookaround(inner, la),
440        }
441    }
442
443    fn compile_positive_lookaround(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> {
444        let save = self.b.newsave();
445        self.b.add(Insn::Save(save));
446        self.compile_lookaround_inner(inner, la)?;
447        self.b.add(Insn::Restore(save));
448        Ok(())
449    }
450
451    fn compile_negative_lookaround(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> {
452        let pc = self.b.pc();
453        self.b.add(Insn::Split(pc + 1, usize::MAX));
454        self.compile_lookaround_inner(inner, la)?;
455        self.b.add(Insn::FailNegativeLookAround);
456        let next_pc = self.b.pc();
457        self.b.set_split_target(pc, next_pc, true);
458        Ok(())
459    }
460
461    fn compile_lookaround_inner(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> {
462        if la == LookBehind || la == LookBehindNeg {
463            if inner.const_size {
464                self.b.add(Insn::GoBack(inner.min_size));
465                self.visit(inner, false)
466            } else if !inner.hard {
467                #[cfg(feature = "variable-lookbehinds")]
468                {
469                    let mut delegate_builder = DelegateBuilder::new();
470                    delegate_builder.push(inner);
471                    let pattern = &delegate_builder.re;
472                    let capture_groups = delegate_builder
473                        .capture_groups
474                        .expect("Expected at least one expression");
475
476                    // Use reverse matching for variable-sized lookbehinds without fancy features
477                    use regex_automata::nfa::thompson;
478                    // Build a reverse DFA for the pattern
479                    let dfa = match regex_automata::hybrid::dfa::DFA::builder()
480                        .thompson(thompson::Config::new().reverse(true))
481                        .build(pattern)
482                    {
483                        Ok(dfa) => Arc::new(dfa),
484                        Err(e) => {
485                            return Err(Error::CompileError(Box::new(CompileError::DfaBuildError(
486                                e.to_string(),
487                            ))))
488                        }
489                    };
490
491                    let create: CachePoolFn = alloc::boxed::Box::new({
492                        let dfa = Arc::clone(&dfa);
493                        move || dfa.create_cache()
494                    });
495                    let cache_pool = Pool::new(create);
496
497                    // Build the forward regex for capture group extraction
498                    let forward_regex = if inner.start_group() != inner.end_group() {
499                        Some(compile_inner(pattern, &self.options)?)
500                    } else {
501                        None
502                    };
503
504                    self.b
505                        .add(Insn::BackwardsDelegate(ReverseBackwardsDelegate {
506                            dfa,
507                            cache_pool,
508                            pattern: pattern.to_string(),
509                            capture_group_extraction_inner: forward_regex,
510                            capture_groups: capture_groups.to_option_if_non_empty(),
511                        }));
512                    Ok(())
513                }
514                #[cfg(not(feature = "variable-lookbehinds"))]
515                {
516                    Err(Error::CompileError(Box::new(
517                        CompileError::VariableLookBehindRequiresFeature,
518                    )))
519                }
520            } else {
521                // variable sized lookbehinds with fancy features are currently unsupported
522                Err(Error::CompileError(Box::new(
523                    CompileError::LookBehindNotConst,
524                )))
525            }
526        } else {
527            self.visit(inner, false)
528        }
529    }
530
531    fn compile_delegates(&mut self, infos: &[Info<'_>]) -> Result<()> {
532        if infos.is_empty() {
533            return Ok(());
534        }
535        // TODO: might want to do something similar for case insensitive literals
536        // (have is_literal return an additional bool for casei)
537        if infos.iter().all(|e| e.is_literal()) {
538            let mut val = String::new();
539            for info in infos {
540                info.push_literal(&mut val);
541            }
542            self.b.add(Insn::Lit(val));
543            return Ok(());
544        }
545
546        let mut delegate_builder = DelegateBuilder::new();
547        for info in infos {
548            delegate_builder.push(info);
549        }
550        let delegate = delegate_builder.build(&self.options)?;
551
552        self.b.add(delegate);
553        Ok(())
554    }
555
556    fn compile_delegate(&mut self, info: &Info) -> Result<()> {
557        let insn = if info.is_literal() {
558            let mut val = String::new();
559            info.push_literal(&mut val);
560            Insn::Lit(val)
561        } else {
562            DelegateBuilder::new().push(info).build(&self.options)?
563        };
564        self.b.add(insn);
565        Ok(())
566    }
567}
568
569// Unlike Regex in `regex`, `regex-automata` does not store the pattern string,
570// and we cannot retrieve the pattern string using `as_str`.
571// Unfortunately we need to get the pattern string in our tests,
572// so we just store it in a global map.
573#[cfg(all(test, feature = "std"))]
574static PATTERN_MAPPING: RwLock<BTreeMap<String, String>> = RwLock::new(BTreeMap::new());
575
576pub(crate) fn compile_inner(inner_re: &str, options: &RegexOptions) -> Result<RaRegex> {
577    let mut config = RaConfig::new();
578    if let Some(size_limit) = options.delegate_size_limit {
579        config = config.nfa_size_limit(Some(size_limit));
580    }
581    if let Some(dfa_size_limit) = options.delegate_dfa_size_limit {
582        config = config.dfa_size_limit(Some(dfa_size_limit));
583    }
584
585    let re = RaBuilder::new()
586        .configure(config)
587        .syntax(options.syntaxc)
588        .build(inner_re)
589        .map_err(CompileError::InnerError)
590        .map_err(|e| Error::CompileError(Box::new(e)))?;
591
592    #[cfg(all(test, feature = "std"))]
593    PATTERN_MAPPING
594        .write()
595        .unwrap()
596        .insert(format!("{:?}", re), inner_re.to_owned());
597
598    Ok(re)
599}
600
601/// Compile the analyzed expressions into a program.
602pub fn compile(info: &Info<'_>, anchored: bool) -> Result<Prog> {
603    let mut c = Compiler::new(info.end_group());
604    if !anchored {
605        // add instructions as if \O*? was used at the start of the expression
606        // so that we bump the haystack index by one when failing to match at the current position
607        let current_pc = c.b.pc();
608        // we are adding 3 instructions, so the current program counter plus 3 gives us the first real instruction
609        c.b.add(Insn::Split(current_pc + 3, current_pc + 1));
610        c.b.add(Insn::Any);
611        c.b.add(Insn::Jmp(current_pc));
612    }
613    if info.start_group() == 1 {
614        // add implicit capture group 0 begin
615        c.b.add(Insn::Save(0));
616    }
617    c.visit(info, false)?;
618    if info.start_group() == 1 {
619        // add implicit capture group 0 end
620        c.b.add(Insn::Save(1));
621    }
622    c.b.add(Insn::End);
623    Ok(c.b.build())
624}
625
626struct DelegateBuilder {
627    re: String,
628    min_size: usize,
629    const_size: bool,
630    capture_groups: Option<CaptureGroupRange>,
631}
632
633impl DelegateBuilder {
634    fn new() -> Self {
635        Self {
636            re: String::new(),
637            min_size: 0,
638            const_size: true,
639            capture_groups: None,
640        }
641    }
642
643    fn push(&mut self, info: &Info<'_>) -> &mut DelegateBuilder {
644        // TODO: might want to detect case of a group with no captures
645        //  inside, so we can run find() instead of captures()
646
647        self.min_size += info.min_size;
648        self.const_size &= info.const_size;
649        if self.capture_groups.is_none() {
650            self.capture_groups = Some(info.capture_groups);
651        } else {
652            // Update the end_group to the latest
653            self.capture_groups = self
654                .capture_groups
655                .map(|range| CaptureGroupRange(range.start(), info.end_group()));
656        }
657
658        // Add expression. The precedence argument has to be 1 here to
659        // ensure correct grouping in these cases:
660        //
661        // If we have multiple expressions, we are building a concat.
662        // Without grouping, we'd turn ["a", "b|c"] into "^ab|c". But we
663        // want "^a(?:b|c)".
664        //
665        // Even with a single expression, because we add `^` at the
666        // beginning, we need a group. Otherwise `["a|b"]` would be turned
667        // into `"^a|b"` instead of `"^(?:a|b)"`.
668        info.expr.to_str(&mut self.re, 1);
669        self
670    }
671
672    fn build(&self, options: &RegexOptions) -> Result<Insn> {
673        let capture_groups = self
674            .capture_groups
675            .expect("Expected at least one expression");
676
677        let compiled = compile_inner(&self.re, options)?;
678
679        Ok(Insn::Delegate(Delegate {
680            inner: compiled,
681            pattern: self.re.clone(),
682            capture_groups: capture_groups.to_option_if_non_empty(),
683        }))
684    }
685}
686
687#[cfg(test)]
688mod tests {
689    use super::*;
690    use crate::analyze::analyze;
691    use crate::parse::ExprTree;
692    use crate::vm::Insn::*;
693    use alloc::vec;
694    use bit_set::BitSet;
695    use matches::assert_matches;
696
697    #[test]
698    fn jumps_for_alternation() {
699        let tree = ExprTree {
700            expr: Expr::Alt(vec![
701                Expr::Literal {
702                    val: "a".into(),
703                    casei: false,
704                },
705                Expr::Literal {
706                    val: "b".into(),
707                    casei: false,
708                },
709                Expr::Literal {
710                    val: "c".into(),
711                    casei: false,
712                },
713            ]),
714            backrefs: BitSet::new(),
715            named_groups: Default::default(),
716            contains_subroutines: false,
717            self_recursive: false,
718        };
719        let info = analyze(&tree, false).unwrap();
720
721        let mut c = Compiler::new(0);
722        // Force "hard" so that compiler doesn't just delegate
723        c.visit(&info, true).unwrap();
724        c.b.add(Insn::End);
725
726        let prog = c.b.prog;
727
728        assert_eq!(prog.len(), 8, "prog: {:?}", prog);
729        assert_matches!(prog[0], Split(1, 3));
730        assert_matches!(prog[1], Lit(ref l) if l == "a");
731        assert_matches!(prog[2], Jmp(7));
732        assert_matches!(prog[3], Split(4, 6));
733        assert_matches!(prog[4], Lit(ref l) if l == "b");
734        assert_matches!(prog[5], Jmp(7));
735        assert_matches!(prog[6], Lit(ref l) if l == "c");
736        assert_matches!(prog[7], End);
737    }
738
739    #[cfg_attr(not(feature = "std"), ignore = "this test need std")]
740    #[test]
741    fn look_around_pattern_can_be_delegated() {
742        let prog = compile_prog("(?=ab*)c");
743
744        assert_eq!(prog.len(), 5, "prog: {:?}", prog);
745        assert_matches!(prog[0], Save(0));
746        assert_delegate(&prog[1], "ab*");
747        assert_matches!(prog[2], Restore(0));
748        assert_matches!(prog[3], Lit(ref l) if l == "c");
749        assert_matches!(prog[4], End);
750    }
751
752    #[cfg_attr(not(feature = "std"), ignore = "this test need std")]
753    #[test]
754    fn easy_concat_can_delegate_end() {
755        let prog = compile_prog("(?!x)(?:a|ab)x*");
756
757        assert_eq!(prog.len(), 5, "prog: {:?}", prog);
758        assert_matches!(prog[0], Split(1, 3));
759        assert_matches!(prog[1], Lit(ref l) if l == "x");
760        assert_matches!(prog[2], FailNegativeLookAround);
761        assert_delegate(&prog[3], "(?:a|ab)x*");
762        assert_matches!(prog[4], End);
763    }
764
765    #[cfg_attr(not(feature = "std"), ignore = "this test need std")]
766    #[test]
767    fn hard_concat_can_delegate_const_size_end() {
768        let prog = compile_prog("(?:(?!x)(?:a|b)c)x*");
769
770        assert_eq!(prog.len(), 6, "prog: {:?}", prog);
771        assert_matches!(prog[0], Split(1, 3));
772        assert_matches!(prog[1], Lit(ref l) if l == "x");
773        assert_matches!(prog[2], FailNegativeLookAround);
774        assert_delegate(&prog[3], "(?:a|b)c");
775        assert_delegate(&prog[4], "x*");
776        assert_matches!(prog[5], End);
777    }
778
779    #[cfg_attr(not(feature = "std"), ignore = "this test need std")]
780    #[test]
781    fn hard_concat_can_not_delegate_variable_end() {
782        let prog = compile_prog("(?:(?!x)(?:a|ab))x*");
783
784        assert_eq!(prog.len(), 9, "prog: {:?}", prog);
785        assert_matches!(prog[0], Split(1, 3));
786        assert_matches!(prog[1], Lit(ref l) if l == "x");
787        assert_matches!(prog[2], FailNegativeLookAround);
788        assert_matches!(prog[3], Split(4, 6));
789        assert_matches!(prog[4], Lit(ref l) if l == "a");
790        assert_matches!(prog[5], Jmp(7));
791        assert_matches!(prog[6], Lit(ref l) if l == "ab");
792        assert_delegate(&prog[7], "x*");
793        assert_matches!(prog[8], End);
794    }
795
796    #[test]
797    fn conditional_expression_can_be_compiled() {
798        let prog = compile_prog(r"(?(ab)c|d)");
799
800        assert_eq!(prog.len(), 8, "prog: {:?}", prog);
801
802        assert_matches!(prog[0], BeginAtomic);
803        assert_matches!(prog[1], Split(2, 6));
804        assert_matches!(prog[2], Lit(ref l) if l == "ab");
805        assert_matches!(prog[3], EndAtomic);
806        assert_matches!(prog[4], Lit(ref l) if l == "c");
807        assert_matches!(prog[5], Jmp(7));
808        assert_matches!(prog[6], Lit(ref l) if l == "d");
809        assert_matches!(prog[7], End);
810    }
811
812    #[test]
813    fn lazy_any_can_be_compiled_explicit_capture_group_zero() {
814        let prog = compile_prog(r"\O*?((?!a))");
815
816        assert_eq!(prog.len(), 9, "prog: {:?}", prog);
817
818        assert_matches!(prog[0], Split(3, 1));
819        assert_matches!(prog[1], Any);
820        assert_matches!(prog[2], Jmp(0));
821        assert_matches!(prog[3], Save(0));
822        assert_matches!(prog[4], Split(5, 7));
823        assert_matches!(prog[5], Lit(ref l) if l == "a");
824        assert_matches!(prog[6], FailNegativeLookAround);
825        assert_matches!(prog[7], Save(1));
826        assert_matches!(prog[8], End);
827    }
828
829    #[test]
830    #[cfg(not(feature = "variable-lookbehinds"))]
831    fn variable_lookbehind_requires_feature() {
832        // Without the feature flag, variable-length lookbehinds should error
833        let tree = Expr::parse_tree(r"(?<=ab+)x").unwrap();
834        let info = analyze(&tree, true).unwrap();
835        let result = compile(&info, true);
836        assert!(result.is_err());
837        assert_matches!(
838            result.err().unwrap(),
839            Error::CompileError(box_err) if matches!(*box_err, CompileError::VariableLookBehindRequiresFeature)
840        );
841    }
842
843    #[test]
844    #[cfg(feature = "variable-lookbehinds")]
845    fn variable_lookbehind_with_required_feature_no_captures() {
846        let prog = compile_prog(r"(?<=ab+)x");
847
848        assert_eq!(prog.len(), 5, "prog: {:?}", prog);
849
850        assert_matches!(prog[0], Save(0));
851        assert_matches!(&prog[1], BackwardsDelegate(ReverseBackwardsDelegate { pattern, dfa: _, cache_pool: _, capture_group_extraction_inner: None, capture_groups: None }) if pattern == "ab+");
852        assert_matches!(prog[2], Restore(0));
853        assert_matches!(prog[3], Lit(ref l) if l == "x");
854        assert_matches!(prog[4], End);
855    }
856
857    #[test]
858    #[cfg(feature = "variable-lookbehinds")]
859    fn variable_lookbehind_with_required_feature_captures() {
860        let prog = compile_prog(r"(?<=a(b+))x");
861
862        assert_eq!(prog.len(), 5, "prog: {:?}", prog);
863
864        assert_matches!(prog[0], Save(2));
865        assert_matches!(&prog[1], BackwardsDelegate(ReverseBackwardsDelegate { pattern, dfa: _, cache_pool: _, capture_group_extraction_inner: ref inner, capture_groups: Some(CaptureGroupRange(0, 1)) }) if pattern == "a(b+)" && inner.is_some());
866        assert_matches!(prog[2], Restore(2));
867        assert_matches!(prog[3], Lit(ref l) if l == "x");
868        assert_matches!(prog[4], End);
869    }
870
871    #[test]
872    #[cfg(feature = "variable-lookbehinds")]
873    fn variable_lookbehind_with_required_feature_backref_captures() {
874        // currently hard variable lookbehinds are unsupported.
875        // the backref to a capture group inside the variable lookbehind makes the capture group hard
876        let tree = Expr::parse_tree(r"(?<=a(b+))\1").unwrap();
877        let info = analyze(&tree, false).unwrap();
878        let result = compile(&info, true);
879        assert!(result.is_err());
880        assert_matches!(
881            result.err().unwrap(),
882            Error::CompileError(box_err) if matches!(*box_err, CompileError::LookBehindNotConst)
883        );
884    }
885
886    fn compile_prog(re: &str) -> Vec<Insn> {
887        let tree = Expr::parse_tree(re).unwrap();
888        let info = analyze(&tree, true).unwrap();
889        let prog = compile(&info, true).unwrap();
890        prog.body
891    }
892
893    #[cfg(feature = "std")]
894    fn assert_delegate(insn: &Insn, re: &str) {
895        use crate::vm::Delegate;
896
897        match insn {
898            Insn::Delegate(Delegate { inner, .. }) => {
899                assert_eq!(
900                    PATTERN_MAPPING
901                        .read()
902                        .unwrap()
903                        .get(&alloc::format!("{:?}", inner))
904                        .unwrap(),
905                    re
906                );
907            }
908            _ => {
909                panic!("Expected Insn::Delegate but was {:#?}", insn);
910            }
911        }
912    }
913
914    #[cfg(not(feature = "std"))]
915    fn assert_delegate(_: &Insn, _: &str) {}
916}