sre_engine/
engine.rs

1// good luck to those that follow; here be dragons
2
3use crate::constants::SreInfo;
4
5use super::constants::{SreAtCode, SreCatCode, SreOpcode};
6use super::MAXREPEAT;
7use optional::Optioned;
8use std::convert::TryFrom;
9use std::ops::Deref;
10
11const fn is_py_ascii_whitespace(b: u8) -> bool {
12    matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B')
13}
14
15#[derive(Debug, Clone, Copy)]
16pub struct Request<'a, S: StrDrive> {
17    pub string: S,
18    pub start: usize,
19    pub end: usize,
20    pub pattern_codes: &'a [u32],
21    pub match_all: bool,
22    pub must_advance: bool,
23}
24
25impl<'a, S: StrDrive> Request<'a, S> {
26    pub fn new(
27        string: S,
28        start: usize,
29        end: usize,
30        pattern_codes: &'a [u32],
31        match_all: bool,
32    ) -> Self {
33        let end = std::cmp::min(end, string.count());
34        let start = std::cmp::min(start, end);
35
36        Self {
37            string,
38            start,
39            end,
40            pattern_codes,
41            match_all,
42            must_advance: false,
43        }
44    }
45}
46
47#[derive(Debug)]
48pub struct Marks {
49    last_index: isize,
50    marks: Vec<Optioned<usize>>,
51    marks_stack: Vec<(Vec<Optioned<usize>>, isize)>,
52}
53
54impl Default for Marks {
55    fn default() -> Self {
56        Self {
57            last_index: -1,
58            marks: Vec::new(),
59            marks_stack: Vec::new(),
60        }
61    }
62}
63
64impl Deref for Marks {
65    type Target = Vec<Optioned<usize>>;
66
67    fn deref(&self) -> &Self::Target {
68        &self.marks
69    }
70}
71
72impl Marks {
73    pub fn get(&self, group_index: usize) -> (Optioned<usize>, Optioned<usize>) {
74        let marks_index = 2 * group_index;
75        if marks_index + 1 < self.marks.len() {
76            (self.marks[marks_index], self.marks[marks_index + 1])
77        } else {
78            (Optioned::none(), Optioned::none())
79        }
80    }
81
82    pub fn last_index(&self) -> isize {
83        self.last_index
84    }
85
86    fn set(&mut self, mark_nr: usize, position: usize) {
87        if mark_nr & 1 != 0 {
88            self.last_index = mark_nr as isize / 2 + 1;
89        }
90        if mark_nr >= self.marks.len() {
91            self.marks.resize(mark_nr + 1, Optioned::none());
92        }
93        self.marks[mark_nr] = Optioned::some(position);
94    }
95
96    fn push(&mut self) {
97        self.marks_stack.push((self.marks.clone(), self.last_index));
98    }
99
100    fn pop(&mut self) {
101        let (marks, last_index) = self.marks_stack.pop().unwrap();
102        self.marks = marks;
103        self.last_index = last_index;
104    }
105
106    fn pop_keep(&mut self) {
107        let (marks, last_index) = self.marks_stack.last().unwrap().clone();
108        self.marks = marks;
109        self.last_index = last_index;
110    }
111
112    fn pop_discard(&mut self) {
113        self.marks_stack.pop();
114    }
115
116    fn clear(&mut self) {
117        self.last_index = -1;
118        self.marks.clear();
119        self.marks_stack.clear();
120    }
121}
122
123#[derive(Debug)]
124pub struct State<S: StrDrive> {
125    pub marks: Marks,
126    context_stack: Vec<MatchContext<S>>,
127    repeat_stack: Vec<RepeatContext>,
128    pub start: usize,
129    pub string_position: usize,
130    next_context: Option<MatchContext<S>>,
131    popped_has_matched: bool,
132    pub has_matched: bool,
133}
134
135impl<S: StrDrive> Default for State<S> {
136    fn default() -> Self {
137        Self {
138            marks: Marks::default(),
139            context_stack: Vec::new(),
140            repeat_stack: Vec::new(),
141            start: 0,
142            string_position: 0,
143            next_context: None,
144            popped_has_matched: false,
145            has_matched: false,
146        }
147    }
148}
149
150impl<S: StrDrive> State<S> {
151    pub fn reset(&mut self, start: usize) {
152        self.marks.clear();
153        self.context_stack.clear();
154        self.repeat_stack.clear();
155        self.start = start;
156        self.string_position = start;
157        self.next_context = None;
158        self.popped_has_matched = false;
159        self.has_matched = false;
160    }
161
162    fn _match(&mut self, req: &mut Request<S>) {
163        while let Some(mut ctx) = self.context_stack.pop() {
164            if let Some(handler) = ctx.handler.take() {
165                handler(req, self, &mut ctx);
166            } else if ctx.remaining_codes(req) > 0 {
167                let code = ctx.peek_code(req, 0);
168                let code = SreOpcode::try_from(code).unwrap();
169                dispatch(req, self, &mut ctx, code);
170            } else {
171                ctx.failure();
172            }
173
174            if let Some(has_matched) = ctx.has_matched {
175                self.popped_has_matched = has_matched;
176            } else {
177                self.context_stack.push(ctx);
178                if let Some(next_ctx) = self.next_context.take() {
179                    self.context_stack.push(next_ctx);
180                }
181            }
182        }
183        self.has_matched = self.popped_has_matched;
184    }
185
186    pub fn pymatch(&mut self, mut req: Request<S>) {
187        self.start = req.start;
188        self.string_position = req.start;
189
190        let ctx = MatchContext {
191            string_position: req.start,
192            string_offset: req.string.offset(0, req.start),
193            code_position: 0,
194            has_matched: None,
195            toplevel: true,
196            handler: None,
197            repeat_ctx_id: usize::MAX,
198            count: -1,
199        };
200        self.context_stack.push(ctx);
201
202        self._match(&mut req);
203    }
204
205    pub fn search(&mut self, mut req: Request<S>) {
206        self.start = req.start;
207        self.string_position = req.start;
208
209        // TODO: optimize by op info and skip prefix
210        if req.start > req.end {
211            return;
212        }
213
214        let mut end = req.end;
215
216        let mut start_offset = req.string.offset(0, req.start);
217
218        let mut ctx = MatchContext {
219            string_position: req.start,
220            string_offset: start_offset,
221            code_position: 0,
222            has_matched: None,
223            toplevel: true,
224            handler: None,
225            repeat_ctx_id: usize::MAX,
226            count: -1,
227        };
228
229        if ctx.peek_code(&req, 0) == SreOpcode::INFO as u32 {
230            /* optimization info block */
231            /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info>  */
232            let req = &mut req;
233            let min = ctx.peek_code(req, 3) as usize;
234
235            if ctx.remaining_chars(req) < min {
236                return;
237            }
238
239            if min > 1 {
240                /* adjust end point (but make sure we leave at least one
241                character in there, so literal search will work) */
242                // no overflow can happen as remaining chars >= min
243                end -= min - 1;
244
245                // adjust ctx position
246                if end < ctx.string_position {
247                    ctx.string_position = end;
248                    ctx.string_offset = req.string.offset(0, ctx.string_position);
249                }
250            }
251
252            let flags = SreInfo::from_bits_truncate(ctx.peek_code(req, 2));
253
254            if flags.contains(SreInfo::PREFIX) {
255                if flags.contains(SreInfo::LITERAL) {
256                    search_info_literal::<true, S>(req, self, ctx);
257                } else {
258                    search_info_literal::<false, S>(req, self, ctx);
259                }
260                return;
261            } else if flags.contains(SreInfo::CHARSET) {
262                return search_info_charset(req, self, ctx);
263            }
264            // fallback to general search
265        }
266
267        self.context_stack.push(ctx);
268        self._match(&mut req);
269
270        req.must_advance = false;
271        ctx.toplevel = false;
272        while !self.has_matched && req.start < end {
273            req.start += 1;
274            start_offset = req.string.offset(start_offset, 1);
275            self.reset(req.start);
276            ctx.string_position = req.start;
277            ctx.string_offset = start_offset;
278
279            self.context_stack.push(ctx);
280            self._match(&mut req);
281        }
282    }
283}
284
285pub struct SearchIter<'a, S: StrDrive> {
286    pub req: Request<'a, S>,
287    pub state: State<S>,
288}
289
290impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> {
291    type Item = ();
292
293    fn next(&mut self) -> Option<Self::Item> {
294        if self.req.start > self.req.end {
295            return None;
296        }
297
298        self.state.reset(self.req.start);
299        self.state.search(self.req);
300        if !self.state.has_matched {
301            return None;
302        }
303
304        self.req.must_advance = self.state.string_position == self.state.start;
305        self.req.start = self.state.string_position;
306
307        Some(())
308    }
309}
310
311fn dispatch<S: StrDrive>(
312    req: &Request<S>,
313    state: &mut State<S>,
314    ctx: &mut MatchContext<S>,
315    opcode: SreOpcode,
316) {
317    match opcode {
318        SreOpcode::FAILURE => {
319            ctx.failure();
320        }
321        SreOpcode::SUCCESS => {
322            if ctx.can_success(req) {
323                state.string_position = ctx.string_position;
324                ctx.success();
325            } else {
326                ctx.failure();
327            }
328        }
329        SreOpcode::ANY => {
330            if ctx.at_end(req) || ctx.at_linebreak(req) {
331                ctx.failure();
332            } else {
333                ctx.skip_code(1);
334                ctx.skip_char(req, 1);
335            }
336        }
337        SreOpcode::ANY_ALL => {
338            if ctx.at_end(req) {
339                ctx.failure();
340            } else {
341                ctx.skip_code(1);
342                ctx.skip_char(req, 1);
343            }
344        }
345        SreOpcode::ASSERT => op_assert(req, state, ctx),
346        SreOpcode::ASSERT_NOT => op_assert_not(req, state, ctx),
347        SreOpcode::AT => {
348            let atcode = SreAtCode::try_from(ctx.peek_code(req, 1)).unwrap();
349            if at(req, ctx, atcode) {
350                ctx.skip_code(2);
351            } else {
352                ctx.failure();
353            }
354        }
355        SreOpcode::BRANCH => op_branch(req, state, ctx),
356        SreOpcode::CATEGORY => {
357            let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap();
358            if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) {
359                ctx.failure();
360            } else {
361                ctx.skip_code(2);
362                ctx.skip_char(req, 1);
363            }
364        }
365        SreOpcode::IN => general_op_in(req, ctx, charset),
366        SreOpcode::IN_IGNORE => general_op_in(req, ctx, |set, c| charset(set, lower_ascii(c))),
367        SreOpcode::IN_UNI_IGNORE => {
368            general_op_in(req, ctx, |set, c| charset(set, lower_unicode(c)))
369        }
370        SreOpcode::IN_LOC_IGNORE => general_op_in(req, ctx, charset_loc_ignore),
371        SreOpcode::INFO => {
372            let min = ctx.peek_code(req, 3) as usize;
373            if ctx.remaining_chars(req) < min {
374                ctx.failure();
375            } else {
376                ctx.skip_code_from(req, 1);
377            }
378        }
379        SreOpcode::JUMP => ctx.skip_code_from(req, 1),
380        SreOpcode::LITERAL => general_op_literal(req, ctx, |code, c| code == c),
381        SreOpcode::NOT_LITERAL => general_op_literal(req, ctx, |code, c| code != c),
382        SreOpcode::LITERAL_IGNORE => general_op_literal(req, ctx, |code, c| code == lower_ascii(c)),
383        SreOpcode::NOT_LITERAL_IGNORE => {
384            general_op_literal(req, ctx, |code, c| code != lower_ascii(c))
385        }
386        SreOpcode::LITERAL_UNI_IGNORE => {
387            general_op_literal(req, ctx, |code, c| code == lower_unicode(c))
388        }
389        SreOpcode::NOT_LITERAL_UNI_IGNORE => {
390            general_op_literal(req, ctx, |code, c| code != lower_unicode(c))
391        }
392        SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(req, ctx, char_loc_ignore),
393        SreOpcode::NOT_LITERAL_LOC_IGNORE => {
394            general_op_literal(req, ctx, |code, c| !char_loc_ignore(code, c))
395        }
396        SreOpcode::MARK => {
397            state
398                .marks
399                .set(ctx.peek_code(req, 1) as usize, ctx.string_position);
400            ctx.skip_code(2);
401        }
402        SreOpcode::MAX_UNTIL => op_max_until(state, ctx),
403        SreOpcode::MIN_UNTIL => op_min_until(state, ctx),
404        SreOpcode::REPEAT => op_repeat(req, state, ctx),
405        SreOpcode::REPEAT_ONE => op_repeat_one(req, state, ctx),
406        SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(req, state, ctx),
407        SreOpcode::GROUPREF => general_op_groupref(req, state, ctx, |x| x),
408        SreOpcode::GROUPREF_IGNORE => general_op_groupref(req, state, ctx, lower_ascii),
409        SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(req, state, ctx, lower_locate),
410        SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(req, state, ctx, lower_unicode),
411        SreOpcode::GROUPREF_EXISTS => {
412            let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize);
413            if group_start.is_some()
414                && group_end.is_some()
415                && group_start.unpack() <= group_end.unpack()
416            {
417                ctx.skip_code(3);
418            } else {
419                ctx.skip_code_from(req, 2)
420            }
421        }
422        _ => unreachable!("unexpected opcode"),
423    }
424}
425
426fn search_info_literal<const LITERAL: bool, S: StrDrive>(
427    req: &mut Request<S>,
428    state: &mut State<S>,
429    mut ctx: MatchContext<S>,
430) {
431    /* pattern starts with a known prefix */
432    /* <length> <skip> <prefix data> <overlap data> */
433    let len = ctx.peek_code(req, 5) as usize;
434    let skip = ctx.peek_code(req, 6) as usize;
435    let prefix = &ctx.pattern(req)[7..7 + len];
436    let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2];
437
438    // code_position ready for tail match
439    ctx.skip_code_from(req, 1);
440    ctx.skip_code(2 * skip);
441
442    req.must_advance = false;
443
444    if len == 1 {
445        // pattern starts with a literal character
446        let c = prefix[0];
447
448        while !ctx.at_end(req) {
449            // find the next matched literal
450            while ctx.peek_char(req) != c {
451                ctx.skip_char(req, 1);
452                if ctx.at_end(req) {
453                    return;
454                }
455            }
456
457            req.start = ctx.string_position;
458            state.start = ctx.string_position;
459            state.string_position = ctx.string_position + skip;
460
461            // literal only
462            if LITERAL {
463                state.has_matched = true;
464                return;
465            }
466
467            let mut next_ctx = ctx;
468            next_ctx.skip_char(req, skip);
469
470            state.context_stack.push(next_ctx);
471            state._match(req);
472
473            if state.has_matched {
474                return;
475            }
476
477            ctx.skip_char(req, 1);
478            state.marks.clear();
479        }
480    } else {
481        while !ctx.at_end(req) {
482            let c = prefix[0];
483            while ctx.peek_char(req) != c {
484                ctx.skip_char(req, 1);
485                if ctx.at_end(req) {
486                    return;
487                }
488            }
489            ctx.skip_char(req, 1);
490            if ctx.at_end(req) {
491                return;
492            }
493
494            let mut i = 1;
495            loop {
496                if ctx.peek_char(req) == prefix[i] {
497                    i += 1;
498                    if i != len {
499                        ctx.skip_char(req, 1);
500                        if ctx.at_end(req) {
501                            return;
502                        }
503                        continue;
504                    }
505
506                    req.start = ctx.string_position - (len - 1);
507                    state.start = req.start;
508                    state.string_position = state.start + skip;
509
510                    // literal only
511                    if LITERAL {
512                        state.has_matched = true;
513                        return;
514                    }
515
516                    let mut next_ctx = ctx;
517                    if skip != 0 {
518                        next_ctx.skip_char(req, 1);
519                    } else {
520                        next_ctx.string_position = state.string_position;
521                        next_ctx.string_offset = req.string.offset(0, state.string_position);
522                    }
523
524                    state.context_stack.push(next_ctx);
525                    state._match(req);
526
527                    if state.has_matched {
528                        return;
529                    }
530
531                    ctx.skip_char(req, 1);
532                    if ctx.at_end(req) {
533                        return;
534                    }
535                    state.marks.clear();
536                }
537
538                i = overlap[i] as usize;
539                if i == 0 {
540                    break;
541                }
542            }
543        }
544    }
545}
546
547fn search_info_charset<S: StrDrive>(
548    req: &mut Request<S>,
549    state: &mut State<S>,
550    mut ctx: MatchContext<S>,
551) {
552    let set = &ctx.pattern(req)[5..];
553
554    ctx.skip_code_from(req, 1);
555
556    req.must_advance = false;
557
558    loop {
559        while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) {
560            ctx.skip_char(req, 1);
561        }
562        if ctx.at_end(req) {
563            return;
564        }
565
566        req.start = ctx.string_position;
567        state.start = ctx.string_position;
568        state.string_position = ctx.string_position;
569
570        state.context_stack.push(ctx);
571        state._match(req);
572
573        if state.has_matched {
574            return;
575        }
576
577        ctx.skip_char(req, 1);
578        state.marks.clear();
579    }
580}
581
582/* assert subpattern */
583/* <ASSERT> <skip> <back> <pattern> */
584fn op_assert<S: StrDrive>(req: &Request<S>, state: &mut State<S>, ctx: &mut MatchContext<S>) {
585    let back = ctx.peek_code(req, 2) as usize;
586    if ctx.string_position < back {
587        return ctx.failure();
588    }
589
590    let next_ctx = ctx.next_offset(3, state, |req, state, ctx| {
591        if state.popped_has_matched {
592            ctx.skip_code_from(req, 1);
593        } else {
594            ctx.failure();
595        }
596    });
597    next_ctx.toplevel = false;
598    next_ctx.back_skip_char(req, back);
599    state.string_position = next_ctx.string_position;
600}
601
602/* assert not subpattern */
603/* <ASSERT_NOT> <skip> <back> <pattern> */
604fn op_assert_not<S: StrDrive>(req: &Request<S>, state: &mut State<S>, ctx: &mut MatchContext<S>) {
605    let back = ctx.peek_code(req, 2) as usize;
606
607    if ctx.string_position < back {
608        return ctx.skip_code_from(req, 1);
609    }
610
611    let next_ctx = ctx.next_offset(3, state, |req, state, ctx| {
612        if state.popped_has_matched {
613            ctx.failure();
614        } else {
615            ctx.skip_code_from(req, 1);
616        }
617    });
618    next_ctx.toplevel = false;
619    next_ctx.back_skip_char(req, back);
620    state.string_position = next_ctx.string_position;
621}
622
623// alternation
624// <BRANCH> <0=skip> code <JUMP> ... <NULL>
625fn op_branch<S: StrDrive>(req: &Request<S>, state: &mut State<S>, ctx: &mut MatchContext<S>) {
626    state.marks.push();
627
628    ctx.count = 1;
629    create_context(req, state, ctx);
630
631    fn create_context<S: StrDrive>(
632        req: &Request<S>,
633        state: &mut State<S>,
634        ctx: &mut MatchContext<S>,
635    ) {
636        let branch_offset = ctx.count as usize;
637        let next_length = ctx.peek_code(req, branch_offset) as isize;
638        if next_length == 0 {
639            state.marks.pop_discard();
640            return ctx.failure();
641        }
642
643        state.string_position = ctx.string_position;
644
645        ctx.count += next_length;
646        ctx.next_offset(branch_offset + 1, state, callback);
647    }
648
649    fn callback<S: StrDrive>(req: &Request<S>, state: &mut State<S>, ctx: &mut MatchContext<S>) {
650        if state.popped_has_matched {
651            return ctx.success();
652        }
653        state.marks.pop_keep();
654        create_context(req, state, ctx);
655    }
656}
657
658/* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
659fn op_min_repeat_one<S: StrDrive>(
660    req: &Request<S>,
661    state: &mut State<S>,
662    ctx: &mut MatchContext<S>,
663) {
664    let min_count = ctx.peek_code(req, 2) as usize;
665
666    if ctx.remaining_chars(req) < min_count {
667        return ctx.failure();
668    }
669
670    state.string_position = ctx.string_position;
671
672    ctx.count = if min_count == 0 {
673        0
674    } else {
675        let count = _count(req, state, ctx, min_count);
676        if count < min_count {
677            return ctx.failure();
678        }
679        ctx.skip_char(req, count);
680        count as isize
681    };
682
683    let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1);
684    if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) {
685        // tail is empty. we're finished
686        state.string_position = ctx.string_position;
687        return ctx.success();
688    }
689
690    state.marks.push();
691    create_context(req, state, ctx);
692
693    fn create_context<S: StrDrive>(
694        req: &Request<S>,
695        state: &mut State<S>,
696        ctx: &mut MatchContext<S>,
697    ) {
698        let max_count = ctx.peek_code(req, 3) as usize;
699
700        if max_count == MAXREPEAT || ctx.count as usize <= max_count {
701            state.string_position = ctx.string_position;
702            ctx.next_from(1, req, state, callback);
703        } else {
704            state.marks.pop_discard();
705            ctx.failure();
706        }
707    }
708
709    fn callback<S: StrDrive>(req: &Request<S>, state: &mut State<S>, ctx: &mut MatchContext<S>) {
710        if state.popped_has_matched {
711            return ctx.success();
712        }
713
714        state.string_position = ctx.string_position;
715
716        if _count(req, state, ctx, 1) == 0 {
717            state.marks.pop_discard();
718            return ctx.failure();
719        }
720
721        ctx.skip_char(req, 1);
722        ctx.count += 1;
723        state.marks.pop_keep();
724        create_context(req, state, ctx);
725    }
726}
727
728/* match repeated sequence (maximizing regexp) */
729/* this operator only works if the repeated item is
730exactly one character wide, and we're not already
731collecting backtracking points.  for other cases,
732use the MAX_REPEAT operator */
733/* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
734fn op_repeat_one<S: StrDrive>(req: &Request<S>, state: &mut State<S>, ctx: &mut MatchContext<S>) {
735    let min_count = ctx.peek_code(req, 2) as usize;
736    let max_count = ctx.peek_code(req, 3) as usize;
737
738    if ctx.remaining_chars(req) < min_count {
739        return ctx.failure();
740    }
741
742    state.string_position = ctx.string_position;
743
744    let count = _count(req, state, ctx, max_count);
745    ctx.skip_char(req, count);
746    if count < min_count {
747        return ctx.failure();
748    }
749
750    let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1);
751    if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) {
752        // tail is empty. we're finished
753        state.string_position = ctx.string_position;
754        return ctx.success();
755    }
756
757    state.marks.push();
758    ctx.count = count as isize;
759    create_context(req, state, ctx);
760
761    fn create_context<S: StrDrive>(
762        req: &Request<S>,
763        state: &mut State<S>,
764        ctx: &mut MatchContext<S>,
765    ) {
766        let min_count = ctx.peek_code(req, 2) as isize;
767        let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1);
768        if next_code == SreOpcode::LITERAL as u32 {
769            // Special case: Tail starts with a literal. Skip positions where
770            // the rest of the pattern cannot possibly match.
771            let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2);
772            while ctx.at_end(req) || ctx.peek_char(req) != c {
773                if ctx.count <= min_count {
774                    state.marks.pop_discard();
775                    return ctx.failure();
776                }
777                ctx.back_skip_char(req, 1);
778                ctx.count -= 1;
779            }
780        }
781
782        state.string_position = ctx.string_position;
783
784        // General case: backtracking
785        ctx.next_from(1, req, state, callback);
786    }
787
788    fn callback<S: StrDrive>(req: &Request<S>, state: &mut State<S>, ctx: &mut MatchContext<S>) {
789        if state.popped_has_matched {
790            return ctx.success();
791        }
792
793        let min_count = ctx.peek_code(req, 2) as isize;
794
795        if ctx.count <= min_count {
796            state.marks.pop_discard();
797            return ctx.failure();
798        }
799
800        ctx.back_skip_char(req, 1);
801        ctx.count -= 1;
802
803        state.marks.pop_keep();
804        create_context(req, state, ctx);
805    }
806}
807
808#[derive(Debug, Clone, Copy)]
809struct RepeatContext {
810    count: isize,
811    min_count: usize,
812    max_count: usize,
813    code_position: usize,
814    last_position: usize,
815    prev_id: usize,
816}
817
818/* create repeat context.  all the hard work is done
819by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
820/* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
821fn op_repeat<S: StrDrive>(req: &Request<S>, state: &mut State<S>, ctx: &mut MatchContext<S>) {
822    let repeat_ctx = RepeatContext {
823        count: -1,
824        min_count: ctx.peek_code(req, 2) as usize,
825        max_count: ctx.peek_code(req, 3) as usize,
826        code_position: ctx.code_position,
827        last_position: std::usize::MAX,
828        prev_id: ctx.repeat_ctx_id,
829    };
830
831    state.repeat_stack.push(repeat_ctx);
832
833    state.string_position = ctx.string_position;
834
835    let repeat_ctx_id = state.repeat_stack.len() - 1;
836
837    let next_ctx = ctx.next_from(1, req, state, |_, state, ctx| {
838        ctx.has_matched = Some(state.popped_has_matched);
839        state.repeat_stack.pop();
840    });
841    next_ctx.repeat_ctx_id = repeat_ctx_id;
842}
843
844/* minimizing repeat */
845fn op_min_until<S: StrDrive>(state: &mut State<S>, ctx: &mut MatchContext<S>) {
846    let repeat_ctx = state.repeat_stack.last_mut().unwrap();
847
848    state.string_position = ctx.string_position;
849
850    repeat_ctx.count += 1;
851
852    if (repeat_ctx.count as usize) < repeat_ctx.min_count {
853        // not enough matches
854        ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| {
855            if state.popped_has_matched {
856                ctx.success();
857            } else {
858                state.repeat_stack[ctx.repeat_ctx_id].count -= 1;
859                state.string_position = ctx.string_position;
860                ctx.failure();
861            }
862        });
863        return;
864    }
865
866    state.marks.push();
867
868    ctx.count = ctx.repeat_ctx_id as isize;
869
870    let repeat_ctx_prev_id = repeat_ctx.prev_id;
871
872    // see if the tail matches
873    let next_ctx = ctx.next_offset(1, state, |_, state, ctx| {
874        if state.popped_has_matched {
875            return ctx.success();
876        }
877
878        ctx.repeat_ctx_id = ctx.count as usize;
879
880        let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id];
881
882        state.string_position = ctx.string_position;
883
884        state.marks.pop();
885
886        // match more until tail matches
887
888        if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT
889            || state.string_position == repeat_ctx.last_position
890        {
891            repeat_ctx.count -= 1;
892            return ctx.failure();
893        }
894
895        /* zero-width match protection */
896        repeat_ctx.last_position = state.string_position;
897
898        ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| {
899            if state.popped_has_matched {
900                ctx.success();
901            } else {
902                state.repeat_stack[ctx.repeat_ctx_id].count -= 1;
903                state.string_position = ctx.string_position;
904                ctx.failure();
905            }
906        });
907    });
908    next_ctx.repeat_ctx_id = repeat_ctx_prev_id;
909}
910
911/* maximizing repeat */
912fn op_max_until<S: StrDrive>(state: &mut State<S>, ctx: &mut MatchContext<S>) {
913    let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id];
914
915    state.string_position = ctx.string_position;
916
917    repeat_ctx.count += 1;
918
919    if (repeat_ctx.count as usize) < repeat_ctx.min_count {
920        // not enough matches
921        ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| {
922            if state.popped_has_matched {
923                ctx.success();
924            } else {
925                state.repeat_stack[ctx.repeat_ctx_id].count -= 1;
926                state.string_position = ctx.string_position;
927                ctx.failure();
928            }
929        });
930        return;
931    }
932
933    if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT)
934        && state.string_position != repeat_ctx.last_position
935    {
936        /* we may have enough matches, but if we can
937        match another item, do so */
938        state.marks.push();
939
940        ctx.count = repeat_ctx.last_position as isize;
941        repeat_ctx.last_position = state.string_position;
942
943        ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| {
944            let save_last_position = ctx.count as usize;
945            let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id];
946            repeat_ctx.last_position = save_last_position;
947
948            if state.popped_has_matched {
949                state.marks.pop_discard();
950                return ctx.success();
951            }
952
953            state.marks.pop();
954            repeat_ctx.count -= 1;
955
956            state.string_position = ctx.string_position;
957
958            /* cannot match more repeated items here.  make sure the
959            tail matches */
960            let repeat_ctx_prev_id = repeat_ctx.prev_id;
961            let next_ctx = ctx.next_offset(1, state, tail_callback);
962            next_ctx.repeat_ctx_id = repeat_ctx_prev_id;
963        });
964        return;
965    }
966
967    /* cannot match more repeated items here.  make sure the
968    tail matches */
969    let repeat_ctx_prev_id = repeat_ctx.prev_id;
970    let next_ctx = ctx.next_offset(1, state, tail_callback);
971    next_ctx.repeat_ctx_id = repeat_ctx_prev_id;
972
973    fn tail_callback<S: StrDrive>(_: &Request<S>, state: &mut State<S>, ctx: &mut MatchContext<S>) {
974        if state.popped_has_matched {
975            ctx.success();
976        } else {
977            state.string_position = ctx.string_position;
978            ctx.failure();
979        }
980    }
981}
982
983pub trait StrDrive: Copy {
984    fn offset(&self, offset: usize, skip: usize) -> usize;
985    fn count(&self) -> usize;
986    fn peek(&self, offset: usize) -> u32;
987    fn back_peek(&self, offset: usize) -> u32;
988    fn back_offset(&self, offset: usize, skip: usize) -> usize;
989}
990
991impl StrDrive for &str {
992    fn offset(&self, offset: usize, skip: usize) -> usize {
993        self.get(offset..)
994            .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset))
995            .unwrap_or(self.len())
996    }
997
998    fn count(&self) -> usize {
999        self.chars().count()
1000    }
1001
1002    fn peek(&self, offset: usize) -> u32 {
1003        unsafe { self.get_unchecked(offset..) }
1004            .chars()
1005            .next()
1006            .unwrap() as u32
1007    }
1008
1009    fn back_peek(&self, offset: usize) -> u32 {
1010        let bytes = self.as_bytes();
1011        let back_offset = utf8_back_peek_offset(bytes, offset);
1012        match offset - back_offset {
1013            1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]),
1014            2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]),
1015            3 => u32::from_be_bytes([0, bytes[offset - 3], bytes[offset - 2], bytes[offset - 1]]),
1016            4 => u32::from_be_bytes([
1017                bytes[offset - 4],
1018                bytes[offset - 3],
1019                bytes[offset - 2],
1020                bytes[offset - 1],
1021            ]),
1022            _ => unreachable!(),
1023        }
1024    }
1025
1026    fn back_offset(&self, offset: usize, skip: usize) -> usize {
1027        let bytes = self.as_bytes();
1028        let mut back_offset = offset;
1029        for _ in 0..skip {
1030            back_offset = utf8_back_peek_offset(bytes, back_offset);
1031        }
1032        back_offset
1033    }
1034}
1035
1036impl<'a> StrDrive for &'a [u8] {
1037    fn offset(&self, offset: usize, skip: usize) -> usize {
1038        offset + skip
1039    }
1040
1041    fn count(&self) -> usize {
1042        self.len()
1043    }
1044
1045    fn peek(&self, offset: usize) -> u32 {
1046        self[offset] as u32
1047    }
1048
1049    fn back_peek(&self, offset: usize) -> u32 {
1050        self[offset - 1] as u32
1051    }
1052
1053    fn back_offset(&self, offset: usize, skip: usize) -> usize {
1054        offset - skip
1055    }
1056}
1057
1058type OpFunc<S> = for<'a> fn(&Request<'a, S>, &mut State<S>, &mut MatchContext<S>);
1059
1060#[derive(Clone, Copy)]
1061struct MatchContext<S: StrDrive> {
1062    string_position: usize,
1063    string_offset: usize,
1064    code_position: usize,
1065    has_matched: Option<bool>,
1066    toplevel: bool,
1067    handler: Option<OpFunc<S>>,
1068    repeat_ctx_id: usize,
1069    count: isize,
1070}
1071
1072impl<S: StrDrive> std::fmt::Debug for MatchContext<S> {
1073    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1074        f.debug_struct("MatchContext")
1075            .field("string_position", &self.string_position)
1076            .field("string_offset", &self.string_offset)
1077            .field("code_position", &self.code_position)
1078            .field("has_matched", &self.has_matched)
1079            .field("toplevel", &self.toplevel)
1080            .field("handler", &self.handler.map(|x| x as usize))
1081            .field("repeat_ctx_id", &self.repeat_ctx_id)
1082            .field("count", &self.count)
1083            .finish()
1084    }
1085}
1086
1087impl<S: StrDrive> MatchContext<S> {
1088    fn pattern<'a>(&self, req: &Request<'a, S>) -> &'a [u32] {
1089        &req.pattern_codes[self.code_position..]
1090    }
1091
1092    fn remaining_codes(&self, req: &Request<S>) -> usize {
1093        req.pattern_codes.len() - self.code_position
1094    }
1095
1096    fn remaining_chars(&self, req: &Request<S>) -> usize {
1097        req.end - self.string_position
1098    }
1099
1100    fn peek_char(&self, req: &Request<S>) -> u32 {
1101        req.string.peek(self.string_offset)
1102    }
1103
1104    fn skip_char(&mut self, req: &Request<S>, skip: usize) {
1105        self.string_position += skip;
1106        self.string_offset = req.string.offset(self.string_offset, skip);
1107    }
1108
1109    fn back_peek_char(&self, req: &Request<S>) -> u32 {
1110        req.string.back_peek(self.string_offset)
1111    }
1112
1113    fn back_skip_char(&mut self, req: &Request<S>, skip: usize) {
1114        self.string_position -= skip;
1115        self.string_offset = req.string.back_offset(self.string_offset, skip);
1116    }
1117
1118    fn peek_code(&self, req: &Request<S>, peek: usize) -> u32 {
1119        req.pattern_codes[self.code_position + peek]
1120    }
1121
1122    fn skip_code(&mut self, skip: usize) {
1123        self.code_position += skip;
1124    }
1125
1126    fn skip_code_from(&mut self, req: &Request<S>, peek: usize) {
1127        self.skip_code(self.peek_code(req, peek) as usize + 1);
1128    }
1129
1130    fn at_beginning(&self) -> bool {
1131        // self.ctx().string_position == self.state().start
1132        self.string_position == 0
1133    }
1134
1135    fn at_end(&self, req: &Request<S>) -> bool {
1136        self.string_position == req.end
1137    }
1138
1139    fn at_linebreak(&self, req: &Request<S>) -> bool {
1140        !self.at_end(req) && is_linebreak(self.peek_char(req))
1141    }
1142
1143    fn at_boundary<F: FnMut(u32) -> bool>(&self, req: &Request<S>, mut word_checker: F) -> bool {
1144        if self.at_beginning() && self.at_end(req) {
1145            return false;
1146        }
1147        let that = !self.at_beginning() && word_checker(self.back_peek_char(req));
1148        let this = !self.at_end(req) && word_checker(self.peek_char(req));
1149        this != that
1150    }
1151
1152    fn at_non_boundary<F: FnMut(u32) -> bool>(
1153        &self,
1154        req: &Request<S>,
1155        mut word_checker: F,
1156    ) -> bool {
1157        if self.at_beginning() && self.at_end(req) {
1158            return false;
1159        }
1160        let that = !self.at_beginning() && word_checker(self.back_peek_char(req));
1161        let this = !self.at_end(req) && word_checker(self.peek_char(req));
1162        this == that
1163    }
1164
1165    fn can_success(&self, req: &Request<S>) -> bool {
1166        if !self.toplevel {
1167            return true;
1168        }
1169        if req.match_all && !self.at_end(req) {
1170            return false;
1171        }
1172        if req.must_advance && self.string_position == req.start {
1173            return false;
1174        }
1175        true
1176    }
1177
1178    fn success(&mut self) {
1179        self.has_matched = Some(true);
1180    }
1181
1182    fn failure(&mut self) {
1183        self.has_matched = Some(false);
1184    }
1185
1186    fn next_from<'b>(
1187        &mut self,
1188        peek: usize,
1189        req: &Request<S>,
1190        state: &'b mut State<S>,
1191        f: OpFunc<S>,
1192    ) -> &'b mut Self {
1193        self.next_offset(self.peek_code(req, peek) as usize + 1, state, f)
1194    }
1195
1196    fn next_offset<'b>(
1197        &mut self,
1198        offset: usize,
1199        state: &'b mut State<S>,
1200        f: OpFunc<S>,
1201    ) -> &'b mut Self {
1202        self.next_at(self.code_position + offset, state, f)
1203    }
1204
1205    fn next_at<'b>(
1206        &mut self,
1207        code_position: usize,
1208        state: &'b mut State<S>,
1209        f: OpFunc<S>,
1210    ) -> &'b mut Self {
1211        self.handler = Some(f);
1212        state.next_context.insert(MatchContext {
1213            code_position,
1214            has_matched: None,
1215            handler: None,
1216            count: -1,
1217            ..*self
1218        })
1219    }
1220}
1221
1222fn at<S: StrDrive>(req: &Request<S>, ctx: &MatchContext<S>, atcode: SreAtCode) -> bool {
1223    match atcode {
1224        SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(),
1225        SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)),
1226        SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word),
1227        SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word),
1228        SreAtCode::END => {
1229            (ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req)
1230        }
1231        SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req),
1232        SreAtCode::END_STRING => ctx.at_end(req),
1233        SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, is_loc_word),
1234        SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, is_loc_word),
1235        SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, is_uni_word),
1236        SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, is_uni_word),
1237    }
1238}
1239
1240fn general_op_literal<S: StrDrive, F: FnOnce(u32, u32) -> bool>(
1241    req: &Request<S>,
1242    ctx: &mut MatchContext<S>,
1243    f: F,
1244) {
1245    if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) {
1246        ctx.failure();
1247    } else {
1248        ctx.skip_code(2);
1249        ctx.skip_char(req, 1);
1250    }
1251}
1252
1253fn general_op_in<S: StrDrive, F: FnOnce(&[u32], u32) -> bool>(
1254    req: &Request<S>,
1255    ctx: &mut MatchContext<S>,
1256    f: F,
1257) {
1258    if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) {
1259        ctx.failure();
1260    } else {
1261        ctx.skip_code_from(req, 1);
1262        ctx.skip_char(req, 1);
1263    }
1264}
1265
1266fn general_op_groupref<S: StrDrive, F: FnMut(u32) -> u32>(
1267    req: &Request<S>,
1268    state: &State<S>,
1269    ctx: &mut MatchContext<S>,
1270    mut f: F,
1271) {
1272    let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize);
1273    let (group_start, group_end) = if group_start.is_some()
1274        && group_end.is_some()
1275        && group_start.unpack() <= group_end.unpack()
1276    {
1277        (group_start.unpack(), group_end.unpack())
1278    } else {
1279        return ctx.failure();
1280    };
1281
1282    let mut gctx = MatchContext {
1283        string_position: group_start,
1284        string_offset: req.string.offset(0, group_start),
1285        ..*ctx
1286    };
1287
1288    for _ in group_start..group_end {
1289        if ctx.at_end(req) || f(ctx.peek_char(req)) != f(gctx.peek_char(req)) {
1290            return ctx.failure();
1291        }
1292        ctx.skip_char(req, 1);
1293        gctx.skip_char(req, 1);
1294    }
1295
1296    ctx.skip_code(2);
1297}
1298
1299fn char_loc_ignore(code: u32, c: u32) -> bool {
1300    code == c || code == lower_locate(c) || code == upper_locate(c)
1301}
1302
1303fn charset_loc_ignore(set: &[u32], c: u32) -> bool {
1304    let lo = lower_locate(c);
1305    if charset(set, c) {
1306        return true;
1307    }
1308    let up = upper_locate(c);
1309    up != lo && charset(set, up)
1310}
1311
1312fn category(catcode: SreCatCode, c: u32) -> bool {
1313    match catcode {
1314        SreCatCode::DIGIT => is_digit(c),
1315        SreCatCode::NOT_DIGIT => !is_digit(c),
1316        SreCatCode::SPACE => is_space(c),
1317        SreCatCode::NOT_SPACE => !is_space(c),
1318        SreCatCode::WORD => is_word(c),
1319        SreCatCode::NOT_WORD => !is_word(c),
1320        SreCatCode::LINEBREAK => is_linebreak(c),
1321        SreCatCode::NOT_LINEBREAK => !is_linebreak(c),
1322        SreCatCode::LOC_WORD => is_loc_word(c),
1323        SreCatCode::LOC_NOT_WORD => !is_loc_word(c),
1324        SreCatCode::UNI_DIGIT => is_uni_digit(c),
1325        SreCatCode::UNI_NOT_DIGIT => !is_uni_digit(c),
1326        SreCatCode::UNI_SPACE => is_uni_space(c),
1327        SreCatCode::UNI_NOT_SPACE => !is_uni_space(c),
1328        SreCatCode::UNI_WORD => is_uni_word(c),
1329        SreCatCode::UNI_NOT_WORD => !is_uni_word(c),
1330        SreCatCode::UNI_LINEBREAK => is_uni_linebreak(c),
1331        SreCatCode::UNI_NOT_LINEBREAK => !is_uni_linebreak(c),
1332    }
1333}
1334
1335fn charset(set: &[u32], ch: u32) -> bool {
1336    /* check if character is a member of the given set */
1337    let mut ok = true;
1338    let mut i = 0;
1339    while i < set.len() {
1340        let opcode = match SreOpcode::try_from(set[i]) {
1341            Ok(code) => code,
1342            Err(_) => {
1343                break;
1344            }
1345        };
1346        match opcode {
1347            SreOpcode::FAILURE => {
1348                return !ok;
1349            }
1350            SreOpcode::CATEGORY => {
1351                /* <CATEGORY> <code> */
1352                let catcode = match SreCatCode::try_from(set[i + 1]) {
1353                    Ok(code) => code,
1354                    Err(_) => {
1355                        break;
1356                    }
1357                };
1358                if category(catcode, ch) {
1359                    return ok;
1360                }
1361                i += 2;
1362            }
1363            SreOpcode::CHARSET => {
1364                /* <CHARSET> <bitmap> */
1365                let set = &set[i + 1..];
1366                if ch < 256 && ((set[(ch >> 5) as usize] & (1u32 << (ch & 31))) != 0) {
1367                    return ok;
1368                }
1369                i += 1 + 8;
1370            }
1371            SreOpcode::BIGCHARSET => {
1372                /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
1373                let count = set[i + 1] as usize;
1374                if ch < 0x10000 {
1375                    let set = &set[i + 2..];
1376                    let block_index = ch >> 8;
1377                    let (_, blockindices, _) = unsafe { set.align_to::<u8>() };
1378                    let blocks = &set[64..];
1379                    let block = blockindices[block_index as usize];
1380                    if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize]
1381                        & (1u32 << (ch & 31))
1382                        != 0
1383                    {
1384                        return ok;
1385                    }
1386                }
1387                i += 2 + 64 + count * 8;
1388            }
1389            SreOpcode::LITERAL => {
1390                /* <LITERAL> <code> */
1391                if ch == set[i + 1] {
1392                    return ok;
1393                }
1394                i += 2;
1395            }
1396            SreOpcode::NEGATE => {
1397                ok = !ok;
1398                i += 1;
1399            }
1400            SreOpcode::RANGE => {
1401                /* <RANGE> <lower> <upper> */
1402                if set[i + 1] <= ch && ch <= set[i + 2] {
1403                    return ok;
1404                }
1405                i += 3;
1406            }
1407            SreOpcode::RANGE_UNI_IGNORE => {
1408                /* <RANGE_UNI_IGNORE> <lower> <upper> */
1409                if set[i + 1] <= ch && ch <= set[i + 2] {
1410                    return ok;
1411                }
1412                let ch = upper_unicode(ch);
1413                if set[i + 1] <= ch && ch <= set[i + 2] {
1414                    return ok;
1415                }
1416                i += 3;
1417            }
1418            _ => {
1419                break;
1420            }
1421        }
1422    }
1423    /* internal error -- there's not much we can do about it
1424    here, so let's just pretend it didn't match... */
1425    false
1426}
1427
1428fn _count<S: StrDrive>(
1429    req: &Request<S>,
1430    state: &mut State<S>,
1431    ctx: &MatchContext<S>,
1432    max_count: usize,
1433) -> usize {
1434    let mut ctx = *ctx;
1435    let max_count = std::cmp::min(max_count, ctx.remaining_chars(req));
1436    let end = ctx.string_position + max_count;
1437    let opcode = SreOpcode::try_from(ctx.peek_code(req, 0)).unwrap();
1438
1439    match opcode {
1440        SreOpcode::ANY => {
1441            while !ctx.string_position < end && !ctx.at_linebreak(req) {
1442                ctx.skip_char(req, 1);
1443            }
1444        }
1445        SreOpcode::ANY_ALL => {
1446            ctx.skip_char(req, max_count);
1447        }
1448        SreOpcode::IN => {
1449            while !ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req))
1450            {
1451                ctx.skip_char(req, 1);
1452            }
1453        }
1454        SreOpcode::LITERAL => {
1455            general_count_literal(req, &mut ctx, end, |code, c| code == c as u32);
1456        }
1457        SreOpcode::NOT_LITERAL => {
1458            general_count_literal(req, &mut ctx, end, |code, c| code != c as u32);
1459        }
1460        SreOpcode::LITERAL_IGNORE => {
1461            general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c) as u32);
1462        }
1463        SreOpcode::NOT_LITERAL_IGNORE => {
1464            general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c) as u32);
1465        }
1466        SreOpcode::LITERAL_LOC_IGNORE => {
1467            general_count_literal(req, &mut ctx, end, char_loc_ignore);
1468        }
1469        SreOpcode::NOT_LITERAL_LOC_IGNORE => {
1470            general_count_literal(req, &mut ctx, end, |code, c| !char_loc_ignore(code, c));
1471        }
1472        SreOpcode::LITERAL_UNI_IGNORE => {
1473            general_count_literal(req, &mut ctx, end, |code, c| {
1474                code == lower_unicode(c) as u32
1475            });
1476        }
1477        SreOpcode::NOT_LITERAL_UNI_IGNORE => {
1478            general_count_literal(req, &mut ctx, end, |code, c| {
1479                code != lower_unicode(c) as u32
1480            });
1481        }
1482        _ => {
1483            /* General case */
1484            let mut count = 0;
1485
1486            ctx.skip_code(4);
1487            let reset_position = ctx.code_position;
1488
1489            while count < max_count {
1490                ctx.code_position = reset_position;
1491                let code = ctx.peek_code(req, 0);
1492                let code = SreOpcode::try_from(code).unwrap();
1493                dispatch(req, state, &mut ctx, code);
1494                if ctx.has_matched == Some(false) {
1495                    break;
1496                }
1497                count += 1;
1498            }
1499            return count;
1500        }
1501    }
1502
1503    // TODO: return offset
1504    ctx.string_position - state.string_position
1505}
1506
1507fn general_count_literal<S: StrDrive, F: FnMut(u32, u32) -> bool>(
1508    req: &Request<S>,
1509    ctx: &mut MatchContext<S>,
1510    end: usize,
1511    mut f: F,
1512) {
1513    let ch = ctx.peek_code(req, 1);
1514    while !ctx.string_position < end && f(ch, ctx.peek_char(req)) {
1515        ctx.skip_char(req, 1);
1516    }
1517}
1518
1519fn is_word(ch: u32) -> bool {
1520    ch == '_' as u32
1521        || u8::try_from(ch)
1522            .map(|x| x.is_ascii_alphanumeric())
1523            .unwrap_or(false)
1524}
1525fn is_space(ch: u32) -> bool {
1526    u8::try_from(ch)
1527        .map(is_py_ascii_whitespace)
1528        .unwrap_or(false)
1529}
1530fn is_digit(ch: u32) -> bool {
1531    u8::try_from(ch)
1532        .map(|x| x.is_ascii_digit())
1533        .unwrap_or(false)
1534}
1535fn is_loc_alnum(ch: u32) -> bool {
1536    // FIXME: Ignore the locales
1537    u8::try_from(ch)
1538        .map(|x| x.is_ascii_alphanumeric())
1539        .unwrap_or(false)
1540}
1541fn is_loc_word(ch: u32) -> bool {
1542    ch == '_' as u32 || is_loc_alnum(ch)
1543}
1544fn is_linebreak(ch: u32) -> bool {
1545    ch == '\n' as u32
1546}
1547pub fn lower_ascii(ch: u32) -> u32 {
1548    u8::try_from(ch)
1549        .map(|x| x.to_ascii_lowercase() as u32)
1550        .unwrap_or(ch)
1551}
1552fn lower_locate(ch: u32) -> u32 {
1553    // FIXME: Ignore the locales
1554    lower_ascii(ch)
1555}
1556fn upper_locate(ch: u32) -> u32 {
1557    // FIXME: Ignore the locales
1558    u8::try_from(ch)
1559        .map(|x| x.to_ascii_uppercase() as u32)
1560        .unwrap_or(ch)
1561}
1562fn is_uni_digit(ch: u32) -> bool {
1563    // TODO: check with cpython
1564    char::try_from(ch)
1565        .map(|x| x.is_ascii_digit())
1566        .unwrap_or(false)
1567}
1568fn is_uni_space(ch: u32) -> bool {
1569    // TODO: check with cpython
1570    is_space(ch)
1571        || matches!(
1572            ch,
1573            0x0009
1574                | 0x000A
1575                | 0x000B
1576                | 0x000C
1577                | 0x000D
1578                | 0x001C
1579                | 0x001D
1580                | 0x001E
1581                | 0x001F
1582                | 0x0020
1583                | 0x0085
1584                | 0x00A0
1585                | 0x1680
1586                | 0x2000
1587                | 0x2001
1588                | 0x2002
1589                | 0x2003
1590                | 0x2004
1591                | 0x2005
1592                | 0x2006
1593                | 0x2007
1594                | 0x2008
1595                | 0x2009
1596                | 0x200A
1597                | 0x2028
1598                | 0x2029
1599                | 0x202F
1600                | 0x205F
1601                | 0x3000
1602        )
1603}
1604fn is_uni_linebreak(ch: u32) -> bool {
1605    matches!(
1606        ch,
1607        0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029
1608    )
1609}
1610fn is_uni_alnum(ch: u32) -> bool {
1611    // TODO: check with cpython
1612    char::try_from(ch)
1613        .map(|x| x.is_alphanumeric())
1614        .unwrap_or(false)
1615}
1616fn is_uni_word(ch: u32) -> bool {
1617    ch == '_' as u32 || is_uni_alnum(ch)
1618}
1619pub fn lower_unicode(ch: u32) -> u32 {
1620    // TODO: check with cpython
1621    char::try_from(ch)
1622        .map(|x| x.to_lowercase().next().unwrap() as u32)
1623        .unwrap_or(ch)
1624}
1625pub fn upper_unicode(ch: u32) -> u32 {
1626    // TODO: check with cpython
1627    char::try_from(ch)
1628        .map(|x| x.to_uppercase().next().unwrap() as u32)
1629        .unwrap_or(ch)
1630}
1631
1632fn is_utf8_first_byte(b: u8) -> bool {
1633    // In UTF-8, there are three kinds of byte...
1634    // 0xxxxxxx : ASCII
1635    // 10xxxxxx : 2nd, 3rd or 4th byte of code
1636    // 11xxxxxx : 1st byte of multibyte code
1637    (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000)
1638}
1639
1640fn utf8_back_peek_offset(bytes: &[u8], offset: usize) -> usize {
1641    let mut offset = offset - 1;
1642    if !is_utf8_first_byte(bytes[offset]) {
1643        offset -= 1;
1644        if !is_utf8_first_byte(bytes[offset]) {
1645            offset -= 1;
1646            if !is_utf8_first_byte(bytes[offset]) {
1647                offset -= 1;
1648                if !is_utf8_first_byte(bytes[offset]) {
1649                    panic!("not utf-8 code point");
1650                }
1651            }
1652        }
1653    }
1654    offset
1655}