Skip to main content

pipa/regexp/
engine.rs

1use super::charclass::{is_line_terminator, is_word_char};
2use super::opcode::*;
3use super::pool::BacktrackPool;
4
/// Branch hint: marks `b` as expected to be `true` and returns it unchanged.
///
/// The unexpected (`false`) path calls an empty `#[cold]` function, which is
/// the stable-Rust way to tell the optimizer which side of a branch is hot.
/// The returned value is always exactly `b`.
#[inline(always)]
fn likely(b: bool) -> bool {
    #[cold]
    #[inline]
    fn cold_path() {}

    if !b {
        cold_path();
    }
    b
}
9
/// Branch hint: marks `b` as expected to be `false` and returns it unchanged.
///
/// The unexpected (`true`) path calls an empty `#[cold]` function, which is
/// the stable-Rust way to tell the optimizer which side of a branch is hot.
/// The returned value is always exactly `b`.
#[inline(always)]
fn unlikely(b: bool) -> bool {
    #[cold]
    #[inline]
    fn cold_path() {}

    if b {
        cold_path();
    }
    b
}
14
/// A successful match produced by the engine.
///
/// All positions are **character indices** into the input (the engine
/// advances its position register by one per `char`), not byte offsets.
/// For pure-ASCII input the two coincide.
#[derive(Debug, Clone)]
pub struct Match {
    /// Character index at which the match begins.
    pub start: usize,

    /// Character index one past the last matched character.
    pub end: usize,

    /// `(start, end)` character indices for each capture slot; a side is
    /// `None` when the engine never recorded it.
    pub captures: Vec<(Option<usize>, Option<usize>)>,
}

impl Match {
    /// Returns the matched text as a sub-slice of `input`.
    ///
    /// `input` must be the string the match was produced from. `start` and
    /// `end` are translated from character indices to byte offsets before
    /// slicing, so non-ASCII input yields the correct text instead of a
    /// mis-sliced string or a char-boundary panic. Indices past the end of
    /// a non-ASCII input are clamped to `input.len()`.
    ///
    /// # Panics
    ///
    /// Panics if `end` precedes `start`, or (ASCII input only) if either
    /// index exceeds `input.len()`.
    pub fn as_str<'a>(&self, input: &'a str) -> &'a str {
        // ASCII: char index == byte offset, keep the cheap direct slice.
        if input.is_ascii() {
            return &input[self.start..self.end];
        }

        let byte_offset = |char_idx: usize| {
            input
                .char_indices()
                .nth(char_idx)
                .map_or(input.len(), |(offset, _)| offset)
        };

        &input[byte_offset(self.start)..byte_offset(self.end)]
    }
}
29
30#[derive(Debug, Clone, Copy)]
31struct RegisterFile {
32    r: [usize; REG_COUNT],
33}
34
35impl Default for RegisterFile {
36    fn default() -> Self {
37        Self { r: [0; REG_COUNT] }
38    }
39}
40
41pub struct ExecContext<'a> {
42    input: &'a str,
43
44    input_bytes: &'a [u8],
45
46    char_positions: Vec<usize>,
47
48    char_len: usize,
49
50    is_ascii: bool,
51
52    bytecode: &'a [u8],
53
54    capture_count: usize,
55
56    backtrack_pool: BacktrackPool,
57
58    is_unicode: bool,
59
60    char_ranges: &'a [super::charclass::CharRange],
61
62    multi_line: bool,
63
64    sticky: bool,
65}
66
67pub fn execute(prog: &super::compiler::Program, input: &str, start_pos: usize) -> Option<Match> {
68    let ctx = ExecContext::new(prog, input);
69    ctx.execute(start_pos)
70}
71
72pub fn find_all(prog: &super::compiler::Program, input: &str) -> Vec<Match> {
73    let ctx = ExecContext::new(prog, input);
74    ctx.find_all()
75}
76
77impl<'a> ExecContext<'a> {
78    fn new(prog: &'a super::compiler::Program, input: &'a str) -> Self {
79        let input_bytes = input.as_bytes();
80        let flags = prog.flags;
81
82        let is_ascii = input_bytes.iter().all(|&b| b < 0x80);
83
84        let (char_positions, char_len) = if is_ascii {
85            (Vec::new(), input_bytes.len())
86        } else {
87            let positions: Vec<usize> = input.char_indices().map(|(i, _)| i).collect();
88            let len = positions.len();
89            (positions, len)
90        };
91
92        Self {
93            input,
94            input_bytes,
95            char_positions,
96            char_len,
97            is_ascii,
98            bytecode: &prog.bytecode[HEADER_LEN..],
99            capture_count: prog.capture_count,
100            backtrack_pool: BacktrackPool::new(),
101            is_unicode: (flags & FLAG_UNICODE) != 0 || (flags & FLAG_UNICODE_SETS) != 0,
102            multi_line: (flags & FLAG_MULTI_LINE) != 0,
103            sticky: (flags & FLAG_STICKY) != 0,
104            char_ranges: &prog.char_ranges,
105        }
106    }
107
108    #[inline(always)]
109    fn char_to_byte_pos(&self, char_pos: usize) -> usize {
110        if self.is_ascii {
111            char_pos.min(self.input_bytes.len())
112        } else {
113            self.char_positions
114                .get(char_pos)
115                .copied()
116                .unwrap_or(self.input_bytes.len())
117        }
118    }
119
120    fn execute(mut self, start_pos: usize) -> Option<Match> {
121        let is_sticky = self.sticky;
122
123        let char_len = self.char_len;
124        let mut pos = start_pos;
125
126        let starts_with_anchor = self.pattern_starts_with_start_anchor();
127
128        if starts_with_anchor && !is_sticky && start_pos == 0 {
129            let mut regs = RegisterFile::default();
130            regs.r[REG_POS] = 0;
131            let mut captures = vec![(None, None); self.capture_count];
132
133            if let Some(end_pos) = self.run(&mut regs, 0, &mut captures) {
134                return Some(Match {
135                    start: 0,
136                    end: end_pos,
137                    captures,
138                });
139            }
140            return None;
141        }
142
143        let ends_with_anchor = self.pattern_ends_with_end_anchor();
144
145        if ends_with_anchor && !is_sticky && char_len > 0 {
146            return self.execute_end_anchor(start_pos);
147        }
148
149        let mut captures: Vec<(Option<usize>, Option<usize>)> =
150            vec![(None, None); self.capture_count];
151
152        while pos < char_len {
153            let mut regs = RegisterFile::default();
154            regs.r[REG_POS] = pos;
155
156            for c in captures.iter_mut() {
157                *c = (None, None);
158            }
159
160            if let Some(end_pos) = self.run(&mut regs, 0, &mut captures) {
161                return Some(Match {
162                    start: pos,
163                    end: end_pos,
164                    captures,
165                });
166            }
167
168            if is_sticky {
169                break;
170            }
171            pos += 1;
172        }
173
174        None
175    }
176
177    fn pattern_starts_with_start_anchor(&self) -> bool {
178        let code = self.bytecode;
179        if code.is_empty() {
180            return false;
181        }
182
183        let mut i = 0;
184        while i < code.len() && i < 20 {
185            let op = code[i];
186            if op == OpCode::CheckLineStart as u8 {
187                return true;
188            }
189
190            match op {
191                27 | 28 => i += 2,
192                1 | 2 => i += 4,
193                9..=14 => i += 1,
194                _ => {
195                    break;
196                }
197            }
198        }
199        false
200    }
201
202    fn pattern_ends_with_end_anchor(&self) -> bool {
203        self.check_bytecode_has_end_anchor()
204    }
205
206    fn check_bytecode_has_end_anchor(&self) -> bool {
207        let code = self.bytecode;
208        let len = code.len();
209
210        if len < 2 {
211            return false;
212        }
213
214        for &op in code.iter().rev().take(50) {
215            if op == OpCode::CheckLineEnd as u8 {
216                return true;
217            }
218        }
219
220        false
221    }
222
223    fn execute_end_anchor(mut self, start_pos: usize) -> Option<Match> {
224        let char_len = self.char_len;
225
226        if self.multi_line {
227            let end_positions = self.find_all_line_ends(start_pos);
228            for end_pos in end_positions {
229                if let Some(m) = self.try_match_ending_at(end_pos, start_pos) {
230                    return Some(m);
231                }
232            }
233        } else {
234            if let Some(m) = self.try_match_ending_at(char_len, start_pos) {
235                return Some(m);
236            }
237        }
238
239        None
240    }
241
242    fn try_match_ending_at(&mut self, end_pos: usize, start_pos: usize) -> Option<Match> {
243        let max_lookback = 100;
244        let search_start = start_pos.max(end_pos.saturating_sub(max_lookback));
245
246        for pos in search_start..=end_pos {
247            let mut regs = RegisterFile::default();
248            regs.r[REG_POS] = pos;
249            let mut captures = vec![(None, None); self.capture_count];
250
251            if let Some(match_end) = self.run(&mut regs, 0, &mut captures) {
252                if match_end == end_pos {
253                    return Some(Match {
254                        start: pos,
255                        end: match_end,
256                        captures,
257                    });
258                }
259            }
260        }
261
262        None
263    }
264
265    fn find_all_line_ends(&self, start_pos: usize) -> Vec<usize> {
266        let mut ends = vec![self.char_len];
267
268        if !self.multi_line {
269            return ends;
270        }
271
272        for (i, c) in self.input.char_indices() {
273            if is_line_terminator(c as u32) {
274                let char_pos = self.byte_to_char_pos(i);
275                if char_pos >= start_pos && char_pos < self.char_len && !ends.contains(&char_pos) {
276                    ends.push(char_pos);
277                }
278            }
279        }
280
281        ends.sort_unstable();
282        ends
283    }
284
285    fn byte_to_char_pos(&self, byte_pos: usize) -> usize {
286        if self.is_ascii {
287            byte_pos
288        } else {
289            match self.char_positions.binary_search(&byte_pos) {
290                Ok(i) => i,
291                Err(i) => i.saturating_sub(1),
292            }
293        }
294    }
295
296    fn find_all(mut self) -> Vec<Match> {
297        let mut matches = Vec::new();
298        let mut pos = 0;
299
300        let char_len = self.char_len;
301
302        let mut captures: Vec<(Option<usize>, Option<usize>)> =
303            vec![(None, None); self.capture_count];
304
305        while pos < char_len {
306            let mut regs = RegisterFile::default();
307            regs.r[REG_POS] = pos;
308
309            self.backtrack_pool.clear();
310
311            for c in captures.iter_mut() {
312                *c = (None, None);
313            }
314
315            if let Some(end_pos) = self.run(&mut regs, 0, &mut captures) {
316                let match_start = pos;
317                let match_end = end_pos;
318
319                matches.push(Match {
320                    start: match_start,
321                    end: match_end,
322                    captures: captures.clone(),
323                });
324
325                if match_end <= match_start {
326                    pos += 1;
327                } else {
328                    pos = match_end;
329                }
330            } else {
331                pos += 1;
332            }
333        }
334
335        matches
336    }
337
    /// Interprets the bytecode starting at offset `pc`.
    ///
    /// `regs` holds the machine registers (`REG_POS` is the current character
    /// index in the input) and `captures` receives capture-group positions.
    ///
    /// Returns `Some(position)` — the value of `REG_POS` — when `Success` is
    /// executed. Returns `None` when `Halt` is executed, when an opcode byte
    /// above `OpCode::Halt` is met, or when a failure finds no backtrack
    /// state that leads to success (see `fail_or_backtrack`).
    ///
    /// # Panics
    ///
    /// Panics on `OpCode::Invalid`, and on out-of-bounds operand or register
    /// reads if the bytecode is malformed.
    fn run(
        &mut self,
        regs: &mut RegisterFile,
        mut pc: usize,
        captures: &mut [(Option<usize>, Option<usize>)],
    ) -> Option<usize> {
        loop {
            // Running off the end of the program counts as a failure.
            if unlikely(pc >= self.bytecode.len()) {
                return self.fail_or_backtrack(regs, pc, captures);
            }

            let opcode_byte = self.bytecode[pc];

            // Bytes above `Halt` end the run without trying to backtrack.
            // NOTE(review): the transmute assumes every value up to and
            // including `OpCode::Halt` is a valid `OpCode` discriminant —
            // confirm against the enum definition.
            let opcode = if likely(opcode_byte <= OpCode::Halt as u8) {
                unsafe { std::mem::transmute(opcode_byte) }
            } else {
                return None;
            };

            match opcode {
                // ---- termination ----
                OpCode::Success => {
                    return Some(regs.r[REG_POS]);
                }

                OpCode::Fail => {
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // Unlike `Fail`, `Halt` returns without consulting the
                // backtrack pool.
                OpCode::Halt => {
                    return None;
                }

                // ---- single-character matchers ----
                // 4-byte instruction; the expected code unit is a 16-bit
                // little-endian operand at pc+2.
                OpCode::MatchChar => {
                    let expected =
                        ((self.bytecode[pc + 3] as u16) << 8 | self.bytecode[pc + 2] as u16) as u32;
                    let pos = regs.r[REG_POS];

                    if likely(pos < self.char_len) {
                        let byte_pos = if self.is_ascii {
                            pos
                        } else {
                            self.char_positions[pos]
                        };
                        // Fast path: an ASCII lead byte can be compared
                        // without decoding the character.
                        let b = self.input_bytes[byte_pos];
                        if likely(b < 0x80 && b as u32 == expected) {
                            regs.r[REG_POS] = pos + 1;
                            pc += 4;
                            continue;
                        }

                        // Slow path: decode the character and compare.
                        if let Some(c) = self.get_char_fast(pos) {
                            if c as u32 == expected {
                                regs.r[REG_POS] = pos + 1;
                                pc += 4;
                                continue;
                            }
                        }
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // Case-insensitive variant: the input character is
                // canonicalized before comparison.
                // NOTE(review): presumably the operand was canonicalized the
                // same way at compile time — confirm in the compiler.
                OpCode::MatchCharI => {
                    let expected = self.read_u16(pc + 2) as u32;
                    let pos = regs.r[REG_POS];

                    if likely(pos < self.char_len) {
                        if let Some(c) = self.get_char_fast(pos) {
                            if canonicalize(c as u32, self.is_unicode) == expected {
                                regs.r[REG_POS] = pos + 1;
                                pc += opcode.size();
                                continue;
                            }
                        }
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // 32-bit operand at pc+2. The position is read from and
                // written to the register named by the byte at pc+1 rather
                // than the fixed `REG_POS` slot.
                OpCode::MatchChar32 => {
                    let reg = self.bytecode[pc + 1] as usize;
                    let expected = self.read_u32(pc + 2);
                    let pos = regs.r[reg];

                    if let Some(c) = self.get_char(pos) {
                        if c as u32 == expected {
                            regs.r[reg] = pos + 1;
                            pc += opcode.size();
                            continue;
                        }
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // Case-insensitive form of `MatchChar32`.
                OpCode::MatchChar32I => {
                    let reg = self.bytecode[pc + 1] as usize;
                    let expected = self.read_u32(pc + 2);
                    let pos = regs.r[reg];

                    if let Some(c) = self.get_char(pos) {
                        if canonicalize(c as u32, self.is_unicode) == expected {
                            regs.r[reg] = pos + 1;
                            pc += opcode.size();
                            continue;
                        }
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // Any character except a line terminator.
                OpCode::MatchDot => {
                    let pos = regs.r[REG_POS];
                    if let Some(c) = self.get_char(pos) {
                        if !is_line_terminator(c as u32) {
                            regs.r[REG_POS] = pos + 1;
                            pc += 1;
                            continue;
                        }
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // Any character at all; fails only at end of input.
                OpCode::MatchAny => {
                    let pos = regs.r[REG_POS];
                    if self.get_char(pos).is_some() {
                        regs.r[REG_POS] = pos + 1;
                        pc += 1;
                        continue;
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // ---- character classes ----
                // The 16-bit operand at pc+2 indexes `char_ranges`; an index
                // outside the table is treated as a failed match.
                OpCode::MatchClass => {
                    let range_idx = self.read_u16(pc + 2) as usize;
                    let pos = regs.r[REG_POS];
                    if let Some(c) = self.get_char(pos) {
                        if let Some(range) = self.char_ranges.get(range_idx) {
                            if range.contains(c as u32) {
                                regs.r[REG_POS] = pos + 1;
                                pc += 4;
                                continue;
                            }
                        }
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // Case-insensitive class: accepts the character if either it
                // or its canonical form is in the class.
                OpCode::MatchClassI => {
                    let range_idx = self.read_u16(pc + 2) as usize;
                    let pos = regs.r[REG_POS];
                    if let Some(c) = self.get_char(pos) {
                        let c_upper = canonicalize(c as u32, self.is_unicode);
                        if let Some(range) = self.char_ranges.get(range_idx) {
                            if range.contains(c as u32) || range.contains(c_upper) {
                                regs.r[REG_POS] = pos + 1;
                                pc += 4;
                                continue;
                            }
                        }
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // ---- zero-width assertions ----
                // Succeeds at position 0, or (multi-line only) directly
                // after a line terminator.
                OpCode::CheckLineStart => {
                    let pos = regs.r[REG_POS];
                    let at_start = pos == 0;
                    let after_newline = if self.multi_line {
                        pos > 0 && is_line_terminator(self.get_char(pos - 1).unwrap_or('\0') as u32)
                    } else {
                        false
                    };

                    if at_start || after_newline {
                        pc += 1;
                        continue;
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // Succeeds at end of input, or (multi-line only) directly
                // before a line terminator.
                OpCode::CheckLineEnd => {
                    let pos = regs.r[REG_POS];
                    let at_end = pos >= self.char_len;
                    let before_newline = if self.multi_line {
                        self.get_char(pos)
                            .map_or(false, |c| is_line_terminator(c as u32))
                    } else {
                        false
                    };

                    if at_end || before_newline {
                        pc += 1;
                        continue;
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                OpCode::CheckWordBoundary | OpCode::CheckWordBoundaryI => {
                    let ignore_case = opcode == OpCode::CheckWordBoundaryI;
                    if self.check_word_boundary(regs.r[REG_POS], ignore_case) {
                        pc += 1;
                        continue;
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                OpCode::CheckNotWordBoundary | OpCode::CheckNotWordBoundaryI => {
                    let ignore_case = opcode == OpCode::CheckNotWordBoundaryI;
                    if !self.check_word_boundary(regs.r[REG_POS], ignore_case) {
                        pc += 1;
                        continue;
                    }
                    return self.fail_or_backtrack(regs, pc, captures);
                }

                // ---- control flow ----
                // 5-byte instruction: opcode + signed 32-bit little-endian
                // offset, applied relative to the end of the instruction.
                OpCode::Jmp => {
                    let offset = i32::from_le_bytes([
                        self.bytecode[pc + 1],
                        self.bytecode[pc + 2],
                        self.bytecode[pc + 3],
                        self.bytecode[pc + 4],
                    ]);
                    pc = (pc as i32 + 5 + offset) as usize;
                    continue;
                }

                // Handled here exactly like `Jmp` (unconditional).
                OpCode::JmpMatch => {
                    let offset = i32::from_le_bytes([
                        self.bytecode[pc + 1],
                        self.bytecode[pc + 2],
                        self.bytecode[pc + 3],
                        self.bytecode[pc + 4],
                    ]);
                    pc = (pc as i32 + 5 + offset) as usize;
                    continue;
                }

                // Handled here exactly like `Jmp` (unconditional).
                OpCode::JmpFail => {
                    let offset = i32::from_le_bytes([
                        self.bytecode[pc + 1],
                        self.bytecode[pc + 2],
                        self.bytecode[pc + 3],
                        self.bytecode[pc + 4],
                    ]);
                    pc = (pc as i32 + 5 + offset) as usize;
                    continue;
                }

                // Conditional jumps: 10-byte instructions laid out as
                // opcode, register (1 byte), immediate (4), offset (4).
                OpCode::JmpEq => {
                    let reg = self.bytecode[pc + 1] as usize;
                    let imm = self.read_u32(pc + 2) as usize;
                    let offset = self.read_i32(pc + 6);

                    if regs.r[reg] == imm {
                        pc = (pc as i32 + 10 + offset) as usize;
                    } else {
                        pc += 10;
                    }
                    continue;
                }

                OpCode::JmpNe => {
                    let reg = self.bytecode[pc + 1] as usize;
                    let imm = self.read_u32(pc + 2) as usize;
                    let offset = self.read_i32(pc + 6);

                    if regs.r[reg] != imm {
                        pc = (pc as i32 + 10 + offset) as usize;
                    } else {
                        pc += 10;
                    }
                    continue;
                }

                OpCode::JmpLt => {
                    let reg = self.bytecode[pc + 1] as usize;
                    let imm = self.read_u32(pc + 2) as usize;
                    let offset = self.read_i32(pc + 6);

                    if regs.r[reg] < imm {
                        pc = (pc as i32 + 10 + offset) as usize;
                    } else {
                        pc += 10;
                    }
                    continue;
                }

                // ---- register arithmetic ----
                OpCode::MovImm => {
                    let reg = self.bytecode[pc + 1] as usize;
                    let imm = self.read_u32(pc + 2);
                    regs.r[reg] = imm as usize;
                    pc += 6;
                    continue;
                }

                OpCode::MovReg => {
                    let dst = self.bytecode[pc + 1] as usize;
                    let src = self.bytecode[pc + 2] as usize;
                    regs.r[dst] = regs.r[src];
                    pc += 3;
                    continue;
                }

                // Wrapping arithmetic: overflow never panics.
                OpCode::Inc => {
                    let reg = self.bytecode[pc + 1] as usize;
                    regs.r[reg] = regs.r[reg].wrapping_add(1);
                    pc += 2;
                    continue;
                }

                OpCode::Dec => {
                    let reg = self.bytecode[pc + 1] as usize;
                    regs.r[reg] = regs.r[reg].wrapping_sub(1);
                    pc += 2;
                    continue;
                }

                OpCode::AddImm => {
                    let reg = self.bytecode[pc + 1] as usize;
                    let imm = self.read_u32(pc + 2);
                    regs.r[reg] = regs.r[reg].wrapping_add(imm as usize);
                    pc += 6;
                    continue;
                }

                // ---- captures ----
                // Capture indices outside `captures` are silently ignored.
                OpCode::SaveStart => {
                    let idx = self.bytecode[pc + 1] as usize;
                    if idx < captures.len() {
                        captures[idx].0 = Some(regs.r[REG_POS]);
                    }
                    pc += 2;
                    continue;
                }

                OpCode::SaveEnd => {
                    let idx = self.bytecode[pc + 1] as usize;
                    if idx < captures.len() {
                        captures[idx].1 = Some(regs.r[REG_POS]);
                    }
                    pc += 2;
                    continue;
                }

                // Clears the inclusive range of capture slots start..=end.
                OpCode::ResetCaptures => {
                    let start = self.bytecode[pc + 1] as usize;
                    let end = self.bytecode[pc + 2] as usize;
                    for i in start..=end {
                        if i < captures.len() {
                            captures[i] = (None, None);
                        }
                    }
                    pc += 3;
                    continue;
                }

                // ---- backtracking ----
                // Records where to resume if the path that follows fails.
                // Only capture slot 0 is snapshotted, all values are
                // narrowed to u32, and `u32::MAX` stands for "not set".
                OpCode::PushBacktrack => {
                    let offset = self.read_i32(pc + 1);
                    let fail_target = (pc as i32 + 5 + offset) as usize;

                    self.backtrack_pool.push(super::pool::BacktrackState {
                        pc: fail_target as u32,
                        pos: regs.r[REG_POS] as u32,
                        counter: regs.r[REG_COUNTER] as u32,
                        capture_start: captures
                            .get(0)
                            .and_then(|c| c.0)
                            .unwrap_or(u32::MAX as usize)
                            as u32,
                        capture_end: captures
                            .get(0)
                            .and_then(|c| c.1)
                            .unwrap_or(u32::MAX as usize)
                            as u32,
                    });

                    pc += 5;
                    continue;
                }

                // Discards the most recent backtrack state without
                // resuming it.
                OpCode::PopBacktrack => {
                    self.backtrack_pool.pop();
                    pc += 1;
                    continue;
                }

                // ---- counted repetition ----
                // Zeroes the counter register and stores the min and max
                // operands in the two registers after it, when those fit in
                // the register file.
                OpCode::InitCounter => {
                    let reg = self.bytecode[pc + 1] as usize;
                    let min = self.read_u32(pc + 2);
                    let max = self.read_u32(pc + 6);

                    regs.r[reg] = 0;

                    if reg + 1 < REG_COUNT {
                        regs.r[reg + 1] = min as usize;
                    }
                    if reg + 2 < REG_COUNT {
                        regs.r[reg + 2] = max as usize;
                    }

                    pc += 10;
                    continue;
                }

                // Jumps by `fail_offset` once the counter has reached the
                // max stored two registers above it; otherwise increments
                // the counter and falls through.
                OpCode::CheckCounter => {
                    let reg = self.bytecode[pc + 1] as usize;
                    let fail_offset = self.read_i32(pc + 2);

                    let count = regs.r[reg];
                    let max = if reg + 2 < REG_COUNT {
                        regs.r[reg + 2]
                    } else {
                        usize::MAX
                    };

                    if count >= max {
                        pc = (pc as i32 + 6 + fail_offset) as usize;
                    } else {
                        regs.r[reg] = count + 1;
                        pc += 6;
                    }
                    continue;
                }

                OpCode::Invalid => {
                    panic!("Invalid opcode at pc={}", pc);
                }

                // Any opcode without a handler above is treated as a
                // failed match.
                _ => {
                    return self.fail_or_backtrack(regs, pc, captures);
                }
            }
        }
    }
767
768    fn fail_or_backtrack(
769        &mut self,
770        regs: &mut RegisterFile,
771        _pc: usize,
772        captures: &mut [(Option<usize>, Option<usize>)],
773    ) -> Option<usize> {
774        while let Some(state) = self.backtrack_pool.pop() {
775            regs.r[REG_POS] = state.pos as usize;
776            regs.r[REG_COUNTER] = state.counter as usize;
777
778            if state.capture_start != u32::MAX && state.capture_end != u32::MAX {
779                if !captures.is_empty() {
780                    captures[0] = (
781                        Some(state.capture_start as usize),
782                        Some(state.capture_end as usize),
783                    );
784                }
785            }
786
787            if let Some(result) = self.run(regs, state.pc as usize, captures) {
788                return Some(result);
789            }
790        }
791        None
792    }
793
794    #[inline(always)]
795    fn get_char_fast(&self, pos: usize) -> Option<char> {
796        if unlikely(pos >= self.char_len) {
797            return None;
798        }
799
800        let byte_pos = self.char_to_byte_pos(pos);
801        let b = self.input_bytes[byte_pos];
802
803        if likely(b < 0x80) {
804            Some(b as char)
805        } else {
806            self.get_char_utf8(pos)
807        }
808    }
809
810    #[inline(never)]
811    fn get_char_utf8(&self, pos: usize) -> Option<char> {
812        if pos >= self.char_len {
813            return None;
814        }
815        let byte_pos = self.char_to_byte_pos(pos);
816        let bytes = &self.input_bytes[byte_pos..];
817        let first = *bytes.first()?;
818
819        let len = if first < 0xE0 {
820            2
821        } else if first < 0xF0 {
822            3
823        } else {
824            4
825        };
826
827        if bytes.len() < len {
828            return None;
829        }
830
831        std::str::from_utf8(&bytes[..len]).ok()?.chars().next()
832    }
833
834    fn get_char(&self, pos: usize) -> Option<char> {
835        if pos >= self.char_len {
836            return None;
837        }
838        self.get_char_fast(pos)
839    }
840
841    #[inline(always)]
842    fn read_u16(&self, pos: usize) -> u16 {
843        let bytes = &self.bytecode[pos..pos + 2];
844        u16::from_le_bytes([bytes[0], bytes[1]])
845    }
846
847    #[inline(always)]
848    fn read_u32(&self, pos: usize) -> u32 {
849        let bytes = &self.bytecode[pos..pos + 4];
850        u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
851    }
852
853    #[inline(always)]
854    fn read_i32(&self, pos: usize) -> i32 {
855        self.read_u32(pos) as i32
856    }
857
858    fn check_word_boundary(&self, pos: usize, ignore_case: bool) -> bool {
859        let prev_is_word = if pos == 0 {
860            false
861        } else {
862            self.get_char(pos - 1).map_or(false, |c| {
863                let cp = if ignore_case {
864                    canonicalize(c as u32, self.is_unicode)
865                } else {
866                    c as u32
867                };
868                is_word_char(cp)
869            })
870        };
871
872        let next_is_word = self.get_char(pos).map_or(false, |c| {
873            let cp = if ignore_case {
874                canonicalize(c as u32, self.is_unicode)
875            } else {
876                c as u32
877            };
878            is_word_char(cp)
879        });
880
881        prev_is_word != next_is_word
882    }
883}
884
/// Maps a code point to the form used for case-insensitive comparison.
///
/// Only ASCII letters are affected: in Unicode mode `A`–`Z` are lowered,
/// otherwise `a`–`z` are raised. Every other code point, including all
/// non-ASCII ones, is returned unchanged.
#[inline(always)]
fn canonicalize(c: u32, is_unicode: bool) -> u32 {
    // Distance between an ASCII letter and its other-case counterpart.
    const CASE_OFFSET: u32 = 32;

    match (is_unicode, c) {
        (true, 0x41..=0x5A) => c + CASE_OFFSET,
        (false, 0x61..=0x7A) => c - CASE_OFFSET,
        _ => c,
    }
}
905
#[cfg(test)]
mod tests {
    use super::super::compiler::compile;
    use super::super::parser::parse;
    use super::*;

    /// Parses and compiles `pattern` with no flags, then searches `haystack`
    /// from position 0.
    fn first_match(pattern: &str, haystack: &str) -> Option<Match> {
        let ast = parse(pattern, 0).unwrap();
        let prog = compile(&ast, 0).unwrap();
        execute(&prog, haystack, 0)
    }

    /// A literal matches an identical input in full.
    #[test]
    fn test_execute_simple() {
        let found = first_match("abc", "abc").unwrap();
        assert_eq!(found.start, 0);
        assert_eq!(found.end, 3);
    }

    /// A literal matches as a prefix of a longer input.
    #[test]
    fn test_execute_literal() {
        let found = first_match("hello", "hello world").unwrap();
        assert_eq!(found.start, 0);
        assert_eq!(found.end, 5);
    }

    /// A literal absent from the input yields no match.
    #[test]
    fn test_no_match() {
        assert!(first_match("xyz", "abc").is_none());
    }

    /// The scan finds a literal that starts later in an ASCII input.
    #[test]
    fn test_ascii_fast_path() {
        let found = first_match("test", "this is a test").unwrap();
        assert_eq!(found.start, 10);
        assert_eq!(found.end, 14);
    }
}