use super::Flags;
use super::parser::{PropKind, Shorthand};
use alloc::vec::Vec;
use core::cell::Cell;
const STEP_BASE: u64 = crate::limits::DEFAULT_REGEX_STEP_BASE;
const STEP_PER_CHAR: u64 = 1_000;
const MAX_DEPTH: u32 = crate::limits::DEFAULT_REGEX_MAX_DEPTH;
pub(crate) enum Inst {
Char(u32),
Any,
Class(Class),
Match,
Jmp(usize),
Split(usize, usize),
Save(usize),
Assert(Assert),
Look { neg: bool, prog: Vec<Inst> },
LookBehind { neg: bool, prog: Vec<Inst> },
Backref(usize),
}
pub(crate) struct Class {
pub neg: bool,
pub members: Vec<ClassMember>,
}
pub(crate) enum ClassMember {
Char(u32),
Range(u32, u32),
Shorthand(Shorthand),
}
pub(crate) enum Assert {
Start,
End,
WordBoundary,
NotWordBoundary,
}
pub(crate) fn budget_for(input_len: usize) -> u64 {
STEP_BASE.saturating_add(STEP_PER_CHAR.saturating_mul(input_len as u64))
}
fn read_cp(input: &[u16], sp: usize, unicode: bool) -> Option<(u32, usize)> {
let u = *input.get(sp)? as u32;
if unicode
&& (0xD800..=0xDBFF).contains(&u)
&& let Some(&lo) = input.get(sp + 1)
{
let lo = lo as u32;
if (0xDC00..=0xDFFF).contains(&lo) {
let cp = 0x10000 + ((u - 0xD800) << 10) + (lo - 0xDC00);
return Some((cp, 2));
}
}
Some((u, 1))
}
pub(crate) fn run_shared(
prog: &[Inst],
input: &[u16],
start: usize,
group_count: usize,
flags: Flags,
steps: &Cell<u64>,
budget: u64,
) -> Option<Vec<Option<(usize, usize)>>> {
let mut saves = alloc::vec![None; 2 * (group_count + 1)];
let ctx = Ctx {
prog,
input,
flags,
match_end: None,
steps,
budget,
};
let mut loops: Vec<(usize, usize)> = Vec::new();
if backtrack(&ctx, 0, start, &mut saves, 0, &mut loops) {
let mut groups = Vec::with_capacity(group_count + 1);
for g in 0..=group_count {
groups.push(match (saves[2 * g], saves[2 * g + 1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
});
}
Some(groups)
} else {
None
}
}
struct Ctx<'a> {
prog: &'a [Inst],
input: &'a [u16],
flags: Flags,
match_end: Option<usize>,
steps: &'a Cell<u64>,
budget: u64,
}
impl Ctx<'_> {
#[inline]
fn tick(&self) -> bool {
let n = self.steps.get().saturating_add(1);
self.steps.set(n);
n <= self.budget
}
}
fn backtrack(
ctx: &Ctx,
mut pc: usize,
mut sp: usize,
saves: &mut Vec<Option<usize>>,
depth: u32,
loops: &mut Vec<(usize, usize)>,
) -> bool {
if depth > MAX_DEPTH {
return false;
}
let unicode = ctx.flags.unicode;
loop {
if !ctx.tick() {
return false;
}
match &ctx.prog[pc] {
Inst::Match => return ctx.match_end.is_none_or(|p| sp == p),
Inst::Char(c) => {
if let Some((cp, len)) = read_cp(ctx.input, sp, unicode)
&& cp_eq(cp, *c, ctx.flags)
{
sp += len;
pc += 1;
continue;
}
return false;
}
Inst::Any => {
if let Some((cp, len)) = read_cp(ctx.input, sp, unicode)
&& (ctx.flags.dotall || !is_line_term(cp))
{
sp += len;
pc += 1;
continue;
}
return false;
}
Inst::Class(class) => {
if let Some((cp, len)) = read_cp(ctx.input, sp, unicode)
&& class_matches(class, cp, ctx.flags)
{
sp += len;
pc += 1;
continue;
}
return false;
}
Inst::Jmp(target) => pc = *target,
Inst::Split(a, b) => {
if let Some((consume_pc, cont_pc, greedy)) = simple_loop(ctx.prog, pc) {
if greedy {
let mut positions = alloc::vec![sp];
while let Some(adv) =
consume_one(&ctx.prog[consume_pc], ctx.input, sp, ctx.flags)
{
sp += adv;
positions.push(sp);
if !ctx.tick() {
return false;
}
}
while let Some(p) = positions.pop() {
if backtrack(ctx, cont_pc, p, saves, depth + 1, loops) {
return true;
}
if !ctx.tick() {
return false;
}
}
return false;
}
loop {
if backtrack(ctx, cont_pc, sp, saves, depth + 1, loops) {
return true;
}
let Some(adv) =
consume_one(&ctx.prog[consume_pc], ctx.input, sp, ctx.flags)
else {
return false;
};
sp += adv;
if !ctx.tick() {
return false;
}
}
}
if let Some((body, exit)) = loop_head(ctx.prog, pc) {
if loops.contains(&(pc, sp)) {
pc = exit;
continue;
}
loops.push((pc, sp));
let took_body = backtrack(ctx, body, sp, saves, depth + 1, loops);
loops.pop();
if took_body {
return true;
}
pc = exit;
continue;
}
if backtrack(ctx, *a, sp, saves, depth + 1, loops) {
return true;
}
pc = *b;
}
Inst::Save(slot) => {
let old = saves[*slot];
saves[*slot] = Some(sp);
if backtrack(ctx, pc + 1, sp, saves, depth + 1, loops) {
return true;
}
saves[*slot] = old;
return false;
}
Inst::Look { neg, prog } => {
let sub = Ctx {
prog,
input: ctx.input,
flags: ctx.flags,
match_end: None,
steps: ctx.steps,
budget: ctx.budget,
};
let mut sub_saves = alloc::vec![None; saves.len()];
let mut sub_loops = Vec::new();
let matched = backtrack(&sub, 0, sp, &mut sub_saves, depth + 1, &mut sub_loops);
if matched != *neg {
pc += 1;
} else {
return false;
}
}
Inst::LookBehind { neg, prog } => {
let sub = Ctx {
prog,
input: ctx.input,
flags: ctx.flags,
match_end: Some(sp),
steps: ctx.steps,
budget: ctx.budget,
};
let mut matched = false;
for j in (0..=sp).rev() {
if !ctx.tick() {
return false;
}
let mut sub_saves = alloc::vec![None; saves.len()];
let mut sub_loops = Vec::new();
if backtrack(&sub, 0, j, &mut sub_saves, depth + 1, &mut sub_loops) {
matched = true;
break;
}
}
if matched != *neg {
pc += 1;
} else {
return false;
}
}
Inst::Backref(g) => {
match (
saves.get(2 * g).copied().flatten(),
saves.get(2 * g + 1).copied().flatten(),
) {
(Some(s), Some(e)) => {
let len = e - s;
if sp + len <= ctx.input.len()
&& (0..len)
.all(|i| unit_eq(ctx.input[sp + i], ctx.input[s + i], ctx.flags))
{
sp += len;
pc += 1;
} else {
return false;
}
}
_ => pc += 1,
}
}
Inst::Assert(assert) => {
if assert_ok(assert, ctx.input, sp, ctx.flags) {
pc += 1;
} else {
return false;
}
}
}
}
}
fn simple_loop(prog: &[Inst], split_pc: usize) -> Option<(usize, usize, bool)> {
let Inst::Split(a, b) = &prog[split_pc] else {
return None;
};
if *a == split_pc + 1
&& is_single_consume(&prog[split_pc + 1])
&& matches!(prog.get(split_pc + 2), Some(Inst::Jmp(t)) if *t == split_pc)
{
return Some((split_pc + 1, *b, true));
}
if *b == split_pc + 1
&& is_single_consume(&prog[split_pc + 1])
&& matches!(prog.get(split_pc + 2), Some(Inst::Jmp(t)) if *t == split_pc)
{
return Some((split_pc + 1, *a, false));
}
None
}
fn is_single_consume(inst: &Inst) -> bool {
matches!(inst, Inst::Char(_) | Inst::Any | Inst::Class(_))
}
fn loop_head(prog: &[Inst], split_pc: usize) -> Option<(usize, usize)> {
let Inst::Split(a, b) = &prog[split_pc] else {
return None;
};
let body = split_pc + 1;
let exit = if *a == body {
*b
} else if *b == body {
*a
} else {
return None;
};
if exit > 0 && matches!(prog.get(exit - 1), Some(Inst::Jmp(t)) if *t == split_pc) {
Some((body, exit))
} else {
None
}
}
fn consume_one(inst: &Inst, input: &[u16], sp: usize, flags: Flags) -> Option<usize> {
let (cp, len) = read_cp(input, sp, flags.unicode)?;
let ok = match inst {
Inst::Char(c) => cp_eq(cp, *c, flags),
Inst::Any => flags.dotall || !is_line_term(cp),
Inst::Class(class) => class_matches(class, cp, flags),
_ => false,
};
ok.then_some(len)
}
fn cp_eq(a: u32, b: u32, flags: Flags) -> bool {
if a == b {
return true;
}
if !flags.ignore_case {
return false;
}
match (char::from_u32(a), char::from_u32(b)) {
(Some(ca), Some(cb)) => char_fold_eq(ca, cb),
_ => false,
}
}
fn unit_eq(a: u16, b: u16, flags: Flags) -> bool {
if a == b {
return true;
}
if !flags.ignore_case {
return false;
}
match (char::from_u32(a as u32), char::from_u32(b as u32)) {
(Some(ca), Some(cb)) => char_fold_eq(ca, cb),
_ => false,
}
}
fn char_fold_eq(a: char, b: char) -> bool {
#[cfg(feature = "intl")]
{
intl::unicode::case::case_fold(a).eq(intl::unicode::case::case_fold(b))
}
#[cfg(not(feature = "intl"))]
{
a.eq_ignore_ascii_case(&b) || a.to_lowercase().eq(b.to_lowercase())
}
}
fn is_line_term(c: u32) -> bool {
matches!(c, 0x0A | 0x0D | 0x2028 | 0x2029)
}
fn is_word(c: u32) -> bool {
matches!(c, 0x30..=0x39 | 0x41..=0x5A | 0x61..=0x7A | 0x5F)
}
fn class_matches(class: &Class, c: u32, flags: Flags) -> bool {
let mut hit = false;
for m in &class.members {
let matched = match m {
ClassMember::Char(ch) => cp_eq(c, *ch, flags),
ClassMember::Range(lo, hi) => {
(c >= *lo && c <= *hi) || (flags.ignore_case && range_fold_hit(c, *lo, *hi))
}
ClassMember::Shorthand(s) => shorthand_matches(*s, c),
};
if matched {
hit = true;
break;
}
}
hit ^ class.neg
}
fn range_fold_hit(c: u32, lo: u32, hi: u32) -> bool {
let Some(ch) = char::from_u32(c) else {
return false;
};
let cl = ch.to_ascii_lowercase() as u32;
let cu = ch.to_ascii_uppercase() as u32;
(cl >= lo && cl <= hi) || (cu >= lo && cu <= hi)
}
fn shorthand_matches(s: Shorthand, c: u32) -> bool {
match s {
Shorthand::Digit => is_ascii_digit(c),
Shorthand::NotDigit => !is_ascii_digit(c),
Shorthand::Word => is_word(c),
Shorthand::NotWord => !is_word(c),
Shorthand::Space => is_space(c),
Shorthand::NotSpace => !is_space(c),
Shorthand::Property(kind, neg) => property_matches(kind, c) ^ neg,
}
}
fn is_ascii_digit(c: u32) -> bool {
(0x30..=0x39).contains(&c)
}
fn is_space(c: u32) -> bool {
char::from_u32(c).is_some_and(|ch| ch.is_whitespace())
}
fn property_matches(kind: PropKind, c: u32) -> bool {
let Some(c) = char::from_u32(c) else {
return false;
};
match kind {
PropKind::Letter => c.is_alphabetic(),
PropKind::Upper => c.is_uppercase(),
PropKind::Lower => c.is_lowercase(),
PropKind::Number => c.is_numeric(),
PropKind::White => c.is_whitespace(),
PropKind::Alnum => c.is_alphanumeric(),
PropKind::Gc(code) => general_category_matches(code, c),
}
}
#[cfg(feature = "intl")]
fn general_category_matches(code: [u8; 2], c: char) -> bool {
use intl::unicode::category::Group;
let gc = intl::unicode::general_category(c);
if code[1] == 0 {
let want = match code[0] {
b'L' => Group::Letter,
b'M' => Group::Mark,
b'N' => Group::Number,
b'P' => Group::Punctuation,
b'S' => Group::Symbol,
b'Z' => Group::Separator,
b'C' => Group::Other,
_ => return false,
};
gc.group() == want
} else {
gc.abbr().as_bytes() == code
}
}
#[cfg(not(feature = "intl"))]
fn general_category_matches(code: [u8; 2], c: char) -> bool {
match &code {
b"L\0" => c.is_alphabetic(),
b"N\0" => c.is_numeric(),
b"Z\0" => c == ' ' || (c.is_whitespace() && !c.is_control()),
b"C\0" => c.is_control(),
b"P\0" => c.is_ascii_punctuation(),
b"Lu" => c.is_uppercase(),
b"Ll" => c.is_lowercase(),
b"Lo" => c.is_alphabetic() && !c.is_uppercase() && !c.is_lowercase(),
b"Nd" => c.is_ascii_digit(),
b"Cc" => c.is_control(),
_ => false,
}
}
fn assert_ok(assert: &Assert, input: &[u16], sp: usize, flags: Flags) -> bool {
match assert {
Assert::Start => sp == 0 || (flags.multiline && is_line_term(input[sp - 1] as u32)),
Assert::End => sp == input.len() || (flags.multiline && is_line_term(input[sp] as u32)),
Assert::WordBoundary => is_boundary(input, sp),
Assert::NotWordBoundary => !is_boundary(input, sp),
}
}
fn is_boundary(input: &[u16], sp: usize) -> bool {
let before = sp > 0 && is_word(input[sp - 1] as u32);
let after = sp < input.len() && is_word(input[sp] as u32);
before != after
}