perl-regex 0.16.0

Regex parsing and validation helpers for Perl syntax
Documentation
use crate::syntax::cursor::quoted_literal_end;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct GroupFrame {
    has_backtracking_quantifier: bool,
    is_atomic: bool,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LastAtom {
    None,
    Other,
    Group { has_backtracking_quantifier: bool, is_atomic: bool },
}

pub(crate) fn detect_nested_quantifiers(pattern: &str) -> bool {
    find_nested_quantifier(pattern, 0).is_some()
}

pub(crate) fn find_nested_quantifier(pattern: &str, start_pos: usize) -> Option<usize> {
    let bytes = pattern.as_bytes();
    let mut i = 0;
    let mut group_stack = Vec::new();
    let mut last_atom = LastAtom::None;

    while i < bytes.len() {
        match bytes[i] {
            b'\\' => {
                if let Some(end) = quoted_literal_end(bytes, i) {
                    i = end;
                    last_atom = LastAtom::None;
                    continue;
                }
                i += 2;
                last_atom = LastAtom::None;
                continue;
            }
            b'[' => {
                i = skip_char_class(bytes, i + 1);
                last_atom = LastAtom::Other;
                continue;
            }
            b'(' => {
                let (next, is_atomic) = skip_group_prefix(bytes, i);
                group_stack.push(GroupFrame { has_backtracking_quantifier: false, is_atomic });
                i = next;
                last_atom = LastAtom::None;
                continue;
            }
            b')' => {
                if let Some(frame) = group_stack.pop() {
                    let has_backtracking_quantifier =
                        frame.has_backtracking_quantifier && !frame.is_atomic;
                    if has_backtracking_quantifier {
                        if let Some(parent) = group_stack.last_mut() {
                            parent.has_backtracking_quantifier = true;
                        }
                    }
                    last_atom =
                        LastAtom::Group { has_backtracking_quantifier, is_atomic: frame.is_atomic };
                } else {
                    last_atom = LastAtom::None;
                }
                i += 1;
                continue;
            }
            b'+' | b'*' | b'?' | b'{' => {
                if let Some(quantifier) = quantifier_at(bytes, i) {
                    if !quantifier.is_possessive {
                        if let LastAtom::Group { has_backtracking_quantifier: true, .. } = last_atom
                        {
                            return Some(start_pos + i);
                        }

                        if let Some(parent) = group_stack.last_mut() {
                            if !matches!(last_atom, LastAtom::Group { is_atomic: true, .. }) {
                                parent.has_backtracking_quantifier = true;
                            }
                        }
                    }
                    i += quantifier.len;
                    last_atom = LastAtom::None;
                    continue;
                }
            }
            _ => {}
        }

        last_atom = LastAtom::Other;
        i += 1;
    }

    None
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Quantifier {
    len: usize,
    is_possessive: bool,
}

fn quantifier_at(bytes: &[u8], i: usize) -> Option<Quantifier> {
    let quantifier_len = match bytes.get(i).copied() {
        Some(b'+' | b'*' | b'?') => 1,
        Some(b'{') => brace_quantifier_len(bytes, i)?,
        _ => return None,
    };
    let is_possessive = bytes.get(i + quantifier_len) == Some(&b'+');
    Some(Quantifier { len: quantifier_len + usize::from(is_possessive), is_possessive })
}

fn brace_quantifier_len(bytes: &[u8], start: usize) -> Option<usize> {
    let mut i = start + 1;
    let mut has_digit = false;
    let mut has_comma = false;

    while i < bytes.len() {
        match bytes[i] {
            ch if ch.is_ascii_digit() => has_digit = true,
            b',' if !has_comma => has_comma = true,
            b'}' if has_digit => return Some(i - start + 1),
            _ => return None,
        }
        i += 1;
    }

    None
}

fn skip_group_prefix(bytes: &[u8], start: usize) -> (usize, bool) {
    let mut i = start + 1;
    let mut is_atomic = false;

    if bytes.get(i) == Some(&b'?') {
        i += 1;
        if bytes.get(i) == Some(&b'>') {
            is_atomic = true;
            i += 1;
        } else if i < bytes.len()
            && matches!(bytes[i], b':' | b'=' | b'!' | b'<' | b'|' | b'P' | b'#')
        {
            i += 1;
        }
    }

    (i, is_atomic)
}

fn skip_char_class(bytes: &[u8], mut i: usize) -> usize {
    while i < bytes.len() {
        match bytes[i] {
            b'\\' => i += 2,
            b']' => return i + 1,
            _ => i += 1,
        }
    }
    i
}