// quamina 0.6.0 - Docs.rs

//! Regexp parsing for I-Regexp (RFC 9485 subset).
//!
//! This module parses regexp strings into a tree structure for NFA construction.
//! Supports:
//! - `.` matches any character
//! - `[...]` character classes with ranges
//! - `[^...]` negated character classes
//! - `|` alternation
//! - `(...)` grouping
//! - `?` optional quantifier
//! - `+` one-or-more quantifier
//! - `*` zero-or-more quantifier
//! - `{n,m}` range quantifiers
//! - `(?=...)`, `(?!...)`, `(?<=...)`, `(?<!...)` lookaround assertions
//! - `~b`, `~B` word-boundary assertions
//!
//! The escape character is `~` (not `\`) to avoid JSON escaping issues.

use crate::unicode_categories::{get_block_ranges, get_category_ranges};

/// A pair of runes representing an inclusive range [lo, hi].
///
/// A single character is expressed with `lo == hi`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RunePair {
    /// First character of the range (inclusive).
    pub lo: char,
    /// Last character of the range (inclusive).
    pub hi: char,
}

/// A collection of rune pairs representing a character class.
///
/// A character belongs to the class when it falls inside any of the pairs.
pub type RuneRange = Vec<RunePair>;

/// Maximum quantifier value (Go uses 100).
///
/// This is also the `quant_max` value that `QuantifiedAtom::is_plus` and
/// `QuantifiedAtom::is_star` recognise as the upper bound of `+` and `*`.
pub const REGEXP_QUANTIFIER_MAX: i32 = 100;

/// The escape character (~ instead of \ to avoid JSON escaping).
const ESCAPE: char = '~';

/// Maximum Unicode code point value.
pub const RUNE_MAX: char = '\u{10FFFF}';

/// Surrogate range boundaries (these are invalid Unicode code points for chars)
/// First code point of the surrogate block.
const SURROGATE_START_CP: u32 = 0xD800;
/// Last code point of the surrogate block.
const SURROGATE_END_CP: u32 = 0xDFFF;

/// A quantified atom in the regexp tree.
///
/// An atom is one matchable unit (dot, character class, group, lookaround or
/// word-boundary marker) together with the `{quant_min, quant_max}` repetition
/// bounds that apply to it.
#[derive(Debug, Clone)]
pub struct QuantifiedAtom {
    /// True if this is a `.` (matches any character)
    pub is_dot: bool,
    /// Rune ranges for character matching
    pub runes: RuneRange,
    /// Minimum number of times to match
    pub quant_min: i32,
    /// Maximum number of times to match
    /// (`REGEXP_QUANTIFIER_MAX` for `+` and `*`, see `is_plus` / `is_star`).
    pub quant_max: i32,
    /// Subtree for parenthesized groups
    pub subtree: Option<Root>,
    /// Cache key for large Unicode categories (e.g., "L", "Lu", "-L" for negated)
    /// Used to cache pre-built FA shells for performance.
    pub cache_key: Option<String>,
    /// Lookaround assertion type (if this atom is a lookaround group)
    pub lookaround: Option<LookaroundType>,
    /// For ASCII-only negated patterns like `[^x]`, stores the negated bytes (1-3).
    /// Used for memchr acceleration - since JSON input is valid UTF-8, we don't
    /// need UTF-8 validation during matching, so these are the only exit bytes.
    pub ascii_negated_bytes: Option<Vec<u8>>,
    /// Word boundary marker: Some(true) = `~b`, Some(false) = `~B`.
    /// `None` means the atom is not a word-boundary marker.
    pub is_word_boundary: Option<bool>,
}

impl Default for QuantifiedAtom {
    /// Builds a blank atom: no matcher of any kind is set and the quantifier
    /// is `{1,1}`, i.e. "match exactly once".
    fn default() -> Self {
        Self {
            // Repetition bounds: exactly one occurrence.
            quant_min: 1,
            quant_max: 1,
            // No matcher selected yet.
            is_dot: false,
            runes: Vec::new(),
            subtree: None,
            lookaround: None,
            is_word_boundary: None,
            // No acceleration hints.
            cache_key: None,
            ascii_negated_bytes: None,
        }
    }
}

impl QuantifiedAtom {
    /// Returns true if this atom matches exactly once (no quantifier).
    #[inline]
    #[must_use]
    pub const fn is_singleton(&self) -> bool {
        self.quant_min == 1 && self.quant_max == 1
    }

    /// Returns true if this atom is optional (?).
    #[inline]
    #[must_use]
    pub const fn is_qm(&self) -> bool {
        self.quant_min == 0 && self.quant_max == 1
    }

    /// Returns true if this atom uses + (one or more).
    #[inline]
    #[must_use]
    pub const fn is_plus(&self) -> bool {
        self.quant_min == 1 && self.quant_max == REGEXP_QUANTIFIER_MAX
    }

    /// Returns true if this atom uses * (zero or more).
    #[inline]
    #[must_use]
    pub const fn is_star(&self) -> bool {
        self.quant_min == 0 && self.quant_max == REGEXP_QUANTIFIER_MAX
    }
}

/// A branch in the regexp (sequence of atoms).
///
/// The atoms are stored in the order they appear in the pattern.
pub type Branch = Vec<QuantifiedAtom>;

/// The root of a parsed regexp (alternatives separated by |).
///
/// The same type is used for the subtree of a parenthesized group
/// (see `QuantifiedAtom::subtree`).
pub type Root = Vec<Branch>;

/// The four kinds of lookaround assertion a group atom can represent.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LookaroundType {
    /// `(?=...)` - positive lookahead: match if followed by pattern
    PositiveLookahead,
    /// `(?!...)` - negative lookahead: match if NOT followed by pattern
    NegativeLookahead,
    /// `(?<=...)` - positive lookbehind: match if preceded by pattern
    PositiveLookbehind,
    /// `(?<!...)` - negative lookbehind: match if NOT preceded by pattern
    NegativeLookbehind,
}

impl LookaroundType {
    /// Reports whether the assertion is one of the negated forms,
    /// `(?!...)` or `(?<!...)`.
    #[must_use]
    pub const fn is_negative(&self) -> bool {
        match self {
            Self::NegativeLookahead | Self::NegativeLookbehind => true,
            Self::PositiveLookahead | Self::PositiveLookbehind => false,
        }
    }

    /// Reports whether the assertion looks backwards,
    /// i.e. is `(?<=...)` or `(?<!...)`.
    #[must_use]
    pub const fn is_lookbehind(&self) -> bool {
        match self {
            Self::PositiveLookbehind | Self::NegativeLookbehind => true,
            Self::PositiveLookahead | Self::NegativeLookahead => false,
        }
    }
}

/// Features found during parsing (for validation).
///
/// The parser records each feature at most once in
/// `RegexpParse::found_features`; after parsing, any recorded feature that is
/// missing from `IMPLEMENTED_FEATURES` makes `parse` fail with an
/// "unimplemented features" error.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegexpFeature {
    Dot,
    Star,
    Plus,
    QuestionMark,
    Range,
    ParenGroup,
    NonCapturingGroup,
    LazyQuantifier,
    Property,
    Class,
    NegatedClass,
    /// Alternation bar `|` (recorded by `read_branches`).
    OrBar,
    /// Lookaround assertion (?=, ?!, ?<=, ?<!)
    Lookaround,
    /// Word boundary assertion (~b, ~B)
    WordBoundary,
}

/// Features that are implemented in the NFA builder.
///
/// Every `RegexpFeature` variant is currently listed, so
/// `RegexpParse::found_unimplemented` returns an empty list for any pattern.
/// A newly added variant is treated as unimplemented until it is added here.
const IMPLEMENTED_FEATURES: &[RegexpFeature] = &[
    RegexpFeature::Dot,
    RegexpFeature::Class,
    RegexpFeature::NegatedClass,
    RegexpFeature::OrBar,
    RegexpFeature::ParenGroup,
    RegexpFeature::NonCapturingGroup,
    RegexpFeature::LazyQuantifier,
    RegexpFeature::QuestionMark,
    RegexpFeature::Plus,
    RegexpFeature::Star,
    RegexpFeature::Range,
    RegexpFeature::Property,
    RegexpFeature::Lookaround,
    RegexpFeature::WordBoundary,
];

/// Parser state for regexp parsing.
struct RegexpParse {
    /// UTF-8 bytes of the pattern being parsed.
    bytes: Vec<u8>,
    /// Byte offset of the next rune to decode.
    index: usize,
    /// Byte offset at which the most recent `next_rune` call started decoding.
    last_index: usize,
    /// Stack of enclosing trees saved by `nest` and restored by `unnest`.
    nesting: Vec<Root>,
    /// Distinct features recorded so far, in first-seen order.
    found_features: Vec<RegexpFeature>,
    /// Tree for the nesting level currently being parsed.
    tree: Root,
}

/// Failure reported by the regexp parser.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Error {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Byte offset into the pattern that the error refers to.
    pub offset: usize,
}

impl std::fmt::Display for Error {
    /// Renders the error as `<message> at offset <offset>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { message, offset } = self;
        write!(f, "{message} at offset {offset}")
    }
}

impl std::error::Error for Error {}

impl RegexpParse {
    fn new(re: &str) -> Self {
        Self {
            bytes: re.as_bytes().to_vec(),
            index: 0,
            last_index: 0,
            nesting: Vec::new(),
            found_features: Vec::new(),
            tree: Vec::new(),
        }
    }

    fn nest(&mut self) {
        self.nesting.push(std::mem::take(&mut self.tree));
    }

    fn unnest(&mut self) -> Root {
        let subtree = std::mem::take(&mut self.tree);
        self.tree = self.nesting.pop().unwrap_or_default();
        subtree
    }

    const fn is_nested(&self) -> bool {
        !self.nesting.is_empty()
    }

    fn next_rune(&mut self) -> Result<char, Error> {
        if self.index >= self.bytes.len() {
            return Err(Error {
                message: "end of string".into(),
                offset: self.index,
            });
        }
        self.last_index = self.index;

        // Decode UTF-8
        let remaining = &self.bytes[self.index..];
        let s = std::str::from_utf8(remaining).map_err(|_| Error {
            message: "UTF-8 encoding error".into(),
            offset: self.index,
        })?;

        let c = s.chars().next().ok_or_else(|| Error {
            message: "empty string".into(),
            offset: self.index,
        })?;

        self.index += c.len_utf8();
        Ok(c)
    }

    fn require(&mut self, wanted: char) -> Result<(), Error> {
        let got = self.next_rune()?;
        if got != wanted {
            return Err(Error {
                message: format!("expected '{wanted}', got '{got}'"),
                offset: self.last_index,
            });
        }
        Ok(())
    }

    fn bypass_optional(&mut self, c: char) -> Result<bool, Error> {
        let next = self.next_rune()?;
        if next != c {
            self.backup1(next);
        }
        Ok(next == c)
    }

    const fn backup1(&mut self, one_rune: char) {
        self.index -= one_rune.len_utf8();
    }

    const fn is_empty(&self) -> bool {
        self.index >= self.bytes.len()
    }

    fn record_feature(&mut self, feature: RegexpFeature) {
        if !self.found_features.contains(&feature) {
            self.found_features.push(feature);
        }
    }

    fn found_unimplemented(&self) -> Vec<RegexpFeature> {
        self.found_features
            .iter()
            .filter(|f| !IMPLEMENTED_FEATURES.contains(f))
            .copied()
            .collect()
    }
}

// ============================================================================
// Lookaround Validation
// ============================================================================

/// Reports whether any lookaround atom sitting directly in one of `tree`'s
/// branches contains another lookaround anywhere inside its own subtree.
///
/// Only atoms at the top level of `tree` are considered as the *outer*
/// lookaround; plain groups are not descended into by this function.
/// Nested lookarounds are not supported and should be rejected.
fn has_nested_lookaround(tree: &Root) -> bool {
    tree.iter().flatten().any(|atom| {
        atom.lookaround.is_some()
            && atom
                .subtree
                .as_ref()
                .is_some_and(|inner| tree_has_lookaround(inner))
    })
}

/// Recursively check if a tree contains any lookaround at any depth.
fn tree_has_lookaround(tree: &Root) -> bool {
    for branch in tree {
        for atom in branch {
            if atom.lookaround.is_some() {
                return true;
            }
            if let Some(subtree) = &atom.subtree
                && tree_has_lookaround(subtree)
            {
                return true;
            }
        }
    }
    false
}

/// Checks the lookaround constructs of a parsed tree.
///
/// Rejects, in this order:
/// 1. a lookaround nested inside another lookaround, and
/// 2. a top-level lookbehind whose subtree can match strings of differing length.
///
/// Returns `Ok(())` when neither applies, otherwise `Err` with a message.
fn validate_lookarounds(tree: &Root) -> Result<(), String> {
    if has_nested_lookaround(tree) {
        return Err("nested lookaround not supported: `(?=...(?=...)...)`".into());
    }

    // Lookbehind bodies must have one fixed length.
    let has_bad_lookbehind = tree.iter().flatten().any(|atom| {
        let is_lookbehind = atom.lookaround.is_some_and(|kind| kind.is_lookbehind());
        is_lookbehind
            && atom
                .subtree
                .as_ref()
                .is_some_and(|body| has_variable_length_pattern(body))
    });
    if has_bad_lookbehind {
        let message =
            "variable-length lookbehind not yet supported: `(?<=a+)`; use fixed-length like `(?<=aaa)`";
        return Err(message.into());
    }

    Ok(())
}

/// Reports whether `tree` can match strings of different lengths.
///
/// Used to validate lookbehind patterns, which must be fixed-length. A tree
/// with no branches counts as fixed-length. Otherwise the tree is variable
/// when its first branch has no fixed length, or when any later branch's
/// length differs from the first one's.
fn has_variable_length_pattern(tree: &Root) -> bool {
    let Some((head, alternatives)) = tree.split_first() else {
        return false;
    };
    match branch_fixed_length(head) {
        None => true,
        expected => alternatives
            .iter()
            .any(|branch| branch_fixed_length(branch) != expected),
    }
}

/// Calculate the fixed length of a branch, or None if variable length.
///
/// Length is counted in characters, not UTF-8 bytes: a dot or a character
/// class contributes 1, a group contributes the common length of its
/// alternatives, and any other atom contributes 0. Each contribution is
/// multiplied by the atom's repeat count.
///
/// Returns `None` when
/// - an atom's `quant_min` differs from its `quant_max`,
/// - a group's alternatives do not all share one fixed length,
/// - a repeat count is negative, or
/// - the total does not fit in a `usize`.
fn branch_fixed_length(branch: &Branch) -> Option<usize> {
    let mut total = 0usize;
    for atom in branch {
        // A fixed-length atom repeats a fixed number of times, so quant_min must
        // equal quant_max; `*`, `+`, and `{m,n}` with m != n are all variable.
        if atom.quant_min != atom.quant_max {
            return None;
        }

        let atom_len = if atom.is_dot || !atom.runes.is_empty() {
            // Dot and character classes match exactly one character; the
            // UTF-8 width may vary but the character count does not.
            1
        } else if let Some(subtree) = &atom.subtree {
            match subtree.split_first() {
                // A group without any branch can only match the empty string.
                // (Indexing `subtree[0]` here would panic.)
                None => 0,
                Some((first, others)) => {
                    // Every alternative must agree with the first one.
                    let first_len = branch_fixed_length(first)?;
                    if others
                        .iter()
                        .any(|alt| branch_fixed_length(alt) != Some(first_len))
                    {
                        return None;
                    }
                    first_len
                }
            }
        } else {
            0 // Empty atom
        };

        // Multiply by quantifier. The parser doesn't produce negative counts
        // here, but if that ever changed we'd rather call the length unknown
        // (None) than wrap a -1 into a giant usize. The same goes for nested
        // bounded repeats whose product overflows: checked arithmetic turns
        // that into None instead of a panic or a wrapped value.
        let repeats = usize::try_from(atom.quant_min).ok()?;
        total = total.checked_add(atom_len.checked_mul(repeats)?)?;
    }
    Some(total)
}

/// Collect all lookaround atoms from a tree with their positions.
/// Used for pattern transformation.
#[must_use]
pub fn collect_lookarounds(tree: &Root) -> Vec<(usize, usize, LookaroundType, Root)> {
    let mut result = Vec::new();
    for (branch_idx, branch) in tree.iter().enumerate() {
        for (atom_idx, atom) in branch.iter().enumerate() {
            if let Some(la_type) = atom.lookaround
                && let Some(subtree) = &atom.subtree
            {
                result.push((branch_idx, atom_idx, la_type, subtree.clone()));
            }
        }
    }
    result
}

/// Reports whether any atom directly in one of `tree`'s branches is a
/// lookaround. Subtrees are not inspected.
#[must_use]
pub fn has_top_level_lookaround(tree: &Root) -> bool {
    tree.iter()
        .flatten()
        .any(|atom| atom.lookaround.is_some())
}

/// Reports whether any atom directly in one of `tree`'s branches is a word
/// boundary marker (`~b` or `~B`). Subtrees are not inspected.
#[must_use]
pub fn has_word_boundary(tree: &Root) -> bool {
    tree.iter()
        .flatten()
        .any(|atom| atom.is_word_boundary.is_some())
}

/// The word-character class `[a-zA-Z0-9_]`, as four ranges in that order.
fn word_char_runes() -> RuneRange {
    [('a', 'z'), ('A', 'Z'), ('0', '9'), ('_', '_')]
        .into_iter()
        .map(|(lo, hi)| RunePair { lo, hi })
        .collect()
}

/// The complement of the word-character class: `invert_rune_range` applied
/// to `word_char_runes()`.
fn non_word_char_runes() -> RuneRange {
    let word_chars = word_char_runes();
    invert_rune_range(word_chars)
}

/// Intersects two character classes.
///
/// Every pair of `a` is clipped against every pair of `b`; the non-empty
/// overlaps are then passed through `simplify_rune_range`. The result holds
/// exactly the characters present in BOTH inputs.
fn intersect_rune_ranges(a: &RuneRange, b: &RuneRange) -> RuneRange {
    let overlaps: RuneRange = a
        .iter()
        .flat_map(|left| {
            b.iter().filter_map(move |right| {
                let lo = left.lo.max(right.lo);
                let hi = left.hi.min(right.hi);
                (lo <= hi).then_some(RunePair { lo, hi })
            })
        })
        .collect();
    simplify_rune_range(overlaps)
}

/// Result of constraining an atom's character class at a word boundary.
/// For singleton atoms, a single constrained atom is returned.
/// For quantified atoms (*, +, {n,m}), we may need to split into
/// "base quantified part" + "constrained last/first char".
///
/// Every variant describes the case where the atom matches at least one
/// character; `expand_constrained` turns it into the concrete atom sequence.
enum ConstrainedAtom {
    /// Single atom with class intersection applied
    Single(QuantifiedAtom),
    /// Quantified atom that needs splitting: (base_quantified, constrained_single)
    /// Used when we need to constrain only the last/first char of a quantified run.
    Split(QuantifiedAtom, QuantifiedAtom),
    /// Same payload as `Split` — (base_quantified, constrained_single) — but
    /// produced for an atom with quant_min=0 (can match zero chars).
    /// The payload covers the case where the atom matches 1+ chars.
    /// The caller must also generate a branch where this atom is absent entirely
    /// (for the zero-match case where the boundary is at the value edge).
    SplitOrAbsent(QuantifiedAtom, QuantifiedAtom),
}

/// Constrain an atom's character class at a word boundary position.
///
/// For singleton atoms, intersects the class directly.
/// For quantified atoms (*, +, {n,m} where the boundary-adjacent occurrence
/// needs constraining), splits into base + constrained single.
///
/// `is_last_char`: true if we're constraining the last char (prefix side),
///                 false if constraining the first char (suffix side).
///
/// Returns `None` when the atom cannot be constrained:
/// - it is a group (has a subtree),
/// - it can never occur (`quant_max < 1`, e.g. `{0}`), or
/// - its class has no character in common with `class`.
///
/// The returned value always describes the atom matching at least once.
/// NOTE(review): a `?` atom ({0,1}) yields `Single`, not `SplitOrAbsent`, so
/// a caller that relies on `SplitOrAbsent` to learn that the atom may be
/// absent will not see it for `?`; checking `atom.quant_min == 0` is the
/// reliable test.
fn constrain_atom_at_boundary(
    atom: &QuantifiedAtom,
    class: &RuneRange,
    is_last_char: bool,
) -> Option<ConstrainedAtom> {
    if atom.subtree.is_some() {
        // Group atom — can't easily intersect
        return None;
    }

    if atom.quant_max < 1 {
        // The atom never matches a character, so there is no boundary-adjacent
        // occurrence to constrain. Without this guard the split below would
        // build a base atom with a negative `quant_max`.
        return None;
    }

    let base_runes = if atom.is_dot {
        // Dot matches any char, use full Unicode range for intersection
        vec![RunePair {
            lo: '\0',
            hi: RUNE_MAX,
        }]
    } else {
        atom.runes.clone()
    };

    let intersected = intersect_rune_ranges(&base_runes, class);
    if intersected.is_empty() {
        return None;
    }

    // Determine cache key: if the result equals ~w or ~W, cache the FA shell
    let cache_key = if intersected == simplify_rune_range(word_char_runes()) {
        Some("wb_w".to_string())
    } else if intersected == simplify_rune_range(non_word_char_runes()) {
        Some("wb_W".to_string())
    } else {
        None
    };

    // The one occurrence that touches the boundary, restricted to `class`.
    let constrained_single = QuantifiedAtom {
        runes: intersected,
        cache_key,
        ..Default::default()
    };

    if atom.is_singleton() {
        return Some(ConstrainedAtom::Single(constrained_single));
    }

    // Adjust the base quantifier: reduce count by 1 since we split off one char.
    // Same logic for both is_last_char and !is_last_char.
    let _ = is_last_char; // Used for documentation; both sides reduce by 1
    let (new_min, new_max) = (
        (atom.quant_min - 1).max(0),
        if atom.quant_max == REGEXP_QUANTIFIER_MAX {
            // Treated as unbounded: stays at the maximum.
            REGEXP_QUANTIFIER_MAX
        } else {
            atom.quant_max - 1
        },
    );

    if new_max == 0 {
        // Only reachable with quant_max == 1 and quant_min == 0, i.e. `?`
        // ({1,1} was handled by the singleton check above). Nothing is left
        // for a base part, so the present case is just the constrained single.
        return Some(ConstrainedAtom::Single(constrained_single));
    }

    // The remaining, unconstrained occurrences keep the original class.
    let base = QuantifiedAtom {
        is_dot: atom.is_dot,
        runes: atom.runes.clone(),
        quant_min: new_min,
        quant_max: new_max,
        cache_key: atom.cache_key.clone(),
        ascii_negated_bytes: atom.ascii_negated_bytes.clone(),
        ..Default::default()
    };

    // If the original atom can match zero times (quant_min == 0, e.g. `*` or
    // `{0,n}` with n > 1), the atom might be absent entirely. Return
    // SplitOrAbsent so the caller can generate an additional branch without
    // this atom.
    if atom.quant_min == 0 {
        Some(ConstrainedAtom::SplitOrAbsent(base, constrained_single))
    } else {
        Some(ConstrainedAtom::Split(base, constrained_single))
    }
}

/// Turns a constrained atom into the list of atom sequences that replace it.
///
/// The list always holds exactly one sequence:
/// - `Single` gives `[atom]`;
/// - `Split` / `SplitOrAbsent` give `[base, single]` when `base_first` is true
///   (prefix / last-char side) and `[single, base]` otherwise
///   (suffix / first-char side).
fn expand_constrained(ca: &ConstrainedAtom, base_first: bool) -> Vec<Vec<QuantifiedAtom>> {
    let sequence = match ca {
        ConstrainedAtom::Single(only) => vec![only.clone()],
        ConstrainedAtom::Split(base, single) | ConstrainedAtom::SplitOrAbsent(base, single) => {
            let (head, tail) = if base_first {
                (base, single)
            } else {
                (single, base)
            };
            vec![head.clone(), tail.clone()]
        }
    };
    vec![sequence]
}

/// Expand a `~b`/`~B` at value start (no prefix atoms before the boundary).
///
/// The `"` delimiter before the value is non-word, so:
/// - `~b`: first char must be word
/// - `~B`: first char must be non-word
fn expand_wb_at_start(suffix: &[QuantifiedAtom], is_boundary: bool, out: &mut Vec<Branch>) {
    let wc = word_char_runes();
    let nwc = non_word_char_runes();
    let required_class = if is_boundary { &wc } else { &nwc };

    let Some(constrained) = constrain_atom_at_boundary(&suffix[0], required_class, false) else {
        return;
    };

    for atoms in expand_constrained(&constrained, false) {
        let mut branch = atoms;
        branch.extend_from_slice(&suffix[1..]);
        out.push(branch);
    }

    // SplitOrAbsent: the first suffix atom matched 0 chars, so the boundary
    // falls at value start. Constrain the next real atom instead.
    if matches!(constrained, ConstrainedAtom::SplitOrAbsent(..))
        && suffix.len() > 1
        && let Some(c2) = constrain_atom_at_boundary(&suffix[1], required_class, false)
    {
        for atoms in expand_constrained(&c2, false) {
            let mut branch = atoms;
            branch.extend_from_slice(&suffix[2..]);
            out.push(branch);
        }
    }
}

/// Expand a `~b`/`~B` at value end (no suffix atoms after the boundary).
///
/// The `"` delimiter after the value is non-word, so:
/// - `~b`: last char must be word
/// - `~B`: last char must be non-word
fn expand_wb_at_end(prefix: &[QuantifiedAtom], is_boundary: bool, out: &mut Vec<Branch>) {
    let wc = word_char_runes();
    let nwc = non_word_char_runes();
    let required_class = if is_boundary { &wc } else { &nwc };
    let last_idx = prefix.len() - 1;

    let Some(constrained) = constrain_atom_at_boundary(&prefix[last_idx], required_class, true)
    else {
        return;
    };

    for atoms in expand_constrained(&constrained, true) {
        let mut branch = prefix[..last_idx].to_vec();
        branch.extend(atoms);
        out.push(branch);
    }

    // SplitOrAbsent: last prefix atom matched 0 chars, so boundary falls
    // at the end after the preceding atom. Constrain that one instead.
    if matches!(constrained, ConstrainedAtom::SplitOrAbsent(..)) && last_idx > 0 {
        let prev = last_idx - 1;
        if let Some(c2) = constrain_atom_at_boundary(&prefix[prev], required_class, true) {
            for atoms in expand_constrained(&c2, true) {
                let mut branch = prefix[..prev].to_vec();
                branch.extend(atoms);
                out.push(branch);
            }
        }
    }
}

/// Expand a `~b`/`~B` in the middle (between prefix and suffix atoms).
///
/// Both sides are constrained: the last char of prefix and the first char of suffix
/// must be in opposite word-classes (`~b`) or the same class (`~B`).
fn expand_wb_in_middle(
    prefix: &[QuantifiedAtom],
    suffix: &[QuantifiedAtom],
    is_boundary: bool,
    out: &mut Vec<Branch>,
) {
    let wc = word_char_runes();
    let nwc = non_word_char_runes();
    let last_idx = prefix.len() - 1;

    // ~b: (last=word, first=nonword) OR (last=nonword, first=word)
    // ~B: (last=word, first=word) OR (last=nonword, first=nonword)
    let class_pairs: [(&RuneRange, &RuneRange); 2] = if is_boundary {
        [(&wc, &nwc), (&nwc, &wc)]
    } else {
        [(&wc, &wc), (&nwc, &nwc)]
    };

    for (last_class, first_class) in &class_pairs {
        let cl = constrain_atom_at_boundary(&prefix[last_idx], last_class, true);
        let cf = constrain_atom_at_boundary(&suffix[0], first_class, false);

        let (Some(cl), Some(cf)) = (&cl, &cf) else {
            continue;
        };

        // Generate all combinations from constrained prefix × suffix
        for pe in &expand_constrained(cl, true) {
            for se in &expand_constrained(cf, false) {
                let mut branch = prefix[..last_idx].to_vec();
                branch.extend(pe.clone());
                branch.extend(se.clone());
                branch.extend_from_slice(&suffix[1..]);
                out.push(branch);
            }
        }

        // SplitOrAbsent on prefix side: prefix atom absent → boundary at value start.
        // The `"` is non-word, so constrain suffix to edge class.
        if matches!(cl, ConstrainedAtom::SplitOrAbsent(..)) {
            let edge_class = if is_boundary { &wc } else { &nwc };
            if let Some(c2) = constrain_atom_at_boundary(&suffix[0], edge_class, false) {
                for se in expand_constrained(&c2, false) {
                    let mut branch = prefix[..last_idx].to_vec();
                    branch.extend(se);
                    branch.extend_from_slice(&suffix[1..]);
                    out.push(branch);
                }
            }
        }

        // SplitOrAbsent on suffix side: suffix atom absent → boundary at value end.
        if matches!(cf, ConstrainedAtom::SplitOrAbsent(..)) {
            let edge_class = if is_boundary { &wc } else { &nwc };
            if let Some(c2) = constrain_atom_at_boundary(&prefix[last_idx], edge_class, true) {
                for pe in expand_constrained(&c2, true) {
                    let mut branch = prefix[..last_idx].to_vec();
                    branch.extend(pe);
                    branch.extend_from_slice(&suffix[1..]);
                    out.push(branch);
                }
            }
        }
    }
}

/// Rewrites every `~b`/`~B` marker in `tree` into plain branches by
/// intersecting the neighbouring atoms' character classes with the word /
/// non-word classes.
///
/// For `A~bB`, the last char of A and the first char of B must fall in
/// different word-char classes, which produces alternative branches.
///
/// Branches without markers are copied through unchanged. A branch with more
/// than four markers makes the whole call fail. The returned tree may have
/// more branches than the input, or fewer: a branch whose markers can never
/// be satisfied contributes nothing.
pub fn expand_word_boundaries(tree: &Root) -> Result<Root, String> {
    let mut expanded: Vec<Branch> = Vec::new();

    for branch in tree {
        let marker_count = branch
            .iter()
            .filter(|atom| atom.is_word_boundary.is_some())
            .count();
        match marker_count {
            0 => {
                expanded.push(branch.clone());
                continue;
            }
            n if n > 4 => return Err("too many word boundaries in pattern (max 4)".into()),
            _ => {}
        }

        // Each pass resolves the first remaining marker of every pending
        // alternative; a pass that finds no marker ends the loop.
        let mut pending = vec![branch.clone()];
        loop {
            let mut next_round = Vec::new();
            let mut resolved_any = false;

            for alt in &pending {
                let first_marker = alt
                    .iter()
                    .enumerate()
                    .find_map(|(idx, atom)| atom.is_word_boundary.map(|flag| (idx, flag)));
                let Some((pos, is_boundary)) = first_marker else {
                    // Already marker-free: carry over as is.
                    next_round.push(alt.clone());
                    continue;
                };
                resolved_any = true;

                let (prefix, from_marker) = alt.split_at(pos);
                let suffix = &from_marker[1..];

                if prefix.is_empty() && suffix.is_empty() {
                    // Marker alone: between two `"` (non-word). Boundary = never,
                    // non-boundary = always.
                    if !is_boundary {
                        next_round.push(Vec::new());
                    }
                } else if prefix.is_empty() {
                    expand_wb_at_start(suffix, is_boundary, &mut next_round);
                } else if suffix.is_empty() {
                    expand_wb_at_end(prefix, is_boundary, &mut next_round);
                } else {
                    expand_wb_in_middle(prefix, suffix, is_boundary, &mut next_round);
                }
            }

            pending = next_round;
            if !resolved_any {
                break;
            }
        }

        expanded.extend(pending);
    }

    // An empty result means no valid alternatives exist (e.g., hello~bworld
    // where both sides are word chars). The caller (json.rs) reports this as
    // an InvalidPattern error.
    Ok(expanded)
}

/// Parse a regexp string into a tree structure.
pub fn parse(re: &str) -> Result<Root, Error> {
    let mut parse = RegexpParse::new(re);
    read_branches(&mut parse)?;

    // Check for unclosed parentheses
    if parse.is_nested() {
        return Err(Error {
            message: "unclosed '('".into(),
            offset: parse.index,
        });
    }

    let tree = std::mem::take(&mut parse.tree);

    // Validate lookaround constructs (nested, variable-length lookbehind)
    if parse.found_features.contains(&RegexpFeature::Lookaround)
        && let Err(msg) = validate_lookarounds(&tree)
    {
        return Err(Error {
            message: msg,
            offset: 0,
        });
    }

    let unimplemented = parse.found_unimplemented();
    if !unimplemented.is_empty() {
        return Err(Error {
            message: format!("unimplemented features: {unimplemented:?}"),
            offset: 0,
        });
    }

    Ok(tree)
}

/// Reads `|`-separated branches into `parse.tree` until the input ends or a
/// `)` is reached.
///
/// A `)` is pushed back so the caller can consume it. After a `|` the
/// `OrBar` feature is recorded and the next branch is read. Any other rune
/// found after a branch is consumed and reading continues.
fn read_branches(parse: &mut RegexpParse) -> Result<(), Error> {
    loop {
        if parse.is_empty() {
            return Ok(());
        }

        let branch = read_branch(parse)?;
        parse.tree.push(branch);

        if parse.is_empty() {
            return Ok(());
        }

        match parse.next_rune()? {
            '|' => parse.record_feature(RegexpFeature::OrBar),
            ')' => {
                parse.backup1(')');
                return Ok(());
            }
            // Shouldn't reach here
            _ => {}
        }
    }
}

/// Read a single branch (sequence of pieces).
fn read_branch(parse: &mut RegexpParse) -> Result<Branch, Error> {
    let mut branch = Vec::new();

    loop {
        match read_piece(parse) {
            Ok(piece) => branch.push(piece),
            Err(e) if e.message == "stuck" => break,
            Err(e) if e.message == "end of string" => break,
            Err(e) => return Err(e),
        }
    }

    Ok(branch)
}

/// Reads one piece: an atom followed by whatever quantifier
/// `read_quantifier` applies to it.
fn read_piece(parse: &mut RegexpParse) -> Result<QuantifiedAtom, Error> {
    read_atom(parse).and_then(|mut atom| {
        read_quantifier(parse, &mut atom)?;
        Ok(atom)
    })
}

/// Reports whether `c` is a "normal" character, i.e. one that stands for
/// itself in a pattern.
///
/// Not normal are `(` `)` `*` `+` `.` `?` `[` `]` `{` `|` `}` and `~`.
/// Backslash is accepted as a normal character.
fn is_normal_char(c: char) -> bool {
    matches!(
        u32::from(c),
        0x00..=0x27             // controls, space, ! " # $ % & '
            | 0x2C              // ,
            | 0x2D              // -
            | 0x2F..=0x3E       // / 0-9 : ; < = >
            | 0x40..=0x5A       // @ A-Z
            | 0x5C              // allow backslash
            | 0x5E..=0x7A       // ^ _ ` a-z
            | 0x7F..=0xD7FF     // exclude ~ (0x7E); everything up to the surrogates
            | 0xE000..=0x10FFFF // everything after the surrogates
    )
}

/// Resolves the character that follows the escape character in a
/// single-character escape.
///
/// Returns the character the escape stands for, or `None` when `c` is not a
/// recognised single-character escape:
/// - `(` `)` `*` `+` `-` `.` `?` `[` `\` `]` `^` `{` `|` `}` stand for themselves;
/// - `n`, `r`, `t` stand for newline, carriage return and tab;
/// - the escape character stands for itself.
fn check_single_char_escape(c: char) -> Option<char> {
    match c {
        // ( ) * +   - . ?   [ \ ] ^   { | }
        '('..='+' | '-' | '.' | '?' | '['..='^' | '{'..='}' => Some(c),
        // Special escapes
        'n' => Some('\n'),
        'r' => Some('\r'),
        't' => Some('\t'),
        // Escape itself
        _ if c == ESCAPE => Some(ESCAPE),
        _ => None,
    }
}

/// XML NameStartChar - characters that can start an XML name (XSD `\i`).
/// From W3C XML spec: <https://www.w3.org/TR/xml/#NT-NameStartChar>
///
/// The ranges are returned in the order they are listed below.
fn xml_name_start_char() -> RuneRange {
    // Inclusive (lo, hi) bounds of each range.
    const BOUNDS: &[(char, char)] = &[
        (':', ':'), // 0x3A
        ('A', 'Z'), // 0x41-0x5A
        ('_', '_'), // 0x5F
        ('a', 'z'), // 0x61-0x7A
        ('\u{C0}', '\u{D6}'),
        ('\u{D8}', '\u{F6}'),
        ('\u{F8}', '\u{2FF}'),
        ('\u{370}', '\u{37D}'),
        ('\u{37F}', '\u{1FFF}'),
        ('\u{200C}', '\u{200D}'),
        ('\u{2070}', '\u{218F}'),
        ('\u{2C00}', '\u{2FEF}'),
        ('\u{3001}', '\u{D7FF}'),
        ('\u{F900}', '\u{FDCF}'),
        ('\u{FDF0}', '\u{FFFD}'),
        ('\u{10000}', '\u{EFFFF}'),
    ];

    BOUNDS
        .iter()
        .map(|&(lo, hi)| RunePair { lo, hi })
        .collect()
}

/// XML NameChar - characters that can appear in an XML name (XSD `\c`).
///
/// The result is every NameStartChar range followed by the additional
/// ranges that are only permitted after the first character.
fn xml_name_char() -> RuneRange {
    // Inclusive (lo, hi) bounds allowed in a name but not at its start.
    let extra_bounds = [
        ('-', '-'), // 0x2D
        ('.', '.'), // 0x2E
        ('0', '9'), // 0x30-0x39
        ('\u{B7}', '\u{B7}'),
        ('\u{300}', '\u{36F}'),
        ('\u{203F}', '\u{2040}'),
    ];

    let mut name_chars = xml_name_start_char();
    name_chars.extend(extra_bounds.iter().map(|&(lo, hi)| RunePair { lo, hi }));
    name_chars
}

/// Resolve a multi-character escape, i.e. one that expands to a whole
/// character class rather than a single character.
///
/// `c` is the character that followed the escape character. Returns
/// `Some((ranges, cache_key))` for a recognized escape and `None` otherwise.
/// The XML name escapes (`~i`, `~I`, `~c`, `~C`) carry a cache key so the FA
/// shell built for them can be cached and reused; the others carry `None`.
fn check_multi_char_escape(c: char) -> Option<(RuneRange, Option<String>)> {
    // [0-9]
    let digits = || vec![RunePair { lo: '0', hi: '9' }];
    // [ \t\n\r]
    let whitespace = || -> RuneRange {
        [' ', '\t', '\n', '\r']
            .iter()
            .map(|&ws| RunePair { lo: ws, hi: ws })
            .collect()
    };

    let (runes, cache_key): (RuneRange, Option<&str>) = match c {
        // ~d / ~D: digit and its complement
        'd' => (digits(), None),
        'D' => (invert_rune_range(digits()), None),
        // ~w / ~W: word character and its complement
        'w' => (word_char_runes(), None),
        'W' => (invert_rune_range(word_char_runes()), None),
        // ~s / ~S: whitespace and its complement
        's' => (whitespace(), None),
        'S' => (invert_rune_range(whitespace()), None),
        // ~i / ~I: XML NameStartChar and its complement (cached)
        'i' => (xml_name_start_char(), Some("i")),
        'I' => (invert_rune_range(xml_name_start_char()), Some("-i")),
        // ~c / ~C: XML NameChar and its complement (cached)
        'c' => (xml_name_char(), Some("c")),
        'C' => (invert_rune_range(xml_name_char()), Some("-c")),
        _ => return None,
    };

    Some((runes, cache_key.map(String::from)))
}

/// Read an atom.
fn read_atom(parse: &mut RegexpParse) -> Result<QuantifiedAtom, Error> {
    let b = parse.next_rune()?;

    match b {
        c if is_normal_char(c) => Ok(QuantifiedAtom {
            runes: vec![RunePair { lo: c, hi: c }],
            ..Default::default()
        }),
        '.' => {
            parse.record_feature(RegexpFeature::Dot);
            Ok(QuantifiedAtom {
                is_dot: true,
                ..Default::default()
            })
        }
        '(' => read_group(parse),
        ')' => {
            if parse.is_nested() {
                parse.backup1(b);
                Err(Error {
                    message: "stuck".into(),
                    offset: parse.last_index,
                })
            } else {
                Err(Error {
                    message: "unbalanced ')'".into(),
                    offset: parse.last_index,
                })
            }
        }
        '[' => {
            parse.record_feature(RegexpFeature::Class);
            let (rr, ascii_negated_bytes) = read_char_class_expr(parse)?;
            Ok(QuantifiedAtom {
                runes: rr,
                ascii_negated_bytes,
                ..Default::default()
            })
        }
        ']' => Err(Error {
            message: "invalid ']'".into(),
            offset: parse.last_index,
        }),
        c if c == ESCAPE => read_escape(parse),
        '?' | '+' | '*' | '{' => Err(Error {
            message: format!("invalid character '{b}' (quantifier without atom)"),
            offset: parse.last_index,
        }),
        // '|' is handled here too: top-level alternation is the caller's
        // problem, so back up and signal "stuck" to let it resume parsing.
        _ => {
            parse.backup1(b);
            Err(Error {
                message: "stuck".into(),
                offset: parse.last_index,
            })
        }
    }
}

/// Read a `(...)` group; the opening `(` has already been consumed.
///
/// Handles the `(?:`, `(?=`, `(?!`, `(?<=` and `(?<!` extensions through
/// `read_group_extension`, parses the group body, requires the closing `)`
/// and returns an atom whose subtree is the parsed body.
fn read_group(parse: &mut RegexpParse) -> Result<QuantifiedAtom, Error> {
    parse.nest();
    let lookaround = read_group_extension(parse)?;
    read_branches(parse)?;
    parse.require(')')?;

    Ok(QuantifiedAtom {
        subtree: Some(parse.unnest()),
        lookaround,
        ..Default::default()
    })
}

/// Classify what follows the opening `(`.
///
/// Returns the lookaround kind when the group is a lookaround, `None` for
/// every other kind of group.
fn read_group_extension(parse: &mut RegexpParse) -> Result<Option<LookaroundType>, Error> {
    let peeked = parse.next_rune();
    if let Ok('?') = peeked {
        return read_question_extension(parse);
    }

    // Plain group. At end of input there is nothing to push back; the missing
    // ')' is reported later by the caller's `require(')')`.
    parse.record_feature(RegexpFeature::ParenGroup);
    if let Ok(c) = peeked {
        parse.backup1(c);
    }
    Ok(None)
}

/// Handle the part after `(?`: `:`, `=`, `!`, `<=`, `<!`.
fn read_question_extension(parse: &mut RegexpParse) -> Result<Option<LookaroundType>, Error> {
    match parse.next_rune() {
        Ok(':') => {
            parse.record_feature(RegexpFeature::NonCapturingGroup);
            Ok(None)
        }
        Ok('=') => {
            parse.record_feature(RegexpFeature::Lookaround);
            Ok(Some(LookaroundType::PositiveLookahead))
        }
        Ok('!') => {
            parse.record_feature(RegexpFeature::Lookaround);
            Ok(Some(LookaroundType::NegativeLookahead))
        }
        Ok('<') => read_lookbehind_extension(parse),
        Ok(c) => Err(Error {
            message: format!("unsupported group extension (?{c}...)"),
            offset: parse.last_index,
        }),
        Err(_) => Err(Error {
            message: "unexpected end after (?".into(),
            offset: parse.last_index,
        }),
    }
}

/// Handle the part after `(?<`: either `=` (positive) or `!` (negative).
fn read_lookbehind_extension(parse: &mut RegexpParse) -> Result<Option<LookaroundType>, Error> {
    match parse.next_rune() {
        Ok('=') => {
            parse.record_feature(RegexpFeature::Lookaround);
            Ok(Some(LookaroundType::PositiveLookbehind))
        }
        Ok('!') => {
            parse.record_feature(RegexpFeature::Lookaround);
            Ok(Some(LookaroundType::NegativeLookbehind))
        }
        Ok(c) => Err(Error {
            message: format!("named capturing groups (?<{c}...) not supported"),
            offset: parse.last_index,
        }),
        Err(_) => Err(Error {
            message: "unexpected end after (?<".into(),
            offset: parse.last_index,
        }),
    }
}

/// Read the atom that follows an `ESCAPE` (`~`) byte.
fn read_escape(parse: &mut RegexpParse) -> Result<QuantifiedAtom, Error> {
    let next = parse.next_rune().map_err(|_| Error {
        message: format!("'{ESCAPE}' at end of regular expression"),
        offset: parse.last_index,
    })?;

    if let Some(escaped) = check_single_char_escape(next) {
        return Ok(QuantifiedAtom {
            runes: vec![RunePair {
                lo: escaped,
                hi: escaped,
            }],
            ..Default::default()
        });
    }

    // Multi-char escapes (~d, ~w, ~s, ~D, ~W, ~S, ~i, ~I, ~c, ~C)
    if let Some((runes, cache_key)) = check_multi_char_escape(next) {
        return Ok(QuantifiedAtom {
            runes,
            cache_key,
            ..Default::default()
        });
    }

    if next == 'p' || next == 'P' {
        return read_property_escape(parse, next);
    }

    if next == 'b' || next == 'B' {
        parse.record_feature(RegexpFeature::WordBoundary);
        return Ok(QuantifiedAtom {
            is_word_boundary: Some(next == 'b'),
            ..Default::default()
        });
    }

    // Backreferences (~1 through ~9) are not supported.
    if let Some(digit) = next.to_digit(10)
        && (1..=9).contains(&digit)
    {
        return Err(Error {
            message: format!("backreferences (~{digit}) are not supported"),
            offset: parse.last_index,
        });
    }

    Err(Error {
        message: format!("invalid character '{next}' after '{ESCAPE}'"),
        offset: parse.last_index,
    })
}

/// `~p{Name}` and the inverted `~P{Name}`.
///
/// `marker` is the `p` or `P` that was just consumed. For `P` the category's
/// ranges are inverted and the cache key (if any) is prefixed with `-`.
fn read_property_escape(parse: &mut RegexpParse, marker: char) -> Result<QuantifiedAtom, Error> {
    parse.record_feature(RegexpFeature::Property);
    let (category_runes, category_key) = read_category(parse)?;

    let (runes, cache_key) = if marker == 'P' {
        // NOT in the category: complement the ranges and mark the key.
        (
            invert_rune_range(category_runes),
            category_key.map(|k| format!("-{k}")),
        )
    } else {
        (category_runes, category_key)
    };

    Ok(QuantifiedAtom {
        runes,
        cache_key,
        ..Default::default()
    })
}

/// Read a character class expression `[...]`; the opening `[` has already
/// been consumed.
///
/// Returns `(RuneRange, Option<Vec<u8>>)`: the class's ranges and, for
/// negated patterns like `[^x]`, the ASCII bytes usable for memchr
/// acceleration.
fn read_char_class_expr(parse: &mut RegexpParse) -> Result<(RuneRange, Option<Vec<u8>>), Error> {
    // The outermost class is at subtraction depth zero.
    const OUTERMOST_DEPTH: usize = 0;
    read_char_class_expr_depth(parse, OUTERMOST_DEPTH)
}

/// Maximum nesting depth for character class subtraction (e.g., [a-[b-[c-[...]]]]).
const MAX_CLASS_SUBTRACTION_DEPTH: usize = 8;

/// Parse the inside of a character class at subtraction nesting level `depth`.
///
/// Reads an optional leading `^`, the class elements, an optional `-[...]`
/// subtraction (parsed recursively at `depth + 1`) or trailing literal `-`,
/// and the closing `]`. Returns the final ranges plus, for negated classes,
/// the result of `detect_ascii_negated_bytes` on the pre-inversion ranges.
fn read_char_class_expr_depth(
    parse: &mut RegexpParse,
    depth: usize,
) -> Result<(RuneRange, Option<Vec<u8>>), Error> {
    if depth > MAX_CLASS_SUBTRACTION_DEPTH {
        return Err(Error {
            message: "character class subtraction nested too deeply".into(),
            offset: parse.index,
        });
    }

    // EOF immediately after '[' means the bracket was never closed.
    if parse.is_empty() {
        return Err(Error {
            message: "unclosed character class".into(),
            offset: parse.index,
        });
    }

    let is_negated = parse.bypass_optional('^')?;
    if is_negated {
        parse.record_feature(RegexpFeature::NegatedClass);
    }

    let mut rr = read_cce1s(parse)?;

    // A '-' here is either subtraction syntax `-[...]` or a trailing literal.
    if matches!(parse.bypass_optional('-'), Ok(true)) {
        let Ok(next) = parse.next_rune() else {
            return Err(Error {
                message: "unclosed character class".into(),
                offset: parse.index,
            });
        };
        if next == '[' {
            // base-[subtract]; the subtracted class may itself subtract.
            let (subtract_rr, _) = read_char_class_expr_depth(parse, depth + 1)?;
            rr = subtract_rune_range(rr, subtract_rr);
        } else {
            // Not subtraction: un-read the character and keep '-' as a literal.
            parse.backup1(next);
            rr.push(RunePair { lo: '-', hi: '-' });
        }
    }

    parse.require(']')?;

    // NOTE(review): negation is applied after subtraction, so `[^a-z-[aeiou]]`
    // is NOT(a-z minus aeiou) - confirm this is the intended precedence.
    let ascii_negated_bytes = if is_negated {
        // The exit bytes for memchr acceleration (e.g. `x` for `[^x]`) must be
        // taken from the ranges before they are inverted.
        let exit_bytes = detect_ascii_negated_bytes(&rr);
        rr = invert_rune_range(rr);
        exit_bytes
    } else {
        None
    };

    Ok((rr, ascii_negated_bytes))
}

/// Detect if a rune range (BEFORE inversion) represents an ASCII-only character set
/// with 1-3 characters. Used for memchr acceleration of negated patterns.
///
/// For patterns like `[^x]`, the pre-inversion runes are just `[{x, x}]`.
/// We return Some(vec![b'x']) which can be used for memchr acceleration.
fn detect_ascii_negated_bytes(rr: &RuneRange) -> Option<Vec<u8>> {
    let mut bytes = Vec::new();

    for pair in rr {
        // Check if both lo and hi are ASCII
        if pair.lo as u32 >= 128 || pair.hi as u32 >= 128 {
            return None; // Non-ASCII character in negated set
        }

        // Expand the range and collect bytes
        for c in (pair.lo as u8)..=(pair.hi as u8) {
            bytes.push(c);
            if bytes.len() > 3 {
                return None; // Too many exit bytes for memchr acceleration
            }
        }
    }

    if bytes.is_empty() || bytes.len() > 3 {
        return None;
    }

    Some(bytes)
}

/// Read CCE1 elements
fn read_cce1s(parse: &mut RegexpParse) -> Result<RuneRange, Error> {
    let mut rr = Vec::new();
    let mut first = true;

    loop {
        let cce1 = read_cce1(parse, first)?;
        rr.extend(cce1);
        first = false;

        match parse.next_rune() {
            Ok(r) => {
                parse.backup1(r);
                if r == '-' || r == ']' {
                    return Ok(simplify_rune_range(rr));
                }
            }
            Err(_) => {
                // EOF inside character class - unclosed bracket
                return Err(Error {
                    message: "unclosed character class".into(),
                    offset: parse.index,
                });
            }
        }
    }
}

/// Report whether `r` may appear unescaped inside a character class.
///
/// Everything is allowed except `-`, `[`, `]` and the surrogate window;
/// backslash is explicitly permitted.
fn is_cc_char(r: char) -> bool {
    matches!(
        r as u32,
        0x00..=0x2C             // up to and including ','
            | 0x2E..=0x5A       // '.' through 'Z' (skips '-')
            | 0x5C              // backslash
            | 0x5E..=0xD7FF     // '^' up to the surrogates (skips '[' and ']')
            | 0xE000..=0x10FFFF // everything after the surrogates
    )
}

/// Read a single CCE1 element
fn read_cce1(parse: &mut RegexpParse, first: bool) -> Result<RuneRange, Error> {
    let r = parse.next_rune().map_err(|_| Error {
        message: "unclosed character class".into(),
        offset: parse.index,
    })?;

    let lo = if first && r == '-' {
        return Ok(vec![RunePair { lo: '-', hi: '-' }]);
    } else if r == ESCAPE {
        let next = parse.next_rune().map_err(|_| Error {
            message: "unclosed character class".into(),
            offset: parse.index,
        })?;
        if next == 'p' || next == 'P' {
            parse.record_feature(RegexpFeature::Property);
            // Inside character class, we don't use the cache key
            let (mut runes, _cache_key) = read_category(parse)?;
            // ~P{...} means NOT in the category (inverted)
            if next == 'P' {
                runes = invert_rune_range(runes);
            }
            return Ok(runes);
        }
        // Check for multi-char escapes (can't participate in ranges)
        // Inside character classes, cache_key is not used since ranges are combined.
        if let Some((runes, _cache_key)) = check_multi_char_escape(next) {
            return Ok(runes);
        }
        check_single_char_escape(next).ok_or_else(|| Error {
            message: format!("invalid character '{next}' after {ESCAPE} in character class"),
            offset: parse.last_index,
        })?
    } else {
        if !is_cc_char(r) {
            return Err(Error {
                message: format!("invalid character '{r}' in character class"),
                offset: parse.last_index,
            });
        }
        r
    };

    // Check for range
    let next = parse.next_rune().map_err(|_| Error {
        message: "unclosed character class".into(),
        offset: parse.index,
    })?;
    if next != '-' {
        parse.backup1(next);
        return Ok(vec![RunePair { lo, hi: lo }]);
    }

    // Looking at a range?
    let range_end = parse.next_rune().map_err(|_| Error {
        message: "unclosed character class".into(),
        offset: parse.index,
    })?;

    // Might be end of class -]
    if range_end == ']' {
        parse.backup1(range_end);
        return Ok(vec![RunePair { lo, hi: lo }, RunePair { lo: '-', hi: '-' }]);
    }

    // Character class subtraction: d-[ should NOT be a range attempt.
    // Back up both '[' and '-' so the caller can handle subtraction syntax.
    if range_end == '[' {
        parse.backup1(range_end);
        parse.backup1('-');
        return Ok(vec![RunePair { lo, hi: lo }]);
    }

    let hi = if range_end == ESCAPE {
        let escaped = parse.next_rune().map_err(|_| Error {
            message: "unclosed character class".into(),
            offset: parse.index,
        })?;
        check_single_char_escape(escaped).ok_or_else(|| Error {
            message: format!("invalid char '{escaped}' after - in character class"),
            offset: parse.last_index,
        })?
    } else {
        if !is_cc_char(range_end) {
            return Err(Error {
                message: format!("invalid char '{range_end}' in range"),
                offset: parse.last_index,
            });
        }
        range_end
    };

    if lo > hi {
        return Err(Error {
            message: format!("invalid range {lo}-{hi}"),
            offset: parse.last_index,
        });
    }

    Ok(vec![RunePair { lo, hi }])
}

/// Subtract one rune range from another: `base - subtract`.
/// Returns characters that are in `base` but NOT in `subtract`.
/// Both inputs are simplified before processing.
pub fn subtract_rune_range(base: RuneRange, subtract: RuneRange) -> RuneRange {
    let base = simplify_rune_range(base);
    let subtract = simplify_rune_range(subtract);

    let mut result = Vec::new();
    let mut sub_idx = 0;

    for pair in &base {
        let mut lo = pair.lo as u32;
        let hi = pair.hi as u32;

        // Walk through subtract ranges that might overlap this base range
        while sub_idx < subtract.len() && (subtract[sub_idx].hi as u32) < lo {
            let _prev = sub_idx;
            sub_idx += 1;
            // This loop terminates only because sub_idx strictly increases each
            // pass; assert it so a broken increment fails fast instead of hanging.
            debug_assert!(sub_idx > _prev, "subtract_rune_range: sub_idx must advance");
        }

        let mut si = sub_idx;
        while si < subtract.len() && (subtract[si].lo as u32) <= hi {
            let _prev_si = si;
            let sub_lo = subtract[si].lo as u32;
            let sub_hi = subtract[si].hi as u32;

            // Add the gap before this subtract range (if any)
            if lo < sub_lo
                && let (Some(r_lo), Some(r_hi)) = (char::from_u32(lo), char::from_u32(sub_lo - 1))
            {
                result.push(RunePair { lo: r_lo, hi: r_hi });
            }

            // Advance past the subtracted portion
            lo = sub_hi + 1;
            si += 1;
            // This inner loop also terminates only because si strictly increases
            // each pass; assert it so a broken increment fails fast, not hangs.
            debug_assert!(si > _prev_si, "subtract_rune_range: si must advance");
        }

        // Add remaining portion of base range after all subtract ranges
        if lo <= hi
            && let (Some(r_lo), Some(r_hi)) = (char::from_u32(lo), char::from_u32(hi))
        {
            result.push(RunePair { lo: r_lo, hi: r_hi });
        }
    }

    result
}

/// Simplify a rune range: sort the pairs by their lower bound and merge
/// every pair that overlaps, or is directly adjacent to, the one before it.
///
/// Returns the merged pairs in ascending order; an empty input yields an
/// empty output.
pub fn simplify_rune_range(mut rranges: RuneRange) -> RuneRange {
    rranges.sort_by_key(|pair| pair.lo);

    let mut merged: RuneRange = Vec::new();
    for pair in rranges {
        if let Some(tail) = merged.last_mut() {
            // Starts inside, or right after, the pair being built: absorb it.
            if pair.lo as u32 <= tail.hi as u32 + 1 {
                tail.hi = tail.hi.max(pair.hi);
                continue;
            }
        }
        // A real gap (or the very first pair): start a new output pair.
        merged.push(pair);
    }
    merged
}

/// Append the inclusive code-point range `[start, end]` to `inverted`,
/// leaving out any part that falls inside the surrogate window.
///
/// A range that crosses the window is split into the part before it and the
/// part after it; an empty range (`start > end`) adds nothing.
fn add_gap_range(inverted: &mut Vec<RunePair>, start: u32, end: u32) {
    if start > end {
        return;
    }

    // Push [lo, hi] when both bounds are valid `char`s.
    let mut push = |lo: u32, hi: u32| {
        if let (Some(lo), Some(hi)) = (char::from_u32(lo), char::from_u32(hi)) {
            inverted.push(RunePair { lo, hi });
        }
    };
    let starts_in_surrogates = (SURROGATE_START_CP..=SURROGATE_END_CP).contains(&start);

    if spans_into_surrogate(start, end) {
        // Part before the surrogates, then the part after them (if any).
        push(start, SURROGATE_START_CP - 1);
        if past_surrogate_end(end) {
            push(SURROGATE_END_CP + 1, end);
        }
    } else if starts_in_surrogates {
        // Begins inside the window: only the part after it can be kept.
        if past_surrogate_end(end) {
            push(SURROGATE_END_CP + 1, end);
        }
    } else {
        // Does not touch the surrogates at all.
        push(start, end);
    }
}

/// Returns true when the inclusive range `[start, end]` begins strictly
/// below `SURROGATE_START_CP` and its end reaches or passes that code point,
/// i.e. the range crosses into the surrogate window.
const fn spans_into_surrogate(start: u32, end: u32) -> bool {
    let starts_below_window = start < SURROGATE_START_CP;
    let reaches_window = end >= SURROGATE_START_CP;
    starts_below_window && reaches_window
}

/// Returns true when the code point `end` lies strictly above
/// `SURROGATE_END_CP`, meaning a split range still has a tail after the
/// surrogate window.
const fn past_surrogate_end(end: u32) -> bool {
    SURROGATE_END_CP < end
}

/// Returns true when a sorted RunePair starting at `next_lo` overlaps or is
/// adjacent to a previously merged RunePair ending at `prev_hi`.
///
/// "Adjacent" means `next_lo` is exactly one past `prev_hi`. The addition
/// saturates, so `prev_hi == u32::MAX` does not overflow.
const fn range_pair_overlaps_or_adjacent(next_lo: u32, prev_hi: u32) -> bool {
    next_lo <= prev_hi.saturating_add(1)
}

/// Invert a rune range (for negated character classes).
///
/// Returns ranges covering every code point from 0 to `RUNE_MAX` that is NOT
/// covered by `rr`. Gaps are emitted through `add_gap_range`, which keeps
/// the surrogate window out of the result.
pub fn invert_rune_range(mut rr: RuneRange) -> RuneRange {
    rr.sort_by_key(|pair| pair.lo);

    // Pass 1: collapse overlapping/adjacent pairs so the covered regions are
    // disjoint and ascending.
    let mut covered: Vec<RunePair> = Vec::new();
    for pair in rr {
        if let Some(last) = covered.last_mut() {
            if range_pair_overlaps_or_adjacent(pair.lo as u32, last.hi as u32) {
                last.hi = last.hi.max(pair.hi);
                continue;
            }
        }
        covered.push(pair);
    }

    // Pass 2: walk the covered regions and emit the gaps between them.
    let mut inverted = Vec::new();
    // First code point not yet known to be covered.
    let mut next_uncovered: u32 = 0;

    for region in &covered {
        let region_lo = region.lo as u32;
        if region_lo > next_uncovered {
            add_gap_range(&mut inverted, next_uncovered, region_lo - 1);
        }
        next_uncovered = region.hi as u32 + 1;
    }

    // Trailing gap up to the maximum code point.
    let rune_max = RUNE_MAX as u32;
    if next_uncovered <= rune_max {
        add_gap_range(&mut inverted, next_uncovered, rune_max);
    }

    inverted
}

/// Read a Unicode category ~p{...} or ~P{...} and return the character ranges
/// along with a cache key for large categories.
/// Handles both general categories (Lu, Ll, Nd, etc.) and Unicode blocks (IsBasicLatin, etc.).
/// Returns (ranges, cache_key) where cache_key is Some for general categories.
fn read_category(parse: &mut RegexpParse) -> Result<(RuneRange, Option<String>), Error> {
    parse.require('{')?;

    // Collect all characters until '}'
    let mut name = String::new();
    loop {
        let c = parse.next_rune()?;
        if c == '}' {
            break;
        }
        name.push(c);
    }

    if name.is_empty() {
        return Err(Error {
            message: "empty category name".into(),
            offset: parse.last_index,
        });
    }

    // Check for Unicode block (starts with "Is")
    // Unicode blocks are not cached (smaller than categories)
    if name.starts_with("Is") {
        if let Some(ranges) = get_block_ranges(&name) {
            return Ok((ranges, None));
        }
        return Err(Error {
            message: format!("unknown Unicode block ~p{{{name}}}"),
            offset: parse.last_index,
        });
    }

    // Parse as general category
    let mut chars = name.chars();
    let initial = match chars.next() {
        Some(c) => c,
        None => {
            return Err(Error {
                message: "empty category name".into(),
                offset: parse.last_index,
            });
        }
    };

    let valid_initials = ['L', 'M', 'N', 'P', 'Z', 'S', 'C'];
    if !valid_initials.contains(&initial) {
        return Err(Error {
            message: format!("unknown category {initial}"),
            offset: parse.last_index,
        });
    }

    let detail = chars.next();

    // Validate detail letter based on initial
    if let Some(d) = detail {
        let valid_details = match initial {
            'L' => "ultmo",
            'M' => "nce",
            'N' => "dlo",
            'P' => "cdseifo",
            'Z' => "slp",
            'S' => "mcko",
            'C' => "cfon",
            _ => "",
        };

        if !valid_details.contains(d) {
            return Err(Error {
                message: format!("unknown category {ESCAPE}p{{{initial}{d}"),
                offset: parse.last_index,
            });
        }

        // Check for extra characters
        if chars.next().is_some() {
            return Err(Error {
                message: format!("invalid category name ~p{{{name}}}"),
                offset: parse.last_index,
            });
        }
    }

    // Build the cache key (e.g., "L", "Lu", "Nd")
    let cache_key = detail.map_or_else(|| initial.to_string(), |d| format!("{initial}{d}"));

    // Look up the category ranges
    if let Some(ranges) = get_category_ranges(initial, detail) {
        Ok((ranges, Some(cache_key)))
    } else {
        Err(Error {
            message: format!("unknown category ~p{{{name}}}"),
            offset: parse.last_index,
        })
    }
}

/// Read a quantifier (`?`, `*`, `+`, `{m,n}`) with an optional lazy
/// modifier (`?`) and store its bounds in `qa`.
///
/// When no quantifier follows (end of input or any other character, which is
/// pushed back), the atom matches exactly once.
fn read_quantifier(parse: &mut RegexpParse, qa: &mut QuantifiedAtom) -> Result<(), Error> {
    let Ok(b) = parse.next_rune() else {
        // End of input: the atom stands alone.
        qa.quant_min = 1;
        qa.quant_max = 1;
        return Ok(());
    };

    match b {
        '*' => {
            parse.record_feature(RegexpFeature::Star);
            qa.quant_min = 0;
            qa.quant_max = REGEXP_QUANTIFIER_MAX;
        }
        '+' => {
            parse.record_feature(RegexpFeature::Plus);
            qa.quant_min = 1;
            qa.quant_max = REGEXP_QUANTIFIER_MAX;
        }
        '?' => {
            parse.record_feature(RegexpFeature::QuestionMark);
            qa.quant_min = 0;
            qa.quant_max = 1;
        }
        '{' => {
            parse.record_feature(RegexpFeature::Range);
            read_range_quantifier(parse, qa)?;
        }
        _ => {
            // Not a quantifier: give the character back, and there is no
            // lazy modifier to look for either.
            parse.backup1(b);
            qa.quant_min = 1;
            qa.quant_max = 1;
            return Ok(());
        }
    }

    // Optional lazy modifier (e.g., *?, +?, {n,m}?). Since we're doing pattern
    // matching (not extraction), lazy vs greedy makes no semantic difference:
    // the '?' is consumed and recorded, nothing else changes.
    if let Ok(c) = parse.next_rune() {
        if c == '?' {
            parse.record_feature(RegexpFeature::LazyQuantifier);
        } else {
            parse.backup1(c);
        }
    }

    Ok(())
}

/// Build the error reported when a range quantifier's repetition count is
/// greater than `REGEXP_QUANTIFIER_MAX`, positioned at `parse.last_index`.
fn quantifier_too_large_error(parse: &RegexpParse) -> Error {
    let message = format!(
        "invalid range quantifier, repetition count exceeds maximum of {REGEXP_QUANTIFIER_MAX}"
    );
    Error {
        message,
        offset: parse.last_index,
    }
}

/// Read a range quantifier {m,n}
fn read_range_quantifier(parse: &mut RegexpParse, qa: &mut QuantifiedAtom) -> Result<(), Error> {
    // Helper to convert EOF to a more specific error
    let next_or_eof = |p: &mut RegexpParse| -> Result<char, Error> {
        p.next_rune().map_err(|e| {
            if e.message == "end of string" {
                Error {
                    message: "unexpected end of string in quantifier".into(),
                    offset: e.offset,
                }
            } else {
                e
            }
        })
    };

    let mut lo_digits = String::new();

    loop {
        let b = next_or_eof(parse)?;
        if b.is_ascii_digit() {
            lo_digits.push(b);
        } else {
            if lo_digits.is_empty() {
                return Err(Error {
                    message: "invalid range quantifier, expecting digits".into(),
                    offset: parse.last_index,
                });
            }

            let lo: i32 = lo_digits.parse().map_err(|_| Error {
                message: "invalid number in quantifier".into(),
                offset: parse.last_index,
            })?;
            // The NFA builder materializes one state per repetition, so
            // repetition counts must be bounded before any states are built.
            if lo > REGEXP_QUANTIFIER_MAX {
                return Err(quantifier_too_large_error(parse));
            }
            qa.quant_min = lo;
            // Default to exact match; will be updated if comma is present
            qa.quant_max = lo;

            match b {
                // {n} means exactly n times
                '}' => return Ok(()),
                // {n,} or {n,m} - will parse upper bound below
                ',' => {
                    // Set to unbounded initially for {n,} case
                    qa.quant_max = REGEXP_QUANTIFIER_MAX;
                    break;
                }
                _ => {
                    return Err(Error {
                        message: format!("unexpected character '{b}' in quantifier"),
                        offset: parse.last_index,
                    });
                }
            }
        }
    }

    // After comma
    let b = next_or_eof(parse)?;
    if b == '}' {
        return Ok(());
    }

    if !b.is_ascii_digit() {
        return Err(Error {
            message: format!("invalid character '{b}' in quantifier range, wanted a digit"),
            offset: parse.last_index,
        });
    }

    let mut hi_digits = String::from(b);
    loop {
        let b = next_or_eof(parse)?;
        if b.is_ascii_digit() {
            hi_digits.push(b);
        } else if b == '}' {
            let hi: i32 = hi_digits.parse().map_err(|_| Error {
                message: "invalid number in quantifier".into(),
                offset: parse.last_index,
            })?;
            if hi > REGEXP_QUANTIFIER_MAX {
                return Err(quantifier_too_large_error(parse));
            }
            if hi < qa.quant_min {
                return Err(Error {
                    message: "invalid range quantifier, top must be greater than bottom".into(),
                    offset: parse.last_index,
                });
            }
            qa.quant_max = hi;
            return Ok(());
        } else {
            return Err(Error {
                message: format!("invalid character '{b}', expected '}}'"),
                offset: parse.last_index,
            });
        }
    }
}

// Unit tests for the regexp parser and its helpers. They cover, in order:
// surrogate-window and range-adjacency predicates, ASCII exit-byte detection
// for negated classes, quantifier and lookaround classification, atom and
// character-class parsing, lookbehind fixed-length validation, and expansion
// of word boundaries (`~b` / `~B`).
#[cfg(test)]
mod tests {
    use super::*;

    // ========================================================================
    // Surrogate-window and range-adjacency predicates
    // ========================================================================

    #[test]
    fn test_spans_into_surrogate() {
        // True when [start, end] crosses into the surrogate window: start lies
        // below the window AND end reaches it. Cases walk both edges.
        assert!(!spans_into_surrogate(0, 0xD7FF)); // end just below the window
        assert!(spans_into_surrogate(0, SURROGATE_START_CP)); // end at lower edge
        assert!(spans_into_surrogate(0, SURROGATE_END_CP));
        assert!(spans_into_surrogate(0xD7FF, SURROGATE_START_CP));
        // A range that starts at or past the window does not "span into" it.
        assert!(!spans_into_surrogate(SURROGATE_START_CP, 0xFFFF));
        assert!(!spans_into_surrogate(SURROGATE_END_CP, 0xFFFF));
        assert!(!spans_into_surrogate(SURROGATE_END_CP + 1, 0xFFFF));
    }

    #[test]
    fn test_range_pair_overlaps_or_adjacent() {
        // True when the next range overlaps or sits immediately adjacent to the
        // previous one (`next_lo <= prev_hi + 1`); the `+ 1` is what lets a
        // zero-gap adjacency count as mergeable. A gap of one or more is false.
        assert!(range_pair_overlaps_or_adjacent(5, 10)); // overlapping inside
        assert!(range_pair_overlaps_or_adjacent(10, 10)); // overlap at edge
        assert!(range_pair_overlaps_or_adjacent(11, 10)); // adjacent (lo == hi + 1)
        assert!(!range_pair_overlaps_or_adjacent(12, 10)); // gap of one
        assert!(!range_pair_overlaps_or_adjacent(100, 10)); // far gap
        assert!(range_pair_overlaps_or_adjacent(0, 0));
        assert!(range_pair_overlaps_or_adjacent(1, 0));
        assert!(!range_pair_overlaps_or_adjacent(2, 0));
    }

    #[test]
    fn test_past_surrogate_end() {
        // True only strictly beyond the surrogate window; the window end itself
        // is not "past".
        assert!(!past_surrogate_end(SURROGATE_END_CP)); // window end is not past
        assert!(!past_surrogate_end(SURROGATE_END_CP - 1));
        assert!(past_surrogate_end(SURROGATE_END_CP + 1));
        assert!(past_surrogate_end(0x10FFFF));
    }

    // ========================================================================
    // ASCII exit-byte detection for negated character classes
    // ========================================================================

    #[test]
    fn test_detect_ascii_negated_bytes_single_char() {
        // [^x] -> should detect 'x' as exit byte
        let rr = vec![RunePair { lo: 'x', hi: 'x' }];
        let result = detect_ascii_negated_bytes(&rr);
        assert_eq!(result, Some(vec![b'x']));
    }

    #[test]
    fn test_detect_ascii_negated_bytes_multiple_chars() {
        // [^abc] -> should detect a, b, c as exit bytes
        let rr = vec![
            RunePair { lo: 'a', hi: 'a' },
            RunePair { lo: 'b', hi: 'b' },
            RunePair { lo: 'c', hi: 'c' },
        ];
        let result = detect_ascii_negated_bytes(&rr);
        assert_eq!(result, Some(vec![b'a', b'b', b'c']));
    }

    #[test]
    fn test_detect_ascii_negated_bytes_range() {
        // [^a-c] -> should detect a, b, c as exit bytes
        let rr = vec![RunePair { lo: 'a', hi: 'c' }];
        let result = detect_ascii_negated_bytes(&rr);
        assert_eq!(result, Some(vec![b'a', b'b', b'c']));
    }

    #[test]
    fn test_detect_ascii_negated_bytes_too_many() {
        // [^a-z] -> too many exit bytes (26), should return None
        let rr = vec![RunePair { lo: 'a', hi: 'z' }];
        let result = detect_ascii_negated_bytes(&rr);
        assert_eq!(result, None);
    }

    #[test]
    fn test_detect_ascii_negated_bytes_non_ascii() {
        // [^ü] -> non-ASCII, should return None
        let rr = vec![RunePair { lo: 'ü', hi: 'ü' }];
        let result = detect_ascii_negated_bytes(&rr);
        assert_eq!(result, None);
    }

    #[test]
    fn test_detect_ascii_negated_bytes_mixed() {
        // [^aü] -> contains non-ASCII, should return None
        let rr = vec![RunePair { lo: 'a', hi: 'a' }, RunePair { lo: 'ü', hi: 'ü' }];
        let result = detect_ascii_negated_bytes(&rr);
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_negated_char_class_stores_ascii_bytes() {
        // Parse [^x]+ and verify ascii_negated_bytes is set
        let tree = parse("[^x]+").unwrap();
        assert_eq!(tree.len(), 1);
        assert_eq!(tree[0].len(), 1);

        let qa = &tree[0][0];
        assert_eq!(qa.ascii_negated_bytes, Some(vec![b'x']));
    }

    #[test]
    fn test_parse_non_negated_class_no_ascii_bytes() {
        // Parse [abc]+ - not negated, so no ascii_negated_bytes
        let tree = parse("[abc]+").unwrap();
        assert_eq!(tree.len(), 1);
        assert_eq!(tree[0].len(), 1);

        let qa = &tree[0][0];
        assert_eq!(qa.ascii_negated_bytes, None);
    }

    #[test]
    fn test_parse_negated_unicode_class_no_ascii_bytes() {
        // Parse [^ü]+ - negated but non-ASCII, so no ascii_negated_bytes
        let tree = parse("[^ü]+").unwrap();
        assert_eq!(tree.len(), 1);
        assert_eq!(tree[0].len(), 1);

        let qa = &tree[0][0];
        assert_eq!(qa.ascii_negated_bytes, None);
    }

    #[test]
    fn test_parse_negated_slash_class() {
        // Parse [^/]+ - common pattern for path parsing
        let tree = parse("[^/]+").unwrap();
        assert_eq!(tree.len(), 1);
        assert_eq!(tree[0].len(), 1);

        let qa = &tree[0][0];
        assert_eq!(qa.ascii_negated_bytes, Some(vec![b'/']));
    }

    #[test]
    fn test_parse_negated_quote_class() {
        // Parse [^"]+ - common pattern for quoted string parsing
        // Note: in I-Regexp, " is a normal character, no escaping needed
        let tree = parse("[^\"]+").unwrap();
        assert_eq!(tree.len(), 1);
        assert_eq!(tree[0].len(), 1);

        let qa = &tree[0][0];
        assert_eq!(qa.ascii_negated_bytes, Some(vec![b'"']));
    }

    // ========================================================================
    // Quantifier type detection
    // ========================================================================

    #[test]
    fn test_quantifier_detection() {
        // Parse "a" (singleton)
        let tree = parse("a").unwrap();
        assert_eq!(tree.len(), 1);
        assert_eq!(tree[0].len(), 1);
        let qa = &tree[0][0];
        assert!(
            qa.is_singleton(),
            "single char without quantifier should be singleton"
        );
        assert!(!qa.is_qm(), "singleton should not be optional");
        assert!(!qa.is_plus(), "singleton should not be plus");
        assert!(!qa.is_star(), "singleton should not be star");

        // Parse "a?" (optional, catches is_qm)
        let tree = parse("a?").unwrap();
        let qa = &tree[0][0];
        assert!(!qa.is_singleton(), "optional should not be singleton");
        assert!(qa.is_qm(), "a? should be optional");
        assert!(!qa.is_plus(), "a? should not be plus");
        assert!(!qa.is_star(), "a? should not be star");

        // Parse "a+" (one or more, catches is_plus)
        let tree = parse("a+").unwrap();
        let qa = &tree[0][0];
        assert!(!qa.is_singleton(), "a+ should not be singleton");
        assert!(!qa.is_qm(), "a+ should not be optional");
        assert!(qa.is_plus(), "a+ should be plus");
        assert!(!qa.is_star(), "a+ should not be star");

        // Parse "a*" (zero or more, catches is_star)
        let tree = parse("a*").unwrap();
        let qa = &tree[0][0];
        assert!(!qa.is_singleton(), "a* should not be singleton");
        assert!(!qa.is_qm(), "a* should not be optional");
        assert!(!qa.is_plus(), "a* should not be plus");
        assert!(qa.is_star(), "a* should be star");
    }

    // ========================================================================
    // Lookaround type detection
    // ========================================================================

    #[test]
    fn test_lookaround_type_classification() {
        // Each variant should correctly report negative/lookbehind classification

        let pos_lookahead = LookaroundType::PositiveLookahead;
        assert!(
            !pos_lookahead.is_negative(),
            "positive lookahead should not be negative"
        );
        assert!(
            !pos_lookahead.is_lookbehind(),
            "lookahead should not be lookbehind"
        );

        let neg_lookahead = LookaroundType::NegativeLookahead;
        assert!(
            neg_lookahead.is_negative(),
            "negative lookahead should be negative"
        );
        assert!(
            !neg_lookahead.is_lookbehind(),
            "lookahead should not be lookbehind"
        );

        let pos_lookbehind = LookaroundType::PositiveLookbehind;
        assert!(
            !pos_lookbehind.is_negative(),
            "positive lookbehind should not be negative"
        );
        assert!(
            pos_lookbehind.is_lookbehind(),
            "positive lookbehind should be lookbehind"
        );

        let neg_lookbehind = LookaroundType::NegativeLookbehind;
        assert!(
            neg_lookbehind.is_negative(),
            "negative lookbehind should be negative"
        );
        assert!(
            neg_lookbehind.is_lookbehind(),
            "negative lookbehind should be lookbehind"
        );
    }

    // ========================================================================
    // Atom dispatch + character-class membership
    // ========================================================================

    #[test]
    fn test_atom_unbalanced_close_paren() {
        // An unmatched ')' is a hard error, not the end of an empty branch.
        assert!(parse(")").unwrap_err().message.contains("unbalanced"));
        assert!(parse("a)").is_err());
    }

    #[test]
    fn test_atom_bare_quantifier_rejected() {
        // A quantifier at atom start has nothing to quantify and is rejected.
        for q in ["?", "+", "*", "{"] {
            assert!(
                parse(q)
                    .unwrap_err()
                    .message
                    .contains("quantifier without atom")
            );
        }
    }

    #[test]
    fn test_char_class_backslash_is_valid_member() {
        // '\' falls in the gap between is_cc_char's member ranges, so it is a
        // valid class member only via the explicit backslash case.
        let tree = parse("[\\]").unwrap();
        assert_eq!(tree[0][0].runes, vec![RunePair { lo: '\\', hi: '\\' }]);
    }

    #[test]
    fn test_inverted_category_in_class() {
        // ~P{...} is the inverted category, ~p{...} the plain one: a control
        // char belongs to ~p{Cc} only, an ordinary letter to ~P{Cc} only.
        let contains = |runes: &RuneRange, c: char| runes.iter().any(|p| p.lo <= c && c <= p.hi);

        let plain = parse("[~p{Cc}]").unwrap()[0][0].runes.clone();
        assert!(contains(&plain, '\u{01}'));
        assert!(!contains(&plain, 'a'));

        let inverted = parse("[~P{Cc}]").unwrap()[0][0].runes.clone();
        assert!(!contains(&inverted, '\u{01}'));
        assert!(contains(&inverted, 'a'));
    }

    // ========================================================================
    // Word boundary (~b/~B) expansion
    // ========================================================================

    #[test]
    fn test_wb_parse_boundary_marker() {
        // The parsed tree must be flagged as containing a word boundary, and
        // expansion must leave a single branch holding the five letter atoms.
        let tree = parse("~bhello").unwrap();
        assert!(has_word_boundary(&tree));
        let expanded = expand_word_boundaries(&tree).unwrap();
        // ~b at start: first char ('h' = word char) must be word → trivially satisfied
        assert_eq!(expanded.len(), 1);
        assert_eq!(expanded[0].len(), 5);
    }

    #[test]
    fn test_wb_impossible_boundary_empty_tree() {
        // ab~bcd: 'b' and 'c' are both word chars → ~b impossible → empty tree
        let tree = parse("ab~bcd").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        assert!(
            expanded.is_empty(),
            "impossible boundary should produce empty tree"
        );
    }

    #[test]
    fn test_wb_middle_boundary_expansion() {
        // ab~b cd: 'b' is word, ' ' is non-word → boundary valid
        let tree = parse("ab~b cd").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        assert_eq!(expanded.len(), 1); // Only word→nonword branch survives
    }

    #[test]
    fn test_wb_dot_star_expansion() {
        // .*~bcat: dot-star before boundary, 'c' (word) after
        let tree = parse(".*~bcat").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        // Two branches:
        // 1. .*{0,MAX-1} ~W c a t (last of .* is non-word, c is word)
        // 2. c a t (when .* matches 0 chars, boundary at value start, c must be word)
        assert_eq!(expanded.len(), 2);
    }

    // ========================================================================
    // Lookbehind fixed-length validation
    // ========================================================================

    #[test]
    fn test_lookbehind_same_length_alternation_accepted() {
        // (?<=ab|cd)x — two branches, both length 2 → fixed length, should succeed
        let result = parse("(?<=ab|cd)x");
        assert!(
            result.is_ok(),
            "lookbehind with same-length alternation should parse: {:?}",
            result.err()
        );
    }

    #[test]
    fn test_lookbehind_different_length_alternation_rejected() {
        // (?<=a|cd)x — branches of length 1 and 2 → variable length, should fail
        let result = parse("(?<=a|cd)x");
        assert!(
            result.is_err(),
            "lookbehind with different-length alternation must be rejected"
        );
        assert!(
            result.unwrap_err().message.contains("variable-length"),
            "error should mention variable-length"
        );
    }

    #[test]
    fn test_lookbehind_star_rejected() {
        // (?<=a*)x — star is variable length
        let result = parse("(?<=a*)x");
        assert!(result.is_err(), "lookbehind with star must be rejected");
    }

    #[test]
    fn test_lookbehind_plus_rejected() {
        // (?<=a+)x — plus is variable length
        let result = parse("(?<=a+)x");
        assert!(result.is_err(), "lookbehind with plus must be rejected");
    }

    #[test]
    fn test_lookbehind_char_class_accepted() {
        // (?<=[abc])x — character class, fixed length 1
        let result = parse("(?<=[abc])x");
        assert!(
            result.is_ok(),
            "lookbehind with char class should parse: {:?}",
            result.err()
        );
    }

    #[test]
    fn test_lookbehind_single_branch_group_accepted() {
        // (?<=(?:ab))x — single-branch group of length 2
        let result = parse("(?<=(?:ab))x");
        assert!(
            result.is_ok(),
            "lookbehind with single-branch group should parse: {:?}",
            result.err()
        );
    }

    #[test]
    fn test_lookbehind_multi_branch_group_same_length() {
        // (?<=(?:ab|cd))x — alternation inside group, both length 2
        let result = parse("(?<=(?:ab|cd))x");
        assert!(
            result.is_ok(),
            "lookbehind with same-length alternation group should parse: {:?}",
            result.err()
        );
    }

    #[test]
    fn test_lookbehind_multi_branch_group_different_length() {
        // (?<=(?:a|cd))x — alternation inside group, different lengths
        let result = parse("(?<=(?:a|cd))x");
        assert!(
            result.is_err(),
            "lookbehind with different-length alternation group must be rejected"
        );
    }

    #[test]
    fn test_lookbehind_quantified_group_accepted() {
        // (?<=a{3})x — fixed quantifier {3,3}, total length 3
        let result = parse("(?<=a{3})x");
        assert!(
            result.is_ok(),
            "lookbehind with fixed quantifier should parse: {:?}",
            result.err()
        );
    }

    #[test]
    fn test_lookbehind_range_quantifier_rejected() {
        // (?<=a{2,4})x — variable range quantifier
        let result = parse("(?<=a{2,4})x");
        assert!(
            result.is_err(),
            "lookbehind with range quantifier must be rejected"
        );
    }

    // ========================================================================
    // Word boundary with quantified atoms (constrain_atom_at_boundary)
    // ========================================================================

    #[test]
    fn test_wb_quantified_plus_at_end() {
        // a+~b — plus-quantified word char at boundary end
        // The last char of a+ must be word (trivially true for 'a'),
        // so this splits into [a{0,MAX}, a_constrained]
        let tree = parse("a+~b").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        // Should produce at least one branch (a+ ends with word char,
        // boundary before implicit " which is non-word)
        assert!(
            !expanded.is_empty(),
            "a+~b should have valid expansion since 'a' is a word char"
        );
        // The expansion should have more atoms than the original (split quantifier)
        assert!(
            expanded[0].len() >= 2,
            "quantified atom should split into base + constrained"
        );
    }

    #[test]
    fn test_wb_quantified_star_at_end() {
        // x.*~b — star before boundary at end, triggers SplitOrAbsent path
        // When .* matches 0 chars, boundary falls after 'x' (word → non-word " = valid)
        let tree = parse("x.*~b").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        // Should produce multiple branches:
        // 1. x + .*{adjusted} constrained (dot split)
        // 2. x alone (when .* is absent, 'x' is word, " is non-word → boundary)
        assert!(
            expanded.len() >= 2,
            "x.*~b should produce SplitOrAbsent branches, got {}",
            expanded.len()
        );
    }

    #[test]
    fn test_wb_star_at_start() {
        // ~ba*x — star-quantified 'a' at boundary start, triggers SplitOrAbsent
        // in expand_wb_at_start. When a* matches 0 chars, boundary falls before 'x'.
        let tree = parse("~ba*x").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        // Should produce branches for both: a present (constrained) and a absent (x constrained)
        assert!(
            expanded.len() >= 2,
            "~ba*x should produce SplitOrAbsent branches at start, got {}",
            expanded.len()
        );
    }

    #[test]
    fn test_wb_dot_at_end_constrain() {
        // .~b — dot before boundary at end, tests is_dot field in constrained atom
        let tree = parse(".~b").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        // Dot intersected with word chars → should produce a branch with rune-constrained atom
        assert!(
            !expanded.is_empty(),
            ".~b should expand (dot can match word chars)"
        );
        // The expanded atom should NOT be a dot anymore (it's constrained to word chars)
        assert!(
            !expanded[0][0].is_dot,
            "constrained dot should no longer be marked as dot"
        );
    }

    #[test]
    fn test_wb_negated_class_at_boundary() {
        // [^x]+~b — negated class with ascii_negated_bytes at boundary
        let tree = parse("[^x]+~b").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        // [^x] intersected with word chars still has valid members
        assert!(!expanded.is_empty(), "[^x]+~b should have valid expansion");
    }

    #[test]
    fn test_wb_count_limit_exactly_four() {
        // Pattern with exactly 4 word boundaries (the maximum allowed)
        let tree = parse("~ba~b ~Bc~b").unwrap();
        let result = expand_word_boundaries(&tree);
        assert!(
            result.is_ok(),
            "4 word boundaries should be accepted (max is 4)"
        );
    }

    #[test]
    fn test_wb_count_limit_five_rejected() {
        // Pattern with 5 word boundaries (exceeds max of 4)
        let tree = parse("~ba~b ~Bc~bd~b").unwrap();
        let result = expand_word_boundaries(&tree);
        assert!(result.is_err(), "5 word boundaries should be rejected");
        assert!(
            result.unwrap_err().contains("too many word boundaries"),
            "error should mention word boundary limit"
        );
    }

    // ========================================================================
    // branch_fixed_length with quantifiers
    // ========================================================================

    #[test]
    fn test_lookbehind_quantifier_vs_individual_same_length() {
        // (?<=a{2}|bc)x — branch 1 uses quantifier {2}, branch 2 spells out two chars
        // Both have character-count length 2. Must be accepted as equal-length.
        let result = parse("(?<=a{2}|bc)x");
        assert!(
            result.is_ok(),
            "a{{2}} and bc are both length 2: {:?}",
            result.err()
        );
    }

    // ========================================================================
    // Word boundary with SplitOrAbsent fallback (expand_wb_at_end)
    // ========================================================================

    #[test]
    fn test_wb_at_end_with_star_after_literal() {
        // xa*~b — 'x' is a word char, 'a*' can match zero chars (SplitOrAbsent).
        // When a* is absent, boundary falls after 'x' (word → non-word " = valid).
        // When a* is present, its last char must be word (trivially true for 'a').
        // The SplitOrAbsent path constrains the preceding atom ('x') as fallback.
        let tree = parse("xa*~b").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        // Must produce branches for both the present case (a* constrained) and
        // the absent case (x constrained as fallback)
        assert!(
            expanded.len() >= 2,
            "xa*~b needs SplitOrAbsent fallback branches, got {}",
            expanded.len()
        );
    }

    // ========================================================================
    // ~B alone (non-boundary between two implicit " delimiters)
    // ========================================================================

    #[test]
    fn test_non_boundary_alone_always_matches() {
        // ~B alone: both sides are `"` (non-word), so they're in the same class → always a match
        let tree = parse("~B").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        assert!(
            !expanded.is_empty(),
            "~B alone should produce an empty-branch alternative (always matches)"
        );
    }

    #[test]
    fn test_boundary_alone_never_matches() {
        // ~b alone: both sides are `"` (non-word), same class → boundary never holds
        let tree = parse("~b").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        assert!(
            expanded.is_empty(),
            "~b alone should produce no alternatives (impossible boundary)"
        );
    }

    #[test]
    fn test_wb_single_star_at_end() {
        // .*~b — single zero-or-more atom before boundary at end.
        // Prefix is [.*] (last_idx = 0). SplitOrAbsent path must NOT
        // try to access prefix[-1]; the fallback is skipped when there is
        // no preceding atom.
        let tree = parse(".*~b").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        // .* can match word chars, so at least one branch constrains the dot
        assert!(
            !expanded.is_empty(),
            ".*~b should produce at least one branch"
        );
    }

    #[test]
    fn test_read_category_two_letter_initials() {
        // Two-letter Unicode categories are validated by their initial letter.
        // Every supported initial (M, P, Z, S, C) must accept a valid detail.
        for cat in ["~p{Mn}", "~p{Pc}", "~p{Zs}", "~p{Sm}", "~p{Cc}"] {
            assert!(
                parse(cat).is_ok(),
                "category {cat} must parse: {:?}",
                parse(cat).err()
            );
        }
    }

    #[test]
    fn test_class_subtraction_at_max_depth() {
        // Eight nested `-[` levels sits exactly at MAX_CLASS_SUBTRACTION_DEPTH,
        // which is still allowed; only deeper nesting is rejected.
        let re = "[0-[1-[2-[3-[4-[5-[6-[7-[8]]]]]]]]]";
        assert!(
            parse(re).is_ok(),
            "depth-8 subtraction: {:?}",
            parse(re).err()
        );
    }

    #[test]
    fn test_property_escape_inside_char_class() {
        // A `~p{...}` / `~P{...}` property escape is valid inside a character
        // class, not just at top level.
        assert!(parse("[~p{L}]").is_ok(), "{:?}", parse("[~p{L}]").err());
        assert!(parse("[~P{N}]").is_ok(), "{:?}", parse("[~P{N}]").err());
    }

    #[test]
    fn test_equal_endpoint_char_range() {
        // A range whose endpoints are equal (`[a-a]`) is a valid single-char
        // class; only lo strictly greater than hi is an error.
        assert!(parse("[a-a]").is_ok(), "{:?}", parse("[a-a]").err());
    }

    #[test]
    fn test_standalone_close_bracket_error_message() {
        // A standalone `]` reports its own specific diagnostic rather than the
        // generic "stuck" fallthrough.
        let err = parse("]").expect_err("standalone ] must error");
        assert!(
            err.message.contains("invalid ']'"),
            "expected \"invalid ']'\", got {:?}",
            err.message
        );
    }

    #[test]
    fn test_invert_rune_range_skips_surrogate_window() {
        // Inverting [U+0000-U+D7FF] yields the high plane [U+E000..=MAX]: the
        // complement jumps over the surrogate window rather than including it.
        let inv = invert_rune_range(vec![RunePair {
            lo: '\u{0}',
            hi: '\u{D7FF}',
        }]);
        assert!(!inv.is_empty(), "inverted range must not be empty");
        assert!(
            inv.iter()
                .any(|p| (p.lo as u32) >= 0xE000 && p.hi == RUNE_MAX),
            "inverted range must cover the post-surrogate high plane, got {inv:?}"
        );
    }

    #[test]
    fn test_detect_ascii_negated_bytes_boundary_and_empty() {
        // A pair straddling the ASCII boundary (lo below 128, hi at/above 128)
        // is not pure-ASCII, so no exit bytes are produced.
        assert_eq!(
            detect_ascii_negated_bytes(&vec![RunePair {
                lo: '\u{7F}',
                hi: '\u{80}',
            }]),
            None
        );
        // No ranges means nothing to negate.
        assert_eq!(detect_ascii_negated_bytes(&Vec::new()), None);
    }

    #[test]
    fn test_wb_at_start_single_optional_suffix() {
        // `~b` at value start followed by a single optional atom (`a*`) is the
        // minimal SplitOrAbsent case: the suffix has exactly one element.
        let tree = parse("~ba*").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        assert!(!expanded.is_empty(), "~ba* should expand");
    }

    #[test]
    fn test_wb_at_end_bounded_quantifier_remainder() {
        // `a{2,3}~b` splits one boundary char off the end; the remainder atom
        // keeps a quant_max of one less than the original (3 - 1 = 2).
        let tree = parse("a{2,3}~b").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        assert_eq!(expanded.len(), 1, "got {expanded:?}");
        assert_eq!(expanded[0].len(), 2, "got {expanded:?}");
        assert_eq!(
            expanded[0][0].quant_max, 2,
            "remainder atom quant_max must be original_max-1=2, got {}",
            expanded[0][0].quant_max
        );
    }

    #[test]
    fn test_wb_at_end_split_fallback_uses_previous_atom() {
        // In ` a*~b` the prefix is [' ', a*] and a* is SplitOrAbsent. When a*
        // is absent the boundary lands after the space, a non-word char, so
        // constraining the atom before a* adds no fallback branch — the
        // fallback considers the previous atom, not a* itself.
        let tree = parse(" a*~b").unwrap();
        let expanded = expand_word_boundaries(&tree).unwrap();
        assert_eq!(
            expanded.len(),
            1,
            "space is non-word so the SplitOrAbsent fallback adds nothing; got {expanded:?}"
        );
    }

    #[test]
    fn test_constrain_atom_base_keeps_cache_key_and_ascii_bytes() {
        // Constraining a quantified atom at a boundary splits off one occurrence
        // and keeps the remaining count as a `base` atom. That base must carry
        // over the original atom's cache_key and ascii_negated_bytes rather than
        // reset them to their defaults.
        let atom = QuantifiedAtom {
            runes: vec![RunePair { lo: 'a', hi: 'z' }],
            quant_min: 2,
            quant_max: 3,
            cache_key: Some("wb_test".to_string()),
            ascii_negated_bytes: Some(vec![b'q']),
            ..Default::default()
        };
        let constrained = constrain_atom_at_boundary(&atom, &word_char_runes(), true)
            .expect("a-z intersects the word class, so the atom constrains");
        let ConstrainedAtom::Split(base, _single) = constrained else {
            panic!("a non-zero-min {{2,3}} quantifier must produce a Split");
        };
        assert_eq!(base.cache_key.as_deref(), Some("wb_test"));
        assert_eq!(base.ascii_negated_bytes, Some(vec![b'q']));
    }
}