succinctly 0.7.0

//! Balanced parentheses operations for succinct tree navigation.
//!
//! This module provides efficient operations on balanced parentheses (BP) sequences,
//! which are a fundamental building block for succinct tree representations.
//!
//! In BP encoding:
//! - `1` = open parenthesis `(`
//! - `0` = close parenthesis `)`
//!
//! Bit positions are indexed from LSB (position 0) to MSB (position 63).
//!
//! # Key Operations
//!
//! | Operation | Description |
//! |-----------|-------------|
//! | `find_close(i)` | Find position of `)` matching `(` at position `i` |
//! | `find_open(i)` | Find position of `(` matching `)` at position `i` |
//! | `enclose(i)` | Find position of `(` enclosing the node at position `i` (parent) |
//!
//! # Implementation
//!
//! Two implementations are provided:
//!
//! 1. **Word-level operations** (`find_close_in_word`): O(w) per word using bit scanning
//! 2. **RangeMin structure** (`BalancedParens`): O(1) operations using hierarchical min-excess
//!
//! Based on:
//! - Sadakane & Navarro, "Fully-Functional Succinct Trees", SODA 2010

#[cfg(not(test))]
use alloc::vec::Vec;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::bits::popcount::popcount_word;
use crate::bits::SelectIndex;
use crate::util::broadword::select_in_word;

// ============================================================================
// Select support traits for zero-cost abstraction
// ============================================================================

/// Trait for optional select support on balanced parentheses.
///
/// This enables zero-cost abstraction: JSON uses `NoSelect` (ZST, no overhead),
/// while YAML uses `WithSelect` for O(1) select1 queries.
pub trait SelectSupport: Clone + Default {
    /// Build select support from word data.
    fn build(words: &[u64], total_ones: usize) -> Self;

    /// Perform select1 query: find the position of the k-th 1-bit (0-indexed).
    /// Returns None if k >= total_ones.
    fn select1(&self, words: &[u64], len: usize, total_ones: usize, k: usize) -> Option<usize>;
}

/// No select support (zero-sized type for JSON).
///
/// This is a ZST that adds no memory overhead. The select1 method falls back
/// to binary search using rank1 when needed.
#[derive(Clone, Debug, Default)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct NoSelect;

impl SelectSupport for NoSelect {
    #[inline]
    fn build(_words: &[u64], _total_ones: usize) -> Self {
        NoSelect
    }

    #[inline]
    fn select1(&self, _words: &[u64], _len: usize, _total_ones: usize, _k: usize) -> Option<usize> {
        // NoSelect doesn't support select1 - caller should use binary search on rank1
        None
    }
}

/// Select support using sampled index (for YAML).
///
/// Uses the existing SelectIndex from bits module for O(1) jump + short scan.
#[derive(Clone, Debug, Default)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct WithSelect {
    select_idx: SelectIndex,
}

impl SelectSupport for WithSelect {
    fn build(words: &[u64], total_ones: usize) -> Self {
        // Use default sample rate of 256 for ~3% overhead
        WithSelect {
            select_idx: SelectIndex::build(words, total_ones, 256),
        }
    }

    fn select1(&self, words: &[u64], len: usize, total_ones: usize, k: usize) -> Option<usize> {
        if k >= total_ones {
            return None;
        }

        // Use select index to jump to approximate position
        let (start_word, mut remaining) = self.select_idx.jump_to(k);

        // Scan words from the starting position
        for (word_idx, &word) in words.iter().enumerate().skip(start_word) {
            let pop = popcount_word(word) as usize;

            if pop > remaining {
                // Found the target word
                let bit_pos = select_in_word(word, remaining as u32) as usize;
                let result = word_idx * 64 + bit_pos;
                return if result < len { Some(result) } else { None };
            }

            remaining -= pop;
        }

        None
    }
}

// ============================================================================
// Byte lookup tables for fast word_min_excess computation
// ============================================================================

/// Lookup table for minimum excess within each byte value (0-255).
/// For byte value b, BYTE_MIN_EXCESS[b] is the minimum prefix sum reached
/// when scanning bits 0..7, where 1=+1 (open), 0=-1 (close).
const BYTE_MIN_EXCESS: [i8; 256] = {
    let mut table = [0i8; 256];
    let mut byte_val: u16 = 0;
    while byte_val < 256 {
        let mut excess: i8 = 0;
        let mut min_e: i8 = 0;
        let mut bit = 0;
        while bit < 8 {
            if (byte_val >> bit) & 1 == 1 {
                excess += 1;
            } else {
                excess -= 1;
                if excess < min_e {
                    min_e = excess;
                }
            }
            bit += 1;
        }
        table[byte_val as usize] = min_e;
        byte_val += 1;
    }
    table
};

/// Lookup table for maximum prefix excess when scanning a byte backwards (bits 7..0).
/// For byte value b, BYTE_MAX_EXCESS_REV[b] is the maximum running excess reached
/// when scanning bits 7 down to 0, where 1=+1 (open), 0=-1 (close).
/// This is used by `enclose()` to determine if a word might contain the enclosing parent.
const BYTE_MAX_EXCESS_REV: [i8; 256] = {
    let mut table = [0i8; 256];
    let mut byte_val: u16 = 0;
    while byte_val < 256 {
        let mut excess: i8 = 0;
        let mut max_e: i8 = 0;
        let mut bit: i8 = 7;
        while bit >= 0 {
            if (byte_val >> bit) & 1 == 1 {
                excess += 1;
                if excess > max_e {
                    max_e = excess;
                }
            } else {
                excess -= 1;
            }
            bit -= 1;
        }
        table[byte_val as usize] = max_e;
        byte_val += 1;
    }
    table
};

/// Lookup table for total excess of each byte value (0-255).
/// For byte value b, BYTE_TOTAL_EXCESS[b] = popcount(b)*2 - 8 = opens - closes.
const BYTE_TOTAL_EXCESS: [i8; 256] = {
    let mut table = [0i8; 256];
    let mut byte_val: u16 = 0;
    while byte_val < 256 {
        // Count 1-bits (opens)
        let mut ones: i8 = 0;
        let mut tmp = byte_val;
        while tmp != 0 {
            ones += (tmp & 1) as i8;
            tmp >>= 1;
        }
        // total_excess = opens - closes = opens - (8 - opens) = 2*opens - 8
        table[byte_val as usize] = 2 * ones - 8;
        byte_val += 1;
    }
    table
};

/// Lookup table to find position where excess drops to target within a byte.
/// Index: byte_value * 16 + initial_excess (initial_excess 1-16 mapped to 0-15)
/// Value: bit position (0-7) where excess reaches 0, or 8 if not found in this byte.
///
/// This enables O(1) find_close within a byte instead of bit-by-bit scanning.
const BYTE_FIND_CLOSE: [[u8; 16]; 256] = {
    let mut table = [[8u8; 16]; 256];
    let mut byte_val: usize = 0;
    while byte_val < 256 {
        // For each possible initial excess (1-16)
        let mut init_excess: usize = 0;
        while init_excess < 16 {
            let starting_excess = (init_excess + 1) as i8; // Map 0-15 to 1-16
            let mut excess = starting_excess;
            let mut bit = 0;
            while bit < 8 {
                if (byte_val >> bit) & 1 == 1 {
                    excess += 1;
                } else {
                    excess -= 1;
                    if excess == 0 {
                        table[byte_val][init_excess] = bit as u8;
                        break;
                    }
                }
                bit += 1;
            }
            // If we didn't find a match, table stays at 8
            init_excess += 1;
        }
        byte_val += 1;
    }
    table
};

// ============================================================================
// Unrolled Scalar Optimization
// ============================================================================

/// Optimized computation of min excess and total excess for a full 64-bit word.
///
/// This version unrolls the loop and batches the lookup table accesses,
/// which is faster than the generic version due to better instruction scheduling
/// and cache locality. This is a pure scalar optimization (no SIMD intrinsics)
/// that benefits all architectures (x86_64, ARM, etc.).
#[inline]
fn word_min_excess_unrolled(word: u64) -> (i8, i16) {
    // Get bytes as array
    let bytes = word.to_le_bytes();

    // Load lookup values for all 8 bytes at once
    // Batching lookups helps with cache locality
    let b0 = bytes[0] as usize;
    let b1 = bytes[1] as usize;
    let b2 = bytes[2] as usize;
    let b3 = bytes[3] as usize;
    let b4 = bytes[4] as usize;
    let b5 = bytes[5] as usize;
    let b6 = bytes[6] as usize;
    let b7 = bytes[7] as usize;

    // Get min excess for each byte
    let m0 = BYTE_MIN_EXCESS[b0] as i16;
    let m1 = BYTE_MIN_EXCESS[b1] as i16;
    let m2 = BYTE_MIN_EXCESS[b2] as i16;
    let m3 = BYTE_MIN_EXCESS[b3] as i16;
    let m4 = BYTE_MIN_EXCESS[b4] as i16;
    let m5 = BYTE_MIN_EXCESS[b5] as i16;
    let m6 = BYTE_MIN_EXCESS[b6] as i16;
    let m7 = BYTE_MIN_EXCESS[b7] as i16;

    // Get total excess for each byte
    let t0 = BYTE_TOTAL_EXCESS[b0] as i16;
    let t1 = BYTE_TOTAL_EXCESS[b1] as i16;
    let t2 = BYTE_TOTAL_EXCESS[b2] as i16;
    let t3 = BYTE_TOTAL_EXCESS[b3] as i16;
    let t4 = BYTE_TOTAL_EXCESS[b4] as i16;
    let t5 = BYTE_TOTAL_EXCESS[b5] as i16;
    let t6 = BYTE_TOTAL_EXCESS[b6] as i16;
    let t7 = BYTE_TOTAL_EXCESS[b7] as i16;

    // Compute prefix sums (running excess at each byte boundary)
    let _p0: i16 = 0;
    let p1 = t0;
    let p2 = p1 + t1;
    let p3 = p2 + t2;
    let p4 = p3 + t3;
    let p5 = p4 + t4;
    let p6 = p5 + t5;
    let p7 = p6 + t6;
    let total = p7 + t7;

    // Compute global minimum: min of (prefix + byte_min) for each byte
    // Unrolled for better branch prediction
    let mut global_min = m0; // p0 + m0 = 0 + m0 = m0
    global_min = global_min.min(p1 + m1);
    global_min = global_min.min(p2 + m2);
    global_min = global_min.min(p3 + m3);
    global_min = global_min.min(p4 + m4);
    global_min = global_min.min(p5 + m5);
    global_min = global_min.min(p6 + m6);
    global_min = global_min.min(p7 + m7);

    let min_clamped = global_min.clamp(-128, 127) as i8;
    (min_clamped, total)
}

/// Build L0 index (per-word min excess and total excess).
///
/// Uses the unrolled scalar optimization which provides ~17-21% speedup
/// over a naive loop on x86_64. This is a pure scalar optimization that
/// benefits from loop unrolling and batched table lookups.
fn build_l0_index(words: &[u64], len: usize, num_words: usize) -> (Vec<i8>, Vec<i16>) {
    let mut l0_min_excess = Vec::with_capacity(num_words);
    let mut l0_word_excess = Vec::with_capacity(num_words);

    // Process all full words with unrolled optimization
    let full_words = if len % 64 == 0 {
        num_words
    } else {
        num_words - 1
    };

    for &word in words.iter().take(full_words) {
        let (min_e, total_e) = word_min_excess_unrolled(word);
        l0_min_excess.push(min_e);
        l0_word_excess.push(total_e);
    }

    // Handle last partial word with generic code (handles partial bit counts)
    if full_words < num_words {
        let valid_bits = len % 64;
        let (min_e, total_e) = word_min_excess(words[num_words - 1], valid_bits);
        l0_min_excess.push(min_e);
        l0_word_excess.push(total_e);
    }

    (l0_min_excess, l0_word_excess)
}

/// Build L1 index using NEON SIMD with prefix sums and VMINV.
///
/// Processes 8 words at a time using SIMD operations:
/// 1. Load min_excess and word_excess vectors
/// 2. Compute prefix sums of word_excess
/// 3. Add prefix sums to min_excess
/// 4. Find block minimum using vminvq_s16
#[cfg(all(feature = "simd", target_arch = "aarch64"))]
fn build_l1_index_neon(
    l0_min_excess: &[i8],
    l0_word_excess: &[i16],
    num_l1: usize,
) -> (Vec<i16>, Vec<i16>) {
    use core::arch::aarch64::*;

    let mut l1_min_excess = Vec::with_capacity(num_l1);
    let mut l1_block_excess = Vec::with_capacity(num_l1);

    let num_words = l0_min_excess.len();

    for block_idx in 0..num_l1 {
        let start = block_idx * FACTOR_L1;
        let end = (start + FACTOR_L1).min(num_words);
        let mut block_min: i16 = 0;
        let mut running_excess: i16 = 0;

        // Process 8 words at a time with SIMD
        let mut i = start;
        while i + 8 <= end {
            // SAFETY: We verified bounds above
            unsafe {
                // Load 8 min_excess values (i8) and widen to i16
                let min_ptr = l0_min_excess.as_ptr().add(i);
                let min_i8 = vld1_s8(min_ptr);
                let min_lo = vmovl_s8(min_i8); // First 8 i8 -> 8 i16

                // Load 8 word_excess values (i16)
                let excess_ptr = l0_word_excess.as_ptr().add(i);
                let excess = vld1q_s16(excess_ptr);

                // Compute prefix sum of excess using parallel prefix pattern
                // Step 1: Add adjacent pairs
                let shifted1 = vextq_s16(vdupq_n_s16(0), excess, 7); // [0, e0, e1, e2, e3, e4, e5, e6]
                let sum1 = vaddq_s16(excess, shifted1); // [e0, e0+e1, e1+e2, e2+e3, e3+e4, e4+e5, e5+e6, e6+e7]

                // Step 2: Add pairs of pairs
                let shifted2 = vextq_s16(vdupq_n_s16(0), sum1, 6); // [0, 0, s0, s1, s2, s3, s4, s5]
                let sum2 = vaddq_s16(sum1, shifted2);

                // Step 3: Add quads
                let shifted4 = vextq_s16(vdupq_n_s16(0), sum2, 4); // [0, 0, 0, 0, s0, s1, s2, s3]
                let prefix_sum = vaddq_s16(sum2, shifted4);

                // Add running_excess offset to prefix sum
                let offset = vdupq_n_s16(running_excess);
                let adjusted_prefix = vaddq_s16(prefix_sum, offset);

                // We need exclusive prefix sum for min calculation
                // adjusted_min[i] = running_excess + sum(excess[0..i]) + min[i]
                // The prefix_sum is inclusive, so we shift right and insert running_excess
                let exclusive_prefix = vextq_s16(offset, adjusted_prefix, 7);

                // Add to min_excess: min[i] + exclusive_prefix[i]
                let adjusted_min = vaddq_s16(min_lo, exclusive_prefix);

                // Find minimum using vminvq_s16
                let chunk_min = vminvq_s16(adjusted_min);
                block_min = block_min.min(chunk_min);

                // Update running excess with sum of all 8 values
                running_excess += vaddvq_s16(excess);
            }
            i += 8;
        }

        // Handle remaining words (< 8) with scalar code
        while i < end {
            let word_min = l0_min_excess[i] as i16;
            let word_excess = l0_word_excess[i];
            block_min = block_min.min(running_excess + word_min);
            running_excess += word_excess;
            i += 1;
        }

        l1_min_excess.push(block_min);
        l1_block_excess.push(running_excess);
    }

    (l1_min_excess, l1_block_excess)
}

/// NEON-accelerated L2 index building.
///
/// Uses the same SIMD pattern as L1 but processes L1 entries (all i16).
/// Provides consistent code structure with L1 even if performance gain is small.
#[cfg(all(feature = "simd", target_arch = "aarch64"))]
fn build_l2_index_neon(
    l1_min_excess: &[i16],
    l1_block_excess: &[i16],
    num_l2: usize,
) -> (Vec<i16>, Vec<i16>) {
    use core::arch::aarch64::*;

    let mut l2_min_excess = Vec::with_capacity(num_l2);
    let mut l2_block_excess = Vec::with_capacity(num_l2);

    let num_l1 = l1_min_excess.len();

    for block_idx in 0..num_l2 {
        let start = block_idx * FACTOR_L2;
        let end = (start + FACTOR_L2).min(num_l1);
        let mut block_min: i16 = 0;
        let mut running_excess: i16 = 0;

        // Process 8 L1 blocks at a time with SIMD
        let mut i = start;
        while i + 8 <= end {
            // SAFETY: We verified bounds above
            unsafe {
                // Load 8 min_excess values (i16)
                let min_ptr = l1_min_excess.as_ptr().add(i);
                let min_vals = vld1q_s16(min_ptr);

                // Load 8 block_excess values (i16)
                let excess_ptr = l1_block_excess.as_ptr().add(i);
                let excess = vld1q_s16(excess_ptr);

                // Compute prefix sum of excess using parallel prefix pattern
                // Step 1: Add adjacent pairs
                let shifted1 = vextq_s16(vdupq_n_s16(0), excess, 7);
                let sum1 = vaddq_s16(excess, shifted1);

                // Step 2: Add pairs of pairs
                let shifted2 = vextq_s16(vdupq_n_s16(0), sum1, 6);
                let sum2 = vaddq_s16(sum1, shifted2);

                // Step 3: Add quads
                let shifted4 = vextq_s16(vdupq_n_s16(0), sum2, 4);
                let prefix_sum = vaddq_s16(sum2, shifted4);

                // Add running_excess offset to prefix sum
                let offset = vdupq_n_s16(running_excess);
                let adjusted_prefix = vaddq_s16(prefix_sum, offset);

                // We need exclusive prefix sum for min calculation
                let exclusive_prefix = vextq_s16(offset, adjusted_prefix, 7);

                // Add to min_excess: min[i] + exclusive_prefix[i]
                let adjusted_min = vaddq_s16(min_vals, exclusive_prefix);

                // Find minimum using vminvq_s16
                let chunk_min = vminvq_s16(adjusted_min);
                block_min = block_min.min(chunk_min);

                // Update running excess with sum of all 8 values
                running_excess += vaddvq_s16(excess);
            }
            i += 8;
        }

        // Handle remaining L1 blocks (< 8) with scalar code
        while i < end {
            let l1_min = l1_min_excess[i];
            let l1_excess = l1_block_excess[i];
            block_min = block_min.min(running_excess + l1_min);
            running_excess += l1_excess;
            i += 1;
        }

        l2_min_excess.push(block_min);
        l2_block_excess.push(running_excess);
    }

    (l2_min_excess, l2_block_excess)
}

/// Build L1 index using SSE4.1 SIMD with prefix sums and PHMINPOSUW.
///
/// Processes 8 words at a time using SIMD operations:
/// 1. Load min_excess and word_excess vectors
/// 2. Compute prefix sums of word_excess
/// 3. Add prefix sums to min_excess
/// 4. Find block minimum using PHMINPOSUW (with bias trick for signed values)
///
/// # SSE4.1 Signed Minimum Workaround
///
/// PHMINPOSUW (`_mm_minpos_epu16`) only works on **unsigned** 16-bit values.
/// Since min_excess can be negative, we bias values by adding 32768 before
/// the minimum operation, then unbias the result.
///
/// This gives the same result as ARM NEON's `vminvq_s16` which handles signed
/// values directly.
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.1")]
unsafe fn build_l1_index_sse41_impl(
    l0_min_excess: &[i8],
    l0_word_excess: &[i16],
    num_l1: usize,
) -> (Vec<i16>, Vec<i16>) {
    use core::arch::x86_64::*;

    let mut l1_min_excess = Vec::with_capacity(num_l1);
    let mut l1_block_excess = Vec::with_capacity(num_l1);

    let num_words = l0_min_excess.len();

    // Bias constant for signed->unsigned conversion (0x8000 = 32768)
    let bias = _mm_set1_epi16(i16::MIN); // 0x8000 as i16 = -32768, but bit pattern is 0x8000

    for block_idx in 0..num_l1 {
        let start = block_idx * FACTOR_L1;
        let end = (start + FACTOR_L1).min(num_words);
        let mut block_min: i16 = 0;
        let mut running_excess: i16 = 0;

        // Process 8 words at a time with SIMD
        let mut i = start;
        while i + 8 <= end {
            // Load 8 min_excess values (i8) and widen to i16
            let min_ptr = l0_min_excess.as_ptr().add(i);
            let min_i8 = _mm_loadl_epi64(min_ptr as *const __m128i); // Load 8 bytes into low 64 bits
            let min_lo = _mm_cvtepi8_epi16(min_i8); // Sign-extend i8 -> i16

            // Load 8 word_excess values (i16)
            let excess_ptr = l0_word_excess.as_ptr().add(i);
            let excess = _mm_loadu_si128(excess_ptr as *const __m128i);

            // Compute prefix sum of excess using parallel prefix pattern
            // SSE doesn't have VEXT, so we use shuffles and shifts

            // Step 1: Add adjacent pairs
            // shifted1 = [0, e0, e1, e2, e3, e4, e5, e6]
            let shifted1 = _mm_slli_si128(excess, 2); // Shift left by 2 bytes (1 i16)
            let sum1 = _mm_add_epi16(excess, shifted1);

            // Step 2: Add pairs of pairs
            // shifted2 = [0, 0, s0, s1, s2, s3, s4, s5]
            let shifted2 = _mm_slli_si128(sum1, 4); // Shift left by 4 bytes (2 i16)
            let sum2 = _mm_add_epi16(sum1, shifted2);

            // Step 3: Add quads
            // shifted4 = [0, 0, 0, 0, s0, s1, s2, s3]
            let shifted4 = _mm_slli_si128(sum2, 8); // Shift left by 8 bytes (4 i16)
            let prefix_sum = _mm_add_epi16(sum2, shifted4);

            // Add running_excess offset to prefix sum
            let offset = _mm_set1_epi16(running_excess);
            let adjusted_prefix = _mm_add_epi16(prefix_sum, offset);

            // We need exclusive prefix sum for min calculation
            // exclusive_prefix[i] = running_excess + sum(excess[0..i])
            // The prefix_sum is inclusive, so we shift right and insert running_excess
            let exclusive_prefix = _mm_slli_si128(adjusted_prefix, 2);
            // Insert running_excess at position 0
            let exclusive_prefix = _mm_insert_epi16(exclusive_prefix, running_excess as i32, 0);

            // Add to min_excess: min[i] + exclusive_prefix[i]
            let adjusted_min = _mm_add_epi16(min_lo, exclusive_prefix);

            // Find minimum using PHMINPOSUW with bias trick
            // 1. Add bias (0x8000) to convert signed -> unsigned range
            // 2. Find unsigned minimum with PHMINPOSUW
            // 3. Subtract bias to convert back to signed
            let biased = _mm_add_epi16(adjusted_min, bias);
            let minpos = _mm_minpos_epu16(biased);
            // minpos[0] contains the minimum value (still biased)
            let biased_min = _mm_extract_epi16(minpos, 0) as i16;
            // Unbias: subtract 0x8000 (add 0x8000 in signed arithmetic wraps correctly)
            let chunk_min = biased_min.wrapping_add(i16::MIN);

            block_min = block_min.min(chunk_min);

            // Update running excess with sum of all 8 values
            // Use horizontal add or extract and sum
            // SSE doesn't have vaddvq equivalent, so we reduce manually
            let sum_lo = _mm_add_epi16(excess, _mm_srli_si128(excess, 8)); // Add high 4 to low 4
            let sum_lo = _mm_add_epi16(sum_lo, _mm_srli_si128(sum_lo, 4)); // Add pairs
            let sum_lo = _mm_add_epi16(sum_lo, _mm_srli_si128(sum_lo, 2)); // Add final pair
            running_excess += _mm_extract_epi16(sum_lo, 0) as i16;

            i += 8;
        }

        // Handle remaining words (< 8) with scalar code
        while i < end {
            let word_min = l0_min_excess[i] as i16;
            let word_excess = l0_word_excess[i];
            block_min = block_min.min(running_excess + word_min);
            running_excess += word_excess;
            i += 1;
        }

        l1_min_excess.push(block_min);
        l1_block_excess.push(running_excess);
    }

    (l1_min_excess, l1_block_excess)
}

/// Safe wrapper for SSE4.1 L1 index building.
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
fn build_l1_index_sse41(
    l0_min_excess: &[i8],
    l0_word_excess: &[i16],
    num_l1: usize,
) -> (Vec<i16>, Vec<i16>) {
    // SAFETY: We check for SSE4.1 support before calling
    unsafe { build_l1_index_sse41_impl(l0_min_excess, l0_word_excess, num_l1) }
}

/// Build L2 index using SSE4.1 SIMD.
///
/// Uses the same SIMD pattern as L1 but processes L1 entries (all i16).
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.1")]
unsafe fn build_l2_index_sse41_impl(
    l1_min_excess: &[i16],
    l1_block_excess: &[i16],
    num_l2: usize,
) -> (Vec<i16>, Vec<i16>) {
    use core::arch::x86_64::*;

    let mut l2_min_excess = Vec::with_capacity(num_l2);
    let mut l2_block_excess = Vec::with_capacity(num_l2);

    let num_l1 = l1_min_excess.len();

    // Bias constant for signed->unsigned conversion
    let bias = _mm_set1_epi16(i16::MIN);

    for block_idx in 0..num_l2 {
        let start = block_idx * FACTOR_L2;
        let end = (start + FACTOR_L2).min(num_l1);
        let mut block_min: i16 = 0;
        let mut running_excess: i16 = 0;

        // Process 8 L1 blocks at a time with SIMD
        let mut i = start;
        while i + 8 <= end {
            // Load 8 min_excess values (i16)
            let min_ptr = l1_min_excess.as_ptr().add(i);
            let min_vals = _mm_loadu_si128(min_ptr as *const __m128i);

            // Load 8 block_excess values (i16)
            let excess_ptr = l1_block_excess.as_ptr().add(i);
            let excess = _mm_loadu_si128(excess_ptr as *const __m128i);

            // Compute prefix sum using parallel prefix pattern
            let shifted1 = _mm_slli_si128(excess, 2);
            let sum1 = _mm_add_epi16(excess, shifted1);

            let shifted2 = _mm_slli_si128(sum1, 4);
            let sum2 = _mm_add_epi16(sum1, shifted2);

            let shifted4 = _mm_slli_si128(sum2, 8);
            let prefix_sum = _mm_add_epi16(sum2, shifted4);

            // Add running_excess offset
            let offset = _mm_set1_epi16(running_excess);
            let adjusted_prefix = _mm_add_epi16(prefix_sum, offset);

            // Exclusive prefix sum
            let exclusive_prefix = _mm_slli_si128(adjusted_prefix, 2);
            let exclusive_prefix = _mm_insert_epi16(exclusive_prefix, running_excess as i32, 0);

            // Add to min_excess
            let adjusted_min = _mm_add_epi16(min_vals, exclusive_prefix);

            // Find minimum with bias trick
            let biased = _mm_add_epi16(adjusted_min, bias);
            let minpos = _mm_minpos_epu16(biased);
            let biased_min = _mm_extract_epi16(minpos, 0) as i16;
            let chunk_min = biased_min.wrapping_add(i16::MIN);

            block_min = block_min.min(chunk_min);

            // Horizontal sum for running excess
            let sum_lo = _mm_add_epi16(excess, _mm_srli_si128(excess, 8));
            let sum_lo = _mm_add_epi16(sum_lo, _mm_srli_si128(sum_lo, 4));
            let sum_lo = _mm_add_epi16(sum_lo, _mm_srli_si128(sum_lo, 2));
            running_excess += _mm_extract_epi16(sum_lo, 0) as i16;

            i += 8;
        }

        // Handle remaining L1 blocks (< 8) with scalar code
        while i < end {
            let l1_min = l1_min_excess[i];
            let l1_excess = l1_block_excess[i];
            block_min = block_min.min(running_excess + l1_min);
            running_excess += l1_excess;
            i += 1;
        }

        l2_min_excess.push(block_min);
        l2_block_excess.push(running_excess);
    }

    (l2_min_excess, l2_block_excess)
}

/// Safe wrapper for SSE4.1 L2 index building.
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
fn build_l2_index_sse41(
    l1_min_excess: &[i16],
    l1_block_excess: &[i16],
    num_l2: usize,
) -> (Vec<i16>, Vec<i16>) {
    // SAFETY: We check for SSE4.1 support before calling
    unsafe { build_l2_index_sse41_impl(l1_min_excess, l1_block_excess, num_l2) }
}

/// Scalar implementation of L1 index building for testing.
///
/// This is the reference implementation that SSE4.1 and NEON should match exactly.
#[cfg(test)]
#[allow(dead_code)]
fn build_l1_index_scalar(
    l0_min_excess: &[i8],
    l0_word_excess: &[i16],
    num_l1: usize,
) -> (Vec<i16>, Vec<i16>) {
    let mut l1_min_excess = Vec::with_capacity(num_l1);
    let mut l1_block_excess = Vec::with_capacity(num_l1);

    let num_words = l0_min_excess.len();

    for block_idx in 0..num_l1 {
        let start = block_idx * FACTOR_L1;
        let end = (start + FACTOR_L1).min(num_words);

        let mut block_min: i16 = 0;
        let mut running_excess: i16 = 0;

        for i in start..end {
            let word_min = l0_min_excess[i] as i16;
            let word_excess = l0_word_excess[i];
            block_min = block_min.min(running_excess + word_min);
            running_excess += word_excess;
        }

        l1_min_excess.push(block_min);
        l1_block_excess.push(running_excess);
    }

    (l1_min_excess, l1_block_excess)
}

/// Scalar implementation of L2 index building for testing.
#[cfg(test)]
#[allow(dead_code)]
fn build_l2_index_scalar(
    l1_min_excess: &[i16],
    l1_block_excess: &[i16],
    num_l2: usize,
) -> (Vec<i16>, Vec<i16>) {
    let mut l2_min_excess = Vec::with_capacity(num_l2);
    let mut l2_block_excess = Vec::with_capacity(num_l2);

    let num_l1 = l1_min_excess.len();

    for block_idx in 0..num_l2 {
        let start = block_idx * FACTOR_L2;
        let end = (start + FACTOR_L2).min(num_l1);

        let mut block_min: i16 = 0;
        let mut running_excess: i16 = 0;

        for i in start..end {
            let l1_min = l1_min_excess[i];
            let l1_excess = l1_block_excess[i];
            block_min = block_min.min(running_excess + l1_min);
            running_excess += l1_excess;
        }

        l2_min_excess.push(block_min);
        l2_block_excess.push(running_excess);
    }

    (l2_min_excess, l2_block_excess)
}

/// Fast byte-level scan to find where excess drops to 0.
///
/// Given a word starting at `start_bit`, scans bytes using lookup tables
/// to find where excess first reaches 0.
///
/// Returns `Some(bit_position)` if found within valid bits, or `None` if not found.
///
/// # Arguments
/// * `word` - The 64-bit word to scan
/// * `start_bit` - Starting bit position within the word (0-63)
/// * `initial_excess` - The excess value before scanning (must be >= 1)
/// * `valid_bits` - Number of valid bits in the word (for partial words)
#[inline]
fn find_close_in_word_fast(
    word: u64,
    start_bit: usize,
    initial_excess: i32,
    valid_bits: usize,
) -> Option<usize> {
    if start_bit >= valid_bits || initial_excess <= 0 {
        return None;
    }

    let bytes = word.to_le_bytes();
    let mut pos = start_bit;
    let mut excess = initial_excess;

    // Handle partial first byte if not byte-aligned
    let first_byte_idx = pos / 8;
    let bit_in_byte = pos % 8;

    if bit_in_byte != 0 {
        // Scan remaining bits in first partial byte
        let byte_val = bytes[first_byte_idx];
        let end_bit = 8.min(valid_bits - first_byte_idx * 8);

        for bit in bit_in_byte..end_bit {
            if (byte_val >> bit) & 1 == 1 {
                excess += 1;
            } else {
                excess -= 1;
                if excess == 0 {
                    return Some(first_byte_idx * 8 + bit);
                }
            }
        }
        pos = (first_byte_idx + 1) * 8;
    }

    // Process full bytes using lookup table
    while pos + 8 <= valid_bits && excess > 0 {
        let byte_idx = pos / 8;
        let byte_val = bytes[byte_idx] as usize;

        // Check if excess can drop to 0 in this byte
        let min_excess_in_byte = BYTE_MIN_EXCESS[byte_val] as i32;
        if excess + min_excess_in_byte <= 0 {
            // Match is in this byte - use lookup table
            if excess <= 16 {
                let lookup_idx = (excess - 1) as usize;
                let match_pos = BYTE_FIND_CLOSE[byte_val][lookup_idx];
                if match_pos < 8 {
                    return Some(pos + match_pos as usize);
                }
            }
            // Fallback: bit-by-bit scan for this byte (excess > 16 or lookup failed)
            let mut e = excess;
            for bit in 0..8 {
                if (byte_val >> bit) & 1 == 1 {
                    e += 1;
                } else {
                    e -= 1;
                    if e == 0 {
                        return Some(pos + bit);
                    }
                }
            }
        }

        excess += BYTE_TOTAL_EXCESS[byte_val] as i32;
        pos += 8;
    }

    // Handle remaining bits in last partial byte
    if pos < valid_bits && excess > 0 {
        let byte_idx = pos / 8;
        if byte_idx < 8 {
            let byte_val = bytes[byte_idx];
            let end_bit = valid_bits - pos;

            for bit in 0..end_bit {
                if (byte_val >> bit) & 1 == 1 {
                    excess += 1;
                } else {
                    excess -= 1;
                    if excess == 0 {
                        return Some(pos + bit);
                    }
                }
            }
        }
    }

    None
}

// ============================================================================
// Phase 1: Word-Level Operations
// ============================================================================

/// Find position of first unmatched close parenthesis in a 64-bit word.
///
/// Returns bit position (0-63) of the first `)` that doesn't have a matching `(`
/// to its left within the word. Returns 64 if no unmatched close exists.
///
/// Bits are: 1 = open `(`, 0 = close `)`
///
/// # Example
///
/// ```
/// use succinctly::bp::find_unmatched_close_in_word;
///
/// // All opens - no unmatched close
/// assert_eq!(find_unmatched_close_in_word(u64::MAX), 64);
///
/// // All closes - unmatched at position 0
/// assert_eq!(find_unmatched_close_in_word(0), 0);
///
/// // "()" padded with opens - no unmatched close
/// let pattern = 0b01 | (u64::MAX << 2);
/// assert_eq!(find_unmatched_close_in_word(pattern), 64);
/// ```
#[inline]
pub fn find_unmatched_close_in_word(x: u64) -> u32 {
    // Simple linear scan tracking excess
    let mut excess: i32 = 0;

    for bit in 0..64 {
        if (x >> bit) & 1 == 1 {
            excess += 1; // open
        } else {
            excess -= 1; // close
            if excess < 0 {
                return bit;
            }
        }
    }

    64 // No unmatched close found
}

/// Find matching close parenthesis within a single word.
///
/// Given an open parenthesis at bit position `p` (0-indexed from LSB),
/// returns the position of its matching close.
///
/// Returns `None` if the matching close is beyond this word.
///
/// # Example
///
/// ```
/// use succinctly::bp::find_close_in_word;
///
/// // "()" = 0b01 - open at 0 matches close at 1
/// assert_eq!(find_close_in_word(0b01, 0), Some(1));
///
/// // "(())" = 0b0011 - open at 0 matches close at 3
/// assert_eq!(find_close_in_word(0b0011, 0), Some(3));
///
/// // "(())" = 0b0011 - open at 1 matches close at 2
/// assert_eq!(find_close_in_word(0b0011, 1), Some(2));
/// ```
#[inline]
pub fn find_close_in_word(word: u64, p: u32) -> Option<u32> {
    if p >= 64 {
        return None;
    }

    // Shift so position p is at bit 0
    let shifted = word >> p;

    // If bit at p is 0 (close), it "matches itself" (degenerate case)
    if shifted & 1 == 0 {
        return Some(p);
    }

    // Skip the open at position 0, find first unmatched close after it
    let after_open = shifted >> 1;
    let remaining_bits = 63 - p;

    if remaining_bits == 0 {
        return None;
    }

    let result = find_unmatched_close_in_word(after_open);

    if result < remaining_bits {
        Some(p + 1 + result)
    } else {
        None // Match is in a later word
    }
}

// ============================================================================
// Phase 1: Vector-Level Linear Scan
// ============================================================================

/// Find matching close parenthesis in a bitvector (linear scan).
///
/// Scans word-by-word from position `p` until finding the matching close.
/// Time: O(distance to matching close / 64)
///
/// # Arguments
///
/// * `words` - The bit vector words (1=open, 0=close)
/// * `len` - Total number of valid bits
/// * `p` - Position of the open parenthesis to match
///
/// # Returns
///
/// Position of the matching close, or `None` if:
/// - `p` is out of bounds
/// - Position `p` is a close parenthesis
/// - No matching close exists (unbalanced)
///
/// # Example
///
/// ```
/// use succinctly::bp::find_close;
///
/// // "(()())" = bit0=1, bit1=1, bit2=0, bit3=1, bit4=0, bit5=0 = 0b001011
/// let words = vec![0b001011u64];
/// assert_eq!(find_close(&words, 6, 0), Some(5)); // outer pair
/// assert_eq!(find_close(&words, 6, 1), Some(2)); // first inner pair
/// assert_eq!(find_close(&words, 6, 3), Some(4)); // second inner pair
/// ```
pub fn find_close(words: &[u64], len: usize, p: usize) -> Option<usize> {
    if p >= len || words.is_empty() {
        return None;
    }

    let word_idx = p / 64;
    let bit_idx = (p % 64) as u32;

    // Check if position p is an open parenthesis
    if (words[word_idx] >> bit_idx) & 1 == 0 {
        return None; // Position p is a close, not an open
    }

    // Try to find match in the same word
    if let Some(local) = find_close_in_word(words[word_idx], bit_idx) {
        let result = word_idx * 64 + local as usize;
        if result < len {
            return Some(result);
        }
    }

    // Calculate excess from the partial first word
    let first_word_part = words[word_idx] >> bit_idx;
    let first_word_bits = (64 - bit_idx) as usize;
    let first_word_ones = first_word_part.count_ones() as i32;
    let mut excess = 2 * first_word_ones - first_word_bits as i32;
    // We started with excess = 1 (from the open at p), so adjust
    // Actually, we want excess = 1 after seeing the open at p
    // The first word after bit p has: excess = opens - closes = 2*opens - bits

    // Scan subsequent words
    for (i, &word) in words[word_idx + 1..].iter().enumerate() {
        let actual_word_idx = word_idx + 1 + i;
        let word_bits = if actual_word_idx * 64 + 64 <= len {
            64
        } else {
            len - actual_word_idx * 64
        };

        // Mask the word to only valid bits
        let masked_word = if word_bits == 64 {
            word
        } else {
            word & ((1u64 << word_bits) - 1)
        };

        // Compute the minimum excess reached within this word
        let (word_min, word_excess) = word_min_excess_i32(masked_word, word_bits);

        // Check if excess might drop to 0 within this word
        // The minimum excess reached is: excess + word_min
        if excess + word_min <= 0 {
            // The match might be in this word - do fine-grained search
            let mut bit_excess = excess;
            for bit in 0..word_bits {
                if (masked_word >> bit) & 1 == 1 {
                    bit_excess += 1;
                } else {
                    bit_excess -= 1;
                    if bit_excess == 0 {
                        return Some(actual_word_idx * 64 + bit);
                    }
                }
            }
        }

        excess += word_excess;

        if actual_word_idx * 64 >= len {
            break;
        }
    }

    None
}

/// Helper function to compute min excess and total excess for a word (i32 version).
/// Uses byte-level lookup tables for fast computation.
fn word_min_excess_i32(word: u64, valid_bits: usize) -> (i32, i32) {
    if valid_bits == 0 {
        return (0, 0);
    }

    let full_bytes = valid_bits / 8;
    let remaining_bits = valid_bits % 8;

    let mut running_excess: i32 = 0;
    let mut global_min: i32 = 0;

    // Process full bytes using lookup tables
    let bytes = word.to_le_bytes();
    for byte in bytes.iter().take(full_bytes) {
        let byte_val = *byte as usize;
        let byte_min = running_excess + BYTE_MIN_EXCESS[byte_val] as i32;
        global_min = global_min.min(byte_min);
        running_excess += BYTE_TOTAL_EXCESS[byte_val] as i32;
    }

    // Process remaining bits (0-7) with bit-by-bit scan
    if remaining_bits > 0 {
        let byte_val = bytes[full_bytes];
        let mut excess = running_excess;
        for bit in 0..remaining_bits {
            if (byte_val >> bit) & 1 == 1 {
                excess += 1;
            } else {
                excess -= 1;
                global_min = global_min.min(excess);
            }
        }
        running_excess = excess;
    }

    (global_min, running_excess)
}

/// Compute minimum excess and total excess for a single word.
///
/// Returns (min_excess, total_excess) where:
/// - min_excess: The minimum excess reached at any point in the word (negative if unmatched closes)
/// - total_excess: The total excess at the end of the word (opens - closes)
///
/// Uses byte-level lookup tables for ~20x faster computation compared to bit-by-bit scanning.
fn word_min_excess(word: u64, valid_bits: usize) -> (i8, i16) {
    if valid_bits == 0 {
        return (0, 0);
    }

    let full_bytes = valid_bits / 8;
    let remaining_bits = valid_bits % 8;

    let mut running_excess: i16 = 0;
    let mut global_min: i16 = 0;

    // Process full bytes using lookup tables
    let bytes = word.to_le_bytes();
    for byte in bytes.iter().take(full_bytes) {
        let byte_val = *byte as usize;
        // Min within this byte = running_excess + table lookup
        let byte_min = running_excess + BYTE_MIN_EXCESS[byte_val] as i16;
        global_min = global_min.min(byte_min);
        running_excess += BYTE_TOTAL_EXCESS[byte_val] as i16;
    }

    // Process remaining bits (0-7) with bit-by-bit scan
    if remaining_bits > 0 {
        let byte_val = bytes[full_bytes];
        let mut excess = running_excess;
        for bit in 0..remaining_bits {
            if (byte_val >> bit) & 1 == 1 {
                excess += 1;
            } else {
                excess -= 1;
                global_min = global_min.min(excess);
            }
        }
        running_excess = excess;
    }

    // Clamp min_excess to i8 range (always valid for 64-bit words: range is -64..64)
    let min_clamped = global_min.clamp(-128, 127) as i8;

    (min_clamped, running_excess)
}

/// Compute the maximum prefix excess when scanning a word backwards (from bit 63 to 0).
///
/// Returns `(max_excess, total_excess)` where:
/// - `max_excess`: maximum running excess reached during the backward scan
/// - `total_excess`: total change in excess after processing all 64 bits (= 2*popcount - 64)
///
/// This is used by `enclose()` to quickly determine if a word might contain
/// the enclosing parent without doing a bit-by-bit scan.
fn word_max_excess_rev(word: u64) -> (i32, i32) {
    let bytes = word.to_le_bytes();
    let mut running_excess: i32 = 0;
    let mut global_max: i32 = 0;

    // Process bytes from high to low (reverse order for backward scan)
    for byte in bytes.iter().rev() {
        let byte_val = *byte as usize;
        // Max within this byte when scanning backwards = running_excess + table lookup
        let byte_max = running_excess + BYTE_MAX_EXCESS_REV[byte_val] as i32;
        global_max = global_max.max(byte_max);
        running_excess += BYTE_TOTAL_EXCESS[byte_val] as i32;
    }

    (global_max, running_excess)
}

/// Find matching open parenthesis (scanning backwards).
///
/// Given a close parenthesis at position `p`, find the matching open.
///
/// # Arguments
///
/// * `words` - The bit vector words (1=open, 0=close)
/// * `len` - Total number of valid bits
/// * `p` - Position of the close parenthesis to match
///
/// # Returns
///
/// Position of the matching open, or `None` if:
/// - `p` is out of bounds
/// - Position `p` is an open parenthesis
/// - No matching open exists
pub fn find_open(words: &[u64], len: usize, p: usize) -> Option<usize> {
    if p >= len || words.is_empty() {
        return None;
    }

    let word_idx = p / 64;
    let bit_idx = p % 64;

    // Check if position p is a close parenthesis
    if (words[word_idx] >> bit_idx) & 1 == 1 {
        return None; // Position p is an open, not a close
    }

    // Scan backwards, tracking excess
    // We start with excess = -1 (we just saw a close)
    // We want to find where excess becomes 0 (the matching open)
    let mut excess: i32 = -1;

    // First, scan backwards in the current word
    for bit in (0..bit_idx).rev() {
        if (words[word_idx] >> bit) & 1 == 1 {
            excess += 1; // open
            if excess == 0 {
                return Some(word_idx * 64 + bit);
            }
        } else {
            excess -= 1; // close
        }
    }

    // Scan previous words
    for word_idx in (0..word_idx).rev() {
        let word = words[word_idx];

        // Scan this word bit by bit (backwards)
        for bit in (0..64).rev() {
            if (word >> bit) & 1 == 1 {
                excess += 1;
                if excess == 0 {
                    return Some(word_idx * 64 + bit);
                }
            } else {
                excess -= 1;
            }
        }
    }

    None
}

/// Find the enclosing open parenthesis (parent node).
///
/// Given an open parenthesis at position `p`, find the open parenthesis
/// that encloses it (i.e., the parent in the tree).
///
/// # Returns
///
/// Position of the enclosing open, or `None` if `p` is at the root.
pub fn enclose(words: &[u64], len: usize, p: usize) -> Option<usize> {
    if p == 0 || p >= len || words.is_empty() {
        return None;
    }

    // Check if position p is an open parenthesis
    let word_idx = p / 64;
    let bit_idx = p % 64;
    if (words[word_idx] >> bit_idx) & 1 == 0 {
        return None; // Position p is a close, not an open
    }

    // We want to find the first unmatched open before position p
    // This is equivalent to find_open(p-1) if p-1 is a close,
    // or finding where excess goes to -1 scanning backwards

    let mut excess: i32 = 0;

    // Scan backwards from position p-1
    let start_bit = if bit_idx > 0 { bit_idx - 1 } else { 63 };
    let start_word = if bit_idx > 0 {
        word_idx
    } else {
        word_idx.saturating_sub(1)
    };

    if word_idx == 0 && bit_idx == 0 {
        return None; // At the very beginning, no enclosing parent
    }

    // First, scan backwards in the starting word
    let first_word = words[start_word];
    for bit in (0..=start_bit).rev() {
        if (first_word >> bit) & 1 == 1 {
            excess += 1; // open
            if excess == 1 {
                return Some(start_word * 64 + bit);
            }
        } else {
            excess -= 1; // close
        }
    }

    // Scan previous words
    for word_idx in (0..start_word).rev() {
        let word = words[word_idx];
        let (max_excess_in_word, word_excess) = word_max_excess_rev(word);

        // Check if the match might be in this word.
        // When scanning backwards, the running excess could reach 1 if:
        //   excess + max_excess_in_word >= 1
        //
        // This is different from checking word_excess (total change).
        // Example: word ")(" = 0xAA...AA has word_excess=0 (32 opens, 32 closes)
        // but max_excess_in_word=1 (hits 1 at the first '(' when scanning back).
        if excess + max_excess_in_word >= 1 {
            for bit in (0..64).rev() {
                if (word >> bit) & 1 == 1 {
                    excess += 1;
                    if excess == 1 {
                        return Some(word_idx * 64 + bit);
                    }
                } else {
                    excess -= 1;
                }
            }
        } else {
            excess += word_excess;
        }
    }

    None
}

// ============================================================================
// RangeMin Structure for O(1) Operations
// ============================================================================

/// Number of 64-bit words per basic block for rank directory.
const WORDS_PER_RANK_BLOCK: usize = 8;

/// L1 factor: words per L1 block
const FACTOR_L1: usize = 32;
/// L2 factor: L1 blocks per L2 block
const FACTOR_L2: usize = 32;

/// Balanced parentheses with O(1) navigation and rank operations.
///
/// Stores auxiliary min-excess data at multiple levels for fast find_close/find_open,
/// plus a rank directory for O(1) rank1() queries.
///
/// Based on the RangeMin data structure from haskell-works.
///
/// # Space Overhead
///
/// Approximately 6-7% overhead:
/// - L0: 2 bytes per word (min excess + word excess)
/// - L1: 4 bytes per 32 words
/// - L2: 4 bytes per 1024 words
/// - Rank: ~1.5 bytes per 8 words
///
/// # Example
///
/// ```
/// use succinctly::bp::BalancedParens;
///
/// // "(()())" = bit0=1, bit1=1, bit2=0, bit3=1, bit4=0, bit5=0 = 0b001011
/// let bp = BalancedParens::new(vec![0b001011u64], 6);
///
/// // Find matching close for each open
/// assert_eq!(bp.find_close(0), Some(5)); // outer pair
/// assert_eq!(bp.find_close(1), Some(2)); // first inner pair
/// assert_eq!(bp.find_close(3), Some(4)); // second inner pair
/// ```
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct BalancedParens<W = Vec<u64>, S: SelectSupport = NoSelect> {
    /// The underlying bit vector (1=open, 0=close)
    words: W,
    /// Number of valid bits
    len: usize,
    /// Total number of 1-bits (open parentheses)
    total_ones: usize,

    // --- Min-excess indices for find_close (same as original) ---
    /// L0: Per-word min excess (signed, relative to start of word)
    l0_min_excess: Vec<i8>,
    /// L0: Per-word excess (total excess within this word)
    l0_word_excess: Vec<i16>,

    /// L1: Per-32-word block min excess (relative to block start)
    l1_min_excess: Vec<i16>,
    /// L1: Per-32-word block excess (total excess within this block)
    l1_block_excess: Vec<i16>,

    /// L2: Per-1024-word block min excess (relative to block start)
    l2_min_excess: Vec<i16>,
    /// L2: Per-1024-word block excess (total excess within this block)
    l2_block_excess: Vec<i16>,

    // --- Cumulative rank directory for O(1) rank1() ---
    /// Cumulative 1-bit count at the start of each 512-bit block (8 words)
    /// This is an absolute value from position 0.
    rank_l1: Vec<u32>,

    /// Cumulative 1-bit count within each 512-bit block (7 x 9-bit offsets packed)
    /// rank_l2[block] contains offsets for words 1-7 within the block.
    rank_l2: Vec<u64>,

    // --- Optional select support (zero-cost when NoSelect) ---
    /// Select index for O(1) select1 queries (ZST for NoSelect)
    select: S,
}

/// Build the V2 index structures with absolute cumulative rank.
/// Returns (l0_min_excess, l0_word_excess, l1_min_excess, l1_block_excess,
///          l2_min_excess, l2_block_excess, rank_l1, rank_l2, total_ones)
#[allow(clippy::type_complexity)]
fn build_bp_index(
    words: &[u64],
    len: usize,
) -> (
    Vec<i8>,
    Vec<i16>,
    Vec<i16>,
    Vec<i16>,
    Vec<i16>,
    Vec<i16>,
    Vec<u32>,
    Vec<u64>,
    usize,
) {
    if words.is_empty() || len == 0 {
        return (
            Vec::new(),
            Vec::new(),
            Vec::new(),
            Vec::new(),
            Vec::new(),
            Vec::new(),
            Vec::new(),
            Vec::new(),
            0,
        );
    }

    let num_words = words.len();
    let num_l1 = num_words.div_ceil(FACTOR_L1);
    let num_l2 = num_l1.div_ceil(FACTOR_L2);
    let num_rank_blocks = num_words.div_ceil(WORDS_PER_RANK_BLOCK);

    // Build L0: per-word statistics
    // Use NEON-optimized path on aarch64 with simd feature
    let (l0_min_excess, l0_word_excess) = build_l0_index(words, len, num_words);

    // Build L1: per-32-word block statistics
    // Use NEON SIMD with VMINV on aarch64, SSE4.1 with PHMINPOSUW on x86_64
    #[cfg(all(feature = "simd", target_arch = "aarch64"))]
    let (l1_min_excess, l1_block_excess) =
        build_l1_index_neon(&l0_min_excess, &l0_word_excess, num_l1);

    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    let (l1_min_excess, l1_block_excess) = {
        if is_x86_feature_detected!("sse4.1") {
            build_l1_index_sse41(&l0_min_excess, &l0_word_excess, num_l1)
        } else {
            // Fallback to scalar
            let mut l1_min_excess = Vec::with_capacity(num_l1);
            let mut l1_block_excess = Vec::with_capacity(num_l1);

            for block_idx in 0..num_l1 {
                let start = block_idx * FACTOR_L1;
                let end = (start + FACTOR_L1).min(num_words);

                let mut block_min: i16 = 0;
                let mut running_excess: i16 = 0;

                for i in start..end {
                    let word_min = l0_min_excess[i] as i16;
                    let word_excess = l0_word_excess[i];
                    block_min = block_min.min(running_excess + word_min);
                    running_excess += word_excess;
                }

                l1_min_excess.push(block_min);
                l1_block_excess.push(running_excess);
            }

            (l1_min_excess, l1_block_excess)
        }
    };

    #[cfg(not(any(
        all(feature = "simd", target_arch = "aarch64"),
        all(feature = "simd", target_arch = "x86_64")
    )))]
    let (l1_min_excess, l1_block_excess) = {
        let mut l1_min_excess = Vec::with_capacity(num_l1);
        let mut l1_block_excess = Vec::with_capacity(num_l1);

        for block_idx in 0..num_l1 {
            let start = block_idx * FACTOR_L1;
            let end = (start + FACTOR_L1).min(num_words);

            let mut block_min: i16 = 0;
            let mut running_excess: i16 = 0;

            for i in start..end {
                let word_min = l0_min_excess[i] as i16;
                let word_excess = l0_word_excess[i];
                block_min = block_min.min(running_excess + word_min);
                running_excess += word_excess;
            }

            l1_min_excess.push(block_min);
            l1_block_excess.push(running_excess);
        }

        (l1_min_excess, l1_block_excess)
    };

    // Build L2: per-1024-word block statistics
    // Use SIMD on ARM/x86 for consistency with L1 (benefit scales with data size)
    #[cfg(all(feature = "simd", target_arch = "aarch64"))]
    let (l2_min_excess, l2_block_excess) =
        build_l2_index_neon(&l1_min_excess, &l1_block_excess, num_l2);

    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    let (l2_min_excess, l2_block_excess) = {
        if is_x86_feature_detected!("sse4.1") {
            build_l2_index_sse41(&l1_min_excess, &l1_block_excess, num_l2)
        } else {
            // Fallback to scalar
            let mut l2_min_excess = Vec::with_capacity(num_l2);
            let mut l2_block_excess = Vec::with_capacity(num_l2);

            for block_idx in 0..num_l2 {
                let start = block_idx * FACTOR_L2;
                let end = (start + FACTOR_L2).min(num_l1);

                let mut block_min: i16 = 0;
                let mut running_excess: i16 = 0;

                for i in start..end {
                    let l1_min = l1_min_excess[i];
                    let l1_excess = l1_block_excess[i];
                    block_min = block_min.min(running_excess + l1_min);
                    running_excess += l1_excess;
                }

                l2_min_excess.push(block_min);
                l2_block_excess.push(running_excess);
            }

            (l2_min_excess, l2_block_excess)
        }
    };

    #[cfg(not(any(
        all(feature = "simd", target_arch = "aarch64"),
        all(feature = "simd", target_arch = "x86_64")
    )))]
    let (l2_min_excess, l2_block_excess) = {
        let mut l2_min_excess = Vec::with_capacity(num_l2);
        let mut l2_block_excess = Vec::with_capacity(num_l2);

        for block_idx in 0..num_l2 {
            let start = block_idx * FACTOR_L2;
            let end = (start + FACTOR_L2).min(num_l1);

            let mut block_min: i16 = 0;
            let mut running_excess: i16 = 0;

            for i in start..end {
                let l1_min = l1_min_excess[i];
                let l1_excess = l1_block_excess[i];
                block_min = block_min.min(running_excess + l1_min);
                running_excess += l1_excess;
            }

            l2_min_excess.push(block_min);
            l2_block_excess.push(running_excess);
        }

        (l2_min_excess, l2_block_excess)
    };

    // Build rank directory with ABSOLUTE cumulative values
    let mut rank_l1 = Vec::with_capacity(num_rank_blocks);
    let mut rank_l2 = Vec::with_capacity(num_rank_blocks);

    let mut cumulative_rank: u64 = 0;

    for block_idx in 0..num_rank_blocks {
        let block_start = block_idx * WORDS_PER_RANK_BLOCK;
        let block_end = (block_start + WORDS_PER_RANK_BLOCK).min(num_words);

        // Store absolute cumulative rank at block start
        rank_l1.push(cumulative_rank as u32);

        // Build L2 offsets within this block
        let mut l2_packed: u64 = 0;
        let mut block_cumulative: u16 = 0;

        for (i, word_idx) in (block_start..block_end).enumerate() {
            if i > 0 && i < 8 {
                // Pack offset into 9 bits at position (i-1)*9
                l2_packed |= (block_cumulative as u64) << ((i - 1) * 9);
            }
            block_cumulative += words[word_idx].count_ones() as u16;
        }

        rank_l2.push(l2_packed);
        cumulative_rank += block_cumulative as u64;
    }

    (
        l0_min_excess,
        l0_word_excess,
        l1_min_excess,
        l1_block_excess,
        l2_min_excess,
        l2_block_excess,
        rank_l1,
        rank_l2,
        cumulative_rank as usize,
    )
}

impl BalancedParens<Vec<u64>, NoSelect> {
    /// Build from an owned bitvector representing balanced parentheses.
    ///
    /// Creates a BalancedParens without select support (default for JSON).
    pub fn new(words: Vec<u64>, len: usize) -> Self {
        let (
            l0_min_excess,
            l0_word_excess,
            l1_min_excess,
            l1_block_excess,
            l2_min_excess,
            l2_block_excess,
            rank_l1,
            rank_l2,
            total_ones,
        ) = build_bp_index(&words, len);

        Self {
            words,
            len,
            total_ones,
            l0_min_excess,
            l0_word_excess,
            l1_min_excess,
            l1_block_excess,
            l2_min_excess,
            l2_block_excess,
            rank_l1,
            rank_l2,
            select: NoSelect,
        }
    }
}

impl BalancedParens<Vec<u64>, WithSelect> {
    /// Build from an owned bitvector with select support enabled.
    ///
    /// Creates a BalancedParens with O(1) select1 support (for YAML).
    pub fn new_with_select(words: Vec<u64>, len: usize) -> Self {
        let (
            l0_min_excess,
            l0_word_excess,
            l1_min_excess,
            l1_block_excess,
            l2_min_excess,
            l2_block_excess,
            rank_l1,
            rank_l2,
            total_ones,
        ) = build_bp_index(&words, len);

        let select = WithSelect::build(&words, total_ones);

        Self {
            words,
            len,
            total_ones,
            l0_min_excess,
            l0_word_excess,
            l1_min_excess,
            l1_block_excess,
            l2_min_excess,
            l2_block_excess,
            rank_l1,
            rank_l2,
            select,
        }
    }
}

impl<W: AsRef<[u64]>> BalancedParens<W, NoSelect> {
    /// Build from a generic storage type representing balanced parentheses.
    ///
    /// Creates a BalancedParens without select support (default for JSON).
    /// This is useful for borrowed data, e.g., from mmap.
    pub fn from_words(words: W, len: usize) -> Self {
        let (
            l0_min_excess,
            l0_word_excess,
            l1_min_excess,
            l1_block_excess,
            l2_min_excess,
            l2_block_excess,
            rank_l1,
            rank_l2,
            total_ones,
        ) = build_bp_index(words.as_ref(), len);

        Self {
            words,
            len,
            total_ones,
            l0_min_excess,
            l0_word_excess,
            l1_min_excess,
            l1_block_excess,
            l2_min_excess,
            l2_block_excess,
            rank_l1,
            rank_l2,
            select: NoSelect,
        }
    }
}

impl<W: AsRef<[u64]>> BalancedParens<W, WithSelect> {
    /// Build from a generic storage type with select support enabled.
    ///
    /// Creates a BalancedParens with O(1) select1 support (for YAML).
    /// This is useful for borrowed data, e.g., from mmap.
    pub fn from_words_with_select(words: W, len: usize) -> Self {
        let (
            l0_min_excess,
            l0_word_excess,
            l1_min_excess,
            l1_block_excess,
            l2_min_excess,
            l2_block_excess,
            rank_l1,
            rank_l2,
            total_ones,
        ) = build_bp_index(words.as_ref(), len);

        let select = WithSelect::build(words.as_ref(), total_ones);

        Self {
            words,
            len,
            total_ones,
            l0_min_excess,
            l0_word_excess,
            l1_min_excess,
            l1_block_excess,
            l2_min_excess,
            l2_block_excess,
            rank_l1,
            rank_l2,
            select,
        }
    }
}

impl<W: AsRef<[u64]>, S: SelectSupport> BalancedParens<W, S> {
    /// Get the length in bits.
    #[inline]
    pub fn len(&self) -> usize {
        self.len
    }

    /// Check if the bit vector is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Get a reference to the underlying words.
    #[inline]
    pub fn words(&self) -> &[u64] {
        self.words.as_ref()
    }

    /// Get the total number of 1-bits (open parentheses).
    #[inline]
    pub fn total_ones(&self) -> usize {
        self.total_ones
    }

    /// Find the position of the k-th 1-bit (0-indexed).
    ///
    /// Returns `None` if k >= total_ones.
    /// Uses the select index if available (WithSelect), otherwise returns None.
    #[inline]
    pub fn select1(&self, k: usize) -> Option<usize> {
        self.select
            .select1(self.words.as_ref(), self.len, self.total_ones, k)
    }

    /// Check if the bit at position p is a 1 (open parenthesis).
    #[inline]
    pub fn is_open(&self, p: usize) -> bool {
        if p >= self.len {
            return false;
        }
        let words = self.words.as_ref();
        let word_idx = p / 64;
        let bit_idx = p % 64;
        (words[word_idx] >> bit_idx) & 1 == 1
    }

    /// Check if the bit at position p is a 0 (close parenthesis).
    #[inline]
    pub fn is_close(&self, p: usize) -> bool {
        if p >= self.len {
            return false;
        }
        !self.is_open(p)
    }

    /// O(1) rank1: Count 1-bits (opens) in positions [0, p).
    ///
    /// This is the key improvement over the original BalancedParens.
    /// Uses absolute cumulative prefix sums for constant-time queries.
    #[inline]
    pub fn rank1(&self, p: usize) -> usize {
        if p == 0 {
            return 0;
        }
        let p = p.min(self.len);

        let words = self.words.as_ref();
        let word_idx = p / 64;
        let bit_idx = p % 64;

        // Handle boundary case: word_idx beyond actual words
        // This happens when p == len and len is a multiple of 64
        if word_idx >= words.len() {
            return self.rank1_slow(p);
        }

        // Which 512-bit block (8 words)?
        let block_idx = word_idx / WORDS_PER_RANK_BLOCK;
        let word_in_block = word_idx % WORDS_PER_RANK_BLOCK;

        // L1: absolute cumulative rank at block start
        let l1_rank = if block_idx < self.rank_l1.len() {
            self.rank_l1[block_idx] as usize
        } else {
            // Handle edge case: query beyond indexed range
            return self.rank1_slow(p);
        };

        // L2: offset within block
        let l2_rank = if word_in_block == 0 {
            0
        } else if block_idx < self.rank_l2.len() {
            let l2_packed = self.rank_l2[block_idx];
            let l2_idx = word_in_block - 1;
            ((l2_packed >> (l2_idx * 9)) & 0x1FF) as usize
        } else {
            return self.rank1_slow(p);
        };

        // Count remaining bits in partial word
        let partial = if bit_idx > 0 {
            let word = words[word_idx];
            let mask = (1u64 << bit_idx) - 1;
            (word & mask).count_ones() as usize
        } else {
            0
        };

        l1_rank + l2_rank + partial
    }

    /// Fallback slow rank1 for edge cases.
    fn rank1_slow(&self, p: usize) -> usize {
        let words = self.words.as_ref();
        let word_idx = p / 64;
        let bit_idx = p % 64;

        let mut count = 0usize;
        for word in words.iter().take(word_idx) {
            count += word.count_ones() as usize;
        }
        if bit_idx > 0 && word_idx < words.len() {
            let mask = (1u64 << bit_idx) - 1;
            count += (words[word_idx] & mask).count_ones() as usize;
        }
        count
    }

    /// Get excess at position p (number of opens minus closes in [0, p]).
    #[inline]
    pub fn excess(&self, p: usize) -> i32 {
        if p >= self.len {
            return 0;
        }
        // excess = 2 * rank1(p+1) - (p+1)
        // Because: opens contribute +1, closes contribute -1
        // rank1 counts opens, so closes = (p+1) - rank1
        // excess = rank1 - ((p+1) - rank1) = 2*rank1 - (p+1)
        let ones = self.rank1(p + 1) as i32;
        let total = (p + 1) as i32;
        2 * ones - total
    }

    /// Find matching close parenthesis.
    pub fn find_close(&self, p: usize) -> Option<usize> {
        if p >= self.len || self.is_close(p) {
            return None;
        }
        self.find_close_from(p + 1, 1)
    }

    /// Internal: find position where excess drops to 0.
    ///
    /// Uses byte-level lookup tables for fast scanning instead of bit-by-bit.
    fn find_close_from(&self, start_pos: usize, initial_excess: i32) -> Option<usize> {
        if start_pos >= self.len {
            return None;
        }

        #[derive(Clone, Copy, Debug)]
        enum State {
            ScanWord,
            CheckL0,
            CheckL1,
            CheckL2,
            FromL0,
            FromL1,
            FromL2,
        }

        let words = self.words.as_ref();
        let mut state = State::FromL0;
        let mut excess = initial_excess;
        let mut pos = start_pos;

        loop {
            match state {
                State::ScanWord => {
                    // Fast path: scan current word using byte-level lookup tables
                    if pos >= self.len {
                        return None;
                    }

                    let word_idx = pos / 64;
                    let bit_idx = pos % 64;
                    let word = words[word_idx];

                    // Calculate valid bits in this word
                    let valid_bits = if word_idx * 64 + 64 <= self.len {
                        64
                    } else {
                        self.len - word_idx * 64
                    };

                    // Use fast byte-level scan
                    if let Some(match_bit) =
                        find_close_in_word_fast(word, bit_idx, excess, valid_bits)
                    {
                        return Some(word_idx * 64 + match_bit);
                    }

                    // No match in this word - compute excess change and move to next word
                    // We need to compute excess change from bit_idx to end of valid bits
                    let remaining_word = word >> bit_idx;
                    let remaining_bits = valid_bits - bit_idx;
                    let ones = if remaining_bits == 64 {
                        remaining_word.count_ones() as i32
                    } else {
                        (remaining_word & ((1u64 << remaining_bits) - 1)).count_ones() as i32
                    };
                    excess += 2 * ones - remaining_bits as i32;

                    // Move to start of next word
                    pos = (word_idx + 1) * 64;
                    state = State::FromL0;
                }

                State::CheckL0 => {
                    debug_assert!(pos % 64 == 0);

                    let word_idx = pos / 64;
                    if word_idx >= self.l0_min_excess.len() {
                        return None;
                    }

                    let min_e = self.l0_min_excess[word_idx] as i32;
                    if excess + min_e <= 0 {
                        // Match is in this word - scan it
                        state = State::ScanWord;
                    } else {
                        // Skip this word entirely
                        let word_excess = self.l0_word_excess[word_idx] as i32;
                        excess += word_excess;
                        pos += 64;
                        state = State::FromL0;
                    }
                }

                State::CheckL1 => {
                    debug_assert!(pos % 64 == 0);

                    let l1_idx = pos / (64 * FACTOR_L1);
                    if l1_idx >= self.l1_min_excess.len() {
                        return None;
                    }

                    let min_e = self.l1_min_excess[l1_idx] as i32;
                    if excess + min_e <= 0 {
                        state = State::CheckL0;
                    } else if pos < self.len {
                        if self.is_close(pos) && excess <= 1 {
                            return Some(pos);
                        }
                        let l1_excess = self.l1_block_excess[l1_idx] as i32;
                        excess += l1_excess;
                        pos += 64 * FACTOR_L1;
                        state = State::FromL1;
                    } else {
                        return None;
                    }
                }

                State::CheckL2 => {
                    let l2_idx = pos / (64 * FACTOR_L1 * FACTOR_L2);
                    if l2_idx >= self.l2_min_excess.len() {
                        return None;
                    }

                    let min_e = self.l2_min_excess[l2_idx] as i32;
                    if excess + min_e <= 0 {
                        state = State::CheckL1;
                    } else if pos < self.len {
                        if self.is_close(pos) && excess <= 1 {
                            return Some(pos);
                        }
                        let l2_excess = self.l2_block_excess[l2_idx] as i32;
                        excess += l2_excess;
                        pos += 64 * FACTOR_L1 * FACTOR_L2;
                        state = State::FromL2;
                    } else {
                        return None;
                    }
                }

                State::FromL0 => {
                    if pos % 64 == 0 {
                        state = State::FromL1;
                    } else if pos < self.len {
                        state = State::ScanWord;
                    } else {
                        return None;
                    }
                }

                State::FromL1 => {
                    if pos % (64 * FACTOR_L1) == 0 {
                        if pos < self.len {
                            state = State::FromL2;
                        } else {
                            return None;
                        }
                    } else if pos < self.len {
                        state = State::CheckL0;
                    } else {
                        return None;
                    }
                }

                State::FromL2 => {
                    if pos % (64 * FACTOR_L1 * FACTOR_L2) == 0 {
                        if pos < self.len {
                            state = State::CheckL2;
                        } else {
                            return None;
                        }
                    } else if pos < self.len {
                        state = State::CheckL1;
                    } else {
                        return None;
                    }
                }
            }
        }
    }

    /// Find matching open parenthesis.
    ///
    /// Given a close parenthesis at position `p`, returns the position
    /// of its matching open.
    pub fn find_open(&self, p: usize) -> Option<usize> {
        if p >= self.len || self.is_open(p) {
            return None;
        }

        find_open(self.words.as_ref(), self.len, p)
    }

    /// Find enclosing open parenthesis (parent node).
    ///
    /// Given an open parenthesis at position `p`, returns the position
    /// of the open parenthesis that encloses it.
    pub fn enclose(&self, p: usize) -> Option<usize> {
        if p >= self.len || self.is_close(p) {
            return None;
        }

        enclose(self.words.as_ref(), self.len, p)
    }

    /// Navigate to parent node.
    ///
    /// Alias for `enclose`.
    pub fn parent(&self, p: usize) -> Option<usize> {
        self.enclose(p)
    }

    /// Navigate to next sibling in tree.
    pub fn next_sibling(&self, p: usize) -> Option<usize> {
        if !self.is_open(p) {
            return None;
        }
        let close = self.find_close(p)?;
        if close + 1 < self.len && self.is_open(close + 1) {
            Some(close + 1)
        } else {
            None
        }
    }

    /// Navigate to first child.
    pub fn first_child(&self, p: usize) -> Option<usize> {
        if !self.is_open(p) || p + 1 >= self.len {
            return None;
        }
        if self.is_open(p + 1) {
            Some(p + 1)
        } else {
            None
        }
    }

    /// Get depth of node at position p.
    ///
    /// The depth is the number of ancestors (including the node itself).
    /// For a root node (position 0), depth is 1.
    /// Returns `None` if position is out of bounds.
    ///
    /// # Example
    ///
    /// For the sequence "(()(()())) = 1101101000":
    /// - depth(1) = 1 (root)
    /// - depth(2) = 2 (first child)
    /// - depth(5) = 3 (grandchild)
    pub fn depth(&self, p: usize) -> Option<usize> {
        if p >= self.len {
            return None;
        }

        // Depth is the excess at position p (for an open) or excess at the matching open
        // For 1-indexed positions in Haskell, depth = excess
        // For 0-indexed, we compute excess up to and including position p
        // The excess at an open parenthesis equals its depth
        Some(self.excess(p) as usize)
    }

    /// Get the size of the subtree rooted at position p.
    ///
    /// The subtree size is the number of nodes (opens) in the subtree,
    /// not including the node at p itself.
    /// Returns `None` if position is out of bounds or not an open.
    ///
    /// # Example
    ///
    /// For the sequence "(()(()())) = 1101101000":
    /// - subtree_size(1) = 5 (root has 5 descendants: 2 children + 2 grandchildren + their closes)
    /// - subtree_size(2) = 1 (leaf node, subtree size = 1 for the () pair)
    pub fn subtree_size(&self, p: usize) -> Option<usize> {
        if p >= self.len || self.is_close(p) {
            return None;
        }

        // Subtree size = (find_close(p) - p - 1) / 2
        // The distance to matching close, minus 1 for the open itself,
        // divided by 2 since each node has an open and close.
        let close = self.find_close(p)?;
        Some((close - p) / 2)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // ========================================================================
    // Phase 1: Word-level tests
    // ========================================================================

    // Bit pattern reference:
    // "()" = open at pos 0, close at pos 1 = 0b01 (bit 0 = 1, bit 1 = 0)
    // "(())" = open at 0, open at 1, close at 2, close at 3 = 0b0011
    // "(()())" = open at 0, open at 1, close at 2, open at 3, close at 4, close at 5
    //         = bits: 1 1 0 1 0 0 = 0b001011

    #[test]
    fn test_find_unmatched_close_simple() {
        // Note: find_unmatched_close_in_word scans all 64 bits.
        // Unused high bits are 0s (closes), so small test values need padding.
        // Use u64::MAX with low bits cleared to represent "opens" in unused positions.

        // Full word of closes - first unmatched at position 0
        assert_eq!(find_unmatched_close_in_word(0), 0);

        // Full word of opens - no unmatched close
        assert_eq!(find_unmatched_close_in_word(u64::MAX), 64);

        // "()" padded with opens: opens in high bits
        // Positions 0-1: "()", positions 2-63: all opens
        // 0b01 | (all ones from bit 2 onwards) = 0xFFFF_FFFF_FFFF_FFFD
        let pattern = 0b01 | (u64::MAX << 2);
        assert_eq!(find_unmatched_close_in_word(pattern), 64); // balanced prefix, opens after
    }

    #[test]
    fn test_find_unmatched_close_nested() {
        // "(())" padded with opens in high bits
        // Pattern: 0b0011 in low 4 bits, opens (1s) in bits 4-63
        let pattern = 0b0011 | (u64::MAX << 4);
        assert_eq!(find_unmatched_close_in_word(pattern), 64);

        // "(()" = 0b011 padded with opens
        let pattern = 0b011 | (u64::MAX << 3);
        assert_eq!(find_unmatched_close_in_word(pattern), 64);
    }

    #[test]
    fn test_find_unmatched_close_with_extra() {
        // "())" padded with opens in high bits
        // Pattern: 0b001 in low 3 bits (open, close, close)
        // Unmatched close at position 2
        let pattern = 0b001 | (u64::MAX << 3);
        assert_eq!(find_unmatched_close_in_word(pattern), 2);

        // "(()))" padded with opens
        // Pattern: 0b00011 in low 5 bits
        // Unmatched close at position 4
        let pattern = 0b00011 | (u64::MAX << 5);
        assert_eq!(find_unmatched_close_in_word(pattern), 4);
    }

    #[test]
    fn test_find_close_in_word_simple() {
        // "()" = 0b01 - open at 0, close at 1
        assert_eq!(find_close_in_word(0b01, 0), Some(1));
    }

    #[test]
    fn test_find_close_in_word_nested() {
        // "(())" = 0b0011 - opens at 0,1; closes at 2,3
        assert_eq!(find_close_in_word(0b0011, 0), Some(3)); // outer
        assert_eq!(find_close_in_word(0b0011, 1), Some(2)); // inner
    }

    #[test]
    fn test_find_close_in_word_sequential() {
        // "()()" = 0b0101 - opens at 0,2; closes at 1,3
        assert_eq!(find_close_in_word(0b0101, 0), Some(1));
        assert_eq!(find_close_in_word(0b0101, 2), Some(3));
    }

    #[test]
    fn test_find_close_in_word_complex() {
        // "(()())" = opens at 0,1,3; closes at 2,4,5
        // bits: pos0=1, pos1=1, pos2=0, pos3=1, pos4=0, pos5=0
        // binary: 0b001011
        assert_eq!(find_close_in_word(0b001011, 0), Some(5)); // outer
        assert_eq!(find_close_in_word(0b001011, 1), Some(2)); // first inner
        assert_eq!(find_close_in_word(0b001011, 3), Some(4)); // second inner
    }

    #[test]
    fn test_find_close_in_word_at_close() {
        // "()" = 0b01 - if we ask at close position 1, returns 1
        assert_eq!(find_close_in_word(0b01, 1), Some(1));
    }

    #[test]
    fn test_find_close_in_word_beyond_word() {
        // "(" padded with opens - no matching close in word
        // Pattern is all 1s (all opens), so excess never goes negative
        assert_eq!(find_close_in_word(u64::MAX, 0), None);

        // "((" padded with opens - neither has matching close in word
        assert_eq!(find_close_in_word(u64::MAX, 0), None);
        assert_eq!(find_close_in_word(u64::MAX, 1), None);
    }

    // ========================================================================
    // Phase 1: Vector-level tests
    // ========================================================================

    #[test]
    fn test_find_close_single_word() {
        // "(()())" = 0b001011
        let words = vec![0b001011u64];
        assert_eq!(find_close(&words, 6, 0), Some(5)); // outer
        assert_eq!(find_close(&words, 6, 1), Some(2)); // first inner
        assert_eq!(find_close(&words, 6, 3), Some(4)); // second inner
    }

    #[test]
    fn test_find_close_multi_word() {
        // 64 opens followed by 64 closes
        let words = vec![u64::MAX, 0u64];
        let len = 128;

        // First open (bit 0) matches close at bit 127
        assert_eq!(find_close(&words, len, 0), Some(127));

        // Second open (bit 1) matches close at bit 126
        assert_eq!(find_close(&words, len, 1), Some(126));

        // Last open (bit 63) matches close at bit 64
        assert_eq!(find_close(&words, len, 63), Some(64));
    }

    #[test]
    fn test_find_close_at_close_returns_none() {
        // "(()())" = 0b001011 - closes at positions 2, 4, 5
        let words = vec![0b001011u64];
        assert_eq!(find_close(&words, 6, 2), None);
        assert_eq!(find_close(&words, 6, 4), None);
        assert_eq!(find_close(&words, 6, 5), None);
    }

    #[test]
    fn test_find_open_single_word() {
        // "(()())" = 0b001011 - opens at 0,1,3; closes at 2,4,5
        let words = vec![0b001011u64];
        assert_eq!(find_open(&words, 6, 5), Some(0)); // close at 5 matches open at 0
        assert_eq!(find_open(&words, 6, 2), Some(1)); // close at 2 matches open at 1
        assert_eq!(find_open(&words, 6, 4), Some(3)); // close at 4 matches open at 3
    }

    #[test]
    fn test_find_open_at_open_returns_none() {
        // "(()())" = 0b001011
        let words = vec![0b001011u64];
        assert_eq!(find_open(&words, 6, 0), None);
        assert_eq!(find_open(&words, 6, 1), None);
        assert_eq!(find_open(&words, 6, 3), None);
    }

    #[test]
    fn test_enclose_simple() {
        // "(())" = 0b0011 - opens at 0,1; closes at 2,3
        let words = vec![0b0011u64];

        // The inner open at position 1 is enclosed by the outer open at position 0
        assert_eq!(enclose(&words, 4, 1), Some(0));

        // The outer open at position 0 has no enclosing parent
        assert_eq!(enclose(&words, 4, 0), None);
    }

    #[test]
    fn test_enclose_nested() {
        // "((()))" = opens at 0,1,2; closes at 3,4,5
        // bits: 1 1 1 0 0 0 = 0b000111
        let words = vec![0b000111u64];

        assert_eq!(enclose(&words, 6, 2), Some(1)); // innermost enclosed by middle
        assert_eq!(enclose(&words, 6, 1), Some(0)); // middle enclosed by outer
        assert_eq!(enclose(&words, 6, 0), None); // outer has no parent
    }

    // ========================================================================
    // Phase 2: BalancedParens struct tests
    // ========================================================================

    #[test]
    fn test_balanced_parens_new() {
        // "(()())" = 0b001011
        let bp = BalancedParens::new(vec![0b001011u64], 6);
        assert_eq!(bp.len(), 6);
        assert!(!bp.is_empty());
    }

    #[test]
    fn test_balanced_parens_empty() {
        let bp = BalancedParens::new(vec![], 0);
        assert_eq!(bp.len(), 0);
        assert!(bp.is_empty());
    }

    #[test]
    fn test_balanced_parens_is_open_close() {
        // "(()())" = 0b001011 - opens at 0,1,3; closes at 2,4,5
        let bp = BalancedParens::new(vec![0b001011u64], 6);

        assert!(bp.is_open(0));
        assert!(bp.is_open(1));
        assert!(bp.is_close(2));
        assert!(bp.is_open(3));
        assert!(bp.is_close(4));
        assert!(bp.is_close(5));
    }

    #[test]
    fn test_balanced_parens_find_close() {
        // "(()())" = 0b001011
        let bp = BalancedParens::new(vec![0b001011u64], 6);

        assert_eq!(bp.find_close(0), Some(5));
        assert_eq!(bp.find_close(1), Some(2));
        assert_eq!(bp.find_close(3), Some(4));

        // At close positions, returns None
        assert_eq!(bp.find_close(2), None);
        assert_eq!(bp.find_close(4), None);
        assert_eq!(bp.find_close(5), None);
    }

    #[test]
    fn test_balanced_parens_find_open() {
        // "(()())" = 0b001011
        let bp = BalancedParens::new(vec![0b001011u64], 6);

        assert_eq!(bp.find_open(5), Some(0));
        assert_eq!(bp.find_open(2), Some(1));
        assert_eq!(bp.find_open(4), Some(3));

        // At open positions, returns None
        assert_eq!(bp.find_open(0), None);
        assert_eq!(bp.find_open(1), None);
        assert_eq!(bp.find_open(3), None);
    }

    #[test]
    fn test_balanced_parens_enclose() {
        // "(()())" = 0b001011
        let bp = BalancedParens::new(vec![0b001011u64], 6);

        assert_eq!(bp.enclose(1), Some(0)); // inner open enclosed by outer
        assert_eq!(bp.enclose(3), Some(0)); // inner open enclosed by outer
        assert_eq!(bp.enclose(0), None); // root has no parent
    }

    #[test]
    fn test_balanced_parens_navigation() {
        // "(()())" = 0b001011
        // Tree structure: root has two children
        let bp = BalancedParens::new(vec![0b001011u64], 6);

        // First child of root (position 0)
        assert_eq!(bp.first_child(0), Some(1));

        // Next sibling of first child
        assert_eq!(bp.next_sibling(1), Some(3));

        // No next sibling for second child
        assert_eq!(bp.next_sibling(3), None);

        // Parent navigation
        assert_eq!(bp.parent(1), Some(0));
        assert_eq!(bp.parent(3), Some(0));
        assert_eq!(bp.parent(0), None);
    }

    #[test]
    fn test_balanced_parens_multi_word() {
        // 64 opens followed by 64 closes
        let words = vec![u64::MAX, 0u64];
        let bp = BalancedParens::new(words, 128);

        // Outermost: open at 0, close at 127
        assert_eq!(bp.find_close(0), Some(127));

        // Second level: open at 1, close at 126
        assert_eq!(bp.find_close(1), Some(126));

        // Last nested: open at 63, close at 64
        assert_eq!(bp.find_close(63), Some(64));
    }

    #[test]
    fn test_balanced_parens_excess() {
        // "(())" = 0b0011 - opens at 0,1; closes at 2,3
        let bp = BalancedParens::new(vec![0b0011u64], 4);

        assert_eq!(bp.excess(0), 1); // after first open
        assert_eq!(bp.excess(1), 2); // after second open
        assert_eq!(bp.excess(2), 1); // after first close
        assert_eq!(bp.excess(3), 0); // after second close (balanced)
    }

    #[test]
    fn test_word_min_excess() {
        // "()" = 0b01 - open at 0, close at 1
        // After bit 0: excess = 1
        // After bit 1: excess = 0 (min reached)
        let (min, total) = word_min_excess(0b01, 2);
        assert_eq!(total, 0);
        assert_eq!(min, 0);

        // ")(" = 0b10 - close at 0, open at 1
        // After bit 0: excess = -1 (min)
        // After bit 1: excess = 0
        let (min, total) = word_min_excess(0b10, 2);
        assert_eq!(min, -1);
        assert_eq!(total, 0);

        // "((" = 0b11 - opens at 0,1
        let (min, total) = word_min_excess(0b11, 2);
        assert_eq!(min, 0); // never goes negative
        assert_eq!(total, 2);

        // "))" = 0b00 - closes at 0,1
        let (min, total) = word_min_excess(0b00, 2);
        assert_eq!(min, -2);
        assert_eq!(total, -2);
    }

    // ========================================================================
    // Roundtrip tests
    // ========================================================================

    #[test]
    fn test_find_close_open_roundtrip() {
        // "(()())" = 0b001011
        let bp = BalancedParens::new(vec![0b001011u64], 6);

        // For each open, find_close then find_open should return to original
        for p in [0, 1, 3] {
            let close = bp.find_close(p).unwrap();
            let open = bp.find_open(close).unwrap();
            assert_eq!(open, p, "roundtrip failed for position {}", p);
        }
    }

    #[test]
    fn test_balanced_parens_matches_linear_scan() {
        // "(()())" = 0b001011
        let words = vec![0b001011u64];
        let len = 6;
        let bp = BalancedParens::new(words.clone(), len);

        for p in 0..len {
            if bp.is_open(p) {
                let bp_result = bp.find_close(p);
                let linear_result = find_close(&words, len, p);
                assert_eq!(bp_result, linear_result, "mismatch at position {}", p);
            }
        }
    }

    // ========================================================================
    // Depth tests (matching Haskell SimpleSpec)
    // ========================================================================

    #[test]
    fn test_depth() {
        // "(()(()())) = 1101101000" in Haskell's bit order
        // Haskell uses 1-indexed positions, we use 0-indexed
        // Pattern: opens at 0,1,3,4,6; closes at 2,5,7,8,9
        // bits from LSB: 1 1 0 1 1 0 1 0 0 0 = 0b0001011011 = 91
        let bp = BalancedParens::new(vec![91u64], 10);

        // Haskell tests (1-indexed): depth 1=1, 2=2, 3=2, 4=2, 5=3, 6=3, 7=3, 8=3, 9=2, 10=1
        // Convert to 0-indexed: depth 0=1, 1=2, 2=2, 3=2, 4=3, 5=3, 6=3, 7=3, 8=2, 9=1
        assert_eq!(bp.depth(0), Some(1)); // depth 1 (root open)
        assert_eq!(bp.depth(1), Some(2)); // depth 2 (first child open)
        assert_eq!(bp.depth(2), Some(1)); // depth 1 (first child close - back to depth 1)
        assert_eq!(bp.depth(3), Some(2)); // depth 2 (second child open)
        assert_eq!(bp.depth(4), Some(3)); // depth 3 (grandchild open)
        assert_eq!(bp.depth(5), Some(2)); // depth 2 (grandchild close)
        assert_eq!(bp.depth(6), Some(3)); // depth 3 (another grandchild open)
        assert_eq!(bp.depth(7), Some(2)); // depth 2 (grandchild close)
        assert_eq!(bp.depth(8), Some(1)); // depth 1 (second child close)
        assert_eq!(bp.depth(9), Some(0)); // depth 0 (root close - balanced)
    }

    #[test]
    fn test_depth_simple() {
        // "(())" = 0b0011
        let bp = BalancedParens::new(vec![0b0011u64], 4);

        assert_eq!(bp.depth(0), Some(1)); // first open
        assert_eq!(bp.depth(1), Some(2)); // nested open
        assert_eq!(bp.depth(2), Some(1)); // first close
        assert_eq!(bp.depth(3), Some(0)); // second close (balanced)
    }

    // ========================================================================
    // Subtree size tests (matching Haskell SimpleSpec)
    // ========================================================================

    #[test]
    fn test_subtree_size() {
        // "(()(()())) = 1101101000" = 91
        // Tree structure:
        //   0 (root)
        //   ├── 1 (first child, leaf)
        //   └── 3 (second child)
        //       ├── 4 (grandchild, leaf)
        //       └── 6 (grandchild, leaf)
        let bp = BalancedParens::new(vec![91u64], 10);

        // Haskell (1-indexed): subtreeSize 1=5, 2=1, 3=0, 4=3, 5=1, 6=0, 7=1, 8=0, 9=0, 10=0
        // In Haskell, subtreeSize seems to count nodes in subtree
        // Our formula: (close - open) / 2 = number of pairs in subtree (excluding self)

        // Position 0 (root open): close at 9, subtree_size = (9-0)/2 = 4 (4 descendant nodes)
        assert_eq!(bp.subtree_size(0), Some(4));

        // Position 1 (first child open): close at 2, subtree_size = (2-1)/2 = 0 (leaf)
        assert_eq!(bp.subtree_size(1), Some(0));

        // Position 2 is a close, should return None
        assert_eq!(bp.subtree_size(2), None);

        // Position 3 (second child open): close at 8, subtree_size = (8-3)/2 = 2 (2 grandchildren)
        assert_eq!(bp.subtree_size(3), Some(2));

        // Position 4 (grandchild open): close at 5, subtree_size = (5-4)/2 = 0 (leaf)
        assert_eq!(bp.subtree_size(4), Some(0));

        // Position 5 is a close
        assert_eq!(bp.subtree_size(5), None);

        // Position 6 (grandchild open): close at 7, subtree_size = (7-6)/2 = 0 (leaf)
        assert_eq!(bp.subtree_size(6), Some(0));
    }

    #[test]
    fn test_subtree_size_simple() {
        // "(())" = 0b0011
        let bp = BalancedParens::new(vec![0b0011u64], 4);

        // Position 0 (outer open): close at 3, subtree_size = (3-0)/2 = 1
        assert_eq!(bp.subtree_size(0), Some(1));

        // Position 1 (inner open): close at 2, subtree_size = (2-1)/2 = 0 (leaf)
        assert_eq!(bp.subtree_size(1), Some(0));

        // Closes return None
        assert_eq!(bp.subtree_size(2), None);
        assert_eq!(bp.subtree_size(3), None);
    }

    #[test]
    fn test_find_close_multi_word_regression() {
        // Regression test from property test failure - tests that accelerated
        // find_close matches linear scan for a multi-word bit vector
        let words: Vec<u64> = vec![
            6148914691236517205,
            17768248446614328661,
            7652237512791978695,
            10679207262162398901,
            11775793721244512726,
            8946029940302164653,
            4836752526886871298,
            6851556326407803519,
            4920914883505671372,
            2108412058774050545,
            11226564708885533661,
            14782248937799897235,
            3628355826111158270,
            16652187963763892183,
            15371706386568957536,
            56015962,
        ];
        let len = 992;
        let p = 153;

        let bp = BalancedParens::new(words.clone(), len);
        let linear_result = find_close(&words, len, p);
        let bp_result = bp.find_close(p);

        // First, make sure we're testing an open position
        assert!(bp.is_open(p), "Position {} should be open", p);

        // The results should match
        assert_eq!(
            bp_result, linear_result,
            "find_close({}) mismatch: accelerated={:?}, linear={:?}",
            p, bp_result, linear_result
        );
    }

    // ========================================================================
    // Edge case tests for block boundaries and special cases
    // ========================================================================

    #[test]
    fn test_single_pair() {
        // Simplest possible: "()" = 0b01
        let bp = BalancedParens::new(vec![0b01], 2);
        assert_eq!(bp.find_close(0), Some(1));
        assert_eq!(bp.find_open(1), Some(0));
        assert_eq!(bp.depth(0), Some(1));
        assert_eq!(bp.depth(1), Some(0));
        assert_eq!(bp.subtree_size(0), Some(0));
    }

    #[test]
    fn test_find_close_at_word_boundary() {
        // Create a sequence where open is at bit 63 (end of word 0)
        // and close is at bit 64 (start of word 1)
        // Pattern: 63 opens, then open at 63, close at 64, then 63 closes
        // This tests the word boundary transition
        let words = vec![u64::MAX, 0u64]; // 64 opens, 64 closes
        let len = 128;
        let bp = BalancedParens::new(words.clone(), len);

        // Open at position 63 should match close at position 64
        assert_eq!(bp.find_close(63), Some(64));
        assert_eq!(find_close(&words, len, 63), Some(64));

        // Open at position 0 should match close at position 127
        assert_eq!(bp.find_close(0), Some(127));
    }

    #[test]
    fn test_find_close_spanning_multiple_words() {
        // 128 opens followed by 128 closes (4 words total)
        let words = vec![u64::MAX, u64::MAX, 0u64, 0u64];
        let len = 256;
        let bp = BalancedParens::new(words.clone(), len);

        // First open matches last close
        assert_eq!(bp.find_close(0), Some(255));
        assert_eq!(find_close(&words, len, 0), Some(255));

        // Open at 127 matches close at 128
        assert_eq!(bp.find_close(127), Some(128));
        assert_eq!(find_close(&words, len, 127), Some(128));
    }

    #[test]
    fn test_find_close_l1_boundary() {
        // L1 boundary is at 32 words = 2048 bits
        // Create: 2048 opens followed by 2048 closes
        let num_opens = 32; // words
        let mut words = vec![u64::MAX; num_opens]; // 32 words of opens
        words.extend(std::iter::repeat_n(0u64, num_opens)); // 32 words of closes
        let len = 64 * 64; // 4096 bits
        let bp = BalancedParens::new(words.clone(), len);

        // Open at position 0 should match close at position 4095
        assert_eq!(bp.find_close(0), Some(4095));
        assert_eq!(find_close(&words, len, 0), Some(4095));

        // Open at position 2047 (last open) matches close at 2048 (first close)
        assert_eq!(bp.find_close(2047), Some(2048));
        assert_eq!(find_close(&words, len, 2047), Some(2048));

        // Test some positions near L1 boundary (2048 bits)
        for p in [2040, 2044, 2046, 2047] {
            let bp_result = bp.find_close(p);
            let linear_result = find_close(&words, len, p);
            assert_eq!(
                bp_result, linear_result,
                "L1 boundary: find_close({}) mismatch",
                p
            );
        }
    }

    #[test]
    fn test_find_close_across_l1_boundary() {
        // Test where open is before L1 boundary and close is after
        // Create pattern: many opens, then balanced pairs around boundary
        // 31 words of opens (1984 bits), then "()" pairs, then closes

        // Simple version: open at bit 2000, close somewhere after L1 boundary
        let mut words = vec![u64::MAX; 64]; // Start with all opens
        let len = 64 * 64;

        // Turn some bits into closes to create a match across L1 boundary
        // Position 2048 is the L1 boundary (word 32, bit 0)
        // Put a close there to match an open before the boundary
        words[32] = 0; // Word 32 = all closes

        let bp = BalancedParens::new(words.clone(), len);

        // Open at 2047 (word 31, bit 63) should match close at 2048 (word 32, bit 0)
        assert_eq!(bp.find_close(2047), Some(2048));
        assert_eq!(find_close(&words, len, 2047), Some(2048));
    }

    #[test]
    fn test_partial_last_word() {
        // Test with len not a multiple of 64
        // "(())" but len = 4 (only 4 valid bits)
        let bp = BalancedParens::new(vec![0b0011u64], 4);
        assert_eq!(bp.find_close(0), Some(3));
        assert_eq!(bp.find_close(1), Some(2));

        // Larger partial word test: 100 bits (1 full word + 36 bits)
        // Pattern: 50 opens followed by 50 closes
        // For 50 opens then 50 closes:
        // Word 0: bits 0-49 = opens (1), bits 50-63 = closes (0)
        // Word 1: bits 0-35 = closes (0), bits 36-63 = invalid
        let word0 = (1u64 << 50) - 1; // bits 0-49 are 1
        let word1 = 0u64; // all closes
        let words = vec![word0, word1];
        let len = 100;
        let bp = BalancedParens::new(words.clone(), len);

        // Open at 0 matches close at 99
        assert_eq!(bp.find_close(0), Some(99));
        assert_eq!(find_close(&words, len, 0), Some(99));

        // Open at 49 matches close at 50
        assert_eq!(bp.find_close(49), Some(50));
        assert_eq!(find_close(&words, len, 49), Some(50));
    }

    #[test]
    fn test_deeply_nested() {
        // Create very deep nesting: (((((...))))) with 32 levels
        // 32 opens followed by 32 closes = 64 bits = 1 word
        let word = (1u64 << 32) - 1; // bits 0-31 are opens
        let bp = BalancedParens::new(vec![word], 64);

        // Open at 0 matches close at 63
        assert_eq!(bp.find_close(0), Some(63));

        // Open at 31 (innermost) matches close at 32
        assert_eq!(bp.find_close(31), Some(32));

        // Test depths
        assert_eq!(bp.depth(0), Some(1));
        assert_eq!(bp.depth(31), Some(32));
        assert_eq!(bp.depth(32), Some(31)); // After first close
    }

    #[test]
    fn test_alternating_pairs() {
        // ()()()()... - sequential pairs
        // Pattern: 0b01 repeated = 0x5555555555555555
        let word = 0x5555555555555555u64;
        let bp = BalancedParens::new(vec![word], 64);

        // Each open at even position matches close at odd position
        for i in 0..32 {
            let open = i * 2;
            let close = i * 2 + 1;
            assert_eq!(bp.find_close(open), Some(close));
            assert_eq!(bp.find_open(close), Some(open));
        }
    }

    #[test]
    fn test_out_of_bounds() {
        let bp = BalancedParens::new(vec![0b01u64], 2);

        // Out of bounds positions should return None
        assert_eq!(bp.find_close(2), None);
        assert_eq!(bp.find_close(100), None);
        assert_eq!(bp.find_open(2), None);
        assert_eq!(bp.find_open(100), None);
        assert_eq!(bp.depth(2), None);
        assert_eq!(bp.depth(100), None);
        assert_eq!(bp.subtree_size(2), None);
    }

    #[test]
    fn test_empty() {
        let bp = BalancedParens::new(vec![], 0);
        assert!(bp.is_empty());
        assert_eq!(bp.len(), 0);
        assert_eq!(bp.find_close(0), None);
        assert_eq!(bp.find_open(0), None);
    }

    #[test]
    fn test_unbalanced_sequence() {
        // "((" = 0b11 - two opens, no closes
        // This is an invalid BP sequence, but the functions should handle it gracefully
        let words = vec![0b11u64];
        let bp = BalancedParens::new(words.clone(), 2);

        // find_close should return None for unmatched opens
        assert_eq!(bp.find_close(0), None);
        assert_eq!(bp.find_close(1), None);
        assert_eq!(find_close(&words, 2, 0), None);
    }

    #[test]
    fn test_all_closes() {
        // "))" = 0b00 - two closes, no opens
        let words = vec![0u64];
        let bp = BalancedParens::new(words.clone(), 2);

        // find_close on close positions should return None
        assert_eq!(bp.find_close(0), None);
        assert_eq!(bp.find_close(1), None);
    }

    #[test]
    fn test_find_close_near_end() {
        // Test find_close where match is near the end of the bit vector
        // "(())" but the close at position 3 is near len
        let bp = BalancedParens::new(vec![0b0011u64], 4);
        assert_eq!(bp.find_close(0), Some(3));

        // Larger test: open near the beginning, close at the very end
        let words = vec![u64::MAX, 0u64];
        let len = 128;
        let bp = BalancedParens::new(words.clone(), len);
        assert_eq!(bp.find_close(0), Some(127));
    }

    #[test]
    fn test_index_structure_correctness() {
        // Verify the L0/L1/L2 indices are computed correctly
        // for a specific known pattern
        let words = vec![0b001011u64]; // "(()())" - balanced
        let bp = BalancedParens::new(words, 6);

        // L0 min_excess for this word should reflect the minimum reached
        // Scanning: 1, 2, 1, 2, 1, 0 - min is 0
        assert_eq!(bp.l0_min_excess[0], 0);

        // L0 word_excess should be 0 (balanced)
        assert_eq!(bp.l0_word_excess[0], 0);
    }

    #[test]
    fn test_word_min_excess_full_word() {
        // Test word_min_excess with full 64-bit patterns

        // All opens: excess goes 1,2,3,...,64, min=0
        let (min, total) = word_min_excess(u64::MAX, 64);
        assert_eq!(min, 0);
        assert_eq!(total, 64);

        // All closes: excess goes -1,-2,...,-64, min=-64
        let (min, total) = word_min_excess(0, 64);
        assert_eq!(min, -64);
        assert_eq!(total, -64);

        // Alternating ()()()...: min=0, total=0
        let (min, total) = word_min_excess(0x5555555555555555, 64);
        assert_eq!(min, 0);
        assert_eq!(total, 0);

        // Alternating )()()(... (starting with close): min=-1, total=0
        let (min, total) = word_min_excess(0xAAAAAAAAAAAAAAAAu64, 64);
        assert_eq!(min, -1);
        assert_eq!(total, 0);
    }

    #[test]
    fn test_l1_index_construction() {
        // Test that L1 indices are built correctly
        // Create exactly 32 words to have one full L1 block
        let words = vec![0x5555555555555555u64; 32]; // ()()()... - balanced
        let len = 32 * 64;
        let bp = BalancedParens::new(words, len);

        // Should have exactly 1 L1 block
        assert_eq!(bp.l1_min_excess.len(), 1);
        assert_eq!(bp.l1_block_excess.len(), 1);

        // L1 should also show balanced (block_excess = 0)
        assert_eq!(bp.l1_block_excess[0], 0);
    }

    #[test]
    fn test_multiple_l1_blocks() {
        // Create 64 words = 2 L1 blocks
        // First L1 block: 32 words of opens (positive excess)
        let mut words = vec![u64::MAX; 32];
        // Second L1 block: 32 words of closes (negative excess)
        words.extend(std::iter::repeat_n(0u64, 32));
        let len = 64 * 64;
        let bp = BalancedParens::new(words.clone(), len);

        // Should have 2 L1 blocks
        assert_eq!(bp.l1_min_excess.len(), 2);

        // First L1 block: all opens, min_excess = 0, block_excess = 32*64 = 2048
        assert_eq!(bp.l1_min_excess[0], 0);
        assert_eq!(bp.l1_block_excess[0], 2048);

        // Second L1 block: all closes, min_excess = -2048, block_excess = -2048
        assert_eq!(bp.l1_min_excess[1], -2048);
        assert_eq!(bp.l1_block_excess[1], -2048);

        // Test find_close across L1 boundary
        assert_eq!(bp.find_close(2047), Some(2048));
        assert_eq!(find_close(&words, len, 2047), Some(2048));
    }

    #[test]
    fn test_find_close_immediate_match() {
        // Test cases where match is immediately after the open
        // "()" repeated
        let word = 0x5555555555555555u64;
        let bp = BalancedParens::new(vec![word], 64);

        for i in 0..32 {
            assert_eq!(bp.find_close(i * 2), Some(i * 2 + 1));
        }
    }

    #[test]
    fn test_sibling_chain() {
        // "()()()..." - chain of siblings
        let word = 0x5555555555555555u64;
        let bp = BalancedParens::new(vec![word], 64);

        // Each pair is a sibling of the next
        for i in 0..31 {
            let pos = i * 2;
            assert_eq!(bp.next_sibling(pos), Some(pos + 2));
        }
        // Last sibling has no next
        assert_eq!(bp.next_sibling(62), None);
    }

    #[test]
    fn test_no_first_child_for_leaf() {
        // "()" - leaf node has no first child
        let bp = BalancedParens::new(vec![0b01u64], 2);
        assert_eq!(bp.first_child(0), None);
    }

    #[test]
    fn test_first_child_exists() {
        // "(())" - root has first child
        let bp = BalancedParens::new(vec![0b0011u64], 4);
        assert_eq!(bp.first_child(0), Some(1));
        assert_eq!(bp.first_child(1), None); // inner leaf
    }

    // ========================================================================
    // BalancedParens tests
    // ========================================================================

    #[test]
    fn test_v2_rank1_simple() {
        // "(()())" = 0b001011 - bits: 1 1 0 1 0 0
        let bp = BalancedParens::new(vec![0b001011u64], 6);

        // rank1(p) = count of 1s in [0, p)
        assert_eq!(bp.rank1(0), 0); // no bits before 0
        assert_eq!(bp.rank1(1), 1); // bit 0 is 1
        assert_eq!(bp.rank1(2), 2); // bits 0,1 are 1
        assert_eq!(bp.rank1(3), 2); // bit 2 is 0
        assert_eq!(bp.rank1(4), 3); // bit 3 is 1
        assert_eq!(bp.rank1(5), 3); // bit 4 is 0
        assert_eq!(bp.rank1(6), 3); // bit 5 is 0
    }

    #[test]
    fn test_v2_rank1_matches_v1() {
        // Compare V1 and V2 rank1 on various patterns
        // Note: V1 has an edge case bug at p==len when len is word-aligned,
        // so we test up to len-1 for those cases
        let patterns: &[(Vec<u64>, usize)] = &[
            (vec![0b001011u64], 6),
            (vec![0b0011u64], 4),
            (vec![0x5555555555555555u64], 64), // alternating "()()()"
            (vec![u64::MAX, 0], 128),          // all opens then all closes
        ];

        for (words, len) in patterns {
            let v1 = BalancedParens::new(words.clone(), *len);
            let v2 = BalancedParens::new(words.clone(), *len);

            // Test positions 0 to len-1 (avoiding V1 edge case bug at p==len)
            for p in 0..*len {
                assert_eq!(
                    v1.rank1(p),
                    v2.rank1(p),
                    "rank1({}) mismatch for pattern {:?}",
                    p,
                    words
                );
            }
        }
    }

    #[test]
    fn test_v2_find_close_matches_v1() {
        // Compare V1 and V2 find_close on various patterns
        let patterns: &[(Vec<u64>, usize)] = &[
            (vec![0b001011u64], 6),
            (vec![0b0011u64], 4),
            (vec![0x5555555555555555u64], 64),
        ];

        for (words, len) in patterns {
            let v1 = BalancedParens::new(words.clone(), *len);
            let v2 = BalancedParens::new(words.clone(), *len);

            for p in 0..*len {
                if v1.is_open(p) {
                    assert_eq!(
                        v1.find_close(p),
                        v2.find_close(p),
                        "find_close({}) mismatch for pattern {:?}",
                        p,
                        words
                    );
                }
            }
        }
    }

    #[test]
    fn test_v2_excess() {
        // "(()())" = 0b001011
        let bp = BalancedParens::new(vec![0b001011u64], 6);

        // excess(p) = opens - closes in [0, p]
        assert_eq!(bp.excess(0), 1); // 1 open
        assert_eq!(bp.excess(1), 2); // 2 opens
        assert_eq!(bp.excess(2), 1); // 2 opens, 1 close
        assert_eq!(bp.excess(3), 2); // 3 opens, 1 close
        assert_eq!(bp.excess(4), 1); // 3 opens, 2 closes
        assert_eq!(bp.excess(5), 0); // 3 opens, 3 closes
    }

    #[test]
    fn test_v2_large_bitvector() {
        // Test with multiple 512-bit blocks to verify the rank directory
        // 64 words = 8 blocks of 8 words each
        let mut words = Vec::with_capacity(64);

        // First 32 words: all opens (1s)
        words.extend(std::iter::repeat_n(u64::MAX, 32));
        // Next 32 words: all closes (0s)
        words.extend(std::iter::repeat_n(0u64, 32));

        let len = 64 * 64; // 4096 bits
        let v1 = BalancedParens::new(words.clone(), len);
        let v2 = BalancedParens::new(words, len);

        // Test rank1 at various positions
        let test_positions = [0, 1, 63, 64, 512, 1024, 2048, 2049, 4095, 4096];
        for &p in &test_positions {
            assert_eq!(
                v1.rank1(p),
                v2.rank1(p),
                "rank1({}) mismatch on large bitvector",
                p
            );
        }

        // Test find_close
        assert_eq!(v1.find_close(0), v2.find_close(0));
        assert_eq!(v1.find_close(2047), v2.find_close(2047));
    }

    #[test]
    fn test_enclose_word_boundary_with_zero_excess() {
        // Regression test for a bug where enclose() would skip words when the
        // word's total excess was 0 (equal opens and closes), even though the
        // enclosing parent was within that word.
        //
        // The bug was in the condition:
        //   if excess + word_excess >= 1  // WRONG: checks final excess
        // Fixed to:
        //   if excess + max_excess_in_word >= 1  // CORRECT: checks max during scan
        //
        // To trigger the bug, we need:
        // 1. Position p in word N where we call enclose(p)
        // 2. When scanning backwards from p, the partial scan of word N gives excess = 0
        // 3. Word N-1 has word_excess = 0 but contains the enclosing parent
        //
        // This means the buggy code checks: 0 + 0 >= 1 = false, and SKIPS word N-1

        let mut words = vec![0u64; 4];

        // Word 0: Root structure
        words[0] = 0x0000_0000_0000_0001; // Just bit 0 is open (root)

        // Word 1: Use ")(" pattern = 0xAAAA... (close at even bits, open at odd bits)
        // This has word_excess = 0 but when scanning backwards from bit 63:
        //   bit 63: 1 (open) -> excess = +1 -> FOUND!
        // So max_excess_backward = 1
        words[1] = 0xAAAA_AAAA_AAAA_AAAA;

        // Word 2: ALSO use ")(" pattern
        // When scanning this word in the FULL word loop, max_excess = 1 but word_excess = 0
        words[2] = 0xAAAA_AAAA_AAAA_AAAA;

        // Word 3: Open at bit 2 with balanced prefix (bits 0,1 = open,close = "()")
        // Pattern: bit 0 = 1 (open), bit 1 = 0 (close), bit 2 = 1 (open) = 0b101 = 0x5
        // When we call enclose(194) (word 3 bit 2):
        // - Partial word 3 scan: bits 1, 0 = close, open -> excess = -1 + 1 = 0
        // - Now we check word 2 (the FULL word loop)
        // - Word 2 has word_excess = 0
        // - Buggy code: 0 + 0 >= 1 is FALSE -> SKIP word 2 (BUG!)
        // - Fixed code: 0 + 1 >= 1 is TRUE -> scan word 2, find parent at bit 63
        words[3] = 0x0000_0000_0000_0005; // Open at bit 0, close at bit 1, open at bit 2

        let len = 256; // 4 words
        let parent = enclose(&words, len, 194);

        assert_eq!(
            parent,
            Some(191), // Word 2 bit 63
            "enclose(194) should find parent at position 191 (word 2 bit 63). \
             With the buggy code that checked word_excess instead of max_excess, \
             word 2 would be skipped because word_excess=0, returning wrong parent."
        );

        // Verify the conditions for the bug:
        let word2_popcount = words[2].count_ones();
        assert_eq!(word2_popcount, 32, "Word 2 should have exactly 32 ones");

        let word2_excess = 2 * word2_popcount as i32 - 64;
        assert_eq!(
            word2_excess, 0,
            "Word 2 excess should be 0 (this is what triggered the bug)"
        );
    }

    // ========================================================================
    // SSE4.1 vs Scalar Equivalence Tests
    // ========================================================================

    /// Test that SSE4.1 L1 index building produces identical results to scalar.
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_l1_matches_scalar_simple() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // Simple case: 32 words (exactly 1 L1 block)
        let words: Vec<u64> = (0..32).map(|_| 0x5555555555555555u64).collect();
        let (l0_min, l0_excess) = build_l0_index(&words, 32 * 64, 32);
        let num_l1 = 1;

        let (scalar_min, scalar_excess) = build_l1_index_scalar(&l0_min, &l0_excess, num_l1);
        let (sse41_min, sse41_excess) = build_l1_index_sse41(&l0_min, &l0_excess, num_l1);

        assert_eq!(
            scalar_min, sse41_min,
            "L1 min_excess mismatch: scalar={:?}, sse41={:?}",
            scalar_min, sse41_min
        );
        assert_eq!(
            scalar_excess, sse41_excess,
            "L1 block_excess mismatch: scalar={:?}, sse41={:?}",
            scalar_excess, sse41_excess
        );
    }

    /// Test SSE4.1 L1 with multiple blocks and partial blocks.
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_l1_matches_scalar_multiple_blocks() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // 100 words = 3 L1 blocks (32 + 32 + 36) with partial last block
        let words: Vec<u64> = (0..100)
            .map(|i| if i % 2 == 0 { u64::MAX } else { 0 })
            .collect();
        let (l0_min, l0_excess) = build_l0_index(&words, 100 * 64, 100);
        let num_l1 = 100usize.div_ceil(FACTOR_L1);

        let (scalar_min, scalar_excess) = build_l1_index_scalar(&l0_min, &l0_excess, num_l1);
        let (sse41_min, sse41_excess) = build_l1_index_sse41(&l0_min, &l0_excess, num_l1);

        assert_eq!(
            scalar_min, sse41_min,
            "L1 min_excess mismatch for {} blocks",
            num_l1
        );
        assert_eq!(
            scalar_excess, sse41_excess,
            "L1 block_excess mismatch for {} blocks",
            num_l1
        );
    }

    /// Test SSE4.1 L1 with negative excess values (all closes).
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_l1_matches_scalar_negative_excess() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // All closes = all zeros, so each word has excess -64
        let words: Vec<u64> = vec![0u64; 64];
        let (l0_min, l0_excess) = build_l0_index(&words, 64 * 64, 64);
        let num_l1 = 64usize.div_ceil(FACTOR_L1);

        let (scalar_min, scalar_excess) = build_l1_index_scalar(&l0_min, &l0_excess, num_l1);
        let (sse41_min, sse41_excess) = build_l1_index_sse41(&l0_min, &l0_excess, num_l1);

        assert_eq!(scalar_min, sse41_min);
        assert_eq!(scalar_excess, sse41_excess);

        // Verify expected values: each L1 block of 32 words should have
        // block_excess = 32 * (-64) = -2048
        assert_eq!(sse41_excess[0], -2048);
        assert_eq!(sse41_excess[1], -2048);
    }

    /// Test SSE4.1 L1 with mixed positive and negative excess.
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_l1_matches_scalar_mixed_excess() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // Alternating pattern: opens then closes
        let mut words = Vec::new();
        // First 16 words: all opens
        words.extend(std::iter::repeat_n(u64::MAX, 16));
        // Next 16 words: all closes
        words.extend(std::iter::repeat_n(0u64, 16));
        // Repeat for second L1 block
        words.extend(std::iter::repeat_n(u64::MAX, 16));
        words.extend(std::iter::repeat_n(0u64, 16));

        let (l0_min, l0_excess) = build_l0_index(&words, 64 * 64, 64);
        let num_l1 = 64usize.div_ceil(FACTOR_L1);

        let (scalar_min, scalar_excess) = build_l1_index_scalar(&l0_min, &l0_excess, num_l1);
        let (sse41_min, sse41_excess) = build_l1_index_sse41(&l0_min, &l0_excess, num_l1);

        assert_eq!(scalar_min, sse41_min);
        assert_eq!(scalar_excess, sse41_excess);
    }

    /// Test SSE4.1 L2 index building produces identical results to scalar.
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_l2_matches_scalar() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // Create enough L1 blocks to need L2 index (32+ L1 blocks)
        // 32 L1 blocks * 32 words = 1024 words
        let words: Vec<u64> = (0..1024)
            .map(|i| {
                if i % 3 == 0 {
                    u64::MAX
                } else {
                    0x5555555555555555
                }
            })
            .collect();
        let (l0_min, l0_excess) = build_l0_index(&words, 1024 * 64, 1024);
        let num_l1 = 1024usize.div_ceil(FACTOR_L1);
        let num_l2 = num_l1.div_ceil(FACTOR_L2);

        // Build L1 first (using scalar for consistency)
        let (l1_min, l1_excess) = build_l1_index_scalar(&l0_min, &l0_excess, num_l1);

        let (scalar_min, scalar_excess) = build_l2_index_scalar(&l1_min, &l1_excess, num_l2);
        let (sse41_min, sse41_excess) = build_l2_index_sse41(&l1_min, &l1_excess, num_l2);

        assert_eq!(
            scalar_min, sse41_min,
            "L2 min_excess mismatch: scalar={:?}, sse41={:?}",
            scalar_min, sse41_min
        );
        assert_eq!(
            scalar_excess, sse41_excess,
            "L2 block_excess mismatch: scalar={:?}, sse41={:?}",
            scalar_excess, sse41_excess
        );
    }

    /// Test that full BalancedParens construction with SSE4.1 produces correct results.
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_full_construction_correctness() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // Large balanced sequence spanning multiple L1/L2 blocks
        let mut words = Vec::new();
        // Opens for first half
        words.extend(std::iter::repeat_n(u64::MAX, 64));
        // Closes for second half
        words.extend(std::iter::repeat_n(0u64, 64));
        let len = 128 * 64;

        let bp = BalancedParens::new(words.clone(), len);

        // Test find_close on various positions
        for i in 0..64 * 64 {
            if let Some(close) = bp.find_close(i) {
                // The matching close should be at the mirror position
                let expected = len - 1 - i;
                assert_eq!(
                    close, expected,
                    "find_close({}) = {}, expected {}",
                    i, close, expected
                );
            }
        }
    }

    /// Test SSE4.1 L1 with edge case: exactly 8 words (one SIMD iteration).
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_l1_single_simd_iteration() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // Exactly 8 words, so one SIMD iteration with no scalar tail
        let words: Vec<u64> = vec![
            u64::MAX,           // +64
            0,                  // -64
            0x5555555555555555, // 0
            0xAAAAAAAAAAAAAAAA, // 0
            u64::MAX,           // +64
            u64::MAX,           // +64
            0,                  // -64
            0,                  // -64
        ];
        let (l0_min, l0_excess) = build_l0_index(&words, 8 * 64, 8);
        let num_l1 = 1;

        let (scalar_min, scalar_excess) = build_l1_index_scalar(&l0_min, &l0_excess, num_l1);
        let (sse41_min, sse41_excess) = build_l1_index_sse41(&l0_min, &l0_excess, num_l1);

        assert_eq!(scalar_min, sse41_min);
        assert_eq!(scalar_excess, sse41_excess);
    }

    /// Test SSE4.1 L1 with non-aligned block sizes (exercises scalar tail).
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_l1_with_scalar_tail() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // 37 words = 1 L1 block with 32 words + 5 words in second block
        // First block: 32 words = 4 SIMD iterations
        // Second block: 5 words = no SIMD iterations, all scalar
        let words: Vec<u64> = (0..37)
            .map(|i| if i % 2 == 0 { u64::MAX } else { 0 })
            .collect();
        let (l0_min, l0_excess) = build_l0_index(&words, 37 * 64, 37);
        let num_l1 = 37usize.div_ceil(FACTOR_L1);

        let (scalar_min, scalar_excess) = build_l1_index_scalar(&l0_min, &l0_excess, num_l1);
        let (sse41_min, sse41_excess) = build_l1_index_sse41(&l0_min, &l0_excess, num_l1);

        assert_eq!(scalar_min, sse41_min);
        assert_eq!(scalar_excess, sse41_excess);
    }

    /// Test SSE4.1 with extreme excess values near i16 limits.
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_l1_extreme_excess_values() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // Create a sequence with very negative excess
        // 512 words of all closes = -32768 total excess
        let words: Vec<u64> = vec![0u64; 512];
        let (l0_min, l0_excess) = build_l0_index(&words, 512 * 64, 512);
        let num_l1 = 512usize.div_ceil(FACTOR_L1);

        let (scalar_min, scalar_excess) = build_l1_index_scalar(&l0_min, &l0_excess, num_l1);
        let (sse41_min, sse41_excess) = build_l1_index_sse41(&l0_min, &l0_excess, num_l1);

        assert_eq!(scalar_min, sse41_min);
        assert_eq!(scalar_excess, sse41_excess);
    }

    /// Fuzz-style test: random patterns should produce same results.
    #[test]
    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
    fn test_sse41_l1_random_patterns() {
        if !is_x86_feature_detected!("sse4.1") {
            println!("SSE4.1 not available, skipping test");
            return;
        }

        // Generate various patterns
        let patterns: Vec<Vec<u64>> = vec![
            // Pattern 1: all ones
            vec![u64::MAX; 100],
            // Pattern 2: all zeros
            vec![0u64; 100],
            // Pattern 3: alternating words
            (0..100)
                .map(|i| if i % 2 == 0 { u64::MAX } else { 0 })
                .collect(),
            // Pattern 4: mixed pattern based on index
            (0..100)
                .map(|i| {
                    let shift = i % 64;
                    (1u64 << shift).wrapping_sub(1)
                })
                .collect(),
            // Pattern 5: pseudo-random using xorshift
            {
                let mut state = 0xDEADBEEFu64;
                (0..100)
                    .map(|_| {
                        state ^= state << 13;
                        state ^= state >> 7;
                        state ^= state << 17;
                        state
                    })
                    .collect()
            },
        ];

        for (pattern_idx, words) in patterns.iter().enumerate() {
            let (l0_min, l0_excess) = build_l0_index(words, words.len() * 64, words.len());
            let num_l1 = words.len().div_ceil(FACTOR_L1);

            let (scalar_min, scalar_excess) = build_l1_index_scalar(&l0_min, &l0_excess, num_l1);
            let (sse41_min, sse41_excess) = build_l1_index_sse41(&l0_min, &l0_excess, num_l1);

            assert_eq!(
                scalar_min, sse41_min,
                "L1 min_excess mismatch for pattern {}",
                pattern_idx
            );
            assert_eq!(
                scalar_excess, sse41_excess,
                "L1 block_excess mismatch for pattern {}",
                pattern_idx
            );
        }
    }
}