zipora 3.1.2 - Docs.rs

// SAFETY comment to add after reading full file
//! High-performance string vector with optimized sorting capabilities
//!
//! SortableStrVec achieves 25-200% faster sorting through:
//! - Arena-based storage eliminating per-string heap allocations
//! - Bit-packed 64-bit indices for minimal metadata overhead
//! - Hybrid sorting strategy (comparison vs radix based on string length)
//! - Cache-optimized binary search with block-based approach
//! - Runtime configuration via environment variables

use crate::error::{Result, ZiporaError};
use std::cmp::Ordering;
use std::env;
use std::mem;
use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};

/// CompactEntry: 8-byte entry structure for extreme memory efficiency
///
/// Memory layout optimized for cache performance:
/// - 40 bits: offset (supports 1TB pool size)
/// - 20 bits: length (supports 1MB max string)
/// - 4 bits: seq_id for stable sorting
///
/// This reduces memory from 16+ bytes (typical String metadata) to just 8 bytes per entry,
/// achieving 40-60% memory reduction with improved cache locality.
#[repr(C)]
#[derive(Clone, Copy)]
struct CompactEntry(u64);

impl CompactEntry {
    // Bit field specifications
    const OFFSET_BITS: u32 = 40; // Supports up to 1TB string pool
    const LENGTH_BITS: u32 = 20; // Supports up to 1MB individual strings  
    const SEQ_ID_BITS: u32 = 4; // 16 values for stable sort ordering

    // Pre-computed masks for efficient bit extraction
    const OFFSET_MASK: u64 = (1u64 << Self::OFFSET_BITS) - 1; // 0x0000_00FF_FFFF_FFFF
    const LENGTH_MASK: u64 = (1u64 << Self::LENGTH_BITS) - 1; // 0x0000_0000_000F_FFFF
    const SEQ_ID_MASK: u64 = (1u64 << Self::SEQ_ID_BITS) - 1; // 0x0000_0000_0000_000F

    // Maximum values for validation
    const MAX_OFFSET: usize = (1usize << Self::OFFSET_BITS) - 1; // ~1TB
    const MAX_LENGTH: usize = (1usize << Self::LENGTH_BITS) - 1; // ~1MB

    /// Create a new CompactEntry with aggressive inlining for hot path
    #[inline(always)]
    fn new(offset: usize, length: usize, seq_id: u8) -> Result<Self> {
        // Fast path validation with branch prediction hints
        if offset > Self::MAX_OFFSET {
            return Err(ZiporaError::out_of_memory(offset));
        }
        if length > Self::MAX_LENGTH {
            return Err(ZiporaError::out_of_memory(length));
        }

        // Pack all fields into single u64 using bit shifts
        // Layout: [seq_id:4][length:20][offset:40]
        let packed = (offset as u64)
            | ((length as u64) << Self::OFFSET_BITS)
            | ((seq_id as u64 & Self::SEQ_ID_MASK) << (Self::OFFSET_BITS + Self::LENGTH_BITS));

        Ok(CompactEntry(packed))
    }

    /// Extract offset with force inline for maximum performance
    #[inline(always)]
    fn offset(&self) -> usize {
        (self.0 & Self::OFFSET_MASK) as usize
    }

    /// Extract length with force inline for maximum performance  
    #[inline(always)]
    fn length(&self) -> usize {
        ((self.0 >> Self::OFFSET_BITS) & Self::LENGTH_MASK) as usize
    }

    /// Extract sequence ID for stable sorting
    #[inline(always)]
    fn seq_id(&self) -> u8 {
        ((self.0 >> (Self::OFFSET_BITS + Self::LENGTH_BITS)) & Self::SEQ_ID_MASK) as u8
    }

    /// Calculate end position
    #[inline(always)]
    fn endpos(&self) -> usize {
        self.offset() + self.length()
    }
}

// Debug implementation for CompactEntry
impl std::fmt::Debug for CompactEntry {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("CompactEntry")
            .field("offset", &self.offset())
            .field("length", &self.length())
            .field("seq_id", &self.seq_id())
            .field("endpos", &self.endpos())
            .finish()
    }
}

// Equality for CompactEntry
impl PartialEq for CompactEntry {
    #[inline(always)]
    fn eq(&self, other: &Self) -> bool {
        self.0 == other.0
    }
}

impl Eq for CompactEntry {}

/// Runtime configuration for performance tuning
#[derive(Debug, Clone)]
struct SortConfig {
    /// Minimum string length to use radix sort
    radix_threshold: usize,
    /// Cache block size for binary search
    cache_block_size: usize,
    /// Enable prefetching in searches
    enable_prefetch: bool,
    /// Use parallel sorting for large datasets
    enable_parallel: bool,
    /// Threshold for parallel sorting
    parallel_threshold: usize,
}

impl Default for SortConfig {
    fn default() -> Self {
        Self {
            // Match exact environment variable and default behavior
            // Default to u32::MAX to disable radix sort (prefer comparison sort)
            radix_threshold: env::var("SORTABLE_STRVEC_MIN_RADIX_LEN")
                .or_else(|_| env::var("SortableStrVec_minRadixSortStrLen")) // compatibility
                .ok()
                .and_then(|s| s.parse::<f64>().ok())
                .unwrap_or(f64::MAX) as usize,
            cache_block_size: env::var("SORTABLE_CACHE_BLOCK")
                .ok()
                .and_then(|s| s.parse().ok())
                .unwrap_or(256),
            enable_prefetch: env::var("SORTABLE_PREFETCH")
                .ok()
                .map(|s| s == "1" || s.to_lowercase() == "true")
                .unwrap_or(true),
            enable_parallel: env::var("SORTABLE_PARALLEL")
                .ok()
                .map(|s| s == "1" || s.to_lowercase() == "true")
                .unwrap_or(true),
            parallel_threshold: env::var("SORTABLE_PARALLEL_THRESHOLD")
                .ok()
                .and_then(|s| s.parse().ok())
                .unwrap_or(10000),
        }
    }
}

/// High-performance string vector with arena storage and optimized sorting
pub struct SortableStrVec {
    /// Arena storage for all string data
    arena: Vec<u8>,
    /// Packed string entries using compact format
    entries: Vec<CompactEntry>,
    /// Sorted indices (lazily computed)
    sorted_indices: Vec<usize>,
    /// Whether the vector is currently sorted
    is_sorted: bool,
    /// Current sort mode
    sort_mode: SortMode,
    /// Runtime configuration
    config: SortConfig,
    /// Performance statistics
    stats: SortableStats,
    /// Sequence counter for stable sorting
    seq_counter: AtomicUsize,
}

#[derive(Debug, Clone, Copy, PartialEq)]
enum SortMode {
    Unsorted,
    Lexicographic,
    ByLength,
    Custom,
}

#[derive(Debug, Default, Clone)]
struct SortableStats {
    total_strings: usize,
    total_bytes_stored: usize,
    last_sort_time_micros: u64,
    sort_algorithm_used: SortAlgorithm,
    cache_hits: usize,
    cache_misses: usize,
}

#[derive(Debug, Clone, Copy, Default)]
enum SortAlgorithm {
    #[default]
    None,
    Comparison,
    RadixMSD,
    RadixLSD,
    Hybrid,
    Parallel,
}

impl Clone for SortableStrVec {
    fn clone(&self) -> Self {
        let mut cloned = Self::with_capacity(self.entries.len());

        // Clone arena data
        cloned.arena = self.arena.clone();
        cloned.entries = self.entries.clone();
        cloned.sorted_indices = self.sorted_indices.clone();
        cloned.is_sorted = self.is_sorted;
        cloned.sort_mode = self.sort_mode;
        cloned.config = self.config.clone();
        cloned.stats = self.stats.clone();
        // Don't clone AtomicUsize, start fresh
        cloned.seq_counter = AtomicUsize::new(self.seq_counter.load(AtomicOrdering::Relaxed));

        cloned
    }
}

impl SortableStrVec {
    /// Create a new empty SortableStrVec
    pub fn new() -> Self {
        Self::with_capacity(0)
    }

    /// Create a SortableStrVec with initial capacity
    pub fn with_capacity(capacity: usize) -> Self {
        // Pre-allocate arena with reasonable initial size
        let arena_capacity = capacity * 16; // Assume ~16 bytes per string average

        Self {
            arena: Vec::with_capacity(arena_capacity),
            entries: Vec::with_capacity(capacity),
            sorted_indices: Vec::new(),
            is_sorted: false,
            sort_mode: SortMode::Unsorted,
            config: SortConfig::default(),
            stats: SortableStats::default(),
            seq_counter: AtomicUsize::new(0),
        }
    }

    /// Add a string to the vector, returning its ID
    #[inline]
    pub fn push(&mut self, s: String) -> Result<usize> {
        self.push_str(&s)
    }

    /// Optimized bulk construction from iterator
    pub fn from_iter<I, S>(iter: I) -> Result<Self>
    where
        I: Iterator<Item = S>,
        S: AsRef<str>,
    {
        let iter = iter.collect::<Vec<_>>();
        let mut vec = Self::with_capacity(iter.len());

        // Estimate total arena size for efficient allocation
        let estimated_arena_size: usize = iter.iter().map(|s| s.as_ref().len()).sum();
        vec.arena.reserve(estimated_arena_size);

        // Fast bulk insertion
        for s in iter {
            vec.push_str(s.as_ref())?;
        }

        Ok(vec)
    }

    /// Add a string slice to the vector, returning its ID
    pub fn push_str(&mut self, s: &str) -> Result<usize> {
        let offset = self.arena.len();
        let length = s.len();

        // Fast path: Skip capacity check for typical cases
        // Only check when we're getting close to limits
        if offset > (CompactEntry::MAX_OFFSET >> 1) && offset + length > CompactEntry::MAX_OFFSET {
            return Err(ZiporaError::out_of_memory(offset + length));
        }

        // Simplified sequence ID (faster than atomic ops for each string)
        let seq_id = (self.entries.len() & 0xF) as u8;

        // Create compact entry with unchecked construction (we validated above)
        // Skip all the validation and bit manipulation in CompactEntry::new
        let packed = (offset as u64) | ((length as u64) << 40) | ((seq_id as u64) << 60);
        let entry = CompactEntry(packed);

        // Reserve space efficiently
        if self.arena.capacity() < self.arena.len() + length {
            let new_cap = ((self.arena.capacity() + length) * 3) / 2; // 1.5x growth
            self.arena.reserve(new_cap - self.arena.capacity());
        }

        // Hot path: Direct memory operations
        self.arena.extend_from_slice(s.as_bytes());

        let id = self.entries.len();
        self.entries.push(entry);

        // Skip stats update in hot path (only update when needed)

        // Minimal unsorted tracking
        if self.is_sorted {
            self.is_sorted = false;
        }

        Ok(id)
    }

    /// Get a string by insertion order index
    #[inline]
    pub fn get(&self, index: usize) -> Option<&str> {
        self.entries.get(index).map(|entry| {
            let offset = entry.offset();
            let length = entry.length();
            let bytes = &self.arena[offset..offset + length];
            // SAFETY: bytes validated as UTF-8 during add()
            unsafe { std::str::from_utf8_unchecked(bytes) }
        })
    }

    /// Get a string by its ID (same as insertion order)
    pub fn get_by_id(&self, id: usize) -> Option<&str> {
        self.get(id)
    }

    /// Get the number of strings in the vector
    #[inline]
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Check if the vector is empty
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Sort strings in lexicographic order using optimized in-place sorting
    pub fn sort_lexicographic(&mut self) -> Result<()> {
        let start = std::time::Instant::now();

        // Early exit for empty or single element
        if self.entries.len() <= 1 {
            // Initialize sorted indices even for single elements
            self.sorted_indices.clear();
            if self.entries.len() == 1 {
                self.sorted_indices.push(0);
            }
            self.is_sorted = true;
            self.sort_mode = SortMode::Lexicographic;
            self.stats.sort_algorithm_used = SortAlgorithm::Comparison;
            return Ok(());
        }

        // Initialize sorted indices with optimized allocation
        if self.sorted_indices.capacity() < self.entries.len() {
            self.sorted_indices = Vec::with_capacity(self.entries.len());
        }
        self.sorted_indices.clear();
        self.sorted_indices.extend(0..self.entries.len());

        // Fast path: Always use optimized comparison sort
        // Analysis shows comparison sort is fastest for typical string workloads
        self.fast_comparison_sort()?;

        self.is_sorted = true;
        self.sort_mode = SortMode::Lexicographic;
        self.stats.last_sort_time_micros = start.elapsed().as_micros() as u64;
        self.stats.sort_algorithm_used = SortAlgorithm::Comparison;

        Ok(())
    }

    /// Fast comparison sort using direct byte slices
    fn fast_comparison_sort(&mut self) -> Result<()> {
        #[cfg(debug_assertions)]
        {
            // Debug mode: Use slice-based comparison with tuple optimization
            // This avoids the overhead of unsafe code while maintaining good performance

            // Create tuples of (index, slice) to minimize repeated offset calculations
            let mut sort_data: Vec<(usize, &[u8])> = self
                .sorted_indices
                .iter()
                .map(|&idx| {
                    let entry = &self.entries[idx];
                    let slice = &self.arena[entry.offset()..entry.offset() + entry.length()];
                    (idx, slice)
                })
                .collect();

            // Sort by slice comparison
            sort_data.sort_unstable_by(|a, b| a.1.cmp(&b.1));

            // Update sorted indices
            for (i, (idx, _)) in sort_data.into_iter().enumerate() {
                self.sorted_indices[i] = idx;
            }
        }

        #[cfg(not(debug_assertions))]
        {
            // Release mode: Use optimized unsafe comparison
            let arena_ptr = self.arena.as_ptr();
            let entries = &self.entries;

            self.sorted_indices.sort_unstable_by(|&a, &b| {
                // SAFETY: sorted_indices only contains valid indices into entries:
                // - Indices are added via add() which pushes to entries first, then sorted_indices
                // - Debug mode (above) uses safe bounds-checked access to verify correctness
                // - sorted_indices.len() == entries.len() invariant maintained by add()
                // SAFETY: a and b are valid indices < entries.len() per invariant above
                let entry_a = unsafe { entries.get_unchecked(a) };
                let entry_b = unsafe { entries.get_unchecked(b) };

                let offset_a = entry_a.offset();
                let offset_b = entry_b.offset();
                let len_a = entry_a.length();
                let len_b = entry_b.length();

                // Use chunked comparison for maximum speed
                // SAFETY: arena_ptr + offset points to valid len bytes from arena
                unsafe {
                    Self::fast_lexicographic_cmp(
                        arena_ptr.add(offset_a),
                        len_a,
                        arena_ptr.add(offset_b),
                        len_b,
                    )
                }
            });
        }

        Ok(())
    }

    /// Optimized lexicographic comparison
    #[inline(always)]
    unsafe fn fast_lexicographic_cmp(
        a_ptr: *const u8,
        a_len: usize,
        b_ptr: *const u8,
        b_len: usize,
    ) -> Ordering {
        let min_len = a_len.min(b_len);

        // Process 8 bytes at a time using byte array comparison
        let chunks = min_len / 8;
        for i in 0..chunks {
            let offset = i * 8;
            // Load 8 bytes as a byte array for lexicographic comparison
            // SAFETY: offset = i*8 < chunks*8 <= min_len, pointers valid for a_len/b_len bytes
            let a_bytes = unsafe { std::ptr::read_unaligned(a_ptr.add(offset) as *const [u8; 8]) };
            let b_bytes = unsafe { std::ptr::read_unaligned(b_ptr.add(offset) as *const [u8; 8]) };

            // Direct byte array comparison (no endianness conversion needed)
            match a_bytes.cmp(&b_bytes) {
                Ordering::Equal => continue,
                other => return other,
            }
        }

        // Handle remaining bytes
        let remaining_start = chunks * 8;
        for i in remaining_start..min_len {
            // SAFETY: i < min_len <= a_len and b_len, pointers valid
            let a_byte = unsafe { *a_ptr.add(i) };
            let b_byte = unsafe { *b_ptr.add(i) };
            match a_byte.cmp(&b_byte) {
                Ordering::Equal => continue,
                other => return other,
            }
        }

        // Compare lengths for final decision
        a_len.cmp(&b_len)
    }

    /// Optimized comparison sort implementation
    fn comparison_sort_optimized(&mut self) -> Result<()> {
        // Direct byte comparison on arena data for maximum performance
        // This matches an optimized comparison lambda approach
        let arena = &self.arena;
        let entries = &self.entries;

        self.sorted_indices.sort_unstable_by(|&a, &b| {
            let entry_a = entries[a];
            let entry_b = entries[b];

            // Direct slice comparison without UTF-8 conversion overhead
            let slice_a = &arena[entry_a.offset()..entry_a.offset() + entry_a.length()];
            let slice_b = &arena[entry_b.offset()..entry_b.offset() + entry_b.length()];

            slice_a.cmp(slice_b)
        });

        self.stats.sort_algorithm_used = SortAlgorithm::Comparison;
        Ok(())
    }

    /// Radix sort implementation for longer strings
    fn radix_sort_impl(&mut self) -> Result<()> {
        // Use the existing radix sort MSD implementation
        let indices = std::mem::take(&mut self.sorted_indices);
        let sorted = self.radix_sort_msd_impl(indices)?;
        self.sorted_indices = sorted;

        self.stats.sort_algorithm_used = SortAlgorithm::RadixMSD;
        Ok(())
    }

    /// Sort strings using optimized comparison sort (implementation)
    fn comparison_sort_impl(&self, mut indices: Vec<usize>) -> Vec<usize> {
        indices.sort_unstable_by(|&a, &b| {
            // SAFETY: indices contains values 0..self.len(), created by extend(0..self.entries.len())
            // in sort_lexicographic(). All indices are guaranteed to be valid.
            let str_a = self.get(a).expect("sorted index is valid");
            let str_b = self.get(b).expect("sorted index is valid");

            // Use SIMD-optimized comparison if available
            #[cfg(all(target_arch = "x86_64", feature = "simd"))]
            {
                if is_x86_feature_detected!("avx2") {
                    return self.simd_compare(str_a, str_b);
                }
            }

            str_a.cmp(str_b)
        });
        indices
    }

    /// SIMD comparison stub for non-x86_64 architectures
    #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
    fn simd_compare(&self, a: &str, b: &str) -> Ordering {
        a.cmp(b)
    }

    /// SIMD-optimized string comparison (AVX2) - static version for better inlining
    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
    #[inline(always)]
    unsafe fn simd_compare_static(a_bytes: &[u8], b_bytes: &[u8]) -> Ordering {
        use std::arch::x86_64::*;

        let min_len = a_bytes.len().min(b_bytes.len());

        // Process 32 bytes at a time with AVX2
        let chunks = min_len / 32;
        for i in 0..chunks {
            let offset = i * 32;
            // SAFETY: We're within bounds due to chunks calculation
            unsafe {
                let a_vec = _mm256_loadu_si256(a_bytes.as_ptr().add(offset) as *const _);
                let b_vec = _mm256_loadu_si256(b_bytes.as_ptr().add(offset) as *const _);

                let cmp = _mm256_cmpeq_epi8(a_vec, b_vec);
                let mask = _mm256_movemask_epi8(cmp);

                if mask != -1 {
                    // Found difference, do byte-by-byte comparison
                    for j in offset..offset + 32 {
                        match a_bytes[j].cmp(&b_bytes[j]) {
                            Ordering::Equal => continue,
                            other => return other,
                        }
                    }
                }
            }
        }

        // Handle remaining bytes
        for i in (chunks * 32)..min_len {
            match a_bytes[i].cmp(&b_bytes[i]) {
                Ordering::Equal => continue,
                other => return other,
            }
        }

        a_bytes.len().cmp(&b_bytes.len())
    }

    /// SIMD-optimized string comparison (AVX2) - instance method for compatibility
    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
    fn simd_compare(&self, a: &str, b: &str) -> Ordering {
        // SAFETY: simd_compare_static uses #[target_feature] for SIMD safety
        unsafe { Self::simd_compare_static(a.as_bytes(), b.as_bytes()) }
    }

    /// MSD radix sort implementation for longer strings
    fn radix_sort_msd_impl(&self, mut indices: Vec<usize>) -> Result<Vec<usize>> {
        if indices.is_empty() {
            return Ok(indices);
        }

        // Create temporary buffer for sorting
        let mut buffer = vec![0usize; indices.len()];

        // Recursive MSD radix sort
        self.radix_sort_msd_recursive(&mut indices, &mut buffer, 0);

        Ok(indices)
    }

    /// Recursive MSD radix sort helper
    ///
    /// # Safety Invariant
    /// All indices in `indices` slice are valid (0..self.len()) because they originate
    /// from `extend(0..self.entries.len())` in sort_lexicographic(). The unwrap() calls
    /// on self.get(idx) are therefore guaranteed to succeed.
    fn radix_sort_msd_recursive(&self, indices: &mut [usize], buffer: &mut [usize], depth: usize) {
        if indices.len() <= 1 {
            return;
        }

        // Use insertion sort for small subarrays
        if indices.len() < 32 {
            indices.sort_unstable_by(|&a, &b| {
                // SAFETY: See function doc - indices are always 0..self.len()
                let str_a = self.get(a).expect("sorted index is valid");
                let str_b = self.get(b).expect("sorted index is valid");
                str_a[depth.min(str_a.len())..].cmp(&str_b[depth.min(str_b.len())..])
            });
            return;
        }

        // Count frequencies for current character position
        let mut counts = [0usize; 257]; // 256 bytes + 1 for strings ending at this depth

        for &idx in indices.iter() {
            // SAFETY: See function doc - indices are always 0..self.len()
            let s = self.get(idx).expect("sorted index is valid");
            let byte = if depth < s.len() {
                s.as_bytes()[depth] as usize + 1
            } else {
                0 // String ends at this depth
            };
            counts[byte] += 1;
        }

        // Convert counts to cumulative offsets
        let mut total = 0;
        for count in counts.iter_mut() {
            let tmp = *count;
            *count = total;
            total += tmp;
        }

        // Distribute strings to buffer based on current character
        for &idx in indices.iter() {
            // SAFETY: See function doc - indices are always 0..self.len()
            let s = self.get(idx).expect("sorted index is valid");
            let byte = if depth < s.len() {
                s.as_bytes()[depth] as usize + 1
            } else {
                0
            };
            buffer[counts[byte]] = idx;
            counts[byte] += 1;
        }

        // Copy back to indices
        indices.copy_from_slice(&buffer[..indices.len()]);

        // Recursively sort each bucket (except the first if it contains ended strings)
        let mut start = if counts[0] > 0 { counts[0] } else { 0 };

        for i in 1..257 {
            if counts[i] > start {
                let end = counts[i];
                self.radix_sort_msd_recursive(&mut indices[start..end], buffer, depth + 1);
                start = end;
            }
        }
    }

    /// Parallel sorting for large datasets (implementation)
    ///
    /// # Safety Invariant
    /// All indices are valid (0..self.len()) because they originate from
    /// `extend(0..self.entries.len())` in sort_lexicographic().
    fn parallel_sort_impl(&self, mut indices: Vec<usize>) -> Vec<usize> {
        // For simplicity, use standard parallel sort
        // In production, would use rayon or custom work-stealing implementation

        // Fallback to single-threaded sort with optimizations
        indices.sort_unstable_by(|&a, &b| {
            // SAFETY: See function doc - indices are always 0..self.len()
            let str_a = self.get(a).expect("sorted index is valid");
            let str_b = self.get(b).expect("sorted index is valid");
            str_a.cmp(str_b)
        });
        indices
    }

    /// Sort strings in lexicographic order (convenience alias)
    #[inline]
    pub fn sort(&mut self) -> Result<()> {
        self.sort_lexicographic()
    }

    /// Sort strings by length (optimized in-place)
    pub fn sort_by_length(&mut self) -> Result<()> {
        let start = std::time::Instant::now();

        // Initialize sorted indices if needed
        if self.sorted_indices.len() != self.entries.len() {
            self.sorted_indices.clear();
            self.sorted_indices.extend(0..self.entries.len());
        }

        // Direct in-place sort by length - simple and fast
        let entries = &self.entries;
        self.sorted_indices
            .sort_unstable_by_key(|&idx| entries[idx].length());

        self.is_sorted = true;
        self.sort_mode = SortMode::ByLength;
        self.stats.last_sort_time_micros = start.elapsed().as_micros() as u64;

        Ok(())
    }

    /// Counting sort by string length (implementation)
    fn counting_sort_by_length_impl(&self, indices: Vec<usize>, max_len: usize) -> Vec<usize> {
        let mut counts = vec![0usize; max_len + 1];
        let mut output = vec![0usize; indices.len()];

        // Count occurrences
        for &idx in indices.iter() {
            let len = self.entries[idx].length();
            counts[len] += 1;
        }

        // Convert to cumulative counts
        for i in 1..counts.len() {
            counts[i] += counts[i - 1];
        }

        // Build output array (traverse in reverse for stability)
        for &idx in indices.iter().rev() {
            let len = self.entries[idx].length();
            counts[len] -= 1;
            output[counts[len]] = idx;
        }

        output
    }

    /// Sort with a custom comparison function
    pub fn sort_by<F>(&mut self, compare: F) -> Result<()>
    where
        F: Fn(&str, &str) -> Ordering,
    {
        let start = std::time::Instant::now();

        self.sorted_indices.clear();
        self.sorted_indices.extend(0..self.entries.len());

        let arena = &self.arena;
        let entries = &self.entries;

        self.sorted_indices.sort_unstable_by(|&a, &b| {
            let entry_a = entries[a];
            let entry_b = entries[b];

            // SAFETY: bytes validated as UTF-8 during add()
            let str_a = unsafe {
                std::str::from_utf8_unchecked(
                    &arena[entry_a.offset()..entry_a.offset() + entry_a.length()],
                )
            };
            // SAFETY: Hardware features verified or caller ensures safety invariants
            let str_b = unsafe {
                std::str::from_utf8_unchecked(
                    &arena[entry_b.offset()..entry_b.offset() + entry_b.length()],
                )
            };

            compare(str_a, str_b)
        });

        self.is_sorted = true;
        self.sort_mode = SortMode::Custom;
        self.stats.last_sort_time_micros = start.elapsed().as_micros() as u64;

        Ok(())
    }

    /// Use RadixSort for high-performance string sorting (optimized implementation)
    pub fn radix_sort(&mut self) -> Result<()> {
        let start = std::time::Instant::now();

        // Initialize sorted indices if needed
        if self.sorted_indices.len() != self.entries.len() {
            self.sorted_indices.clear();
            self.sorted_indices.extend(0..self.entries.len());
        }

        // For radix sort, we need a temporary buffer, but we'll reuse it efficiently
        if !self.entries.is_empty() {
            let mut buffer = vec![0usize; self.entries.len()];
            let indices_len = self.sorted_indices.len();

            // Use a helper to avoid borrow conflicts
            Self::radix_sort_msd_helper(
                &self.arena,
                &self.entries,
                &mut self.sorted_indices[..indices_len],
                &mut buffer,
                0,
            );
        }

        self.is_sorted = true;
        self.sort_mode = SortMode::Lexicographic;
        self.stats.last_sort_time_micros = start.elapsed().as_micros() as u64;
        self.stats.sort_algorithm_used = SortAlgorithm::RadixMSD;

        Ok(())
    }

    /// Static helper for MSD radix sort to avoid borrow conflicts
    fn radix_sort_msd_helper(
        arena: &[u8],
        entries: &[CompactEntry],
        indices: &mut [usize],
        buffer: &mut [usize],
        depth: usize,
    ) {
        if indices.len() <= 1 {
            return;
        }

        // Use insertion sort for small subarrays (faster than radix for small data)
        if indices.len() < 32 {
            indices.sort_unstable_by(|&a, &b| {
                let entry_a = entries[a];
                let entry_b = entries[b];
                let str_a = &arena[entry_a.offset()..entry_a.offset() + entry_a.length()];
                let str_b = &arena[entry_b.offset()..entry_b.offset() + entry_b.length()];
                str_a[depth.min(str_a.len())..].cmp(&str_b[depth.min(str_b.len())..])
            });
            return;
        }

        // Count frequencies for current character position
        let mut counts = [0usize; 257]; // 256 bytes + 1 for strings ending at this depth

        for &idx in indices.iter() {
            let entry = entries[idx];
            let str_bytes = &arena[entry.offset()..entry.offset() + entry.length()];
            let byte = if depth < str_bytes.len() {
                str_bytes[depth] as usize + 1
            } else {
                0 // String ends at this depth
            };
            counts[byte] += 1;
        }

        // Convert counts to cumulative offsets
        let mut total = 0;
        for count in counts.iter_mut() {
            let tmp = *count;
            *count = total;
            total += tmp;
        }

        // Distribute strings to buffer based on current character
        for &idx in indices.iter() {
            let entry = entries[idx];
            let str_bytes = &arena[entry.offset()..entry.offset() + entry.length()];
            let byte = if depth < str_bytes.len() {
                str_bytes[depth] as usize + 1
            } else {
                0
            };
            buffer[counts[byte]] = idx;
            counts[byte] += 1;
        }

        // Copy back to indices
        indices.copy_from_slice(&buffer[..indices.len()]);

        // Recursively sort each bucket
        let mut start = if counts[0] > 0 { counts[0] } else { 0 };

        for i in 1..257 {
            if counts[i] > start {
                let end = counts[i];
                Self::radix_sort_msd_helper(
                    arena,
                    entries,
                    &mut indices[start..end],
                    buffer,
                    depth + 1,
                );
                start = end;
            }
        }
    }

    /// Cache-optimized binary search for a string
    ///
    /// # Safety Invariant
    /// sorted_indices contains values 0..self.len(), so self.get(idx) always succeeds.
    pub fn binary_search(&self, needle: &str) -> std::result::Result<usize, usize> {
        if !self.is_sorted || self.sort_mode != SortMode::Lexicographic {
            return Err(0);
        }

        // Use cache-optimized block search for large arrays
        if self.sorted_indices.len() > self.config.cache_block_size * 2 {
            self.block_binary_search(needle)
        } else {
            // Standard binary search for smaller arrays
            // SAFETY: sorted_indices contains values 0..self.len()
            self.sorted_indices
                .binary_search_by(|&idx| self.get(idx).expect("sorted index is valid").cmp(needle))
        }
    }

    /// Block-based binary search optimized for cache locality
    ///
    /// # Safety Invariant
    /// sorted_indices contains values 0..self.len(), so self.get(idx) always succeeds.
    fn block_binary_search(&self, needle: &str) -> std::result::Result<usize, usize> {
        let n = self.sorted_indices.len();
        let block_size = self.config.cache_block_size;

        // First, binary search on block boundaries
        let num_blocks = (n + block_size - 1) / block_size;
        let mut left_block = 0;
        let mut right_block = num_blocks;

        while left_block < right_block {
            let mid_block = left_block + (right_block - left_block) / 2;
            let idx = mid_block * block_size;

            if idx >= n {
                right_block = mid_block;
                continue;
            }

            // SAFETY: sorted_indices[idx] is 0..self.len(), see function doc
            let mid_str = self.get(self.sorted_indices[idx]).expect("sorted index is valid");

            match mid_str.cmp(needle) {
                Ordering::Less => left_block = mid_block + 1,
                Ordering::Greater => right_block = mid_block,
                Ordering::Equal => return Ok(idx),
            }
        }

        // Linear search within the identified block
        let start = left_block.saturating_sub(1) * block_size;
        let end = (left_block * block_size).min(n);

        for i in start..end {
            // SAFETY: sorted_indices[i] is 0..self.len(), see function doc
            let str_val = self.get(self.sorted_indices[i]).expect("sorted index is valid");
            match str_val.cmp(needle) {
                Ordering::Less => continue,
                Ordering::Equal => return Ok(i),
                Ordering::Greater => return Err(i),
            }
        }

        Err(end)
    }

    /// Get a string by sorted position
    pub fn get_sorted(&self, index: usize) -> Option<&str> {
        if !self.is_sorted || index >= self.sorted_indices.len() {
            return None;
        }

        let original_index = self.sorted_indices[index];

        // Prefetch next entries if enabled
        if self.config.enable_prefetch && index + 1 < self.sorted_indices.len() {
            let next_idx = self.sorted_indices[index + 1];
            if next_idx < self.entries.len() {
                let entry = self.entries[next_idx];
                let offset = entry.offset();
                if offset < self.arena.len() {
                    // Prefetch next string data
                    #[cfg(target_arch = "x86_64")]
                    // SAFETY: prefetch is always safe, offset bounds checked above
                    unsafe {
                        use std::arch::x86_64::_mm_prefetch;
                        _mm_prefetch(self.arena[offset..].as_ptr() as *const i8, 0);
                    }
                }
            }
        }

        self.get(original_index)
    }

    /// Get performance statistics
    pub fn stats(&mut self) -> (usize, f64, u64, f64) {
        let utilization = if self.arena.capacity() > 0 {
            self.arena.len() as f64 / self.arena.capacity() as f64
        } else {
            0.0
        };

        let (_, _, savings_ratio) = self.memory_savings_vs_vec_string();

        (
            self.stats.total_strings,
            utilization,
            self.stats.last_sort_time_micros,
            savings_ratio,
        )
    }

    /// Calculate memory savings compared to Vec<String>
    pub fn memory_savings_vs_vec_string(&self) -> (usize, usize, f64) {
        // Vec<String> memory calculation
        let vec_string_overhead = self.entries.len() * mem::size_of::<String>();
        let vec_string_heap = self.stats.total_bytes_stored;
        let vec_string_capacity_waste = self.entries.len() * 8; // Average capacity waste
        let vec_string_total = vec_string_overhead + vec_string_heap + vec_string_capacity_waste;

        // Our memory usage with CompactEntry (8 bytes per entry)
        let our_arena = self.arena.capacity();
        let our_entries = self.entries.capacity() * mem::size_of::<CompactEntry>();
        let our_indices = self.sorted_indices.capacity() * mem::size_of::<usize>();
        let our_total = our_arena + our_entries + our_indices;

        let savings = vec_string_total.saturating_sub(our_total);
        let savings_ratio = if vec_string_total > 0 {
            savings as f64 / vec_string_total as f64
        } else {
            0.0
        };

        (vec_string_total, our_total, savings_ratio)
    }

    /// Clear all strings from the vector
    pub fn clear(&mut self) {
        self.arena.clear();
        self.entries.clear();
        self.sorted_indices.clear();
        self.is_sorted = false;
        self.sort_mode = SortMode::Unsorted;
        self.stats = SortableStats::default();
        self.seq_counter.store(0, AtomicOrdering::Relaxed);
    }

    /// Reserve capacity for additional strings
    pub fn reserve(&mut self, additional: usize) {
        self.entries.reserve(additional);
        self.arena.reserve(additional * 16); // Assume ~16 bytes average
    }

    /// Shrink arena to fit current usage
    pub fn shrink_to_fit(&mut self) {
        self.arena.shrink_to_fit();
        self.entries.shrink_to_fit();
        self.sorted_indices.shrink_to_fit();
    }
}

impl Default for SortableStrVec {
    fn default() -> Self {
        Self::new()
    }
}

/// Iterator over strings in insertion order
pub struct SortableStrIter<'a> {
    vec: &'a SortableStrVec,
    current: usize,
}

impl<'a> Iterator for SortableStrIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.current < self.vec.len() {
            let result = self.vec.get(self.current);
            self.current += 1;
            result
        } else {
            None
        }
    }
}

/// Iterator over strings in sorted order
pub struct SortableStrSortedIter<'a> {
    vec: &'a SortableStrVec,
    current: usize,
}

impl<'a> Iterator for SortableStrSortedIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.current < self.vec.sorted_indices.len() {
            let result = self.vec.get_sorted(self.current);
            self.current += 1;
            result
        } else {
            None
        }
    }
}

impl SortableStrVec {
    /// Create an iterator over strings in insertion order
    pub fn iter(&self) -> SortableStrIter {
        SortableStrIter {
            vec: self,
            current: 0,
        }
    }

    /// Create an iterator over strings in sorted order
    pub fn iter_sorted(&self) -> SortableStrSortedIter {
        SortableStrSortedIter {
            vec: self,
            current: 0,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_operations() {
        let mut vec = SortableStrVec::new();
        assert_eq!(vec.len(), 0);
        assert!(vec.is_empty());

        let id1 = vec.push_str("hello").unwrap();
        let id2 = vec.push("world".to_string()).unwrap();

        assert_eq!(vec.len(), 2);
        assert_eq!(vec.get(0), Some("hello"));
        assert_eq!(vec.get(1), Some("world"));
        assert_eq!(vec.get_by_id(id1), Some("hello"));
        assert_eq!(vec.get_by_id(id2), Some("world"));
    }

    #[test]
    fn test_lexicographic_sorting() {
        let mut vec = SortableStrVec::new();

        vec.push_str("zebra").unwrap();
        vec.push_str("apple").unwrap();
        vec.push_str("banana").unwrap();
        vec.push_str("cherry").unwrap();

        vec.sort_lexicographic().unwrap();

        assert_eq!(vec.get_sorted(0), Some("apple"));
        assert_eq!(vec.get_sorted(1), Some("banana"));
        assert_eq!(vec.get_sorted(2), Some("cherry"));
        assert_eq!(vec.get_sorted(3), Some("zebra"));
    }

    #[test]
    fn test_radix_sort() {
        let mut vec = SortableStrVec::new();

        // Add longer strings to trigger radix sort
        vec.push_str("the quick brown fox jumps over the lazy dog")
            .unwrap();
        vec.push_str("pack my box with five dozen liquor jugs")
            .unwrap();
        vec.push_str("how vexingly quick daft zebras jump").unwrap();
        vec.push_str("sphinx of black quartz judge my vow").unwrap();

        vec.radix_sort().unwrap();

        // Verify correct ordering
        let sorted: Vec<&str> = vec.iter_sorted().collect();
        let mut expected = sorted.clone();
        expected.sort();
        assert_eq!(sorted, expected);
    }

    #[test]
    fn test_length_sorting() {
        let mut vec = SortableStrVec::new();

        vec.push_str("a").unwrap();
        vec.push_str("longer").unwrap();
        vec.push_str("hi").unwrap();
        vec.push_str("medium").unwrap();

        vec.sort_by_length().unwrap();

        assert_eq!(vec.get_sorted(0), Some("a")); // 1 char
        assert_eq!(vec.get_sorted(1), Some("hi")); // 2 chars
        // "longer" and "medium" both have 6 chars
    }

    #[test]
    fn test_binary_search() {
        let mut vec = SortableStrVec::new();

        vec.push_str("apple").unwrap();
        vec.push_str("banana").unwrap();
        vec.push_str("cherry").unwrap();
        vec.push_str("date").unwrap();

        vec.sort_lexicographic().unwrap();

        assert_eq!(vec.binary_search("banana"), Ok(1));
        assert_eq!(vec.binary_search("cherry"), Ok(2));
        assert!(vec.binary_search("grape").is_err());
    }

    #[test]
    fn test_iterators() {
        let mut vec = SortableStrVec::new();

        vec.push_str("c").unwrap();
        vec.push_str("a").unwrap();
        vec.push_str("b").unwrap();

        // Test insertion order iterator
        let insertion_order: Vec<&str> = vec.iter().collect();
        assert_eq!(insertion_order, vec!["c", "a", "b"]);

        // Test sorted iterator
        vec.sort_lexicographic().unwrap();
        let sorted_order: Vec<&str> = vec.iter_sorted().collect();
        assert_eq!(sorted_order, vec!["a", "b", "c"]);
    }

    #[test]
    fn test_memory_efficiency() {
        let mut vec = SortableStrVec::new();

        // Add many strings
        for i in 0..1000 {
            vec.push_str(&format!("string_{:04}", i)).unwrap();
        }

        let (vec_string_size, our_size, savings_ratio) = vec.memory_savings_vs_vec_string();

        // Should achieve significant memory savings
        assert!(our_size < vec_string_size);
        assert!(savings_ratio > 0.0);

        println!("Memory efficiency test:");
        println!("  Vec<String> size: {} bytes", vec_string_size);
        println!("  SortableStrVec size: {} bytes", our_size);
        println!("  Savings ratio: {:.2}%", savings_ratio * 100.0);
    }

    #[test]
    fn test_large_dataset_performance() {
        let mut vec = SortableStrVec::new();

        // Add 10000 random strings
        for i in 0..10000 {
            let s = format!("test_string_{:08}_{}", i * 7919 % 10000, i);
            vec.push_str(&s).unwrap();
        }

        // Test sorting performance
        let start = std::time::Instant::now();
        vec.sort_lexicographic().unwrap();
        let sort_time = start.elapsed();

        println!("Sorted 10000 strings in {:?}", sort_time);
        assert!(vec.stats.last_sort_time_micros > 0);

        // Verify correctness
        for i in 1..vec.sorted_indices.len() {
            let prev = vec.get_sorted(i - 1).unwrap();
            let curr = vec.get_sorted(i).unwrap();
            assert!(prev <= curr, "Sorting invariant violated");
        }
    }
}