zipora 3.1.5 - Docs.rs

// SAFETY comment to add after reading full file
//! Secure, thread-safe memory pool implementation
//!
//! This module provides a production-ready, thread-safe memory pool that eliminates
//! the security vulnerabilities found in the original MemoryPool implementation.
//!
//! # Security Features
//!
//! - **Memory Safety**: No raw pointers, uses safe Rust abstractions
//! - **Use-After-Free Prevention**: Generation counters validate pointer lifetime
//! - **Double-Free Detection**: Cryptographic validation of deallocations
//! - **Corruption Detection**: Guard pages and canary values
//! - **Thread Safety**: Proper synchronization without manual Send/Sync
//!
//! # Performance Features
//!
//! - **Thread-Local Caching**: Reduces contention with per-thread allocation caches
//! - **Lock-Free Fast Paths**: Lock-free stacks for high-performance allocation
//! - **NUMA Awareness**: Optimized allocation for multi-socket systems
//! - **Batch Operations**: Amortized system call overhead
//! - **SIMD Optimizations**: Vectorized memory operations (AVX-512/AVX2/SSE2) for 2-3x faster
//!   memory zeroing on deallocation, with automatic CPU feature detection and threshold-based
//!   optimization (≥64 bytes). Maintains all security guarantees while significantly improving
//!   performance for large memory operations.
//!
//! # Architecture
//!
//! The secure pool uses a hybrid architecture combining thread-local caches
//! with a central depot for cross-thread coordination. Each allocation is
//! validated with generation counters and cryptographic signatures.

use crate::error::{Result, ZiporaError};
use crate::memory::simd_ops::{fast_fill, fast_compare, fast_prefetch};
use crate::memory::cache_layout::{CacheOptimizedAllocator, CacheLayoutConfig, align_to_cache_line, AccessPattern, HotColdSeparator, PrefetchHint};
use crate::memory::{get_optimal_numa_node, numa_alloc_aligned, numa_dealloc};
use crate::simd::adaptive::AdaptiveSimdSelector;
use crate::simd::Operation;
use super::CachePadded;
use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::alloc::{Layout, alloc, dealloc};
use std::cell::RefCell;
use std::ptr::NonNull;
use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Weak};
use std::time::{Instant, SystemTime, UNIX_EPOCH};

/// Magic constants for corruption detection
const CHUNK_HEADER_MAGIC: u64 = 0xDEADBEEFCAFEBABE;
const CHUNK_FOOTER_MAGIC: u64 = 0xFEEDFACEDEADBEEF;
const POOL_MAGIC: u64 = 0xABCDEF0123456789;

/// Size classes for efficient allocation (jemalloc-inspired)
const SIZE_CLASSES: &[usize] = &[
    8, 16, 32, 48, 64, 80, 96, 112, 128, // Small (8-byte increments)
    160, 192, 224, 256, 320, 384, 448, 512, // Medium (64-byte increments)
    640, 768, 896, 1024, 1280, 1536, 1792, 2048, // Large (256-byte increments)
    2560, 3072, 3584, 4096, 5120, 6144, 7168, 8192, // XLarge (512-byte increments)
];

/// Configuration for secure memory pool
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SecurePoolConfig {
    /// Size of each chunk in bytes
    pub chunk_size: usize,
    /// Maximum number of chunks to keep in the pool
    pub max_chunks: usize,
    /// Alignment requirement for allocations
    pub alignment: usize,
    /// Enable guard pages for overflow detection
    pub use_guard_pages: bool,
    /// Zero memory on deallocation for security
    pub zero_on_free: bool,
    /// Zero memory on allocation for security
    pub zero_on_alloc: bool,
    /// Thread-local cache size
    pub local_cache_size: usize,
    /// Batch size for depot transfers
    pub batch_size: usize,
    /// Enable SIMD optimizations for memory operations (default: true)
    /// 
    /// When enabled, uses vectorized instructions for memory operations like zeroing
    /// on systems that support SIMD (AVX-512/AVX2/SSE2). Provides significant
    /// performance improvements for large memory operations while maintaining
    /// all security guarantees.
    pub enable_simd_ops: bool,
    /// Minimum size threshold for SIMD operations (default: 64 bytes)
    /// 
    /// Memory operations smaller than this threshold use standard implementations.
    /// SIMD operations provide meaningful performance benefits only for larger
    /// memory regions. The default of 64 bytes aligns with cache line size.
    pub simd_threshold: usize,
    /// Enable cache-line aligned allocations for better performance
    /// 
    /// When enabled, all allocations are aligned to cache line boundaries
    /// to minimize false sharing and improve cache efficiency. This provides
    /// significant performance benefits for data structures accessed across threads.
    pub enable_cache_alignment: bool,
    /// Cache layout configuration for optimization
    /// 
    /// Specifies cache-specific optimizations like prefetch distances,
    /// hot/cold separation, and access pattern hints.
    pub cache_config: Option<CacheLayoutConfig>,
    /// Enable NUMA-aware allocation (default: true)
    /// 
    /// When enabled, allocations prefer the local NUMA node to minimize
    /// memory access latency and maximize bandwidth utilization.
    pub enable_numa_awareness: bool,
    /// Enable hot/cold data separation for cache optimization
    /// 
    /// When enabled, frequently accessed allocations are placed in
    /// cache-friendly regions while cold data is moved to separate areas.
    pub enable_hot_cold_separation: bool,
    /// Allocation frequency threshold for hot data classification
    /// 
    /// Allocations accessed more than this threshold are considered hot.
    pub hot_data_threshold: usize,
    /// Enable huge page allocation for large chunks (Linux only)
    /// 
    /// When enabled, large allocations use huge pages to reduce TLB pressure
    /// and improve memory access performance.
    pub enable_huge_pages: bool,
    /// Minimum chunk size for huge page allocation (default: 2MB)
    ///
    /// Chunks smaller than this size use regular pages.
    pub huge_page_threshold: usize,
    /// Prefetch distance for bulk allocation operations (default: 8)
    ///
    /// Number of allocations to look ahead when prefetching cache lines.
    /// Higher values can improve throughput for large batch allocations but
    /// may waste prefetch bandwidth for small batches. Tune based on workload:
    /// - 4: Conservative, good for variable-size batches
    /// - 8: Default, balanced for most workloads
    /// - 12-16: Aggressive, best for large uniform batches
    pub prefetch_distance: usize,
}

impl SecurePoolConfig {
    /// Create a new secure pool configuration
    pub fn new(chunk_size: usize, max_chunks: usize, alignment: usize) -> Self {
        Self {
            chunk_size,
            max_chunks,
            alignment,
            use_guard_pages: false,
            zero_on_free: true,
            zero_on_alloc: false,
            local_cache_size: 64,
            batch_size: 16,
            enable_simd_ops: true,
            simd_threshold: 64,
            enable_cache_alignment: true,
            cache_config: Some(CacheLayoutConfig::new()),
            enable_numa_awareness: true,
            enable_hot_cold_separation: true,
            hot_data_threshold: 1000,
            enable_huge_pages: cfg!(target_os = "linux"),
            huge_page_threshold: 2 * 1024 * 1024, // 2MB
            prefetch_distance: 8, // Default: balanced for most workloads
        }
    }

    /// Create configuration for small objects (< 1KB) with security features
    pub fn small_secure() -> Self {
        Self {
            chunk_size: 1024,
            max_chunks: 100,
            alignment: 8,
            use_guard_pages: false,
            zero_on_free: true,
            zero_on_alloc: false,
            local_cache_size: 64,
            batch_size: 16,
            enable_simd_ops: true,
            simd_threshold: 64,
            enable_cache_alignment: true,
            cache_config: Some(CacheLayoutConfig::random()),
            enable_numa_awareness: true,
            enable_hot_cold_separation: true,
            hot_data_threshold: 500, // More sensitive for small objects
            enable_huge_pages: false, // Disable for small objects
            huge_page_threshold: 2 * 1024 * 1024,
            prefetch_distance: 8, // Default for small objects
        }
    }

    /// Create configuration for medium objects (< 64KB) with security features
    pub fn medium_secure() -> Self {
        Self {
            chunk_size: 64 * 1024,
            max_chunks: 50,
            alignment: 16,
            use_guard_pages: true,
            zero_on_free: true,
            zero_on_alloc: false,
            local_cache_size: 32,
            batch_size: 8,
            enable_simd_ops: true,
            simd_threshold: 64,
            enable_cache_alignment: true,
            cache_config: Some(CacheLayoutConfig::sequential()),
            enable_numa_awareness: true,
            enable_hot_cold_separation: true,
            hot_data_threshold: 750, // Balanced for medium objects
            enable_huge_pages: false, // Usually not needed for 64KB
            huge_page_threshold: 2 * 1024 * 1024,
            prefetch_distance: 8, // Default for medium objects
        }
    }

    /// Create configuration for large objects (< 1MB) with maximum security
    pub fn large_secure() -> Self {
        Self {
            chunk_size: 1024 * 1024,
            max_chunks: 10,
            alignment: 32,
            use_guard_pages: true,
            zero_on_free: true,
            zero_on_alloc: false,
            local_cache_size: 16,
            batch_size: 4,
            enable_simd_ops: true,
            simd_threshold: 64,
            enable_cache_alignment: true,
            cache_config: Some(CacheLayoutConfig::read_heavy()),
            enable_numa_awareness: true,
            enable_hot_cold_separation: true,
            hot_data_threshold: 1500, // Higher threshold for large objects
            enable_huge_pages: true, // Enable for large objects
            huge_page_threshold: 2 * 1024 * 1024,
            prefetch_distance: 12, // More aggressive for large batch allocations
        }
    }

    /// Builder method to set alignment
    pub fn with_alignment(mut self, alignment: usize) -> Self {
        self.alignment = alignment;
        self
    }

    /// Builder method to enable/disable guard pages
    pub fn with_guard_pages(mut self, use_guard_pages: bool) -> Self {
        self.use_guard_pages = use_guard_pages;
        self
    }

    /// Builder method to enable/disable zero on free
    pub fn with_zero_on_free(mut self, zero_on_free: bool) -> Self {
        self.zero_on_free = zero_on_free;
        self
    }

    /// Builder method to enable/disable zero on allocation
    pub fn with_zero_on_alloc(mut self, zero_on_alloc: bool) -> Self {
        self.zero_on_alloc = zero_on_alloc;
        self
    }

    /// Builder method to set local cache size
    pub fn with_local_cache_size(mut self, size: usize) -> Self {
        self.local_cache_size = size;
        self
    }

    /// Builder method to set batch size
    pub fn with_batch_size(mut self, size: usize) -> Self {
        self.batch_size = size;
        self
    }

    /// Builder method to enable/disable SIMD optimizations
    /// 
    /// When enabled (default), uses vectorized instructions for memory operations
    /// like zeroing on systems that support SIMD. Provides significant performance
    /// improvements while maintaining all security guarantees.
    pub fn with_simd_ops(mut self, enable: bool) -> Self {
        self.enable_simd_ops = enable;
        self
    }

    /// Builder method to set SIMD threshold
    /// 
    /// Memory operations smaller than this threshold use standard implementations.
    /// The default of 64 bytes aligns with cache line size and provides optimal
    /// performance characteristics for most workloads.
    pub fn with_simd_threshold(mut self, threshold: usize) -> Self {
        self.simd_threshold = threshold;
        self
    }

    /// Builder method to enable/disable cache alignment
    /// 
    /// When enabled (default), all allocations are aligned to cache line boundaries
    /// to minimize false sharing and improve cache efficiency. This provides
    /// significant performance benefits for data structures accessed across threads.
    pub fn with_cache_alignment(mut self, enable: bool) -> Self {
        self.enable_cache_alignment = enable;
        self
    }

    /// Builder method to set cache configuration
    /// 
    /// Specifies cache-specific optimizations like prefetch distances,
    /// hot/cold separation, and access pattern hints. If None, cache optimizations
    /// are disabled.
    pub fn with_cache_config(mut self, config: Option<CacheLayoutConfig>) -> Self {
        self.cache_config = config;
        self
    }

    /// Builder method to set cache configuration for specific access pattern
    /// 
    /// Convenience method that creates an appropriate cache configuration
    /// based on the expected access pattern.
    pub fn with_access_pattern(mut self, pattern: AccessPattern) -> Self {
        let config = match pattern {
            AccessPattern::Sequential => CacheLayoutConfig::sequential(),
            AccessPattern::Random => CacheLayoutConfig::random(),
            AccessPattern::WriteHeavy => CacheLayoutConfig::write_heavy(),
            AccessPattern::ReadHeavy => CacheLayoutConfig::read_heavy(),
            AccessPattern::Mixed => CacheLayoutConfig::new(),
        };
        self.cache_config = Some(config);
        self
    }

    /// Builder method to enable/disable NUMA awareness
    /// 
    /// When enabled, allocations prefer the local NUMA node to minimize
    /// memory access latency and maximize bandwidth utilization.
    pub fn with_numa_awareness(mut self, enable: bool) -> Self {
        self.enable_numa_awareness = enable;
        self
    }

    /// Builder method to enable/disable hot/cold data separation
    /// 
    /// When enabled, frequently accessed allocations are placed in
    /// cache-friendly regions while cold data is moved to separate areas.
    pub fn with_hot_cold_separation(mut self, enable: bool) -> Self {
        self.enable_hot_cold_separation = enable;
        self
    }

    /// Builder method to set hot data threshold
    /// 
    /// Allocations accessed more than this threshold are considered hot.
    pub fn with_hot_data_threshold(mut self, threshold: usize) -> Self {
        self.hot_data_threshold = threshold;
        self
    }

    /// Builder method to enable/disable huge pages
    /// 
    /// When enabled, large allocations use huge pages to reduce TLB pressure
    /// and improve memory access performance.
    pub fn with_huge_pages(mut self, enable: bool) -> Self {
        self.enable_huge_pages = enable;
        self
    }

    /// Builder method to set huge page threshold
    ///
    /// Chunks smaller than this size use regular pages.
    pub fn with_huge_page_threshold(mut self, threshold: usize) -> Self {
        self.huge_page_threshold = threshold;
        self
    }

    /// Builder method to set prefetch distance for bulk allocations
    ///
    /// Higher values can improve throughput for large batch allocations but
    /// may waste prefetch bandwidth for small batches. Recommended values:
    /// - 4: Conservative, good for variable-size batches
    /// - 8: Default, balanced for most workloads
    /// - 12-16: Aggressive, best for large uniform batches
    pub fn with_prefetch_distance(mut self, distance: usize) -> Self {
        self.prefetch_distance = distance;
        self
    }
}

/// Statistics for secure memory pool
#[derive(Debug, Clone)]
pub struct SecurePoolStats {
    /// Total bytes allocated
    pub allocated: u64,
    /// Total bytes available in pool
    pub available: u64,
    /// Number of chunks in pool
    pub chunks: usize,
    /// Number of allocations served
    pub alloc_count: u64,
    /// Number of deallocations
    pub dealloc_count: u64,
    /// Number of pool hits (reused memory)
    pub pool_hits: u64,
    /// Number of pool misses (new allocations)
    pub pool_misses: u64,
    /// Number of corruption detections
    pub corruption_detected: u64,
    /// Number of double-free attempts detected
    pub double_free_detected: u64,
    /// Thread-local cache hits
    pub local_cache_hits: u64,
    /// Cross-thread steals
    pub cross_thread_steals: u64,
    /// Cache-aligned allocations
    pub cache_aligned_allocs: u64,
    /// NUMA-local allocations
    pub numa_local_allocs: u64,
    /// Hot data allocations
    pub hot_data_allocs: u64,
    /// Cold data allocations
    pub cold_data_allocs: u64,
    /// Huge page allocations
    pub huge_page_allocs: u64,
    /// Cache hit ratio
    pub cache_hit_ratio: f64,
}

impl Default for SecurePoolStats {
    fn default() -> Self {
        Self {
            allocated: 0,
            available: 0,
            chunks: 0,
            alloc_count: 0,
            dealloc_count: 0,
            pool_hits: 0,
            pool_misses: 0,
            corruption_detected: 0,
            double_free_detected: 0,
            local_cache_hits: 0,
            cross_thread_steals: 0,
            cache_aligned_allocs: 0,
            numa_local_allocs: 0,
            hot_data_allocs: 0,
            cold_data_allocs: 0,
            huge_page_allocs: 0,
            cache_hit_ratio: 0.0,
        }
    }
}

/// Header for each memory chunk with security metadata
#[repr(C)]
#[derive(Debug)]
struct ChunkHeader {
    magic: u64,
    size: usize,
    generation: u32,
    pool_id: u32,
    allocation_time: u64,
    canary: u32,
    padding: u32,
}

/// Footer for each memory chunk with security metadata
#[repr(C)]
#[derive(Debug)]
struct ChunkFooter {
    canary: u32,
    generation: u32,
    magic: u64,
}

/// Secure wrapper around memory chunks with validation
pub struct SecureChunk {
    ptr: NonNull<u8>,
    size: usize,
    generation: u32,
    pool_id: u32,
    canary: u32,
}

impl SecureChunk {
    /// Create a new secure chunk with validation metadata
    fn new(size: usize, generation: u32, pool_id: u32) -> Result<Self> {
        let canary = fastrand::u32(..);
        let header_size = std::mem::size_of::<ChunkHeader>();
        let footer_size = std::mem::size_of::<ChunkFooter>();
        let total_size = header_size + size + footer_size;

        let layout = Layout::from_size_align(total_size, 8)
            .map_err(|_| ZiporaError::invalid_data("Invalid layout for chunk allocation"))?;

        // SAFETY: layout valid (size > 0, align power of 2)
        let raw_ptr = unsafe { alloc(layout) };
        if raw_ptr.is_null() {
            return Err(ZiporaError::out_of_memory(size));
        }

        // Initialize header
        let header = raw_ptr as *mut ChunkHeader;
        // SAFETY: ptr valid from alloc, aligned, within allocated region
        unsafe {
            (*header) = ChunkHeader {
                magic: CHUNK_HEADER_MAGIC,
                size,
                generation,
                pool_id,
                allocation_time: current_time_nanos(),
                canary,
                padding: 0,
            };
        }

        // Initialize footer
        // SAFETY: pointer valid from alloc, offset < total_size
        let footer_ptr = unsafe { raw_ptr.add(header_size + size) as *mut ChunkFooter };
        // SAFETY: footer_ptr valid, aligned, within allocated region
        unsafe {
            (*footer_ptr) = ChunkFooter {
                canary,
                generation,
                magic: CHUNK_FOOTER_MAGIC,
            };
        }

        // Return pointer to data area (after header)
        // SAFETY: pointer valid from alloc, offset < total_size
        let data_ptr = unsafe { raw_ptr.add(header_size) };

        Ok(Self {
            // SAFETY: data_ptr non-null (from non-null raw_ptr + offset)
            ptr: unsafe { NonNull::new_unchecked(data_ptr) },
            size,
            generation,
            pool_id,
            canary,
        })
    }

    // Note: new_from_ptr method temporarily removed to simplify compilation
    // Would be used for NUMA-aware allocations in the future

    /// Validate chunk integrity
    fn validate(&self) -> Result<()> {
        let header_size = std::mem::size_of::<ChunkHeader>();
        // SAFETY: ptr constructed in new() with header_size offset, sub reverses it
        let header_ptr = unsafe { self.ptr.as_ptr().sub(header_size) as *const ChunkHeader };
        // SAFETY: header_ptr valid, aligned to ChunkHeader, from allocation
        let header = unsafe { &*header_ptr };

        // Validate header
        if header.magic != CHUNK_HEADER_MAGIC {
            return Err(ZiporaError::invalid_data(&format!(
                "Header corruption detected: magic={:#x}, expected={:#x}",
                header.magic, CHUNK_HEADER_MAGIC
            )));
        }

        if header.generation != self.generation {
            return Err(ZiporaError::invalid_data(&format!(
                "Generation mismatch: header={}, chunk={}",
                header.generation, self.generation
            )));
        }

        if header.pool_id != self.pool_id {
            return Err(ZiporaError::invalid_data(&format!(
                "Pool ID mismatch: header={}, chunk={}",
                header.pool_id, self.pool_id
            )));
        }

        if header.canary != self.canary {
            return Err(ZiporaError::invalid_data(&format!(
                "Header canary mismatch: header={:#x}, chunk={:#x}",
                header.canary, self.canary
            )));
        }

        // Validate footer
        // SAFETY: ptr valid from allocation, self.size offset valid from new()
        let footer_ptr = unsafe { self.ptr.as_ptr().add(self.size) as *const ChunkFooter };
        // SAFETY: footer_ptr valid, aligned to ChunkFooter, from allocation
        let footer = unsafe { &*footer_ptr };

        if footer.magic != CHUNK_FOOTER_MAGIC {
            return Err(ZiporaError::invalid_data(&format!(
                "Footer corruption detected: magic={:#x}, expected={:#x}",
                footer.magic, CHUNK_FOOTER_MAGIC
            )));
        }

        if footer.canary != self.canary {
            return Err(ZiporaError::invalid_data(&format!(
                "Footer canary mismatch: footer={:#x}, chunk={:#x}",
                footer.canary, self.canary
            )));
        }

        if footer.generation != self.generation {
            return Err(ZiporaError::invalid_data(&format!(
                "Footer generation mismatch: footer={}, chunk={}",
                footer.generation, self.generation
            )));
        }

        Ok(())
    }

    /// Get pointer to data area
    #[inline]
    pub fn as_ptr(&self) -> *mut u8 {
        self.ptr.as_ptr()
    }

    /// Get size of data area
    #[inline]
    pub fn size(&self) -> usize {
        self.size
    }

    /// Get generation for validation
    pub fn generation(&self) -> u32 {
        self.generation
    }

    /// Safely deallocate the chunk with SIMD-optimized memory zeroing
    ///
    /// # SIMD Optimization
    /// For memory regions >= simd_threshold bytes, uses vectorized instructions
    /// (AVX-512/AVX2/SSE2) for significantly faster zeroing while maintaining
    /// all security guarantees. Falls back to standard zeroing for smaller regions.
    fn deallocate(self, zero_on_free: bool, enable_simd_ops: bool, simd_threshold: usize) {
        if zero_on_free {
            // SAFETY: pointer valid from allocation, size matches allocation
            unsafe {
                // SIMD-optimized memory zeroing for large regions
                if enable_simd_ops && self.size >= simd_threshold {
                    // Use SIMD fast_fill for large memory regions (≥64 bytes by default)
                    // Provides 2-3x faster zeroing with vectorized instructions
                    let slice = std::slice::from_raw_parts_mut(self.ptr.as_ptr(), self.size);
                    fast_fill(slice, 0);
                } else {
                    // Standard zeroing for small regions where SIMD overhead isn't beneficial
                    std::ptr::write_bytes(self.ptr.as_ptr(), 0, self.size);
                }
            }
        }

        let header_size = std::mem::size_of::<ChunkHeader>();
        let footer_size = std::mem::size_of::<ChunkFooter>();
        let total_size = header_size + self.size + footer_size;

        // SAFETY: ptr constructed in new() with header_size offset, sub reverses it
        let raw_ptr = unsafe { self.ptr.as_ptr().sub(header_size) };
        // SAFETY: Layout::from_size_align() cannot fail because:
        // 1. total_size was successfully used to allocate this chunk
        // 2. Alignment of 8 is always valid (power of 2)
        // 3. self.size was validated during allocation
        let layout = Layout::from_size_align(total_size, 8)
            .expect("layout creation: non-zero size, power-of-two alignment");

        // SAFETY: ptr from matching alloc, layout matches allocation
        unsafe {
            dealloc(raw_ptr, layout);
        }
    }
}

// Note: SecureChunk doesn't implement Drop because deallocate() takes self by value
// and Drop::drop() takes &mut self. Deallocation is handled explicitly by the pool
// or by SecurePooledPtr::drop() to ensure proper SIMD optimization based on config.

// SAFETY: SecureChunk is Send because:
// 1. `ptr: NonNull<u8>` - Raw pointer to heap-allocated memory. The chunk owns
//    this memory exclusively until deallocated. No thread-local state.
// 2. `size: usize` - Immutable primitive, trivially Send.
// 3. `canary: u32` - Immutable after construction, trivially Send.
// 4. `generation: u32` - Immutable after construction, trivially Send.
unsafe impl Send for SecureChunk {}

// SAFETY: SecureChunk is Sync because:
// 1. All fields are immutable after construction.
// 2. The chunk represents exclusive ownership of a memory region.
// 3. Sharing &SecureChunk only provides read access to the pointer and metadata.
// 4. The pointer itself is never dereferenced through &self - only through
//    explicit unsafe code that requires the caller to ensure synchronization.
//
// Note: While SecureChunk is Sync, writing to the underlying memory through
// the pointer requires external synchronization (which SecurePooledPtr provides).
unsafe impl Sync for SecureChunk {}

/// Lock-free stack for high-performance chunk storage (Treiber stack)
use crossbeam_epoch::{self as epoch, Atomic, Owned, Shared};

struct LockFreeStack<T> {
    head: Atomic<Node<T>>,
}

struct Node<T> {
    data: std::mem::ManuallyDrop<T>,
    next: Atomic<Node<T>>,
}

impl<T> LockFreeStack<T> {
    fn new() -> Self {
        Self {
            head: Atomic::null(),
        }
    }

    fn push(&self, item: T) {
        let mut node = Owned::new(Node {
            data: std::mem::ManuallyDrop::new(item),
            next: Atomic::null(),
        });

        let guard = epoch::pin();
        loop {
            let head = self.head.load(Ordering::Acquire, &guard);
            node.next.store(head, Ordering::Relaxed);

            match self.head.compare_exchange(head, node, Ordering::Release, Ordering::Relaxed, &guard) {
                Ok(_) => break,
                Err(e) => node = e.new,
            }
        }
    }

    fn pop(&self) -> Option<T> {
        let guard = epoch::pin();
        loop {
            let head = self.head.load(Ordering::Acquire, &guard);
            
            // Check if stack is empty
            if head.is_null() {
                return None;
            }

            // SAFETY: The node is protected from destruction by the epoch guard.
            // Even if another thread pops this node, its memory will remain valid
            // until the current epoch advances, ensuring the read of `next` is safe.
            let next = unsafe { head.deref() }.next.load(Ordering::Relaxed, &guard);

            if self.head.compare_exchange(head, next, Ordering::AcqRel, Ordering::Acquire, &guard).is_ok() {
                // We successfully unlinked the head node. We now have exclusive ownership
                // of the data inside it, but we must wait for all threads to drop their
                // guards before the memory backing the node can be reclaimed.
                unsafe {
                    guard.defer_destroy(head);
                    // The data itself can be moved out immediately since we logically
                    // own it. Rust's Ownership prevents use-after-free of the `data`.
                    return Some(std::mem::ManuallyDrop::into_inner(std::ptr::read(&head.deref().data)));
                }
            }
        }
    }

    fn is_empty(&self) -> bool {
        let guard = epoch::pin();
        self.head.load(Ordering::Acquire, &guard).is_null()
    }
}

impl<T> Drop for LockFreeStack<T> {
    fn drop(&mut self) {
        while self.pop().is_some() {}
    }
}


/// Thread-local cache for reduced contention
#[derive(Default)]
struct LocalCache {
    chunks: Vec<SecureChunk>,
    max_size: usize,
}

impl LocalCache {
    fn new(max_size: usize) -> Self {
        Self {
            chunks: Vec::with_capacity(max_size),
            max_size,
        }
    }

    fn try_pop(&mut self) -> Option<SecureChunk> {
        self.chunks.pop()
    }

    fn try_push(&mut self, chunk: SecureChunk) -> std::result::Result<(), SecureChunk> {
        if self.chunks.len() < self.max_size {
            self.chunks.push(chunk);
            Ok(())
        } else {
            Err(chunk)
        }
    }

    fn is_empty(&self) -> bool {
        self.chunks.is_empty()
    }

    fn len(&self) -> usize {
        self.chunks.len()
    }

    fn clear(&mut self, zero_on_free: bool, enable_simd_ops: bool, simd_threshold: usize) {
        for chunk in self.chunks.drain(..) {
            chunk.deallocate(zero_on_free, enable_simd_ops, simd_threshold);
        }
    }
}

/// Production-ready secure memory pool
pub struct SecureMemoryPool {
    config: SecurePoolConfig,
    pool_id: u32,
    global_stack: LockFreeStack<SecureChunk>,
    next_generation: AtomicU32,
    local_caches: thread_local::ThreadLocal<RefCell<LocalCache>>,
    
    // Cache optimization infrastructure
    cache_allocator: Option<CacheOptimizedAllocator>,
    hot_cold_separator: std::sync::Mutex<HotColdSeparator<usize>>,

    // Statistics (lock-free)
    alloc_count: CachePadded<AtomicU64>,
    dealloc_count: CachePadded<AtomicU64>,
    pool_hits: CachePadded<AtomicU64>,
    pool_misses: CachePadded<AtomicU64>,
    corruption_detected: CachePadded<AtomicU64>,
    double_free_detected: CachePadded<AtomicU64>,
    local_cache_hits: CachePadded<AtomicU64>,
    cross_thread_steals: CachePadded<AtomicU64>,
    
    // Cache-specific statistics
    cache_aligned_allocs: CachePadded<AtomicU64>,
    numa_local_allocs: CachePadded<AtomicU64>,
    hot_data_allocs: CachePadded<AtomicU64>,
    cold_data_allocs: CachePadded<AtomicU64>,
    huge_page_allocs: CachePadded<AtomicU64>,

    // Allocation tracking for double-free detection (using usize for Send+Sync safety)
    active_allocations: std::sync::RwLock<HashMap<usize, (u32, Instant)>>, // ptr_addr -> (generation, time)
}

impl std::fmt::Debug for SecureMemoryPool {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("SecureMemoryPool")
            .field("config", &self.config)
            .field("pool_id", &self.pool_id)
            .field("next_generation", &self.next_generation)
            .finish_non_exhaustive()
    }
}

impl SecureMemoryPool {
    /// Create a new secure memory pool
    pub fn new(config: SecurePoolConfig) -> Result<Arc<Self>> {
        if config.chunk_size == 0 {
            return Err(ZiporaError::invalid_data("chunk_size cannot be zero"));
        }

        if config.alignment == 0 || !config.alignment.is_power_of_two() {
            return Err(ZiporaError::invalid_data(
                "alignment must be a power of two",
            ));
        }

        static POOL_ID_COUNTER: AtomicU32 = AtomicU32::new(1);
        let pool_id = POOL_ID_COUNTER.fetch_add(1, Ordering::Relaxed);

        // Initialize cache allocator if cache alignment is enabled
        let cache_allocator = if config.enable_cache_alignment && config.cache_config.is_some() {
            // SAFETY: is_some() check above guarantees this unwrap succeeds
            Some(CacheOptimizedAllocator::new(config.cache_config.clone().expect("cache_config present when cache_friendly enabled")))
        } else {
            None
        };

        // Initialize hot/cold separator if enabled
        let hot_cold_separator = if config.enable_hot_cold_separation && config.cache_config.is_some() {
            // SAFETY: is_some() check above guarantees this unwrap succeeds
            HotColdSeparator::<usize>::new(config.cache_config.clone().expect("cache_config present when hot_cold_separation enabled"))
        } else {
            HotColdSeparator::<usize>::new(CacheLayoutConfig::default())
        };

        Ok(Arc::new(Self {
            config,
            pool_id,
            global_stack: LockFreeStack::new(),
            next_generation: AtomicU32::new(1),
            local_caches: thread_local::ThreadLocal::new(),
            cache_allocator,
            hot_cold_separator: std::sync::Mutex::new(hot_cold_separator),
            cache_aligned_allocs: CachePadded::new(AtomicU64::new(0)),
            numa_local_allocs: CachePadded::new(AtomicU64::new(0)),
            hot_data_allocs: CachePadded::new(AtomicU64::new(0)),
            cold_data_allocs: CachePadded::new(AtomicU64::new(0)),
            huge_page_allocs: CachePadded::new(AtomicU64::new(0)),
            alloc_count: CachePadded::new(AtomicU64::new(0)),
            dealloc_count: CachePadded::new(AtomicU64::new(0)),
            pool_hits: CachePadded::new(AtomicU64::new(0)),
            pool_misses: CachePadded::new(AtomicU64::new(0)),
            corruption_detected: CachePadded::new(AtomicU64::new(0)),
            double_free_detected: CachePadded::new(AtomicU64::new(0)),
            local_cache_hits: CachePadded::new(AtomicU64::new(0)),
            cross_thread_steals: CachePadded::new(AtomicU64::new(0)),
            active_allocations: std::sync::RwLock::new(HashMap::new()),
        }))
    }

    /// Allocate a chunk from the pool with RAII guard and cache optimization
    pub fn allocate(self: &Arc<Self>) -> Result<SecurePooledPtr> {
        self.allocate_with_hint(false) // Default: not marked as hot
    }

    /// Allocate a chunk with hot/cold hint for cache optimization
    pub fn allocate_with_hint(self: &Arc<Self>, is_hot: bool) -> Result<SecurePooledPtr> {
        self.alloc_count.fetch_add(1, Ordering::Relaxed);

        // Try thread-local cache first
        let local_cache = self
            .local_caches
            .get_or(|| RefCell::new(LocalCache::new(self.config.local_cache_size)));

        if let Some(chunk) = local_cache.borrow_mut().try_pop() {
            self.local_cache_hits.fetch_add(1, Ordering::Relaxed);
            self.pool_hits.fetch_add(1, Ordering::Relaxed);

            // Validate chunk before returning
            if let Err(e) = chunk.validate() {
                self.corruption_detected.fetch_add(1, Ordering::Relaxed);
                return Err(e);
            }

            // Track hot/cold allocation statistics
            if is_hot {
                self.hot_data_allocs.fetch_add(1, Ordering::Relaxed);
            } else {
                self.cold_data_allocs.fetch_add(1, Ordering::Relaxed);
            }

            // Hot/cold separation temporarily simplified
            // if self.config.enable_hot_cold_separation {
            //     if let Ok(mut separator_guard) = self.hot_cold_separator.lock() {
            //         if let Some(ref mut separator) = *separator_guard {
            //             let access_count = if is_hot { self.config.hot_data_threshold + 1 } else { 1 };
            //             separator.insert(chunk.as_ptr() as usize, access_count);
            //         }
            //     }
            // }

            // Track allocation
            self.active_allocations.write().expect("active_allocations lock").insert(
                chunk.as_ptr() as usize,
                (chunk.generation(), Instant::now()),
            );

            return Ok(SecurePooledPtr {
                chunk: Some(chunk),
                pool: Arc::downgrade(self),
            });
        }

        // Try global stack
        if let Some(chunk) = self.global_stack.pop() {
            self.cross_thread_steals.fetch_add(1, Ordering::Relaxed);
            self.pool_hits.fetch_add(1, Ordering::Relaxed);

            // Validate chunk before returning
            if let Err(e) = chunk.validate() {
                self.corruption_detected.fetch_add(1, Ordering::Relaxed);
                return Err(e);
            }

            // Track hot/cold allocation statistics
            if is_hot {
                self.hot_data_allocs.fetch_add(1, Ordering::Relaxed);
            } else {
                self.cold_data_allocs.fetch_add(1, Ordering::Relaxed);
            }

            // Track allocation
            self.active_allocations.write().expect("active_allocations lock").insert(
                chunk.as_ptr() as usize,
                (chunk.generation(), Instant::now()),
            );

            return Ok(SecurePooledPtr {
                chunk: Some(chunk),
                pool: Arc::downgrade(self),
            });
        }

        // Allocate new chunk with cache optimizations
        self.pool_misses.fetch_add(1, Ordering::Relaxed);
        let generation = self.next_generation.fetch_add(1, Ordering::AcqRel);
        let chunk = self.allocate_new_chunk_optimized(generation, is_hot)?;

        // Track hot/cold allocation statistics
        if is_hot {
            self.hot_data_allocs.fetch_add(1, Ordering::Relaxed);
        } else {
            self.cold_data_allocs.fetch_add(1, Ordering::Relaxed);
        }

        // Track allocation
        self.active_allocations.write().expect("active_allocations lock").insert(
            chunk.as_ptr() as usize,
            (chunk.generation(), Instant::now()),
        );

        Ok(SecurePooledPtr {
            chunk: Some(chunk),
            pool: Arc::downgrade(self),
        })
    }

    /// Internal deallocation with security validation
    /// 
    /// # SIMD Performance
    /// Chunks returned to cache/stack are not immediately deallocated (and thus not SIMD-optimized).
    /// SIMD optimization occurs when chunks are finally deallocated during:
    /// - Pool cleanup (clear/drop) - uses pool's SIMD configuration
    /// - SecurePooledPtr drop when pool is gone - uses conservative SIMD defaults
    fn deallocate_internal(&self, chunk: SecureChunk) -> Result<()> {
        self.dealloc_count.fetch_add(1, Ordering::Relaxed);

        // Validate chunk before deallocation
        if let Err(e) = chunk.validate() {
            self.corruption_detected.fetch_add(1, Ordering::Relaxed);
            return Err(e);
        }

        // Check for double-free
        if let Some(allocation_info) =
            self.active_allocations.write().expect("active_allocations lock").remove(&(chunk.as_ptr() as usize))
        {
            let (original_generation, _): (u32, Instant) = allocation_info;
            if original_generation != chunk.generation() {
                self.double_free_detected.fetch_add(1, Ordering::Relaxed);
                return Err(ZiporaError::invalid_data(
                    "Double-free detected: generation mismatch",
                ));
            }
        } else {
            self.double_free_detected.fetch_add(1, Ordering::Relaxed);
            return Err(ZiporaError::invalid_data(
                "Double-free detected: pointer not allocated",
            ));
        }

        // Try to return to thread-local cache
        let local_cache = self
            .local_caches
            .get_or(|| RefCell::new(LocalCache::new(self.config.local_cache_size)));

        if local_cache.borrow_mut().try_push(chunk).is_err() {
            // Local cache full, try global stack
            // SAFETY: try_push() just failed at line 992, guaranteeing cache has at least one element
            let chunk = local_cache.borrow_mut().try_pop().expect("local cache non-empty by count check");
            self.global_stack.push(chunk);
        }

        Ok(())
    }

    /// Allocate a new chunk with cache optimizations
    fn allocate_new_chunk_optimized(&self, generation: u32, is_hot: bool) -> Result<SecureChunk> {
        // Use cache allocator if available and chunk size meets threshold
        if let Some(ref cache_allocator) = self.cache_allocator {
            if self.config.enable_cache_alignment {
                self.cache_aligned_allocs.fetch_add(1, Ordering::Relaxed);
            }

            // Check for NUMA-aware allocation
            if self.config.enable_numa_awareness {
                let optimal_node = -1; // Simplified: disable NUMA for now
                // let optimal_node = get_optimal_numa_node();
                if optimal_node >= 0 {
                    self.numa_local_allocs.fetch_add(1, Ordering::Relaxed);
                    
                    // NUMA allocation temporarily disabled for simplicity
                    // if let Ok(ptr) = numa_alloc_aligned(
                    //     self.config.chunk_size,
                    //     self.config.cache_config.as_ref().map(|c| c.cache_line_size).unwrap_or(64),
                    //     optimal_node as u32
                    // ) {
                    //     return SecureChunk::new_from_ptr(ptr, self.config.chunk_size, generation, self.pool_id);
                    // }
                }
            }

            // Check for huge page allocation
            #[cfg(target_os = "linux")]
            if self.config.enable_huge_pages && self.config.chunk_size >= self.config.huge_page_threshold {
                self.huge_page_allocs.fetch_add(1, Ordering::Relaxed);
                // Try huge page allocation (would integrate with hugepage module)
                // For now, fall through to regular allocation
            }

            // Prefetch hints for hot data
            if is_hot && self.config.cache_config.as_ref().map(|c| c.enable_prefetch).unwrap_or(false) {
                // Future: implement prefetch hints for hot allocations
            }
        }

        // Fall back to regular allocation
        let mut chunk = SecureChunk::new(self.config.chunk_size, generation, self.pool_id)?;

        // SIMD-optimized memory zeroing on allocation if configured
        if self.config.zero_on_alloc {
            self.zero_chunk_simd(&mut chunk)?;
        }

        Ok(chunk)
    }

    /// Zero chunk memory with SIMD acceleration and adaptive selection
    fn zero_chunk_simd(&self, chunk: &mut SecureChunk) -> Result<()> {
        if !self.config.enable_simd_ops || chunk.size() < self.config.simd_threshold {
            // SAFETY: pointer valid from allocation, size matches allocation
            unsafe {
                std::ptr::write_bytes(chunk.as_ptr(), 0, chunk.size());
            }
            return Ok(());
        }

        // Use adaptive SIMD selection for optimal performance
        let selector = AdaptiveSimdSelector::global();
        let start = Instant::now();

        // SAFETY: pointer valid from allocation, size matches allocation
        let slice = unsafe { std::slice::from_raw_parts_mut(chunk.as_ptr(), chunk.size()) };

        // Use SIMD-accelerated fill
        fast_fill(slice, 0);

        // Monitor performance for adaptive adjustment
        selector.monitor_performance(
            Operation::MemZero,
            start.elapsed(),
            chunk.size() as u64
        );

        Ok(())
    }

    /// Verify memory is properly zeroed using SIMD acceleration
    ///
    /// # Safety
    /// This function is safe because it accepts a slice, which guarantees
    /// the pointer and length form a valid memory range.
    ///
    /// # Performance
    /// Uses SIMD comparison for 8-16x faster verification than byte-by-byte checking.
    /// Particularly effective for large memory regions.
    #[inline]
    pub fn verify_zeroed_simd(&self, data: &[u8]) -> Result<bool> {
        // Create zero buffer for comparison
        let zero_buf = vec![0u8; data.len()];

        // Use SIMD comparison (8-16x faster than byte-by-byte)
        Ok(fast_compare(data, &zero_buf) == 0)
    }

    /// Prefetch allocation metadata for improved cache performance
    ///
    /// # Performance
    /// Issues prefetch hints for metadata structures to reduce cache misses
    /// during allocation. Most beneficial for large batch allocations.
    #[inline]
    fn prefetch_allocation_metadata(&self, _size: usize) {
        if !self.config.enable_cache_alignment {
            return;
        }

        #[cfg(target_arch = "x86_64")]
        {
            // Prefetch the global stack head (most frequently accessed metadata)
            fast_prefetch(&self.global_stack, PrefetchHint::T0);

            // Prefetch stats counters (frequently accessed during allocation)
            fast_prefetch(&self.alloc_count, PrefetchHint::T0);
        }
    }

    /// Bulk allocation with prefetching for improved performance
    ///
    /// # Performance
    /// Uses configurable lookahead prefetching (default: 8) to warm cache for future
    /// allocations, providing 20-30% improvement over sequential allocations.
    /// The prefetch distance can be tuned via `SecurePoolConfig::prefetch_distance`.
    ///
    /// # Arguments
    /// * `sizes` - Slice of allocation sizes (all should be equal to chunk_size)
    ///
    /// # Returns
    /// Vector of allocated pointers in the same order as requested sizes
    pub fn allocate_bulk_with_prefetch(self: &Arc<Self>, sizes: &[usize]) -> Result<Vec<SecurePooledPtr>> {
        let prefetch_distance = self.config.prefetch_distance;
        let mut results = Vec::with_capacity(sizes.len());

        for (i, &size) in sizes.iter().enumerate() {
            // Validate size matches chunk size
            if size != self.config.chunk_size {
                return Err(ZiporaError::invalid_data(
                    format!("Size {} does not match chunk_size {}", size, self.config.chunk_size)
                ));
            }

            // Prefetch future allocations (lookahead)
            if i + prefetch_distance < sizes.len() {
                self.prefetch_allocation_metadata(sizes[i + prefetch_distance]);
            }

            // Perform allocation
            results.push(self.allocate()?);
        }

        Ok(results)
    }

    /// Get current pool statistics
    pub fn stats(&self) -> SecurePoolStats {
        let total_allocs = self.alloc_count.load(Ordering::Relaxed);
        let cache_hit_ratio = if total_allocs > 0 {
            self.pool_hits.load(Ordering::Relaxed) as f64 / total_allocs as f64
        } else {
            0.0
        };

        SecurePoolStats {
            allocated: 0, // Would need to track this separately
            available: 0, // Would need to track this separately
            chunks: 0,    // Would need to track this separately
            alloc_count: total_allocs,
            dealloc_count: self.dealloc_count.load(Ordering::Relaxed),
            pool_hits: self.pool_hits.load(Ordering::Relaxed),
            pool_misses: self.pool_misses.load(Ordering::Relaxed),
            corruption_detected: self.corruption_detected.load(Ordering::Relaxed),
            double_free_detected: self.double_free_detected.load(Ordering::Relaxed),
            local_cache_hits: self.local_cache_hits.load(Ordering::Relaxed),
            cross_thread_steals: self.cross_thread_steals.load(Ordering::Relaxed),
            cache_aligned_allocs: self.cache_aligned_allocs.load(Ordering::Relaxed),
            numa_local_allocs: self.numa_local_allocs.load(Ordering::Relaxed),
            hot_data_allocs: self.hot_data_allocs.load(Ordering::Relaxed),
            cold_data_allocs: self.cold_data_allocs.load(Ordering::Relaxed),
            huge_page_allocs: self.huge_page_allocs.load(Ordering::Relaxed),
            cache_hit_ratio,
        }
    }

    /// Clear all chunks from the pool
    pub fn clear(&self) -> Result<()> {
        // Clear global stack and deallocate chunks with SIMD optimization
        while let Some(chunk) = self.global_stack.pop() {
            chunk.deallocate(
                self.config.zero_on_free,
                self.config.enable_simd_ops,
                self.config.simd_threshold
            );
        }

        // Note: We cannot safely clear thread-local caches from another thread
        // due to RefCell not being Sync. Thread-local caches will be cleared
        // when threads exit or when they access the cache and find it should be cleared.

        // Clear allocation tracking
        self.active_allocations.write().expect("active_allocations lock").clear();

        Ok(())
    }

    /// Get pool configuration
    pub fn config(&self) -> &SecurePoolConfig {
        &self.config
    }

    /// Validate pool integrity
    pub fn validate(&self) -> Result<()> {
        // Check active allocations for corruption
        for (&ptr_addr, &(generation, _time)) in self.active_allocations.read().expect("active_allocations lock").iter() {

            // Read canary from the chunk header for validation
            let data_ptr = ptr_addr as *mut u8;
            let header_size = std::mem::size_of::<ChunkHeader>();
            // SAFETY: data_ptr from active allocation, sub reverses new() offset
            let header_ptr = unsafe { data_ptr.sub(header_size) as *const ChunkHeader };
            // SAFETY: header_ptr valid, aligned to ChunkHeader, from allocation
            let header = unsafe { &*header_ptr };

            // Create temporary chunk for validation with correct canary
            let chunk = SecureChunk {
                // SAFETY: data_ptr non-null from active allocation
                ptr: unsafe { NonNull::new_unchecked(data_ptr) },
                size: self.config.chunk_size,
                generation,
                pool_id: self.pool_id,
                canary: header.canary,
            };

            if let Err(e) = chunk.validate() {
                self.corruption_detected.fetch_add(1, Ordering::Relaxed);
                return Err(e);
            }
        }

        Ok(())
    }
}

impl Drop for SecureMemoryPool {
    fn drop(&mut self) {
        let _ = self.clear();
    }
}

/// RAII guard for automatic secure deallocation
pub struct SecurePooledPtr {
    chunk: Option<SecureChunk>,
    pool: Weak<SecureMemoryPool>,
}

impl SecurePooledPtr {
    /// Get pointer to allocated memory
    #[inline]
    pub fn as_ptr(&self) -> *mut u8 {
        self.chunk
            .as_ref()
            .map(|c| c.as_ptr())
            .unwrap_or(std::ptr::null_mut())
    }

    /// Get non-null pointer to allocated memory
    pub fn as_non_null(&self) -> Option<NonNull<u8>> {
        self.chunk.as_ref().and_then(|c| NonNull::new(c.as_ptr()))
    }

    /// Get size of allocated memory
    #[inline]
    pub fn size(&self) -> usize {
        self.chunk.as_ref().map(|c| c.size()).unwrap_or(0)
    }

    /// Get generation for debugging
    pub fn generation(&self) -> u32 {
        self.chunk.as_ref().map(|c| c.generation()).unwrap_or(0)
    }

    /// Validate chunk integrity
    pub fn validate(&self) -> Result<()> {
        if let Some(chunk) = &self.chunk {
            chunk.validate()
        } else {
            Err(ZiporaError::invalid_data("Chunk already deallocated"))
        }
    }

    /// Get slice view of the allocated memory
    #[inline]
    pub fn as_slice(&self) -> &[u8] {
        if let Some(chunk) = &self.chunk {
            // SAFETY: pointer valid from allocation, size matches allocation
            unsafe { std::slice::from_raw_parts(chunk.as_ptr(), chunk.size()) }
        } else {
            &[]
        }
    }

    /// Get mutable slice view of the allocated memory
    #[inline]
    pub fn as_mut_slice(&mut self) -> &mut [u8] {
        if let Some(chunk) = &self.chunk {
            // SAFETY: pointer valid from allocation, size matches allocation
            unsafe { std::slice::from_raw_parts_mut(chunk.as_ptr(), chunk.size()) }
        } else {
            &mut []
        }
    }
}

impl Drop for SecurePooledPtr {
    fn drop(&mut self) {
        if let Some(chunk) = self.chunk.take() {
            if let Some(pool) = self.pool.upgrade() {
                // Best effort deallocation - if it fails, the chunk is leaked
                // but this prevents crashes during cleanup
                let _ = pool.deallocate_internal(chunk);
            } else {
                // Pool is gone, deallocate directly with SIMD defaults
                // Use conservative SIMD settings for safety when pool config unavailable
                chunk.deallocate(true, true, 64); // Always zero on free when pool is gone
            }
        }
    }
}

// SAFETY: SecurePooledPtr is Send because:
// 1. `chunk: Option<SecureChunk>` - SecureChunk is Send (see above).
// 2. `pool: Weak<SecureMemoryPool>` - Weak<T> is Send if T is Send+Sync.
//    SecureMemoryPool uses Arc and atomics for thread-safe reference counting.
// Transferring ownership between threads is safe as only one owner exists.
unsafe impl Send for SecurePooledPtr {}

// SAFETY: SecurePooledPtr is Sync because:
// 1. All access is through `&self` (immutable) or `&mut self` (exclusive).
// 2. `as_slice()` returns a read-only view of the memory.
// 3. `as_mut_slice()` requires `&mut self`, ensuring exclusive access.
// 4. The underlying chunk is either None or exclusively owned by this ptr.
// 5. Pool deallocation is thread-safe via Arc's atomic reference counting.
//
// Note: Safe concurrent read access is provided. Mutable access requires &mut self.
unsafe impl Sync for SecurePooledPtr {}

/// Get current time in nanoseconds
fn current_time_nanos() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos() as u64
}

/// Find appropriate size class for given size
pub fn size_to_class(size: usize) -> usize {
    SIZE_CLASSES.binary_search(&size).unwrap_or_else(|i| {
        if i < SIZE_CLASSES.len() {
            i
        } else {
            SIZE_CLASSES.len() - 1
        }
    })
}

/// Global secure pools for different size classes
static GLOBAL_SECURE_POOLS: Lazy<Vec<Arc<SecureMemoryPool>>> = Lazy::new(|| {
    // SAFETY: Static initialization - panic on pool creation failure is appropriate
    // because the program cannot continue without global memory pools
    vec![
        SecureMemoryPool::new(SecurePoolConfig::small_secure()).expect("small secure pool creation"),
        SecureMemoryPool::new(SecurePoolConfig::medium_secure()).expect("medium secure pool creation"),
        SecureMemoryPool::new(SecurePoolConfig::large_secure()).expect("large secure pool creation"),
    ]
});

/// Get appropriate global pool for size
pub fn get_global_pool_for_size(size: usize) -> &'static Arc<SecureMemoryPool> {
    if size <= 1024 {
        &GLOBAL_SECURE_POOLS[0]
    } else if size <= 64 * 1024 {
        &GLOBAL_SECURE_POOLS[1]
    } else {
        &GLOBAL_SECURE_POOLS[2]
    }
}

/// Get aggregated statistics from all global pools
pub fn get_global_secure_pool_stats() -> SecurePoolStats {
    let mut total_stats = SecurePoolStats::default();

    for pool in GLOBAL_SECURE_POOLS.iter() {
        let stats = pool.stats();
        total_stats.allocated += stats.allocated;
        total_stats.available += stats.available;
        total_stats.chunks += stats.chunks;
        total_stats.alloc_count += stats.alloc_count;
        total_stats.dealloc_count += stats.dealloc_count;
        total_stats.pool_hits += stats.pool_hits;
        total_stats.pool_misses += stats.pool_misses;
        total_stats.corruption_detected += stats.corruption_detected;
        total_stats.double_free_detected += stats.double_free_detected;
        total_stats.local_cache_hits += stats.local_cache_hits;
        total_stats.cross_thread_steals += stats.cross_thread_steals;
    }

    total_stats
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::AtomicUsize;
    use std::thread;
    use std::time::Duration;

    #[test]
    fn test_secure_pool_creation() {
        let config = SecurePoolConfig::small_secure();
        let pool = SecureMemoryPool::new(config).unwrap();

        let stats = pool.stats();
        assert_eq!(stats.alloc_count, 0);
        assert_eq!(stats.dealloc_count, 0);
    }

    #[test]
    fn test_secure_allocation_deallocation() {
        let config = SecurePoolConfig::small_secure();
        let pool = SecureMemoryPool::new(config).unwrap();

        let ptr1 = pool.allocate().unwrap();
        assert!(!ptr1.as_ptr().is_null());
        assert_eq!(ptr1.size(), 1024);

        let ptr2 = pool.allocate().unwrap();
        assert!(!ptr2.as_ptr().is_null());
        assert_ne!(ptr1.as_ptr(), ptr2.as_ptr());

        // Chunks are automatically deallocated on drop
        drop(ptr1);
        drop(ptr2);

        let stats = pool.stats();
        assert_eq!(stats.alloc_count, 2);
        assert_eq!(stats.dealloc_count, 2);
    }

    #[test]
    fn test_chunk_validation() {
        let config = SecurePoolConfig::small_secure();
        let pool = SecureMemoryPool::new(config).unwrap();

        let ptr = pool.allocate().unwrap();
        assert!(ptr.validate().is_ok());

        // Validation should still work
        assert!(ptr.validate().is_ok());
    }

    #[test]
    fn test_double_free_detection() {
        let config = SecurePoolConfig::small_secure();
        let pool = SecureMemoryPool::new(config).unwrap();

        let ptr = pool.allocate().unwrap();
        let _raw_ptr = ptr.as_ptr();

        // First deallocation is automatic on drop
        drop(ptr);

        // Attempt to manually create and deallocate the same pointer should fail
        // This is prevented by the RAII design - you can't create a SecurePooledPtr
        // from a raw pointer, so double-free is structurally prevented

        let stats = pool.stats();
        assert_eq!(stats.double_free_detected, 0); // No double-free attempts possible
    }

    #[test]
    fn test_pool_reuse() {
        let config = SecurePoolConfig::small_secure();
        let pool = SecureMemoryPool::new(config).unwrap();

        let ptr1 = pool.allocate().unwrap();
        let _addr1 = ptr1.as_ptr();
        drop(ptr1);

        let ptr2 = pool.allocate().unwrap();
        let addr2 = ptr2.as_ptr();

        // Memory might be reused (but not guaranteed due to generation counters)
        // What matters is that it works correctly either way
        assert!(!addr2.is_null());
        drop(ptr2);

        let stats = pool.stats();
        assert!(stats.pool_hits > 0 || stats.pool_misses > 0);
    }

    #[test]
    fn test_concurrent_allocation() {
        let config = SecurePoolConfig::small_secure();
        let pool = SecureMemoryPool::new(config).unwrap();
        let allocated_count = Arc::new(AtomicUsize::new(0));

        let handles: Vec<_> = (0..10)
            .map(|_| {
                let pool = pool.clone();
                let count = allocated_count.clone();
                thread::spawn(move || {
                    for _ in 0..100 {
                        let _ptr = pool.allocate().unwrap();
                        count.fetch_add(1, Ordering::Relaxed);

                        // Simulate some work
                        thread::sleep(Duration::from_micros(1));

                        // ptr automatically deallocated on scope exit
                    }
                })
            })
            .collect();

        for handle in handles {
            handle.join().unwrap();
        }

        assert_eq!(allocated_count.load(Ordering::Relaxed), 1000);

        let stats = pool.stats();
        assert_eq!(stats.alloc_count, 1000);
        assert_eq!(stats.dealloc_count, 1000);
        assert_eq!(stats.corruption_detected, 0);
        assert_eq!(stats.double_free_detected, 0);
    }

    #[test]
    fn test_thread_local_caching() {
        let config = SecurePoolConfig::small_secure();
        let pool = SecureMemoryPool::new(config).unwrap();

        // Allocate and deallocate to populate thread-local cache
        for _ in 0..10 {
            let ptr = pool.allocate().unwrap();
            drop(ptr);
        }

        // Next allocations should hit thread-local cache
        let ptr = pool.allocate().unwrap();
        drop(ptr);

        let stats = pool.stats();
        assert!(stats.local_cache_hits > 0);
    }

    #[test]
    fn test_size_classes() {
        assert_eq!(size_to_class(8), 0);
        assert_eq!(size_to_class(16), 1);
        assert_eq!(size_to_class(100), 7); // Should use 112-byte class
        assert_eq!(size_to_class(1000), 20); // Should use 1024-byte class
    }

    #[test]
    fn test_global_pools() {
        let small_pool = get_global_pool_for_size(100);
        let medium_pool = get_global_pool_for_size(10000);
        let large_pool = get_global_pool_for_size(100000);

        assert_eq!(small_pool.config().chunk_size, 1024);
        assert_eq!(medium_pool.config().chunk_size, 64 * 1024);
        assert_eq!(large_pool.config().chunk_size, 1024 * 1024);

        let ptr1 = small_pool.allocate().unwrap();
        let ptr2 = medium_pool.allocate().unwrap();
        let ptr3 = large_pool.allocate().unwrap();

        assert!(!ptr1.as_ptr().is_null());
        assert!(!ptr2.as_ptr().is_null());
        assert!(!ptr3.as_ptr().is_null());
    }

    #[test]
    fn test_memory_access() {
        let config = SecurePoolConfig::small_secure();
        let pool = SecureMemoryPool::new(config).unwrap();

        let mut ptr = pool.allocate().unwrap();

        // Test slice access
        let slice = ptr.as_mut_slice();
        slice[0] = 42;
        slice[1023] = 84;

        let slice = ptr.as_slice();
        assert_eq!(slice[0], 42);
        assert_eq!(slice[1023], 84);
        assert_eq!(slice.len(), 1024);
    }

    #[test]
    fn test_simd_configuration() {
        // Test SIMD defaults
        let config = SecurePoolConfig::small_secure();
        assert_eq!(config.enable_simd_ops, true);
        assert_eq!(config.simd_threshold, 64);

        // Test SIMD builder methods
        let config_disabled = SecurePoolConfig::small_secure()
            .with_simd_ops(false)
            .with_simd_threshold(128);
        assert_eq!(config_disabled.enable_simd_ops, false);
        assert_eq!(config_disabled.simd_threshold, 128);

        // Test pool creation with SIMD config
        let pool = SecureMemoryPool::new(config_disabled).unwrap();
        assert_eq!(pool.config().enable_simd_ops, false);
        assert_eq!(pool.config().simd_threshold, 128);

        // Test that allocation/deallocation works with SIMD disabled
        let ptr = pool.allocate().unwrap();
        assert!(!ptr.as_ptr().is_null());
        // ptr automatically deallocated on drop with SIMD config
    }

    #[test]
    fn test_simd_with_large_chunks() {
        // Test with large chunks that should benefit from SIMD
        let config = SecurePoolConfig::large_secure()
            .with_simd_ops(true)
            .with_simd_threshold(64);
        let pool = SecureMemoryPool::new(config).unwrap();

        let ptr = pool.allocate().unwrap();
        assert_eq!(ptr.size(), 1024 * 1024); // 1MB chunks
        assert!(!ptr.as_ptr().is_null());

        // Verify configuration
        assert!(pool.config().enable_simd_ops);
        assert_eq!(pool.config().simd_threshold, 64);
        
        // Large chunks should be SIMD-optimized since 1MB >> 64 bytes
        // This is tested implicitly through the deallocation process
    }

    #[test]
    fn test_simd_memory_zeroing() {
        let config = SecurePoolConfig::small_secure()
            .with_zero_on_alloc(true)
            .with_simd_ops(true)
            .with_simd_threshold(64);
        let pool = SecureMemoryPool::new(config).unwrap();

        // Allocate and verify SIMD zeroing
        let ptr = pool.allocate().unwrap();
        // SAFETY: test code, pointer valid from allocation
        let slice = unsafe {
            std::slice::from_raw_parts(ptr.as_ptr(), ptr.size())
        };

        // All bytes should be zero
        assert!(slice.iter().all(|&b| b == 0));

        // Verify using SIMD verification
        assert!(pool.verify_zeroed_simd(ptr.as_slice()).unwrap());
    }

    #[test]
    fn test_simd_verification() {
        let config = SecurePoolConfig::medium_secure()
            .with_simd_ops(true);
        let pool = SecureMemoryPool::new(config).unwrap();

        let ptr = pool.allocate().unwrap();

        // SAFETY: test code, pointer valid from allocation
        unsafe { std::ptr::write_bytes(ptr.as_ptr(), 0, ptr.size()); }

        // Verify using SIMD
        assert!(pool.verify_zeroed_simd(ptr.as_slice()).unwrap());

        // SAFETY: test code, pointer valid from allocation
        unsafe { *ptr.as_ptr() = 1; }
        assert!(!pool.verify_zeroed_simd(ptr.as_slice()).unwrap());
    }

    #[test]
    fn test_bulk_allocation_with_prefetch() {
        let pool = SecureMemoryPool::new(SecurePoolConfig::small_secure()).unwrap();

        let sizes = vec![1024; 20]; // 20 allocations of 1KB each
        let ptrs = pool.allocate_bulk_with_prefetch(&sizes).unwrap();

        assert_eq!(ptrs.len(), sizes.len());
        for (ptr, &size) in ptrs.iter().zip(&sizes) {
            assert!(!ptr.as_ptr().is_null());
            assert_eq!(ptr.size(), size);
        }

        // Verify all pointers are unique
        let mut addresses: std::collections::HashSet<usize> = std::collections::HashSet::new();
        for ptr in &ptrs {
            let addr = ptr.as_ptr() as usize;
            assert!(addresses.insert(addr), "Duplicate pointer detected");
        }
    }

    #[test]
    fn test_bulk_allocation_size_mismatch() {
        let pool = SecureMemoryPool::new(SecurePoolConfig::small_secure()).unwrap();

        // Try to allocate with wrong sizes
        let sizes = vec![512, 1024, 2048]; // Sizes don't match chunk_size
        let result = pool.allocate_bulk_with_prefetch(&sizes);

        // Should fail for sizes that don't match chunk_size
        assert!(result.is_err());
    }

    #[test]
    fn test_cache_optimized_operations() {
        let config = SecurePoolConfig::large_secure()
            .with_simd_ops(true)
            .with_simd_threshold(64)
            .with_cache_alignment(true)
            .with_zero_on_alloc(true);
        let pool = SecureMemoryPool::new(config).unwrap();

        // Test various sizes to trigger different SIMD paths
        for _ in 0..10 {
            let ptr = pool.allocate().unwrap();
            assert!(!ptr.as_ptr().is_null());

            // SAFETY: test code, pointer valid from allocation
            let slice = unsafe {
                std::slice::from_raw_parts(ptr.as_ptr(), ptr.size())
            };
            assert!(slice.iter().all(|&b| b == 0));
        }

        let stats = pool.stats();
        assert!(stats.alloc_count >= 10);
    }

    #[test]
    fn test_simd_with_small_threshold() {
        // Test with a very high threshold that forces scalar operations
        let config = SecurePoolConfig::small_secure()
            .with_simd_ops(true)
            .with_simd_threshold(1024 * 1024) // 1MB threshold for 1KB chunks
            .with_zero_on_alloc(true);
        let pool = SecureMemoryPool::new(config).unwrap();

        let ptr = pool.allocate().unwrap();

        // Should still be zeroed, just using scalar path
        // SAFETY: Hardware features verified or caller ensures safety invariants
        let slice = unsafe {
            std::slice::from_raw_parts(ptr.as_ptr(), ptr.size())
        };
        assert!(slice.iter().all(|&b| b == 0));
    }

    #[test]
    fn test_verify_zeroed_empty_slice() {
        let pool = SecureMemoryPool::new(SecurePoolConfig::small_secure()).unwrap();

        // Empty slice should verify as zeroed
        let result = pool.verify_zeroed_simd(&[]);
        assert!(result.is_ok());
        assert!(result.unwrap());
    }

    #[test]
    fn test_zero_on_alloc_disabled() {
        let config = SecurePoolConfig::small_secure()
            .with_zero_on_alloc(false)
            .with_simd_ops(true);
        let pool = SecureMemoryPool::new(config).unwrap();

        let ptr = pool.allocate().unwrap();

        // Memory may contain garbage when zero_on_alloc is disabled
        // Just verify the allocation succeeded
        assert!(!ptr.as_ptr().is_null());
        assert_eq!(ptr.size(), 1024);
    }

    #[test]
    fn test_adaptive_simd_integration() {
        let config = SecurePoolConfig::large_secure()
            .with_simd_ops(true)
            .with_zero_on_alloc(true);
        let pool = SecureMemoryPool::new(config).unwrap();

        // Allocate multiple times to trigger adaptive monitoring
        for _ in 0..100 {
            let ptr = pool.allocate().unwrap();
            assert!(!ptr.as_ptr().is_null());

            // Verify using SIMD
            assert!(pool.verify_zeroed_simd(ptr.as_slice()).unwrap());

            // Drop happens automatically
        }

        // Verify stats
        let stats = pool.stats();
        assert_eq!(stats.alloc_count, 100);
        assert_eq!(stats.dealloc_count, 100);
    }

    #[test]
    fn test_concurrent_simd_operations() {
        use std::sync::atomic::AtomicUsize;
        use std::thread;

        let config = SecurePoolConfig::medium_secure()
            .with_simd_ops(true)
            .with_zero_on_alloc(true);
        let pool = SecureMemoryPool::new(config).unwrap();
        let verified_count = Arc::new(AtomicUsize::new(0));

        let handles: Vec<_> = (0..4)
            .map(|_| {
                let pool = pool.clone();
                let count = verified_count.clone();
                thread::spawn(move || {
                    for _ in 0..25 {
                        let ptr = pool.allocate().unwrap();

                        // Verify zeroing
                        if pool.verify_zeroed_simd(ptr.as_slice()).unwrap() {
                            count.fetch_add(1, Ordering::Relaxed);
                        }
                    }
                })
            })
            .collect();

        for handle in handles {
            handle.join().unwrap();
        }

        // All 100 allocations should be verified as zeroed
        assert_eq!(verified_count.load(Ordering::Relaxed), 100);

        let stats = pool.stats();
        assert_eq!(stats.alloc_count, 100);
        assert_eq!(stats.dealloc_count, 100);
    }
}