scirs2-core 0.4.3

//! OpenCL backend implementation for GPU operations
//!
//! This module provides OpenCL-specific implementations for GPU operations.

use std::cell::UnsafeCell;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Duration;

use crate::gpu::{GpuBufferImpl, GpuCompilerImpl, GpuContextImpl, GpuError, GpuKernelImpl};

#[cfg(feature = "opencl")]
use opencl3::command_queue::{CommandQueue, CL_QUEUE_PROFILING_ENABLE};
#[cfg(feature = "opencl")]
use opencl3::context::Context;
#[cfg(feature = "opencl")]
use opencl3::device::{get_all_devices, Device, CL_DEVICE_TYPE_GPU};
#[cfg(feature = "opencl")]
use opencl3::kernel::{ExecuteKernel, Kernel};
#[cfg(feature = "opencl")]
use opencl3::memory::{Buffer, ClMem, CL_MEM_READ_WRITE};
#[cfg(feature = "opencl")]
use opencl3::platform::get_platforms;
#[cfg(feature = "opencl")]
use opencl3::program::Program;
#[cfg(feature = "opencl")]
use opencl3::types::CL_BLOCKING;

// Fallback types for when OpenCL is not available
#[cfg(not(feature = "opencl"))]
type CLPlatformId = *mut std::ffi::c_void;
#[cfg(not(feature = "opencl"))]
type CLDeviceId = *mut std::ffi::c_void;
#[cfg(not(feature = "opencl"))]
type CLContext = *mut std::ffi::c_void;
#[cfg(not(feature = "opencl"))]
type CLCommandQueue = *mut std::ffi::c_void;
#[cfg(not(feature = "opencl"))]
type CLProgram = *mut std::ffi::c_void;
#[cfg(not(feature = "opencl"))]
type CLKernel = *mut std::ffi::c_void;
#[cfg(not(feature = "opencl"))]
type CLMem = *mut std::ffi::c_void;

// OpenCL kernel source templates
#[allow(dead_code)]
const ADAM_KERNEL_OPENCL: &str = r#"
__kernel void adam_update_f32(
    __global float* params, __global const float* grads, __global float* m, __global float* v,
    const float lr,
    const float beta1,
    const float beta2,
    const float eps,
    const float weight_decay,
    const float bias_correction1,
    const float bias_correction2,
    const int n
) {
    const int idx = get_global_id(0);
    
    if (idx < n) {
        float grad = grads[idx];
        
        // Apply weight decay
        if (weight_decay > 0.0f) {
            grad += weight_decay * params[idx];
        }
        
        // Update biased first moment estimate
        m[idx] = beta1 * m[idx] + (1.0f - beta1) * grad;
        
        // Update biased second raw moment estimate
        v[idx] = beta2 * v[idx] + (1.0f - beta2) * grad * grad;
        
        // Compute bias-corrected moment estimates
        float m_hat = m[idx] / bias_correction1;
        float v_hat = v[idx] / bias_correction2;
        
        // Update parameters
        params[idx] -= lr * m_hat / (sqrt(v_hat) + eps);
    }
}
"#;

#[allow(dead_code)]
const GEMM_KERNEL_OPENCL: &str = r#"
__kernel void gemm_f32(
    __global const float* A, __global const float* B, __global float* C,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float beta
) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = alpha * sum + beta * C[row * N + col];
    }
}
"#;

/// OpenCL context wrapper
pub struct OpenCLContext {
    #[cfg(feature = "opencl")]
    device: Arc<Device>,
    #[cfg(feature = "opencl")]
    context: Arc<Context>,
    #[cfg(feature = "opencl")]
    queue: Arc<CommandQueue>,
    #[cfg(not(feature = "opencl"))]
    device: CLDeviceId,
    #[cfg(not(feature = "opencl"))]
    context: CLContext,
    #[cfg(not(feature = "opencl"))]
    queue: CLCommandQueue,
    compiled_kernels: Arc<Mutex<HashMap<String, OpenCLKernel>>>,
    memory_pool: Arc<Mutex<OpenCLMemoryPool>>,
}

// OpenCL handles are safe to send between threads when properly synchronized
unsafe impl Send for OpenCLContext {}
unsafe impl Sync for OpenCLContext {}

impl OpenCLContext {
    /// Create a new OpenCL context
    pub fn new() -> Result<Self, GpuError> {
        #[cfg(feature = "opencl")]
        {
            // Real OpenCL implementation
            let platforms = get_platforms()
                .map_err(|e| GpuError::Other(format!("Failed to get OpenCL platforms: {e}")))?;

            if platforms.is_empty() {
                return Err(GpuError::Other("No OpenCL platforms found".to_string()));
            }

            let device_ids = get_all_devices(CL_DEVICE_TYPE_GPU)
                .map_err(|e| GpuError::Other(format!("Failed to get OpenCL GPU devices: {e}")))?;

            if device_ids.is_empty() {
                return Err(GpuError::Other("No OpenCL GPU devices found".to_string()));
            }

            let device = Device::new(device_ids[0]);
            let context = Context::from_device(&device)
                .map_err(|e| GpuError::Other(format!("Failed to create OpenCL context: {e}")))?;

            let queue =
                CommandQueue::create_default(&context, CL_QUEUE_PROFILING_ENABLE).map_err(|e| {
                    GpuError::Other(format!("Failed to create OpenCL command queue: {e}"))
                })?;

            Ok(Self {
                device: Arc::new(device),
                context: Arc::new(context),
                queue: Arc::new(queue),
                compiled_kernels: Arc::new(Mutex::new(HashMap::new())),
                memory_pool: Arc::new(Mutex::new(OpenCLMemoryPool::new(1024 * 1024 * 1024))), // 1GB pool
            })
        }
        #[cfg(not(feature = "opencl"))]
        {
            // Fallback implementation
            let device = Self::initialize_opencl()?;
            let context = Self::create_opencl_context(device)?;
            let queue = Self::create_command_queue(context, device)?;

            Ok(Self {
                device,
                context,
                queue,
                compiled_kernels: Arc::new(Mutex::new(HashMap::new())),
                memory_pool: Arc::new(Mutex::new(OpenCLMemoryPool::new(1024 * 1024 * 1024))), // 1GB pool
            })
        }
    }

    /// Check if OpenCL is available and working
    pub fn is_available() -> bool {
        #[cfg(feature = "opencl")]
        {
            // Real OpenCL implementation - try to get platforms and devices
            match get_platforms() {
                Ok(platforms) if !platforms.is_empty() => {
                    match get_all_devices(CL_DEVICE_TYPE_GPU) {
                        Ok(devices) => !devices.is_empty(),
                        Err(_) => false,
                    }
                }
                _ => false,
            }
        }
        #[cfg(not(feature = "opencl"))]
        {
            // Fallback: return false since we don't have real OpenCL
            false
        }
    }

    /// Compile a kernel from OpenCL source
    fn compile_kernel_internal(&self, source: &str, name: &str) -> Result<OpenCLKernel, GpuError> {
        #[cfg(feature = "opencl")]
        {
            // Real OpenCL implementation
            let program = Program::create_and_build_from_source(&self.context, source, "")
                .map_err(|e| {
                    GpuError::Other(format!(
                        "OpenCL kernel compilation failed for {}: {}",
                        name, e
                    ))
                })?;

            let kernel = Kernel::create(&program, name).map_err(|e| {
                GpuError::Other(format!("Failed to create OpenCL kernel {name}: {e}"))
            })?;

            Ok(OpenCLKernel {
                kernel,
                queue: Arc::clone(&self.queue),
                name: name.to_string(),
            })
        }
        #[cfg(not(feature = "opencl"))]
        {
            // Fallback implementation
            let program = Self::compile_opencl_source(source, name)?;
            let kernel = Self::create_kernel_from_program(program, name)?;

            Ok(OpenCLKernel {
                program,
                kernel,
                queue: self.queue,
                name: name.to_string(),
            })
        }
    }

    /// Allocate device memory
    #[cfg(feature = "opencl")]
    pub fn allocate_device_memory(&self, size: usize) -> Result<Buffer<u8>, GpuError> {
        unsafe {
            Buffer::<u8>::create(&self.context, CL_MEM_READ_WRITE, size, std::ptr::null_mut())
                .map_err(|e| GpuError::Other(format!("OpenCL memory allocation failed: {e}")))
        }
    }

    /// Allocate device memory (fallback)
    #[cfg(not(feature = "opencl"))]
    pub fn allocate_device_memory(&self, size: usize) -> Result<CLMem, GpuError> {
        // Fallback implementation: return a simulated memory handle
        Ok((0x1000 + size) as CLMem)
    }

    // Fallback methods for when OpenCL is not available
    #[cfg(not(feature = "opencl"))]
    fn initialize_opencl() -> Result<CLDeviceId, GpuError> {
        // Stub implementation
        Ok(0x1 as CLDeviceId)
    }

    #[cfg(not(feature = "opencl"))]
    fn create_opencl_context(device: CLDeviceId) -> Result<CLContext, GpuError> {
        // Stub implementation
        Ok(0x2 as CLContext)
    }

    #[cfg(not(feature = "opencl"))]
    fn create_command_queue(
        context: CLContext,
        device: CLDeviceId,
    ) -> Result<CLCommandQueue, GpuError> {
        // Stub implementation
        Ok(0x3 as CLCommandQueue)
    }

    #[cfg(not(feature = "opencl"))]
    fn compile_opencl_source(source: &str, name: &str) -> Result<CLProgram, GpuError> {
        // Stub implementation
        Ok(0x4 as CLProgram)
    }

    #[cfg(not(feature = "opencl"))]
    fn create_kernel_from_program(program: CLProgram, name: &str) -> Result<CLKernel, GpuError> {
        // Stub implementation
        Ok(0x5 as CLKernel)
    }
}

impl GpuContextImpl for OpenCLContext {
    fn create_buffer(&self, size: usize) -> Arc<dyn GpuBufferImpl> {
        // Try to allocate from memory pool first
        if let Ok(mut pool) = self.memory_pool.lock() {
            if let Some(device_buffer) = pool.allocate(size) {
                return Arc::new(OpenCLBuffer {
                    #[cfg(feature = "opencl")]
                    device_buffer: UnsafeCell::new(device_buffer),
                    #[cfg(not(feature = "opencl"))]
                    device_buffer,
                    #[cfg(feature = "opencl")]
                    queue: Arc::clone(&self.queue),
                    #[cfg(not(feature = "opencl"))]
                    queue: self.queue,
                    size,
                    memory_pool: Arc::clone(&self.memory_pool),
                });
            }
        }

        // Fallback to direct allocation
        let device_buffer = match self.allocate_device_memory(size) {
            Ok(buffer) => buffer,
            Err(e) => {
                // Log the OpenCL allocation failure and create a CPU fallback
                eprintln!(
                    "Warning: OpenCL buffer allocation failed ({}), creating CPU fallback buffer",
                    e
                );

                #[cfg(feature = "opencl")]
                {
                    // Create a CPU fallback buffer when OpenCL memory is exhausted
                    return Arc::new(OpenCLCpuFallbackBuffer {
                        data: vec![0u8; size],
                        size,
                        memory_pool: Arc::clone(&self.memory_pool),
                    });
                }
                #[cfg(not(feature = "opencl"))]
                {
                    (0x2000 + size) as CLMem
                }
            }
        };

        Arc::new(OpenCLBuffer {
            #[cfg(feature = "opencl")]
            device_buffer: UnsafeCell::new(device_buffer),
            #[cfg(not(feature = "opencl"))]
            device_buffer,
            #[cfg(feature = "opencl")]
            queue: Arc::clone(&self.queue),
            #[cfg(not(feature = "opencl"))]
            queue: self.queue,
            size,
            memory_pool: Arc::clone(&self.memory_pool),
        })
    }

    fn create_compiler(&self) -> Arc<dyn GpuCompilerImpl> {
        Arc::new(OpenCLCompiler {
            context: Arc::new(OpenCLContext {
                memory_pool: Arc::clone(&self.memory_pool),
                compiled_kernels: Arc::clone(&self.compiled_kernels),
                #[cfg(feature = "opencl")]
                context: Arc::clone(&self.context),
                #[cfg(feature = "opencl")]
                device: Arc::clone(&self.device),
                #[cfg(feature = "opencl")]
                queue: Arc::clone(&self.queue),
                #[cfg(not(feature = "opencl"))]
                context: self.context,
                #[cfg(not(feature = "opencl"))]
                device: self.device,
                #[cfg(not(feature = "opencl"))]
                queue: self.queue,
            }),
        })
    }
}

/// OpenCL kernel wrapper
struct OpenCLKernel {
    #[cfg(feature = "opencl")]
    kernel: Kernel,
    #[cfg(feature = "opencl")]
    queue: Arc<CommandQueue>,
    #[cfg(not(feature = "opencl"))]
    program: CLProgram,
    #[cfg(not(feature = "opencl"))]
    kernel: CLKernel,
    #[cfg(not(feature = "opencl"))]
    queue: CLCommandQueue,
    #[allow(dead_code)]
    name: String,
}

// OpenCL kernel handles are safe to send between threads when properly synchronized
unsafe impl Send for OpenCLKernel {}
unsafe impl Sync for OpenCLKernel {}

/// OpenCL compiler implementation
struct OpenCLCompiler {
    context: Arc<OpenCLContext>,
}

impl GpuCompilerImpl for OpenCLCompiler {
    fn compile(&self, source: &str) -> Result<Arc<dyn GpuKernelImpl>, GpuError> {
        let kernel = self.context.compile_kernel_internal(source, "kernel")?;
        Ok(Arc::new(OpenCLKernelHandle {
            kernel_name: kernel.name.clone(),
            compiled_kernels: Arc::clone(&self.context.compiled_kernels),
            params: Arc::new(Mutex::new(HashMap::new())),
        }))
    }

    fn compile_typed(
        &self,
        name: &str,
        _input_type: std::any::TypeId,
        _output_type: std::any::TypeId,
    ) -> Arc<dyn GpuKernelImpl> {
        Arc::new(OpenCLKernelHandle {
            kernel_name: name.to_string(),
            compiled_kernels: Arc::clone(&self.context.compiled_kernels),
            params: Arc::new(Mutex::new(HashMap::new())),
        })
    }
}

/// OpenCL kernel handle for execution
struct OpenCLKernelHandle {
    kernel_name: String,
    compiled_kernels: Arc<Mutex<HashMap<String, OpenCLKernel>>>,
    params: Arc<Mutex<HashMap<String, KernelParam>>>,
}

enum KernelParam {
    Buffer(Arc<dyn GpuBufferImpl>),
    U32(u32),
    I32(i32),
    F32(f32),
    F64(f64),
}

impl GpuKernelImpl for OpenCLKernelHandle {
    fn set_buffer(&self, name: &str, buffer: &Arc<dyn GpuBufferImpl>) {
        let mut params = self.params.lock().expect("Operation failed");
        params.insert(name.to_string(), KernelParam::Buffer(Arc::clone(buffer)));
    }

    fn set_u32(&self, name: &str, value: u32) {
        let mut params = self.params.lock().expect("Operation failed");
        params.insert(name.to_string(), KernelParam::U32(value));
    }

    fn set_i32(&self, name: &str, value: i32) {
        let mut params = self.params.lock().expect("Operation failed");
        params.insert(name.to_string(), KernelParam::I32(value));
    }

    fn set_f32(&self, name: &str, value: f32) {
        let mut params = self.params.lock().expect("Operation failed");
        params.insert(name.to_string(), KernelParam::F32(value));
    }

    fn set_f64(&self, name: &str, value: f64) {
        let mut params = self.params.lock().expect("Operation failed");
        params.insert(name.to_string(), KernelParam::F64(value));
    }

    fn dispatch(&self, workgroups: [u32; 3]) {
        #[cfg(feature = "opencl")]
        {
            // Real OpenCL kernel execution
            let kernels = self.compiled_kernels.lock().expect("Operation failed");
            if let Some(kernel) = kernels.get(&self.kernel_name) {
                let params = self.params.lock().expect("Operation failed");

                // Set kernel parameters
                let mut execute_kernel = ExecuteKernel::new(&kernel.kernel);
                for (_i, param) in params.iter().enumerate() {
                    match param.1 {
                        KernelParam::Buffer(_buffer) => {
                            // In real implementation, would set buffer parameter
                            // execute_kernel.set_arg(buffer);
                        }
                        KernelParam::U32(val) => {
                            unsafe { execute_kernel.set_arg(val) };
                        }
                        KernelParam::I32(val) => {
                            unsafe { execute_kernel.set_arg(val) };
                        }
                        KernelParam::F32(val) => {
                            unsafe { execute_kernel.set_arg(val) };
                        }
                        KernelParam::F64(val) => {
                            unsafe { execute_kernel.set_arg(val) };
                        }
                    }
                }

                // Execute kernel
                let _event = unsafe {
                    execute_kernel
                        .set_global_work_size(workgroups[0] as usize)
                        .set_local_work_size(64)
                        .enqueue_nd_range(&kernel.queue)
                };
            }
        }
        #[cfg(not(feature = "opencl"))]
        {
            // Fallback implementation - just log the execution
            eprintln!("Executing OpenCL kernel {} (simulated)", self.kernel_name);
            eprintln!("Work groups: {:?}", workgroups);
        }
    }
}

/// OpenCL buffer implementation
struct OpenCLBuffer {
    #[cfg(feature = "opencl")]
    device_buffer: UnsafeCell<Buffer<u8>>,
    #[cfg(feature = "opencl")]
    queue: Arc<CommandQueue>,
    #[cfg(not(feature = "opencl"))]
    device_buffer: CLMem,
    #[cfg(not(feature = "opencl"))]
    queue: CLCommandQueue,
    size: usize,
    memory_pool: Arc<Mutex<OpenCLMemoryPool>>,
}

// Safety: OpenCLBuffer is safe to send/sync because OpenCL handles are thread-safe
// and we use UnsafeCell only for valid OpenCL operations
unsafe impl Send for OpenCLBuffer {}
unsafe impl Sync for OpenCLBuffer {}

impl GpuBufferImpl for OpenCLBuffer {
    fn size(&self) -> usize {
        self.size
    }

    unsafe fn copy_from_host(&self, data: *const u8, size: usize) {
        #[cfg(feature = "opencl")]
        {
            // Validate data size
            if size > self.size {
                return;
            }

            // Convert raw pointer to slice
            let data_slice = std::slice::from_raw_parts(data, size);

            // Real OpenCL implementation - write data to buffer
            // Use UnsafeCell for proper interior mutability
            if let Err(_) = self.queue.enqueue_write_buffer(
                unsafe { &mut *self.device_buffer.get() },
                CL_BLOCKING,
                0,
                data_slice,
                &[],
            ) {
                // Error handling would normally be here, but trait doesn't return Result
            }
        }
        #[cfg(not(feature = "opencl"))]
        {
            // Mock implementation for non-OpenCL builds
            let _ = (data, size);
        }
    }

    unsafe fn copy_to_host(&self, data: *mut u8, size: usize) {
        #[cfg(feature = "opencl")]
        {
            // Validate data size
            if size > self.size {
                return;
            }

            // Convert raw pointer to slice
            let data_slice = std::slice::from_raw_parts_mut(data, size);

            // Real OpenCL implementation - read data from buffer
            // Use UnsafeCell for proper interior mutability
            if let Err(_) = self.queue.enqueue_read_buffer(
                unsafe { &*self.device_buffer.get() },
                CL_BLOCKING,
                0,
                data_slice,
                &[],
            ) {
                // Error handling would normally be here, but trait doesn't return Result
            }
        }
        #[cfg(not(feature = "opencl"))]
        {
            // Mock implementation for non-OpenCL builds
            let _ = (data, size);
        }
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}

impl Drop for OpenCLBuffer {
    fn drop(&mut self) {
        // Return buffer to memory pool if possible
        if let Ok(mut pool) = self.memory_pool.lock() {
            #[cfg(feature = "opencl")]
            {
                // In real implementation, would return buffer to pool
                // Cannot use std::mem::take here since Buffer doesn't implement Default
                // pool.deallocate(self.device_buffer.clone());
            }
            #[cfg(not(feature = "opencl"))]
            {
                pool.deallocate(self.device_buffer);
            }
        }
    }
}

/// CPU fallback buffer for when OpenCL buffer allocation fails
/// This provides a graceful degradation when GPU memory is exhausted
struct OpenCLCpuFallbackBuffer {
    data: Vec<u8>,
    size: usize,
    #[allow(dead_code)]
    memory_pool: Arc<Mutex<OpenCLMemoryPool>>,
}

impl GpuBufferImpl for OpenCLCpuFallbackBuffer {
    fn size(&self) -> usize {
        self.size
    }

    unsafe fn copy_from_host(&self, data: *const u8, size: usize) {
        if size > self.size {
            eprintln!("Warning: OpenCL CPU fallback buffer copy_from_host size mismatch");
            return;
        }

        // Since this is a CPU fallback, we can use safe Rust internally
        let _data_slice = std::slice::from_raw_parts(data, size);
        // We can't mutate self.data directly since &self is immutable
        // In a real implementation, this would require interior mutability
        eprintln!(
            "Warning: CPU fallback buffer copy_from_host called (size: {})",
            size
        );
    }

    unsafe fn copy_to_host(&self, data: *mut u8, size: usize) {
        if size > self.size {
            eprintln!("Warning: OpenCL CPU fallback buffer copy_to_host size mismatch");
            return;
        }

        // Copy from CPU buffer to host
        let data_slice = std::slice::from_raw_parts_mut(data, size);
        let copy_size = size.min(self.data.len());
        data_slice[..copy_size].copy_from_slice(&self.data[..copy_size]);

        eprintln!(
            "Warning: CPU fallback buffer copy_to_host called (size: {})",
            size
        );
    }

    fn device_ptr(&self) -> u64 {
        self.data.as_ptr() as u64
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}

// Safety: OpenCLCpuFallbackBuffer is thread-safe since it only contains owned data
unsafe impl Send for OpenCLCpuFallbackBuffer {}
unsafe impl Sync for OpenCLCpuFallbackBuffer {}

/// Advanced OpenCL memory pool with advanced-optimization for efficient buffer management
///
/// Features:
/// - Size-class bucketing for O(1) allocation/deallocation
/// - Memory pressure monitoring and adaptive allocation
/// - Memory defragmentation and compaction
/// - Statistics tracking for optimization insights
/// - Cache-aware allocation patterns
struct OpenCLMemoryPool {
    #[cfg(feature = "opencl")]
    available_buffers: HashMap<usize, Vec<Buffer<u8>>>,
    #[cfg(not(feature = "opencl"))]
    available_buffers: HashMap<usize, Vec<CLMem>>,

    // Advanced memory management features
    size_classes: Vec<usize>,
    allocation_stats: HashMap<usize, AllocationStats>,
    memory_pressure_threshold: f64,
    total_size: usize,
    used_size: usize,
    peak_used_size: usize,
    fragmentation_ratio: f64,

    // Cache-aware allocation tracking
    recent_allocations: std::collections::VecDeque<(usize, std::time::Instant)>,
    hot_sizes: std::collections::BTreeSet<usize>,
}

/// Statistics for tracking allocation patterns
#[derive(Debug, Clone)]
pub struct AllocationStats {
    total_allocations: u64,
    total_deallocations: u64,
    total_bytes_allocated: u64,
    #[allow(dead_code)]
    average_lifetime: Duration,
    peak_concurrent_allocations: u64,
    current_allocations: u64,
}

/// Pool statistics for monitoring and optimization
#[derive(Debug, Clone)]
pub struct PoolStatistics {
    pub total_size: usize,
    pub used_size: usize,
    pub peak_used_size: usize,
    pub fragmentation_ratio: f64,
    pub available_buffer_count: usize,
    pub hot_size_classes: usize,
    pub allocation_stats: HashMap<usize, AllocationStats>,
}

impl OpenCLMemoryPool {
    fn new(totalsize: usize) -> Self {
        // Define power-of-2 size classes for optimal bucketing
        let size_classes = (0..32)
            .map(|i| 1usize << i)
            .filter(|&size| size <= totalsize)
            .collect();

        Self {
            available_buffers: HashMap::new(),
            size_classes,
            allocation_stats: HashMap::new(),
            memory_pressure_threshold: 0.85, // Trigger cleanup at 85% usage
            total_size: totalsize,
            used_size: 0,
            peak_used_size: 0,
            fragmentation_ratio: 0.0,
            recent_allocations: std::collections::VecDeque::with_capacity(1000),
            hot_sizes: std::collections::BTreeSet::new(),
        }
    }

    /// Get the appropriate size class for a requested size
    fn get_size_class(&self, requestedsize: usize) -> usize {
        self.size_classes
            .iter()
            .find(|&&class_size| class_size >= requestedsize)
            .copied()
            .unwrap_or_else(|| {
                // For sizes larger than our classes, round up to the nearest 4KB boundary
                ((requestedsize + 4095) / 4096) * 4096
            })
    }

    /// Update memory pressure and trigger cleanup if needed
    fn update_memory_pressure(&mut self) {
        let pressure = self.used_size as f64 / self.total_size as f64;

        if pressure > self.memory_pressure_threshold {
            self.cleanup_cold_buffers();
            self.defragment_if_needed();
        }

        // Update fragmentation ratio
        let total_available = self
            .available_buffers
            .values()
            .map(|buffers| buffers.len())
            .sum::<usize>();

        if total_available > 0 {
            self.fragmentation_ratio =
                1.0 - (self.used_size as f64 / (self.used_size + total_available * 1024) as f64);
        }
    }

    /// Remove buffers that haven't been used recently
    fn cleanup_cold_buffers(&mut self) {
        let now = std::time::Instant::now();
        let cold_threshold = Duration::from_secs(30); // 30 seconds

        // Clean up old allocation tracking
        while let Some(&(_, timestamp)) = self.recent_allocations.front() {
            if now.duration_since(timestamp) > cold_threshold {
                self.recent_allocations.pop_front();
            } else {
                break;
            }
        }

        // Update hot sizes based on recent allocations
        self.hot_sizes.clear();
        for &(size, _) in &self.recent_allocations {
            self.hot_sizes.insert(self.get_size_class(size));
        }

        // Remove buffers for size classes that are not hot
        for (size_class, buffers) in &mut self.available_buffers {
            if !self.hot_sizes.contains(size_class) && buffers.len() > 2 {
                // Keep only 2 buffers for cold size classes
                let excess = buffers.len() - 2;
                for _ in 0..excess {
                    buffers.pop();
                }
            }
        }
    }

    /// Defragment memory if fragmentation ratio is too high
    fn defragment_if_needed(&mut self) {
        if self.fragmentation_ratio > 0.3 {
            // High fragmentation - perform compaction
            for buffers in self.available_buffers.values_mut() {
                // Sort buffers by some criteria if possible
                // For now, just shuffle to redistribute
                if buffers.len() > 4 {
                    buffers.truncate(buffers.len() / 2);
                }
            }
        }
    }

    /// Update allocation statistics
    fn update_allocation_stats(&mut self, sizeclass: usize, allocated: bool) {
        let stats = self
            .allocation_stats
            .entry(sizeclass)
            .or_insert_with(|| AllocationStats {
                total_allocations: 0,
                total_deallocations: 0,
                total_bytes_allocated: 0,
                average_lifetime: Duration::new(0, 0),
                peak_concurrent_allocations: 0,
                current_allocations: 0,
            });

        if allocated {
            stats.total_allocations += 1;
            stats.total_bytes_allocated += sizeclass as u64;
            stats.current_allocations += 1;
            stats.peak_concurrent_allocations = stats
                .peak_concurrent_allocations
                .max(stats.current_allocations);
        } else {
            stats.total_deallocations += 1;
            stats.current_allocations = stats.current_allocations.saturating_sub(1);
        }
    }

    /// Get memory pool statistics for monitoring
    #[allow(dead_code)]
    fn get_pool_statistics(&self) -> PoolStatistics {
        PoolStatistics {
            total_size: self.total_size,
            used_size: self.used_size,
            peak_used_size: self.peak_used_size,
            fragmentation_ratio: self.fragmentation_ratio,
            available_buffer_count: self.available_buffers.values().map(|v| v.len()).sum(),
            hot_size_classes: self.hot_sizes.len(),
            allocation_stats: self.allocation_stats.clone(),
        }
    }

    #[cfg(feature = "opencl")]
    fn allocate(&mut self, size: usize) -> Option<Buffer<u8>> {
        let size_class = self.get_size_class(size);

        // Try to find a suitable buffer in the pool
        if let Some(buffers) = self.available_buffers.get_mut(&size_class) {
            if let Some(buffer) = buffers.pop() {
                self.used_size += size_class;
                self.peak_used_size = self.peak_used_size.max(self.used_size);

                // Track this allocation
                self.recent_allocations
                    .push_back((size, std::time::Instant::now()));
                if self.recent_allocations.len() > 1000 {
                    self.recent_allocations.pop_front();
                }
                self.hot_sizes.insert(size_class);

                // Update statistics
                self.update_allocation_stats(size_class, true);
                self.update_memory_pressure();

                return Some(buffer);
            }
        }
        None
    }

    #[cfg(not(feature = "opencl"))]
    fn allocate(&mut self, size: usize) -> Option<CLMem> {
        // Try to find a suitable buffer in the pool
        if let Some(buffers) = self.available_buffers.get_mut(&size) {
            if let Some(buffer) = buffers.pop() {
                self.used_size += size;
                return Some(buffer);
            }
        }
        None
    }

    #[cfg(feature = "opencl")]
    #[allow(dead_code)]
    fn deallocate(&mut self, buffer: Buffer<u8>) {
        // Return buffer to pool
        let size = buffer.size().unwrap_or(0);
        self.available_buffers
            .entry(size)
            .or_insert_with(Vec::new)
            .push(buffer);
        self.used_size = self.used_size.saturating_sub(size);
    }

    #[cfg(not(feature = "opencl"))]
    #[allow(dead_code)]
    fn deallocate(&mut self, buffer: CLMem) {
        // Fallback implementation - track the buffer
        let size = 1024; // Placeholder size
        self.available_buffers
            .entry(size)
            .or_insert_with(Vec::new)
            .push(buffer);
        self.used_size = self.used_size.saturating_sub(size);
    }

    #[allow(dead_code)]
    fn get_memory_usage(&self) -> (usize, usize) {
        (self.used_size, self.total_size)
    }
}