scirs2-fft 0.4.2

Fast Fourier Transform module for SciRS2 (scirs2-fft)
//! Memory management for GPU-accelerated sparse FFT
//!
//! This module provides memory management utilities for GPU-accelerated sparse FFT
//! implementations, including buffer allocation, reuse, and transfer optimization.
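//!
//! # Example
//!
//! A minimal sketch of the intended usage (the module paths shown here are an
//! assumption; without a GPU feature enabled, allocation transparently falls
//! back to host memory):
//!
//! ```ignore
//! use scirs2_fft::sparse_fft_gpu_memory::{
//!     get_global_memory_manager, init_global_memory_manager, AllocationStrategy,
//!     BufferLocation, BufferType,
//! };
//! use scirs2_fft::sparse_fft_gpu::GPUBackend;
//!
//! // Set up a CPU-fallback manager with a 1 MiB limit and size-based caching.
//! init_global_memory_manager(
//!     GPUBackend::CPUFallback,
//!     -1,
//!     AllocationStrategy::CacheBySize,
//!     1024 * 1024,
//! )?;
//!
//! // Allocate a host buffer of 1024 elements, 16 bytes each, then release it.
//! let manager = get_global_memory_manager()?;
//! let mut manager = manager.lock().unwrap();
//! let buffer = manager.allocate_buffer(1024, 16, BufferLocation::Host, BufferType::Input)?;
//! manager.release_buffer(buffer)?;
//! ```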

use crate::error::{FFTError, FFTResult};
use crate::sparse_fft_gpu::GPUBackend;
use scirs2_core::numeric::Complex64;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

// CUDA support temporarily disabled until cudarc dependency is enabled
// #[cfg(feature = "cuda")]
// use cudarc::driver::{CudaDevice, DevicePtr, DriverError};

// HIP support temporarily disabled until hiprt dependency is enabled
// #[cfg(feature = "hip")]
// use hiprt::{hipDevice_t, hipDeviceptr_t, hipError_t};

#[cfg(any(feature = "cuda", feature = "hip", feature = "sycl"))]
use std::sync::OnceLock;

// CUDA support temporarily disabled until cudarc dependency is enabled
#[cfg(feature = "cuda")]
static CUDA_DEVICE: OnceLock<Option<Arc<u8>>> = OnceLock::new(); // Placeholder type

// HIP support temporarily disabled until hiprt dependency is enabled
#[cfg(feature = "hip")]
static HIP_DEVICE: OnceLock<Option<u8>> = OnceLock::new(); // Placeholder type

#[cfg(feature = "sycl")]
static SYCL_DEVICE: OnceLock<Option<SyclDevice>> = OnceLock::new();

/// Placeholder SYCL device type
#[cfg(feature = "sycl")]
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct SyclDevice {
    device_id: i32,
    device_name: String,
}

/// Placeholder SYCL device pointer type
#[cfg(feature = "sycl")]
pub type SyclDevicePtr = *mut std::os::raw::c_void;

/// Memory buffer location
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BufferLocation {
    /// Host (CPU) memory
    Host,
    /// Device (GPU) memory
    Device,
    /// Pinned host memory (page-locked for faster transfers)
    PinnedHost,
    /// Unified memory (accessible from both CPU and GPU)
    Unified,
}

/// Memory buffer type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BufferType {
    /// Input signal buffer
    Input,
    /// Output signal buffer
    Output,
    /// Work buffer for intermediate results
    Work,
    /// FFT plan buffer
    Plan,
}

/// Buffer allocation strategy
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AllocationStrategy {
    /// Allocate once, reuse for same sizes
    CacheBySize,
    /// Allocate for each operation
    AlwaysAllocate,
    /// Preallocate a fixed size buffer
    Preallocate,
    /// Use a pool of buffers
    BufferPool,
}

/// Memory buffer descriptor
#[derive(Debug, Clone)]
pub struct BufferDescriptor {
    /// Size of the buffer in elements
    pub size: usize,
    /// Element size in bytes
    pub element_size: usize,
    /// Buffer location
    pub location: BufferLocation,
    /// Buffer type
    pub buffer_type: BufferType,
    /// Buffer ID
    pub id: usize,
    /// GPU backend used for this buffer
    pub backend: GPUBackend,
    /// Device memory pointer (for CUDA)
    #[cfg(feature = "cuda")]
    cuda_device_ptr: Option<*mut u8>, // Placeholder type for disabled CUDA
    /// Device memory pointer (for HIP)
    #[cfg(feature = "hip")]
    hip_device_ptr: Option<*mut u8>, // Placeholder type for disabled HIP
    /// Device memory pointer (for SYCL)
    #[cfg(feature = "sycl")]
    sycl_device_ptr: Option<SyclDevicePtr>,
    /// Host memory pointer (for CPU fallback or pinned memory)
    host_ptr: Option<*mut std::os::raw::c_void>,
}

// SAFETY: BufferDescriptor manages memory through proper allocation/deallocation
// Raw pointers are only used within controlled contexts
unsafe impl Send for BufferDescriptor {}
unsafe impl Sync for BufferDescriptor {}

/// Initialize CUDA device (call once at startup)
#[cfg(feature = "cuda")]
#[allow(dead_code)]
pub fn init_cuda_device() -> FFTResult<bool> {
    let device_result = CUDA_DEVICE.get_or_init(|| {
        // CUDA device initialization temporarily disabled until cudarc dependency is enabled
        /*
        match CudaDevice::new(0) {
            Ok(device) => Some(Arc::new(device)),
            Err(_) => None,
        }
        */
        None // Placeholder - no CUDA device available
    });

    Ok(device_result.is_some())
}

/// Initialize CUDA device (no-op without CUDA feature)
#[cfg(not(feature = "cuda"))]
#[allow(dead_code)]
pub fn init_cuda_device() -> FFTResult<bool> {
    Ok(false)
}

/// Initialize HIP device (call once at startup)
#[cfg(feature = "hip")]
#[allow(dead_code)]
pub fn init_hip_device() -> FFTResult<bool> {
    // HIP support temporarily disabled until hiprt dependency is enabled
    Err(FFTError::NotImplementedError(
        "HIP support is temporarily disabled".to_string(),
    ))
}

/// Initialize HIP device (no-op without HIP feature)
#[cfg(not(feature = "hip"))]
#[allow(dead_code)]
pub fn init_hip_device() -> FFTResult<bool> {
    Ok(false)
}

/// Initialize SYCL device (call once at startup)
#[cfg(feature = "sycl")]
#[allow(dead_code)]
pub fn init_sycl_device() -> FFTResult<bool> {
    let device_result = SYCL_DEVICE.get_or_init(|| {
        // In a real SYCL implementation, this would:
        // 1. Query available SYCL devices
        // 2. Select the best device (GPU preferred, then CPU)
        // 3. Create a SYCL context and queue

        // For now, we'll create a placeholder device
        Some(SyclDevice {
            device_id: 0,
            device_name: "Generic SYCL Device".to_string(),
        })
    });

    Ok(device_result.is_some())
}

/// Initialize SYCL device (no-op without SYCL feature)
#[cfg(not(feature = "sycl"))]
#[allow(dead_code)]
pub fn init_sycl_device() -> FFTResult<bool> {
    Ok(false)
}

/// Check if CUDA is available
#[cfg(feature = "cuda")]
#[allow(dead_code)]
pub fn is_cuda_available() -> bool {
    CUDA_DEVICE.get().map(|d| d.is_some()).unwrap_or(false)
}

/// Check if CUDA is available (always false without CUDA feature)
#[cfg(not(feature = "cuda"))]
#[allow(dead_code)]
pub fn is_cuda_available() -> bool {
    false
}

/// Check if HIP is available
#[cfg(feature = "hip")]
#[allow(dead_code)]
pub fn is_hip_available() -> bool {
    HIP_DEVICE.get().map(|d| d.is_some()).unwrap_or(false)
}

/// Check if HIP is available (always false without HIP feature)
#[cfg(not(feature = "hip"))]
#[allow(dead_code)]
pub fn is_hip_available() -> bool {
    false
}

/// Check if SYCL is available
#[cfg(feature = "sycl")]
#[allow(dead_code)]
pub fn is_sycl_available() -> bool {
    SYCL_DEVICE.get().map(|d| d.is_some()).unwrap_or(false)
}

/// Check if SYCL is available (always false without SYCL feature)
#[cfg(not(feature = "sycl"))]
#[allow(dead_code)]
pub fn is_sycl_available() -> bool {
    false
}

/// Check if any GPU backend is available
#[allow(dead_code)]
pub fn is_gpu_available() -> bool {
    is_cuda_available() || is_hip_available() || is_sycl_available()
}

/// Initialize the best available GPU backend
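///
/// Tries CUDA, then HIP, then SYCL, and falls back to the CPU if none is
/// available. A minimal sketch of the call pattern (which variant is returned
/// depends on the enabled Cargo features and the hardware present):
///
/// ```ignore
/// let backend = init_gpu_backend()?;
/// match backend {
///     GPUBackend::CUDA | GPUBackend::HIP | GPUBackend::SYCL => {
///         // A device was initialized; device buffers can be requested.
///     }
///     GPUBackend::CPUFallback => {
///         // No GPU backend available; everything stays in host memory.
///     }
/// }
/// ```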
#[allow(dead_code)]
pub fn init_gpu_backend() -> FFTResult<GPUBackend> {
    // Try CUDA first (usually fastest)
    if init_cuda_device()? {
        return Ok(GPUBackend::CUDA);
    }

    // Then try HIP (AMD GPUs)
    if init_hip_device()? {
        return Ok(GPUBackend::HIP);
    }

    // Then try SYCL (cross-platform, Intel GPUs, etc.)
    if init_sycl_device()? {
        return Ok(GPUBackend::SYCL);
    }

    // Fall back to CPU
    Ok(GPUBackend::CPUFallback)
}

impl BufferDescriptor {
    /// Create a new buffer descriptor with specified backend
    pub fn new(
        size: usize,
        element_size: usize,
        location: BufferLocation,
        buffer_type: BufferType,
        id: usize,
        backend: GPUBackend,
    ) -> FFTResult<Self> {
        let mut descriptor = Self {
            size,
            element_size,
            location,
            buffer_type,
            id,
            backend,
            #[cfg(feature = "cuda")]
            cuda_device_ptr: None,
            #[cfg(feature = "hip")]
            hip_device_ptr: None,
            #[cfg(feature = "sycl")]
            sycl_device_ptr: None,
            host_ptr: None,
        };

        descriptor.allocate()?;
        Ok(descriptor)
    }

    /// Create a new buffer descriptor with auto-detected backend
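    ///
    /// A minimal sketch (requesting a `Device` buffer silently degrades to a
    /// host buffer when no GPU backend can be initialized; the element size
    /// used here is that of `Complex64`):
    ///
    /// ```ignore
    /// let buffer = BufferDescriptor::new_auto(
    ///     1024,                              // number of elements
    ///     std::mem::size_of::<Complex64>(),  // 16 bytes per element
    ///     BufferLocation::Device,
    ///     BufferType::Input,
    ///     0,                                 // buffer ID
    /// )?;
    /// // When a GPU backend is active the data lives on the device; otherwise
    /// // the descriptor falls back to host memory.
    /// let on_gpu = buffer.has_device_memory();
    /// ```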
    pub fn new_auto(
        size: usize,
        element_size: usize,
        location: BufferLocation,
        buffer_type: BufferType,
        id: usize,
    ) -> FFTResult<Self> {
        let backend = init_gpu_backend()?;
        Self::new(size, element_size, location, buffer_type, id, backend)
    }

    /// Allocate the actual memory based on location and backend
    fn allocate(&mut self) -> FFTResult<()> {
        let total_size = self.size * self.element_size;

        match self.location {
            BufferLocation::Device => {
                match self.backend {
                    GPUBackend::CUDA => {
                        #[cfg(feature = "cuda")]
                        {
                            if let Some(_device) = CUDA_DEVICE.get().and_then(|d| d.as_ref()) {
                                // CUDA API calls temporarily disabled until cudarc dependency is enabled
                                /*
                                let device_mem = device.alloc::<u8>(total_size).map_err(|e| {
                                    FFTError::ComputationError(format!(
                                        "Failed to allocate CUDA memory: {:?}",
                                        e
                                    ))
                                })?;
                                self.cuda_device_ptr = Some(device_mem);
                                return Ok(());
                                */
                            }
                        }

                        // Fallback to host memory if CUDA is not available
                        self.backend = GPUBackend::CPUFallback;
                        self.location = BufferLocation::Host;
                        self.allocate_host_memory(total_size)?;
                    }
                    GPUBackend::HIP => {
                        #[cfg(feature = "hip")]
                        {
                            if HIP_DEVICE.get().map(|d| d.is_some()).unwrap_or(false) {
                                // use hiprt::*; // Temporarily disabled
                                // HIP API calls temporarily disabled until hiprt dependency is available
                                /*
                                unsafe {
                                    let mut device_ptr: hipDeviceptr_t = std::ptr::null_mut();
                                    let result = hipMalloc(&mut device_ptr, total_size);
                                    if result == hipError_t::hipSuccess {
                                        self.hip_device_ptr = Some(device_ptr);
                                        return Ok(());
                                    } else {
                                        return Err(FFTError::ComputationError(format!(
                                            "Failed to allocate HIP memory: {:?}",
                                            result
                                        )));
                                    }
                                }
                                */
                            }
                        }

                        // Fallback to host memory if HIP is not available
                        self.backend = GPUBackend::CPUFallback;
                        self.location = BufferLocation::Host;
                        self.allocate_host_memory(total_size)?;
                    }
                    GPUBackend::SYCL => {
                        #[cfg(feature = "sycl")]
                        {
                            if SYCL_DEVICE.get().map(|d| d.is_some()).unwrap_or(false) {
                                // In a real SYCL implementation, this would:
                                // 1. Use sycl::malloc_device() to allocate device memory
                                // 2. Store the device pointer for later use
                                // 3. Handle allocation errors appropriately

                                // For the placeholder implementation, allocate a host-side
                                // byte buffer and treat its data pointer as the "device"
                                // pointer (boxing the Vec itself would hand out a pointer to
                                // the Vec header rather than to the data).
                                let boxed = vec![0u8; total_size].into_boxed_slice();
                                let device_ptr =
                                    Box::into_raw(boxed) as *mut std::os::raw::c_void;
                                self.sycl_device_ptr = Some(device_ptr);
                                return Ok(());
                            }
                        }

                        // Fallback to host memory if SYCL is not available
                        self.backend = GPUBackend::CPUFallback;
                        self.location = BufferLocation::Host;
                        self.allocate_host_memory(total_size)?;
                    }
                    GPUBackend::CPUFallback => {
                        self.location = BufferLocation::Host;
                        self.allocate_host_memory(total_size)?;
                    }
                }
            }
            BufferLocation::Host | BufferLocation::PinnedHost | BufferLocation::Unified => {
                self.allocate_host_memory(total_size)?;
            }
        }

        Ok(())
    }

    /// Allocate host memory
    fn allocate_host_memory(&mut self, size: usize) -> FFTResult<()> {
        let vec = vec![0u8; size];
        let boxed_slice = vec.into_boxed_slice();
        let ptr = Box::into_raw(boxed_slice) as *mut std::os::raw::c_void;
        self.host_ptr = Some(ptr);
        Ok(())
    }

    /// Get host pointer and size
    pub fn get_host_ptr(&self) -> (*mut std::os::raw::c_void, usize) {
        match self.host_ptr {
            Some(ptr) => (ptr, self.size * self.element_size),
            None => {
                // This shouldn't happen with proper allocation
                panic!("Attempted to get host pointer from unallocated buffer");
            }
        }
    }

    /// Get device pointer (CUDA)
    #[cfg(feature = "cuda")]
    pub fn get_cuda_device_ptr(&self) -> Option<*mut u8> {
        self.cuda_device_ptr
    }

    /// Get device pointer (HIP)
    #[cfg(feature = "hip")]
    pub fn get_hip_device_ptr(&self) -> Option<*mut u8> {
        self.hip_device_ptr
    }

    /// Get device pointer (SYCL)
    #[cfg(feature = "sycl")]
    pub fn get_sycl_device_ptr(&self) -> Option<SyclDevicePtr> {
        self.sycl_device_ptr
    }

    /// Check if this buffer has GPU memory allocated
    pub fn has_device_memory(&self) -> bool {
        match self.backend {
            GPUBackend::CUDA => {
                #[cfg(feature = "cuda")]
                return self.cuda_device_ptr.is_some();
                #[cfg(not(feature = "cuda"))]
                return false;
            }
            GPUBackend::HIP => {
                #[cfg(feature = "hip")]
                return self.hip_device_ptr.is_some();
                #[cfg(not(feature = "hip"))]
                return false;
            }
            GPUBackend::SYCL => {
                #[cfg(feature = "sycl")]
                return self.sycl_device_ptr.is_some();
                #[cfg(not(feature = "sycl"))]
                return false;
            }
            _ => false,
        }
    }

    /// Copy data from host to device
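    ///
    /// A minimal sketch of a round trip through the buffer (copies are
    /// byte-oriented; the caller is responsible for not exceeding the buffer
    /// size of `size * element_size` bytes):
    ///
    /// ```ignore
    /// let input = vec![1u8; 64];
    /// buffer.copy_host_to_device(&input)?;
    ///
    /// let mut output = vec![0u8; 64];
    /// buffer.copy_device_to_host(&mut output)?;
    /// assert_eq!(input, output);
    /// ```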
    pub fn copy_host_to_device(&self, hostdata: &[u8]) -> FFTResult<()> {
        match self.location {
            BufferLocation::Device => {
                match self.backend {
                    GPUBackend::CUDA => {
                        #[cfg(feature = "cuda")]
                        {
                            if let (Some(_device_ptr), Some(_device)) = (
                                self.cuda_device_ptr.as_ref(),
                                CUDA_DEVICE.get().and_then(|d| d.as_ref()),
                            ) {
                                // CUDA API calls temporarily disabled until cudarc dependency is enabled
                                /*
                                device.htod_copy(hostdata, device_ptr).map_err(|e| {
                                    FFTError::ComputationError(format!(
                                        "Failed to copy _data to CUDA GPU: {:?}",
                                        e
                                    ))
                                })?;
                                return Ok(());
                                */
                            }
                        }

                        // Fallback to host memory
                        self.copy_to_host_memory(hostdata)?;
                    }
                    GPUBackend::HIP => {
                        #[cfg(feature = "hip")]
                        {
                            if let Some(_device_ptr) = self.hip_device_ptr {
                                // use hiprt::*; // Temporarily disabled
                                // HIP API calls temporarily disabled until hiprt dependency is available
                                /*
                                unsafe {
                                    let result = hipMemcpyHtoD(
                                        device_ptr,
                                        hostdata.as_ptr() as *const std::os::raw::c_void,
                                        hostdata.len(),
                                    );
                                    if result == hipError_t::hipSuccess {
                                        return Ok(());
                                    } else {
                                        return Err(FFTError::ComputationError(format!(
                                            "Failed to copy _data to HIP GPU: {:?}",
                                            result
                                        )));
                                    }
                                }
                                */
                            }
                        }

                        // Fallback to host memory
                        self.copy_to_host_memory(hostdata)?;
                    }
                    GPUBackend::SYCL => {
                        #[cfg(feature = "sycl")]
                        {
                            if let Some(device_ptr) = self.sycl_device_ptr {
                                // In a real SYCL implementation, this would:
                                // 1. Use sycl::queue::memcpy() or similar to copy the data
                                // 2. Handle synchronization appropriately
                                // 3. Return appropriate error codes

                                // For placeholder implementation, simulate the copy
                                unsafe {
                                    std::ptr::copy_nonoverlapping(
                                        hostdata.as_ptr(),
                                        device_ptr as *mut u8,
                                        hostdata.len(),
                                    );
                                }
                                return Ok(());
                            }
                        }

                        // Fallback to host memory
                        self.copy_to_host_memory(hostdata)?;
                    }
                    _ => {
                        // CPU fallback
                        self.copy_to_host_memory(hostdata)?;
                    }
                }
            }
            BufferLocation::Host | BufferLocation::PinnedHost | BufferLocation::Unified => {
                self.copy_to_host_memory(hostdata)?;
            }
        }

        Ok(())
    }

    /// Helper to copy data to host memory
    fn copy_to_host_memory(&self, hostdata: &[u8]) -> FFTResult<()> {
        if let Some(host_ptr) = self.host_ptr {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    hostdata.as_ptr(),
                    host_ptr as *mut u8,
                    hostdata.len(),
                );
            }
        }
        Ok(())
    }

    /// Copy data from device to host
    pub fn copy_device_to_host(&self, hostdata: &mut [u8]) -> FFTResult<()> {
        match self.location {
            BufferLocation::Device => {
                match self.backend {
                    GPUBackend::CUDA => {
                        #[cfg(feature = "cuda")]
                        {
                            if let (Some(_device_ptr), Some(_device)) = (
                                self.cuda_device_ptr.as_ref(),
                                CUDA_DEVICE.get().and_then(|d| d.as_ref()),
                            ) {
                                // CUDA API calls temporarily disabled until cudarc dependency is enabled
                                /*
                                device.dtoh_copy(device_ptr, hostdata).map_err(|e| {
                                    FFTError::ComputationError(format!(
                                        "Failed to copy _data from CUDA GPU: {:?}",
                                        e
                                    ))
                                })?;
                                return Ok(());
                                */
                            }
                        }

                        // Fallback to host memory
                        self.copy_from_host_memory(hostdata)?;
                    }
                    GPUBackend::HIP => {
                        #[cfg(feature = "hip")]
                        {
                            if let Some(_device_ptr) = self.hip_device_ptr {
                                // use hiprt::*; // Temporarily disabled
                                // HIP API calls temporarily disabled until hiprt dependency is available
                                /*
                                unsafe {
                                    let result = hipMemcpyDtoH(
                                        hostdata.as_mut_ptr() as *mut std::os::raw::c_void,
                                        device_ptr,
                                        hostdata.len(),
                                    );
                                    if result == hipError_t::hipSuccess {
                                        return Ok(());
                                    } else {
                                        return Err(FFTError::ComputationError(format!(
                                            "Failed to copy _data from HIP GPU: {:?}",
                                            result
                                        )));
                                    }
                                }
                                */
                            }
                        }

                        // Fallback to host memory
                        self.copy_from_host_memory(hostdata)?;
                    }
                    GPUBackend::SYCL => {
                        #[cfg(feature = "sycl")]
                        {
                            if let Some(device_ptr) = self.sycl_device_ptr {
                                // In a real SYCL implementation, this would:
                                // 1. Use sycl::queue::memcpy() to copy from device to host
                                // 2. Handle synchronization and error checking
                                // 3. Wait for completion if needed

                                // For placeholder implementation, simulate the copy
                                unsafe {
                                    std::ptr::copy_nonoverlapping(
                                        device_ptr as *const u8,
                                        hostdata.as_mut_ptr(),
                                        hostdata.len(),
                                    );
                                }
                                return Ok(());
                            }
                        }

                        // Fallback to host memory
                        self.copy_from_host_memory(hostdata)?;
                    }
                    _ => {
                        // CPU fallback
                        self.copy_from_host_memory(hostdata)?;
                    }
                }
            }
            BufferLocation::Host | BufferLocation::PinnedHost | BufferLocation::Unified => {
                self.copy_from_host_memory(hostdata)?;
            }
        }

        Ok(())
    }

    /// Helper to copy data from host memory
    fn copy_from_host_memory(&self, hostdata: &mut [u8]) -> FFTResult<()> {
        if let Some(host_ptr) = self.host_ptr {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    host_ptr as *const u8,
                    hostdata.as_mut_ptr(),
                    hostdata.len(),
                );
            }
        }
        Ok(())
    }
}

impl Drop for BufferDescriptor {
    fn drop(&mut self) {
        // Clean up host memory
        if let Some(ptr) = self.host_ptr.take() {
            unsafe {
                // Convert back to Box<[u8]> to drop properly
                let vec_size = self.size * self.element_size;
                let _ = Box::from_raw(std::ptr::slice_from_raw_parts_mut(ptr as *mut u8, vec_size));
            }
        }

        // Clean up device memory based on backend
        match self.backend {
            GPUBackend::CUDA => {
                // With the placeholder pointer type there is nothing to free; once cudarc is
                // enabled, CUDA device memory is dropped when the DevicePtr goes out of scope.
                #[cfg(feature = "cuda")]
                {
                    self.cuda_device_ptr.take();
                }
            }
            GPUBackend::HIP => {
                // Clean up HIP device memory
                #[cfg(feature = "hip")]
                {
                    if let Some(_device_ptr) = self.hip_device_ptr.take() {
                        // use hiprt::*; // Temporarily disabled
                        // HIP API calls temporarily disabled until hiprt dependency is available
                        /*
                        unsafe {
                            let _ = hipFree(device_ptr);
                        }
                        */
                    }
                }
            }
            GPUBackend::SYCL => {
                // Clean up SYCL device memory
                #[cfg(feature = "sycl")]
                {
                    if let Some(device_ptr) = self.sycl_device_ptr.take() {
                        // In a real SYCL implementation, this would:
                        // 1. Use sycl::free() to deallocate device memory
                        // 2. Handle any synchronization requirements
                        // 3. Clean up associated SYCL resources

                        // For the placeholder implementation, rebuild the boxed byte slice
                        // allocated in `allocate` (`size * element_size` bytes) so it is
                        // freed with the same layout it was allocated with.
                        let total_size = self.size * self.element_size;
                        unsafe {
                            let _ = Box::from_raw(std::ptr::slice_from_raw_parts_mut(
                                device_ptr as *mut u8,
                                total_size,
                            ));
                        }
                    }
                }
            }
            _ => {
                // No GPU memory to clean up for CPU fallback
            }
        }
    }
}

/// GPU Memory manager for sparse FFT operations
pub struct GPUMemoryManager {
    /// GPU backend
    backend: GPUBackend,
    /// Current device ID
    _device_id: i32,
    /// Allocation strategy
    allocation_strategy: AllocationStrategy,
    /// Maximum memory usage in bytes
    max_memory: usize,
    /// Current memory usage in bytes
    current_memory: usize,
    /// Buffer cache by size
    buffer_cache: HashMap<usize, Vec<BufferDescriptor>>,
    /// Next buffer ID
    next_buffer_id: usize,
}

impl GPUMemoryManager {
    /// Create a new GPU memory manager
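    ///
    /// A minimal sketch (passing `0` for `max_memory` disables the limit check
    /// in `allocate_buffer`):
    ///
    /// ```ignore
    /// let mut manager = GPUMemoryManager::new(
    ///     GPUBackend::CPUFallback,
    ///     -1,
    ///     AllocationStrategy::AlwaysAllocate,
    ///     0,
    /// );
    /// let buffer = manager.allocate_buffer(1024, 8, BufferLocation::Host, BufferType::Work)?;
    /// manager.release_buffer(buffer)?;
    /// ```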
    pub fn new(
        backend: GPUBackend,
        device_id: i32,
        allocation_strategy: AllocationStrategy,
        max_memory: usize,
    ) -> Self {
        Self {
            backend,
            _device_id: device_id,
            allocation_strategy,
            max_memory,
            current_memory: 0,
            buffer_cache: HashMap::new(),
            next_buffer_id: 0,
        }
    }

    /// Get backend name
    pub fn backend_name(&self) -> &'static str {
        match self.backend {
            GPUBackend::CUDA => "CUDA",
            GPUBackend::HIP => "HIP",
            GPUBackend::SYCL => "SYCL",
            GPUBackend::CPUFallback => "CPU",
        }
    }

    /// Allocate a buffer of specified size and type
    pub fn allocate_buffer(
        &mut self,
        size: usize,
        element_size: usize,
        location: BufferLocation,
        buffer_type: BufferType,
    ) -> FFTResult<BufferDescriptor> {
        let total_size = size * element_size;

        // Check if we're going to exceed the memory limit
        if self.max_memory > 0 && self.current_memory + total_size > self.max_memory {
            return Err(FFTError::MemoryError(format!(
                "Memory limit exceeded: cannot allocate {} bytes (current usage: {} bytes, limit: {} bytes)",
                total_size, self.current_memory, self.max_memory
            )));
        }

        // If using a cache strategy, check if we have an available buffer
        if self.allocation_strategy == AllocationStrategy::CacheBySize {
            if let Some(buffers) = self.buffer_cache.get_mut(&size) {
                if let Some(descriptor) = buffers
                    .iter()
                    .position(|b| b.buffer_type == buffer_type && b.location == location)
                    .map(|idx| buffers.remove(idx))
                {
                    return Ok(descriptor);
                }
            }
        }

        // Allocate a new buffer with proper memory allocation
        let buffer_id = self.next_buffer_id;
        self.next_buffer_id += 1;

        // Create the descriptor (which performs the actual allocation) before
        // updating the accounting, so a failed allocation does not inflate
        // `current_memory`.
        let descriptor = BufferDescriptor::new(
            size,
            element_size,
            location,
            buffer_type,
            buffer_id,
            self.backend,
        )?;
        self.current_memory += total_size;

        Ok(descriptor)
    }

    /// Release a buffer
    pub fn release_buffer(&mut self, descriptor: BufferDescriptor) -> FFTResult<()> {
        let buffer_size = descriptor.size * descriptor.element_size;

        // If using cache strategy, add to cache but don't decrement memory (it's still allocated)
        if self.allocation_strategy == AllocationStrategy::CacheBySize {
            self.buffer_cache
                .entry(descriptor.size)
                .or_default()
                .push(descriptor);
        } else {
            // Actually free the buffer and decrement memory usage
            self.current_memory = self.current_memory.saturating_sub(buffer_size);
        }

        Ok(())
    }

    /// Clear the buffer cache
    pub fn clear_cache(&mut self) -> FFTResult<()> {
        // Free all cached buffers and update memory usage
        for (_, buffers) in self.buffer_cache.drain() {
            for descriptor in buffers {
                let buffer_size = descriptor.size * descriptor.element_size;
                self.current_memory = self.current_memory.saturating_sub(buffer_size);
                // The BufferDescriptor's Drop implementation will handle actual memory cleanup
            }
        }

        Ok(())
    }

    /// Get current memory usage
    pub fn current_memory_usage(&self) -> usize {
        self.current_memory
    }

    /// Get memory limit
    pub fn memory_limit(&self) -> usize {
        self.max_memory
    }
}

/// Global memory manager singleton
static GLOBAL_MEMORY_MANAGER: Mutex<Option<Arc<Mutex<GPUMemoryManager>>>> = Mutex::new(None);

/// Initialize global memory manager
#[allow(dead_code)]
pub fn init_global_memory_manager(
    backend: GPUBackend,
    device_id: i32,
    allocation_strategy: AllocationStrategy,
    max_memory: usize,
) -> FFTResult<()> {
    let mut global = GLOBAL_MEMORY_MANAGER
        .lock()
        .expect("global memory manager mutex poisoned");
    *global = Some(Arc::new(Mutex::new(GPUMemoryManager::new(
        backend,
        device_id,
        allocation_strategy,
        max_memory,
    ))));
    Ok(())
}

/// Get global memory manager
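///
/// If no manager has been initialized yet, a default CPU-fallback manager with
/// the `CacheBySize` strategy and no memory limit is created lazily. A minimal
/// sketch:
///
/// ```ignore
/// let manager = get_global_memory_manager()?;
/// let backend = manager.lock().unwrap().backend_name();
/// assert_eq!(backend, "CPU"); // when no GPU backend was initialized
/// ```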
#[allow(dead_code)]
pub fn get_global_memory_manager() -> FFTResult<Arc<Mutex<GPUMemoryManager>>> {
    let global = GLOBAL_MEMORY_MANAGER
        .lock()
        .expect("global memory manager mutex poisoned");
    if let Some(ref manager) = *global {
        Ok(manager.clone())
    } else {
        // Release the lock before re-initializing; `init_global_memory_manager`
        // takes the same mutex and would otherwise deadlock.
        drop(global);
        // Create a default memory manager if none exists
        init_global_memory_manager(
            GPUBackend::CPUFallback,
            -1,
            AllocationStrategy::CacheBySize,
            0,
        )?;
        get_global_memory_manager()
    }
}

/// Memory-efficient GPU sparse FFT computation.
///
/// Currently a placeholder: it acquires the global memory manager and returns a
/// zero-filled spectrum of the same length as the input signal.
#[allow(dead_code)]
pub fn memory_efficient_gpu_sparse_fft<T>(
    signal: &[T],
    _max_memory: usize,
) -> FFTResult<Vec<Complex64>>
where
    T: Clone + 'static,
{
    // Get the global memory manager
    let manager = get_global_memory_manager()?;
    let _manager = manager
        .lock()
        .expect("global memory manager mutex poisoned");

    // A real implementation would use the memory limit to choose a chunk size
    // and process the signal in chunks. For now this is a placeholder that
    // returns a zero-filled spectrum of the same length as the input.
    let signal_len = signal.len();
    let result = vec![Complex64::new(0.0, 0.0); signal_len];

    Ok(result)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_memory_manager_allocation() {
        let mut manager = GPUMemoryManager::new(
            GPUBackend::CPUFallback,
            -1,
            AllocationStrategy::AlwaysAllocate,
            1024 * 1024, // 1MB limit
        );

        // Allocate a buffer
        let buffer = manager
            .allocate_buffer(
                1024,
                8, // element size in bytes (note: Complex64 is actually 16 bytes)
                BufferLocation::Host,
                BufferType::Input,
            )
            .expect("Operation failed");

        assert_eq!(buffer.size, 1024);
        assert_eq!(buffer.element_size, 8);
        assert_eq!(buffer.location, BufferLocation::Host);
        assert_eq!(buffer.buffer_type, BufferType::Input);
        assert_eq!(manager.current_memory_usage(), 1024 * 8);

        // Release buffer
        manager.release_buffer(buffer).expect("Operation failed");
        assert_eq!(manager.current_memory_usage(), 0);
    }

    #[test]
    fn test_memory_manager_cache() {
        let mut manager = GPUMemoryManager::new(
            GPUBackend::CPUFallback,
            -1,
            AllocationStrategy::CacheBySize,
            1024 * 1024, // 1MB limit
        );

        // Allocate a buffer
        let buffer1 = manager
            .allocate_buffer(
                1024,
                8, // element size in bytes (note: Complex64 is actually 16 bytes)
                BufferLocation::Host,
                BufferType::Input,
            )
            .expect("Operation failed");

        // Release to cache
        manager.release_buffer(buffer1).expect("Operation failed");

        // Memory usage should not decrease when using CacheBySize
        assert_eq!(manager.current_memory_usage(), 1024 * 8);

        // Allocate same size buffer, should get from cache
        let buffer2 = manager
            .allocate_buffer(1024, 8, BufferLocation::Host, BufferType::Input)
            .expect("Operation failed");

        // Memory should not increase since we're reusing
        assert_eq!(manager.current_memory_usage(), 1024 * 8);

        // Release the second buffer back to cache
        manager.release_buffer(buffer2).expect("Operation failed");

        // Memory should still be allocated (cached)
        assert_eq!(manager.current_memory_usage(), 1024 * 8);

        // Clear cache - now this should free the cached memory
        manager.clear_cache().expect("Operation failed");
        assert_eq!(manager.current_memory_usage(), 0);
    }

    #[test]
    fn test_global_memory_manager() {
        // Initialize global memory manager
        init_global_memory_manager(
            GPUBackend::CPUFallback,
            -1,
            AllocationStrategy::CacheBySize,
            1024 * 1024,
        )
        .expect("Operation failed");

        // Get global memory manager
        let manager = get_global_memory_manager().expect("Operation failed");
        let mut manager = manager.lock().expect("Operation failed");

        // Allocate a buffer
        let buffer = manager
            .allocate_buffer(1024, 8, BufferLocation::Host, BufferType::Input)
            .expect("Operation failed");

        assert_eq!(buffer.size, 1024);
        manager.release_buffer(buffer).expect("Operation failed");
    }
}