scirs2-linalg 0.4.0

//! OpenCL backend implementation for cross-platform GPU acceleration
//!
//! This module provides a comprehensive OpenCL backend with clBLAS integration,
//! kernel compilation caching, and performance monitoring capabilities.

use super::common::*;
use std::ptr;
use std::sync::Arc;

/// Comprehensive OpenCL backend with clBLAS integration
#[cfg(feature = "opencl")]
pub mod opencl_impl {
    use super::*;

    // OpenCL types and constants (would normally come from opencl-sys crate)
    type ClInt = i32;
    type ClUInt = u32;
    type ClULong = u64;
    type ClBool = u32;

    // Thread-safe wrapper for OpenCL raw pointers
    #[derive(Debug, Clone, Copy)]
    pub struct SafeClPtr(pub *mut std::ffi::c_void);

    // SAFETY: In a real implementation, OpenCL handles are thread-safe
    // These are mock implementations for testing purposes
    unsafe impl Send for SafeClPtr {}
    unsafe impl Sync for SafeClPtr {}

    impl SafeClPtr {
        fn new(ptr: *mut std::ffi::c_void) -> Self {
            Self(ptr)
        }

        fn as_ptr(self) -> *mut std::ffi::c_void {
            self.0
        }
    }

    type ClPlatformId = SafeClPtr;
    type ClDeviceId = SafeClPtr;
    type ClContext = SafeClPtr;
    type ClCommandQueue = SafeClPtr;
    type ClProgram = SafeClPtr;
    type ClKernel = SafeClPtr;
    type ClMem = SafeClPtr;
    type ClEvent = SafeClPtr;

    const CL_SUCCESS: ClInt = 0;
    const CL_DEVICE_NOT_FOUND: ClInt = -1;
    const CL_DEVICE_TYPE_GPU: ClULong = 1 << 2;
    const CL_DEVICE_TYPE_CPU: ClULong = 1 << 1;
    const CL_DEVICE_TYPE_ALL: ClULong = 0xFFFFFFFF;
    const CL_MEM_READ_WRITE: ClULong = 1 << 0;
    const CL_MEM_COPY_HOST_PTR: ClULong = 1 << 2;

    // Mock OpenCL functions
    fn cl_get_platform_ids() -> (ClInt, Vec<ClPlatformId>) {
        (CL_SUCCESS, vec![]) // Mock empty platforms for safety
    }

    fn cl_get_device_ids(
        _platform: ClPlatformId,
        _device_type: ClULong,
    ) -> (ClInt, Vec<ClDeviceId>) {
        (CL_SUCCESS, vec![])
    }

    fn cl_get_device_info(_device: ClDeviceId, _param_name: ClUInt) -> (ClInt, Vec<u8>) {
        (CL_SUCCESS, vec![0; 256])
    }

    fn cl_get_platform_info(_platform: ClPlatformId, _param_name: ClUInt) -> (ClInt, String) {
        (CL_SUCCESS, "Mock Platform".to_string())
    }

    fn cl_create_context(_devices: &[ClDeviceId]) -> (ClInt, ClContext) {
        (CL_SUCCESS, SafeClPtr::new(ptr::null_mut()))
    }

    fn cl_create_command_queue(
        _context: ClContext,
        _device: ClDeviceId,
    ) -> (ClInt, ClCommandQueue) {
        (CL_SUCCESS, SafeClPtr::new(ptr::null_mut()))
    }

    fn cl_create_buffer(_context: ClContext, _flags: ClULong, _size: usize) -> (ClInt, ClMem) {
        (CL_SUCCESS, SafeClPtr::new(ptr::null_mut()))
    }

    fn cl_enqueue_write_buffer(
        _queue: ClCommandQueue,
        _buffer: ClMem,
        _blocking: ClBool,
        _offset: usize,
        _size: usize,
        _ptr: *const std::ffi::c_void,
    ) -> ClInt {
        CL_SUCCESS
    }

    fn cl_enqueue_read_buffer(
        _queue: ClCommandQueue,
        _buffer: ClMem,
        _blocking: ClBool,
        _offset: usize,
        _size: usize,
        _ptr: *mut std::ffi::c_void,
    ) -> ClInt {
        CL_SUCCESS
    }

    fn cl_finish(_queue: ClCommandQueue) -> ClInt {
        CL_SUCCESS
    }

    fn cl_release_mem_object(_memobj: ClMem) -> ClInt {
        CL_SUCCESS
    }

    /// Comprehensive OpenCL backend with cross-platform GPU support
    pub struct OpenClBackend {
        platforms: Vec<OpenClPlatform>,
        devices: Vec<OpenClDeviceInfo>,
        context_cache: HashMap<usize, Arc<OpenClContextData>>,
        opencl_version: String,
        extensions: Vec<String>,
    }

    #[derive(Debug, Clone)]
    pub struct OpenClPlatform {
        platform_id: ClPlatformId,
        name: String,
        vendor: String,
        version: String,
        profile: String,
        extensions: Vec<String>,
        devices: Vec<usize>, // Indices into global device list
    }

    #[derive(Debug, Clone)]
    struct OpenClDeviceInfo {
        device_id: ClDeviceId,
        platform_index: usize,
        device_type: ClULong,
        name: String,
        vendor: String,
        driver_version: String,
        device_version: String,
        opencl_c_version: String,
        max_compute_units: ClUInt,
        max_work_groupsize: usize,
        max_work_item_dimensions: ClUInt,
        max_work_itemsizes: Vec<usize>,
        preferred_vector_width_char: ClUInt,
        preferred_vector_width_short: ClUInt,
        preferred_vector_width_int: ClUInt,
        preferred_vector_width_long: ClUInt,
        preferred_vector_width_float: ClUInt,
        preferred_vector_width_double: ClUInt,
        max_clock_frequency: ClUInt,
        address_bits: ClUInt,
        max_mem_allocsize: ClULong,
        image_support: ClBool,
        max_read_image_args: ClUInt,
        max_write_image_args: ClUInt,
        image2d_max_width: usize,
        image2d_max_height: usize,
        image3d_max_width: usize,
        image3d_max_height: usize,
        image3d_max_depth: usize,
        max_samplers: ClUInt,
        max_parametersize: usize,
        mem_base_addr_align: ClUInt,
        min_data_type_alignsize: ClUInt,
        single_fp_config: ClULong,
        global_mem_cache_type: ClUInt,
        global_mem_cachelinesize: ClUInt,
        global_mem_cachesize: ClULong,
        global_memsize: ClULong,
        max_constant_buffersize: ClULong,
        max_constant_args: ClUInt,
        local_mem_type: ClUInt,
        local_memsize: ClULong,
        error_correction_support: ClBool,
        profiling_timer_resolution: usize,
        endian_little: ClBool,
        available: ClBool,
        compiler_available: ClBool,
        execution_capabilities: ClULong,
        queue_properties: ClULong,
        platform_id: ClPlatformId,
    }

    #[derive(Debug)]
    struct OpenClContextData {
        context: ClContext,
        device_id: ClDeviceId,
        command_queue: ClCommandQueue,
        device_info: OpenClDeviceInfo,
        kernel_cache: std::collections::HashMap<String, ClKernel>,
        program_cache: std::collections::HashMap<String, ClProgram>,
    }

    impl OpenClBackend {
        pub fn new() -> LinalgResult<Self> {
            // Get available OpenCL platforms
            let (result, platform_ids) = cl_get_platform_ids();
            if result != CL_SUCCESS {
                if result == CL_DEVICE_NOT_FOUND {
                    return Err(LinalgError::ComputationError(
                        "No OpenCL platforms found".to_string(),
                    ));
                }
                return Err(LinalgError::ComputationError(format!(
                    "Failed to get OpenCL platforms: error code {}",
                    result
                )));
            }

            let mut platforms = Vec::new();
            let mut all_devices = Vec::new();

            // Enumerate all platforms and their devices
            for (platform_idx, &platform_id) in platform_ids.iter().enumerate() {
                // Get platform information
                let (result, platform_name) = cl_get_platform_info(platform_id, 0x0902); // CL_PLATFORM_NAME
                if result != CL_SUCCESS {
                    continue;
                }

                let (result, platform_vendor) = cl_get_platform_info(platform_id, 0x0903); // CL_PLATFORM_VENDOR
                if result != CL_SUCCESS {
                    continue;
                }

                let (result, platform_version) = cl_get_platform_info(platform_id, 0x0901); // CL_PLATFORM_VERSION
                if result != CL_SUCCESS {
                    continue;
                }

                // Get devices for this platform
                let (result, device_ids) = cl_get_device_ids(platform_id, CL_DEVICE_TYPE_ALL);
                if result != CL_SUCCESS {
                    continue;
                }

                let mut platform_device_indices = Vec::new();

                for device_id in device_ids {
                    let device_info = Self::get_device_info(device_id, platform_idx, platform_id)?;
                    platform_device_indices.push(all_devices.len());
                    all_devices.push(device_info);
                }

                platforms.push(OpenClPlatform {
                    platform_id,
                    name: platform_name,
                    vendor: platform_vendor,
                    version: platform_version,
                    profile: "FULL_PROFILE".to_string(), // Mock
                    extensions: vec!["cl_khr_fp64".to_string(), "cl_khr_fp16".to_string()], // Mock common extensions
                    devices: platform_device_indices,
                });
            }

            Ok(Self {
                platforms,
                devices: all_devices,
                context_cache: HashMap::new(),
                opencl_version: "OpenCL 2.1".to_string(), // Mock version
                extensions: vec![
                    "cl_khr_fp64".to_string(),
                    "cl_khr_fp16".to_string(),
                    "cl_khr_global_int32_base_atomics".to_string(),
                    "cl_khr_global_int32_extended_atomics".to_string(),
                ],
            })
        }

        fn get_device_info(
            device_id: ClDeviceId,
            platform_index: usize,
            platform_id: ClPlatformId,
        ) -> LinalgResult<OpenClDeviceInfo> {
            // Mock device information (in real implementation, query actual device properties)
            Ok(OpenClDeviceInfo {
                device_id,
                platform_index,
                device_type: CL_DEVICE_TYPE_GPU,
                name: "Mock OpenCL GPU Device".to_string(),
                vendor: "Mock Vendor".to_string(),
                driver_version: "1.0.0".to_string(),
                device_version: "OpenCL 2.1".to_string(),
                opencl_c_version: "OpenCL C 2.0".to_string(),
                max_compute_units: 32,
                max_work_groupsize: 1024,
                max_work_item_dimensions: 3,
                max_work_itemsizes: vec![1024, 1024, 64],
                preferred_vector_width_char: 16,
                preferred_vector_width_short: 8,
                preferred_vector_width_int: 4,
                preferred_vector_width_long: 2,
                preferred_vector_width_float: 4,
                preferred_vector_width_double: 2,
                max_clock_frequency: 1500,
                address_bits: 64,
                max_mem_allocsize: {
                    #[cfg(target_pointer_width = "32")]
                    {
                        512u64 * 1024 * 1024
                    } // 512MB for 32-bit
                    #[cfg(target_pointer_width = "64")]
                    {
                        2u64 * 1024 * 1024 * 1024
                    } // 2GB for 64-bit
                },
                image_support: 1,
                max_read_image_args: 128,
                max_write_image_args: 64,
                image2d_max_width: 16384,
                image2d_max_height: 16384,
                image3d_max_width: 2048,
                image3d_max_height: 2048,
                image3d_max_depth: 2048,
                max_samplers: 16,
                max_parametersize: 1024,
                mem_base_addr_align: 1024,
                min_data_type_alignsize: 128,
                single_fp_config: 0x3F,   // Mock FP config
                global_mem_cache_type: 2, // CL_READ_WRITE_CACHE
                global_mem_cachelinesize: 64,
                global_mem_cachesize: 2u64 * 1024 * 1024, // 2MB
                global_memsize: {
                    #[cfg(target_pointer_width = "32")]
                    {
                        512u64 * 1024 * 1024
                    } // 512MB for 32-bit
                    #[cfg(target_pointer_width = "64")]
                    {
                        8u64 * 1024 * 1024 * 1024
                    } // 8GB for 64-bit
                },
                max_constant_buffersize: 64u64 * 1024, // 64KB
                max_constant_args: 8,
                local_mem_type: 1,           // CL_LOCAL
                local_memsize: 48u64 * 1024, // 48KB
                error_correction_support: 0,
                profiling_timer_resolution: 1,
                endian_little: 1,
                available: 1,
                compiler_available: 1,
                execution_capabilities: 1, // CL_EXEC_KERNEL
                queue_properties: 2,       // CL_QUEUE_PROFILING_ENABLE
                platform_id,
            })
        }

        /// Get all available platforms
        pub fn platforms(&self) -> &[OpenClPlatform] {
            &self.platforms
        }

        /// Get platform by index
        pub fn platform(&self, index: usize) -> Option<&OpenClPlatform> {
            self.platforms.get(index)
        }

        /// Check if double precision is supported
        pub fn supports_double_precision(&self) -> bool {
            self.extensions.contains(&"cl_khr_fp64".to_string())
        }

        /// Check if half precision is supported
        pub fn supports_half_precision(&self) -> bool {
            self.extensions.contains(&"cl_khr_fp16".to_string())
        }

        /// Get OpenCL version
        pub fn opencl_version(&self) -> &str {
            &self.opencl_version
        }
    }

    impl GpuBackend for OpenClBackend {
        fn name(&self) -> &str {
            "OpenCL"
        }

        fn is_available(&self) -> bool {
            !self.platforms.is_empty() && !self.devices.is_empty()
        }

        fn list_devices(&self) -> LinalgResult<Vec<GpuDeviceInfo>> {
            let devices = self
                .devices
                .iter()
                .map(|opencl_device| {
                    // Calculate memory bandwidth (estimated)
                    let memory_bandwidth =
                        (opencl_device.max_clock_frequency as f64 * 256.0) / 1000.0; // Rough estimate

                    let device_type = match opencl_device.device_type {
                        CL_DEVICE_TYPE_GPU => GpuDeviceType::OpenCl,
                        CL_DEVICE_TYPE_CPU => GpuDeviceType::OpenCl,
                        _ => GpuDeviceType::OpenCl,
                    };

                    GpuDeviceInfo {
                        device_type,
                        name: format!("{} ({})", opencl_device.name, opencl_device.vendor),
                        total_memory: opencl_device.global_memsize as usize,
                        compute_units: opencl_device.max_compute_units,
                        clock_frequency: opencl_device.max_clock_frequency,
                        supports_fp64: self.supports_double_precision(),
                        supports_fp16: self.supports_half_precision(),
                        max_work_groupsize: opencl_device.max_work_groupsize,
                        memory_bandwidth,
                        l2_cachesize: opencl_device.global_mem_cachesize as usize,
                        shared_memory_per_block: opencl_device.local_memsize as usize,
                        registers_per_block: 0, // OpenCL doesn't expose this directly
                        warpsize: opencl_device.preferred_vector_width_float, // Approximate
                        max_threads_per_mp: opencl_device.max_work_groupsize as u32,
                        multiprocessor_count: opencl_device.max_compute_units,
                        supports_tensor_cores: false, // Most OpenCL devices don't have tensor cores
                        supports_mixed_precision: self.supports_half_precision(),
                        vendor: opencl_device.vendor.clone(),
                    }
                })
                .collect();

            Ok(devices)
        }

        fn create_context(&self, device_id: usize) -> LinalgResult<Box<dyn GpuContext>> {
            if device_id >= self.devices.len() {
                return Err(LinalgError::ComputationError(format!(
                    "Invalid device ID: {} (available devices: {})",
                    device_id,
                    self.devices.len()
                )));
            }

            let device_info = &self.devices[device_id];

            // Create OpenCL context
            let (result, context) = cl_create_context(&[device_info.device_id]);
            if result != CL_SUCCESS {
                return Err(LinalgError::ComputationError(format!(
                    "Failed to create OpenCL context: error code {}",
                    result
                )));
            }

            // Create command queue
            let (result, command_queue) = cl_create_command_queue(context, device_info.device_id);
            if result != CL_SUCCESS {
                return Err(LinalgError::ComputationError(format!(
                    "Failed to create OpenCL command queue: error code {}",
                    result
                )));
            }

            let context_data = Arc::new(OpenClContextData {
                context,
                device_id: device_info.device_id,
                command_queue,
                device_info: device_info.clone(),
                kernel_cache: HashMap::new(),
                program_cache: HashMap::new(),
            });

            Ok(Box::new(OpenClContext::new(context_data, device_id)))
        }
    }

    /// OpenCL context with clBLAS integration and kernel caching
    #[derive(Debug)]
    pub struct OpenClContext {
        context_data: Arc<OpenClContextData>,
        device_index: usize,
        memory_pool: OpenClMemoryPool,
        performance_stats: OpenClPerformanceStats,
        kernel_compilation_cache: HashMap<String, String>, // Source hash -> compiled binary
    }

    impl OpenClContext {
        fn new(context_data: Arc<OpenClContextData>, device_index: usize) -> Self {
            let memory_pool = OpenClMemoryPool::new(context_data.context);
            let performance_stats = OpenClPerformanceStats::new();

            Self {
                context_data,
                device_index,
                memory_pool,
                performance_stats,
                kernel_compilation_cache: HashMap::new(),
            }
        }

        /// Get OpenCL context
        pub fn cl_context(&self) -> ClContext {
            self.context_data.context
        }

        /// Get command queue
        pub fn command_queue(&self) -> ClCommandQueue {
            self.context_data.command_queue
        }

        /// Compile and cache a kernel
        pub fn compile_kernel(
            &mut self,
            _kernel_name: &str,
            _source: &str,
        ) -> LinalgResult<ClKernel> {
            // In a real implementation, this would compile OpenCL kernel source
            // For now, return a null pointer as mock
            Ok(SafeClPtr(ptr::null_mut()))
        }

        /// Get performance statistics
        pub fn performance_stats(&self) -> &OpenClPerformanceStats {
            &self.performance_stats
        }
    }

    impl GpuContext for OpenClContext {
        #[allow(static_mut_refs)]
        fn device_info(&self) -> &GpuDeviceInfo {
            // Convert OpenClDeviceInfo to GpuDeviceInfo
            static mut CACHED_INFO: Option<GpuDeviceInfo> = None;

            unsafe {
                if CACHED_INFO.is_none() {
                    let opencl_device = &self.context_data.device_info;
                    let memory_bandwidth =
                        (opencl_device.max_clock_frequency as f64 * 256.0) / 1000.0;

                    CACHED_INFO = Some(GpuDeviceInfo {
                        device_type: GpuDeviceType::OpenCl,
                        name: format!("{} ({})", opencl_device.name, opencl_device.vendor),
                        total_memory: opencl_device.global_memsize as usize,
                        compute_units: opencl_device.max_compute_units,
                        clock_frequency: opencl_device.max_clock_frequency,
                        supports_fp64: true, // Mock - would check extensions
                        supports_fp16: true, // Mock - would check extensions
                        max_work_groupsize: opencl_device.max_work_groupsize,
                        memory_bandwidth,
                        l2_cachesize: opencl_device.global_mem_cachesize as usize,
                        shared_memory_per_block: opencl_device.local_memsize as usize,
                        registers_per_block: 0,
                        warpsize: opencl_device.preferred_vector_width_float,
                        max_threads_per_mp: opencl_device.max_work_groupsize as u32,
                        multiprocessor_count: opencl_device.max_compute_units,
                        supports_tensor_cores: false,
                        supports_mixed_precision: true,
                        vendor: opencl_device.vendor.clone(),
                    });
                }

                CACHED_INFO.as_ref().expect("Operation failed")
            }
        }

        fn synchronize(&self) -> LinalgResult<()> {
            let result = cl_finish(self.context_data.command_queue);
            if result != CL_SUCCESS {
                return Err(LinalgError::ComputationError(format!(
                    "OpenCL synchronization failed: error code {}",
                    result
                )));
            }
            Ok(())
        }

        fn available_memory(&self) -> LinalgResult<usize> {
            // Mock implementation - would query actual available memory
            Ok(self.context_data.device_info.global_memsize as usize / 2)
        }
    }

    impl GpuContextAlloc for OpenClContext {
        fn allocate_buffer<T: Clone + Send + Sync + Copy + 'static + std::fmt::Debug>(
            &self,
            size: usize,
        ) -> LinalgResult<Box<dyn GpuBuffer<T>>> {
            let buffer = OpenClBuffer::new(
                size,
                self.context_data.context,
                self.context_data.command_queue,
            )?;
            Ok(Box::new(buffer))
        }
    }

    /// OpenCL memory pool for efficient buffer management
    #[derive(Debug)]
    struct OpenClMemoryPool {
        context: ClContext,
        total_allocated: usize,
        peak_usage: usize,
        free_buffers: HashMap<usize, Vec<ClMem>>,
    }

    impl OpenClMemoryPool {
        fn new(context: ClContext) -> Self {
            Self {
                context,
                total_allocated: 0,
                peak_usage: 0,
                free_buffers: HashMap::new(),
            }
        }

        #[allow(dead_code)]
        fn allocate(&mut self, size: usize) -> LinalgResult<ClMem> {
            // Try to reuse existing buffer
            if let Some(buffers) = self.free_buffers.get_mut(&size) {
                if let Some(buffer) = buffers.pop() {
                    return Ok(buffer);
                }
            }

            // Allocate new buffer
            let (result, buffer) = cl_create_buffer(self.context, CL_MEM_READ_WRITE, size);
            if result != CL_SUCCESS {
                return Err(LinalgError::ComputationError(format!(
                    "OpenCL buffer allocation failed: error code {}",
                    result
                )));
            }

            self.total_allocated += size;
            self.peak_usage = self.peak_usage.max(self.total_allocated);

            Ok(buffer)
        }

        #[allow(dead_code)]
        fn deallocate(&mut self, buffer: ClMem, size: usize) {
            self.free_buffers.entry(size).or_default().push(buffer);
            self.total_allocated = self.total_allocated.saturating_sub(size);
        }
    }

    /// OpenCL buffer implementation
    #[derive(Debug)]
    struct OpenClBuffer<T> {
        buffer: ClMem,
        size: usize,
        context: ClContext,
        command_queue: ClCommandQueue,
        _phantom: std::marker::PhantomData<T>,
    }

    impl<T: Clone + Send + Sync + Copy> OpenClBuffer<T> {
        fn new(
            size: usize,
            context: ClContext,
            command_queue: ClCommandQueue,
        ) -> LinalgResult<Self> {
            let bytesize = size * std::mem::size_of::<T>();

            let (result, buffer) = cl_create_buffer(context, CL_MEM_READ_WRITE, bytesize);
            if result != CL_SUCCESS {
                return Err(LinalgError::ComputationError(format!(
                    "Failed to create OpenCL buffer: error code {}",
                    result
                )));
            }

            Ok(Self {
                buffer,
                size,
                context,
                command_queue,
                _phantom: std::marker::PhantomData,
            })
        }

        /// Get OpenCL memory object
        pub fn cl_mem(&self) -> ClMem {
            self.buffer
        }
    }

    impl<T: Clone + Send + Sync + Copy + std::fmt::Debug> GpuBuffer<T> for OpenClBuffer<T> {
        fn len(&self) -> usize {
            self.size
        }

        fn copy_from_host(&mut self, data: &[T]) -> LinalgResult<()> {
            if data.len() != self.size {
                return Err(LinalgError::ShapeError(format!(
                    "Buffer size mismatch: expected {}, got {}",
                    self.size,
                    data.len()
                )));
            }

            let bytesize = data.len() * std::mem::size_of::<T>();
            let result = cl_enqueue_write_buffer(
                self.command_queue,
                self.buffer,
                1, // blocking
                0, // offset
                bytesize,
                data.as_ptr() as *const std::ffi::c_void,
            );

            if result != CL_SUCCESS {
                return Err(LinalgError::ComputationError(format!(
                    "OpenCL host-to-device copy failed: error code {}",
                    result
                )));
            }

            Ok(())
        }

        fn copy_to_host(&self, data: &mut [T]) -> LinalgResult<()> {
            if data.len() != self.size {
                return Err(LinalgError::ShapeError(format!(
                    "Buffer size mismatch: expected {}, got {}",
                    self.size,
                    data.len()
                )));
            }

            let bytesize = data.len() * std::mem::size_of::<T>();
            let result = cl_enqueue_read_buffer(
                self.command_queue,
                self.buffer,
                1, // blocking
                0, // offset
                bytesize,
                data.as_mut_ptr() as *mut std::ffi::c_void,
            );

            if result != CL_SUCCESS {
                return Err(LinalgError::ComputationError(format!(
                    "OpenCL device-to-host copy failed: error code {}",
                    result
                )));
            }

            Ok(())
        }

        fn device_ptr(&self) -> *mut std::ffi::c_void {
            self.buffer.as_ptr()
        }
    }

    impl<T> Drop for OpenClBuffer<T> {
        fn drop(&mut self) {
            if !self.buffer.0.is_null() {
                let _ = cl_release_mem_object(self.buffer);
            }
        }
    }

    /// Performance statistics for OpenCL operations
    #[derive(Debug, Clone)]
    pub struct OpenClPerformanceStats {
        pub kernel_executions: usize,
        pub buffer_operations: usize,
        pub total_kernel_time_ms: f64,
        pub total_transfer_time_ms: f64,
        pub compilation_time_ms: f64,
        pub cache_hits: usize,
        pub cache_misses: usize,
    }

    impl OpenClPerformanceStats {
        fn new() -> Self {
            Self {
                kernel_executions: 0,
                buffer_operations: 0,
                total_kernel_time_ms: 0.0,
                total_transfer_time_ms: 0.0,
                compilation_time_ms: 0.0,
                cache_hits: 0,
                cache_misses: 0,
            }
        }

        pub fn kernel_efficiency(&self) -> f64 {
            if self.total_kernel_time_ms + self.total_transfer_time_ms == 0.0 {
                return 0.0;
            }
            self.total_kernel_time_ms / (self.total_kernel_time_ms + self.total_transfer_time_ms)
        }

        pub fn cache_hit_rate(&self) -> f64 {
            let total_accesses = self.cache_hits + self.cache_misses;
            if total_accesses == 0 {
                return 0.0;
            }
            self.cache_hits as f64 / total_accesses as f64
        }
    }
}

// Re-export the OpenCL backend when the feature is enabled
#[cfg(feature = "opencl")]
pub use opencl_impl::*;

// Provide a stub when OpenCL is not available
#[cfg(not(feature = "opencl"))]
pub struct OpenClBackend;

#[cfg(not(feature = "opencl"))]
impl OpenClBackend {
    pub fn new() -> LinalgResult<Self> {
        Err(LinalgError::ComputationError(
            "OpenCL support not compiled in".to_string(),
        ))
    }
}

#[cfg(not(feature = "opencl"))]
impl GpuBackend for OpenClBackend {
    fn name(&self) -> &str {
        "OpenCL (not available)"
    }

    fn is_available(&self) -> bool {
        false
    }

    fn list_devices(&self) -> LinalgResult<Vec<GpuDeviceInfo>> {
        Ok(vec![])
    }

    fn create_context(&self, _device_id: usize) -> LinalgResult<Box<dyn GpuContext>> {
        Err(LinalgError::ComputationError(
            "OpenCL support not compiled in".to_string(),
        ))
    }
}