quantrs2-sim 0.1.3

//! `OpenCL` Backend for AMD GPU Acceleration
//!
//! This module provides high-performance quantum circuit simulation using `OpenCL`
//! to leverage AMD GPU compute capabilities. It implements parallel state vector
//! operations, gate applications, and quantum algorithm acceleration on AMD
//! graphics processing units.
//!
//! Key features:
//! - `OpenCL` kernel compilation and execution
//! - AMD GPU-optimized quantum gate operations
//! - Parallel state vector manipulation
//! - Memory management for large quantum states
//! - Support for AMD `ROCm` and `OpenCL` 2.0+
//! - Automatic device detection and selection
//! - Performance profiling and optimization
//! - Fallback to CPU when GPU is unavailable

use crate::prelude::{SimulatorError, StateVectorSimulator};
use scirs2_core::parallel_ops::{IndexedParallelIterator, ParallelIterator};
use scirs2_core::Complex64;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

use crate::error::Result;

/// `OpenCL` platform information
#[derive(Debug, Clone)]
pub struct OpenCLPlatform {
    /// Platform ID
    pub platform_id: usize,
    /// Platform name
    pub name: String,
    /// Platform vendor
    pub vendor: String,
    /// Platform version
    pub version: String,
    /// Supported extensions
    pub extensions: Vec<String>,
}

/// `OpenCL` device information
#[derive(Debug, Clone)]
pub struct OpenCLDevice {
    /// Device ID
    pub device_id: usize,
    /// Device name
    pub name: String,
    /// Device vendor
    pub vendor: String,
    /// Device type (GPU, CPU, etc.)
    pub device_type: OpenCLDeviceType,
    /// Compute units
    pub compute_units: u32,
    /// Maximum work group size
    pub max_work_group_size: usize,
    /// Maximum work item dimensions
    pub max_work_item_dimensions: u32,
    /// Maximum work item sizes
    pub max_work_item_sizes: Vec<usize>,
    /// Global memory size
    pub global_memory_size: u64,
    /// Local memory size
    pub local_memory_size: u64,
    /// Maximum constant buffer size
    pub max_constant_buffer_size: u64,
    /// Supports double precision
    pub supports_double: bool,
    /// Device extensions
    pub extensions: Vec<String>,
}

/// `OpenCL` device types
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OpenCLDeviceType {
    GPU,
    CPU,
    Accelerator,
    Custom,
    All,
}

/// `OpenCL` backend configuration
#[derive(Debug, Clone)]
pub struct OpenCLConfig {
    /// Preferred platform vendor
    pub preferred_vendor: Option<String>,
    /// Preferred device type
    pub preferred_device_type: OpenCLDeviceType,
    /// Enable performance profiling
    pub enable_profiling: bool,
    /// Maximum memory allocation per buffer
    pub max_buffer_size: usize,
    /// Work group size for kernels
    pub work_group_size: usize,
    /// Enable kernel caching
    pub enable_kernel_cache: bool,
    /// `OpenCL` optimization level
    pub optimization_level: OptimizationLevel,
    /// Enable automatic fallback to CPU
    pub enable_cpu_fallback: bool,
}

/// `OpenCL` optimization levels
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OptimizationLevel {
    /// No optimization (-O0)
    None,
    /// Basic optimization (-O1)
    Basic,
    /// Standard optimization (-O2)
    Standard,
    /// Aggressive optimization (-O3)
    Aggressive,
}

impl Default for OpenCLConfig {
    fn default() -> Self {
        Self {
            preferred_vendor: Some("Advanced Micro Devices".to_string()),
            preferred_device_type: OpenCLDeviceType::GPU,
            enable_profiling: true,
            max_buffer_size: 1 << 30, // 1GB
            work_group_size: 256,
            enable_kernel_cache: true,
            optimization_level: OptimizationLevel::Standard,
            enable_cpu_fallback: true,
        }
    }
}

/// `OpenCL` kernel information
#[derive(Debug, Clone)]
pub struct OpenCLKernel {
    /// Kernel name
    pub name: String,
    /// Kernel source code
    pub source: String,
    /// Compilation options
    pub build_options: String,
    /// Local memory usage
    pub local_memory_usage: usize,
    /// Work group size
    pub work_group_size: usize,
}

/// AMD GPU-optimized quantum simulator using `OpenCL`
pub struct AMDOpenCLSimulator {
    /// Configuration
    config: OpenCLConfig,
    /// Selected platform
    platform: Option<OpenCLPlatform>,
    /// Selected device
    device: Option<OpenCLDevice>,
    /// `OpenCL` context (simulated)
    context: Option<OpenCLContext>,
    /// Command queue (simulated)
    command_queue: Option<OpenCLCommandQueue>,
    /// Compiled kernels
    kernels: HashMap<String, OpenCLKernel>,
    /// Memory buffers
    buffers: HashMap<String, OpenCLBuffer>,
    /// Performance statistics
    stats: OpenCLStats,
    /// Fallback CPU simulator
    cpu_fallback: Option<StateVectorSimulator>,
}

/// Simulated `OpenCL` context
#[derive(Debug, Clone)]
pub struct OpenCLContext {
    /// Context ID
    pub context_id: usize,
    /// Associated devices
    pub devices: Vec<usize>,
}

/// Simulated `OpenCL` command queue
#[derive(Debug, Clone)]
pub struct OpenCLCommandQueue {
    /// Queue ID
    pub queue_id: usize,
    /// Associated context
    pub context_id: usize,
    /// Associated device
    pub device_id: usize,
    /// Enable profiling
    pub profiling_enabled: bool,
}

/// Simulated `OpenCL` buffer
#[derive(Debug, Clone)]
pub struct OpenCLBuffer {
    /// Buffer ID
    pub buffer_id: usize,
    /// Buffer size in bytes
    pub size: usize,
    /// Memory flags
    pub flags: MemoryFlags,
    /// Host pointer (for simulation)
    pub host_data: Option<Vec<u8>>,
}

/// `OpenCL` memory flags
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryFlags {
    ReadWrite,
    ReadOnly,
    WriteOnly,
    UseHostPtr,
    AllocHostPtr,
    CopyHostPtr,
}

/// `OpenCL` performance statistics
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OpenCLStats {
    /// Total kernel executions
    pub total_kernel_executions: usize,
    /// Total execution time (ms)
    pub total_execution_time: f64,
    /// Average kernel execution time (ms)
    pub avg_kernel_time: f64,
    /// Memory transfer time (ms)
    pub memory_transfer_time: f64,
    /// Compilation time (ms)
    pub compilation_time: f64,
    /// GPU memory usage (bytes)
    pub gpu_memory_usage: u64,
    /// GPU utilization percentage
    pub gpu_utilization: f64,
    /// Number of state vector operations
    pub state_vector_operations: usize,
    /// Number of gate operations
    pub gate_operations: usize,
    /// Fallback to CPU count
    pub cpu_fallback_count: usize,
}

impl OpenCLStats {
    /// Update statistics after kernel execution
    pub fn update_kernel_execution(&mut self, execution_time: f64) {
        self.total_kernel_executions += 1;
        self.total_execution_time += execution_time;
        self.avg_kernel_time = self.total_execution_time / self.total_kernel_executions as f64;
    }

    /// Calculate performance metrics
    #[must_use]
    pub fn get_performance_metrics(&self) -> HashMap<String, f64> {
        let mut metrics = HashMap::new();
        metrics.insert(
            "kernel_executions_per_second".to_string(),
            self.total_kernel_executions as f64 / (self.total_execution_time / 1000.0),
        );
        metrics.insert(
            "memory_bandwidth_gb_s".to_string(),
            self.gpu_memory_usage as f64 / (self.memory_transfer_time / 1000.0) / 1e9,
        );
        metrics.insert("gpu_efficiency".to_string(), self.gpu_utilization / 100.0);
        metrics
    }
}

impl AMDOpenCLSimulator {
    /// Create new AMD `OpenCL` simulator
    pub fn new(config: OpenCLConfig) -> Result<Self> {
        let mut simulator = Self {
            config,
            platform: None,
            device: None,
            context: None,
            command_queue: None,
            kernels: HashMap::new(),
            buffers: HashMap::new(),
            stats: OpenCLStats::default(),
            cpu_fallback: None,
        };

        // Initialize OpenCL environment
        simulator.initialize_opencl()?;

        // Compile kernels
        simulator.compile_kernels()?;

        // Initialize CPU fallback if enabled
        if simulator.config.enable_cpu_fallback {
            simulator.cpu_fallback = Some(StateVectorSimulator::new()); // Default size
        }

        Ok(simulator)
    }

    /// Initialize `OpenCL` platform and device
    fn initialize_opencl(&mut self) -> Result<()> {
        // Simulate platform discovery
        let platforms = self.discover_platforms()?;

        // Select preferred platform
        let selected_platform = self.select_platform(&platforms)?;
        self.platform = Some(selected_platform);

        // Discover devices
        let devices = self.discover_devices()?;

        // Select preferred device
        let selected_device = self.select_device(&devices)?;
        self.device = Some(selected_device);

        // Create context and command queue using the selected device
        let device_id = self
            .device
            .as_ref()
            .ok_or_else(|| {
                SimulatorError::InitializationError("Device not initialized".to_string())
            })?
            .device_id;

        self.context = Some(OpenCLContext {
            context_id: 1,
            devices: vec![device_id],
        });

        // Create command queue
        self.command_queue = Some(OpenCLCommandQueue {
            queue_id: 1,
            context_id: 1,
            device_id,
            profiling_enabled: self.config.enable_profiling,
        });

        Ok(())
    }

    /// Discover available `OpenCL` platforms
    fn discover_platforms(&self) -> Result<Vec<OpenCLPlatform>> {
        // Simulate AMD platform discovery
        let platforms = vec![
            OpenCLPlatform {
                platform_id: 0,
                name: "AMD Accelerated Parallel Processing".to_string(),
                vendor: "Advanced Micro Devices, Inc.".to_string(),
                version: "OpenCL 2.1 AMD-APP (3444.0)".to_string(),
                extensions: vec![
                    "cl_khr_icd".to_string(),
                    "cl_khr_d3d10_sharing".to_string(),
                    "cl_khr_d3d11_sharing".to_string(),
                    "cl_khr_dx9_media_sharing".to_string(),
                    "cl_amd_event_callback".to_string(),
                    "cl_amd_offline_devices".to_string(),
                ],
            },
            OpenCLPlatform {
                platform_id: 1,
                name: "Intel(R) OpenCL".to_string(),
                vendor: "Intel(R) Corporation".to_string(),
                version: "OpenCL 2.1".to_string(),
                extensions: vec!["cl_khr_icd".to_string()],
            },
        ];

        Ok(platforms)
    }

    /// Select optimal platform
    fn select_platform(&self, platforms: &[OpenCLPlatform]) -> Result<OpenCLPlatform> {
        // Prefer AMD platform if available
        if let Some(preferred_vendor) = &self.config.preferred_vendor {
            for platform in platforms {
                if platform.vendor.contains(preferred_vendor) {
                    return Ok(platform.clone());
                }
            }
        }

        // Fallback to first available platform
        platforms.first().cloned().ok_or_else(|| {
            SimulatorError::InitializationError("No OpenCL platforms found".to_string())
        })
    }

    /// Discover devices for selected platform
    fn discover_devices(&self) -> Result<Vec<OpenCLDevice>> {
        // Simulate AMD GPU device discovery
        let devices = vec![
            OpenCLDevice {
                device_id: 0,
                name: "Radeon RX 7900 XTX".to_string(),
                vendor: "Advanced Micro Devices, Inc.".to_string(),
                device_type: OpenCLDeviceType::GPU,
                compute_units: 96,
                max_work_group_size: 256,
                max_work_item_dimensions: 3,
                max_work_item_sizes: vec![256, 256, 256],
                global_memory_size: 24 * (1 << 30), // 24GB
                local_memory_size: 64 * 1024,       // 64KB
                max_constant_buffer_size: 64 * 1024,
                supports_double: true,
                extensions: vec![
                    "cl_khr_fp64".to_string(),
                    "cl_amd_fp64".to_string(),
                    "cl_khr_global_int32_base_atomics".to_string(),
                ],
            },
            OpenCLDevice {
                device_id: 1,
                name: "Radeon RX 6800 XT".to_string(),
                vendor: "Advanced Micro Devices, Inc.".to_string(),
                device_type: OpenCLDeviceType::GPU,
                compute_units: 72,
                max_work_group_size: 256,
                max_work_item_dimensions: 3,
                max_work_item_sizes: vec![256, 256, 256],
                global_memory_size: 16 * (1 << 30), // 16GB
                local_memory_size: 64 * 1024,
                max_constant_buffer_size: 64 * 1024,
                supports_double: true,
                extensions: vec!["cl_khr_fp64".to_string(), "cl_amd_fp64".to_string()],
            },
        ];

        Ok(devices)
    }

    /// Select optimal device
    fn select_device(&self, devices: &[OpenCLDevice]) -> Result<OpenCLDevice> {
        // Filter by device type
        let filtered_devices: Vec<&OpenCLDevice> = devices
            .iter()
            .filter(|device| device.device_type == self.config.preferred_device_type)
            .collect();

        if filtered_devices.is_empty() {
            return Err(SimulatorError::InitializationError(
                "No suitable devices found".to_string(),
            ));
        }

        // Select device with most compute units
        let best_device = filtered_devices
            .iter()
            .max_by_key(|device| device.compute_units)
            .ok_or_else(|| {
                SimulatorError::InitializationError("No devices available".to_string())
            })?;

        Ok((*best_device).clone())
    }

    /// Compile `OpenCL` kernels
    fn compile_kernels(&mut self) -> Result<()> {
        let start_time = std::time::Instant::now();

        // Single qubit gate kernel
        let single_qubit_kernel = self.create_single_qubit_kernel();
        self.kernels
            .insert("single_qubit_gate".to_string(), single_qubit_kernel);

        // Two qubit gate kernel
        let two_qubit_kernel = self.create_two_qubit_kernel();
        self.kernels
            .insert("two_qubit_gate".to_string(), two_qubit_kernel);

        // State vector operations kernel
        let state_vector_kernel = self.create_state_vector_kernel();
        self.kernels
            .insert("state_vector_ops".to_string(), state_vector_kernel);

        // Measurement kernel
        let measurement_kernel = self.create_measurement_kernel();
        self.kernels
            .insert("measurement".to_string(), measurement_kernel);

        // Expectation value kernel
        let expectation_kernel = self.create_expectation_kernel();
        self.kernels
            .insert("expectation_value".to_string(), expectation_kernel);

        self.stats.compilation_time = start_time.elapsed().as_secs_f64() * 1000.0;

        Ok(())
    }

    /// Create single qubit gate kernel
    fn create_single_qubit_kernel(&self) -> OpenCLKernel {
        let source = r"
            #pragma OPENCL EXTENSION cl_khr_fp64 : enable

            typedef double2 complex_t;

            complex_t complex_mul(complex_t a, complex_t b) {
                return (complex_t)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
            }

            complex_t complex_add(complex_t a, complex_t b) {
                return (complex_t)(a.x + b.x, a.y + b.y);
            }

            __kernel void single_qubit_gate(
                __global complex_t* state,
                __global const double* gate_matrix,
                const int target_qubit,
                const int num_qubits
            ) {
                const int global_id = get_global_id(0);
                const int total_states = 1 << num_qubits;

                if (global_id >= total_states / 2) return;

                const int target_mask = 1 << target_qubit;
                const int i = global_id;
                const int j = i | target_mask;

                if ((i & target_mask) == 0) {
                    // Extract gate matrix elements
                    complex_t gate_00 = (complex_t)(gate_matrix[0], gate_matrix[1]);
                    complex_t gate_01 = (complex_t)(gate_matrix[2], gate_matrix[3]);
                    complex_t gate_10 = (complex_t)(gate_matrix[4], gate_matrix[5]);
                    complex_t gate_11 = (complex_t)(gate_matrix[6], gate_matrix[7]);

                    complex_t state_i = state[i];
                    complex_t state_j = state[j];

                    state[i] = complex_add(complex_mul(gate_00, state_i), complex_mul(gate_01, state_j));
                    state[j] = complex_add(complex_mul(gate_10, state_i), complex_mul(gate_11, state_j));
                }
            }
        ";

        OpenCLKernel {
            name: "single_qubit_gate".to_string(),
            source: source.to_string(),
            build_options: self.get_build_options(),
            local_memory_usage: 0,
            work_group_size: self.config.work_group_size,
        }
    }

    /// Create two qubit gate kernel
    fn create_two_qubit_kernel(&self) -> OpenCLKernel {
        let source = r"
            #pragma OPENCL EXTENSION cl_khr_fp64 : enable

            typedef double2 complex_t;

            complex_t complex_mul(complex_t a, complex_t b) {
                return (complex_t)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
            }

            complex_t complex_add(complex_t a, complex_t b) {
                return (complex_t)(a.x + b.x, a.y + b.y);
            }

            __kernel void two_qubit_gate(
                __global complex_t* state,
                __global const double* gate_matrix,
                const int control_qubit,
                const int target_qubit,
                const int num_qubits
            ) {
                const int global_id = get_global_id(0);
                const int total_states = 1 << num_qubits;

                if (global_id >= total_states / 4) return;

                const int control_mask = 1 << control_qubit;
                const int target_mask = 1 << target_qubit;
                const int both_mask = control_mask | target_mask;

                int base = global_id;
                // Remove bits at control and target positions
                if (global_id & (target_mask - 1)) base = (base & ~(target_mask - 1)) << 1 | (base & (target_mask - 1));
                if (base & (control_mask - 1)) base = (base & ~(control_mask - 1)) << 1 | (base & (control_mask - 1));

                int state_00 = base;
                int state_01 = base | target_mask;
                int state_10 = base | control_mask;
                int state_11 = base | both_mask;

                // Load gate matrix (16 elements for 4x4 matrix)
                complex_t gate[4][4];
                for (int i = 0; i < 4; i++) {
                    for (int j = 0; j < 4; j++) {
                        gate[i][j] = (complex_t)(gate_matrix[(i*4+j)*2], gate_matrix[(i*4+j)*2+1]);
                    }
                }

                complex_t old_states[4];
                old_states[0] = state[state_00];
                old_states[1] = state[state_01];
                old_states[2] = state[state_10];
                old_states[3] = state[state_11];

                // Apply gate matrix
                complex_t new_states[4] = {0};
                for (int i = 0; i < 4; i++) {
                    for (int j = 0; j < 4; j++) {
                        new_states[i] = complex_add(new_states[i], complex_mul(gate[i][j], old_states[j]));
                    }
                }

                state[state_00] = new_states[0];
                state[state_01] = new_states[1];
                state[state_10] = new_states[2];
                state[state_11] = new_states[3];
            }
        ";

        OpenCLKernel {
            name: "two_qubit_gate".to_string(),
            source: source.to_string(),
            build_options: self.get_build_options(),
            local_memory_usage: 128, // Local memory for gate matrix
            work_group_size: self.config.work_group_size,
        }
    }

    /// Create state vector operations kernel
    fn create_state_vector_kernel(&self) -> OpenCLKernel {
        let source = r"
            #pragma OPENCL EXTENSION cl_khr_fp64 : enable

            typedef double2 complex_t;

            __kernel void normalize_state(
                __global complex_t* state,
                const int num_states,
                const double norm_factor
            ) {
                const int global_id = get_global_id(0);

                if (global_id >= num_states) return;

                state[global_id].x *= norm_factor;
                state[global_id].y *= norm_factor;
            }

            __kernel void compute_probabilities(
                __global const complex_t* state,
                __global double* probabilities,
                const int num_states
            ) {
                const int global_id = get_global_id(0);

                if (global_id >= num_states) return;

                complex_t amplitude = state[global_id];
                probabilities[global_id] = amplitude.x * amplitude.x + amplitude.y * amplitude.y;
            }

            __kernel void inner_product(
                __global const complex_t* state1,
                __global const complex_t* state2,
                __global complex_t* partial_results,
                __local complex_t* local_data,
                const int num_states
            ) {
                const int global_id = get_global_id(0);
                const int local_id = get_local_id(0);
                const int local_size = get_local_size(0);
                const int group_id = get_group_id(0);

                // Initialize local memory
                if (global_id < num_states) {
                    complex_t a = state1[global_id];
                    complex_t b = state2[global_id];
                    // Conjugate of a times b
                    local_data[local_id] = (complex_t)(a.x * b.x + a.y * b.y, a.x * b.y - a.y * b.x);
                } else {
                    local_data[local_id] = (complex_t)(0.0, 0.0);
                }

                barrier(CLK_LOCAL_MEM_FENCE);

                // Reduction
                for (int stride = local_size / 2; stride > 0; stride /= 2) {
                    if (local_id < stride) {
                        local_data[local_id].x += local_data[local_id + stride].x;
                        local_data[local_id].y += local_data[local_id + stride].y;
                    }
                    barrier(CLK_LOCAL_MEM_FENCE);
                }

                if (local_id == 0) {
                    partial_results[group_id] = local_data[0];
                }
            }
        ";

        OpenCLKernel {
            name: "state_vector_ops".to_string(),
            source: source.to_string(),
            build_options: self.get_build_options(),
            local_memory_usage: self.config.work_group_size * 16, // Complex doubles
            work_group_size: self.config.work_group_size,
        }
    }

    /// Create measurement kernel
    fn create_measurement_kernel(&self) -> OpenCLKernel {
        let source = r"
            #pragma OPENCL EXTENSION cl_khr_fp64 : enable

            typedef double2 complex_t;

            __kernel void measure_qubit(
                __global complex_t* state,
                __global double* probabilities,
                const int target_qubit,
                const int num_qubits,
                const int measurement_result
            ) {
                const int global_id = get_global_id(0);
                const int total_states = 1 << num_qubits;

                if (global_id >= total_states) return;

                const int target_mask = 1 << target_qubit;
                const int qubit_value = (global_id & target_mask) ? 1 : 0;

                if (qubit_value != measurement_result) {
                    // Set amplitude to zero for inconsistent measurement
                    state[global_id] = (complex_t)(0.0, 0.0);
                }
            }

            __kernel void compute_measurement_probabilities(
                __global const complex_t* state,
                __global double* prob_0,
                __global double* prob_1,
                __local double* local_data,
                const int target_qubit,
                const int num_qubits
            ) {
                const int global_id = get_global_id(0);
                const int local_id = get_local_id(0);
                const int local_size = get_local_size(0);
                const int group_id = get_group_id(0);
                const int total_states = 1 << num_qubits;

                double local_prob_0 = 0.0;
                double local_prob_1 = 0.0;

                if (global_id < total_states) {
                    const int target_mask = 1 << target_qubit;
                    complex_t amplitude = state[global_id];
                    double prob = amplitude.x * amplitude.x + amplitude.y * amplitude.y;

                    if (global_id & target_mask) {
                        local_prob_1 = prob;
                    } else {
                        local_prob_0 = prob;
                    }
                }

                local_data[local_id * 2] = local_prob_0;
                local_data[local_id * 2 + 1] = local_prob_1;

                barrier(CLK_LOCAL_MEM_FENCE);

                // Reduction
                for (int stride = local_size / 2; stride > 0; stride /= 2) {
                    if (local_id < stride) {
                        local_data[local_id * 2] += local_data[(local_id + stride) * 2];
                        local_data[local_id * 2 + 1] += local_data[(local_id + stride) * 2 + 1];
                    }
                    barrier(CLK_LOCAL_MEM_FENCE);
                }

                if (local_id == 0) {
                    prob_0[group_id] = local_data[0];
                    prob_1[group_id] = local_data[1];
                }
            }
        ";

        OpenCLKernel {
            name: "measurement".to_string(),
            source: source.to_string(),
            build_options: self.get_build_options(),
            local_memory_usage: self.config.work_group_size * 16, // 2 doubles per work item
            work_group_size: self.config.work_group_size,
        }
    }

    /// Create expectation value kernel
    fn create_expectation_kernel(&self) -> OpenCLKernel {
        let source = r"
            #pragma OPENCL EXTENSION cl_khr_fp64 : enable

            typedef double2 complex_t;

            complex_t complex_mul(complex_t a, complex_t b) {
                return (complex_t)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
            }

            __kernel void expectation_value_pauli(
                __global const complex_t* state,
                __global double* partial_results,
                __local double* local_data,
                const int pauli_string,
                const int num_qubits
            ) {
                const int global_id = get_global_id(0);
                const int local_id = get_local_id(0);
                const int local_size = get_local_size(0);
                const int group_id = get_group_id(0);
                const int total_states = 1 << num_qubits;

                double local_expectation = 0.0;

                if (global_id < total_states) {
                    complex_t amplitude = state[global_id];

                    // Apply Pauli operators
                    int target_state = global_id;
                    complex_t result_amplitude = amplitude;
                    double sign = 1.0;

                    // Process each Pauli operator in the string
                    for (int qubit = 0; qubit < num_qubits; qubit++) {
                        int pauli_op = (pauli_string >> (2 * qubit)) & 3;
                        int qubit_mask = 1 << qubit;

                        switch (pauli_op) {
                            case 0: // I (identity)
                                break;
                            case 1: // X (bit flip)
                                target_state ^= qubit_mask;
                                break;
                            case 2: // Y (bit and phase flip)
                                target_state ^= qubit_mask;
                                if (global_id & qubit_mask) sign *= -1.0;
                                else result_amplitude = (complex_t)(-result_amplitude.y, result_amplitude.x);
                                break;
                            case 3: // Z (phase flip)
                                if (global_id & qubit_mask) sign *= -1.0;
                                break;
                        }
                    }

                    if (target_state == global_id) {
                        // Diagonal element
                        local_expectation = sign * (amplitude.x * amplitude.x + amplitude.y * amplitude.y);
                    }
                }

                local_data[local_id] = local_expectation;
                barrier(CLK_LOCAL_MEM_FENCE);

                // Reduction
                for (int stride = local_size / 2; stride > 0; stride /= 2) {
                    if (local_id < stride) {
                        local_data[local_id] += local_data[local_id + stride];
                    }
                    barrier(CLK_LOCAL_MEM_FENCE);
                }

                if (local_id == 0) {
                    partial_results[group_id] = local_data[0];
                }
            }
        ";

        OpenCLKernel {
            name: "expectation_value".to_string(),
            source: source.to_string(),
            build_options: self.get_build_options(),
            local_memory_usage: self.config.work_group_size * 8, // Double per work item
            work_group_size: self.config.work_group_size,
        }
    }

    /// Get build options for kernel compilation
    fn get_build_options(&self) -> String {
        let mut options = Vec::new();

        match self.config.optimization_level {
            OptimizationLevel::None => options.push("-O0"),
            OptimizationLevel::Basic => options.push("-O1"),
            OptimizationLevel::Standard => options.push("-O2"),
            OptimizationLevel::Aggressive => options.push("-O3"),
        }

        // Add AMD-specific optimizations
        options.push("-cl-mad-enable");
        options.push("-cl-fast-relaxed-math");

        // Double precision support
        if let Some(device) = &self.device {
            if device.supports_double {
                options.push("-cl-fp64");
            }
        }

        options.join(" ")
    }

    /// Create memory buffer
    pub fn create_buffer(&mut self, name: &str, size: usize, flags: MemoryFlags) -> Result<()> {
        if size > self.config.max_buffer_size {
            return Err(SimulatorError::MemoryError(format!(
                "Buffer size {} exceeds maximum {}",
                size, self.config.max_buffer_size
            )));
        }

        let buffer = OpenCLBuffer {
            buffer_id: self.buffers.len(),
            size,
            flags,
            host_data: Some(vec![0u8; size]),
        };

        self.buffers.insert(name.to_string(), buffer);
        self.stats.gpu_memory_usage += size as u64;

        Ok(())
    }

    /// Execute kernel
    pub fn execute_kernel(
        &mut self,
        kernel_name: &str,
        global_work_size: &[usize],
        local_work_size: Option<&[usize]>,
        args: &[KernelArg],
    ) -> Result<f64> {
        let start_time = std::time::Instant::now();

        if !self.kernels.contains_key(kernel_name) {
            return Err(SimulatorError::InvalidInput(format!(
                "Kernel {kernel_name} not found"
            )));
        }

        // Simulate kernel execution
        let execution_time = self.simulate_kernel_execution(kernel_name, global_work_size, args)?;

        let total_time = start_time.elapsed().as_secs_f64() * 1000.0;
        self.stats.update_kernel_execution(total_time);

        match kernel_name {
            "single_qubit_gate" | "two_qubit_gate" => {
                self.stats.gate_operations += 1;
            }
            "state_vector_ops" | "normalize_state" | "compute_probabilities" => {
                self.stats.state_vector_operations += 1;
            }
            _ => {}
        }

        Ok(execution_time)
    }

    /// Simulate kernel execution (for demonstration)
    fn simulate_kernel_execution(
        &self,
        kernel_name: &str,
        global_work_size: &[usize],
        _args: &[KernelArg],
    ) -> Result<f64> {
        let total_work_items: usize = global_work_size.iter().product();

        // Simulate execution time based on work items and device capabilities
        let device = self
            .device
            .as_ref()
            .ok_or_else(|| SimulatorError::InvalidState("Device not initialized".to_string()))?;
        let work_groups = total_work_items.div_ceil(self.config.work_group_size);
        let parallel_work_groups = device.compute_units as usize;

        let execution_cycles = work_groups.div_ceil(parallel_work_groups);

        // Base execution time per cycle (microseconds)
        let base_time_per_cycle = match kernel_name {
            "single_qubit_gate" => 1.0,
            "two_qubit_gate" => 2.5,
            "state_vector_ops" => 0.5,
            "measurement" => 1.5,
            "expectation_value" => 2.0,
            _ => 1.0,
        };

        let execution_time = execution_cycles as f64 * base_time_per_cycle;

        // Add random variation
        let variation = fastrand::f64().mul_add(0.2, 0.9); // 90-110% of base time
        Ok(execution_time * variation)
    }

    /// Apply single qubit gate using `OpenCL`
    pub fn apply_single_qubit_gate_opencl(
        &mut self,
        gate_matrix: &[Complex64; 4],
        target_qubit: usize,
        num_qubits: usize,
    ) -> Result<f64> {
        // Convert gate matrix to real array for OpenCL
        let mut gate_real = [0.0; 8];
        for (i, &complex_val) in gate_matrix.iter().enumerate() {
            gate_real[i * 2] = complex_val.re;
            gate_real[i * 2 + 1] = complex_val.im;
        }

        let total_states = 1 << num_qubits;
        let global_work_size = vec![total_states / 2];

        let args = vec![
            KernelArg::Buffer("state".to_string()),
            KernelArg::ConstantBuffer("gate_matrix".to_string()),
            KernelArg::Int(target_qubit as i32),
            KernelArg::Int(num_qubits as i32),
        ];

        self.execute_kernel("single_qubit_gate", &global_work_size, None, &args)
    }

    /// Apply two qubit gate using `OpenCL`
    pub fn apply_two_qubit_gate_opencl(
        &mut self,
        gate_matrix: &[Complex64; 16],
        control_qubit: usize,
        target_qubit: usize,
        num_qubits: usize,
    ) -> Result<f64> {
        // Convert gate matrix to real array for OpenCL
        let mut gate_real = [0.0; 32];
        for (i, &complex_val) in gate_matrix.iter().enumerate() {
            gate_real[i * 2] = complex_val.re;
            gate_real[i * 2 + 1] = complex_val.im;
        }

        let total_states = 1 << num_qubits;
        let global_work_size = vec![total_states / 4];

        let args = vec![
            KernelArg::Buffer("state".to_string()),
            KernelArg::ConstantBuffer("gate_matrix".to_string()),
            KernelArg::Int(control_qubit as i32),
            KernelArg::Int(target_qubit as i32),
            KernelArg::Int(num_qubits as i32),
        ];

        self.execute_kernel("two_qubit_gate", &global_work_size, None, &args)
    }

    /// Compute expectation value using `OpenCL`
    pub fn compute_expectation_value_opencl(
        &mut self,
        pauli_string: u32,
        num_qubits: usize,
    ) -> Result<(f64, f64)> {
        let total_states = 1 << num_qubits;
        let global_work_size = vec![total_states];

        let args = vec![
            KernelArg::Buffer("state".to_string()),
            KernelArg::Buffer("partial_results".to_string()),
            KernelArg::LocalMemory(self.config.work_group_size * 8),
            KernelArg::Int(pauli_string as i32),
            KernelArg::Int(num_qubits as i32),
        ];

        let execution_time = self.execute_kernel(
            "expectation_value",
            &global_work_size,
            Some(&[self.config.work_group_size]),
            &args,
        )?;

        // Simulate expectation value result
        let expectation_value = fastrand::f64().mul_add(2.0, -1.0); // Random value between -1 and 1

        Ok((expectation_value, execution_time))
    }

    /// Get device information
    pub const fn get_device_info(&self) -> Option<&OpenCLDevice> {
        self.device.as_ref()
    }

    /// Get performance statistics
    pub const fn get_stats(&self) -> &OpenCLStats {
        &self.stats
    }

    /// Reset performance statistics
    pub fn reset_stats(&mut self) {
        self.stats = OpenCLStats::default();
    }

    /// Check if `OpenCL` is available
    pub const fn is_opencl_available(&self) -> bool {
        self.context.is_some() && self.device.is_some()
    }

    /// Fallback to CPU simulation
    pub fn fallback_to_cpu(&mut self, num_qubits: usize) -> Result<()> {
        if self.config.enable_cpu_fallback {
            self.cpu_fallback = Some(StateVectorSimulator::new());
            self.stats.cpu_fallback_count += 1;
            Ok(())
        } else {
            Err(SimulatorError::OperationNotSupported(
                "CPU fallback disabled".to_string(),
            ))
        }
    }
}

/// Kernel argument types
#[derive(Debug, Clone)]
pub enum KernelArg {
    Buffer(String),
    ConstantBuffer(String),
    Int(i32),
    Float(f32),
    Double(f64),
    LocalMemory(usize),
}

/// Benchmark AMD `OpenCL` backend performance
pub fn benchmark_amd_opencl_backend() -> Result<HashMap<String, f64>> {
    let mut results = HashMap::new();

    // Test different configurations
    let configs = vec![
        OpenCLConfig {
            work_group_size: 64,
            optimization_level: OptimizationLevel::Standard,
            ..Default::default()
        },
        OpenCLConfig {
            work_group_size: 128,
            optimization_level: OptimizationLevel::Aggressive,
            ..Default::default()
        },
        OpenCLConfig {
            work_group_size: 256,
            optimization_level: OptimizationLevel::Aggressive,
            ..Default::default()
        },
    ];

    for (i, config) in configs.into_iter().enumerate() {
        let start = std::time::Instant::now();

        let mut simulator = AMDOpenCLSimulator::new(config)?;

        // Benchmark single qubit gates
        let single_qubit_matrix = [
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(-1.0 / 2.0_f64.sqrt(), 0.0),
        ];

        for num_qubits in [10, 15, 20] {
            simulator.create_buffer("state", (1 << num_qubits) * 16, MemoryFlags::ReadWrite)?;

            for qubit in 0..num_qubits.min(5) {
                let _time = simulator.apply_single_qubit_gate_opencl(
                    &single_qubit_matrix,
                    qubit,
                    num_qubits,
                )?;
            }
        }

        // Benchmark two qubit gates
        let cnot_matrix = [
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        for num_qubits in [10usize, 15, 20] {
            for pair in 0..num_qubits.saturating_sub(1).min(3) {
                let _time = simulator.apply_two_qubit_gate_opencl(
                    &cnot_matrix,
                    pair,
                    pair + 1,
                    num_qubits,
                )?;
            }
        }

        // Benchmark expectation values
        for num_qubits in [10, 15, 20] {
            let _result = simulator.compute_expectation_value_opencl(0b1010, num_qubits)?;
        }

        let time = start.elapsed().as_secs_f64() * 1000.0;
        results.insert(format!("config_{i}"), time);

        // Add performance metrics
        let stats = simulator.get_stats();
        results.insert(format!("config_{i}_gate_ops"), stats.gate_operations as f64);
        results.insert(format!("config_{i}_avg_kernel_time"), stats.avg_kernel_time);
        results.insert(format!("config_{i}_gpu_utilization"), stats.gpu_utilization);
    }

    Ok(results)
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;

    #[test]
    fn test_opencl_simulator_creation() {
        let config = OpenCLConfig::default();
        let simulator = AMDOpenCLSimulator::new(config);
        assert!(simulator.is_ok());
    }

    #[test]
    fn test_platform_discovery() {
        let config = OpenCLConfig::default();
        let simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
        let platforms = simulator
            .discover_platforms()
            .expect("Platform discovery should succeed");

        assert!(!platforms.is_empty());
        assert!(platforms
            .iter()
            .any(|p| p.vendor.contains("Advanced Micro Devices")));
    }

    #[test]
    fn test_device_discovery() {
        let config = OpenCLConfig::default();
        let simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
        let devices = simulator
            .discover_devices()
            .expect("Device discovery should succeed");

        assert!(!devices.is_empty());
        assert!(devices
            .iter()
            .any(|d| d.device_type == OpenCLDeviceType::GPU));
    }

    #[test]
    fn test_kernel_creation() {
        let config = OpenCLConfig::default();
        let simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        assert!(simulator.kernels.contains_key("single_qubit_gate"));
        assert!(simulator.kernels.contains_key("two_qubit_gate"));
        assert!(simulator.kernels.contains_key("state_vector_ops"));
        assert!(simulator.kernels.contains_key("measurement"));
        assert!(simulator.kernels.contains_key("expectation_value"));
    }

    #[test]
    fn test_buffer_creation() {
        let config = OpenCLConfig::default();
        let mut simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        let result = simulator.create_buffer("test_buffer", 1024, MemoryFlags::ReadWrite);
        assert!(result.is_ok());
        assert!(simulator.buffers.contains_key("test_buffer"));
        assert_eq!(simulator.stats.gpu_memory_usage, 1024);
    }

    #[test]
    fn test_buffer_size_limit() {
        let config = OpenCLConfig {
            max_buffer_size: 512,
            ..Default::default()
        };
        let mut simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        let result = simulator.create_buffer("large_buffer", 1024, MemoryFlags::ReadWrite);
        assert!(result.is_err());
    }

    #[test]
    fn test_kernel_execution() {
        let config = OpenCLConfig::default();
        let mut simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        let global_work_size = vec![256];
        let args = vec![
            KernelArg::Buffer("state".to_string()),
            KernelArg::Int(0),
            KernelArg::Int(8),
        ];

        let result = simulator.execute_kernel("single_qubit_gate", &global_work_size, None, &args);
        assert!(result.is_ok());

        let execution_time = result.expect("Kernel execution should succeed");
        assert!(execution_time > 0.0);
    }

    #[test]
    fn test_single_qubit_gate_application() {
        let config = OpenCLConfig::default();
        let mut simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        let hadamard_matrix = [
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(-1.0 / 2.0_f64.sqrt(), 0.0),
        ];

        simulator
            .create_buffer("state", 1024 * 16, MemoryFlags::ReadWrite)
            .expect("Buffer creation should succeed");

        let result = simulator.apply_single_qubit_gate_opencl(&hadamard_matrix, 0, 8);
        assert!(result.is_ok());

        let execution_time = result.expect("Single qubit gate application should succeed");
        assert!(execution_time > 0.0);
    }

    #[test]
    fn test_two_qubit_gate_application() {
        let config = OpenCLConfig::default();
        let mut simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        let cnot_matrix = [
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        simulator
            .create_buffer("state", 1024 * 16, MemoryFlags::ReadWrite)
            .expect("Buffer creation should succeed");

        let result = simulator.apply_two_qubit_gate_opencl(&cnot_matrix, 0, 1, 8);
        assert!(result.is_ok());

        let execution_time = result.expect("Two qubit gate application should succeed");
        assert!(execution_time > 0.0);
    }

    #[test]
    fn test_expectation_value_computation() {
        let config = OpenCLConfig::default();
        let mut simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        simulator
            .create_buffer("state", 1024 * 16, MemoryFlags::ReadWrite)
            .expect("State buffer creation should succeed");
        simulator
            .create_buffer("partial_results", 64 * 8, MemoryFlags::ReadWrite)
            .expect("Partial results buffer creation should succeed");

        let result = simulator.compute_expectation_value_opencl(0b1010, 8);
        assert!(result.is_ok());

        let (expectation, execution_time) =
            result.expect("Expectation value computation should succeed");
        assert!((-1.0..=1.0).contains(&expectation));
        assert!(execution_time > 0.0);
    }

    #[test]
    fn test_build_options() {
        let config = OpenCLConfig {
            optimization_level: OptimizationLevel::Aggressive,
            ..Default::default()
        };
        let simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        let build_options = simulator.get_build_options();
        assert!(build_options.contains("-O3"));
        assert!(build_options.contains("-cl-mad-enable"));
        assert!(build_options.contains("-cl-fast-relaxed-math"));
    }

    #[test]
    fn test_stats_update() {
        let config = OpenCLConfig::default();
        let mut simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        simulator.stats.update_kernel_execution(10.0);
        simulator.stats.update_kernel_execution(20.0);

        assert_eq!(simulator.stats.total_kernel_executions, 2);
        assert_abs_diff_eq!(simulator.stats.total_execution_time, 30.0, epsilon = 1e-10);
        assert_abs_diff_eq!(simulator.stats.avg_kernel_time, 15.0, epsilon = 1e-10);
    }

    #[test]
    fn test_performance_metrics() {
        let config = OpenCLConfig::default();
        let mut simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        simulator.stats.total_kernel_executions = 100;
        simulator.stats.total_execution_time = 1000.0; // 1 second
        simulator.stats.gpu_memory_usage = 1_000_000_000; // 1GB
        simulator.stats.memory_transfer_time = 100.0; // 0.1 second
        simulator.stats.gpu_utilization = 85.0;

        let metrics = simulator.stats.get_performance_metrics();

        assert!(metrics.contains_key("kernel_executions_per_second"));
        assert!(metrics.contains_key("memory_bandwidth_gb_s"));
        assert!(metrics.contains_key("gpu_efficiency"));

        assert_abs_diff_eq!(
            metrics["kernel_executions_per_second"],
            100.0,
            epsilon = 1e-10
        );
        assert_abs_diff_eq!(metrics["gpu_efficiency"], 0.85, epsilon = 1e-10);
    }

    #[test]
    fn test_cpu_fallback() {
        let config = OpenCLConfig {
            enable_cpu_fallback: true,
            ..Default::default()
        };
        let mut simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        let result = simulator.fallback_to_cpu(10);
        assert!(result.is_ok());
        assert_eq!(simulator.stats.cpu_fallback_count, 1);
        assert!(simulator.cpu_fallback.is_some());
    }

    #[test]
    fn test_device_selection() {
        let config = OpenCLConfig {
            preferred_device_type: OpenCLDeviceType::GPU,
            ..Default::default()
        };
        let simulator =
            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");

        let device_info = simulator
            .get_device_info()
            .expect("Device info should be available");
        assert_eq!(device_info.device_type, OpenCLDeviceType::GPU);
        assert!(device_info.name.contains("Radeon"));
        assert_eq!(device_info.vendor, "Advanced Micro Devices, Inc.");
    }
}