quantrs2_sim/
opencl_amd_backend.rs

1//! `OpenCL` Backend for AMD GPU Acceleration
2//!
3//! This module provides high-performance quantum circuit simulation using `OpenCL`
4//! to leverage AMD GPU compute capabilities. It implements parallel state vector
5//! operations, gate applications, and quantum algorithm acceleration on AMD
6//! graphics processing units.
7//!
8//! Key features:
9//! - `OpenCL` kernel compilation and execution
10//! - AMD GPU-optimized quantum gate operations
11//! - Parallel state vector manipulation
12//! - Memory management for large quantum states
13//! - Support for AMD `ROCm` and `OpenCL` 2.0+
14//! - Automatic device detection and selection
15//! - Performance profiling and optimization
16//! - Fallback to CPU when GPU is unavailable
17
18use crate::prelude::{SimulatorError, StateVectorSimulator};
19use scirs2_core::parallel_ops::{IndexedParallelIterator, ParallelIterator};
20use scirs2_core::Complex64;
21use serde::{Deserialize, Serialize};
22use std::collections::HashMap;
23
24use crate::error::Result;
25
26/// `OpenCL` platform information
27#[derive(Debug, Clone)]
28pub struct OpenCLPlatform {
29    /// Platform ID
30    pub platform_id: usize,
31    /// Platform name
32    pub name: String,
33    /// Platform vendor
34    pub vendor: String,
35    /// Platform version
36    pub version: String,
37    /// Supported extensions
38    pub extensions: Vec<String>,
39}
40
41/// `OpenCL` device information
42#[derive(Debug, Clone)]
43pub struct OpenCLDevice {
44    /// Device ID
45    pub device_id: usize,
46    /// Device name
47    pub name: String,
48    /// Device vendor
49    pub vendor: String,
50    /// Device type (GPU, CPU, etc.)
51    pub device_type: OpenCLDeviceType,
52    /// Compute units
53    pub compute_units: u32,
54    /// Maximum work group size
55    pub max_work_group_size: usize,
56    /// Maximum work item dimensions
57    pub max_work_item_dimensions: u32,
58    /// Maximum work item sizes
59    pub max_work_item_sizes: Vec<usize>,
60    /// Global memory size
61    pub global_memory_size: u64,
62    /// Local memory size
63    pub local_memory_size: u64,
64    /// Maximum constant buffer size
65    pub max_constant_buffer_size: u64,
66    /// Supports double precision
67    pub supports_double: bool,
68    /// Device extensions
69    pub extensions: Vec<String>,
70}
71
72/// `OpenCL` device types
73#[derive(Debug, Clone, Copy, PartialEq, Eq)]
74pub enum OpenCLDeviceType {
75    GPU,
76    CPU,
77    Accelerator,
78    Custom,
79    All,
80}
81
82/// `OpenCL` backend configuration
83#[derive(Debug, Clone)]
84pub struct OpenCLConfig {
85    /// Preferred platform vendor
86    pub preferred_vendor: Option<String>,
87    /// Preferred device type
88    pub preferred_device_type: OpenCLDeviceType,
89    /// Enable performance profiling
90    pub enable_profiling: bool,
91    /// Maximum memory allocation per buffer
92    pub max_buffer_size: usize,
93    /// Work group size for kernels
94    pub work_group_size: usize,
95    /// Enable kernel caching
96    pub enable_kernel_cache: bool,
97    /// `OpenCL` optimization level
98    pub optimization_level: OptimizationLevel,
99    /// Enable automatic fallback to CPU
100    pub enable_cpu_fallback: bool,
101}
102
103/// `OpenCL` optimization levels
104#[derive(Debug, Clone, Copy, PartialEq, Eq)]
105pub enum OptimizationLevel {
106    /// No optimization (-O0)
107    None,
108    /// Basic optimization (-O1)
109    Basic,
110    /// Standard optimization (-O2)
111    Standard,
112    /// Aggressive optimization (-O3)
113    Aggressive,
114}
115
116impl Default for OpenCLConfig {
117    fn default() -> Self {
118        Self {
119            preferred_vendor: Some("Advanced Micro Devices".to_string()),
120            preferred_device_type: OpenCLDeviceType::GPU,
121            enable_profiling: true,
122            max_buffer_size: 1 << 30, // 1GB
123            work_group_size: 256,
124            enable_kernel_cache: true,
125            optimization_level: OptimizationLevel::Standard,
126            enable_cpu_fallback: true,
127        }
128    }
129}
130
131/// `OpenCL` kernel information
132#[derive(Debug, Clone)]
133pub struct OpenCLKernel {
134    /// Kernel name
135    pub name: String,
136    /// Kernel source code
137    pub source: String,
138    /// Compilation options
139    pub build_options: String,
140    /// Local memory usage
141    pub local_memory_usage: usize,
142    /// Work group size
143    pub work_group_size: usize,
144}
145
146/// AMD GPU-optimized quantum simulator using `OpenCL`
147pub struct AMDOpenCLSimulator {
148    /// Configuration
149    config: OpenCLConfig,
150    /// Selected platform
151    platform: Option<OpenCLPlatform>,
152    /// Selected device
153    device: Option<OpenCLDevice>,
154    /// `OpenCL` context (simulated)
155    context: Option<OpenCLContext>,
156    /// Command queue (simulated)
157    command_queue: Option<OpenCLCommandQueue>,
158    /// Compiled kernels
159    kernels: HashMap<String, OpenCLKernel>,
160    /// Memory buffers
161    buffers: HashMap<String, OpenCLBuffer>,
162    /// Performance statistics
163    stats: OpenCLStats,
164    /// Fallback CPU simulator
165    cpu_fallback: Option<StateVectorSimulator>,
166}
167
168/// Simulated `OpenCL` context
169#[derive(Debug, Clone)]
170pub struct OpenCLContext {
171    /// Context ID
172    pub context_id: usize,
173    /// Associated devices
174    pub devices: Vec<usize>,
175}
176
177/// Simulated `OpenCL` command queue
178#[derive(Debug, Clone)]
179pub struct OpenCLCommandQueue {
180    /// Queue ID
181    pub queue_id: usize,
182    /// Associated context
183    pub context_id: usize,
184    /// Associated device
185    pub device_id: usize,
186    /// Enable profiling
187    pub profiling_enabled: bool,
188}
189
190/// Simulated `OpenCL` buffer
191#[derive(Debug, Clone)]
192pub struct OpenCLBuffer {
193    /// Buffer ID
194    pub buffer_id: usize,
195    /// Buffer size in bytes
196    pub size: usize,
197    /// Memory flags
198    pub flags: MemoryFlags,
199    /// Host pointer (for simulation)
200    pub host_data: Option<Vec<u8>>,
201}
202
203/// `OpenCL` memory flags
204#[derive(Debug, Clone, Copy, PartialEq, Eq)]
205pub enum MemoryFlags {
206    ReadWrite,
207    ReadOnly,
208    WriteOnly,
209    UseHostPtr,
210    AllocHostPtr,
211    CopyHostPtr,
212}
213
214/// `OpenCL` performance statistics
215#[derive(Debug, Clone, Default, Serialize, Deserialize)]
216pub struct OpenCLStats {
217    /// Total kernel executions
218    pub total_kernel_executions: usize,
219    /// Total execution time (ms)
220    pub total_execution_time: f64,
221    /// Average kernel execution time (ms)
222    pub avg_kernel_time: f64,
223    /// Memory transfer time (ms)
224    pub memory_transfer_time: f64,
225    /// Compilation time (ms)
226    pub compilation_time: f64,
227    /// GPU memory usage (bytes)
228    pub gpu_memory_usage: u64,
229    /// GPU utilization percentage
230    pub gpu_utilization: f64,
231    /// Number of state vector operations
232    pub state_vector_operations: usize,
233    /// Number of gate operations
234    pub gate_operations: usize,
235    /// Fallback to CPU count
236    pub cpu_fallback_count: usize,
237}
238
239impl OpenCLStats {
240    /// Update statistics after kernel execution
241    pub fn update_kernel_execution(&mut self, execution_time: f64) {
242        self.total_kernel_executions += 1;
243        self.total_execution_time += execution_time;
244        self.avg_kernel_time = self.total_execution_time / self.total_kernel_executions as f64;
245    }
246
247    /// Calculate performance metrics
248    #[must_use]
249    pub fn get_performance_metrics(&self) -> HashMap<String, f64> {
250        let mut metrics = HashMap::new();
251        metrics.insert(
252            "kernel_executions_per_second".to_string(),
253            self.total_kernel_executions as f64 / (self.total_execution_time / 1000.0),
254        );
255        metrics.insert(
256            "memory_bandwidth_gb_s".to_string(),
257            self.gpu_memory_usage as f64 / (self.memory_transfer_time / 1000.0) / 1e9,
258        );
259        metrics.insert("gpu_efficiency".to_string(), self.gpu_utilization / 100.0);
260        metrics
261    }
262}
263
264impl AMDOpenCLSimulator {
265    /// Create new AMD `OpenCL` simulator
266    pub fn new(config: OpenCLConfig) -> Result<Self> {
267        let mut simulator = Self {
268            config,
269            platform: None,
270            device: None,
271            context: None,
272            command_queue: None,
273            kernels: HashMap::new(),
274            buffers: HashMap::new(),
275            stats: OpenCLStats::default(),
276            cpu_fallback: None,
277        };
278
279        // Initialize OpenCL environment
280        simulator.initialize_opencl()?;
281
282        // Compile kernels
283        simulator.compile_kernels()?;
284
285        // Initialize CPU fallback if enabled
286        if simulator.config.enable_cpu_fallback {
287            simulator.cpu_fallback = Some(StateVectorSimulator::new()); // Default size
288        }
289
290        Ok(simulator)
291    }
292
293    /// Initialize `OpenCL` platform and device
294    fn initialize_opencl(&mut self) -> Result<()> {
295        // Simulate platform discovery
296        let platforms = self.discover_platforms()?;
297
298        // Select preferred platform
299        let selected_platform = self.select_platform(&platforms)?;
300        self.platform = Some(selected_platform);
301
302        // Discover devices
303        let devices = self.discover_devices()?;
304
305        // Select preferred device
306        let selected_device = self.select_device(&devices)?;
307        self.device = Some(selected_device);
308
309        // Create context and command queue using the selected device
310        let device_id = self
311            .device
312            .as_ref()
313            .ok_or_else(|| {
314                SimulatorError::InitializationError("Device not initialized".to_string())
315            })?
316            .device_id;
317
318        self.context = Some(OpenCLContext {
319            context_id: 1,
320            devices: vec![device_id],
321        });
322
323        // Create command queue
324        self.command_queue = Some(OpenCLCommandQueue {
325            queue_id: 1,
326            context_id: 1,
327            device_id,
328            profiling_enabled: self.config.enable_profiling,
329        });
330
331        Ok(())
332    }
333
334    /// Discover available `OpenCL` platforms
335    fn discover_platforms(&self) -> Result<Vec<OpenCLPlatform>> {
336        // Simulate AMD platform discovery
337        let platforms = vec![
338            OpenCLPlatform {
339                platform_id: 0,
340                name: "AMD Accelerated Parallel Processing".to_string(),
341                vendor: "Advanced Micro Devices, Inc.".to_string(),
342                version: "OpenCL 2.1 AMD-APP (3444.0)".to_string(),
343                extensions: vec![
344                    "cl_khr_icd".to_string(),
345                    "cl_khr_d3d10_sharing".to_string(),
346                    "cl_khr_d3d11_sharing".to_string(),
347                    "cl_khr_dx9_media_sharing".to_string(),
348                    "cl_amd_event_callback".to_string(),
349                    "cl_amd_offline_devices".to_string(),
350                ],
351            },
352            OpenCLPlatform {
353                platform_id: 1,
354                name: "Intel(R) OpenCL".to_string(),
355                vendor: "Intel(R) Corporation".to_string(),
356                version: "OpenCL 2.1".to_string(),
357                extensions: vec!["cl_khr_icd".to_string()],
358            },
359        ];
360
361        Ok(platforms)
362    }
363
364    /// Select optimal platform
365    fn select_platform(&self, platforms: &[OpenCLPlatform]) -> Result<OpenCLPlatform> {
366        // Prefer AMD platform if available
367        if let Some(preferred_vendor) = &self.config.preferred_vendor {
368            for platform in platforms {
369                if platform.vendor.contains(preferred_vendor) {
370                    return Ok(platform.clone());
371                }
372            }
373        }
374
375        // Fallback to first available platform
376        platforms.first().cloned().ok_or_else(|| {
377            SimulatorError::InitializationError("No OpenCL platforms found".to_string())
378        })
379    }
380
381    /// Discover devices for selected platform
382    fn discover_devices(&self) -> Result<Vec<OpenCLDevice>> {
383        // Simulate AMD GPU device discovery
384        let devices = vec![
385            OpenCLDevice {
386                device_id: 0,
387                name: "Radeon RX 7900 XTX".to_string(),
388                vendor: "Advanced Micro Devices, Inc.".to_string(),
389                device_type: OpenCLDeviceType::GPU,
390                compute_units: 96,
391                max_work_group_size: 256,
392                max_work_item_dimensions: 3,
393                max_work_item_sizes: vec![256, 256, 256],
394                global_memory_size: 24 * (1 << 30), // 24GB
395                local_memory_size: 64 * 1024,       // 64KB
396                max_constant_buffer_size: 64 * 1024,
397                supports_double: true,
398                extensions: vec![
399                    "cl_khr_fp64".to_string(),
400                    "cl_amd_fp64".to_string(),
401                    "cl_khr_global_int32_base_atomics".to_string(),
402                ],
403            },
404            OpenCLDevice {
405                device_id: 1,
406                name: "Radeon RX 6800 XT".to_string(),
407                vendor: "Advanced Micro Devices, Inc.".to_string(),
408                device_type: OpenCLDeviceType::GPU,
409                compute_units: 72,
410                max_work_group_size: 256,
411                max_work_item_dimensions: 3,
412                max_work_item_sizes: vec![256, 256, 256],
413                global_memory_size: 16 * (1 << 30), // 16GB
414                local_memory_size: 64 * 1024,
415                max_constant_buffer_size: 64 * 1024,
416                supports_double: true,
417                extensions: vec!["cl_khr_fp64".to_string(), "cl_amd_fp64".to_string()],
418            },
419        ];
420
421        Ok(devices)
422    }
423
424    /// Select optimal device
425    fn select_device(&self, devices: &[OpenCLDevice]) -> Result<OpenCLDevice> {
426        // Filter by device type
427        let filtered_devices: Vec<&OpenCLDevice> = devices
428            .iter()
429            .filter(|device| device.device_type == self.config.preferred_device_type)
430            .collect();
431
432        if filtered_devices.is_empty() {
433            return Err(SimulatorError::InitializationError(
434                "No suitable devices found".to_string(),
435            ));
436        }
437
438        // Select device with most compute units
439        let best_device = filtered_devices
440            .iter()
441            .max_by_key(|device| device.compute_units)
442            .ok_or_else(|| {
443                SimulatorError::InitializationError("No devices available".to_string())
444            })?;
445
446        Ok((*best_device).clone())
447    }
448
449    /// Compile `OpenCL` kernels
450    fn compile_kernels(&mut self) -> Result<()> {
451        let start_time = std::time::Instant::now();
452
453        // Single qubit gate kernel
454        let single_qubit_kernel = self.create_single_qubit_kernel();
455        self.kernels
456            .insert("single_qubit_gate".to_string(), single_qubit_kernel);
457
458        // Two qubit gate kernel
459        let two_qubit_kernel = self.create_two_qubit_kernel();
460        self.kernels
461            .insert("two_qubit_gate".to_string(), two_qubit_kernel);
462
463        // State vector operations kernel
464        let state_vector_kernel = self.create_state_vector_kernel();
465        self.kernels
466            .insert("state_vector_ops".to_string(), state_vector_kernel);
467
468        // Measurement kernel
469        let measurement_kernel = self.create_measurement_kernel();
470        self.kernels
471            .insert("measurement".to_string(), measurement_kernel);
472
473        // Expectation value kernel
474        let expectation_kernel = self.create_expectation_kernel();
475        self.kernels
476            .insert("expectation_value".to_string(), expectation_kernel);
477
478        self.stats.compilation_time = start_time.elapsed().as_secs_f64() * 1000.0;
479
480        Ok(())
481    }
482
483    /// Create single qubit gate kernel
484    fn create_single_qubit_kernel(&self) -> OpenCLKernel {
485        let source = r"
486            #pragma OPENCL EXTENSION cl_khr_fp64 : enable
487
488            typedef double2 complex_t;
489
490            complex_t complex_mul(complex_t a, complex_t b) {
491                return (complex_t)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
492            }
493
494            complex_t complex_add(complex_t a, complex_t b) {
495                return (complex_t)(a.x + b.x, a.y + b.y);
496            }
497
498            __kernel void single_qubit_gate(
499                __global complex_t* state,
500                __global const double* gate_matrix,
501                const int target_qubit,
502                const int num_qubits
503            ) {
504                const int global_id = get_global_id(0);
505                const int total_states = 1 << num_qubits;
506
507                if (global_id >= total_states / 2) return;
508
509                const int target_mask = 1 << target_qubit;
510                const int i = global_id;
511                const int j = i | target_mask;
512
513                if ((i & target_mask) == 0) {
514                    // Extract gate matrix elements
515                    complex_t gate_00 = (complex_t)(gate_matrix[0], gate_matrix[1]);
516                    complex_t gate_01 = (complex_t)(gate_matrix[2], gate_matrix[3]);
517                    complex_t gate_10 = (complex_t)(gate_matrix[4], gate_matrix[5]);
518                    complex_t gate_11 = (complex_t)(gate_matrix[6], gate_matrix[7]);
519
520                    complex_t state_i = state[i];
521                    complex_t state_j = state[j];
522
523                    state[i] = complex_add(complex_mul(gate_00, state_i), complex_mul(gate_01, state_j));
524                    state[j] = complex_add(complex_mul(gate_10, state_i), complex_mul(gate_11, state_j));
525                }
526            }
527        ";
528
529        OpenCLKernel {
530            name: "single_qubit_gate".to_string(),
531            source: source.to_string(),
532            build_options: self.get_build_options(),
533            local_memory_usage: 0,
534            work_group_size: self.config.work_group_size,
535        }
536    }
537
538    /// Create two qubit gate kernel
539    fn create_two_qubit_kernel(&self) -> OpenCLKernel {
540        let source = r"
541            #pragma OPENCL EXTENSION cl_khr_fp64 : enable
542
543            typedef double2 complex_t;
544
545            complex_t complex_mul(complex_t a, complex_t b) {
546                return (complex_t)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
547            }
548
549            complex_t complex_add(complex_t a, complex_t b) {
550                return (complex_t)(a.x + b.x, a.y + b.y);
551            }
552
553            __kernel void two_qubit_gate(
554                __global complex_t* state,
555                __global const double* gate_matrix,
556                const int control_qubit,
557                const int target_qubit,
558                const int num_qubits
559            ) {
560                const int global_id = get_global_id(0);
561                const int total_states = 1 << num_qubits;
562
563                if (global_id >= total_states / 4) return;
564
565                const int control_mask = 1 << control_qubit;
566                const int target_mask = 1 << target_qubit;
567                const int both_mask = control_mask | target_mask;
568
569                int base = global_id;
570                // Remove bits at control and target positions
571                if (global_id & (target_mask - 1)) base = (base & ~(target_mask - 1)) << 1 | (base & (target_mask - 1));
572                if (base & (control_mask - 1)) base = (base & ~(control_mask - 1)) << 1 | (base & (control_mask - 1));
573
574                int state_00 = base;
575                int state_01 = base | target_mask;
576                int state_10 = base | control_mask;
577                int state_11 = base | both_mask;
578
579                // Load gate matrix (16 elements for 4x4 matrix)
580                complex_t gate[4][4];
581                for (int i = 0; i < 4; i++) {
582                    for (int j = 0; j < 4; j++) {
583                        gate[i][j] = (complex_t)(gate_matrix[(i*4+j)*2], gate_matrix[(i*4+j)*2+1]);
584                    }
585                }
586
587                complex_t old_states[4];
588                old_states[0] = state[state_00];
589                old_states[1] = state[state_01];
590                old_states[2] = state[state_10];
591                old_states[3] = state[state_11];
592
593                // Apply gate matrix
594                complex_t new_states[4] = {0};
595                for (int i = 0; i < 4; i++) {
596                    for (int j = 0; j < 4; j++) {
597                        new_states[i] = complex_add(new_states[i], complex_mul(gate[i][j], old_states[j]));
598                    }
599                }
600
601                state[state_00] = new_states[0];
602                state[state_01] = new_states[1];
603                state[state_10] = new_states[2];
604                state[state_11] = new_states[3];
605            }
606        ";
607
608        OpenCLKernel {
609            name: "two_qubit_gate".to_string(),
610            source: source.to_string(),
611            build_options: self.get_build_options(),
612            local_memory_usage: 128, // Local memory for gate matrix
613            work_group_size: self.config.work_group_size,
614        }
615    }
616
617    /// Create state vector operations kernel
618    fn create_state_vector_kernel(&self) -> OpenCLKernel {
619        let source = r"
620            #pragma OPENCL EXTENSION cl_khr_fp64 : enable
621
622            typedef double2 complex_t;
623
624            __kernel void normalize_state(
625                __global complex_t* state,
626                const int num_states,
627                const double norm_factor
628            ) {
629                const int global_id = get_global_id(0);
630
631                if (global_id >= num_states) return;
632
633                state[global_id].x *= norm_factor;
634                state[global_id].y *= norm_factor;
635            }
636
637            __kernel void compute_probabilities(
638                __global const complex_t* state,
639                __global double* probabilities,
640                const int num_states
641            ) {
642                const int global_id = get_global_id(0);
643
644                if (global_id >= num_states) return;
645
646                complex_t amplitude = state[global_id];
647                probabilities[global_id] = amplitude.x * amplitude.x + amplitude.y * amplitude.y;
648            }
649
650            __kernel void inner_product(
651                __global const complex_t* state1,
652                __global const complex_t* state2,
653                __global complex_t* partial_results,
654                __local complex_t* local_data,
655                const int num_states
656            ) {
657                const int global_id = get_global_id(0);
658                const int local_id = get_local_id(0);
659                const int local_size = get_local_size(0);
660                const int group_id = get_group_id(0);
661
662                // Initialize local memory
663                if (global_id < num_states) {
664                    complex_t a = state1[global_id];
665                    complex_t b = state2[global_id];
666                    // Conjugate of a times b
667                    local_data[local_id] = (complex_t)(a.x * b.x + a.y * b.y, a.x * b.y - a.y * b.x);
668                } else {
669                    local_data[local_id] = (complex_t)(0.0, 0.0);
670                }
671
672                barrier(CLK_LOCAL_MEM_FENCE);
673
674                // Reduction
675                for (int stride = local_size / 2; stride > 0; stride /= 2) {
676                    if (local_id < stride) {
677                        local_data[local_id].x += local_data[local_id + stride].x;
678                        local_data[local_id].y += local_data[local_id + stride].y;
679                    }
680                    barrier(CLK_LOCAL_MEM_FENCE);
681                }
682
683                if (local_id == 0) {
684                    partial_results[group_id] = local_data[0];
685                }
686            }
687        ";
688
689        OpenCLKernel {
690            name: "state_vector_ops".to_string(),
691            source: source.to_string(),
692            build_options: self.get_build_options(),
693            local_memory_usage: self.config.work_group_size * 16, // Complex doubles
694            work_group_size: self.config.work_group_size,
695        }
696    }
697
698    /// Create measurement kernel
699    fn create_measurement_kernel(&self) -> OpenCLKernel {
700        let source = r"
701            #pragma OPENCL EXTENSION cl_khr_fp64 : enable
702
703            typedef double2 complex_t;
704
705            __kernel void measure_qubit(
706                __global complex_t* state,
707                __global double* probabilities,
708                const int target_qubit,
709                const int num_qubits,
710                const int measurement_result
711            ) {
712                const int global_id = get_global_id(0);
713                const int total_states = 1 << num_qubits;
714
715                if (global_id >= total_states) return;
716
717                const int target_mask = 1 << target_qubit;
718                const int qubit_value = (global_id & target_mask) ? 1 : 0;
719
720                if (qubit_value != measurement_result) {
721                    // Set amplitude to zero for inconsistent measurement
722                    state[global_id] = (complex_t)(0.0, 0.0);
723                }
724            }
725
726            __kernel void compute_measurement_probabilities(
727                __global const complex_t* state,
728                __global double* prob_0,
729                __global double* prob_1,
730                __local double* local_data,
731                const int target_qubit,
732                const int num_qubits
733            ) {
734                const int global_id = get_global_id(0);
735                const int local_id = get_local_id(0);
736                const int local_size = get_local_size(0);
737                const int group_id = get_group_id(0);
738                const int total_states = 1 << num_qubits;
739
740                double local_prob_0 = 0.0;
741                double local_prob_1 = 0.0;
742
743                if (global_id < total_states) {
744                    const int target_mask = 1 << target_qubit;
745                    complex_t amplitude = state[global_id];
746                    double prob = amplitude.x * amplitude.x + amplitude.y * amplitude.y;
747
748                    if (global_id & target_mask) {
749                        local_prob_1 = prob;
750                    } else {
751                        local_prob_0 = prob;
752                    }
753                }
754
755                local_data[local_id * 2] = local_prob_0;
756                local_data[local_id * 2 + 1] = local_prob_1;
757
758                barrier(CLK_LOCAL_MEM_FENCE);
759
760                // Reduction
761                for (int stride = local_size / 2; stride > 0; stride /= 2) {
762                    if (local_id < stride) {
763                        local_data[local_id * 2] += local_data[(local_id + stride) * 2];
764                        local_data[local_id * 2 + 1] += local_data[(local_id + stride) * 2 + 1];
765                    }
766                    barrier(CLK_LOCAL_MEM_FENCE);
767                }
768
769                if (local_id == 0) {
770                    prob_0[group_id] = local_data[0];
771                    prob_1[group_id] = local_data[1];
772                }
773            }
774        ";
775
776        OpenCLKernel {
777            name: "measurement".to_string(),
778            source: source.to_string(),
779            build_options: self.get_build_options(),
780            local_memory_usage: self.config.work_group_size * 16, // 2 doubles per work item
781            work_group_size: self.config.work_group_size,
782        }
783    }
784
785    /// Create expectation value kernel
786    fn create_expectation_kernel(&self) -> OpenCLKernel {
787        let source = r"
788            #pragma OPENCL EXTENSION cl_khr_fp64 : enable
789
790            typedef double2 complex_t;
791
792            complex_t complex_mul(complex_t a, complex_t b) {
793                return (complex_t)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
794            }
795
796            __kernel void expectation_value_pauli(
797                __global const complex_t* state,
798                __global double* partial_results,
799                __local double* local_data,
800                const int pauli_string,
801                const int num_qubits
802            ) {
803                const int global_id = get_global_id(0);
804                const int local_id = get_local_id(0);
805                const int local_size = get_local_size(0);
806                const int group_id = get_group_id(0);
807                const int total_states = 1 << num_qubits;
808
809                double local_expectation = 0.0;
810
811                if (global_id < total_states) {
812                    complex_t amplitude = state[global_id];
813
814                    // Apply Pauli operators
815                    int target_state = global_id;
816                    complex_t result_amplitude = amplitude;
817                    double sign = 1.0;
818
819                    // Process each Pauli operator in the string
820                    for (int qubit = 0; qubit < num_qubits; qubit++) {
821                        int pauli_op = (pauli_string >> (2 * qubit)) & 3;
822                        int qubit_mask = 1 << qubit;
823
824                        switch (pauli_op) {
825                            case 0: // I (identity)
826                                break;
827                            case 1: // X (bit flip)
828                                target_state ^= qubit_mask;
829                                break;
830                            case 2: // Y (bit and phase flip)
831                                target_state ^= qubit_mask;
832                                if (global_id & qubit_mask) sign *= -1.0;
833                                else result_amplitude = (complex_t)(-result_amplitude.y, result_amplitude.x);
834                                break;
835                            case 3: // Z (phase flip)
836                                if (global_id & qubit_mask) sign *= -1.0;
837                                break;
838                        }
839                    }
840
841                    if (target_state == global_id) {
842                        // Diagonal element
843                        local_expectation = sign * (amplitude.x * amplitude.x + amplitude.y * amplitude.y);
844                    }
845                }
846
847                local_data[local_id] = local_expectation;
848                barrier(CLK_LOCAL_MEM_FENCE);
849
850                // Reduction
851                for (int stride = local_size / 2; stride > 0; stride /= 2) {
852                    if (local_id < stride) {
853                        local_data[local_id] += local_data[local_id + stride];
854                    }
855                    barrier(CLK_LOCAL_MEM_FENCE);
856                }
857
858                if (local_id == 0) {
859                    partial_results[group_id] = local_data[0];
860                }
861            }
862        ";
863
864        OpenCLKernel {
865            name: "expectation_value".to_string(),
866            source: source.to_string(),
867            build_options: self.get_build_options(),
868            local_memory_usage: self.config.work_group_size * 8, // Double per work item
869            work_group_size: self.config.work_group_size,
870        }
871    }
872
873    /// Get build options for kernel compilation
874    fn get_build_options(&self) -> String {
875        let mut options = Vec::new();
876
877        match self.config.optimization_level {
878            OptimizationLevel::None => options.push("-O0"),
879            OptimizationLevel::Basic => options.push("-O1"),
880            OptimizationLevel::Standard => options.push("-O2"),
881            OptimizationLevel::Aggressive => options.push("-O3"),
882        }
883
884        // Add AMD-specific optimizations
885        options.push("-cl-mad-enable");
886        options.push("-cl-fast-relaxed-math");
887
888        // Double precision support
889        if let Some(device) = &self.device {
890            if device.supports_double {
891                options.push("-cl-fp64");
892            }
893        }
894
895        options.join(" ")
896    }
897
898    /// Create memory buffer
899    pub fn create_buffer(&mut self, name: &str, size: usize, flags: MemoryFlags) -> Result<()> {
900        if size > self.config.max_buffer_size {
901            return Err(SimulatorError::MemoryError(format!(
902                "Buffer size {} exceeds maximum {}",
903                size, self.config.max_buffer_size
904            )));
905        }
906
907        let buffer = OpenCLBuffer {
908            buffer_id: self.buffers.len(),
909            size,
910            flags,
911            host_data: Some(vec![0u8; size]),
912        };
913
914        self.buffers.insert(name.to_string(), buffer);
915        self.stats.gpu_memory_usage += size as u64;
916
917        Ok(())
918    }
919
920    /// Execute kernel
921    pub fn execute_kernel(
922        &mut self,
923        kernel_name: &str,
924        global_work_size: &[usize],
925        local_work_size: Option<&[usize]>,
926        args: &[KernelArg],
927    ) -> Result<f64> {
928        let start_time = std::time::Instant::now();
929
930        if !self.kernels.contains_key(kernel_name) {
931            return Err(SimulatorError::InvalidInput(format!(
932                "Kernel {kernel_name} not found"
933            )));
934        }
935
936        // Simulate kernel execution
937        let execution_time = self.simulate_kernel_execution(kernel_name, global_work_size, args)?;
938
939        let total_time = start_time.elapsed().as_secs_f64() * 1000.0;
940        self.stats.update_kernel_execution(total_time);
941
942        match kernel_name {
943            "single_qubit_gate" | "two_qubit_gate" => {
944                self.stats.gate_operations += 1;
945            }
946            "state_vector_ops" | "normalize_state" | "compute_probabilities" => {
947                self.stats.state_vector_operations += 1;
948            }
949            _ => {}
950        }
951
952        Ok(execution_time)
953    }
954
955    /// Simulate kernel execution (for demonstration)
956    fn simulate_kernel_execution(
957        &self,
958        kernel_name: &str,
959        global_work_size: &[usize],
960        _args: &[KernelArg],
961    ) -> Result<f64> {
962        let total_work_items: usize = global_work_size.iter().product();
963
964        // Simulate execution time based on work items and device capabilities
965        let device = self
966            .device
967            .as_ref()
968            .ok_or_else(|| SimulatorError::InvalidState("Device not initialized".to_string()))?;
969        let work_groups = total_work_items.div_ceil(self.config.work_group_size);
970        let parallel_work_groups = device.compute_units as usize;
971
972        let execution_cycles = work_groups.div_ceil(parallel_work_groups);
973
974        // Base execution time per cycle (microseconds)
975        let base_time_per_cycle = match kernel_name {
976            "single_qubit_gate" => 1.0,
977            "two_qubit_gate" => 2.5,
978            "state_vector_ops" => 0.5,
979            "measurement" => 1.5,
980            "expectation_value" => 2.0,
981            _ => 1.0,
982        };
983
984        let execution_time = execution_cycles as f64 * base_time_per_cycle;
985
986        // Add random variation
987        let variation = fastrand::f64().mul_add(0.2, 0.9); // 90-110% of base time
988        Ok(execution_time * variation)
989    }
990
991    /// Apply single qubit gate using `OpenCL`
992    pub fn apply_single_qubit_gate_opencl(
993        &mut self,
994        gate_matrix: &[Complex64; 4],
995        target_qubit: usize,
996        num_qubits: usize,
997    ) -> Result<f64> {
998        // Convert gate matrix to real array for OpenCL
999        let mut gate_real = [0.0; 8];
1000        for (i, &complex_val) in gate_matrix.iter().enumerate() {
1001            gate_real[i * 2] = complex_val.re;
1002            gate_real[i * 2 + 1] = complex_val.im;
1003        }
1004
1005        let total_states = 1 << num_qubits;
1006        let global_work_size = vec![total_states / 2];
1007
1008        let args = vec![
1009            KernelArg::Buffer("state".to_string()),
1010            KernelArg::ConstantBuffer("gate_matrix".to_string()),
1011            KernelArg::Int(target_qubit as i32),
1012            KernelArg::Int(num_qubits as i32),
1013        ];
1014
1015        self.execute_kernel("single_qubit_gate", &global_work_size, None, &args)
1016    }
1017
1018    /// Apply two qubit gate using `OpenCL`
1019    pub fn apply_two_qubit_gate_opencl(
1020        &mut self,
1021        gate_matrix: &[Complex64; 16],
1022        control_qubit: usize,
1023        target_qubit: usize,
1024        num_qubits: usize,
1025    ) -> Result<f64> {
1026        // Convert gate matrix to real array for OpenCL
1027        let mut gate_real = [0.0; 32];
1028        for (i, &complex_val) in gate_matrix.iter().enumerate() {
1029            gate_real[i * 2] = complex_val.re;
1030            gate_real[i * 2 + 1] = complex_val.im;
1031        }
1032
1033        let total_states = 1 << num_qubits;
1034        let global_work_size = vec![total_states / 4];
1035
1036        let args = vec![
1037            KernelArg::Buffer("state".to_string()),
1038            KernelArg::ConstantBuffer("gate_matrix".to_string()),
1039            KernelArg::Int(control_qubit as i32),
1040            KernelArg::Int(target_qubit as i32),
1041            KernelArg::Int(num_qubits as i32),
1042        ];
1043
1044        self.execute_kernel("two_qubit_gate", &global_work_size, None, &args)
1045    }
1046
1047    /// Compute expectation value using `OpenCL`
1048    pub fn compute_expectation_value_opencl(
1049        &mut self,
1050        pauli_string: u32,
1051        num_qubits: usize,
1052    ) -> Result<(f64, f64)> {
1053        let total_states = 1 << num_qubits;
1054        let global_work_size = vec![total_states];
1055
1056        let args = vec![
1057            KernelArg::Buffer("state".to_string()),
1058            KernelArg::Buffer("partial_results".to_string()),
1059            KernelArg::LocalMemory(self.config.work_group_size * 8),
1060            KernelArg::Int(pauli_string as i32),
1061            KernelArg::Int(num_qubits as i32),
1062        ];
1063
1064        let execution_time = self.execute_kernel(
1065            "expectation_value",
1066            &global_work_size,
1067            Some(&[self.config.work_group_size]),
1068            &args,
1069        )?;
1070
1071        // Simulate expectation value result
1072        let expectation_value = fastrand::f64().mul_add(2.0, -1.0); // Random value between -1 and 1
1073
1074        Ok((expectation_value, execution_time))
1075    }
1076
1077    /// Get device information
1078    pub const fn get_device_info(&self) -> Option<&OpenCLDevice> {
1079        self.device.as_ref()
1080    }
1081
1082    /// Get performance statistics
1083    pub const fn get_stats(&self) -> &OpenCLStats {
1084        &self.stats
1085    }
1086
1087    /// Reset performance statistics
1088    pub fn reset_stats(&mut self) {
1089        self.stats = OpenCLStats::default();
1090    }
1091
1092    /// Check if `OpenCL` is available
1093    pub const fn is_opencl_available(&self) -> bool {
1094        self.context.is_some() && self.device.is_some()
1095    }
1096
1097    /// Fallback to CPU simulation
1098    pub fn fallback_to_cpu(&mut self, num_qubits: usize) -> Result<()> {
1099        if self.config.enable_cpu_fallback {
1100            self.cpu_fallback = Some(StateVectorSimulator::new());
1101            self.stats.cpu_fallback_count += 1;
1102            Ok(())
1103        } else {
1104            Err(SimulatorError::OperationNotSupported(
1105                "CPU fallback disabled".to_string(),
1106            ))
1107        }
1108    }
1109}
1110
1111/// Kernel argument types
1112#[derive(Debug, Clone)]
1113pub enum KernelArg {
1114    Buffer(String),
1115    ConstantBuffer(String),
1116    Int(i32),
1117    Float(f32),
1118    Double(f64),
1119    LocalMemory(usize),
1120}
1121
1122/// Benchmark AMD `OpenCL` backend performance
1123pub fn benchmark_amd_opencl_backend() -> Result<HashMap<String, f64>> {
1124    let mut results = HashMap::new();
1125
1126    // Test different configurations
1127    let configs = vec![
1128        OpenCLConfig {
1129            work_group_size: 64,
1130            optimization_level: OptimizationLevel::Standard,
1131            ..Default::default()
1132        },
1133        OpenCLConfig {
1134            work_group_size: 128,
1135            optimization_level: OptimizationLevel::Aggressive,
1136            ..Default::default()
1137        },
1138        OpenCLConfig {
1139            work_group_size: 256,
1140            optimization_level: OptimizationLevel::Aggressive,
1141            ..Default::default()
1142        },
1143    ];
1144
1145    for (i, config) in configs.into_iter().enumerate() {
1146        let start = std::time::Instant::now();
1147
1148        let mut simulator = AMDOpenCLSimulator::new(config)?;
1149
1150        // Benchmark single qubit gates
1151        let single_qubit_matrix = [
1152            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
1153            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
1154            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
1155            Complex64::new(-1.0 / 2.0_f64.sqrt(), 0.0),
1156        ];
1157
1158        for num_qubits in [10, 15, 20] {
1159            simulator.create_buffer("state", (1 << num_qubits) * 16, MemoryFlags::ReadWrite)?;
1160
1161            for qubit in 0..num_qubits.min(5) {
1162                let _time = simulator.apply_single_qubit_gate_opencl(
1163                    &single_qubit_matrix,
1164                    qubit,
1165                    num_qubits,
1166                )?;
1167            }
1168        }
1169
1170        // Benchmark two qubit gates
1171        let cnot_matrix = [
1172            Complex64::new(1.0, 0.0),
1173            Complex64::new(0.0, 0.0),
1174            Complex64::new(0.0, 0.0),
1175            Complex64::new(0.0, 0.0),
1176            Complex64::new(0.0, 0.0),
1177            Complex64::new(1.0, 0.0),
1178            Complex64::new(0.0, 0.0),
1179            Complex64::new(0.0, 0.0),
1180            Complex64::new(0.0, 0.0),
1181            Complex64::new(0.0, 0.0),
1182            Complex64::new(0.0, 0.0),
1183            Complex64::new(1.0, 0.0),
1184            Complex64::new(0.0, 0.0),
1185            Complex64::new(0.0, 0.0),
1186            Complex64::new(1.0, 0.0),
1187            Complex64::new(0.0, 0.0),
1188        ];
1189
1190        for num_qubits in [10usize, 15, 20] {
1191            for pair in 0..num_qubits.saturating_sub(1).min(3) {
1192                let _time = simulator.apply_two_qubit_gate_opencl(
1193                    &cnot_matrix,
1194                    pair,
1195                    pair + 1,
1196                    num_qubits,
1197                )?;
1198            }
1199        }
1200
1201        // Benchmark expectation values
1202        for num_qubits in [10, 15, 20] {
1203            let _result = simulator.compute_expectation_value_opencl(0b1010, num_qubits)?;
1204        }
1205
1206        let time = start.elapsed().as_secs_f64() * 1000.0;
1207        results.insert(format!("config_{i}"), time);
1208
1209        // Add performance metrics
1210        let stats = simulator.get_stats();
1211        results.insert(format!("config_{i}_gate_ops"), stats.gate_operations as f64);
1212        results.insert(format!("config_{i}_avg_kernel_time"), stats.avg_kernel_time);
1213        results.insert(format!("config_{i}_gpu_utilization"), stats.gpu_utilization);
1214    }
1215
1216    Ok(results)
1217}
1218
1219#[cfg(test)]
1220mod tests {
1221    use super::*;
1222    use approx::assert_abs_diff_eq;
1223
1224    #[test]
1225    fn test_opencl_simulator_creation() {
1226        let config = OpenCLConfig::default();
1227        let simulator = AMDOpenCLSimulator::new(config);
1228        assert!(simulator.is_ok());
1229    }
1230
1231    #[test]
1232    fn test_platform_discovery() {
1233        let config = OpenCLConfig::default();
1234        let simulator =
1235            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1236        let platforms = simulator
1237            .discover_platforms()
1238            .expect("Platform discovery should succeed");
1239
1240        assert!(!platforms.is_empty());
1241        assert!(platforms
1242            .iter()
1243            .any(|p| p.vendor.contains("Advanced Micro Devices")));
1244    }
1245
1246    #[test]
1247    fn test_device_discovery() {
1248        let config = OpenCLConfig::default();
1249        let simulator =
1250            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1251        let devices = simulator
1252            .discover_devices()
1253            .expect("Device discovery should succeed");
1254
1255        assert!(!devices.is_empty());
1256        assert!(devices
1257            .iter()
1258            .any(|d| d.device_type == OpenCLDeviceType::GPU));
1259    }
1260
1261    #[test]
1262    fn test_kernel_creation() {
1263        let config = OpenCLConfig::default();
1264        let simulator =
1265            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1266
1267        assert!(simulator.kernels.contains_key("single_qubit_gate"));
1268        assert!(simulator.kernels.contains_key("two_qubit_gate"));
1269        assert!(simulator.kernels.contains_key("state_vector_ops"));
1270        assert!(simulator.kernels.contains_key("measurement"));
1271        assert!(simulator.kernels.contains_key("expectation_value"));
1272    }
1273
1274    #[test]
1275    fn test_buffer_creation() {
1276        let config = OpenCLConfig::default();
1277        let mut simulator =
1278            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1279
1280        let result = simulator.create_buffer("test_buffer", 1024, MemoryFlags::ReadWrite);
1281        assert!(result.is_ok());
1282        assert!(simulator.buffers.contains_key("test_buffer"));
1283        assert_eq!(simulator.stats.gpu_memory_usage, 1024);
1284    }
1285
1286    #[test]
1287    fn test_buffer_size_limit() {
1288        let config = OpenCLConfig {
1289            max_buffer_size: 512,
1290            ..Default::default()
1291        };
1292        let mut simulator =
1293            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1294
1295        let result = simulator.create_buffer("large_buffer", 1024, MemoryFlags::ReadWrite);
1296        assert!(result.is_err());
1297    }
1298
1299    #[test]
1300    fn test_kernel_execution() {
1301        let config = OpenCLConfig::default();
1302        let mut simulator =
1303            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1304
1305        let global_work_size = vec![256];
1306        let args = vec![
1307            KernelArg::Buffer("state".to_string()),
1308            KernelArg::Int(0),
1309            KernelArg::Int(8),
1310        ];
1311
1312        let result = simulator.execute_kernel("single_qubit_gate", &global_work_size, None, &args);
1313        assert!(result.is_ok());
1314
1315        let execution_time = result.expect("Kernel execution should succeed");
1316        assert!(execution_time > 0.0);
1317    }
1318
1319    #[test]
1320    fn test_single_qubit_gate_application() {
1321        let config = OpenCLConfig::default();
1322        let mut simulator =
1323            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1324
1325        let hadamard_matrix = [
1326            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
1327            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
1328            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
1329            Complex64::new(-1.0 / 2.0_f64.sqrt(), 0.0),
1330        ];
1331
1332        simulator
1333            .create_buffer("state", 1024 * 16, MemoryFlags::ReadWrite)
1334            .expect("Buffer creation should succeed");
1335
1336        let result = simulator.apply_single_qubit_gate_opencl(&hadamard_matrix, 0, 8);
1337        assert!(result.is_ok());
1338
1339        let execution_time = result.expect("Single qubit gate application should succeed");
1340        assert!(execution_time > 0.0);
1341    }
1342
1343    #[test]
1344    fn test_two_qubit_gate_application() {
1345        let config = OpenCLConfig::default();
1346        let mut simulator =
1347            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1348
1349        let cnot_matrix = [
1350            Complex64::new(1.0, 0.0),
1351            Complex64::new(0.0, 0.0),
1352            Complex64::new(0.0, 0.0),
1353            Complex64::new(0.0, 0.0),
1354            Complex64::new(0.0, 0.0),
1355            Complex64::new(1.0, 0.0),
1356            Complex64::new(0.0, 0.0),
1357            Complex64::new(0.0, 0.0),
1358            Complex64::new(0.0, 0.0),
1359            Complex64::new(0.0, 0.0),
1360            Complex64::new(0.0, 0.0),
1361            Complex64::new(1.0, 0.0),
1362            Complex64::new(0.0, 0.0),
1363            Complex64::new(0.0, 0.0),
1364            Complex64::new(1.0, 0.0),
1365            Complex64::new(0.0, 0.0),
1366        ];
1367
1368        simulator
1369            .create_buffer("state", 1024 * 16, MemoryFlags::ReadWrite)
1370            .expect("Buffer creation should succeed");
1371
1372        let result = simulator.apply_two_qubit_gate_opencl(&cnot_matrix, 0, 1, 8);
1373        assert!(result.is_ok());
1374
1375        let execution_time = result.expect("Two qubit gate application should succeed");
1376        assert!(execution_time > 0.0);
1377    }
1378
1379    #[test]
1380    fn test_expectation_value_computation() {
1381        let config = OpenCLConfig::default();
1382        let mut simulator =
1383            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1384
1385        simulator
1386            .create_buffer("state", 1024 * 16, MemoryFlags::ReadWrite)
1387            .expect("State buffer creation should succeed");
1388        simulator
1389            .create_buffer("partial_results", 64 * 8, MemoryFlags::ReadWrite)
1390            .expect("Partial results buffer creation should succeed");
1391
1392        let result = simulator.compute_expectation_value_opencl(0b1010, 8);
1393        assert!(result.is_ok());
1394
1395        let (expectation, execution_time) =
1396            result.expect("Expectation value computation should succeed");
1397        assert!((-1.0..=1.0).contains(&expectation));
1398        assert!(execution_time > 0.0);
1399    }
1400
1401    #[test]
1402    fn test_build_options() {
1403        let config = OpenCLConfig {
1404            optimization_level: OptimizationLevel::Aggressive,
1405            ..Default::default()
1406        };
1407        let simulator =
1408            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1409
1410        let build_options = simulator.get_build_options();
1411        assert!(build_options.contains("-O3"));
1412        assert!(build_options.contains("-cl-mad-enable"));
1413        assert!(build_options.contains("-cl-fast-relaxed-math"));
1414    }
1415
1416    #[test]
1417    fn test_stats_update() {
1418        let config = OpenCLConfig::default();
1419        let mut simulator =
1420            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1421
1422        simulator.stats.update_kernel_execution(10.0);
1423        simulator.stats.update_kernel_execution(20.0);
1424
1425        assert_eq!(simulator.stats.total_kernel_executions, 2);
1426        assert_abs_diff_eq!(simulator.stats.total_execution_time, 30.0, epsilon = 1e-10);
1427        assert_abs_diff_eq!(simulator.stats.avg_kernel_time, 15.0, epsilon = 1e-10);
1428    }
1429
1430    #[test]
1431    fn test_performance_metrics() {
1432        let config = OpenCLConfig::default();
1433        let mut simulator =
1434            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1435
1436        simulator.stats.total_kernel_executions = 100;
1437        simulator.stats.total_execution_time = 1000.0; // 1 second
1438        simulator.stats.gpu_memory_usage = 1_000_000_000; // 1GB
1439        simulator.stats.memory_transfer_time = 100.0; // 0.1 second
1440        simulator.stats.gpu_utilization = 85.0;
1441
1442        let metrics = simulator.stats.get_performance_metrics();
1443
1444        assert!(metrics.contains_key("kernel_executions_per_second"));
1445        assert!(metrics.contains_key("memory_bandwidth_gb_s"));
1446        assert!(metrics.contains_key("gpu_efficiency"));
1447
1448        assert_abs_diff_eq!(
1449            metrics["kernel_executions_per_second"],
1450            100.0,
1451            epsilon = 1e-10
1452        );
1453        assert_abs_diff_eq!(metrics["gpu_efficiency"], 0.85, epsilon = 1e-10);
1454    }
1455
1456    #[test]
1457    fn test_cpu_fallback() {
1458        let config = OpenCLConfig {
1459            enable_cpu_fallback: true,
1460            ..Default::default()
1461        };
1462        let mut simulator =
1463            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1464
1465        let result = simulator.fallback_to_cpu(10);
1466        assert!(result.is_ok());
1467        assert_eq!(simulator.stats.cpu_fallback_count, 1);
1468        assert!(simulator.cpu_fallback.is_some());
1469    }
1470
1471    #[test]
1472    fn test_device_selection() {
1473        let config = OpenCLConfig {
1474            preferred_device_type: OpenCLDeviceType::GPU,
1475            ..Default::default()
1476        };
1477        let simulator =
1478            AMDOpenCLSimulator::new(config).expect("OpenCL simulator should be created");
1479
1480        let device_info = simulator
1481            .get_device_info()
1482            .expect("Device info should be available");
1483        assert_eq!(device_info.device_type, OpenCLDeviceType::GPU);
1484        assert!(device_info.name.contains("Radeon"));
1485        assert_eq!(device_info.vendor, "Advanced Micro Devices, Inc.");
1486    }
1487}