quantrs2_core/gpu/specialized_kernels.rs

//! Enhanced GPU kernel optimization for specialized quantum gates
//!
//! This module provides high-performance GPU kernels optimized for specialized quantum gates,
//! including holonomic gates, post-quantum cryptography gates, and quantum ML gates.
//! It leverages tensor cores, optimized memory access patterns, and gate fusion for maximum performance.
//!
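//! # Example
//!
//! A minimal usage sketch; the `quantrs2_core::gpu::specialized_kernels` import
//! path is an assumption about how this module is exported, and the GPU backends
//! below are placeholders, so this only illustrates the intended API:
//!
//! ```no_run
//! use quantrs2_core::gpu::specialized_kernels::{OptimizationConfig, SpecializedGpuKernels};
//!
//! let kernels = SpecializedGpuKernels::new(OptimizationConfig::default())
//!     .expect("kernel manager construction");
//! let report = kernels.get_performance_report();
//! println!("cache hit rate: {:.2}", report.cache_hit_rate);
//! ```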
use crate::{error::QuantRS2Result, gate::GateOp, qubit::QubitId};
use num_complex::Complex64;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

/// Enhanced GPU kernel manager for specialized gates
pub struct SpecializedGpuKernels {
    /// CUDA context for kernel execution
    cuda_context: Option<CudaSpecializedContext>,
    /// WebGPU context for cross-platform support
    webgpu_context: Option<WebGpuSpecializedContext>,
    /// Kernel cache for compiled kernels
    kernel_cache: Arc<Mutex<KernelCache>>,
    /// Performance statistics
    performance_stats: Arc<Mutex<PerformanceStats>>,
    /// Optimization configuration
    config: OptimizationConfig,
}

/// CUDA context specialized for quantum gates
pub struct CudaSpecializedContext {
    /// Device compute capability
    #[allow(dead_code)]
    compute_capability: (i32, i32),
    /// Tensor core availability
    has_tensor_cores: bool,
    /// Maximum shared memory per block
    #[allow(dead_code)]
    max_shared_memory: usize,
    /// Warp size
    #[allow(dead_code)]
    warp_size: usize,
    /// Compiled kernels
    kernels: HashMap<String, CompiledKernel>,
}

/// WebGPU context for cross-platform support
pub struct WebGpuSpecializedContext {
    /// Device limits
    #[allow(dead_code)]
    device_limits: WebGpuLimits,
    /// Compiled shaders
    #[allow(dead_code)]
    shaders: HashMap<String, CompiledShader>,
    /// Buffer pools for efficient memory management
    #[allow(dead_code)]
    buffer_pools: HashMap<String, BufferPool>,
}

/// Kernel cache for compiled GPU kernels
pub struct KernelCache {
    /// Cached CUDA kernels
    #[allow(dead_code)]
    cuda_kernels: HashMap<String, CachedCudaKernel>,
    /// Cached WebGPU shaders
    #[allow(dead_code)]
    webgpu_shaders: HashMap<String, CachedWebGpuShader>,
    /// Cache hit statistics
    cache_stats: CacheStatistics,
}

/// Performance statistics for optimization analysis
pub struct PerformanceStats {
    /// Kernel execution times
    kernel_times: HashMap<String, Vec<f64>>,
    /// Memory bandwidth utilization
    memory_bandwidth: HashMap<String, f64>,
    /// Tensor core utilization
    tensor_core_utilization: f64,
    /// Cache hit rates
    #[allow(dead_code)]
    cache_hit_rates: HashMap<String, f64>,
}

/// GPU optimization configuration
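///
/// # Example
///
/// A sketch of overriding individual defaults (the import path is an
/// assumption about how this module is exported):
///
/// ```no_run
/// # use quantrs2_core::gpu::specialized_kernels::OptimizationConfig;
/// let config = OptimizationConfig {
///     use_mixed_precision: false, // stay in full f64 precision
///     ..OptimizationConfig::default()
/// };
/// # let _ = config;
/// ```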
#[derive(Debug, Clone)]
pub struct OptimizationConfig {
    /// Enable tensor core optimization
    pub use_tensor_cores: bool,
    /// Enable memory access optimization
    pub optimize_memory_access: bool,
    /// Enable gate fusion
    pub enable_gate_fusion: bool,
    /// Maximum fusion chain length
    pub max_fusion_length: usize,
    /// Memory coalescing threshold
    pub coalescing_threshold: usize,
    /// Use mixed precision
    pub use_mixed_precision: bool,
}

impl Default for OptimizationConfig {
    fn default() -> Self {
        Self {
            use_tensor_cores: true,
            optimize_memory_access: true,
            enable_gate_fusion: true,
            max_fusion_length: 8,
            coalescing_threshold: 32,
            use_mixed_precision: true,
        }
    }
}

impl SpecializedGpuKernels {
    /// Create a new specialized GPU kernel manager
    pub fn new(config: OptimizationConfig) -> QuantRS2Result<Self> {
        let cuda_context = Self::initialize_cuda_context(&config)?;
        let webgpu_context = Self::initialize_webgpu_context(&config)?;

        Ok(Self {
            cuda_context,
            webgpu_context,
            kernel_cache: Arc::new(Mutex::new(KernelCache::new())),
            performance_stats: Arc::new(Mutex::new(PerformanceStats::new())),
            config,
        })
    }

    /// Initialize CUDA context with specialized kernel compilation
    fn initialize_cuda_context(
        config: &OptimizationConfig,
    ) -> QuantRS2Result<Option<CudaSpecializedContext>> {
        // Check if CUDA is available
        if !Self::is_cuda_available() {
            return Ok(None);
        }

        let compute_capability = Self::get_compute_capability()?;
        let has_tensor_cores = compute_capability.0 >= 7; // Volta and later
        let device_props = Self::get_device_properties()?;

        let mut kernels = HashMap::new();

        // Compile specialized gate kernels
        kernels.insert(
            "holonomic_gate".to_string(),
            Self::compile_holonomic_kernel(config)?,
        );
        kernels.insert(
            "post_quantum_hash".to_string(),
            Self::compile_post_quantum_kernel(config)?,
        );
        kernels.insert(
            "quantum_ml_attention".to_string(),
            Self::compile_qml_attention_kernel(config)?,
        );
        kernels.insert(
            "fused_rotation_sequence".to_string(),
            Self::compile_fused_rotation_kernel(config)?,
        );
        kernels.insert(
            "tensor_core_matmul".to_string(),
            Self::compile_tensor_core_kernel(config)?,
        );

        Ok(Some(CudaSpecializedContext {
            compute_capability,
            has_tensor_cores,
            max_shared_memory: device_props.max_shared_memory,
            warp_size: device_props.warp_size,
            kernels,
        }))
    }

    /// Initialize WebGPU context with cross-platform shaders
    fn initialize_webgpu_context(
        config: &OptimizationConfig,
    ) -> QuantRS2Result<Option<WebGpuSpecializedContext>> {
        // Note: unlike the CUDA path, this placeholder assumes a WebGPU device
        // is always reachable and therefore always returns a context.
        let device_limits = Self::get_webgpu_limits()?;
        let mut shaders = HashMap::new();
        let mut buffer_pools = HashMap::new();

        // Compile WebGPU shaders for specialized gates
        shaders.insert(
            "holonomic_gate".to_string(),
            Self::compile_holonomic_shader(config)?,
        );
        shaders.insert(
            "post_quantum_hash".to_string(),
            Self::compile_post_quantum_shader(config)?,
        );
        shaders.insert(
            "quantum_ml_attention".to_string(),
            Self::compile_qml_attention_shader(config)?,
        );

        // Initialize buffer pools
        buffer_pools.insert("state_vectors".to_string(), BufferPool::new(1024 * 1024)); // 1 MiB initial
        buffer_pools.insert("gate_matrices".to_string(), BufferPool::new(512 * 1024)); // 512 KiB initial
        buffer_pools.insert("temporary_buffers".to_string(), BufferPool::new(256 * 1024)); // 256 KiB initial

        Ok(Some(WebGpuSpecializedContext {
            device_limits,
            shaders,
            buffer_pools,
        }))
    }

    /// Apply a holonomic gate with optimized GPU execution
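    ///
    /// # Example
    ///
    /// A minimal sketch applying a 2x2 identity holonomy to a one-qubit state
    /// (the `quantrs2_core` import paths are assumptions about how this module
    /// is exported):
    ///
    /// ```no_run
    /// use num_complex::Complex64;
    /// use quantrs2_core::gpu::specialized_kernels::{OptimizationConfig, SpecializedGpuKernels};
    /// use quantrs2_core::qubit::QubitId;
    ///
    /// let kernels = SpecializedGpuKernels::new(OptimizationConfig::default()).unwrap();
    /// let mut state = vec![Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0)];
    /// let identity = vec![
    ///     Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0),
    ///     Complex64::new(0.0, 0.0), Complex64::new(1.0, 0.0),
    /// ];
    /// kernels
    ///     .apply_holonomic_gate(&mut state, &identity, &[QubitId(0)])
    ///     .unwrap();
    /// ```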
    pub fn apply_holonomic_gate(
        &self,
        state: &mut [Complex64],
        holonomy_matrix: &[Complex64],
        target_qubits: &[QubitId],
    ) -> QuantRS2Result<()> {
        let state_size = state.len();

        // Choose optimal execution path based on size and hardware
        if state_size > 1024 && self.cuda_context.is_some() {
            self.apply_holonomic_gate_cuda(state, holonomy_matrix, target_qubits)
        } else if self.webgpu_context.is_some() {
            self.apply_holonomic_gate_webgpu(state, holonomy_matrix, target_qubits)
        } else {
            // CPU fallback with SIMD optimization
            self.apply_holonomic_gate_cpu_optimized(state, holonomy_matrix, target_qubits)
        }
    }

    /// Apply holonomic gate using CUDA with tensor core optimization
    fn apply_holonomic_gate_cuda(
        &self,
        state: &mut [Complex64],
        holonomy_matrix: &[Complex64],
        target_qubits: &[QubitId],
    ) -> QuantRS2Result<()> {
        // The caller only dispatches here when a CUDA context exists.
        let cuda_ctx = self
            .cuda_context
            .as_ref()
            .expect("CUDA context checked by caller");
        let kernel = cuda_ctx
            .kernels
            .get("holonomic_gate")
            .expect("holonomic kernel compiled at initialization");

        // Optimize block and grid dimensions
        let (block_dim, grid_dim) =
            self.calculate_optimal_dimensions(state.len(), target_qubits.len())?;

        // Use tensor cores if available and matrix size is suitable
        if cuda_ctx.has_tensor_cores && self.config.use_tensor_cores && holonomy_matrix.len() >= 256
        {
            self.launch_tensor_core_holonomic_kernel(
                kernel,
                state,
                holonomy_matrix,
                target_qubits,
                block_dim,
                grid_dim,
            )?;
        } else {
            self.launch_standard_holonomic_kernel(
                kernel,
                state,
                holonomy_matrix,
                target_qubits,
                block_dim,
                grid_dim,
            )?;
        }

        // Update performance statistics
        self.update_performance_stats("holonomic_gate_cuda", kernel.last_execution_time);

        Ok(())
    }

    /// Apply post-quantum cryptographic hash gate
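    ///
    /// # Example
    ///
    /// A sketch selecting the sponge-based compression path (the import paths
    /// are assumptions; the GPU backends here are placeholders):
    ///
    /// ```no_run
    /// # use num_complex::Complex64;
    /// # use quantrs2_core::gpu::specialized_kernels::{
    /// #     OptimizationConfig, PostQuantumCompressionType, SpecializedGpuKernels,
    /// # };
    /// let kernels = SpecializedGpuKernels::new(OptimizationConfig::default()).unwrap();
    /// let mut state = vec![Complex64::new(1.0, 0.0); 8];
    /// let hash_circuit = vec![Complex64::new(0.0, 0.0); 8];
    /// kernels
    ///     .apply_post_quantum_hash_gate(
    ///         &mut state,
    ///         &hash_circuit,
    ///         PostQuantumCompressionType::QuantumSponge { rate: 4, capacity: 4 },
    ///     )
    ///     .unwrap();
    /// ```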
    pub fn apply_post_quantum_hash_gate(
        &self,
        state: &mut [Complex64],
        hash_circuit: &[Complex64],
        compression_type: PostQuantumCompressionType,
    ) -> QuantRS2Result<()> {
        match compression_type {
            PostQuantumCompressionType::QuantumSponge { rate, capacity } => {
                self.apply_quantum_sponge_gpu(state, hash_circuit, rate, capacity)
            }
            PostQuantumCompressionType::QuantumMerkleTree { depth, arity } => {
                self.apply_quantum_merkle_gpu(state, hash_circuit, depth, arity)
            }
            PostQuantumCompressionType::QuantumGrover { iterations } => {
                self.apply_quantum_grover_gpu(state, hash_circuit, iterations)
            }
        }
    }

    /// Apply quantum ML attention mechanism with GPU optimization
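    ///
    /// Dispatch heuristic: the CUDA path is used when the per-head dimension
    /// (`state.len() / num_heads`) is at least 64, otherwise WebGPU when
    /// available, otherwise a vectorized CPU fallback. For example, 4 heads
    /// over a 256-amplitude state give a per-head dimension of 64 and thus
    /// prefer CUDA.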
    pub fn apply_quantum_ml_attention(
        &self,
        state: &mut [Complex64],
        query_params: &[Complex64],
        key_params: &[Complex64],
        value_params: &[Complex64],
        num_heads: usize,
    ) -> QuantRS2Result<()> {
        debug_assert!(num_heads > 0, "num_heads must be nonzero");
        let attention_dim = state.len() / num_heads;

        if self.cuda_context.is_some() && attention_dim >= 64 {
            // Use CUDA for large attention computations
            self.apply_qml_attention_cuda(state, query_params, key_params, value_params, num_heads)
        } else if self.webgpu_context.is_some() {
            // Use WebGPU for medium-sized computations
            self.apply_qml_attention_webgpu(
                state,
                query_params,
                key_params,
                value_params,
                num_heads,
            )
        } else {
            // CPU fallback with vectorization
            self.apply_qml_attention_cpu_vectorized(
                state,
                query_params,
                key_params,
                value_params,
                num_heads,
            )
        }
    }

    /// Apply fused gate sequences for optimal performance
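    ///
    /// Gates are first grouped into fusion chains (rotation sequences, Pauli
    /// strings, and controlled sequences); each chain is applied as a single
    /// fused operation, and unfusable gates fall back to individual
    /// application.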
    pub fn apply_fused_gate_sequence(
        &self,
        state: &mut [Complex64],
        gates: &[Box<dyn GateOp>],
    ) -> QuantRS2Result<()> {
        if !self.config.enable_gate_fusion || gates.len() < 2 {
            // Apply gates individually if fusion is disabled or there are too few gates
            for gate in gates {
                self.apply_single_gate_optimized(state, gate.as_ref())?;
            }
            return Ok(());
        }

        // Analyze gates for fusion opportunities
        let fusion_chains = self.analyze_gate_fusion_opportunities(gates)?;

        for chain in fusion_chains {
            match chain.fusion_type {
                FusionType::RotationSequence => {
                    self.apply_fused_rotation_sequence(state, &chain.gates)?;
                }
                FusionType::PauliString => {
                    self.apply_fused_pauli_string(state, &chain.gates)?;
                }
                FusionType::ControlledSequence => {
                    self.apply_fused_controlled_sequence(state, &chain.gates)?;
                }
                FusionType::None => {
                    // Apply gates individually
                    for gate in &chain.gates {
                        self.apply_single_gate_optimized(state, gate.as_ref())?;
                    }
                }
            }
        }

        Ok(())
    }

    /// Calculate optimal GPU block and grid dimensions
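    ///
    /// Worked example: a 2^20-amplitude state with one target qubit gives
    /// `work_per_thread = 2`, hence 2^19 work items, which maps to 1024
    /// threads per block and 512 blocks.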
    fn calculate_optimal_dimensions(
        &self,
        state_size: usize,
        num_target_qubits: usize,
    ) -> QuantRS2Result<(u32, u32)> {
        debug_assert!(
            self.cuda_context.is_some(),
            "dimension calculation is only used on the CUDA path"
        );

        // Calculate work per thread
        let work_per_thread = 1 << num_target_qubits; // 2^num_target_qubits
        let total_work_items = state_size / work_per_thread;

        // Optimize for memory coalescing
        let threads_per_block = if total_work_items >= 1024 {
            1024
        } else if total_work_items >= 512 {
            512
        } else if total_work_items >= 256 {
            256
        } else {
            // Smallest block size used; still a multiple of the 32-thread warp
            128
        };

        // Ceiling division so every work item is covered
        let blocks = (total_work_items + threads_per_block - 1) / threads_per_block;

        Ok((threads_per_block as u32, blocks as u32))
    }

    /// Update performance statistics
    fn update_performance_stats(&self, kernel_name: &str, execution_time: f64) {
        let mut stats = self.performance_stats.lock().unwrap();
        stats
            .kernel_times
            .entry(kernel_name.to_string())
            .or_default()
            .push(execution_time);
    }

    /// Get performance report
    pub fn get_performance_report(&self) -> PerformanceReport {
        let stats = self.performance_stats.lock().unwrap();
        let cache = self.kernel_cache.lock().unwrap();

        // Average bandwidth across recorded kernels; report 0.0 when nothing has
        // been recorded yet to avoid dividing by zero (which would yield NaN).
        let memory_bandwidth_utilization = if stats.memory_bandwidth.is_empty() {
            0.0
        } else {
            stats.memory_bandwidth.values().sum::<f64>() / stats.memory_bandwidth.len() as f64
        };

        PerformanceReport {
            average_kernel_times: stats
                .kernel_times
                .iter()
                .map(|(k, v)| (k.clone(), v.iter().sum::<f64>() / v.len() as f64))
                .collect(),
            cache_hit_rate: cache.cache_stats.overall_hit_rate(),
            tensor_core_utilization: stats.tensor_core_utilization,
            memory_bandwidth_utilization,
        }
    }

    // Placeholder implementations for specialized kernel methods

    /// Placeholder; would check actual CUDA availability.
    fn is_cuda_available() -> bool {
        false
    }

    fn get_compute_capability() -> QuantRS2Result<(i32, i32)> {
        Ok((7, 5))
    }

    fn get_device_properties() -> QuantRS2Result<DeviceProperties> {
        Ok(DeviceProperties {
            max_shared_memory: 49152,
            warp_size: 32,
        })
    }

    fn get_webgpu_limits() -> QuantRS2Result<WebGpuLimits> {
        Ok(WebGpuLimits {
            max_compute_workgroup_size: 256,
        })
    }

    fn compile_holonomic_kernel(_config: &OptimizationConfig) -> QuantRS2Result<CompiledKernel> {
        Ok(CompiledKernel {
            name: "holonomic".to_string(),
            last_execution_time: 0.0,
        })
    }

    fn compile_post_quantum_kernel(_config: &OptimizationConfig) -> QuantRS2Result<CompiledKernel> {
        Ok(CompiledKernel {
            name: "post_quantum".to_string(),
            last_execution_time: 0.0,
        })
    }

    fn compile_qml_attention_kernel(
        _config: &OptimizationConfig,
    ) -> QuantRS2Result<CompiledKernel> {
        Ok(CompiledKernel {
            name: "qml_attention".to_string(),
            last_execution_time: 0.0,
        })
    }

    fn compile_fused_rotation_kernel(
        _config: &OptimizationConfig,
    ) -> QuantRS2Result<CompiledKernel> {
        Ok(CompiledKernel {
            name: "fused_rotation".to_string(),
            last_execution_time: 0.0,
        })
    }

    fn compile_tensor_core_kernel(_config: &OptimizationConfig) -> QuantRS2Result<CompiledKernel> {
        Ok(CompiledKernel {
            name: "tensor_core".to_string(),
            last_execution_time: 0.0,
        })
    }

    fn compile_holonomic_shader(_config: &OptimizationConfig) -> QuantRS2Result<CompiledShader> {
        Ok(CompiledShader {
            name: "holonomic".to_string(),
        })
    }

    fn compile_post_quantum_shader(_config: &OptimizationConfig) -> QuantRS2Result<CompiledShader> {
        Ok(CompiledShader {
            name: "post_quantum".to_string(),
        })
    }

    fn compile_qml_attention_shader(
        _config: &OptimizationConfig,
    ) -> QuantRS2Result<CompiledShader> {
        Ok(CompiledShader {
            name: "qml_attention".to_string(),
        })
    }

    // Placeholder kernel launch methods
    fn launch_tensor_core_holonomic_kernel(
        &self,
        _kernel: &CompiledKernel,
        _state: &mut [Complex64],
        _matrix: &[Complex64],
        _qubits: &[QubitId],
        _block: u32,
        _grid: u32,
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn launch_standard_holonomic_kernel(
        &self,
        _kernel: &CompiledKernel,
        _state: &mut [Complex64],
        _matrix: &[Complex64],
        _qubits: &[QubitId],
        _block: u32,
        _grid: u32,
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_holonomic_gate_webgpu(
        &self,
        _state: &mut [Complex64],
        _matrix: &[Complex64],
        _qubits: &[QubitId],
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_holonomic_gate_cpu_optimized(
        &self,
        _state: &mut [Complex64],
        _matrix: &[Complex64],
        _qubits: &[QubitId],
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_quantum_sponge_gpu(
        &self,
        _state: &mut [Complex64],
        _circuit: &[Complex64],
        _rate: usize,
        _capacity: usize,
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_quantum_merkle_gpu(
        &self,
        _state: &mut [Complex64],
        _circuit: &[Complex64],
        _depth: usize,
        _arity: usize,
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_quantum_grover_gpu(
        &self,
        _state: &mut [Complex64],
        _circuit: &[Complex64],
        _iterations: usize,
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_qml_attention_cuda(
        &self,
        _state: &mut [Complex64],
        _query: &[Complex64],
        _key: &[Complex64],
        _value: &[Complex64],
        _heads: usize,
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_qml_attention_webgpu(
        &self,
        _state: &mut [Complex64],
        _query: &[Complex64],
        _key: &[Complex64],
        _value: &[Complex64],
        _heads: usize,
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_qml_attention_cpu_vectorized(
        &self,
        _state: &mut [Complex64],
        _query: &[Complex64],
        _key: &[Complex64],
        _value: &[Complex64],
        _heads: usize,
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_single_gate_optimized(
        &self,
        _state: &mut [Complex64],
        _gate: &dyn GateOp,
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn analyze_gate_fusion_opportunities(
        &self,
        _gates: &[Box<dyn GateOp>],
    ) -> QuantRS2Result<Vec<FusionChain>> {
        Ok(vec![])
    }

    fn apply_fused_rotation_sequence(
        &self,
        _state: &mut [Complex64],
        _gates: &[Box<dyn GateOp>],
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_fused_pauli_string(
        &self,
        _state: &mut [Complex64],
        _gates: &[Box<dyn GateOp>],
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    fn apply_fused_controlled_sequence(
        &self,
        _state: &mut [Complex64],
        _gates: &[Box<dyn GateOp>],
    ) -> QuantRS2Result<()> {
        Ok(())
    }
}

// Supporting types and structures

#[derive(Debug, Clone)]
pub enum PostQuantumCompressionType {
    QuantumSponge { rate: usize, capacity: usize },
    QuantumMerkleTree { depth: usize, arity: usize },
    QuantumGrover { iterations: usize },
}

#[derive(Debug, Clone)]
pub enum FusionType {
    RotationSequence,
    PauliString,
    ControlledSequence,
    None,
}

pub struct FusionChain {
    pub gates: Vec<Box<dyn GateOp>>,
    pub fusion_type: FusionType,
}

pub struct CompiledKernel {
    pub name: String,
    pub last_execution_time: f64,
}

pub struct CompiledShader {
    pub name: String,
}

pub struct CachedCudaKernel {
    pub kernel: CompiledKernel,
    pub compilation_time: f64,
}

pub struct CachedWebGpuShader {
    pub shader: CompiledShader,
    pub compilation_time: f64,
}

pub struct CacheStatistics {
    pub hits: usize,
    pub misses: usize,
}

impl CacheStatistics {
    pub fn overall_hit_rate(&self) -> f64 {
        if self.hits + self.misses == 0 {
            0.0
        } else {
            self.hits as f64 / (self.hits + self.misses) as f64
        }
    }
}

pub struct BufferPool {
    pub initial_size: usize,
}

impl BufferPool {
    pub fn new(initial_size: usize) -> Self {
        Self { initial_size }
    }
}

pub struct DeviceProperties {
    pub max_shared_memory: usize,
    pub warp_size: usize,
}

pub struct WebGpuLimits {
    pub max_compute_workgroup_size: u32,
}

pub struct PerformanceReport {
    pub average_kernel_times: HashMap<String, f64>,
    pub cache_hit_rate: f64,
    pub tensor_core_utilization: f64,
    pub memory_bandwidth_utilization: f64,
}

impl KernelCache {
    pub fn new() -> Self {
        Self {
            cuda_kernels: HashMap::new(),
            webgpu_shaders: HashMap::new(),
            cache_stats: CacheStatistics { hits: 0, misses: 0 },
        }
    }
}

impl Default for KernelCache {
    fn default() -> Self {
        Self::new()
    }
}

impl PerformanceStats {
    pub fn new() -> Self {
        Self {
            kernel_times: HashMap::new(),
            memory_bandwidth: HashMap::new(),
            tensor_core_utilization: 0.0,
            cache_hit_rates: HashMap::new(),
        }
    }
}

impl Default for PerformanceStats {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_specialized_gpu_kernels_creation() {
        let config = OptimizationConfig::default();
        let kernels = SpecializedGpuKernels::new(config);
        assert!(kernels.is_ok());
    }

    #[test]
    fn test_holonomic_gate_application() {
        let config = OptimizationConfig::default();
        let kernels = SpecializedGpuKernels::new(config).unwrap();

        let mut state = vec![Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0)];
        let holonomy_matrix = vec![
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
        ];
        let target_qubits = vec![QubitId(0)];

        let result = kernels.apply_holonomic_gate(&mut state, &holonomy_matrix, &target_qubits);
        assert!(result.is_ok());
    }

    #[test]
    fn test_performance_reporting() {
        let config = OptimizationConfig::default();
        let kernels = SpecializedGpuKernels::new(config).unwrap();

        let report = kernels.get_performance_report();
        assert!(report.cache_hit_rate >= 0.0 && report.cache_hit_rate <= 1.0);
    }
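
    // The tests below are additional sketches covering the remaining public
    // entry points; since the GPU backends in this module are placeholders,
    // they assert only on success and on simple arithmetic invariants.

    #[test]
    fn test_fused_gate_sequence_empty() {
        let config = OptimizationConfig::default();
        let kernels = SpecializedGpuKernels::new(config).unwrap();

        // An empty gate slice is a no-op and should still succeed.
        let mut state = vec![Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0)];
        let gates: Vec<Box<dyn GateOp>> = Vec::new();
        assert!(kernels.apply_fused_gate_sequence(&mut state, &gates).is_ok());
    }

    #[test]
    fn test_post_quantum_hash_dispatch() {
        let config = OptimizationConfig::default();
        let kernels = SpecializedGpuKernels::new(config).unwrap();

        let mut state = vec![Complex64::new(1.0, 0.0); 8];
        let hash_circuit = vec![Complex64::new(0.0, 0.0); 8];
        let result = kernels.apply_post_quantum_hash_gate(
            &mut state,
            &hash_circuit,
            PostQuantumCompressionType::QuantumSponge { rate: 4, capacity: 4 },
        );
        assert!(result.is_ok());
    }

    #[test]
    fn test_cache_hit_rate_arithmetic() {
        // 3 hits out of 4 lookups -> 0.75; an empty cache reports 0.0, not NaN.
        let stats = CacheStatistics { hits: 3, misses: 1 };
        assert!((stats.overall_hit_rate() - 0.75).abs() < 1e-12);

        let empty = CacheStatistics { hits: 0, misses: 0 };
        assert_eq!(empty.overall_hit_rate(), 0.0);
    }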
}