quantrs2_core/gpu/adaptive_simd.rs

//! Adaptive SIMD dispatch based on CPU capabilities detection
//!
//! This module provides runtime detection of CPU capabilities and dispatches
//! to the most optimized SIMD implementation available on the target hardware.
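//!
//! # Example
//!
//! A minimal usage sketch (not compiled as a doctest; the module path is
//! assumed from this file's location in the crate):
//!
//! ```ignore
//! use quantrs2_core::gpu::adaptive_simd::{
//!     apply_single_qubit_adaptive, initialize_adaptive_simd,
//! };
//! use scirs2_core::Complex64;
//!
//! initialize_adaptive_simd()?;
//!
//! // |0> state of a single qubit; gate matrices are row-major 2x2.
//! let mut state = vec![Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0)];
//! let h = 1.0 / 2.0_f64.sqrt();
//! let hadamard = [
//!     Complex64::new(h, 0.0),
//!     Complex64::new(h, 0.0),
//!     Complex64::new(h, 0.0),
//!     Complex64::new(-h, 0.0),
//! ];
//! apply_single_qubit_adaptive(&mut state, 0, &hadamard)?;
//! ```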

use crate::error::{QuantRS2Error, QuantRS2Result};
use crate::platform::PlatformCapabilities;
use scirs2_core::Complex64;
use std::sync::{Mutex, OnceLock};
// use scirs2_core::simd_ops::SimdUnifiedOps;
use crate::simd_ops_stubs::SimdF64;
use scirs2_core::ndarray::ArrayView1;

/// CPU feature detection results
#[derive(Debug, Clone, Copy)]
pub struct CpuFeatures {
    /// AVX2 support (256-bit vectors)
    pub has_avx2: bool,
    /// AVX-512 support (512-bit vectors)
    pub has_avx512: bool,
    /// FMA (Fused Multiply-Add) support
    pub has_fma: bool,
    /// AVX-512 VL (Vector Length) support
    pub has_avx512vl: bool,
    /// AVX-512 DQ (Doubleword and Quadword) support
    pub has_avx512dq: bool,
    /// AVX-512 CD (Conflict Detection) support
    pub has_avx512cd: bool,
    /// SSE 4.1 support
    pub has_sse41: bool,
    /// SSE 4.2 support
    pub has_sse42: bool,
    /// Number of logical CPU cores
    pub num_cores: usize,
    /// L1 data cache size per core (in bytes)
    pub l1_cache_size: usize,
    /// L2 cache size per core (in bytes)
    pub l2_cache_size: usize,
    /// L3 cache size (in bytes)
    pub l3_cache_size: usize,
}

/// SIMD implementation variants
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SimdVariant {
    /// Scalar fallback implementation
    Scalar,
    /// SSE 4.1/4.2 implementation
    Sse4,
    /// AVX2 implementation (256-bit)
    Avx2,
    /// AVX-512 implementation (512-bit)
    Avx512,
}

/// Adaptive SIMD dispatcher
pub struct AdaptiveSimdDispatcher {
    /// Detected CPU features
    cpu_features: CpuFeatures,
    /// Selected SIMD variant
    selected_variant: SimdVariant,
    /// Performance cache for different operation sizes
    performance_cache: Mutex<std::collections::HashMap<String, PerformanceData>>,
}

/// Performance data for SIMD operations
#[derive(Debug, Clone)]
pub struct PerformanceData {
    /// Average execution time (nanoseconds)
    avg_time: f64,
    /// Number of samples
    samples: usize,
    /// Best SIMD variant for this operation size
    best_variant: SimdVariant,
}

/// Global dispatcher instance
static GLOBAL_DISPATCHER: OnceLock<AdaptiveSimdDispatcher> = OnceLock::new();

impl AdaptiveSimdDispatcher {
    /// Initialize the global adaptive SIMD dispatcher
    pub fn initialize() -> QuantRS2Result<()> {
        let cpu_features = Self::detect_cpu_features();
        let selected_variant = Self::select_optimal_variant(&cpu_features);

        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features,
            selected_variant,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        GLOBAL_DISPATCHER.set(dispatcher).map_err(|_| {
            QuantRS2Error::RuntimeError("Adaptive SIMD dispatcher already initialized".to_string())
        })?;

        Ok(())
    }

    /// Get the global dispatcher instance
    pub fn instance() -> QuantRS2Result<&'static AdaptiveSimdDispatcher> {
        GLOBAL_DISPATCHER.get().ok_or_else(|| {
            QuantRS2Error::RuntimeError("Adaptive SIMD dispatcher not initialized".to_string())
        })
    }

    /// Detect CPU features at runtime
    fn detect_cpu_features() -> CpuFeatures {
        let platform = PlatformCapabilities::detect();

        CpuFeatures {
            has_avx2: platform.cpu.simd.avx2,
            has_avx512: platform.cpu.simd.avx512,
            has_fma: platform.cpu.simd.fma,
            has_avx512vl: false, // Not detected in current platform capabilities
            has_avx512dq: false, // Not detected in current platform capabilities
            has_avx512cd: false, // Not detected in current platform capabilities
            has_sse41: platform.cpu.simd.sse4_1,
            has_sse42: platform.cpu.simd.sse4_2,
            num_cores: platform.cpu.logical_cores,
            l1_cache_size: platform.cpu.cache.l1_data.unwrap_or(32 * 1024),
            l2_cache_size: platform.cpu.cache.l2.unwrap_or(256 * 1024),
            l3_cache_size: platform.cpu.cache.l3.unwrap_or(8 * 1024 * 1024),
        }
    }

    /// Select the optimal SIMD variant based on CPU features
    fn select_optimal_variant(features: &CpuFeatures) -> SimdVariant {
        // Note: detect_cpu_features currently hardcodes has_avx512vl and
        // has_avx512dq to false, so the AVX-512 branch stays unreachable
        // until finer-grained AVX-512 detection is wired up.
        if features.has_avx512 && features.has_avx512vl && features.has_avx512dq {
            SimdVariant::Avx512
        } else if features.has_avx2 && features.has_fma {
            SimdVariant::Avx2
        } else if features.has_sse41 && features.has_sse42 {
            SimdVariant::Sse4
        } else {
            SimdVariant::Scalar
        }
    }

    /// Apply a single-qubit gate with adaptive SIMD
    pub fn apply_single_qubit_gate_adaptive(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        let operation_key = format!("single_qubit_{}", state.len());
        let variant = self.select_variant_for_operation(&operation_key, state.len());

        let start_time = std::time::Instant::now();

        // All vector variants currently delegate to the same SciRS2 unified
        // SIMD path; the split is kept so per-ISA kernels can be added later.
        let result = match variant {
            #[cfg(target_arch = "x86_64")]
            SimdVariant::Avx512 => self.apply_single_qubit_avx512(state, target, matrix),
            #[cfg(target_arch = "x86_64")]
            SimdVariant::Avx2 => self.apply_single_qubit_avx2(state, target, matrix),
            #[cfg(not(target_arch = "x86_64"))]
            SimdVariant::Avx512 | SimdVariant::Avx2 => {
                self.apply_single_qubit_sse4(state, target, matrix)
            }
            SimdVariant::Sse4 => self.apply_single_qubit_sse4(state, target, matrix),
            SimdVariant::Scalar => self.apply_single_qubit_scalar(state, target, matrix),
        };

        let execution_time = start_time.elapsed().as_nanos() as f64;
        self.update_performance_cache(&operation_key, execution_time, variant);

        result
    }

    /// Apply a two-qubit gate with adaptive SIMD
    pub fn apply_two_qubit_gate_adaptive(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        let operation_key = format!("two_qubit_{}", state.len());
        let variant = self.select_variant_for_operation(&operation_key, state.len());

        let start_time = std::time::Instant::now();

        let result = match variant {
            SimdVariant::Avx512 => self.apply_two_qubit_avx512(state, control, target, matrix),
            SimdVariant::Avx2 => self.apply_two_qubit_avx2(state, control, target, matrix),
            SimdVariant::Sse4 => self.apply_two_qubit_sse4(state, control, target, matrix),
            SimdVariant::Scalar => self.apply_two_qubit_scalar(state, control, target, matrix),
        };

        let execution_time = start_time.elapsed().as_nanos() as f64;
        self.update_performance_cache(&operation_key, execution_time, variant);

        result
    }

    /// Batch apply gates with adaptive SIMD
    pub fn apply_batch_gates_adaptive(
        &self,
        states: &mut [&mut [Complex64]],
        gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        let batch_size = states.len();
        let operation_key = format!("batch_{}_{}", batch_size, gates.len());
        let variant = self.select_variant_for_operation(&operation_key, batch_size * 1000); // Estimate

        let start_time = std::time::Instant::now();

        let result = match variant {
            SimdVariant::Avx512 => self.apply_batch_gates_avx512(states, gates),
            SimdVariant::Avx2 => self.apply_batch_gates_avx2(states, gates),
            SimdVariant::Sse4 => self.apply_batch_gates_sse4(states, gates),
            SimdVariant::Scalar => self.apply_batch_gates_scalar(states, gates),
        };

        let execution_time = start_time.elapsed().as_nanos() as f64;
        self.update_performance_cache(&operation_key, execution_time, variant);

        result
    }

    /// Select the best SIMD variant for a specific operation
    fn select_variant_for_operation(&self, operation_key: &str, data_size: usize) -> SimdVariant {
        // Check the performance cache first
        if let Ok(cache) = self.performance_cache.lock() {
            if let Some(perf_data) = cache.get(operation_key) {
                if perf_data.samples >= 5 {
                    return perf_data.best_variant;
                }
            }
        }

        // Heuristics based on data size (in amplitudes) and CPU features
        if data_size >= 1024 && self.cpu_features.has_avx512 {
            SimdVariant::Avx512
        } else if data_size >= 256 && self.cpu_features.has_avx2 {
            SimdVariant::Avx2
        } else if data_size >= 64 && self.cpu_features.has_sse41 {
            SimdVariant::Sse4
        } else {
            SimdVariant::Scalar
        }
    }

    /// Update performance cache with execution time
    fn update_performance_cache(
        &self,
        operation_key: &str,
        execution_time: f64,
        variant: SimdVariant,
    ) {
        if let Ok(mut cache) = self.performance_cache.lock() {
            let perf_data =
                cache
                    .entry(operation_key.to_string())
                    .or_insert_with(|| PerformanceData {
                        avg_time: execution_time,
                        samples: 0,
                        best_variant: variant,
                    });

            // Remember the previous running average so this sample does not
            // bias its own comparison below.
            let previous_avg = perf_data.avg_time;

            // Update running average
            perf_data.avg_time = (perf_data.avg_time * perf_data.samples as f64 + execution_time)
                / (perf_data.samples + 1) as f64;
            perf_data.samples += 1;

            // Adopt this variant if the sample was significantly (>10%)
            // faster than the average observed so far
            if execution_time < previous_avg * 0.9 {
                perf_data.best_variant = variant;
            }
        }
    }

    /// Get performance report
    pub fn get_performance_report(&self) -> AdaptivePerformanceReport {
        let cache = self
            .performance_cache
            .lock()
            .map(|cache| cache.clone())
            .unwrap_or_default();

        AdaptivePerformanceReport {
            cpu_features: self.cpu_features,
            selected_variant: self.selected_variant,
            performance_cache: cache,
        }
    }

    // Single-qubit SIMD kernels. All vector variants currently delegate to
    // the SciRS2 unified path, which picks the widest instruction set the
    // host supports at runtime.

    #[cfg(target_arch = "x86_64")]
    fn apply_single_qubit_avx512(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        // AVX-512 implementation using SciRS2 SIMD operations
        // SciRS2 will automatically use AVX-512 if available
        self.apply_single_qubit_simd_unified(state, target, matrix)
    }

    #[cfg(target_arch = "x86_64")]
    fn apply_single_qubit_avx2(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        // AVX2 implementation using SciRS2 SIMD operations
        // SciRS2 will automatically use AVX2 if available
        self.apply_single_qubit_simd_unified(state, target, matrix)
    }

    fn apply_single_qubit_sse4(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        // SSE4 implementation using SciRS2 SIMD operations
        // SciRS2 will automatically use SSE4 if available
        self.apply_single_qubit_simd_unified(state, target, matrix)
    }

    fn apply_single_qubit_scalar(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        // Scalar implementation: for each basis index with the target bit
        // clear, mix it with its partner that has the target bit set.
        let n = state.len();
        for i in 0..n {
            if (i >> target) & 1 == 0 {
                let j = i | (1 << target);
                let temp0 = state[i];
                let temp1 = state[j];
                state[i] = matrix[0] * temp0 + matrix[1] * temp1;
                state[j] = matrix[2] * temp0 + matrix[3] * temp1;
            }
        }
        Ok(())
    }

    /// Apply single-qubit gate using SciRS2 unified SIMD operations
    fn apply_single_qubit_simd_unified(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        let qubit_mask = 1usize << target;
        let half_size = state.len() / 2;

        // Enumerate the index pairs (idx0, idx1) that differ only in the
        // target bit: split i into its low bits (below the target) and its
        // high bits (at or above the target), leaving the target bit clear.
        // The previous formula produced duplicate indices whenever the
        // relevant bit of i was set.
        let mut idx0_list = Vec::with_capacity(half_size);
        let mut idx1_list = Vec::with_capacity(half_size);

        for i in 0..half_size {
            let idx0 = ((i >> target) << (target + 1)) | (i & (qubit_mask - 1));
            let idx1 = idx0 | qubit_mask;

            if idx1 < state.len() {
                idx0_list.push(idx0);
                idx1_list.push(idx1);
            }
        }

        let pair_count = idx0_list.len();
        if pair_count == 0 {
            return Ok(());
        }

        // Extract amplitude pairs for SIMD processing
        let mut a0_real = Vec::with_capacity(pair_count);
        let mut a0_imag = Vec::with_capacity(pair_count);
        let mut a1_real = Vec::with_capacity(pair_count);
        let mut a1_imag = Vec::with_capacity(pair_count);

        for i in 0..pair_count {
            let a0 = state[idx0_list[i]];
            let a1 = state[idx1_list[i]];
            a0_real.push(a0.re);
            a0_imag.push(a0.im);
            a1_real.push(a1.re);
            a1_imag.push(a1.im);
        }

        // Convert to array views for SciRS2 SIMD operations
        let a0_real_view = ArrayView1::from(&a0_real);
        let a0_imag_view = ArrayView1::from(&a0_imag);
        let a1_real_view = ArrayView1::from(&a1_real);
        let a1_imag_view = ArrayView1::from(&a1_imag);

        // Extract matrix elements
        let m00_re = matrix[0].re;
        let m00_im = matrix[0].im;
        let m01_re = matrix[1].re;
        let m01_im = matrix[1].im;
        let m10_re = matrix[2].re;
        let m10_im = matrix[2].im;
        let m11_re = matrix[3].re;
        let m11_im = matrix[3].im;

        // Compute new amplitudes using SciRS2 SIMD operations
        // new_a0 = m00 * a0 + m01 * a1
        // new_a1 = m10 * a0 + m11 * a1

        // For new_a0_real: m00_re * a0_re - m00_im * a0_im + m01_re * a1_re - m01_im * a1_im
        let term1 = <f64 as SimdF64>::simd_scalar_mul(&a0_real_view, m00_re);
        let term2 = <f64 as SimdF64>::simd_scalar_mul(&a0_imag_view, m00_im);
        let term3 = <f64 as SimdF64>::simd_scalar_mul(&a1_real_view, m01_re);
        let term4 = <f64 as SimdF64>::simd_scalar_mul(&a1_imag_view, m01_im);
        let sub1 = <f64 as SimdF64>::simd_sub_arrays(&term1.view(), &term2.view());
        let sub2 = <f64 as SimdF64>::simd_sub_arrays(&term3.view(), &term4.view());
        let new_a0_real_arr = <f64 as SimdF64>::simd_add_arrays(&sub1.view(), &sub2.view());

        // For new_a0_imag: m00_re * a0_im + m00_im * a0_re + m01_re * a1_im + m01_im * a1_re
        let term5 = <f64 as SimdF64>::simd_scalar_mul(&a0_imag_view, m00_re);
        let term6 = <f64 as SimdF64>::simd_scalar_mul(&a0_real_view, m00_im);
        let term7 = <f64 as SimdF64>::simd_scalar_mul(&a1_imag_view, m01_re);
        let term8 = <f64 as SimdF64>::simd_scalar_mul(&a1_real_view, m01_im);
        let add1 = <f64 as SimdF64>::simd_add_arrays(&term5.view(), &term6.view());
        let add2 = <f64 as SimdF64>::simd_add_arrays(&term7.view(), &term8.view());
        let new_a0_imag_arr = <f64 as SimdF64>::simd_add_arrays(&add1.view(), &add2.view());

        // For new_a1_real: m10_re * a0_re - m10_im * a0_im + m11_re * a1_re - m11_im * a1_im
        let term9 = <f64 as SimdF64>::simd_scalar_mul(&a0_real_view, m10_re);
        let term10 = <f64 as SimdF64>::simd_scalar_mul(&a0_imag_view, m10_im);
        let term11 = <f64 as SimdF64>::simd_scalar_mul(&a1_real_view, m11_re);
        let term12 = <f64 as SimdF64>::simd_scalar_mul(&a1_imag_view, m11_im);
        let sub3 = <f64 as SimdF64>::simd_sub_arrays(&term9.view(), &term10.view());
        let sub4 = <f64 as SimdF64>::simd_sub_arrays(&term11.view(), &term12.view());
        let new_a1_real_arr = <f64 as SimdF64>::simd_add_arrays(&sub3.view(), &sub4.view());

        // For new_a1_imag: m10_re * a0_im + m10_im * a0_re + m11_re * a1_im + m11_im * a1_re
        let term13 = <f64 as SimdF64>::simd_scalar_mul(&a0_imag_view, m10_re);
        let term14 = <f64 as SimdF64>::simd_scalar_mul(&a0_real_view, m10_im);
        let term15 = <f64 as SimdF64>::simd_scalar_mul(&a1_imag_view, m11_re);
        let term16 = <f64 as SimdF64>::simd_scalar_mul(&a1_real_view, m11_im);
        let add3 = <f64 as SimdF64>::simd_add_arrays(&term13.view(), &term14.view());
        let add4 = <f64 as SimdF64>::simd_add_arrays(&term15.view(), &term16.view());
        let new_a1_imag_arr = <f64 as SimdF64>::simd_add_arrays(&add3.view(), &add4.view());

        // Write back results
        for i in 0..pair_count {
            state[idx0_list[i]] = Complex64::new(new_a0_real_arr[i], new_a0_imag_arr[i]);
            state[idx1_list[i]] = Complex64::new(new_a1_real_arr[i], new_a1_imag_arr[i]);
        }

        Ok(())
    }

    // Two-qubit kernels. Dedicated vector kernels are not written yet, so
    // the AVX-512/AVX2/SSE4 variants delegate to the scalar kernel to keep
    // every dispatch path correct. Batch operations follow below.

    fn apply_two_qubit_avx512(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        self.apply_two_qubit_scalar(state, control, target, matrix)
    }

    fn apply_two_qubit_avx2(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        self.apply_two_qubit_scalar(state, control, target, matrix)
    }

    fn apply_two_qubit_sse4(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        self.apply_two_qubit_scalar(state, control, target, matrix)
    }

    fn apply_two_qubit_scalar(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        // Scalar implementation. `matrix` is assumed row-major 4x4 in the
        // |control, target> basis ordering (|00>, |01>, |10>, |11>).
        let c_mask = 1usize << control;
        let t_mask = 1usize << target;
        for i in 0..state.len() {
            // Visit each group of four coupled basis states once, starting
            // from the member with both bits clear.
            if i & c_mask == 0 && i & t_mask == 0 {
                let idx = [i, i | t_mask, i | c_mask, i | c_mask | t_mask];
                let a = [state[idx[0]], state[idx[1]], state[idx[2]], state[idx[3]]];
                for (row, &out) in idx.iter().enumerate() {
                    state[out] = matrix[4 * row] * a[0]
                        + matrix[4 * row + 1] * a[1]
                        + matrix[4 * row + 2] * a[2]
                        + matrix[4 * row + 3] * a[3];
                }
            }
        }
        Ok(())
    }

    fn apply_batch_gates_avx512(
        &self,
        states: &mut [&mut [Complex64]],
        gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        // No vectorized batch kernel yet; share the scalar path.
        self.apply_batch_gates_scalar(states, gates)
    }

    fn apply_batch_gates_avx2(
        &self,
        states: &mut [&mut [Complex64]],
        gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        // No vectorized batch kernel yet; share the scalar path.
        self.apply_batch_gates_scalar(states, gates)
    }

    fn apply_batch_gates_sse4(
        &self,
        states: &mut [&mut [Complex64]],
        gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        // No vectorized batch kernel yet; share the scalar path.
        self.apply_batch_gates_scalar(states, gates)
    }

    fn apply_batch_gates_scalar(
        &self,
        _states: &mut [&mut [Complex64]],
        _gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        // Not implemented yet. Fail loudly instead of returning Ok(()) while
        // silently leaving the states untouched, as the old placeholder did.
        Err(QuantRS2Error::RuntimeError(
            "batch gate application is not implemented yet".to_string(),
        ))
    }
}

/// Performance report for adaptive SIMD
#[derive(Debug, Clone)]
pub struct AdaptivePerformanceReport {
    pub cpu_features: CpuFeatures,
    pub selected_variant: SimdVariant,
    pub performance_cache: std::collections::HashMap<String, PerformanceData>,
}

// Convenience free functions that route through the global dispatcher.

/// Apply a single-qubit gate via the global adaptive dispatcher.
pub fn apply_single_qubit_adaptive(
    state: &mut [Complex64],
    target: usize,
    matrix: &[Complex64; 4],
) -> QuantRS2Result<()> {
    AdaptiveSimdDispatcher::instance()?.apply_single_qubit_gate_adaptive(state, target, matrix)
}

/// Apply a two-qubit gate via the global adaptive dispatcher.
pub fn apply_two_qubit_adaptive(
    state: &mut [Complex64],
    control: usize,
    target: usize,
    matrix: &[Complex64; 16],
) -> QuantRS2Result<()> {
    AdaptiveSimdDispatcher::instance()?
        .apply_two_qubit_gate_adaptive(state, control, target, matrix)
}

/// Apply a batch of gates via the global adaptive dispatcher.
pub fn apply_batch_gates_adaptive(
    states: &mut [&mut [Complex64]],
    gates: &[Box<dyn crate::gate::GateOp>],
) -> QuantRS2Result<()> {
    AdaptiveSimdDispatcher::instance()?.apply_batch_gates_adaptive(states, gates)
}

/// Initialize the adaptive SIMD system
pub fn initialize_adaptive_simd() -> QuantRS2Result<()> {
    AdaptiveSimdDispatcher::initialize()
}

/// Get the performance report
pub fn get_adaptive_performance_report() -> QuantRS2Result<AdaptivePerformanceReport> {
    Ok(AdaptiveSimdDispatcher::instance()?.get_performance_report())
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::Complex64;

    #[test]
    fn test_cpu_feature_detection() {
        let features = AdaptiveSimdDispatcher::detect_cpu_features();
        println!("Detected CPU features: {:?}", features);

        // Basic sanity checks
        assert!(features.num_cores >= 1);
        assert!(features.l1_cache_size > 0);
    }

    #[test]
    fn test_simd_variant_selection() {
        let features = CpuFeatures {
            has_avx2: true,
            has_avx512: false,
            has_fma: true,
            has_avx512vl: false,
            has_avx512dq: false,
            has_avx512cd: false,
            has_sse41: true,
            has_sse42: true,
            num_cores: 8,
            l1_cache_size: 32768,
            l2_cache_size: 262144,
            l3_cache_size: 8388608,
        };

        let variant = AdaptiveSimdDispatcher::select_optimal_variant(&features);
        assert_eq!(variant, SimdVariant::Avx2);
    }
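
    #[test]
    fn test_variant_heuristic_prefers_scalar_for_small_states() {
        // Sketch of a heuristic check (assumes only what this module
        // defines): with an empty performance cache,
        // select_variant_for_operation should fall back to the scalar path
        // for data sizes below the SSE4 threshold (64 amplitudes),
        // whatever features the host CPU reports.
        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features: AdaptiveSimdDispatcher::detect_cpu_features(),
            selected_variant: SimdVariant::Scalar,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        let variant = dispatcher.select_variant_for_operation("tiny_op", 32);
        assert_eq!(variant, SimdVariant::Scalar);
    }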

    #[test]
    fn test_adaptive_single_qubit_gate() {
        let _ = AdaptiveSimdDispatcher::initialize();

        let mut state = vec![Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0)];

        let hadamard_matrix = [
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(-1.0 / 2.0_f64.sqrt(), 0.0),
        ];

        let result = apply_single_qubit_adaptive(&mut state, 0, &hadamard_matrix);
        assert!(result.is_ok());

        // H|0> should yield equal real amplitudes of 1/sqrt(2)
        let expected_amplitude = 1.0 / 2.0_f64.sqrt();
        assert!((state[0].re - expected_amplitude).abs() < 1e-10);
        assert!((state[1].re - expected_amplitude).abs() < 1e-10);
    }
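
    #[test]
    fn test_unified_simd_pair_enumeration() {
        // Regression test for the index-pair enumeration in
        // apply_single_qubit_simd_unified: applying X to qubit 1 of |00>
        // must move the amplitude to |10> (index 2), which exercises a
        // target bit above position 0.
        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features: AdaptiveSimdDispatcher::detect_cpu_features(),
            selected_variant: SimdVariant::Scalar,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        let zero = Complex64::new(0.0, 0.0);
        let one = Complex64::new(1.0, 0.0);
        let mut state = vec![one, zero, zero, zero];
        let x_matrix = [zero, one, one, zero];

        dispatcher
            .apply_single_qubit_simd_unified(&mut state, 1, &x_matrix)
            .unwrap();

        assert!(state[0].re.abs() < 1e-10);
        assert!((state[2].re - 1.0).abs() < 1e-10);
    }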

    #[test]
    fn test_performance_caching() {
        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features: AdaptiveSimdDispatcher::detect_cpu_features(),
            selected_variant: SimdVariant::Avx2,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        dispatcher.update_performance_cache("test_op", 100.0, SimdVariant::Avx2);
        dispatcher.update_performance_cache("test_op", 150.0, SimdVariant::Avx2);

        let perf_data = dispatcher
            .performance_cache
            .lock()
            .unwrap()
            .get("test_op")
            .unwrap()
            .clone();
        assert_eq!(perf_data.samples, 2);
        assert!((perf_data.avg_time - 125.0).abs() < 1e-10);
    }
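
    #[test]
    fn test_two_qubit_scalar_cnot() {
        // Sanity check for the scalar two-qubit kernel above; assumes the
        // row-major |control, target> matrix ordering documented there.
        // CNOT should map |10> (index 2) to |11> (index 3).
        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features: AdaptiveSimdDispatcher::detect_cpu_features(),
            selected_variant: SimdVariant::Scalar,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        let zero = Complex64::new(0.0, 0.0);
        let one = Complex64::new(1.0, 0.0);
        let mut state = vec![zero, zero, one, zero];
        let cnot = [
            one, zero, zero, zero, // |00> -> |00>
            zero, one, zero, zero, // |01> -> |01>
            zero, zero, zero, one, // |10> -> |11>
            zero, zero, one, zero, // |11> -> |10>
        ];

        dispatcher
            .apply_two_qubit_scalar(&mut state, 1, 0, &cnot)
            .unwrap();

        assert!(state[2].re.abs() < 1e-10);
        assert!((state[3].re - 1.0).abs() < 1e-10);
    }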
}