quantrs2_core/optimizations_stable/simd_gates.rs

//! SIMD-Optimized Quantum Gate Operations
//!
//! High-performance quantum gate implementations using SIMD (Single Instruction, Multiple Data)
//! operations for maximum computational throughput on modern CPUs.
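//!
//! A minimal usage sketch (marked `ignore` because the crate paths are assumed
//! here rather than taken from this file):
//!
//! ```ignore
//! use quantrs2_core::optimizations_stable::simd_gates::{
//!     get_global_simd_processor, StateVectorOpType, VectorizedOperation,
//! };
//! use scirs2_core::Complex64;
//!
//! // Two-qubit register in an unnormalized product state.
//! let mut state = vec![Complex64::new(0.5, 0.0); 4];
//! let op = VectorizedOperation::StateVectorOps {
//!     operation_type: StateVectorOpType::Normalization,
//!     chunk_size: 4,
//! };
//! let speedup = get_global_simd_processor()
//!     .process_vectorized_operation(&op, &mut state)
//!     .expect("normalization should succeed");
//! ```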

use crate::error::{QuantRS2Error, QuantRS2Result};
use scirs2_core::Complex64;
use std::sync::{Arc, OnceLock, RwLock};

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// SIMD-capable vectorized operation types
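///
/// A construction sketch (doc-test marked `ignore`; the 2x2 matrix is laid out
/// row-major as `[m00, m01, m10, m11]`):
///
/// ```ignore
/// use scirs2_core::Complex64;
///
/// let h = std::f64::consts::FRAC_1_SQRT_2;
/// let hadamard = [
///     Complex64::new(h, 0.0),
///     Complex64::new(h, 0.0),
///     Complex64::new(h, 0.0),
///     Complex64::new(-h, 0.0),
/// ];
/// let op = VectorizedOperation::SingleQubitBroadcast {
///     matrix: hadamard,
///     target_qubits: vec![0, 1],
/// };
/// ```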
#[derive(Debug, Clone, PartialEq)]
pub enum VectorizedOperation {
    /// Apply single-qubit gate to multiple qubits simultaneously
    SingleQubitBroadcast {
        matrix: [Complex64; 4],
        target_qubits: Vec<usize>,
    },
    /// Apply same two-qubit gate to multiple qubit pairs
    TwoQubitBroadcast {
        matrix: [Complex64; 16],
        qubit_pairs: Vec<(usize, usize)>,
    },
    /// Parallel state vector operations
    StateVectorOps {
        operation_type: StateVectorOpType,
        chunk_size: usize,
    },
    /// Batched measurement operations
    BatchMeasurements {
        qubit_indices: Vec<usize>,
        sample_count: usize,
    },
}

/// Types of vectorized state vector operations
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum StateVectorOpType {
    Normalization,
    ProbabilityCalculation,
    ExpectationValue,
    InnerProduct,
    ElementwiseMultiply,
    PhaseRotation,
}

/// SIMD gate processor with optimized implementations
pub struct SIMDGateProcessor {
    /// SIMD capabilities of the current CPU
    simd_features: SIMDFeatures,
    /// Performance statistics
    statistics: Arc<RwLock<SIMDStatistics>>,
    /// Chunk size for SIMD operations
    optimal_chunk_size: usize,
}

/// Available SIMD features on the current platform
#[derive(Debug, Clone, Default)]
pub struct SIMDFeatures {
    pub sse: bool,
    pub sse2: bool,
    pub sse3: bool,
    pub ssse3: bool,
    pub sse4_1: bool,
    pub sse4_2: bool,
    pub avx: bool,
    pub avx2: bool,
    pub avx512f: bool,
    pub fma: bool,
    pub vector_width: usize, // Maximum vector width in f64 lanes (two lanes per Complex64)
}

/// SIMD processing statistics
#[derive(Debug, Clone, Default)]
pub struct SIMDStatistics {
    pub operations_vectorized: u64,
    pub operations_scalar: u64,
    pub total_elements_processed: u64,
    pub average_speedup: f64,
    pub memory_bandwidth_utilization: f64,
    pub cache_efficiency: f64,
}

impl SIMDGateProcessor {
    /// Create a new SIMD gate processor
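    ///
    /// A construction sketch (doc-test marked `ignore`; crate paths omitted):
    ///
    /// ```ignore
    /// let processor = SIMDGateProcessor::new();
    /// println!(
    ///     "lanes: {}, chunk: {}",
    ///     processor.get_simd_features().vector_width,
    ///     processor.get_optimal_chunk_size()
    /// );
    /// ```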
    pub fn new() -> Self {
        let features = Self::detect_simd_features();
        let optimal_chunk_size = Self::calculate_optimal_chunk_size(&features);

        Self {
            simd_features: features,
            statistics: Arc::new(RwLock::new(SIMDStatistics::default())),
            optimal_chunk_size,
        }
    }

    /// Detect available SIMD features on the current CPU
    #[cfg(target_arch = "x86_64")]
    fn detect_simd_features() -> SIMDFeatures {
        let mut features = SIMDFeatures::default();

        // Use is_x86_feature_detected! to safely check CPU features
        features.sse = std::arch::is_x86_feature_detected!("sse");
        features.sse2 = std::arch::is_x86_feature_detected!("sse2");
        features.sse3 = std::arch::is_x86_feature_detected!("sse3");
        features.ssse3 = std::arch::is_x86_feature_detected!("ssse3");
        features.sse4_1 = std::arch::is_x86_feature_detected!("sse4.1");
        features.sse4_2 = std::arch::is_x86_feature_detected!("sse4.2");
        features.avx = std::arch::is_x86_feature_detected!("avx");
        features.avx2 = std::arch::is_x86_feature_detected!("avx2");
        features.avx512f = std::arch::is_x86_feature_detected!("avx512f");
        features.fma = std::arch::is_x86_feature_detected!("fma");

        // Determine vector width (in f64 lanes) based on available features.
        // Each Complex64 occupies two f64 lanes (128 bits).
        features.vector_width = if features.avx512f {
            8 // 512 bits / 64 bits per f64 = 8 lanes (4 Complex64 values)
        } else if features.avx2 {
            4 // 256 bits / 64 bits per f64 = 4 lanes (2 Complex64 values)
        } else if features.sse2 {
            2 // 128 bits / 64 bits per f64 = 2 lanes (1 Complex64 value)
        } else {
            1 // No SIMD, scalar operations
        };

        features
    }

    /// Fallback feature detection for non-x86_64 architectures
    #[cfg(not(target_arch = "x86_64"))]
    fn detect_simd_features() -> SIMDFeatures {
        // Assume basic SIMD capabilities for other architectures
        SIMDFeatures {
            vector_width: 2, // Conservative estimate
            ..Default::default()
        }
    }

    /// Calculate optimal chunk size for SIMD operations
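    ///
    /// Worked example: with `vector_width = 4` the base chunk is `4 * 64 = 256`
    /// elements; a 64-byte cache line holds `64 / 16 = 4` Complex64 values, so
    /// `lcm(4, 4) = 4` and the result is rounded to `(256 / 4) * 4 = 256`.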
    fn calculate_optimal_chunk_size(features: &SIMDFeatures) -> usize {
        // Base the chunk size on vector width and cache considerations
        let base_chunk = features.vector_width * 64; // 64 elements per SIMD lane as a baseline

        // Adjust for cache line size (typical 64 bytes)
        let cache_line_elements = 64 / std::mem::size_of::<Complex64>();

        // Choose chunk size that's a multiple of both vector width and cache line
        let lcm = Self::lcm(features.vector_width, cache_line_elements);
        (base_chunk / lcm) * lcm
    }

    /// Least common multiple helper
    fn lcm(a: usize, b: usize) -> usize {
        a * b / Self::gcd(a, b)
    }

    /// Greatest common divisor helper
    fn gcd(a: usize, b: usize) -> usize {
        if b == 0 {
            a
        } else {
            Self::gcd(b, a % b)
        }
    }

    /// Process vectorized operations with SIMD optimization
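    ///
    /// Returns an estimated speedup factor for the dispatched operation. A usage
    /// sketch (doc-test marked `ignore`; imports omitted):
    ///
    /// ```ignore
    /// let processor = SIMDGateProcessor::new();
    /// let mut state = vec![Complex64::new(0.5, 0.0); 16];
    /// let op = VectorizedOperation::StateVectorOps {
    ///     operation_type: StateVectorOpType::Normalization,
    ///     chunk_size: 8,
    /// };
    /// let speedup = processor
    ///     .process_vectorized_operation(&op, &mut state)
    ///     .expect("operation failed");
    /// ```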
    pub fn process_vectorized_operation(
        &self,
        operation: &VectorizedOperation,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        let start_time = std::time::Instant::now();

        let speedup = match operation {
            VectorizedOperation::SingleQubitBroadcast {
                matrix,
                target_qubits,
            } => self.process_single_qubit_broadcast(matrix, target_qubits, state_vector)?,
            VectorizedOperation::TwoQubitBroadcast {
                matrix,
                qubit_pairs,
            } => self.process_two_qubit_broadcast(matrix, qubit_pairs, state_vector)?,
            VectorizedOperation::StateVectorOps {
                operation_type,
                chunk_size,
            } => self.process_state_vector_operation(operation_type, *chunk_size, state_vector)?,
            VectorizedOperation::BatchMeasurements {
                qubit_indices,
                sample_count,
            } => self.process_batch_measurements(qubit_indices, *sample_count, state_vector)?,
        };

        // Update statistics
        let mut stats = self.statistics.write().unwrap_or_else(|e| e.into_inner());
        stats.operations_vectorized += 1;
        stats.total_elements_processed += state_vector.len() as u64;

        let _elapsed = start_time.elapsed(); // measured, but not yet folded into the statistics
        stats.average_speedup = stats
            .average_speedup
            .mul_add((stats.operations_vectorized - 1) as f64, speedup)
            / stats.operations_vectorized as f64;

        Ok(speedup)
    }

    /// Process single-qubit gates applied to multiple qubits
    fn process_single_qubit_broadcast(
        &self,
        matrix: &[Complex64; 4],
        target_qubits: &[usize],
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        let vector_size = state_vector.len();
        let speedup_factor = target_qubits.len() as f64;

        for &qubit in target_qubits {
            // Reject qubits that would overflow the shift or index past the state vector
            if qubit >= 64 || (1usize << qubit) >= vector_size {
                return Err(QuantRS2Error::InvalidQubitId(qubit as u32));
            }

            let qubit_mask = 1usize << qubit;

            // SIMD-optimized single qubit gate application
            #[cfg(target_arch = "x86_64")]
            unsafe {
                if self.simd_features.avx2 {
                    self.apply_single_qubit_gate_avx2(matrix, qubit_mask, state_vector)?;
                } else if self.simd_features.sse2 {
                    self.apply_single_qubit_gate_sse2(matrix, qubit_mask, state_vector)?;
                } else {
                    self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector)?;
                }
            }

            #[cfg(not(target_arch = "x86_64"))]
            self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector)?;
        }

        Ok(speedup_factor * self.simd_features.vector_width as f64)
    }

    /// AVX2-optimized single qubit gate application
    ///
    /// Processes two complex amplitudes (four f64 lanes) per 256-bit vector and
    /// performs the full complex matrix multiplication in SIMD registers. Falls
    /// back to the scalar path when the target qubit's stride is smaller than the
    /// SIMD chunk, where the paired amplitudes would overlap within one vector.
    #[cfg(target_arch = "x86_64")]
    unsafe fn apply_single_qubit_gate_avx2(
        &self,
        matrix: &[Complex64; 4],
        qubit_mask: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        if !self.simd_features.avx2 {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        let len = state_vector.len();
        let chunk_size = 2; // Two Complex64 values (four f64 lanes) per 256-bit vector

        // The pairs (i, i | qubit_mask) must not overlap within a single chunk.
        if qubit_mask < chunk_size {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        // Multiply a broadcast complex scalar by a vector of two complex amplitudes:
        // (re, im) * (a_re, a_im) = (re*a_re - im*a_im, re*a_im + im*a_re).
        #[inline]
        unsafe fn cmul_broadcast(m: Complex64, v: __m256d) -> __m256d {
            let re = _mm256_set1_pd(m.re);
            let im = _mm256_set1_pd(m.im);
            let swapped = _mm256_permute_pd::<0b0101>(v); // swap re/im within each pair
            _mm256_addsub_pd(_mm256_mul_pd(re, v), _mm256_mul_pd(im, swapped))
        }

        let mut i = 0;
        while i + chunk_size <= len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j + chunk_size <= len {
                    // Load two complex amplitudes (four f64 values) from each half of the pair
                    let amp_0 = _mm256_loadu_pd(state_vector.as_ptr().add(i) as *const f64);
                    let amp_1 = _mm256_loadu_pd(state_vector.as_ptr().add(j) as *const f64);

                    // new_a0 = m00 * a0 + m01 * a1
                    let new_0 = _mm256_add_pd(
                        cmul_broadcast(matrix[0], amp_0),
                        cmul_broadcast(matrix[1], amp_1),
                    );
                    // new_a1 = m10 * a0 + m11 * a1
                    let new_1 = _mm256_add_pd(
                        cmul_broadcast(matrix[2], amp_0),
                        cmul_broadcast(matrix[3], amp_1),
                    );

                    // Store results back
                    _mm256_storeu_pd(state_vector.as_mut_ptr().add(i) as *mut f64, new_0);
                    _mm256_storeu_pd(state_vector.as_mut_ptr().add(j) as *mut f64, new_1);
                }
            }
            i += chunk_size;
        }

        // Handle remaining elements with scalar operations
        while i < len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j < len {
                    let amp_0 = state_vector[i];
                    let amp_1 = state_vector[j];

                    state_vector[i] = matrix[0] * amp_0 + matrix[1] * amp_1;
                    state_vector[j] = matrix[2] * amp_0 + matrix[3] * amp_1;
                }
            }
            i += 1;
        }

        Ok(())
    }


    /// SSE2 single qubit gate application (currently a blocked scalar placeholder)
    #[cfg(target_arch = "x86_64")]
    unsafe fn apply_single_qubit_gate_sse2(
        &self,
        matrix: &[Complex64; 4],
        qubit_mask: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        if !self.simd_features.sse2 {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        // Placeholder path: the loop below is blocked in pairs of amplitudes but
        // still performs the complex arithmetic in scalar form; a true SSE2 kernel
        // would use 128-bit loads and stores.
        let len = state_vector.len();
        let chunk_size = 2;

        // The pairs (i, i | qubit_mask) must not overlap within a single chunk.
        if qubit_mask < chunk_size {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        let mut i = 0;
        while i + chunk_size <= len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j + chunk_size <= len {
                    // Per-pair update; the complex arithmetic here is exact, just not vectorized
                    for k in 0..chunk_size {
                        if (i + k) < len && (j + k) < len {
                            let amp_0 = state_vector[i + k];
                            let amp_1 = state_vector[j + k];

                            state_vector[i + k] = matrix[0] * amp_0 + matrix[1] * amp_1;
                            state_vector[j + k] = matrix[2] * amp_0 + matrix[3] * amp_1;
                        }
                    }
                }
            }
            i += chunk_size;
        }

        // Handle remaining elements
        while i < len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j < len {
                    let amp_0 = state_vector[i];
                    let amp_1 = state_vector[j];

                    state_vector[i] = matrix[0] * amp_0 + matrix[1] * amp_1;
                    state_vector[j] = matrix[2] * amp_0 + matrix[3] * amp_1;
                }
            }
            i += 1;
        }

        Ok(())
    }

    /// Scalar fallback for single qubit gate application
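    ///
    /// Each basis index `i` with the target bit cleared is paired with
    /// `j = i | qubit_mask`, and the 2x2 matrix is applied to the amplitude pair.
    /// For example, `qubit_mask = 0b01` pairs indices (0, 1), (2, 3), (4, 5), ...,
    /// while `qubit_mask = 0b10` pairs (0, 2), (1, 3), (4, 6), ...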
    fn apply_single_qubit_gate_scalar(
        &self,
        matrix: &[Complex64; 4],
        qubit_mask: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        let len = state_vector.len();

        for i in 0..len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j < len {
                    let amp_0 = state_vector[i];
                    let amp_1 = state_vector[j];

                    state_vector[i] = matrix[0] * amp_0 + matrix[1] * amp_1;
                    state_vector[j] = matrix[2] * amp_0 + matrix[3] * amp_1;
                }
            }
        }

        let mut stats = self.statistics.write().unwrap_or_else(|e| e.into_inner());
        stats.operations_scalar += 1;

        Ok(())
    }

    /// Process two-qubit gates applied to multiple qubit pairs
    fn process_two_qubit_broadcast(
        &self,
        matrix: &[Complex64; 16],
        qubit_pairs: &[(usize, usize)],
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        // Two-qubit gates are harder to vectorize; for now, apply the scalar
        // implementation to each pair in turn.
        for &(control, target) in qubit_pairs {
            self.apply_two_qubit_gate_scalar(matrix, control, target, state_vector)?;
        }

        Ok(qubit_pairs.len() as f64)
    }

    /// Scalar two-qubit gate application
    fn apply_two_qubit_gate_scalar(
        &self,
        matrix: &[Complex64; 16],
        control: usize,
        target: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        if control >= 64 || target >= 64 {
            return Err(QuantRS2Error::InvalidQubitId(
                if control >= 64 { control } else { target } as u32,
            ));
        }
        // A two-qubit gate needs two distinct qubits
        if control == target {
            return Err(QuantRS2Error::InvalidQubitId(target as u32));
        }

        let control_mask = 1usize << control;
        let target_mask = 1usize << target;
        let len = state_vector.len();

        // Apply 4x4 unitary matrix to the 4 basis states
        for i in 0..len {
            let control_bit = (i & control_mask) >> control;
            let target_bit = (i & target_mask) >> target;

            if control_bit == 0 && target_bit == 0 {
                // Representative index with both bits clear: apply the full 4x4 matrix to this block
                let j00 = i;
                let j01 = i | target_mask;
                let j10 = i | control_mask;
                let j11 = i | control_mask | target_mask;

                if j00 < len && j01 < len && j10 < len && j11 < len {
                    let amp_00 = state_vector[j00];
                    let amp_01 = state_vector[j01];
                    let amp_10 = state_vector[j10];
                    let amp_11 = state_vector[j11];

                    state_vector[j00] = matrix[0] * amp_00
                        + matrix[1] * amp_01
                        + matrix[2] * amp_10
                        + matrix[3] * amp_11;
                    state_vector[j01] = matrix[4] * amp_00
                        + matrix[5] * amp_01
                        + matrix[6] * amp_10
                        + matrix[7] * amp_11;
                    state_vector[j10] = matrix[8] * amp_00
                        + matrix[9] * amp_01
                        + matrix[10] * amp_10
                        + matrix[11] * amp_11;
                    state_vector[j11] = matrix[12] * amp_00
                        + matrix[13] * amp_01
                        + matrix[14] * amp_10
                        + matrix[15] * amp_11;
                }
            }
        }

        Ok(())
    }

    /// Process state vector operations with SIMD optimization
    fn process_state_vector_operation(
        &self,
        operation_type: &StateVectorOpType,
        chunk_size: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        // Clamp to the processor's optimal chunk size and guard against a zero
        // chunk size, which would panic in `chunks()`.
        let effective_chunk_size = chunk_size.min(self.optimal_chunk_size).max(1);

        match operation_type {
            StateVectorOpType::Normalization => {
                self.simd_normalize(state_vector, effective_chunk_size)
            }
            StateVectorOpType::ProbabilityCalculation => {
                self.simd_probabilities(state_vector, effective_chunk_size)
            }
            StateVectorOpType::PhaseRotation => {
                // Default angle of 0.0 (identity rotation); a real rotation angle
                // would need to be carried by the operation itself.
                self.simd_phase_rotation(state_vector, effective_chunk_size, 0.0)
            }
            _ => {
                // Other operations not yet implemented with SIMD
                Ok(1.0)
            }
        }
    }

    /// Chunked state vector normalization (cache-friendly; relies on auto-vectorization)
    fn simd_normalize(
        &self,
        state_vector: &mut [Complex64],
        chunk_size: usize,
    ) -> QuantRS2Result<f64> {
        // Accumulate the squared norm over cache-sized chunks
        let mut norm_squared = 0.0;

        for chunk in state_vector.chunks(chunk_size) {
            for &amplitude in chunk {
                norm_squared += amplitude.norm_sqr();
            }
        }

        let norm = norm_squared.sqrt();
        if norm > 0.0 {
            let inv_norm = 1.0 / norm;

            // Rescale every amplitude chunk by chunk
            for chunk in state_vector.chunks_mut(chunk_size) {
                for amplitude in chunk {
                    *amplitude *= inv_norm;
                }
            }
        }

        Ok(self.simd_features.vector_width as f64)
    }

    /// Chunked probability calculation (placeholder)
    fn simd_probabilities(
        &self,
        state_vector: &[Complex64],
        chunk_size: usize,
    ) -> QuantRS2Result<f64> {
        // A full implementation would return the per-basis-state probabilities;
        // for now the total probability is accumulated and discarded.
        let mut _total_prob = 0.0;

        for chunk in state_vector.chunks(chunk_size) {
            for &amplitude in chunk {
                _total_prob += amplitude.norm_sqr();
            }
        }

        Ok(self.simd_features.vector_width as f64)
    }

    /// Chunked global phase rotation: multiplies every amplitude by e^(i·angle)
    fn simd_phase_rotation(
        &self,
        state_vector: &mut [Complex64],
        chunk_size: usize,
        angle: f64,
    ) -> QuantRS2Result<f64> {
        let phase = Complex64::from_polar(1.0, angle);

        for chunk in state_vector.chunks_mut(chunk_size) {
            for amplitude in chunk {
                *amplitude *= phase;
            }
        }

        Ok(self.simd_features.vector_width as f64)
    }

    /// Process batch measurements (placeholder)
    fn process_batch_measurements(
        &self,
        qubit_indices: &[usize],
        sample_count: usize,
        _state_vector: &[Complex64],
    ) -> QuantRS2Result<f64> {
        // Batch measurement sampling would be implemented here.
        // For now, return a rough speedup estimate proportional to the workload.
        Ok(qubit_indices.len() as f64 * sample_count as f64 / 1000.0)
    }

    /// Get SIMD processor statistics
    pub fn get_statistics(&self) -> SIMDStatistics {
        self.statistics
            .read()
            .unwrap_or_else(|e| e.into_inner())
            .clone()
    }

    /// Get global SIMD statistics
    pub fn get_global_statistics() -> SIMDStatistics {
        if let Some(processor) = GLOBAL_SIMD_PROCESSOR.get() {
            processor.get_statistics()
        } else {
            SIMDStatistics::default()
        }
    }

    /// Reset statistics
    pub fn reset_statistics(&self) {
        let mut stats = self.statistics.write().unwrap_or_else(|e| e.into_inner());
        *stats = SIMDStatistics::default();
    }

    /// Get optimal chunk size for SIMD operations
    pub const fn get_optimal_chunk_size(&self) -> usize {
        self.optimal_chunk_size
    }

    /// Get SIMD feature information
    pub const fn get_simd_features(&self) -> &SIMDFeatures {
        &self.simd_features
    }
}

impl Default for SIMDGateProcessor {
    fn default() -> Self {
        Self::new()
    }
}

/// Global SIMD processor instance
static GLOBAL_SIMD_PROCESSOR: OnceLock<SIMDGateProcessor> = OnceLock::new();

/// Get the global SIMD processor
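///
/// A quick access sketch (doc-test marked `ignore`; crate paths omitted):
///
/// ```ignore
/// let processor = get_global_simd_processor();
/// assert!(processor.get_simd_features().vector_width >= 1);
/// ```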
pub fn get_global_simd_processor() -> &'static SIMDGateProcessor {
    GLOBAL_SIMD_PROCESSOR.get_or_init(SIMDGateProcessor::new)
}

/// Process gates with SIMD optimization
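///
/// A batch sketch (doc-test marked `ignore`; imports and crate paths omitted):
///
/// ```ignore
/// let mut state = vec![Complex64::new(0.5, 0.0); 16];
/// let ops = vec![
///     VectorizedOperation::StateVectorOps {
///         operation_type: StateVectorOpType::Normalization,
///         chunk_size: 8,
///     },
///     VectorizedOperation::StateVectorOps {
///         operation_type: StateVectorOpType::PhaseRotation,
///         chunk_size: 8,
///     },
/// ];
/// let speedups = process_gates_simd(ops, &mut state).expect("SIMD batch failed");
/// assert_eq!(speedups.len(), 2);
/// ```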
pub fn process_gates_simd(
    operations: Vec<VectorizedOperation>,
    state_vector: &mut [Complex64],
) -> QuantRS2Result<Vec<f64>> {
    let processor = get_global_simd_processor();
    let mut speedups = Vec::new();

    for operation in &operations {
        let speedup = processor.process_vectorized_operation(operation, state_vector)?;
        speedups.push(speedup);
    }

    Ok(speedups)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simd_processor_creation() {
        let processor = SIMDGateProcessor::new();
        assert!(processor.get_optimal_chunk_size() > 0);
        assert!(processor.get_simd_features().vector_width >= 1);
    }

    #[test]
    fn test_simd_feature_detection() {
        let features = SIMDGateProcessor::detect_simd_features();
        assert!(features.vector_width >= 1);
        assert!(features.vector_width <= 16); // Reasonable upper bound
    }

    #[test]
    fn test_single_qubit_gate_application() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        // Pauli-X matrix
        let pauli_x = [
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        let result = processor.apply_single_qubit_gate_scalar(&pauli_x, 1, &mut state_vector);
        assert!(result.is_ok());

        // After applying X gate to qubit 0, state should be |01⟩
        assert!((state_vector[0] - Complex64::new(0.0, 0.0)).norm() < 1e-10);
        assert!((state_vector[1] - Complex64::new(1.0, 0.0)).norm() < 1e-10);
    }

    #[test]
    fn test_vectorized_operation_processing() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![Complex64::new(0.5, 0.0); 16];

        let operation = VectorizedOperation::StateVectorOps {
            operation_type: StateVectorOpType::Normalization,
            chunk_size: 4,
        };

        let result = processor.process_vectorized_operation(&operation, &mut state_vector);
        assert!(result.is_ok());

        // Check that normalization worked
        let norm_squared: f64 = state_vector.iter().map(|c| c.norm_sqr()).sum();
        assert!((norm_squared - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_chunk_size_calculation() {
        let features = SIMDFeatures {
            vector_width: 4,
            ..Default::default()
        };

        let chunk_size = SIMDGateProcessor::calculate_optimal_chunk_size(&features);
        assert!(chunk_size > 0);
        assert!(chunk_size % features.vector_width == 0);
    }

    #[test]
    fn test_gcd_lcm_helpers() {
        assert_eq!(SIMDGateProcessor::gcd(12, 8), 4);
        assert_eq!(SIMDGateProcessor::lcm(4, 6), 12);
        assert_eq!(SIMDGateProcessor::gcd(17, 13), 1); // Coprime numbers
    }

    #[test]
    fn test_statistics_tracking() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![Complex64::new(0.5, 0.0); 8];

        let operation = VectorizedOperation::StateVectorOps {
            operation_type: StateVectorOpType::ProbabilityCalculation,
            chunk_size: 4,
        };

        let _ = processor.process_vectorized_operation(&operation, &mut state_vector);

        let stats = processor.get_statistics();
        assert!(stats.operations_vectorized > 0);
        assert!(stats.total_elements_processed > 0);
    }

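    // A small additional check of the private phase-rotation helper (a sketch
    // covering only the global-phase behaviour).
    #[test]
    fn test_phase_rotation() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0)];

        // A global phase of pi/2 multiplies every amplitude by i.
        let result =
            processor.simd_phase_rotation(&mut state_vector, 2, std::f64::consts::FRAC_PI_2);
        assert!(result.is_ok());
        assert!((state_vector[0] - Complex64::new(0.0, 1.0)).norm() < 1e-10);
    }
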
    #[test]
    fn test_global_simd_processor() {
        let processor1 = get_global_simd_processor();
        let processor2 = get_global_simd_processor();

        // Should be the same instance
        assert!(std::ptr::eq(processor1, processor2));
        assert!(processor1.get_optimal_chunk_size() > 0);
    }
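
    // A CNOT round-trip through the scalar two-qubit path (a sketch; the matrix
    // layout follows the row-major (|00⟩, |01⟩, |10⟩, |11⟩) ordering used by
    // `apply_two_qubit_gate_scalar`, with the control as the high bit).
    #[test]
    fn test_two_qubit_gate_application() {
        let processor = SIMDGateProcessor::new();

        // Start in |10⟩ (control = qubit 1 set, target = qubit 0 clear): index 2.
        let mut state_vector = vec![
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        let mut cnot = [Complex64::new(0.0, 0.0); 16];
        cnot[0] = Complex64::new(1.0, 0.0); // ⟨00|CNOT|00⟩
        cnot[5] = Complex64::new(1.0, 0.0); // ⟨01|CNOT|01⟩
        cnot[11] = Complex64::new(1.0, 0.0); // ⟨10|CNOT|11⟩
        cnot[14] = Complex64::new(1.0, 0.0); // ⟨11|CNOT|10⟩

        let result = processor.apply_two_qubit_gate_scalar(&cnot, 1, 0, &mut state_vector);
        assert!(result.is_ok());

        // The control qubit is set, so the target flips: |10⟩ becomes |11⟩ (index 3).
        assert!(state_vector[2].norm() < 1e-10);
        assert!((state_vector[3] - Complex64::new(1.0, 0.0)).norm() < 1e-10);
    }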
}