use crate::error::{QuantRS2Error, QuantRS2Result};
use scirs2_core::Complex64;
use std::sync::{Arc, OnceLock, RwLock};

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// A batch of gate applications or state-vector primitives that can be
/// dispatched through the SIMD backend.
#[derive(Debug, Clone, PartialEq)]
pub enum VectorizedOperation {
    /// Apply the same 2x2 gate (row-major: [m00, m01, m10, m11]) to several target qubits.
    SingleQubitBroadcast {
        matrix: [Complex64; 4],
        target_qubits: Vec<usize>,
    },
    /// Apply the same 4x4 gate (row-major) to several (control, target) pairs.
    TwoQubitBroadcast {
        matrix: [Complex64; 16],
        qubit_pairs: Vec<(usize, usize)>,
    },
    /// Run an elementwise primitive over the state vector in chunks.
    StateVectorOps {
        operation_type: StateVectorOpType,
        chunk_size: usize,
    },
    /// Sample a set of qubits repeatedly.
    BatchMeasurements {
        qubit_indices: Vec<usize>,
        sample_count: usize,
    },
}

/// Elementwise state-vector primitives that can be chunked for SIMD execution.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum StateVectorOpType {
    Normalization,
    ProbabilityCalculation,
    ExpectationValue,
    InnerProduct,
    ElementwiseMultiply,
    PhaseRotation,
}

/// Applies quantum gates with runtime-detected SIMD instructions, falling
/// back to scalar loops when vector extensions are unavailable.
pub struct SIMDGateProcessor {
    /// CPU vector extensions detected at construction time.
    simd_features: SIMDFeatures,
    /// Shared counters describing how operations were dispatched.
    statistics: Arc<RwLock<SIMDStatistics>>,
    /// Preferred chunk length, aligned to the vector width and cache lines.
    optimal_chunk_size: usize,
}

/// CPU SIMD capabilities relevant to complex f64 arithmetic.
#[derive(Debug, Clone, Default)]
pub struct SIMDFeatures {
    pub sse: bool,
    pub sse2: bool,
    pub sse3: bool,
    pub ssse3: bool,
    pub sse4_1: bool,
    pub sse4_2: bool,
    pub avx: bool,
    pub avx2: bool,
    pub avx512f: bool,
    pub fma: bool,
    /// Number of f64 lanes per vector register (1 when scalar).
    pub vector_width: usize,
}

#[derive(Debug, Clone, Default)]
pub struct SIMDStatistics {
    pub operations_vectorized: u64,
    pub operations_scalar: u64,
    pub total_elements_processed: u64,
    pub average_speedup: f64,
    pub memory_bandwidth_utilization: f64,
    pub cache_efficiency: f64,
}

impl SIMDGateProcessor {
    pub fn new() -> Self {
        let features = Self::detect_simd_features();
        let optimal_chunk_size = Self::calculate_optimal_chunk_size(&features);

        Self {
            simd_features: features,
            statistics: Arc::new(RwLock::new(SIMDStatistics::default())),
            optimal_chunk_size,
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn detect_simd_features() -> SIMDFeatures {
        let mut features = SIMDFeatures::default();

        features.sse = std::arch::is_x86_feature_detected!("sse");
        features.sse2 = std::arch::is_x86_feature_detected!("sse2");
        features.sse3 = std::arch::is_x86_feature_detected!("sse3");
        features.ssse3 = std::arch::is_x86_feature_detected!("ssse3");
        features.sse4_1 = std::arch::is_x86_feature_detected!("sse4.1");
        features.sse4_2 = std::arch::is_x86_feature_detected!("sse4.2");
        features.avx = std::arch::is_x86_feature_detected!("avx");
        features.avx2 = std::arch::is_x86_feature_detected!("avx2");
        features.avx512f = std::arch::is_x86_feature_detected!("avx512f");
        features.fma = std::arch::is_x86_feature_detected!("fma");

        // f64 lanes per register: AVX-512 = 8, AVX2 = 4, SSE2 = 2, scalar = 1.
        features.vector_width = if features.avx512f {
            8
        } else if features.avx2 {
            4
        } else if features.sse2 {
            2
        } else {
            1
        };

        features
    }

    #[cfg(not(target_arch = "x86_64"))]
    fn detect_simd_features() -> SIMDFeatures {
        SIMDFeatures {
            // Conservative default for 128-bit vector units on other architectures.
            vector_width: 2,
            ..Default::default()
        }
    }

    fn calculate_optimal_chunk_size(features: &SIMDFeatures) -> usize {
        // Enough lanes per chunk to amortize loop overhead.
        let base_chunk = features.vector_width * 64;
        // Complex64 elements per 64-byte cache line.
        let cache_line_elements = 64 / std::mem::size_of::<Complex64>();

        // Round the chunk down to a multiple of both quantities.
        let lcm = Self::lcm(features.vector_width, cache_line_elements);
        (base_chunk / lcm) * lcm
    }

    fn lcm(a: usize, b: usize) -> usize {
        a * b / Self::gcd(a, b)
    }

    fn gcd(a: usize, b: usize) -> usize {
        if b == 0 {
            a
        } else {
            Self::gcd(b, a % b)
        }
    }

    /// Dispatches a [`VectorizedOperation`] over `state_vector` and returns an
    /// estimated speedup factor relative to a scalar implementation.
    pub fn process_vectorized_operation(
        &self,
        operation: &VectorizedOperation,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        let start_time = std::time::Instant::now();

        let speedup = match operation {
            VectorizedOperation::SingleQubitBroadcast {
                matrix,
                target_qubits,
            } => self.process_single_qubit_broadcast(matrix, target_qubits, state_vector)?,
            VectorizedOperation::TwoQubitBroadcast {
                matrix,
                qubit_pairs,
            } => self.process_two_qubit_broadcast(matrix, qubit_pairs, state_vector)?,
            VectorizedOperation::StateVectorOps {
                operation_type,
                chunk_size,
            } => self.process_state_vector_operation(operation_type, *chunk_size, state_vector)?,
            VectorizedOperation::BatchMeasurements {
                qubit_indices,
                sample_count,
            } => self.process_batch_measurements(qubit_indices, *sample_count, state_vector)?,
        };

        let mut stats = self.statistics.write().unwrap_or_else(|e| e.into_inner());
        stats.operations_vectorized += 1;
        stats.total_elements_processed += state_vector.len() as u64;

        // Elapsed time is measured but not yet folded into the statistics.
        let _elapsed = start_time.elapsed().as_nanos() as f64;

        // Running mean of the per-operation speedup estimates.
        stats.average_speedup = stats
            .average_speedup
            .mul_add((stats.operations_vectorized - 1) as f64, speedup)
            / stats.operations_vectorized as f64;

        Ok(speedup)
    }

    fn process_single_qubit_broadcast(
        &self,
        matrix: &[Complex64; 4],
        target_qubits: &[usize],
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        let speedup_factor = target_qubits.len() as f64;

        for &qubit in target_qubits {
            if qubit >= 64 {
                return Err(QuantRS2Error::InvalidQubitId(qubit as u32));
            }

            let qubit_mask = 1usize << qubit;

            // Dispatch to the widest kernel the CPU supports.
            #[cfg(target_arch = "x86_64")]
            unsafe {
                if self.simd_features.avx2 {
                    self.apply_single_qubit_gate_avx2(matrix, qubit_mask, state_vector)?;
                } else if self.simd_features.sse2 {
                    self.apply_single_qubit_gate_sse2(matrix, qubit_mask, state_vector)?;
                } else {
                    self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector)?;
                }
            }

            #[cfg(not(target_arch = "x86_64"))]
            self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector)?;
        }

        Ok(speedup_factor * self.simd_features.vector_width as f64)
    }

    /// AVX2 kernel: processes two Complex64 amplitudes per 256-bit register.
    #[cfg(target_arch = "x86_64")]
    unsafe fn apply_single_qubit_gate_avx2(
        &self,
        matrix: &[Complex64; 4],
        qubit_mask: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        if !self.simd_features.avx2 {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        // One register holds two interleaved Complex64 values (re, im, re, im).
        const CHUNK: usize = 2;

        // The vector path loads CHUNK consecutive amplitudes at once, so the
        // target bit must be constant within a chunk; for low qubits fall back
        // to the scalar kernel.
        if qubit_mask < CHUNK {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        // Multiply a broadcast complex scalar (re/im lanes) by two interleaved
        // complex values: addsub(re * a, im * swap(a)) yields
        // (re*a.re - im*a.im, re*a.im + im*a.re) in each 128-bit half.
        unsafe fn cmul(re: __m256d, im: __m256d, a: __m256d) -> __m256d {
            let swapped = _mm256_permute_pd(a, 0b0101);
            _mm256_addsub_pd(_mm256_mul_pd(re, a), _mm256_mul_pd(im, swapped))
        }

        let m00_re = _mm256_set1_pd(matrix[0].re);
        let m00_im = _mm256_set1_pd(matrix[0].im);
        let m01_re = _mm256_set1_pd(matrix[1].re);
        let m01_im = _mm256_set1_pd(matrix[1].im);
        let m10_re = _mm256_set1_pd(matrix[2].re);
        let m10_im = _mm256_set1_pd(matrix[2].im);
        let m11_re = _mm256_set1_pd(matrix[3].re);
        let m11_im = _mm256_set1_pd(matrix[3].im);

        let len = state_vector.len();
        let mut i = 0;
        while i + CHUNK <= len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j + CHUNK <= len {
                    let a0 = _mm256_loadu_pd(state_vector.as_ptr().add(i) as *const f64);
                    let a1 = _mm256_loadu_pd(state_vector.as_ptr().add(j) as *const f64);

                    // |i> <- m00*a0 + m01*a1, |j> <- m10*a0 + m11*a1
                    let r0 = _mm256_add_pd(cmul(m00_re, m00_im, a0), cmul(m01_re, m01_im, a1));
                    let r1 = _mm256_add_pd(cmul(m10_re, m10_im, a0), cmul(m11_re, m11_im, a1));

                    _mm256_storeu_pd(state_vector.as_mut_ptr().add(i) as *mut f64, r0);
                    _mm256_storeu_pd(state_vector.as_mut_ptr().add(j) as *mut f64, r1);
                }
            }
            i += CHUNK;
        }

        // Scalar tail for any remaining amplitudes.
        while i < len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j < len {
                    let amp_0 = state_vector[i];
                    let amp_1 = state_vector[j];

                    state_vector[i] = matrix[0] * amp_0 + matrix[1] * amp_1;
                    state_vector[j] = matrix[2] * amp_0 + matrix[3] * amp_1;
                }
            }
            i += 1;
        }

        Ok(())
    }

    /// SSE2 path: one 128-bit register holds a single Complex64, so this
    /// variant mainly provides chunked, cache-friendly traversal rather than
    /// explicit intrinsics.
    #[cfg(target_arch = "x86_64")]
    unsafe fn apply_single_qubit_gate_sse2(
        &self,
        matrix: &[Complex64; 4],
        qubit_mask: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        if !self.simd_features.sse2 {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        let len = state_vector.len();
        let chunk_size = 2;

        // Chunked processing is only valid when the target bit is constant
        // across a chunk; for low qubits use the plain scalar kernel.
        if qubit_mask < chunk_size {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        let mut i = 0;
        while i + chunk_size <= len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j + chunk_size <= len {
                    for k in 0..chunk_size {
                        let amp_0 = state_vector[i + k];
                        let amp_1 = state_vector[j + k];

                        state_vector[i + k] = matrix[0] * amp_0 + matrix[1] * amp_1;
                        state_vector[j + k] = matrix[2] * amp_0 + matrix[3] * amp_1;
                    }
                }
            }
            i += chunk_size;
        }

        // Scalar tail for any remaining amplitudes.
        while i < len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j < len {
                    let amp_0 = state_vector[i];
                    let amp_1 = state_vector[j];

                    state_vector[i] = matrix[0] * amp_0 + matrix[1] * amp_1;
                    state_vector[j] = matrix[2] * amp_0 + matrix[3] * amp_1;
                }
            }
            i += 1;
        }

        Ok(())
    }

    /// Portable scalar kernel; also the fallback for narrow qubit masks.
    fn apply_single_qubit_gate_scalar(
        &self,
        matrix: &[Complex64; 4],
        qubit_mask: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        let len = state_vector.len();

        for i in 0..len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j < len {
                    let amp_0 = state_vector[i];
                    let amp_1 = state_vector[j];

                    state_vector[i] = matrix[0] * amp_0 + matrix[1] * amp_1;
                    state_vector[j] = matrix[2] * amp_0 + matrix[3] * amp_1;
                }
            }
        }

        let mut stats = self.statistics.write().unwrap_or_else(|e| e.into_inner());
        stats.operations_scalar += 1;

        Ok(())
    }

    fn process_two_qubit_broadcast(
        &self,
        matrix: &[Complex64; 16],
        qubit_pairs: &[(usize, usize)],
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        // Two-qubit gates are not vectorized yet; apply the scalar kernel per pair.
        for &(control, target) in qubit_pairs {
            self.apply_two_qubit_gate_scalar(matrix, control, target, state_vector)?;
        }

        Ok(qubit_pairs.len() as f64)
    }

    fn apply_two_qubit_gate_scalar(
        &self,
        matrix: &[Complex64; 16],
        control: usize,
        target: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        if control >= 64 || target >= 64 {
            return Err(QuantRS2Error::InvalidQubitId(
                if control >= 64 { control } else { target } as u32,
            ));
        }

        let control_mask = 1usize << control;
        let target_mask = 1usize << target;
        let len = state_vector.len();

        // Visit each 4-amplitude subspace exactly once, anchored at the index
        // where both the control and target bits are clear.
        for i in 0..len {
            let control_bit = (i & control_mask) >> control;
            let target_bit = (i & target_mask) >> target;

            if control_bit == 0 && target_bit == 0 {
                let j00 = i;
                let j01 = i | target_mask;
                let j10 = i | control_mask;
                let j11 = i | control_mask | target_mask;

                if j00 < len && j01 < len && j10 < len && j11 < len {
                    let amp_00 = state_vector[j00];
                    let amp_01 = state_vector[j01];
                    let amp_10 = state_vector[j10];
                    let amp_11 = state_vector[j11];

                    state_vector[j00] = matrix[0] * amp_00
                        + matrix[1] * amp_01
                        + matrix[2] * amp_10
                        + matrix[3] * amp_11;
                    state_vector[j01] = matrix[4] * amp_00
                        + matrix[5] * amp_01
                        + matrix[6] * amp_10
                        + matrix[7] * amp_11;
                    state_vector[j10] = matrix[8] * amp_00
                        + matrix[9] * amp_01
                        + matrix[10] * amp_10
                        + matrix[11] * amp_11;
                    state_vector[j11] = matrix[12] * amp_00
                        + matrix[13] * amp_01
                        + matrix[14] * amp_10
                        + matrix[15] * amp_11;
                }
            }
        }

        Ok(())
    }

    fn process_state_vector_operation(
        &self,
        operation_type: &StateVectorOpType,
        chunk_size: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        let effective_chunk_size = chunk_size.min(self.optimal_chunk_size);

        match operation_type {
            StateVectorOpType::Normalization => {
                self.simd_normalize(state_vector, effective_chunk_size)
            }
            StateVectorOpType::ProbabilityCalculation => {
                self.simd_probabilities(state_vector, effective_chunk_size)
            }
            StateVectorOpType::PhaseRotation => {
                self.simd_phase_rotation(state_vector, effective_chunk_size, 0.0)
            }
            // Remaining operation types are not vectorized yet; report no speedup.
            _ => Ok(1.0),
        }
    }

    fn simd_normalize(
        &self,
        state_vector: &mut [Complex64],
        chunk_size: usize,
    ) -> QuantRS2Result<f64> {
        // Accumulate |amplitude|^2 chunk by chunk.
        let mut norm_squared = 0.0;

        for chunk in state_vector.chunks(chunk_size) {
            for &amplitude in chunk {
                norm_squared += amplitude.norm_sqr();
            }
        }

        let norm = norm_squared.sqrt();
        if norm > 0.0 {
            let inv_norm = 1.0 / norm;

            for chunk in state_vector.chunks_mut(chunk_size) {
                for amplitude in chunk {
                    *amplitude *= inv_norm;
                }
            }
        }

        Ok(self.simd_features.vector_width as f64)
    }

    fn simd_probabilities(
        &self,
        state_vector: &[Complex64],
        chunk_size: usize,
    ) -> QuantRS2Result<f64> {
        // The accumulated total probability is currently unused.
        let mut _total_prob = 0.0;

        for chunk in state_vector.chunks(chunk_size) {
            for &amplitude in chunk {
                _total_prob += amplitude.norm_sqr();
            }
        }

        Ok(self.simd_features.vector_width as f64)
    }

    fn simd_phase_rotation(
        &self,
        state_vector: &mut [Complex64],
        chunk_size: usize,
        angle: f64,
    ) -> QuantRS2Result<f64> {
        // Multiply every amplitude by e^{i * angle}.
        let phase = Complex64::from_polar(1.0, angle);

        for chunk in state_vector.chunks_mut(chunk_size) {
            for amplitude in chunk {
                *amplitude *= phase;
            }
        }

        Ok(self.simd_features.vector_width as f64)
    }

    fn process_batch_measurements(
        &self,
        qubit_indices: &[usize],
        sample_count: usize,
        _state_vector: &[Complex64],
    ) -> QuantRS2Result<f64> {
        // Placeholder: measurement sampling is not vectorized yet; return a
        // rough throughput estimate so callers still get a speedup figure.
        Ok(qubit_indices.len() as f64 * sample_count as f64 / 1000.0)
    }

    /// Returns a snapshot of the accumulated dispatch statistics.
    pub fn get_statistics(&self) -> SIMDStatistics {
        self.statistics
            .read()
            .unwrap_or_else(|e| e.into_inner())
            .clone()
    }

    pub fn get_global_statistics() -> SIMDStatistics {
        if let Some(processor) = GLOBAL_SIMD_PROCESSOR.get() {
            processor.get_statistics()
        } else {
            SIMDStatistics::default()
        }
    }

    pub fn reset_statistics(&self) {
        let mut stats = self.statistics.write().unwrap_or_else(|e| e.into_inner());
        *stats = SIMDStatistics::default();
    }

    pub const fn get_optimal_chunk_size(&self) -> usize {
        self.optimal_chunk_size
    }

    pub const fn get_simd_features(&self) -> &SIMDFeatures {
        &self.simd_features
    }
}

impl Default for SIMDGateProcessor {
    fn default() -> Self {
        Self::new()
    }
}

/// Process-wide SIMD processor, initialized lazily on first use.
static GLOBAL_SIMD_PROCESSOR: OnceLock<SIMDGateProcessor> = OnceLock::new();

pub fn get_global_simd_processor() -> &'static SIMDGateProcessor {
    GLOBAL_SIMD_PROCESSOR.get_or_init(SIMDGateProcessor::new)
}

/// Runs a batch of operations through the global processor and returns the
/// estimated speedup for each, in order.
pub fn process_gates_simd(
    operations: Vec<VectorizedOperation>,
    state_vector: &mut [Complex64],
) -> QuantRS2Result<Vec<f64>> {
    let processor = get_global_simd_processor();
    let mut speedups = Vec::with_capacity(operations.len());

    for operation in &operations {
        let speedup = processor.process_vectorized_operation(operation, state_vector)?;
        speedups.push(speedup);
    }

    Ok(speedups)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simd_processor_creation() {
        let processor = SIMDGateProcessor::new();
        assert!(processor.get_optimal_chunk_size() > 0);
        assert!(processor.get_simd_features().vector_width >= 1);
    }

    #[test]
    fn test_simd_feature_detection() {
        let features = SIMDGateProcessor::detect_simd_features();
        assert!(features.vector_width >= 1);
        // Sanity upper bound: no supported extension exceeds 16 f64 lanes.
        assert!(features.vector_width <= 16);
    }

    #[test]
    fn test_single_qubit_gate_application() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        // Pauli-X in row-major order: [[0, 1], [1, 0]].
        let pauli_x = [
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        // qubit_mask = 1 targets qubit 0: |00> should become |01>.
        let result = processor.apply_single_qubit_gate_scalar(&pauli_x, 1, &mut state_vector);
        assert!(result.is_ok());

        assert!((state_vector[0] - Complex64::new(0.0, 0.0)).norm() < 1e-10);
        assert!((state_vector[1] - Complex64::new(1.0, 0.0)).norm() < 1e-10);
    }
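
    // An extra invariant sketch (not part of the original suite): Pauli-X is
    // its own inverse, so applying the scalar kernel twice must restore the
    // initial state.
    #[test]
    fn test_single_qubit_gate_involution() {
        let processor = SIMDGateProcessor::new();
        let initial = vec![
            Complex64::new(0.6, 0.0),
            Complex64::new(0.8, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];
        let mut state_vector = initial.clone();

        let pauli_x = [
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        processor
            .apply_single_qubit_gate_scalar(&pauli_x, 1, &mut state_vector)
            .unwrap();
        processor
            .apply_single_qubit_gate_scalar(&pauli_x, 1, &mut state_vector)
            .unwrap();

        for (a, b) in state_vector.iter().zip(initial.iter()) {
            assert!((*a - *b).norm() < 1e-10);
        }
    }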

    #[test]
    fn test_vectorized_operation_processing() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![Complex64::new(0.5, 0.0); 16];

        let operation = VectorizedOperation::StateVectorOps {
            operation_type: StateVectorOpType::Normalization,
            chunk_size: 4,
        };

        let result = processor.process_vectorized_operation(&operation, &mut state_vector);
        assert!(result.is_ok());

        // After normalization the total probability must be 1.
        let norm_squared: f64 = state_vector.iter().map(|c| c.norm_sqr()).sum();
        assert!((norm_squared - 1.0).abs() < 1e-10);
    }
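
    // A sketch of the phase-rotation path: the dispatcher currently applies a
    // fixed angle of 0.0, so the operation must leave the state unchanged.
    #[test]
    fn test_phase_rotation_identity() {
        let processor = SIMDGateProcessor::new();
        let initial = vec![Complex64::new(0.25, 0.25); 8];
        let mut state_vector = initial.clone();

        let operation = VectorizedOperation::StateVectorOps {
            operation_type: StateVectorOpType::PhaseRotation,
            chunk_size: 4,
        };

        let result = processor.process_vectorized_operation(&operation, &mut state_vector);
        assert!(result.is_ok());

        for (a, b) in state_vector.iter().zip(initial.iter()) {
            assert!((*a - *b).norm() < 1e-10);
        }
    }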

    #[test]
    fn test_chunk_size_calculation() {
        let features = SIMDFeatures {
            vector_width: 4,
            ..Default::default()
        };

        let chunk_size = SIMDGateProcessor::calculate_optimal_chunk_size(&features);
        assert!(chunk_size > 0);
        assert!(chunk_size % features.vector_width == 0);
    }
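
    // A small extra check (a sketch, not in the original suite): by
    // construction the chunk is a multiple of the lcm of the vector width and
    // the cache-line element count (64 / size_of::<Complex64>() = 4), so it
    // must also divide evenly into cache lines.
    #[test]
    fn test_chunk_size_cache_alignment() {
        let features = SIMDFeatures {
            vector_width: 8,
            ..Default::default()
        };

        let chunk_size = SIMDGateProcessor::calculate_optimal_chunk_size(&features);
        let cache_line_elements = 64 / std::mem::size_of::<Complex64>();
        assert_eq!(chunk_size % cache_line_elements, 0);
    }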

    #[test]
    fn test_gcd_lcm_helpers() {
        assert_eq!(SIMDGateProcessor::gcd(12, 8), 4);
        assert_eq!(SIMDGateProcessor::lcm(4, 6), 12);
        assert_eq!(SIMDGateProcessor::gcd(17, 13), 1);
    }

    #[test]
    fn test_statistics_tracking() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![Complex64::new(0.5, 0.0); 8];

        let operation = VectorizedOperation::StateVectorOps {
            operation_type: StateVectorOpType::ProbabilityCalculation,
            chunk_size: 4,
        };

        let _ = processor.process_vectorized_operation(&operation, &mut state_vector);

        let stats = processor.get_statistics();
        assert!(stats.operations_vectorized > 0);
        assert!(stats.total_elements_processed > 0);
    }

    #[test]
    fn test_global_simd_processor() {
        let processor1 = get_global_simd_processor();
        let processor2 = get_global_simd_processor();

        // Both calls must observe the same lazily-initialized instance.
        assert!(std::ptr::eq(processor1, processor2));
        assert!(processor1.get_optimal_chunk_size() > 0);
    }
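
    // A sketch of the batch entry point: process_gates_simd should return one
    // speedup estimate per submitted operation, each at least 1.0 here since
    // both state-vector ops report the detected vector width.
    #[test]
    fn test_process_gates_simd_batch() {
        let mut state_vector = vec![Complex64::new(0.5, 0.0); 16];

        let operations = vec![
            VectorizedOperation::StateVectorOps {
                operation_type: StateVectorOpType::Normalization,
                chunk_size: 4,
            },
            VectorizedOperation::StateVectorOps {
                operation_type: StateVectorOpType::ProbabilityCalculation,
                chunk_size: 4,
            },
        ];

        let speedups = process_gates_simd(operations, &mut state_vector).unwrap();
        assert_eq!(speedups.len(), 2);
        assert!(speedups.iter().all(|&s| s >= 1.0));
    }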
}