use crate::error::{QuantRS2Error, QuantRS2Result};
use scirs2_core::Complex64;
use std::sync::{Arc, OnceLock, RwLock};

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

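/// A unit of work that the SIMD processor can dispatch over a state vector:
/// broadcasting a 2x2 or 4x4 gate across many qubits or qubit pairs, running
/// a whole-state-vector operation in chunks, or sampling batched measurements.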
#[derive(Debug, Clone, PartialEq)]
pub enum VectorizedOperation {
    SingleQubitBroadcast {
        matrix: [Complex64; 4],
        target_qubits: Vec<usize>,
    },
    TwoQubitBroadcast {
        matrix: [Complex64; 16],
        qubit_pairs: Vec<(usize, usize)>,
    },
    StateVectorOps {
        operation_type: StateVectorOpType,
        chunk_size: usize,
    },
    BatchMeasurements {
        qubit_indices: Vec<usize>,
        sample_count: usize,
    },
}

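/// Whole-state-vector operations that can be processed in SIMD-friendly chunks.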
#[derive(Debug, Clone, PartialEq)]
pub enum StateVectorOpType {
    Normalization,
    ProbabilityCalculation,
    ExpectationValue,
    InnerProduct,
    ElementwiseMultiply,
    PhaseRotation,
}

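/// Applies vectorized gate and state-vector operations, choosing between
/// AVX2, SSE2, and scalar kernels based on the CPU features detected at
/// construction time, and tracks per-processor statistics.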
pub struct SIMDGateProcessor {
    simd_features: SIMDFeatures,
    statistics: Arc<RwLock<SIMDStatistics>>,
    optimal_chunk_size: usize,
}

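/// Runtime-detected x86 SIMD capabilities. `vector_width` is the number of
/// f64 lanes per register used for chunking (8 for AVX-512, 4 for AVX2,
/// 2 for SSE2, 1 otherwise).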
#[derive(Debug, Clone, Default)]
pub struct SIMDFeatures {
    pub sse: bool,
    pub sse2: bool,
    pub sse3: bool,
    pub ssse3: bool,
    pub sse4_1: bool,
    pub sse4_2: bool,
    pub avx: bool,
    pub avx2: bool,
    pub avx512f: bool,
    pub fma: bool,
    pub vector_width: usize,
}

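/// Running counters describing how much work went through vectorized versus
/// scalar code paths.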
#[derive(Debug, Clone, Default)]
pub struct SIMDStatistics {
    pub operations_vectorized: u64,
    pub operations_scalar: u64,
    pub total_elements_processed: u64,
    pub average_speedup: f64,
    pub memory_bandwidth_utilization: f64,
    pub cache_efficiency: f64,
}

impl SIMDGateProcessor {
    pub fn new() -> Self {
        let features = Self::detect_simd_features();
        let optimal_chunk_size = Self::calculate_optimal_chunk_size(&features);

        Self {
            simd_features: features,
            statistics: Arc::new(RwLock::new(SIMDStatistics::default())),
            optimal_chunk_size,
        }
    }

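    /// Queries CPU feature flags at runtime and derives the f64 lane width
    /// used for chunk sizing.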
    #[cfg(target_arch = "x86_64")]
    fn detect_simd_features() -> SIMDFeatures {
        let mut features = SIMDFeatures::default();

        features.sse = std::arch::is_x86_feature_detected!("sse");
        features.sse2 = std::arch::is_x86_feature_detected!("sse2");
        features.sse3 = std::arch::is_x86_feature_detected!("sse3");
        features.ssse3 = std::arch::is_x86_feature_detected!("ssse3");
        features.sse4_1 = std::arch::is_x86_feature_detected!("sse4.1");
        features.sse4_2 = std::arch::is_x86_feature_detected!("sse4.2");
        features.avx = std::arch::is_x86_feature_detected!("avx");
        features.avx2 = std::arch::is_x86_feature_detected!("avx2");
        features.avx512f = std::arch::is_x86_feature_detected!("avx512f");
        features.fma = std::arch::is_x86_feature_detected!("fma");

        features.vector_width = if features.avx512f {
            8
        } else if features.avx2 {
            4
        } else if features.sse2 {
            2
        } else {
            1
        };

        features
    }

    #[cfg(not(target_arch = "x86_64"))]
    fn detect_simd_features() -> SIMDFeatures {
        SIMDFeatures {
            vector_width: 2,
            ..Default::default()
        }
    }

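    /// Chooses a chunk size that is a multiple of both the SIMD lane width
    /// and the number of Complex64 values in a 64-byte cache line.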
    fn calculate_optimal_chunk_size(features: &SIMDFeatures) -> usize {
        let base_chunk = features.vector_width * 64;
        let cache_line_elements = 64 / std::mem::size_of::<Complex64>();

        let lcm = Self::lcm(features.vector_width, cache_line_elements);
        (base_chunk / lcm) * lcm
    }

    fn lcm(a: usize, b: usize) -> usize {
        a * b / Self::gcd(a, b)
    }

    fn gcd(a: usize, b: usize) -> usize {
        if b == 0 {
            a
        } else {
            Self::gcd(b, a % b)
        }
    }

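    /// Dispatches a single `VectorizedOperation` over `state_vector`, updates
    /// the running statistics, and returns the estimated speedup factor.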
    pub fn process_vectorized_operation(
        &self,
        operation: &VectorizedOperation,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        let start_time = std::time::Instant::now();

        let speedup = match operation {
            VectorizedOperation::SingleQubitBroadcast {
                matrix,
                target_qubits,
            } => self.process_single_qubit_broadcast(matrix, target_qubits, state_vector)?,
            VectorizedOperation::TwoQubitBroadcast {
                matrix,
                qubit_pairs,
            } => self.process_two_qubit_broadcast(matrix, qubit_pairs, state_vector)?,
            VectorizedOperation::StateVectorOps {
                operation_type,
                chunk_size,
            } => self.process_state_vector_operation(operation_type, *chunk_size, state_vector)?,
            VectorizedOperation::BatchMeasurements {
                qubit_indices,
                sample_count,
            } => self.process_batch_measurements(qubit_indices, *sample_count, state_vector)?,
        };

        let mut stats = self.statistics.write().unwrap();
        stats.operations_vectorized += 1;
        stats.total_elements_processed += state_vector.len() as u64;

        // Wall-clock time is measured but not yet folded into the statistics.
        let _elapsed = start_time.elapsed().as_nanos() as f64;
        stats.average_speedup = (stats.average_speedup * (stats.operations_vectorized - 1) as f64
            + speedup)
            / stats.operations_vectorized as f64;

        Ok(speedup)
    }

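    /// Applies the same 2x2 gate to every listed target qubit, preferring the
    /// widest SIMD kernel available on the host and falling back to scalar
    /// code elsewhere.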
    fn process_single_qubit_broadcast(
        &self,
        matrix: &[Complex64; 4],
        target_qubits: &[usize],
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        let speedup_factor = target_qubits.len() as f64;

        for &qubit in target_qubits {
            if qubit >= 64 {
                return Err(QuantRS2Error::InvalidQubitId(qubit as u32));
            }

            let qubit_mask = 1usize << qubit;

            #[cfg(target_arch = "x86_64")]
            unsafe {
                if self.simd_features.avx2 {
                    self.apply_single_qubit_gate_avx2(matrix, qubit_mask, state_vector)?;
                } else if self.simd_features.sse2 {
                    self.apply_single_qubit_gate_sse2(matrix, qubit_mask, state_vector)?;
                } else {
                    self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector)?;
                }
            }

            #[cfg(not(target_arch = "x86_64"))]
            self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector)?;
        }

        Ok(speedup_factor * self.simd_features.vector_width as f64)
    }

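    /// AVX2 kernel for a broadcast single-qubit gate. Each 256-bit register
    /// holds two Complex64 amplitudes, so amplitude pairs are updated two at
    /// a time; the lowest qubit and any tail are handled by the scalar path.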
    #[cfg(target_arch = "x86_64")]
    unsafe fn apply_single_qubit_gate_avx2(
        &self,
        matrix: &[Complex64; 4],
        qubit_mask: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        // Pairs for the lowest qubit are adjacent in memory and cannot be
        // loaded as two disjoint registers, so handle that case (and missing
        // AVX2 support) with the scalar kernel.
        if !self.simd_features.avx2 || qubit_mask < 2 {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        let len = state_vector.len();
        // Two Complex64 amplitudes (four f64 lanes) per 256-bit register.
        let chunk_size = 2;

        // Broadcast the real and imaginary parts of each matrix element.
        let m00_re = _mm256_set1_pd(matrix[0].re);
        let m00_im = _mm256_set1_pd(matrix[0].im);
        let m01_re = _mm256_set1_pd(matrix[1].re);
        let m01_im = _mm256_set1_pd(matrix[1].im);
        let m10_re = _mm256_set1_pd(matrix[2].re);
        let m10_im = _mm256_set1_pd(matrix[2].im);
        let m11_re = _mm256_set1_pd(matrix[3].re);
        let m11_im = _mm256_set1_pd(matrix[3].im);

        let mut i = 0;
        while i + chunk_size <= len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j + chunk_size <= len {
                    // Registers hold [re0, im0, re1, im1].
                    let amp0 = _mm256_loadu_pd(state_vector.as_ptr().add(i) as *const f64);
                    let amp1 = _mm256_loadu_pd(state_vector.as_ptr().add(j) as *const f64);
                    let amp0_sw = _mm256_permute_pd::<0b0101>(amp0);
                    let amp1_sw = _mm256_permute_pd::<0b0101>(amp1);

                    // Complex multiply a * m = addsub(a * m.re, swap(a) * m.im)
                    //   = [re*m.re - im*m.im, im*m.re + re*m.im, ...].
                    let t00 = _mm256_addsub_pd(_mm256_mul_pd(amp0, m00_re), _mm256_mul_pd(amp0_sw, m00_im));
                    let t01 = _mm256_addsub_pd(_mm256_mul_pd(amp1, m01_re), _mm256_mul_pd(amp1_sw, m01_im));
                    let t10 = _mm256_addsub_pd(_mm256_mul_pd(amp0, m10_re), _mm256_mul_pd(amp0_sw, m10_im));
                    let t11 = _mm256_addsub_pd(_mm256_mul_pd(amp1, m11_re), _mm256_mul_pd(amp1_sw, m11_im));

                    // new_0 = m00*amp_0 + m01*amp_1, new_1 = m10*amp_0 + m11*amp_1
                    let new0 = _mm256_add_pd(t00, t01);
                    let new1 = _mm256_add_pd(t10, t11);

                    _mm256_storeu_pd(state_vector.as_mut_ptr().add(i) as *mut f64, new0);
                    _mm256_storeu_pd(state_vector.as_mut_ptr().add(j) as *mut f64, new1);
                }
            }
            i += chunk_size;
        }

        // Scalar tail for any remaining amplitudes.
        while i < len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j < len {
                    let amp_0 = state_vector[i];
                    let amp_1 = state_vector[j];

                    state_vector[i] = matrix[0] * amp_0 + matrix[1] * amp_1;
                    state_vector[j] = matrix[2] * amp_0 + matrix[3] * amp_1;
                }
            }
            i += 1;
        }

        Ok(())
    }

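    /// SSE2-era fallback. The current implementation walks the state vector
    /// in chunks of two amplitudes for cache friendliness but performs the
    /// complex arithmetic with scalar operations rather than intrinsics.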
    #[cfg(target_arch = "x86_64")]
    unsafe fn apply_single_qubit_gate_sse2(
        &self,
        matrix: &[Complex64; 4],
        qubit_mask: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        if !self.simd_features.sse2 {
            return self.apply_single_qubit_gate_scalar(matrix, qubit_mask, state_vector);
        }

        let len = state_vector.len();
        let chunk_size = 2;

        let mut i = 0;
        while i + chunk_size <= len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                for k in 0..chunk_size {
                    // Skip indices whose qubit bit is set: for the lowest
                    // qubit the partner amplitude lives inside the same chunk.
                    if (i + k) & qubit_mask == 0 && (j + k) < len {
                        let amp_0 = state_vector[i + k];
                        let amp_1 = state_vector[j + k];

                        state_vector[i + k] = matrix[0] * amp_0 + matrix[1] * amp_1;
                        state_vector[j + k] = matrix[2] * amp_0 + matrix[3] * amp_1;
                    }
                }
            }
            i += chunk_size;
        }

        while i < len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j < len {
                    let amp_0 = state_vector[i];
                    let amp_1 = state_vector[j];

                    state_vector[i] = matrix[0] * amp_0 + matrix[1] * amp_1;
                    state_vector[j] = matrix[2] * amp_0 + matrix[3] * amp_1;
                }
            }
            i += 1;
        }

        Ok(())
    }

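    /// Portable scalar kernel; also used as the fallback for the SIMD paths.
    /// Each call is counted in `operations_scalar`.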
    fn apply_single_qubit_gate_scalar(
        &self,
        matrix: &[Complex64; 4],
        qubit_mask: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        let len = state_vector.len();

        for i in 0..len {
            if i & qubit_mask == 0 {
                let j = i | qubit_mask;
                if j < len {
                    let amp_0 = state_vector[i];
                    let amp_1 = state_vector[j];

                    state_vector[i] = matrix[0] * amp_0 + matrix[1] * amp_1;
                    state_vector[j] = matrix[2] * amp_0 + matrix[3] * amp_1;
                }
            }
        }

        let mut stats = self.statistics.write().unwrap();
        stats.operations_scalar += 1;

        Ok(())
    }

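    /// Applies the same 4x4 gate to each (control, target) pair. This path is
    /// currently scalar, and the reported speedup is simply the pair count.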
    fn process_two_qubit_broadcast(
        &self,
        matrix: &[Complex64; 16],
        qubit_pairs: &[(usize, usize)],
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        for &(control, target) in qubit_pairs {
            self.apply_two_qubit_gate_scalar(matrix, control, target, state_vector)?;
        }

        Ok(qubit_pairs.len() as f64)
    }

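    /// Applies a 4x4 gate to the four amplitudes addressed by a control and a
    /// target qubit; the matrix is row-major over the basis order
    /// |control, target> = 00, 01, 10, 11.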
    fn apply_two_qubit_gate_scalar(
        &self,
        matrix: &[Complex64; 16],
        control: usize,
        target: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<()> {
        if control >= 64 || target >= 64 {
            return Err(QuantRS2Error::InvalidQubitId(
                if control >= 64 { control } else { target } as u32,
            ));
        }

        let control_mask = 1usize << control;
        let target_mask = 1usize << target;
        let len = state_vector.len();

        for i in 0..len {
            let control_bit = (i & control_mask) >> control;
            let target_bit = (i & target_mask) >> target;

            if control_bit == 0 && target_bit == 0 {
                let j00 = i;
                let j01 = i | target_mask;
                let j10 = i | control_mask;
                let j11 = i | control_mask | target_mask;

                if j00 < len && j01 < len && j10 < len && j11 < len {
                    let amp_00 = state_vector[j00];
                    let amp_01 = state_vector[j01];
                    let amp_10 = state_vector[j10];
                    let amp_11 = state_vector[j11];

                    state_vector[j00] = matrix[0] * amp_00
                        + matrix[1] * amp_01
                        + matrix[2] * amp_10
                        + matrix[3] * amp_11;
                    state_vector[j01] = matrix[4] * amp_00
                        + matrix[5] * amp_01
                        + matrix[6] * amp_10
                        + matrix[7] * amp_11;
                    state_vector[j10] = matrix[8] * amp_00
                        + matrix[9] * amp_01
                        + matrix[10] * amp_10
                        + matrix[11] * amp_11;
                    state_vector[j11] = matrix[12] * amp_00
                        + matrix[13] * amp_01
                        + matrix[14] * amp_10
                        + matrix[15] * amp_11;
                }
            }
        }

        Ok(())
    }

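    /// Runs a whole-state-vector operation in chunks capped at the processor's
    /// optimal chunk size. Expectation value, inner product, and element-wise
    /// multiply are not yet specialized and report a 1.0x speedup; the phase
    /// rotation currently uses a fixed angle of 0.0.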
    fn process_state_vector_operation(
        &self,
        operation_type: &StateVectorOpType,
        chunk_size: usize,
        state_vector: &mut [Complex64],
    ) -> QuantRS2Result<f64> {
        let effective_chunk_size = chunk_size.min(self.optimal_chunk_size);

        match operation_type {
            StateVectorOpType::Normalization => {
                self.simd_normalize(state_vector, effective_chunk_size)
            }
            StateVectorOpType::ProbabilityCalculation => {
                self.simd_probabilities(state_vector, effective_chunk_size)
            }
            StateVectorOpType::PhaseRotation => {
                self.simd_phase_rotation(state_vector, effective_chunk_size, 0.0)
            }
            _ => Ok(1.0),
        }
    }

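    /// Rescales the state vector to unit norm in chunked passes and returns
    /// the lane width as a rough speedup estimate.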
    fn simd_normalize(
        &self,
        state_vector: &mut [Complex64],
        chunk_size: usize,
    ) -> QuantRS2Result<f64> {
        let mut norm_squared = 0.0;

        for chunk in state_vector.chunks(chunk_size) {
            for &amplitude in chunk {
                norm_squared += amplitude.norm_sqr();
            }
        }

        let norm = norm_squared.sqrt();
        if norm > 0.0 {
            let inv_norm = 1.0 / norm;

            for chunk in state_vector.chunks_mut(chunk_size) {
                for amplitude in chunk {
                    *amplitude *= inv_norm;
                }
            }
        }

        Ok(self.simd_features.vector_width as f64)
    }

    fn simd_probabilities(
        &self,
        state_vector: &[Complex64],
        chunk_size: usize,
    ) -> QuantRS2Result<f64> {
        let mut _total_prob = 0.0;

        for chunk in state_vector.chunks(chunk_size) {
            for &amplitude in chunk {
                _total_prob += amplitude.norm_sqr();
            }
        }

        Ok(self.simd_features.vector_width as f64)
    }

    fn simd_phase_rotation(
        &self,
        state_vector: &mut [Complex64],
        chunk_size: usize,
        angle: f64,
    ) -> QuantRS2Result<f64> {
        let phase = Complex64::from_polar(1.0, angle);

        for chunk in state_vector.chunks_mut(chunk_size) {
            for amplitude in chunk {
                *amplitude *= phase;
            }
        }

        Ok(self.simd_features.vector_width as f64)
    }

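    /// Placeholder for batched measurement sampling: the amplitudes are not
    /// yet inspected and the return value is only a throughput heuristic.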
    fn process_batch_measurements(
        &self,
        qubit_indices: &[usize],
        sample_count: usize,
        _state_vector: &[Complex64],
    ) -> QuantRS2Result<f64> {
        Ok(qubit_indices.len() as f64 * sample_count as f64 / 1000.0)
    }

    pub fn get_statistics(&self) -> SIMDStatistics {
        self.statistics.read().unwrap().clone()
    }

    pub fn get_global_statistics() -> SIMDStatistics {
        if let Some(processor) = GLOBAL_SIMD_PROCESSOR.get() {
            processor.get_statistics()
        } else {
            SIMDStatistics::default()
        }
    }

    pub fn reset_statistics(&self) {
        let mut stats = self.statistics.write().unwrap();
        *stats = SIMDStatistics::default();
    }

    pub fn get_optimal_chunk_size(&self) -> usize {
        self.optimal_chunk_size
    }

    pub fn get_simd_features(&self) -> &SIMDFeatures {
        &self.simd_features
    }
}

impl Default for SIMDGateProcessor {
    fn default() -> Self {
        Self::new()
    }
}

static GLOBAL_SIMD_PROCESSOR: OnceLock<SIMDGateProcessor> = OnceLock::new();

pub fn get_global_simd_processor() -> &'static SIMDGateProcessor {
    GLOBAL_SIMD_PROCESSOR.get_or_init(SIMDGateProcessor::new)
}

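/// Convenience entry point that runs each operation in order through the
/// shared global processor and collects the per-operation speedup estimates.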
pub fn process_gates_simd(
    operations: Vec<VectorizedOperation>,
    state_vector: &mut [Complex64],
) -> QuantRS2Result<Vec<f64>> {
    let processor = get_global_simd_processor();
    let mut speedups = Vec::new();

    for operation in &operations {
        let speedup = processor.process_vectorized_operation(operation, state_vector)?;
        speedups.push(speedup);
    }

    Ok(speedups)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simd_processor_creation() {
        let processor = SIMDGateProcessor::new();
        assert!(processor.get_optimal_chunk_size() > 0);
        assert!(processor.get_simd_features().vector_width >= 1);
    }

    #[test]
    fn test_simd_feature_detection() {
        let features = SIMDGateProcessor::detect_simd_features();
        assert!(features.vector_width >= 1);
        assert!(features.vector_width <= 16);
    }

    #[test]
    fn test_single_qubit_gate_application() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        let pauli_x = [
            Complex64::new(0.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(1.0, 0.0),
            Complex64::new(0.0, 0.0),
        ];

        let result = processor.apply_single_qubit_gate_scalar(&pauli_x, 1, &mut state_vector);
        assert!(result.is_ok());

        assert!((state_vector[0] - Complex64::new(0.0, 0.0)).norm() < 1e-10);
        assert!((state_vector[1] - Complex64::new(1.0, 0.0)).norm() < 1e-10);
    }

    #[test]
    fn test_vectorized_operation_processing() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![Complex64::new(0.5, 0.0); 16];

        let operation = VectorizedOperation::StateVectorOps {
            operation_type: StateVectorOpType::Normalization,
            chunk_size: 4,
        };

        let result = processor.process_vectorized_operation(&operation, &mut state_vector);
        assert!(result.is_ok());

        let norm_squared: f64 = state_vector.iter().map(|c| c.norm_sqr()).sum();
        assert!((norm_squared - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_chunk_size_calculation() {
        let features = SIMDFeatures {
            vector_width: 4,
            ..Default::default()
        };

        let chunk_size = SIMDGateProcessor::calculate_optimal_chunk_size(&features);
        assert!(chunk_size > 0);
        assert!(chunk_size % features.vector_width == 0);
    }

    #[test]
    fn test_gcd_lcm_helpers() {
        assert_eq!(SIMDGateProcessor::gcd(12, 8), 4);
        assert_eq!(SIMDGateProcessor::lcm(4, 6), 12);
        assert_eq!(SIMDGateProcessor::gcd(17, 13), 1);
    }

    #[test]
    fn test_statistics_tracking() {
        let processor = SIMDGateProcessor::new();
        let mut state_vector = vec![Complex64::new(0.5, 0.0); 8];

        let operation = VectorizedOperation::StateVectorOps {
            operation_type: StateVectorOpType::ProbabilityCalculation,
            chunk_size: 4,
        };

        let _ = processor.process_vectorized_operation(&operation, &mut state_vector);

        let stats = processor.get_statistics();
        assert!(stats.operations_vectorized > 0);
        assert!(stats.total_elements_processed > 0);
    }

    #[test]
    fn test_global_simd_processor() {
        let processor1 = get_global_simd_processor();
        let processor2 = get_global_simd_processor();

        assert!(std::ptr::eq(processor1, processor2));
        assert!(processor1.get_optimal_chunk_size() > 0);
    }
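
    // End-to-end exercise of the free-function entry point defined above:
    // normalization followed by a (currently zero-angle) phase rotation should
    // leave the state with unit norm and report one speedup value per operation.
    #[test]
    fn test_process_gates_simd_batch() {
        let mut state_vector = vec![Complex64::new(0.5, 0.0); 8];

        let operations = vec![
            VectorizedOperation::StateVectorOps {
                operation_type: StateVectorOpType::Normalization,
                chunk_size: 4,
            },
            VectorizedOperation::StateVectorOps {
                operation_type: StateVectorOpType::PhaseRotation,
                chunk_size: 4,
            },
        ];

        let speedups = process_gates_simd(operations, &mut state_vector)
            .expect("vectorized operations should succeed");
        assert_eq!(speedups.len(), 2);

        let norm_squared: f64 = state_vector.iter().map(|c| c.norm_sqr()).sum();
        assert!((norm_squared - 1.0).abs() < 1e-10);
    }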
}