use crate::error::{QuantRS2Error, QuantRS2Result};
use crate::platform::PlatformCapabilities;
use crate::simd_ops_stubs::SimdF64;
use scirs2_core::ndarray::ArrayView1;
use scirs2_core::Complex64;
use std::sync::{Mutex, OnceLock};

/// CPU features relevant to SIMD kernel selection.
#[derive(Debug, Clone, Copy)]
pub struct CpuFeatures {
    /// AVX2 support
    pub has_avx2: bool,
    /// AVX-512 Foundation support
    pub has_avx512: bool,
    /// Fused multiply-add (FMA) support
    pub has_fma: bool,
    /// AVX-512 Vector Length extensions
    pub has_avx512vl: bool,
    /// AVX-512 Doubleword and Quadword instructions
    pub has_avx512dq: bool,
    /// AVX-512 Conflict Detection instructions
    pub has_avx512cd: bool,
    /// SSE4.1 support
    pub has_sse41: bool,
    /// SSE4.2 support
    pub has_sse42: bool,
    /// Number of logical cores
    pub num_cores: usize,
    /// L1 data cache size in bytes
    pub l1_cache_size: usize,
    /// L2 cache size in bytes
    pub l2_cache_size: usize,
    /// L3 cache size in bytes
    pub l3_cache_size: usize,
}

/// SIMD instruction-set variant used to execute a kernel.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SimdVariant {
    /// Portable scalar fallback
    Scalar,
    /// SSE4.1/4.2 (128-bit vectors)
    Sse4,
    /// AVX2 + FMA (256-bit vectors)
    Avx2,
    /// AVX-512 (512-bit vectors)
    Avx512,
}

/// Runtime dispatcher that selects the best SIMD variant per operation and
/// refines its choice using measured execution times.
pub struct AdaptiveSimdDispatcher {
    /// Detected CPU features
    cpu_features: CpuFeatures,
    /// Default variant chosen at initialization
    selected_variant: SimdVariant,
    /// Measured timings keyed by operation signature
    performance_cache: Mutex<std::collections::HashMap<String, PerformanceData>>,
}

/// Aggregated timing data for one operation signature.
#[derive(Debug, Clone)]
pub struct PerformanceData {
    /// Running mean execution time in nanoseconds
    avg_time: f64,
    /// Number of samples folded into the mean
    samples: usize,
    /// Fastest variant observed so far
    best_variant: SimdVariant,
}

/// Process-wide dispatcher instance, set once by `initialize`.
static GLOBAL_DISPATCHER: OnceLock<AdaptiveSimdDispatcher> = OnceLock::new();

impl AdaptiveSimdDispatcher {
    /// Detects CPU features and installs the global dispatcher.
    /// Returns an error if the dispatcher has already been initialized.
    pub fn initialize() -> QuantRS2Result<()> {
        let cpu_features = Self::detect_cpu_features();
        let selected_variant = Self::select_optimal_variant(&cpu_features);

        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features,
            selected_variant,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        GLOBAL_DISPATCHER.set(dispatcher).map_err(|_| {
            QuantRS2Error::RuntimeError("Adaptive SIMD dispatcher already initialized".to_string())
        })?;

        Ok(())
    }

    /// Returns the global dispatcher, or an error if `initialize` has not run.
    pub fn instance() -> QuantRS2Result<&'static AdaptiveSimdDispatcher> {
        GLOBAL_DISPATCHER.get().ok_or_else(|| {
            QuantRS2Error::RuntimeError("Adaptive SIMD dispatcher not initialized".to_string())
        })
    }

    /// Queries platform capabilities and maps them into `CpuFeatures`.
    fn detect_cpu_features() -> CpuFeatures {
        let platform = PlatformCapabilities::detect();

        CpuFeatures {
            has_avx2: platform.cpu.simd.avx2,
            has_avx512: platform.cpu.simd.avx512,
            has_fma: platform.cpu.simd.fma,
            // The platform layer does not report the AVX-512 sub-features,
            // so they are conservatively assumed absent.
            has_avx512vl: false,
            has_avx512dq: false,
            has_avx512cd: false,
            has_sse41: platform.cpu.simd.sse4_1,
            has_sse42: platform.cpu.simd.sse4_2,
            num_cores: platform.cpu.logical_cores,
            l1_cache_size: platform.cpu.cache.l1_data.unwrap_or(32 * 1024),
            l2_cache_size: platform.cpu.cache.l2.unwrap_or(256 * 1024),
            l3_cache_size: platform.cpu.cache.l3.unwrap_or(8 * 1024 * 1024),
        }
    }

    /// Chooses the widest variant the CPU fully supports, falling back in
    /// order AVX-512 -> AVX2 -> SSE4 -> scalar.
    fn select_optimal_variant(features: &CpuFeatures) -> SimdVariant {
        if features.has_avx512 && features.has_avx512vl && features.has_avx512dq {
            SimdVariant::Avx512
        } else if features.has_avx2 && features.has_fma {
            SimdVariant::Avx2
        } else if features.has_sse41 && features.has_sse42 {
            SimdVariant::Sse4
        } else {
            SimdVariant::Scalar
        }
    }

    /// Applies a single-qubit gate, timing the chosen variant and feeding
    /// the measurement back into the performance cache.
    pub fn apply_single_qubit_gate_adaptive(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        let operation_key = format!("single_qubit_{}", state.len());
        let variant = self.select_variant_for_operation(&operation_key, state.len());

        let start_time = std::time::Instant::now();

        let result = match variant {
            SimdVariant::Avx512 => self.apply_single_qubit_avx512(state, target, matrix),
            SimdVariant::Avx2 => self.apply_single_qubit_avx2(state, target, matrix),
            SimdVariant::Sse4 => self.apply_single_qubit_sse4(state, target, matrix),
            SimdVariant::Scalar => self.apply_single_qubit_scalar(state, target, matrix),
        };

        let execution_time = start_time.elapsed().as_nanos() as f64;
        self.update_performance_cache(&operation_key, execution_time, variant);

        result
    }

    /// Applies a two-qubit gate, timing the chosen variant and feeding the
    /// measurement back into the performance cache.
    pub fn apply_two_qubit_gate_adaptive(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        let operation_key = format!("two_qubit_{}", state.len());
        let variant = self.select_variant_for_operation(&operation_key, state.len());

        let start_time = std::time::Instant::now();

        let result = match variant {
            SimdVariant::Avx512 => self.apply_two_qubit_avx512(state, control, target, matrix),
            SimdVariant::Avx2 => self.apply_two_qubit_avx2(state, control, target, matrix),
            SimdVariant::Sse4 => self.apply_two_qubit_sse4(state, control, target, matrix),
            SimdVariant::Scalar => self.apply_two_qubit_scalar(state, control, target, matrix),
        };

        let execution_time = start_time.elapsed().as_nanos() as f64;
        self.update_performance_cache(&operation_key, execution_time, variant);

        result
    }

    /// Applies a sequence of gates across a batch of state vectors.
    pub fn apply_batch_gates_adaptive(
        &self,
        states: &mut [&mut [Complex64]],
        gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        let batch_size = states.len();
        let operation_key = format!("batch_{}_{}", batch_size, gates.len());
        // Rough data-size proxy: assume on the order of 1000 amplitudes per state.
        let variant = self.select_variant_for_operation(&operation_key, batch_size * 1000);

        let start_time = std::time::Instant::now();

        let result = match variant {
            SimdVariant::Avx512 => self.apply_batch_gates_avx512(states, gates),
            SimdVariant::Avx2 => self.apply_batch_gates_avx2(states, gates),
            SimdVariant::Sse4 => self.apply_batch_gates_sse4(states, gates),
            SimdVariant::Scalar => self.apply_batch_gates_scalar(states, gates),
        };

        let execution_time = start_time.elapsed().as_nanos() as f64;
        self.update_performance_cache(&operation_key, execution_time, variant);

        result
    }

    /// Prefers the empirically fastest variant once enough samples exist;
    /// otherwise falls back to a size-based heuristic.
    fn select_variant_for_operation(&self, operation_key: &str, data_size: usize) -> SimdVariant {
        if let Ok(cache) = self.performance_cache.lock() {
            if let Some(perf_data) = cache.get(operation_key) {
                if perf_data.samples >= 5 {
                    return perf_data.best_variant;
                }
            }
        }

        // Heuristic: wider vectors only pay off on larger state vectors.
        if data_size >= 1024 && self.cpu_features.has_avx512 {
            SimdVariant::Avx512
        } else if data_size >= 256 && self.cpu_features.has_avx2 {
            SimdVariant::Avx2
        } else if data_size >= 64 && self.cpu_features.has_sse41 {
            SimdVariant::Sse4
        } else {
            SimdVariant::Scalar
        }
    }

    /// Folds a new timing sample into the running mean and promotes `variant`
    /// to `best_variant` when the sample beats the mean by more than 10%.
    fn update_performance_cache(
        &self,
        operation_key: &str,
        execution_time: f64,
        variant: SimdVariant,
    ) {
        if let Ok(mut cache) = self.performance_cache.lock() {
            let perf_data =
                cache
                    .entry(operation_key.to_string())
                    .or_insert_with(|| PerformanceData {
                        avg_time: execution_time,
                        samples: 0,
                        best_variant: variant,
                    });

            // Incremental mean: new_avg = (avg * n + t) / (n + 1).
            perf_data.avg_time = (perf_data.avg_time * perf_data.samples as f64 + execution_time)
                / (perf_data.samples + 1) as f64;
            perf_data.samples += 1;

            if execution_time < perf_data.avg_time * 0.9 {
                perf_data.best_variant = variant;
            }
        }
    }

    /// Snapshots the current CPU features, default variant, and timing cache.
    pub fn get_performance_report(&self) -> AdaptivePerformanceReport {
        let cache = self
            .performance_cache
            .lock()
            .map(|cache| cache.clone())
            .unwrap_or_default();

        AdaptivePerformanceReport {
            cpu_features: self.cpu_features,
            selected_variant: self.selected_variant,
            performance_cache: cache,
        }
    }

    /// AVX-512 single-qubit path; currently delegates to the shared SIMD
    /// kernel, which is portable, so no target-arch gate is needed.
    fn apply_single_qubit_avx512(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        self.apply_single_qubit_simd_unified(state, target, matrix)
    }

    /// AVX2 single-qubit path; currently delegates to the shared SIMD kernel.
    fn apply_single_qubit_avx2(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        self.apply_single_qubit_simd_unified(state, target, matrix)
    }

    /// SSE4 single-qubit path; currently delegates to the shared SIMD kernel.
    fn apply_single_qubit_sse4(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        self.apply_single_qubit_simd_unified(state, target, matrix)
    }

    /// Portable scalar kernel: for each basis index with the target bit
    /// clear, mixes the paired amplitudes with the 2x2 gate matrix.
    fn apply_single_qubit_scalar(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        let n = state.len();
        for i in 0..n {
            if (i >> target) & 1 == 0 {
                let j = i | (1 << target);
                let temp0 = state[i];
                let temp1 = state[j];
                state[i] = matrix[0] * temp0 + matrix[1] * temp1;
                state[j] = matrix[2] * temp0 + matrix[3] * temp1;
            }
        }
        Ok(())
    }

    /// Shared structure-of-arrays kernel used by all SIMD variants.
    fn apply_single_qubit_simd_unified(
        &self,
        state: &mut [Complex64],
        target: usize,
        matrix: &[Complex64; 4],
    ) -> QuantRS2Result<()> {
        let qubit_mask = 1usize << target;
        let half_size = state.len() / 2;

        let mut idx0_list = Vec::new();
        let mut idx1_list = Vec::new();

        for i in 0..half_size {
            // Insert a 0 bit at position `target`: the low bits of `i` stay
            // in place and the high bits shift up by one, giving the index of
            // the |0> amplitude of the i-th pair.
            let idx0 = ((i >> target) << (target + 1)) | (i & (qubit_mask - 1));
            let idx1 = idx0 | qubit_mask;

            if idx1 < state.len() {
                idx0_list.push(idx0);
                idx1_list.push(idx1);
            }
        }

        let pair_count = idx0_list.len();
        if pair_count == 0 {
            return Ok(());
        }

        // Gather amplitudes into structure-of-arrays form for SIMD math.
        let mut a0_real = Vec::with_capacity(pair_count);
        let mut a0_imag = Vec::with_capacity(pair_count);
        let mut a1_real = Vec::with_capacity(pair_count);
        let mut a1_imag = Vec::with_capacity(pair_count);

        for i in 0..pair_count {
            let a0 = state[idx0_list[i]];
            let a1 = state[idx1_list[i]];
            a0_real.push(a0.re);
            a0_imag.push(a0.im);
            a1_real.push(a1.re);
            a1_imag.push(a1.im);
        }

        let a0_real_view = ArrayView1::from(&a0_real);
        let a0_imag_view = ArrayView1::from(&a0_imag);
        let a1_real_view = ArrayView1::from(&a1_real);
        let a1_imag_view = ArrayView1::from(&a1_imag);

        let m00_re = matrix[0].re;
        let m00_im = matrix[0].im;
        let m01_re = matrix[1].re;
        let m01_im = matrix[1].im;
        let m10_re = matrix[2].re;
        let m10_im = matrix[2].im;
        let m11_re = matrix[3].re;
        let m11_im = matrix[3].im;

        // new_a0 = m00 * a0 + m01 * a1, expanded into real/imaginary parts:
        // re = (m00_re*a0_re - m00_im*a0_im) + (m01_re*a1_re - m01_im*a1_im)
        let term1 = <f64 as SimdF64>::simd_scalar_mul(&a0_real_view, m00_re);
        let term2 = <f64 as SimdF64>::simd_scalar_mul(&a0_imag_view, m00_im);
        let term3 = <f64 as SimdF64>::simd_scalar_mul(&a1_real_view, m01_re);
        let term4 = <f64 as SimdF64>::simd_scalar_mul(&a1_imag_view, m01_im);
        let sub1 = <f64 as SimdF64>::simd_sub_arrays(&term1.view(), &term2.view());
        let sub2 = <f64 as SimdF64>::simd_sub_arrays(&term3.view(), &term4.view());
        let new_a0_real_arr = <f64 as SimdF64>::simd_add_arrays(&sub1.view(), &sub2.view());

        // im = (m00_re*a0_im + m00_im*a0_re) + (m01_re*a1_im + m01_im*a1_re)
        let term5 = <f64 as SimdF64>::simd_scalar_mul(&a0_imag_view, m00_re);
        let term6 = <f64 as SimdF64>::simd_scalar_mul(&a0_real_view, m00_im);
        let term7 = <f64 as SimdF64>::simd_scalar_mul(&a1_imag_view, m01_re);
        let term8 = <f64 as SimdF64>::simd_scalar_mul(&a1_real_view, m01_im);
        let add1 = <f64 as SimdF64>::simd_add_arrays(&term5.view(), &term6.view());
        let add2 = <f64 as SimdF64>::simd_add_arrays(&term7.view(), &term8.view());
        let new_a0_imag_arr = <f64 as SimdF64>::simd_add_arrays(&add1.view(), &add2.view());

        // new_a1 = m10 * a0 + m11 * a1, same expansion for the real part.
        let term9 = <f64 as SimdF64>::simd_scalar_mul(&a0_real_view, m10_re);
        let term10 = <f64 as SimdF64>::simd_scalar_mul(&a0_imag_view, m10_im);
        let term11 = <f64 as SimdF64>::simd_scalar_mul(&a1_real_view, m11_re);
        let term12 = <f64 as SimdF64>::simd_scalar_mul(&a1_imag_view, m11_im);
        let sub3 = <f64 as SimdF64>::simd_sub_arrays(&term9.view(), &term10.view());
        let sub4 = <f64 as SimdF64>::simd_sub_arrays(&term11.view(), &term12.view());
        let new_a1_real_arr = <f64 as SimdF64>::simd_add_arrays(&sub3.view(), &sub4.view());

        // ... and for the imaginary part.
        let term13 = <f64 as SimdF64>::simd_scalar_mul(&a0_imag_view, m10_re);
        let term14 = <f64 as SimdF64>::simd_scalar_mul(&a0_real_view, m10_im);
        let term15 = <f64 as SimdF64>::simd_scalar_mul(&a1_imag_view, m11_re);
        let term16 = <f64 as SimdF64>::simd_scalar_mul(&a1_real_view, m11_im);
        let add3 = <f64 as SimdF64>::simd_add_arrays(&term13.view(), &term14.view());
        let add4 = <f64 as SimdF64>::simd_add_arrays(&term15.view(), &term16.view());
        let new_a1_imag_arr = <f64 as SimdF64>::simd_add_arrays(&add3.view(), &add4.view());

        // Scatter results back into the interleaved state vector.
        for i in 0..pair_count {
            state[idx0_list[i]] = Complex64::new(new_a0_real_arr[i], new_a0_imag_arr[i]);
            state[idx1_list[i]] = Complex64::new(new_a1_real_arr[i], new_a1_imag_arr[i]);
        }

        Ok(())
    }

    /// AVX-512 two-qubit path. No vectorized kernel exists yet, so this
    /// falls back to the scalar implementation.
    fn apply_two_qubit_avx512(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        self.apply_two_qubit_scalar(state, control, target, matrix)
    }

    /// AVX2 two-qubit path; falls back to the scalar implementation.
    fn apply_two_qubit_avx2(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        self.apply_two_qubit_scalar(state, control, target, matrix)
    }

    /// SSE4 two-qubit path; falls back to the scalar implementation.
    fn apply_two_qubit_sse4(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        self.apply_two_qubit_scalar(state, control, target, matrix)
    }

    /// Portable scalar two-qubit kernel: applies the row-major 4x4 matrix to
    /// each amplitude quadruple, ordered |00>, |01>, |10>, |11> over the
    /// (control, target) bits.
    fn apply_two_qubit_scalar(
        &self,
        state: &mut [Complex64],
        control: usize,
        target: usize,
        matrix: &[Complex64; 16],
    ) -> QuantRS2Result<()> {
        debug_assert!(control != target, "control and target must differ");
        let control_mask = 1usize << control;
        let target_mask = 1usize << target;
        for i in 0..state.len() {
            // Visit each 4-amplitude subspace once, via its |00> index.
            if i & control_mask == 0 && i & target_mask == 0 {
                let idx = [
                    i,
                    i | target_mask,
                    i | control_mask,
                    i | control_mask | target_mask,
                ];
                let amps = [state[idx[0]], state[idx[1]], state[idx[2]], state[idx[3]]];
                for row in 0..4 {
                    let mut acc = Complex64::new(0.0, 0.0);
                    for col in 0..4 {
                        acc += matrix[row * 4 + col] * amps[col];
                    }
                    state[idx[row]] = acc;
                }
            }
        }
        Ok(())
    }

    /// Batch AVX-512 path. Placeholder: vectorized batching is not
    /// implemented yet, and the states are intentionally left unchanged.
    fn apply_batch_gates_avx512(
        &self,
        _states: &mut [&mut [Complex64]],
        _gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    /// Batch AVX2 path. Placeholder; see `apply_batch_gates_avx512`.
    fn apply_batch_gates_avx2(
        &self,
        _states: &mut [&mut [Complex64]],
        _gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    /// Batch SSE4 path. Placeholder; see `apply_batch_gates_avx512`.
    fn apply_batch_gates_sse4(
        &self,
        _states: &mut [&mut [Complex64]],
        _gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        Ok(())
    }

    /// Batch scalar path. Placeholder; see `apply_batch_gates_avx512`.
    fn apply_batch_gates_scalar(
        &self,
        _states: &mut [&mut [Complex64]],
        _gates: &[Box<dyn crate::gate::GateOp>],
    ) -> QuantRS2Result<()> {
        Ok(())
    }
}

/// Snapshot of dispatcher state returned by `get_performance_report`.
#[derive(Debug, Clone)]
pub struct AdaptivePerformanceReport {
    pub cpu_features: CpuFeatures,
    pub selected_variant: SimdVariant,
    pub performance_cache: std::collections::HashMap<String, PerformanceData>,
}

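/// Convenience wrapper over the global dispatcher for single-qubit gates.
///
/// A minimal usage sketch (illustrative only; assumes the dispatcher was
/// initialized and uses a Pauli-X matrix in row-major [m00, m01, m10, m11]
/// order):
///
/// ```ignore
/// initialize_adaptive_simd()?;
/// let mut state = vec![Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0)];
/// let x_gate = [
///     Complex64::new(0.0, 0.0),
///     Complex64::new(1.0, 0.0),
///     Complex64::new(1.0, 0.0),
///     Complex64::new(0.0, 0.0),
/// ];
/// apply_single_qubit_adaptive(&mut state, 0, &x_gate)?;
/// assert!((state[1].re - 1.0).abs() < 1e-12); // X|0> = |1>
/// ```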
pub fn apply_single_qubit_adaptive(
    state: &mut [Complex64],
    target: usize,
    matrix: &[Complex64; 4],
) -> QuantRS2Result<()> {
    AdaptiveSimdDispatcher::instance()?.apply_single_qubit_gate_adaptive(state, target, matrix)
}

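/// Convenience wrapper over the global dispatcher for two-qubit gates.
///
/// A minimal sketch (illustrative only; assumes initialization and a
/// row-major 4x4 CNOT matrix over the |control target> basis):
///
/// ```ignore
/// let one = Complex64::new(1.0, 0.0);
/// let zero = Complex64::new(0.0, 0.0);
/// let cnot = [
///     one, zero, zero, zero,
///     zero, one, zero, zero,
///     zero, zero, zero, one,
///     zero, zero, one, zero,
/// ];
/// let mut state = vec![zero, zero, one, zero]; // |10>
/// apply_two_qubit_adaptive(&mut state, 1, 0, &cnot)?; // -> |11>
/// ```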
pub fn apply_two_qubit_adaptive(
    state: &mut [Complex64],
    control: usize,
    target: usize,
    matrix: &[Complex64; 16],
) -> QuantRS2Result<()> {
    AdaptiveSimdDispatcher::instance()?
        .apply_two_qubit_gate_adaptive(state, control, target, matrix)
}

/// Applies `gates` to every state in `states` via the global dispatcher.
pub fn apply_batch_gates_adaptive(
    states: &mut [&mut [Complex64]],
    gates: &[Box<dyn crate::gate::GateOp>],
) -> QuantRS2Result<()> {
    AdaptiveSimdDispatcher::instance()?.apply_batch_gates_adaptive(states, gates)
}

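/// Initializes the global adaptive SIMD dispatcher. Call once at startup;
/// a second call returns an error.
///
/// A minimal sketch of the initialize-then-inspect flow (illustrative only):
///
/// ```ignore
/// initialize_adaptive_simd()?;
/// let report = get_adaptive_performance_report()?;
/// println!("default variant: {:?}", report.selected_variant);
/// ```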
pub fn initialize_adaptive_simd() -> QuantRS2Result<()> {
    AdaptiveSimdDispatcher::initialize()
}

/// Returns a snapshot of the dispatcher's performance data.
pub fn get_adaptive_performance_report() -> QuantRS2Result<AdaptivePerformanceReport> {
    Ok(AdaptiveSimdDispatcher::instance()?.get_performance_report())
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::Complex64;

    #[test]
    fn test_cpu_feature_detection() {
        let features = AdaptiveSimdDispatcher::detect_cpu_features();
        println!("Detected CPU features: {:?}", features);

        assert!(features.num_cores >= 1);
        assert!(features.l1_cache_size > 0);
    }

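    // Added test: a sketch checking the selection ladder's scalar fallback.
    // An all-false feature set is assumed to represent a CPU without SIMD.
    #[test]
    fn test_scalar_fallback_selection() {
        let features = CpuFeatures {
            has_avx2: false,
            has_avx512: false,
            has_fma: false,
            has_avx512vl: false,
            has_avx512dq: false,
            has_avx512cd: false,
            has_sse41: false,
            has_sse42: false,
            num_cores: 1,
            l1_cache_size: 32 * 1024,
            l2_cache_size: 256 * 1024,
            l3_cache_size: 8 * 1024 * 1024,
        };

        let variant = AdaptiveSimdDispatcher::select_optimal_variant(&features);
        assert_eq!(variant, SimdVariant::Scalar);
    }
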
    #[test]
    fn test_simd_variant_selection() {
        let features = CpuFeatures {
            has_avx2: true,
            has_avx512: false,
            has_fma: true,
            has_avx512vl: false,
            has_avx512dq: false,
            has_avx512cd: false,
            has_sse41: true,
            has_sse42: true,
            num_cores: 8,
            l1_cache_size: 32768,
            l2_cache_size: 262144,
            l3_cache_size: 8388608,
        };

        let variant = AdaptiveSimdDispatcher::select_optimal_variant(&features);
        assert_eq!(variant, SimdVariant::Avx2);
    }

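    // Added test: a sketch cross-checking the scalar and unified SIMD
    // single-qubit kernels on a 3-qubit state; both paths should agree to
    // floating-point tolerance for every target qubit.
    #[test]
    fn test_scalar_and_simd_kernels_agree() {
        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features: AdaptiveSimdDispatcher::detect_cpu_features(),
            selected_variant: SimdVariant::Scalar,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        // Arbitrary amplitudes; normalization is irrelevant for this check.
        let base: Vec<Complex64> = (0..8)
            .map(|k| Complex64::new(k as f64 * 0.1 + 0.05, 0.3 - k as f64 * 0.02))
            .collect();
        let inv_sqrt2 = 1.0 / 2.0_f64.sqrt();
        let hadamard = [
            Complex64::new(inv_sqrt2, 0.0),
            Complex64::new(inv_sqrt2, 0.0),
            Complex64::new(inv_sqrt2, 0.0),
            Complex64::new(-inv_sqrt2, 0.0),
        ];

        for target in 0..3 {
            let mut scalar_state = base.clone();
            let mut simd_state = base.clone();
            dispatcher
                .apply_single_qubit_scalar(&mut scalar_state, target, &hadamard)
                .unwrap();
            dispatcher
                .apply_single_qubit_simd_unified(&mut simd_state, target, &hadamard)
                .unwrap();
            for (a, b) in scalar_state.iter().zip(simd_state.iter()) {
                assert!((a - b).norm() < 1e-12);
            }
        }
    }
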
    #[test]
    fn test_adaptive_single_qubit_gate() {
        let _ = AdaptiveSimdDispatcher::initialize();

        let mut state = vec![Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0)];

        let inv_sqrt2 = 1.0 / 2.0_f64.sqrt();
        let hadamard_matrix = [
            Complex64::new(inv_sqrt2, 0.0),
            Complex64::new(inv_sqrt2, 0.0),
            Complex64::new(inv_sqrt2, 0.0),
            Complex64::new(-inv_sqrt2, 0.0),
        ];

        let result = apply_single_qubit_adaptive(&mut state, 0, &hadamard_matrix);
        assert!(result.is_ok());

        // H|0> = (|0> + |1>) / sqrt(2)
        let expected_amplitude = inv_sqrt2;
        assert!((state[0].re - expected_amplitude).abs() < 1e-10);
        assert!((state[1].re - expected_amplitude).abs() < 1e-10);
    }

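    // Added test: a sketch exercising the scalar two-qubit kernel with a
    // CNOT (control = qubit 1, target = qubit 0, row-major 4x4 matrix over
    // the |control target> basis): |10> should map to |11>.
    #[test]
    fn test_two_qubit_scalar_cnot() {
        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features: AdaptiveSimdDispatcher::detect_cpu_features(),
            selected_variant: SimdVariant::Scalar,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        let one = Complex64::new(1.0, 0.0);
        let zero = Complex64::new(0.0, 0.0);
        let cnot = [
            one, zero, zero, zero,
            zero, one, zero, zero,
            zero, zero, zero, one,
            zero, zero, one, zero,
        ];

        // Start in |10>: control (qubit 1) set, target (qubit 0) clear.
        let mut state = vec![zero, zero, one, zero];
        dispatcher
            .apply_two_qubit_scalar(&mut state, 1, 0, &cnot)
            .unwrap();

        assert!((state[3].re - 1.0).abs() < 1e-12);
        assert!(state[2].norm() < 1e-12);
    }
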
    #[test]
    fn test_performance_caching() {
        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features: AdaptiveSimdDispatcher::detect_cpu_features(),
            selected_variant: SimdVariant::Avx2,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        dispatcher.update_performance_cache("test_op", 100.0, SimdVariant::Avx2);
        dispatcher.update_performance_cache("test_op", 150.0, SimdVariant::Avx2);

        let perf_data = dispatcher
            .performance_cache
            .lock()
            .unwrap()
            .get("test_op")
            .unwrap()
            .clone();
        // Two samples of 100 ns and 150 ns give a running mean of 125 ns.
        assert_eq!(perf_data.samples, 2);
        assert!((perf_data.avg_time - 125.0).abs() < 1e-10);
    }
}
654}