1use crate::error::{QuantRS2Error, QuantRS2Result};
7use num_complex::Complex64;
8use std::sync::{Mutex, OnceLock};
9
/// Snapshot of host-CPU capabilities used to pick a SIMD dispatch variant.
///
/// On non-x86 targets every x86-specific instruction-set flag is `false`.
#[derive(Debug, Clone, Copy)]
pub struct CpuFeatures {
    /// AVX2 (256-bit SIMD) available.
    pub has_avx2: bool,
    /// AVX-512 Foundation available.
    pub has_avx512: bool,
    /// Fused multiply-add (FMA) available.
    pub has_fma: bool,
    /// AVX-512 Vector Length extensions available.
    pub has_avx512vl: bool,
    /// AVX-512 Doubleword/Quadword extensions available.
    pub has_avx512dq: bool,
    /// AVX-512 Conflict Detection extensions available.
    pub has_avx512cd: bool,
    /// SSE4.1 available.
    pub has_sse41: bool,
    /// SSE4.2 available.
    pub has_sse42: bool,
    /// Core count reported by feature detection (see `detect_cpu_features`).
    pub num_cores: usize,
    /// L1 data-cache size in bytes (a fixed default; not probed from hardware).
    pub l1_cache_size: usize,
    /// L2 cache size in bytes (a fixed default; not probed from hardware).
    pub l2_cache_size: usize,
    /// L3 cache size in bytes (a fixed default; not probed from hardware).
    pub l3_cache_size: usize,
}
38
/// SIMD instruction-set tiers the dispatcher can select between.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SimdVariant {
    /// Portable scalar fallback; works on every target.
    Scalar,
    /// 128-bit SSE4.1/4.2 path.
    Sse4,
    /// 256-bit AVX2 (+FMA) path.
    Avx2,
    /// 512-bit AVX-512 path.
    Avx512,
}
51
/// Runtime dispatcher that routes gate kernels to the best SIMD variant for
/// the host CPU and refines its choice from measured execution times.
pub struct AdaptiveSimdDispatcher {
    /// CPU capabilities detected once at initialization.
    cpu_features: CpuFeatures,
    /// Default variant derived from `cpu_features` at initialization.
    selected_variant: SimdVariant,
    /// Per-operation timing statistics, keyed by an operation/size string.
    performance_cache: Mutex<std::collections::HashMap<String, PerformanceData>>,
}
61
/// Running timing statistics for a single operation key.
#[derive(Debug, Clone)]
pub struct PerformanceData {
    // Mean observed execution time in nanoseconds (running average).
    avg_time: f64,
    // Number of samples folded into `avg_time`.
    samples: usize,
    // Variant that most recently beat the running average (see
    // `update_performance_cache`).
    best_variant: SimdVariant,
}
72
/// Process-wide dispatcher singleton, set exactly once by
/// `AdaptiveSimdDispatcher::initialize`.
static GLOBAL_DISPATCHER: OnceLock<AdaptiveSimdDispatcher> = OnceLock::new();
75
76impl AdaptiveSimdDispatcher {
77 pub fn initialize() -> QuantRS2Result<()> {
79 let cpu_features = Self::detect_cpu_features();
80 let selected_variant = Self::select_optimal_variant(&cpu_features);
81
82 let dispatcher = AdaptiveSimdDispatcher {
83 cpu_features,
84 selected_variant,
85 performance_cache: Mutex::new(std::collections::HashMap::new()),
86 };
87
88 GLOBAL_DISPATCHER.set(dispatcher).map_err(|_| {
89 QuantRS2Error::RuntimeError("Adaptive SIMD dispatcher already initialized".to_string())
90 })?;
91
92 Ok(())
93 }
94
95 pub fn instance() -> QuantRS2Result<&'static AdaptiveSimdDispatcher> {
97 GLOBAL_DISPATCHER.get().ok_or_else(|| {
98 QuantRS2Error::RuntimeError("Adaptive SIMD dispatcher not initialized".to_string())
99 })
100 }
101
102 fn detect_cpu_features() -> CpuFeatures {
104 #[cfg(target_arch = "x86_64")]
106 {
107 Self::detect_x86_64_features()
108 }
109 #[cfg(target_arch = "aarch64")]
110 {
111 Self::detect_aarch64_features()
112 }
113 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
114 {
115 CpuFeatures {
117 has_avx2: false,
118 has_avx512: false,
119 has_fma: false,
120 has_avx512vl: false,
121 has_avx512dq: false,
122 has_avx512cd: false,
123 has_sse41: false,
124 has_sse42: false,
125 num_cores: 1,
126 l1_cache_size: 32768,
127 l2_cache_size: 262144,
128 l3_cache_size: 8388608,
129 }
130 }
131 }
132
133 #[cfg(target_arch = "x86_64")]
134 fn detect_x86_64_features() -> CpuFeatures {
135 use std::arch::x86_64::*;
136
137 let has_avx2 = is_x86_feature_detected!("avx2");
139 let has_avx512 = is_x86_feature_detected!("avx512f");
140 let has_fma = is_x86_feature_detected!("fma");
141 let has_avx512vl = is_x86_feature_detected!("avx512vl");
142 let has_avx512dq = is_x86_feature_detected!("avx512dq");
143 let has_avx512cd = is_x86_feature_detected!("avx512cd");
144 let has_sse41 = is_x86_feature_detected!("sse4.1");
145 let has_sse42 = is_x86_feature_detected!("sse4.2");
146
147 let num_cores = 8; let (l1_cache, l2_cache, l3_cache) = Self::detect_cache_sizes();
150
151 CpuFeatures {
152 has_avx2,
153 has_avx512,
154 has_fma,
155 has_avx512vl,
156 has_avx512dq,
157 has_avx512cd,
158 has_sse41,
159 has_sse42,
160 num_cores,
161 l1_cache_size: l1_cache,
162 l2_cache_size: l2_cache,
163 l3_cache_size: l3_cache,
164 }
165 }
166
167 #[cfg(target_arch = "aarch64")]
168 fn detect_aarch64_features() -> CpuFeatures {
169 let num_cores = 8; let (l1_cache, l2_cache, l3_cache) = Self::detect_cache_sizes();
172
173 CpuFeatures {
174 has_avx2: false, has_avx512: false, has_fma: true, has_avx512vl: false,
178 has_avx512dq: false,
179 has_avx512cd: false,
180 has_sse41: false, has_sse42: false, num_cores,
183 l1_cache_size: l1_cache,
184 l2_cache_size: l2_cache,
185 l3_cache_size: l3_cache,
186 }
187 }
188
189 fn detect_cache_sizes() -> (usize, usize, usize) {
191 let l1_cache = 32768; let l2_cache = 262144; let l3_cache = 8388608; (l1_cache, l2_cache, l3_cache)
198 }
199
200 fn select_optimal_variant(features: &CpuFeatures) -> SimdVariant {
202 if features.has_avx512 && features.has_avx512vl && features.has_avx512dq {
203 SimdVariant::Avx512
204 } else if features.has_avx2 && features.has_fma {
205 SimdVariant::Avx2
206 } else if features.has_sse41 && features.has_sse42 {
207 SimdVariant::Sse4
208 } else {
209 SimdVariant::Scalar
210 }
211 }
212
213 pub fn apply_single_qubit_gate_adaptive(
215 &self,
216 state: &mut [Complex64],
217 target: usize,
218 matrix: &[Complex64; 4],
219 ) -> QuantRS2Result<()> {
220 let operation_key = format!("single_qubit_{}", state.len());
221 let variant = self.select_variant_for_operation(&operation_key, state.len());
222
223 let start_time = std::time::Instant::now();
224
225 let result = match variant {
226 SimdVariant::Avx512 => self.apply_single_qubit_sse4(state, target, matrix), SimdVariant::Avx2 => self.apply_single_qubit_sse4(state, target, matrix), SimdVariant::Sse4 => self.apply_single_qubit_sse4(state, target, matrix),
229 SimdVariant::Scalar => self.apply_single_qubit_scalar(state, target, matrix),
230 };
231
232 let execution_time = start_time.elapsed().as_nanos() as f64;
233 self.update_performance_cache(&operation_key, execution_time, variant);
234
235 result
236 }
237
238 pub fn apply_two_qubit_gate_adaptive(
240 &self,
241 state: &mut [Complex64],
242 control: usize,
243 target: usize,
244 matrix: &[Complex64; 16],
245 ) -> QuantRS2Result<()> {
246 let operation_key = format!("two_qubit_{}", state.len());
247 let variant = self.select_variant_for_operation(&operation_key, state.len());
248
249 let start_time = std::time::Instant::now();
250
251 let result = match variant {
252 SimdVariant::Avx512 => self.apply_two_qubit_avx512(state, control, target, matrix),
253 SimdVariant::Avx2 => self.apply_two_qubit_avx2(state, control, target, matrix),
254 SimdVariant::Sse4 => self.apply_two_qubit_sse4(state, control, target, matrix),
255 SimdVariant::Scalar => self.apply_two_qubit_scalar(state, control, target, matrix),
256 };
257
258 let execution_time = start_time.elapsed().as_nanos() as f64;
259 self.update_performance_cache(&operation_key, execution_time, variant);
260
261 result
262 }
263
264 pub fn apply_batch_gates_adaptive(
266 &self,
267 states: &mut [&mut [Complex64]],
268 gates: &[Box<dyn crate::gate::GateOp>],
269 ) -> QuantRS2Result<()> {
270 let batch_size = states.len();
271 let operation_key = format!("batch_{}_{}", batch_size, gates.len());
272 let variant = self.select_variant_for_operation(&operation_key, batch_size * 1000); let start_time = std::time::Instant::now();
275
276 let result = match variant {
277 SimdVariant::Avx512 => self.apply_batch_gates_avx512(states, gates),
278 SimdVariant::Avx2 => self.apply_batch_gates_avx2(states, gates),
279 SimdVariant::Sse4 => self.apply_batch_gates_sse4(states, gates),
280 SimdVariant::Scalar => self.apply_batch_gates_scalar(states, gates),
281 };
282
283 let execution_time = start_time.elapsed().as_nanos() as f64;
284 self.update_performance_cache(&operation_key, execution_time, variant);
285
286 result
287 }
288
289 fn select_variant_for_operation(&self, operation_key: &str, data_size: usize) -> SimdVariant {
291 if let Ok(cache) = self.performance_cache.lock() {
293 if let Some(perf_data) = cache.get(operation_key) {
294 if perf_data.samples >= 5 {
295 return perf_data.best_variant;
296 }
297 }
298 }
299
300 if data_size >= 1024 && self.cpu_features.has_avx512 {
302 SimdVariant::Avx512
303 } else if data_size >= 256 && self.cpu_features.has_avx2 {
304 SimdVariant::Avx2
305 } else if data_size >= 64 && self.cpu_features.has_sse41 {
306 SimdVariant::Sse4
307 } else {
308 SimdVariant::Scalar
309 }
310 }
311
312 fn update_performance_cache(
314 &self,
315 operation_key: &str,
316 execution_time: f64,
317 variant: SimdVariant,
318 ) {
319 if let Ok(mut cache) = self.performance_cache.lock() {
320 let perf_data =
321 cache
322 .entry(operation_key.to_string())
323 .or_insert_with(|| PerformanceData {
324 avg_time: execution_time,
325 samples: 0,
326 best_variant: variant,
327 });
328
329 perf_data.avg_time = (perf_data.avg_time * perf_data.samples as f64 + execution_time)
331 / (perf_data.samples + 1) as f64;
332 perf_data.samples += 1;
333
334 if execution_time < perf_data.avg_time * 0.9 {
336 perf_data.best_variant = variant;
337 }
338 }
339 }
340
341 pub fn get_performance_report(&self) -> AdaptivePerformanceReport {
343 let cache = self
344 .performance_cache
345 .lock()
346 .map(|cache| cache.clone())
347 .unwrap_or_default();
348
349 AdaptivePerformanceReport {
350 cpu_features: self.cpu_features,
351 selected_variant: self.selected_variant,
352 performance_cache: cache,
353 }
354 }
355
356 #[cfg(target_arch = "x86_64")]
359 fn apply_single_qubit_avx512(
360 &self,
361 state: &mut [Complex64],
362 target: usize,
363 matrix: &[Complex64; 4],
364 ) -> QuantRS2Result<()> {
365 simd_ops::apply_single_qubit_gate_simd(state, target, matrix)
367 }
368
369 #[cfg(target_arch = "x86_64")]
370 fn apply_single_qubit_avx2(
371 &self,
372 state: &mut [Complex64],
373 target: usize,
374 matrix: &[Complex64; 4],
375 ) -> QuantRS2Result<()> {
376 simd_ops::apply_single_qubit_gate_simd(state, target, matrix)
378 }
379
380 fn apply_single_qubit_sse4(
381 &self,
382 state: &mut [Complex64],
383 target: usize,
384 matrix: &[Complex64; 4],
385 ) -> QuantRS2Result<()> {
386 self.apply_single_qubit_scalar(state, target, matrix)
389 }
390
391 fn apply_single_qubit_scalar(
392 &self,
393 state: &mut [Complex64],
394 target: usize,
395 matrix: &[Complex64; 4],
396 ) -> QuantRS2Result<()> {
397 let n = state.len();
399 for i in 0..n {
400 if (i >> target) & 1 == 0 {
401 let j = i | (1 << target);
402 let temp0 = state[i];
403 let temp1 = state[j];
404 state[i] = matrix[0] * temp0 + matrix[1] * temp1;
405 state[j] = matrix[2] * temp0 + matrix[3] * temp1;
406 }
407 }
408 Ok(())
409 }
410
411 fn apply_two_qubit_avx512(
414 &self,
415 _state: &mut [Complex64],
416 _control: usize,
417 _target: usize,
418 _matrix: &[Complex64; 16],
419 ) -> QuantRS2Result<()> {
420 Ok(())
422 }
423
424 fn apply_two_qubit_avx2(
425 &self,
426 _state: &mut [Complex64],
427 _control: usize,
428 _target: usize,
429 _matrix: &[Complex64; 16],
430 ) -> QuantRS2Result<()> {
431 Ok(())
433 }
434
435 fn apply_two_qubit_sse4(
436 &self,
437 _state: &mut [Complex64],
438 _control: usize,
439 _target: usize,
440 _matrix: &[Complex64; 16],
441 ) -> QuantRS2Result<()> {
442 Ok(())
444 }
445
446 fn apply_two_qubit_scalar(
447 &self,
448 _state: &mut [Complex64],
449 _control: usize,
450 _target: usize,
451 _matrix: &[Complex64; 16],
452 ) -> QuantRS2Result<()> {
453 Ok(())
455 }
456
457 fn apply_batch_gates_avx512(
458 &self,
459 _states: &mut [&mut [Complex64]],
460 _gates: &[Box<dyn crate::gate::GateOp>],
461 ) -> QuantRS2Result<()> {
462 Ok(())
464 }
465
466 fn apply_batch_gates_avx2(
467 &self,
468 _states: &mut [&mut [Complex64]],
469 _gates: &[Box<dyn crate::gate::GateOp>],
470 ) -> QuantRS2Result<()> {
471 Ok(())
473 }
474
475 fn apply_batch_gates_sse4(
476 &self,
477 _states: &mut [&mut [Complex64]],
478 _gates: &[Box<dyn crate::gate::GateOp>],
479 ) -> QuantRS2Result<()> {
480 Ok(())
482 }
483
484 fn apply_batch_gates_scalar(
485 &self,
486 _states: &mut [&mut [Complex64]],
487 _gates: &[Box<dyn crate::gate::GateOp>],
488 ) -> QuantRS2Result<()> {
489 Ok(())
491 }
492}
493
/// Snapshot of dispatcher state and accumulated timing data, produced by
/// `AdaptiveSimdDispatcher::get_performance_report`.
#[derive(Debug, Clone)]
pub struct AdaptivePerformanceReport {
    /// CPU capabilities detected at initialization.
    pub cpu_features: CpuFeatures,
    /// Default SIMD variant selected at initialization.
    pub selected_variant: SimdVariant,
    /// Per-operation timing statistics collected so far.
    pub performance_cache: std::collections::HashMap<String, PerformanceData>,
}
501
502pub fn apply_single_qubit_adaptive(
504 state: &mut [Complex64],
505 target: usize,
506 matrix: &[Complex64; 4],
507) -> QuantRS2Result<()> {
508 AdaptiveSimdDispatcher::instance()?.apply_single_qubit_gate_adaptive(state, target, matrix)
509}
510
511pub fn apply_two_qubit_adaptive(
512 state: &mut [Complex64],
513 control: usize,
514 target: usize,
515 matrix: &[Complex64; 16],
516) -> QuantRS2Result<()> {
517 AdaptiveSimdDispatcher::instance()?
518 .apply_two_qubit_gate_adaptive(state, control, target, matrix)
519}
520
521pub fn apply_batch_gates_adaptive(
522 states: &mut [&mut [Complex64]],
523 gates: &[Box<dyn crate::gate::GateOp>],
524) -> QuantRS2Result<()> {
525 AdaptiveSimdDispatcher::instance()?.apply_batch_gates_adaptive(states, gates)
526}
527
/// Initialize the global adaptive SIMD dispatcher (one-time setup).
///
/// # Errors
/// Returns an error if the dispatcher was already initialized.
pub fn initialize_adaptive_simd() -> QuantRS2Result<()> {
    AdaptiveSimdDispatcher::initialize()
}
532
533pub fn get_adaptive_performance_report() -> QuantRS2Result<AdaptivePerformanceReport> {
535 Ok(AdaptiveSimdDispatcher::instance()?.get_performance_report())
536}
537
#[cfg(test)]
mod tests {
    use super::*;
    use num_complex::Complex64;

    /// Feature detection should always report sane, non-zero basics.
    #[test]
    fn test_cpu_feature_detection() {
        let features = AdaptiveSimdDispatcher::detect_cpu_features();
        println!("Detected CPU features: {:?}", features);

        // Every machine has at least one core and a non-empty L1 cache.
        assert!(features.num_cores >= 1);
        assert!(features.l1_cache_size > 0);
    }

    /// AVX2 + FMA without full AVX-512 support must select the AVX2 tier.
    #[test]
    fn test_simd_variant_selection() {
        let features = CpuFeatures {
            has_avx2: true,
            has_avx512: false,
            has_fma: true,
            has_avx512vl: false,
            has_avx512dq: false,
            has_avx512cd: false,
            has_sse41: true,
            has_sse42: true,
            num_cores: 8,
            l1_cache_size: 32768,
            l2_cache_size: 262144,
            l3_cache_size: 8388608,
        };

        let variant = AdaptiveSimdDispatcher::select_optimal_variant(&features);
        assert_eq!(variant, SimdVariant::Avx2);
    }

    /// Hadamard on |0> must yield equal real amplitudes of 1/sqrt(2).
    #[test]
    fn test_adaptive_single_qubit_gate() {
        // Result ignored: another test may have initialized the global
        // dispatcher already, which is fine here.
        let _ = AdaptiveSimdDispatcher::initialize();

        let mut state = vec![Complex64::new(1.0, 0.0), Complex64::new(0.0, 0.0)];

        // Row-major 2x2 Hadamard matrix.
        let hadamard_matrix = [
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(1.0 / 2.0_f64.sqrt(), 0.0),
            Complex64::new(-1.0 / 2.0_f64.sqrt(), 0.0),
        ];

        let result = apply_single_qubit_adaptive(&mut state, 0, &hadamard_matrix);
        assert!(result.is_ok());

        let expected_amplitude = 1.0 / 2.0_f64.sqrt();
        assert!((state[0].re - expected_amplitude).abs() < 1e-10);
        assert!((state[1].re - expected_amplitude).abs() < 1e-10);
    }

    /// Two samples of 100 ns and 150 ns must average to 125 ns.
    #[test]
    fn test_performance_caching() {
        // Build a local dispatcher directly to avoid the global singleton.
        let dispatcher = AdaptiveSimdDispatcher {
            cpu_features: AdaptiveSimdDispatcher::detect_cpu_features(),
            selected_variant: SimdVariant::Avx2,
            performance_cache: Mutex::new(std::collections::HashMap::new()),
        };

        dispatcher.update_performance_cache("test_op", 100.0, SimdVariant::Avx2);
        dispatcher.update_performance_cache("test_op", 150.0, SimdVariant::Avx2);

        let perf_data = dispatcher
            .performance_cache
            .lock()
            .unwrap()
            .get("test_op")
            .unwrap()
            .clone();
        assert_eq!(perf_data.samples, 2);
        assert!((perf_data.avg_time - 125.0).abs() < 1e-10);
    }
}