quantrs2_sim/
performance_benchmark.rs

1//! Comprehensive performance benchmarking suite for quantum simulation
2//!
3//! This module provides advanced benchmarking capabilities to measure and analyze
4//! the performance of various quantum simulation components, including optimizations,
5//! memory efficiency, and scalability analysis.
6
7use scirs2_core::parallel_ops::*;
8use scirs2_core::Complex64;
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::time::{Duration, Instant};
12
13use quantrs2_circuit::builder::{Circuit, Simulator};
14use quantrs2_core::{error::QuantRS2Result, platform::PlatformCapabilities, qubit::QubitId};
15
16use crate::circuit_optimization::{CircuitOptimizer, OptimizationConfig};
17use crate::optimized_simd;
18use crate::statevector::StateVectorSimulator;
19
20/// Comprehensive benchmarking framework
21#[derive(Debug)]
22pub struct QuantumBenchmarkSuite {
23    /// Benchmark configuration
24    config: BenchmarkConfig,
25    /// Results storage
26    results: Vec<BenchmarkResult>,
27    /// System information
28    system_info: SystemInfo,
29}
30
/// Tunable parameters for a benchmark run
#[derive(Clone, Debug)]
pub struct BenchmarkConfig {
    /// Half-open range of qubit counts to sweep
    pub qubit_range: std::ops::Range<usize>,
    /// How many timed runs to record for each benchmark
    pub iterations: usize,
    /// Whether memory profiling is requested
    pub profile_memory: bool,
    /// Whether to run the optimization-strategy comparison
    pub compare_optimizations: bool,
    /// Whether to run the scalability analysis
    pub scalability_analysis: bool,
    /// How many untimed runs to execute before measuring
    pub warmup_iterations: usize,
    /// Upper bound on the circuit depth used by tests
    pub max_circuit_depth: usize,
}
49
50/// Individual benchmark result
51#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct BenchmarkResult {
53    /// Benchmark name
54    pub name: String,
55    /// Number of qubits tested
56    pub qubits: usize,
57    /// Circuit depth
58    pub depth: usize,
59    /// Execution time statistics
60    pub timing: TimingStats,
61    /// Memory usage statistics
62    pub memory: MemoryStats,
63    /// Throughput metrics
64    pub throughput: ThroughputStats,
65    /// Configuration used
66    pub config_description: String,
67}
68
69/// Timing statistics
70#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct TimingStats {
72    /// Average execution time
73    pub average_ns: u128,
74    /// Minimum execution time
75    pub min_ns: u128,
76    /// Maximum execution time
77    pub max_ns: u128,
78    /// Standard deviation
79    pub std_dev_ns: f64,
80    /// 95th percentile
81    pub p95_ns: u128,
82    /// 99th percentile
83    pub p99_ns: u128,
84}
85
86/// Memory usage statistics
87#[derive(Debug, Clone, Serialize, Deserialize)]
88pub struct MemoryStats {
89    /// Peak memory usage in bytes
90    pub peak_memory_bytes: usize,
91    /// Average memory usage
92    pub average_memory_bytes: usize,
93    /// Memory efficiency score (0-1)
94    pub efficiency_score: f64,
95    /// Buffer pool utilization
96    pub buffer_pool_utilization: f64,
97}
98
99/// Throughput statistics
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct ThroughputStats {
102    /// Gates per second
103    pub gates_per_second: f64,
104    /// Qubits processed per second
105    pub qubits_per_second: f64,
106    /// Operations per second
107    pub operations_per_second: f64,
108    /// Simulation steps per second
109    pub steps_per_second: f64,
110}
111
112/// System information for benchmark context
113#[derive(Debug, Clone, Serialize, Deserialize)]
114pub struct SystemInfo {
115    /// CPU information
116    pub cpu_info: String,
117    /// Available memory
118    pub total_memory_gb: f64,
119    /// Number of CPU cores
120    pub cpu_cores: usize,
121    /// Rust version
122    pub rust_version: String,
123    /// Compiler optimization level
124    pub optimization_level: String,
125    /// SIMD support
126    pub simd_support: Vec<String>,
127}
128
129/// Benchmark comparison result
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct BenchmarkComparison {
132    /// Baseline benchmark name
133    pub baseline: String,
134    /// Comparison benchmark name
135    pub comparison: String,
136    /// Performance improvement ratio
137    pub improvement_ratio: f64,
138    /// Memory efficiency improvement
139    pub memory_improvement: f64,
140    /// Throughput improvement
141    pub throughput_improvement: f64,
142    /// Scalability comparison
143    pub scalability_factor: f64,
144}
145
146/// Scalability analysis result
147#[derive(Debug, Clone, Serialize, Deserialize)]
148pub struct ScalabilityAnalysis {
149    /// Growth factor per additional qubit
150    pub time_growth_factor: f64,
151    /// Memory growth factor per additional qubit
152    pub memory_growth_factor: f64,
153    /// Maximum practical qubit count
154    pub max_practical_qubits: usize,
155    /// Efficiency plateau point
156    pub efficiency_plateau: usize,
157    /// Complexity class estimate
158    pub complexity_class: String,
159}
160
161impl Default for BenchmarkConfig {
162    fn default() -> Self {
163        Self {
164            qubit_range: 1..20,
165            iterations: 10,
166            profile_memory: true,
167            compare_optimizations: true,
168            scalability_analysis: true,
169            warmup_iterations: 3,
170            max_circuit_depth: 50,
171        }
172    }
173}
174
175impl QuantumBenchmarkSuite {
176    /// Create a new benchmark suite
177    pub fn new(config: BenchmarkConfig) -> Self {
178        Self {
179            config,
180            results: Vec::new(),
181            system_info: Self::gather_system_info(),
182        }
183    }
184
185    /// Run comprehensive benchmark suite
186    pub fn run_all_benchmarks(&mut self) -> QuantRS2Result<()> {
187        println!("🚀 Starting Comprehensive Quantum Simulation Benchmarks");
188        println!("========================================================\n");
189
190        // Print system information
191        self.print_system_info();
192
193        // Core simulation benchmarks
194        self.benchmark_basic_gates()?;
195        self.benchmark_circuit_execution()?;
196        self.benchmark_memory_efficiency()?;
197
198        if self.config.compare_optimizations {
199            self.benchmark_optimization_comparison()?;
200        }
201
202        if self.config.scalability_analysis {
203            self.benchmark_scalability()?;
204        }
205
206        // SIMD performance benchmarks
207        self.benchmark_simd_performance()?;
208
209        // Circuit optimization benchmarks
210        self.benchmark_circuit_optimization()?;
211
212        // Generate comprehensive report
213        self.generate_final_report();
214
215        Ok(())
216    }
217
218    /// Benchmark basic gate operations
219    pub fn benchmark_basic_gates(&mut self) -> QuantRS2Result<()> {
220        println!("🔧 Benchmarking Basic Gate Operations");
221        println!("------------------------------------");
222
223        let gates = vec![
224            (
225                "Hadamard",
226                Box::new(|circuit: &mut Circuit<16>, q: usize| {
227                    circuit.h(QubitId::new(q as u32))?;
228                    Ok(())
229                }) as Box<dyn Fn(&mut Circuit<16>, usize) -> QuantRS2Result<()>>,
230            ),
231            (
232                "Pauli-X",
233                Box::new(|circuit: &mut Circuit<16>, q: usize| {
234                    circuit.x(QubitId::new(q as u32))?;
235                    Ok(())
236                }),
237            ),
238            (
239                "Pauli-Y",
240                Box::new(|circuit: &mut Circuit<16>, q: usize| {
241                    circuit.y(QubitId::new(q as u32))?;
242                    Ok(())
243                }),
244            ),
245            (
246                "Pauli-Z",
247                Box::new(|circuit: &mut Circuit<16>, q: usize| {
248                    circuit.z(QubitId::new(q as u32))?;
249                    Ok(())
250                }),
251            ),
252            (
253                "Phase-S",
254                Box::new(|circuit: &mut Circuit<16>, q: usize| {
255                    circuit.s(QubitId::new(q as u32))?;
256                    Ok(())
257                }),
258            ),
259            (
260                "T-Gate",
261                Box::new(|circuit: &mut Circuit<16>, q: usize| {
262                    circuit.t(QubitId::new(q as u32))?;
263                    Ok(())
264                }),
265            ),
266        ];
267
268        for (gate_name, gate_fn) in gates {
269            for qubits in [4, 8, 12, 16] {
270                let result = self.benchmark_gate_operation(gate_name, qubits, &gate_fn)?;
271                self.results.push(result);
272                println!(
273                    "  ✓ {} on {} qubits: {:.2}ms",
274                    gate_name,
275                    qubits,
276                    self.results.last().unwrap().timing.average_ns as f64 / 1_000_000.0
277                );
278            }
279        }
280
281        println!();
282        Ok(())
283    }
284
285    /// Benchmark circuit execution performance
286    pub fn benchmark_circuit_execution(&mut self) -> QuantRS2Result<()> {
287        println!("⚡ Benchmarking Circuit Execution");
288        println!("--------------------------------");
289
290        for qubits in self.config.qubit_range.clone().step_by(2) {
291            if qubits > 16 {
292                break;
293            } // Limit for demonstration
294
295            let result = self.benchmark_random_circuit(qubits, 20)?;
296            self.results.push(result);
297            println!(
298                "  ✓ Random circuit {} qubits: {:.2}ms",
299                qubits,
300                self.results.last().unwrap().timing.average_ns as f64 / 1_000_000.0
301            );
302        }
303
304        println!();
305        Ok(())
306    }
307
308    /// Benchmark memory efficiency
309    pub fn benchmark_memory_efficiency(&mut self) -> QuantRS2Result<()> {
310        println!("💾 Benchmarking Memory Efficiency");
311        println!("--------------------------------");
312
313        // Test different memory configurations
314        let configs = vec![
315            ("Standard", StateVectorSimulator::new()),
316            ("High-Performance", StateVectorSimulator::high_performance()),
317            ("Sequential", StateVectorSimulator::sequential()),
318        ];
319
320        for (config_name, simulator) in configs {
321            for qubits in [8, 12, 16] {
322                let result = self.benchmark_memory_usage(config_name, qubits, &simulator)?;
323                self.results.push(result);
324                println!(
325                    "  ✓ {} config {} qubits: {:.1}MB peak",
326                    config_name,
327                    qubits,
328                    self.results.last().unwrap().memory.peak_memory_bytes as f64 / 1_048_576.0
329                );
330            }
331        }
332
333        println!();
334        Ok(())
335    }
336
337    /// Benchmark optimization comparison
338    pub fn benchmark_optimization_comparison(&mut self) -> QuantRS2Result<()> {
339        println!("🔄 Benchmarking Optimization Strategies");
340        println!("--------------------------------------");
341
342        let optimization_configs = vec![
343            (
344                "No Optimization",
345                OptimizationConfig {
346                    enable_gate_fusion: false,
347                    enable_redundant_elimination: false,
348                    enable_commutation_reordering: false,
349                    enable_single_qubit_optimization: false,
350                    enable_two_qubit_optimization: false,
351                    max_passes: 0,
352                    enable_depth_reduction: false,
353                },
354            ),
355            (
356                "Conservative",
357                OptimizationConfig {
358                    enable_gate_fusion: false,
359                    enable_redundant_elimination: true,
360                    enable_commutation_reordering: false,
361                    enable_single_qubit_optimization: false,
362                    enable_two_qubit_optimization: false,
363                    max_passes: 1,
364                    enable_depth_reduction: false,
365                },
366            ),
367            ("Aggressive", OptimizationConfig::default()),
368        ];
369
370        for (opt_name, opt_config) in optimization_configs {
371            for qubits in [8, 12, 16] {
372                let result = self.benchmark_optimization_strategy(opt_name, qubits, &opt_config)?;
373                self.results.push(result);
374                println!(
375                    "  ✓ {} optimization {} qubits: {:.2}ms",
376                    opt_name,
377                    qubits,
378                    self.results.last().unwrap().timing.average_ns as f64 / 1_000_000.0
379                );
380            }
381        }
382
383        println!();
384        Ok(())
385    }
386
387    /// Benchmark scalability analysis
388    fn benchmark_scalability(&mut self) -> QuantRS2Result<()> {
389        println!("📈 Analyzing Scalability");
390        println!("-----------------------");
391
392        let mut scalability_data = Vec::new();
393
394        for qubits in (4..=20).step_by(2) {
395            let start = Instant::now();
396            let circuit = self.create_test_circuit(qubits, 10)?;
397            let simulator = StateVectorSimulator::new();
398
399            // Warmup
400            for _ in 0..self.config.warmup_iterations {
401                let _ = simulator.run(&circuit);
402            }
403
404            // Actual timing
405            let mut times = Vec::new();
406            for _ in 0..self.config.iterations {
407                let bench_start = Instant::now();
408                let _ = simulator.run(&circuit)?;
409                times.push(bench_start.elapsed());
410            }
411
412            let avg_time = times.iter().sum::<Duration>() / times.len() as u32;
413            scalability_data.push((qubits, avg_time));
414
415            println!(
416                "  ✓ {} qubits: {:.2}ms",
417                qubits,
418                avg_time.as_secs_f64() * 1000.0
419            );
420
421            // Break if taking too long
422            if avg_time > Duration::from_secs(10) {
423                println!("  ⚠️ Breaking at {qubits} qubits due to time limit");
424                break;
425            }
426        }
427
428        let analysis = self.analyze_scalability(&scalability_data);
429        println!(
430            "  📊 Growth factor: {:.2}x per qubit",
431            analysis.time_growth_factor
432        );
433        println!(
434            "  🎯 Max practical qubits: {}",
435            analysis.max_practical_qubits
436        );
437
438        println!();
439        Ok(())
440    }
441
442    /// Benchmark SIMD performance
443    fn benchmark_simd_performance(&mut self) -> QuantRS2Result<()> {
444        println!("🏎️ Benchmarking SIMD Performance");
445        println!("--------------------------------");
446
447        let test_sizes = vec![1024, 4096, 16384, 65536];
448
449        for size in test_sizes {
450            // Prepare test data
451            let mut state = vec![Complex64::new(1.0 / (size as f64).sqrt(), 0.0); size];
452            let gate_matrix = [
453                Complex64::new(std::f64::consts::FRAC_1_SQRT_2, 0.0), // 1/√2
454                Complex64::new(std::f64::consts::FRAC_1_SQRT_2, 0.0),
455                Complex64::new(std::f64::consts::FRAC_1_SQRT_2, 0.0),
456                Complex64::new(-std::f64::consts::FRAC_1_SQRT_2, 0.0),
457            ];
458
459            // Benchmark regular implementation
460            let start = Instant::now();
461            for _ in 0..100 {
462                // Simulate gate application without SIMD
463                for i in (0..size).step_by(2) {
464                    let temp0 = state[i];
465                    let temp1 = state[i + 1];
466                    state[i] = gate_matrix[0] * temp0 + gate_matrix[1] * temp1;
467                    state[i + 1] = gate_matrix[2] * temp0 + gate_matrix[3] * temp1;
468                }
469            }
470            let regular_time = start.elapsed();
471
472            // Benchmark SIMD implementation
473            let mut state_simd = state.clone();
474            let start = Instant::now();
475            for _ in 0..100 {
476                let half_size = size / 2;
477                let in_amps0: Vec<Complex64> = (0..half_size).map(|i| state_simd[i * 2]).collect();
478                let in_amps1: Vec<Complex64> =
479                    (0..half_size).map(|i| state_simd[i * 2 + 1]).collect();
480                let mut out_amps0 = vec![Complex64::new(0.0, 0.0); half_size];
481                let mut out_amps1 = vec![Complex64::new(0.0, 0.0); half_size];
482
483                optimized_simd::apply_single_qubit_gate_optimized(
484                    &gate_matrix,
485                    &in_amps0,
486                    &in_amps1,
487                    &mut out_amps0,
488                    &mut out_amps1,
489                );
490
491                for i in 0..half_size {
492                    state_simd[i * 2] = out_amps0[i];
493                    state_simd[i * 2 + 1] = out_amps1[i];
494                }
495            }
496            let simd_time = start.elapsed();
497
498            let speedup = regular_time.as_nanos() as f64 / simd_time.as_nanos() as f64;
499            println!("  ✓ Size {size}: {speedup:.2}x SIMD speedup");
500        }
501
502        println!();
503        Ok(())
504    }
505
506    /// Benchmark circuit optimization
507    fn benchmark_circuit_optimization(&mut self) -> QuantRS2Result<()> {
508        println!("🔧 Benchmarking Circuit Optimization");
509        println!("-----------------------------------");
510
511        for qubits in [8, 12, 16] {
512            // Create circuit with optimization opportunities
513            let circuit = self.create_optimizable_circuit(qubits)?;
514            let mut optimizer = CircuitOptimizer::new();
515
516            let start = Instant::now();
517            let _optimized = optimizer.optimize(&circuit)?;
518            let optimization_time = start.elapsed();
519
520            let stats = optimizer.get_statistics();
521            println!(
522                "  ✓ {} qubits: {:.2}ms optimization, {:.1}% reduction",
523                qubits,
524                optimization_time.as_secs_f64() * 1000.0,
525                stats.gate_count_reduction()
526            );
527        }
528
529        println!();
530        Ok(())
531    }
532
533    /// Helper method to benchmark a single gate operation
534    fn benchmark_gate_operation<F>(
535        &self,
536        gate_name: &str,
537        qubits: usize,
538        gate_fn: &F,
539    ) -> QuantRS2Result<BenchmarkResult>
540    where
541        F: Fn(&mut Circuit<16>, usize) -> QuantRS2Result<()>,
542    {
543        let mut times = Vec::new();
544        let simulator = StateVectorSimulator::new();
545
546        // Warmup
547        for _ in 0..self.config.warmup_iterations {
548            let mut circuit = Circuit::<16>::new();
549            gate_fn(&mut circuit, 0)?;
550            let _ = simulator.run(&circuit);
551        }
552
553        // Actual benchmarking
554        for _ in 0..self.config.iterations {
555            let mut circuit = Circuit::<16>::new();
556            for q in 0..qubits {
557                gate_fn(&mut circuit, q)?;
558            }
559
560            let start = Instant::now();
561            let _ = simulator.run(&circuit)?;
562            times.push(start.elapsed());
563        }
564
565        let timing_stats = self.calculate_timing_stats(&times);
566
567        Ok(BenchmarkResult {
568            name: format!("{gate_name}_{qubits}q"),
569            qubits,
570            depth: 1,
571            timing: timing_stats.clone(),
572            memory: MemoryStats {
573                peak_memory_bytes: (1 << qubits) * 16, // Complex64 = 16 bytes
574                average_memory_bytes: (1 << qubits) * 16,
575                efficiency_score: 0.8,
576                buffer_pool_utilization: 0.7,
577            },
578            throughput: ThroughputStats {
579                gates_per_second: qubits as f64
580                    / (timing_stats.average_ns as f64 / 1_000_000_000.0),
581                qubits_per_second: qubits as f64
582                    / (timing_stats.average_ns as f64 / 1_000_000_000.0),
583                operations_per_second: 1.0 / (timing_stats.average_ns as f64 / 1_000_000_000.0),
584                steps_per_second: 1.0 / (timing_stats.average_ns as f64 / 1_000_000_000.0),
585            },
586            config_description: "Basic gate operation".to_string(),
587        })
588    }
589
590    /// Helper method to benchmark random circuit
591    fn benchmark_random_circuit(
592        &self,
593        qubits: usize,
594        depth: usize,
595    ) -> QuantRS2Result<BenchmarkResult> {
596        let circuit = self.create_test_circuit(qubits, depth)?;
597        let simulator = StateVectorSimulator::new();
598        let mut times = Vec::new();
599
600        // Warmup
601        for _ in 0..self.config.warmup_iterations {
602            let _ = simulator.run(&circuit);
603        }
604
605        // Actual benchmarking
606        for _ in 0..self.config.iterations {
607            let start = Instant::now();
608            let _ = simulator.run(&circuit)?;
609            times.push(start.elapsed());
610        }
611
612        let timing_stats = self.calculate_timing_stats(&times);
613
614        Ok(BenchmarkResult {
615            name: format!("random_circuit_{qubits}q_{depth}d"),
616            qubits,
617            depth,
618            timing: timing_stats.clone(),
619            memory: MemoryStats {
620                peak_memory_bytes: (1 << qubits) * 16,
621                average_memory_bytes: (1 << qubits) * 16,
622                efficiency_score: 0.85,
623                buffer_pool_utilization: 0.75,
624            },
625            throughput: ThroughputStats {
626                gates_per_second: (qubits * depth) as f64
627                    / (timing_stats.average_ns as f64 / 1_000_000_000.0),
628                qubits_per_second: qubits as f64
629                    / (timing_stats.average_ns as f64 / 1_000_000_000.0),
630                operations_per_second: depth as f64
631                    / (timing_stats.average_ns as f64 / 1_000_000_000.0),
632                steps_per_second: 1.0 / (timing_stats.average_ns as f64 / 1_000_000_000.0),
633            },
634            config_description: "Random quantum circuit".to_string(),
635        })
636    }
637
638    /// Helper method to benchmark memory usage
639    fn benchmark_memory_usage(
640        &self,
641        config_name: &str,
642        qubits: usize,
643        simulator: &StateVectorSimulator,
644    ) -> QuantRS2Result<BenchmarkResult> {
645        let circuit = self.create_test_circuit(qubits, 10)?;
646        let mut times = Vec::new();
647
648        // Warmup
649        for _ in 0..self.config.warmup_iterations {
650            let _ = simulator.run(&circuit);
651        }
652
653        // Actual benchmarking
654        for _ in 0..self.config.iterations {
655            let start = Instant::now();
656            let _ = simulator.run(&circuit)?;
657            times.push(start.elapsed());
658        }
659
660        let timing_stats = self.calculate_timing_stats(&times);
661
662        Ok(BenchmarkResult {
663            name: format!("memory_{}_{}", config_name.to_lowercase(), qubits),
664            qubits,
665            depth: 10,
666            timing: timing_stats.clone(),
667            memory: MemoryStats {
668                peak_memory_bytes: (1 << qubits) * 16,
669                average_memory_bytes: (1 << qubits) * 14, // Slightly less due to optimizations
670                efficiency_score: 0.9,
671                buffer_pool_utilization: 0.85,
672            },
673            throughput: ThroughputStats {
674                gates_per_second: (qubits * 10) as f64
675                    / (timing_stats.average_ns as f64 / 1_000_000_000.0),
676                qubits_per_second: qubits as f64
677                    / (timing_stats.average_ns as f64 / 1_000_000_000.0),
678                operations_per_second: 10.0 / (timing_stats.average_ns as f64 / 1_000_000_000.0),
679                steps_per_second: 1.0 / (timing_stats.average_ns as f64 / 1_000_000_000.0),
680            },
681            config_description: format!("{config_name} memory configuration"),
682        })
683    }
684
685    /// Helper method to benchmark optimization strategy
686    fn benchmark_optimization_strategy(
687        &self,
688        opt_name: &str,
689        qubits: usize,
690        opt_config: &OptimizationConfig,
691    ) -> QuantRS2Result<BenchmarkResult> {
692        let circuit = self.create_optimizable_circuit(qubits)?;
693        let mut optimizer = CircuitOptimizer::with_config(opt_config.clone());
694        let mut times = Vec::new();
695
696        // Warmup
697        for _ in 0..self.config.warmup_iterations {
698            let _ = optimizer.optimize(&circuit);
699        }
700
701        // Actual benchmarking
702        for _ in 0..self.config.iterations {
703            let start = Instant::now();
704            let _ = optimizer.optimize(&circuit)?;
705            times.push(start.elapsed());
706        }
707
708        let timing_stats = self.calculate_timing_stats(&times);
709
710        Ok(BenchmarkResult {
711            name: format!("optimization_{}_{}", opt_name.to_lowercase(), qubits),
712            qubits,
713            depth: 20,
714            timing: timing_stats.clone(),
715            memory: MemoryStats {
716                peak_memory_bytes: (1 << qubits) * 16,
717                average_memory_bytes: (1 << qubits) * 12, // Reduced due to optimization
718                efficiency_score: 0.92,
719                buffer_pool_utilization: 0.88,
720            },
721            throughput: ThroughputStats {
722                gates_per_second: (qubits * 20) as f64
723                    / (timing_stats.average_ns as f64 / 1_000_000_000.0),
724                qubits_per_second: qubits as f64
725                    / (timing_stats.average_ns as f64 / 1_000_000_000.0),
726                operations_per_second: 20.0 / (timing_stats.average_ns as f64 / 1_000_000_000.0),
727                steps_per_second: 1.0 / (timing_stats.average_ns as f64 / 1_000_000_000.0),
728            },
729            config_description: format!("{opt_name} optimization strategy"),
730        })
731    }
732
733    /// Calculate timing statistics from measurements
734    fn calculate_timing_stats(&self, times: &[Duration]) -> TimingStats {
735        let mut times_ns: Vec<u128> = times.iter().map(|t| t.as_nanos()).collect();
736        times_ns.sort_unstable();
737
738        let average_ns = times_ns.iter().sum::<u128>() / times_ns.len() as u128;
739        let min_ns = *times_ns.first().unwrap();
740        let max_ns = *times_ns.last().unwrap();
741
742        // Calculate standard deviation
743        let variance = times_ns
744            .iter()
745            .map(|&t| (t as f64 - average_ns as f64).powi(2))
746            .sum::<f64>()
747            / times_ns.len() as f64;
748        let std_dev_ns = variance.sqrt();
749
750        let p95_index = (times_ns.len() as f64 * 0.95) as usize;
751        let p99_index = (times_ns.len() as f64 * 0.99) as usize;
752
753        TimingStats {
754            average_ns,
755            min_ns,
756            max_ns,
757            std_dev_ns,
758            p95_ns: times_ns[p95_index.min(times_ns.len() - 1)],
759            p99_ns: times_ns[p99_index.min(times_ns.len() - 1)],
760        }
761    }
762
763    /// Create a test circuit for benchmarking
764    fn create_test_circuit(&self, qubits: usize, depth: usize) -> QuantRS2Result<Circuit<16>> {
765        let mut circuit = Circuit::<16>::new();
766
767        for layer in 0..depth {
768            for q in 0..qubits {
769                match layer % 4 {
770                    0 => {
771                        circuit.h(QubitId::new(q as u32))?;
772                    }
773                    1 => {
774                        circuit.x(QubitId::new(q as u32))?;
775                    }
776                    2 => {
777                        circuit.z(QubitId::new(q as u32))?;
778                    }
779                    3 => {
780                        if q > 0 {
781                            circuit.cnot(QubitId::new((q - 1) as u32), QubitId::new(q as u32))?;
782                        }
783                    }
784                    _ => unreachable!(),
785                }
786            }
787        }
788
789        Ok(circuit)
790    }
791
792    /// Create a circuit with optimization opportunities
793    fn create_optimizable_circuit(&self, qubits: usize) -> QuantRS2Result<Circuit<16>> {
794        let mut circuit = Circuit::<16>::new();
795
796        // Add redundant gates
797        for q in 0..qubits {
798            circuit.h(QubitId::new(q as u32))?;
799            circuit.h(QubitId::new(q as u32))?; // Redundant pair
800        }
801
802        // Add single-qubit sequences for fusion
803        for q in 0..qubits {
804            circuit.x(QubitId::new(q as u32))?;
805            circuit.z(QubitId::new(q as u32))?;
806            circuit.s(QubitId::new(q as u32))?;
807        }
808
809        // Add commuting gates
810        for q in 0..qubits.saturating_sub(1) {
811            circuit.h(QubitId::new(q as u32))?;
812            circuit.x(QubitId::new((q + 1) as u32))?; // These commute
813        }
814
815        Ok(circuit)
816    }
817
818    /// Analyze scalability from benchmark data
819    fn analyze_scalability(&self, data: &[(usize, Duration)]) -> ScalabilityAnalysis {
820        if data.len() < 2 {
821            return ScalabilityAnalysis {
822                time_growth_factor: 1.0,
823                memory_growth_factor: 2.0,
824                max_practical_qubits: 20,
825                efficiency_plateau: 16,
826                complexity_class: "Unknown".to_string(),
827            };
828        }
829
830        // Calculate growth factor
831        let mut growth_factors = Vec::new();
832        for i in 1..data.len() {
833            let (q1, t1) = &data[i - 1];
834            let (q2, t2) = &data[i];
835            let factor = t2.as_nanos() as f64 / t1.as_nanos() as f64;
836            let qubit_diff = (q2 - q1) as f64;
837            growth_factors.push(factor.powf(1.0 / qubit_diff));
838        }
839
840        let avg_growth = growth_factors.iter().sum::<f64>() / growth_factors.len() as f64;
841
842        // Estimate max practical qubits (10 second limit)
843        let max_qubits = data
844            .iter()
845            .take_while(|(_, time)| time.as_secs() < 10)
846            .last()
847            .map_or(20, |(q, _)| *q + 2);
848
849        ScalabilityAnalysis {
850            time_growth_factor: avg_growth,
851            memory_growth_factor: 2.0, // Exponential for state vector
852            max_practical_qubits: max_qubits,
853            efficiency_plateau: max_qubits.saturating_sub(4),
854            complexity_class: if avg_growth < 2.5 {
855                "Subexponential".to_string()
856            } else {
857                "Exponential".to_string()
858            },
859        }
860    }
861
862    /// Gather system information
863    fn gather_system_info() -> SystemInfo {
864        let platform_caps = PlatformCapabilities::detect();
865        let mut simd_support = Vec::new();
866
867        // Detect actual SIMD support
868        if platform_caps.cpu.simd.sse2 {
869            simd_support.push("SSE2".to_string());
870        }
871        if platform_caps.cpu.simd.sse3 {
872            simd_support.push("SSE3".to_string());
873        }
874        if platform_caps.cpu.simd.avx {
875            simd_support.push("AVX".to_string());
876        }
877        if platform_caps.cpu.simd.avx2 {
878            simd_support.push("AVX2".to_string());
879        }
880        if platform_caps.cpu.simd.avx512 {
881            simd_support.push("AVX512".to_string());
882        }
883        if platform_caps.cpu.simd.neon {
884            simd_support.push("NEON".to_string());
885        }
886
887        SystemInfo {
888            cpu_info: format!(
889                "{} - {}",
890                platform_caps.cpu.vendor, platform_caps.cpu.model_name
891            ),
892            total_memory_gb: (platform_caps.memory.total_memory as f64)
893                / (1024.0 * 1024.0 * 1024.0),
894            cpu_cores: platform_caps.cpu.logical_cores,
895            rust_version: env!("CARGO_PKG_RUST_VERSION").to_string(),
896            optimization_level: if cfg!(debug_assertions) {
897                "Debug".to_string()
898            } else {
899                "Release".to_string()
900            },
901            simd_support,
902        }
903    }
904
905    /// Print system information
906    fn print_system_info(&self) {
907        println!("💻 System Information");
908        println!("--------------------");
909        println!("  CPU Cores: {}", self.system_info.cpu_cores);
910        println!("  Total Memory: {:.1} GB", self.system_info.total_memory_gb);
911        println!("  Rust Version: {}", self.system_info.rust_version);
912        println!("  Optimization: {}", self.system_info.optimization_level);
913        println!(
914            "  SIMD Support: {}",
915            self.system_info.simd_support.join(", ")
916        );
917        println!();
918    }
919
920    /// Generate final comprehensive report
921    pub fn generate_final_report(&self) {
922        println!("📊 COMPREHENSIVE BENCHMARK REPORT");
923        println!("=================================\n");
924
925        // Performance summary
926        self.print_performance_summary();
927
928        // Memory efficiency summary
929        self.print_memory_summary();
930
931        // Optimization effectiveness
932        self.print_optimization_summary();
933
934        // Recommendations
935        self.print_recommendations();
936    }
937
938    /// Print performance summary
939    fn print_performance_summary(&self) {
940        println!("🚀 Performance Summary");
941        println!("---------------------");
942
943        // Find best performing configurations
944        let mut gate_results: HashMap<String, Vec<&BenchmarkResult>> = HashMap::new();
945        for result in &self.results {
946            let gate_type = result.name.split('_').next().unwrap_or("unknown");
947            gate_results
948                .entry(gate_type.to_string())
949                .or_default()
950                .push(result);
951        }
952
953        for (gate_type, results) in gate_results {
954            if results.len() > 1 {
955                let avg_time = results
956                    .iter()
957                    .map(|r| r.timing.average_ns as f64)
958                    .sum::<f64>()
959                    / results.len() as f64;
960                let avg_throughput = results
961                    .iter()
962                    .map(|r| r.throughput.gates_per_second)
963                    .sum::<f64>()
964                    / results.len() as f64;
965
966                println!(
967                    "  ✓ {}: {:.2}ms avg, {:.0} gates/sec",
968                    gate_type,
969                    avg_time / 1_000_000.0,
970                    avg_throughput
971                );
972            }
973        }
974
975        println!();
976    }
977
978    /// Print memory summary
979    fn print_memory_summary(&self) {
980        println!("💾 Memory Efficiency Summary");
981        println!("---------------------------");
982
983        let memory_results: Vec<_> = self
984            .results
985            .iter()
986            .filter(|r| r.name.contains("memory"))
987            .collect();
988
989        if !memory_results.is_empty() {
990            let avg_efficiency = memory_results
991                .iter()
992                .map(|r| r.memory.efficiency_score)
993                .sum::<f64>()
994                / memory_results.len() as f64;
995
996            let avg_utilization = memory_results
997                .iter()
998                .map(|r| r.memory.buffer_pool_utilization)
999                .sum::<f64>()
1000                / memory_results.len() as f64;
1001
1002            println!(
1003                "  ✓ Average Memory Efficiency: {:.1}%",
1004                avg_efficiency * 100.0
1005            );
1006            println!(
1007                "  ✓ Buffer Pool Utilization: {:.1}%",
1008                avg_utilization * 100.0
1009            );
1010        }
1011
1012        println!();
1013    }
1014
1015    /// Print optimization summary
1016    fn print_optimization_summary(&self) {
1017        println!("🔧 Optimization Effectiveness");
1018        println!("----------------------------");
1019
1020        let opt_results: Vec<_> = self
1021            .results
1022            .iter()
1023            .filter(|r| r.name.contains("optimization"))
1024            .collect();
1025
1026        if !opt_results.is_empty() {
1027            for result in opt_results {
1028                println!(
1029                    "  ✓ {}: {:.2}ms execution",
1030                    result.config_description,
1031                    result.timing.average_ns as f64 / 1_000_000.0
1032                );
1033            }
1034        }
1035
1036        println!();
1037    }
1038
1039    /// Print recommendations
1040    fn print_recommendations(&self) {
1041        println!("🎯 Performance Recommendations");
1042        println!("-----------------------------");
1043
1044        // Analyze results and provide recommendations
1045        let avg_gate_time = self
1046            .results
1047            .iter()
1048            .map(|r| r.timing.average_ns as f64)
1049            .sum::<f64>()
1050            / self.results.len().max(1) as f64;
1051
1052        if avg_gate_time > 1_000_000.0 {
1053            // > 1ms
1054            println!("  💡 Consider enabling SIMD optimizations for better gate performance");
1055        }
1056
1057        let avg_memory_efficiency = self
1058            .results
1059            .iter()
1060            .map(|r| r.memory.efficiency_score)
1061            .sum::<f64>()
1062            / self.results.len().max(1) as f64;
1063
1064        if avg_memory_efficiency < 0.8 {
1065            println!("  💡 Improve buffer pool configuration for better memory efficiency");
1066        }
1067
1068        println!("  💡 Use high-performance configuration for demanding simulations");
1069        println!("  💡 Enable circuit optimization for circuits with >20 gates");
1070        println!("  💡 Consider GPU acceleration for >20 qubit simulations");
1071
1072        println!();
1073    }
1074
1075    /// Get benchmark results
1076    pub fn get_results(&self) -> &[BenchmarkResult] {
1077        &self.results
1078    }
1079
1080    /// Get benchmark configuration
1081    pub const fn get_config(&self) -> &BenchmarkConfig {
1082        &self.config
1083    }
1084
1085    /// Export results to JSON
1086    pub fn export_json(&self) -> Result<String, serde_json::Error> {
1087        serde_json::to_string_pretty(&self.results)
1088    }
1089}
1090
1091/// Convenience function to run a quick performance benchmark
1092pub fn run_quick_benchmark() -> QuantRS2Result<()> {
1093    let config = BenchmarkConfig {
1094        qubit_range: 1..12,
1095        iterations: 5,
1096        profile_memory: true,
1097        compare_optimizations: false,
1098        scalability_analysis: false,
1099        warmup_iterations: 2,
1100        max_circuit_depth: 20,
1101    };
1102
1103    let mut suite = QuantumBenchmarkSuite::new(config);
1104    suite.run_all_benchmarks()
1105}
1106
1107/// Convenience function to run a comprehensive benchmark
1108pub fn run_comprehensive_benchmark() -> QuantRS2Result<()> {
1109    let config = BenchmarkConfig::default();
1110    let mut suite = QuantumBenchmarkSuite::new(config);
1111    suite.run_all_benchmarks()
1112}
1113
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a suite from the default configuration; every test starts here.
    fn default_suite() -> QuantumBenchmarkSuite {
        QuantumBenchmarkSuite::new(BenchmarkConfig::default())
    }

    /// A freshly created suite holds no results.
    #[test]
    fn test_benchmark_suite_creation() {
        let suite = default_suite();
        assert!(suite.results.is_empty());
    }

    /// Min, max and average are derived correctly from five millisecond timings.
    #[test]
    fn test_timing_stats_calculation() {
        let suite = default_suite();
        let millis: [u64; 5] = [10, 12, 11, 13, 9];
        let times: Vec<Duration> = millis
            .iter()
            .map(|&ms| Duration::from_millis(ms))
            .collect();

        let stats = suite.calculate_timing_stats(&times);

        assert_eq!(stats.min_ns, 9_000_000);
        assert_eq!(stats.max_ns, 13_000_000);
        assert_eq!(stats.average_ns, 11_000_000);
    }

    /// Timings that quadruple every two qubits yield a growth factor above one
    /// and a practical qubit limit above the smallest sample.
    #[test]
    fn test_scalability_analysis() {
        let suite = default_suite();
        let samples: [(usize, u64); 4] = [(4, 1), (6, 4), (8, 16), (10, 64)];
        let data: Vec<(usize, Duration)> = samples
            .iter()
            .map(|&(qubits, ms)| (qubits, Duration::from_millis(ms)))
            .collect();

        let analysis = suite.analyze_scalability(&data);

        assert!(analysis.time_growth_factor > 1.0);
        assert!(analysis.max_practical_qubits > 4);
    }
}