Skip to main content

scirs2_ndimage/
profiling.rs

1//! Performance profiling and optimization tools
2//!
3//! This module provides comprehensive tools for profiling and optimizing ndimage operations,
4//! including timing measurements, memory usage tracking, performance analysis, backend
5//! comparison, and automatic optimization recommendations.
6
7use scirs2_core::numeric::Float;
8use std::cmp;
9use std::collections::HashMap;
10use std::fmt::{self, Debug, Display};
11use std::sync::atomic::Ordering;
12use std::sync::{Arc, Mutex};
13use std::thread;
14use std::time::{Duration, Instant};
15
16use crate::backend::Backend;
17use crate::error::NdimageResult;
18
19// Global profiler instance
20lazy_static::lazy_static! {
21    static ref PROFILER: Arc<Mutex<Profiler>> = Arc::new(Mutex::new(Profiler::new()));
22}
23
/// Performance metrics for a single operation
#[derive(Debug, Clone)]
pub struct OperationMetrics {
    /// Operation name; reports group samples by this value.
    pub name: String,
    /// Measured duration of the operation.
    pub duration: Duration,
    /// Bytes counted as allocated for this operation.
    pub memory_allocated: usize,
    /// Bytes counted as deallocated for this operation.
    pub memory_deallocated: usize,
    /// Shape associated with the operation; the product of its entries is
    /// used as the element count by the analysis code in this module.
    pub arrayshape: Vec<usize>,
    /// Backend recorded for the operation.
    pub backend: Backend,
    /// Thread count recorded for the operation.
    pub thread_count: usize,
    /// Instant associated with the sample (`ProfilingScope` stores its start
    /// time here).
    pub timestamp: Instant,
}
36
37impl Display for OperationMetrics {
38    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
39        write!(
40            f,
41            "{}: {:.3}ms, shape={:?}, backend={:?}, threads={}",
42            self.name,
43            self.duration.as_secs_f64() * 1000.0,
44            self.arrayshape,
45            self.backend,
46            self.thread_count
47        )
48    }
49}
50
/// Profiler for tracking performance metrics
#[derive(Debug)]
pub struct Profiler {
    /// Samples recorded so far, in insertion order.
    metrics: Vec<OperationMetrics>,
    /// When `false`, `record` discards incoming samples.
    enabled: bool,
    /// When `false`, `track_allocation` / `track_deallocation` do nothing.
    memory_tracking: bool,
    /// Running byte count maintained by the `track_*` methods.
    current_memory: usize,
    /// Largest value `current_memory` has reached since the last `clear`.
    peak_memory: usize,
}
60
61impl Profiler {
62    pub fn new() -> Self {
63        Self {
64            metrics: Vec::new(),
65            enabled: false,
66            memory_tracking: false,
67            current_memory: 0,
68            peak_memory: 0,
69        }
70    }
71
72    /// Enable profiling
73    pub fn enable(&mut self) {
74        self.enabled = true;
75    }
76
77    /// Disable profiling
78    pub fn disable(&mut self) {
79        self.enabled = false;
80    }
81
82    /// Enable memory tracking
83    pub fn enable_memory_tracking(&mut self) {
84        self.memory_tracking = true;
85    }
86
87    /// Record a metric
88    pub fn record(&mut self, metric: OperationMetrics) {
89        if self.enabled {
90            self.metrics.push(metric);
91        }
92    }
93
94    /// Clear all metrics
95    pub fn clear(&mut self) {
96        self.metrics.clear();
97        self.current_memory = 0;
98        self.peak_memory = 0;
99    }
100
101    /// Get all metrics
102    pub fn metrics(&self) -> &[OperationMetrics] {
103        &self.metrics
104    }
105
106    /// Generate a performance report
107    pub fn report(&self) -> PerformanceReport {
108        PerformanceReport::frommetrics(&self.metrics)
109    }
110
111    /// Track memory allocation
112    pub fn track_allocation(&mut self, bytes: usize) {
113        if self.memory_tracking {
114            self.current_memory += bytes;
115            self.peak_memory = self.peak_memory.max(self.current_memory);
116        }
117    }
118
119    /// Track memory deallocation
120    pub fn track_deallocation(&mut self, bytes: usize) {
121        if self.memory_tracking {
122            self.current_memory = self.current_memory.saturating_sub(bytes);
123        }
124    }
125}
126
/// Performance report with analysis
#[derive(Debug)]
pub struct PerformanceReport {
    /// Sum of the durations of all samples in the report.
    pub total_time: Duration,
    /// Per-operation statistics, keyed by operation name.
    pub operation_breakdown: HashMap<String, OperationSummary>,
    /// Number of samples per backend, keyed by the backend's `Debug` text.
    pub backend_usage: HashMap<String, usize>,
    /// Aggregated memory figures derived from the samples.
    pub memory_stats: MemoryStats,
    /// Human-readable suggestions generated from the data above.
    pub recommendations: Vec<String>,
}
136
/// Summary statistics for an operation type
#[derive(Debug)]
pub struct OperationSummary {
    /// Number of samples recorded for this operation.
    pub count: usize,
    /// Sum of the sample durations.
    pub total_time: Duration,
    /// `total_time` divided by `count`.
    pub mean_time: Duration,
    /// Shortest sample duration.
    pub min_time: Duration,
    /// Longest sample duration.
    pub max_time: Duration,
    /// Population standard deviation of the sample durations, in seconds.
    pub std_dev: f64,
}
147
/// Memory usage statistics
#[derive(Debug)]
pub struct MemoryStats {
    /// Highest running value of (allocated - deallocated) over the samples,
    /// in bytes.
    pub peak_usage: usize,
    /// Sum of `memory_allocated` over all samples, in bytes.
    pub total_allocated: usize,
    /// Sum of `memory_deallocated` over all samples, in bytes.
    pub total_deallocated: usize,
}
155
156impl PerformanceReport {
157    fn frommetrics(metrics: &[OperationMetrics]) -> Self {
158        let total_time = metrics.iter().map(|m| m.duration).sum();
159
160        // Group metrics by operation name
161        let mut op_groups: HashMap<String, Vec<&OperationMetrics>> = HashMap::new();
162        let mut backend_usage: HashMap<String, usize> = HashMap::new();
163
164        for metric in metrics {
165            op_groups
166                .entry(metric.name.clone())
167                .or_default()
168                .push(metric);
169
170            *backend_usage
171                .entry(format!("{:?}", metric.backend))
172                .or_default() += 1;
173        }
174
175        // Compute operation summaries
176        let operation_breakdown: HashMap<String, OperationSummary> = op_groups
177            .into_iter()
178            .map(|(name, group)| {
179                let count = group.len();
180                let total: Duration = group.iter().map(|m| m.duration).sum();
181                let mean = total / count as u32;
182
183                let times: Vec<f64> = group.iter().map(|m| m.duration.as_secs_f64()).collect();
184
185                let min = times
186                    .iter()
187                    .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
188                    .unwrap_or(&0.0);
189                let max = times
190                    .iter()
191                    .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
192                    .unwrap_or(&0.0);
193
194                let mean_f64 = times.iter().sum::<f64>() / count as f64;
195                let variance =
196                    times.iter().map(|t| (t - mean_f64).powi(2)).sum::<f64>() / count as f64;
197                let std_dev = variance.sqrt();
198
199                (
200                    name,
201                    OperationSummary {
202                        count,
203                        total_time: total,
204                        mean_time: mean,
205                        min_time: Duration::from_secs_f64(*min),
206                        max_time: Duration::from_secs_f64(*max),
207                        std_dev,
208                    },
209                )
210            })
211            .collect();
212
213        // Compute memory statistics
214        let total_allocated: usize = metrics.iter().map(|m| m.memory_allocated).sum();
215        let total_deallocated: usize = metrics.iter().map(|m| m.memory_deallocated).sum();
216        let peak_usage = metrics
217            .iter()
218            .scan(0isize, |acc, m| {
219                *acc += m.memory_allocated as isize - m.memory_deallocated as isize;
220                Some(*acc as usize)
221            })
222            .max()
223            .unwrap_or(0);
224
225        let memory_stats = MemoryStats {
226            peak_usage,
227            total_allocated,
228            total_deallocated,
229        };
230
231        // Generate recommendations
232        let recommendations =
233            generate_recommendations(&operation_breakdown, &backend_usage, metrics);
234
235        Self {
236            total_time,
237            operation_breakdown,
238            backend_usage,
239            memory_stats,
240            recommendations,
241        }
242    }
243
244    /// Display the report in a human-readable format
245    pub fn display(&self) {
246        println!("\n=== Performance Report ===\n");
247
248        println!(
249            "Total execution time: {:.3}ms",
250            self.total_time.as_secs_f64() * 1000.0
251        );
252        println!();
253
254        println!("Operation Breakdown:");
255        let mut ops: Vec<_> = self.operation_breakdown.iter().collect();
256        ops.sort_by_key(|(_, summary)| std::cmp::Reverse(summary.total_time));
257
258        for (name, summary) in ops {
259            println!("  {}: {} calls", name, summary.count);
260            println!(
261                "    Total: {:.3}ms ({:.1}%)",
262                summary.total_time.as_secs_f64() * 1000.0,
263                (summary.total_time.as_secs_f64() / self.total_time.as_secs_f64()) * 100.0
264            );
265            println!(
266                "    Mean: {:.3}ms, Min: {:.3}ms, Max: {:.3}ms, StdDev: {:.3}ms",
267                summary.mean_time.as_secs_f64() * 1000.0,
268                summary.min_time.as_secs_f64() * 1000.0,
269                summary.max_time.as_secs_f64() * 1000.0,
270                summary.std_dev * 1000.0
271            );
272        }
273        println!();
274
275        println!("Backend Usage:");
276        for (backend, count) in &self.backend_usage {
277            println!("  {}: {} operations", backend, count);
278        }
279        println!();
280
281        println!("Memory Statistics:");
282        println!(
283            "  Peak usage: {} MB",
284            self.memory_stats.peak_usage / (1024 * 1024)
285        );
286        println!(
287            "  Total allocated: {} MB",
288            self.memory_stats.total_allocated / (1024 * 1024)
289        );
290        println!(
291            "  Total deallocated: {} MB",
292            self.memory_stats.total_deallocated / (1024 * 1024)
293        );
294        println!();
295
296        if !self.recommendations.is_empty() {
297            println!("Recommendations:");
298            for rec in &self.recommendations {
299                println!("  • {}", rec);
300            }
301        }
302    }
303}
304
305/// Generate performance recommendations
306#[allow(dead_code)]
307fn generate_recommendations(
308    operation_breakdown: &HashMap<String, OperationSummary>,
309    backend_usage: &HashMap<String, usize>,
310    metrics: &[OperationMetrics],
311) -> Vec<String> {
312    let mut recommendations = Vec::new();
313
314    // Check for operations that could benefit from GPU acceleration
315    let cpu_only = backend_usage.get("Cpu").copied().unwrap_or(0);
316    let total_ops = backend_usage.values().sum::<usize>();
317
318    if cpu_only == total_ops && total_ops > 10 {
319        // Check if there are large arrays that could benefit from GPU
320        let large_arrays = metrics
321            .iter()
322            .filter(|m| m.arrayshape.iter().product::<usize>() > 1_000_000)
323            .count();
324
325        if large_arrays > 0 {
326            recommendations.push(format!(
327                "Consider enabling GPU acceleration - {} operations processed large arrays (>1M elements)",
328                large_arrays
329            ));
330        }
331    }
332
333    // Check for operations with high variance in execution time
334    for (name, summary) in operation_breakdown {
335        let cv = summary.std_dev / summary.mean_time.as_secs_f64(); // Coefficient of variation
336        if cv > 0.5 && summary.count > 5 {
337            recommendations.push(format!(
338                "High variance in '{}' execution times (CV={:.2}) - consider investigating data-dependent performance",
339                name, cv
340            ));
341        }
342    }
343
344    // Check for potential memory issues
345    let total_time_ms = metrics.iter().map(|m| m.duration.as_millis()).sum::<u128>();
346    let ops_per_ms = total_ops as f64 / total_time_ms as f64;
347
348    if ops_per_ms < 0.1 {
349        recommendations.push(
350            "Low throughput detected - consider batch processing or parallelization".to_string(),
351        );
352    }
353
354    recommendations
355}
356
/// Profiling scope guard
///
/// Captures a start time on construction; its `Drop` implementation measures
/// the elapsed time and records a metric with the global profiler.
pub struct ProfilingScope {
    /// Operation name reported in the recorded metric.
    name: String,
    /// Instant captured when the scope was created.
    start: Instant,
    /// Array shape reported in the recorded metric.
    shape: Vec<usize>,
    /// Backend reported in the recorded metric.
    backend: Backend,
    /// Global profiler's `current_memory` at construction time.
    initial_memory: usize,
}
365
366impl ProfilingScope {
367    pub fn new(name: impl Into<String>, shape: &[usize], backend: Backend) -> Self {
368        let profiler = PROFILER
369            .lock()
370            .expect("PROFILER mutex should not be poisoned");
371        let initial_memory = profiler.current_memory;
372        drop(profiler);
373
374        Self {
375            name: name.into(),
376            start: Instant::now(),
377            shape: shape.to_vec(),
378            backend,
379            initial_memory,
380        }
381    }
382}
383
384impl Drop for ProfilingScope {
385    fn drop(&mut self) {
386        let duration = self.start.elapsed();
387        let thread_count = scirs2_core::parallel_ops::get_num_threads();
388
389        let mut profiler = PROFILER
390            .lock()
391            .expect("PROFILER mutex should not be poisoned");
392        let memory_allocated = profiler.current_memory.saturating_sub(self.initial_memory);
393
394        let metric = OperationMetrics {
395            name: self.name.clone(),
396            duration,
397            memory_allocated,
398            memory_deallocated: 0,
399            arrayshape: self.shape.clone(),
400            backend: self.backend,
401            thread_count,
402            timestamp: self.start,
403        };
404
405        profiler.record(metric);
406    }
407}
408
/// Profile an operation
///
/// Expands to a block that builds a `ProfilingScope` from `$name`, `$shape`
/// and `$backend`, binds it to `_scope`, and then evaluates `$body` as the
/// value of the block. The scope stays alive until the block ends, so its
/// `Drop` implementation runs after `$body` has been evaluated.
#[macro_export]
macro_rules! profile_op {
    ($name:expr, $shape:expr, $backend:expr, $body:expr) => {{
        let _scope = $crate::profiling::ProfilingScope::new($name, $shape, $backend);
        $body
    }};
}
417
418/// Enable global profiling
419#[allow(dead_code)]
420pub fn enable_profiling() {
421    PROFILER
422        .lock()
423        .expect("PROFILER mutex should not be poisoned")
424        .enable();
425}
426
427/// Disable global profiling
428#[allow(dead_code)]
429pub fn disable_profiling() {
430    PROFILER
431        .lock()
432        .expect("PROFILER mutex should not be poisoned")
433        .disable();
434}
435
436/// Enable memory tracking
437#[allow(dead_code)]
438pub fn enable_memory_tracking() {
439    PROFILER
440        .lock()
441        .expect("PROFILER mutex should not be poisoned")
442        .enable_memory_tracking();
443}
444
445/// Clear all profiling data
446#[allow(dead_code)]
447pub fn clear_profiling_data() {
448    PROFILER
449        .lock()
450        .expect("PROFILER mutex should not be poisoned")
451        .clear();
452}
453
454/// Get performance report
455#[allow(dead_code)]
456pub fn get_performance_report() -> PerformanceReport {
457    PROFILER
458        .lock()
459        .expect("PROFILER mutex should not be poisoned")
460        .report()
461}
462
463/// Display performance report
464#[allow(dead_code)]
465pub fn display_performance_report() {
466    let report = get_performance_report();
467    report.display();
468}
469
/// Benchmark utility for comparing implementations
pub struct Benchmark<T> {
    /// Benchmark name, passed on to the comparison built by `compare`.
    name: String,
    /// Number of timed iterations per variant.
    iterations: usize,
    /// Number of untimed iterations executed before timing starts.
    warmup_iterations: usize,
    /// One entry per `run` call, in call order.
    results: Vec<BenchmarkResult<T>>,
}
477
/// Raw timings collected for one benchmark variant.
#[derive(Debug)]
pub struct BenchmarkResult<T> {
    /// Name of the variant that was run.
    pub variant: String,
    /// Duration of each timed iteration, in execution order.
    pub times: Vec<Duration>,
    /// Value returned by the last timed iteration.
    pub result: T,
}
484
485impl<T> Benchmark<T> {
486    pub fn new(name: impl Into<String>) -> Self {
487        Self {
488            name: name.into(),
489            iterations: 100,
490            warmup_iterations: 10,
491            results: Vec::new(),
492        }
493    }
494
495    pub fn iterations(mut self, iterations: usize) -> Self {
496        self.iterations = iterations;
497        self
498    }
499
500    pub fn warmup_iterations(mut self, warmup: usize) -> Self {
501        self.warmup_iterations = warmup;
502        self
503    }
504
505    pub fn run<F>(&mut self, variant: impl Into<String>, mut f: F) -> NdimageResult<()>
506    where
507        F: FnMut() -> NdimageResult<T>,
508    {
509        let variant = variant.into();
510
511        // Warmup
512        for _ in 0..self.warmup_iterations {
513            f()?;
514        }
515
516        // Actual benchmark
517        let mut times = Vec::with_capacity(self.iterations);
518        let mut result = None;
519
520        for _ in 0..self.iterations {
521            let start = Instant::now();
522            result = Some(f()?);
523            times.push(start.elapsed());
524        }
525
526        self.results.push(BenchmarkResult {
527            variant,
528            times,
529            result: result.expect("Benchmark result should be available after iterations"),
530        });
531
532        Ok(())
533    }
534
535    pub fn compare(&self) -> BenchmarkComparison {
536        BenchmarkComparison::from_results(&self.name, &self.results)
537    }
538}
539
/// Comparison of benchmark results
#[derive(Debug)]
pub struct BenchmarkComparison {
    /// Name of the benchmark the results belong to.
    pub name: String,
    /// Statistics per variant, in the order the variants were run.
    pub variants: Vec<VariantStats>,
    /// Name of the variant with the smallest median time.
    pub fastest: String,
    /// Name of the first variant; speedups are relative to it.
    pub baseline: String,
}
548
/// Timing statistics for a single benchmark variant.
#[derive(Debug)]
pub struct VariantStats {
    /// Variant name.
    pub name: String,
    /// Arithmetic mean of the sample durations.
    pub mean: Duration,
    /// Element at index `len / 2` of the sorted samples.
    pub median: Duration,
    /// Population standard deviation of the samples (nanosecond resolution).
    pub std_dev: Duration,
    /// Shortest sample.
    pub min: Duration,
    /// Longest sample.
    pub max: Duration,
    /// Baseline median divided by this variant's median.
    pub speedup: f64,
}
559
560impl BenchmarkComparison {
561    fn from_results<T>(name: &str, results: &[BenchmarkResult<T>]) -> Self {
562        let mut variants = Vec::new();
563
564        for result in results {
565            let mut times = result.times.clone();
566            times.sort();
567
568            let mean = times.iter().sum::<Duration>() / times.len() as u32;
569            let median = times[times.len() / 2];
570            let min = times[0];
571            let max = times[times.len() - 1];
572
573            let mean_nanos = mean.as_nanos() as f64;
574            let variance = times
575                .iter()
576                .map(|t| {
577                    let diff = t.as_nanos() as f64 - mean_nanos;
578                    diff * diff
579                })
580                .sum::<f64>()
581                / times.len() as f64;
582            let std_dev = Duration::from_nanos(variance.sqrt() as u64);
583
584            variants.push(VariantStats {
585                name: result.variant.clone(),
586                mean,
587                median,
588                std_dev,
589                min,
590                max,
591                speedup: 1.0, // Will be updated
592            });
593        }
594
595        // Find fastest variant
596        let fastest_idx = variants
597            .iter()
598            .enumerate()
599            .min_by_key(|(_, v)| v.median)
600            .map(|(i, _)| i)
601            .unwrap_or(0);
602
603        let fastest = variants[fastest_idx].name.clone();
604        let baseline = variants.first().map(|v| v.name.clone()).unwrap_or_default();
605
606        // Calculate speedups relative to baseline
607        let baseline_time = variants[0].median.as_nanos() as f64;
608        for variant in &mut variants {
609            variant.speedup = baseline_time / variant.median.as_nanos() as f64;
610        }
611
612        Self {
613            name: name.to_string(),
614            variants,
615            fastest,
616            baseline,
617        }
618    }
619
620    pub fn display(&self) {
621        println!("\n=== Benchmark: {} ===\n", self.name);
622
623        for variant in &self.variants {
624            println!("{}: ", variant.name);
625            println!(
626                "  Mean: {:.3}ms ± {:.3}ms",
627                variant.mean.as_secs_f64() * 1000.0,
628                variant.std_dev.as_secs_f64() * 1000.0
629            );
630            println!("  Median: {:.3}ms", variant.median.as_secs_f64() * 1000.0);
631            println!(
632                "  Min: {:.3}ms, Max: {:.3}ms",
633                variant.min.as_secs_f64() * 1000.0,
634                variant.max.as_secs_f64() * 1000.0
635            );
636
637            if variant.name == self.baseline {
638                println!("  (baseline)");
639            } else {
640                println!("  Speedup: {:.2}x", variant.speedup);
641            }
642            println!();
643        }
644
645        println!(
646            "Fastest: {} ({:.2}x faster than baseline)",
647            self.fastest,
648            self.variants
649                .iter()
650                .find(|v| v.name == self.fastest)
651                .map(|v| v.speedup)
652                .unwrap_or(1.0)
653        );
654    }
655}
656
/// Auto-tuning for optimal parameters
pub struct AutoTuner {
    /// Name given to the tuner at construction.
    pub name: String,
    /// Registered variants: a label plus a closure that returns the measured
    /// duration for that variant.
    pub test_data: Vec<(String, Box<dyn Fn() -> NdimageResult<Duration>>)>,
}
662
663impl AutoTuner {
664    pub fn new(name: impl Into<String>) -> Self {
665        Self {
666            name: name.into(),
667            test_data: Vec::new(),
668        }
669    }
670
671    pub fn add_variant<F>(&mut self, name: impl Into<String>, f: F)
672    where
673        F: Fn() -> NdimageResult<Duration> + 'static,
674    {
675        self.test_data.push((name.into(), Box::new(f)));
676    }
677
678    pub fn find_optimal(&self) -> NdimageResult<String> {
679        let mut best_time = Duration::MAX;
680        let mut best_variant = String::new();
681
682        for (name, test_fn) in &self.test_data {
683            let time = test_fn()?;
684            if time < best_time {
685                best_time = time;
686                best_variant = name.clone();
687            }
688        }
689
690        Ok(best_variant)
691    }
692}
693
/// Performance optimization advisor
///
/// Analyzes profiling data and provides specific optimization recommendations
pub struct OptimizationAdvisor {
    /// Copy of the metrics passed to the most recent `analyze` call.
    metrics: Vec<OperationMetrics>,
    /// Hardware description obtained from `HardwareInfo::detect` at
    /// construction.
    hardware_info: HardwareInfo,
}
701
/// Description of the hardware the analysis is tuned for.
#[derive(Debug, Clone)]
pub struct HardwareInfo {
    /// Number of CPU cores.
    pub cpu_cores: usize,
    /// SIMD instruction-set availability flags.
    pub simd_support: SimdSupport,
    /// Whether a GPU backend is considered available.
    pub gpu_available: bool,
    /// Total memory in bytes.
    pub total_memory: usize,
    /// Cache sizes in bytes.
    pub cache_sizes: CacheSizes,
}
710
/// SIMD instruction-set availability flags.
#[derive(Debug, Clone)]
pub struct SimdSupport {
    /// SSE is available.
    pub sse: bool,
    /// AVX is available.
    pub avx: bool,
    /// AVX2 is available.
    pub avx2: bool,
    /// AVX-512 is available.
    pub avx512: bool,
    /// NEON is available.
    pub neon: bool,
}
719
/// CPU cache sizes, in bytes.
#[derive(Debug, Clone)]
pub struct CacheSizes {
    /// L1 cache size.
    pub l1: usize,
    /// L2 cache size.
    pub l2: usize,
    /// L3 cache size.
    pub l3: usize,
}
726
727impl OptimizationAdvisor {
728    pub fn new() -> Self {
729        Self {
730            metrics: Vec::new(),
731            hardware_info: HardwareInfo::detect(),
732        }
733    }
734
735    pub fn analyze(&mut self, metrics: &[OperationMetrics]) -> OptimizationReport {
736        self.metrics = metrics.to_vec();
737
738        let mut recommendations = Vec::new();
739
740        // Analyze memory access patterns
741        recommendations.extend(self.analyze_memory_patterns());
742
743        // Analyze computation patterns
744        recommendations.extend(self.analyze_computation_patterns());
745
746        // Analyze parallelization opportunities
747        recommendations.extend(self.analyze_parallelization());
748
749        // Analyze GPU offloading opportunities
750        recommendations.extend(self.analyze_gpu_opportunities());
751
752        let estimated_speedup = self.estimate_speedup(&recommendations);
753        let implementation_difficulty = self.assess_difficulty(&recommendations);
754
755        OptimizationReport {
756            recommendations,
757            estimated_speedup,
758            implementation_difficulty,
759        }
760    }
761
762    fn analyze_memory_patterns(&self) -> Vec<OptimizationRecommendation> {
763        let mut recommendations = Vec::new();
764
765        // Group operations by type
766        let mut op_groups: HashMap<String, Vec<&OperationMetrics>> = HashMap::new();
767        for metric in &self.metrics {
768            op_groups
769                .entry(metric.name.clone())
770                .or_default()
771                .push(metric);
772        }
773
774        // Check for cache-unfriendly access patterns
775        for (op_name, metrics) in op_groups {
776            let avg_array_size: usize = metrics
777                .iter()
778                .map(|m| m.arrayshape.iter().product::<usize>())
779                .sum::<usize>()
780                / metrics.len().max(1);
781
782            let element_size = std::mem::size_of::<f64>(); // Assume f64
783            let working_set_size = avg_array_size * element_size;
784
785            if working_set_size > self.hardware_info.cache_sizes.l3 {
786                recommendations.push(OptimizationRecommendation {
787                    operation: op_name.clone(),
788                    category: OptimizationCategory::Memory,
789                    description: "Working set exceeds L3 cache".to_string(),
790                    suggestion: "Consider tiling/blocking to improve cache locality".to_string(),
791                    estimated_improvement: 1.5,
792                });
793            }
794
795            // Check for strided access patterns
796            if op_name.contains("transpose") || op_name.contains("permute") {
797                recommendations.push(OptimizationRecommendation {
798                    operation: op_name,
799                    category: OptimizationCategory::Memory,
800                    description: "Potentially cache-unfriendly access pattern".to_string(),
801                    suggestion: "Use blocked/tiled algorithms for better cache usage".to_string(),
802                    estimated_improvement: 1.3,
803                });
804            }
805        }
806
807        recommendations
808    }
809
810    fn analyze_computation_patterns(&self) -> Vec<OptimizationRecommendation> {
811        let mut recommendations = Vec::new();
812
813        // Check for SIMD opportunities
814        for metric in &self.metrics {
815            let array_size: usize = metric.arrayshape.iter().product();
816
817            if array_size > 1000 && !metric.name.contains("simd") {
818                if self.hardware_info.simd_support.avx2 {
819                    recommendations.push(OptimizationRecommendation {
820                        operation: metric.name.clone(),
821                        category: OptimizationCategory::Vectorization,
822                        description: "Operation could benefit from SIMD vectorization".to_string(),
823                        suggestion: "Implement SIMD version using AVX2 intrinsics".to_string(),
824                        estimated_improvement: 2.0,
825                    });
826                }
827            }
828        }
829
830        recommendations
831    }
832
833    fn analyze_parallelization(&self) -> Vec<OptimizationRecommendation> {
834        let mut recommendations = Vec::new();
835
836        for metric in &self.metrics {
837            let array_size: usize = metric.arrayshape.iter().product();
838
839            // Check if operation is large enough to benefit from parallelization
840            if array_size > 50_000 && metric.thread_count == 1 {
841                recommendations.push(OptimizationRecommendation {
842                    operation: metric.name.clone(),
843                    category: OptimizationCategory::Parallelization,
844                    description: "Large operation running on single thread".to_string(),
845                    suggestion: format!(
846                        "Parallelize across {} cores for better performance",
847                        self.hardware_info.cpu_cores
848                    ),
849                    estimated_improvement: (self.hardware_info.cpu_cores as f64).min(4.0),
850                });
851            }
852        }
853
854        recommendations
855    }
856
857    fn analyze_gpu_opportunities(&self) -> Vec<OptimizationRecommendation> {
858        let mut recommendations = Vec::new();
859
860        if !self.hardware_info.gpu_available {
861            return recommendations;
862        }
863
864        for metric in &self.metrics {
865            let array_size: usize = metric.arrayshape.iter().product();
866
867            // GPU is beneficial for large arrays and compute-intensive operations
868            if array_size > 1_000_000 && metric.backend == Backend::Cpu {
869                recommendations.push(OptimizationRecommendation {
870                    operation: metric.name.clone(),
871                    category: OptimizationCategory::GpuOffloading,
872                    description: "Large array operation suitable for GPU acceleration".to_string(),
873                    suggestion: "Offload to GPU for significant speedup".to_string(),
874                    estimated_improvement: 10.0,
875                });
876            }
877        }
878
879        recommendations
880    }
881
882    fn estimate_speedup(&self, recommendations: &[OptimizationRecommendation]) -> f64 {
883        // Estimate overall speedup (simplified model)
884        let mut total_improvement = 1.0;
885
886        for rec in recommendations {
887            // Apply diminishing returns
888            total_improvement *= 1.0 + (rec.estimated_improvement - 1.0) * 0.7;
889        }
890
891        total_improvement
892    }
893
894    fn assess_difficulty(
895        &self,
896        recommendations: &[OptimizationRecommendation],
897    ) -> ImplementationDifficulty {
898        let max_difficulty = recommendations
899            .iter()
900            .map(|r| match r.category {
901                OptimizationCategory::Memory => 2,
902                OptimizationCategory::Vectorization => 3,
903                OptimizationCategory::Parallelization => 2,
904                OptimizationCategory::GpuOffloading => 4,
905                OptimizationCategory::Algorithm => 3,
906            })
907            .max()
908            .unwrap_or(1);
909
910        match max_difficulty {
911            1 => ImplementationDifficulty::Easy,
912            2 => ImplementationDifficulty::Moderate,
913            3 => ImplementationDifficulty::Hard,
914            _ => ImplementationDifficulty::Expert,
915        }
916    }
917}
918
/// Result of an `OptimizationAdvisor::analyze` run.
#[derive(Debug)]
pub struct OptimizationReport {
    /// Individual findings.
    pub recommendations: Vec<OptimizationRecommendation>,
    /// Combined speedup factor estimated from all recommendations.
    pub estimated_speedup: f64,
    /// Difficulty rating derived from the hardest recommended category.
    pub implementation_difficulty: ImplementationDifficulty,
}
925
/// A single optimization finding for one operation.
#[derive(Debug)]
pub struct OptimizationRecommendation {
    /// Name of the operation the finding refers to.
    pub operation: String,
    /// Kind of optimization being suggested.
    pub category: OptimizationCategory,
    /// What was observed.
    pub description: String,
    /// What to do about it.
    pub suggestion: String,
    /// Estimated speedup factor (e.g. `2.0` for twice as fast).
    pub estimated_improvement: f64,
}
934
/// Kind of optimization a recommendation belongs to.
#[derive(Debug)]
pub enum OptimizationCategory {
    /// Memory access / cache behaviour.
    Memory,
    /// SIMD vectorization.
    Vectorization,
    /// Multi-threading.
    Parallelization,
    /// Moving work to a GPU.
    GpuOffloading,
    /// Algorithmic change.
    Algorithm,
}
943
/// Effort rating for implementing a set of recommendations, from lowest to
/// highest.
#[derive(Debug)]
pub enum ImplementationDifficulty {
    /// Lowest rating; used when there are no recommendations.
    Easy,
    /// Rating for memory and parallelization work.
    Moderate,
    /// Rating for vectorization and algorithmic work.
    Hard,
    /// Highest rating; used for GPU offloading.
    Expert,
}
951
952impl HardwareInfo {
953    fn detect() -> Self {
954        Self {
955            cpu_cores: num_cpus::get(),
956            simd_support: SimdSupport::detect(),
957            gpu_available: cfg!(feature = "cuda") || cfg!(feature = "opencl"),
958            total_memory: 16_000_000_000, // 16GB default
959            cache_sizes: CacheSizes {
960                l1: 32_768,    // 32KB
961                l2: 262_144,   // 256KB
962                l3: 8_388_608, // 8MB
963            },
964        }
965    }
966}
967
968impl SimdSupport {
969    fn detect() -> Self {
970        #[cfg(target_arch = "x86_64")]
971        {
972            Self {
973                sse: is_x86_feature_detected!("sse"),
974                avx: is_x86_feature_detected!("avx"),
975                avx2: is_x86_feature_detected!("avx2"),
976                avx512: false, // Conservative default
977                neon: false,
978            }
979        }
980        #[cfg(target_arch = "aarch64")]
981        {
982            Self {
983                sse: false,
984                avx: false,
985                avx2: false,
986                avx512: false,
987                neon: true,
988            }
989        }
990        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
991        {
992            Self {
993                sse: false,
994                avx: false,
995                avx2: false,
996                avx512: false,
997                neon: false,
998            }
999        }
1000    }
1001}
1002
1003impl OptimizationReport {
1004    pub fn display(&self) {
1005        println!("\n=== Optimization Report ===\n");
1006
1007        println!("Estimated overall speedup: {:.1}x", self.estimated_speedup);
1008        println!(
1009            "Implementation difficulty: {:?}\n",
1010            self.implementation_difficulty
1011        );
1012
1013        println!("Recommendations:");
1014        for (i, rec) in self.recommendations.iter().enumerate() {
1015            println!("\n{}. {} - {:?}", i + 1, rec.operation, rec.category);
1016            println!("   Issue: {}", rec.description);
1017            println!("   Suggestion: {}", rec.suggestion);
1018            println!(
1019                "   Potential improvement: {:.1}x",
1020                rec.estimated_improvement
1021            );
1022        }
1023    }
1024}
1025
/// Memory profiler for tracking allocations
///
/// Statistics are kept per operation name. Tracking calls are ignored until
/// [`MemoryProfiler::enable`] has been called. All methods take `&self`; the
/// state lives behind a `Mutex` and an `AtomicBool`.
#[derive(Debug)]
pub struct MemoryProfiler {
    /// Allocation statistics keyed by operation name.
    allocations: Mutex<HashMap<String, AllocationInfo>>,
    /// Whether `track_allocation` / `track_deallocation` record anything.
    enabled: AtomicBool,
}

/// Running allocation statistics for one operation name.
///
/// Sizes are whatever unit callers pass to the tracking methods;
/// [`MemoryReport::display`] interprets them as bytes.
#[derive(Debug, Clone, Default)]
struct AllocationInfo {
    /// Sum of all tracked allocation sizes.
    total_allocated: usize,
    /// Tracked allocations minus tracked deallocations (never below zero).
    current_allocated: usize,
    /// Highest value `current_allocated` has reached.
    peak_allocated: usize,
    /// Number of tracked allocations.
    allocation_count: usize,
}

impl AllocationInfo {
    /// Folds one allocation of `size` into the statistics.
    ///
    /// Counters saturate instead of overflowing, mirroring the saturating
    /// subtraction used for deallocations.
    fn record_allocation(&mut self, size: usize) {
        self.total_allocated = self.total_allocated.saturating_add(size);
        self.current_allocated = self.current_allocated.saturating_add(size);
        self.peak_allocated = self.peak_allocated.max(self.current_allocated);
        self.allocation_count = self.allocation_count.saturating_add(1);
    }
}

impl Default for MemoryProfiler {
    /// Equivalent to [`MemoryProfiler::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl MemoryProfiler {
    /// Creates a profiler with no recorded data and tracking disabled.
    pub fn new() -> Self {
        Self {
            allocations: Mutex::new(HashMap::new()),
            enabled: AtomicBool::new(false),
        }
    }

    /// Starts recording tracked allocations and deallocations.
    pub fn enable(&self) {
        self.enabled
            .store(true, std::sync::atomic::Ordering::Relaxed);
    }

    /// Stops recording; data gathered so far is kept.
    pub fn disable(&self) {
        self.enabled
            .store(false, std::sync::atomic::Ordering::Relaxed);
    }

    /// Returns whether tracking is currently switched on.
    fn is_enabled(&self) -> bool {
        self.enabled.load(std::sync::atomic::Ordering::Relaxed)
    }

    /// Locks the statistics table.
    ///
    /// # Panics
    ///
    /// Panics if the mutex is poisoned.
    fn lock_allocations(&self) -> std::sync::MutexGuard<'_, HashMap<String, AllocationInfo>> {
        self.allocations
            .lock()
            .expect("Memory allocations mutex should not be poisoned")
    }

    /// Records an allocation of `size` for `operation`.
    ///
    /// Does nothing while the profiler is disabled.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn track_allocation(&self, operation: &str, size: usize) {
        if !self.is_enabled() {
            return;
        }

        let mut guard = self.lock_allocations();
        let table: &mut HashMap<String, AllocationInfo> = &mut guard;

        // Look the entry up by `&str` first so that the common case (an operation
        // seen before) does not allocate a fresh `String` key on every call.
        if let Some(info) = table.get_mut(operation) {
            info.record_allocation(size);
        } else {
            let mut info = AllocationInfo::default();
            info.record_allocation(size);
            table.insert(operation.to_owned(), info);
        }
    }

    /// Records a deallocation of `size` for `operation`.
    ///
    /// Does nothing while the profiler is disabled or when `operation` has no
    /// recorded allocations. The current usage never drops below zero.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn track_deallocation(&self, operation: &str, size: usize) {
        if !self.is_enabled() {
            return;
        }

        if let Some(info) = self.lock_allocations().get_mut(operation) {
            info.current_allocated = info.current_allocated.saturating_sub(size);
        }
    }

    /// Returns a snapshot of the statistics, highest peak usage first.
    ///
    /// Operations with equal peaks are ordered by name so the report does not
    /// depend on hash-map iteration order.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn report(&self) -> MemoryReport {
        // The guard is a temporary, so the lock is released before sorting.
        let mut operations: Vec<(String, AllocationInfo)> = self
            .lock_allocations()
            .iter()
            .map(|(name, info)| (name.clone(), info.clone()))
            .collect();

        // Names are unique map keys, so the comparison is a total order and an
        // unstable sort gives a fully deterministic result.
        operations.sort_unstable_by(|(name_a, info_a), (name_b, info_b)| {
            info_b
                .peak_allocated
                .cmp(&info_a.peak_allocated)
                .then_with(|| name_a.cmp(name_b))
        });

        MemoryReport { operations }
    }
}

/// Snapshot of per-operation allocation statistics produced by
/// [`MemoryProfiler::report`], ordered by descending peak usage.
#[derive(Debug)]
pub struct MemoryReport {
    operations: Vec<(String, AllocationInfo)>,
}

impl MemoryReport {
    /// Bytes per unit printed as "KB".
    const BYTES_PER_KB: f64 = 1024.0;
    /// Bytes per unit printed as "MB".
    const BYTES_PER_MB: f64 = 1024.0 * 1024.0;

    /// Prints the report to standard output.
    ///
    /// Sizes are shown with two decimals so that operations using less than one
    /// megabyte are not all reported as `0 MB`.
    pub fn display(&self) {
        println!("\n=== Memory Usage Report ===\n");

        for (name, info) in &self.operations {
            // `usize as f64` can round above 2^53, which is irrelevant at two decimals.
            let total = info.total_allocated as f64;
            let peak = info.peak_allocated as f64;
            // `max(1)` keeps the average defined even for an entry with no allocations.
            let average = total / info.allocation_count.max(1) as f64;

            println!("{}: ", name);
            println!("  Total allocated: {:.2} MB", total / Self::BYTES_PER_MB);
            println!("  Peak usage: {:.2} MB", peak / Self::BYTES_PER_MB);
            println!("  Allocations: {}", info.allocation_count);
            println!("  Avg allocation: {:.2} KB", average / Self::BYTES_PER_KB);
        }
    }
}
1137
1138// Global memory profiler instance
1139lazy_static::lazy_static! {
1140    static ref MEMORY_PROFILER: Arc<MemoryProfiler> = Arc::new(MemoryProfiler::new());
1141}
1142
1143#[allow(dead_code)]
1144pub fn enable_memory_profiling() {
1145    MEMORY_PROFILER.enable();
1146}
1147
1148#[allow(dead_code)]
1149pub fn disable_memory_profiling() {
1150    MEMORY_PROFILER.disable();
1151}
1152
1153#[allow(dead_code)]
1154pub fn get_memory_report() -> MemoryReport {
1155    MEMORY_PROFILER.report()
1156}
1157
1158use std::sync::atomic::AtomicBool;
1159
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// One profiling scope around a short sleep should yield exactly one
    /// `"test_op"` entry in the performance report.
    #[test]
    fn test_profiling_scope() {
        enable_profiling();
        clear_profiling_data();

        // NOTE(review): presumably the scope records its metrics when dropped;
        // it is dropped explicitly before the report is read.
        let scope = ProfilingScope::new("test_op", &[100, 100], Backend::Cpu);
        std::thread::sleep(Duration::from_millis(10));
        drop(scope);

        let report = get_performance_report();
        let breakdown = &report.operation_breakdown;
        assert_eq!(breakdown.len(), 1);
        assert!(breakdown.contains_key("test_op"));
    }

    /// Running two variants through a `Benchmark` should produce a comparison
    /// with two entries.
    #[test]
    fn test_benchmark() {
        /// Workload shared by both variants: sum of a fixed 2x2 array.
        fn small_sum() -> f64 {
            let a = array![[1.0, 2.0], [3.0, 4.0]];
            a.sum()
        }

        let mut suite = Benchmark::new("array_operations");

        suite
            .run("baseline", || Ok(small_sum()))
            .expect("benchmark baseline run should succeed");

        suite
            .run("optimized", || Ok(small_sum()))
            .expect("benchmark optimized run should succeed");

        let outcome = suite.compare();
        assert_eq!(outcome.variants.len(), 2);
    }
}