1use scirs2_core::numeric::Float;
8use std::cmp;
9use std::collections::HashMap;
10use std::fmt::{self, Debug, Display};
11use std::sync::atomic::Ordering;
12use std::sync::{Arc, Mutex};
13use std::thread;
14use std::time::{Duration, Instant};
15
16use crate::backend::Backend;
17use crate::error::NdimageResult;
18
lazy_static::lazy_static! {
    /// Process-wide profiler instance, created on first access.
    ///
    /// Shared by [`ProfilingScope`] and the free functions in this module;
    /// every access goes through the mutex.
    static ref PROFILER: Arc<Mutex<Profiler>> = Arc::new(Mutex::new(Profiler::new()));
}
23
/// One recorded measurement of a single operation.
#[derive(Debug, Clone)]
pub struct OperationMetrics {
    /// Operation name; reports group metrics by this value.
    pub name: String,
    /// Elapsed time of the operation.
    pub duration: Duration,
    /// Memory allocated during the operation (bytes, as tracked by the profiler).
    pub memory_allocated: usize,
    /// Memory released during the operation. `ProfilingScope` always records 0 here.
    pub memory_deallocated: usize,
    /// Shape of the array the operation worked on; the product of the
    /// entries is used as the element count by the analyses in this module.
    pub arrayshape: Vec<usize>,
    /// Backend the operation ran on.
    pub backend: Backend,
    /// Thread count reported when the metric was created.
    pub thread_count: usize,
    /// Instant associated with the metric (`ProfilingScope` stores its start time).
    pub timestamp: Instant,
}
36
37impl Display for OperationMetrics {
38 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
39 write!(
40 f,
41 "{}: {:.3}ms, shape={:?}, backend={:?}, threads={}",
42 self.name,
43 self.duration.as_secs_f64() * 1000.0,
44 self.arrayshape,
45 self.backend,
46 self.thread_count
47 )
48 }
49}
50
/// Collects [`OperationMetrics`] and keeps a running memory balance.
#[derive(Debug)]
pub struct Profiler {
    /// Metrics recorded so far, in insertion order.
    metrics: Vec<OperationMetrics>,
    /// When `false`, `record` discards incoming metrics.
    enabled: bool,
    /// When `false`, `track_allocation` / `track_deallocation` are no-ops.
    memory_tracking: bool,
    /// Bytes currently accounted as allocated.
    current_memory: usize,
    /// Highest value `current_memory` has reached since the last `clear`.
    peak_memory: usize,
}
60
61impl Profiler {
62 pub fn new() -> Self {
63 Self {
64 metrics: Vec::new(),
65 enabled: false,
66 memory_tracking: false,
67 current_memory: 0,
68 peak_memory: 0,
69 }
70 }
71
72 pub fn enable(&mut self) {
74 self.enabled = true;
75 }
76
77 pub fn disable(&mut self) {
79 self.enabled = false;
80 }
81
82 pub fn enable_memory_tracking(&mut self) {
84 self.memory_tracking = true;
85 }
86
87 pub fn record(&mut self, metric: OperationMetrics) {
89 if self.enabled {
90 self.metrics.push(metric);
91 }
92 }
93
94 pub fn clear(&mut self) {
96 self.metrics.clear();
97 self.current_memory = 0;
98 self.peak_memory = 0;
99 }
100
101 pub fn metrics(&self) -> &[OperationMetrics] {
103 &self.metrics
104 }
105
106 pub fn report(&self) -> PerformanceReport {
108 PerformanceReport::frommetrics(&self.metrics)
109 }
110
111 pub fn track_allocation(&mut self, bytes: usize) {
113 if self.memory_tracking {
114 self.current_memory += bytes;
115 self.peak_memory = self.peak_memory.max(self.current_memory);
116 }
117 }
118
119 pub fn track_deallocation(&mut self, bytes: usize) {
121 if self.memory_tracking {
122 self.current_memory = self.current_memory.saturating_sub(bytes);
123 }
124 }
125}
126
/// Aggregated view over a set of recorded [`OperationMetrics`].
#[derive(Debug)]
pub struct PerformanceReport {
    /// Sum of the durations of all metrics.
    pub total_time: Duration,
    /// Per-operation statistics, keyed by operation name.
    pub operation_breakdown: HashMap<String, OperationSummary>,
    /// Number of metrics per backend, keyed by the backend's `Debug` text.
    pub backend_usage: HashMap<String, usize>,
    /// Memory totals derived from the metrics.
    pub memory_stats: MemoryStats,
    /// Human-readable tuning hints.
    pub recommendations: Vec<String>,
}
136
/// Timing statistics for all metrics that share one operation name.
#[derive(Debug)]
pub struct OperationSummary {
    /// Number of metrics in the group.
    pub count: usize,
    /// Sum of the group's durations.
    pub total_time: Duration,
    /// `total_time` divided by `count`.
    pub mean_time: Duration,
    /// Shortest duration in the group.
    pub min_time: Duration,
    /// Longest duration in the group.
    pub max_time: Duration,
    /// Population standard deviation of the durations, in seconds.
    pub std_dev: f64,
}
147
/// Memory totals computed over a set of metrics.
#[derive(Debug)]
pub struct MemoryStats {
    /// Highest running balance of allocated minus deallocated memory,
    /// accumulated in metric order.
    pub peak_usage: usize,
    /// Sum of `memory_allocated` over all metrics.
    pub total_allocated: usize,
    /// Sum of `memory_deallocated` over all metrics.
    pub total_deallocated: usize,
}
155
156impl PerformanceReport {
157 fn frommetrics(metrics: &[OperationMetrics]) -> Self {
158 let total_time = metrics.iter().map(|m| m.duration).sum();
159
160 let mut op_groups: HashMap<String, Vec<&OperationMetrics>> = HashMap::new();
162 let mut backend_usage: HashMap<String, usize> = HashMap::new();
163
164 for metric in metrics {
165 op_groups
166 .entry(metric.name.clone())
167 .or_default()
168 .push(metric);
169
170 *backend_usage
171 .entry(format!("{:?}", metric.backend))
172 .or_default() += 1;
173 }
174
175 let operation_breakdown: HashMap<String, OperationSummary> = op_groups
177 .into_iter()
178 .map(|(name, group)| {
179 let count = group.len();
180 let total: Duration = group.iter().map(|m| m.duration).sum();
181 let mean = total / count as u32;
182
183 let times: Vec<f64> = group.iter().map(|m| m.duration.as_secs_f64()).collect();
184
185 let min = times
186 .iter()
187 .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
188 .unwrap_or(&0.0);
189 let max = times
190 .iter()
191 .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
192 .unwrap_or(&0.0);
193
194 let mean_f64 = times.iter().sum::<f64>() / count as f64;
195 let variance =
196 times.iter().map(|t| (t - mean_f64).powi(2)).sum::<f64>() / count as f64;
197 let std_dev = variance.sqrt();
198
199 (
200 name,
201 OperationSummary {
202 count,
203 total_time: total,
204 mean_time: mean,
205 min_time: Duration::from_secs_f64(*min),
206 max_time: Duration::from_secs_f64(*max),
207 std_dev,
208 },
209 )
210 })
211 .collect();
212
213 let total_allocated: usize = metrics.iter().map(|m| m.memory_allocated).sum();
215 let total_deallocated: usize = metrics.iter().map(|m| m.memory_deallocated).sum();
216 let peak_usage = metrics
217 .iter()
218 .scan(0isize, |acc, m| {
219 *acc += m.memory_allocated as isize - m.memory_deallocated as isize;
220 Some(*acc as usize)
221 })
222 .max()
223 .unwrap_or(0);
224
225 let memory_stats = MemoryStats {
226 peak_usage,
227 total_allocated,
228 total_deallocated,
229 };
230
231 let recommendations =
233 generate_recommendations(&operation_breakdown, &backend_usage, metrics);
234
235 Self {
236 total_time,
237 operation_breakdown,
238 backend_usage,
239 memory_stats,
240 recommendations,
241 }
242 }
243
244 pub fn display(&self) {
246 println!("\n=== Performance Report ===\n");
247
248 println!(
249 "Total execution time: {:.3}ms",
250 self.total_time.as_secs_f64() * 1000.0
251 );
252 println!();
253
254 println!("Operation Breakdown:");
255 let mut ops: Vec<_> = self.operation_breakdown.iter().collect();
256 ops.sort_by_key(|(_, summary)| std::cmp::Reverse(summary.total_time));
257
258 for (name, summary) in ops {
259 println!(" {}: {} calls", name, summary.count);
260 println!(
261 " Total: {:.3}ms ({:.1}%)",
262 summary.total_time.as_secs_f64() * 1000.0,
263 (summary.total_time.as_secs_f64() / self.total_time.as_secs_f64()) * 100.0
264 );
265 println!(
266 " Mean: {:.3}ms, Min: {:.3}ms, Max: {:.3}ms, StdDev: {:.3}ms",
267 summary.mean_time.as_secs_f64() * 1000.0,
268 summary.min_time.as_secs_f64() * 1000.0,
269 summary.max_time.as_secs_f64() * 1000.0,
270 summary.std_dev * 1000.0
271 );
272 }
273 println!();
274
275 println!("Backend Usage:");
276 for (backend, count) in &self.backend_usage {
277 println!(" {}: {} operations", backend, count);
278 }
279 println!();
280
281 println!("Memory Statistics:");
282 println!(
283 " Peak usage: {} MB",
284 self.memory_stats.peak_usage / (1024 * 1024)
285 );
286 println!(
287 " Total allocated: {} MB",
288 self.memory_stats.total_allocated / (1024 * 1024)
289 );
290 println!(
291 " Total deallocated: {} MB",
292 self.memory_stats.total_deallocated / (1024 * 1024)
293 );
294 println!();
295
296 if !self.recommendations.is_empty() {
297 println!("Recommendations:");
298 for rec in &self.recommendations {
299 println!(" • {}", rec);
300 }
301 }
302 }
303}
304
305#[allow(dead_code)]
307fn generate_recommendations(
308 operation_breakdown: &HashMap<String, OperationSummary>,
309 backend_usage: &HashMap<String, usize>,
310 metrics: &[OperationMetrics],
311) -> Vec<String> {
312 let mut recommendations = Vec::new();
313
314 let cpu_only = backend_usage.get("Cpu").copied().unwrap_or(0);
316 let total_ops = backend_usage.values().sum::<usize>();
317
318 if cpu_only == total_ops && total_ops > 10 {
319 let large_arrays = metrics
321 .iter()
322 .filter(|m| m.arrayshape.iter().product::<usize>() > 1_000_000)
323 .count();
324
325 if large_arrays > 0 {
326 recommendations.push(format!(
327 "Consider enabling GPU acceleration - {} operations processed large arrays (>1M elements)",
328 large_arrays
329 ));
330 }
331 }
332
333 for (name, summary) in operation_breakdown {
335 let cv = summary.std_dev / summary.mean_time.as_secs_f64(); if cv > 0.5 && summary.count > 5 {
337 recommendations.push(format!(
338 "High variance in '{}' execution times (CV={:.2}) - consider investigating data-dependent performance",
339 name, cv
340 ));
341 }
342 }
343
344 let total_time_ms = metrics.iter().map(|m| m.duration.as_millis()).sum::<u128>();
346 let ops_per_ms = total_ops as f64 / total_time_ms as f64;
347
348 if ops_per_ms < 0.1 {
349 recommendations.push(
350 "Low throughput detected - consider batch processing or parallelization".to_string(),
351 );
352 }
353
354 recommendations
355}
356
/// Guard that measures the time between its creation and its drop and
/// hands the resulting metric to the global profiler when dropped.
pub struct ProfilingScope {
    /// Operation name stored in the recorded metric.
    name: String,
    /// Instant captured when the scope was created.
    start: Instant,
    /// Array shape stored in the recorded metric.
    shape: Vec<usize>,
    /// Backend stored in the recorded metric.
    backend: Backend,
    /// Profiler's `current_memory` at creation; the growth since then is
    /// reported as the operation's allocation.
    initial_memory: usize,
}
365
366impl ProfilingScope {
367 pub fn new(name: impl Into<String>, shape: &[usize], backend: Backend) -> Self {
368 let profiler = PROFILER
369 .lock()
370 .expect("PROFILER mutex should not be poisoned");
371 let initial_memory = profiler.current_memory;
372 drop(profiler);
373
374 Self {
375 name: name.into(),
376 start: Instant::now(),
377 shape: shape.to_vec(),
378 backend,
379 initial_memory,
380 }
381 }
382}
383
384impl Drop for ProfilingScope {
385 fn drop(&mut self) {
386 let duration = self.start.elapsed();
387 let thread_count = scirs2_core::parallel_ops::get_num_threads();
388
389 let mut profiler = PROFILER
390 .lock()
391 .expect("PROFILER mutex should not be poisoned");
392 let memory_allocated = profiler.current_memory.saturating_sub(self.initial_memory);
393
394 let metric = OperationMetrics {
395 name: self.name.clone(),
396 duration,
397 memory_allocated,
398 memory_deallocated: 0,
399 arrayshape: self.shape.clone(),
400 backend: self.backend,
401 thread_count,
402 timestamp: self.start,
403 };
404
405 profiler.record(metric);
406 }
407}
408
/// Evaluates `$body` while a `ProfilingScope` built from `$name`, `$shape`
/// and `$backend` is alive, and yields the value of `$body`.
///
/// The scope is created before `$body` runs and dropped when the generated
/// block ends, which is when the measurement is handed to the profiler.
#[macro_export]
macro_rules! profile_op {
    ($name:expr, $shape:expr, $backend:expr, $body:expr) => {{
        // Bound to a named local so the guard lives until the end of the block.
        let _scope = $crate::profiling::ProfilingScope::new($name, $shape, $backend);
        $body
    }};
}
417
418#[allow(dead_code)]
420pub fn enable_profiling() {
421 PROFILER
422 .lock()
423 .expect("PROFILER mutex should not be poisoned")
424 .enable();
425}
426
427#[allow(dead_code)]
429pub fn disable_profiling() {
430 PROFILER
431 .lock()
432 .expect("PROFILER mutex should not be poisoned")
433 .disable();
434}
435
436#[allow(dead_code)]
438pub fn enable_memory_tracking() {
439 PROFILER
440 .lock()
441 .expect("PROFILER mutex should not be poisoned")
442 .enable_memory_tracking();
443}
444
445#[allow(dead_code)]
447pub fn clear_profiling_data() {
448 PROFILER
449 .lock()
450 .expect("PROFILER mutex should not be poisoned")
451 .clear();
452}
453
454#[allow(dead_code)]
456pub fn get_performance_report() -> PerformanceReport {
457 PROFILER
458 .lock()
459 .expect("PROFILER mutex should not be poisoned")
460 .report()
461}
462
463#[allow(dead_code)]
465pub fn display_performance_report() {
466 let report = get_performance_report();
467 report.display();
468}
469
/// Runs several variants of an operation repeatedly and keeps their timings.
pub struct Benchmark<T> {
    /// Benchmark name, carried into the comparison.
    name: String,
    /// Number of timed calls per variant (defaults to 100).
    iterations: usize,
    /// Number of untimed calls made before measuring (defaults to 10).
    warmup_iterations: usize,
    /// One entry per variant, in the order they were run.
    results: Vec<BenchmarkResult<T>>,
}
477
/// Raw measurements of one benchmark variant.
#[derive(Debug)]
pub struct BenchmarkResult<T> {
    /// Variant label.
    pub variant: String,
    /// Elapsed time of each timed call, in call order.
    pub times: Vec<Duration>,
    /// Value returned by the last timed call.
    pub result: T,
}
484
485impl<T> Benchmark<T> {
486 pub fn new(name: impl Into<String>) -> Self {
487 Self {
488 name: name.into(),
489 iterations: 100,
490 warmup_iterations: 10,
491 results: Vec::new(),
492 }
493 }
494
495 pub fn iterations(mut self, iterations: usize) -> Self {
496 self.iterations = iterations;
497 self
498 }
499
500 pub fn warmup_iterations(mut self, warmup: usize) -> Self {
501 self.warmup_iterations = warmup;
502 self
503 }
504
505 pub fn run<F>(&mut self, variant: impl Into<String>, mut f: F) -> NdimageResult<()>
506 where
507 F: FnMut() -> NdimageResult<T>,
508 {
509 let variant = variant.into();
510
511 for _ in 0..self.warmup_iterations {
513 f()?;
514 }
515
516 let mut times = Vec::with_capacity(self.iterations);
518 let mut result = None;
519
520 for _ in 0..self.iterations {
521 let start = Instant::now();
522 result = Some(f()?);
523 times.push(start.elapsed());
524 }
525
526 self.results.push(BenchmarkResult {
527 variant,
528 times,
529 result: result.expect("Benchmark result should be available after iterations"),
530 });
531
532 Ok(())
533 }
534
535 pub fn compare(&self) -> BenchmarkComparison {
536 BenchmarkComparison::from_results(&self.name, &self.results)
537 }
538}
539
/// Side-by-side statistics for the variants of one benchmark.
#[derive(Debug)]
pub struct BenchmarkComparison {
    /// Benchmark name.
    pub name: String,
    /// Statistics per variant, in the order the variants were run.
    pub variants: Vec<VariantStats>,
    /// Name of the variant with the smallest median time.
    pub fastest: String,
    /// Name of the first variant, which speedups are measured against.
    pub baseline: String,
}
548
/// Timing statistics of one benchmark variant.
#[derive(Debug)]
pub struct VariantStats {
    /// Variant label.
    pub name: String,
    /// Arithmetic mean of the timings.
    pub mean: Duration,
    /// Middle element of the sorted timings (upper middle for even counts).
    pub median: Duration,
    /// Population standard deviation of the timings.
    pub std_dev: Duration,
    /// Shortest timing.
    pub min: Duration,
    /// Longest timing.
    pub max: Duration,
    /// Baseline median divided by this variant's median.
    pub speedup: f64,
}
559
560impl BenchmarkComparison {
561 fn from_results<T>(name: &str, results: &[BenchmarkResult<T>]) -> Self {
562 let mut variants = Vec::new();
563
564 for result in results {
565 let mut times = result.times.clone();
566 times.sort();
567
568 let mean = times.iter().sum::<Duration>() / times.len() as u32;
569 let median = times[times.len() / 2];
570 let min = times[0];
571 let max = times[times.len() - 1];
572
573 let mean_nanos = mean.as_nanos() as f64;
574 let variance = times
575 .iter()
576 .map(|t| {
577 let diff = t.as_nanos() as f64 - mean_nanos;
578 diff * diff
579 })
580 .sum::<f64>()
581 / times.len() as f64;
582 let std_dev = Duration::from_nanos(variance.sqrt() as u64);
583
584 variants.push(VariantStats {
585 name: result.variant.clone(),
586 mean,
587 median,
588 std_dev,
589 min,
590 max,
591 speedup: 1.0, });
593 }
594
595 let fastest_idx = variants
597 .iter()
598 .enumerate()
599 .min_by_key(|(_, v)| v.median)
600 .map(|(i, _)| i)
601 .unwrap_or(0);
602
603 let fastest = variants[fastest_idx].name.clone();
604 let baseline = variants.first().map(|v| v.name.clone()).unwrap_or_default();
605
606 let baseline_time = variants[0].median.as_nanos() as f64;
608 for variant in &mut variants {
609 variant.speedup = baseline_time / variant.median.as_nanos() as f64;
610 }
611
612 Self {
613 name: name.to_string(),
614 variants,
615 fastest,
616 baseline,
617 }
618 }
619
620 pub fn display(&self) {
621 println!("\n=== Benchmark: {} ===\n", self.name);
622
623 for variant in &self.variants {
624 println!("{}: ", variant.name);
625 println!(
626 " Mean: {:.3}ms ± {:.3}ms",
627 variant.mean.as_secs_f64() * 1000.0,
628 variant.std_dev.as_secs_f64() * 1000.0
629 );
630 println!(" Median: {:.3}ms", variant.median.as_secs_f64() * 1000.0);
631 println!(
632 " Min: {:.3}ms, Max: {:.3}ms",
633 variant.min.as_secs_f64() * 1000.0,
634 variant.max.as_secs_f64() * 1000.0
635 );
636
637 if variant.name == self.baseline {
638 println!(" (baseline)");
639 } else {
640 println!(" Speedup: {:.2}x", variant.speedup);
641 }
642 println!();
643 }
644
645 println!(
646 "Fastest: {} ({:.2}x faster than baseline)",
647 self.fastest,
648 self.variants
649 .iter()
650 .find(|v| v.name == self.fastest)
651 .map(|v| v.speedup)
652 .unwrap_or(1.0)
653 );
654 }
655}
656
/// Picks the fastest of several registered variants.
pub struct AutoTuner {
    /// Tuner name.
    pub name: String,
    /// Registered variants: a label and a closure that returns the time the
    /// variant took (or an error).
    pub test_data: Vec<(String, Box<dyn Fn() -> NdimageResult<Duration>>)>,
}
662
663impl AutoTuner {
664 pub fn new(name: impl Into<String>) -> Self {
665 Self {
666 name: name.into(),
667 test_data: Vec::new(),
668 }
669 }
670
671 pub fn add_variant<F>(&mut self, name: impl Into<String>, f: F)
672 where
673 F: Fn() -> NdimageResult<Duration> + 'static,
674 {
675 self.test_data.push((name.into(), Box::new(f)));
676 }
677
678 pub fn find_optimal(&self) -> NdimageResult<String> {
679 let mut best_time = Duration::MAX;
680 let mut best_variant = String::new();
681
682 for (name, test_fn) in &self.test_data {
683 let time = test_fn()?;
684 if time < best_time {
685 best_time = time;
686 best_variant = name.clone();
687 }
688 }
689
690 Ok(best_variant)
691 }
692}
693
/// Turns recorded metrics plus hardware information into optimization advice.
pub struct OptimizationAdvisor {
    /// Copy of the metrics passed to the most recent `analyze` call.
    metrics: Vec<OperationMetrics>,
    /// Hardware description obtained once, at construction.
    hardware_info: HardwareInfo,
}
701
/// Description of the machine used by the optimization heuristics.
#[derive(Debug, Clone)]
pub struct HardwareInfo {
    /// Number of CPUs as reported by `num_cpus::get()`.
    pub cpu_cores: usize,
    /// SIMD instruction sets considered available.
    pub simd_support: SimdSupport,
    /// `true` when the crate was built with the `cuda` or `opencl` feature.
    pub gpu_available: bool,
    /// Total memory; currently a fixed value rather than a measured one
    /// (presumably bytes — TODO confirm).
    pub total_memory: usize,
    /// CPU cache sizes; currently fixed values rather than measured ones.
    pub cache_sizes: CacheSizes,
}
710
/// Flags for the SIMD instruction sets considered available.
#[derive(Debug, Clone)]
pub struct SimdSupport {
    /// SSE (x86_64).
    pub sse: bool,
    /// AVX (x86_64).
    pub avx: bool,
    /// AVX2 (x86_64); gates the vectorization recommendation.
    pub avx2: bool,
    /// AVX-512 (x86_64).
    pub avx512: bool,
    /// NEON (aarch64).
    pub neon: bool,
}
719
/// CPU cache sizes in bytes (`l3` is compared against a working-set size in bytes).
#[derive(Debug, Clone)]
pub struct CacheSizes {
    /// L1 cache size.
    pub l1: usize,
    /// L2 cache size.
    pub l2: usize,
    /// L3 cache size.
    pub l3: usize,
}
726
727impl OptimizationAdvisor {
728 pub fn new() -> Self {
729 Self {
730 metrics: Vec::new(),
731 hardware_info: HardwareInfo::detect(),
732 }
733 }
734
735 pub fn analyze(&mut self, metrics: &[OperationMetrics]) -> OptimizationReport {
736 self.metrics = metrics.to_vec();
737
738 let mut recommendations = Vec::new();
739
740 recommendations.extend(self.analyze_memory_patterns());
742
743 recommendations.extend(self.analyze_computation_patterns());
745
746 recommendations.extend(self.analyze_parallelization());
748
749 recommendations.extend(self.analyze_gpu_opportunities());
751
752 let estimated_speedup = self.estimate_speedup(&recommendations);
753 let implementation_difficulty = self.assess_difficulty(&recommendations);
754
755 OptimizationReport {
756 recommendations,
757 estimated_speedup,
758 implementation_difficulty,
759 }
760 }
761
762 fn analyze_memory_patterns(&self) -> Vec<OptimizationRecommendation> {
763 let mut recommendations = Vec::new();
764
765 let mut op_groups: HashMap<String, Vec<&OperationMetrics>> = HashMap::new();
767 for metric in &self.metrics {
768 op_groups
769 .entry(metric.name.clone())
770 .or_default()
771 .push(metric);
772 }
773
774 for (op_name, metrics) in op_groups {
776 let avg_array_size: usize = metrics
777 .iter()
778 .map(|m| m.arrayshape.iter().product::<usize>())
779 .sum::<usize>()
780 / metrics.len().max(1);
781
782 let element_size = std::mem::size_of::<f64>(); let working_set_size = avg_array_size * element_size;
784
785 if working_set_size > self.hardware_info.cache_sizes.l3 {
786 recommendations.push(OptimizationRecommendation {
787 operation: op_name.clone(),
788 category: OptimizationCategory::Memory,
789 description: "Working set exceeds L3 cache".to_string(),
790 suggestion: "Consider tiling/blocking to improve cache locality".to_string(),
791 estimated_improvement: 1.5,
792 });
793 }
794
795 if op_name.contains("transpose") || op_name.contains("permute") {
797 recommendations.push(OptimizationRecommendation {
798 operation: op_name,
799 category: OptimizationCategory::Memory,
800 description: "Potentially cache-unfriendly access pattern".to_string(),
801 suggestion: "Use blocked/tiled algorithms for better cache usage".to_string(),
802 estimated_improvement: 1.3,
803 });
804 }
805 }
806
807 recommendations
808 }
809
810 fn analyze_computation_patterns(&self) -> Vec<OptimizationRecommendation> {
811 let mut recommendations = Vec::new();
812
813 for metric in &self.metrics {
815 let array_size: usize = metric.arrayshape.iter().product();
816
817 if array_size > 1000 && !metric.name.contains("simd") {
818 if self.hardware_info.simd_support.avx2 {
819 recommendations.push(OptimizationRecommendation {
820 operation: metric.name.clone(),
821 category: OptimizationCategory::Vectorization,
822 description: "Operation could benefit from SIMD vectorization".to_string(),
823 suggestion: "Implement SIMD version using AVX2 intrinsics".to_string(),
824 estimated_improvement: 2.0,
825 });
826 }
827 }
828 }
829
830 recommendations
831 }
832
833 fn analyze_parallelization(&self) -> Vec<OptimizationRecommendation> {
834 let mut recommendations = Vec::new();
835
836 for metric in &self.metrics {
837 let array_size: usize = metric.arrayshape.iter().product();
838
839 if array_size > 50_000 && metric.thread_count == 1 {
841 recommendations.push(OptimizationRecommendation {
842 operation: metric.name.clone(),
843 category: OptimizationCategory::Parallelization,
844 description: "Large operation running on single thread".to_string(),
845 suggestion: format!(
846 "Parallelize across {} cores for better performance",
847 self.hardware_info.cpu_cores
848 ),
849 estimated_improvement: (self.hardware_info.cpu_cores as f64).min(4.0),
850 });
851 }
852 }
853
854 recommendations
855 }
856
857 fn analyze_gpu_opportunities(&self) -> Vec<OptimizationRecommendation> {
858 let mut recommendations = Vec::new();
859
860 if !self.hardware_info.gpu_available {
861 return recommendations;
862 }
863
864 for metric in &self.metrics {
865 let array_size: usize = metric.arrayshape.iter().product();
866
867 if array_size > 1_000_000 && metric.backend == Backend::Cpu {
869 recommendations.push(OptimizationRecommendation {
870 operation: metric.name.clone(),
871 category: OptimizationCategory::GpuOffloading,
872 description: "Large array operation suitable for GPU acceleration".to_string(),
873 suggestion: "Offload to GPU for significant speedup".to_string(),
874 estimated_improvement: 10.0,
875 });
876 }
877 }
878
879 recommendations
880 }
881
882 fn estimate_speedup(&self, recommendations: &[OptimizationRecommendation]) -> f64 {
883 let mut total_improvement = 1.0;
885
886 for rec in recommendations {
887 total_improvement *= 1.0 + (rec.estimated_improvement - 1.0) * 0.7;
889 }
890
891 total_improvement
892 }
893
894 fn assess_difficulty(
895 &self,
896 recommendations: &[OptimizationRecommendation],
897 ) -> ImplementationDifficulty {
898 let max_difficulty = recommendations
899 .iter()
900 .map(|r| match r.category {
901 OptimizationCategory::Memory => 2,
902 OptimizationCategory::Vectorization => 3,
903 OptimizationCategory::Parallelization => 2,
904 OptimizationCategory::GpuOffloading => 4,
905 OptimizationCategory::Algorithm => 3,
906 })
907 .max()
908 .unwrap_or(1);
909
910 match max_difficulty {
911 1 => ImplementationDifficulty::Easy,
912 2 => ImplementationDifficulty::Moderate,
913 3 => ImplementationDifficulty::Hard,
914 _ => ImplementationDifficulty::Expert,
915 }
916 }
917}
918
/// Result of an `OptimizationAdvisor::analyze` run.
#[derive(Debug)]
pub struct OptimizationReport {
    /// Individual recommendations, in the order they were produced.
    pub recommendations: Vec<OptimizationRecommendation>,
    /// Combined speedup estimate over all recommendations (1.0 when there are none).
    pub estimated_speedup: f64,
    /// Difficulty of the hardest recommendation.
    pub implementation_difficulty: ImplementationDifficulty,
}
925
/// One concrete optimization suggestion for one operation.
#[derive(Debug)]
pub struct OptimizationRecommendation {
    /// Name of the operation the suggestion applies to.
    pub operation: String,
    /// Kind of optimization being suggested.
    pub category: OptimizationCategory,
    /// What was observed.
    pub description: String,
    /// What to do about it.
    pub suggestion: String,
    /// Estimated speedup factor (for example 2.0 for "twice as fast").
    pub estimated_improvement: f64,
}
934
/// Kind of optimization a recommendation belongs to.
#[derive(Debug)]
pub enum OptimizationCategory {
    /// Cache and memory-access improvements.
    Memory,
    /// SIMD vectorization.
    Vectorization,
    /// Multi-threading.
    Parallelization,
    /// Moving work to a GPU.
    GpuOffloading,
    /// Algorithmic change; no analysis in this file currently emits it.
    Algorithm,
}
943
/// Rough effort level needed to implement a set of recommendations,
/// listed from least to most demanding.
#[derive(Debug)]
pub enum ImplementationDifficulty {
    /// Used when there are no recommendations.
    Easy,
    /// Hardest recommendation is a memory or parallelization change.
    Moderate,
    /// Hardest recommendation is a vectorization or algorithm change.
    Hard,
    /// Hardest recommendation is GPU offloading.
    Expert,
}
951
952impl HardwareInfo {
953 fn detect() -> Self {
954 Self {
955 cpu_cores: num_cpus::get(),
956 simd_support: SimdSupport::detect(),
957 gpu_available: cfg!(feature = "cuda") || cfg!(feature = "opencl"),
958 total_memory: 16_000_000_000, cache_sizes: CacheSizes {
960 l1: 32_768, l2: 262_144, l3: 8_388_608, },
964 }
965 }
966}
967
968impl SimdSupport {
969 fn detect() -> Self {
970 #[cfg(target_arch = "x86_64")]
971 {
972 Self {
973 sse: is_x86_feature_detected!("sse"),
974 avx: is_x86_feature_detected!("avx"),
975 avx2: is_x86_feature_detected!("avx2"),
976 avx512: false, neon: false,
978 }
979 }
980 #[cfg(target_arch = "aarch64")]
981 {
982 Self {
983 sse: false,
984 avx: false,
985 avx2: false,
986 avx512: false,
987 neon: true,
988 }
989 }
990 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
991 {
992 Self {
993 sse: false,
994 avx: false,
995 avx2: false,
996 avx512: false,
997 neon: false,
998 }
999 }
1000 }
1001}
1002
1003impl OptimizationReport {
1004 pub fn display(&self) {
1005 println!("\n=== Optimization Report ===\n");
1006
1007 println!("Estimated overall speedup: {:.1}x", self.estimated_speedup);
1008 println!(
1009 "Implementation difficulty: {:?}\n",
1010 self.implementation_difficulty
1011 );
1012
1013 println!("Recommendations:");
1014 for (i, rec) in self.recommendations.iter().enumerate() {
1015 println!("\n{}. {} - {:?}", i + 1, rec.operation, rec.category);
1016 println!(" Issue: {}", rec.description);
1017 println!(" Suggestion: {}", rec.suggestion);
1018 println!(
1019 " Potential improvement: {:.1}x",
1020 rec.estimated_improvement
1021 );
1022 }
1023 }
1024}
1025
/// Tracks allocation totals per named operation.
pub struct MemoryProfiler {
    /// Allocation bookkeeping keyed by operation name.
    allocations: Mutex<HashMap<String, AllocationInfo>>,
    /// When `false`, the tracking methods return without recording anything.
    enabled: AtomicBool,
}
1031
/// Allocation counters for one operation name.
#[derive(Debug, Clone)]
struct AllocationInfo {
    /// Sum of all tracked allocation sizes.
    total_allocated: usize,
    /// Tracked allocations minus tracked deallocations (never below zero).
    current_allocated: usize,
    /// Highest value `current_allocated` has reached.
    peak_allocated: usize,
    /// Number of tracked allocations.
    allocation_count: usize,
}
1039
1040impl MemoryProfiler {
1041 pub fn new() -> Self {
1042 Self {
1043 allocations: Mutex::new(HashMap::new()),
1044 enabled: AtomicBool::new(false),
1045 }
1046 }
1047
1048 pub fn enable(&self) {
1049 self.enabled
1050 .store(true, std::sync::atomic::Ordering::Relaxed);
1051 }
1052
1053 pub fn disable(&self) {
1054 self.enabled
1055 .store(false, std::sync::atomic::Ordering::Relaxed);
1056 }
1057
1058 pub fn track_allocation(&self, operation: &str, size: usize) {
1059 if !self.enabled.load(std::sync::atomic::Ordering::Relaxed) {
1060 return;
1061 }
1062
1063 let mut allocations = self
1064 .allocations
1065 .lock()
1066 .expect("Memory allocations mutex should not be poisoned");
1067 let info = allocations
1068 .entry(operation.to_string())
1069 .or_insert(AllocationInfo {
1070 total_allocated: 0,
1071 current_allocated: 0,
1072 peak_allocated: 0,
1073 allocation_count: 0,
1074 });
1075
1076 info.total_allocated += size;
1077 info.current_allocated += size;
1078 info.peak_allocated = info.peak_allocated.max(info.current_allocated);
1079 info.allocation_count += 1;
1080 }
1081
1082 pub fn track_deallocation(&self, operation: &str, size: usize) {
1083 if !self.enabled.load(std::sync::atomic::Ordering::Relaxed) {
1084 return;
1085 }
1086
1087 let mut allocations = self
1088 .allocations
1089 .lock()
1090 .expect("Memory allocations mutex should not be poisoned");
1091 if let Some(info) = allocations.get_mut(operation) {
1092 info.current_allocated = info.current_allocated.saturating_sub(size);
1093 }
1094 }
1095
1096 pub fn report(&self) -> MemoryReport {
1097 let allocations = self
1098 .allocations
1099 .lock()
1100 .expect("Memory allocations mutex should not be poisoned");
1101
1102 let mut operations: Vec<_> = allocations
1103 .iter()
1104 .map(|(name, info)| (name.clone(), info.clone()))
1105 .collect();
1106
1107 operations.sort_by_key(|(_, info)| std::cmp::Reverse(info.peak_allocated));
1108
1109 MemoryReport { operations }
1110 }
1111}
1112
/// Snapshot of a `MemoryProfiler`'s per-operation counters.
#[derive(Debug)]
pub struct MemoryReport {
    /// Operation name and counters, ordered by descending peak usage.
    operations: Vec<(String, AllocationInfo)>,
}
1117
1118impl MemoryReport {
1119 pub fn display(&self) {
1120 println!("\n=== Memory Usage Report ===\n");
1121
1122 for (name, info) in &self.operations {
1123 println!("{}: ", name);
1124 println!(
1125 " Total allocated: {} MB",
1126 info.total_allocated / (1024 * 1024)
1127 );
1128 println!(" Peak usage: {} MB", info.peak_allocated / (1024 * 1024));
1129 println!(" Allocations: {}", info.allocation_count);
1130 println!(
1131 " Avg allocation: {} KB",
1132 (info.total_allocated / info.allocation_count.max(1)) / 1024
1133 );
1134 }
1135 }
1136}
1137
lazy_static::lazy_static! {
    /// Process-wide memory profiler, created (disabled) on first access and
    /// used by the free memory-profiling functions in this module.
    static ref MEMORY_PROFILER: Arc<MemoryProfiler> = Arc::new(MemoryProfiler::new());
}
1142
/// Starts recording on the global memory profiler.
#[allow(dead_code)]
pub fn enable_memory_profiling() {
    MEMORY_PROFILER.enable();
}
1147
/// Stops recording on the global memory profiler; gathered data is kept.
#[allow(dead_code)]
pub fn disable_memory_profiling() {
    MEMORY_PROFILER.disable();
}
1152
/// Returns a snapshot report from the global memory profiler.
#[allow(dead_code)]
pub fn get_memory_report() -> MemoryReport {
    MEMORY_PROFILER.report()
}
1157
1158use std::sync::atomic::AtomicBool;
1159
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// A dropped `ProfilingScope` must show up as exactly one operation in
    /// the global report.
    #[test]
    fn test_profiling_scope() {
        // Uses the process-wide profiler: enable recording and start clean.
        enable_profiling();
        clear_profiling_data();

        {
            // The metric is recorded when `_scope` is dropped at the end of this block.
            let _scope = ProfilingScope::new("test_op", &[100, 100], Backend::Cpu);
            std::thread::sleep(Duration::from_millis(10));
        }

        let report = get_performance_report();
        assert_eq!(report.operation_breakdown.len(), 1);
        assert!(report.operation_breakdown.contains_key("test_op"));
    }

    /// Running two variants must yield two entries in the comparison.
    #[test]
    fn test_benchmark() {
        let mut bench = Benchmark::new("array_operations");

        bench
            .run("baseline", || {
                let a = array![[1.0, 2.0], [3.0, 4.0]];
                Ok(a.sum())
            })
            .expect("benchmark baseline run should succeed");

        bench
            .run("optimized", || {
                let a = array![[1.0, 2.0], [3.0, 4.0]];
                Ok(a.sum())
            })
            .expect("benchmark optimized run should succeed");

        let comparison = bench.compare();
        assert_eq!(comparison.variants.len(), 2);
    }
}