Skip to main content

torsh_profiler/
lib.rs

1//! Performance profiling for ToRSh
2//!
3//! This crate provides comprehensive performance profiling capabilities for the ToRSh
4//! deep learning framework, including CPU, GPU, memory, and system profiling.
5//!
6//! # Refactored Modular Structure
7//!
8//! The profiler has been successfully refactored from a massive 9,517-line monolithic file
9//! into a clean, maintainable modular structure:
10//!
11//! - `core`: Core profiling types, event management, and profiler implementation
12//! - `platforms`: Platform-specific profiling (CPU, GPU, system)
13//! - `analysis`: Performance analysis and optimization recommendations
14//! - `export`: Export and reporting functionality with multiple format support
15//! - `distributed`: Distributed profiling coordination
16//!
17//! # Usage Examples
18//!
19//! ## Basic Profiling
20//!
21//! ```rust
22//! use torsh_profiler::{start_profiling, stop_profiling, profile_scope};
23//!
24//! // Start global profiling
25//! start_profiling();
26//!
27//! {
28//!     profile_scope!("computation");
29//!     // Your code here
30//! }
31//!
32//! stop_profiling();
33//! ```
34//!
35//! ## Advanced Profiling with Metrics
36//!
37//! ```rust
38//! use torsh_profiler::{MetricsScope, export_global_events, ExportFormat};
39//!
40//! fn main() -> Result<(), Box<dyn std::error::Error>> {
41//!     {
42//!         let mut scope = MetricsScope::new("training_step");
43//!         scope.set_operation_count(1000);
44//!         scope.set_flops(50000);
45//!         scope.set_bytes_transferred(4096);
46//!         // Training code here
47//!     }
48//!
49//!     // Export results
50//!     export_global_events(ExportFormat::ChromeTrace, "profile.json")?;
51//!     Ok(())
52//! }
53//! ```
54//!
55//! ## Platform-Specific Profiling
56//!
57//! ```rust
58//! use torsh_profiler::{UnifiedProfiler, CudaProfiler, MemoryProfiler};
59//!
60//! fn main() -> Result<(), Box<dyn std::error::Error>> {
61//!     let mut profiler = UnifiedProfiler::with_auto_detection();
62//!     profiler.start_all()?;
63//!
64//!     // Your GPU/CPU workload
65//!
66//!     profiler.stop_all()?;
67//!     Ok(())
68//! }
69//! ```
70
71// Allow attributes for library code that may have unused items in public API
72#![allow(dead_code)]
73#![allow(unused_imports)]
74#![allow(unused_variables)]
75#![allow(unused_mut)]
76#![allow(static_mut_refs)]
77
78use backtrace::Backtrace;
79use once_cell::sync::Lazy;
80use parking_lot::Mutex;
81use std::sync::Arc;
82use std::time::Instant;
83use torsh_core::{Result, TorshError};
84
/// Convenience type alias for Results in this crate
/// (delegates to `torsh_core::Result`; the error type is defined by that crate).
pub type TorshResult<T> = Result<T>;
87
88// ========================================
89// CORE MODULAR STRUCTURE
90// ========================================
91
92/// Core profiling functionality
93pub mod core;
94
95/// Platform-specific profiling implementations
96pub mod platforms;
97
98/// Performance analysis and optimization
99pub mod analysis;
100
101/// Export and reporting capabilities
102pub mod export;
103
104/// Distributed profiling coordination
105pub mod distributed;
106
107// ========================================
108// EXISTING MODULES (maintained for compatibility)
109// ========================================
110
111pub mod advanced_visualization;
112pub mod alerts;
113pub mod amd;
114pub mod attributes;
115pub mod chrome_trace;
116pub mod ci_cd;
117pub mod cloud_providers;
118pub mod cloudwatch;
119pub mod cpu;
120pub mod cross_platform;
121pub mod cuda;
122pub mod custom_export;
123pub mod custom_tools;
124pub mod dashboard;
125pub mod grafana;
126pub mod instruments;
127pub mod integrated_profiler;
128pub mod kubernetes;
129pub mod macros;
130pub mod memory;
131pub mod memory_optimization;
132pub mod ml_analysis;
133pub mod nsight;
134pub mod online_learning;
135pub mod optimization;
136pub mod power;
137pub mod prometheus;
138pub mod regression;
139pub mod reporting;
140pub mod scirs2_integration;
141pub mod streaming;
142pub mod tensorboard;
143pub mod thermal;
144pub mod vtune;
145pub mod workload_characterization;
146
147// ========================================
148// STRUCTURED RE-EXPORTS FOR ENHANCED API
149// ========================================
150
151// Core profiling functionality - Enhanced interface
152pub use core::{
153    add_global_event,
154    add_global_event as add_event,
155    clear_global_events,
156    // Events and metrics
157    events::*,
158    get_global_stats,
159
160    global_profiler,
161    metrics::*,
162    profile_function_with_category,
163
164    start_profiling,
165    stop_profiling,
166    MetricsScope,
167    // Core profiler implementation
168    Profiler,
169    // Scope-based profiling
170    ScopeGuard,
171};
172
173// Enhanced export functionality
174pub use export::{
175    available_format_names,
176    // Existing functionality
177    dashboard::*,
178    export_chrome_trace_format,
179    export_csv_format,
180
181    export_events,
182    export_global_events,
183    export_json_format,
184    formats::*,
185    parse_format,
186    reporting::*,
187    ExportFormat,
188};
189
190// Prometheus metrics export
191pub use prometheus::{PrometheusExporter, PrometheusExporterBuilder};
192
193// Grafana dashboard integration
194pub use grafana::{
195    Dashboard as GrafanaDashboard, DashboardTemplates, GrafanaDashboardGenerator, GridPos, Panel,
196    Target,
197};
198
199// AWS CloudWatch metrics integration
200pub use cloudwatch::{
201    CloudWatchConfig, CloudWatchPublisher, CloudWatchPublisherBuilder, Dimension, MetricDatum,
202    StatisticSet, Unit as CloudWatchUnit,
203};
204
205// Platform profiling interfaces
206pub use platforms::{cpu::*, gpu::*, system::*};
207
208// Analysis capabilities
209pub use analysis::{ml_analysis::*, optimization::*, regression::*};
210
211// Distributed profiling
212pub use distributed::profiling::*;
213
214// Real-time streaming capabilities
215pub use streaming::{
216    create_high_performance_streaming_engine, create_low_latency_streaming_engine,
217    create_streaming_engine, AdaptiveBitrateConfig, AdaptiveRateController, AdjustmentReason,
218    AdvancedFeatures, BitrateAdjustment, BufferedEvent, CompressionAlgorithm, CompressionConfig,
219    CompressionManager, ConnectionManager, ControlMessage, EnhancedStreamingEngine, EventBuffer,
220    EventPriority, ProtocolConfig, QualityConfig, QualityLevel, QualityMetricsThreshold,
221    SSEConnection, StreamConnection, StreamingConfig, StreamingProtocol, StreamingStats,
222    StreamingStatsSnapshot, TcpConnection, UdpConnection, WebSocketConnection, WebSocketMessage,
223};
224
225// ========================================
226// ESSENTIAL BACKWARD COMPATIBILITY RE-EXPORTS
227// ========================================
228
229// Critical re-exports for existing API compatibility
230pub use alerts::{
231    create_alert_manager_with_config, get_alert_manager, AlertConfig, AlertManager,
232    NotificationChannel,
233};
234pub use attributes::{
235    get_registry, with_profiling, AsyncProfiler, AttributeRegistry, ConditionalProfiler,
236    ProfileAttribute, ProfiledFunction, ProfiledStruct,
237};
238pub use chrome_trace::{create_chrome_event, export, export_to_writer, phases, scopes};
239pub use ci_cd::{CiCdConfig, CiCdIntegration, CiCdPlatform};
240pub use cpu::{CpuProfiler, ProfileScope};
241pub use cuda::{
242    get_cuda_device_properties, get_cuda_memory_stats, CudaEvent, CudaMemoryStats, CudaProfiler,
243    CudaSynchronizationStats, NvtxRange,
244};
245pub use custom_export::{
246    CsvColumn, CsvFormatter, CustomExportFormat, CustomExporter, ExportSchema,
247};
248pub use dashboard::alerts::create_alert_manager;
249pub use dashboard::{
250    create_dashboard, create_dashboard_with_config, export_dashboard_html, generate_3d_landscape,
251    generate_performance_heatmap, Dashboard, DashboardAlert, DashboardAlertSeverity,
252    DashboardConfig, DashboardData, HeatmapCell, MemoryMetrics, OperationSummary,
253    PerformanceHeatmap, PerformanceLandscape, PerformanceMetrics, PerformancePoint3D,
254    SystemMetrics, VisualizationColorScheme, VisualizationConfig, WebSocketConfig,
255};
256
257// SCIRS2 Integration re-exports - Enhanced with comprehensive features
258pub use scirs2_integration::{
259    AdvancedProfilingConfig, BenchmarkResults, HistogramStats, MetricsSummary, PerformanceAnalysis,
260    PerformanceTargets, SamplingStrategy, ScirS2EnhancedProfiler, ScirS2ProfilingData,
261    ValidationLevel,
262};
263
264// Memory Optimization re-exports
265pub use instruments::{
266    create_instruments_profiler, create_instruments_profiler_with_config, export_instruments_json,
267    get_instruments_statistics, AllocationType, EnergyComponent, InstrumentsConfig,
268    InstrumentsExportData, InstrumentsProfiler, InstrumentsStats, SignpostInterval,
269};
270pub use macros::ProfileResult;
271pub use memory::{
272    FragmentationAnalysis, LeakDetectionResults, MemoryBlock, MemoryEvent, MemoryEventType,
273    MemoryLeak, MemoryProfiler, MemoryStats, MemoryTimeline, SystemMemoryInfo,
274};
275pub use memory_optimization::{
276    create_memory_optimizer, create_memory_optimizer_for_low_memory,
277    create_memory_optimizer_with_aggressive_settings, AdaptivePoolManager, AdvancedMemoryOptimizer,
278    MemoryOptimizationConfig, MemoryOptimizationStats, MemorySnapshot, MemoryStrategies,
279    MemoryUsagePredictor, OptimizationExportData, OptimizationStatsSummary,
280};
281
282// ========================================
283// ENHANCED UNIFIED PROFILING INTERFACE
284// ========================================
285
/// Enhanced unified profiler combining all platform profilers with simplified API
pub struct UnifiedProfiler {
    /// CPU-side profiling backend(s).
    pub cpu_platform: platforms::cpu::CpuProfilerPlatform,
    /// GPU profiling backend(s).
    pub gpu_platform: platforms::gpu::GpuProfilerPlatform,
    /// System-level (OS-wide) profiling backend(s).
    pub system_platform: platforms::system::SystemProfilerPlatform,
    /// Aggregated event store; read by `export_all`.
    pub event_collector: core::events::EventCollector,
}
293
impl UnifiedProfiler {
    /// Create a new unified profiler with all platforms
    ///
    /// Platforms are constructed bare, with no backends attached; use
    /// [`UnifiedProfiler::with_auto_detection`] for a configured instance.
    pub fn new() -> Self {
        Self {
            cpu_platform: platforms::cpu::CpuProfilerPlatform::new(),
            gpu_platform: platforms::gpu::GpuProfilerPlatform::new(),
            system_platform: platforms::system::SystemProfilerPlatform::new(),
            event_collector: core::events::EventCollector::new(),
        }
    }

    /// Create with optimal platform detection
    ///
    /// Attaches the base CPU profiler, then Instruments on macOS or VTune on
    /// Linux, plus the optimal GPU profiler and full system profiling.
    pub fn with_auto_detection() -> Self {
        let cpu_platform = platforms::cpu::CpuProfilerPlatform::new().with_cpu_profiler();

        // Each cfg block shadows `cpu_platform` with an OS-specific variant.
        #[cfg(target_os = "macos")]
        let cpu_platform = cpu_platform.with_instruments();

        #[cfg(target_os = "linux")]
        let cpu_platform = cpu_platform.with_vtune();

        let gpu_platform = platforms::gpu::GpuProfilerPlatform::new().with_optimal_profiler();
        let system_platform =
            platforms::system::SystemProfilerPlatform::new().with_all_system_profiling();

        Self {
            cpu_platform,
            gpu_platform,
            system_platform,
            event_collector: core::events::EventCollector::new(),
        }
    }

    /// Start all profiling platforms
    ///
    /// Fails fast: if an earlier platform errors, later platforms are not
    /// started and nothing is rolled back.
    pub fn start_all(&mut self) -> TorshResult<()> {
        self.cpu_platform.start_profiling()?;
        self.gpu_platform.start_profiling()?;
        self.system_platform.start_profiling()?;
        Ok(())
    }

    /// Stop all profiling platforms
    ///
    /// Like `start_all`, stops in order and returns on the first error.
    pub fn stop_all(&mut self) -> TorshResult<()> {
        self.cpu_platform.stop_profiling()?;
        self.gpu_platform.stop_profiling()?;
        self.system_platform.stop_profiling()?;
        Ok(())
    }

    /// Export all collected data in specified format
    pub fn export_all(&self, format: export::ExportFormat, base_path: &str) -> TorshResult<()> {
        let profiling_events = self.event_collector.get_events();
        // Convert ProfilingEvent to ProfileEvent
        let events: Vec<ProfileEvent> = profiling_events
            .iter()
            .map(|pe| ProfileEvent {
                name: pe.name.clone(),
                category: pe.category.clone(),
                // NOTE(review): `elapsed()` measures from the event's start to
                // *now*, so earlier events get *larger* `start_us` values —
                // confirm this inverted encoding is what the export formats expect.
                start_us: pe.start_time.elapsed().as_micros() as u64,
                duration_us: pe.duration.map(|d| d.as_micros() as u64).unwrap_or(0),
                thread_id: pe.thread_id,
                // Metric fields are not populated by this conversion.
                operation_count: None,
                flops: None,
                bytes_transferred: None,
                stack_trace: None,
            })
            .collect();
        export::export_events(&events, format, base_path)
    }
}
364
365impl Default for UnifiedProfiler {
366    fn default() -> Self {
367        Self::new()
368    }
369}
370
371// ========================================
372// CONVENIENCE FACTORY FUNCTIONS
373// ========================================
374
/// Create a unified profiler with automatic platform detection
///
/// Thin wrapper over [`UnifiedProfiler::with_auto_detection`].
pub fn create_unified_profiler() -> UnifiedProfiler {
    UnifiedProfiler::with_auto_detection()
}
379
/// Create a basic profiler for development
///
/// No platform auto-detection; equivalent to [`UnifiedProfiler::new`].
pub fn create_basic_profiler() -> UnifiedProfiler {
    UnifiedProfiler::new()
}
384
385/// Create a profiler optimized for production use
386pub fn create_production_profiler() -> UnifiedProfiler {
387    let mut profiler = UnifiedProfiler::with_auto_detection();
388    // Configure for minimal overhead
389    profiler
390}
391
392// ========================================
393// ENHANCED GLOBAL API FUNCTIONS
394// ========================================
395
396/// Enhanced global export functions with multiple format support
397pub fn export_global_trace(path: &str) -> TorshResult<()> {
398    export_global_events(export::ExportFormat::ChromeTrace, path)
399}
400
401pub fn export_global_json(path: &str) -> TorshResult<()> {
402    export_global_events(export::ExportFormat::Json, path)
403}
404
405pub fn export_global_csv(path: &str) -> TorshResult<()> {
406    export_global_events(export::ExportFormat::Csv, path)
407}
408
409pub fn export_global_tensorboard(base_path: &str) -> TorshResult<()> {
410    let profiler_arc = global_profiler();
411    let profiler_guard = profiler_arc.lock();
412    let events = profiler_guard.events().to_vec();
413
414    crate::tensorboard::export_tensorboard_profile(&events, base_path)
415}
416
/// Global custom exporter instance
///
/// Lazily initialized on first access; guarded by a `parking_lot::Mutex`.
static GLOBAL_CUSTOM_EXPORTER: Lazy<Mutex<custom_export::CustomExporter>> =
    Lazy::new(|| Mutex::new(custom_export::CustomExporter::new()));
420
421/// Get available custom export format names
422pub fn get_global_custom_export_formats() -> Vec<String> {
423    let exporter = GLOBAL_CUSTOM_EXPORTER.lock();
424    exporter.get_format_names()
425}
426
427/// Register a custom export format globally
428pub fn register_global_custom_export_format(format: custom_export::CustomExportFormat) {
429    let mut exporter = GLOBAL_CUSTOM_EXPORTER.lock();
430    exporter.register_format(format);
431}
432
433/// Export using a custom format
434pub fn export_global_custom(format_name: &str, path: &str) -> TorshResult<()> {
435    let profiler_arc = global_profiler();
436    let profiler_guard = profiler_arc.lock();
437    let events = profiler_guard.events().to_vec();
438    drop(profiler_guard);
439
440    let exporter = GLOBAL_CUSTOM_EXPORTER.lock();
441    exporter.export(&events, format_name, path)
442}
443
/// Set global stack traces enabled with enhanced functionality
///
/// Delegates to `core::profiler`; when enabled, subsequently recorded events
/// may carry a captured stack trace (see `ProfileEvent::stack_trace`).
pub fn set_global_stack_traces_enabled(enabled: bool) {
    core::profiler::set_global_stack_traces_enabled(enabled);
}
448
/// Performance anomaly data structure
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PerformanceAnomaly {
    /// Name of the event the anomaly was observed on.
    pub event_name: String,
    /// Human-readable description of the anomaly.
    pub description: String,
    /// Detection confidence score (presumably in [0, 1] — TODO confirm scale).
    pub confidence: f64,
    /// Severity label (stringly-typed; values chosen by the detector).
    pub severity: String,
}

/// Memory anomaly data structure
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MemoryAnomaly {
    /// Kind of memory anomaly detected (stringly-typed).
    pub anomaly_type: String,
    /// Detection confidence score (presumably in [0, 1] — TODO confirm scale).
    pub confidence: f64,
}

/// Anomaly analysis result structure
///
/// Aggregate returned by [`detect_global_anomalies`]; currently always empty.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct AnomalyAnalysis {
    pub performance_anomalies: Vec<PerformanceAnomaly>,
    pub memory_anomalies: Vec<MemoryAnomaly>,
    pub throughput_anomalies: Vec<String>,
    pub temporal_anomalies: Vec<String>,
}

/// Performance pattern data structure
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PerformancePattern {
    /// Category of the detected pattern (stringly-typed).
    pub pattern_type: String,
    /// Human-readable description of the pattern.
    pub description: String,
    /// Detection confidence score (presumably in [0, 1] — TODO confirm scale).
    pub confidence_score: f64,
    /// Suggested optimization category.
    pub optimization_type: String,
    /// Expected benefit, as free text.
    pub potential_improvement: String,
    /// Rough effort estimate, as free text.
    pub implementation_complexity: String,
}

/// Pattern analysis result structure
///
/// Aggregate returned by [`detect_global_patterns`]; currently always empty.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PatternAnalysis {
    pub performance_patterns: Vec<PerformancePattern>,
    pub bottleneck_patterns: Vec<String>,
    pub resource_patterns: Vec<String>,
    pub temporal_patterns: Vec<String>,
    pub optimization_patterns: Vec<PerformancePattern>,
}
494
495/// Detect global anomalies in profiling data (stub implementation)
496pub fn detect_global_anomalies() -> AnomalyAnalysis {
497    AnomalyAnalysis {
498        performance_anomalies: Vec::new(),
499        memory_anomalies: Vec::new(),
500        throughput_anomalies: Vec::new(),
501        temporal_anomalies: Vec::new(),
502    }
503}
504
505/// Detect global patterns in profiling data (stub implementation)
506pub fn detect_global_patterns() -> PatternAnalysis {
507    PatternAnalysis {
508        performance_patterns: Vec::new(),
509        bottleneck_patterns: Vec::new(),
510        resource_patterns: Vec::new(),
511        temporal_patterns: Vec::new(),
512        optimization_patterns: Vec::new(),
513    }
514}
515
516/// Export global anomaly analysis (stub implementation)
517pub fn export_global_anomaly_analysis(path: &str) -> TorshResult<()> {
518    let analysis = detect_global_anomalies();
519    let json = serde_json::to_string_pretty(&analysis).map_err(|e| {
520        TorshError::SerializationError(format!("Failed to serialize anomaly analysis: {e}"))
521    })?;
522    std::fs::write(path, json)
523        .map_err(|e| TorshError::IoError(format!("Failed to write anomaly analysis: {e}")))?;
524    Ok(())
525}
526
527/// Export global pattern analysis (stub implementation)
528pub fn export_global_pattern_analysis(path: &str) -> TorshResult<()> {
529    let analysis = detect_global_patterns();
530    let json = serde_json::to_string_pretty(&analysis).map_err(|e| {
531        TorshError::SerializationError(format!("Failed to serialize pattern analysis: {e}"))
532    })?;
533    std::fs::write(path, json)
534        .map_err(|e| TorshError::IoError(format!("Failed to write pattern analysis: {e}")))?;
535    Ok(())
536}
537
538// Import proper correlation analysis types from core::metrics
539pub use core::metrics::{
540    CorrelationAnalysis, CorrelationStrength, CorrelationSummary, CorrelationType,
541    MemoryCorrelation, OperationCorrelation, PerformanceCorrelation, TemporalCorrelation,
542};
543
/// Analyze global correlations with proper implementation
///
/// Takes a snapshot of the global profiler's events and derives:
/// - pairwise operation correlations (co-occurrence + temporal proximity),
/// - per-operation duration-variance "performance correlations",
/// - a summary with counts and optimization hints.
///
/// Memory and temporal correlation lists are currently never populated.
pub fn analyze_global_correlations() -> CorrelationAnalysis {
    use crate::core::metrics::*;
    use std::collections::HashMap;

    // NOTE(review): the profiler lock is held for the entire analysis;
    // consider dropping `profiler_guard` right after the snapshot.
    let profiler_arc = global_profiler();
    let profiler_guard = profiler_arc.lock();
    let events = profiler_guard.events().to_vec();

    // Fewer than two events: nothing to correlate — return an empty analysis.
    if events.len() < 2 {
        return CorrelationAnalysis {
            operation_correlations: Vec::new(),
            performance_correlations: Vec::new(),
            memory_correlations: Vec::new(),
            temporal_correlations: Vec::new(),
            correlation_summary: CorrelationSummary {
                total_correlations_analyzed: 0,
                strong_correlations_found: 0,
                causal_relationships: 0,
                bottleneck_correlations: 0,
                optimization_opportunities: Vec::new(),
                key_insights: Vec::new(),
            },
        };
    }

    let mut operation_correlations = Vec::new();
    let mut performance_correlations = Vec::new();
    // These two are never pushed to below; they stay empty in the result.
    let mut memory_correlations = Vec::new();
    let mut temporal_correlations = Vec::new();

    // Group events by operation name
    let mut operation_groups: HashMap<String, Vec<&ProfileEvent>> = HashMap::new();
    for event in &events {
        operation_groups
            .entry(event.name.clone())
            .or_default()
            .push(event);
    }

    // Analyze operation correlations
    // Every unordered pair of distinct operations is scored once.
    let operations: Vec<String> = operation_groups.keys().cloned().collect();
    for (i, op_a) in operations.iter().enumerate() {
        for op_b in operations.iter().skip(i + 1) {
            let events_a = &operation_groups[op_a];
            let events_b = &operation_groups[op_b];

            // Calculate co-occurrence frequency
            let co_occurrence = calculate_co_occurrence(events_a, events_b);
            let temporal_proximity = calculate_temporal_proximity(events_a, events_b);

            // Only record pairs that clear a minimal relevance threshold.
            if co_occurrence > 0.1 || temporal_proximity > 0.5 {
                let correlation_strength = if co_occurrence > 0.8 && temporal_proximity > 0.8 {
                    CorrelationStrength::VeryStrong
                } else if co_occurrence > 0.6 || temporal_proximity > 0.6 {
                    CorrelationStrength::Strong
                } else if co_occurrence > 0.4 || temporal_proximity > 0.4 {
                    CorrelationStrength::Moderate
                } else {
                    CorrelationStrength::Weak
                };

                let insights =
                    generate_correlation_insights(op_a, op_b, co_occurrence, temporal_proximity);

                operation_correlations.push(OperationCorrelation {
                    operation_a: op_a.clone(),
                    operation_b: op_b.clone(),
                    correlation_coefficient: (co_occurrence + temporal_proximity) / 2.0,
                    co_occurrence_frequency: co_occurrence,
                    temporal_proximity,
                    correlation_strength,
                    // Only Sequential/Complementary are ever produced here, so
                    // the Causal/Competitive counts in the summary are always 0.
                    correlation_type: if temporal_proximity > co_occurrence {
                        CorrelationType::Sequential
                    } else {
                        CorrelationType::Complementary
                    },
                    insights,
                });
            }
        }
    }

    // Generate performance correlations
    // Per operation: population variance of event durations vs. their mean.
    for event_group in operation_groups.values() {
        if event_group.len() >= 2 {
            let durations: Vec<f64> = event_group.iter().map(|e| e.duration_us as f64).collect();
            let avg_duration = durations.iter().sum::<f64>() / durations.len() as f64;
            let variance = durations
                .iter()
                .map(|d| (d - avg_duration).powi(2))
                .sum::<f64>()
                / durations.len() as f64;

            if variance > 0.0 {
                performance_correlations.push(PerformanceCorrelation {
                    metric_a: "duration".to_string(),
                    metric_b: "variance".to_string(),
                    // Heuristic score, clamped to 1.0; not a Pearson coefficient.
                    correlation_coefficient: (variance / avg_duration).min(1.0),
                    significance_level: if variance > avg_duration * 0.5 {
                        0.95
                    } else {
                        0.7
                    },
                    sample_size: event_group.len(),
                    correlation_strength: if variance > avg_duration {
                        CorrelationStrength::Strong
                    } else {
                        CorrelationStrength::Moderate
                    },
                });
            }
        }
    }

    // Generate summary
    let total_correlations = operation_correlations.len() + performance_correlations.len();
    // Strong = Strong or VeryStrong, counted across both correlation lists.
    let strong_count = operation_correlations
        .iter()
        .filter(|c| {
            matches!(
                c.correlation_strength,
                CorrelationStrength::Strong | CorrelationStrength::VeryStrong
            )
        })
        .count()
        + performance_correlations
            .iter()
            .filter(|c| {
                matches!(
                    c.correlation_strength,
                    CorrelationStrength::Strong | CorrelationStrength::VeryStrong
                )
            })
            .count();

    let correlation_summary = CorrelationSummary {
        total_correlations_analyzed: total_correlations,
        strong_correlations_found: strong_count,
        causal_relationships: operation_correlations
            .iter()
            .filter(|c| matches!(c.correlation_type, CorrelationType::Causal))
            .count(),
        bottleneck_correlations: operation_correlations
            .iter()
            .filter(|c| matches!(c.correlation_type, CorrelationType::Competitive))
            .count(),
        // Surface at most the first three pairs as optimization candidates.
        optimization_opportunities: operation_correlations
            .iter()
            .take(3)
            .map(|c| {
                format!(
                    "{} ↔ {}: Consider optimization",
                    c.operation_a, c.operation_b
                )
            })
            .collect(),
        key_insights: vec![
            format!(
                "Found {} operation correlations with {} strong relationships",
                operation_correlations.len(),
                strong_count
            ),
            "Operations with high co-occurrence may benefit from batching".to_string(),
            "Sequential operations may benefit from pipelining optimizations".to_string(),
        ],
    };

    CorrelationAnalysis {
        operation_correlations,
        performance_correlations,
        memory_correlations,
        temporal_correlations,
        correlation_summary,
    }
}
720
721// Helper functions for correlation analysis
722fn calculate_co_occurrence(events_a: &[&ProfileEvent], events_b: &[&ProfileEvent]) -> f64 {
723    let mut co_occurrences = 0;
724    let window_us = 10000; // 10ms window
725
726    for event_a in events_a {
727        for event_b in events_b {
728            let time_diff = if event_a.start_us > event_b.start_us {
729                event_a.start_us - event_b.start_us
730            } else {
731                event_b.start_us - event_a.start_us
732            };
733
734            if time_diff <= window_us {
735                co_occurrences += 1;
736                break;
737            }
738        }
739    }
740
741    co_occurrences as f64 / events_a.len().max(events_b.len()) as f64
742}
743
744fn calculate_temporal_proximity(events_a: &[&ProfileEvent], events_b: &[&ProfileEvent]) -> f64 {
745    if events_a.is_empty() || events_b.is_empty() {
746        return 0.0;
747    }
748
749    let avg_gap = events_a
750        .iter()
751        .zip(events_b.iter())
752        .map(|(a, b)| {
753            if a.start_us > b.start_us {
754                a.start_us - b.start_us
755            } else {
756                b.start_us - a.start_us
757            }
758        })
759        .sum::<u64>() as f64
760        / events_a.len().min(events_b.len()) as f64;
761
762    // Convert proximity to a 0-1 scale (closer = higher score)
763    1.0 / (1.0 + avg_gap / 1000000.0) // Normalize by 1 second
764}
765
/// Build human-readable insight strings for a correlated operation pair.
///
/// Thresholds: co-occurrence above 0.8 suggests batching, temporal proximity
/// above 0.8 suggests ordering optimizations, and both above 0.5 hints at a
/// dependency between the two operations.
fn generate_correlation_insights(
    op_a: &str,
    op_b: &str,
    co_occurrence: f64,
    temporal_proximity: f64,
) -> Vec<String> {
    let mut notes: Vec<String> = Vec::new();

    if co_occurrence > 0.8 {
        notes.push(format!(
            "{op_a} and {op_b} frequently occur together - consider batching"
        ));
    }
    if temporal_proximity > 0.8 {
        notes.push(format!(
            "{op_a} and {op_b} have high temporal proximity - potential for optimization"
        ));
    }
    if co_occurrence > 0.5 && temporal_proximity > 0.5 {
        notes.push("Strong correlation suggests dependency relationship".to_string());
    }

    notes
}
794
795/// Export performance trend chart (stub implementation)
796pub fn export_performance_trend_chart(
797    profiler: &parking_lot::MutexGuard<'_, Profiler>,
798    path: &str,
799) -> TorshResult<()> {
800    let html = format!(
801        r#"<!DOCTYPE html>
802<html>
803<head><title>Performance Trends</title></head>
804<body>
805<h1>Performance Trends</h1>
806<p>Total events: {}</p>
807<p>Chart generation placeholder</p>
808</body>
809</html>"#,
810        profiler.events.len()
811    );
812    std::fs::write(path, html)
813        .map_err(|e| TorshError::IoError(format!("Failed to write performance trends: {e}")))?;
814    Ok(())
815}
816
817/// Export operation frequency chart (stub implementation)
818pub fn export_operation_frequency_chart(
819    profiler: &parking_lot::MutexGuard<'_, Profiler>,
820    path: &str,
821) -> TorshResult<()> {
822    let html = format!(
823        r#"<!DOCTYPE html>
824<html>
825<head><title>Operation Frequency</title></head>
826<body>
827<h1>Operation Frequency</h1>
828<p>Total events: {}</p>
829<p>Frequency chart generation placeholder</p>
830</body>
831</html>"#,
832        profiler.events.len()
833    );
834    std::fs::write(path, html).map_err(|e| {
835        TorshError::IoError(format!("Failed to write operation frequency chart: {e}"))
836    })?;
837    Ok(())
838}
839
840/// Export global correlation analysis (stub implementation)
841pub fn export_global_correlation_analysis(path: &str) -> TorshResult<()> {
842    let analysis = analyze_global_correlations();
843    let json = serde_json::to_string_pretty(&analysis).map_err(|e| {
844        TorshError::SerializationError(format!("Failed to serialize correlation analysis: {e}"))
845    })?;
846    std::fs::write(path, json)
847        .map_err(|e| TorshError::IoError(format!("Failed to write correlation analysis: {e}")))?;
848    Ok(())
849}
850
851/// Export memory scatter plot (stub implementation)
852pub fn export_memory_scatter_plot(
853    _memory_profiler: &crate::MemoryProfiler,
854    path: &str,
855) -> TorshResult<()> {
856    let html = format!(
857        r#"<!DOCTYPE html>
858<html>
859<head><title>Memory Scatter Plot</title></head>
860<body>
861<h1>Memory Scatter Plot</h1>
862<p>Memory profiler status: active</p>
863<p>Scatter plot generation placeholder</p>
864</body>
865</html>"#
866    );
867    std::fs::write(path, html)
868        .map_err(|e| TorshError::IoError(format!("Failed to write memory scatter plot: {e}")))?;
869    Ok(())
870}
871
872/// Export duration histogram (stub implementation)
873pub fn export_duration_histogram(
874    profiler: &parking_lot::MutexGuard<'_, Profiler>,
875    path: &str,
876) -> TorshResult<()> {
877    let html = format!(
878        r#"<!DOCTYPE html>
879<html>
880<head><title>Duration Histogram</title></head>
881<body>
882<h1>Duration Histogram</h1>
883<p>Total events: {}</p>
884<p>Histogram generation placeholder</p>
885</body>
886</html>"#,
887        profiler.events.len()
888    );
889    std::fs::write(path, html)
890        .map_err(|e| TorshError::IoError(format!("Failed to write duration histogram: {e}")))?;
891    Ok(())
892}
893
/// Check if global stack traces are enabled
///
/// Thin wrapper delegating to [`core::profiler::are_global_stack_traces_enabled`].
pub fn are_global_stack_traces_enabled() -> bool {
    core::profiler::are_global_stack_traces_enabled()
}
898
/// Enable or disable tracking of the profiler's own overhead
///
/// Thin wrapper delegating to
/// [`core::profiler::set_global_overhead_tracking_enabled`].
pub fn set_global_overhead_tracking_enabled(enabled: bool) {
    core::profiler::set_global_overhead_tracking_enabled(enabled);
}
903
/// Check whether overhead tracking is currently enabled
///
/// Thin wrapper delegating to
/// [`core::profiler::is_global_overhead_tracking_enabled`].
pub fn is_global_overhead_tracking_enabled() -> bool {
    core::profiler::is_global_overhead_tracking_enabled()
}
907
/// Get a snapshot of the accumulated profiling-overhead statistics
///
/// Thin wrapper delegating to [`core::profiler::get_global_overhead_stats`].
pub fn get_global_overhead_stats() -> OverheadStats {
    core::profiler::get_global_overhead_stats()
}
911
/// Reset the accumulated overhead statistics
///
/// Thin wrapper delegating to [`core::profiler::reset_global_overhead_stats`].
pub fn reset_global_overhead_stats() {
    core::profiler::reset_global_overhead_stats();
}
915
916// ========================================
917// TYPE DEFINITIONS (extracted from original)
918// ========================================
919
/// Core profiling event structure
///
/// One timed span recorded by the profiler, with optional throughput
/// metadata and an optional captured stack trace.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ProfileEvent {
    /// Human-readable event name (e.g. the profiled scope's label).
    pub name: String,
    /// Category used to group events in exports and analysis.
    pub category: String,
    /// Start timestamp, in microseconds (epoch/origin defined by the
    /// recording profiler — TODO confirm against `core::profiler`).
    pub start_us: u64,
    /// Event duration, in microseconds.
    pub duration_us: u64,
    /// Identifier of the thread that recorded the event.
    pub thread_id: usize,
    /// Optional number of logical operations performed during the event.
    pub operation_count: Option<u64>,
    /// Optional floating-point operation count for the event.
    pub flops: Option<u64>,
    /// Optional number of bytes transferred during the event.
    pub bytes_transferred: Option<u64>,
    /// Optional captured stack trace (present only when capture is enabled).
    pub stack_trace: Option<String>,
}
933
/// Overhead statistics for profiling operations
///
/// Accumulated cost of the profiler itself, broken down by operation type.
/// Times are in nanoseconds; counts are numbers of invocations.
#[derive(Debug, Clone, Default)]
pub struct OverheadStats {
    /// Total time spent adding events, in nanoseconds.
    pub add_event_time_ns: u64,
    /// Number of add-event operations recorded.
    pub add_event_count: u64,
    /// Total time spent capturing stack traces, in nanoseconds.
    pub stack_trace_time_ns: u64,
    /// Number of stack traces captured.
    pub stack_trace_count: u64,
    /// Total time spent in export operations, in nanoseconds.
    pub export_time_ns: u64,
    /// Number of export operations recorded.
    pub export_count: u64,
    /// Total profiling overhead across all categories, in nanoseconds.
    pub total_overhead_ns: u64,
}
945
/// Bottleneck analysis results
///
/// Aggregated output of performance analysis: the worst offenders per
/// category plus human-readable recommendations.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct BottleneckAnalysis {
    /// Operations with the largest durations.
    pub slowest_operations: Vec<BottleneckEvent>,
    /// Locations with notable memory-allocation activity.
    pub memory_hotspots: Vec<MemoryHotspot>,
    /// Detected thread-contention events.
    pub thread_contention: Vec<ThreadContentionEvent>,
    /// Detected efficiency problems.
    pub efficiency_issues: Vec<EfficiencyIssue>,
    /// Human-readable optimization recommendations.
    pub recommendations: Vec<String>,
}
955
/// A performance bottleneck event
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct BottleneckEvent {
    /// Name of the slow operation.
    pub name: String,
    /// Category of the operation.
    pub category: String,
    /// Duration of the operation, in microseconds.
    pub duration_us: u64,
    /// Thread that executed the operation.
    pub thread_id: usize,
    /// Severity classification of this bottleneck.
    pub severity: BottleneckSeverity,
    /// Relative impact score (scale defined by the producing analysis —
    /// TODO confirm against the `analysis` module).
    pub impact_score: f64,
    /// Suggested remediation for this bottleneck.
    pub recommendation: String,
}
967
/// Memory hotspot information
///
/// Describes a code location with notable allocation activity.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MemoryHotspot {
    /// Label identifying the hotspot (presumably a source location or
    /// operation name — verify against the producing analysis).
    pub location: String,
    /// Total number of allocations observed at this location.
    pub total_allocations: usize,
    /// Total bytes allocated at this location.
    pub total_bytes: usize,
    /// Average allocation size, in bytes.
    pub average_size: f64,
    /// Peak number of simultaneously live allocations.
    pub peak_concurrent_allocations: usize,
    /// Severity rating of this hotspot.
    pub severity: BottleneckSeverity,
}
978
/// Thread contention event
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ThreadContentionEvent {
    /// Identifier of the contended thread.
    pub thread_id: usize,
    /// Name of the operation during which contention was observed.
    pub operation: String,
    /// Time spent waiting, in microseconds.
    pub wait_time_us: u64,
    /// Number of contention occurrences observed.
    pub contention_count: usize,
}
987
/// Efficiency issue
///
/// A detected inefficiency together with its estimated impact and a
/// suggested remediation.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EfficiencyIssue {
    /// Classification of the issue.
    pub issue_type: EfficiencyIssueType,
    /// Human-readable description of the problem.
    pub description: String,
    /// Names of the operations affected by this issue.
    pub affected_operations: Vec<String>,
    /// Estimated performance impact (units/scale defined by the producing
    /// analysis — TODO confirm).
    pub performance_impact: f64,
    /// Suggested remediation.
    pub recommendation: String,
}
997
998/// Type of efficiency issue
999#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
1000pub enum EfficiencyIssueType {
1001    LowThroughput,
1002    HighLatency,
1003    MemoryWaste,
1004    CpuUnderutilization,
1005    FrequentAllocation,
1006    LargeAllocation,
1007}
1008
1009/// Severity of a bottleneck
1010#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
1011pub enum BottleneckSeverity {
1012    Low,
1013    Medium,
1014    High,
1015    Critical,
1016}
1017
1018// ========================================
1019// ENHANCED MACRO EXPORTS
1020// ========================================
1021
1022// Re-export enhanced macros
1023// Macros with #[macro_export] are automatically available at crate root
1024
1025// ========================================
1026// COMPREHENSIVE TESTING SUPPORT
1027// ========================================
1028
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread;
    use std::time::Duration;

    /// End-to-end check of the enhanced profiling workflow: scoped
    /// profiling with metrics, then JSON and CSV export.
    #[test]
    fn test_enhanced_profiling_workflow() {
        start_profiling();

        {
            profile_scope!("test_enhanced_workflow");
            thread::sleep(Duration::from_millis(10));

            let mut metrics_scope = MetricsScope::new("computation");
            metrics_scope.set_operation_count(1000);
            metrics_scope.set_flops(5000);
            metrics_scope.set_bytes_transferred(2048);

            thread::sleep(Duration::from_millis(5));
        }

        stop_profiling();

        // Test export functionality
        let json_path = std::env::temp_dir().join("test_enhanced.json");
        let json_str = json_path.display().to_string();
        assert!(export_global_json(&json_str).is_ok());

        let csv_path = std::env::temp_dir().join("test_enhanced.csv");
        let csv_str = csv_path.display().to_string();
        assert!(export_global_csv(&csv_str).is_ok());

        // Clean up
        let _ = std::fs::remove_file(&json_path);
        let _ = std::fs::remove_file(&csv_path);
    }

    /// Smoke test for the unified profiler lifecycle. Start/stop/export
    /// results are deliberately not asserted: they may legitimately fail on
    /// hosts without the corresponding platform support. The previous
    /// `result`/`stop_result`/`export_result` bindings were never read and
    /// produced `unused_variables` warnings, so they are discarded
    /// explicitly with `let _ = ...`.
    #[test]
    fn test_unified_profiler() {
        let mut profiler = create_unified_profiler();

        // Best effort — should not panic even if some platforms are unavailable.
        let _ = profiler.start_all();

        thread::sleep(Duration::from_millis(5));

        let _ = profiler.stop_all();

        // Export smoke test (best effort, see above).
        let unified_path = std::env::temp_dir().join("test_unified.json");
        let unified_str = unified_path.display().to_string();
        let _ = profiler.export_all(export::ExportFormat::Json, &unified_str);

        // Clean up
        let _ = std::fs::remove_file(&unified_path);
    }

    /// Exercises every advertised export format end to end. Export results
    /// are best-effort (some formats may depend on optional features), so
    /// they are discarded explicitly rather than left in unused bindings.
    #[test]
    fn test_enhanced_export_formats() {
        start_profiling();
        {
            profile_scope!("format_test");
            thread::sleep(Duration::from_millis(5));
        }
        stop_profiling();

        // Test all available formats
        let formats = export::available_format_names();
        for format_name in formats {
            if let Some(format) = export::parse_format(&format_name) {
                let path = std::env::temp_dir().join(format!(
                    "test_{}.{}",
                    format_name,
                    format.extension()
                ));
                let path_str = path.display().to_string();
                let _ = export_global_events(format, &path_str);

                // Clean up
                let _ = std::fs::remove_file(&path);
            }
        }
    }

    /// Verifies that overhead tracking records non-zero statistics.
    #[test]
    #[ignore = "Flaky test - passes individually but may fail in full suite"]
    fn test_overhead_tracking() {
        set_global_overhead_tracking_enabled(true);
        start_profiling();

        {
            profile_scope!("overhead_test");
            thread::sleep(Duration::from_millis(5));
        }

        stop_profiling();

        let stats = get_global_overhead_stats();
        assert!(stats.add_event_count > 0);
        assert!(stats.total_overhead_ns > 0);

        reset_global_overhead_stats();
        set_global_overhead_tracking_enabled(false);
    }
}
1136
1137// Version information
1138pub const VERSION: &str = env!("CARGO_PKG_VERSION");
1139pub const VERSION_MAJOR: u32 = 0;
1140pub const VERSION_MINOR: u32 = 1;
1141pub const VERSION_PATCH: u32 = 0;
1142
/// Prelude module for convenient imports
///
/// `use torsh_profiler::prelude::*;` brings the public items of every
/// submodule into scope at once. The glob re-exports may shadow one
/// another, hence the `ambiguous_glob_reexports` allowance.
#[allow(ambiguous_glob_reexports)]
pub mod prelude {
    pub use crate::analysis::*;
    pub use crate::core::*;
    pub use crate::distributed::*;
    pub use crate::export::*;
    pub use crate::platforms::*;
}