torsh-profiler 0.1.2

//! Performance profiling for ToRSh
//!
//! This crate provides comprehensive performance profiling capabilities for the ToRSh
//! deep learning framework, including CPU, GPU, memory, and system profiling.
//!
//! # Refactored Modular Structure
//!
//! The profiler has been successfully refactored from a massive 9,517-line monolithic file
//! into a clean, maintainable modular structure:
//!
//! - `core`: Core profiling types, event management, and profiler implementation
//! - `platforms`: Platform-specific profiling (CPU, GPU, system)
//! - `analysis`: Performance analysis and optimization recommendations
//! - `export`: Export and reporting functionality with multiple format support
//! - `distributed`: Distributed profiling coordination
//!
//! # Usage Examples
//!
//! ## Basic Profiling
//!
//! ```rust
//! use torsh_profiler::{start_profiling, stop_profiling, profile_scope};
//!
//! // Start global profiling
//! start_profiling();
//!
//! {
//!     profile_scope!("computation");
//!     // Your code here
//! }
//!
//! stop_profiling();
//! ```
//!
//! ## Advanced Profiling with Metrics
//!
//! ```rust
//! use torsh_profiler::{MetricsScope, export_global_events, ExportFormat};
//!
//! fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     {
//!         let mut scope = MetricsScope::new("training_step");
//!         scope.set_operation_count(1000);
//!         scope.set_flops(50000);
//!         scope.set_bytes_transferred(4096);
//!         // Training code here
//!     }
//!
//!     // Export results
//!     export_global_events(ExportFormat::ChromeTrace, "profile.json")?;
//!     Ok(())
//! }
//! ```
//!
//! ## Platform-Specific Profiling
//!
//! ```rust
//! use torsh_profiler::{UnifiedProfiler, CudaProfiler, MemoryProfiler};
//!
//! fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let mut profiler = UnifiedProfiler::with_auto_detection();
//!     profiler.start_all()?;
//!
//!     // Your GPU/CPU workload
//!
//!     profiler.stop_all()?;
//!     Ok(())
//! }
//! ```

// Allow attributes for library code that may have unused items in public API
#![allow(dead_code)]
#![allow(unused_imports)]
#![allow(unused_variables)]
#![allow(unused_mut)]
#![allow(static_mut_refs)]

use backtrace::Backtrace;
use once_cell::sync::Lazy;
use parking_lot::Mutex;
use std::sync::Arc;
use std::time::Instant;
use torsh_core::{Result, TorshError};

/// Convenience type alias for Results in this crate
pub type TorshResult<T> = Result<T>;

// ========================================
// CORE MODULAR STRUCTURE
// ========================================

/// Core profiling functionality
pub mod core;

/// Platform-specific profiling implementations
pub mod platforms;

/// Performance analysis and optimization
pub mod analysis;

/// Export and reporting capabilities
pub mod export;

/// Distributed profiling coordination
pub mod distributed;

// ========================================
// EXISTING MODULES (maintained for compatibility)
// ========================================

pub mod advanced_visualization;
pub mod alerts;
pub mod amd;
pub mod attributes;
pub mod chrome_trace;
pub mod ci_cd;
pub mod cloud_providers;
pub mod cloudwatch;
pub mod cpu;
pub mod cross_platform;
pub mod cuda;
pub mod custom_export;
pub mod custom_tools;
pub mod dashboard;
pub mod grafana;
pub mod instruments;
pub mod integrated_profiler;
pub mod kubernetes;
pub mod macros;
pub mod memory;
pub mod memory_optimization;
pub mod ml_analysis;
pub mod nsight;
pub mod online_learning;
pub mod optimization;
pub mod power;
pub mod prometheus;
pub mod regression;
pub mod reporting;
pub mod scirs2_integration;
pub mod streaming;
pub mod tensorboard;
pub mod thermal;
pub mod vtune;
pub mod workload_characterization;

// ========================================
// STRUCTURED RE-EXPORTS FOR ENHANCED API
// ========================================

// Core profiling functionality - Enhanced interface
pub use core::{
    add_global_event,
    add_global_event as add_event,
    clear_global_events,
    // Events and metrics
    events::*,
    get_global_stats,

    global_profiler,
    metrics::*,
    profile_function_with_category,

    start_profiling,
    stop_profiling,
    MetricsScope,
    // Core profiler implementation
    Profiler,
    // Scope-based profiling
    ScopeGuard,
};

// Enhanced export functionality
pub use export::{
    available_format_names,
    // Existing functionality
    dashboard::*,
    export_chrome_trace_format,
    export_csv_format,

    export_events,
    export_global_events,
    export_json_format,
    formats::*,
    parse_format,
    reporting::*,
    ExportFormat,
};

// Prometheus metrics export
pub use prometheus::{PrometheusExporter, PrometheusExporterBuilder};

// Grafana dashboard integration
pub use grafana::{
    Dashboard as GrafanaDashboard, DashboardTemplates, GrafanaDashboardGenerator, GridPos, Panel,
    Target,
};

// AWS CloudWatch metrics integration
pub use cloudwatch::{
    CloudWatchConfig, CloudWatchPublisher, CloudWatchPublisherBuilder, Dimension, MetricDatum,
    StatisticSet, Unit as CloudWatchUnit,
};

// Platform profiling interfaces
pub use platforms::{cpu::*, gpu::*, system::*};

// Analysis capabilities
pub use analysis::{ml_analysis::*, optimization::*, regression::*};

// Distributed profiling
pub use distributed::profiling::*;

// Real-time streaming capabilities
pub use streaming::{
    create_high_performance_streaming_engine, create_low_latency_streaming_engine,
    create_streaming_engine, AdaptiveBitrateConfig, AdaptiveRateController, AdjustmentReason,
    AdvancedFeatures, BitrateAdjustment, BufferedEvent, CompressionAlgorithm, CompressionConfig,
    CompressionManager, ConnectionManager, ControlMessage, EnhancedStreamingEngine, EventBuffer,
    EventPriority, ProtocolConfig, QualityConfig, QualityLevel, QualityMetricsThreshold,
    SSEConnection, StreamConnection, StreamingConfig, StreamingProtocol, StreamingStats,
    StreamingStatsSnapshot, TcpConnection, UdpConnection, WebSocketConnection, WebSocketMessage,
};

// ========================================
// ESSENTIAL BACKWARD COMPATIBILITY RE-EXPORTS
// ========================================

// Critical re-exports for existing API compatibility
pub use alerts::{
    create_alert_manager_with_config, get_alert_manager, AlertConfig, AlertManager,
    NotificationChannel,
};
pub use attributes::{
    get_registry, with_profiling, AsyncProfiler, AttributeRegistry, ConditionalProfiler,
    ProfileAttribute, ProfiledFunction, ProfiledStruct,
};
pub use chrome_trace::{create_chrome_event, export, export_to_writer, phases, scopes};
pub use ci_cd::{CiCdConfig, CiCdIntegration, CiCdPlatform};
pub use cpu::{CpuProfiler, ProfileScope};
pub use cuda::{
    get_cuda_device_properties, get_cuda_memory_stats, CudaEvent, CudaMemoryStats, CudaProfiler,
    CudaSynchronizationStats, NvtxRange,
};
pub use custom_export::{
    CsvColumn, CsvFormatter, CustomExportFormat, CustomExporter, ExportSchema,
};
pub use dashboard::alerts::create_alert_manager;
pub use dashboard::{
    create_dashboard, create_dashboard_with_config, export_dashboard_html, generate_3d_landscape,
    generate_performance_heatmap, Dashboard, DashboardAlert, DashboardAlertSeverity,
    DashboardConfig, DashboardData, HeatmapCell, MemoryMetrics, OperationSummary,
    PerformanceHeatmap, PerformanceLandscape, PerformanceMetrics, PerformancePoint3D,
    SystemMetrics, VisualizationColorScheme, VisualizationConfig, WebSocketConfig,
};

// SCIRS2 Integration re-exports - Enhanced with comprehensive features
pub use scirs2_integration::{
    AdvancedProfilingConfig, BenchmarkResults, HistogramStats, MetricsSummary, PerformanceAnalysis,
    PerformanceTargets, SamplingStrategy, ScirS2EnhancedProfiler, ScirS2ProfilingData,
    ValidationLevel,
};

// Memory Optimization re-exports
pub use instruments::{
    create_instruments_profiler, create_instruments_profiler_with_config, export_instruments_json,
    get_instruments_statistics, AllocationType, EnergyComponent, InstrumentsConfig,
    InstrumentsExportData, InstrumentsProfiler, InstrumentsStats, SignpostInterval,
};
pub use macros::ProfileResult;
pub use memory::{
    FragmentationAnalysis, LeakDetectionResults, MemoryBlock, MemoryEvent, MemoryEventType,
    MemoryLeak, MemoryProfiler, MemoryStats, MemoryTimeline, SystemMemoryInfo,
};
pub use memory_optimization::{
    create_memory_optimizer, create_memory_optimizer_for_low_memory,
    create_memory_optimizer_with_aggressive_settings, AdaptivePoolManager, AdvancedMemoryOptimizer,
    MemoryOptimizationConfig, MemoryOptimizationStats, MemorySnapshot, MemoryStrategies,
    MemoryUsagePredictor, OptimizationExportData, OptimizationStatsSummary,
};

// ========================================
// ENHANCED UNIFIED PROFILING INTERFACE
// ========================================

/// Enhanced unified profiler combining all platform profilers with simplified API
pub struct UnifiedProfiler {
    pub cpu_platform: platforms::cpu::CpuProfilerPlatform,
    pub gpu_platform: platforms::gpu::GpuProfilerPlatform,
    pub system_platform: platforms::system::SystemProfilerPlatform,
    pub event_collector: core::events::EventCollector,
}

impl UnifiedProfiler {
    /// Create a new unified profiler with all platforms
    pub fn new() -> Self {
        Self {
            cpu_platform: platforms::cpu::CpuProfilerPlatform::new(),
            gpu_platform: platforms::gpu::GpuProfilerPlatform::new(),
            system_platform: platforms::system::SystemProfilerPlatform::new(),
            event_collector: core::events::EventCollector::new(),
        }
    }

    /// Create with optimal platform detection
    pub fn with_auto_detection() -> Self {
        let cpu_platform = platforms::cpu::CpuProfilerPlatform::new().with_cpu_profiler();

        #[cfg(target_os = "macos")]
        let cpu_platform = cpu_platform.with_instruments();

        #[cfg(target_os = "linux")]
        let cpu_platform = cpu_platform.with_vtune();

        let gpu_platform = platforms::gpu::GpuProfilerPlatform::new().with_optimal_profiler();
        let system_platform =
            platforms::system::SystemProfilerPlatform::new().with_all_system_profiling();

        Self {
            cpu_platform,
            gpu_platform,
            system_platform,
            event_collector: core::events::EventCollector::new(),
        }
    }

    /// Start all profiling platforms
    pub fn start_all(&mut self) -> TorshResult<()> {
        self.cpu_platform.start_profiling()?;
        self.gpu_platform.start_profiling()?;
        self.system_platform.start_profiling()?;
        Ok(())
    }

    /// Stop all profiling platforms
    pub fn stop_all(&mut self) -> TorshResult<()> {
        self.cpu_platform.stop_profiling()?;
        self.gpu_platform.stop_profiling()?;
        self.system_platform.stop_profiling()?;
        Ok(())
    }

    /// Export all collected data in specified format
    pub fn export_all(&self, format: export::ExportFormat, base_path: &str) -> TorshResult<()> {
        let profiling_events = self.event_collector.get_events();
        // Convert ProfilingEvent to ProfileEvent
        let events: Vec<ProfileEvent> = profiling_events
            .iter()
            .map(|pe| ProfileEvent {
                name: pe.name.clone(),
                category: pe.category.clone(),
                start_us: pe.start_time.elapsed().as_micros() as u64,
                duration_us: pe.duration.map(|d| d.as_micros() as u64).unwrap_or(0),
                thread_id: pe.thread_id,
                operation_count: None,
                flops: None,
                bytes_transferred: None,
                stack_trace: None,
            })
            .collect();
        export::export_events(&events, format, base_path)
    }
}

impl Default for UnifiedProfiler {
    fn default() -> Self {
        Self::new()
    }
}

// ========================================
// CONVENIENCE FACTORY FUNCTIONS
// ========================================

/// Create a unified profiler with automatic platform detection
pub fn create_unified_profiler() -> UnifiedProfiler {
    UnifiedProfiler::with_auto_detection()
}

/// Create a basic profiler for development
pub fn create_basic_profiler() -> UnifiedProfiler {
    UnifiedProfiler::new()
}

/// Create a profiler optimized for production use
pub fn create_production_profiler() -> UnifiedProfiler {
    let mut profiler = UnifiedProfiler::with_auto_detection();
    // Configure for minimal overhead
    profiler
}

// ========================================
// ENHANCED GLOBAL API FUNCTIONS
// ========================================

/// Enhanced global export functions with multiple format support
pub fn export_global_trace(path: &str) -> TorshResult<()> {
    export_global_events(export::ExportFormat::ChromeTrace, path)
}

pub fn export_global_json(path: &str) -> TorshResult<()> {
    export_global_events(export::ExportFormat::Json, path)
}

pub fn export_global_csv(path: &str) -> TorshResult<()> {
    export_global_events(export::ExportFormat::Csv, path)
}

pub fn export_global_tensorboard(base_path: &str) -> TorshResult<()> {
    let profiler_arc = global_profiler();
    let profiler_guard = profiler_arc.lock();
    let events = profiler_guard.events().to_vec();

    crate::tensorboard::export_tensorboard_profile(&events, base_path)
}

/// Global custom exporter instance
static GLOBAL_CUSTOM_EXPORTER: Lazy<Mutex<custom_export::CustomExporter>> =
    Lazy::new(|| Mutex::new(custom_export::CustomExporter::new()));

/// Get available custom export format names
pub fn get_global_custom_export_formats() -> Vec<String> {
    let exporter = GLOBAL_CUSTOM_EXPORTER.lock();
    exporter.get_format_names()
}

/// Register a custom export format globally
pub fn register_global_custom_export_format(format: custom_export::CustomExportFormat) {
    let mut exporter = GLOBAL_CUSTOM_EXPORTER.lock();
    exporter.register_format(format);
}

/// Export using a custom format
pub fn export_global_custom(format_name: &str, path: &str) -> TorshResult<()> {
    let profiler_arc = global_profiler();
    let profiler_guard = profiler_arc.lock();
    let events = profiler_guard.events().to_vec();
    drop(profiler_guard);

    let exporter = GLOBAL_CUSTOM_EXPORTER.lock();
    exporter.export(&events, format_name, path)
}

/// Set global stack traces enabled with enhanced functionality
pub fn set_global_stack_traces_enabled(enabled: bool) {
    core::profiler::set_global_stack_traces_enabled(enabled);
}

/// Performance anomaly data structure
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PerformanceAnomaly {
    pub event_name: String,
    pub description: String,
    pub confidence: f64,
    pub severity: String,
}

/// Memory anomaly data structure
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MemoryAnomaly {
    pub anomaly_type: String,
    pub confidence: f64,
}

/// Anomaly analysis result structure
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct AnomalyAnalysis {
    pub performance_anomalies: Vec<PerformanceAnomaly>,
    pub memory_anomalies: Vec<MemoryAnomaly>,
    pub throughput_anomalies: Vec<String>,
    pub temporal_anomalies: Vec<String>,
}

/// Performance pattern data structure
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PerformancePattern {
    pub pattern_type: String,
    pub description: String,
    pub confidence_score: f64,
    pub optimization_type: String,
    pub potential_improvement: String,
    pub implementation_complexity: String,
}

/// Pattern analysis result structure
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PatternAnalysis {
    pub performance_patterns: Vec<PerformancePattern>,
    pub bottleneck_patterns: Vec<String>,
    pub resource_patterns: Vec<String>,
    pub temporal_patterns: Vec<String>,
    pub optimization_patterns: Vec<PerformancePattern>,
}

/// Detect global anomalies in profiling data (stub implementation)
pub fn detect_global_anomalies() -> AnomalyAnalysis {
    AnomalyAnalysis {
        performance_anomalies: Vec::new(),
        memory_anomalies: Vec::new(),
        throughput_anomalies: Vec::new(),
        temporal_anomalies: Vec::new(),
    }
}

/// Detect global patterns in profiling data (stub implementation)
pub fn detect_global_patterns() -> PatternAnalysis {
    PatternAnalysis {
        performance_patterns: Vec::new(),
        bottleneck_patterns: Vec::new(),
        resource_patterns: Vec::new(),
        temporal_patterns: Vec::new(),
        optimization_patterns: Vec::new(),
    }
}

/// Export global anomaly analysis (stub implementation)
pub fn export_global_anomaly_analysis(path: &str) -> TorshResult<()> {
    let analysis = detect_global_anomalies();
    let json = serde_json::to_string_pretty(&analysis).map_err(|e| {
        TorshError::SerializationError(format!("Failed to serialize anomaly analysis: {e}"))
    })?;
    std::fs::write(path, json)
        .map_err(|e| TorshError::IoError(format!("Failed to write anomaly analysis: {e}")))?;
    Ok(())
}

/// Export global pattern analysis (stub implementation)
pub fn export_global_pattern_analysis(path: &str) -> TorshResult<()> {
    let analysis = detect_global_patterns();
    let json = serde_json::to_string_pretty(&analysis).map_err(|e| {
        TorshError::SerializationError(format!("Failed to serialize pattern analysis: {e}"))
    })?;
    std::fs::write(path, json)
        .map_err(|e| TorshError::IoError(format!("Failed to write pattern analysis: {e}")))?;
    Ok(())
}

// Import proper correlation analysis types from core::metrics
pub use core::metrics::{
    CorrelationAnalysis, CorrelationStrength, CorrelationSummary, CorrelationType,
    MemoryCorrelation, OperationCorrelation, PerformanceCorrelation, TemporalCorrelation,
};

/// Analyze global correlations with proper implementation
pub fn analyze_global_correlations() -> CorrelationAnalysis {
    use crate::core::metrics::*;
    use std::collections::HashMap;

    let profiler_arc = global_profiler();
    let profiler_guard = profiler_arc.lock();
    let events = profiler_guard.events().to_vec();

    if events.len() < 2 {
        return CorrelationAnalysis {
            operation_correlations: Vec::new(),
            performance_correlations: Vec::new(),
            memory_correlations: Vec::new(),
            temporal_correlations: Vec::new(),
            correlation_summary: CorrelationSummary {
                total_correlations_analyzed: 0,
                strong_correlations_found: 0,
                causal_relationships: 0,
                bottleneck_correlations: 0,
                optimization_opportunities: Vec::new(),
                key_insights: Vec::new(),
            },
        };
    }

    let mut operation_correlations = Vec::new();
    let mut performance_correlations = Vec::new();
    let mut memory_correlations = Vec::new();
    let mut temporal_correlations = Vec::new();

    // Group events by operation name
    let mut operation_groups: HashMap<String, Vec<&ProfileEvent>> = HashMap::new();
    for event in &events {
        operation_groups
            .entry(event.name.clone())
            .or_default()
            .push(event);
    }

    // Analyze operation correlations
    let operations: Vec<String> = operation_groups.keys().cloned().collect();
    for (i, op_a) in operations.iter().enumerate() {
        for op_b in operations.iter().skip(i + 1) {
            let events_a = &operation_groups[op_a];
            let events_b = &operation_groups[op_b];

            // Calculate co-occurrence frequency
            let co_occurrence = calculate_co_occurrence(events_a, events_b);
            let temporal_proximity = calculate_temporal_proximity(events_a, events_b);

            if co_occurrence > 0.1 || temporal_proximity > 0.5 {
                let correlation_strength = if co_occurrence > 0.8 && temporal_proximity > 0.8 {
                    CorrelationStrength::VeryStrong
                } else if co_occurrence > 0.6 || temporal_proximity > 0.6 {
                    CorrelationStrength::Strong
                } else if co_occurrence > 0.4 || temporal_proximity > 0.4 {
                    CorrelationStrength::Moderate
                } else {
                    CorrelationStrength::Weak
                };

                let insights =
                    generate_correlation_insights(op_a, op_b, co_occurrence, temporal_proximity);

                operation_correlations.push(OperationCorrelation {
                    operation_a: op_a.clone(),
                    operation_b: op_b.clone(),
                    correlation_coefficient: (co_occurrence + temporal_proximity) / 2.0,
                    co_occurrence_frequency: co_occurrence,
                    temporal_proximity,
                    correlation_strength,
                    correlation_type: if temporal_proximity > co_occurrence {
                        CorrelationType::Sequential
                    } else {
                        CorrelationType::Complementary
                    },
                    insights,
                });
            }
        }
    }

    // Generate performance correlations
    for event_group in operation_groups.values() {
        if event_group.len() >= 2 {
            let durations: Vec<f64> = event_group.iter().map(|e| e.duration_us as f64).collect();
            let avg_duration = durations.iter().sum::<f64>() / durations.len() as f64;
            let variance = durations
                .iter()
                .map(|d| (d - avg_duration).powi(2))
                .sum::<f64>()
                / durations.len() as f64;

            if variance > 0.0 {
                performance_correlations.push(PerformanceCorrelation {
                    metric_a: "duration".to_string(),
                    metric_b: "variance".to_string(),
                    correlation_coefficient: (variance / avg_duration).min(1.0),
                    significance_level: if variance > avg_duration * 0.5 {
                        0.95
                    } else {
                        0.7
                    },
                    sample_size: event_group.len(),
                    correlation_strength: if variance > avg_duration {
                        CorrelationStrength::Strong
                    } else {
                        CorrelationStrength::Moderate
                    },
                });
            }
        }
    }

    // Generate summary
    let total_correlations = operation_correlations.len() + performance_correlations.len();
    let strong_count = operation_correlations
        .iter()
        .filter(|c| {
            matches!(
                c.correlation_strength,
                CorrelationStrength::Strong | CorrelationStrength::VeryStrong
            )
        })
        .count()
        + performance_correlations
            .iter()
            .filter(|c| {
                matches!(
                    c.correlation_strength,
                    CorrelationStrength::Strong | CorrelationStrength::VeryStrong
                )
            })
            .count();

    let correlation_summary = CorrelationSummary {
        total_correlations_analyzed: total_correlations,
        strong_correlations_found: strong_count,
        causal_relationships: operation_correlations
            .iter()
            .filter(|c| matches!(c.correlation_type, CorrelationType::Causal))
            .count(),
        bottleneck_correlations: operation_correlations
            .iter()
            .filter(|c| matches!(c.correlation_type, CorrelationType::Competitive))
            .count(),
        optimization_opportunities: operation_correlations
            .iter()
            .take(3)
            .map(|c| {
                format!(
                    "{} ↔ {}: Consider optimization",
                    c.operation_a, c.operation_b
                )
            })
            .collect(),
        key_insights: vec![
            format!(
                "Found {} operation correlations with {} strong relationships",
                operation_correlations.len(),
                strong_count
            ),
            "Operations with high co-occurrence may benefit from batching".to_string(),
            "Sequential operations may benefit from pipelining optimizations".to_string(),
        ],
    };

    CorrelationAnalysis {
        operation_correlations,
        performance_correlations,
        memory_correlations,
        temporal_correlations,
        correlation_summary,
    }
}

// Helper functions for correlation analysis
fn calculate_co_occurrence(events_a: &[&ProfileEvent], events_b: &[&ProfileEvent]) -> f64 {
    let mut co_occurrences = 0;
    let window_us = 10000; // 10ms window

    for event_a in events_a {
        for event_b in events_b {
            let time_diff = if event_a.start_us > event_b.start_us {
                event_a.start_us - event_b.start_us
            } else {
                event_b.start_us - event_a.start_us
            };

            if time_diff <= window_us {
                co_occurrences += 1;
                break;
            }
        }
    }

    co_occurrences as f64 / events_a.len().max(events_b.len()) as f64
}

fn calculate_temporal_proximity(events_a: &[&ProfileEvent], events_b: &[&ProfileEvent]) -> f64 {
    if events_a.is_empty() || events_b.is_empty() {
        return 0.0;
    }

    let avg_gap = events_a
        .iter()
        .zip(events_b.iter())
        .map(|(a, b)| {
            if a.start_us > b.start_us {
                a.start_us - b.start_us
            } else {
                b.start_us - a.start_us
            }
        })
        .sum::<u64>() as f64
        / events_a.len().min(events_b.len()) as f64;

    // Convert proximity to a 0-1 scale (closer = higher score)
    1.0 / (1.0 + avg_gap / 1000000.0) // Normalize by 1 second
}

fn generate_correlation_insights(
    op_a: &str,
    op_b: &str,
    co_occurrence: f64,
    temporal_proximity: f64,
) -> Vec<String> {
    let mut insights = Vec::new();

    if co_occurrence > 0.8 {
        insights.push(format!(
            "{} and {} frequently occur together - consider batching",
            op_a, op_b
        ));
    }

    if temporal_proximity > 0.8 {
        insights.push(format!(
            "{} and {} have high temporal proximity - potential for optimization",
            op_a, op_b
        ));
    }

    if co_occurrence > 0.5 && temporal_proximity > 0.5 {
        insights.push("Strong correlation suggests dependency relationship".to_string());
    }

    insights
}

/// Export performance trend chart (stub implementation)
pub fn export_performance_trend_chart(
    profiler: &parking_lot::MutexGuard<'_, Profiler>,
    path: &str,
) -> TorshResult<()> {
    let html = format!(
        r#"<!DOCTYPE html>
<html>
<head><title>Performance Trends</title></head>
<body>
<h1>Performance Trends</h1>
<p>Total events: {}</p>
<p>Chart generation placeholder</p>
</body>
</html>"#,
        profiler.events.len()
    );
    std::fs::write(path, html)
        .map_err(|e| TorshError::IoError(format!("Failed to write performance trends: {e}")))?;
    Ok(())
}

/// Export operation frequency chart (stub implementation)
pub fn export_operation_frequency_chart(
    profiler: &parking_lot::MutexGuard<'_, Profiler>,
    path: &str,
) -> TorshResult<()> {
    let html = format!(
        r#"<!DOCTYPE html>
<html>
<head><title>Operation Frequency</title></head>
<body>
<h1>Operation Frequency</h1>
<p>Total events: {}</p>
<p>Frequency chart generation placeholder</p>
</body>
</html>"#,
        profiler.events.len()
    );
    std::fs::write(path, html).map_err(|e| {
        TorshError::IoError(format!("Failed to write operation frequency chart: {e}"))
    })?;
    Ok(())
}

/// Export global correlation analysis (stub implementation)
pub fn export_global_correlation_analysis(path: &str) -> TorshResult<()> {
    let analysis = analyze_global_correlations();
    let json = serde_json::to_string_pretty(&analysis).map_err(|e| {
        TorshError::SerializationError(format!("Failed to serialize correlation analysis: {e}"))
    })?;
    std::fs::write(path, json)
        .map_err(|e| TorshError::IoError(format!("Failed to write correlation analysis: {e}")))?;
    Ok(())
}

/// Export memory scatter plot (stub implementation)
pub fn export_memory_scatter_plot(
    _memory_profiler: &crate::MemoryProfiler,
    path: &str,
) -> TorshResult<()> {
    let html = format!(
        r#"<!DOCTYPE html>
<html>
<head><title>Memory Scatter Plot</title></head>
<body>
<h1>Memory Scatter Plot</h1>
<p>Memory profiler status: active</p>
<p>Scatter plot generation placeholder</p>
</body>
</html>"#
    );
    std::fs::write(path, html)
        .map_err(|e| TorshError::IoError(format!("Failed to write memory scatter plot: {e}")))?;
    Ok(())
}

/// Export duration histogram (stub implementation)
pub fn export_duration_histogram(
    profiler: &parking_lot::MutexGuard<'_, Profiler>,
    path: &str,
) -> TorshResult<()> {
    let html = format!(
        r#"<!DOCTYPE html>
<html>
<head><title>Duration Histogram</title></head>
<body>
<h1>Duration Histogram</h1>
<p>Total events: {}</p>
<p>Histogram generation placeholder</p>
</body>
</html>"#,
        profiler.events.len()
    );
    std::fs::write(path, html)
        .map_err(|e| TorshError::IoError(format!("Failed to write duration histogram: {e}")))?;
    Ok(())
}

/// Check if global stack traces are enabled
pub fn are_global_stack_traces_enabled() -> bool {
    core::profiler::are_global_stack_traces_enabled()
}

/// Enhanced overhead tracking
pub fn set_global_overhead_tracking_enabled(enabled: bool) {
    core::profiler::set_global_overhead_tracking_enabled(enabled);
}

pub fn is_global_overhead_tracking_enabled() -> bool {
    core::profiler::is_global_overhead_tracking_enabled()
}

pub fn get_global_overhead_stats() -> OverheadStats {
    core::profiler::get_global_overhead_stats()
}

pub fn reset_global_overhead_stats() {
    core::profiler::reset_global_overhead_stats();
}

// ========================================
// TYPE DEFINITIONS (extracted from original)
// ========================================

/// Core profiling event structure
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ProfileEvent {
    pub name: String,
    pub category: String,
    pub start_us: u64,
    pub duration_us: u64,
    pub thread_id: usize,
    pub operation_count: Option<u64>,
    pub flops: Option<u64>,
    pub bytes_transferred: Option<u64>,
    pub stack_trace: Option<String>,
}

/// Overhead statistics for profiling operations
#[derive(Debug, Clone, Default)]
pub struct OverheadStats {
    pub add_event_time_ns: u64,
    pub add_event_count: u64,
    pub stack_trace_time_ns: u64,
    pub stack_trace_count: u64,
    pub export_time_ns: u64,
    pub export_count: u64,
    pub total_overhead_ns: u64,
}

/// Bottleneck analysis results
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct BottleneckAnalysis {
    pub slowest_operations: Vec<BottleneckEvent>,
    pub memory_hotspots: Vec<MemoryHotspot>,
    pub thread_contention: Vec<ThreadContentionEvent>,
    pub efficiency_issues: Vec<EfficiencyIssue>,
    pub recommendations: Vec<String>,
}

/// A performance bottleneck event
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct BottleneckEvent {
    pub name: String,
    pub category: String,
    pub duration_us: u64,
    pub thread_id: usize,
    pub severity: BottleneckSeverity,
    pub impact_score: f64,
    pub recommendation: String,
}

/// Memory hotspot information
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MemoryHotspot {
    pub location: String,
    pub total_allocations: usize,
    pub total_bytes: usize,
    pub average_size: f64,
    pub peak_concurrent_allocations: usize,
    pub severity: BottleneckSeverity,
}

/// Thread contention event
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ThreadContentionEvent {
    pub thread_id: usize,
    pub operation: String,
    pub wait_time_us: u64,
    pub contention_count: usize,
}

/// Efficiency issue
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EfficiencyIssue {
    pub issue_type: EfficiencyIssueType,
    pub description: String,
    pub affected_operations: Vec<String>,
    pub performance_impact: f64,
    pub recommendation: String,
}

/// Type of efficiency issue
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum EfficiencyIssueType {
    LowThroughput,
    HighLatency,
    MemoryWaste,
    CpuUnderutilization,
    FrequentAllocation,
    LargeAllocation,
}

/// Severity of a bottleneck
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
pub enum BottleneckSeverity {
    Low,
    Medium,
    High,
    Critical,
}

// ========================================
// ENHANCED MACRO EXPORTS
// ========================================

// Re-export enhanced macros
// Macros with #[macro_export] are automatically available at crate root

// ========================================
// COMPREHENSIVE TESTING SUPPORT
// ========================================

#[cfg(test)]
mod tests {
    use super::*;
    use std::thread;
    use std::time::Duration;

    #[test]
    fn test_enhanced_profiling_workflow() {
        // Test the complete enhanced profiling workflow
        start_profiling();

        {
            profile_scope!("test_enhanced_workflow");
            thread::sleep(Duration::from_millis(10));

            let mut metrics_scope = MetricsScope::new("computation");
            metrics_scope.set_operation_count(1000);
            metrics_scope.set_flops(5000);
            metrics_scope.set_bytes_transferred(2048);

            thread::sleep(Duration::from_millis(5));
        }

        stop_profiling();

        // Test export functionality
        let json_path = std::env::temp_dir().join("test_enhanced.json");
        let json_str = json_path.display().to_string();
        let result = export_global_json(&json_str);
        assert!(result.is_ok());

        let csv_path = std::env::temp_dir().join("test_enhanced.csv");
        let csv_str = csv_path.display().to_string();
        let result = export_global_csv(&csv_str);
        assert!(result.is_ok());

        // Clean up
        let _ = std::fs::remove_file(&json_path);
        let _ = std::fs::remove_file(&csv_path);
    }

    #[test]
    fn test_unified_profiler() {
        let mut profiler = create_unified_profiler();
        let result = profiler.start_all();

        // Should succeed even if some platforms are unavailable
        thread::sleep(Duration::from_millis(5));

        let stop_result = profiler.stop_all();
        // Export test
        let unified_path = std::env::temp_dir().join("test_unified.json");
        let unified_str = unified_path.display().to_string();
        let export_result = profiler.export_all(export::ExportFormat::Json, &unified_str);

        // Clean up
        let _ = std::fs::remove_file(&unified_path);
    }

    #[test]
    fn test_enhanced_export_formats() {
        start_profiling();
        {
            profile_scope!("format_test");
            thread::sleep(Duration::from_millis(5));
        }
        stop_profiling();

        // Test all available formats
        let formats = export::available_format_names();
        for format_name in formats {
            if let Some(format) = export::parse_format(&format_name) {
                let path = std::env::temp_dir().join(format!(
                    "test_{}.{}",
                    format_name,
                    format.extension()
                ));
                let path_str = path.display().to_string();
                let result = export_global_events(format, &path_str);

                // Clean up
                let _ = std::fs::remove_file(&path);
            }
        }
    }

    #[test]
    #[ignore = "Flaky test - passes individually but may fail in full suite"]
    fn test_overhead_tracking() {
        set_global_overhead_tracking_enabled(true);
        start_profiling();

        {
            profile_scope!("overhead_test");
            thread::sleep(Duration::from_millis(5));
        }

        stop_profiling();

        let stats = get_global_overhead_stats();
        assert!(stats.add_event_count > 0);
        assert!(stats.total_overhead_ns > 0);

        reset_global_overhead_stats();
        set_global_overhead_tracking_enabled(false);
    }
}

// Version information
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
pub const VERSION_MAJOR: u32 = 0;
pub const VERSION_MINOR: u32 = 1;
pub const VERSION_PATCH: u32 = 0;

/// Prelude module for convenient imports
#[allow(ambiguous_glob_reexports)]
pub mod prelude {
    pub use crate::analysis::*;
    pub use crate::core::*;
    pub use crate::distributed::*;
    pub use crate::export::*;
    pub use crate::platforms::*;
}