torsh-profiler 0.1.2

//! Intel VTune profiling integration
//!
//! This module provides integration with Intel VTune Profiler for comprehensive
//! CPU performance analysis, including hotspot analysis, threading analysis,
//! and microarchitecture exploration.

use crate::{ProfileEvent, TorshResult};
use serde::{Deserialize, Serialize};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use torsh_core::TorshError;

/// Intel VTune profiling configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VTuneConfig {
    /// Whether to enable ITT (Intel Threading Tools) API
    pub itt_enabled: bool,
    /// Whether to enable hotspot analysis
    pub hotspot_analysis: bool,
    /// Whether to enable threading analysis
    pub threading_analysis: bool,
    /// Whether to enable memory access analysis
    pub memory_access_analysis: bool,
    /// Whether to enable microarchitecture exploration
    pub microarchitecture_analysis: bool,
    /// Whether to enable hardware event sampling
    pub hardware_events: bool,
    /// Sampling frequency in Hz
    pub sampling_frequency: u32,
    /// Output directory for VTune results
    pub output_dir: Option<String>,
    /// CPU core mask for targeted profiling
    pub cpu_mask: Option<u64>,
}

impl Default for VTuneConfig {
    fn default() -> Self {
        Self {
            itt_enabled: true,
            hotspot_analysis: true,
            threading_analysis: true,
            memory_access_analysis: false,
            microarchitecture_analysis: false,
            hardware_events: true,
            sampling_frequency: 1000, // 1 kHz
            output_dir: None,
            cpu_mask: None,
        }
    }
}

/// Intel VTune profiler
pub struct VTuneProfiler {
    config: VTuneConfig,
    events: Arc<Mutex<Vec<ProfileEvent>>>,
    start_time: Instant,
    enabled: bool,
    session_id: String,
    collection_id: u64,
}

impl VTuneProfiler {
    /// Create a new VTune profiler
    pub fn new(config: VTuneConfig) -> Self {
        Self {
            config,
            events: Arc::new(Mutex::new(Vec::new())),
            start_time: Instant::now(),
            enabled: false,
            session_id: format!("vtune_session_{}", chrono::Utc::now().timestamp()),
            collection_id: 0,
        }
    }

    /// Enable VTune profiling
    pub fn enable(&mut self) -> TorshResult<()> {
        self.enabled = true;
        self.start_time = Instant::now();
        self.collection_id += 1;

        if let Ok(mut events) = self.events.lock() {
            events.clear();
        }

        // Initialize ITT API if enabled
        if self.config.itt_enabled {
            self.init_itt_api()?;
        }

        // Start VTune collection
        self.start_vtune_collection()?;

        Ok(())
    }

    /// Disable VTune profiling
    pub fn disable(&mut self) -> TorshResult<()> {
        self.enabled = false;

        // Stop VTune collection
        self.stop_vtune_collection()?;

        // Finalize ITT API if enabled
        if self.config.itt_enabled {
            self.finalize_itt_api()?;
        }

        Ok(())
    }

    /// Start an ITT task
    pub fn start_itt_task(&self, name: &str) -> TorshResult<ITTTask> {
        if !self.enabled || !self.config.itt_enabled {
            return Ok(ITTTask::new_disabled());
        }

        let start_time = Instant::now();

        // In a real implementation, we would call __itt_task_begin()
        let task = ITTTask::new(name.to_string(), start_time);

        Ok(task)
    }

    /// Record a function execution for VTune analysis
    #[allow(clippy::too_many_arguments)]
    pub fn record_function_execution(
        &self,
        function_name: &str,
        module: &str,
        file: &str,
        line: u32,
        duration: Duration,
        cpu_cycles: Option<u64>,
        cache_misses: Option<u64>,
        branch_mispredicts: Option<u64>,
    ) -> TorshResult<()> {
        if !self.enabled {
            return Ok(());
        }

        let mut events = self.events.lock().map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?;

        let start_us = self.start_time.elapsed().as_micros() as u64;
        let duration_us = duration.as_micros() as u64;

        let event_name = format!("{module}::{function_name}() [{file}:{line}]");

        let mut metadata = format!(
            "{{\"session_id\": \"{}\", \"collection_id\": {}",
            self.session_id, self.collection_id
        );

        if let Some(cycles) = cpu_cycles {
            metadata.push_str(&format!(", \"cpu_cycles\": {cycles}"));
        }

        if let Some(misses) = cache_misses {
            metadata.push_str(&format!(", \"cache_misses\": {misses}"));
        }

        if let Some(mispredicts) = branch_mispredicts {
            metadata.push_str(&format!(", \"branch_mispredicts\": {mispredicts}"));
        }

        metadata.push('}');

        events.push(ProfileEvent {
            name: event_name,
            category: "vtune_function".to_string(),
            start_us,
            duration_us,
            thread_id: 0, // Thread ID tracking simplified
            operation_count: Some(1),
            flops: Some(0), // Would be calculated based on instruction analysis
            bytes_transferred: Some(0), // Would be calculated based on memory access analysis
            stack_trace: Some(metadata),
        });

        Ok(())
    }

    /// Record a threading event for VTune analysis
    pub fn record_threading_event(
        &self,
        event_type: ThreadingEventType,
        thread_id: usize,
        synchronization_object: Option<&str>,
        wait_time: Option<Duration>,
    ) -> TorshResult<()> {
        if !self.enabled || !self.config.threading_analysis {
            return Ok(());
        }

        let mut events = self.events.lock().map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?;

        let start_us = self.start_time.elapsed().as_micros() as u64;
        let duration_us = wait_time.map(|d| d.as_micros() as u64).unwrap_or(0);

        let event_name = format!(
            "{:?}{}",
            event_type,
            synchronization_object
                .map(|obj| format!(" [{obj}]"))
                .unwrap_or_default()
        );

        let metadata = format!(
            "{{\"session_id\": \"{}\", \"collection_id\": {}, \"thread_id\": {}}}",
            self.session_id, self.collection_id, thread_id
        );

        events.push(ProfileEvent {
            name: event_name,
            category: "vtune_threading".to_string(),
            start_us,
            duration_us,
            thread_id,
            operation_count: Some(1),
            flops: Some(0),
            bytes_transferred: Some(0),
            stack_trace: Some(metadata),
        });

        Ok(())
    }

    /// Record a memory access pattern for VTune analysis
    pub fn record_memory_access(
        &self,
        access_type: MemoryAccessType,
        address: u64,
        size: usize,
        latency: Option<Duration>,
        cache_level: Option<u8>,
    ) -> TorshResult<()> {
        if !self.enabled || !self.config.memory_access_analysis {
            return Ok(());
        }

        let mut events = self.events.lock().map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?;

        let start_us = self.start_time.elapsed().as_micros() as u64;
        let duration_us = latency.map(|d| d.as_micros() as u64).unwrap_or(0);

        let event_name = format!(
            "{:?} [0x{:x}, {} bytes{}]",
            access_type,
            address,
            size,
            cache_level
                .map(|level| format!(", L{level}"))
                .unwrap_or_default()
        );

        let metadata = format!(
            "{{\"session_id\": \"{}\", \"collection_id\": {}, \"address\": \"0x{:x}\", \"size\": {}}}",
            self.session_id, self.collection_id, address, size
        );

        events.push(ProfileEvent {
            name: event_name,
            category: "vtune_memory".to_string(),
            start_us,
            duration_us,
            thread_id: 0, // Thread ID tracking simplified
            operation_count: Some(1),
            flops: Some(0),
            bytes_transferred: Some(size as u64),
            stack_trace: Some(metadata),
        });

        Ok(())
    }

    /// Export VTune profiling data
    pub fn export_vtune_data(&self, filename: &str) -> TorshResult<()> {
        let events = self.events.lock().map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?;

        let vtune_data = VTuneExportData {
            session_id: self.session_id.clone(),
            collection_id: self.collection_id,
            config: self.config.clone(),
            events: events.clone(),
            total_events: events.len(),
            total_duration_us: events.iter().map(|e| e.duration_us).sum(),
            timestamp: chrono::Utc::now(),
        };

        let json_data = serde_json::to_string_pretty(&vtune_data)
            .map_err(|e| TorshError::InvalidArgument(format!("Failed to serialize data: {e}")))?;

        std::fs::write(filename, json_data)
            .map_err(|e| TorshError::InvalidArgument(format!("Failed to write file: {e}")))?;

        Ok(())
    }

    /// Get VTune profiling statistics
    pub fn get_vtune_stats(&self) -> TorshResult<VTuneStats> {
        let events = self.events.lock().map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?;

        let function_events: Vec<_> = events
            .iter()
            .filter(|e| e.category == "vtune_function")
            .collect();

        let threading_events: Vec<_> = events
            .iter()
            .filter(|e| e.category == "vtune_threading")
            .collect();

        let memory_events: Vec<_> = events
            .iter()
            .filter(|e| e.category == "vtune_memory")
            .collect();

        let total_function_time_us: u64 = function_events.iter().map(|e| e.duration_us).sum();

        let total_thread_wait_time_us: u64 = threading_events.iter().map(|e| e.duration_us).sum();

        let total_memory_accesses: usize = memory_events.len();

        let avg_function_duration_us = if !function_events.is_empty() {
            total_function_time_us as f64 / function_events.len() as f64
        } else {
            0.0
        };

        let unique_threads: std::collections::HashSet<_> =
            events.iter().map(|e| e.thread_id).collect();

        Ok(VTuneStats {
            total_events: events.len(),
            function_events: function_events.len(),
            threading_events: threading_events.len(),
            memory_events: memory_events.len(),
            total_function_time_us,
            total_thread_wait_time_us,
            total_memory_accesses,
            avg_function_duration_us,
            unique_threads: unique_threads.len(),
            session_id: self.session_id.clone(),
            collection_id: self.collection_id,
        })
    }

    // Private helper methods

    fn init_itt_api(&self) -> TorshResult<()> {
        // In a real implementation, we would initialize ITT API
        // __itt_thread_set_name, __itt_domain_create, etc.
        Ok(())
    }

    fn finalize_itt_api(&self) -> TorshResult<()> {
        // In a real implementation, we would finalize ITT API
        // __itt_detach, etc.
        Ok(())
    }

    fn start_vtune_collection(&self) -> TorshResult<()> {
        // In a real implementation, we would start VTune collection
        // via command line or VTune API
        Ok(())
    }

    fn stop_vtune_collection(&self) -> TorshResult<()> {
        // In a real implementation, we would stop VTune collection
        // and generate the results file
        Ok(())
    }
}

/// ITT task for profiling
pub struct ITTTask {
    name: String,
    start_time: Instant,
    enabled: bool,
}

impl ITTTask {
    fn new(name: String, start_time: Instant) -> Self {
        Self {
            name,
            start_time,
            enabled: true,
        }
    }

    fn new_disabled() -> Self {
        Self {
            name: String::new(),
            start_time: Instant::now(),
            enabled: false,
        }
    }

    /// Get the duration of this task
    pub fn duration(&self) -> Duration {
        self.start_time.elapsed()
    }

    /// Get the name of this task
    pub fn name(&self) -> &str {
        &self.name
    }
}

impl Drop for ITTTask {
    fn drop(&mut self) {
        if self.enabled {
            // In a real implementation, we would call __itt_task_end()
        }
    }
}

/// Threading event types for VTune analysis
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum ThreadingEventType {
    ThreadCreate,
    ThreadJoin,
    ThreadDestroy,
    MutexLock,
    MutexUnlock,
    MutexWait,
    ConditionWait,
    ConditionSignal,
    BarrierWait,
    SemaphoreWait,
    SemaphorePost,
}

/// Memory access types for VTune analysis
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum MemoryAccessType {
    Read,
    Write,
    ReadWrite,
    Prefetch,
    CacheLineLoad,
    CacheLineStore,
}

/// VTune export data structure
#[derive(Debug, Serialize, Deserialize)]
pub struct VTuneExportData {
    pub session_id: String,
    pub collection_id: u64,
    pub config: VTuneConfig,
    pub events: Vec<ProfileEvent>,
    pub total_events: usize,
    pub total_duration_us: u64,
    pub timestamp: chrono::DateTime<chrono::Utc>,
}

/// VTune profiling statistics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VTuneStats {
    pub total_events: usize,
    pub function_events: usize,
    pub threading_events: usize,
    pub memory_events: usize,
    pub total_function_time_us: u64,
    pub total_thread_wait_time_us: u64,
    pub total_memory_accesses: usize,
    pub avg_function_duration_us: f64,
    pub unique_threads: usize,
    pub session_id: String,
    pub collection_id: u64,
}

/// Create a new VTune profiler with default configuration
pub fn create_vtune_profiler() -> VTuneProfiler {
    VTuneProfiler::new(VTuneConfig::default())
}

/// Create a new VTune profiler with custom configuration
pub fn create_vtune_profiler_with_config(config: VTuneConfig) -> VTuneProfiler {
    VTuneProfiler::new(config)
}

/// Export VTune profiling data to JSON format
pub fn export_vtune_json(profiler: &VTuneProfiler, filename: &str) -> TorshResult<()> {
    profiler.export_vtune_data(filename)
}

/// Get VTune profiling statistics
pub fn get_vtune_statistics(profiler: &VTuneProfiler) -> TorshResult<VTuneStats> {
    profiler.get_vtune_stats()
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    #[test]
    fn test_vtune_profiler_creation() {
        let profiler = create_vtune_profiler();
        assert!(!profiler.enabled);
    }

    #[test]
    fn test_vtune_profiler_enable_disable() {
        let mut profiler = create_vtune_profiler();
        assert!(profiler.enable().is_ok());
        assert!(profiler.enabled);
        assert!(profiler.disable().is_ok());
        assert!(!profiler.enabled);
    }

    #[test]
    fn test_itt_task() {
        let mut profiler = create_vtune_profiler();
        profiler.enable().unwrap();
        let task = profiler.start_itt_task("test_task").unwrap();
        assert_eq!(task.name(), "test_task");
        // Sleep briefly so that at least one nanosecond elapses before sampling
        // the task duration; without this the elapsed time can be 0 ns on fast
        // hardware because the measurement happens in the same CPU clock cycle
        // as task creation.
        std::thread::sleep(std::time::Duration::from_micros(100));
        assert!(task.duration().as_nanos() > 0);
    }

    #[test]
    fn test_function_recording() {
        let mut profiler = create_vtune_profiler();
        profiler.enable().unwrap();

        let result = profiler.record_function_execution(
            "test_function",
            "test_module",
            "test.rs",
            42,
            Duration::from_micros(100),
            Some(1000),
            Some(10),
            Some(5),
        );

        assert!(result.is_ok());

        let stats = profiler.get_vtune_stats().unwrap();
        assert_eq!(stats.function_events, 1);
        assert_eq!(stats.total_function_time_us, 100);
    }

    #[test]
    fn test_threading_recording() {
        let mut profiler = create_vtune_profiler();
        profiler.enable().unwrap();

        let result = profiler.record_threading_event(
            ThreadingEventType::MutexWait,
            123,
            Some("test_mutex"),
            Some(Duration::from_micros(50)),
        );

        assert!(result.is_ok());

        let stats = profiler.get_vtune_stats().unwrap();
        assert_eq!(stats.threading_events, 1);
        assert_eq!(stats.total_thread_wait_time_us, 50);
    }

    #[test]
    fn test_memory_recording() {
        let config = VTuneConfig {
            memory_access_analysis: true,
            ..Default::default()
        };
        let mut profiler = create_vtune_profiler_with_config(config);
        profiler.enable().unwrap();

        let result = profiler.record_memory_access(
            MemoryAccessType::Read,
            0x1000,
            64,
            Some(Duration::from_nanos(100)),
            Some(1),
        );

        assert!(result.is_ok());

        let stats = profiler.get_vtune_stats().unwrap();
        assert_eq!(stats.memory_events, 1);
        assert_eq!(stats.total_memory_accesses, 1);
    }

    #[test]
    fn test_export_vtune_data() {
        let mut profiler = create_vtune_profiler();
        profiler.enable().unwrap();

        profiler
            .record_function_execution(
                "test_function",
                "test_module",
                "test.rs",
                42,
                Duration::from_micros(100),
                None,
                None,
                None,
            )
            .unwrap();

        let temp_file = std::env::temp_dir().join("test_vtune_export.json");
        let temp_str = temp_file.display().to_string();
        let result = profiler.export_vtune_data(&temp_str);
        assert!(result.is_ok());

        // Clean up
        let _ = std::fs::remove_file(&temp_file);
    }

    #[test]
    fn test_custom_config() {
        let config = VTuneConfig {
            itt_enabled: false,
            hotspot_analysis: false,
            threading_analysis: false,
            memory_access_analysis: true,
            microarchitecture_analysis: true,
            hardware_events: false,
            sampling_frequency: 2000,
            output_dir: Some(std::env::temp_dir().join("vtune").display().to_string()),
            cpu_mask: Some(0xFF),
        };

        let profiler = create_vtune_profiler_with_config(config.clone());
        assert_eq!(profiler.config.sampling_frequency, 2000);
        assert!(profiler.config.memory_access_analysis);
        assert!(!profiler.config.itt_enabled);
    }
}