torsh-profiler 0.1.2

//! NVIDIA Nsight profiling integration
//!
//! This module provides integration with NVIDIA Nsight profiling tools, including
//! Nsight Compute, Nsight Systems, and Nsight Graphics for comprehensive GPU profiling.

use crate::{ProfileEvent, TorshResult};
use serde::{Deserialize, Serialize};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use torsh_core::TorshError;

/// NVIDIA Nsight profiling configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NsightConfig {
    /// Whether to enable NVTX ranges
    pub nvtx_enabled: bool,
    /// Whether to enable CUDA API tracing
    pub cuda_api_tracing: bool,
    /// Whether to enable kernel analysis
    pub kernel_analysis: bool,
    /// Whether to enable memory analysis
    pub memory_analysis: bool,
    /// Whether to enable occupancy analysis
    pub occupancy_analysis: bool,
    /// Output directory for Nsight files
    pub output_dir: Option<String>,
    /// Target GPU device ID
    pub device_id: i32,
}

impl Default for NsightConfig {
    fn default() -> Self {
        Self {
            nvtx_enabled: true,
            cuda_api_tracing: true,
            kernel_analysis: true,
            memory_analysis: true,
            occupancy_analysis: true,
            output_dir: None,
            device_id: 0,
        }
    }
}

/// NVIDIA Nsight profiler
pub struct NsightProfiler {
    config: NsightConfig,
    events: Arc<Mutex<Vec<ProfileEvent>>>,
    start_time: Instant,
    enabled: bool,
    session_id: String,
}

impl NsightProfiler {
    /// Create a new Nsight profiler
    pub fn new(config: NsightConfig) -> Self {
        Self {
            config,
            events: Arc::new(Mutex::new(Vec::new())),
            start_time: Instant::now(),
            enabled: false,
            session_id: format!("nsight_session_{}", chrono::Utc::now().timestamp()),
        }
    }

    /// Enable Nsight profiling
    pub fn enable(&mut self) -> TorshResult<()> {
        self.enabled = true;
        self.start_time = Instant::now();

        if let Ok(mut events) = self.events.lock() {
            events.clear();
        }

        // Initialize NVTX if enabled
        if self.config.nvtx_enabled {
            self.init_nvtx()?;
        }

        // Initialize CUDA API tracing if enabled
        if self.config.cuda_api_tracing {
            self.init_cuda_api_tracing()?;
        }

        // Initialize kernel analysis if enabled
        if self.config.kernel_analysis {
            self.init_kernel_analysis()?;
        }

        // Initialize memory analysis if enabled
        if self.config.memory_analysis {
            self.init_memory_analysis()?;
        }

        // Initialize occupancy analysis if enabled
        if self.config.occupancy_analysis {
            self.init_occupancy_analysis()?;
        }

        Ok(())
    }

    /// Disable Nsight profiling
    pub fn disable(&mut self) -> TorshResult<()> {
        self.enabled = false;

        // Finalize profiling session
        self.finalize_session()?;

        Ok(())
    }

    /// Start an NVTX range
    pub fn start_nvtx_range(&self, name: &str) -> TorshResult<NvtxRange> {
        if !self.enabled || !self.config.nvtx_enabled {
            return Ok(NvtxRange::new_disabled());
        }

        let start_time = Instant::now();

        // In a real implementation, we would call nvtxRangePushA()
        // For now, we'll simulate by recording the event
        let range = NvtxRange::new(name.to_string(), start_time);

        Ok(range)
    }

    /// Record a kernel launch for Nsight analysis
    pub fn record_kernel_launch(
        &self,
        kernel_name: &str,
        grid_size: (u32, u32, u32),
        block_size: (u32, u32, u32),
        shared_memory: usize,
        registers_per_thread: u32,
        duration: Duration,
    ) -> TorshResult<()> {
        if !self.enabled {
            return Ok(());
        }

        let mut events = self.events.lock().map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?;

        let start_us = self.start_time.elapsed().as_micros() as u64;
        let duration_us = duration.as_micros() as u64;

        let event_name = format!(
            "{} [grid:({},{},{}), block:({},{},{}), shared:{}B, regs:{}]",
            kernel_name,
            grid_size.0,
            grid_size.1,
            grid_size.2,
            block_size.0,
            block_size.1,
            block_size.2,
            shared_memory,
            registers_per_thread
        );

        // Calculate theoretical occupancy
        let _theoretical_occupancy =
            self.calculate_theoretical_occupancy(block_size, shared_memory, registers_per_thread)?;

        events.push(ProfileEvent {
            name: event_name,
            category: "nsight_kernel".to_string(),
            start_us,
            duration_us,
            thread_id: self.config.device_id as usize,
            operation_count: Some(1),
            flops: Some(0),             // Would be calculated based on kernel analysis
            bytes_transferred: Some(0), // Would be calculated based on memory analysis
            stack_trace: None,
        });

        Ok(())
    }

    /// Record a memory operation for Nsight analysis
    pub fn record_memory_operation(
        &self,
        operation: &str,
        src_device: i32,
        dst_device: i32,
        size_bytes: usize,
        duration: Duration,
    ) -> TorshResult<()> {
        if !self.enabled {
            return Ok(());
        }

        let mut events = self.events.lock().map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?;

        let start_us = self.start_time.elapsed().as_micros() as u64;
        let duration_us = duration.as_micros() as u64;

        let event_name = format!(
            "{} [{}->{}] {}MB",
            operation,
            src_device,
            dst_device,
            size_bytes as f64 / 1024.0 / 1024.0
        );

        let _bandwidth_gbps = if duration_us > 0 {
            (size_bytes as f64 / 1024.0 / 1024.0 / 1024.0) / (duration_us as f64 / 1_000_000.0)
        } else {
            0.0
        };

        events.push(ProfileEvent {
            name: event_name,
            category: "nsight_memory".to_string(),
            start_us,
            duration_us,
            thread_id: self.config.device_id as usize,
            operation_count: Some(1),
            flops: Some(0),
            bytes_transferred: Some(size_bytes as u64),
            stack_trace: None,
        });

        Ok(())
    }

    /// Export Nsight profiling data
    pub fn export_nsight_data(&self, filename: &str) -> TorshResult<()> {
        let events = self.events.lock().map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?;

        let nsight_data = NsightExportData {
            session_id: self.session_id.clone(),
            config: self.config.clone(),
            events: events.clone(),
            total_events: events.len(),
            total_duration_us: events.iter().map(|e| e.duration_us).sum(),
            timestamp: chrono::Utc::now(),
        };

        let json_data = serde_json::to_string_pretty(&nsight_data)
            .map_err(|e| TorshError::InvalidArgument(format!("Failed to serialize data: {e}")))?;

        std::fs::write(filename, json_data)
            .map_err(|e| TorshError::InvalidArgument(format!("Failed to write file: {e}")))?;

        Ok(())
    }

    /// Get Nsight profiling statistics
    pub fn get_nsight_stats(&self) -> TorshResult<NsightStats> {
        let events = self.events.lock().map_err(|_| {
            TorshError::InvalidArgument("Failed to acquire lock on events".to_string())
        })?;

        let kernel_events: Vec<_> = events
            .iter()
            .filter(|e| e.category == "nsight_kernel")
            .collect();

        let memory_events: Vec<_> = events
            .iter()
            .filter(|e| e.category == "nsight_memory")
            .collect();

        let total_kernel_time_us: u64 = kernel_events.iter().map(|e| e.duration_us).sum();

        let total_memory_time_us: u64 = memory_events.iter().map(|e| e.duration_us).sum();

        let total_bytes_transferred: usize = memory_events
            .iter()
            .map(|e| e.bytes_transferred.unwrap_or(0) as usize)
            .sum();

        let avg_kernel_duration_us = if !kernel_events.is_empty() {
            total_kernel_time_us as f64 / kernel_events.len() as f64
        } else {
            0.0
        };

        let avg_memory_bandwidth_gbps = if !memory_events.is_empty() && total_memory_time_us > 0 {
            (total_bytes_transferred as f64 / 1024.0 / 1024.0 / 1024.0)
                / (total_memory_time_us as f64 / 1_000_000.0)
                / memory_events.len() as f64
        } else {
            0.0
        };

        Ok(NsightStats {
            total_events: events.len(),
            kernel_events: kernel_events.len(),
            memory_events: memory_events.len(),
            total_kernel_time_us,
            total_memory_time_us,
            total_bytes_transferred,
            avg_kernel_duration_us,
            avg_memory_bandwidth_gbps,
            session_id: self.session_id.clone(),
        })
    }

    // Private helper methods

    fn init_nvtx(&self) -> TorshResult<()> {
        // In a real implementation, we would initialize NVTX
        // nvtxNameOsThread, nvtxNameCudaDevice, etc.
        Ok(())
    }

    fn init_cuda_api_tracing(&self) -> TorshResult<()> {
        // In a real implementation, we would enable CUDA API tracing
        // cuptiSubscribe, cuptiEnableCallback, etc.
        Ok(())
    }

    fn init_kernel_analysis(&self) -> TorshResult<()> {
        // In a real implementation, we would enable kernel analysis
        // cuptiActivityEnable for kernel activities
        Ok(())
    }

    fn init_memory_analysis(&self) -> TorshResult<()> {
        // In a real implementation, we would enable memory analysis
        // cuptiActivityEnable for memory activities
        Ok(())
    }

    fn init_occupancy_analysis(&self) -> TorshResult<()> {
        // In a real implementation, we would enable occupancy analysis
        // cuOccupancyMaxActiveBlocksPerMultiprocessor, etc.
        Ok(())
    }

    fn finalize_session(&self) -> TorshResult<()> {
        // In a real implementation, we would finalize the profiling session
        // cuptiActivityFlushAll, cuptiUnsubscribe, etc.
        Ok(())
    }

    fn calculate_theoretical_occupancy(
        &self,
        block_size: (u32, u32, u32),
        shared_memory: usize,
        registers_per_thread: u32,
    ) -> TorshResult<f64> {
        // Simplified theoretical occupancy calculation
        // In a real implementation, we would use cuOccupancyMaxActiveBlocksPerMultiprocessor

        let threads_per_block = block_size.0 * block_size.1 * block_size.2;
        let max_threads_per_sm = 2048; // Typical for modern GPUs
        let max_blocks_per_sm = 32;

        let max_blocks_by_threads = max_threads_per_sm / threads_per_block;
        let max_blocks_by_shared_memory = if shared_memory > 0 {
            (48 * 1024) / shared_memory as u32 // 48KB shared memory per SM
        } else {
            max_blocks_per_sm
        };
        let max_blocks_by_registers = if registers_per_thread > 0 {
            (65536 / registers_per_thread) / threads_per_block // 64K registers per SM
        } else {
            max_blocks_per_sm
        };

        let max_blocks = max_blocks_by_threads
            .min(max_blocks_by_shared_memory)
            .min(max_blocks_by_registers)
            .min(max_blocks_per_sm);

        Ok((max_blocks * threads_per_block) as f64 / max_threads_per_sm as f64)
    }
}

/// NVTX range for GPU profiling
pub struct NvtxRange {
    name: String,
    start_time: Instant,
    enabled: bool,
}

impl NvtxRange {
    fn new(name: String, start_time: Instant) -> Self {
        Self {
            name,
            start_time,
            enabled: true,
        }
    }

    fn new_disabled() -> Self {
        Self {
            name: String::new(),
            start_time: Instant::now(),
            enabled: false,
        }
    }

    /// Get the duration of this range
    pub fn duration(&self) -> Duration {
        self.start_time.elapsed()
    }

    /// Get the name of this range
    pub fn name(&self) -> &str {
        &self.name
    }
}

impl Drop for NvtxRange {
    fn drop(&mut self) {
        if self.enabled {
            // In a real implementation, we would call nvtxRangePop()
        }
    }
}

/// Nsight export data structure
#[derive(Debug, Serialize, Deserialize)]
pub struct NsightExportData {
    pub session_id: String,
    pub config: NsightConfig,
    pub events: Vec<ProfileEvent>,
    pub total_events: usize,
    pub total_duration_us: u64,
    pub timestamp: chrono::DateTime<chrono::Utc>,
}

/// Nsight profiling statistics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NsightStats {
    pub total_events: usize,
    pub kernel_events: usize,
    pub memory_events: usize,
    pub total_kernel_time_us: u64,
    pub total_memory_time_us: u64,
    pub total_bytes_transferred: usize,
    pub avg_kernel_duration_us: f64,
    pub avg_memory_bandwidth_gbps: f64,
    pub session_id: String,
}

/// Create a new Nsight profiler with default configuration
pub fn create_nsight_profiler() -> NsightProfiler {
    NsightProfiler::new(NsightConfig::default())
}

/// Create a new Nsight profiler with custom configuration
pub fn create_nsight_profiler_with_config(config: NsightConfig) -> NsightProfiler {
    NsightProfiler::new(config)
}

/// Export Nsight profiling data to JSON format
pub fn export_nsight_json(profiler: &NsightProfiler, filename: &str) -> TorshResult<()> {
    profiler.export_nsight_data(filename)
}

/// Get Nsight profiling statistics
pub fn get_nsight_statistics(profiler: &NsightProfiler) -> TorshResult<NsightStats> {
    profiler.get_nsight_stats()
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    #[test]
    fn test_nsight_profiler_creation() {
        let profiler = create_nsight_profiler();
        assert!(!profiler.enabled);
    }

    #[test]
    fn test_nsight_profiler_enable_disable() {
        let mut profiler = create_nsight_profiler();
        assert!(profiler.enable().is_ok());
        assert!(profiler.enabled);
        assert!(profiler.disable().is_ok());
        assert!(!profiler.enabled);
    }

    #[test]
    fn test_nvtx_range() {
        let mut profiler = create_nsight_profiler();
        profiler.enable().unwrap();
        let range = profiler.start_nvtx_range("test_range").unwrap();
        assert_eq!(range.name(), "test_range");
        // Ensure some measurable time has elapsed before checking duration
        std::thread::sleep(std::time::Duration::from_millis(1));
        assert!(range.duration().as_nanos() > 0);
    }

    #[test]
    fn test_kernel_recording() {
        let mut profiler = create_nsight_profiler();
        profiler.enable().unwrap();

        let result = profiler.record_kernel_launch(
            "test_kernel",
            (1, 1, 1),
            (256, 1, 1),
            1024,
            32,
            Duration::from_micros(100),
        );

        assert!(result.is_ok());

        let stats = profiler.get_nsight_stats().unwrap();
        assert_eq!(stats.kernel_events, 1);
        assert_eq!(stats.total_kernel_time_us, 100);
    }

    #[test]
    fn test_memory_recording() {
        let mut profiler = create_nsight_profiler();
        profiler.enable().unwrap();

        let result = profiler.record_memory_operation(
            "cudaMemcpy",
            0,
            0,
            1024 * 1024,
            Duration::from_micros(50),
        );

        assert!(result.is_ok());

        let stats = profiler.get_nsight_stats().unwrap();
        assert_eq!(stats.memory_events, 1);
        assert_eq!(stats.total_memory_time_us, 50);
        assert_eq!(stats.total_bytes_transferred, 1024 * 1024);
    }

    #[test]
    fn test_theoretical_occupancy_calculation() {
        let profiler = create_nsight_profiler();
        let occupancy = profiler
            .calculate_theoretical_occupancy((256, 1, 1), 1024, 32)
            .unwrap();

        assert!(occupancy > 0.0);
        assert!(occupancy <= 1.0);
    }

    #[test]
    fn test_export_nsight_data() {
        let mut profiler = create_nsight_profiler();
        profiler.enable().unwrap();

        profiler
            .record_kernel_launch(
                "test_kernel",
                (1, 1, 1),
                (256, 1, 1),
                1024,
                32,
                Duration::from_micros(100),
            )
            .unwrap();

        let temp_file = std::env::temp_dir().join("test_nsight_export.json");
        let temp_str = temp_file.display().to_string();
        let result = profiler.export_nsight_data(&temp_str);
        assert!(result.is_ok());

        // Clean up
        let _ = std::fs::remove_file(&temp_file);
    }
}