mathhook-core 0.2.0

//! GPU Acceleration for MathHook
//!
//! This module provides GPU acceleration for computationally intensive mathematical operations
//! using WebGPU for cross-platform compatibility and optional CUDA for maximum performance.

use std::sync::{Arc, OnceLock};

/// GPU acceleration backend types
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuBackend {
    /// No GPU acceleration available
    None,
    /// WebGPU-based acceleration (cross-platform)
    WebGpu,
    /// CUDA-based acceleration (NVIDIA only, maximum performance)
    #[cfg(feature = "cuda")]
    Cuda,
}

/// GPU acceleration capabilities and status
#[derive(Debug, Clone)]
pub struct GpuCapabilities {
    /// Available backend
    pub backend: GpuBackend,
    /// Device name
    pub device_name: String,
    /// Available memory in bytes
    pub memory_bytes: u64,
    /// Maximum workgroup size
    pub max_workgroup_size: u32,
    /// Whether double precision is supported
    pub supports_f64: bool,
}

/// GPU operation types that can be accelerated
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuOperation {
    /// Bulk arithmetic operations (addition, multiplication)
    BulkArithmetic,
    /// Matrix operations (multiplication, determinant)
    MatrixOperations,
    /// Polynomial evaluation
    PolynomialEvaluation,
    /// Fourier transforms
    FourierTransform,
    /// Large expression simplification
    ExpressionSimplification,
}

/// GPU acceleration thresholds for different operations
#[derive(Debug, Clone)]
pub struct GpuThresholds {
    /// Minimum elements for bulk arithmetic GPU acceleration
    pub bulk_arithmetic_threshold: usize,
    /// Minimum matrix size for GPU acceleration
    pub matrix_threshold: usize,
    /// Minimum polynomial degree for GPU acceleration
    pub polynomial_threshold: usize,
}

impl Default for GpuThresholds {
    fn default() -> Self {
        Self {
            bulk_arithmetic_threshold: 10000, // 10K elements
            matrix_threshold: 100,            // 100x100 matrices
            polynomial_threshold: 1000,       // Degree 1000 polynomials
        }
    }
}

/// GPU acceleration manager
pub struct GpuAccelerator {
    capabilities: Option<GpuCapabilities>,
    thresholds: GpuThresholds,
    #[cfg(feature = "webgpu")]
    _webgpu_context: Option<WebGpuContext>,
    #[cfg(feature = "cuda")]
    _cuda_context: Option<CudaContext>,
}

impl Default for GpuAccelerator {
    fn default() -> Self {
        Self::new()
    }
}

impl GpuAccelerator {
    /// Create a new GPU accelerator with automatic backend detection
    pub fn new() -> Self {
        let mut accelerator = Self {
            capabilities: None,
            thresholds: GpuThresholds::default(),
            #[cfg(feature = "webgpu")]
            _webgpu_context: None,
            #[cfg(feature = "cuda")]
            _cuda_context: None,
        };

        accelerator.detect_capabilities();
        accelerator
    }

    /// Detect available GPU capabilities
    fn detect_capabilities(&mut self) {
        // Try CUDA first (highest performance)
        #[cfg(feature = "cuda")]
        if let Some(cuda_caps) = self.detect_cuda_capabilities() {
            self.capabilities = Some(cuda_caps);
            return;
        }

        // Try WebGPU (cross-platform)
        #[cfg(feature = "webgpu")]
        if let Some(webgpu_caps) = self.detect_webgpu_capabilities() {
            self.capabilities = Some(webgpu_caps);
            return;
        }

        // No GPU acceleration available
        self.capabilities = Some(GpuCapabilities {
            backend: GpuBackend::None,
            device_name: "CPU Only".to_owned(),
            memory_bytes: 0,
            max_workgroup_size: 1,
            supports_f64: true,
        });
    }

    /// Detect CUDA capabilities
    #[cfg(feature = "cuda")]
    fn detect_cuda_capabilities(&mut self) -> Option<GpuCapabilities> {
        // This would use cudarc to detect CUDA devices
        // For now, return None as placeholder
        None
    }

    /// Detect WebGPU capabilities
    #[cfg(feature = "webgpu")]
    fn detect_webgpu_capabilities(&mut self) -> Option<GpuCapabilities> {
        // This would use wgpu to detect WebGPU devices
        // For now, return None as placeholder
        None
    }

    /// Check if GPU acceleration should be used for an operation
    pub fn should_use_gpu(&self, operation: GpuOperation, size: usize) -> bool {
        let Some(caps) = &self.capabilities else {
            return false;
        };

        if caps.backend == GpuBackend::None {
            return false;
        }

        match operation {
            GpuOperation::BulkArithmetic => size >= self.thresholds.bulk_arithmetic_threshold,
            GpuOperation::MatrixOperations => size >= self.thresholds.matrix_threshold,
            GpuOperation::PolynomialEvaluation => size >= self.thresholds.polynomial_threshold,
            GpuOperation::FourierTransform => size >= 512, // FFT benefits from GPU at smaller sizes
            GpuOperation::ExpressionSimplification => size >= 1000, // Complex expressions
        }
    }

    /// Get GPU capabilities
    pub fn get_capabilities(&self) -> Option<&GpuCapabilities> {
        self.capabilities.as_ref()
    }

    /// Update GPU thresholds based on performance measurements
    pub fn update_thresholds(&mut self, thresholds: GpuThresholds) {
        self.thresholds = thresholds;
    }

    /// Perform GPU-accelerated bulk addition
    pub fn gpu_bulk_add(&self, values: &[f64]) -> Result<f64, GpuError> {
        if !self.should_use_gpu(GpuOperation::BulkArithmetic, values.len()) {
            return Err(GpuError::ThresholdNotMet);
        }

        let backend = self
            .capabilities
            .as_ref()
            .ok_or(GpuError::NoGpuAvailable)?
            .backend;

        match backend {
            #[cfg(feature = "webgpu")]
            GpuBackend::WebGpu => self.webgpu_bulk_add(values),
            #[cfg(feature = "cuda")]
            GpuBackend::Cuda => self.cuda_bulk_add(values),
            GpuBackend::None => Err(GpuError::NoGpuAvailable),
            #[cfg(not(feature = "webgpu"))]
            GpuBackend::WebGpu => Err(GpuError::NotImplemented(
                "WebGPU feature not enabled".to_owned(),
            )),
        }
    }

    /// Perform GPU-accelerated matrix multiplication
    pub fn gpu_matrix_multiply(
        &self,
        a: &[Vec<f64>],
        b: &[Vec<f64>],
    ) -> Result<Vec<Vec<f64>>, GpuError> {
        let size = a.len() * a[0].len() + b.len() * b[0].len();

        if !self.should_use_gpu(GpuOperation::MatrixOperations, size) {
            return Err(GpuError::ThresholdNotMet);
        }

        let backend = self
            .capabilities
            .as_ref()
            .ok_or(GpuError::NoGpuAvailable)?
            .backend;

        match backend {
            #[cfg(feature = "webgpu")]
            GpuBackend::WebGpu => self.webgpu_matrix_multiply(a, b),
            #[cfg(feature = "cuda")]
            GpuBackend::Cuda => self.cuda_matrix_multiply(a, b),
            GpuBackend::None => Err(GpuError::NoGpuAvailable),
            #[cfg(not(feature = "webgpu"))]
            GpuBackend::WebGpu => Err(GpuError::NotImplemented(
                "WebGPU feature not enabled".to_owned(),
            )),
        }
    }

    /// WebGPU bulk addition implementation
    #[cfg(feature = "webgpu")]
    fn webgpu_bulk_add(&self, _values: &[f64]) -> Result<f64, GpuError> {
        Err(GpuError::NotImplemented(
            "WebGPU bulk operations require compute shader integration (planned for 0.2)"
                .to_string(),
        ))
    }

    /// WebGPU matrix multiplication implementation
    #[cfg(feature = "webgpu")]
    fn webgpu_matrix_multiply(
        &self,
        _a: &[Vec<f64>],
        _b: &[Vec<f64>],
    ) -> Result<Vec<Vec<f64>>, GpuError> {
        Err(GpuError::NotImplemented(
            "WebGPU matrix operations require compute shader integration (planned for 0.2)"
                .to_string(),
        ))
    }

    /// CUDA bulk addition implementation
    #[cfg(feature = "cuda")]
    fn cuda_bulk_add(&self, _values: &[f64]) -> Result<f64, GpuError> {
        Err(GpuError::NotImplemented(
            "CUDA acceleration requires cudarc integration (planned for 0.2)".to_string(),
        ))
    }

    /// CUDA matrix multiplication implementation
    #[cfg(feature = "cuda")]
    fn cuda_matrix_multiply(
        &self,
        _a: &[Vec<f64>],
        _b: &[Vec<f64>],
    ) -> Result<Vec<Vec<f64>>, GpuError> {
        Err(GpuError::NotImplemented(
            "CUDA matrix operations require cudarc integration (planned for 0.2)".to_string(),
        ))
    }
}

/// WebGPU context for GPU operations
#[cfg(feature = "webgpu")]
struct WebGpuContext {
    // WebGPU device, queue, etc.
}

/// CUDA context for GPU operations
#[cfg(feature = "cuda")]
struct CudaContext {
    // CUDA context, streams, etc.
}

/// GPU acceleration errors
#[derive(Debug, Clone)]
pub enum GpuError {
    /// No GPU available for acceleration
    NoGpuAvailable,
    /// Operation size doesn't meet GPU threshold
    ThresholdNotMet,
    /// GPU operation failed
    OperationFailed(String),
    /// Feature planned for future release
    NotImplemented(String),
    /// Memory allocation failed
    OutOfMemory,
    /// Invalid input data
    InvalidInput(String),
}

impl std::fmt::Display for GpuError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            GpuError::NoGpuAvailable => write!(f, "No GPU available for acceleration"),
            GpuError::ThresholdNotMet => write!(f, "Operation size doesn't meet GPU threshold"),
            GpuError::OperationFailed(msg) => write!(f, "GPU operation failed: {}", msg),
            GpuError::NotImplemented(msg) => write!(f, "Not implemented: {}", msg),
            GpuError::OutOfMemory => write!(f, "GPU out of memory"),
            GpuError::InvalidInput(msg) => write!(f, "Invalid input: {}", msg),
        }
    }
}

impl std::error::Error for GpuError {}

/// Global GPU accelerator instance
static GLOBAL_GPU_ACCELERATOR: OnceLock<Arc<std::sync::Mutex<GpuAccelerator>>> = OnceLock::new();

/// Get the global GPU accelerator instance
pub fn get_global_gpu_accelerator() -> &'static Arc<std::sync::Mutex<GpuAccelerator>> {
    GLOBAL_GPU_ACCELERATOR.get_or_init(|| Arc::new(std::sync::Mutex::new(GpuAccelerator::new())))
}

/// Check if GPU acceleration is available
pub fn is_gpu_available() -> bool {
    if let Ok(accelerator) = get_global_gpu_accelerator().lock() {
        if let Some(caps) = accelerator.get_capabilities() {
            return caps.backend != GpuBackend::None;
        }
    }
    false
}

/// Get GPU capabilities information
pub fn get_gpu_capabilities() -> Option<GpuCapabilities> {
    if let Ok(accelerator) = get_global_gpu_accelerator().lock() {
        accelerator.get_capabilities().cloned()
    } else {
        None
    }
}

/// Perform GPU-accelerated bulk addition if available, fallback to CPU
pub fn gpu_or_cpu_bulk_add(values: &[f64]) -> f64 {
    if let Ok(accelerator) = get_global_gpu_accelerator().lock() {
        match accelerator.gpu_bulk_add(values) {
            Ok(result) => return result,
            Err(GpuError::ThresholdNotMet) => {
                // Expected - use CPU for small operations
            }
            Err(e) => {
                eprintln!("GPU operation failed, falling back to CPU: {}", e);
            }
        }
    }

    // CPU fallback
    values.iter().sum()
}

/// Perform GPU-accelerated matrix multiplication if available, fallback to CPU
pub fn gpu_or_cpu_matrix_multiply(a: &[Vec<f64>], b: &[Vec<f64>]) -> Vec<Vec<f64>> {
    if let Ok(accelerator) = get_global_gpu_accelerator().lock() {
        match accelerator.gpu_matrix_multiply(a, b) {
            Ok(result) => return result,
            Err(GpuError::ThresholdNotMet) => {
                // Expected - use CPU for small matrices
            }
            Err(e) => {
                eprintln!("GPU matrix operation failed, falling back to CPU: {}", e);
            }
        }
    }

    // CPU fallback - basic matrix multiplication
    cpu_matrix_multiply(a, b)
}

/// CPU fallback for matrix multiplication
fn cpu_matrix_multiply(a: &[Vec<f64>], b: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let rows_a = a.len();
    let cols_a = a[0].len();
    let cols_b = b[0].len();

    let mut result = vec![vec![0.0; cols_b]; rows_a];

    for i in 0..rows_a {
        for j in 0..cols_b {
            for (k, a_elem) in a[i].iter().enumerate().take(cols_a) {
                result[i][j] += a_elem * b[k][j];
            }
        }
    }

    result
}

/// GPU acceleration statistics
#[derive(Debug, Clone, Default)]
pub struct GpuStatistics {
    /// Number of GPU operations performed
    pub gpu_operations: u64,
    /// Number of CPU fallbacks
    pub cpu_fallbacks: u64,
    /// Total GPU computation time
    pub gpu_time_ms: f64,
    /// Total CPU fallback time
    pub cpu_time_ms: f64,
    /// GPU memory usage in bytes
    pub gpu_memory_used: u64,
}

/// Get GPU acceleration statistics
pub fn get_gpu_statistics() -> GpuStatistics {
    // Placeholder - would track actual statistics
    GpuStatistics::default()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gpu_accelerator_creation() {
        let accelerator = GpuAccelerator::new();
        assert!(accelerator.get_capabilities().is_some());
    }

    #[test]
    fn test_gpu_threshold_logic() {
        let accelerator = GpuAccelerator::new();

        // Small operations should not use GPU
        assert!(!accelerator.should_use_gpu(GpuOperation::BulkArithmetic, 100));

        // Large operations should use GPU (if available)
        let should_use = accelerator.should_use_gpu(GpuOperation::BulkArithmetic, 20000);
        // Result depends on whether GPU is available
        let _ = should_use;
    }

    #[test]
    fn test_cpu_fallback() {
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let result = gpu_or_cpu_bulk_add(&values);
        assert_eq!(result, 15.0);
    }

    #[test]
    fn test_matrix_multiplication_fallback() {
        let a = vec![vec![1.0, 2.0], vec![3.0, 4.0]];
        let b = vec![vec![5.0, 6.0], vec![7.0, 8.0]];

        let result = gpu_or_cpu_matrix_multiply(&a, &b);

        // Expected result: [[19, 22], [43, 50]]
        assert_eq!(result[0][0], 19.0);
        assert_eq!(result[0][1], 22.0);
        assert_eq!(result[1][0], 43.0);
        assert_eq!(result[1][1], 50.0);
    }

    #[test]
    fn test_global_gpu_accelerator() {
        let caps = get_gpu_capabilities();
        assert!(caps.is_some());

        let available = is_gpu_available();
        // Should not panic
        let _ = available;
    }
}