sklears-simd 0.1.1

//! Custom accelerator support framework
//!
//! This module provides a flexible framework for integrating custom hardware
//! accelerators with the SIMD operations, including ASICs, custom chips, and
//! specialized processing units.

use crate::traits::SimdError;

#[cfg(feature = "no-std")]
use core::any::Any;
#[cfg(not(feature = "no-std"))]
use std::any::Any;

#[cfg(feature = "no-std")]
use alloc::{
    boxed::Box,
    collections::BTreeMap as HashMap,
    string::{String, ToString},
    vec,
    vec::Vec,
};
#[cfg(not(feature = "no-std"))]
use std::{collections::HashMap, string::ToString};

#[cfg(feature = "no-std")]
extern crate alloc;

/// Accelerator types
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AcceleratorType {
    ASIC,
    Custom,
    DSP,
    VPU,    // Vision Processing Unit
    NPU,    // Neural Processing Unit
    DPU,    // Deep Learning Processing Unit
    AI,     // Generic AI accelerator
    Crypto, // Cryptographic accelerator
    Signal, // Signal processing accelerator
    Matrix, // Matrix processing accelerator
}

/// Accelerator capabilities
#[derive(Debug, Clone)]
pub struct AcceleratorCapabilities {
    pub supported_operations: Vec<AcceleratorOperation>,
    pub data_types: Vec<AcceleratorDataType>,
    pub max_batch_size: usize,
    pub memory_mb: u64,
    pub compute_units: u32,
    pub peak_performance_ops: u64,
    pub power_consumption_w: f64,
    pub precision_modes: Vec<AcceleratorPrecision>,
}

/// Accelerator operations
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcceleratorOperation {
    MatrixMultiply,
    Convolution,
    Activation,
    Pooling,
    Normalization,
    Attention,
    Embedding,
    Reduction,
    Transform,
    Sort,
    Search,
    Compress,
    Encrypt,
    Hash,
    Custom(u32),
}

/// Accelerator data types
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcceleratorDataType {
    F32,
    F16,
    BF16,
    I32,
    I16,
    I8,
    U32,
    U16,
    U8,
    Bool,
    Custom(u32),
}

/// Accelerator precision modes
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcceleratorPrecision {
    High,
    Medium,
    Low,
    Mixed,
    Adaptive,
}

/// Accelerator device information
#[derive(Debug, Clone)]
pub struct AcceleratorDevice {
    pub id: u32,
    pub name: String,
    pub vendor: String,
    pub model: String,
    pub accelerator_type: AcceleratorType,
    pub capabilities: AcceleratorCapabilities,
    pub driver_version: String,
    pub firmware_version: String,
    pub pci_id: Option<String>,
    pub numa_node: Option<u32>,
}

/// Accelerator buffer
#[derive(Debug)]
pub struct AcceleratorBuffer<T> {
    pub ptr: *mut T,
    pub size: usize,
    pub device: AcceleratorDevice,
    pub alignment: usize,
    pub memory_type: AcceleratorMemoryType,
    #[allow(dead_code)] // Reserved for native accelerator backend handle (CUDA/OpenCL/custom)
    backend_handle: Option<Box<dyn Any + Send + Sync>>,
}

/// Accelerator memory types
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcceleratorMemoryType {
    Device,
    Host,
    Unified,
    Pinned,
    Cached,
    Uncached,
}

unsafe impl<T: Send> Send for AcceleratorBuffer<T> {}
unsafe impl<T: Sync> Sync for AcceleratorBuffer<T> {}

impl<T> Drop for AcceleratorBuffer<T> {
    fn drop(&mut self) {
        // Free accelerator memory when buffer is dropped
    }
}

/// Accelerator context
pub struct AcceleratorContext {
    pub device: AcceleratorDevice,
    pub command_queues: Vec<AcceleratorQueue>,
    pub memory_pools: HashMap<AcceleratorMemoryType, AcceleratorMemoryPool>,
    #[allow(dead_code)] // Reserved for native accelerator backend context (CUDA/OpenCL/custom)
    backend_context: Option<Box<dyn Any + Send + Sync>>,
}

/// Accelerator command queue
#[derive(Debug)]
pub struct AcceleratorQueue {
    pub id: u32,
    pub priority: AcceleratorPriority,
    pub device_id: u32,
    #[allow(dead_code)] // Reserved for native command queue handle (CUDA stream / OpenCL queue)
    backend_queue: Option<Box<dyn Any + Send + Sync>>,
}

/// Accelerator priority levels
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcceleratorPriority {
    Low,
    Normal,
    High,
    Critical,
}

/// Accelerator memory pool
#[derive(Debug)]
pub struct AcceleratorMemoryPool {
    pub total_size: usize,
    pub available_size: usize,
    pub allocation_count: usize,
    pub memory_type: AcceleratorMemoryType,
    #[allow(dead_code)] // Reserved for native memory pool handle (CUDA/OpenCL/custom allocator)
    backend_pool: Option<Box<dyn Any + Send + Sync>>,
}

/// Accelerator kernel configuration
#[derive(Debug, Clone)]
pub struct AcceleratorKernel {
    pub name: String,
    pub operation: AcceleratorOperation,
    pub input_buffers: Vec<u32>,
    pub output_buffers: Vec<u32>,
    pub parameters: HashMap<String, AcceleratorParameter>,
    pub work_size: (usize, usize, usize),
    pub local_size: (usize, usize, usize),
}

/// Accelerator parameter
#[derive(Debug, Clone)]
pub enum AcceleratorParameter {
    Int(i64),
    Float(f64),
    Bool(bool),
    String(String),
    Array(Vec<u8>),
}

/// Accelerator operations interface
pub trait AcceleratorOperations {
    /// Allocate accelerator memory
    fn allocate<T>(
        &self,
        size: usize,
        memory_type: AcceleratorMemoryType,
    ) -> Result<AcceleratorBuffer<T>, SimdError>;

    /// Copy data to accelerator
    fn copy_to_accelerator<T>(
        &self,
        host_data: &[T],
        accel_buffer: &mut AcceleratorBuffer<T>,
        queue: Option<&AcceleratorQueue>,
    ) -> Result<(), SimdError>;

    /// Copy data from accelerator
    fn copy_from_accelerator<T>(
        &self,
        accel_buffer: &AcceleratorBuffer<T>,
        host_data: &mut [T],
        queue: Option<&AcceleratorQueue>,
    ) -> Result<(), SimdError>;

    /// Execute kernel on accelerator
    fn execute_kernel(
        &self,
        kernel: &AcceleratorKernel,
        buffers: &[&AcceleratorBuffer<u8>],
        queue: Option<&AcceleratorQueue>,
    ) -> Result<(), SimdError>;

    /// Synchronize accelerator operations
    fn synchronize(&self, queue: Option<&AcceleratorQueue>) -> Result<(), SimdError>;

    /// Get accelerator status
    fn get_status(&self) -> Result<AcceleratorStatus, SimdError>;
}

/// Accelerator status
#[derive(Debug, Clone)]
pub struct AcceleratorStatus {
    pub utilization_percent: f64,
    pub memory_usage_percent: f64,
    pub temperature_c: f64,
    pub power_consumption_w: f64,
    pub clock_frequency_mhz: f64,
    pub active_operations: Vec<AcceleratorOperation>,
    pub error_count: u32,
}

/// Accelerator runtime
pub struct AcceleratorRuntime {
    devices: Vec<AcceleratorDevice>,
    contexts: Vec<AcceleratorContext>,
    drivers: HashMap<AcceleratorType, Box<dyn AcceleratorDriver>>,
}

/// Accelerator driver interface
pub trait AcceleratorDriver: Send + Sync {
    fn initialize(&self) -> Result<(), SimdError>;
    fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError>;
    fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError>;
    fn is_available(&self) -> bool;
}

impl AcceleratorRuntime {
    /// Create new accelerator runtime
    pub fn new() -> Result<Self, SimdError> {
        let mut runtime = Self {
            devices: Vec::new(),
            contexts: Vec::new(),
            drivers: HashMap::new(),
        };

        // Register built-in drivers
        runtime.register_driver(AcceleratorType::ASIC, Box::new(AsicDriver::new()));
        runtime.register_driver(AcceleratorType::DSP, Box::new(DspDriver::new()));
        runtime.register_driver(AcceleratorType::VPU, Box::new(VpuDriver::new()));
        runtime.register_driver(AcceleratorType::NPU, Box::new(NpuDriver::new()));

        // Discover devices
        runtime.discover_all_devices()?;

        Ok(runtime)
    }

    /// Register accelerator driver
    pub fn register_driver(
        &mut self,
        accel_type: AcceleratorType,
        driver: Box<dyn AcceleratorDriver>,
    ) {
        self.drivers.insert(accel_type, driver);
    }

    /// Discover all devices
    fn discover_all_devices(&mut self) -> Result<(), SimdError> {
        for driver in self.drivers.values() {
            if driver.is_available() {
                let devices = driver.discover_devices()?;
                self.devices.extend(devices);
            }
        }
        Ok(())
    }

    /// Get available devices
    pub fn devices(&self) -> &[AcceleratorDevice] {
        &self.devices
    }

    /// Get devices by type
    pub fn devices_by_type(&self, accel_type: AcceleratorType) -> Vec<&AcceleratorDevice> {
        self.devices
            .iter()
            .filter(|d| d.accelerator_type == accel_type)
            .collect()
    }

    /// Create context for device
    pub fn create_context(&mut self, device_id: u32) -> Result<&AcceleratorContext, SimdError> {
        let device = self.devices.get(device_id as usize).ok_or_else(|| {
            SimdError::InvalidArgument("Invalid accelerator device ID".to_string())
        })?;

        let driver = self
            .drivers
            .get(&device.accelerator_type)
            .ok_or_else(|| SimdError::NotImplemented("Driver not available".to_string()))?;

        let context = driver.create_context(device)?;
        self.contexts.push(context);
        Ok(self
            .contexts
            .last()
            .expect("collection should not be empty"))
    }

    /// Get best device for operation
    pub fn get_best_device(&self, operation: AcceleratorOperation) -> Option<&AcceleratorDevice> {
        self.devices
            .iter()
            .filter(|d| d.capabilities.supported_operations.contains(&operation))
            .max_by(|a, b| {
                a.capabilities
                    .peak_performance_ops
                    .cmp(&b.capabilities.peak_performance_ops)
            })
    }
}

/// Built-in accelerator drivers
struct AsicDriver;
struct DspDriver;
struct VpuDriver;
struct NpuDriver;

impl AsicDriver {
    fn new() -> Self {
        Self
    }
}

impl AcceleratorDriver for AsicDriver {
    fn initialize(&self) -> Result<(), SimdError> {
        Ok(())
    }

    fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError> {
        Ok(vec![])
    }

    fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError> {
        Ok(AcceleratorContext {
            device: device.clone(),
            command_queues: vec![],
            memory_pools: HashMap::new(),
            backend_context: None,
        })
    }

    fn is_available(&self) -> bool {
        false
    }
}

impl DspDriver {
    fn new() -> Self {
        Self
    }
}

impl AcceleratorDriver for DspDriver {
    fn initialize(&self) -> Result<(), SimdError> {
        Ok(())
    }

    fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError> {
        Ok(vec![])
    }

    fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError> {
        Ok(AcceleratorContext {
            device: device.clone(),
            command_queues: vec![],
            memory_pools: HashMap::new(),
            backend_context: None,
        })
    }

    fn is_available(&self) -> bool {
        false
    }
}

impl VpuDriver {
    fn new() -> Self {
        Self
    }
}

impl AcceleratorDriver for VpuDriver {
    fn initialize(&self) -> Result<(), SimdError> {
        Ok(())
    }

    fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError> {
        Ok(vec![])
    }

    fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError> {
        Ok(AcceleratorContext {
            device: device.clone(),
            command_queues: vec![],
            memory_pools: HashMap::new(),
            backend_context: None,
        })
    }

    fn is_available(&self) -> bool {
        false
    }
}

impl NpuDriver {
    fn new() -> Self {
        Self
    }
}

impl AcceleratorDriver for NpuDriver {
    fn initialize(&self) -> Result<(), SimdError> {
        Ok(())
    }

    fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError> {
        Ok(vec![])
    }

    fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError> {
        Ok(AcceleratorContext {
            device: device.clone(),
            command_queues: vec![],
            memory_pools: HashMap::new(),
            backend_context: None,
        })
    }

    fn is_available(&self) -> bool {
        false
    }
}

/// Accelerator optimization utilities
pub mod optimization {
    use super::*;

    /// Optimize kernel for specific accelerator
    pub fn optimize_kernel(
        kernel: &AcceleratorKernel,
        device: &AcceleratorDevice,
    ) -> Result<AcceleratorKernel, SimdError> {
        let mut optimized = kernel.clone();

        // Optimize based on device capabilities
        match device.accelerator_type {
            AcceleratorType::NPU => {
                // Optimize for neural processing
                optimized.work_size = optimize_for_npu(kernel.work_size, &device.capabilities);
            }
            AcceleratorType::VPU => {
                // Optimize for vision processing
                optimized.work_size = optimize_for_vpu(kernel.work_size, &device.capabilities);
            }
            AcceleratorType::DSP => {
                // Optimize for signal processing
                optimized.work_size = optimize_for_dsp(kernel.work_size, &device.capabilities);
            }
            _ => {
                // Generic optimization
                optimized.work_size = optimize_generic(kernel.work_size, &device.capabilities);
            }
        }

        Ok(optimized)
    }

    fn optimize_for_npu(
        work_size: (usize, usize, usize),
        caps: &AcceleratorCapabilities,
    ) -> (usize, usize, usize) {
        let optimal_batch = caps.max_batch_size.min(work_size.0);
        (optimal_batch, work_size.1, work_size.2)
    }

    fn optimize_for_vpu(
        work_size: (usize, usize, usize),
        caps: &AcceleratorCapabilities,
    ) -> (usize, usize, usize) {
        let compute_units = caps.compute_units as usize;
        let optimal_x = work_size.0.div_ceil(compute_units) * compute_units;
        (optimal_x, work_size.1, work_size.2)
    }

    fn optimize_for_dsp(
        work_size: (usize, usize, usize),
        _caps: &AcceleratorCapabilities,
    ) -> (usize, usize, usize) {
        // DSP typically works well with power-of-2 sizes
        let next_power_of_2 = |n: usize| {
            if n == 0 {
                1
            } else {
                1 << (64 - (n - 1).leading_zeros())
            }
        };

        (next_power_of_2(work_size.0), work_size.1, work_size.2)
    }

    fn optimize_generic(
        work_size: (usize, usize, usize),
        caps: &AcceleratorCapabilities,
    ) -> (usize, usize, usize) {
        let compute_units = caps.compute_units as usize;
        let optimal_size = work_size.0.div_ceil(compute_units) * compute_units;
        (optimal_size, work_size.1, work_size.2)
    }

    /// Select best accelerator for operation
    pub fn select_accelerator(
        operation: AcceleratorOperation,
        data_size: usize,
        devices: &[AcceleratorDevice],
    ) -> Option<&AcceleratorDevice> {
        devices
            .iter()
            .filter(|d| d.capabilities.supported_operations.contains(&operation))
            .filter(|d| d.capabilities.max_batch_size >= data_size)
            .max_by(|a, b| {
                let score_a = compute_device_score(a, operation, data_size);
                let score_b = compute_device_score(b, operation, data_size);
                score_a
                    .partial_cmp(&score_b)
                    .unwrap_or(core::cmp::Ordering::Equal)
            })
    }

    fn compute_device_score(
        device: &AcceleratorDevice,
        operation: AcceleratorOperation,
        data_size: usize,
    ) -> f64 {
        let mut score = 0.0;

        // Performance score
        score += device.capabilities.peak_performance_ops as f64 / 1e9;

        // Memory score
        let memory_ratio =
            data_size as f64 / (device.capabilities.memory_mb as f64 * 1024.0 * 1024.0);
        score += if memory_ratio <= 1.0 {
            1.0
        } else {
            1.0 / memory_ratio
        };

        // Power efficiency score
        let ops_per_watt = device.capabilities.peak_performance_ops as f64
            / device.capabilities.power_consumption_w;
        score += ops_per_watt / 1e9;

        // Operation-specific bonuses
        match (operation, device.accelerator_type) {
            (AcceleratorOperation::MatrixMultiply, AcceleratorType::NPU) => score += 2.0,
            (AcceleratorOperation::Convolution, AcceleratorType::VPU) => score += 2.0,
            (AcceleratorOperation::Transform, AcceleratorType::DSP) => score += 2.0,
            _ => {}
        }

        score
    }
}

#[allow(non_snake_case)]
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
    use super::*;

    #[cfg(feature = "no-std")]
    use alloc::{
        string::{String, ToString},
        vec,
        vec::Vec,
    };

    #[test]
    fn test_accelerator_runtime_creation() {
        let runtime = AcceleratorRuntime::new();
        assert!(runtime.is_ok());
    }

    #[test]
    fn test_accelerator_type_display() {
        let types = vec![
            AcceleratorType::ASIC,
            AcceleratorType::Custom,
            AcceleratorType::DSP,
            AcceleratorType::VPU,
            AcceleratorType::NPU,
        ];

        for accel_type in types {
            println!("Accelerator type: {:?}", accel_type);
        }
    }

    #[test]
    fn test_accelerator_capabilities() {
        let caps = AcceleratorCapabilities {
            supported_operations: vec![
                AcceleratorOperation::MatrixMultiply,
                AcceleratorOperation::Convolution,
            ],
            data_types: vec![AcceleratorDataType::F32, AcceleratorDataType::F16],
            max_batch_size: 1024,
            memory_mb: 8192,
            compute_units: 64,
            peak_performance_ops: 1000000000,
            power_consumption_w: 150.0,
            precision_modes: vec![AcceleratorPrecision::High, AcceleratorPrecision::Mixed],
        };

        assert_eq!(caps.supported_operations.len(), 2);
        assert_eq!(caps.data_types.len(), 2);
        assert_eq!(caps.max_batch_size, 1024);
    }

    #[test]
    fn test_accelerator_kernel() {
        let mut params = HashMap::new();
        params.insert("alpha".to_string(), AcceleratorParameter::Float(1.0));
        params.insert("beta".to_string(), AcceleratorParameter::Float(0.0));

        let kernel = AcceleratorKernel {
            name: "test_kernel".to_string(),
            operation: AcceleratorOperation::MatrixMultiply,
            input_buffers: vec![0, 1],
            output_buffers: vec![2],
            parameters: params,
            work_size: (1024, 1024, 1),
            local_size: (16, 16, 1),
        };

        assert_eq!(kernel.name, "test_kernel");
        assert_eq!(kernel.operation, AcceleratorOperation::MatrixMultiply);
        assert_eq!(kernel.input_buffers.len(), 2);
        assert_eq!(kernel.output_buffers.len(), 1);
        assert_eq!(kernel.parameters.len(), 2);
    }

    #[test]
    fn test_accelerator_device() {
        let device = AcceleratorDevice {
            id: 0,
            name: "Test Accelerator".to_string(),
            vendor: "Test Vendor".to_string(),
            model: "Test Model".to_string(),
            accelerator_type: AcceleratorType::NPU,
            capabilities: AcceleratorCapabilities {
                supported_operations: vec![AcceleratorOperation::MatrixMultiply],
                data_types: vec![AcceleratorDataType::F32],
                max_batch_size: 512,
                memory_mb: 4096,
                compute_units: 32,
                peak_performance_ops: 500000000,
                power_consumption_w: 100.0,
                precision_modes: vec![AcceleratorPrecision::High],
            },
            driver_version: "1.0.0".to_string(),
            firmware_version: "2.0.0".to_string(),
            pci_id: Some("1234:5678".to_string()),
            numa_node: Some(0),
        };

        assert_eq!(device.id, 0);
        assert_eq!(device.accelerator_type, AcceleratorType::NPU);
        assert_eq!(device.capabilities.compute_units, 32);
    }

    #[test]
    fn test_kernel_optimization() {
        let device = AcceleratorDevice {
            id: 0,
            name: "Test NPU".to_string(),
            vendor: "Test".to_string(),
            model: "NPU-100".to_string(),
            accelerator_type: AcceleratorType::NPU,
            capabilities: AcceleratorCapabilities {
                supported_operations: vec![AcceleratorOperation::MatrixMultiply],
                data_types: vec![AcceleratorDataType::F32],
                max_batch_size: 256,
                memory_mb: 2048,
                compute_units: 16,
                peak_performance_ops: 100000000,
                power_consumption_w: 50.0,
                precision_modes: vec![AcceleratorPrecision::High],
            },
            driver_version: "1.0.0".to_string(),
            firmware_version: "1.0.0".to_string(),
            pci_id: None,
            numa_node: None,
        };

        let kernel = AcceleratorKernel {
            name: "test".to_string(),
            operation: AcceleratorOperation::MatrixMultiply,
            input_buffers: vec![0, 1],
            output_buffers: vec![2],
            parameters: HashMap::new(),
            work_size: (512, 512, 1),
            local_size: (16, 16, 1),
        };

        let optimized = optimization::optimize_kernel(&kernel, &device);
        assert!(optimized.is_ok());

        let opt_kernel = optimized.expect("operation should succeed");
        assert_eq!(opt_kernel.work_size.0, 256); // Limited by max_batch_size
    }

    #[test]
    fn test_accelerator_selection() {
        let devices = vec![
            AcceleratorDevice {
                id: 0,
                name: "NPU".to_string(),
                vendor: "Test".to_string(),
                model: "NPU-100".to_string(),
                accelerator_type: AcceleratorType::NPU,
                capabilities: AcceleratorCapabilities {
                    supported_operations: vec![AcceleratorOperation::MatrixMultiply],
                    data_types: vec![AcceleratorDataType::F32],
                    max_batch_size: 1024,
                    memory_mb: 4096,
                    compute_units: 32,
                    peak_performance_ops: 1000000000,
                    power_consumption_w: 100.0,
                    precision_modes: vec![AcceleratorPrecision::High],
                },
                driver_version: "1.0.0".to_string(),
                firmware_version: "1.0.0".to_string(),
                pci_id: None,
                numa_node: None,
            },
            AcceleratorDevice {
                id: 1,
                name: "VPU".to_string(),
                vendor: "Test".to_string(),
                model: "VPU-200".to_string(),
                accelerator_type: AcceleratorType::VPU,
                capabilities: AcceleratorCapabilities {
                    supported_operations: vec![AcceleratorOperation::Convolution],
                    data_types: vec![AcceleratorDataType::F16],
                    max_batch_size: 512,
                    memory_mb: 2048,
                    compute_units: 16,
                    peak_performance_ops: 500000000,
                    power_consumption_w: 50.0,
                    precision_modes: vec![AcceleratorPrecision::Mixed],
                },
                driver_version: "1.0.0".to_string(),
                firmware_version: "1.0.0".to_string(),
                pci_id: None,
                numa_node: None,
            },
        ];

        let selected =
            optimization::select_accelerator(AcceleratorOperation::MatrixMultiply, 512, &devices);
        assert!(selected.is_some());
        assert_eq!(selected.expect("operation should succeed").id, 0);

        let selected =
            optimization::select_accelerator(AcceleratorOperation::Convolution, 256, &devices);
        assert!(selected.is_some());
        assert_eq!(selected.expect("operation should succeed").id, 1);
    }
}