rustorch 0.6.29

//! # Unified Compute Backend Abstraction Layer
//!
//! 統一計算バックエンド抽象化レイヤー
//!
//! This module provides a unified abstraction over different compute backends
//! (CPU, CUDA, Metal, OpenCL) for seamless device selection and operation execution.
//!
//! このモジュールは、異なる計算バックエンド（CPU、CUDA、Metal、OpenCL）の統一抽象化を提供し、
//! シームレスなデバイス選択と操作実行を可能にします。

use crate::error::RusTorchResult;
use crate::tensor::Tensor;
use std::any::Any;
use std::collections::HashMap;

/// Device types available for computation
/// 計算に利用可能なデバイス種別
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DeviceType {
    /// CPU computation using optimized algorithms
    /// 最適化アルゴリズムを使用したCPU計算
    Cpu,
    /// NVIDIA CUDA GPU acceleration
    /// NVIDIA CUDA GPU加速
    Cuda(usize), // Device ID
    /// Apple Metal GPU acceleration
    /// Apple Metal GPU加速
    Metal(usize), // Device ID
    /// OpenCL cross-platform GPU acceleration
    /// OpenCLクロスプラットフォームGPU加速
    OpenCL(usize), // Device ID
}

/// Memory transfer direction between host and device
/// ホストとデバイス間のメモリ転送方向
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TransferDirection {
    /// Host to Device (CPU → GPU)
    /// ホストからデバイス（CPU → GPU）
    HostToDevice,
    /// Device to Host (GPU → CPU)
    /// デバイスからホスト（GPU → CPU）
    DeviceToHost,
}

/// Generic operation type for backend execution
/// バックエンド実行用の汎用操作型
#[derive(Debug, Clone)]
pub enum Operation<T: num_traits::Float> {
    /// Element-wise addition
    /// 要素ごとの加算
    Add {
        /// First input tensor
        /// 第1入力テンソル
        a: Tensor<T>,
        /// Second input tensor
        /// 第2入力テンソル
        b: Tensor<T>,
    },
    /// Element-wise multiplication
    /// 要素ごとの乗算
    Multiply {
        /// First input tensor
        /// 第1入力テンソル
        a: Tensor<T>,
        /// Second input tensor
        /// 第2入力テンソル
        b: Tensor<T>,
    },
    /// Matrix multiplication
    /// 行列乗算
    MatMul {
        /// Left matrix tensor
        /// 左行列テンソル
        a: Tensor<T>,
        /// Right matrix tensor
        /// 右行列テンソル
        b: Tensor<T>,
    },
    /// Convolution operation
    /// 畳み込み操作
    Conv2D {
        /// Input tensor for convolution
        /// 畳み込み用入力テンソル
        input: Tensor<T>,
        /// Convolution kernel/filter
        /// 畳み込みカーネル/フィルター
        kernel: Tensor<T>,
        /// Stride for convolution (height, width)
        /// 畳み込みのストライド（高さ、幅）
        stride: (usize, usize),
        /// Padding for convolution (height, width)
        /// 畳み込みのパディング（高さ、幅）
        padding: (usize, usize),
    },
    /// Reduction operations (sum, mean, etc.)
    /// リダクション操作（sum、meanなど）
    Reduce {
        /// Input tensor for reduction
        /// リダクション用入力テンソル
        input: Tensor<T>,
        /// Type of reduction operation
        /// リダクション操作の種類
        operation: ReduceOp,
        /// Axes along which to reduce (None for all axes)
        /// リダクションする軸（Noneは全軸）
        axes: Option<Vec<usize>>,
    },
    // Custom operations will be added later when needed
    // カスタム操作は必要時に追加予定
}

/// Reduction operation types
/// リダクション操作種別
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReduceOp {
    /// Sum reduction
    /// 合計リダクション
    Sum,
    /// Mean (average) reduction
    /// 平均リダクション
    Mean,
    /// Maximum value reduction
    /// 最大値リダクション
    Max,
    /// Minimum value reduction
    /// 最小値リダクション
    Min,
    /// Product reduction
    /// 積リダクション
    Product,
}

/// Performance metrics for operations
/// 操作のパフォーマンス指標
#[derive(Debug, Clone)]
pub struct PerformanceMetrics {
    /// Execution time in nanoseconds
    /// 実行時間（ナノ秒）
    pub execution_time_ns: u64,
    /// Memory bandwidth in GB/s
    /// メモリ帯域幅（GB/s）
    pub memory_bandwidth_gbps: f64,
    /// Device utilization percentage
    /// デバイス使用率（パーセント）
    pub device_utilization: f64,
    /// Memory usage in bytes
    /// メモリ使用量（バイト）
    pub memory_usage_bytes: usize,
}

/// Generic backend operations
/// ジェネリックバックエンド操作
pub trait ComputeBackendGeneric {
    /// Execute a generic operation on this backend
    /// このバックエンドで汎用操作を実行
    fn execute_operation<T>(&self, operation: &Operation<T>) -> RusTorchResult<Tensor<T>>
    where
        T: Clone + Send + Sync + 'static + num_traits::Float;

    /// Transfer memory between host and device
    /// ホストとデバイス間でメモリ転送
    fn memory_transfer<T>(
        &self,
        data: &[T],
        direction: TransferDirection,
    ) -> RusTorchResult<Vec<T>>
    where
        T: Clone + Send + Sync + 'static + num_traits::Float;
}

/// Unified compute backend trait (non-generic methods)
/// 統一計算バックエンドトレイト（非ジェネリックメソッド）
pub trait ComputeBackend: Send + Sync {
    /// Returns the device type this backend handles
    /// このバックエンドが処理するデバイス種別を返す
    fn device_type(&self) -> DeviceType;

    /// Returns whether this backend is available on the current system
    /// このバックエンドが現在のシステムで利用可能かどうかを返す
    fn is_available(&self) -> bool;

    /// Initialize the backend
    /// バックエンドを初期化
    fn initialize(&mut self) -> RusTorchResult<()>;

    /// Get performance metrics for the last operation
    /// 最後の操作のパフォーマンス指標を取得
    fn get_metrics(&self) -> PerformanceMetrics;

    /// Get backend-specific information
    /// バックエンド固有の情報を取得
    fn get_info(&self) -> HashMap<String, Box<dyn Any + Send + Sync>>;

    /// Synchronize device operations (wait for completion)
    /// デバイス操作の同期（完了待機）
    fn synchronize(&self) -> RusTorchResult<()>;

    /// Get available memory in bytes
    /// 利用可能メモリをバイト単位で取得
    fn available_memory(&self) -> RusTorchResult<usize>;

    /// Set device-specific configuration
    /// デバイス固有設定を設定
    fn set_config(&mut self, key: &str, value: Box<dyn Any + Send + Sync>) -> RusTorchResult<()>;
}

/// Backend selection strategy
/// バックエンド選択戦略
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SelectionStrategy {
    /// Always prefer the fastest available backend
    /// 常に最速の利用可能バックエンドを選択
    Performance,
    /// Always prefer backends with most available memory
    /// 常に最多メモリ利用可能バックエンドを選択
    Memory,
    /// Balance between performance and memory
    /// パフォーマンスとメモリのバランス
    Balanced,
    /// User-specified device priority
    /// ユーザー指定デバイス優先度
    Manual(Vec<DeviceType>),
}

/// Device manager for unified backend selection
/// 統一バックエンド選択用デバイスマネージャー
pub struct DeviceManager {
    /// Available compute backends (temporarily CPU-only)
    /// 利用可能な計算バックエンド（一時的にCPUのみ）
    cpu_backend: Option<crate::backends::cpu_unified::CpuBackend>,
    /// Backend selection strategy
    /// バックエンド選択戦略
    pub strategy: SelectionStrategy,
    /// Performance history for adaptive selection
    /// 適応選択用パフォーマンス履歴
    performance_history: HashMap<DeviceType, Vec<PerformanceMetrics>>,
}

impl DeviceManager {
    /// Create a new device manager
    /// 新しいデバイスマネージャーを作成
    pub fn new(strategy: SelectionStrategy) -> Self {
        Self {
            cpu_backend: None,
            strategy,
            performance_history: HashMap::new(),
        }
    }

    /// Register a compute backend (temporarily CPU-only)
    /// 計算バックエンドを登録（一時的にCPUのみ）
    pub fn register_backend(&mut self, backend: Box<dyn ComputeBackend>) -> RusTorchResult<()> {
        let device_type = backend.device_type();

        if device_type == DeviceType::Cpu && backend.is_available() {
            // Temporarily only support CPU backend
            let cpu_backend = match backend.device_type() {
                DeviceType::Cpu => {
                    // This is a bit of a hack - we know it's a CPU backend
                    // In a real implementation, we'd use proper downcasting
                    crate::backends::cpu_unified::CpuBackend::new()?
                }
                _ => {
                    return Err(crate::error::RusTorchError::DeviceNotAvailable(
                        "Only CPU backend is currently supported".to_string(),
                    ))
                }
            };
            self.cpu_backend = Some(cpu_backend);
            self.performance_history.insert(device_type, Vec::new());
            Ok(())
        } else {
            Err(crate::error::RusTorchError::DeviceNotAvailable(format!(
                "Backend for device {:?} is not available",
                device_type
            )))
        }
    }

    /// Get available device types
    /// 利用可能なデバイス種別を取得
    pub fn available_devices(&self) -> Vec<DeviceType> {
        if self.cpu_backend.is_some() {
            vec![DeviceType::Cpu]
        } else {
            vec![]
        }
    }

    /// Select the best backend for an operation
    /// 操作に最適なバックエンドを選択
    pub fn select_backend<T: num_traits::Float>(
        &self,
        _operation: &Operation<T>,
    ) -> RusTorchResult<DeviceType> {
        if self.cpu_backend.is_none() {
            return Err(crate::error::RusTorchError::DeviceNotAvailable(
                "No compute backends available".to_string(),
            ));
        }

        // For now, always return CPU as it's the only supported backend
        Ok(DeviceType::Cpu)
    }

    /// Execute operation on the best available backend
    /// 最適な利用可能バックエンドで操作を実行
    pub fn execute_operation<T>(&mut self, operation: &Operation<T>) -> RusTorchResult<Tensor<T>>
    where
        T: Clone + Send + Sync + 'static + num_traits::Float,
    {
        let selected_device = self.select_backend(operation)?;

        if let Some(ref backend) = self.cpu_backend {
            let start_time = std::time::Instant::now();
            let result = backend.execute_operation(operation)?;
            let execution_time = start_time.elapsed();

            // Record performance metrics
            let mut metrics = backend.get_metrics();
            metrics.execution_time_ns = execution_time.as_nanos() as u64;

            self.performance_history
                .get_mut(&selected_device)
                .unwrap()
                .push(metrics);

            // Keep only recent performance data (last 100 operations)
            let history = self.performance_history.get_mut(&selected_device).unwrap();
            if history.len() > 100 {
                history.drain(0..history.len() - 100);
            }

            Ok(result)
        } else {
            Err(crate::error::RusTorchError::DeviceNotAvailable(
                "CPU backend not available".to_string(),
            ))
        }
    }

    /// Get performance statistics for a device
    /// デバイスのパフォーマンス統計を取得
    pub fn get_device_stats(&self, device: DeviceType) -> Option<PerformanceMetrics> {
        self.performance_history.get(&device).and_then(|history| {
            if history.is_empty() {
                None
            } else {
                let avg_time =
                    history.iter().map(|m| m.execution_time_ns).sum::<u64>() / history.len() as u64;

                let avg_bandwidth = history.iter().map(|m| m.memory_bandwidth_gbps).sum::<f64>()
                    / history.len() as f64;

                let avg_utilization = history.iter().map(|m| m.device_utilization).sum::<f64>()
                    / history.len() as f64;

                let avg_memory =
                    history.iter().map(|m| m.memory_usage_bytes).sum::<usize>() / history.len();

                Some(PerformanceMetrics {
                    execution_time_ns: avg_time,
                    memory_bandwidth_gbps: avg_bandwidth,
                    device_utilization: avg_utilization,
                    memory_usage_bytes: avg_memory,
                })
            }
        })
    }

    // Helper methods for selection strategies (simplified for CPU-only implementation)

    /// Check if CoreML is available on this system
    #[cfg(feature = "coreml")]
    pub fn is_coreml_available() -> bool {
        // CoreML is only available on macOS
        cfg!(target_os = "macos")
    }

    #[cfg(not(feature = "coreml"))]
    pub fn is_coreml_available() -> bool {
        false
    }

    /// Check if Metal is available on this system
    #[cfg(feature = "metal")]
    pub fn is_metal_available() -> bool {
        // Metal is only available on macOS
        cfg!(target_os = "macos")
    }

    #[cfg(not(feature = "metal"))]
    pub fn is_metal_available() -> bool {
        false
    }

    /// Check if CUDA is available on this system
    #[cfg(feature = "cuda")]
    pub fn is_cuda_available() -> bool {
        // CUDA is available on Linux and Windows, not macOS
        !cfg!(target_os = "macos")
    }

    #[cfg(not(feature = "cuda"))]
    pub fn is_cuda_available() -> bool {
        false
    }
}

impl Default for DeviceManager {
    fn default() -> Self {
        Self::new(SelectionStrategy::Balanced)
    }
}

// Global device manager instance - lazy_static doesn't support doc comments
// グローバルデバイスマネージャーインスタンス
lazy_static::lazy_static! {
    static ref GLOBAL_DEVICE_MANAGER: std::sync::RwLock<DeviceManager> =
        std::sync::RwLock::new(DeviceManager::default());
}

/// Get the global device manager
/// グローバルデバイスマネージャーを取得
pub fn global_device_manager() -> &'static std::sync::RwLock<DeviceManager> {
    &GLOBAL_DEVICE_MANAGER
}

/// Initialize all available backends
/// 利用可能な全バックエンドを初期化
pub fn initialize_backends() -> RusTorchResult<()> {
    let mut manager = global_device_manager().write().unwrap();

    // Register CPU backend (always available)
    let cpu_backend = crate::backends::cpu_unified::CpuBackend::new()?;
    manager.register_backend(Box::new(cpu_backend))?;

    // Other backends will be registered here when implemented
    // #[cfg(feature = "cuda")]
    // {
    //     if let Ok(cuda_backend) = crate::backends::cuda::CudaBackend::new() {
    //         manager.register_backend(Box::new(cuda_backend))?;
    //     }
    // }

    // #[cfg(feature = "metal")]
    // {
    //     if let Ok(metal_backend) = crate::backends::metal::MetalBackend::new() {
    //         manager.register_backend(Box::new(metal_backend))?;
    //     }
    // }

    // #[cfg(feature = "opencl")]
    // {
    //     if let Ok(opencl_backend) = crate::backends::opencl::OpenCLBackend::new() {
    //         manager.register_backend(Box::new(opencl_backend))?;
    //     }
    // }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_device_manager_creation() {
        let manager = DeviceManager::new(SelectionStrategy::Performance);
        assert_eq!(manager.strategy, SelectionStrategy::Performance);
        assert!(manager.cpu_backend.is_none());
    }

    #[test]
    fn test_selection_strategy_manual() {
        let priorities = vec![DeviceType::Cuda(0), DeviceType::Cpu];
        let strategy = SelectionStrategy::Manual(priorities.clone());

        if let SelectionStrategy::Manual(ref manual_priorities) = strategy {
            assert_eq!(manual_priorities, &priorities);
        } else {
            panic!("Strategy should be Manual");
        }
    }
}