trustformers-core 0.1.1

// Copyright (c) 2025-2026 COOLJAPAN OU (Team KitaSan)
// SPDX-License-Identifier: Apache-2.0

//! Hardware manager for TrustformeRS
//!
//! This module provides a centralized manager for hardware devices, backends,
//! and operations. It coordinates between different specialized components
//! to provide a unified hardware management interface.

#![allow(unused_variables)] // Hardware manager

use super::allocation::{LoadBalancer, MemoryManager, ResourceAllocator};
use super::backends::{CPUBackend, GPUBackend};
use super::config::{DeviceInfo, HardwareManagerConfig};
use super::devices::GPUBackendType;
use super::monitoring::{HealthChecker, PerformanceMonitor};
use super::registry::HardwareRegistry;
use super::scheduling::{AdvancedScheduler, DefaultScheduler, SchedulingAlgorithm};
use super::traits::{HardwareBackend, HardwareOperation, HardwareScheduler};
use super::{HardwareMetrics, HardwareResult, HardwareType, OperationParameter};
use crate::errors::TrustformersError;
use crate::tensor::Tensor;
use std::collections::HashMap;
use std::sync::{Arc, Mutex, RwLock};
use tokio::sync::Mutex as AsyncMutex;

/// Hardware manager implementation
#[derive(Debug)]
pub struct HardwareManager {
    /// Manager configuration
    config: HardwareManagerConfig,
    /// Hardware registry
    #[allow(dead_code)]
    registry: Arc<RwLock<HardwareRegistry>>,
    /// CPU backend
    cpu_backend: Arc<AsyncMutex<CPUBackend>>,
    /// GPU backend
    gpu_backend: Arc<Mutex<Option<GPUBackend>>>,
    /// Device information cache
    device_info: Arc<RwLock<HashMap<String, DeviceInfo>>>,
    /// Device metrics cache
    device_metrics: Arc<RwLock<HashMap<String, HardwareMetrics>>>,
    /// Operation scheduler
    scheduler: Arc<Mutex<Box<dyn HardwareScheduler>>>,
    /// Performance monitor
    performance_monitor: Arc<Mutex<PerformanceMonitor>>,
    /// Health checker
    health_checker: Arc<Mutex<HealthChecker>>,
    /// Resource allocator
    #[allow(dead_code)]
    resource_allocator: Arc<Mutex<ResourceAllocator>>,
    /// Load balancer
    #[allow(dead_code)]
    load_balancer: Arc<Mutex<LoadBalancer>>,
    /// Memory manager
    #[allow(dead_code)]
    memory_manager: Arc<Mutex<MemoryManager>>,
}

impl HardwareManager {
    /// Create a new hardware manager
    pub fn new(config: HardwareManagerConfig) -> Self {
        let scheduler: Box<dyn HardwareScheduler> = if config.performance_monitoring {
            Box::new(AdvancedScheduler::new(SchedulingAlgorithm::LoadAware))
        } else {
            Box::new(DefaultScheduler::new())
        };

        Self {
            cpu_backend: Arc::new(AsyncMutex::new(CPUBackend::new())),
            gpu_backend: Arc::new(Mutex::new(None)),
            registry: Arc::new(RwLock::new(HardwareRegistry::new())),
            device_info: Arc::new(RwLock::new(HashMap::new())),
            device_metrics: Arc::new(RwLock::new(HashMap::new())),
            scheduler: Arc::new(Mutex::new(scheduler)),
            performance_monitor: Arc::new(Mutex::new(PerformanceMonitor::new())),
            health_checker: Arc::new(Mutex::new(HealthChecker::new())),
            resource_allocator: Arc::new(Mutex::new(ResourceAllocator::new(
                config.allocation_strategy,
            ))),
            load_balancer: Arc::new(Mutex::new(LoadBalancer::new(config.load_balancing))),
            memory_manager: Arc::new(Mutex::new(MemoryManager::new())),
            config,
        }
    }

    /// Initialize the hardware manager
    pub async fn initialize(&mut self) -> HardwareResult<()> {
        // Initialize CPU backend
        {
            let cpu_backend = self.cpu_backend.lock().await;
            // CPU backend initialization - no longer needed with new trait design
            self.register_backend_devices(&*cpu_backend).await?;
        }

        // Initialize GPU backend if available
        if self.detect_gpu_backend().is_some() {
            self.initialize_gpu_backend().await?;
        }

        // Start background tasks
        if self.config.performance_monitoring {
            self.start_monitoring().await?;
        }

        if self.config.health_check_interval > 0 {
            self.start_health_checks().await?;
        }

        Ok(())
    }

    /// Detect available GPU backend
    fn detect_gpu_backend(&self) -> Option<GPUBackendType> {
        #[cfg(feature = "cuda")]
        if self.is_cuda_available() {
            return Some(GPUBackendType::CUDA);
        }

        #[cfg(feature = "rocm")]
        if self.is_rocm_available() {
            return Some(GPUBackendType::ROCm);
        }

        #[cfg(all(target_os = "macos", feature = "metal"))]
        if self.is_metal_available() {
            return Some(GPUBackendType::Metal);
        }

        #[cfg(feature = "opencl")]
        if self.is_opencl_available() {
            return Some(GPUBackendType::OpenCL);
        }

        #[cfg(feature = "vulkan")]
        if self.is_vulkan_available() {
            return Some(GPUBackendType::Vulkan);
        }

        None
    }

    #[cfg(feature = "cuda")]
    fn is_cuda_available(&self) -> bool {
        // Check CUDA availability
        true // Placeholder
    }

    #[cfg(feature = "rocm")]
    fn is_rocm_available(&self) -> bool {
        // Check ROCm availability
        true // Placeholder
    }

    #[cfg(all(target_os = "macos", feature = "metal"))]
    fn is_metal_available(&self) -> bool {
        // Check Metal availability
        true // Placeholder
    }

    #[cfg(feature = "opencl")]
    fn is_opencl_available(&self) -> bool {
        // Check OpenCL availability
        true // Placeholder
    }

    #[cfg(feature = "vulkan")]
    fn is_vulkan_available(&self) -> bool {
        // Check Vulkan availability
        true // Placeholder
    }

    /// Initialize GPU backend
    async fn initialize_gpu_backend(&mut self) -> HardwareResult<()> {
        if let Some(backend_type) = self.detect_gpu_backend() {
            let gpu_backend = GPUBackend::new(backend_type);
            // GPU backend initialization - no longer needed with new trait design
            self.register_backend_devices(&gpu_backend).await?;

            *self.gpu_backend.lock().expect("lock should not be poisoned") = Some(gpu_backend);
        }
        Ok(())
    }

    /// Register devices from a backend
    async fn register_backend_devices(&self, backend: &dyn HardwareBackend) -> HardwareResult<()> {
        let devices = backend.discover_devices().await?;
        let mut device_info = self.device_info.write().expect("lock should not be poisoned");

        for device in devices {
            let device_id = device.device_id().to_string();
            let info = DeviceInfo {
                id: device_id.clone(),
                hardware_type: device.hardware_type(),
                capabilities: device.capabilities().clone(),
                status: device.status(),
                last_seen: std::time::SystemTime::now(),
                weight: 1.0,
                priority: 0,
                tags: vec![],
            };

            device_info.insert(device_id, info);
        }

        Ok(())
    }

    /// Start performance monitoring
    async fn start_monitoring(&self) -> HardwareResult<()> {
        // Start background monitoring task (placeholder)
        Ok(())
    }

    /// Start health checking
    async fn start_health_checks(&self) -> HardwareResult<()> {
        // Start background health checking task (placeholder)
        Ok(())
    }

    /// Check if a device exists
    pub fn has_device(&self, device_id: &str) -> bool {
        self.device_info
            .read()
            .expect("device_info lock should not be poisoned")
            .contains_key(device_id)
    }

    /// Get device information
    pub fn get_device_info(&self, device_id: &str) -> Option<DeviceInfo> {
        self.device_info
            .read()
            .expect("device_info lock should not be poisoned")
            .get(device_id)
            .cloned()
    }

    /// Get device metrics
    pub fn get_device_metrics(&self, device_id: &str) -> Option<HardwareMetrics> {
        self.device_metrics
            .read()
            .expect("device_metrics lock should not be poisoned")
            .get(device_id)
            .cloned()
    }

    /// List all available devices
    pub fn list_devices(&self) -> Vec<DeviceInfo> {
        self.device_info
            .read()
            .expect("device_info lock should not be poisoned")
            .values()
            .cloned()
            .collect()
    }

    /// List devices by hardware type
    pub fn list_devices_by_type(&self, hardware_type: HardwareType) -> Vec<DeviceInfo> {
        self.device_info
            .read()
            .expect("device_info lock should not be poisoned")
            .values()
            .filter(|info| info.hardware_type == hardware_type)
            .cloned()
            .collect()
    }

    /// Get the best device for an operation
    pub fn get_best_device(&self, operation: &dyn HardwareOperation) -> HardwareResult<String> {
        // Use scheduler to find the best device
        if let Ok(scheduler) = self.scheduler.lock() {
            let inputs = vec![]; // Placeholder - would pass actual inputs
            let params = HashMap::new(); // Placeholder - would pass actual params
            scheduler.schedule_operation(operation, &inputs, &params)
        } else {
            Err(TrustformersError::hardware_error(
                "Failed to lock scheduler",
                "schedule_operation",
            ))
        }
    }

    /// Execute an operation on the best available device
    pub fn execute_operation(
        &self,
        operation: &dyn HardwareOperation,
        inputs: &[Tensor],
        params: &HashMap<String, OperationParameter>,
    ) -> HardwareResult<Vec<Tensor>> {
        // Get the best device for this operation
        let device_id = self.get_best_device(operation)?;

        // Execute on the selected device
        self.execute_on_device(&device_id, operation, inputs, params)
    }

    /// Execute an operation on a specific device
    pub fn execute_on_device(
        &self,
        device_id: &str,
        operation: &dyn HardwareOperation,
        inputs: &[Tensor],
        _params: &HashMap<String, OperationParameter>,
    ) -> HardwareResult<Vec<Tensor>> {
        // Determine which backend owns this device
        let device_info = self.get_device_info(device_id).ok_or_else(|| {
            TrustformersError::hardware_error("Device not found", "execute_on_device")
        })?;

        match device_info.hardware_type {
            HardwareType::CPU => {
                // For now, return a mock result since the actual backend implementation
                // is not fully compatible. In practice, this would integrate with the
                // actual hardware backend operations.
                Ok(vec![inputs[0].clone()])
            },
            HardwareType::GPU => {
                // For now, return a mock result since the actual backend implementation
                // is not fully compatible. In practice, this would integrate with the
                // actual hardware backend operations.
                Ok(vec![inputs[0].clone()])
            },
            _ => Err(TrustformersError::hardware_error(
                "Unsupported hardware type",
                "execute_on_device",
            )),
        }
    }

    /// Update device metrics
    pub fn update_device_metrics(&self, device_id: &str, metrics: HardwareMetrics) {
        {
            let mut device_metrics =
                self.device_metrics.write().expect("device_metrics lock should not be poisoned");
            device_metrics.insert(device_id.to_string(), metrics.clone());
        }

        // Update performance monitor
        if let Ok(mut monitor) = self.performance_monitor.lock() {
            monitor.update_metrics(device_id, &metrics);
        }
    }

    /// Get performance statistics
    pub fn get_performance_stats(&self) -> HashMap<String, f64> {
        if let Ok(monitor) = self.performance_monitor.lock() {
            monitor.efficiency_scores.clone()
        } else {
            HashMap::new()
        }
    }

    /// Get health status for all devices
    pub fn get_health_status(&self) -> HashMap<String, super::monitoring::HealthStatus> {
        if let Ok(checker) = self.health_checker.lock() {
            checker
                .get_all_results()
                .iter()
                .map(|(id, result)| (id.clone(), result.status))
                .collect()
        } else {
            HashMap::new()
        }
    }

    /// Cleanup and shutdown the hardware manager
    pub async fn cleanup(&mut self) -> HardwareResult<()> {
        // Backend cleanup - no longer needed with new trait design
        // Individual devices handle their own cleanup through the shutdown method

        // Clear caches
        self.device_info
            .write()
            .expect("device_info lock should not be poisoned")
            .clear();
        self.device_metrics
            .write()
            .expect("device_metrics lock should not be poisoned")
            .clear();

        Ok(())
    }
}

impl Default for HardwareManager {
    fn default() -> Self {
        Self::new(HardwareManagerConfig::default())
    }
}

// Re-export commonly used types
pub use super::config::{AllocationStrategy, LoadBalancingStrategy};
pub use super::monitoring::{AnomalySeverity, AnomalyType, HealthStatus};

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_hardware_manager_creation() {
        let config = HardwareManagerConfig::default();
        let manager = HardwareManager::new(config);

        // Basic creation test
        assert_eq!(manager.list_devices().len(), 0);
    }

    #[tokio::test]
    async fn test_cpu_backend_initialization() {
        let mut manager = HardwareManager::default();

        // Should successfully initialize CPU backend
        assert!(manager.initialize().await.is_ok());

        // Should have at least one CPU device
        let cpu_devices = manager.list_devices_by_type(HardwareType::CPU);
        assert!(!cpu_devices.is_empty());
    }

    #[tokio::test]
    async fn test_device_metrics_update() {
        let manager = HardwareManager::default();

        let metrics = HardwareMetrics {
            ops_per_second: 1000.0,
            memory_bandwidth: 100.0,
            utilization: 50.0,
            power_consumption: 100.0,
            temperature: Some(45.0),
            error_rate: 0.001,
            latency: 1.0,
            throughput: 1000.0,
        };

        manager.update_device_metrics("test_device", metrics.clone());

        let retrieved_metrics = manager.get_device_metrics("test_device");
        assert!(retrieved_metrics.is_some());
        assert_eq!(
            retrieved_metrics.expect("operation failed in test").utilization,
            50.0
        );
    }
}