trustformers 0.1.1

// Metal Pipeline Backend Integration for TrustformeRS
// Provides high-performance Metal inference optimized for Apple Silicon and iOS/macOS devices

use crate::core::traits::Tokenizer;
use crate::error::{Result, TrustformersError};
use crate::pipeline::{ClassificationOutput, GenerationOutput, Pipeline, PipelineOutput};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use trustformers_core::tensor::Tensor;

// Metal backend types
#[derive(Debug, Clone)]
pub struct MetalBackend {
    device: MetalDevice,
    command_queue: MetalCommandQueue,
    compute_pipeline: Option<MetalComputePipelineState>,
    library: Option<MetalLibrary>,
}

#[derive(Debug, Clone)]
pub struct MetalDevice;

#[derive(Debug, Clone)]
pub struct MetalCommandQueue;

#[derive(Debug, Clone)]
pub struct MetalComputePipelineState;

#[derive(Debug, Clone)]
pub struct MetalLibrary;

#[derive(Debug, Clone)]
pub struct MetalBuffer;

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MetalPrecisionMode {
    /// 32-bit floating point (highest precision)
    FP32,
    /// 16-bit floating point (balanced precision/performance)
    FP16,
    /// 8-bit integer (best performance, lowest precision)
    INT8,
    /// Automatic precision selection based on device capabilities
    Auto,
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MetalDeviceType {
    /// Integrated GPU (Apple Silicon)
    IntegratedGPU,
    /// Discrete GPU (Intel Macs)
    DiscreteGPU,
    /// Neural Engine (Apple Silicon only)
    NeuralEngine,
    /// CPU fallback
    CPU,
    /// Automatic device selection
    Auto,
}

#[derive(Debug, Clone, Copy)]
pub enum MetalOptimizationLevel {
    /// Basic optimizations
    O1,
    /// Standard optimizations
    O2,
    /// Aggressive optimizations
    O3,
    /// Maximum optimizations with trade-offs
    Ofast,
}

#[derive(Debug, Clone, Copy)]
pub enum MetalMemoryStrategy {
    /// Shared memory between CPU and GPU
    Shared,
    /// Private GPU memory
    Private,
    /// Managed memory (automatic migration)
    Managed,
    /// Automatic selection based on device
    Auto,
}

#[derive(Debug, Clone)]
pub struct MetalBackendConfig {
    /// Model file path
    pub model_path: PathBuf,
    /// Compiled Metal library path (optional)
    pub metal_library_path: Option<PathBuf>,
    /// Device type preference
    pub device_type: MetalDeviceType,
    /// Precision mode
    pub precision_mode: MetalPrecisionMode,
    /// Maximum batch size
    pub max_batch_size: usize,
    /// Memory strategy
    pub memory_strategy: MetalMemoryStrategy,
    /// Optimization level
    pub optimization_level: MetalOptimizationLevel,
    /// Enable Neural Engine (Apple Silicon only)
    pub enable_neural_engine: bool,
    /// Enable Metal Performance Shaders
    pub enable_mps: bool,
    /// Buffer allocation size
    pub buffer_allocation_size: usize,
    /// Enable profiling
    pub enable_profiling: bool,
    /// Profile output path
    pub profile_output_path: Option<PathBuf>,
    /// Enable fast math optimizations
    pub enable_fast_math: bool,
    /// Enable threadgroup optimization
    pub enable_threadgroup_optimization: bool,
}

impl Default for MetalBackendConfig {
    fn default() -> Self {
        Self {
            model_path: PathBuf::new(),
            metal_library_path: None,
            device_type: MetalDeviceType::Auto,
            precision_mode: MetalPrecisionMode::Auto,
            max_batch_size: 1,
            memory_strategy: MetalMemoryStrategy::Auto,
            optimization_level: MetalOptimizationLevel::O2,
            enable_neural_engine: true,
            enable_mps: true,
            buffer_allocation_size: 256 * 1024 * 1024, // 256MB
            enable_profiling: false,
            profile_output_path: None,
            enable_fast_math: true,
            enable_threadgroup_optimization: true,
        }
    }
}

impl MetalBackend {
    /// Create a new Metal backend instance
    pub fn new(config: MetalBackendConfig) -> Result<Self> {
        let device = Self::create_device(config.device_type)?;
        let command_queue = Self::create_command_queue(&device)?;

        Ok(Self {
            device,
            command_queue,
            compute_pipeline: None,
            library: None,
        })
    }

    /// Create Metal device based on device type preference
    fn create_device(device_type: MetalDeviceType) -> Result<MetalDevice> {
        // In a real implementation, this would use Metal framework
        // For now, return a mock device
        Ok(MetalDevice)
    }

    /// Create command queue for the device
    fn create_command_queue(device: &MetalDevice) -> Result<MetalCommandQueue> {
        // In a real implementation, this would create MTLCommandQueue
        Ok(MetalCommandQueue)
    }

    /// Load and compile Metal shaders
    pub fn load_shaders(&mut self, shader_path: &Path) -> Result<()> {
        // In a real implementation, this would:
        // 1. Load .metal files
        // 2. Compile to Metal library
        // 3. Create compute pipeline states
        self.library = Some(MetalLibrary);
        self.compute_pipeline = Some(MetalComputePipelineState);
        Ok(())
    }

    /// Check device capabilities
    pub fn get_device_capabilities(&self) -> MetalDeviceCapabilities {
        MetalDeviceCapabilities {
            supports_neural_engine: true,
            supports_mps: true,
            supports_fp16: true,
            supports_int8: true,
            max_threads_per_threadgroup: 1024,
            max_buffer_size: 4 * 1024 * 1024 * 1024, // 4GB
            unified_memory: true,
        }
    }

    /// Run inference on Metal device
    pub fn run_inference(
        &self,
        inputs: HashMap<String, Tensor>,
    ) -> Result<HashMap<String, Tensor>> {
        // Mock inference - in real implementation would:
        // 1. Allocate Metal buffers
        // 2. Copy input data to GPU
        // 3. Dispatch compute kernels
        // 4. Copy results back to CPU
        let mut outputs = HashMap::new();

        // Create mock output tensor
        let output_shape = vec![1, 512]; // Example output shape
        let output_data: Vec<f32> = vec![0.5; output_shape.iter().product()];
        let output_tensor = Tensor::from_vec(output_data, &output_shape)?;

        outputs.insert("logits".to_string(), output_tensor);
        Ok(outputs)
    }

    /// Create optimized buffers for inference
    fn create_buffers(&self, tensor: &Tensor) -> Result<MetalBuffer> {
        // In real implementation would create MTLBuffer
        Ok(MetalBuffer)
    }

    /// Compile model for Metal execution
    pub fn compile_model(&mut self, model_path: &Path) -> Result<()> {
        // In real implementation would:
        // 1. Load model weights
        // 2. Generate Metal shaders for operations
        // 3. Optimize for target device
        // 4. Create execution graph
        Ok(())
    }
}

#[derive(Debug, Clone)]
pub struct MetalDeviceCapabilities {
    pub supports_neural_engine: bool,
    pub supports_mps: bool,
    pub supports_fp16: bool,
    pub supports_int8: bool,
    pub max_threads_per_threadgroup: usize,
    pub max_buffer_size: usize,
    pub unified_memory: bool,
}

/// Metal Text Classification Pipeline
pub struct MetalTextClassificationPipeline<T: Tokenizer> {
    tokenizer: T,
    backend: MetalBackend,
    config: MetalBackendConfig,
}

impl<T: Tokenizer + Clone> MetalTextClassificationPipeline<T> {
    /// Create a new Metal text classification pipeline
    pub fn new(tokenizer: T, config: MetalBackendConfig) -> Result<Self> {
        let mut backend = MetalBackend::new(config.clone())?;

        // Compile model for Metal execution
        backend.compile_model(&config.model_path)?;

        Ok(Self {
            tokenizer,
            backend,
            config,
        })
    }

    /// Get device capabilities
    pub fn device_capabilities(&self) -> MetalDeviceCapabilities {
        self.backend.get_device_capabilities()
    }
}

impl<T: Tokenizer + Clone> Pipeline for MetalTextClassificationPipeline<T> {
    type Input = String;
    type Output = PipelineOutput;

    fn __call__(&self, input: Self::Input) -> Result<Self::Output> {
        // Tokenize input
        let tokenized = self.tokenizer.encode(&input)?;
        let input_ids = tokenized.input_ids;
        let attention_mask = tokenized.attention_mask;

        // Prepare inputs for Metal backend
        let mut inputs = HashMap::new();
        inputs.insert(
            "input_ids".to_string(),
            Tensor::from_vec(
                input_ids.iter().map(|&x| x as f32).collect(),
                &[1, input_ids.len()],
            )?,
        );
        inputs.insert(
            "attention_mask".to_string(),
            Tensor::from_vec(
                attention_mask.iter().map(|&x| x as f32).collect(),
                &[1, attention_mask.len()],
            )?,
        );
        // Run inference on Metal device
        let outputs = self.backend.run_inference(inputs)?;

        // Extract logits and convert to classification results
        if let Some(logits_tensor) = outputs.get("logits") {
            let logits = logits_tensor.data()?;

            // Apply softmax and create classification results
            let exp_logits: Vec<f32> = logits.iter().map(|x| x.exp()).collect();
            let sum_exp: f32 = exp_logits.iter().sum();
            let probabilities: Vec<f32> = exp_logits.iter().map(|x| x / sum_exp).collect();

            let mut results = Vec::new();
            for (i, &prob) in probabilities.iter().enumerate() {
                results.push(ClassificationOutput {
                    label: format!("LABEL_{}", i),
                    score: prob,
                });
            }

            // Sort by score (descending)
            results
                .sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));

            Ok(PipelineOutput::Classification(results))
        } else {
            Err(TrustformersError::invalid_input_simple(
                "No logits output from Metal backend".to_string(),
            ))
        }
    }
}

/// Metal Text Generation Pipeline
pub struct MetalTextGenerationPipeline<T: Tokenizer> {
    tokenizer: T,
    backend: MetalBackend,
    config: MetalBackendConfig,
}

impl<T: Tokenizer + Clone> MetalTextGenerationPipeline<T> {
    /// Create a new Metal text generation pipeline
    pub fn new(tokenizer: T, config: MetalBackendConfig) -> Result<Self> {
        let mut backend = MetalBackend::new(config.clone())?;

        // Compile model for Metal execution
        backend.compile_model(&config.model_path)?;

        Ok(Self {
            tokenizer,
            backend,
            config,
        })
    }
}

impl<T: Tokenizer + Clone> Pipeline for MetalTextGenerationPipeline<T> {
    type Input = String;
    type Output = PipelineOutput;

    fn __call__(&self, input: Self::Input) -> Result<Self::Output> {
        // Tokenize input
        let tokenized = self.tokenizer.encode(&input)?;
        let input_ids = tokenized.input_ids;

        // Prepare inputs for Metal backend
        let mut inputs = HashMap::new();
        inputs.insert(
            "input_ids".to_string(),
            Tensor::from_vec(
                input_ids.iter().map(|&x| x as f32).collect(),
                &[1, input_ids.len()],
            )?,
        );

        // Run inference on Metal device
        let outputs = self.backend.run_inference(inputs)?;

        // Extract logits and perform text generation
        if let Some(logits_tensor) = outputs.get("logits") {
            let logits = logits_tensor.data()?;

            // Simple greedy decoding (in real implementation would support various decoding strategies)
            let next_token_id = logits
                .iter()
                .enumerate()
                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
                .map(|(index, _)| index as u32)
                .unwrap_or(0);

            // Decode generated token
            let generated_text = self.tokenizer.decode(&[next_token_id])?;

            Ok(PipelineOutput::Generation(GenerationOutput {
                generated_text: input + &generated_text,
                sequences: Some(vec![vec![next_token_id]]),
                scores: Some(logits.clone()),
            }))
        } else {
            Err(TrustformersError::invalid_input_simple(
                "No logits output from Metal backend".to_string(),
            ))
        }
    }
}

/// Factory functions for creating Metal pipelines
impl MetalBackendConfig {
    /// Create configuration optimized for Apple Silicon M1/M2/M3
    pub fn for_apple_silicon() -> Self {
        Self {
            device_type: MetalDeviceType::IntegratedGPU,
            precision_mode: MetalPrecisionMode::FP16,
            memory_strategy: MetalMemoryStrategy::Shared,
            enable_neural_engine: true,
            enable_mps: true,
            optimization_level: MetalOptimizationLevel::O3,
            enable_fast_math: true,
            enable_threadgroup_optimization: true,
            ..Default::default()
        }
    }

    /// Create configuration optimized for iOS devices
    pub fn for_ios() -> Self {
        Self {
            device_type: MetalDeviceType::IntegratedGPU,
            precision_mode: MetalPrecisionMode::FP16,
            memory_strategy: MetalMemoryStrategy::Shared,
            enable_neural_engine: true,
            enable_mps: true,
            optimization_level: MetalOptimizationLevel::O2,
            max_batch_size: 1,
            buffer_allocation_size: 128 * 1024 * 1024, // 128MB for mobile
            enable_fast_math: true,
            ..Default::default()
        }
    }

    /// Create configuration optimized for Intel Macs with discrete GPU
    pub fn for_intel_mac() -> Self {
        Self {
            device_type: MetalDeviceType::DiscreteGPU,
            precision_mode: MetalPrecisionMode::FP32,
            memory_strategy: MetalMemoryStrategy::Private,
            enable_neural_engine: false,
            enable_mps: false,
            optimization_level: MetalOptimizationLevel::O2,
            enable_fast_math: false,
            ..Default::default()
        }
    }

    /// Create configuration for maximum performance (may sacrifice accuracy)
    pub fn for_maximum_performance() -> Self {
        Self {
            device_type: MetalDeviceType::IntegratedGPU,
            precision_mode: MetalPrecisionMode::INT8,
            memory_strategy: MetalMemoryStrategy::Shared,
            enable_neural_engine: true,
            enable_mps: true,
            optimization_level: MetalOptimizationLevel::Ofast,
            enable_fast_math: true,
            enable_threadgroup_optimization: true,
            ..Default::default()
        }
    }
}

/// Factory functions for creating Metal pipelines
pub fn create_metal_text_classification_pipeline<T: Tokenizer + Clone>(
    tokenizer: T,
    config: Option<MetalBackendConfig>,
) -> Result<MetalTextClassificationPipeline<T>> {
    let config = config.unwrap_or_else(MetalBackendConfig::for_apple_silicon);
    MetalTextClassificationPipeline::new(tokenizer, config)
}

pub fn create_metal_text_generation_pipeline<T: Tokenizer + Clone>(
    tokenizer: T,
    config: Option<MetalBackendConfig>,
) -> Result<MetalTextGenerationPipeline<T>> {
    let config = config.unwrap_or_else(MetalBackendConfig::for_apple_silicon);
    MetalTextGenerationPipeline::new(tokenizer, config)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_metal_backend_creation() {
        let config = MetalBackendConfig::for_apple_silicon();
        let backend = MetalBackend::new(config);
        assert!(backend.is_ok());
    }

    #[test]
    fn test_device_capabilities() {
        let config = MetalBackendConfig::for_apple_silicon();
        let backend = MetalBackend::new(config).expect("operation failed in test");
        let capabilities = backend.get_device_capabilities();

        assert!(capabilities.supports_neural_engine);
        assert!(capabilities.supports_mps);
        assert!(capabilities.supports_fp16);
        assert!(capabilities.unified_memory);
    }

    #[test]
    fn test_configuration_presets() {
        let apple_silicon_config = MetalBackendConfig::for_apple_silicon();
        assert_eq!(
            apple_silicon_config.device_type,
            MetalDeviceType::IntegratedGPU
        );
        assert_eq!(
            apple_silicon_config.precision_mode,
            MetalPrecisionMode::FP16
        );
        assert!(apple_silicon_config.enable_neural_engine);

        let ios_config = MetalBackendConfig::for_ios();
        assert_eq!(ios_config.max_batch_size, 1);
        assert_eq!(ios_config.buffer_allocation_size, 128 * 1024 * 1024);

        let intel_config = MetalBackendConfig::for_intel_mac();
        assert_eq!(intel_config.device_type, MetalDeviceType::DiscreteGPU);
        assert_eq!(intel_config.precision_mode, MetalPrecisionMode::FP32);
        assert!(!intel_config.enable_neural_engine);
    }

    // ── Default configuration ─────────────────────────────────────────────────

    #[test]
    fn test_default_config_values() {
        let cfg = MetalBackendConfig::default();
        assert_eq!(cfg.max_batch_size, 1);
        assert!(cfg.enable_neural_engine);
        assert!(cfg.enable_mps);
        assert!(cfg.enable_fast_math);
        assert!(cfg.enable_threadgroup_optimization);
        assert!(!cfg.enable_profiling);
        assert!(cfg.profile_output_path.is_none());
        assert!(cfg.metal_library_path.is_none());
    }

    #[test]
    fn test_default_config_buffer_allocation_256mb() {
        let cfg = MetalBackendConfig::default();
        assert_eq!(cfg.buffer_allocation_size, 256 * 1024 * 1024);
    }

    // ── Device type variants ──────────────────────────────────────────────────

    #[test]
    fn test_device_type_variants_are_distinct() {
        assert_ne!(
            MetalDeviceType::IntegratedGPU as i32,
            MetalDeviceType::DiscreteGPU as i32
        );
        assert_ne!(
            MetalDeviceType::NeuralEngine as i32,
            MetalDeviceType::CPU as i32
        );
        assert_ne!(MetalDeviceType::Auto as i32, MetalDeviceType::CPU as i32);
    }

    // ── Precision mode variants ───────────────────────────────────────────────

    #[test]
    fn test_precision_mode_variants() {
        let fp32 = MetalPrecisionMode::FP32;
        let fp16 = MetalPrecisionMode::FP16;
        let int8 = MetalPrecisionMode::INT8;
        let auto = MetalPrecisionMode::Auto;
        // All four must be constructable and distinguishable
        assert_ne!(fp32, fp16);
        assert_ne!(fp16, int8);
        assert_ne!(int8, auto);
    }

    // ── Memory strategy variants ──────────────────────────────────────────────

    #[test]
    fn test_memory_strategy_variants() {
        let strategies = [
            MetalMemoryStrategy::Shared,
            MetalMemoryStrategy::Private,
            MetalMemoryStrategy::Managed,
            MetalMemoryStrategy::Auto,
        ];
        // Just check all four can be constructed
        assert_eq!(strategies.len(), 4);
    }

    // ── Optimization level variants ───────────────────────────────────────────

    #[test]
    fn test_optimization_level_variants() {
        let levels = [
            MetalOptimizationLevel::O1,
            MetalOptimizationLevel::O2,
            MetalOptimizationLevel::O3,
            MetalOptimizationLevel::Ofast,
        ];
        assert_eq!(levels.len(), 4);
    }

    // ── Backend inference ─────────────────────────────────────────────────────

    #[test]
    fn test_run_inference_returns_logits() {
        let config = MetalBackendConfig::for_apple_silicon();
        let backend = MetalBackend::new(config).expect("backend creation failed");
        let inputs = HashMap::new();
        let outputs = backend.run_inference(inputs).expect("inference failed");
        assert!(
            outputs.contains_key("logits"),
            "output must contain 'logits' key"
        );
    }

    #[test]
    fn test_run_inference_output_shape_product() {
        let config = MetalBackendConfig::for_apple_silicon();
        let backend = MetalBackend::new(config).expect("backend creation failed");
        let inputs = HashMap::new();
        let outputs = backend.run_inference(inputs).expect("inference failed");
        let logits = outputs.get("logits").expect("logits key must be present");
        let data = logits.data().expect("tensor data must be accessible");
        assert_eq!(
            data.len(),
            512,
            "output shape should be [1, 512] → 512 elements"
        );
    }

    // ── Shader loading ────────────────────────────────────────────────────────

    #[test]
    fn test_load_shaders_sets_pipeline_state() {
        let config = MetalBackendConfig::for_apple_silicon();
        let mut backend = MetalBackend::new(config).expect("backend creation failed");
        let dummy_path = std::path::Path::new("/tmp/dummy.metallib");
        let result = backend.load_shaders(dummy_path);
        assert!(result.is_ok(), "load_shaders should succeed (mock impl)");
        assert!(
            backend.compute_pipeline.is_some(),
            "compute_pipeline must be set after load_shaders"
        );
        assert!(
            backend.library.is_some(),
            "library must be set after load_shaders"
        );
    }

    // ── Compile model ─────────────────────────────────────────────────────────

    #[test]
    fn test_compile_model_ok() {
        let config = MetalBackendConfig::for_apple_silicon();
        let mut backend = MetalBackend::new(config).expect("backend creation failed");
        let dummy_path = std::path::Path::new("/tmp/model.onnx");
        let result = backend.compile_model(dummy_path);
        assert!(result.is_ok(), "compile_model should succeed (mock impl)");
    }

    // ── Maximum performance preset ────────────────────────────────────────────

    #[test]
    fn test_maximum_performance_preset() {
        let cfg = MetalBackendConfig::for_maximum_performance();
        assert_eq!(cfg.precision_mode, MetalPrecisionMode::INT8);
        assert!(cfg.enable_neural_engine);
        assert!(cfg.enable_mps);
        assert!(cfg.enable_fast_math);
        assert!(cfg.enable_threadgroup_optimization);
    }

    // ── Device capabilities fields ────────────────────────────────────────────

    #[test]
    fn test_device_capabilities_max_threads_positive() {
        let config = MetalBackendConfig::default();
        let backend = MetalBackend::new(config).expect("backend creation failed");
        let cap = backend.get_device_capabilities();
        assert!(cap.max_threads_per_threadgroup > 0);
    }

    #[test]
    fn test_device_capabilities_max_buffer_size_positive() {
        let config = MetalBackendConfig::default();
        let backend = MetalBackend::new(config).expect("backend creation failed");
        let cap = backend.get_device_capabilities();
        assert!(cap.max_buffer_size > 0);
    }

    #[test]
    fn test_device_capabilities_int8_support() {
        let config = MetalBackendConfig::for_apple_silicon();
        let backend = MetalBackend::new(config).expect("backend creation failed");
        let cap = backend.get_device_capabilities();
        assert!(cap.supports_int8);
    }

    // ── iOS config specifics ──────────────────────────────────────────────────

    #[test]
    fn test_ios_config_128mb_buffer() {
        let cfg = MetalBackendConfig::for_ios();
        assert_eq!(cfg.buffer_allocation_size, 128 * 1024 * 1024);
    }

    #[test]
    fn test_ios_config_o2_optimization() {
        let cfg = MetalBackendConfig::for_ios();
        assert!(matches!(cfg.optimization_level, MetalOptimizationLevel::O2));
    }

    // ── Intel Mac config ──────────────────────────────────────────────────────

    #[test]
    fn test_intel_mac_no_mps() {
        let cfg = MetalBackendConfig::for_intel_mac();
        assert!(!cfg.enable_mps);
        assert!(!cfg.enable_fast_math);
    }

    #[test]
    fn test_intel_mac_private_memory() {
        let cfg = MetalBackendConfig::for_intel_mac();
        assert!(matches!(cfg.memory_strategy, MetalMemoryStrategy::Private));
    }

    // ── Profiling disabled by default ─────────────────────────────────────────

    #[test]
    fn test_profiling_disabled_by_default_in_presets() {
        for cfg in [
            MetalBackendConfig::for_apple_silicon(),
            MetalBackendConfig::for_ios(),
            MetalBackendConfig::for_intel_mac(),
            MetalBackendConfig::for_maximum_performance(),
        ] {
            assert!(
                !cfg.enable_profiling,
                "profiling should be disabled by default"
            );
        }
    }

    // ── LCG-based deterministic output check ──────────────────────────────────

    #[test]
    fn test_inference_output_values_are_finite() {
        let config = MetalBackendConfig::default();
        let backend = MetalBackend::new(config).expect("backend ok");
        let outputs = backend.run_inference(HashMap::new()).expect("inference ok");
        let logits = outputs.get("logits").expect("logits");
        let data = logits.data().expect("data");
        for v in &data {
            assert!(v.is_finite(), "every logit output must be finite");
        }
    }
}