trustformers-mobile 0.1.1

Mobile deployment support for TrustformeRS (iOS, Android)

//! Mobile Inference Engine
//!
//! This module provides a unified mobile inference engine that integrates
//! platform-specific optimizations, quantization, and memory management
//! for efficient transformer inference on mobile devices.
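//!
//! # Example
//!
//! A minimal usage sketch (illustrative only, hence marked `ignore`; it
//! assumes the crate's default `MobileConfig` and the `Tensor` constructors
//! used elsewhere in this module):
//!
//! ```ignore
//! use std::collections::HashMap;
//!
//! let mut engine = MobileInferenceBuilder::new()
//!     .memory_limit_mb(1024)
//!     .fp16(true)
//!     .build()?;
//!
//! let mut weights = HashMap::new();
//! weights.insert("layer1".to_string(), Tensor::ones(&[5, 5])?);
//! engine.load_model(weights)?;
//!
//! let output = engine.inference(&Tensor::ones(&[5])?)?;
//! ```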

use crate::{
    optimization::MobileOptimizationEngine, MobileBackend, MobileConfig, MobilePlatform,
    MobileStats,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
use std::time::Instant;
use trustformers_core::errors::{invalid_input, runtime_error, Result};
use trustformers_core::Tensor;

/// Supported model formats for loading
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ModelFormat {
    /// SafeTensors format (.safetensors)
    SafeTensors,
    /// PyTorch format (.pt, .pth, .bin)
    PyTorch,
    /// ONNX format (.onnx)
    ONNX,
    /// TensorFlow format (.pb)
    TensorFlow,
    /// Unknown or unsupported format
    Unknown,
}

/// Execution strategy for inference
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ExecutionStrategy {
    /// Execute layers sequentially
    Sequential,
    /// Parallelize within layers
    LayerParallel,
    /// Full parallel execution
    FullParallel,
}

/// Execution plan for mobile inference
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExecutionPlan {
    pub strategy: ExecutionStrategy,
    pub num_layers: usize,
    pub batch_size: usize,
    pub checkpoint_interval: usize,
}

impl ExecutionPlan {
    pub fn new(strategy: ExecutionStrategy, num_layers: usize) -> Self {
        Self {
            strategy,
            num_layers,
            batch_size: 1,
            checkpoint_interval: 0, // Disabled by default
        }
    }
}

/// Unified mobile inference engine
#[derive(Debug)]
pub struct MobileInferenceEngine {
    config: MobileConfig,
    optimizer: MobileOptimizationEngine,
    execution_plan: ExecutionPlan,
    stats: MobileStats,
    model_loaded: bool,
    model_weights: Option<HashMap<String, Tensor>>,
    cache: Option<InferenceCache>,
}

impl MobileInferenceEngine {
    /// Create new mobile inference engine
    pub fn new(config: MobileConfig) -> Result<Self> {
        config.validate()?;

        let optimizer = MobileOptimizationEngine::new(config.clone())?;
        let execution_plan = ExecutionPlan::new(ExecutionStrategy::Sequential, 12); // Default 12 layers
        let stats = MobileStats::new(&config);

        Ok(Self {
            config,
            optimizer,
            execution_plan,
            stats,
            model_loaded: false,
            model_weights: None,
            cache: None,
        })
    }

    /// Load model weights and optimize for mobile deployment
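    ///
    /// The optimized footprint is validated against `max_memory_mb`, and the
    /// call fails if the limit would be exceeded. A minimal sketch, mirroring
    /// the tests in this module:
    ///
    /// ```ignore
    /// let mut weights = HashMap::new();
    /// weights.insert("layer1".to_string(), Tensor::ones(&[10, 10])?);
    /// engine.load_model(weights)?;
    /// ```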
    pub fn load_model(&mut self, weights: HashMap<String, Tensor>) -> Result<()> {
        tracing::info!("Loading model with {} parameters", weights.len());

        // Optimize weights for mobile deployment
        let optimized_weights = self.optimizer.optimize_model_weights(&weights)?;

        // Calculate memory footprint
        let total_params: usize =
            optimized_weights.values().map(|t| t.shape().iter().product::<usize>()).sum();

        let footprint = self.optimizer.estimate_memory_footprint(total_params);

        if footprint.total_memory_bytes > self.config.max_memory_mb * 1024 * 1024 {
            return Err(runtime_error(format!(
                "Model requires {}MB but limit is {}MB",
                footprint.memory_usage_mb(),
                self.config.max_memory_mb
            )));
        }

        self.model_weights = Some(optimized_weights);
        self.model_loaded = true;

        // Initialize cache if needed
        if self.should_use_cache() {
            self.cache = Some(InferenceCache::new(self.config.max_memory_mb / 4));
        }

        tracing::info!(
            "Model loaded successfully. Memory footprint: {:.1}MB ({:.1}% savings)",
            footprint.memory_usage_mb(),
            footprint.memory_savings_percent
        );

        Ok(())
    }

    /// Load model from file path
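    ///
    /// The format is inferred from the file extension and magic bytes (see
    /// `ModelFormat`). Illustrative call with a hypothetical path:
    ///
    /// ```ignore
    /// engine.load_model_from_file("model.safetensors")?;
    /// ```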
    pub fn load_model_from_file(&mut self, model_path: &str) -> Result<()> {
        use std::fs;

        let path = Path::new(model_path);
        let model_data = fs::read(model_path)
            .map_err(|e| runtime_error(format!("Failed to read model file: {}", e)))?;

        let weights = self.parse_model_format(&model_data, path)?;
        self.load_model(weights)
    }

    /// Parse model format based on file extension and magic bytes
    fn parse_model_format(&self, data: &[u8], path: &Path) -> Result<HashMap<String, Tensor>> {
        let mut weights = HashMap::new();

        // Detect format based on file extension and magic bytes
        let format = self.detect_model_format(data, path)?;

        match format {
            ModelFormat::SafeTensors => {
                tracing::info!("Loading SafeTensors format model");
                self.parse_safetensors(data, &mut weights)?;
            },
            ModelFormat::PyTorch => {
                tracing::info!("Loading PyTorch format model");
                self.parse_pytorch(data, &mut weights)?;
            },
            ModelFormat::ONNX => {
                tracing::info!("Loading ONNX format model");
                self.parse_onnx(data, &mut weights)?;
            },
            ModelFormat::TensorFlow => {
                tracing::info!("Loading TensorFlow format model");
                self.parse_tensorflow(data, &mut weights)?;
            },
            ModelFormat::Unknown => {
                tracing::warn!("Unknown model format, creating placeholder weights");
                self.create_placeholder_weights(&mut weights)?;
            },
        }

        Ok(weights)
    }

    /// Detect model format from file extension and magic bytes
    fn detect_model_format(&self, data: &[u8], path: &Path) -> Result<ModelFormat> {
        // Check file extension first
        if let Some(extension) = path.extension().and_then(|s| s.to_str()) {
            match extension.to_lowercase().as_str() {
                "safetensors" => return Ok(ModelFormat::SafeTensors),
                "pt" | "pth" | "bin" => return Ok(ModelFormat::PyTorch),
                "onnx" => return Ok(ModelFormat::ONNX),
                "pb" => return Ok(ModelFormat::TensorFlow),
                _ => {},
            }
        }

        // Check magic bytes for format detection
        if data.len() >= 8 {
            // Placeholder SafeTensors check (real SafeTensors files begin
            // with an 8-byte little-endian header length, not a magic string)
            if data.starts_with(b"TFTSFT") {
                return Ok(ModelFormat::SafeTensors);
            }

            // PyTorch pickle magic
            if data.starts_with(b"\x80\x02") || data.starts_with(b"PK") {
                return Ok(ModelFormat::PyTorch);
            }

            // ONNX magic
            if data.starts_with(b"\x08\x01\x12") {
                return Ok(ModelFormat::ONNX);
            }

            // TensorFlow SavedModel
            if data.len() >= 14 && &data[12..14] == b"\x08\x01" {
                return Ok(ModelFormat::TensorFlow);
            }
        }

        Ok(ModelFormat::Unknown)
    }

    /// Parse SafeTensors format (placeholder implementation)
    fn parse_safetensors(&self, _data: &[u8], weights: &mut HashMap<String, Tensor>) -> Result<()> {
        // This would implement actual SafeTensors parsing
        // For now, create some realistic transformer weights
        self.create_transformer_weights(weights, 768, 12, 50257)?;
        Ok(())
    }

    /// Parse PyTorch format (placeholder implementation)
    fn parse_pytorch(&self, _data: &[u8], weights: &mut HashMap<String, Tensor>) -> Result<()> {
        // This would implement actual PyTorch pickle parsing
        // For now, create some realistic transformer weights
        self.create_transformer_weights(weights, 512, 8, 32000)?;
        Ok(())
    }

    /// Parse ONNX format (placeholder implementation)
    fn parse_onnx(&self, _data: &[u8], weights: &mut HashMap<String, Tensor>) -> Result<()> {
        // This would implement actual ONNX parsing
        self.create_transformer_weights(weights, 512, 6, 16000)?;
        Ok(())
    }

    /// Parse TensorFlow format (placeholder implementation)
    fn parse_tensorflow(&self, _data: &[u8], weights: &mut HashMap<String, Tensor>) -> Result<()> {
        // This would implement actual TensorFlow SavedModel parsing
        self.create_transformer_weights(weights, 512, 8, 25000)?;
        Ok(())
    }

    /// Create placeholder weights for unknown formats
    fn create_placeholder_weights(&self, weights: &mut HashMap<String, Tensor>) -> Result<()> {
        weights.insert("embedding.weight".to_string(), Tensor::randn(&[1000, 512])?);
        weights.insert("layer.0.weight".to_string(), Tensor::randn(&[512, 512])?);
        weights.insert("layer.0.bias".to_string(), Tensor::randn(&[512])?);
        Ok(())
    }

    /// Create realistic transformer architecture weights
    fn create_transformer_weights(
        &self,
        weights: &mut HashMap<String, Tensor>,
        hidden_size: usize,
        num_layers: usize,
        vocab_size: usize,
    ) -> Result<()> {
        // Token embeddings
        weights.insert(
            "transformer.wte.weight".to_string(),
            Tensor::randn(&[vocab_size, hidden_size])?,
        );

        // Position embeddings (assuming max length 2048)
        weights.insert(
            "transformer.wpe.weight".to_string(),
            Tensor::randn(&[2048, hidden_size])?,
        );

        // Transformer layers
        for layer_idx in 0..num_layers {
            let prefix = format!("transformer.h.{}", layer_idx);

            // Self-attention weights
            weights.insert(
                format!("{}.attn.c_attn.weight", prefix),
                Tensor::randn(&[hidden_size, 3 * hidden_size])?,
            );
            weights.insert(
                format!("{}.attn.c_attn.bias", prefix),
                Tensor::randn(&[3 * hidden_size])?,
            );
            weights.insert(
                format!("{}.attn.c_proj.weight", prefix),
                Tensor::randn(&[hidden_size, hidden_size])?,
            );
            weights.insert(
                format!("{}.attn.c_proj.bias", prefix),
                Tensor::randn(&[hidden_size])?,
            );

            // Layer normalization
            weights.insert(
                format!("{}.ln_1.weight", prefix),
                Tensor::ones(&[hidden_size])?,
            );
            weights.insert(
                format!("{}.ln_1.bias", prefix),
                Tensor::zeros(&[hidden_size])?,
            );
            weights.insert(
                format!("{}.ln_2.weight", prefix),
                Tensor::ones(&[hidden_size])?,
            );
            weights.insert(
                format!("{}.ln_2.bias", prefix),
                Tensor::zeros(&[hidden_size])?,
            );

            // Feed-forward network
            let ff_size = hidden_size * 4;
            weights.insert(
                format!("{}.mlp.c_fc.weight", prefix),
                Tensor::randn(&[hidden_size, ff_size])?,
            );
            weights.insert(
                format!("{}.mlp.c_fc.bias", prefix),
                Tensor::randn(&[ff_size])?,
            );
            weights.insert(
                format!("{}.mlp.c_proj.weight", prefix),
                Tensor::randn(&[ff_size, hidden_size])?,
            );
            weights.insert(
                format!("{}.mlp.c_proj.bias", prefix),
                Tensor::randn(&[hidden_size])?,
            );
        }

        // Final layer norm
        weights.insert(
            "transformer.ln_f.weight".to_string(),
            Tensor::ones(&[hidden_size])?,
        );
        weights.insert(
            "transformer.ln_f.bias".to_string(),
            Tensor::zeros(&[hidden_size])?,
        );

        // Language model head
        weights.insert(
            "lm_head.weight".to_string(),
            Tensor::randn(&[hidden_size, vocab_size])?,
        );

        Ok(())
    }

    /// Perform inference with f32 input/output arrays (for C API)
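    ///
    /// Returns the number of values written, capped at `output_data.len()`.
    /// A sketch with caller-owned buffers (sizes are illustrative):
    ///
    /// ```ignore
    /// let input = vec![0.0f32; 128];
    /// let mut output = vec![0.0f32; 128];
    /// let written = engine.inference_f32(&input, &mut output)?;
    /// ```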
    pub fn inference_f32(&mut self, input_data: &[f32], output_data: &mut [f32]) -> Result<usize> {
        // Convert input array to tensor
        let input_tensor = Tensor::from_vec(input_data.to_vec(), &[1, input_data.len()])?;

        // Perform inference
        let output_tensor = self.inference(&input_tensor)?;

        // Extract data from output tensor
        let output_vec = output_tensor.data()?;
        let output_size = output_vec.len().min(output_data.len());

        // Copy to the caller-provided output buffer
        output_data[..output_size].copy_from_slice(&output_vec[..output_size]);

        Ok(output_size)
    }

    /// Perform optimized mobile inference
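    ///
    /// Results may be served from the internal cache when caching is enabled;
    /// latency and memory statistics are updated on every call. Sketch:
    ///
    /// ```ignore
    /// let output = engine.inference(&input_tensor)?;
    /// let _stats = engine.get_stats();
    /// ```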
    pub fn inference(&mut self, input: &Tensor) -> Result<Tensor> {
        if !self.model_loaded {
            return Err(runtime_error("Model not loaded"));
        }

        let start_time = Instant::now();

        // Check cache first
        if let Some(ref cache) = self.cache {
            if let Some(cached_result) = cache.get(input) {
                let inference_time = start_time.elapsed().as_millis() as f32;
                self.stats.update_inference(inference_time);
                tracing::debug!("Cache hit for inference");
                return Ok(cached_result);
            }
        }

        // Optimize input tensor
        let optimized_input = self.optimizer.optimize_tensor(input)?;

        // Perform inference based on execution strategy
        let result = match self.execution_plan.strategy {
            ExecutionStrategy::Sequential => self.sequential_inference(&optimized_input),
            ExecutionStrategy::LayerParallel => self.layer_parallel_inference(&optimized_input),
            ExecutionStrategy::FullParallel => self.full_parallel_inference(&optimized_input),
        }?;

        // Cache result if caching is enabled
        if let Some(ref mut cache) = self.cache {
            cache.put(input.clone(), result.clone());
        }

        let inference_time = start_time.elapsed().as_millis() as f32;
        self.stats.update_inference(inference_time);

        // Update memory statistics
        let current_memory = self.estimate_current_memory_usage();
        self.stats.update_memory(current_memory);

        Ok(result)
    }

    /// Perform batch inference with mobile optimizations
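    ///
    /// Inputs are first adjusted by the optimizer for mobile constraints and
    /// then run through `inference` one at a time. Sketch:
    ///
    /// ```ignore
    /// let results = engine.batch_inference(vec![input_a, input_b])?;
    /// ```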
    pub fn batch_inference(&mut self, inputs: Vec<Tensor>) -> Result<Vec<Tensor>> {
        if !self.model_loaded {
            return Err(runtime_error("Model not loaded"));
        }

        // Optimize batch for mobile constraints
        let optimized_inputs = self.optimizer.optimize_batch(&inputs)?;

        let mut results = Vec::with_capacity(optimized_inputs.len());
        for input in optimized_inputs {
            let result = self.inference(&input)?;
            results.push(result);
        }

        Ok(results)
    }

    /// Get current inference statistics
    pub fn get_stats(&self) -> &MobileStats {
        &self.stats
    }

    /// Get memory usage information
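    ///
    /// Useful for checking headroom at runtime, for example:
    ///
    /// ```ignore
    /// let info = engine.get_memory_info();
    /// if !info.is_within_limits() {
    ///     engine.clear_cache();
    /// }
    /// ```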
    pub fn get_memory_info(&self) -> MobileMemoryInfo {
        let footprint = if let Some(ref weights) = self.model_weights {
            let total_params: usize =
                weights.values().map(|t| t.shape().iter().product::<usize>()).sum();
            self.optimizer.estimate_memory_footprint(total_params)
        } else {
            self.optimizer.estimate_memory_footprint(0)
        };

        MobileMemoryInfo {
            model_memory_mb: footprint.model_memory_bytes / (1024 * 1024),
            runtime_memory_mb: footprint.runtime_overhead_bytes / (1024 * 1024),
            total_memory_mb: footprint.total_memory_bytes / (1024 * 1024),
            memory_limit_mb: self.config.max_memory_mb,
            memory_savings_percent: footprint.memory_savings_percent,
            cache_memory_mb: self.cache.as_ref().map(|c| c.memory_usage_mb()).unwrap_or(0),
        }
    }

    /// Update configuration and re-optimize
    pub fn update_config(&mut self, new_config: MobileConfig) -> Result<()> {
        new_config.validate()?;

        self.config = new_config.clone();
        self.optimizer = MobileOptimizationEngine::new(new_config)?;

        // Re-optimize loaded model if available
        if let Some(ref weights) = self.model_weights.clone() {
            self.load_model(weights.clone())?;
        }

        Ok(())
    }

    /// Set power mode for inference
    pub fn set_power_mode(&mut self, power_mode: crate::optimization::PowerMode) -> Result<()> {
        // Update the configuration based on power mode
        match power_mode {
            crate::optimization::PowerMode::PowerSaving => {
                self.config.use_fp16 = true;
                self.config.max_memory_mb /= 2;
                self.config.backend = crate::MobileBackend::CPU;
            },
            crate::optimization::PowerMode::Balanced => {
                // Keep current settings but optimize for balance
                self.config.use_fp16 = true;
            },
            crate::optimization::PowerMode::HighPerformance => {
                self.config.use_fp16 = false;
                self.config.backend = crate::MobileBackend::GPU;
            },
        }

        // Update the optimizer with new config
        self.optimizer = crate::optimization::MobileOptimizationEngine::new(self.config.clone())?;

        Ok(())
    }

    /// Reduce performance by a factor (clamped to 0.1..=1.0; lower values reduce performance more aggressively)
    pub fn reduce_performance(&mut self, factor: f32) -> Result<()> {
        let factor = factor.clamp(0.1, 1.0);

        // Reduce memory usage
        self.config.max_memory_mb = (self.config.max_memory_mb as f32 * factor) as usize;

        // Force FP16 for reduced performance
        if factor < 0.8 {
            self.config.use_fp16 = true;
        }

        // Switch to CPU for very low performance
        if factor < 0.5 {
            self.config.backend = crate::MobileBackend::CPU;
        }

        // Update optimizer
        self.optimizer = crate::optimization::MobileOptimizationEngine::new(self.config.clone())?;

        Ok(())
    }

    /// Set batch size for inference
    pub fn set_batch_size(&mut self, batch_size: usize) -> Result<()> {
        // Note: This is a placeholder implementation since batch size isn't directly stored in config
        // In a real implementation, this would be stored in the engine state
        if batch_size == 0 {
            return Err(invalid_input("Batch size must be greater than 0"));
        }

        // For now, adjust memory based on batch size
        // Larger batches need more memory
        let base_memory = 512; // Base memory in MB
        let memory_per_batch = 64; // Additional memory per batch item
        self.config.max_memory_mb = base_memory + (batch_size - 1) * memory_per_batch;

        Ok(())
    }

    /// Clear inference cache to free memory
    pub fn clear_cache(&mut self) {
        if let Some(ref mut cache) = self.cache {
            cache.clear();
        }
    }

    /// Force garbage collection to free memory
    pub fn force_gc(&mut self) {
        self.clear_cache();
        // In a real implementation, this would trigger platform-specific GC
    }

    /// Warm up the engine by running dummy inferences
    ///
    /// This method runs several dummy inference passes to:
    /// - Initialize GPU/accelerator resources
    /// - Compile compute shaders/kernels
    /// - Populate caches
    /// - Stabilize performance measurements
    ///
    /// Should be called after model loading to ensure consistent performance.
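    ///
    /// Illustrative call order:
    ///
    /// ```ignore
    /// engine.load_model(weights)?;
    /// engine.warm_up()?; // run dummy passes before measuring latency
    /// ```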
    pub fn warm_up(&mut self) -> Result<()> {
        if !self.model_loaded {
            return Err(runtime_error("Cannot warm up: model not loaded"));
        }

        tracing::info!("Starting engine warm-up...");
        let start_time = Instant::now();

        // Determine input shape based on model configuration
        // Use a typical sequence length for transformers
        let batch_size = 1;
        let seq_length = 128; // Typical warm-up sequence length
        let hidden_size = 512; // Default hidden size

        // Run multiple warm-up iterations
        let warm_up_iterations = 3;

        for i in 0..warm_up_iterations {
            // Create dummy input tensor
            let dummy_input = Tensor::zeros(&[batch_size, seq_length, hidden_size])?;

            // Perform inference (this will initialize kernels and caches)
            let _result = self.inference(&dummy_input)?;

            tracing::debug!(
                "Warm-up iteration {}/{} completed",
                i + 1,
                warm_up_iterations
            );
        }

        let warm_up_time = start_time.elapsed();
        tracing::info!(
            "Engine warm-up completed in {:.2}ms ({} iterations)",
            warm_up_time.as_millis(),
            warm_up_iterations
        );

        Ok(())
    }

    /// Set performance mode for the engine
    ///
    /// This is a convenience wrapper around set_power_mode that accepts
    /// integer mode values for C FFI compatibility:
    /// - 0: Power Saving mode
    /// - 1: Balanced mode
    /// - 2: High Performance mode
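    ///
    /// For example, as called from a C-facing wrapper:
    ///
    /// ```ignore
    /// engine.set_performance_mode(1)?; // Balanced
    /// ```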
    pub fn set_performance_mode(&mut self, mode: i32) -> Result<()> {
        let power_mode = match mode {
            0 => crate::optimization::PowerMode::PowerSaving,
            1 => crate::optimization::PowerMode::Balanced,
            2 => crate::optimization::PowerMode::HighPerformance,
            _ => return Err(invalid_input(format!("Invalid performance mode: {}", mode))),
        };

        self.set_power_mode(power_mode)
    }

    // Private inference methods

    fn sequential_inference(&self, input: &Tensor) -> Result<Tensor> {
        // Sequential layer-by-layer inference for minimum memory usage
        let mut current = input.clone();

        // Simulate layer processing. HashMap iteration order is arbitrary;
        // a real implementation would follow the model's actual layer order.
        if let Some(ref weights) = self.model_weights {
            for weight in weights.values() {
                current = self.process_layer(&current, weight)?;

                // Apply checkpointing if configured
                if self.execution_plan.checkpoint_interval > 0 {
                    // Would save intermediate state for memory management
                }
            }
        }

        Ok(current)
    }

    fn layer_parallel_inference(&self, input: &Tensor) -> Result<Tensor> {
        // Layer-parallel inference for balanced performance
        let mut current = input.clone();

        // Simulate parallel layer processing
        if let Some(ref weights) = self.model_weights {
            // Group layers for parallel processing
            let layer_groups = self.group_layers_for_parallel_processing(weights);

            for group in layer_groups {
                current = self.process_layer_group(&current, &group)?;
            }
        }

        Ok(current)
    }

    fn full_parallel_inference(&self, input: &Tensor) -> Result<Tensor> {
        // Full parallel inference for maximum speed
        let mut current = input.clone();

        // Simulate full parallel processing
        if let Some(ref weights) = self.model_weights {
            // Process all compatible layers in parallel
            current = self.process_all_layers_parallel(&current, weights)?;
        }

        Ok(current)
    }

    fn process_layer(&self, input: &Tensor, _weight: &Tensor) -> Result<Tensor> {
        // Simplified identity pass-through; in practice, this would perform
        // the actual layer computation using the layer's weight tensor.
        Ok(input.clone())
    }

    fn process_layer_group(&self, input: &Tensor, group: &[(&String, &Tensor)]) -> Result<Tensor> {
        // Process a group of layers in parallel
        let mut current = input.clone();

        for (_, weight) in group {
            current = self.process_layer(&current, weight)?;
        }

        Ok(current)
    }

    fn process_all_layers_parallel(
        &self,
        input: &Tensor,
        weights: &HashMap<String, Tensor>,
    ) -> Result<Tensor> {
        // Process all layers with maximum parallelism
        let mut current = input.clone();

        // Simplified parallel processing
        for weight in weights.values() {
            current = self.process_layer(&current, weight)?;
        }

        Ok(current)
    }

    fn group_layers_for_parallel_processing<'a>(
        &self,
        weights: &'a HashMap<String, Tensor>,
    ) -> Vec<Vec<(&'a String, &'a Tensor)>> {
        // Group layers for parallel processing based on dependencies
        // This is a simplified implementation
        let mut groups = Vec::new();
        let mut current_group = Vec::new();

        for (name, weight) in weights {
            current_group.push((name, weight));

            // Create new group every N layers
            if current_group.len() >= 3 {
                groups.push(current_group);
                current_group = Vec::new();
            }
        }

        if !current_group.is_empty() {
            groups.push(current_group);
        }

        groups
    }

    fn estimate_current_memory_usage(&self) -> usize {
        let mut total = 0;

        // Model weights memory
        if let Some(ref weights) = self.model_weights {
            for weight in weights.values() {
                total += weight.memory_usage();
            }
        }

        // Cache memory
        if let Some(ref cache) = self.cache {
            total += cache.memory_usage_mb() * 1024 * 1024;
        }

        // Convert to MB
        total / (1024 * 1024)
    }

    fn should_use_cache(&self) -> bool {
        // Enable cache only if we have sufficient memory
        self.config.max_memory_mb >= 512
            && self.config.memory_optimization != crate::MemoryOptimization::Maximum
    }
}

/// Inference cache for mobile deployment
#[derive(Debug)]
struct InferenceCache {
    cache: HashMap<Vec<u8>, Tensor>,
    max_size_mb: usize,
    current_size_bytes: usize,
}

impl InferenceCache {
    fn new(max_size_mb: usize) -> Self {
        Self {
            cache: HashMap::new(),
            max_size_mb,
            current_size_bytes: 0,
        }
    }

    fn get(&self, input: &Tensor) -> Option<Tensor> {
        let key = self.tensor_to_key(input);
        self.cache.get(&key).cloned()
    }

    fn put(&mut self, input: Tensor, output: Tensor) {
        let key = self.tensor_to_key(&input);
        let entry_size = input.memory_usage() + output.memory_usage();

        // Check if we have space. Accounting is approximate: a single
        // eviction may not free enough, and replaced entries are not
        // credited back against the running total.
        if self.current_size_bytes + entry_size > self.max_size_mb * 1024 * 1024 {
            self.evict_lru();
        }

        self.cache.insert(key, output);
        self.current_size_bytes += entry_size;
    }

    fn clear(&mut self) {
        self.cache.clear();
        self.current_size_bytes = 0;
    }

    fn memory_usage_mb(&self) -> usize {
        self.current_size_bytes / (1024 * 1024)
    }

    fn tensor_to_key(&self, tensor: &Tensor) -> Vec<u8> {
        // Create a simple key from the tensor shape only. This is a
        // simplified implementation; a real key would also hash tensor
        // contents to avoid collisions between same-shaped inputs.
        let shape = tensor.shape();
        let mut key = Vec::new();

        for &dim in &shape {
            key.extend_from_slice(&dim.to_le_bytes());
        }

        key
    }

    fn evict_lru(&mut self) {
        // Simple eviction strategy: remove an arbitrary entry (HashMap
        // iteration order is unspecified). A real implementation would
        // track access order for true LRU eviction.
        if !self.cache.is_empty() {
            let first_key = self.cache.keys().next().expect("cache checked non-empty").clone();
            self.cache.remove(&first_key);
            self.current_size_bytes = self.current_size_bytes.saturating_sub(1024 * 1024);
            // Approximate
        }
    }
}

/// Mobile memory information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MobileMemoryInfo {
    /// Model memory usage in MB
    pub model_memory_mb: usize,
    /// Runtime memory overhead in MB
    pub runtime_memory_mb: usize,
    /// Total memory usage in MB
    pub total_memory_mb: usize,
    /// Memory limit in MB
    pub memory_limit_mb: usize,
    /// Memory savings percentage
    pub memory_savings_percent: f32,
    /// Cache memory usage in MB
    pub cache_memory_mb: usize,
}

impl MobileMemoryInfo {
    /// Check if memory usage is within limits
    pub fn is_within_limits(&self) -> bool {
        self.total_memory_mb <= self.memory_limit_mb
    }

    /// Get memory utilization percentage
    pub fn memory_utilization_percent(&self) -> f32 {
        (self.total_memory_mb as f32 / self.memory_limit_mb as f32) * 100.0
    }

    /// Get available memory in MB
    pub fn available_memory_mb(&self) -> usize {
        self.memory_limit_mb.saturating_sub(self.total_memory_mb)
    }
}

/// Mobile inference configuration builder
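///
/// A fluent way to assemble a `MobileConfig` and construct the engine.
/// Sketch (variant names as used in this module's tests):
///
/// ```ignore
/// let engine = MobileInferenceBuilder::new()
///     .platform(MobilePlatform::Ios)
///     .backend(MobileBackend::CoreML)
///     .memory_limit_mb(1024)
///     .fp16(true)
///     .build()?;
/// ```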
pub struct MobileInferenceBuilder {
    config: MobileConfig,
}

impl MobileInferenceBuilder {
    /// Create new builder with default mobile configuration
    pub fn new() -> Self {
        Self {
            config: MobileConfig::default(),
        }
    }

    /// Set target platform
    pub fn platform(mut self, platform: MobilePlatform) -> Self {
        self.config.platform = platform;
        self
    }

    /// Set inference backend
    pub fn backend(mut self, backend: MobileBackend) -> Self {
        self.config.backend = backend;
        self
    }

    /// Set memory limit
    pub fn memory_limit_mb(mut self, limit: usize) -> Self {
        self.config.max_memory_mb = limit;
        self
    }

    /// Enable/disable FP16 precision
    pub fn fp16(mut self, enable: bool) -> Self {
        self.config.use_fp16 = enable;
        self
    }

    /// Set quantization scheme
    pub fn quantization(mut self, scheme: crate::MobileQuantizationScheme) -> Self {
        self.config.quantization = Some(crate::MobileQuantizationConfig {
            scheme,
            dynamic: true,
            per_channel: false,
        });
        self
    }

    /// Set thread count
    pub fn threads(mut self, count: usize) -> Self {
        self.config.num_threads = count;
        self
    }

    /// Enable/disable batching
    pub fn batching(mut self, enable: bool, max_batch_size: usize) -> Self {
        self.config.enable_batching = enable;
        self.config.max_batch_size = max_batch_size;
        self
    }

    /// Set memory optimization level
    pub fn memory_optimization(mut self, level: crate::MemoryOptimization) -> Self {
        self.config.memory_optimization = level;
        self
    }

    /// Build the inference engine
    pub fn build(self) -> Result<MobileInferenceEngine> {
        MobileInferenceEngine::new(self.config)
    }
}

impl Default for MobileInferenceBuilder {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_mobile_inference_engine_creation() {
        let config = MobileConfig::default();
        let engine = MobileInferenceEngine::new(config);
        assert!(engine.is_ok());
    }

    #[test]
    fn test_model_loading() {
        let config = MobileConfig::default();
        let mut engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        let mut weights = HashMap::new();
        weights.insert(
            "layer1".to_string(),
            Tensor::ones(&[10, 10]).expect("Failed to create tensor"),
        );
        weights.insert(
            "layer2".to_string(),
            Tensor::ones(&[10, 5]).expect("Failed to create tensor"),
        );

        let result = engine.load_model(weights);
        assert!(result.is_ok());
        assert!(engine.model_loaded);
    }

    #[test]
    fn test_inference() {
        let config = MobileConfig::default();
        let mut engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        // Load a simple model
        let mut weights = HashMap::new();
        weights.insert(
            "layer1".to_string(),
            Tensor::ones(&[5, 5]).expect("Failed to create tensor"),
        );
        engine.load_model(weights).expect("Failed to load model");

        // Perform inference
        let input = Tensor::ones(&[5]).expect("Failed to create tensor");
        let result = engine.inference(&input);
        assert!(result.is_ok());
    }

    #[test]
    fn test_batch_inference() {
        let config = MobileConfig {
            enable_batching: true,
            max_batch_size: 3,
            ..Default::default()
        };
        let mut engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        // Load a simple model
        let mut weights = HashMap::new();
        weights.insert(
            "layer1".to_string(),
            Tensor::ones(&[5, 5]).expect("Failed to create tensor"),
        );
        engine.load_model(weights).expect("Failed to load model");

        // Perform batch inference
        let inputs = vec![
            Tensor::ones(&[5]).expect("Failed to create tensor"),
            Tensor::ones(&[5]).expect("Failed to create tensor"),
        ];
        let results = engine.batch_inference(inputs);
        assert!(results.is_ok());
        assert_eq!(results.expect("Batch inference failed").len(), 2);
    }

    #[test]
    fn test_memory_info() {
        let config = MobileConfig::default();
        let engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        let memory_info = engine.get_memory_info();
        assert!(memory_info.memory_limit_mb > 0);
        assert!(memory_info.memory_utilization_percent() >= 0.0);
    }

    #[test]
    fn test_inference_builder() {
        let engine = MobileInferenceBuilder::new()
            .platform(MobilePlatform::Ios)
            .backend(MobileBackend::CoreML)
            .memory_limit_mb(1024)
            .fp16(true)
            .quantization(crate::MobileQuantizationScheme::Int8)
            .threads(4)
            .batching(true, 2)
            .memory_optimization(crate::MemoryOptimization::Balanced)
            .build();

        assert!(engine.is_ok());
    }

    #[test]
    fn test_config_update() {
        let config = MobileConfig::default();
        let mut engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        let new_config = MobileConfig {
            max_memory_mb: 1024,
            num_threads: 8,
            ..Default::default()
        };

        let result = engine.update_config(new_config);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cache_operations() {
        let config = MobileConfig {
            max_memory_mb: 1024, // Enough for cache
            ..Default::default()
        };
        let mut engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        // Load model to enable caching
        let mut weights = HashMap::new();
        weights.insert(
            "layer1".to_string(),
            Tensor::ones(&[5, 5]).expect("Failed to create tensor"),
        );
        engine.load_model(weights).expect("Failed to load model");

        // Test cache operations
        engine.clear_cache();
        engine.force_gc();

        // These should not panic
    }

    #[test]
    fn test_warm_up() {
        let config = MobileConfig::default();
        let mut engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        // Load model first
        let mut weights = HashMap::new();
        weights.insert(
            "embedding".to_string(),
            Tensor::ones(&[100, 512]).expect("Operation failed"),
        );
        weights.insert(
            "layer.0.weight".to_string(),
            Tensor::ones(&[512, 512]).expect("Operation failed"),
        );
        engine.load_model(weights).expect("Failed to load model");

        // Test warm-up
        let result = engine.warm_up();
        assert!(result.is_ok(), "Warm-up should succeed after model loading");
    }

    #[test]
    fn test_warm_up_without_model() {
        let config = MobileConfig::default();
        let mut engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        // Test warm-up without loading model (should fail)
        let result = engine.warm_up();
        assert!(result.is_err(), "Warm-up should fail without model");
    }

    #[test]
    fn test_set_performance_mode() {
        let config = MobileConfig::default();
        let mut engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        // Test all valid performance modes
        assert!(
            engine.set_performance_mode(0).is_ok(),
            "Power Saving mode should work"
        );
        assert!(
            engine.set_performance_mode(1).is_ok(),
            "Balanced mode should work"
        );
        assert!(
            engine.set_performance_mode(2).is_ok(),
            "High Performance mode should work"
        );

        // Test invalid mode
        assert!(
            engine.set_performance_mode(3).is_err(),
            "Invalid mode should return error"
        );
        assert!(
            engine.set_performance_mode(-1).is_err(),
            "Negative mode should return error"
        );
    }

    #[test]
    fn test_performance_mode_changes_config() {
        let config = MobileConfig {
            use_fp16: false,
            backend: crate::MobileBackend::CPU,
            ..Default::default()
        };
        let mut engine = MobileInferenceEngine::new(config).expect("Failed to create engine");

        // Set to high performance mode
        engine.set_performance_mode(2).expect("Operation failed");
        // In high performance mode, fp16 is disabled and the backend switches
        // to GPU; the config is internal to the engine, so the mode change is
        // exercised here rather than asserted directly.

        // Set to power saving mode
        engine.set_performance_mode(0).expect("Operation failed");
        // In power saving mode, fp16 should be enabled and backend should be CPU

        // The engine should still be functional after mode changes
        let _ = engine.get_stats(); // Verify stats are accessible
    }
}