torsh-distributed 0.1.2

Distributed training and inference for ToRSh
//! Memory management and optimization for 3D parallelism
//!
//! This module provides memory-efficient strategies including activation
//! checkpointing, disk offloading, and memory usage monitoring.
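//!
//! # Example
//!
//! A minimal end-to-end sketch (illustrative only and not compiled as a
//! doctest; assumes `ThreeDParallelismConfig` and `RankMapping` provide
//! `Default` or equivalent constructors):
//!
//! ```ignore
//! let config = ThreeDParallelismConfig::default();
//! let mapping = RankMapping::default();
//! let manager = MemoryManager::new(&config, &mapping)?;
//!
//! // Checkpoint during the forward pass, retrieve during the backward pass.
//! manager.store_activation(&activation, /* layer_idx */ 0, /* micro_batch_id */ 0).await?;
//! let restored = manager.retrieve_activation(0, 0).await?;
//! ```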

// Framework infrastructure - components designed for future use
#![allow(dead_code)]
use crate::{TorshDistributedError, TorshResult};
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use torsh_tensor::Tensor;

use super::config::{MemoryOptimizationStrategy, RankMapping, ThreeDParallelismConfig};

/// Memory manager for 3D parallelism operations
pub struct MemoryManager {
    /// Configuration
    config: ThreeDParallelismConfig,
    /// Rank mapping
    rank_mapping: RankMapping,
    /// Stored activations for gradient checkpointing
    stored_activations: Arc<Mutex<HashMap<ActivationKey, Tensor<f32>>>>,
    /// Memory usage statistics
    memory_stats: Arc<Mutex<MemoryUsageStats>>,
    /// Activation cache for recomputation
    activation_cache: Arc<Mutex<HashMap<String, CachedActivation>>>,
    /// Memory pool for efficient allocation
    memory_pool: Arc<Mutex<MemoryPool>>,
}

impl MemoryManager {
    /// Create new memory manager
    pub fn new(config: &ThreeDParallelismConfig, rank_mapping: &RankMapping) -> TorshResult<Self> {
        let stored_activations = Arc::new(Mutex::new(HashMap::new()));
        let memory_stats = Arc::new(Mutex::new(MemoryUsageStats::new()));
        let activation_cache = Arc::new(Mutex::new(HashMap::new()));
        let memory_pool = Arc::new(Mutex::new(MemoryPool::new(config.max_memory_per_device)));

        Ok(Self {
            config: config.clone(),
            rank_mapping: rank_mapping.clone(),
            stored_activations,
            memory_stats,
            activation_cache,
            memory_pool,
        })
    }

    /// Store activation for gradient checkpointing
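    ///
    /// A minimal call sketch (illustrative only; assumes an async context and
    /// a constructed `MemoryManager`, and is not compiled as a doctest):
    ///
    /// ```ignore
    /// // Checkpoint the layer-3 activation of micro-batch 0; whether it is
    /// // actually kept depends on the configured MemoryOptimizationStrategy.
    /// manager.store_activation(&activation, 3, 0).await?;
    /// ```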
    pub async fn store_activation(
        &self,
        activation: &Tensor<f32>,
        layer_idx: usize,
        micro_batch_id: usize,
    ) -> TorshResult<()> {
        let key = ActivationKey {
            layer_idx,
            micro_batch_id,
            rank: self.rank_mapping.global_rank,
        };

        // Track whether this call checkpointed the activation in memory, so
        // the usage statistics stay in sync with what `free_activations`
        // can later reclaim.
        let mut stored_in_memory = false;
        match self.config.memory_strategy {
            MemoryOptimizationStrategy::Basic => {
                // Store every activation in memory
                let mut activations = self
                    .stored_activations
                    .lock()
                    .expect("lock should not be poisoned");
                activations.insert(key, activation.clone());
                stored_in_memory = true;
            }
            MemoryOptimizationStrategy::Standard => {
                // Store every 2nd activation
                if layer_idx % 2 == 0 {
                    let mut activations = self
                        .stored_activations
                        .lock()
                        .expect("lock should not be poisoned");
                    activations.insert(key, activation.clone());
                    stored_in_memory = true;
                }
            }
            MemoryOptimizationStrategy::Aggressive => {
                // Store every 4th activation
                if layer_idx % 4 == 0 {
                    let mut activations = self
                        .stored_activations
                        .lock()
                        .expect("lock should not be poisoned");
                    activations.insert(key, activation.clone());
                    stored_in_memory = true;
                }
            }
            MemoryOptimizationStrategy::Extreme => {
                // Store minimal activations and prefer recomputation; the
                // checkpointed subset is offloaded rather than kept in memory
                if layer_idx % 8 == 0 {
                    self.store_to_disk(&key, activation).await?;
                }
            }
        }

        // Update memory statistics only for activations actually kept in memory
        if stored_in_memory {
            self.update_memory_usage(activation.numel() * std::mem::size_of::<f32>());
        }

        Ok(())
    }

    /// Retrieve stored activation
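    ///
    /// Lookup falls back in order: in-memory store, recomputation cache,
    /// disk (under the `Extreme` strategy), then full recomputation. A usage
    /// sketch (illustrative, not a compiled doctest):
    ///
    /// ```ignore
    /// let activation = manager.retrieve_activation(3, 0).await?;
    /// ```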
    pub async fn retrieve_activation(
        &self,
        layer_idx: usize,
        micro_batch_id: usize,
    ) -> TorshResult<Tensor<f32>> {
        let key = ActivationKey {
            layer_idx,
            micro_batch_id,
            rank: self.rank_mapping.global_rank,
        };

        // First check in-memory storage
        {
            let activations = self
                .stored_activations
                .lock()
                .expect("lock should not be poisoned");
            if let Some(activation) = activations.get(&key) {
                return Ok(activation.clone());
            }
        }

        // Check the recomputation cache, recording hit/miss statistics
        {
            let cache = self
                .activation_cache
                .lock()
                .expect("lock should not be poisoned");
            let cache_key = format!("{}_{}", layer_idx, micro_batch_id);
            if let Some(cached) = cache.get(&cache_key) {
                if !cached.is_expired() {
                    self.memory_stats
                        .lock()
                        .expect("lock should not be poisoned")
                        .cache_hits += 1;
                    return Ok(cached.tensor.clone());
                }
            }
        }
        self.memory_stats
            .lock()
            .expect("lock should not be poisoned")
            .cache_misses += 1;

        // Try to load from disk (for extreme memory strategy)
        if matches!(
            self.config.memory_strategy,
            MemoryOptimizationStrategy::Extreme
        ) {
            if let Ok(tensor) = self.load_from_disk(&key).await {
                return Ok(tensor);
            }
        }

        // If not found, recompute the activation
        self.recompute_activation(layer_idx, micro_batch_id).await
    }

    /// Store activation to disk
    async fn store_to_disk(&self, key: &ActivationKey, tensor: &Tensor<f32>) -> TorshResult<()> {
        // Simplified disk storage (would implement actual file I/O)
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;

        // Placeholder: keep the full tensor in the in-process cache as a
        // stand-in for real disk persistence (no memory is saved here yet)
        let cache_key = format!("disk_{}_{}", key.layer_idx, key.micro_batch_id);
        let mut cache = self
            .activation_cache
            .lock()
            .expect("lock should not be poisoned");
        cache.insert(
            cache_key,
            CachedActivation {
                tensor: tensor.clone(),
                timestamp: std::time::Instant::now(),
                location: StorageLocation::Disk,
            },
        );

        Ok(())
    }

    /// Load activation from disk
    async fn load_from_disk(&self, key: &ActivationKey) -> TorshResult<Tensor<f32>> {
        // Simplified disk loading
        tokio::time::sleep(tokio::time::Duration::from_millis(20)).await;

        let cache_key = format!("disk_{}_{}", key.layer_idx, key.micro_batch_id);
        let cache = self
            .activation_cache
            .lock()
            .expect("lock should not be poisoned");
        if let Some(cached) = cache.get(&cache_key) {
            Ok(cached.tensor.clone())
        } else {
            Err(TorshDistributedError::InternalError(
                "Activation not found on disk".to_string(),
            ))
        }
    }

    /// Recompute activation by forward pass
    async fn recompute_activation(
        &self,
        layer_idx: usize,
        micro_batch_id: usize,
    ) -> TorshResult<Tensor<f32>> {
        // Simplified recomputation (would run forward pass)
        let shape = [self.config.micro_batch_size, 512]; // Mock shape
        let tensor = Tensor::zeros(&shape, torsh_core::DeviceType::Cpu)?;

        // Cache the recomputed activation
        let cache_key = format!("recomputed_{}_{}", layer_idx, micro_batch_id);
        let mut cache = self
            .activation_cache
            .lock()
            .expect("lock should not be poisoned");
        cache.insert(
            cache_key,
            CachedActivation {
                tensor: tensor.clone(),
                timestamp: std::time::Instant::now(),
                location: StorageLocation::Memory,
            },
        );

        Ok(tensor)
    }

    /// Update memory usage statistics
    fn update_memory_usage(&self, additional_bytes: usize) {
        let mut stats = self
            .memory_stats
            .lock()
            .expect("lock should not be poisoned");
        stats.current_usage_bytes += additional_bytes;
        if stats.current_usage_bytes > stats.peak_usage_bytes {
            stats.peak_usage_bytes = stats.current_usage_bytes;
        }
        stats.total_allocations += 1;
    }

    /// Free stored activations to release memory
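    ///
    /// A call sketch (illustrative): drop everything older than the current
    /// micro-batch to bound the checkpoint footprint.
    ///
    /// ```ignore
    /// manager.free_activations(current_micro_batch);
    /// ```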
    pub fn free_activations(&self, before_micro_batch: usize) {
        let mut activations = self
            .stored_activations
            .lock()
            .expect("lock should not be poisoned");
        let keys_to_remove: Vec<_> = activations
            .keys()
            .filter(|k| k.micro_batch_id < before_micro_batch)
            .cloned()
            .collect();

        let mut freed_bytes = 0;
        let mut freed_count = 0u64;
        for key in keys_to_remove {
            if let Some(tensor) = activations.remove(&key) {
                freed_bytes += tensor.numel() * std::mem::size_of::<f32>();
                freed_count += 1;
            }
        }

        // Update statistics, counting one deallocation per freed tensor to
        // mirror the per-tensor counting in `update_memory_usage`
        let mut stats = self
            .memory_stats
            .lock()
            .expect("lock should not be poisoned");
        stats.current_usage_bytes = stats.current_usage_bytes.saturating_sub(freed_bytes);
        stats.total_deallocations += freed_count;
    }

    /// Optimize memory usage based on current strategy
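    ///
    /// Optimization only kicks in above 80% of the per-device budget; the
    /// returned `MemoryOptimizationResult` reports what was reclaimed. A
    /// sketch (illustrative, not a compiled doctest):
    ///
    /// ```ignore
    /// let result = manager.optimize_memory().await?;
    /// println!(
    ///     "freed {} bytes, {} activations offloaded",
    ///     result.bytes_freed, result.activations_moved_to_disk
    /// );
    /// ```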
    pub async fn optimize_memory(&self) -> TorshResult<MemoryOptimizationResult> {
        let current_usage = self.get_current_memory_usage();
        // `max_memory_per_device` is configured in GB; convert to bytes
        let max_memory = (self.config.max_memory_per_device * 1024.0 * 1024.0 * 1024.0) as usize;

        if current_usage > max_memory * 8 / 10 {
            // Memory usage > 80%, apply optimizations
            match self.config.memory_strategy {
                MemoryOptimizationStrategy::Basic => self.basic_memory_optimization().await,
                MemoryOptimizationStrategy::Standard => self.standard_memory_optimization().await,
                MemoryOptimizationStrategy::Aggressive => {
                    self.aggressive_memory_optimization().await
                }
                MemoryOptimizationStrategy::Extreme => self.extreme_memory_optimization().await,
            }
        } else {
            Ok(MemoryOptimizationResult {
                bytes_freed: 0,
                activations_moved_to_disk: 0,
                recomputation_overhead: 0.0,
            })
        }
    }

    /// Basic memory optimization
    async fn basic_memory_optimization(&self) -> TorshResult<MemoryOptimizationResult> {
        // Simple cleanup of old activations
        self.free_activations(self.current_micro_batch().saturating_sub(2));

        Ok(MemoryOptimizationResult {
            bytes_freed: 1024 * 1024, // Mock value
            activations_moved_to_disk: 0,
            recomputation_overhead: 0.1,
        })
    }

    /// Standard memory optimization
    async fn standard_memory_optimization(&self) -> TorshResult<MemoryOptimizationResult> {
        // More aggressive cleanup and some disk offloading
        self.free_activations(self.current_micro_batch().saturating_sub(1));

        // Move some activations to disk
        let moved_to_disk = self.move_activations_to_disk(10).await?;

        Ok(MemoryOptimizationResult {
            bytes_freed: 2 * 1024 * 1024,
            activations_moved_to_disk: moved_to_disk,
            recomputation_overhead: 0.2,
        })
    }

    /// Aggressive memory optimization
    async fn aggressive_memory_optimization(&self) -> TorshResult<MemoryOptimizationResult> {
        // Clear most activations and rely on recomputation
        self.clear_activation_cache().await?;
        let moved_to_disk = self.move_activations_to_disk(20).await?;

        Ok(MemoryOptimizationResult {
            bytes_freed: 5 * 1024 * 1024,
            activations_moved_to_disk: moved_to_disk,
            recomputation_overhead: 0.4,
        })
    }

    /// Extreme memory optimization
    async fn extreme_memory_optimization(&self) -> TorshResult<MemoryOptimizationResult> {
        // Maximum memory savings with high recomputation cost
        self.clear_all_memory_caches().await?;
        let moved_to_disk = self.move_activations_to_disk(50).await?;

        Ok(MemoryOptimizationResult {
            bytes_freed: 10 * 1024 * 1024,
            activations_moved_to_disk: moved_to_disk,
            recomputation_overhead: 0.8,
        })
    }

    /// Move activations to disk
    async fn move_activations_to_disk(&self, count: usize) -> TorshResult<usize> {
        // Snapshot the entries to move so that the same keys written to disk
        // are the ones removed afterwards (HashMap iteration order is not
        // stable across calls, so taking `count` keys twice could diverge)
        let to_move: Vec<(ActivationKey, Tensor<f32>)> = self
            .stored_activations
            .lock()
            .expect("lock should not be poisoned")
            .iter()
            .take(count)
            .map(|(k, v)| (k.clone(), v.clone()))
            .collect();

        for (key, tensor) in &to_move {
            self.store_to_disk(key, tensor).await?;
        }

        // Remove exactly the moved entries from memory
        let mut activations = self
            .stored_activations
            .lock()
            .expect("lock should not be poisoned");
        for (key, _) in &to_move {
            activations.remove(key);
        }

        Ok(to_move.len())
    }

    /// Clear activation cache
    async fn clear_activation_cache(&self) -> TorshResult<()> {
        let mut cache = self
            .activation_cache
            .lock()
            .expect("lock should not be poisoned");
        cache.clear();
        Ok(())
    }

    /// Clear all memory caches
    async fn clear_all_memory_caches(&self) -> TorshResult<()> {
        let mut activations = self
            .stored_activations
            .lock()
            .expect("lock should not be poisoned");
        activations.clear();

        let mut cache = self
            .activation_cache
            .lock()
            .expect("lock should not be poisoned");
        cache.clear();

        Ok(())
    }

    /// Get current memory usage in bytes
    pub fn get_current_memory_usage(&self) -> usize {
        let stats = self
            .memory_stats
            .lock()
            .expect("lock should not be poisoned");
        stats.current_usage_bytes
    }

    /// Get memory usage statistics
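    ///
    /// A reading sketch (illustrative, not a compiled doctest):
    ///
    /// ```ignore
    /// let stats = manager.get_memory_stats();
    /// println!("peak usage: {} bytes", stats.peak_usage_bytes);
    /// ```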
    pub fn get_memory_stats(&self) -> MemoryUsageStats {
        self.memory_stats
            .lock()
            .expect("lock should not be poisoned")
            .clone()
    }

    /// Get current micro-batch number
    fn current_micro_batch(&self) -> usize {
        5 // Mock current micro-batch
    }

    /// Allocate tensor from memory pool
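    ///
    /// Together with [`deallocate_tensor`](Self::deallocate_tensor) this
    /// forms a reuse round trip. A sketch (illustrative):
    ///
    /// ```ignore
    /// let t = manager.allocate_tensor(&[8, 512])?;
    /// // ... use t ...
    /// manager.deallocate_tensor(t); // returned to the shape-keyed free list
    /// ```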
    pub fn allocate_tensor(&self, shape: &[usize]) -> TorshResult<Tensor<f32>> {
        let mut pool = self
            .memory_pool
            .lock()
            .expect("lock should not be poisoned");
        pool.allocate_tensor(shape)
    }

    /// Return tensor to memory pool
    pub fn deallocate_tensor(&self, tensor: Tensor<f32>) {
        let mut pool = self
            .memory_pool
            .lock()
            .expect("lock should not be poisoned");
        pool.deallocate_tensor(tensor);
    }
}

/// Key for storing activations
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct ActivationKey {
    layer_idx: usize,
    micro_batch_id: usize,
    rank: usize,
}

/// Cached activation with metadata
#[derive(Debug, Clone)]
struct CachedActivation {
    tensor: Tensor<f32>,
    timestamp: std::time::Instant,
    location: StorageLocation,
}

impl CachedActivation {
    /// Check if cached activation has expired
    fn is_expired(&self) -> bool {
        self.timestamp.elapsed() > std::time::Duration::from_secs(30)
    }
}

/// Storage location for activations
#[derive(Debug, Clone, Copy)]
enum StorageLocation {
    Memory,
    Disk,
    Gpu,
}

/// Memory usage statistics
#[derive(Debug, Clone)]
pub struct MemoryUsageStats {
    pub current_usage_bytes: usize,
    pub peak_usage_bytes: usize,
    pub total_allocations: u64,
    pub total_deallocations: u64,
    pub cache_hits: u64,
    pub cache_misses: u64,
}

impl MemoryUsageStats {
    fn new() -> Self {
        Self {
            current_usage_bytes: 0,
            peak_usage_bytes: 0,
            total_allocations: 0,
            total_deallocations: 0,
            cache_hits: 0,
            cache_misses: 0,
        }
    }
}
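
// A hypothetical convenience accessor (an illustrative sketch, not part of
// the crate's published API): derives a hit rate from the raw counters.
impl MemoryUsageStats {
    /// Fraction of cache lookups that hit, in `[0.0, 1.0]`.
    pub fn cache_hit_rate(&self) -> f64 {
        let total = self.cache_hits + self.cache_misses;
        if total == 0 {
            0.0
        } else {
            self.cache_hits as f64 / total as f64
        }
    }
}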

/// Result of memory optimization
#[derive(Debug, Clone)]
pub struct MemoryOptimizationResult {
    pub bytes_freed: usize,
    pub activations_moved_to_disk: usize,
    pub recomputation_overhead: f64,
}

/// Memory pool for efficient tensor allocation
#[derive(Debug)]
struct MemoryPool {
    max_memory_gb: f32,
    allocated_tensors: HashMap<String, Vec<Tensor<f32>>>,
    total_allocated_bytes: usize,
}

impl MemoryPool {
    fn new(max_memory_gb: f32) -> Self {
        Self {
            max_memory_gb,
            allocated_tensors: HashMap::new(),
            total_allocated_bytes: 0,
        }
    }

    fn allocate_tensor(&mut self, shape: &[usize]) -> TorshResult<Tensor<f32>> {
        let shape_key = format!("{:?}", shape);
        let tensor_size = shape.iter().product::<usize>() * std::mem::size_of::<f32>();

        // Check if we have a pre-allocated tensor of this shape
        if let Some(tensors) = self.allocated_tensors.get_mut(&shape_key) {
            if let Some(tensor) = tensors.pop() {
                return Ok(tensor);
            }
        }

        // Allocate new tensor if within memory limits
        let max_bytes = (self.max_memory_gb * 1024.0 * 1024.0 * 1024.0) as usize;
        if self.total_allocated_bytes + tensor_size <= max_bytes {
            let tensor = Tensor::zeros(shape, torsh_core::DeviceType::Cpu)?;
            self.total_allocated_bytes += tensor_size;
            Ok(tensor)
        } else {
            Err(TorshDistributedError::invalid_argument(
                "memory_allocation",
                "Memory pool exhausted - requested memory would exceed limits",
                "within available memory limits",
            ))
        }
    }

    fn deallocate_tensor(&mut self, tensor: Tensor<f32>) {
        let shape_key = format!("{:?}", tensor.shape().dims());
        let tensors = self.allocated_tensors.entry(shape_key).or_default();

        // Keep a bounded per-shape free list for reuse; otherwise drop the
        // tensor and return its bytes to the pool's budget
        if tensors.len() < 10 {
            tensors.push(tensor);
        } else {
            let tensor_size = tensor.numel() * std::mem::size_of::<f32>();
            self.total_allocated_bytes = self.total_allocated_bytes.saturating_sub(tensor_size);
        }
    }
}
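
// A minimal test sketch for the pool's shape-keyed reuse and budget
// accounting (an illustrative addition; assumes `Tensor::zeros` succeeds on
// the CPU device).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn pool_reuses_tensors_of_the_same_shape() {
        let mut pool = MemoryPool::new(1.0); // 1 GB budget
        let t = pool
            .allocate_tensor(&[4, 4])
            .expect("allocation should fit the budget");
        let bytes_after_alloc = pool.total_allocated_bytes;

        // Returning the tensor keeps it on the shape-keyed free list...
        pool.deallocate_tensor(t);
        // ...so the next request for the same shape reuses it without
        // increasing the pool's byte count.
        let _reused = pool
            .allocate_tensor(&[4, 4])
            .expect("should be served from the free list");
        assert_eq!(pool.total_allocated_bytes, bytes_after_alloc);
    }
}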