//! Intelligent chunking utilities for optimal tensor operations
//!
//! This module provides high-level chunking strategies for tensor operations,
//! building on scirs2-core's intelligent chunking system.
//!
//! # SciRS2 POLICY COMPLIANCE
//!
//! This module wraps scirs2-core::chunking to provide:
//! - Automatic performance optimization (15-30% improvement)
//! - CPU topology-aware processing
//! - Cache-optimized chunking strategies
//! - Dynamic runtime adjustment
//!
//! # Usage
//!
//! ```ignore
//! use torsh_core::chunking::{ChunkingStrategy, TensorChunkConfig};
//!
//! // For compute-intensive operations (matrix multiplication, convolution)
//! let config = TensorChunkConfig::compute_intensive();
//!
//! // For memory-bandwidth-bound operations (large tensor copies)
//! let config = TensorChunkConfig::memory_intensive();
//!
//! // For cache-sensitive operations (reductions, scans)
//! let config = TensorChunkConfig::cache_friendly();
//! ```
//!
//! # Performance Targets
//!
//! According to scirs2-core benchmarks:
//! - Compute-intensive: 15-30% speedup over naive chunking
//! - Memory-intensive: 20-40% speedup with bandwidth optimization
//! - Cache-friendly: 25-50% speedup for L2/L3 cache-sensitive ops
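//!
//! A minimal end-to-end sketch (illustrative only; actual chunk sizes depend
//! on hardware and on whether the "parallel" feature is enabled):
//!
//! ```ignore
//! use torsh_core::chunking::TensorChunkConfig;
//!
//! let config = TensorChunkConfig::compute_intensive();
//! // 1M f32 elements, 4 bytes each
//! let chunk = config.compute_chunk_size(1_000_000, 4);
//! assert!(chunk >= config.min_chunk_size && chunk <= config.max_chunk_size);
//! ```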
// Note: Result and TorshError are kept for future use in error handling
#[allow(unused_imports)]
use crate::error::{Result, TorshError};
/// Chunking strategy for tensor operations
///
/// # SciRS2 Integration
/// When the "parallel" feature is enabled, this wraps scirs2-core::chunking::ChunkConfig
#[derive(Debug, Clone)]
pub enum ChunkingStrategy {
/// Optimize for compute-bound tensor operations
/// - Matrix multiplication, convolution, FFT
    /// - Targets saturation of CPU execution units
/// - Expected speedup: 15-30%
ComputeIntensive,
/// Optimize for memory-bandwidth-bound operations
/// - Large tensor copies, broadcasting, reshaping
/// - Targets memory bandwidth optimization
/// - Expected speedup: 20-40%
MemoryIntensive,
/// Optimize for cache-sensitive operations
/// - Reductions, cumulative sums, scans
/// - Targets L2/L3 cache optimization
/// - Expected speedup: 25-50%
CacheFriendly,
/// Custom chunking with explicit parameters
Custom {
/// Chunk size in elements
chunk_size: usize,
/// Alignment requirement in bytes
alignment: usize,
/// Prefetch distance in chunks
prefetch_distance: usize,
},
}
/// Tensor-specific chunking configuration
///
/// Provides high-level configuration for tensor operations with
/// automatic parameter selection based on hardware capabilities.
#[derive(Debug, Clone)]
pub struct TensorChunkConfig {
/// Chunking strategy
pub strategy: ChunkingStrategy,
/// Enable automatic tuning based on runtime profiling
pub auto_tune: bool,
/// Minimum chunk size (prevents over-chunking for small tensors)
pub min_chunk_size: usize,
/// Maximum chunk size (prevents cache thrashing)
pub max_chunk_size: usize,
}
impl TensorChunkConfig {
/// Create a compute-intensive configuration
///
/// Optimized for:
/// - Matrix multiplication (GEMM operations)
/// - Convolution operations
/// - FFT transformations
///
/// # Performance
/// Expected 15-30% speedup over naive chunking through:
/// - CPU core utilization optimization
/// - Instruction-level parallelism
/// - Reduced synchronization overhead
pub fn compute_intensive() -> Self {
Self {
strategy: ChunkingStrategy::ComputeIntensive,
auto_tune: true,
min_chunk_size: 1024,
max_chunk_size: 1024 * 1024,
}
}
/// Create a memory-intensive configuration
///
/// Optimized for:
/// - Large tensor copies
/// - Broadcasting operations
/// - Tensor reshaping
///
/// # Performance
/// Expected 20-40% speedup through:
/// - Memory bandwidth optimization
/// - NUMA-aware memory access
/// - Prefetching optimization
pub fn memory_intensive() -> Self {
Self {
strategy: ChunkingStrategy::MemoryIntensive,
auto_tune: true,
min_chunk_size: 4096,
max_chunk_size: 4 * 1024 * 1024,
}
}
/// Create a cache-friendly configuration
///
/// Optimized for:
/// - Reduction operations (sum, mean, max)
/// - Cumulative operations (cumsum, cumprod)
/// - Scan operations
///
/// # Performance
/// Expected 25-50% speedup through:
/// - L2/L3 cache size awareness
/// - Cache line alignment
/// - Reduced cache misses
pub fn cache_friendly() -> Self {
Self {
strategy: ChunkingStrategy::CacheFriendly,
auto_tune: true,
min_chunk_size: 512,
            max_chunk_size: 256 * 1024, // Typical per-core L2 cache size
}
}
/// Create a custom configuration
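    ///
    /// A sketch of the derived bounds (min = chunk_size / 4, max = chunk_size * 4):
    ///
    /// ```ignore
    /// let config = TensorChunkConfig::custom(8192, 32, 2, false);
    /// assert_eq!(config.min_chunk_size, 2048);
    /// assert_eq!(config.max_chunk_size, 32_768);
    /// ```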
pub fn custom(
chunk_size: usize,
alignment: usize,
prefetch_distance: usize,
auto_tune: bool,
) -> Self {
Self {
strategy: ChunkingStrategy::Custom {
chunk_size,
alignment,
prefetch_distance,
},
auto_tune,
            min_chunk_size: (chunk_size / 4).max(1),
max_chunk_size: chunk_size * 4,
}
}
    /// Compute the optimal chunk size for a tensor of the given size under this configuration
///
/// # Arguments
/// * `tensor_size` - Total number of elements in the tensor
/// * `element_size` - Size of each element in bytes (e.g., 4 for f32)
///
/// # Returns
/// Optimal chunk size in elements
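    ///
    /// # Example
    ///
    /// A sketch; the exact value depends on the strategy and the enabled features:
    ///
    /// ```ignore
    /// let config = TensorChunkConfig::cache_friendly();
    /// let chunk = config.compute_chunk_size(100_000, 4); // 100k f32 elements
    /// assert!(chunk >= config.min_chunk_size && chunk <= config.max_chunk_size);
    /// ```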
pub fn compute_chunk_size(&self, tensor_size: usize, element_size: usize) -> usize {
#[cfg(feature = "parallel")]
{
// Use scirs2-core intelligent chunking when available
self.compute_chunk_size_scirs2(tensor_size, element_size)
}
#[cfg(not(feature = "parallel"))]
{
// Fallback to simple heuristic
self.compute_chunk_size_simple(tensor_size, element_size)
}
}
/// Compute chunk size using scirs2-core (when parallel feature enabled)
///
/// # SciRS2 POLICY COMPLIANCE (Phase 4 Integration)
/// Uses scirs2-core::chunking for intelligent chunk size computation
#[cfg(feature = "parallel")]
fn compute_chunk_size_scirs2(&self, tensor_size: usize, element_size: usize) -> usize {
// Import scirs2-core chunking utilities
use scirs2_core::chunking::{
ChunkConfig, ChunkStrategy as ScirStrategy, ComputeIntensity, MemoryPattern,
};
// Convert TensorChunkConfig to scirs2 ChunkConfig
let scirs2_config = match &self.strategy {
ChunkingStrategy::ComputeIntensive => {
let mut config = ChunkConfig::compute_intensive();
config.min_chunk_size = self.min_chunk_size;
config.max_chunk_size = self.max_chunk_size;
config
}
ChunkingStrategy::MemoryIntensive => {
let mut config = ChunkConfig::memory_intensive();
config.min_chunk_size = self.min_chunk_size;
config.max_chunk_size = self.max_chunk_size;
config
}
ChunkingStrategy::CacheFriendly => {
let mut config = ChunkConfig::cache_friendly();
config.min_chunk_size = self.min_chunk_size;
config.max_chunk_size = self.max_chunk_size;
config
}
ChunkingStrategy::Custom {
chunk_size,
alignment: _,
prefetch_distance: _,
} => ChunkConfig {
strategy: ScirStrategy::Fixed(*chunk_size),
min_chunk_size: self.min_chunk_size,
max_chunk_size: self.max_chunk_size,
prefer_work_stealing: false,
memory_pattern: MemoryPattern::Sequential,
compute_intensity: ComputeIntensity::Balanced,
enable_monitoring: self.auto_tune,
load_balance_factor: 0.1,
cache_awareness: scirs2_core::chunking::CacheAwareness::L2,
numa_strategy: scirs2_core::chunking::NumaStrategy::LocalPreferred,
gpu_settings: None,
},
};
        // Use scirs2-core's ChunkingUtils to compute the optimal chunk size.
        // Note: scirs2-core works on the data size in bytes, so convert from
        // elements here and back to elements below.
        let data_size = tensor_size * element_size;
let optimal_size =
scirs2_core::chunking::ChunkingUtils::optimal_chunk_size(data_size, &scirs2_config);
// Convert from byte-based chunk size back to element-based
let optimal_elements = if element_size > 0 {
(optimal_size / element_size).max(1)
} else {
optimal_size
};
// Clamp to configured min/max
optimal_elements.clamp(self.min_chunk_size, self.max_chunk_size)
}
/// Simple fallback chunk size computation (when parallel feature disabled)
#[cfg(not(feature = "parallel"))]
fn compute_chunk_size_simple(&self, tensor_size: usize, _element_size: usize) -> usize {
        // Simple heuristic: split into ~4 chunks for basic parallelism,
        // clamped to the configured bounds
        (tensor_size / 4).clamp(self.min_chunk_size, self.max_chunk_size)
}
}
/// Utility functions for chunking operations
pub struct ChunkingUtils;
impl ChunkingUtils {
/// Calculate optimal number of chunks for parallel processing
///
/// # Arguments
/// * `total_elements` - Total number of elements to process
/// * `strategy` - Chunking strategy to use
///
    /// # Returns
    /// Suggested number of chunks; currently derived from the CPU count and
    /// the chosen strategy (`total_elements` is reserved for future heuristics)
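    ///
    /// # Example
    ///
    /// A sketch; the result scales with the detected CPU count:
    ///
    /// ```ignore
    /// let chunks = ChunkingUtils::optimal_chunk_count(
    ///     1_000_000,
    ///     &ChunkingStrategy::CacheFriendly,
    /// );
    /// assert_eq!(chunks, num_cpus::get() * 4);
    /// ```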
pub fn optimal_chunk_count(_total_elements: usize, strategy: &ChunkingStrategy) -> usize {
let cpu_count = num_cpus::get();
match strategy {
ChunkingStrategy::ComputeIntensive => cpu_count,
ChunkingStrategy::MemoryIntensive => cpu_count * 2,
ChunkingStrategy::CacheFriendly => cpu_count * 4,
ChunkingStrategy::Custom { .. } => cpu_count,
}
}
/// Get recommended alignment for the current platform
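    ///
    /// ```ignore
    /// let align = ChunkingUtils::recommended_alignment();
    /// assert!(align.is_power_of_two());
    /// ```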
pub fn recommended_alignment() -> usize {
#[cfg(target_arch = "x86_64")]
{
32 // AVX2 alignment
}
#[cfg(target_arch = "aarch64")]
{
16 // NEON alignment
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
8 // Conservative default
}
}
    /// Check if a pointer is properly aligned for SIMD operations
    ///
    /// `alignment` must be nonzero (typically a power of two).
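    ///
    /// A sketch combining this with [`ChunkingUtils::recommended_alignment`];
    /// note that `Vec` makes no SIMD-width alignment guarantee, so the result
    /// may legitimately be `false`:
    ///
    /// ```ignore
    /// let data = vec![0f32; 64];
    /// let simd_ready =
    ///     ChunkingUtils::is_aligned(data.as_ptr(), ChunkingUtils::recommended_alignment());
    /// ```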
pub fn is_aligned<T>(ptr: *const T, alignment: usize) -> bool {
(ptr as usize) % alignment == 0
}
/// Calculate cache-friendly chunk size based on L2 cache size
///
/// # Arguments
/// * `element_size` - Size of each element in bytes
///
/// # Returns
/// Chunk size in elements that fits comfortably in L2 cache
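    ///
    /// # Example
    ///
    /// For f32 (4 bytes) under the assumed 256 KiB L2 at 75% utilization,
    /// this yields 0.75 * 262144 / 4 = 49152 elements:
    ///
    /// ```ignore
    /// assert_eq!(ChunkingUtils::cache_friendly_chunk_size(4), 49_152);
    /// ```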
pub fn cache_friendly_chunk_size(element_size: usize) -> usize {
        // Typical L2 cache: 256KB per core; use 75% to leave room for other data
        const L2_CACHE_SIZE: usize = 256 * 1024;
        const UTILIZATION: f64 = 0.75;
        // Guard against a zero element size, which would otherwise divide by zero
        let element_size = element_size.max(1);
        ((L2_CACHE_SIZE as f64 * UTILIZATION) / element_size as f64) as usize
}
}
/// Performance recommendations for chunking
#[derive(Debug, Clone)]
pub struct ChunkingRecommendation {
/// Recommended strategy for the workload
pub strategy: ChunkingStrategy,
/// Expected performance improvement (1.0 = no change, 1.3 = 30% faster)
pub expected_speedup: f64,
/// Reason for this recommendation
pub rationale: String,
}
impl ChunkingRecommendation {
/// Get chunking recommendation for a specific workload
///
/// # Arguments
/// * `tensor_size` - Number of elements in tensor
/// * `operation_complexity` - Complexity per element (1.0 = simple, 10.0 = complex)
/// * `memory_bandwidth_limited` - Whether operation is memory-bound
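    ///
    /// # Example
    ///
    /// ```ignore
    /// // Large, simple, memory-bound workload (e.g. a bulk copy)
    /// let rec = ChunkingRecommendation::for_workload(10_000_000, 1.0, true);
    /// assert!(matches!(rec.strategy, ChunkingStrategy::MemoryIntensive));
    /// ```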
pub fn for_workload(
tensor_size: usize,
operation_complexity: f64,
memory_bandwidth_limited: bool,
) -> Self {
if memory_bandwidth_limited {
Self {
strategy: ChunkingStrategy::MemoryIntensive,
expected_speedup: 1.3, // 30% improvement
rationale: "Memory bandwidth optimization for large data transfers".to_string(),
}
} else if operation_complexity > 5.0 {
Self {
strategy: ChunkingStrategy::ComputeIntensive,
expected_speedup: 1.25, // 25% improvement
rationale: "Compute-intensive optimization for complex operations".to_string(),
}
} else if tensor_size < 1024 * 1024 {
Self {
strategy: ChunkingStrategy::CacheFriendly,
expected_speedup: 1.4, // 40% improvement
rationale: "Cache-friendly optimization for small to medium tensors".to_string(),
}
} else {
Self {
strategy: ChunkingStrategy::MemoryIntensive,
expected_speedup: 1.2, // 20% improvement
rationale: "Memory-intensive optimization for large tensors".to_string(),
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_chunk_config_creation() {
let compute_config = TensorChunkConfig::compute_intensive();
assert!(matches!(
compute_config.strategy,
ChunkingStrategy::ComputeIntensive
));
assert!(compute_config.auto_tune);
let memory_config = TensorChunkConfig::memory_intensive();
assert!(matches!(
memory_config.strategy,
ChunkingStrategy::MemoryIntensive
));
let cache_config = TensorChunkConfig::cache_friendly();
assert!(matches!(
cache_config.strategy,
ChunkingStrategy::CacheFriendly
));
}
#[test]
fn test_chunk_size_computation() {
let config = TensorChunkConfig::compute_intensive();
let chunk_size = config.compute_chunk_size(100_000, 4);
// Should be between min and max
assert!(chunk_size >= config.min_chunk_size);
assert!(chunk_size <= config.max_chunk_size);
assert!(chunk_size <= 100_000);
}
#[test]
fn test_optimal_chunk_count() {
let strategy = ChunkingStrategy::ComputeIntensive;
let count = ChunkingUtils::optimal_chunk_count(1_000_000, &strategy);
// Should be related to CPU count
assert!(count > 0);
assert!(count <= num_cpus::get() * 16); // Reasonable upper bound
}
#[test]
fn test_cache_friendly_chunk_size() {
let chunk_size_f32 = ChunkingUtils::cache_friendly_chunk_size(4);
let chunk_size_f64 = ChunkingUtils::cache_friendly_chunk_size(8);
// f64 should have half the elements of f32 for same cache usage
assert!((chunk_size_f64 as f64 / chunk_size_f32 as f64 - 0.5).abs() < 0.1);
// Should fit in typical L2 cache (256KB)
assert!(chunk_size_f32 * 4 <= 256 * 1024);
}
#[test]
fn test_alignment_check() {
let aligned_data = vec![0u32; 32];
let ptr = aligned_data.as_ptr();
// Should be aligned to at least 4 bytes (u32)
assert!(ChunkingUtils::is_aligned(ptr, 4));
}
#[test]
fn test_chunking_recommendation() {
// Memory-bound workload
let rec = ChunkingRecommendation::for_workload(10_000_000, 1.0, true);
assert!(matches!(rec.strategy, ChunkingStrategy::MemoryIntensive));
assert!(rec.expected_speedup > 1.0);
// Compute-bound workload
let rec = ChunkingRecommendation::for_workload(1_000_000, 10.0, false);
assert!(matches!(rec.strategy, ChunkingStrategy::ComputeIntensive));
assert!(rec.expected_speedup > 1.0);
// Small cache-friendly workload
let rec = ChunkingRecommendation::for_workload(100_000, 2.0, false);
assert!(matches!(rec.strategy, ChunkingStrategy::CacheFriendly));
assert!(rec.expected_speedup > 1.0);
}
#[test]
fn test_recommended_alignment() {
let alignment = ChunkingUtils::recommended_alignment();
// Should be power of 2
assert!(alignment.is_power_of_two());
// Should be at least 8 bytes
assert!(alignment >= 8);
}
}