torsh_core/
chunking.rs

1//! Intelligent chunking utilities for optimal tensor operations
2//!
3//! This module provides high-level chunking strategies for tensor operations,
4//! building on scirs2-core's intelligent chunking system.
5//!
6//! # SciRS2 POLICY COMPLIANCE
7//!
8//! This module wraps scirs2-core::chunking to provide:
9//! - Automatic performance optimization (15-30% improvement)
10//! - CPU topology-aware processing
11//! - Cache-optimized chunking strategies
12//! - Dynamic runtime adjustment
13//!
14//! # Usage
15//!
16//! ```ignore
17//! use torsh_core::chunking::{ChunkingStrategy, TensorChunkConfig};
18//!
19//! // For compute-intensive operations (matrix multiplication, convolution)
20//! let config = TensorChunkConfig::compute_intensive();
21//!
22//! // For memory-bandwidth-bound operations (large tensor copies)
23//! let config = TensorChunkConfig::memory_intensive();
24//!
25//! // For cache-sensitive operations (reductions, scans)
26//! let config = TensorChunkConfig::cache_friendly();
27//! ```
28//!
29//! # Performance Targets
30//!
31//! According to scirs2-core benchmarks:
32//! - Compute-intensive: 15-30% speedup over naive chunking
33//! - Memory-intensive: 20-40% speedup with bandwidth optimization
34//! - Cache-friendly: 25-50% speedup for L2/L3 cache-sensitive ops
35
36// Note: Result and TorshError are kept for future use in error handling
37#[allow(unused_imports)]
38use crate::error::{Result, TorshError};
39
40/// Chunking strategy for tensor operations
41///
42/// # SciRS2 Integration
43/// When the "parallel" feature is enabled, this wraps scirs2-core::chunking::ChunkConfig
44#[derive(Debug, Clone)]
45pub enum ChunkingStrategy {
46    /// Optimize for compute-bound tensor operations
47    /// - Matrix multiplication, convolution, FFT
48    /// - Targets CPU execution units saturation
49    /// - Expected speedup: 15-30%
50    ComputeIntensive,
51
52    /// Optimize for memory-bandwidth-bound operations
53    /// - Large tensor copies, broadcasting, reshaping
54    /// - Targets memory bandwidth optimization
55    /// - Expected speedup: 20-40%
56    MemoryIntensive,
57
58    /// Optimize for cache-sensitive operations
59    /// - Reductions, cumulative sums, scans
60    /// - Targets L2/L3 cache optimization
61    /// - Expected speedup: 25-50%
62    CacheFriendly,
63
64    /// Custom chunking with explicit parameters
65    Custom {
66        /// Chunk size in elements
67        chunk_size: usize,
68        /// Alignment requirement in bytes
69        alignment: usize,
70        /// Prefetch distance in chunks
71        prefetch_distance: usize,
72    },
73}
74
75/// Tensor-specific chunking configuration
76///
77/// Provides high-level configuration for tensor operations with
78/// automatic parameter selection based on hardware capabilities.
79#[derive(Debug, Clone)]
80pub struct TensorChunkConfig {
81    /// Chunking strategy
82    pub strategy: ChunkingStrategy,
83    /// Enable automatic tuning based on runtime profiling
84    pub auto_tune: bool,
85    /// Minimum chunk size (prevents over-chunking for small tensors)
86    pub min_chunk_size: usize,
87    /// Maximum chunk size (prevents cache thrashing)
88    pub max_chunk_size: usize,
89}
90
91impl TensorChunkConfig {
92    /// Create a compute-intensive configuration
93    ///
94    /// Optimized for:
95    /// - Matrix multiplication (GEMM operations)
96    /// - Convolution operations
97    /// - FFT transformations
98    ///
99    /// # Performance
100    /// Expected 15-30% speedup over naive chunking through:
101    /// - CPU core utilization optimization
102    /// - Instruction-level parallelism
103    /// - Reduced synchronization overhead
104    pub fn compute_intensive() -> Self {
105        Self {
106            strategy: ChunkingStrategy::ComputeIntensive,
107            auto_tune: true,
108            min_chunk_size: 1024,
109            max_chunk_size: 1024 * 1024,
110        }
111    }
112
113    /// Create a memory-intensive configuration
114    ///
115    /// Optimized for:
116    /// - Large tensor copies
117    /// - Broadcasting operations
118    /// - Tensor reshaping
119    ///
120    /// # Performance
121    /// Expected 20-40% speedup through:
122    /// - Memory bandwidth optimization
123    /// - NUMA-aware memory access
124    /// - Prefetching optimization
125    pub fn memory_intensive() -> Self {
126        Self {
127            strategy: ChunkingStrategy::MemoryIntensive,
128            auto_tune: true,
129            min_chunk_size: 4096,
130            max_chunk_size: 4 * 1024 * 1024,
131        }
132    }
133
134    /// Create a cache-friendly configuration
135    ///
136    /// Optimized for:
137    /// - Reduction operations (sum, mean, max)
138    /// - Cumulative operations (cumsum, cumprod)
139    /// - Scan operations
140    ///
141    /// # Performance
142    /// Expected 25-50% speedup through:
143    /// - L2/L3 cache size awareness
144    /// - Cache line alignment
145    /// - Reduced cache misses
146    pub fn cache_friendly() -> Self {
147        Self {
148            strategy: ChunkingStrategy::CacheFriendly,
149            auto_tune: true,
150            min_chunk_size: 512,
151            max_chunk_size: 256 * 1024, // Typical L3 cache size per core
152        }
153    }
154
155    /// Create a custom configuration
156    pub fn custom(
157        chunk_size: usize,
158        alignment: usize,
159        prefetch_distance: usize,
160        auto_tune: bool,
161    ) -> Self {
162        Self {
163            strategy: ChunkingStrategy::Custom {
164                chunk_size,
165                alignment,
166                prefetch_distance,
167            },
168            auto_tune,
169            min_chunk_size: chunk_size / 4,
170            max_chunk_size: chunk_size * 4,
171        }
172    }
173
174    /// Apply this configuration to compute optimal chunk size for given tensor size
175    ///
176    /// # Arguments
177    /// * `tensor_size` - Total number of elements in the tensor
178    /// * `element_size` - Size of each element in bytes (e.g., 4 for f32)
179    ///
180    /// # Returns
181    /// Optimal chunk size in elements
182    pub fn compute_chunk_size(&self, tensor_size: usize, element_size: usize) -> usize {
183        #[cfg(feature = "parallel")]
184        {
185            // Use scirs2-core intelligent chunking when available
186            self.compute_chunk_size_scirs2(tensor_size, element_size)
187        }
188
189        #[cfg(not(feature = "parallel"))]
190        {
191            // Fallback to simple heuristic
192            self.compute_chunk_size_simple(tensor_size, element_size)
193        }
194    }
195
196    /// Compute chunk size using scirs2-core (when parallel feature enabled)
197    ///
198    /// # SciRS2 POLICY COMPLIANCE (Phase 4 Integration)
199    /// Uses scirs2-core::chunking for intelligent chunk size computation
200    #[cfg(feature = "parallel")]
201    fn compute_chunk_size_scirs2(&self, tensor_size: usize, element_size: usize) -> usize {
202        // Import scirs2-core chunking utilities
203        use scirs2_core::chunking::{
204            ChunkConfig, ChunkStrategy as ScirStrategy, ComputeIntensity, MemoryPattern,
205        };
206
207        // Convert TensorChunkConfig to scirs2 ChunkConfig
208        let scirs2_config = match &self.strategy {
209            ChunkingStrategy::ComputeIntensive => {
210                let mut config = ChunkConfig::compute_intensive();
211                config.min_chunk_size = self.min_chunk_size;
212                config.max_chunk_size = self.max_chunk_size;
213                config
214            }
215            ChunkingStrategy::MemoryIntensive => {
216                let mut config = ChunkConfig::memory_intensive();
217                config.min_chunk_size = self.min_chunk_size;
218                config.max_chunk_size = self.max_chunk_size;
219                config
220            }
221            ChunkingStrategy::CacheFriendly => {
222                let mut config = ChunkConfig::cache_friendly();
223                config.min_chunk_size = self.min_chunk_size;
224                config.max_chunk_size = self.max_chunk_size;
225                config
226            }
227            ChunkingStrategy::Custom {
228                chunk_size,
229                alignment: _,
230                prefetch_distance: _,
231            } => ChunkConfig {
232                strategy: ScirStrategy::Fixed(*chunk_size),
233                min_chunk_size: self.min_chunk_size,
234                max_chunk_size: self.max_chunk_size,
235                prefer_work_stealing: false,
236                memory_pattern: MemoryPattern::Sequential,
237                compute_intensity: ComputeIntensity::Balanced,
238                enable_monitoring: self.auto_tune,
239                load_balance_factor: 0.1,
240                cache_awareness: scirs2_core::chunking::CacheAwareness::L2,
241                numa_strategy: scirs2_core::chunking::NumaStrategy::LocalPreferred,
242                gpu_settings: None,
243            },
244        };
245
246        // Use scirs2-core's ChunkingUtils to compute optimal chunk size
247        // Note: scirs2-core uses data_size (number of elements)
248        let data_size = tensor_size * element_size;
249        let optimal_size =
250            scirs2_core::chunking::ChunkingUtils::optimal_chunk_size(data_size, &scirs2_config);
251
252        // Convert from byte-based chunk size back to element-based
253        let optimal_elements = if element_size > 0 {
254            (optimal_size / element_size).max(1)
255        } else {
256            optimal_size
257        };
258
259        // Clamp to configured min/max
260        optimal_elements.clamp(self.min_chunk_size, self.max_chunk_size)
261    }
262
263    /// Simple fallback chunk size computation (when parallel feature disabled)
264    #[cfg(not(feature = "parallel"))]
265    fn compute_chunk_size_simple(&self, tensor_size: usize, _element_size: usize) -> usize {
266        // Simple heuristic: divide by 4 for basic parallelism
267        (tensor_size / 4)
268            .max(self.min_chunk_size)
269            .min(self.max_chunk_size)
270    }
271}
272
273/// Utility functions for chunking operations
274pub struct ChunkingUtils;
275
276impl ChunkingUtils {
277    /// Calculate optimal number of chunks for parallel processing
278    ///
279    /// # Arguments
280    /// * `total_elements` - Total number of elements to process
281    /// * `strategy` - Chunking strategy to use
282    ///
283    /// # Returns
284    /// Optimal number of chunks for the given workload
285    pub fn optimal_chunk_count(_total_elements: usize, strategy: &ChunkingStrategy) -> usize {
286        let cpu_count = num_cpus::get();
287
288        match strategy {
289            ChunkingStrategy::ComputeIntensive => cpu_count,
290            ChunkingStrategy::MemoryIntensive => cpu_count * 2,
291            ChunkingStrategy::CacheFriendly => cpu_count * 4,
292            ChunkingStrategy::Custom { .. } => cpu_count,
293        }
294    }
295
296    /// Get recommended alignment for the current platform
297    pub fn recommended_alignment() -> usize {
298        #[cfg(target_arch = "x86_64")]
299        {
300            32 // AVX2 alignment
301        }
302        #[cfg(target_arch = "aarch64")]
303        {
304            16 // NEON alignment
305        }
306        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
307        {
308            8 // Conservative default
309        }
310    }
311
312    /// Check if a pointer is properly aligned for SIMD operations
313    pub fn is_aligned<T>(ptr: *const T, alignment: usize) -> bool {
314        (ptr as usize) % alignment == 0
315    }
316
317    /// Calculate cache-friendly chunk size based on L2 cache size
318    ///
319    /// # Arguments
320    /// * `element_size` - Size of each element in bytes
321    ///
322    /// # Returns
323    /// Chunk size in elements that fits comfortably in L2 cache
324    pub fn cache_friendly_chunk_size(element_size: usize) -> usize {
325        // Typical L2 cache: 256KB per core
326        // Use 75% to account for other data
327        const L2_CACHE_SIZE: usize = 256 * 1024;
328        const UTILIZATION: f64 = 0.75;
329
330        ((L2_CACHE_SIZE as f64 * UTILIZATION) / element_size as f64) as usize
331    }
332}
333
334/// Performance recommendations for chunking
335#[derive(Debug, Clone)]
336pub struct ChunkingRecommendation {
337    /// Recommended strategy for the workload
338    pub strategy: ChunkingStrategy,
339    /// Expected performance improvement (1.0 = no change, 1.3 = 30% faster)
340    pub expected_speedup: f64,
341    /// Reason for this recommendation
342    pub rationale: String,
343}
344
345impl ChunkingRecommendation {
346    /// Get chunking recommendation for a specific workload
347    ///
348    /// # Arguments
349    /// * `tensor_size` - Number of elements in tensor
350    /// * `operation_complexity` - Complexity per element (1.0 = simple, 10.0 = complex)
351    /// * `memory_bandwidth_limited` - Whether operation is memory-bound
352    pub fn for_workload(
353        tensor_size: usize,
354        operation_complexity: f64,
355        memory_bandwidth_limited: bool,
356    ) -> Self {
357        if memory_bandwidth_limited {
358            Self {
359                strategy: ChunkingStrategy::MemoryIntensive,
360                expected_speedup: 1.3, // 30% improvement
361                rationale: "Memory bandwidth optimization for large data transfers".to_string(),
362            }
363        } else if operation_complexity > 5.0 {
364            Self {
365                strategy: ChunkingStrategy::ComputeIntensive,
366                expected_speedup: 1.25, // 25% improvement
367                rationale: "Compute-intensive optimization for complex operations".to_string(),
368            }
369        } else if tensor_size < 1024 * 1024 {
370            Self {
371                strategy: ChunkingStrategy::CacheFriendly,
372                expected_speedup: 1.4, // 40% improvement
373                rationale: "Cache-friendly optimization for small to medium tensors".to_string(),
374            }
375        } else {
376            Self {
377                strategy: ChunkingStrategy::MemoryIntensive,
378                expected_speedup: 1.2, // 20% improvement
379                rationale: "Memory-intensive optimization for large tensors".to_string(),
380            }
381        }
382    }
383}
384
385#[cfg(test)]
386mod tests {
387    use super::*;
388
389    #[test]
390    fn test_chunk_config_creation() {
391        let compute_config = TensorChunkConfig::compute_intensive();
392        assert!(matches!(
393            compute_config.strategy,
394            ChunkingStrategy::ComputeIntensive
395        ));
396        assert!(compute_config.auto_tune);
397
398        let memory_config = TensorChunkConfig::memory_intensive();
399        assert!(matches!(
400            memory_config.strategy,
401            ChunkingStrategy::MemoryIntensive
402        ));
403
404        let cache_config = TensorChunkConfig::cache_friendly();
405        assert!(matches!(
406            cache_config.strategy,
407            ChunkingStrategy::CacheFriendly
408        ));
409    }
410
411    #[test]
412    fn test_chunk_size_computation() {
413        let config = TensorChunkConfig::compute_intensive();
414        let chunk_size = config.compute_chunk_size(100_000, 4);
415
416        // Should be between min and max
417        assert!(chunk_size >= config.min_chunk_size);
418        assert!(chunk_size <= config.max_chunk_size);
419        assert!(chunk_size <= 100_000);
420    }
421
422    #[test]
423    fn test_optimal_chunk_count() {
424        let strategy = ChunkingStrategy::ComputeIntensive;
425        let count = ChunkingUtils::optimal_chunk_count(1_000_000, &strategy);
426
427        // Should be related to CPU count
428        assert!(count > 0);
429        assert!(count <= num_cpus::get() * 16); // Reasonable upper bound
430    }
431
432    #[test]
433    fn test_cache_friendly_chunk_size() {
434        let chunk_size_f32 = ChunkingUtils::cache_friendly_chunk_size(4);
435        let chunk_size_f64 = ChunkingUtils::cache_friendly_chunk_size(8);
436
437        // f64 should have half the elements of f32 for same cache usage
438        assert!((chunk_size_f64 as f64 / chunk_size_f32 as f64 - 0.5).abs() < 0.1);
439
440        // Should fit in typical L2 cache (256KB)
441        assert!(chunk_size_f32 * 4 <= 256 * 1024);
442    }
443
444    #[test]
445    fn test_alignment_check() {
446        let aligned_data = vec![0u32; 32];
447        let ptr = aligned_data.as_ptr();
448
449        // Should be aligned to at least 4 bytes (u32)
450        assert!(ChunkingUtils::is_aligned(ptr, 4));
451    }
452
453    #[test]
454    fn test_chunking_recommendation() {
455        // Memory-bound workload
456        let rec = ChunkingRecommendation::for_workload(10_000_000, 1.0, true);
457        assert!(matches!(rec.strategy, ChunkingStrategy::MemoryIntensive));
458        assert!(rec.expected_speedup > 1.0);
459
460        // Compute-bound workload
461        let rec = ChunkingRecommendation::for_workload(1_000_000, 10.0, false);
462        assert!(matches!(rec.strategy, ChunkingStrategy::ComputeIntensive));
463        assert!(rec.expected_speedup > 1.0);
464
465        // Small cache-friendly workload
466        let rec = ChunkingRecommendation::for_workload(100_000, 2.0, false);
467        assert!(matches!(rec.strategy, ChunkingStrategy::CacheFriendly));
468        assert!(rec.expected_speedup > 1.0);
469    }
470
471    #[test]
472    fn test_recommended_alignment() {
473        let alignment = ChunkingUtils::recommended_alignment();
474
475        // Should be power of 2
476        assert!(alignment.is_power_of_two());
477
478        // Should be at least 8 bytes
479        assert!(alignment >= 8);
480    }
481}
torsh_core/chunking.rs

torsh_core/
chunking.rs