use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::time::{Duration, Instant};
#[cfg(feature = "parallel")]
use rayon::prelude::*;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkStrategy {
Fixed(usize),
Adaptive,
CacheOptimized,
MemoryOptimized,
Dynamic,
WorkStealingBalanced,
NumaAware,
LinearAlgebra,
SparseMatrix,
SignalProcessing,
ImageProcessing,
MonteCarlo,
IterativeSolver,
GpuAware,
Custom(fn(usize, usize) -> usize),
}
#[derive(Debug, Clone)]
pub struct ChunkConfig {
pub strategy: ChunkStrategy,
pub min_chunk_size: usize,
pub max_chunk_size: usize,
pub prefer_work_stealing: bool,
pub memory_pattern: MemoryPattern,
pub compute_intensity: ComputeIntensity,
pub enable_monitoring: bool,
pub load_balance_factor: f64,
pub cache_awareness: CacheAwareness,
pub numa_strategy: NumaStrategy,
pub gpu_settings: Option<GpuChunkSettings>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryPattern {
Sequential,
Random,
Strided(usize),
BlockWise,
Sparse,
CacheFriendly,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ComputeIntensity {
MemoryBound,
Balanced,
ComputeIntensive,
CpuBound,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CacheAwareness {
None,
L1,
L2,
L3,
Full,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NumaStrategy {
Ignore,
LocalPreferred,
StrictLocal,
Interleave,
Custom,
}
#[derive(Debug, Clone)]
pub struct GpuChunkSettings {
pub gpu_memory_ratio: f64,
pub gpu_min_chunk: usize,
pub overlap_compute: bool,
pub gpu_bandwidth: Option<u64>,
pub transfer_bandwidth: Option<u64>,
}
impl Default for ChunkConfig {
fn default() -> Self {
Self {
strategy: ChunkStrategy::Adaptive,
min_chunk_size: 64,
max_chunk_size: 8192,
prefer_work_stealing: true,
memory_pattern: MemoryPattern::Sequential,
compute_intensity: ComputeIntensity::Balanced,
enable_monitoring: false,
load_balance_factor: 0.1,
cache_awareness: CacheAwareness::L2,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: None,
}
}
}
impl Default for GpuChunkSettings {
fn default() -> Self {
Self {
gpu_memory_ratio: 0.8,
gpu_min_chunk: 4096,
overlap_compute: true,
gpu_bandwidth: None,
transfer_bandwidth: None,
}
}
}
impl ChunkConfig {
pub fn compute_intensive() -> Self {
Self {
strategy: ChunkStrategy::Adaptive,
min_chunk_size: 32,
max_chunk_size: 1024,
prefer_work_stealing: true,
memory_pattern: MemoryPattern::Sequential,
compute_intensity: ComputeIntensity::ComputeIntensive,
enable_monitoring: false,
load_balance_factor: 0.05,
cache_awareness: CacheAwareness::L1,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: None,
}
}
pub fn memory_intensive() -> Self {
Self {
strategy: ChunkStrategy::MemoryOptimized,
min_chunk_size: 256,
max_chunk_size: 16384,
prefer_work_stealing: false,
memory_pattern: MemoryPattern::Sequential,
compute_intensity: ComputeIntensity::MemoryBound,
enable_monitoring: false,
load_balance_factor: 0.2,
cache_awareness: CacheAwareness::L3,
numa_strategy: NumaStrategy::StrictLocal,
gpu_settings: None,
}
}
pub fn cache_friendly() -> Self {
Self {
strategy: ChunkStrategy::CacheOptimized,
min_chunk_size: 64,
max_chunk_size: 4096,
prefer_work_stealing: true,
memory_pattern: MemoryPattern::CacheFriendly,
compute_intensity: ComputeIntensity::Balanced,
enable_monitoring: false,
load_balance_factor: 0.1,
cache_awareness: CacheAwareness::Full,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: None,
}
}
pub fn linear_algebra() -> Self {
Self {
strategy: ChunkStrategy::LinearAlgebra,
min_chunk_size: 256,
max_chunk_size: 8192,
prefer_work_stealing: false,
memory_pattern: MemoryPattern::BlockWise,
compute_intensity: ComputeIntensity::ComputeIntensive,
enable_monitoring: false,
load_balance_factor: 0.1,
cache_awareness: CacheAwareness::L3,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: None,
}
}
pub fn sparse_matrix() -> Self {
Self {
strategy: ChunkStrategy::SparseMatrix,
min_chunk_size: 128,
max_chunk_size: 4096,
prefer_work_stealing: true,
memory_pattern: MemoryPattern::Sparse,
compute_intensity: ComputeIntensity::MemoryBound,
enable_monitoring: true,
load_balance_factor: 0.3,
cache_awareness: CacheAwareness::L2,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: None,
}
}
pub fn signal_processing() -> Self {
Self {
strategy: ChunkStrategy::SignalProcessing,
min_chunk_size: 512,
max_chunk_size: 16384,
prefer_work_stealing: false,
memory_pattern: MemoryPattern::Sequential,
compute_intensity: ComputeIntensity::ComputeIntensive,
enable_monitoring: false,
load_balance_factor: 0.05,
cache_awareness: CacheAwareness::L2,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: None,
}
}
pub fn image_processing() -> Self {
Self {
strategy: ChunkStrategy::ImageProcessing,
min_chunk_size: 1024,
max_chunk_size: 32768,
prefer_work_stealing: true,
memory_pattern: MemoryPattern::BlockWise,
compute_intensity: ComputeIntensity::Balanced,
enable_monitoring: false,
load_balance_factor: 0.15,
cache_awareness: CacheAwareness::L3,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: None,
}
}
pub fn monte_carlo() -> Self {
Self {
strategy: ChunkStrategy::MonteCarlo,
min_chunk_size: 1024,
max_chunk_size: 65536,
prefer_work_stealing: true,
memory_pattern: MemoryPattern::Random,
compute_intensity: ComputeIntensity::ComputeIntensive,
enable_monitoring: true,
load_balance_factor: 0.2,
cache_awareness: CacheAwareness::L1,
numa_strategy: NumaStrategy::Interleave,
gpu_settings: None,
}
}
pub fn iterative_solver() -> Self {
Self {
strategy: ChunkStrategy::IterativeSolver,
min_chunk_size: 256,
max_chunk_size: 8192,
prefer_work_stealing: false,
memory_pattern: MemoryPattern::Sequential,
compute_intensity: ComputeIntensity::Balanced,
enable_monitoring: true,
load_balance_factor: 0.1,
cache_awareness: CacheAwareness::L2,
numa_strategy: NumaStrategy::StrictLocal,
gpu_settings: None,
}
}
pub fn gpu_hybrid() -> Self {
Self {
strategy: ChunkStrategy::GpuAware,
min_chunk_size: 4096,
max_chunk_size: 131072,
prefer_work_stealing: true,
memory_pattern: MemoryPattern::Sequential,
compute_intensity: ComputeIntensity::ComputeIntensive,
enable_monitoring: true,
load_balance_factor: 0.2,
cache_awareness: CacheAwareness::L3,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: Some(GpuChunkSettings::default()),
}
}
pub fn with_monitoring(mut self) -> Self {
self.enable_monitoring = true;
self
}
pub fn with_numa_strategy(mut self, strategy: NumaStrategy) -> Self {
self.numa_strategy = strategy;
self
}
pub fn with_memory_pattern(mut self, pattern: MemoryPattern) -> Self {
self.memory_pattern = pattern;
self
}
pub fn with_compute_intensity(mut self, intensity: ComputeIntensity) -> Self {
self.compute_intensity = intensity;
self
}
pub fn with_gpu_settings(mut self, settings: GpuChunkSettings) -> Self {
self.gpu_settings = Some(settings);
self
}
}
pub struct ChunkingUtils;
impl ChunkingUtils {
pub fn optimal_chunk_size(data_size: usize, config: &ChunkConfig) -> usize {
let thread_count = Self::thread_count();
let cpu_info = Self::get_cpu_info();
let chunk_size = match config.strategy {
ChunkStrategy::Fixed(size) => size,
ChunkStrategy::Adaptive => Self::adaptive_chunk_size(data_size, thread_count),
ChunkStrategy::CacheOptimized => {
Self::cache_optimized_chunk_size(data_size, thread_count)
}
ChunkStrategy::MemoryOptimized => {
Self::memory_optimized_chunk_size(data_size, thread_count)
}
ChunkStrategy::Dynamic => Self::dynamic_chunk_size(data_size, thread_count),
ChunkStrategy::WorkStealingBalanced => {
Self::work_stealing_chunk_size(data_size, thread_count)
}
ChunkStrategy::NumaAware => {
Self::numa_aware_chunk_size(data_size, thread_count, &cpu_info)
}
ChunkStrategy::LinearAlgebra => {
Self::linear_algebra_chunk_size(data_size, thread_count, &cpu_info)
}
ChunkStrategy::SparseMatrix => Self::sparse_matrix_chunk_size(data_size, thread_count),
ChunkStrategy::SignalProcessing => {
Self::signal_processing_chunk_size(data_size, thread_count)
}
ChunkStrategy::ImageProcessing => {
Self::image_processing_chunk_size(data_size, thread_count)
}
ChunkStrategy::MonteCarlo => Self::monte_carlo_chunk_size(data_size, thread_count),
ChunkStrategy::IterativeSolver => {
Self::iterative_solver_chunk_size(data_size, thread_count)
}
ChunkStrategy::GpuAware => Self::gpu_aware_chunk_size(data_size, thread_count, config),
ChunkStrategy::Custom(func) => func(data_size, thread_count),
};
let cache_adjusted =
Self::apply_cache_awareness(chunk_size, config.cache_awareness, &cpu_info);
let pattern_adjusted = Self::apply_memory_pattern(cache_adjusted, config.memory_pattern);
let intensity_adjusted =
Self::apply_compute_intensity(pattern_adjusted, config.compute_intensity);
intensity_adjusted
.max(config.min_chunk_size)
.min(config.max_chunk_size)
}
fn adaptive_chunk_size(data_size: usize, thread_count: usize) -> usize {
if data_size < 1000 {
data_size
} else if data_size < 10000 {
(data_size / thread_count).max(64)
} else {
let base_chunk = data_size / (thread_count * 4); base_chunk.max(256).min(8192)
}
}
fn cache_optimized_chunk_size(data_size: usize, thread_count: usize) -> usize {
const CACHE_LINE_SIZE: usize = 64;
const L2_CACHE_SIZE: usize = 256 * 1024;
let cache_friendly_size = L2_CACHE_SIZE / std::mem::size_of::<f32>() / 4;
let target_chunk = (data_size / thread_count).min(cache_friendly_size);
let aligned_chunk = (target_chunk / CACHE_LINE_SIZE) * CACHE_LINE_SIZE;
aligned_chunk.max(CACHE_LINE_SIZE)
}
fn memory_optimized_chunk_size(data_size: usize, thread_count: usize) -> usize {
let target_chunk = data_size / thread_count;
target_chunk.max(1024).min(32768)
}
fn dynamic_chunk_size(data_size: usize, thread_count: usize) -> usize {
let base_chunk = data_size / (thread_count * 8); base_chunk.max(128).min(2048)
}
fn thread_count() -> usize {
#[cfg(feature = "parallel")]
{
rayon::current_num_threads().max(1)
}
#[cfg(not(feature = "parallel"))]
{
1
}
}
fn get_cpu_info() -> CpuInfo {
CpuInfo {
l1_cache_size: 32 * 1024, l2_cache_size: 256 * 1024, l3_cache_size: 8 * 1024 * 1024, cache_line_size: 64, numa_nodes: 1, cores_per_numa: Self::thread_count(),
}
}
fn work_stealing_chunk_size(data_size: usize, thread_count: usize) -> usize {
let target_chunks = thread_count * 16; (data_size / target_chunks).max(32)
}
fn numa_aware_chunk_size(data_size: usize, thread_count: usize, cpu_info: &CpuInfo) -> usize {
let chunks_per_numa = thread_count / cpu_info.numa_nodes;
let chunk_size = data_size / (cpu_info.numa_nodes * chunks_per_numa);
let aligned = (chunk_size / cpu_info.cache_line_size) * cpu_info.cache_line_size;
aligned.max(cpu_info.cache_line_size)
}
fn linear_algebra_chunk_size(
data_size: usize,
thread_count: usize,
cpu_info: &CpuInfo,
) -> usize {
let cache_size = cpu_info.l3_cache_size / 4; let elements_per_cache = cache_size / std::mem::size_of::<f64>();
let sqrt_elements = (elements_per_cache as f64).sqrt() as usize;
let block_size = sqrt_elements.max(64).min(512);
((data_size / thread_count) / block_size) * block_size
}
fn sparse_matrix_chunk_size(data_size: usize, thread_count: usize) -> usize {
let base_chunk = data_size / (thread_count * 8);
base_chunk.max(128).min(2048)
}
fn signal_processing_chunk_size(data_size: usize, thread_count: usize) -> usize {
let target_chunk = data_size / thread_count;
let log2_chunk = (target_chunk as f64).log2().floor() as u32;
let power_of_2_chunk = 2_usize.pow(log2_chunk);
power_of_2_chunk.max(512).min(16384)
}
fn image_processing_chunk_size(data_size: usize, thread_count: usize) -> usize {
let pixels_per_thread = data_size / thread_count;
let block_side = (pixels_per_thread as f64).sqrt() as usize;
let block_size = block_side * block_side;
block_size.max(1024).min(32768)
}
fn monte_carlo_chunk_size(data_size: usize, thread_count: usize) -> usize {
let chunk_size = data_size / thread_count;
chunk_size.max(4096).min(131072)
}
fn iterative_solver_chunk_size(data_size: usize, thread_count: usize) -> usize {
let chunk_size = data_size / (thread_count * 2);
chunk_size.max(256).min(8192)
}
fn gpu_aware_chunk_size(data_size: usize, thread_count: usize, config: &ChunkConfig) -> usize {
if let Some(gpu_settings) = &config.gpu_settings {
let cpu_chunk = data_size / thread_count;
let gpu_preferred = gpu_settings.gpu_min_chunk;
if data_size > gpu_preferred * 4 {
gpu_preferred.max(cpu_chunk / 2)
} else {
cpu_chunk
}
} else {
Self::adaptive_chunk_size(data_size, thread_count)
}
}
fn apply_cache_awareness(
chunk_size: usize,
awareness: CacheAwareness,
cpu_info: &CpuInfo,
) -> usize {
match awareness {
CacheAwareness::None => chunk_size,
CacheAwareness::L1 => {
let l1_elements = cpu_info.l1_cache_size / std::mem::size_of::<f64>();
chunk_size.min(l1_elements / 2)
}
CacheAwareness::L2 => {
let l2_elements = cpu_info.l2_cache_size / std::mem::size_of::<f64>();
chunk_size.min(l2_elements / 2)
}
CacheAwareness::L3 => {
let l3_elements = cpu_info.l3_cache_size / std::mem::size_of::<f64>();
chunk_size.min(l3_elements / 4)
}
CacheAwareness::Full => {
let l1_elements = cpu_info.l1_cache_size / std::mem::size_of::<f64>();
chunk_size.min(l1_elements / 4)
}
}
}
fn apply_memory_pattern(chunk_size: usize, pattern: MemoryPattern) -> usize {
match pattern {
MemoryPattern::Sequential => chunk_size,
MemoryPattern::Random => chunk_size / 2, MemoryPattern::Strided(stride) => {
((chunk_size / stride) * stride).max(stride)
}
MemoryPattern::BlockWise => {
let block_size = 64; ((chunk_size / block_size) * block_size).max(block_size)
}
MemoryPattern::Sparse => chunk_size / 4, MemoryPattern::CacheFriendly => chunk_size, }
}
fn apply_compute_intensity(chunk_size: usize, intensity: ComputeIntensity) -> usize {
match intensity {
ComputeIntensity::MemoryBound => chunk_size * 2, ComputeIntensity::Balanced => chunk_size,
ComputeIntensity::ComputeIntensive => chunk_size / 2, ComputeIntensity::CpuBound => chunk_size / 4, }
}
pub fn chunked_map<T, R, F>(data: &[T], config: &ChunkConfig, map_fn: F) -> Vec<R>
where
T: Sync,
R: Send,
F: Fn(&T) -> R + Sync,
{
let chunk_size = Self::optimal_chunk_size(data.len(), config);
#[cfg(feature = "parallel")]
{
if config.prefer_work_stealing {
data.par_iter()
.with_min_len(chunk_size)
.map(|x| map_fn(x))
.collect()
} else {
data.par_chunks(chunk_size)
.map(|chunk| chunk.iter().map(|x| map_fn(x)).collect::<Vec<_>>())
.flatten()
.collect()
}
}
#[cfg(not(feature = "parallel"))]
{
data.iter().map(map_fn).collect()
}
}
pub fn chunked_zip_map<T, U, R, F>(
data_a: &[T],
data_b: &[U],
config: &ChunkConfig,
map_fn: F,
) -> Vec<R>
where
T: Sync,
U: Sync,
R: Send,
F: Fn(&T, &U) -> R + Sync,
{
assert_eq!(
data_a.len(),
data_b.len(),
"Arrays must have the same length"
);
let chunk_size = Self::optimal_chunk_size(data_a.len(), config);
#[cfg(feature = "parallel")]
{
if config.prefer_work_stealing {
data_a
.par_iter()
.with_min_len(chunk_size)
.zip(data_b.par_iter())
.map(|(a, b)| map_fn(a, b))
.collect()
} else {
data_a
.par_chunks(chunk_size)
.zip(data_b.par_chunks(chunk_size))
.map(|(chunk_a, chunk_b)| {
chunk_a
.iter()
.zip(chunk_b.iter())
.map(|(a, b)| map_fn(a, b))
.collect::<Vec<_>>()
})
.flatten()
.collect()
}
}
#[cfg(not(feature = "parallel"))]
{
data_a
.iter()
.zip(data_b.iter())
.map(|(a, b)| map_fn(a, b))
.collect()
}
}
pub fn chunked_reduce<T, F>(data: &[T], config: &ChunkConfig, identity: T, reduce_fn: F) -> T
where
T: Clone + Send + Sync,
F: Fn(T, T) -> T + Sync,
{
let chunk_size = Self::optimal_chunk_size(data.len(), config);
#[cfg(feature = "parallel")]
{
data.par_chunks(chunk_size)
.map(|chunk| {
chunk
.iter()
.cloned()
.fold(identity.clone(), |acc, x| reduce_fn(acc, x))
})
.reduce(|| identity.clone(), |a, b| reduce_fn(a, b))
}
#[cfg(not(feature = "parallel"))]
{
data.iter().cloned().fold(identity, reduce_fn)
}
}
}
pub trait ParallelSliceExt<T> {
fn chunked_map<R, F>(&self, config: &ChunkConfig, map_fn: F) -> Vec<R>
where
T: Sync,
R: Send,
F: Fn(&T) -> R + Sync;
fn chunked_reduce<F>(&self, config: &ChunkConfig, identity: T, reduce_fn: F) -> T
where
T: Clone + Send + Sync,
F: Fn(T, T) -> T + Sync;
}
impl<T> ParallelSliceExt<T> for [T] {
fn chunked_map<R, F>(&self, config: &ChunkConfig, map_fn: F) -> Vec<R>
where
T: Sync,
R: Send,
F: Fn(&T) -> R + Sync,
{
ChunkingUtils::chunked_map(self, config, map_fn)
}
fn chunked_reduce<F>(&self, config: &ChunkConfig, identity: T, reduce_fn: F) -> T
where
T: Clone + Send + Sync,
F: Fn(T, T) -> T + Sync,
{
ChunkingUtils::chunked_reduce(self, config, identity, reduce_fn)
}
}
#[derive(Debug, Clone)]
pub struct CpuInfo {
pub l1_cache_size: usize,
pub l2_cache_size: usize,
pub l3_cache_size: usize,
pub cache_line_size: usize,
pub numa_nodes: usize,
pub cores_per_numa: usize,
}
#[derive(Debug, Clone)]
pub struct ChunkPerformanceMonitor {
measurements: Vec<ChunkMeasurement>,
optimal_sizes: std::collections::HashMap<String, usize>,
}
#[derive(Debug, Clone)]
pub struct ChunkMeasurement {
pub chunk_size: usize,
pub data_size: usize,
pub execution_time: std::time::Duration,
pub throughput: f64,
pub operation_type: String,
}
impl ChunkPerformanceMonitor {
pub fn new() -> Self {
Self {
measurements: Vec::new(),
optimal_sizes: std::collections::HashMap::new(),
}
}
pub fn record_measurement(&mut self, measurement: ChunkMeasurement) {
self.measurements.push(measurement);
if self.measurements.len() > 1000 {
self.measurements.remove(0);
}
self.update_optimal_sizes();
}
fn update_optimal_sizes(&mut self) {
for measurement in &self.measurements {
let key = format!(
"{}_{}k",
measurement.operation_type,
measurement.data_size / 1000
);
let current_optimal = self
.optimal_sizes
.get(&key)
.unwrap_or(&measurement.chunk_size);
if measurement.throughput > self.get_throughput_for_size(*current_optimal, &key) {
self.optimal_sizes.insert(key, measurement.chunk_size);
}
}
}
fn get_throughput_for_size(&self, chunk_size: usize, operation_key: &str) -> f64 {
self.measurements
.iter()
.filter(|m| {
m.chunk_size == chunk_size
&& format!("{}_{}k", m.operation_type, m.data_size / 1000) == operation_key
})
.map(|m| m.throughput)
.fold(0.0, f64::max)
}
pub fn get_optimal_size(&self, operation_type: &str, data_size: usize) -> Option<usize> {
let key = format!("{}_{}k", operation_type, data_size / 1000);
self.optimal_sizes.get(&key).copied()
}
pub fn get_statistics(&self) -> ChunkStatistics {
if self.measurements.is_empty() {
return ChunkStatistics::default();
}
let throughputs: Vec<f64> = self.measurements.iter().map(|m| m.throughput).collect();
let avg_throughput = throughputs.iter().sum::<f64>() / throughputs.len() as f64;
let max_throughput = throughputs.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
let min_throughput = throughputs.iter().fold(f64::INFINITY, |a, &b| a.min(b));
ChunkStatistics {
total_measurements: self.measurements.len(),
avg_throughput,
max_throughput,
min_throughput,
optimal_operations: self.optimal_sizes.len(),
}
}
}
impl Default for ChunkPerformanceMonitor {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, Default)]
pub struct ChunkStatistics {
pub total_measurements: usize,
pub avg_throughput: f64,
pub max_throughput: f64,
pub min_throughput: f64,
pub optimal_operations: usize,
}
pub struct MatrixChunking;
impl MatrixChunking {
pub fn matrix_multiply_chunks(
rows_a: usize,
cols_a: usize,
cols_b: usize,
) -> (usize, usize, usize) {
let cache_size = 256 * 1024; let element_size = std::mem::size_of::<f64>();
let cache_elements = cache_size / element_size;
let block_size = ((cache_elements / 3) as f64).sqrt() as usize;
let block_size = block_size.max(64).min(512);
(
block_size.min(rows_a),
block_size.min(cols_a),
block_size.min(cols_b),
)
}
pub fn array_2d_chunks(rows: usize, cols: usize, thread_count: usize) -> (usize, usize) {
let total_elements = rows * cols;
let elements_per_thread = total_elements / thread_count;
if rows >= cols {
let chunk_rows = (elements_per_thread / cols).max(1);
(chunk_rows, cols)
} else {
let chunk_cols = (elements_per_thread / rows).max(1);
(rows, chunk_cols)
}
}
pub fn array_3d_chunks(
depth: usize,
rows: usize,
cols: usize,
thread_count: usize,
) -> (usize, usize, usize) {
let total_elements = depth * rows * cols;
let elements_per_thread = total_elements / thread_count;
if depth >= rows && depth >= cols {
let chunk_depth = (elements_per_thread / (rows * cols)).max(1);
(chunk_depth, rows, cols)
} else if rows >= cols {
let chunk_rows = (elements_per_thread / (depth * cols)).max(1);
(depth, chunk_rows, cols)
} else {
let chunk_cols = (elements_per_thread / (depth * rows)).max(1);
(depth, rows, chunk_cols)
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_chunk_size_calculation() {
let config = ChunkConfig::default();
let small_chunk = ChunkingUtils::optimal_chunk_size(100, &config);
assert!(small_chunk >= config.min_chunk_size);
let large_chunk = ChunkingUtils::optimal_chunk_size(100000, &config);
assert!(large_chunk >= config.min_chunk_size);
assert!(large_chunk <= config.max_chunk_size);
}
#[test]
fn test_chunked_map() {
let data: Vec<i32> = (0..1000).collect();
let config = ChunkConfig::compute_intensive();
let result = data.chunked_map(&config, |&x| x * x);
let expected: Vec<i32> = (0..1000).map(|x| x * x).collect();
assert_eq!(result, expected);
}
#[test]
fn test_chunked_zip_map() {
let data_a: Vec<i32> = (0..1000).collect();
let data_b: Vec<i32> = (1000..2000).collect();
let config = ChunkConfig::memory_intensive();
let result = ChunkingUtils::chunked_zip_map(&data_a, &data_b, &config, |&a, &b| a + b);
let expected: Vec<i32> = (0..1000).map(|x| x + (x + 1000)).collect();
assert_eq!(result, expected);
}
#[test]
fn test_chunked_reduce() {
let data: Vec<i32> = (1..=100).collect();
let config = ChunkConfig::cache_friendly();
let result = data.chunked_reduce(&config, 0, |a, b| a + b);
let expected = (1..=100).sum::<i32>();
assert_eq!(result, expected);
}
#[test]
fn test_specialized_configs() {
let linear_algebra = ChunkConfig::linear_algebra();
assert_eq!(linear_algebra.strategy, ChunkStrategy::LinearAlgebra);
assert_eq!(linear_algebra.memory_pattern, MemoryPattern::BlockWise);
let sparse_matrix = ChunkConfig::sparse_matrix();
assert_eq!(sparse_matrix.strategy, ChunkStrategy::SparseMatrix);
assert_eq!(sparse_matrix.memory_pattern, MemoryPattern::Sparse);
let monte_carlo = ChunkConfig::monte_carlo();
assert_eq!(monte_carlo.strategy, ChunkStrategy::MonteCarlo);
assert_eq!(monte_carlo.memory_pattern, MemoryPattern::Random);
}
#[test]
fn test_matrix_chunking() {
let (row_chunk, col_chunk_a, col_chunk_b) =
MatrixChunking::matrix_multiply_chunks(1000, 800, 600);
assert!(row_chunk > 0 && row_chunk <= 1000);
assert!(col_chunk_a > 0 && col_chunk_a <= 800);
assert!(col_chunk_b > 0 && col_chunk_b <= 600);
let (chunk_rows, chunk_cols) = MatrixChunking::array_2d_chunks(100, 200, 4);
assert!(chunk_rows > 0 && chunk_rows <= 100);
assert!(chunk_cols > 0 && chunk_cols <= 200);
}
#[test]
fn test_performance_monitor() {
let mut monitor = ChunkPerformanceMonitor::new();
let measurement = ChunkMeasurement {
chunk_size: 1024,
data_size: 10000,
execution_time: std::time::Duration::from_millis(10),
throughput: 1000.0,
operation_type: "matrix_multiply".to_string(),
};
monitor.record_measurement(measurement);
let stats = monitor.get_statistics();
assert_eq!(stats.total_measurements, 1);
assert_eq!(stats.max_throughput, 1000.0);
}
#[test]
fn test_cache_awareness() {
let cpu_info = CpuInfo {
l1_cache_size: 32 * 1024,
l2_cache_size: 256 * 1024,
l3_cache_size: 8 * 1024 * 1024,
cache_line_size: 64,
numa_nodes: 1,
cores_per_numa: 4,
};
let chunk_size = 10000;
let l1_adjusted =
ChunkingUtils::apply_cache_awareness(chunk_size, CacheAwareness::L1, &cpu_info);
let l3_adjusted =
ChunkingUtils::apply_cache_awareness(chunk_size, CacheAwareness::L3, &cpu_info);
assert!(l1_adjusted <= chunk_size);
assert!(l3_adjusted <= chunk_size);
assert!(l1_adjusted <= l3_adjusted); }
#[test]
fn test_memory_pattern_adjustment() {
let base_size = 1000;
let random_adjusted = ChunkingUtils::apply_memory_pattern(base_size, MemoryPattern::Random);
let sparse_adjusted = ChunkingUtils::apply_memory_pattern(base_size, MemoryPattern::Sparse);
let sequential_adjusted =
ChunkingUtils::apply_memory_pattern(base_size, MemoryPattern::Sequential);
assert!(random_adjusted <= base_size);
assert!(sparse_adjusted <= base_size);
assert_eq!(sequential_adjusted, base_size);
assert!(sparse_adjusted < random_adjusted); }
#[test]
fn test_compute_intensity_adjustment() {
let base_size = 1000;
let memory_bound =
ChunkingUtils::apply_compute_intensity(base_size, ComputeIntensity::MemoryBound);
let compute_intensive =
ChunkingUtils::apply_compute_intensity(base_size, ComputeIntensity::ComputeIntensive);
let cpu_bound =
ChunkingUtils::apply_compute_intensity(base_size, ComputeIntensity::CpuBound);
assert!(memory_bound >= base_size); assert!(compute_intensive <= base_size); assert!(cpu_bound <= base_size); assert!(cpu_bound <= compute_intensive);
}
#[test]
fn test_gpu_chunk_settings() {
let gpu_settings = GpuChunkSettings::default();
assert_eq!(gpu_settings.gpu_memory_ratio, 0.8);
assert_eq!(gpu_settings.gpu_min_chunk, 4096);
assert!(gpu_settings.overlap_compute);
let config = ChunkConfig::gpu_hybrid();
assert!(config.gpu_settings.is_some());
assert_eq!(config.strategy, ChunkStrategy::GpuAware);
}
}