use std::sync::atomic::{AtomicUsize, Ordering};
/// Temporal-locality level for prefetch hints. Variants map onto the
/// x86 `prefetcht0/t1/t2/nta` and aarch64 `prfm pld*` instruction families.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Locality {
    /// Keep the line in all cache levels (`prefetcht0` / `pldl1keep`).
    High,
    /// Keep from L2 outward (`prefetcht1` / `pldl2keep`).
    Medium,
    /// Keep from L3 outward (`prefetcht2` / `pldl3keep`).
    Low,
    /// Streaming / non-temporal: minimize cache pollution
    /// (`prefetchnta` / `pldl1strm`).
    None,
}

/// Zero-sized namespace for micro-architectural hints: branch-prediction
/// markers, software prefetches, memory fences and cache-line control.
///
/// All hints are best-effort. On architectures other than x86_64/aarch64
/// they degrade to no-ops (`black_box` keeps the argument observable so
/// the call is not optimized away entirely).
pub struct PerformanceHints;

impl PerformanceHints {
    /// Branch-prediction hint: `cond` is expected to be `true`.
    ///
    /// Returns `cond` unchanged; the empty asm block merely anchors the
    /// "hot" side of the branch in the generated code on x86_64.
    #[inline(always)]
    pub fn likely(cond: bool) -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            if cond {
                // SAFETY: the asm body is a comment only; no memory or
                // stack effects (options(nomem, nostack)).
                unsafe {
                    std::arch::asm!("# likely branch", options(nomem, nostack));
                }
            }
        }
        cond
    }

    /// Branch-prediction hint: `cond` is expected to be `false`.
    #[inline(always)]
    pub fn unlikely(cond: bool) -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            if !cond {
                // SAFETY: comment-only asm; no observable effects.
                unsafe {
                    std::arch::asm!("# unlikely branch", options(nomem, nostack));
                }
            }
        }
        cond
    }

    /// Prefetch the cache line containing `data` for a future read, with
    /// maximum temporal locality (into L1).
    #[inline(always)]
    pub fn prefetch_read<T>(data: &T) {
        #[cfg(target_arch = "x86_64")]
        {
            let ptr = data as *const T as *const u8;
            // SAFETY: prefetch instructions are non-faulting hints, and
            // `ptr` comes from a valid reference anyway.
            unsafe {
                std::arch::asm!(
                    "prefetcht0 [{}]",
                    in(reg) ptr,
                    options(readonly, nostack)
                );
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            let ptr = data as *const T as *const u8;
            // SAFETY: PRFM is a non-faulting hint.
            unsafe {
                std::arch::asm!(
                    "prfm pldl1keep, [{}]",
                    in(reg) ptr,
                    options(readonly, nostack)
                );
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            // No portable prefetch; keep the reference observable.
            std::hint::black_box(data);
        }
    }

    /// Prefetch the cache line containing `data` ahead of a write.
    ///
    /// NOTE(review): x86_64 uses `prefetcht0` rather than `prefetchw`
    /// since `prefetchw` is not baseline on all x86_64 CPUs — confirm
    /// whether runtime detection is worth adding.
    #[inline(always)]
    pub fn prefetch_write<T>(data: &mut T) {
        #[cfg(target_arch = "x86_64")]
        {
            let ptr = data as *mut T as *mut u8;
            // SAFETY: prefetch is a non-faulting hint.
            unsafe {
                std::arch::asm!(
                    "prefetcht0 [{}]",
                    in(reg) ptr,
                    options(nostack)
                );
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            let ptr = data as *mut T as *mut u8;
            // SAFETY: PSTL1KEEP is a non-faulting store-prefetch hint.
            unsafe {
                std::arch::asm!(
                    "prfm pstl1keep, [{}]",
                    in(reg) ptr,
                    options(nostack)
                );
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            std::hint::black_box(data);
        }
    }

    /// Prefetch for reading with an explicit temporal-locality level
    /// (see [`Locality`] for the per-architecture instruction mapping).
    #[inline(always)]
    pub fn prefetch_with_locality<T>(data: &T, locality: Locality) {
        #[cfg(target_arch = "x86_64")]
        {
            let ptr = data as *const T as *const u8;
            // SAFETY: all four prefetch variants are non-faulting hints.
            unsafe {
                match locality {
                    Locality::High => {
                        std::arch::asm!(
                            "prefetcht0 [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::Medium => {
                        std::arch::asm!(
                            "prefetcht1 [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::Low => {
                        std::arch::asm!(
                            "prefetcht2 [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::None => {
                        std::arch::asm!(
                            "prefetchnta [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                }
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            let ptr = data as *const T as *const u8;
            // SAFETY: PRFM is a non-faulting hint.
            unsafe {
                match locality {
                    Locality::High => {
                        std::arch::asm!(
                            "prfm pldl1keep, [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::Medium => {
                        std::arch::asm!(
                            "prfm pldl2keep, [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::Low => {
                        std::arch::asm!(
                            "prfm pldl3keep, [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                    Locality::None => {
                        std::arch::asm!(
                            "prfm pldl1strm, [{}]",
                            in(reg) ptr,
                            options(readonly, nostack)
                        );
                    }
                }
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            let _ = locality; // only meaningful on x86_64/aarch64
            std::hint::black_box(data);
        }
    }

    /// Full memory barrier (`mfence` / `dmb sy`; SeqCst fence elsewhere).
    #[inline(always)]
    pub fn memory_fence() {
        #[cfg(target_arch = "x86_64")]
        {
            // SAFETY: mfence has no operands; it only orders memory.
            unsafe {
                std::arch::asm!("mfence", options(nostack));
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            // SAFETY: dmb sy is a data memory barrier with no operands.
            unsafe {
                std::arch::asm!("dmb sy", options(nostack));
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            std::sync::atomic::fence(std::sync::atomic::Ordering::SeqCst);
        }
    }

    /// Write back and invalidate the cache line containing `data`.
    ///
    /// BUGFIX: the x86_64 path previously executed `mfence`, which orders
    /// stores but does not flush anything (and left `ptr` unused);
    /// `clflush` actually evicts the line and is baseline on every
    /// x86_64 CPU (SSE2).
    #[inline(always)]
    pub fn flush_cache_line<T>(data: &T) {
        #[cfg(target_arch = "x86_64")]
        {
            let ptr = data as *const T as *const u8;
            // SAFETY: `ptr` comes from a valid reference; clflush on a
            // mapped address is always safe.
            unsafe {
                std::arch::asm!(
                    "clflush [{}]",
                    in(reg) ptr,
                    options(nostack)
                );
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            let ptr = data as *const T as *const u8;
            // SAFETY: `dc civac` cleans + invalidates by VA; `ptr` is a
            // valid, mapped address.
            unsafe {
                std::arch::asm!(
                    "dc civac, {}",
                    in(reg) ptr,
                    options(nostack)
                );
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            std::hint::black_box(data);
        }
    }

    /// Copy `src` into `dst`; panics if the lengths differ.
    ///
    /// `copy_from_slice` already lowers to `memcpy`. The previous raw
    /// pointer copy plus `sfence` for >64 KiB inputs added nothing:
    /// `sfence` only orders non-temporal stores, which were not used.
    #[inline]
    pub fn cache_aware_copy<T: Copy>(src: &[T], dst: &mut [T]) {
        assert_eq!(src.len(), dst.len());
        dst.copy_from_slice(src);
    }

    /// Fill `dst` with `value`.
    ///
    /// `slice::fill` vectorizes / lowers to `memset` where possible; the
    /// previous manually paired-write loop performed the identical stores
    /// with no benefit.
    #[inline]
    pub fn cache_aware_memset<T: Copy>(dst: &mut [T], value: T) {
        dst.fill(value);
    }
}
/// Aggregated runtime telemetry collected by the adaptive optimizer.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct PerformanceMetrics {
    /// Exponentially-weighted average wall time (ns) per operation name;
    /// threshold snapshots are also mirrored here under
    /// "<op>_threshold_parallel/simd/gpu" keys.
    pub operation_times: std::collections::HashMap<String, f64>,
    /// Last normalized performance score (0.0..=1.0) seen per strategy.
    pub strategy_success_rates: std::collections::HashMap<OptimizationStrategy, f64>,
    // NOTE(review): name is missing an underscore (memory_bandwidth_...);
    // renaming this pub field would break callers, so it is left as-is.
    pub memorybandwidth_utilization: f64,
    pub cache_hit_rate: f64,
    pub parallel_efficiency: f64,
}
impl Default for PerformanceMetrics {
fn default() -> Self {
Self {
operation_times: std::collections::HashMap::new(),
strategy_success_rates: std::collections::HashMap::new(),
memorybandwidth_utilization: 0.0,
cache_hit_rate: 0.0,
parallel_efficiency: 0.0,
}
}
}
/// Execution strategies the adaptive optimizer can recommend.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum OptimizationStrategy {
    /// Plain sequential code (small inputs).
    Scalar,
    /// Vectorized (SIMD) execution.
    Simd,
    /// Multi-threaded execution.
    Parallel,
    /// GPU offload.
    Gpu,
    Hybrid,
    CacheOptimized,
    /// Chosen for bandwidth-limited ("memory bound") operations.
    MemoryBound,
    ComputeBound,
    /// Chosen on recent cores (Apple Silicon, Neoverse-class, Zen 4+).
    ModernArchOptimized,
    /// Chosen on wide-vector cores (Zen 4 / Golden Cove and newer).
    VectorOptimized,
    /// Chosen on power-constrained targets (Android / iOS builds).
    EnergyEfficient,
    /// Chosen for very large inputs (> 1M elements).
    HighThroughput,
}
/// Weighted strategy chooser with a small deterministic "exploration"
/// component; weights are EWMA-updated from measured performance scores.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct StrategySelector {
    #[allow(dead_code)]
    preferred_strategy: OptimizationStrategy,
    /// Per-strategy weight, updated by `update_weights`.
    strategy_weights: std::collections::HashMap<OptimizationStrategy, f64>,
    /// EWMA blend factor (0..1) for weight updates. (sic: missing underscore)
    learningrate: f64,
    /// Fraction of selections that pick an exploratory strategy.
    exploration_rate: f64,
}
impl Default for StrategySelector {
fn default() -> Self {
let mut strategy_weights = std::collections::HashMap::new();
strategy_weights.insert(OptimizationStrategy::Scalar, 1.0);
strategy_weights.insert(OptimizationStrategy::Simd, 1.0);
strategy_weights.insert(OptimizationStrategy::Parallel, 1.0);
strategy_weights.insert(OptimizationStrategy::Gpu, 1.0);
strategy_weights.insert(OptimizationStrategy::Hybrid, 1.0);
strategy_weights.insert(OptimizationStrategy::CacheOptimized, 1.0);
strategy_weights.insert(OptimizationStrategy::MemoryBound, 1.0);
strategy_weights.insert(OptimizationStrategy::ComputeBound, 1.0);
strategy_weights.insert(OptimizationStrategy::ModernArchOptimized, 1.5); strategy_weights.insert(OptimizationStrategy::VectorOptimized, 1.3);
strategy_weights.insert(OptimizationStrategy::EnergyEfficient, 1.0);
strategy_weights.insert(OptimizationStrategy::HighThroughput, 1.2);
Self {
preferred_strategy: OptimizationStrategy::ModernArchOptimized,
strategy_weights,
learningrate: 0.1,
exploration_rate: 0.1,
}
}
}
impl StrategySelector {
pub fn select_strategy(
&self,
operation_size: usize,
is_memory_bound: bool,
) -> OptimizationStrategy {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let mut hasher = DefaultHasher::new();
operation_size.hash(&mut hasher);
let rand_val = (hasher.finish() % 100) as f64 / 100.0;
if rand_val < self.exploration_rate {
let strategies = [
OptimizationStrategy::Scalar,
OptimizationStrategy::Simd,
OptimizationStrategy::Parallel,
OptimizationStrategy::Gpu,
OptimizationStrategy::ModernArchOptimized,
OptimizationStrategy::VectorOptimized,
OptimizationStrategy::EnergyEfficient,
OptimizationStrategy::HighThroughput,
];
strategies[operation_size % strategies.len()]
} else {
if is_memory_bound {
if is_apple_silicon() || is_neoverse_or_newer() {
OptimizationStrategy::ModernArchOptimized
} else {
OptimizationStrategy::MemoryBound
}
} else if operation_size > 1_000_000 {
OptimizationStrategy::HighThroughput
} else if operation_size > 100_000 {
if is_zen4_or_newer() || is_intel_golden_cove_or_newer() {
OptimizationStrategy::VectorOptimized
} else {
OptimizationStrategy::Parallel
}
} else if operation_size > 1_000 {
if is_zen4_or_newer() || is_apple_silicon() {
OptimizationStrategy::ModernArchOptimized
} else {
OptimizationStrategy::Simd
}
} else {
if cfg!(target_os = "android") || cfg!(target_os = "ios") {
OptimizationStrategy::EnergyEfficient
} else {
OptimizationStrategy::Scalar
}
}
}
}
pub fn update_weights(&mut self, strategy: OptimizationStrategy, performancescore: f64) {
if let Some(weight) = self.strategy_weights.get_mut(&strategy) {
*weight = *weight * (1.0 - self.learningrate) + performancescore * self.learningrate;
}
}
#[allow(dead_code)]
fn is_neoverse_or_newer() -> bool {
crate::performance_optimization::is_neoverse_or_newer()
}
#[allow(dead_code)]
fn is_zen4_or_newer() -> bool {
crate::performance_optimization::is_zen4_or_newer()
}
#[allow(dead_code)]
fn is_intel_golden_cove_or_newer() -> bool {
crate::performance_optimization::is_intel_golden_cove_or_newer()
}
}
/// Heuristic CPU check: AVX-512F + AVX-512VL as a proxy for a Zen 4-class
/// (or newer) x86_64 core. Always false on other architectures.
#[allow(dead_code)]
fn is_zen4_or_newer() -> bool {
    #[cfg(target_arch = "x86_64")]
    fn detect() -> bool {
        is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl")
    }
    #[cfg(not(target_arch = "x86_64"))]
    fn detect() -> bool {
        false
    }
    detect()
}
/// Heuristic CPU check: AVX2 + FMA + BMI2 as a proxy for a Golden
/// Cove-class (or newer) Intel core. Always false off x86_64.
#[allow(dead_code)]
fn is_intel_golden_cove_or_newer() -> bool {
    #[cfg(target_arch = "x86_64")]
    fn detect() -> bool {
        is_x86_feature_detected!("avx2")
            && is_x86_feature_detected!("fma")
            && is_x86_feature_detected!("bmi2")
    }
    #[cfg(not(target_arch = "x86_64"))]
    fn detect() -> bool {
        false
    }
    detect()
}
/// True only for aarch64 builds targeting an Apple vendor triple
/// (compile-time check; there is no runtime CPUID equivalent needed).
#[allow(dead_code)]
fn is_apple_silicon() -> bool {
    cfg!(all(target_arch = "aarch64", target_vendor = "apple"))
}
/// Heuristic CPU check: ASIMD + CRC + FP as a proxy for a Neoverse-class
/// (or newer) aarch64 core. Always false on other architectures.
#[allow(dead_code)]
fn is_neoverse_or_newer() -> bool {
    #[cfg(target_arch = "aarch64")]
    fn detect() -> bool {
        std::arch::is_aarch64_feature_detected!("asimd")
            && std::arch::is_aarch64_feature_detected!("crc")
            && std::arch::is_aarch64_feature_detected!("fp")
    }
    #[cfg(not(target_arch = "aarch64"))]
    fn detect() -> bool {
        false
    }
    detect()
}
/// Self-tuning dispatcher: holds size thresholds (atomics, so adjustable
/// through `&self`) for parallel/SIMD/GPU execution and adapts them from
/// measured timings.
pub struct AdaptiveOptimizer {
    /// Minimum element count before parallel execution is used.
    parallel_threshold: AtomicUsize,
    /// Minimum element count before SIMD execution is used.
    simd_threshold: AtomicUsize,
    /// Minimum element count before GPU offload is considered.
    #[allow(dead_code)]
    gpu_threshold: AtomicUsize,
    /// Per-architecture cache-line size guess, fixed at construction.
    cache_line_size: usize,
    performance_metrics: std::sync::RwLock<PerformanceMetrics>,
    strategy_selector: std::sync::RwLock<StrategySelector>,
}
impl AdaptiveOptimizer {
    /// Optimizer with conservative default thresholds (10k parallel,
    /// 1k SIMD, 100k GPU) and fresh metrics/selector state.
    pub fn new() -> Self {
        Self {
            parallel_threshold: AtomicUsize::new(10_000),
            simd_threshold: AtomicUsize::new(1_000),
            gpu_threshold: AtomicUsize::new(100_000),
            cache_line_size: Self::detect_cache_line_size(),
            performance_metrics: std::sync::RwLock::new(PerformanceMetrics::default()),
            strategy_selector: std::sync::RwLock::new(StrategySelector::default()),
        }
    }

    /// Compile-time cache-line size guess per architecture.
    /// NOTE(review): 128 matches Apple Silicon, but many other aarch64
    /// cores use 64-byte lines — confirm for the deployment targets.
    fn detect_cache_line_size() -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            64
        }
        #[cfg(target_arch = "aarch64")]
        {
            128
        }
        #[cfg(target_arch = "riscv64")]
        {
            64
        }
        #[cfg(not(any(
            target_arch = "x86_64",
            target_arch = "aarch64",
            target_arch = "riscv64"
        )))]
        {
            64
        }
    }

    /// True when `size` crosses the adaptive parallel threshold and the
    /// `parallel` feature is compiled in.
    #[inline]
    #[allow(unused_variables)]
    pub fn should_use_parallel(&self, size: usize) -> bool {
        #[cfg(feature = "parallel")]
        {
            size >= self.parallel_threshold.load(Ordering::Relaxed)
        }
        #[cfg(not(feature = "parallel"))]
        {
            false
        }
    }

    /// True when `size` crosses the adaptive SIMD threshold and the
    /// `simd` feature is compiled in.
    #[inline]
    #[allow(unused_variables)]
    pub fn should_use_simd(&self, size: usize) -> bool {
        #[cfg(feature = "simd")]
        {
            size >= self.simd_threshold.load(Ordering::Relaxed)
        }
        #[cfg(not(feature = "simd"))]
        {
            false
        }
    }

    /// Raise a threshold when a measured run was inefficient.
    ///
    /// BUGFIX: `durationns` is clamped to 1 so a 0 ns measurement cannot
    /// produce an infinite ops/ns value.
    pub fn update_from_measurement(&mut self, operation: &str, size: usize, durationns: u64) {
        let ops_per_ns = size as f64 / durationns.max(1) as f64;
        if operation.contains("parallel") && ops_per_ns < 0.1 {
            self.parallel_threshold
                .fetch_add(size / 10, Ordering::Relaxed);
        } else if operation.contains("simd") && ops_per_ns < 1.0 {
            self.simd_threshold.fetch_add(size / 10, Ordering::Relaxed);
        }
    }

    /// Chunk size spanning 16 cache lines' worth of `T`.
    ///
    /// BUGFIX: clamped to at least 1 so an oversized `T` (larger than a
    /// cache line) cannot yield a zero chunk size, which would panic in
    /// `par_chunks_mut`.
    #[inline]
    pub fn optimal_chunk_size<T>(&self) -> usize {
        let element_size = std::mem::size_of::<T>();
        let elements_per_cache_line = self.cache_line_size / element_size.max(1);
        (elements_per_cache_line * 16).max(1)
    }

    /// True when `size` crosses the GPU threshold and the `gpu` feature
    /// is compiled in.
    #[inline]
    #[allow(unused_variables)]
    pub fn should_use_gpu(&self, size: usize) -> bool {
        #[cfg(feature = "gpu")]
        {
            size >= self.gpu_threshold.load(Ordering::Relaxed)
        }
        #[cfg(not(feature = "gpu"))]
        {
            false
        }
    }

    /// Strategy for a named operation. Operations whose name suggests
    /// memory traffic (copy/memset/transpose) are treated as memory-bound.
    /// Falls back to plain threshold checks if the selector lock is poisoned.
    pub fn select_for_operation(&self, operationname: &str, size: usize) -> OptimizationStrategy {
        let memory_bound = operationname.contains("copy")
            || operationname.contains("memset")
            || operationname.contains("transpose");
        if let Ok(selector) = self.strategy_selector.read() {
            selector.select_strategy(size, memory_bound)
        } else if self.should_use_gpu(size) {
            OptimizationStrategy::Gpu
        } else if self.should_use_parallel(size) {
            OptimizationStrategy::Parallel
        } else if self.should_use_simd(size) {
            OptimizationStrategy::Simd
        } else {
            OptimizationStrategy::Scalar
        }
    }

    /// Record one measurement: update selector weights, the EWMA of this
    /// operation's time, the per-strategy score, and adaptive thresholds.
    ///
    /// BUGFIX: the EWMA now seeds with the first sample instead of 0.0
    /// (previously the first measurement was recorded at 10% of its real
    /// value), and `duration_ns` is clamped to avoid division by zero.
    pub fn record_performance(
        &mut self,
        operation: &str,
        size: usize,
        strategy: OptimizationStrategy,
        duration_ns: u64,
    ) {
        let ops_per_ns = size as f64 / duration_ns.max(1) as f64;
        // Normalize to 0.0..=1.0, saturating at 10 ops/ns.
        let performance_score = ops_per_ns.min(10.0) / 10.0;
        if let Ok(mut selector) = self.strategy_selector.write() {
            selector.update_weights(strategy, performance_score);
        }
        if let Ok(mut metrics) = self.performance_metrics.write() {
            let avg_time = metrics
                .operation_times
                .entry(operation.to_string())
                .or_insert(duration_ns as f64);
            *avg_time = (*avg_time * 0.9) + (duration_ns as f64 * 0.1);
            metrics
                .strategy_success_rates
                .insert(strategy, performance_score);
        }
        self.update_thresholds(operation, size, duration_ns);
    }

    /// Snapshot of the metrics, or `None` if the lock is poisoned.
    pub fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
        self.performance_metrics.read().ok().map(|m| m.clone())
    }

    /// Produce tuning advice (strategy, chunking, prefetch distance,
    /// allocation hint) for an operation of `inputsize` elements.
    pub fn analyze_operation(&self, operation_name: &str, inputsize: usize) -> OptimizationAdvice {
        let strategy = self.select_optimal_strategy(operation_name, inputsize);
        let chunk_size = if strategy == OptimizationStrategy::Parallel {
            Some(self.optimal_chunk_size::<f64>())
        } else {
            None
        };
        // Suggest prefetching ~8 cache lines ahead for larger streams.
        let prefetch_distance = if inputsize > 10_000 {
            Some(self.cache_line_size * 8)
        } else {
            None
        };
        OptimizationAdvice {
            recommended_strategy: strategy,
            optimal_chunk_size: chunk_size,
            prefetch_distance,
            memory_allocation_hint: if inputsize > 1_000_000 {
                Some("Consider using memory-mapped files for large outputs".to_string())
            } else {
                None
            },
        }
    }

    // NOTE(review): dead wrappers mirroring the module-level detectors;
    // nothing in this impl calls them. Kept for now.
    #[allow(dead_code)]
    fn is_zen4_or_newer() -> bool {
        crate::performance_optimization::is_zen4_or_newer()
    }
    #[allow(dead_code)]
    fn is_intel_golden_cove_or_newer() -> bool {
        crate::performance_optimization::is_intel_golden_cove_or_newer()
    }

    /// Pure threshold-based selection (ignores the learned selector).
    pub fn select_optimal_strategy(
        &self,
        _operation_name: &str,
        input_size: usize,
    ) -> OptimizationStrategy {
        if input_size >= self.gpu_threshold.load(Ordering::Relaxed) && self.has_gpu_support() {
            return OptimizationStrategy::Gpu;
        }
        if input_size >= self.parallel_threshold.load(Ordering::Relaxed) {
            return OptimizationStrategy::Parallel;
        }
        if input_size >= self.simd_threshold.load(Ordering::Relaxed) && self.has_simd_support() {
            return OptimizationStrategy::Simd;
        }
        OptimizationStrategy::Scalar
    }

    /// GPU dispatch is not wired up yet; always false.
    pub fn has_gpu_support(&self) -> bool {
        false
    }

    /// Runtime SIMD capability check (AVX2/SSE4.1 on x86_64, NEON on
    /// aarch64; false elsewhere).
    pub fn has_simd_support(&self) -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            std::arch::is_x86_feature_detected!("avx2")
                || std::arch::is_x86_feature_detected!("sse4.1")
        }
        #[cfg(target_arch = "aarch64")]
        {
            std::arch::is_aarch64_feature_detected!("neon")
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            false
        }
    }

    /// Adapt the thresholds from one measurement, then mirror the current
    /// threshold values into the metrics table for observability.
    fn update_thresholds(&self, operation: &str, size: usize, duration_ns: u64) {
        // BUGFIX: clamp to avoid div-by-zero on 0 ns measurements.
        let ops_per_ns = size as f64 / duration_ns.max(1) as f64;
        let current_strategy = self.select_optimal_strategy(operation, size);
        // Minimum acceptable ops/ns before a strategy's threshold rises;
        // twice the minimum lets the threshold shrink (with a floor).
        const PARALLEL_MIN_EFFICIENCY: f64 = 0.5;
        const SIMD_MIN_EFFICIENCY: f64 = 2.0;
        const GPU_MIN_EFFICIENCY: f64 = 10.0;
        match current_strategy {
            OptimizationStrategy::Parallel => {
                if ops_per_ns < PARALLEL_MIN_EFFICIENCY {
                    let new_threshold = (size as f64 * 1.2) as usize;
                    self.parallel_threshold
                        .store(new_threshold, Ordering::Relaxed);
                } else if ops_per_ns > PARALLEL_MIN_EFFICIENCY * 2.0 {
                    let current = self.parallel_threshold.load(Ordering::Relaxed);
                    let new_threshold = (current as f64 * 0.9).max(1000.0) as usize;
                    self.parallel_threshold
                        .store(new_threshold, Ordering::Relaxed);
                }
            }
            OptimizationStrategy::Simd => {
                if ops_per_ns < SIMD_MIN_EFFICIENCY {
                    let new_threshold = (size as f64 * 1.1) as usize;
                    self.simd_threshold.store(new_threshold, Ordering::Relaxed);
                } else if ops_per_ns > SIMD_MIN_EFFICIENCY * 2.0 {
                    let current = self.simd_threshold.load(Ordering::Relaxed);
                    let new_threshold = (current as f64 * 0.95).max(100.0) as usize;
                    self.simd_threshold.store(new_threshold, Ordering::Relaxed);
                }
            }
            OptimizationStrategy::Gpu => {
                if ops_per_ns < GPU_MIN_EFFICIENCY {
                    let new_threshold = (size as f64 * 1.5) as usize;
                    self.gpu_threshold.store(new_threshold, Ordering::Relaxed);
                } else if ops_per_ns > GPU_MIN_EFFICIENCY * 2.0 {
                    let current = self.gpu_threshold.load(Ordering::Relaxed);
                    let new_threshold = (current as f64 * 0.8).max(10000.0) as usize;
                    self.gpu_threshold.store(new_threshold, Ordering::Relaxed);
                }
            }
            _ => {
                // A scalar-ish run that was fast: lower the accelerated
                // thresholds so a similar size qualifies next time.
                if size > 1000 && ops_per_ns > SIMD_MIN_EFFICIENCY {
                    let current = self.simd_threshold.load(Ordering::Relaxed);
                    let new_threshold = size.min(current);
                    self.simd_threshold.store(new_threshold, Ordering::Relaxed);
                }
                if size > 10000 && ops_per_ns > PARALLEL_MIN_EFFICIENCY {
                    let current = self.parallel_threshold.load(Ordering::Relaxed);
                    let new_threshold = size.min(current);
                    self.parallel_threshold
                        .store(new_threshold, Ordering::Relaxed);
                }
            }
        }
        if let Ok(mut metrics) = self.performance_metrics.write() {
            metrics.operation_times.insert(
                format!("{}_threshold_parallel", operation),
                self.parallel_threshold.load(Ordering::Relaxed) as f64,
            );
            metrics.operation_times.insert(
                format!("{}_threshold_simd", operation),
                self.simd_threshold.load(Ordering::Relaxed) as f64,
            );
            metrics.operation_times.insert(
                format!("{}_threshold_gpu", operation),
                self.gpu_threshold.load(Ordering::Relaxed) as f64,
            );
        }
    }
}
/// Tuning recommendations produced by `AdaptiveOptimizer::analyze_operation`.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct OptimizationAdvice {
    pub recommended_strategy: OptimizationStrategy,
    /// Suggested chunk size (elements) when the strategy is `Parallel`.
    pub optimal_chunk_size: Option<usize>,
    /// Suggested prefetch look-ahead in bytes; set for inputs > 10k.
    pub prefetch_distance: Option<usize>,
    /// Human-readable allocation advice; set for inputs > 1M.
    pub memory_allocation_hint: Option<String>,
}
impl Default for AdaptiveOptimizer {
fn default() -> Self {
Self::new()
}
}
pub mod fast_paths {
    use super::*;

    /// Element-wise addition: `result[i] = a[i] + b[i]`.
    ///
    /// Returns an error if the three slice lengths differ.
    ///
    /// BUGFIX: the scalar tails and the parallel chunk loop previously
    /// wrote `result[0] = a[0] + b[0]` repeatedly instead of indexing
    /// with `i`, leaving most elements unwritten.
    #[inline]
    #[allow(unused_variables)]
    pub fn add_f64_arrays(a: &[f64], b: &[f64], result: &mut [f64]) -> Result<(), &'static str> {
        if a.len() != b.len() || a.len() != result.len() {
            return Err("Array lengths must match");
        }
        let len = a.len();
        // Only build the optimizer when an accelerated path can use it.
        #[cfg(any(feature = "simd", feature = "parallel"))]
        let optimizer = AdaptiveOptimizer::new();
        #[cfg(feature = "simd")]
        if optimizer.should_use_simd(len) {
            use crate::simd_ops::SimdUnifiedOps;
            use ::ndarray::ArrayView1;
            let simd_chunks = len / 4;
            for i in 0..simd_chunks {
                let start = i * 4;
                let end = start + 4;
                let a_view = ArrayView1::from(&a[start..end]);
                let b_view = ArrayView1::from(&b[start..end]);
                let simd_result = f64::simd_add(&a_view, &b_view);
                result[start..end]
                    .copy_from_slice(simd_result.as_slice().expect("Operation failed"));
            }
            // Scalar tail for the final len % 4 elements.
            for i in (simd_chunks * 4)..len {
                result[i] = a[i] + b[i];
            }
            return Ok(());
        }
        #[cfg(feature = "parallel")]
        if optimizer.should_use_parallel(len) {
            use crate::parallel_ops::*;
            let chunk = optimizer.optimal_chunk_size::<f64>();
            result
                .par_chunks_mut(chunk)
                .zip(a.par_chunks(chunk))
                .zip(b.par_chunks(chunk))
                .for_each(|((r_chunk, a_chunk), b_chunk)| {
                    for i in 0..r_chunk.len() {
                        r_chunk[i] = a_chunk[i] + b_chunk[i];
                    }
                });
            return Ok(());
        }
        // Scalar path, manually unrolled by 8.
        let chunks = len / 8;
        for i in 0..chunks {
            let idx = i * 8;
            result[idx] = a[idx] + b[idx];
            result[idx + 1] = a[idx + 1] + b[idx + 1];
            result[idx + 2] = a[idx + 2] + b[idx + 2];
            result[idx + 3] = a[idx + 3] + b[idx + 3];
            result[idx + 4] = a[idx + 4] + b[idx + 4];
            result[idx + 5] = a[idx + 5] + b[idx + 5];
            result[idx + 6] = a[idx + 6] + b[idx + 6];
            result[idx + 7] = a[idx + 7] + b[idx + 7];
        }
        // Tail for the final len % 8 elements.
        for i in (chunks * 8)..len {
            result[i] = a[i] + b[i];
        }
        Ok(())
    }

    /// Tiled matrix multiply: C(m×n) = A(m×k) · B(k×n), row-major slices.
    /// `c` is zeroed first. Errors if any slice length disagrees with the
    /// given dimensions.
    #[inline]
    pub fn matmul_kernel(
        a: &[f64],
        b: &[f64],
        c: &mut [f64],
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<(), &'static str> {
        if a.len() != m * k || b.len() != k * n || c.len() != m * n {
            return Err("Invalid matrix dimensions");
        }
        // 64×64 f64 tiles keep each tile's working set cache-resident.
        const TILE_M: usize = 64;
        const TILE_N: usize = 64;
        const TILE_K: usize = 64;
        c.fill(0.0);
        #[cfg(feature = "parallel")]
        {
            let optimizer = AdaptiveOptimizer::new();
            if optimizer.should_use_parallel(m * n) {
                use crate::parallel_ops::*;
                use std::sync::Mutex;
                let c_mutex = Mutex::new(c);
                // Each row band accumulates its tile products locally,
                // then merges into C under the mutex once per band.
                (0..m).into_par_iter().step_by(TILE_M).for_each(|i0| {
                    let i_max = (i0 + TILE_M).min(m);
                    let mut local_updates = Vec::new();
                    for j0 in (0..n).step_by(TILE_N) {
                        for k0 in (0..k).step_by(TILE_K) {
                            let j_max = (j0 + TILE_N).min(n);
                            let k_max = (k0 + TILE_K).min(k);
                            for i in i0..i_max {
                                for j in j0..j_max {
                                    let mut sum = 0.0;
                                    for k_idx in k0..k_max {
                                        sum += a[i * k + k_idx] * b[k_idx * n + j];
                                    }
                                    local_updates.push((i, j, sum));
                                }
                            }
                        }
                    }
                    // NOTE(review): a poisoned lock silently drops this
                    // band's results — confirm best-effort is intended.
                    if let Ok(mut c_guard) = c_mutex.lock() {
                        for (i, j, sum) in local_updates {
                            c_guard[i * n + j] += sum;
                        }
                    }
                });
                return Ok(());
            }
        }
        // Sequential tiled fallback; accumulates directly into C.
        for i0 in (0..m).step_by(TILE_M) {
            for j0 in (0..n).step_by(TILE_N) {
                for k0 in (0..k).step_by(TILE_K) {
                    let i_max = (i0 + TILE_M).min(m);
                    let j_max = (j0 + TILE_N).min(n);
                    let k_max = (k0 + TILE_K).min(k);
                    for i in i0..i_max {
                        for j in j0..j_max {
                            let mut sum = c[i * n + j];
                            for k_idx in k0..k_max {
                                sum += a[i * k + k_idx] * b[k_idx * n + j];
                            }
                            c[i * n + j] = sum;
                        }
                    }
                }
            }
        }
        Ok(())
    }
}
/// Classifies observed memory-access address sequences (for prefetch
/// planning).
#[allow(dead_code)]
pub struct MemoryAccessOptimizer {
    // Reserved for incremental stride tracking; not yet consulted by
    // `analyze_access_pattern`.
    stride_detector: StrideDetector,
}

/// Incremental stride-detection state (currently unused scaffolding).
#[derive(Default)]
#[allow(dead_code)]
struct StrideDetector {
    last_address: Option<usize>,
    detected_stride: Option<isize>,
    confidence: f32,
}

impl MemoryAccessOptimizer {
    /// New optimizer with empty stride-detector state.
    pub fn new() -> Self {
        Self {
            stride_detector: StrideDetector::default(),
        }
    }

    /// Classify a sequence of addresses as sequential, reverse-sequential,
    /// strided (constant element stride > 1), or random.
    ///
    /// BUGFIX: fewer than two addresses now return `Unknown` (a single
    /// address previously panicked indexing `strides[0]` on an empty
    /// vector), and zero-sized `T` no longer divides by zero.
    pub fn analyze_access_pattern<T>(&mut self, addresses: &[*const T]) -> AccessPattern {
        if addresses.len() < 2 {
            // No pair of accesses means there is no stride to measure.
            return AccessPattern::Unknown;
        }
        // Strides in elements, not bytes (ZSTs clamped to 1 byte).
        let elem_size = std::mem::size_of::<T>().max(1) as isize;
        let strides: Vec<isize> = addresses
            .windows(2)
            .map(|w| ((w[1] as isize) - (w[0] as isize)) / elem_size)
            .collect();
        if strides.windows(2).all(|w| w[0] == w[1]) {
            match strides[0] {
                1 => AccessPattern::Sequential,
                -1 => AccessPattern::ReverseSequential,
                s if s > 1 => AccessPattern::Strided(s as usize),
                // Stride 0 (repeated address) or < -1 is treated as random.
                _ => AccessPattern::Random,
            }
        } else {
            AccessPattern::Random
        }
    }
}

/// Classified memory-access pattern.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AccessPattern {
    /// Consecutive ascending elements (stride +1).
    Sequential,
    /// Consecutive descending elements (stride -1).
    ReverseSequential,
    /// Constant stride > 1, measured in elements.
    Strided(usize),
    /// No constant positive stride detected.
    Random,
    /// Not enough samples to classify.
    Unknown,
}

impl Default for MemoryAccessOptimizer {
    fn default() -> Self {
        Self::new()
    }
}
pub use crate::performance::benchmarking;
pub use crate::performance::cache_optimization as cache_aware_algorithms;
pub use crate::performance::advanced_optimization;