use crate::error::{NumRs2Error, Result};
use crate::traits::{MemoryAllocator, SpecializedAllocator};
use std::alloc::Layout;
use std::collections::HashMap;
use std::ptr::NonNull;
use std::time::{Duration, Instant};
pub mod cache_constants {
pub const CACHE_LINE_SIZE: usize = 64;
pub const L1_CACHE_SIZE: usize = 32 * 1024;
pub const L2_CACHE_SIZE: usize = 256 * 1024;
pub const L3_CACHE_SIZE: usize = 8 * 1024 * 1024;
pub const PAGE_SIZE: usize = 4096;
pub const PREFETCH_DISTANCE: usize = 3;
}
#[derive(Debug, Clone)]
pub struct CacheConfig {
pub cache_line_size: usize,
pub l1_cache_size: usize,
pub l2_cache_size: usize,
pub l3_cache_size: usize,
pub enable_cache_padding: bool,
pub enable_blocking: bool,
pub enable_prefetch: bool,
pub optimize_access_patterns: bool,
}
impl Default for CacheConfig {
fn default() -> Self {
Self {
cache_line_size: cache_constants::CACHE_LINE_SIZE,
l1_cache_size: cache_constants::L1_CACHE_SIZE,
l2_cache_size: cache_constants::L2_CACHE_SIZE,
l3_cache_size: cache_constants::L3_CACHE_SIZE,
enable_cache_padding: true,
enable_blocking: true,
enable_prefetch: true,
optimize_access_patterns: true,
}
}
}
#[derive(Debug, Clone, Default)]
pub struct CacheMetrics {
pub l1_hits: u64,
pub l1_misses: u64,
pub l2_hits: u64,
pub l2_misses: u64,
pub total_accesses: u64,
pub avg_latency_ns: f64,
pub cache_efficiency: f64,
pub last_updated: Option<Instant>,
}
impl CacheMetrics {
pub fn l1_hit_ratio(&self) -> f64 {
if self.l1_hits + self.l1_misses == 0 {
0.0
} else {
self.l1_hits as f64 / (self.l1_hits + self.l1_misses) as f64
}
}
pub fn l2_hit_ratio(&self) -> f64 {
if self.l2_hits + self.l2_misses == 0 {
0.0
} else {
self.l2_hits as f64 / (self.l2_hits + self.l2_misses) as f64
}
}
pub fn overall_efficiency(&self) -> f64 {
let l1_ratio = self.l1_hit_ratio();
let l2_ratio = self.l2_hit_ratio();
(l1_ratio * 0.9) + (l2_ratio * 0.1)
}
}
pub struct CacheOptimizedAllocator {
config: CacheConfig,
metrics: std::sync::Mutex<CacheMetrics>,
access_tracker: std::sync::Mutex<AccessTracker>,
}
impl Default for CacheOptimizedAllocator {
fn default() -> Self {
Self::new(CacheConfig::default())
}
}
impl CacheOptimizedAllocator {
pub fn new(config: CacheConfig) -> Self {
Self {
config,
metrics: std::sync::Mutex::new(CacheMetrics::default()),
access_tracker: std::sync::Mutex::new(AccessTracker::new()),
}
}
pub fn get_cache_metrics(&self) -> CacheMetrics {
self.metrics
.lock()
.expect("metrics mutex should not be poisoned")
.clone()
}
pub fn optimal_block_size(&self, element_size: usize, cache_level: CacheLevel) -> usize {
let cache_size = match cache_level {
CacheLevel::L1 => self.config.l1_cache_size,
CacheLevel::L2 => self.config.l2_cache_size,
CacheLevel::L3 => self.config.l3_cache_size,
};
let usable_cache = cache_size / 4;
let elements_per_block = usable_cache / element_size;
elements_per_block.next_power_of_two() / 2
}
pub fn fits_in_cache(&self, size: usize, cache_level: CacheLevel) -> bool {
let cache_size = match cache_level {
CacheLevel::L1 => self.config.l1_cache_size,
CacheLevel::L2 => self.config.l2_cache_size,
CacheLevel::L3 => self.config.l3_cache_size,
};
size <= (cache_size * 4) / 5
}
fn record_access(&self, address: usize, size: usize, access_type: AccessType) {
if let Ok(mut tracker) = self.access_tracker.try_lock() {
tracker.record_access(address, size, access_type);
if tracker.should_update_metrics() {
if let Ok(mut metrics) = self.metrics.try_lock() {
tracker.update_cache_metrics(&mut metrics, &self.config);
}
}
}
}
pub fn analyze_cache_performance(&self) -> Vec<CacheOptimizationRecommendation> {
let metrics = self.get_cache_metrics();
let mut recommendations = Vec::new();
if metrics.l1_hit_ratio() < 0.9 {
recommendations.push(CacheOptimizationRecommendation {
optimization_type: CacheOptimizationType::ImproveLocality,
description: "L1 cache hit ratio is low. Consider improving data locality through blocking or tiling.".to_string(),
estimated_improvement: 0.15,
complexity: 3,
target_cache_level: CacheLevel::L1,
});
}
if metrics.l2_hit_ratio() < 0.95 {
recommendations.push(CacheOptimizationRecommendation {
optimization_type: CacheOptimizationType::ReduceWorkingSet,
description: "L2 cache hit ratio indicates large working set. Consider data structure reorganization.".to_string(),
estimated_improvement: 0.10,
complexity: 4,
target_cache_level: CacheLevel::L2,
});
}
if self.has_potential_false_sharing() {
recommendations.push(CacheOptimizationRecommendation {
optimization_type: CacheOptimizationType::EliminateFalseSharing,
description: "Potential false sharing detected. Consider cache line padding for frequently accessed data.".to_string(),
estimated_improvement: 0.25,
complexity: 2,
target_cache_level: CacheLevel::L1,
});
}
recommendations
}
fn has_potential_false_sharing(&self) -> bool {
if let Ok(tracker) = self.access_tracker.try_lock() {
tracker.concurrent_writes_in_cache_lines() > 10
} else {
false
}
}
}
impl std::fmt::Debug for CacheOptimizedAllocator {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CacheOptimizedAllocator")
.field("config", &self.config)
.field("metrics", &"Mutex<CacheMetrics>")
.finish()
}
}
impl MemoryAllocator for CacheOptimizedAllocator {
type Error = NumRs2Error;
fn allocate(&self, layout: Layout) -> Result<NonNull<u8>> {
let aligned_size = if self.config.enable_cache_padding {
(layout.size() + self.config.cache_line_size - 1) & !(self.config.cache_line_size - 1)
} else {
layout.size()
};
let aligned_layout = Layout::from_size_align(
aligned_size,
layout.align().max(self.config.cache_line_size),
)
.map_err(|_| {
NumRs2Error::Memory(crate::error::memory::MemoryError::alignment_error(
"Invalid layout",
self.config.cache_line_size,
))
})?;
unsafe {
let ptr = std::alloc::alloc(aligned_layout);
if ptr.is_null() {
return Err(NumRs2Error::Memory(
crate::error::memory::MemoryError::allocation_failed(
"Allocation failed",
aligned_size,
),
));
}
let non_null_ptr = NonNull::new_unchecked(ptr);
self.record_access(ptr as usize, aligned_size, AccessType::Write);
Ok(non_null_ptr)
}
}
unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) -> Result<()> {
let aligned_size = if self.config.enable_cache_padding {
(layout.size() + self.config.cache_line_size - 1) & !(self.config.cache_line_size - 1)
} else {
layout.size()
};
let aligned_layout = Layout::from_size_align(
aligned_size,
layout.align().max(self.config.cache_line_size),
)
.map_err(|_| {
NumRs2Error::Memory(crate::error::memory::MemoryError::alignment_error(
"Invalid layout",
self.config.cache_line_size,
))
})?;
std::alloc::dealloc(ptr.as_ptr(), aligned_layout);
Ok(())
}
unsafe fn reallocate(
&self,
ptr: NonNull<u8>,
old_layout: Layout,
new_layout: Layout,
) -> Result<NonNull<u8>> {
let new_ptr = self.allocate(new_layout)?;
let copy_size = old_layout.size().min(new_layout.size());
std::ptr::copy_nonoverlapping(ptr.as_ptr(), new_ptr.as_ptr(), copy_size);
self.deallocate(ptr, old_layout)?;
Ok(new_ptr)
}
fn supports_layout(&self, layout: Layout) -> bool {
layout.size() > 0 && layout.align() <= self.config.cache_line_size
}
fn preferred_alignment(&self) -> usize {
self.config.cache_line_size
}
fn statistics(&self) -> Option<crate::traits::AllocationStats> {
let metrics = self.get_cache_metrics();
Some(crate::traits::AllocationStats {
bytes_allocated: 0, bytes_deallocated: 0,
active_allocations: 0,
peak_usage: 0,
allocation_count: metrics.total_accesses as usize,
deallocation_count: 0,
})
}
}
impl SpecializedAllocator for CacheOptimizedAllocator {
fn allocation_error(&self, msg: &str) -> Self::Error {
NumRs2Error::Memory(crate::error::memory::MemoryError::allocation_failed(msg, 0))
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CacheLevel {
L1,
L2,
L3,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AccessType {
Read,
Write,
ReadWrite,
}
#[derive(Debug, Clone)]
pub struct CacheOptimizationRecommendation {
pub optimization_type: CacheOptimizationType,
pub description: String,
pub estimated_improvement: f64,
pub complexity: u8,
pub target_cache_level: CacheLevel,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CacheOptimizationType {
ImproveLocality,
ReduceWorkingSet,
EliminateFalseSharing,
OptimizeStride,
EnablePrefetch,
UseBlocking,
AlignDataStructures,
}
#[derive(Debug)]
struct AccessTracker {
recent_accesses: Vec<MemoryAccess>,
access_histogram: HashMap<usize, usize>, total_accesses: u64,
concurrent_writes: u64,
last_analysis: Option<Instant>,
}
#[derive(Debug, Clone)]
struct MemoryAccess {
address: usize,
#[allow(dead_code)]
size: usize,
#[allow(dead_code)]
access_type: AccessType,
#[allow(dead_code)]
timestamp: Instant,
}
impl AccessTracker {
fn new() -> Self {
Self {
recent_accesses: Vec::new(),
access_histogram: HashMap::new(),
total_accesses: 0,
concurrent_writes: 0,
last_analysis: None,
}
}
fn record_access(&mut self, address: usize, size: usize, access_type: AccessType) {
let access = MemoryAccess {
address,
size,
access_type,
timestamp: Instant::now(),
};
let cache_line = address / cache_constants::CACHE_LINE_SIZE;
*self.access_histogram.entry(cache_line).or_insert(0) += 1;
if access_type == AccessType::Write || access_type == AccessType::ReadWrite {
self.concurrent_writes += 1;
}
self.recent_accesses.push(access);
self.total_accesses += 1;
if self.recent_accesses.len() > 1000 {
self.recent_accesses.remove(0);
}
}
fn should_update_metrics(&self) -> bool {
match self.last_analysis {
None => true,
Some(last) => last.elapsed() > Duration::from_secs(1),
}
}
fn update_cache_metrics(&mut self, metrics: &mut CacheMetrics, config: &CacheConfig) {
let mut l1_hits = 0u64;
let mut l1_misses = 0u64;
let mut l2_hits = 0u64;
let mut l2_misses = 0u64;
let mut simulated_l1_cache = std::collections::HashSet::new();
let mut simulated_l2_cache = std::collections::HashSet::new();
let l1_cache_lines = config.l1_cache_size / config.cache_line_size;
let l2_cache_lines = config.l2_cache_size / config.cache_line_size;
for access in &self.recent_accesses {
let cache_line = access.address / config.cache_line_size;
if simulated_l1_cache.contains(&cache_line) {
l1_hits += 1;
} else if simulated_l2_cache.contains(&cache_line) {
l1_misses += 1;
l2_hits += 1;
if simulated_l1_cache.len() >= l1_cache_lines {
simulated_l1_cache.clear();
}
simulated_l1_cache.insert(cache_line);
} else {
l1_misses += 1;
l2_misses += 1;
if simulated_l2_cache.len() >= l2_cache_lines {
simulated_l2_cache.clear();
}
simulated_l2_cache.insert(cache_line);
if simulated_l1_cache.len() >= l1_cache_lines {
simulated_l1_cache.clear();
}
simulated_l1_cache.insert(cache_line);
}
}
metrics.l1_hits = l1_hits;
metrics.l1_misses = l1_misses;
metrics.l2_hits = l2_hits;
metrics.l2_misses = l2_misses;
metrics.total_accesses = self.total_accesses;
metrics.cache_efficiency = metrics.overall_efficiency();
metrics.last_updated = Some(Instant::now());
self.last_analysis = Some(Instant::now());
}
fn concurrent_writes_in_cache_lines(&self) -> u64 {
self.concurrent_writes
}
}
pub struct CacheBlockedMatrix<T> {
data: Vec<T>,
rows: usize,
cols: usize,
block_size: usize,
cache_config: CacheConfig,
}
impl<T: Clone> CacheBlockedMatrix<T> {
pub fn new(rows: usize, cols: usize, cache_config: CacheConfig) -> Self {
let element_size = std::mem::size_of::<T>();
let block_size = if cache_config.enable_blocking {
let cache_size = cache_config.l1_cache_size / 4; let elements_per_block = cache_size / element_size;
(elements_per_block as f64).sqrt() as usize
} else {
64 };
Self {
data: vec![unsafe { std::mem::zeroed() }; rows * cols],
rows,
cols,
block_size,
cache_config,
}
}
pub fn get(&self, row: usize, col: usize) -> Option<&T> {
if row < self.rows && col < self.cols {
Some(&self.data[row * self.cols + col])
} else {
None
}
}
pub fn set(&mut self, row: usize, col: usize, value: T) -> bool {
if row < self.rows && col < self.cols {
self.data[row * self.cols + col] = value;
true
} else {
false
}
}
pub fn multiply_blocked(&self, other: &Self) -> Option<Self>
where
T: std::ops::Add<Output = T> + std::ops::Mul<Output = T> + Copy + Default,
{
if self.cols != other.rows {
return None;
}
let mut result = Self {
data: vec![T::default(); self.rows * other.cols],
rows: self.rows,
cols: other.cols,
block_size: self.block_size,
cache_config: self.cache_config.clone(),
};
for i_block in (0..self.rows).step_by(self.block_size) {
for j_block in (0..other.cols).step_by(self.block_size) {
for k_block in (0..self.cols).step_by(self.block_size) {
let i_end = (i_block + self.block_size).min(self.rows);
let j_end = (j_block + self.block_size).min(other.cols);
let k_end = (k_block + self.block_size).min(self.cols);
for i in i_block..i_end {
for j in j_block..j_end {
let mut sum = result.data[i * result.cols + j];
for k in k_block..k_end {
sum = sum
+ self.data[i * self.cols + k] * other.data[k * other.cols + j];
}
result.data[i * result.cols + j] = sum;
}
}
}
}
}
Some(result)
}
pub fn transpose_blocked(&self) -> Self
where
T: Copy + Default,
{
let mut result = Self {
data: vec![T::default(); self.rows * self.cols],
rows: self.cols,
cols: self.rows,
block_size: self.block_size,
cache_config: self.cache_config.clone(),
};
for i_block in (0..self.rows).step_by(self.block_size) {
for j_block in (0..self.cols).step_by(self.block_size) {
let i_end = (i_block + self.block_size).min(self.rows);
let j_end = (j_block + self.block_size).min(self.cols);
for i in i_block..i_end {
for j in j_block..j_end {
result.data[j * result.cols + i] = self.data[i * self.cols + j];
}
}
}
}
result
}
}
pub struct CacheOptimizedBuilder {
config: CacheConfig,
}
impl CacheOptimizedBuilder {
pub fn new(config: CacheConfig) -> Self {
Self { config }
}
pub fn build_allocator(&self) -> CacheOptimizedAllocator {
CacheOptimizedAllocator::new(self.config.clone())
}
pub fn build_matrix<T: Clone>(&self, rows: usize, cols: usize) -> CacheBlockedMatrix<T> {
CacheBlockedMatrix::new(rows, cols, self.config.clone())
}
pub fn optimize_for_size(
&self,
total_elements: usize,
element_size: usize,
) -> CacheOptimizationParams {
let total_size = total_elements * element_size;
let recommended_block_size = if total_size <= self.config.l1_cache_size {
self.config.l1_cache_size / 4 / element_size
} else if total_size <= self.config.l2_cache_size {
self.config.l2_cache_size / 8 / element_size
} else {
self.config.l3_cache_size / 16 / element_size
};
CacheOptimizationParams {
block_size: recommended_block_size.max(1),
enable_prefetch: total_size > self.config.l2_cache_size,
enable_blocking: total_size > self.config.l1_cache_size,
target_cache_level: if total_size <= self.config.l1_cache_size {
CacheLevel::L1
} else if total_size <= self.config.l2_cache_size {
CacheLevel::L2
} else {
CacheLevel::L3
},
}
}
}
#[derive(Debug, Clone)]
pub struct CacheOptimizationParams {
pub block_size: usize,
pub enable_prefetch: bool,
pub enable_blocking: bool,
pub target_cache_level: CacheLevel,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cache_optimized_allocator() {
let config = CacheConfig::default();
let allocator = CacheOptimizedAllocator::new(config);
let layout = Layout::from_size_align(1024, 8).expect("Layout should succeed");
let ptr = allocator
.allocate(layout)
.expect("allocation should succeed");
assert_eq!(ptr.as_ptr() as usize % cache_constants::CACHE_LINE_SIZE, 0);
unsafe {
allocator
.deallocate(ptr, layout)
.expect("deallocation should succeed");
}
}
#[test]
fn test_cache_metrics() {
let metrics = CacheMetrics {
l1_hits: 900,
l1_misses: 100,
l2_hits: 80,
l2_misses: 20,
..Default::default()
};
assert_eq!(metrics.l1_hit_ratio(), 0.9);
assert_eq!(metrics.l2_hit_ratio(), 0.8);
assert!(metrics.overall_efficiency() > 0.8);
}
#[test]
fn test_cache_blocked_matrix() {
let config = CacheConfig::default();
let mut matrix = CacheBlockedMatrix::<f32>::new(4, 4, config);
matrix.set(0, 0, 1.0);
matrix.set(1, 1, 2.0);
matrix.set(2, 2, 3.0);
matrix.set(3, 3, 4.0);
let transposed = matrix.transpose_blocked();
assert_eq!(transposed.get(0, 0), Some(&1.0));
assert_eq!(transposed.get(1, 1), Some(&2.0));
}
#[test]
fn test_cache_optimization_params() {
let config = CacheConfig::default();
let builder = CacheOptimizedBuilder::new(config);
let params = builder.optimize_for_size(1000, 4);
assert_eq!(params.target_cache_level, CacheLevel::L1);
assert!(!params.enable_blocking);
let params = builder.optimize_for_size(1_000_000, 8);
assert_eq!(params.target_cache_level, CacheLevel::L3);
assert!(params.enable_blocking);
assert!(params.enable_prefetch);
}
#[test]
fn test_optimal_block_size_calculation() {
let config = CacheConfig::default();
let allocator = CacheOptimizedAllocator::new(config);
let block_size_l1 = allocator.optimal_block_size(8, CacheLevel::L1);
let block_size_l2 = allocator.optimal_block_size(8, CacheLevel::L2);
let block_size_l3 = allocator.optimal_block_size(8, CacheLevel::L3);
assert!(block_size_l1 < block_size_l2);
assert!(block_size_l2 < block_size_l3);
assert!(block_size_l1.is_power_of_two());
}
#[test]
fn test_cache_fits_analysis() {
let config = CacheConfig::default();
let allocator = CacheOptimizedAllocator::new(config);
assert!(allocator.fits_in_cache(16 * 1024, CacheLevel::L1));
assert!(!allocator.fits_in_cache(100 * 1024, CacheLevel::L1));
assert!(allocator.fits_in_cache(100 * 1024, CacheLevel::L2));
assert!(!allocator.fits_in_cache(4 * 1024 * 1024, CacheLevel::L2));
assert!(allocator.fits_in_cache(4 * 1024 * 1024, CacheLevel::L3));
}
#[test]
fn test_matrix_blocked_multiplication() {
let config = CacheConfig::default();
let mut a = CacheBlockedMatrix::<f32>::new(2, 2, config.clone());
let mut b = CacheBlockedMatrix::<f32>::new(2, 2, config);
a.set(0, 0, 1.0);
a.set(1, 1, 1.0);
b.set(0, 0, 2.0);
b.set(1, 1, 2.0);
let result = a
.multiply_blocked(&b)
.expect("matrix multiplication should succeed");
assert_eq!(result.get(0, 0), Some(&2.0));
assert_eq!(result.get(1, 1), Some(&2.0));
assert_eq!(result.get(0, 1), Some(&0.0));
assert_eq!(result.get(1, 0), Some(&0.0));
}
#[test]
fn test_cache_performance_analysis() {
let config = CacheConfig::default();
let allocator = CacheOptimizedAllocator::new(config);
for _ in 0..100 {
let layout = Layout::from_size_align(64, 8).expect("Layout should succeed");
let _ptr = allocator
.allocate(layout)
.expect("allocation should succeed");
}
let recommendations = allocator.analyze_cache_performance();
assert!(!recommendations.is_empty());
}
}