use crate::error::{Result, ZiporaError};
use crate::memory::simd_ops::SimdMemOps;
use std::alloc::{Layout, alloc, dealloc};
use std::mem;
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};
/// Cache line size assumed when hardware detection is unavailable (bytes).
pub const DEFAULT_CACHE_LINE_SIZE: usize = 64;
/// Typical per-core L1 data cache size used as a fallback (bytes).
pub const L1_CACHE_SIZE: usize = 32 * 1024;
/// Typical per-core L2 cache size used as a fallback (bytes).
pub const L2_CACHE_SIZE: usize = 256 * 1024;
/// Typical shared L3 cache size used as a fallback (bytes).
pub const L3_CACHE_SIZE: usize = 8 * 1024 * 1024;
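/// Expected memory access pattern, used to tune prefetching and hot/cold
/// data placement.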
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum AccessPattern {
Sequential,
Random,
WriteHeavy,
ReadHeavy,
Mixed,
}
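/// Detected (or default) cache sizes and line sizes, in bytes, for up to
/// three cache levels.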
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CacheHierarchy {
pub l1_line_size: usize,
pub l1_size: usize,
pub l2_line_size: usize,
pub l2_size: usize,
pub l3_line_size: usize,
pub l3_size: usize,
pub levels: usize,
pub associativity: usize,
}
impl Default for CacheHierarchy {
fn default() -> Self {
Self {
l1_line_size: DEFAULT_CACHE_LINE_SIZE,
l1_size: L1_CACHE_SIZE,
l2_line_size: DEFAULT_CACHE_LINE_SIZE,
l2_size: L2_CACHE_SIZE,
l3_line_size: DEFAULT_CACHE_LINE_SIZE,
l3_size: L3_CACHE_SIZE,
levels: 3,
associativity: 8,
}
}
}
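/// Tuning knobs for cache-conscious allocation, prefetching, and hot/cold
/// separation. Constructors such as `sequential()` and `random()` preset
/// these for a particular access pattern.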
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CacheLayoutConfig {
    /// Detected or default cache hierarchy.
    pub hierarchy: CacheHierarchy,
    /// Cache line size used for alignment and size rounding (bytes).
    pub cache_line_size: usize,
    /// Whether to segregate frequently accessed data from rarely used data.
    pub enable_hot_cold_separation: bool,
    /// Access count at or above which an item is considered hot.
    pub hot_threshold: usize,
    /// Whether software prefetch hints are issued.
    pub enable_prefetch: bool,
    /// How far ahead to prefetch (bytes).
    pub prefetch_distance: usize,
    /// Access pattern the layout is tuned for.
    pub access_pattern: AccessPattern,
}
impl Default for CacheLayoutConfig {
fn default() -> Self {
Self {
hierarchy: CacheHierarchy::default(),
cache_line_size: DEFAULT_CACHE_LINE_SIZE,
enable_hot_cold_separation: true,
            hot_threshold: 1000,
            enable_prefetch: true,
            prefetch_distance: 256,
            access_pattern: AccessPattern::Mixed,
}
}
}
impl CacheLayoutConfig {
pub fn new() -> Self {
let hierarchy = detect_cache_hierarchy();
Self {
cache_line_size: hierarchy.l1_line_size,
hierarchy,
..Default::default()
}
}
pub fn sequential() -> Self {
Self {
access_pattern: AccessPattern::Sequential,
            prefetch_distance: 512,
            enable_prefetch: true,
..Self::new()
}
}
pub fn random() -> Self {
Self {
access_pattern: AccessPattern::Random,
            prefetch_distance: 64,
            enable_hot_cold_separation: true,
..Self::new()
}
}
pub fn write_heavy() -> Self {
Self {
access_pattern: AccessPattern::WriteHeavy,
            enable_hot_cold_separation: false,
            prefetch_distance: 128,
..Self::new()
}
}
pub fn read_heavy() -> Self {
Self {
access_pattern: AccessPattern::ReadHeavy,
enable_hot_cold_separation: true,
prefetch_distance: 256,
enable_prefetch: true,
..Self::new()
}
}
}
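/// Cache-line-aware wrapper over the global allocator.
///
/// Requests are rounded up to whole cache lines and aligned to at least the
/// configured line size; hot and cold allocations are counted separately so
/// placement behavior can be inspected via `stats()`.
///
/// # Examples
///
/// A minimal sketch (marked `ignore` because the public module path is
/// assumed here, not confirmed):
///
/// ```ignore
/// let allocator = CacheOptimizedAllocator::optimal();
/// let ptr = allocator.allocate_aligned(256, 64, true).unwrap();
/// // ... use the cache-line-aligned block ...
/// // Deallocate with the same size and alignment so the layouts match.
/// allocator.deallocate_aligned(ptr, 256, 64).unwrap();
/// ```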
#[derive(Debug)]
pub struct CacheOptimizedAllocator {
config: CacheLayoutConfig,
simd_ops: SimdMemOps,
hot_allocations: AtomicUsize,
cold_allocations: AtomicUsize,
}
impl Clone for CacheOptimizedAllocator {
fn clone(&self) -> Self {
Self {
config: self.config.clone(),
simd_ops: self.simd_ops.clone(),
            hot_allocations: AtomicUsize::new(self.hot_allocations.load(Ordering::Relaxed)),
            cold_allocations: AtomicUsize::new(self.cold_allocations.load(Ordering::Relaxed)),
}
}
}
impl CacheOptimizedAllocator {
pub fn new(config: CacheLayoutConfig) -> Self {
Self {
config,
simd_ops: SimdMemOps::new(),
hot_allocations: AtomicUsize::new(0),
cold_allocations: AtomicUsize::new(0),
}
}
pub fn optimal() -> Self {
Self::new(CacheLayoutConfig::new())
}
    pub fn allocate_aligned(&self, size: usize, alignment: usize, is_hot: bool) -> Result<NonNull<u8>> {
        let effective_alignment = alignment.max(self.config.cache_line_size);
        // Round the request up to whole cache lines, and allocate at least
        // one line: `alloc` with a zero-size layout is undefined behavior.
        let aligned_size = align_to_cache_line(size, self.config.cache_line_size)
            .max(self.config.cache_line_size);
        let layout = Layout::from_size_align(aligned_size, effective_alignment)
            .map_err(|_| ZiporaError::invalid_data("Invalid layout for cache-aligned allocation"))?;
let ptr = unsafe { alloc(layout) };
if ptr.is_null() {
return Err(ZiporaError::out_of_memory(size));
}
if is_hot {
self.hot_allocations.fetch_add(1, Ordering::Relaxed);
} else {
self.cold_allocations.fetch_add(1, Ordering::Relaxed);
}
Ok(unsafe { NonNull::new_unchecked(ptr) })
}
    pub fn deallocate_aligned(&self, ptr: NonNull<u8>, size: usize, alignment: usize) -> Result<()> {
        // Must mirror `allocate_aligned` exactly so the computed layout
        // matches the one the block was allocated with.
        let effective_alignment = alignment.max(self.config.cache_line_size);
        let aligned_size = align_to_cache_line(size, self.config.cache_line_size)
            .max(self.config.cache_line_size);
        let layout = Layout::from_size_align(aligned_size, effective_alignment)
            .map_err(|_| ZiporaError::invalid_data("Invalid layout for cache-aligned deallocation"))?;
unsafe {
dealloc(ptr.as_ptr(), layout);
}
Ok(())
}
pub fn prefetch(&self, addr: *const u8, hint: PrefetchHint) {
if !self.config.enable_prefetch {
return;
}
        #[cfg(target_arch = "x86_64")]
        unsafe {
            use std::arch::x86_64::{
                _MM_HINT_NTA, _MM_HINT_T0, _MM_HINT_T1, _MM_HINT_T2, _mm_prefetch,
            };
            // `_mm_prefetch` takes its hint as a const generic parameter on
            // current Rust, not as a runtime argument.
            match hint {
                PrefetchHint::T0 => _mm_prefetch::<{ _MM_HINT_T0 }>(addr as *const i8),
                PrefetchHint::T1 => _mm_prefetch::<{ _MM_HINT_T1 }>(addr as *const i8),
                PrefetchHint::T2 => _mm_prefetch::<{ _MM_HINT_T2 }>(addr as *const i8),
                PrefetchHint::NTA => _mm_prefetch::<{ _MM_HINT_NTA }>(addr as *const i8),
            }
        }
#[cfg(target_arch = "aarch64")]
unsafe {
match hint {
PrefetchHint::T0 | PrefetchHint::T1 => {
std::arch::asm!("prfm pldl1keep, [{}]", in(reg) addr);
}
PrefetchHint::T2 => {
std::arch::asm!("prfm pldl2keep, [{}]", in(reg) addr);
}
PrefetchHint::NTA => {
std::arch::asm!("prfm pldl1strm, [{}]", in(reg) addr);
}
}
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
let _ = (addr, hint);
}
}
pub fn prefetch_range(&self, data: &[u8]) {
if !self.config.enable_prefetch || data.is_empty() {
return;
}
        // Walk the range one step at a time; the step is one cache line
        // unless the configured prefetch distance is smaller. Guard against
        // a zero step, which would make `chunks` panic.
        let step_size = self
            .config
            .cache_line_size
            .min(self.config.prefetch_distance)
            .max(1);
        for chunk in data.chunks(step_size) {
            self.prefetch(chunk.as_ptr(), PrefetchHint::T0);
        }
}
pub fn stats(&self) -> CacheLayoutStats {
CacheLayoutStats {
hot_allocations: self.hot_allocations.load(Ordering::Relaxed),
cold_allocations: self.cold_allocations.load(Ordering::Relaxed),
cache_line_size: self.config.cache_line_size,
prefetch_enabled: self.config.enable_prefetch,
hot_cold_separation: self.config.enable_hot_cold_separation,
}
}
}
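/// Temporal-locality hint for software prefetch, mirroring the x86
/// `_MM_HINT_*` levels: `T0` targets all cache levels, `T1` and `T2` target
/// progressively outer levels, and `NTA` requests a non-temporal fetch.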
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PrefetchHint {
T0,
T1,
T2,
NTA,
}
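/// Splits items into hot and cold storage based on observed access counts so
/// that frequently used items stay densely packed together in memory.
///
/// Items at or above `hot_threshold` go to hot storage; `reorganize()` later
/// migrates items whose counts have drifted well past the thresholds.
///
/// # Examples
///
/// A minimal sketch (marked `ignore` because the public module path is
/// assumed here, not confirmed):
///
/// ```ignore
/// let mut sep = HotColdSeparator::new(CacheLayoutConfig::default());
/// sep.insert("frequent", 5_000); // >= hot_threshold: hot storage
/// sep.insert("rare", 10);        // below threshold: cold storage
/// assert_eq!(sep.hot_slice().len(), 1);
/// assert_eq!(sep.cold_slice().len(), 1);
/// ```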
#[derive(Debug)]
pub struct HotColdSeparator<T> {
hot_data: Vec<T>,
cold_data: Vec<T>,
    /// Access counts parallel to `hot_data`.
    hot_counts: Vec<usize>,
    /// Access counts parallel to `cold_data`.
    cold_counts: Vec<usize>,
config: CacheLayoutConfig,
}
impl<T> HotColdSeparator<T> {
pub fn new(config: CacheLayoutConfig) -> Self {
Self {
hot_data: Vec::new(),
cold_data: Vec::new(),
            hot_counts: Vec::new(),
            cold_counts: Vec::new(),
config,
}
}
    pub fn insert(&mut self, item: T, access_count: usize) {
        if self.config.enable_hot_cold_separation && access_count >= self.config.hot_threshold {
            self.hot_data.push(item);
            self.hot_counts.push(access_count);
        } else {
            self.cold_data.push(item);
            self.cold_counts.push(access_count);
        }
    }
pub fn hot_slice(&self) -> &[T] {
&self.hot_data
}
pub fn cold_slice(&self) -> &[T] {
&self.cold_data
}
    pub fn reorganize(&mut self) {
        if !self.config.enable_hot_cold_separation {
            return;
        }
        // Promote cold items whose access counts have risen well above the
        // threshold. Counts are kept parallel to their storage vectors, so
        // an item and its count always move together.
        let mut i = 0;
        while i < self.cold_data.len() {
            if self.cold_counts[i] >= self.config.hot_threshold * 2 {
                self.hot_data.push(self.cold_data.remove(i));
                self.hot_counts.push(self.cold_counts.remove(i));
            } else {
                i += 1;
            }
        }
        // Demote hot items whose access counts have fallen well below the
        // threshold.
        let mut i = 0;
        while i < self.hot_data.len() {
            if self.hot_counts[i] < self.config.hot_threshold / 2 {
                self.cold_data.push(self.hot_data.remove(i));
                self.cold_counts.push(self.hot_counts.remove(i));
            } else {
                i += 1;
            }
        }
    }
pub fn separation_stats(&self) -> HotColdStats {
HotColdStats {
hot_items: self.hot_data.len(),
cold_items: self.cold_data.len(),
            total_accesses: self.hot_counts.iter().chain(self.cold_counts.iter()).sum(),
separation_enabled: self.config.enable_hot_cold_separation,
}
}
}
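/// Snapshot of the allocator's hot/cold counters and the configuration they
/// were taken under.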
#[derive(Debug, Clone)]
pub struct CacheLayoutStats {
pub hot_allocations: usize,
pub cold_allocations: usize,
pub cache_line_size: usize,
pub prefetch_enabled: bool,
pub hot_cold_separation: bool,
}
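/// Snapshot of a `HotColdSeparator`'s current hot/cold split.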
#[derive(Debug, Clone)]
pub struct HotColdStats {
pub hot_items: usize,
pub cold_items: usize,
pub total_accesses: usize,
pub separation_enabled: bool,
}
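/// Rounds `size` up to the next multiple of `cache_line_size`.
///
/// `cache_line_size` must be a power of two; the bit mask below relies on it.
///
/// # Examples
///
/// ```ignore
/// assert_eq!(align_to_cache_line(1, 64), 64);
/// assert_eq!(align_to_cache_line(64, 64), 64);
/// assert_eq!(align_to_cache_line(65, 64), 128);
/// ```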
pub fn align_to_cache_line(size: usize, cache_line_size: usize) -> usize {
    debug_assert!(cache_line_size.is_power_of_two());
    (size + cache_line_size - 1) & !(cache_line_size - 1)
}
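/// Queries the CPU for its cache hierarchy, falling back to the compiled-in
/// defaults on unsupported architectures or when detection yields nothing.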
pub fn detect_cache_hierarchy() -> CacheHierarchy {
#[cfg(target_arch = "x86_64")]
{
detect_x86_cache_hierarchy()
}
#[cfg(target_arch = "aarch64")]
{
detect_arm_cache_hierarchy()
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
CacheHierarchy::default()
}
}
#[cfg(target_arch = "x86_64")]
fn detect_x86_cache_hierarchy() -> CacheHierarchy {
    use std::arch::x86_64::__cpuid_count;
    let mut hierarchy = CacheHierarchy::default();
    let mut max_level = 0usize;
    // CPUID leaf 0x4 enumerates deterministic cache parameters, one subleaf
    // per cache (the subleaf goes in ECX, hence `__cpuid_count`). This leaf
    // is Intel-specific; AMD exposes the same data on leaf 0x8000001D, which
    // is not queried here, so AMD CPUs keep the defaults.
    unsafe {
        for subleaf in 0..16u32 {
            let result = __cpuid_count(0x4, subleaf);
            let cache_type = result.eax & 0x1F;
            if cache_type == 0 {
                break; // No more caches.
            }
            if cache_type == 2 {
                continue; // Skip instruction caches; track data/unified only.
            }
            let cache_level = ((result.eax >> 5) & 0x7) as usize;
            // EBX packs (line size - 1), (partitions - 1), and (ways - 1);
            // total size = ways * partitions * line size * sets.
            let line_size = ((result.ebx & 0xFFF) + 1) as usize;
            let partitions = (((result.ebx >> 12) & 0x3FF) + 1) as usize;
            let ways = (((result.ebx >> 22) & 0x3FF) + 1) as usize;
            let sets = (result.ecx + 1) as usize;
            let cache_size = ways * partitions * line_size * sets;
            match cache_level {
                1 => {
                    hierarchy.l1_line_size = line_size;
                    hierarchy.l1_size = cache_size;
                    hierarchy.associativity = ways;
                }
                2 => {
                    hierarchy.l2_line_size = line_size;
                    hierarchy.l2_size = cache_size;
                }
                3 => {
                    hierarchy.l3_line_size = line_size;
                    hierarchy.l3_size = cache_size;
                }
                _ => {}
            }
            max_level = max_level.max(cache_level);
        }
    }
    hierarchy.levels = if max_level == 0 { 3 } else { max_level };
    hierarchy
}
#[cfg(target_arch = "aarch64")]
fn detect_arm_cache_hierarchy() -> CacheHierarchy {
let mut hierarchy = CacheHierarchy::default();
#[cfg(target_os = "linux")]
    {
        // Linux sysfs convention: index0 = L1d, index1 = L1i, index2 = L2,
        // index3 = L3 (when present). Missing files leave the defaults.
if let Ok(l1_size) = std::fs::read_to_string("/sys/devices/system/cpu/cpu0/cache/index0/size") {
if let Ok(size) = parse_cache_size(&l1_size) {
hierarchy.l1_size = size;
}
}
if let Ok(l2_size) = std::fs::read_to_string("/sys/devices/system/cpu/cpu0/cache/index2/size") {
if let Ok(size) = parse_cache_size(&l2_size) {
hierarchy.l2_size = size;
}
}
if let Ok(l3_size) = std::fs::read_to_string("/sys/devices/system/cpu/cpu0/cache/index3/size") {
if let Ok(size) = parse_cache_size(&l3_size) {
hierarchy.l3_size = size;
}
}
if let Ok(line_size) = std::fs::read_to_string("/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size") {
if let Ok(size) = line_size.trim().parse::<usize>() {
hierarchy.l1_line_size = size;
hierarchy.l2_line_size = size;
hierarchy.l3_line_size = size;
}
}
}
hierarchy
}
#[cfg(target_os = "linux")]
fn parse_cache_size(size_str: &str) -> Result<usize> {
let size_str = size_str.trim();
if size_str.is_empty() {
return Err(ZiporaError::invalid_data("Empty cache size string"));
}
let (number_part, unit) = if size_str.ends_with('K') {
(&size_str[..size_str.len()-1], 1024)
} else if size_str.ends_with('M') {
(&size_str[..size_str.len()-1], 1024 * 1024)
} else if size_str.ends_with('G') {
(&size_str[..size_str.len()-1], 1024 * 1024 * 1024)
} else {
(size_str, 1)
};
let number: usize = number_part.parse()
.map_err(|_| ZiporaError::invalid_data(&format!("Invalid cache size number: {}", number_part)))?;
Ok(number * unit)
}
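/// A `Vec<T>` wrapper that issues prefetch hints tuned to a declared access
/// pattern.
///
/// Note: despite the name, the backing storage is allocated by `Vec` itself;
/// the held `CacheOptimizedAllocator` is only used for prefetching, so the
/// buffer is not guaranteed to start on a cache-line boundary.
///
/// # Examples
///
/// A minimal sketch (marked `ignore` because the public module path is
/// assumed here, not confirmed):
///
/// ```ignore
/// let mut v = CacheAlignedVec::with_access_pattern(AccessPattern::Sequential);
/// for i in 0..100 {
///     v.push(i); // periodically prefetches the tail of the buffer
/// }
/// assert_eq!(v.get(5), Some(&5));
/// ```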
pub struct CacheAlignedVec<T> {
data: Vec<T>,
allocator: CacheOptimizedAllocator,
access_pattern: AccessPattern,
}
impl<T> CacheAlignedVec<T> {
pub fn new() -> Self {
Self {
data: Vec::new(),
allocator: CacheOptimizedAllocator::optimal(),
access_pattern: AccessPattern::Mixed,
}
}
pub fn with_access_pattern(pattern: AccessPattern) -> Self {
let config = match pattern {
AccessPattern::Sequential => CacheLayoutConfig::sequential(),
AccessPattern::Random => CacheLayoutConfig::random(),
AccessPattern::WriteHeavy => CacheLayoutConfig::write_heavy(),
AccessPattern::ReadHeavy => CacheLayoutConfig::read_heavy(),
AccessPattern::Mixed => CacheLayoutConfig::new(),
};
Self {
data: Vec::new(),
allocator: CacheOptimizedAllocator::new(config),
access_pattern: pattern,
}
}
    pub fn push(&mut self, value: T) {
        self.data.push(value);
        // For sequential workloads, periodically hint the most recently
        // appended region. Prefetching the whole buffer here would cost
        // O(len) on every eighth push.
        if matches!(self.access_pattern, AccessPattern::Sequential) {
            let len = self.data.len();
            if len % 8 == 0 {
                let tail = len.saturating_sub(8);
                let byte_slice = unsafe {
                    std::slice::from_raw_parts(
                        self.data.as_ptr().add(tail) as *const u8,
                        mem::size_of::<T>() * (len - tail),
                    )
                };
                self.allocator.prefetch_range(byte_slice);
            }
        }
    }
pub fn get(&self, index: usize) -> Option<&T> {
if let Some(element) = self.data.get(index) {
if matches!(self.access_pattern, AccessPattern::Random) && index + 1 < self.data.len() {
let addr = unsafe { self.data.as_ptr().add(index + 1) } as *const u8;
self.allocator.prefetch(addr, PrefetchHint::T1);
}
Some(element)
} else {
None
}
}
pub fn slice(&self, range: std::ops::Range<usize>) -> Option<&[T]> {
if range.end <= self.data.len() {
let slice = &self.data[range];
if !slice.is_empty() {
let byte_slice = unsafe {
std::slice::from_raw_parts(
slice.as_ptr() as *const u8,
slice.len() * mem::size_of::<T>()
)
};
self.allocator.prefetch_range(byte_slice);
}
Some(slice)
} else {
None
}
}
#[inline]
pub fn as_slice(&self) -> &[T] {
&self.data
}
#[inline]
pub fn len(&self) -> usize {
self.data.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}
}
impl<T> Default for CacheAlignedVec<T> {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cache_hierarchy_detection() {
let hierarchy = detect_cache_hierarchy();
assert!(hierarchy.l1_line_size > 0);
assert!(hierarchy.l1_size > 0);
assert!(hierarchy.levels > 0);
println!("Detected cache hierarchy: {:?}", hierarchy);
}
#[test]
fn test_cache_layout_config() {
let config = CacheLayoutConfig::new();
assert!(config.cache_line_size > 0);
assert!(config.hierarchy.l1_size > 0);
let sequential_config = CacheLayoutConfig::sequential();
assert_eq!(sequential_config.access_pattern, AccessPattern::Sequential);
assert!(sequential_config.prefetch_distance > config.prefetch_distance);
}
#[test]
fn test_cache_optimized_allocator() {
let allocator = CacheOptimizedAllocator::optimal();
let ptr = allocator.allocate_aligned(1024, 64, true).unwrap();
assert_eq!(ptr.as_ptr() as usize % 64, 0);
assert!(allocator.deallocate_aligned(ptr, 1024, 64).is_ok());
let stats = allocator.stats();
assert_eq!(stats.hot_allocations, 1);
assert_eq!(stats.cold_allocations, 0);
}
#[test]
fn test_align_to_cache_line() {
assert_eq!(align_to_cache_line(0, 64), 0);
assert_eq!(align_to_cache_line(1, 64), 64);
assert_eq!(align_to_cache_line(64, 64), 64);
assert_eq!(align_to_cache_line(65, 64), 128);
assert_eq!(align_to_cache_line(100, 64), 128);
}
#[test]
fn test_hot_cold_separator() {
let config = CacheLayoutConfig::new();
let mut separator = HotColdSeparator::new(config);
separator.insert("hot1".to_string(), 2000);
separator.insert("cold1".to_string(), 100);
separator.insert("hot2".to_string(), 1500);
separator.insert("cold2".to_string(), 50);
assert_eq!(separator.hot_slice().len(), 2);
assert_eq!(separator.cold_slice().len(), 2);
let stats = separator.separation_stats();
assert_eq!(stats.hot_items, 2);
assert_eq!(stats.cold_items, 2);
assert!(stats.separation_enabled);
}
#[test]
fn test_cache_aligned_vec() {
let mut vec = CacheAlignedVec::with_access_pattern(AccessPattern::Sequential);
vec.push(1);
vec.push(2);
vec.push(3);
assert_eq!(vec.len(), 3);
assert_eq!(vec.get(1), Some(&2));
assert_eq!(vec.as_slice(), &[1, 2, 3]);
let slice = vec.slice(1..3).unwrap();
assert_eq!(slice, &[2, 3]);
}
#[test]
fn test_prefetch_operations() {
let allocator = CacheOptimizedAllocator::optimal();
let data = vec![1u8; 1024];
allocator.prefetch(data.as_ptr(), PrefetchHint::T0);
allocator.prefetch_range(&data);
}
#[test]
fn test_access_patterns() {
let sequential = CacheAlignedVec::<i32>::with_access_pattern(AccessPattern::Sequential);
let random = CacheAlignedVec::<i32>::with_access_pattern(AccessPattern::Random);
let write_heavy = CacheAlignedVec::<i32>::with_access_pattern(AccessPattern::WriteHeavy);
let read_heavy = CacheAlignedVec::<i32>::with_access_pattern(AccessPattern::ReadHeavy);
assert!(sequential.is_empty());
assert!(random.is_empty());
assert!(write_heavy.is_empty());
assert!(read_heavy.is_empty());
}
#[cfg(target_os = "linux")]
#[test]
fn test_cache_size_parsing() {
assert_eq!(parse_cache_size("32K").unwrap(), 32 * 1024);
assert_eq!(parse_cache_size("256K").unwrap(), 256 * 1024);
assert_eq!(parse_cache_size("8M").unwrap(), 8 * 1024 * 1024);
assert_eq!(parse_cache_size("1G").unwrap(), 1024 * 1024 * 1024);
assert_eq!(parse_cache_size("1024").unwrap(), 1024);
assert!(parse_cache_size("").is_err());
assert!(parse_cache_size("invalid").is_err());
}
#[test]
fn test_cache_layout_stats() {
let allocator = CacheOptimizedAllocator::optimal();
        let hot1 = allocator.allocate_aligned(64, 64, true).unwrap();
        let hot2 = allocator.allocate_aligned(128, 64, true).unwrap();
        let cold1 = allocator.allocate_aligned(64, 64, false).unwrap();
        let stats = allocator.stats();
        assert_eq!(stats.hot_allocations, 2);
        assert_eq!(stats.cold_allocations, 1);
        assert!(stats.cache_line_size >= 32);
        // Free the test allocations so the test itself does not leak.
        allocator.deallocate_aligned(hot1, 64, 64).unwrap();
        allocator.deallocate_aligned(hot2, 128, 64).unwrap();
        allocator.deallocate_aligned(cold1, 64, 64).unwrap();
}
#[test]
fn test_hot_cold_reorganization() {
let config = CacheLayoutConfig::new();
let mut separator = HotColdSeparator::new(config);
for i in 0..10 {
            let access_count = if i < 3 { 2000 } else { 100 };
            separator.insert(format!("item{}", i), access_count);
}
assert_eq!(separator.hot_slice().len(), 3);
assert_eq!(separator.cold_slice().len(), 7);
separator.reorganize();
let stats = separator.separation_stats();
assert!(stats.total_accesses > 0);
}
}