use std::alloc::{alloc_zeroed, dealloc, Layout};
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{_mm_prefetch, _MM_HINT_NTA, _MM_HINT_T0, _MM_HINT_T1, _MM_HINT_T2};
use std::marker::PhantomData;
use std::mem::{align_of, size_of, MaybeUninit};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
/// Size of a cache line in bytes: 64 on effectively all current x86_64 parts.
pub const CACHE_LINE_SIZE: usize = 64;
/// Typical cache capacities used as compile-time defaults; a production build
/// would detect the real hierarchy at runtime (e.g. via CPUID or sysfs).
pub const L1_CACHE_SIZE: usize = 32 * 1024;
pub const L2_CACHE_SIZE: usize = 256 * 1024;
pub const L3_CACHE_SIZE: usize = 8 * 1024 * 1024;
/// How many elements ahead of the current one to issue prefetches for.
pub const PREFETCH_DISTANCE: usize = 4;
/// Number of NUMA nodes, cached when the first `NumaAllocator` is constructed.
static NUMA_NODE_COUNT: AtomicUsize = AtomicUsize::new(0);
/// Counters describing observed cache behavior. These are bookkeeping slots
/// meant to be fed from hardware performance counters or a simulator; the
/// struct performs no measurement itself.
#[derive(Debug, Default, Clone)]
pub struct CacheMetrics {
pub l1_hits: u64,
pub l1_misses: u64,
pub l2_hits: u64,
pub l2_misses: u64,
pub l3_hits: u64,
pub l3_misses: u64,
pub prefetch_count: u64,
pub memory_stalls: u64,
pub cache_invalidations: u64,
pub false_sharing_detected: u64,
}
impl CacheMetrics {
pub fn new() -> Self {
Self::default()
}
    /// Fraction of recorded accesses that hit some cache level. A miss served
    /// by a lower level counts as both a miss and a hit there, so treat this
    /// as an approximation rather than an exact per-access ratio.
    pub fn hit_ratio(&self) -> f64 {
let total_hits = self.l1_hits + self.l2_hits + self.l3_hits;
let total_accesses = total_hits + self.l1_misses + self.l2_misses + self.l3_misses;
if total_accesses == 0 {
0.0
} else {
total_hits as f64 / total_accesses as f64
}
}
    /// Total cache-line traffic implied by the recorded misses, in GiB.
    /// Note this is cumulative volume, not a rate; divide by elapsed time to
    /// get bandwidth.
    pub fn estimated_bandwidth_gb(&self) -> f64 {
        let cache_line_transfers = self.l1_misses + self.l2_misses + self.l3_misses;
        (cache_line_transfers * CACHE_LINE_SIZE as u64) as f64 / (1024.0 * 1024.0 * 1024.0)
    }
}
/// Wrapper that rounds its contents up to a full cache line, so adjacent
/// values never share a line (the classic false-sharing fix).
#[repr(align(64))]
#[derive(Clone, Debug)]
pub struct CacheAligned<T> {
data: T,
}
impl<T> CacheAligned<T> {
pub fn new(data: T) -> Self {
Self { data }
}
#[inline]
pub fn get(&self) -> &T {
&self.data
}
pub fn get_mut(&mut self) -> &mut T {
&mut self.data
}
pub fn into_inner(self) -> T {
self.data
}
}
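// Illustrative sketch (not part of the original API): per-thread counters
// wrapped in `CacheAligned` so each counter occupies its own cache line,
// keeping one thread's updates from invalidating its neighbors' lines.
#[allow(dead_code)]
fn per_thread_counters(threads: usize) -> Vec<CacheAligned<AtomicU64>> {
    (0..threads)
        .map(|_| CacheAligned::new(AtomicU64::new(0)))
        .collect()
}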
/// Prefetch locality hints, mapped one-to-one onto the x86_64 `_MM_HINT_*` levels.
#[derive(Debug, Clone, Copy)]
pub enum PrefetchHint {
    /// `_MM_HINT_T0`: pull the line into every cache level.
    AllLevels,
    /// `_MM_HINT_T1`: pull into L2 and below, bypassing L1.
    L2L3,
    /// `_MM_HINT_T2`: pull into L3 only.
    L3Only,
    /// `_MM_HINT_NTA`: non-temporal; minimize cache pollution.
    NonTemporal,
}
pub struct Prefetcher;
impl Prefetcher {
pub fn new() -> Self {
Prefetcher
}
    /// Issue a prefetch for the cache line containing `ptr`. Compiles to
    /// `prefetcht0`/`t1`/`t2`/`nta` on x86_64 and to a no-op elsewhere.
    ///
    /// # Safety
    /// Prefetch instructions do not fault, but `ptr` should point into (or
    /// just past) memory the caller owns for the hint to be meaningful.
    #[inline(always)]
    pub unsafe fn prefetch<T>(ptr: *const T, hint: PrefetchHint) {
        #[cfg(target_arch = "x86_64")]
        unsafe {
            match hint {
                PrefetchHint::AllLevels => _mm_prefetch(ptr as *const i8, _MM_HINT_T0),
                PrefetchHint::L2L3 => _mm_prefetch(ptr as *const i8, _MM_HINT_T1),
                PrefetchHint::L3Only => _mm_prefetch(ptr as *const i8, _MM_HINT_T2),
                PrefetchHint::NonTemporal => _mm_prefetch(ptr as *const i8, _MM_HINT_NTA),
            }
        }
        #[cfg(not(target_arch = "x86_64"))]
        let _ = (ptr, hint);
    }
    /// Prefetch the memory covering `count` elements starting at `start`,
    /// issuing one prefetch per cache line rather than one per element.
    #[inline(always)]
    pub unsafe fn prefetch_range<T>(start: *const T, count: usize, hint: PrefetchHint) {
        unsafe {
            let total_bytes = count * size_of::<T>();
            let base = start as *const u8;
            let mut offset = 0;
            while offset < total_bytes {
                Self::prefetch(base.add(offset), hint);
                offset += CACHE_LINE_SIZE;
            }
        }
    }
    /// Prefetch `count` locations starting at `base`, separated by `stride`
    /// elements (not bytes).
    #[inline(always)]
    pub unsafe fn prefetch_strided<T>(
        base: *const T,
        stride: usize,
        count: usize,
        hint: PrefetchHint,
    ) {
        unsafe {
            let mut current = base;
            for _ in 0..count {
                Self::prefetch(current, hint);
                current = current.add(stride);
            }
        }
    }
}
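// Illustrative sketch: the intended use of `PREFETCH_DISTANCE` is a software
// pipeline that prefetches a few elements ahead of the one being processed,
// hiding memory latency behind the current iteration's work.
#[allow(dead_code)]
fn sum_with_prefetch(data: &[u64]) -> u64 {
    let mut sum = 0u64;
    for (i, value) in data.iter().enumerate() {
        if let Some(ahead) = data.get(i + PREFETCH_DISTANCE) {
            // SAFETY: `ahead` points into the live slice.
            unsafe { Prefetcher::prefetch(ahead as *const u64, PrefetchHint::AllLevels) };
        }
        sum = sum.wrapping_add(*value);
    }
    sum
}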
/// A hash-table bucket that starts on a cache-line boundary. Hashes are kept
/// apart from entries so a lookup can scan them without touching the (larger)
/// key/value pairs. The `occupancy` bitmask is a `u8`, so
/// `ENTRIES_PER_BUCKET` must not exceed 8.
#[repr(C, align(64))]
pub struct CacheOptimizedBucket<K, V, const ENTRIES_PER_BUCKET: usize = 7> {
    pub metadata: BucketMetadata,
    pub hashes: [u32; ENTRIES_PER_BUCKET],
    pub entries: [MaybeUninit<(K, V)>; ENTRIES_PER_BUCKET],
    pub overflow: Option<NonNull<Self>>,
    pub _padding: [u8; 0],
}
/// Bucket header, packed into a single 8-byte word at the start of the line.
#[repr(C)]
#[derive(Clone, Copy)]
pub struct BucketMetadata {
    /// Bitmask of occupied slots: bit `i` set means `entries[i]` is initialized.
    pub occupancy: u8,
    /// Lock byte or version counter, depending on the concurrency scheme
    /// layered on top.
    pub lock_or_version: u8,
    pub max_probe_distance: u16,
    pub reserved: u32,
}
impl<K, V, const N: usize> CacheOptimizedBucket<K, V, N> {
    pub fn new() -> Self {
        debug_assert!(N <= 8, "occupancy bitmask is a u8, so at most 8 slots are addressable");
        Self {
            metadata: BucketMetadata {
                occupancy: 0,
                lock_or_version: 0,
                max_probe_distance: 0,
                reserved: 0,
            },
            hashes: [0; N],
            // Sound: an array of `MaybeUninit` does not require initialization.
            entries: unsafe { MaybeUninit::uninit().assume_init() },
            overflow: None,
            _padding: [],
        }
    }
#[inline(always)]
pub fn is_empty(&self) -> bool {
self.metadata.occupancy == 0
}
#[inline(always)]
pub fn count(&self) -> usize {
self.metadata.occupancy.count_ones() as usize
}
#[inline(always)]
pub fn find_hash(&self, hash: u32) -> Option<usize> {
for i in 0..N {
if (self.metadata.occupancy & (1 << i)) != 0 && self.hashes[i] == hash {
return Some(i);
}
}
None
}
    /// Prefetch this bucket's first cache line, and the second line when the
    /// bucket spills past 64 bytes. The whole struct size is what matters
    /// here: metadata, hashes, entries, and the overflow pointer all
    /// contribute to how many lines the bucket spans.
    #[inline(always)]
    pub unsafe fn prefetch(&self) {
        unsafe {
            Prefetcher::prefetch(self as *const Self, PrefetchHint::AllLevels);
            if size_of::<Self>() > CACHE_LINE_SIZE {
                let second_line = (self as *const Self as *const u8).add(CACHE_LINE_SIZE);
                Prefetcher::prefetch(second_line, PrefetchHint::L2L3);
            }
        }
    }
}
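// Illustrative sketch (hypothetical helper, not part of the original API):
// inserting into the first free slot shows how the occupancy bitmask, the
// hash array, and the `MaybeUninit` entry array cooperate. Duplicate keys and
// the overflow chain are deliberately not handled here.
#[allow(dead_code)]
fn bucket_insert<K, V, const N: usize>(
    bucket: &mut CacheOptimizedBucket<K, V, N>,
    hash: u32,
    key: K,
    value: V,
) -> bool {
    for i in 0..N {
        if (bucket.metadata.occupancy & (1 << i)) == 0 {
            bucket.hashes[i] = hash;
            bucket.entries[i].write((key, value));
            bucket.metadata.occupancy |= 1 << i;
            return true;
        }
    }
    // Bucket full; a real implementation would walk or allocate `overflow`.
    false
}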
/// NUMA-aware allocator facade. This build falls back to the global allocator
/// and a single node; per-node allocation statistics are still tracked.
pub struct NumaAllocator {
preferred_node: usize,
node_stats: Vec<AtomicU64>,
}
impl NumaAllocator {
pub fn new() -> Self {
let node_count = Self::detect_numa_nodes();
NUMA_NODE_COUNT.store(node_count, Ordering::Relaxed);
Self {
preferred_node: Self::current_numa_node(),
node_stats: (0..node_count).map(|_| AtomicU64::new(0)).collect(),
}
}
    /// Fallback implementation: report a single node. Real detection would
    /// query the OS, e.g. libnuma or /sys/devices/system/node on Linux.
    fn detect_numa_nodes() -> usize {
        1
    }
    /// Fallback implementation: assume the calling thread runs on node 0.
    fn current_numa_node() -> usize {
        0
    }
    /// Allocate zeroed memory, attributing it to `node` for bookkeeping.
    /// Placement is not actually NUMA-bound in this build; `node` only
    /// selects which statistics counter is charged.
    ///
    /// # Safety
    /// `layout` must have non-zero size, per the global allocator contract.
    pub unsafe fn alloc_on_node(&self, layout: Layout, node: usize) -> *mut u8 {
        unsafe {
            let ptr = alloc_zeroed(layout);
            if !ptr.is_null() {
                self.node_stats[node.min(self.node_stats.len() - 1)]
                    .fetch_add(layout.size() as u64, Ordering::Relaxed);
            }
            ptr
        }
    }
    /// Allocate `count` values of `T`, aligned to at least a cache line.
    ///
    /// # Safety
    /// `count` must be non-zero, and the result must be freed with
    /// [`Self::dealloc`] using the same `count`.
    pub unsafe fn alloc_cache_aligned<T>(&self, count: usize) -> *mut T {
        unsafe {
            let layout = Layout::from_size_align(
                size_of::<T>() * count,
                CACHE_LINE_SIZE.max(align_of::<T>()),
            )
            .expect("size overflowed isize::MAX when padded to cache-line alignment");
            self.alloc_on_node(layout, self.preferred_node) as *mut T
        }
    }
    /// # Safety
    /// `ptr` and `count` must come from a matching `alloc_cache_aligned` call.
    pub unsafe fn dealloc<T>(&self, ptr: *mut T, count: usize) {
        unsafe {
            let layout = Layout::from_size_align(
                size_of::<T>() * count,
                CACHE_LINE_SIZE.max(align_of::<T>()),
            )
            .expect("layout must match the one used during allocation");
            dealloc(ptr as *mut u8, layout);
        }
    }
}
/// Picks layout parameters (bucket size, load factor) for a table whose
/// working set is expected to fit in a given cache level.
pub struct CacheLayoutOptimizer<K, V> {
working_set_size: usize,
target_cache_level: CacheLevel,
access_pattern: AccessPattern,
_phantom: PhantomData<(K, V)>,
}
#[derive(Debug, Clone, Copy)]
pub enum CacheLevel {
L1,
L2,
L3,
Memory,
}
#[derive(Debug, Clone, Copy)]
pub enum AccessPattern {
Random,
Sequential,
Strided(usize),
Temporal,
}
impl<K, V> CacheLayoutOptimizer<K, V> {
pub fn new(working_set_size: usize) -> Self {
let target_cache_level = if working_set_size <= L1_CACHE_SIZE {
CacheLevel::L1
} else if working_set_size <= L2_CACHE_SIZE {
CacheLevel::L2
} else if working_set_size <= L3_CACHE_SIZE {
CacheLevel::L3
} else {
CacheLevel::Memory
};
Self {
working_set_size,
target_cache_level,
access_pattern: AccessPattern::Random,
_phantom: PhantomData,
}
}
pub fn with_access_pattern(mut self, pattern: AccessPattern) -> Self {
self.access_pattern = pattern;
self
}
    pub fn optimal_bucket_size(&self) -> usize {
        let entry_size = size_of::<(K, V)>() + size_of::<u32>();
        // Entries larger than a cache line would otherwise truncate to zero.
        let cache_line_capacity = (CACHE_LINE_SIZE / entry_size).max(1);
        match self.target_cache_level {
            CacheLevel::L1 => cache_line_capacity.min(4),
            CacheLevel::L2 => cache_line_capacity.min(8),
            CacheLevel::L3 | CacheLevel::Memory => (cache_line_capacity * 2).min(16),
        }
    }
    pub fn optimal_load_factor(&self) -> f64 {
        match self.target_cache_level {
            CacheLevel::L1 => 0.5,
            CacheLevel::L2 => 0.65,
            CacheLevel::L3 => 0.75,
            CacheLevel::Memory => 0.85,
        }
    }
pub fn should_separate_hot_cold(&self) -> bool {
matches!(self.access_pattern, AccessPattern::Temporal)
&& self.working_set_size > L2_CACHE_SIZE
}
}
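// Illustrative sketch: deriving table parameters from an expected entry count.
// The `(u64, u64)` key/value types here are placeholders.
#[allow(dead_code)]
fn plan_table(expected_entries: usize) -> (usize, f64) {
    let working_set = expected_entries * size_of::<(u64, u64)>();
    let optimizer: CacheLayoutOptimizer<u64, u64> =
        CacheLayoutOptimizer::new(working_set).with_access_pattern(AccessPattern::Sequential);
    (optimizer.optimal_bucket_size(), optimizer.optimal_load_factor())
}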
/// Splits elements into a cache-aligned hot region and a densely packed cold
/// region, with per-element access counters to guide migration decisions.
pub struct HotColdSeparator<T> {
hot: Vec<CacheAligned<T>>,
cold: Vec<T>,
access_counts: Vec<AtomicU64>,
migration_threshold: u64,
}
impl<T: Clone> HotColdSeparator<T> {
    pub fn new(capacity: usize, hot_ratio: f64) -> Self {
        let hot_capacity = ((capacity as f64) * hot_ratio) as usize;
        let cold_capacity = capacity - hot_capacity;
        Self {
            hot: Vec::with_capacity(hot_capacity),
            cold: Vec::with_capacity(cold_capacity),
            access_counts: Vec::with_capacity(capacity),
            // Minimum access count before an item is considered hot.
            migration_threshold: 10,
        }
    }
    pub fn access(&self, index: usize) -> Option<&T> {
        // Bounds-check via the counter vec so out-of-range indices return
        // None instead of panicking.
        self.access_counts.get(index)?.fetch_add(1, Ordering::Relaxed);
        if index < self.hot.len() {
            Some(self.hot[index].get())
        } else {
            self.cold.get(index - self.hot.len())
        }
    }
    /// Recompute which indices qualify as hot based on the recorded access
    /// counts, then reset the counters for the next epoch.
    pub fn rebalance(&mut self) {
        let mut access_stats: Vec<(usize, u64)> = self
            .access_counts
            .iter()
            .enumerate()
            .map(|(i, count)| (i, count.load(Ordering::Relaxed)))
            .collect();
        access_stats.sort_by_key(|&(_, count)| std::cmp::Reverse(count));
        let hot_capacity = self.hot.capacity();
        // Candidates for the hot region: the most-accessed indices that also
        // clear the migration threshold. Physically moving entries would
        // change the index-to-item mapping seen by callers, so this sketch
        // stops at candidate selection.
        let _new_hot_indices: Vec<usize> = access_stats
            .iter()
            .take(hot_capacity)
            .filter(|&&(_, count)| count >= self.migration_threshold)
            .map(|&(idx, _)| idx)
            .collect();
        for count in &self.access_counts {
            count.store(0, Ordering::Relaxed);
        }
    }
}
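// Illustrative sketch (hypothetical helper): the original API leaves
// population to the caller, as the tests below do by pushing into the fields
// directly. One plausible shape is a `push` that fills the hot region first
// and spills to the cold region afterwards.
impl<T: Clone> HotColdSeparator<T> {
    #[allow(dead_code)]
    fn push(&mut self, value: T) {
        if self.hot.len() < self.hot.capacity() {
            self.hot.push(CacheAligned::new(value));
        } else {
            self.cold.push(value);
        }
        // Every element gets an access counter, regardless of region.
        self.access_counts.push(AtomicU64::new(0));
    }
}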
/// Incremental resizer that processes at most one cache-friendly chunk per
/// step, so a resize never evicts the whole working set at once.
pub struct CacheConsciousResizer {
current_size: usize,
target_size: usize,
chunk_size: usize,
use_cow: bool,
}
impl CacheConsciousResizer {
    pub fn new(current_size: usize, target_size: usize) -> Self {
        // Clamp to at least one slot per step; small targets would otherwise
        // produce a chunk size of zero and resize_step would never progress.
        let chunk_size = (L3_CACHE_SIZE / CACHE_LINE_SIZE)
            .min(target_size / 16)
            .max(1);
        Self {
            current_size,
            target_size,
            chunk_size,
            // Tables too big for L3 are better migrated copy-on-write so
            // readers keep hitting the old table while chunks move.
            use_cow: target_size > L3_CACHE_SIZE,
        }
    }
    /// Advance the resize by one chunk, returning `true` once the target size
    /// has been reached. This step only grows the backing storage; a complete
    /// implementation would also rehash the entries covered by the chunk
    /// using `rehash_fn`.
    pub fn resize_step<T, F>(&mut self, data: &mut Vec<T>, _rehash_fn: F) -> bool
    where
        T: Clone,
        F: Fn(&T, usize) -> usize,
    {
        if self.current_size >= self.target_size {
            return true;
        }
        let step_end = (self.current_size + self.chunk_size).min(self.target_size);
        data.reserve(step_end - self.current_size);
        self.current_size = step_end;
        false
    }
}
/// Ring buffer of recently accessed addresses; every time the buffer wraps,
/// the deltas between consecutive accesses are analyzed to classify the
/// access pattern.
pub struct AccessPatternAnalyzer {
access_history: Vec<usize>,
history_pos: usize,
detected_pattern: AccessPattern,
confidence: f64,
}
impl AccessPatternAnalyzer {
pub fn new(history_size: usize) -> Self {
Self {
access_history: vec![0; history_size],
history_pos: 0,
detected_pattern: AccessPattern::Random,
confidence: 0.0,
}
}
pub fn record_access(&mut self, address: usize) {
self.access_history[self.history_pos] = address;
self.history_pos = (self.history_pos + 1) % self.access_history.len();
if self.history_pos == 0 {
self.analyze_pattern();
}
}
    fn analyze_pattern(&mut self) {
        if self.access_history.len() < 2 {
            return;
        }
        let mut sequential_count = 0u64;
        let mut stride_counts = std::collections::HashMap::new();
        for i in 1..self.access_history.len() {
            let diff = self.access_history[i].wrapping_sub(self.access_history[i - 1]);
            if diff == 1 {
                sequential_count += 1;
            } else if diff > 0 && diff < 1024 {
                *stride_counts.entry(diff).or_insert(0u64) += 1;
            }
        }
        // There are len - 1 deltas, not len.
        let total = (self.access_history.len() - 1) as f64;
        let seq_ratio = sequential_count as f64 / total;
        if seq_ratio > 0.7 {
            self.detected_pattern = AccessPattern::Sequential;
            self.confidence = seq_ratio;
            return;
        }
        if let Some((&stride, &count)) = stride_counts.iter().max_by_key(|&(_, &c)| c) {
            if count as f64 / total > 0.5 {
                self.detected_pattern = AccessPattern::Strided(stride);
                self.confidence = count as f64 / total;
                return;
            }
        }
        // Neither sequential nor any single stride dominates: classify as random.
        self.detected_pattern = AccessPattern::Random;
        self.confidence = 1.0 - seq_ratio;
    }
pub fn get_pattern(&self) -> (AccessPattern, f64) {
(self.detected_pattern, self.confidence)
}
}
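// Illustrative sketch: one way to turn the analyzer's classification into a
// prefetch policy. The thresholds mirror those used inside `analyze_pattern`.
#[allow(dead_code)]
fn choose_prefetch_hint(analyzer: &AccessPatternAnalyzer) -> Option<PrefetchHint> {
    match analyzer.get_pattern() {
        // Dense sequential scans benefit from pulling lines into every level.
        (AccessPattern::Sequential, c) if c > 0.7 => Some(PrefetchHint::AllLevels),
        // Strided access reuses lines less often; keep them out of L1.
        (AccessPattern::Strided(_), c) if c > 0.5 => Some(PrefetchHint::L2L3),
        // Random or low-confidence patterns: prefetching would mostly pollute.
        _ => None,
    }
}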
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cache_aligned() {
let aligned: CacheAligned<u64> = CacheAligned::new(42);
assert_eq!(*aligned.get(), 42);
let ptr = aligned.get() as *const _ as usize;
assert_eq!(ptr % CACHE_LINE_SIZE, 0);
}
#[test]
fn test_cache_optimized_bucket() {
let mut bucket: CacheOptimizedBucket<u32, u64, 7> = CacheOptimizedBucket::new();
assert!(bucket.is_empty());
assert_eq!(bucket.count(), 0);
        bucket.metadata.occupancy = 0b0000_0011;
        bucket.hashes[0] = 12345;
bucket.hashes[1] = 67890;
assert_eq!(bucket.find_hash(12345), Some(0));
assert_eq!(bucket.find_hash(67890), Some(1));
assert_eq!(bucket.find_hash(99999), None);
}
#[test]
fn test_cache_layout_optimizer() {
let optimizer: CacheLayoutOptimizer<u64, u64> = CacheLayoutOptimizer::new(16 * 1024);
assert!(matches!(optimizer.target_cache_level, CacheLevel::L1));
assert!(optimizer.optimal_load_factor() < 0.6);
let optimizer: CacheLayoutOptimizer<u64, u64> = CacheLayoutOptimizer::new(4 * 1024 * 1024);
assert!(matches!(optimizer.target_cache_level, CacheLevel::L3));
assert!(optimizer.optimal_load_factor() > 0.7);
}
#[test]
fn test_hot_cold_separator() {
let mut separator: HotColdSeparator<u32> = HotColdSeparator::new(100, 0.2);
for i in 0..20 {
separator.hot.push(CacheAligned::new(i));
separator.access_counts.push(AtomicU64::new(0));
}
for i in 20..100 {
separator.cold.push(i);
separator.access_counts.push(AtomicU64::new(0));
}
        for _ in 0..50 {
            separator.access(5);
        }
        for _ in 0..5 {
            separator.access(50);
        }
assert_eq!(separator.access_counts[5].load(Ordering::Relaxed), 50);
assert_eq!(separator.access_counts[50].load(Ordering::Relaxed), 5);
}
#[test]
fn test_access_pattern_analyzer() {
let mut analyzer = AccessPatternAnalyzer::new(100);
for i in 0..100 {
analyzer.record_access(1000 + i);
}
let (pattern, confidence) = analyzer.get_pattern();
assert!(matches!(pattern, AccessPattern::Sequential));
assert!(confidence > 0.6);
}
#[test]
fn test_cache_metrics() {
let mut metrics = CacheMetrics::default();
metrics.l1_hits = 1000;
metrics.l1_misses = 100;
metrics.l2_hits = 50;
metrics.l2_misses = 10;
let hit_ratio = metrics.hit_ratio();
assert!(hit_ratio > 0.9);
let bandwidth = metrics.estimated_bandwidth_gb();
assert!(bandwidth > 0.0);
}
#[test]
fn test_prefetcher() {
let data = vec![1u64, 2, 3, 4, 5];
unsafe {
Prefetcher::prefetch(data.as_ptr(), PrefetchHint::AllLevels);
Prefetcher::prefetch_range(data.as_ptr(), 3, PrefetchHint::L2L3);
Prefetcher::prefetch_strided(data.as_ptr(), 2, 2, PrefetchHint::L3Only);
}
}
#[test]
fn test_numa_allocator() {
let allocator = NumaAllocator::new();
unsafe {
let ptr = allocator.alloc_cache_aligned::<u64>(100);
assert!(!ptr.is_null());
assert_eq!(ptr as usize % CACHE_LINE_SIZE, 0);
allocator.dealloc(ptr, 100);
}
}
#[test]
fn test_cache_conscious_resizer() {
let mut resizer = CacheConsciousResizer::new(1000, 10000);
assert!(resizer.chunk_size > 0);
assert_eq!(resizer.current_size, 1000);
assert_eq!(resizer.target_size, 10000);
let mut data = vec![0u64; 1000];
let complete = resizer.resize_step(&mut data, |&x, _| x as usize);
        assert!(!complete);
    }
}
// SAFETY: the only field that is not automatically Send/Sync is the `overflow`
// pointer, which is owned by the bucket it hangs off, so the usual K/V bounds
// suffice.
unsafe impl<K: Send, V: Send, const N: usize> Send for CacheOptimizedBucket<K, V, N> {}
unsafe impl<K: Sync, V: Sync, const N: usize> Sync for CacheOptimizedBucket<K, V, N> {}