use std::mem;
/// Selects which placement routine [`optimize_placement`] runs on a slice.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum PlacementStrategy {
    /// No placement work is performed.
    Default,
    /// Dispatches to `pack_data`.
    Packed,
    /// Dispatches to `align_data`; the payload is the requested alignment
    /// (compared against the slice's address, so presumably in bytes).
    Aligned(usize),
    /// Accepted by `optimize_placement`, which currently does nothing for it.
    NumaAware,
    /// Dispatches to `cache_aware_placement`.
    CacheAware,
}
/// Applies the placement routine selected by `strategy` to `data`.
///
/// * `Default` and `NumaAware` perform no work.
/// * `Packed` forwards to `pack_data`.
/// * `Aligned(alignment)` forwards to `align_data` with that alignment.
/// * `CacheAware` forwards to `cache_aware_placement`.
pub fn optimize_placement<T: Copy>(data: &mut [T], strategy: PlacementStrategy) {
    match strategy {
        // Neither of these strategies has any work attached to it.
        PlacementStrategy::Default | PlacementStrategy::NumaAware => {}
        PlacementStrategy::Packed => pack_data(data),
        PlacementStrategy::Aligned(requested) => align_data(data, requested),
        PlacementStrategy::CacheAware => cache_aware_placement(data),
    }
}
/// Round-trips `data` through a freshly allocated staging buffer.
///
/// Every element is copied into a heap buffer with `T`'s natural alignment
/// and then copied straight back, so the slice holds exactly the same values
/// (at the same address) when the function returns.
///
/// The operation is best-effort: if the staging buffer cannot be allocated
/// the slice is left untouched. Empty slices and zero-sized element types
/// are handled without touching the allocator.
fn pack_data<T: Copy>(data: &mut [T]) {
    let mut staging: Vec<T> = Vec::new();
    // `try_reserve_exact` keeps the "skip on allocation failure" behaviour
    // instead of aborting the process, and never requests a zero-byte block.
    if staging.try_reserve_exact(data.len()).is_err() {
        return;
    }
    staging.extend_from_slice(data);
    data.copy_from_slice(&staging);
}
/// Round-trips `data` through a staging buffer allocated with `alignment`.
///
/// Nothing is done when the slice already starts at a multiple of
/// `alignment`. Otherwise the elements are copied into a heap buffer that
/// has the requested alignment and copied straight back; the slice itself
/// is not moved, so its address and contents are unchanged on return.
///
/// The operation is best-effort and returns without touching `data` when:
/// * `alignment` is zero or not a power of two (the allocator cannot honour it),
/// * the slice occupies zero bytes (empty, or `T` is zero-sized),
/// * the layout cannot be constructed or the allocation fails.
fn align_data<T: Copy>(data: &mut [T], alignment: usize) {
    // Rejects 0 as well, which would otherwise divide by zero below.
    if !alignment.is_power_of_two() {
        return;
    }
    let size = std::mem::size_of_val(data);
    // The global allocator must never be asked for a zero-sized block.
    if size == 0 {
        return;
    }
    if (data.as_ptr() as usize) % alignment == 0 {
        return;
    }
    // Never fall back to a smaller layout: the copy below needs `size` bytes.
    let layout = match std::alloc::Layout::from_size_align(size, alignment) {
        Ok(layout) => layout,
        Err(_) => return,
    };
    // SAFETY:
    // * `layout` has a non-zero size, so `alloc` may be called with it.
    // * The slice pointer is aligned for `T` yet misaligned for the
    //   power-of-two `alignment`, so `alignment > align_of::<T>()` and the
    //   returned block is suitably aligned for `T`.
    // * The block is `size` bytes, i.e. room for exactly `data.len()` values
    //   of `T`, and being a fresh allocation it cannot overlap `data`.
    // * `T: Copy`, so bitwise duplication is valid and nothing needs dropping.
    // * The block is released with the same layout it was allocated with.
    unsafe {
        let aligned_ptr = std::alloc::alloc(layout) as *mut T;
        if aligned_ptr.is_null() {
            return;
        }
        std::ptr::copy_nonoverlapping(data.as_ptr(), aligned_ptr, data.len());
        std::ptr::copy_nonoverlapping(aligned_ptr, data.as_mut_ptr(), data.len());
        std::alloc::dealloc(aligned_ptr as *mut u8, layout);
    }
}
/// Chooses between whole-slice alignment and blocked placement based on how
/// many elements fit in one cache line.
///
/// The cache line size comes from `get_cache_line_size`. When the slice has
/// no more elements than fit in one line it is handed to `align_data` with
/// the cache line size as alignment; otherwise it is handed to
/// `cache_blocked_placement` with one line's worth of elements per block.
///
/// Zero-sized element types are skipped (there is nothing to place), and
/// element types larger than a cache line are treated as one element per
/// block so that the block size is never zero.
fn cache_aware_placement<T: Copy>(data: &mut [T]) {
    let element_size = mem::size_of::<T>();
    // Guards the division below; a zero-sized type occupies no memory.
    if element_size == 0 {
        return;
    }
    let cache_line_size = get_cache_line_size();
    // `max(1)`: an element wider than a line would otherwise yield 0, and a
    // block size of 0 is not a valid step for the blocked pass.
    let elements_per_line = (cache_line_size / element_size).max(1);
    if data.len() <= elements_per_line {
        align_data(data, cache_line_size);
    } else {
        cache_blocked_placement(data, elements_per_line);
    }
}
/// Copies `data` block by block into a temporary buffer and back again.
///
/// The slice is walked in consecutive blocks of `block_size` elements (the
/// last block may be shorter); each block is appended to a staging vector in
/// its original order and the staging vector is then copied back over
/// `data`. Element order and values are therefore unchanged.
///
/// Returns immediately when the whole slice fits in a single block, or when
/// `block_size` is zero (which cannot be used to step through the slice).
fn cache_blocked_placement<T: Copy>(data: &mut [T], block_size: usize) {
    if block_size == 0 || data.len() <= block_size {
        return;
    }
    let mut staged: Vec<T> = Vec::with_capacity(data.len());
    for block in data.chunks(block_size) {
        staged.extend_from_slice(block);
    }
    // `chunks` visits every element exactly once, so the lengths match.
    data.copy_from_slice(&staged);
}
/// Returns the cache line size, in bytes, assumed by the placement routines.
///
/// The value is a fixed constant; the hardware is not queried.
fn get_cache_line_size() -> usize {
    const CACHE_LINE_BYTES: usize = 64;
    CACHE_LINE_BYTES
}
/// Suggests an alignment for values of type `T` on the current target.
///
/// A baseline is chosen per architecture and the result is the larger of
/// that baseline and `size_of::<T>()`:
///
/// * x86_64: 64 when AVX-512F is detected, else 32 when AVX is detected,
///   else 16.
/// * aarch64: 16.
/// * anything else: 8.
///
/// NOTE(review): because `size_of::<T>()` is used as the lower bound, the
/// result is not guaranteed to be a power of two (e.g. a 96-byte type yields
/// 96) — confirm callers do not feed it to APIs that require one.
pub fn optimal_alignment<T>() -> usize {
    let baseline: usize = if cfg!(target_arch = "x86_64") {
        if is_avx512_available() {
            64
        } else if is_avx_available() {
            32
        } else {
            16
        }
    } else if cfg!(target_arch = "aarch64") {
        16
    } else {
        8
    };
    baseline.max(mem::size_of::<T>())
}
/// Reports whether the running CPU supports the `avx` feature.
///
/// Uses runtime feature detection on x86_64; always `false` on every other
/// target architecture.
fn is_avx_available() -> bool {
    #[cfg(target_arch = "x86_64")]
    fn detect() -> bool {
        is_x86_feature_detected!("avx")
    }

    #[cfg(not(target_arch = "x86_64"))]
    fn detect() -> bool {
        false
    }

    detect()
}
/// Reports whether the running CPU supports the `avx512f` feature.
///
/// Uses runtime feature detection on x86_64; always `false` on every other
/// target architecture.
fn is_avx512_available() -> bool {
    #[cfg(target_arch = "x86_64")]
    fn detect() -> bool {
        is_x86_feature_detected!("avx512f")
    }

    #[cfg(not(target_arch = "x86_64"))]
    fn detect() -> bool {
        false
    }

    detect()
}
/// Reports whether the running CPU supports the `avx2` feature.
///
/// Uses runtime feature detection on x86_64; always `false` on every other
/// target architecture.
// The only caller in this file sits behind `cfg(target_arch = "x86_64")`,
// so the function is unused when building for other architectures.
#[allow(dead_code)]
fn is_avx2_available() -> bool {
    #[cfg(target_arch = "x86_64")]
    fn detect() -> bool {
        is_x86_feature_detected!("avx2")
    }

    #[cfg(not(target_arch = "x86_64"))]
    fn detect() -> bool {
        false
    }

    detect()
}
/// Returns how many values of type `T` fit in one vector register on the
/// current target.
///
/// The register width in bytes is chosen as follows and then divided by
/// `size_of::<T>()`:
///
/// * x86_64: 64 when AVX-512F is detected, else 32 when AVX2 or AVX is
///   detected, else 16.
/// * aarch64: 16.
/// * anything else: 4.
///
/// The result is `0` when `T` is wider than the register, and also when `T`
/// is zero-sized (there is no meaningful lane count, and dividing by its
/// size would otherwise panic).
pub fn get_optimal_simd_width<T>() -> usize {
    #[cfg(target_arch = "x86_64")]
    let register_bytes: usize = if is_avx512_available() {
        64
    } else if is_avx2_available() || is_avx_available() {
        32
    } else {
        16
    };
    #[cfg(target_arch = "aarch64")]
    let register_bytes: usize = 16;
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    let register_bytes: usize = 4;

    // `checked_div` turns the zero-sized-type case into 0 instead of a panic.
    register_bytes
        .checked_div(mem::size_of::<T>())
        .unwrap_or(0)
}