/// Cache-line size, in bytes, used for the prefetch arithmetic in this module.
pub const L2_CACHE_LINE_BYTES: usize = 64;

/// Derives a prefetch distance from the size of a `dimension`-element `f32` vector.
///
/// The vector's size in bytes is divided by [`L2_CACHE_LINE_BYTES`] (integer division,
/// i.e. the number of whole cache lines the vector spans) and the quotient is clamped
/// to the inclusive range `4..=16`.
///
/// The byte-size computation saturates instead of overflowing, so arbitrarily large
/// `dimension` values return `16` rather than panicking (debug builds) or wrapping
/// around to a small value (release builds).
///
/// # Parameters
///
/// * `dimension` - number of `f32` components in one vector.
///
/// # Returns
///
/// A value in `4..=16`.
#[inline]
#[must_use]
pub const fn calculate_prefetch_distance(dimension: usize) -> usize {
    const MIN_DISTANCE: usize = 4;
    const MAX_DISTANCE: usize = 16;

    // Saturate: an overflowing product can only mean "far more than MAX_DISTANCE
    // cache lines", which the clamp below maps to MAX_DISTANCE anyway.
    let vector_bytes = dimension.saturating_mul(std::mem::size_of::<f32>());
    let raw_distance = vector_bytes / L2_CACHE_LINE_BYTES;

    // `Ord::clamp` is not callable in a `const fn`, so clamp by hand.
    if raw_distance < MIN_DISTANCE {
        MIN_DISTANCE
    } else if raw_distance > MAX_DISTANCE {
        MAX_DISTANCE
    } else {
        raw_distance
    }
}
/// Issues a cache prefetch hint for the start of an `f32` slice.
///
/// Empty slices are ignored. For non-empty input the hint targets the address of the
/// first element:
///
/// * `x86_64`: a single `_mm_prefetch` with the `_MM_HINT_T0` hint.
/// * `aarch64`: forwarded to `crate::simd_neon_prefetch::prefetch_vector_neon`.
/// * any other architecture: nothing is done.
///
/// The function returns nothing and does not modify `vector`.
#[inline]
pub fn prefetch_vector(vector: &[f32]) {
    if !vector.is_empty() {
        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};

            let first_byte = vector.as_ptr().cast::<i8>();
            // SAFETY: `first_byte` is the data pointer of a live, non-empty slice, so
            // it addresses memory that is valid for the duration of this call.
            unsafe {
                _mm_prefetch(first_byte, _MM_HINT_T0);
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            crate::simd_neon_prefetch::prefetch_vector_neon(vector);
        }

        // No prefetch primitive is used on other targets; just consume the argument.
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            let _ = vector;
        }
    }
}
/// Issues a cache prefetch hint for the start of a `u16` slice.
///
/// Empty slices are ignored. For non-empty input the hint targets the address of the
/// first element:
///
/// * `x86_64`: a single `_mm_prefetch` with the `_MM_HINT_T0` hint.
/// * `aarch64`: `crate::simd_neon_prefetch::prefetch_read_l1` on the slice's data
///   pointer.
/// * any other architecture: nothing is done.
///
/// The function returns nothing and does not modify `data`.
#[inline]
pub fn prefetch_vector_from_u16(data: &[u16]) {
    if !data.is_empty() {
        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};

            let first_byte = data.as_ptr().cast::<i8>();
            // SAFETY: `first_byte` is the data pointer of a live, non-empty slice, so
            // it addresses memory that is valid for the duration of this call.
            unsafe {
                _mm_prefetch(first_byte, _MM_HINT_T0);
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            let first_byte = data.as_ptr().cast::<u8>();
            crate::simd_neon_prefetch::prefetch_read_l1(first_byte);
        }

        // No prefetch primitive is used on other targets; just consume the argument.
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            let _ = data;
        }
    }
}
/// Issues a cache prefetch hint for the start of a `u64` slice.
///
/// Empty slices are ignored. For non-empty input the hint targets the address of the
/// first element:
///
/// * `x86_64`: a single `_mm_prefetch` with the `_MM_HINT_T0` hint.
/// * `aarch64`: `crate::simd_neon_prefetch::prefetch_read_l1` on the slice's data
///   pointer.
/// * any other architecture: nothing is done.
///
/// The function returns nothing and does not modify `data`.
#[inline]
pub fn prefetch_vector_u64(data: &[u64]) {
    if !data.is_empty() {
        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};

            let first_byte = data.as_ptr().cast::<i8>();
            // SAFETY: `first_byte` is the data pointer of a live, non-empty slice, so
            // it addresses memory that is valid for the duration of this call.
            unsafe {
                _mm_prefetch(first_byte, _MM_HINT_T0);
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            let first_byte = data.as_ptr().cast::<u8>();
            crate::simd_neon_prefetch::prefetch_read_l1(first_byte);
        }

        // No prefetch primitive is used on other targets; just consume the argument.
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            let _ = data;
        }
    }
}
/// Issues prefetch hints for several cache lines of an `f32` slice.
///
/// Empty slices are ignored. Otherwise the work is delegated to the
/// architecture-specific helper: `prefetch_multi_x86` on `x86_64`,
/// `prefetch_multi_arm64` on `aarch64`. On every other architecture nothing is done.
///
/// The function returns nothing and does not modify `vector`.
#[inline]
pub fn prefetch_vector_multi_cache_line(vector: &[f32]) {
    if !vector.is_empty() {
        #[cfg(target_arch = "x86_64")]
        prefetch_multi_x86(vector);

        #[cfg(target_arch = "aarch64")]
        prefetch_multi_arm64(vector);

        // No prefetch primitive is used on other targets; just consume the argument.
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        let _ = vector;
    }
}
#[cfg(target_arch = "x86_64")]
#[inline]
fn prefetch_multi_x86(vector: &[f32]) {
use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0, _MM_HINT_T1, _MM_HINT_T2};
let vector_bytes = std::mem::size_of_val(vector);
unsafe {
_mm_prefetch(vector.as_ptr().cast::<i8>(), _MM_HINT_T0);
if vector_bytes > L2_CACHE_LINE_BYTES {
_mm_prefetch(
vector.as_ptr().cast::<i8>().add(L2_CACHE_LINE_BYTES),
_MM_HINT_T1,
);
}
if vector_bytes > L2_CACHE_LINE_BYTES * 2 {
_mm_prefetch(
vector.as_ptr().cast::<i8>().add(L2_CACHE_LINE_BYTES * 2),
_MM_HINT_T2,
);
}
if vector_bytes > L2_CACHE_LINE_BYTES * 4 {
_mm_prefetch(
vector.as_ptr().cast::<i8>().add(L2_CACHE_LINE_BYTES * 4),
_MM_HINT_T2,
);
}
}
}
/// `aarch64` implementation of multi-cache-line prefetching for an `f32` slice.
///
/// Uses a stride of 128 bytes (`ARM_CL`). Prefetch helpers from
/// `crate::simd_neon_prefetch` are called at these byte offsets from the start of the
/// slice, each one only when the slice is strictly larger than that offset:
///
/// | offset           | helper             |
/// |------------------|--------------------|
/// | `0` (always)     | `prefetch_read_l1` |
/// | `ARM_CL`         | `prefetch_read_l1` |
/// | `2 * ARM_CL`     | `prefetch_read_l2` |
/// | `4 * ARM_CL`     | `prefetch_read_l3` |
///
/// Note that the offset `3 * ARM_CL` is not prefetched.
#[cfg(target_arch = "aarch64")]
#[inline]
fn prefetch_multi_arm64(vector: &[f32]) {
    use crate::simd_neon_prefetch as neon;

    const ARM_CL: usize = 128;
    const SECOND_LINE: usize = ARM_CL;
    const THIRD_LINE: usize = ARM_CL * 2;
    const FIFTH_LINE: usize = ARM_CL * 4;

    let total_bytes = std::mem::size_of_val(vector);
    let start = vector.as_ptr().cast::<u8>();

    neon::prefetch_read_l1(start);

    if total_bytes > SECOND_LINE {
        // SAFETY: `total_bytes > SECOND_LINE`, so the offset lies inside the slice.
        let target = unsafe { start.add(SECOND_LINE) };
        neon::prefetch_read_l1(target);
    }
    if total_bytes > THIRD_LINE {
        // SAFETY: `total_bytes > THIRD_LINE`, so the offset lies inside the slice.
        let target = unsafe { start.add(THIRD_LINE) };
        neon::prefetch_read_l2(target);
    }
    if total_bytes > FIFTH_LINE {
        // SAFETY: `total_bytes > FIFTH_LINE`, so the offset lies inside the slice.
        let target = unsafe { start.add(FIFTH_LINE) };
        neon::prefetch_read_l3(target);
    }
}