#[cfg(all(target_arch = "x86", feature = "kernel_sse2"))]
use core::arch::x86::{__m128i, _mm_loadu_si128, _mm_storeu_si128};
#[cfg(all(target_arch = "x86", feature = "kernel_avx2"))]
use core::arch::x86::{__m256i, _mm256_loadu_si256, _mm256_storeu_si256};
#[cfg(all(target_arch = "x86", feature = "kernel_vbmi2"))]
use core::arch::x86::{__m512i, _mm512_loadu_si512, _mm512_storeu_si512};
#[cfg(all(target_arch = "x86_64", feature = "kernel_sse2"))]
use core::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_storeu_si128};
#[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
use core::arch::x86_64::{__m256i, _mm256_loadu_si256, _mm256_storeu_si256};
#[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
use core::arch::x86_64::{__m512i, _mm512_loadu_si512, _mm512_storeu_si512};
#[cfg(all(feature = "std", feature = "kernel_sse2", target_arch = "x86"))]
use std::arch::is_x86_feature_detected;
#[cfg(all(
feature = "std",
feature = "kernel_sse2",
any(target_arch = "x86", target_arch = "x86_64")
))]
use std::sync::OnceLock;
#[cfg(all(
target_arch = "aarch64",
target_feature = "neon",
feature = "kernel_neon"
))]
use core::arch::aarch64::{uint8x16_t, vld1q_u8, vst1q_u8};
#[cfg(all(
target_arch = "wasm32",
target_feature = "simd128",
feature = "kernel_simd128"
))]
use core::arch::wasm32::{v128, v128_load, v128_store};
/// Opt-in instrumentation that counts how the copy routines are being called.
///
/// All counters are process-wide atomics updated with `Ordering::Relaxed`.
/// The `take*` functions return the current values and reset them to zero;
/// each counter is swapped individually, so a snapshot taken while other
/// threads are recording is not a single atomic view.
#[cfg(feature = "copy_shape_stats")]
pub mod shape_stats {
    use core::sync::atomic::{AtomicU64, Ordering};

    /// Calls recorded with `copy_at_least <= 8`.
    pub static CALLS_LE8: AtomicU64 = AtomicU64::new(0);
    /// Calls recorded with `9 <= copy_at_least <= 16`.
    pub static CALLS_9_16: AtomicU64 = AtomicU64::new(0);
    /// Calls recorded with `17 <= copy_at_least <= 32`.
    pub static CALLS_17_32: AtomicU64 = AtomicU64::new(0);
    /// Calls recorded with `copy_at_least > 32`.
    pub static CALLS_GT32: AtomicU64 = AtomicU64::new(0);
    /// Sum of `copy_at_least` over the `> 32` calls.
    pub static REQ_BYTES_GT32: AtomicU64 = AtomicU64::new(0);
    /// Sum of `copy_at_least` rounded up to a multiple of 32 over the `> 32` calls.
    pub static WRITTEN_BYTES_GT32: AtomicU64 = AtomicU64::new(0);
    /// Largest `copy_at_least` recorded since the last [`take`].
    pub static MAX_LEN: AtomicU64 = AtomicU64::new(0);

    /// Non-overlapping repeats: event count.
    pub static MATCH_NONOVERLAP: AtomicU64 = AtomicU64::new(0);
    /// Non-overlapping repeats: summed match length.
    pub static MATCH_NONOVERLAP_BYTES: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `offset < 8`: event count.
    pub static MATCH_OVL_LT8: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `offset < 8`: summed match length.
    pub static MATCH_OVL_LT8_BYTES: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `8 <= offset < 16`: event count.
    pub static MATCH_OVL_8_15: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `8 <= offset < 16`: summed match length.
    pub static MATCH_OVL_8_15_BYTES: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `16 <= offset < 32`: event count.
    pub static MATCH_OVL_16_31: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `16 <= offset < 32`: summed match length.
    pub static MATCH_OVL_16_31_BYTES: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `32 <= offset < 64`: event count.
    pub static MATCH_OVL_32_63: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `32 <= offset < 64`: summed match length.
    pub static MATCH_OVL_32_63_BYTES: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `offset >= 64`: event count.
    pub static MATCH_OVL_GE64: AtomicU64 = AtomicU64::new(0);
    /// Overlapping repeats with `offset >= 64`: summed match length.
    pub static MATCH_OVL_GE64_BYTES: AtomicU64 = AtomicU64::new(0);

    /// Records one repeat event of `match_length` bytes.
    ///
    /// Non-overlapping events share a single bucket regardless of `offset`;
    /// overlapping events are bucketed by `offset`.
    #[inline]
    pub fn record_repeat(offset: usize, match_length: usize, overlapping: bool) {
        let (events, bytes) = match (overlapping, offset) {
            (false, _) => (&MATCH_NONOVERLAP, &MATCH_NONOVERLAP_BYTES),
            (true, 0..=7) => (&MATCH_OVL_LT8, &MATCH_OVL_LT8_BYTES),
            (true, 8..=15) => (&MATCH_OVL_8_15, &MATCH_OVL_8_15_BYTES),
            (true, 16..=31) => (&MATCH_OVL_16_31, &MATCH_OVL_16_31_BYTES),
            (true, 32..=63) => (&MATCH_OVL_32_63, &MATCH_OVL_32_63_BYTES),
            (true, _) => (&MATCH_OVL_GE64, &MATCH_OVL_GE64_BYTES),
        };
        events.fetch_add(1, Ordering::Relaxed);
        bytes.fetch_add(match_length as u64, Ordering::Relaxed);
    }

    /// Returns and resets the repeat counters as `(events, bytes)` pairs.
    ///
    /// Order: non-overlapping, then overlapping with offset `<8`, `8..16`,
    /// `16..32`, `32..64`, `>=64`.
    pub fn take_repeat() -> [(u64, u64); 6] {
        let buckets: [(&AtomicU64, &AtomicU64); 6] = [
            (&MATCH_NONOVERLAP, &MATCH_NONOVERLAP_BYTES),
            (&MATCH_OVL_LT8, &MATCH_OVL_LT8_BYTES),
            (&MATCH_OVL_8_15, &MATCH_OVL_8_15_BYTES),
            (&MATCH_OVL_16_31, &MATCH_OVL_16_31_BYTES),
            (&MATCH_OVL_32_63, &MATCH_OVL_32_63_BYTES),
            (&MATCH_OVL_GE64, &MATCH_OVL_GE64_BYTES),
        ];
        buckets.map(|(events, bytes)| {
            (
                events.swap(0, Ordering::Relaxed),
                bytes.swap(0, Ordering::Relaxed),
            )
        })
    }

    /// Records one copy call that requested `copy_at_least` bytes.
    #[inline]
    pub(super) fn record(copy_at_least: usize) {
        let requested = copy_at_least as u64;
        match copy_at_least {
            0..=8 => {
                CALLS_LE8.fetch_add(1, Ordering::Relaxed);
            }
            9..=16 => {
                CALLS_9_16.fetch_add(1, Ordering::Relaxed);
            }
            17..=32 => {
                CALLS_17_32.fetch_add(1, Ordering::Relaxed);
            }
            _ => {
                CALLS_GT32.fetch_add(1, Ordering::Relaxed);
                REQ_BYTES_GT32.fetch_add(requested, Ordering::Relaxed);
                // Written bytes are accounted as the request rounded up to 32.
                let rounded = copy_at_least.next_multiple_of(32) as u64;
                WRITTEN_BYTES_GT32.fetch_add(rounded, Ordering::Relaxed);
            }
        }
        MAX_LEN.fetch_max(requested, Ordering::Relaxed);
    }

    /// Returns and resets the call-shape counters.
    ///
    /// Order: `CALLS_LE8`, `CALLS_9_16`, `CALLS_17_32`, `CALLS_GT32`,
    /// `REQ_BYTES_GT32`, `WRITTEN_BYTES_GT32`, `MAX_LEN`.
    pub fn take() -> [u64; 7] {
        let counters: [&AtomicU64; 7] = [
            &CALLS_LE8,
            &CALLS_9_16,
            &CALLS_17_32,
            &CALLS_GT32,
            &REQ_BYTES_GT32,
            &WRITTEN_BYTES_GT32,
            &MAX_LEN,
        ];
        counters.map(|counter| counter.swap(0, Ordering::Relaxed))
    }
}
/// Request size (in bytes) at or above which `copy_bytes_overshooting` skips the
/// chunked kernels and issues a single exact `copy_from_nonoverlapping`.
const BULK_MEMCPY_THRESHOLD: usize = 2048;
/// Copies at least `copy_at_least` bytes from `src.0` to `dst.0`, and may copy
/// more ("overshoot") when both buffers are large enough for a rounded-up length.
///
/// `src.1` and `dst.1` are treated as the number of bytes accessible behind each
/// pointer; only their minimum is used, to decide whether a rounded-up copy fits.
///
/// Strategy, in order:
/// 1. `copy_at_least == 0`: nothing happens.
/// 2. `<= 16` bytes with at least 16 bytes available: one 16-byte operation.
/// 3. `<= 32` bytes: exact copy using byte or overlapping 8-byte accesses.
/// 4. `>= BULK_MEMCPY_THRESHOLD`: one exact `copy_from_nonoverlapping`.
/// 5. Otherwise the first enabled SIMD kernel whose rounded-up length fits, then
///    word-sized chunks, then an exact `copy_from_nonoverlapping`.
///
/// # Safety
///
/// `src.0` must be readable and `dst.0` writable for at least `copy_at_least`
/// bytes, and for `src.1` / `dst.1` bytes respectively, since those bound the
/// overshoot. Several paths use `copy_from_nonoverlapping`, so the two regions
/// must not overlap.
#[inline(always)]
pub(crate) unsafe fn copy_bytes_overshooting(
    src: (*const u8, usize),
    dst: (*mut u8, usize),
    copy_at_least: usize,
) {
    if copy_at_least == 0 {
        return;
    }
    #[cfg(feature = "copy_shape_stats")]
    shape_stats::record(copy_at_least);

    let (from, from_cap) = src;
    let (to, to_cap) = dst;
    // Overshoot is only allowed up to the smaller of the two buffers.
    let usable = from_cap.min(to_cap);

    if copy_at_least <= 16 && usable >= 16 {
        // SAFETY: both buffers hold at least 16 bytes, the most the helper touches.
        unsafe { single_op_copy_16(from, to, copy_at_least) };
        debug_assert_eq_copy(src, dst, copy_at_least);
        return;
    }

    if copy_at_least <= 32 {
        // SAFETY: every access below stays inside `[0, copy_at_least)`, which the
        // caller guarantees is valid for both pointers.
        unsafe {
            match copy_at_least {
                0..=8 => {
                    for idx in 0..copy_at_least {
                        to.add(idx).write(from.add(idx).read());
                    }
                }
                9..=16 => {
                    // Two 8-byte words; the second is anchored at the end and
                    // overlaps the first when the length is below 16.
                    let head = from.cast::<u64>().read_unaligned();
                    let tail_at = copy_at_least - 8;
                    let tail = from.add(tail_at).cast::<u64>().read_unaligned();
                    to.cast::<u64>().write_unaligned(head);
                    to.add(tail_at).cast::<u64>().write_unaligned(tail);
                }
                _ => {
                    // 17..=32: the first 16 bytes, then the last 16 bytes
                    // (overlapping when the length is below 32).
                    let word0 = from.cast::<u64>().read_unaligned();
                    let word1 = from.add(8).cast::<u64>().read_unaligned();
                    to.cast::<u64>().write_unaligned(word0);
                    to.add(8).cast::<u64>().write_unaligned(word1);

                    let back16 = copy_at_least - 16;
                    let back8 = copy_at_least - 8;
                    let word2 = from.add(back16).cast::<u64>().read_unaligned();
                    let word3 = from.add(back8).cast::<u64>().read_unaligned();
                    to.add(back16).cast::<u64>().write_unaligned(word2);
                    to.add(back8).cast::<u64>().write_unaligned(word3);
                }
            }
        }
        debug_assert_eq_copy(src, dst, copy_at_least);
        return;
    }

    if copy_at_least >= BULK_MEMCPY_THRESHOLD {
        // SAFETY: exact-length copy of caller-validated, non-overlapping regions.
        unsafe { to.copy_from_nonoverlapping(from, copy_at_least) };
        debug_assert_eq_copy(src, dst, copy_at_least);
        return;
    }

    // Runs `$kernel` over the request rounded up to `$chunk` bytes and returns
    // from the enclosing function, but only if that padded length fits in both
    // buffers; otherwise control falls through to the next candidate.
    #[allow(unused_macros)]
    macro_rules! try_chunk_kernel {
        ($chunk:expr, $kernel:ident) => {{
            if copy_at_least >= $chunk {
                let padded = copy_at_least.next_multiple_of($chunk);
                if padded <= usable {
                    unsafe { $kernel(from, to, padded) };
                    debug_assert_eq_copy(src, dst, copy_at_least);
                    return;
                }
            }
        }};
    }

    // x86 with `std`: pick the widest kernel the cached runtime caps allow.
    #[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
    {
        #[cfg(feature = "kernel_sse2")]
        let caps = detect_x86_caps();
        #[cfg(feature = "kernel_vbmi2")]
        if caps.avx512f {
            try_chunk_kernel!(64, copy_avx512);
        }
        #[cfg(feature = "kernel_avx2")]
        if caps.avx2 {
            try_chunk_kernel!(32, copy_avx2);
        }
        #[cfg(feature = "kernel_sse2")]
        if caps.sse2 {
            try_chunk_kernel!(16, copy_sse2);
        }
    }
    // x86 without `std`: only compile-time target features can be relied on.
    #[cfg(all(not(feature = "std"), any(target_arch = "x86", target_arch = "x86_64")))]
    {
        #[cfg(all(target_feature = "avx512vbmi2", feature = "kernel_vbmi2"))]
        try_chunk_kernel!(64, copy_avx512);
        #[cfg(all(target_feature = "avx2", feature = "kernel_avx2"))]
        try_chunk_kernel!(32, copy_avx2);
        #[cfg(all(target_feature = "sse2", feature = "kernel_sse2"))]
        try_chunk_kernel!(16, copy_sse2);
    }
    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        feature = "kernel_neon"
    ))]
    try_chunk_kernel!(16, copy_neon);
    #[cfg(all(
        target_arch = "wasm32",
        target_feature = "simd128",
        feature = "kernel_simd128"
    ))]
    try_chunk_kernel!(16, copy_simd128);

    // Portable tail: word-sized chunks if the padded length fits, else exact.
    let word = core::mem::size_of::<usize>();
    let padded = copy_at_least.next_multiple_of(word);
    // SAFETY: the chunked path is taken only when `padded` bytes fit in both
    // buffers; the other path copies exactly `copy_at_least` bytes.
    unsafe {
        if padded <= usable {
            copy_scalar(from, to, padded);
        } else {
            to.copy_from_nonoverlapping(from, copy_at_least);
        }
    }
    debug_assert_eq_copy(src, dst, copy_at_least);
}
/// AVX2-compiled variant of `copy_bytes_overshooting`.
///
/// It shares the small-copy paths (`<= 16` via one 16-byte operation when the
/// buffers allow it, `<= 32` exact) but then goes straight to `copy_avx2` without
/// runtime feature detection. It has no bulk-memcpy threshold and records no
/// shape statistics. If the request rounded up to 32 bytes does not fit in both
/// buffers, it falls back to word-sized chunks and finally to an exact copy.
///
/// # Safety
///
/// The CPU must support AVX2. `src.0` must be readable and `dst.0` writable for
/// at least `copy_at_least` bytes, and for `src.1` / `dst.1` bytes respectively.
/// The regions must not overlap.
#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64"),
    feature = "kernel_avx2"
))]
#[target_feature(enable = "avx2")]
#[allow(dead_code)]
pub(crate) unsafe fn copy_bytes_overshooting_avx2(
    src: (*const u8, usize),
    dst: (*mut u8, usize),
    copy_at_least: usize,
) {
    if copy_at_least == 0 {
        return;
    }

    let (input, input_cap) = src;
    let (output, output_cap) = dst;
    // Overshoot may never exceed the smaller buffer.
    let room = input_cap.min(output_cap);

    if copy_at_least <= 16 && room >= 16 {
        // SAFETY: both buffers provide the 16 bytes the helper may touch.
        unsafe { single_op_copy_16(input, output, copy_at_least) };
        debug_assert_eq_copy(src, dst, copy_at_least);
        return;
    }

    if copy_at_least <= 32 {
        // SAFETY: all accesses stay within `[0, copy_at_least)`.
        unsafe {
            match copy_at_least {
                0..=8 => {
                    for pos in 0..copy_at_least {
                        output.add(pos).write(input.add(pos).read());
                    }
                }
                9..=16 => {
                    // First 8 bytes plus an end-anchored, overlapping 8 bytes.
                    let front = input.cast::<u64>().read_unaligned();
                    let back_at = copy_at_least - 8;
                    let back = input.add(back_at).cast::<u64>().read_unaligned();
                    output.cast::<u64>().write_unaligned(front);
                    output.add(back_at).cast::<u64>().write_unaligned(back);
                }
                _ => {
                    // First 16 bytes, then the end-anchored last 16 bytes.
                    let a = input.cast::<u64>().read_unaligned();
                    let b = input.add(8).cast::<u64>().read_unaligned();
                    output.cast::<u64>().write_unaligned(a);
                    output.add(8).cast::<u64>().write_unaligned(b);

                    let c_at = copy_at_least - 16;
                    let d_at = copy_at_least - 8;
                    let c = input.add(c_at).cast::<u64>().read_unaligned();
                    let d = input.add(d_at).cast::<u64>().read_unaligned();
                    output.add(c_at).cast::<u64>().write_unaligned(c);
                    output.add(d_at).cast::<u64>().write_unaligned(d);
                }
            }
        }
        debug_assert_eq_copy(src, dst, copy_at_least);
        return;
    }

    let padded_simd = copy_at_least.next_multiple_of(32);
    if padded_simd <= room {
        // SAFETY: `padded_simd` bytes fit in both buffers; AVX2 is enabled here.
        unsafe { copy_avx2(input, output, padded_simd) };
        debug_assert_eq_copy(src, dst, copy_at_least);
        return;
    }

    let word = core::mem::size_of::<usize>();
    let padded_words = copy_at_least.next_multiple_of(word);
    // SAFETY: the chunked path runs only when `padded_words` bytes fit; the other
    // path copies exactly the requested length.
    unsafe {
        if padded_words <= room {
            copy_scalar(input, output, padded_words);
        } else {
            output.copy_from_nonoverlapping(input, copy_at_least);
        }
    }
    debug_assert_eq_copy(src, dst, copy_at_least);
}
/// Copies the first `len` (at most 16) bytes from `src` to `dst` with as few
/// operations as possible, overshooting up to 16 bytes.
///
/// With a SIMD kernel enabled, a single 16-byte load/store pair is used. The
/// portable fallback issues two 8-byte accesses: one at offset 0 and one at
/// `len - 8` (clamped to 0), so it touches `max(len, 8)` bytes.
///
/// # Safety
///
/// Both pointers must be valid for 16 bytes (reads for `src`, writes for `dst`).
#[inline(always)]
unsafe fn single_op_copy_16(src: *const u8, dst: *mut u8, len: usize) {
    debug_assert!(len <= 16);

    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        feature = "kernel_neon"
    ))]
    {
        // SAFETY: the caller guarantees 16 accessible bytes on both sides.
        unsafe {
            let lane: uint8x16_t = vld1q_u8(src);
            vst1q_u8(dst, lane);
        }
        return;
    }

    #[cfg(all(
        target_arch = "wasm32",
        target_feature = "simd128",
        feature = "kernel_simd128"
    ))]
    {
        // SAFETY: the caller guarantees 16 accessible bytes on both sides.
        unsafe {
            let lane: v128 = v128_load(src.cast::<v128>());
            v128_store(dst.cast::<v128>(), lane);
        }
        return;
    }

    #[cfg(all(
        feature = "std",
        feature = "kernel_sse2",
        any(target_arch = "x86", target_arch = "x86_64")
    ))]
    {
        if detect_x86_caps().sse2 {
            // SAFETY: SSE2 was detected at runtime; 16 bytes are accessible.
            unsafe { copy_sse2(src, dst, 16) };
            return;
        }
    }

    #[cfg(all(
        not(feature = "std"),
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "sse2",
        feature = "kernel_sse2"
    ))]
    {
        // SAFETY: SSE2 is a compile-time target feature; 16 bytes are accessible.
        unsafe { copy_sse2(src, dst, 16) };
        return;
    }

    // Portable fallback: a head word plus an end-anchored tail word.
    #[allow(unreachable_code)]
    unsafe {
        let head = src.cast::<u64>().read_unaligned();
        let tail_at = if len > 8 { len - 8 } else { 0 };
        let tail = src.add(tail_at).cast::<u64>().read_unaligned();
        dst.cast::<u64>().write_unaligned(head);
        dst.add(tail_at).cast::<u64>().write_unaligned(tail);
    }
}
/// Debug-build sanity check: panics unless the first `_len` bytes behind `_src.0`
/// and `_dst.0` are equal. Compiles to nothing without `debug_assertions`.
///
/// Only the pointer halves of the tuples are used. Callers must pass pointers
/// that are valid for `_len` bytes of reads.
#[inline(always)]
fn debug_assert_eq_copy(_src: (*const u8, usize), _dst: (*mut u8, usize), _len: usize) {
    #[cfg(debug_assertions)]
    {
        let (source_ptr, _) = _src;
        let (dest_ptr, _) = _dst;
        // SAFETY: relies on the caller having just copied `_len` bytes between
        // these two pointers, which makes both ranges readable.
        let (expected, actual) = unsafe {
            (
                core::slice::from_raw_parts(source_ptr, _len),
                core::slice::from_raw_parts(dest_ptr as *const u8, _len),
            )
        };
        assert_eq!(expected, actual);
    }
}
/// Benchmark-only entry point (compiled with the `bench_internals` feature) that
/// forwards its arguments unchanged to `copy_bytes_overshooting`.
///
/// # Safety
///
/// Same requirements as `copy_bytes_overshooting`; nothing is checked here.
#[cfg(feature = "bench_internals")]
#[inline(always)]
pub(crate) unsafe fn copy_bytes_overshooting_for_bench(
src: (*const u8, usize),
dst: (*mut u8, usize),
copy_at_least: usize,
) {
// SAFETY: the caller upholds the contract of `copy_bytes_overshooting`.
unsafe { copy_bytes_overshooting(src, dst, copy_at_least) };
}
/// Test helper: reports the chunk width (in bytes) of the first kernel that
/// `copy_bytes_overshooting` tries in the current build and on the current CPU.
///
/// The branches mirror the dispatcher's order:
/// * x86/x86_64 with `std`: 64, 32 or 16 depending on the cached runtime caps;
/// * x86/x86_64 without `std`: 64, 32 or 16 depending on compile-time features;
/// * aarch64 with NEON: 16;
/// * wasm32 with `simd128`: 16;
/// * otherwise `size_of::<usize>()`, the scalar kernel's chunk width.
#[cfg(test)]
#[inline]
pub(crate) fn active_chunk_size_for_tests() -> usize {
    #[cfg(all(
        feature = "std",
        feature = "kernel_sse2",
        any(target_arch = "x86", target_arch = "x86_64")
    ))]
    {
        let caps = detect_x86_caps();
        #[cfg(feature = "kernel_vbmi2")]
        if caps.avx512f {
            return 64;
        }
        #[cfg(feature = "kernel_avx2")]
        if caps.avx2 {
            return 32;
        }
        if caps.sse2 {
            return 16;
        }
    }
    #[cfg(all(
        not(feature = "std"),
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "avx512vbmi2",
        feature = "kernel_vbmi2"
    ))]
    {
        return 64;
    }
    #[cfg(all(
        not(feature = "std"),
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "avx2",
        feature = "kernel_avx2"
    ))]
    {
        return 32;
    }
    #[cfg(all(
        not(feature = "std"),
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "sse2",
        feature = "kernel_sse2"
    ))]
    {
        return 16;
    }
    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        feature = "kernel_neon"
    ))]
    {
        return 16;
    }
    // The dispatcher also has a 16-byte `copy_simd128` kernel for wasm32; report
    // it here so the helper agrees with the dispatcher on that target too.
    #[cfg(all(
        target_arch = "wasm32",
        target_feature = "simd128",
        feature = "kernel_simd128"
    ))]
    {
        return 16;
    }
    #[allow(unreachable_code)]
    {
        core::mem::size_of::<usize>()
    }
}
/// Copies `len` bytes from `src` to `dst` in unaligned `usize`-sized words.
///
/// Whole words are always transferred, so a `len` that is not a multiple of
/// `size_of::<usize>()` is effectively rounded up to the next multiple.
///
/// # Safety
///
/// Both pointers must be valid for `len` rounded up to a whole number of words.
#[inline(always)]
unsafe fn copy_scalar(src: *const u8, dst: *mut u8, len: usize) {
    const WORD: usize = core::mem::size_of::<usize>();
    let mut offset = 0usize;
    while offset < len {
        // SAFETY: the caller guarantees the word starting at `offset` is inside
        // both buffers.
        unsafe {
            let word = src.add(offset).cast::<usize>().read_unaligned();
            dst.add(offset).cast::<usize>().write_unaligned(word);
        }
        offset += WORD;
    }
}
/// Which x86 copy kernels the runtime dispatcher may use, as produced (and
/// cached) by `detect_x86_caps`.
#[cfg(all(
feature = "std",
feature = "kernel_sse2",
any(target_arch = "x86", target_arch = "x86_64")
))]
#[derive(Clone, Copy)]
#[allow(dead_code)]
struct X86Caps {
/// Gates the 64-byte `copy_avx512` kernel.
avx512f: bool,
/// Gates the 32-byte `copy_avx2` kernel.
avx2: bool,
/// Gates the 16-byte `copy_sse2` kernel.
sse2: bool,
}
/// Returns the x86 kernel capabilities, computing them on first use and serving
/// the cached value afterwards.
///
/// On x86_64 the result is derived from the crate-wide `detect_cpu_kernel()` tier.
/// Each tier enables every narrower copy kernel, and the `Bmi2` tier maps to
/// SSE2-only. On 32-bit x86 each flag is a runtime CPUID check combined with the
/// corresponding cargo feature.
#[cfg(all(
    feature = "std",
    feature = "kernel_sse2",
    any(target_arch = "x86", target_arch = "x86_64")
))]
#[inline(always)]
fn detect_x86_caps() -> X86Caps {
    /// Performs the actual detection; used as the `OnceLock` initializer.
    fn probe() -> X86Caps {
        #[cfg(target_arch = "x86_64")]
        {
            use crate::cpu_kernel::{CpuKernelTag, detect_cpu_kernel};
            // (avx512f, avx2, sse2) for each kernel tier.
            let (avx512f, avx2, sse2) = match detect_cpu_kernel() {
                #[cfg(feature = "kernel_vbmi2")]
                CpuKernelTag::Vbmi2 => (true, true, true),
                #[cfg(feature = "kernel_avx2")]
                CpuKernelTag::Avx2 => (false, true, true),
                #[cfg(feature = "kernel_bmi2")]
                CpuKernelTag::Bmi2 => (false, false, true),
                #[cfg(feature = "kernel_sse2")]
                CpuKernelTag::Sse2 => (false, false, true),
                CpuKernelTag::Scalar => (false, false, false),
            };
            X86Caps {
                avx512f,
                avx2,
                sse2,
            }
        }
        #[cfg(target_arch = "x86")]
        {
            let avx512f =
                cfg!(feature = "kernel_vbmi2") && is_x86_feature_detected!("avx512vbmi2");
            let avx2 = cfg!(feature = "kernel_avx2") && is_x86_feature_detected!("avx2");
            let sse2 = cfg!(feature = "kernel_sse2") && is_x86_feature_detected!("sse2");
            X86Caps {
                avx512f,
                avx2,
                sse2,
            }
        }
    }

    static CAPS: OnceLock<X86Caps> = OnceLock::new();
    *CAPS.get_or_init(probe)
}
/// Copies `len` bytes from `src` to `dst` in unaligned 16-byte SSE2 chunks.
///
/// Whole chunks are always transferred, so `len` is effectively rounded up to
/// the next multiple of 16.
///
/// # Safety
///
/// The CPU must support SSE2, and both pointers must be valid for `len` rounded
/// up to a multiple of 16 bytes.
#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64"),
    feature = "kernel_sse2"
))]
#[target_feature(enable = "sse2")]
#[allow(dead_code)]
unsafe fn copy_sse2(src: *const u8, dst: *mut u8, len: usize) {
    const LANE: usize = 16;
    let mut offset = 0usize;
    while offset < len {
        // SAFETY: the caller guarantees the chunk at `offset` lies inside both
        // buffers; the intrinsics accept unaligned addresses.
        unsafe {
            let chunk: __m128i = _mm_loadu_si128(src.add(offset).cast::<__m128i>());
            _mm_storeu_si128(dst.add(offset).cast::<__m128i>(), chunk);
        }
        offset += LANE;
    }
}
/// Copies `len` bytes from `src` to `dst` with unaligned 32-byte AVX2 accesses.
///
/// The main loop moves 64 bytes (two vectors) per iteration; if anything remains
/// after the last full 64-byte step, exactly one more 32-byte vector is copied.
/// `len` is expected to be a multiple of 32, which is only checked in debug builds.
///
/// # Safety
///
/// The CPU must support AVX2, and both pointers must be valid for `len` bytes.
#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64"),
    feature = "kernel_avx2"
))]
#[target_feature(enable = "avx2")]
#[allow(dead_code)]
unsafe fn copy_avx2(src: *const u8, dst: *mut u8, len: usize) {
    debug_assert!(
        len.is_multiple_of(32),
        "copy_avx2 expects len to be a multiple of 32 (dispatcher rounds up)",
    );
    // Bytes covered by the two-vector loop.
    let paired_len = len - (len % 64);
    let mut offset = 0usize;
    while offset < paired_len {
        // SAFETY: `offset + 64 <= paired_len <= len`, which the caller guarantees
        // is accessible on both sides.
        unsafe {
            let lower: __m256i = _mm256_loadu_si256(src.add(offset).cast::<__m256i>());
            let upper: __m256i = _mm256_loadu_si256(src.add(offset + 32).cast::<__m256i>());
            _mm256_storeu_si256(dst.add(offset).cast::<__m256i>(), lower);
            _mm256_storeu_si256(dst.add(offset + 32).cast::<__m256i>(), upper);
        }
        offset += 64;
    }
    if offset < len {
        // SAFETY: with `len` a multiple of 32, exactly one vector remains.
        unsafe {
            let rest: __m256i = _mm256_loadu_si256(src.add(offset).cast::<__m256i>());
            _mm256_storeu_si256(dst.add(offset).cast::<__m256i>(), rest);
        }
    }
}
/// Copies `len` bytes from `src` to `dst` in unaligned 64-byte AVX-512 chunks.
///
/// Whole chunks are always transferred, so `len` is effectively rounded up to
/// the next multiple of 64.
///
/// # Safety
///
/// The CPU must support AVX-512F, and both pointers must be valid for `len`
/// rounded up to a multiple of 64 bytes.
#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64"),
    feature = "kernel_vbmi2"
))]
#[target_feature(enable = "avx512f")]
#[allow(dead_code)]
unsafe fn copy_avx512(src: *const u8, dst: *mut u8, len: usize) {
    const LANE: usize = 64;
    let mut offset = 0usize;
    while offset < len {
        // SAFETY: the caller guarantees the chunk at `offset` lies inside both
        // buffers; the intrinsics accept unaligned addresses.
        unsafe {
            let chunk: __m512i = _mm512_loadu_si512(src.add(offset).cast::<__m512i>());
            _mm512_storeu_si512(dst.add(offset).cast::<__m512i>(), chunk);
        }
        offset += LANE;
    }
}
/// Copies `len` bytes from `src` to `dst` in 16-byte NEON chunks.
///
/// Whole chunks are always transferred, so `len` is effectively rounded up to
/// the next multiple of 16.
///
/// # Safety
///
/// Both pointers must be valid for `len` rounded up to a multiple of 16 bytes.
#[cfg(all(
    target_arch = "aarch64",
    target_feature = "neon",
    feature = "kernel_neon"
))]
#[inline(always)]
unsafe fn copy_neon(src: *const u8, dst: *mut u8, len: usize) {
    const LANE: usize = 16;
    let mut offset = 0usize;
    while offset < len {
        // SAFETY: the caller guarantees the chunk at `offset` lies inside both
        // buffers.
        unsafe {
            let chunk: uint8x16_t = vld1q_u8(src.add(offset));
            vst1q_u8(dst.add(offset), chunk);
        }
        offset += LANE;
    }
}
/// Copies `len` bytes from `src` to `dst` in 16-byte WebAssembly SIMD chunks.
///
/// Whole chunks are always transferred, so `len` is effectively rounded up to
/// the next multiple of 16.
///
/// # Safety
///
/// Both pointers must be valid for `len` rounded up to a multiple of 16 bytes.
#[cfg(all(
    target_arch = "wasm32",
    target_feature = "simd128",
    feature = "kernel_simd128"
))]
#[inline(always)]
unsafe fn copy_simd128(src: *const u8, dst: *mut u8, len: usize) {
    const LANE: usize = 16;
    let mut offset = 0usize;
    while offset < len {
        // SAFETY: the caller guarantees the chunk at `offset` lies inside both
        // buffers.
        unsafe {
            let chunk: v128 = v128_load(src.add(offset).cast::<v128>());
            v128_store(dst.add(offset).cast::<v128>(), chunk);
        }
        offset += LANE;
    }
}
/// Copies exactly `len` bytes (`len >= 33`) from `src` to `dst` using unaligned
/// 32-byte AVX2 loads/stores, never touching memory outside `[0, len)`.
///
/// Lengths that are not a multiple of the vector width are handled with an
/// end-anchored vector that overlaps bytes already copied, instead of a scalar tail.
/// Only compiled when AVX2 is a compile-time target feature.
///
/// # Safety
///
/// `len` must be at least 33 (only checked in debug builds; smaller values make
/// `len - 32` underflow) and both pointers must be valid for `len` bytes.
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "avx2",
feature = "kernel_avx2"
))]
#[inline]
unsafe fn copy_exact_inline_avx2(src: *const u8, dst: *mut u8, len: usize) {
debug_assert!(len >= 33, "copy_exact_inline_avx2 requires len >= 33");
unsafe {
if len <= 64 {
// 33..=64: the first 32 bytes plus the last 32 bytes cover the whole range.
let a = _mm256_loadu_si256(src.cast::<__m256i>());
let b = _mm256_loadu_si256(src.add(len - 32).cast::<__m256i>());
_mm256_storeu_si256(dst.cast::<__m256i>(), a);
_mm256_storeu_si256(dst.add(len - 32).cast::<__m256i>(), b);
} else if len <= 128 {
// 65..=128: the first 64 bytes plus the last 64 bytes cover the whole range.
let a = _mm256_loadu_si256(src.cast::<__m256i>());
let b = _mm256_loadu_si256(src.add(32).cast::<__m256i>());
let c = _mm256_loadu_si256(src.add(len - 64).cast::<__m256i>());
let d = _mm256_loadu_si256(src.add(len - 32).cast::<__m256i>());
_mm256_storeu_si256(dst.cast::<__m256i>(), a);
_mm256_storeu_si256(dst.add(32).cast::<__m256i>(), b);
_mm256_storeu_si256(dst.add(len - 64).cast::<__m256i>(), c);
_mm256_storeu_si256(dst.add(len - 32).cast::<__m256i>(), d);
} else {
// > 128: 64-byte steps, then at most one 32-byte step...
let mut o = 0usize;
while o + 64 <= len {
let v0 = _mm256_loadu_si256(src.add(o).cast::<__m256i>());
let v1 = _mm256_loadu_si256(src.add(o + 32).cast::<__m256i>());
_mm256_storeu_si256(dst.add(o).cast::<__m256i>(), v0);
_mm256_storeu_si256(dst.add(o + 32).cast::<__m256i>(), v1);
o += 64;
}
while o + 32 <= len {
let v = _mm256_loadu_si256(src.add(o).cast::<__m256i>());
_mm256_storeu_si256(dst.add(o).cast::<__m256i>(), v);
o += 32;
}
// ...and, if fewer than 32 bytes remain, one end-anchored overlapping vector.
if o < len {
let t = len - 32;
let v = _mm256_loadu_si256(src.add(t).cast::<__m256i>());
_mm256_storeu_si256(dst.add(t).cast::<__m256i>(), v);
}
}
}
}
/// Copies exactly `len` bytes (`len >= 33`) from `src` to `dst` using unaligned
/// 16-byte SSE2 loads/stores, never touching memory outside `[0, len)`.
///
/// Only compiled for 32-bit x86 builds that have SSE2 but not AVX2 as
/// compile-time target features. Remainders are handled with an end-anchored
/// vector that overlaps bytes already copied.
///
/// # Safety
///
/// `len` must be at least 33 (only checked in debug builds) and both pointers
/// must be valid for `len` bytes.
#[cfg(all(
target_arch = "x86",
target_feature = "sse2",
not(target_feature = "avx2"),
feature = "kernel_sse2"
))]
#[inline]
unsafe fn copy_exact_inline_sse2(src: *const u8, dst: *mut u8, len: usize) {
debug_assert!(len >= 33, "copy_exact_inline_sse2 requires len >= 33");
unsafe {
if len <= 64 {
// 33..=64: the first 32 bytes plus the last 32 bytes cover the whole range.
let a = _mm_loadu_si128(src.cast::<__m128i>());
let b = _mm_loadu_si128(src.add(16).cast::<__m128i>());
let c = _mm_loadu_si128(src.add(len - 32).cast::<__m128i>());
let d = _mm_loadu_si128(src.add(len - 16).cast::<__m128i>());
_mm_storeu_si128(dst.cast::<__m128i>(), a);
_mm_storeu_si128(dst.add(16).cast::<__m128i>(), b);
_mm_storeu_si128(dst.add(len - 32).cast::<__m128i>(), c);
_mm_storeu_si128(dst.add(len - 16).cast::<__m128i>(), d);
} else {
// > 64: 32-byte steps, then at most one 16-byte step...
let mut o = 0usize;
while o + 32 <= len {
let v0 = _mm_loadu_si128(src.add(o).cast::<__m128i>());
let v1 = _mm_loadu_si128(src.add(o + 16).cast::<__m128i>());
_mm_storeu_si128(dst.add(o).cast::<__m128i>(), v0);
_mm_storeu_si128(dst.add(o + 16).cast::<__m128i>(), v1);
o += 32;
}
while o + 16 <= len {
_mm_storeu_si128(
dst.add(o).cast::<__m128i>(),
_mm_loadu_si128(src.add(o).cast::<__m128i>()),
);
o += 16;
}
// ...and, if fewer than 16 bytes remain, one end-anchored overlapping vector.
if o < len {
let t = len - 16;
_mm_storeu_si128(
dst.add(t).cast::<__m128i>(),
_mm_loadu_si128(src.add(t).cast::<__m128i>()),
);
}
}
}
}
/// Copies exactly `len` bytes (`len >= 33`) from `src` to `dst` using 16-byte
/// NEON loads/stores, never touching memory outside `[0, len)`.
///
/// Remainders are handled with an end-anchored vector that overlaps bytes
/// already copied.
///
/// # Safety
///
/// `len` must be at least 33 (only checked in debug builds) and both pointers
/// must be valid for `len` bytes.
#[cfg(all(
target_arch = "aarch64",
target_feature = "neon",
feature = "kernel_neon"
))]
#[inline]
unsafe fn copy_exact_inline_neon(src: *const u8, dst: *mut u8, len: usize) {
debug_assert!(len >= 33, "copy_exact_inline_neon requires len >= 33");
let mut o = 0usize;
unsafe {
// Main loop: two vectors (32 bytes) per iteration.
while o + 32 <= len {
let v0 = vld1q_u8(src.add(o));
let v1 = vld1q_u8(src.add(o + 16));
vst1q_u8(dst.add(o), v0);
vst1q_u8(dst.add(o + 16), v1);
o += 32;
}
// At most one further whole 16-byte vector.
while o + 16 <= len {
vst1q_u8(dst.add(o), vld1q_u8(src.add(o)));
o += 16;
}
// Fewer than 16 bytes left: copy the last 16 bytes of the range, overlapping
// what was already written.
if o < len {
let t = len - 16;
vst1q_u8(dst.add(t), vld1q_u8(src.add(t)));
}
}
}
/// Copies exactly `len` bytes (`len >= 33`) from `src` to `dst` without
/// overshooting, using the inline SIMD routine selected at compile time.
///
/// Selection:
/// * x86/x86_64 with compile-time AVX2 and `kernel_avx2`: `copy_exact_inline_avx2`;
/// * 32-bit x86 with compile-time SSE2 (no AVX2) and `kernel_sse2`:
///   `copy_exact_inline_sse2`;
/// * aarch64 with NEON and `kernel_neon`: `copy_exact_inline_neon`;
/// * anything else: a plain non-overlapping memory copy.
///
/// # Safety
///
/// `len` must be at least 33 (only checked in debug builds), both pointers must
/// be valid for `len` bytes, and the regions must not overlap.
#[inline]
pub(crate) unsafe fn copy_exact_medium(src: *const u8, dst: *mut u8, len: usize) {
    debug_assert!(
        len >= 33,
        "copy_exact_medium requires len >= 33 (overlapping SIMD tail underflows below that)",
    );

    #[cfg(all(
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "avx2",
        feature = "kernel_avx2"
    ))]
    {
        // SAFETY: the caller's contract is forwarded unchanged.
        unsafe { copy_exact_inline_avx2(src, dst, len) };
    }

    #[cfg(all(
        target_arch = "x86",
        target_feature = "sse2",
        not(target_feature = "avx2"),
        feature = "kernel_sse2"
    ))]
    {
        // SAFETY: the caller's contract is forwarded unchanged.
        unsafe { copy_exact_inline_sse2(src, dst, len) };
    }

    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        feature = "kernel_neon"
    ))]
    {
        // SAFETY: the caller's contract is forwarded unchanged.
        unsafe { copy_exact_inline_neon(src, dst, len) };
    }

    // Exactly one branch survives cfg evaluation; this one is the complement of
    // the three SIMD configurations above.
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "avx2",
            feature = "kernel_avx2"
        ),
        all(
            target_arch = "x86",
            target_feature = "sse2",
            not(target_feature = "avx2"),
            feature = "kernel_sse2"
        ),
        all(
            target_arch = "aarch64",
            target_feature = "neon",
            feature = "kernel_neon"
        )
    )))]
    {
        // SAFETY: the caller guarantees `len` valid, non-overlapping bytes.
        unsafe { dst.copy_from_nonoverlapping(src, len) };
    }
}
// Unit tests for the copy dispatcher and the individual kernels. Kernel tests
// that need a CPU feature return early when the feature is not detected at runtime.
#[cfg(test)]
mod tests {
use super::*;
use alloc::vec;
// `copy_exact_medium` must reproduce the source for every length in 33..2048.
#[test]
fn copy_exact_medium_matches_memcpy_all_sizes() {
// Deterministic pseudo-random source bytes (multiplicative hash of the index).
let src: vec::Vec<u8> = (0..4096u32)
.map(|i| (i.wrapping_mul(2654435761) >> 24) as u8)
.collect();
for len in 33..2048usize {
let mut got = vec![0u8; len];
unsafe { copy_exact_medium(src.as_ptr(), got.as_mut_ptr(), len) };
assert_eq!(
&got[..],
&src[..len],
"copy_exact_medium mismatch at len={len}"
);
}
}
// A zero-length request must leave the destination untouched.
#[test]
fn copy_bytes_overshooting_zero_len_is_noop() {
let src = [1_u8, 2, 3, 4];
let mut dst = [9_u8, 9, 9, 9];
unsafe {
copy_bytes_overshooting((src.as_ptr(), src.len()), (dst.as_mut_ptr(), dst.len()), 0);
}
assert_eq!(dst, [9_u8, 9, 9, 9]);
}
// Both capacities equal the requested 65 bytes, so no rounded-up chunk length
// fits and the dispatcher has to use its exact-length fallback.
#[test]
fn copy_bytes_overshooting_fallback_exact_copy_when_caps_are_tight() {
let len = 65; let src = vec![5_u8; len];
let mut dst = vec![0_u8; len];
unsafe {
copy_bytes_overshooting((src.as_ptr(), len), (dst.as_mut_ptr(), len), len);
}
assert_eq!(dst, src);
}
// Requests of 1..=16 bytes with 32-byte buffers take the single-operation path;
// only the requested prefix is checked because overshoot is permitted.
#[test]
fn copy_bytes_overshooting_single_op_small() {
for len in 1..=16 {
let mut src = [0u8; 32];
for (i, b) in src.iter_mut().enumerate() {
*b = i as u8;
}
let mut dst = [0u8; 32];
unsafe {
copy_bytes_overshooting((src.as_ptr(), 32), (dst.as_mut_ptr(), 32), len);
}
assert_eq!(&dst[..len], &src[..len], "len={len}");
}
}
// The scalar kernel copies an 8-byte buffer completely.
#[test]
fn copy_scalar_copies_requested_bytes() {
let src = [11_u8, 12, 13, 14, 15, 16, 17, 18];
let mut dst = [0_u8; 8];
unsafe { copy_scalar(src.as_ptr(), dst.as_mut_ptr(), src.len()) };
assert_eq!(dst, src);
}
// One full 16-byte SSE2 chunk.
#[cfg(all(
feature = "std",
feature = "kernel_sse2",
any(target_arch = "x86", target_arch = "x86_64")
))]
#[test]
fn copy_sse2_copies_full_chunk_when_available() {
if !std::arch::is_x86_feature_detected!("sse2") {
return;
}
let src = [7_u8; 16];
let mut dst = [0_u8; 16];
unsafe { copy_sse2(src.as_ptr(), dst.as_mut_ptr(), 16) };
assert_eq!(dst, src);
}
// 32 bytes: the unrolled loop is skipped and only the single-vector tail runs.
#[cfg(all(
feature = "std",
feature = "kernel_avx2",
any(target_arch = "x86", target_arch = "x86_64")
))]
#[test]
fn copy_avx2_copies_full_chunk_when_available() {
if !std::arch::is_x86_feature_detected!("avx2") {
return;
}
let src = [8_u8; 32];
let mut dst = [0_u8; 32];
unsafe { copy_avx2(src.as_ptr(), dst.as_mut_ptr(), 32) };
assert_eq!(dst, src);
}
// 64 bytes: exactly one iteration of the two-vector loop and no tail.
#[cfg(all(
feature = "std",
feature = "kernel_avx2",
any(target_arch = "x86", target_arch = "x86_64")
))]
#[test]
fn copy_avx2_copies_full_unroll2_iteration() {
use alloc::vec::Vec;
if !std::arch::is_x86_feature_detected!("avx2") {
return;
}
let src: Vec<u8> = (0..64u8).collect();
let mut dst = [0_u8; 64];
unsafe { copy_avx2(src.as_ptr(), dst.as_mut_ptr(), 64) };
assert_eq!(&dst[..], &src[..]);
}
// 96 bytes: one two-vector iteration followed by the 32-byte tail.
#[cfg(all(
feature = "std",
feature = "kernel_avx2",
any(target_arch = "x86", target_arch = "x86_64")
))]
#[test]
fn copy_avx2_copies_unroll2_loop_plus_residual_tail() {
use alloc::vec::Vec;
if !std::arch::is_x86_feature_detected!("avx2") {
return;
}
let src: Vec<u8> = (0..96u8).collect();
let mut dst = [0_u8; 96];
unsafe { copy_avx2(src.as_ptr(), dst.as_mut_ptr(), 96) };
assert_eq!(&dst[..], &src[..]);
// Bytes straddling offset 64, where the loop hands over to the tail.
assert_eq!(&dst[60..68], &[60, 61, 62, 63, 64, 65, 66, 67]);
}
// One full 64-byte AVX-512 chunk.
#[cfg(all(
feature = "std",
feature = "kernel_vbmi2",
any(target_arch = "x86", target_arch = "x86_64")
))]
#[test]
fn copy_avx512_copies_full_chunk_when_available() {
if !std::arch::is_x86_feature_detected!("avx512f") {
return;
}
let src = [9_u8; 64];
let mut dst = [0_u8; 64];
unsafe { copy_avx512(src.as_ptr(), dst.as_mut_ptr(), 64) };
assert_eq!(dst, src);
}
}