#[cfg(target_arch = "x86_64")]
pub(crate) mod x86 {
    //! Raw-pointer copy helpers for x86_64.
    //!
    //! The helpers move data in fixed-size chunks (16, 8 or 4 bytes). The two
    //! `wildcopy_*` functions always finish the chunk they started, so they may
    //! read and write past the `length` they were given; each function's
    //! `# Safety` section spells out the span it actually touches.
    use core::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_storeu_si128};

    /// Copies exactly 16 bytes from `src` to `dst` using one unaligned
    /// 128-bit load followed by one unaligned 128-bit store.
    ///
    /// # Safety
    ///
    /// `src` must be valid for a 16-byte read and `dst` for a 16-byte write.
    /// Neither pointer needs any particular alignment.
    #[inline(always)]
    pub(crate) unsafe fn copy16(dst: *mut u8, src: *const u8) {
        // SAFETY: the caller guarantees 16 readable bytes at `src` and 16
        // writable bytes at `dst`; the `loadu`/`storeu` forms accept
        // unaligned addresses.
        unsafe {
            let chunk = _mm_loadu_si128(src.cast::<__m128i>());
            _mm_storeu_si128(dst.cast::<__m128i>(), chunk);
        }
    }

    /// Copies `length` bytes from `src` to `dst` in 16-byte chunks, rounding
    /// the amount copied up to the next multiple of 16.
    ///
    /// At least one chunk is always copied, so even `length == 1` moves 16
    /// bytes. `length == 0` trips the debug assertion; in release builds it
    /// still copies one chunk.
    ///
    /// # Safety
    ///
    /// With `n` = `length` rounded up to a multiple of 16 (minimum 16), `src`
    /// must be valid for reads of `n` bytes and `dst` for writes of `n` bytes.
    /// NOTE(review): the name suggests callers only use this when the two
    /// regions do not overlap; the loop itself does not check that.
    #[inline(always)]
    pub(crate) unsafe fn wildcopy_no_overlap(dst: *mut u8, src: *const u8, length: usize) {
        debug_assert!(length > 0);
        // SAFETY: every chunk starts at a multiple of 16 below the rounded-up
        // length, which the caller guarantees is readable/writable.
        unsafe {
            // The first chunk is unconditional; further chunks run only while
            // the bytes copied so far fall short of `length`.
            copy16(dst, src);
            let mut done = 16usize;
            while done < length {
                copy16(dst.add(done), src.add(done));
                done += 16;
            }
        }
    }

    /// Copies `length` bytes from `src` to `dst` in 8-byte chunks, rounding
    /// the amount copied up to the next multiple of 8.
    ///
    /// Each chunk is fully read before it is written, and chunks are processed
    /// front to back, so a chunk written through `dst` is visible to later
    /// reads through `src`. With `dst == src + 8` this repeats the first
    /// eight bytes over and over.
    ///
    /// At least one chunk is always copied. `length == 0` trips the debug
    /// assertion; in release builds it still copies one chunk.
    ///
    /// # Safety
    ///
    /// With `n` = `length` rounded up to a multiple of 8 (minimum 8), `src`
    /// must be valid for reads of `n` bytes and `dst` for writes of `n` bytes.
    #[inline(always)]
    pub(crate) unsafe fn wildcopy_overlap_8byte_stride(
        dst: *mut u8,
        src: *const u8,
        length: usize,
    ) {
        debug_assert!(length > 0);
        // SAFETY: every 8-byte access starts at a multiple of 8 below the
        // rounded-up length, which the caller guarantees is in bounds; the
        // unaligned read/write methods impose no alignment requirement.
        unsafe {
            let first = (src as *const u64).read_unaligned();
            (dst as *mut u64).write_unaligned(first);
            let mut done = 8usize;
            while done < length {
                let word = (src.add(done) as *const u64).read_unaligned();
                (dst.add(done) as *mut u64).write_unaligned(word);
                done += 8;
            }
        }
    }

    /// Writes 8 bytes at `dst` taken from `src` and returns the advanced
    /// `(dst, src)` pair.
    ///
    /// * `offset >= 8`: one plain 8-byte copy; both pointers advance by 8.
    /// * `offset < 8`: four single-byte copies `dst[i] = src[i]`, then one
    ///   4-byte copy from `src + DEC32_TABLE[offset]` to `dst + 4`. The
    ///   returned source pointer is
    ///   `src + DEC32_TABLE[offset] - DEC64_TABLE[offset] + 8`, and the
    ///   returned destination pointer is `dst + 8`.
    ///
    /// When `src == dst - offset` (how the tests in this file call it) the
    /// short-offset path replicates the `offset`-byte pattern that precedes
    /// `dst`.
    ///
    /// # Safety
    ///
    /// `dst` must be valid for an 8-byte write. `src` must be valid for an
    /// 8-byte read; on the short-offset path the last byte read is at most
    /// `src + 7` (4-byte read starting no later than `src + 4`).
    #[inline(always)]
    pub(crate) unsafe fn overlap_copy8(
        dst: *mut u8,
        src: *const u8,
        offset: usize,
    ) -> (*mut u8, *const u8) {
        // Forward distance from `src` at which the second 4-byte half is read.
        const DEC32_TABLE: [u32; 8] = [0, 1, 2, 1, 4, 4, 4, 4];
        // Amount subtracted again when computing the returned source pointer.
        const DEC64_TABLE: [i32; 8] = [8, 8, 8, 7, 8, 9, 10, 11];

        if offset >= 8 {
            // SAFETY: the caller guarantees 8 readable bytes at `src` and 8
            // writable bytes at `dst`.
            unsafe {
                let word = src.cast::<u64>().read_unaligned();
                dst.cast::<u64>().write_unaligned(word);
                return (dst.add(8), src.add(8));
            }
        }

        let forward = DEC32_TABLE[offset] as usize;
        let rewind = DEC64_TABLE[offset];

        // SAFETY: all reads stay within `src..src + 8` and all writes within
        // `dst..dst + 8`, which the caller guarantees are valid.
        unsafe {
            // Byte-by-byte so that each store through `dst` can be observed
            // by the following loads through `src` when the ranges overlap.
            for i in 0..4 {
                dst.add(i).write(src.add(i).read());
            }

            let tail = src.add(forward).cast::<u32>().read_unaligned();
            dst.add(4).cast::<u32>().write_unaligned(tail);

            let net_offset = forward as isize - rewind as isize + 8;
            debug_assert!(
                net_offset >= 0,
                "overlap_copy8 net offset is non-negative for all offset ∈ 1..=7"
            );
            (dst.add(8), src.offset(net_offset))
        }
    }
}
#[cfg(all(test, target_arch = "x86_64"))]
mod inline_helper_tests {
    //! Unit tests for the raw-pointer copy helpers in `x86`.
    //!
    //! Tests whose source and destination live in the same buffer derive BOTH
    //! pointers from a single `as_mut_ptr()` call. Taking the source from a
    //! separate `as_ptr()` shared borrow would be invalidated by the helper's
    //! writes through the destination pointer under Rust's aliasing models
    //! (as checked by Miri), even though the addresses are the same.
    use super::x86::{copy16, overlap_copy8, wildcopy_no_overlap, wildcopy_overlap_8byte_stride};

    /// `copy16` transfers a full 16-byte array.
    #[test]
    fn copy16_copies_exactly_16_bytes() {
        let src: [u8; 16] = [
            0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD,
            0xAE, 0xAF,
        ];
        let mut dst = [0u8; 16];
        // SAFETY: both arrays are exactly 16 bytes long and distinct.
        unsafe { copy16(dst.as_mut_ptr(), src.as_ptr()) };
        assert_eq!(dst, src);
    }

    /// A 1-byte request still copies one whole 16-byte chunk and nothing more.
    #[test]
    fn wildcopy_no_overlap_short_length_overshoots() {
        let src: [u8; 32] = core::array::from_fn(|i| (i + 1) as u8);
        let mut dst = [0u8; 32];
        // SAFETY: both buffers hold 32 bytes, more than the 16 touched.
        unsafe { wildcopy_no_overlap(dst.as_mut_ptr(), src.as_ptr(), 1) };
        assert_eq!(&dst[..16], &src[..16]);
        assert!(dst[16..].iter().all(|&b| b == 0));
    }

    /// A 24-byte request needs two chunks, i.e. 32 bytes are copied.
    #[test]
    fn wildcopy_no_overlap_length_above_16_uses_multiple_iters() {
        let src: [u8; 32] = core::array::from_fn(|i| (i + 1) as u8);
        let mut dst = [0u8; 32];
        // SAFETY: both buffers hold the 32 bytes touched by two chunks.
        unsafe { wildcopy_no_overlap(dst.as_mut_ptr(), src.as_ptr(), 24) };
        assert_eq!(&dst[..32], &src[..32]);
    }

    /// With `dst == src + 8` the first 8 bytes are repeated chunk after chunk.
    #[test]
    fn wildcopy_overlap_8byte_stride_rle_expansion_offset_8() {
        let mut buf = [0u8; 32];
        buf[..8].copy_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8]);
        // One raw pointer for both sides: the helper re-reads through `src`
        // after it has written through `dst`.
        let base = buf.as_mut_ptr();
        // SAFETY: reads cover buf[0..16], writes cover buf[8..24]; both lie
        // inside the 32-byte buffer.
        unsafe {
            wildcopy_overlap_8byte_stride(base.add(8), base as *const u8, 16);
        }
        assert_eq!(&buf[8..16], &[1, 2, 3, 4, 5, 6, 7, 8]);
        assert_eq!(&buf[16..24], &[1, 2, 3, 4, 5, 6, 7, 8]);
        // Exactly two chunks were written; the tail is untouched.
        assert!(buf[24..].iter().all(|&b| b == 0));
    }

    /// `offset >= 8` is a plain 8-byte copy that advances both pointers by 8.
    #[test]
    fn overlap_copy8_offset_ge_8_does_plain_copy() {
        let mut buf = [0u8; 32];
        buf[..8].copy_from_slice(&[0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88]);
        let base = buf.as_mut_ptr();
        // SAFETY: reads buf[0..8], writes buf[8..16]; both in bounds.
        let (op2, ip2) = unsafe { overlap_copy8(base.add(8), base as *const u8, 8) };
        assert_eq!(op2 as usize, base as usize + 16);
        assert_eq!(ip2 as usize, base as usize + 8);
        assert_eq!(
            &buf[8..16],
            &[0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88]
        );
    }

    /// `offset == 3` replicates the 3-byte pattern across the 8 output bytes.
    #[test]
    fn overlap_copy8_offset_lt_8_spreads_source() {
        let mut buf = [0u8; 32];
        buf[..3].copy_from_slice(&[0xAA, 0xBB, 0xCC]);
        let base = buf.as_mut_ptr();
        // SAFETY: reads stay within buf[0..8], writes within buf[3..11].
        let (op2, ip2) = unsafe { overlap_copy8(base.add(3), base as *const u8, 3) };
        assert_eq!(op2 as usize, base as usize + 11);
        // For offset 3 the source advances by 1 - 7 + 8 = 2, leaving a gap of
        // 9 bytes (a multiple of the pattern length) to the destination.
        assert_eq!(ip2 as usize, base as usize + 2);
        assert_eq!(
            &buf[3..11],
            &[0xAA, 0xBB, 0xCC, 0xAA, 0xBB, 0xCC, 0xAA, 0xBB]
        );
        assert!(buf[11..].iter().all(|&b| b == 0));
    }

    /// Every short offset 1..=7 yields the periodic pattern, and the returned
    /// pointers end up at least 8 bytes apart by a multiple of `offset`.
    #[test]
    fn overlap_copy8_all_short_offsets_replicate_pattern() {
        for offset in 1..8usize {
            let mut buf = [0u8; 32];
            for (i, b) in buf[..offset].iter_mut().enumerate() {
                *b = 0x10 + i as u8;
            }
            let base = buf.as_mut_ptr();
            // SAFETY: reads stay within buf[0..8]; writes cover
            // buf[offset..offset + 8], at most buf[7..15].
            let (op2, ip2) =
                unsafe { overlap_copy8(base.add(offset), base as *const u8, offset) };
            assert_eq!(op2 as usize, base as usize + offset + 8, "offset {offset}");

            let gap = op2 as usize - ip2 as usize;
            assert!(gap >= 8, "offset {offset}: gap {gap} is below 8");
            assert_eq!(gap % offset, 0, "offset {offset}: gap {gap} breaks the period");

            for k in 0..8 {
                assert_eq!(
                    buf[offset + k],
                    0x10 + (k % offset) as u8,
                    "offset {offset}, output byte {k}"
                );
            }
            assert!(buf[offset + 8..].iter().all(|&b| b == 0));
        }
    }
}