/// Executes one sequence (a literal run followed by a back-referenced match)
/// using exact-length copies, so nothing is written past the sequence's last
/// byte.
///
/// The literals are copied from `lit_src` to `base + tail`. The match is then
/// written at `base + tail + lit_length`, reading from `offset` bytes before
/// that position inside the same buffer.
///
/// # Safety
///
/// * `base + tail ..` must be writable for `lit_length + match_length` bytes.
/// * `lit_src` must be readable for `lit_length` bytes and must not overlap
///   the literal destination (it is copied with `copy_nonoverlapping`).
/// * `offset` must not exceed `tail + lit_length`, and the bytes the match
///   reads from must belong to the same allocation as `base`.
#[inline]
pub(crate) unsafe fn exec_sequence_bounded_copy(
    base: *mut u8,
    tail: usize,
    lit_src: *const u8,
    lit_length: usize,
    offset: usize,
    match_length: usize,
) {
    // SAFETY: the caller upholds the pointer contract listed above.
    unsafe {
        // Literal run first: the match may reference these freshly written bytes.
        core::ptr::copy_nonoverlapping(lit_src, base.add(tail), lit_length);

        let match_start = tail + lit_length;
        let dst = base.add(match_start);
        let src = base.cast_const().add(match_start - offset);

        if match_length <= offset {
            // Source ends at or before the destination starts: plain memcpy.
            core::ptr::copy_nonoverlapping(src, dst, match_length);
            return;
        }

        // Source and destination overlap: a forward byte-by-byte copy repeats
        // the last `offset` bytes as a pattern, which is the intended result.
        for i in 0..match_length {
            dst.add(i).write(src.add(i).read());
        }
    }
}
/// Executes one sequence (literal run + match copy) straight into the decode
/// buffer using 16-byte and 32-byte chunked copies. The expansion is an
/// expression that evaluates to a `Result`.
///
/// Arguments: `$buffer` (anything offering `buffer_mut()`), `$lit_src`
/// (`*const u8`), `$lit_length`, `$offset`, `$match_length` (all `usize`).
/// Every argument expression is evaluated exactly once.
///
/// Result: `Err(e)` is forwarded unchanged when `sequence_output_fits` rejects
/// the sequence; otherwise the bytes are written, the backend is committed at
/// `tail + total`, and the expansion yields `Ok(())`.
///
/// NOTE(review): the expansion contains an `unsafe` block. It relies on
/// `offset <= tail + lit_length`, on `match_length > 0` (the chunked copy
/// helpers debug-assert a non-zero length), and on `lit_src` being readable
/// for at least 16 bytes; none of that is checked here unless
/// `sequence_output_fits` does it - confirm against that helper and callers.
#[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
macro_rules! exec_sequence_avx2_inline {
($buffer:expr, $lit_src:expr, $lit_length:expr, $offset:expr, $match_length:expr) => {{
use crate::decoding::buffer_backend::sequence_output_fits;
use crate::decoding::exec_sequence_inline::x86::{
copy16, overlap_copy8, wildcopy_no_overlap, wildcopy_no_overlap_avx2,
wildcopy_overlap_8byte_stride,
};
// A copy done in 32-byte chunks writes its length rounded up to a multiple
// of 32, i.e. at most 31 bytes more than requested.
const MAX_WILDCOPY_OVERSHOOT: usize = 31;
// Bind each macro argument once, with an explicit type.
let lit_length_v: usize = $lit_length;
let offset_v: usize = $offset;
let match_length_v: usize = $match_length;
let lit_src_v: *const u8 = $lit_src;
let backend = $buffer.buffer_mut();
let cap = backend.cap();
let tail = backend.tail();
// NOTE(review): `total` is presumably lit_length + match_length and the
// trailing `0` an extra-reserve amount - both are defined by
// `sequence_output_fits`, which is not visible here; confirm.
match sequence_output_fits(lit_length_v, match_length_v, tail, cap, 0) {
Err(e) => Err(e),
Ok(total) => {
unsafe {
let base = backend.inline_exec_base_ptr();
// Fewer than MAX_WILDCOPY_OVERSHOOT spare bytes would remain after this
// sequence: chunked stores could run past `cap`, so use exact-length copies.
if total + MAX_WILDCOPY_OVERSHOOT > cap - tail {
$crate::decoding::exec_sequence_inline::exec_sequence_bounded_copy(
base,
tail,
lit_src_v,
lit_length_v,
offset_v,
match_length_v,
);
} else {
// Fast path. Literals go to `tail`; the match follows them and reads
// from `offset_v` bytes before its own destination.
let op_lit = base.add(tail);
let op_match = base.add(tail + lit_length_v);
let match_src = base.cast_const().add(tail + lit_length_v - offset_v);
// Unconditional 16-byte move: reads 16 bytes from `lit_src_v` and writes
// 16 bytes at `op_lit` even when `lit_length_v` is smaller than 16.
copy16(op_lit, lit_src_v);
if lit_length_v > 16 {
wildcopy_no_overlap(
op_lit.add(16),
lit_src_v.add(16),
lit_length_v - 16,
);
}
// Pick the widest chunk that the source/destination distance allows:
// each chunk is loaded in full before it is stored, so the chunk width
// must not exceed `offset_v`.
if offset_v >= 32 {
wildcopy_no_overlap_avx2(op_match, match_src, match_length_v);
} else if offset_v >= 16 {
wildcopy_no_overlap(op_match, match_src, match_length_v);
} else {
// Short offset: `overlap_copy8` emits the first 8 match bytes and returns
// the advanced destination/source pointers; the rest is copied 8 bytes
// at a time from those pointers.
let (op2, ip2) = overlap_copy8(op_match, match_src, offset_v);
if match_length_v > 8 {
wildcopy_overlap_8byte_stride(op2, ip2, match_length_v - 8);
}
}
}
// Publish the new end of valid output (same value for both paths).
backend.inline_exec_commit(tail + total);
}
Ok(())
}
}
}};
}
#[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
pub(crate) use exec_sequence_avx2_inline;
/// Executes one sequence (literal run + match copy) straight into the decode
/// buffer using 16-byte chunked copies only. The expansion is an expression
/// that evaluates to a `Result`.
///
/// Arguments: `$buffer` (anything offering `buffer_mut()`), `$lit_src`
/// (`*const u8`), `$lit_length`, `$offset`, `$match_length` (all `usize`).
/// Every argument expression is evaluated exactly once.
///
/// Result: `Err(e)` is forwarded unchanged when `sequence_output_fits` rejects
/// the sequence; otherwise the bytes are written, the backend is committed at
/// `tail + total`, and the expansion yields `Ok(())`.
///
/// NOTE(review): this macro is named `sse2` but is compiled only with the
/// `kernel_bmi2` feature - confirm that the gate is intentional.
///
/// NOTE(review): the expansion contains an `unsafe` block. It relies on
/// `offset <= tail + lit_length`, on `match_length > 0` (the chunked copy
/// helpers debug-assert a non-zero length; with a 15-byte slack the
/// unconditional 16-byte literal store also needs `total >= 1`), and on
/// `lit_src` being readable for at least 16 bytes; none of that is checked
/// here unless `sequence_output_fits` does it - confirm.
#[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
macro_rules! exec_sequence_sse2_inline {
($buffer:expr, $lit_src:expr, $lit_length:expr, $offset:expr, $match_length:expr) => {{
use crate::decoding::buffer_backend::sequence_output_fits;
use crate::decoding::exec_sequence_inline::x86::{
copy16, overlap_copy8, wildcopy_no_overlap, wildcopy_overlap_8byte_stride,
};
// A copy done in 16-byte chunks writes its length rounded up to a multiple
// of 16, i.e. at most 15 bytes more than requested.
const MAX_WILDCOPY_OVERSHOOT: usize = 15;
// Bind each macro argument once, with an explicit type.
let lit_length_v: usize = $lit_length;
let offset_v: usize = $offset;
let match_length_v: usize = $match_length;
let lit_src_v: *const u8 = $lit_src;
let backend = $buffer.buffer_mut();
let cap = backend.cap();
let tail = backend.tail();
// NOTE(review): `total` is presumably lit_length + match_length and the
// trailing `0` an extra-reserve amount - both are defined by
// `sequence_output_fits`, which is not visible here; confirm.
match sequence_output_fits(lit_length_v, match_length_v, tail, cap, 0) {
Err(e) => Err(e),
Ok(total) => {
unsafe {
let base = backend.inline_exec_base_ptr();
// Fewer than MAX_WILDCOPY_OVERSHOOT spare bytes would remain after this
// sequence: chunked stores could run past `cap`, so use exact-length copies.
if total + MAX_WILDCOPY_OVERSHOOT > cap - tail {
$crate::decoding::exec_sequence_inline::exec_sequence_bounded_copy(
base,
tail,
lit_src_v,
lit_length_v,
offset_v,
match_length_v,
);
} else {
// Fast path. Literals go to `tail`; the match follows them and reads
// from `offset_v` bytes before its own destination.
let op_lit = base.add(tail);
let op_match = base.add(tail + lit_length_v);
let match_src = base.cast_const().add(tail + lit_length_v - offset_v);
// Unconditional 16-byte move: reads 16 bytes from `lit_src_v` and writes
// 16 bytes at `op_lit` even when `lit_length_v` is smaller than 16.
copy16(op_lit, lit_src_v);
if lit_length_v > 16 {
wildcopy_no_overlap(
op_lit.add(16),
lit_src_v.add(16),
lit_length_v - 16,
);
}
// 16-byte chunks are loaded in full before being stored, so they are
// only usable when the source trails the destination by at least 16.
if offset_v >= 16 {
wildcopy_no_overlap(op_match, match_src, match_length_v);
} else {
// Short offset: `overlap_copy8` emits the first 8 match bytes and returns
// the advanced destination/source pointers; the rest is copied 8 bytes
// at a time from those pointers.
let (op2, ip2) = overlap_copy8(op_match, match_src, offset_v);
if match_length_v > 8 {
wildcopy_overlap_8byte_stride(op2, ip2, match_length_v - 8);
}
}
}
// Publish the new end of valid output (same value for both paths).
backend.inline_exec_commit(tail + total);
}
Ok(())
}
}
}};
}
#[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
pub(crate) use exec_sequence_sse2_inline;
/// Raw-pointer copy primitives for x86_64, built on unaligned 128-bit and
/// 256-bit loads/stores. The sequence-execution macros in this file import
/// their helpers from here.
#[cfg(target_arch = "x86_64")]
pub(crate) mod x86 {
    use core::arch::x86_64::{
        __m128i, __m256i, _mm_loadu_si128, _mm_storeu_si128, _mm256_loadu_si256,
        _mm256_storeu_si256,
    };

    /// Copies exactly 32 bytes from `src` to `dst` with one unaligned 256-bit
    /// load followed by one unaligned 256-bit store.
    ///
    /// # Safety
    ///
    /// `src` must be readable and `dst` writable for 32 bytes, and the CPU
    /// must support the 256-bit intrinsics used here.
    #[inline(always)]
    // Only referenced through `wildcopy_no_overlap_avx2`.
    #[allow(dead_code)]
    pub(crate) unsafe fn copy32_avx2(dst: *mut u8, src: *const u8) {
        // SAFETY: the caller guarantees 32 accessible bytes on both sides.
        unsafe {
            let chunk = _mm256_loadu_si256(src.cast::<__m256i>());
            _mm256_storeu_si256(dst.cast::<__m256i>(), chunk);
        }
    }

    /// Copies at least `length` bytes from `src` to `dst` in 32-byte chunks.
    ///
    /// At least one chunk is always copied, and the total written is `length`
    /// rounded up to a multiple of 32, so up to 31 bytes past `length` are
    /// read and written.
    ///
    /// # Safety
    ///
    /// `src` must be readable and `dst` writable for `length` rounded up to a
    /// multiple of 32 bytes. Each chunk is loaded in full before it is stored,
    /// so a source overlapping the destination must trail it by at least 32
    /// bytes for the result to equal a forward byte copy.
    #[inline(always)]
    // Only referenced from the `kernel_avx2`-gated macro in this file.
    #[allow(dead_code)]
    pub(crate) unsafe fn wildcopy_no_overlap_avx2(dst: *mut u8, src: *const u8, length: usize) {
        debug_assert!(length > 0);
        let mut pos = 0usize;
        let mut more = true;
        // Runs the body once before testing `pos` against `length`.
        while more {
            // SAFETY: `pos` stays below `length` rounded up to a multiple of 32.
            unsafe { copy32_avx2(dst.add(pos), src.add(pos)) };
            pos += 32;
            more = pos < length;
        }
    }

    /// Copies exactly 16 bytes from `src` to `dst` with one unaligned 128-bit
    /// load followed by one unaligned 128-bit store.
    ///
    /// # Safety
    ///
    /// `src` must be readable and `dst` writable for 16 bytes.
    #[inline(always)]
    pub(crate) unsafe fn copy16(dst: *mut u8, src: *const u8) {
        // SAFETY: the caller guarantees 16 accessible bytes on both sides.
        unsafe {
            let chunk = _mm_loadu_si128(src.cast::<__m128i>());
            _mm_storeu_si128(dst.cast::<__m128i>(), chunk);
        }
    }

    /// Copies at least `length` bytes from `src` to `dst` in 16-byte chunks.
    ///
    /// At least one chunk is always copied, and the total written is `length`
    /// rounded up to a multiple of 16, so up to 15 bytes past `length` are
    /// read and written.
    ///
    /// # Safety
    ///
    /// `src` must be readable and `dst` writable for `length` rounded up to a
    /// multiple of 16 bytes. Each chunk is loaded in full before it is stored,
    /// so a source overlapping the destination must trail it by at least 16
    /// bytes for the result to equal a forward byte copy.
    #[inline(always)]
    pub(crate) unsafe fn wildcopy_no_overlap(dst: *mut u8, src: *const u8, length: usize) {
        debug_assert!(length > 0);
        let mut pos = 0usize;
        let mut more = true;
        // Runs the body once before testing `pos` against `length`.
        while more {
            // SAFETY: `pos` stays below `length` rounded up to a multiple of 16.
            unsafe { copy16(dst.add(pos), src.add(pos)) };
            pos += 16;
            more = pos < length;
        }
    }

    /// Copies at least `length` bytes from `src` to `dst` in 8-byte steps,
    /// each step being an unaligned `u64` read followed by an unaligned `u64`
    /// write.
    ///
    /// At least one step is always performed, and the total written is
    /// `length` rounded up to a multiple of 8.
    ///
    /// # Safety
    ///
    /// `src` must be readable and `dst` writable for `length` rounded up to a
    /// multiple of 8 bytes. A source overlapping the destination must trail it
    /// by at least 8 bytes for the result to equal a forward byte copy.
    #[inline(always)]
    pub(crate) unsafe fn wildcopy_overlap_8byte_stride(
        dst: *mut u8,
        src: *const u8,
        length: usize,
    ) {
        debug_assert!(length > 0);
        let mut pos = 0usize;
        let mut more = true;
        // Runs the body once before testing `pos` against `length`.
        while more {
            // SAFETY: `pos` stays below `length` rounded up to a multiple of 8.
            unsafe {
                let word = src.add(pos).cast::<u64>().read_unaligned();
                dst.add(pos).cast::<u64>().write_unaligned(word);
            }
            pos += 8;
            more = pos < length;
        }
    }

    /// Writes the first 8 bytes of a match and returns the `(dst, src)` pair
    /// from which the remainder can be copied.
    ///
    /// * `offset >= 8`: one plain 8-byte copy; both pointers advance by 8.
    /// * `offset < 8`: four bytes are copied one at a time, then a 4-byte word
    ///   is copied from `src + DEC32_TABLE[offset]` to `dst + 4`. The returned
    ///   source is `src + DEC32_TABLE[offset] - DEC64_TABLE[offset] + 8`. When
    ///   `dst == src + offset`, the returned pointers are 8, 8, 9, 8, 10, 12
    ///   and 14 bytes apart for offsets 1 through 7: always at least 8 and a
    ///   multiple of `offset`.
    ///
    /// # Safety
    ///
    /// `dst` must be writable for 8 bytes and `src` readable for the bytes
    /// described above. NOTE(review): `offset == 0` is not rejected here;
    /// presumably callers never pass it - confirm.
    #[inline(always)]
    pub(crate) unsafe fn overlap_copy8(
        dst: *mut u8,
        src: *const u8,
        offset: usize,
    ) -> (*mut u8, *const u8) {
        // Forward adjustment of the source before the second 4-byte copy.
        const DEC32_TABLE: [u32; 8] = [0, 1, 2, 1, 4, 4, 4, 4];
        // Backward adjustment applied to the source afterwards.
        const DEC64_TABLE: [i32; 8] = [8, 8, 8, 7, 8, 9, 10, 11];

        // SAFETY: the caller upholds the pointer contract listed above.
        unsafe {
            if offset >= 8 {
                // The source ends at or before the destination starts.
                let word = src.cast::<u64>().read_unaligned();
                dst.cast::<u64>().write_unaligned(word);
                return (dst.add(8), src.add(8));
            }

            let rewind = DEC64_TABLE[offset];
            // Byte-wise and in order, so already written destination bytes
            // feed later reads when `offset < 4`.
            for i in 0..4 {
                *dst.add(i) = *src.add(i);
            }
            let advance = DEC32_TABLE[offset] as usize;
            let word = src.add(advance).cast::<u32>().read_unaligned();
            dst.add(4).cast::<u32>().write_unaligned(word);

            let net_offset = advance as isize - rewind as isize + 8;
            debug_assert!(
                net_offset >= 0,
                "overlap_copy8 net offset is non-negative for all offset ∈ 1..=7"
            );
            (dst.add(8), src.offset(net_offset))
        }
    }
}
/// Architecture-independent versions of the copy primitives, written with
/// unaligned integer reads and writes. Compiled on non-x86_64 targets and in
/// test builds.
#[cfg(any(not(target_arch = "x86_64"), test))]
pub(crate) mod portable {
    /// Copies exactly 16 bytes from `src` to `dst` as one unaligned `u128`
    /// read followed by one unaligned `u128` write.
    ///
    /// # Safety
    ///
    /// `src` must be readable and `dst` writable for 16 bytes.
    #[inline(always)]
    pub(crate) unsafe fn copy16(dst: *mut u8, src: *const u8) {
        // SAFETY: the caller guarantees 16 accessible bytes on both sides.
        unsafe {
            let chunk = src.cast::<u128>().read_unaligned();
            dst.cast::<u128>().write_unaligned(chunk);
        }
    }

    /// Copies at least `length` bytes from `src` to `dst` in 16-byte chunks.
    ///
    /// At least one chunk is always copied, and the total written is `length`
    /// rounded up to a multiple of 16, so up to 15 bytes past `length` are
    /// read and written.
    ///
    /// # Safety
    ///
    /// `src` must be readable and `dst` writable for `length` rounded up to a
    /// multiple of 16 bytes. Each chunk is read in full before it is written,
    /// so a source overlapping the destination must trail it by at least 16
    /// bytes for the result to equal a forward byte copy.
    #[inline(always)]
    pub(crate) unsafe fn wildcopy_no_overlap(dst: *mut u8, src: *const u8, length: usize) {
        debug_assert!(length > 0);
        let mut pos = 0usize;
        let mut more = true;
        // Runs the body once before testing `pos` against `length`.
        while more {
            // SAFETY: `pos` stays below `length` rounded up to a multiple of 16.
            unsafe { copy16(dst.add(pos), src.add(pos)) };
            pos += 16;
            more = pos < length;
        }
    }

    /// Copies at least `length` bytes from `src` to `dst` in 8-byte steps,
    /// each step being an unaligned `u64` read followed by an unaligned `u64`
    /// write.
    ///
    /// At least one step is always performed, and the total written is
    /// `length` rounded up to a multiple of 8.
    ///
    /// # Safety
    ///
    /// `src` must be readable and `dst` writable for `length` rounded up to a
    /// multiple of 8 bytes. A source overlapping the destination must trail it
    /// by at least 8 bytes for the result to equal a forward byte copy.
    #[inline(always)]
    pub(crate) unsafe fn wildcopy_overlap_8byte_stride(
        dst: *mut u8,
        src: *const u8,
        length: usize,
    ) {
        debug_assert!(length > 0);
        let mut pos = 0usize;
        let mut more = true;
        // Runs the body once before testing `pos` against `length`.
        while more {
            // SAFETY: `pos` stays below `length` rounded up to a multiple of 8.
            unsafe {
                let word = src.add(pos).cast::<u64>().read_unaligned();
                dst.add(pos).cast::<u64>().write_unaligned(word);
            }
            pos += 8;
            more = pos < length;
        }
    }

    /// Writes the first 8 bytes of a match and returns the `(dst, src)` pair
    /// from which the remainder can be copied.
    ///
    /// * `offset >= 8`: one plain 8-byte copy; both pointers advance by 8.
    /// * `offset < 8`: four bytes are copied one at a time, then a 4-byte word
    ///   is copied from `src + DEC32_TABLE[offset]` to `dst + 4`. The returned
    ///   source is `src + DEC32_TABLE[offset] - DEC64_TABLE[offset] + 8`. When
    ///   `dst == src + offset`, the returned pointers are 8, 8, 9, 8, 10, 12
    ///   and 14 bytes apart for offsets 1 through 7: always at least 8 and a
    ///   multiple of `offset`.
    ///
    /// # Safety
    ///
    /// `dst` must be writable for 8 bytes and `src` readable for the bytes
    /// described above. NOTE(review): `offset == 0` is not rejected here;
    /// presumably callers never pass it - confirm.
    #[inline(always)]
    pub(crate) unsafe fn overlap_copy8(
        dst: *mut u8,
        src: *const u8,
        offset: usize,
    ) -> (*mut u8, *const u8) {
        // Forward adjustment of the source before the second 4-byte copy.
        const DEC32_TABLE: [u32; 8] = [0, 1, 2, 1, 4, 4, 4, 4];
        // Backward adjustment applied to the source afterwards.
        const DEC64_TABLE: [i32; 8] = [8, 8, 8, 7, 8, 9, 10, 11];

        // SAFETY: the caller upholds the pointer contract listed above.
        unsafe {
            if offset >= 8 {
                // The source ends at or before the destination starts.
                let word = src.cast::<u64>().read_unaligned();
                dst.cast::<u64>().write_unaligned(word);
                return (dst.add(8), src.add(8));
            }

            let rewind = DEC64_TABLE[offset];
            // Byte-wise and in order, so already written destination bytes
            // feed later reads when `offset < 4`.
            for i in 0..4 {
                *dst.add(i) = *src.add(i);
            }
            let advance = DEC32_TABLE[offset] as usize;
            let word = src.add(advance).cast::<u32>().read_unaligned();
            dst.add(4).cast::<u32>().write_unaligned(word);

            let net_offset = advance as isize - rewind as isize + 8;
            debug_assert!(
                net_offset >= 0,
                "overlap_copy8 net offset is non-negative for all offset ∈ 1..=7"
            );
            (dst.add(8), src.offset(net_offset))
        }
    }
}
/// Unit tests for the x86_64 copy helpers.
///
/// Tests whose source and destination live in the same buffer derive both
/// pointers from a single `as_mut_ptr()` call. Taking `buf.as_ptr()` next to
/// `buf.as_mut_ptr()` creates a shared reborrow that writes through the
/// mutable pointer invalidate, which is undefined behaviour under the
/// aliasing models Miri checks.
#[cfg(all(test, target_arch = "x86_64"))]
mod inline_helper_tests {
    use super::x86::{copy16, overlap_copy8, wildcopy_no_overlap, wildcopy_overlap_8byte_stride};

    #[test]
    fn copy16_copies_exactly_16_bytes() {
        let src: [u8; 16] = [
            0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD,
            0xAE, 0xAF,
        ];
        // Destination is larger than 16 bytes so an over-write would be seen.
        let mut dst = [0u8; 32];
        // SAFETY: `src` holds 16 readable bytes, `dst` at least 16 writable ones.
        unsafe { copy16(dst.as_mut_ptr(), src.as_ptr()) };
        assert_eq!(&dst[..16], &src[..]);
        assert!(dst[16..].iter().all(|&b| b == 0));
    }

    #[test]
    fn wildcopy_no_overlap_short_length_overshoots() {
        let src: [u8; 32] = core::array::from_fn(|i| (i + 1) as u8);
        let mut dst = [0u8; 32];
        // SAFETY: one 16-byte chunk is copied; both arrays hold 32 bytes.
        unsafe { wildcopy_no_overlap(dst.as_mut_ptr(), src.as_ptr(), 1) };
        assert_eq!(&dst[..16], &src[..16]);
        assert!(dst[16..].iter().all(|&b| b == 0));
    }

    #[test]
    fn wildcopy_no_overlap_length_above_16_uses_multiple_iters() {
        let src: [u8; 32] = core::array::from_fn(|i| (i + 1) as u8);
        let mut dst = [0u8; 32];
        // SAFETY: two 16-byte chunks are copied; both arrays hold 32 bytes.
        unsafe { wildcopy_no_overlap(dst.as_mut_ptr(), src.as_ptr(), 24) };
        assert_eq!(&dst[..32], &src[..32]);
    }

    #[test]
    fn wildcopy_no_overlap_matches_scalar_reference() {
        for len in 1usize..=48 {
            let src: [u8; 64] = core::array::from_fn(|i| (i as u8).wrapping_mul(7).wrapping_add(1));
            let mut dst = [0u8; 64];
            // SAFETY: at most 48 bytes are copied; both arrays hold 64 bytes.
            unsafe { wildcopy_no_overlap(dst.as_mut_ptr(), src.as_ptr(), len) };
            assert_eq!(&dst[..len], &src[..len], "len={len}");
        }
    }

    #[test]
    fn wildcopy_overlap_8byte_stride_rle_expansion_offset_8() {
        let mut buf = [0u8; 32];
        buf[..8].copy_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8]);
        // SAFETY: reads buf[0..16] and writes buf[8..24], all inside `buf`;
        // both pointers come from the same raw pointer.
        unsafe {
            let base = buf.as_mut_ptr();
            wildcopy_overlap_8byte_stride(base.add(8), base.cast_const(), 16);
        }
        assert_eq!(&buf[8..16], &[1, 2, 3, 4, 5, 6, 7, 8]);
        assert_eq!(&buf[16..24], &[1, 2, 3, 4, 5, 6, 7, 8]);
        // Exactly two 8-byte steps: nothing beyond byte 24 is touched.
        assert!(buf[24..].iter().all(|&b| b == 0));
    }

    #[test]
    fn overlap_copy8_offset_ge_8_does_plain_copy() {
        let mut buf = [0u8; 32];
        buf[..8].copy_from_slice(&[0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88]);
        let base = buf.as_mut_ptr();
        // SAFETY: reads buf[0..8] and writes buf[8..16], all inside `buf`.
        let (op2, ip2) = unsafe { overlap_copy8(base.add(8), base.cast_const(), 8) };
        assert_eq!(op2, unsafe { base.add(16) });
        assert_eq!(ip2, unsafe { base.cast_const().add(8) });
        assert_eq!(
            &buf[8..16],
            &[0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88]
        );
    }

    #[test]
    fn overlap_copy8_offset_lt_8_spreads_source() {
        let mut buf = [0u8; 32];
        buf[..3].copy_from_slice(&[0xAA, 0xBB, 0xCC]);
        let base = buf.as_mut_ptr();
        // SAFETY: reads buf[0..5] and writes buf[3..11], all inside `buf`.
        let (op2, ip2) = unsafe { overlap_copy8(base.add(3), base.cast_const(), 3) };
        assert_eq!(op2, unsafe { base.add(11) });
        // Net source advance for offset 3 is 1 - 7 + 8 = 2, which leaves the
        // returned pointers 9 bytes (3 periods) apart.
        assert_eq!(ip2, unsafe { base.cast_const().add(2) });
        // The 3-byte pattern continues seamlessly through the 8 written bytes.
        assert_eq!(
            &buf[3..11],
            &[0xAA, 0xBB, 0xCC, 0xAA, 0xBB, 0xCC, 0xAA, 0xBB]
        );
        assert!(buf[11..].iter().all(|&b| b == 0));
    }

    #[test]
    fn overlap_copy8_then_stride_matches_scalar_reference() {
        for offset in 1usize..=15 {
            for match_len in 9usize..=24 {
                let mut buf = [0u8; 64];
                for (i, b) in buf[..16].iter_mut().enumerate() {
                    *b = i as u8 + 1;
                }
                let mut expect = buf;
                for i in 0..match_len {
                    expect[16 + i] = expect[16 + i - offset];
                }
                // SAFETY: every access stays within the first 40 bytes of `buf`.
                unsafe {
                    let base = buf.as_mut_ptr();
                    let (op2, ip2) =
                        overlap_copy8(base.add(16), base.cast_const().add(16 - offset), offset);
                    wildcopy_overlap_8byte_stride(op2, ip2, match_len - 8);
                }
                assert_eq!(
                    &buf[..16 + match_len],
                    &expect[..16 + match_len],
                    "offset={offset} match_len={match_len}"
                );
            }
        }
    }
}
/// Unit tests for the portable copy helpers.
///
/// Tests whose source and destination live in the same buffer derive both
/// pointers from a single `as_mut_ptr()` call. Taking `buf.as_ptr()` next to
/// `buf.as_mut_ptr()` creates a shared reborrow that writes through the
/// mutable pointer invalidate, which is undefined behaviour under the
/// aliasing models Miri checks.
#[cfg(test)]
mod portable_helper_tests {
    use super::portable::{
        copy16, overlap_copy8, wildcopy_no_overlap, wildcopy_overlap_8byte_stride,
    };

    #[test]
    fn copy16_copies_exactly_16_bytes() {
        let src: [u8; 16] = [
            0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD,
            0xAE, 0xAF,
        ];
        // Destination is larger than 16 bytes so an over-write would be seen.
        let mut dst = [0u8; 32];
        // SAFETY: `src` holds 16 readable bytes, `dst` at least 16 writable ones.
        unsafe { copy16(dst.as_mut_ptr(), src.as_ptr()) };
        assert_eq!(&dst[..16], &src[..]);
        assert!(dst[16..].iter().all(|&b| b == 0));
    }

    #[test]
    fn wildcopy_no_overlap_short_length_overshoots() {
        let src: [u8; 32] = core::array::from_fn(|i| (i + 1) as u8);
        let mut dst = [0u8; 32];
        // SAFETY: one 16-byte chunk is copied; both arrays hold 32 bytes.
        unsafe { wildcopy_no_overlap(dst.as_mut_ptr(), src.as_ptr(), 1) };
        assert_eq!(&dst[..16], &src[..16]);
        assert!(dst[16..].iter().all(|&b| b == 0));
    }

    #[test]
    fn wildcopy_no_overlap_length_above_16_uses_multiple_iters() {
        let src: [u8; 32] = core::array::from_fn(|i| (i + 1) as u8);
        let mut dst = [0u8; 32];
        // SAFETY: two 16-byte chunks are copied; both arrays hold 32 bytes.
        unsafe { wildcopy_no_overlap(dst.as_mut_ptr(), src.as_ptr(), 24) };
        assert_eq!(&dst[..32], &src[..32]);
    }

    #[test]
    fn wildcopy_overlap_8byte_stride_rle_expansion_offset_8() {
        let mut buf = [0u8; 32];
        buf[..8].copy_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8]);
        // SAFETY: reads buf[0..16] and writes buf[8..24], all inside `buf`;
        // both pointers come from the same raw pointer.
        unsafe {
            let base = buf.as_mut_ptr();
            wildcopy_overlap_8byte_stride(base.add(8), base.cast_const(), 16);
        }
        assert_eq!(&buf[8..16], &[1, 2, 3, 4, 5, 6, 7, 8]);
        assert_eq!(&buf[16..24], &[1, 2, 3, 4, 5, 6, 7, 8]);
        // Exactly two 8-byte steps: nothing beyond byte 24 is touched.
        assert!(buf[24..].iter().all(|&b| b == 0));
    }

    #[test]
    fn overlap_copy8_offset_ge_8_does_plain_copy() {
        let mut buf = [0u8; 32];
        buf[..8].copy_from_slice(&[0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88]);
        let base = buf.as_mut_ptr();
        // SAFETY: reads buf[0..8] and writes buf[8..16], all inside `buf`.
        let (op2, ip2) = unsafe { overlap_copy8(base.add(8), base.cast_const(), 8) };
        assert_eq!(op2, unsafe { base.add(16) });
        assert_eq!(ip2, unsafe { base.cast_const().add(8) });
        assert_eq!(
            &buf[8..16],
            &[0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88]
        );
    }

    #[test]
    fn overlap_copy8_offset_lt_8_spreads_source() {
        let mut buf = [0u8; 32];
        buf[..3].copy_from_slice(&[0xAA, 0xBB, 0xCC]);
        let base = buf.as_mut_ptr();
        // SAFETY: reads buf[0..5] and writes buf[3..11], all inside `buf`.
        let (op2, ip2) = unsafe { overlap_copy8(base.add(3), base.cast_const(), 3) };
        assert_eq!(op2, unsafe { base.add(11) });
        // Net source advance for offset 3 is 1 - 7 + 8 = 2, which leaves the
        // returned pointers 9 bytes (3 periods) apart.
        assert_eq!(ip2, unsafe { base.cast_const().add(2) });
        // The 3-byte pattern continues seamlessly through the 8 written bytes.
        assert_eq!(
            &buf[3..11],
            &[0xAA, 0xBB, 0xCC, 0xAA, 0xBB, 0xCC, 0xAA, 0xBB]
        );
        assert!(buf[11..].iter().all(|&b| b == 0));
    }

    #[test]
    fn wildcopy_no_overlap_matches_scalar_reference() {
        for len in 1usize..=48 {
            let src: [u8; 64] = core::array::from_fn(|i| (i as u8).wrapping_mul(7).wrapping_add(1));
            let mut dst = [0u8; 64];
            // SAFETY: at most 48 bytes are copied; both arrays hold 64 bytes.
            unsafe { wildcopy_no_overlap(dst.as_mut_ptr(), src.as_ptr(), len) };
            assert_eq!(&dst[..len], &src[..len], "len={len}");
        }
    }

    #[test]
    fn overlap_copy8_then_stride_matches_scalar_reference() {
        for offset in 1usize..=15 {
            for match_len in 9usize..=24 {
                let mut buf = [0u8; 64];
                for (i, b) in buf[..16].iter_mut().enumerate() {
                    *b = i as u8 + 1;
                }
                let mut expect = buf;
                for i in 0..match_len {
                    expect[16 + i] = expect[16 + i - offset];
                }
                // SAFETY: every access stays within the first 40 bytes of `buf`.
                unsafe {
                    let base = buf.as_mut_ptr();
                    let (op2, ip2) =
                        overlap_copy8(base.add(16), base.cast_const().add(16 - offset), offset);
                    wildcopy_overlap_8byte_stride(op2, ip2, match_len - 8);
                }
                assert_eq!(
                    &buf[..16 + match_len],
                    &expect[..16 + match_len],
                    "offset={offset} match_len={match_len}"
                );
            }
        }
    }
}