use std::arch::x86_64::*;
use super::V3;
/// Loads `bytes` bytes (strictly between 8 and 16) starting at `ptr` into the
/// low lanes of a 128-bit vector; every lane at index `bytes` or above is zero.
///
/// The range is covered by two 8-byte loads that may overlap: one at the start
/// and one ending exactly at `ptr + bytes`. The second is then moved up to its
/// true lane position with a byte shuffle and OR-ed onto the first.
///
/// The first parameter is unused by the body; it is presumably a token
/// witnessing that the required CPU features are available — confirm against
/// the definition of `V3`.
///
/// # Safety
///
/// * `ptr` must be valid for reads of `bytes` bytes.
/// * `bytes` must satisfy `8 < bytes < 16`; this is only checked in debug builds.
/// * The CPU must support the intrinsics used here (`_mm_shuffle_epi8` needs SSSE3).
#[inline(always)]
unsafe fn __load_8_to_16_bytes(_: V3, ptr: *const u8, bytes: usize) -> __m128i {
    debug_assert!(bytes > 8 && bytes < 16);
    // Offset of the trailing 8-byte window, which ends exactly at `ptr + bytes`.
    let tail_offset = bytes - 8;
    unsafe {
        let head = _mm_loadl_epi64(ptr.cast::<__m128i>());
        let tail = _mm_loadl_epi64(ptr.add(tail_offset).cast::<__m128i>());
        // Lane `i` of the selector holds `i - tail_offset`. Negative entries have
        // their top bit set, which makes the shuffle emit zero for that lane;
        // entries >= 8 pick from the zeroed upper half of `tail`.
        let lane_ids = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let selector = _mm_sub_epi8(lane_ids, _mm_set1_epi8(tail_offset as i8));
        let placed_tail = _mm_shuffle_epi8(tail, selector);
        // Where the two windows overlap they carry identical bytes, so OR is safe.
        _mm_or_si128(head, placed_tail)
    }
}
/// Reads the first `first` bytes at `ptr` and returns them as a `u128`, with
/// byte `i` of memory placed in bits `8 * i .. 8 * i + 8` and all higher bits
/// zero. Any `first >= 16` performs a full unaligned 16-byte load.
///
/// Lengths that are not a single native width are assembled from two
/// overlapping reads (one at the start, one ending at `ptr + first`), so no
/// byte at or beyond `ptr + first` is ever touched.
///
/// `arch` is only forwarded to `__load_8_to_16_bytes` for the 9..=15 byte case.
///
/// # Safety
///
/// * `ptr` must be valid for reads of `min(first, 16)` bytes.
/// * The CPU must support the intrinsics used by this function and by
///   `__load_8_to_16_bytes`.
#[inline(always)]
pub(crate) unsafe fn __load_first_of_16_bytes(arch: V3, ptr: *const u8, first: usize) -> u128 {
    unsafe {
        match first {
            0 => 0,
            1 => u128::from(ptr.read()),
            2 | 3 => {
                // Two u16 reads; the second ends at the last requested byte.
                let shift = first - 2;
                let head = u64::from(ptr.cast::<u16>().read_unaligned());
                let tail = u64::from(ptr.add(shift).cast::<u16>().read_unaligned());
                u128::from(head | (tail << (shift * 8)))
            }
            4..=7 => {
                // Two u32 reads; the second ends at the last requested byte.
                let shift = first - 4;
                let head = u64::from(ptr.cast::<u32>().read_unaligned());
                let tail = u64::from(ptr.add(shift).cast::<u32>().read_unaligned());
                u128::from(head | (tail << (shift * 8)))
            }
            8 => u128::from(ptr.cast::<u64>().read_unaligned()),
            9..=15 => {
                std::mem::transmute::<__m128i, u128>(__load_8_to_16_bytes(arch, ptr, first))
            }
            _ => std::mem::transmute::<__m128i, u128>(_mm_loadu_si128(ptr.cast::<__m128i>())),
        }
    }
}
/// Loads the first `first` 16-bit elements at `ptr` into the low lanes of a
/// 128-bit vector, leaving every remaining lane zero. Any `first >= 8`
/// performs a full unaligned 16-byte load.
///
/// All reads are unaligned and none extends past `ptr + first` elements;
/// in-between lengths are built from two overlapping reads.
///
/// `arch` is only forwarded to `__load_8_to_16_bytes` for the 5..=7 element case.
///
/// # Safety
///
/// * `ptr` must be valid for reads of `min(first, 8)` `u16` elements.
/// * The CPU must support the intrinsics used by this function and by
///   `__load_8_to_16_bytes`.
#[inline(always)]
pub(crate) unsafe fn __load_first_u16_of_16_bytes(
    arch: V3,
    ptr: *const u16,
    first: usize,
) -> __m128i {
    let raw = ptr.cast::<u8>();
    unsafe {
        match first {
            0 => _mm_setzero_si128(),
            // A single element: zero-extend it into the lowest 32-bit lane.
            1 => _mm_cvtsi32_si128(i32::from(raw.cast::<u16>().read_unaligned())),
            2 | 3 => {
                // 4 or 6 bytes: two u32 reads, the second ending at the last byte.
                let shift = first * 2 - 4;
                let head = u64::from(raw.cast::<u32>().read_unaligned());
                let tail = u64::from(raw.add(shift).cast::<u32>().read_unaligned());
                _mm_cvtsi64_si128((head | (tail << (shift * 8))) as i64)
            }
            4 => _mm_cvtsi64_si128(raw.cast::<u64>().read_unaligned() as i64),
            // 10, 12 or 14 bytes.
            5..=7 => __load_8_to_16_bytes(arch, raw, first * 2),
            _ => _mm_loadu_si128(ptr.cast::<__m128i>()),
        }
    }
}