use std::arch::x86_64::*;
use super::{V3, v3::i32x4};
use crate::SIMDVector;
/// Reads the first `first` bytes at `ptr` into a `u128`, zero-filling the rest.
///
/// Byte `i` of the input ends up in bits `8 * i .. 8 * i + 8` of the result (the
/// little-endian layout of the x86-64 target this module is written for). When
/// `first >= 16` the whole value comes from a single unaligned 16-byte load.
///
/// Shorter prefixes are gathered back-to-front using at most one 8-, one 4-, one 2-
/// and one 1-byte read, so no byte at or beyond `ptr + first` is ever accessed.
///
/// # Safety
///
/// `ptr` must be valid for reads of `min(first, 16)` bytes. No alignment is required.
#[inline(always)]
pub(crate) unsafe fn __load_first_of_16_bytes(_: V3, mut ptr: *const u8, first: usize) -> u128 {
    if first >= 16 {
        // SAFETY: the caller guarantees 16 readable bytes when `first >= 16`, and
        // `__m128i` and `u128` are both plain 16-byte values.
        return unsafe {
            let full = _mm_loadu_si128(ptr as *const __m128i);
            std::mem::transmute::<__m128i, u128>(full)
        };
    }

    // Number of prefix bytes that have not been folded into `acc` yet.
    let mut left = first;
    let mut acc: u128 = 0;

    // Start one past the end of the prefix and walk backwards, so the bytes read last
    // (the lowest addresses) end up in the least significant bits.
    // SAFETY: `ptr + first` is at most one past the end of the readable range.
    ptr = unsafe { ptr.add(first) };

    if left >= 8 {
        // SAFETY: at least 8 unread prefix bytes lie directly below `ptr`.
        let chunk = unsafe {
            ptr = ptr.sub(8);
            ptr.cast::<u64>().read_unaligned()
        };
        acc = u128::from(chunk);
        left -= 8;
    }

    if left >= 4 {
        // SAFETY: at least 4 unread prefix bytes lie directly below `ptr`.
        let chunk = unsafe {
            ptr = ptr.sub(4);
            ptr.cast::<u32>().read_unaligned()
        };
        acc = (acc << 32) | u128::from(chunk);
        left -= 4;
    }

    if left >= 2 {
        // SAFETY: at least 2 unread prefix bytes lie directly below `ptr`.
        let chunk = unsafe {
            ptr = ptr.sub(2);
            ptr.cast::<u16>().read_unaligned()
        };
        acc = (acc << 16) | u128::from(chunk);
        left -= 2;
    }

    if left >= 1 {
        // SAFETY: exactly one unread prefix byte remains directly below `ptr`.
        let byte = unsafe { ptr.sub(1).read() };
        acc = (acc << 8) | u128::from(byte);
    }

    acc
}
/// Loads the first `first` 16-bit elements at `ptr` into an `__m128i`.
///
/// When `first >= 8` this is a single unaligned 16-byte load. Otherwise the leading
/// `first / 2` element pairs are fetched as 32-bit lanes through
/// `i32x4::load_simd_first`, and, when `first` is odd, the one remaining element
/// (index `first - 1`) is inserted into the 16-bit lane with the same index.
///
/// NOTE(review): lanes past the loaded prefix hold whatever `i32x4::load_simd_first`
/// leaves there (presumably zero); confirm against that helper.
///
/// # Safety
///
/// `ptr` must be valid for reads of `min(first, 8)` `u16` elements. This function
/// itself imposes no alignment requirement on the element it reads directly; any
/// further requirements of `i32x4::load_simd_first` also apply.
#[inline(always)]
pub(crate) unsafe fn __load_first_u16_of_16_bytes(
    arch: V3,
    ptr: *const u16,
    first: usize,
) -> __m128i {
    if first >= 8 {
        // SAFETY: the caller guarantees 8 readable elements (16 bytes) in this case.
        return unsafe { _mm_loadu_si128(ptr as *const __m128i) };
    }

    // Whole 32-bit lanes covered by the prefix; an odd trailing element is patched in
    // separately below.
    let pairs = first / 2;

    // SAFETY: only elements `0..first` are read, which the caller guarantees are valid.
    unsafe {
        let base = i32x4::load_simd_first(arch, ptr as *const i32, pairs).to_underlying();

        // Reads the final element of an odd-length prefix. Only invoked from the odd
        // arms below, so `first - 1` cannot underflow.
        let tail = || -> i32 { i32::from(std::ptr::read_unaligned(ptr.add(first - 1))) };

        // The insert position is a const generic, hence one arm per odd length.
        match first {
            1 => _mm_insert_epi16::<0>(base, tail()),
            3 => _mm_insert_epi16::<2>(base, tail()),
            5 => _mm_insert_epi16::<4>(base, tail()),
            7 => _mm_insert_epi16::<6>(base, tail()),
            _ => base,
        }
    }
}