use super::{BitPacker, UnsafeBitPacker};
#[cfg(target_arch = "x86_64")]
use Available;
/// Block length shared by both implementations in this file (32 * 4 = 128).
///
/// It is re-exported as `BitPacker::BLOCK_LEN` for `BitPacker4x`.
// NOTE(review): presumably counted in `u32` values, with 4 being the lane
// count of `DataType` - confirm against `declare_bitpacker!`.
const BLOCK_LEN: usize = 32 * 4;
#[cfg(any(target_arch="x86_64"))]
mod sse3 {
use super::BLOCK_LEN;
use Available;
use std::arch::x86_64::__m128i as DataType;
use std::arch::x86_64::_mm_set1_epi32 as set1;
use std::arch::x86_64::_mm_srli_epi32 as right_shift_32;
use std::arch::x86_64::_mm_slli_epi32 as left_shift_32;
use std::arch::x86_64::_mm_or_si128 as op_or;
use std::arch::x86_64::_mm_and_si128 as op_and;
use std::arch::x86_64::_mm_lddqu_si128 as load_unaligned;
use std::arch::x86_64::_mm_storeu_si128 as store_unaligned;
use std::arch::x86_64::{_mm_add_epi32, _mm_shuffle_epi32, _mm_srli_si128, _mm_sub_epi32, _mm_slli_si128, _mm_cvtsi128_si32};
/// ORs the four 32-bit lanes of `accumulator` together and returns the result.
///
/// The reduction is done in two folding steps: the upper 64 bits are OR-ed
/// onto the lower 64 bits, then lane 1 is OR-ed onto lane 0, which is finally
/// extracted.
unsafe fn or_collapse_to_u32(accumulator: DataType) -> u32 {
    // Byte-shift by 8: lanes 2 and 3 land on lanes 0 and 1.
    let upper_half = _mm_srli_si128(accumulator, 8);
    let folded_pairs = op_or(accumulator, upper_half);
    // Byte-shift by 4: lane 1 (already holding 1|3) lands on lane 0 (0|2).
    let lane_one_down = _mm_srli_si128(folded_pairs, 4);
    let folded_all = op_or(folded_pairs, lane_one_down);
    // Lane 0 now holds the OR of all four input lanes.
    _mm_cvtsi128_si32(folded_all) as u32
}
/// Computes, lane by lane, the difference between each value and the value
/// that precedes it in the stream.
///
/// The predecessor of lane 0 of `curr` is lane 3 of `prev`; the predecessor
/// of every other lane is the lane just below it in `curr`. Subtraction is
/// the wrapping 32-bit subtraction of `_mm_sub_epi32`.
#[target_feature(enable = "sse3")]
unsafe fn compute_delta(curr: DataType, prev: DataType) -> DataType {
    // `curr` moved up by one lane: lanes 1..3 now hold curr[0..2], lane 0 is zero.
    let curr_moved_up = _mm_slli_si128(curr, 4);
    // Last lane of `prev` moved down into lane 0, everything else zero.
    let prev_last_lane = _mm_srli_si128(prev, 12);
    let predecessors = op_or(curr_moved_up, prev_last_lane);
    _mm_sub_epi32(curr, predecessors)
}
/// Rebuilds four values from their deltas: the inverse of `compute_delta`.
///
/// The result is the inclusive prefix sum of `delta`, with the last lane of
/// `prev` added to every lane. Additions are the wrapping 32-bit additions of
/// `_mm_add_epi32`.
#[target_feature(enable = "sse3")]
unsafe fn integrate_delta(prev: DataType, delta: DataType) -> DataType {
    // 0xff selects lane 3 for every output lane, i.e. broadcasts prev[3].
    let base = _mm_shuffle_epi32(prev, 0xff);
    // Step 1: add the vector shifted up by two lanes -> [d0, d1, d0+d2, d1+d3].
    let two_lane_sums = _mm_add_epi32(_mm_slli_si128(delta, 8), delta);
    // Step 2: add that shifted up by one lane -> full inclusive prefix sums.
    let prefix_sums = _mm_add_epi32(_mm_slli_si128(two_lane_sums, 4), two_lane_sums);
    _mm_add_epi32(base, prefix_sums)
}
// Instantiates the bit-packing implementation for this module, handing the
// macro the attribute `target_feature(enable="sse3")`.
// NOTE(review): `declare_bitpacker!` is defined outside this file. It
// presumably generates the `UnsafeBitPackerImpl` type used below, built on the
// helper names imported/defined above (`DataType`, `set1`, `op_or`, ...), and
// applies the given attribute to the generated functions - confirm against
// the macro definition.
declare_bitpacker!(target_feature(enable="sse3"));
// Runtime availability check for the SSE3 implementation.
impl Available for UnsafeBitPackerImpl {
/// Returns `true` when the running CPU reports SSE3 support
/// (runtime detection through `is_x86_feature_detected!`).
fn available() -> bool {
is_x86_feature_detected!("sse3")
}
}
}
mod scalar {
use super::BLOCK_LEN;
use Available;
/// Scalar stand-in for the 128-bit SIMD register used by the `sse3` module:
/// four `u32` lanes stored in a plain array.
type DataType = [u32; 4];
/// Builds a value whose four lanes all hold `el`, reinterpreted as `u32`.
fn set1(el: i32) -> DataType {
    // `as` keeps the bit pattern, so negative inputs become large unsigned values.
    let lane = el as u32;
    [lane, lane, lane, lane]
}
/// Shifts every lane of `el` right by `shift` bits (logical shift on `u32`).
fn right_shift_32(el: DataType, shift: i32) -> DataType {
    let mut shifted = el;
    for lane in shifted.iter_mut() {
        *lane >>= shift;
    }
    shifted
}
/// Shifts every lane of `el` left by `shift` bits; bits shifted past bit 31
/// are discarded.
fn left_shift_32(el: DataType, shift: i32) -> DataType {
    let [lane0, lane1, lane2, lane3] = el;
    [
        lane0 << shift,
        lane1 << shift,
        lane2 << shift,
        lane3 << shift,
    ]
}
/// Lane-wise bitwise OR of `left` and `right`.
fn op_or(left: DataType, right: DataType) -> DataType {
    let mut merged = left;
    for (dst, src) in merged.iter_mut().zip(right.iter()) {
        *dst |= *src;
    }
    merged
}
/// Lane-wise bitwise AND of `left` and `right`.
fn op_and(left: DataType, right: DataType) -> DataType {
    let [l0, l1, l2, l3] = left;
    let [r0, r1, r2, r3] = right;
    [l0 & r0, l1 & r1, l2 & r2, l3 & r3]
}
/// Reads four `u32` lanes from `addr`, which does not need to be aligned.
///
/// A plain `*addr` would be undefined behaviour whenever `addr` is not
/// aligned for `[u32; 4]`, so the read goes through `ptr::read_unaligned`.
/// This mirrors the unaligned-load intrinsic used for the same name in the
/// `sse3` module.
///
/// # Safety
///
/// `addr` must be valid for reading `size_of::<DataType>()` (16) bytes.
unsafe fn load_unaligned(addr: *const DataType) -> DataType {
    ::std::ptr::read_unaligned(addr)
}
/// Writes the four `u32` lanes of `data` to `addr`, which does not need to be
/// aligned.
///
/// A plain `*addr = data` would be undefined behaviour whenever `addr` is not
/// aligned for `[u32; 4]`, so the write goes through `ptr::write_unaligned`.
/// This mirrors the unaligned-store intrinsic used for the same name in the
/// `sse3` module.
///
/// # Safety
///
/// `addr` must be valid for writing `size_of::<DataType>()` (16) bytes.
unsafe fn store_unaligned(addr: *mut DataType, data: DataType) {
    ::std::ptr::write_unaligned(addr, data);
}
/// ORs the four lanes of `accumulator` together into a single `u32`.
fn or_collapse_to_u32(accumulator: DataType) -> u32 {
    accumulator.iter().fold(0u32, |bits, &lane| bits | lane)
}
/// Computes, lane by lane, the wrapping difference between each value and the
/// value that precedes it in the stream.
///
/// The predecessor of `curr[0]` is `prev[3]`; the predecessor of every other
/// lane is the lane just before it in `curr`.
fn compute_delta(curr: DataType, prev: DataType) -> DataType {
    let predecessors = [prev[3], curr[0], curr[1], curr[2]];
    let mut delta = curr;
    for (value, before) in delta.iter_mut().zip(predecessors.iter()) {
        *value = value.wrapping_sub(*before);
    }
    delta
}
/// Rebuilds four values from their deltas: the inverse of `compute_delta`.
///
/// Starting from `offset[3]`, each delta is added (wrapping) to the running
/// total, and the running total after each step becomes the output lane.
fn integrate_delta(offset: DataType, delta: DataType) -> DataType {
    let mut running = offset[3];
    let mut integrated = delta;
    for lane in integrated.iter_mut() {
        running = running.wrapping_add(*lane);
        *lane = running;
    }
    integrated
}
// Instantiates the bit-packing implementation for this module on top of the
// scalar helpers above. `any(debug, not(debug))` is true whatever `debug` is,
// so the `cfg(...)` handed to the macro never disables anything; it fills the
// slot where the `sse3` module passes a `target_feature` attribute.
// NOTE(review): `declare_bitpacker!` is defined outside this file. It
// presumably generates the `UnsafeBitPackerImpl` type used below - confirm
// against the macro definition.
declare_bitpacker!(cfg(any(debug, not(debug))) );
// Availability check for the scalar implementation.
impl Available for UnsafeBitPackerImpl {
/// Always returns `true`: the scalar implementation needs no special CPU
/// feature.
fn available() -> bool {
true
}
}
}
/// Implementation that a `BitPacker4x` forwards its calls to.
enum InstructionSet {
/// SSE3 implementation from the `sse3` module; the variant only exists when
/// compiling for x86_64.
#[cfg(target_arch = "x86_64")]
SSE3,
/// Portable implementation from the `scalar` module.
Scalar
}
/// Bit packer working on 4 lanes of `u32`.
///
/// The wrapped `InstructionSet` records which implementation (SSE3 or scalar)
/// the `BitPacker` methods of this type forward to.
pub struct BitPacker4x(InstructionSet);
impl BitPacker for BitPacker4x {
const BLOCK_LEN: usize = BLOCK_LEN;
fn new() -> Self {
#[cfg(target_arch = "x86_64")]
{
if sse3::UnsafeBitPackerImpl::available() {
return BitPacker4x(InstructionSet::SSE3);
}
}
BitPacker4x(InstructionSet::Scalar)
}
fn compress(&self, decompressed: &[u32], compressed: &mut [u8], num_bits: u8) -> usize {
unsafe {
match self.0 {
#[cfg(target_arch = "x86_64")]
InstructionSet::SSE3 =>
sse3::UnsafeBitPackerImpl::compress(decompressed, compressed, num_bits),
InstructionSet::Scalar =>
scalar::UnsafeBitPackerImpl::compress(decompressed, compressed, num_bits)
}
}
}
fn compress_sorted(&self, initial: u32, decompressed: &[u32], compressed: &mut [u8], num_bits: u8) -> usize {
unsafe {
match self.0 {
#[cfg(target_arch = "x86_64")]
InstructionSet::SSE3 =>
sse3::UnsafeBitPackerImpl::compress_sorted(initial, decompressed, compressed, num_bits),
InstructionSet::Scalar =>
scalar::UnsafeBitPackerImpl::compress_sorted(initial, decompressed, compressed, num_bits)
}
}
}
fn decompress(&self, compressed: &[u8], decompressed: &mut [u32], num_bits: u8) -> usize {
unsafe {
match self.0 {
#[cfg(target_arch = "x86_64")]
InstructionSet::SSE3 =>
sse3::UnsafeBitPackerImpl::decompress(compressed, decompressed, num_bits),
InstructionSet::Scalar =>
scalar::UnsafeBitPackerImpl::decompress(compressed, decompressed, num_bits),
}
}
}
fn decompress_sorted(&self, initial: u32, compressed: &[u8], decompressed: &mut [u32], num_bits: u8) -> usize {
unsafe {
match self.0 {
#[cfg(target_arch = "x86_64")]
InstructionSet::SSE3 =>
sse3::UnsafeBitPackerImpl::decompress_sorted(initial, compressed, decompressed, num_bits),
InstructionSet::Scalar =>
scalar::UnsafeBitPackerImpl::decompress_sorted(initial, compressed, decompressed, num_bits)
}
}
}
fn num_bits(&self, decompressed: &[u32]) -> u8 {
unsafe {
match self.0 {
#[cfg(target_arch = "x86_64")]
InstructionSet::SSE3 =>
sse3::UnsafeBitPackerImpl::num_bits(decompressed),
InstructionSet::Scalar =>
scalar::UnsafeBitPackerImpl::num_bits(decompressed)
}
}
}
fn num_bits_sorted(&self, initial: u32, decompressed: &[u32]) -> u8 {
unsafe {
match self.0 {
#[cfg(target_arch = "x86_64")]
InstructionSet::SSE3 =>
sse3::UnsafeBitPackerImpl::num_bits_sorted(initial, decompressed),
InstructionSet::Scalar =>
scalar::UnsafeBitPackerImpl::num_bits_sorted(initial, decompressed)
}
}
}
}
#[cfg(all(test, target_arch = "x86_64"))]
mod tests {
    use super::BLOCK_LEN;
    use super::{scalar, sse3};
    use tests::test_util_compatible;
    use Available;

    /// Runs the shared compatibility check between the scalar and the SSE3
    /// implementations. Does nothing on CPUs without SSE3.
    #[test]
    fn test_compatible() {
        if !sse3::UnsafeBitPackerImpl::available() {
            return;
        }
        test_util_compatible::<scalar::UnsafeBitPackerImpl, sse3::UnsafeBitPackerImpl>(BLOCK_LEN);
    }
}