use std::mem::transmute;
use wide::{CmpGt, CmpLt};
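/// Marker type selecting the 256-bit AVX2 implementations of `SimdElem`.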
pub struct Avx2;
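/// Marker type selecting the 512-bit AVX-512 implementations of `SimdElem`.
///
/// When `CS` is `true`, partitioning compresses in registers
/// (`vpcompressd`/`vpcompressq`) and then does a plain full-width store;
/// when `false`, it uses the masked compress-store intrinsics directly,
/// which are reportedly much slower on some microarchitectures (e.g. Zen 4).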
pub struct Avx512<const CS: bool = false>;
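/// SIMD operations over element type `T`, parameterized by instruction set.
///
/// `L` is the number of lanes per vector, `MAX` a sentinel value, and `Simd`
/// the concrete vector type. The `partition_*` methods split one vector of
/// values around `threshold`, appending elements `>= threshold` at
/// `v[*v_idx..]` and elements `< threshold` at `w[*w_idx..]`, advancing both
/// cursors by the number of elements written to each side.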
pub trait SimdElem<T>: 'static {
const L: usize;
const MAX: T;
type Simd: Copy;
fn splat(v: T) -> Self::Simd;
unsafe fn simd_from_slice(slice: &[T]) -> Self::Simd;
fn simd_lt_bitmask(a: Self::Simd, b: Self::Simd) -> u64;
fn lane_indices() -> Self::Simd;
fn from_usize(n: usize) -> T;
fn wrapping_add_one(t: T) -> T;
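/// Partition a full vector of `L` values around `threshold`, writing
/// elements `>= threshold` to `v[*v_idx..]` and elements `< threshold`
/// to `w[*w_idx..]`, then advancing both cursors by the counts written.
///
/// # Safety
/// Both `v` and `w` must have at least a full vector of writable space
/// past their cursors: the stores may be full-width and write garbage
/// lanes beyond the elements actually accounted for.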
unsafe fn partition_fast(
vals: Self::Simd,
threshold: Self::Simd,
v: &mut [T],
v_idx: &mut usize,
w: &mut [T],
w_idx: &mut usize,
);
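/// Like `partition_fast`, but for a final partial chunk: `len` is a splat
/// of the number of valid lanes, and lanes at index `>= len` (compared
/// against `lane_indices`) are dropped from both outputs.
///
/// # Safety
/// Same output-slack requirements as `partition_fast`.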
unsafe fn partition_slow(
vals: Self::Simd,
len: Self::Simd,
threshold: Self::Simd,
v: &mut [T],
v_idx: &mut usize,
w: &mut [T],
w_idx: &mut usize,
);
}
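/// Returns the number of leading pivots strictly greater than `t`.
///
/// `pivots` is expected to be sorted in descending order. Small inputs
/// (`len <= 64`) are scanned a vector at a time; larger inputs fall back to
/// a binary search. Note that the binary-search branch counts pivots
/// `>= t`, so the two branches only agree when `t` never equals a pivot.
///
/// The SIMD branch reads whole vectors and may read up to `S::L - 1`
/// elements past `pivots.len()`; the caller must guarantee that this memory
/// is readable (e.g. via reserved slack capacity). Any overcount from those
/// garbage lanes is clamped by the final `min`.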
#[inline(always)]
pub fn push_position<T: Copy + Ord, S: SimdElem<T>>(pivots: &[T], t: T) -> usize {
if pivots.len() <= 64 {
let t_simd = S::splat(t);
let mut target_layer = 0;
let mut i = 0;
while i < pivots.len() {
let vals = unsafe { (pivots.as_ptr().add(i) as *const S::Simd).read_unaligned() };
target_layer += S::simd_lt_bitmask(t_simd, vals).trailing_ones() as usize;
i += S::L;
}
target_layer.min(pivots.len())
} else {
pivots
.binary_search_by(|p| {
if *p < t {
std::cmp::Ordering::Greater
} else {
std::cmp::Ordering::Less
}
})
.unwrap_err()
}
}
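/// Returns the index of a minimal element of `v` (not necessarily the
/// first one). The scan keeps two independent accumulators over even and
/// odd positions, presumably to expose instruction-level parallelism.
/// `v` must be non-empty; on an empty slice this returns 0, which is not
/// a valid index.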
#[inline(never)]
pub fn position_min<T: Copy + Ord, S: SimdElem<T>>(v: &[T]) -> usize {
let mut min_pos = [0; 2];
let mut min_val = [S::MAX; 2];
for (i, &[l, r]) in v.as_chunks::<2>().0.iter().enumerate() {
if l < min_val[0] {
min_val[0] = l;
min_pos[0] = i * 2;
}
if r < min_val[1] {
min_val[1] = r;
min_pos[1] = i * 2 + 1;
}
}
if v.len() % 2 == 1 {
let l = *v.last().unwrap();
if l < min_val[0] {
min_val[0] = l;
min_pos[0] = v.len() - 1;
}
}
if min_val[0] <= min_val[1] {
min_pos[0]
} else {
min_pos[1]
}
}
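/// Implements `SimdElem` for 32-bit element types on AVX2: 8 lanes per
/// vector, with partitioning emulated via the `UNIQSHUF32` permutation
/// table and `vpermd`, since AVX2 lacks a compress instruction.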
macro_rules! impl_simd_elem_32 {
($t:ty, $simd:ty) => {
impl SimdElem<$t> for Avx2 {
const L: usize = 8;
const MAX: $t = <$t>::MAX;
type Simd = $simd;
#[inline(always)]
fn splat(v: $t) -> $simd {
<$simd>::splat(v)
}
#[inline(always)]
unsafe fn simd_from_slice(slice: &[$t]) -> $simd {
unsafe { <$simd>::from(*(slice.as_ptr() as *const [$t; 8])) }
}
#[inline(always)]
fn simd_lt_bitmask(a: $simd, b: $simd) -> u64 {
a.simd_lt(b).to_bitmask() as u64
}
#[inline(always)]
fn lane_indices() -> $simd {
<$simd>::from([0 as $t, 1, 2, 3, 4, 5, 6, 7])
}
#[inline(always)]
fn from_usize(n: usize) -> $t {
n as $t
}
#[inline(always)]
fn wrapping_add_one(t: $t) -> $t {
t.wrapping_add(1)
}
#[inline(always)]
unsafe fn partition_fast(
vals: $simd,
threshold: $simd,
v: &mut [$t],
v_idx: &mut usize,
w: &mut [$t],
w_idx: &mut usize,
) {
unsafe {
use core::arch::x86_64::*;
use std::mem::transmute;
// Lanes where vals < threshold are "small" and go to `w`; the rest go to `v`.
let small = threshold.simd_gt(vals).to_bitmask() as u8;
let large = !small;
let vals: __m256i = transmute(vals);
// AVX2 has no compress instruction; UNIQSHUF32[m] is a vpermd key that
// left-packs the lanes *not* set in `m`.
let key: __m256i = transmute(crate::simd::UNIQSHUF32[small as usize]);
_mm256_storeu_si256(
v.as_mut_ptr().add(*v_idx) as *mut __m256i,
_mm256_permutevar8x32_epi32(vals, key),
);
*v_idx += large.count_ones() as usize;
let key: __m256i = transmute(crate::simd::UNIQSHUF32[large as usize]);
_mm256_storeu_si256(
w.as_mut_ptr().add(*w_idx) as *mut __m256i,
_mm256_permutevar8x32_epi32(vals, key),
);
*w_idx += small.count_ones() as usize;
}
}
#[inline(always)]
unsafe fn partition_slow(
vals: $simd,
len: $simd,
threshold: $simd,
v: &mut [$t],
v_idx: &mut usize,
w: &mut [$t],
w_idx: &mut usize,
) {
unsafe {
use core::arch::x86_64::*;
use std::mem::transmute;
let mut small = vals.simd_lt(threshold).to_bitmask() as u8;
let mut large = !small;
// Lanes at index >= len are past the end of the partial chunk; drop
// them from both sides so neither cursor advances for them.
let in_range = len
.simd_gt(<Self as SimdElem<$t>>::lane_indices())
.to_bitmask() as u8;
small &= in_range;
large &= in_range;
let vals: __m256i = transmute(vals);
let key: __m256i = transmute(crate::simd::UNIQSHUF32[(!large) as usize]);
_mm256_storeu_si256(
v.as_mut_ptr().add(*v_idx) as *mut __m256i,
_mm256_permutevar8x32_epi32(vals, key),
);
*v_idx += large.count_ones() as usize;
let key: __m256i = transmute(crate::simd::UNIQSHUF32[(!small) as usize]);
_mm256_storeu_si256(
w.as_mut_ptr().add(*w_idx) as *mut __m256i,
_mm256_permutevar8x32_epi32(vals, key),
);
*w_idx += small.count_ones() as usize;
}
}
}
};
}
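/// Implements `SimdElem` for 64-bit element types on AVX2: 4 lanes per
/// vector. The `UNIQSHUF64` table addresses the 64-bit lanes as pairs of
/// 32-bit indices so the same `vpermd` trick works on reinterpreted
/// vectors, with all masks confined to the low 4 bits.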
macro_rules! impl_simd_elem_64 {
($t:ty, $simd:ty) => {
impl SimdElem<$t> for Avx2 {
const L: usize = 4;
const MAX: $t = <$t>::MAX;
type Simd = $simd;
#[inline(always)]
fn splat(v: $t) -> $simd {
<$simd>::splat(v)
}
#[inline(always)]
unsafe fn simd_from_slice(slice: &[$t]) -> $simd {
unsafe { <$simd>::from(*(slice.as_ptr() as *const [$t; 4])) }
}
#[inline(always)]
fn simd_lt_bitmask(a: $simd, b: $simd) -> u64 {
a.simd_lt(b).to_bitmask() as u64
}
#[inline(always)]
fn lane_indices() -> $simd {
<$simd>::from([0 as $t, 1, 2, 3])
}
#[inline(always)]
fn from_usize(n: usize) -> $t {
n as $t
}
#[inline(always)]
fn wrapping_add_one(t: $t) -> $t {
t.wrapping_add(1)
}
#[inline(always)]
unsafe fn partition_fast(
vals: $simd,
threshold: $simd,
v: &mut [$t],
v_idx: &mut usize,
w: &mut [$t],
w_idx: &mut usize,
) {
unsafe {
use core::arch::x86_64::*;
use std::mem::transmute;
let small = (threshold.simd_gt(vals).to_bitmask() as u8) & 0xF;
let large = small ^ 0xF;
let vals: __m256i = transmute(vals);
let key: __m256i = transmute(crate::simd::UNIQSHUF64[small as usize]);
_mm256_storeu_si256(
v.as_mut_ptr().add(*v_idx) as *mut __m256i,
_mm256_permutevar8x32_epi32(vals, key),
);
*v_idx += large.count_ones() as usize;
let key: __m256i = transmute(crate::simd::UNIQSHUF64[large as usize]);
_mm256_storeu_si256(
w.as_mut_ptr().add(*w_idx) as *mut __m256i,
_mm256_permutevar8x32_epi32(vals, key),
);
*w_idx += small.count_ones() as usize;
}
}
#[inline(always)]
unsafe fn partition_slow(
vals: $simd,
len: $simd,
threshold: $simd,
v: &mut [$t],
v_idx: &mut usize,
w: &mut [$t],
w_idx: &mut usize,
) {
unsafe {
use core::arch::x86_64::*;
use std::mem::transmute;
let mut small = (vals.simd_lt(threshold).to_bitmask() as u8) & 0xF;
let mut large = small ^ 0xF;
let in_range = (len
.simd_gt(<Self as SimdElem<$t>>::lane_indices())
.to_bitmask() as u8)
& 0xF;
small &= in_range;
large &= in_range;
let vals: __m256i = transmute(vals);
let key: __m256i = transmute(crate::simd::UNIQSHUF64[(large ^ 0xF) as usize]);
_mm256_storeu_si256(
v.as_mut_ptr().add(*v_idx) as *mut __m256i,
_mm256_permutevar8x32_epi32(vals, key),
);
*v_idx += large.count_ones() as usize;
let key: __m256i = transmute(crate::simd::UNIQSHUF64[(small ^ 0xF) as usize]);
_mm256_storeu_si256(
w.as_mut_ptr().add(*w_idx) as *mut __m256i,
_mm256_permutevar8x32_epi32(vals, key),
);
*w_idx += small.count_ones() as usize;
}
}
}
};
}
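/// Implements `SimdElem` for 32-bit element types on AVX-512 (16 lanes),
/// using the native compress instructions instead of a shuffle table.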
macro_rules! impl_simd_elem_32_avx512 {
($t:ty, $simd:ty) => {
impl<const CS: bool> SimdElem<$t> for Avx512<CS> {
const L: usize = 16;
const MAX: $t = <$t>::MAX;
type Simd = $simd;
#[inline(always)]
fn splat(v: $t) -> $simd {
<$simd>::splat(v)
}
#[inline(always)]
unsafe fn simd_from_slice(slice: &[$t]) -> $simd {
unsafe { <$simd>::from(*(slice.as_ptr() as *const [$t; 16])) }
}
#[inline(always)]
fn simd_lt_bitmask(a: $simd, b: $simd) -> u64 {
a.simd_lt(b).to_bitmask() as u64
}
#[inline(always)]
fn lane_indices() -> $simd {
<$simd>::from([0 as $t, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
#[inline(always)]
fn from_usize(n: usize) -> $t {
n as $t
}
#[inline(always)]
fn wrapping_add_one(t: $t) -> $t {
t.wrapping_add(1)
}
#[inline(always)]
unsafe fn partition_fast(
vals: $simd,
threshold: $simd,
v: &mut [$t],
v_idx: &mut usize,
w: &mut [$t],
w_idx: &mut usize,
) {
unsafe {
use core::arch::x86_64::*;
use std::mem::transmute;
let small: u16 = threshold.simd_gt(vals).to_bitmask() as u16;
let large: u16 = !small;
let vals: __m512i = transmute(vals);
if CS {
// Compress in registers, then do a plain full-width store. This
// avoids the compress-store intrinsic in the other branch, which
// is reportedly slow on some CPUs (e.g. Zen 4), at the cost of
// writing a full vector (garbage lanes included) past the live data.
let cv = _mm512_maskz_compress_epi32(large, vals);
_mm512_storeu_si512(v.as_mut_ptr().add(*v_idx) as *mut __m512i, cv);
*v_idx += large.count_ones() as usize;
let cw = _mm512_maskz_compress_epi32(small, vals);
_mm512_storeu_si512(w.as_mut_ptr().add(*w_idx) as *mut __m512i, cw);
*w_idx += small.count_ones() as usize;
} else {
_mm512_mask_compressstoreu_epi32(
v.as_mut_ptr().add(*v_idx) as *mut i32,
large,
vals,
);
*v_idx += large.count_ones() as usize;
_mm512_mask_compressstoreu_epi32(
w.as_mut_ptr().add(*w_idx) as *mut i32,
small,
vals,
);
*w_idx += small.count_ones() as usize;
}
}
}
#[inline(always)]
unsafe fn partition_slow(
vals: $simd,
len: $simd,
threshold: $simd,
v: &mut [$t],
v_idx: &mut usize,
w: &mut [$t],
w_idx: &mut usize,
) {
unsafe {
use core::arch::x86_64::*;
use std::mem::transmute;
let in_range: u16 = len
.simd_gt(<Self as SimdElem<$t>>::lane_indices())
.to_bitmask() as u16;
let small: u16 = vals.simd_lt(threshold).to_bitmask() as u16 & in_range;
let large: u16 = (!small) & in_range;
let vals: __m512i = transmute(vals);
if CS {
let cv = _mm512_maskz_compress_epi32(large, vals);
_mm512_storeu_si512(v.as_mut_ptr().add(*v_idx) as *mut __m512i, cv);
*v_idx += large.count_ones() as usize;
let cw = _mm512_maskz_compress_epi32(small, vals);
_mm512_storeu_si512(w.as_mut_ptr().add(*w_idx) as *mut __m512i, cw);
*w_idx += small.count_ones() as usize;
} else {
_mm512_mask_compressstoreu_epi32(
v.as_mut_ptr().add(*v_idx) as *mut i32,
large,
vals,
);
*v_idx += large.count_ones() as usize;
_mm512_mask_compressstoreu_epi32(
w.as_mut_ptr().add(*w_idx) as *mut i32,
small,
vals,
);
*w_idx += small.count_ones() as usize;
}
}
}
}
};
}
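/// Implements `SimdElem` for 64-bit element types on AVX-512 (8 lanes).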
macro_rules! impl_simd_elem_64_avx512 {
($t:ty, $simd:ty) => {
impl<const CS: bool> SimdElem<$t> for Avx512<CS> {
const L: usize = 8;
const MAX: $t = <$t>::MAX;
type Simd = $simd;
#[inline(always)]
fn splat(v: $t) -> $simd {
<$simd>::splat(v)
}
#[inline(always)]
unsafe fn simd_from_slice(slice: &[$t]) -> $simd {
unsafe { <$simd>::from(*(slice.as_ptr() as *const [$t; 8])) }
}
#[inline(always)]
fn simd_lt_bitmask(a: $simd, b: $simd) -> u64 {
a.simd_lt(b).to_bitmask() as u64
}
#[inline(always)]
fn lane_indices() -> $simd {
<$simd>::from([0 as $t, 1, 2, 3, 4, 5, 6, 7])
}
#[inline(always)]
fn from_usize(n: usize) -> $t {
n as $t
}
#[inline(always)]
fn wrapping_add_one(t: $t) -> $t {
t.wrapping_add(1)
}
#[inline(always)]
unsafe fn partition_fast(
vals: $simd,
threshold: $simd,
v: &mut [$t],
v_idx: &mut usize,
w: &mut [$t],
w_idx: &mut usize,
) {
unsafe {
use core::arch::x86_64::*;
use std::mem::transmute;
let small: u8 = threshold.simd_gt(vals).to_bitmask() as u8;
let large: u8 = !small;
let vals: __m512i = transmute(vals);
if CS {
let cv = _mm512_maskz_compress_epi64(large, vals);
_mm512_storeu_si512(v.as_mut_ptr().add(*v_idx) as *mut __m512i, cv);
*v_idx += large.count_ones() as usize;
let cw = _mm512_maskz_compress_epi64(small, vals);
_mm512_storeu_si512(w.as_mut_ptr().add(*w_idx) as *mut __m512i, cw);
*w_idx += small.count_ones() as usize;
} else {
_mm512_mask_compressstoreu_epi64(
v.as_mut_ptr().add(*v_idx) as *mut i64,
large,
vals,
);
*v_idx += large.count_ones() as usize;
_mm512_mask_compressstoreu_epi64(
w.as_mut_ptr().add(*w_idx) as *mut i64,
small,
vals,
);
*w_idx += small.count_ones() as usize;
}
}
}
#[inline(always)]
unsafe fn partition_slow(
vals: $simd,
len: $simd,
threshold: $simd,
v: &mut [$t],
v_idx: &mut usize,
w: &mut [$t],
w_idx: &mut usize,
) {
unsafe {
use core::arch::x86_64::*;
use std::mem::transmute;
let in_range: u8 = len
.simd_gt(<Self as SimdElem<$t>>::lane_indices())
.to_bitmask() as u8;
let small: u8 = vals.simd_lt(threshold).to_bitmask() as u8 & in_range;
let large: u8 = (!small) & in_range;
let vals: __m512i = transmute(vals);
if CS {
let cv = _mm512_maskz_compress_epi64(large, vals);
_mm512_storeu_si512(v.as_mut_ptr().add(*v_idx) as *mut __m512i, cv);
*v_idx += large.count_ones() as usize;
let cw = _mm512_maskz_compress_epi64(small, vals);
_mm512_storeu_si512(w.as_mut_ptr().add(*w_idx) as *mut __m512i, cw);
*w_idx += small.count_ones() as usize;
} else {
_mm512_mask_compressstoreu_epi64(
v.as_mut_ptr().add(*v_idx) as *mut i64,
large,
vals,
);
*v_idx += large.count_ones() as usize;
_mm512_mask_compressstoreu_epi64(
w.as_mut_ptr().add(*w_idx) as *mut i64,
small,
vals,
);
*w_idx += small.count_ones() as usize;
}
}
}
}
};
}
impl_simd_elem_32!(i32, wide::i32x8);
impl_simd_elem_32!(u32, wide::u32x8);
impl_simd_elem_64!(i64, wide::i64x4);
impl_simd_elem_64!(u64, wide::u64x4);
impl_simd_elem_32_avx512!(i32, wide::i32x16);
impl_simd_elem_32_avx512!(u32, wide::u32x16);
impl_simd_elem_64_avx512!(i64, wide::i64x8);
impl_simd_elem_64_avx512!(u64, wide::u64x8);
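/// For every 8-bit mask `m`, `UNIQSHUF32[m]` is a `vpermd` key that moves
/// the lanes *not* set in `m` to the front of the vector, i.e. a
/// table-driven left-pack (compress) for AVX2. Trailing entries are padded
/// with lane index 0.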
#[rustfmt::skip]
pub(crate) const UNIQSHUF32: [[i32; 8]; 256] = unsafe {transmute([
0,1,2,3,4,5,6,7,
1,2,3,4,5,6,7,0,
0,2,3,4,5,6,7,0,
2,3,4,5,6,7,0,0,
0,1,3,4,5,6,7,0,
1,3,4,5,6,7,0,0,
0,3,4,5,6,7,0,0,
3,4,5,6,7,0,0,0,
0,1,2,4,5,6,7,0,
1,2,4,5,6,7,0,0,
0,2,4,5,6,7,0,0,
2,4,5,6,7,0,0,0,
0,1,4,5,6,7,0,0,
1,4,5,6,7,0,0,0,
0,4,5,6,7,0,0,0,
4,5,6,7,0,0,0,0,
0,1,2,3,5,6,7,0,
1,2,3,5,6,7,0,0,
0,2,3,5,6,7,0,0,
2,3,5,6,7,0,0,0,
0,1,3,5,6,7,0,0,
1,3,5,6,7,0,0,0,
0,3,5,6,7,0,0,0,
3,5,6,7,0,0,0,0,
0,1,2,5,6,7,0,0,
1,2,5,6,7,0,0,0,
0,2,5,6,7,0,0,0,
2,5,6,7,0,0,0,0,
0,1,5,6,7,0,0,0,
1,5,6,7,0,0,0,0,
0,5,6,7,0,0,0,0,
5,6,7,0,0,0,0,0,
0,1,2,3,4,6,7,0,
1,2,3,4,6,7,0,0,
0,2,3,4,6,7,0,0,
2,3,4,6,7,0,0,0,
0,1,3,4,6,7,0,0,
1,3,4,6,7,0,0,0,
0,3,4,6,7,0,0,0,
3,4,6,7,0,0,0,0,
0,1,2,4,6,7,0,0,
1,2,4,6,7,0,0,0,
0,2,4,6,7,0,0,0,
2,4,6,7,0,0,0,0,
0,1,4,6,7,0,0,0,
1,4,6,7,0,0,0,0,
0,4,6,7,0,0,0,0,
4,6,7,0,0,0,0,0,
0,1,2,3,6,7,0,0,
1,2,3,6,7,0,0,0,
0,2,3,6,7,0,0,0,
2,3,6,7,0,0,0,0,
0,1,3,6,7,0,0,0,
1,3,6,7,0,0,0,0,
0,3,6,7,0,0,0,0,
3,6,7,0,0,0,0,0,
0,1,2,6,7,0,0,0,
1,2,6,7,0,0,0,0,
0,2,6,7,0,0,0,0,
2,6,7,0,0,0,0,0,
0,1,6,7,0,0,0,0,
1,6,7,0,0,0,0,0,
0,6,7,0,0,0,0,0,
6,7,0,0,0,0,0,0,
0,1,2,3,4,5,7,0,
1,2,3,4,5,7,0,0,
0,2,3,4,5,7,0,0,
2,3,4,5,7,0,0,0,
0,1,3,4,5,7,0,0,
1,3,4,5,7,0,0,0,
0,3,4,5,7,0,0,0,
3,4,5,7,0,0,0,0,
0,1,2,4,5,7,0,0,
1,2,4,5,7,0,0,0,
0,2,4,5,7,0,0,0,
2,4,5,7,0,0,0,0,
0,1,4,5,7,0,0,0,
1,4,5,7,0,0,0,0,
0,4,5,7,0,0,0,0,
4,5,7,0,0,0,0,0,
0,1,2,3,5,7,0,0,
1,2,3,5,7,0,0,0,
0,2,3,5,7,0,0,0,
2,3,5,7,0,0,0,0,
0,1,3,5,7,0,0,0,
1,3,5,7,0,0,0,0,
0,3,5,7,0,0,0,0,
3,5,7,0,0,0,0,0,
0,1,2,5,7,0,0,0,
1,2,5,7,0,0,0,0,
0,2,5,7,0,0,0,0,
2,5,7,0,0,0,0,0,
0,1,5,7,0,0,0,0,
1,5,7,0,0,0,0,0,
0,5,7,0,0,0,0,0,
5,7,0,0,0,0,0,0,
0,1,2,3,4,7,0,0,
1,2,3,4,7,0,0,0,
0,2,3,4,7,0,0,0,
2,3,4,7,0,0,0,0,
0,1,3,4,7,0,0,0,
1,3,4,7,0,0,0,0,
0,3,4,7,0,0,0,0,
3,4,7,0,0,0,0,0,
0,1,2,4,7,0,0,0,
1,2,4,7,0,0,0,0,
0,2,4,7,0,0,0,0,
2,4,7,0,0,0,0,0,
0,1,4,7,0,0,0,0,
1,4,7,0,0,0,0,0,
0,4,7,0,0,0,0,0,
4,7,0,0,0,0,0,0,
0,1,2,3,7,0,0,0,
1,2,3,7,0,0,0,0,
0,2,3,7,0,0,0,0,
2,3,7,0,0,0,0,0,
0,1,3,7,0,0,0,0,
1,3,7,0,0,0,0,0,
0,3,7,0,0,0,0,0,
3,7,0,0,0,0,0,0,
0,1,2,7,0,0,0,0,
1,2,7,0,0,0,0,0,
0,2,7,0,0,0,0,0,
2,7,0,0,0,0,0,0,
0,1,7,0,0,0,0,0,
1,7,0,0,0,0,0,0,
0,7,0,0,0,0,0,0,
7,0,0,0,0,0,0,0,
0,1,2,3,4,5,6,0,
1,2,3,4,5,6,0,0,
0,2,3,4,5,6,0,0,
2,3,4,5,6,0,0,0,
0,1,3,4,5,6,0,0,
1,3,4,5,6,0,0,0,
0,3,4,5,6,0,0,0,
3,4,5,6,0,0,0,0,
0,1,2,4,5,6,0,0,
1,2,4,5,6,0,0,0,
0,2,4,5,6,0,0,0,
2,4,5,6,0,0,0,0,
0,1,4,5,6,0,0,0,
1,4,5,6,0,0,0,0,
0,4,5,6,0,0,0,0,
4,5,6,0,0,0,0,0,
0,1,2,3,5,6,0,0,
1,2,3,5,6,0,0,0,
0,2,3,5,6,0,0,0,
2,3,5,6,0,0,0,0,
0,1,3,5,6,0,0,0,
1,3,5,6,0,0,0,0,
0,3,5,6,0,0,0,0,
3,5,6,0,0,0,0,0,
0,1,2,5,6,0,0,0,
1,2,5,6,0,0,0,0,
0,2,5,6,0,0,0,0,
2,5,6,0,0,0,0,0,
0,1,5,6,0,0,0,0,
1,5,6,0,0,0,0,0,
0,5,6,0,0,0,0,0,
5,6,0,0,0,0,0,0,
0,1,2,3,4,6,0,0,
1,2,3,4,6,0,0,0,
0,2,3,4,6,0,0,0,
2,3,4,6,0,0,0,0,
0,1,3,4,6,0,0,0,
1,3,4,6,0,0,0,0,
0,3,4,6,0,0,0,0,
3,4,6,0,0,0,0,0,
0,1,2,4,6,0,0,0,
1,2,4,6,0,0,0,0,
0,2,4,6,0,0,0,0,
2,4,6,0,0,0,0,0,
0,1,4,6,0,0,0,0,
1,4,6,0,0,0,0,0,
0,4,6,0,0,0,0,0,
4,6,0,0,0,0,0,0,
0,1,2,3,6,0,0,0,
1,2,3,6,0,0,0,0,
0,2,3,6,0,0,0,0,
2,3,6,0,0,0,0,0,
0,1,3,6,0,0,0,0,
1,3,6,0,0,0,0,0,
0,3,6,0,0,0,0,0,
3,6,0,0,0,0,0,0,
0,1,2,6,0,0,0,0,
1,2,6,0,0,0,0,0,
0,2,6,0,0,0,0,0,
2,6,0,0,0,0,0,0,
0,1,6,0,0,0,0,0,
1,6,0,0,0,0,0,0,
0,6,0,0,0,0,0,0,
6,0,0,0,0,0,0,0,
0,1,2,3,4,5,0,0,
1,2,3,4,5,0,0,0,
0,2,3,4,5,0,0,0,
2,3,4,5,0,0,0,0,
0,1,3,4,5,0,0,0,
1,3,4,5,0,0,0,0,
0,3,4,5,0,0,0,0,
3,4,5,0,0,0,0,0,
0,1,2,4,5,0,0,0,
1,2,4,5,0,0,0,0,
0,2,4,5,0,0,0,0,
2,4,5,0,0,0,0,0,
0,1,4,5,0,0,0,0,
1,4,5,0,0,0,0,0,
0,4,5,0,0,0,0,0,
4,5,0,0,0,0,0,0,
0,1,2,3,5,0,0,0,
1,2,3,5,0,0,0,0,
0,2,3,5,0,0,0,0,
2,3,5,0,0,0,0,0,
0,1,3,5,0,0,0,0,
1,3,5,0,0,0,0,0,
0,3,5,0,0,0,0,0,
3,5,0,0,0,0,0,0,
0,1,2,5,0,0,0,0,
1,2,5,0,0,0,0,0,
0,2,5,0,0,0,0,0,
2,5,0,0,0,0,0,0,
0,1,5,0,0,0,0,0,
1,5,0,0,0,0,0,0,
0,5,0,0,0,0,0,0,
5,0,0,0,0,0,0,0,
0,1,2,3,4,0,0,0,
1,2,3,4,0,0,0,0,
0,2,3,4,0,0,0,0,
2,3,4,0,0,0,0,0,
0,1,3,4,0,0,0,0,
1,3,4,0,0,0,0,0,
0,3,4,0,0,0,0,0,
3,4,0,0,0,0,0,0,
0,1,2,4,0,0,0,0,
1,2,4,0,0,0,0,0,
0,2,4,0,0,0,0,0,
2,4,0,0,0,0,0,0,
0,1,4,0,0,0,0,0,
1,4,0,0,0,0,0,0,
0,4,0,0,0,0,0,0,
4,0,0,0,0,0,0,0,
0,1,2,3,0,0,0,0,
1,2,3,0,0,0,0,0,
0,2,3,0,0,0,0,0,
2,3,0,0,0,0,0,0,
0,1,3,0,0,0,0,0,
1,3,0,0,0,0,0,0,
0,3,0,0,0,0,0,0,
3,0,0,0,0,0,0,0,
0,1,2,0,0,0,0,0,
1,2,0,0,0,0,0,0,
0,2,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,
0,1,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
])};
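/// Same as `UNIQSHUF32`, but for 4 lanes of 64 bits: each 4-bit mask maps
/// to a `vpermd` key addressing the 64-bit lanes as pairs of 32-bit halves.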
#[rustfmt::skip]
pub(crate) const UNIQSHUF64: [[i32; 8]; 16] = unsafe {transmute([
0,1,2,3,4,5,6,7,
2,3,4,5,6,7,0,0,
0,1,4,5,6,7,0,0,
4,5,6,7,0,0,0,0,
0,1,2,3,6,7,0,0,
2,3,6,7,0,0,0,0,
0,1,6,7,0,0,0,0,
6,7,0,0,0,0,0,0,
0,1,2,3,4,5,0,0,
2,3,4,5,0,0,0,0,
0,1,4,5,0,0,0,0,
4,5,0,0,0,0,0,0,
0,1,2,3,0,0,0,0,
2,3,0,0,0,0,0,0,
0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
])};
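// A minimal sanity-check sketch (not part of the original test suite): it
// assumes the `wide`-based comparisons above compile as written. Pivot
// lengths in the SIMD cases are kept at a multiple of `SimdElem::L` so
// `push_position` never reads past the end of the slice.
#[cfg(test)]
mod sketch_tests {
use super::*;
#[test]
fn push_position_matches_scalar_count() {
// Descending pivots, exactly one AVX2 vector (L = 8) long.
let pivots: Vec<i32> = vec![70, 60, 50, 40, 30, 20, 10, 0];
for t in [-5, 25, 35, 100] {
let expected = pivots.iter().filter(|&&p| t < p).count();
assert_eq!(push_position::<i32, Avx2>(&pivots, t), expected);
}
// A large input takes the binary-search branch; odd probes avoid
// ties with the even pivots, where the two branches could disagree.
let pivots: Vec<i32> = (0..100).map(|x| 2 * x).rev().collect();
for t in [-1, 1, 99, 201] {
let expected = pivots.iter().filter(|&&p| t < p).count();
assert_eq!(push_position::<i32, Avx2>(&pivots, t), expected);
}
}
#[test]
fn position_min_returns_an_index_of_the_minimum() {
let v: Vec<i32> = vec![5, 3, 9, 1, 7, 1];
let pos = position_min::<i32, Avx2>(&v);
assert_eq!(v[pos], 1);
}
}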