use crate::bits_find_lsb::fd_ulong_find_lsb_w_default;
use core::arch::x86_64::{
__m256i, _mm256_add_epi8, _mm256_and_si256, _mm256_cmpeq_epi64, _mm256_cmpeq_epi8,
_mm256_cmpgt_epi64, _mm256_cmpgt_epi8, _mm256_extractf128_si256, _mm256_loadu_si256,
_mm256_movemask_epi8, _mm256_mul_epu32, _mm256_or_si256, _mm256_set1_epi64x, _mm256_set1_epi8,
_mm256_set_m128i, _mm256_setr_epi64x, _mm256_setr_epi8, _mm256_setzero_si256,
_mm256_shuffle_epi8, _mm256_slli_epi64, _mm256_slli_si256, _mm256_srli_epi64,
_mm256_srlv_epi64, _mm256_storeu_si256, _mm256_sub_epi64, _mm256_sub_epi8, _mm_or_si128,
_mm_setzero_si128, _mm_slli_si128, _mm_srli_si128,
};
/// Reads 32 bytes starting at `p` into a 256-bit vector using the unaligned
/// load intrinsic, so `p` does not need any particular alignment.
///
/// Nothing here validates `p`: the caller must pass a pointer to at least 32
/// readable bytes.
#[inline(always)]
pub(crate) fn wuc_ldu(p: *const u8) -> __m256i {
    let src = p.cast::<__m256i>();
    // SAFETY: not verified here; soundness rests on the caller's pointer being
    // readable for 32 bytes and on the CPU supporting AVX.
    unsafe { _mm256_loadu_si256(src) }
}
/// Writes the 32 bytes of `i` to memory starting at `p` using the unaligned
/// store intrinsic, so `p` does not need any particular alignment.
///
/// Nothing here validates `p`: the caller must pass a pointer to at least 32
/// writable bytes.
#[inline(always)]
pub(crate) fn wuc_stu(p: *mut u8, i: __m256i) {
    let dst = p.cast::<__m256i>();
    // SAFETY: not verified here; soundness rests on the caller's pointer being
    // writable for 32 bytes and on the CPU supporting AVX.
    unsafe { _mm256_storeu_si256(dst, i) }
}
/// Builds a vector of four signed 64-bit lanes, with `l0` in the lowest lane
/// and `l3` in the highest.
#[inline(always)]
pub(crate) fn wl(l0: i64, l1: i64, l2: i64, l3: i64) -> __m256i {
    // SAFETY: the intrinsic only needs AVX support; it touches no memory.
    let lanes = unsafe { _mm256_setr_epi64x(l0, l1, l2, l3) };
    lanes
}
/// Builds a vector whose four 64-bit lanes all hold `l0`.
#[inline(always)]
pub(crate) fn wl_bcast(l0: i64) -> __m256i {
    // SAFETY: the intrinsic only needs AVX support; it touches no memory.
    let splat = unsafe { _mm256_set1_epi64x(l0) };
    splat
}
/// Shifts every 64-bit lane of `a` left by the compile-time count `IMM8`,
/// filling with zero bits.
#[inline(always)]
pub(crate) fn wl_shl<const IMM8: i32>(a: __m256i) -> __m256i {
    // SAFETY: the intrinsic only needs AVX2 support; it touches no memory.
    let shifted = unsafe { _mm256_slli_epi64::<IMM8>(a) };
    shifted
}
/// Shifts every 64-bit lane of `a` right by the compile-time count `IMM8`,
/// treating the lanes as unsigned (zero bits are shifted in).
#[inline(always)]
pub(crate) fn wl_shru<const IMM8: i32>(a: __m256i) -> __m256i {
    // SAFETY: the intrinsic only needs AVX2 support; it touches no memory.
    let shifted = unsafe { _mm256_srli_epi64::<IMM8>(a) };
    shifted
}
/// Shifts each 64-bit lane of `a` right (unsigned, zero-filling) by the count
/// held in the matching lane of `b`.
#[inline(always)]
pub(crate) fn wl_shru_vector(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the intrinsic only needs AVX2 support; it touches no memory.
    let shifted = unsafe { _mm256_srlv_epi64(a, b) };
    shifted
}
/// Bitwise AND of the full 256 bits of `a` and `b`.
#[inline(always)]
pub(crate) fn wl_and(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the intrinsic only needs AVX2 support; it touches no memory.
    let masked = unsafe { _mm256_and_si256(a, b) };
    masked
}
/// Compares the 64-bit lanes of `a` and `b` for equality; a lane of the result
/// is all ones where they match and all zeros where they differ.
#[inline(always)]
pub(crate) fn wl_eq(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the intrinsic only needs AVX2 support; it touches no memory.
    let lane_mask = unsafe { _mm256_cmpeq_epi64(a, b) };
    lane_mask
}
/// Signed greater-than on the 64-bit lanes: a lane of the result is all ones
/// where `a > b` and all zeros otherwise.
#[inline(always)]
pub(crate) fn wl_gt(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the intrinsic only needs AVX2 support; it touches no memory.
    let lane_mask = unsafe { _mm256_cmpgt_epi64(a, b) };
    lane_mask
}
/// Lane-wise `a - b` on the four 64-bit lanes, wrapping on overflow.
#[inline(always)]
fn wl_sub(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: the intrinsic only needs AVX2 support; it touches no memory.
    let difference = unsafe { _mm256_sub_epi64(a, b) };
    difference
}
/// Splits every 64-bit lane of `intermediate` into five base-58 digits and
/// packs those digits as single bytes.
///
/// For a lane value `x` the digits are `floor(x / 58^k) % 58` for `k = 4, 3,
/// 2, 1, 0`, i.e. most significant first. Inside each 128-bit half of the
/// result, bytes 0..5 carry the digits of that half's first lane and bytes
/// 5..10 the digits of its second lane; bytes 10..16 are copies of byte 1 of
/// the digit lanes, which is zero whenever the digits fit in one byte.
///
/// NOTE(review): the multiplies only read the low 32 bits of each lane and the
/// top digit is never reduced modulo 58, so this presumably expects every lane
/// to be below 58^5 — confirm against the callers.
#[inline(always)]
pub(crate) fn intermediate_to_raw(intermediate: __m256i) -> __m256i {
    // Multiply-and-shift reciprocals: 2369637129 = ceil(2^37 / 58) and
    // 1307386003 = ceil(2^42 / 58^2).
    let recip_58 = wl_bcast(2369637129);
    let recip_3364 = wl_bcast(1307386003);
    let radix = wl_bcast(58);

    // SAFETY (closure): `_mm256_mul_epu32` only needs AVX2 support; it
    // multiplies the low 32 bits of each 64-bit lane into a 64-bit product.
    let mul_lo32 = |x: __m256i, y: __m256i| unsafe { _mm256_mul_epu32(x, y) };
    let div_58 = |x: __m256i| wl_shru::<37>(mul_lo32(x, recip_58));
    // Pre-shifting by 2 and post-shifting by 40 gives the 2^42 scale overall.
    let div_3364 = |x: __m256i| wl_shru::<40>(mul_lo32(wl_shru::<2>(x), recip_3364));

    // quot_k = floor(x / 58^k); dividing by 58^2 lets the chain skip a level.
    let quot0 = intermediate;
    let quot1 = div_58(quot0);
    let quot2 = div_3364(quot0);
    let quot3 = div_3364(quot1);
    let quot4 = div_3364(quot2);

    // digit_k = quot_k - 58 * quot_{k+1}; the top quotient is used as-is.
    let digit0 = wl_sub(quot0, mul_lo32(quot1, radix));
    let digit1 = wl_sub(quot1, mul_lo32(quot2, radix));
    let digit2 = wl_sub(quot2, mul_lo32(quot3, radix));
    let digit3 = wl_sub(quot3, mul_lo32(quot4, radix));
    let digit4 = quot4;

    // SAFETY: register-only AVX2 intrinsics; no memory is accessed.
    unsafe {
        // Per 128-bit half: keep byte 0 in place, move byte 8 (the low byte of
        // the second lane) to byte 5, and fill everything else from byte 1.
        let gather = _mm256_setr_epi8(
            0, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1,
        );

        // Slide each less-significant digit one more byte up within its half
        // so the five digit vectors interleave without overlapping.
        let at0 = _mm256_shuffle_epi8(digit4, gather);
        let at1 = _mm256_slli_si256::<1>(_mm256_shuffle_epi8(digit3, gather));
        let at2 = _mm256_slli_si256::<2>(_mm256_shuffle_epi8(digit2, gather));
        let at3 = _mm256_slli_si256::<3>(_mm256_shuffle_epi8(digit1, gather));
        let at4 = _mm256_slli_si256::<4>(_mm256_shuffle_epi8(digit0, gather));

        let upper_four = _mm256_or_si256(_mm256_or_si256(at0, at1), _mm256_or_si256(at2, at3));
        _mm256_or_si256(upper_four, at4)
    }
}
/// Returns a `u64` with the `n` least significant bits set.
///
/// Any `n` above 63 yields all 64 bits set. Only the low six bits of `n` feed
/// the shift amount, so the shift itself can never overflow.
#[inline(always)]
const fn fd_ulong_mask_lsb(n: i32) -> u64 {
    let shift = n & 63;
    // For n > 63 the seed is 0, and 0 - 1 wraps around to all ones.
    let seed: u64 = if n <= 63 { 1 } else { 0 };
    (seed << shift).wrapping_sub(1)
}
/// Maps each byte of `in_` from a base-58 digit value to a character code.
///
/// Per byte `x` the result is
/// `'1' + x + 7*[x>8] + [x>16] + [x>21] + 6*[x>32] + [x>43]`
/// (each bracket is 1 when true, 0 otherwise; comparisons are signed and the
/// arithmetic wraps at 8 bits). For `x` in `0..58` that walks
/// `1-9`, `A-H`, `J-N`, `P-Z`, `a-k`, `m-z`.
#[inline(always)]
pub(crate) fn raw_to_base58(in_: __m256i) -> __m256i {
    // SAFETY: register-only AVX2 intrinsics; no memory is accessed.
    unsafe {
        // A compare yields 0xFF (= -1) in bytes where the digit exceeds `limit`.
        let above = |limit: i8| _mm256_cmpgt_epi8(in_, _mm256_set1_epi8(limit));
        let past_8 = above(8);
        let past_16 = above(16);
        let past_21 = above(21);
        let past_32 = above(32);
        let past_43 = above(43);

        // There is no byte multiply, so turn the {0, -1} masks into {0, -7}
        // and {0, -6} with an AND.
        let skip_7 = _mm256_and_si256(past_8, _mm256_set1_epi8(-7));
        let skip_6 = _mm256_and_si256(past_32, _mm256_set1_epi8(-6));

        // Accumulate the negated offset, starting from -'1'; subtracting it at
        // the end adds '1' plus every skip that applies.
        let base = _mm256_set1_epi8(-('1' as i8));
        let single_skips = _mm256_add_epi8(_mm256_add_epi8(past_16, past_21), past_43);
        let multi_skips = _mm256_add_epi8(skip_7, skip_6);
        let negated_offset = _mm256_add_epi8(_mm256_add_epi8(base, single_skips), multi_skips);

        _mm256_sub_epi8(in_, negated_offset)
    }
}
/// Counts how many of the first 26 bytes of `in_` (lowest byte first) are zero
/// before the first non-zero byte; returns 26 when all of them are zero.
///
/// Bytes 26..32 are ignored.
#[inline(always)]
pub(crate) fn count_leading_zeros_26(in_: __m256i) -> u64 {
    const WITH_SENTINEL: u64 = fd_ulong_mask_lsb(27);
    const WINDOW: u64 = fd_ulong_mask_lsb(26);

    // SAFETY: register-only AVX2 intrinsics; no memory is accessed.
    let zero_flags = unsafe {
        let is_zero = _mm256_cmpeq_epi8(in_, _mm256_setzero_si256());
        _mm256_movemask_epi8(is_zero)
    };
    // Go through u32 so the i32 mask is zero-extended rather than sign-extended.
    let zero_bits = u64::from(zero_flags as u32);

    // Flip the 26 window bits so a set bit marks a NON-zero byte, and leave
    // bit 26 set as a stop marker for the all-zero case.
    let nonzero_or_stop = (zero_bits & WINDOW) ^ WITH_SENTINEL;
    u64::from(nonzero_or_stop.trailing_zeros())
}
/// Counts how many bytes of `in_` (lowest byte first) are zero before the
/// first non-zero byte; returns 32 when the whole vector is zero.
#[inline(always)]
pub(crate) fn count_leading_zeros_32(in_: __m256i) -> u64 {
    const WITH_SENTINEL: u64 = fd_ulong_mask_lsb(33);

    // SAFETY: register-only AVX2 intrinsics; no memory is accessed.
    let zero_flags = unsafe {
        let is_zero = _mm256_cmpeq_epi8(in_, _mm256_setzero_si256());
        _mm256_movemask_epi8(is_zero)
    };
    // Go through u32 so the i32 mask is zero-extended rather than sign-extended.
    let zero_bits = u64::from(zero_flags as u32);

    // Flipping bits 0..32 marks the non-zero bytes; bit 32 stays set as the
    // stop marker for the all-zero case.
    let nonzero_or_stop = zero_bits ^ WITH_SENTINEL;
    u64::from(nonzero_or_stop.trailing_zeros())
}
/// Counts the zero bytes at the start of the 45-byte sequence made of all 32
/// bytes of `in0` followed by the first 13 bytes of `in1`; returns 45 when all
/// of them are zero.
///
/// Bytes 13..32 of `in1` are ignored.
#[inline(always)]
pub(crate) fn count_leading_zeros_45(in0: __m256i, in1: __m256i) -> u64 {
    const WITH_SENTINEL: u64 = fd_ulong_mask_lsb(46);
    const HIGH_WINDOW: u64 = fd_ulong_mask_lsb(13);

    // SAFETY: register-only AVX2 intrinsics; no memory is accessed.
    let (low_flags, high_flags) = unsafe {
        let zero = _mm256_setzero_si256();
        (
            _mm256_movemask_epi8(_mm256_cmpeq_epi8(in0, zero)),
            _mm256_movemask_epi8(_mm256_cmpeq_epi8(in1, zero)),
        )
    };
    // Go through u32 so the i32 masks are zero-extended, not sign-extended.
    let low_bits = u64::from(low_flags as u32);
    let high_bits = u64::from(high_flags as u32) & HIGH_WINDOW;

    // One bit per byte: in0 in bits 0..32, the kept part of in1 in bits 32..45.
    let zero_bits = low_bits | (high_bits << 32);
    // Flip so set bits mark non-zero bytes; bit 45 remains as the stop marker.
    let nonzero_or_stop = zero_bits ^ WITH_SENTINEL;
    u64::from(nonzero_or_stop.trailing_zeros())
}
/// Counts the zero bytes at the start of the 64-byte sequence `in0` followed
/// by `in1`.
///
/// The index of the first non-zero byte is looked up with
/// `fd_ulong_find_lsb_w_default(.., 64)`; presumably that helper hands back
/// the supplied 64 when no bit is set (all bytes zero) — confirm against its
/// definition.
#[inline(always)]
pub(crate) fn count_leading_zeros_64(in0: __m256i, in1: __m256i) -> u64 {
    // SAFETY: register-only AVX2 intrinsics; no memory is accessed.
    let (low_flags, high_flags) = unsafe {
        let zero = _mm256_setzero_si256();
        (
            _mm256_movemask_epi8(_mm256_cmpeq_epi8(in0, zero)),
            _mm256_movemask_epi8(_mm256_cmpeq_epi8(in1, zero)),
        )
    };
    // Go through u32 so the i32 masks are zero-extended, not sign-extended.
    let low_bits = u64::from(low_flags as u32);
    let high_bits = u64::from(high_flags as u32);

    // One bit per byte, set where the byte is zero; invert so set bits mark
    // the non-zero bytes instead.
    let zero_bits = low_bits | (high_bits << 32);
    let nonzero_bits = !zero_bits;
    fd_ulong_find_lsb_w_default(nonzero_bits, 64) as u64
}
/// Compacts data stored as ten bytes per 16-byte slot into contiguous bytes.
///
/// The slots are, in order, the low and high halves of `in0`, the low and high
/// halves of `in1`, and the low half of `in2` (its high half is not read). The
/// first ten bytes of each slot are laid end to end; the returned pair holds
/// the first 48 bytes of that sequence (32 in the first vector, 16 in the low
/// half of the second), and the high half of the second vector is zero.
///
/// Bytes 10..16 of a slot are not masked off, so anything non-zero there gets
/// OR-ed into the neighbouring slot's output.
#[inline(always)]
pub(crate) fn ten_per_slot_down_32(in0: __m256i, in1: __m256i, in2: __m256i) -> (__m256i, __m256i) {
    // SAFETY: register-only AVX/AVX2/SSE2 intrinsics; no memory is accessed.
    unsafe {
        let slot0 = _mm256_extractf128_si256::<0>(in0);
        let slot1 = _mm256_extractf128_si256::<1>(in0);
        let slot2 = _mm256_extractf128_si256::<0>(in1);
        let slot3 = _mm256_extractf128_si256::<1>(in1);
        let slot4 = _mm256_extractf128_si256::<0>(in2);

        // Output bytes 0..16: slot0[0..10] then slot1[0..6].
        let packed0 = _mm_or_si128(slot0, _mm_slli_si128::<10>(slot1));

        // Output bytes 16..32: slot1[6..10], slot2[0..10], slot3[0..2].
        let slot1_tail_and_slot2 =
            _mm_or_si128(_mm_srli_si128::<6>(slot1), _mm_slli_si128::<4>(slot2));
        let packed1 = _mm_or_si128(slot1_tail_and_slot2, _mm_slli_si128::<14>(slot3));

        // Output bytes 32..48: slot3[2..10] then slot4[0..8].
        let packed2 = _mm_or_si128(_mm_srli_si128::<2>(slot3), _mm_slli_si128::<8>(slot4));

        let first = _mm256_set_m128i(packed1, packed0);
        let second = _mm256_set_m128i(_mm_setzero_si128(), packed2);
        (first, second)
    }
}
/// Compacts data stored as ten bytes per 16-byte slot into contiguous bytes.
///
/// The slots are, in order, the low and high halves of `in0` through `in3`
/// followed by the low half of `in4` (its high half is not read). The first
/// ten bytes of each of the nine slots are laid end to end, giving 90 bytes
/// spread over the three returned vectors. The last slot is copied verbatim
/// into the high half of the third vector, so its bytes 10..16 land in output
/// bytes 90..96 unchanged.
///
/// Bytes 10..16 of the other slots are not masked off, so anything non-zero
/// there gets OR-ed into the neighbouring slot's output.
#[inline(always)]
pub(crate) fn ten_per_slot_down_64(
    in0: __m256i,
    in1: __m256i,
    in2: __m256i,
    in3: __m256i,
    in4: __m256i,
) -> (__m256i, __m256i, __m256i) {
    // SAFETY: register-only AVX/AVX2/SSE2 intrinsics; no memory is accessed.
    unsafe {
        let slot0 = _mm256_extractf128_si256::<0>(in0);
        let slot1 = _mm256_extractf128_si256::<1>(in0);
        let slot2 = _mm256_extractf128_si256::<0>(in1);
        let slot3 = _mm256_extractf128_si256::<1>(in1);
        let slot4 = _mm256_extractf128_si256::<0>(in2);
        let slot5 = _mm256_extractf128_si256::<1>(in2);
        let slot6 = _mm256_extractf128_si256::<0>(in3);
        let slot7 = _mm256_extractf128_si256::<1>(in3);
        let slot8 = _mm256_extractf128_si256::<0>(in4);

        // Output bytes 0..16: slot0[0..10] then slot1[0..6].
        let packed0 = _mm_or_si128(slot0, _mm_slli_si128::<10>(slot1));

        // Output bytes 16..32: slot1[6..10], slot2[0..10], slot3[0..2].
        let slot1_tail_and_slot2 =
            _mm_or_si128(_mm_srli_si128::<6>(slot1), _mm_slli_si128::<4>(slot2));
        let packed1 = _mm_or_si128(slot1_tail_and_slot2, _mm_slli_si128::<14>(slot3));

        // Output bytes 32..48: slot3[2..10] then slot4[0..8].
        let packed2 = _mm_or_si128(_mm_srli_si128::<2>(slot3), _mm_slli_si128::<8>(slot4));

        // Output bytes 48..64: slot4[8..10], slot5[0..10], slot6[0..4].
        let slot4_tail_and_slot5 =
            _mm_or_si128(_mm_srli_si128::<8>(slot4), _mm_slli_si128::<2>(slot5));
        let packed3 = _mm_or_si128(slot4_tail_and_slot5, _mm_slli_si128::<12>(slot6));

        // Output bytes 64..80: slot6[4..10] then slot7[0..10].
        let packed4 = _mm_or_si128(_mm_srli_si128::<4>(slot6), _mm_slli_si128::<6>(slot7));

        let first = _mm256_set_m128i(packed1, packed0);
        let second = _mm256_set_m128i(packed3, packed2);
        // Output bytes 80..96: the final slot, already in position.
        let third = _mm256_set_m128i(slot8, packed4);
        (first, second, third)
    }
}
/// Loads four consecutive 64-bit lanes starting at `p` using the aligned load
/// intrinsic.
///
/// `p` must point to 32 readable bytes and be 32-byte aligned. That is
/// stricter than the natural alignment of `i64`, so debug builds assert it
/// before the load; release builds perform no check.
///
/// # Panics
///
/// With debug assertions enabled, panics if `p` is not 32-byte aligned.
#[inline(always)]
pub(crate) fn wl_ld(p: *const i64) -> __m256i {
    debug_assert!(
        (p as usize) % core::mem::align_of::<__m256i>() == 0,
        "wl_ld requires a 32-byte aligned pointer"
    );
    let src = p.cast::<__m256i>();
    // SAFETY: not fully verified here; soundness rests on the caller's pointer
    // being readable for 32 bytes and suitably aligned, and on AVX support.
    unsafe { core::arch::x86_64::_mm256_load_si256(src) }
}
#[cfg(test)]
mod tests {
    use super::*;
    use core::arch::x86_64::{_mm256_load_si256, _mm256_store_si256};

    /// Aligned 32-byte load for the tests; `p` must be 32-byte aligned and
    /// point to 32 readable bytes.
    #[inline(always)]
    fn wuc_ld(p: *const u8) -> __m256i {
        unsafe { _mm256_load_si256(p as *const __m256i) }
    }

    /// Aligned 32-byte store for the tests; `p` must be 32-byte aligned and
    /// point to 32 writable bytes.
    #[inline(always)]
    fn wuc_st(p: *mut u8, i: __m256i) {
        unsafe { _mm256_store_si256(p as *mut __m256i, i) };
    }

    #[test]
    fn test_fd_ulong_mask_lsb() {
        let w = 64;
        for n in 0..=64 {
            let mask = (if n < w { 1u64 << n } else { 0u64 }).wrapping_sub(1u64);
            assert_eq!(fd_ulong_mask_lsb(n), mask);
        }
    }

    /// 64 bytes with the alignment the aligned load helper needs.
    #[repr(C, align(32))]
    #[derive(Debug)]
    struct Aligned32([u8; 64]);

    /// Runs all four counters over `buffer` and returns their results in the
    /// order `[_26, _32, _45, _64]`.
    fn leading_zero_counts(buffer: &Aligned32) -> [u64; 4] {
        let base = buffer.0.as_ptr();
        let lo = wuc_ld(base);
        let hi = wuc_ld(unsafe { base.add(32) });
        [
            count_leading_zeros_26(lo),
            count_leading_zeros_32(lo),
            count_leading_zeros_45(lo, hi),
            count_leading_zeros_64(lo, hi),
        ]
    }

    #[test]
    fn test_count_leading_zeros() {
        let mut buffer = Aligned32([0; 64]);
        // All zero: every counter reports its full window.
        assert_eq!(leading_zero_counts(&buffer), [26, 32, 45, 64]);

        // First byte non-zero.
        buffer.0[0] = 2;
        assert_eq!(leading_zero_counts(&buffer), [0, 0, 0, 0]);

        // Exactly one leading zero.
        buffer.0[0] = 0;
        buffer.0[1] = 7;
        assert_eq!(leading_zero_counts(&buffer), [1, 1, 1, 1]);

        // A byte with the top bit set is still a non-zero byte.
        buffer.0[1] = 255;
        assert_eq!(leading_zero_counts(&buffer), [1, 1, 1, 1]);

        // First non-zero byte beyond the 26-byte window only.
        buffer.0[1] = 0;
        buffer.0[30] = 1;
        assert_eq!(leading_zero_counts(&buffer), [26, 30, 30, 30]);

        // First non-zero byte in the second vector, inside the 45-byte window.
        buffer.0[30] = 0;
        buffer.0[40] = 1;
        assert_eq!(leading_zero_counts(&buffer), [26, 32, 40, 40]);

        // First non-zero byte beyond the 45-byte window.
        buffer.0[40] = 0;
        buffer.0[50] = 1;
        assert_eq!(leading_zero_counts(&buffer), [26, 32, 45, 50]);
    }

    /// `N` bytes with the alignment the aligned load/store helpers need.
    #[repr(C, align(32))]
    struct UcharArr32<const N: usize>([u8; N]);

    #[test]
    fn test_ten_per_slot_down_32() {
        let mut in_ = UcharArr32([0; 32 * 3]);
        // Pre-fill the output so the zeroed tail is really checked.
        let mut out = UcharArr32([0xAA; 32 * 2]);
        for i in 0..50 {
            in_.0[16 * (i / 10) + (i % 10)] = i as u8 + 1;
        }
        let in_ptr = in_.0.as_ptr();
        let a = wuc_ld(in_ptr);
        let b = wuc_ld(unsafe { in_ptr.add(32) });
        let c = wuc_ld(unsafe { in_ptr.add(64) });
        let (out0, out1) = ten_per_slot_down_32(a, b, c);
        let out_ptr = out.0.as_mut_ptr();
        wuc_st(out_ptr, out0);
        wuc_st(unsafe { out_ptr.add(32) }, out1);
        // Only 48 packed bytes fit: the last two bytes of the fifth slot are
        // dropped and the upper half of the second vector is zero.
        for i in 0..48 {
            assert_eq!(out.0[i], i as u8 + 1);
        }
        for i in 48..64 {
            assert_eq!(out.0[i], 0);
        }
    }

    #[test]
    fn test_ten_per_slot_down_64() {
        let mut in_ = UcharArr32([0; 32 * 5]);
        let mut out = UcharArr32([0; 32 * 3]);
        for i in 0..90 {
            in_.0[16 * (i / 10) + (i % 10)] = i as u8 + 1;
        }
        let in_ptr = in_.0.as_ptr();
        let a = wuc_ld(in_ptr);
        let b = wuc_ld(unsafe { in_ptr.add(32) });
        let c = wuc_ld(unsafe { in_ptr.add(64) });
        let d = wuc_ld(unsafe { in_ptr.add(96) });
        let e = wuc_ld(unsafe { in_ptr.add(128) });
        let (out0, out1, out2) = ten_per_slot_down_64(a, b, c, d, e);
        let out_ptr = out.0.as_mut_ptr();
        wuc_st(out_ptr, out0);
        wuc_st(unsafe { out_ptr.add(32) }, out1);
        wuc_st(unsafe { out_ptr.add(64) }, out2);
        for i in 0..90 {
            assert_eq!(out.0[i], i as u8 + 1);
        }
    }

    #[test]
    fn test_raw_to_base58() {
        let alphabet = b"123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";
        let mut in_ = UcharArr32([0; 64]);
        let mut out = UcharArr32([0; 64]);
        // Digits 0..58 in order; the padding bytes stay at digit 0.
        for i in 0..58 {
            in_.0[i] = i as u8;
        }
        let in_ptr = in_.0.as_ptr();
        let out_ptr = out.0.as_mut_ptr();
        wuc_st(out_ptr, raw_to_base58(wuc_ld(in_ptr)));
        wuc_st(
            unsafe { out_ptr.add(32) },
            raw_to_base58(wuc_ld(unsafe { in_ptr.add(32) })),
        );
        for i in 0..64 {
            assert_eq!(out.0[i], alphabet[in_.0[i] as usize]);
        }
    }

    #[test]
    fn test_intermediate_to_raw() {
        // Lanes: 0, 58^5 - 1 (all digits 57), digits 1,2,3,4,5, and 58.
        let raw = intermediate_to_raw(wl(0, 656356767, 11717049, 58));
        let mut out = UcharArr32([0xAA; 32]);
        wuc_st(out.0.as_mut_ptr(), raw);
        let expected: [u8; 32] = [
            0, 0, 0, 0, 0, 57, 57, 57, 57, 57, 0, 0, 0, 0, 0, 0, //
            1, 2, 3, 4, 5, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        ];
        assert_eq!(out.0, expected);
    }
}