#![allow(unused_attributes)]
// Generates a 128-bit (SSE) vectorized sortedness scanner.
//
// The expanded function walks `s` four 128-bit vectors per iteration and
// returns the index it had reached when a vector group failed the ordering
// check; otherwise it hands the remaining elements to the scalar `$tail`
// macro.
//
// Macro parameters:
// * `$name`   - name of the generated `pub unsafe fn`.
// * `$cpuid`  - feature string for `#[target_feature]` (e.g. "sse4.1").
// * `$id`     - unsigned element type of the slice (u8/u16/u32 at call sites).
// * `$nlanes` - number of `$id` lanes in one 128-bit vector.
// * `$cmpeq`  - lane-wise equality intrinsic for `$id`'s width.
// * `$minu`   - lane-wise unsigned min (the `lt` / non-decreasing variants)
//   or unsigned max (the `gt` / non-increasing variants); see the call sites.
// * `$head`   - macro scanning the unaligned prefix; must yield an index
//   `i <= s.len()` at a 16-byte boundary for the aligned loads below —
//   NOTE(review): contract assumed from usage, definition not in this chunk.
// * `$tail`   - macro that finishes the scan scalar-wise from `i` to `n`.
macro_rules! unsigned_128 {
($name:ident, $cpuid:tt, $id:ident, $nlanes:expr, $cmpeq:ident, $minu:ident,
$head:ident, $tail:ident) => {
#[inline]
#[target_feature(enable = $cpuid)]
pub unsafe fn $name(s: &[$id]) -> usize {
#[cfg(target_arch = "x86")]
use arch::x86::*;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::*;
// `_mm_load_si128` requires 16-byte-aligned pointers.
const ALIGNMENT: usize = 16;
// Scalar-scan until `i` sits on a 16-byte boundary.
let mut i: usize = $head!(s, $id, ALIGNMENT);
let n = s.len();
// Pointer to element `o`, reinterpreted as a 128-bit vector.
let ap = |o| (s.as_ptr().offset(o as isize)) as *const __m128i;
// Vectors processed per loop iteration.
const NVECS: usize = 4;
const NLANES: usize = $nlanes;
// Elements advanced per iteration.
const STRIDE: usize = NLANES * NVECS;
// One extra vector beyond the stride must be readable: each iteration
// also loads `next3`, one vector past the elements it advances over.
const MIN_LEN: usize = NLANES * (NVECS + 1);
// Width of one lane in bytes (128 bits -> 16 bytes / NLANES); used as
// the byte-shift immediate of `_mm_alignr_epi8`.
const EWIDTH: i32 = 128 / 8 / NLANES as i32;
if (n - i) >= MIN_LEN {
// `current` is carried across iterations so every vector is loaded
// from memory exactly once.
let mut current = _mm_load_si128(ap(i + 0 * NLANES)); while i < n - STRIDE {
let next0 = _mm_load_si128(ap(i + 1 * NLANES)); let next1 = _mm_load_si128(ap(i + 2 * NLANES)); let next2 = _mm_load_si128(ap(i + 3 * NLANES)); let next3 = _mm_load_si128(ap(i + 4 * NLANES));
// `_mm_alignr_epi8(hi, lo, EWIDTH)` shifts the `hi:lo` byte pair
// right by one lane, i.e. `compareK` holds each vector's elements
// offset by one position — the successor each lane is checked against.
let compare0 = _mm_alignr_epi8(next0, current, EWIDTH); let compare1 = _mm_alignr_epi8(next1, next0, EWIDTH); let compare2 = _mm_alignr_epi8(next2, next1, EWIDTH); let compare3 = _mm_alignr_epi8(next3, next2, EWIDTH);
// Unsigned `a <= b` (resp. `a >= b` for the max variants) expressed
// as `a == minu(a, b)`: a lane is all-ones iff the pair is ordered.
let mask0 = $cmpeq(current, $minu(current, compare0));
let mask1 = $cmpeq(next0, $minu(next0, compare1));
let mask2 = $cmpeq(next1, $minu(next1, compare2));
let mask3 = $cmpeq(next2, $minu(next2, compare3));
let mask = _mm_and_si128(
_mm_and_si128(mask0, mask1),
_mm_and_si128(mask2, mask3),
);
// Any zero lane means some adjacent pair was out of order; report
// the start of this 4-vector group — NOTE(review): the exact failing
// pair lies somewhere in [i, i + STRIDE]; confirm the caller/tail
// contract expects the group start rather than the exact index.
if _mm_test_all_ones(mask) == 0 {
return i;
}
current = next3;
i += STRIDE;
}
}
// Scalar check of whatever the vector loop did not cover.
$tail!(s, n, i)
}
};
}
pub mod sse41 {
    // SSE4.1 (128-bit) instantiations of the sortedness scanners: one
    // function per unsigned element width and scan direction.

    // Non-decreasing (`lt`) scans — a lane is in order iff it equals the
    // unsigned minimum of itself and its successor.
    unsigned_128!(
        is_sorted_lt_u8, "sse4.1", u8, 16, _mm_cmpeq_epi8, _mm_min_epu8,
        is_sorted_lt_until_alignment_boundary, is_sorted_lt_tail
    );
    unsigned_128!(
        is_sorted_lt_u16, "sse4.1", u16, 8, _mm_cmpeq_epi16, _mm_min_epu16,
        is_sorted_lt_until_alignment_boundary, is_sorted_lt_tail
    );
    unsigned_128!(
        is_sorted_lt_u32, "sse4.1", u32, 4, _mm_cmpeq_epi32, _mm_min_epu32,
        is_sorted_lt_until_alignment_boundary, is_sorted_lt_tail
    );

    // Non-increasing (`gt`) scans — identical scheme with unsigned maximum.
    unsigned_128!(
        is_sorted_gt_u8, "sse4.1", u8, 16, _mm_cmpeq_epi8, _mm_max_epu8,
        is_sorted_gt_until_alignment_boundary, is_sorted_gt_tail
    );
    unsigned_128!(
        is_sorted_gt_u16, "sse4.1", u16, 8, _mm_cmpeq_epi16, _mm_max_epu16,
        is_sorted_gt_until_alignment_boundary, is_sorted_gt_tail
    );
    unsigned_128!(
        is_sorted_gt_u32, "sse4.1", u32, 4, _mm_cmpeq_epi32, _mm_max_epu32,
        is_sorted_gt_until_alignment_boundary, is_sorted_gt_tail
    );
}
// Generates a 256-bit (AVX2) vectorized sortedness scanner.
//
// Same overall shape as `unsigned_128!`: four vectors per iteration, a
// lane-wise `a == minu(a, successor)` ordering test, early return of the
// reached index on failure, scalar `$tail` for the remainder. The one
// structural difference is that the shifted-by-one-element vectors are
// produced by unaligned loads at `element + 1` rather than `alignr` —
// NOTE(review): presumably because AVX2's `vpalignr` shifts within each
// 128-bit half and cannot cross the lane boundary; rationale inferred,
// the code itself only shows the loads.
//
// Macro parameters as in `unsigned_128!` ($minu is unsigned min for the
// `lt` variants, unsigned max for the `gt` variants; see the call sites).
macro_rules! unsigned_256 {
($name:ident, $cpuid:tt, $id:ident, $nlanes:expr, $cmpeq:ident, $minu:ident,
$head:ident, $tail:ident) => {
#[inline]
#[target_feature(enable = $cpuid)]
pub unsafe fn $name(s: &[$id]) -> usize {
#[cfg(target_arch = "x86")]
use arch::x86::*;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::*;
// `_mm256_load_si256` requires 32-byte-aligned pointers.
const ALIGNMENT: usize = 32;
// Scalar-scan until `i` sits on a 32-byte boundary (head contract —
// NOTE(review): assumed from usage, definition not in this chunk).
let mut i: usize = $head!(s, $id, ALIGNMENT);
let n = s.len();
// Pointer to element `o`, reinterpreted as a 256-bit vector.
let ap = |o| (s.as_ptr().offset(o as isize)) as *const __m256i;
// Vectors processed per loop iteration.
const NVECS: usize = 4;
const NLANES: usize = $nlanes;
// Elements advanced per iteration.
const STRIDE: usize = NLANES * NVECS;
// One extra vector beyond the stride must be readable for the final
// `+ 1`-offset unaligned load of each group.
const MIN_LEN: usize = NLANES * (NVECS + 1);
if (n - i) >= MIN_LEN {
while i < n - STRIDE {
// Four aligned loads of consecutive vectors...
let current = _mm256_load_si256(ap(i + 0 * NLANES)); let next0 = _mm256_load_si256(ap(i + 1 * NLANES)); let next1 = _mm256_load_si256(ap(i + 2 * NLANES)); let next2 = _mm256_load_si256(ap(i + 3 * NLANES));
// ...and four unaligned loads of the same ranges shifted by one
// element (`ap` takes an element offset, so `+ 1` is one lane):
// each lane's successor value.
let compare0 = _mm256_loadu_si256(ap(i + 0 * NLANES + 1)); let compare1 = _mm256_loadu_si256(ap(i + 1 * NLANES + 1)); let compare2 = _mm256_loadu_si256(ap(i + 2 * NLANES + 1)); let compare3 = _mm256_loadu_si256(ap(i + 3 * NLANES + 1));
// Unsigned `a <= b` (resp. `a >= b` for max variants) expressed as
// `a == minu(a, b)`: a lane is all-ones iff the pair is ordered.
let mask0 = $cmpeq(current, $minu(current, compare0));
let mask1 = $cmpeq(next0, $minu(next0, compare1));
let mask2 = $cmpeq(next1, $minu(next1, compare2));
let mask3 = $cmpeq(next2, $minu(next2, compare3));
let mask = _mm256_and_si256(
_mm256_and_si256(mask0, mask1),
_mm256_and_si256(mask2, mask3),
);
// `testc(mask, all-ones)` is 1 iff `mask` is all ones; 0 means some
// adjacent pair was out of order — report the start of this group
// (exact failing pair lies in [i, i + STRIDE]).
if _mm256_testc_si256(mask, _mm256_set1_epi64x(-1)) == 0 {
return i;
}
i += STRIDE;
}
}
// Scalar check of whatever the vector loop did not cover.
$tail!(s, n, i)
}
};
}
pub mod avx2 {
    // AVX2 (256-bit) instantiations of the sortedness scanners: one
    // function per unsigned element width and scan direction.

    // Non-decreasing (`lt`) scans — a lane is in order iff it equals the
    // unsigned minimum of itself and its successor.
    unsigned_256!(
        is_sorted_lt_u8, "avx2", u8, 32, _mm256_cmpeq_epi8, _mm256_min_epu8,
        is_sorted_lt_until_alignment_boundary, is_sorted_lt_tail
    );
    unsigned_256!(
        is_sorted_lt_u16, "avx2", u16, 16, _mm256_cmpeq_epi16, _mm256_min_epu16,
        is_sorted_lt_until_alignment_boundary, is_sorted_lt_tail
    );
    unsigned_256!(
        is_sorted_lt_u32, "avx2", u32, 8, _mm256_cmpeq_epi32, _mm256_min_epu32,
        is_sorted_lt_until_alignment_boundary, is_sorted_lt_tail
    );

    // Non-increasing (`gt`) scans — identical scheme with unsigned maximum.
    unsigned_256!(
        is_sorted_gt_u8, "avx2", u8, 32, _mm256_cmpeq_epi8, _mm256_max_epu8,
        is_sorted_gt_until_alignment_boundary, is_sorted_gt_tail
    );
    unsigned_256!(
        is_sorted_gt_u16, "avx2", u16, 16, _mm256_cmpeq_epi16, _mm256_max_epu16,
        is_sorted_gt_until_alignment_boundary, is_sorted_gt_tail
    );
    unsigned_256!(
        is_sorted_gt_u32, "avx2", u32, 8, _mm256_cmpeq_epi32, _mm256_max_epu32,
        is_sorted_gt_until_alignment_boundary, is_sorted_gt_tail
    );
}