#![allow(unused_attributes)]
// Generates `pub unsafe fn $name(s: &[$id]) -> usize`: a 128-bit SIMD scan of
// `s` for an adjacent-pair ordering violation.
//
// * `$cpuid`  - target feature the generated fn is gated on (e.g. "sse4.2").
// * `$id`     - signed element type; `$nlanes` of them fill one `__m128i`.
// * `$cmpgt`  - lane-wise comparison that is non-zero where a pair of
//               adjacent elements violates the order being checked.
// * `$head`   - macro (defined elsewhere in this crate) that scalar-checks the
//               unaligned prefix and returns the first 16-byte-aligned index.
// * `$tail`   - macro (defined elsewhere) that finishes the scan with scalar
//               code from wherever the vector loop stopped.
//
// Invariant at `$tail`: every adjacent pair with both indices <= `i` has
// already been verified, so the tail only needs to scan `s[i..]`.
macro_rules! signed_128 {
    ($name:ident, $cpuid:tt, $id:ident, $nlanes:expr, $cmpgt:ident,
     $head:ident, $tail:ident) => {
        #[inline]
        #[target_feature(enable = $cpuid)]
        pub unsafe fn $name(s: &[$id]) -> usize {
            #[cfg(target_arch = "x86")]
            use arch::x86::*;
            #[cfg(target_arch = "x86_64")]
            use arch::x86_64::*;
            // `_mm_load_si128` requires 16-byte-aligned pointers.
            const ALIGNMENT: usize = 16;
            // Scalar-check the unaligned prefix; afterwards `s[i]` is
            // presumably 16-byte aligned (the aligned loads rely on it).
            let mut i: usize = $head!(s, $id, ALIGNMENT);
            let n = s.len();
            // Pointer to element `o`, viewed as a 128-bit vector.
            let ap = |o| (s.as_ptr().offset(o as isize)) as *const __m128i;
            // Each iteration consumes NVECS vectors (STRIDE elements) but also
            // performs one aligned look-ahead load, so it *touches*
            // MIN_LEN = STRIDE + NLANES elements starting at `i`.
            const NVECS: usize = 4;
            const NLANES: usize = $nlanes;
            const STRIDE: usize = NLANES * NVECS;
            const MIN_LEN: usize = NLANES * (NVECS + 1);
            // Width of one lane in bytes: the shift for `_mm_alignr_epi8`.
            const EWIDTH: i32 = 128 / 8 / NLANES as i32;
            // Written as `i + MIN_LEN <= n` (not `n - i >= MIN_LEN`) so it
            // cannot underflow.
            if i + MIN_LEN <= n {
                let mut current = _mm_load_si128(ap(i + 0 * NLANES));
                // BUGFIX: this loop previously ran `while i < n - STRIDE`.
                // That keeps the *compared* pairs in bounds, but the aligned
                // look-ahead load of `next3` reads elements
                // [i + 4*NLANES, i + 5*NLANES), i.e. up to NLANES - 1 elements
                // past the end of the slice (e.g. an aligned `&[i64]` of
                // length 17 at i == 8 read element 17). Requiring the full
                // MIN_LEN footprint to fit keeps every load in bounds; the
                // window no longer covered here is handled by `$tail`.
                // NOTE(review): on a violation in that final window the
                // returned index now comes from `$tail` instead of being the
                // window start — confirm callers only rely on "s[..ret] is
                // sorted / ret == n iff fully sorted".
                while i + MIN_LEN <= n {
                    // Four aligned vectors after `current`; only the first
                    // lane of `next3` is compared this iteration, the rest is
                    // carried over as the next `current`.
                    let next0 = _mm_load_si128(ap(i + 1 * NLANES));
                    let next1 = _mm_load_si128(ap(i + 2 * NLANES));
                    let next2 = _mm_load_si128(ap(i + 3 * NLANES));
                    let next3 = _mm_load_si128(ap(i + 4 * NLANES));
                    // `_mm_alignr_epi8(a, b, EWIDTH)` shifts the concatenation
                    // `a:b` right by one element, so lane k of `compare*`
                    // holds the successor of lane k of the matching vector.
                    let compare0 = _mm_alignr_epi8(next0, current, EWIDTH);
                    let compare1 = _mm_alignr_epi8(next1, next0, EWIDTH);
                    let compare2 = _mm_alignr_epi8(next2, next1, EWIDTH);
                    let compare3 = _mm_alignr_epi8(next3, next2, EWIDTH);
                    // Each mask lane is non-zero where a pair is out of order.
                    let mask0 = $cmpgt(current, compare0);
                    let mask1 = $cmpgt(next0, compare1);
                    let mask2 = $cmpgt(next1, compare2);
                    let mask3 = $cmpgt(next2, compare3);
                    let mask = _mm_or_si128(
                        _mm_or_si128(mask0, mask1),
                        _mm_or_si128(mask2, mask3),
                    );
                    // `_mm_test_all_zeros(m, m)` is 1 iff `m` is all zero;
                    // any set bit means a violation in this window, so return
                    // its start (everything before `i` is known sorted).
                    if _mm_test_all_zeros(mask, mask) == 0 {
                        return i;
                    }
                    current = next3;
                    i += STRIDE;
                }
            }
            // Scalar scan of whatever the vector loop did not cover.
            $tail!(s, n, i)
        }
    };
}
pub mod sse42 {
signed_128!(
is_sorted_lt_i64,
"sse4.2",
i64,
2,
_mm_cmpgt_epi64,
is_sorted_lt_until_alignment_boundary,
is_sorted_lt_tail
);
#[cfg(target_arch = "x86")]
use arch::x86::*;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::*;
#[target_feature(enable = "sse4.2")]
unsafe fn _mm_cmplt_epi64(x: __m128i, y: __m128i) -> __m128i {
let a = _mm_cmpgt_epi64(x, y); let b = _mm_cmpeq_epi64(x, y); let c = _mm_or_si128(a, b); let ones = _mm_set1_epi64x(-1_i64);
_mm_andnot_si128(c, ones) }
signed_128!(
is_sorted_gt_i64,
"sse4.2",
i64,
2,
_mm_cmplt_epi64,
is_sorted_gt_until_alignment_boundary,
is_sorted_gt_tail
);
}
pub mod sse41 {
    // One `<` and one `>` variant per element width. The `<` variants flag
    // `prev > next`; the `>` variants flag `prev < next` (the `_mm_cmplt_*`
    // intrinsics for 32/16/8-bit lanes come from the `arch` glob import used
    // inside the generated functions).
    signed_128!(
        is_sorted_lt_i32, "sse4.1", i32, 4, _mm_cmpgt_epi32,
        is_sorted_lt_until_alignment_boundary, is_sorted_lt_tail
    );
    signed_128!(
        is_sorted_gt_i32, "sse4.1", i32, 4, _mm_cmplt_epi32,
        is_sorted_gt_until_alignment_boundary, is_sorted_gt_tail
    );
    signed_128!(
        is_sorted_lt_i16, "sse4.1", i16, 8, _mm_cmpgt_epi16,
        is_sorted_lt_until_alignment_boundary, is_sorted_lt_tail
    );
    signed_128!(
        is_sorted_gt_i16, "sse4.1", i16, 8, _mm_cmplt_epi16,
        is_sorted_gt_until_alignment_boundary, is_sorted_gt_tail
    );
    signed_128!(
        is_sorted_lt_i8, "sse4.1", i8, 16, _mm_cmpgt_epi8,
        is_sorted_lt_until_alignment_boundary, is_sorted_lt_tail
    );
    signed_128!(
        is_sorted_gt_i8, "sse4.1", i8, 16, _mm_cmplt_epi8,
        is_sorted_gt_until_alignment_boundary, is_sorted_gt_tail
    );
}
// Generates `pub unsafe fn $name(s: &[$id]) -> usize`: a 256-bit (AVX2) scan
// of `s` for an adjacent-pair ordering violation. `$cmpgt` is the lane-wise
// "out of order" comparison; `$head`/`$tail` are macros defined elsewhere in
// this crate that handle the unaligned prefix and the scalar remainder.
// Unlike `signed_128`, the shifted-by-one compare vectors are obtained with
// unaligned loads instead of `alignr`, and those loads end exactly at the
// last in-bounds element (see the loop-condition note below).
macro_rules! signed_256 {
($name:ident, $cpuid:tt, $id:ident, $nlanes:expr, $cmpgt:ident, $head:ident, $tail:ident) => {
#[inline]
#[target_feature(enable = $cpuid)]
pub unsafe fn $name(s: &[$id]) -> usize {
#[cfg(target_arch = "x86")]
use arch::x86::*;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::*;
// `_mm256_load_si256` requires 32-byte-aligned pointers.
const ALIGNMENT: usize = 32;
// Scalar-check the unaligned prefix; afterwards `s[i]` is presumably
// 32-byte aligned — the aligned loads below rely on it (TODO confirm
// against `$head`'s definition).
let mut i: usize = $head!(s, $id, ALIGNMENT);
let n = s.len();
// Pointer to element `o`, viewed as a 256-bit vector.
let ap = |o| (s.as_ptr().offset(o as isize)) as *const __m256i;
// NVECS aligned vectors (STRIDE elements) per iteration; the unaligned
// compare loads reach one element further, hence MIN_LEN = STRIDE + NLANES.
const NVECS: usize = 4;
const NLANES: usize = $nlanes;
const STRIDE: usize = NLANES * NVECS;
const MIN_LEN: usize = NLANES * (NVECS + 1);
// NOTE(review): assumes `i <= n` here or `n - i` underflows — `$head` is
// expected to guarantee that.
if (n - i) >= MIN_LEN {
// Bound check: each iteration reads elements [i, i + STRIDE] inclusive
// (the last unaligned load ends at i + 4*NLANES), and `i < n - STRIDE`
// gives i + STRIDE <= n - 1, so every load is exactly in bounds.
while i < n - STRIDE {
// Four aligned vectors starting at `i`.
let current = _mm256_load_si256(ap(i + 0 * NLANES)); let next0 = _mm256_load_si256(ap(i + 1 * NLANES)); let next1 = _mm256_load_si256(ap(i + 2 * NLANES)); let next2 = _mm256_load_si256(ap(i + 3 * NLANES));
// The same data shifted by one element (unaligned loads): lane k of
// `compare*` is the successor of lane k of the matching vector.
let compare0 = _mm256_loadu_si256(ap(i + 0 * NLANES + 1)); let compare1 = _mm256_loadu_si256(ap(i + 1 * NLANES + 1)); let compare2 = _mm256_loadu_si256(ap(i + 2 * NLANES + 1)); let compare3 = _mm256_loadu_si256(ap(i + 3 * NLANES + 1));
// Each mask lane is non-zero where an adjacent pair is out of order.
let mask0 = $cmpgt(current, compare0);
let mask1 = $cmpgt(next0, compare1);
let mask2 = $cmpgt(next1, compare2);
let mask3 = $cmpgt(next2, compare3);
let mask = _mm256_or_si256(
_mm256_or_si256(mask0, mask1),
_mm256_or_si256(mask2, mask3),
);
// `_mm256_testz_si256(m, m)` is 1 iff `m` is all zero; any set bit
// means a violation in this window, so return its start (everything
// before `i` is already verified).
if _mm256_testz_si256(mask, mask) == 0 {
return i;
}
i += STRIDE;
}
}
// Scalar scan of whatever the vector loop did not cover.
$tail!(s, n, i)
}
};
}
pub mod avx2 {
signed_256!(
is_sorted_lt_i64,
"avx2",
i64,
4,
_mm256_cmpgt_epi64,
is_sorted_lt_until_alignment_boundary,
is_sorted_lt_tail
);
signed_256!(
is_sorted_lt_i32,
"avx2",
i32,
8,
_mm256_cmpgt_epi32,
is_sorted_lt_until_alignment_boundary,
is_sorted_lt_tail
);
signed_256!(
is_sorted_lt_i16,
"avx2",
i16,
16,
_mm256_cmpgt_epi16,
is_sorted_lt_until_alignment_boundary,
is_sorted_lt_tail
);
signed_256!(
is_sorted_lt_i8,
"avx2",
i8,
32,
_mm256_cmpgt_epi8,
is_sorted_lt_until_alignment_boundary,
is_sorted_lt_tail
);
#[cfg(target_arch = "x86")]
use arch::x86::*;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::*;
#[target_feature(enable = "avx2")]
unsafe fn _mm256_cmplt_epi64(x: __m256i, y: __m256i) -> __m256i {
let a = _mm256_cmpgt_epi64(x, y);
let b = _mm256_cmpeq_epi64(x, y);
let x = _mm256_or_si256(a, b); _mm256_andnot_si256(x, _mm256_set1_epi64x(-1_i64))
}
signed_256!(
is_sorted_gt_i64,
"avx2",
i64,
4,
_mm256_cmplt_epi64,
is_sorted_gt_until_alignment_boundary,
is_sorted_gt_tail
);
#[target_feature(enable = "avx2")]
unsafe fn _mm256_cmplt_epi32(x: __m256i, y: __m256i) -> __m256i {
let a = _mm256_cmpgt_epi32(x, y);
let b = _mm256_cmpeq_epi32(x, y);
let x = _mm256_or_si256(a, b); _mm256_andnot_si256(x, _mm256_set1_epi64x(-1_i64))
}
signed_256!(
is_sorted_gt_i32,
"avx2",
i32,
8,
_mm256_cmplt_epi32,
is_sorted_gt_until_alignment_boundary,
is_sorted_gt_tail
);
#[target_feature(enable = "avx2")]
unsafe fn _mm256_cmplt_epi16(x: __m256i, y: __m256i) -> __m256i {
let a = _mm256_cmpgt_epi16(x, y);
let b = _mm256_cmpeq_epi16(x, y);
let x = _mm256_or_si256(a, b); _mm256_andnot_si256(x, _mm256_set1_epi64x(-1_i64))
}
signed_256!(
is_sorted_gt_i16,
"avx2",
i16,
16,
_mm256_cmplt_epi16,
is_sorted_gt_until_alignment_boundary,
is_sorted_gt_tail
);
#[target_feature(enable = "avx2")]
unsafe fn _mm256_cmplt_epi8(x: __m256i, y: __m256i) -> __m256i {
let a = _mm256_cmpgt_epi8(x, y);
let b = _mm256_cmpeq_epi8(x, y);
let x = _mm256_or_si256(a, b); _mm256_andnot_si256(x, _mm256_set1_epi64x(-1_i64))
}
signed_256!(
is_sorted_gt_i8,
"avx2",
i8,
32,
_mm256_cmplt_epi8,
is_sorted_gt_until_alignment_boundary,
is_sorted_gt_tail
);
}