1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/// Scans a byte range of length `$len` starting at `$ptr` using 256-bit AVX2
/// loads, handing any lanes flagged by the caller-supplied `masking!` macro to
/// the caller-supplied `write_forward!` / `write_mask!` macros.
///
/// Structure of the expansion:
///   1. If the input is shorter than one 32-byte vector, delegate entirely to
///      the SSE2 variant (`loop_range_switch_sse2!`).
///   2. Otherwise, one unaligned load handles the bytes up to the first
///      32-byte boundary, then an aligned main loop processes full vectors,
///      and a final overlapping unaligned load covers the tail.
///
/// NOTE(review): `masking!`, `write_forward!`, `write_mask!`, `translations_256!`,
/// `avx_main_loop!` and `sub!` are defined elsewhere in this crate; the exact
/// per-byte semantics (presumably escaping matched characters) live there.
/// `_mm256_or_si256` is imported but not used directly below — presumably it is
/// used inside the `masking!` expansion; confirm against its definition.
#[macro_export]
macro_rules! loop_range_switch_avx2 {
(($len:ident, $ptr:ident, $start_ptr:ident, $end_ptr:ident) $($t:tt, )+) => {
use std::arch::x86_64::{
__m256i, _mm256_load_si256, _mm256_loadu_si256, _mm256_movemask_epi8, _mm256_or_si256,
};
// One AVX2 vector covers 32 bytes.
const M256_VECTOR_SIZE: usize = std::mem::size_of::<__m256i>();
if $len < M256_VECTOR_SIZE {
// Input too short for even one 256-bit load: fall back to the
// 128-bit SSE2 implementation of the same loop.
$crate::loop_range_switch_sse2!(($len, $ptr, $start_ptr, $end_ptr) $($t, )+);
} else {
// Set up the 256-bit translation tables/constants for the $t cases.
$crate::translations_256!($($t, )+);
{
const M256_VECTOR_ALIGN: usize = M256_VECTOR_SIZE - 1;
// Number of bytes from $start_ptr up to the next 32-byte boundary.
let align = M256_VECTOR_SIZE - ($start_ptr as usize & M256_VECTOR_ALIGN);
if align < M256_VECTOR_SIZE {
// Pointer is not already aligned: do one *unaligned* load of the
// first 32 bytes. This reads past the `align` prefix, but those
// extra bytes are re-scanned by the aligned loop below, and
// write_forward! is given `align` so it only acts on the prefix.
let mut mask = {
let a = _mm256_loadu_si256($ptr as *const __m256i);
// movemask packs the per-byte match results into one bit each.
_mm256_movemask_epi8(masking!(a))
};
if mask != 0 {
write_forward!(mask, align);
}
// Advance to the 32-byte boundary so the main loop can use
// aligned loads.
$ptr = $ptr.add(align);
}
}
// Crate-provided unrolled main loop over aligned vectors (may consume
// several vectors per iteration); leaves $ptr where it stopped.
$crate::avx_main_loop!(($len, $ptr, $end_ptr) $($t, )+);
// Mop up any remaining full vectors with aligned loads.
while $ptr <= $end_ptr.sub(M256_VECTOR_SIZE) {
debug_assert_eq!(0, ($ptr as usize) % M256_VECTOR_SIZE);
let mut mask = {
let a = _mm256_load_si256($ptr as *const __m256i);
_mm256_movemask_epi8(masking!(a))
};
if mask != 0 {
write_mask!(mask, $ptr);
}
$ptr = $ptr.add(M256_VECTOR_SIZE);
}
// Fewer than 32 bytes remain past $ptr.
debug_assert!($end_ptr.sub(M256_VECTOR_SIZE) < $ptr);
if $ptr < $end_ptr {
// Tail: load the *last* 32 bytes of the input, i.e. start the load
// `d` bytes before $ptr so it ends exactly at $end_ptr. This is
// safe because the branch above guarantees at least one full
// vector exists; the assert below checks the load stays in range.
let d = M256_VECTOR_SIZE - $crate::sub!($end_ptr, $ptr);
let mut mask = ({
debug_assert_eq!(M256_VECTOR_SIZE, $crate::sub!($end_ptr, $ptr.sub(d)), "Over runs");
let a = _mm256_loadu_si256($ptr.sub(d) as *const __m256i);
_mm256_movemask_epi8(masking!(a))
// Shift out the low `d` mask bits: they correspond to bytes
// before $ptr, which were already handled by earlier loops.
} as u32).wrapping_shr(d as u32);
if mask != 0 {
write_mask!(mask, $ptr);
}
}
}
};
}