1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
// Generates an SSE2-accelerated byte-scanning loop over `[$start_ptr, $end_ptr)`.
//
// The expansion processes the range in 16-byte (`__m128i`) vectors in three
// phases: an unaligned head (to reach 16-byte alignment), an aligned main
// loop, and an overlapping unaligned tail. Each vector is tested with the
// caller-supplied `masking!` macro and hits are dispatched via
// `write_forward!` / `write_mask!`.
//
// NOTE(review): this macro expands references to `fallback!`, `masking!`,
// `write_forward!`, `write_mask!`, `$crate::translations_128!` and
// `$crate::sub!`, all of which must be in scope at the expansion site —
// their exact semantics are not visible in this file; the comments below
// describe only what the visible code establishes.
//
// Inputs (matched as identifiers already bound at the call site):
// - `$len`       — length of the range in bytes.
// - `$ptr`       — mutable cursor pointer, advanced through the range.
// - `$start_ptr` — start of the range (used only for alignment math here).
// - `$end_ptr`   — one-past-the-end pointer of the range.
// - `$($t,)+`    — token list forwarded verbatim to `translations_128!`.
#[macro_export]
macro_rules! loop_range_switch_sse2 {
(($len:ident, $ptr:ident, $start_ptr:ident, $end_ptr:ident) $($t:tt, )+) => {
use std::arch::x86_64::{__m128i, _mm_load_si128, _mm_loadu_si128, _mm_movemask_epi8};
// 16: byte width of one SSE2 vector.
const M128_VECTOR_SIZE: usize = std::mem::size_of::<__m128i>();
// Low-bit mask (15) used to test/compute 16-byte alignment of pointers.
const M128_VECTOR_ALIGN: usize = M128_VECTOR_SIZE - 1;
// Ranges shorter than one vector cannot use the SIMD path at all.
if $len < M128_VECTOR_SIZE {
fallback!();
} else {
// Expands per-byte translation tables / masks for the tokens given;
// presumably defines `masking!` and friends used below — defined elsewhere.
$crate::translations_128!($($t, )+);
{
// Head phase: number of bytes needed to advance `$ptr` to the next
// 16-byte boundary. If `$start_ptr` is already aligned this is 16
// and the head phase is skipped.
let align = M128_VECTOR_SIZE - ($start_ptr as usize & M128_VECTOR_ALIGN);
if align < M128_VECTOR_SIZE {
let mut mask = {
// Unaligned load is required here: `$ptr` is not yet aligned.
let a = _mm_loadu_si128($ptr as *const __m128i);
_mm_movemask_epi8(masking!(a))
};
if mask != 0 {
// NOTE(review): `write_forward!` presumably only consumes the low
// `align` mask bits, since bytes past the boundary are re-scanned
// by the main loop below — confirm against its definition.
write_forward!(mask, align);
}
// Jump to the 16-byte boundary; the main loop can use aligned loads.
$ptr = $ptr.add(align);
}
}
// Main phase: aligned 16-byte steps while a full vector remains.
while $ptr <= $end_ptr.sub(M128_VECTOR_SIZE) {
debug_assert_eq!(0, ($ptr as usize) % M128_VECTOR_SIZE);
let mut mask = {
// Aligned load is valid: the loop invariant above guarantees alignment.
let a = _mm_load_si128($ptr as *const __m128i);
_mm_movemask_epi8(masking!(a))
};
if mask != 0 {
// Bit i of `mask` corresponds to byte `$ptr + i`.
write_mask!(mask, $ptr);
}
$ptr = $ptr.add(M128_VECTOR_SIZE);
}
// Fewer than 16 bytes remain past `$ptr`.
debug_assert!($end_ptr.sub(M128_VECTOR_SIZE) < $ptr);
if $ptr < $end_ptr {
// Tail phase: load the LAST 16 bytes of the range (overlapping the
// `d` bytes already processed) so no read goes past `$end_ptr`.
let d = M128_VECTOR_SIZE - $crate::sub!($end_ptr, $ptr);
let mut mask = ({
// The overlapping load at `$ptr - d` ends exactly at `$end_ptr`.
debug_assert_eq!(M128_VECTOR_SIZE, $crate::sub!($end_ptr, $ptr.sub(d)));
let a = _mm_loadu_si128($ptr.sub(d) as *const __m128i);
_mm_movemask_epi8(masking!(a))
// Discard the low `d` mask bits: they belong to bytes before `$ptr`
// that the main loop already handled. After the shift, bit i again
// corresponds to byte `$ptr + i`. `u16` cast keeps only the 16
// meaningful movemask bits; `d` is in 1..=15 so the shift is in range.
} as u16).wrapping_shr(d as u32);
if mask != 0 {
write_mask!(mask, $ptr);
}
}
}
};
}