1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
use crate::mem as basic;
use crate::RangeError;
#[cfg(target_arch = "x86")]
use std::arch::x86 as mmx;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64 as mmx;
use mmx::__m128i;
use mmx::_mm_set1_epi8;
use mmx::_mm_store_si128;
use mmx::_mm_storeu_si128;
use mmx::__m256i;
use mmx::_mm256_set1_epi8;
use mmx::_mm256_store_si256;
use mmx::_mm256_storeu_si256;
#[inline(always)]
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
pub fn _memset_impl(buf: &mut [u8], c: u8, n: usize) -> Result<(), RangeError> {
#[cfg(target_feature = "avx")]
let r = unsafe { _memset_avx(buf, c, n) };
#[cfg(all(target_feature = "sse2", not(target_feature = "avx")))]
let r = unsafe { _memset_sse2(buf, c, n) };
#[cfg(not(any(target_feature = "sse2", target_feature = "avx")))]
let r = _memset_basic(buf, c, n);
r
}
/// Scalar fallback backend: forwards the arguments unchanged to
/// `basic::_memset_impl` (the `crate::mem` implementation) and returns its
/// result as-is.
fn _memset_basic(buf: &mut [u8], c: u8, n: usize) -> Result<(), RangeError> {
basic::_memset_impl(buf, c, n)
}
/// SSE2 entry point: fills the first `n` bytes of `buf` with `c` by calling
/// `_memset_sse2_impl` inside a function compiled with the `sse2` target
/// feature enabled.
///
/// # Errors
/// Returns whatever `_memset_sse2_impl` returns (`RangeError` when `n` is
/// larger than `buf.len()`).
///
/// # Safety
/// The function is compiled with `#[target_feature(enable = "sse2")]`; the
/// caller must ensure the executing CPU supports SSE2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
pub unsafe fn _memset_sse2(buf: &mut [u8], c: u8, n: usize) -> Result<(), RangeError> {
_memset_sse2_impl(buf, c, n)
}
/// AVX entry point: fills the first `n` bytes of `buf` with `c` by calling
/// `_memset_avx_impl` inside a function compiled with the `avx` target
/// feature enabled.
///
/// # Errors
/// Returns whatever `_memset_avx_impl` returns (`RangeError` when `n` is
/// larger than `buf.len()`).
///
/// # Safety
/// The function is compiled with `#[target_feature(enable = "avx")]`; the
/// caller must ensure the executing CPU supports AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
pub unsafe fn _memset_avx(buf: &mut [u8], c: u8, n: usize) -> Result<(), RangeError> {
_memset_avx_impl(buf, c, n)
}
/// Fills the first `n` bytes of `buf` with `c` using 128-bit stores.
///
/// When `n >= 16`:
/// 1. one unaligned 16-byte store covers the head of the buffer;
/// 2. the cursor is advanced to the next 16-byte boundary (a full 16 bytes
///    if it was already aligned), so some head bytes may be written twice;
/// 3. aligned stores, unrolled eight at a time and then one at a time, cover
///    the body.
///
/// The bytes still left between the cursor and `buf + n` are handed to
/// `basic::_memset_remaining_15_bytes_impl`.
///
/// Loop bounds are computed from the remaining byte count rather than from
/// `end_ptr.sub(..)`: for small `n` that subtraction would produce a pointer
/// before the start of `buf`, which `pointer::sub` does not permit.
///
/// # Errors
/// Returns `RangeError` when `n` is larger than `buf.len()`.
#[inline(always)]
fn _memset_sse2_impl(buf: &mut [u8], c: u8, n: usize) -> Result<(), RangeError> {
    /// Width in bytes of one `__m128i` store.
    const LANE: usize = 16;
    /// Number of stores issued per iteration of the unrolled loop.
    const UNROLL: usize = 8;

    if buf.len() < n {
        return Err(RangeError);
    }
    let mut a_ptr = buf.as_mut_ptr();
    // SAFETY: `n <= buf.len()`, so the result is at most one past the end.
    let end_ptr = unsafe { a_ptr.add(n) };

    if n >= LANE {
        let mcc: __m128i = unsafe { _mm_set1_epi8(c as i8) };

        // Head: unaligned store, then step up to the next 16-byte boundary.
        // `remaining_align` is in 1..=16, which never exceeds `n` here.
        let remaining_align = LANE - ((a_ptr as usize) & (LANE - 1));
        // SAFETY: `n >= LANE`, so 16 bytes starting at `a_ptr` lie in `buf`.
        unsafe { _mm_storeu_si128(a_ptr as *mut __m128i, mcc) };
        // SAFETY: `remaining_align <= LANE <= n`, so the cursor stays <= `end_ptr`.
        a_ptr = unsafe { a_ptr.add(remaining_align) };

        // Body, eight aligned stores per iteration. `a_ptr <= end_ptr` is an
        // invariant, so the subtraction below cannot underflow.
        while (end_ptr as usize) - (a_ptr as usize) >= LANE * UNROLL {
            for i in 0..UNROLL {
                // SAFETY: at least `LANE * UNROLL` bytes remain and `a_ptr`
                // is 16-byte aligned, so every store is in bounds and aligned.
                let aa_ptr = unsafe { a_ptr.add(LANE * i) } as *mut __m128i;
                unsafe { _mm_store_si128(aa_ptr, mcc) };
            }
            a_ptr = unsafe { a_ptr.add(LANE * UNROLL) };
        }

        // Body, one aligned store at a time.
        while (end_ptr as usize) - (a_ptr as usize) >= LANE {
            // SAFETY: at least `LANE` bytes remain and `a_ptr` is aligned.
            unsafe { _mm_store_si128(a_ptr as *mut __m128i, mcc) };
            a_ptr = unsafe { a_ptr.add(LANE) };
        }
    }

    // Broadcast `c` into every byte of a u64 for the scalar tail routine.
    let cc: u64 = u64::from(c) * 0x0101_0101_0101_0101_u64;
    basic::_memset_remaining_15_bytes_impl(a_ptr, cc, end_ptr)
}
/// Fills the first `n` bytes of `buf` with `c` using 256-bit stores, with a
/// single 128-bit store for a 16..32-byte remainder.
///
/// When `n >= 32`:
/// 1. one unaligned 32-byte store covers the head of the buffer;
/// 2. the cursor is advanced to the next 32-byte boundary (a full 32 bytes
///    if it was already aligned), so some head bytes may be written twice;
/// 3. aligned stores, unrolled eight at a time and then one at a time, cover
///    the body.
///
/// Afterwards, if at least 16 bytes remain, one unaligned 16-byte store is
/// issued. The bytes still left between the cursor and `buf + n` are handed
/// to `basic::_memset_remaining_15_bytes_impl`.
///
/// Loop and branch conditions are computed from the remaining byte count
/// rather than from `end_ptr.sub(..)`. For small `n` (including an empty
/// slice) that subtraction would produce a pointer before the start of
/// `buf`, which `pointer::sub` does not permit and which could make the
/// 16-byte store fire on memory that does not belong to `buf`.
///
/// # Errors
/// Returns `RangeError` when `n` is larger than `buf.len()`.
#[inline(always)]
fn _memset_avx_impl(buf: &mut [u8], c: u8, n: usize) -> Result<(), RangeError> {
    /// Width in bytes of one `__m256i` store.
    const WIDE: usize = 32;
    /// Width in bytes of one `__m128i` store.
    const NARROW: usize = 16;
    /// Number of stores issued per iteration of the unrolled loop.
    const UNROLL: usize = 8;

    if buf.len() < n {
        return Err(RangeError);
    }
    let mut a_ptr = buf.as_mut_ptr();
    // SAFETY: `n <= buf.len()`, so the result is at most one past the end.
    let end_ptr = unsafe { a_ptr.add(n) };

    if n >= WIDE {
        let mcc: __m256i = unsafe { _mm256_set1_epi8(c as i8) };

        // Head: unaligned store, then step up to the next 32-byte boundary.
        // `remaining_align` is in 1..=32, which never exceeds `n` here.
        let remaining_align = WIDE - ((a_ptr as usize) & (WIDE - 1));
        // SAFETY: `n >= WIDE`, so 32 bytes starting at `a_ptr` lie in `buf`.
        unsafe { _mm256_storeu_si256(a_ptr as *mut __m256i, mcc) };
        // SAFETY: `remaining_align <= WIDE <= n`, so the cursor stays <= `end_ptr`.
        a_ptr = unsafe { a_ptr.add(remaining_align) };

        // Body, eight aligned stores per iteration. `a_ptr <= end_ptr` is an
        // invariant, so the subtraction below cannot underflow.
        while (end_ptr as usize) - (a_ptr as usize) >= WIDE * UNROLL {
            for i in 0..UNROLL {
                // SAFETY: at least `WIDE * UNROLL` bytes remain and `a_ptr`
                // is 32-byte aligned, so every store is in bounds and aligned.
                let aa_ptr = unsafe { a_ptr.add(WIDE * i) } as *mut __m256i;
                unsafe { _mm256_store_si256(aa_ptr, mcc) };
            }
            a_ptr = unsafe { a_ptr.add(WIDE * UNROLL) };
        }

        // Body, one aligned store at a time.
        while (end_ptr as usize) - (a_ptr as usize) >= WIDE {
            // SAFETY: at least `WIDE` bytes remain and `a_ptr` is aligned.
            unsafe { _mm256_store_si256(a_ptr as *mut __m256i, mcc) };
            a_ptr = unsafe { a_ptr.add(WIDE) };
        }
    }

    // At most one 16-byte chunk can remain here when `n >= 32`; when
    // `16 <= n < 32` this is the first store and `a_ptr` may be unaligned,
    // hence the unaligned store.
    if (end_ptr as usize) - (a_ptr as usize) >= NARROW {
        let mcc: __m128i = unsafe { _mm_set1_epi8(c as i8) };
        // SAFETY: at least `NARROW` bytes remain between `a_ptr` and `end_ptr`.
        unsafe { _mm_storeu_si128(a_ptr as *mut __m128i, mcc) };
        a_ptr = unsafe { a_ptr.add(NARROW) };
    }

    // Broadcast `c` into every byte of a u64 for the scalar tail routine.
    let cc: u64 = u64::from(c) * 0x0101_0101_0101_0101_u64;
    basic::_memset_remaining_15_bytes_impl(a_ptr, cc, end_ptr)
}