1use super::{MMB16Sgl, MMB32Sgl};
2use crate::mem as basic;
3use crate::utils::*;
4
5#[cfg(target_arch = "x86")]
6use core::arch::x86 as mmx;
7#[cfg(target_arch = "x86_64")]
8use core::arch::x86_64 as mmx;
9
10use mmx::__m128i;
11use mmx::_mm_store_si128;
12use mmx::_mm_storeu_si128;
13
14use mmx::__m256i;
15use mmx::_mm256_store_si256;
16use mmx::_mm256_storeu_si256;
17
18#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
19use super::cpuid;
20
21use core::sync::atomic::AtomicPtr;
22use core::sync::atomic::Ordering;
// Signature shared by every memset implementation in this module.
type FuncType = fn(&mut [u8], u8);

// Initial dispatch target: the first call runs CPU-feature detection in
// `fnptr_setup_func` and replaces the stored pointer with the best variant.
const FUNC: FuncType = fnptr_setup_func;
// The selected function pointer, stored as a data pointer so it fits in an
// `AtomicPtr`; readers transmute it back to `FuncType` (see `_memset_impl`).
// NOTE(review): fn-pointer <-> data-pointer round-trips are not formally
// guaranteed by the Rust abstract machine, though they work on the x86
// targets this module is gated to — confirm against project policy.
static FUNC_PTR_ATOM: AtomicPtr<FuncType> = AtomicPtr::new(FUNC as *mut FuncType);
27
// One-shot initializer: detects the best available SIMD implementation,
// caches it in FUNC_PTR_ATOM so later calls dispatch directly, then performs
// the requested memset itself so the first caller also gets its work done.
#[inline(never)]
fn fnptr_setup_func(buf: &mut [u8], c: u8) {
    // x86_64 has SSE2 as an architectural baseline, so only AVX2 is probed.
    #[cfg(target_arch = "x86_64")]
    let func = if cpuid::has_avx2() {
        _memset_avx2
    } else {
        _memset_sse2
    };
    // 32-bit x86 may lack SSE2 entirely; fall back to the scalar version.
    #[cfg(target_arch = "x86")]
    let func = if cpuid::has_avx2() {
        _memset_avx2
    } else if cpuid::has_sse2() {
        _memset_sse2
    } else {
        _memset_basic
    };
    // Relaxed is sufficient: a racing reader either sees the old setup
    // function (and harmlessly redoes detection) or the new pointer.
    FUNC_PTR_ATOM.store(func as *mut FuncType, Ordering::Relaxed);
    // SAFETY: the required target feature for `func` was verified above
    // (or is baseline for this architecture).
    unsafe { func(buf, c) }
}
48
49#[inline(always)]
50pub(crate) fn _memset_impl(buf: &mut [u8], c: u8) {
51 let func_u = FUNC_PTR_ATOM.load(Ordering::Relaxed);
52 #[allow(clippy::crosspointer_transmute)]
53 let func: FuncType = unsafe { core::mem::transmute(func_u) };
54 func(buf, c)
55}
56
// Scalar fallback used when no SIMD feature is available (selected at
// runtime only on 32-bit x86 without SSE2; also called directly by tests).
// `unsafe fn` only to match the signature shape of the SIMD variants.
unsafe fn _memset_basic(buf: &mut [u8], c: u8) {
    basic::_memset_impl(buf, c)
}
60
/// SSE2 entry point: fills `buf` with byte `c` using 16-byte SIMD stores.
///
/// # Safety
///
/// The caller must ensure the CPU actually supports SSE2 before calling
/// (the dispatcher checks this via `cpuid`).
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
#[allow(clippy::missing_safety_doc)]
pub unsafe fn _memset_sse2(buf: &mut [u8], c: u8) {
    _memset_sse2_impl(buf, c)
}
67
/// AVX2 entry point: fills `buf` with byte `c` using 32-byte SIMD stores.
///
/// # Safety
///
/// The caller must ensure the CPU actually supports AVX2 before calling
/// (the dispatcher checks this via `cpuid`).
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
#[allow(clippy::missing_safety_doc)]
pub unsafe fn _memset_avx2(buf: &mut [u8], c: u8) {
    _memset_avx2_impl(buf, c)
}
74
// Core SSE2 memset: fill `buf` with byte `c1`.
//
// Strategy: align the pointer to 16 bytes with one unaligned store, then
// drain the buffer with progressively smaller blocks of aligned 16-byte
// stores (256B, 128B, 16B), and finish the last <16 bytes with scalar code.
#[inline(always)]
fn _memset_sse2_impl(buf: &mut [u8], c1: u8) {
    let buf_len = buf.len();
    let mut buf_ptr = buf.as_mut_ptr();
    let end_ptr = unsafe { buf_ptr.add(buf_len) };
    if buf_len >= 16 {
        // `c1` broadcast into a 128-bit SIMD value.
        let cc = MMB16Sgl::new(c1);
        {
            // Bring buf_ptr up to a 16-byte boundary for the aligned loops.
            if !buf_ptr.is_aligned_u128() {
                #[cfg(not(feature = "test_alignment_check"))]
                {
                    // One unaligned 16-byte store covers the misaligned head
                    // (in bounds because buf_len >= 16); the pointer then
                    // jumps to the next boundary, and the overlapped bytes
                    // are simply rewritten with the same value later.
                    let remaining_align = 0x10_usize - ((buf_ptr as usize) & 0x0F_usize);
                    unsafe { _set_c16_uu_x1(buf_ptr, cc) };
                    buf_ptr = unsafe { buf_ptr.add(remaining_align) };
                }
                #[cfg(feature = "test_alignment_check")]
                {
                    // Byte-wise alignment path kept for alignment test builds.
                    let c = B1Sgl::new(c1);
                    buf_ptr = basic::_set_to_aligned_u128(buf_ptr, c);
                }
            }
        }
        {
            // 256 bytes per iteration (16 stores of 16 bytes).
            let unroll = 16;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x16(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            // 128 bytes per iteration.
            let unroll = 8;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x8(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            // Single 16-byte stores for what remains.
            let unroll = 1;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
    }
    // Scalar tail: at most 15 bytes left between buf_ptr and end_ptr.
    let cc = B8Sgl::new(c1);
    basic::_memset_remaining_15_bytes_impl(buf_ptr, cc, end_ptr)
}
147
// Core AVX2 memset: fill `buf` with byte `c1`.
//
// Strategy: for >=32-byte buffers, align to 32 bytes with one unaligned
// store, drain with aligned 32-byte store blocks (256B, 128B, 32B), then a
// 16-byte store if possible; 16..=31-byte buffers use the SSE2-width path;
// the final <16 bytes are written by scalar code.
#[inline(always)]
fn _memset_avx2_impl(buf: &mut [u8], c1: u8) {
    let buf_len = buf.len();
    let mut buf_ptr = buf.as_mut_ptr();
    let end_ptr = unsafe { buf_ptr.add(buf_len) };
    if buf_len >= 32 {
        // `c1` broadcast into a 256-bit SIMD value.
        let cc = MMB32Sgl::new(c1);
        {
            // Bring buf_ptr up to a 32-byte boundary for the aligned loops.
            if !buf_ptr.is_aligned_u256() {
                #[cfg(not(feature = "test_alignment_check"))]
                {
                    // One unaligned 32-byte store covers the misaligned head
                    // (in bounds because buf_len >= 32); the pointer then
                    // jumps to the next boundary, and the overlapped bytes
                    // are rewritten with the same value later.
                    let remaining_align = 0x20_usize - ((buf_ptr as usize) & 0x1F_usize);
                    unsafe { _set_c32_uu_x1(buf_ptr, cc) };
                    buf_ptr = unsafe { buf_ptr.add(remaining_align) };
                }
                #[cfg(feature = "test_alignment_check")]
                {
                    // Byte-wise alignment path kept for alignment test builds.
                    let c = B1Sgl::new(c1);
                    buf_ptr = basic::_set_to_aligned_u256(buf_ptr, c);
                }
            }
        }
        {
            // 256 bytes per iteration (8 stores of 32 bytes).
            let unroll = 8;
            let loop_size = 32;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x8(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            // 128 bytes per iteration.
            let unroll = 4;
            let loop_size = 32;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x4(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            // Single 32-byte stores.
            let unroll = 1;
            let loop_size = 32;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x1(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            // One final 16-byte store if at least 16 bytes remain; buf_ptr is
            // still 32-byte aligned here, hence 16-byte aligned.
            let cc = MMB16Sgl::new(c1);
            let unroll = 1;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
    } else if buf_len >= 16 {
        // 16..=31 bytes: not worth a 256-bit store; use 16-byte stores.
        {
            let cc = MMB16Sgl::new(c1);
            let unroll = 1;
            let loop_size = 16;
            if buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                // Last position where a full 16-byte store still fits.
                let end_ptr_16_x1 = unsafe { end_ptr.sub(loop_size * unroll) };
                if buf_ptr.is_aligned_u128() {
                    while buf_ptr <= end_ptr_16_x1 {
                        unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                        buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
                    }
                } else {
                    #[cfg(not(feature = "test_alignment_check"))]
                    {
                        // Misaligned and short: unaligned stores are cheaper
                        // than an alignment fixup here.
                        while buf_ptr <= end_ptr_16_x1 {
                            unsafe { _set_c16_uu_x1(buf_ptr, cc) };
                            buf_ptr = unsafe { buf_ptr.add(loop_size) };
                        }
                    }
                    #[cfg(feature = "test_alignment_check")]
                    {
                        // Byte-wise alignment path kept for test builds.
                        let c = B1Sgl::new(c1);
                        buf_ptr = basic::_set_to_aligned_u128(buf_ptr, c);
                        while buf_ptr <= end_ptr_16_x1 {
                            unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                            buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
                        }
                    }
                }
            }
        }
    }
    // Scalar tail: at most 15 bytes left between buf_ptr and end_ptr.
    let cc = B8Sgl::new(c1);
    basic::_memset_remaining_15_bytes_impl(buf_ptr, cc, end_ptr)
}
254
// Store 16 bytes of `mm_c16` at `buf_ptr` with an UNALIGNED store.
//
// SAFETY: caller must guarantee at least 16 writable bytes at `buf_ptr`.
#[inline(always)]
unsafe fn _set_c16_uu_x1(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
    unsafe {
        _mm_storeu_si128(buf_ptr as *mut __m128i, mm_c16.v1);
    }
}
261
// Store 16 bytes of `mm_c16` at `buf_ptr` with an ALIGNED store.
//
// SAFETY: caller must guarantee `buf_ptr` is 16-byte aligned and at least
// 16 bytes are writable.
#[inline(always)]
unsafe fn _set_c16_aa_x1(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
    unsafe {
        _mm_store_si128(buf_ptr as *mut __m128i, mm_c16.v1);
    }
}
268
269#[inline(always)]
270unsafe fn _set_c16_aa_x2(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
271 unsafe {
272 _mm_store_si128(buf_ptr as *mut __m128i, mm_c16.v1);
273 _mm_store_si128(buf_ptr.add(16) as *mut __m128i, mm_c16.v1);
274 }
275}
276
277#[inline(always)]
278unsafe fn _set_c16_aa_x4(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
279 unsafe {
280 _mm_store_si128(buf_ptr as *mut __m128i, mm_c16.v1);
281 _mm_store_si128(buf_ptr.add(16) as *mut __m128i, mm_c16.v1);
282 _mm_store_si128(buf_ptr.add(16 * 2) as *mut __m128i, mm_c16.v1);
283 _mm_store_si128(buf_ptr.add(16 * 3) as *mut __m128i, mm_c16.v1);
284 }
285}
286
287#[inline(always)]
288unsafe fn _set_c16_aa_x8(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
289 unsafe {
290 _set_c16_aa_x4(buf_ptr, mm_c16);
291 _set_c16_aa_x4(buf_ptr.add(16 * 4), mm_c16);
292 }
293}
294
295#[inline(always)]
296unsafe fn _set_c16_aa_x16(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
297 unsafe {
298 _set_c16_aa_x8(buf_ptr, mm_c16);
299 _set_c16_aa_x8(buf_ptr.add(16 * 8), mm_c16);
300 }
301}
302
// Store 32 bytes of `mm_c32` at `buf_ptr` with an UNALIGNED store.
//
// SAFETY: caller must guarantee at least 32 writable bytes at `buf_ptr`.
#[inline(always)]
unsafe fn _set_c32_uu_x1(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
    unsafe {
        _mm256_storeu_si256(buf_ptr as *mut __m256i, mm_c32.v1);
    }
}
309
// Store 32 bytes of `mm_c32` at `buf_ptr` with an ALIGNED store.
//
// SAFETY: caller must guarantee `buf_ptr` is 32-byte aligned and at least
// 32 bytes are writable.
#[inline(always)]
unsafe fn _set_c32_aa_x1(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
    unsafe {
        _mm256_store_si256(buf_ptr as *mut __m256i, mm_c32.v1);
    }
}
316
317#[inline(always)]
318unsafe fn _set_c32_aa_x2(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
319 unsafe {
320 _mm256_store_si256(buf_ptr as *mut __m256i, mm_c32.v1);
321 _mm256_store_si256(buf_ptr.add(32) as *mut __m256i, mm_c32.v1);
322 }
323}
324
325#[inline(always)]
326unsafe fn _set_c32_aa_x4(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
327 unsafe {
328 _mm256_store_si256(buf_ptr as *mut __m256i, mm_c32.v1);
329 _mm256_store_si256(buf_ptr.add(32) as *mut __m256i, mm_c32.v1);
330 _mm256_store_si256(buf_ptr.add(32 * 2) as *mut __m256i, mm_c32.v1);
331 _mm256_store_si256(buf_ptr.add(32 * 3) as *mut __m256i, mm_c32.v1);
332 }
333}
334
335#[inline(always)]
336unsafe fn _set_c32_aa_x8(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
337 unsafe {
338 _set_c32_aa_x4(buf_ptr, mm_c32);
339 _set_c32_aa_x4(buf_ptr.add(32 * 4), mm_c32);
340 }
341}
342
// Smoke tests whose wrappers are #[inline(never)] so each implementation
// variant stays a distinct symbol and can be inspected in a disassembler.
#[cfg(test)]
mod disasm {
    use super::*;
    // Run every implementation on a small buffer; relies on the test host
    // supporting SSE2/AVX2 (NOTE(review): no cpuid guard here — confirm
    // test machines always have AVX2).
    #[test]
    fn do_procs() {
        let mut a = b"abcdefg".to_vec();
        let a = a.as_mut_slice();
        let c = b'A';
        do_proc_basic(a, c);
        do_proc_sse2(a, c);
        do_proc_avx2(a, c);
    }
    #[inline(never)]
    fn do_proc_basic(a: &mut [u8], c: u8) {
        unsafe { _memset_basic(a, c) }
    }
    #[inline(never)]
    fn do_proc_sse2(a: &mut [u8], c: u8) {
        unsafe { _memset_sse2(a, c) }
    }
    #[inline(never)]
    fn do_proc_avx2(a: &mut [u8], c: u8) {
        unsafe { _memset_avx2(a, c) }
    }
}