// memx/arch/x86/x86_set.rs

1use super::{MMB16Sgl, MMB32Sgl};
2use crate::mem as basic;
3use crate::utils::*;
4
5#[cfg(target_arch = "x86")]
6use core::arch::x86 as mmx;
7#[cfg(target_arch = "x86_64")]
8use core::arch::x86_64 as mmx;
9
10use mmx::__m128i;
11use mmx::_mm_store_si128;
12use mmx::_mm_storeu_si128;
13
14use mmx::__m256i;
15use mmx::_mm256_store_si256;
16use mmx::_mm256_storeu_si256;
17
18#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
19use super::cpuid;
20
21use core::sync::atomic::AtomicPtr;
22use core::sync::atomic::Ordering;
// Signature shared by every memset implementation in this file.
type FuncType = fn(&mut [u8], u8);

// Initial dispatch target: the very first call routes through
// `fnptr_setup_func`, which probes CPU features and replaces this pointer
// with the best implementation for the running machine.
const FUNC: FuncType = fnptr_setup_func;
// Cached implementation, stored as a raw data pointer so it fits in
// `AtomicPtr`. NOTE: the fn pointer itself is the stored value (not a
// pointer *to* a fn pointer), so loads must transmute it back — see
// `_memset_impl`.
static FUNC_PTR_ATOM: AtomicPtr<FuncType> = AtomicPtr::new(FUNC as *mut FuncType);
27
/// Cold one-time dispatcher: selects the fastest memset implementation for
/// the running CPU, caches it in `FUNC_PTR_ATOM`, then performs the
/// requested fill itself. `inline(never)` keeps this setup path out of the
/// hot caller.
#[inline(never)]
fn fnptr_setup_func(buf: &mut [u8], c: u8) {
    // x86_64 guarantees SSE2 as a baseline, so only AVX2 needs a probe.
    #[cfg(target_arch = "x86_64")]
    let func = if cpuid::has_avx2() {
        _memset_avx2
    } else {
        _memset_sse2
    };
    // 32-bit x86 may lack SSE2 entirely; fall back to the scalar version.
    #[cfg(target_arch = "x86")]
    let func = if cpuid::has_avx2() {
        _memset_avx2
    } else if cpuid::has_sse2() {
        _memset_sse2
    } else {
        _memset_basic
    };
    //
    // Publish the selection so later `_memset_impl` calls skip the probe.
    // Relaxed ordering suffices: any racing caller stores the same value.
    FUNC_PTR_ATOM.store(func as *mut FuncType, Ordering::Relaxed);
    // SAFETY: `func` was chosen to match the features this CPU reports.
    unsafe { func(buf, c) }
}
48
/// Hot-path entry point: loads the cached implementation pointer and calls
/// it. The first call lands on `fnptr_setup_func`, which rewrites the
/// pointer for all subsequent calls.
#[inline(always)]
pub(crate) fn _memset_impl(buf: &mut [u8], c: u8) {
    let func_u = FUNC_PTR_ATOM.load(Ordering::Relaxed);
    // The atomic stores the fn pointer itself disguised as a data pointer;
    // transmute it back into a callable fn pointer.
    #[allow(clippy::crosspointer_transmute)]
    let func: FuncType = unsafe { core::mem::transmute(func_u) };
    func(buf, c)
}
56
/// Scalar fallback. Marked `unsafe` only so its type unifies with the SIMD
/// variants in the dispatcher's selection expression; it has no actual
/// safety requirements of its own.
unsafe fn _memset_basic(buf: &mut [u8], c: u8) {
    basic::_memset_impl(buf, c)
}
60
/// SSE2 memset entry point; delegates to the always-inlined implementation
/// so the whole fill is compiled with SSE2 enabled.
///
/// # Safety
/// The running CPU must support SSE2 (always true on x86_64; checked via
/// `cpuid::has_sse2()` on 32-bit x86 by the dispatcher).
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
#[allow(clippy::missing_safety_doc)]
pub unsafe fn _memset_sse2(buf: &mut [u8], c: u8) {
    _memset_sse2_impl(buf, c)
}
67
/// AVX2 memset entry point; delegates to the always-inlined implementation
/// so the whole fill is compiled with AVX2 enabled.
///
/// # Safety
/// The running CPU must support AVX2 (the dispatcher checks
/// `cpuid::has_avx2()` before selecting this function).
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
#[allow(clippy::missing_safety_doc)]
pub unsafe fn _memset_avx2(buf: &mut [u8], c: u8) {
    _memset_avx2_impl(buf, c)
}
74
/// Fills `buf` with `c1` using 16-byte SSE2 stores.
///
/// Strategy for buffers of 16+ bytes: one unaligned store to reach 16-byte
/// alignment, then aligned stores at decreasing unroll factors (16x, 8x,
/// 1x), then a scalar tail of at most 15 bytes.
#[inline(always)]
fn _memset_sse2_impl(buf: &mut [u8], c1: u8) {
    let buf_len = buf.len();
    let mut buf_ptr = buf.as_mut_ptr();
    // one-past-the-end pointer; never dereferenced
    let end_ptr = unsafe { buf_ptr.add(buf_len) };
    //
    if buf_len >= 16 {
        let cc = MMB16Sgl::new(c1);
        // to an aligned pointer
        {
            if !buf_ptr.is_aligned_u128() {
                #[cfg(not(feature = "test_alignment_check"))]
                {
                    // One unaligned 16-byte store covers the misaligned head
                    // (it fits because buf_len >= 16); advancing by
                    // `remaining_align` (< 16) overlaps the first aligned
                    // store, which is harmless for a fill.
                    let remaining_align = 0x10_usize - ((buf_ptr as usize) & 0x0F_usize);
                    unsafe { _set_c16_uu_x1(buf_ptr, cc) };
                    buf_ptr = unsafe { buf_ptr.add(remaining_align) };
                }
                #[cfg(feature = "test_alignment_check")]
                {
                    // Test build: reach alignment byte-by-byte instead.
                    let c = B1Sgl::new(c1);
                    buf_ptr = basic::_set_to_aligned_u128(buf_ptr, c);
                }
            }
        }
        // the loop
        {
            // 256 bytes per iteration while enough data remains
            let unroll = 16;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x16(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            // then 128-byte steps
            let unroll = 8;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x8(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        // NOTE(review): the x4/x2 unroll steps below appear deliberately
        // disabled (perf tuning); the x1 loop covers their range.
        /*
        {
            let unroll = 4;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x4(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            let unroll = 2;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x2(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        */
        {
            // finally single 16-byte steps
            let unroll = 1;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
    }
    //
    // fewer than 16 bytes remain (or the whole buffer was < 16)
    let cc = B8Sgl::new(c1);
    basic::_memset_remaining_15_bytes_impl(buf_ptr, cc, end_ptr)
}
147
/// Fills `buf` with `c1` using 32-byte AVX2 stores.
///
/// Mirrors `_memset_sse2_impl` at 32-byte granularity: align, run aligned
/// stores at decreasing unroll factors (8x, 4x, 1x), mop up one possible
/// 16-byte chunk, then hand the <16-byte tail to the scalar helper.
/// Buffers of 16..=31 bytes take a dedicated single-16-byte-store path.
#[inline(always)]
fn _memset_avx2_impl(buf: &mut [u8], c1: u8) {
    let buf_len = buf.len();
    let mut buf_ptr = buf.as_mut_ptr();
    // one-past-the-end pointer; never dereferenced
    let end_ptr = unsafe { buf_ptr.add(buf_len) };
    //
    if buf_len >= 32 {
        let cc = MMB32Sgl::new(c1);
        // to an aligned pointer
        {
            if !buf_ptr.is_aligned_u256() {
                #[cfg(not(feature = "test_alignment_check"))]
                {
                    // One unaligned 32-byte store covers the misaligned head
                    // (it fits because buf_len >= 32); advancing by
                    // `remaining_align` (< 32) overlaps the next aligned
                    // store, which is harmless for a fill.
                    let remaining_align = 0x20_usize - ((buf_ptr as usize) & 0x1F_usize);
                    unsafe { _set_c32_uu_x1(buf_ptr, cc) };
                    buf_ptr = unsafe { buf_ptr.add(remaining_align) };
                }
                #[cfg(feature = "test_alignment_check")]
                {
                    // Test build: reach alignment byte-by-byte instead.
                    let c = B1Sgl::new(c1);
                    buf_ptr = basic::_set_to_aligned_u256(buf_ptr, c);
                }
            }
        }
        // the loop
        {
            // 256 bytes per iteration while enough data remains
            let unroll = 8;
            let loop_size = 32;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x8(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            // then 128-byte steps
            let unroll = 4;
            let loop_size = 32;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x4(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        // NOTE(review): the x2 unroll step below appears deliberately
        // disabled (perf tuning); the x1 loop covers its range.
        /*
        {
            let unroll = 2;
            let loop_size = 32;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x2(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        */
        {
            // single 32-byte steps
            let unroll = 1;
            let loop_size = 32;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x1(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            // at most one 16-byte chunk can remain; buf_ptr is 32-byte
            // aligned here, hence also 16-byte aligned.
            let cc = MMB16Sgl::new(c1);
            let unroll = 1;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
    } else if buf_len >= 16 {
        // 16..=31 bytes: a single 16-byte store (plus the scalar tail)
        // instead of spinning up 32-byte stores.
        {
            let cc = MMB16Sgl::new(c1);
            let unroll = 1;
            let loop_size = 16;
            if buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                let end_ptr_16_x1 = unsafe { end_ptr.sub(loop_size * unroll) };
                //
                if buf_ptr.is_aligned_u128() {
                    while buf_ptr <= end_ptr_16_x1 {
                        unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                        buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
                    }
                } else {
                    #[cfg(not(feature = "test_alignment_check"))]
                    {
                        // unaligned stores are fine here; no pre-alignment
                        while buf_ptr <= end_ptr_16_x1 {
                            unsafe { _set_c16_uu_x1(buf_ptr, cc) };
                            buf_ptr = unsafe { buf_ptr.add(loop_size) };
                        }
                    }
                    #[cfg(feature = "test_alignment_check")]
                    {
                        // Test build: align byte-by-byte, then aligned store.
                        let c = B1Sgl::new(c1);
                        buf_ptr = basic::_set_to_aligned_u128(buf_ptr, c);
                        while buf_ptr <= end_ptr_16_x1 {
                            unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                            buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
                        }
                    }
                }
            }
        }
    }
    //
    // fewer than 16 bytes remain (or the whole buffer was < 16)
    let cc = B8Sgl::new(c1);
    basic::_memset_remaining_15_bytes_impl(buf_ptr, cc, end_ptr)
}
254
255#[inline(always)]
256unsafe fn _set_c16_uu_x1(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
257    unsafe {
258        _mm_storeu_si128(buf_ptr as *mut __m128i, mm_c16.v1);
259    }
260}
261
262#[inline(always)]
263unsafe fn _set_c16_aa_x1(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
264    unsafe {
265        _mm_store_si128(buf_ptr as *mut __m128i, mm_c16.v1);
266    }
267}
268
269#[inline(always)]
270unsafe fn _set_c16_aa_x2(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
271    unsafe {
272        _mm_store_si128(buf_ptr as *mut __m128i, mm_c16.v1);
273        _mm_store_si128(buf_ptr.add(16) as *mut __m128i, mm_c16.v1);
274    }
275}
276
277#[inline(always)]
278unsafe fn _set_c16_aa_x4(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
279    unsafe {
280        _mm_store_si128(buf_ptr as *mut __m128i, mm_c16.v1);
281        _mm_store_si128(buf_ptr.add(16) as *mut __m128i, mm_c16.v1);
282        _mm_store_si128(buf_ptr.add(16 * 2) as *mut __m128i, mm_c16.v1);
283        _mm_store_si128(buf_ptr.add(16 * 3) as *mut __m128i, mm_c16.v1);
284    }
285}
286
287#[inline(always)]
288unsafe fn _set_c16_aa_x8(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
289    unsafe {
290        _set_c16_aa_x4(buf_ptr, mm_c16);
291        _set_c16_aa_x4(buf_ptr.add(16 * 4), mm_c16);
292    }
293}
294
295#[inline(always)]
296unsafe fn _set_c16_aa_x16(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
297    unsafe {
298        _set_c16_aa_x8(buf_ptr, mm_c16);
299        _set_c16_aa_x8(buf_ptr.add(16 * 8), mm_c16);
300    }
301}
302
303#[inline(always)]
304unsafe fn _set_c32_uu_x1(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
305    unsafe {
306        _mm256_storeu_si256(buf_ptr as *mut __m256i, mm_c32.v1);
307    }
308}
309
310#[inline(always)]
311unsafe fn _set_c32_aa_x1(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
312    unsafe {
313        _mm256_store_si256(buf_ptr as *mut __m256i, mm_c32.v1);
314    }
315}
316
317#[inline(always)]
318unsafe fn _set_c32_aa_x2(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
319    unsafe {
320        _mm256_store_si256(buf_ptr as *mut __m256i, mm_c32.v1);
321        _mm256_store_si256(buf_ptr.add(32) as *mut __m256i, mm_c32.v1);
322    }
323}
324
325#[inline(always)]
326unsafe fn _set_c32_aa_x4(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
327    unsafe {
328        _mm256_store_si256(buf_ptr as *mut __m256i, mm_c32.v1);
329        _mm256_store_si256(buf_ptr.add(32) as *mut __m256i, mm_c32.v1);
330        _mm256_store_si256(buf_ptr.add(32 * 2) as *mut __m256i, mm_c32.v1);
331        _mm256_store_si256(buf_ptr.add(32 * 3) as *mut __m256i, mm_c32.v1);
332    }
333}
334
335#[inline(always)]
336unsafe fn _set_c32_aa_x8(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
337    unsafe {
338        _set_c32_aa_x4(buf_ptr, mm_c32);
339        _set_c32_aa_x4(buf_ptr.add(32 * 4), mm_c32);
340    }
341}
342
// Smoke/disassembly harness: the `#[inline(never)]` wrappers make each
// implementation easy to locate in the test binary's disassembly.
#[cfg(test)]
mod disasm {
    use super::*;
    //
    #[test]
    fn do_procs() {
        let mut a = b"abcdefg".to_vec();
        let a = a.as_mut_slice();
        let c = b'A';
        do_proc_basic(a, c);
        do_proc_sse2(a, c);
        // BUGFIX: calling the AVX2 build on a CPU without AVX2 can execute
        // illegal instructions (SIGILL); probe first, exactly as the
        // runtime dispatcher does.
        if cpuid::has_avx2() {
            do_proc_avx2(a, c);
        }
    }
    #[inline(never)]
    fn do_proc_basic(a: &mut [u8], c: u8) {
        // SAFETY: the scalar path has no ISA requirements.
        unsafe { _memset_basic(a, c) }
    }
    #[inline(never)]
    fn do_proc_sse2(a: &mut [u8], c: u8) {
        // SAFETY: SSE2 is baseline on x86_64 targets this crate supports.
        unsafe { _memset_sse2(a, c) }
    }
    #[inline(never)]
    fn do_proc_avx2(a: &mut [u8], c: u8) {
        // SAFETY: the caller checks `cpuid::has_avx2()` before calling.
        unsafe { _memset_avx2(a, c) }
    }
}