// memx/arch/x86/x86_set.rs

1use super::{MMB16Sgl, MMB32Sgl};
2use crate::mem as basic;
3use crate::utils::*;
4
5#[cfg(target_arch = "x86")]
6use core::arch::x86 as mmx;
7#[cfg(target_arch = "x86_64")]
8use core::arch::x86_64 as mmx;
9
10use mmx::__m128i;
11use mmx::_mm_store_si128;
12use mmx::_mm_storeu_si128;
13
14use mmx::__m256i;
15use mmx::_mm256_store_si256;
16use mmx::_mm256_storeu_si256;
17
18#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
19use super::cpuid;
20
21use core::sync::atomic::AtomicPtr;
22use core::sync::atomic::Ordering;
/// Signature shared by every memset implementation in this module.
type FuncType = fn(&mut [u8], u8);

// The dispatcher initially points at the setup routine; the first call
// replaces it with the best implementation for the running CPU.
const FUNC: FuncType = fnptr_setup_func;
static FUNC_PTR_ATOM: AtomicPtr<FuncType> = AtomicPtr::new(FUNC as *mut FuncType);
27
/// One-time initializer installed as the initial memset function pointer:
/// probes the CPU once, caches the best implementation in `FUNC_PTR_ATOM`,
/// then services the current call itself.
#[inline(never)]
fn fnptr_setup_func(buf: &mut [u8], c: u8) {
    // x86_64 guarantees SSE2 as an architectural baseline, so only AVX2
    // needs a runtime probe here.
    #[cfg(target_arch = "x86_64")]
    let func = if cpuid::has_avx2() {
        _memset_avx2
    } else {
        _memset_sse2
    };
    // 32-bit x86 may lack SSE2 entirely, so a scalar fallback is needed.
    #[cfg(target_arch = "x86")]
    let func = if cpuid::has_avx2() {
        _memset_avx2
    } else if cpuid::has_sse2() {
        _memset_sse2
    } else {
        _memset_basic
    };
    //
    // Relaxed suffices: a racing caller either sees the new pointer or this
    // setup function again, and re-running the probe is harmless.
    FUNC_PTR_ATOM.store(func as *mut FuncType, Ordering::Relaxed);
    // SAFETY: `func` was selected above to match the features this CPU
    // reported via `cpuid`.
    unsafe { func(buf, c) }
}
48
/// Dispatches memset through the cached function pointer.
///
/// On the very first call the pointer still holds `fnptr_setup_func`,
/// which performs CPU detection, swaps the pointer, and handles the call.
#[inline(always)]
pub(crate) fn _memset_impl(buf: &mut [u8], c: u8) {
    let func_u = FUNC_PTR_ATOM.load(Ordering::Relaxed);
    #[allow(clippy::crosspointer_transmute)]
    // SAFETY: the atomic only ever holds values produced by casting a
    // `FuncType` (see `FUNC` and `fnptr_setup_func`), so transmuting the
    // raw pointer back into a function pointer is sound.
    let func: FuncType = unsafe { core::mem::transmute(func_u) };
    func(buf, c)
}
56
/// Scalar fallback: forwards to the portable implementation in `crate::mem`.
/// Declared `unsafe` only so its signature matches the SIMD variants that
/// share the dispatch function pointer.
unsafe fn _memset_basic(buf: &mut [u8], c: u8) {
    basic::_memset_impl(buf, c)
}
60
61#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
62#[target_feature(enable = "sse2")]
63#[allow(clippy::missing_safety_doc)]
64pub unsafe fn _memset_sse2(buf: &mut [u8], c: u8) {
65    _memset_sse2_impl(buf, c)
66}
67
68#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
69#[target_feature(enable = "avx2")]
70#[allow(clippy::missing_safety_doc)]
71pub unsafe fn _memset_avx2(buf: &mut [u8], c: u8) {
72    _memset_avx2_impl(buf, c)
73}
74
/// SSE2 memset: fills `buf` with `c1` using 16-byte stores — an unaligned
/// head, an unrolled aligned main loop, and a scalar tail.
#[inline(always)]
fn _memset_sse2_impl(buf: &mut [u8], c1: u8) {
    let buf_len = buf.len();
    let mut buf_ptr = buf.as_mut_ptr();
    // One-past-the-end pointer; `add(buf_len)` stays within the slice's
    // allocation, as required for pointer arithmetic.
    let end_ptr = unsafe { buf_ptr.add(buf_len) };
    //
    if buf_len >= 16 {
        // 16-byte SIMD value with `c1` replicated into every lane.
        let cc = MMB16Sgl::new(c1);
        // advance to a 16-byte aligned pointer
        {
            if !buf_ptr.is_aligned_u128() {
                #[cfg(not(feature = "test_alignment_check"))]
                {
                    // Write one unaligned 16-byte chunk, then bump the
                    // pointer to the next 16-byte boundary. The bytes before
                    // the boundary overlap the first aligned store — fine
                    // for memset. Safe because buf_len >= 16 (checked above).
                    let remaining_align = 0x10_usize - ((buf_ptr as usize) & 0x0F_usize);
                    unsafe { _set_c16_uu_x1(buf_ptr, cc) };
                    buf_ptr = unsafe { buf_ptr.add(remaining_align) };
                }
                #[cfg(feature = "test_alignment_check")]
                {
                    // Test build: align byte-by-byte via the basic path so
                    // alignment violations are detectable.
                    let c = B1Sgl::new(c1);
                    buf_ptr = basic::_set_to_aligned_u128(buf_ptr, c);
                }
            }
        }
        // the main loop: progressively smaller unroll factors
        {
            let unroll = 16;
            let loop_size = 16;
            // 256 bytes per iteration while at least that much remains.
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x16(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            let unroll = 8;
            let loop_size = 16;
            // 128 bytes per iteration.
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x8(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        // The x4/x2 unroll levels are intentionally disabled; the x1 loop
        // below covers the remaining 16..128-byte span.
        /*
        {
            let unroll = 4;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x4(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            let unroll = 2;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x2(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        */
        {
            let unroll = 1;
            let loop_size = 16;
            // 16 bytes per iteration until fewer than 16 remain.
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
    }
    //
    // Scalar tail: at most 15 bytes left between buf_ptr and end_ptr.
    let cc = B8Sgl::new(c1);
    basic::_memset_remaining_15_bytes_impl(buf_ptr, cc, end_ptr)
}
147
/// AVX2 memset: fills `buf` with `c1` using 32-byte stores — an unaligned
/// head, an unrolled aligned main loop, then 16-byte and scalar tails. A
/// dedicated SSE2-style path handles buffers of 16..32 bytes.
#[inline(always)]
fn _memset_avx2_impl(buf: &mut [u8], c1: u8) {
    let buf_len = buf.len();
    let mut buf_ptr = buf.as_mut_ptr();
    // One-past-the-end pointer; `add(buf_len)` stays within the slice.
    let end_ptr = unsafe { buf_ptr.add(buf_len) };
    //
    if buf_len >= 32 {
        // 32-byte SIMD value with `c1` replicated into every lane.
        let cc = MMB32Sgl::new(c1);
        // advance to a 32-byte aligned pointer
        {
            if !buf_ptr.is_aligned_u256() {
                #[cfg(not(feature = "test_alignment_check"))]
                {
                    // One unaligned 32-byte store, then bump to the next
                    // 32-byte boundary; the overlap is harmless for memset.
                    // Safe because buf_len >= 32 (checked above).
                    let remaining_align = 0x20_usize - ((buf_ptr as usize) & 0x1F_usize);
                    unsafe { _set_c32_uu_x1(buf_ptr, cc) };
                    buf_ptr = unsafe { buf_ptr.add(remaining_align) };
                }
                #[cfg(feature = "test_alignment_check")]
                {
                    // Test build: align byte-by-byte via the basic path.
                    let c = B1Sgl::new(c1);
                    buf_ptr = basic::_set_to_aligned_u256(buf_ptr, c);
                }
            }
        }
        // the main loop: progressively smaller unroll factors
        {
            let unroll = 8;
            let loop_size = 32;
            // 256 bytes per iteration while at least that much remains.
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x8(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            let unroll = 4;
            let loop_size = 32;
            // 128 bytes per iteration.
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x4(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        // The x2 unroll level is intentionally disabled; the x1 loop below
        // covers the remaining 32..128-byte span.
        /*
        {
            let unroll = 2;
            let loop_size = 32;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x2(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        */
        {
            let unroll = 1;
            let loop_size = 32;
            // 32 bytes per iteration until fewer than 32 remain.
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c32_aa_x1(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
        {
            // 16-byte tail: a 32-byte-aligned pointer is also 16-byte
            // aligned, so the aligned 128-bit store is valid here.
            let cc = MMB16Sgl::new(c1);
            let unroll = 1;
            let loop_size = 16;
            while buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
            }
        }
    } else if buf_len >= 16 {
        // Medium buffers (16..32 bytes): a single 16-byte store pass.
        {
            let cc = MMB16Sgl::new(c1);
            let unroll = 1;
            let loop_size = 16;
            if buf_ptr.is_not_over(end_ptr, loop_size * unroll) {
                // Last position at which a full 16-byte store still fits.
                let end_ptr_16_x1 = unsafe { end_ptr.sub(loop_size * unroll) };
                //
                if buf_ptr.is_aligned_u128() {
                    while buf_ptr <= end_ptr_16_x1 {
                        unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                        buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
                    }
                } else {
                    #[cfg(not(feature = "test_alignment_check"))]
                    {
                        // Unaligned stores are acceptable on this short path.
                        while buf_ptr <= end_ptr_16_x1 {
                            unsafe { _set_c16_uu_x1(buf_ptr, cc) };
                            buf_ptr = unsafe { buf_ptr.add(loop_size) };
                        }
                    }
                    #[cfg(feature = "test_alignment_check")]
                    {
                        // Test build: align first, then use aligned stores.
                        let c = B1Sgl::new(c1);
                        buf_ptr = basic::_set_to_aligned_u128(buf_ptr, c);
                        while buf_ptr <= end_ptr_16_x1 {
                            unsafe { _set_c16_aa_x1(buf_ptr, cc) };
                            buf_ptr = unsafe { buf_ptr.add(loop_size * unroll) };
                        }
                    }
                }
            }
        }
    }
    //
    // Scalar tail: at most 15 bytes left between buf_ptr and end_ptr.
    let cc = B8Sgl::new(c1);
    basic::_memset_remaining_15_bytes_impl(buf_ptr, cc, end_ptr)
}
254
255#[inline(always)]
256unsafe fn _set_c16_uu_x1(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
257    _mm_storeu_si128(buf_ptr as *mut __m128i, mm_c16.v1);
258}
259
260#[inline(always)]
261unsafe fn _set_c16_aa_x1(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
262    _mm_store_si128(buf_ptr as *mut __m128i, mm_c16.v1);
263}
264
265#[inline(always)]
266unsafe fn _set_c16_aa_x2(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
267    _mm_store_si128(buf_ptr as *mut __m128i, mm_c16.v1);
268    _mm_store_si128(buf_ptr.add(16) as *mut __m128i, mm_c16.v1);
269}
270
271#[inline(always)]
272unsafe fn _set_c16_aa_x4(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
273    _mm_store_si128(buf_ptr as *mut __m128i, mm_c16.v1);
274    _mm_store_si128(buf_ptr.add(16) as *mut __m128i, mm_c16.v1);
275    _mm_store_si128(buf_ptr.add(16 * 2) as *mut __m128i, mm_c16.v1);
276    _mm_store_si128(buf_ptr.add(16 * 3) as *mut __m128i, mm_c16.v1);
277}
278
279#[inline(always)]
280unsafe fn _set_c16_aa_x8(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
281    _set_c16_aa_x4(buf_ptr, mm_c16);
282    _set_c16_aa_x4(buf_ptr.add(16 * 4), mm_c16);
283}
284
285#[inline(always)]
286unsafe fn _set_c16_aa_x16(buf_ptr: *mut u8, mm_c16: MMB16Sgl) {
287    _set_c16_aa_x8(buf_ptr, mm_c16);
288    _set_c16_aa_x8(buf_ptr.add(16 * 8), mm_c16);
289}
290
291#[inline(always)]
292unsafe fn _set_c32_uu_x1(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
293    _mm256_storeu_si256(buf_ptr as *mut __m256i, mm_c32.v1);
294}
295
296#[inline(always)]
297unsafe fn _set_c32_aa_x1(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
298    _mm256_store_si256(buf_ptr as *mut __m256i, mm_c32.v1);
299}
300
301#[inline(always)]
302unsafe fn _set_c32_aa_x2(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
303    _mm256_store_si256(buf_ptr as *mut __m256i, mm_c32.v1);
304    _mm256_store_si256(buf_ptr.add(32) as *mut __m256i, mm_c32.v1);
305}
306
307#[inline(always)]
308unsafe fn _set_c32_aa_x4(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
309    _mm256_store_si256(buf_ptr as *mut __m256i, mm_c32.v1);
310    _mm256_store_si256(buf_ptr.add(32) as *mut __m256i, mm_c32.v1);
311    _mm256_store_si256(buf_ptr.add(32 * 2) as *mut __m256i, mm_c32.v1);
312    _mm256_store_si256(buf_ptr.add(32 * 3) as *mut __m256i, mm_c32.v1);
313}
314
315#[inline(always)]
316unsafe fn _set_c32_aa_x8(buf_ptr: *mut u8, mm_c32: MMB32Sgl) {
317    _set_c32_aa_x4(buf_ptr, mm_c32);
318    _set_c32_aa_x4(buf_ptr.add(32 * 4), mm_c32);
319}
320
// Smoke/disassembly helpers: each wrapper is `#[inline(never)]` so the
// generated code of every memset variant can be inspected in isolation.
#[cfg(test)]
mod disasm {
    use super::*;
    //
    #[test]
    fn do_procs() {
        let mut storage = b"abcdefg".to_vec();
        let buf = storage.as_mut_slice();
        let fill = b'A';
        do_proc_basic(buf, fill);
        // NOTE(review): these assume the test host supports SSE2/AVX2 —
        // running them on an older CPU would be undefined behavior.
        do_proc_sse2(buf, fill);
        do_proc_avx2(buf, fill);
    }
    #[inline(never)]
    fn do_proc_basic(a: &mut [u8], c: u8) {
        unsafe { _memset_basic(a, c) }
    }
    #[inline(never)]
    fn do_proc_sse2(a: &mut [u8], c: u8) {
        unsafe { _memset_sse2(a, c) }
    }
    #[inline(never)]
    fn do_proc_avx2(a: &mut [u8], c: u8) {
        unsafe { _memset_avx2(a, c) }
    }
}