atomic_maybe_uninit/arch/x86.rs

// SPDX-License-Identifier: Apache-2.0 OR MIT

/*
x86 and x86_64

Refs:
- Intel® 64 and IA-32 Architectures Software Developer Manuals
  https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html
- x86 and amd64 instruction reference
  https://www.felixcloutier.com/x86
- portable-atomic
  https://github.com/taiki-e/portable-atomic

Generated asm:
- x86_64 https://godbolt.org/z/xKzj4WcaE
- x86_64 (+cmpxchg16b) https://godbolt.org/z/jzMoM9nhq
- x86_64 (+cmpxchg16b,+avx) https://godbolt.org/z/6TnxM5hnj
- x86 (i686) https://godbolt.org/z/sM6MPjYWf
- x86 (i686,-sse2) https://godbolt.org/z/MsrxfbcMG
- x86 (i586) https://godbolt.org/z/KEo6P7YEo
- x86 (i586,-x87) https://godbolt.org/z/P8cdjY7h1
*/

delegate_size!(delegate_load_store);
delegate_size!(delegate_swap);
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
delegate_size!(delegate_cas);

use core::{
    arch::asm,
    mem::{self, MaybeUninit},
    sync::atomic::Ordering,
};

#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
use crate::raw::AtomicCompareExchange;
use crate::raw::{AtomicLoad, AtomicStore, AtomicSwap};
#[cfg(target_arch = "x86")]
#[cfg(not(atomic_maybe_uninit_no_cmpxchg8b))]
use crate::utils::{MaybeUninit64, Pair};
#[cfg(target_arch = "x86_64")]
#[cfg(target_feature = "cmpxchg16b")]
use crate::utils::{MaybeUninit128, Pair};

#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}
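
// Editorial note (sketch, not part of the upstream logic): `ptr_modifier!` selects the asm
// template modifier applied to pointer operands. With 32-bit pointers, `{src:e}` forces the
// 32-bit register name (e.g. `esi`); with 64-bit pointers the default rendering of a `reg`
// operand is already the full 64-bit name (e.g. `rsi`), so no modifier is needed.
// Illustrative expansion of the u32 load template (register choice is up to the compiler):
//
//     mov eax, dword ptr [ecx]    // x86, 32-bit pointers
//     mov eax, dword ptr [rcx]    // x86_64, 64-bit pointers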

macro_rules! atomic {
    (
        $ty:ident, $val_reg:ident, $ux_reg:ident, $ux:ident,
        $zx:literal, $val_modifier:literal, $reg_val_modifier:tt, $zx_val_modifier:tt, $ptr_size:tt,
        $cmpxchg_cmp_reg:tt
    ) => {
        #[cfg(target_arch = "x86")]
        atomic!($ty, $val_reg, $ux_reg, reg_abcd, $ux, $zx, $val_modifier,
            $reg_val_modifier, $zx_val_modifier, $ptr_size, $cmpxchg_cmp_reg);
        #[cfg(target_arch = "x86_64")]
        atomic!($ty, $val_reg, $ux_reg, reg, $ux, $zx, $val_modifier,
            $reg_val_modifier, $zx_val_modifier, $ptr_size, $cmpxchg_cmp_reg);
    };
    (
        $ty:ident, $val_reg:ident, $ux_reg:ident, $r_reg:ident, $ux:ident,
        $zx:literal, $val_modifier:literal, $reg_val_modifier:tt, $zx_val_modifier:tt, $ptr_size:tt,
        $cmpxchg_cmp_reg:tt
    ) => {
        delegate_signed!(delegate_load_store, $ty);
        delegate_signed!(delegate_swap, $ty);
        #[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
        delegate_signed!(delegate_cas, $ty);
        impl AtomicLoad for $ty {
            #[inline]
            unsafe fn atomic_load(
                src: *const MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(src, $ty);
                let out;

                // SAFETY: the caller must uphold the safety contract.
                unsafe {
                    // atomic load is always SeqCst.
                    asm!(
                        concat!("mov", $zx, " {out", $zx_val_modifier, "}, ", $ptr_size, " ptr [{src", ptr_modifier!(), "}]"), // atomic { out = *src }
                        src = in(reg) src,
                        out = lateout(reg) out,
                        options(nostack, preserves_flags),
                    );
                }
                crate::utils::extend32::$ty::extract(out)
            }
        }
        impl AtomicStore for $ty {
            #[inline]
            unsafe fn atomic_store(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                order: Ordering,
            ) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);

                // SAFETY: the caller must uphold the safety contract.
                unsafe {
                    match order {
                        // Relaxed and Release stores are equivalent.
                        Ordering::Relaxed | Ordering::Release => {
                            asm!(
                                concat!("mov ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {val", $val_modifier, "}"), // atomic { *dst = val }
                                dst = in(reg) dst,
                                val = in($val_reg) val,
                                options(nostack, preserves_flags),
                            );
                        }
                        Ordering::SeqCst => {
                            asm!(
                                // SeqCst store is xchg, not mov
                                concat!("xchg ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {val", $val_modifier, "}"), // atomic { _x = *dst; *dst = val; val = _x }
                                dst = in(reg) dst,
                                val = inout($val_reg) val => _,
                                options(nostack, preserves_flags),
                            );
                        }
                        _ => unreachable!(),
                    }
                }
            }
        }
        impl AtomicSwap for $ty {
            #[inline]
            unsafe fn atomic_swap(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let out: MaybeUninit<Self>;

                // SAFETY: the caller must uphold the safety contract.
                unsafe {
                    // atomic swap is always SeqCst.
                    asm!(
                        concat!("xchg ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {val", $val_modifier, "}"), // atomic { _x = *dst; *dst = val; val = _x }
                        dst = in(reg) dst,
                        val = inout($val_reg) val => out,
                        options(nostack, preserves_flags),
                    );
                }
                out
            }
        }
        #[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
        impl AtomicCompareExchange for $ty {
            #[inline]
            unsafe fn atomic_compare_exchange(
                dst: *mut MaybeUninit<Self>,
                old: MaybeUninit<Self>,
                new: MaybeUninit<Self>,
                _success: Ordering,
                _failure: Ordering,
            ) -> (MaybeUninit<Self>, bool) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let out: MaybeUninit<Self>;

                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg
                unsafe {
                    let r: MaybeUninit<u32>;
                    // compare_exchange is always SeqCst.
                    asm!(
                        concat!("lock cmpxchg ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {new", $reg_val_modifier, "}"), // atomic { if *dst == $cmpxchg_cmp_reg { ZF = 1; *dst = new } else { ZF = 0; $cmpxchg_cmp_reg = *dst } }
                        "sete {r:l}",                                                                                           // r = ZF
                        dst = in(reg) dst,
                        // Avoid reg_byte ($val_reg) in new and r to work around cranelift bug with multiple or lateout reg_byte.
                        new = in($ux_reg) crate::utils::extend32::$ty::$ux(new),
                        r = lateout($r_reg) r,
                        inout($cmpxchg_cmp_reg) old => out,
                        // Do not use `preserves_flags` because CMPXCHG modifies the ZF, CF, PF, AF, SF, and OF flags.
                        options(nostack),
                    );
                    let r = crate::utils::extend32::u8::extract(r).assume_init();
                    crate::utils::assert_unchecked(r == 0 || r == 1); // may help remove extra test
                    (out, r != 0)
                }
            }
        }
    };
}
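
// Editorial summary of the instruction sequences the macro above emits (sketch; exact
// registers are chosen by the compiler):
//
//     load (any ordering):        mov  r, [ptr]
//     store (Relaxed/Release):    mov  [ptr], r
//     store (SeqCst) / swap:      xchg [ptr], r             ; implicit LOCK, full barrier
//     compare_exchange:           lock cmpxchg [ptr], new   ; expected value in AL/AX/EAX/RAX
//                                 sete r8                   ; success flag from ZF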

#[cfg(target_arch = "x86")]
atomic!(u8, reg_byte, reg_abcd, uninit, "zx", "", ":l", ":e", "byte", "al");
#[cfg(target_arch = "x86_64")]
atomic!(u8, reg_byte, reg, uninit, "zx", "", ":l", ":e", "byte", "al");
atomic!(u16, reg, reg, identity, "zx", ":x", ":x", ":e", "word", "ax");
atomic!(u32, reg, reg, identity, "", ":e", ":e", ":e", "dword", "eax");
#[cfg(target_arch = "x86_64")]
atomic!(u64, reg, reg, identity, "", "", "", "", "qword", "rax");
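
// Usage sketch (editorial, illustrative only; not part of the crate's public API surface):
// with the impls generated above and the traits in scope, a caller could do e.g.
//
//     let mut x = MaybeUninit::new(42_u32);
//     // SAFETY: `x` is valid, properly aligned for u32, and not accessed non-atomically
//     // from other threads for the duration of these calls.
//     unsafe {
//         u32::atomic_store(&mut x, MaybeUninit::new(7), Ordering::SeqCst);
//         let v = u32::atomic_load(&x, Ordering::SeqCst).assume_init();
//         debug_assert_eq!(v, 7);
//     }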

// For load/store, we can use MOVQ(SSE2)/MOVLPS(SSE)/FILD&FISTP(x87) instead of CMPXCHG8B.
// Refs: https://github.com/llvm/llvm-project/blob/llvmorg-21.1.0/llvm/test/CodeGen/X86/atomic-load-store-wide.ll
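// Editorial note: these single-instruction 8-byte accesses are enough here because the Intel
// SDM (Vol. 3A, "Guaranteed Atomic Operations"; linked in the header of this file) guarantees
// that aligned quadword reads and writes are carried out atomically on Pentium and later.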
#[cfg(target_arch = "x86")]
#[cfg(not(atomic_maybe_uninit_no_cmpxchg8b))]
macro_rules! atomic64 {
    ($ty:ident) => {
        delegate_signed!(delegate_all, $ty);
        impl AtomicLoad for $ty {
            #[inline]
            unsafe fn atomic_load(
                src: *const MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(src, $ty);

                #[cfg(all(
                    target_feature = "sse2",
                    not(atomic_maybe_uninit_test_prefer_x87_over_sse),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                // cfg guarantees that the CPU supports SSE2.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/movq (SSE2)
                // - https://www.felixcloutier.com/x86/movd:movq (SSE2)
                unsafe {
                    let out;
                    // atomic load is always SeqCst.
                    asm!(
                        "movq {out}, qword ptr [{src}]", // atomic { out[:] = *src }
                        src = in(reg) src,
                        out = out(xmm_reg) out,
                        options(nostack, preserves_flags),
                    );
                    mem::transmute::<
                        MaybeUninit<core::arch::x86::__m128i>,
                        [MaybeUninit<Self>; 2],
                    >(out)[0]
                }
                #[cfg(all(
                    not(target_feature = "sse2"),
                    target_feature = "sse",
                    not(atomic_maybe_uninit_test_prefer_x87_over_sse),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                // cfg guarantees that the CPU supports SSE.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/movlps (SSE)
                unsafe {
                    let out;
                    // atomic load is always SeqCst.
                    asm!(
                        "movlps {out}, qword ptr [{src}]", // atomic { out[:] = *src }
                        src = in(reg) src,
                        out = out(xmm_reg) out,
                        options(nostack, preserves_flags),
                    );
                    mem::transmute::<
                        MaybeUninit<core::arch::x86::__m128>,
                        [MaybeUninit<Self>; 2],
                    >(out)[0]
                }
                #[cfg(all(
                    any(
                        not(target_feature = "sse"),
                        atomic_maybe_uninit_test_prefer_x87_over_sse,
                    ),
                    all(
                        any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                        not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
                    ),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/fild
                // - https://www.felixcloutier.com/x86/fist:fistp
                unsafe {
                    let mut out = MaybeUninit::<Self>::uninit();
                    // atomic load is always SeqCst.
                    asm!(
                        "fild qword ptr [{src}]",  // atomic { st.push(*src) }
                        "fistp qword ptr [{out}]", // *out = st.pop()
                        src = in(reg) src,
                        out = in(reg) out.as_mut_ptr(),
                        out("st(0)") _,
                        out("st(1)") _,
                        out("st(2)") _,
                        out("st(3)") _,
                        out("st(4)") _,
                        out("st(5)") _,
                        out("st(6)") _,
                        out("st(7)") _,
                        // Do not use `preserves_flags` because FILD and FISTP modify C1 in x87 FPU status word.
                        options(nostack),
                    );
                    out
                }
                #[cfg(all(
                    any(
                        not(target_feature = "sse"),
                        atomic_maybe_uninit_test_prefer_x87_over_sse,
                    ),
                    not(all(
                        any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                        not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
                    )),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let (prev_lo, prev_hi);
                    // atomic load is always SeqCst.
                    asm!(
                        "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                        // set old/new args of CMPXCHG8B to 0
                        in("ebx") 0_u32,
                        in("ecx") 0_u32,
                        inout("eax") 0_u32 => prev_lo,
                        inout("edx") 0_u32 => prev_hi,
                        in("edi") src,
                        // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                        options(nostack),
                    );
                    MaybeUninit64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
                }
            }
        }
        impl AtomicStore for $ty {
            #[inline]
            unsafe fn atomic_store(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                order: Ordering,
            ) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);

                #[cfg(all(
                    target_feature = "sse",
                    not(atomic_maybe_uninit_test_prefer_x87_over_sse),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                // cfg guarantees that the CPU supports SSE.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/movlps (SSE)
                // - https://www.felixcloutier.com/x86/lock
                // - https://www.felixcloutier.com/x86/or
                unsafe {
                    let val: MaybeUninit<core::arch::x86::__m128>
                        = mem::transmute([val, MaybeUninit::uninit()]);
                    match order {
                        // Relaxed and Release stores are equivalent.
                        Ordering::Relaxed | Ordering::Release => {
                            asm!(
                                "movlps qword ptr [{dst}], {val}", // atomic { *dst = val[:] }
                                dst = in(reg) dst,
                                val = in(xmm_reg) val,
                                options(nostack, preserves_flags),
                            );
                        }
                        Ordering::SeqCst => {
                            let p = core::cell::UnsafeCell::new(MaybeUninit::<u32>::uninit());
                            asm!(
                                "movlps qword ptr [{dst}], {val}", // atomic { *dst = val[:] }
                                // Equivalent to `mfence`, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                                // - https://github.com/taiki-e/portable-atomic/pull/156
                                // - LLVM uses `lock or` https://godbolt.org/z/vv6rjzfYd
                                // - Windows uses `xchg` for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                                // - MSVC STL uses `lock inc` https://github.com/microsoft/STL/pull/740
                                // - boost uses `lock or` https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                                "xchg dword ptr [{p}], {tmp}",     // fence
                                dst = in(reg) dst,
                                val = in(xmm_reg) val,
                                p = inout(reg) p.get() => _,
                                tmp = lateout(reg) _,
                                options(nostack, preserves_flags),
                            );
                        }
                        _ => unreachable!(),
                    }
                }
                #[cfg(all(
                    any(
                        not(target_feature = "sse"),
                        atomic_maybe_uninit_test_prefer_x87_over_sse,
                    ),
                    all(
                        any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                        not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
                    ),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/fild
                // - https://www.felixcloutier.com/x86/fist:fistp
                unsafe {
                    match order {
                        // Relaxed and Release stores are equivalent.
                        Ordering::Relaxed | Ordering::Release => {
                            asm!(
                                "fild qword ptr [{val}]",  // st.push(*val)
                                "fistp qword ptr [{dst}]", // atomic { *dst = st.pop() }
                                dst = in(reg) dst,
                                val = in(reg) val.as_ptr(),
                                out("st(0)") _,
                                out("st(1)") _,
                                out("st(2)") _,
                                out("st(3)") _,
                                out("st(4)") _,
                                out("st(5)") _,
                                out("st(6)") _,
                                out("st(7)") _,
                                // Do not use `preserves_flags` because FILD and FISTP modify condition code flags in x87 FPU status word.
                                options(nostack),
                            );
                        }
                        Ordering::SeqCst => {
                            let p = core::cell::UnsafeCell::new(MaybeUninit::<u32>::uninit());
                            asm!(
                                "fild qword ptr [{val}]",      // st.push(*val)
                                "fistp qword ptr [{dst}]",     // atomic { *dst = st.pop() }
                                // Equivalent to `mfence`, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                                // - https://github.com/taiki-e/portable-atomic/pull/156
                                // - LLVM uses `lock or` https://godbolt.org/z/vv6rjzfYd
                                // - Windows uses `xchg` for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                                // - MSVC STL uses `lock inc` https://github.com/microsoft/STL/pull/740
                                // - boost uses `lock or` https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                                "xchg dword ptr [{p}], {tmp}", // fence
                                dst = in(reg) dst,
                                val = in(reg) val.as_ptr(),
                                p = inout(reg) p.get() => _,
                                tmp = lateout(reg) _,
                                out("st(0)") _,
                                out("st(1)") _,
                                out("st(2)") _,
                                out("st(3)") _,
                                out("st(4)") _,
                                out("st(5)") _,
                                out("st(6)") _,
                                out("st(7)") _,
                                // Do not use `preserves_flags` because FILD and FISTP modify condition code flags in x87 FPU status word.
                                options(nostack),
                            );
                        }
                        _ => unreachable!(),
                    }
                }
                #[cfg(all(
                    any(
                        not(target_feature = "sse"),
                        atomic_maybe_uninit_test_prefer_x87_over_sse,
                    ),
                    not(all(
                        any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                        not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
                    )),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let val = MaybeUninit64 { whole: val };
                    // atomic store by CMPXCHG8B is always SeqCst.
                    let _ = order;
                    asm!(
                        // This is based on the code generated for the first load in DW RMWs by LLVM,
                        // but it is interesting that they generate code that does mixed-sized atomic access.
                        //
                        // These are not single-copy atomic reads, but that is ok because the
                        // subsequent CAS will check for consistency.
                        "mov eax, dword ptr [edi]",           // atomic { eax = *edi }
                        "mov edx, dword ptr [edi + 4]",       // atomic { edx = *edi.byte_add(4) }
                        "2:", // 'retry:
                            "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                            "jne 2b",                         // if ZF == 0 { jump 'retry }
                        in("ebx") val.pair.lo,
                        in("ecx") val.pair.hi,
                        out("eax") _,
                        out("edx") _,
                        in("edi") dst,
                        // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                        options(nostack),
                    );
                }
            }
        }
        impl AtomicSwap for $ty {
            #[inline]
            unsafe fn atomic_swap(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let val = MaybeUninit64 { whole: val };
                let (mut prev_lo, mut prev_hi);

                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    // atomic swap is always SeqCst.
                    asm!(
                        // This is based on the code generated for the first load in DW RMWs by LLVM,
                        // but it is interesting that they generate code that does mixed-sized atomic access.
                        //
                        // These are not single-copy atomic reads, but that is ok because the
                        // subsequent CAS will check for consistency.
                        "mov eax, dword ptr [edi]",           // atomic { eax = *edi }
                        "mov edx, dword ptr [edi + 4]",       // atomic { edx = *edi.byte_add(4) }
                        "2:", // 'retry:
                            "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                            "jne 2b",                         // if ZF == 0 { jump 'retry }
                        in("ebx") val.pair.lo,
                        in("ecx") val.pair.hi,
                        out("eax") prev_lo,
                        out("edx") prev_hi,
                        in("edi") dst,
                        // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                        options(nostack),
                    );
                    MaybeUninit64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
                }
            }
        }
        impl AtomicCompareExchange for $ty {
            #[inline]
            unsafe fn atomic_compare_exchange(
                dst: *mut MaybeUninit<Self>,
                old: MaybeUninit<Self>,
                new: MaybeUninit<Self>,
                _success: Ordering,
                _failure: Ordering,
            ) -> (MaybeUninit<Self>, bool) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let old = MaybeUninit64 { whole: old };
                let new = MaybeUninit64 { whole: new };
                let (prev_lo, prev_hi);

                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let r: u8;
                    // compare_exchange is always SeqCst.
                    asm!(
                        "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                        "sete cl",                        // cl = ZF
                        in("ebx") new.pair.lo,
                        in("ecx") new.pair.hi,
                        inout("eax") old.pair.lo => prev_lo,
                        inout("edx") old.pair.hi => prev_hi,
                        in("edi") dst,
                        lateout("cl") r,
                        // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                        options(nostack),
                    );
                    crate::utils::assert_unchecked(r == 0 || r == 1); // may help remove extra test
                    (
                        MaybeUninit64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole,
                        r != 0
                    )
                }
            }
        }
    };
}

#[cfg(target_arch = "x86")]
#[cfg(not(atomic_maybe_uninit_no_cmpxchg8b))]
atomic64!(u64);
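
// Editorial sketch of how the 64-bit operations above typically lower on i686 (where SSE2
// is enabled by default); registers and the stack slot are illustrative:
//
//     load:             movq   xmm0, qword ptr [eax]
//     store (SeqCst):   movlps qword ptr [eax], xmm0
//                       xchg   dword ptr [<stack slot>], ecx   ; fence via locked dummy RMW
//     swap / CAS:       lock cmpxchg8b qword ptr [edi]          ; in a retry loop for swap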

#[cfg(target_arch = "x86_64")]
#[cfg(target_feature = "cmpxchg16b")]
macro_rules! atomic128 {
    ($ty:ident) => {
        #[cfg(target_pointer_width = "32")]
        atomic128!($ty, "edi");
        #[cfg(target_pointer_width = "64")]
        atomic128!($ty, "rdi");
    };
    ($ty:ident, $rdi:tt) => {
        delegate_signed!(delegate_all, $ty);
        impl AtomicLoad for $ty {
            #[inline]
            unsafe fn atomic_load(
                src: *const MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(src, $ty);

                // VMOVDQA is atomic when AVX is available.
                // See https://gcc.gnu.org/bugzilla//show_bug.cgi?id=104688 for details.
                //
                // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
                #[cfg(target_feature = "avx")]
                // SAFETY: the caller must guarantee that `src` is valid for reads,
                // 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports AVX.
                unsafe {
                    let out;
                    asm!(
                        concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"), // atomic { out = *src }
                        src = in(reg) src,
                        out = lateout(xmm_reg) out,
                        options(nostack, preserves_flags),
                    );
                    mem::transmute::<MaybeUninit<core::arch::x86_64::__m128i>, MaybeUninit<Self>>(
                        out
                    )
                }
                #[cfg(not(target_feature = "avx"))]
                // SAFETY: the caller must guarantee that `src` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let (prev_lo, prev_hi);
                    // atomic load is always SeqCst.
                    asm!(
                        "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                        "xor rbx, rbx",       // zeroed rbx
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                        "mov rbx, {rbx_tmp}", // restore rbx
                        // set old/new args of CMPXCHG16B to 0 (rbx is zeroed after saved to rbx_tmp, to avoid xchg)
                        rbx_tmp = out(reg) _,
                        in("rcx") 0_u64,
                        inout("rax") 0_u64 => prev_lo,
                        inout("rdx") 0_u64 => prev_hi,
                        in($rdi) src,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                    MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
                }
            }
        }
        impl AtomicStore for $ty {
            #[inline]
            unsafe fn atomic_store(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                order: Ordering,
            ) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);

                // VMOVDQA is atomic when AVX is available.
                // See https://gcc.gnu.org/bugzilla//show_bug.cgi?id=104688 for details.
                //
                // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
                #[cfg(target_feature = "avx")]
                // SAFETY: the caller must guarantee that `dst` is valid for writes,
                // 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports AVX.
                unsafe {
                    let val: MaybeUninit<core::arch::x86_64::__m128i> = mem::transmute(val);
                    match order {
                        // Relaxed and Release stores are equivalent.
                        Ordering::Relaxed | Ordering::Release => {
                            asm!(
                                concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"), // atomic { *dst = val }
                                dst = in(reg) dst,
                                val = in(xmm_reg) val,
                                options(nostack, preserves_flags),
                            );
                        }
                        Ordering::SeqCst => {
                            let p = core::cell::UnsafeCell::new(MaybeUninit::<u64>::uninit());
                            asm!(
                                concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"), // atomic { *dst = val }
                                // Equivalent to `mfence`, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                                // - https://github.com/taiki-e/portable-atomic/pull/156
                                // - LLVM uses `lock or` https://godbolt.org/z/vv6rjzfYd
                                // - Windows uses `xchg` for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                                // - MSVC STL uses `lock inc` https://github.com/microsoft/STL/pull/740
                                // - boost uses `lock or` https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                                concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),        // fence
                                dst = in(reg) dst,
                                val = in(xmm_reg) val,
                                p = inout(reg) p.get() => _,
                                tmp = lateout(reg) _,
                                options(nostack, preserves_flags),
                            );
                        }
                        _ => unreachable!(),
                    }
                }
                #[cfg(not(target_feature = "avx"))]
                // SAFETY: the caller must guarantee that `dst` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let val = MaybeUninit128 { whole: val };
                    let _ = order;
                    // atomic store is always SeqCst.
                    asm!(
                        "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                        // This is based on the code generated for the first load in DW RMWs by LLVM,
                        // but it is interesting that they generate code that does mixed-sized atomic access.
                        //
                        // These are not single-copy atomic reads, but that is ok because the
                        // subsequent CAS will check for consistency.
                        concat!("mov rax, qword ptr [", $rdi, "]"),              // atomic { rax = *$rdi }
                        concat!("mov rdx, qword ptr [", $rdi, " + 8]"),          // atomic { rdx = *$rdi.byte_add(8) }
                        "2:", // 'retry:
                            concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                            "jne 2b",                                            // if ZF == 0 { jump 'retry }
                        "mov rbx, {rbx_tmp}", // restore rbx
                        rbx_tmp = inout(reg) val.pair.lo => _,
                        in("rcx") val.pair.hi,
                        out("rax") _,
                        out("rdx") _,
                        in($rdi) dst,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                }
            }
        }
        impl AtomicSwap for $ty {
            #[inline]
            unsafe fn atomic_swap(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let val = MaybeUninit128 { whole: val };
                let (mut prev_lo, mut prev_hi);

                // SAFETY: the caller must guarantee that `dst` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    // atomic swap is always SeqCst.
                    asm!(
                        "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                        // This is based on the code generated for the first load in DW RMWs by LLVM,
                        // but it is interesting that they generate code that does mixed-sized atomic access.
                        //
                        // These are not single-copy atomic reads, but that is ok because the
                        // subsequent CAS will check for consistency.
                        concat!("mov rax, qword ptr [", $rdi, "]"),              // atomic { rax = *$rdi }
                        concat!("mov rdx, qword ptr [", $rdi, " + 8]"),          // atomic { rdx = *$rdi.byte_add(8) }
                        "2:", // 'retry:
                            concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                            "jne 2b",                                            // if ZF == 0 { jump 'retry }
                        "mov rbx, {rbx_tmp}", // restore rbx
                        rbx_tmp = inout(reg) val.pair.lo => _,
                        in("rcx") val.pair.hi,
                        out("rax") prev_lo,
                        out("rdx") prev_hi,
                        in($rdi) dst,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                    MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
                }
            }
        }
        impl AtomicCompareExchange for $ty {
            #[inline]
            unsafe fn atomic_compare_exchange(
                dst: *mut MaybeUninit<Self>,
                old: MaybeUninit<Self>,
                new: MaybeUninit<Self>,
                _success: Ordering,
                _failure: Ordering,
            ) -> (MaybeUninit<Self>, bool) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let old = MaybeUninit128 { whole: old };
                let new = MaybeUninit128 { whole: new };
                let (prev_lo, prev_hi);

                // SAFETY: the caller must guarantee that `dst` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let r: u8;
                    // compare_exchange is always SeqCst.
                    asm!(
                        "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                        "sete cl",                                           // cl = ZF
                        "mov rbx, {rbx_tmp}", // restore rbx
                        rbx_tmp = inout(reg) new.pair.lo => _,
                        in("rcx") new.pair.hi,
                        inout("rax") old.pair.lo => prev_lo,
                        inout("rdx") old.pair.hi => prev_hi,
                        in($rdi) dst,
                        lateout("cl") r,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                    crate::utils::assert_unchecked(r == 0 || r == 1); // may help remove extra test
                    (
                        MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole,
                        r != 0
                    )
                }
            }
        }
    };
}
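
// Editorial note on the `rbx_tmp` dance above: `rbx` cannot be named as an `asm!` operand
// because LLVM reserves it, yet CMPXCHG16B hard-codes RBX as the low half of the new value.
// The templates therefore let the compiler place that low half (or a zero, for the load) in a
// scratch register, move/exchange it into `rbx` around the instruction, and restore `rbx`
// before the asm block ends.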

#[cfg(target_arch = "x86_64")]
#[cfg(target_feature = "cmpxchg16b")]
atomic128!(u128);
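
// Editorial sketch of the two 128-bit load strategies generated above (illustrative; register
// setup is up to the compiler):
//
//     with AVX:        vmovdqa xmm0, xmmword ptr [rdi]    ; single 16-byte atomic load
//     without AVX:     xor  rbx, rbx                      ; new value = 0 (rcx/rax/rdx also 0)
//                      lock cmpxchg16b xmmword ptr [rdi]  ; CAS(dst, expected = 0, new = 0);
//                                                         ; rdx:rax holds the current value
//                                                         ; regardless of whether it matched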

// -----------------------------------------------------------------------------
// cfg macros

#[macro_export]
macro_rules! cfg_has_atomic_8 {
    ($($tt:tt)*) => { $($tt)* };
}
#[macro_export]
macro_rules! cfg_no_atomic_8 {
    ($($tt:tt)*) => {};
}
#[macro_export]
macro_rules! cfg_has_atomic_16 {
    ($($tt:tt)*) => { $($tt)* };
}
#[macro_export]
macro_rules! cfg_no_atomic_16 {
    ($($tt:tt)*) => {};
}
#[macro_export]
macro_rules! cfg_has_atomic_32 {
    ($($tt:tt)*) => { $($tt)* };
}
#[macro_export]
macro_rules! cfg_no_atomic_32 {
    ($($tt:tt)*) => {};
}
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b)))]
#[macro_export]
macro_rules! cfg_has_atomic_64 {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b)))]
#[macro_export]
macro_rules! cfg_no_atomic_64 {
    ($($tt:tt)*) => {};
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b))]
#[macro_export]
macro_rules! cfg_has_atomic_64 {
    ($($tt:tt)*) => {};
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b))]
#[macro_export]
macro_rules! cfg_no_atomic_64 {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(not(all(target_arch = "x86_64", target_feature = "cmpxchg16b")))]
#[macro_export]
macro_rules! cfg_has_atomic_128 {
    ($($tt:tt)*) => {};
}
#[cfg(not(all(target_arch = "x86_64", target_feature = "cmpxchg16b")))]
#[macro_export]
macro_rules! cfg_no_atomic_128 {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(all(target_arch = "x86_64", target_feature = "cmpxchg16b"))]
#[macro_export]
macro_rules! cfg_has_atomic_128 {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(all(target_arch = "x86_64", target_feature = "cmpxchg16b"))]
#[macro_export]
macro_rules! cfg_no_atomic_128 {
    ($($tt:tt)*) => {};
}
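
// Editorial usage sketch (illustrative): each `cfg_has_*` / `cfg_no_*` pair expands its input
// tokens or drops them, so callers can gate items on what this build supports, e.g.
//
//     cfg_has_atomic_128! {
//         // items compiled only when 128-bit atomics are available
//     }
//     cfg_no_atomic_128! {
//         // fallback items compiled otherwise
//     }
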
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
#[macro_export]
macro_rules! cfg_has_atomic_cas {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
#[macro_export]
macro_rules! cfg_no_atomic_cas {
    ($($tt:tt)*) => {};
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg))]
#[macro_export]
macro_rules! cfg_has_atomic_cas {
    ($($tt:tt)*) => {};
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg))]
#[macro_export]
macro_rules! cfg_no_atomic_cas {
    ($($tt:tt)*) => { $($tt)* };
}