Skip to main content

atomic_maybe_uninit/arch/
x86.rs

1// SPDX-License-Identifier: Apache-2.0 OR MIT
2
3/*
4x86 and x86_64
5
6Refs:
7- Intel® 64 and IA-32 Architectures Software Developer Manuals
8  https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html
9- x86 and amd64 instruction reference
10  https://www.felixcloutier.com/x86
11- portable-atomic
12  https://github.com/taiki-e/portable-atomic
13
14See tests/asm-test/asm/atomic-maybe-uninit for generated assembly.
15*/
16
// Runtime CPU feature detection, consulted by the 128-bit outline-atomics path
// (see the `ifunc!`/`detect::detect().avx()` call in the 128-bit load below).
// It is only compiled where that path can actually be taken: x86_64 with
// CMPXCHG16B statically enabled, outline atomics not disabled, and not SGX.
// Outside of tests, the cfg_attr additionally drops the module when the same
// cfg combination used at the call sites makes the ifunc path unreachable
// (e.g. AVX statically enabled, so VMOVDQA is always used directly).
#[cfg(target_arch = "x86_64")]
#[cfg(target_feature = "cmpxchg16b")]
#[cfg(not(atomic_maybe_uninit_no_outline_atomics))]
#[cfg(not(target_env = "sgx"))]
#[cfg_attr(
    not(test),
    cfg(not(any(
        target_feature = "avx",
        all(
            not(target_feature = "avx"),
            any(
                atomic_maybe_uninit_no_outline_atomics,
                target_env = "sgx",
                not(target_feature = "sse"),
            ),
        ),
    )))
)]
#[path = "../detect/x86_64.rs"]
mod detect;
37
// Wire the per-size implementations below into the crate's size-generic traits.
delegate_size!(delegate_load_store);
delegate_size!(delegate_swap);
// Compare-and-swap is omitted when the x86_32 target lacks the CMPXCHG
// instruction (per the `atomic_maybe_uninit_no_cmpxchg` cfg).
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
delegate_size!(delegate_cas);
42
43#[cfg(target_arch = "x86")]
44#[cfg(not(atomic_maybe_uninit_no_cmpxchg8b))]
45#[cfg(all(target_feature = "sse", not(atomic_maybe_uninit_test_prefer_x87_over_sse)))]
46use core::arch::x86::__m128;
47#[cfg(target_arch = "x86")]
48#[cfg(not(atomic_maybe_uninit_no_cmpxchg8b))]
49#[cfg(all(target_feature = "sse2", not(atomic_maybe_uninit_test_prefer_x87_over_sse)))]
50use core::arch::x86::__m128i;
51#[cfg(target_arch = "x86_64")]
52#[cfg(target_feature = "cmpxchg16b")]
53#[cfg(not(all(
54    not(target_feature = "avx"),
55    any(atomic_maybe_uninit_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
56)))]
57use core::arch::x86_64::__m128i;
58use core::{
59    arch::asm,
60    mem::{self, MaybeUninit},
61    sync::atomic::Ordering,
62};
63
64#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
65use crate::raw::AtomicCompareExchange;
66use crate::raw::{AtomicLoad, AtomicStore, AtomicSwap};
67#[cfg(target_arch = "x86")]
68#[cfg(not(atomic_maybe_uninit_no_cmpxchg8b))]
69use crate::utils::{MaybeUninit64, Pair};
70#[cfg(target_arch = "x86_64")]
71#[cfg(target_feature = "cmpxchg16b")]
72use crate::utils::{MaybeUninit128, Pair};
73
// Asm operand modifier applied to pointer operands held in `reg`:
// on 32-bit targets, `:e` selects the 32-bit (e.g. `eax`) register name
// inside memory operands like `[{src:e}]`.
#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
// On 64-bit targets the default 64-bit register name is already correct,
// so no modifier is needed.
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}
86
87// -----------------------------------------------------------------------------
88// Register-width or smaller atomics
89
// Implements `AtomicLoad`/`AtomicStore`/`AtomicSwap` (and, where CMPXCHG is
// available, `AtomicCompareExchange`) for one register-width-or-smaller
// unsigned integer type; the signed counterpart is wired up via
// `delegate_signed!`.
//
// Parameters (as supplied by the invocations below):
// - $ty: the unsigned integer type the impls are for
// - $val_reg: register class holding the value (e.g. `reg_byte` for u8)
// - $ux_reg: register class holding the zero-extended value for CMPXCHG
// - $r_reg: register class for the SETE result (selected per-arch by the
//   first arm: `reg_abcd` on x86_32, `reg` on x86_64)
// - $ux: name of the `crate::utils::extend32::$ty` helper used to widen
//   the value for the CMPXCHG `new` operand
// - $zx: suffix turning MOV into a zero-extending MOVZX ("" for u32/u64)
// - $val_modifier/$reg_val_modifier/$zx_val_modifier: asm operand modifiers
// - $ptr_size: memory operand size keyword (byte/word/dword/qword)
// - $cmpxchg_cmp_reg: the A-register alias CMPXCHG implicitly compares with
macro_rules! atomic {
    // Entry arm: picks the $r_reg register class for the current architecture
    // and forwards everything else to the implementation arm.
    (
        $ty:ident, $val_reg:ident, $ux_reg:ident, $ux:ident,
        $zx:literal, $val_modifier:literal, $reg_val_modifier:tt, $zx_val_modifier:tt, $ptr_size:tt,
        $cmpxchg_cmp_reg:tt
    ) => {
        #[cfg(target_arch = "x86")]
        atomic!($ty, $val_reg, $ux_reg, reg_abcd, $ux, $zx, $val_modifier,
            $reg_val_modifier, $zx_val_modifier, $ptr_size, $cmpxchg_cmp_reg);
        #[cfg(target_arch = "x86_64")]
        atomic!($ty, $val_reg, $ux_reg, reg, $ux, $zx, $val_modifier,
            $reg_val_modifier, $zx_val_modifier, $ptr_size, $cmpxchg_cmp_reg);
    };
    // Implementation arm: emits the four trait impls.
    (
        $ty:ident, $val_reg:ident, $ux_reg:ident, $r_reg:ident, $ux:ident,
        $zx:literal, $val_modifier:literal, $reg_val_modifier:tt, $zx_val_modifier:tt, $ptr_size:tt,
        $cmpxchg_cmp_reg:tt
    ) => {
        delegate_signed!(delegate_load_store, $ty);
        delegate_signed!(delegate_swap, $ty);
        #[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
        delegate_signed!(delegate_cas, $ty);
        impl AtomicLoad for $ty {
            #[inline]
            unsafe fn atomic_load(
                src: *const MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(src, $ty);
                let out;

                // SAFETY: the caller must uphold the safety contract.
                // load by MOV has SeqCst semantics.
                unsafe {
                    // Sub-32-bit types are loaded with MOVZX into a full
                    // register ($zx = "zx"); extend32 then extracts the value.
                    asm!(
                        concat!("mov", $zx, " {out", $zx_val_modifier, "}, ", $ptr_size, " ptr [{src", ptr_modifier!(), "}]"), // atomic { out = zero_extend(*src) }
                        src = in(reg) src,
                        out = lateout(reg) out,
                        options(nostack, preserves_flags),
                    );
                }
                crate::utils::extend32::$ty::extract(out)
            }
        }
        impl AtomicStore for $ty {
            #[inline]
            unsafe fn atomic_store(
                dst: *mut MaybeUninit<Self>,
                mut val: MaybeUninit<Self>,
                order: Ordering,
            ) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);

                // SAFETY: the caller must uphold the safety contract.
                unsafe {
                    match order {
                        // Relaxed and Release stores are equivalent.
                        Ordering::Relaxed | Ordering::Release => {
                            asm!(
                                concat!("mov ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {val", $val_modifier, "}"), // atomic { *dst = val }
                                dst = in(reg) dst,
                                val = in($val_reg) val,
                                options(nostack, preserves_flags),
                            );
                        }
                        #[allow(unused_assignments)] // TODO(gcc): Workaround for rustc_codegen_gcc bug
                        Ordering::SeqCst => {
                            asm!(
                                // SeqCst store is xchg, not mov
                                concat!("xchg ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {val", $val_modifier, "}"), // atomic { _x = *dst; *dst = val; val = _x }
                                dst = in(reg) dst,
                                val = inout($val_reg) val,
                                options(nostack, preserves_flags),
                            );
                        }
                        _ => crate::utils::unreachable_unchecked(),
                    }
                }
            }
        }
        impl AtomicSwap for $ty {
            #[inline]
            unsafe fn atomic_swap(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let out: MaybeUninit<Self>;

                // SAFETY: the caller must uphold the safety contract.
                // XCHG has SeqCst semantics.
                unsafe {
                    asm!(
                        concat!("xchg ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {val", $val_modifier, "}"), // atomic { _x = *dst; *dst = val; val = _x }
                        dst = in(reg) dst,
                        val = inout($val_reg) val => out,
                        options(nostack, preserves_flags),
                    );
                }
                out
            }
        }
        #[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
        impl AtomicCompareExchange for $ty {
            #[inline]
            unsafe fn atomic_compare_exchange(
                dst: *mut MaybeUninit<Self>,
                old: MaybeUninit<Self>,
                new: MaybeUninit<Self>,
                _success: Ordering,
                _failure: Ordering,
            ) -> (MaybeUninit<Self>, bool) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let out: MaybeUninit<Self>;

                // SAFETY: the caller must uphold the safety contract.
                // CMPXCHG has SeqCst semantics.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg
                unsafe {
                    let r: MaybeUninit<u32>;
                    asm!(
                        concat!("lock cmpxchg ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {new", $reg_val_modifier, "}"), // atomic { if *dst == $cmpxchg_cmp_reg { ZF = 1; *dst = new } else { ZF = 0; $cmpxchg_cmp_reg = *dst } }
                        "sete {r:l}",                                                                                           // r = ZF
                        dst = in(reg) dst,
                        // Avoid reg_byte ($val_reg) in new and r to work around cranelift bug with multiple or lateout reg_byte.
                        new = in($ux_reg) crate::utils::extend32::$ty::$ux(new),
                        r = lateout($r_reg) r,
                        inout($cmpxchg_cmp_reg) old => out,
                        // Do not use `preserves_flags` because CMPXCHG modifies the ZF, CF, PF, AF, SF, and OF flags.
                        options(nostack),
                    );
                    let r = crate::utils::extend32::u8::extract(r).assume_init();
                    crate::utils::assert_unchecked(r == 0 || r == 1); // may help remove extra test
                    (out, r != 0)
                }
            }
        }
    };
}
231
// Columns: type, value reg class, extended reg class, extend32 helper,
// MOVZX suffix, val/reg/zx operand modifiers, ptr size, CMPXCHG compare reg.
// u8 needs per-arch invocations: x86_32 restricts byte registers to a/b/c/d.
#[cfg(target_arch = "x86")]
atomic!(u8, reg_byte, reg_abcd, uninit, "zx", "", ":l", ":e", "byte", "al");
#[cfg(target_arch = "x86_64")]
atomic!(u8, reg_byte, reg, uninit, "zx", "", ":l", ":e", "byte", "al");
atomic!(u16, reg, reg, identity, "zx", ":x", ":x", ":e", "word", "ax");
atomic!(u32, reg, reg, identity, "", ":e", ":e", ":e", "dword", "eax");
#[cfg(target_arch = "x86_64")]
atomic!(u64, reg, reg, identity, "", "", "", "", "qword", "rax");
240
241// -----------------------------------------------------------------------------
242// 64-bit atomics on x86_32
243//
244// For load/store, we can use MOVQ(SSE2)/MOVLPS(SSE)/FILD&FISTP(x87) instead of CMPXCHG8B.
245// Refs: https://github.com/llvm/llvm-project/blob/llvmorg-22.1.0-rc1/llvm/test/CodeGen/X86/atomic-load-store-wide.ll
246
// Wire up the signed counterpart (presumably i64 delegating to the u64 impls
// below — delegate_signed! is defined elsewhere in the crate).
#[cfg(all(target_arch = "x86", not(atomic_maybe_uninit_no_cmpxchg8b)))]
delegate_signed!(delegate_all, u64);
#[cfg(all(target_arch = "x86", not(atomic_maybe_uninit_no_cmpxchg8b)))]
impl AtomicLoad for u64 {
    /// 64-bit atomic load on x86_32: MOVQ (SSE2), MOVLPS (SSE), FILD/FISTP
    /// (x87), or a CMPXCHG8B(0, 0) round-trip as the last resort.
    #[inline]
    unsafe fn atomic_load(src: *const MaybeUninit<Self>, _order: Ordering) -> MaybeUninit<Self> {
        debug_assert_atomic_unsafe_precondition!(src, u64);

        #[cfg(all(target_feature = "sse2", not(atomic_maybe_uninit_test_prefer_x87_over_sse)))]
        // SAFETY: the caller must uphold the safety contract.
        // cfg guarantees that the CPU supports SSE.
        // load by MOVQ has SeqCst semantics.
        //
        // Refs:
        // - https://www.felixcloutier.com/x86/movq (SSE2)
        // - https://www.felixcloutier.com/x86/movd:movq (SSE2)
        unsafe {
            let out;
            asm!(
                "movq {out}, qword ptr [{src}]", // atomic { out[:] = *src }
                src = in(reg) src,
                out = out(xmm_reg) out,
                options(nostack, preserves_flags),
            );
            // Low 64 bits of the XMM register hold the loaded value.
            mem::transmute::<MaybeUninit<__m128i>, [MaybeUninit<Self>; 2]>(out)[0]
        }
        #[cfg(all(
            not(target_feature = "sse2"),
            target_feature = "sse",
            not(atomic_maybe_uninit_test_prefer_x87_over_sse),
        ))]
        // SAFETY: the caller must uphold the safety contract.
        // cfg guarantees that the CPU supports SSE.
        // load by MOVLPS has SeqCst semantics.
        //
        // Refs:
        // - https://www.felixcloutier.com/x86/movlps (SSE)
        unsafe {
            let out;
            asm!(
                "movlps {out}, qword ptr [{src}]", // atomic { out[:] = *src }
                src = in(reg) src,
                out = out(xmm_reg) out,
                options(nostack, preserves_flags),
            );
            mem::transmute::<MaybeUninit<__m128>, [MaybeUninit<Self>; 2]>(out)[0]
        }
        #[cfg(all(
            any(not(target_feature = "sse"), atomic_maybe_uninit_test_prefer_x87_over_sse),
            all(
                any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
            ),
        ))]
        // SAFETY: the caller must uphold the safety contract.
        // load by FILD has SeqCst semantics.
        //
        // Refs:
        // - https://www.felixcloutier.com/x86/fild
        // - https://www.felixcloutier.com/x86/fist:fistp
        unsafe {
            let mut out = MaybeUninit::<Self>::uninit();
            asm!(
                "fild qword ptr [{src}]",  // atomic { st.push(*src) }
                "fistp qword ptr [{out}]", // *out = st.pop()
                src = in(reg) src,
                out = in(reg) out.as_mut_ptr(),
                // The whole x87 stack is clobbered by the push/pop pair.
                out("st(0)") _,
                out("st(1)") _,
                out("st(2)") _,
                out("st(3)") _,
                out("st(4)") _,
                out("st(5)") _,
                out("st(6)") _,
                out("st(7)") _,
                // Do not use `preserves_flags` because FILD and FISTP modify C1 in x87 FPU status word.
                options(nostack),
            );
            out
        }
        #[cfg(all(
            any(not(target_feature = "sse"), atomic_maybe_uninit_test_prefer_x87_over_sse),
            not(all(
                any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
            )),
        ))]
        // SAFETY: the caller must uphold the safety contract.
        // CMPXCHG8B has SeqCst semantics.
        //
        // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
        unsafe {
            let (out_lo, out_hi);
            asm!(
                // ebx is reserved by LLVM and cannot be used as an explicit asm
                // operand, so save it in a scratch register, zero it (zeroing
                // after the save avoids an xchg), and restore it at the end —
                // same scheme as the 128-bit CMPXCHG16B path.
                "mov {ebx_tmp}, ebx",             // save ebx which is reserved by LLVM
                "xor ebx, ebx",                   // zeroed ebx
                "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                "mov ebx, {ebx_tmp}",             // restore ebx
                // set old/new args of CMPXCHG8B to 0
                ebx_tmp = out(reg) _,
                in("ecx") 0_u32,
                inout("eax") 0_u32 => out_lo,
                inout("edx") 0_u32 => out_hi,
                in("edi") src,
                // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                options(nostack),
            );
            MaybeUninit64 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
        }
    }
}
#[cfg(all(target_arch = "x86", not(atomic_maybe_uninit_no_cmpxchg8b)))]
impl AtomicStore for u64 {
    /// 64-bit atomic store on x86_32: MOVLPS (SSE), FILD/FISTP (x87), or a
    /// CMPXCHG8B loop (via `atomic_swap`) as the last resort. SeqCst stores
    /// append an `xchg`-based fence to the SSE/x87 paths.
    #[inline]
    unsafe fn atomic_store(dst: *mut MaybeUninit<Self>, val: MaybeUninit<Self>, order: Ordering) {
        debug_assert_atomic_unsafe_precondition!(dst, u64);

        #[cfg(all(target_feature = "sse", not(atomic_maybe_uninit_test_prefer_x87_over_sse)))]
        // SAFETY: the caller must uphold the safety contract.
        // cfg guarantees that the CPU supports SSE.
        //
        // Refs:
        // - https://www.felixcloutier.com/x86/movlps (SSE)
        // - https://www.felixcloutier.com/x86/lock
        // - https://www.felixcloutier.com/x86/or
        unsafe {
            // Place the 64-bit value in the low half of an XMM-sized value;
            // the high half is never read by MOVLPS.
            let val: MaybeUninit<__m128> = mem::transmute([val, MaybeUninit::uninit()]);
            match order {
                // Relaxed and Release stores are equivalent.
                Ordering::Relaxed | Ordering::Release => {
                    asm!(
                        "movlps qword ptr [{dst}], {val}", // atomic { *dst = val[:] }
                        dst = in(reg) dst,
                        val = in(xmm_reg) val,
                        options(nostack, preserves_flags),
                    );
                }
                Ordering::SeqCst => {
                    // Dummy location for the fencing xchg below.
                    let p = core::cell::UnsafeCell::new(MaybeUninit::<u32>::uninit());
                    asm!(
                        "movlps qword ptr [{dst}], {val}", // atomic { *dst = val[:] }
                        // Equivalent to `mfence`, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                        // - https://github.com/taiki-e/portable-atomic/pull/156
                        // - LLVM uses `lock or` https://godbolt.org/z/vv6rjzfYd
                        // - Windows uses `xchg` for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                        // - MSVC STL uses `lock inc` https://github.com/microsoft/STL/pull/740
                        // - boost uses `lock or` https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                        "xchg dword ptr [{p}], {tmp}",     // fence
                        dst = in(reg) dst,
                        val = in(xmm_reg) val,
                        p = inout(reg) p.get() => _,
                        tmp = lateout(reg) _,
                        options(nostack, preserves_flags),
                    );
                }
                _ => crate::utils::unreachable_unchecked(),
            }
        }
        #[cfg(all(
            any(not(target_feature = "sse"), atomic_maybe_uninit_test_prefer_x87_over_sse),
            all(
                any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
            ),
        ))]
        // SAFETY: the caller must uphold the safety contract.
        //
        // Refs:
        // - https://www.felixcloutier.com/x86/fild
        // - https://www.felixcloutier.com/x86/fist:fistp
        unsafe {
            match order {
                // Relaxed and Release stores are equivalent.
                Ordering::Relaxed | Ordering::Release => {
                    asm!(
                        "fild qword ptr [{val}]",  // st.push(*val)
                        "fistp qword ptr [{dst}]", // atomic { *dst = st.pop() }
                        dst = in(reg) dst,
                        val = in(reg) val.as_ptr(),
                        // The whole x87 stack is clobbered by the push/pop pair.
                        out("st(0)") _,
                        out("st(1)") _,
                        out("st(2)") _,
                        out("st(3)") _,
                        out("st(4)") _,
                        out("st(5)") _,
                        out("st(6)") _,
                        out("st(7)") _,
                        // Do not use `preserves_flags` because FILD and FISTP modify condition code flags in x87 FPU status word.
                        options(nostack),
                    );
                }
                Ordering::SeqCst => {
                    // Dummy location for the fencing xchg below.
                    let p = core::cell::UnsafeCell::new(MaybeUninit::<u32>::uninit());
                    asm!(
                        "fild qword ptr [{val}]",      // st.push(*val)
                        "fistp qword ptr [{dst}]",     // atomic { *dst = st.pop() }
                        // Equivalent to `mfence`, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                        // - https://github.com/taiki-e/portable-atomic/pull/156
                        // - LLVM uses `lock or` https://godbolt.org/z/vv6rjzfYd
                        // - Windows uses `xchg` for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                        // - MSVC STL uses `lock inc` https://github.com/microsoft/STL/pull/740
                        // - boost uses `lock or` https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                        "xchg dword ptr [{p}], {tmp}", // fence
                        dst = in(reg) dst,
                        val = in(reg) val.as_ptr(),
                        p = inout(reg) p.get() => _,
                        tmp = lateout(reg) _,
                        out("st(0)") _,
                        out("st(1)") _,
                        out("st(2)") _,
                        out("st(3)") _,
                        out("st(4)") _,
                        out("st(5)") _,
                        out("st(6)") _,
                        out("st(7)") _,
                        // Do not use `preserves_flags` because FILD and FISTP modify condition code flags in x87 FPU status word.
                        options(nostack),
                    );
                }
                _ => crate::utils::unreachable_unchecked(),
            }
        }
        #[cfg(all(
            any(not(target_feature = "sse"), atomic_maybe_uninit_test_prefer_x87_over_sse),
            not(all(
                any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
            )),
        ))]
        // SAFETY: the caller must uphold the safety contract.
        unsafe {
            // CMPXCHG8B has SeqCst semantics.
            // The swap result is discarded; any requested ordering is satisfied.
            let _ = order;
            <Self as AtomicSwap>::atomic_swap(dst, val, Ordering::SeqCst);
        }
    }
}
#[cfg(all(target_arch = "x86", not(atomic_maybe_uninit_no_cmpxchg8b)))]
impl AtomicSwap for u64 {
    /// 64-bit atomic swap on x86_32, implemented as a CMPXCHG8B retry loop.
    #[inline]
    unsafe fn atomic_swap(
        dst: *mut MaybeUninit<Self>,
        val: MaybeUninit<Self>,
        _order: Ordering,
    ) -> MaybeUninit<Self> {
        debug_assert_atomic_unsafe_precondition!(dst, u64);
        let val = MaybeUninit64 { whole: val };
        let (mut prev_lo, mut prev_hi);

        // SAFETY: the caller must uphold the safety contract.
        // CMPXCHG8B has SeqCst semantics.
        //
        // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
        unsafe {
            asm!(
                // ebx is reserved by LLVM and cannot be used as an explicit asm
                // operand, so xchg the new low word in via a scratch register
                // and restore ebx before the asm ends.
                "xchg {ebx_tmp}, ebx",                // save ebx, ebx = val.pair.lo
                // This is based on the code generated for the first load in DW RMWs by LLVM,
                // but it is interesting that they generate code that does mixed-sized atomic access.
                //
                // This is not single-copy atomic reads, but this is ok because subsequent
                // CAS will check for consistency.
                "mov eax, dword ptr [edi]",           // atomic { eax = *edi }
                "mov edx, dword ptr [edi + 4]",       // atomic { edx = *edi.byte_add(4) }
                "2:", // 'retry:
                    "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                    "jne 2b",                         // if ZF == 0 { jump 'retry }
                "xchg {ebx_tmp}, ebx",                // restore ebx
                ebx_tmp = inout(reg) val.pair.lo => _,
                in("ecx") val.pair.hi,
                out("eax") prev_lo,
                out("edx") prev_hi,
                in("edi") dst,
                // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                options(nostack),
            );
            MaybeUninit64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
        }
    }
}
#[cfg(all(target_arch = "x86", not(atomic_maybe_uninit_no_cmpxchg8b)))]
impl AtomicCompareExchange for u64 {
    /// 64-bit atomic CAS on x86_32 via a single CMPXCHG8B; returns the
    /// previous value and whether the exchange succeeded.
    #[inline]
    unsafe fn atomic_compare_exchange(
        dst: *mut MaybeUninit<Self>,
        old: MaybeUninit<Self>,
        new: MaybeUninit<Self>,
        _success: Ordering,
        _failure: Ordering,
    ) -> (MaybeUninit<Self>, bool) {
        debug_assert_atomic_unsafe_precondition!(dst, u64);
        let old = MaybeUninit64 { whole: old };
        let new = MaybeUninit64 { whole: new };
        let (prev_lo, prev_hi);
        let r: u8;

        // SAFETY: the caller must uphold the safety contract.
        // CMPXCHG8B has SeqCst semantics.
        //
        // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
        unsafe {
            asm!(
                // ebx is reserved by LLVM and cannot be used as an explicit asm
                // operand, so xchg the new low word in via a scratch register
                // and restore ebx before the asm ends.
                "xchg {ebx_tmp}, ebx",            // save ebx, ebx = new.pair.lo
                "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                "sete cl",                        // cl = ZF
                "xchg {ebx_tmp}, ebx",            // restore ebx
                ebx_tmp = inout(reg) new.pair.lo => _,
                in("ecx") new.pair.hi,
                inout("eax") old.pair.lo => prev_lo,
                inout("edx") old.pair.hi => prev_hi,
                in("edi") dst,
                lateout("cl") r,
                // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                options(nostack),
            );
            crate::utils::assert_unchecked(r == 0 || r == 1); // may help remove extra test
            (MaybeUninit64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0)
        }
    }
}
559
560// -----------------------------------------------------------------------------
561// 128-bit atomics on x86_64
562
563#[cfg(target_arch = "x86_64")]
564#[cfg(target_feature = "cmpxchg16b")]
565macro_rules! atomic128 {
566    () => {
567        // rdi and rsi are call-preserved on Windows.
568        #[cfg(not(windows))]
569        #[cfg(target_pointer_width = "32")]
570        atomic128!("edi", "esi", "rsi");
571        #[cfg(not(windows))]
572        #[cfg(target_pointer_width = "64")]
573        atomic128!("rdi", "rsi", "rsi");
574        #[cfg(windows)]
575        #[cfg(target_pointer_width = "32")]
576        atomic128!("r9d", "r11d", "r8");
577        #[cfg(windows)]
578        #[cfg(target_pointer_width = "64")]
579        atomic128!("r9", "r11", "r8");
580    };
581    ($dst:tt, $cas_dst:tt, $save:tt) => {
582        delegate_signed!(delegate_all, u128);
583        impl AtomicLoad for u128 {
584            #[inline]
585            unsafe fn atomic_load(
586                src: *const MaybeUninit<Self>,
587                _order: Ordering,
588            ) -> MaybeUninit<Self> {
589                // VMOVDQA is atomic when AVX is available.
590                // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
591                //
592                // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
593                #[cfg(not(all(
594                    not(target_feature = "avx"),
595                    any(atomic_maybe_uninit_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
596                )))]
597                #[target_feature(enable = "avx")]
598                #[inline]
599                unsafe fn atomic_load_avx(
600                    src: *const MaybeUninit<u128>,
601                ) -> MaybeUninit<u128> {
602                    // SAFETY: the caller must guarantee that `src` is valid for reads,
603                    // 16-byte aligned, and that there are no concurrent non-atomic operations.
604                    // load by VMOVDQA has SeqCst semantics.
605                    unsafe {
606                        let out;
607                        asm!(
608                            concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"), // atomic { out = *src }
609                            src = in(reg) src,
610                            out = lateout(xmm_reg) out,
611                            options(nostack, preserves_flags),
612                        );
613                        mem::transmute::<MaybeUninit<__m128i>, MaybeUninit<u128>>(out)
614                    }
615                }
616                #[cfg(not(target_feature = "avx"))]
617                #[inline]
618                unsafe fn atomic_load_cmpxchg16b(
619                    src: *const MaybeUninit<u128>,
620                ) -> MaybeUninit<u128> {
621                    // SAFETY: the caller must guarantee that `src` is valid for both writes and
622                    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
623                    // CMPXCHG16B has SeqCst semantics.
624                    //
625                    // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
626                    unsafe {
627                        let (out_lo, out_hi);
628                        asm!(
629                            concat!("mov ", $save, ", rbx"), // save rbx which is reserved by LLVM
630                            "xor rbx, rbx", // zeroed rbx
631                            concat!("lock cmpxchg16b xmmword ptr [", $dst, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
632                            concat!("mov rbx, ", $save), // restore rbx
633                            // set old/new args of CMPXCHG16B to 0 (rbx is zeroed after saved to rbx_tmp, to avoid xchg)
634                            out($save) _,
635                            in("rcx") 0_u64,
636                            inout("rax") 0_u64 => out_lo,
637                            inout("rdx") 0_u64 => out_hi,
638                            in($dst) src,
639                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
640                            options(nostack),
641                        );
642                        MaybeUninit128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
643                    }
644                }
645                debug_assert_atomic_unsafe_precondition!(src, u128);
646
647                #[cfg(target_feature = "avx")]
648                // SAFETY: the caller must uphold the safety contract.
649                // cfg guarantees that the CPU supports AVX.
650                unsafe {
651                    atomic_load_avx(src)
652                }
653                #[cfg(not(target_feature = "avx"))]
654                #[cfg(not(all(
655                    not(target_feature = "avx"),
656                    any(atomic_maybe_uninit_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
657                )))]
658                // SAFETY: the caller must uphold the safety contract.
659                // cfg guarantees that the CPU supports CMPXCHG16B.
660                unsafe {
661                    ifunc!(unsafe fn(src: *const MaybeUninit<u128>) -> MaybeUninit<u128> {
662                        if detect::detect().avx() {
663                            atomic_load_avx
664                        } else {
665                            atomic_load_cmpxchg16b
666                        }
667                    })
668                }
669                #[cfg(not(target_feature = "avx"))]
670                #[cfg(all(
671                    not(target_feature = "avx"),
672                    any(atomic_maybe_uninit_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
673                ))]
674                // SAFETY: the caller must uphold the safety contract.
675                // cfg guarantees that the CPU supports CMPXCHG16B.
676                unsafe {
677                    atomic_load_cmpxchg16b(src)
678                }
679            }
680        }
        impl AtomicStore for u128 {
            // 128-bit atomic store. Fast path is a single 16-byte `vmovdqa` when AVX
            // is available (statically, or via runtime detection where outline
            // atomics are permitted); otherwise it falls back to the CMPXCHG16B-based
            // swap and discards the previous value.
            #[inline]
            unsafe fn atomic_store(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                order: Ordering,
            ) {
                // VMOVDQA is atomic when AVX is available.
                // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
                //
                // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
                #[cfg(not(all(
                    not(target_feature = "avx"),
                    any(atomic_maybe_uninit_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
                )))]
                #[target_feature(enable = "avx")]
                #[inline]
                unsafe fn atomic_store_avx(
                    dst: *mut MaybeUninit<u128>,
                    val: MaybeUninit<u128>,
                    order: Ordering,
                ) {
                    // SAFETY: the caller must guarantee that `dst` is valid for writes,
                    // 16-byte aligned, and that there are no concurrent non-atomic operations.
                    // cfg guarantees that the CPU supports AVX.
                    unsafe {
                        // Reinterpret the 16 bytes as an XMM vector so they can be
                        // written with one `vmovdqa`.
                        let val: MaybeUninit<__m128i> = mem::transmute(val);
                        match order {
                            // Relaxed and Release stores are equivalent.
                            Ordering::Relaxed | Ordering::Release => {
                                asm!(
                                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"), // atomic { *dst = val }
                                    dst = in(reg) dst,
                                    val = in(xmm_reg) val,
                                    options(nostack, preserves_flags),
                                );
                            }
                            Ordering::SeqCst => {
                                // Dummy 8-byte stack slot used only as the memory operand
                                // of the trailing XCHG, which acts as the fence.
                                let p = core::cell::UnsafeCell::new(MaybeUninit::<u64>::uninit());
                                asm!(
                                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"), // atomic { *dst = val }
                                    // Equivalent to `mfence`, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                                    // - https://github.com/taiki-e/portable-atomic/pull/156
                                    // - LLVM uses `lock or` https://godbolt.org/z/vv6rjzfYd
                                    // - Windows uses `xchg` for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                                    // - MSVC STL uses `lock inc` https://github.com/microsoft/STL/pull/740
                                    // - boost uses `lock or` https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                                    concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),        // fence
                                    dst = in(reg) dst,
                                    val = in(xmm_reg) val,
                                    p = in(reg) p.get(),
                                    tmp = out(reg) _,
                                    options(nostack, preserves_flags),
                                );
                            }
                            // Acquire/AcqRel are invalid for a store; excluded by the
                            // caller's contract.
                            _ => crate::utils::unreachable_unchecked(),
                        }
                    }
                }
                // Fallback when AVX is not statically enabled: delegate to the
                // CMPXCHG16B-based swap and drop the returned previous value.
                #[cfg(not(target_feature = "avx"))]
                #[inline]
                unsafe fn atomic_store_cmpxchg16b(
                    dst: *mut MaybeUninit<u128>,
                    val: MaybeUninit<u128>,
                ) {
                    // SAFETY: the caller must uphold the safety contract.
                    unsafe {
                        // CMPXCHG16B has SeqCst semantics.
                        <u128 as AtomicSwap>::atomic_swap(dst, val, Ordering::SeqCst);
                    }
                }
                // Debug-only validation of the caller's pointer contract
                // (macro defined elsewhere in this crate).
                debug_assert_atomic_unsafe_precondition!(dst, u128);

                // Case 1: AVX statically enabled -> always use the vmovdqa path.
                #[cfg(target_feature = "avx")]
                // SAFETY: the caller must uphold the safety contract.
                // cfg guarantees that the CPU supports AVX.
                unsafe {
                    atomic_store_avx(dst, val, order);
                }
                // Case 2: AVX unknown at compile time and outline atomics allowed ->
                // select the implementation once at runtime.
                #[cfg(not(target_feature = "avx"))]
                #[cfg(not(all(
                    not(target_feature = "avx"),
                    any(atomic_maybe_uninit_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
                )))]
                // SAFETY: the caller must uphold the safety contract.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                unsafe {
                    // `fn_alias!` (defined elsewhere in this crate) appears to create
                    // wrappers with the ordering argument pre-applied so that each
                    // variant can be used as a plain function below.
                    fn_alias! {
                        #[target_feature(enable = "avx")]
                        unsafe fn(dst: *mut MaybeUninit<u128>, val: MaybeUninit<u128>);
                        // atomic store by vmovdqa has at least release semantics.
                        atomic_store_avx_non_seqcst = atomic_store_avx(Ordering::Release);
                        atomic_store_avx_seqcst = atomic_store_avx(Ordering::SeqCst);
                    }
                    match order {
                        // Relaxed and Release stores are equivalent in all implementations
                        // that may be called here.
                        Ordering::Relaxed | Ordering::Release => {
                            ifunc!(unsafe fn(dst: *mut MaybeUninit<u128>, val: MaybeUninit<u128>) {
                                if detect::detect().avx() {
                                    atomic_store_avx_non_seqcst
                                } else {
                                    atomic_store_cmpxchg16b
                                }
                            });
                        }
                        Ordering::SeqCst => {
                            ifunc!(unsafe fn(dst: *mut MaybeUninit<u128>, val: MaybeUninit<u128>) {
                                if detect::detect().avx() {
                                    atomic_store_avx_seqcst
                                } else {
                                    atomic_store_cmpxchg16b
                                }
                            });
                        }
                        // Acquire/AcqRel are invalid for a store; excluded by contract.
                        _ => crate::utils::unreachable_unchecked(),
                    }
                }
                // Case 3: AVX unavailable and runtime dispatch disallowed
                // (no-outline-atomics, SGX, or no SSE) -> always use CMPXCHG16B.
                #[cfg(not(target_feature = "avx"))]
                #[cfg(all(
                    not(target_feature = "avx"),
                    any(atomic_maybe_uninit_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                unsafe {
                    // CMPXCHG16B has SeqCst semantics.
                    let _ = order; // ordering is ignored: this fallback is always SeqCst
                    atomic_store_cmpxchg16b(dst, val);
                }
            }
        }
        impl AtomicSwap for u128 {
            // 128-bit atomic swap implemented as a CMPXCHG16B retry loop: the
            // current value is read into RDX:RAX, then `lock cmpxchg16b` is
            // retried until it succeeds in installing the new value, at which
            // point RDX:RAX holds the previous contents.
            #[inline]
            unsafe fn atomic_swap(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                // Ignored: the CMPXCHG16B-based implementation is always SeqCst
                // (see SAFETY comment below).
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                // Debug-only validation of the caller's pointer contract
                // (macro defined elsewhere in this crate).
                debug_assert_atomic_unsafe_precondition!(dst, u128);
                // Split the 128-bit value into lo/hi 64-bit halves for the
                // RCX:RBX operand pair of CMPXCHG16B.
                let val = MaybeUninit128 { whole: val };
                let (mut prev_lo, mut prev_hi);

                // SAFETY: the caller must guarantee that `dst` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                // CMPXCHG16B has SeqCst semantics.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                //
                // NOTE(review): `$save` and `$dst` are metavariables bound by the
                // enclosing macro definition (its matcher is outside this view);
                // `$save` appears to name a scratch register used to preserve RBX and
                // `$dst` the register holding the destination pointer — confirm against
                // the macro's arms.
                unsafe {
                    asm!(
                        concat!("xchg ", $save, ", rbx"), // save rbx which is reserved by LLVM
                        // This is based on the code generated for the first load in DW RMWs by LLVM,
                        // but it is interesting that they generate code that does mixed-sized atomic access.
                        //
                        // This is not single-copy atomic reads, but this is ok because subsequent
                        // CAS will check for consistency.
                        concat!("mov rax, qword ptr [", $dst, "]"),              // atomic { rax = *$rdi }
                        concat!("mov rdx, qword ptr [", $dst, " + 8]"),          // atomic { rdx = *$rdi.byte_add(8) }
                        "2:", // 'retry:
                            concat!("lock cmpxchg16b xmmword ptr [", $dst, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                            "jne 2b",                                            // if ZF == 0 { jump 'retry }
                        concat!("mov rbx, ", $save), // restore rbx
                        inout($save) val.pair.lo => _, // lo half of new value; moved into rbx by the xchg above
                        in("rcx") val.pair.hi,         // hi half of new value
                        out("rax") prev_lo,            // lo half of the previous value
                        out("rdx") prev_hi,            // hi half of the previous value
                        in($dst) dst,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                    // Recombine the two 64-bit halves into the 128-bit result.
                    MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
                }
            }
        }
        impl AtomicCompareExchange for u128 {
            // 128-bit compare-exchange via a single `lock cmpxchg16b`.
            // Returns the previous value and whether the exchange took place.
            #[inline]
            unsafe fn atomic_compare_exchange(
                dst: *mut MaybeUninit<Self>,
                old: MaybeUninit<Self>,
                new: MaybeUninit<Self>,
                // Both orderings are ignored: CMPXCHG16B is always SeqCst
                // (see SAFETY comment below).
                _success: Ordering,
                _failure: Ordering,
            ) -> (MaybeUninit<Self>, bool) {
                // Debug-only validation of the caller's pointer contract
                // (macro defined elsewhere in this crate).
                debug_assert_atomic_unsafe_precondition!(dst, u128);
                // Split both 128-bit values into 64-bit halves: the expected value
                // goes in RDX:RAX, the replacement in RCX:RBX.
                let old = MaybeUninit128 { whole: old };
                let new = MaybeUninit128 { whole: new };
                let (prev_lo, prev_hi);
                let r: u8; // 1 if the CAS succeeded (ZF set), 0 otherwise

                // SAFETY: the caller must guarantee that `dst` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                // CMPXCHG16B has SeqCst semantics.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                //
                // NOTE(review): `$cas_dst` is a metavariable bound by the enclosing
                // macro definition (matcher outside this view); it appears to name
                // the register holding the destination pointer — confirm against the
                // macro's arms.
                unsafe {
                    asm!(
                        "xchg r8, rbx", // save rbx which is reserved by LLVM
                        concat!("lock cmpxchg16b xmmword ptr [", $cas_dst, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                        "sete cl",                                               // cl = ZF
                        "mov rbx, r8", // restore rbx
                        inout("r8") new.pair.lo => _, // lo half of `new`; swapped into rbx above
                        in("rcx") new.pair.hi,        // hi half of `new`
                        inout("rax") old.pair.lo => prev_lo,
                        inout("rdx") old.pair.hi => prev_hi,
                        in($cas_dst) dst,
                        // `sete cl` runs only after CMPXCHG16B has consumed rcx, so a
                        // lateout on cl does not clash with the rcx input above.
                        lateout("cl") r,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                    crate::utils::assert_unchecked(r == 0 || r == 1); // may help remove extra test
                    (
                        MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole,
                        r != 0
                    )
                }
            }
        }
900    };
901}
902
// Instantiate the 128-bit atomic implementations only when CMPXCHG16B is
// statically available (x86_64 with the `cmpxchg16b` target feature enabled);
// this matches the cfg on the `cfg_has_atomic_128!` macros below.
#[cfg(target_arch = "x86_64")]
#[cfg(target_feature = "cmpxchg16b")]
atomic128!();
906
907// -----------------------------------------------------------------------------
908// cfg macros
909
// 8-, 16-, and 32-bit operations carry no cfg gate on this architecture, so
// every `cfg_has_*` macro below simply forwards its tokens and every
// `cfg_no_*` macro expands to nothing.
#[macro_export]
macro_rules! cfg_has_atomic_8 {
    ($($t:tt)*) => ( $($t)* );
}
#[macro_export]
macro_rules! cfg_no_atomic_8 {
    ($($t:tt)*) => ();
}
#[macro_export]
macro_rules! cfg_has_atomic_16 {
    ($($t:tt)*) => ( $($t)* );
}
#[macro_export]
macro_rules! cfg_no_atomic_16 {
    ($($t:tt)*) => ();
}
#[macro_export]
macro_rules! cfg_has_atomic_32 {
    ($($t:tt)*) => ( $($t)* );
}
#[macro_export]
macro_rules! cfg_no_atomic_32 {
    ($($t:tt)*) => ();
}
// 64-bit operations are gated on CMPXCHG8B availability: when
// `atomic_maybe_uninit_no_cmpxchg8b` is set on 32-bit x86, the roles of the
// `has`/`no` pair flip — `cfg_has_atomic_64!` drops its tokens and
// `cfg_no_atomic_64!` forwards them.
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b)))]
#[macro_export]
macro_rules! cfg_has_atomic_64 {
    ($($t:tt)*) => ( $($t)* );
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b))]
#[macro_export]
macro_rules! cfg_has_atomic_64 {
    ($($t:tt)*) => ();
}
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b)))]
#[macro_export]
macro_rules! cfg_no_atomic_64 {
    ($($t:tt)*) => ();
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b))]
#[macro_export]
macro_rules! cfg_no_atomic_64 {
    ($($t:tt)*) => ( $($t)* );
}
// 128-bit operations exist only on x86_64 with the `cmpxchg16b` target
// feature (matching the cfg on the `atomic128!()` invocation above). In that
// configuration `cfg_has_atomic_128!` forwards its tokens and
// `cfg_no_atomic_128!` discards them; otherwise the roles are swapped.
#[cfg(all(target_arch = "x86_64", target_feature = "cmpxchg16b"))]
#[macro_export]
macro_rules! cfg_has_atomic_128 {
    ($($t:tt)*) => ( $($t)* );
}
#[cfg(not(all(target_arch = "x86_64", target_feature = "cmpxchg16b")))]
#[macro_export]
macro_rules! cfg_has_atomic_128 {
    ($($t:tt)*) => ();
}
#[cfg(all(target_arch = "x86_64", target_feature = "cmpxchg16b"))]
#[macro_export]
macro_rules! cfg_no_atomic_128 {
    ($($t:tt)*) => ();
}
#[cfg(not(all(target_arch = "x86_64", target_feature = "cmpxchg16b")))]
#[macro_export]
macro_rules! cfg_no_atomic_128 {
    ($($t:tt)*) => ( $($t)* );
}
// Compare-and-swap support is absent only on 32-bit x86 built with the
// `atomic_maybe_uninit_no_cmpxchg` cfg (the same cfg that gates
// `delegate_cas` at the top of this file); the `has`/`no` pair below flips
// accordingly.
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
#[macro_export]
macro_rules! cfg_has_atomic_cas {
    ($($t:tt)*) => ( $($t)* );
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg))]
#[macro_export]
macro_rules! cfg_has_atomic_cas {
    ($($t:tt)*) => ();
}
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
#[macro_export]
macro_rules! cfg_no_atomic_cas {
    ($($t:tt)*) => ();
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg))]
#[macro_export]
macro_rules! cfg_no_atomic_cas {
    ($($t:tt)*) => ( $($t)* );
}