atomic_maybe_uninit/arch/x86.rs
// SPDX-License-Identifier: Apache-2.0 OR MIT

/*
x86 and x86_64

Refs:
- Intel® 64 and IA-32 Architectures Software Developer Manuals
  https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html
- x86 and amd64 instruction reference
  https://www.felixcloutier.com/x86
- portable-atomic
  https://github.com/taiki-e/portable-atomic

Generated asm:
- x86_64 https://godbolt.org/z/xKzj4WcaE
- x86_64 (+cmpxchg16b) https://godbolt.org/z/jzMoM9nhq
- x86_64 (+cmpxchg16b,+avx) https://godbolt.org/z/6TnxM5hnj
- x86 (i686) https://godbolt.org/z/sM6MPjYWf
- x86 (i686,-sse2) https://godbolt.org/z/MsrxfbcMG
- x86 (i586) https://godbolt.org/z/KEo6P7YEo
- x86 (i586,-x87) https://godbolt.org/z/P8cdjY7h1
*/

delegate_size!(delegate_load_store);
delegate_size!(delegate_swap);
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
delegate_size!(delegate_cas);

use core::{
    arch::asm,
    mem::{self, MaybeUninit},
    sync::atomic::Ordering,
};

#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
use crate::raw::AtomicCompareExchange;
use crate::raw::{AtomicLoad, AtomicStore, AtomicSwap};
#[cfg(target_arch = "x86")]
#[cfg(not(atomic_maybe_uninit_no_cmpxchg8b))]
use crate::utils::{MaybeUninit64, Pair};
#[cfg(target_arch = "x86_64")]
#[cfg(target_feature = "cmpxchg16b")]
use crate::utils::{MaybeUninit128, Pair};

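// Illustrative usage sketch (hypothetical caller; the crate's public wrappers live elsewhere):
// these impls operate on `MaybeUninit<T>`, so the bytes need not be initialized, e.g.:
//
//     let x = MaybeUninit::new(42_u32);
//     // SAFETY: `x` is valid and properly aligned for `u32`, and nothing else accesses it
//     // non-atomically at the same time (trivially true in this single-threaded sketch).
//     let v = unsafe { <u32 as AtomicLoad>::atomic_load(x.as_ptr().cast(), Ordering::SeqCst) };
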
#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}

macro_rules! atomic {
    (
        $ty:ident, $val_reg:ident, $ux_reg:ident, $ux:ident,
        $zx:literal, $val_modifier:literal, $reg_val_modifier:tt, $zx_val_modifier:tt, $ptr_size:tt,
        $cmpxchg_cmp_reg:tt
    ) => {
        #[cfg(target_arch = "x86")]
        atomic!($ty, $val_reg, $ux_reg, reg_abcd, $ux, $zx, $val_modifier,
            $reg_val_modifier, $zx_val_modifier, $ptr_size, $cmpxchg_cmp_reg);
        #[cfg(target_arch = "x86_64")]
        atomic!($ty, $val_reg, $ux_reg, reg, $ux, $zx, $val_modifier,
            $reg_val_modifier, $zx_val_modifier, $ptr_size, $cmpxchg_cmp_reg);
    };
    (
        $ty:ident, $val_reg:ident, $ux_reg:ident, $r_reg:ident, $ux:ident,
        $zx:literal, $val_modifier:literal, $reg_val_modifier:tt, $zx_val_modifier:tt, $ptr_size:tt,
        $cmpxchg_cmp_reg:tt
    ) => {
        delegate_signed!(delegate_load_store, $ty);
        delegate_signed!(delegate_swap, $ty);
        #[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
        delegate_signed!(delegate_cas, $ty);
        impl AtomicLoad for $ty {
            #[inline]
            unsafe fn atomic_load(
                src: *const MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(src, $ty);
                let out;

                // SAFETY: the caller must uphold the safety contract.
                unsafe {
                    // atomic load is always SeqCst.
                    asm!(
                        concat!("mov", $zx, " {out", $zx_val_modifier, "}, ", $ptr_size, " ptr [{src", ptr_modifier!(), "}]"), // atomic { out = *src }
                        src = in(reg) src,
                        out = lateout(reg) out,
                        options(nostack, preserves_flags),
                    );
                }
                crate::utils::extend32::$ty::extract(out)
            }
        }
        impl AtomicStore for $ty {
            #[inline]
            unsafe fn atomic_store(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                order: Ordering,
            ) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);

                // SAFETY: the caller must uphold the safety contract.
                unsafe {
                    match order {
                        // Relaxed and Release stores are equivalent.
                        Ordering::Relaxed | Ordering::Release => {
                            asm!(
                                concat!("mov ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {val", $val_modifier, "}"), // atomic { *dst = val }
                                dst = in(reg) dst,
                                val = in($val_reg) val,
                                options(nostack, preserves_flags),
                            );
                        }
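                        // A plain store is not enough for SeqCst on x86: the write must also be
                        // ordered before later loads, which needs a full barrier. `xchg` with a
                        // memory operand is implicitly `lock`ed, so it performs the store and acts
                        // as that barrier in a single instruction (instead of `mov` + `mfence`).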
                        Ordering::SeqCst => {
                            asm!(
                                // SeqCst store is xchg, not mov
                                concat!("xchg ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {val", $val_modifier, "}"), // atomic { _x = *dst; *dst = val; val = _x }
                                dst = in(reg) dst,
                                val = inout($val_reg) val => _,
                                options(nostack, preserves_flags),
                            );
                        }
                        _ => unreachable!(),
                    }
                }
            }
        }
        impl AtomicSwap for $ty {
            #[inline]
            unsafe fn atomic_swap(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let out: MaybeUninit<Self>;

                // SAFETY: the caller must uphold the safety contract.
                unsafe {
                    // atomic swap is always SeqCst.
                    asm!(
                        concat!("xchg ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {val", $val_modifier, "}"), // atomic { _x = *dst; *dst = val; val = _x }
                        dst = in(reg) dst,
                        val = inout($val_reg) val => out,
                        options(nostack, preserves_flags),
                    );
                }
                out
            }
        }
        #[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
        impl AtomicCompareExchange for $ty {
            #[inline]
            unsafe fn atomic_compare_exchange(
                dst: *mut MaybeUninit<Self>,
                old: MaybeUninit<Self>,
                new: MaybeUninit<Self>,
                _success: Ordering,
                _failure: Ordering,
            ) -> (MaybeUninit<Self>, bool) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let out: MaybeUninit<Self>;

                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg
                unsafe {
                    let r: MaybeUninit<u32>;
                    // compare_exchange is always SeqCst.
                    asm!(
                        concat!("lock cmpxchg ", $ptr_size, " ptr [{dst", ptr_modifier!(), "}], {new", $reg_val_modifier, "}"), // atomic { if *dst == $cmpxchg_cmp_reg { ZF = 1; *dst = new } else { ZF = 0; $cmpxchg_cmp_reg = *dst } }
                        "sete {r:l}", // r = ZF
                        dst = in(reg) dst,
                        // Avoid reg_byte ($val_reg) in new and r to work around a cranelift bug with multiple in or lateout reg_byte.
                        new = in($ux_reg) crate::utils::extend32::$ty::$ux(new),
                        r = lateout($r_reg) r,
                        inout($cmpxchg_cmp_reg) old => out,
                        // Do not use `preserves_flags` because CMPXCHG modifies the ZF, CF, PF, AF, SF, and OF flags.
                        options(nostack),
                    );
                    let r = crate::utils::extend32::u8::extract(r).assume_init();
                    crate::utils::assert_unchecked(r == 0 || r == 1); // may help remove extra test
                    (out, r != 0)
                }
            }
        }
    };
}

#[cfg(target_arch = "x86")]
atomic!(u8, reg_byte, reg_abcd, uninit, "zx", "", ":l", ":e", "byte", "al");
#[cfg(target_arch = "x86_64")]
atomic!(u8, reg_byte, reg, uninit, "zx", "", ":l", ":e", "byte", "al");
atomic!(u16, reg, reg, identity, "zx", ":x", ":x", ":e", "word", "ax");
atomic!(u32, reg, reg, identity, "", ":e", ":e", ":e", "dword", "eax");
#[cfg(target_arch = "x86_64")]
atomic!(u64, reg, reg, identity, "", "", "", "", "qword", "rax");
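// Reading one invocation against the macro's parameter list above: for u16 the value register
// class is `reg`, loads use `movzx` into a 32-bit destination (the `:e` modifier on `out`),
// the memory operand size is `word`, and `lock cmpxchg` compares against `ax`.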

// For load/store, we can use MOVQ(SSE2)/MOVLPS(SSE)/FILD&FISTP(x87) instead of CMPXCHG8B.
// Refs: https://github.com/llvm/llvm-project/blob/llvmorg-21.1.0/llvm/test/CodeGen/X86/atomic-load-store-wide.ll
#[cfg(target_arch = "x86")]
#[cfg(not(atomic_maybe_uninit_no_cmpxchg8b))]
macro_rules! atomic64 {
    ($ty:ident) => {
        delegate_signed!(delegate_all, $ty);
        impl AtomicLoad for $ty {
            #[inline]
            unsafe fn atomic_load(
                src: *const MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(src, $ty);

                #[cfg(all(
                    target_feature = "sse2",
                    not(atomic_maybe_uninit_test_prefer_x87_over_sse),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                // cfg guarantees that the CPU supports SSE2.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/movq (SSE2)
                // - https://www.felixcloutier.com/x86/movd:movq (SSE2)
                unsafe {
                    let out;
                    // atomic load is always SeqCst.
                    asm!(
                        "movq {out}, qword ptr [{src}]", // atomic { out[:] = *src }
                        src = in(reg) src,
                        out = out(xmm_reg) out,
                        options(nostack, preserves_flags),
                    );
                    mem::transmute::<
                        MaybeUninit<core::arch::x86::__m128i>,
                        [MaybeUninit<Self>; 2],
                    >(out)[0]
                }
                #[cfg(all(
                    not(target_feature = "sse2"),
                    target_feature = "sse",
                    not(atomic_maybe_uninit_test_prefer_x87_over_sse),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                // cfg guarantees that the CPU supports SSE.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/movlps (SSE)
                unsafe {
                    let out;
                    // atomic load is always SeqCst.
                    asm!(
                        "movlps {out}, qword ptr [{src}]", // atomic { out[:] = *src }
                        src = in(reg) src,
                        out = out(xmm_reg) out,
                        options(nostack, preserves_flags),
                    );
                    mem::transmute::<
                        MaybeUninit<core::arch::x86::__m128>,
                        [MaybeUninit<Self>; 2],
                    >(out)[0]
                }
                #[cfg(all(
                    any(
                        not(target_feature = "sse"),
                        atomic_maybe_uninit_test_prefer_x87_over_sse,
                    ),
                    all(
                        any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                        not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
                    ),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/fild
                // - https://www.felixcloutier.com/x86/fist:fistp
                unsafe {
                    let mut out = MaybeUninit::<Self>::uninit();
                    // atomic load is always SeqCst.
                    asm!(
                        "fild qword ptr [{src}]", // atomic { st.push(*src) }
                        "fistp qword ptr [{out}]", // *out = st.pop()
                        src = in(reg) src,
                        out = in(reg) out.as_mut_ptr(),
                        out("st(0)") _,
                        out("st(1)") _,
                        out("st(2)") _,
                        out("st(3)") _,
                        out("st(4)") _,
                        out("st(5)") _,
                        out("st(6)") _,
                        out("st(7)") _,
                        // Do not use `preserves_flags` because FILD and FISTP modify C1 in x87 FPU status word.
                        options(nostack),
                    );
                    out
                }
                #[cfg(all(
                    any(
                        not(target_feature = "sse"),
                        atomic_maybe_uninit_test_prefer_x87_over_sse,
                    ),
                    not(all(
                        any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                        not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
                    )),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let (prev_lo, prev_hi);
                    // atomic load is always SeqCst.
                    asm!(
                        "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                        // set old/new args of CMPXCHG8B to 0
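                        // Note: with old == new == 0 the CAS either fails (returning the current
                        // value in edx:eax) or "succeeds" by writing back the same zero it read,
                        // so memory is never changed. It is still a locked write, though, so this
                        // fallback cannot be used on read-only memory.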
                        in("ebx") 0_u32,
                        in("ecx") 0_u32,
                        inout("eax") 0_u32 => prev_lo,
                        inout("edx") 0_u32 => prev_hi,
                        in("edi") src,
                        // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                        options(nostack),
                    );
                    MaybeUninit64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
                }
            }
        }
        impl AtomicStore for $ty {
            #[inline]
            unsafe fn atomic_store(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                order: Ordering,
            ) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);

                #[cfg(all(
                    target_feature = "sse",
                    not(atomic_maybe_uninit_test_prefer_x87_over_sse),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                // cfg guarantees that the CPU supports SSE.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/movlps (SSE)
                // - https://www.felixcloutier.com/x86/lock
                // - https://www.felixcloutier.com/x86/or
                unsafe {
                    let val: MaybeUninit<core::arch::x86::__m128> =
                        mem::transmute([val, MaybeUninit::uninit()]);
                    match order {
                        // Relaxed and Release stores are equivalent.
                        Ordering::Relaxed | Ordering::Release => {
                            asm!(
                                "movlps qword ptr [{dst}], {val}", // atomic { *dst = val[:] }
                                dst = in(reg) dst,
                                val = in(xmm_reg) val,
                                options(nostack, preserves_flags),
                            );
                        }
                        Ordering::SeqCst => {
                            let p = core::cell::UnsafeCell::new(MaybeUninit::<u32>::uninit());
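                            // The locked `xchg` below targets this dummy local rather than `dst`,
                            // so the fence does not add an extra locked RMW on the destination (or
                            // on any other shared) cache line.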
                            asm!(
                                "movlps qword ptr [{dst}], {val}", // atomic { *dst = val[:] }
                                // Equivalent to `mfence`, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                                // - https://github.com/taiki-e/portable-atomic/pull/156
                                // - LLVM uses `lock or` https://godbolt.org/z/vv6rjzfYd
                                // - Windows uses `xchg` for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                                // - MSVC STL uses `lock inc` https://github.com/microsoft/STL/pull/740
                                // - boost uses `lock or` https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                                "xchg dword ptr [{p}], {tmp}", // fence
                                dst = in(reg) dst,
                                val = in(xmm_reg) val,
                                p = inout(reg) p.get() => _,
                                tmp = lateout(reg) _,
                                options(nostack, preserves_flags),
                            );
                        }
                        _ => unreachable!(),
                    }
                }
                #[cfg(all(
                    any(
                        not(target_feature = "sse"),
                        atomic_maybe_uninit_test_prefer_x87_over_sse,
                    ),
                    all(
                        any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                        not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
                    ),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs:
                // - https://www.felixcloutier.com/x86/fild
                // - https://www.felixcloutier.com/x86/fist:fistp
                unsafe {
                    match order {
                        // Relaxed and Release stores are equivalent.
                        Ordering::Relaxed | Ordering::Release => {
                            asm!(
                                "fild qword ptr [{val}]", // st.push(*val)
                                "fistp qword ptr [{dst}]", // atomic { *dst = st.pop() }
                                dst = in(reg) dst,
                                val = in(reg) val.as_ptr(),
                                out("st(0)") _,
                                out("st(1)") _,
                                out("st(2)") _,
                                out("st(3)") _,
                                out("st(4)") _,
                                out("st(5)") _,
                                out("st(6)") _,
                                out("st(7)") _,
                                // Do not use `preserves_flags` because FILD and FISTP modify condition code flags in x87 FPU status word.
                                options(nostack),
                            );
                        }
                        Ordering::SeqCst => {
                            let p = core::cell::UnsafeCell::new(MaybeUninit::<u32>::uninit());
                            asm!(
                                "fild qword ptr [{val}]", // st.push(*val)
                                "fistp qword ptr [{dst}]", // atomic { *dst = st.pop() }
                                // Equivalent to `mfence`, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                                // - https://github.com/taiki-e/portable-atomic/pull/156
                                // - LLVM uses `lock or` https://godbolt.org/z/vv6rjzfYd
                                // - Windows uses `xchg` for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                                // - MSVC STL uses `lock inc` https://github.com/microsoft/STL/pull/740
                                // - boost uses `lock or` https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                                "xchg dword ptr [{p}], {tmp}", // fence
                                dst = in(reg) dst,
                                val = in(reg) val.as_ptr(),
                                p = inout(reg) p.get() => _,
                                tmp = lateout(reg) _,
                                out("st(0)") _,
                                out("st(1)") _,
                                out("st(2)") _,
                                out("st(3)") _,
                                out("st(4)") _,
                                out("st(5)") _,
                                out("st(6)") _,
                                out("st(7)") _,
                                // Do not use `preserves_flags` because FILD and FISTP modify condition code flags in x87 FPU status word.
                                options(nostack),
                            );
                        }
                        _ => unreachable!(),
                    }
                }
                #[cfg(all(
                    any(
                        not(target_feature = "sse"),
                        atomic_maybe_uninit_test_prefer_x87_over_sse,
                    ),
                    not(all(
                        any(target_feature = "x87", atomic_maybe_uninit_target_feature = "x87"),
                        not(atomic_maybe_uninit_test_prefer_cmpxchg8b_over_x87),
                    )),
                ))]
                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let val = MaybeUninit64 { whole: val };
                    // atomic store by CMPXCHG8B is always SeqCst.
                    let _ = order;
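                    // Store-via-CAS-loop sketch of what the asm below does: read the current
                    // value (in two 32-bit halves), then retry `lock cmpxchg8b` until it succeeds
                    // in replacing that value with `val`; the final successful CAS is the store.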
                    asm!(
                        // This is based on the code LLVM generates for the first load in DW RMWs,
                        // though it is interesting that it emits code that does mixed-size atomic accesses.
                        //
                        // These are not single-copy atomic reads, but that is ok because the subsequent
                        // CAS will check for consistency.
                        "mov eax, dword ptr [edi]", // atomic { eax = *edi }
                        "mov edx, dword ptr [edi + 4]", // atomic { edx = *edi.byte_add(4) }
                        "2:", // 'retry:
                        "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                        "jne 2b", // if ZF == 0 { jump 'retry }
                        in("ebx") val.pair.lo,
                        in("ecx") val.pair.hi,
                        out("eax") _,
                        out("edx") _,
                        in("edi") dst,
                        // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                        options(nostack),
                    );
                }
            }
        }
        impl AtomicSwap for $ty {
            #[inline]
            unsafe fn atomic_swap(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let val = MaybeUninit64 { whole: val };
                let (mut prev_lo, mut prev_hi);

                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    // atomic swap is always SeqCst.
                    asm!(
                        // This is based on the code LLVM generates for the first load in DW RMWs,
                        // though it is interesting that it emits code that does mixed-size atomic accesses.
                        //
                        // These are not single-copy atomic reads, but that is ok because the subsequent
                        // CAS will check for consistency.
                        "mov eax, dword ptr [edi]", // atomic { eax = *edi }
                        "mov edx, dword ptr [edi + 4]", // atomic { edx = *edi.byte_add(4) }
                        "2:", // 'retry:
                        "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                        "jne 2b", // if ZF == 0 { jump 'retry }
                        in("ebx") val.pair.lo,
                        in("ecx") val.pair.hi,
                        out("eax") prev_lo,
                        out("edx") prev_hi,
                        in("edi") dst,
                        // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                        options(nostack),
                    );
                    MaybeUninit64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
                }
            }
        }
        impl AtomicCompareExchange for $ty {
            #[inline]
            unsafe fn atomic_compare_exchange(
                dst: *mut MaybeUninit<Self>,
                old: MaybeUninit<Self>,
                new: MaybeUninit<Self>,
                _success: Ordering,
                _failure: Ordering,
            ) -> (MaybeUninit<Self>, bool) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let old = MaybeUninit64 { whole: old };
                let new = MaybeUninit64 { whole: new };
                let (prev_lo, prev_hi);

                // SAFETY: the caller must uphold the safety contract.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let r: u8;
                    // compare_exchange is always SeqCst.
                    asm!(
                        "lock cmpxchg8b qword ptr [edi]", // atomic { if *edi == edx:eax { ZF = 1; *edi = ecx:ebx } else { ZF = 0; edx:eax = *edi } }
                        "sete cl", // cl = ZF
                        in("ebx") new.pair.lo,
                        in("ecx") new.pair.hi,
                        inout("eax") old.pair.lo => prev_lo,
                        inout("edx") old.pair.hi => prev_hi,
                        in("edi") dst,
                        lateout("cl") r,
                        // Do not use `preserves_flags` because CMPXCHG8B modifies the ZF flag.
                        options(nostack),
                    );
                    crate::utils::assert_unchecked(r == 0 || r == 1); // may help remove extra test
                    (
                        MaybeUninit64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole,
                        r != 0,
                    )
                }
            }
        }
    };
}

#[cfg(target_arch = "x86")]
#[cfg(not(atomic_maybe_uninit_no_cmpxchg8b))]
atomic64!(u64);

#[cfg(target_arch = "x86_64")]
#[cfg(target_feature = "cmpxchg16b")]
macro_rules! atomic128 {
    ($ty:ident) => {
        #[cfg(target_pointer_width = "32")]
        atomic128!($ty, "edi");
        #[cfg(target_pointer_width = "64")]
        atomic128!($ty, "rdi");
    };
    ($ty:ident, $rdi:tt) => {
        delegate_signed!(delegate_all, $ty);
        impl AtomicLoad for $ty {
            #[inline]
            unsafe fn atomic_load(
                src: *const MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(src, $ty);

                // VMOVDQA is atomic when AVX is available.
                // See https://gcc.gnu.org/bugzilla//show_bug.cgi?id=104688 for details.
                //
                // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
                #[cfg(target_feature = "avx")]
                // SAFETY: the caller must guarantee that `src` is valid for reads,
                // 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports AVX.
                unsafe {
                    let out;
                    asm!(
                        concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"), // atomic { out = *src }
                        src = in(reg) src,
                        out = lateout(xmm_reg) out,
                        options(nostack, preserves_flags),
                    );
                    mem::transmute::<MaybeUninit<core::arch::x86_64::__m128i>, MaybeUninit<Self>>(
                        out
                    )
                }
                #[cfg(not(target_feature = "avx"))]
                // SAFETY: the caller must guarantee that `src` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let (prev_lo, prev_hi);
                    // atomic load is always SeqCst.
                    asm!(
                        "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                        "xor rbx, rbx", // zeroed rbx
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                        "mov rbx, {rbx_tmp}", // restore rbx
                        // set old/new args of CMPXCHG16B to 0 (rbx is zeroed after being saved to rbx_tmp, to avoid xchg)
                        rbx_tmp = out(reg) _,
                        in("rcx") 0_u64,
                        inout("rax") 0_u64 => prev_lo,
                        inout("rdx") 0_u64 => prev_hi,
                        in($rdi) src,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                    MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
                }
            }
        }
        impl AtomicStore for $ty {
            #[inline]
            unsafe fn atomic_store(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                order: Ordering,
            ) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);

                // VMOVDQA is atomic when AVX is available.
                // See https://gcc.gnu.org/bugzilla//show_bug.cgi?id=104688 for details.
                //
                // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
                #[cfg(target_feature = "avx")]
                // SAFETY: the caller must guarantee that `dst` is valid for writes,
                // 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports AVX.
                unsafe {
                    let val: MaybeUninit<core::arch::x86_64::__m128i> = mem::transmute(val);
                    match order {
                        // Relaxed and Release stores are equivalent.
                        Ordering::Relaxed | Ordering::Release => {
                            asm!(
                                concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"), // atomic { *dst = val }
                                dst = in(reg) dst,
                                val = in(xmm_reg) val,
                                options(nostack, preserves_flags),
                            );
                        }
                        Ordering::SeqCst => {
                            let p = core::cell::UnsafeCell::new(MaybeUninit::<u64>::uninit());
                            asm!(
                                concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"), // atomic { *dst = val }
                                // Equivalent to `mfence`, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                                // - https://github.com/taiki-e/portable-atomic/pull/156
                                // - LLVM uses `lock or` https://godbolt.org/z/vv6rjzfYd
                                // - Windows uses `xchg` for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                                // - MSVC STL uses `lock inc` https://github.com/microsoft/STL/pull/740
                                // - boost uses `lock or` https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                                concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"), // fence
                                dst = in(reg) dst,
                                val = in(xmm_reg) val,
                                p = inout(reg) p.get() => _,
                                tmp = lateout(reg) _,
                                options(nostack, preserves_flags),
                            );
                        }
                        _ => unreachable!(),
                    }
                }
                #[cfg(not(target_feature = "avx"))]
                // SAFETY: the caller must guarantee that `dst` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let val = MaybeUninit128 { whole: val };
                    let _ = order;
                    // atomic store is always SeqCst.
                    asm!(
                        "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                        // This is based on the code LLVM generates for the first load in DW RMWs,
                        // though it is interesting that it emits code that does mixed-size atomic accesses.
                        //
                        // These are not single-copy atomic reads, but that is ok because the subsequent
                        // CAS will check for consistency.
                        concat!("mov rax, qword ptr [", $rdi, "]"), // atomic { rax = *$rdi }
                        concat!("mov rdx, qword ptr [", $rdi, " + 8]"), // atomic { rdx = *$rdi.byte_add(8) }
                        "2:", // 'retry:
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                        "jne 2b", // if ZF == 0 { jump 'retry }
                        "mov rbx, {rbx_tmp}", // restore rbx
                        rbx_tmp = inout(reg) val.pair.lo => _,
                        in("rcx") val.pair.hi,
                        out("rax") _,
                        out("rdx") _,
                        in($rdi) dst,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                }
            }
        }
        impl AtomicSwap for $ty {
            #[inline]
            unsafe fn atomic_swap(
                dst: *mut MaybeUninit<Self>,
                val: MaybeUninit<Self>,
                _order: Ordering,
            ) -> MaybeUninit<Self> {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let val = MaybeUninit128 { whole: val };
                let (mut prev_lo, mut prev_hi);

                // SAFETY: the caller must guarantee that `dst` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    // atomic swap is always SeqCst.
                    asm!(
                        "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                        // This is based on the code LLVM generates for the first load in DW RMWs,
                        // though it is interesting that it emits code that does mixed-size atomic accesses.
                        //
                        // These are not single-copy atomic reads, but that is ok because the subsequent
                        // CAS will check for consistency.
                        concat!("mov rax, qword ptr [", $rdi, "]"), // atomic { rax = *$rdi }
                        concat!("mov rdx, qword ptr [", $rdi, " + 8]"), // atomic { rdx = *$rdi.byte_add(8) }
                        "2:", // 'retry:
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                        "jne 2b", // if ZF == 0 { jump 'retry }
                        "mov rbx, {rbx_tmp}", // restore rbx
                        rbx_tmp = inout(reg) val.pair.lo => _,
                        in("rcx") val.pair.hi,
                        out("rax") prev_lo,
                        out("rdx") prev_hi,
                        in($rdi) dst,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                    MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
                }
            }
        }
        impl AtomicCompareExchange for $ty {
            #[inline]
            unsafe fn atomic_compare_exchange(
                dst: *mut MaybeUninit<Self>,
                old: MaybeUninit<Self>,
                new: MaybeUninit<Self>,
                _success: Ordering,
                _failure: Ordering,
            ) -> (MaybeUninit<Self>, bool) {
                debug_assert_atomic_unsafe_precondition!(dst, $ty);
                let old = MaybeUninit128 { whole: old };
                let new = MaybeUninit128 { whole: new };
                let (prev_lo, prev_hi);

                // SAFETY: the caller must guarantee that `dst` is valid for both writes and
                // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
                // cfg guarantees that the CPU supports CMPXCHG16B.
                //
                // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
                unsafe {
                    let r: u8;
                    // compare_exchange is always SeqCst.
                    asm!(
                        "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), // atomic { if *$rdi == rdx:rax { ZF = 1; *$rdi = rcx:rbx } else { ZF = 0; rdx:rax = *$rdi } }
                        "sete cl", // cl = ZF
                        "mov rbx, {rbx_tmp}", // restore rbx
                        rbx_tmp = inout(reg) new.pair.lo => _,
                        in("rcx") new.pair.hi,
                        inout("rax") old.pair.lo => prev_lo,
                        inout("rdx") old.pair.hi => prev_hi,
                        in($rdi) dst,
                        lateout("cl") r,
                        // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                        options(nostack),
                    );
                    crate::utils::assert_unchecked(r == 0 || r == 1); // may help remove extra test
                    (
                        MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole,
                        r != 0,
                    )
                }
            }
        }
    };
}

#[cfg(target_arch = "x86_64")]
#[cfg(target_feature = "cmpxchg16b")]
atomic128!(u128);

// -----------------------------------------------------------------------------
// cfg macros
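//
// Each `cfg_has_atomic_*` macro expands the passed tokens only when atomics of that width (or
// CAS) are available on the current target; the matching `cfg_no_atomic_*` macro is its inverse.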

#[macro_export]
macro_rules! cfg_has_atomic_8 {
    ($($tt:tt)*) => { $($tt)* };
}
#[macro_export]
macro_rules! cfg_no_atomic_8 {
    ($($tt:tt)*) => {};
}
#[macro_export]
macro_rules! cfg_has_atomic_16 {
    ($($tt:tt)*) => { $($tt)* };
}
#[macro_export]
macro_rules! cfg_no_atomic_16 {
    ($($tt:tt)*) => {};
}
#[macro_export]
macro_rules! cfg_has_atomic_32 {
    ($($tt:tt)*) => { $($tt)* };
}
#[macro_export]
macro_rules! cfg_no_atomic_32 {
    ($($tt:tt)*) => {};
}
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b)))]
#[macro_export]
macro_rules! cfg_has_atomic_64 {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b)))]
#[macro_export]
macro_rules! cfg_no_atomic_64 {
    ($($tt:tt)*) => {};
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b))]
#[macro_export]
macro_rules! cfg_has_atomic_64 {
    ($($tt:tt)*) => {};
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg8b))]
#[macro_export]
macro_rules! cfg_no_atomic_64 {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(not(all(target_arch = "x86_64", target_feature = "cmpxchg16b")))]
#[macro_export]
macro_rules! cfg_has_atomic_128 {
    ($($tt:tt)*) => {};
}
#[cfg(not(all(target_arch = "x86_64", target_feature = "cmpxchg16b")))]
#[macro_export]
macro_rules! cfg_no_atomic_128 {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(all(target_arch = "x86_64", target_feature = "cmpxchg16b"))]
#[macro_export]
macro_rules! cfg_has_atomic_128 {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(all(target_arch = "x86_64", target_feature = "cmpxchg16b"))]
#[macro_export]
macro_rules! cfg_no_atomic_128 {
    ($($tt:tt)*) => {};
}
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
#[macro_export]
macro_rules! cfg_has_atomic_cas {
    ($($tt:tt)*) => { $($tt)* };
}
#[cfg(not(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg)))]
#[macro_export]
macro_rules! cfg_no_atomic_cas {
    ($($tt:tt)*) => {};
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg))]
#[macro_export]
macro_rules! cfg_has_atomic_cas {
    ($($tt:tt)*) => {};
}
#[cfg(all(target_arch = "x86", atomic_maybe_uninit_no_cmpxchg))]
#[macro_export]
macro_rules! cfg_no_atomic_cas {
    ($($tt:tt)*) => { $($tt)* };
}