pessimize 3.0.2

More efficient Rust compiler optimization barriers
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
//! This crate aims to implement minimally costly optimization barriers for
//! every architecture that has `asm!()` support (currently x86(_64),
//! 32-bit ARM, AArch64 and RISC-V, more possible on nightly via the
//! `asm_experimental_arch` unstable feature).
//!
//! You can use these barriers to prevent the compiler from optimizing out
//! selected redundant or unnecessary computations in situations where such
//! optimization is undesirable. The most typical usage scenario is
//! microbenchmarking, but there might also be applications to cryptography or
//! low-level development, where optimization must also be controlled.
//!
//! # Implementations
//!
//! The barriers will be implemented for any type from core/std that either...
//! - Can be shoved into CPU registers, with "natural" target registers
//!   dictated by normal ABI calling conventions.
//! - Can be losslessly converted back and forth to a set of values that have
//!   this property, in a manner that is easily optimized out.
//!
//! Any type which is not directly supported can still be subjected to an
//! optimization barrier by taking a reference to it and subjecting that
//! reference to an optimization barrier, at the cost of causing the value to
//! be spilled to memory. If the nightly `default_impl` feature is enabled, the
//! crate will provide a default `Pessimize` impl that does this for you.
//!
//! You can tell which types implement `Pessimize` on your compiler target by
//! running `cargo doc` and checking the implementor list of `Pessimize` and
//! `BorrowPessimize`.
//!
//! To implement `Pessimize` for your own types, you should consider
//! implementing `PessimizeCast` and `BorrowPessimize`, which make the job a
//! bit easier. `Pessimize` is automatically implemented for any type that
//! implements `BorrowPessimize`.
//!
//! # Semantics
//!
//! For pointer-like entities, optimization barriers other than `hide` can
//! have the side-effect of causing the compiler to assume that global and
//! thread-local variables might have been accessed using similar semantics as
//! the pointer itself. This will reduce applicable compiler optimizations for
//! such variables, so the use of `hide` should be favored whenever global or
//! thread-local variables are used (or you don't know if they are used).
//!
//! In general, barriers other than `hide` have more avenues for surprising
//! behavior (see their documentation for details), so you should strive to do
//! what you want with `hide` if possible, and only reach for other barriers
//! where the extra expressive power of these primitives is truly needed.
//!
//! While the barriers will accept zero-sized types such as `PhantomData`, they
//! will only be effective for those that access global or thread-local state,
//! like `std::alloc::System` does. That is because without such external state,
//! zero-sized objects do not own or provide access to any information, so the
//! compiler can trivially infer that the optimization barrier cannot read or
//! modify any internal state. Implementations of `Pessimize` on such types are
//! only provided to ease automatic derivation of `Pessimize` like tuples (and
//! hopefully custom structs too in the future).
//!
//! The documentation of the top-level functions (`hide`, `assume_read`,
//! `consume`, `assume_accessed` and `assume_accessed_imut`) contain more
//! details on the optimization barrier that is being implemented.
//!
//! # When to use this crate
//!
//! You should consider use of this crate over `core::hint::black_box`, or third
//! party cousins thereof, because...
//! - It has a better-defined API contract with stronger guarantees (unlike
//!   `core::hint::black_box`, where "do nothing" is a valid implementation).
//! - It exposes finer-grained operations, which clarify your code's intent and
//!   reduce harmful side-effects.
//!
//! The main drawbacks of this crate's approach being that...
//! - It only works on selected hardware architectures (though they are the ones
//!   on which you are most likely to run benchmarks, and it should get better
//!   over time as more inline assembly architectures get stabilized).
//! - It needs a lot of tricky unsafe code.

#![cfg_attr(not(any(feature = "std", test)), no_std)]
#![cfg_attr(feature = "nightly", feature(doc_cfg, portable_simd, ptr_metadata))]
#![cfg_attr(feature = "default_impl", allow(incomplete_features))]
#![cfg_attr(feature = "default_impl", feature(specialization))]
#![deny(missing_docs)]

// TODO: Once allocator_api is stable, support collections with custom
//       allocators by applying optimization barriers to the allocator as well.
//       Right now, doing so would require either duplicating the tricky
//       collection code (one version with allocators and one version without)
//       or dropping collection support on stable, neither of which sound
//       satisfactory given that custom allocators are expected to be niche.
#[cfg(any(feature = "alloc", test))]
extern crate alloc as std_alloc;

mod alloc;
pub mod arch;
#[cfg(any(feature = "alloc", test))]
mod boxed;
mod cell;
mod cmp;
#[cfg(all(any(feature = "std", test), any(unix, target_os = "wasi")))]
mod ffi;
mod fmt;
#[cfg(all(any(feature = "std", test), any(unix, windows)))]
mod fs;
#[cfg(any(feature = "std", test))]
mod io;
mod iter;
mod marker;
mod mem;
#[cfg(any(feature = "std", test))]
mod net;
mod num;
mod ops;
mod panic;
#[cfg(all(any(feature = "std", test), any(unix, target_os = "wasi")))]
mod path;
mod pin;
mod primitive;
#[cfg(all(any(feature = "std", test), unix))]
mod process;
mod ptr;
#[cfg(any(feature = "alloc", test))]
mod string;
mod sync;
// TODO: Implement task support once the waker_getters feature is stable
// TODO: Implement time support when/if a zero-cost way to construct a Duration
//       back from seconds and nanoseconds is provided. Currently, we only have
//       Duration::new(), which pessimistically assumes the worst about nanos.
#[cfg(any(feature = "alloc", test))]
mod vec;

/// Optimization barriers provided by this crate
///
/// This trait is implemented for both value and reference types, which can
/// lead to unexpected method syntax semantics (you expected to call the
/// `Pessimize` impl of `T`, and you actually called that of `&T`). As a result,
/// it is strongly recommended to use the optimization barriers via the free
/// functions provided at the crate root, rather than via method syntax.
///
/// Implementing `Pessimize` requires fairly tricky code. Consider implementing
/// `PessimizeCast` and `BorrowPessimize` instead, which is slightly easier, and
/// will lead to an automatic `Pessimize` implementation.
///
/// # Safety
///
/// Unsafe code may rely on hide() behaving as an identity function (returning
/// the input value unaltered) and `assume_xyz()` not altering anything even if
/// the type is internally mutable.
///
pub unsafe trait Pessimize {
    /// Identity function that the optimizer cannot see through
    ///
    /// See `pessimize::hide()` for documentation
    fn hide(self) -> Self;

    /// Pretend this value (and its pointees, if any) is read here
    ///
    /// See `pessimize::assume_read()` for documentation
    fn assume_read(&self);

    /// Pretend data reachable through this value was read and modified
    ///
    /// See `pessimize::assume_accessed()` for documentation
    fn assume_accessed(&mut self);

    /// Variant of `assume_accessed` for internally mutable pointees
    ///
    /// See `pessimize::assume_accessed_imut()` for documentation
    fn assume_accessed_imut(&self);
}

/// Re-emit the input value as its output (identity function), but force the
/// compiler to assume that it is a completely different value.
///
/// If you want to re-do the exact same computation in a loop, you can pass
/// its inputs through this barrier to prevent the compiler from optimizing
/// out the redundant computations.
///
/// Although `Pessimize` is implemented for zero-sized types, `hide()` will not
/// serve its normal purpose of obscuring output on those types, because there
/// is only one possible return value so the compiler knows that the output
/// value is the  same as the input value. Since zero-sized types may only hold
/// state through global and thread-local variables, implementations of
/// `Pessimize::hide` for stateful ZSTs should feature an
/// `assume_globals_accessed()` optimization barrier.
///
/// If you need a `hide` alternative for a variable `x` that does not
/// implement `Pessimize`, you can use `*hide(&x)`, at the cost of forcing
/// all data reachable via x which is currently cached in registers to be
/// spilled to memory and reloaded when needed later on.
///
/// If you are familiar with the unstable `core::hint::black_box` function or
/// analogs in benchmarking libraries like Criterion, please note that although
/// this function has a similar API signature, it does not have the same
/// semantics and cannot be used as a direct replacement. For example,
/// `core::hint::black_box(&mut x)` should have the effect of
/// `pessimize::assume_accessed(&mut x)`, whereas `pessimize::hide(x)` does not
/// enforce any compiler assumptions concerning the input value, it just turns
/// it into another value that looks unrelated in the eye of the compiler.
///
#[inline]
pub fn hide<T: Pessimize>(x: T) -> T {
    Pessimize::hide(x)
}

/// Force the compiler to assume that a value, and data transitively
/// reachable via that value (for pointers/refs), is being used if Rust
/// rules allow for it.
///
/// You can apply this barrier to unused computation results in order to
/// prevent the compiler from optimizing out the associated computations.
///
/// On pointers/references, it will have the side effect of spilling target data
/// resident in CPU registers to memory, although the in-register copies remain
/// valid and can be reused later on without reloading.
///
/// If you need an `assume_read` alternative for a variable `x` that does not
/// implement `Pessimize`, you can use `assume_read(&x)`, at the cost of
/// forcing any data from x which is currently cached in registers to be
/// spilled into memory.
///
/// The `assume_read` implementation of `*const T` and `*mut T` may not work
/// as expected if an `&mut T` reference to the same data exists somewhere,
/// because dereferencing the pointer in that situation would be undefined
/// behavior, which by definition does not exist in the eye of the compiler.
///
/// For pointer types, this operation may sometimes be pessimized into a
/// full `assume_accessed()` optimization barrier, as a result of rustc not
/// leveraging the underlying `readonly` optimization hint. It is hoped that
/// future versions of rustc will take stronger notice of that hint.
///
#[inline]
pub fn assume_read<T: Pessimize>(x: &T) {
    Pessimize::assume_read(x)
}

/// Like `assume_read`, but by value
///
/// This is a more ergonomic alternative to `assume_read` in the common case
/// where the input is `Copy` or will not be needed anymore.
///
#[inline]
pub fn consume<T: Pessimize>(x: T) {
    assume_read(&x);
}

/// Assume that all global and thread-local variables have been read
///
/// Implemented as an empty `asm!` block with the `readonly` option: the
/// compiler must assume the (empty) assembly may read any memory, without
/// writing any, while `preserves_flags` and `nostack` keep the barrier's
/// codegen overhead minimal.
#[inline]
pub fn assume_globals_read() {
    // SAFETY: an empty asm template with no operands executes no instructions
    unsafe { core::arch::asm!("", options(preserves_flags, nostack, readonly)) }
}

/// Force the compiler to assume that any data transitively reachable via a
/// pointer/reference has been read, and modified if Rust rules allow for it.
///
/// This operation only makes sense on pointers/references, or values that
/// contain them. On others, it is equivalent to `assume_read()`. That's because
/// we assume that the pointer's target has changed, not the pointer itself
/// (except for fat pointer types where pointer metadata is target-dependent).
///
/// At the optimizer level, `assume_accessed()` will cause all target data which
/// is currently cached in registers to be spilled to memory and invalidated.
///
/// The compiler is allowed to assume that data which is only reachable via
/// an &-reference and does not have interior mutability semantics cannot be
/// modified, so you should not expect this pattern to work:
///
/// ```
/// # use pessimize::assume_accessed;
/// let x = 42;
/// let mut r = &x;
/// assume_accessed(&mut r);
/// // Compiler may still infer that x and *r are both 42 here
/// ```
///
/// Instead, if you have a shared reference to something and need the
/// compiler to assume that it is a shared reference to something completely
/// different, use `hide` to obscure the shared reference's target.
///
/// ```
/// # use pessimize::hide;
/// let x = 42;
/// let mut r = &x;
/// r = hide(r);
/// // Compiler still knows that x is 42 but cannot infer that *r is 42 here
/// ```
///
/// Similar considerations apply to the use of `assume_accessed` on a `*const T`
/// or `*mut T` in the presence of an `&T` or `&mut T` to the same target, where
/// the compiler may or may not manage to infer that these pointers cannot be
/// used to modify or read their targets where that would be undefined behavior.
///
#[inline]
pub fn assume_accessed<R: Pessimize>(r: &mut R) {
    Pessimize::assume_accessed(r)
}

/// Variant of `assume_accessed` for internally mutable types
///
/// You should only use this variant on pointers/references to internally
/// mutable types (Cell, RefCell, Mutex, AtomicXyz...), or values that contain
/// them. Otherwise you will instantly fall victim of the "shared reference
/// mutation is UB" edge case mentioned in the docs of `assume_accessed()`.
///
/// For example, calling `assume_accessed_imut()` on a slice pointer will not
/// assume that the slice length has changed, since slice length does not have
/// internal mutability semantics.
///
#[inline]
pub fn assume_accessed_imut<R: Pessimize>(r: &R) {
    Pessimize::assume_accessed_imut(r)
}

/// Assume that all global and thread-local variables have been read and modified
///
/// Same empty `asm!` trick as `assume_globals_read()`, but without the
/// `readonly` option, so the compiler must also assume that any memory may
/// have been written by the (empty) assembly.
#[inline]
pub fn assume_globals_accessed() {
    // SAFETY: an empty asm template with no operands executes no instructions
    unsafe { core::arch::asm!("", options(preserves_flags, nostack)) }
}

/// Convert `Self` back and forth to a `Pessimize` impl (`Pessimize` impl helper)
///
/// While only a small number of `Pessimize` types are supported by inline
/// assembly, many standard types can be losslessly converted to a lower-level
/// type (or tuple of types) that implement Pessimize and back in such a way
/// that the runtime costs should be optimized out.
///
/// This trait exposes that capability under a common abstraction vocabulary.
/// Combined with the related `BorrowPessimize` trait, it enables implementation
/// of `Pessimize` with increased safety and reduced boilerplate.
///
/// # Safety
///
/// By implementing this trait, you guarantee that someone using it as
/// documented will not trigger Undefined Behavior, since the safe `Pessimize`
/// trait can be automatically implemented on top of it
///
pub unsafe trait PessimizeCast {
    /// Pessimize type that can be converted to and from a Self value
    type Pessimized: Pessimize;

    /// Convert Self to Pessimized
    fn into_pessimize(self) -> Self::Pessimized;

    /// Convert back from Pessimized to Self
    ///
    /// # Safety
    ///
    /// A correct implementation of this operation only needs to be safe for the
    /// intended purpose of converting `Self` to `Pessimized` using
    /// `into_pessimize()`, invoking `Pessimize` trait operations on the
    /// resulting value, and optionally converting the `Pessimized` value back
    /// to `Self` afterwards via `from_pessimize()`.
    ///
    /// The final `from_pessimize()` operation of this round trip must be
    /// performed in the same scope where the initial `into_pessimize()`
    /// operation was called, or a child scope thereof.
    ///
    /// Even if `Pessimized` is `Clone`, it is strongly advised to treat the
    /// `Pessimized` value from `into_pessimize()` as a `!Clone` value: don't
    /// clone or copy it, and stop using it after converting it back to `Self`.
    /// Otherwise, _surprising_ (but safe) behavior may occur.
    ///
    /// **No other usage of `from_pessimize()` is safe.** To give a few examples
    /// of incorrect usage of `from_pessimize()`...
    ///
    /// - `Self` may contain references to the surrounding stack frame, so even
    ///   if `Pessimized` is `'static`, letting a `Pessimized` escape the scope
    ///   in which `into_pessimize()` was called before converting it back to
    ///   `Self` is unsafe.
    /// - `Self` may contain `!Clone` data like &mut references, so even if
    ///   `Pessimized` is `Clone`, converting two clones of a single
    ///   `Pessimized` value back into `Self` is unsafe. In fact, even using the
    ///   `Pessimize` implementation after converting one of the clones to
    ///   `Self` is not guaranteed to produce the desired optimization barrier.
    /// - `Self` may be `!Send`, so even if `Pessimized` is `Send`, sending a
    ///   `Pessimized` value to another thread before calling `from_pessimize()`
    ///   on that separate thread is unsafe.
    /// - Even if two types share the same `Pessimized` representation, abusing
    ///   the `PessimizeCast` trait to perform a cast operation, like casting a
    ///   reference to another reference with different mutability or lifetime,
    ///   is unsafe.
    ///
    unsafe fn from_pessimize(x: Self::Pessimized) -> Self;
}

/// Extract references to `Pessimize` values from references to `Self` (`Pessimize` impl helper)
pub trait BorrowPessimize: PessimizeCast {
    /// Pessimize type to which a reference can be extracted from &self
    type BorrowedPessimize: Pessimize;

    /// Extract an `&Self::BorrowedPessimize` from `&self` and apply the
    /// provided operation to it.
    ///
    /// In the common case where `Self: Copy`, you can implement this by calling
    /// `pessimize::impl_with_pessimize_via_copy`.
    ///
    fn with_pessimize(&self, f: impl FnOnce(&Self::BorrowedPessimize));

    /// Extract an `&mut Pessimized` from `&mut self`, call `assume_accessed`
    /// on it, and propagate any "changes" back to the original `&mut self`.
    ///
    /// In the common case where there is a cheap way to go from an `&mut Self`
    /// to a `Self` or `Self::Pessimized`, you can implement this by calling
    /// one of the `pessimize::impl_assume_accessed_via_xyz` functions.
    ///
    fn assume_accessed_impl(&mut self);
}

/// Implementation of `BorrowPessimize::with_pessimize` for `Copy` types
///
/// Copies `*self_` out, converts the copy to its `Pessimized` form, and runs
/// the provided callback on a borrow of the result.
// TODO: Use specializable BorrowPessimize impl once available on stable
#[inline]
pub fn impl_with_pessimize_via_copy<T: Copy + PessimizeCast>(
    self_: &T,
    f: impl FnOnce(&T::Pessimized),
) {
    f(&(*self_).into_pessimize())
}

/// Implementation of `BorrowPessimize::assume_accessed_impl` for types where
/// there is a way to get a `T::Pessimized` from an `&mut T`
///
/// The `Drop` impl of the value left in `self_` by `extract_pessimized` will
/// not be called, make sure that this does not result in a resource leak.
///
#[inline]
pub fn impl_assume_accessed_via_extract_pessimized<T: PessimizeCast>(
    self_: &mut T,
    extract_pessimized: impl FnOnce(&mut T) -> T::Pessimized,
) {
    let mut inner = extract_pessimized(self_);
    assume_accessed(&mut inner);
    // SAFETY: this is a `Self` -> `Pessimized` -> `Self` round trip that stays
    //         within the calling scope, with only an `assume_accessed` barrier
    //         in between, which the `from_pessimize()` contract allows. The
    //         overwrite deliberately skips dropping whatever value
    //         `extract_pessimized` left behind (see leak warning above).
    unsafe { core::ptr::write(self_, T::from_pessimize(inner)) };
}

/// Implementation of `BorrowPessimize::assume_accessed_impl` for types where
/// there is a cheap way to extract the inner `T` from an `&mut T`
///
/// For `Copy` types, this is a dereference, and for `Default` types where the
/// default value is truly trivial and guaranteed to be optimized out (like
/// `Vec`), this is `core::mem::take`.
///
/// The `Drop` impl of the value left in `self_` by `extract_self` will not be
/// called, make sure that this does not result in a resource leak.
///
#[inline]
pub fn impl_assume_accessed_via_extract_self<T: PessimizeCast>(
    self_: &mut T,
    extract_self: impl FnOnce(&mut T) -> T,
) {
    // Compose `extract_self` with the Self -> Pessimized conversion, then
    // delegate to the Pessimized-extraction flavor of this helper
    impl_assume_accessed_via_extract_pessimized(self_, |target: &mut T| {
        extract_self(target).into_pessimize()
    });
}

// Given a BorrowPessimize impl, we can automatically implement Pessimize
unsafe impl<T: BorrowPessimize> Pessimize for T {
    #[inline]
    fn hide(self) -> Self {
        // Safe because `from_pessimize` is from the same scope and `hide` is
        // allowed before `from_pessimize`.
        unsafe { Self::from_pessimize(hide(self.into_pessimize())) }
    }

    #[inline]
    fn assume_read(&self) {
        // Borrow a Pessimize view of self and apply the read barrier to it
        Self::with_pessimize(self, assume_read)
    }

    #[inline]
    fn assume_accessed(&mut self) {
        // Delegate to the user-provided strategy, which must propagate
        // "changes" back into `self`
        Self::assume_accessed_impl(self)
    }

    #[inline]
    fn assume_accessed_imut(&self) {
        // Borrow a Pessimize view of self and apply the imut-access barrier
        Self::with_pessimize(self, assume_accessed_imut)
    }
}

/// Default implementation of Pessimize when no better one is available
///
/// Uses the implementation of `Pessimize` for references, which will cause any
/// state currently cached in CPU registers to be spilled to memory and reloaded
/// if used again.
///
#[cfg(feature = "default_impl")]
mod default_impl {
    use super::*;

    // Can't use PessimizeCast/BorrowPessimize here because need to use
    // `assume_accessed` in `hide` impl.
    #[doc(cfg(all(feature = "nightly", feature = "default_impl")))]
    unsafe impl<T> Pessimize for T {
        // `default fn` marks each method as specializable (nightly
        // `specialization` feature): concrete Pessimize impls elsewhere in
        // the crate override this blanket fallback.
        #[inline]
        default fn hide(mut self) -> Self {
            // Route through the `&mut T` Pessimize impl: from the optimizer's
            // perspective, `self` may have been arbitrarily rewritten through
            // the reference, so the returned value looks unrelated.
            let mut r: &mut Self = &mut self;
            assume_accessed::<&mut T>(&mut r);
            self
        }

        #[inline]
        default fn assume_read(&self) {
            // Consume the reference by value: spills reachable data to memory
            consume::<&T>(self)
        }

        #[inline]
        default fn assume_accessed(mut self: &mut Self) {
            // Apply the barrier to the `&mut T` itself, which invalidates
            // register-cached copies of the pointee
            assume_accessed::<&mut T>(&mut self);
        }

        #[inline]
        default fn assume_accessed_imut(&self) {
            // Same as above, but through a shared reference (only interior
            // mutability is assumed to have been exercised)
            assume_accessed_imut::<&T>(&self)
        }
    }
}

/// Implementation of Pessimize for asm inputs/outputs without pointer semantics
///
/// To be used by arch-specific modules to implement Pessimize for primitive and
/// arch-specific SIMD types.
///
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_asm_values {
    (
        // doc(cfg(...)) attribute to apply to each generated impl on nightly
        $doc_cfg:meta
        {
            $(
                // Register class name, mapped to the value types passed in it
                $reg:ident : ( $($value_type:ty),* )
            ),*
        }
    ) => {
        $($(
            // This is one of the primitive Pessimize impls on which the
            // PessimizeCast/BorrowPessimize stack is built
            #[allow(asm_sub_register)]
            #[cfg_attr(feature = "nightly", $doc_cfg)]
            unsafe impl $crate::Pessimize for $value_type {
                #[inline]
                fn hide(mut self) -> Self {
                    // Empty asm template with an inout operand: the value goes
                    // in and comes back out unchanged, but the compiler must
                    // assume the asm could have produced any value. `nomem`
                    // asserts no memory is touched, keeping the barrier cheap.
                    unsafe {
                        core::arch::asm!("/* {0} */", inout($reg) self, options(preserves_flags, nostack, nomem));
                    }
                    self
                }

                #[inline]
                fn assume_read(&self) {
                    // Input-only operand: the compiler must materialize the
                    // value in a register, as if it were actually used
                    unsafe {
                        core::arch::asm!("/* {0} */", in($reg) *self, options(preserves_flags, nostack, nomem))
                    }
                }

                #[inline]
                fn assume_accessed(&mut self) {
                    // No pointer semantics: "accessed" degenerates to "read"
                    Self::assume_read(self)
                }

                #[inline]
                fn assume_accessed_imut(&self) {
                    // No interior mutability: likewise degenerates to "read"
                    Self::assume_read(self)
                }
            }
        )*)*
    };
}

/// Implementation of PessimizeCast for types that can be converted to and from
/// a Pessimized impl at low cost
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_cast {
    (
        // doc(cfg(...)) attribute to apply to each generated impl on nightly
        $doc_cfg:meta
        {
            $(
                // Pessimized inner type, mapped to the outer types that
                // convert to/from it via the provided expression pair
                $inner:ty : (
                    $(
                        $(
                            // Optional generic parameter with optional trait
                            // bounds, spelled `|T: (Trait1, Trait2)|`
                            | $param:ident $( : ( $trait1:path $(, $traitN:path)* ) )? |
                        )?
                        $outer:ty : ($into:expr, $from:expr)
                    ),*
                )
            ),*
        }
    ) => {
        $($(
            #[cfg_attr(feature = "nightly", $doc_cfg)]
            unsafe impl $(< $param $( : $trait1 $( + $traitN )* )? >)? $crate::PessimizeCast for $outer {
                type Pessimized = $inner;

                // $into/$from may be closures, hence the lint allowance
                #[allow(clippy::redundant_closure_call)]
                #[inline]
                fn into_pessimize(self) -> $inner {
                    $into(self)
                }

                #[allow(clippy::redundant_closure_call)]
                #[inline]
                unsafe fn from_pessimize(inner: $inner) -> Self {
                    $from(inner)
                }
            }
        )*)*
    };
}

/// Implementation of Pessimize for types from which a Pessimize impl can be
/// extracted given nothing but an &self, having round trip conversion functions
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_extractible {
    (
        // doc(cfg(...)) attribute to apply to each generated impl on nightly
        $doc_cfg:meta
        {
            $(
                // Pessimized inner type, mapped to outer types with a
                // ($into, $from, $extract) triplet: round-trip conversions
                // plus an &Self -> $inner extraction
                $inner:ty : (
                    $(
                        $(
                            // Optional generic parameter with optional bounds
                            | $param:ident $( : ( $trait1:path $(, $traitN:path)* ) )? |
                        )?
                        $outer:ty : ($into:expr, $from:expr, $extract:expr)
                    ),*
                )
            ),*
        }
    ) => {
        // First generate the PessimizeCast impls from the conversion pair...
        $crate::pessimize_cast!(
            $doc_cfg
            {
                $(
                    $inner : (
                        $(
                            $(
                                | $param $( : ( $trait1 $(, $traitN)* ) )? |
                            )?
                            $outer : ($into, $from)
                        ),*
                    )
                ),*
            }
        );
        // ...then build BorrowPessimize on top of the extraction expression
        $($(
            #[cfg_attr(feature = "nightly", $doc_cfg)]
            impl $(< $param $( : $trait1 $( + $traitN )* )? >)? $crate::BorrowPessimize for $outer {
                type BorrowedPessimize = $inner;

                // $extract may be a closure, hence the lint allowance
                #[allow(clippy::redundant_closure_call)]
                #[inline]
                fn with_pessimize(&self, f: impl FnOnce(&$inner)) {
                    f(&$extract(self))
                }

                #[allow(clippy::redundant_closure_call)]
                #[inline]
                fn assume_accessed_impl(&mut self) {
                    $crate::impl_assume_accessed_via_extract_pessimized(self, |self_: &mut Self| $extract(self_))
                }
            }
        )*)*
    };
}

/// Implementation of Pessimize for Copy types with round trip conversion functions
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_copy {
    (
        // doc(cfg(...)) attribute to apply to each generated impl on nightly
        $doc_cfg:meta
        {
            $(
                // Pessimized inner type, mapped to Copy outer types with
                // their ($into, $from) round-trip conversion pair
                $inner:ty : (
                    $(
                        $(
                            // Optional generic parameter with optional bounds
                            | $param:ident $( : $traits:tt )? |
                        )?
                        $outer:ty : ($into:expr, $from:expr)
                    ),*
                )
            ),*
        }
    ) => {
        // For Copy types, &Self -> $inner extraction is just a dereference
        // followed by the normal Self -> Pessimized conversion
        $crate::pessimize_extractible!(
            $doc_cfg
            {
                $(
                    $inner : (
                        $(
                            $(
                                | $param $( : $traits )? |
                            )?
                            $outer : ($into, $from, |self_: &Self| $into(*self_))
                        ),*
                    )
                ),*
            }
        );
    };
}

/// Like pessimize_copy, but using a standard Into/From pair for PessimizeCast
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_into_from {
    (
        // cfg_attr payload used for nightly docs.rs feature annotations
        $docs:meta
        {
            $(
                // Inner type with an existing Pessimize implementation
                $pessimized:ty : (
                    $(
                        // Optional generic parameter with an optional bound
                        $( | $generic:ident $( : $bounds:tt )? | )?
                        // Wrapper type convertible to/from the inner type
                        $wrapper:ty
                    ),*
                )
            ),*
        }
    ) => {
        // Delegate to pessimize_copy, filling the conversion slots with the
        // wrapper's standard Into/From implementations.
        $crate::pessimize_copy!(
            $docs
            {
                $(
                    $pessimized : (
                        $(
                            $( | $generic $( : $bounds )? | )?
                            $wrapper : (Self::into, Self::from)
                        ),*
                    )
                ),*
            }
        );
    };
}

/// Implementation of Pessimize for any stateless zero-sized type
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_zsts {
    (
        // cfg_attr payload used for nightly docs.rs feature annotations
        $docs:meta
        {
            $(
                // Optional generic parameter with an optional bound
                $( | $generic:ident $( : $bounds:tt )? | )?
                // Zero-sized type and an expression that constructs it
                $zst:ty : $ctor:expr
            ),*
        }
    ) => {
        // A stateless ZST carries no data, so it round-trips losslessly
        // through the unit type: discard it on the way in and rebuild it with
        // the provided constructor expression on the way out.
        $crate::pessimize_extractible!(
            $docs
            {
                () : (
                    $(
                        $( | $generic $( : $bounds )? | )?
                        $zst : (
                            |_self| (),
                            |()| $ctor,
                            |_self| ()
                        )
                    ),*
                )
            }
        );
    };
}

/// Implementation of Pessimize for tuple structs
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_tuple_structs {
    (
        // cfg_attr payload used for nightly docs.rs feature annotations
        $docs:meta
        {
            $(
                // Optional generic parameter with an optional bound
                $( | $generic:ident $( : $bounds:tt )? | )?
                // Tuple struct with a named binding for each field
                $struct_ty:ty {
                    $( $field:ident : $field_ty:ty ),*
                }
            ),*
        }
    ) => {
        // Pessimize the struct through the tuple of its field types, using
        // destructuring closures to convert in both directions. The trailing
        // comma in ( $($field_ty,)* ) keeps a one-field struct mapped to a
        // genuine one-element tuple instead of a parenthesized type.
        $crate::pessimize_copy!(
            $docs
            {
                $(
                    ( $($field_ty,)* ) : (
                        $( | $generic $( : $bounds )? | )?
                        $struct_ty : (
                            |Self( $($field),* )| ( $($field,)* ),
                            |( $($field,)* )| Self( $($field),* )
                        )
                    )
                ),*
            }
        );
    };
}

/// Implementation of Pessimize for T(pub U) style newtypes
///
/// Emits a PessimizeCast impl (via pessimize_cast!) that converts by
/// destructuring/re-wrapping the single field, plus a BorrowPessimize impl
/// that forwards borrows directly to that field.
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_newtypes {
    (
        // cfg_attr payload used for nightly docs.rs feature annotations
        $doc_cfg:meta
        {
            $(
                $(
                    // Optional generic parameter with optional (Trait1, TraitN...) bounds
                    | $param:ident $( : ( $trait1:path $(, $traitN:path)* ) )? |
                )?
                // Newtype wrapper and its single (pessimizable) inner type
                $outer:ty { $inner:ty }
            ),*
        }
    ) => {
        // PessimizeCast: convert by unwrapping/re-wrapping the tuple field
        $crate::pessimize_cast!(
            $doc_cfg
            {
                $(
                    $inner : (
                        $(
                            | $param $( : ( $trait1 $(, $traitN)* ) )? |
                        )?
                        $outer : (
                            |Self(inner)| inner,
                            |inner| Self(inner)
                        )
                    )
                ),*
            }
        );
        //
        $(
            #[cfg_attr(feature = "nightly", $doc_cfg)]
            impl $(< $param $( : $trait1 $( + $traitN )* )? >)? $crate::BorrowPessimize for $outer {
                type BorrowedPessimize = $inner;

                // Borrowing the newtype is just borrowing its only field
                #[inline]
                fn with_pessimize(&self, f: impl FnOnce(&$inner)) {
                    f(&self.0)
                }

                // An &mut to the newtype yields an &mut to the field, so the
                // inner type's assume_accessed barrier can be reused directly
                #[inline]
                fn assume_accessed_impl(&mut self) {
                    $crate::assume_accessed(&mut self.0)
                }
            }
        )*
    };
}

/// Pessimize a type that behaves like core::iter::Once
///
/// The inner value can only be reached through an &mut Self (extraction
/// mutates the iterator-like state), which dictates the pointer-based
/// BorrowPessimize strategy below.
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_once_like {
    (
        // cfg_attr payload used for nightly docs.rs feature annotations
        $doc_cfg:meta
        {
            $(
                $(
                    // Optional generic parameter with optional (Trait1, TraitN...) bounds
                    | $param:ident $( : ( $trait1:path $(, $traitN:path)* ) )? |
                )?
                $outer:ty : (
                    // Inner pessimizable value type
                    $inner:ty,
                    // Expression extracting the inner value from an &mut Outer
                    $extract:expr,
                    // Expression rebuilding an Outer from an inner value
                    $make:expr
                )
            ),*
        }
    ) => {
        $crate::pessimize_cast!(
            $doc_cfg
            {
                $(
                    $inner : (
                        $(
                            | $param $( : ( $trait1 $(, $traitN)* ) )? |
                        )?
                        // Take the outer value by value, then extract via &mut
                        $outer : (|mut self_| $extract(&mut self_), $make)
                    )
                ),*
            }
        );
        //
        $(
            #[cfg_attr(feature = "nightly", $doc_cfg)]
            impl $(< $param $( : $trait1 $( + $traitN )* )? >)? $crate::BorrowPessimize for $outer {
                type BorrowedPessimize = *const Self;

                #[inline]
                fn with_pessimize(&self, f: impl FnOnce(&Self::BorrowedPessimize)) {
                    // Need at least an &mut Self to access the inner value, so
                    // must use by-reference optimization barrier with &Self
                    f(&(self as *const Self))
                }

                #[allow(clippy::redundant_closure_call)]
                #[inline]
                fn assume_accessed_impl(&mut self) {
                    // Round-trip the inner value through an optimization
                    // barrier, then rebuild the outer value around it
                    let mut value = $extract(self);
                    $crate::assume_accessed(&mut value);
                    *self = $make(value)
                }
            }
        )*
    };
}

/// Pessimize a type that behaves like a collection (cheap Default impl, owned
/// state differs from borrowed state, backing heap allocation)
///
/// Because a collection owns a heap allocation, creating one or mutating one
/// through &mut must be modeled as touching global state (the allocator),
/// hence the assume_globals_accessed() calls below.
#[doc(hidden)]
#[macro_export]
macro_rules! pessimize_collections {
    (
        // cfg_attr payload used for nightly docs.rs feature annotations
        $doc_cfg:meta
        {
            $(
                // Pessimizable representations of owned and borrowed state
                ($owned_inner:ty, $borrowed_inner:ty) : (
                    $(
                        $(
                            // Optional generic parameter with optional (Trait1, TraitN...) bounds
                            | $param:ident $( : ( $trait1:path $(, $traitN:path)* ) )? |
                        )?
                        // Collection type and its conversion/extraction expressions
                        $outer:ty : ($into_owned:expr, $from_owned:expr, $extract_borrowed:expr)
                    ),*
                )
            ),*
        }
    ) => {
        $crate::pessimize_cast!(
            $doc_cfg
            {
                $(
                    $owned_inner : (
                        $(
                            $(
                                | $param $( : ( $trait1 $(, $traitN)* ) )? |
                            )?
                            $outer : (
                                $into_owned,
                                |owned| {
                                    // To simulate the creation of an unrelated
                                    // collection, we must simulate access to
                                    // the global memory allocator.
                                    $crate::assume_globals_accessed();
                                    $from_owned(owned)
                                }
                            )
                        ),*
                    )
                ),*
            }
        );
        //
        $($(
            #[cfg_attr(feature = "nightly", $doc_cfg)]
            impl $(< $param $( : $trait1 $( + $traitN )* )? >)? $crate::BorrowPessimize for $outer {
                type BorrowedPessimize = $borrowed_inner;

                #[allow(clippy::redundant_closure_call)]
                #[inline]
                fn with_pessimize(&self, f: impl FnOnce(&$borrowed_inner)) {
                    // The borrowed-state representation can be extracted from
                    // &self, no ownership transfer needed
                    f(&$extract_borrowed(self))
                }

                #[allow(clippy::redundant_closure_call)]
                #[inline]
                fn assume_accessed_impl(&mut self) {
                    // With an &mut to a collection, one can trigger a
                    // reallocation, and thus affect global state.
                    $crate::assume_globals_accessed();
                    // mem::take relies on the cheap Default impl to move the
                    // owned state out so it can go through the barrier
                    $crate::impl_assume_accessed_via_extract_pessimized(self, |self_: &mut Self| $into_owned(core::mem::take(self_)))
                }
            }
        )*)*
    };
}

// TODO: Provide a Derive macro to derive Pessimize for a small struct, with a
//       warning that it will do more harm than good on a larger struct

/// Global variable used to check if stateful zero-sized types are pessimized
///
/// Read via `unsafe` by the stateful-ZST tests below; the initial value is an
/// arbitrary sentinel.
// NOTE(review): `static mut` triggers the `static_mut_refs` lint on newer
// toolchains when referenced; consider migrating to `AtomicIsize` — TODO
// confirm no test relies on non-atomic access to this variable.
#[doc(hidden)]
pub static mut TEST_GLOBAL_STATE: isize = -42;

#[cfg(test)]
pub(crate) mod tests {
    use super::*;
    use crate::ptr::tests::{test_all_pinned_pointers, test_unpinned_pointers};
    #[cfg(feature = "nightly")]
    use std::simd::{Simd, SimdElement};
    use std::{
        fmt::Debug,
        time::{Duration, Instant},
    };

    // === Tests asserting that the barriers don't modify anything ===
    // ===    (should be run on both debug and release builds)     ===

    // Test that, for a given value, Pessimize seems to work: every barrier
    // must hand back a value equal to the one that went in
    pub fn test_pinned_value<T: Clone + Debug + PartialEq + Pessimize>(x: T) {
        let old_x = x.clone();
        assume_read(&x);
        assert_eq!(x, old_x);
        assert_eq!(hide(x.clone()), old_x);
        test_all_pinned_pointers::<T, _>(x.clone());
        consume(x);
    }

    // Same as above, but also assume the value is Unpin, which enables the
    // extra unpinned-pointer checks
    pub fn test_value<T: Clone + Debug + PartialEq + Pessimize + Unpin>(x: T) {
        test_unpinned_pointers(x.clone());
        test_pinned_value(x);
    }

    // Run test_value on the minimal, default and maximal value of a type
    pub fn test_value_type<T: Clone + Debug + Default + PartialEq + Pessimize + Unpin>(
        min: T,
        max: T,
    ) {
        test_value(min);
        test_value(T::default());
        test_value(max);
    }

    // Run test_value_type for an encapsulated SIMD type, broadcasting the
    // given min/max scalars across all lanes
    #[allow(unused)]
    pub fn test_simd<
        Scalar: Copy + Default,
        const LANES: usize,
        T: Copy + Debug + Default + From<[Scalar; LANES]> + PartialEq + Pessimize + Unpin,
    >(
        min: Scalar,
        max: Scalar,
    ) {
        test_value_type(T::from([min; LANES]), T::from([max; LANES]));
    }

    // Run test_value_type for a portable_simd::Simd type
    #[allow(unused)]
    #[cfg(feature = "nightly")]
    pub fn test_portable_simd<
        Scalar: Debug + Default + PartialEq + SimdElement + Unpin,
        const LANES: usize,
    >(
        min: Scalar,
        max: Scalar,
    ) where
        Simd<Scalar, LANES>: Pessimize,
    {
        test_simd::<Scalar, LANES, Simd<Scalar, LANES>>(min, max)
    }

    // === Tests asserting that the barriers prevent optimization ===
    // ===         (should only be run on release builds)         ===

    // --- Basic harness ---

    /// Maximum degree of instruction-level parallelism for any instruction
    const MAX_SUPERSCALAR_WAYS: u64 = 4;

    /// Maximum realistic number of empty loop iterations per second
    ///
    /// At the time of writing, CPU boost clocks commonly go above 5 GHz, and
    /// 8-9 GHz was observed in overclocking records 10 years ago. It seems
    /// unlikely that these records will be beaten anytime soon, so 10 GHz is
    /// a good clock frequency upper bound.
    ///
    /// While most processors can only process 1 conditional jump per cycle, it
    /// is possible to go all the way up to the maximal ILP integer increment
    /// rate through loop unrolling.
    ///
    const MAX_LOOP_FREQ: u64 = MAX_SUPERSCALAR_WAYS * 10_000_000_000;

    /// Maximum expected clock granularity
    ///
    /// The system clock is expected to always be able to measure this duration
    const MIN_DURATION: Duration = Duration::from_millis(2);

    /// Minimum number of loop iterations for which a loop duration greater than
    /// or equal to MIN_DURATION should be measured (the leading factor of 2 is
    /// a safety margin)
    const MIN_ITERATIONS: u64 = 2 * MAX_LOOP_FREQ * MIN_DURATION.as_nanos() as u64 / 1_000_000_000;

    // Measure the time it takes to run an operation MIN_ITERATIONS times,
    // passing the loop counter to each invocation
    fn time_loop(mut op: impl FnMut(u64)) -> Duration {
        let start = Instant::now();
        for iter in 0..MIN_ITERATIONS {
            op(iter);
        }
        start.elapsed()
    }

    // Measure time to run an empty loop (counter increment and nothing else),
    // check that it is within expectations.
    fn checked_empty_loop_duration() -> Duration {
        // Any architecture with 64-bit pointers can store 64-bit integers in
        // registers and thus has native u64: Pessimize. On other arches, we
        // consume a reference to the loop counter, which is always valid but
        // causes it to be spilled to memory.
        #[cfg(target_pointer_width = "64")]
        let elapsed = time_loop(consume);
        #[cfg(not(target_pointer_width = "64"))]
        let elapsed = time_loop(|iter| consume(&iter));
        // If this fires, the loop ran faster than MAX_LOOP_FREQ allows, i.e.
        // the counter increments were (partially) optimized out
        assert!(elapsed >= MIN_DURATION);
        elapsed
    }

    // Make sure that an operation was not optimized out
    //
    // An input value is initially provided. This input is passed down to the
    // operation, and whatever the operation emits will be the next input. The
    // output of the last operation is emitted as the output of the test. This
    // allows running tests with values that cannot or should not be cloned.
    //
    pub fn assert_unoptimized<T>(input: T, mut op: impl FnMut(T) -> T) -> T {
        // Run the operation in a loop, threading the value through an Option
        // so it can be moved in and out of the FnMut closure
        let mut opt = Some(input);
        let elapsed = time_loop(|_iter| {
            let mut input = opt.take().unwrap();
            // For each loop iteration, we perform more of the requested
            // operation than can be performed in a single CPU cycle. Therefore,
            // if the operation is not optimized out, each loop iteration should
            // take at least one more CPU cycle.
            for _ in 0..=MAX_SUPERSCALAR_WAYS {
                input = op(input);
            }
            opt = Some(input);
        });

        // Immediately check empty loop iteration speed to evaluate clock rate,
        // which can vary depending on the operation we're doing.
        // Since a loop iteration takes at most 1 CPU clock cycle on modern
        // CPUs, with >1 extra cycle, the loop should be at least 2x slower.
        let elapsed_empty = checked_empty_loop_duration();
        let ratio = elapsed.as_secs_f64() / elapsed_empty.as_secs_f64();
        // Threshold is slightly below the expected 2x to tolerate timing noise
        assert!(ratio > 1.9);

        // Next, deduce the actual rate at which operations are being executed
        // and compare that to the rate at which empty loop iterations execute.
        eprintln!(
            "Operation pessimized (running at {:.1}x empty iteration speed of {:.1} GHz)",
            (MAX_SUPERSCALAR_WAYS + 1) as f64 / ratio,
            MIN_ITERATIONS as f64 / elapsed_empty.as_nanos() as f64
        );

        // Finally, return the last output for the next benchmark test
        opt.take().unwrap()
    }

    // --- Tests for normal values with native Pessimize support ---

    // Check that hide() forces the equality comparison to be redone on every
    // iteration instead of being hoisted out of the loop
    pub fn test_unoptimized_value<T: Clone + PartialEq + Pessimize>(x: T) {
        let old_x = x.clone();
        assert_unoptimized(x, |mut x| {
            x = hide(x);
            consume(x == old_x);
            x
        });
    }
    //
    pub fn test_unoptimized_value_type<T: Clone + Default + PartialEq + Pessimize>() {
        test_unoptimized_value(T::default());
    }

    // --- Tests for stateful ZSTs (that mediate access to global state) ---

    pub fn test_unoptimized_stateful_zsv<T: Pessimize>(x: T) {
        // ZSTs don't have inner state to pessimize, but some of them
        // provide access to global state, so check that the barrier forces
        // the global to be re-read on every iteration.
        let old_state = unsafe { TEST_GLOBAL_STATE };
        assert_unoptimized(x, |mut x| {
            x = hide(x);
            unsafe { consume(TEST_GLOBAL_STATE == old_state) };
            x
        });
    }

    pub fn test_unoptimized_stateful_zst<T: Default + Pessimize>() {
        test_unoptimized_stateful_zsv(T::default())
    }

    // === Tests for types implemented here ===

    // --- "Big" array with no native Pessimize implementation ---

    // What is considered too big (in units of isize)
    // 2 is enough as we don't currently pessimize larger arrays
    pub const BIG: usize = 2;

    // Should be run on both debug and release builds
    #[test]
    fn non_native() {
        #[cfg(feature = "default_impl")]
        test_value_type::<[isize; BIG]>([isize::MIN; BIG], [isize::MAX; BIG]);
        #[cfg(not(feature = "default_impl"))]
        for inner in [isize::MIN, 0, isize::MAX] {
            crate::ptr::tests::test_all_pointers::<[isize; BIG], _>([inner; BIG]);
        }
    }

    // Should only be run on release builds (the anti-optimization timing
    // checks are meaningless without optimizations)
    #[test]
    #[ignore]
    fn non_native_optim() {
        #[cfg(feature = "default_impl")]
        test_unoptimized_value_type::<[isize; BIG]>();
        #[cfg(not(feature = "default_impl"))]
        crate::ptr::tests::test_unoptimized_ptrs::<[isize; BIG], _>([0isize; BIG]);
    }
}