codspeed_divan_compat_walltime/bench/
mod.rs

1use std::{
2    cell::UnsafeCell,
3    fmt,
4    mem::{self, MaybeUninit},
5    num::NonZeroUsize,
6    sync::Barrier,
7};
8
9use crate::{
10    alloc::{
11        AllocOp, AllocOpMap, AllocTally, ThreadAllocInfo, ThreadAllocTally, TotalAllocTallyMap,
12    },
13    black_box, black_box_drop,
14    counter::{
15        AnyCounter, AsCountUInt, BytesCount, CharsCount, Counter, CounterCollection, CyclesCount,
16        IntoCounter, ItemsCount, KnownCounterKind, MaxCountUInt,
17    },
18    divan::SharedContext,
19    stats::{RawSample, SampleCollection, Stats, StatsSet, TimeSample},
20    thread_pool::BENCH_POOL,
21    time::{FineDuration, Timestamp, UntaggedTimestamp},
22    util::{self, sync::SyncWrap, Unit},
23};
24
25#[cfg(test)]
26mod tests;
27
28mod args;
29mod defer;
30mod options;
31
32use ::codspeed::instrument_hooks::InstrumentHooks;
33use defer::{DeferSlot, DeferStore};
34
35pub use self::{
36    args::{BenchArgs, BenchArgsRunner},
37    options::BenchOptions,
38};
39
/// Number of samples collected per benchmark when the user does not override
/// `sample_count` in [`BenchOptions`].
pub(crate) const DEFAULT_SAMPLE_COUNT: u32 = 100;
41
/// Enables contextual benchmarking in [`#[divan::bench]`](attr.bench.html).
///
/// # Examples
///
/// ```
/// use divan::{Bencher, black_box};
///
/// #[divan::bench]
/// fn copy_from_slice(bencher: Bencher) {
///     // Input and output buffers get used in the closure.
///     let src = (0..100).collect::<Vec<i32>>();
///     let mut dst = vec![0; src.len()];
///
///     bencher.bench_local(|| {
///         black_box(&mut dst).copy_from_slice(black_box(&src));
///     });
/// }
/// ```
#[must_use = "a benchmark function must be registered"]
pub struct Bencher<'a, 'b, C = BencherConfig> {
    // Mutable handle to the shared benchmark-loop state owned by the runner.
    pub(crate) context: &'a mut BenchContext<'b>,
    // Statically-typed configuration; by default holds no input generator
    // (`BencherConfig<Unit>`), replaced via `with_inputs`.
    pub(crate) config: C,
}
65
/// Public-in-private type for statically-typed `Bencher` configuration.
///
/// This enables configuring `Bencher` using the builder pattern with zero
/// runtime cost.
pub struct BencherConfig<GenI = Unit> {
    // Closure called to produce each benchmark input; `Unit` when the
    // benchmark takes no generated inputs.
    gen_input: GenI,
}
73
74impl<C> fmt::Debug for Bencher<'_, '_, C> {
75    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
76        f.debug_struct("Bencher").finish_non_exhaustive()
77    }
78}
79
80impl<'a, 'b> Bencher<'a, 'b> {
81    #[inline]
82    pub(crate) fn new(context: &'a mut BenchContext<'b>) -> Self {
83        Self { context, config: BencherConfig { gen_input: Unit } }
84    }
85}
86
87impl<'a, 'b> Bencher<'a, 'b> {
88    /// Benchmarks a function.
89    ///
90    /// The function can be benchmarked in parallel using the [`threads`
91    /// option](macro@crate::bench#threads). If the function is strictly
92    /// single-threaded, use [`Bencher::bench_local`] instead.
93    ///
94    /// # Examples
95    ///
96    /// ```
97    /// #[divan::bench]
98    /// fn bench(bencher: divan::Bencher) {
99    ///     bencher.bench(|| {
100    ///         // Benchmarked code...
101    ///     });
102    /// }
103    /// ```
104    pub fn bench<O, B>(self, benched: B)
105    where
106        B: Fn() -> O + Sync,
107    {
108        // Reusing `bench_values` for a zero-sized non-drop input type should
109        // have no overhead.
110        self.with_inputs(|| ()).bench_values(|_: ()| benched());
111    }
112
113    /// Benchmarks a function on the current thread.
114    ///
115    /// # Examples
116    ///
117    /// ```
118    /// #[divan::bench]
119    /// fn bench(bencher: divan::Bencher) {
120    ///     bencher.bench_local(|| {
121    ///         // Benchmarked code...
122    ///     });
123    /// }
124    /// ```
125    pub fn bench_local<O, B>(self, mut benched: B)
126    where
127        B: FnMut() -> O,
128    {
129        // Reusing `bench_local_values` for a zero-sized non-drop input type
130        // should have no overhead.
131        self.with_inputs(|| ()).bench_local_values(|_: ()| benched());
132    }
133
134    /// Generate inputs for the [benchmarked function](#input-bench).
135    ///
136    /// Time spent generating inputs does not affect benchmark timing.
137    ///
138    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
139    /// generator is called on the same thread as the sample loop that uses that
140    /// input.
141    ///
142    /// # Examples
143    ///
144    /// ```
145    /// #[divan::bench]
146    /// fn bench(bencher: divan::Bencher) {
147    ///     bencher
148    ///         .with_inputs(|| {
149    ///             // Generate input:
150    ///             String::from("...")
151    ///         })
152    ///         .bench_values(|s| {
153    ///             // Use input by-value:
154    ///             s + "123"
155    ///         });
156    /// }
157    /// ```
158    pub fn with_inputs<G>(self, gen_input: G) -> Bencher<'a, 'b, BencherConfig<G>> {
159        Bencher { context: self.context, config: BencherConfig { gen_input } }
160    }
161}
162
163impl<'a, 'b, GenI> Bencher<'a, 'b, BencherConfig<GenI>> {
164    /// Assign a [`Counter`] for all iterations of the benchmarked function.
165    ///
166    /// This will either:
167    /// - Assign a new counter
168    /// - Override an existing counter of the same type
169    ///
170    /// If the counter depends on [generated inputs](Self::with_inputs), use
171    /// [`Bencher::input_counter`] instead.
172    ///
173    /// If context is not needed, the counter can instead be set via
174    /// [`#[divan::bench(counters = ...)]`](macro@crate::bench#counters).
175    ///
176    /// # Examples
177    ///
178    /// ```
179    /// use divan::{Bencher, counter::BytesCount};
180    ///
181    /// #[divan::bench]
182    /// fn char_count(bencher: Bencher) {
183    ///     let s: String = // ...
184    ///     # String::new();
185    ///
186    ///     bencher
187    ///         .counter(BytesCount::of_str(&s))
188    ///         .bench(|| {
189    ///             divan::black_box(&s).chars().count()
190    ///         });
191    /// }
192    /// ```
193    #[doc(alias = "throughput")]
194    pub fn counter<C>(self, counter: C) -> Self
195    where
196        C: IntoCounter,
197    {
198        let counter = AnyCounter::new(counter);
199        self.context.counters.set_counter(counter);
200        self
201    }
202}
203
/// <span id="input-bench"></span> Benchmark over [generated inputs](Self::with_inputs).
impl<'a, 'b, I, GenI> Bencher<'a, 'b, BencherConfig<GenI>>
where
    GenI: FnMut() -> I,
{
    /// Calls a closure to create a [`Counter`] for each input of the
    /// benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter is constant, use [`Bencher::counter`] instead.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// counter is called on the same thread as the sample loop that generates
    /// and uses that input.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of bytes processed when
    /// benchmarking [`char`-counting](std::str::Chars::count). The byte count
    /// is gotten by calling [`BytesCount::of_str`] on each iteration's input
    /// [`String`].
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> String {
    ///             // ...
    ///             # String::new()
    ///         })
    ///         .input_counter(BytesCount::of_str)
    ///         .bench_refs(|s| {
    ///             s.chars().count()
    ///         });
    /// }
    /// ```
    pub fn input_counter<C, F>(self, make_counter: F) -> Self
    where
        F: Fn(&I) -> C + Sync + 'static,
        C: IntoCounter,
    {
        self.context.counters.set_input_counter(make_counter);
        self
    }

    /// Creates a [`Counter`] from each input of the benchmarked function.
    ///
    /// This may be used if the input returns [`u8`]–[`u64`], [`usize`], or any
    /// nesting of references to those types.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of items processed when
    /// benchmarking [`FromIterator`] from
    /// <code>[Range](std::ops::Range)<[usize]></code> to [`Vec`].
    ///
    /// ```
    /// use divan::{Bencher, counter::ItemsCount};
    ///
    /// #[divan::bench]
    /// fn range_to_vec(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> usize {
    ///             // ...
    ///             # 0
    ///         })
    ///         .count_inputs_as::<ItemsCount>()
    ///         .bench_values(|n| -> Vec<usize> {
    ///             (0..n).collect()
    ///         });
    /// }
    /// ```
    #[inline]
    pub fn count_inputs_as<C>(self) -> Self
    where
        C: Counter,
        I: AsCountUInt,
    {
        // Dispatch on the statically-known counter kind; each arm registers an
        // input counter that wraps the input's count in the matching type.
        match KnownCounterKind::of::<C>() {
            KnownCounterKind::Bytes => self.input_counter(|c| BytesCount::from(c)),
            KnownCounterKind::Chars => self.input_counter(|c| CharsCount::from(c)),
            KnownCounterKind::Cycles => self.input_counter(|c| CyclesCount::from(c)),
            KnownCounterKind::Items => self.input_counter(|c| ItemsCount::from(c)),
        }
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local_values`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn bench_values<O, B>(self, benched: B)
    where
        B: Fn(I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`.
            // The moved-out value is consumed (and dropped) by `benched` or
            // its output, so the deferred drop hook is intentionally a no-op.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     let mut values = Vec::new();
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_values(|s| {
    ///             // Use input by-value:
    ///             values.push(s);
    ///         });
    /// }
    /// ```
    pub fn bench_local_values<O, B>(self, mut benched: B)
    where
        B: FnMut(I) -> O,
    {
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`.
            // As in `bench_values`, the drop hook is a deliberate no-op.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_refs<O, B>(self, benched: B)
    where
        B: Fn(&mut I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`, so the input
            // must be dropped here once all outputs are gone.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_local_refs<O, B>(self, mut benched: B)
    where
        B: FnMut(&mut I) -> O,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`, so the input
            // must be dropped here once all outputs are gone.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }
}
472
/// State machine for how the benchmark is being run.
#[derive(Clone, Copy)]
pub(crate) enum BenchMode {
    /// The benchmark is being run as `--test`.
    ///
    /// Don't collect samples and run exactly once.
    Test,

    /// Scale `sample_size` to determine the right size for collecting.
    Tune { sample_size: u32 },

    /// Simply collect samples.
    Collect { sample_size: u32 },
}

impl BenchMode {
    /// Whether this is a `--test` run (single uncollected execution).
    #[inline]
    pub fn is_test(self) -> bool {
        match self {
            Self::Test => true,
            Self::Tune { .. } | Self::Collect { .. } => false,
        }
    }

    /// Whether the sample size is still being tuned.
    #[inline]
    pub fn is_tune(self) -> bool {
        match self {
            Self::Tune { .. } => true,
            Self::Test | Self::Collect { .. } => false,
        }
    }

    /// Whether samples are being collected for reporting.
    #[inline]
    pub fn is_collect(self) -> bool {
        match self {
            Self::Collect { .. } => true,
            Self::Test | Self::Tune { .. } => false,
        }
    }

    /// Iterations per sample: 1 under `--test`, otherwise the configured size.
    #[inline]
    pub fn sample_size(self) -> u32 {
        if let Self::Tune { sample_size } | Self::Collect { sample_size } = self {
            sample_size
        } else {
            1
        }
    }
}
512
/// `#[divan::bench]` loop context.
///
/// Functions called within the benchmark loop should be `#[inline(always)]` to
/// ensure instruction cache locality.
pub(crate) struct BenchContext<'a> {
    // Runner-wide state (e.g. the timer) shared across all benchmarks.
    shared_context: &'a SharedContext,

    /// User-configured options.
    pub options: &'a BenchOptions<'a>,

    /// Whether the benchmark loop was started.
    pub did_run: bool,

    /// The number of threads to run the benchmark. The default is 1.
    ///
    /// When set to 1, the benchmark loop is guaranteed to stay on the current
    /// thread and not spawn any threads.
    pub thread_count: NonZeroUsize,

    /// Recorded samples.
    pub samples: SampleCollection,

    /// Per-iteration counters grouped by sample.
    counters: CounterCollection,
}
538
539impl<'a> BenchContext<'a> {
540    /// Creates a new benchmarking context.
541    pub fn new(
542        shared_context: &'a SharedContext,
543        options: &'a BenchOptions,
544        thread_count: NonZeroUsize,
545    ) -> Self {
546        Self {
547            shared_context,
548            options,
549            thread_count,
550            did_run: false,
551            samples: SampleCollection::default(),
552            counters: options.counters.to_collection(),
553        }
554    }
555
    /// Runs the single-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// See `bench_loop_threaded`.
    pub fn bench_loop_local<I, O>(
        &mut self,
        gen_input: impl FnMut() -> I,
        benched: impl FnMut(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) {
        // SAFETY: Closures are guaranteed to run on the current thread, so they
        // can safely be mutable and non-`Sync`.
        unsafe {
            // Wrap each closure so it satisfies the `Sync` bounds of
            // `bench_loop_threaded`; soundness relies on single-threaded use.
            let gen_input = SyncWrap::new(UnsafeCell::new(gen_input));
            let benched = SyncWrap::new(UnsafeCell::new(benched));
            let drop_input = SyncWrap::new(drop_input);

            // Force a single thread so the wrapped closures never leave the
            // current thread, upholding the safety contract above.
            self.thread_count = NonZeroUsize::MIN;
            self.bench_loop_threaded::<I, O>(
                || (*gen_input.get())(),
                |input| (*benched.get())(input),
                |input| drop_input(input),
            )
        }
    }
582
    /// Runs the multi-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// If `self.threads` is 1, the incoming closures will not escape the
    /// current thread. This guarantee ensures `bench_loop_local` can soundly
    /// reuse this method with mutable non-`Sync` closures.
    ///
    /// When `benched` is called:
    /// - `I` is guaranteed to be initialized.
    /// - No external `&I` or `&mut I` exists.
    ///
    /// When `drop_input` is called:
    /// - All instances of `O` returned from `benched` have been dropped.
    /// - The same guarantees for `I` apply as in `benched`, unless `benched`
    ///   escaped references to `I`.
    fn bench_loop_threaded<I, O>(
        &mut self,
        gen_input: impl Fn() -> I + Sync,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O + Sync,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>) + Sync,
    ) {
        self.did_run = true;

        let mut current_mode = self.initial_mode();
        let is_test = current_mode.is_test();

        let record_sample = self.sample_recorder(gen_input, benched, drop_input);

        let thread_count = self.thread_count.get();
        let aux_thread_count = thread_count - 1;

        let is_single_thread = aux_thread_count == 0;

        // Per-thread sample info returned by `record_sample`. These are
        // processed locally to emit user-facing sample info. As a result, this
        // only contains `thread_count` many elements at a time.
        let mut raw_samples = Vec::<Option<RawSample>>::new();

        // The time spent benchmarking, in picoseconds.
        //
        // Unless `skip_ext_time` is set, this includes time external to
        // `benched`, such as time spent generating inputs and running drop.
        let mut elapsed_picos: u128 = 0;

        // The minimum time for benchmarking, in picoseconds.
        let min_picos = self.options.min_time().picos;

        // The remaining time left for benchmarking, in picoseconds.
        let max_picos = self.options.max_time().picos;

        // Don't bother running if user specifies 0 max time or 0 samples.
        if max_picos == 0 || !self.options.has_samples() {
            return;
        }

        let timer = self.shared_context.timer;
        let timer_kind = timer.kind();

        // Sample budget only applies once collecting; tuning runs have none.
        let mut rem_samples = if current_mode.is_collect() {
            Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT))
        } else {
            None
        };

        // Only measure precision if we need to tune sample size.
        let timer_precision =
            if current_mode.is_tune() { timer.precision() } else { FineDuration::default() };

        if !is_test {
            self.samples.time_samples.reserve(self.options.sample_count.unwrap_or(1) as usize);
        }

        let skip_ext_time = self.options.skip_ext_time.unwrap_or_default();
        let initial_start = if skip_ext_time { None } else { Some(Timestamp::start(timer_kind)) };

        let bench_overheads = timer.bench_overheads();

        // CodSpeed integration: mark the start of the instrumented region.
        // The result is deliberately ignored (best-effort hook).
        let _ = InstrumentHooks::instance().start_benchmark();
        // `while` with a block condition: the block decides whether another
        // sampling round should run.
        while {
            // Conditions for when sampling is over:
            if elapsed_picos >= max_picos {
                // Depleted the benchmarking time budget. This is a strict
                // condition regardless of sample count and minimum time.
                false
            } else if rem_samples.unwrap_or(1) > 0 {
                // More samples expected.
                true
            } else {
                // Continue if we haven't reached the time floor.
                elapsed_picos < min_picos
            }
        } {
            let sample_size = current_mode.sample_size();
            self.samples.sample_size = sample_size;

            let barrier = if is_single_thread { None } else { Some(Barrier::new(thread_count)) };

            // Sample loop helper:
            let record_sample = || -> RawSample {
                let mut counter_totals: [u128; KnownCounterKind::COUNT] =
                    [0; KnownCounterKind::COUNT];

                // Updates per-input counter info for this sample.
                let mut count_input = |input: &I| {
                    for counter_kind in KnownCounterKind::ALL {
                        // SAFETY: The `I` type cannot change since `with_inputs`
                        // cannot be called more than once on the same `Bencher`.
                        if let Some(count) =
                            unsafe { self.counters.get_input_count(counter_kind, input) }
                        {
                            let total = &mut counter_totals[counter_kind as usize];
                            *total = (*total).saturating_add(count as u128);
                        }
                    }
                };

                // Sample loop:
                let ([start, end], alloc_info) =
                    record_sample(sample_size as usize, barrier.as_ref(), &mut count_input);

                RawSample { start, end, timer, alloc_info, counter_totals }
            };

            // Sample loop:
            raw_samples.clear();
            BENCH_POOL.par_extend(&mut raw_samples, aux_thread_count, |_| record_sample());

            // Convert `&[Option<RawSample>]` to `&[Sample]`.
            let raw_samples: &[RawSample] = {
                if let Some(thread) = raw_samples
                    .iter()
                    .enumerate()
                    .find_map(|(thread, sample)| sample.is_none().then_some(thread))
                {
                    panic!("Divan benchmarking thread {thread} panicked");
                }

                // Every element was just verified to be `Some`, and the size
                // assertion below guards the reinterpreting cast at runtime.
                unsafe {
                    assert_eq!(mem::size_of::<RawSample>(), mem::size_of::<Option<RawSample>>());
                    std::slice::from_raw_parts(raw_samples.as_ptr().cast(), raw_samples.len())
                }
            };

            // If testing, exit the benchmarking loop immediately after timing a
            // single run.
            if is_test {
                break;
            }

            let slowest_sample = raw_samples.iter().max_by_key(|s| s.duration()).unwrap();
            let slowest_time = slowest_sample.duration();

            // TODO: Make tuning be less influenced by early runs. Currently if
            // early runs are very quick but later runs are slow, benchmarking
            // will take a very long time.
            //
            // TODO: Make `sample_size` consider time generating inputs and
            // dropping inputs/outputs. Currently benchmarks like
            // `Bencher::bench_refs(String::clear)` take a very long time.
            if current_mode.is_tune() {
                // Clear previous smaller samples.
                self.samples.clear();
                self.counters.clear_input_counts();

                // If within 100x timer precision, continue tuning.
                // NOTE(review): assumes the measured timer precision is
                // nonzero — verify `Timer::precision` guarantees this.
                let precision_multiple = slowest_time.picos / timer_precision.picos;
                if precision_multiple <= 100 {
                    current_mode = BenchMode::Tune { sample_size: sample_size * 2 };
                } else {
                    current_mode = BenchMode::Collect { sample_size };
                    rem_samples = Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT));
                }
            }

            // Returns the sample's duration adjusted for overhead.
            let sample_duration_sub_overhead = |raw_sample: &RawSample| {
                let overhead = bench_overheads.total_overhead(sample_size, &raw_sample.alloc_info);

                FineDuration {
                    picos: raw_sample
                        .duration()
                        .clamp_to(timer_precision)
                        .picos
                        .saturating_sub(overhead.picos),
                }
                .clamp_to(timer_precision)
            };

            for raw_sample in raw_samples {
                let sample_index = self.samples.time_samples.len();

                self.samples
                    .time_samples
                    .push(TimeSample { duration: sample_duration_sub_overhead(raw_sample) });

                // Only record allocation info for samples that allocated.
                if !raw_sample.alloc_info.tallies.is_empty() {
                    self.samples
                        .alloc_info_by_sample
                        .insert(sample_index as u32, raw_sample.alloc_info.clone());
                }

                // Insert per-input counter information.
                for counter_kind in KnownCounterKind::ALL {
                    if !self.counters.uses_input_counts(counter_kind) {
                        continue;
                    }

                    let total_count = raw_sample.counter_totals[counter_kind as usize];

                    // Cannot overflow `MaxCountUInt` because `total_count`
                    // cannot exceed `MaxCountUInt::MAX * sample_size`.
                    let per_iter_count = (total_count / sample_size as u128) as MaxCountUInt;

                    self.counters.push_counter(AnyCounter::known(counter_kind, per_iter_count));
                }

                if let Some(rem_samples) = &mut rem_samples {
                    *rem_samples = rem_samples.saturating_sub(1);
                }
            }

            if let Some(initial_start) = initial_start {
                let last_end = raw_samples.iter().map(|s| s.end).max().unwrap();
                elapsed_picos = last_end.duration_since(initial_start, timer).picos;
            } else {
                // Progress by at least 1ns to prevent extremely fast
                // functions from taking forever when `min_time` is set.
                let progress_picos = slowest_time.picos.max(1_000);
                elapsed_picos = elapsed_picos.saturating_add(progress_picos);
            }
        }
        // CodSpeed integration: mark the end of the instrumented region
        // (best-effort, result ignored).
        let _ = InstrumentHooks::instance().stop_benchmark();

        // Reset flag for ignoring allocations.
        crate::alloc::IGNORE_ALLOC.set(false);
    }
820
    /// Returns a closure that takes the sample size and input counter, and then
    /// returns a newly recorded sample.
    ///
    /// The returned closure yields the sample's `[start, end]` timestamps along
    /// with the allocation info recorded on this thread during the timed
    /// section. The same span is also reported to CodSpeed via
    /// [`InstrumentHooks::add_benchmark_timestamps`].
    fn sample_recorder<I, O>(
        &self,
        gen_input: impl Fn() -> I,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) -> impl Fn(usize, Option<&Barrier>, &mut dyn FnMut(&I)) -> ([Timestamp; 2], ThreadAllocInfo)
    {
        // We defer:
        // - Usage of `gen_input` values.
        // - Drop destructor for `O`, preventing it from affecting sample
        //   measurements. Outputs are stored into a pre-allocated buffer during
        //   the sample loop. The allocation is reused between samples to reduce
        //   time spent between samples.

        let timer_kind = self.shared_context.timer.kind();

        move |sample_size: usize, barrier: Option<&Barrier>, count_input: &mut dyn FnMut(&I)| {
            // Pre-allocated storage for deferred inputs/outputs; unused on the
            // ZST fast path below.
            let mut defer_store = DeferStore::<I, O>::default();

            // Snapshot of this thread's allocation tallies, captured right
            // after the timed section so deferred drops don't pollute it.
            let mut saved_alloc_info = ThreadAllocInfo::new();
            let mut save_alloc_info = || {
                // Allocation tracking is suppressed; keep the empty snapshot.
                if crate::alloc::IGNORE_ALLOC.get() {
                    return;
                }

                if let Some(alloc_info) = ThreadAllocInfo::try_current() {
                    // SAFETY: We have exclusive access.
                    saved_alloc_info = unsafe { alloc_info.as_ptr().read() };
                }
            };

            // Synchronize all threads to start timed section simultaneously and
            // clear every thread's memory profiling info.
            //
            // This ensures work external to the timed section does not affect
            // the timing of other threads.
            let sync_threads = |is_start: bool| {
                sync_impl(barrier, is_start);

                // Monomorphize implementation to reduce code size.
                #[inline(never)]
                fn sync_impl(barrier: Option<&Barrier>, is_start: bool) {
                    // Ensure benchmarked section has a `ThreadAllocInfo`
                    // allocated for the current thread and clear previous info.
                    let alloc_info = if is_start { ThreadAllocInfo::current() } else { None };

                    // Synchronize all threads.
                    //
                    // This is the final synchronization point for the end.
                    if let Some(barrier) = barrier {
                        barrier.wait();
                    }

                    if let Some(mut alloc_info) = alloc_info {
                        // SAFETY: We have exclusive access.
                        let alloc_info = unsafe { alloc_info.as_mut() };

                        alloc_info.clear();

                        // Synchronize all threads.
                        //
                        // A second wait at the start so that clearing alloc
                        // info happens before any thread enters the timed
                        // section.
                        if let Some(barrier) = barrier {
                            barrier.wait();
                        }
                    }
                }
            };

            // The following logic chooses how to efficiently sample the
            // benchmark function once and assigns `sample_start`/`sample_end`
            // before/after the sample loop.
            //
            // NOTE: Testing and benchmarking should behave exactly the same
            // when getting the sample time span. We don't want to introduce
            // extra work that may worsen measurement quality for real
            // benchmarking.
            let sample_start: UntaggedTimestamp;
            let sample_end: UntaggedTimestamp;

            let instrument_hooks = InstrumentHooks::instance();
            // Fast path: the input is a ZST and the output either is a ZST or
            // has no drop glue, so no defer storage is needed at all.
            if size_of::<I>() == 0 && (size_of::<O>() == 0 || !mem::needs_drop::<O>()) {
                // Use a range instead of `defer_store` to make the benchmarking
                // loop cheaper.

                // Run `gen_input` the expected number of times in case it
                // updates external state used by `benched`.
                for _ in 0..sample_size {
                    let input = gen_input();
                    count_input(&input);

                    // Inputs are consumed/dropped later.
                    mem::forget(input);
                }

                sync_threads(true);

                // Timestamps for CodSpeed instrumentation bracket the timed
                // section, mirroring `sample_start`/`sample_end`.
                let start_time = InstrumentHooks::current_timestamp();
                sample_start = UntaggedTimestamp::start(timer_kind);

                // Sample loop:
                for _ in 0..sample_size {
                    // SAFETY: Input is a ZST, so we can construct one out of
                    // thin air.
                    let input = unsafe { UnsafeCell::new(MaybeUninit::<I>::zeroed()) };

                    mem::forget(black_box(benched(&input)));
                }

                sample_end = UntaggedTimestamp::end(timer_kind);
                let end_time = InstrumentHooks::current_timestamp();
                instrument_hooks.add_benchmark_timestamps(start_time, end_time);

                sync_threads(false);
                save_alloc_info();

                // Drop outputs and inputs.
                for _ in 0..sample_size {
                    // Output only needs drop if ZST.
                    if size_of::<O>() == 0 {
                        // SAFETY: Output is a ZST, so we can construct one out
                        // of thin air.
                        unsafe { _ = mem::zeroed::<O>() }
                    }

                    if mem::needs_drop::<I>() {
                        // SAFETY: Input is a ZST, so we can construct one out
                        // of thin air and not worry about aliasing.
                        unsafe { drop_input(&UnsafeCell::new(MaybeUninit::<I>::zeroed())) }
                    }
                }
            } else {
                // Slow path: inputs (and possibly outputs) live in
                // `defer_store` for the duration of the sample.
                defer_store.prepare(sample_size);

                match defer_store.slots() {
                    // Output needs to be dropped. We defer drop in the sample
                    // loop by inserting it into `defer_store`.
                    Ok(defer_slots_slice) => {
                        // Initialize and store inputs.
                        for DeferSlot { input, .. } in defer_slots_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_slots_iter = defer_slots_slice.iter();

                        sync_threads(true);
                        let start_time = InstrumentHooks::current_timestamp();
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for defer_slot in defer_slots_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized and we have exclusive access to the
                            // output slot.
                            unsafe {
                                let output = benched(&defer_slot.input);
                                *defer_slot.output.get() = MaybeUninit::new(output);
                            }
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        let end_time = InstrumentHooks::current_timestamp();
                        instrument_hooks.add_benchmark_timestamps(start_time, end_time);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // and outputs in the sample loop.
                        black_box(defer_slots_slice);

                        // Drop outputs and inputs.
                        for DeferSlot { input, output } in defer_slots_slice {
                            // SAFETY: All outputs were initialized in the
                            // sample loop and we have exclusive access.
                            unsafe { (*output.get()).assume_init_drop() }

                            if mem::needs_drop::<I>() {
                                // SAFETY: The output was dropped and thus we
                                // have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }

                    // Output does not need to be dropped.
                    Err(defer_inputs_slice) => {
                        // Initialize and store inputs.
                        for input in defer_inputs_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_inputs_iter = defer_inputs_slice.iter();

                        sync_threads(true);
                        let start_time = InstrumentHooks::current_timestamp();
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for input in defer_inputs_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized.
                            black_box_drop(unsafe { benched(input) });
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        let end_time = InstrumentHooks::current_timestamp();
                        instrument_hooks.add_benchmark_timestamps(start_time, end_time);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // in the sample loop.
                        black_box(defer_inputs_slice);

                        // Drop inputs.
                        if mem::needs_drop::<I>() {
                            for input in defer_inputs_slice {
                                // SAFETY: We have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }
                }
            }

            // SAFETY: These values are guaranteed to be the correct variant
            // because they were created from the same `timer_kind`.
            let interval = unsafe {
                [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
            };

            (interval, saved_alloc_info)
        }
    }
1071
1072    #[inline]
1073    fn initial_mode(&self) -> BenchMode {
1074        if self.shared_context.action.is_test() {
1075            BenchMode::Test
1076        } else if let Some(sample_size) = self.options.sample_size {
1077            BenchMode::Collect { sample_size }
1078        } else {
1079            BenchMode::Tune { sample_size: 1 }
1080        }
1081    }
1082
1083    pub fn compute_stats(&self) -> Stats {
1084        let time_samples = &self.samples.time_samples;
1085        let alloc_info_by_sample = &self.samples.alloc_info_by_sample;
1086
1087        let sample_count = time_samples.len();
1088        let sample_size = self.samples.sample_size;
1089
1090        let total_count = self.samples.iter_count();
1091
1092        let total_duration = self.samples.total_duration();
1093        let mean_duration = FineDuration {
1094            picos: total_duration.picos.checked_div(total_count as u128).unwrap_or_default(),
1095        };
1096
1097        // Samples sorted by duration.
1098        let sorted_samples = self.samples.sorted_samples();
1099        let median_samples = util::slice_middle(&sorted_samples);
1100
1101        let index_of_sample = |sample: &TimeSample| -> usize {
1102            util::slice_ptr_index(&self.samples.time_samples, sample)
1103        };
1104
1105        let counter_count_for_sample =
1106            |sample: &TimeSample, counter_kind: KnownCounterKind| -> Option<MaxCountUInt> {
1107                let counts = self.counters.counts(counter_kind);
1108
1109                let index = if self.counters.uses_input_counts(counter_kind) {
1110                    index_of_sample(sample)
1111                } else {
1112                    0
1113                };
1114
1115                counts.get(index).copied()
1116            };
1117
1118        let min_duration =
1119            sorted_samples.first().map(|s| s.duration / sample_size).unwrap_or_default();
1120        let max_duration =
1121            sorted_samples.last().map(|s| s.duration / sample_size).unwrap_or_default();
1122
1123        let median_duration = if median_samples.is_empty() {
1124            FineDuration::default()
1125        } else {
1126            let sum: u128 = median_samples.iter().map(|s| s.duration.picos).sum();
1127            FineDuration { picos: sum / median_samples.len() as u128 } / sample_size
1128        };
1129
1130        let counts = KnownCounterKind::ALL.map(|counter_kind| {
1131            let median: MaxCountUInt = {
1132                let mut sum: u128 = 0;
1133
1134                for sample in median_samples {
1135                    let sample_count = counter_count_for_sample(sample, counter_kind)? as u128;
1136
1137                    // Saturating add in case `MaxUIntCount > u64`.
1138                    sum = sum.saturating_add(sample_count);
1139                }
1140
1141                (sum / median_samples.len() as u128) as MaxCountUInt
1142            };
1143
1144            Some(StatsSet {
1145                fastest: sorted_samples
1146                    .first()
1147                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
1148                slowest: sorted_samples
1149                    .last()
1150                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
1151                median,
1152                mean: self.counters.mean_count(counter_kind),
1153            })
1154        });
1155
1156        let sample_alloc_info = |sample: Option<&TimeSample>| -> Option<&ThreadAllocInfo> {
1157            sample
1158                .and_then(|sample| u32::try_from(index_of_sample(sample)).ok())
1159                .and_then(|index| self.samples.alloc_info_by_sample.get(&index))
1160        };
1161
1162        let sample_alloc_tally = |sample: Option<&TimeSample>, op: AllocOp| -> ThreadAllocTally {
1163            sample_alloc_info(sample)
1164                .map(|alloc_info| alloc_info.tallies.get(op))
1165                .copied()
1166                .unwrap_or_default()
1167        };
1168
1169        let mut alloc_total_max_count = 0u128;
1170        let mut alloc_total_max_size = 0u128;
1171        let mut alloc_total_tallies = TotalAllocTallyMap::default();
1172
1173        for alloc_info in alloc_info_by_sample.values() {
1174            alloc_total_max_count += alloc_info.max_count as u128;
1175            alloc_total_max_size += alloc_info.max_size as u128;
1176            alloc_info.tallies.add_to_total(&mut alloc_total_tallies);
1177        }
1178
1179        let sample_size = f64::from(sample_size);
1180        Stats {
1181            sample_count: sample_count as u32,
1182            iter_count: total_count,
1183            time: StatsSet {
1184                fastest: min_duration,
1185                slowest: max_duration,
1186                median: median_duration,
1187                mean: mean_duration,
1188            },
1189            max_alloc: StatsSet {
1190                fastest: {
1191                    let alloc_info = sample_alloc_info(sorted_samples.first().copied());
1192
1193                    AllocTally {
1194                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
1195                            / sample_size,
1196                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
1197                            / sample_size,
1198                    }
1199                },
1200                slowest: {
1201                    let alloc_info = sample_alloc_info(sorted_samples.last().copied());
1202
1203                    AllocTally {
1204                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
1205                            / sample_size,
1206                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
1207                            / sample_size,
1208                    }
1209                },
1210                // TODO: Switch to median of alloc info itself, rather than
1211                // basing off of median times.
1212                median: {
1213                    let alloc_info_for_median =
1214                        |index| sample_alloc_info(median_samples.get(index).copied());
1215
1216                    let max_count_for_median = |index: usize| -> f64 {
1217                        alloc_info_for_median(index)
1218                            .map(|info| info.max_count as f64)
1219                            .unwrap_or_default()
1220                    };
1221
1222                    let max_size_for_median = |index: usize| -> f64 {
1223                        alloc_info_for_median(index)
1224                            .map(|info| info.max_size as f64)
1225                            .unwrap_or_default()
1226                    };
1227
1228                    let median_count = median_samples.len().max(1) as f64;
1229
1230                    let median_max_count = max_count_for_median(0) + max_count_for_median(1);
1231                    let median_max_size = max_size_for_median(0) + max_size_for_median(1);
1232
1233                    AllocTally {
1234                        count: median_max_count / median_count / sample_size,
1235                        size: median_max_size / median_count / sample_size,
1236                    }
1237                },
1238                mean: AllocTally {
1239                    count: alloc_total_max_count as f64 / total_count as f64,
1240                    size: alloc_total_max_size as f64 / total_count as f64,
1241                },
1242            }
1243            .transpose(),
1244            alloc_tallies: AllocOpMap {
1245                values: AllocOp::ALL
1246                    .map(|op| StatsSet {
1247                        fastest: {
1248                            let fastest = sample_alloc_tally(sorted_samples.first().copied(), op);
1249
1250                            AllocTally {
1251                                count: fastest.count as f64 / sample_size,
1252                                size: fastest.size as f64 / sample_size,
1253                            }
1254                        },
1255                        slowest: {
1256                            let slowest = sample_alloc_tally(sorted_samples.last().copied(), op);
1257
1258                            AllocTally {
1259                                count: slowest.count as f64 / sample_size,
1260                                size: slowest.size as f64 / sample_size,
1261                            }
1262                        },
1263                        median: {
1264                            let tally_for_median = |index: usize| -> ThreadAllocTally {
1265                                sample_alloc_tally(median_samples.get(index).copied(), op)
1266                            };
1267
1268                            let a = tally_for_median(0);
1269                            let b = tally_for_median(1);
1270
1271                            let median_count = median_samples.len().max(1) as f64;
1272
1273                            let avg_count = (a.count as f64 + b.count as f64) / median_count;
1274                            let avg_size = (a.size as f64 + b.size as f64) / median_count;
1275
1276                            AllocTally {
1277                                count: avg_count / sample_size,
1278                                size: avg_size / sample_size,
1279                            }
1280                        },
1281                        mean: {
1282                            let tally = alloc_total_tallies.get(op);
1283                            AllocTally {
1284                                count: tally.count as f64 / total_count as f64,
1285                                size: tally.size as f64 / total_count as f64,
1286                            }
1287                        },
1288                    })
1289                    .map(StatsSet::transpose),
1290            },
1291            counts,
1292        }
1293    }
1294}
1295
1296impl<T> StatsSet<AllocTally<T>> {
1297    #[inline]
1298    pub fn transpose(self) -> AllocTally<StatsSet<T>> {
1299        AllocTally {
1300            count: StatsSet {
1301                fastest: self.fastest.count,
1302                slowest: self.slowest.count,
1303                median: self.median.count,
1304                mean: self.mean.count,
1305            },
1306            size: StatsSet {
1307                fastest: self.fastest.size,
1308                slowest: self.slowest.size,
1309                median: self.median.size,
1310                mean: self.mean.size,
1311            },
1312        }
1313    }
1314}