codspeed_divan_compat_walltime/bench/
mod.rs

1use std::{
2    cell::UnsafeCell,
3    fmt,
4    mem::{self, MaybeUninit},
5    num::NonZeroUsize,
6    sync::Barrier,
7};
8
9use crate::{
10    alloc::{
11        AllocOp, AllocOpMap, AllocTally, ThreadAllocInfo, ThreadAllocTally, TotalAllocTallyMap,
12    },
13    black_box, black_box_drop,
14    counter::{
15        AnyCounter, AsCountUInt, BytesCount, CharsCount, Counter, CounterCollection, CyclesCount,
16        IntoCounter, ItemsCount, KnownCounterKind, MaxCountUInt,
17    },
18    divan::SharedContext,
19    stats::{RawSample, SampleCollection, Stats, StatsSet, TimeSample},
20    thread_pool::BENCH_POOL,
21    time::{FineDuration, Timestamp, UntaggedTimestamp},
22    util::{self, sync::SyncWrap, Unit},
23};
24
25#[cfg(test)]
26mod tests;
27
28mod args;
29mod defer;
30mod options;
31
32use defer::{DeferSlot, DeferStore};
33
34pub use self::{
35    args::{BenchArgs, BenchArgsRunner},
36    options::BenchOptions,
37};
38
/// Default number of samples collected per benchmark when the user does not
/// set `sample_count` (see `options.sample_count.unwrap_or(...)` in the
/// benchmark loop).
pub(crate) const DEFAULT_SAMPLE_COUNT: u32 = 100;
40
/// Enables contextual benchmarking in [`#[divan::bench]`](attr.bench.html).
///
/// # Examples
///
/// ```
/// use divan::{Bencher, black_box};
///
/// #[divan::bench]
/// fn copy_from_slice(bencher: Bencher) {
///     // Input and output buffers get used in the closure.
///     let src = (0..100).collect::<Vec<i32>>();
///     let mut dst = vec![0; src.len()];
///
///     bencher.bench_local(|| {
///         black_box(&mut dst).copy_from_slice(black_box(&src));
///     });
/// }
/// ```
#[must_use = "a benchmark function must be registered"]
pub struct Bencher<'a, 'b, C = BencherConfig> {
    // Mutable handle to the benchmark loop state that `bench_*` methods
    // ultimately drive (sample recording, counters, options).
    pub(crate) context: &'a mut BenchContext<'b>,
    // Statically-typed configuration (currently the input generator); the
    // type parameter changes as builder methods like `with_inputs` are used.
    pub(crate) config: C,
}
64
/// Public-in-private type for statically-typed `Bencher` configuration.
///
/// This enables configuring `Bencher` using the builder pattern with zero
/// runtime cost.
pub struct BencherConfig<GenI = Unit> {
    // Per-iteration input generator. Defaults to `Unit` (set in
    // `Bencher::new`) until `Bencher::with_inputs` replaces it.
    gen_input: GenI,
}
72
73impl<C> fmt::Debug for Bencher<'_, '_, C> {
74    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
75        f.debug_struct("Bencher").finish_non_exhaustive()
76    }
77}
78
79impl<'a, 'b> Bencher<'a, 'b> {
80    #[inline]
81    pub(crate) fn new(context: &'a mut BenchContext<'b>) -> Self {
82        Self { context, config: BencherConfig { gen_input: Unit } }
83    }
84}
85
86impl<'a, 'b> Bencher<'a, 'b> {
87    /// Benchmarks a function.
88    ///
89    /// The function can be benchmarked in parallel using the [`threads`
90    /// option](macro@crate::bench#threads). If the function is strictly
91    /// single-threaded, use [`Bencher::bench_local`] instead.
92    ///
93    /// # Examples
94    ///
95    /// ```
96    /// #[divan::bench]
97    /// fn bench(bencher: divan::Bencher) {
98    ///     bencher.bench(|| {
99    ///         // Benchmarked code...
100    ///     });
101    /// }
102    /// ```
103    pub fn bench<O, B>(self, benched: B)
104    where
105        B: Fn() -> O + Sync,
106    {
107        // Reusing `bench_values` for a zero-sized non-drop input type should
108        // have no overhead.
109        self.with_inputs(|| ()).bench_values(|_: ()| benched());
110    }
111
112    /// Benchmarks a function on the current thread.
113    ///
114    /// # Examples
115    ///
116    /// ```
117    /// #[divan::bench]
118    /// fn bench(bencher: divan::Bencher) {
119    ///     bencher.bench_local(|| {
120    ///         // Benchmarked code...
121    ///     });
122    /// }
123    /// ```
124    pub fn bench_local<O, B>(self, mut benched: B)
125    where
126        B: FnMut() -> O,
127    {
128        // Reusing `bench_local_values` for a zero-sized non-drop input type
129        // should have no overhead.
130        self.with_inputs(|| ()).bench_local_values(|_: ()| benched());
131    }
132
133    /// Generate inputs for the [benchmarked function](#input-bench).
134    ///
135    /// Time spent generating inputs does not affect benchmark timing.
136    ///
137    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
138    /// generator is called on the same thread as the sample loop that uses that
139    /// input.
140    ///
141    /// # Examples
142    ///
143    /// ```
144    /// #[divan::bench]
145    /// fn bench(bencher: divan::Bencher) {
146    ///     bencher
147    ///         .with_inputs(|| {
148    ///             // Generate input:
149    ///             String::from("...")
150    ///         })
151    ///         .bench_values(|s| {
152    ///             // Use input by-value:
153    ///             s + "123"
154    ///         });
155    /// }
156    /// ```
157    pub fn with_inputs<G>(self, gen_input: G) -> Bencher<'a, 'b, BencherConfig<G>> {
158        Bencher { context: self.context, config: BencherConfig { gen_input } }
159    }
160}
161
162impl<'a, 'b, GenI> Bencher<'a, 'b, BencherConfig<GenI>> {
163    /// Assign a [`Counter`] for all iterations of the benchmarked function.
164    ///
165    /// This will either:
166    /// - Assign a new counter
167    /// - Override an existing counter of the same type
168    ///
169    /// If the counter depends on [generated inputs](Self::with_inputs), use
170    /// [`Bencher::input_counter`] instead.
171    ///
172    /// If context is not needed, the counter can instead be set via
173    /// [`#[divan::bench(counters = ...)]`](macro@crate::bench#counters).
174    ///
175    /// # Examples
176    ///
177    /// ```
178    /// use divan::{Bencher, counter::BytesCount};
179    ///
180    /// #[divan::bench]
181    /// fn char_count(bencher: Bencher) {
182    ///     let s: String = // ...
183    ///     # String::new();
184    ///
185    ///     bencher
186    ///         .counter(BytesCount::of_str(&s))
187    ///         .bench(|| {
188    ///             divan::black_box(&s).chars().count()
189    ///         });
190    /// }
191    /// ```
192    #[doc(alias = "throughput")]
193    pub fn counter<C>(self, counter: C) -> Self
194    where
195        C: IntoCounter,
196    {
197        let counter = AnyCounter::new(counter);
198        self.context.counters.set_counter(counter);
199        self
200    }
201}
202
/// <span id="input-bench"></span> Benchmark over [generated inputs](Self::with_inputs).
impl<'a, 'b, I, GenI> Bencher<'a, 'b, BencherConfig<GenI>>
where
    GenI: FnMut() -> I,
{
    /// Calls a closure to create a [`Counter`] for each input of the
    /// benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter is constant, use [`Bencher::counter`] instead.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// counter is called on the same thread as the sample loop that generates
    /// and uses that input.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of bytes processed when
    /// benchmarking [`char`-counting](std::str::Chars::count). The byte count
    /// is gotten by calling [`BytesCount::of_str`] on each iteration's input
    /// [`String`].
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> String {
    ///             // ...
    ///             # String::new()
    ///         })
    ///         .input_counter(BytesCount::of_str)
    ///         .bench_refs(|s| {
    ///             s.chars().count()
    ///         });
    /// }
    /// ```
    pub fn input_counter<C, F>(self, make_counter: F) -> Self
    where
        F: Fn(&I) -> C + Sync + 'static,
        C: IntoCounter,
    {
        self.context.counters.set_input_counter(make_counter);
        self
    }

    /// Creates a [`Counter`] from each input of the benchmarked function.
    ///
    /// This may be used if the input returns [`u8`]–[`u64`], [`usize`], or any
    /// nesting of references to those types.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of items processed when
    /// benchmarking [`FromIterator`] from
    /// <code>[Range](std::ops::Range)<[usize]></code> to [`Vec`].
    ///
    /// ```
    /// use divan::{Bencher, counter::ItemsCount};
    ///
    /// #[divan::bench]
    /// fn range_to_vec(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> usize {
    ///             // ...
    ///             # 0
    ///         })
    ///         .count_inputs_as::<ItemsCount>()
    ///         .bench_values(|n| -> Vec<usize> {
    ///             (0..n).collect()
    ///         });
    /// }
    /// ```
    #[inline]
    pub fn count_inputs_as<C>(self) -> Self
    where
        C: Counter,
        I: AsCountUInt,
    {
        // Exhaustive over all known counter kinds; adding a new kind forces a
        // compile error here, keeping this dispatch in sync.
        match KnownCounterKind::of::<C>() {
            KnownCounterKind::Bytes => self.input_counter(|c| BytesCount::from(c)),
            KnownCounterKind::Chars => self.input_counter(|c| CharsCount::from(c)),
            KnownCounterKind::Cycles => self.input_counter(|c| CyclesCount::from(c)),
            KnownCounterKind::Items => self.input_counter(|c| ItemsCount::from(c)),
        }
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local_values`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn bench_values<O, B>(self, benched: B)
    where
        B: Fn(I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     let mut values = Vec::new();
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_values(|s| {
    ///             // Use input by-value:
    ///             values.push(s);
    ///         });
    /// }
    /// ```
    pub fn bench_local_values<O, B>(self, mut benched: B)
    where
        B: FnMut(I) -> O,
    {
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_refs<O, B>(self, benched: B)
    where
        B: Fn(&mut I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`, so unlike the
            // by-value variants, the input must be dropped here.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_local_refs<O, B>(self, mut benched: B)
    where
        B: FnMut(&mut I) -> O,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`, so unlike the
            // by-value variants, the input must be dropped here.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }
}
471
/// State machine for how the benchmark is being run.
#[derive(Clone, Copy)]
pub(crate) enum BenchMode {
    /// The benchmark is being run as `--test`.
    ///
    /// Don't collect samples and run exactly once.
    Test,

    /// Scale `sample_size` to determine the right size for collecting.
    ///
    /// The benchmark loop doubles `sample_size` until a sample's duration
    /// sufficiently exceeds timer precision, then switches to `Collect`.
    Tune { sample_size: u32 },

    /// Simply collect samples.
    Collect { sample_size: u32 },
}
486
487impl BenchMode {
488    #[inline]
489    pub fn is_test(self) -> bool {
490        matches!(self, Self::Test)
491    }
492
493    #[inline]
494    pub fn is_tune(self) -> bool {
495        matches!(self, Self::Tune { .. })
496    }
497
498    #[inline]
499    pub fn is_collect(self) -> bool {
500        matches!(self, Self::Collect { .. })
501    }
502
503    #[inline]
504    pub fn sample_size(self) -> u32 {
505        match self {
506            Self::Test => 1,
507            Self::Tune { sample_size, .. } | Self::Collect { sample_size, .. } => sample_size,
508        }
509    }
510}
511
/// `#[divan::bench]` loop context.
///
/// Functions called within the benchmark loop should be `#[inline(always)]` to
/// ensure instruction cache locality.
pub(crate) struct BenchContext<'a> {
    // Context shared across benchmarks; the loop reads its timer for
    // timestamps, precision, and overhead estimates.
    shared_context: &'a SharedContext,

    /// User-configured options.
    pub options: &'a BenchOptions<'a>,

    /// Whether the benchmark loop was started.
    pub did_run: bool,

    /// The number of threads to run the benchmark. The default is 1.
    ///
    /// When set to 1, the benchmark loop is guaranteed to stay on the current
    /// thread and not spawn any threads.
    pub thread_count: NonZeroUsize,

    /// Recorded samples.
    pub samples: SampleCollection,

    /// Per-iteration counters grouped by sample.
    counters: CounterCollection,
}
537
538impl<'a> BenchContext<'a> {
539    /// Creates a new benchmarking context.
540    pub fn new(
541        shared_context: &'a SharedContext,
542        options: &'a BenchOptions,
543        thread_count: NonZeroUsize,
544    ) -> Self {
545        Self {
546            shared_context,
547            options,
548            thread_count,
549            did_run: false,
550            samples: SampleCollection::default(),
551            counters: options.counters.to_collection(),
552        }
553    }
554
    /// Runs the single-threaded loop for benchmarking `benched`.
    ///
    /// This wraps the caller's `FnMut`/non-`Sync` closures so they satisfy the
    /// `Fn + Sync` bounds of `bench_loop_threaded`, which is sound only
    /// because `thread_count` is forced to 1 below.
    ///
    /// # Safety
    ///
    /// See `bench_loop_threaded`.
    pub fn bench_loop_local<I, O>(
        &mut self,
        gen_input: impl FnMut() -> I,
        benched: impl FnMut(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) {
        // SAFETY: Closures are guaranteed to run on the current thread, so they
        // can safely be mutable and non-`Sync`.
        unsafe {
            // `SyncWrap` asserts `Sync`; `UnsafeCell` permits calling the
            // `FnMut` closures through a shared reference.
            let gen_input = SyncWrap::new(UnsafeCell::new(gen_input));
            let benched = SyncWrap::new(UnsafeCell::new(benched));
            let drop_input = SyncWrap::new(drop_input);

            // Force single-threaded execution; this is what makes the
            // `SyncWrap` wrappers above sound.
            self.thread_count = NonZeroUsize::MIN;
            self.bench_loop_threaded::<I, O>(
                || (*gen_input.get())(),
                |input| (*benched.get())(input),
                |input| drop_input(input),
            )
        }
    }
581
    /// Runs the multi-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// If `self.threads` is 1, the incoming closures will not escape the
    /// current thread. This guarantee ensures `bench_loop_local` can soundly
    /// reuse this method with mutable non-`Sync` closures.
    ///
    /// When `benched` is called:
    /// - `I` is guaranteed to be initialized.
    /// - No external `&I` or `&mut I` exists.
    ///
    /// When `drop_input` is called:
    /// - All instances of `O` returned from `benched` have been dropped.
    /// - The same guarantees for `I` apply as in `benched`, unless `benched`
    ///   escaped references to `I`.
    fn bench_loop_threaded<I, O>(
        &mut self,
        gen_input: impl Fn() -> I + Sync,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O + Sync,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>) + Sync,
    ) {
        self.did_run = true;

        let mut current_mode = self.initial_mode();
        let is_test = current_mode.is_test();

        let record_sample = self.sample_recorder(gen_input, benched, drop_input);

        let thread_count = self.thread_count.get();
        let aux_thread_count = thread_count - 1;

        let is_single_thread = aux_thread_count == 0;

        // Per-thread sample info returned by `record_sample`. These are
        // processed locally to emit user-facing sample info. As a result, this
        // only contains `thread_count` many elements at a time.
        let mut raw_samples = Vec::<Option<RawSample>>::new();

        // The time spent benchmarking, in picoseconds.
        //
        // Unless `skip_ext_time` is set, this includes time external to
        // `benched`, such as time spent generating inputs and running drop.
        let mut elapsed_picos: u128 = 0;

        // The minimum time for benchmarking, in picoseconds.
        let min_picos = self.options.min_time().picos;

        // The remaining time left for benchmarking, in picoseconds.
        let max_picos = self.options.max_time().picos;

        // Don't bother running if user specifies 0 max time or 0 samples.
        if max_picos == 0 || !self.options.has_samples() {
            return;
        }

        let timer = self.shared_context.timer;
        let timer_kind = timer.kind();

        // `None` while tuning; set once we switch to `Collect` mode.
        let mut rem_samples = if current_mode.is_collect() {
            Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT))
        } else {
            None
        };

        // Only measure precision if we need to tune sample size.
        let timer_precision =
            if current_mode.is_tune() { timer.precision() } else { FineDuration::default() };

        if !is_test {
            self.samples.time_samples.reserve(self.options.sample_count.unwrap_or(1) as usize);
        }

        let skip_ext_time = self.options.skip_ext_time.unwrap_or_default();
        let initial_start = if skip_ext_time { None } else { Some(Timestamp::start(timer_kind)) };

        let bench_overheads = timer.bench_overheads();

        // NOTE(review): guard appears to signal the CodSpeed runner over a
        // FIFO for the duration of the loop on Unix — confirm against
        // `codspeed::fifo` docs.
        #[cfg(unix)]
        let _guard = codspeed::fifo::BenchGuard::new_with_runner_fifo();
        // `while { cond-block } { body }`: the condition is a block expression
        // evaluated before each iteration.
        while {
            // Conditions for when sampling is over:
            if elapsed_picos >= max_picos {
                // Depleted the benchmarking time budget. This is a strict
                // condition regardless of sample count and minimum time.
                false
            } else if rem_samples.unwrap_or(1) > 0 {
                // More samples expected.
                true
            } else {
                // Continue if we haven't reached the time floor.
                elapsed_picos < min_picos
            }
        } {
            let sample_size = current_mode.sample_size();
            self.samples.sample_size = sample_size;

            let barrier = if is_single_thread { None } else { Some(Barrier::new(thread_count)) };

            // Sample loop helper:
            let record_sample = || -> RawSample {
                let mut counter_totals: [u128; KnownCounterKind::COUNT] =
                    [0; KnownCounterKind::COUNT];

                // Updates per-input counter info for this sample.
                let mut count_input = |input: &I| {
                    for counter_kind in KnownCounterKind::ALL {
                        // SAFETY: The `I` type cannot change since `with_inputs`
                        // cannot be called more than once on the same `Bencher`.
                        if let Some(count) =
                            unsafe { self.counters.get_input_count(counter_kind, input) }
                        {
                            let total = &mut counter_totals[counter_kind as usize];
                            *total = (*total).saturating_add(count as u128);
                        }
                    }
                };

                // Sample loop:
                let ([start, end], alloc_info) =
                    record_sample(sample_size as usize, barrier.as_ref(), &mut count_input);

                RawSample { start, end, timer, alloc_info, counter_totals }
            };

            // Sample loop: current thread records one sample; `aux_thread_count`
            // pool threads record the rest in parallel.
            raw_samples.clear();
            BENCH_POOL.par_extend(&mut raw_samples, aux_thread_count, |_| record_sample());

            // Convert `&[Option<RawSample>]` to `&[Sample]`.
            let raw_samples: &[RawSample] = {
                if let Some(thread) = raw_samples
                    .iter()
                    .enumerate()
                    .find_map(|(thread, sample)| sample.is_none().then_some(thread))
                {
                    panic!("Divan benchmarking thread {thread} panicked");
                }

                // SAFETY-adjacent: every element was just checked to be `Some`,
                // and the size assertion checks the niche-optimized layout
                // matches before reinterpreting the slice.
                unsafe {
                    assert_eq!(mem::size_of::<RawSample>(), mem::size_of::<Option<RawSample>>());
                    std::slice::from_raw_parts(raw_samples.as_ptr().cast(), raw_samples.len())
                }
            };

            // If testing, exit the benchmarking loop immediately after timing a
            // single run.
            if is_test {
                break;
            }

            // `unwrap` is fine: the panic check above guarantees at least
            // `thread_count >= 1` samples exist.
            let slowest_sample = raw_samples.iter().max_by_key(|s| s.duration()).unwrap();
            let slowest_time = slowest_sample.duration();

            // TODO: Make tuning be less influenced by early runs. Currently if
            // early runs are very quick but later runs are slow, benchmarking
            // will take a very long time.
            //
            // TODO: Make `sample_size` consider time generating inputs and
            // dropping inputs/outputs. Currently benchmarks like
            // `Bencher::bench_refs(String::clear)` take a very long time.
            if current_mode.is_tune() {
                // Clear previous smaller samples.
                self.samples.clear();
                self.counters.clear_input_counts();

                // If within 100x timer precision, continue tuning.
                let precision_multiple = slowest_time.picos / timer_precision.picos;
                if precision_multiple <= 100 {
                    current_mode = BenchMode::Tune { sample_size: sample_size * 2 };
                } else {
                    current_mode = BenchMode::Collect { sample_size };
                    rem_samples = Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT));
                }
            }

            // Returns the sample's duration adjusted for overhead.
            let sample_duration_sub_overhead = |raw_sample: &RawSample| {
                let overhead = bench_overheads.total_overhead(sample_size, &raw_sample.alloc_info);

                FineDuration {
                    picos: raw_sample
                        .duration()
                        .clamp_to(timer_precision)
                        .picos
                        .saturating_sub(overhead.picos),
                }
                // Clamp again so subtracting overhead cannot push the duration
                // below the timer's measurable precision.
                .clamp_to(timer_precision)
            };

            for raw_sample in raw_samples {
                let sample_index = self.samples.time_samples.len();

                self.samples
                    .time_samples
                    .push(TimeSample { duration: sample_duration_sub_overhead(raw_sample) });

                // Only store allocation info for samples that allocated.
                if !raw_sample.alloc_info.tallies.is_empty() {
                    self.samples
                        .alloc_info_by_sample
                        .insert(sample_index as u32, raw_sample.alloc_info.clone());
                }

                // Insert per-input counter information.
                for counter_kind in KnownCounterKind::ALL {
                    if !self.counters.uses_input_counts(counter_kind) {
                        continue;
                    }

                    let total_count = raw_sample.counter_totals[counter_kind as usize];

                    // Cannot overflow `MaxCountUInt` because `total_count`
                    // cannot exceed `MaxCountUInt::MAX * sample_size`.
                    let per_iter_count = (total_count / sample_size as u128) as MaxCountUInt;

                    self.counters.push_counter(AnyCounter::known(counter_kind, per_iter_count));
                }

                if let Some(rem_samples) = &mut rem_samples {
                    *rem_samples = rem_samples.saturating_sub(1);
                }
            }

            if let Some(initial_start) = initial_start {
                let last_end = raw_samples.iter().map(|s| s.end).max().unwrap();
                elapsed_picos = last_end.duration_since(initial_start, timer).picos;
            } else {
                // Progress by at least 1ns to prevent extremely fast
                // functions from taking forever when `min_time` is set.
                let progress_picos = slowest_time.picos.max(1_000);
                elapsed_picos = elapsed_picos.saturating_add(progress_picos);
            }
        }
        // End the CodSpeed guard's scope explicitly once sampling finishes.
        #[cfg(unix)]
        core::mem::drop(_guard);

        // Reset flag for ignoring allocations.
        crate::alloc::IGNORE_ALLOC.set(false);
    }
821
    /// Returns a closure that takes the sample size and input counter, and then
    /// returns a newly recorded sample.
    ///
    /// `gen_input` produces one benchmark input per iteration, `benched` runs
    /// the benchmarked function over an input slot, and `drop_input` destroys
    /// an input slot after its output has been dropped. The returned closure
    /// yields the `[start, end]` timestamps of the timed section together with
    /// the thread's allocation info captured for that sample.
    fn sample_recorder<I, O>(
        &self,
        gen_input: impl Fn() -> I,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) -> impl Fn(usize, Option<&Barrier>, &mut dyn FnMut(&I)) -> ([Timestamp; 2], ThreadAllocInfo)
    {
        // We defer:
        // - Usage of `gen_input` values.
        // - Drop destructor for `O`, preventing it from affecting sample
        //   measurements. Outputs are stored into a pre-allocated buffer during
        //   the sample loop. The allocation is reused between samples to reduce
        //   time spent between samples.

        let timer_kind = self.shared_context.timer.kind();

        move |sample_size: usize, barrier: Option<&Barrier>, count_input: &mut dyn FnMut(&I)| {
            let mut defer_store = DeferStore::<I, O>::default();

            // Snapshot of this thread's allocation tallies, captured right
            // after the timed section ends so that only in-section allocations
            // are attributed to the sample.
            let mut saved_alloc_info = ThreadAllocInfo::new();
            let mut save_alloc_info = || {
                // Skip while allocation tracking is suppressed.
                if crate::alloc::IGNORE_ALLOC.get() {
                    return;
                }

                if let Some(alloc_info) = ThreadAllocInfo::try_current() {
                    // SAFETY: We have exclusive access.
                    saved_alloc_info = unsafe { alloc_info.as_ptr().read() };
                }
            };

            // Synchronize all threads to start timed section simultaneously and
            // clear every thread's memory profiling info.
            //
            // This ensures work external to the timed section does not affect
            // the timing of other threads.
            let sync_threads = |is_start: bool| {
                sync_impl(barrier, is_start);

                // Monomorphize implementation to reduce code size.
                #[inline(never)]
                fn sync_impl(barrier: Option<&Barrier>, is_start: bool) {
                    // Ensure benchmarked section has a `ThreadAllocInfo`
                    // allocated for the current thread and clear previous info.
                    let alloc_info = if is_start { ThreadAllocInfo::current() } else { None };

                    // Synchronize all threads.
                    //
                    // This is the final synchronization point for the end.
                    if let Some(barrier) = barrier {
                        barrier.wait();
                    }

                    if let Some(mut alloc_info) = alloc_info {
                        // SAFETY: We have exclusive access.
                        let alloc_info = unsafe { alloc_info.as_mut() };

                        alloc_info.clear();

                        // Synchronize all threads.
                        if let Some(barrier) = barrier {
                            barrier.wait();
                        }
                    }
                }
            };

            // The following logic chooses how to efficiently sample the
            // benchmark function once and assigns `sample_start`/`sample_end`
            // before/after the sample loop.
            //
            // NOTE: Testing and benchmarking should behave exactly the same
            // when getting the sample time span. We don't want to introduce
            // extra work that may worsen measurement quality for real
            // benchmarking.
            let sample_start: UntaggedTimestamp;
            let sample_end: UntaggedTimestamp;

            // Fast path: when both input and (droppable) output are ZSTs there
            // is nothing to store, so the deferred-slot buffer can be skipped.
            if size_of::<I>() == 0 && (size_of::<O>() == 0 || !mem::needs_drop::<O>()) {
                // Use a range instead of `defer_store` to make the benchmarking
                // loop cheaper.

                // Run `gen_input` the expected number of times in case it
                // updates external state used by `benched`.
                for _ in 0..sample_size {
                    let input = gen_input();
                    count_input(&input);

                    // Inputs are consumed/dropped later.
                    mem::forget(input);
                }

                sync_threads(true);
                sample_start = UntaggedTimestamp::start(timer_kind);

                // Sample loop:
                for _ in 0..sample_size {
                    // SAFETY: Input is a ZST, so we can construct one out of
                    // thin air.
                    let input = unsafe { UnsafeCell::new(MaybeUninit::<I>::zeroed()) };

                    mem::forget(black_box(benched(&input)));
                }

                sample_end = UntaggedTimestamp::end(timer_kind);
                sync_threads(false);
                save_alloc_info();

                // Drop outputs and inputs.
                for _ in 0..sample_size {
                    // Output only needs drop if ZST.
                    if size_of::<O>() == 0 {
                        // SAFETY: Output is a ZST, so we can construct one out
                        // of thin air.
                        unsafe { _ = mem::zeroed::<O>() }
                    }

                    if mem::needs_drop::<I>() {
                        // SAFETY: Input is a ZST, so we can construct one out
                        // of thin air and not worry about aliasing.
                        unsafe { drop_input(&UnsafeCell::new(MaybeUninit::<I>::zeroed())) }
                    }
                }
            } else {
                defer_store.prepare(sample_size);

                match defer_store.slots() {
                    // Output needs to be dropped. We defer drop in the sample
                    // loop by inserting it into `defer_store`.
                    Ok(defer_slots_slice) => {
                        // Initialize and store inputs.
                        for DeferSlot { input, .. } in defer_slots_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_slots_iter = defer_slots_slice.iter();

                        sync_threads(true);
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for defer_slot in defer_slots_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized and we have exclusive access to the
                            // output slot.
                            unsafe {
                                let output = benched(&defer_slot.input);
                                *defer_slot.output.get() = MaybeUninit::new(output);
                            }
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // and outputs in the sample loop.
                        black_box(defer_slots_slice);

                        // Drop outputs and inputs.
                        for DeferSlot { input, output } in defer_slots_slice {
                            // SAFETY: All outputs were initialized in the
                            // sample loop and we have exclusive access.
                            unsafe { (*output.get()).assume_init_drop() }

                            if mem::needs_drop::<I>() {
                                // SAFETY: The output was dropped and thus we
                                // have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }

                    // Output does not need to be dropped.
                    Err(defer_inputs_slice) => {
                        // Initialize and store inputs.
                        for input in defer_inputs_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_inputs_iter = defer_inputs_slice.iter();

                        sync_threads(true);
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for input in defer_inputs_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized.
                            black_box_drop(unsafe { benched(input) });
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // in the sample loop.
                        black_box(defer_inputs_slice);

                        // Drop inputs.
                        if mem::needs_drop::<I>() {
                            for input in defer_inputs_slice {
                                // SAFETY: We have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }
                }
            }

            // SAFETY: These values are guaranteed to be the correct variant
            // because they were created from the same `timer_kind`.
            let interval = unsafe {
                [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
            };

            (interval, saved_alloc_info)
        }
    }
1060
1061    #[inline]
1062    fn initial_mode(&self) -> BenchMode {
1063        if self.shared_context.action.is_test() {
1064            BenchMode::Test
1065        } else if let Some(sample_size) = self.options.sample_size {
1066            BenchMode::Collect { sample_size }
1067        } else {
1068            BenchMode::Tune { sample_size: 1 }
1069        }
1070    }
1071
1072    pub fn compute_stats(&self) -> Stats {
1073        let time_samples = &self.samples.time_samples;
1074        let alloc_info_by_sample = &self.samples.alloc_info_by_sample;
1075
1076        let sample_count = time_samples.len();
1077        let sample_size = self.samples.sample_size;
1078
1079        let total_count = self.samples.iter_count();
1080
1081        let total_duration = self.samples.total_duration();
1082        let mean_duration = FineDuration {
1083            picos: total_duration.picos.checked_div(total_count as u128).unwrap_or_default(),
1084        };
1085
1086        // Samples sorted by duration.
1087        let sorted_samples = self.samples.sorted_samples();
1088        let median_samples = util::slice_middle(&sorted_samples);
1089
1090        let index_of_sample = |sample: &TimeSample| -> usize {
1091            util::slice_ptr_index(&self.samples.time_samples, sample)
1092        };
1093
1094        let counter_count_for_sample =
1095            |sample: &TimeSample, counter_kind: KnownCounterKind| -> Option<MaxCountUInt> {
1096                let counts = self.counters.counts(counter_kind);
1097
1098                let index = if self.counters.uses_input_counts(counter_kind) {
1099                    index_of_sample(sample)
1100                } else {
1101                    0
1102                };
1103
1104                counts.get(index).copied()
1105            };
1106
1107        let min_duration =
1108            sorted_samples.first().map(|s| s.duration / sample_size).unwrap_or_default();
1109        let max_duration =
1110            sorted_samples.last().map(|s| s.duration / sample_size).unwrap_or_default();
1111
1112        let median_duration = if median_samples.is_empty() {
1113            FineDuration::default()
1114        } else {
1115            let sum: u128 = median_samples.iter().map(|s| s.duration.picos).sum();
1116            FineDuration { picos: sum / median_samples.len() as u128 } / sample_size
1117        };
1118
1119        let counts = KnownCounterKind::ALL.map(|counter_kind| {
1120            let median: MaxCountUInt = {
1121                let mut sum: u128 = 0;
1122
1123                for sample in median_samples {
1124                    let sample_count = counter_count_for_sample(sample, counter_kind)? as u128;
1125
1126                    // Saturating add in case `MaxUIntCount > u64`.
1127                    sum = sum.saturating_add(sample_count);
1128                }
1129
1130                (sum / median_samples.len() as u128) as MaxCountUInt
1131            };
1132
1133            Some(StatsSet {
1134                fastest: sorted_samples
1135                    .first()
1136                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
1137                slowest: sorted_samples
1138                    .last()
1139                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
1140                median,
1141                mean: self.counters.mean_count(counter_kind),
1142            })
1143        });
1144
1145        let sample_alloc_info = |sample: Option<&TimeSample>| -> Option<&ThreadAllocInfo> {
1146            sample
1147                .and_then(|sample| u32::try_from(index_of_sample(sample)).ok())
1148                .and_then(|index| self.samples.alloc_info_by_sample.get(&index))
1149        };
1150
1151        let sample_alloc_tally = |sample: Option<&TimeSample>, op: AllocOp| -> ThreadAllocTally {
1152            sample_alloc_info(sample)
1153                .map(|alloc_info| alloc_info.tallies.get(op))
1154                .copied()
1155                .unwrap_or_default()
1156        };
1157
1158        let mut alloc_total_max_count = 0u128;
1159        let mut alloc_total_max_size = 0u128;
1160        let mut alloc_total_tallies = TotalAllocTallyMap::default();
1161
1162        for alloc_info in alloc_info_by_sample.values() {
1163            alloc_total_max_count += alloc_info.max_count as u128;
1164            alloc_total_max_size += alloc_info.max_size as u128;
1165            alloc_info.tallies.add_to_total(&mut alloc_total_tallies);
1166        }
1167
1168        let sample_size = f64::from(sample_size);
1169        Stats {
1170            sample_count: sample_count as u32,
1171            iter_count: total_count,
1172            time: StatsSet {
1173                fastest: min_duration,
1174                slowest: max_duration,
1175                median: median_duration,
1176                mean: mean_duration,
1177            },
1178            max_alloc: StatsSet {
1179                fastest: {
1180                    let alloc_info = sample_alloc_info(sorted_samples.first().copied());
1181
1182                    AllocTally {
1183                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
1184                            / sample_size,
1185                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
1186                            / sample_size,
1187                    }
1188                },
1189                slowest: {
1190                    let alloc_info = sample_alloc_info(sorted_samples.last().copied());
1191
1192                    AllocTally {
1193                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
1194                            / sample_size,
1195                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
1196                            / sample_size,
1197                    }
1198                },
1199                // TODO: Switch to median of alloc info itself, rather than
1200                // basing off of median times.
1201                median: {
1202                    let alloc_info_for_median =
1203                        |index| sample_alloc_info(median_samples.get(index).copied());
1204
1205                    let max_count_for_median = |index: usize| -> f64 {
1206                        alloc_info_for_median(index)
1207                            .map(|info| info.max_count as f64)
1208                            .unwrap_or_default()
1209                    };
1210
1211                    let max_size_for_median = |index: usize| -> f64 {
1212                        alloc_info_for_median(index)
1213                            .map(|info| info.max_size as f64)
1214                            .unwrap_or_default()
1215                    };
1216
1217                    let median_count = median_samples.len().max(1) as f64;
1218
1219                    let median_max_count = max_count_for_median(0) + max_count_for_median(1);
1220                    let median_max_size = max_size_for_median(0) + max_size_for_median(1);
1221
1222                    AllocTally {
1223                        count: median_max_count / median_count / sample_size,
1224                        size: median_max_size / median_count / sample_size,
1225                    }
1226                },
1227                mean: AllocTally {
1228                    count: alloc_total_max_count as f64 / total_count as f64,
1229                    size: alloc_total_max_size as f64 / total_count as f64,
1230                },
1231            }
1232            .transpose(),
1233            alloc_tallies: AllocOpMap {
1234                values: AllocOp::ALL
1235                    .map(|op| StatsSet {
1236                        fastest: {
1237                            let fastest = sample_alloc_tally(sorted_samples.first().copied(), op);
1238
1239                            AllocTally {
1240                                count: fastest.count as f64 / sample_size,
1241                                size: fastest.size as f64 / sample_size,
1242                            }
1243                        },
1244                        slowest: {
1245                            let slowest = sample_alloc_tally(sorted_samples.last().copied(), op);
1246
1247                            AllocTally {
1248                                count: slowest.count as f64 / sample_size,
1249                                size: slowest.size as f64 / sample_size,
1250                            }
1251                        },
1252                        median: {
1253                            let tally_for_median = |index: usize| -> ThreadAllocTally {
1254                                sample_alloc_tally(median_samples.get(index).copied(), op)
1255                            };
1256
1257                            let a = tally_for_median(0);
1258                            let b = tally_for_median(1);
1259
1260                            let median_count = median_samples.len().max(1) as f64;
1261
1262                            let avg_count = (a.count as f64 + b.count as f64) / median_count;
1263                            let avg_size = (a.size as f64 + b.size as f64) / median_count;
1264
1265                            AllocTally {
1266                                count: avg_count / sample_size,
1267                                size: avg_size / sample_size,
1268                            }
1269                        },
1270                        mean: {
1271                            let tally = alloc_total_tallies.get(op);
1272                            AllocTally {
1273                                count: tally.count as f64 / total_count as f64,
1274                                size: tally.size as f64 / total_count as f64,
1275                            }
1276                        },
1277                    })
1278                    .map(StatsSet::transpose),
1279            },
1280            counts,
1281        }
1282    }
1283}
1284
1285impl<T> StatsSet<AllocTally<T>> {
1286    #[inline]
1287    pub fn transpose(self) -> AllocTally<StatsSet<T>> {
1288        AllocTally {
1289            count: StatsSet {
1290                fastest: self.fastest.count,
1291                slowest: self.slowest.count,
1292                median: self.median.count,
1293                mean: self.mean.count,
1294            },
1295            size: StatsSet {
1296                fastest: self.fastest.size,
1297                slowest: self.slowest.size,
1298                median: self.median.size,
1299                mean: self.mean.size,
1300            },
1301        }
1302    }
1303}