// codspeed_divan_compat_walltime/bench/mod.rs
1use std::{
2 cell::UnsafeCell,
3 fmt,
4 mem::{self, MaybeUninit},
5 num::NonZeroUsize,
6 sync::Barrier,
7};
8
9use crate::{
10 alloc::{
11 AllocOp, AllocOpMap, AllocTally, ThreadAllocInfo, ThreadAllocTally, TotalAllocTallyMap,
12 },
13 black_box, black_box_drop,
14 counter::{
15 AnyCounter, AsCountUInt, BytesCount, CharsCount, Counter, CounterCollection, CyclesCount,
16 IntoCounter, ItemsCount, KnownCounterKind, MaxCountUInt,
17 },
18 divan::SharedContext,
19 stats::{RawSample, SampleCollection, Stats, StatsSet, TimeSample},
20 thread_pool::BENCH_POOL,
21 time::{FineDuration, Timestamp, UntaggedTimestamp},
22 util::{self, sync::SyncWrap, Unit},
23};
24
25#[cfg(test)]
26mod tests;
27
28mod args;
29mod defer;
30mod options;
31
32use defer::{DeferSlot, DeferStore};
33
34pub use self::{
35 args::{BenchArgs, BenchArgsRunner},
36 options::BenchOptions,
37};
38
/// Default number of samples collected per benchmark when the user does not
/// set an explicit `sample_count` in `BenchOptions`.
pub(crate) const DEFAULT_SAMPLE_COUNT: u32 = 100;
40
/// Enables contextual benchmarking in [`#[divan::bench]`](attr.bench.html).
///
/// # Examples
///
/// ```
/// use divan::{Bencher, black_box};
///
/// #[divan::bench]
/// fn copy_from_slice(bencher: Bencher) {
///     // Input and output buffers get used in the closure.
///     let src = (0..100).collect::<Vec<i32>>();
///     let mut dst = vec![0; src.len()];
///
///     bencher.bench_local(|| {
///         black_box(&mut dst).copy_from_slice(black_box(&src));
///     });
/// }
/// ```
#[must_use = "a benchmark function must be registered"]
pub struct Bencher<'a, 'b, C = BencherConfig> {
    // Mutable handle to the runner's per-benchmark loop state; all `bench_*`
    // methods ultimately drive loops on this context.
    pub(crate) context: &'a mut BenchContext<'b>,

    // Statically-typed configuration (e.g. the input generator). Defaults to
    // `BencherConfig<Unit>`, i.e. "no inputs configured yet".
    pub(crate) config: C,
}
64
/// Public-in-private type for statically-typed `Bencher` configuration.
///
/// This enables configuring `Bencher` using the builder pattern with zero
/// runtime cost.
pub struct BencherConfig<GenI = Unit> {
    // Closure that produces one input per benchmarked iteration. The `Unit`
    // default marks the "no inputs" state before `with_inputs` is called.
    gen_input: GenI,
}
72
73impl<C> fmt::Debug for Bencher<'_, '_, C> {
74 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
75 f.debug_struct("Bencher").finish_non_exhaustive()
76 }
77}
78
impl<'a, 'b> Bencher<'a, 'b> {
    /// Creates a `Bencher` over `context` with the default (no-input)
    /// configuration; inputs are added later via `with_inputs`.
    #[inline]
    pub(crate) fn new(context: &'a mut BenchContext<'b>) -> Self {
        Self { context, config: BencherConfig { gen_input: Unit } }
    }
}
85
86impl<'a, 'b> Bencher<'a, 'b> {
87 /// Benchmarks a function.
88 ///
89 /// The function can be benchmarked in parallel using the [`threads`
90 /// option](macro@crate::bench#threads). If the function is strictly
91 /// single-threaded, use [`Bencher::bench_local`] instead.
92 ///
93 /// # Examples
94 ///
95 /// ```
96 /// #[divan::bench]
97 /// fn bench(bencher: divan::Bencher) {
98 /// bencher.bench(|| {
99 /// // Benchmarked code...
100 /// });
101 /// }
102 /// ```
103 pub fn bench<O, B>(self, benched: B)
104 where
105 B: Fn() -> O + Sync,
106 {
107 // Reusing `bench_values` for a zero-sized non-drop input type should
108 // have no overhead.
109 self.with_inputs(|| ()).bench_values(|_: ()| benched());
110 }
111
112 /// Benchmarks a function on the current thread.
113 ///
114 /// # Examples
115 ///
116 /// ```
117 /// #[divan::bench]
118 /// fn bench(bencher: divan::Bencher) {
119 /// bencher.bench_local(|| {
120 /// // Benchmarked code...
121 /// });
122 /// }
123 /// ```
124 pub fn bench_local<O, B>(self, mut benched: B)
125 where
126 B: FnMut() -> O,
127 {
128 // Reusing `bench_local_values` for a zero-sized non-drop input type
129 // should have no overhead.
130 self.with_inputs(|| ()).bench_local_values(|_: ()| benched());
131 }
132
133 /// Generate inputs for the [benchmarked function](#input-bench).
134 ///
135 /// Time spent generating inputs does not affect benchmark timing.
136 ///
137 /// When [benchmarking in parallel](macro@crate::bench#threads), the input
138 /// generator is called on the same thread as the sample loop that uses that
139 /// input.
140 ///
141 /// # Examples
142 ///
143 /// ```
144 /// #[divan::bench]
145 /// fn bench(bencher: divan::Bencher) {
146 /// bencher
147 /// .with_inputs(|| {
148 /// // Generate input:
149 /// String::from("...")
150 /// })
151 /// .bench_values(|s| {
152 /// // Use input by-value:
153 /// s + "123"
154 /// });
155 /// }
156 /// ```
157 pub fn with_inputs<G>(self, gen_input: G) -> Bencher<'a, 'b, BencherConfig<G>> {
158 Bencher { context: self.context, config: BencherConfig { gen_input } }
159 }
160}
161
162impl<'a, 'b, GenI> Bencher<'a, 'b, BencherConfig<GenI>> {
163 /// Assign a [`Counter`] for all iterations of the benchmarked function.
164 ///
165 /// This will either:
166 /// - Assign a new counter
167 /// - Override an existing counter of the same type
168 ///
169 /// If the counter depends on [generated inputs](Self::with_inputs), use
170 /// [`Bencher::input_counter`] instead.
171 ///
172 /// If context is not needed, the counter can instead be set via
173 /// [`#[divan::bench(counters = ...)]`](macro@crate::bench#counters).
174 ///
175 /// # Examples
176 ///
177 /// ```
178 /// use divan::{Bencher, counter::BytesCount};
179 ///
180 /// #[divan::bench]
181 /// fn char_count(bencher: Bencher) {
182 /// let s: String = // ...
183 /// # String::new();
184 ///
185 /// bencher
186 /// .counter(BytesCount::of_str(&s))
187 /// .bench(|| {
188 /// divan::black_box(&s).chars().count()
189 /// });
190 /// }
191 /// ```
192 #[doc(alias = "throughput")]
193 pub fn counter<C>(self, counter: C) -> Self
194 where
195 C: IntoCounter,
196 {
197 let counter = AnyCounter::new(counter);
198 self.context.counters.set_counter(counter);
199 self
200 }
201}
202
/// <span id="input-bench"></span> Benchmark over [generated inputs](Self::with_inputs).
impl<'a, 'b, I, GenI> Bencher<'a, 'b, BencherConfig<GenI>>
where
    GenI: FnMut() -> I,
{
    /// Calls a closure to create a [`Counter`] for each input of the
    /// benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter is constant, use [`Bencher::counter`] instead.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// counter is called on the same thread as the sample loop that generates
    /// and uses that input.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of bytes processed when
    /// benchmarking [`char`-counting](std::str::Chars::count). The byte count
    /// is gotten by calling [`BytesCount::of_str`] on each iteration's input
    /// [`String`].
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> String {
    ///             // ...
    ///             # String::new()
    ///         })
    ///         .input_counter(BytesCount::of_str)
    ///         .bench_refs(|s| {
    ///             s.chars().count()
    ///         });
    /// }
    /// ```
    pub fn input_counter<C, F>(self, make_counter: F) -> Self
    where
        F: Fn(&I) -> C + Sync + 'static,
        C: IntoCounter,
    {
        // The closure is stored type-erased on the context and invoked once
        // per generated input during the sample loop.
        self.context.counters.set_input_counter(make_counter);
        self
    }

    /// Creates a [`Counter`] from each input of the benchmarked function.
    ///
    /// This may be used if the input returns [`u8`]–[`u64`], [`usize`], or any
    /// nesting of references to those types.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of items processed when
    /// benchmarking [`FromIterator`] from
    /// <code>[Range](std::ops::Range)<[usize]></code> to [`Vec`].
    ///
    /// ```
    /// use divan::{Bencher, counter::ItemsCount};
    ///
    /// #[divan::bench]
    /// fn range_to_vec(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> usize {
    ///             // ...
    ///             # 0
    ///         })
    ///         .count_inputs_as::<ItemsCount>()
    ///         .bench_values(|n| -> Vec<usize> {
    ///             (0..n).collect()
    ///         });
    /// }
    /// ```
    #[inline]
    pub fn count_inputs_as<C>(self) -> Self
    where
        C: Counter,
        I: AsCountUInt,
    {
        // Dispatch on the statically-known counter kind; each arm registers a
        // per-input counter that converts the numeric input into `C`'s kind.
        match KnownCounterKind::of::<C>() {
            KnownCounterKind::Bytes => self.input_counter(|c| BytesCount::from(c)),
            KnownCounterKind::Chars => self.input_counter(|c| CharsCount::from(c)),
            KnownCounterKind::Cycles => self.input_counter(|c| CyclesCount::from(c)),
            KnownCounterKind::Items => self.input_counter(|c| ItemsCount::from(c)),
        }
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local_values`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn bench_values<O, B>(self, benched: B)
    where
        B: Fn(I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`, so the deferred
            // drop hook is a no-op here.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     let mut values = Vec::new();
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_values(|s| {
    ///             // Use input by-value:
    ///             values.push(s);
    ///         });
    /// }
    /// ```
    pub fn bench_local_values<O, B>(self, mut benched: B)
    where
        B: FnMut(I) -> O,
    {
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`, so the deferred
            // drop hook is a no-op here.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_refs<O, B>(self, benched: B)
    where
        B: Fn(&mut I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`, so the input
            // must be dropped here after the sample loop.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_local_refs<O, B>(self, mut benched: B)
    where
        B: FnMut(&mut I) -> O,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`, so the input
            // must be dropped here after the sample loop.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }
}
471
/// State machine for how the benchmark is being run.
#[derive(Clone, Copy)]
pub(crate) enum BenchMode {
    /// The benchmark is being run as `--test`.
    ///
    /// Don't collect samples and run exactly once.
    Test,

    /// Scale `sample_size` to determine the right size for collecting.
    ///
    /// Sample size doubles until a sample takes long enough relative to timer
    /// precision, at which point the mode transitions to `Collect`.
    Tune { sample_size: u32 },

    /// Simply collect samples at a fixed, already-chosen `sample_size`.
    Collect { sample_size: u32 },
}
486
487impl BenchMode {
488 #[inline]
489 pub fn is_test(self) -> bool {
490 matches!(self, Self::Test)
491 }
492
493 #[inline]
494 pub fn is_tune(self) -> bool {
495 matches!(self, Self::Tune { .. })
496 }
497
498 #[inline]
499 pub fn is_collect(self) -> bool {
500 matches!(self, Self::Collect { .. })
501 }
502
503 #[inline]
504 pub fn sample_size(self) -> u32 {
505 match self {
506 Self::Test => 1,
507 Self::Tune { sample_size, .. } | Self::Collect { sample_size, .. } => sample_size,
508 }
509 }
510}
511
/// `#[divan::bench]` loop context.
///
/// Functions called within the benchmark loop should be `#[inline(always)]` to
/// ensure instruction cache locality.
pub(crate) struct BenchContext<'a> {
    // Runner-wide state (timer, CLI action) shared across all benchmarks.
    shared_context: &'a SharedContext,

    /// User-configured options.
    pub options: &'a BenchOptions<'a>,

    /// Whether the benchmark loop was started.
    pub did_run: bool,

    /// The number of threads to run the benchmark. The default is 1.
    ///
    /// When set to 1, the benchmark loop is guaranteed to stay on the current
    /// thread and not spawn any threads.
    pub thread_count: NonZeroUsize,

    /// Recorded samples.
    pub samples: SampleCollection,

    /// Per-iteration counters grouped by sample.
    counters: CounterCollection,
}
537
538impl<'a> BenchContext<'a> {
539 /// Creates a new benchmarking context.
540 pub fn new(
541 shared_context: &'a SharedContext,
542 options: &'a BenchOptions,
543 thread_count: NonZeroUsize,
544 ) -> Self {
545 Self {
546 shared_context,
547 options,
548 thread_count,
549 did_run: false,
550 samples: SampleCollection::default(),
551 counters: options.counters.to_collection(),
552 }
553 }
554
    /// Runs the single-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// See `bench_loop_threaded`.
    pub fn bench_loop_local<I, O>(
        &mut self,
        gen_input: impl FnMut() -> I,
        benched: impl FnMut(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) {
        // SAFETY: Closures are guaranteed to run on the current thread, so they
        // can safely be mutable and non-`Sync`.
        unsafe {
            // `SyncWrap` + `UnsafeCell` paper over the `Sync`/`Fn` bounds of
            // `bench_loop_threaded`; soundness relies on the single-thread
            // guarantee enforced just below.
            let gen_input = SyncWrap::new(UnsafeCell::new(gen_input));
            let benched = SyncWrap::new(UnsafeCell::new(benched));
            let drop_input = SyncWrap::new(drop_input);

            // Force the threaded loop onto exactly one thread (1), so the
            // wrapped closures never cross a thread boundary.
            self.thread_count = NonZeroUsize::MIN;
            self.bench_loop_threaded::<I, O>(
                || (*gen_input.get())(),
                |input| (*benched.get())(input),
                |input| drop_input(input),
            )
        }
    }
581
    /// Runs the multi-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// If `self.threads` is 1, the incoming closures will not escape the
    /// current thread. This guarantee ensures `bench_loop_local` can soundly
    /// reuse this method with mutable non-`Sync` closures.
    ///
    /// When `benched` is called:
    /// - `I` is guaranteed to be initialized.
    /// - No external `&I` or `&mut I` exists.
    ///
    /// When `drop_input` is called:
    /// - All instances of `O` returned from `benched` have been dropped.
    /// - The same guarantees for `I` apply as in `benched`, unless `benched`
    ///   escaped references to `I`.
    fn bench_loop_threaded<I, O>(
        &mut self,
        gen_input: impl Fn() -> I + Sync,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O + Sync,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>) + Sync,
    ) {
        self.did_run = true;

        let mut current_mode = self.initial_mode();
        let is_test = current_mode.is_test();

        let record_sample = self.sample_recorder(gen_input, benched, drop_input);

        let thread_count = self.thread_count.get();
        let aux_thread_count = thread_count - 1;

        let is_single_thread = aux_thread_count == 0;

        // Per-thread sample info returned by `record_sample`. These are
        // processed locally to emit user-facing sample info. As a result, this
        // only contains `thread_count` many elements at a time.
        let mut raw_samples = Vec::<Option<RawSample>>::new();

        // The time spent benchmarking, in picoseconds.
        //
        // Unless `skip_ext_time` is set, this includes time external to
        // `benched`, such as time spent generating inputs and running drop.
        let mut elapsed_picos: u128 = 0;

        // The minimum time for benchmarking, in picoseconds.
        let min_picos = self.options.min_time().picos;

        // The remaining time left for benchmarking, in picoseconds.
        let max_picos = self.options.max_time().picos;

        // Don't bother running if user specifies 0 max time or 0 samples.
        if max_picos == 0 || !self.options.has_samples() {
            return;
        }

        let timer = self.shared_context.timer;
        let timer_kind = timer.kind();

        // `None` while tuning; set once we enter `Collect` mode.
        let mut rem_samples = if current_mode.is_collect() {
            Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT))
        } else {
            None
        };

        // Only measure precision if we need to tune sample size.
        let timer_precision =
            if current_mode.is_tune() { timer.precision() } else { FineDuration::default() };

        if !is_test {
            self.samples.time_samples.reserve(self.options.sample_count.unwrap_or(1) as usize);
        }

        let skip_ext_time = self.options.skip_ext_time.unwrap_or_default();
        let initial_start = if skip_ext_time { None } else { Some(Timestamp::start(timer_kind)) };

        let bench_overheads = timer.bench_overheads();

        // Signals the external CodSpeed runner that a benchmark is executing
        // for the duration of the sampling loop (codspeed compat addition).
        #[cfg(unix)]
        let _guard = codspeed::fifo::BenchGuard::new_with_runner_fifo();
        // `while { cond-block } { body }`: the block result decides whether to
        // run another sampling round.
        while {
            // Conditions for when sampling is over:
            if elapsed_picos >= max_picos {
                // Depleted the benchmarking time budget. This is a strict
                // condition regardless of sample count and minimum time.
                false
            } else if rem_samples.unwrap_or(1) > 0 {
                // More samples expected.
                true
            } else {
                // Continue if we haven't reached the time floor.
                elapsed_picos < min_picos
            }
        } {
            let sample_size = current_mode.sample_size();
            self.samples.sample_size = sample_size;

            let barrier = if is_single_thread { None } else { Some(Barrier::new(thread_count)) };

            // Sample loop helper:
            let record_sample = || -> RawSample {
                let mut counter_totals: [u128; KnownCounterKind::COUNT] =
                    [0; KnownCounterKind::COUNT];

                // Updates per-input counter info for this sample.
                let mut count_input = |input: &I| {
                    for counter_kind in KnownCounterKind::ALL {
                        // SAFETY: The `I` type cannot change since `with_inputs`
                        // cannot be called more than once on the same `Bencher`.
                        if let Some(count) =
                            unsafe { self.counters.get_input_count(counter_kind, input) }
                        {
                            let total = &mut counter_totals[counter_kind as usize];
                            *total = (*total).saturating_add(count as u128);
                        }
                    }
                };

                // Sample loop:
                let ([start, end], alloc_info) =
                    record_sample(sample_size as usize, barrier.as_ref(), &mut count_input);

                RawSample { start, end, timer, alloc_info, counter_totals }
            };

            // Sample loop: one sample on this thread plus `aux_thread_count`
            // samples on pool threads, all synchronized via `barrier`.
            raw_samples.clear();
            BENCH_POOL.par_extend(&mut raw_samples, aux_thread_count, |_| record_sample());

            // Convert `&[Option<RawSample>]` to `&[Sample]`.
            let raw_samples: &[RawSample] = {
                if let Some(thread) = raw_samples
                    .iter()
                    .enumerate()
                    .find_map(|(thread, sample)| sample.is_none().then_some(thread))
                {
                    panic!("Divan benchmarking thread {thread} panicked");
                }

                // NOTE(review): equal sizes alone do not guarantee identical
                // layout of `T` and `Option<T>`; this relies on the niche
                // optimization applying to `RawSample` — TODO confirm.
                unsafe {
                    assert_eq!(mem::size_of::<RawSample>(), mem::size_of::<Option<RawSample>>());
                    std::slice::from_raw_parts(raw_samples.as_ptr().cast(), raw_samples.len())
                }
            };

            // If testing, exit the benchmarking loop immediately after timing a
            // single run.
            if is_test {
                break;
            }

            let slowest_sample = raw_samples.iter().max_by_key(|s| s.duration()).unwrap();
            let slowest_time = slowest_sample.duration();

            // TODO: Make tuning be less influenced by early runs. Currently if
            // early runs are very quick but later runs are slow, benchmarking
            // will take a very long time.
            //
            // TODO: Make `sample_size` consider time generating inputs and
            // dropping inputs/outputs. Currently benchmarks like
            // `Bencher::bench_refs(String::clear)` take a very long time.
            if current_mode.is_tune() {
                // Clear previous smaller samples.
                self.samples.clear();
                self.counters.clear_input_counts();

                // If within 100x timer precision, continue tuning.
                let precision_multiple = slowest_time.picos / timer_precision.picos;
                if precision_multiple <= 100 {
                    current_mode = BenchMode::Tune { sample_size: sample_size * 2 };
                } else {
                    current_mode = BenchMode::Collect { sample_size };
                    rem_samples = Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT));
                }
            }

            // Returns the sample's duration adjusted for overhead.
            let sample_duration_sub_overhead = |raw_sample: &RawSample| {
                let overhead = bench_overheads.total_overhead(sample_size, &raw_sample.alloc_info);

                FineDuration {
                    picos: raw_sample
                        .duration()
                        .clamp_to(timer_precision)
                        .picos
                        .saturating_sub(overhead.picos),
                }
                // Clamp again so overhead subtraction cannot push the result
                // below what the timer can resolve.
                .clamp_to(timer_precision)
            };

            for raw_sample in raw_samples {
                let sample_index = self.samples.time_samples.len();

                self.samples
                    .time_samples
                    .push(TimeSample { duration: sample_duration_sub_overhead(raw_sample) });

                if !raw_sample.alloc_info.tallies.is_empty() {
                    self.samples
                        .alloc_info_by_sample
                        .insert(sample_index as u32, raw_sample.alloc_info.clone());
                }

                // Insert per-input counter information.
                for counter_kind in KnownCounterKind::ALL {
                    if !self.counters.uses_input_counts(counter_kind) {
                        continue;
                    }

                    let total_count = raw_sample.counter_totals[counter_kind as usize];

                    // Cannot overflow `MaxCountUInt` because `total_count`
                    // cannot exceed `MaxCountUInt::MAX * sample_size`.
                    let per_iter_count = (total_count / sample_size as u128) as MaxCountUInt;

                    self.counters.push_counter(AnyCounter::known(counter_kind, per_iter_count));
                }

                if let Some(rem_samples) = &mut rem_samples {
                    *rem_samples = rem_samples.saturating_sub(1);
                }
            }

            if let Some(initial_start) = initial_start {
                let last_end = raw_samples.iter().map(|s| s.end).max().unwrap();
                elapsed_picos = last_end.duration_since(initial_start, timer).picos;
            } else {
                // Progress by at least 1ns to prevent extremely fast
                // functions from taking forever when `min_time` is set.
                let progress_picos = slowest_time.picos.max(1_000);
                elapsed_picos = elapsed_picos.saturating_add(progress_picos);
            }
        }
        // End of sampling: release the runner FIFO guard before resetting
        // allocator bookkeeping.
        #[cfg(unix)]
        core::mem::drop(_guard);

        // Reset flag for ignoring allocations.
        crate::alloc::IGNORE_ALLOC.set(false);
    }
821
    /// Returns a closure that takes the sample size and input counter, and then
    /// returns a newly recorded sample.
    ///
    /// The returned closure yields the timed interval `[start, end]` and the
    /// thread's allocation info captured for that sample.
    fn sample_recorder<I, O>(
        &self,
        gen_input: impl Fn() -> I,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) -> impl Fn(usize, Option<&Barrier>, &mut dyn FnMut(&I)) -> ([Timestamp; 2], ThreadAllocInfo)
    {
        // We defer:
        // - Usage of `gen_input` values.
        // - Drop destructor for `O`, preventing it from affecting sample
        //   measurements. Outputs are stored into a pre-allocated buffer during
        //   the sample loop. The allocation is reused between samples to reduce
        //   time spent between samples.

        let timer_kind = self.shared_context.timer.kind();

        move |sample_size: usize, barrier: Option<&Barrier>, count_input: &mut dyn FnMut(&I)| {
            let mut defer_store = DeferStore::<I, O>::default();

            // Snapshot of this thread's allocation tallies, taken right after
            // the timed section ends.
            let mut saved_alloc_info = ThreadAllocInfo::new();
            let mut save_alloc_info = || {
                if crate::alloc::IGNORE_ALLOC.get() {
                    return;
                }

                if let Some(alloc_info) = ThreadAllocInfo::try_current() {
                    // SAFETY: We have exclusive access.
                    saved_alloc_info = unsafe { alloc_info.as_ptr().read() };
                }
            };

            // Synchronize all threads to start timed section simultaneously and
            // clear every thread's memory profiling info.
            //
            // This ensures work external to the timed section does not affect
            // the timing of other threads.
            let sync_threads = |is_start: bool| {
                sync_impl(barrier, is_start);

                // Monomorphize implementation to reduce code size.
                #[inline(never)]
                fn sync_impl(barrier: Option<&Barrier>, is_start: bool) {
                    // Ensure benchmarked section has a `ThreadAllocInfo`
                    // allocated for the current thread and clear previous info.
                    let alloc_info = if is_start { ThreadAllocInfo::current() } else { None };

                    // Synchronize all threads.
                    //
                    // This is the final synchronization point for the end.
                    if let Some(barrier) = barrier {
                        barrier.wait();
                    }

                    if let Some(mut alloc_info) = alloc_info {
                        // SAFETY: We have exclusive access.
                        let alloc_info = unsafe { alloc_info.as_mut() };

                        alloc_info.clear();

                        // Synchronize all threads.
                        if let Some(barrier) = barrier {
                            barrier.wait();
                        }
                    }
                }
            };

            // The following logic chooses how to efficiently sample the
            // benchmark function once and assigns `sample_start`/`sample_end`
            // before/after the sample loop.
            //
            // NOTE: Testing and benchmarking should behave exactly the same
            // when getting the sample time span. We don't want to introduce
            // extra work that may worsen measurement quality for real
            // benchmarking.
            let sample_start: UntaggedTimestamp;
            let sample_end: UntaggedTimestamp;

            if size_of::<I>() == 0 && (size_of::<O>() == 0 || !mem::needs_drop::<O>()) {
                // Use a range instead of `defer_store` to make the benchmarking
                // loop cheaper.

                // Run `gen_input` the expected number of times in case it
                // updates external state used by `benched`.
                for _ in 0..sample_size {
                    let input = gen_input();
                    count_input(&input);

                    // Inputs are consumed/dropped later.
                    mem::forget(input);
                }

                sync_threads(true);
                sample_start = UntaggedTimestamp::start(timer_kind);

                // Sample loop:
                for _ in 0..sample_size {
                    // SAFETY: Input is a ZST, so we can construct one out of
                    // thin air.
                    let input = unsafe { UnsafeCell::new(MaybeUninit::<I>::zeroed()) };

                    mem::forget(black_box(benched(&input)));
                }

                sample_end = UntaggedTimestamp::end(timer_kind);
                sync_threads(false);
                save_alloc_info();

                // Drop outputs and inputs.
                for _ in 0..sample_size {
                    // Output only needs drop if ZST.
                    if size_of::<O>() == 0 {
                        // SAFETY: Output is a ZST, so we can construct one out
                        // of thin air.
                        unsafe { _ = mem::zeroed::<O>() }
                    }

                    if mem::needs_drop::<I>() {
                        // SAFETY: Input is a ZST, so we can construct one out
                        // of thin air and not worry about aliasing.
                        unsafe { drop_input(&UnsafeCell::new(MaybeUninit::<I>::zeroed())) }
                    }
                }
            } else {
                defer_store.prepare(sample_size);

                match defer_store.slots() {
                    // Output needs to be dropped. We defer drop in the sample
                    // loop by inserting it into `defer_store`.
                    Ok(defer_slots_slice) => {
                        // Initialize and store inputs.
                        for DeferSlot { input, .. } in defer_slots_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_slots_iter = defer_slots_slice.iter();

                        sync_threads(true);
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for defer_slot in defer_slots_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized and we have exclusive access to the
                            // output slot.
                            unsafe {
                                let output = benched(&defer_slot.input);
                                *defer_slot.output.get() = MaybeUninit::new(output);
                            }
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // and outputs in the sample loop.
                        black_box(defer_slots_slice);

                        // Drop outputs and inputs.
                        for DeferSlot { input, output } in defer_slots_slice {
                            // SAFETY: All outputs were initialized in the
                            // sample loop and we have exclusive access.
                            unsafe { (*output.get()).assume_init_drop() }

                            if mem::needs_drop::<I>() {
                                // SAFETY: The output was dropped and thus we
                                // have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }

                    // Output does not need to be dropped.
                    Err(defer_inputs_slice) => {
                        // Initialize and store inputs.
                        for input in defer_inputs_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_inputs_iter = defer_inputs_slice.iter();

                        sync_threads(true);
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for input in defer_inputs_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized.
                            black_box_drop(unsafe { benched(input) });
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // in the sample loop.
                        black_box(defer_inputs_slice);

                        // Drop inputs.
                        if mem::needs_drop::<I>() {
                            for input in defer_inputs_slice {
                                // SAFETY: We have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }
                }
            }

            // SAFETY: These values are guaranteed to be the correct variant
            // because they were created from the same `timer_kind`.
            let interval = unsafe {
                [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
            };

            (interval, saved_alloc_info)
        }
    }
1060
1061 #[inline]
1062 fn initial_mode(&self) -> BenchMode {
1063 if self.shared_context.action.is_test() {
1064 BenchMode::Test
1065 } else if let Some(sample_size) = self.options.sample_size {
1066 BenchMode::Collect { sample_size }
1067 } else {
1068 BenchMode::Tune { sample_size: 1 }
1069 }
1070 }
1071
1072 pub fn compute_stats(&self) -> Stats {
1073 let time_samples = &self.samples.time_samples;
1074 let alloc_info_by_sample = &self.samples.alloc_info_by_sample;
1075
1076 let sample_count = time_samples.len();
1077 let sample_size = self.samples.sample_size;
1078
1079 let total_count = self.samples.iter_count();
1080
1081 let total_duration = self.samples.total_duration();
1082 let mean_duration = FineDuration {
1083 picos: total_duration.picos.checked_div(total_count as u128).unwrap_or_default(),
1084 };
1085
1086 // Samples sorted by duration.
1087 let sorted_samples = self.samples.sorted_samples();
1088 let median_samples = util::slice_middle(&sorted_samples);
1089
1090 let index_of_sample = |sample: &TimeSample| -> usize {
1091 util::slice_ptr_index(&self.samples.time_samples, sample)
1092 };
1093
1094 let counter_count_for_sample =
1095 |sample: &TimeSample, counter_kind: KnownCounterKind| -> Option<MaxCountUInt> {
1096 let counts = self.counters.counts(counter_kind);
1097
1098 let index = if self.counters.uses_input_counts(counter_kind) {
1099 index_of_sample(sample)
1100 } else {
1101 0
1102 };
1103
1104 counts.get(index).copied()
1105 };
1106
1107 let min_duration =
1108 sorted_samples.first().map(|s| s.duration / sample_size).unwrap_or_default();
1109 let max_duration =
1110 sorted_samples.last().map(|s| s.duration / sample_size).unwrap_or_default();
1111
1112 let median_duration = if median_samples.is_empty() {
1113 FineDuration::default()
1114 } else {
1115 let sum: u128 = median_samples.iter().map(|s| s.duration.picos).sum();
1116 FineDuration { picos: sum / median_samples.len() as u128 } / sample_size
1117 };
1118
1119 let counts = KnownCounterKind::ALL.map(|counter_kind| {
1120 let median: MaxCountUInt = {
1121 let mut sum: u128 = 0;
1122
1123 for sample in median_samples {
1124 let sample_count = counter_count_for_sample(sample, counter_kind)? as u128;
1125
1126 // Saturating add in case `MaxUIntCount > u64`.
1127 sum = sum.saturating_add(sample_count);
1128 }
1129
1130 (sum / median_samples.len() as u128) as MaxCountUInt
1131 };
1132
1133 Some(StatsSet {
1134 fastest: sorted_samples
1135 .first()
1136 .and_then(|s| counter_count_for_sample(s, counter_kind))?,
1137 slowest: sorted_samples
1138 .last()
1139 .and_then(|s| counter_count_for_sample(s, counter_kind))?,
1140 median,
1141 mean: self.counters.mean_count(counter_kind),
1142 })
1143 });
1144
1145 let sample_alloc_info = |sample: Option<&TimeSample>| -> Option<&ThreadAllocInfo> {
1146 sample
1147 .and_then(|sample| u32::try_from(index_of_sample(sample)).ok())
1148 .and_then(|index| self.samples.alloc_info_by_sample.get(&index))
1149 };
1150
1151 let sample_alloc_tally = |sample: Option<&TimeSample>, op: AllocOp| -> ThreadAllocTally {
1152 sample_alloc_info(sample)
1153 .map(|alloc_info| alloc_info.tallies.get(op))
1154 .copied()
1155 .unwrap_or_default()
1156 };
1157
1158 let mut alloc_total_max_count = 0u128;
1159 let mut alloc_total_max_size = 0u128;
1160 let mut alloc_total_tallies = TotalAllocTallyMap::default();
1161
1162 for alloc_info in alloc_info_by_sample.values() {
1163 alloc_total_max_count += alloc_info.max_count as u128;
1164 alloc_total_max_size += alloc_info.max_size as u128;
1165 alloc_info.tallies.add_to_total(&mut alloc_total_tallies);
1166 }
1167
1168 let sample_size = f64::from(sample_size);
1169 Stats {
1170 sample_count: sample_count as u32,
1171 iter_count: total_count,
1172 time: StatsSet {
1173 fastest: min_duration,
1174 slowest: max_duration,
1175 median: median_duration,
1176 mean: mean_duration,
1177 },
1178 max_alloc: StatsSet {
1179 fastest: {
1180 let alloc_info = sample_alloc_info(sorted_samples.first().copied());
1181
1182 AllocTally {
1183 count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
1184 / sample_size,
1185 size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
1186 / sample_size,
1187 }
1188 },
1189 slowest: {
1190 let alloc_info = sample_alloc_info(sorted_samples.last().copied());
1191
1192 AllocTally {
1193 count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
1194 / sample_size,
1195 size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
1196 / sample_size,
1197 }
1198 },
1199 // TODO: Switch to median of alloc info itself, rather than
1200 // basing off of median times.
1201 median: {
1202 let alloc_info_for_median =
1203 |index| sample_alloc_info(median_samples.get(index).copied());
1204
1205 let max_count_for_median = |index: usize| -> f64 {
1206 alloc_info_for_median(index)
1207 .map(|info| info.max_count as f64)
1208 .unwrap_or_default()
1209 };
1210
1211 let max_size_for_median = |index: usize| -> f64 {
1212 alloc_info_for_median(index)
1213 .map(|info| info.max_size as f64)
1214 .unwrap_or_default()
1215 };
1216
1217 let median_count = median_samples.len().max(1) as f64;
1218
1219 let median_max_count = max_count_for_median(0) + max_count_for_median(1);
1220 let median_max_size = max_size_for_median(0) + max_size_for_median(1);
1221
1222 AllocTally {
1223 count: median_max_count / median_count / sample_size,
1224 size: median_max_size / median_count / sample_size,
1225 }
1226 },
1227 mean: AllocTally {
1228 count: alloc_total_max_count as f64 / total_count as f64,
1229 size: alloc_total_max_size as f64 / total_count as f64,
1230 },
1231 }
1232 .transpose(),
1233 alloc_tallies: AllocOpMap {
1234 values: AllocOp::ALL
1235 .map(|op| StatsSet {
1236 fastest: {
1237 let fastest = sample_alloc_tally(sorted_samples.first().copied(), op);
1238
1239 AllocTally {
1240 count: fastest.count as f64 / sample_size,
1241 size: fastest.size as f64 / sample_size,
1242 }
1243 },
1244 slowest: {
1245 let slowest = sample_alloc_tally(sorted_samples.last().copied(), op);
1246
1247 AllocTally {
1248 count: slowest.count as f64 / sample_size,
1249 size: slowest.size as f64 / sample_size,
1250 }
1251 },
1252 median: {
1253 let tally_for_median = |index: usize| -> ThreadAllocTally {
1254 sample_alloc_tally(median_samples.get(index).copied(), op)
1255 };
1256
1257 let a = tally_for_median(0);
1258 let b = tally_for_median(1);
1259
1260 let median_count = median_samples.len().max(1) as f64;
1261
1262 let avg_count = (a.count as f64 + b.count as f64) / median_count;
1263 let avg_size = (a.size as f64 + b.size as f64) / median_count;
1264
1265 AllocTally {
1266 count: avg_count / sample_size,
1267 size: avg_size / sample_size,
1268 }
1269 },
1270 mean: {
1271 let tally = alloc_total_tallies.get(op);
1272 AllocTally {
1273 count: tally.count as f64 / total_count as f64,
1274 size: tally.size as f64 / total_count as f64,
1275 }
1276 },
1277 })
1278 .map(StatsSet::transpose),
1279 },
1280 counts,
1281 }
1282 }
1283}
1284
1285impl<T> StatsSet<AllocTally<T>> {
1286 #[inline]
1287 pub fn transpose(self) -> AllocTally<StatsSet<T>> {
1288 AllocTally {
1289 count: StatsSet {
1290 fastest: self.fastest.count,
1291 slowest: self.slowest.count,
1292 median: self.median.count,
1293 mean: self.mean.count,
1294 },
1295 size: StatsSet {
1296 fastest: self.fastest.size,
1297 slowest: self.slowest.size,
1298 median: self.median.size,
1299 mean: self.mean.size,
1300 },
1301 }
1302 }
1303}