codspeed_divan_compat_walltime/bench/mod.rs
1use std::{
2 cell::UnsafeCell,
3 fmt,
4 mem::{self, MaybeUninit},
5 num::NonZeroUsize,
6 sync::Barrier,
7};
8
9use crate::{
10 alloc::{
11 AllocOp, AllocOpMap, AllocTally, ThreadAllocInfo, ThreadAllocTally, TotalAllocTallyMap,
12 },
13 black_box, black_box_drop,
14 counter::{
15 AnyCounter, AsCountUInt, BytesCount, CharsCount, Counter, CounterCollection, CyclesCount,
16 IntoCounter, ItemsCount, KnownCounterKind, MaxCountUInt,
17 },
18 divan::SharedContext,
19 stats::{RawSample, SampleCollection, Stats, StatsSet, TimeSample},
20 thread_pool::BENCH_POOL,
21 time::{FineDuration, Timestamp, UntaggedTimestamp},
22 util::{self, sync::SyncWrap, Unit},
23};
24
25#[cfg(test)]
26mod tests;
27
28mod args;
29mod defer;
30mod options;
31
32use ::codspeed::instrument_hooks::InstrumentHooks;
33use defer::{DeferSlot, DeferStore};
34
35pub use self::{
36 args::{BenchArgs, BenchArgsRunner},
37 options::BenchOptions,
38};
39
40pub(crate) const DEFAULT_SAMPLE_COUNT: u32 = 100;
41
/// Enables contextual benchmarking in [`#[divan::bench]`](attr.bench.html).
///
/// # Examples
///
/// ```
/// use divan::{Bencher, black_box};
///
/// #[divan::bench]
/// fn copy_from_slice(bencher: Bencher) {
///     // Input and output buffers get used in the closure.
///     let src = (0..100).collect::<Vec<i32>>();
///     let mut dst = vec![0; src.len()];
///
///     bencher.bench_local(|| {
///         black_box(&mut dst).copy_from_slice(black_box(&src));
///     });
/// }
/// ```
#[must_use = "a benchmark function must be registered"]
pub struct Bencher<'a, 'b, C = BencherConfig> {
    // Mutable loop state for the benchmark currently being registered.
    pub(crate) context: &'a mut BenchContext<'b>,
    // Statically-typed configuration (e.g. the input generator set by
    // `with_inputs`); defaults to the unit `BencherConfig`.
    pub(crate) config: C,
}
65
/// Public-in-private type for statically-typed `Bencher` configuration.
///
/// This enables configuring `Bencher` using the builder pattern with zero
/// runtime cost.
pub struct BencherConfig<GenI = Unit> {
    // Closure producing one benchmark input per iteration. `Bencher::new`
    // initializes this to the zero-sized `Unit` placeholder until
    // `with_inputs` replaces it with a real generator.
    gen_input: GenI,
}
73
74impl<C> fmt::Debug for Bencher<'_, '_, C> {
75 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
76 f.debug_struct("Bencher").finish_non_exhaustive()
77 }
78}
79
80impl<'a, 'b> Bencher<'a, 'b> {
81 #[inline]
82 pub(crate) fn new(context: &'a mut BenchContext<'b>) -> Self {
83 Self { context, config: BencherConfig { gen_input: Unit } }
84 }
85}
86
impl<'a, 'b> Bencher<'a, 'b> {
    /// Benchmarks a function.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher.bench(|| {
    ///         // Benchmarked code...
    ///     });
    /// }
    /// ```
    pub fn bench<O, B>(self, benched: B)
    where
        B: Fn() -> O + Sync,
    {
        // Reusing `bench_values` for a zero-sized non-drop input type should
        // have no overhead.
        self.with_inputs(|| ()).bench_values(|_: ()| benched());
    }

    /// Benchmarks a function on the current thread.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher.bench_local(|| {
    ///         // Benchmarked code...
    ///     });
    /// }
    /// ```
    pub fn bench_local<O, B>(self, mut benched: B)
    where
        B: FnMut() -> O,
    {
        // Reusing `bench_local_values` for a zero-sized non-drop input type
        // should have no overhead.
        self.with_inputs(|| ()).bench_local_values(|_: ()| benched());
    }

    /// Generate inputs for the [benchmarked function](#input-bench).
    ///
    /// Time spent generating inputs does not affect benchmark timing.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// generator is called on the same thread as the sample loop that uses that
    /// input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn with_inputs<G>(self, gen_input: G) -> Bencher<'a, 'b, BencherConfig<G>> {
        // Rewrap the same context with a statically-typed input generator.
        Bencher { context: self.context, config: BencherConfig { gen_input } }
    }
}
162
163impl<'a, 'b, GenI> Bencher<'a, 'b, BencherConfig<GenI>> {
164 /// Assign a [`Counter`] for all iterations of the benchmarked function.
165 ///
166 /// This will either:
167 /// - Assign a new counter
168 /// - Override an existing counter of the same type
169 ///
170 /// If the counter depends on [generated inputs](Self::with_inputs), use
171 /// [`Bencher::input_counter`] instead.
172 ///
173 /// If context is not needed, the counter can instead be set via
174 /// [`#[divan::bench(counters = ...)]`](macro@crate::bench#counters).
175 ///
176 /// # Examples
177 ///
178 /// ```
179 /// use divan::{Bencher, counter::BytesCount};
180 ///
181 /// #[divan::bench]
182 /// fn char_count(bencher: Bencher) {
183 /// let s: String = // ...
184 /// # String::new();
185 ///
186 /// bencher
187 /// .counter(BytesCount::of_str(&s))
188 /// .bench(|| {
189 /// divan::black_box(&s).chars().count()
190 /// });
191 /// }
192 /// ```
193 #[doc(alias = "throughput")]
194 pub fn counter<C>(self, counter: C) -> Self
195 where
196 C: IntoCounter,
197 {
198 let counter = AnyCounter::new(counter);
199 self.context.counters.set_counter(counter);
200 self
201 }
202}
203
/// <span id="input-bench"></span> Benchmark over [generated inputs](Self::with_inputs).
impl<'a, 'b, I, GenI> Bencher<'a, 'b, BencherConfig<GenI>>
where
    GenI: FnMut() -> I,
{
    /// Calls a closure to create a [`Counter`] for each input of the
    /// benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter is constant, use [`Bencher::counter`] instead.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// counter is called on the same thread as the sample loop that generates
    /// and uses that input.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of bytes processed when
    /// benchmarking [`char`-counting](std::str::Chars::count). The byte count
    /// is gotten by calling [`BytesCount::of_str`] on each iteration's input
    /// [`String`].
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> String {
    ///             // ...
    ///             # String::new()
    ///         })
    ///         .input_counter(BytesCount::of_str)
    ///         .bench_refs(|s| {
    ///             s.chars().count()
    ///         });
    /// }
    /// ```
    pub fn input_counter<C, F>(self, make_counter: F) -> Self
    where
        F: Fn(&I) -> C + Sync + 'static,
        C: IntoCounter,
    {
        self.context.counters.set_input_counter(make_counter);
        self
    }

    /// Creates a [`Counter`] from each input of the benchmarked function.
    ///
    /// This may be used if the input returns [`u8`]–[`u64`], [`usize`], or any
    /// nesting of references to those types.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of items processed when
    /// benchmarking [`FromIterator`] from
    /// <code>[Range](std::ops::Range)<[usize]></code> to [`Vec`].
    ///
    /// ```
    /// use divan::{Bencher, counter::ItemsCount};
    ///
    /// #[divan::bench]
    /// fn range_to_vec(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> usize {
    ///             // ...
    ///             # 0
    ///         })
    ///         .count_inputs_as::<ItemsCount>()
    ///         .bench_values(|n| -> Vec<usize> {
    ///             (0..n).collect()
    ///         });
    /// }
    /// ```
    #[inline]
    pub fn count_inputs_as<C>(self) -> Self
    where
        C: Counter,
        I: AsCountUInt,
    {
        // Dispatch on the statically-known counter kind; each arm installs an
        // input counter that converts the input's count into that kind.
        match KnownCounterKind::of::<C>() {
            KnownCounterKind::Bytes => self.input_counter(|c| BytesCount::from(c)),
            KnownCounterKind::Chars => self.input_counter(|c| CharsCount::from(c)),
            KnownCounterKind::Cycles => self.input_counter(|c| CyclesCount::from(c)),
            KnownCounterKind::Items => self.input_counter(|c| ItemsCount::from(c)),
        }
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local_values`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn bench_values<O, B>(self, benched: B)
    where
        B: Fn(I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     let mut values = Vec::new();
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_values(|s| {
    ///             // Use input by-value:
    ///             values.push(s);
    ///         });
    /// }
    /// ```
    pub fn bench_local_values<O, B>(self, mut benched: B)
    where
        B: FnMut(I) -> O,
    {
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_refs<O, B>(self, benched: B)
    where
        B: Fn(&mut I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`, so the input
            // must be dropped here.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_local_refs<O, B>(self, mut benched: B)
    where
        B: FnMut(&mut I) -> O,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`, so the input
            // must be dropped here.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }
}
472
/// State machine for how the benchmark is being run.
#[derive(Clone, Copy)]
pub(crate) enum BenchMode {
    /// The benchmark is being run as `--test`.
    ///
    /// Don't collect samples and run exactly once.
    Test,

    /// Scale `sample_size` to determine the right size for collecting.
    Tune { sample_size: u32 },

    /// Simply collect samples.
    Collect { sample_size: u32 },
}

impl BenchMode {
    /// Whether this is a `--test` run.
    #[inline]
    pub fn is_test(self) -> bool {
        match self {
            Self::Test => true,
            Self::Tune { .. } | Self::Collect { .. } => false,
        }
    }

    /// Whether the sample size is still being tuned.
    #[inline]
    pub fn is_tune(self) -> bool {
        match self {
            Self::Tune { .. } => true,
            Self::Test | Self::Collect { .. } => false,
        }
    }

    /// Whether samples are being collected for real.
    #[inline]
    pub fn is_collect(self) -> bool {
        match self {
            Self::Collect { .. } => true,
            Self::Test | Self::Tune { .. } => false,
        }
    }

    /// The iterations per sample; `Test` always runs exactly once.
    #[inline]
    pub fn sample_size(self) -> u32 {
        if let Self::Tune { sample_size } | Self::Collect { sample_size } = self {
            sample_size
        } else {
            1
        }
    }
}
512
/// `#[divan::bench]` loop context.
///
/// Functions called within the benchmark loop should be `#[inline(always)]` to
/// ensure instruction cache locality.
pub(crate) struct BenchContext<'a> {
    /// Run-wide shared state; this module reads its timer and CLI action.
    shared_context: &'a SharedContext,

    /// User-configured options.
    pub options: &'a BenchOptions<'a>,

    /// Whether the benchmark loop was started.
    pub did_run: bool,

    /// The number of threads to run the benchmark. The default is 1.
    ///
    /// When set to 1, the benchmark loop is guaranteed to stay on the current
    /// thread and not spawn any threads.
    pub thread_count: NonZeroUsize,

    /// Recorded samples.
    pub samples: SampleCollection,

    /// Per-iteration counters grouped by sample.
    counters: CounterCollection,
}
538
539impl<'a> BenchContext<'a> {
540 /// Creates a new benchmarking context.
541 pub fn new(
542 shared_context: &'a SharedContext,
543 options: &'a BenchOptions,
544 thread_count: NonZeroUsize,
545 ) -> Self {
546 Self {
547 shared_context,
548 options,
549 thread_count,
550 did_run: false,
551 samples: SampleCollection::default(),
552 counters: options.counters.to_collection(),
553 }
554 }
555
    /// Runs the single-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// See `bench_loop_threaded`.
    pub fn bench_loop_local<I, O>(
        &mut self,
        gen_input: impl FnMut() -> I,
        benched: impl FnMut(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) {
        // SAFETY: Closures are guaranteed to run on the current thread, so they
        // can safely be mutable and non-`Sync`. `SyncWrap`/`UnsafeCell` only
        // exist to satisfy the `Sync`/`Fn` bounds of `bench_loop_threaded`.
        unsafe {
            let gen_input = SyncWrap::new(UnsafeCell::new(gen_input));
            let benched = SyncWrap::new(UnsafeCell::new(benched));
            let drop_input = SyncWrap::new(drop_input);

            // Force the threaded loop to stay on this thread (1 thread total).
            self.thread_count = NonZeroUsize::MIN;
            self.bench_loop_threaded::<I, O>(
                || (*gen_input.get())(),
                |input| (*benched.get())(input),
                |input| drop_input(input),
            )
        }
    }
582
    /// Runs the multi-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// If `self.threads` is 1, the incoming closures will not escape the
    /// current thread. This guarantee ensures `bench_loop_local` can soundly
    /// reuse this method with mutable non-`Sync` closures.
    ///
    /// When `benched` is called:
    /// - `I` is guaranteed to be initialized.
    /// - No external `&I` or `&mut I` exists.
    ///
    /// When `drop_input` is called:
    /// - All instances of `O` returned from `benched` have been dropped.
    /// - The same guarantees for `I` apply as in `benched`, unless `benched`
    ///   escaped references to `I`.
    fn bench_loop_threaded<I, O>(
        &mut self,
        gen_input: impl Fn() -> I + Sync,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O + Sync,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>) + Sync,
    ) {
        self.did_run = true;

        let mut current_mode = self.initial_mode();
        let is_test = current_mode.is_test();

        let record_sample = self.sample_recorder(gen_input, benched, drop_input);

        let thread_count = self.thread_count.get();
        let aux_thread_count = thread_count - 1;

        let is_single_thread = aux_thread_count == 0;

        // Per-thread sample info returned by `record_sample`. These are
        // processed locally to emit user-facing sample info. As a result, this
        // only contains `thread_count` many elements at a time.
        let mut raw_samples = Vec::<Option<RawSample>>::new();

        // The time spent benchmarking, in picoseconds.
        //
        // Unless `skip_ext_time` is set, this includes time external to
        // `benched`, such as time spent generating inputs and running drop.
        let mut elapsed_picos: u128 = 0;

        // The minimum time for benchmarking, in picoseconds.
        let min_picos = self.options.min_time().picos;

        // The remaining time left for benchmarking, in picoseconds.
        let max_picos = self.options.max_time().picos;

        // Don't bother running if user specifies 0 max time or 0 samples.
        if max_picos == 0 || !self.options.has_samples() {
            return;
        }

        let timer = self.shared_context.timer;
        let timer_kind = timer.kind();

        // `None` while tuning; set once we switch to collecting.
        let mut rem_samples = if current_mode.is_collect() {
            Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT))
        } else {
            None
        };

        // Only measure precision if we need to tune sample size.
        let timer_precision =
            if current_mode.is_tune() { timer.precision() } else { FineDuration::default() };

        if !is_test {
            self.samples.time_samples.reserve(self.options.sample_count.unwrap_or(1) as usize);
        }

        let skip_ext_time = self.options.skip_ext_time.unwrap_or_default();
        let initial_start = if skip_ext_time { None } else { Some(Timestamp::start(timer_kind)) };

        let bench_overheads = timer.bench_overheads();

        // Notify the CodSpeed instrument hooks that the measured region
        // begins; best-effort, so the result is deliberately ignored.
        let _ = InstrumentHooks::instance().start_benchmark();
        while {
            // Conditions for when sampling is over:
            if elapsed_picos >= max_picos {
                // Depleted the benchmarking time budget. This is a strict
                // condition regardless of sample count and minimum time.
                false
            } else if rem_samples.unwrap_or(1) > 0 {
                // More samples expected.
                true
            } else {
                // Continue if we haven't reached the time floor.
                elapsed_picos < min_picos
            }
        } {
            let sample_size = current_mode.sample_size();
            self.samples.sample_size = sample_size;

            let barrier = if is_single_thread { None } else { Some(Barrier::new(thread_count)) };

            // Sample loop helper (shadows the outer recorder; each call
            // records one sample and aggregates its per-input counters):
            let record_sample = || -> RawSample {
                let mut counter_totals: [u128; KnownCounterKind::COUNT] =
                    [0; KnownCounterKind::COUNT];

                // Updates per-input counter info for this sample.
                let mut count_input = |input: &I| {
                    for counter_kind in KnownCounterKind::ALL {
                        // SAFETY: The `I` type cannot change since `with_inputs`
                        // cannot be called more than once on the same `Bencher`.
                        if let Some(count) =
                            unsafe { self.counters.get_input_count(counter_kind, input) }
                        {
                            let total = &mut counter_totals[counter_kind as usize];
                            *total = (*total).saturating_add(count as u128);
                        }
                    }
                };

                // Sample loop:
                let ([start, end], alloc_info) =
                    record_sample(sample_size as usize, barrier.as_ref(), &mut count_input);

                RawSample { start, end, timer, alloc_info, counter_totals }
            };

            // Sample loop:
            raw_samples.clear();
            BENCH_POOL.par_extend(&mut raw_samples, aux_thread_count, |_| record_sample());

            // Convert `&[Option<RawSample>]` to `&[Sample]`.
            let raw_samples: &[RawSample] = {
                if let Some(thread) = raw_samples
                    .iter()
                    .enumerate()
                    .find_map(|(thread, sample)| sample.is_none().then_some(thread))
                {
                    panic!("Divan benchmarking thread {thread} panicked");
                }

                // SAFETY(review): every element was just verified to be
                // `Some`, and the size assertion checks that `Option` adds no
                // extra space; this relies on the niche layout making
                // `Option<RawSample>` identical to `RawSample` — assumed
                // upstream invariant.
                unsafe {
                    assert_eq!(mem::size_of::<RawSample>(), mem::size_of::<Option<RawSample>>());
                    std::slice::from_raw_parts(raw_samples.as_ptr().cast(), raw_samples.len())
                }
            };

            // If testing, exit the benchmarking loop immediately after timing a
            // single run.
            if is_test {
                break;
            }

            let slowest_sample = raw_samples.iter().max_by_key(|s| s.duration()).unwrap();
            let slowest_time = slowest_sample.duration();

            // TODO: Make tuning be less influenced by early runs. Currently if
            // early runs are very quick but later runs are slow, benchmarking
            // will take a very long time.
            //
            // TODO: Make `sample_size` consider time generating inputs and
            // dropping inputs/outputs. Currently benchmarks like
            // `Bencher::bench_refs(String::clear)` take a very long time.
            if current_mode.is_tune() {
                // Clear previous smaller samples.
                self.samples.clear();
                self.counters.clear_input_counts();

                // If within 100x timer precision, continue tuning.
                let precision_multiple = slowest_time.picos / timer_precision.picos;
                if precision_multiple <= 100 {
                    current_mode = BenchMode::Tune { sample_size: sample_size * 2 };
                } else {
                    current_mode = BenchMode::Collect { sample_size };
                    rem_samples = Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT));
                }
            }

            // Returns the sample's duration adjusted for overhead.
            let sample_duration_sub_overhead = |raw_sample: &RawSample| {
                let overhead = bench_overheads.total_overhead(sample_size, &raw_sample.alloc_info);

                FineDuration {
                    picos: raw_sample
                        .duration()
                        .clamp_to(timer_precision)
                        .picos
                        .saturating_sub(overhead.picos),
                }
                .clamp_to(timer_precision)
            };

            for raw_sample in raw_samples {
                let sample_index = self.samples.time_samples.len();

                self.samples
                    .time_samples
                    .push(TimeSample { duration: sample_duration_sub_overhead(raw_sample) });

                // Only record allocation info for samples that allocated.
                if !raw_sample.alloc_info.tallies.is_empty() {
                    self.samples
                        .alloc_info_by_sample
                        .insert(sample_index as u32, raw_sample.alloc_info.clone());
                }

                // Insert per-input counter information.
                for counter_kind in KnownCounterKind::ALL {
                    if !self.counters.uses_input_counts(counter_kind) {
                        continue;
                    }

                    let total_count = raw_sample.counter_totals[counter_kind as usize];

                    // Cannot overflow `MaxCountUInt` because `total_count`
                    // cannot exceed `MaxCountUInt::MAX * sample_size`.
                    let per_iter_count = (total_count / sample_size as u128) as MaxCountUInt;

                    self.counters.push_counter(AnyCounter::known(counter_kind, per_iter_count));
                }

                if let Some(rem_samples) = &mut rem_samples {
                    *rem_samples = rem_samples.saturating_sub(1);
                }
            }

            if let Some(initial_start) = initial_start {
                let last_end = raw_samples.iter().map(|s| s.end).max().unwrap();
                elapsed_picos = last_end.duration_since(initial_start, timer).picos;
            } else {
                // Progress by at least 1ns to prevent extremely fast
                // functions from taking forever when `min_time` is set.
                let progress_picos = slowest_time.picos.max(1_000);
                elapsed_picos = elapsed_picos.saturating_add(progress_picos);
            }
        }
        // Close the CodSpeed measured region; best-effort as above.
        let _ = InstrumentHooks::instance().stop_benchmark();

        // Reset flag for ignoring allocations.
        crate::alloc::IGNORE_ALLOC.set(false);
    }
820
    /// Returns a closure that takes the sample size and input counter, and then
    /// returns a newly recorded sample.
    ///
    /// The returned closure yields the sample's `[start, end]` timestamps
    /// together with the allocation info captured on the recording thread.
    fn sample_recorder<I, O>(
        &self,
        gen_input: impl Fn() -> I,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) -> impl Fn(usize, Option<&Barrier>, &mut dyn FnMut(&I)) -> ([Timestamp; 2], ThreadAllocInfo)
    {
        // We defer:
        // - Usage of `gen_input` values.
        // - Drop destructor for `O`, preventing it from affecting sample
        //   measurements. Outputs are stored into a pre-allocated buffer during
        //   the sample loop. The allocation is reused between samples to reduce
        //   time spent between samples.

        let timer_kind = self.shared_context.timer.kind();

        move |sample_size: usize, barrier: Option<&Barrier>, count_input: &mut dyn FnMut(&I)| {
            let mut defer_store = DeferStore::<I, O>::default();

            // Snapshot of this thread's allocation tallies, captured right
            // after the timed section ends.
            let mut saved_alloc_info = ThreadAllocInfo::new();
            let mut save_alloc_info = || {
                if crate::alloc::IGNORE_ALLOC.get() {
                    return;
                }

                if let Some(alloc_info) = ThreadAllocInfo::try_current() {
                    // SAFETY: We have exclusive access.
                    saved_alloc_info = unsafe { alloc_info.as_ptr().read() };
                }
            };

            // Synchronize all threads to start timed section simultaneously and
            // clear every thread's memory profiling info.
            //
            // This ensures work external to the timed section does not affect
            // the timing of other threads.
            let sync_threads = |is_start: bool| {
                sync_impl(barrier, is_start);

                // Monomorphize implementation to reduce code size.
                #[inline(never)]
                fn sync_impl(barrier: Option<&Barrier>, is_start: bool) {
                    // Ensure benchmarked section has a `ThreadAllocInfo`
                    // allocated for the current thread and clear previous info.
                    let alloc_info = if is_start { ThreadAllocInfo::current() } else { None };

                    // Synchronize all threads.
                    //
                    // This is the final synchronization point for the end.
                    if let Some(barrier) = barrier {
                        barrier.wait();
                    }

                    if let Some(mut alloc_info) = alloc_info {
                        // SAFETY: We have exclusive access.
                        let alloc_info = unsafe { alloc_info.as_mut() };

                        alloc_info.clear();

                        // Synchronize all threads.
                        if let Some(barrier) = barrier {
                            barrier.wait();
                        }
                    }
                }
            };

            // The following logic chooses how to efficiently sample the
            // benchmark function once and assigns `sample_start`/`sample_end`
            // before/after the sample loop.
            //
            // NOTE: Testing and benchmarking should behave exactly the same
            // when getting the sample time span. We don't want to introduce
            // extra work that may worsen measurement quality for real
            // benchmarking.
            let sample_start: UntaggedTimestamp;
            let sample_end: UntaggedTimestamp;

            let instrument_hooks = InstrumentHooks::instance();
            if size_of::<I>() == 0 && (size_of::<O>() == 0 || !mem::needs_drop::<O>()) {
                // Use a range instead of `defer_store` to make the benchmarking
                // loop cheaper.

                // Run `gen_input` the expected number of times in case it
                // updates external state used by `benched`.
                for _ in 0..sample_size {
                    let input = gen_input();
                    count_input(&input);

                    // Inputs are consumed/dropped later.
                    mem::forget(input);
                }

                sync_threads(true);

                let start_time = InstrumentHooks::current_timestamp();
                sample_start = UntaggedTimestamp::start(timer_kind);

                // Sample loop:
                for _ in 0..sample_size {
                    // SAFETY: Input is a ZST, so we can construct one out of
                    // thin air.
                    let input = unsafe { UnsafeCell::new(MaybeUninit::<I>::zeroed()) };

                    mem::forget(black_box(benched(&input)));
                }

                sample_end = UntaggedTimestamp::end(timer_kind);
                let end_time = InstrumentHooks::current_timestamp();
                // Report the timed span to the CodSpeed instrument hooks.
                instrument_hooks.add_benchmark_timestamps(start_time, end_time);

                sync_threads(false);
                save_alloc_info();

                // Drop outputs and inputs.
                for _ in 0..sample_size {
                    // In this branch `O` is either a ZST or has no drop glue;
                    // only the ZST case may need an explicit drop here.
                    if size_of::<O>() == 0 {
                        // SAFETY: Output is a ZST, so we can construct one out
                        // of thin air.
                        unsafe { _ = mem::zeroed::<O>() }
                    }

                    if mem::needs_drop::<I>() {
                        // SAFETY: Input is a ZST, so we can construct one out
                        // of thin air and not worry about aliasing.
                        unsafe { drop_input(&UnsafeCell::new(MaybeUninit::<I>::zeroed())) }
                    }
                }
            } else {
                defer_store.prepare(sample_size);

                match defer_store.slots() {
                    // Output needs to be dropped. We defer drop in the sample
                    // loop by inserting it into `defer_store`.
                    Ok(defer_slots_slice) => {
                        // Initialize and store inputs.
                        for DeferSlot { input, .. } in defer_slots_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_slots_iter = defer_slots_slice.iter();

                        sync_threads(true);
                        let start_time = InstrumentHooks::current_timestamp();
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for defer_slot in defer_slots_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized and we have exclusive access to the
                            // output slot.
                            unsafe {
                                let output = benched(&defer_slot.input);
                                *defer_slot.output.get() = MaybeUninit::new(output);
                            }
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        let end_time = InstrumentHooks::current_timestamp();
                        // Report the timed span to the CodSpeed instrument hooks.
                        instrument_hooks.add_benchmark_timestamps(start_time, end_time);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // and outputs in the sample loop.
                        black_box(defer_slots_slice);

                        // Drop outputs and inputs.
                        for DeferSlot { input, output } in defer_slots_slice {
                            // SAFETY: All outputs were initialized in the
                            // sample loop and we have exclusive access.
                            unsafe { (*output.get()).assume_init_drop() }

                            if mem::needs_drop::<I>() {
                                // SAFETY: The output was dropped and thus we
                                // have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }

                    // Output does not need to be dropped.
                    Err(defer_inputs_slice) => {
                        // Initialize and store inputs.
                        for input in defer_inputs_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_inputs_iter = defer_inputs_slice.iter();

                        sync_threads(true);
                        let start_time = InstrumentHooks::current_timestamp();
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for input in defer_inputs_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized.
                            black_box_drop(unsafe { benched(input) });
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        let end_time = InstrumentHooks::current_timestamp();
                        // Report the timed span to the CodSpeed instrument hooks.
                        instrument_hooks.add_benchmark_timestamps(start_time, end_time);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // in the sample loop.
                        black_box(defer_inputs_slice);

                        // Drop inputs.
                        if mem::needs_drop::<I>() {
                            for input in defer_inputs_slice {
                                // SAFETY: We have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }
                }
            }

            // SAFETY: These values are guaranteed to be the correct variant
            // because they were created from the same `timer_kind`.
            let interval = unsafe {
                [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
            };

            (interval, saved_alloc_info)
        }
    }
1071
1072 #[inline]
1073 fn initial_mode(&self) -> BenchMode {
1074 if self.shared_context.action.is_test() {
1075 BenchMode::Test
1076 } else if let Some(sample_size) = self.options.sample_size {
1077 BenchMode::Collect { sample_size }
1078 } else {
1079 BenchMode::Tune { sample_size: 1 }
1080 }
1081 }
1082
1083 pub fn compute_stats(&self) -> Stats {
1084 let time_samples = &self.samples.time_samples;
1085 let alloc_info_by_sample = &self.samples.alloc_info_by_sample;
1086
1087 let sample_count = time_samples.len();
1088 let sample_size = self.samples.sample_size;
1089
1090 let total_count = self.samples.iter_count();
1091
1092 let total_duration = self.samples.total_duration();
1093 let mean_duration = FineDuration {
1094 picos: total_duration.picos.checked_div(total_count as u128).unwrap_or_default(),
1095 };
1096
1097 // Samples sorted by duration.
1098 let sorted_samples = self.samples.sorted_samples();
1099 let median_samples = util::slice_middle(&sorted_samples);
1100
1101 let index_of_sample = |sample: &TimeSample| -> usize {
1102 util::slice_ptr_index(&self.samples.time_samples, sample)
1103 };
1104
1105 let counter_count_for_sample =
1106 |sample: &TimeSample, counter_kind: KnownCounterKind| -> Option<MaxCountUInt> {
1107 let counts = self.counters.counts(counter_kind);
1108
1109 let index = if self.counters.uses_input_counts(counter_kind) {
1110 index_of_sample(sample)
1111 } else {
1112 0
1113 };
1114
1115 counts.get(index).copied()
1116 };
1117
1118 let min_duration =
1119 sorted_samples.first().map(|s| s.duration / sample_size).unwrap_or_default();
1120 let max_duration =
1121 sorted_samples.last().map(|s| s.duration / sample_size).unwrap_or_default();
1122
1123 let median_duration = if median_samples.is_empty() {
1124 FineDuration::default()
1125 } else {
1126 let sum: u128 = median_samples.iter().map(|s| s.duration.picos).sum();
1127 FineDuration { picos: sum / median_samples.len() as u128 } / sample_size
1128 };
1129
1130 let counts = KnownCounterKind::ALL.map(|counter_kind| {
1131 let median: MaxCountUInt = {
1132 let mut sum: u128 = 0;
1133
1134 for sample in median_samples {
1135 let sample_count = counter_count_for_sample(sample, counter_kind)? as u128;
1136
1137 // Saturating add in case `MaxUIntCount > u64`.
1138 sum = sum.saturating_add(sample_count);
1139 }
1140
1141 (sum / median_samples.len() as u128) as MaxCountUInt
1142 };
1143
1144 Some(StatsSet {
1145 fastest: sorted_samples
1146 .first()
1147 .and_then(|s| counter_count_for_sample(s, counter_kind))?,
1148 slowest: sorted_samples
1149 .last()
1150 .and_then(|s| counter_count_for_sample(s, counter_kind))?,
1151 median,
1152 mean: self.counters.mean_count(counter_kind),
1153 })
1154 });
1155
1156 let sample_alloc_info = |sample: Option<&TimeSample>| -> Option<&ThreadAllocInfo> {
1157 sample
1158 .and_then(|sample| u32::try_from(index_of_sample(sample)).ok())
1159 .and_then(|index| self.samples.alloc_info_by_sample.get(&index))
1160 };
1161
1162 let sample_alloc_tally = |sample: Option<&TimeSample>, op: AllocOp| -> ThreadAllocTally {
1163 sample_alloc_info(sample)
1164 .map(|alloc_info| alloc_info.tallies.get(op))
1165 .copied()
1166 .unwrap_or_default()
1167 };
1168
1169 let mut alloc_total_max_count = 0u128;
1170 let mut alloc_total_max_size = 0u128;
1171 let mut alloc_total_tallies = TotalAllocTallyMap::default();
1172
1173 for alloc_info in alloc_info_by_sample.values() {
1174 alloc_total_max_count += alloc_info.max_count as u128;
1175 alloc_total_max_size += alloc_info.max_size as u128;
1176 alloc_info.tallies.add_to_total(&mut alloc_total_tallies);
1177 }
1178
1179 let sample_size = f64::from(sample_size);
1180 Stats {
1181 sample_count: sample_count as u32,
1182 iter_count: total_count,
1183 time: StatsSet {
1184 fastest: min_duration,
1185 slowest: max_duration,
1186 median: median_duration,
1187 mean: mean_duration,
1188 },
1189 max_alloc: StatsSet {
1190 fastest: {
1191 let alloc_info = sample_alloc_info(sorted_samples.first().copied());
1192
1193 AllocTally {
1194 count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
1195 / sample_size,
1196 size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
1197 / sample_size,
1198 }
1199 },
1200 slowest: {
1201 let alloc_info = sample_alloc_info(sorted_samples.last().copied());
1202
1203 AllocTally {
1204 count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
1205 / sample_size,
1206 size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
1207 / sample_size,
1208 }
1209 },
1210 // TODO: Switch to median of alloc info itself, rather than
1211 // basing off of median times.
1212 median: {
1213 let alloc_info_for_median =
1214 |index| sample_alloc_info(median_samples.get(index).copied());
1215
1216 let max_count_for_median = |index: usize| -> f64 {
1217 alloc_info_for_median(index)
1218 .map(|info| info.max_count as f64)
1219 .unwrap_or_default()
1220 };
1221
1222 let max_size_for_median = |index: usize| -> f64 {
1223 alloc_info_for_median(index)
1224 .map(|info| info.max_size as f64)
1225 .unwrap_or_default()
1226 };
1227
1228 let median_count = median_samples.len().max(1) as f64;
1229
1230 let median_max_count = max_count_for_median(0) + max_count_for_median(1);
1231 let median_max_size = max_size_for_median(0) + max_size_for_median(1);
1232
1233 AllocTally {
1234 count: median_max_count / median_count / sample_size,
1235 size: median_max_size / median_count / sample_size,
1236 }
1237 },
1238 mean: AllocTally {
1239 count: alloc_total_max_count as f64 / total_count as f64,
1240 size: alloc_total_max_size as f64 / total_count as f64,
1241 },
1242 }
1243 .transpose(),
1244 alloc_tallies: AllocOpMap {
1245 values: AllocOp::ALL
1246 .map(|op| StatsSet {
1247 fastest: {
1248 let fastest = sample_alloc_tally(sorted_samples.first().copied(), op);
1249
1250 AllocTally {
1251 count: fastest.count as f64 / sample_size,
1252 size: fastest.size as f64 / sample_size,
1253 }
1254 },
1255 slowest: {
1256 let slowest = sample_alloc_tally(sorted_samples.last().copied(), op);
1257
1258 AllocTally {
1259 count: slowest.count as f64 / sample_size,
1260 size: slowest.size as f64 / sample_size,
1261 }
1262 },
1263 median: {
1264 let tally_for_median = |index: usize| -> ThreadAllocTally {
1265 sample_alloc_tally(median_samples.get(index).copied(), op)
1266 };
1267
1268 let a = tally_for_median(0);
1269 let b = tally_for_median(1);
1270
1271 let median_count = median_samples.len().max(1) as f64;
1272
1273 let avg_count = (a.count as f64 + b.count as f64) / median_count;
1274 let avg_size = (a.size as f64 + b.size as f64) / median_count;
1275
1276 AllocTally {
1277 count: avg_count / sample_size,
1278 size: avg_size / sample_size,
1279 }
1280 },
1281 mean: {
1282 let tally = alloc_total_tallies.get(op);
1283 AllocTally {
1284 count: tally.count as f64 / total_count as f64,
1285 size: tally.size as f64 / total_count as f64,
1286 }
1287 },
1288 })
1289 .map(StatsSet::transpose),
1290 },
1291 counts,
1292 }
1293 }
1294}
1295
1296impl<T> StatsSet<AllocTally<T>> {
1297 #[inline]
1298 pub fn transpose(self) -> AllocTally<StatsSet<T>> {
1299 AllocTally {
1300 count: StatsSet {
1301 fastest: self.fastest.count,
1302 slowest: self.slowest.count,
1303 median: self.median.count,
1304 mean: self.mean.count,
1305 },
1306 size: StatsSet {
1307 fastest: self.fastest.size,
1308 slowest: self.slowest.size,
1309 median: self.median.size,
1310 mean: self.mean.size,
1311 },
1312 }
1313 }
1314}