wasm_bindgen_test/rt/criterion/
mod.rs

1//! A statistics-driven micro-benchmarking library written in Rust.
2//!
3//! This crate is a microbenchmarking library which aims to provide strong
4//! statistical confidence in detecting and estimating the size of performance
5//! improvements and regressions, while also being easy to use.
6//!
7//! See
8//! [the user guide](https://bheisler.github.io/criterion.rs/book/index.html)
9//! for examples as well as details on the measurement and analysis process,
10//! and the output.
11//!
12//! ## Features:
13//! * Collects detailed statistics, providing strong confidence that changes
14//!   to performance are real, not measurement noise.
15//! * Produces detailed charts, providing thorough understanding of your code's
16//!   performance behavior.
17
18#![warn(clippy::doc_markdown, missing_docs)]
19#![warn(bare_trait_objects)]
20#![allow(
21    clippy::just_underscores_and_digits, // Used in the stats code
22    clippy::transmute_ptr_to_ptr, // Used in the stats code
23)]
24
25// Needs to be declared before other modules
26// in order to be usable there.
27mod analysis;
28mod baseline;
29mod bencher;
30mod benchmark;
31mod compare;
32mod estimate;
33mod format;
34mod measurement;
35mod report;
36mod routine;
37mod stats;
38
39use core::future::Future;
40use core::pin::Pin;
41use core::ptr;
42use core::task::{Context, Poll, RawWaker, RawWakerVTable, Waker};
43use core::time::Duration;
44use libm::{ceil, sqrt};
45use serde::{Deserialize, Serialize};
46
47use alloc::boxed::Box;
48use alloc::string::String;
49use alloc::vec;
50use alloc::vec::Vec;
51use benchmark::BenchmarkConfig;
52use measurement::WallTime;
53use report::WasmReport;
54
55pub use bencher::Bencher;
56pub use measurement::Measurement;
57
/// The benchmark manager
///
/// `Criterion` lets you configure and execute benchmarks
///
/// Each benchmark consists of four phases:
///
/// - **Warm-up**: The routine is repeatedly executed, to let the CPU/OS/JIT/interpreter adapt to
///   the new load
/// - **Measurement**: The routine is repeatedly executed, and timing information is collected into
///   a sample
/// - **Analysis**: The sample is analyzed and distilled into meaningful statistics that get
///   reported to stdout, stored in files, and plotted
/// - **Comparison**: The current sample is compared with the sample obtained in the previous
///   benchmark.
pub struct Criterion<M: Measurement = WallTime> {
    // Statistical/timing settings shared by every benchmark run through this manager.
    config: BenchmarkConfig,
    // Sink that results are reported through.
    report: WasmReport,
    // Clock used to time iterations (wall-clock by default).
    measurement: M,
    // File/module path of the benchmark, if set via `with_location`
    // ("for use with codspeed" per that method's docs).
    location: Option<Location>,
}
78
/// Source location of a benchmark, populated by [`Criterion::with_location`].
pub(crate) struct Location {
    // File path, as passed to `with_location`.
    file: String,
    // Module path, as passed to `with_location`.
    module_path: String,
}
83
84impl Default for Criterion {
85    /// Creates a benchmark manager with the following default settings:
86    ///
87    /// - Sample size: 100 measurements
88    /// - Warm-up time: 3 s
89    /// - Measurement time: 5 s
90    /// - Bootstrap size: 100 000 resamples
91    /// - Noise threshold: 0.01 (1%)
92    /// - Confidence level: 0.95
93    /// - Significance level: 0.05
94    fn default() -> Criterion {
95        Criterion {
96            config: BenchmarkConfig {
97                confidence_level: 0.95,
98                measurement_time: Duration::from_secs(5),
99                noise_threshold: 0.01,
100                nresamples: 100_000,
101                sample_size: 100,
102                significance_level: 0.05,
103                warm_up_time: Duration::from_secs(3),
104                sampling_mode: SamplingMode::Auto,
105            },
106            report: WasmReport,
107            measurement: WallTime,
108            location: None,
109        }
110    }
111}
112
113impl<M: Measurement> Criterion<M> {
114    /// Changes the measurement for the benchmarks run with this runner. See the
115    /// [`Measurement`] trait for more details
116    pub fn with_measurement<M2: Measurement>(self, m: M2) -> Criterion<M2> {
117        // Can't use struct update syntax here because they're technically different types.
118        Criterion {
119            config: self.config,
120            report: self.report,
121            measurement: m,
122            location: self.location,
123        }
124    }
125
126    /// Configure file and module paths for use with codspeed.
127    #[must_use]
128    pub fn with_location(self, file: &str, module_path: &str) -> Criterion<M> {
129        Criterion {
130            location: Some(Location {
131                file: file.into(),
132                module_path: module_path.into(),
133            }),
134            ..self
135        }
136    }
137
138    /// Changes the default size of the sample for benchmarks run with this runner.
139    ///
140    /// A bigger sample should yield more accurate results if paired with a sufficiently large
141    /// measurement time.
142    ///
143    /// Sample size must be at least 10.
144    ///
145    /// # Panics
146    ///
147    /// Panics if n < 10
148    #[must_use]
149    pub fn sample_size(mut self, n: usize) -> Criterion<M> {
150        assert!(n >= 10);
151
152        self.config.sample_size = n;
153        self
154    }
155
156    /// Changes the default warm up time for benchmarks run with this runner.
157    ///
158    /// # Panics
159    ///
160    /// Panics if the input duration is zero
161    #[must_use]
162    pub fn warm_up_time(mut self, dur: Duration) -> Criterion<M> {
163        assert!(dur.as_nanos() > 0);
164
165        self.config.warm_up_time = dur;
166        self
167    }
168
169    ///
170    /// With a longer time, the measurement will become more resilient to transitory peak loads
171    /// caused by external programs
172    ///
173    /// **Note**: If the measurement time is too "low", Criterion will automatically increase it
174    ///
175    /// # Panics
176    ///
177    /// Panics if the input duration in zero
178    /// Changes the default measurement time for benchmarks run with this runner.
179    #[must_use]
180    pub fn measurement_time(mut self, dur: Duration) -> Criterion<M> {
181        assert!(dur.as_nanos() > 0);
182
183        self.config.measurement_time = dur;
184        self
185    }
186
187    /// Changes the default number of resamples for benchmarks run with this runner.
188    ///
189    /// Number of resamples to use for the
190    /// [bootstrap](http://en.wikipedia.org/wiki/Bootstrapping_(statistics)#Case_resampling)
191    ///
192    /// A larger number of resamples reduces the random sampling errors, which are inherent to the
193    /// bootstrap method, but also increases the analysis time
194    ///
195    /// # Panics
196    ///
197    /// Panics if the number of resamples is set to zero
198    #[must_use]
199    pub fn nresamples(mut self, n: usize) -> Criterion<M> {
200        assert!(n > 0);
201        if n <= 1000 {
202            console_error!("\nWarning: It is not recommended to reduce nresamples below 1000.");
203        }
204
205        self.config.nresamples = n;
206        self
207    }
208
209    /// Changes the default noise threshold for benchmarks run with this runner. The noise threshold
210    /// is used to filter out small changes in performance, even if they are statistically
211    /// significant. Sometimes benchmarking the same code twice will result in small but
212    /// statistically significant differences solely because of noise. This provides a way to filter
213    /// out some of these false positives at the cost of making it harder to detect small changes
214    /// to the true performance of the benchmark.
215    ///
216    /// The default is 0.01, meaning that changes smaller than 1% will be ignored.
217    ///
218    /// # Panics
219    ///
220    /// Panics if the threshold is set to a negative value
221    #[must_use]
222    pub fn noise_threshold(mut self, threshold: f64) -> Criterion<M> {
223        assert!(threshold >= 0.0);
224
225        self.config.noise_threshold = threshold;
226        self
227    }
228
229    /// Changes the default confidence level for benchmarks run with this runner. The confidence
230    /// level is the desired probability that the true runtime lies within the estimated
231    /// [confidence interval](https://en.wikipedia.org/wiki/Confidence_interval). The default is
232    /// 0.95, meaning that the confidence interval should capture the true value 95% of the time.
233    ///
234    /// # Panics
235    ///
236    /// Panics if the confidence level is set to a value outside the `(0, 1)` range
237    #[must_use]
238    pub fn confidence_level(mut self, cl: f64) -> Criterion<M> {
239        assert!(cl > 0.0 && cl < 1.0);
240        if cl < 0.5 {
241            console_error!(
242                "\nWarning: It is not recommended to reduce confidence level below 0.5."
243            );
244        }
245
246        self.config.confidence_level = cl;
247        self
248    }
249
250    /// Changes the default [significance level](https://en.wikipedia.org/wiki/Statistical_significance)
251    /// for benchmarks run with this runner. This is used to perform a
252    /// [hypothesis test](https://en.wikipedia.org/wiki/Statistical_hypothesis_testing) to see if
253    /// the measurements from this run are different from the measured performance of the last run.
254    /// The significance level is the desired probability that two measurements of identical code
255    /// will be considered 'different' due to noise in the measurements. The default value is 0.05,
256    /// meaning that approximately 5% of identical benchmarks will register as different due to
257    /// noise.
258    ///
259    /// This presents a trade-off. By setting the significance level closer to 0.0, you can increase
260    /// the statistical robustness against noise, but it also weakens Criterion.rs' ability to
261    /// detect small but real changes in the performance. By setting the significance level
262    /// closer to 1.0, Criterion.rs will be more able to detect small true changes, but will also
263    /// report more spurious differences.
264    ///
265    /// See also the noise threshold setting.
266    ///
267    /// # Panics
268    ///
269    /// Panics if the significance level is set to a value outside the `(0, 1)` range
270    #[must_use]
271    pub fn significance_level(mut self, sl: f64) -> Criterion<M> {
272        assert!(sl > 0.0 && sl < 1.0);
273
274        self.config.significance_level = sl;
275        self
276    }
277}
278
impl<M> Criterion<M>
where
    M: Measurement + 'static,
{
    /// Benchmarks a function.
    ///
    /// # Example
    ///
    /// ```rust
    /// use wasm_bindgen_test::{Criterion, wasm_bindgen_bench};
    ///
    /// #[wasm_bindgen_bench]
    /// fn bench(c: &mut Criterion) {
    ///     // Setup (construct data, allocate memory, etc)
    ///     c.bench_function(
    ///         "bench desc",
    ///         |b| b.iter(|| {
    ///             // Code to benchmark goes here
    ///         }),
    ///     );
    /// }
    /// ```
    pub fn bench_function<F>(&mut self, desc: &str, f: F) -> &mut Criterion<M>
    where
        F: FnMut(&mut Bencher<'_, M>),
    {
        // A no-op waker: it exists only so we can build a `Context` to poll
        // the analysis future below. No wake-up is ever needed because the
        // future is driven to completion in a single poll.
        const NOOP: RawWaker = {
            const VTABLE: RawWakerVTable = RawWakerVTable::new(
                // Cloning just returns a new no-op raw waker
                |_| NOOP,
                // `wake` does nothing
                |_| {},
                // `wake_by_ref` does nothing
                |_| {},
                // Dropping does nothing as we don't allocate anything
                |_| {},
            );
            RawWaker::new(ptr::null(), &VTABLE)
        };

        // Minimal single-poll executor: `bench_function` runs a synchronous
        // routine, so the future produced by `analysis::common` is expected
        // to complete on the first poll and never be pending.
        fn block_on(f: impl Future<Output = ()>) {
            // SAFETY: every entry in `NOOP`'s vtable is a no-op and the data
            // pointer is null and never dereferenced, so the `RawWaker`
            // contract is trivially upheld.
            let waker = unsafe { Waker::from_raw(NOOP) };
            let mut ctx = Context::from_waker(&waker);
            match core::pin::pin!(f).poll(&mut ctx) {
                Poll::Ready(_) => (),
                // A synchronous routine never awaits, so the future cannot
                // return `Pending` here.
                Poll::Pending => unreachable!(),
            }
        }

        let id = report::BenchmarkId::new(desc.into());
        block_on(analysis::common(
            &id,
            &mut routine::Function::new(f),
            &self.config,
            self,
        ));

        self
    }

    /// Benchmarks a future.
    ///
    /// # Example
    ///
    /// ```rust
    /// use wasm_bindgen_test::{Criterion, wasm_bindgen_bench};
    ///
    /// #[wasm_bindgen_bench]
    /// async fn bench(c: &mut Criterion) {
    ///     // Setup (construct data, allocate memory, etc)
    ///     c.bench_async_function(
    ///         "bench desc",
    ///         |b| {
    ///             Box::pin(
    ///                 b.iter_future(|| async {
    ///                     // Code to benchmark goes here
    ///                 })
    ///             )
    ///         }
    ///     ).await;
    /// }
    /// ```
    pub async fn bench_async_function<F>(&mut self, desc: &str, f: F) -> &mut Criterion<M>
    where
        for<'b> F: FnMut(&'b mut Bencher<'_, M>) -> Pin<Box<dyn Future<Output = ()> + 'b>>,
    {
        // Async variant: the analysis future is awaited on the caller's
        // executor instead of the single-poll `block_on` above.
        let id = report::BenchmarkId::new(desc.into());
        analysis::common(&id, &mut routine::AsyncFunction::new(f), &self.config, self).await;
        self
    }
}
372
/// Enum representing different ways of measuring the throughput of benchmarked code.
/// If the throughput setting is configured for a benchmark then the estimated throughput will
/// be reported as well as the time per iteration.
// TODO: Remove serialize/deserialize from the public API.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum Throughput {
    /// Measure throughput in terms of bytes/second. The value should be the number of bytes
    /// processed by one iteration of the benchmarked code. Typically, this would be the length of
    /// an input string or `&[u8]`.
    Bytes(u64),

    /// Equivalent to [`Bytes`](Throughput::Bytes), but the value will be reported in terms of
    /// kilobytes (1000 bytes) per second instead of kibibytes (1024 bytes) per
    /// second, megabytes instead of mebibytes, and gigabytes instead of gibibytes.
    BytesDecimal(u64),

    /// Measure throughput in terms of elements/second. The value should be the number of elements
    /// processed by one iteration of the benchmarked code. Typically, this would be the size of a
    /// collection, but could also be the number of lines of input text or the number of values to
    /// parse.
    Elements(u64),

    /// Measure throughput in terms of bits/second. The value should be the number of bits
    /// processed by one iteration of the benchmarked code. Typically, this would be the number of
    /// bits transferred by a networking function.
    Bits(u64),
}
400
/// This enum allows the user to control how Criterion.rs chooses the iteration count when sampling.
/// The default is `Auto`, which will choose a method automatically based on the iteration time during
/// the warm-up phase.
#[derive(Debug, Default, Clone, Copy)]
pub enum SamplingMode {
    /// Criterion.rs should choose a sampling method automatically, based on the mean execution
    /// time observed during warm-up. This is the default, and is recommended for most users and
    /// most benchmarks.
    #[default]
    Auto,

    /// Scale the iteration count in each sample linearly. This is suitable for most benchmarks,
    /// but it tends to require many iterations which can make it very slow for very long benchmarks.
    Linear,

    /// Keep the iteration count the same for all samples. This is not recommended, as it affects
    /// the statistics that Criterion.rs can compute. However, it requires fewer iterations than
    /// the `Linear` method and therefore is more suitable for very long-running benchmarks where
    /// benchmark execution time is more of a problem and statistical precision is less important.
    Flat,
}
421
422impl SamplingMode {
423    pub(crate) fn choose_sampling_mode(
424        &self,
425        warmup_mean_execution_time: f64,
426        sample_count: u64,
427        target_time: f64,
428    ) -> ActualSamplingMode {
429        match self {
430            SamplingMode::Linear => ActualSamplingMode::Linear,
431            SamplingMode::Flat => ActualSamplingMode::Flat,
432            SamplingMode::Auto => {
433                // Estimate execution time with linear sampling
434                let total_runs = sample_count * (sample_count + 1) / 2;
435                let d = ceil(target_time / warmup_mean_execution_time / total_runs as f64) as u64;
436                let expected_ns = total_runs as f64 * d as f64 * warmup_mean_execution_time;
437
438                if expected_ns > (2.0 * target_time) {
439                    ActualSamplingMode::Flat
440                } else {
441                    ActualSamplingMode::Linear
442                }
443            }
444        }
445    }
446}
447
/// Enum to represent the sampling mode without Auto.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub(crate) enum ActualSamplingMode {
    /// Iteration counts grow linearly across samples (d, 2d, ..., n*d).
    Linear,
    /// Every sample uses the same iteration count.
    Flat,
}
454
impl ActualSamplingMode {
    /// Computes the iteration count for each of the `sample_count` samples,
    /// sized so the whole measurement fits roughly into `target_time`.
    ///
    /// Emits a console warning when the warm-up timings show the target time
    /// is too short to complete the requested number of samples.
    pub(crate) fn iteration_counts(
        &self,
        warmup_mean_execution_time: f64,
        sample_count: u64,
        target_time: &Duration,
    ) -> Vec<u64> {
        match self {
            ActualSamplingMode::Linear => {
                let n = sample_count;
                let met = warmup_mean_execution_time;
                let m_ns = target_time.as_nanos();
                // Solve: [d + 2*d + 3*d + ... + n*d] * met = m_ns
                let total_runs = n * (n + 1) / 2;
                let d = (ceil(m_ns as f64 / met / total_runs as f64) as u64).max(1);
                let expected_ns = total_runs as f64 * d as f64 * met;

                // d == 1 means even the smallest linear schedule overshoots
                // the target time, so warn and suggest alternatives.
                if d == 1 {
                    let recommended_sample_size =
                        ActualSamplingMode::recommend_linear_sample_size(m_ns as f64, met);
                    let actual_time = Duration::from_nanos(expected_ns as u64);
                    console_error!("\nWarning: Unable to complete {} samples in {:.1?}. You may wish to increase target time to {:.1?}",
                            n, target_time, actual_time);

                    if recommended_sample_size != n {
                        console_error!(
                            ", enable flat sampling, or reduce sample count to {}.",
                            recommended_sample_size
                        );
                    } else {
                        console_error!(" or enable flat sampling.");
                    }
                }

                (1..(n + 1)).map(|a| a * d).collect::<Vec<u64>>()
            }
            ActualSamplingMode::Flat => {
                let n = sample_count;
                let met = warmup_mean_execution_time;
                let m_ns = target_time.as_nanos() as f64;
                // Split the time budget evenly; every sample runs the same
                // number of iterations.
                let time_per_sample = m_ns / (n as f64);
                // This is pretty simplistic; we could do something smarter to fit into the allotted time.
                let iterations_per_sample = (ceil(time_per_sample / met) as u64).max(1);

                let expected_ns = met * (iterations_per_sample * n) as f64;

                // One iteration per sample means the budget is already
                // exceeded; warn and suggest a smaller sample count.
                if iterations_per_sample == 1 {
                    let recommended_sample_size =
                        ActualSamplingMode::recommend_flat_sample_size(m_ns, met);
                    let actual_time = Duration::from_nanos(expected_ns as u64);
                    console_error!("\nWarning: Unable to complete {} samples in {:.1?}. You may wish to increase target time to {:.1?}",
                            n, target_time, actual_time);

                    if recommended_sample_size != n {
                        console_error!(", or reduce sample count to {}.", recommended_sample_size);
                    } else {
                        console_error!(".");
                    }
                }

                vec![iterations_per_sample; n as usize]
            }
        }
    }

    /// Returns `true` for the `Linear` variant.
    fn is_linear(&self) -> bool {
        matches!(self, ActualSamplingMode::Linear)
    }

    /// Suggests the largest sample size that fits `target_time` under linear
    /// sampling, rounded down to a multiple of 10 and clamped to >= 10.
    fn recommend_linear_sample_size(target_time: f64, met: f64) -> u64 {
        // Some math shows that n(n+1)/2 * d * met = target_time. d = 1, so it can be ignored.
        // This leaves n(n+1) = (2*target_time)/met, or n^2 + n - (2*target_time)/met = 0
        // Which can be solved with the quadratic formula. Since A and B are constant 1,
        // this simplifies to sample_size = (-1 +- sqrt(1 + 4C))/2, where C = (2*target_time)/met
        // (the constant 1 under the root is negligible and dropped below).
        // We don't care about the negative solution. Experimentation shows that this actually tends to
        // result in twice the desired execution time (probably because of the ceil used to calculate
        // d) so instead I use c = target_time/met.
        let c = target_time / met;
        let sample_size = (-1.0 + sqrt(4.0 * c)) / 2.0;
        let sample_size = sample_size as u64;

        // Round down to the nearest 10 to give a margin and avoid excessive precision
        let sample_size = (sample_size / 10) * 10;

        // Clamp it to be at least 10, since criterion.rs doesn't allow sample sizes smaller than 10.
        if sample_size < 10 {
            10
        } else {
            sample_size
        }
    }

    /// Suggests the largest sample size that fits `target_time` under flat
    /// sampling, rounded down to a multiple of 10 and clamped to >= 10.
    fn recommend_flat_sample_size(target_time: f64, met: f64) -> u64 {
        let sample_size = (target_time / met) as u64;

        // Round down to the nearest 10 to give a margin and avoid excessive precision
        let sample_size = (sample_size / 10) * 10;

        // Clamp it to be at least 10, since criterion.rs doesn't allow sample sizes smaller than 10.
        if sample_size < 10 {
            10
        } else {
            sample_size
        }
    }
}
561
/// A recorded sample — iteration counts plus measured times — in a
/// serializable form (derives `Serialize`/`Deserialize`), presumably for
/// comparison against later runs via the `baseline`/`compare` modules —
/// TODO confirm against those modules.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub(crate) struct SavedSample {
    // Sampling mode that produced this sample.
    pub(crate) sampling_mode: ActualSamplingMode,
    // Iteration count for each sample point.
    pub(crate) iters: Vec<f64>,
    // Measured time for each sample point.
    pub(crate) times: Vec<f64>,
}