wasm_bindgen_test/rt/criterion/mod.rs
//! A statistics-driven micro-benchmarking library written in Rust.
//!
//! This crate is a microbenchmarking library which aims to provide strong
//! statistical confidence in detecting and estimating the size of performance
//! improvements and regressions, while also being easy to use.
//!
//! See
//! [the user guide](https://bheisler.github.io/criterion.rs/book/index.html)
//! for examples as well as details on the measurement and analysis process,
//! and the output.
//!
//! ## Features:
//! * Collects detailed statistics, providing strong confidence that changes
//!   to performance are real, not measurement noise.
//! * Produces detailed charts, providing thorough understanding of your code's
//!   performance behavior.

#![warn(clippy::doc_markdown, missing_docs)]
#![warn(bare_trait_objects)]
#![allow(
    clippy::just_underscores_and_digits, // Used in the stats code
    clippy::transmute_ptr_to_ptr, // Used in the stats code
)]

// Needs to be declared before other modules
// in order to be usable there.
mod analysis;
mod baseline;
mod bencher;
mod benchmark;
mod compare;
mod estimate;
mod format;
mod measurement;
mod report;
mod routine;
mod stats;

use core::future::Future;
use core::pin::Pin;
use core::ptr;
use core::task::{Context, Poll, RawWaker, RawWakerVTable, Waker};
use core::time::Duration;
use libm::{ceil, sqrt};
use serde::{Deserialize, Serialize};

use alloc::boxed::Box;
use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;
use benchmark::BenchmarkConfig;
use measurement::WallTime;
use report::WasmReport;

pub use bencher::Bencher;
pub use measurement::Measurement;

/// The benchmark manager
///
/// `Criterion` lets you configure and execute benchmarks
///
/// Each benchmark consists of four phases:
///
/// - **Warm-up**: The routine is repeatedly executed, to let the CPU/OS/JIT/interpreter adapt to
///   the new load
/// - **Measurement**: The routine is repeatedly executed, and timing information is collected into
///   a sample
/// - **Analysis**: The sample is analyzed and distilled into meaningful statistics that get
///   reported to stdout, stored in files, and plotted
/// - **Comparison**: The current sample is compared with the sample obtained in the previous
///   benchmark.
pub struct Criterion<M: Measurement = WallTime> {
    config: BenchmarkConfig,
    report: WasmReport,
    measurement: M,
    location: Option<Location>,
}

pub(crate) struct Location {
    file: String,
    module_path: String,
}

impl Default for Criterion {
    /// Creates a benchmark manager with the following default settings:
    ///
    /// - Sample size: 100 measurements
    /// - Warm-up time: 3 s
    /// - Measurement time: 5 s
    /// - Bootstrap size: 100 000 resamples
    /// - Noise threshold: 0.01 (1%)
    /// - Confidence level: 0.95
    /// - Significance level: 0.05
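    ///
    /// A minimal sketch of overriding these defaults via the builder methods
    /// below (the `wasm_bindgen_test::Criterion` import path is taken from the
    /// `bench_function` example further down; the concrete values are
    /// illustrative only):
    ///
    /// ```rust
    /// use core::time::Duration;
    /// use wasm_bindgen_test::Criterion;
    ///
    /// let criterion = Criterion::default()
    ///     .sample_size(50)
    ///     .warm_up_time(Duration::from_secs(1))
    ///     .measurement_time(Duration::from_secs(2));
    /// ```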
    fn default() -> Criterion {
        Criterion {
            config: BenchmarkConfig {
                confidence_level: 0.95,
                measurement_time: Duration::from_secs(5),
                noise_threshold: 0.01,
                nresamples: 100_000,
                sample_size: 100,
                significance_level: 0.05,
                warm_up_time: Duration::from_secs(3),
                sampling_mode: SamplingMode::Auto,
            },
            report: WasmReport,
            measurement: WallTime,
            location: None,
        }
    }
}

impl<M: Measurement> Criterion<M> {
    /// Changes the measurement for the benchmarks run with this runner. See the
    /// [`Measurement`] trait for more details
    pub fn with_measurement<M2: Measurement>(self, m: M2) -> Criterion<M2> {
        // Can't use struct update syntax here because they're technically different types.
        Criterion {
            config: self.config,
            report: self.report,
            measurement: m,
            location: self.location,
        }
    }

    /// Configures the file and module paths for use with CodSpeed.
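    ///
    /// A minimal sketch (pairing this with the standard `file!()` and
    /// `module_path!()` macros is an assumption, not a requirement of the API):
    ///
    /// ```rust
    /// use wasm_bindgen_test::Criterion;
    ///
    /// let criterion = Criterion::default().with_location(file!(), module_path!());
    /// ```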
    #[must_use]
    pub fn with_location(self, file: &str, module_path: &str) -> Criterion<M> {
        Criterion {
            location: Some(Location {
                file: file.into(),
                module_path: module_path.into(),
            }),
            ..self
        }
    }

    /// Changes the default size of the sample for benchmarks run with this runner.
    ///
    /// A bigger sample should yield more accurate results if paired with a sufficiently large
    /// measurement time.
    ///
    /// Sample size must be at least 10.
    ///
    /// # Panics
    ///
    /// Panics if `n` is less than 10
    #[must_use]
    pub fn sample_size(mut self, n: usize) -> Criterion<M> {
        assert!(n >= 10);

        self.config.sample_size = n;
        self
    }

    /// Changes the default warm up time for benchmarks run with this runner.
    ///
    /// # Panics
    ///
    /// Panics if the input duration is zero
    #[must_use]
    pub fn warm_up_time(mut self, dur: Duration) -> Criterion<M> {
        assert!(dur.as_nanos() > 0);

        self.config.warm_up_time = dur;
        self
    }

    /// Changes the default measurement time for benchmarks run with this runner.
    ///
    /// With a longer time, the measurement will become more resilient to transitory peak loads
    /// caused by external programs.
    ///
    /// **Note**: If the measurement time is too "low", Criterion will automatically increase it.
    ///
    /// # Panics
    ///
    /// Panics if the input duration is zero
    #[must_use]
    pub fn measurement_time(mut self, dur: Duration) -> Criterion<M> {
        assert!(dur.as_nanos() > 0);

        self.config.measurement_time = dur;
        self
    }

    /// Changes the default number of resamples for benchmarks run with this runner.
    ///
    /// Number of resamples to use for the
    /// [bootstrap](http://en.wikipedia.org/wiki/Bootstrapping_(statistics)#Case_resampling)
    ///
    /// A larger number of resamples reduces the random sampling errors, which are inherent to the
    /// bootstrap method, but also increases the analysis time
    ///
    /// # Panics
    ///
    /// Panics if the number of resamples is set to zero
    #[must_use]
    pub fn nresamples(mut self, n: usize) -> Criterion<M> {
        assert!(n > 0);
        if n <= 1000 {
            console_error!("\nWarning: It is not recommended to reduce nresamples below 1000.");
        }

        self.config.nresamples = n;
        self
    }

    /// Changes the default noise threshold for benchmarks run with this runner. The noise threshold
    /// is used to filter out small changes in performance, even if they are statistically
    /// significant. Sometimes benchmarking the same code twice will result in small but
    /// statistically significant differences solely because of noise. This provides a way to filter
    /// out some of these false positives at the cost of making it harder to detect small changes
    /// to the true performance of the benchmark.
    ///
    /// The default is 0.01, meaning that changes smaller than 1% will be ignored.
    ///
    /// # Panics
    ///
    /// Panics if the threshold is set to a negative value
    #[must_use]
    pub fn noise_threshold(mut self, threshold: f64) -> Criterion<M> {
        assert!(threshold >= 0.0);

        self.config.noise_threshold = threshold;
        self
    }

    /// Changes the default confidence level for benchmarks run with this runner. The confidence
    /// level is the desired probability that the true runtime lies within the estimated
    /// [confidence interval](https://en.wikipedia.org/wiki/Confidence_interval). The default is
    /// 0.95, meaning that the confidence interval should capture the true value 95% of the time.
    ///
    /// # Panics
    ///
    /// Panics if the confidence level is set to a value outside the `(0, 1)` range
    #[must_use]
    pub fn confidence_level(mut self, cl: f64) -> Criterion<M> {
        assert!(cl > 0.0 && cl < 1.0);
        if cl < 0.5 {
            console_error!(
                "\nWarning: It is not recommended to reduce confidence level below 0.5."
            );
        }

        self.config.confidence_level = cl;
        self
    }

    /// Changes the default [significance level](https://en.wikipedia.org/wiki/Statistical_significance)
    /// for benchmarks run with this runner. This is used to perform a
    /// [hypothesis test](https://en.wikipedia.org/wiki/Statistical_hypothesis_testing) to see if
    /// the measurements from this run are different from the measured performance of the last run.
    /// The significance level is the desired probability that two measurements of identical code
    /// will be considered 'different' due to noise in the measurements. The default value is 0.05,
    /// meaning that approximately 5% of identical benchmarks will register as different due to
    /// noise.
    ///
    /// This presents a trade-off. By setting the significance level closer to 0.0, you can increase
    /// the statistical robustness against noise, but it also weakens Criterion.rs' ability to
    /// detect small but real changes in the performance. By setting the significance level
    /// closer to 1.0, Criterion.rs will be more able to detect small true changes, but will also
    /// report more spurious differences.
    ///
    /// See also the noise threshold setting.
    ///
    /// # Panics
    ///
    /// Panics if the significance level is set to a value outside the `(0, 1)` range
    #[must_use]
    pub fn significance_level(mut self, sl: f64) -> Criterion<M> {
        assert!(sl > 0.0 && sl < 1.0);

        self.config.significance_level = sl;
        self
    }
}

impl<M> Criterion<M>
where
    M: Measurement + 'static,
{
    /// Benchmarks a function.
    ///
    /// # Example
    ///
    /// ```rust
    /// use wasm_bindgen_test::{Criterion, wasm_bindgen_bench};
    ///
    /// #[wasm_bindgen_bench]
    /// fn bench(c: &mut Criterion) {
    ///     // Setup (construct data, allocate memory, etc)
    ///     c.bench_function(
    ///         "bench desc",
    ///         |b| b.iter(|| {
    ///             // Code to benchmark goes here
    ///         }),
    ///     );
    /// }
    /// ```
    pub fn bench_function<F>(&mut self, desc: &str, f: F) -> &mut Criterion<M>
    where
        F: FnMut(&mut Bencher<'_, M>),
    {
        const NOOP: RawWaker = {
            const VTABLE: RawWakerVTable = RawWakerVTable::new(
                // Cloning just returns a new no-op raw waker
                |_| NOOP,
                // `wake` does nothing
                |_| {},
                // `wake_by_ref` does nothing
                |_| {},
                // Dropping does nothing as we don't allocate anything
                |_| {},
            );
            RawWaker::new(ptr::null(), &VTABLE)
        };

        // `analysis::common` is async so it can be shared with
        // `bench_async_function`; the synchronous `bench_function` drives it to
        // completion with a no-op waker, so the future must never be pending.
        fn block_on(f: impl Future<Output = ()>) {
            let waker = unsafe { Waker::from_raw(NOOP) };
            let mut ctx = Context::from_waker(&waker);
            match core::pin::pin!(f).poll(&mut ctx) {
                Poll::Ready(_) => (),
                // Synchronous benchmark routines can never be pending.
                Poll::Pending => unreachable!(),
            }
        }

        let id = report::BenchmarkId::new(desc.into());
        block_on(analysis::common(
            &id,
            &mut routine::Function::new(f),
            &self.config,
            self,
        ));

        self
    }

    /// Benchmarks a future.
    ///
    /// # Example
    ///
    /// ```rust
    /// use wasm_bindgen_test::{Criterion, wasm_bindgen_bench};
    ///
    /// #[wasm_bindgen_bench]
    /// async fn bench(c: &mut Criterion) {
    ///     // Setup (construct data, allocate memory, etc)
    ///     c.bench_async_function(
    ///         "bench desc",
    ///         |b| {
    ///             Box::pin(
    ///                 b.iter_future(|| async {
    ///                     // Code to benchmark goes here
    ///                 })
    ///             )
    ///         }
    ///     ).await;
    /// }
    /// ```
    pub async fn bench_async_function<F>(&mut self, desc: &str, f: F) -> &mut Criterion<M>
    where
        for<'b> F: FnMut(&'b mut Bencher<'_, M>) -> Pin<Box<dyn Future<Output = ()> + 'b>>,
    {
        let id = report::BenchmarkId::new(desc.into());
        analysis::common(&id, &mut routine::AsyncFunction::new(f), &self.config, self).await;
        self
    }
}

/// Enum representing different ways of measuring the throughput of benchmarked code.
/// If the throughput setting is configured for a benchmark then the estimated throughput will
/// be reported as well as the time per iteration.
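///
/// For example, a benchmark whose routine hashes a whole `&[u8]` input once per
/// iteration would most naturally use `Throughput::Bytes(input.len() as u64)`
/// (a sketch; the mechanism for attaching a `Throughput` to a benchmark lives
/// outside this module).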
// TODO: Remove serialize/deserialize from the public API.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum Throughput {
    /// Measure throughput in terms of bytes/second. The value should be the number of bytes
    /// processed by one iteration of the benchmarked code. Typically, this would be the length of
    /// an input string or `&[u8]`.
    Bytes(u64),

    /// Equivalent to `Bytes`, but the value will be reported in terms of
    /// kilobytes (1000 bytes) per second instead of kibibytes (1024 bytes) per
    /// second, megabytes instead of mebibytes, and gigabytes instead of gibibytes.
    BytesDecimal(u64),

    /// Measure throughput in terms of elements/second. The value should be the number of elements
    /// processed by one iteration of the benchmarked code. Typically, this would be the size of a
    /// collection, but could also be the number of lines of input text or the number of values to
    /// parse.
    Elements(u64),

    /// Measure throughput in terms of bits/second. The value should be the number of bits
    /// processed by one iteration of the benchmarked code. Typically, this would be the number of
    /// bits transferred by a networking function.
    Bits(u64),
}

/// This enum allows the user to control how Criterion.rs chooses the iteration count when sampling.
/// The default is `Auto`, which will choose a method automatically based on the iteration time during
/// the warm-up phase.
#[derive(Debug, Default, Clone, Copy)]
pub enum SamplingMode {
    /// Criterion.rs should choose a sampling method automatically. This is the default, and is
    /// recommended for most users and most benchmarks.
    #[default]
    Auto,

    /// Scale the iteration count in each sample linearly. This is suitable for most benchmarks,
    /// but it tends to require many iterations which can make it very slow for very long benchmarks.
    Linear,

    /// Keep the iteration count the same for all samples. This is not recommended, as it affects
    /// the statistics that Criterion.rs can compute. However, it requires fewer iterations than
    /// the `Linear` method and therefore is more suitable for very long-running benchmarks where
    /// benchmark execution time is more of a problem and statistical precision is less important.
    Flat,
}

impl SamplingMode {
    pub(crate) fn choose_sampling_mode(
        &self,
        warmup_mean_execution_time: f64,
        sample_count: u64,
        target_time: f64,
    ) -> ActualSamplingMode {
        match self {
            SamplingMode::Linear => ActualSamplingMode::Linear,
            SamplingMode::Flat => ActualSamplingMode::Flat,
            SamplingMode::Auto => {
                // Estimate execution time with linear sampling
                let total_runs = sample_count * (sample_count + 1) / 2;
                let d = ceil(target_time / warmup_mean_execution_time / total_runs as f64) as u64;
                let expected_ns = total_runs as f64 * d as f64 * warmup_mean_execution_time;

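                // Fall back to flat sampling when linear sampling would
                // overshoot the requested target time by more than 2x.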
                if expected_ns > (2.0 * target_time) {
                    ActualSamplingMode::Flat
                } else {
                    ActualSamplingMode::Linear
                }
            }
        }
    }
}

/// Enum to represent the sampling mode without `Auto`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub(crate) enum ActualSamplingMode {
    Linear,
    Flat,
}

impl ActualSamplingMode {
    pub(crate) fn iteration_counts(
        &self,
        warmup_mean_execution_time: f64,
        sample_count: u64,
        target_time: &Duration,
    ) -> Vec<u64> {
        match self {
            ActualSamplingMode::Linear => {
                let n = sample_count;
                let met = warmup_mean_execution_time;
                let m_ns = target_time.as_nanos();
                // Solve: [d + 2*d + 3*d + ... + n*d] * met = m_ns
                let total_runs = n * (n + 1) / 2;
                let d = (ceil(m_ns as f64 / met / total_runs as f64) as u64).max(1);
                let expected_ns = total_runs as f64 * d as f64 * met;

                if d == 1 {
                    let recommended_sample_size =
                        ActualSamplingMode::recommend_linear_sample_size(m_ns as f64, met);
                    let actual_time = Duration::from_nanos(expected_ns as u64);
                    console_error!("\nWarning: Unable to complete {} samples in {:.1?}. You may wish to increase target time to {:.1?}",
                        n, target_time, actual_time);

                    if recommended_sample_size != n {
                        console_error!(
                            ", enable flat sampling, or reduce sample count to {}.",
                            recommended_sample_size
                        );
                    } else {
                        console_error!(" or enable flat sampling.");
                    }
                }

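                // The a-th sample runs `a * d` iterations, producing the
                // linearly increasing iteration counts solved for above.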
                (1..(n + 1)).map(|a| a * d).collect::<Vec<u64>>()
            }
            ActualSamplingMode::Flat => {
                let n = sample_count;
                let met = warmup_mean_execution_time;
                let m_ns = target_time.as_nanos() as f64;
                let time_per_sample = m_ns / (n as f64);
                // This is pretty simplistic; we could do something smarter to fit into the allotted time.
                let iterations_per_sample = (ceil(time_per_sample / met) as u64).max(1);

                let expected_ns = met * (iterations_per_sample * n) as f64;

                if iterations_per_sample == 1 {
                    let recommended_sample_size =
                        ActualSamplingMode::recommend_flat_sample_size(m_ns, met);
                    let actual_time = Duration::from_nanos(expected_ns as u64);
                    console_error!("\nWarning: Unable to complete {} samples in {:.1?}. You may wish to increase target time to {:.1?}",
                        n, target_time, actual_time);

                    if recommended_sample_size != n {
                        console_error!(", or reduce sample count to {}.", recommended_sample_size);
                    } else {
                        console_error!(".");
                    }
                }

                vec![iterations_per_sample; n as usize]
            }
        }
    }

    fn is_linear(&self) -> bool {
        matches!(self, ActualSamplingMode::Linear)
    }

    fn recommend_linear_sample_size(target_time: f64, met: f64) -> u64 {
        // Some math shows that n(n+1)/2 * d * met = target_time. d = 1, so it can be ignored.
        // This leaves n(n+1) = (2*target_time)/met, or n^2 + n - (2*target_time)/met = 0,
        // which can be solved with the quadratic formula. Since A and B are constant 1,
        // this simplifies to sample_size = (-1 +- sqrt(1 + 4C))/2, where C = (2*target_time)/met.
        // We don't care about the negative solution. Experimentation shows that this actually tends to
        // result in twice the desired execution time (probably because of the ceil used to calculate
        // d), so instead I use c = target_time/met and drop the negligible +1 under the square root.
        let c = target_time / met;
        let sample_size = (-1.0 + sqrt(4.0 * c)) / 2.0;
        let sample_size = sample_size as u64;

        // Round down to the nearest 10 to give a margin and avoid excessive precision
        let sample_size = (sample_size / 10) * 10;

        // Clamp it to be at least 10, since criterion.rs doesn't allow sample sizes smaller than 10.
        if sample_size < 10 {
            10
        } else {
            sample_size
        }
    }

    fn recommend_flat_sample_size(target_time: f64, met: f64) -> u64 {
        let sample_size = (target_time / met) as u64;

        // Round down to the nearest 10 to give a margin and avoid excessive precision
        let sample_size = (sample_size / 10) * 10;

        // Clamp it to be at least 10, since criterion.rs doesn't allow sample sizes smaller than 10.
        if sample_size < 10 {
            10
        } else {
            sample_size
        }
    }
}

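// A measurement sample in serializable form. Judging from the `baseline` and
// `compare` modules above, this is presumably what gets persisted so that a
// later run can be compared against it.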
#[derive(Debug, Serialize, Deserialize, Clone)]
pub(crate) struct SavedSample {
    pub(crate) sampling_mode: ActualSamplingMode,
    pub(crate) iters: Vec<f64>,
    pub(crate) times: Vec<f64>,
}