canbench_rs/
lib.rs

1//! `canbench` is a tool for benchmarking canisters on the Internet Computer.
2//!
3//! ## Quickstart
4//!
5//! This example is also available to tinker with in the examples directory. See the [fibonacci example](https://github.com/dfinity/bench/tree/main/examples/fibonacci).
6//!
7//! ### 1. Install the `canbench` binary.
8//!
9//! The `canbench` binary is what runs your canister's benchmarks.
10//!
11//! ```bash
12//! cargo install canbench
13//! ```
14//!
15//! ### 2. Add optional dependency to `Cargo.toml`
16//!
17//! Typically you do not want your benchmarks to be part of your canister when deploying it to the Internet Computer.
18//! Therefore, we include `canbench` only as an optional dependency so that it's only included when running benchmarks.
19//! For more information about optional dependencies, you can read more about them [here](https://doc.rust-lang.org/cargo/reference/features.html#optional-dependencies).
20//!
21//! ```toml
22//! canbench-rs = { version = "x.y.z", optional = true }
23//! ```
24//!
25//! ### 3. Add a configuration to `canbench.yml`
26//!
27//! The `canbench.yml` configuration file tells `canbench` how to build and run your canister.
28//! Below is a typical configuration.
29//! Note that we're compiling the canister with the `canbench-rs` feature so that the benchmarking logic is included in the Wasm.
30//!
31//! ```yml
32//! build_cmd:
33//!   cargo build --release --target wasm32-unknown-unknown --locked --features canbench-rs
34//!
35//! wasm_path:
36//!   ./target/wasm32-unknown-unknown/release/<YOUR_CANISTER>.wasm
37//! ```
38//! #### Init Args
39//!
40//! Init args can be specified using the `init_args` key in the configuration file:
41//! ```yml
42//! init_args:
43//!   hex: 4449444c0001710568656c6c6f
44//! ```
45//!
46//! #### Stable Memory
47//!
48//! A file can be specified to be loaded in the canister's stable memory _after_ initialization.
49//!
50//! ```yml
51//! stable_memory:
52//!   file:
53//!     stable_memory.bin
54//! ```
55//!
56//! <div class="warning">Contents of the stable memory file are loaded <i>after</i> the call to the canister's init method.
57//! Therefore, changes made to stable memory in the init method would be overwritten.</div>
58//!
59//! ### 4. Start benching! 🏋🏽
60//!
61//! Let's say we have a canister that exposes a `query` computing the fibonacci sequence of a given number.
62//! Here's what that query can look like:
63//!
64//! ```rust
65//! #[ic_cdk::query]
66//! fn fibonacci(n: u32) -> u32 {
67//!     if n == 0 {
68//!         return 0;
69//!     } else if n == 1 {
70//!         return 1;
71//!     }
72//!
73//!     let mut a = 0;
74//!     let mut b = 1;
75//!     let mut result = 0;
76//!
77//!     for _ in 2..=n {
78//!         result = a + b;
79//!         a = b;
80//!         b = result;
81//!     }
82//!
83//!     result
84//! }
85//! ```
86//!
87//! Now, let's add some benchmarks to this query:
88//!
89//! ```rust
90//! #[cfg(feature = "canbench-rs")]
91//! mod benches {
92//!     use super::*;
93//!     use canbench_rs::bench;
94//!
95//!     # fn fibonacci(_: u32) -> u32 { 0 }
96//!
97//!     #[bench]
98//!     fn fibonacci_20() {
99//!         // Prevent the compiler from optimizing the call and propagating constants.
100//!         std::hint::black_box(fibonacci(std::hint::black_box(20)));
101//!     }
102//!
103//!     #[bench]
104//!     fn fibonacci_45() {
105//!         // Prevent the compiler from optimizing the call and propagating constants.
106//!         std::hint::black_box(fibonacci(std::hint::black_box(45)));
107//!     }
108//! }
109//! ```
110//!
111//! Run `canbench`. You'll see an output that looks similar to this:
112//!
113//! ```txt
114//! $ canbench
115//!
116//! ---------------------------------------------------
117//!
118//! Benchmark: fibonacci_20 (new)
119//!   total:
120//!     instructions: 2301 (new)
121//!     heap_increase: 0 pages (new)
122//!     stable_memory_increase: 0 pages (new)
123//!
124//! ---------------------------------------------------
125//!
126//! Benchmark: fibonacci_45 (new)
127//!   total:
128//!     instructions: 3088 (new)
129//!     heap_increase: 0 pages (new)
130//!     stable_memory_increase: 0 pages (new)
131//!
132//! ---------------------------------------------------
133//!
134//! Executed 2 of 2 benchmarks.
135//! ```
136//!
137//! ### 5. Track performance regressions
138//!
139//! Notice that `canbench` reported the above benchmarks as "new".
140//! `canbench` allows you to persist the results of these benchmarks.
141//! In subsequent runs, `canbench` reports the performance relative to the last persisted run.
142//!
143//! Let's first persist the results above by running `canbench` again, but with the `persist` flag:
144//!
145//! ```txt
146//! $ canbench --persist
147//! # optionally add `--csv` to generate a CSV report
148//! $ canbench --persist --csv
149//! ...
150//! ---------------------------------------------------
151//!
152//! Executed 2 of 2 benchmarks.
153//! Successfully persisted results to canbench_results.yml
154//! ```
155//!
156//! Now, if we run `canbench` again, `canbench` will run the benchmarks, and will additionally report that there were no changes detected in performance.
157//!
158//! ```txt
159//! $ canbench
160//!     Finished release [optimized] target(s) in 0.34s
161//!
162//! ---------------------------------------------------
163//!
164//! Benchmark: fibonacci_20
165//!   total:
166//!     instructions: 2301 (no change)
167//!     heap_increase: 0 pages (no change)
168//!     stable_memory_increase: 0 pages (no change)
169//!
170//! ---------------------------------------------------
171//!
172//! Benchmark: fibonacci_45
173//!   total:
174//!     instructions: 3088 (no change)
175//!     heap_increase: 0 pages (no change)
176//!     stable_memory_increase: 0 pages (no change)
177//!
178//! ---------------------------------------------------
179//!
180//! Executed 2 of 2 benchmarks.
181//! ```
182//!
183//! Let's try swapping out our implementation of `fibonacci` with an implementation that's miserably inefficient.
184//! Replace the `fibonacci` function defined previously with the following:
185//!
186//! ```rust
187//! #[ic_cdk::query]
188//! fn fibonacci(n: u32) -> u32 {
189//!     match n {
190//!         0 => 1,
191//!         1 => 1,
192//!         _ => fibonacci(n - 1) + fibonacci(n - 2),
193//!     }
194//! }
195//! ```
196//!
197//! And running `canbench` again, we see that it detects and reports a regression.
198//!
199//! ```txt
200//! $ canbench
201//!
202//! ---------------------------------------------------
203//!
204//! Benchmark: fibonacci_20
205//!   total:
206//!     instructions: 337.93 K (regressed by 14586.14%)
207//!     heap_increase: 0 pages (no change)
208//!     stable_memory_increase: 0 pages (no change)
209//!
210//! ---------------------------------------------------
211//!
212//! Benchmark: fibonacci_45
213//!   total:
214//!     instructions: 56.39 B (regressed by 1826095830.76%)
215//!     heap_increase: 0 pages (no change)
216//!     stable_memory_increase: 0 pages (no change)
217//!
218//! ---------------------------------------------------
219//!
220//! Executed 2 of 2 benchmarks.
221//! ```
222//!
223//! Apparently, the recursive implementation is many orders of magnitude more expensive than the iterative implementation 😱
224//! Good thing we found out before deploying this implementation to production.
225//!
226//! Notice that `fibonacci_45` took > 50B instructions, which is substantially more than the instruction limit given for a single message execution on the Internet Computer. `canbench` runs benchmarks in an environment that gives them up to 10T instructions.
227//!
228//! ## Additional Examples
229//!
230//! For the following examples, we'll be using the following canister code, which you can also find in the [examples](./examples/btreemap_vs_hashmap) directory.
231//! This canister defines a simple state as well as a `pre_upgrade` function that stores that state into stable memory.
232//!
233//! ```rust
234//! use candid::{CandidType, Encode};
235//! use ic_cdk::pre_upgrade;
236//! use std::cell::RefCell;
237//!
238//! #[derive(CandidType)]
239//! struct User {
240//!     name: String,
241//! }
242//!
243//! #[derive(Default, CandidType)]
244//! struct State {
245//!     users: std::collections::BTreeMap<u64, User>,
246//! }
247//!
248//! thread_local! {
249//!     static STATE: RefCell<State> = RefCell::new(State::default());
250//! }
251//!
252//! #[pre_upgrade]
253//! fn pre_upgrade() {
254//!     // Serialize state.
255//!     let bytes = STATE.with(|s| Encode!(s).unwrap());
256//!
257//!     // Write to stable memory.
258//!     ic_cdk::stable::StableWriter::default()
259//!         .write(&bytes)
260//!         .unwrap();
261//! }
262//! ```
263//!
264//! ### Excluding setup code
265//!
266//! Let's say we want to benchmark how long it takes to run the `pre_upgrade` function. We can define the following benchmark:
267//!
268//! ```rust
269//! #[cfg(feature = "canbench-rs")]
270//! mod benches {
271//!     use super::*;
272//!     use canbench_rs::bench;
273//!
274//!     # fn initialize_state() {}
275//!     # fn pre_upgrade() {}
276//!
277//!     #[bench]
278//!     fn pre_upgrade_bench() {
279//!         // Some function that fills the state with lots of data.
280//!         initialize_state();
281//!
282//!         pre_upgrade();
283//!     }
284//! }
285//! ```
286//!
287//! The problem with the above benchmark is that it's benchmarking both the `pre_upgrade` call _and_ the initialization of the state.
288//! What if we're only interested in benchmarking the `pre_upgrade` call?
289//! To address this, we can use the `#[bench(raw)]` macro to specify exactly which code we'd like to benchmark.
290//!
291//! ```rust
292//! #[cfg(feature = "canbench-rs")]
293//! mod benches {
294//!     use super::*;
295//!     use canbench_rs::bench;
296//!
297//!     # fn initialize_state() {}
298//!     # fn pre_upgrade() {}
299//!
300//!     #[bench(raw)]
301//!     fn pre_upgrade_bench() -> canbench_rs::BenchResult {
302//!         // Some function that fills the state with lots of data.
303//!         initialize_state();
304//!
305//!         // Only benchmark the pre_upgrade. Initializing the state isn't
306//!         // included in the results of our benchmark.
307//!         canbench_rs::bench_fn(pre_upgrade)
308//!     }
309//! }
310//! ```
311//!
312//! Running `canbench` on the example above will benchmark only the code wrapped in `canbench_rs::bench_fn`, which in this case is the call to `pre_upgrade`.
313//!
314//! ```txt
315//! $ canbench pre_upgrade_bench
316//!
317//! ---------------------------------------------------
318//!
319//! Benchmark: pre_upgrade_bench (new)
320//!   total:
321//!     instructions: 717.10 M (new)
322//!     heap_increase: 519 pages (new)
323//!     stable_memory_increase: 184 pages (new)
324//!
325//! ---------------------------------------------------
326//!
327//! Executed 1 of 1 benchmarks.
328//! ```
329//!
330//! ### Granular Benchmarking
331//!
332//! Building on the example above, the `pre_upgrade` function does two steps:
333//!
334//! 1. Serialize the state
335//! 2. Write to stable memory
336//!
337//! Suppose we're interested in understanding, within `pre_upgrade`, the resources spent in each of these steps.
338//! `canbench` allows you to do more granular benchmarking using the `canbench_rs::bench_scope` function.
339//! Here's how we can modify our `pre_upgrade` function:
340//!
341//!
342//! ```rust
343//! # use candid::{Encode, CandidType};
344//! # use ic_cdk::pre_upgrade;
345//! # use std::cell::RefCell;
346//! #
347//! # #[derive(CandidType)]
348//! # struct User {
349//! #     name: String,
350//! # }
351//! #
352//! # #[derive(Default, CandidType)]
353//! # struct State {
354//! #     users: std::collections::BTreeMap<u64, User>,
355//! # }
356//! #
357//! # thread_local! {
358//! #     static STATE: RefCell<State> = RefCell::new(State::default());
359//! # }
360//!
361//! #[pre_upgrade]
362//! fn pre_upgrade() {
363//!     // Serialize state.
364//!     let bytes = {
365//!         #[cfg(feature = "canbench-rs")]
366//!         let _p = canbench_rs::bench_scope("serialize_state");
367//!         STATE.with(|s| Encode!(s).unwrap())
368//!     };
369//!
370//!     // Write to stable memory.
371//!     #[cfg(feature = "canbench-rs")]
372//!     let _p = canbench_rs::bench_scope("writing_to_stable_memory");
373//!     ic_cdk::stable::StableWriter::default()
374//!         .write(&bytes)
375//!         .unwrap();
376//! }
377//! ```
378//!
379//! In the code above, we've asked `canbench` to profile each of these steps separately.
380//! Running `canbench` now, each of these steps are reported.
381//!
382//! ```txt
383//! $ canbench pre_upgrade_bench
384//!
385//! ---------------------------------------------------
386//!
387//! Benchmark: pre_upgrade_bench (new)
388//!   total:
389//!     instructions: 717.11 M (new)
390//!     heap_increase: 519 pages (new)
391//!     stable_memory_increase: 184 pages (new)
392//!
393//!   serialize_state (profiling):
394//!     instructions: 717.10 M (new)
395//!     heap_increase: 519 pages (new)
396//!     stable_memory_increase: 0 pages (new)
397//!
398//!   writing_to_stable_memory (profiling):
399//!     instructions: 502 (new)
400//!     heap_increase: 0 pages (new)
401//!     stable_memory_increase: 184 pages (new)
402//!
403//! ---------------------------------------------------
404//!
405//! Executed 1 of 1 benchmarks.
406//! ```
407//!
408//! ### Debugging
409//!
410//! The `ic_cdk::eprintln!()` macro facilitates tracing canister and benchmark execution.
411//! Output is displayed on the console when `canbench` is executed with
412//! the `--show-canister-output` option.
413//!
414//! ```rust
415//! # #[cfg(feature = "canbench-rs")]
416//! # mod benches {
417//! #     use super::*;
418//! #     use canbench_rs::bench;
419//! #
420//!     #[bench]
421//!     fn bench_with_debug_print() {
422//!         // Run `canbench --show-canister-output` to see the output.
423//!         ic_cdk::eprintln!("Hello from {}!", env!("CARGO_PKG_NAME"));
424//!     }
425//! # }
426//! ```
427//!
428//! Example output:
429//!
430//! ```bash
431//! $ canbench bench_with_debug_print --show-canister-output
432//! [...]
433//! 2021-05-06 19:17:10.000000003 UTC: [Canister lxzze-o7777-77777-aaaaa-cai] Hello from example!
434//! [...]
435//! ```
436//!
437//! Refer to the [Internet Computer specification](https://internetcomputer.org/docs/references/ic-interface-spec#debugging-aids) for more details.
438//!
439//! ### Preventing Compiler Optimizations
440//!
441//! If benchmark results appear suspiciously low and remain consistent
442//! despite increased benchmarked function complexity, the `std::hint::black_box`
443//! function helps prevent compiler optimizations.
444//!
445//! ```rust
446//! # #[cfg(feature = "canbench-rs")]
447//! # mod benches {
448//! #     use super::*;
449//! #     use canbench_rs::bench;
450//! #
451//!     #[bench]
452//!     fn fibonacci_20() {
453//!         // Prevent the compiler from optimizing the call and propagating constants.
454//!         std::hint::black_box(fibonacci(std::hint::black_box(20)));
455//!     }
456//! # }
457//! ```
458//!
459//! Note that passing constant values as function arguments can also
460//! trigger compiler optimizations. If the actual code uses
461//! variables (not constants), both the arguments and the result
462//! of the benchmarked function must be wrapped in `black_box` calls.
463//!
464//! Refer to the [Rust documentation](https://doc.rust-lang.org/std/hint/fn.black_box.html)
465//! for more details.
466//!
467pub use canbench_rs_macros::bench;
468use candid::CandidType;
469use serde::{Deserialize, Serialize};
470use std::{cell::RefCell, collections::BTreeMap};
471
thread_local! {
    // Measurements recorded when `BenchScope` guards are dropped during the
    // current benchmark, keyed by scope name. Cleared by `reset()` at the
    // start of each `bench_fn` call and aggregated by
    // `get_scopes_measurements()`.
    static SCOPES: RefCell<BTreeMap<&'static str, Vec<MeasurementInternal>>> =
        const { RefCell::new(BTreeMap::new()) };
}
476
/// The results of a benchmark.
/// This type is in a public API.
#[derive(Debug, PartialEq, Serialize, Deserialize, CandidType, Default)]
pub struct BenchResult {
    /// A measurement for the entire duration of the benchmark.
    pub total: Measurement,

    /// Measurements for scopes, keyed by the scope name passed to
    /// [`bench_scope`].
    // `default` allows deserializing persisted results that omit this field.
    #[serde(default)]
    pub scopes: BTreeMap<String, Measurement>,
}
488
/// The internal representation of the benchmark result.
/// This type is not deserialized, therefore fields are not `Option`.
/// Converted into the public [`BenchResult`] via `From`.
#[derive(Debug, PartialEq, Default)]
struct BenchResultInternal {
    /// A measurement for the entire duration of the benchmark.
    pub total: MeasurementInternal,

    /// Measurements for scopes, keyed by scope name.
    pub scopes: BTreeMap<String, MeasurementInternal>,
}
499
500impl From<BenchResultInternal> for BenchResult {
501    fn from(r: BenchResultInternal) -> Self {
502        Self {
503            total: Measurement::from(r.total),
504            scopes: r
505                .scopes
506                .into_iter()
507                .map(|(k, v)| (k, Measurement::from(v)))
508                .collect(),
509        }
510    }
511}
512
/// A benchmark measurement containing various stats.
/// This type is in a public API.
// Every field carries `#[serde(default)]`, so results persisted without a
// field deserialize with that field set to 0.
#[derive(Debug, PartialEq, Serialize, Deserialize, CandidType, Clone, Default)]
pub struct Measurement {
    /// The number of calls made during the measurement.
    #[serde(default)]
    pub calls: u64,

    /// The number of instructions.
    #[serde(default)]
    pub instructions: u64,

    /// The increase in heap (measured in pages).
    #[serde(default)]
    pub heap_increase: u64,

    /// The increase in stable memory (measured in pages).
    #[serde(default)]
    pub stable_memory_increase: u64,
}
533
#[test]
fn public_api_of_measurement_should_not_change() {
    // If you have to modify this test, it's likely you broke the public API of `Measurement`.
    // Avoid making such changes unless absolutely necessary — doing so requires a major version bump.
    //
    // The code below fails to compile if the `Measurement` struct:
    // - No longer exists
    // - Loses any of its expected public fields
    // - Renames a field or changes a field's type

    let measurement = Measurement {
        calls: 0,
        instructions: 0,
        heap_increase: 0,
        stable_memory_increase: 0,
    };

    // Pin each field to type `u64`; a type change breaks compilation here.
    fn expect_u64(_: u64) {}
    expect_u64(measurement.calls);
    expect_u64(measurement.instructions);
    expect_u64(measurement.heap_increase);
    expect_u64(measurement.stable_memory_increase);
}
557
/// The internal representation of a measurement.
/// Unlike the public [`Measurement`], it also records where the measurement
/// started, which is needed to merge overlapping scope intervals.
#[derive(Debug, PartialEq, Clone, Default)]
struct MeasurementInternal {
    /// Instruction counter at the start of measurement.
    /// Not in public API, because it is not supposed to be compared to other measurements.
    /// Used internally to correctly calculate instructions of overlapping or nested scopes.
    start_instructions: u64,

    /// The number of calls made during the measurement.
    pub calls: u64,

    /// The number of instructions.
    pub instructions: u64,

    /// The increase in heap (measured in pages).
    pub heap_increase: u64,

    /// The increase in stable memory (measured in pages).
    pub stable_memory_increase: u64,
}
578
579impl From<MeasurementInternal> for Measurement {
580    fn from(m: MeasurementInternal) -> Self {
581        Self {
582            calls: m.calls,
583            instructions: m.instructions,
584            heap_increase: m.heap_increase,
585            stable_memory_increase: m.stable_memory_increase,
586        }
587    }
588}
589
/// Benchmarks the given function.
///
/// Returns a [`BenchResult`] with the total measurement plus any scope
/// measurements recorded via [`bench_scope`] during the call.
///
/// If the tracing buffer has been allocated (`TRACING_BUFFER` is non-empty),
/// the function runs in tracing mode instead: start/end instruction counters
/// are recorded in `INSTRUCTIONS_START`/`INSTRUCTIONS_END`, the instrumented
/// code writes its trace into `TRACING_BUFFER`, and a default (empty)
/// `BenchResult` is returned.
pub fn bench_fn<R>(f: impl FnOnce() -> R) -> BenchResult {
    // Clear any scope data left over from a previous benchmark.
    reset();

    // A non-empty buffer means the tracing buffer was allocated, i.e. the
    // caller wants an instruction trace rather than aggregate numbers.
    let is_tracing_enabled = TRACING_BUFFER.with_borrow(|p| !p.is_empty());

    if !is_tracing_enabled {
        // Snapshot counters before the call; the instruction counter is read
        // last so the other reads are not included in the measurement.
        let start_heap = heap_size();
        let start_stable_memory = ic_cdk::api::stable_size();
        let start_instructions = instruction_count();
        f();
        let instructions = instruction_count() - start_instructions;
        let stable_memory_increase = ic_cdk::api::stable_size() - start_stable_memory;
        let heap_increase = heap_size() - start_heap;

        let total = MeasurementInternal {
            start_instructions,
            calls: 1,
            instructions,
            heap_increase,
            stable_memory_increase,
        }
        .into();
        // Aggregate any scope measurements recorded while `f` ran.
        let scopes: std::collections::BTreeMap<_, _> = get_scopes_measurements()
            .into_iter()
            .map(|(k, v)| (k.to_string(), v))
            .collect();

        BenchResult { total, scopes }
    } else {
        // The first 4 bytes are a flag to indicate if tracing is enabled. It will be read by the
        // tracing function (instrumented code) to decide whether to trace or not.
        let tracing_started_flag_address = TRACING_BUFFER.with_borrow_mut(|p| p.as_mut_ptr());
        unsafe {
            // Ideally, we'd like to reverse the following 2 statements, but it might be possible
            // for the compiler not to inline `ic_cdk::api::performance_counter` which would be
            // problematic as `performance_counter` would be traced itself. Perhaps we can call
            // ic0.performance_counter directly.
            INSTRUCTIONS_START = ic_cdk::api::performance_counter(0) as i64;
            *tracing_started_flag_address = 1;
        }
        f();
        unsafe {
            // Disable tracing before reading the counter so the read itself
            // is not traced.
            *tracing_started_flag_address = 0;
            INSTRUCTIONS_END = ic_cdk::api::performance_counter(0) as i64;
        }

        // Only the traces are meaningful, and it's written to `TRACING_BUFFER` and will be
        // collected in the tracing query method.
        BenchResult::default()
    }
}
642
/// Benchmarks the scope this function is declared in.
///
/// Returns a [`BenchScope`] guard; the scope's measurement is recorded when
/// the guard is dropped, i.e. when it goes out of scope.
///
/// NOTE: It's important to assign this function, otherwise benchmarking won't work correctly.
///
/// # Correct Usage
///
/// ```
/// fn my_func() {
///   let _p = canbench_rs::bench_scope("my_scope");
///   // Do something.
/// }
/// ```
///
/// # Incorrect Usages
///
/// ```
/// fn my_func() {
///   let _ = canbench_rs::bench_scope("my_scope"); // Doesn't capture the scope.
///   // Do something.
/// }
/// ```
///
/// ```
/// fn my_func() {
///   canbench_rs::bench_scope("my_scope"); // Doesn't capture the scope.
///   // Do something.
/// }
/// ```
#[must_use]
pub fn bench_scope(name: &'static str) -> BenchScope {
    BenchScope::new(name)
}
675
/// An object used for benchmarking a specific scope.
///
/// Captures counter snapshots on creation and records the scope's
/// measurement into `SCOPES` when dropped.
pub struct BenchScope {
    name: &'static str,
    // Counter snapshots taken when the scope was entered.
    start_instructions: u64,
    start_stable_memory: u64,
    start_heap: u64,
}
683
impl BenchScope {
    // Creates a scope guard, snapshotting the current counters.
    fn new(name: &'static str) -> Self {
        // The instruction counter is read last so the preceding reads are
        // not attributed to the scope being measured.
        let start_heap = heap_size();
        let start_stable_memory = ic_cdk::api::stable_size();
        let start_instructions = instruction_count();

        Self {
            name,
            start_instructions,
            start_stable_memory,
            start_heap,
        }
    }
}
698
impl Drop for BenchScope {
    // Records the scope's measurement into `SCOPES` when the guard goes out
    // of scope.
    fn drop(&mut self) {
        SCOPES.with(|p| {
            let mut p = p.borrow_mut();
            let start_instructions = self.start_instructions;
            let stable_memory_increase = ic_cdk::api::stable_size() - self.start_stable_memory;
            let heap_increase = heap_size() - self.start_heap;
            let instructions = instruction_count() - self.start_instructions;
            // Multiple measurements for the same scope name are kept as a
            // list and later merged by `get_scopes_measurements`.
            p.entry(self.name).or_default().push(MeasurementInternal {
                start_instructions,
                calls: 1,
                instructions,
                heap_increase,
                stable_memory_increase,
            });
        });
    }
}
717
718// Clears all scope data.
719fn reset() {
720    SCOPES.with(|p| p.borrow_mut().clear());
721}
722
// Returns the measurements for any declared scopes, aggregated by the scope name.
//
// Overlapping or nested invocations of the same scope are merged via their
// instruction-counter intervals so shared instructions are counted only once.
fn get_scopes_measurements() -> BTreeMap<&'static str, Measurement> {
    // Sums the measurements of one scope. Instructions are summed over the
    // union of the measurements' instruction-counter intervals (a classic
    // sorted-interval merge), while calls/heap/stable-memory are summed
    // directly.
    fn sum_non_overlapping(measurements: &[MeasurementInternal]) -> MeasurementInternal {
        // An instruction-counter interval covered by one measurement.
        #[derive(Debug)]
        struct Interval {
            start: u64,
            end: u64,
            measurement: MeasurementInternal,
        }

        let mut intervals: Vec<Interval> = measurements
            .iter()
            .map(|m| Interval {
                start: m.start_instructions,
                end: m.start_instructions + m.instructions,
                measurement: m.clone(),
            })
            .collect();

        // Sorting by start lets us merge overlaps in one pass.
        intervals.sort_by_key(|i| i.start);

        let mut total = MeasurementInternal::default();
        let mut current_start = 0;
        let mut current_end = 0;
        let mut group_measurements: Vec<MeasurementInternal> = Vec::new();

        for i in intervals {
            if i.start < current_end {
                // Overlaps the current group: extend the group's interval.
                current_end = current_end.max(i.end);
                group_measurements.push(i.measurement);
            } else {
                // Disjoint from the current group: flush the group's totals,
                // then start a new group with this interval.
                if current_end > current_start {
                    total.instructions += current_end - current_start;
                    for m in &group_measurements {
                        total.calls += m.calls;
                        total.heap_increase += m.heap_increase;
                        total.stable_memory_increase += m.stable_memory_increase;
                    }
                }
                current_start = i.start;
                current_end = i.end;
                group_measurements.clear();
                group_measurements.push(i.measurement);
            }
        }

        // Final group
        if current_end > current_start {
            total.instructions += current_end - current_start;
            for m in &group_measurements {
                total.calls += m.calls;
                total.heap_increase += m.heap_increase;
                total.stable_memory_increase += m.stable_memory_increase;
            }
        }

        total
    }

    SCOPES.with(|p| {
        p.borrow()
            .iter()
            .map(|(&scope, measurements)| {
                (scope, Measurement::from(sum_non_overlapping(measurements)))
            })
            .collect()
    })
}
791
// Returns the current value of the instruction performance counter, or 0
// when not compiled for Wasm (e.g. in native unit tests).
fn instruction_count() -> u64 {
    #[cfg(target_arch = "wasm32")]
    {
        ic_cdk::api::performance_counter(0)
    }

    #[cfg(not(target_arch = "wasm32"))]
    {
        // Consider using cpu time here.
        0
    }
}
804
// Returns the current heap size in Wasm pages, or 0 when not compiled for
// Wasm (e.g. in native unit tests).
fn heap_size() -> u64 {
    #[cfg(target_arch = "wasm32")]
    {
        core::arch::wasm32::memory_size(0) as u64
    }

    #[cfg(not(target_arch = "wasm32"))]
    {
        0
    }
}
816
thread_local! {
    // Buffer that instrumented (tracing) code writes log entries into.
    // Empty until `prepare_tracing` allocates it; its layout is described by
    // the `NUM_BYTES_*` constants below.
    static TRACING_BUFFER: RefCell<Vec<u8>> = const { RefCell::new(Vec::new()) };
}
820
// Instruction counters at the start/end of a traced benchmark run. Written
// in `bench_fn` (tracing branch) and read back in `get_traces`.
// NOTE(review): `static mut` appears to rely on canister code being
// single-threaded — TODO confirm this assumption.
static mut INSTRUCTIONS_START: i64 = 0;
static mut INSTRUCTIONS_END: i64 = 0;
// Layout of `TRACING_BUFFER`:
//   [enabled flag: 4 bytes][num entries: 8 bytes][entries...]
// where each entry is a function id (4 bytes, little-endian) followed by an
// instruction counter (8 bytes, little-endian).
const NUM_BYTES_ENABLED_FLAG: usize = 4;
const NUM_BYTES_NUM_ENTRIES: usize = 8;
// Maximum number of log entries the tracing buffer can hold.
const MAX_NUM_LOG_ENTRIES: usize = 100_000_000;
const NUM_BYTES_FUNC_ID: usize = 4;
const NUM_BYTES_INSTRUCTION_COUNTER: usize = 8;
const BUFFER_SIZE: usize = NUM_BYTES_ENABLED_FLAG
    + NUM_BYTES_NUM_ENTRIES
    + MAX_NUM_LOG_ENTRIES * (NUM_BYTES_FUNC_ID + NUM_BYTES_INSTRUCTION_COUNTER);
// Byte offset where log entries begin (after the flag and the entry count).
const LOGS_START_OFFSET: usize = NUM_BYTES_ENABLED_FLAG + NUM_BYTES_NUM_ENTRIES;
// Cap on how many entries are returned in a single response; deeper call
// levels are dropped by `truncate_traces` to stay under it.
const MAX_NUM_LOG_ENTRIES_IN_RESPONSE: usize = 131_000;
833
// Allocates the (zeroed) tracing buffer and returns its address.
// Exported under the well-known name `__prepare_tracing` — presumably
// invoked by the canbench harness before a traced run; verify against the
// caller.
#[export_name = "__prepare_tracing"]
fn prepare_tracing() -> i32 {
    TRACING_BUFFER.with_borrow_mut(|b| {
        *b = vec![0; BUFFER_SIZE];
        // NOTE(review): pointer truncated to i32 — assumes wasm32 addresses.
        b.as_ptr() as i32
    })
}
841
/// Decodes the tracing buffer into `(func_id, instruction_counter)` pairs.
///
/// Instruction counters are reported relative to the start of the traced
/// benchmark (`INSTRUCTIONS_START`), adjusted for tracing overhead, and
/// truncated to fit in a response. A synthetic `(i32::MAX, 0)` entry is
/// prepended and a synthetic `(i32::MIN, total_instructions)` entry appended.
///
/// # Errors
/// Returns an error if the buffer claims more than `MAX_NUM_LOG_ENTRIES`
/// entries.
///
/// # Panics
/// Panics if tracing is still enabled (the buffer's flag byte is set).
pub fn get_traces(bench_instructions: u64) -> Result<Vec<(i32, i64)>, String> {
    TRACING_BUFFER.with_borrow(|b| {
        if b[0] == 1 {
            panic!("Tracing is still enabled.");
        }
        // The entry count is stored right after the enabled flag.
        let num_entries = i64::from_le_bytes(
            b[NUM_BYTES_ENABLED_FLAG..(NUM_BYTES_ENABLED_FLAG + NUM_BYTES_NUM_ENTRIES)]
                .try_into()
                .unwrap(),
        );
        if num_entries > MAX_NUM_LOG_ENTRIES as i64 {
            return Err(format!(
                "There are {num_entries} log entries which is more than \
                {MAX_NUM_LOG_ENTRIES}, as we can currently support",
            ));
        }
        let instructions_start = unsafe { INSTRUCTIONS_START };
        // Synthetic start marker.
        let mut traces = vec![(i32::MAX, 0)];
        for i in 0..num_entries {
            // Each entry is a little-endian func id followed by a
            // little-endian instruction counter.
            let log_start_address = i as usize
                * (NUM_BYTES_FUNC_ID + NUM_BYTES_INSTRUCTION_COUNTER)
                + LOGS_START_OFFSET;
            let func_id = i32::from_le_bytes(
                b[log_start_address..log_start_address + NUM_BYTES_FUNC_ID]
                    .try_into()
                    .unwrap(),
            );
            let instruction_counter = i64::from_le_bytes(
                b[log_start_address + NUM_BYTES_FUNC_ID
                    ..log_start_address + NUM_BYTES_FUNC_ID + NUM_BYTES_INSTRUCTION_COUNTER]
                    .try_into()
                    .unwrap(),
            );
            // Report counters relative to the benchmark start.
            traces.push((func_id, instruction_counter - instructions_start));
        }
        // Synthetic end marker carrying the total traced instruction count.
        traces.push((i32::MIN, unsafe { INSTRUCTIONS_END - instructions_start }));
        let traces = adjust_traces_for_overhead(traces, bench_instructions);
        // TODO(EXC-2020): consider using compression.
        let traces = truncate_traces(traces);
        Ok(traces)
    })
}
884
// Subtracts the estimated tracing overhead from each trace entry.
//
// The per-entry overhead is estimated from the difference between the last
// log entry's instruction count and the benchmark's own instruction count,
// spread evenly across all log entries.
fn adjust_traces_for_overhead(traces: Vec<(i32, i64)>, bench_instructions: u64) -> Vec<(i32, i64)> {
    // The first and last elements are synthetic markers, not log entries.
    let num_logs = traces.len() - 2;
    let per_entry_overhead =
        (traces[num_logs].1 as f64 - bench_instructions as f64) / (num_logs as f64);
    let mut adjusted = Vec::with_capacity(traces.len());
    for (i, (func_id, count)) in traces.into_iter().enumerate() {
        // Entries past the last log carry the full accumulated overhead.
        let logged_before = if i <= num_logs { i } else { num_logs };
        adjusted.push((func_id, count - (per_entry_overhead * logged_before as f64) as i64));
    }
    adjusted
}
900
901fn truncate_traces(traces: Vec<(i32, i64)>) -> Vec<(i32, i64)> {
902    if traces.len() <= MAX_NUM_LOG_ENTRIES_IN_RESPONSE {
903        return traces;
904    }
905
906    let mut num_traces_by_depth = BTreeMap::new();
907
908    let mut depth = 0;
909    for (func_id, _) in traces.iter() {
910        if *func_id >= 0 {
911            depth += 1;
912            *num_traces_by_depth.entry(depth).or_insert(0) += 1;
913        } else {
914            depth -= 1;
915        }
916    }
917    assert_eq!(depth, 0, "Traces are not balanced.");
918    let mut depth_to_truncate = 0;
919    let mut cumulative_traces = 0;
920    for (depth, num_traces) in num_traces_by_depth.iter() {
921        cumulative_traces += num_traces;
922        if cumulative_traces <= MAX_NUM_LOG_ENTRIES_IN_RESPONSE {
923            depth_to_truncate = *depth;
924        } else {
925            break;
926        }
927    }
928
929    let truncated: Vec<_> = traces
930        .into_iter()
931        .scan(0, |depth, (func_id, instruction_counter)| {
932            if func_id >= 0 {
933                *depth += 1;
934                Some((*depth, func_id, instruction_counter))
935            } else {
936                *depth -= 1;
937                Some((*depth + 1, func_id, instruction_counter))
938            }
939        })
940        .filter(|(depth, _, _)| *depth <= depth_to_truncate)
941        .map(|(_, func_id, instruction_counter)| (func_id, instruction_counter))
942        .collect();
943
944    truncated
945}