1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
//! [`EmbeddedProfiler`] implementation based on [`DWT`].
//!
//! This profiler depends on the [`DWT`] hardware which is not available on cortex-M0.
//! The profiler's resolution is the same as the core clock. The cycle count clock is
//! free-running, so overflows are likely if you have long running functions to profile.
//! To mitigate this, one can use the `extended` feature, which extends the resolution of
//! the counter from [`u32`] to [`u64`] using the [`DebugMonitor`] exception. It is set
//! to expire just before overflow, so you can expect an exception to fire every 2**32
//! clock cycles.
//!
//! Snapshots are logged using [`log::info!`], so having a logger installed is required
//! if you want to use [`embedded_profiling::log_snapshot`] or functions that call it
//! (like [`embedded_profiling::profile_function`]).
//!
//! ## Example Usage
//!
//!```no_run
//! # use cortex_m::peripheral::Peripherals as CorePeripherals;
//! # const CORE_FREQ: u32 = 120_000_000;
//! let mut core = CorePeripherals::take().unwrap();
//! // (...)
//! let dwt_profiler = cortex_m::singleton!(: ep_dwt::DwtProfiler::<CORE_FREQ> =
//!     ep_dwt::DwtProfiler::<CORE_FREQ>::new(&mut core.DCB, core.DWT, CORE_FREQ).unwrap())
//! .unwrap();
//! unsafe {
//!     embedded_profiling::set_profiler(dwt_profiler).unwrap();
//! }
//! // (...)
//! embedded_profiling::profile("print_profile", || println!("Hello, world"));
//! ```
//!
//! [`DWT`]: cortex_m::peripheral::DWT
//! [`DebugMonitor`]: `cortex_m::peripheral::scb::Exception::DebugMonitor`

#![cfg_attr(not(test), no_std)]
use embedded_profiling::{EPContainer, EPInstant, EPSnapshot, EmbeddedProfiler};

use cortex_m::peripheral::{DCB, DWT};

#[cfg(feature = "extended")]
use core::sync::atomic::{AtomicU32, Ordering};
#[cfg(feature = "extended")]
use cortex_m_rt::exception;

#[cfg(feature = "extended")]
/// Tracker of `cyccnt` cycle count overflows to extend this timer to 64 bit
static ROLLOVER_COUNT: AtomicU32 = AtomicU32::new(0);

#[cfg(feature = "extended")]
// For extended mode to work, we really need a u64 container. Double check this.
static_assertions::assert_type_eq_all!(EPContainer, u64);

/// DWT trace unit implementing [`EmbeddedProfiler`].
///
/// The frequency of the [`DWT`] is encoded using the parameter `FREQ`.
pub struct DwtProfiler<const FREQ: u32> {
    dwt: DWT,
}

impl<const FREQ: u32> DwtProfiler<FREQ> {
    /// Enable the [`DWT`] and provide a new [`EmbeddedProfiler`].
    ///
    /// Note that the `sysclk` parameter should come from e.g. the HAL's clock generation function
    /// so the real speed and the declared speed can be compared.
    ///
    /// # Panics
    /// asserts that the compile time constant `FREQ` matches the runtime provided `sysclk`
    pub fn new(dcb: &mut DCB, mut dwt: DWT, sysclk: u32) -> Self {
        assert!(FREQ == sysclk);

        // Enable the DWT block
        dcb.enable_trace();
        #[cfg(feature = "extended")]
        // Enable DebugMonitor exceptions to fire to track overflows
        unsafe {
            dcb.demcr.modify(|f| f | 1 << 16);
        }
        DWT::unlock();

        // reset cycle count and enable it to run
        unsafe { dwt.cyccnt.write(0) };
        dwt.enable_cycle_counter();

        Self { dwt }
    }

    /// binary GCD function stolen from wikipedia, made const
    const fn gcd(mut u: EPContainer, mut v: EPContainer) -> EPContainer {
        // Base cases: gcd(n, 0) = gcd(0, n) = n
        if u == 0 {
            return v;
        } else if v == 0 {
            return u;
        }

        // Using identities 2 and 3:
        // gcd(2ⁱ u, 2ʲ v) = 2ᵏ gcd(u, v) with u, v odd and k = min(i, j)
        // 2ᵏ is the greatest power of two that divides both u and v
        let i = u.trailing_zeros();
        u >>= i;
        let j = v.trailing_zeros();
        v >>= j;

        // min(i, j);
        let k = if i <= j { i } else { j };

        loop {
            // u and v are odd at the start of the loop
            // debug_assert!(u % 2 == 1, "u = {} is even", u);
            // debug_assert!(v % 2 == 1, "v = {} is even", v);

            // Swap if necessary so u <= v
            if u > v {
                // swap(&mut u, &mut v);
                let tmp = u;
                u = v;
                v = tmp;
            }

            // Using identity 4 (gcd(u, v) = gcd(|v-u|, min(u, v))
            v -= u;

            // Identity 1: gcd(u, 0) = u
            // The shift by k is necessary to add back the 2ᵏ factor that was removed before the loop
            if v == 0 {
                return u << k;
            }

            // Identity 3: gcd(u, 2ʲ v) = gcd(u, v) (u is known to be odd)
            v >>= v.trailing_zeros();
        }
    }

    /// Reduce the fraction we need to convert between 1µs precision and whatever our core clock is running at
    pub(crate) const fn reduced_fraction() -> (EPContainer, EPContainer) {
        let gcd = Self::gcd(1_000_000, FREQ as EPContainer) as EPContainer;
        (1_000_000 / gcd, FREQ as EPContainer / gcd)
    }
}

impl<const FREQ: u32> EmbeddedProfiler for DwtProfiler<FREQ> {
    fn read_clock(&self) -> EPInstant {
        // get the cycle count and add the rollover if we're extended
        #[allow(unused_mut)]
        let mut count = self.dwt.cyccnt.read() as EPContainer;
        #[cfg(feature = "extended")]
        {
            count +=
                ROLLOVER_COUNT.load(Ordering::Relaxed) as EPContainer * u32::MAX as EPContainer;
        }

        // convert count and return the instant
        let (red_num, red_denom) = Self::reduced_fraction();
        EPInstant::from_ticks(count * red_num / red_denom)
    }

    fn log_snapshot(&self, snapshot: &EPSnapshot) {
        log::info!("{}", snapshot);
    }
}

#[cfg(feature = "extended")]
#[exception]
#[allow(non_snake_case)]
fn DebugMonitor() {
    ROLLOVER_COUNT.fetch_add(1, Ordering::Relaxed);
}