codspeed_divan_compat_walltime/
alloc.rs

1use std::{alloc::*, fmt, ptr::NonNull};
2
3use cfg_if::cfg_if;
4
5use crate::{stats::StatsSet, util::sync::AtomicFlag};
6
7#[cfg(target_os = "macos")]
8use crate::util::{sync::CachePadded, thread::PThreadKey};
9
10#[cfg(not(target_os = "macos"))]
11use std::cell::UnsafeCell;
12
/// The `AllocProfiler` when running crate-internal tests.
///
/// This enables us to test it for:
/// - Undefined behavior with Miri
/// - Correctness when tallying
#[cfg(test)]
#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();

/// Whether to ignore allocation info set during the benchmark.
///
/// Set by internal benchmarks so that their own tally calls do not pollute
/// reported allocation statistics.
pub(crate) static IGNORE_ALLOC: AtomicFlag = AtomicFlag::new(false);
25/// Measures [`GlobalAlloc`] memory usage.
26///
27/// # Examples
28///
29/// The default usage is to create a
30/// [`#[global_allocator]`](macro@global_allocator) that wraps the [`System`]
31/// allocator with [`AllocProfiler::system()`]:
32///
33/// ```
34/// use std::collections::*;
35/// use divan::AllocProfiler;
36///
37/// #[global_allocator]
38/// static ALLOC: AllocProfiler = AllocProfiler::system();
39///
40/// fn main() {
41///     divan::main();
42/// }
43///
44/// #[divan::bench(types = [
45///     Vec<i32>,
46///     LinkedList<i32>,
47///     HashSet<i32>,
48/// ])]
49/// fn from_iter<T>() -> T
50/// where
51///     T: FromIterator<i32>,
52/// {
53///     (0..100).collect()
54/// }
55///
56/// #[divan::bench(types = [
57///     Vec<i32>,
58///     LinkedList<i32>,
59///     HashSet<i32>,
60/// ])]
61/// fn drop<T>(bencher: divan::Bencher)
62/// where
63///     T: FromIterator<i32>,
64/// {
65///     bencher
66///         .with_inputs(|| (0..100).collect::<T>())
67///         .bench_values(std::mem::drop);
68/// }
69/// ```
70///
71/// Wrap other [`GlobalAlloc`] implementations like
72/// [`mimalloc`](https://docs.rs/mimalloc) with [`AllocProfiler::new()`]:
73///
74/// ```
75/// use divan::AllocProfiler;
76/// use mimalloc::MiMalloc;
77///
78/// # #[cfg(not(miri))]
79/// #[global_allocator]
80/// static ALLOC: AllocProfiler<MiMalloc> = AllocProfiler::new(MiMalloc);
81/// ```
82///
83/// See [`string`](https://github.com/nvzqz/divan/blob/main/examples/benches/string.rs)
84/// and [`collections`](https://github.com/nvzqz/divan/blob/main/examples/benches/collections.rs)
85/// benchmarks for more examples.
86///
87/// # Implementation
88///
89/// Collecting allocation information happens at any point during which Divan is
90/// also measuring the time. As a result, counting allocations affects timing.
91///
92/// To reduce Divan's footprint during benchmarking:
93/// - Allocation information is recorded in thread-local storage to prevent
94///   contention when benchmarks involve multiple threads, either through
95///   options like [`threads`](macro@crate::bench#threads) or internally
96///   spawning their own threads.
97/// - It does not check for overflow and assumes it will not happen. This is
98///   subject to change in the future.
99/// - Fast thread-local storage access is assembly-optimized on macOS.
100///
101/// Allocation information is the only data Divan records outside of timing, and
102/// thus it also has the only code that affects timing. Steps for recording
103/// alloc info:
104/// 1. Load the thread-local slot for allocation information.
105///
106///    On macOS, this is via the
107///    [`gs`](https://github.com/nvzqz/divan/blob/v0.1.6/src/util/sync.rs#L34)/[`tpidrro_el0`](https://github.com/nvzqz/divan/blob/v0.1.6/src/util/sync.rs#L47)
108///    registers for
109///    [`pthread_getspecific`](https://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_getspecific.html).
110///    Although this is not guaranteed as stable ABI, in practice many programs
111///    assume these registers store thread-local data. [`thread_local!`] is used
112///    on all other platforms.
113///
114/// 2. Increment allocation operation invocation count and bytes count
115///    (a.k.a. size).
116///
117/// Allocation information is recorded in thread-local storage to prevent
118/// slowdowns from synchronized sharing when using multiple threads, through
119/// options like [`threads`](macro@crate::bench#threads).
120///
121/// Note that allocations in threads not controlled by Divan are not currently
122/// counted.
#[derive(Debug, Default)]
pub struct AllocProfiler<Alloc = System> {
    /// The wrapped allocator that actually services allocation requests.
    alloc: Alloc,
}
127
128unsafe impl<A: GlobalAlloc> GlobalAlloc for AllocProfiler<A> {
129    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
130        // Tally allocation count.
131        if let Some(mut info) = ThreadAllocInfo::try_current() {
132            // SAFETY: We have exclusive access.
133            let info = unsafe { info.as_mut() };
134
135            info.tally_alloc(layout.size());
136        };
137
138        self.alloc.alloc(layout)
139    }
140
141    unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
142        // Tally allocation count.
143        if let Some(mut info) = ThreadAllocInfo::try_current() {
144            // SAFETY: We have exclusive access.
145            let info = unsafe { info.as_mut() };
146
147            info.tally_alloc(layout.size());
148        };
149
150        self.alloc.alloc_zeroed(layout)
151    }
152
153    unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
154        // Tally reallocation count.
155        if let Some(mut info) = ThreadAllocInfo::try_current() {
156            // SAFETY: We have exclusive access.
157            let info = unsafe { info.as_mut() };
158
159            info.tally_realloc(layout.size(), new_size);
160        };
161
162        self.alloc.realloc(ptr, layout, new_size)
163    }
164
165    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
166        // Tally deallocation count.
167        if let Some(mut info) = ThreadAllocInfo::try_current() {
168            // SAFETY: We have exclusive access.
169            let info = unsafe { info.as_mut() };
170
171            info.tally_dealloc(layout.size());
172        };
173
174        self.alloc.dealloc(ptr, layout)
175    }
176}
177
178impl AllocProfiler {
179    /// Profiles the [`System`] allocator.
180    #[inline]
181    pub const fn system() -> Self {
182        Self::new(System)
183    }
184}
185
186impl<A> AllocProfiler<A> {
187    /// Profiles a [`GlobalAlloc`].
188    #[inline]
189    pub const fn new(alloc: A) -> Self {
190        Self { alloc }
191    }
192}
193
/// Thread-local allocation information.
#[derive(Clone, Default)]
#[repr(C)]
pub(crate) struct ThreadAllocInfo {
    // NOTE: `tallies` should be ordered first so that `tally_realloc` can
    // directly index `&self` without an offset.
    pub tallies: ThreadAllocTallyMap,

    // NOTE: Max size and count are signed for convenience but can never be
    // negative due to it being initialized to 0.
    //
    // PERF: Grouping current/max fields together by count and size makes
    // `tally_alloc` take the least time on M1 Mac.

    // Number of currently live allocations (+1 on alloc, -1 on dealloc).
    pub current_count: ThreadAllocCountSigned,
    // High-water mark of `current_count`.
    pub max_count: ThreadAllocCountSigned,
    // Bytes currently allocated (adjusted on alloc/dealloc/realloc).
    pub current_size: ThreadAllocCountSigned,
    // High-water mark of `current_size`.
    pub max_size: ThreadAllocCountSigned,
}
212
#[cfg(not(target_os = "macos"))]
thread_local! {
    /// Instance specific to the current thread.
    ///
    /// On macOS, we use `ALLOC_PTHREAD_KEY` instead.
    ///
    /// `const`-initialized so access compiles to a plain TLS load with no
    /// lazy-init branch.
    static CURRENT_THREAD_INFO: UnsafeCell<ThreadAllocInfo> = const {
        UnsafeCell::new(ThreadAllocInfo::new())
    };
}
222
/// macOS-only pthread key storing each thread's `ThreadAllocInfo` pointer;
/// used instead of `thread_local!` (cache-padded, presumably to avoid false
/// sharing with neighboring statics — see `CachePadded`).
#[cfg(target_os = "macos")]
static ALLOC_PTHREAD_KEY: CachePadded<PThreadKey<ThreadAllocInfo>> = CachePadded(PThreadKey::new());
225
impl ThreadAllocInfo {
    /// Creates an instance with all tallies, counts, and sizes set to 0.
    #[inline]
    pub const fn new() -> Self {
        Self {
            tallies: ThreadAllocTallyMap::new(),
            max_count: 0,
            current_count: 0,
            max_size: 0,
            current_size: 0,
        }
    }

    /// Returns the current thread's allocation information, initializing it on
    /// first access.
    ///
    /// Returns `None` if the thread is terminating and has thus deallocated its
    /// local instance.
    #[inline]
    pub fn current() -> Option<NonNull<Self>> {
        cfg_if! {
            if #[cfg(target_os = "macos")] {
                // On macOS the slot is lazily heap-allocated, so fall back
                // to the allocating slow path when not yet initialized.
                return Self::try_current().or_else(slow_impl);
            } else {
                // `thread_local!` handles lazy initialization itself.
                Self::try_current()
            }
        }

        // One-time (per-thread) allocation and registration of the pthread
        // slot; kept out-of-line and `#[cold]` so the fast path stays small.
        #[cfg(target_os = "macos")]
        #[cold]
        #[inline(never)]
        fn slow_impl() -> Option<NonNull<ThreadAllocInfo>> {
            unsafe {
                let layout = Layout::new::<ThreadAllocInfo>();

                // All-zero bytes form a valid `ThreadAllocInfo` (everything 0).
                let Some(info_alloc) = NonNull::new(unsafe { System.alloc_zeroed(layout) }) else {
                    handle_alloc_error(layout);
                };

                // Register a destructor that frees the slot on thread exit.
                let success = ALLOC_PTHREAD_KEY.0.set(info_alloc.as_ptr().cast(), |this| {
                    System.dealloc(this.as_ptr().cast(), Layout::new::<ThreadAllocInfo>());
                });

                // `set` failing indicates the key cannot store the value
                // (e.g. during thread teardown); free to avoid leaking.
                if !success {
                    System.dealloc(info_alloc.as_ptr(), layout);
                    return None;
                }

                // When using static thread local key, write directly because it
                // is undefined behavior to call `pthread_setspecific` with a
                // key that didn't originate from `pthread_key_create`.
                #[cfg(all(not(miri), not(feature = "dyn_thread_local"), target_arch = "x86_64"))]
                unsafe {
                    crate::util::thread::fast::set_static_thread_local(info_alloc.as_ptr());
                };

                Some(info_alloc.cast())
            }
        }
    }

    /// Returns the current thread's allocation information if initialized.
    ///
    /// Returns `None` if the instance has not yet been allocated or the thread
    /// is terminating and has thus deallocated its local instance.
    #[inline]
    pub fn try_current() -> Option<NonNull<Self>> {
        cfg_if! {
            if #[cfg(target_os = "macos")] {
                // Fast path: static thread local.
                #[cfg(all(
                    not(miri),
                    not(feature = "dyn_thread_local"),
                    target_arch = "x86_64",
                ))]
                return NonNull::new(unsafe {
                    crate::util::thread::fast::get_static_thread_local::<Self>().cast_mut()
                });

                // Fallback: `pthread_getspecific`-backed lookup.
                #[allow(unreachable_code)]
                ALLOC_PTHREAD_KEY.0.get()
            } else {
                // `try_with` returns `Err` once the thread-local has been
                // destroyed during thread teardown.
                CURRENT_THREAD_INFO.try_with(|info| unsafe {
                    NonNull::new_unchecked(info.get())
                }).ok()
            }
        }
    }

    /// Sets 0 to all values.
    pub fn clear(&mut self) {
        *self = Self::new();
    }

    /// Tallies the total count and size of the allocation operation.
    #[inline]
    pub fn tally_alloc(&mut self, size: usize) {
        self.tally_op(AllocOp::Alloc, size);

        // Track live allocation count/size and their high-water marks.
        self.current_count += 1;
        self.max_count = self.max_count.max(self.current_count);

        self.current_size += size as ThreadAllocCountSigned;
        self.max_size = self.max_size.max(self.current_size);
    }

    /// Tallies the total count and size of the deallocation operation.
    #[inline]
    pub fn tally_dealloc(&mut self, size: usize) {
        self.tally_op(AllocOp::Dealloc, size);

        // Deallocating only lowers the current values; maxima are unchanged.
        self.current_count -= 1;
        self.current_size -= size as ThreadAllocCountSigned;
    }

    /// Tallies the total count and size of the reallocation operation.
    #[inline]
    pub fn tally_realloc(&mut self, old_size: usize, new_size: usize) {
        // The borrow flag of the wrapping subtraction tells us whether this
        // is a shrink; reinterpreting the wrapped difference as `isize`
        // yields the signed size change (two's complement).
        let (diff, is_shrink) = new_size.overflowing_sub(old_size);
        let diff = diff as isize;
        let abs_diff = diff.wrapping_abs() as usize;

        self.tally_op(AllocOp::realloc(is_shrink), abs_diff);

        // NOTE: Realloc does not change allocation count.
        self.current_size += diff as ThreadAllocCountSigned;
        self.max_size = self.max_size.max(self.current_size);
    }

    /// Tallies the total count and size of the allocation operation.
    #[inline]
    fn tally_op(&mut self, op: AllocOp, size: usize) {
        let tally = self.tallies.get_mut(op);
        tally.count += 1;
        tally.size += size as ThreadAllocCount;
    }
}
362
/// Allocation numbers being accumulated.
///
/// # Memory Layout
///
/// Aligning to 16 nudges the compiler to emit aligned SIMD operations.
///
/// Placing `count` first generates less code on AArch64.
///
/// NOTE: `AllocTally::as_array` relies on this exact `#[repr(C)]` field
/// order (`count`, then `size`).
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[repr(C, align(16))]
pub(crate) struct AllocTally<Count> {
    /// The number of times this operation was performed.
    pub count: Count,

    /// The amount of memory this operation changed.
    pub size: Count,
}
379
/// Unsigned per-thread counter (`usize`/`u64` hybrid from `condtype`).
pub(crate) type ThreadAllocCount = condtype::num::Usize64;
/// Signed counterpart, used where values are adjusted up and down.
pub(crate) type ThreadAllocCountSigned = condtype::num::Isize64;

/// Tally recorded by a single thread.
pub(crate) type ThreadAllocTally = AllocTally<ThreadAllocCount>;

/// Tally summed across threads; widened to `u128` when accumulating totals.
pub(crate) type TotalAllocTally = AllocTally<u128>;
386
impl AllocTally<StatsSet<f64>> {
    /// Returns `true` if both the count and size stat sets are zero.
    pub fn is_zero(&self) -> bool {
        self.count.is_zero() && self.size.is_zero()
    }
}
392
impl<C> AllocTally<C> {
    /// Views the tally as a 2-element array: `[count, size]`.
    #[inline]
    pub fn as_array(&self) -> &[C; 2] {
        // SAFETY: This is `#[repr(C)]`, so we can treat it as a contiguous
        // sequence of items. The reference only covers the two `C` fields,
        // so the trailing padding from `align(16)` is never read.
        unsafe { &*(self as *const _ as *const _) }
    }
}
401
/// Allocation number categories.
///
/// Note that grow/shrink are first to improve code generation for `realloc`.
#[derive(Clone, Copy, PartialEq, Eq)]
pub(crate) enum AllocOp {
    Grow,
    Shrink,
    Alloc,
    Dealloc,
}

impl AllocOp {
    /// Every operation, in declaration order so `op as usize` indexes it.
    pub const ALL: [Self; 4] = {
        use AllocOp::*;
        [Grow, Shrink, Alloc, Dealloc]
    };

    /// Returns the reallocation category for whether the block shrank.
    #[inline]
    pub fn realloc(shrink: bool) -> Self {
        // Generates the same code as `std::mem::transmute` of the flag.
        match shrink {
            true => Self::Shrink,
            false => Self::Grow,
        }
    }

    /// Returns the operation's lowercase display name.
    #[inline]
    pub fn name(self) -> &'static str {
        match self {
            AllocOp::Grow => "grow",
            AllocOp::Shrink => "shrink",
            AllocOp::Alloc => "alloc",
            AllocOp::Dealloc => "dealloc",
        }
    }

    /// Returns the operation's name followed by a colon, for labeled output.
    #[inline]
    pub fn prefix(self) -> &'static str {
        match self {
            AllocOp::Grow => "grow:",
            AllocOp::Shrink => "shrink:",
            AllocOp::Alloc => "alloc:",
            AllocOp::Dealloc => "dealloc:",
        }
    }
}
451
/// Values keyed by `AllocOp`.
#[derive(Clone, Copy, Default, PartialEq, Eq)]
pub(crate) struct AllocOpMap<T> {
    // Indexed by `op as usize`; order matches `AllocOp::ALL`.
    pub values: [T; 4],
}
457
/// Per-thread tallies for each allocation operation.
pub(crate) type ThreadAllocTallyMap = AllocOpMap<ThreadAllocTally>;

/// Cross-thread totals for each allocation operation.
pub(crate) type TotalAllocTallyMap = AllocOpMap<TotalAllocTally>;
461
462impl<T: fmt::Debug> fmt::Debug for AllocOpMap<T> {
463    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
464        f.debug_map().entries(AllocOp::ALL.iter().map(|&op| (op.name(), self.get(op)))).finish()
465    }
466}
467
468impl ThreadAllocTallyMap {
469    #[inline]
470    pub const fn new() -> Self {
471        unsafe { std::mem::transmute([0u8; size_of::<Self>()]) }
472    }
473
474    /// Returns `true` if all tallies are 0.
475    #[inline]
476    pub fn is_empty(&self) -> bool {
477        self.values.iter().all(|tally| tally.count == 0 && tally.size == 0)
478    }
479
480    pub fn add_to_total(&self, total: &mut TotalAllocTallyMap) {
481        for (i, value) in self.values.iter().enumerate() {
482            total.values[i].count += value.count as u128;
483            total.values[i].size += value.size as u128;
484        }
485    }
486}
487
488impl<T> AllocOpMap<T> {
489    #[cfg(test)]
490    pub fn from_fn<F>(f: F) -> Self
491    where
492        F: FnMut(AllocOp) -> T,
493    {
494        Self { values: AllocOp::ALL.map(f) }
495    }
496
497    #[inline]
498    pub const fn get(&self, op: AllocOp) -> &T {
499        &self.values[op as usize]
500    }
501
502    #[inline]
503    pub fn get_mut(&mut self, op: AllocOp) -> &mut T {
504        &mut self.values[op as usize]
505    }
506}
507
#[cfg(feature = "internal_benches")]
mod benches {
    use super::*;

    // We want the approach to scale well with thread count.
    const THREADS: &[usize] = &[0, 1, 2, 4, 16];

    #[crate::bench(crate = crate, threads = THREADS)]
    fn tally_alloc(bencher: crate::Bencher) {
        // Keep these tally calls out of the reported allocation stats.
        IGNORE_ALLOC.set(true);

        // Using 0 simulates tallying without affecting benchmark reporting.
        let size = crate::black_box(0);

        bencher.bench(|| {
            if let Some(mut info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                let info = unsafe { info.as_mut() };

                info.tally_alloc(size);
            }
        })
    }

    #[crate::bench(crate = crate, threads = THREADS)]
    fn tally_dealloc(bencher: crate::Bencher) {
        // Keep these tally calls out of the reported allocation stats.
        IGNORE_ALLOC.set(true);

        // Using 0 simulates tallying without affecting benchmark reporting.
        let size = crate::black_box(0);

        bencher.bench(|| {
            if let Some(mut info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                let info = unsafe { info.as_mut() };

                info.tally_dealloc(size);
            }
        })
    }

    #[crate::bench(crate = crate, threads = THREADS)]
    fn tally_realloc(bencher: crate::Bencher) {
        // Keep these tally calls out of the reported allocation stats.
        IGNORE_ALLOC.set(true);

        // Using 0 simulates tallying without affecting benchmark reporting.
        let new_size = crate::black_box(0);
        let old_size = crate::black_box(0);

        bencher.bench(|| {
            if let Some(mut info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                let info = unsafe { info.as_mut() };

                info.tally_realloc(old_size, new_size);
            }
        })
    }

    // Measures thread-local lookup: the initializing path vs. the
    // lookup-only fast path.
    #[crate::bench_group(crate = crate, threads = THREADS)]
    mod current {
        use super::*;

        #[crate::bench(crate = crate)]
        fn init() -> Option<NonNull<ThreadAllocInfo>> {
            ThreadAllocInfo::current()
        }

        #[crate::bench(crate = crate)]
        fn r#try() -> Option<NonNull<ThreadAllocInfo>> {
            ThreadAllocInfo::try_current()
        }
    }
}
582
#[cfg(test)]
mod tests {
    use super::*;

    /// Tests that `AllocProfiler` is counting correctly.
    ///
    /// NOTE: This test is sensitive to the exact allocations it performs;
    /// any added allocation (e.g. formatting a string) would perturb the
    /// tallies being asserted.
    #[test]
    fn tally() {
        // Initialize the thread's alloc info.
        //
        // SAFETY: This cannot be kept as a reference and is instead a raw
        // pointer because a reference would cause undefined behavior when
        // `AllocProfiler` attempts to update tallies.
        let mut alloc_info = ThreadAllocInfo::current().unwrap();

        // Resets the allocation tallies and returns the previous tallies.
        let mut take_alloc_tallies = || std::mem::take(unsafe { &mut alloc_info.as_mut().tallies });

        // Start fresh.
        _ = take_alloc_tallies();

        // Helper to create `ThreadAllocTallyMap` since each operation only
        // changes `buf` by 1 `i32`.
        let item_tally = ThreadAllocTally { count: 1, size: size_of::<i32>() as _ };
        let make_tally_map = |op: AllocOp| {
            ThreadAllocTallyMap::from_fn(|other_op| {
                if other_op == op {
                    item_tally
                } else {
                    Default::default()
                }
            })
        };

        // Test zero. (`Vec::new` performs no allocation.)
        let mut buf: Vec<i32> = Vec::new();
        assert_eq!(take_alloc_tallies(), Default::default());

        // Test allocation.
        buf.reserve_exact(1);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Alloc));

        // Test grow.
        buf.reserve_exact(2);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Grow));

        // Test shrink.
        buf.shrink_to(1);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Shrink));

        // Test dealloc.
        drop(buf);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Dealloc));

        // Test all of the above together.
        let mut buf: Vec<i32> = Vec::new();
        buf.reserve_exact(1); // alloc
        buf.reserve_exact(2); // grow
        buf.shrink_to(1); // shrink
        drop(buf); // dealloc
        assert_eq!(take_alloc_tallies(), ThreadAllocTallyMap { values: [item_tally; 4] });
    }
}