#![feature(alloc)]
#![feature(allocator_api)]
extern crate alloc;
extern crate bagpipe;
extern crate elfmalloc;
extern crate num_cpus;
use std::marker;
use alloc::heap;
use std::mem;
use std::thread;
use std::time;
use std::ptr::write_volatile;
use elfmalloc::slag::{AllocBuilder, LocalAllocator, MagazineAllocator};
use elfmalloc::general::global;
use elfmalloc::general::DynamicAllocator;
use std::sync::{Arc, Barrier};
use std::sync::atomic::{AtomicPtr, Ordering};
/// Object type allocated and freed by every benchmark: an array of two `usize`s.
type BenchItem = [usize; 2];
/// Value handed to `AllocBuilder::page_size` when building the slag allocators.
const PAGE_SIZE: usize = 32 * 1024;
/// Value handed to `AllocBuilder::eager_decommit_threshold` when building the
/// slag allocators.
const EAGER_DECOMMIT: usize = 30 * 1024;
/// Minimal allocator interface that lets the benchmarks in this file run
/// unchanged against every allocator flavor under test.
///
/// Implementors must be `Clone + Send`: each benchmark builds one handle with
/// `create`, clones it once per worker thread, and moves the clone into that
/// thread.
trait AllocLike: Clone + Send {
    /// Type of object this allocator hands out.
    type Item;

    /// Constructs a fresh allocator handle.
    fn create() -> Self;

    /// Allocates storage for one `Item` and returns a pointer to it.
    ///
    /// # Safety
    ///
    /// The trait places no requirements on the returned memory's contents;
    /// callers should treat it as uninitialized.
    unsafe fn allocate(&mut self) -> *mut Self::Item;

    /// Hands `item` back to the allocator.
    ///
    /// # Safety
    ///
    /// NOTE(review): `item` is presumably required to be a pointer previously
    /// returned by `allocate` on a compatible handle and not yet freed; the
    /// exact rules are defined by each implementor.
    // The parameter is named (rather than anonymous) because anonymous trait
    // parameters are deprecated and rejected by newer editions.
    unsafe fn deallocate(&mut self, item: *mut Self::Item);

    /// Optional teardown hook; the benchmarks call it on the original handle
    /// after all worker threads have been joined. Does nothing by default.
    fn kill(&mut self) {}
}
/// Adapts elfmalloc's `MagazineAllocator` to the benchmark interface.
impl<T: 'static> AllocLike for MagazineAllocator<T> {
    type Item = T;

    /// Builds a magazine allocator with a cutoff factor of 0.8 and this file's
    /// `PAGE_SIZE` / `EAGER_DECOMMIT` settings.
    fn create() -> Self {
        AllocBuilder::default()
            .cutoff_factor(0.8)
            .page_size(PAGE_SIZE)
            .eager_decommit_threshold(EAGER_DECOMMIT)
            .build_magazine()
    }

    /// Forwards to the allocator's own `alloc` method.
    unsafe fn allocate(&mut self) -> *mut Self::Item {
        self.alloc()
    }

    /// Forwards to the allocator's own `free` method.
    unsafe fn deallocate(&mut self, ptr: *mut Self::Item) {
        self.free(ptr)
    }

    fn kill(&mut self) {
        // Nothing to tear down for this allocator.
    }
}
/// Zero-sized benchmark handle whose `AllocLike` impl forwards to the free
/// functions in `elfmalloc::general::global`.
///
/// `T` only records which item type the handle serves; no `T` is stored.
struct ElfGlobal<T>(marker::PhantomData<T>);

// Implemented by hand rather than derived so that cloning does not demand
// `T: Clone`.
impl<T> Clone for ElfGlobal<T> {
    fn clone(&self) -> Self {
        // `PhantomData` is `Copy`, so the marker field can simply be copied.
        ElfGlobal(self.0)
    }
}

// SAFETY: the handle's only field is a `PhantomData` marker, so moving it to
// another thread moves no data.
unsafe impl<T> Send for ElfGlobal<T> {}
/// Benchmark adapter that routes every request through
/// `elfmalloc::general::global`.
impl<T: 'static> AllocLike for ElfGlobal<T> {
    type Item = T;

    /// The handle carries no state, so creation just builds the marker.
    fn create() -> Self {
        ElfGlobal(marker::PhantomData)
    }

    /// Requests `size_of::<T>()` bytes from `global::alloc`.
    unsafe fn allocate(&mut self) -> *mut Self::Item {
        let bytes = mem::size_of::<T>();
        global::alloc(bytes) as *mut T
    }

    /// Passes the pointer, cast to `*mut u8`, to `global::free`.
    unsafe fn deallocate(&mut self, ptr: *mut Self::Item) {
        let raw = ptr as *mut u8;
        global::free(raw)
    }

    fn kill(&mut self) {
        // Nothing to tear down for this handle.
    }
}
/// Benchmark handle wrapping an elfmalloc `DynamicAllocator`, tagged with the
/// item type `T` it serves (no `T` is stored).
struct ElfClone<T>(DynamicAllocator, marker::PhantomData<T>);

// Implemented by hand rather than derived so that cloning does not demand
// `T: Clone`.
impl<T> Clone for ElfClone<T> {
    fn clone(&self) -> Self {
        let inner = self.0.clone();
        // `PhantomData` is `Copy`, so the marker can be copied directly.
        ElfClone(inner, self.1)
    }
}

// NOTE(review): this asserts that a `DynamicAllocator` handle may be moved to
// another thread; that guarantee comes from elfmalloc, not from this file —
// confirm against the crate.
unsafe impl<T> Send for ElfClone<T> {}
/// Benchmark adapter that routes every request through the wrapped
/// `DynamicAllocator`.
impl<T: 'static> AllocLike for ElfClone<T> {
    type Item = T;

    /// Wraps a freshly constructed `DynamicAllocator`.
    fn create() -> Self {
        let inner = DynamicAllocator::new();
        ElfClone(inner, marker::PhantomData)
    }

    /// Requests `size_of::<T>()` bytes from the wrapped allocator.
    unsafe fn allocate(&mut self) -> *mut Self::Item {
        let bytes = mem::size_of::<T>();
        self.0.alloc(bytes) as *mut T
    }

    /// Passes the pointer, cast to `*mut u8`, to the wrapped allocator's `free`.
    unsafe fn deallocate(&mut self, ptr: *mut Self::Item) {
        let raw = ptr as *mut u8;
        self.0.free(raw)
    }

    fn kill(&mut self) {
        // Nothing to tear down for this handle.
    }
}
/// Adapts elfmalloc's `LocalAllocator` to the benchmark interface.
impl<T: 'static> AllocLike for LocalAllocator<T> {
    type Item = T;

    /// Builds a local allocator with a cutoff factor of 0.8 and this file's
    /// `PAGE_SIZE` / `EAGER_DECOMMIT` settings.
    fn create() -> Self {
        AllocBuilder::default()
            .cutoff_factor(0.8)
            .page_size(PAGE_SIZE)
            .eager_decommit_threshold(EAGER_DECOMMIT)
            .build_local()
    }

    /// Forwards to the allocator's own `alloc` method.
    unsafe fn allocate(&mut self) -> *mut Self::Item {
        self.alloc()
    }

    /// Forwards to the allocator's own `free` method.
    unsafe fn deallocate(&mut self, ptr: *mut Self::Item) {
        self.free(ptr)
    }

    fn kill(&mut self) {
        // Nothing to tear down for this allocator.
    }
}
/// Zero-sized benchmark handle whose `AllocLike` impl goes through
/// `heap::Heap`.
///
/// `T` only records which item type the handle serves; no `T` is stored.
struct DefaultMalloc<T>(marker::PhantomData<T>);

// SAFETY: the handle's only field is a `PhantomData` marker, so moving it to
// another thread moves no data.
unsafe impl<T> Send for DefaultMalloc<T> {}

// Implemented by hand rather than derived so that cloning does not demand
// `T: Clone`.
impl<T> Clone for DefaultMalloc<T> {
    fn clone(&self) -> Self {
        // `PhantomData` is `Copy`, so the marker field can simply be copied.
        DefaultMalloc(self.0)
    }
}
impl<T> AllocLike for DefaultMalloc<T> {
type Item = T;
fn create() -> Self {
DefaultMalloc(marker::PhantomData)
}
unsafe fn allocate(&mut self) -> *mut T {
use heap::{Alloc, Layout};
heap::Heap.alloc(Layout::from_size_align(mem::size_of::<T>(), 8).unwrap())
.unwrap() as *mut T
}
unsafe fn deallocate(&mut self, item: *mut T) {
use heap::{Alloc, Layout};
heap::Heap.dealloc(item as *mut u8,
Layout::from_size_align(mem::size_of::<T>(), 8).unwrap());
}
}
/// Evaluates `$body` twice and yields, as a `u64`, the number of nanoseconds
/// the *second* evaluation took.
///
/// The first evaluation is not timed (presumably a warm-up pass). Because the
/// expression is expanded twice, it must not move out of anything it captures.
macro_rules! time_block {
    ($body:expr) => {{
        // Untimed first pass.
        $body;
        let begin = time::Instant::now();
        $body;
        let elapsed = begin.elapsed();
        // Whole seconds scaled to nanoseconds, plus the sub-second remainder.
        elapsed.as_secs() * 1_000_000_000 + u64::from(elapsed.subsec_nanos())
    }};
}
/// Evaluates `$body` exactly once and yields, as a `u64`, the number of
/// nanoseconds that evaluation took.
///
/// Since the expression is expanded only once, it may consume (move out of)
/// values it captures.
macro_rules! time_block_once {
    ($body:expr) => {{
        let begin = time::Instant::now();
        $body;
        let elapsed = begin.elapsed();
        // Whole seconds scaled to nanoseconds, plus the sub-second remainder.
        elapsed.as_secs() * 1_000_000_000 + u64::from(elapsed.subsec_nanos())
    }};
}
/// Benchmarks back-to-back allocate/free pairs.
///
/// Spawns `nthreads` workers, each with its own clone of the allocator. Once
/// released by a shared barrier, a worker allocates one item, writes to it,
/// and frees it again, `per_thread` times; `time_block!` runs that loop twice
/// and times the second run.
///
/// Prints the aggregate throughput in Mops/s. `total` is the *sum* of the
/// per-worker nanosecond timings, so the mean time is `total / nthreads`; with
/// `nthreads * per_thread * 2` operations overall this gives the
/// `nthreads * nthreads` factor, and `1_000` converts ops/ns to Mops/s.
fn bench_alloc_free_pairs<A: AllocLike<Item = BenchItem> + 'static>(
    nthreads: usize,
    per_thread: usize,
) {
    let mut root = A::create();
    // One extra participant: the coordinating thread also waits on the barrier.
    let start_gate = Arc::new(Barrier::new(nthreads + 1));
    let workers: Vec<_> = (0..nthreads)
        .map(|_| {
            let mut alloc = root.clone();
            let gate = start_gate.clone();
            thread::spawn(move || {
                gate.wait();
                time_block!(unsafe {
                    for i in 0..per_thread {
                        let ptr = alloc.allocate();
                        // Volatile store so the write to the fresh object is
                        // not optimized away.
                        write_volatile(ptr as *mut usize, i);
                        alloc.deallocate(ptr);
                    }
                })
            })
        })
        .collect();
    start_gate.wait();
    // Sum of every worker's elapsed nanoseconds.
    let total: u64 = workers.into_iter().map(|w| w.join().unwrap()).sum();
    println!("{} Mops/s",
             ((nthreads * nthreads * per_thread * 2 * 1_000) as f64) / (total as f64));
    root.kill();
}
/// Benchmarks allocate/free pairs against a standing set of live objects.
///
/// Each of the `nthreads` workers first allocates 64Ki objects (untimed). Once
/// released by a shared barrier it walks that buffer round-robin for
/// `per_thread` steps, freeing the object in the current slot and allocating a
/// replacement; `time_block!` runs that loop twice and times the second run.
/// After timing, every object still held in the buffer is returned to the
/// allocator so the live set does not leak into later benchmarks.
///
/// Prints the aggregate throughput in Mops/s. `total` is the *sum* of the
/// per-worker nanosecond timings, hence the `nthreads * nthreads` factor;
/// each step counts as two operations and `1_000` converts ops/ns to Mops/s.
fn bench_alloc_free_pairs_buffered<A: AllocLike<Item = BenchItem> + 'static>(
    nthreads: usize,
    per_thread: usize,
) {
    let mut a = A::create();
    // One extra participant: the coordinating thread also waits on the barrier.
    let b = Arc::new(Barrier::new(nthreads + 1));
    let mut threads = Vec::with_capacity(nthreads);
    for _ in 0..nthreads {
        let mut alloc = a.clone();
        let barrier = b.clone();
        threads.push(thread::spawn(move || {
            // Number of objects kept live while the timed loop runs.
            const LIVE_SET: usize = 64 * 1024;
            let mut ptrs = Vec::with_capacity(LIVE_SET);
            for _ in 0..LIVE_SET {
                ptrs.push(unsafe { alloc.allocate() });
            }
            barrier.wait();
            let elapsed = time_block!(unsafe {
                for i in 0..per_thread {
                    let idx = i % ptrs.len();
                    // `idx < ptrs.len()` by construction of the modulo above.
                    let ptr = ptrs.get_unchecked_mut(idx);
                    alloc.deallocate(*ptr);
                    *ptr = alloc.allocate();
                    write_volatile(*ptr as *mut u8, i as u8);
                }
            });
            // Every slot holds a live allocation at this point; release them
            // outside the timed region.
            for ptr in ptrs {
                unsafe { alloc.deallocate(ptr) };
            }
            elapsed
        }));
    }
    b.wait();
    let mut total = 0;
    for i in threads {
        total += i.join().unwrap();
    }
    println!("{} Mops/s",
             ((nthreads * nthreads * per_thread * 2 * 1_000) as f64) / (total as f64));
    a.kill();
}
/// Benchmarks freeing objects that a *different* thread allocated.
///
/// Every worker `t` first fills its own hand-off vector with `per_thread`
/// freshly allocated objects (untimed). After all workers pass the shared
/// barrier, worker `t` frees the objects published by worker
/// `(t + 1) % nthreads`; only that free loop is timed.
///
/// The hand-off vectors are heap-allocated and shared through raw pointers;
/// they are reclaimed once every worker has been joined.
///
/// Prints the aggregate throughput in Mops/s. `total` is the *sum* of the
/// per-worker nanosecond timings, hence the `nthreads * nthreads` factor;
/// `1_000` converts ops/ns to Mops/s.
fn bench_prod_cons<A: AllocLike<Item = BenchItem> + 'static>(nthreads: usize, per_thread: usize) {
    let mut a = A::create();
    // One extra participant: the coordinating thread also waits on the barrier.
    let b = Arc::new(Barrier::new(nthreads + 1));
    let mut v_base = Vec::with_capacity(nthreads);
    for _ in 0..nthreads {
        let slot: Box<Vec<AtomicPtr<BenchItem>>> = Box::new(Vec::with_capacity(per_thread));
        v_base.push(AtomicPtr::new(Box::into_raw(slot)));
    }
    let v = Arc::new(v_base);
    let mut threads = Vec::with_capacity(nthreads);
    for t in 0..nthreads {
        let mut alloc = a.clone();
        let barrier = b.clone();
        let io_vec = v.clone();
        threads.push(thread::spawn(move || {
            let (me, them) = (t, (t + 1) % nthreads);
            unsafe {
                // Before the barrier each worker touches only its own vector,
                // so this is the sole reference to it.
                let my_v = io_vec[me].load(Ordering::Acquire).as_mut().unwrap();
                for i in 0..per_thread {
                    let ptr = alloc.allocate();
                    write_volatile(ptr as *mut usize, i);
                    my_v.push(AtomicPtr::new(ptr));
                }
            }
            // All producers have finished publishing once the barrier opens.
            barrier.wait();
            time_block_once!(unsafe {
                for i in 0..per_thread {
                    let ptr = io_vec[them].load(Ordering::Acquire).as_ref().unwrap()[i]
                        .load(Ordering::Relaxed);
                    alloc.deallocate(ptr);
                }
            })
        }));
    }
    b.wait();
    let mut total = 0;
    for i in threads {
        total += i.join().unwrap();
    }
    // Reclaim the hand-off vectors.
    // SAFETY: every worker has been joined, so nothing else references these
    // vectors; each pointer came from `Box::into_raw` above and is freed
    // exactly once here.
    for slot in v.iter() {
        unsafe {
            drop(Box::from_raw(slot.load(Ordering::Acquire)));
        }
    }
    println!("{} Mops/s",
             ((nthreads * nthreads * per_thread * 1_000) as f64) / (total as f64));
    a.kill();
}
/// Benchmarks a bulk allocate phase followed by a bulk free phase on the same
/// thread.
///
/// Each of the `nthreads` workers, once released by a shared barrier, times a
/// single pass that allocates `per_thread` objects (writing to each) and then
/// frees all of them.
///
/// Prints the aggregate throughput in Mops/s. `total` is the *sum* of the
/// per-worker nanosecond timings, hence the `nthreads * nthreads` factor;
/// each object counts as two operations and `1_000` converts ops/ns to Mops/s.
fn bench_alloc_free<A: AllocLike<Item = BenchItem> + 'static>(nthreads: usize, per_thread: usize) {
    let mut a = A::create();
    // One extra participant: the coordinating thread also waits on the barrier.
    let b = Arc::new(Barrier::new(nthreads + 1));
    let mut threads = Vec::with_capacity(nthreads);
    for _ in 0..nthreads {
        let mut alloc = a.clone();
        let barrier = b.clone();
        threads.push(thread::spawn(move || {
            barrier.wait();
            // Exactly `per_thread` pointers are pushed, so reserve exactly that
            // many slots (matching `bench_alloc` and `bench_free`); the buffer
            // never has to grow inside the timed region.
            let mut ptrs = Vec::with_capacity(per_thread);
            time_block_once!(unsafe {
                for i in 0..per_thread {
                    let ptr = alloc.allocate();
                    write_volatile(ptr as *mut usize, i);
                    ptrs.push(ptr);
                }
                for ptr in ptrs {
                    alloc.deallocate(ptr);
                }
            })
        }));
    }
    b.wait();
    let mut total = 0;
    for i in threads {
        total += i.join().unwrap();
    }
    println!("{} Mops/s",
             ((nthreads * nthreads * per_thread * 2 * 1_000) as f64) / (total as f64));
    a.kill()
}
/// Benchmarks allocation only.
///
/// Each of the `nthreads` workers, once released by a shared barrier, times a
/// single pass that allocates `per_thread` objects and writes to each. The
/// objects are freed afterwards, outside the timed region.
///
/// Prints the aggregate throughput in Mops/s. `total` is the *sum* of the
/// per-worker nanosecond timings, hence the `nthreads * nthreads` factor;
/// `1_000` converts ops/ns to Mops/s.
fn bench_alloc<A: AllocLike<Item = BenchItem> + 'static>(nthreads: usize, per_thread: usize) {
    let mut root = A::create();
    // One extra participant: the coordinating thread also waits on the barrier.
    let start_gate = Arc::new(Barrier::new(nthreads + 1));
    let workers: Vec<_> = (0..nthreads)
        .map(|_| {
            let mut alloc = root.clone();
            let gate = start_gate.clone();
            thread::spawn(move || {
                gate.wait();
                let mut live = Vec::with_capacity(per_thread);
                let elapsed = time_block_once!(unsafe {
                    for i in 0..per_thread {
                        let ptr = alloc.allocate();
                        write_volatile(ptr as *mut usize, i);
                        live.push(ptr);
                    }
                });
                // Untimed cleanup.
                for ptr in live {
                    unsafe { alloc.deallocate(ptr) };
                }
                elapsed
            })
        })
        .collect();
    start_gate.wait();
    // Sum of every worker's elapsed nanoseconds.
    let total: u64 = workers.into_iter().map(|w| w.join().unwrap()).sum();
    println!("{} Mops/s",
             ((nthreads * nthreads * per_thread * 1_000) as f64) / (total as f64));
    root.kill()
}
/// Benchmarks freeing only.
///
/// Each of the `nthreads` workers, once released by a shared barrier,
/// allocates `per_thread` objects (untimed, writing to each) and then times a
/// single pass that frees all of them.
///
/// Prints the aggregate throughput in Mops/s. `total` is the *sum* of the
/// per-worker nanosecond timings, hence the `nthreads * nthreads` factor;
/// `1_000` converts ops/ns to Mops/s.
fn bench_free<A: AllocLike<Item = BenchItem> + 'static>(nthreads: usize, per_thread: usize) {
    let mut root = A::create();
    // One extra participant: the coordinating thread also waits on the barrier.
    let start_gate = Arc::new(Barrier::new(nthreads + 1));
    let workers: Vec<_> = (0..nthreads)
        .map(|_| {
            let mut alloc = root.clone();
            let gate = start_gate.clone();
            thread::spawn(move || {
                gate.wait();
                let mut live = Vec::with_capacity(per_thread);
                // Untimed setup: build the set of objects to be freed.
                unsafe {
                    for i in 0..per_thread {
                        let ptr = alloc.allocate();
                        write_volatile(ptr as *mut usize, i);
                        live.push(ptr);
                    }
                }
                time_block_once!(unsafe {
                    for ptr in live {
                        alloc.deallocate(ptr);
                    }
                })
            })
        })
        .collect();
    start_gate.wait();
    // Sum of every worker's elapsed nanoseconds.
    let total: u64 = workers.into_iter().map(|w| w.join().unwrap()).sum();
    println!("{} Mops/s",
             ((nthreads * nthreads * per_thread * 1_000) as f64) / (total as f64));
    root.kill()
}
/// Runs the generic benchmark function `$bench_fn` once for each of the four
/// elfmalloc-backed allocator flavors, printing a label before each run.
///
/// Expands to a sequence of statements, so it must be invoked in statement
/// position.
macro_rules! run_bench_inner {
    ($bench_fn:tt, $thread_count:expr, $iter_count:expr) => {
        // Bind each argument once so the expressions are not re-evaluated for
        // every allocator.
        let per_thread = $iter_count;
        let workers = $thread_count;

        println!("global slag allocator");
        $bench_fn::<ElfGlobal<BenchItem>>(workers, per_thread);

        println!("clone-based slag allocator");
        $bench_fn::<ElfClone<BenchItem>>(workers, per_thread);

        println!("slag allocator");
        $bench_fn::<LocalAllocator<BenchItem>>(workers, per_thread);

        println!("slagazine allocator");
        $bench_fn::<MagazineAllocator<BenchItem>>(workers, per_thread);
    };
}
/// Prints a heading and runs a benchmark across all allocator flavors via
/// `run_bench_inner!`.
///
/// * `threads` — one run with `$nthreads` worker threads.
/// * `both` — a single-threaded run first, then the same as `threads`.
macro_rules! run_bench {
    (threads $desc:expr, $bench:tt, $nthreads:expr, $iters:expr) => {
        println!("\n{} - {} threads", $desc, $nthreads);
        run_bench_inner!($bench, $nthreads, $iters);
    };
    (both $desc:expr, $bench:tt, $nthreads:expr, $iters:expr) => {
        println!("\n{} - {}", $desc, "single-threaded");
        run_bench_inner!($bench, 1, $iters);
        // The multi-threaded half is exactly the `threads` form.
        run_bench!(threads $desc, $bench, $nthreads, $iters);
    };
}
/// Entry point: runs every benchmark with one worker per CPU reported by
/// `num_cpus::get()` and, for the `both` runs, single-threaded first.
fn main() {
    // Number of objects each worker thread handles per benchmark.
    const ITERS: usize = 1_000_000;
    let cpus = num_cpus::get();
    let bytes_per_thread = ITERS * mem::size_of::<BenchItem>();
    println!("allocating {} bytes per thread", bytes_per_thread);

    run_bench!(both "alloc/free pairs", bench_alloc_free_pairs, cpus, ITERS);
    run_bench!(both "buffered alloc/free pairs", bench_alloc_free_pairs_buffered, cpus, ITERS);
    run_bench!(both "alloc (thread-local)", bench_alloc, cpus, ITERS);
    run_bench!(both "free (thread-local)", bench_free, cpus, ITERS);
    run_bench!(both "alloc & free (thread-local)", bench_alloc_free, cpus, ITERS);
    // Freeing another thread's objects needs at least two threads to be
    // meaningful, so there is no single-threaded run here.
    run_bench!(threads "free (producer-consumer)", bench_prod_cons, cpus, ITERS);
}