/*! Benchmarks for `BitSlice::copy_from_bitslice`.

The `copy_from_bitslice` implementation attempts to detect slice conditions
that allow element-wise `memcpy` behavior, rather than falling back to
conservative bit-by-bit iteration, as element loads and stores are faster than
reading and writing each bit in an element individually. A short illustration
of the two copy paths follows the imports.
!*/

#![feature(maybe_uninit_uninit_array, maybe_uninit_slice)]

use std::mem::MaybeUninit;

use bitvec::{
	mem::{
		bits_of,
		elts,
	},
	prelude::*,
};
use criterion::{
	criterion_group,
	criterion_main,
	BenchmarkId,
	Criterion,
	SamplingMode,
	Throughput,
};
use tap::Tap;
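
//  A minimal sketch of the two copy paths compared below (an assumed summary,
//  not exercised by Criterion): bit-slices sharing both the `T: BitStore` and
//  `O: BitOrder` type parameters can use `copy_from_bitslice`, which is
//  eligible for element-wise acceleration, while mismatched orderings must go
//  through `clone_from_bitslice` and crawl each bit individually.
#[allow(dead_code)]
fn copy_path_sketch() {
	let src = [!0usize; 4];
	let mut dst = [0usize; 4];
	//  Identical type parameters on both sides: candidate for the fast path.
	dst.view_bits_mut::<Lsb0>()
		.copy_from_bitslice(src.view_bits::<Lsb0>());
	//  Different orderings: `clone_from_bitslice` converts bit by bit.
	dst.view_bits_mut::<Msb0>()
		.clone_from_bitslice(src.view_bits::<Lsb0>());
	assert_eq!(dst, [!0usize; 4]);
}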

//  One kibibyte
const KIBIBYTE: usize = 1024;
//  Some number of kibibytes
#[allow(clippy::identity_op)]
const FACTOR: usize = 1 * KIBIBYTE;

//  Scalars applied to FACTOR to get a range of action
const SCALARS: &[usize] = &[1, 2, 4, 8, 16, 24, 32, 40, 52, 64];
//  The maximum number of bits in a memory region.
const MAX_BITS: usize = 64 * FACTOR * 8;
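//  For context (assumed arithmetic): `elts::<T>(MAX_BITS)` rounds `MAX_BITS`
//  up to whole `T` elements, so on a 64-bit target the word buffers hold
//  64 KiB * 8 / 64 = 8192 `usize` slots and the byte buffers hold 65536 `u8`
//  slots.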

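/// Allocates a pair of equally-sized, uninitialized element buffers to serve
/// as the source and destination regions for the copy benchmarks. The buffers
/// are never written before use: only copy throughput is measured, so the
/// values themselves are irrelevant.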
fn make_slots<T, const LEN: usize>()
-> ([MaybeUninit<T>; LEN], [MaybeUninit<T>; LEN])
where T: BitStore {
	(
		MaybeUninit::<T>::uninit_array::<LEN>(),
		MaybeUninit::<T>::uninit_array::<LEN>(),
	)
}

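/// Asserts that the buffers produced by `make_slots` are initialized and
/// views them as ordinary shared/exclusive element slices, ready to be
/// wrapped in bit-slices. The data is never inspected, only copied, so the
/// indeterminate contents do not affect the measurements.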
fn view_slots<'a, 'b, T>(
	src: &'a [MaybeUninit<T>],
	dst: &'b mut [MaybeUninit<T>],
) -> (&'a [T], &'b mut [T])
where
	T: BitStore,
{
	unsafe {
		(
			MaybeUninit::slice_assume_init_ref(src),
			MaybeUninit::slice_assume_init_mut(dst),
		)
	}
}

pub fn benchmarks(crit: &mut Criterion) {
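	/// Yields, for each scalar, a `BenchmarkId` constructor, the region size
	/// in bits, and the byte-based `Throughput` reported to Criterion.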
	fn steps()
	-> impl Iterator<Item = (impl Fn(&'static str) -> BenchmarkId, usize, Throughput)>
	{
		SCALARS.iter().map(|&n| {
			(
				move |name| BenchmarkId::new(name, n),
				n * FACTOR * bits_of::<u8>(),
				Throughput::Bytes((n * FACTOR) as u64),
			)
		})
	}

	let (src_words, mut dst_words) =
		make_slots::<usize, { elts::<usize>(MAX_BITS) }>();
	let (src_bytes, mut dst_bytes) =
		make_slots::<u8, { elts::<u8>(MAX_BITS) }>();

	let (src_words, dst_words) = view_slots(&src_words, &mut dst_words);
	let (src_bytes, dst_bytes) = view_slots(&src_bytes, &mut dst_bytes);

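	//  Builds a Criterion benchmark group with flat sampling and an enlarged
	//  sample count, shared by all three measurement groups below.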
	macro_rules! mkgrp {
		($crit:ident, $name:literal) => {
			(&mut *$crit).benchmark_group($name).tap_mut(|grp| {
				grp.sampling_mode(SamplingMode::Flat).sample_size(2000);
			})
		};
	}

	let mut group = mkgrp!(crit, "Element-wise");
	for (id, bits, thrpt) in steps() {
		group.throughput(thrpt);
		let words = bits / bits_of::<usize>();
		let bytes = bits / bits_of::<u8>();

		let (src_words, dst_words) =
			(&src_words[.. words], &mut dst_words[.. words]);
		let (src_bytes, dst_bytes) =
			(&src_bytes[.. bytes], &mut dst_bytes[.. bytes]);

		//  Use the builtin memcpy to run the slices in bulk. This ought to be a
		//  lower bound on execution time.

		group.bench_function(id("words_plain"), |b| {
			let (src, dst) = (src_words, &mut *dst_words);
			b.iter(|| dst.copy_from_slice(src))
		});
		group.bench_function(id("bytes_plain"), |b| {
			let (src, dst) = (src_bytes, &mut *dst_bytes);
			b.iter(|| dst.copy_from_slice(src))
		});

		group.bench_function(id("words_manual"), |b| {
			let (src, dst) = (src_words, &mut *dst_words);
			b.iter(|| {
				for (from, to) in src.iter().zip(dst.iter_mut()) {
					*to = *from;
				}
			})
		});
		group.bench_function(id("bytes_manual"), |b| {
			let (src, dst) = (src_bytes, &mut *dst_bytes);
			b.iter(|| {
				for (from, to) in src.iter().zip(dst.iter_mut()) {
					*to = *from;
				}
			})
		});
	}
	group.finish();

	let mut group = mkgrp!(crit, "Bit-wise accelerated");
	for (id, bits, thrpt) in steps() {
		group.throughput(thrpt);
		let words = bits / bits_of::<usize>();
		let bytes = bits / bits_of::<u8>();

		let (src_words, dst_words) =
			(&src_words[.. words], &mut dst_words[.. words]);
		let (src_bytes, dst_bytes) =
			(&src_bytes[.. bytes], &mut dst_bytes[.. bytes]);

		//  Ideal bitwise memcpy: no edges, same typarams, fully aligned.

		group.bench_function(id("bits_words_plain"), |b| {
			let (src, dst) = (
				src_words.view_bits::<Lsb0>(),
				dst_words.view_bits_mut::<Lsb0>(),
			);
			b.iter(|| dst.copy_from_bitslice(src));
		});
		group.bench_function(id("bits_bytes_plain"), |b| {
			let (src, dst) = (
				src_bytes.view_bits::<Lsb0>(),
				dst_bytes.view_bits_mut::<Lsb0>(),
			);
			b.iter(|| dst.copy_from_bitslice(src));
		});

		//  Same typarams, src and dst aligned to each other, but with partial
		//  edge elements (10 bits trimmed from each end).

		group.bench_function(id("bits_words_edges"), |b| {
			let src = src_words.view_bits::<Lsb0>();
			let len = src.len();
			let (src, dst) = (
				&src[10 .. len - 10],
				&mut dst_words.view_bits_mut::<Lsb0>()[10 .. len - 10],
			);
			b.iter(|| dst.copy_from_bitslice(src));
		});
		group.bench_function(id("bits_bytes_edges"), |b| {
			let src = src_bytes.view_bits::<Lsb0>();
			let len = src.len();
			let (src, dst) = (
				&src[10 .. len - 10],
				&mut dst_bytes.view_bits_mut::<Lsb0>()[10 .. len - 10],
			);
			b.iter(|| dst.copy_from_bitslice(src));
		});

		//  Same typarams, misaligned.

		group.bench_function(id("bits_words_misalign"), |b| {
			let src = &src_words.view_bits::<Lsb0>()[10 ..];
			let dst = &mut dst_words.view_bits_mut::<Lsb0>()[.. src.len()];
			b.iter(|| dst.copy_from_bitslice(src));
		});
		group.bench_function(id("bits_bytes_misalign"), |b| {
			let src = &src_bytes.view_bits::<Lsb0>()[10 ..];
			let dst = &mut dst_bytes.view_bits_mut::<Lsb0>()[.. src.len()];
			b.iter(|| dst.copy_from_bitslice(src));
		});
	}
	group.finish();

	let mut group = mkgrp!(crit, "Bit-wise crawl");
	for (id, bits, thrpt) in steps() {
		group.throughput(thrpt);
		let words = bits / bits_of::<usize>();
		let bytes = bits / bits_of::<u8>();

		let (src_words, dst_words) =
			(&src_words[.. words], &mut dst_words[.. words]);
		let (src_bytes, dst_bytes) =
			(&src_bytes[.. bytes], &mut dst_bytes[.. bytes]);

		//  Mismatched type parameters

		group.bench_function(id("bits_words_mismatched"), |b| {
			let (src, dst) = (
				src_words.view_bits::<Msb0>(),
				dst_words.view_bits_mut::<Lsb0>(),
			);
			b.iter(|| dst.clone_from_bitslice(src));
		});
		group.bench_function(id("bits_bytes_mismatched"), |b| {
			let (src, dst) = (
				src_bytes.view_bits::<Msb0>(),
				dst_bytes.view_bits_mut::<Lsb0>(),
			);
			b.iter(|| dst.clone_from_bitslice(src));
		});

		//  Crawl each bit individually. This ought to be an upper bound on
		//  execution time.

		group.bench_function(id("bitwise_words"), |b| {
			let (src, dst) = (
				src_words.view_bits::<Lsb0>(),
				dst_words.view_bits_mut::<Lsb0>(),
			);
			b.iter(|| unsafe {
				for (from, to) in
					src.as_bitptr_range().zip(dst.as_mut_bitptr_range())
				{
					to.write(from.read());
				}
			})
		});
		group.bench_function(id("bitwise_bytes"), |b| {
			let (src, dst) = (
				src_bytes.view_bits::<Lsb0>(),
				dst_bytes.view_bits_mut::<Lsb0>(),
			);
			b.iter(|| unsafe {
				for (from, to) in
					src.as_bitptr_range().zip(dst.as_mut_bitptr_range())
				{
					to.write(from.read());
				}
			})
		});

		group.bench_function(id("bitwise_words_mismatch"), |b| {
			let (src, dst) = (
				src_words.view_bits::<Lsb0>(),
				dst_words.view_bits_mut::<Msb0>(),
			);
			b.iter(|| unsafe {
				for (from, to) in
					src.as_bitptr_range().zip(dst.as_mut_bitptr_range())
				{
					to.write(from.read());
				}
			})
		});
		group.bench_function(id("bitwise_bytes_mismatch"), |b| {
			let (src, dst) = (
				src_bytes.view_bits::<Msb0>(),
				dst_bytes.view_bits_mut::<Lsb0>(),
			);
			b.iter(|| unsafe {
				for (from, to) in
					src.as_bitptr_range().zip(dst.as_mut_bitptr_range())
				{
					to.write(from.read());
				}
			})
		});
	}
	group.finish();
}

criterion_group!(benches, benchmarks);
criterion_main!(benches);