use aligned_vec::{AVec, avec};
use criterion::{Criterion, criterion_group, criterion_main};
use itertools::Itertools;
// Hand-written kernels that exist only so the benchmarks below can compare
// them against the transforms shipped by the `ndwt` crate.
mod extras {
use ndwt::simd::*;
use ndwt::{boundarys::BoundaryExtension, simd::SimdTransformable};
/// In-place forward lifting pass over lane-packed coefficients; the
/// `driver_vs_array_db2` benchmark uses it as its "arrays" Daubechies-2 kernel.
///
/// `s` and `d` hold the smooth and detail coefficients as chunks of `N`
/// values each and are both overwritten with the result. `bc` is consulted
/// wherever an update step would read past the ends of the arrays.
///
/// # Panics
///
/// Panics unless `s.len() == d.len()` or `s.len() == d.len() + 1`. In debug
/// builds it also panics when `N` differs from `T::lanes()`.
///
/// NOTE(review): an empty `d` passes the length check but then reaches
/// `nd - 1` and `d.split_at_mut(1)` below -- confirm callers never pass one.
pub(super) fn db2_forward_arr<T, BC, const N: usize>(
s: &mut [[T; N]],
d: &mut [[T; N]],
bc: &BC,
) where
T: SimdTransformable,
BC: BoundaryExtension,
{
use ndwt::simd::Dispatch;
let n_lanes = T::lanes();
// Each `[T; N]` chunk is expected to be exactly one SIMD vector wide
// (checked in debug builds only).
debug_assert_eq!(N, n_lanes);
let ns = s.len();
let nd = d.len();
assert!(
ns == nd || nd + 1 == ns,
"detail and smooth coefficient arrays must have compatible lengths, got {nd} d-chunks and {ns} s-chunks."
);
// Carries the flattened buffers and the boundary handler into the
// `WithSimd` callback that is handed to the dispatcher at the end.
struct Impl<'a, 'b, 'c, T, BC>(&'a mut [T], &'b mut [T], &'c BC);
impl<'a, 'b, 'c, T: SimdTransformable, BC: BoundaryExtension> WithSimd for Impl<'a, 'b, 'c, T, BC>
where
T: SimdTransformable,
BC: BoundaryExtension,
{
type Output = ();
#[inline(always)]
fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
// Reinterpret both flat buffers as slices of SIMD vectors. Only the first
// element of the returned tuple is used (presumably the vector-sized
// body, with any scalar tail ignored -- TODO confirm).
let s = T::as_mut_simd(simd, self.0).0;
let d = T::as_mut_simd(simd, self.1).0;
// From here on `ns` / `nd` count SIMD vectors, not scalars.
let ns = s.len();
let nd = d.len();
let bc = self.2;
// Step 1: d[i] <- s[i] * c + d[i] with c ~= -sqrt(3)
// (assuming `simd_mul_add(a, b, acc)` computes a * b + acc -- TODO confirm).
let c = T::simd_splat(
simd,
T::scalar_type_from_f64(
-1.73205080756887729352744634150587236694280525381038062805581,
),
);
d.iter_mut().zip(s.iter()).for_each(|(l, r)| {
*l = T::simd_mul_add(simd, *r, c, *l);
});
// Step 2 coefficients: c[0] ~= sqrt(3)/4 and c[1] ~= (sqrt(3) - 2)/4.
let c = [
T::simd_splat(
simd,
T::scalar_type_from_f64(
0.433012701892219323381861585376468091735701313452595157013952,
),
),
T::simd_splat(
simd,
T::scalar_type_from_f64(
-0.0669872981077806766181384146235319082642986865474048429860483,
),
),
];
// Interior part of step 2: for i < nd - 1 both taps d[i] and d[i + 1] exist,
// so s[i] accumulates d[i] * c[0] and d[i + 1] * c[1] directly.
let (sf, sb) = s.split_at_mut(nd - 1);
sf.iter_mut()
.zip(d.array_windows())
.for_each(|(l, [r0, r1])| {
*l = T::simd_mul_add(simd, *r0, c[0], *l);
*l = T::simd_mul_add(simd, *r1, c[1], *l);
});
// Tail of step 2 (s indices nd - 1 .. ns): a tap may fall outside `d`, so
// every tap index is resolved through the boundary handler, which yields
// (optional weight, index into d) pairs.
(nd as isize - 1..ns as isize).zip(sb).for_each(|(io, l)| {
c.iter().enumerate().for_each(|(i, c)| {
let bc_parts = bc.get_parts::<T>(nd, io + i as isize);
for (coef, i_bc) in bc_parts {
// A `None` weight means the referenced value is used unscaled.
let rv = match coef {
Some(coef) => {
let c = T::simd_splat(simd, coef);
T::simd_mul(simd, d[i_bc], c)
}
None => d[i_bc],
};
*l = T::simd_mul_add(simd, rv, *c, *l);
}
});
});
// Step 3: d[i] += s[i - 1]. For i == 0 the source index is -1, so that one
// entry goes through the boundary handler.
let (df, dv) = d.split_at_mut(1);
(-1..0).zip(df).for_each(|(io, l)| {
let bc_parts = bc.get_parts::<T>(nd, io);
for (coef, i_bc) in bc_parts {
match coef {
Some(coef) => {
let c = T::simd_splat(simd, coef);
*l = T::simd_mul_add(simd, s[i_bc], c, *l);
}
None => {
*l = T::simd_add(simd, s[i_bc], *l);
}
};
}
});
// Remaining entries of step 3: dv[i] is d[i + 1], paired with s[i].
dv.iter_mut().zip(s.iter()).for_each(|(l, r)| {
*l = T::simd_add(simd, *r, *l);
});
// Final scaling: s is multiplied by ~1.93185 (~= (sqrt(3) + 1)/sqrt(2)) and
// d by its reciprocal.
let scale = T::simd_splat(
simd,
T::scalar_type_from_f64(
1.93185165257813657349948639945779473526780967801680910080469,
),
);
let inv_scale = T::simd_splat(
simd,
T::scalar_type_from_f64(
1.0 / 1.93185165257813657349948639945779473526780967801680910080469,
),
);
s.iter_mut().for_each(|s| *s = T::simd_mul(simd, *s, scale));
d.iter_mut()
.for_each(|d| *d = T::simd_mul(simd, *d, inv_scale));
}
}
// Flatten the `[T; N]` chunks into plain `[T]` slices and run the kernel
// through the crate's architecture dispatcher.
ndwt::simd::ARCH.dispatch_wvlt(Impl(s.as_flattened_mut(), d.as_flattened_mut(), bc));
}
}
/// Benchmarks the forward Daubechies-2 transform on a 1000-sample `f64`
/// signal in three flavours: the lifting transform on already split buffers,
/// the lifting transform including the deinterleave step, and the `dwt`
/// ("filtered") transform.
fn db2_benchmark(c: &mut Criterion) {
    use ndwt::boundarys::BoundaryCondition;
    use ndwt::daubechies;
    type Db2 = daubechies::Daubechies2;

    // Signal length and the lengths of its two halves.
    let len = 1000;
    let smooth_len = (len + 1) / 2;
    let detail_len = len / 2;

    // Buffers for the lifting variants, created with an alignment argument of 128.
    let signal = AVec::<_>::from_iter(128, (0..len).map(|i| i as f64));
    let mut smooth = AVec::<_>::from_iter(128, (0..smooth_len).map(|i| i as f64));
    let mut detail = AVec::<_>::from_iter(128, (0..detail_len).map(|i| (i + smooth_len) as f64));
    let boundary = BoundaryCondition::Zero;

    let mut group = c.benchmark_group("Daubechies 2");

    // The trait imports stay inside the closures: `LiftingTransform` and
    // `DiscreteTransform` both provide a `forward` function.
    group.bench_function("lifted inplace", |bencher| {
        bencher.iter(|| {
            use ndwt::lwt::LiftingTransform;
            Db2::forward(&mut smooth, &mut detail, &boundary);
        })
    });
    group.bench_function("lifted out of place", |bencher| {
        bencher.iter(|| {
            use ndwt::lwt::LiftingTransform;
            ndwt::utils::deinterleave(&signal, &mut smooth, &mut detail);
            Db2::forward(&mut smooth, &mut detail, &boundary);
        })
    });

    // Input and output buffers for the `dwt` variant; the output length is
    // whatever `get_outlen` reports for this wavelet width.
    let filter_input = (0..len).map(|i| i as f64).collect_vec();
    let out_len = ndwt::dwt::get_outlen(Db2::WIDTH, len);
    let mut filter_smooth = avec![0.0; out_len];
    let mut filter_detail = avec![0.0; out_len];
    group.bench_function("filtered", |bencher| {
        bencher.iter(|| {
            use ndwt::dwt::DiscreteTransform;
            Db2::forward(&filter_input, &mut filter_smooth, &mut filter_detail, &boundary);
        })
    });

    group.finish();
}
/// Benchmarks the forward Daubechies-6 transform on a 1000-sample `f64`
/// signal in three flavours: the lifting transform on already split buffers,
/// the lifting transform including the deinterleave step, and the `dwt`
/// ("filtered") transform.
fn db6_benchmark(c: &mut Criterion) {
    use ndwt::boundarys::BoundaryCondition;
    use ndwt::daubechies;
    type Db6 = daubechies::Daubechies6;

    // Signal length and the lengths of its two halves.
    let len = 1000;
    let smooth_len = (len + 1) / 2;
    let detail_len = len / 2;

    // Buffers for the lifting variants, created with an alignment argument of 128.
    let signal = AVec::<_>::from_iter(128, (0..len).map(|i| i as f64));
    let mut smooth = AVec::<_>::from_iter(128, (0..smooth_len).map(|i| i as f64));
    let mut detail = AVec::<_>::from_iter(128, (0..detail_len).map(|i| (i + smooth_len) as f64));
    let boundary = BoundaryCondition::Zero;

    let mut group = c.benchmark_group("Daubechies 6");

    // The trait imports stay inside the closures: `LiftingTransform` and
    // `DiscreteTransform` both provide a `forward` function.
    group.bench_function("lifted inplace", |bencher| {
        bencher.iter(|| {
            use ndwt::lwt::LiftingTransform;
            Db6::forward(&mut smooth, &mut detail, &boundary);
        })
    });
    group.bench_function("lifted out of place", |bencher| {
        bencher.iter(|| {
            use ndwt::lwt::LiftingTransform;
            ndwt::utils::deinterleave(&signal, &mut smooth, &mut detail);
            Db6::forward(&mut smooth, &mut detail, &boundary);
        })
    });

    // Input and output buffers for the `dwt` variant (plain `Vec`s here); the
    // output length is whatever `get_outlen` reports for this wavelet width.
    let filter_input = (0..len).map(|i| i as f64).collect_vec();
    let out_len = ndwt::dwt::get_outlen(Db6::WIDTH, len);
    let mut filter_smooth = vec![0.0; out_len];
    let mut filter_detail = vec![0.0; out_len];
    group.bench_function("filtered", |bencher| {
        bencher.iter(|| {
            use ndwt::dwt::DiscreteTransform;
            Db6::forward(&filter_input, &mut filter_smooth, &mut filter_detail, &boundary);
        })
    });

    group.finish();
}
/// Benchmarks interleaving two halves of a 1042-element sequence, once into a
/// separate output slice (`interleave`) and once in place
/// (`interleave_inplace`).
fn interleave_slice_benchmark(c: &mut Criterion) {
    use ndwt::utils::{interleave, interleave_inplace};
    let n = 1042;
    let evens = (0..n).step_by(2).collect_vec();
    let odds = (1..n).step_by(2).collect_vec();
    // Destination for the out-of-place variant.
    let mut x1 = vec![0; n];
    // Evens followed by odds, stored by value. Without `.copied()` this would
    // be a `Vec<&usize>` and the in-place benchmark would shuffle references
    // instead of the same element type the out-of-place benchmark moves.
    let mut x2 = evens.iter().chain(odds.iter()).copied().collect_vec();
    let mut group = c.benchmark_group("slice");
    group.bench_function("out of place", |b| {
        b.iter(|| {
            interleave(&evens, &odds, &mut x1);
        })
    });
    group.bench_function("in place", |b| {
        b.iter(|| {
            interleave_inplace(&mut x2);
        })
    });
    group.finish();
}
/// Benchmarks split-then-interleave round trips over a 1000 x 1000 `i32`
/// array: lane by lane, in chunks of `N` lanes, and with a hand-written
/// version based on `chunks_exact`. "across" runs along axis 0 and "along"
/// along axis 1.
fn interleave_strided_benchmark(c: &mut Criterion) {
    use ndwt::iter::LanesIterator;
    let n: usize = 1000;
    let mut group = c.benchmark_group("interleave");
    let shape = [n, n];
    let n_total: usize = shape.iter().product();
    let x1 = (0..n_total as i32).collect_vec();
    let mut x2 = (0..n_total as i32).collect_vec();
    // Lengths of the first and second half of one lane.
    let nf = n.div_ceil(2);
    let ns = n / 2;

    // Scratch buffers for the single-lane variants.
    let mut lane_first = vec![0; nf];
    let mut lane_second = vec![0; ns];
    group.bench_function("lanes/across", |b| {
        let ax = 0;
        b.iter(|| {
            for (lane_in, mut lane_out) in
                x1.iter_lanes(&shape, ax).zip(x2.iter_lanes_mut(&shape, ax))
            {
                lane_in.split(&mut lane_first, &mut lane_second);
                lane_out.interleave(&lane_first, &lane_second);
            }
        })
    });
    group.bench_function("lanes/along", |b| {
        let ax = 1;
        b.iter(|| {
            for (lane_in, mut lane_out) in
                x1.iter_lanes(&shape, ax).zip(x2.iter_lanes_mut(&shape, ax))
            {
                lane_in.split(&mut lane_first, &mut lane_second);
                lane_out.interleave(&lane_first, &lane_second);
            }
        })
    });

    const N: usize = 8;
    // Scratch buffers for the chunked variants: one pair of buffers per lane in
    // a chunk, plus a single pair for the leftover lanes. The second-half
    // buffers are sized with `ns` (not `nf`), matching the single-lane buffers;
    // the two only coincide because `n` is even.
    let mut chunk_first: [_; N] = core::array::from_fn(|_| avec![0; nf]);
    let mut chunk_second: [_; N] = core::array::from_fn(|_| avec![0; ns]);
    let mut rem_first = avec![0; nf];
    let mut rem_second = avec![0; ns];
    group.bench_function("chunks/across", |b| {
        let ax = 0;
        b.iter(|| {
            let (in_chunk, in_rem) = x1.iter_lane_chunks::<N>(&shape, ax);
            let (out_chunk, out_rem) = x2.iter_lane_chunks_mut::<N>(&shape, ax);
            for (in_chunk, mut out_chunk) in in_chunk.zip(out_chunk) {
                in_chunk.split(&mut chunk_first, &mut chunk_second);
                out_chunk.interleave(&chunk_first, &chunk_second);
            }
            for (in_rem, mut out_rem) in in_rem.zip(out_rem) {
                in_rem.split(&mut rem_first, &mut rem_second);
                out_rem.interleave(&rem_first, &rem_second);
            }
        })
    });
    group.bench_function("chunks/along", |b| {
        let ax = 1;
        b.iter(|| {
            let (in_chunk, in_rem) = x1.iter_lane_chunks::<N>(&shape, ax);
            let (out_chunk, out_rem) = x2.iter_lane_chunks_mut::<N>(&shape, ax);
            for (in_chunk, mut out_chunk) in in_chunk.zip(out_chunk) {
                in_chunk.split(&mut chunk_first, &mut chunk_second);
                out_chunk.interleave(&chunk_first, &chunk_second);
            }
            for (in_rem, mut out_rem) in in_rem.zip(out_rem) {
                in_rem.split(&mut rem_first, &mut rem_second);
                out_rem.interleave(&rem_first, &rem_second);
            }
        })
    });

    // Hand-written variant: the input is treated as rows of `n` elements, the
    // first half of the rows and the second half are copied alternately into
    // the output, and an unpaired last row of the first half goes to the end.
    group.bench_function("chunked", |b| {
        b.iter(|| {
            let n_total = x1.len();
            let n0 = x1.len() / n;
            let n_first = (n0 + 1) / 2;
            let (first, second) = x1.split_at(n_first * n);
            let mut first_chunks = first.chunks_exact(n);
            first_chunks
                .by_ref()
                .zip(second.chunks_exact(n))
                .zip(x2.chunks_exact_mut(2 * n))
                .for_each(|((f, s), out)| {
                    let (evens, odds) = out.split_at_mut(n);
                    evens
                        .iter_mut()
                        .zip(f.iter().cloned())
                        .for_each(|(e, f)| *e = f);
                    odds.iter_mut()
                        .zip(s.iter().cloned())
                        .for_each(|(o, s)| *o = s);
                });
            if let Some(last_chunk) = first_chunks.next() {
                x2[n_total - n..n_total]
                    .iter_mut()
                    .zip(last_chunk.iter().cloned())
                    .for_each(|(e, f)| *e = f);
            }
        })
    });
    group.finish();
}
/// Benchmarks deinterleave-then-stack round trips over a 1000 x 1000 `i32`
/// array: lane by lane, in chunks of `N` lanes with per-lane buffers, in
/// chunks of `N` lanes with `[i32; N]` buffers, and via `deinterleave_nd`.
/// "across" runs along axis 0 and "along" along axis 1.
fn deinterleave_benchmark(c: &mut Criterion) {
    use ndwt::iter::LanesIterator;
    use ndwt::utils::deinterleave_nd;

    /// Number of even- and odd-indexed entries in a lane of `len` elements.
    fn even_odd_lens(len: usize) -> (usize, usize) {
        ((len + 1) / 2, len / 2)
    }

    // Number of lanes handled together by the chunked variants.
    const N: usize = 8;

    let side = 1000;
    let mut group = c.benchmark_group("deinterleave");
    let shape: Vec<usize> = vec![side, side];
    let n_total: usize = shape.iter().product();
    let x1 = (0..n_total as i32).collect_vec();
    let mut x2 = (0..n_total as i32).collect_vec();

    // One lane at a time.
    group.bench_function("lanes/across", |b| {
        let ax = 0;
        let lane_len = shape[ax];
        let (n_e, n_o) = even_odd_lens(lane_len);
        let mut work_e = vec![0; n_e];
        let mut work_o = vec![0; n_o];
        b.iter(|| {
            for (lane_in, mut lane_out) in
                x1.iter_lanes(&shape, ax).zip(x2.iter_lanes_mut(&shape, ax))
            {
                lane_in.deinterleave(&mut work_e, &mut work_o);
                lane_out.stack(&work_e, &work_o);
            }
        })
    });
    group.bench_function("lanes/along", |b| {
        let ax = 1;
        let lane_len = shape[ax];
        let (n_e, n_o) = even_odd_lens(lane_len);
        let mut work_e = vec![0; n_e];
        let mut work_o = vec![0; n_o];
        b.iter(|| {
            for (lane_in, mut lane_out) in
                x1.iter_lanes(&shape, ax).zip(x2.iter_lanes_mut(&shape, ax))
            {
                lane_in.deinterleave(&mut work_e, &mut work_o);
                lane_out.stack(&work_e, &work_o);
            }
        })
    });

    // `N` lanes at a time, with one buffer pair per lane of the chunk and a
    // single pair for the leftover lanes.
    group.bench_function("chunks/across", |b| {
        let ax = 0;
        let lane_len = shape[ax];
        let (n_e, n_o) = even_odd_lens(lane_len);
        let mut work_e: [_; N] = core::array::from_fn(|_| avec![0; n_e]);
        let mut work_o: [_; N] = core::array::from_fn(|_| avec![0; n_o]);
        let mut work_e2 = avec![0; n_e];
        let mut work_o2 = avec![0; n_o];
        b.iter(|| {
            let (in_chunk, in_rem) = x1.iter_lane_chunks::<N>(&shape, ax);
            let (out_chunk, out_rem) = x2.iter_lane_chunks_mut::<N>(&shape, ax);
            for (in_chunk, mut out_chunk) in in_chunk.zip(out_chunk) {
                in_chunk.deinterleave(&mut work_e, &mut work_o);
                out_chunk.stack(&work_e, &work_o);
            }
            for (in_rem, mut out_rem) in in_rem.zip(out_rem) {
                in_rem.deinterleave(&mut work_e2, &mut work_o2);
                out_rem.stack(&work_e2, &work_o2);
            }
        })
    });
    group.bench_function("chunks/along", |b| {
        let ax = 1;
        let lane_len = shape[ax];
        let (n_e, n_o) = even_odd_lens(lane_len);
        let mut work_e: [_; N] = core::array::from_fn(|_| avec![0; n_e]);
        let mut work_o: [_; N] = core::array::from_fn(|_| avec![0; n_o]);
        let mut work_e2 = avec![0; n_e];
        let mut work_o2 = avec![0; n_o];
        b.iter(|| {
            let (in_chunk, in_rem) = x1.iter_lane_chunks::<N>(&shape, ax);
            let (out_chunk, out_rem) = x2.iter_lane_chunks_mut::<N>(&shape, ax);
            for (in_chunk, mut out_chunk) in in_chunk.zip(out_chunk) {
                in_chunk.deinterleave(&mut work_e, &mut work_o);
                out_chunk.stack(&work_e, &work_o);
            }
            for (in_rem, mut out_rem) in in_rem.zip(out_rem) {
                in_rem.deinterleave(&mut work_e2, &mut work_o2);
                out_rem.stack(&work_e2, &work_o2);
            }
        })
    });

    // `N` lanes at a time, with the chunk stored as one buffer of `[i32; N]`
    // entries; leftover lanes still use plain buffers.
    group.bench_function("array_chunks/across", |b| {
        let ax = 0;
        let lane_len = shape[ax];
        let (n_e, n_o) = even_odd_lens(lane_len);
        let mut work_e = avec![[0; N]; n_e];
        let mut work_o = avec![[0; N]; n_o];
        let mut work_e2 = avec![0; n_e];
        let mut work_o2 = avec![0; n_o];
        b.iter(|| {
            let (in_chunk, in_rem) = x1.iter_lane_chunks::<N>(&shape, ax);
            let (out_chunk, out_rem) = x2.iter_lane_chunks_mut::<N>(&shape, ax);
            for (in_chunk, mut out_chunk) in in_chunk.zip(out_chunk) {
                in_chunk.deinterleave_arrays(&mut work_e, &mut work_o);
                out_chunk.stack_arrays(&work_e, &work_o);
            }
            for (in_rem, mut out_rem) in in_rem.zip(out_rem) {
                in_rem.deinterleave(&mut work_e2, &mut work_o2);
                out_rem.stack(&work_e2, &work_o2);
            }
        })
    });
    group.bench_function("array_chunks/along", |b| {
        let ax = 1;
        let lane_len = shape[ax];
        let (n_e, n_o) = even_odd_lens(lane_len);
        let mut work_e = avec![[0; N]; n_e];
        let mut work_o = avec![[0; N]; n_o];
        let mut work_e2 = avec![0; n_e];
        let mut work_o2 = avec![0; n_o];
        b.iter(|| {
            let (in_chunk, in_rem) = x1.iter_lane_chunks::<N>(&shape, ax);
            let (out_chunk, out_rem) = x2.iter_lane_chunks_mut::<N>(&shape, ax);
            for (in_chunk, mut out_chunk) in in_chunk.zip(out_chunk) {
                in_chunk.deinterleave_arrays(&mut work_e, &mut work_o);
                out_chunk.stack_arrays(&work_e, &work_o);
            }
            for (in_rem, mut out_rem) in in_rem.zip(out_rem) {
                in_rem.deinterleave(&mut work_e2, &mut work_o2);
                out_rem.stack(&work_e2, &work_o2);
            }
        })
    });

    // The crate's own n-dimensional routine on the whole array.
    group.bench_function("recursive", |b| {
        b.iter(|| {
            deinterleave_nd(&x1, &mut x2, &shape);
        })
    });
    group.finish();
}
/// Benchmarks a single-axis forward Daubechies-2 transform of a 1000 x 1000
/// `f64` array, comparing the crate's `WaveletTransform` driver with a
/// hand-written pipeline built on `extras::db2_forward_arr`. "along" uses
/// axis 1 and "across" axis 0.
fn driver_vs_array_db2(c: &mut Criterion) {
    use extras::db2_forward_arr;
    use ndwt::Wavelet;
    use ndwt::boundarys::ZeroBoundary;
    use ndwt::daubechies::Daubechies2;
    use ndwt::iter::LanesIterator;
    use ndwt::lwt::LiftingTransform;
    use ndwt::lwt::driver::WaveletTransform;
    use ndwt::simd::Alignable;
    let n = 1000;
    let wvlt = Wavelet::Daubechies2;
    let bc = ZeroBoundary;
    let mut group = c.benchmark_group("driver_vs_array");
    let shape = [n, n];
    let n_total: usize = shape.iter().product();
    let x1 = (0..n_total).map(|v| v as f64).collect_vec();
    let mut out = vec![0.0; n_total];
    let trans = WaveletTransform::new(wvlt, bc);
    group.bench_function("driver/along", |b| {
        let ax = 1;
        b.iter(|| {
            trans.forward_nd(&x1, &mut out, &shape, &[ax]);
        });
    });
    group.bench_function("driver/across", |b| {
        let ax = 0;
        b.iter(|| {
            trans.forward_nd(&x1, &mut out, &shape, &[ax]);
        });
    });
    // One pass of the hand-written pipeline for a compile-time lane count `$N`:
    // full chunks of `$N` lanes go through `db2_forward_arr`, leftover lanes
    // through the scalar `Daubechies2::forward`. It reads `x1`, `out`, `shape`
    // and `bc` from the enclosing function, so it has to be defined after them.
    // The scratch buffers are allocated inside the macro, i.e. inside the timed
    // closure.
    macro_rules! impl_arm {
        ($N:tt, $ax:ident, $ns:ident, $nd:ident) => {
            const N: usize = $N;
            let (in_chunks, in_rem) = x1.iter_lane_chunks::<N>(&shape, $ax);
            let (out_chunks, out_rem) = out.iter_lane_chunks_mut::<N>(&shape, $ax);
            let mut s = avec![[0.0;N]; $ns];
            let mut d = avec![[0.0;N]; $nd];
            for (inc, mut outc) in in_chunks.zip(out_chunks){
                inc.deinterleave_arrays(&mut s, &mut d);
                db2_forward_arr(&mut s, &mut d, &bc);
                outc.stack_arrays(&s, &d);
            };
            let mut s = avec![0.0; $ns];
            let mut d = avec![0.0; $nd];
            in_rem.zip(out_rem).for_each(|(ins, mut outs)| {
                ins.deinterleave(&mut s, &mut d);
                Daubechies2::forward(&mut s, &mut d, &bc);
                outs.stack(&s, &d);
            });
        };
    }
    group.bench_function("arrays/along", |b| {
        b.iter(|| {
            let ax = 1;
            // The chunk width must be a constant, so the runtime lane count is
            // mapped onto one of the supported widths.
            let lanes = f64::lanes();
            let n_ax = shape[ax];
            let ns = n_ax.div_ceil(2);
            let nd = n_ax / 2;
            match lanes {
                2 => {
                    impl_arm! {2, ax, ns, nd};
                }
                4 => {
                    impl_arm! {4, ax, ns, nd};
                }
                8 => {
                    impl_arm! {8, ax, ns, nd};
                }
                16 => {
                    impl_arm! {16, ax, ns, nd};
                }
                _ => {
                    unimplemented!("unsupported SIMD lane count: {lanes}")
                }
            }
        });
    });
    group.bench_function("arrays/across", |b| {
        b.iter(|| {
            let ax = 0;
            let lanes = f64::lanes();
            let n_ax = shape[ax];
            let ns = n_ax.div_ceil(2);
            let nd = n_ax / 2;
            match lanes {
                2 => {
                    impl_arm! {2, ax, ns, nd};
                }
                4 => {
                    impl_arm! {4, ax, ns, nd};
                }
                8 => {
                    impl_arm! {8, ax, ns, nd};
                }
                16 => {
                    impl_arm! {16, ax, ns, nd};
                }
                _ => {
                    unimplemented!("unsupported SIMD lane count: {lanes}")
                }
            }
        });
    });
    // Every other benchmark in this file finishes its group explicitly; do the
    // same here instead of relying on the group being dropped.
    group.finish();
}
/// Benchmarks a two-axis forward Daubechies-2 transform of a 500 x 500 `f64`
/// array: the `WaveletTransform` driver, its parallel variant, and a
/// hand-written version that deinterleaves the whole array first and then
/// transforms it with `forward_chunk` and per-row `forward` calls.
fn broadcasted_vs_strided_db2(c: &mut Criterion) {
    use ndwt::Wavelet;
    use ndwt::boundarys::ZeroBoundary;
    use ndwt::lwt::LiftingTransform;
    use ndwt::lwt::driver::WaveletTransform;
    use ndwt::utils::deinterleave_nd;
    let n = 500;
    let shape = [n, n];
    let n_total = shape.iter().product();
    let x = (0..n_total).map(|i| i as f64).collect_vec();
    let wvlt = Wavelet::Daubechies2;
    let trans = WaveletTransform::new(wvlt, ZeroBoundary {});
    let mut sd = vec![0.0; n_total];
    let axes = [1, 0];
    let mut group = c.benchmark_group("broadcasted_vs_lanes/db2");
    group.bench_function("using driver", |b| {
        b.iter(|| {
            trans.forward_nd(&x, &mut sd, &shape, &axes);
        })
    });
    group.bench_function("using parallel driver", |b| {
        b.iter(|| {
            trans.par_forward_nd(&x, &mut sd, &shape, &axes);
        })
    });
    // Length of the first (smooth) half along each axis. The block split below
    // counts rows (axis 0), while each row is `shape[1]` long and therefore has
    // to be split by the axis-1 length; the two only coincide for square
    // shapes.
    let ne_rows = (shape[0] + 1) / 2;
    let ne_cols = (shape[1] + 1) / 2;
    let mut x2 = vec![0.0; n_total];
    group.bench_function("using recursive", |b| {
        b.iter(|| {
            let bc = ZeroBoundary;
            deinterleave_nd(&x, &mut x2, &shape);
            // Axis 0: the first `ne_rows` rows against the remaining rows,
            // processed in chunks of one row.
            let (s, d) = x2.split_at_mut(ne_rows * shape[1]);
            ndwt::lwt::daubechies::Daubechies2::forward_chunk(s, d, shape[1], &bc);
            // Axis 1: every row on its own.
            x2.chunks_exact_mut(shape[1]).for_each(|slc| {
                let (s, d) = slc.split_at_mut(ne_cols);
                ndwt::lwt::daubechies::Daubechies2::forward(s, d, &bc);
            });
        })
    });
    group.finish();
}
/// Benchmarks a two-axis forward Daubechies-4 transform of a 500 x 500 `f64`
/// array: the `WaveletTransform` driver, its parallel variant, and a
/// hand-written version that deinterleaves the whole array first and then
/// transforms it with `forward_chunk` and per-row `forward` calls.
fn broadcasted_vs_strided_db4(c: &mut Criterion) {
    use ndwt::Wavelet;
    use ndwt::boundarys::ZeroBoundary;
    use ndwt::lwt::LiftingTransform;
    use ndwt::lwt::driver::WaveletTransform;
    use ndwt::utils::deinterleave_nd;
    let n = 500;
    let shape = [n, n];
    let n_total = shape.iter().product();
    let x = (0..n_total).map(|i| i as f64).collect_vec();
    let wvlt = Wavelet::Daubechies4;
    let trans = WaveletTransform::new(wvlt, ZeroBoundary {});
    let mut sd = vec![0.0; n_total];
    let axes = [1, 0];
    let mut group = c.benchmark_group("broadcasted_vs_lanes/db4");
    group.bench_function("using driver", |b| {
        b.iter(|| {
            trans.forward_nd(&x, &mut sd, &shape, &axes);
        })
    });
    group.bench_function("using parallel driver", |b| {
        b.iter(|| {
            trans.par_forward_nd(&x, &mut sd, &shape, &axes);
        })
    });
    // Length of the first (smooth) half along each axis. The block split below
    // counts rows (axis 0), while each row is `shape[1]` long and therefore has
    // to be split by the axis-1 length; the two only coincide for square
    // shapes.
    let ne_rows = (shape[0] + 1) / 2;
    let ne_cols = (shape[1] + 1) / 2;
    let mut x2 = vec![0.0; n_total];
    group.bench_function("using recursive", |b| {
        b.iter(|| {
            let bc = ZeroBoundary;
            deinterleave_nd(&x, &mut x2, &shape);
            // Axis 0: the first `ne_rows` rows against the remaining rows,
            // processed in chunks of one row.
            let (s, d) = x2.split_at_mut(ne_rows * shape[1]);
            ndwt::lwt::daubechies::Daubechies4::forward_chunk(s, d, shape[1], &bc);
            // Axis 1: every row on its own.
            x2.chunks_exact_mut(shape[1]).for_each(|slc| {
                let (s, d) = slc.split_at_mut(ne_cols);
                ndwt::lwt::daubechies::Daubechies4::forward(s, d, &bc);
            });
        })
    });
    group.finish();
}
/// Benchmarks a two-axis forward Daubechies-6 transform of a 500 x 500 `f64`
/// array: the `WaveletTransform` driver, its parallel variant, and a
/// hand-written version that deinterleaves the whole array first and then
/// transforms it with `forward_chunk` and per-row `forward` calls.
fn broadcasted_vs_strided_db6(c: &mut Criterion) {
    use ndwt::Wavelet;
    use ndwt::boundarys::ZeroBoundary;
    use ndwt::lwt::LiftingTransform;
    use ndwt::lwt::driver::WaveletTransform;
    use ndwt::utils::deinterleave_nd;
    let n = 500;
    let shape = [n, n];
    let n_total = shape.iter().product();
    let x = (0..n_total).map(|i| i as f64).collect_vec();
    let wvlt = Wavelet::Daubechies6;
    let trans = WaveletTransform::new(wvlt, ZeroBoundary {});
    let mut sd = vec![0.0; n_total];
    let axes = [1, 0];
    let mut group = c.benchmark_group("broadcasted_vs_lanes/db6");
    group.bench_function("using driver", |b| {
        b.iter(|| {
            trans.forward_nd(&x, &mut sd, &shape, &axes);
        })
    });
    group.bench_function("using parallel driver", |b| {
        b.iter(|| {
            trans.par_forward_nd(&x, &mut sd, &shape, &axes);
        })
    });
    // Length of the first (smooth) half along each axis. The block split below
    // counts rows (axis 0), while each row is `shape[1]` long and therefore has
    // to be split by the axis-1 length; the two only coincide for square
    // shapes.
    let ne_rows = (shape[0] + 1) / 2;
    let ne_cols = (shape[1] + 1) / 2;
    let mut x2 = vec![0.0; n_total];
    group.bench_function("using recursive", |b| {
        b.iter(|| {
            let bc = ZeroBoundary;
            deinterleave_nd(&x, &mut x2, &shape);
            // Axis 0: the first `ne_rows` rows against the remaining rows,
            // processed in chunks of one row.
            let (s, d) = x2.split_at_mut(ne_rows * shape[1]);
            ndwt::lwt::daubechies::Daubechies6::forward_chunk(s, d, shape[1], &bc);
            // Axis 1: every row on its own.
            x2.chunks_exact_mut(shape[1]).for_each(|slc| {
                let (s, d) = slc.split_at_mut(ne_cols);
                ndwt::lwt::daubechies::Daubechies6::forward(s, d, &bc);
            });
        })
    });
    group.finish();
}
/// Compares this crate's lane iterators with `ndarray`'s on a 1000 x 1000
/// `Array2`: every benchmark writes twice each input element into `dst`.
/// "along" iterates lanes of axis 1 and "across" lanes of axis 0. Only built
/// with the `ndarray` feature.
#[cfg(feature = "ndarray")]
fn ndarray_lanes_vs_wvlt_lanes(c: &mut Criterion) {
    use ndarray::{Array2, Axis};
    use ndwt::iter::LanesIterator;

    // Lanes per chunk for the `wvlt_chunks` variants; the `ndarray_chunks`
    // variants group lanes into 4-tuples to match.
    const N: usize = 4;

    let side: usize = 1000;
    let mut group = c.benchmark_group("ndarray");
    let shape = [side, side];
    let n_total: usize = shape.iter().product();
    let src = Array2::from_shape_vec(shape, (0..n_total).collect()).unwrap();
    let mut dst = src.clone();

    // Single lanes through the crate's own iterators.
    group.bench_function("wvlt_lanes/along", |b| {
        let ax = 1;
        b.iter(|| {
            src.iter_lanes(&shape, ax)
                .zip(dst.iter_lanes_mut(&shape, ax))
                .for_each(|(lane_in, mut lane_out)| {
                    // Both lanes are unwrapped as contiguous slices here.
                    let lane_in = lane_in.as_slice().unwrap();
                    let lane_out = lane_out.as_mut_slice().unwrap();
                    lane_in
                        .iter()
                        .zip(lane_out.iter_mut())
                        .for_each(|(v, w)| *w = 2 * v)
                })
        })
    });
    group.bench_function("wvlt_lanes/across", |b| {
        let ax = 0;
        b.iter(|| {
            src.iter_lanes(&shape, ax)
                .zip(dst.iter_lanes_mut(&shape, ax))
                .for_each(|(lane_in, mut lane_out)| {
                    lane_in
                        .iter()
                        .zip(lane_out.iter_mut())
                        .for_each(|(v, w)| *w = 2 * v)
                })
        })
    });

    // Single lanes through `ndarray`'s `lanes` / `lanes_mut`.
    group.bench_function("ndarray_lanes/along", |b| {
        let ax = 1;
        b.iter(|| {
            src.lanes(Axis(ax))
                .into_iter()
                .zip(dst.lanes_mut(Axis(ax)))
                .for_each(|(lane_in, mut lane_out)| {
                    // Both lanes are unwrapped as contiguous slices here.
                    let lane_in = lane_in.as_slice().unwrap();
                    let lane_out = lane_out.as_slice_mut().unwrap();
                    lane_in
                        .iter()
                        .zip(lane_out.iter_mut())
                        .for_each(|(v, w)| *w = 2 * v)
                })
        })
    });
    group.bench_function("ndarray_lanes/across", |b| {
        let ax = 0;
        b.iter(|| {
            src.lanes(Axis(ax))
                .into_iter()
                .zip(dst.lanes_mut(Axis(ax)))
                .for_each(|(lane_in, mut lane_out)| {
                    lane_in
                        .iter()
                        .zip(lane_out.iter_mut())
                        .for_each(|(v, w)| *w = 2 * v)
                })
        })
    });

    // Chunks of `N` lanes through the crate's own iterators; the leftover-lane
    // iterators are discarded.
    group.bench_function("wvlt_chunks/along", |b| {
        let ax = 1;
        b.iter(|| {
            let (in_chunks, _) = src.iter_lane_chunks::<N>(&shape, ax);
            let (out_chunks, _) = dst.iter_lane_chunks_mut::<N>(&shape, ax);
            in_chunks.zip(out_chunks).for_each(|(chunk_in, mut chunk_out)| {
                chunk_in.iter().zip(chunk_out.iter_mut()).for_each(|(vs, ws)| {
                    ws.into_iter().zip(vs).for_each(|(w, v)| *w = 2 * v);
                })
            });
        })
    });
    group.bench_function("wvlt_chunks/across", |b| {
        let ax = 0;
        b.iter(|| {
            let (in_chunks, _) = src.iter_lane_chunks::<N>(&shape, ax);
            let (out_chunks, _) = dst.iter_lane_chunks_mut::<N>(&shape, ax);
            in_chunks.zip(out_chunks).for_each(|(chunk_in, mut chunk_out)| {
                chunk_in.iter().zip(chunk_out.iter_mut()).for_each(|(vs, ws)| {
                    ws.into_iter().zip(vs).for_each(|(w, v)| *w = 2 * v);
                })
            });
        })
    });

    // Groups of four `ndarray` lanes, formed with itertools' `tuples`.
    group.bench_function("ndarray_chunks/along", |b| {
        let ax = 1;
        b.iter(|| {
            let in_lanes = src.lanes(Axis(ax));
            let out_lanes = dst.lanes_mut(Axis(ax));
            in_lanes
                .into_iter()
                .zip(out_lanes.into_iter())
                .tuples()
                .for_each(|((a0, mut b0), (a1, mut b1), (a2, mut b2), (a3, mut b3))| {
                    a0.iter().zip(b0.iter_mut()).for_each(|(v, w)| *w = 2 * v);
                    a1.iter().zip(b1.iter_mut()).for_each(|(v, w)| *w = 2 * v);
                    a2.iter().zip(b2.iter_mut()).for_each(|(v, w)| *w = 2 * v);
                    a3.iter().zip(b3.iter_mut()).for_each(|(v, w)| *w = 2 * v);
                });
        })
    });
    group.bench_function("ndarray_chunks/across", |b| {
        let ax = 0;
        b.iter(|| {
            let in_lanes = src.lanes(Axis(ax));
            let out_lanes = dst.lanes_mut(Axis(ax));
            in_lanes
                .into_iter()
                .zip(out_lanes.into_iter())
                .tuples()
                .for_each(|((a0, mut b0), (a1, mut b1), (a2, mut b2), (a3, mut b3))| {
                    a0.iter().zip(b0.iter_mut()).for_each(|(v, w)| *w = 2 * v);
                    a1.iter().zip(b1.iter_mut()).for_each(|(v, w)| *w = 2 * v);
                    a2.iter().zip(b2.iter_mut()).for_each(|(v, w)| *w = 2 * v);
                    a3.iter().zip(b3.iter_mut()).for_each(|(v, w)| *w = 2 * v);
                });
        })
    });
    group.finish();
}
// Benchmark registration: each `criterion_group!` bundles related benchmark
// functions under one group name, and `criterion_main!` lists the groups the
// benchmark binary runs.

// Slice and strided interleave / deinterleave helpers.
criterion_group!(
interleave_deinterleave,
interleave_slice_benchmark,
interleave_strided_benchmark,
deinterleave_benchmark,
);
// Lifting ("lifted") versus `dwt` ("filtered") 1-D transforms.
criterion_group!(lifted_vs_filtered, db2_benchmark, db6_benchmark);
// N-d drivers versus the hand-written variants defined in this file.
criterion_group!(
broadcasted_vs_lanes,
broadcasted_vs_strided_db2,
broadcasted_vs_strided_db4,
broadcasted_vs_strided_db6,
driver_vs_array_db2,
);
// The `ndarray` comparison only exists when the `ndarray` feature is enabled,
// so there are two mutually exclusive `criterion_main!` invocations below.
#[cfg(feature = "ndarray")]
criterion_group!(ndarray_bench, ndarray_lanes_vs_wvlt_lanes);
#[cfg(feature = "ndarray")]
criterion_main!(
interleave_deinterleave,
lifted_vs_filtered,
broadcasted_vs_lanes,
ndarray_bench
);
#[cfg(not(feature = "ndarray"))]
criterion_main!(
interleave_deinterleave,
lifted_vs_filtered,
broadcasted_vs_lanes
);