#![feature(test)]
#![feature(cfg_target_feature)]
extern crate test;
extern crate simd;
use test::black_box as bb;
use test::Bencher as B;
use simd::f32x4;
#[cfg(target_feature = "avx")]
use simd::x86::avx::{f32x8, f64x4};
#[bench]
fn multiply_naive(b: &mut B) {
let x = [[1.0_f32; 4]; 4];
let y = [[2.0; 4]; 4];
b.iter(|| {
for _ in 0..100 {
let (x, y) = bb((&x, &y));
bb(&[[x[0][0] * y[0][0] + x[1][0] * y[0][1] + x[2][0] * y[0][2] + x[3][0] * y[0][3],
x[0][1] * y[0][0] + x[1][1] * y[0][1] + x[2][1] * y[0][2] + x[3][1] * y[0][3],
x[0][2] * y[0][0] + x[1][2] * y[0][1] + x[2][2] * y[0][2] + x[3][2] * y[0][3],
x[0][3] * y[0][0] + x[1][3] * y[0][1] + x[2][3] * y[0][2] + x[3][3] * y[0][3]],
[x[0][0] * y[1][0] + x[1][0] * y[1][1] + x[2][0] * y[1][2] + x[3][0] * y[1][3],
x[0][1] * y[1][0] + x[1][1] * y[1][1] + x[2][1] * y[1][2] + x[3][1] * y[1][3],
x[0][2] * y[1][0] + x[1][2] * y[1][1] + x[2][2] * y[1][2] + x[3][2] * y[1][3],
x[0][3] * y[1][0] + x[1][3] * y[1][1] + x[2][3] * y[1][2] + x[3][3] * y[1][3]],
[x[0][0] * y[2][0] + x[1][0] * y[2][1] + x[2][0] * y[2][2] + x[3][0] * y[2][3],
x[0][1] * y[2][0] + x[1][1] * y[2][1] + x[2][1] * y[2][2] + x[3][1] * y[2][3],
x[0][2] * y[2][0] + x[1][2] * y[2][1] + x[2][2] * y[2][2] + x[3][2] * y[2][3],
x[0][3] * y[2][0] + x[1][3] * y[2][1] + x[2][3] * y[2][2] + x[3][3] * y[2][3]],
[x[0][0] * y[3][0] + x[1][0] * y[3][1] + x[2][0] * y[3][2] + x[3][0] * y[3][3],
x[0][1] * y[3][0] + x[1][1] * y[3][1] + x[2][1] * y[3][2] + x[3][1] * y[3][3],
x[0][2] * y[3][0] + x[1][2] * y[3][1] + x[2][2] * y[3][2] + x[3][2] * y[3][3],
x[0][3] * y[3][0] + x[1][3] * y[3][1] + x[2][3] * y[3][2] + x[3][3] * y[3][3]],
]);
}
})
}
#[bench]
fn multiply_simd4_32(b: &mut B) {
let x = [f32x4::splat(1.0_f32); 4];
let y = [f32x4::splat(2.0); 4];
b.iter(|| {
for _ in 0..100 {
let (x, y) = bb((&x, &y));
let y0 = y[0];
let y1 = y[1];
let y2 = y[2];
let y3 = y[3];
bb(&[f32x4::splat(y0.extract(0)) * x[0] +
f32x4::splat(y0.extract(1)) * x[1] +
f32x4::splat(y0.extract(2)) * x[2] +
f32x4::splat(y0.extract(3)) * x[3],
f32x4::splat(y1.extract(0)) * x[0] +
f32x4::splat(y1.extract(1)) * x[1] +
f32x4::splat(y1.extract(2)) * x[2] +
f32x4::splat(y1.extract(3)) * x[3],
f32x4::splat(y2.extract(0)) * x[0] +
f32x4::splat(y2.extract(1)) * x[1] +
f32x4::splat(y2.extract(2)) * x[2] +
f32x4::splat(y2.extract(3)) * x[3],
f32x4::splat(y3.extract(0)) * x[0] +
f32x4::splat(y3.extract(1)) * x[1] +
f32x4::splat(y3.extract(2)) * x[2] +
f32x4::splat(y3.extract(3)) * x[3],
]);
}
})
}
#[cfg(target_feature = "avx")]
#[bench]
fn multiply_simd4_64(b: &mut B) {
let x = [f64x4::splat(1.0_f64); 4];
let y = [f64x4::splat(2.0); 4];
b.iter(|| {
for _ in 0..100 {
let (x, y) = bb((&x, &y));
let y0 = y[0];
let y1 = y[1];
let y2 = y[2];
let y3 = y[3];
bb(&[f64x4::splat(y0.extract(0)) * x[0] +
f64x4::splat(y0.extract(1)) * x[1] +
f64x4::splat(y0.extract(2)) * x[2] +
f64x4::splat(y0.extract(3)) * x[3],
f64x4::splat(y1.extract(0)) * x[0] +
f64x4::splat(y1.extract(1)) * x[1] +
f64x4::splat(y1.extract(2)) * x[2] +
f64x4::splat(y1.extract(3)) * x[3],
f64x4::splat(y2.extract(0)) * x[0] +
f64x4::splat(y2.extract(1)) * x[1] +
f64x4::splat(y2.extract(2)) * x[2] +
f64x4::splat(y2.extract(3)) * x[3],
f64x4::splat(y3.extract(0)) * x[0] +
f64x4::splat(y3.extract(1)) * x[1] +
f64x4::splat(y3.extract(2)) * x[2] +
f64x4::splat(y3.extract(3)) * x[3],
]);
}
})
}
#[bench]
fn inverse_naive(b: &mut B) {
let mut x = [[0_f32; 4]; 4];
for i in 0..4 { x[i][i] = 1.0 }
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let mut t = [[0_f32; 4]; 4];
for i in 0..4 {
t[0][i] = x[i][0];
t[1][i] = x[i][1];
t[2][i] = x[i][2];
t[3][i] = x[i][3];
}
let _0 = t[2][2] * t[3][3];
let _1 = t[2][3] * t[3][2];
let _2 = t[2][1] * t[3][3];
let _3 = t[2][3] * t[3][1];
let _4 = t[2][1] * t[3][2];
let _5 = t[2][2] * t[3][1];
let _6 = t[2][0] * t[3][3];
let _7 = t[2][3] * t[3][0];
let _8 = t[2][0] * t[3][2];
let _9 = t[2][2] * t[3][0];
let _10 = t[2][0] * t[3][1];
let _11 = t[2][1] * t[3][0];
let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
(_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
(_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
(_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
(_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
(_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
(_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
(_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
(_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);
let _0 = t[0][2] * t[1][3];
let _1 = t[0][3] * t[1][2];
let _2 = t[0][1] * t[1][3];
let _3 = t[0][3] * t[1][1];
let _4 = t[0][1] * t[1][2];
let _5 = t[0][2] * t[1][1];
let _6 = t[0][0] * t[1][3];
let _7 = t[0][3] * t[1][0];
let _8 = t[0][0] * t[1][2];
let _9 = t[0][2] * t[1][0];
let _10 = t[0][0] * t[1][1];
let _11 = t[0][1] * t[1][0];
let d20 = _0*t[3][1] + _3*t[3][2] + _4*t[3][3]-
(_1*t[3][1] + _2*t[3][2] + _5*t[3][3]);
let d21 = _1*t[3][0] + _6*t[3][2] + _9*t[3][3]-
(_0*t[3][0] + _7*t[3][2] + _8*t[3][3]);
let d22 = _2*t[3][0] + _7*t[3][1] + _10*t[3][3]-
(_3*t[3][0] + _6*t[3][1] + _11*t[3][3]);
let d23 = _5*t[3][0] + _8*t[3][1] + _11*t[3][2]-
(_4*t[3][0] + _9*t[3][1] + _10*t[3][2]);
let d30 = _2*t[2][2] + _5*t[2][3] + _1*t[2][1]-
(_4*t[2][3] + _0*t[2][1] + _3*t[2][2]);
let d31 = _8*t[2][3] + _0*t[2][0] + _7*t[2][2]-
(_6*t[2][2] + _9*t[2][3] + _1*t[2][0]);
let d32 = _6*t[2][1] + _11*t[2][3] + _3*t[2][0]-
(_10*t[2][3] + _2*t[2][0] + _7*t[2][1]);
let d33 = _10*t[2][2] + _4*t[2][0] + _9*t[2][1]-
(_8*t[2][1] + _11*t[2][2] + _5*t[2][0]);
let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;
let det = 1.0 / det;
let mut ret = [[d00, d01, d02, d03],
[d10, d11, d12, d13],
[d20, d21, d22, d23],
[d30, d31, d32, d33]];
for i in 0..4 {
for j in 0..4 {
ret[i][j] *= det;
}
}
bb(&ret);
}
})
}
#[bench]
fn inverse_simd4(b: &mut B) {
let mut x = [f32x4::splat(0_f32); 4];
for i in 0..4 { x[i] = x[i].replace(i as u32, 1.0); }
fn shuf0145(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(0), v.extract(1),
w.extract(4 - 4), w.extract(5 - 4))
}
fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(0), v.extract(2),
w.extract(4 - 4), w.extract(6 - 4))
}
fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(1), v.extract(3),
w.extract(5 - 4), w.extract(7 - 4))
}
fn shuf2367(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(2), v.extract(3),
w.extract(6 - 4), w.extract(7 - 4))
}
fn swiz1032(v: f32x4) -> f32x4 {
f32x4::new(v.extract(1), v.extract(0),
v.extract(3), v.extract(2))
}
fn swiz2301(v: f32x4) -> f32x4 {
f32x4::new(v.extract(2), v.extract(3),
v.extract(0), v.extract(1))
}
b.iter(|| {
for _ in 0..100 {
let src0;
let src1;
let src2;
let src3;
let mut tmp1;
let row0;
let mut row1;
let mut row2;
let mut row3;
let mut minor0;
let mut minor1;
let mut minor2;
let mut minor3;
let mut det;
let x = bb(&x);
src0 = x[0];
src1 = x[1];
src2 = x[2];
src3 = x[3];
tmp1 = shuf0145(src0, src1);
row1 = shuf0145(src2, src3);
row0 = shuf0246(tmp1, row1);
row1 = shuf1357(row1, tmp1);
tmp1 = shuf2367(src0, src1);
row3 = shuf2367(src2, src3);
row2 = shuf0246(tmp1, row3);
row3 = shuf0246(row3, tmp1);
tmp1 = row2 * row3;
tmp1 = swiz1032(tmp1);
minor0 = row1 * tmp1;
minor1 = row0 * tmp1;
tmp1 = swiz2301(tmp1);
minor0 = (row1 * tmp1) - minor0;
minor1 = (row0 * tmp1) - minor1;
minor1 = swiz2301(minor1);
tmp1 = row1 * row2;
tmp1 = swiz1032(tmp1);
minor0 = (row3 * tmp1) + minor0;
minor3 = row0 * tmp1;
tmp1 = swiz2301(tmp1);
minor0 = minor0 - row3 * tmp1;
minor3 = row0 * tmp1 - minor3;
minor3 = swiz2301(minor3);
tmp1 = row3 * swiz2301(row1);
tmp1 = swiz1032(tmp1);
row2 = swiz2301(row2);
minor0 = row2 * tmp1 + minor0;
minor2 = row0 * tmp1;
tmp1 = swiz2301(tmp1);
minor0 = minor0 - row2 * tmp1;
minor2 = row0 * tmp1 - minor2;
minor2 = swiz2301(minor2);
tmp1 = row0 * row1;
tmp1 = swiz1032(tmp1);
minor2 = minor2 + row3 * tmp1;
minor3 = row2 * tmp1 - minor3;
tmp1 = swiz2301(tmp1);
minor2 = row3 * tmp1 - minor2;
minor3 = minor3 - row2 * tmp1;
tmp1 = row0 * row3;
tmp1 = swiz1032(tmp1);
minor1 = minor1 - row2 * tmp1;
minor2 = row1 * tmp1 + minor2;
tmp1 = swiz2301(tmp1);
minor1 = row2 * tmp1 + minor1;
minor2 = minor2 - row1 * tmp1;
tmp1 = row0 * row2;
tmp1 = swiz1032(tmp1);
minor1 = row3 * tmp1 + minor1;
minor3 = minor3 - row1 * tmp1;
tmp1 = swiz2301(tmp1);
minor1 = minor1 - row3 * tmp1;
minor3 = row1 * tmp1 + minor3;
det = row0 * minor0;
det = swiz2301(det) + det;
det = swiz1032(det) + det;
det = f32x4::splat(1.0) / det;
bb(&[minor0 * det, minor1 * det, minor2 * det, minor3 * det]);
}
})
}
#[bench]
fn transpose_naive(b: &mut B) {
let x = [[0_f32; 4]; 4];
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
bb(&[[x[0][0], x[1][0], x[2][0], x[3][0]],
[x[0][1], x[1][1], x[2][1], x[3][1]],
[x[0][2], x[1][2], x[2][2], x[3][2]],
[x[0][3], x[1][3], x[2][3], x[3][3]]]);
}
})
}
#[bench]
fn transpose_simd4(b: &mut B) {
let x = [f32x4::splat(0_f32); 4];
fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(0), v.extract(2),
w.extract(4 - 4), w.extract(6 - 4))
}
fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(1), v.extract(3),
w.extract(5 - 4), w.extract(7 - 4))
}
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let x0 = x[0];
let x1 = x[1];
let x2 = x[2];
let x3 = x[3];
let a0 = shuf0246(x0, x1);
let a1 = shuf0246(x2, x3);
let a2 = shuf1357(x0, x1);
let a3 = shuf1357(x2, x3);
let b0 = shuf0246(a0, a1);
let b1 = shuf0246(a2, a3);
let b2 = shuf1357(a0, a1);
let b3 = shuf1357(a2, a3);
bb(&[b0, b1, b2, b3]);
}
})
}
#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_naive(b: &mut B) {
let x = [f32x8::splat(0_f32); 2];
fn shuf0246(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(0), v.extract(2), v.extract(4), v.extract(6),
w.extract(0), w.extract(2), w.extract(4), w.extract(6))
}
fn shuf1357(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(1), v.extract(3), v.extract(5), v.extract(7),
w.extract(1), w.extract(3), w.extract(5), w.extract(7),)
}
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let x01 = x[0];
let x23 = x[1];
let a01 = shuf0246(x01, x23);
let a23 = shuf1357(x01, x23);
let b01 = shuf0246(a01, a23);
let b23 = shuf1357(a01, a23);
bb(&[b01, b23]);
}
})
}
#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_avx2_vpermps(b: &mut B) {
let x = [f32x8::splat(0_f32); 2];
fn perm04152637(v: f32x8) -> f32x8 {
f32x8::new(v.extract(0), v.extract(4), v.extract(1), v.extract(5),
v.extract(2), v.extract(6), v.extract(3), v.extract(7))
}
fn shuf_lo(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(0), v.extract(1), w.extract(0), w.extract(1),
v.extract(4), v.extract(5), w.extract(4), w.extract(5),)
}
fn shuf_hi(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(2), v.extract(3), w.extract(2), w.extract(3),
v.extract(6), v.extract(7), w.extract(6), w.extract(7),)
}
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let x01 = x[0];
let x23 = x[1];
let a01 = perm04152637(x01);
let a23 = perm04152637(x23);
let b01 = shuf_lo(a01, a23);
let b23 = shuf_hi(a01, a23);
bb(&[b01, b23]);
}
})
}
#[cfg(target_feature = "avx")]
#[bench]
fn transpose_simd8_avx2_vpermpd(b: &mut B) {
let x = [f32x8::splat(0_f32); 2];
fn perm01452367(v: f32x8) -> f32x8 {
f32x8::new(v.extract(0), v.extract(1), v.extract(4), v.extract(5),
v.extract(2), v.extract(3), v.extract(6), v.extract(7))
}
fn shuf_lo_ps(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(0), w.extract(0), v.extract(1), w.extract(1),
v.extract(4), w.extract(4), v.extract(5), w.extract(5),)
}
fn shuf_hi_ps(v: f32x8, w: f32x8) -> f32x8 {
f32x8::new(v.extract(2), w.extract(2), v.extract(3), w.extract(3),
v.extract(6), w.extract(6), v.extract(7), w.extract(7),)
}
b.iter(|| {
for _ in 0..100 {
let x = bb(&x);
let x01 = x[0];
let x23 = x[1];
let a01 = perm01452367(x01);
let a23 = perm01452367(x23);
let b01 = shuf_lo_ps(a01, a23);
let b23 = shuf_hi_ps(a01, a23);
bb(&[b01, b23]);
}
})
}