extern crate ssimd;
use ssimd::f32x4;
fn mul(x: &[f32x4; 4], y: &[f32x4; 4]) -> [f32x4; 4] {
let y0 = y[0];
let y1 = y[1];
let y2 = y[2];
let y3 = y[3];
[f32x4::splat(y0.extract(0)) * x[0] +
f32x4::splat(y0.extract(1)) * x[1] +
f32x4::splat(y0.extract(2)) * x[2] +
f32x4::splat(y0.extract(3)) * x[3],
f32x4::splat(y1.extract(0)) * x[0] +
f32x4::splat(y1.extract(1)) * x[1] +
f32x4::splat(y1.extract(2)) * x[2] +
f32x4::splat(y1.extract(3)) * x[3],
f32x4::splat(y2.extract(0)) * x[0] +
f32x4::splat(y2.extract(1)) * x[1] +
f32x4::splat(y2.extract(2)) * x[2] +
f32x4::splat(y2.extract(3)) * x[3],
f32x4::splat(y3.extract(0)) * x[0] +
f32x4::splat(y3.extract(1)) * x[1] +
f32x4::splat(y3.extract(2)) * x[2] +
f32x4::splat(y3.extract(3)) * x[3],
]
}
#[allow(dead_code)]
fn inverse_naive(x: &[[f32; 4]; 4]) -> [[f32; 4]; 4] {
let mut t = [[0_f32; 4]; 4];
for i in 0..4 {
t[0][i] = x[i][0];
t[1][i] = x[i][1];
t[2][i] = x[i][2];
t[3][i] = x[i][3];
}
println!("{:?}", t);
let _0 = t[2][2] * t[3][3];
let _1 = t[2][3] * t[3][2];
let _2 = t[2][1] * t[3][3];
let _3 = t[2][3] * t[3][1];
let _4 = t[2][1] * t[3][2];
let _5 = t[2][2] * t[3][1];
let _6 = t[2][0] * t[3][3];
let _7 = t[2][3] * t[3][0];
let _8 = t[2][0] * t[3][2];
let _9 = t[2][2] * t[3][0];
let _10 = t[2][0] * t[3][1];
let _11 = t[2][1] * t[3][0];
let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
println!("{:?}", v);
let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
(_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
(_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
(_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
(_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
(_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
(_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
(_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
(_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);
println!("{:?}", [d00, d01, d02, d03, d10, d11, d12, d13]);
let _0 = t[0][2] * t[1][3];
let _1 = t[0][3] * t[1][2];
let _2 = t[0][1] * t[1][3];
let _3 = t[0][3] * t[1][1];
let _4 = t[0][1] * t[1][2];
let _5 = t[0][2] * t[1][1];
let _6 = t[0][0] * t[1][3];
let _7 = t[0][3] * t[1][0];
let _8 = t[0][0] * t[1][2];
let _9 = t[0][2] * t[1][0];
let _10 = t[0][0] * t[1][1];
let _11 = t[0][1] * t[1][0];
let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
println!("{:?}", v);
let d20 = _0*t[3][1] + _3*t[3][2] + _4*t[3][3]-
(_1*t[3][1] + _2*t[3][2] + _5*t[3][3]);
let d21 = _1*t[3][0] + _6*t[3][2] + _9*t[3][3]-
(_0*t[3][0] + _7*t[3][2] + _8*t[3][3]);
let d22 = _2*t[3][0] + _7*t[3][1] + _10*t[3][3]-
(_3*t[3][0] + _6*t[3][1] + _11*t[3][3]);
let d23 = _5*t[3][0] + _8*t[3][1] + _11*t[3][2]-
(_4*t[3][0] + _9*t[3][1] + _10*t[3][2]);
let d30 = _2*t[2][2] + _5*t[2][3] + _1*t[2][1]-
(_4*t[2][3] + _0*t[2][1] + _3*t[2][2]);
let d31 = _8*t[2][3] + _0*t[2][0] + _7*t[2][2]-
(_6*t[2][2] + _9*t[2][3] + _1*t[2][0]);
let d32 = _6*t[2][1] + _11*t[2][3] + _3*t[2][0]-
(_10*t[2][3] + _2*t[2][0] + _7*t[2][1]);
let d33 = _10*t[2][2] + _4*t[2][0] + _9*t[2][1]-
(_8*t[2][1] + _11*t[2][2] + _5*t[2][0]);
println!("{:?}", [d20, d21, d22, d23, d30, d31, d32, d33]);
let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;
let det = 1.0 / det;
let mut ret = [[d00, d01, d02, d03],
[d10, d11, d12, d13],
[d20, d21, d22, d23],
[d30, d31, d32, d33]];
for i in 0..4 {
for j in 0..4 {
ret[i][j] *= det;
}
}
ret
}
fn inverse_simd4(x: &[f32x4; 4]) -> [f32x4; 4] {
let src0 = x[0];
let src1 = x[1];
let src2 = x[2];
let src3 = x[3];
let tmp1 = f32x4::new(src0.extract(0), src0.extract(1),
src1.extract(4 - 4), src1.extract(5 - 4));
let row1 = f32x4::new(src2.extract(0), src2.extract(1),
src3.extract(4 - 4), src3.extract(5 - 4));
let row0 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
row1.extract(4 - 4), row1.extract(6 - 4));
let row1 = f32x4::new(row1.extract(1), row1.extract(3),
tmp1.extract(5 - 4), tmp1.extract(7 - 4));
let tmp1 = f32x4::new(src0.extract(2), src0.extract(3),
src1.extract(6 - 4), src1.extract(7 - 4));
let row3 = f32x4::new(src2.extract(2), src2.extract(3),
src3.extract(6 - 4), src3.extract(7 - 4));
let row2 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
row3.extract(4 - 4), row3.extract(6 - 4));
let row3 = f32x4::new(row3.extract(1), row3.extract(3),
tmp1.extract(5 - 4), tmp1.extract(7 - 4));
let tmp1 = row2 * row3;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor0 = row1 * tmp1;
let minor1 = row0 * tmp1;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor0 = (row1 * tmp1) - minor0;
let minor1 = (row0 * tmp1) - minor1;
let minor1 = f32x4::new(minor1.extract(2), minor1.extract(3),
minor1.extract(0), minor1.extract(1));
let tmp1 = row1 * row2;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor0 = (row3 * tmp1) + minor0;
let minor3 = row0 * tmp1;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor0 = minor0 - row3 * tmp1;
let minor3 = row0 * tmp1 - minor3;
let minor3 = f32x4::new(minor3.extract(2), minor3.extract(3),
minor3.extract(0), minor3.extract(1));
let tmp1 = row3 * f32x4::new(row1.extract(2), row1.extract(3),
row1.extract(0), row1.extract(1));
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let row2 = f32x4::new(row2.extract(2), row2.extract(3),
row2.extract(0), row2.extract(1));
let minor0 = row2 * tmp1 + minor0;
let minor2 = row0 * tmp1;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor0 = minor0 - row2 * tmp1;
let minor2 = row0 * tmp1 - minor2;
let minor2 = f32x4::new(minor2.extract(2), minor2.extract(3),
minor2.extract(0), minor2.extract(1));
let tmp1 = row0 * row1;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor2 = minor2 + row3 * tmp1;
let minor3 = row2 * tmp1 - minor3;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor2 = row3 * tmp1 - minor2;
let minor3 = minor3 - row2 * tmp1;
let tmp1 = row0 * row3;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor1 = minor1 - row2 * tmp1;
let minor2 = row1 * tmp1 + minor2;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor1 = row2 * tmp1 + minor1;
let minor2 = minor2 - row1 * tmp1;
let tmp1 = row0 * row2;
let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
tmp1.extract(3), tmp1.extract(2));
let minor1 = row3 * tmp1 + minor1;
let minor3 = minor3 - row1 * tmp1;
let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
tmp1.extract(0), tmp1.extract(1));
let minor1 = minor1 - row3 * tmp1;
let minor3 = row1 * tmp1 + minor3;
let det = row0 * minor0;
let det = f32x4::new(det.extract(2), det.extract(3),
det.extract(0), det.extract(1)) + det;
let det = f32x4::new(det.extract(1), det.extract(0),
det.extract(3), det.extract(2)) + det;
let det = f32x4::splat(1.0) / det;
[minor0 * det, minor1 * det, minor2 * det, minor3 * det]
}
fn p(x: &[f32x4; 4]) {
for xx in x {
for i in 0..4 {
let v = xx.extract(i);
if v == 0.0 {
print!("{}{:6.2}", if i > 0 {", "} else {"|"}, "");
} else {
print!("{}{:6.2}", if i > 0 {", "} else {"|"}, xx.extract(i));
}
}
println!(" |");
}
}
fn main() {
let x = [f32x4::new(-100.0, 6.0, 100.0, 1.0),
f32x4::new(3.0, 1.0, 0.0, 1.0),
f32x4::new(2.0, 1.0, 1.0, 1.0),
f32x4::new(-10.0, 1.0, 1.0, 1.0)];
let y = inverse_simd4(&x);
p(&x);
println!("");
p(&y);
println!("");
p(&mul(&x, &y))
}