simulated_packed_simd 0.0.1

Simulated SIMD for Stable channel. Effort to make the `packed_simd` crate work on stable channel.
Documentation
/// Slightly modify the example on crate `simd` to make it work on stable channel 
/// Reference link : https://github.com/rust-lang-nursery/simd

extern crate ssimd;
use ssimd::f32x4;

fn mul(x: &[f32x4; 4], y: &[f32x4; 4]) -> [f32x4; 4] {
    let y0 = y[0];
    let y1 = y[1];
    let y2 = y[2];
    let y3 = y[3];
    [f32x4::splat(y0.extract(0)) * x[0] +
     f32x4::splat(y0.extract(1)) * x[1] +
     f32x4::splat(y0.extract(2)) * x[2] +
     f32x4::splat(y0.extract(3)) * x[3],
     f32x4::splat(y1.extract(0)) * x[0] +
     f32x4::splat(y1.extract(1)) * x[1] +
     f32x4::splat(y1.extract(2)) * x[2] +
     f32x4::splat(y1.extract(3)) * x[3],
     f32x4::splat(y2.extract(0)) * x[0] +
     f32x4::splat(y2.extract(1)) * x[1] +
     f32x4::splat(y2.extract(2)) * x[2] +
     f32x4::splat(y2.extract(3)) * x[3],
     f32x4::splat(y3.extract(0)) * x[0] +
     f32x4::splat(y3.extract(1)) * x[1] +
     f32x4::splat(y3.extract(2)) * x[2] +
     f32x4::splat(y3.extract(3)) * x[3],
     ]
}

#[allow(dead_code)]
fn inverse_naive(x: &[[f32; 4]; 4]) -> [[f32; 4]; 4] {
    let mut t = [[0_f32; 4]; 4];
    for i in 0..4 {
        t[0][i] = x[i][0];
        t[1][i] = x[i][1];
        t[2][i] = x[i][2];
        t[3][i] = x[i][3];
    }
    println!("{:?}", t);

    let _0 = t[2][2] * t[3][3];
    let _1 = t[2][3] * t[3][2];
    let _2 = t[2][1] * t[3][3];
    let _3 = t[2][3] * t[3][1];
    let _4 = t[2][1] * t[3][2];
    let _5 = t[2][2] * t[3][1];
    let _6 = t[2][0] * t[3][3];
    let _7 = t[2][3] * t[3][0];
    let _8 = t[2][0] * t[3][2];
    let _9 = t[2][2] * t[3][0];
    let _10 = t[2][0] * t[3][1];
    let _11 = t[2][1] * t[3][0];
    let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
    println!("{:?}", v);

    let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
        (_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
    let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
        (_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
    let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
        (_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
    let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
        (_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
    let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
        (_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
    let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
        (_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
    let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
        (_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
    let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
        (_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);

    println!("{:?}", [d00, d01, d02, d03, d10, d11, d12, d13]);

    let _0 = t[0][2] * t[1][3];
    let _1 = t[0][3] * t[1][2];
    let _2 = t[0][1] * t[1][3];
    let _3 = t[0][3] * t[1][1];
    let _4 = t[0][1] * t[1][2];
    let _5 = t[0][2] * t[1][1];
    let _6 = t[0][0] * t[1][3];
    let _7 = t[0][3] * t[1][0];
    let _8 = t[0][0] * t[1][2];
    let _9 = t[0][2] * t[1][0];
    let _10 = t[0][0] * t[1][1];
    let _11 = t[0][1] * t[1][0];
    let v = [_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11];
    println!("{:?}", v);

    let d20  = _0*t[3][1]  + _3*t[3][2]  + _4*t[3][3]-
        (_1*t[3][1]  + _2*t[3][2]  + _5*t[3][3]);
    let d21  = _1*t[3][0]  + _6*t[3][2]  + _9*t[3][3]-
        (_0*t[3][0]  + _7*t[3][2]  + _8*t[3][3]);
    let d22 = _2*t[3][0]  + _7*t[3][1]  + _10*t[3][3]-
        (_3*t[3][0]  + _6*t[3][1]  + _11*t[3][3]);
    let d23 = _5*t[3][0]  + _8*t[3][1]  + _11*t[3][2]-
        (_4*t[3][0]  + _9*t[3][1]  + _10*t[3][2]);
    let d30 = _2*t[2][2]  + _5*t[2][3]  + _1*t[2][1]-
        (_4*t[2][3]  + _0*t[2][1]   + _3*t[2][2]);
    let d31 = _8*t[2][3]  + _0*t[2][0]   + _7*t[2][2]-
        (_6*t[2][2]  + _9*t[2][3]  + _1*t[2][0]);
    let d32 = _6*t[2][1]   + _11*t[2][3] + _3*t[2][0]-
        (_10*t[2][3] + _2*t[2][0]   + _7*t[2][1]);
    let d33 = _10*t[2][2] + _4*t[2][0]   + _9*t[2][1]-
        (_8*t[2][1]   + _11*t[2][2] + _5*t[2][0]);

    println!("{:?}", [d20, d21, d22, d23, d30, d31, d32, d33]);

    let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;

    let det = 1.0 / det;
    let mut ret = [[d00, d01, d02, d03],
                   [d10, d11, d12, d13],
                   [d20, d21, d22, d23],
                   [d30, d31, d32, d33]];
    for i in 0..4 {
        for j in 0..4 {
            ret[i][j] *= det;
        }
    }
    ret
}

fn inverse_simd4(x: &[f32x4; 4]) -> [f32x4; 4] {
    let src0 = x[0];
    let src1 = x[1];
    let src2 = x[2];
    let src3 = x[3];

    let tmp1 = f32x4::new(src0.extract(0), src0.extract(1),
                          src1.extract(4 - 4), src1.extract(5 - 4));
    let row1 = f32x4::new(src2.extract(0), src2.extract(1),
                          src3.extract(4 - 4), src3.extract(5 - 4));
    let row0 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
                          row1.extract(4 - 4), row1.extract(6 - 4));
    let row1 = f32x4::new(row1.extract(1), row1.extract(3),
                          tmp1.extract(5 - 4), tmp1.extract(7 - 4));

    let tmp1 = f32x4::new(src0.extract(2), src0.extract(3),
                          src1.extract(6 - 4), src1.extract(7 - 4));
    let row3 = f32x4::new(src2.extract(2), src2.extract(3),
                          src3.extract(6 - 4), src3.extract(7 - 4));
    let row2 = f32x4::new(tmp1.extract(0), tmp1.extract(2),
                          row3.extract(4 - 4), row3.extract(6 - 4));
    let row3 = f32x4::new(row3.extract(1), row3.extract(3),
                          tmp1.extract(5 - 4), tmp1.extract(7 - 4));


    let tmp1 = row2 * row3;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor0 = row1 * tmp1;
    let minor1 = row0 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor0 = (row1 * tmp1) - minor0;
    let minor1 = (row0 * tmp1) - minor1;
    let minor1 = f32x4::new(minor1.extract(2), minor1.extract(3),
                            minor1.extract(0), minor1.extract(1));
    //println!("{:?}", minor1);


    let tmp1 = row1 * row2;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor0 = (row3 * tmp1) + minor0;
    let minor3 = row0 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));

    let minor0 = minor0 - row3 * tmp1;
    let minor3 = row0 * tmp1 - minor3;
    let minor3 = f32x4::new(minor3.extract(2), minor3.extract(3),
                            minor3.extract(0), minor3.extract(1));
    //println!("{:?}", minor1);


    let tmp1 = row3 * f32x4::new(row1.extract(2), row1.extract(3),
                                 row1.extract(0), row1.extract(1));
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let row2 = f32x4::new(row2.extract(2), row2.extract(3),
                          row2.extract(0), row2.extract(1));
    let minor0 = row2 * tmp1 + minor0;
    let minor2 = row0 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor0 = minor0 - row2 * tmp1;
    let minor2 = row0 * tmp1 - minor2;
    let minor2 = f32x4::new(minor2.extract(2), minor2.extract(3),
                            minor2.extract(0), minor2.extract(1));
    //println!("{:?}", minor1);


    let tmp1 = row0 * row1;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor2 = minor2 + row3 * tmp1;
    let minor3 = row2 * tmp1 - minor3;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor2 = row3 * tmp1 - minor2;
    let minor3 = minor3 - row2 * tmp1;
    //println!("{:?}", minor1);



    let tmp1 = row0 * row3;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor1 = minor1 - row2 * tmp1;
    let minor2 = row1 * tmp1 + minor2;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor1 = row2 * tmp1 + minor1;
    let minor2 = minor2 - row1 * tmp1;
    //println!("{:?}", minor1);

    let tmp1 = row0 * row2;
    let tmp1 = f32x4::new(tmp1.extract(1), tmp1.extract(0),
                          tmp1.extract(3), tmp1.extract(2));
    let minor1 = row3 * tmp1 + minor1;
    let minor3 = minor3 - row1 * tmp1;
    let tmp1 = f32x4::new(tmp1.extract(2), tmp1.extract(3),
                          tmp1.extract(0), tmp1.extract(1));
    let minor1 = minor1 - row3 * tmp1;
    let minor3 = row1 * tmp1 + minor3;
    //println!("{:?}", minor1);

    let det = row0 * minor0;
    let det = f32x4::new(det.extract(2), det.extract(3),
                         det.extract(0), det.extract(1)) + det;
    let det = f32x4::new(det.extract(1), det.extract(0),
                         det.extract(3), det.extract(2)) + det;
    //let tmp1 = det.approx_reciprocal();
    //let det = tmp1 + tmp1 - det * tmp1 * tmp1;
    let det = f32x4::splat(1.0) / det;

//    let det = f32x4::splat(det.extract(0));

    [minor0 * det, minor1 * det, minor2 * det, minor3 * det]
}

fn p(x: &[f32x4; 4]) {
    for xx in x {
        for i in 0..4 {
            let v = xx.extract(i);
            if v == 0.0 {
                print!("{}{:6.2}", if i > 0 {", "} else {"|"}, "");
            } else {
                print!("{}{:6.2}", if i > 0 {", "} else {"|"}, xx.extract(i));
            }
        }
        println!(" |");
    }
}

fn main() {
    let x = [f32x4::new(-100.0, 6.0, 100.0, 1.0),
             f32x4::new(3.0, 1.0, 0.0, 1.0),
             f32x4::new(2.0, 1.0, 1.0, 1.0),
             f32x4::new(-10.0, 1.0, 1.0, 1.0)];

  /*  let mut x_ = [[0.0; 4]; 4];
    for i in 0..4 {
        for j in 0..4 {
            x_[i][j] = x[i].extract(j as u32)
        }
    }

    let ret = inverse_naive(&x_);
    let mut y = [f32x4::splat(0.0); 4];
    for i in 0..4 {
        for j in 0..4 {
            y[i] = y[i].replace(j as u32, ret[i][j])
        }
}*/
    let y = inverse_simd4(&x);
    p(&x);
    println!("");
    p(&y);
    println!("");
    p(&mul(&x, &y))
}