//! Module for 4x4 Matrices.
//!
//! A 4x4 matrix is needed for certain transformations on 3D vectors:
//!
//! * The 3D vector is promoted to a 4D vector (generally with a `w` component
//!   of 1.0)
//! * The 4x4 matrix is multiplied by the 4D vector, with the vector as the
//!   right-hand operand (`mat * vec`).
//! * The resulting 4D vector is turned back into a 3D vector by dividing `x`,
//!   `y`, and `z` by the `w` component (which is why 1.0 is considered the
//!   default).
//!
//! Because we usually care quite a bit about our ability to manipulate 3D
//! vectors/points, the `Mat4` type is used much more than any other matrix.

use super::*;

/// A 4x4 matrix of `f32`, stored as four rows of four floats.
///
/// * Row Major: pick a location with `m[row][col]`.
/// * The type uses C layout and is aligned to 16 bytes.
#[rustfmt::skip]
#[derive(Clone, Copy, Debug, Default)]
#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))]
#[repr(C, align(16))]
pub struct Mat4(pub [[f32; 4]; 4]);

impl Mat4 {
  /// Const function for `Mat4` with 0.0 in all positions.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// const MAT4_ZERO: Mat4 = Mat4::zero();
  /// assert_eq!(MAT4_ZERO, Mat4([
  ///    [0.0, 0.0, 0.0, 0.0],
  ///    [0.0, 0.0, 0.0, 0.0],
  ///    [0.0, 0.0, 0.0, 0.0],
  ///    [0.0, 0.0, 0.0, 0.0],
  ///  ]));
  /// ```
  pub const fn zero() -> Self {
    Mat4([
      [0.0, 0.0, 0.0, 0.0],
      [0.0, 0.0, 0.0, 0.0],
      [0.0, 0.0, 0.0, 0.0],
      [0.0, 0.0, 0.0, 0.0],
    ])
  }

  /// Const function for the identity `Mat4`.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// const MAT4_IDENTITY: Mat4 = Mat4::identity();
  /// assert_eq!(MAT4_IDENTITY, Mat4([
  ///    [1.0, 0.0, 0.0, 0.0],
  ///    [0.0, 1.0, 0.0, 0.0],
  ///    [0.0, 0.0, 1.0, 0.0],
  ///    [0.0, 0.0, 0.0, 1.0],
  ///  ]));
  /// ```
  pub const fn identity() -> Self {
    Mat4([
      [1.0, 0.0, 0.0, 0.0],
      [0.0, 1.0, 0.0, 0.0],
      [0.0, 0.0, 1.0, 0.0],
      [0.0, 0.0, 0.0, 1.0],
    ])
  }

  /// Const function for a `Mat4` to scale a `Vec4` in 3D space.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// const MAT4_SCALE: Mat4 = Mat4::scale_xyz(2.0, 3.0, 4.0);
  /// assert_eq!(MAT4_SCALE, Mat4([
  ///    [2.0, 0.0, 0.0, 0.0],
  ///    [0.0, 3.0, 0.0, 0.0],
  ///    [0.0, 0.0, 4.0, 0.0],
  ///    [0.0, 0.0, 0.0, 1.0],
  ///  ]));
  /// ```
  pub const fn scale_xyz(x: f32, y: f32, z: f32) -> Mat4 {
    Mat4([
      [x, 0.0, 0.0, 0.0],
      [0.0, y, 0.0, 0.0],
      [0.0, 0.0, z, 0.0],
      [0.0, 0.0, 0.0, 1.0],
    ])
  }

  /// Const function for a `Mat4` to translate a `Vec4` in 3D space.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// const MAT4_SCALE: Mat4 = Mat4::translate_xyz(2.0, 3.0, 4.0);
  /// assert_eq!(MAT4_SCALE, Mat4([
  ///    [1.0, 0.0, 0.0, 2.0],
  ///    [0.0, 1.0, 0.0, 3.0],
  ///    [0.0, 0.0, 1.0, 4.0],
  ///    [0.0, 0.0, 0.0, 1.0],
  ///  ]));
  /// ```
  pub const fn translate_xyz(x: f32, y: f32, z: f32) -> Mat4 {
    Mat4([
      [1.0, 0.0, 0.0, x],
      [0.0, 1.0, 0.0, y],
      [0.0, 0.0, 1.0, z],
      [0.0, 0.0, 0.0, 1.0],
    ])
  }

  /// Views this `Mat4` as a linear block of 16x `f32`
  #[allow(dead_code)] // only used in fallback paths.
  fn as_floats(&self) -> &[f32] {
    unsafe { core::slice::from_raw_parts(self.0.as_ptr() as *const f32, 16) }
  }

  /// Views this `Mat4` as a mutable linear block of 16x `f32`
  #[allow(dead_code)] // only used in fallback paths.
  fn as_floats_mut(&mut self) -> &mut [f32] {
    unsafe { core::slice::from_raw_parts_mut(self.0.as_mut_ptr() as *mut f32, 16) }
  }
}

impl Deref for Mat4 {
  type Target = [[f32; 4]; 4];
  /// Deref to the inner 4 element array of 4 element arrays
  fn deref(&self) -> &Self::Target {
    &self.0
  }
}

impl DerefMut for Mat4 {
  /// DerefMut to the inner 4 element array of 4 element arrays
  fn deref_mut(&mut self) -> &mut Self::Target {
    &mut self.0
  }
}

impl Index<usize> for Mat4 {
  type Output = [f32; 4];
  /// Index a row
  fn index(&self, i: usize) -> &Self::Output {
    &self.0[i]
  }
}

impl IndexMut<usize> for Mat4 {
  /// IndexMut a row
  fn index_mut(&mut self, i: usize) -> &mut Self::Output {
    &mut self.0[i]
  }
}

impl Index<(usize, usize)> for Mat4 {
  type Output = f32;
  /// Borrows the element at `(row, col)`; panics if either is not in `0..4`.
  fn index(&self, (row, col): (usize, usize)) -> &Self::Output {
    let cells = &self.0[row];
    &cells[col]
  }
}

impl IndexMut<(usize, usize)> for Mat4 {
  /// Mutably borrows the element at `(row, col)`; panics if either is not in
  /// `0..4`.
  fn index_mut(&mut self, (row, col): (usize, usize)) -> &mut Self::Output {
    let cells = &mut self.0[row];
    &mut cells[col]
  }
}

impl From<[[f32; 4]; 4]> for Mat4 {
  /// Directly wraps the given array.
  fn from(array: [[f32; 4]; 4]) -> Self {
    Self(array)
  }
}

impl From<Mat4> for [[f32; 4]; 4] {
  /// Directly unwraps the given array.
  fn from(mat: Mat4) -> Self {
    mat.0
  }
}

impl PartialEq for Mat4 {
  /// Two `Mat4` are equal when all 16 elements compare equal as `f32`.
  ///
  /// This is ordinary float comparison, so a matrix containing a NaN is not
  /// equal to anything, itself included.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// assert_eq!(Mat4::identity(), Mat4::identity());
  /// ```
  fn eq(&self, other: &Self) -> bool {
    if_sse2! {{
      // TODO: benchmark this version against the early bailout version
      // https://rust.godbolt.org/z/cHAxeV we will assume for now that this is
      // faster based on the cycle count of the instructions involved.
      //
      // `_mm_cmpeq_ps` yields an all-ones lane (a NaN bit pattern) where the
      // floats match and an all-zero lane where they differ. Converting with
      // `_mm_cvtps_epi32` maps that NaN to the integer 0x8000_0000 and the
      // zero to 0, so each lane ends up carrying just a sign bit.
      let row0out = _mm_cvtps_epi32(_mm_cmpeq_ps(load_ps!(self[0]), load_ps!(other[0])));
      let row1out = _mm_cvtps_epi32(_mm_cmpeq_ps(load_ps!(self[1]), load_ps!(other[1])));
      let row2out = _mm_cvtps_epi32(_mm_cmpeq_ps(load_ps!(self[2]), load_ps!(other[2])));
      let row3out = _mm_cvtps_epi32(_mm_cmpeq_ps(load_ps!(self[3]), load_ps!(other[3])));
      // After the ANDs a lane is still set only if that column matched in
      // every one of the four rows.
      let row01out = _mm_and_si128(row0out, row1out);
      let row23out = _mm_and_si128(row2out, row3out);
      // Converting back to float keeps the sign (0x8000_0000 becomes a
      // negative float), and `_mm_movemask_ps` collects the four sign bits.
      _mm_movemask_ps(_mm_cvtepi32_ps(_mm_and_si128(row01out, row23out))) == 0b1111
    } else {
      self.0 == other.0
    }}
  }
  /// Two `Mat4` are unequal when at least one pair of elements differs.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// assert_ne!(Mat4::zero(), Mat4::identity());
  /// ```
  fn ne(&self, other: &Self) -> bool {
    // Defined as the exact negation of `eq`. The previous SSE2 version AND-ed
    // the per-row "not equal" masks and required all four bits, so it only
    // reported inequality when *every* column differed in *every* row; for
    // example `Mat4::zero() != Mat4::identity()` came out `false`.
    !self.eq(other)
  }
}

impl Add<Mat4> for Mat4 {
  type Output = Mat4;
  /// Adds two `Mat4` element by element and returns the sum.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = Mat4([
  ///   [0.6, 0.0, 6.2, 4.0],
  ///   [5.0, 6.0, 7.0, 0.2],
  ///   [4.0, 1.2, 7.0, 0.3],
  ///   [0.8, 6.0, -9.0, 1.0],
  /// ]);
  /// let expected = Mat4([
  ///   [1.6, 2.0, 9.2, 8.0],
  ///   [10.0, 12.0, 14.0, 8.2],
  ///   [13.0, 2.2, 9.0, 3.3],
  ///   [4.8, 11.0, -3.0, 8.0],
  /// ]);
  /// assert_eq!(a + b, expected);
  /// ```
  fn add(self, rhs: Mat4) -> Self::Output {
    // Work on a copy of the left operand and reuse the in-place `+=`, which
    // performs the same per-element additions.
    let mut sum = self;
    sum += rhs;
    sum
  }
}

impl AddAssign<Mat4> for Mat4 {
  /// Adds `rhs` element by element into the `Mat4` on the left.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let mut a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = Mat4([
  ///   [0.6, 0.0, 6.2, 4.0],
  ///   [5.0, 6.0, 7.0, 0.2],
  ///   [4.0, 1.2, 7.0, 0.3],
  ///   [0.8, 6.0, -9.0, 1.0],
  /// ]);
  /// let expected = Mat4([
  ///   [1.6, 2.0, 9.2, 8.0],
  ///   [10.0, 12.0, 14.0, 8.2],
  ///   [13.0, 2.2, 9.0, 3.3],
  ///   [4.8, 11.0, -3.0, 8.0],
  /// ]);
  /// a += b;
  /// assert_eq!(a, expected);
  /// ```
  fn add_assign(&mut self, rhs: Mat4) {
    if_sse2! {{
      // One 4-lane add per row.
      for row in 0..4 {
        let sum = _mm_add_ps(load_ps!(self[row]), load_ps!(rhs[row]));
        store_ps!(self[row], sum);
      }
    } else {
      let addends = rhs.as_floats();
      for (idx, cell) in self.as_floats_mut().iter_mut().enumerate() {
        *cell += addends[idx];
      }
    }}
  }
}

impl Add<f32> for Mat4 {
  type Output = Mat4;
  /// Returns a `Mat4` with the `f32` added to every element (float on the
  /// right).
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = 30.0;
  /// let expected = Mat4([
  ///   [31.0, 32.0, 33.0, 34.0],
  ///   [35.0, 36.0, 37.0, 38.0],
  ///   [39.0, 31.0, 32.0, 33.0],
  ///   [34.0, 35.0, 36.0, 37.0],
  /// ]);
  /// assert_eq!(a + b, expected);
  /// ```
  fn add(self, rhs: f32) -> Self::Output {
    // Work on a copy of the matrix and reuse the in-place `+=`, which
    // performs the same per-element additions.
    let mut sum = self;
    sum += rhs;
    sum
  }
}

impl AddAssign<f32> for Mat4 {
  /// Adds the `f32` to every element of the `Mat4`, in place.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let mut a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = 30.0;
  /// let expected = Mat4([
  ///   [31.0, 32.0, 33.0, 34.0],
  ///   [35.0, 36.0, 37.0, 38.0],
  ///   [39.0, 31.0, 32.0, 33.0],
  ///   [34.0, 35.0, 36.0, 37.0],
  /// ]);
  /// a += b;
  /// assert_eq!(a, expected);
  /// ```
  fn add_assign(&mut self, rhs: f32) {
    if_sse2! {{
      // Broadcast the scalar to all four lanes once, then add it row by row.
      let broadcast = _mm_set1_ps(rhs);
      for row in 0..4 {
        let sum = _mm_add_ps(load_ps!(self[row]), broadcast);
        store_ps!(self[row], sum);
      }
    } else {
      self.as_floats_mut().iter_mut().for_each(|cell| *cell += rhs);
    }}
  }
}

impl Add<Mat4> for f32 {
  type Output = Mat4;
  /// Returns a `Mat4` with the `f32` added to every element (float on the
  /// left).
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = 30.0;
  /// let expected = Mat4([
  ///   [31.0, 32.0, 33.0, 34.0],
  ///   [35.0, 36.0, 37.0, 38.0],
  ///   [39.0, 31.0, 32.0, 33.0],
  ///   [34.0, 35.0, 36.0, 37.0],
  /// ]);
  /// assert_eq!(b + a, expected);
  /// ```
  fn add(self, rhs: Mat4) -> Self::Output {
    // Forward to the matrix-on-the-left form with the operands swapped.
    <Mat4 as Add<f32>>::add(rhs, self)
  }
}

impl Mul<Mat4> for f32 {
  type Output = Mat4;
  /// Returns a `Mat4` with every element multiplied by the `f32` (float on
  /// the left).
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = 10.0;
  /// let expected = Mat4([
  ///   [10.0, 20.0, 30.0, 40.0],
  ///   [50.0, 60.0, 70.0, 80.0],
  ///   [90.0, 10.0, 20.0, 30.0],
  ///   [40.0, 50.0, 60.0, 70.0],
  /// ]);
  /// assert_eq!(b * a, expected);
  /// ```
  fn mul(self, rhs: Mat4) -> Self::Output {
    // Forward to the matrix-on-the-left form with the operands swapped.
    <Mat4 as Mul<f32>>::mul(rhs, self)
  }
}

impl Mul<Mat4> for Mat4 {
  type Output = Mat4;
  /// Matrix product of this `Mat4` with the `Mat4` on the right.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let i = Mat4::identity();
  /// assert_eq!(i * i, i);
  /// let a = Mat4([
  ///   [3.0, 5.0, 7.0, 8.0],
  ///   [1.0, 2.0, 8.0, 7.0],
  ///   [4.0, 5.0, 3.0, -2.0],
  ///   [1.0, 6.0, 7.0, 9.0],
  /// ]);
  /// let b = Mat4([
  ///   [2.0, 8.0, 6.0, 9.0],
  ///   [3.0, -5.0, 6.0, 7.0],
  ///   [1.0, 4.0, 9.0, -3.0],
  ///   [10.0, -2.0, 5.0, 2.0],
  /// ]);
  /// let expected = Mat4([
  ///   [108.0, 11.0, 151.0, 57.0],
  ///   [86.0, 16.0, 125.0, 13.0],
  ///   [6.0, 23.0, 71.0, 58.0],
  ///   [117.0, -12.0, 150.0, 48.0],
  /// ]);
  /// assert_eq!(a * b, expected);
  /// ```
  fn mul(self, rhs: Mat4) -> Self::Output {
    let mut product = Self::zero();
    if_sse2! {{
      // The rows of the right operand are reused for every output row.
      let b0 = load_ps!(rhs[0]);
      let b1 = load_ps!(rhs[1]);
      let b2 = load_ps!(rhs[2]);
      let b3 = load_ps!(rhs[3]);
      for r in 0..4 {
        // Output row `r` is a linear combination of the rows of `rhs`,
        // weighted by the four elements of row `r` of `self`. Each shuffle
        // broadcasts one of those elements to all four lanes.
        let lhs_row = load_ps!(self[r]);
        let w0 = _mm_shuffle_ps(lhs_row, lhs_row, 0b00_00_00_00);
        let w1 = _mm_shuffle_ps(lhs_row, lhs_row, 0b01_01_01_01);
        let w2 = _mm_shuffle_ps(lhs_row, lhs_row, 0b10_10_10_10);
        let w3 = _mm_shuffle_ps(lhs_row, lhs_row, 0b11_11_11_11);
        let lower = _mm_add_ps(_mm_mul_ps(w0, b0), _mm_mul_ps(w1, b1));
        let upper = _mm_add_ps(_mm_mul_ps(w2, b2), _mm_mul_ps(w3, b3));
        store_ps!(product[r], _mm_add_ps(lower, upper));
      }
    } else {
      // Textbook triple loop: product[r][c] = sum over p of
      // self[r][p] * rhs[p][c].
      for (r, lhs_row) in self.0.iter().enumerate() {
        for c in 0..4 {
          for (p, weight) in lhs_row.iter().enumerate() {
            product[r][c] += *weight * rhs[p][c];
          }
        }
      }
    }}
    product
  }
}

impl Mul<f32> for Mat4 {
  type Output = Mat4;
  /// Returns a `Mat4` with every element multiplied by the `f32` (float on
  /// the right).
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = 10.0;
  /// let expected = Mat4([
  ///   [10.0, 20.0, 30.0, 40.0],
  ///   [50.0, 60.0, 70.0, 80.0],
  ///   [90.0, 10.0, 20.0, 30.0],
  ///   [40.0, 50.0, 60.0, 70.0],
  /// ]);
  /// assert_eq!(a * b, expected);
  /// ```
  fn mul(self, rhs: f32) -> Self::Output {
    // Work on a copy of the matrix and reuse the in-place `*=`, which
    // performs the same per-element multiplications.
    let mut scaled = self;
    scaled *= rhs;
    scaled
  }
}

impl Mul<Vec4> for Mat4 {
  type Output = Vec4;
  /// Matrix-vector product with the `Vec4` on the right: element `r` of the
  /// result is the dot product of row `r` of this `Mat4` with the vector.
  ///
  /// ```rust
  /// use hektor::{Mat4, Vec4};
  /// let v = Vec4([1.0, 2.0, 3.0, 4.0]);
  /// assert_eq!(Mat4::identity() * v, v);
  /// assert_eq!(Mat4::zero() * v, Vec4::zero());
  /// ```
  fn mul(self, rhs: Vec4) -> Self::Output {
    if_sse2! {{
      // Multiply each row lane-wise with the vector, then sum the lanes.
      let vec128 = load_ps!(rhs);
      let dot0 = horizontal_sum!(_mm_mul_ps(load_ps!(self[0]), vec128));
      let dot1 = horizontal_sum!(_mm_mul_ps(load_ps!(self[1]), vec128));
      let dot2 = horizontal_sum!(_mm_mul_ps(load_ps!(self[2]), vec128));
      let dot3 = horizontal_sum!(_mm_mul_ps(load_ps!(self[3]), vec128));
      Vec4([dot0, dot1, dot2, dot3])
    } else {
      let mut out = Vec4::zero();
      for (r, row) in self.0.iter().enumerate() {
        for (c, entry) in row.iter().enumerate() {
          out[r] += *entry * rhs[c];
        }
      }
      out
    }}
  }
}

impl MulAssign<Mat4> for Mat4 {
  /// Replaces this `Mat4` with the matrix product `self * rhs` (the other
  /// `Mat4` is on the right).
  ///
  /// The result is exactly what `*self = *self * rhs` produces, so `*=` and
  /// `*` always agree.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let mut a = Mat4([
  ///   [3.0, 5.0, 7.0, 8.0],
  ///   [1.0, 2.0, 8.0, 7.0],
  ///   [4.0, 5.0, 3.0, -2.0],
  ///   [1.0, 6.0, 7.0, 9.0],
  /// ]);
  /// let b = Mat4([
  ///   [2.0, 8.0, 6.0, 9.0],
  ///   [3.0, -5.0, 6.0, 7.0],
  ///   [1.0, 4.0, 9.0, -3.0],
  ///   [10.0, -2.0, 5.0, 2.0],
  /// ]);
  /// let expected = Mat4([
  ///   [108.0, 11.0, 151.0, 57.0],
  ///   [86.0, 16.0, 125.0, 13.0],
  ///   [6.0, 23.0, 71.0, 58.0],
  ///   [117.0, -12.0, 150.0, 48.0],
  /// ]);
  /// a *= b;
  /// assert_eq!(a, expected);
  /// ```
  fn mul_assign(&mut self, rhs: Mat4) {
    // A matrix product cannot be accumulated in place, so a temporary is
    // needed either way. Delegating to `Mul` keeps a single kernel: the
    // previous hand-written SSE2 copy summed the four partial products in a
    // different order than `Mul` does, so `a *= b` and `a = a * b` could
    // round differently.
    *self = *self * rhs;
  }
}

impl MulAssign<f32> for Mat4 {
  /// Multiplies every element of the `Mat4` by the `f32`, in place.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let mut a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = 10.0;
  /// let expected = Mat4([
  ///   [10.0, 20.0, 30.0, 40.0],
  ///   [50.0, 60.0, 70.0, 80.0],
  ///   [90.0, 10.0, 20.0, 30.0],
  ///   [40.0, 50.0, 60.0, 70.0],
  /// ]);
  /// a *= b;
  /// assert_eq!(a, expected);
  /// ```
  fn mul_assign(&mut self, rhs: f32) {
    if_sse2! {{
      // Broadcast the scalar to all four lanes once, then scale row by row.
      let broadcast = _mm_set1_ps(rhs);
      for row in 0..4 {
        let scaled = _mm_mul_ps(load_ps!(self[row]), broadcast);
        store_ps!(self[row], scaled);
      }
    } else {
      self.as_floats_mut().iter_mut().for_each(|cell| *cell *= rhs);
    }}
  }
}

impl Sub<Mat4> for Mat4 {
  type Output = Mat4;
  /// Subtracts `rhs` from this `Mat4` element by element and returns the
  /// difference.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = Mat4([
  ///   [0.5, 0.0, 6.0, 4.0],
  ///   [5.0, 6.0, 7.0, 0.2],
  ///   [4.0, 1.5, 7.0, 0.3],
  ///   [0.8, 6.0, -9.0, 1.0],
  /// ]);
  /// let expected = Mat4([
  ///   [0.5, 2.0, -3.0, 0.0],
  ///   [0.0, 0.0, 0.0, 7.8],
  ///   [5.0, -0.5, -5.0, 2.7],
  ///   [3.2, -1.0, 15.0, 6.0],
  /// ]);
  /// assert_eq!(a - b, expected);
  /// ```
  fn sub(self, rhs: Mat4) -> Self::Output {
    // Work on a copy of the left operand and reuse the in-place `-=`, which
    // performs the same per-element subtractions.
    let mut difference = self;
    difference -= rhs;
    difference
  }
}

impl SubAssign<Mat4> for Mat4 {
  /// Subtracts `rhs` element by element from the `Mat4` on the left, in
  /// place.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let mut a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = Mat4([
  ///   [0.5, 0.0, 6.0, 4.0],
  ///   [5.0, 6.0, 7.0, 0.2],
  ///   [4.0, 1.5, 7.0, 0.3],
  ///   [0.8, 6.0, -9.0, 1.0],
  /// ]);
  /// let expected = Mat4([
  ///   [0.5, 2.0, -3.0, 0.0],
  ///   [0.0, 0.0, 0.0, 7.8],
  ///   [5.0, -0.5, -5.0, 2.7],
  ///   [3.2, -1.0, 15.0, 6.0],
  /// ]);
  /// a -= b;
  /// assert_eq!(a, expected);
  /// ```
  fn sub_assign(&mut self, rhs: Mat4) {
    if_sse2! {{
      // One 4-lane subtract per row.
      for row in 0..4 {
        let difference = _mm_sub_ps(load_ps!(self[row]), load_ps!(rhs[row]));
        store_ps!(self[row], difference);
      }
    } else {
      let subtrahends = rhs.as_floats();
      for (idx, cell) in self.as_floats_mut().iter_mut().enumerate() {
        *cell -= subtrahends[idx];
      }
    }}
  }
}

impl Sub<f32> for Mat4 {
  type Output = Mat4;
  /// Returns a `Mat4` with the `f32` subtracted from every element (float on
  /// the right).
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = 1.0;
  /// let expected = Mat4([
  ///   [0.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  ///   [8.0, 0.0, 1.0, 2.0],
  ///   [3.0, 4.0, 5.0, 6.0],
  /// ]);
  /// assert_eq!(a - b, expected);
  /// ```
  fn sub(self, rhs: f32) -> Self::Output {
    // Work on a copy of the matrix and reuse the in-place `-=`, which
    // performs the same per-element subtractions.
    let mut difference = self;
    difference -= rhs;
    difference
  }
}

impl SubAssign<f32> for Mat4 {
  /// Subtracts the `f32` from every element of the `Mat4`, in place.
  ///
  /// ```rust
  /// use hektor::Mat4;
  /// let mut a = Mat4([
  ///   [1.0, 2.0, 3.0, 4.0],
  ///   [5.0, 6.0, 7.0, 8.0],
  ///   [9.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  /// ]);
  /// let b = 1.0;
  /// let expected = Mat4([
  ///   [0.0, 1.0, 2.0, 3.0],
  ///   [4.0, 5.0, 6.0, 7.0],
  ///   [8.0, 0.0, 1.0, 2.0],
  ///   [3.0, 4.0, 5.0, 6.0],
  /// ]);
  /// a -= b;
  /// assert_eq!(a, expected);
  /// ```
  fn sub_assign(&mut self, rhs: f32) {
    if_sse2! {{
      // Broadcast the scalar to all four lanes once, then subtract it row by
      // row.
      let broadcast = _mm_set1_ps(rhs);
      for row in 0..4 {
        let difference = _mm_sub_ps(load_ps!(self[row]), broadcast);
        store_ps!(self[row], difference);
      }
    } else {
      self.as_floats_mut().iter_mut().for_each(|cell| *cell -= rhs);
    }}
  }
}