1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
use crate::{ops::*, tensor::*};
use std::iter::{Iterator, Sum};
use std::ops::{Add, AddAssign, Mul};
use std::mem;
use typenum::{Unsigned};
use generic_array::{ArrayLength};

impl<'a, 'b, V, T, N, L> Add<&'b Tensor<V, N, T, L>> for &'a Tensor<V, N, T, L>
  where &'a V: Add<&'b V, Output=V>,
        N: ArrayLength<V> {
  type Output = Tensor<V, N, T, L>;
  fn add(self, rhs: &'b Tensor<V, N, T, L>) -> Self::Output {
    let mut tn: Self::Output = unsafe { mem::uninitialized() };  
    for (i, (a, b)) in self.iter().zip(rhs.iter()).enumerate() {
      tn[i] = a + b;
    }
    tn
  }
}

/// Dot product of a row-major tensor with a column-major tensor of the same
/// length `N`.
///
/// The elements are processed in chunks of eight with eight independent
/// partial sums; the `N % 8` trailing elements are handled separately.
/// Returns the sum of `self[k] * rhs[k]` over all `k`.
impl<'a, 'b, T, N> Dot<&'b Tensor<T, N, T, CMaj>> for &'a Tensor<T, N, T, RMaj>
  where T: Add<T, Output=T> + AddAssign<T> + Mul<T, Output=T> + Default + Sum + Sized + Copy,
        N: ArrayLength<T> + Unsigned {
  type Output = T;
  fn dot(self, rhs: &'b Tensor<T, N, T, CMaj>) -> Self::Output {
    let n = N::to_usize();
    // Number of complete 8-element chunks, and the index where the tail starts.
    let c = n / 8;
    let i = c * 8;
    // Tail: products (not sums) of the elements past the last full chunk.
    let mut t: T = self[i..].iter()
                            .zip(rhs[i..].iter())
                            .map(|(&a, &b)| a * b)
                            .sum();
    if c > 0 {
      // Eight running partial sums, one per lane of a chunk.
      // NOTE(review): relies on `T::default()` being the additive identity.
      let mut p = [T::default(); 8];
      // `chunks_exact` yields only the `c` full chunks and skips the tail.
      for (a, b) in self[..].chunks_exact(8).zip(rhs[..].chunks_exact(8)) {
        // Accumulate into each lane; plain assignment would discard every
        // chunk except the last one.
        for (acc, (&x, &y)) in p.iter_mut().zip(a.iter().zip(b.iter())) {
          *acc += x * y;
        }
      }
      // Pairwise reduction of the lanes into the final result.
      t += p[0] + p[4];
      t += p[1] + p[5];
      t += p[2] + p[6];
      t += p[3] + p[7];
    }
    t
  }
}