use std::{any::Any, collections::HashMap};
use crate::{
nodes::{node::Node, onnx_operation_trait::FromOnnxOperation, unique_ids::UniqueId},
tensor_map::TensorMap,
typed_array::TypedArray,
};
use anyhow::{Ok, Result};
use ndarray::{ArrayD, IxDyn};
use onnx_extractor::{AttributeValue, OnnxOperation};
use saker_rs::{activations::Activation, linarg::operations::sgemm_bias_parallel};
/// Graph node representing a matrix multiplication between two named tensors.
///
/// The node stores only tensor *names*; the tensors themselves are looked up
/// in a `TensorMap` by those names when the node is executed or when output
/// shapes are determined.
#[derive(Default)]
pub struct MatMulNode<T: Default> {
/// Name of the left-hand input tensor (first entry of `input_names`).
a: String,
/// Name of the right-hand input tensor (second entry of `input_names`).
b: String,
/// Name of the output tensor (sole entry of `output_names`).
o: String,
/// Operation tag reported through `get_unique_id`.
unique_id: UniqueId,
/// Successor nodes, if any; `None` until `set_next` installs them.
next_node: Option<Vec<Box<dyn Node<T>>>>,
}
impl<T: Default> FromOnnxOperation for MatMulNode<T> {
fn from_onnx_operation(elem: &OnnxOperation) -> Result<Self> {
let mut gemm = Self {
a: String::new(),
b: String::new(),
o: String::new(),
unique_id: UniqueId::Gemm,
next_node: None,
};
let inputs = &elem.inputs;
gemm.add_input_strings(inputs[0].clone(), inputs[1].clone());
gemm.add_output_strings(elem.outputs[0].clone());
Ok(gemm)
}
}
impl<T: Default> MatMulNode<T> {
    /// Records the names of the left (`a`) and right (`b`) input tensors,
    /// replacing whatever names were stored before.
    pub fn add_input_strings(&mut self, a: String, b: String) {
        (self.a, self.b) = (a, b);
    }

    /// Records the name of the output tensor, replacing the previous one.
    pub fn add_output_strings(&mut self, o: String) {
        let output_name = o;
        self.o = output_name;
    }
}
impl<T: Default + 'static> Node<T> for MatMulNode<T> {
    /// Exposes the node as `dyn Any` so callers can downcast it.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }

    /// Returns the operation tag stored on this node.
    fn get_unique_id(&self) -> UniqueId {
        self.unique_id
    }

    /// Same as [`get_unique_id`](Self::get_unique_id); the tag is returned by value.
    fn get_unique_id_mut(&mut self) -> UniqueId {
        self.unique_id
    }

    /// Detaches and returns the successor list, leaving `None` behind.
    fn take_next(&mut self) -> Option<Vec<Box<dyn Node<T>>>> {
        self.next_node.take()
    }

    /// Mutable access to the successor list, if one is set.
    fn get_next_mut(&mut self) -> Option<&mut Vec<Box<dyn Node<T>>>> {
        self.next_node.as_mut()
    }

    /// Replaces the successor list.
    fn set_next(&mut self, next: Option<Vec<Box<dyn Node<T>>>>) {
        self.next_node = next;
    }

    /// Names of the two input tensors, left operand first.
    fn input_names(&self) -> Vec<String> {
        vec![self.a.clone(), self.b.clone()]
    }

    /// Name of the single output tensor.
    fn output_names(&self) -> Vec<String> {
        vec![self.o.clone()]
    }

    /// Shared access to the successor list, if one is set.
    fn get_next(&self) -> Option<&Vec<Box<dyn Node<T>>>> {
        self.next_node.as_ref()
    }

    /// Multiplies the two input tensors and stores the product in the output
    /// tensor.
    ///
    /// # Panics
    ///
    /// Panics when either input or the output is missing from `omap`, or when
    /// `TypedArray::matmul` reports an error.
    fn execute(&self, omap: &mut TensorMap) {
        // NOTE(review): if `TensorMap::get_disjoint_mut` follows the semantics of
        // `HashMap::get_disjoint_mut`, it panics when two of the names are equal
        // (e.g. both inputs refer to the same tensor) — confirm.
        let [a, b, o] = omap.get_disjoint_mut([&self.a, &self.b, &self.o]);
        let a = &*a.unwrap();
        let b = &*b.unwrap();
        match o {
            Some(result) => {
                a.matmul(b, result).unwrap();
            }
            _ => panic!("MatMulNode: missing output {}", self.o),
        }
    }

    /// Prints this node (prefixed by the successor count when successors are
    /// set) and then every successor.
    fn print(&self) {
        if let Some(list) = &self.next_node {
            print!("{}-", list.len());
        }
        println!("MatMul-{},{:?},{}", self.a, self.b, self.o);
        if let Some(next) = &self.next_node {
            next.iter().for_each(|v| v.print());
        }
    }

    /// Derives the output shape from the input shapes, replaces the output
    /// tensor with an empty tensor of that shape (typed like the left input),
    /// and then asks every successor to do the same.
    ///
    /// Nothing happens — successors included — when either input is missing
    /// from `omap` or does not report a shape yet.
    ///
    /// # Panics
    ///
    /// Panics when an operand has rank 0.
    fn determine_output_shape(&mut self, omap: &mut TensorMap) {
        /// Shape of `a @ b` following numpy-style matmul rules, except that the
        /// 1-D x 1-D product is reported as `[1]` rather than a scalar.
        fn product_shape(a_shape: &[usize], b_shape: &[usize]) -> Vec<usize> {
            match (a_shape.len(), b_shape.len()) {
                (1, 1) => vec![1],
                (2, 1) => vec![a_shape[0]],
                (1, 2) => vec![b_shape[1]],
                (2, 2) => vec![a_shape[0], b_shape[1]],
                (a_ndim, b_ndim) => {
                    assert!(
                        a_ndim >= 1 && b_ndim >= 1,
                        "MatMulNode: rank-0 operands are not supported"
                    );
                    // A 1-D operand has no batch axes; `saturating_sub` keeps the
                    // slice bound from underflowing in that case.
                    let a_batch = &a_shape[..a_ndim.saturating_sub(2)];
                    let b_batch = &b_shape[..b_ndim.saturating_sub(2)];
                    let batch_rank = a_batch.len().max(b_batch.len());
                    let a_pad = batch_rank - a_batch.len();
                    let b_pad = batch_rank - b_batch.len();

                    let mut shape: Vec<usize> = (0..batch_rank)
                        .map(|axis| {
                            // Missing leading axes broadcast as size 1.
                            let a_dim = if axis < a_pad { 1 } else { a_batch[axis - a_pad] };
                            let b_dim = if axis < b_pad { 1 } else { b_batch[axis - b_pad] };
                            match (a_dim, b_dim) {
                                // A size-1 axis stretches to the other operand's
                                // size, including size 0.
                                (1, d) | (d, 1) => d,
                                // Equal when the shapes are compatible; mismatches
                                // are not diagnosed during shape inference.
                                (x, y) => x.max(y),
                            }
                        })
                        .collect();

                    // A 1-D left operand acts as a row vector and contributes no
                    // row axis; a 1-D right operand acts as a column vector and
                    // contributes no column axis.
                    if a_ndim > 1 {
                        shape.push(a_shape[a_ndim - 2]);
                    }
                    if b_ndim > 1 {
                        shape.push(b_shape[b_ndim - 1]);
                    }
                    shape
                }
            }
        }

        let [a, b, o] = omap.get_disjoint_mut([&self.a, &self.b, &self.o]);
        let a = a.map(|inner| &*inner);
        let b = b.map(|inner| &*inner);
        if let (Some(a), Some(b)) = (a, b) {
            let a_shape = match a.shape() {
                Some(s) => s.to_vec(),
                None => return,
            };
            let b_shape = match b.shape() {
                Some(s) => s.to_vec(),
                None => return,
            };
            let out_shape = product_shape(&a_shape, &b_shape);
            if let Some(o) = o {
                *o = TypedArray::empty_with_others_type(a, &out_shape);
            }
            if let Some(list) = &mut self.next_node {
                for next in list {
                    next.determine_output_shape(omap);
                }
            }
        }
    }
}
impl TypedArray {
pub fn matmul(&self, b: &TypedArray, o: &mut TypedArray) -> anyhow::Result<()> {
match (self, b) {
(TypedArray::Float(a_arr), TypedArray::Float(b_arr)) => {
let a_shape = a_arr.shape();
let b_shape = b_arr.shape();
let a_ndim = a_shape.len();
let b_ndim = b_shape.len();
match (a_ndim, b_ndim) {
(1, 1) => {
let k = a_shape[0];
assert_eq!(k, b_shape[0]);
let needs_alloc = match &*o {
TypedArray::Float(out) => out.shape() != [1],
_ => true,
};
if needs_alloc {
*o = TypedArray::Float(ArrayD::zeros(IxDyn(&[1]))).ensure_contiguous();
}
if let TypedArray::Float(out) = o {
let a_sl = a_arr.as_slice_memory_order().unwrap();
let b_sl = b_arr.as_slice_memory_order().unwrap();
let dst = out.as_slice_memory_order_mut().unwrap();
dst[0] = a_sl.iter().zip(b_sl.iter()).map(|(a, b)| a * b).sum();
}
}
(2, 1) => {
let m = a_shape[0];
let k = a_shape[1];
assert_eq!(k, b_shape[0]);
let out_shape = [m];
let needs_alloc = match &*o {
TypedArray::Float(out) => out.shape() != out_shape,
_ => true,
};
if needs_alloc {
*o = TypedArray::Float(ArrayD::zeros(IxDyn(&out_shape)))
.ensure_contiguous();
}
if let TypedArray::Float(out) = o {
let a_sl = a_arr.as_slice_memory_order().unwrap();
let b_sl = b_arr.as_slice_memory_order().unwrap();
let dst = out.as_slice_memory_order_mut().unwrap();
for i in 0..m {
let mut sum = 0.0f32;
for p in 0..k {
sum += a_sl[i * k + p] * b_sl[p];
}
dst[i] = sum;
}
}
}
(1, 2) => {
let k = a_shape[0];
let n = b_shape[1];
assert_eq!(k, b_shape[0]);
let out_shape = [n];
let needs_alloc = match &*o {
TypedArray::Float(out) => out.shape() != out_shape,
_ => true,
};
if needs_alloc {
*o = TypedArray::Float(ArrayD::zeros(IxDyn(&out_shape)))
.ensure_contiguous();
}
if let TypedArray::Float(out) = o {
let a_sl = a_arr.as_slice_memory_order().unwrap();
let b_sl = b_arr.as_slice_memory_order().unwrap();
let dst = out.as_slice_memory_order_mut().unwrap();
for j in 0..n {
let mut sum = 0.0f32;
for p in 0..k {
sum += a_sl[p] * b_sl[p * n + j];
}
dst[j] = sum;
}
}
}
(2, 2) => {
let m = a_shape[0];
let k = a_shape[1];
let n = b_shape[1];
assert_eq!(k, b_shape[0]);
let out_shape = [m, n];
let needs_alloc = match &*o {
TypedArray::Float(out) => out.shape() != out_shape,
_ => true,
};
if needs_alloc {
*o = TypedArray::Float(ArrayD::zeros(IxDyn(&out_shape)))
.ensure_contiguous();
}
if let TypedArray::Float(out) = o {
let a_sl = a_arr.as_slice_memory_order().unwrap();
let b_sl = b_arr.as_slice_memory_order().unwrap();
let dst = out.as_slice_memory_order_mut().unwrap();
sgemm_bias_parallel(m, n, k, a_sl, b_sl, None, dst, Activation::None);
}
}
_ => {
let m = a_shape[a_ndim - 2];
let k = a_shape[a_ndim - 1];
let n = b_shape[b_ndim - 1];
assert_eq!(k, b_shape[b_ndim - 2]);
let a_batch = &a_shape[..a_ndim - 2];
let b_batch = &b_shape[..b_ndim - 2];
let batch_rank = a_batch.len().max(b_batch.len());
let mut batch_shape = vec![0usize; batch_rank];
for i in 0..batch_rank {
let a_dim = if i < batch_rank - a_batch.len() {
1
} else {
a_batch[i - (batch_rank - a_batch.len())]
};
let b_dim = if i < batch_rank - b_batch.len() {
1
} else {
b_batch[i - (batch_rank - b_batch.len())]
};
batch_shape[i] = a_dim.max(b_dim);
}
let batch_size: usize = batch_shape.iter().product();
let mut out_shape = batch_shape.clone();
out_shape.push(m);
out_shape.push(n);
let needs_alloc = match &*o {
TypedArray::Float(out) => out.shape() != out_shape.as_slice(),
_ => true,
};
if needs_alloc {
*o = TypedArray::Float(ArrayD::zeros(IxDyn(&out_shape)))
.ensure_contiguous();
}
if let TypedArray::Float(out) = o {
let a_sl = a_arr.as_slice_memory_order().unwrap();
let b_sl = b_arr.as_slice_memory_order().unwrap();
let dst = out.as_slice_memory_order_mut().unwrap();
let a_mat_size = m * k;
let b_mat_size = k * n;
let o_mat_size = m * n;
let a_batch_size: usize = a_batch.iter().product::<usize>().max(1);
let b_batch_size: usize = b_batch.iter().product::<usize>().max(1);
for batch in 0..batch_size {
let a_batch_idx = batch % a_batch_size;
let b_batch_idx = batch % b_batch_size;
let a_offset = a_batch_idx * a_mat_size;
let b_offset = b_batch_idx * b_mat_size;
let o_offset = batch * o_mat_size;
sgemm_bias_parallel(
m,
n,
k,
&a_sl[a_offset..a_offset + a_mat_size],
&b_sl[b_offset..b_offset + b_mat_size],
None,
&mut dst[o_offset..o_offset + o_mat_size],
Activation::None,
);
}
}
}
}
Ok(())
}
_ => anyhow::bail!("MatMul: only F32 supported"),
}
}
}