weights
weights = weights - relu.input_cache.relu_der.tens_broadcast_mul(grad_output).matmul(linear.input_cache.transposed());
weights[i] = weights[i] - dot(relu.input_cache.relu_der.tens_broadcast_mul(grad_output).row(row), linear.input_cache.row(col));
var sum = 0.0;
for i in 0..relu_shapes.row{
sum += relu_der(relu.input_cache[x]) * grad_output[shape.row] * linear.input_cache[i.toShape().swapParams().toIdx];
}
output[i] = input[i] - sum;
shape[row, col] = i.toShape(weights_shape);
biases
output[i] = (sum over samples s in the batch of relu_der(relu.input_cache[i][s]) * grad_output[i][s]) * 1/shapes[1]   (shapes[1] = samples_per_batch)
grad
output[i] = weights^T.matmul(relu_der(relu.input_cache[x]) * grad_output) -> for(i in weights.row) sum += weights[i.col][i.row] * relu_der(relu.input_cache[i.row][i.col]) * grad_output[i.row][i.col]