#include "./opr_impl.h"
#include "src/fallback/add_update/opr_impl.h"
#include "src/common/utils.h"
#include "src/x86/handle.h"
#include "src/x86/utils.h"
#include <immintrin.h>
#ifdef WIN32
#include <avxintrin.h>
#include <fmaintrin.h>
#endif
namespace {
using namespace megdnn;
using namespace x86;
/*!
 * \brief In-place fp32 update: dest = alpha * dest + beta * delta + bias.
 *
 * Elements are processed eight at a time with 256-bit fused multiply-add
 * instructions; whatever does not fill a full vector is finished by a scalar
 * loop. Unaligned loads/stores are used, so no alignment is required of the
 * buffers.
 *
 * The element count is taken from \p dest only and both buffers are indexed
 * linearly, so the caller must pass contiguous tensors where \p delta holds
 * at least as many elements as \p dest.
 *
 * \param dest  tensor that is read and overwritten with the result
 * \param delta tensor that is only read
 * \param param supplies the alpha, beta and bias coefficients
 */
MEGDNN_ATTRIBUTE_TARGET("fma")
void add_update_fp32_fma(
        _megdnn_tensor_inout dest, _megdnn_tensor_in delta,
        const AddUpdate::Param& param) {
    // Number of fp32 values held by one __m256 register.
    constexpr size_t kLanes = 8;

    const dt_float32 alpha = static_cast<dt_float32>(param.alpha);
    const dt_float32 beta = static_cast<dt_float32>(param.beta);
    const dt_float32 bias = static_cast<dt_float32>(param.bias);

    // Broadcast each coefficient across all lanes once, outside the loop.
    const __m256 alpha_v = _mm256_set1_ps(alpha);
    const __m256 beta_v = _mm256_set1_ps(beta);
    const __m256 bias_v = _mm256_set1_ps(bias);

    dt_float32* const dst = dest.ptr<dt_float32>();
    const dt_float32* const src = delta.ptr<dt_float32>();

    const size_t nr_elems = dest.layout.total_nr_elems();
    // Largest multiple of kLanes that does not exceed nr_elems.
    const size_t vec_end = nr_elems - nr_elems % kLanes;

    size_t idx = 0;
    while (idx < vec_end) {
        const __m256 delta_v = _mm256_loadu_ps(src + idx);
        const __m256 dest_v = _mm256_loadu_ps(dst + idx);
        // beta * delta + bias
        const __m256 scaled_delta = _mm256_fmadd_ps(beta_v, delta_v, bias_v);
        // alpha * dest + (beta * delta + bias)
        const __m256 updated = _mm256_fmadd_ps(alpha_v, dest_v, scaled_delta);
        _mm256_storeu_ps(dst + idx, updated);
        idx += kLanes;
    }

    // Scalar tail for the remaining (fewer than kLanes) elements.
    while (idx < nr_elems) {
        dst[idx] = alpha * dst[idx] + beta * src[idx] + bias;
        ++idx;
    }
}
}
namespace megdnn {
namespace x86 {
/*!
 * \brief Run AddUpdate, using the FMA fp32 kernel when it applies.
 *
 * The vectorised kernel is chosen only if the CPU reports FMA support, both
 * tensors are contiguous, they have equal shape and dtype, and that dtype is
 * Float32. Every other case is forwarded to the fallback implementation.
 *
 * \param dest  tensor updated in place
 * \param delta tensor combined into \p dest
 */
void AddUpdateImpl::exec(_megdnn_tensor_inout dest, _megdnn_tensor_in delta) {
    check_exec(dest.layout, delta.layout);

    auto& dst_layout = dest.layout;
    auto& delta_layout = delta.layout;

    // Conditions are evaluated left to right with short-circuiting.
    const bool use_fma_kernel =
            is_supported(SIMDType::FMA) && delta_layout.is_contiguous() &&
            dst_layout.is_contiguous() && delta_layout.eq_shape(dst_layout) &&
            dst_layout.dtype == delta_layout.dtype &&
            dst_layout.dtype == ::megdnn::dtype::Float32();

    if (!use_fma_kernel) {
        megdnn::fallback::AddUpdateImpl::exec(dest, delta);
        return;
    }

    // Local snapshot of the parameters handed to the dispatched kernel.
    // NOTE(review): presumably the dispatch macro may run the statement
    // later, which is why a copy is used instead of m_param -- confirm
    // against the macro definition.
    auto kern_param = m_param;
    MEGDNN_DISPATCH_CPU_KERN_OPR(add_update_fp32_fma(dest, delta, kern_param));
}
} }