megenginelite-sys 1.8.2

/**
 * \file dnn/src/x86/add_update/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "./opr_impl.h"
#include "src/fallback/add_update/opr_impl.h"

#include "src/common/utils.h"
#include "src/x86/handle.h"
#include "src/x86/utils.h"

#include <immintrin.h>
#ifdef WIN32
#include <avxintrin.h>
#include <fmaintrin.h>
#endif

namespace {

using namespace megdnn;
using namespace x86;

MEGDNN_ATTRIBUTE_TARGET("fma")
void add_update_fp32_fma(
        _megdnn_tensor_inout dest, _megdnn_tensor_in delta,
        const AddUpdate::Param& param) {
    dt_float32 alpha(param.alpha), beta(param.beta), bias(param.bias);
    __m256 packed_alpha(_mm256_set1_ps(alpha));
    __m256 packed_beta(_mm256_set1_ps(beta));
    __m256 packed_bias(_mm256_set1_ps(bias));

    dt_float32* dest_ptr = dest.ptr<dt_float32>();
    dt_float32* delta_ptr = delta.ptr<dt_float32>();
    size_t i = 0;
    size_t total_nr_elems = dest.layout.total_nr_elems();
    __m256 x0, b0;
    for (i = 0; i + 7 < total_nr_elems; i += 8) {
        b0 = _mm256_loadu_ps(delta_ptr + i);
        x0 = _mm256_loadu_ps(dest_ptr + i);
        b0 = _mm256_fmadd_ps(packed_beta, b0, packed_bias);
        x0 = _mm256_fmadd_ps(packed_alpha, x0, b0);
        _mm256_storeu_ps(dest_ptr + i, x0);
    }
    for (; i < total_nr_elems; i++) {
        dest_ptr[i] = alpha * dest_ptr[i] + beta * delta_ptr[i] + bias;
    }
}

}  // anonymous namespace

namespace megdnn {
namespace x86 {

void AddUpdateImpl::exec(_megdnn_tensor_inout dest, _megdnn_tensor_in delta) {
    check_exec(dest.layout, delta.layout);
    // eq_shape is the same as eq_layout when both input tensors are contiguous.
    if (is_supported(SIMDType::FMA) && delta.layout.is_contiguous() &&
        dest.layout.is_contiguous() && delta.layout.eq_shape(dest.layout) &&
        dest.layout.dtype == delta.layout.dtype) {
        if (dest.layout.dtype == ::megdnn::dtype::Float32()) {
            auto param = m_param;
            MEGDNN_DISPATCH_CPU_KERN_OPR(add_update_fp32_fma(dest, delta, param));
            return;
        }
    }
    return megdnn::fallback::AddUpdateImpl::exec(dest, delta);
}

}  // namespace x86
}  // namespace megdnn
// vim: syntax=cpp.doxygen