#pragma once
#include "megdnn/arch.h"
#include "src/common/unroll_macro.h"
#if MGB_ENABLE_DOT
#if defined(__ARM_FEATURE_DOTPROD)
#undef __ARM_FEATURE_DOTPROD
#endif
#define __ARM_FEATURE_DOTPROD 1
#endif
#include <arm_neon.h>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wattributes"
#define __ai \
static inline __attribute__((__gnu_inline__, __always_inline__, __nodebug__))
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !MEGDNN_DISABLE_FLOAT16
#define MEGDNN_INC_ARM_FP16(_x) _x
#else
#define MEGDNN_INC_ARM_FP16(_x)
#endif
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#ifdef __LITTLE_ENDIAN__
__ai float16x8_t vmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
float16x8_t __ret;
__ret = __p0 + __p1 * __p2;
return __ret;
}
#else
__ai float16x8_t vmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
float16x8_t __rev0;
__rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
float16x8_t __rev1;
__rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
float16x8_t __rev2;
__rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
float16x8_t __ret;
__ret = __rev0 + __rev1 * __rev2;
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
return __ret;
}
#endif
#ifdef __LITTLE_ENDIAN__
#define vmlaq_lane_f16(__p0, __p1, __p2, __p3) \
__extension__({ \
float16x8_t __s0 = __p0; \
float16x8_t __s1 = __p1; \
float16x4_t __s2 = __p2; \
float16x8_t __ret; \
__ret = __s0 + __s1 * __builtin_shufflevector( \
__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, \
__p3, __p3); \
__ret; \
})
#else
#define vmlaq_lane_f16(__p0, __p1, __p2, __p3) \
__extension__({ \
float16x8_t __s0 = __p0; \
float16x8_t __s1 = __p1; \
float16x4_t __s2 = __p2; \
float16x8_t __rev0; \
__rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x8_t __rev1; \
__rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x4_t __rev2; \
__rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
float16x8_t __ret; \
__ret = __rev0 + __rev1 * __builtin_shufflevector( \
__rev2, __rev2, __p3, __p3, __p3, __p3, \
__p3, __p3, __p3, __p3); \
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
__ret; \
})
#endif
#if 0#endif
#ifdef __LITTLE_ENDIAN__
#define vmlaq_laneq_f16(__p0, __p1, __p2, __p3) \
__extension__({ \
float16x8_t __s0 = __p0; \
float16x8_t __s1 = __p1; \
float16x8_t __s2 = __p2; \
float16x8_t __ret; \
__ret = __s0 + __s1 * __builtin_shufflevector( \
__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, \
__p3, __p3); \
__ret; \
})
#else
#define vmlaq_laneq_f16(__p0, __p1, __p2, __p3) \
__extension__({ \
float16x8_t __s0 = __p0; \
float16x8_t __s1 = __p1; \
float16x8_t __s2 = __p2; \
float16x8_t __rev0; \
__rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x8_t __rev1; \
__rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x8_t __rev2; \
__rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
float16x8_t __ret; \
__ret = __rev0 + __rev1 * __builtin_shufflevector( \
__rev2, __rev2, __p3, __p3, __p3, __p3, \
__p3, __p3, __p3, __p3); \
__ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
__ret; \
})
#endif
#if MEGDNN_ARMV7
#define vmlaq_low_lane_f16(__a, __b, __v, __lane) \
__extension__({ \
auto c = vget_low_f16(__v); \
auto __ret = vmlaq_lane_f16(__a, __b, c, __lane); \
__ret; \
})
#define vmlaq_high_lane_f16(__a, __b, __v, __lane) \
__extension__({ \
auto c = vget_high_f16(__v); \
auto __ret = vmlaq_lane_f16(__a, __b, c, (__lane - 4)); \
__ret; \
})
__ai float16x8_t vmulq_fix_f16(float16x8_t a, float16x8_t b) {
float16x8_t ret;
asm volatile("vmul.f16 %0, %1, %2\n" : "+w"(ret) : "w"(a), "w"(b));
return ret;
}
__ai float16x8_t vmulq_n_fix_f16(float16x8_t a, __fp16 b) {
float16x8_t ret;
asm volatile(
"vdup.16 q0, %2 \n"
"vmul.f16 %0, %1, q0\n"
: "+w"(ret)
: "w"(a), "r"(b)
: "q0");
return ret;
}
__ai float16x4_t vmul_n_fix_f16(float16x4_t a, __fp16 b) {
float16x4_t ret;
asm volatile(
"vdup.16 d0,%2\n"
"vmul.f16 %0, %1, d0[0]\n"
: "+w"(ret)
: "w"(a), "r"(b)
: "d0");
return ret;
}
__ai float16x8_t vmlaq_fix_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
asm volatile("vmla.f16 %0, %1, %2\n" : "+w"(a) : "w"(b), "w"(c));
return a;
}
__ai float16x8_t vaddq_fix_f16(float16x8_t a, float16x8_t b) {
float16x8_t ret;
asm volatile("vadd.f16 %0, %1, %2\n" : "+w"(ret) : "w"(a), "w"(b));
return ret;
}
#undef vdupq_n_f16
__ai float16x8_t vdupq_n_f16(__fp16 a) {
float16x8_t ret;
asm volatile("vdup.16 %0, %1\n" : "+w"(ret) : "r"(a) :);
return ret;
}
#elif MEGDNN_AARCH64
#define vmlaq_low_lane_f16(__a, __b, __v, __lane) vmlaq_laneq_f16(__a, __b, __v, __lane)
#define vmlaq_high_lane_f16(__a, __b, __v, __lane) \
vmlaq_laneq_f16(__a, __b, __v, __lane)
__ai float16x8_t vmulq_fix_f16(float16x8_t a, float16x8_t b) {
return vmulq_f16(a, b);
}
__ai float16x8_t vmlaq_fix_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vmlaq_f16(a, b, c);
}
__ai float16x8_t vaddq_fix_f16(float16x8_t a, float16x8_t b) {
return vaddq_f16(a, b);
}
#undef vdupq_n_f16
__ai float16x8_t vdupq_n_f16(__fp16 a) {
float16x8_t ret;
asm volatile("dup %0.8h, %w1\n" : "+w"(ret) : "r"(a) :);
return ret;
}
#endif
#endif
#if MGB_ENABLE_DOT
MEGDNN_ATTRIBUTE_TARGET("dotprod")
__ai int32x4_t vdotq2_s32(int8x16_t a, int8x16_t b) {
int32x4_t c = vdupq_n_s32(0);
return vdotq_s32(c, a, b);
}
MEGDNN_ATTRIBUTE_TARGET("dotprod")
__ai uint32x4_t vdotq2_u32(uint8x16_t a, uint8x16_t b) {
uint32x4_t c = vdupq_n_u32(0);
return vdotq_u32(c, a, b);
}
#define vdotq2_lane_s32(a, b, lane) \
__extension__({ \
int32x4_t c = vdupq_n_s32(0); \
c = vdotq_lane_s32(c, a, b, lane); \
c; \
})
#define vdotq2_lane_u32(a, b, lane) \
__extension__({ \
uint32x4_t c = vdupq_n_u32(0); \
c = vdotq_lane_u32(c, a, b, lane); \
c; \
})
MEGDNN_ATTRIBUTE_TARGET("dotprod")
__ai int32x2_t vdot2_s32(int8x8_t a, int8x8_t b) {
int32x2_t c = vdup_n_s32(0);
return vdot_s32(c, a, b);
}
MEGDNN_ATTRIBUTE_TARGET("dotprod")
__ai uint32x2_t vdot2_u8(uint8x8_t a, uint8x8_t b) {
uint32x2_t c = vdup_n_u32(0);
return vdot_u32(c, a, b);
}
#define vdot2_lane_s32(a, b, lane) \
__extension__({ \
int32x2_t c = vdup_n_s32(0); \
c = vdot_lane_s32(c, a, b, lane); \
c; \
})
#define vdot2_lane_u8(a, b, lane) \
__extension__({ \
uint32x2_t c = vdup_n_u32(0); \
c = vdot_lane_u32(c, a, b, lane); \
c; \
})
#endif
#if __GNUC__ < 8
#undef vld1q_f32_x2
__ai float32x4x2_t vld1q_f32_x2(const float* p) {
return {{vld1q_f32(p), vld1q_f32(p + 4)}};
}
#endif
#if __GNUC__ < 9
#undef vst1q_f32_x2
__ai void vst1q_f32_x2(const float* p, float32x4x2_t v) {
vst1q_f32(const_cast<float*>(p), v.val[0]);
vst1q_f32(const_cast<float*>(p) + 4, v.val[1]);
}
#endif
#if !defined(vld1q_u32_x2) && (__GNUC__ < 8 || (__GNUC__ == 8 && __GNUC_MINOR__ < 3))
__ai uint32x4x2_t vld1q_u32_x2(const uint32_t* p) {
return {{vld1q_u32(p), vld1q_u32(p + 4)}};
}
#endif
__ai int8x16_t vtranslq_s8(int8x8_t a) {
int8x16_t ret;
#if MEGDNN_AARCH64
asm volatile("ins %0.d[0], %1.d[0]\n" : "+w"(ret) : "w"(a) :);
#else
asm volatile("vmov %e0, %P1\n" : "+w"(ret) : "w"(a) :);
#endif
return ret;
}
__ai uint8x16_t vtranslq_u8(uint8x8_t a) {
uint8x16_t ret;
#if MEGDNN_AARCH64
asm volatile("ins %0.d[0], %1.d[0]\n" : "+w"(ret) : "w"(a) :);
#else
asm volatile("vmov %e0, %P1\n" : "+w"(ret) : "w"(a) :);
#endif
return ret;
}
#ifdef MEGDNN_TEGRA_X1
#define vset_lane_s16_fix_tx1(__elem, __vec, __index) \
{ asm volatile("ins %0.h[" #__index "], %w1\n" : "+w"(__vec) : "r"(__elem) :); }
#else
#define vset_lane_s16_fix_tx1(__elem, __vec, __index) \
__vec = vset_lane_s16(__elem, __vec, __index)
#endif
#if MEGDNN_ARMV7
__ai int32_t vaddlvq_s16(int16x8_t __p0) {
int32_t __ret = 0;
auto sum = vpaddlq_s16(__p0);
__ret +=
(vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) +
vgetq_lane_s32(sum, 3));
return __ret;
}
__ai int16x8_t vmlal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
int16x8_t __ret;
__ret = vmlal_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
return __ret;
}
__ai int16x8_t vmull_high_s8(int8x16_t __p0, int8x16_t __p1) {
int16x8_t __ret;
__ret = vmull_s8(vget_high_s8(__p0), vget_high_s8(__p1));
return __ret;
}
__ai int16x8_t vmovl_high_s8(int8x16_t __p0) {
return vmovl_s8(vget_high_s8(__p0));
}
__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0) {
return vmovl_u8(vget_high_u8(__p0));
}
__ai int32x4_t vmovl_high_s16(int16x8_t __p0) {
return vmovl_s16(vget_high_s16(__p0));
}
__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0) {
return vmovl_u16(vget_high_u16(__p0));
}
__ai int64x2_t vmovl_high_s32(int32x4_t __p0) {
return vmovl_s32(vget_high_s32(__p0));
}
__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0) {
return vmovl_u32(vget_high_u32(__p0));
}
__ai int64x2_t vzip1q_s64(int64x2_t& a, int64x2_t& b) {
return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
}
__ai int64x2_t vzip2q_s64(int64x2_t& a, int64x2_t& b) {
return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
}
__ai int32_t vaddv_s32(int32x2_t a) {
return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
}
__ai int32_t vaddvq_s32(int32x4_t a) {
return vgetq_lane_s32(a, 0) + vgetq_lane_s32(a, 1) + vgetq_lane_s32(a, 2) +
vgetq_lane_s32(a, 3);
}
__ai float32_t vaddvq_f32(float32x4_t a) {
return vgetq_lane_f32(a, 0) + vgetq_lane_f32(a, 1) + vgetq_lane_f32(a, 2) +
vgetq_lane_f32(a, 3);
}
#endif
__ai int16x8_t vmovl_low_s8(int8x16_t __p0) {
return vmovl_s8(vget_low_s8(__p0));
}
__ai uint16x8_t vmovl_low_u8(uint8x16_t __p0) {
return vmovl_u8(vget_low_u8(__p0));
}
__ai int32x4_t vmovl_low_s16(int16x8_t __p0) {
return vmovl_s16(vget_low_s16(__p0));
}
__ai uint32x4_t vmovl_low_u16(uint16x8_t __p0) {
return vmovl_u16(vget_low_u16(__p0));
}
__ai int64x2_t vmovl_low_s32(int32x4_t __p0) {
return vmovl_s32(vget_low_s32(__p0));
}
__ai uint64x2_t vmovl_low_u32(uint32x4_t __p0) {
return vmovl_u32(vget_low_u32(__p0));
}
#if MEGDNN_ARMV7
#define vmlaq_low_lane_f32(__a, __b, __v, __lane) \
__extension__({ \
auto c = vget_low_f32(__v); \
auto __ret = vmlaq_lane_f32(__a, __b, c, __lane); \
__ret; \
})
#define vmlaq_high_lane_f32(__a, __b, __v, __lane) \
__extension__({ \
auto c = vget_high_f32(__v); \
auto __ret = vmlaq_lane_f32(__a, __b, c, (__lane - 2)); \
__ret; \
})
#elif MEGDNN_AARCH64
__ai float64x2_t vbitq_f64(float64x2_t dst, float64x2_t v1, uint64x2_t mask) {
asm volatile("bit %0.16b, %1.16b, %2.16b\n" : "+w"(dst) : "w"(v1), "w"(mask) :);
return dst;
}
#define vmlaq_low_lane_f32(__a, __b, __v, __lane) vmlaq_laneq_f32(__a, __b, __v, __lane)
#define vmlaq_high_lane_f32(__a, __b, __v, __lane) \
vmlaq_laneq_f32(__a, __b, __v, __lane)
#endif
#if MEGDNN_ARMV7
__ai int8x16_t vqtbl1q_s8(int8x16_t& a, uint8x16_t& idx) {
int8x8_t src_low = vget_low_s8(a);
int8x8_t src_high = vget_high_s8(a);
return vcombine_s8(
vtbl2_s8({src_low, src_high}, vget_low_s8(vreinterpretq_s8_u8(idx))),
vtbl2_s8({src_low, src_high}, vget_high_s8(vreinterpretq_s8_u8(idx))));
}
namespace {
template <int lane>
struct Vdup_laneq_s16_armv7 {
__ai int16x4_t impl(int16x8_t vec);
};
#define cb(step) \
template <> \
struct Vdup_laneq_s16_armv7<step + 4> { \
__ai int16x4_t impl(int16x8_t vec) { \
return vdup_lane_s16(vget_high_s16(vec), step); \
} \
}; \
template <> \
struct Vdup_laneq_s16_armv7<step> { \
__ai int16x4_t impl(int16x8_t vec) { \
return vdup_lane_s16(vget_low_s16(vec), step); \
} \
};
UNROLL_CALL_RAW(4, cb);
#undef cb
} #define vdup_laneq_s16(vec, lane) Vdup_laneq_s16_armv7<lane>::impl(vec)
namespace {
template <int lane>
struct Vfmaq_laneq_f32_armv7 {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v);
};
template <>
struct Vfmaq_laneq_f32_armv7<0> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlaq_lane_f32(a, b, vget_low_f32(v), 0);
}
};
template <>
struct Vfmaq_laneq_f32_armv7<1> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlaq_lane_f32(a, b, vget_low_f32(v), 1);
}
};
template <>
struct Vfmaq_laneq_f32_armv7<2> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlaq_lane_f32(a, b, vget_high_f32(v), 0);
}
};
template <>
struct Vfmaq_laneq_f32_armv7<3> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlaq_lane_f32(a, b, vget_high_f32(v), 1);
}
};
template <int lane>
struct Vfmsq_laneq_f32_armv7 {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v);
};
template <>
struct Vfmsq_laneq_f32_armv7<0> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlsq_lane_f32(a, b, vget_low_f32(v), 0);
}
};
template <>
struct Vfmsq_laneq_f32_armv7<1> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlsq_lane_f32(a, b, vget_low_f32(v), 1);
}
};
template <>
struct Vfmsq_laneq_f32_armv7<2> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlsq_lane_f32(a, b, vget_high_f32(v), 0);
}
};
template <>
struct Vfmsq_laneq_f32_armv7<3> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlsq_lane_f32(a, b, vget_high_f32(v), 1);
}
};
} #define vfmaq_laneq_f32(a, b, v, lane) Vfmaq_laneq_f32_armv7<lane>::impl(a, b, v)
#define vfmsq_laneq_f32(a, b, v, lane) Vfmsq_laneq_f32_armv7<lane>::impl(a, b, v)
#if MGB_ENABLE_DOT
namespace {
template <int lane>
struct Vdotq_laneq_s32_armv7 {
__ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v);
};
template <>
struct Vdotq_laneq_s32_armv7<0> {
MEGDNN_ATTRIBUTE_TARGET("dotprod")
__ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
return vdotq_lane_s32(a, b, vget_low_s32(v), 0);
}
};
template <>
struct Vdotq_laneq_s32_armv7<1> {
MEGDNN_ATTRIBUTE_TARGET("dotprod")
__ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
return vdotq_lane_s32(a, b, vget_low_s32(v), 1);
}
};
template <>
struct Vdotq_laneq_s32_armv7<2> {
MEGDNN_ATTRIBUTE_TARGET("dotprod")
__ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
return vdotq_lane_s32(a, b, vget_high_s32(v), 0);
}
};
template <>
struct Vdotq_laneq_s32_armv7<3> {
MEGDNN_ATTRIBUTE_TARGET("dotprod")
__ai int32x4_t impl(int32x4_t a, int8x16_t b, int8x16_t v) {
return vdotq_lane_s32(a, b, vget_high_f32(v), 1);
}
};
#define vdotq_laneq_s32(a, b, v, lane) Vdotq_laneq_s32_armv7<lane>::impl(a, b, v)
} #endif
#endif
#if MEGDNN_AARCH64
namespace {
template <int lane>
struct Vfmaq_laneq_f32_armv8 {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v);
};
template <>
struct Vfmaq_laneq_f32_armv8<0> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
asm volatile("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w"(a) : "w"(b), "w"(v) :);
return a;
}
};
template <>
struct Vfmaq_laneq_f32_armv8<1> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
asm volatile("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w"(a) : "w"(b), "w"(v) :);
return a;
}
};
template <>
struct Vfmaq_laneq_f32_armv8<2> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
asm volatile("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w"(a) : "w"(b), "w"(v) :);
return a;
}
};
template <>
struct Vfmaq_laneq_f32_armv8<3> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
asm volatile("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w"(a) : "w"(b), "w"(v) :);
return a;
}
};
template <int lane>
struct Vfmsq_laneq_f32_armv8 {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v);
};
template <>
struct Vfmsq_laneq_f32_armv8<0> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
asm volatile("fmls %0.4s, %1.4s, %2.s[0]\n" : "+w"(a) : "w"(b), "w"(v) :);
return a;
}
};
template <>
struct Vfmsq_laneq_f32_armv8<1> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
asm volatile("fmls %0.4s, %1.4s, %2.s[1]\n" : "+w"(a) : "w"(b), "w"(v) :);
return a;
}
};
template <>
struct Vfmsq_laneq_f32_armv8<2> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
asm volatile("fmls %0.4s, %1.4s, %2.s[2]\n" : "+w"(a) : "w"(b), "w"(v) :);
return a;
}
};
template <>
struct Vfmsq_laneq_f32_armv8<3> {
__ai float32x4_t impl(float32x4_t a, float32x4_t b, float32x4_t v) {
asm volatile("fmls %0.4s, %1.4s, %2.s[3]\n" : "+w"(a) : "w"(b), "w"(v) :);
return a;
}
};
} #undef vfmaq_laneq_f32
#define vfmaq_laneq_f32(a, b, v, lane) Vfmaq_laneq_f32_armv8<lane>::impl(a, b, v)
#undef vfmsq_laneq_f32
#define vfmsq_laneq_f32(a, b, v, lane) Vfmsq_laneq_f32_armv8<lane>::impl(a, b, v)
#endif
__ai int8x16_t vld_dup_tbl_s32(const int8_t* ptr, uint8x16_t& idx) {
int8x16_t result = vreinterpretq_s8_s32(vld1q_dup_s32((const int32_t*)ptr));
result = vqtbl1q_s8(result, idx);
return result;
}
__ai int8x16_t vldq_tbl_s8(const int8_t* ptr, uint8x16_t& idx) {
int8x16_t result = vld1q_s8(ptr);
result = vqtbl1q_s8(result, idx);
return result;
}
__ai int32x4_t vdotq_s32_h(int8x16_t& a, int8x16_t& b, int32x4_t& c, int16x8_t& temp) {
temp = vmull_s8(vget_low_s8(a), vget_low_s8(b));
temp = vmlal_high_s8(temp, a, b);
c = vpadalq_s16(c, temp);
return c;
}
__ai int32x4_t vdot2_s32_h(int8x8_t& a, int8x8_t& b, int32x4_t& c, int16x8_t& temp) {
temp = vmull_s8(a, b);
c = vpadalq_s16(c, temp);
return c;
}
__ai int32x4_t vmlal_s16(int32x4_t& a, int16x8_t& b, int16x8_t& c) {
return vmlal_s16(a, vget_low_s16(b), vget_low_s16(c));
}
__ai int16x8_t vldq_dup_4s8_8s16(const int8_t* ptr) {
return vmovl_s8(
vreinterpret_s8_s32(vld1_dup_s32(reinterpret_cast<const int32_t*>(ptr))));
}
__ai int8x8_t vldq_tbl_low_s8(const int8_t* ptr, uint8x16_t idx) {
return vget_low_s8(vldq_tbl_s8(ptr, idx));
}
__ai int16x8_t vld1_dup_s8_s16(const int8_t* ptr) {
return vmovl_s8(vld1_dup_s8(ptr));
}
__ai float32x4_t Vfmsq_f32(float32x4_t& a, float32x4_t& b, float32x4_t& v) {
asm volatile("fmls %0.4s, %1.4s, %2.4s\n" : "+w"(a) : "w"(b), "w"(v) :);
return a;
}
#if MGB_ENABLE_DOT
#undef __ARM_FEATURE_DOTPROD
#endif
#undef __ai
#pragma GCC diagnostic pop