#ifndef _ARRAY_ARITHMETIC_H_
#define _ARRAY_ARITHMETIC_H_
#if __APPLE__
#include <Accelerate/Accelerate.h>
#endif
#if __SSE__
#include <xmmintrin.h>
#elif __ARM_NEON__
#include <arm_neon.h>
#endif
class ArrayArithmetic {
public:
static inline void add(float *input0, float *input1, float *output, int startIndex, int endIndex) {
#if __APPLE__
vDSP_vadd(input0+startIndex, 1, input1+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
input0 += startIndex;
input1 += startIndex;
output += startIndex;
int n = endIndex - startIndex;
switch (startIndex & 0x3) {
case 0: default: break;
case 1: *output++ = *input0++ + *input1++; --n;
case 2: *output++ = *input0++ + *input1++; --n;
case 3: *output++ = *input0++ + *input1++; --n;
}
int n4 = n & 0xFFFFFFFC;
while (n4) {
_mm_store_ps(output, _mm_add_ps(_mm_load_ps(input0), _mm_load_ps(input1)));
n4 -= 4; input0 += 4; input1 += 4; output += 4;
}
switch (n & 0x3) {
case 3: *output++ = *input0++ + *input1++;
case 2: *output++ = *input0++ + *input1++;
case 1: *output++ = *input0++ + *input1++;
case 0: default: break;
}
#elif __ARM_NEON__
input0 += startIndex;
input1 += startIndex;
output += startIndex;
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFFC;
float32x4_t inVec0, inVec1, res;
while (n4) {
inVec0 = vld1q_f32((const float32_t *) input0);
inVec1 = vld1q_f32((const float32_t *) input1);
res = vaddq_f32(inVec0, inVec1);
vst1q_f32((float32_t *) output, res);
n4 -= 4;
input0 += 4;
input1 += 4;
output += 4;
}
switch (n & 0x3) {
case 3: *output++ = *input0++ + *input1++;
case 2: *output++ = *input0++ + *input1++;
case 1: *output++ = *input0++ + *input1++;
default: break;
}
#else
for (int i = startIndex; i < endIndex; i++) {
output[i] = input0[i] + input1[i];
}
#endif
}
static inline void add(float *input, float constant, float *output, int startIndex, int endIndex) {
#if __APPLE__
vDSP_vsadd(input+startIndex, 1, &constant, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
input += startIndex;
output += startIndex;
int n = endIndex - startIndex;
switch (startIndex & 0x3) {
case 0: default: break;
case 1: *output++ += constant; --n;
case 2: *output++ += constant; --n;
case 3: *output++ += constant; --n;
}
int n4 = n & 0xFFFFFFFC;
const __m128 constVec = _mm_set1_ps(constant);
while (n4) {
_mm_store_ps(output, _mm_add_ps(_mm_load_ps(input), constVec));
n4 -= 4; input += 4; output += 4;
}
switch (n & 0x3) {
case 3: *output++ += constant;
case 2: *output++ += constant;
case 1: *output++ += constant;
case 0: default: break;
}
#elif __ARM_NEON__
input += startIndex;
output += startIndex;
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFFC;
float32x4_t inVec, res;
while (n4) {
inVec = vld1q_f32((const float32_t *) input);
res = vaddq_f32(inVec, constant);
vst1q_f32((float32_t *) output, res);
n4 -= 4;
input += 4;
output += 4;
}
switch (n & 0x3) {
case 3: *output++ += constant;
case 2: *output++ += constant;
case 1: *output++ += constant;
default: break;
}
#else
for (int i = startIndex; i < endIndex; i++) {
output[i] = input[i] + constant;
}
#endif
}
static inline void subtract(float *input0, float *input1, float *output, int startIndex, int endIndex) {
#if __APPLE__
vDSP_vsub(input1+startIndex, 1, input0+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
input0 += startIndex;
input1 += startIndex;
output += startIndex;
int n = endIndex - startIndex;
switch (startIndex & 0x3) {
case 0: default: break;
case 1: *output++ = *input0++ - *input1++; --n;
case 2: *output++ = *input0++ - *input1++; --n;
case 3: *output++ = *input0++ - *input1++; --n;
}
int n4 = n & 0xFFFFFFFC;
while (n4) {
_mm_store_ps(output, _mm_sub_ps(_mm_load_ps(input0), _mm_load_ps(input1)));
n4 -= 4; input0 += 4; input1 += 4; output += 4;
}
switch (n & 0x3) {
case 3: *output++ = *input0++ - *input1++;
case 2: *output++ = *input0++ - *input1++;
case 1: *output++ = *input0++ - *input1++;
case 0: default: break;
}
#elif __ARM_NEON__
input0 += startIndex;
input1 += startIndex;
output += startIndex;
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFFC;
float32x4_t inVec0, inVec1, res;
while (n4) {
inVec0 = vld1q_f32((const float32_t *) input0);
inVec1 = vld1q_f32((const float32_t *) input1);
res = vsubq_f32(inVec0, inVec1);
vst1q_f32((float32_t *) output, res);
n4 -= 4;
input0 += 4;
input1 += 4;
output += 4;
}
switch (n & 0x3) {
case 3: *output++ = *input0++ - *input1++;
case 2: *output++ = *input0++ - *input1++;
case 1: *output++ = *input0++ - *input1++;
default: break;
}
#else
for (int i = startIndex; i < endIndex; i++) {
output[i] = input0[i] - input1[i];
}
#endif
}
static inline void subtract(float *input, float constant, float *output, int startIndex, int endIndex) {
#if __APPLE__
float negation = -1.0f * constant;
vDSP_vsadd(input+startIndex, 1, &negation, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
input += startIndex;
output += startIndex;
int n = endIndex - startIndex;
switch (startIndex & 0x3) {
case 0: default: break;
case 1: *output++ -= constant; --n;
case 2: *output++ -= constant; --n;
case 3: *output++ -= constant; --n;
}
int n4 = n & 0xFFFFFFFC;
const __m128 constVec = _mm_set1_ps(constant);
while (n4) {
_mm_store_ps(output, _mm_sub_ps(_mm_load_ps(input), constVec));
n4 -= 4; input += 4; output += 4;
}
switch (n & 0x3) {
case 3: *output++ -= constant;
case 2: *output++ -= constant;
case 1: *output++ -= constant;
case 0: default: break;
}
#elif __ARM_NEON__
input += startIndex;
output += startIndex;
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFFC;
float32x4_t inVec, res;
while (n4) {
inVec = vld1q_f32((const float32_t *) input);
res = vsubq_f32(inVec, constant);
vst1q_f32((float32_t *) output, res);
n4 -= 4;
input += 4;
output += 4;
}
switch (n & 0x3) {
case 3: *output++ -= constant;
case 2: *output++ -= constant;
case 1: *output++ -= constant;
default: break;
}
#else
for (int i = startIndex; i < endIndex; i++) {
output[i] = input[i] - constant;
}
#endif
}
static inline void multiply(float *input0, float *input1, float *output, int startIndex, int endIndex) {
#if __APPLE__
vDSP_vmul(input0+startIndex, 1, input1+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
input0 += startIndex;
input1 += startIndex;
output += startIndex;
int n = endIndex - startIndex;
switch (startIndex & 0x3) {
case 0: default: break;
case 1: *output++ = *input0++ * *input1++; --n;
case 2: *output++ = *input0++ * *input1++; --n;
case 3: *output++ = *input0++ * *input1++; --n;
}
int n4 = n & 0xFFFFFFFC;
while (n4) {
_mm_store_ps(output, _mm_mul_ps(_mm_load_ps(input0), _mm_load_ps(input1)));
n4 -= 4; input0 += 4; input1 += 4; output += 4;
}
switch (n & 0x3) {
case 3: *output++ = *input0++ * *input1++;
case 2: *output++ = *input0++ * *input1++;
case 1: *output++ = *input0++ * *input1++;
case 0: default: break;
}
#elif __ARM_NEON__
input0 += startIndex;
input1 += startIndex;
output += startIndex;
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFFC;
float32x4_t inVec0, inVec1, res;
while (n4) {
inVec0 = vld1q_f32((const float32_t *) input0); inVec1 = vld1q_f32((const float32_t *) input1); res = vmulq_f32(inVec0, inVec1); vst1q_f32((float32_t *) output, res); n4 -= 4;
input0 += 4;
input1 += 4;
output += 4;
}
switch (n & 0x3) {
case 3: *output++ = *input0++ * *input1++;
case 2: *output++ = *input0++ * *input1++;
case 1: *output++ = *input0++ * *input1++;
default: break;
}
#else
for (int i = startIndex; i < endIndex; i++) {
output[i] = input0[i] * input1[i];
}
#endif
}
static inline void multiply(float *input, float constant, float *output, int startIndex, int endIndex) {
#if __APPLE__
vDSP_vsmul(input+startIndex, 1, &constant, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
input += startIndex;
output += startIndex;
int n = endIndex - startIndex;
switch (startIndex & 0x3) {
case 0: default: break;
case 1: *output++ *= constant; --n;
case 2: *output++ *= constant; --n;
case 3: *output++ *= constant; --n;
}
int n4 = n & 0xFFFFFFFC;
const __m128 constVec = _mm_set1_ps(constant);
while (n4) {
_mm_store_ps(output, _mm_mul_ps(_mm_load_ps(input), constVec));
n4 -= 4; input += 4; output += 4;
}
switch (n & 0x3) {
case 3: *output++ *= constant;
case 2: *output++ *= constant;
case 1: *output++ *= constant;
case 0: default: break;
}
#elif __ARM_NEON__
input += startIndex;
output += startIndex;
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFFC;
float32x4_t inVec, res;
while (n4) {
inVec = vld1q_f32((const float32_t *) input);
res = vmulq_n_f32(inVec, constant);
vst1q_f32((float32_t *) output, res);
n4 -= 4;
input += 4;
output += 4;
}
switch (n & 0x3) {
case 3: *output++ *= constant;
case 2: *output++ *= constant;
case 1: *output++ *= constant;
default: break;
}
#else
for (int i = startIndex; i < endIndex; i++) {
output[i] = input[i] * constant;
}
#endif
}
static inline void divide(float *input0, float *input1, float *output, int startIndex, int endIndex) {
#if __APPLE__
vDSP_vdiv(input1+startIndex, 1, input0+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
input0 += startIndex;
input1 += startIndex;
output += startIndex;
int n = endIndex - startIndex;
switch (startIndex & 0x3) {
case 0: default: break;
case 1: *output++ = *input0++ / *input1++; --n;
case 2: *output++ = *input0++ / *input1++; --n;
case 3: *output++ = *input0++ / *input1++; --n;
}
int n4 = n & 0xFFFFFFFC;
while (n4) {
_mm_store_ps(output, _mm_div_ps(_mm_load_ps(input0), _mm_load_ps(input1)));
n4 -= 4; input0 += 4; input1 += 4; output += 4;
}
switch (n & 0x3) {
case 3: *output++ = *input0++ / *input1++;
case 2: *output++ = *input0++ / *input1++;
case 1: *output++ = *input0++ / *input1++;
case 0: default: break;
}
#else
for (int i = startIndex; i < endIndex; i++) {
output[i] = input0[i] / input1[i];
}
#endif
}
static inline void divide(float *input, float constant, float *output, int startIndex, int endIndex) {
#if __APPLE__
vDSP_vsdiv(input+startIndex, 1, &constant, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
input += startIndex;
output += startIndex;
int n = endIndex - startIndex;
switch (startIndex & 0x3) {
case 0: default: break;
case 1: *output++ /= constant; --n;
case 2: *output++ /= constant; --n;
case 3: *output++ /= constant; --n;
}
int n4 = n & 0xFFFFFFFC;
const __m128 constVec = _mm_set1_ps(constant);
while (n4) {
_mm_store_ps(output, _mm_div_ps(_mm_load_ps(input), constVec));
n4 -= 4; input += 4; output += 4;
}
switch (n & 0x3) {
case 3: *output++ /= constant;
case 2: *output++ /= constant;
case 1: *output++ /= constant;
case 0: default: break;
}
#else
for (int i = startIndex; i < endIndex; i++) {
output[i] = input[i] / constant;
}
#endif
}
static inline void fill(float *input, float constant, int startIndex, int endIndex) {
#if __APPLE__
vDSP_vfill(&constant, input+startIndex, 1, endIndex-startIndex);
#elif __SSE__
input += startIndex;
int n = endIndex - startIndex;
switch (startIndex & 0x3) {
case 0: default: break;
case 1: *input++ = constant; --n;
case 2: *input++ = constant; --n;
case 3: *input++ = constant; --n;
}
int n4 = n & 0xFFFFFFFC; const __m128 constVec = _mm_set1_ps(constant);
while (n4) {
_mm_store_ps(input, constVec);
n4 -= 4; input += 4;
}
switch (n & 0x3) {
case 3: *input++ = constant;
case 2: *input++ = constant;
case 1: *input++ = constant;
case 0: default: break;
}
#elif __ARM_NEON__
input += startIndex;
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFFC; float32x4_t constVec = vdupq_n_f32(constant);
while (n4) {
vst1q_f32((float32_t *) input, constVec);
n4 -= 4;
input += 4;
}
switch (n & 0x3) {
case 3: *input++ = constant;
case 2: *input++ = constant;
case 1: *input++ = constant;
default: break;
}
#else
for (int i = startIndex; i < endIndex; i++) {
input[i] = constant;
}
#endif
}
private:
ArrayArithmetic(); ~ArrayArithmetic();
};
#endif