#ifndef _ARRAY_ARITHMETIC_H_
#define _ARRAY_ARITHMETIC_H_
#if __APPLE__
#include <Accelerate/Accelerate.h>
#endif
#if __SSE__
#include <xmmintrin.h>
#elif __ARM_NEON__
#include <arm_neon.h>
#endif
class ArrayArithmetic {
  
  public:
  
    static inline void add(float *input0, float *input1, float *output, int startIndex, int endIndex) {
      #if __APPLE__
      vDSP_vadd(input0+startIndex, 1, input1+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
      #elif __SSE__
      input0 += startIndex;
      input1 += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      
            switch (startIndex & 0x3) {
        case 0: default: break;
        case 1: *output++ = *input0++ + *input1++; --n;
        case 2: *output++ = *input0++ + *input1++; --n;
        case 3: *output++ = *input0++ + *input1++; --n;
      }
      
      int n4 = n & 0xFFFFFFFC;
      while (n4) {
        _mm_store_ps(output, _mm_add_ps(_mm_load_ps(input0), _mm_load_ps(input1)));
        n4 -= 4; input0 += 4; input1 += 4; output += 4;
      }
      
      switch (n & 0x3) {
        case 3: *output++ = *input0++ + *input1++;
        case 2: *output++ = *input0++ + *input1++;
        case 1: *output++ = *input0++ + *input1++;
        case 0: default: break;
      }
      #elif __ARM_NEON__
      input0 += startIndex;
      input1 += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      int n4 = n & 0xFFFFFFFC;
      float32x4_t inVec0, inVec1, res;
      while (n4) {
        inVec0 = vld1q_f32((const float32_t *) input0);
        inVec1 = vld1q_f32((const float32_t *) input1);
        res = vaddq_f32(inVec0, inVec1);
        vst1q_f32((float32_t *) output, res);
        n4 -= 4;
        input0 += 4;
        input1 += 4;
        output += 4;
      }
      switch (n & 0x3) {
        case 3: *output++ = *input0++ + *input1++;
        case 2: *output++ = *input0++ + *input1++;
        case 1: *output++ = *input0++ + *input1++;
        default: break;
      }
      #else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input0[i] + input1[i];
      }
      #endif
    }
  
    static inline void add(float *input, float constant, float *output, int startIndex, int endIndex) {
      #if __APPLE__
      vDSP_vsadd(input+startIndex, 1, &constant, output+startIndex, 1, endIndex-startIndex);
      #elif __SSE__
      input += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      
            switch (startIndex & 0x3) {
        case 0: default: break;
        case 1: *output++ += constant; --n;
        case 2: *output++ += constant; --n;
        case 3: *output++ += constant; --n;
      }
      
      int n4 = n & 0xFFFFFFFC;
      const __m128 constVec = _mm_set1_ps(constant);
      while (n4) {
        _mm_store_ps(output, _mm_add_ps(_mm_load_ps(input), constVec));
        n4 -= 4; input += 4; output += 4;
      }
      
      switch (n & 0x3) {
        case 3: *output++ += constant;
        case 2: *output++ += constant;
        case 1: *output++ += constant;
        case 0: default: break;
      }
      #elif __ARM_NEON__
      input += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      int n4 = n & 0xFFFFFFFC;
      float32x4_t inVec, res;
      while (n4) {
        inVec = vld1q_f32((const float32_t *) input);
        res = vaddq_f32(inVec, constant);
        vst1q_f32((float32_t *) output, res);
        n4 -= 4;
        input += 4;
        output += 4;
      }
      switch (n & 0x3) {
        case 3: *output++ += constant;
        case 2: *output++ += constant;
        case 1: *output++ += constant;
        default: break;
      }
      #else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input[i] + constant;
      }
      #endif
    }
    
        static inline void subtract(float *input0, float *input1, float *output, int startIndex, int endIndex) {
      #if __APPLE__
      vDSP_vsub(input1+startIndex, 1, input0+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
      #elif __SSE__
      input0 += startIndex;
      input1 += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      
      switch (startIndex & 0x3) {
        case 0: default: break;
        case 1: *output++ = *input0++ - *input1++; --n;
        case 2: *output++ = *input0++ - *input1++; --n;
        case 3: *output++ = *input0++ - *input1++; --n;
      }
      
      int n4 = n & 0xFFFFFFFC;
      while (n4) {
        _mm_store_ps(output, _mm_sub_ps(_mm_load_ps(input0), _mm_load_ps(input1)));
        n4 -= 4; input0 += 4; input1 += 4; output += 4;
      }
      
      switch (n & 0x3) {
        case 3: *output++ = *input0++ - *input1++;
        case 2: *output++ = *input0++ - *input1++;
        case 1: *output++ = *input0++ - *input1++;
        case 0: default: break;
      }
      #elif __ARM_NEON__
      input0 += startIndex;
      input1 += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      int n4 = n & 0xFFFFFFFC;
      float32x4_t inVec0, inVec1, res;
      while (n4) {
        inVec0 = vld1q_f32((const float32_t *) input0);
        inVec1 = vld1q_f32((const float32_t *) input1);
        res = vsubq_f32(inVec0, inVec1);
        vst1q_f32((float32_t *) output, res);
        n4 -= 4;
        input0 += 4;
        input1 += 4;
        output += 4;
      }
      switch (n & 0x3) {
        case 3: *output++ = *input0++ - *input1++;
        case 2: *output++ = *input0++ - *input1++;
        case 1: *output++ = *input0++ - *input1++;
        default: break;
      }
      #else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input0[i] - input1[i];
      }
      #endif
    }
  
    static inline void subtract(float *input, float constant, float *output, int startIndex, int endIndex) {
      #if __APPLE__
      float negation = -1.0f * constant;
      vDSP_vsadd(input+startIndex, 1, &negation, output+startIndex, 1, endIndex-startIndex);
      #elif __SSE__
      input += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      
      switch (startIndex & 0x3) {
        case 0: default: break;
        case 1: *output++ -= constant; --n;
        case 2: *output++ -= constant; --n;
        case 3: *output++ -= constant; --n;
      }
      
      int n4 = n & 0xFFFFFFFC;
      const __m128 constVec = _mm_set1_ps(constant);
      while (n4) {
        _mm_store_ps(output, _mm_sub_ps(_mm_load_ps(input), constVec));
        n4 -= 4; input += 4; output += 4;
      }
      
      switch (n & 0x3) {
        case 3: *output++ -= constant;
        case 2: *output++ -= constant;
        case 1: *output++ -= constant;
        case 0: default: break;
      }
      #elif __ARM_NEON__
      input += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      int n4 = n & 0xFFFFFFFC;
      float32x4_t inVec, res;
      while (n4) {
        inVec = vld1q_f32((const float32_t *) input);
        res = vsubq_f32(inVec, constant);
        vst1q_f32((float32_t *) output, res);
        n4 -= 4;
        input += 4;
        output += 4;
      }
      switch (n & 0x3) {
        case 3: *output++ -= constant;
        case 2: *output++ -= constant;
        case 1: *output++ -= constant;
        default: break;
      }
      #else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input[i] - constant;
      }
      #endif
    }
    
    static inline void multiply(float *input0, float *input1, float *output, int startIndex, int endIndex) {
      #if __APPLE__
      vDSP_vmul(input0+startIndex, 1, input1+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
      #elif __SSE__
      input0 += startIndex;
      input1 += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      
      switch (startIndex & 0x3) {
        case 0: default: break;
        case 1: *output++ = *input0++ * *input1++; --n;
        case 2: *output++ = *input0++ * *input1++; --n;
        case 3: *output++ = *input0++ * *input1++; --n;
      }
      
      int n4 = n & 0xFFFFFFFC;
      while (n4) {
        _mm_store_ps(output, _mm_mul_ps(_mm_load_ps(input0), _mm_load_ps(input1)));
        n4 -= 4; input0 += 4; input1 += 4; output += 4;
      }
      
      switch (n & 0x3) {
        case 3: *output++ = *input0++ * *input1++;
        case 2: *output++ = *input0++ * *input1++;
        case 1: *output++ = *input0++ * *input1++;
        case 0: default: break;
      }
      #elif __ARM_NEON__
      input0 += startIndex;
      input1 += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      int n4 = n & 0xFFFFFFFC;
      float32x4_t inVec0, inVec1, res;
      while (n4) {
        inVec0 = vld1q_f32((const float32_t *) input0);         inVec1 = vld1q_f32((const float32_t *) input1);         res = vmulq_f32(inVec0, inVec1);         vst1q_f32((float32_t *) output, res);         n4 -= 4;
        input0 += 4;
        input1 += 4;
        output += 4;
      }
      switch (n & 0x3) {
        case 3: *output++ = *input0++ * *input1++;
        case 2: *output++ = *input0++ * *input1++;
        case 1: *output++ = *input0++ * *input1++;
        default: break;
      }
      #else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input0[i] * input1[i];
      }
      #endif
    }
  
    static inline void multiply(float *input, float constant, float *output, int startIndex, int endIndex) {
      #if __APPLE__
      vDSP_vsmul(input+startIndex, 1, &constant, output+startIndex, 1, endIndex-startIndex);
      #elif __SSE__
      input += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      
      switch (startIndex & 0x3) {
        case 0: default: break;
        case 1: *output++ *= constant; --n;
        case 2: *output++ *= constant; --n;
        case 3: *output++ *= constant; --n;
      }
      
      int n4 = n & 0xFFFFFFFC;
      const __m128 constVec = _mm_set1_ps(constant);
      while (n4) {
        _mm_store_ps(output, _mm_mul_ps(_mm_load_ps(input), constVec));
        n4 -= 4; input += 4; output += 4;
      }
      
      switch (n & 0x3) {
        case 3: *output++ *= constant;
        case 2: *output++ *= constant;
        case 1: *output++ *= constant;
        case 0: default: break;
      }
      #elif __ARM_NEON__
      input += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      int n4 = n & 0xFFFFFFFC;
      float32x4_t inVec, res;
      while (n4) {
        inVec = vld1q_f32((const float32_t *) input);
        res = vmulq_n_f32(inVec, constant);
        vst1q_f32((float32_t *) output, res);
        n4 -= 4;
        input += 4;
        output += 4;
      }
      switch (n & 0x3) {
        case 3: *output++ *= constant;
        case 2: *output++ *= constant;
        case 1: *output++ *= constant;
        default: break;
      }
      #else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input[i] * constant;
      }
      #endif
    }
    
        static inline void divide(float *input0, float *input1, float *output, int startIndex, int endIndex) {
      #if __APPLE__
      vDSP_vdiv(input1+startIndex, 1, input0+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
      #elif __SSE__
      input0 += startIndex;
      input1 += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      
      switch (startIndex & 0x3) {
        case 0: default: break;
        case 1: *output++ = *input0++ / *input1++; --n;
        case 2: *output++ = *input0++ / *input1++; --n;
        case 3: *output++ = *input0++ / *input1++; --n;
      }
      
      int n4 = n & 0xFFFFFFFC;
      while (n4) {
        _mm_store_ps(output, _mm_div_ps(_mm_load_ps(input0), _mm_load_ps(input1)));
        n4 -= 4; input0 += 4; input1 += 4; output += 4;
      }
      
      switch (n & 0x3) {
        case 3: *output++ = *input0++ / *input1++;
        case 2: *output++ = *input0++ / *input1++;
        case 1: *output++ = *input0++ / *input1++;
        case 0: default: break;
      }
      #else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input0[i] / input1[i];
      }
      #endif
    }
  
    static inline void divide(float *input, float constant, float *output, int startIndex, int endIndex) {
      #if __APPLE__
      vDSP_vsdiv(input+startIndex, 1, &constant, output+startIndex, 1, endIndex-startIndex);
      #elif __SSE__
      input += startIndex;
      output += startIndex;
      int n = endIndex - startIndex;
      
      switch (startIndex & 0x3) {
        case 0: default: break;
        case 1: *output++ /= constant; --n;
        case 2: *output++ /= constant; --n;
        case 3: *output++ /= constant; --n;
      }
      
      int n4 = n & 0xFFFFFFFC;
      const __m128 constVec = _mm_set1_ps(constant);
      while (n4) {
        _mm_store_ps(output, _mm_div_ps(_mm_load_ps(input), constVec));
        n4 -= 4; input += 4; output += 4;
      }
      
      switch (n & 0x3) {
        case 3: *output++ /= constant;
        case 2: *output++ /= constant;
        case 1: *output++ /= constant;
        case 0: default: break;
      }
      #else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input[i] / constant;
      }
      #endif
    }
  
    static inline void fill(float *input, float constant, int startIndex, int endIndex) {
      #if __APPLE__
      vDSP_vfill(&constant, input+startIndex, 1, endIndex-startIndex);
      #elif __SSE__
      input += startIndex;
      int n = endIndex - startIndex;
      
      switch (startIndex & 0x3) {
        case 0: default: break;
        case 1: *input++ = constant; --n;
        case 2: *input++ = constant; --n;
        case 3: *input++ = constant; --n;
      }
      
      int n4 = n & 0xFFFFFFFC;       const __m128 constVec = _mm_set1_ps(constant);
      while (n4) {
        _mm_store_ps(input, constVec);
        n4 -= 4; input += 4;
      }
      
      switch (n & 0x3) {
        case 3: *input++ = constant;
        case 2: *input++ = constant;
        case 1: *input++ = constant;
        case 0: default: break;
      }
      #elif __ARM_NEON__
      input += startIndex;
      int n = endIndex - startIndex;
      int n4 = n & 0xFFFFFFFC;       float32x4_t constVec = vdupq_n_f32(constant);
      while (n4) {
        vst1q_f32((float32_t *) input, constVec);
        n4 -= 4;
        input += 4;
      }
      switch (n & 0x3) {
        case 3: *input++ = constant;
        case 2: *input++ = constant;
        case 1: *input++ = constant;
        default: break;
      }
      #else
      for (int i = startIndex; i < endIndex; i++) {
        input[i] = constant;
      }
      #endif
    }
    
  private:
    ArrayArithmetic();     ~ArrayArithmetic();
};
#endif