#include <stdint.h>
#include <string.h>
#include <tmmintrin.h>
/* Inclusive prefix sum across the four lanes of v, with carry added to every lane. */
static inline __m128 accumulate_sse_prefix4(__m128 v, __m128 carry) {
    /* [a, b, c, d] + [0, a, b, c] -> [a, a+b, b+c, c+d] */
    v = _mm_add_ps(v, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v), 4)));
    /* + [0, 0, a, a+b] -> [a, a+b, a+b+c, a+b+c+d] */
    v = _mm_add_ps(v, _mm_shuffle_ps(_mm_setzero_ps(), v, 0x40));
    return _mm_add_ps(v, carry);
}

/*
 * Convert four running sums to four bytes: byte k = (uint8_t)(min(|sum_k|, 1) * 255),
 * truncated toward zero. Lane 0 ends up in the least significant byte of the result.
 */
static inline uint32_t accumulate_sse_pack4(__m128 sums) {
    /* Clearing the sign bit yields the absolute value. */
    __m128 y = _mm_andnot_ps(_mm_set1_ps(-0.f), sums);
    y = _mm_min_ps(y, _mm_set1_ps(1.0f));
    y = _mm_mul_ps(y, _mm_set1_ps(255.0f));
    __m128i z = _mm_cvttps_epi32(y);
    /*
     * Every lane is in 0..255 here, so the saturating packs just narrow
     * 32 -> 16 -> 8 bits without changing any value. This needs SSE2 only.
     */
    z = _mm_packs_epi32(z, z);
    z = _mm_packus_epi16(z, z);
    return (uint32_t)_mm_cvtsi128_si32(z);
}

/*
 * Compute the running (cumulative) sum of in[0..n-1] and write, for each
 * element, min(|running sum|, 1.0) * 255 truncated to an 8-bit value.
 *
 * in  : n floats; no particular alignment is required.
 * out : n bytes; exactly n bytes are written.
 * n   : element count; any value is accepted, including 0 and counts that
 *       are not a multiple of 4.
 */
void accumulate_sse(const float *in, uint8_t *out, uint32_t n) {
    __m128 offset = _mm_setzero_ps();
    /* Largest multiple of 4 not exceeding n; i += 4 below can never wrap. */
    const uint32_t full = n & ~(uint32_t)3;
    uint32_t i = 0;

    for (; i < full; i += 4) {
        __m128 x = accumulate_sse_prefix4(_mm_loadu_ps(&in[i]), offset);
        uint32_t packed = accumulate_sse_pack4(x);
        /* memcpy avoids writing a float lvalue into uint8_t storage. */
        memcpy(&out[i], &packed, sizeof packed);
        /* Carry the last running sum into the next group of four. */
        offset = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 3, 3, 3));
    }

    if (i < n) {
        /* 1..3 trailing elements: pad with zeros so nothing past in[n-1] is read. */
        const uint32_t rem = n - i;
        float tail[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (uint32_t k = 0; k < rem; k++) {
            tail[k] = in[i + k];
        }
        __m128 x = accumulate_sse_prefix4(_mm_loadu_ps(tail), offset);
        uint32_t packed = accumulate_sse_pack4(x);
        /* x86 is little-endian: the low rem bytes are lanes 0..rem-1. */
        memcpy(&out[i], &packed, rem);
    }
}