#include "SDL_internal.h"
#include "SDL_sysaudio.h"
#include "SDL_audioresample.h"
// Number of sinc zero crossings covered on each side of the sampling position.
// Builds with SSE or NEON intrinsics use 6, which makes RESAMPLER_SAMPLES_PER_FRAME
// equal to 12; the SIMD kernels below refuse to compile with any other value.
#if defined(SDL_SSE_INTRINSICS) || defined(SDL_NEON_INTRINSICS)
#define RESAMPLER_ZERO_CROSSINGS 6
#else
#define RESAMPLER_ZERO_CROSSINGS 5
#endif
// Number of input frames (filter taps) combined to produce one output frame.
#define RESAMPLER_SAMPLES_PER_FRAME (RESAMPLER_ZERO_CROSSINGS * 2)
// Frame count reported by SDL_GetResamplerHistoryFrames() and SDL_GetResamplerPaddingFrames().
#define RESAMPLER_MAX_PADDING_FRAMES (RESAMPLER_ZERO_CROSSINGS + 1)
// log2 of the number of filter phases (rows of ResamplerFilter).
#define RESAMPLER_BITS_PER_ZERO_CROSSING 3
// Number of filter phases (rows of ResamplerFilter).
#define RESAMPLER_SAMPLES_PER_ZERO_CROSSING (1 << RESAMPLER_BITS_PER_ZERO_CROSSING)
// The top RESAMPLER_BITS_PER_ZERO_CROSSING bits of the 32-bit fractional position select
// a filter phase; the remaining RESAMPLER_FILTER_INTERP_BITS low bits are scaled into
// the [0, 1) interpolation factor passed to the frame kernels.
#define RESAMPLER_FILTER_INTERP_BITS (32 - RESAMPLER_BITS_PER_ZERO_CROSSING)
#define RESAMPLER_FILTER_INTERP_RANGE (1 << RESAMPLER_FILTER_INTERP_BITS)
// Four floats that can also be accessed as a single 4-lane SIMD vector when
// SSE or NEON intrinsics are available.
//
// As generated, each entry of the filter table holds the coefficients of a cubic
// polynomial: v[0] + v[1]*t + v[2]*t^2 + v[3]*t^3. When a SIMD kernel is selected,
// SetupAudioResampler() transposes each group of four entries, so one Cubic then
// holds the same-degree coefficient of four consecutive taps.
typedef union Cubic
{
float v[4];
#ifdef SDL_SSE_INTRINSICS
__m128 v128;
#endif
#ifdef SDL_NEON_INTRINSICS
float32x4_t v128;
#endif
} Cubic;
// Scalar kernel: produce one output frame for an arbitrary channel count.
//
// src    - interleaved input; RESAMPLER_SAMPLES_PER_FRAME frames of `chans` samples are read.
// dst    - receives `chans` output samples.
// filter - RESAMPLER_SAMPLES_PER_FRAME cubic polynomials, one per tap (untransposed layout).
// frac   - interpolation factor at which each polynomial is evaluated.
// chans  - number of interleaved channels.
static void ResampleFrame_Generic(const float *src, float *dst, const Cubic *filter, float frac, int chans)
{
    const float frac2 = frac * frac;
    const float frac3 = frac * frac2;
    float weights[RESAMPLER_SAMPLES_PER_FRAME];
    int tap, ch;

    // The tap weights depend only on frac, so evaluate them once for all channels.
    for (tap = 0; tap < RESAMPLER_SAMPLES_PER_FRAME; ++tap) {
        const float *c = filter[tap].v;
        weights[tap] = c[0] + (c[1] * frac) + (c[2] * frac2) + (c[3] * frac3);
    }

    // Each output sample is the weighted sum of that channel's samples across all taps.
    for (ch = 0; ch < chans; ++ch) {
        float acc = 0.0f;

        for (tap = 0; tap < RESAMPLER_SAMPLES_PER_FRAME; ++tap) {
            acc += src[tap * chans + ch] * weights[tap];
        }

        dst[ch] = acc;
    }
}
// Scalar kernel specialised for a single channel: dst[0] is the weighted sum of
// RESAMPLER_SAMPLES_PER_FRAME consecutive input samples. `chans` is not used.
static void ResampleFrame_Mono(const float *src, float *dst, const Cubic *filter, float frac, int chans)
{
    const float frac2 = frac * frac;
    const float frac3 = frac * frac2;
    float acc = 0.0f;
    int tap;

    for (tap = 0; tap < RESAMPLER_SAMPLES_PER_FRAME; ++tap) {
        // Evaluate this tap's cubic at frac to obtain its weight.
        const float *c = filter[tap].v;
        const float weight = c[0] + (c[1] * frac) + (c[2] * frac2) + (c[3] * frac3);

        acc += src[tap] * weight;
    }

    dst[0] = acc;
}
// Scalar kernel specialised for two interleaved channels: both outputs share the
// same tap weights, so each weight is evaluated once and applied to both samples
// of the frame. `chans` is not used.
static void ResampleFrame_Stereo(const float *src, float *dst, const Cubic *filter, float frac, int chans)
{
    const float frac2 = frac * frac;
    const float frac3 = frac * frac2;
    float acc0 = 0.0f;
    float acc1 = 0.0f;
    int tap;

    for (tap = 0; tap < RESAMPLER_SAMPLES_PER_FRAME; ++tap) {
        const float *c = filter[tap].v;
        const float weight = c[0] + (c[1] * frac) + (c[2] * frac2) + (c[3] * frac3);
        const float *pair = &src[tap * 2];

        acc0 += pair[0] * weight;
        acc1 += pair[1] * weight;
    }

    dst[0] = acc0;
    dst[1] = acc1;
}
#ifdef SDL_SSE_INTRINSICS
// a + (b * c), computed as a separate multiply followed by an add.
#define sdl_madd_ps(a, b, c) _mm_add_ps(a, _mm_mul_ps(b, c))
// SSE kernel: produce one output frame for any channel count.
//
// Takes the same arguments as ResampleFrame_Generic, but expects `filter` in the
// transposed layout set up by SetupAudioResampler(): within each group of four
// entries, filter[k] holds the degree-k coefficients of four consecutive taps.
// The filter is read with aligned loads; src and dst use unaligned accesses.
static void SDL_TARGETING("sse") ResampleFrame_Generic_SSE(const float *src, float *dst, const Cubic *filter, float frac, int chans)
{
#if RESAMPLER_SAMPLES_PER_FRAME != 12
#error Invalid samples per frame
#endif
// f0, f1, f2 receive the 12 tap weights, four per vector.
__m128 f0, f1, f2;
{
const __m128 frac1 = _mm_set1_ps(frac);
const __m128 frac2 = _mm_mul_ps(frac1, frac1);
const __m128 frac3 = _mm_mul_ps(frac1, frac2);
// X(out): evaluate the cubic for four taps at once,
// out = c0 + frac*c1 + frac^2*c2 + frac^3*c3, then step to the next group of four.
#define X(out) \
out = _mm_load_ps(filter[0].v); \
out = sdl_madd_ps(out, frac1, _mm_load_ps(filter[1].v)); \
out = sdl_madd_ps(out, frac2, _mm_load_ps(filter[2].v)); \
out = sdl_madd_ps(out, frac3, _mm_load_ps(filter[3].v)); \
filter += 4
X(f0);
X(f1);
X(f2);
#undef X
}
// Stereo: 24 interleaved samples. Each weight is duplicated so it lines up with
// the two samples of its frame; two accumulators are used and summed at the end.
if (chans == 2) {
__m128 out0 = _mm_mul_ps(_mm_loadu_ps(src + 0), _mm_unpacklo_ps(f0, f0));
__m128 out1 = _mm_mul_ps(_mm_loadu_ps(src + 4), _mm_unpackhi_ps(f0, f0));
out0 = sdl_madd_ps(out0, _mm_loadu_ps(src + 8), _mm_unpacklo_ps(f1, f1));
out1 = sdl_madd_ps(out1, _mm_loadu_ps(src + 12), _mm_unpackhi_ps(f1, f1));
out0 = sdl_madd_ps(out0, _mm_loadu_ps(src + 16), _mm_unpacklo_ps(f2, f2));
out1 = sdl_madd_ps(out1, _mm_loadu_ps(src + 20), _mm_unpackhi_ps(f2, f2));
__m128 out = _mm_add_ps(out0, out1);
// Fold the upper pair of lanes onto the lower pair, then store those two lanes.
out = _mm_add_ps(out, _mm_movehl_ps(out, out));
_mm_storel_pi((__m64 *)dst, out);
return;
}
// Mono: multiply the 12 weights by 12 consecutive samples, then sum the four lanes.
if (chans == 1) {
__m128 out = _mm_mul_ps(f0, _mm_loadu_ps(src + 0));
out = sdl_madd_ps(out, f1, _mm_loadu_ps(src + 4));
out = sdl_madd_ps(out, f2, _mm_loadu_ps(src + 8));
// Horizontal sum: add adjacent lanes, then add the two pair sums into lane 0.
__m128 shuf = _mm_shuffle_ps(out, out, _MM_SHUFFLE(2, 3, 0, 1));
out = _mm_add_ps(out, shuf);
out = _mm_add_ss(out, _mm_movehl_ps(shuf, out));
_mm_store_ss(dst, out);
return;
}
int chan = 0;
// Other channel counts: process four channels at a time. For each of the 12 frames,
// load four adjacent channel samples and scale them by that frame's weight broadcast
// to all lanes, alternating between two accumulators.
for (; chan + 4 <= chans; chan += 4) {
const float *in = &src[chan];
__m128 out0 = _mm_setzero_ps();
__m128 out1 = _mm_setzero_ps();
// X(a, b, out): accumulate one frame using lane b of weight vector a, then advance one frame.
#define X(a, b, out) \
out = sdl_madd_ps(out, _mm_loadu_ps(in), _mm_shuffle_ps(a, a, _MM_SHUFFLE(b, b, b, b))); \
in += chans
// Y(a): accumulate the four frames covered by weight vector a.
#define Y(a) \
X(a, 0, out0); \
X(a, 1, out1); \
X(a, 2, out0); \
X(a, 3, out1)
Y(f0);
Y(f1);
Y(f2);
#undef X
#undef Y
__m128 out = _mm_add_ps(out0, out1);
_mm_storeu_ps(&dst[chan], out);
}
// Leftover channels (fewer than four): handle one channel at a time by gathering its
// 12 strided samples into three vectors and taking the dot product with the weights.
for (; chan < chans; ++chan) {
const float *in = &src[chan];
__m128 v0, v1, v2;
// X(x): gather this channel's samples from four consecutive frames into x.
#define X(x) \
x = _mm_unpacklo_ps(_mm_load_ss(in), _mm_load_ss(in + chans)); \
in += chans + chans; \
x = _mm_movelh_ps(x, _mm_unpacklo_ps(_mm_load_ss(in), _mm_load_ss(in + chans))); \
in += chans + chans
X(v0);
X(v1);
X(v2);
#undef X
__m128 out = _mm_mul_ps(f0, v0);
out = sdl_madd_ps(out, f1, v1);
out = sdl_madd_ps(out, f2, v2);
// Horizontal sum into lane 0, as in the mono path.
__m128 shuf = _mm_shuffle_ps(out, out, _MM_SHUFFLE(2, 3, 0, 1));
out = _mm_add_ps(out, shuf);
out = _mm_add_ss(out, _mm_movehl_ps(shuf, out));
_mm_store_ss(&dst[chan], out);
}
}
#undef sdl_madd_ps
#endif
#ifdef SDL_NEON_INTRINSICS
// NEON kernel: produce one output frame for any channel count.
//
// Takes the same arguments as ResampleFrame_Generic, but expects `filter` in the
// transposed layout set up by SetupAudioResampler(): within each group of four
// entries, filter[k] holds the degree-k coefficients of four consecutive taps.
static void ResampleFrame_Generic_NEON(const float *src, float *dst, const Cubic *filter, float frac, int chans)
{
#if RESAMPLER_SAMPLES_PER_FRAME != 12
#error Invalid samples per frame
#endif
// f0, f1, f2 receive the 12 tap weights, four per vector.
float32x4_t f0, f1, f2;
{
const float32x4_t frac1 = vdupq_n_f32(frac);
const float32x4_t frac2 = vmulq_f32(frac1, frac1);
const float32x4_t frac3 = vmulq_f32(frac1, frac2);
// X(out): evaluate the cubic for four taps at once,
// out = c0 + c1*frac + c2*frac^2 + c3*frac^3, then step to the next group of four.
#define X(out) \
out = vmlaq_f32(vmlaq_f32(vmlaq_f32(filter[0].v128, filter[1].v128, frac1), filter[2].v128, frac2), filter[3].v128, frac3); \
filter += 4
X(f0);
X(f1);
X(f2);
#undef X
}
// Stereo: 24 interleaved samples. Zipping a weight vector with itself duplicates each
// weight so it lines up with the two samples of its frame.
if (chans == 2) {
float32x4x2_t g0 = vzipq_f32(f0, f0);
float32x4x2_t g1 = vzipq_f32(f1, f1);
float32x4x2_t g2 = vzipq_f32(f2, f2);
float32x4_t out0 = vmulq_f32(vld1q_f32(src + 0), g0.val[0]);
float32x4_t out1 = vmulq_f32(vld1q_f32(src + 4), g0.val[1]);
out0 = vmlaq_f32(out0, vld1q_f32(src + 8), g1.val[0]);
out1 = vmlaq_f32(out1, vld1q_f32(src + 12), g1.val[1]);
out0 = vmlaq_f32(out0, vld1q_f32(src + 16), g2.val[0]);
out1 = vmlaq_f32(out1, vld1q_f32(src + 20), g2.val[1]);
out0 = vaddq_f32(out0, out1);
// Fold the upper pair of lanes onto the lower pair and store the two results.
float32x2_t out = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
vst1_f32(dst, out);
return;
}
// Mono: multiply the 12 weights by 12 consecutive samples, then sum the four lanes.
if (chans == 1) {
float32x4_t out = vmulq_f32(f0, vld1q_f32(src + 0));
out = vmlaq_f32(out, f1, vld1q_f32(src + 4));
out = vmlaq_f32(out, f2, vld1q_f32(src + 8));
// Horizontal sum: halves first, then the remaining pair.
float32x2_t sum = vadd_f32(vget_low_f32(out), vget_high_f32(out));
sum = vpadd_f32(sum, sum);
vst1_lane_f32(dst, sum, 0);
return;
}
int chan = 0;
// Other channel counts: process four channels at a time. For each of the 12 frames,
// load four adjacent channel samples and scale them by that frame's weight broadcast
// to all lanes, alternating between two accumulators.
for (; chan + 4 <= chans; chan += 4) {
const float *in = &src[chan];
float32x4_t out0 = vdupq_n_f32(0);
float32x4_t out1 = vdupq_n_f32(0);
// X(a, b, out): accumulate one frame using lane b of the 2-lane half a, then advance one frame.
#define X(a, b, out) \
out = vmlaq_f32(out, vld1q_f32(in), vdupq_lane_f32(a, b)); \
in += chans
// Y(a): accumulate the four frames covered by weight vector a.
#define Y(a) \
X(vget_low_f32(a), 0, out0); \
X(vget_low_f32(a), 1, out1); \
X(vget_high_f32(a), 0, out0); \
X(vget_high_f32(a), 1, out1)
Y(f0);
Y(f1);
Y(f2);
#undef X
#undef Y
float32x4_t out = vaddq_f32(out0, out1);
vst1q_f32(&dst[chan], out);
}
// Leftover channels (fewer than four): handle one channel at a time by gathering its
// 12 strided samples into three vectors and taking the dot product with the weights.
for (; chan < chans; ++chan) {
const float *in = &src[chan];
float32x4_t v0, v1, v2;
// X(x): gather this channel's samples from four consecutive frames into x.
// The first load fills every lane; lanes 1..3 are then overwritten one by one.
#define X(x) \
x = vld1q_dup_f32(in); \
in += chans; \
x = vld1q_lane_f32(in, x, 1); \
in += chans; \
x = vld1q_lane_f32(in, x, 2); \
in += chans; \
x = vld1q_lane_f32(in, x, 3); \
in += chans
X(v0);
X(v1);
X(v2);
#undef X
float32x4_t out = vmulq_f32(f0, v0);
out = vmlaq_f32(out, f1, v1);
out = vmlaq_f32(out, f2, v2);
// Horizontal sum, as in the mono path.
float32x2_t sum = vadd_f32(vget_low_f32(out), vget_high_f32(out));
sum = vpadd_f32(sum, sum);
vst1_lane_f32(&dst[chan], sum, 0);
}
}
#endif
// Compute the coefficients of the cubic p(t) = c0 + c1*t + c2*t^2 + c3*t^3 that
// satisfies p(0) = y0, p(1/3) = y1, p(2/3) = y2 and p(1) = y3, and store them
// in coeffs->v[0..3].
static void CubicLeastSquares(Cubic *coeffs, float y0, float y1, float y2, float y3)
{
    float *c = coeffs->v;

    c[0] = y0;
    c[1] = -5.5f * y0 + 9.0f * y1 - 4.5f * y2 + y3;
    c[2] = 9.0f * y0 - 22.5f * y1 + 18.0f * y2 - 4.5f * y3;
    c[3] = -4.5f * y0 + 13.5f * y1 - 13.5f * y2 + 4.5f * y3;
}
// Sum the power series  sum over k >= 0 of ((x*x/4)^k / (k!)^2)  (the series of the
// zeroth-order modified Bessel function of the first kind). Terms are added until
// the next term is smaller than the running sum times SDL_FLT_EPSILON.
static float BesselI0(float x)
{
    float total = 0.0f;
    float term = 1.0f;
    float k;

    x *= x * 0.25f;

    // Each term is the previous one multiplied by (x*x/4) / k^2.
    for (k = 1.0f; term >= total * SDL_FLT_EPSILON; ++k) {
        total += term;
        term *= x / (k * k);
    }

    return total;
}
// Fill table[0..len-1] with sin(pi * n / len) / pi, i.e. one half-period of a sine
// sampled at `len` points and scaled by 1/pi. Sinc() reads this table.
static void SincTable(float *table, int len)
{
    int n = 0;

    while (n < len) {
        table[n] = SDL_sinf(n * (SDL_PI_F / len)) / SDL_PI_F;
        ++n;
    }
}
// Look up table[x % y], negate it when x / y is odd, and return the result scaled
// by y / x. With a table produced by SincTable(table, y) this is
// sin(pi * x / y) / (pi * x / y).
//
// table - at least y entries.
// x     - sample index; callers in this file only pass x >= 1 (x is used as a divisor).
// y     - table length (samples per zero crossing).
static float Sinc(const float *table, int x, int y)
{
    const int half_period = x / y;
    const int phase = x % y;
    float value = table[phase];

    // The sine changes sign every half period, which the table does not store.
    if (half_period & 1) {
        value = -value;
    }

    return (value * y) / x;
}
// Filter table: one row per sub-sample phase (selected by the top
// RESAMPLER_BITS_PER_ZERO_CROSSING bits of the fractional source position) and one
// Cubic per tap. Filled by GenerateResamplerFilter(); SetupAudioResampler() may then
// transpose it in groups of four for the SIMD kernels.
static Cubic ResamplerFilter[RESAMPLER_SAMPLES_PER_ZERO_CROSSING][RESAMPLER_SAMPLES_PER_FRAME];
// Build ResamplerFilter.
//
// One half of a windowed sinc is sampled at three points per table step, where the
// window is I0(beta * sqrt(1 - (n/N)^2)) / I0(beta) (the Kaiser window form). Every
// four consecutive samples (three steps, sharing end points) are then fitted with a
// cubic, which is stored once for the taps on one side of the sampling position and
// once, with the samples reversed, for the mirrored tap on the other side.
static void GenerateResamplerFilter(void)
{
    enum
    {
        TABLE_SAMPLES_PER_ZERO_CROSSING = RESAMPLER_SAMPLES_PER_ZERO_CROSSING * 3,
        TABLE_SIZE = RESAMPLER_ZERO_CROSSINGS * TABLE_SAMPLES_PER_ZERO_CROSSING,
    };

    const float dB = 80.0f;
    const float beta = 0.1102f * (dB - 8.7f);
    const float bessel_beta = BesselI0(beta);
    const float lensqr = TABLE_SIZE * TABLE_SIZE;
    float sinc[TABLE_SAMPLES_PER_ZERO_CROSSING];
    float filter[TABLE_SIZE + 1];
    int n, phase, crossing;

    SincTable(sinc, TABLE_SAMPLES_PER_ZERO_CROSSING);

    // The centre sample is set directly; Sinc() divides by its index and is not called for 0.
    filter[0] = 1.0f;
    for (n = 1; n <= TABLE_SIZE; ++n) {
        float window = BesselI0(beta * SDL_sqrtf((lensqr - (n * n)) / lensqr)) / bessel_beta;
        float value = Sinc(sinc, n, TABLE_SAMPLES_PER_ZERO_CROSSING);
        filter[n] = window * value;
    }

    for (phase = 0; phase < RESAMPLER_SAMPLES_PER_ZERO_CROSSING; ++phase) {
        Cubic *fwd_row = ResamplerFilter[phase];
        Cubic *rev_row = ResamplerFilter[RESAMPLER_SAMPLES_PER_ZERO_CROSSING - phase - 1];

        for (crossing = 0; crossing < RESAMPLER_ZERO_CROSSINGS; ++crossing) {
            // Four samples spanning one table step of this phase within this zero crossing.
            const float *ys = &filter[((crossing * RESAMPLER_SAMPLES_PER_ZERO_CROSSING) + phase) * 3];

            CubicLeastSquares(&fwd_row[RESAMPLER_ZERO_CROSSINGS - crossing - 1], ys[0], ys[1], ys[2], ys[3]);
            CubicLeastSquares(&rev_row[RESAMPLER_ZERO_CROSSINGS + crossing], ys[3], ys[2], ys[1], ys[0]);
        }
    }
}
// Signature shared by every per-frame resampling kernel in this file.
typedef void (*ResampleFrameFunc)(const float *src, float *dst, const Cubic *filter, float frac, int chans);
// Kernel table indexed by (channel count - 1); filled in by SetupAudioResampler().
static ResampleFrameFunc ResampleFrame[8];
// Transpose, in place, the 4x4 float matrix formed by data[0..3].v[0..3]:
// afterwards data[i].v[j] holds what data[j].v[i] held before.
static void Transpose4x4(Cubic *data)
{
    int row, col;

    // Swap each element above the diagonal with its mirror below it.
    for (row = 0; row < 4; ++row) {
        for (col = row + 1; col < 4; ++col) {
            const float upper = data[row].v[col];
            data[row].v[col] = data[col].v[row];
            data[col].v[row] = upper;
        }
    }
}
// Generate the filter table and fill the ResampleFrame kernel table.
//
// SSE is preferred when compiled in and reported by SDL_HasSSE(), then NEON when
// compiled in and reported by SDL_HasNEON(); otherwise the scalar kernels are used.
// When a SIMD kernel is chosen, the filter table is transposed in groups of four
// taps to match the layout those kernels read.
static void SetupAudioResampler(void)
{
    int i, j;
    bool transpose = false;

    GenerateResamplerFilter();

#ifdef SDL_SSE_INTRINSICS
    if (SDL_HasSSE()) {
        for (i = 0; i < 8; ++i) {
            ResampleFrame[i] = ResampleFrame_Generic_SSE;
        }
        transpose = true;
    }
#endif

#ifdef SDL_NEON_INTRINSICS
    // Only consulted when SSE was not selected above.
    if (!transpose && SDL_HasNEON()) {
        for (i = 0; i < 8; ++i) {
            ResampleFrame[i] = ResampleFrame_Generic_NEON;
        }
        transpose = true;
    }
#endif

    if (!transpose) {
        // Scalar fallback: dedicated mono and stereo kernels, generic for the rest.
        ResampleFrame[0] = ResampleFrame_Mono;
        ResampleFrame[1] = ResampleFrame_Stereo;
        for (i = 2; i < 8; ++i) {
            ResampleFrame[i] = ResampleFrame_Generic;
        }
        return;
    }

    // SIMD kernels read same-degree coefficients of four taps from one Cubic.
    for (i = 0; i < RESAMPLER_SAMPLES_PER_ZERO_CROSSING; ++i) {
        for (j = 0; j + 4 <= RESAMPLER_SAMPLES_PER_FRAME; j += 4) {
            Transpose4x4(&ResamplerFilter[i][j]);
        }
    }
}
// Public entry point for resampler setup. SetupAudioResampler() is run only when
// SDL_ShouldInit() says this call should perform the initialization, after which
// the init state is marked as initialized.
void SDL_SetupAudioResampler(void)
{
    static SDL_InitState init;

    if (!SDL_ShouldInit(&init)) {
        return;
    }

    SetupAudioResampler();
    SDL_SetInitialized(&init, true);
}
Sint64 SDL_GetResampleRate(int src_rate, int dst_rate)
{
SDL_assert(src_rate > 0);
SDL_assert(dst_rate > 0);
Sint64 numerator = (Sint64)src_rate << 32;
Sint64 denominator = (Sint64)dst_rate;
Sint64 sample_rate = ((numerator - 1) / denominator) + 1;
SDL_assert(sample_rate > 0);
return sample_rate;
}
// Return the number of history frames used by the resampler; this is always
// RESAMPLER_MAX_PADDING_FRAMES.
int SDL_GetResamplerHistoryFrames(void)
{
    const int history_frames = RESAMPLER_MAX_PADDING_FRAMES;

    return history_frames;
}
// Return the number of padding frames for the given resample rate:
// 0 when resample_rate is 0, otherwise RESAMPLER_MAX_PADDING_FRAMES.
int SDL_GetResamplerPaddingFrames(Sint64 resample_rate)
{
    if (resample_rate == 0) {
        return 0;
    }

    return RESAMPLER_MAX_PADDING_FRAMES;
}
// Store a + b in *ret and return true, unless b is positive and the sum would
// exceed SDL_MAX_SINT64, in which case *ret is left untouched and false is returned.
// Only overflow past the maximum is checked.
SDL_FORCE_INLINE bool ResamplerAdd(Sint64 a, Sint64 b, Sint64 *ret)
{
    const bool overflows = (b > 0) && (a > SDL_MAX_SINT64 - b);

    if (!overflows) {
        *ret = a + b;
    }

    return !overflows;
}
// Store a * b in *ret and return true, unless b is positive and a exceeds
// SDL_MAX_SINT64 / b, in which case *ret is left untouched and false is returned.
// Only overflow past the maximum is checked.
SDL_FORCE_INLINE bool ResamplerMul(Sint64 a, Sint64 b, Sint64 *ret)
{
    const bool overflows = (b > 0) && (a > SDL_MAX_SINT64 / b);

    if (!overflows) {
        *ret = a * b;
    }

    return !overflows;
}
// Return the number of input frames corresponding to `output_frames` output frames:
//   ((((output_frames - 1) * resample_rate) + resample_offset) >> 32) + 1
// If the intermediate arithmetic is reported as overflowing, SDL_MAX_SINT64 is used
// instead. The shifted value is truncated to 32 bits and negative results become 0.
Sint64 SDL_GetResamplerInputFrames(Sint64 output_frames, Sint64 resample_rate, Sint64 resample_offset)
{
    Sint64 output_offset;
    bool ok = ResamplerMul(output_frames, resample_rate, &output_offset);

    if (ok) {
        ok = ResamplerAdd(output_offset, -resample_rate + resample_offset + 0x100000000, &output_offset);
    }
    if (!ok) {
        output_offset = SDL_MAX_SINT64;
    }

    const Sint64 input_frames = (Sint64)(Sint32)(output_offset >> 32);

    return SDL_max(input_frames, 0);
}
// Return the number of output frames for `input_frames` input frames and update the
// resample offset.
//
// input_offset is (input_frames << 32) - *inout_resample_offset, or SDL_MAX_SINT64
// if that arithmetic is reported as overflowing. The result is
// ceil(input_offset / resample_rate) when input_offset is positive, else 0.
// *inout_resample_offset is set to (result * resample_rate) - input_offset.
Sint64 SDL_GetResamplerOutputFrames(Sint64 input_frames, Sint64 resample_rate, Sint64 *inout_resample_offset)
{
    const Sint64 resample_offset = *inout_resample_offset;
    Sint64 input_offset;
    Sint64 output_frames = 0;
    bool ok = ResamplerMul(input_frames, 0x100000000, &input_offset);

    if (ok) {
        ok = ResamplerAdd(input_offset, -resample_offset, &input_offset);
    }
    if (!ok) {
        input_offset = SDL_MAX_SINT64;
    }

    if (input_offset > 0) {
        // Ceiling division.
        output_frames = ((input_offset - 1) / resample_rate) + 1;
    }

    *inout_resample_offset = (output_frames * resample_rate) - input_offset;

    return output_frames;
}
// Resample interleaved float audio.
//
// chans                 - channel count; selects the kernel ResampleFrame[chans - 1].
// src                   - input frames; frames before `src` are also read, because the
//                         kernels start (RESAMPLER_ZERO_CROSSINGS - 1) frames earlier.
// inframes              - number of input frames, used for the assertion and the final offset.
// dst                   - receives outframes * chans samples.
// outframes             - number of output frames to produce.
// resample_rate         - 32.32 fixed-point source step per output frame; must be positive.
// inout_resample_offset - 32.32 fixed-point starting source position; on return holds
//                         the final position minus (inframes << 32).
void SDL_ResampleAudio(int chans, const float *src, int inframes, float *dst, int outframes,
                       Sint64 resample_rate, Sint64 *inout_resample_offset)
{
    const ResampleFrameFunc resample_frame = ResampleFrame[chans - 1];
    Sint64 srcpos = *inout_resample_offset;
    int remaining;

    SDL_assert(resample_rate > 0);

    // Rebase src so that frame index `srcindex` addresses the first tap of the kernel.
    src -= (RESAMPLER_ZERO_CROSSINGS - 1) * chans;

    for (remaining = outframes; remaining > 0; --remaining, dst += chans, srcpos += resample_rate) {
        // Split the 32.32 position into an integer frame index and a 32-bit fraction.
        const int srcindex = (int)(Sint32)(srcpos >> 32);
        const Uint32 srcfraction = (Uint32)(srcpos & 0xFFFFFFFF);

        SDL_assert(srcindex >= -1 && srcindex < inframes);

        // The top bits of the fraction pick the filter phase; the rest become frac in [0, 1).
        const Cubic *filter = ResamplerFilter[srcfraction >> RESAMPLER_FILTER_INTERP_BITS];
        const float frac = (float)(srcfraction & (RESAMPLER_FILTER_INTERP_RANGE - 1)) * (1.0f / RESAMPLER_FILTER_INTERP_RANGE);

        resample_frame(&src[srcindex * chans], dst, filter, frac, chans);
    }

    *inout_resample_offset = srcpos - ((Sint64)inframes << 32);
}