1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/*
* Copyright 2010,2011,2012 Reality Jockey, Ltd.
* info@rjdj.me
* http://rjdj.me/
*
* This file is part of ZenGarden.
*
* ZenGarden is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ZenGarden is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with ZenGarden. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include <float.h>
#include "ArrayArithmetic.h"
#include "DspReciprocalSqrt.h"
#include "PdGraph.h"
message::Object *DspReciprocalSqrt::new_object(pd::Message *init_message, PdGraph *graph) {
return new DspReciprocalSqrt(init_message, graph);
}
DspReciprocalSqrt::DspReciprocalSqrt(pd::Message *init_message, PdGraph *graph) : DspObject(0, 1, 0, 1, graph) {
process_function = &processSignal;
}
DspReciprocalSqrt::~DspReciprocalSqrt() {
// nothing to do
}
void DspReciprocalSqrt::processSignal(DspObject *dspObject, int fromIndex, int toIndex) {
// [rsqrt~] takes no messages, so the full block will be computed every time
DspReciprocalSqrt *d = reinterpret_cast<DspReciprocalSqrt *>(dspObject);
#if __ARM_NEON__
float *inBuff = d->dspBufferAtInlet[0];
float *outBuff = d->dspBufferAtOutlet[0];
float32x4_t inVec, outVec;
float32x4_t zeroVec = vdupq_n_f32(FLT_MIN);
while (toIndex) {
inVec = vld1q_f32(inBuff);
inVec = vmaxq_f32(inVec, zeroVec);
outVec = vrsqrteq_f32(inVec);
vst1q_f32((float32_t *) outBuff, outVec);
toIndex -= 4;
inBuff += 4;
outBuff += 4;
}
#elif __SSE__
// NOTE: for all non-positive numbers, this routine will output a very large number (not Inf) == 1/sqrt(FLT_MIN)
float *inBuff = d->dspBufferAtInlet[0];
float *outBuff = d->dspBufferAtOutlet[0];
__m128 inVec, outVec;
__m128 zeroVec = _mm_set1_ps(FLT_MIN);
while (toIndex) {
inVec = _mm_load_ps(inBuff); // unaligned load must be used because inBuff could point anywhere
// ensure that all inputs are positive, max(FLT_MIN, inVec), preventing divide-by-zero
inVec = _mm_max_ps(inVec, zeroVec);
outVec = _mm_rsqrt_ps(inVec);
// aligned store may be used because outBuff always points to the beginning of the output buffer
_mm_store_ps(outBuff, outVec);
toIndex -= 4;
inBuff += 4;
outBuff += 4;
}
#else
// http://en.wikipedia.org/wiki/Fast_inverse_square_root
int j;
float y;
for (int i = 0; i < toIndex; ++i) {
float f = dspBufferAtInlet[0][i];
if (f <= 0.0f) {
dspBufferAtOutlet0[i] = 0.0f;
} else {
y = f;
j = *((long *) &y);
j = 0x5f375a86 - (j >> 1);
y = *((float *) &j);
dspBufferAtOutlet0[i] = y * (1.5f - ( 0.5f * f * y * y ));
}
}
#endif
}