1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/* WORK IN PROGRESS
#![feature(stdsimd)]
#![cfg(target_arch = "wasm32")]
use std::arch::wasm32::*;
macro_rules! _ps_const_ty {
($name:ident, $field:ident, $x:expr) => {
const $name: UnionCast = UnionCast {
$field: [$x, $x, $x, $x],
};
};
}
pub union F32x4 {
pub simd: v128,
pub array: [f32; 4],
}
pub union I32x4 {
pub simd: v128,
pub array: [i32; 4],
}
pub union F32x8 {
pub simd: __m256,
pub array: [f32; 8],
}
pub union I32x8 {
pub simd: __m256i,
pub array: [i32; 8],
}
pub union UnionCast {
pub m128: v128,
pub m128i: v128,
pub f32x4: [f32; 4],
pub i32x4: [i32; 4],
pub u32x4: [u32; 4],
}
_ps_const_ty!(PS_INV_SIGN_MASK, u32x4, !0x8000_0000);
_ps_const_ty!(PS_SIGN_MASK, u32x4, 0x8000_0000);
_ps_const_ty!(PS_1_0, f32x4, 1.0);
_ps_const_ty!(PS_0_5, f32x4, 0.5);
_ps_const_ty!(PI32_1, i32x4, 1);
_ps_const_ty!(PI32_INV_1, i32x4, !1);
_ps_const_ty!(PI32_2, i32x4, 2);
_ps_const_ty!(PI32_4, i32x4, 4);
_ps_const_ty!(PS_MINUS_CEPHES_DP1, f32x4, -0.78515625);
_ps_const_ty!(PS_MINUS_CEPHES_DP2, f32x4, -2.4187564849853515625e-4);
_ps_const_ty!(PS_MINUS_CEPHES_DP3, f32x4, -3.77489497744594108e-8);
_ps_const_ty!(PS_SINCOF_P0, f32x4, -1.9515295891E-4);
_ps_const_ty!(PS_SINCOF_P1, f32x4, 8.3321608736E-3);
_ps_const_ty!(PS_SINCOF_P2, f32x4, -1.6666654611E-1);
_ps_const_ty!(PS_COSCOF_P0, f32x4, 2.443315711809948E-005);
_ps_const_ty!(PS_COSCOF_P1, f32x4, -1.388731625493765E-003);
_ps_const_ty!(PS_COSCOF_P2, f32x4, 4.166664568298827E-002);
_ps_const_ty!(PS_CEPHES_FOPI, f32x4, 1.27323954473516); // 4 / M_PI
pub fn sinf_cosf(x: f32) -> (f32, f32) {
// expect sse2 to be available on all x86 builds
unsafe {
let (sinx, cosx) = sinf_cosf_sse2(_mm_set1_ps(x));
(_mm_cvtss_f32(sinx), _mm_cvtss_f32(cosx))
}
x.sin_cos()
}
// Based on http://gruntthepeon.free.fr/ssemath/sse_mathfun.h
pub unsafe fn sinf_cosf_sse2(x: v128) -> (v128, v128) {
let mut sign_bit_sin = x;
// take the absolute value
let mut x = _mm_and_ps(x, PS_INV_SIGN_MASK.m128);
// extract the sign bit (upper one)
sign_bit_sin = _mm_and_ps(sign_bit_sin, PS_SIGN_MASK.m128);
// scale by 4/Pi
let mut y = _mm_mul_ps(x, PS_CEPHES_FOPI.m128);
// store the integer part of y in emm2
let mut emm2 = _mm_cvttps_epi32(y);
// j=(j+1) & (~1) (see the cephes sources)
emm2 = _mm_add_epi32(emm2, PI32_1.m128i);
emm2 = _mm_and_si128(emm2, PI32_INV_1.m128i);
y = _mm_cvtepi32_ps(emm2);
let mut emm4 = emm2;
/* get the swap sign flag for the sine */
let mut emm0 = _mm_and_si128(emm2, PI32_4.m128i);
emm0 = _mm_slli_epi32(emm0, 29);
let swap_sign_bit_sin = _mm_castsi128_ps(emm0);
/* get the polynom selection mask for the sine*/
emm2 = _mm_and_si128(emm2, PI32_2.m128i);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
let poly_mask = _mm_castsi128_ps(emm2);
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
let mut xmm1 = PS_MINUS_CEPHES_DP1.m128;
let mut xmm2 = PS_MINUS_CEPHES_DP2.m128;
let mut xmm3 = PS_MINUS_CEPHES_DP3.m128;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
emm4 = _mm_sub_epi32(emm4, PI32_2.m128i);
emm4 = _mm_andnot_si128(emm4, PI32_4.m128i);
emm4 = _mm_slli_epi32(emm4, 29);
let sign_bit_cos = _mm_castsi128_ps(emm4);
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
// Evaluate the first polynom (0 <= x <= Pi/4)
let z = _mm_mul_ps(x, x);
y = PS_COSCOF_P0.m128;
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, PS_COSCOF_P1.m128);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, PS_COSCOF_P2.m128);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
let tmp = _mm_mul_ps(z, PS_0_5.m128);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, PS_1_0.m128);
// Evaluate the second polynom (Pi/4 <= x <= 0)
let mut y2 = PS_SINCOF_P0.m128;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, PS_SINCOF_P1.m128);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, PS_SINCOF_P2.m128);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
// select the correct result from the two polynoms
xmm3 = poly_mask;
let ysin2 = _mm_and_ps(xmm3, y2);
let ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y, y2);
// update the sign
(
_mm_xor_ps(xmm1, sign_bit_sin),
_mm_xor_ps(xmm2, sign_bit_cos),
)
}
pub fn simd_bits() -> usize {
{
return unsafe { simd_bits_sse2() };
}
32
}
pub unsafe fn simd_bits_sse2() -> usize {
128
}
pub unsafe fn blend_i32_sse2(lhs: v128, rhs: v128, cond: v128) -> v128 {
let d = _mm_srai_epi32(_mm_castps_si128(cond), 31);
_mm_or_si128(_mm_and_si128(d, rhs), _mm_andnot_si128(d, lhs))
}
pub unsafe fn hmin_sse2(v: v128) -> f32 {
let v = _mm_min_ps(v, _mm_shuffle_ps(v, v, 0b00_00_11_10));
let v = _mm_min_ps(v, _mm_shuffle_ps(v, v, 0b00_00_00_01));
_mm_cvtss_f32(v)
}
pub unsafe fn blend_f32_sse2(lhs: v128, rhs: v128, cond: v128) -> v128 {
let d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond), 31));
_mm_or_ps(_mm_and_ps(d, rhs), _mm_andnot_ps(d, lhs))
}
pub unsafe fn dot3_sse2(
x0: v128,
x1: v128,
y0: v128,
y1: v128,
z0: v128,
z1: v128,
) -> v128 {
let mut dot = _mm_mul_ps(x0, x1);
dot = _mm_add_ps(dot, _mm_mul_ps(y0, y1));
dot = _mm_add_ps(dot, _mm_mul_ps(z0, z1));
dot
}
*/