Skip to main content

simd_vector/
lib.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2
3mod vec4;
4mod vec8;
5
6pub use vec4::Vec4;
7pub use vec8::Vec8;
8
9/// Precise transcendental functions (≤ 1.0 ULP). Import with `use simd_vector::precise::*`.
10pub mod precise {
11    pub use crate::vec4::Vec4;
12    pub use crate::vec8::Vec8;
13
14    /// Precise math functions (≤ 1.0 ULP accuracy).
15    pub trait PreciseMath {
16        fn sin(self) -> Self;
17        fn cos(self) -> Self;
18        fn exp(self) -> Self;
19        fn sum(self) -> f32;
20        fn dot(self, other: Self) -> f32;
21    }
22}
23
24/// Fast transcendental functions (≤ 3.5 ULP). Import with `use simd_vector::fast::*`.
25pub mod fast {
26    pub use crate::vec4::Vec4;
27    pub use crate::vec8::Vec8;
28
29    /// Fast math functions (≤ 3.5 ULP transcendentals, f32 reductions).
30    pub trait FastMath {
31        fn sin(self) -> Self;
32        fn cos(self) -> Self;
33        fn exp(self) -> Self;
34        fn sum(self) -> f32;
35        fn dot(self, other: Self) -> f32;
36    }
37}
38
39// SLEEF constants for single-precision trig range reduction (u10 variants, 3-part split)
40const PI_A2F: f32 = 3.1414794921875;
41const PI_B2F: f32 = 0.00011315941810607910156;
42const PI_C2F: f32 = 1.9841872589410058936e-09;
43const TRIGRANGEMAX2F: f32 = 125.0;
44const M_1_PI_F: f32 = std::f32::consts::FRAC_1_PI;
45
46// SLEEF constants for medium-range trig reduction (u35 variants, 4-part split)
47const PI_AF: f32 = 3.140625;
48const PI_BF: f32 = 0.0009670257568359375;
49const PI_CF: f32 = 6.2771141529083251953e-07;
50const PI_DF: f32 = 1.2154201256553420762e-10;
51const TRIGRANGEMAXF: f32 = 39000.0;
52
53// SLEEF constants for exp
54const R_LN2F: f32 = 1.442695040888963407359924681001892137426645954152985934135449406931;
55const L2UF: f32 = 0.693145751953125;
56const L2LF: f32 = 1.428606765330187045e-06;
57
58// SLEEF Payne-Hanek table for single-precision pi reduction
59#[allow(clippy::excessive_precision)]
60const REMPI_TABSP: [f32; 416] = [
61    f32::from_bits(0x3E22F980),
62    f32::from_bits(0x335B9390),
63    f32::from_bits(0x2782A540),
64    f32::from_bits(0x9B762A0C),
65    f32::from_bits(0x3D0BE60C),
66    f32::from_bits(0x31DC9C88),
67    f32::from_bits(0x24A94FE0),
68    f32::from_bits(0x191D5F48),
69    f32::from_bits(0x3D0BE60C),
70    f32::from_bits(0x31DC9C88),
71    f32::from_bits(0x24A94FE0),
72    f32::from_bits(0x191D5F48),
73    f32::from_bits(0x3B3E60DC),
74    f32::from_bits(0xAED8DDF4),
75    f32::from_bits(0xA33580F4),
76    f32::from_bits(0x980A82E1),
77    f32::from_bits(0x3B3E60DC),
78    f32::from_bits(0xAED8DDF4),
79    f32::from_bits(0xA33580F4),
80    f32::from_bits(0x980A82E1),
81    f32::from_bits(0x3B3E60DC),
82    f32::from_bits(0xAED8DDF4),
83    f32::from_bits(0xA33580F4),
84    f32::from_bits(0x980A82E1),
85    f32::from_bits(0x3B3E60DC),
86    f32::from_bits(0xAED8DDF4),
87    f32::from_bits(0xA33580F4),
88    f32::from_bits(0x980A82E1),
89    f32::from_bits(0x3A79836C),
90    f32::from_bits(0x2F139104),
91    f32::from_bits(0x23A53F84),
92    f32::from_bits(0x17EAFA3F),
93    f32::from_bits(0x3A79836C),
94    f32::from_bits(0x2F139104),
95    f32::from_bits(0x23A53F84),
96    f32::from_bits(0x17EAFA3F),
97    f32::from_bits(0x39F306DC),
98    f32::from_bits(0x2D9C8828),
99    f32::from_bits(0x2294FE14),
100    f32::from_bits(0x96282E0B),
101    f32::from_bits(0x39660DB8),
102    f32::from_bits(0x2D9C8828),
103    f32::from_bits(0x2294FE14),
104    f32::from_bits(0x96282E0B),
105    f32::from_bits(0x38CC1B70),
106    f32::from_bits(0x2D9C8828),
107    f32::from_bits(0x2294FE14),
108    f32::from_bits(0x96282E0B),
109    f32::from_bits(0x381836E4),
110    f32::from_bits(0x2C644150),
111    f32::from_bits(0x2127F09C),
112    f32::from_bits(0x15AFA3EA),
113    f32::from_bits(0x36C1B724),
114    f32::from_bits(0x2BC882A4),
115    f32::from_bits(0x201FC274),
116    f32::from_bits(0x14BE8FAA),
117    f32::from_bits(0x36C1B724),
118    f32::from_bits(0x2BC882A4),
119    f32::from_bits(0x201FC274),
120    f32::from_bits(0x14BE8FAA),
121    f32::from_bits(0x36C1B724),
122    f32::from_bits(0x2BC882A4),
123    f32::from_bits(0x201FC274),
124    f32::from_bits(0x14BE8FAA),
125    f32::from_bits(0x36036E4C),
126    f32::from_bits(0x2B110548),
127    f32::from_bits(0x201FC274),
128    f32::from_bits(0x14BE8FAA),
129    f32::from_bits(0x335B9390),
130    f32::from_bits(0x2782A540),
131    f32::from_bits(0x9B762A0C),
132    f32::from_bits(0x0EFA9A6F),
133    f32::from_bits(0x335B9390),
134    f32::from_bits(0x2782A540),
135    f32::from_bits(0x9B762A0C),
136    f32::from_bits(0x0EFA9A6F),
137    f32::from_bits(0x335B9390),
138    f32::from_bits(0x2782A540),
139    f32::from_bits(0x9B762A0C),
140    f32::from_bits(0x0EFA9A6F),
141    f32::from_bits(0x335B9390),
142    f32::from_bits(0x2782A540),
143    f32::from_bits(0x9B762A0C),
144    f32::from_bits(0x0EFA9A6F),
145    f32::from_bits(0x335B9390),
146    f32::from_bits(0x2782A540),
147    f32::from_bits(0x9B762A0C),
148    f32::from_bits(0x0EFA9A6F),
149    f32::from_bits(0x335B9390),
150    f32::from_bits(0x2782A540),
151    f32::from_bits(0x9B762A0C),
152    f32::from_bits(0x0EFA9A6F),
153    f32::from_bits(0x32B72720),
154    f32::from_bits(0x2782A540),
155    f32::from_bits(0x9B762A0C),
156    f32::from_bits(0x0EFA9A6F),
157    f32::from_bits(0x31DC9C88),
158    f32::from_bits(0x24A94FE0),
159    f32::from_bits(0x191D5F48),
160    f32::from_bits(0x8C2CB224),
161    f32::from_bits(0x31DC9C88),
162    f32::from_bits(0x24A94FE0),
163    f32::from_bits(0x191D5F48),
164    f32::from_bits(0x8C2CB224),
165    f32::from_bits(0x31393910),
166    f32::from_bits(0x24A94FE0),
167    f32::from_bits(0x191D5F48),
168    f32::from_bits(0x8C2CB224),
169    f32::from_bits(0x3064E440),
170    f32::from_bits(0x24A94FE0),
171    f32::from_bits(0x191D5F48),
172    f32::from_bits(0x8C2CB224),
173    f32::from_bits(0x3064E440),
174    f32::from_bits(0x24A94FE0),
175    f32::from_bits(0x191D5F48),
176    f32::from_bits(0x8C2CB224),
177    f32::from_bits(0x2FC9C880),
178    f32::from_bits(0x24A94FE0),
179    f32::from_bits(0x191D5F48),
180    f32::from_bits(0x8C2CB224),
181    f32::from_bits(0x2F139104),
182    f32::from_bits(0x23A53F84),
183    f32::from_bits(0x17EAFA3C),
184    f32::from_bits(0x0CA9A6EE),
185    f32::from_bits(0x2D9C8828),
186    f32::from_bits(0x2294FE14),
187    f32::from_bits(0x96282E08),
188    f32::from_bits(0x8B32C890),
189    f32::from_bits(0x2D9C8828),
190    f32::from_bits(0x2294FE14),
191    f32::from_bits(0x96282E08),
192    f32::from_bits(0x8B32C890),
193    f32::from_bits(0x2D9C8828),
194    f32::from_bits(0x2294FE14),
195    f32::from_bits(0x96282E08),
196    f32::from_bits(0x8B32C890),
197    f32::from_bits(0x2C644150),
198    f32::from_bits(0x2127F09C),
199    f32::from_bits(0x15AFA3E8),
200    f32::from_bits(0x0A9A6EE0),
201    f32::from_bits(0x2C644150),
202    f32::from_bits(0x2127F09C),
203    f32::from_bits(0x15AFA3E8),
204    f32::from_bits(0x0A9A6EE0),
205    f32::from_bits(0x2C644150),
206    f32::from_bits(0x2127F09C),
207    f32::from_bits(0x15AFA3E8),
208    f32::from_bits(0x0A9A6EE0),
209    f32::from_bits(0x2BC882A4),
210    f32::from_bits(0x201FC274),
211    f32::from_bits(0x14BE8FA8),
212    f32::from_bits(0x09537703),
213    f32::from_bits(0x2B110548),
214    f32::from_bits(0x201FC274),
215    f32::from_bits(0x14BE8FA8),
216    f32::from_bits(0x09537703),
217    f32::from_bits(0x29882A54),
218    f32::from_bits(0x9B762A0C),
219    f32::from_bits(0x0EFA9A6C),
220    f32::from_bits(0x03B81B6C),
221    f32::from_bits(0x29882A54),
222    f32::from_bits(0x9B762A0C),
223    f32::from_bits(0x0EFA9A6C),
224    f32::from_bits(0x03B81B6C),
225    f32::from_bits(0x29882A54),
226    f32::from_bits(0x9B762A0C),
227    f32::from_bits(0x0EFA9A6C),
228    f32::from_bits(0x03B81B6C),
229    f32::from_bits(0x2782A540),
230    f32::from_bits(0x9B762A0C),
231    f32::from_bits(0x0EFA9A6C),
232    f32::from_bits(0x03B81B6C),
233    f32::from_bits(0x2782A540),
234    f32::from_bits(0x9B762A0C),
235    f32::from_bits(0x0EFA9A6C),
236    f32::from_bits(0x03B81B6C),
237    f32::from_bits(0x2782A540),
238    f32::from_bits(0x9B762A0C),
239    f32::from_bits(0x0EFA9A6C),
240    f32::from_bits(0x03B81B6C),
241    f32::from_bits(0x2782A540),
242    f32::from_bits(0x9B762A0C),
243    f32::from_bits(0x0EFA9A6C),
244    f32::from_bits(0x03B81B6C),
245    f32::from_bits(0x24A94FE0),
246    f32::from_bits(0x191D5F48),
247    f32::from_bits(0x8C2CB224),
248    f32::from_bits(0x0006DB15),
249    f32::from_bits(0x24A94FE0),
250    f32::from_bits(0x191D5F48),
251    f32::from_bits(0x8C2CB224),
252    f32::from_bits(0x0006DB15),
253    f32::from_bits(0x24A94FE0),
254    f32::from_bits(0x191D5F48),
255    f32::from_bits(0x8C2CB224),
256    f32::from_bits(0x0006DB15),
257    f32::from_bits(0x24A94FE0),
258    f32::from_bits(0x191D5F48),
259    f32::from_bits(0x8C2CB224),
260    f32::from_bits(0x0006DB15),
261    f32::from_bits(0x24A94FE0),
262    f32::from_bits(0x191D5F48),
263    f32::from_bits(0x8C2CB224),
264    f32::from_bits(0x0006DB15),
265    f32::from_bits(0x24A94FE0),
266    f32::from_bits(0x191D5F48),
267    f32::from_bits(0x8C2CB224),
268    f32::from_bits(0x0006DB15),
269    f32::from_bits(0x23A53F84),
270    f32::from_bits(0x17EAFA3C),
271    f32::from_bits(0x0CA9A6EC),
272    f32::from_bits(0x0181B6C5),
273    f32::from_bits(0x23A53F84),
274    f32::from_bits(0x17EAFA3C),
275    f32::from_bits(0x0CA9A6EC),
276    f32::from_bits(0x0181B6C5),
277    f32::from_bits(0x2294FE14),
278    f32::from_bits(0x96282E08),
279    f32::from_bits(0x8B32C890),
280    f32::from_bits(0x0006DB15),
281    f32::from_bits(0x2294FE14),
282    f32::from_bits(0x96282E08),
283    f32::from_bits(0x8B32C890),
284    f32::from_bits(0x0006DB15),
285    f32::from_bits(0x2127F09C),
286    f32::from_bits(0x15AFA3E8),
287    f32::from_bits(0x0A9A6EE0),
288    f32::from_bits(0x0006DB15),
289    f32::from_bits(0x2127F09C),
290    f32::from_bits(0x15AFA3E8),
291    f32::from_bits(0x0A9A6EE0),
292    f32::from_bits(0x0006DB15),
293    f32::from_bits(0x2127F09C),
294    f32::from_bits(0x15AFA3E8),
295    f32::from_bits(0x0A9A6EE0),
296    f32::from_bits(0x0006DB15),
297    f32::from_bits(0x201FC274),
298    f32::from_bits(0x14BE8FA8),
299    f32::from_bits(0x09537700),
300    f32::from_bits(0x0006DB15),
301    f32::from_bits(0x201FC274),
302    f32::from_bits(0x14BE8FA8),
303    f32::from_bits(0x09537700),
304    f32::from_bits(0x0006DB15),
305    f32::from_bits(0x1EFE13AC),
306    f32::from_bits(0x91382B2C),
307    f32::from_bits(0x8508FC90),
308    f32::from_bits(0x800004EB),
309    f32::from_bits(0x1EFE13AC),
310    f32::from_bits(0x91382B2C),
311    f32::from_bits(0x8508FC90),
312    f32::from_bits(0x800004EB),
313    f32::from_bits(0x1EFE13AC),
314    f32::from_bits(0x91382B2C),
315    f32::from_bits(0x8508FC90),
316    f32::from_bits(0x800004EB),
317    f32::from_bits(0x1E7C2758),
318    f32::from_bits(0x91382B2C),
319    f32::from_bits(0x8508FC90),
320    f32::from_bits(0x800004EB),
321    f32::from_bits(0x1DF84EB0),
322    f32::from_bits(0x91382B2C),
323    f32::from_bits(0x8508FC90),
324    f32::from_bits(0x800004EB),
325    f32::from_bits(0x3D709D5C),
326    f32::from_bits(0x3251F534),
327    f32::from_bits(0x265DC0D8),
328    f32::from_bits(0x1B58A566),
329    f32::from_bits(0x3CE13ABC),
330    f32::from_bits(0x31A3EA68),
331    f32::from_bits(0x265DC0D8),
332    f32::from_bits(0x1B58A566),
333    f32::from_bits(0x3C42757C),
334    f32::from_bits(0x308FA9A4),
335    f32::from_bits(0x25BB81B4),
336    f32::from_bits(0x1AB14ACD),
337    f32::from_bits(0x3B84EAF8),
338    f32::from_bits(0x308FA9A4),
339    f32::from_bits(0x25BB81B4),
340    f32::from_bits(0x1AB14ACD),
341    f32::from_bits(0x391D5F48),
342    f32::from_bits(0xAC2CB224),
343    f32::from_bits(0x1E5B6294),
344    f32::from_bits(0x12CC9E22),
345    f32::from_bits(0x391D5F48),
346    f32::from_bits(0xAC2CB224),
347    f32::from_bits(0x1E5B6294),
348    f32::from_bits(0x12CC9E22),
349    f32::from_bits(0x391D5F48),
350    f32::from_bits(0xAC2CB224),
351    f32::from_bits(0x1E5B6294),
352    f32::from_bits(0x12CC9E22),
353    f32::from_bits(0x391D5F48),
354    f32::from_bits(0xAC2CB224),
355    f32::from_bits(0x1E5B6294),
356    f32::from_bits(0x12CC9E22),
357    f32::from_bits(0x391D5F48),
358    f32::from_bits(0xAC2CB224),
359    f32::from_bits(0x1E5B6294),
360    f32::from_bits(0x12CC9E22),
361    f32::from_bits(0x37EAFA3C),
362    f32::from_bits(0x2CA9A6EC),
363    f32::from_bits(0x2181B6C4),
364    f32::from_bits(0x1615993C),
365    f32::from_bits(0x37EAFA3C),
366    f32::from_bits(0x2CA9A6EC),
367    f32::from_bits(0x2181B6C4),
368    f32::from_bits(0x1615993C),
369    f32::from_bits(0x37EAFA3C),
370    f32::from_bits(0x2CA9A6EC),
371    f32::from_bits(0x2181B6C4),
372    f32::from_bits(0x1615993C),
373    f32::from_bits(0x3755F47C),
374    f32::from_bits(0x2BA69BB8),
375    f32::from_bits(0x1E5B6294),
376    f32::from_bits(0x12CC9E22),
377    f32::from_bits(0x36ABE8F8),
378    f32::from_bits(0x2BA69BB8),
379    f32::from_bits(0x1E5B6294),
380    f32::from_bits(0x12CC9E22),
381    f32::from_bits(0x35AFA3E8),
382    f32::from_bits(0x2A9A6EE0),
383    f32::from_bits(0x1E5B6294),
384    f32::from_bits(0x12CC9E22),
385    f32::from_bits(0x35AFA3E8),
386    f32::from_bits(0x2A9A6EE0),
387    f32::from_bits(0x1E5B6294),
388    f32::from_bits(0x12CC9E22),
389    f32::from_bits(0x34BE8FA8),
390    f32::from_bits(0x29537700),
391    f32::from_bits(0x1E5B6294),
392    f32::from_bits(0x12CC9E22),
393    f32::from_bits(0x34BE8FA8),
394    f32::from_bits(0x29537700),
395    f32::from_bits(0x1E5B6294),
396    f32::from_bits(0x12CC9E22),
397    f32::from_bits(0x33FA3EA4),
398    f32::from_bits(0x28A6EE04),
399    f32::from_bits(0x1DB6C528),
400    f32::from_bits(0x12CC9E22),
401    f32::from_bits(0x33FA3EA4),
402    f32::from_bits(0x28A6EE04),
403    f32::from_bits(0x1DB6C528),
404    f32::from_bits(0x12CC9E22),
405    f32::from_bits(0x33747D4C),
406    f32::from_bits(0x279BB818),
407    f32::from_bits(0x1CDB14AC),
408    f32::from_bits(0x10C9E21D),
409    f32::from_bits(0x32E8FA98),
410    f32::from_bits(0x279BB818),
411    f32::from_bits(0x1CDB14AC),
412    f32::from_bits(0x10C9E21D),
413    f32::from_bits(0x3251F534),
414    f32::from_bits(0x265DC0D8),
415    f32::from_bits(0x1B58A564),
416    f32::from_bits(0x1013C439),
417    f32::from_bits(0x31A3EA68),
418    f32::from_bits(0x265DC0D8),
419    f32::from_bits(0x1B58A564),
420    f32::from_bits(0x1013C439),
421    f32::from_bits(0x308FA9A4),
422    f32::from_bits(0x25BB81B4),
423    f32::from_bits(0x1AB14ACC),
424    f32::from_bits(0x0E9E21C8),
425    f32::from_bits(0x308FA9A4),
426    f32::from_bits(0x25BB81B4),
427    f32::from_bits(0x1AB14ACC),
428    f32::from_bits(0x0E9E21C8),
429    f32::from_bits(0x2EFA9A6C),
430    f32::from_bits(0x23B81B6C),
431    f32::from_bits(0x1725664C),
432    f32::from_bits(0x0C443904),
433    f32::from_bits(0x2EFA9A6C),
434    f32::from_bits(0x23B81B6C),
435    f32::from_bits(0x1725664C),
436    f32::from_bits(0x0C443904),
437    f32::from_bits(0x2EFA9A6C),
438    f32::from_bits(0x23B81B6C),
439    f32::from_bits(0x1725664C),
440    f32::from_bits(0x0C443904),
441    f32::from_bits(0x2EFA9A6C),
442    f32::from_bits(0x23B81B6C),
443    f32::from_bits(0x1725664C),
444    f32::from_bits(0x0C443904),
445    f32::from_bits(0x2E7534DC),
446    f32::from_bits(0x22E06DB0),
447    f32::from_bits(0x1725664C),
448    f32::from_bits(0x0C443904),
449    f32::from_bits(0x2DEA69BC),
450    f32::from_bits(0xA17C9274),
451    f32::from_bits(0x95D4CD84),
452    f32::from_bits(0x8ADE37DF),
453    f32::from_bits(0x2D54D374),
454    f32::from_bits(0x2240DB60),
455    f32::from_bits(0x1725664C),
456    f32::from_bits(0x0C443904),
457    f32::from_bits(0x2CA9A6EC),
458    f32::from_bits(0x2181B6C4),
459    f32::from_bits(0x1615993C),
460    f32::from_bits(0x09872084),
461    f32::from_bits(0x2BA69BB8),
462    f32::from_bits(0x1E5B6294),
463    f32::from_bits(0x12CC9E20),
464    f32::from_bits(0x07641080),
465    f32::from_bits(0x2BA69BB8),
466    f32::from_bits(0x1E5B6294),
467    f32::from_bits(0x12CC9E20),
468    f32::from_bits(0x07641080),
469    f32::from_bits(0x2A9A6EE0),
470    f32::from_bits(0x1E5B6294),
471    f32::from_bits(0x12CC9E20),
472    f32::from_bits(0x07641080),
473    f32::from_bits(0x00000000),
474    f32::from_bits(0x00000000),
475    f32::from_bits(0x00000000),
476    f32::from_bits(0x00000000),
477];
478
479/// Scalar Payne-Hanek pi reduction for a single `f32`.
480///
481/// Returns `(reduced_hi, reduced_lo, quadrant)` where `reduced_hi + reduced_lo`
482/// is `a` reduced modulo pi/2, and `quadrant` encodes which quadrant of the
483/// original angle was occupied.
484fn rempif_scalar(a: f32) -> (f32, f32, i32) {
485    // vilogb2k: extract exponent
486    let mut ex = ((a.to_bits() >> 23) & 0xFF) as i32 - 0x7F;
487    ex -= 25;
488    let mut a = a;
489
490    // Scale down very large inputs (original exponent > 90) to prevent
491    // f32 intermediate precision loss: without this, a * table[idx] products
492    // are so large that rempisubf cannot extract quadrant/fractional bits.
493    if ex > 90 - 25 {
494        // vldexp3: add -64 to exponent field
495        a = f32::from_bits((a.to_bits() as i32 + (-64i32 << 23)) as u32);
496    }
497    // Clamp negative to 0: vandnot with arithmetic right shift
498    if ex < 0 {
499        ex = 0;
500    }
501    let idx = (ex as usize) * 4;
502
503    // First multiply
504    let (mut x_hi, mut x_lo) = df_mul_f_f(a, REMPI_TABSP[idx]);
505    // rempisubf: extract quadrant
506    let y = (x_hi * 4.0).round();
507    let mut q = (y - (x_hi.round()) * 4.0) as i32;
508    x_hi -= y * 0.25;
509    let (x_hi2, x_lo2) = df_normalize(x_hi, x_lo);
510    x_hi = x_hi2;
511    x_lo = x_lo2;
512
513    // Second multiply
514    let (y_hi, y_lo) = df_mul_f_f(a, REMPI_TABSP[idx + 1]);
515    let (x_hi2, x_lo2) = df_add2_f2_f2(x_hi, x_lo, y_hi, y_lo);
516    x_hi = x_hi2;
517    x_lo = x_lo2;
518
519    let y2 = (x_hi * 4.0).round();
520    q += (y2 - (x_hi.round()) * 4.0) as i32;
521    x_hi -= y2 * 0.25;
522    let (x_hi2, x_lo2) = df_normalize(x_hi, x_lo);
523    x_hi = x_hi2;
524    x_lo = x_lo2;
525
526    // Third and fourth multiply
527    let (y_hi, y_lo) = (REMPI_TABSP[idx + 2], REMPI_TABSP[idx + 3]);
528    let (y_hi2, y_lo2) = df_mul_f2_f(y_hi, y_lo, a);
529    let (x_hi2, x_lo2) = df_add2_f2_f2(x_hi, x_lo, y_hi2, y_lo2);
530    x_hi = x_hi2;
531    x_lo = x_lo2;
532    let (x_hi2, x_lo2) = df_normalize(x_hi, x_lo);
533    x_hi = x_hi2;
534    x_lo = x_lo2;
535
536    // Multiply by 2*pi
537    const TWO_PI_HI: f32 = 3.1415927410125732422 * 2.0;
538    const TWO_PI_LO: f32 = -8.7422776573475857731e-08 * 2.0;
539    let (r_hi, r_lo) = df_mul_f2_f2(x_hi, x_lo, TWO_PI_HI, TWO_PI_LO);
540
541    // For very small inputs, return a directly
542    if a.abs() < 0.7 {
543        return (a, 0.0, 0);
544    }
545
546    (r_hi, r_lo, q)
547}
548
549/// Normalizes a double-float pair so that `hi` carries the leading bits.
550///
551/// Returns `(s, err)` where `s = a_hi + a_lo` and `err` captures the rounding error.
552#[inline(always)]
553fn df_normalize(a_hi: f32, a_lo: f32) -> (f32, f32) {
554    let s = a_hi + a_lo;
555    (s, (a_hi - s) + a_lo)
556}
557
558/// Error-free addition of two double-float pairs: `(a_hi, a_lo) + (b_hi, b_lo)`.
559#[inline(always)]
560fn df_add2_f2_f2(a_hi: f32, a_lo: f32, b_hi: f32, b_lo: f32) -> (f32, f32) {
561    let s = a_hi + b_hi;
562    let v = s - a_hi;
563    let t = (a_hi - (s - v)) + (b_hi - v);
564    (s, t + a_lo + b_lo)
565}
566
567/// Error-free multiplication of two `f32` values using FMA. Returns `(product, error)`.
568#[inline(always)]
569fn df_mul_f_f(a: f32, b: f32) -> (f32, f32) {
570    let s = a * b;
571    let t = a.mul_add(b, -s);
572    (s, t)
573}
574
575/// Multiplies a double-float pair `(a_hi, a_lo)` by a single `f32`.
576#[inline(always)]
577fn df_mul_f2_f(a_hi: f32, a_lo: f32, b: f32) -> (f32, f32) {
578    let s = a_hi * b;
579    let t = a_hi.mul_add(b, -s) + a_lo * b;
580    (s, t)
581}
582
583/// Multiplies two double-float pairs: `(a_hi, a_lo) * (b_hi, b_lo)`.
584#[inline(always)]
585fn df_mul_f2_f2(a_hi: f32, a_lo: f32, b_hi: f32, b_lo: f32) -> (f32, f32) {
586    let s = a_hi * b_hi;
587    let t = a_hi.mul_add(b_hi, -s) + a_hi * b_lo + a_lo * b_hi;
588    (s, t)
589}
590
591#[cfg(test)]
592mod tests;