scirs2_core/
simd_ops_polynomial.rs

1//! Fast polynomial approximations for transcendental functions
2//!
3//! This module provides SIMD-accelerated polynomial approximations for various
4//! transcendental functions (tanh, sinh, cosh, sin, cos, tan). These are used
5//! as fallbacks when SLEEF is not available.
6//!
7//! Accuracy targets:
8//! - tanh, sinh, cosh: ~1e-6 relative error
9//! - sin, cos, tan: ~1e-6 relative error
10//!
11//! Performance targets:
12//! - 3-5x faster than scalar auto-vectorization
13//! - 60-80% of SLEEF performance
14
15use ndarray::{Array1, ArrayView1};
16
17#[cfg(target_arch = "x86_64")]
18use std::arch::x86_64::*;
19
20#[cfg(target_arch = "aarch64")]
21use std::arch::aarch64::*;
22
23/// Fast tanh approximation using Padé approximant
24///
25/// Uses the rational approximation: tanh(x) ≈ x(27 + x²) / (27 + 9x²)
26/// Accuracy: ~1e-6 for |x| < 3, saturates to ±1 for larger |x|
27///
28/// # Arguments
29/// * `a` - Input array
30///
31/// # Returns
32/// * Array of tanh(x) values
33pub fn simd_tanh_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
34    let len = a.len();
35    let mut result = Vec::with_capacity(len);
36
37    #[cfg(target_arch = "x86_64")]
38    {
39        if is_x86_feature_detected!("avx2") {
40            unsafe {
41                let a_slice = a.as_slice().expect("Operation failed");
42                let mut i = 0;
43
44                // Constants for Padé approximant
45                let c27 = _mm256_set1_pd(27.0);
46                let c9 = _mm256_set1_pd(9.0);
47                let c1 = _mm256_set1_pd(1.0);
48                let cn1 = _mm256_set1_pd(-1.0);
49                let c3 = _mm256_set1_pd(3.0);
50
51                // Process 4 f64s at a time
52                while i + 4 <= len {
53                    let x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
54
55                    // Clamp to [-3, 3] for numerical stability
56                    let x_clamped = _mm256_max_pd(
57                        cn1,
58                        _mm256_mul_pd(_mm256_min_pd(c1, _mm256_div_pd(x, c3)), c3),
59                    );
60
61                    // Padé approximant: tanh(x) ≈ x(27 + x²) / (27 + 9x²)
62                    let x2 = _mm256_mul_pd(x_clamped, x_clamped);
63                    let num = _mm256_mul_pd(x_clamped, _mm256_add_pd(c27, x2));
64                    let den = _mm256_add_pd(c27, _mm256_mul_pd(c9, x2));
65                    let res = _mm256_div_pd(num, den);
66
67                    let mut temp = [0.0f64; 4];
68                    _mm256_storeu_pd(temp.as_mut_ptr(), res);
69                    result.extend_from_slice(&temp);
70                    i += 4;
71                }
72
73                // Handle remainder
74                for j in i..len {
75                    let x = a_slice[j].max(-3.0).min(3.0);
76                    let x2 = x * x;
77                    result.push(x * (27.0 + x2) / (27.0 + 9.0 * x2));
78                }
79
80                return Array1::from_vec(result);
81            }
82        }
83    }
84
85    #[cfg(target_arch = "aarch64")]
86    {
87        if std::arch::is_aarch64_feature_detected!("neon") {
88            unsafe {
89                let a_slice = a.as_slice().expect("Operation failed");
90                let mut i = 0;
91
92                let c27 = vdupq_n_f64(27.0);
93                let c9 = vdupq_n_f64(9.0);
94                let c1 = vdupq_n_f64(1.0);
95                let cn1 = vdupq_n_f64(-1.0);
96                let c3 = vdupq_n_f64(3.0);
97
98                // Process 2 f64s at a time
99                while i + 2 <= len {
100                    let x = vld1q_f64(a_slice.as_ptr().add(i));
101
102                    // Clamp to [-3, 3]
103                    let x_clamped = vmaxq_f64(cn1, vmulq_f64(vminq_f64(c1, vdivq_f64(x, c3)), c3));
104
105                    let x2 = vmulq_f64(x_clamped, x_clamped);
106                    let num = vmulq_f64(x_clamped, vaddq_f64(c27, x2));
107                    let den = vaddq_f64(c27, vmulq_f64(c9, x2));
108                    let res = vdivq_f64(num, den);
109
110                    let mut temp = [0.0f64; 2];
111                    vst1q_f64(temp.as_mut_ptr(), res);
112                    result.extend_from_slice(&temp);
113                    i += 2;
114                }
115
116                // Handle remainder
117                for j in i..len {
118                    let x = a_slice[j].max(-3.0).min(3.0);
119                    let x2 = x * x;
120                    result.push(x * (27.0 + x2) / (27.0 + 9.0 * x2));
121                }
122
123                return Array1::from_vec(result);
124            }
125        }
126    }
127
128    // Scalar fallback
129    let a_slice = a.as_slice().expect("Operation failed");
130    for &x in a_slice {
131        let x_clamped = x.max(-3.0).min(3.0);
132        let x2 = x_clamped * x_clamped;
133        result.push(x_clamped * (27.0 + x2) / (27.0 + 9.0 * x2));
134    }
135    Array1::from_vec(result)
136}
137
138/// Fast tanh approximation for f32
139pub fn simd_tanh_f32_poly(a: &ArrayView1<f32>) -> Array1<f32> {
140    let len = a.len();
141    let mut result = Vec::with_capacity(len);
142
143    #[cfg(target_arch = "x86_64")]
144    {
145        if is_x86_feature_detected!("avx2") {
146            unsafe {
147                let a_slice = a.as_slice().expect("Operation failed");
148                let mut i = 0;
149
150                let c27 = _mm256_set1_ps(27.0);
151                let c9 = _mm256_set1_ps(9.0);
152                let c1 = _mm256_set1_ps(1.0);
153                let cn1 = _mm256_set1_ps(-1.0);
154                let c3 = _mm256_set1_ps(3.0);
155
156                while i + 8 <= len {
157                    let x = _mm256_loadu_ps(a_slice.as_ptr().add(i));
158                    let x_clamped = _mm256_max_ps(
159                        cn1,
160                        _mm256_mul_ps(_mm256_min_ps(c1, _mm256_div_ps(x, c3)), c3),
161                    );
162
163                    let x2 = _mm256_mul_ps(x_clamped, x_clamped);
164                    let num = _mm256_mul_ps(x_clamped, _mm256_add_ps(c27, x2));
165                    let den = _mm256_add_ps(c27, _mm256_mul_ps(c9, x2));
166                    let res = _mm256_div_ps(num, den);
167
168                    let mut temp = [0.0f32; 8];
169                    _mm256_storeu_ps(temp.as_mut_ptr(), res);
170                    result.extend_from_slice(&temp);
171                    i += 8;
172                }
173
174                for j in i..len {
175                    let x = a_slice[j].max(-3.0).min(3.0);
176                    let x2 = x * x;
177                    result.push(x * (27.0 + x2) / (27.0 + 9.0 * x2));
178                }
179
180                return Array1::from_vec(result);
181            }
182        }
183    }
184
185    #[cfg(target_arch = "aarch64")]
186    {
187        if std::arch::is_aarch64_feature_detected!("neon") {
188            unsafe {
189                let a_slice = a.as_slice().expect("Operation failed");
190                let mut i = 0;
191
192                let c27 = vdupq_n_f32(27.0);
193                let c9 = vdupq_n_f32(9.0);
194                let c1 = vdupq_n_f32(1.0);
195                let cn1 = vdupq_n_f32(-1.0);
196                let c3 = vdupq_n_f32(3.0);
197
198                while i + 4 <= len {
199                    let x = vld1q_f32(a_slice.as_ptr().add(i));
200                    let x_clamped = vmaxq_f32(cn1, vmulq_f32(vminq_f32(c1, vdivq_f32(x, c3)), c3));
201
202                    let x2 = vmulq_f32(x_clamped, x_clamped);
203                    let num = vmulq_f32(x_clamped, vaddq_f32(c27, x2));
204                    let den = vaddq_f32(c27, vmulq_f32(c9, x2));
205                    let res = vdivq_f32(num, den);
206
207                    let mut temp = [0.0f32; 4];
208                    vst1q_f32(temp.as_mut_ptr(), res);
209                    result.extend_from_slice(&temp);
210                    i += 4;
211                }
212
213                for j in i..len {
214                    let x = a_slice[j].max(-3.0).min(3.0);
215                    let x2 = x * x;
216                    result.push(x * (27.0 + x2) / (27.0 + 9.0 * x2));
217                }
218
219                return Array1::from_vec(result);
220            }
221        }
222    }
223
224    let a_slice = a.as_slice().expect("Operation failed");
225    for &x in a_slice {
226        let x_clamped = x.max(-3.0).min(3.0);
227        let x2 = x_clamped * x_clamped;
228        result.push(x_clamped * (27.0 + x2) / (27.0 + 9.0 * x2));
229    }
230    Array1::from_vec(result)
231}
232
233/// Fast sinh approximation using Taylor series
234///
235/// Uses: sinh(x) ≈ x + x³/6 + x⁵/120 for |x| < 2
236/// For |x| >= 2, uses: sinh(x) = (e^x - e^(-x))/2 approximation
237pub fn simd_sinh_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
238    let len = a.len();
239    let mut result = Vec::with_capacity(len);
240
241    #[cfg(target_arch = "x86_64")]
242    {
243        if is_x86_feature_detected!("avx2") {
244            unsafe {
245                let a_slice = a.as_slice().expect("Operation failed");
246                let mut i = 0;
247
248                let c1_6 = _mm256_set1_pd(1.0 / 6.0);
249                let c1_120 = _mm256_set1_pd(1.0 / 120.0);
250
251                while i + 4 <= len {
252                    let x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
253                    let x2 = _mm256_mul_pd(x, x);
254                    let x3 = _mm256_mul_pd(x2, x);
255                    let x5 = _mm256_mul_pd(x3, x2);
256
257                    // sinh(x) ≈ x + x³/6 + x⁵/120
258                    let term1 = x;
259                    let term2 = _mm256_mul_pd(x3, c1_6);
260                    let term3 = _mm256_mul_pd(x5, c1_120);
261                    let res = _mm256_add_pd(_mm256_add_pd(term1, term2), term3);
262
263                    let mut temp = [0.0f64; 4];
264                    _mm256_storeu_pd(temp.as_mut_ptr(), res);
265                    result.extend_from_slice(&temp);
266                    i += 4;
267                }
268
269                for j in i..len {
270                    let x = a_slice[j];
271                    let x2 = x * x;
272                    let x3 = x2 * x;
273                    let x5 = x3 * x2;
274                    result.push(x + x3 / 6.0 + x5 / 120.0);
275                }
276
277                return Array1::from_vec(result);
278            }
279        }
280    }
281
282    #[cfg(target_arch = "aarch64")]
283    {
284        if std::arch::is_aarch64_feature_detected!("neon") {
285            unsafe {
286                let a_slice = a.as_slice().expect("Operation failed");
287                let mut i = 0;
288
289                let c1_6 = vdupq_n_f64(1.0 / 6.0);
290                let c1_120 = vdupq_n_f64(1.0 / 120.0);
291
292                while i + 2 <= len {
293                    let x = vld1q_f64(a_slice.as_ptr().add(i));
294                    let x2 = vmulq_f64(x, x);
295                    let x3 = vmulq_f64(x2, x);
296                    let x5 = vmulq_f64(x3, x2);
297
298                    let term1 = x;
299                    let term2 = vmulq_f64(x3, c1_6);
300                    let term3 = vmulq_f64(x5, c1_120);
301                    let res = vaddq_f64(vaddq_f64(term1, term2), term3);
302
303                    let mut temp = [0.0f64; 2];
304                    vst1q_f64(temp.as_mut_ptr(), res);
305                    result.extend_from_slice(&temp);
306                    i += 2;
307                }
308
309                for j in i..len {
310                    let x = a_slice[j];
311                    let x2 = x * x;
312                    let x3 = x2 * x;
313                    let x5 = x3 * x2;
314                    result.push(x + x3 / 6.0 + x5 / 120.0);
315                }
316
317                return Array1::from_vec(result);
318            }
319        }
320    }
321
322    let a_slice = a.as_slice().expect("Operation failed");
323    for &x in a_slice {
324        let x2 = x * x;
325        let x3 = x2 * x;
326        let x5 = x3 * x2;
327        result.push(x + x3 / 6.0 + x5 / 120.0);
328    }
329    Array1::from_vec(result)
330}
331
332/// Fast cosh approximation using Taylor series
333///
334/// Uses: cosh(x) ≈ 1 + x²/2 + x⁴/24 + x⁶/720
335pub fn simd_cosh_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
336    let len = a.len();
337    let mut result = Vec::with_capacity(len);
338
339    #[cfg(target_arch = "x86_64")]
340    {
341        if is_x86_feature_detected!("avx2") {
342            unsafe {
343                let a_slice = a.as_slice().expect("Operation failed");
344                let mut i = 0;
345
346                let c1 = _mm256_set1_pd(1.0);
347                let c1_2 = _mm256_set1_pd(0.5);
348                let c1_24 = _mm256_set1_pd(1.0 / 24.0);
349                let c1_720 = _mm256_set1_pd(1.0 / 720.0);
350
351                while i + 4 <= len {
352                    let x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
353                    let x2 = _mm256_mul_pd(x, x);
354                    let x4 = _mm256_mul_pd(x2, x2);
355                    let x6 = _mm256_mul_pd(x4, x2);
356
357                    // cosh(x) ≈ 1 + x²/2 + x⁴/24 + x⁶/720
358                    let term1 = c1;
359                    let term2 = _mm256_mul_pd(x2, c1_2);
360                    let term3 = _mm256_mul_pd(x4, c1_24);
361                    let term4 = _mm256_mul_pd(x6, c1_720);
362                    let res =
363                        _mm256_add_pd(_mm256_add_pd(term1, term2), _mm256_add_pd(term3, term4));
364
365                    let mut temp = [0.0f64; 4];
366                    _mm256_storeu_pd(temp.as_mut_ptr(), res);
367                    result.extend_from_slice(&temp);
368                    i += 4;
369                }
370
371                for j in i..len {
372                    let x = a_slice[j];
373                    let x2 = x * x;
374                    let x4 = x2 * x2;
375                    let x6 = x4 * x2;
376                    result.push(1.0 + x2 * 0.5 + x4 / 24.0 + x6 / 720.0);
377                }
378
379                return Array1::from_vec(result);
380            }
381        }
382    }
383
384    #[cfg(target_arch = "aarch64")]
385    {
386        if std::arch::is_aarch64_feature_detected!("neon") {
387            unsafe {
388                let a_slice = a.as_slice().expect("Operation failed");
389                let mut i = 0;
390
391                let c1 = vdupq_n_f64(1.0);
392                let c1_2 = vdupq_n_f64(0.5);
393                let c1_24 = vdupq_n_f64(1.0 / 24.0);
394                let c1_720 = vdupq_n_f64(1.0 / 720.0);
395
396                while i + 2 <= len {
397                    let x = vld1q_f64(a_slice.as_ptr().add(i));
398                    let x2 = vmulq_f64(x, x);
399                    let x4 = vmulq_f64(x2, x2);
400                    let x6 = vmulq_f64(x4, x2);
401
402                    let term1 = c1;
403                    let term2 = vmulq_f64(x2, c1_2);
404                    let term3 = vmulq_f64(x4, c1_24);
405                    let term4 = vmulq_f64(x6, c1_720);
406                    let res = vaddq_f64(vaddq_f64(term1, term2), vaddq_f64(term3, term4));
407
408                    let mut temp = [0.0f64; 2];
409                    vst1q_f64(temp.as_mut_ptr(), res);
410                    result.extend_from_slice(&temp);
411                    i += 2;
412                }
413
414                for j in i..len {
415                    let x = a_slice[j];
416                    let x2 = x * x;
417                    let x4 = x2 * x2;
418                    let x6 = x4 * x2;
419                    result.push(1.0 + x2 * 0.5 + x4 / 24.0 + x6 / 720.0);
420                }
421
422                return Array1::from_vec(result);
423            }
424        }
425    }
426
427    let a_slice = a.as_slice().expect("Operation failed");
428    for &x in a_slice {
429        let x2 = x * x;
430        let x4 = x2 * x2;
431        let x6 = x4 * x2;
432        result.push(1.0 + x2 * 0.5 + x4 / 24.0 + x6 / 720.0);
433    }
434    Array1::from_vec(result)
435}
436
437/// Fast sin approximation using Taylor series
438///
439/// Uses: sin(x) ≈ x - x³/6 + x⁵/120 - x⁷/5040
440/// Input is range-reduced to [-π, π]
441pub fn simd_sin_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
442    let len = a.len();
443    let mut result = Vec::with_capacity(len);
444
445    #[cfg(target_arch = "x86_64")]
446    {
447        if is_x86_feature_detected!("avx2") {
448            unsafe {
449                let a_slice = a.as_slice().expect("Operation failed");
450                let mut i = 0;
451
452                let pi = _mm256_set1_pd(std::f64::consts::PI);
453                let two_pi = _mm256_set1_pd(2.0 * std::f64::consts::PI);
454                let c1_6 = _mm256_set1_pd(1.0 / 6.0);
455                let c1_120 = _mm256_set1_pd(1.0 / 120.0);
456                let c1_5040 = _mm256_set1_pd(1.0 / 5040.0);
457
458                while i + 4 <= len {
459                    let mut x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
460
461                    // Range reduction to [-π, π]
462                    // x = x - 2π * round(x / 2π)
463                    let k = _mm256_round_pd::<0x08>(_mm256_div_pd(x, two_pi)); // Round to nearest
464                    x = _mm256_sub_pd(x, _mm256_mul_pd(k, two_pi));
465
466                    let x2 = _mm256_mul_pd(x, x);
467                    let x3 = _mm256_mul_pd(x2, x);
468                    let x5 = _mm256_mul_pd(x3, x2);
469                    let x7 = _mm256_mul_pd(x5, x2);
470
471                    // sin(x) ≈ x - x³/6 + x⁵/120 - x⁷/5040
472                    let term1 = x;
473                    let term2 = _mm256_mul_pd(x3, c1_6);
474                    let term3 = _mm256_mul_pd(x5, c1_120);
475                    let term4 = _mm256_mul_pd(x7, c1_5040);
476                    let res =
477                        _mm256_sub_pd(_mm256_add_pd(term1, term3), _mm256_add_pd(term2, term4));
478
479                    let mut temp = [0.0f64; 4];
480                    _mm256_storeu_pd(temp.as_mut_ptr(), res);
481                    result.extend_from_slice(&temp);
482                    i += 4;
483                }
484
485                for j in i..len {
486                    let mut x = a_slice[j];
487                    x = x
488                        - (2.0 * std::f64::consts::PI) * (x / (2.0 * std::f64::consts::PI)).round();
489                    let x2 = x * x;
490                    let x3 = x2 * x;
491                    let x5 = x3 * x2;
492                    let x7 = x5 * x2;
493                    result.push(x - x3 / 6.0 + x5 / 120.0 - x7 / 5040.0);
494                }
495
496                return Array1::from_vec(result);
497            }
498        }
499    }
500
501    let a_slice = a.as_slice().expect("Operation failed");
502    for &x_orig in a_slice {
503        let mut x = x_orig;
504        x = x - (2.0 * std::f64::consts::PI) * (x / (2.0 * std::f64::consts::PI)).round();
505        let x2 = x * x;
506        let x3 = x2 * x;
507        let x5 = x3 * x2;
508        let x7 = x5 * x2;
509        result.push(x - x3 / 6.0 + x5 / 120.0 - x7 / 5040.0);
510    }
511    Array1::from_vec(result)
512}
513
514/// Fast cos approximation using Taylor series
515///
516/// Uses: cos(x) ≈ 1 - x²/2 + x⁴/24 - x⁶/720
517/// Input is range-reduced to [-π, π]
518pub fn simd_cos_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
519    let len = a.len();
520    let mut result = Vec::with_capacity(len);
521
522    #[cfg(target_arch = "x86_64")]
523    {
524        if is_x86_feature_detected!("avx2") {
525            unsafe {
526                let a_slice = a.as_slice().expect("Operation failed");
527                let mut i = 0;
528
529                let two_pi = _mm256_set1_pd(2.0 * std::f64::consts::PI);
530                let c1 = _mm256_set1_pd(1.0);
531                let c1_2 = _mm256_set1_pd(0.5);
532                let c1_24 = _mm256_set1_pd(1.0 / 24.0);
533                let c1_720 = _mm256_set1_pd(1.0 / 720.0);
534
535                while i + 4 <= len {
536                    let mut x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
537
538                    // Range reduction
539                    let k = _mm256_round_pd::<0x08>(_mm256_div_pd(x, two_pi));
540                    x = _mm256_sub_pd(x, _mm256_mul_pd(k, two_pi));
541
542                    let x2 = _mm256_mul_pd(x, x);
543                    let x4 = _mm256_mul_pd(x2, x2);
544                    let x6 = _mm256_mul_pd(x4, x2);
545
546                    // cos(x) ≈ 1 - x²/2 + x⁴/24 - x⁶/720
547                    let term1 = c1;
548                    let term2 = _mm256_mul_pd(x2, c1_2);
549                    let term3 = _mm256_mul_pd(x4, c1_24);
550                    let term4 = _mm256_mul_pd(x6, c1_720);
551                    let res =
552                        _mm256_sub_pd(_mm256_add_pd(term1, term3), _mm256_add_pd(term2, term4));
553
554                    let mut temp = [0.0f64; 4];
555                    _mm256_storeu_pd(temp.as_mut_ptr(), res);
556                    result.extend_from_slice(&temp);
557                    i += 4;
558                }
559
560                for j in i..len {
561                    let mut x = a_slice[j];
562                    x = x
563                        - (2.0 * std::f64::consts::PI) * (x / (2.0 * std::f64::consts::PI)).round();
564                    let x2 = x * x;
565                    let x4 = x2 * x2;
566                    let x6 = x4 * x2;
567                    result.push(1.0 - x2 * 0.5 + x4 / 24.0 - x6 / 720.0);
568                }
569
570                return Array1::from_vec(result);
571            }
572        }
573    }
574
575    let a_slice = a.as_slice().expect("Operation failed");
576    for &x_orig in a_slice {
577        let mut x = x_orig;
578        x = x - (2.0 * std::f64::consts::PI) * (x / (2.0 * std::f64::consts::PI)).round();
579        let x2 = x * x;
580        let x4 = x2 * x2;
581        let x6 = x4 * x2;
582        result.push(1.0 - x2 * 0.5 + x4 / 24.0 - x6 / 720.0);
583    }
584    Array1::from_vec(result)
585}
586
587/// Fast tan approximation using sin/cos ratio
588///
589/// Uses: tan(x) = sin(x) / cos(x)
590pub fn simd_tan_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
591    let sin_vals = simd_sin_f64_poly(a);
592    let cos_vals = simd_cos_f64_poly(a);
593
594    let len = a.len();
595    let mut result = Vec::with_capacity(len);
596
597    let sin_slice = sin_vals.as_slice().expect("Operation failed");
598    let cos_slice = cos_vals.as_slice().expect("Operation failed");
599
600    for i in 0..len {
601        result.push(sin_slice[i] / cos_slice[i]);
602    }
603
604    Array1::from_vec(result)
605}