1use ndarray::{Array1, ArrayView1};
16
17#[cfg(target_arch = "x86_64")]
18use std::arch::x86_64::*;
19
20#[cfg(target_arch = "aarch64")]
21use std::arch::aarch64::*;
22
23pub fn simd_tanh_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
34 let len = a.len();
35 let mut result = Vec::with_capacity(len);
36
37 #[cfg(target_arch = "x86_64")]
38 {
39 if is_x86_feature_detected!("avx2") {
40 unsafe {
41 let a_slice = a.as_slice().expect("Operation failed");
42 let mut i = 0;
43
44 let c27 = _mm256_set1_pd(27.0);
46 let c9 = _mm256_set1_pd(9.0);
47 let c1 = _mm256_set1_pd(1.0);
48 let cn1 = _mm256_set1_pd(-1.0);
49 let c3 = _mm256_set1_pd(3.0);
50
51 while i + 4 <= len {
53 let x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
54
55 let x_clamped = _mm256_max_pd(
57 cn1,
58 _mm256_mul_pd(_mm256_min_pd(c1, _mm256_div_pd(x, c3)), c3),
59 );
60
61 let x2 = _mm256_mul_pd(x_clamped, x_clamped);
63 let num = _mm256_mul_pd(x_clamped, _mm256_add_pd(c27, x2));
64 let den = _mm256_add_pd(c27, _mm256_mul_pd(c9, x2));
65 let res = _mm256_div_pd(num, den);
66
67 let mut temp = [0.0f64; 4];
68 _mm256_storeu_pd(temp.as_mut_ptr(), res);
69 result.extend_from_slice(&temp);
70 i += 4;
71 }
72
73 for j in i..len {
75 let x = a_slice[j].max(-3.0).min(3.0);
76 let x2 = x * x;
77 result.push(x * (27.0 + x2) / (27.0 + 9.0 * x2));
78 }
79
80 return Array1::from_vec(result);
81 }
82 }
83 }
84
85 #[cfg(target_arch = "aarch64")]
86 {
87 if std::arch::is_aarch64_feature_detected!("neon") {
88 unsafe {
89 let a_slice = a.as_slice().expect("Operation failed");
90 let mut i = 0;
91
92 let c27 = vdupq_n_f64(27.0);
93 let c9 = vdupq_n_f64(9.0);
94 let c1 = vdupq_n_f64(1.0);
95 let cn1 = vdupq_n_f64(-1.0);
96 let c3 = vdupq_n_f64(3.0);
97
98 while i + 2 <= len {
100 let x = vld1q_f64(a_slice.as_ptr().add(i));
101
102 let x_clamped = vmaxq_f64(cn1, vmulq_f64(vminq_f64(c1, vdivq_f64(x, c3)), c3));
104
105 let x2 = vmulq_f64(x_clamped, x_clamped);
106 let num = vmulq_f64(x_clamped, vaddq_f64(c27, x2));
107 let den = vaddq_f64(c27, vmulq_f64(c9, x2));
108 let res = vdivq_f64(num, den);
109
110 let mut temp = [0.0f64; 2];
111 vst1q_f64(temp.as_mut_ptr(), res);
112 result.extend_from_slice(&temp);
113 i += 2;
114 }
115
116 for j in i..len {
118 let x = a_slice[j].max(-3.0).min(3.0);
119 let x2 = x * x;
120 result.push(x * (27.0 + x2) / (27.0 + 9.0 * x2));
121 }
122
123 return Array1::from_vec(result);
124 }
125 }
126 }
127
128 let a_slice = a.as_slice().expect("Operation failed");
130 for &x in a_slice {
131 let x_clamped = x.max(-3.0).min(3.0);
132 let x2 = x_clamped * x_clamped;
133 result.push(x_clamped * (27.0 + x2) / (27.0 + 9.0 * x2));
134 }
135 Array1::from_vec(result)
136}
137
138pub fn simd_tanh_f32_poly(a: &ArrayView1<f32>) -> Array1<f32> {
140 let len = a.len();
141 let mut result = Vec::with_capacity(len);
142
143 #[cfg(target_arch = "x86_64")]
144 {
145 if is_x86_feature_detected!("avx2") {
146 unsafe {
147 let a_slice = a.as_slice().expect("Operation failed");
148 let mut i = 0;
149
150 let c27 = _mm256_set1_ps(27.0);
151 let c9 = _mm256_set1_ps(9.0);
152 let c1 = _mm256_set1_ps(1.0);
153 let cn1 = _mm256_set1_ps(-1.0);
154 let c3 = _mm256_set1_ps(3.0);
155
156 while i + 8 <= len {
157 let x = _mm256_loadu_ps(a_slice.as_ptr().add(i));
158 let x_clamped = _mm256_max_ps(
159 cn1,
160 _mm256_mul_ps(_mm256_min_ps(c1, _mm256_div_ps(x, c3)), c3),
161 );
162
163 let x2 = _mm256_mul_ps(x_clamped, x_clamped);
164 let num = _mm256_mul_ps(x_clamped, _mm256_add_ps(c27, x2));
165 let den = _mm256_add_ps(c27, _mm256_mul_ps(c9, x2));
166 let res = _mm256_div_ps(num, den);
167
168 let mut temp = [0.0f32; 8];
169 _mm256_storeu_ps(temp.as_mut_ptr(), res);
170 result.extend_from_slice(&temp);
171 i += 8;
172 }
173
174 for j in i..len {
175 let x = a_slice[j].max(-3.0).min(3.0);
176 let x2 = x * x;
177 result.push(x * (27.0 + x2) / (27.0 + 9.0 * x2));
178 }
179
180 return Array1::from_vec(result);
181 }
182 }
183 }
184
185 #[cfg(target_arch = "aarch64")]
186 {
187 if std::arch::is_aarch64_feature_detected!("neon") {
188 unsafe {
189 let a_slice = a.as_slice().expect("Operation failed");
190 let mut i = 0;
191
192 let c27 = vdupq_n_f32(27.0);
193 let c9 = vdupq_n_f32(9.0);
194 let c1 = vdupq_n_f32(1.0);
195 let cn1 = vdupq_n_f32(-1.0);
196 let c3 = vdupq_n_f32(3.0);
197
198 while i + 4 <= len {
199 let x = vld1q_f32(a_slice.as_ptr().add(i));
200 let x_clamped = vmaxq_f32(cn1, vmulq_f32(vminq_f32(c1, vdivq_f32(x, c3)), c3));
201
202 let x2 = vmulq_f32(x_clamped, x_clamped);
203 let num = vmulq_f32(x_clamped, vaddq_f32(c27, x2));
204 let den = vaddq_f32(c27, vmulq_f32(c9, x2));
205 let res = vdivq_f32(num, den);
206
207 let mut temp = [0.0f32; 4];
208 vst1q_f32(temp.as_mut_ptr(), res);
209 result.extend_from_slice(&temp);
210 i += 4;
211 }
212
213 for j in i..len {
214 let x = a_slice[j].max(-3.0).min(3.0);
215 let x2 = x * x;
216 result.push(x * (27.0 + x2) / (27.0 + 9.0 * x2));
217 }
218
219 return Array1::from_vec(result);
220 }
221 }
222 }
223
224 let a_slice = a.as_slice().expect("Operation failed");
225 for &x in a_slice {
226 let x_clamped = x.max(-3.0).min(3.0);
227 let x2 = x_clamped * x_clamped;
228 result.push(x_clamped * (27.0 + x2) / (27.0 + 9.0 * x2));
229 }
230 Array1::from_vec(result)
231}
232
233pub fn simd_sinh_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
238 let len = a.len();
239 let mut result = Vec::with_capacity(len);
240
241 #[cfg(target_arch = "x86_64")]
242 {
243 if is_x86_feature_detected!("avx2") {
244 unsafe {
245 let a_slice = a.as_slice().expect("Operation failed");
246 let mut i = 0;
247
248 let c1_6 = _mm256_set1_pd(1.0 / 6.0);
249 let c1_120 = _mm256_set1_pd(1.0 / 120.0);
250
251 while i + 4 <= len {
252 let x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
253 let x2 = _mm256_mul_pd(x, x);
254 let x3 = _mm256_mul_pd(x2, x);
255 let x5 = _mm256_mul_pd(x3, x2);
256
257 let term1 = x;
259 let term2 = _mm256_mul_pd(x3, c1_6);
260 let term3 = _mm256_mul_pd(x5, c1_120);
261 let res = _mm256_add_pd(_mm256_add_pd(term1, term2), term3);
262
263 let mut temp = [0.0f64; 4];
264 _mm256_storeu_pd(temp.as_mut_ptr(), res);
265 result.extend_from_slice(&temp);
266 i += 4;
267 }
268
269 for j in i..len {
270 let x = a_slice[j];
271 let x2 = x * x;
272 let x3 = x2 * x;
273 let x5 = x3 * x2;
274 result.push(x + x3 / 6.0 + x5 / 120.0);
275 }
276
277 return Array1::from_vec(result);
278 }
279 }
280 }
281
282 #[cfg(target_arch = "aarch64")]
283 {
284 if std::arch::is_aarch64_feature_detected!("neon") {
285 unsafe {
286 let a_slice = a.as_slice().expect("Operation failed");
287 let mut i = 0;
288
289 let c1_6 = vdupq_n_f64(1.0 / 6.0);
290 let c1_120 = vdupq_n_f64(1.0 / 120.0);
291
292 while i + 2 <= len {
293 let x = vld1q_f64(a_slice.as_ptr().add(i));
294 let x2 = vmulq_f64(x, x);
295 let x3 = vmulq_f64(x2, x);
296 let x5 = vmulq_f64(x3, x2);
297
298 let term1 = x;
299 let term2 = vmulq_f64(x3, c1_6);
300 let term3 = vmulq_f64(x5, c1_120);
301 let res = vaddq_f64(vaddq_f64(term1, term2), term3);
302
303 let mut temp = [0.0f64; 2];
304 vst1q_f64(temp.as_mut_ptr(), res);
305 result.extend_from_slice(&temp);
306 i += 2;
307 }
308
309 for j in i..len {
310 let x = a_slice[j];
311 let x2 = x * x;
312 let x3 = x2 * x;
313 let x5 = x3 * x2;
314 result.push(x + x3 / 6.0 + x5 / 120.0);
315 }
316
317 return Array1::from_vec(result);
318 }
319 }
320 }
321
322 let a_slice = a.as_slice().expect("Operation failed");
323 for &x in a_slice {
324 let x2 = x * x;
325 let x3 = x2 * x;
326 let x5 = x3 * x2;
327 result.push(x + x3 / 6.0 + x5 / 120.0);
328 }
329 Array1::from_vec(result)
330}
331
332pub fn simd_cosh_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
336 let len = a.len();
337 let mut result = Vec::with_capacity(len);
338
339 #[cfg(target_arch = "x86_64")]
340 {
341 if is_x86_feature_detected!("avx2") {
342 unsafe {
343 let a_slice = a.as_slice().expect("Operation failed");
344 let mut i = 0;
345
346 let c1 = _mm256_set1_pd(1.0);
347 let c1_2 = _mm256_set1_pd(0.5);
348 let c1_24 = _mm256_set1_pd(1.0 / 24.0);
349 let c1_720 = _mm256_set1_pd(1.0 / 720.0);
350
351 while i + 4 <= len {
352 let x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
353 let x2 = _mm256_mul_pd(x, x);
354 let x4 = _mm256_mul_pd(x2, x2);
355 let x6 = _mm256_mul_pd(x4, x2);
356
357 let term1 = c1;
359 let term2 = _mm256_mul_pd(x2, c1_2);
360 let term3 = _mm256_mul_pd(x4, c1_24);
361 let term4 = _mm256_mul_pd(x6, c1_720);
362 let res =
363 _mm256_add_pd(_mm256_add_pd(term1, term2), _mm256_add_pd(term3, term4));
364
365 let mut temp = [0.0f64; 4];
366 _mm256_storeu_pd(temp.as_mut_ptr(), res);
367 result.extend_from_slice(&temp);
368 i += 4;
369 }
370
371 for j in i..len {
372 let x = a_slice[j];
373 let x2 = x * x;
374 let x4 = x2 * x2;
375 let x6 = x4 * x2;
376 result.push(1.0 + x2 * 0.5 + x4 / 24.0 + x6 / 720.0);
377 }
378
379 return Array1::from_vec(result);
380 }
381 }
382 }
383
384 #[cfg(target_arch = "aarch64")]
385 {
386 if std::arch::is_aarch64_feature_detected!("neon") {
387 unsafe {
388 let a_slice = a.as_slice().expect("Operation failed");
389 let mut i = 0;
390
391 let c1 = vdupq_n_f64(1.0);
392 let c1_2 = vdupq_n_f64(0.5);
393 let c1_24 = vdupq_n_f64(1.0 / 24.0);
394 let c1_720 = vdupq_n_f64(1.0 / 720.0);
395
396 while i + 2 <= len {
397 let x = vld1q_f64(a_slice.as_ptr().add(i));
398 let x2 = vmulq_f64(x, x);
399 let x4 = vmulq_f64(x2, x2);
400 let x6 = vmulq_f64(x4, x2);
401
402 let term1 = c1;
403 let term2 = vmulq_f64(x2, c1_2);
404 let term3 = vmulq_f64(x4, c1_24);
405 let term4 = vmulq_f64(x6, c1_720);
406 let res = vaddq_f64(vaddq_f64(term1, term2), vaddq_f64(term3, term4));
407
408 let mut temp = [0.0f64; 2];
409 vst1q_f64(temp.as_mut_ptr(), res);
410 result.extend_from_slice(&temp);
411 i += 2;
412 }
413
414 for j in i..len {
415 let x = a_slice[j];
416 let x2 = x * x;
417 let x4 = x2 * x2;
418 let x6 = x4 * x2;
419 result.push(1.0 + x2 * 0.5 + x4 / 24.0 + x6 / 720.0);
420 }
421
422 return Array1::from_vec(result);
423 }
424 }
425 }
426
427 let a_slice = a.as_slice().expect("Operation failed");
428 for &x in a_slice {
429 let x2 = x * x;
430 let x4 = x2 * x2;
431 let x6 = x4 * x2;
432 result.push(1.0 + x2 * 0.5 + x4 / 24.0 + x6 / 720.0);
433 }
434 Array1::from_vec(result)
435}
436
437pub fn simd_sin_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
442 let len = a.len();
443 let mut result = Vec::with_capacity(len);
444
445 #[cfg(target_arch = "x86_64")]
446 {
447 if is_x86_feature_detected!("avx2") {
448 unsafe {
449 let a_slice = a.as_slice().expect("Operation failed");
450 let mut i = 0;
451
452 let pi = _mm256_set1_pd(std::f64::consts::PI);
453 let two_pi = _mm256_set1_pd(2.0 * std::f64::consts::PI);
454 let c1_6 = _mm256_set1_pd(1.0 / 6.0);
455 let c1_120 = _mm256_set1_pd(1.0 / 120.0);
456 let c1_5040 = _mm256_set1_pd(1.0 / 5040.0);
457
458 while i + 4 <= len {
459 let mut x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
460
461 let k = _mm256_round_pd::<0x08>(_mm256_div_pd(x, two_pi)); x = _mm256_sub_pd(x, _mm256_mul_pd(k, two_pi));
465
466 let x2 = _mm256_mul_pd(x, x);
467 let x3 = _mm256_mul_pd(x2, x);
468 let x5 = _mm256_mul_pd(x3, x2);
469 let x7 = _mm256_mul_pd(x5, x2);
470
471 let term1 = x;
473 let term2 = _mm256_mul_pd(x3, c1_6);
474 let term3 = _mm256_mul_pd(x5, c1_120);
475 let term4 = _mm256_mul_pd(x7, c1_5040);
476 let res =
477 _mm256_sub_pd(_mm256_add_pd(term1, term3), _mm256_add_pd(term2, term4));
478
479 let mut temp = [0.0f64; 4];
480 _mm256_storeu_pd(temp.as_mut_ptr(), res);
481 result.extend_from_slice(&temp);
482 i += 4;
483 }
484
485 for j in i..len {
486 let mut x = a_slice[j];
487 x = x
488 - (2.0 * std::f64::consts::PI) * (x / (2.0 * std::f64::consts::PI)).round();
489 let x2 = x * x;
490 let x3 = x2 * x;
491 let x5 = x3 * x2;
492 let x7 = x5 * x2;
493 result.push(x - x3 / 6.0 + x5 / 120.0 - x7 / 5040.0);
494 }
495
496 return Array1::from_vec(result);
497 }
498 }
499 }
500
501 let a_slice = a.as_slice().expect("Operation failed");
502 for &x_orig in a_slice {
503 let mut x = x_orig;
504 x = x - (2.0 * std::f64::consts::PI) * (x / (2.0 * std::f64::consts::PI)).round();
505 let x2 = x * x;
506 let x3 = x2 * x;
507 let x5 = x3 * x2;
508 let x7 = x5 * x2;
509 result.push(x - x3 / 6.0 + x5 / 120.0 - x7 / 5040.0);
510 }
511 Array1::from_vec(result)
512}
513
514pub fn simd_cos_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
519 let len = a.len();
520 let mut result = Vec::with_capacity(len);
521
522 #[cfg(target_arch = "x86_64")]
523 {
524 if is_x86_feature_detected!("avx2") {
525 unsafe {
526 let a_slice = a.as_slice().expect("Operation failed");
527 let mut i = 0;
528
529 let two_pi = _mm256_set1_pd(2.0 * std::f64::consts::PI);
530 let c1 = _mm256_set1_pd(1.0);
531 let c1_2 = _mm256_set1_pd(0.5);
532 let c1_24 = _mm256_set1_pd(1.0 / 24.0);
533 let c1_720 = _mm256_set1_pd(1.0 / 720.0);
534
535 while i + 4 <= len {
536 let mut x = _mm256_loadu_pd(a_slice.as_ptr().add(i));
537
538 let k = _mm256_round_pd::<0x08>(_mm256_div_pd(x, two_pi));
540 x = _mm256_sub_pd(x, _mm256_mul_pd(k, two_pi));
541
542 let x2 = _mm256_mul_pd(x, x);
543 let x4 = _mm256_mul_pd(x2, x2);
544 let x6 = _mm256_mul_pd(x4, x2);
545
546 let term1 = c1;
548 let term2 = _mm256_mul_pd(x2, c1_2);
549 let term3 = _mm256_mul_pd(x4, c1_24);
550 let term4 = _mm256_mul_pd(x6, c1_720);
551 let res =
552 _mm256_sub_pd(_mm256_add_pd(term1, term3), _mm256_add_pd(term2, term4));
553
554 let mut temp = [0.0f64; 4];
555 _mm256_storeu_pd(temp.as_mut_ptr(), res);
556 result.extend_from_slice(&temp);
557 i += 4;
558 }
559
560 for j in i..len {
561 let mut x = a_slice[j];
562 x = x
563 - (2.0 * std::f64::consts::PI) * (x / (2.0 * std::f64::consts::PI)).round();
564 let x2 = x * x;
565 let x4 = x2 * x2;
566 let x6 = x4 * x2;
567 result.push(1.0 - x2 * 0.5 + x4 / 24.0 - x6 / 720.0);
568 }
569
570 return Array1::from_vec(result);
571 }
572 }
573 }
574
575 let a_slice = a.as_slice().expect("Operation failed");
576 for &x_orig in a_slice {
577 let mut x = x_orig;
578 x = x - (2.0 * std::f64::consts::PI) * (x / (2.0 * std::f64::consts::PI)).round();
579 let x2 = x * x;
580 let x4 = x2 * x2;
581 let x6 = x4 * x2;
582 result.push(1.0 - x2 * 0.5 + x4 / 24.0 - x6 / 720.0);
583 }
584 Array1::from_vec(result)
585}
586
587pub fn simd_tan_f64_poly(a: &ArrayView1<f64>) -> Array1<f64> {
591 let sin_vals = simd_sin_f64_poly(a);
592 let cos_vals = simd_cos_f64_poly(a);
593
594 let len = a.len();
595 let mut result = Vec::with_capacity(len);
596
597 let sin_slice = sin_vals.as_slice().expect("Operation failed");
598 let cos_slice = cos_vals.as_slice().expect("Operation failed");
599
600 for i in 0..len {
601 result.push(sin_slice[i] / cos_slice[i]);
602 }
603
604 Array1::from_vec(result)
605}