use ndarray::{Array1, ArrayView1};

/// Element-wise maximum of two `f32` arrays (simple scalar loop; the
/// compiler is left to auto-vectorize it).
#[allow(dead_code)]
pub fn simd_maximum_f32(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");
    let mut result = Array1::zeros(a.len());
    for i in 0..a.len() {
        result[i] = a[i].max(b[i]);
    }
    result
}

/// Element-wise maximum of two `f64` arrays (simple scalar loop).
#[allow(dead_code)]
pub fn simd_maximum_f64(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> Array1<f64> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");
    let mut result = Array1::zeros(a.len());
    for i in 0..a.len() {
        result[i] = a[i].max(b[i]);
    }
    result
}

/// Element-wise minimum of two `f32` arrays (simple scalar loop).
#[allow(dead_code)]
pub fn simd_minimum_f32(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");
    let mut result = Array1::zeros(a.len());
    for i in 0..a.len() {
        result[i] = a[i].min(b[i]);
    }
    result
}

/// Element-wise minimum of two `f64` arrays (simple scalar loop).
#[allow(dead_code)]
pub fn simd_minimum_f64(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> Array1<f64> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");
    let mut result = Array1::zeros(a.len());
    for i in 0..a.len() {
        result[i] = a[i].min(b[i]);
    }
    result
}

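// Usage sketch for the helpers above (function and value names here are
// illustrative only): the helpers take ndarray views, so owned arrays are
// borrowed with `.view()` first.
#[allow(dead_code)]
fn example_elementwise_minmax() {
    let a = Array1::from_vec(vec![1.0f32, 5.0, -3.0]);
    let b = Array1::from_vec(vec![2.0f32, 4.0, -7.0]);
    let hi = simd_maximum_f32(&a.view(), &b.view());
    let lo = simd_minimum_f32(&a.view(), &b.view());
    debug_assert_eq!(hi.to_vec(), vec![2.0, 5.0, -3.0]);
    debug_assert_eq!(lo.to_vec(), vec![1.0, 4.0, -7.0]);
}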

/// Element-wise addition of two `f32` arrays using explicit SIMD intrinsics,
/// staging each full register through a small stack buffer into a growing `Vec`.
#[allow(dead_code)]
pub fn simd_add_f32(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    let len = a.len();
    let mut result = Vec::with_capacity(len);

    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::*;

        if is_x86_feature_detected!("avx2") {
            unsafe {
                // Hoist the contiguity check out of the loop.
                let a_slice = a.as_slice().expect("array must be contiguous");
                let b_slice = b.as_slice().expect("array must be contiguous");

                let mut i = 0;
                // 256-bit AVX2 registers hold 8 f32 lanes.
                while i + 8 <= len {
                    let a_vec = _mm256_loadu_ps(a_slice.as_ptr().add(i));
                    let b_vec = _mm256_loadu_ps(b_slice.as_ptr().add(i));
                    let result_vec = _mm256_add_ps(a_vec, b_vec);

                    // Stage the register in a stack buffer, then append.
                    let mut temp = [0.0f32; 8];
                    _mm256_storeu_ps(temp.as_mut_ptr(), result_vec);
                    result.extend_from_slice(&temp);
                    i += 8;
                }

                // Scalar tail for elements that do not fill a register.
                for j in i..len {
                    result.push(a[j] + b[j]);
                }
            }
        } else if is_x86_feature_detected!("sse") {
            unsafe {
                let a_slice = a.as_slice().expect("array must be contiguous");
                let b_slice = b.as_slice().expect("array must be contiguous");

                let mut i = 0;
                // 128-bit SSE registers hold 4 f32 lanes.
                while i + 4 <= len {
                    let a_vec = _mm_loadu_ps(a_slice.as_ptr().add(i));
                    let b_vec = _mm_loadu_ps(b_slice.as_ptr().add(i));
                    let result_vec = _mm_add_ps(a_vec, b_vec);

                    let mut temp = [0.0f32; 4];
                    _mm_storeu_ps(temp.as_mut_ptr(), result_vec);
                    result.extend_from_slice(&temp);
                    i += 4;
                }

                for j in i..len {
                    result.push(a[j] + b[j]);
                }
            }
        } else {
            for i in 0..len {
                result.push(a[i] + b[i]);
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            unsafe {
                let a_slice = a.as_slice().expect("array must be contiguous");
                let b_slice = b.as_slice().expect("array must be contiguous");

                let mut i = 0;
                // 128-bit NEON registers hold 4 f32 lanes.
                while i + 4 <= len {
                    let a_vec = vld1q_f32(a_slice.as_ptr().add(i));
                    let b_vec = vld1q_f32(b_slice.as_ptr().add(i));
                    let result_vec = vaddq_f32(a_vec, b_vec);

                    let mut temp = [0.0f32; 4];
                    vst1q_f32(temp.as_mut_ptr(), result_vec);
                    result.extend_from_slice(&temp);
                    i += 4;
                }

                for j in i..len {
                    result.push(a[j] + b[j]);
                }
            }
        } else {
            for i in 0..len {
                result.push(a[i] + b[i]);
            }
        }
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        for i in 0..len {
            result.push(a[i] + b[i]);
        }
    }

    Array1::from_vec(result)
}

/// Like [`simd_add_f32`], but writes each SIMD register directly into a
/// preallocated output buffer instead of staging through a temporary array.
#[allow(dead_code)]
pub fn simd_add_f32_optimized(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    let len = a.len();

    // Preallocate the output so SIMD stores can write into it directly.
    let mut result = vec![0.0f32; len];

    let a_ptr = a.as_slice().expect("array must be contiguous").as_ptr();
    let b_ptr = b.as_slice().expect("array must be contiguous").as_ptr();
    let result_ptr = result.as_mut_ptr();

    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::*;

        if is_x86_feature_detected!("avx2") {
            unsafe {
                let mut i = 0;
                while i + 8 <= len {
                    let a_vec = _mm256_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm256_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm256_add_ps(a_vec, b_vec);

                    // Store straight into the output buffer; no temporary.
                    _mm256_storeu_ps(result_ptr.add(i), result_vec);
                    i += 8;
                }

                // Scalar tail.
                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else if is_x86_feature_detected!("sse") {
            unsafe {
                let mut i = 0;
                while i + 4 <= len {
                    let a_vec = _mm_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm_add_ps(a_vec, b_vec);

                    _mm_storeu_ps(result_ptr.add(i), result_vec);
                    i += 4;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            unsafe {
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            unsafe {
                let mut i = 0;
                while i + 4 <= len {
                    let a_vec = vld1q_f32(a_ptr.add(i));
                    let b_vec = vld1q_f32(b_ptr.add(i));
                    let result_vec = vaddq_f32(a_vec, b_vec);

                    vst1q_f32(result_ptr.add(i), result_vec);
                    i += 4;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            unsafe {
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
            }
        }
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        unsafe {
            for i in 0..len {
                *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
            }
        }
    }

    Array1::from_vec(result)
}

use std::sync::OnceLock;

/// CPU feature flags detected once on first use. Not every field is read by
/// the functions in this file, hence the `dead_code` allowance.
#[allow(dead_code)]
struct CpuFeatures {
    has_avx512f: bool,
    has_avx2: bool,
    has_sse: bool,
    has_fma: bool,
    has_neon: bool,
}

static CPU_FEATURES: OnceLock<CpuFeatures> = OnceLock::new();

/// Detects CPU features once and caches them, so hot code can branch on the
/// cached flags instead of re-running `is_*_feature_detected!` on every call.
fn get_cpu_features() -> &'static CpuFeatures {
    CPU_FEATURES.get_or_init(|| {
        #[cfg(target_arch = "x86_64")]
        {
            CpuFeatures {
                has_avx512f: std::arch::is_x86_feature_detected!("avx512f"),
                has_avx2: std::arch::is_x86_feature_detected!("avx2"),
                has_sse: std::arch::is_x86_feature_detected!("sse"),
                has_fma: std::arch::is_x86_feature_detected!("fma"),
                has_neon: false,
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            CpuFeatures {
                has_avx512f: false,
                has_avx2: false,
                has_sse: false,
                has_fma: false,
                has_neon: std::arch::is_aarch64_feature_detected!("neon"),
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            CpuFeatures {
                has_avx512f: false,
                has_avx2: false,
                has_sse: false,
                has_fma: false,
                has_neon: false,
            }
        }
    })
}
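
// Illustrative sketch (hypothetical helper, not called elsewhere): callers
// branch once on the cached snapshot rather than re-detecting features in a
// hot loop. On non-x86_64 targets the x86 flags are simply `false`.
#[allow(dead_code)]
fn example_feature_dispatch(len: usize) -> &'static str {
    let features = get_cpu_features();
    if features.has_avx2 && len >= 8 {
        "avx2"
    } else if features.has_sse && len >= 4 {
        "sse"
    } else if features.has_neon && len >= 4 {
        "neon"
    } else {
        "scalar"
    }
}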

/// Unrolled addition using the cached CPU-feature flags: a 4x-unrolled AVX2
/// main loop (32 elements per iteration) with SSE, NEON, and scalar fallbacks.
#[allow(dead_code)]
pub fn simd_add_f32_fast(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    let len = a.len();
    let mut result = vec![0.0f32; len];

    let a_ptr = a.as_slice().expect("array must be contiguous").as_ptr();
    let b_ptr = b.as_slice().expect("array must be contiguous").as_ptr();
    let result_ptr = result.as_mut_ptr();

    let features = get_cpu_features();

    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::*;

        if features.has_avx2 {
            unsafe {
                let mut i = 0;

                // Main loop: 4x unrolled, 32 elements per iteration, giving
                // the CPU independent add chains to execute in parallel.
                while i + 32 <= len {
                    let a_vec1 = _mm256_loadu_ps(a_ptr.add(i));
                    let a_vec2 = _mm256_loadu_ps(a_ptr.add(i + 8));
                    let a_vec3 = _mm256_loadu_ps(a_ptr.add(i + 16));
                    let a_vec4 = _mm256_loadu_ps(a_ptr.add(i + 24));

                    let b_vec1 = _mm256_loadu_ps(b_ptr.add(i));
                    let b_vec2 = _mm256_loadu_ps(b_ptr.add(i + 8));
                    let b_vec3 = _mm256_loadu_ps(b_ptr.add(i + 16));
                    let b_vec4 = _mm256_loadu_ps(b_ptr.add(i + 24));

                    let result_vec1 = _mm256_add_ps(a_vec1, b_vec1);
                    let result_vec2 = _mm256_add_ps(a_vec2, b_vec2);
                    let result_vec3 = _mm256_add_ps(a_vec3, b_vec3);
                    let result_vec4 = _mm256_add_ps(a_vec4, b_vec4);

                    _mm256_storeu_ps(result_ptr.add(i), result_vec1);
                    _mm256_storeu_ps(result_ptr.add(i + 8), result_vec2);
                    _mm256_storeu_ps(result_ptr.add(i + 16), result_vec3);
                    _mm256_storeu_ps(result_ptr.add(i + 24), result_vec4);

                    i += 32;
                }

                // Drain whole registers, then finish with a scalar tail.
                while i + 8 <= len {
                    let a_vec = _mm256_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm256_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm256_add_ps(a_vec, b_vec);
                    _mm256_storeu_ps(result_ptr.add(i), result_vec);
                    i += 8;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else if features.has_sse {
            unsafe {
                let mut i = 0;

                // 2x-unrolled SSE loop: 8 elements per iteration.
                while i + 8 <= len {
                    let a_vec1 = _mm_loadu_ps(a_ptr.add(i));
                    let a_vec2 = _mm_loadu_ps(a_ptr.add(i + 4));
                    let b_vec1 = _mm_loadu_ps(b_ptr.add(i));
                    let b_vec2 = _mm_loadu_ps(b_ptr.add(i + 4));

                    let result_vec1 = _mm_add_ps(a_vec1, b_vec1);
                    let result_vec2 = _mm_add_ps(a_vec2, b_vec2);

                    _mm_storeu_ps(result_ptr.add(i), result_vec1);
                    _mm_storeu_ps(result_ptr.add(i + 4), result_vec2);
                    i += 8;
                }

                while i + 4 <= len {
                    let a_vec = _mm_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm_add_ps(a_vec, b_vec);
                    _mm_storeu_ps(result_ptr.add(i), result_vec);
                    i += 4;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            unsafe {
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if features.has_neon {
            unsafe {
                let mut i = 0;

                // 2x-unrolled NEON loop: 8 elements per iteration.
                while i + 8 <= len {
                    let a_vec1 = vld1q_f32(a_ptr.add(i));
                    let a_vec2 = vld1q_f32(a_ptr.add(i + 4));
                    let b_vec1 = vld1q_f32(b_ptr.add(i));
                    let b_vec2 = vld1q_f32(b_ptr.add(i + 4));

                    let result_vec1 = vaddq_f32(a_vec1, b_vec1);
                    let result_vec2 = vaddq_f32(a_vec2, b_vec2);

                    vst1q_f32(result_ptr.add(i), result_vec1);
                    vst1q_f32(result_ptr.add(i + 4), result_vec2);
                    i += 8;
                }

                while i + 4 <= len {
                    let a_vec = vld1q_f32(a_ptr.add(i));
                    let b_vec = vld1q_f32(b_ptr.add(i));
                    let result_vec = vaddq_f32(a_vec, b_vec);
                    vst1q_f32(result_ptr.add(i), result_vec);
                    i += 4;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            unsafe {
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
            }
        }
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        unsafe {
            for i in 0..len {
                *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
            }
        }
    }

    Array1::from_vec(result)
}

/// Most aggressive x86_64 variant: alignment-aware, 8x-unrolled AVX2 loops
/// with software prefetching. Other architectures delegate to
/// [`simd_add_f32_fast`].
#[allow(dead_code)]
pub fn simd_add_f32_ultra(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
    assert_eq!(a.len(), b.len(), "Arrays must have the same length");

    let len = a.len();
    let mut result = vec![0.0f32; len];

    let a_ptr = a.as_slice().expect("array must be contiguous").as_ptr();
    let b_ptr = b.as_slice().expect("array must be contiguous").as_ptr();
    let result_ptr = result.as_mut_ptr();

    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::*;

        let features = get_cpu_features();

        if features.has_avx2 {
            unsafe {
                let mut i = 0;

                // Prefetch this many elements ahead of the current position.
                const PREFETCH_DISTANCE: usize = 256;

                // Aligned loads/stores are only valid when every buffer is
                // 32-byte aligned, which `Vec<f32>` does not guarantee.
                let a_aligned = (a_ptr as usize) % 32 == 0;
                let b_aligned = (b_ptr as usize) % 32 == 0;
                let result_aligned = (result_ptr as usize) % 32 == 0;

                if a_aligned && b_aligned && result_aligned && len >= 64 {
                    // Aligned 8x-unrolled loop: 64 elements per iteration.
                    while i + 64 <= len {
                        if i + PREFETCH_DISTANCE < len {
                            _mm_prefetch(
                                a_ptr.add(i + PREFETCH_DISTANCE) as *const i8,
                                _MM_HINT_T0,
                            );
                            _mm_prefetch(
                                b_ptr.add(i + PREFETCH_DISTANCE) as *const i8,
                                _MM_HINT_T0,
                            );
                        }

                        let a_vec1 = _mm256_load_ps(a_ptr.add(i));
                        let a_vec2 = _mm256_load_ps(a_ptr.add(i + 8));
                        let a_vec3 = _mm256_load_ps(a_ptr.add(i + 16));
                        let a_vec4 = _mm256_load_ps(a_ptr.add(i + 24));
                        let a_vec5 = _mm256_load_ps(a_ptr.add(i + 32));
                        let a_vec6 = _mm256_load_ps(a_ptr.add(i + 40));
                        let a_vec7 = _mm256_load_ps(a_ptr.add(i + 48));
                        let a_vec8 = _mm256_load_ps(a_ptr.add(i + 56));

                        let b_vec1 = _mm256_load_ps(b_ptr.add(i));
                        let b_vec2 = _mm256_load_ps(b_ptr.add(i + 8));
                        let b_vec3 = _mm256_load_ps(b_ptr.add(i + 16));
                        let b_vec4 = _mm256_load_ps(b_ptr.add(i + 24));
                        let b_vec5 = _mm256_load_ps(b_ptr.add(i + 32));
                        let b_vec6 = _mm256_load_ps(b_ptr.add(i + 40));
                        let b_vec7 = _mm256_load_ps(b_ptr.add(i + 48));
                        let b_vec8 = _mm256_load_ps(b_ptr.add(i + 56));

                        let result_vec1 = _mm256_add_ps(a_vec1, b_vec1);
                        let result_vec2 = _mm256_add_ps(a_vec2, b_vec2);
                        let result_vec3 = _mm256_add_ps(a_vec3, b_vec3);
                        let result_vec4 = _mm256_add_ps(a_vec4, b_vec4);
                        let result_vec5 = _mm256_add_ps(a_vec5, b_vec5);
                        let result_vec6 = _mm256_add_ps(a_vec6, b_vec6);
                        let result_vec7 = _mm256_add_ps(a_vec7, b_vec7);
                        let result_vec8 = _mm256_add_ps(a_vec8, b_vec8);

                        _mm256_store_ps(result_ptr.add(i), result_vec1);
                        _mm256_store_ps(result_ptr.add(i + 8), result_vec2);
                        _mm256_store_ps(result_ptr.add(i + 16), result_vec3);
                        _mm256_store_ps(result_ptr.add(i + 24), result_vec4);
                        _mm256_store_ps(result_ptr.add(i + 32), result_vec5);
                        _mm256_store_ps(result_ptr.add(i + 40), result_vec6);
                        _mm256_store_ps(result_ptr.add(i + 48), result_vec7);
                        _mm256_store_ps(result_ptr.add(i + 56), result_vec8);

                        i += 64;
                    }
                } else {
                    // Unaligned 4x-unrolled loop: 32 elements per iteration.
                    while i + 32 <= len {
                        if i + PREFETCH_DISTANCE < len {
                            _mm_prefetch(
                                a_ptr.add(i + PREFETCH_DISTANCE) as *const i8,
                                _MM_HINT_T0,
                            );
                            _mm_prefetch(
                                b_ptr.add(i + PREFETCH_DISTANCE) as *const i8,
                                _MM_HINT_T0,
                            );
                        }

                        let a_vec1 = _mm256_loadu_ps(a_ptr.add(i));
                        let a_vec2 = _mm256_loadu_ps(a_ptr.add(i + 8));
                        let a_vec3 = _mm256_loadu_ps(a_ptr.add(i + 16));
                        let a_vec4 = _mm256_loadu_ps(a_ptr.add(i + 24));

                        let b_vec1 = _mm256_loadu_ps(b_ptr.add(i));
                        let b_vec2 = _mm256_loadu_ps(b_ptr.add(i + 8));
                        let b_vec3 = _mm256_loadu_ps(b_ptr.add(i + 16));
                        let b_vec4 = _mm256_loadu_ps(b_ptr.add(i + 24));

                        let result_vec1 = _mm256_add_ps(a_vec1, b_vec1);
                        let result_vec2 = _mm256_add_ps(a_vec2, b_vec2);
                        let result_vec3 = _mm256_add_ps(a_vec3, b_vec3);
                        let result_vec4 = _mm256_add_ps(a_vec4, b_vec4);

                        _mm256_storeu_ps(result_ptr.add(i), result_vec1);
                        _mm256_storeu_ps(result_ptr.add(i + 8), result_vec2);
                        _mm256_storeu_ps(result_ptr.add(i + 16), result_vec3);
                        _mm256_storeu_ps(result_ptr.add(i + 24), result_vec4);

                        i += 32;
                    }
                }

                while i + 8 <= len {
                    let a_vec = _mm256_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm256_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm256_add_ps(a_vec, b_vec);
                    _mm256_storeu_ps(result_ptr.add(i), result_vec);
                    i += 8;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else if features.has_sse {
            unsafe {
                let mut i = 0;

                // 4x-unrolled SSE loop: 16 elements per iteration.
                while i + 16 <= len {
                    if i + 128 < len {
                        _mm_prefetch(a_ptr.add(i + 128) as *const i8, _MM_HINT_T0);
                        _mm_prefetch(b_ptr.add(i + 128) as *const i8, _MM_HINT_T0);
                    }

                    let a_vec1 = _mm_loadu_ps(a_ptr.add(i));
                    let a_vec2 = _mm_loadu_ps(a_ptr.add(i + 4));
                    let a_vec3 = _mm_loadu_ps(a_ptr.add(i + 8));
                    let a_vec4 = _mm_loadu_ps(a_ptr.add(i + 12));

                    let b_vec1 = _mm_loadu_ps(b_ptr.add(i));
                    let b_vec2 = _mm_loadu_ps(b_ptr.add(i + 4));
                    let b_vec3 = _mm_loadu_ps(b_ptr.add(i + 8));
                    let b_vec4 = _mm_loadu_ps(b_ptr.add(i + 12));

                    let result_vec1 = _mm_add_ps(a_vec1, b_vec1);
                    let result_vec2 = _mm_add_ps(a_vec2, b_vec2);
                    let result_vec3 = _mm_add_ps(a_vec3, b_vec3);
                    let result_vec4 = _mm_add_ps(a_vec4, b_vec4);

                    _mm_storeu_ps(result_ptr.add(i), result_vec1);
                    _mm_storeu_ps(result_ptr.add(i + 4), result_vec2);
                    _mm_storeu_ps(result_ptr.add(i + 8), result_vec3);
                    _mm_storeu_ps(result_ptr.add(i + 12), result_vec4);

                    i += 16;
                }

                while i + 4 <= len {
                    let a_vec = _mm_loadu_ps(a_ptr.add(i));
                    let b_vec = _mm_loadu_ps(b_ptr.add(i));
                    let result_vec = _mm_add_ps(a_vec, b_vec);
                    _mm_storeu_ps(result_ptr.add(i), result_vec);
                    i += 4;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        } else {
            // Effectively unreachable: x86_64 guarantees SSE2 as a baseline.
            // Kept as a blocked scalar loop with prefetching for completeness.
            unsafe {
                let mut i = 0;
                while i + 64 <= len {
                    if i + 256 < len {
                        _mm_prefetch(a_ptr.add(i + 256) as *const i8, _MM_HINT_T0);
                        _mm_prefetch(b_ptr.add(i + 256) as *const i8, _MM_HINT_T0);
                    }

                    for j in 0..64 {
                        *result_ptr.add(i + j) = *a_ptr.add(i + j) + *b_ptr.add(i + j);
                    }
                    i += 64;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            }
        }
    }

    #[cfg(target_arch = "x86_64")]
    {
        Array1::from_vec(result)
    }

    // Other targets reuse the NEON-aware fast path; the buffer allocated
    // above goes unused there.
    #[cfg(not(target_arch = "x86_64"))]
    {
        simd_add_f32_fast(a, b)
    }
}
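
// Sketch of the dispatch test used above (helper name is illustrative; the
// function inlines the check): `_mm256_load_ps`/`_mm256_store_ps` require
// 32-byte alignment, which `Vec<f32>` does not guarantee, so the aligned
// 8x-unrolled loop is entered only when all three buffers pass this test.
#[allow(dead_code)]
fn is_32_byte_aligned(ptr: *const f32) -> bool {
    (ptr as usize) % 32 == 0
}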

use crate::simd_aligned::AlignedVec;

/// Addition over 32-byte-aligned buffers, allowing aligned AVX2 loads and
/// stores without the runtime alignment checks of [`simd_add_f32_ultra`].
#[allow(dead_code)]
pub fn simd_add_aligned_ultra(
    a: &AlignedVec<f32>,
    b: &AlignedVec<f32>,
) -> Result<AlignedVec<f32>, &'static str> {
    if a.len() != b.len() {
        return Err("Arrays must have the same length");
    }

    let len = a.len();
    let mut result =
        AlignedVec::with_capacity(len).map_err(|_| "Failed to allocate aligned memory")?;

    unsafe {
        let a_ptr = a.as_slice().as_ptr();
        let b_ptr = b.as_slice().as_ptr();
        // The output pointer is obtained by casting away constness, since
        // only the immutable slice accessor is used here.
        let result_ptr = result.as_slice().as_ptr() as *mut f32;

        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::*;

            if is_x86_feature_detected!("avx2") {
                let mut i = 0;

                // Aligned 8x-unrolled loop: 64 elements per iteration, with
                // software prefetching 256 elements ahead.
                while i + 64 <= len {
                    if i + 256 < len {
                        _mm_prefetch(a_ptr.add(i + 256) as *const i8, _MM_HINT_T0);
                        _mm_prefetch(b_ptr.add(i + 256) as *const i8, _MM_HINT_T0);
                    }

                    for j in (0..64).step_by(8) {
                        let a_vec = _mm256_load_ps(a_ptr.add(i + j));
                        let b_vec = _mm256_load_ps(b_ptr.add(i + j));
                        let result_vec = _mm256_add_ps(a_vec, b_vec);
                        _mm256_store_ps(result_ptr.add(i + j), result_vec);
                    }

                    i += 64;
                }

                while i + 8 <= len {
                    let a_vec = _mm256_load_ps(a_ptr.add(i));
                    let b_vec = _mm256_load_ps(b_ptr.add(i));
                    let result_vec = _mm256_add_ps(a_vec, b_vec);
                    _mm256_store_ps(result_ptr.add(i), result_vec);
                    i += 8;
                }

                while i < len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                    i += 1;
                }
            } else {
                // Scalar fallback so non-AVX2 CPUs still fill the output
                // instead of returning an empty vector.
                for i in 0..len {
                    *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
                }
            }

            result.set_len(len);
        }

        #[cfg(not(target_arch = "x86_64"))]
        {
            for i in 0..len {
                *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
            }
            result.set_len(len);
        }
    }

    Ok(result)
}
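
// Usage sketch for the aligned path, assuming only the `AlignedVec` API used
// above (`with_capacity`, `as_slice`, `set_len`, `len`); the inputs are
// filled through raw pointers, mirroring how the function writes its output.
#[allow(dead_code)]
fn example_aligned_add(len: usize) -> Result<AlignedVec<f32>, &'static str> {
    let mut a = AlignedVec::with_capacity(len).map_err(|_| "Failed to allocate aligned memory")?;
    let mut b = AlignedVec::with_capacity(len).map_err(|_| "Failed to allocate aligned memory")?;
    unsafe {
        let a_ptr = a.as_slice().as_ptr() as *mut f32;
        let b_ptr = b.as_slice().as_ptr() as *mut f32;
        for i in 0..len {
            *a_ptr.add(i) = i as f32;
            *b_ptr.add(i) = (len - i) as f32;
        }
        a.set_len(len);
        b.set_len(len);
    }
    // Every output element should equal `len as f32`.
    simd_add_aligned_ultra(&a, &b)
}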

/// Element-wise addition of two `f64` arrays, delegating to ndarray's `+`
/// operator, which the compiler can auto-vectorize.
#[allow(dead_code)]
pub fn simd_add_f64(a: &ArrayView1<f64>, b: &ArrayView1<f64>) -> Array1<f64> {
    // `&view + &view` already yields an owned `Array1`, so no extra
    // `.to_owned()` copy is needed.
    a + b
}
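
#[cfg(test)]
mod tests {
    // Minimal sanity checks (a sketch): every SIMD variant must agree with
    // plain scalar addition, including a tail that does not fill a full
    // register; the odd length 19 exercises the remainder loops.
    use super::*;

    #[test]
    fn add_variants_match_scalar() {
        let a: Array1<f32> = (0..19).map(|i| i as f32).collect();
        let b: Array1<f32> = (0..19).map(|i| (i * 3) as f32).collect();
        let expected: Vec<f32> = a.iter().zip(b.iter()).map(|(x, y)| x + y).collect();

        assert_eq!(simd_add_f32(&a.view(), &b.view()).to_vec(), expected);
        assert_eq!(simd_add_f32_optimized(&a.view(), &b.view()).to_vec(), expected);
        assert_eq!(simd_add_f32_fast(&a.view(), &b.view()).to_vec(), expected);
        assert_eq!(simd_add_f32_ultra(&a.view(), &b.view()).to_vec(), expected);
    }
}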