#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

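/// SIMD-accelerated operations on `f32` slices.
///
/// Public entry points check CPU features at runtime via [`SimdCapabilities`]
/// and dispatch to an AVX2 or SSE4.1 implementation when available, falling
/// back to portable scalar code otherwise.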
pub struct SimdF32Ops;

impl SimdF32Ops {
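    /// Computes the dot product of `a` and `b`.
    ///
    /// # Panics
    ///
    /// Panics if the slices have different lengths.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// // Assumes the type is in scope, e.g. `use your_crate::simd::SimdF32Ops;`
    /// // (`your_crate::simd` is a placeholder path).
    /// let a = [1.0_f32, 2.0, 3.0];
    /// let b = [4.0_f32, 5.0, 6.0];
    /// assert_eq!(SimdF32Ops::dot_product(&a, &b), 32.0);
    /// ```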
    pub fn dot_product(a: &[f32], b: &[f32]) -> f32 {
        assert_eq!(a.len(), b.len(), "Vectors must have the same length");

        if a.is_empty() {
            return 0.0;
        }

        #[cfg(target_arch = "x86_64")]
        {
            if SimdCapabilities::has_avx2() {
                return unsafe { Self::avx2_dot_product(a, b) };
            } else if SimdCapabilities::has_sse41() {
                return unsafe { Self::sse_dot_product(a, b) };
            }
        }

        Self::scalar_dot_product(a, b)
    }

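    /// AVX2 path: multiplies and accumulates eight lanes per iteration, sums
    /// the lanes, then handles the remainder with scalar code.
    ///
    /// # Safety
    ///
    /// The caller must ensure the CPU supports AVX2.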
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn avx2_dot_product(a: &[f32], b: &[f32]) -> f32 {
        const CHUNK_SIZE: usize = 8;
        let mut result = _mm256_setzero_ps();

        let chunks_a = a.chunks_exact(CHUNK_SIZE);
        let chunks_b = b.chunks_exact(CHUNK_SIZE);
        let remainder_a = chunks_a.remainder();
        let remainder_b = chunks_b.remainder();

        for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
            let va = _mm256_loadu_ps(chunk_a.as_ptr());
            let vb = _mm256_loadu_ps(chunk_b.as_ptr());
            let prod = _mm256_mul_ps(va, vb);
            result = _mm256_add_ps(result, prod);
        }

        let mut sum_array = [0.0f32; 8];
        _mm256_storeu_ps(sum_array.as_mut_ptr(), result);
        let mut final_result = sum_array.iter().sum::<f32>();

        for (a_val, b_val) in remainder_a.iter().zip(remainder_b.iter()) {
            final_result += a_val * b_val;
        }

        final_result
    }

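    /// SSE4.1 path: same structure as the AVX2 version, four lanes at a time.
    ///
    /// # Safety
    ///
    /// The caller must ensure the CPU supports SSE4.1.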
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "sse4.1")]
    unsafe fn sse_dot_product(a: &[f32], b: &[f32]) -> f32 {
        const CHUNK_SIZE: usize = 4;
        let mut result = _mm_setzero_ps();

        let chunks_a = a.chunks_exact(CHUNK_SIZE);
        let chunks_b = b.chunks_exact(CHUNK_SIZE);
        let remainder_a = chunks_a.remainder();
        let remainder_b = chunks_b.remainder();

        for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
            let va = _mm_loadu_ps(chunk_a.as_ptr());
            let vb = _mm_loadu_ps(chunk_b.as_ptr());
            let prod = _mm_mul_ps(va, vb);
            result = _mm_add_ps(result, prod);
        }

        let mut sum_array = [0.0f32; 4];
        _mm_storeu_ps(sum_array.as_mut_ptr(), result);
        let mut final_result = sum_array.iter().sum::<f32>();

        for (a_val, b_val) in remainder_a.iter().zip(remainder_b.iter()) {
            final_result += a_val * b_val;
        }

        final_result
    }

    fn scalar_dot_product(a: &[f32], b: &[f32]) -> f32 {
        const CHUNK_SIZE: usize = 8;
        let mut result = 0.0f32;

        let chunks_a = a.chunks_exact(CHUNK_SIZE);
        let chunks_b = b.chunks_exact(CHUNK_SIZE);
        let remainder_a = chunks_a.remainder();
        let remainder_b = chunks_b.remainder();

        for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
            for i in 0..CHUNK_SIZE {
                result += chunk_a[i] * chunk_b[i];
            }
        }

        for (a_val, b_val) in remainder_a.iter().zip(remainder_b.iter()) {
            result += a_val * b_val;
        }

        result
    }

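    /// Element-wise addition: `result[i] = a[i] + b[i]`.
    ///
    /// # Panics
    ///
    /// Panics if the three slices do not all have the same length.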
    pub fn add_vectors(a: &[f32], b: &[f32], result: &mut [f32]) {
        assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
        assert_eq!(
            a.len(),
            result.len(),
            "Result vector must have the same length as inputs"
        );

        #[cfg(target_arch = "x86_64")]
        {
            if SimdCapabilities::has_avx2() {
                unsafe { Self::avx2_add_vectors(a, b, result) };
                return;
            } else if SimdCapabilities::has_sse41() {
                unsafe { Self::sse_add_vectors(a, b, result) };
                return;
            }
        }

        Self::scalar_add_vectors(a, b, result);
    }

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn avx2_add_vectors(a: &[f32], b: &[f32], result: &mut [f32]) {
        const CHUNK_SIZE: usize = 8;
        let len = a.len();
        let chunk_count = len / CHUNK_SIZE;
        let remainder_start = chunk_count * CHUNK_SIZE;

        for i in 0..chunk_count {
            let start = i * CHUNK_SIZE;
            let va = _mm256_loadu_ps(a.as_ptr().add(start));
            let vb = _mm256_loadu_ps(b.as_ptr().add(start));
            let vresult = _mm256_add_ps(va, vb);
            _mm256_storeu_ps(result.as_mut_ptr().add(start), vresult);
        }

        for i in remainder_start..len {
            result[i] = a[i] + b[i];
        }
    }

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "sse4.1")]
    unsafe fn sse_add_vectors(a: &[f32], b: &[f32], result: &mut [f32]) {
        const CHUNK_SIZE: usize = 4;
        let len = a.len();
        let chunk_count = len / CHUNK_SIZE;
        let remainder_start = chunk_count * CHUNK_SIZE;

        for i in 0..chunk_count {
            let start = i * CHUNK_SIZE;
            let va = _mm_loadu_ps(a.as_ptr().add(start));
            let vb = _mm_loadu_ps(b.as_ptr().add(start));
            let vresult = _mm_add_ps(va, vb);
            _mm_storeu_ps(result.as_mut_ptr().add(start), vresult);
        }

        for i in remainder_start..len {
            result[i] = a[i] + b[i];
        }
    }

    fn scalar_add_vectors(a: &[f32], b: &[f32], result: &mut [f32]) {
        const CHUNK_SIZE: usize = 8;
        let len = a.len();
        let chunk_count = len / CHUNK_SIZE;
        let remainder_start = chunk_count * CHUNK_SIZE;

        for i in 0..chunk_count {
            let start = i * CHUNK_SIZE;
            for j in 0..CHUNK_SIZE {
                result[start + j] = a[start + j] + b[start + j];
            }
        }

        for i in remainder_start..len {
            result[i] = a[i] + b[i];
        }
    }

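    /// Multiplies every element of `vector` by `scalar`, writing into `result`.
    ///
    /// # Panics
    ///
    /// Panics if `vector` and `result` have different lengths.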
    pub fn scalar_multiply(vector: &[f32], scalar: f32, result: &mut [f32]) {
        assert_eq!(
            vector.len(),
            result.len(),
            "Vector and result must have the same length"
        );

        const CHUNK_SIZE: usize = 8;
        let len = vector.len();
        let chunk_len = len - (len % CHUNK_SIZE);

        for i in (0..chunk_len).step_by(CHUNK_SIZE) {
            for j in 0..CHUNK_SIZE {
                result[i + j] = vector[i + j] * scalar;
            }
        }

        for i in chunk_len..len {
            result[i] = vector[i] * scalar;
        }
    }

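    /// Returns the squared Euclidean norm (the sum of squared elements).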
    pub fn norm_squared(vector: &[f32]) -> f32 {
        if vector.is_empty() {
            return 0.0;
        }

        const CHUNK_SIZE: usize = 8;
        let mut result = 0.0f32;

        let chunks = vector.chunks_exact(CHUNK_SIZE);
        let remainder = chunks.remainder();

        for chunk in chunks {
            for &val in chunk {
                result += val * val;
            }
        }

        for &val in remainder {
            result += val * val;
        }

        result
    }

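    /// Applies [`Self::fast_exp`] element-wise.
    ///
    /// # Panics
    ///
    /// Panics if `input` and `result` have different lengths.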
    pub fn exp_approx(input: &[f32], result: &mut [f32]) {
        assert_eq!(
            input.len(),
            result.len(),
            "Input and result must have the same length"
        );

        for (input_val, result_val) in input.iter().zip(result.iter_mut()) {
            *result_val = Self::fast_exp(*input_val);
        }
    }

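    /// Fast exponential: the fourth-order Maclaurin series
    /// `1 + x + x^2/2 + x^3/6 + x^4/24` for `|x| < 1`, where the truncation
    /// error is small, and the exact `exp` elsewhere.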
    fn fast_exp(x: f32) -> f32 {
        if x.abs() < 1.0 {
            let x2 = x * x;
            let x3 = x2 * x;
            let x4 = x3 * x;
            1.0 + x + x2 * 0.5 + x3 / 6.0 + x4 / 24.0
        } else {
            x.exp()
        }
    }

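    /// Numerically stable softmax: shifts inputs by their maximum before
    /// exponentiating (so large values cannot overflow), then normalizes by
    /// the sum of the exponentials.
    ///
    /// # Panics
    ///
    /// Panics if `input` and `result` have different lengths.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// // Assumes the type is in scope (path is a placeholder).
    /// let input = [1.0_f32, 2.0, 3.0];
    /// let mut probs = [0.0_f32; 3];
    /// SimdF32Ops::softmax(&input, &mut probs);
    /// assert!((probs.iter().sum::<f32>() - 1.0).abs() < 1e-6);
    /// ```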
    pub fn softmax(input: &[f32], result: &mut [f32]) {
        assert_eq!(
            input.len(),
            result.len(),
            "Input and result must have the same length"
        );

        if input.is_empty() {
            return;
        }

        let max_val = input.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));

        let mut temp = vec![0.0f32; input.len()];
        for (i, &val) in input.iter().enumerate() {
            temp[i] = (val - max_val).exp();
        }

        let sum: f32 = temp.iter().sum();
        let inv_sum = 1.0 / sum;

        Self::scalar_multiply(&temp, inv_sum, result);
    }
}

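/// Operations on `f64` slices. These use fixed-size chunking so the compiler
/// can auto-vectorize, rather than explicit intrinsics.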
pub struct SimdF64Ops;

impl SimdF64Ops {
    pub fn dot_product(a: &[f64], b: &[f64]) -> f64 {
        assert_eq!(a.len(), b.len(), "Vectors must have the same length");

        if a.is_empty() {
            return 0.0;
        }

        const CHUNK_SIZE: usize = 4;
        let mut result = 0.0f64;

        let chunks_a = a.chunks_exact(CHUNK_SIZE);
        let chunks_b = b.chunks_exact(CHUNK_SIZE);
        let remainder_a = chunks_a.remainder();
        let remainder_b = chunks_b.remainder();

        for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
            for i in 0..CHUNK_SIZE {
                result += chunk_a[i] * chunk_b[i];
            }
        }

        for (a_val, b_val) in remainder_a.iter().zip(remainder_b.iter()) {
            result += a_val * b_val;
        }

        result
    }

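    /// Multiplies a row-major `rows x cols` matrix by `vector`, computing one
    /// dot product per row.
    ///
    /// # Panics
    ///
    /// Panics if the slice lengths are inconsistent with `rows` and `cols`.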
    pub fn matrix_vector_multiply(
        matrix: &[f64],
        vector: &[f64],
        result: &mut [f64],
        rows: usize,
        cols: usize,
    ) {
        assert_eq!(
            matrix.len(),
            rows * cols,
            "Matrix size must match dimensions"
        );
        assert_eq!(
            vector.len(),
            cols,
            "Vector length must match matrix columns"
        );
        assert_eq!(result.len(), rows, "Result length must match matrix rows");

        for (i, result_item) in result.iter_mut().enumerate() {
            let row_start = i * cols;
            let row = &matrix[row_start..row_start + cols];
            *result_item = Self::dot_product(row, vector);
        }
    }

    pub fn add_vectors(a: &[f64], b: &[f64], result: &mut [f64]) {
        assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
        assert_eq!(
            a.len(),
            result.len(),
            "Result vector must have the same length as inputs"
        );

        const CHUNK_SIZE: usize = 4;
        let len = a.len();
        let chunk_len = len - (len % CHUNK_SIZE);

        for i in (0..chunk_len).step_by(CHUNK_SIZE) {
            for j in 0..CHUNK_SIZE {
                result[i + j] = a[i + j] + b[i + j];
            }
        }

        for i in chunk_len..len {
            result[i] = a[i] + b[i];
        }
    }
}

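/// Cache-aware operations on row-major `f32` matrices.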
pub struct SimdMatrixOps;

impl SimdMatrixOps {
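    /// Transposes a row-major `rows x cols` matrix into `output`. Small
    /// matrices use a direct double loop; larger ones are processed in
    /// 8x8 blocks for better cache locality.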
    pub fn transpose_f32(input: &[f32], output: &mut [f32], rows: usize, cols: usize) {
        assert_eq!(input.len(), rows * cols);
        assert_eq!(output.len(), rows * cols);

        if rows <= 4 || cols <= 4 {
            for i in 0..rows {
                for j in 0..cols {
                    output[j * rows + i] = input[i * cols + j];
                }
            }
            return;
        }

        const BLOCK_SIZE: usize = 8;

        for i in (0..rows).step_by(BLOCK_SIZE) {
            for j in (0..cols).step_by(BLOCK_SIZE) {
                let max_i = (i + BLOCK_SIZE).min(rows);
                let max_j = (j + BLOCK_SIZE).min(cols);

                for ii in i..max_i {
                    for jj in j..max_j {
                        output[jj * rows + ii] = input[ii * cols + jj];
                    }
                }
            }
        }
    }

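    /// Blocked matrix multiplication `C = A * B` with `A` of shape `m x k`
    /// and `B` of shape `k x n`, both row-major. Each block gathers a column
    /// slice of `B` into a temporary buffer so the inner product can reuse
    /// [`SimdF32Ops::dot_product`].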
    pub fn matrix_multiply_f32(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize) {
        assert_eq!(a.len(), m * k);
        assert_eq!(b.len(), k * n);
        assert_eq!(c.len(), m * n);

        c.fill(0.0);

        const BLOCK_SIZE: usize = 64;

        for i in (0..m).step_by(BLOCK_SIZE) {
            for j in (0..n).step_by(BLOCK_SIZE) {
                for l in (0..k).step_by(BLOCK_SIZE) {
                    let max_i = (i + BLOCK_SIZE).min(m);
                    let max_j = (j + BLOCK_SIZE).min(n);
                    let max_l = (l + BLOCK_SIZE).min(k);

                    for ii in i..max_i {
                        for jj in j..max_j {
                            let a_row = &a[ii * k + l..ii * k + max_l];
                            let b_col: Vec<f32> = (l..max_l).map(|ll| b[ll * n + jj]).collect();

                            c[ii * n + jj] += SimdF32Ops::dot_product(a_row, &b_col);
                        }
                    }
                }
            }
        }
    }
}

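/// Chunked statistical reductions over `f32` data.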
pub struct SimdStatsOps;

impl SimdStatsOps {
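    /// Arithmetic mean; returns `0.0` for an empty slice.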
    pub fn mean_f32(data: &[f32]) -> f32 {
        if data.is_empty() {
            return 0.0;
        }

        const CHUNK_SIZE: usize = 8;
        let mut result = 0.0f32;

        let chunks = data.chunks_exact(CHUNK_SIZE);
        let remainder = chunks.remainder();

        for chunk in chunks {
            for &val in chunk {
                result += val;
            }
        }

        for &val in remainder {
            result += val;
        }

        result / data.len() as f32
    }

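    /// Sample variance (Bessel-corrected, dividing by `n - 1`); returns `0.0`
    /// for slices with fewer than two elements.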
    pub fn variance_f32(data: &[f32]) -> f32 {
        if data.len() <= 1 {
            return 0.0;
        }

        let mean = Self::mean_f32(data);
        const CHUNK_SIZE: usize = 8;
        let mut result = 0.0f32;

        let chunks = data.chunks_exact(CHUNK_SIZE);
        let remainder = chunks.remainder();

        for chunk in chunks {
            for &val in chunk {
                let diff = val - mean;
                result += diff * diff;
            }
        }

        for &val in remainder {
            let diff = val - mean;
            result += diff * diff;
        }

        result / (data.len() - 1) as f32
    }

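    /// Returns `(min, max)` of the data, or `None` for an empty slice.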
    pub fn min_max_f32(data: &[f32]) -> Option<(f32, f32)> {
        if data.is_empty() {
            return None;
        }

        const CHUNK_SIZE: usize = 8;
        let mut min_val = f32::INFINITY;
        let mut max_val = f32::NEG_INFINITY;

        let chunks = data.chunks_exact(CHUNK_SIZE);
        let remainder = chunks.remainder();

        for chunk in chunks {
            for &val in chunk {
                min_val = min_val.min(val);
                max_val = max_val.max(val);
            }
        }

        for &val in remainder {
            min_val = min_val.min(val);
            max_val = max_val.max(val);
        }

        Some((min_val, max_val))
    }
}

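/// Distance and similarity measures over `f32` vectors, with the same runtime
/// dispatch scheme as [`SimdF32Ops`].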
pub struct SimdDistanceOps;

impl SimdDistanceOps {
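    /// Euclidean (L2) distance between `a` and `b`.
    ///
    /// # Panics
    ///
    /// Panics if the slices have different lengths.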
    pub fn euclidean_distance_f32(a: &[f32], b: &[f32]) -> f32 {
        assert_eq!(a.len(), b.len(), "Vectors must have the same length");

        if a.is_empty() {
            return 0.0;
        }

        #[cfg(target_arch = "x86_64")]
        {
            if SimdCapabilities::has_avx2() {
                return unsafe { Self::avx2_euclidean_distance(a, b) };
            } else if SimdCapabilities::has_sse41() {
                return unsafe { Self::sse_euclidean_distance(a, b) };
            }
        }

        Self::scalar_euclidean_distance(a, b)
    }

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn avx2_euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
        const CHUNK_SIZE: usize = 8;
        let mut result = _mm256_setzero_ps();

        let chunks_a = a.chunks_exact(CHUNK_SIZE);
        let chunks_b = b.chunks_exact(CHUNK_SIZE);
        let remainder_a = chunks_a.remainder();
        let remainder_b = chunks_b.remainder();

        for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
            let va = _mm256_loadu_ps(chunk_a.as_ptr());
            let vb = _mm256_loadu_ps(chunk_b.as_ptr());
            let diff = _mm256_sub_ps(va, vb);
            let squared = _mm256_mul_ps(diff, diff);
            result = _mm256_add_ps(result, squared);
        }

        let mut sum_array = [0.0f32; 8];
        _mm256_storeu_ps(sum_array.as_mut_ptr(), result);
        let mut final_result = sum_array.iter().sum::<f32>();

        for (a_val, b_val) in remainder_a.iter().zip(remainder_b.iter()) {
            let diff = a_val - b_val;
            final_result += diff * diff;
        }

        final_result.sqrt()
    }

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "sse4.1")]
    unsafe fn sse_euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
        const CHUNK_SIZE: usize = 4;
        let mut result = _mm_setzero_ps();

        let chunks_a = a.chunks_exact(CHUNK_SIZE);
        let chunks_b = b.chunks_exact(CHUNK_SIZE);
        let remainder_a = chunks_a.remainder();
        let remainder_b = chunks_b.remainder();

        for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
            let va = _mm_loadu_ps(chunk_a.as_ptr());
            let vb = _mm_loadu_ps(chunk_b.as_ptr());
            let diff = _mm_sub_ps(va, vb);
            let squared = _mm_mul_ps(diff, diff);
            result = _mm_add_ps(result, squared);
        }

        let mut sum_array = [0.0f32; 4];
        _mm_storeu_ps(sum_array.as_mut_ptr(), result);
        let mut final_result = sum_array.iter().sum::<f32>();

        for (a_val, b_val) in remainder_a.iter().zip(remainder_b.iter()) {
            let diff = a_val - b_val;
            final_result += diff * diff;
        }

        final_result.sqrt()
    }

    fn scalar_euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
        let mut result = 0.0f32;
        for (a_val, b_val) in a.iter().zip(b.iter()) {
            let diff = a_val - b_val;
            result += diff * diff;
        }
        result.sqrt()
    }

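    /// Manhattan (L1) distance between `a` and `b`.
    ///
    /// # Panics
    ///
    /// Panics if the slices have different lengths.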
    pub fn manhattan_distance_f32(a: &[f32], b: &[f32]) -> f32 {
        assert_eq!(a.len(), b.len(), "Vectors must have the same length");

        if a.is_empty() {
            return 0.0;
        }

        #[cfg(target_arch = "x86_64")]
        {
            if SimdCapabilities::has_avx2() {
                return unsafe { Self::avx2_manhattan_distance(a, b) };
            } else if SimdCapabilities::has_sse41() {
                return unsafe { Self::sse_manhattan_distance(a, b) };
            }
        }

        Self::scalar_manhattan_distance(a, b)
    }

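    /// AVX2 Manhattan distance. `_mm256_andnot_ps` with a `-0.0` mask clears
    /// each lane's sign bit, giving a branch-free absolute value.
    ///
    /// # Safety
    ///
    /// The caller must ensure the CPU supports AVX2.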
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn avx2_manhattan_distance(a: &[f32], b: &[f32]) -> f32 {
        const CHUNK_SIZE: usize = 8;
        let mut result = _mm256_setzero_ps();
        let sign_mask = _mm256_set1_ps(-0.0f32);

        let chunks_a = a.chunks_exact(CHUNK_SIZE);
        let chunks_b = b.chunks_exact(CHUNK_SIZE);
        let remainder_a = chunks_a.remainder();
        let remainder_b = chunks_b.remainder();

        for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
            let va = _mm256_loadu_ps(chunk_a.as_ptr());
            let vb = _mm256_loadu_ps(chunk_b.as_ptr());
            let diff = _mm256_sub_ps(va, vb);
            let abs_diff = _mm256_andnot_ps(sign_mask, diff);
            result = _mm256_add_ps(result, abs_diff);
        }

        let mut sum_array = [0.0f32; 8];
        _mm256_storeu_ps(sum_array.as_mut_ptr(), result);
        let mut final_result = sum_array.iter().sum::<f32>();

        for (a_val, b_val) in remainder_a.iter().zip(remainder_b.iter()) {
            final_result += (a_val - b_val).abs();
        }

        final_result
    }

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "sse4.1")]
    unsafe fn sse_manhattan_distance(a: &[f32], b: &[f32]) -> f32 {
        const CHUNK_SIZE: usize = 4;
        let mut result = _mm_setzero_ps();
        let sign_mask = _mm_set1_ps(-0.0f32);

        let chunks_a = a.chunks_exact(CHUNK_SIZE);
        let chunks_b = b.chunks_exact(CHUNK_SIZE);
        let remainder_a = chunks_a.remainder();
        let remainder_b = chunks_b.remainder();

        for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
            let va = _mm_loadu_ps(chunk_a.as_ptr());
            let vb = _mm_loadu_ps(chunk_b.as_ptr());
            let diff = _mm_sub_ps(va, vb);
            let abs_diff = _mm_andnot_ps(sign_mask, diff);
            result = _mm_add_ps(result, abs_diff);
        }

        let mut sum_array = [0.0f32; 4];
        _mm_storeu_ps(sum_array.as_mut_ptr(), result);
        let mut final_result = sum_array.iter().sum::<f32>();

        for (a_val, b_val) in remainder_a.iter().zip(remainder_b.iter()) {
            final_result += (a_val - b_val).abs();
        }

        final_result
    }

    fn scalar_manhattan_distance(a: &[f32], b: &[f32]) -> f32 {
        let mut result = 0.0f32;
        for (a_val, b_val) in a.iter().zip(b.iter()) {
            result += (a_val - b_val).abs();
        }
        result
    }

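    /// Cosine similarity `a . b / (|a| * |b|)`, built from the SIMD dot
    /// product and norm. Returns `0.0` if either vector has zero norm.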
    pub fn cosine_similarity_f32(a: &[f32], b: &[f32]) -> f32 {
        assert_eq!(a.len(), b.len(), "Vectors must have the same length");

        if a.is_empty() {
            return 0.0;
        }

        let dot_product = SimdF32Ops::dot_product(a, b);
        let norm_a = SimdF32Ops::norm_squared(a).sqrt();
        let norm_b = SimdF32Ops::norm_squared(b).sqrt();

        if norm_a == 0.0 || norm_b == 0.0 {
            0.0
        } else {
            dot_product / (norm_a * norm_b)
        }
    }
}

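/// Runtime CPU feature detection used to choose a dispatch path.
///
/// On non-x86_64 targets every query returns `false`, so callers fall back to
/// scalar implementations.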
pub struct SimdCapabilities;

impl SimdCapabilities {
    #[cfg(target_arch = "x86_64")]
    pub fn has_avx() -> bool {
        std::arch::is_x86_feature_detected!("avx")
    }

    #[cfg(not(target_arch = "x86_64"))]
    pub fn has_avx() -> bool {
        false
    }

    #[cfg(target_arch = "x86_64")]
    pub fn has_avx2() -> bool {
        std::arch::is_x86_feature_detected!("avx2")
    }

    #[cfg(not(target_arch = "x86_64"))]
    pub fn has_avx2() -> bool {
        false
    }

    #[cfg(target_arch = "x86_64")]
    pub fn has_sse41() -> bool {
        std::arch::is_x86_feature_detected!("sse4.1")
    }

    #[cfg(not(target_arch = "x86_64"))]
    pub fn has_sse41() -> bool {
        false
    }

    pub fn capabilities_summary() -> String {
        format!(
            "SIMD Capabilities: AVX={}, AVX2={}, SSE4.1={}",
            Self::has_avx(),
            Self::has_avx2(),
            Self::has_sse41()
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    #[test]
    fn test_simd_dot_product() {
        let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
        let b = vec![9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0];

        let result = SimdF32Ops::dot_product(&a, &b);
        let expected: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();

        assert_relative_eq!(result, expected, epsilon = 1e-6);
    }

    #[test]
    fn test_simd_vector_addition() {
        let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
        let b = vec![9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0];
        let mut result = vec![0.0; a.len()];

        SimdF32Ops::add_vectors(&a, &b, &mut result);

        for i in 0..a.len() {
            assert_relative_eq!(result[i], a[i] + b[i], epsilon = 1e-6);
        }
    }

    #[test]
    fn test_simd_scalar_multiply() {
        let vector = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
        let scalar = 2.5;
        let mut result = vec![0.0; vector.len()];

        SimdF32Ops::scalar_multiply(&vector, scalar, &mut result);

        for i in 0..vector.len() {
            assert_relative_eq!(result[i], vector[i] * scalar, epsilon = 1e-6);
        }
    }

    #[test]
    fn test_simd_norm_squared() {
        let vector = vec![3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let result = SimdF32Ops::norm_squared(&vector);
        let expected: f32 = vector.iter().map(|x| x * x).sum();

        assert_relative_eq!(result, expected, epsilon = 1e-6);
    }

    #[test]
    fn test_simd_stats() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];

        let mean = SimdStatsOps::mean_f32(&data);
        assert_relative_eq!(mean, 5.5, epsilon = 1e-6);

        let variance = SimdStatsOps::variance_f32(&data);
        assert_relative_eq!(variance, 9.166667, epsilon = 1e-5);

        let (min_val, max_val) = SimdStatsOps::min_max_f32(&data).unwrap();
        assert_eq!(min_val, 1.0);
        assert_eq!(max_val, 10.0);
    }

    #[test]
    fn test_simd_distances() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![4.0, 5.0, 6.0];

        let euclidean = SimdDistanceOps::euclidean_distance_f32(&a, &b);
        assert_relative_eq!(euclidean, (27.0_f32).sqrt(), epsilon = 1e-6);

        let manhattan = SimdDistanceOps::manhattan_distance_f32(&a, &b);
        assert_relative_eq!(manhattan, 9.0, epsilon = 1e-6);

        let cosine = SimdDistanceOps::cosine_similarity_f32(&a, &b);
        let expected = 32.0 / ((14.0_f32).sqrt() * (77.0_f32).sqrt());
        assert_relative_eq!(cosine, expected, epsilon = 1e-6);
    }

    #[test]
    fn test_matrix_transpose() {
        // 2x3 row-major input; expected output is its 3x2 transpose.
        let input = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let mut output = vec![0.0; 6];

        SimdMatrixOps::transpose_f32(&input, &mut output, 2, 3);

        let expected = [1.0, 4.0, 2.0, 5.0, 3.0, 6.0];
        for i in 0..6 {
            assert_relative_eq!(output[i], expected[i], epsilon = 1e-6);
        }
    }

    #[test]
    fn test_simd_capabilities() {
        let summary = SimdCapabilities::capabilities_summary();
        assert!(summary.contains("SIMD Capabilities"));
    }
}