Skip to main content

rill_core/math/vector/simd/
wide.rs

1//! Cross-platform SIMD implementations via the `wide` crate
2//!
3//! This module provides vector types using the `wide` library,
4//! which provides portable SIMD operations with fallback to scalar implementations.
5//!
6//! Types:
7//! - `F32x4`, `F32x8` for `f32`
8//! - `F64x2`, `F64x4` for `f64`
9
10use crate::Transcendental;
11use std::ops::{Add, Div, Mul, Neg, Rem, Sub};
12use wide::{f32x4, f32x8, f64x2, f64x4, CmpEq, CmpGe, CmpGt, CmpLe, CmpLt, CmpNe};
13
14use crate::math::vector::traits::{Vector, VectorMask, VectorTranscendental};
15
16// -----------------------------------------------------------------------------
17// Wrappers around wide types for implementing the Vector trait
18// -----------------------------------------------------------------------------
19
20/// SIMD vector of 4 `f32` elements
21#[derive(Copy, Clone, Debug, PartialEq)]
22pub struct F32x4(f32x4);
23
24/// SIMD vector of 8 `f32` elements
25#[derive(Copy, Clone, Debug, PartialEq)]
26pub struct F32x8(f32x8);
27
28/// SIMD vector of 2 `f64` elements
29#[derive(Copy, Clone, Debug, PartialEq)]
30pub struct F64x2(f64x2);
31
32/// SIMD vector of 4 `f64` elements
33#[derive(Copy, Clone, Debug, PartialEq)]
34pub struct F64x4(f64x4);
35
36// -----------------------------------------------------------------------------
37// Default implementations
38// -----------------------------------------------------------------------------
39
40impl Default for F32x4 {
41    fn default() -> Self {
42        Self(f32x4::splat(0.0))
43    }
44}
45
46impl Default for F32x8 {
47    fn default() -> Self {
48        Self(f32x8::splat(0.0))
49    }
50}
51
52impl Default for F64x2 {
53    fn default() -> Self {
54        Self(f64x2::splat(0.0))
55    }
56}
57
58impl Default for F64x4 {
59    fn default() -> Self {
60        Self(f64x4::splat(0.0))
61    }
62}
63
64// -----------------------------------------------------------------------------
65// Vector implementation for F32x4
66// -----------------------------------------------------------------------------
67
68impl Vector<f32, 4> for F32x4 {
69    fn splat(value: f32) -> Self {
70        F32x4(f32x4::splat(value))
71    }
72
73    fn load(slice: &[f32]) -> Self {
74        let mut arr = [0.0f32; 4];
75        arr.copy_from_slice(&slice[0..4]);
76        F32x4(f32x4::from(arr))
77    }
78
79    fn store(&self, slice: &mut [f32]) {
80        let arr: [f32; 4] = self.0.into();
81        slice[0..4].copy_from_slice(&arr);
82    }
83
84    fn extract(&self, index: usize) -> f32 {
85        let arr: [f32; 4] = self.0.into();
86        arr[index]
87    }
88
89    fn insert(&self, index: usize, value: f32) -> Self {
90        let mut arr: [f32; 4] = self.0.into();
91        arr[index] = value;
92        F32x4(f32x4::from(arr))
93    }
94
95    fn add(&self, other: &Self) -> Self {
96        F32x4(self.0 + other.0)
97    }
98
99    fn sub(&self, other: &Self) -> Self {
100        F32x4(self.0 - other.0)
101    }
102
103    fn mul(&self, other: &Self) -> Self {
104        F32x4(self.0 * other.0)
105    }
106
107    fn div(&self, other: &Self) -> Self {
108        F32x4(self.0 / other.0)
109    }
110
111    fn rem(&self, other: &Self) -> Self {
112        // wide does not provide a remainder operation, implement component-wise
113        let a: [f32; 4] = self.0.into();
114        let b: [f32; 4] = other.0.into();
115        let mut arr = [0.0f32; 4];
116        for i in 0..4 {
117            arr[i] = a[i] % b[i];
118        }
119        F32x4(f32x4::from(arr))
120    }
121
122    fn neg(&self) -> Self {
123        F32x4(-self.0)
124    }
125
126    fn abs(&self) -> Self {
127        F32x4(self.0.abs())
128    }
129
130    fn min(&self, other: &Self) -> Self {
131        F32x4(self.0.min(other.0))
132    }
133
134    fn max(&self, other: &Self) -> Self {
135        F32x4(self.0.max(other.0))
136    }
137
138    fn clamp(&self, min: &Self, max: &Self) -> Self {
139        // clamp = self.max(min).min(max)
140        F32x4(self.0.max(min.0).min(max.0))
141    }
142}
143
144impl VectorTranscendental<f32, 4> for F32x4 {
145    fn sqrt(&self) -> Self {
146        F32x4(self.0.sqrt())
147    }
148    fn exp(&self) -> Self {
149        F32x4(self.0.exp())
150    }
151    fn ln(&self) -> Self {
152        F32x4(self.0.ln())
153    }
154    fn sin(&self) -> Self {
155        F32x4(self.0.sin())
156    }
157    fn cos(&self) -> Self {
158        F32x4(self.0.cos())
159    }
160    fn tan(&self) -> Self {
161        F32x4(self.0.tan())
162    }
163}
164
165// -----------------------------------------------------------------------------
166// Vector implementation for F32x8
167// -----------------------------------------------------------------------------
168
169impl Vector<f32, 8> for F32x8 {
170    fn splat(value: f32) -> Self {
171        F32x8(f32x8::splat(value))
172    }
173
174    fn load(slice: &[f32]) -> Self {
175        let mut arr = [0.0f32; 8];
176        arr.copy_from_slice(&slice[0..8]);
177        F32x8(f32x8::from(arr))
178    }
179
180    fn store(&self, slice: &mut [f32]) {
181        let arr: [f32; 8] = self.0.into();
182        slice[0..8].copy_from_slice(&arr);
183    }
184
185    fn extract(&self, index: usize) -> f32 {
186        let arr: [f32; 8] = self.0.into();
187        arr[index]
188    }
189
190    fn insert(&self, index: usize, value: f32) -> Self {
191        let mut arr: [f32; 8] = self.0.into();
192        arr[index] = value;
193        F32x8(f32x8::from(arr))
194    }
195
196    fn add(&self, other: &Self) -> Self {
197        F32x8(self.0 + other.0)
198    }
199
200    fn sub(&self, other: &Self) -> Self {
201        F32x8(self.0 - other.0)
202    }
203
204    fn mul(&self, other: &Self) -> Self {
205        F32x8(self.0 * other.0)
206    }
207
208    fn div(&self, other: &Self) -> Self {
209        F32x8(self.0 / other.0)
210    }
211
212    fn rem(&self, other: &Self) -> Self {
213        let a: [f32; 8] = self.0.into();
214        let b: [f32; 8] = other.0.into();
215        let mut arr = [0.0f32; 8];
216        for i in 0..8 {
217            arr[i] = a[i] % b[i];
218        }
219        F32x8(f32x8::from(arr))
220    }
221
222    fn neg(&self) -> Self {
223        F32x8(-self.0)
224    }
225
226    fn abs(&self) -> Self {
227        F32x8(self.0.abs())
228    }
229
230    fn min(&self, other: &Self) -> Self {
231        F32x8(self.0.min(other.0))
232    }
233
234    fn max(&self, other: &Self) -> Self {
235        F32x8(self.0.max(other.0))
236    }
237
238    fn clamp(&self, min: &Self, max: &Self) -> Self {
239        F32x8(self.0.max(min.0).min(max.0))
240    }
241}
242
243impl VectorTranscendental<f32, 8> for F32x8 {
244    fn sqrt(&self) -> Self {
245        F32x8(self.0.sqrt())
246    }
247    fn exp(&self) -> Self {
248        F32x8(self.0.exp())
249    }
250    fn ln(&self) -> Self {
251        F32x8(self.0.ln())
252    }
253    fn sin(&self) -> Self {
254        F32x8(self.0.sin())
255    }
256    fn cos(&self) -> Self {
257        F32x8(self.0.cos())
258    }
259    fn tan(&self) -> Self {
260        F32x8(self.0.tan())
261    }
262}
263
264// -----------------------------------------------------------------------------
265// Vector implementation for F64x2
266// -----------------------------------------------------------------------------
267
268impl Vector<f64, 2> for F64x2 {
269    fn splat(value: f64) -> Self {
270        F64x2(f64x2::splat(value))
271    }
272
273    fn load(slice: &[f64]) -> Self {
274        let mut arr = [0.0f64; 2];
275        arr.copy_from_slice(&slice[0..2]);
276        F64x2(f64x2::from(arr))
277    }
278
279    fn store(&self, slice: &mut [f64]) {
280        let arr: [f64; 2] = self.0.into();
281        slice[0..2].copy_from_slice(&arr);
282    }
283
284    fn extract(&self, index: usize) -> f64 {
285        let arr: [f64; 2] = self.0.into();
286        arr[index]
287    }
288
289    fn insert(&self, index: usize, value: f64) -> Self {
290        let mut arr: [f64; 2] = self.0.into();
291        arr[index] = value;
292        F64x2(f64x2::from(arr))
293    }
294
295    fn add(&self, other: &Self) -> Self {
296        F64x2(self.0 + other.0)
297    }
298
299    fn sub(&self, other: &Self) -> Self {
300        F64x2(self.0 - other.0)
301    }
302
303    fn mul(&self, other: &Self) -> Self {
304        F64x2(self.0 * other.0)
305    }
306
307    fn div(&self, other: &Self) -> Self {
308        F64x2(self.0 / other.0)
309    }
310
311    fn rem(&self, other: &Self) -> Self {
312        let a: [f64; 2] = self.0.into();
313        let b: [f64; 2] = other.0.into();
314        let mut arr = [0.0f64; 2];
315        for i in 0..2 {
316            arr[i] = a[i] % b[i];
317        }
318        F64x2(f64x2::from(arr))
319    }
320
321    fn neg(&self) -> Self {
322        F64x2(-self.0)
323    }
324
325    fn abs(&self) -> Self {
326        F64x2(self.0.abs())
327    }
328
329    fn min(&self, other: &Self) -> Self {
330        F64x2(self.0.min(other.0))
331    }
332
333    fn max(&self, other: &Self) -> Self {
334        F64x2(self.0.max(other.0))
335    }
336
337    fn clamp(&self, min: &Self, max: &Self) -> Self {
338        F64x2(self.0.max(min.0).min(max.0))
339    }
340}
341
342impl VectorTranscendental<f64, 2> for F64x2 {
343    fn sqrt(&self) -> Self {
344        F64x2(self.0.sqrt())
345    }
346    fn exp(&self) -> Self {
347        F64x2(self.0.exp())
348    }
349    fn ln(&self) -> Self {
350        F64x2(self.0.ln())
351    }
352    fn sin(&self) -> Self {
353        F64x2(self.0.sin())
354    }
355    fn cos(&self) -> Self {
356        F64x2(self.0.cos())
357    }
358    fn tan(&self) -> Self {
359        F64x2(self.0.tan())
360    }
361}
362
363// -----------------------------------------------------------------------------
364// Vector implementation for F64x4
365// -----------------------------------------------------------------------------
366
367impl Vector<f64, 4> for F64x4 {
368    fn splat(value: f64) -> Self {
369        F64x4(f64x4::splat(value))
370    }
371
372    fn load(slice: &[f64]) -> Self {
373        let mut arr = [0.0f64; 4];
374        arr.copy_from_slice(&slice[0..4]);
375        F64x4(f64x4::from(arr))
376    }
377
378    fn store(&self, slice: &mut [f64]) {
379        let arr: [f64; 4] = self.0.into();
380        slice[0..4].copy_from_slice(&arr);
381    }
382
383    fn extract(&self, index: usize) -> f64 {
384        let arr: [f64; 4] = self.0.into();
385        arr[index]
386    }
387
388    fn insert(&self, index: usize, value: f64) -> Self {
389        let mut arr: [f64; 4] = self.0.into();
390        arr[index] = value;
391        F64x4(f64x4::from(arr))
392    }
393
394    fn add(&self, other: &Self) -> Self {
395        F64x4(self.0 + other.0)
396    }
397
398    fn sub(&self, other: &Self) -> Self {
399        F64x4(self.0 - other.0)
400    }
401
402    fn mul(&self, other: &Self) -> Self {
403        F64x4(self.0 * other.0)
404    }
405
406    fn div(&self, other: &Self) -> Self {
407        F64x4(self.0 / other.0)
408    }
409
410    fn rem(&self, other: &Self) -> Self {
411        let a: [f64; 4] = self.0.into();
412        let b: [f64; 4] = other.0.into();
413        let mut arr = [0.0f64; 4];
414        for i in 0..4 {
415            arr[i] = a[i] % b[i];
416        }
417        F64x4(f64x4::from(arr))
418    }
419
420    fn neg(&self) -> Self {
421        F64x4(-self.0)
422    }
423
424    fn abs(&self) -> Self {
425        F64x4(self.0.abs())
426    }
427
428    fn min(&self, other: &Self) -> Self {
429        F64x4(self.0.min(other.0))
430    }
431
432    fn max(&self, other: &Self) -> Self {
433        F64x4(self.0.max(other.0))
434    }
435
436    fn clamp(&self, min: &Self, max: &Self) -> Self {
437        F64x4(self.0.max(min.0).min(max.0))
438    }
439}
440
441impl VectorTranscendental<f64, 4> for F64x4 {
442    fn sqrt(&self) -> Self {
443        F64x4(self.0.sqrt())
444    }
445    fn exp(&self) -> Self {
446        F64x4(self.0.exp())
447    }
448    fn ln(&self) -> Self {
449        F64x4(self.0.ln())
450    }
451    fn sin(&self) -> Self {
452        F64x4(self.0.sin())
453    }
454    fn cos(&self) -> Self {
455        F64x4(self.0.cos())
456    }
457    fn tan(&self) -> Self {
458        F64x4(self.0.tan())
459    }
460}
461
462// -----------------------------------------------------------------------------
463// VectorMask implementation for F64x4
464// -----------------------------------------------------------------------------
465
466impl VectorMask<f64, 4> for F64x4 {
467    type Mask = F64x4;
468
469    fn eq(&self, other: &Self) -> F64x4 {
470        F64x4(self.0.cmp_eq(other.0))
471    }
472    fn ne(&self, other: &Self) -> F64x4 {
473        F64x4(self.0.cmp_ne(other.0))
474    }
475    fn gt(&self, other: &Self) -> F64x4 {
476        F64x4(self.0.cmp_gt(other.0))
477    }
478    fn ge(&self, other: &Self) -> F64x4 {
479        F64x4(self.0.cmp_ge(other.0))
480    }
481    fn lt(&self, other: &Self) -> F64x4 {
482        F64x4(self.0.cmp_lt(other.0))
483    }
484    fn le(&self, other: &Self) -> F64x4 {
485        F64x4(self.0.cmp_le(other.0))
486    }
487    fn select(&self, other: &Self, mask: F64x4) -> Self {
488        F64x4(mask.0.blend(self.0, other.0))
489    }
490    fn all(mask: &F64x4) -> bool {
491        mask.0.move_mask() == 0b1111
492    }
493}
494
495// -----------------------------------------------------------------------------
496// VectorMask implementation for F64x2
497// -----------------------------------------------------------------------------
498
499impl VectorMask<f64, 2> for F64x2 {
500    type Mask = F64x2;
501
502    fn eq(&self, other: &Self) -> F64x2 {
503        F64x2(self.0.cmp_eq(other.0))
504    }
505    fn ne(&self, other: &Self) -> F64x2 {
506        F64x2(self.0.cmp_ne(other.0))
507    }
508    fn gt(&self, other: &Self) -> F64x2 {
509        F64x2(self.0.cmp_gt(other.0))
510    }
511    fn ge(&self, other: &Self) -> F64x2 {
512        F64x2(self.0.cmp_ge(other.0))
513    }
514    fn lt(&self, other: &Self) -> F64x2 {
515        F64x2(self.0.cmp_lt(other.0))
516    }
517    fn le(&self, other: &Self) -> F64x2 {
518        F64x2(self.0.cmp_le(other.0))
519    }
520    fn select(&self, other: &Self, mask: F64x2) -> Self {
521        F64x2(mask.0.blend(self.0, other.0))
522    }
523    fn all(mask: &F64x2) -> bool {
524        mask.0.move_mask() == 0b11
525    }
526}
527
528// -----------------------------------------------------------------------------
529// VectorMask implementation for F32x4
530// -----------------------------------------------------------------------------
531
532impl VectorMask<f32, 4> for F32x4 {
533    type Mask = F32x4;
534
535    fn eq(&self, other: &Self) -> F32x4 {
536        F32x4(self.0.cmp_eq(other.0))
537    }
538    fn ne(&self, other: &Self) -> F32x4 {
539        F32x4(self.0.cmp_ne(other.0))
540    }
541    fn gt(&self, other: &Self) -> F32x4 {
542        F32x4(self.0.cmp_gt(other.0))
543    }
544    fn ge(&self, other: &Self) -> F32x4 {
545        F32x4(self.0.cmp_ge(other.0))
546    }
547    fn lt(&self, other: &Self) -> F32x4 {
548        F32x4(self.0.cmp_lt(other.0))
549    }
550    fn le(&self, other: &Self) -> F32x4 {
551        F32x4(self.0.cmp_le(other.0))
552    }
553    fn select(&self, other: &Self, mask: F32x4) -> Self {
554        F32x4(mask.0.blend(self.0, other.0))
555    }
556    fn all(mask: &F32x4) -> bool {
557        mask.0.move_mask() == 0b1111
558    }
559}
560
561// -----------------------------------------------------------------------------
562// VectorMask implementation for F32x8
563// -----------------------------------------------------------------------------
564
565impl VectorMask<f32, 8> for F32x8 {
566    type Mask = F32x8;
567
568    fn eq(&self, other: &Self) -> F32x8 {
569        F32x8(self.0.cmp_eq(other.0))
570    }
571    fn ne(&self, other: &Self) -> F32x8 {
572        F32x8(self.0.cmp_ne(other.0))
573    }
574    fn gt(&self, other: &Self) -> F32x8 {
575        F32x8(self.0.cmp_gt(other.0))
576    }
577    fn ge(&self, other: &Self) -> F32x8 {
578        F32x8(self.0.cmp_ge(other.0))
579    }
580    fn lt(&self, other: &Self) -> F32x8 {
581        F32x8(self.0.cmp_lt(other.0))
582    }
583    fn le(&self, other: &Self) -> F32x8 {
584        F32x8(self.0.cmp_le(other.0))
585    }
586    fn select(&self, other: &Self, mask: F32x8) -> Self {
587        F32x8(mask.0.blend(self.0, other.0))
588    }
589    fn all(mask: &F32x8) -> bool {
590        mask.0.move_mask() == 0b1111_1111
591    }
592}
593
594// -----------------------------------------------------------------------------
595// Operator implementations (Add, Sub, Mul, Div, Rem, Neg)
596// -----------------------------------------------------------------------------
597
598impl Add for F32x4 {
599    type Output = Self;
600    fn add(self, rhs: Self) -> Self {
601        Self(self.0 + rhs.0)
602    }
603}
604
605impl Sub for F32x4 {
606    type Output = Self;
607    fn sub(self, rhs: Self) -> Self {
608        Self(self.0 - rhs.0)
609    }
610}
611
612impl Mul for F32x4 {
613    type Output = Self;
614    fn mul(self, rhs: Self) -> Self {
615        Self(self.0 * rhs.0)
616    }
617}
618
619impl Div for F32x4 {
620    type Output = Self;
621    fn div(self, rhs: Self) -> Self {
622        Self(self.0 / rhs.0)
623    }
624}
625
626impl Rem for F32x4 {
627    type Output = Self;
628    fn rem(self, rhs: Self) -> Self {
629        let a: [f32; 4] = self.0.into();
630        let b: [f32; 4] = rhs.0.into();
631        let mut arr = [0.0f32; 4];
632        for i in 0..4 {
633            arr[i] = a[i] % b[i];
634        }
635        Self(f32x4::from(arr))
636    }
637}
638
639impl Neg for F32x4 {
640    type Output = Self;
641    fn neg(self) -> Self {
642        Self(-self.0)
643    }
644}
645
646// Similarly for F32x8, F64x2, F64x4
647
648impl Add for F32x8 {
649    type Output = Self;
650    fn add(self, rhs: Self) -> Self {
651        Self(self.0 + rhs.0)
652    }
653}
654
655impl Sub for F32x8 {
656    type Output = Self;
657    fn sub(self, rhs: Self) -> Self {
658        Self(self.0 - rhs.0)
659    }
660}
661
662impl Mul for F32x8 {
663    type Output = Self;
664    fn mul(self, rhs: Self) -> Self {
665        Self(self.0 * rhs.0)
666    }
667}
668
669impl Div for F32x8 {
670    type Output = Self;
671    fn div(self, rhs: Self) -> Self {
672        Self(self.0 / rhs.0)
673    }
674}
675
676impl Rem for F32x8 {
677    type Output = Self;
678    fn rem(self, rhs: Self) -> Self {
679        let a: [f32; 8] = self.0.into();
680        let b: [f32; 8] = rhs.0.into();
681        let mut arr = [0.0f32; 8];
682        for i in 0..8 {
683            arr[i] = a[i] % b[i];
684        }
685        Self(f32x8::from(arr))
686    }
687}
688
689impl Neg for F32x8 {
690    type Output = Self;
691    fn neg(self) -> Self {
692        Self(-self.0)
693    }
694}
695
696impl Add for F64x2 {
697    type Output = Self;
698    fn add(self, rhs: Self) -> Self {
699        Self(self.0 + rhs.0)
700    }
701}
702
703impl Sub for F64x2 {
704    type Output = Self;
705    fn sub(self, rhs: Self) -> Self {
706        Self(self.0 - rhs.0)
707    }
708}
709
710impl Mul for F64x2 {
711    type Output = Self;
712    fn mul(self, rhs: Self) -> Self {
713        Self(self.0 * rhs.0)
714    }
715}
716
717impl Div for F64x2 {
718    type Output = Self;
719    fn div(self, rhs: Self) -> Self {
720        Self(self.0 / rhs.0)
721    }
722}
723
724impl Rem for F64x2 {
725    type Output = Self;
726    fn rem(self, rhs: Self) -> Self {
727        let a: [f64; 2] = self.0.into();
728        let b: [f64; 2] = rhs.0.into();
729        let mut arr = [0.0f64; 2];
730        for i in 0..2 {
731            arr[i] = a[i] % b[i];
732        }
733        Self(f64x2::from(arr))
734    }
735}
736
737impl Neg for F64x2 {
738    type Output = Self;
739    fn neg(self) -> Self {
740        Self(-self.0)
741    }
742}
743
744impl Add for F64x4 {
745    type Output = Self;
746    fn add(self, rhs: Self) -> Self {
747        Self(self.0 + rhs.0)
748    }
749}
750
751impl Sub for F64x4 {
752    type Output = Self;
753    fn sub(self, rhs: Self) -> Self {
754        Self(self.0 - rhs.0)
755    }
756}
757
758impl Mul for F64x4 {
759    type Output = Self;
760    fn mul(self, rhs: Self) -> Self {
761        Self(self.0 * rhs.0)
762    }
763}
764
765impl Div for F64x4 {
766    type Output = Self;
767    fn div(self, rhs: Self) -> Self {
768        Self(self.0 / rhs.0)
769    }
770}
771
772impl Rem for F64x4 {
773    type Output = Self;
774    fn rem(self, rhs: Self) -> Self {
775        let a: [f64; 4] = self.0.into();
776        let b: [f64; 4] = rhs.0.into();
777        let mut arr = [0.0f64; 4];
778        for i in 0..4 {
779            arr[i] = a[i] % b[i];
780        }
781        Self(f64x4::from(arr))
782    }
783}
784
785impl Neg for F64x4 {
786    type Output = Self;
787    fn neg(self) -> Self {
788        Self(-self.0)
789    }
790}
791
792// -----------------------------------------------------------------------------
793// Unit tests
794// -----------------------------------------------------------------------------
795
#[cfg(test)]
mod tests {
    use super::*;
    use crate::math::vector::traits::VectorMask;

    /// `+` and `*` on `F32x4` act lane by lane.
    #[test]
    fn test_f32x4_basic() {
        let lhs = F32x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let rhs = F32x4::load(&[5.0, 6.0, 7.0, 8.0]);
        let mut lanes = [0.0f32; 4];

        (lhs + rhs).store(&mut lanes);
        assert_eq!(lanes, [6.0, 8.0, 10.0, 12.0]);

        (lhs * rhs).store(&mut lanes);
        assert_eq!(lanes, [5.0, 12.0, 21.0, 32.0]);
    }

    /// `F32x4::sin` matches scalar `f32::sin` to within `1e-5` per lane.
    #[test]
    fn test_f32x4_math() {
        let inputs = [0.0f32, 0.5, 1.0, 2.0];
        let mut lanes = [0.0f32; 4];
        F32x4::load(&inputs).sin().store(&mut lanes);

        for (got, x) in lanes.iter().zip(inputs.iter()) {
            let want = f32::sin(*x);
            assert!((*got - want).abs() < 1e-5);
        }
    }

    /// `+` on `F64x2` acts lane by lane.
    #[test]
    fn test_f64x2_basic() {
        let lhs = F64x2::load(&[1.0, 2.0]);
        let rhs = F64x2::load(&[3.0, 4.0]);
        let mut lanes = [0.0f64; 2];

        (lhs + rhs).store(&mut lanes);
        assert_eq!(lanes, [4.0, 6.0]);
    }

    /// `+` and `*` on `F64x4` act lane by lane.
    #[test]
    fn test_f64x4_basic() {
        let lhs = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let rhs = F64x4::load(&[5.0, 6.0, 7.0, 8.0]);
        let mut lanes = [0.0f64; 4];

        (lhs + rhs).store(&mut lanes);
        assert_eq!(lanes, [6.0, 8.0, 10.0, 12.0]);

        (lhs * rhs).store(&mut lanes);
        assert_eq!(lanes, [5.0, 12.0, 21.0, 32.0]);
    }

    /// `sqrt` and `exp` on `F64x4` match the scalar `f64` functions to
    /// within `1e-12` per lane.
    #[test]
    fn test_f64x4_math() {
        let inputs = [0.0f64, 0.5, 1.0, 2.0];
        let vector = F64x4::load(&inputs);
        let mut lanes = [0.0f64; 4];

        vector.sqrt().store(&mut lanes);
        for (got, x) in lanes.iter().zip(inputs.iter()) {
            let want = f64::sqrt(*x);
            assert!((*got - want).abs() < 1e-12);
        }

        vector.exp().store(&mut lanes);
        for (got, x) in lanes.iter().zip(inputs.iter()) {
            let want = f64::exp(*x);
            assert!((*got - want).abs() < 1e-12);
        }
    }

    /// `lt` sets exactly the lanes where `a < b`.
    #[test]
    fn test_f64x4_vector_mask_lt() {
        // With wide 0.7 a "true" lane has every bit set (which reads back as
        // NaN) and a "false" lane is 0.0, so the mask is inspected through
        // `move_mask`, which gathers the sign bit of each lane.
        let values = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let limit = F64x4::load(&[3.0, 3.0, 3.0, 3.0]);
        let mask = <F64x4 as VectorMask<f64, 4>>::lt(&values, &limit);
        // Only lanes 0 and 1 are below 3.0.
        assert_eq!(mask.0.move_mask() & 0b1111, 0b0011);
    }

    /// `gt` sets exactly the lanes where `a > b`.
    #[test]
    fn test_f64x4_vector_mask_gt() {
        let values = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let limit = F64x4::load(&[2.0, 2.0, 2.0, 2.0]);
        let mask = <F64x4 as VectorMask<f64, 4>>::gt(&values, &limit);
        // Only lanes 2 and 3 exceed 2.0.
        assert_eq!(mask.0.move_mask() & 0b1111, 0b1100);
    }

    /// `eq` sets exactly the lanes where both inputs agree.
    #[test]
    fn test_f64x4_vector_mask_eq() {
        let left = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let right = F64x4::load(&[1.0, 0.0, 3.0, 5.0]);
        let mask = <F64x4 as VectorMask<f64, 4>>::eq(&left, &right);
        // Lanes 0 and 2 hold equal values.
        assert_eq!(mask.0.move_mask() & 0b1111, 0b0101);
    }

    /// `all` is true only when every lane of the mask is set.
    #[test]
    fn test_f64x4_vector_mask_all() {
        let every_lane = <F64x4 as VectorMask<f64, 4>>::lt(&F64x4::splat(1.0), &F64x4::splat(2.0));
        assert!(<F64x4 as VectorMask<f64, 4>>::all(&every_lane));

        let some_lanes = <F64x4 as VectorMask<f64, 4>>::lt(
            &F64x4::load(&[1.0, 2.0, 3.0, 4.0]),
            &F64x4::splat(3.0),
        );
        assert!(!<F64x4 as VectorMask<f64, 4>>::all(&some_lanes));
    }

    /// `select` takes lanes from the receiver where the mask is set and from
    /// the other operand elsewhere.
    #[test]
    fn test_f64x4_vector_mask_select() {
        let true_vals = F64x4::load(&[10.0, 20.0, 30.0, 40.0]);
        let false_vals = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let threshold = F64x4::load(&[5.0, 25.0, 25.0, 25.0]);

        // true_vals < threshold lane by lane:
        //   10 < 5 -> false, 20 < 25 -> true, 30 < 25 -> false, 40 < 25 -> false
        let mask = <F64x4 as VectorMask<f64, 4>>::lt(&true_vals, &threshold);
        let selected = <F64x4 as VectorMask<f64, 4>>::select(&true_vals, &false_vals, mask);
        assert_eq!(mask.0.move_mask() & 0b1111, 0b0010);

        // Only lane 1 comes from `true_vals`; the rest come from `false_vals`.
        let mut lanes = [0.0f64; 4];
        selected.store(&mut lanes);
        let expected = [1.0f64, 20.0, 3.0, 4.0];
        for (got, want) in lanes.iter().zip(expected.iter()) {
            assert!((*got - *want).abs() < 1e-15);
        }
    }
}