Skip to main content

rill_core/math/vector/simd/
wide.rs

1//! Cross-platform SIMD implementations via the `wide` crate
2//!
3//! This module provides vector types using the `wide` library,
4//! which provides portable SIMD operations with fallback to scalar implementations.
5//!
6//! Types:
7//! - `F32x4`, `F32x8` for `f32`
8//! - `F64x2`, `F64x4` for `f64`
9
10use crate::Transcendental;
11use std::ops::{Add, Div, Mul, Neg, Rem, Sub};
12use wide::{f32x4, f32x8, f64x2, f64x4, CmpEq, CmpGe, CmpGt, CmpLe, CmpLt, CmpNe};
13
14use crate::math::vector::traits::{Vector, VectorMask, VectorTranscendental};
15
16// -----------------------------------------------------------------------------
17// Wrappers around wide types for implementing the Vector trait
18// -----------------------------------------------------------------------------
19
20/// SIMD vector of 4 `f32` elements
21#[derive(Copy, Clone, Debug, PartialEq)]
22pub struct F32x4(f32x4);
23
24/// SIMD vector of 8 `f32` elements
25#[derive(Copy, Clone, Debug, PartialEq)]
26pub struct F32x8(f32x8);
27
28/// SIMD vector of 2 `f64` elements
29#[derive(Copy, Clone, Debug, PartialEq)]
30pub struct F64x2(f64x2);
31
32/// SIMD vector of 4 `f64` elements
33#[derive(Copy, Clone, Debug, PartialEq)]
34pub struct F64x4(f64x4);
35
36// -----------------------------------------------------------------------------
37// Default implementations
38// -----------------------------------------------------------------------------
39
40impl Default for F32x4 {
41    fn default() -> Self {
42        Self(f32x4::splat(0.0))
43    }
44}
45
46impl Default for F32x8 {
47    fn default() -> Self {
48        Self(f32x8::splat(0.0))
49    }
50}
51
52impl Default for F64x2 {
53    fn default() -> Self {
54        Self(f64x2::splat(0.0))
55    }
56}
57
58impl Default for F64x4 {
59    fn default() -> Self {
60        Self(f64x4::splat(0.0))
61    }
62}
63
64// -----------------------------------------------------------------------------
65// Vector implementation for F32x4
66// -----------------------------------------------------------------------------
67
68impl Vector<f32, 4> for F32x4 {
69    fn splat(value: f32) -> Self {
70        F32x4(f32x4::splat(value))
71    }
72
73    fn load(slice: &[f32]) -> Self {
74        let mut arr = [0.0f32; 4];
75        arr.copy_from_slice(&slice[0..4]);
76        F32x4(f32x4::from(arr))
77    }
78
79    fn store(&self, slice: &mut [f32]) {
80        let arr: [f32; 4] = self.0.into();
81        slice[0..4].copy_from_slice(&arr);
82    }
83
84    fn extract(&self, index: usize) -> f32 {
85        let arr: [f32; 4] = self.0.into();
86        arr[index]
87    }
88
89    fn insert(&self, index: usize, value: f32) -> Self {
90        let mut arr: [f32; 4] = self.0.into();
91        arr[index] = value;
92        F32x4(f32x4::from(arr))
93    }
94
95    fn add(&self, other: &Self) -> Self {
96        F32x4(self.0 + other.0)
97    }
98
99    fn sub(&self, other: &Self) -> Self {
100        F32x4(self.0 - other.0)
101    }
102
103    fn mul(&self, other: &Self) -> Self {
104        F32x4(self.0 * other.0)
105    }
106
107    fn div(&self, other: &Self) -> Self {
108        F32x4(self.0 / other.0)
109    }
110
111    fn rem(&self, other: &Self) -> Self {
112        // wide does not provide a remainder operation, implement component-wise
113        let a: [f32; 4] = self.0.into();
114        let b: [f32; 4] = other.0.into();
115        let mut arr = [0.0f32; 4];
116        for i in 0..4 {
117            arr[i] = a[i] % b[i];
118        }
119        F32x4(f32x4::from(arr))
120    }
121
122    fn neg(&self) -> Self {
123        F32x4(-self.0)
124    }
125
126    fn abs(&self) -> Self {
127        F32x4(self.0.abs())
128    }
129
130    fn min(&self, other: &Self) -> Self {
131        F32x4(self.0.min(other.0))
132    }
133
134    fn max(&self, other: &Self) -> Self {
135        F32x4(self.0.max(other.0))
136    }
137
138    fn clamp(&self, min: &Self, max: &Self) -> Self {
139        // clamp = self.max(min).min(max)
140        F32x4(self.0.max(min.0).min(max.0))
141    }
142}
143
144impl VectorTranscendental<f32, 4> for F32x4 {
145    fn sqrt(&self) -> Self {
146        F32x4(self.0.sqrt())
147    }
148    fn exp(&self) -> Self {
149        F32x4(self.0.exp())
150    }
151    fn ln(&self) -> Self {
152        F32x4(self.0.ln())
153    }
154    fn sin(&self) -> Self {
155        F32x4(self.0.sin())
156    }
157    fn cos(&self) -> Self {
158        F32x4(self.0.cos())
159    }
160    fn tan(&self) -> Self {
161        F32x4(self.0.tan())
162    }
163}
164
165// -----------------------------------------------------------------------------
166// Vector implementation for F32x8
167// -----------------------------------------------------------------------------
168
169impl Vector<f32, 8> for F32x8 {
170    fn splat(value: f32) -> Self {
171        F32x8(f32x8::splat(value))
172    }
173
174    fn load(slice: &[f32]) -> Self {
175        let mut arr = [0.0f32; 8];
176        arr.copy_from_slice(&slice[0..8]);
177        F32x8(f32x8::from(arr))
178    }
179
180    fn store(&self, slice: &mut [f32]) {
181        let arr: [f32; 8] = self.0.into();
182        slice[0..8].copy_from_slice(&arr);
183    }
184
185    fn extract(&self, index: usize) -> f32 {
186        let arr: [f32; 8] = self.0.into();
187        arr[index]
188    }
189
190    fn insert(&self, index: usize, value: f32) -> Self {
191        let mut arr: [f32; 8] = self.0.into();
192        arr[index] = value;
193        F32x8(f32x8::from(arr))
194    }
195
196    fn add(&self, other: &Self) -> Self {
197        F32x8(self.0 + other.0)
198    }
199
200    fn sub(&self, other: &Self) -> Self {
201        F32x8(self.0 - other.0)
202    }
203
204    fn mul(&self, other: &Self) -> Self {
205        F32x8(self.0 * other.0)
206    }
207
208    fn div(&self, other: &Self) -> Self {
209        F32x8(self.0 / other.0)
210    }
211
212    fn rem(&self, other: &Self) -> Self {
213        let a: [f32; 8] = self.0.into();
214        let b: [f32; 8] = other.0.into();
215        let mut arr = [0.0f32; 8];
216        for i in 0..8 {
217            arr[i] = a[i] % b[i];
218        }
219        F32x8(f32x8::from(arr))
220    }
221
222    fn neg(&self) -> Self {
223        F32x8(-self.0)
224    }
225
226    fn abs(&self) -> Self {
227        F32x8(self.0.abs())
228    }
229
230    fn min(&self, other: &Self) -> Self {
231        F32x8(self.0.min(other.0))
232    }
233
234    fn max(&self, other: &Self) -> Self {
235        F32x8(self.0.max(other.0))
236    }
237
238    fn clamp(&self, min: &Self, max: &Self) -> Self {
239        F32x8(self.0.max(min.0).min(max.0))
240    }
241}
242
243impl VectorTranscendental<f32, 8> for F32x8 {
244    fn sqrt(&self) -> Self {
245        F32x8(self.0.sqrt())
246    }
247    fn exp(&self) -> Self {
248        F32x8(self.0.exp())
249    }
250    fn ln(&self) -> Self {
251        F32x8(self.0.ln())
252    }
253    fn sin(&self) -> Self {
254        F32x8(self.0.sin())
255    }
256    fn cos(&self) -> Self {
257        F32x8(self.0.cos())
258    }
259    fn tan(&self) -> Self {
260        F32x8(self.0.tan())
261    }
262}
263
264// -----------------------------------------------------------------------------
265// Vector implementation for F64x2
266// -----------------------------------------------------------------------------
267
268impl Vector<f64, 2> for F64x2 {
269    fn splat(value: f64) -> Self {
270        F64x2(f64x2::splat(value))
271    }
272
273    fn load(slice: &[f64]) -> Self {
274        let mut arr = [0.0f64; 2];
275        arr.copy_from_slice(&slice[0..2]);
276        F64x2(f64x2::from(arr))
277    }
278
279    fn store(&self, slice: &mut [f64]) {
280        let arr: [f64; 2] = self.0.into();
281        slice[0..2].copy_from_slice(&arr);
282    }
283
284    fn extract(&self, index: usize) -> f64 {
285        let arr: [f64; 2] = self.0.into();
286        arr[index]
287    }
288
289    fn insert(&self, index: usize, value: f64) -> Self {
290        let mut arr: [f64; 2] = self.0.into();
291        arr[index] = value;
292        F64x2(f64x2::from(arr))
293    }
294
295    fn add(&self, other: &Self) -> Self {
296        F64x2(self.0 + other.0)
297    }
298
299    fn sub(&self, other: &Self) -> Self {
300        F64x2(self.0 - other.0)
301    }
302
303    fn mul(&self, other: &Self) -> Self {
304        F64x2(self.0 * other.0)
305    }
306
307    fn div(&self, other: &Self) -> Self {
308        F64x2(self.0 / other.0)
309    }
310
311    fn rem(&self, other: &Self) -> Self {
312        let a: [f64; 2] = self.0.into();
313        let b: [f64; 2] = other.0.into();
314        let mut arr = [0.0f64; 2];
315        for i in 0..2 {
316            arr[i] = a[i] % b[i];
317        }
318        F64x2(f64x2::from(arr))
319    }
320
321    fn neg(&self) -> Self {
322        F64x2(-self.0)
323    }
324
325    fn abs(&self) -> Self {
326        F64x2(self.0.abs())
327    }
328
329    fn min(&self, other: &Self) -> Self {
330        F64x2(self.0.min(other.0))
331    }
332
333    fn max(&self, other: &Self) -> Self {
334        F64x2(self.0.max(other.0))
335    }
336
337    fn clamp(&self, min: &Self, max: &Self) -> Self {
338        F64x2(self.0.max(min.0).min(max.0))
339    }
340}
341
342impl VectorTranscendental<f64, 2> for F64x2 {
343    fn sqrt(&self) -> Self {
344        F64x2(self.0.sqrt())
345    }
346    fn exp(&self) -> Self {
347        F64x2(self.0.exp())
348    }
349    fn ln(&self) -> Self {
350        F64x2(self.0.ln())
351    }
352    fn sin(&self) -> Self {
353        F64x2(self.0.sin())
354    }
355    fn cos(&self) -> Self {
356        F64x2(self.0.cos())
357    }
358    fn tan(&self) -> Self {
359        F64x2(self.0.tan())
360    }
361}
362
363// -----------------------------------------------------------------------------
364// Vector implementation for F64x4
365// -----------------------------------------------------------------------------
366
367impl Vector<f64, 4> for F64x4 {
368    fn splat(value: f64) -> Self {
369        F64x4(f64x4::splat(value))
370    }
371
372    fn load(slice: &[f64]) -> Self {
373        let mut arr = [0.0f64; 4];
374        arr.copy_from_slice(&slice[0..4]);
375        F64x4(f64x4::from(arr))
376    }
377
378    fn store(&self, slice: &mut [f64]) {
379        let arr: [f64; 4] = self.0.into();
380        slice[0..4].copy_from_slice(&arr);
381    }
382
383    fn extract(&self, index: usize) -> f64 {
384        let arr: [f64; 4] = self.0.into();
385        arr[index]
386    }
387
388    fn insert(&self, index: usize, value: f64) -> Self {
389        let mut arr: [f64; 4] = self.0.into();
390        arr[index] = value;
391        F64x4(f64x4::from(arr))
392    }
393
394    fn add(&self, other: &Self) -> Self {
395        F64x4(self.0 + other.0)
396    }
397
398    fn sub(&self, other: &Self) -> Self {
399        F64x4(self.0 - other.0)
400    }
401
402    fn mul(&self, other: &Self) -> Self {
403        F64x4(self.0 * other.0)
404    }
405
406    fn div(&self, other: &Self) -> Self {
407        F64x4(self.0 / other.0)
408    }
409
410    fn rem(&self, other: &Self) -> Self {
411        let a: [f64; 4] = self.0.into();
412        let b: [f64; 4] = other.0.into();
413        let mut arr = [0.0f64; 4];
414        for i in 0..4 {
415            arr[i] = a[i] % b[i];
416        }
417        F64x4(f64x4::from(arr))
418    }
419
420    fn neg(&self) -> Self {
421        F64x4(-self.0)
422    }
423
424    fn abs(&self) -> Self {
425        F64x4(self.0.abs())
426    }
427
428    fn min(&self, other: &Self) -> Self {
429        F64x4(self.0.min(other.0))
430    }
431
432    fn max(&self, other: &Self) -> Self {
433        F64x4(self.0.max(other.0))
434    }
435
436    fn clamp(&self, min: &Self, max: &Self) -> Self {
437        F64x4(self.0.max(min.0).min(max.0))
438    }
439}
440
441impl VectorTranscendental<f64, 4> for F64x4 {
442    fn sqrt(&self) -> Self {
443        F64x4(self.0.sqrt())
444    }
445    fn exp(&self) -> Self {
446        F64x4(self.0.exp())
447    }
448    fn ln(&self) -> Self {
449        F64x4(self.0.ln())
450    }
451    fn sin(&self) -> Self {
452        F64x4(self.0.sin())
453    }
454    fn cos(&self) -> Self {
455        F64x4(self.0.cos())
456    }
457    fn tan(&self) -> Self {
458        F64x4(self.0.tan())
459    }
460}
461
462// -----------------------------------------------------------------------------
463// VectorMask implementation
464// -----------------------------------------------------------------------------
465
466impl VectorMask<f64, 4> for F64x4 {
467    // In wide 0.7, comparison masks are the same type as the vector,
468    // where -1.0 = true and 0.0 = false.
469    type Mask = F64x4;
470
471    fn eq(&self, other: &Self) -> F64x4 {
472        F64x4(self.0.cmp_eq(other.0))
473    }
474
475    fn ne(&self, other: &Self) -> F64x4 {
476        F64x4(self.0.cmp_ne(other.0))
477    }
478
479    fn gt(&self, other: &Self) -> F64x4 {
480        F64x4(self.0.cmp_gt(other.0))
481    }
482
483    fn ge(&self, other: &Self) -> F64x4 {
484        F64x4(self.0.cmp_ge(other.0))
485    }
486
487    fn lt(&self, other: &Self) -> F64x4 {
488        F64x4(self.0.cmp_lt(other.0))
489    }
490
491    fn le(&self, other: &Self) -> F64x4 {
492        F64x4(self.0.cmp_le(other.0))
493    }
494
495    fn select(&self, other: &Self, mask: F64x4) -> Self {
496        // f64x4::blend(self=mask, t=true_vals, f=false_vals)
497        // returns t where self != 0, f where self == 0
498        F64x4(mask.0.blend(self.0, other.0))
499    }
500
501    fn all(mask: &F64x4) -> bool {
502        // move_mask returns bit i = sign bit of lane i
503        // For -1.0 (true), sign bit is 1; for 0.0 (false), sign bit is 0.
504        mask.0.move_mask() == 0b1111
505    }
506}
507
508// -----------------------------------------------------------------------------
509// Operator implementations (Add, Sub, Mul, Div, Rem, Neg)
510// -----------------------------------------------------------------------------
511
512impl Add for F32x4 {
513    type Output = Self;
514    fn add(self, rhs: Self) -> Self {
515        Self(self.0 + rhs.0)
516    }
517}
518
519impl Sub for F32x4 {
520    type Output = Self;
521    fn sub(self, rhs: Self) -> Self {
522        Self(self.0 - rhs.0)
523    }
524}
525
526impl Mul for F32x4 {
527    type Output = Self;
528    fn mul(self, rhs: Self) -> Self {
529        Self(self.0 * rhs.0)
530    }
531}
532
533impl Div for F32x4 {
534    type Output = Self;
535    fn div(self, rhs: Self) -> Self {
536        Self(self.0 / rhs.0)
537    }
538}
539
540impl Rem for F32x4 {
541    type Output = Self;
542    fn rem(self, rhs: Self) -> Self {
543        let a: [f32; 4] = self.0.into();
544        let b: [f32; 4] = rhs.0.into();
545        let mut arr = [0.0f32; 4];
546        for i in 0..4 {
547            arr[i] = a[i] % b[i];
548        }
549        Self(f32x4::from(arr))
550    }
551}
552
553impl Neg for F32x4 {
554    type Output = Self;
555    fn neg(self) -> Self {
556        Self(-self.0)
557    }
558}
559
560// Similarly for F32x8, F64x2, F64x4
561
562impl Add for F32x8 {
563    type Output = Self;
564    fn add(self, rhs: Self) -> Self {
565        Self(self.0 + rhs.0)
566    }
567}
568
569impl Sub for F32x8 {
570    type Output = Self;
571    fn sub(self, rhs: Self) -> Self {
572        Self(self.0 - rhs.0)
573    }
574}
575
576impl Mul for F32x8 {
577    type Output = Self;
578    fn mul(self, rhs: Self) -> Self {
579        Self(self.0 * rhs.0)
580    }
581}
582
583impl Div for F32x8 {
584    type Output = Self;
585    fn div(self, rhs: Self) -> Self {
586        Self(self.0 / rhs.0)
587    }
588}
589
590impl Rem for F32x8 {
591    type Output = Self;
592    fn rem(self, rhs: Self) -> Self {
593        let a: [f32; 8] = self.0.into();
594        let b: [f32; 8] = rhs.0.into();
595        let mut arr = [0.0f32; 8];
596        for i in 0..8 {
597            arr[i] = a[i] % b[i];
598        }
599        Self(f32x8::from(arr))
600    }
601}
602
603impl Neg for F32x8 {
604    type Output = Self;
605    fn neg(self) -> Self {
606        Self(-self.0)
607    }
608}
609
610impl Add for F64x2 {
611    type Output = Self;
612    fn add(self, rhs: Self) -> Self {
613        Self(self.0 + rhs.0)
614    }
615}
616
617impl Sub for F64x2 {
618    type Output = Self;
619    fn sub(self, rhs: Self) -> Self {
620        Self(self.0 - rhs.0)
621    }
622}
623
624impl Mul for F64x2 {
625    type Output = Self;
626    fn mul(self, rhs: Self) -> Self {
627        Self(self.0 * rhs.0)
628    }
629}
630
631impl Div for F64x2 {
632    type Output = Self;
633    fn div(self, rhs: Self) -> Self {
634        Self(self.0 / rhs.0)
635    }
636}
637
638impl Rem for F64x2 {
639    type Output = Self;
640    fn rem(self, rhs: Self) -> Self {
641        let a: [f64; 2] = self.0.into();
642        let b: [f64; 2] = rhs.0.into();
643        let mut arr = [0.0f64; 2];
644        for i in 0..2 {
645            arr[i] = a[i] % b[i];
646        }
647        Self(f64x2::from(arr))
648    }
649}
650
651impl Neg for F64x2 {
652    type Output = Self;
653    fn neg(self) -> Self {
654        Self(-self.0)
655    }
656}
657
658impl Add for F64x4 {
659    type Output = Self;
660    fn add(self, rhs: Self) -> Self {
661        Self(self.0 + rhs.0)
662    }
663}
664
665impl Sub for F64x4 {
666    type Output = Self;
667    fn sub(self, rhs: Self) -> Self {
668        Self(self.0 - rhs.0)
669    }
670}
671
672impl Mul for F64x4 {
673    type Output = Self;
674    fn mul(self, rhs: Self) -> Self {
675        Self(self.0 * rhs.0)
676    }
677}
678
679impl Div for F64x4 {
680    type Output = Self;
681    fn div(self, rhs: Self) -> Self {
682        Self(self.0 / rhs.0)
683    }
684}
685
686impl Rem for F64x4 {
687    type Output = Self;
688    fn rem(self, rhs: Self) -> Self {
689        let a: [f64; 4] = self.0.into();
690        let b: [f64; 4] = rhs.0.into();
691        let mut arr = [0.0f64; 4];
692        for i in 0..4 {
693            arr[i] = a[i] % b[i];
694        }
695        Self(f64x4::from(arr))
696    }
697}
698
699impl Neg for F64x4 {
700    type Output = Self;
701    fn neg(self) -> Self {
702        Self(-self.0)
703    }
704}
705
706// -----------------------------------------------------------------------------
707// Unit tests
708// -----------------------------------------------------------------------------
709
#[cfg(test)]
mod tests {
    use super::*;
    use crate::math::vector::traits::VectorMask;

    /// `+` and `*` on `F32x4` operate lane by lane.
    #[test]
    fn test_f32x4_basic() {
        let lhs = F32x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let rhs = F32x4::load(&[5.0, 6.0, 7.0, 8.0]);
        let mut out = [0.0f32; 4];

        let sum = lhs + rhs;
        sum.store(&mut out);
        assert_eq!(out, [6.0, 8.0, 10.0, 12.0]);

        let product = lhs * rhs;
        product.store(&mut out);
        assert_eq!(out, [5.0, 12.0, 21.0, 32.0]);
    }

    /// Vector `sin` agrees with the scalar `f32::sin` to within 1e-5.
    #[test]
    fn test_f32x4_math() {
        let inputs = [0.0f32, 0.5, 1.0, 2.0];
        let vector = F32x4::load(&inputs);
        let sines = vector.sin();

        let mut out = [0.0f32; 4];
        sines.store(&mut out);
        for (&got, &x) in out.iter().zip(inputs.iter()) {
            assert!((got - x.sin()).abs() < 1e-5);
        }
    }

    /// `+` on `F64x2` operates lane by lane.
    #[test]
    fn test_f64x2_basic() {
        let lhs = F64x2::load(&[1.0, 2.0]);
        let rhs = F64x2::load(&[3.0, 4.0]);
        let mut out = [0.0f64; 2];

        let sum = lhs + rhs;
        sum.store(&mut out);
        assert_eq!(out, [4.0, 6.0]);
    }

    /// `+` and `*` on `F64x4` operate lane by lane.
    #[test]
    fn test_f64x4_basic() {
        let lhs = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let rhs = F64x4::load(&[5.0, 6.0, 7.0, 8.0]);
        let mut out = [0.0f64; 4];

        let sum = lhs + rhs;
        sum.store(&mut out);
        assert_eq!(out, [6.0, 8.0, 10.0, 12.0]);

        let product = lhs * rhs;
        product.store(&mut out);
        assert_eq!(out, [5.0, 12.0, 21.0, 32.0]);
    }

    /// Vector `sqrt` and `exp` agree with their scalar `f64` counterparts to
    /// within 1e-12.
    #[test]
    fn test_f64x4_math() {
        let inputs = [0.0f64, 0.5, 1.0, 2.0];
        let vector = F64x4::load(&inputs);
        let mut out = [0.0f64; 4];

        let roots = vector.sqrt();
        roots.store(&mut out);
        for (&got, &x) in out.iter().zip(inputs.iter()) {
            assert!((got - x.sqrt()).abs() < 1e-12);
        }

        let exps = vector.exp();
        exps.store(&mut out);
        for (&got, &x) in out.iter().zip(inputs.iter()) {
            assert!((got - x.exp()).abs() < 1e-12);
        }
    }

    /// `lt` sets exactly the lanes where the left operand is smaller.
    #[test]
    fn test_f64x4_vector_mask_lt() {
        // A true lane cannot be compared against a plain float constant, so
        // the mask is inspected through `move_mask`, which collects the sign
        // bit of every lane.
        let values = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let limit = F64x4::load(&[3.0, 3.0, 3.0, 3.0]);
        let mask = <F64x4 as VectorMask<f64, 4>>::lt(&values, &limit);
        // Lanes 0 and 1 are below 3.0.
        assert_eq!(mask.0.move_mask() & 0b1111, 0b0011);
    }

    /// `gt` sets exactly the lanes where the left operand is larger.
    #[test]
    fn test_f64x4_vector_mask_gt() {
        let values = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let limit = F64x4::load(&[2.0, 2.0, 2.0, 2.0]);
        let mask = <F64x4 as VectorMask<f64, 4>>::gt(&values, &limit);
        // Lanes 2 and 3 are above 2.0.
        assert_eq!(mask.0.move_mask() & 0b1111, 0b1100);
    }

    /// `eq` sets exactly the lanes where both operands match.
    #[test]
    fn test_f64x4_vector_mask_eq() {
        let left = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let right = F64x4::load(&[1.0, 0.0, 3.0, 5.0]);
        let mask = <F64x4 as VectorMask<f64, 4>>::eq(&left, &right);
        // Lanes 0 and 2 hold equal values.
        assert_eq!(mask.0.move_mask() & 0b1111, 0b0101);
    }

    /// `all` is true only when every lane of the mask is set.
    #[test]
    fn test_f64x4_vector_mask_all() {
        let ones = F64x4::splat(1.0);
        let twos = F64x4::splat(2.0);
        let all_true = <F64x4 as VectorMask<f64, 4>>::lt(&ones, &twos);
        assert!(<F64x4 as VectorMask<f64, 4>>::all(&all_true));

        let ramp = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let threes = F64x4::splat(3.0);
        let partial_true = <F64x4 as VectorMask<f64, 4>>::lt(&ramp, &threes);
        assert!(!<F64x4 as VectorMask<f64, 4>>::all(&partial_true));
    }

    /// `select` takes lanes from `self` where the mask is set and from
    /// `other` elsewhere.
    #[test]
    fn test_f64x4_vector_mask_select() {
        let true_vals = F64x4::load(&[10.0, 20.0, 30.0, 40.0]);
        let false_vals = F64x4::load(&[1.0, 2.0, 3.0, 4.0]);
        let threshold = F64x4::load(&[5.0, 25.0, 25.0, 25.0]);

        // true_vals < threshold, lane by lane:
        //   [10 < 5, 20 < 25, 30 < 25, 40 < 25] = [false, true, false, false]
        let mask = <F64x4 as VectorMask<f64, 4>>::lt(&true_vals, &threshold);
        assert_eq!(mask.0.move_mask() & 0b1111, 0b0010);

        // Only lane 1 comes from `true_vals`; the rest come from `false_vals`.
        let selected = <F64x4 as VectorMask<f64, 4>>::select(&true_vals, &false_vals, mask);
        let mut out = [0.0f64; 4];
        selected.store(&mut out);
        let expected = [1.0f64, 20.0, 3.0, 4.0];
        for (&got, &want) in out.iter().zip(expected.iter()) {
            assert!((got - want).abs() < 1e-15);
        }
    }
}