// vrl/vec4f.rs

1use std::{
2    fmt::Debug,
3    mem::MaybeUninit,
4    ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign},
5};
6
7use crate::{
8    common::SIMDVector,
9    intrinsics::*,
10    macros::{vec_impl_sum_prod, vec_overload_operator},
11};
12
/// Represents a packed vector of 4 single-precision floating-point values. [`__m128`] wrapper.
#[derive(Clone, Copy)]
// `repr(transparent)` guarantees this struct has exactly the layout of the
// wrapped `__m128`, which the `From<&Vec4f> for [f32; 4]` pointer cast relies on.
#[repr(transparent)]
pub struct Vec4f {
    // The underlying 128-bit SSE register value; all operations delegate to intrinsics on it.
    xmm: __m128,
}
19
20impl Vec4f {
21    /// Initializes elements of returned vector with given values.
22    ///
23    /// # Example
24    /// ```
25    /// # use vrl::Vec4f;
26    /// assert_eq!(
27    ///     Vec4f::new(1.0, 2.0, 3.0, 4.0),
28    ///     [1.0, 2.0, 3.0, 4.0].into()
29    /// );
30    /// ```
31    #[inline(always)]
32    #[allow(clippy::too_many_arguments)]
33    pub fn new(v0: f32, v1: f32, v2: f32, v3: f32) -> Self {
34        unsafe { _mm_setr_ps(v0, v1, v2, v3) }.into()
35    }
36
37    /// Loads vector from array pointer by `addr`.
38    /// `addr` is not required to be aligned.
39    ///
40    /// # Safety
41    /// `addr` must be a valid pointer.
42    ///
43    /// # Example
44    /// ```
45    /// # use vrl::Vec4f;
46    /// let array = [42.0; 4];
47    /// let vec = unsafe { Vec4f::load_ptr(&array) };
48    /// ```
49    #[inline(always)]
50    pub unsafe fn load_ptr(addr: *const [f32; 4]) -> Self {
51        _mm_loadu_ps(addr as *const f32).into()
52    }
53
54    /// Loads vector from aligned array pointed by `addr`.
55    ///
56    /// # Safety
57    /// Like [`load`], requires `addr` to be valid.
58    /// Unlike [`load`], requires `addr` to be divisible by `16`, i.e. to be a `16`-bytes aligned address.
59    ///
60    /// [`load`]: Self::load
61    ///
62    /// # Examples
63    /// ```
64    /// # use vrl::Vec4f;
65    /// #[repr(align(16))]
66    /// struct AlignedArray([f32; 4]);
67    ///
68    /// let array = AlignedArray([42.0; 4]);
69    /// let vec = unsafe { Vec4f::load_ptr_aligned(&array.0) };
70    /// assert_eq!(vec, Vec4f::broadcast(42.0));
71    /// ```
72    /// In the following example `zeros` is aligned 2-bytes aligned. Therefore
73    /// `zeros.as_ptr().byte_add(1)` is an odd address and hence not divisible by `16`.
74    /// ```should_panic
75    /// # use vrl::Vec4f;
76    /// let zeros = unsafe { std::mem::zeroed::<[u16; 10]>() };
77    /// unsafe { Vec4f::load_ptr_aligned(zeros.as_ptr().byte_add(1) as *const [f32; 4]) };
78    /// ```
79    #[inline(always)]
80    pub unsafe fn load_ptr_aligned(addr: *const [f32; 4]) -> Self {
81        _mm_load_ps(addr as *const f32).into()
82    }
83
84    /// Loads values of returned vector from given data.
85    ///
86    /// # Exmaple
87    /// ```
88    /// # use vrl::Vec4f;
89    /// assert_eq!(
90    ///     Vec4f::new(1.0, 2.0, 3.0, 4.0),
91    ///     Vec4f::load(&[1.0, 2.0, 3.0, 4.0])
92    /// );
93    /// ```
94    #[inline(always)]
95    pub fn load(data: &[f32; 4]) -> Self {
96        unsafe { Self::load_ptr(data) }
97    }
98
99    /// Checks that data contains exactly four elements and loads them into vector.
100    ///
101    /// # Panics
102    /// Panics if `data.len()` isn't `4`.
103    ///
104    /// # Examples
105    /// ```
106    /// # use vrl::Vec4f;
107    /// assert_eq!(
108    ///     Vec4f::load_checked(&[1.0, 2.0, 3.0, 4.0]),
109    ///     Vec4f::new(1.0, 2.0, 3.0, 4.0)
110    /// );
111    /// ```
112    /// ```should_panic
113    /// # use vrl::Vec4f;
114    /// Vec4f::load_checked(&[1.0, 2.0, 3.0]);
115    /// ```
116    /// ```should_panic
117    /// # use vrl::Vec4f;
118    /// Vec4f::load_checked(&[1.0, 2.0, 3.0, 4.0, 5.0]);
119    /// ```
120    #[inline(always)]
121    pub fn load_checked(data: &[f32]) -> Self {
122        Self::load(
123            data.try_into()
124                .expect("data must contain exactly 4 elements"),
125        )
126    }
127
128    /// Loads the first four elements of `data` into vector.
129    ///
130    /// # Panics
131    /// Panics if `data` contains less than four elements.
132    ///
133    /// # Exmaples
134    /// ```
135    /// # use vrl::Vec4f;
136    /// assert_eq!(
137    ///     Vec4f::load_prefix(&[1.0, 2.0, 3.0, 4.0, 5.0]),
138    ///     Vec4f::new(1.0, 2.0, 3.0, 4.0)
139    /// );
140    /// ```
141    ///
142    /// ```should_panic
143    /// # use vrl::Vec4f;
144    /// Vec4f::load_prefix(&[1.0, 2.0, 3.0]);
145    /// ```
146    #[inline(always)]
147    pub fn load_prefix(data: &[f32]) -> Self {
148        if data.len() < 4 {
149            panic!("data must contain at least 4 elements");
150        }
151        unsafe { Self::load_ptr(data.as_ptr() as *const [f32; 4]) }
152    }
153
154    /// Loads first 4 elements of `data` if available otherwise initializes first elements of
155    /// returned vector with values of `data` and rest elements with zeros.
156    ///
157    /// # Example
158    /// ```
159    /// # use vrl::Vec4f;
160    /// let values = [1.0, 2.0, 3.0, 4.0, 5.0];
161    /// assert_eq!(
162    ///     Vec4f::load_partial(&values),
163    ///     Vec4f::from(&values[..4].try_into().unwrap())
164    /// );
165    /// assert_eq!(
166    ///     Vec4f::load_partial(&values[..2]),
167    ///     Vec4f::new(1.0, 2.0, 0.0, 0.0)  // note zeros here
168    /// );
169    /// ```
170    #[inline]
171    pub fn load_partial(data: &[f32]) -> Self {
172        match data.len() {
173            4.. => unsafe { Self::load_ptr(data.as_ptr() as *const [f32; 4]) },
174            3 => Self::new(data[0], data[1], data[2], 0.0),
175            2 => Self::new(data[0], data[1], 0.0, 0.0),
176            1 => Self::new(data[0], 0.0, 0.0, 0.0),
177            0 => Self::default(),
178        }
179    }
180
181    /// Returns vector with all its elements initialized with a given `value`, i.e. broadcasts
182    /// `value` to all elements of returned vector.
183    ///
184    /// # Example
185    /// ```
186    /// # use vrl::Vec4f;
187    /// assert_eq!(
188    ///     Vec4f::broadcast(42.0),
189    ///     [42.0; 4].into()
190    /// );
191    /// ```
192    #[inline(always)]
193    pub fn broadcast(value: f32) -> Self {
194        unsafe { _mm_set1_ps(value) }.into()
195    }
196
197    /// Stores vector into array at given address.
198    ///
199    /// # Safety
200    /// `addr` must be a valid pointer.
201    #[inline(always)]
202    pub unsafe fn store_ptr(&self, addr: *mut [f32; 4]) {
203        _mm_storeu_ps(addr as *mut f32, self.xmm)
204    }
205
206    /// Stores vector into aligned array at given address.
207    ///
208    /// # Safety
209    /// Like [`store_ptr`], requires `addr` to be valid.
210    /// Unlike [`store_ptr`], requires `addr` to be divisible by `16`, i.e. to be a 16-bytes aligned address.
211    ///
212    /// [`store_ptr`]: Self::store_ptr
213    #[inline(always)]
214    pub unsafe fn store_ptr_aligned(&self, addr: *mut [f32; 4]) {
215        _mm_store_ps(addr as *mut f32, self.xmm)
216    }
217
218    /// Stores vector into aligned array at given address in uncached memory (non-temporal store).
219    /// This may be more efficient than [`store_ptr_aligned`] if it is unlikely that stored data will
220    /// stay in cache until it is read again, for instance, when storing large blocks of memory.
221    ///
222    /// # Safety
223    /// Has same requirements as [`store_ptr_aligned`]: `addr` must be valid and
224    /// divisible by `16`, i.e. to be a 16-bytes aligned address.
225    ///
226    /// [`store_ptr_aligned`]: Self::store_ptr_aligned
227    #[inline(always)]
228    pub unsafe fn store_ptr_non_temporal(&self, addr: *mut [f32; 4]) {
229        _mm_stream_ps(addr as *mut f32, self.xmm)
230    }
231
232    /// Stores vector into given `array`.
233    #[inline(always)]
234    pub fn store(&self, array: &mut [f32; 4]) {
235        unsafe { self.store_ptr(array) }
236    }
237
238    /// Checkes that `slice` contains exactly four elements and store elements of vector there.
239    ///
240    /// # Panics
241    /// Panics if `slice.len()` isn't `4`.
242    ///
243    /// # Examples
244    /// ```
245    /// # use vrl::Vec4f;
246    /// let mut data = [-1.0; 4];
247    /// Vec4f::default().store_checked(&mut data);
248    /// assert_eq!(data, [0.0; 4]);
249    /// ```
250    /// ```should_panic
251    /// # use vrl::Vec4f;
252    /// let mut data = [-1.0; 3];
253    /// Vec4f::default().store_checked(&mut data);
254    /// ```
255    /// ```should_panic
256    /// # use vrl::Vec4f;
257    /// let mut data = [-1.0; 5];
258    /// Vec4f::default().store_checked(&mut data);
259    /// ```
260    #[inline]
261    pub fn store_checked(&self, slice: &mut [f32]) {
262        self.store(
263            slice
264                .try_into()
265                .expect("slice must contain at least 4 elements"),
266        )
267    }
268
269    /// Stores elements of vector into the first four elements of `slice`.
270    ///
271    /// # Panics
272    /// Panics if `slice` contains less then four elements.
273    ///
274    /// # Exmaples
275    /// ```
276    /// # use vrl::Vec4f;
277    /// let mut data = [-1.0; 5];
278    /// Vec4f::broadcast(2.0).store_prefix(&mut data);
279    /// assert_eq!(data, [2.0, 2.0, 2.0, 2.0, -1.0]);
280    /// ```
281    /// ```should_panic
282    /// # use vrl::Vec4f;
283    /// let mut data = [-1.0; 3];
284    /// Vec4f::default().store_prefix(&mut data);
285    /// ```
286    #[inline(always)]
287
288    pub fn store_prefix(&self, slice: &mut [f32]) {
289        if slice.len() < 4 {
290            panic!("slice.len() must at least 4");
291        }
292        unsafe { self.store_ptr(slice.as_ptr() as *mut [f32; 4]) };
293    }
294
295    /// Stores `min(4, slice.len())` elements of vector into prefix of `slice`.
296    ///
297    /// # Exmaples
298    /// ```
299    /// # use vrl::Vec4f;
300    /// let mut data = [0.0; 3];
301    /// Vec4f::broadcast(1.0).store_partial(&mut data);
302    /// assert_eq!(data, [1.0; 3]);
303    /// ```
304    /// ```
305    /// # use vrl::Vec4f;
306    /// let mut data = [0.0; 5];
307    /// Vec4f::broadcast(1.0).store_partial(&mut data);
308    /// assert_eq!(data, [1.0, 1.0, 1.0, 1.0, 0.0]);  // note last zero
309    /// ```
310    #[inline]
311    pub fn store_partial(&self, slice: &mut [f32]) {
312        match slice.len() {
313            4.. => unsafe { self.store_ptr(slice.as_mut_ptr() as *mut [f32; 4]) },
314            _ => slice.copy_from_slice(&<[f32; 4]>::from(self)[..slice.len()]),
315        }
316    }
317
318    /// Calculates the sum of all elements of vector.
319    ///
320    /// # Exmaple
321    /// ```
322    /// # use vrl::Vec4f;
323    /// assert_eq!(Vec4f::new(1.0, 2.0, 3.0, 4.0).sum(), 10.0);
324    /// ```
325    #[inline(always)]
326    pub fn sum(self) -> f32 {
327        // Acoording to Agner Fog, using `hadd` is inefficient.
328        // src: https://github.com/vectorclass/version2/blob/master/vectorf128.h#L1043
329        // TODO: benchmark this implementation and `hadd`-based one
330        unsafe {
331            let t1 = _mm_movehl_ps(self.xmm, self.xmm);
332            let t2 = _mm_add_ps(self.xmm, t1);
333            let t3 = _mm_shuffle_ps(t2, t2, 1);
334            let t4 = _mm_add_ss(t2, t3);
335            _mm_cvtss_f32(t4)
336        }
337    }
338}
339
impl SIMDVector for Vec4f {
    // Raw register type this vector wraps.
    type Underlying = __m128;
    // Scalar type of a single lane.
    type Element = f32;
    // Number of lanes packed into the vector.
    const ELEMENTS: usize = 4;
}
345
346impl Default for Vec4f {
347    /// Initializes all elements of returned vector with zero.
348    ///
349    /// # Example
350    /// ```
351    /// # use vrl::Vec4f;
352    /// assert_eq!(Vec4f::default(), Vec4f::broadcast(0.0));
353    /// ```
354    #[inline(always)]
355    fn default() -> Self {
356        unsafe { _mm_setzero_ps() }.into()
357    }
358}
359
360impl Neg for Vec4f {
361    type Output = Self;
362
363    /// Flips sign bit of each element including non-finite ones.
364    #[inline(always)]
365    fn neg(self) -> Self::Output {
366        unsafe { _mm_xor_ps(self.xmm, _mm_set1_ps(-0f32)) }.into()
367    }
368}
369
// Element-wise `+`, `-`, `*`, `/` (and, presumably, their assigning and
// scalar-operand variants — confirm in `macros`) are generated by a shared
// macro; each maps to a single SSE intrinsic. The trailing `sse` argument
// looks like a target-feature selector — verify against the macro definition.
vec_overload_operator!(Vec4f, Add, add, _mm_add_ps, sse);
vec_overload_operator!(Vec4f, Sub, sub, _mm_sub_ps, sse);
vec_overload_operator!(Vec4f, Mul, mul, _mm_mul_ps, sse);
vec_overload_operator!(Vec4f, Div, div, _mm_div_ps, sse);
// Generates iterator `Sum`/`Product` reductions for `Vec4f`.
vec_impl_sum_prod!(Vec4f);
375
376impl From<__m128> for Vec4f {
377    /// Wraps given `value` into [`Vec4f`].
378    #[inline(always)]
379    fn from(value: __m128) -> Self {
380        Self { xmm: value }
381    }
382}
383
384impl From<Vec4f> for __m128 {
385    /// Unwraps given vector into raw [`__m128`] value.
386    #[inline(always)]
387    fn from(value: Vec4f) -> Self {
388        value.xmm
389    }
390}
391
392impl From<&[f32; 4]> for Vec4f {
393    /// Does same as [`load`](Self::load).
394    #[inline(always)]
395    fn from(value: &[f32; 4]) -> Self {
396        Self::load(value)
397    }
398}
399
400impl From<[f32; 4]> for Vec4f {
401    #[inline(always)]
402    fn from(value: [f32; 4]) -> Self {
403        (&value).into()
404    }
405}
406
407impl From<Vec4f> for [f32; 4] {
408    #[inline(always)]
409    fn from(value: Vec4f) -> Self {
410        let mut result = MaybeUninit::<Self>::uninit();
411        unsafe {
412            value.store_ptr(result.as_mut_ptr());
413            result.assume_init()
414        }
415    }
416}
417
418impl From<&Vec4f> for [f32; 4] {
419    #[inline(always)]
420    fn from(value: &Vec4f) -> Self {
421        unsafe { *(value as *const Vec4f as *const [f32; 4]) }
422    }
423}
424
425impl PartialEq for Vec4f {
426    /// Checks whether all elements of vectors are equal.
427    ///
428    /// __Note__: when [`NaN`](`f32::NAN`) is an element of one of the operands the result is always `false`.
429    ///
430    /// # Examples
431    /// ```
432    /// # use vrl::Vec4f;
433    /// let a = Vec4f::new(1.0, 2.0, 3.0, 4.0);
434    /// assert_eq!(a, a);
435    /// assert_ne!(a, Vec4f::default());
436    /// ```
437    ///
438    /// ```
439    /// # use vrl::Vec4f;
440    /// let a = Vec4f::broadcast(f32::NAN);
441    /// assert_ne!(a, a);
442    /// ```
443    #[inline(always)]
444    fn eq(&self, other: &Self) -> bool {
445        unsafe {
446            let cmp_result = _mm_cmpeq_ps(self.xmm, other.xmm);
447            _mm_movemask_ps(cmp_result) == 0x0F
448        }
449    }
450}
451
452impl Debug for Vec4f {
453    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
454        let mut debug_tuple = f.debug_tuple("Vec4f");
455        for value in <[f32; 4]>::from(self) {
456            debug_tuple.field(&value);
457        }
458        debug_tuple.finish()
459    }
460}
461
#[cfg(test)]
mod tests {
    use super::Vec4f;

    // Smoke test: construction, array round-trips, scalar-operand arithmetic,
    // compound assignment and negation.
    #[test]
    #[inline(never)] // in order to find the function in disassembled binary
    fn it_works() {
        let a = Vec4f::broadcast(1.0);
        assert_eq!(<[f32; 4]>::from(a), [1.0; 4]);
        assert_eq!(a, [1.0; 4].into());

        // Scalar on the left-hand side of `*` (macro-generated operator).
        let b = 2.0 * a;
        assert_ne!(a, b);

        // Dividing back by the scalar round-trips to `a`.
        let mut c = b / 2.0;
        assert_eq!(a, c);

        // `+=` with a vector, then unary negation.
        c += Vec4f::from(&[1.0, 0.0, 2.0, 0.0]);
        let d = -c;

        const EXPECTED_D: [f32; 4] = [-2.0, -1.0, -3.0, -1.0];
        assert_eq!(d, EXPECTED_D.into());
        assert_eq!(<[f32; 4]>::from(d), EXPECTED_D);
    }
}