vrl/vec4f.rs
1use std::{
2 fmt::Debug,
3 mem::MaybeUninit,
4 ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign},
5};
6
7use crate::{
8 common::SIMDVector,
9 intrinsics::*,
10 macros::{vec_impl_sum_prod, vec_overload_operator},
11};
12
/// Represents a packed vector of 4 single-precision floating-point values. [`__m128`] wrapper.
#[derive(Clone, Copy)]
#[repr(transparent)]
pub struct Vec4f {
    // Raw SSE register value. `repr(transparent)` guarantees this struct has
    // exactly the layout of `__m128`, which the pointer casts elsewhere in
    // this file rely on.
    xmm: __m128,
}
19
20impl Vec4f {
21 /// Initializes elements of returned vector with given values.
22 ///
23 /// # Example
24 /// ```
25 /// # use vrl::Vec4f;
26 /// assert_eq!(
27 /// Vec4f::new(1.0, 2.0, 3.0, 4.0),
28 /// [1.0, 2.0, 3.0, 4.0].into()
29 /// );
30 /// ```
31 #[inline(always)]
32 #[allow(clippy::too_many_arguments)]
33 pub fn new(v0: f32, v1: f32, v2: f32, v3: f32) -> Self {
34 unsafe { _mm_setr_ps(v0, v1, v2, v3) }.into()
35 }
36
37 /// Loads vector from array pointer by `addr`.
38 /// `addr` is not required to be aligned.
39 ///
40 /// # Safety
41 /// `addr` must be a valid pointer.
42 ///
43 /// # Example
44 /// ```
45 /// # use vrl::Vec4f;
46 /// let array = [42.0; 4];
47 /// let vec = unsafe { Vec4f::load_ptr(&array) };
48 /// ```
49 #[inline(always)]
50 pub unsafe fn load_ptr(addr: *const [f32; 4]) -> Self {
51 _mm_loadu_ps(addr as *const f32).into()
52 }
53
54 /// Loads vector from aligned array pointed by `addr`.
55 ///
56 /// # Safety
57 /// Like [`load`], requires `addr` to be valid.
58 /// Unlike [`load`], requires `addr` to be divisible by `16`, i.e. to be a `16`-bytes aligned address.
59 ///
60 /// [`load`]: Self::load
61 ///
62 /// # Examples
63 /// ```
64 /// # use vrl::Vec4f;
65 /// #[repr(align(16))]
66 /// struct AlignedArray([f32; 4]);
67 ///
68 /// let array = AlignedArray([42.0; 4]);
69 /// let vec = unsafe { Vec4f::load_ptr_aligned(&array.0) };
70 /// assert_eq!(vec, Vec4f::broadcast(42.0));
71 /// ```
72 /// In the following example `zeros` is aligned 2-bytes aligned. Therefore
73 /// `zeros.as_ptr().byte_add(1)` is an odd address and hence not divisible by `16`.
74 /// ```should_panic
75 /// # use vrl::Vec4f;
76 /// let zeros = unsafe { std::mem::zeroed::<[u16; 10]>() };
77 /// unsafe { Vec4f::load_ptr_aligned(zeros.as_ptr().byte_add(1) as *const [f32; 4]) };
78 /// ```
79 #[inline(always)]
80 pub unsafe fn load_ptr_aligned(addr: *const [f32; 4]) -> Self {
81 _mm_load_ps(addr as *const f32).into()
82 }
83
84 /// Loads values of returned vector from given data.
85 ///
86 /// # Exmaple
87 /// ```
88 /// # use vrl::Vec4f;
89 /// assert_eq!(
90 /// Vec4f::new(1.0, 2.0, 3.0, 4.0),
91 /// Vec4f::load(&[1.0, 2.0, 3.0, 4.0])
92 /// );
93 /// ```
94 #[inline(always)]
95 pub fn load(data: &[f32; 4]) -> Self {
96 unsafe { Self::load_ptr(data) }
97 }
98
99 /// Checks that data contains exactly four elements and loads them into vector.
100 ///
101 /// # Panics
102 /// Panics if `data.len()` isn't `4`.
103 ///
104 /// # Examples
105 /// ```
106 /// # use vrl::Vec4f;
107 /// assert_eq!(
108 /// Vec4f::load_checked(&[1.0, 2.0, 3.0, 4.0]),
109 /// Vec4f::new(1.0, 2.0, 3.0, 4.0)
110 /// );
111 /// ```
112 /// ```should_panic
113 /// # use vrl::Vec4f;
114 /// Vec4f::load_checked(&[1.0, 2.0, 3.0]);
115 /// ```
116 /// ```should_panic
117 /// # use vrl::Vec4f;
118 /// Vec4f::load_checked(&[1.0, 2.0, 3.0, 4.0, 5.0]);
119 /// ```
120 #[inline(always)]
121 pub fn load_checked(data: &[f32]) -> Self {
122 Self::load(
123 data.try_into()
124 .expect("data must contain exactly 4 elements"),
125 )
126 }
127
128 /// Loads the first four elements of `data` into vector.
129 ///
130 /// # Panics
131 /// Panics if `data` contains less than four elements.
132 ///
133 /// # Exmaples
134 /// ```
135 /// # use vrl::Vec4f;
136 /// assert_eq!(
137 /// Vec4f::load_prefix(&[1.0, 2.0, 3.0, 4.0, 5.0]),
138 /// Vec4f::new(1.0, 2.0, 3.0, 4.0)
139 /// );
140 /// ```
141 ///
142 /// ```should_panic
143 /// # use vrl::Vec4f;
144 /// Vec4f::load_prefix(&[1.0, 2.0, 3.0]);
145 /// ```
146 #[inline(always)]
147 pub fn load_prefix(data: &[f32]) -> Self {
148 if data.len() < 4 {
149 panic!("data must contain at least 4 elements");
150 }
151 unsafe { Self::load_ptr(data.as_ptr() as *const [f32; 4]) }
152 }
153
154 /// Loads first 4 elements of `data` if available otherwise initializes first elements of
155 /// returned vector with values of `data` and rest elements with zeros.
156 ///
157 /// # Example
158 /// ```
159 /// # use vrl::Vec4f;
160 /// let values = [1.0, 2.0, 3.0, 4.0, 5.0];
161 /// assert_eq!(
162 /// Vec4f::load_partial(&values),
163 /// Vec4f::from(&values[..4].try_into().unwrap())
164 /// );
165 /// assert_eq!(
166 /// Vec4f::load_partial(&values[..2]),
167 /// Vec4f::new(1.0, 2.0, 0.0, 0.0) // note zeros here
168 /// );
169 /// ```
170 #[inline]
171 pub fn load_partial(data: &[f32]) -> Self {
172 match data.len() {
173 4.. => unsafe { Self::load_ptr(data.as_ptr() as *const [f32; 4]) },
174 3 => Self::new(data[0], data[1], data[2], 0.0),
175 2 => Self::new(data[0], data[1], 0.0, 0.0),
176 1 => Self::new(data[0], 0.0, 0.0, 0.0),
177 0 => Self::default(),
178 }
179 }
180
181 /// Returns vector with all its elements initialized with a given `value`, i.e. broadcasts
182 /// `value` to all elements of returned vector.
183 ///
184 /// # Example
185 /// ```
186 /// # use vrl::Vec4f;
187 /// assert_eq!(
188 /// Vec4f::broadcast(42.0),
189 /// [42.0; 4].into()
190 /// );
191 /// ```
192 #[inline(always)]
193 pub fn broadcast(value: f32) -> Self {
194 unsafe { _mm_set1_ps(value) }.into()
195 }
196
197 /// Stores vector into array at given address.
198 ///
199 /// # Safety
200 /// `addr` must be a valid pointer.
201 #[inline(always)]
202 pub unsafe fn store_ptr(&self, addr: *mut [f32; 4]) {
203 _mm_storeu_ps(addr as *mut f32, self.xmm)
204 }
205
206 /// Stores vector into aligned array at given address.
207 ///
208 /// # Safety
209 /// Like [`store_ptr`], requires `addr` to be valid.
210 /// Unlike [`store_ptr`], requires `addr` to be divisible by `16`, i.e. to be a 16-bytes aligned address.
211 ///
212 /// [`store_ptr`]: Self::store_ptr
213 #[inline(always)]
214 pub unsafe fn store_ptr_aligned(&self, addr: *mut [f32; 4]) {
215 _mm_store_ps(addr as *mut f32, self.xmm)
216 }
217
218 /// Stores vector into aligned array at given address in uncached memory (non-temporal store).
219 /// This may be more efficient than [`store_ptr_aligned`] if it is unlikely that stored data will
220 /// stay in cache until it is read again, for instance, when storing large blocks of memory.
221 ///
222 /// # Safety
223 /// Has same requirements as [`store_ptr_aligned`]: `addr` must be valid and
224 /// divisible by `16`, i.e. to be a 16-bytes aligned address.
225 ///
226 /// [`store_ptr_aligned`]: Self::store_ptr_aligned
227 #[inline(always)]
228 pub unsafe fn store_ptr_non_temporal(&self, addr: *mut [f32; 4]) {
229 _mm_stream_ps(addr as *mut f32, self.xmm)
230 }
231
232 /// Stores vector into given `array`.
233 #[inline(always)]
234 pub fn store(&self, array: &mut [f32; 4]) {
235 unsafe { self.store_ptr(array) }
236 }
237
238 /// Checkes that `slice` contains exactly four elements and store elements of vector there.
239 ///
240 /// # Panics
241 /// Panics if `slice.len()` isn't `4`.
242 ///
243 /// # Examples
244 /// ```
245 /// # use vrl::Vec4f;
246 /// let mut data = [-1.0; 4];
247 /// Vec4f::default().store_checked(&mut data);
248 /// assert_eq!(data, [0.0; 4]);
249 /// ```
250 /// ```should_panic
251 /// # use vrl::Vec4f;
252 /// let mut data = [-1.0; 3];
253 /// Vec4f::default().store_checked(&mut data);
254 /// ```
255 /// ```should_panic
256 /// # use vrl::Vec4f;
257 /// let mut data = [-1.0; 5];
258 /// Vec4f::default().store_checked(&mut data);
259 /// ```
260 #[inline]
261 pub fn store_checked(&self, slice: &mut [f32]) {
262 self.store(
263 slice
264 .try_into()
265 .expect("slice must contain at least 4 elements"),
266 )
267 }
268
269 /// Stores elements of vector into the first four elements of `slice`.
270 ///
271 /// # Panics
272 /// Panics if `slice` contains less then four elements.
273 ///
274 /// # Exmaples
275 /// ```
276 /// # use vrl::Vec4f;
277 /// let mut data = [-1.0; 5];
278 /// Vec4f::broadcast(2.0).store_prefix(&mut data);
279 /// assert_eq!(data, [2.0, 2.0, 2.0, 2.0, -1.0]);
280 /// ```
281 /// ```should_panic
282 /// # use vrl::Vec4f;
283 /// let mut data = [-1.0; 3];
284 /// Vec4f::default().store_prefix(&mut data);
285 /// ```
286 #[inline(always)]
287
288 pub fn store_prefix(&self, slice: &mut [f32]) {
289 if slice.len() < 4 {
290 panic!("slice.len() must at least 4");
291 }
292 unsafe { self.store_ptr(slice.as_ptr() as *mut [f32; 4]) };
293 }
294
295 /// Stores `min(4, slice.len())` elements of vector into prefix of `slice`.
296 ///
297 /// # Exmaples
298 /// ```
299 /// # use vrl::Vec4f;
300 /// let mut data = [0.0; 3];
301 /// Vec4f::broadcast(1.0).store_partial(&mut data);
302 /// assert_eq!(data, [1.0; 3]);
303 /// ```
304 /// ```
305 /// # use vrl::Vec4f;
306 /// let mut data = [0.0; 5];
307 /// Vec4f::broadcast(1.0).store_partial(&mut data);
308 /// assert_eq!(data, [1.0, 1.0, 1.0, 1.0, 0.0]); // note last zero
309 /// ```
310 #[inline]
311 pub fn store_partial(&self, slice: &mut [f32]) {
312 match slice.len() {
313 4.. => unsafe { self.store_ptr(slice.as_mut_ptr() as *mut [f32; 4]) },
314 _ => slice.copy_from_slice(&<[f32; 4]>::from(self)[..slice.len()]),
315 }
316 }
317
318 /// Calculates the sum of all elements of vector.
319 ///
320 /// # Exmaple
321 /// ```
322 /// # use vrl::Vec4f;
323 /// assert_eq!(Vec4f::new(1.0, 2.0, 3.0, 4.0).sum(), 10.0);
324 /// ```
325 #[inline(always)]
326 pub fn sum(self) -> f32 {
327 // Acoording to Agner Fog, using `hadd` is inefficient.
328 // src: https://github.com/vectorclass/version2/blob/master/vectorf128.h#L1043
329 // TODO: benchmark this implementation and `hadd`-based one
330 unsafe {
331 let t1 = _mm_movehl_ps(self.xmm, self.xmm);
332 let t2 = _mm_add_ps(self.xmm, t1);
333 let t3 = _mm_shuffle_ps(t2, t2, 1);
334 let t4 = _mm_add_ss(t2, t3);
335 _mm_cvtss_f32(t4)
336 }
337 }
338}
339
impl SIMDVector for Vec4f {
    // Raw register type wrapped by this vector.
    type Underlying = __m128;
    // Scalar type of a single lane.
    type Element = f32;
    // Number of lanes.
    const ELEMENTS: usize = 4;
}
345
346impl Default for Vec4f {
347 /// Initializes all elements of returned vector with zero.
348 ///
349 /// # Example
350 /// ```
351 /// # use vrl::Vec4f;
352 /// assert_eq!(Vec4f::default(), Vec4f::broadcast(0.0));
353 /// ```
354 #[inline(always)]
355 fn default() -> Self {
356 unsafe { _mm_setzero_ps() }.into()
357 }
358}
359
360impl Neg for Vec4f {
361 type Output = Self;
362
363 /// Flips sign bit of each element including non-finite ones.
364 #[inline(always)]
365 fn neg(self) -> Self::Output {
366 unsafe { _mm_xor_ps(self.xmm, _mm_set1_ps(-0f32)) }.into()
367 }
368}
369
// Arithmetic operators (`+`, `-`, `*`, `/` and the matching `*Assign` forms)
// are generated by a shared macro on top of the corresponding SSE intrinsics.
// NOTE(review): judging by the tests below (`2.0 * a`, `b / 2.0`), the macro
// presumably also emits scalar-vector operator forms — confirm in
// `macros::vec_overload_operator`.
vec_overload_operator!(Vec4f, Add, add, _mm_add_ps, sse);
vec_overload_operator!(Vec4f, Sub, sub, _mm_sub_ps, sse);
vec_overload_operator!(Vec4f, Mul, mul, _mm_mul_ps, sse);
vec_overload_operator!(Vec4f, Div, div, _mm_div_ps, sse);
// Iterator `Sum`/`Product` support for `Vec4f`.
vec_impl_sum_prod!(Vec4f);
375
376impl From<__m128> for Vec4f {
377 /// Wraps given `value` into [`Vec4f`].
378 #[inline(always)]
379 fn from(value: __m128) -> Self {
380 Self { xmm: value }
381 }
382}
383
384impl From<Vec4f> for __m128 {
385 /// Unwraps given vector into raw [`__m128`] value.
386 #[inline(always)]
387 fn from(value: Vec4f) -> Self {
388 value.xmm
389 }
390}
391
392impl From<&[f32; 4]> for Vec4f {
393 /// Does same as [`load`](Self::load).
394 #[inline(always)]
395 fn from(value: &[f32; 4]) -> Self {
396 Self::load(value)
397 }
398}
399
400impl From<[f32; 4]> for Vec4f {
401 #[inline(always)]
402 fn from(value: [f32; 4]) -> Self {
403 (&value).into()
404 }
405}
406
407impl From<Vec4f> for [f32; 4] {
408 #[inline(always)]
409 fn from(value: Vec4f) -> Self {
410 let mut result = MaybeUninit::<Self>::uninit();
411 unsafe {
412 value.store_ptr(result.as_mut_ptr());
413 result.assume_init()
414 }
415 }
416}
417
418impl From<&Vec4f> for [f32; 4] {
419 #[inline(always)]
420 fn from(value: &Vec4f) -> Self {
421 unsafe { *(value as *const Vec4f as *const [f32; 4]) }
422 }
423}
424
425impl PartialEq for Vec4f {
426 /// Checks whether all elements of vectors are equal.
427 ///
428 /// __Note__: when [`NaN`](`f32::NAN`) is an element of one of the operands the result is always `false`.
429 ///
430 /// # Examples
431 /// ```
432 /// # use vrl::Vec4f;
433 /// let a = Vec4f::new(1.0, 2.0, 3.0, 4.0);
434 /// assert_eq!(a, a);
435 /// assert_ne!(a, Vec4f::default());
436 /// ```
437 ///
438 /// ```
439 /// # use vrl::Vec4f;
440 /// let a = Vec4f::broadcast(f32::NAN);
441 /// assert_ne!(a, a);
442 /// ```
443 #[inline(always)]
444 fn eq(&self, other: &Self) -> bool {
445 unsafe {
446 let cmp_result = _mm_cmpeq_ps(self.xmm, other.xmm);
447 _mm_movemask_ps(cmp_result) == 0x0F
448 }
449 }
450}
451
452impl Debug for Vec4f {
453 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
454 let mut debug_tuple = f.debug_tuple("Vec4f");
455 for value in <[f32; 4]>::from(self) {
456 debug_tuple.field(&value);
457 }
458 debug_tuple.finish()
459 }
460}
461
#[cfg(test)]
mod tests {
    use super::Vec4f;

    // Smoke test covering construction, array conversions, operator
    // overloads (including the scalar-with-vector forms), compound
    // assignment, equality, and negation.
    #[test]
    #[inline(never)] // in order to find the function in disassembled binary
    fn it_works() {
        let a = Vec4f::broadcast(1.0);
        assert_eq!(<[f32; 4]>::from(a), [1.0; 4]);
        assert_eq!(a, [1.0; 4].into());

        // Scalar-on-the-left multiplication form.
        let b = 2.0 * a;
        assert_ne!(a, b);

        let mut c = b / 2.0;
        assert_eq!(a, c);

        c += Vec4f::from(&[1.0, 0.0, 2.0, 0.0]);
        let d = -c;

        const EXPECTED_D: [f32; 4] = [-2.0, -1.0, -3.0, -1.0];
        assert_eq!(d, EXPECTED_D.into());
        assert_eq!(<[f32; 4]>::from(d), EXPECTED_D);
    }
}
486}