1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
crate::ix!();
impl crate::LipolPs {
/// Adds this interpolator's running values, in place, to the buffer at `src`.
///
/// `src` is reinterpreted as a pointer to `__m128` quads (four `f32` lanes
/// each). For every index `idx` supplied by `lipol_ps_sse_block!`, the quad
/// at `idx` is incremented by `y1` and the quad at `idx + 1` by `y2`; each
/// running value is advanced by `dy` right after it has been used. The sums
/// are written back to `src`.
///
/// NOTE(review): `y1`, `y2`, `dy` and the sequence of `idx` values are
/// produced by `lipol_ps_sse_block!`, which is not defined in this part of
/// the file. The trailing `2` argument presumably makes `idx` step by two
/// over `0..nquads` — confirm against the macro. Whether `self` is modified
/// also depends on the macro.
///
/// # Arguments
///
/// * `src` - Start of the `f32` buffer that is read and overwritten.
/// * `nquads` - Number of quads to process, given as any type that is
///   convertible to `usize`.
///
/// # Panics
///
/// Panics if `nquads` cannot be converted to `usize` (for example a negative
/// signed integer).
///
/// # Safety
///
/// * `src` must be valid for reads and writes of every quad that is
///   touched. Quad `idx + 1` is always accessed together with quad `idx`, so
///   the buffer must cover both (presumably `nquads` is expected to be
///   even — TODO confirm).
/// * `src` must be 16-byte aligned, because it is dereferenced as
///   `*mut __m128`.
///
/// # Examples
///
/// Call-pattern sketch only; constructing a `LipolPs` is not shown here.
///
/// ```rust,ignore
/// // 16-byte aligned storage for two quads (eight `f32` values).
/// #[repr(align(16))]
/// struct Aligned([f32; 8]);
///
/// let mut buf = Aligned([0.0; 8]);
///
/// // SAFETY: `buf` is 16-byte aligned and holds two quads.
/// unsafe {
///     lipol.add_block(buf.0.as_mut_ptr(), 2usize);
/// }
/// ```
pub unsafe fn add_block<NQ: TryInto<usize>>(
&mut self,
src: *mut f32,
nquads: NQ)
where <NQ as TryInto<usize>>::Error: fmt::Debug
{
// Panics when the conversion fails; see `# Panics`.
let nquads: usize = nquads.try_into().unwrap();
lipol_ps_sse_block!(self, nquads, 2;
|idx:usize, ref mut y1, ref mut y2, ref mut dy| {
// View the `f32` buffer as 4-lane SSE quads; the dereferences below
// rely on the caller's alignment and validity guarantees.
let src = src as *mut __m128;
// quad[idx] += y1, then advance y1 by dy.
*src.add(idx) = _mm_add_ps(*src.add(idx), *y1);
*y1 = _mm_add_ps(*y1, *dy) ;
// quad[idx + 1] += y2, then advance y2 by dy.
*src.add(idx + 1) = _mm_add_ps(*src.add(idx + 1), *y2);
*y2 = _mm_add_ps(*y2, *dy);
}
);
}
/// Subtracts this interpolator's running values, in place, from the buffer
/// at `src`.
///
/// `src` is reinterpreted as a pointer to `__m128` quads (four `f32` lanes
/// each). For every index `idx` supplied by `lipol_ps_sse_block!`, `y1` is
/// subtracted from the quad at `idx` and `y2` from the quad at `idx + 1`;
/// each running value is advanced by `dy` right after it has been used. The
/// differences are written back to `src`.
///
/// NOTE(review): `y1`, `y2`, `dy` and the sequence of `idx` values are
/// produced by `lipol_ps_sse_block!`, which is not defined in this part of
/// the file. The trailing `2` argument presumably makes `idx` step by two
/// over `0..nquads` — confirm against the macro. Whether `self` is modified
/// also depends on the macro.
///
/// # Arguments
///
/// * `src` - Start of the `f32` buffer that is read and overwritten.
/// * `nquads` - Number of quads to process, given as any type that is
///   convertible to `usize`.
///
/// # Panics
///
/// Panics if `nquads` cannot be converted to `usize` (for example a negative
/// signed integer).
///
/// # Safety
///
/// * `src` must be valid for reads and writes of every quad that is
///   touched. Quad `idx + 1` is always accessed together with quad `idx`, so
///   the buffer must cover both (presumably `nquads` is expected to be
///   even — TODO confirm).
/// * `src` must be 16-byte aligned, because it is dereferenced as
///   `*mut __m128`.
///
/// # Examples
///
/// Call-pattern sketch only; constructing a `LipolPs` is not shown here.
///
/// ```rust,ignore
/// // 16-byte aligned storage for two quads (eight `f32` values).
/// #[repr(align(16))]
/// struct Aligned([f32; 8]);
///
/// let mut buf = Aligned([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
///
/// // SAFETY: `buf` is 16-byte aligned and holds two quads.
/// unsafe {
///     lipol.subtract_block(buf.0.as_mut_ptr(), 2usize);
/// }
/// ```
pub unsafe fn subtract_block<NQ: TryInto<usize>>(
&mut self,
src: *mut f32,
nquads: NQ)
where <NQ as TryInto<usize>>::Error: fmt::Debug
{
// Panics when the conversion fails; see `# Panics`.
let nquads: usize = nquads.try_into().unwrap();
lipol_ps_sse_block!(self, nquads, 2;
|idx:usize, ref mut y1, ref mut y2, ref mut dy| {
// View the `f32` buffer as 4-lane SSE quads; the dereferences below
// rely on the caller's alignment and validity guarantees.
let src = src as *mut __m128;
// quad[idx] -= y1, then advance y1 by dy.
*src.add(idx) = _mm_sub_ps(
*src.add(idx),
*y1
);
*y1 = _mm_add_ps(*y1, *dy);
// quad[idx + 1] -= y2, then advance y2 by dy.
*src.add(idx + 1) = _mm_sub_ps(
*src.add(idx + 1),
*y2
);
*y2 = _mm_add_ps(*y2, *dy);
}
);
}
}