sparkl2d_kernels/cuda/sort.rs

use crate::cuda::atomic::AtomicAdd;
use crate::cuda::AtomicInt;
use crate::gpu_grid::GpuGrid;
use crate::{
    BlockHeaderId, BlockVirtualId, DispatchBlock2ActiveBlock, HaloBlockData, NUM_CELL_PER_BLOCK,
};
use cuda_std::thread;
use sparkl_core::dynamics::ParticlePosition;

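/// Bit flags describing the halo status of an active grid block: `IS_HALO` marks a block that
/// is also active on a remote domain (see `tag_halo_blocks`), while `HAS_HALO_NEIGHBOR` marks a
/// block that transfers data into such a halo block (see `tag_halo_neighbors`).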
#[cfg_attr(not(target_os = "cuda"), derive(cust::DeviceCopy, bytemuck::Zeroable))]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
#[repr(transparent)]
pub struct HaloState(pub u32);
impl HaloState {
    pub const EMPTY: Self = HaloState(0);
    pub const IS_HALO: Self = HaloState(1);
    pub const HAS_HALO_NEIGHBOR: Self = HaloState(2);

    pub fn contains(self, rhs: Self) -> bool {
        (self.0 & rhs.0) != 0
    }
    pub fn needs_halo_treatment(self) -> bool {
        self.contains(Self::IS_HALO) || self.contains(Self::HAS_HALO_NEIGHBOR)
    }
}

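/// Header describing one active grid block: its packed virtual id, the range of sorted
/// particles it owns (`first_particle`, `num_particles`), and its halo state. `multiplicity`
/// is the number of GPU dispatch blocks needed to cover its particles, i.e. the rounded-up
/// quotient `num_particles / max_threads_per_block`.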
#[cfg_attr(not(target_os = "cuda"), derive(cust::DeviceCopy, bytemuck::Zeroable))]
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct ActiveBlockHeader {
    pub virtual_id: BlockVirtualId,
    pub first_particle: u32,
    pub num_particles: u32,
    pub halo_state: HaloState,
}

impl ActiveBlockHeader {
    pub fn multiplicity(&self, max_threads_per_block: u32) -> u32 {
        self.num_particles / max_threads_per_block
            + (self.num_particles % max_threads_per_block > 0) as u32
    }
}

impl Default for ActiveBlockHeader {
    fn default() -> Self {
        Self {
            virtual_id: BlockVirtualId(0),
            first_particle: 0,
            num_particles: 0,
            halo_state: HaloState::EMPTY,
        }
    }
}

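/// One thread per particle: marks as active every grid block associated with the particle's
/// position, as reported by `GpuGrid::blocks_associated_to_point`.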
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn touch_particle_blocks(
    particles: *mut ParticlePosition,
    particles_len: u32,
    mut grid: GpuGrid,
) {
    let id = thread::index();
    if id < particles_len {
        let p = &*particles.add(id as usize);

        for block_id in grid.blocks_associated_to_point(&p.point) {
            grid.mark_block_as_active(block_id);
        }
    }
}

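/// One thread per remote active block: if that block is also active locally, flags the local
/// block as `IS_HALO`; the atomic exchange returning 0 means this thread is the first to flag
/// it, so `num_halo_blocks` is incremented exactly once per halo block.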
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn tag_halo_blocks(
    mut grid: GpuGrid,
    remote_active_blocks: *const ActiveBlockHeader,
    num_remote_active_blocks: u32,
    num_halo_blocks: *mut u32,
) {
    let id = thread::index();
    if id < num_remote_active_blocks {
        let block_vid = (*remote_active_blocks.add(id as usize)).virtual_id;
        if let Some(block_hid) = grid.get_header_block_id(block_vid) {
            if grid
                .active_block_unchecked_mut(block_hid)
                .halo_state
                .0
                .global_atomic_exch(HaloState::IS_HALO.0)
                == 0
            {
                (*num_halo_blocks).global_atomic_add(1);
            }
        }
    }
}

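/// One thread per local active block: for each block flagged `IS_HALO`, tags the other active
/// blocks transferring data into it with `HAS_HALO_NEIGHBOR`.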
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn tag_halo_neighbors(mut grid: GpuGrid, num_active_blocks: u32) {
    let id = thread::index();
    if id < num_active_blocks {
        let active_block = grid.active_block_unchecked(BlockHeaderId(id));
        if active_block.halo_state.contains(HaloState::IS_HALO) {
            let assoc_blocks =
                GpuGrid::blocks_transferring_into_block(active_block.virtual_id.unpack());
            for assoc_block in &assoc_blocks[1..] {
                if let Some(active_assoc_block_id) = grid.get_header_block_id(*assoc_block) {
                    let active_assoc_block =
                        grid.active_block_unchecked_mut(active_assoc_block_id);
                    active_assoc_block.halo_state.0 |= HaloState::HAS_HALO_NEIGHBOR.0;
                }
            }
        }
    }
}

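/// One thread per active block: copies the cells of every `IS_HALO` block into the staging
/// buffer, claiming an output slot by atomically decrementing `num_halo_blocks`.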
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn copy_halo_to_staging(
    grid: GpuGrid,
    staging_buffer: *mut HaloBlockData,
    num_halo_blocks: *mut u32,
) {
    let id = thread::index();
    if id < grid.num_active_blocks() {
        let block_id = BlockHeaderId(id);
        let block = grid.active_block_unchecked(block_id);

        if block.halo_state.contains(HaloState::IS_HALO) {
            let index = (&mut *num_halo_blocks).global_atomic_dec() - 1;

            let out = &mut *staging_buffer.add(index as usize);
            out.virtual_id = block.virtual_id;
            let first_cell_id = block_id.to_physical();

            for k in 0..NUM_CELL_PER_BLOCK {
                let curr_cell_id = first_cell_id.node(k);
                out.cells[k as usize] = *grid.get_node_unchecked(curr_cell_id);
            }
        }
    }
}

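/// One GPU block per remote halo block, one thread per cell: atomically accumulates the remote
/// cell quantities (mass, momentum/velocity, psi mass, psi momentum/velocity) into the matching
/// local grid nodes, if the block is active locally.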
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn merge_halo_blocks(mut grid: GpuGrid, remote_halo_blocks: *const HaloBlockData) {
    let bid = thread::block_idx_x();
    let tid = thread::thread_idx_x();

    let halo_block = &*remote_halo_blocks.add(bid as usize);
    if let Some(target_block_id) = grid.get_header_block_id(halo_block.virtual_id) {
        let node_id = target_block_id.to_physical().node(tid as u64);
        let target_node = &mut *grid.get_node_mut(node_id).unwrap();
        target_node
            .mass
            .global_red_add(halo_block.cells[tid as usize].mass);
        target_node
            .momentum_velocity
            .global_red_add(halo_block.cells[tid as usize].momentum_velocity);
        target_node
            .psi_mass
            .global_red_add(halo_block.cells[tid as usize].psi_mass);
        target_node
            .psi_momentum_velocity
            .global_red_add(halo_block.cells[tid as usize].psi_momentum_velocity);
    }
}

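/// One thread per particle: atomically increments the particle counter of the active block
/// containing the particle's position.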
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn update_block_particle_count(
    particles: *mut ParticlePosition,
    particles_len: u32,
    mut grid: GpuGrid,
) {
    let id = thread::index();
    if id < particles_len {
        let p = &*particles.add(id as usize);
        let block_id = grid.block_associated_to_point(&p.point);

        if let Some(active_block) = grid.get_packed_block_mut(block_id) {
            active_block.num_particles.global_red_add(1)
        }
    }
}

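/// Copies each active block's particle count into `scan_values`, the input of the prefix sum
/// that later yields each block's `first_particle` offset (see
/// `copy_scan_values_to_first_particles`).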
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn copy_particles_len_to_scan_value(grid: GpuGrid, scan_values: *mut u32) {
    let id = thread::index();
    if id < grid.num_active_blocks() {
        *scan_values.add(id as usize) =
            grid.active_block_unchecked(BlockHeaderId(id)).num_particles;
    }
}

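/// Writes the scanned values back into each active block's `first_particle`.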
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn copy_scan_values_to_first_particles(mut grid: GpuGrid, scan_values: *const u32) {
    let id = thread::index();
    if id < grid.num_active_blocks() {
        grid.active_block_unchecked_mut(BlockHeaderId(id))
            .first_particle = *scan_values.add(id as usize);
    }
}

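/// One thread per particle: reserves the particle's slot within its block by atomically
/// bumping the block's scan value, then stores the particle id at that slot of
/// `sorted_particle_ids`.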
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn finalize_particles_sort(
    particles: *mut ParticlePosition,
    particles_len: u32,
    grid: GpuGrid,
    scan_values: *mut u32,
    sorted_particle_ids: *mut u32,
) {
    let id = thread::index();
    if id < particles_len {
        let p = &*particles.add(id as usize);
        let block_id = grid.block_associated_to_point(&p.point);
        if let Some(active_block_id) = grid.get_header_block_id(block_id) {
            let scan_value = &mut *scan_values.add(active_block_id.0 as usize);
            let target_index = scan_value.global_atomic_add(1);
            *sorted_particle_ids.add(target_index as usize) = id;
        }
    }
}

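/// Writes each active block's dispatch multiplicity into `scan_values` (regular blocks) or
/// `halo_scan_values` (blocks needing halo treatment), zeroing the other entry so the regular
/// and halo dispatch mappings can be scanned independently.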
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn write_blocks_multiplicity_to_scan_value(
    grid: GpuGrid,
    scan_values: *mut u32,
    halo_scan_values: *mut u32,
    max_threads_per_block: u32,
) {
    let id = thread::index();
    if id < grid.num_active_blocks() {
        let active_block = grid.active_block_unchecked(BlockHeaderId(id));
        let multiplicity = active_block.multiplicity(max_threads_per_block);

        if active_block.halo_state.needs_halo_treatment() {
            *scan_values.add(id as usize) = 0;
            *halo_scan_values.add(id as usize) = multiplicity;
        } else {
            *scan_values.add(id as usize) = multiplicity;
            *halo_scan_values.add(id as usize) = 0;
        }
    }
}

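/// One GPU block per active block: fills the dispatch-block-to-active-block mapping (regular or
/// halo, depending on the block's halo state). Each dispatch block entry records the active
/// block id and the index of the first particle it must process.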
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn init_gpu_dispatch_blocks_mapping(
    grid: GpuGrid,
    not_halo_scan_values: *mut u32,
    halo_scan_values: *mut u32,
    max_threads_per_gpu_block: u32,
) {
    let mut tid = thread::thread_idx_x();
    let bid = BlockHeaderId(thread::block_idx_x());
    let bsize = thread::block_dim_x();

    let active_block = grid.active_block_unchecked(bid);

    let (dispatch2active, scan_values) = if active_block.halo_state.needs_halo_treatment() {
        (grid.dispatch_halo_block_to_active_block, halo_scan_values)
    } else {
        (grid.dispatch_block_to_active_block, not_halo_scan_values)
    };

    let multiplicity = active_block.multiplicity(max_threads_per_gpu_block);
    let first_particle = active_block.first_particle;
    let base_dispatch_block_id = *scan_values.add(bid.0 as usize);

    while tid < multiplicity {
        *dispatch2active
            .as_mut_ptr()
            .add((base_dispatch_block_id + tid) as usize) = DispatchBlock2ActiveBlock {
            active_block_id: bid,
            first_particle: first_particle + tid * max_threads_per_gpu_block,
        };
        tid += bsize;
    }
}