sparkl2d_kernels/cuda/sort.rs

use crate::cuda::atomic::AtomicAdd;
use crate::cuda::AtomicInt;
use crate::gpu_grid::GpuGrid;
use crate::{
    BlockHeaderId, BlockVirtualId, DispatchBlock2ActiveBlock, HaloBlockData, NUM_CELL_PER_BLOCK,
};
use cuda_std::thread;
use sparkl_core::dynamics::ParticlePosition;

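/// Bit flags indicating whether an active block is a halo block and/or has a halo neighbor.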
// NOTE: this is similar to what we could have gotten with the `bitflags` crate, except
// that we have direct access to the underlying bits so we can run atomic operations on them.
#[cfg_attr(not(target_os = "cuda"), derive(cust::DeviceCopy, bytemuck::Zeroable))]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
#[repr(transparent)]
pub struct HaloState(pub u32);

impl HaloState {
    pub const EMPTY: Self = HaloState(0);
    pub const IS_HALO: Self = HaloState(1);
    pub const HAS_HALO_NEIGHBOR: Self = HaloState(2);

    /// Does this state contain any of the bits set in `rhs`?
    pub fn contains(self, rhs: Self) -> bool {
        (self.0 & rhs.0) != 0
    }

    /// Does this block require the halo code path, i.e., is it a halo block or does it
    /// have a halo neighbor?
    pub fn needs_halo_treatment(self) -> bool {
        self.contains(Self::IS_HALO) || self.contains(Self::HAS_HALO_NEIGHBOR)
    }
}

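/// Header describing one active block of the GPU grid.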
#[cfg_attr(not(target_os = "cuda"), derive(cust::DeviceCopy, bytemuck::Zeroable))]
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct ActiveBlockHeader {
    // Needed to compute the world-space position of the block.
    pub virtual_id: BlockVirtualId,
    pub first_particle: u32,
    pub num_particles: u32,
    pub halo_state: HaloState,
}

impl ActiveBlockHeader {
    /// The number of GPU dispatch blocks needed to cover all the particles of this block,
    /// i.e., `num_particles / max_threads_per_block` rounded up.
    pub fn multiplicity(&self, max_threads_per_block: u32) -> u32 {
        self.num_particles / max_threads_per_block
            + (self.num_particles % max_threads_per_block > 0) as u32
    }
}

impl Default for ActiveBlockHeader {
    fn default() -> Self {
        Self {
            virtual_id: BlockVirtualId(0),
            first_particle: 0,
            num_particles: 0,
            halo_state: HaloState::EMPTY,
        }
    }
}

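/// Marks as active every grid block associated with at least one particle position.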
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn touch_particle_blocks(
    particles: *mut ParticlePosition,
    particles_len: u32,
    mut grid: GpuGrid,
) {
    let id = thread::index();
    if id < particles_len {
        let p = &*particles.add(id as usize);

        for block_id in grid.blocks_associated_to_point(&p.point) {
            grid.mark_block_as_active(block_id);
        }
    }
}

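/// Tags as halo every local active block that also appears in the remote set of active
/// blocks, and accumulates the total number of halo blocks into `num_halo_blocks`.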
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn tag_halo_blocks(
    mut grid: GpuGrid,
    remote_active_blocks: *const ActiveBlockHeader,
    num_remote_active_blocks: u32,
    num_halo_blocks: *mut u32,
) {
    let id = thread::index();
    if id < num_remote_active_blocks {
        let block_vid = (*remote_active_blocks.add(id as usize)).virtual_id;
        if let Some(block_hid) = grid.get_header_block_id(block_vid) {
            // The atomic exchange returns the previous halo state: only the first thread
            // tagging this block sees 0, so each halo block is counted exactly once.
            if grid
                .active_block_unchecked_mut(block_hid)
                .halo_state
                .0
                .global_atomic_exch(HaloState::IS_HALO.0)
                == 0
            {
                (*num_halo_blocks).global_atomic_add(1);
            }
        }
    }
}

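/// Flags as `HAS_HALO_NEIGHBOR` every active block that can transfer into a halo block.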
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn tag_halo_neighbors(mut grid: GpuGrid, num_active_blocks: u32) {
    let id = thread::index();
    if id < num_active_blocks {
        let active_block = grid.active_block_unchecked(BlockHeaderId(id));
        if active_block.halo_state.contains(HaloState::IS_HALO) {
            let assoc_blocks =
                GpuGrid::blocks_transferring_into_block(active_block.virtual_id.unpack());
            // Skip the first associated block: it is the block itself, already tagged as halo.
            for assoc_block in &assoc_blocks[1..] {
                if let Some(active_assoc_block_id) = grid.get_header_block_id(*assoc_block) {
                    let active_assoc_block = grid.active_block_unchecked_mut(active_assoc_block_id);
                    active_assoc_block.halo_state.0 |= HaloState::HAS_HALO_NEIGHBOR.0;
                }
            }
        }
    }
}

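/// Copies the cells of every halo block into `staging_buffer` so they can be exchanged
/// with the remote domain.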
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn copy_halo_to_staging(
    grid: GpuGrid,
    staging_buffer: *mut HaloBlockData,
    num_halo_blocks: *mut u32,
) {
    let id = thread::index();
    if id < grid.num_active_blocks() {
        let block_id = BlockHeaderId(id);
        let block = grid.active_block_unchecked(block_id);

        if block.halo_state.contains(HaloState::IS_HALO) {
            // Atomically grab a unique slot in the staging buffer.
            let index = (&mut *num_halo_blocks).global_atomic_dec() - 1;

            let out = &mut *staging_buffer.add(index as usize);
            out.virtual_id = block.virtual_id;
            let first_cell_id = block_id.to_physical();

            for k in 0..NUM_CELL_PER_BLOCK {
                let curr_cell_id = first_cell_id.node(k);
                out.cells[k as usize] = *grid.get_node_unchecked(curr_cell_id);
            }
        }
    }
}

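/// Atomically accumulates the cell data received from remote halo blocks into the
/// corresponding local grid nodes.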
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn merge_halo_blocks(mut grid: GpuGrid, remote_halo_blocks: *const HaloBlockData) {
    // One GPU block per remote halo block; one thread per grid cell of that block.
    let bid = thread::block_idx_x();
    let tid = thread::thread_idx_x();

    let halo_block = &*remote_halo_blocks.add(bid as usize);
    if let Some(target_block_id) = grid.get_header_block_id(halo_block.virtual_id) {
        let node_id = target_block_id.to_physical().node(tid as u64);
        let target_node = &mut *grid.get_node_mut(node_id).unwrap();
        target_node
            .mass
            .global_red_add(halo_block.cells[tid as usize].mass);
        target_node
            .momentum_velocity
            .global_red_add(halo_block.cells[tid as usize].momentum_velocity);
        target_node
            .psi_mass
            .global_red_add(halo_block.cells[tid as usize].psi_mass);
        target_node
            .psi_momentum_velocity
            .global_red_add(halo_block.cells[tid as usize].psi_momentum_velocity);
    }
}

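/// Counts, for each active block, the number of particles associated with it.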
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn update_block_particle_count(
    particles: *mut ParticlePosition,
    particles_len: u32,
    mut grid: GpuGrid,
) {
    let id = thread::index();
    if id < particles_len {
        let p = &*particles.add(id as usize);
        let block_id = grid.block_associated_to_point(&p.point);

        if let Some(active_block) = grid.get_packed_block_mut(block_id) {
            active_block.num_particles.global_red_add(1)
        }
    }
}

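/// Copies each active block's particle count into `scan_values`, the input of the prefix sum.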
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn copy_particles_len_to_scan_value(grid: GpuGrid, scan_values: *mut u32) {
    let id = thread::index();
    if id < grid.num_active_blocks() {
        *scan_values.add(id as usize) =
            grid.active_block_unchecked(BlockHeaderId(id)).num_particles;
    }
}

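/// Writes the scanned (prefix-summed) particle counts back as each block's `first_particle` index.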
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn copy_scan_values_to_first_particles(mut grid: GpuGrid, scan_values: *const u32) {
    let id = thread::index();
    if id < grid.num_active_blocks() {
        grid.active_block_unchecked_mut(BlockHeaderId(id))
            .first_particle = *scan_values.add(id as usize);
    }
}

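/// Scatters particle ids into `sorted_particle_ids` so that particles belonging to the same
/// block end up contiguous.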
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn finalize_particles_sort(
    particles: *mut ParticlePosition,
    particles_len: u32,
    grid: GpuGrid,
    scan_values: *mut u32,
    sorted_particle_ids: *mut u32,
) {
    let id = thread::index();
    if id < particles_len {
        let p = &*particles.add(id as usize);
        let block_id = grid.block_associated_to_point(&p.point);
        // Write each particle id to its rightful place in the sorted buffer.
        // TODO: store the block id inside of the particle instead?
        if let Some(active_block_id) = grid.get_header_block_id(block_id) {
            let scan_value = &mut *scan_values.add(active_block_id.0 as usize);
            let target_index = scan_value.global_atomic_add(1);
            *sorted_particle_ids.add(target_index as usize) = id;
        }
    }
}

/*
 * Kernels for handling block multiplicity when mapping grid blocks to GPU dispatch blocks.
 */
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn write_blocks_multiplicity_to_scan_value(
    grid: GpuGrid,
    scan_values: *mut u32,
    halo_scan_values: *mut u32,
    max_threads_per_block: u32,
) {
    let id = thread::index();
    if id < grid.num_active_blocks() {
        let active_block = grid.active_block_unchecked(BlockHeaderId(id));
        let multiplicity = active_block.multiplicity(max_threads_per_block);

        // Halo and non-halo blocks are scanned separately: each block writes its
        // multiplicity into one buffer and zero into the other.
        if active_block.halo_state.needs_halo_treatment() {
            *scan_values.add(id as usize) = 0;
            *halo_scan_values.add(id as usize) = multiplicity;
        } else {
            *scan_values.add(id as usize) = multiplicity;
            *halo_scan_values.add(id as usize) = 0;
        }
    }
}

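/// Fills the dispatch-block-to-active-block mapping: each active block gets `multiplicity`
/// dispatch blocks, each covering up to `max_threads_per_gpu_block` particles.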
#[cfg_attr(target_os = "cuda", cuda_std::kernel)]
pub unsafe fn init_gpu_dispatch_blocks_mapping(
    grid: GpuGrid,
    not_halo_scan_values: *mut u32,
    halo_scan_values: *mut u32,
    max_threads_per_gpu_block: u32,
) {
    let mut tid = thread::thread_idx_x();
    let bid = BlockHeaderId(thread::block_idx_x());
    let bsize = thread::block_dim_x();

    let active_block = grid.active_block_unchecked(bid);

    // Halo and non-halo blocks are dispatched separately, so they use distinct
    // mapping buffers and scan values.
    let (dispatch2active, scan_values) = if active_block.halo_state.needs_halo_treatment() {
        (grid.dispatch_halo_block_to_active_block, halo_scan_values)
    } else {
        (grid.dispatch_block_to_active_block, not_halo_scan_values)
    };

    let multiplicity = active_block.multiplicity(max_threads_per_gpu_block);
    let first_particle = active_block.first_particle;
    let base_dispatch_block_id = *scan_values.add(bid.0 as usize);

    // Each thread writes one dispatch-block entry at a time, striding by the GPU block
    // size until this block's multiplicity is covered.
    while tid < multiplicity {
        *dispatch2active
            .as_mut_ptr()
            .add((base_dispatch_block_id + tid) as usize) = DispatchBlock2ActiveBlock {
            active_block_id: bid,
            first_particle: first_particle + tid * max_threads_per_gpu_block,
        };
        tid += bsize;
    }
}