pub fn nms_gpu(
    d_desc_sorted_boxes: *const f32,
    n: i32,
    thresh: f32,
    legacy_plus_one: bool,
    d_keep_sorted_list: *mut i32,
    h_nkeep: *mut i32,
    dev_delete_mask: &mut TensorCUDA,
    host_delete_mask: &mut TensorCPU,
    context: *mut CUDAContext,
    box_dim: i32
)