use crate::{
build_flags::{
dwt97_fused_column_quantize_disabled, DWT97_ROW_LIFT_COOP_ROWS_PER_BLOCK,
DWT97_ROW_LIFT_COOP_THREADS_X, DWT97_ROW_LIFT_MAX_WIDTH,
PINNED_POOLED_I16_UPLOAD_MAX_BYTES, TRANSCODE_PTX_BUILT_FROM_CUDA,
},
bytes::i16_slice_as_bytes,
context::CudaContext,
error::CudaError,
execution::cuda_kernel_param,
j2k_encode::CudaDwt97BatchStageTimings,
kernels::{
self, copy_u8_launch_geometry, j2k_dwt53_launch_geometry, with_grid_y, with_grid_z,
CudaKernel,
},
memory::{pooled_device_buffer, CudaBufferPool, CudaDeviceBuffer, CudaPooledDeviceBuffer},
};
use std::os::raw::c_uint;
/// Host-side sub-band coefficients returned by
/// `CudaContext::j2k_transcode_reversible_dwt53`.
///
/// Each vector is downloaded from its own device buffer; the element count
/// requested for the download is noted on the field.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CudaTranscodeReversible53Bands {
/// `low_width * low_height` values.
pub ll: Vec<i32>,
/// `high_width * low_height` values.
pub hl: Vec<i32>,
/// `low_width * high_height` values.
pub lh: Vec<i32>,
/// `high_width * high_height` values.
pub hh: Vec<i32>,
/// `width.div_ceil(2)` of the requested image width.
pub low_width: usize,
/// `height.div_ceil(2)` of the requested image height.
pub low_height: usize,
/// `width / 2` of the requested image width.
pub high_width: usize,
/// `height / 2` of the requested image height.
pub high_height: usize,
}
/// Grid dimensions pre-converted to `i32` so they can be handed to kernel
/// launches as scalar parameters.
///
/// Built by `validate_dct_block_grid`, which fails if any of these values does
/// not fit in `i32`. Despite the name, the 9/7 launch helpers in this file take
/// the same struct.
#[derive(Clone, Copy)]
pub(crate) struct Reversible53Dims {
/// Number of 8x8 block columns supplied by the caller.
pub(crate) block_cols: i32,
/// Image width as supplied by the caller.
pub(crate) width: i32,
/// Image height as supplied by the caller.
pub(crate) height: i32,
/// `width.div_ceil(2)`.
pub(crate) low_width: i32,
/// `width / 2`.
pub(crate) high_width: i32,
}
/// Validated geometry of a DCT block grid, as produced by
/// `validate_dct_block_grid`.
#[derive(Clone, Copy)]
pub(crate) struct DctBlockGrid {
/// `block_cols * block_rows` (blocks in a single item).
pub(crate) block_count: usize,
/// `block_count * 64 * item_count`; equal to the validated coefficient length.
pub(crate) expected_coeffs: usize,
/// `width.div_ceil(2)`.
pub(crate) low_width: usize,
/// `height.div_ceil(2)`.
pub(crate) low_height: usize,
/// `width / 2`.
pub(crate) high_width: usize,
/// `height / 2`.
pub(crate) high_height: usize,
/// The same geometry narrowed to `i32` for use as kernel parameters.
pub(crate) dims: Reversible53Dims,
}
impl CudaContext {
#[allow(clippy::too_many_lines)]
pub fn j2k_transcode_reversible_dwt53(
&self,
dequantized_blocks: &[i16],
block_cols: usize,
block_rows: usize,
width: usize,
height: usize,
) -> Result<CudaTranscodeReversible53Bands, CudaError> {
if !TRANSCODE_PTX_BUILT_FROM_CUDA {
return Err(CudaError::InvalidArgument {
message: "CUDA transcode kernels were not built (nvcc unavailable at build time)"
.to_string(),
});
}
let grid = validate_dct_block_grid(
block_cols,
block_rows,
width,
height,
1,
dequantized_blocks.len(),
"reversible 5/3 transcode job has unsupported grid geometry",
)?;
let DctBlockGrid {
block_count,
expected_coeffs,
low_width,
low_height,
high_width,
high_height,
dims,
} = grid;
self.inner.set_current()?;
let alloc_i32 = |count: usize| -> Result<CudaDeviceBuffer, CudaError> {
let bytes = count
.checked_mul(std::mem::size_of::<i32>())
.ok_or(CudaError::LengthTooLarge { len: count })?;
self.allocate(bytes)
};
let samples = alloc_i32(expected_coeffs)?;
let v_low = alloc_i32(width * low_height)?;
let v_high = alloc_i32(width * high_height)?;
let ll = alloc_i32(low_width * low_height)?;
let hl = alloc_i32(high_width * low_height)?;
let lh = alloc_i32(low_width * high_height)?;
let hh = alloc_i32(high_width * high_height)?;
let block_bytes: &[u8] = unsafe {
std::slice::from_raw_parts(
dequantized_blocks.as_ptr().cast::<u8>(),
std::mem::size_of_val(dequantized_blocks),
)
};
let blocks_dev = self.upload(block_bytes)?;
self.launch_transcode_reversible53_idct(&blocks_dev, &samples, block_count)?;
if low_height > 0 {
self.launch_transcode_reversible53_vertical(
CudaKernel::TranscodeReversible53VerticalLow,
&samples,
dims,
&v_low,
checked_i32(low_height)?,
)?;
self.launch_transcode_reversible53_horizontal(
CudaKernel::TranscodeReversible53HorizontalLow,
&v_low,
dims,
checked_i32(low_height)?,
&ll,
&hl,
)?;
}
if high_height > 0 {
self.launch_transcode_reversible53_vertical(
CudaKernel::TranscodeReversible53VerticalHigh,
&samples,
dims,
&v_high,
checked_i32(high_height)?,
)?;
self.launch_transcode_reversible53_horizontal(
CudaKernel::TranscodeReversible53HorizontalHigh,
&v_high,
dims,
checked_i32(high_height)?,
&lh,
&hh,
)?;
}
Ok(CudaTranscodeReversible53Bands {
ll: Self::download_i32_band(&ll, low_width * low_height)?,
hl: Self::download_i32_band(&hl, high_width * low_height)?,
lh: Self::download_i32_band(&lh, low_width * high_height)?,
hh: Self::download_i32_band(&hh, high_width * high_height)?,
low_width,
low_height,
high_width,
high_height,
})
}
/// Launches `CudaKernel::TranscodeReversible53Idct` with the parameters
/// `(blocks, samples, block_count as u32)`.
///
/// Does nothing when `block_count` is zero. Launch geometry comes from
/// `copy_u8_launch_geometry(block_count)`.
///
/// # Errors
///
/// `CudaError::LengthTooLarge` if `block_count` does not fit in `u32` or no
/// launch geometry exists for it; kernel lookup and launch errors propagate.
fn launch_transcode_reversible53_idct(
    &self,
    blocks: &CudaDeviceBuffer,
    samples: &CudaDeviceBuffer,
    block_count: usize,
) -> Result<(), CudaError> {
    if block_count == 0 {
        return Ok(());
    }
    let kernel_fn = self
        .inner
        .kernel_function(CudaKernel::TranscodeReversible53Idct)?;
    let mut arg_blocks = blocks.device_ptr();
    let mut arg_samples = samples.device_ptr();
    let Ok(mut arg_count) = u32::try_from(block_count) else {
        return Err(CudaError::LengthTooLarge { len: block_count });
    };
    let mut kernel_args = cuda_kernel_params!(arg_blocks, arg_samples, arg_count);
    let Some(launch_shape) = copy_u8_launch_geometry(block_count) else {
        return Err(CudaError::LengthTooLarge { len: block_count });
    };
    self.launch_kernel(kernel_fn, launch_shape, &mut kernel_args)
}
fn launch_transcode_reversible53_vertical(
&self,
kernel: CudaKernel,
samples: &CudaDeviceBuffer,
dims: Reversible53Dims,
out: &CudaDeviceBuffer,
out_rows: i32,
) -> Result<(), CudaError> {
let function = self.inner.kernel_function(kernel)?;
let mut samples_ptr = samples.device_ptr();
let mut block_cols = dims.block_cols;
let mut width = dims.width;
let mut height = dims.height;
let mut out_ptr = out.device_ptr();
let mut rows = out_rows;
let mut params = cuda_kernel_params!(samples_ptr, block_cols, width, height, out_ptr, rows);
let grid_w = u32::try_from(dims.width).map_err(|_| CudaError::LengthTooLarge { len: 0 })?;
let grid_h = u32::try_from(out_rows).map_err(|_| CudaError::LengthTooLarge { len: 0 })?;
let geometry = j2k_dwt53_launch_geometry(grid_w, grid_h)
.ok_or(CudaError::LengthTooLarge { len: 0 })?;
self.launch_kernel(function, geometry, &mut params)
}
/// Launches the given horizontal-pass `kernel` with the parameters
/// `(rows_buffer, width, n_rows, low_width, high_width, low_out, high_out)`.
///
/// Returns immediately when `n_rows` is zero. Launch geometry comes from
/// `copy_u8_launch_geometry(n_rows)`.
///
/// # Errors
///
/// `CudaError::LengthTooLarge` if `n_rows` is negative or no launch geometry
/// exists for it; kernel lookup and launch errors propagate.
fn launch_transcode_reversible53_horizontal(
    &self,
    kernel: CudaKernel,
    rows_buffer: &CudaDeviceBuffer,
    dims: Reversible53Dims,
    n_rows: i32,
    low_out: &CudaDeviceBuffer,
    high_out: &CudaDeviceBuffer,
) -> Result<(), CudaError> {
    let total_rows = match usize::try_from(n_rows) {
        Ok(0) => return Ok(()),
        Ok(count) => count,
        Err(_) => return Err(CudaError::LengthTooLarge { len: 0 }),
    };
    let kernel_fn = self.inner.kernel_function(kernel)?;
    let mut arg_rows_ptr = rows_buffer.device_ptr();
    let mut arg_width = dims.width;
    let mut arg_rows = n_rows;
    let mut arg_low_width = dims.low_width;
    let mut arg_high_width = dims.high_width;
    let mut arg_low_out = low_out.device_ptr();
    let mut arg_high_out = high_out.device_ptr();
    let mut kernel_args = cuda_kernel_params!(
        arg_rows_ptr,
        arg_width,
        arg_rows,
        arg_low_width,
        arg_high_width,
        arg_low_out,
        arg_high_out
    );
    let Some(launch_shape) = copy_u8_launch_geometry(total_rows) else {
        return Err(CudaError::LengthTooLarge { len: total_rows });
    };
    self.launch_kernel(kernel_fn, launch_shape, &mut kernel_args)
}
}
/// Host-side `f32` sub-bands for one item, as returned by
/// `CudaContext::j2k_transcode_dwt97` and by the batch variants (one value per
/// item).
#[derive(Clone, Debug, PartialEq)]
pub struct CudaTranscodeDwt97Bands {
/// `low_width * low_height` values.
pub ll: Vec<f32>,
/// `high_width * low_height` values.
pub hl: Vec<f32>,
/// `low_width * high_height` values.
pub lh: Vec<f32>,
/// `high_width * high_height` values.
pub hh: Vec<f32>,
/// `width.div_ceil(2)` of the requested image width.
pub low_width: usize,
/// `height.div_ceil(2)` of the requested image height.
pub low_height: usize,
/// `width / 2` of the requested image width.
pub high_width: usize,
/// `height / 2` of the requested image height.
pub high_height: usize,
}
/// Per-band quantization factors and code-block size passed to the
/// quantize/code-block kernels.
///
/// NOTE(review): the `inv_delta_*` names suggest each value is the reciprocal
/// of the band's quantization step size — confirm against the kernel source.
#[derive(Clone, Copy, Debug)]
pub struct CudaHtj2k97QuantizeParams {
/// Factor forwarded for the `ll` band.
pub inv_delta_ll: f32,
/// Factor forwarded for the `hl` band.
pub inv_delta_hl: f32,
/// Factor forwarded for the `lh` band.
pub inv_delta_lh: f32,
/// Factor forwarded for the `hh` band.
pub inv_delta_hh: f32,
/// Code-block width; narrowed to `i32` before the fused kernel launch.
pub cb_width: usize,
/// Code-block height; narrowed to `i32` before the fused kernel launch.
pub cb_height: usize,
}
/// Borrowed device buffers, one per sub-band, handed to
/// `launch_transcode_dwt97_quantize_codeblock_bands`.
///
/// Callers in this file fill it with freshly allocated `i32` buffers that they
/// return or download afterwards.
#[derive(Clone, Copy)]
pub(crate) struct Dwt97CodeblockBandBuffers<'a> {
/// Buffer for the `ll` band.
pub(crate) ll: &'a CudaDeviceBuffer,
/// Buffer for the `hl` band.
pub(crate) hl: &'a CudaDeviceBuffer,
/// Buffer for the `lh` band.
pub(crate) lh: &'a CudaDeviceBuffer,
/// Buffer for the `hh` band.
pub(crate) hh: &'a CudaDeviceBuffer,
}
/// Host-side quantized `i32` bands for a whole batch, as returned by
/// `CudaContext::j2k_transcode_htj2k97_codeblock_batch_with_pool`.
///
/// Each vector holds `item_count` times the per-item band size.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CudaHtj2k97CodeblockBands {
/// `item_count * low_width * low_height` values.
pub ll: Vec<i32>,
/// `item_count * high_width * low_height` values.
pub hl: Vec<i32>,
/// `item_count * low_width * high_height` values.
pub lh: Vec<i32>,
/// `item_count * high_width * high_height` values.
pub hh: Vec<i32>,
/// Number of items in the batch.
pub item_count: usize,
/// `width.div_ceil(2)` of the requested image width.
pub low_width: usize,
/// `height.div_ceil(2)` of the requested image height.
pub low_height: usize,
/// `width / 2` of the requested image width.
pub high_width: usize,
/// `height / 2` of the requested image height.
pub high_height: usize,
}
/// Quantized `i32` bands for a whole batch that stay in pooled device memory,
/// as returned by the `*_resident*` transcode methods.
///
/// Each buffer was taken from the pool with room for `item_count` times the
/// per-item band size, in `i32` elements.
#[derive(Debug)]
pub struct CudaHtj2k97DeviceCodeblockBands {
/// Sized for `item_count * low_width * low_height` `i32` values.
pub ll: CudaPooledDeviceBuffer,
/// Sized for `item_count * high_width * low_height` `i32` values.
pub hl: CudaPooledDeviceBuffer,
/// Sized for `item_count * low_width * high_height` `i32` values.
pub lh: CudaPooledDeviceBuffer,
/// Sized for `item_count * high_width * high_height` `i32` values.
pub hh: CudaPooledDeviceBuffer,
/// Number of items in the batch.
pub item_count: usize,
/// `width.div_ceil(2)` of the requested image width.
pub low_width: usize,
/// `height.div_ceil(2)` of the requested image height.
pub low_height: usize,
/// `width / 2` of the requested image width.
pub high_width: usize,
/// `height / 2` of the requested image height.
pub high_height: usize,
}
/// Pooled device buffers holding the `f32` sub-bands of a whole batch, as
/// produced by `transcode_dwt97_batch_input_to_device`.
///
/// Note the field order here is `ll, lh, hl, hh`, unlike the public band
/// structs.
pub(crate) struct Dwt97BatchDeviceBands {
/// Sized for `item_count * low_width * low_height` `f32` values.
pub(crate) ll: CudaPooledDeviceBuffer,
/// Sized for `item_count * low_width * high_height` `f32` values.
pub(crate) lh: CudaPooledDeviceBuffer,
/// Sized for `item_count * high_width * low_height` `f32` values.
pub(crate) hl: CudaPooledDeviceBuffer,
/// Sized for `item_count * high_width * high_height` `f32` values.
pub(crate) hh: CudaPooledDeviceBuffer,
/// `width.div_ceil(2)` of the requested image width.
pub(crate) low_width: usize,
/// `height.div_ceil(2)` of the requested image height.
pub(crate) low_height: usize,
/// `width / 2` of the requested image width.
pub(crate) high_width: usize,
/// `height / 2` of the requested image height.
pub(crate) high_height: usize,
}
/// Borrowed host-side coefficient input for the batched 9/7 transcode path.
///
/// The variant decides both how the data is uploaded and which IDCT batch
/// kernel is launched.
#[derive(Clone, Copy)]
pub(crate) enum Dwt97BatchInput<'a> {
/// `f32` coefficients; uploaded with `CudaBufferPool::upload_f32`.
F32(&'a [f32]),
/// `i16` coefficients; uploaded as raw bytes.
I16(&'a [i16]),
}
impl Dwt97BatchInput<'_> {
    /// Number of coefficients (elements, not bytes) in the wrapped slice.
    fn len(self) -> usize {
        match self {
            Self::F32(coeffs) => coeffs.len(),
            Self::I16(coeffs) => coeffs.len(),
        }
    }

    /// Uploads the wrapped slice into a buffer taken from `pool`.
    ///
    /// `f32` input goes through `upload_f32`. `i16` input is viewed as bytes
    /// and sent through `upload_pinned` when
    /// `should_use_pinned_pooled_i16_upload` accepts its byte length, otherwise
    /// through `upload`.
    fn upload(self, pool: &CudaBufferPool) -> Result<CudaPooledDeviceBuffer, CudaError> {
        let coeffs = match self {
            Self::F32(coeffs) => return pool.upload_f32(coeffs),
            Self::I16(coeffs) => coeffs,
        };
        let raw = i16_slice_as_bytes(coeffs);
        if !should_use_pinned_pooled_i16_upload(raw.len()) {
            return pool.upload(raw);
        }
        pool.upload_pinned(raw)
    }
}
/// Returns `true` when an `i16` upload of `byte_len` bytes is at or below
/// `PINNED_POOLED_I16_UPLOAD_MAX_BYTES`, in which case the pinned pooled
/// upload path is used.
pub(crate) fn should_use_pinned_pooled_i16_upload(byte_len: usize) -> bool {
    PINNED_POOLED_I16_UPLOAD_MAX_BYTES >= byte_len
}
/// Validates a grid of 8x8 coefficient blocks and derives the sub-band sizes.
///
/// Overflow checks run first, then the geometry checks:
/// `item_count`, `width` and `height` must be non-zero, the image must fit in
/// the area covered by the blocks (`block_cols * 8` by `block_rows * 8`), and
/// `coeff_len` must equal `block_cols * block_rows * 64 * item_count`.
///
/// # Errors
///
/// * `CudaError::LengthTooLarge` if a product overflows `usize` or a dimension
///   does not fit in `i32`.
/// * `CudaError::InvalidArgument` carrying `invalid_message` if a geometry
///   check fails.
pub(crate) fn validate_dct_block_grid(
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    item_count: usize,
    coeff_len: usize,
    invalid_message: &'static str,
) -> Result<DctBlockGrid, CudaError> {
    let Some(block_count) = block_cols.checked_mul(block_rows) else {
        return Err(CudaError::LengthTooLarge { len: block_cols });
    };
    let Some(covered_w) = block_cols.checked_mul(8) else {
        return Err(CudaError::LengthTooLarge { len: block_cols });
    };
    let Some(covered_h) = block_rows.checked_mul(8) else {
        return Err(CudaError::LengthTooLarge { len: block_rows });
    };
    let Some(per_item_coeffs) = block_count.checked_mul(64) else {
        return Err(CudaError::LengthTooLarge { len: block_count });
    };
    let Some(expected_coeffs) = per_item_coeffs.checked_mul(item_count) else {
        return Err(CudaError::LengthTooLarge {
            len: per_item_coeffs,
        });
    };

    let geometry_ok = item_count != 0
        && width != 0
        && height != 0
        && width <= covered_w
        && height <= covered_h
        && coeff_len == expected_coeffs;
    if !geometry_ok {
        return Err(CudaError::InvalidArgument {
            message: invalid_message.to_owned(),
        });
    }

    // The high band gets the floor half, the low band the remainder (ceil half).
    let high_width = width / 2;
    let high_height = height / 2;
    let low_width = width - high_width;
    let low_height = height - high_height;

    let dims = Reversible53Dims {
        block_cols: checked_i32(block_cols)?,
        width: checked_i32(width)?,
        height: checked_i32(height)?,
        low_width: checked_i32(low_width)?,
        high_width: checked_i32(high_width)?,
    };
    Ok(DctBlockGrid {
        block_count,
        expected_coeffs,
        low_width,
        low_height,
        high_width,
        high_height,
        dims,
    })
}
/// Narrows `value` to `i32`, returning `CudaError::LengthTooLarge` carrying
/// the original value when it does not fit.
pub(crate) fn checked_i32(value: usize) -> Result<i32, CudaError> {
    match i32::try_from(value) {
        Ok(narrowed) => Ok(narrowed),
        Err(_) => Err(CudaError::LengthTooLarge { len: value }),
    }
}
impl CudaContext {
/// Runs the 9/7 transcode kernel chain on one grid of 8x8 `f32` blocks and
/// downloads the four resulting sub-bands.
///
/// After validating the grid (`item_count` fixed to 1) it allocates `f32`
/// device buffers, uploads `blocks`, launches the IDCT, row-lift and
/// column-lift kernels (the latter once per non-empty row band), then
/// downloads `ll`, `hl`, `lh` and `hh`.
///
/// # Errors
///
/// * `CudaError::InvalidArgument` if the transcode kernels were not built, or
///   if `validate_dct_block_grid` rejects the geometry.
/// * `CudaError::LengthTooLarge` if a byte-size computation overflows.
/// * Allocation, upload, launch and download errors propagate unchanged.
#[allow(clippy::too_many_lines)]
pub fn j2k_transcode_dwt97(
    &self,
    blocks: &[f32],
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
) -> Result<CudaTranscodeDwt97Bands, CudaError> {
    if !TRANSCODE_PTX_BUILT_FROM_CUDA {
        return Err(CudaError::InvalidArgument {
            message: "CUDA transcode kernels were not built (nvcc unavailable at build time)"
                .to_string(),
        });
    }
    let DctBlockGrid {
        low_width,
        low_height,
        high_width,
        high_height,
        dims,
        ..
    } = validate_dct_block_grid(
        block_cols,
        block_rows,
        width,
        height,
        1,
        blocks.len(),
        "9/7 transcode job has unsupported grid geometry",
    )?;
    self.inner.set_current()?;

    let device_f32 = |elements: usize| -> Result<CudaDeviceBuffer, CudaError> {
        match elements.checked_mul(std::mem::size_of::<f32>()) {
            Some(bytes) => self.allocate(bytes),
            None => Err(CudaError::LengthTooLarge { len: elements }),
        }
    };
    let ll_len = low_width * low_height;
    let lh_len = low_width * high_height;
    let hl_len = high_width * low_height;
    let hh_len = high_width * high_height;

    let spatial = device_f32(width * height)?;
    let row_low = device_f32(height * low_width)?;
    let row_high = device_f32(height * high_width)?;
    let ll = device_f32(ll_len)?;
    let lh = device_f32(lh_len)?;
    let hl = device_f32(hl_len)?;
    let hh = device_f32(hh_len)?;
    let blocks_dev = self.upload_f32(blocks)?;

    self.launch_transcode_dwt97_idct(dims, &blocks_dev, &spatial)?;
    self.launch_transcode_dwt97_row_lift(dims, &spatial, &row_low, &row_high)?;
    // Each row band feeds one column-lift launch that fills two output bands.
    if dims.low_width > 0 {
        self.launch_transcode_dwt97_column_lift(&row_low, dims.low_width, dims.height, &ll, &lh)?;
    }
    if dims.high_width > 0 {
        self.launch_transcode_dwt97_column_lift(
            &row_high,
            dims.high_width,
            dims.height,
            &hl,
            &hh,
        )?;
    }

    Ok(CudaTranscodeDwt97Bands {
        ll: Self::download_f32_band(&ll, ll_len)?,
        hl: Self::download_f32_band(&hl, hl_len)?,
        lh: Self::download_f32_band(&lh, lh_len)?,
        hh: Self::download_f32_band(&hh, hh_len)?,
        low_width,
        low_height,
        high_width,
        high_height,
    })
}
fn launch_transcode_dwt97_idct(
&self,
dims: Reversible53Dims,
blocks: &CudaDeviceBuffer,
spatial: &CudaDeviceBuffer,
) -> Result<(), CudaError> {
let function = self.inner.kernel_function(CudaKernel::TranscodeDwt97Idct)?;
let mut blocks_ptr = blocks.device_ptr();
let mut block_cols = dims.block_cols;
let mut width = dims.width;
let mut height = dims.height;
let mut spatial_ptr = spatial.device_ptr();
let mut params = cuda_kernel_params!(blocks_ptr, block_cols, width, height, spatial_ptr);
let grid_w = u32::try_from(dims.width).map_err(|_| CudaError::LengthTooLarge { len: 0 })?;
let grid_h =
u32::try_from(dims.height).map_err(|_| CudaError::LengthTooLarge { len: 0 })?;
let geometry = j2k_dwt53_launch_geometry(grid_w, grid_h)
.ok_or(CudaError::LengthTooLarge { len: 0 })?;
self.launch_kernel(function, geometry, &mut params)
}
fn launch_transcode_dwt97_row_lift(
&self,
dims: Reversible53Dims,
spatial: &CudaDeviceBuffer,
row_low: &CudaDeviceBuffer,
row_high: &CudaDeviceBuffer,
) -> Result<(), CudaError> {
let function = self
.inner
.kernel_function(CudaKernel::TranscodeDwt97RowLift)?;
let mut spatial_ptr = spatial.device_ptr();
let mut width = dims.width;
let mut height = dims.height;
let mut low_width = dims.low_width;
let mut high_width = dims.high_width;
let mut low_ptr = row_low.device_ptr();
let mut high_ptr = row_high.device_ptr();
let mut params = cuda_kernel_params!(
spatial_ptr,
width,
height,
low_width,
high_width,
low_ptr,
high_ptr
);
let rows =
usize::try_from(dims.height).map_err(|_| CudaError::LengthTooLarge { len: 0 })?;
let geometry =
copy_u8_launch_geometry(rows).ok_or(CudaError::LengthTooLarge { len: rows })?;
self.launch_kernel(function, geometry, &mut params)
}
/// Launches `CudaKernel::TranscodeDwt97ColumnLift` with the parameters
/// `(rows_buffer, band_width, height, low_out, high_out)`.
///
/// Returns immediately when `band_width` is zero. Launch geometry comes from
/// `copy_u8_launch_geometry(band_width)`.
///
/// # Errors
///
/// `CudaError::LengthTooLarge` if `band_width` is negative or no launch
/// geometry exists for it; kernel lookup and launch errors propagate.
fn launch_transcode_dwt97_column_lift(
    &self,
    rows_buffer: &CudaDeviceBuffer,
    band_width: i32,
    height: i32,
    low_out: &CudaDeviceBuffer,
    high_out: &CudaDeviceBuffer,
) -> Result<(), CudaError> {
    let column_total = match usize::try_from(band_width) {
        Ok(0) => return Ok(()),
        Ok(count) => count,
        Err(_) => return Err(CudaError::LengthTooLarge { len: 0 }),
    };
    let kernel_fn = self
        .inner
        .kernel_function(CudaKernel::TranscodeDwt97ColumnLift)?;
    let mut arg_rows_ptr = rows_buffer.device_ptr();
    let mut arg_band = band_width;
    let mut arg_height = height;
    let mut arg_low_out = low_out.device_ptr();
    let mut arg_high_out = high_out.device_ptr();
    let mut kernel_args = cuda_kernel_params!(
        arg_rows_ptr,
        arg_band,
        arg_height,
        arg_low_out,
        arg_high_out
    );
    let Some(launch_shape) = copy_u8_launch_geometry(column_total) else {
        return Err(CudaError::LengthTooLarge { len: column_total });
    };
    self.launch_kernel(kernel_fn, launch_shape, &mut kernel_args)
}
}
impl CudaContext {
/// Convenience wrapper around `j2k_transcode_dwt97_batch_with_pool` that
/// uses the pool returned by `self.buffer_pool()`.
///
/// # Errors
///
/// Propagates every error of `j2k_transcode_dwt97_batch_with_pool`.
#[allow(clippy::similar_names)]
pub fn j2k_transcode_dwt97_batch(
    &self,
    blocks: &[f32],
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
) -> Result<(Vec<CudaTranscodeDwt97Bands>, CudaDwt97BatchStageTimings), CudaError> {
    let context_pool = self.buffer_pool();
    self.j2k_transcode_dwt97_batch_with_pool(
        blocks,
        item_count,
        block_cols,
        block_rows,
        width,
        height,
        &context_pool,
    )
}
/// Transcodes a batch of `item_count` items on the device, then downloads
/// the four `f32` bands and splits them into one
/// `CudaTranscodeDwt97Bands` per item.
///
/// The returned timings carry the three device-stage values from
/// `transcode_dwt97_batch_to_device` plus the measured readback; the
/// quantize and HT fields are set to zero.
///
/// # Errors
///
/// Propagates errors from `transcode_dwt97_batch_to_device`, from the timed
/// readback and from the band downloads.
#[allow(clippy::too_many_arguments, clippy::similar_names)]
pub fn j2k_transcode_dwt97_batch_with_pool(
    &self,
    blocks: &[f32],
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    pool: &CudaBufferPool,
) -> Result<(Vec<CudaTranscodeDwt97Bands>, CudaDwt97BatchStageTimings), CudaError> {
    let (device_bands, pack_upload_us, idct_row_lift_us, column_lift_us) = self
        .transcode_dwt97_batch_to_device(
            blocks, item_count, block_cols, block_rows, width, height, pool,
        )?;
    let Dwt97BatchDeviceBands {
        ll: ll_dev,
        lh: lh_dev,
        hl: hl_dev,
        hh: hh_dev,
        low_width,
        low_height,
        high_width,
        high_height,
    } = device_bands;
    // Per-item element counts of each band.
    let ll_len = low_width * low_height;
    let lh_len = low_width * high_height;
    let hl_len = high_width * low_height;
    let hh_len = high_width * high_height;
    let (outputs, readback_us) = self.time_default_stream_us(|| {
        let ll_host = Self::download_pooled_f32_band(&ll_dev, item_count * ll_len)?;
        let lh_host = Self::download_pooled_f32_band(&lh_dev, item_count * lh_len)?;
        let hl_host = Self::download_pooled_f32_band(&hl_dev, item_count * hl_len)?;
        let hh_host = Self::download_pooled_f32_band(&hh_dev, item_count * hh_len)?;
        // Items are stored back to back inside each downloaded band.
        let per_item: Vec<CudaTranscodeDwt97Bands> = (0..item_count)
            .map(|item| CudaTranscodeDwt97Bands {
                ll: ll_host[item * ll_len..(item + 1) * ll_len].to_vec(),
                hl: hl_host[item * hl_len..(item + 1) * hl_len].to_vec(),
                lh: lh_host[item * lh_len..(item + 1) * lh_len].to_vec(),
                hh: hh_host[item * hh_len..(item + 1) * hh_len].to_vec(),
                low_width,
                low_height,
                high_width,
                high_height,
            })
            .collect();
        Ok(per_item)
    })?;
    let timings = CudaDwt97BatchStageTimings {
        pack_upload_us,
        idct_row_lift_us,
        column_lift_us,
        quantize_codeblock_us: 0,
        ht_encode_us: 0,
        ht_codeblock_dispatches: 0,
        readback_us,
    };
    Ok((outputs, timings))
}
/// Convenience wrapper around
/// `j2k_transcode_htj2k97_codeblock_batch_resident_with_pool` that uses the
/// pool returned by `self.buffer_pool()`.
///
/// # Errors
///
/// Propagates every error of the `_with_pool` variant.
#[allow(clippy::too_many_arguments)]
pub fn j2k_transcode_htj2k97_codeblock_batch_resident(
    &self,
    blocks: &[f32],
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    params: CudaHtj2k97QuantizeParams,
) -> Result<(CudaHtj2k97DeviceCodeblockBands, CudaDwt97BatchStageTimings), CudaError> {
    let context_pool = self.buffer_pool();
    self.j2k_transcode_htj2k97_codeblock_batch_resident_with_pool(
        blocks,
        item_count,
        block_cols,
        block_rows,
        width,
        height,
        params,
        &context_pool,
    )
}
/// Transcodes a batch of `f32` blocks on the device, quantizes the four
/// bands into pooled `i32` buffers and returns those buffers without
/// downloading them.
///
/// The returned timings carry the three device-stage values plus the
/// measured quantize stage; HT and readback fields are zero.
///
/// # Errors
///
/// * Propagates errors from `transcode_dwt97_batch_to_device`.
/// * `CudaError::LengthTooLarge` if `item_count` does not fit in `u32` or a
///   buffer byte size overflows.
/// * Pool, buffer-access and launch errors propagate unchanged.
#[allow(clippy::similar_names, clippy::too_many_arguments)]
pub fn j2k_transcode_htj2k97_codeblock_batch_resident_with_pool(
    &self,
    blocks: &[f32],
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    params: CudaHtj2k97QuantizeParams,
    pool: &CudaBufferPool,
) -> Result<(CudaHtj2k97DeviceCodeblockBands, CudaDwt97BatchStageTimings), CudaError> {
    let (bands, pack_upload_us, idct_row_lift_us, column_lift_us) = self
        .transcode_dwt97_batch_to_device(
            blocks, item_count, block_cols, block_rows, width, height, pool,
        )?;
    let (low_width, low_height) = (bands.low_width, bands.low_height);
    let (high_width, high_height) = (bands.high_width, bands.high_height);
    let Ok(items) = u32::try_from(item_count) else {
        return Err(CudaError::LengthTooLarge { len: item_count });
    };
    let take_i32 = |elements: usize| -> Result<CudaPooledDeviceBuffer, CudaError> {
        match elements.checked_mul(std::mem::size_of::<i32>()) {
            Some(bytes) => pool.take(bytes),
            None => Err(CudaError::LengthTooLarge { len: elements }),
        }
    };
    // Per-item element counts of each band.
    let ll_len = low_width * low_height;
    let lh_len = low_width * high_height;
    let hl_len = high_width * low_height;
    let hh_len = high_width * high_height;
    let ll_q = take_i32(item_count * ll_len)?;
    let lh_q = take_i32(item_count * lh_len)?;
    let hl_q = take_i32(item_count * hl_len)?;
    let hh_q = take_i32(item_count * hh_len)?;
    let ((), quantize_codeblock_us) = self.time_default_stream_us(|| {
        let targets = Dwt97CodeblockBandBuffers {
            ll: pooled_device_buffer(&ll_q)?,
            hl: pooled_device_buffer(&hl_q)?,
            lh: pooled_device_buffer(&lh_q)?,
            hh: pooled_device_buffer(&hh_q)?,
        };
        self.launch_transcode_dwt97_quantize_codeblock_bands(&bands, targets, params, items)
    })?;
    let resident = CudaHtj2k97DeviceCodeblockBands {
        ll: ll_q,
        hl: hl_q,
        lh: lh_q,
        hh: hh_q,
        item_count,
        low_width,
        low_height,
        high_width,
        high_height,
    };
    let timings = CudaDwt97BatchStageTimings {
        pack_upload_us,
        idct_row_lift_us,
        column_lift_us,
        quantize_codeblock_us,
        ht_encode_us: 0,
        ht_codeblock_dispatches: 0,
        readback_us: 0,
    };
    Ok((resident, timings))
}
/// `i16`-input counterpart of the resident quantize path.
///
/// Unless `dwt97_fused_column_quantize_disabled()` returns `true`, the call
/// is forwarded to the fused implementation. Otherwise the batch is
/// transcoded on the device and the four bands are quantized into pooled
/// `i32` buffers by a separate launch; nothing is downloaded.
///
/// # Errors
///
/// * Propagates errors from the fused path or from
///   `transcode_dwt97_i16_batch_to_device`.
/// * `CudaError::LengthTooLarge` if `item_count` does not fit in `u32` or a
///   buffer byte size overflows.
/// * Pool, buffer-access and launch errors propagate unchanged.
#[allow(clippy::similar_names, clippy::too_many_arguments)]
pub fn j2k_transcode_htj2k97_codeblock_i16_batch_resident_with_pool(
    &self,
    blocks: &[i16],
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    params: CudaHtj2k97QuantizeParams,
    pool: &CudaBufferPool,
) -> Result<(CudaHtj2k97DeviceCodeblockBands, CudaDwt97BatchStageTimings), CudaError> {
    let fused_enabled = !dwt97_fused_column_quantize_disabled();
    if fused_enabled {
        return self.j2k_transcode_htj2k97_codeblock_i16_batch_resident_fused_with_pool(
            blocks, item_count, block_cols, block_rows, width, height, params, pool,
        );
    }
    let (bands, pack_upload_us, idct_row_lift_us, column_lift_us) = self
        .transcode_dwt97_i16_batch_to_device(
            blocks, item_count, block_cols, block_rows, width, height, pool,
        )?;
    let (low_width, low_height) = (bands.low_width, bands.low_height);
    let (high_width, high_height) = (bands.high_width, bands.high_height);
    let Ok(items) = u32::try_from(item_count) else {
        return Err(CudaError::LengthTooLarge { len: item_count });
    };
    let take_i32 = |elements: usize| -> Result<CudaPooledDeviceBuffer, CudaError> {
        match elements.checked_mul(std::mem::size_of::<i32>()) {
            Some(bytes) => pool.take(bytes),
            None => Err(CudaError::LengthTooLarge { len: elements }),
        }
    };
    // Per-item element counts of each band.
    let ll_len = low_width * low_height;
    let lh_len = low_width * high_height;
    let hl_len = high_width * low_height;
    let hh_len = high_width * high_height;
    let ll_q = take_i32(item_count * ll_len)?;
    let lh_q = take_i32(item_count * lh_len)?;
    let hl_q = take_i32(item_count * hl_len)?;
    let hh_q = take_i32(item_count * hh_len)?;
    let ((), quantize_codeblock_us) = self.time_default_stream_us(|| {
        let targets = Dwt97CodeblockBandBuffers {
            ll: pooled_device_buffer(&ll_q)?,
            hl: pooled_device_buffer(&hl_q)?,
            lh: pooled_device_buffer(&lh_q)?,
            hh: pooled_device_buffer(&hh_q)?,
        };
        self.launch_transcode_dwt97_quantize_codeblock_bands(&bands, targets, params, items)
    })?;
    let resident = CudaHtj2k97DeviceCodeblockBands {
        ll: ll_q,
        hl: hl_q,
        lh: lh_q,
        hh: hh_q,
        item_count,
        low_width,
        low_height,
        high_width,
        high_height,
    };
    let timings = CudaDwt97BatchStageTimings {
        pack_upload_us,
        idct_row_lift_us,
        column_lift_us,
        quantize_codeblock_us,
        ht_encode_us: 0,
        ht_codeblock_dispatches: 0,
        readback_us: 0,
    };
    Ok((resident, timings))
}
#[allow(
clippy::similar_names,
clippy::too_many_arguments,
clippy::too_many_lines
)]
fn j2k_transcode_htj2k97_codeblock_i16_batch_resident_fused_with_pool(
&self,
blocks: &[i16],
item_count: usize,
block_cols: usize,
block_rows: usize,
width: usize,
height: usize,
params: CudaHtj2k97QuantizeParams,
pool: &CudaBufferPool,
) -> Result<(CudaHtj2k97DeviceCodeblockBands, CudaDwt97BatchStageTimings), CudaError> {
if !TRANSCODE_PTX_BUILT_FROM_CUDA {
return Err(CudaError::InvalidArgument {
message: "CUDA transcode kernels were not built (nvcc unavailable at build time)"
.to_string(),
});
}
let grid = validate_dct_block_grid(
block_cols,
block_rows,
width,
height,
item_count,
blocks.len(),
"9/7 transcode batch has unsupported grid geometry",
)?;
let DctBlockGrid {
block_count,
low_width,
low_height,
high_width,
high_height,
dims,
..
} = grid;
let items =
u32::try_from(item_count).map_err(|_| CudaError::LengthTooLarge { len: item_count })?;
let blocks_per_item = checked_i32(block_count)?;
let low_height_i32 = checked_i32(low_height)?;
let high_height_i32 = checked_i32(high_height)?;
let cb_w = checked_i32(params.cb_width)?;
let cb_h = checked_i32(params.cb_height)?;
self.inner.set_current()?;
let alloc_f32 = |count: usize| -> Result<CudaPooledDeviceBuffer, CudaError> {
let bytes = count
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge { len: count })?;
pool.take(bytes)
};
let alloc_i32 = |count: usize| -> Result<CudaPooledDeviceBuffer, CudaError> {
let bytes = count
.checked_mul(std::mem::size_of::<i32>())
.ok_or(CudaError::LengthTooLarge { len: count })?;
pool.take(bytes)
};
let (buffers, pack_upload_us) = self.time_default_stream_us(|| {
let spatial = alloc_f32(item_count * width * height)?;
let row_low = alloc_f32(item_count * height * low_width)?;
let row_high = alloc_f32(item_count * height * high_width)?;
let blocks_dev = Dwt97BatchInput::I16(blocks).upload(pool)?;
Ok((spatial, row_low, row_high, blocks_dev))
})?;
let (spatial, row_low, row_high, blocks_dev) = buffers;
let ll_size = low_width * low_height;
let lh_size = low_width * high_height;
let hl_size = high_width * low_height;
let hh_size = high_width * high_height;
let ll_q = alloc_i32(item_count * ll_size)?;
let lh_q = alloc_i32(item_count * lh_size)?;
let hl_q = alloc_i32(item_count * hl_size)?;
let hh_q = alloc_i32(item_count * hh_size)?;
let ((), idct_row_lift_us) = self.time_default_stream_us(|| {
self.launch_transcode_dwt97_idct_batch_kernel(
CudaKernel::TranscodeDwt97IdctI16Batch,
dims,
blocks_per_item,
items,
pooled_device_buffer(&blocks_dev)?,
pooled_device_buffer(&spatial)?,
)?;
self.launch_transcode_dwt97_row_lift_batch(
dims,
items,
pooled_device_buffer(&spatial)?,
pooled_device_buffer(&row_low)?,
pooled_device_buffer(&row_high)?,
)?;
Ok(())
})?;
let ((), column_quantize_us) = self.time_default_stream_us(|| {
if dims.low_width > 0 {
self.launch_transcode_dwt97_column_lift_quantize_codeblocks_batch(
pooled_device_buffer(&row_low)?,
dims.low_width,
dims.height,
low_height_i32,
high_height_i32,
items,
pooled_device_buffer(&ll_q)?,
pooled_device_buffer(&lh_q)?,
cb_w,
cb_h,
params.inv_delta_ll,
params.inv_delta_lh,
)?;
}
if dims.high_width > 0 {
self.launch_transcode_dwt97_column_lift_quantize_codeblocks_batch(
pooled_device_buffer(&row_high)?,
dims.high_width,
dims.height,
low_height_i32,
high_height_i32,
items,
pooled_device_buffer(&hl_q)?,
pooled_device_buffer(&hh_q)?,
cb_w,
cb_h,
params.inv_delta_hl,
params.inv_delta_hh,
)?;
}
Ok(())
})?;
Ok((
CudaHtj2k97DeviceCodeblockBands {
ll: ll_q,
hl: hl_q,
lh: lh_q,
hh: hh_q,
item_count,
low_width,
low_height,
high_width,
high_height,
},
CudaDwt97BatchStageTimings {
pack_upload_us,
idct_row_lift_us,
column_lift_us: 0,
quantize_codeblock_us: column_quantize_us,
ht_encode_us: 0,
ht_codeblock_dispatches: 0,
readback_us: 0,
},
))
}
/// Convenience wrapper around
/// `j2k_transcode_htj2k97_codeblock_batch_with_pool` that uses the pool
/// returned by `self.buffer_pool()`.
///
/// # Errors
///
/// Propagates every error of the `_with_pool` variant.
#[allow(clippy::too_many_arguments)]
pub fn j2k_transcode_htj2k97_codeblock_batch(
    &self,
    blocks: &[f32],
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    params: CudaHtj2k97QuantizeParams,
) -> Result<(CudaHtj2k97CodeblockBands, CudaDwt97BatchStageTimings), CudaError> {
    let context_pool = self.buffer_pool();
    self.j2k_transcode_htj2k97_codeblock_batch_with_pool(
        blocks,
        item_count,
        block_cols,
        block_rows,
        width,
        height,
        params,
        &context_pool,
    )
}
/// Transcodes a batch of `f32` blocks on the device, quantizes the four
/// bands into `i32` device buffers and downloads them to the host.
///
/// The quantized buffers are allocated with `self.allocate` rather than
/// taken from `pool`; `pool` is only used for the transcode stage.
///
/// # Errors
///
/// * Propagates errors from `transcode_dwt97_batch_to_device`.
/// * `CudaError::LengthTooLarge` if `item_count` does not fit in `u32` or a
///   buffer byte size overflows.
/// * Allocation, launch and download errors propagate unchanged.
#[allow(clippy::similar_names, clippy::too_many_arguments)]
pub fn j2k_transcode_htj2k97_codeblock_batch_with_pool(
    &self,
    blocks: &[f32],
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    params: CudaHtj2k97QuantizeParams,
    pool: &CudaBufferPool,
) -> Result<(CudaHtj2k97CodeblockBands, CudaDwt97BatchStageTimings), CudaError> {
    let (bands, pack_upload_us, idct_row_lift_us, column_lift_us) = self
        .transcode_dwt97_batch_to_device(
            blocks, item_count, block_cols, block_rows, width, height, pool,
        )?;
    let (low_width, low_height) = (bands.low_width, bands.low_height);
    let (high_width, high_height) = (bands.high_width, bands.high_height);
    let Ok(items) = u32::try_from(item_count) else {
        return Err(CudaError::LengthTooLarge { len: item_count });
    };
    let device_i32 = |elements: usize| -> Result<CudaDeviceBuffer, CudaError> {
        match elements.checked_mul(std::mem::size_of::<i32>()) {
            Some(bytes) => self.allocate(bytes),
            None => Err(CudaError::LengthTooLarge { len: elements }),
        }
    };
    // Whole-batch element counts of each band.
    let ll_total = item_count * (low_width * low_height);
    let lh_total = item_count * (low_width * high_height);
    let hl_total = item_count * (high_width * low_height);
    let hh_total = item_count * (high_width * high_height);
    let ll_q = device_i32(ll_total)?;
    let lh_q = device_i32(lh_total)?;
    let hl_q = device_i32(hl_total)?;
    let hh_q = device_i32(hh_total)?;
    let ((), quantize_codeblock_us) = self.time_default_stream_us(|| {
        let targets = Dwt97CodeblockBandBuffers {
            ll: &ll_q,
            hl: &hl_q,
            lh: &lh_q,
            hh: &hh_q,
        };
        self.launch_transcode_dwt97_quantize_codeblock_bands(&bands, targets, params, items)
    })?;
    let (codeblocks, readback_us) = self.time_default_stream_us(|| {
        let ll = Self::download_i32_band(&ll_q, ll_total)?;
        let hl = Self::download_i32_band(&hl_q, hl_total)?;
        let lh = Self::download_i32_band(&lh_q, lh_total)?;
        let hh = Self::download_i32_band(&hh_q, hh_total)?;
        Ok(CudaHtj2k97CodeblockBands {
            ll,
            hl,
            lh,
            hh,
            item_count,
            low_width,
            low_height,
            high_width,
            high_height,
        })
    })?;
    let timings = CudaDwt97BatchStageTimings {
        pack_upload_us,
        idct_row_lift_us,
        column_lift_us,
        quantize_codeblock_us,
        ht_encode_us: 0,
        ht_codeblock_dispatches: 0,
        readback_us,
    };
    Ok((codeblocks, timings))
}
/// Runs the batched 9/7 transcode for `f32` input by forwarding to
/// `transcode_dwt97_batch_input_to_device` with `Dwt97BatchInput::F32`.
///
/// Returns the device-resident bands followed by the three stage timings
/// `(pack_upload_us, idct_row_lift_us, column_lift_us)`.
///
/// # Errors
///
/// Propagates every error of `transcode_dwt97_batch_input_to_device`.
// Only `too_many_arguments` is needed here: this is a short forwarding
// wrapper, so the former `too_many_lines` allowance was stale.
#[allow(clippy::too_many_arguments)]
fn transcode_dwt97_batch_to_device(
    &self,
    blocks: &[f32],
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    pool: &CudaBufferPool,
) -> Result<(Dwt97BatchDeviceBands, u128, u128, u128), CudaError> {
    self.transcode_dwt97_batch_input_to_device(
        Dwt97BatchInput::F32(blocks),
        item_count,
        block_cols,
        block_rows,
        width,
        height,
        pool,
    )
}
/// Runs the batched 9/7 transcode for `i16` input by forwarding to
/// `transcode_dwt97_batch_input_to_device` with `Dwt97BatchInput::I16`.
///
/// Returns the device-resident bands followed by the three stage timings
/// `(pack_upload_us, idct_row_lift_us, column_lift_us)`.
///
/// # Errors
///
/// Propagates every error of `transcode_dwt97_batch_input_to_device`.
#[allow(clippy::too_many_arguments)]
fn transcode_dwt97_i16_batch_to_device(
    &self,
    blocks: &[i16],
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    pool: &CudaBufferPool,
) -> Result<(Dwt97BatchDeviceBands, u128, u128, u128), CudaError> {
    let wrapped = Dwt97BatchInput::I16(blocks);
    self.transcode_dwt97_batch_input_to_device(
        wrapped, item_count, block_cols, block_rows, width, height, pool,
    )
}
/// Shared implementation of the batched 9/7 transcode for either input
/// type; leaves the four `f32` bands in pooled device buffers.
///
/// Three stages are timed separately with `time_default_stream_us`:
/// 1. taking the pooled buffers and uploading the input;
/// 2. the IDCT batch kernel (chosen by input variant) plus the row lift;
/// 3. the column lift, launched once per non-empty row band.
///
/// Returns the bands followed by the three stage timings in that order.
///
/// # Errors
///
/// * `CudaError::InvalidArgument` if the transcode kernels were not built, or
///   if `validate_dct_block_grid` rejects the geometry.
/// * `CudaError::LengthTooLarge` if `item_count` does not fit in `u32`, a
///   dimension does not fit in `i32`, or a byte size overflows.
/// * Pool, upload, buffer-access and launch errors propagate unchanged.
#[allow(clippy::too_many_arguments, clippy::too_many_lines)]
fn transcode_dwt97_batch_input_to_device(
    &self,
    input: Dwt97BatchInput<'_>,
    item_count: usize,
    block_cols: usize,
    block_rows: usize,
    width: usize,
    height: usize,
    pool: &CudaBufferPool,
) -> Result<(Dwt97BatchDeviceBands, u128, u128, u128), CudaError> {
    if !TRANSCODE_PTX_BUILT_FROM_CUDA {
        return Err(CudaError::InvalidArgument {
            message: "CUDA transcode kernels were not built (nvcc unavailable at build time)"
                .to_string(),
        });
    }
    let DctBlockGrid {
        block_count,
        low_width,
        low_height,
        high_width,
        high_height,
        dims,
        ..
    } = validate_dct_block_grid(
        block_cols,
        block_rows,
        width,
        height,
        item_count,
        input.len(),
        "9/7 transcode batch has unsupported grid geometry",
    )?;
    let Ok(items) = u32::try_from(item_count) else {
        return Err(CudaError::LengthTooLarge { len: item_count });
    };
    let blocks_per_item = checked_i32(block_count)?;
    let low_rows = checked_i32(low_height)?;
    let high_rows = checked_i32(high_height)?;
    self.inner.set_current()?;

    let take_f32 = |elements: usize| -> Result<CudaPooledDeviceBuffer, CudaError> {
        match elements.checked_mul(std::mem::size_of::<f32>()) {
            Some(bytes) => pool.take(bytes),
            None => Err(CudaError::LengthTooLarge { len: elements }),
        }
    };

    let ((spatial, row_low, row_high, ll, lh, hl, hh, blocks_dev), pack_upload_us) = self
        .time_default_stream_us(|| {
            let spatial = take_f32(item_count * width * height)?;
            let row_low = take_f32(item_count * height * low_width)?;
            let row_high = take_f32(item_count * height * high_width)?;
            let ll = take_f32(item_count * low_width * low_height)?;
            let lh = take_f32(item_count * low_width * high_height)?;
            let hl = take_f32(item_count * high_width * low_height)?;
            let hh = take_f32(item_count * high_width * high_height)?;
            let uploaded = input.upload(pool)?;
            Ok((spatial, row_low, row_high, ll, lh, hl, hh, uploaded))
        })?;

    let ((), idct_row_lift_us) = self.time_default_stream_us(|| {
        // The input element type selects the matching IDCT batch kernel.
        let idct_kernel = match input {
            Dwt97BatchInput::I16(_) => CudaKernel::TranscodeDwt97IdctI16Batch,
            Dwt97BatchInput::F32(_) => CudaKernel::TranscodeDwt97IdctBatch,
        };
        self.launch_transcode_dwt97_idct_batch_kernel(
            idct_kernel,
            dims,
            blocks_per_item,
            items,
            pooled_device_buffer(&blocks_dev)?,
            pooled_device_buffer(&spatial)?,
        )?;
        self.launch_transcode_dwt97_row_lift_batch(
            dims,
            items,
            pooled_device_buffer(&spatial)?,
            pooled_device_buffer(&row_low)?,
            pooled_device_buffer(&row_high)?,
        )
    })?;

    let ((), column_lift_us) = self.time_default_stream_us(|| {
        // The low row band produces `ll`/`lh`, the high row band `hl`/`hh`.
        if dims.low_width > 0 {
            self.launch_transcode_dwt97_column_lift_batch(
                pooled_device_buffer(&row_low)?,
                dims.low_width,
                dims.height,
                low_rows,
                high_rows,
                items,
                pooled_device_buffer(&ll)?,
                pooled_device_buffer(&lh)?,
            )?;
        }
        if dims.high_width > 0 {
            self.launch_transcode_dwt97_column_lift_batch(
                pooled_device_buffer(&row_high)?,
                dims.high_width,
                dims.height,
                low_rows,
                high_rows,
                items,
                pooled_device_buffer(&hl)?,
                pooled_device_buffer(&hh)?,
            )?;
        }
        Ok(())
    })?;

    let device_bands = Dwt97BatchDeviceBands {
        ll,
        lh,
        hl,
        hh,
        low_width,
        low_height,
        high_width,
        high_height,
    };
    Ok((
        device_bands,
        pack_upload_us,
        idct_row_lift_us,
        column_lift_us,
    ))
}
fn launch_transcode_dwt97_idct_batch_kernel(
&self,
kernel: CudaKernel,
dims: Reversible53Dims,
blocks_per_item: i32,
items: u32,
blocks: &CudaDeviceBuffer,
spatial: &CudaDeviceBuffer,
) -> Result<(), CudaError> {
let function = self.inner.kernel_function(kernel)?;
let mut blocks_ptr = blocks.device_ptr();
let mut block_cols = dims.block_cols;
let mut width = dims.width;
let mut height = dims.height;
let mut blocks_per_item = blocks_per_item;
let mut spatial_ptr = spatial.device_ptr();
let mut params = cuda_kernel_params!(
blocks_ptr,
block_cols,
width,
height,
blocks_per_item,
spatial_ptr
);
let grid_w = u32::try_from(dims.width).map_err(|_| CudaError::LengthTooLarge { len: 0 })?;
let grid_h =
u32::try_from(dims.height).map_err(|_| CudaError::LengthTooLarge { len: 0 })?;
let base = j2k_dwt53_launch_geometry(grid_w, grid_h)
.ok_or(CudaError::LengthTooLarge { len: 0 })?;
let geometry = with_grid_z(base, items);
self.launch_kernel_async(function, geometry, &mut params)
}
fn launch_transcode_dwt97_row_lift_batch(
&self,
dims: Reversible53Dims,
items: u32,
spatial: &CudaDeviceBuffer,
row_low: &CudaDeviceBuffer,
row_high: &CudaDeviceBuffer,
) -> Result<(), CudaError> {
if dims.width <= DWT97_ROW_LIFT_MAX_WIDTH {
return self.launch_transcode_dwt97_row_lift_batch_coop(
dims, items, spatial, row_low, row_high,
);
}
let function = self
.inner
.kernel_function(CudaKernel::TranscodeDwt97RowLiftBatch)?;
let mut spatial_ptr = spatial.device_ptr();
let mut width = dims.width;
let mut height = dims.height;
let mut low_width = dims.low_width;
let mut high_width = dims.high_width;
let mut low_ptr = row_low.device_ptr();
let mut high_ptr = row_high.device_ptr();
let mut params = cuda_kernel_params!(
spatial_ptr,
width,
height,
low_width,
high_width,
low_ptr,
high_ptr
);
let rows =
usize::try_from(dims.height).map_err(|_| CudaError::LengthTooLarge { len: 0 })?;
let base = copy_u8_launch_geometry(rows).ok_or(CudaError::LengthTooLarge { len: rows })?;
let geometry = with_grid_y(base, items);
self.launch_kernel_async(function, geometry, &mut params)
}
/// Launches the `TranscodeDwt97RowLiftBatchCoop` kernel with an explicitly
/// built launch geometry.
///
/// Grid: `(ceil(height / DWT97_ROW_LIFT_COOP_ROWS_PER_BLOCK), items, 1)`.
/// Block: `(DWT97_ROW_LIFT_COOP_THREADS_X, DWT97_ROW_LIFT_COOP_ROWS_PER_BLOCK, 1)`.
///
/// Kernel arguments, in order: `spatial` pointer, `width`, `height`,
/// `low_width`, `high_width`, `row_low` pointer, `row_high` pointer.
///
/// # Errors
///
/// Propagates kernel lookup and launch failures. Returns
/// `CudaError::LengthTooLarge` when `dims.height` cannot be converted to
/// `usize` (`len: 0`) or when the block count does not fit in `c_uint`
/// (`len` = row count).
fn launch_transcode_dwt97_row_lift_batch_coop(
    &self,
    dims: Reversible53Dims,
    items: u32,
    spatial: &CudaDeviceBuffer,
    row_low: &CudaDeviceBuffer,
    row_high: &CudaDeviceBuffer,
) -> Result<(), CudaError> {
    let entry_point = self
        .inner
        .kernel_function(CudaKernel::TranscodeDwt97RowLiftBatchCoop)?;

    // Each kernel argument needs its own mutable local for the params macro.
    let mut arg_src = spatial.device_ptr();
    let mut arg_width = dims.width;
    let mut arg_height = dims.height;
    let mut arg_low_width = dims.low_width;
    let mut arg_high_width = dims.high_width;
    let mut arg_low_dst = row_low.device_ptr();
    let mut arg_high_dst = row_high.device_ptr();
    let mut kernel_args = cuda_kernel_params!(
        arg_src,
        arg_width,
        arg_height,
        arg_low_width,
        arg_high_width,
        arg_low_dst,
        arg_high_dst
    );

    let Ok(row_count) = usize::try_from(dims.height) else {
        return Err(CudaError::LengthTooLarge { len: 0 });
    };
    // Enough blocks along x to cover every row, rounding the last one up.
    let block_count = row_count.div_ceil(DWT97_ROW_LIFT_COOP_ROWS_PER_BLOCK as usize);
    let Ok(blocks_x) = c_uint::try_from(block_count) else {
        return Err(CudaError::LengthTooLarge { len: row_count });
    };

    let block_shape = (
        DWT97_ROW_LIFT_COOP_THREADS_X,
        DWT97_ROW_LIFT_COOP_ROWS_PER_BLOCK,
        1,
    );
    let launch_geometry = kernels::CudaLaunchGeometry {
        grid: (blocks_x, items, 1),
        block: block_shape,
    };

    self.launch_kernel_async(entry_point, launch_geometry, &mut kernel_args)
}
/// Launches the `TranscodeDwt97ColumnLiftBatch` kernel, which reads
/// `rows_buffer` and writes `low_out` / `high_out`.
///
/// A `band_width` of zero is a no-op that returns `Ok(())` before any kernel
/// lookup. Otherwise the geometry is derived from the column count via
/// `copy_u8_launch_geometry` and combined with `items` through `with_grid_y`.
///
/// Kernel arguments, in order: `rows_buffer` pointer, `band_width`, `height`,
/// `low_height`, `high_height`, `low_out` pointer, `high_out` pointer.
///
/// # Errors
///
/// Propagates kernel lookup and launch failures. Returns
/// `CudaError::LengthTooLarge` when `band_width` cannot be converted to
/// `usize` (`len: 0`) or when no geometry exists for that column count
/// (`len` = column count).
// Most parameters are forwarded one-to-one as kernel arguments.
#[allow(clippy::too_many_arguments)]
fn launch_transcode_dwt97_column_lift_batch(
    &self,
    rows_buffer: &CudaDeviceBuffer,
    band_width: i32,
    height: i32,
    low_height: i32,
    high_height: i32,
    items: u32,
    low_out: &CudaDeviceBuffer,
    high_out: &CudaDeviceBuffer,
) -> Result<(), CudaError> {
    let Ok(column_count) = usize::try_from(band_width) else {
        return Err(CudaError::LengthTooLarge { len: 0 });
    };
    // An empty band has nothing to lift.
    if column_count == 0 {
        return Ok(());
    }

    let entry_point = self
        .inner
        .kernel_function(CudaKernel::TranscodeDwt97ColumnLiftBatch)?;

    // Each kernel argument needs its own mutable local for the params macro.
    let mut arg_src = rows_buffer.device_ptr();
    let mut arg_band_width = band_width;
    let mut arg_height = height;
    let mut arg_low_height = low_height;
    let mut arg_high_height = high_height;
    let mut arg_low_dst = low_out.device_ptr();
    let mut arg_high_dst = high_out.device_ptr();
    let mut kernel_args = cuda_kernel_params!(
        arg_src,
        arg_band_width,
        arg_height,
        arg_low_height,
        arg_high_height,
        arg_low_dst,
        arg_high_dst
    );

    let Some(per_item_geometry) = copy_u8_launch_geometry(column_count) else {
        return Err(CudaError::LengthTooLarge { len: column_count });
    };
    let launch_geometry = with_grid_y(per_item_geometry, items);

    self.launch_kernel_async(entry_point, launch_geometry, &mut kernel_args)
}
/// Launches the `TranscodeDwt97ColumnLiftQuantizeCodeblocksBatch` kernel.
///
/// A `band_width` of zero is a no-op that returns `Ok(())` before any kernel
/// lookup. Otherwise the geometry is derived from the column count via
/// `copy_u8_launch_geometry` and combined with `items` through `with_grid_y`.
///
/// Kernel arguments, in order: `rows_buffer` pointer, `band_width`, `height`,
/// `low_height`, `high_height`, `low_out` pointer, `high_out` pointer,
/// `cb_width`, `cb_height`, `inv_delta_low`, `inv_delta_high`.
///
/// # Errors
///
/// Propagates kernel lookup and launch failures. Returns
/// `CudaError::LengthTooLarge` when `band_width` cannot be converted to
/// `usize` (`len: 0`) or when no geometry exists for that column count
/// (`len` = column count).
// Most parameters are forwarded one-to-one as kernel arguments.
#[allow(clippy::too_many_arguments)]
fn launch_transcode_dwt97_column_lift_quantize_codeblocks_batch(
    &self,
    rows_buffer: &CudaDeviceBuffer,
    band_width: i32,
    height: i32,
    low_height: i32,
    high_height: i32,
    items: u32,
    low_out: &CudaDeviceBuffer,
    high_out: &CudaDeviceBuffer,
    cb_width: i32,
    cb_height: i32,
    inv_delta_low: f32,
    inv_delta_high: f32,
) -> Result<(), CudaError> {
    let Ok(column_count) = usize::try_from(band_width) else {
        return Err(CudaError::LengthTooLarge { len: 0 });
    };
    // An empty band has nothing to lift or quantize.
    if column_count == 0 {
        return Ok(());
    }

    let entry_point = self
        .inner
        .kernel_function(CudaKernel::TranscodeDwt97ColumnLiftQuantizeCodeblocksBatch)?;

    // Each kernel argument needs its own mutable local for the params macro.
    let mut arg_src = rows_buffer.device_ptr();
    let mut arg_band_width = band_width;
    let mut arg_height = height;
    let mut arg_low_height = low_height;
    let mut arg_high_height = high_height;
    let mut arg_low_dst = low_out.device_ptr();
    let mut arg_high_dst = high_out.device_ptr();
    let mut arg_cb_width = cb_width;
    let mut arg_cb_height = cb_height;
    let mut arg_inv_delta_low = inv_delta_low;
    let mut arg_inv_delta_high = inv_delta_high;
    let mut kernel_args = cuda_kernel_params!(
        arg_src,
        arg_band_width,
        arg_height,
        arg_low_height,
        arg_high_height,
        arg_low_dst,
        arg_high_dst,
        arg_cb_width,
        arg_cb_height,
        arg_inv_delta_low,
        arg_inv_delta_high
    );

    let Some(per_item_geometry) = copy_u8_launch_geometry(column_count) else {
        return Err(CudaError::LengthTooLarge { len: column_count });
    };
    let launch_geometry = with_grid_y(per_item_geometry, items);

    self.launch_kernel_async(entry_point, launch_geometry, &mut kernel_args)
}
/// Issues one `launch_transcode_dwt97_quantize_codeblocks` call per sub-band,
/// in the order LL, HL, LH, HH.
///
/// Each band is paired with its output buffer from `outputs`, its own
/// width/height combination (low or high extent per axis), and its
/// `inv_delta_*` value from `params`. The code-block size
/// (`params.cb_width` x `params.cb_height`) and `items` are shared by all four
/// launches.
///
/// # Errors
///
/// Returns `CudaError::LengthTooLarge { len }` when a band dimension or
/// code-block dimension does not fit in `i32`. Propagates failures from
/// `pooled_device_buffer` and from the individual launches; the first failure
/// stops the remaining launches.
fn launch_transcode_dwt97_quantize_codeblock_bands(
    &self,
    bands: &Dwt97BatchDeviceBands,
    outputs: Dwt97CodeblockBandBuffers<'_>,
    params: CudaHtj2k97QuantizeParams,
    items: u32,
) -> Result<(), CudaError> {
    // Kernel arguments are 32-bit signed, so every extent is narrowed first.
    fn narrow(value: usize) -> Result<i32, CudaError> {
        match i32::try_from(value) {
            Ok(narrowed) => Ok(narrowed),
            Err(_) => Err(CudaError::LengthTooLarge { len: value }),
        }
    }

    let low_w = narrow(bands.low_width)?;
    let low_h = narrow(bands.low_height)?;
    let high_w = narrow(bands.high_width)?;
    let high_h = narrow(bands.high_height)?;
    let cb_w = narrow(params.cb_width)?;
    let cb_h = narrow(params.cb_height)?;

    // (source band, destination, band width, band height, inverse step size)
    let band_jobs = [
        (&bands.ll, outputs.ll, low_w, low_h, params.inv_delta_ll),
        (&bands.hl, outputs.hl, high_w, low_h, params.inv_delta_hl),
        (&bands.lh, outputs.lh, low_w, high_h, params.inv_delta_lh),
        (&bands.hh, outputs.hh, high_w, high_h, params.inv_delta_hh),
    ];
    for (source, destination, band_w, band_h, inv_delta) in band_jobs {
        self.launch_transcode_dwt97_quantize_codeblocks(
            pooled_device_buffer(source)?,
            destination,
            band_w,
            band_h,
            cb_w,
            cb_h,
            inv_delta,
            items,
        )?;
    }
    Ok(())
}
/// Launches the `TranscodeDwt97QuantizeCodeblocks` kernel for one band.
///
/// A band whose `width` or `height` is zero or negative is a no-op that
/// returns `Ok(())` before any kernel lookup. Otherwise the base geometry
/// comes from `j2k_dwt53_launch_geometry(width, height)` and is combined with
/// `items` through `with_grid_z`.
///
/// Kernel arguments, in order: `band` pointer, `output` pointer, `width`,
/// `height`, `cb_width`, `cb_height`, `inv_delta`.
///
/// # Errors
///
/// Propagates kernel lookup and launch failures. Returns
/// `CudaError::LengthTooLarge { len: 0 }` when the dimensions cannot be
/// converted to `u32` or no launch geometry is available for them.
// Most parameters are forwarded one-to-one as kernel arguments.
#[allow(clippy::too_many_arguments)]
fn launch_transcode_dwt97_quantize_codeblocks(
    &self,
    band: &CudaDeviceBuffer,
    output: &CudaDeviceBuffer,
    width: i32,
    height: i32,
    cb_width: i32,
    cb_height: i32,
    inv_delta: f32,
    items: u32,
) -> Result<(), CudaError> {
    // Nothing to quantize for an empty band.
    let band_is_empty = width <= 0 || height <= 0;
    if band_is_empty {
        return Ok(());
    }

    let entry_point = self
        .inner
        .kernel_function(CudaKernel::TranscodeDwt97QuantizeCodeblocks)?;

    // Each kernel argument needs its own mutable local for the params macro.
    let mut arg_src = band.device_ptr();
    let mut arg_dst = output.device_ptr();
    let mut arg_width = width;
    let mut arg_height = height;
    let mut arg_cb_width = cb_width;
    let mut arg_cb_height = cb_height;
    let mut arg_inv_delta = inv_delta;
    let mut kernel_args = cuda_kernel_params!(
        arg_src,
        arg_dst,
        arg_width,
        arg_height,
        arg_cb_width,
        arg_cb_height,
        arg_inv_delta
    );

    let Ok(extent_x) = u32::try_from(width) else {
        return Err(CudaError::LengthTooLarge { len: 0 });
    };
    let Ok(extent_y) = u32::try_from(height) else {
        return Err(CudaError::LengthTooLarge { len: 0 });
    };
    let Some(per_item_geometry) = j2k_dwt53_launch_geometry(extent_x, extent_y) else {
        return Err(CudaError::LengthTooLarge { len: 0 });
    };
    let launch_geometry = with_grid_z(per_item_geometry, items);

    self.launch_kernel_async(entry_point, launch_geometry, &mut kernel_args)
}
}