1#[cfg(all(test, feature = "cuda-runtime"))]
4use core::cell::Cell;
5use core::convert::Infallible;
6#[cfg(feature = "cuda-runtime")]
7use std::sync::Arc;
8
9use j2k::{
10 adapter::device_plan::{DeviceDecodePlan, DeviceDecodeRequest},
11 J2kDecoder as CpuDecoder, J2kError, J2kScratchPool as CpuJ2kScratchPool, J2kView,
12};
13#[cfg(feature = "cuda-runtime")]
14use j2k_core::BackendKind;
15use j2k_core::{
16 submit_ready_device, BackendRequest, DecodeOutcome, Downscale, ImageCodec, ImageDecode,
17 ImageDecodeDevice, ImageDecodeSubmit, PixelFormat, ReadySubmission, Rect,
18};
19#[cfg(feature = "cuda-runtime")]
20use j2k_cuda_runtime::{
21 CudaBufferPool, CudaBufferPoolTakeTrace, CudaDeviceBuffer, CudaError, CudaHtj2kCleanupTarget,
22 CudaHtj2kCodeBlockJob, CudaHtj2kDecodeResources, CudaHtj2kDecodeTableResources,
23 CudaHtj2kDequantizeTarget, CudaJ2kIdwtJob, CudaJ2kIdwtTarget, CudaJ2kInverseMctJob,
24 CudaJ2kRect, CudaJ2kStoreGray16Job, CudaJ2kStoreGray8Job, CudaJ2kStoreRgb16Job,
25 CudaJ2kStoreRgb16MctJob, CudaJ2kStoreRgb8Job, CudaJ2kStoreRgb8MctJob,
26 CudaJ2kStoreRgb8MctTarget, CudaPooledDeviceBuffer, CudaQueuedExecution, CudaQueuedHtj2kCleanup,
27};
28use j2k_native::{DecodeSettings, DecoderContext as NativeDecoderContext, Image as NativeImage};
29
30#[cfg(feature = "cuda-runtime")]
31use crate::runtime::cuda_error;
32use crate::runtime::{validate_surface_request, wrap_cpu_staged_cuda_surface, wrap_surface};
33#[cfg(feature = "cuda-runtime")]
34use crate::surface::{cuda_range_storage, Storage};
35use crate::{
36 profile, CudaHtj2kDecodePlan, CudaHtj2kDecodeProfileDetail, CudaHtj2kProfileReport,
37 CudaSession, Error, Surface,
38};
39#[cfg(feature = "cuda-runtime")]
40use crate::{
41 CudaHtj2kBandId, CudaHtj2kIdwtStep, CudaHtj2kStoreStep, CudaHtj2kTransform, CudaSurfaceStats,
42 SurfaceResidency,
43};
44
45#[cfg(feature = "cuda-runtime")]
46const CUDA_HTJ2K_KERNELS_NOT_READY: &str =
47 "strict CUDA HTJ2K resident codestream decode kernels are not available in this build";
48#[cfg(feature = "cuda-runtime")]
49const CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED: &str =
50 "strict CUDA HTJ2K resident decode currently accepts Gray8, Gray16, Rgb8, Rgba8, Rgb16, and Rgba16 output";
51#[cfg(feature = "cuda-runtime")]
52const CUDA_HTJ2K_PLAN_INVARIANT_FAILED: &str =
53 "strict CUDA HTJ2K resident decode plan has invalid internal ranges";
54#[cfg(feature = "cuda-runtime")]
55const CUDA_HTJ2K_STORE_UNSUPPORTED: &str =
56 "strict CUDA HTJ2K resident decode requires a single grayscale store step";
57#[cfg(feature = "cuda-runtime")]
58const CUDA_HTJ2K_BATCH_PAYLOAD_TOO_LARGE: &str =
59 "strict CUDA HTJ2K resident batch decode payload is too large";
60#[cfg(feature = "cuda-runtime")]
61const CUDA_IDWT_TRACE_ENV_VAR: &str = "J2K_CUDA_IDWT_TRACE";
62
63#[cfg(all(test, feature = "cuda-runtime"))]
64std::thread_local! {
65 static CUDA_HTJ2K_BATCH_DECODE_CALLS: Cell<usize> = const { Cell::new(0) };
66}
67
68#[cfg(all(test, feature = "cuda-runtime"))]
69pub(crate) fn testing_reset_cuda_htj2k_batch_decode_calls() {
70 CUDA_HTJ2K_BATCH_DECODE_CALLS.with(|calls| calls.set(0));
71}
72
73#[cfg(all(test, feature = "cuda-runtime"))]
74pub(crate) fn testing_cuda_htj2k_batch_decode_calls() -> usize {
75 CUDA_HTJ2K_BATCH_DECODE_CALLS.with(Cell::get)
76}
77
78#[cfg(any(test, feature = "cuda-runtime"))]
79#[derive(Clone, Copy, Debug, Eq, PartialEq)]
80struct CudaIdwtBatchHostTraceRow {
81 component_count: usize,
82 step_count: usize,
83 output_alloc_us: u128,
84 target_build_us: u128,
85 enqueue_us: u128,
86 output_take_count: usize,
87 output_pool_reuse_count: usize,
88 output_pool_alloc_count: usize,
89 output_pool_scanned_count: usize,
90 output_pool_max_free_count: usize,
91 output_requested_bytes: usize,
92}
93
94#[cfg(any(test, feature = "cuda-runtime"))]
95fn format_cuda_idwt_batch_host_trace_row(row: CudaIdwtBatchHostTraceRow) -> String {
96 format!(
97 "j2k_profile codec=j2k op=cuda_idwt_batch_host path=decode \
98 component_count={} step_count={} output_alloc_us={} target_build_us={} enqueue_us={} \
99 output_take_count={} output_pool_reuse_count={} output_pool_alloc_count={} \
100 output_pool_scanned_count={} output_pool_max_free_count={} output_requested_bytes={}",
101 row.component_count,
102 row.step_count,
103 row.output_alloc_us,
104 row.target_build_us,
105 row.enqueue_us,
106 row.output_take_count,
107 row.output_pool_reuse_count,
108 row.output_pool_alloc_count,
109 row.output_pool_scanned_count,
110 row.output_pool_max_free_count,
111 row.output_requested_bytes
112 )
113}
114
115#[cfg(feature = "cuda-runtime")]
116#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
117struct CudaIdwtOutputPoolTraceTotals {
118 take_count: usize,
119 reuse_count: usize,
120 alloc_count: usize,
121 scanned_count: usize,
122 max_free_count: usize,
123 requested_bytes: usize,
124}
125
126#[cfg(feature = "cuda-runtime")]
127impl CudaIdwtOutputPoolTraceTotals {
128 fn add_take(&mut self, trace: CudaBufferPoolTakeTrace) {
129 self.take_count = self.take_count.saturating_add(1);
130 if trace.reused {
131 self.reuse_count = self.reuse_count.saturating_add(1);
132 } else {
133 self.alloc_count = self.alloc_count.saturating_add(1);
134 }
135 self.scanned_count = self.scanned_count.saturating_add(trace.scanned_count);
136 self.max_free_count = self.max_free_count.max(trace.free_count_before);
137 self.requested_bytes = self.requested_bytes.saturating_add(trace.requested_len);
138 }
139}
140
141#[cfg(feature = "cuda-runtime")]
142fn cuda_idwt_trace_enabled() -> bool {
143 std::env::var_os(CUDA_IDWT_TRACE_ENV_VAR).is_some()
144}
145
146#[cfg(feature = "cuda-runtime")]
147fn elapsed_host_us(start: Option<std::time::Instant>) -> u128 {
148 start.map_or(0, |start| start.elapsed().as_micros())
149}
150
151pub struct J2kDecoder<'a> {
153 inner: CpuDecoder<'a>,
154 pool: CpuJ2kScratchPool,
155}
156
157impl<'a> J2kDecoder<'a> {
158 pub fn new(input: &'a [u8]) -> Result<Self, Error> {
160 Ok(Self {
161 inner: CpuDecoder::new(input)?,
162 pool: CpuJ2kScratchPool::new(),
163 })
164 }
165
166 fn decode_to_surface_impl(
167 &mut self,
168 session: &mut CudaSession,
169 fmt: PixelFormat,
170 backend: BackendRequest,
171 ) -> Result<Surface, Error> {
172 validate_surface_request(backend)?;
173 if matches!(backend, BackendRequest::Cuda) {
174 return self.decode_to_cuda_resident_surface_impl(session, fmt);
175 }
176 let dims = self.inner.info().dimensions;
177 let stride = dims.0 as usize * fmt.bytes_per_pixel();
178 let mut out = vec![0u8; stride * dims.1 as usize];
179 if j2k_profile::gpu_route_profile_enabled() {
180 let request_s = format!("{backend:?}");
181 let fmt_s = format!("{fmt:?}");
182 let width_s = dims.0.to_string();
183 let height_s = dims.1.to_string();
184 j2k_profile::emit_gpu_route_profile(
185 "j2k",
186 "cuda",
187 &[
188 ("op", "full"),
189 ("request", request_s.as_str()),
190 ("fmt", fmt_s.as_str()),
191 ("width", width_s.as_str()),
192 ("height", height_s.as_str()),
193 ("decision", "cpu_decode_then_wrap"),
194 ],
195 );
196 }
197 self.inner
198 .decode_into_with_scratch(&mut self.pool, &mut out, stride, fmt)?;
199 wrap_surface(out, dims, fmt, backend, session)
200 }
201
202 fn decode_to_cuda_resident_surface_impl(
203 &mut self,
204 session: &mut CudaSession,
205 fmt: PixelFormat,
206 ) -> Result<Surface, Error> {
207 decode_to_cuda_resident_surface_impl(self, session, fmt)
208 }
209
210 fn decode_region_to_cuda_resident_surface_impl(
211 &mut self,
212 session: &mut CudaSession,
213 fmt: PixelFormat,
214 roi: Rect,
215 ) -> Result<Surface, Error> {
216 decode_region_to_cuda_resident_surface_impl(self, session, fmt, roi)
217 }
218
219 fn decode_scaled_to_cuda_resident_surface_impl(
220 &mut self,
221 session: &mut CudaSession,
222 fmt: PixelFormat,
223 scale: Downscale,
224 ) -> Result<Surface, Error> {
225 decode_scaled_to_cuda_resident_surface_impl(self, session, fmt, scale)
226 }
227
228 fn decode_region_scaled_to_cuda_resident_surface_impl(
229 &mut self,
230 session: &mut CudaSession,
231 fmt: PixelFormat,
232 roi: Rect,
233 scale: Downscale,
234 ) -> Result<Surface, Error> {
235 decode_region_scaled_to_cuda_resident_surface_impl(self, session, fmt, roi, scale)
236 }
237
238 fn decode_region_to_surface_impl(
239 &mut self,
240 session: &mut CudaSession,
241 fmt: PixelFormat,
242 roi: Rect,
243 backend: BackendRequest,
244 ) -> Result<Surface, Error> {
245 validate_surface_request(backend)?;
246 if matches!(backend, BackendRequest::Cuda) {
247 return self.decode_region_to_cuda_resident_surface_impl(session, fmt, roi);
248 }
249 let plan = DeviceDecodePlan::for_image(
250 self.inner.info().dimensions,
251 DeviceDecodeRequest::Region { roi },
252 )?;
253 let dims = plan.output_dims();
254 let stride = dims.0 as usize * fmt.bytes_per_pixel();
255 let mut out = vec![0u8; stride * dims.1 as usize];
256 self.inner
257 .decode_region_into(&mut self.pool, &mut out, stride, fmt, plan.source_rect())?;
258 wrap_surface(out, dims, fmt, backend, session)
259 }
260
261 fn decode_scaled_to_surface_impl(
262 &mut self,
263 session: &mut CudaSession,
264 fmt: PixelFormat,
265 scale: Downscale,
266 backend: BackendRequest,
267 ) -> Result<Surface, Error> {
268 validate_surface_request(backend)?;
269 if matches!(backend, BackendRequest::Cuda) {
270 return self.decode_scaled_to_cuda_resident_surface_impl(session, fmt, scale);
271 }
272 let dims = DeviceDecodePlan::for_image(
273 self.inner.info().dimensions,
274 DeviceDecodeRequest::Scaled { scale },
275 )?
276 .output_dims();
277 let stride = dims.0 as usize * fmt.bytes_per_pixel();
278 let mut out = vec![0u8; stride * dims.1 as usize];
279 self.inner
280 .decode_scaled_into(&mut self.pool, &mut out, stride, fmt, scale)?;
281 wrap_surface(out, dims, fmt, backend, session)
282 }
283
284 fn decode_region_scaled_to_surface_impl(
285 &mut self,
286 session: &mut CudaSession,
287 fmt: PixelFormat,
288 roi: Rect,
289 scale: Downscale,
290 backend: BackendRequest,
291 ) -> Result<Surface, Error> {
292 validate_surface_request(backend)?;
293 if matches!(backend, BackendRequest::Cuda) {
294 return self
295 .decode_region_scaled_to_cuda_resident_surface_impl(session, fmt, roi, scale);
296 }
297 let plan = DeviceDecodePlan::for_image(
298 self.inner.info().dimensions,
299 DeviceDecodeRequest::RegionScaled { roi, scale },
300 )?;
301 let dims = plan.output_dims();
302 let stride = dims.0 as usize * fmt.bytes_per_pixel();
303 let mut out = vec![0u8; stride * dims.1 as usize];
304 self.inner.decode_region_scaled_into(
305 &mut self.pool,
306 &mut out,
307 stride,
308 fmt,
309 plan.source_rect(),
310 scale,
311 )?;
312 wrap_surface(out, dims, fmt, backend, session)
313 }
314
315 pub fn decode_to_device_with_session(
318 &mut self,
319 fmt: PixelFormat,
320 session: &mut CudaSession,
321 ) -> Result<Surface, Error> {
322 self.decode_to_surface_impl(session, fmt, BackendRequest::Cuda)
323 }
324
325 pub fn decode_to_device_with_session_and_profile(
328 &mut self,
329 fmt: PixelFormat,
330 session: &mut CudaSession,
331 ) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
332 decode_to_cuda_resident_surface_with_profile_impl(self, session, fmt)
333 }
334
335 pub fn decode_batch_to_device_with_session(
338 inputs: &[&[u8]],
339 fmt: PixelFormat,
340 session: &mut CudaSession,
341 ) -> Result<Vec<Surface>, Error> {
342 decode_batch_to_cuda_resident_surface_with_profile_control(inputs, session, fmt, false)
343 .map(|(surfaces, _report)| surfaces)
344 }
345
346 pub fn decode_batch_to_device_with_session_and_profile(
349 inputs: &[&[u8]],
350 fmt: PixelFormat,
351 session: &mut CudaSession,
352 ) -> Result<(Vec<Surface>, CudaHtj2kProfileReport), Error> {
353 decode_batch_to_cuda_resident_surface_with_profile_control(inputs, session, fmt, true)
354 }
355
356 pub(crate) fn decode_region_to_device_with_session(
359 &mut self,
360 fmt: PixelFormat,
361 roi: Rect,
362 session: &mut CudaSession,
363 ) -> Result<Surface, Error> {
364 self.decode_region_to_surface_impl(session, fmt, roi, BackendRequest::Cuda)
365 }
366
367 pub(crate) fn decode_scaled_to_device_with_session(
370 &mut self,
371 fmt: PixelFormat,
372 scale: Downscale,
373 session: &mut CudaSession,
374 ) -> Result<Surface, Error> {
375 self.decode_scaled_to_surface_impl(session, fmt, scale, BackendRequest::Cuda)
376 }
377
378 pub(crate) fn decode_region_scaled_to_device_with_session(
381 &mut self,
382 fmt: PixelFormat,
383 roi: Rect,
384 scale: Downscale,
385 session: &mut CudaSession,
386 ) -> Result<Surface, Error> {
387 self.decode_region_scaled_to_surface_impl(session, fmt, roi, scale, BackendRequest::Cuda)
388 }
389
390 pub fn decode_to_host_surface(&mut self, fmt: PixelFormat) -> Result<Surface, Error> {
392 let mut session = CudaSession::default();
393 self.decode_to_surface_impl(&mut session, fmt, BackendRequest::Cpu)
394 }
395
396 pub fn build_cuda_htj2k_grayscale_plan_with_profile(
398 &mut self,
399 fmt: PixelFormat,
400 ) -> Result<(CudaHtj2kDecodePlan, CudaHtj2kProfileReport), Error> {
401 let total_start = profile::profile_now(true);
402
403 let parse_start = profile::profile_now(true);
404 let image = NativeImage::new(self.inner.bytes(), &DecodeSettings::default())
405 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
406 let parse_us = profile::elapsed_us(parse_start);
407
408 let plan_start = profile::profile_now(true);
409 let mut native_context = NativeDecoderContext::default();
410 let native_plan = image
411 .build_direct_grayscale_plan_with_context(&mut native_context)
412 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
413 let plan_us = profile::elapsed_us(plan_start);
414
415 let flatten_start = profile::profile_now(true);
416 let cuda_plan = CudaHtj2kDecodePlan::from_grayscale_direct_plan(&native_plan, fmt, (0, 0))?;
417 let flatten_us = profile::elapsed_us(flatten_start);
418
419 let report = CudaHtj2kProfileReport {
420 parse_us,
421 plan_us,
422 flatten_us,
423 total_us: profile::elapsed_us(total_start),
424 block_count: cuda_plan.code_blocks().len(),
425 payload_bytes: cuda_plan.payload().len(),
426 dispatch_count: 0,
427 residency: crate::SurfaceResidency::CudaResidentDecode,
428 detail: CudaHtj2kDecodeProfileDetail::default(),
429 ..CudaHtj2kProfileReport::default()
430 };
431 report.emit("plan");
432 Ok((cuda_plan, report))
433 }
434
435 pub fn build_cuda_htj2k_grayscale_region_plan_with_profile(
437 &mut self,
438 fmt: PixelFormat,
439 roi: Rect,
440 ) -> Result<(CudaHtj2kDecodePlan, CudaHtj2kProfileReport), Error> {
441 let total_start = profile::profile_now(true);
442
443 let parse_start = profile::profile_now(true);
444 let image = NativeImage::new(self.inner.bytes(), &DecodeSettings::default())
445 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
446 let parse_us = profile::elapsed_us(parse_start);
447
448 let plan_start = profile::profile_now(true);
449 let mut native_context = NativeDecoderContext::default();
450 let native_plan = image
451 .build_direct_grayscale_plan_region_with_context(
452 &mut native_context,
453 (roi.x, roi.y, roi.w, roi.h),
454 )
455 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
456 let plan_us = profile::elapsed_us(plan_start);
457
458 let flatten_start = profile::profile_now(true);
459 let cuda_plan = CudaHtj2kDecodePlan::from_grayscale_direct_plan_region(
460 &native_plan,
461 fmt,
462 (roi.x, roi.y),
463 (roi.w, roi.h),
464 )?;
465 let flatten_us = profile::elapsed_us(flatten_start);
466
467 let report = CudaHtj2kProfileReport {
468 parse_us,
469 plan_us,
470 flatten_us,
471 total_us: profile::elapsed_us(total_start),
472 block_count: cuda_plan.code_blocks().len(),
473 payload_bytes: cuda_plan.payload().len(),
474 dispatch_count: 0,
475 residency: crate::SurfaceResidency::CudaResidentDecode,
476 detail: CudaHtj2kDecodeProfileDetail::default(),
477 ..CudaHtj2kProfileReport::default()
478 };
479 report.emit("plan");
480 Ok((cuda_plan, report))
481 }
482
483 pub fn build_cuda_htj2k_grayscale_scaled_plan_with_profile(
486 &mut self,
487 fmt: PixelFormat,
488 output_dimensions: (u32, u32),
489 ) -> Result<(CudaHtj2kDecodePlan, CudaHtj2kProfileReport), Error> {
490 let total_start = profile::profile_now(true);
491
492 let parse_start = profile::profile_now(true);
493 let image = NativeImage::new(
494 self.inner.bytes(),
495 &DecodeSettings {
496 target_resolution: Some(output_dimensions),
497 ..DecodeSettings::default()
498 },
499 )
500 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
501 let parse_us = profile::elapsed_us(parse_start);
502
503 let plan_start = profile::profile_now(true);
504 let mut native_context = NativeDecoderContext::default();
505 let native_plan = image
506 .build_direct_grayscale_plan_with_context(&mut native_context)
507 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
508 let plan_us = profile::elapsed_us(plan_start);
509
510 let flatten_start = profile::profile_now(true);
511 let cuda_plan = CudaHtj2kDecodePlan::from_grayscale_direct_plan(&native_plan, fmt, (0, 0))?;
512 let flatten_us = profile::elapsed_us(flatten_start);
513
514 let report = CudaHtj2kProfileReport {
515 parse_us,
516 plan_us,
517 flatten_us,
518 total_us: profile::elapsed_us(total_start),
519 block_count: cuda_plan.code_blocks().len(),
520 payload_bytes: cuda_plan.payload().len(),
521 dispatch_count: 0,
522 residency: crate::SurfaceResidency::CudaResidentDecode,
523 detail: CudaHtj2kDecodeProfileDetail::default(),
524 ..CudaHtj2kProfileReport::default()
525 };
526 report.emit("plan");
527 Ok((cuda_plan, report))
528 }
529
530 pub fn build_cuda_htj2k_grayscale_region_scaled_plan_with_profile(
533 &mut self,
534 fmt: PixelFormat,
535 scaled_roi: Rect,
536 output_dimensions: (u32, u32),
537 ) -> Result<(CudaHtj2kDecodePlan, CudaHtj2kProfileReport), Error> {
538 let total_start = profile::profile_now(true);
539
540 let parse_start = profile::profile_now(true);
541 let image = NativeImage::new(
542 self.inner.bytes(),
543 &DecodeSettings {
544 target_resolution: Some(output_dimensions),
545 ..DecodeSettings::default()
546 },
547 )
548 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
549 let parse_us = profile::elapsed_us(parse_start);
550
551 let plan_start = profile::profile_now(true);
552 let mut native_context = NativeDecoderContext::default();
553 let native_plan = image
554 .build_direct_grayscale_plan_region_with_context(
555 &mut native_context,
556 (scaled_roi.x, scaled_roi.y, scaled_roi.w, scaled_roi.h),
557 )
558 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
559 let plan_us = profile::elapsed_us(plan_start);
560
561 let flatten_start = profile::profile_now(true);
562 let cuda_plan = CudaHtj2kDecodePlan::from_grayscale_direct_plan_region(
563 &native_plan,
564 fmt,
565 (scaled_roi.x, scaled_roi.y),
566 (scaled_roi.w, scaled_roi.h),
567 )?;
568 let flatten_us = profile::elapsed_us(flatten_start);
569
570 let report = CudaHtj2kProfileReport {
571 parse_us,
572 plan_us,
573 flatten_us,
574 total_us: profile::elapsed_us(total_start),
575 block_count: cuda_plan.code_blocks().len(),
576 payload_bytes: cuda_plan.payload().len(),
577 dispatch_count: 0,
578 residency: crate::SurfaceResidency::CudaResidentDecode,
579 detail: CudaHtj2kDecodeProfileDetail::default(),
580 ..CudaHtj2kProfileReport::default()
581 };
582 report.emit("plan");
583 Ok((cuda_plan, report))
584 }
585
586 #[cfg(feature = "cuda-runtime")]
588 fn build_cuda_htj2k_color_plans_with_profile(
589 &mut self,
590 fmt: PixelFormat,
591 ) -> Result<CudaHtj2kColorDecodePlans, Error> {
592 let mut native_context = NativeDecoderContext::default();
593 build_cuda_htj2k_color_plans_from_bytes_with_profile(
594 self.inner.bytes(),
595 fmt,
596 &mut native_context,
597 )
598 }
599
600 #[cfg(feature = "cuda-runtime")]
601 fn build_cuda_htj2k_color_scaled_plans_with_profile(
602 &mut self,
603 fmt: PixelFormat,
604 output_dimensions: (u32, u32),
605 ) -> Result<CudaHtj2kColorDecodePlans, Error> {
606 let total_start = profile::profile_now(true);
607
608 let parse_start = profile::profile_now(true);
609 let image = NativeImage::new(
610 self.inner.bytes(),
611 &DecodeSettings {
612 target_resolution: Some(output_dimensions),
613 ..DecodeSettings::default()
614 },
615 )
616 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
617 let parse_us = profile::elapsed_us(parse_start);
618
619 let plan_start = profile::profile_now(true);
620 let mut native_context = NativeDecoderContext::default();
621 let native_plan = image
622 .build_direct_color_plan_with_context(&mut native_context)
623 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
624 let plan_us = profile::elapsed_us(plan_start);
625
626 let flatten_start = profile::profile_now(true);
627 let mut payload = Vec::new();
628 let mut components = Vec::with_capacity(native_plan.component_plans.len());
629 for component_plan in &native_plan.component_plans {
630 let mut component =
631 CudaHtj2kDecodePlan::from_grayscale_direct_plan(component_plan, fmt, (0, 0))?;
632 component.append_payload_to_shared(&mut payload)?;
633 components.push(component);
634 }
635 let flatten_us = profile::elapsed_us(flatten_start);
636 let block_count = components
637 .iter()
638 .map(|plan| plan.code_blocks().len())
639 .sum::<usize>();
640 let payload_bytes = payload.len();
641 let report = CudaHtj2kProfileReport {
642 parse_us,
643 plan_us,
644 flatten_us,
645 total_us: profile::elapsed_us(total_start),
646 block_count,
647 payload_bytes,
648 dispatch_count: 0,
649 residency: crate::SurfaceResidency::CudaResidentDecode,
650 detail: CudaHtj2kDecodeProfileDetail::default(),
651 ..CudaHtj2kProfileReport::default()
652 };
653 report.emit("plan");
654
655 Ok(CudaHtj2kColorDecodePlans {
656 dimensions: native_plan.dimensions,
657 mct_dimensions: native_plan.dimensions,
658 bit_depths: native_plan.bit_depths,
659 mct: native_plan.mct,
660 transform: CudaHtj2kTransform::from_native(native_plan.transform),
661 payload,
662 components,
663 report,
664 })
665 }
666
667 #[cfg(feature = "cuda-runtime")]
668 fn build_cuda_htj2k_color_region_plans_with_profile(
669 &mut self,
670 fmt: PixelFormat,
671 roi: Rect,
672 ) -> Result<CudaHtj2kColorDecodePlans, Error> {
673 let total_start = profile::profile_now(true);
674
675 let parse_start = profile::profile_now(true);
676 let image = NativeImage::new(self.inner.bytes(), &DecodeSettings::default())
677 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
678 let parse_us = profile::elapsed_us(parse_start);
679
680 let plan_start = profile::profile_now(true);
681 let mut native_context = NativeDecoderContext::default();
682 let native_plan = image
683 .build_direct_color_plan_with_context(&mut native_context)
684 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
685 let plan_us = profile::elapsed_us(plan_start);
686
687 let flatten_start = profile::profile_now(true);
688 let mut payload = Vec::new();
689 let mut components = Vec::with_capacity(native_plan.component_plans.len());
690 for component_plan in &native_plan.component_plans {
691 let mut component = CudaHtj2kDecodePlan::from_grayscale_direct_plan_region(
692 component_plan,
693 fmt,
694 (roi.x, roi.y),
695 (roi.w, roi.h),
696 )?;
697 component.append_payload_to_shared(&mut payload)?;
698 components.push(component);
699 }
700 let flatten_us = profile::elapsed_us(flatten_start);
701 let block_count = components
702 .iter()
703 .map(|plan| plan.code_blocks().len())
704 .sum::<usize>();
705 let payload_bytes = payload.len();
706 let report = CudaHtj2kProfileReport {
707 parse_us,
708 plan_us,
709 flatten_us,
710 total_us: profile::elapsed_us(total_start),
711 block_count,
712 payload_bytes,
713 dispatch_count: 0,
714 residency: crate::SurfaceResidency::CudaResidentDecode,
715 detail: CudaHtj2kDecodeProfileDetail::default(),
716 ..CudaHtj2kProfileReport::default()
717 };
718 report.emit("plan");
719
720 Ok(CudaHtj2kColorDecodePlans {
721 dimensions: (roi.w, roi.h),
722 mct_dimensions: native_plan.dimensions,
723 bit_depths: native_plan.bit_depths,
724 mct: native_plan.mct,
725 transform: CudaHtj2kTransform::from_native(native_plan.transform),
726 payload,
727 components,
728 report,
729 })
730 }
731
732 #[cfg(feature = "cuda-runtime")]
733 fn build_cuda_htj2k_color_region_scaled_plans_with_profile(
734 &mut self,
735 fmt: PixelFormat,
736 scaled_roi: Rect,
737 output_dimensions: (u32, u32),
738 ) -> Result<CudaHtj2kColorDecodePlans, Error> {
739 let total_start = profile::profile_now(true);
740
741 let parse_start = profile::profile_now(true);
742 let image = NativeImage::new(
743 self.inner.bytes(),
744 &DecodeSettings {
745 target_resolution: Some(output_dimensions),
746 ..DecodeSettings::default()
747 },
748 )
749 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
750 let parse_us = profile::elapsed_us(parse_start);
751
752 let plan_start = profile::profile_now(true);
753 let mut native_context = NativeDecoderContext::default();
754 let native_plan = image
755 .build_direct_color_plan_with_context(&mut native_context)
756 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
757 let plan_us = profile::elapsed_us(plan_start);
758
759 let flatten_start = profile::profile_now(true);
760 let mut payload = Vec::new();
761 let mut components = Vec::with_capacity(native_plan.component_plans.len());
762 for component_plan in &native_plan.component_plans {
763 let mut component = CudaHtj2kDecodePlan::from_grayscale_direct_plan_region(
764 component_plan,
765 fmt,
766 (scaled_roi.x, scaled_roi.y),
767 (scaled_roi.w, scaled_roi.h),
768 )?;
769 component.append_payload_to_shared(&mut payload)?;
770 components.push(component);
771 }
772 let flatten_us = profile::elapsed_us(flatten_start);
773 let block_count = components
774 .iter()
775 .map(|plan| plan.code_blocks().len())
776 .sum::<usize>();
777 let payload_bytes = payload.len();
778 let report = CudaHtj2kProfileReport {
779 parse_us,
780 plan_us,
781 flatten_us,
782 total_us: profile::elapsed_us(total_start),
783 block_count,
784 payload_bytes,
785 dispatch_count: 0,
786 residency: crate::SurfaceResidency::CudaResidentDecode,
787 detail: CudaHtj2kDecodeProfileDetail::default(),
788 ..CudaHtj2kProfileReport::default()
789 };
790 report.emit("plan");
791
792 Ok(CudaHtj2kColorDecodePlans {
793 dimensions: (scaled_roi.w, scaled_roi.h),
794 mct_dimensions: native_plan.dimensions,
795 bit_depths: native_plan.bit_depths,
796 mct: native_plan.mct,
797 transform: CudaHtj2kTransform::from_native(native_plan.transform),
798 payload,
799 components,
800 report,
801 })
802 }
803
804 pub fn decode_to_cpu_staged_cuda_surface_with_session(
807 &mut self,
808 fmt: PixelFormat,
809 session: &mut CudaSession,
810 ) -> Result<Surface, Error> {
811 let dims = self.inner.info().dimensions;
812 let stride = dims.0 as usize * fmt.bytes_per_pixel();
813 let mut out = vec![0u8; stride * dims.1 as usize];
814 self.inner
815 .decode_into_with_scratch(&mut self.pool, &mut out, stride, fmt)?;
816 wrap_cpu_staged_cuda_surface(&out, dims, fmt, session)
817 }
818
819 pub fn decode_region_to_cpu_staged_cuda_surface_with_session(
822 &mut self,
823 fmt: PixelFormat,
824 roi: Rect,
825 session: &mut CudaSession,
826 ) -> Result<Surface, Error> {
827 let plan = DeviceDecodePlan::for_image(
828 self.inner.info().dimensions,
829 DeviceDecodeRequest::Region { roi },
830 )?;
831 let dims = plan.output_dims();
832 let stride = dims.0 as usize * fmt.bytes_per_pixel();
833 let mut out = vec![0u8; stride * dims.1 as usize];
834 self.inner
835 .decode_region_into(&mut self.pool, &mut out, stride, fmt, plan.source_rect())?;
836 wrap_cpu_staged_cuda_surface(&out, dims, fmt, session)
837 }
838
839 pub fn decode_scaled_to_cpu_staged_cuda_surface_with_session(
842 &mut self,
843 fmt: PixelFormat,
844 scale: Downscale,
845 session: &mut CudaSession,
846 ) -> Result<Surface, Error> {
847 let dims = DeviceDecodePlan::for_image(
848 self.inner.info().dimensions,
849 DeviceDecodeRequest::Scaled { scale },
850 )?
851 .output_dims();
852 let stride = dims.0 as usize * fmt.bytes_per_pixel();
853 let mut out = vec![0u8; stride * dims.1 as usize];
854 self.inner
855 .decode_scaled_into(&mut self.pool, &mut out, stride, fmt, scale)?;
856 wrap_cpu_staged_cuda_surface(&out, dims, fmt, session)
857 }
858
859 pub fn decode_region_scaled_to_cpu_staged_cuda_surface_with_session(
862 &mut self,
863 fmt: PixelFormat,
864 roi: Rect,
865 scale: Downscale,
866 session: &mut CudaSession,
867 ) -> Result<Surface, Error> {
868 let plan = DeviceDecodePlan::for_image(
869 self.inner.info().dimensions,
870 DeviceDecodeRequest::RegionScaled { roi, scale },
871 )?;
872 let dims = plan.output_dims();
873 let stride = dims.0 as usize * fmt.bytes_per_pixel();
874 let mut out = vec![0u8; stride * dims.1 as usize];
875 self.inner.decode_region_scaled_into(
876 &mut self.pool,
877 &mut out,
878 stride,
879 fmt,
880 plan.source_rect(),
881 scale,
882 )?;
883 wrap_cpu_staged_cuda_surface(&out, dims, fmt, session)
884 }
885}
886
887#[cfg(feature = "cuda-runtime")]
888struct CudaCoefficientBand {
889 band_id: CudaHtj2kBandId,
890 buffer: CudaPooledDeviceBuffer,
891}
892
893#[cfg(feature = "cuda-runtime")]
894struct CudaPendingDequantBand {
895 band_index: usize,
896 jobs: Vec<CudaHtj2kCodeBlockJob>,
897 output_words: usize,
898}
899
900#[cfg(feature = "cuda-runtime")]
901struct CudaComponentDecodeWork {
902 bands: Vec<CudaCoefficientBand>,
903 pending_dequant_bands: Vec<CudaPendingDequantBand>,
904 store: CudaHtj2kStoreStep,
905 dispatches: usize,
906 decode_dispatches: usize,
907 timings: CudaDecodeStageTimings,
908}
909
910#[cfg(feature = "cuda-runtime")]
911struct CudaQueuedIdwtBatch {
912 queued: Vec<CudaQueuedExecution>,
913 kernel_dispatches: usize,
914 decode_dispatches: usize,
915}
916
917#[cfg(feature = "cuda-runtime")]
918struct CudaDecodedComponent {
919 buffer: CudaPooledDeviceBuffer,
920 store: CudaHtj2kStoreStep,
921 dispatches: usize,
922 decode_dispatches: usize,
923 timings: CudaDecodeStageTimings,
924}
925
926#[cfg(feature = "cuda-runtime")]
927struct CudaPreparedRgb8MctBatchStore {
928 color: CudaHtj2kColorDecodePlans,
929 decoded_components: Vec<CudaDecodedComponent>,
930 dispatches: usize,
931 decode_dispatches: usize,
932 job: CudaJ2kStoreRgb8MctJob,
933}
934
935#[cfg(feature = "cuda-runtime")]
936#[derive(Clone, Copy, Debug, Default)]
937struct CudaDecodeStageTimings {
938 h2d: u128,
939 payload_upload: u128,
940 status_d2h: u128,
941 ht_cleanup: u128,
942 ht_refine: u128,
943 dequant: u128,
944 ht_dispatch_count: usize,
945 idwt: u128,
946 dequant_dispatch_count: usize,
947 idwt_dispatch_count: usize,
948}
949
950#[cfg(feature = "cuda-runtime")]
951impl CudaDecodeStageTimings {
952 fn add_to_report(self, report: &mut CudaHtj2kProfileReport) {
953 report.h2d_us = report.h2d_us.saturating_add(self.h2d);
954 report.detail.payload_upload_us = report
955 .detail
956 .payload_upload_us
957 .saturating_add(self.payload_upload);
958 report.detail.status_d2h_us = report.detail.status_d2h_us.saturating_add(self.status_d2h);
959 report.ht_cleanup_us = report.ht_cleanup_us.saturating_add(self.ht_cleanup);
960 report.ht_refine_us = report.ht_refine_us.saturating_add(self.ht_refine);
961 report.dequant_us = report.dequant_us.saturating_add(self.dequant);
962 report.idwt_us = report.idwt_us.saturating_add(self.idwt);
963 report.detail.ht_dispatch_count = report
964 .detail
965 .ht_dispatch_count
966 .saturating_add(self.ht_dispatch_count);
967 report.detail.dequant_dispatch_count = report
968 .detail
969 .dequant_dispatch_count
970 .saturating_add(self.dequant_dispatch_count);
971 report.detail.idwt_dispatch_count = report
972 .detail
973 .idwt_dispatch_count
974 .saturating_add(self.idwt_dispatch_count);
975 }
976}
977
978#[cfg(feature = "cuda-runtime")]
979struct CudaHtj2kColorDecodePlans {
980 dimensions: (u32, u32),
981 mct_dimensions: (u32, u32),
982 bit_depths: [u8; 3],
983 mct: bool,
984 transform: CudaHtj2kTransform,
985 payload: Vec<u8>,
986 components: Vec<CudaHtj2kDecodePlan>,
987 report: CudaHtj2kProfileReport,
988}
989
990#[cfg(feature = "cuda-runtime")]
991fn decode_to_cuda_resident_surface_impl(
992 decoder: &mut J2kDecoder<'_>,
993 session: &mut CudaSession,
994 fmt: PixelFormat,
995) -> Result<Surface, Error> {
996 decode_to_cuda_resident_surface_with_profile_control(decoder, session, fmt, false)
997 .map(|(surface, _report)| surface)
998}
999
1000#[cfg(feature = "cuda-runtime")]
1001fn decode_to_cuda_resident_surface_with_profile_impl(
1002 decoder: &mut J2kDecoder<'_>,
1003 session: &mut CudaSession,
1004 fmt: PixelFormat,
1005) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
1006 decode_to_cuda_resident_surface_with_profile_control(decoder, session, fmt, true)
1007}
1008
1009#[cfg(feature = "cuda-runtime")]
1010fn decode_to_cuda_resident_surface_with_profile_control(
1011 decoder: &mut J2kDecoder<'_>,
1012 session: &mut CudaSession,
1013 fmt: PixelFormat,
1014 collect_stage_timings: bool,
1015) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
1016 let collect_stage_timings = collect_stage_timings || profile::profile_stages_enabled();
1017 let wall_started = profile::profile_now(collect_stage_timings);
1018 match fmt {
1019 PixelFormat::Gray8 | PixelFormat::Gray16 => {
1020 decode_grayscale_cuda_resident_surface_with_profile(
1021 decoder,
1022 session,
1023 fmt,
1024 wall_started,
1025 collect_stage_timings,
1026 )
1027 }
1028 PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
1029 decode_color_cuda_resident_surface_with_profile(
1030 decoder,
1031 session,
1032 fmt,
1033 wall_started,
1034 collect_stage_timings,
1035 )
1036 }
1037 _ => Err(Error::UnsupportedCudaRequest {
1038 reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
1039 }),
1040 }
1041}
1042
1043#[cfg(feature = "cuda-runtime")]
1044fn decode_batch_to_cuda_resident_surface_with_profile_control(
1045 inputs: &[&[u8]],
1046 session: &mut CudaSession,
1047 fmt: PixelFormat,
1048 collect_stage_timings: bool,
1049) -> Result<(Vec<Surface>, CudaHtj2kProfileReport), Error> {
1050 #[cfg(all(test, feature = "cuda-runtime"))]
1051 CUDA_HTJ2K_BATCH_DECODE_CALLS.with(|calls| calls.set(calls.get().saturating_add(1)));
1052
1053 let collect_stage_timings = collect_stage_timings || profile::profile_stages_enabled();
1054 if inputs.is_empty() {
1055 return Ok((
1056 Vec::new(),
1057 CudaHtj2kProfileReport {
1058 residency: SurfaceResidency::CudaResidentDecode,
1059 ..CudaHtj2kProfileReport::default()
1060 },
1061 ));
1062 }
1063 match fmt {
1064 PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
1065 decode_color_cuda_resident_batch_surfaces_with_profile(
1066 inputs,
1067 session,
1068 fmt,
1069 collect_stage_timings,
1070 )
1071 }
1072 _ => Err(Error::UnsupportedCudaRequest {
1073 reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
1074 }),
1075 }
1076}
1077
1078#[cfg(feature = "cuda-runtime")]
1079fn decode_region_to_cuda_resident_surface_impl(
1080 decoder: &mut J2kDecoder<'_>,
1081 session: &mut CudaSession,
1082 fmt: PixelFormat,
1083 roi: Rect,
1084) -> Result<Surface, Error> {
1085 let plan = DeviceDecodePlan::for_image(
1086 decoder.inner.info().dimensions,
1087 DeviceDecodeRequest::Region { roi },
1088 )?;
1089 if plan.is_full_frame() {
1090 return decode_to_cuda_resident_surface_impl(decoder, session, fmt);
1091 }
1092
1093 match fmt {
1094 PixelFormat::Gray8 | PixelFormat::Gray16 => {
1095 decode_grayscale_cuda_resident_region_surface(decoder, session, fmt, plan.source_rect())
1096 }
1097 PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
1098 decode_color_cuda_resident_region_surface(decoder, session, fmt, plan.source_rect())
1099 }
1100 _ => Err(Error::UnsupportedCudaRequest {
1101 reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
1102 }),
1103 }
1104}
1105
1106#[cfg(feature = "cuda-runtime")]
1107fn decode_scaled_to_cuda_resident_surface_impl(
1108 decoder: &mut J2kDecoder<'_>,
1109 session: &mut CudaSession,
1110 fmt: PixelFormat,
1111 scale: Downscale,
1112) -> Result<Surface, Error> {
1113 if scale == Downscale::None {
1114 return decode_to_cuda_resident_surface_impl(decoder, session, fmt);
1115 }
1116 let output_dimensions = DeviceDecodePlan::for_image(
1117 decoder.inner.info().dimensions,
1118 DeviceDecodeRequest::Scaled { scale },
1119 )?
1120 .output_dims();
1121
1122 match fmt {
1123 PixelFormat::Gray8 | PixelFormat::Gray16 => {
1124 decode_grayscale_cuda_resident_scaled_surface(decoder, session, fmt, output_dimensions)
1125 }
1126 PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
1127 decode_color_cuda_resident_scaled_surface(decoder, session, fmt, output_dimensions)
1128 }
1129 _ => Err(Error::UnsupportedCudaRequest {
1130 reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
1131 }),
1132 }
1133}
1134
1135#[cfg(feature = "cuda-runtime")]
1136fn decode_region_scaled_to_cuda_resident_surface_impl(
1137 decoder: &mut J2kDecoder<'_>,
1138 session: &mut CudaSession,
1139 fmt: PixelFormat,
1140 roi: Rect,
1141 scale: Downscale,
1142) -> Result<Surface, Error> {
1143 if scale == Downscale::None {
1144 return decode_region_to_cuda_resident_surface_impl(decoder, session, fmt, roi);
1145 }
1146 let source_dimensions = decoder.inner.info().dimensions;
1147 let scaled_dimensions =
1148 DeviceDecodePlan::for_image(source_dimensions, DeviceDecodeRequest::Scaled { scale })?
1149 .output_dims();
1150 let plan = DeviceDecodePlan::for_image(
1151 source_dimensions,
1152 DeviceDecodeRequest::RegionScaled { roi, scale },
1153 )?;
1154 let scaled_roi = plan.output_rect();
1155
1156 match fmt {
1157 PixelFormat::Gray8 | PixelFormat::Gray16 => {
1158 decode_grayscale_cuda_resident_region_scaled_surface(
1159 decoder,
1160 session,
1161 fmt,
1162 scaled_roi,
1163 scaled_dimensions,
1164 )
1165 }
1166 PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
1167 decode_color_cuda_resident_region_scaled_surface(
1168 decoder,
1169 session,
1170 fmt,
1171 scaled_roi,
1172 scaled_dimensions,
1173 )
1174 }
1175 _ => Err(Error::UnsupportedCudaRequest {
1176 reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
1177 }),
1178 }
1179}
1180
1181#[cfg(feature = "cuda-runtime")]
1182fn decode_grayscale_cuda_resident_surface_with_profile(
1183 decoder: &mut J2kDecoder<'_>,
1184 session: &mut CudaSession,
1185 fmt: PixelFormat,
1186 wall_started: Option<profile::ProfileInstant>,
1187 collect_stage_timings: bool,
1188) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
1189 let (plan, mut report) = decoder.build_cuda_htj2k_grayscale_plan_with_profile(fmt)?;
1190 decode_grayscale_cuda_resident_surface_with_plan_profile(
1191 session,
1192 fmt,
1193 &plan,
1194 &mut report,
1195 wall_started,
1196 collect_stage_timings,
1197 )
1198}
1199
1200#[cfg(feature = "cuda-runtime")]
1201fn decode_grayscale_cuda_resident_region_surface(
1202 decoder: &mut J2kDecoder<'_>,
1203 session: &mut CudaSession,
1204 fmt: PixelFormat,
1205 roi: Rect,
1206) -> Result<Surface, Error> {
1207 let collect_stage_timings = profile::profile_stages_enabled();
1208 let wall_started = profile::profile_now(collect_stage_timings);
1209 let (plan, mut report) =
1210 decoder.build_cuda_htj2k_grayscale_region_plan_with_profile(fmt, roi)?;
1211 decode_grayscale_cuda_resident_surface_with_plan_profile(
1212 session,
1213 fmt,
1214 &plan,
1215 &mut report,
1216 wall_started,
1217 collect_stage_timings,
1218 )
1219 .map(|(surface, _report)| surface)
1220}
1221
1222#[cfg(feature = "cuda-runtime")]
1223fn decode_grayscale_cuda_resident_scaled_surface(
1224 decoder: &mut J2kDecoder<'_>,
1225 session: &mut CudaSession,
1226 fmt: PixelFormat,
1227 output_dimensions: (u32, u32),
1228) -> Result<Surface, Error> {
1229 let collect_stage_timings = profile::profile_stages_enabled();
1230 let wall_started = profile::profile_now(collect_stage_timings);
1231 let (plan, mut report) =
1232 decoder.build_cuda_htj2k_grayscale_scaled_plan_with_profile(fmt, output_dimensions)?;
1233 decode_grayscale_cuda_resident_surface_with_plan_profile(
1234 session,
1235 fmt,
1236 &plan,
1237 &mut report,
1238 wall_started,
1239 collect_stage_timings,
1240 )
1241 .map(|(surface, _report)| surface)
1242}
1243
1244#[cfg(feature = "cuda-runtime")]
1245fn decode_grayscale_cuda_resident_region_scaled_surface(
1246 decoder: &mut J2kDecoder<'_>,
1247 session: &mut CudaSession,
1248 fmt: PixelFormat,
1249 scaled_roi: Rect,
1250 scaled_dimensions: (u32, u32),
1251) -> Result<Surface, Error> {
1252 let collect_stage_timings = profile::profile_stages_enabled();
1253 let wall_started = profile::profile_now(collect_stage_timings);
1254 let (plan, mut report) = decoder.build_cuda_htj2k_grayscale_region_scaled_plan_with_profile(
1255 fmt,
1256 scaled_roi,
1257 scaled_dimensions,
1258 )?;
1259 decode_grayscale_cuda_resident_surface_with_plan_profile(
1260 session,
1261 fmt,
1262 &plan,
1263 &mut report,
1264 wall_started,
1265 collect_stage_timings,
1266 )
1267 .map(|(surface, _report)| surface)
1268}
1269
1270#[cfg(feature = "cuda-runtime")]
1271fn decode_grayscale_cuda_resident_surface_with_plan_profile(
1272 session: &mut CudaSession,
1273 fmt: PixelFormat,
1274 plan: &CudaHtj2kDecodePlan,
1275 report: &mut CudaHtj2kProfileReport,
1276 wall_started: Option<profile::ProfileInstant>,
1277 collect_stage_timings: bool,
1278) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
1279 let context = session.cuda_context()?;
1280 let table_upload_start = profile::profile_now(collect_stage_timings);
1281 let table_resources = session.htj2k_decode_table_resources()?;
1282 let table_upload_us = profile::elapsed_us(table_upload_start);
1283 report.h2d_us = report.h2d_us.saturating_add(table_upload_us);
1284 report.detail.table_upload_us = report
1285 .detail
1286 .table_upload_us
1287 .saturating_add(table_upload_us);
1288 let pool = session.decode_buffer_pool()?;
1289 let component = decode_cuda_component_plan(
1290 &context,
1291 plan,
1292 &table_resources,
1293 &pool,
1294 collect_stage_timings,
1295 )?;
1296 let input_width = component
1297 .store
1298 .input_rect
1299 .x1
1300 .saturating_sub(component.store.input_rect.x0);
1301 let component_buffer = pooled_cuda_buffer(&component.buffer)?;
1302 let (store_output, store_us) = context
1303 .time_default_stream_named_us_if(
1304 collect_stage_timings,
1305 "j2k.htj2k.decode.store.gray",
1306 || match fmt {
1307 PixelFormat::Gray8 => context.j2k_store_gray8_device(
1308 component_buffer,
1309 CudaJ2kStoreGray8Job {
1310 input_width,
1311 source_x: component.store.source_x,
1312 source_y: component.store.source_y,
1313 copy_width: component.store.copy_width,
1314 copy_height: component.store.copy_height,
1315 output_width: component.store.output_width,
1316 output_height: component.store.output_height,
1317 output_x: component.store.output_x,
1318 output_y: component.store.output_y,
1319 addend: component.store.addend,
1320 bit_depth: u32::from(plan.bit_depth()),
1321 },
1322 ),
1323 PixelFormat::Gray16 => context.j2k_store_gray16_device(
1324 component_buffer,
1325 CudaJ2kStoreGray16Job {
1326 input_width,
1327 source_x: component.store.source_x,
1328 source_y: component.store.source_y,
1329 copy_width: component.store.copy_width,
1330 copy_height: component.store.copy_height,
1331 output_width: component.store.output_width,
1332 output_height: component.store.output_height,
1333 output_x: component.store.output_x,
1334 output_y: component.store.output_y,
1335 addend: component.store.addend,
1336 bit_depth: u32::from(plan.bit_depth()),
1337 },
1338 ),
1339 _ => Err(CudaError::InvalidArgument {
1340 message: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED.to_string(),
1341 }),
1342 },
1343 )
1344 .map_err(cuda_error)?;
1345 let (surface_buffer, store_stats) = store_output.into_parts();
1346 let dispatches = component
1347 .dispatches
1348 .saturating_add(store_stats.kernel_dispatches());
1349 let decode_dispatches = component
1350 .decode_dispatches
1351 .saturating_add(store_stats.decode_kernel_dispatches());
1352 report.dispatch_count = dispatches;
1353 component.timings.add_to_report(report);
1354 report.store_us = report.store_us.saturating_add(store_us);
1355 report.detail.store_dispatch_count = report
1356 .detail
1357 .store_dispatch_count
1358 .saturating_add(store_stats.kernel_dispatches());
1359 report.detail.wall_total_us = profile::elapsed_us(wall_started);
1360 profile::finalize_decode_total_us(report);
1361 report.emit("decode");
1362
1363 let dimensions = (component.store.output_width, component.store.output_height);
1364 let surface = Surface {
1365 backend: BackendKind::Cuda,
1366 residency: SurfaceResidency::CudaResidentDecode,
1367 dimensions,
1368 fmt,
1369 pitch_bytes: dimensions.0 as usize * fmt.bytes_per_pixel(),
1370 stats: CudaSurfaceStats {
1371 total: dispatches,
1372 copy: 0,
1373 decode: decode_dispatches,
1374 },
1375 storage: Storage::Cuda(surface_buffer),
1376 };
1377 Ok((surface, report.clone()))
1378}
1379
1380#[cfg(feature = "cuda-runtime")]
1381fn decode_color_cuda_resident_surface_with_profile(
1382 decoder: &mut J2kDecoder<'_>,
1383 session: &mut CudaSession,
1384 fmt: PixelFormat,
1385 wall_started: Option<profile::ProfileInstant>,
1386 collect_stage_timings: bool,
1387) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
1388 let color = decoder.build_cuda_htj2k_color_plans_with_profile(fmt)?;
1389 decode_color_cuda_resident_surface_with_plans_profile(
1390 session,
1391 fmt,
1392 color,
1393 wall_started,
1394 collect_stage_timings,
1395 )
1396}
1397
1398#[cfg(feature = "cuda-runtime")]
1399fn decode_color_cuda_resident_scaled_surface(
1400 decoder: &mut J2kDecoder<'_>,
1401 session: &mut CudaSession,
1402 fmt: PixelFormat,
1403 output_dimensions: (u32, u32),
1404) -> Result<Surface, Error> {
1405 let collect_stage_timings = profile::profile_stages_enabled();
1406 let wall_started = profile::profile_now(collect_stage_timings);
1407 let color = decoder.build_cuda_htj2k_color_scaled_plans_with_profile(fmt, output_dimensions)?;
1408 decode_color_cuda_resident_surface_with_plans_profile(
1409 session,
1410 fmt,
1411 color,
1412 wall_started,
1413 collect_stage_timings,
1414 )
1415 .map(|(surface, _report)| surface)
1416}
1417
1418#[cfg(feature = "cuda-runtime")]
1419fn decode_color_cuda_resident_region_surface(
1420 decoder: &mut J2kDecoder<'_>,
1421 session: &mut CudaSession,
1422 fmt: PixelFormat,
1423 roi: Rect,
1424) -> Result<Surface, Error> {
1425 let collect_stage_timings = profile::profile_stages_enabled();
1426 let wall_started = profile::profile_now(collect_stage_timings);
1427 let color = decoder.build_cuda_htj2k_color_region_plans_with_profile(fmt, roi)?;
1428 decode_color_cuda_resident_surface_with_plans_profile(
1429 session,
1430 fmt,
1431 color,
1432 wall_started,
1433 collect_stage_timings,
1434 )
1435 .map(|(surface, _report)| surface)
1436}
1437
1438#[cfg(feature = "cuda-runtime")]
1439fn decode_color_cuda_resident_region_scaled_surface(
1440 decoder: &mut J2kDecoder<'_>,
1441 session: &mut CudaSession,
1442 fmt: PixelFormat,
1443 scaled_roi: Rect,
1444 scaled_dimensions: (u32, u32),
1445) -> Result<Surface, Error> {
1446 let collect_stage_timings = profile::profile_stages_enabled();
1447 let wall_started = profile::profile_now(collect_stage_timings);
1448 let color = decoder.build_cuda_htj2k_color_region_scaled_plans_with_profile(
1449 fmt,
1450 scaled_roi,
1451 scaled_dimensions,
1452 )?;
1453 decode_color_cuda_resident_surface_with_plans_profile(
1454 session,
1455 fmt,
1456 color,
1457 wall_started,
1458 collect_stage_timings,
1459 )
1460 .map(|(surface, _report)| surface)
1461}
1462
1463#[cfg(feature = "cuda-runtime")]
1464fn decode_color_cuda_resident_surface_with_plans_profile(
1465 session: &mut CudaSession,
1466 fmt: PixelFormat,
1467 mut color: CudaHtj2kColorDecodePlans,
1468 wall_started: Option<profile::ProfileInstant>,
1469 collect_stage_timings: bool,
1470) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
1471 if color.components.len() != 3 {
1472 return Err(Error::UnsupportedCudaRequest {
1473 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1474 });
1475 }
1476 let context = session.cuda_context()?;
1477 let pool = session.decode_buffer_pool()?;
1478 let table_upload_start = profile::profile_now(collect_stage_timings);
1479 let table_resources = session.htj2k_decode_table_resources()?;
1480 let table_upload_us = profile::elapsed_us(table_upload_start);
1481 color.report.h2d_us = color.report.h2d_us.saturating_add(table_upload_us);
1482 color.report.detail.table_upload_us = color
1483 .report
1484 .detail
1485 .table_upload_us
1486 .saturating_add(table_upload_us);
1487 let payload_upload_start = profile::profile_now(collect_stage_timings);
1488 let decode_resources = context
1489 .upload_htj2k_decode_resources_with_tables(&color.payload, &table_resources)
1490 .map_err(cuda_error)?;
1491 let payload_upload_us = profile::elapsed_us(payload_upload_start);
1492 profile::add_payload_resource_upload_us(&mut color.report, payload_upload_us);
1493 let mut component_work = Vec::with_capacity(3);
1494 for plan in &color.components {
1495 component_work.push(decode_cuda_component_subbands_with_resources(
1496 &context,
1497 plan,
1498 &pool,
1499 collect_stage_timings,
1500 )?);
1501 }
1502 run_component_cleanup_dequant_batches(
1503 &context,
1504 &decode_resources,
1505 &mut component_work,
1506 &pool,
1507 collect_stage_timings,
1508 )?;
1509 finish_color_cuda_resident_surface_with_component_work(
1510 &context,
1511 &pool,
1512 fmt,
1513 color,
1514 component_work,
1515 wall_started,
1516 collect_stage_timings,
1517 true,
1518 true,
1519 )
1520}
1521
1522#[cfg(feature = "cuda-runtime")]
1523fn decode_color_cuda_resident_batch_surfaces_with_profile(
1524 inputs: &[&[u8]],
1525 session: &mut CudaSession,
1526 fmt: PixelFormat,
1527 collect_stage_timings: bool,
1528) -> Result<(Vec<Surface>, CudaHtj2kProfileReport), Error> {
1529 let batch_wall_started = profile::profile_now(collect_stage_timings);
1530 let mut colors = Vec::with_capacity(inputs.len());
1531 let mut shared_payload = Vec::new();
1532 let mut native_context = NativeDecoderContext::default();
1533 for input in inputs {
1534 let mut color =
1535 build_cuda_htj2k_color_plans_from_bytes_with_profile(input, fmt, &mut native_context)?;
1536 if color.components.len() != 3 {
1537 return Err(Error::UnsupportedCudaRequest {
1538 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1539 });
1540 }
1541 append_color_payload_to_shared(&mut color, &mut shared_payload)?;
1542 colors.push(color);
1543 }
1544
1545 let context = session.cuda_context()?;
1546 let pool = session.decode_batch_buffer_pool()?;
1547 let table_upload_start = profile::profile_now(collect_stage_timings);
1548 let table_resources = session.htj2k_decode_table_resources()?;
1549 let table_upload_us = profile::elapsed_us(table_upload_start);
1550 let payload_upload_start = profile::profile_now(collect_stage_timings);
1551 let decode_resources = context
1552 .upload_htj2k_decode_resources_with_tables_and_pool(
1553 &shared_payload,
1554 &table_resources,
1555 &pool,
1556 )
1557 .map_err(cuda_error)?;
1558 let payload_upload_us = profile::elapsed_us(payload_upload_start);
1559
1560 let component_count = colors
1561 .iter()
1562 .map(|color| color.components.len())
1563 .sum::<usize>();
1564 let mut all_component_work = Vec::with_capacity(component_count);
1565 for color in &colors {
1566 for plan in &color.components {
1567 all_component_work.push(decode_cuda_component_subbands_with_resources(
1568 &context,
1569 plan,
1570 &pool,
1571 collect_stage_timings,
1572 )?);
1573 }
1574 }
1575 run_component_cleanup_dequant_batches(
1576 &context,
1577 &decode_resources,
1578 &mut all_component_work,
1579 &pool,
1580 collect_stage_timings,
1581 )?;
1582 let batch_components = colors
1583 .iter()
1584 .flat_map(|color| color.components.iter())
1585 .collect::<Vec<_>>();
1586 let idwt_batched = can_batch_color_idwt(&batch_components);
1587 let pending_idwt_batch = if idwt_batched {
1588 run_color_component_idwt_batches(
1589 &context,
1590 &batch_components,
1591 &mut all_component_work,
1592 &pool,
1593 collect_stage_timings,
1594 )?
1595 } else {
1596 None
1597 };
1598 drop(batch_components);
1599
1600 let can_use_batch_store =
1601 idwt_batched && can_batch_rgb8_mct_color_store(fmt, &colors, &all_component_work)?;
1602 let (surfaces, reports) = if can_use_batch_store {
1603 finish_color_cuda_resident_batch_surfaces_with_rgb8_mct_store(
1604 &context,
1605 fmt,
1606 colors,
1607 all_component_work,
1608 collect_stage_timings,
1609 )?
1610 } else {
1611 let mut surfaces = Vec::with_capacity(colors.len());
1612 let mut reports = Vec::with_capacity(colors.len());
1613 let mut work_iter = all_component_work.into_iter();
1614 for color in colors {
1615 let component_count = color.components.len();
1616 let component_work = work_iter.by_ref().take(component_count).collect::<Vec<_>>();
1617 if component_work.len() != component_count {
1618 return Err(Error::UnsupportedCudaRequest {
1619 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1620 });
1621 }
1622 let (surface, report) = finish_color_cuda_resident_surface_with_component_work(
1623 &context,
1624 &pool,
1625 fmt,
1626 color,
1627 component_work,
1628 None,
1629 collect_stage_timings,
1630 !idwt_batched,
1631 false,
1632 )?;
1633 surfaces.push(surface);
1634 reports.push(report);
1635 }
1636 (surfaces, reports)
1637 };
1638 drop(pending_idwt_batch);
1639
1640 let aggregate = finalize_color_batch_decode_report(
1641 &reports,
1642 table_upload_us,
1643 payload_upload_us,
1644 batch_wall_started,
1645 );
1646 aggregate.emit("decode_batch");
1647
1648 Ok((surfaces, aggregate))
1649}
1650
1651#[cfg(feature = "cuda-runtime")]
1652fn build_cuda_htj2k_color_plans_from_bytes_with_profile<'a>(
1653 input: &'a [u8],
1654 fmt: PixelFormat,
1655 native_context: &mut NativeDecoderContext<'a>,
1656) -> Result<CudaHtj2kColorDecodePlans, Error> {
1657 let total_start = profile::profile_now(true);
1658
1659 let parse_start = profile::profile_now(true);
1660 let image = NativeImage::new(input, &DecodeSettings::default())
1661 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
1662 let parse_us = profile::elapsed_us(parse_start);
1663
1664 let plan_start = profile::profile_now(true);
1665 let native_plan = image
1666 .build_direct_color_plan_with_context(native_context)
1667 .map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
1668 let plan_us = profile::elapsed_us(plan_start);
1669
1670 let flatten_start = profile::profile_now(true);
1671 let mut payload = Vec::new();
1672 let mut components = Vec::with_capacity(native_plan.component_plans.len());
1673 for component_plan in &native_plan.component_plans {
1674 let mut component =
1675 CudaHtj2kDecodePlan::from_grayscale_direct_plan(component_plan, fmt, (0, 0))?;
1676 component.append_payload_to_shared(&mut payload)?;
1677 components.push(component);
1678 }
1679 let flatten_us = profile::elapsed_us(flatten_start);
1680 let block_count = components
1681 .iter()
1682 .map(|plan| plan.code_blocks().len())
1683 .sum::<usize>();
1684 let payload_bytes = payload.len();
1685 let report = CudaHtj2kProfileReport {
1686 parse_us,
1687 plan_us,
1688 flatten_us,
1689 total_us: profile::elapsed_us(total_start),
1690 block_count,
1691 payload_bytes,
1692 dispatch_count: 0,
1693 residency: crate::SurfaceResidency::CudaResidentDecode,
1694 detail: CudaHtj2kDecodeProfileDetail::default(),
1695 ..CudaHtj2kProfileReport::default()
1696 };
1697 report.emit("plan");
1698
1699 Ok(CudaHtj2kColorDecodePlans {
1700 dimensions: native_plan.dimensions,
1701 mct_dimensions: native_plan.dimensions,
1702 bit_depths: native_plan.bit_depths,
1703 mct: native_plan.mct,
1704 transform: CudaHtj2kTransform::from_native(native_plan.transform),
1705 payload,
1706 components,
1707 report,
1708 })
1709}
1710
1711#[cfg(feature = "cuda-runtime")]
1712fn finalize_color_batch_decode_report(
1713 reports: &[CudaHtj2kProfileReport],
1714 table_upload_us: u128,
1715 payload_upload_us: u128,
1716 batch_wall_started: Option<profile::ProfileInstant>,
1717) -> CudaHtj2kProfileReport {
1718 let mut aggregate = aggregate_decode_reports(reports);
1719 aggregate.h2d_us = aggregate
1720 .h2d_us
1721 .saturating_add(table_upload_us)
1722 .saturating_add(payload_upload_us);
1723 aggregate.detail.table_upload_us = aggregate
1724 .detail
1725 .table_upload_us
1726 .saturating_add(table_upload_us);
1727 aggregate.detail.payload_upload_us = aggregate
1728 .detail
1729 .payload_upload_us
1730 .saturating_add(payload_upload_us);
1731 aggregate.detail.wall_total_us = profile::elapsed_us(batch_wall_started);
1732 profile::finalize_decode_total_us(&mut aggregate);
1733 aggregate
1734}
1735
1736#[cfg(feature = "cuda-runtime")]
1737fn can_batch_rgb8_mct_color_store(
1738 fmt: PixelFormat,
1739 colors: &[CudaHtj2kColorDecodePlans],
1740 all_component_work: &[CudaComponentDecodeWork],
1741) -> Result<bool, Error> {
1742 if !matches!(fmt, PixelFormat::Rgb8 | PixelFormat::Rgba8) {
1743 return Ok(false);
1744 }
1745
1746 let mut offset = 0usize;
1747 for color in colors {
1748 let component_count = color.components.len();
1749 if component_count != 3 || offset.saturating_add(component_count) > all_component_work.len()
1750 {
1751 return Err(Error::UnsupportedCudaRequest {
1752 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1753 });
1754 }
1755 if !color.mct {
1756 return Ok(false);
1757 }
1758 let component_work = &all_component_work[offset..offset + component_count];
1759 let stores = [
1760 &component_work[0].store,
1761 &component_work[1].store,
1762 &component_work[2].store,
1763 ];
1764 validate_color_stores(stores, color.dimensions)?;
1765 if !can_fuse_mct_store_for_stores(stores) {
1766 return Ok(false);
1767 }
1768 offset = offset.saturating_add(component_count);
1769 }
1770
1771 if offset != all_component_work.len() {
1772 return Err(Error::UnsupportedCudaRequest {
1773 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1774 });
1775 }
1776 Ok(!colors.is_empty())
1777}
1778
1779#[cfg(feature = "cuda-runtime")]
1780fn finish_color_cuda_resident_batch_surfaces_with_rgb8_mct_store(
1781 context: &j2k_cuda_runtime::CudaContext,
1782 fmt: PixelFormat,
1783 colors: Vec<CudaHtj2kColorDecodePlans>,
1784 all_component_work: Vec<CudaComponentDecodeWork>,
1785 collect_stage_timings: bool,
1786) -> Result<(Vec<Surface>, Vec<CudaHtj2kProfileReport>), Error> {
1787 let mut prepared = Vec::with_capacity(colors.len());
1788 let mut work_iter = all_component_work.into_iter();
1789 for color in colors {
1790 let component_count = color.components.len();
1791 let component_work = work_iter.by_ref().take(component_count).collect::<Vec<_>>();
1792 if component_work.len() != component_count {
1793 return Err(Error::UnsupportedCudaRequest {
1794 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1795 });
1796 }
1797 prepared.push(prepare_rgb8_mct_batch_store(fmt, color, component_work)?);
1798 }
1799 if work_iter.next().is_some() {
1800 return Err(Error::UnsupportedCudaRequest {
1801 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1802 });
1803 }
1804
1805 let targets = prepared
1806 .iter()
1807 .map(rgb8_mct_batch_store_target)
1808 .collect::<Result<Vec<_>, Error>>()?;
1809 let (store_output, store_us) = context
1810 .time_default_stream_named_us_if(
1811 collect_stage_timings,
1812 "j2k.htj2k.decode.store.color.batch",
1813 || context.j2k_store_rgb8_mct_batch_contiguous_device(&targets),
1814 )
1815 .map_err(cuda_error)?;
1816 drop(targets);
1817 let (surface_buffer, surface_ranges, store_stats) = store_output.into_parts();
1818 if surface_ranges.len() != prepared.len() {
1819 return Err(Error::UnsupportedCudaRequest {
1820 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1821 });
1822 }
1823 let shared_surface_buffer = Arc::new(surface_buffer);
1824
1825 let mut surfaces = Vec::with_capacity(prepared.len());
1826 let mut reports = Vec::with_capacity(prepared.len());
1827 let store_dispatches = store_stats.kernel_dispatches();
1828 let store_decode_dispatches = store_stats.decode_kernel_dispatches();
1829 for (index, (mut prepared, surface_range)) in
1830 prepared.into_iter().zip(surface_ranges).enumerate()
1831 {
1832 let report_store_dispatches = if index == 0 { store_dispatches } else { 0 };
1833 let report_store_decode_dispatches = if index == 0 {
1834 store_decode_dispatches
1835 } else {
1836 0
1837 };
1838 let report_store_us = if index == 0 { store_us } else { 0 };
1839 let dispatches = prepared.dispatches.saturating_add(report_store_dispatches);
1840 let decode_dispatches = prepared
1841 .decode_dispatches
1842 .saturating_add(report_store_decode_dispatches);
1843 prepared.color.report.dispatch_count = dispatches;
1844 prepared.color.report.store_us = prepared
1845 .color
1846 .report
1847 .store_us
1848 .saturating_add(report_store_us);
1849 prepared.color.report.detail.store_dispatch_count = prepared
1850 .color
1851 .report
1852 .detail
1853 .store_dispatch_count
1854 .saturating_add(report_store_dispatches);
1855 profile::finalize_decode_total_us(&mut prepared.color.report);
1856
1857 let dimensions = prepared.color.dimensions;
1858 surfaces.push(Surface {
1859 backend: BackendKind::Cuda,
1860 residency: SurfaceResidency::CudaResidentDecode,
1861 dimensions,
1862 fmt,
1863 pitch_bytes: dimensions.0 as usize * fmt.bytes_per_pixel(),
1864 stats: CudaSurfaceStats {
1865 total: dispatches,
1866 copy: 0,
1867 decode: decode_dispatches,
1868 },
1869 storage: cuda_range_storage(
1870 shared_surface_buffer.clone(),
1871 surface_range.offset,
1872 surface_range.len,
1873 ),
1874 });
1875 reports.push(prepared.color.report);
1876 }
1877
1878 Ok((surfaces, reports))
1879}
1880
1881#[cfg(feature = "cuda-runtime")]
1882fn prepare_rgb8_mct_batch_store(
1883 fmt: PixelFormat,
1884 mut color: CudaHtj2kColorDecodePlans,
1885 component_work: Vec<CudaComponentDecodeWork>,
1886) -> Result<CudaPreparedRgb8MctBatchStore, Error> {
1887 let decoded_components = component_work
1888 .into_iter()
1889 .map(finish_cuda_component_decode)
1890 .collect::<Result<Vec<_>, Error>>()?;
1891 let [component0, component1, component2] = decoded_components.as_slice() else {
1892 return Err(Error::UnsupportedCudaRequest {
1893 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1894 });
1895 };
1896 let stores = [&component0.store, &component1.store, &component2.store];
1897 validate_color_stores(stores, color.dimensions)?;
1898 if !color.mct || !can_fuse_mct_store_for_stores(stores) {
1899 return Err(Error::UnsupportedCudaRequest {
1900 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1901 });
1902 }
1903
1904 let dispatches = decoded_components
1905 .iter()
1906 .map(|component| component.dispatches)
1907 .sum::<usize>();
1908 let decode_dispatches = decoded_components
1909 .iter()
1910 .map(|component| component.decode_dispatches)
1911 .sum::<usize>();
1912 for component in &decoded_components {
1913 component.timings.add_to_report(&mut color.report);
1914 }
1915
1916 let addends = [
1917 bit_depth_addend(color.bit_depths[0]),
1918 bit_depth_addend(color.bit_depths[1]),
1919 bit_depth_addend(color.bit_depths[2]),
1920 ];
1921 let job = CudaJ2kStoreRgb8MctJob {
1922 store: CudaJ2kStoreRgb8Job {
1923 input_width0: color_store_input_width(&component0.store),
1924 input_width1: color_store_input_width(&component1.store),
1925 input_width2: color_store_input_width(&component2.store),
1926 source_x0: component0.store.source_x,
1927 source_y0: component0.store.source_y,
1928 source_x1: component1.store.source_x,
1929 source_y1: component1.store.source_y,
1930 source_x2: component2.store.source_x,
1931 source_y2: component2.store.source_y,
1932 copy_width: component0.store.copy_width,
1933 copy_height: component0.store.copy_height,
1934 output_width: component0.store.output_width,
1935 output_height: component0.store.output_height,
1936 output_x: component0.store.output_x,
1937 output_y: component0.store.output_y,
1938 addend0: addends[0],
1939 addend1: addends[1],
1940 addend2: addends[2],
1941 bit_depth0: u32::from(color.bit_depths[0]),
1942 bit_depth1: u32::from(color.bit_depths[1]),
1943 bit_depth2: u32::from(color.bit_depths[2]),
1944 rgba: u32::from(fmt == PixelFormat::Rgba8),
1945 },
1946 irreversible97: u32::from(color.transform == CudaHtj2kTransform::Irreversible97),
1947 };
1948
1949 Ok(CudaPreparedRgb8MctBatchStore {
1950 color,
1951 decoded_components,
1952 dispatches,
1953 decode_dispatches,
1954 job,
1955 })
1956}
1957
1958#[cfg(feature = "cuda-runtime")]
1959fn rgb8_mct_batch_store_target(
1960 prepared: &CudaPreparedRgb8MctBatchStore,
1961) -> Result<CudaJ2kStoreRgb8MctTarget<'_>, Error> {
1962 let [component0, component1, component2] = prepared.decoded_components.as_slice() else {
1963 return Err(Error::UnsupportedCudaRequest {
1964 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
1965 });
1966 };
1967 Ok(CudaJ2kStoreRgb8MctTarget {
1968 plane0: pooled_cuda_buffer(&component0.buffer)?,
1969 plane1: pooled_cuda_buffer(&component1.buffer)?,
1970 plane2: pooled_cuda_buffer(&component2.buffer)?,
1971 job: prepared.job,
1972 })
1973}
1974
1975#[cfg(feature = "cuda-runtime")]
1976fn can_fuse_mct_store_for_stores(stores: [&CudaHtj2kStoreStep; 3]) -> bool {
1977 let input_width0 = color_store_input_width(stores[0]);
1978 let input_width1 = color_store_input_width(stores[1]);
1979 let input_width2 = color_store_input_width(stores[2]);
1980 input_width0 == input_width1
1981 && input_width0 == input_width2
1982 && stores[0].source_x == stores[1].source_x
1983 && stores[0].source_x == stores[2].source_x
1984 && stores[0].source_y == stores[1].source_y
1985 && stores[0].source_y == stores[2].source_y
1986}
1987
1988#[cfg(feature = "cuda-runtime")]
1989fn color_store_input_width(store: &CudaHtj2kStoreStep) -> u32 {
1990 store.input_rect.x1.saturating_sub(store.input_rect.x0)
1991}
1992
1993#[cfg(feature = "cuda-runtime")]
1994#[allow(clippy::too_many_arguments)]
1995fn finish_color_cuda_resident_surface_with_component_work(
1996 context: &j2k_cuda_runtime::CudaContext,
1997 pool: &CudaBufferPool,
1998 fmt: PixelFormat,
1999 mut color: CudaHtj2kColorDecodePlans,
2000 mut component_work: Vec<CudaComponentDecodeWork>,
2001 wall_started: Option<profile::ProfileInstant>,
2002 collect_stage_timings: bool,
2003 run_idwt: bool,
2004 emit_report: bool,
2005) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
2006 let pending_idwt_batch = if run_idwt {
2007 let batch_components = color.components.iter().collect::<Vec<_>>();
2008 if can_batch_color_idwt(&batch_components) {
2009 run_color_component_idwt_batches(
2010 context,
2011 &batch_components,
2012 &mut component_work,
2013 pool,
2014 collect_stage_timings,
2015 )?
2016 } else {
2017 for (plan, work) in color.components.iter().zip(component_work.iter_mut()) {
2018 run_cuda_component_idwt_steps(
2019 context,
2020 plan.idwt_steps(),
2021 work,
2022 pool,
2023 collect_stage_timings,
2024 )?;
2025 }
2026 None
2027 }
2028 } else {
2029 None
2030 };
2031 let decoded_components = component_work
2032 .into_iter()
2033 .map(finish_cuda_component_decode)
2034 .collect::<Result<Vec<_>, Error>>()?;
2035 let [component0, component1, component2] = decoded_components.as_slice() else {
2036 return Err(Error::UnsupportedCudaRequest {
2037 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
2038 });
2039 };
2040 validate_color_stores(
2041 [&component0.store, &component1.store, &component2.store],
2042 color.dimensions,
2043 )?;
2044
2045 let mut dispatches = decoded_components
2046 .iter()
2047 .map(|component| component.dispatches)
2048 .sum::<usize>();
2049 let mut decode_dispatches = decoded_components
2050 .iter()
2051 .map(|component| component.decode_dispatches)
2052 .sum::<usize>();
2053 for component in &decoded_components {
2054 component.timings.add_to_report(&mut color.report);
2055 }
2056 let component0_buffer = pooled_cuda_buffer(&component0.buffer)?;
2057 let component1_buffer = pooled_cuda_buffer(&component1.buffer)?;
2058 let component2_buffer = pooled_cuda_buffer(&component2.buffer)?;
2059 let input_width0 = component0
2060 .store
2061 .input_rect
2062 .x1
2063 .saturating_sub(component0.store.input_rect.x0);
2064 let input_width1 = component1
2065 .store
2066 .input_rect
2067 .x1
2068 .saturating_sub(component1.store.input_rect.x0);
2069 let input_width2 = component2
2070 .store
2071 .input_rect
2072 .x1
2073 .saturating_sub(component2.store.input_rect.x0);
2074 let irreversible97 = u32::from(color.transform == CudaHtj2kTransform::Irreversible97);
2075 let mct_store_addends = [
2076 bit_depth_addend(color.bit_depths[0]),
2077 bit_depth_addend(color.bit_depths[1]),
2078 bit_depth_addend(color.bit_depths[2]),
2079 ];
2080 let can_fuse_mct_store = color.mct
2081 && input_width0 == input_width1
2082 && input_width0 == input_width2
2083 && component0.store.source_x == component1.store.source_x
2084 && component0.store.source_x == component2.store.source_x
2085 && component0.store.source_y == component1.store.source_y
2086 && component0.store.source_y == component2.store.source_y;
2087 let addends = if color.mct && can_fuse_mct_store {
2088 mct_store_addends
2089 } else if color.mct {
2090 let mct_len = u32::try_from(checked_area(
2091 color.mct_dimensions.0,
2092 color.mct_dimensions.1,
2093 )?)
2094 .map_err(|_| Error::UnsupportedCudaRequest {
2095 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
2096 })?;
2097 let stats = context
2098 .time_default_stream_named_us_if(collect_stage_timings, "j2k.htj2k.decode.mct", || {
2099 context.j2k_inverse_mct_device(
2100 component0_buffer,
2101 component1_buffer,
2102 component2_buffer,
2103 CudaJ2kInverseMctJob {
2104 len: mct_len,
2105 irreversible97,
2106 addend0: mct_store_addends[0],
2107 addend1: mct_store_addends[1],
2108 addend2: mct_store_addends[2],
2109 },
2110 )
2111 })
2112 .map_err(cuda_error)?;
2113 let (stats, mct_us) = stats;
2114 dispatches = dispatches.saturating_add(stats.kernel_dispatches());
2115 decode_dispatches = decode_dispatches.saturating_add(stats.decode_kernel_dispatches());
2116 color.report.mct_us = color.report.mct_us.saturating_add(mct_us);
2117 color.report.detail.mct_dispatch_count = color
2118 .report
2119 .detail
2120 .mct_dispatch_count
2121 .saturating_add(stats.kernel_dispatches());
2122 [0.0, 0.0, 0.0]
2123 } else {
2124 [
2125 component0.store.addend,
2126 component1.store.addend,
2127 component2.store.addend,
2128 ]
2129 };
2130 let (store_output, store_us) = context
2131 .time_default_stream_named_us_if(
2132 collect_stage_timings,
2133 "j2k.htj2k.decode.store.color",
2134 || match fmt {
2135 PixelFormat::Rgb8 | PixelFormat::Rgba8 => {
2136 let store_job = CudaJ2kStoreRgb8Job {
2137 input_width0,
2138 input_width1,
2139 input_width2,
2140 source_x0: component0.store.source_x,
2141 source_y0: component0.store.source_y,
2142 source_x1: component1.store.source_x,
2143 source_y1: component1.store.source_y,
2144 source_x2: component2.store.source_x,
2145 source_y2: component2.store.source_y,
2146 copy_width: component0.store.copy_width,
2147 copy_height: component0.store.copy_height,
2148 output_width: component0.store.output_width,
2149 output_height: component0.store.output_height,
2150 output_x: component0.store.output_x,
2151 output_y: component0.store.output_y,
2152 addend0: addends[0],
2153 addend1: addends[1],
2154 addend2: addends[2],
2155 bit_depth0: u32::from(color.bit_depths[0]),
2156 bit_depth1: u32::from(color.bit_depths[1]),
2157 bit_depth2: u32::from(color.bit_depths[2]),
2158 rgba: u32::from(fmt == PixelFormat::Rgba8),
2159 };
2160 if can_fuse_mct_store {
2161 context.j2k_store_rgb8_mct_device(
2162 component0_buffer,
2163 component1_buffer,
2164 component2_buffer,
2165 CudaJ2kStoreRgb8MctJob {
2166 store: store_job,
2167 irreversible97,
2168 },
2169 )
2170 } else {
2171 context.j2k_store_rgb8_device(
2172 component0_buffer,
2173 component1_buffer,
2174 component2_buffer,
2175 store_job,
2176 )
2177 }
2178 }
2179 PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
2180 let store_job = CudaJ2kStoreRgb16Job {
2181 input_width0,
2182 input_width1,
2183 input_width2,
2184 source_x0: component0.store.source_x,
2185 source_y0: component0.store.source_y,
2186 source_x1: component1.store.source_x,
2187 source_y1: component1.store.source_y,
2188 source_x2: component2.store.source_x,
2189 source_y2: component2.store.source_y,
2190 copy_width: component0.store.copy_width,
2191 copy_height: component0.store.copy_height,
2192 output_width: component0.store.output_width,
2193 output_height: component0.store.output_height,
2194 output_x: component0.store.output_x,
2195 output_y: component0.store.output_y,
2196 addend0: addends[0],
2197 addend1: addends[1],
2198 addend2: addends[2],
2199 bit_depth0: u32::from(color.bit_depths[0]),
2200 bit_depth1: u32::from(color.bit_depths[1]),
2201 bit_depth2: u32::from(color.bit_depths[2]),
2202 rgba: u32::from(fmt == PixelFormat::Rgba16),
2203 };
2204 if can_fuse_mct_store {
2205 context.j2k_store_rgb16_mct_device(
2206 component0_buffer,
2207 component1_buffer,
2208 component2_buffer,
2209 CudaJ2kStoreRgb16MctJob {
2210 store: store_job,
2211 irreversible97,
2212 },
2213 )
2214 } else {
2215 context.j2k_store_rgb16_device(
2216 component0_buffer,
2217 component1_buffer,
2218 component2_buffer,
2219 store_job,
2220 )
2221 }
2222 }
2223 _ => Err(CudaError::InvalidArgument {
2224 message: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED.to_string(),
2225 }),
2226 },
2227 )
2228 .map_err(cuda_error)?;
2229 drop(pending_idwt_batch);
2230 let (surface_buffer, store_stats) = store_output.into_parts();
2231 dispatches = dispatches.saturating_add(store_stats.kernel_dispatches());
2232 decode_dispatches = decode_dispatches.saturating_add(store_stats.decode_kernel_dispatches());
2233 color.report.dispatch_count = dispatches;
2234 color.report.store_us = color.report.store_us.saturating_add(store_us);
2235 color.report.detail.store_dispatch_count = color
2236 .report
2237 .detail
2238 .store_dispatch_count
2239 .saturating_add(store_stats.kernel_dispatches());
2240 color.report.detail.wall_total_us = profile::elapsed_us(wall_started);
2241 profile::finalize_decode_total_us(&mut color.report);
2242 if emit_report {
2243 color.report.emit("decode");
2244 }
2245
2246 let surface = Surface {
2247 backend: BackendKind::Cuda,
2248 residency: SurfaceResidency::CudaResidentDecode,
2249 dimensions: color.dimensions,
2250 fmt,
2251 pitch_bytes: color.dimensions.0 as usize * fmt.bytes_per_pixel(),
2252 stats: CudaSurfaceStats {
2253 total: dispatches,
2254 copy: 0,
2255 decode: decode_dispatches,
2256 },
2257 storage: Storage::Cuda(surface_buffer),
2258 };
2259 Ok((surface, color.report))
2260}
2261
2262#[cfg(feature = "cuda-runtime")]
2263fn append_color_payload_to_shared(
2264 color: &mut CudaHtj2kColorDecodePlans,
2265 shared_payload: &mut Vec<u8>,
2266) -> Result<(), Error> {
2267 let base = u64::try_from(shared_payload.len()).map_err(|_| Error::UnsupportedCudaRequest {
2268 reason: CUDA_HTJ2K_BATCH_PAYLOAD_TOO_LARGE,
2269 })?;
2270 shared_payload
2271 .try_reserve(color.payload.len())
2272 .map_err(|_| Error::UnsupportedCudaRequest {
2273 reason: CUDA_HTJ2K_BATCH_PAYLOAD_TOO_LARGE,
2274 })?;
2275 for component in &mut color.components {
2276 component.rebase_payload_offsets(base)?;
2277 }
2278 shared_payload.append(&mut color.payload);
2279 Ok(())
2280}
2281
2282#[cfg(feature = "cuda-runtime")]
2283fn aggregate_decode_reports(reports: &[CudaHtj2kProfileReport]) -> CudaHtj2kProfileReport {
2284 let mut aggregate = CudaHtj2kProfileReport {
2285 residency: SurfaceResidency::CudaResidentDecode,
2286 ..CudaHtj2kProfileReport::default()
2287 };
2288 for report in reports {
2289 add_decode_report(&mut aggregate, report);
2290 }
2291 aggregate
2292}
2293
2294#[cfg(feature = "cuda-runtime")]
2295fn add_decode_report(aggregate: &mut CudaHtj2kProfileReport, report: &CudaHtj2kProfileReport) {
2296 aggregate.parse_us = aggregate.parse_us.saturating_add(report.parse_us);
2297 aggregate.plan_us = aggregate.plan_us.saturating_add(report.plan_us);
2298 aggregate.flatten_us = aggregate.flatten_us.saturating_add(report.flatten_us);
2299 aggregate.h2d_us = aggregate.h2d_us.saturating_add(report.h2d_us);
2300 aggregate.ht_cleanup_us = aggregate.ht_cleanup_us.saturating_add(report.ht_cleanup_us);
2301 aggregate.ht_refine_us = aggregate.ht_refine_us.saturating_add(report.ht_refine_us);
2302 aggregate.dequant_us = aggregate.dequant_us.saturating_add(report.dequant_us);
2303 aggregate.idwt_us = aggregate.idwt_us.saturating_add(report.idwt_us);
2304 aggregate.mct_us = aggregate.mct_us.saturating_add(report.mct_us);
2305 aggregate.store_us = aggregate.store_us.saturating_add(report.store_us);
2306 aggregate.block_count = aggregate.block_count.saturating_add(report.block_count);
2307 aggregate.payload_bytes = aggregate.payload_bytes.saturating_add(report.payload_bytes);
2308 aggregate.dispatch_count = aggregate
2309 .dispatch_count
2310 .saturating_add(report.dispatch_count);
2311 aggregate.detail.table_upload_us = aggregate
2312 .detail
2313 .table_upload_us
2314 .saturating_add(report.detail.table_upload_us);
2315 aggregate.detail.payload_upload_us = aggregate
2316 .detail
2317 .payload_upload_us
2318 .saturating_add(report.detail.payload_upload_us);
2319 aggregate.detail.job_upload_us = aggregate
2320 .detail
2321 .job_upload_us
2322 .saturating_add(report.detail.job_upload_us);
2323 aggregate.detail.status_d2h_us = aggregate
2324 .detail
2325 .status_d2h_us
2326 .saturating_add(report.detail.status_d2h_us);
2327 aggregate.detail.output_d2h_us = aggregate
2328 .detail
2329 .output_d2h_us
2330 .saturating_add(report.detail.output_d2h_us);
2331 aggregate.detail.ht_dispatch_count = aggregate
2332 .detail
2333 .ht_dispatch_count
2334 .saturating_add(report.detail.ht_dispatch_count);
2335 aggregate.detail.dequant_dispatch_count = aggregate
2336 .detail
2337 .dequant_dispatch_count
2338 .saturating_add(report.detail.dequant_dispatch_count);
2339 aggregate.detail.idwt_dispatch_count = aggregate
2340 .detail
2341 .idwt_dispatch_count
2342 .saturating_add(report.detail.idwt_dispatch_count);
2343 aggregate.detail.mct_dispatch_count = aggregate
2344 .detail
2345 .mct_dispatch_count
2346 .saturating_add(report.detail.mct_dispatch_count);
2347 aggregate.detail.store_dispatch_count = aggregate
2348 .detail
2349 .store_dispatch_count
2350 .saturating_add(report.detail.store_dispatch_count);
2351}
2352
2353#[cfg(not(feature = "cuda-runtime"))]
2354fn decode_to_cuda_resident_surface_impl(
2355 _decoder: &mut J2kDecoder<'_>,
2356 _session: &mut CudaSession,
2357 _fmt: PixelFormat,
2358) -> Result<Surface, Error> {
2359 Err(Error::CudaUnavailable)
2360}
2361
2362#[cfg(not(feature = "cuda-runtime"))]
2363fn decode_to_cuda_resident_surface_with_profile_impl(
2364 _decoder: &mut J2kDecoder<'_>,
2365 _session: &mut CudaSession,
2366 _fmt: PixelFormat,
2367) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
2368 Err(Error::CudaUnavailable)
2369}
2370
2371#[cfg(not(feature = "cuda-runtime"))]
2372fn decode_region_to_cuda_resident_surface_impl(
2373 _decoder: &mut J2kDecoder<'_>,
2374 _session: &mut CudaSession,
2375 _fmt: PixelFormat,
2376 _roi: Rect,
2377) -> Result<Surface, Error> {
2378 Err(Error::CudaUnavailable)
2379}
2380
2381#[cfg(not(feature = "cuda-runtime"))]
2382fn decode_scaled_to_cuda_resident_surface_impl(
2383 _decoder: &mut J2kDecoder<'_>,
2384 _session: &mut CudaSession,
2385 _fmt: PixelFormat,
2386 _scale: Downscale,
2387) -> Result<Surface, Error> {
2388 Err(Error::CudaUnavailable)
2389}
2390
2391#[cfg(not(feature = "cuda-runtime"))]
2392fn decode_region_scaled_to_cuda_resident_surface_impl(
2393 _decoder: &mut J2kDecoder<'_>,
2394 _session: &mut CudaSession,
2395 _fmt: PixelFormat,
2396 _roi: Rect,
2397 _scale: Downscale,
2398) -> Result<Surface, Error> {
2399 Err(Error::CudaUnavailable)
2400}
2401
2402#[cfg(not(feature = "cuda-runtime"))]
2403fn decode_batch_to_cuda_resident_surface_with_profile_control(
2404 _inputs: &[&[u8]],
2405 _session: &mut CudaSession,
2406 _fmt: PixelFormat,
2407 _collect_stage_timings: bool,
2408) -> Result<(Vec<Surface>, CudaHtj2kProfileReport), Error> {
2409 Err(Error::CudaUnavailable)
2410}
2411
2412#[cfg(feature = "cuda-runtime")]
2413fn decode_cuda_component_plan(
2414 context: &j2k_cuda_runtime::CudaContext,
2415 plan: &CudaHtj2kDecodePlan,
2416 tables: &CudaHtj2kDecodeTableResources,
2417 pool: &CudaBufferPool,
2418 collect_stage_timings: bool,
2419) -> Result<CudaDecodedComponent, Error> {
2420 let resource_upload_start = profile::profile_now(collect_stage_timings);
2421 let decode_resources = context
2422 .upload_htj2k_decode_resources_with_tables(plan.payload(), tables)
2423 .map_err(cuda_error)?;
2424 let resource_upload_us = profile::elapsed_us(resource_upload_start);
2425 let mut component = decode_cuda_component_plan_with_resources(
2426 context,
2427 plan,
2428 &decode_resources,
2429 pool,
2430 collect_stage_timings,
2431 )?;
2432 component.timings.h2d = component.timings.h2d.saturating_add(resource_upload_us);
2433 component.timings.payload_upload = component
2434 .timings
2435 .payload_upload
2436 .saturating_add(resource_upload_us);
2437 Ok(component)
2438}
2439
2440#[cfg(test)]
2441fn split_htj2k_subband_decode_dispatches(kernel_dispatches: usize) -> (usize, usize) {
2442 if kernel_dispatches == 0 {
2443 return (0, 0);
2444 }
2445
2446 let dequant_dispatches = usize::from(kernel_dispatches > 1);
2447 (
2448 kernel_dispatches.saturating_sub(dequant_dispatches),
2449 dequant_dispatches,
2450 )
2451}
2452
2453#[cfg(feature = "cuda-runtime")]
2454fn htj2k_batched_cleanup_dispatches(target_count: usize) -> usize {
2455 usize::from(target_count > 0)
2456}
2457
2458#[cfg(any(feature = "cuda-runtime", test))]
2459fn htj2k_batched_dequant_dispatches(target_count: usize) -> usize {
2460 usize::from(target_count > 0)
2461}
2462
2463#[cfg(feature = "cuda-runtime")]
2464fn htj2k_batched_cleanup_dequant_dispatches(
2465 target_count: usize,
2466 fused_cleanup_dequant: bool,
2467) -> (usize, usize) {
2468 if target_count == 0 {
2469 return (0, 0);
2470 }
2471 if fused_cleanup_dequant {
2472 (1, 0)
2473 } else {
2474 (1, 1)
2475 }
2476}
2477
2478#[cfg(feature = "cuda-runtime")]
2479fn decode_cuda_component_plan_with_resources(
2480 context: &j2k_cuda_runtime::CudaContext,
2481 plan: &CudaHtj2kDecodePlan,
2482 decode_resources: &CudaHtj2kDecodeResources,
2483 pool: &CudaBufferPool,
2484 collect_stage_timings: bool,
2485) -> Result<CudaDecodedComponent, Error> {
2486 let mut work =
2487 decode_cuda_component_subbands_with_resources(context, plan, pool, collect_stage_timings)?;
2488 run_component_cleanup_dequant_batches(
2489 context,
2490 decode_resources,
2491 std::slice::from_mut(&mut work),
2492 pool,
2493 collect_stage_timings,
2494 )?;
2495 run_cuda_component_idwt_steps(
2496 context,
2497 plan.idwt_steps(),
2498 &mut work,
2499 pool,
2500 collect_stage_timings,
2501 )?;
2502 finish_cuda_component_decode(work)
2503}
2504
2505#[cfg(feature = "cuda-runtime")]
2506fn decode_cuda_component_subbands_with_resources(
2507 context: &j2k_cuda_runtime::CudaContext,
2508 plan: &CudaHtj2kDecodePlan,
2509 pool: &CudaBufferPool,
2510 collect_stage_timings: bool,
2511) -> Result<CudaComponentDecodeWork, Error> {
2512 let mut bands = Vec::with_capacity(plan.subbands().len() + plan.idwt_steps().len());
2513 let mut pending_dequant_bands = Vec::with_capacity(plan.subbands().len());
2514 let dispatches = 0usize;
2515 let decode_dispatches = 0usize;
2516 let mut timings = CudaDecodeStageTimings::default();
2517
2518 for subband in plan.subbands() {
2519 let start = subband.code_block_start as usize;
2520 let end = start.checked_add(subband.code_block_count as usize).ok_or(
2521 Error::UnsupportedCudaRequest {
2522 reason: CUDA_HTJ2K_PLAN_INVARIANT_FAILED,
2523 },
2524 )?;
2525 let code_blocks =
2526 plan.code_blocks()
2527 .get(start..end)
2528 .ok_or(Error::UnsupportedCudaRequest {
2529 reason: CUDA_HTJ2K_PLAN_INVARIANT_FAILED,
2530 })?;
2531 let jobs = code_blocks
2532 .iter()
2533 .map(|block| cuda_code_block_job_from_plan_block(block, subband.width))
2534 .collect::<Result<Vec<_>, Error>>()?;
2535 let output_words = checked_area(subband.width, subband.height)?;
2536 let allocate_start = profile::profile_now(collect_stage_timings);
2537 let output = context
2538 .allocate_htj2k_codeblock_coefficients_with_pool(&jobs, output_words, pool)
2539 .map_err(cuda_error)?;
2540 let allocate_wall_us = profile::elapsed_us(allocate_start);
2541 timings.h2d = timings.h2d.saturating_add(allocate_wall_us);
2542 let (buffer, _, _) = output.into_parts();
2543 let band_index = bands.len();
2544 bands.push(CudaCoefficientBand {
2545 band_id: subband.band_id,
2546 buffer,
2547 });
2548 if !jobs.is_empty() {
2549 pending_dequant_bands.push(CudaPendingDequantBand {
2550 band_index,
2551 jobs,
2552 output_words,
2553 });
2554 }
2555 }
2556
2557 let [store] = plan.store_steps() else {
2558 return Err(Error::UnsupportedCudaRequest {
2559 reason: CUDA_HTJ2K_STORE_UNSUPPORTED,
2560 });
2561 };
2562
2563 Ok(CudaComponentDecodeWork {
2564 bands,
2565 pending_dequant_bands,
2566 store: *store,
2567 dispatches,
2568 decode_dispatches,
2569 timings,
2570 })
2571}
2572
2573#[cfg(feature = "cuda-runtime")]
2574fn run_component_cleanup_dequant_batches(
2575 context: &j2k_cuda_runtime::CudaContext,
2576 decode_resources: &CudaHtj2kDecodeResources,
2577 component_work: &mut [CudaComponentDecodeWork],
2578 pool: &CudaBufferPool,
2579 collect_stage_timings: bool,
2580) -> Result<(), Error> {
2581 let pending_count = component_work
2582 .iter()
2583 .map(|work| work.pending_dequant_bands.len())
2584 .sum::<usize>();
2585 if pending_count == 0 {
2586 return Ok(());
2587 }
2588 let accounting_index = component_work
2589 .iter()
2590 .position(|work| !work.pending_dequant_bands.is_empty())
2591 .ok_or(Error::UnsupportedCudaRequest {
2592 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
2593 })?;
2594
2595 let has_refinement = component_work.iter().any(|work| {
2596 work.pending_dequant_bands.iter().any(|pending| {
2597 pending
2598 .jobs
2599 .iter()
2600 .any(|job| job.refinement_length > 0 || u32::from(job.number_of_coding_passes) > 1)
2601 })
2602 });
2603 let cleanup_targets = component_work
2604 .iter()
2605 .flat_map(|work| {
2606 work.pending_dequant_bands
2607 .iter()
2608 .map(move |pending| (work, pending))
2609 })
2610 .map(|(work, pending)| {
2611 let coefficients = pooled_cuda_buffer(&work.bands[pending.band_index].buffer)?;
2612 Ok(CudaHtj2kCleanupTarget {
2613 coefficients,
2614 jobs: &pending.jobs,
2615 output_words: pending.output_words,
2616 })
2617 })
2618 .collect::<Result<Vec<_>, Error>>()?;
2619 if !has_refinement {
2620 let stage_start = profile::profile_now(collect_stage_timings);
2621 let ((stats, runtime_timings), fused_us) = context
2622 .time_default_stream_named_us_if(
2623 collect_stage_timings,
2624 "j2k.htj2k.decode.cleanup_dequantize.batch",
2625 || {
2626 context
2627 .decode_htj2k_codeblocks_cleanup_dequantize_multi_with_resources_and_pool_timed(
2628 decode_resources,
2629 &cleanup_targets,
2630 pool,
2631 collect_stage_timings,
2632 )
2633 },
2634 )
2635 .map_err(cuda_error)?;
2636 let stage_wall_us = profile::elapsed_us(stage_start);
2637 let (cleanup_dispatches, dequant_dispatches) =
2638 htj2k_batched_cleanup_dequant_dispatches(pending_count, true);
2639 {
2640 let accounting = &mut component_work[accounting_index];
2641 accounting.timings.h2d = accounting
2642 .timings
2643 .h2d
2644 .saturating_add(stage_wall_us.saturating_sub(fused_us));
2645 accounting.timings.ht_cleanup = accounting.timings.ht_cleanup.saturating_add(fused_us);
2646 accounting.timings.status_d2h = accounting
2647 .timings
2648 .status_d2h
2649 .saturating_add(runtime_timings.status_d2h_us);
2650 accounting.timings.ht_dispatch_count = accounting
2651 .timings
2652 .ht_dispatch_count
2653 .saturating_add(cleanup_dispatches);
2654 accounting.timings.dequant_dispatch_count = accounting
2655 .timings
2656 .dequant_dispatch_count
2657 .saturating_add(dequant_dispatches);
2658 accounting.dispatches = accounting
2659 .dispatches
2660 .saturating_add(stats.kernel_dispatches());
2661 accounting.decode_dispatches = accounting
2662 .decode_dispatches
2663 .saturating_add(stats.decode_kernel_dispatches());
2664 }
2665
2666 for work in component_work {
2667 work.pending_dequant_bands.clear();
2668 }
2669 return Ok(());
2670 }
2671 let mut queued_cleanup: Option<CudaQueuedHtj2kCleanup> = None;
2672 let stage_start = profile::profile_now(collect_stage_timings);
2673 let (stats, cleanup_us, status_d2h_us) = if collect_stage_timings {
2674 let ((stats, runtime_timings), cleanup_us) = context
2675 .time_default_stream_named_us_if(
2676 collect_stage_timings,
2677 "j2k.htj2k.decode.cleanup.batch",
2678 || {
2679 context.decode_htj2k_codeblocks_cleanup_multi_with_resources_and_pool_timed(
2680 decode_resources,
2681 &cleanup_targets,
2682 pool,
2683 collect_stage_timings,
2684 )
2685 },
2686 )
2687 .map_err(cuda_error)?;
2688 (stats, cleanup_us, runtime_timings.status_d2h_us)
2689 } else {
2690 let (queued, cleanup_us) = context
2691 .time_default_stream_named_us_if(false, "j2k.htj2k.decode.cleanup.batch", || {
2692 context.decode_htj2k_codeblocks_cleanup_multi_enqueue_with_resources_and_pool(
2693 decode_resources,
2694 &cleanup_targets,
2695 pool,
2696 )
2697 })
2698 .map_err(cuda_error)?;
2699 let stats = queued.execution();
2700 queued_cleanup = Some(queued);
2701 (stats, cleanup_us, 0)
2702 };
2703 drop(cleanup_targets);
2704 let stage_wall_us = profile::elapsed_us(stage_start);
2705 {
2706 let accounting = &mut component_work[accounting_index];
2707 accounting.timings.h2d = accounting
2708 .timings
2709 .h2d
2710 .saturating_add(stage_wall_us.saturating_sub(cleanup_us));
2711 accounting.timings.ht_cleanup = accounting.timings.ht_cleanup.saturating_add(cleanup_us);
2712 accounting.timings.status_d2h = accounting.timings.status_d2h.saturating_add(status_d2h_us);
2713 if has_refinement {
2714 accounting.timings.ht_refine = accounting.timings.ht_refine.saturating_add(cleanup_us);
2715 }
2716 accounting.timings.ht_dispatch_count = accounting
2717 .timings
2718 .ht_dispatch_count
2719 .saturating_add(htj2k_batched_cleanup_dispatches(pending_count));
2720 accounting.dispatches = accounting
2721 .dispatches
2722 .saturating_add(stats.kernel_dispatches());
2723 accounting.decode_dispatches = accounting
2724 .decode_dispatches
2725 .saturating_add(stats.decode_kernel_dispatches());
2726 }
2727
2728 let stage_start = profile::profile_now(collect_stage_timings);
2729 let (stats, dequant_us, dequant_target_count) = {
2730 let dequant_target_count = pending_count;
2731 let dequant_result = if let Some(queued) = queued_cleanup.as_ref() {
2732 context.time_default_stream_named_us_if(
2733 collect_stage_timings,
2734 "j2k.htj2k.decode.dequantize.batch",
2735 || context.j2k_dequantize_queued_htj2k_cleanup_with_pool(queued),
2736 )
2737 } else {
2738 let dequant_targets = component_work
2739 .iter()
2740 .flat_map(|work| {
2741 work.pending_dequant_bands
2742 .iter()
2743 .map(move |pending| (work, pending))
2744 })
2745 .map(|(work, pending)| {
2746 let coefficients = pooled_cuda_buffer(&work.bands[pending.band_index].buffer)?;
2747 Ok(CudaHtj2kDequantizeTarget {
2748 coefficients,
2749 jobs: &pending.jobs,
2750 output_words: pending.output_words,
2751 })
2752 })
2753 .collect::<Result<Vec<_>, Error>>()?;
2754 context.time_default_stream_named_us_if(
2755 collect_stage_timings,
2756 "j2k.htj2k.decode.dequantize.batch",
2757 || {
2758 context.j2k_dequantize_htj2k_codeblocks_multi_device_with_pool(
2759 &dequant_targets,
2760 pool,
2761 )
2762 },
2763 )
2764 };
2765 let (stats, dequant_us) = match dequant_result {
2766 Ok(result) => result,
2767 Err(error) => {
2768 if let Some(queued) = queued_cleanup.take() {
2769 queued.finish().map_err(cuda_error)?;
2770 }
2771 return Err(cuda_error(error));
2772 }
2773 };
2774 (stats, dequant_us, dequant_target_count)
2775 };
2776 let stage_wall_us = profile::elapsed_us(stage_start);
2777 {
2778 let accounting = &mut component_work[accounting_index];
2779 accounting.timings.h2d = accounting
2780 .timings
2781 .h2d
2782 .saturating_add(stage_wall_us.saturating_sub(dequant_us));
2783 accounting.timings.dequant = accounting.timings.dequant.saturating_add(dequant_us);
2784 accounting.timings.dequant_dispatch_count = accounting
2785 .timings
2786 .dequant_dispatch_count
2787 .saturating_add(htj2k_batched_dequant_dispatches(dequant_target_count));
2788 accounting.dispatches = accounting
2789 .dispatches
2790 .saturating_add(stats.kernel_dispatches());
2791 accounting.decode_dispatches = accounting
2792 .decode_dispatches
2793 .saturating_add(stats.decode_kernel_dispatches());
2794 }
2795 if let Some(queued) = queued_cleanup.take() {
2796 queued.finish().map_err(cuda_error)?;
2797 }
2798
2799 for work in component_work {
2800 work.pending_dequant_bands.clear();
2801 }
2802 Ok(())
2803}
2804
2805#[cfg(feature = "cuda-runtime")]
2806fn run_cuda_component_idwt_steps(
2807 context: &j2k_cuda_runtime::CudaContext,
2808 steps: &[CudaHtj2kIdwtStep],
2809 work: &mut CudaComponentDecodeWork,
2810 pool: &CudaBufferPool,
2811 collect_stage_timings: bool,
2812) -> Result<(), Error> {
2813 for step in steps {
2814 let ll = find_cuda_band(&work.bands, step.ll_band_id)?;
2815 let hl = find_cuda_band(&work.bands, step.hl_band_id)?;
2816 let lh = find_cuda_band(&work.bands, step.lh_band_id)?;
2817 let hh = find_cuda_band(&work.bands, step.hh_band_id)?;
2818 let low_low_device = pooled_cuda_buffer(&ll.buffer)?;
2819 let high_low_device = pooled_cuda_buffer(&hl.buffer)?;
2820 let low_high_device = pooled_cuda_buffer(&lh.buffer)?;
2821 let high_high_device = pooled_cuda_buffer(&hh.buffer)?;
2822 let job = cuda_idwt_job_from_step(step);
2823 let (output, idwt_us) = context
2824 .time_default_stream_named_us_if(collect_stage_timings, "j2k.htj2k.decode.idwt", || {
2825 if collect_stage_timings {
2826 return context.j2k_inverse_dwt_single_device_with_pool(
2827 low_low_device,
2828 high_low_device,
2829 low_high_device,
2830 high_high_device,
2831 job,
2832 pool,
2833 );
2834 }
2835 context.j2k_inverse_dwt_single_device_untimed_with_pool(
2836 low_low_device,
2837 high_low_device,
2838 low_high_device,
2839 high_high_device,
2840 job,
2841 pool,
2842 )
2843 })
2844 .map_err(cuda_error)?;
2845 work.timings.idwt = work.timings.idwt.saturating_add(idwt_us);
2846 let (buffer, stats) = output.into_parts();
2847 work.dispatches = work.dispatches.saturating_add(stats.kernel_dispatches());
2848 work.decode_dispatches = work
2849 .decode_dispatches
2850 .saturating_add(stats.decode_kernel_dispatches());
2851 work.timings.idwt_dispatch_count = work
2852 .timings
2853 .idwt_dispatch_count
2854 .saturating_add(stats.kernel_dispatches());
2855 work.bands.push(CudaCoefficientBand {
2856 band_id: step.output_band_id,
2857 buffer,
2858 });
2859 }
2860 Ok(())
2861}
2862
2863#[cfg(feature = "cuda-runtime")]
2864fn finish_cuda_component_decode(
2865 mut work: CudaComponentDecodeWork,
2866) -> Result<CudaDecodedComponent, Error> {
2867 let input_index = work
2868 .bands
2869 .iter()
2870 .position(|band| band.band_id == work.store.input_band_id)
2871 .ok_or(Error::UnsupportedCudaRequest {
2872 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
2873 })?;
2874 let input = work.bands.swap_remove(input_index);
2875 Ok(CudaDecodedComponent {
2876 buffer: input.buffer,
2877 store: work.store,
2878 dispatches: work.dispatches,
2879 decode_dispatches: work.decode_dispatches,
2880 timings: work.timings,
2881 })
2882}
2883
2884#[cfg(feature = "cuda-runtime")]
2885fn can_batch_color_idwt(components: &[&CudaHtj2kDecodePlan]) -> bool {
2886 let Some(first) = components.first() else {
2887 return false;
2888 };
2889 components
2890 .iter()
2891 .all(|component| component.idwt_steps().len() == first.idwt_steps().len())
2892}
2893
2894#[cfg(feature = "cuda-runtime")]
2895fn run_color_component_idwt_batches(
2896 context: &j2k_cuda_runtime::CudaContext,
2897 components: &[&CudaHtj2kDecodePlan],
2898 component_work: &mut [CudaComponentDecodeWork],
2899 pool: &CudaBufferPool,
2900 collect_stage_timings: bool,
2901) -> Result<Option<CudaQueuedIdwtBatch>, Error> {
2902 let (queued_batch, idwt_us) = context
2903 .time_default_stream_named_us_if(
2904 collect_stage_timings,
2905 "j2k.htj2k.decode.idwt.batch",
2906 || enqueue_color_component_idwt_batches(context, components, component_work, pool),
2907 )
2908 .map_err(cuda_error)?;
2909
2910 if let Some(accounting) = component_work.first_mut() {
2911 accounting.timings.idwt = accounting.timings.idwt.saturating_add(idwt_us);
2912 accounting.dispatches = accounting
2913 .dispatches
2914 .saturating_add(queued_batch.kernel_dispatches);
2915 accounting.decode_dispatches = accounting
2916 .decode_dispatches
2917 .saturating_add(queued_batch.decode_dispatches);
2918 accounting.timings.idwt_dispatch_count = accounting
2919 .timings
2920 .idwt_dispatch_count
2921 .saturating_add(queued_batch.kernel_dispatches);
2922 }
2923 let _queued_resource_count = queued_batch
2924 .queued
2925 .iter()
2926 .map(CudaQueuedExecution::resource_count)
2927 .sum::<usize>();
2928 if collect_stage_timings {
2929 drop(queued_batch);
2930 Ok(None)
2931 } else {
2932 Ok(Some(queued_batch))
2933 }
2934}
2935
2936#[cfg(feature = "cuda-runtime")]
2937fn enqueue_color_component_idwt_batches(
2938 context: &j2k_cuda_runtime::CudaContext,
2939 components: &[&CudaHtj2kDecodePlan],
2940 component_work: &mut [CudaComponentDecodeWork],
2941 pool: &CudaBufferPool,
2942) -> Result<CudaQueuedIdwtBatch, CudaError> {
2943 if components.len() != component_work.len() {
2944 return Err(CudaError::InvalidArgument {
2945 message: CUDA_HTJ2K_KERNELS_NOT_READY.to_string(),
2946 });
2947 }
2948 let Some(first) = components.first() else {
2949 return Ok(CudaQueuedIdwtBatch {
2950 queued: Vec::new(),
2951 kernel_dispatches: 0,
2952 decode_dispatches: 0,
2953 });
2954 };
2955
2956 let mut queued = Vec::with_capacity(first.idwt_steps().len());
2957 let mut kernel_dispatches = 0usize;
2958 let mut decode_dispatches = 0usize;
2959 let step_count = first.idwt_steps().len();
2960 let trace_enabled = cuda_idwt_trace_enabled();
2961 let enqueue_result = (|| -> Result<(), CudaError> {
2962 let mut output_pool_trace = CudaIdwtOutputPoolTraceTotals::default();
2963 let output_alloc_start = trace_enabled.then(std::time::Instant::now);
2964 for step_index in 0..step_count {
2965 for (component_index, component) in components.iter().enumerate() {
2966 let step = component.idwt_steps().get(step_index).ok_or_else(|| {
2967 CudaError::InvalidArgument {
2968 message: CUDA_HTJ2K_KERNELS_NOT_READY.to_string(),
2969 }
2970 })?;
2971 let width = step.rect.x1.saturating_sub(step.rect.x0);
2972 let height = step.rect.y1.saturating_sub(step.rect.y0);
2973 let output_words = checked_area(width, height).map_err(cuda_invalid_decode_plan)?;
2974 let output_bytes = output_words
2975 .checked_mul(std::mem::size_of::<f32>())
2976 .ok_or_else(|| CudaError::InvalidArgument {
2977 message: CUDA_HTJ2K_KERNELS_NOT_READY.to_string(),
2978 })?;
2979 let buffer = if trace_enabled {
2980 let (buffer, trace) = pool.take_with_trace(output_bytes)?;
2981 output_pool_trace.add_take(trace);
2982 buffer
2983 } else {
2984 pool.take(output_bytes)?
2985 };
2986 component_work[component_index]
2987 .bands
2988 .push(CudaCoefficientBand {
2989 band_id: step.output_band_id,
2990 buffer,
2991 });
2992 }
2993 }
2994 let output_alloc_us = elapsed_host_us(output_alloc_start);
2995
2996 let target_build_start = trace_enabled.then(std::time::Instant::now);
2997 let mut target_batches = Vec::with_capacity(step_count);
2998 for step_index in 0..step_count {
2999 let targets = components
3000 .iter()
3001 .enumerate()
3002 .map(|(component_index, component)| {
3003 let step = component.idwt_steps().get(step_index).ok_or_else(|| {
3004 CudaError::InvalidArgument {
3005 message: CUDA_HTJ2K_KERNELS_NOT_READY.to_string(),
3006 }
3007 })?;
3008 let work = &component_work[component_index];
3009 let ll = find_cuda_band(&work.bands, step.ll_band_id)
3010 .map_err(cuda_invalid_decode_plan)?;
3011 let hl = find_cuda_band(&work.bands, step.hl_band_id)
3012 .map_err(cuda_invalid_decode_plan)?;
3013 let lh = find_cuda_band(&work.bands, step.lh_band_id)
3014 .map_err(cuda_invalid_decode_plan)?;
3015 let hh = find_cuda_band(&work.bands, step.hh_band_id)
3016 .map_err(cuda_invalid_decode_plan)?;
3017 let output = find_cuda_band(&work.bands, step.output_band_id)
3018 .map_err(cuda_invalid_decode_plan)?;
3019 Ok(CudaJ2kIdwtTarget {
3020 ll: pooled_cuda_buffer(&ll.buffer).map_err(cuda_invalid_decode_plan)?,
3021 hl: pooled_cuda_buffer(&hl.buffer).map_err(cuda_invalid_decode_plan)?,
3022 lh: pooled_cuda_buffer(&lh.buffer).map_err(cuda_invalid_decode_plan)?,
3023 hh: pooled_cuda_buffer(&hh.buffer).map_err(cuda_invalid_decode_plan)?,
3024 output: pooled_cuda_buffer(&output.buffer)
3025 .map_err(cuda_invalid_decode_plan)?,
3026 job: cuda_idwt_job_from_step(step),
3027 })
3028 })
3029 .collect::<Result<Vec<_>, CudaError>>()?;
3030 target_batches.push(targets);
3031 }
3032 let target_build_us = elapsed_host_us(target_build_start);
3033 let target_slices = target_batches.iter().map(Vec::as_slice).collect::<Vec<_>>();
3034 let enqueue_start = trace_enabled.then(std::time::Instant::now);
3035 let queued_execution =
3036 context.j2k_inverse_dwt_batch_sequence_enqueue_with_pool(&target_slices, pool)?;
3037 let enqueue_us = elapsed_host_us(enqueue_start);
3038 let execution = queued_execution.execution();
3039 kernel_dispatches = kernel_dispatches.saturating_add(execution.kernel_dispatches());
3040 decode_dispatches = decode_dispatches.saturating_add(execution.decode_kernel_dispatches());
3041 queued.push(queued_execution);
3042 if trace_enabled {
3043 let row = CudaIdwtBatchHostTraceRow {
3044 component_count: components.len(),
3045 step_count,
3046 output_alloc_us,
3047 target_build_us,
3048 enqueue_us,
3049 output_take_count: output_pool_trace.take_count,
3050 output_pool_reuse_count: output_pool_trace.reuse_count,
3051 output_pool_alloc_count: output_pool_trace.alloc_count,
3052 output_pool_scanned_count: output_pool_trace.scanned_count,
3053 output_pool_max_free_count: output_pool_trace.max_free_count,
3054 output_requested_bytes: output_pool_trace.requested_bytes,
3055 };
3056 eprintln!("{}", format_cuda_idwt_batch_host_trace_row(row));
3057 }
3058 Ok(())
3059 })();
3060 if let Err(error) = enqueue_result {
3061 if !queued.is_empty() {
3062 let _ = context.synchronize();
3063 }
3064 return Err(error);
3065 }
3066
3067 Ok(CudaQueuedIdwtBatch {
3068 queued,
3069 kernel_dispatches,
3070 decode_dispatches,
3071 })
3072}
3073
3074#[cfg(feature = "cuda-runtime")]
3075fn cuda_code_block_job_from_plan_block(
3076 block: &crate::CudaHtj2kCodeBlock,
3077 subband_width: u32,
3078) -> Result<CudaHtj2kCodeBlockJob, Error> {
3079 let output_offset = block
3080 .output_y
3081 .checked_mul(subband_width)
3082 .and_then(|base| base.checked_add(block.output_x))
3083 .ok_or(Error::UnsupportedCudaRequest {
3084 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
3085 })?;
3086 Ok(CudaHtj2kCodeBlockJob {
3087 payload_offset: block.payload_offset,
3088 width: block.width,
3089 height: block.height,
3090 payload_len: block.payload_len,
3091 cleanup_length: block.cleanup_length,
3092 refinement_length: block.refinement_length,
3093 missing_bit_planes: block.missing_bit_planes,
3094 num_bitplanes: block.num_bitplanes,
3095 number_of_coding_passes: block.number_of_coding_passes,
3096 output_stride: block.output_stride,
3097 output_offset,
3098 dequantization_step: block.dequantization_step,
3099 stripe_causal: block.stripe_causal != 0,
3100 })
3101}
3102
3103#[cfg(feature = "cuda-runtime")]
3104fn validate_color_stores(
3105 stores: [&CudaHtj2kStoreStep; 3],
3106 dimensions: (u32, u32),
3107) -> Result<(), Error> {
3108 let first = stores[0];
3109 for store in stores {
3110 let input_width = store.input_rect.x1.saturating_sub(store.input_rect.x0);
3111 let input_height = store.input_rect.y1.saturating_sub(store.input_rect.y0);
3112 let source_end_x =
3113 store
3114 .source_x
3115 .checked_add(store.copy_width)
3116 .ok_or(Error::UnsupportedCudaRequest {
3117 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
3118 })?;
3119 let source_end_y =
3120 store
3121 .source_y
3122 .checked_add(store.copy_height)
3123 .ok_or(Error::UnsupportedCudaRequest {
3124 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
3125 })?;
3126 if store.output_x != 0
3127 || store.output_y != 0
3128 || store.copy_width != dimensions.0
3129 || store.copy_height != dimensions.1
3130 || store.output_width != dimensions.0
3131 || store.output_height != dimensions.1
3132 || source_end_x > input_width
3133 || source_end_y > input_height
3134 || store.source_x != first.source_x
3135 || store.source_y != first.source_y
3136 {
3137 return Err(Error::UnsupportedCudaRequest {
3138 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
3139 });
3140 }
3141 }
3142 Ok(())
3143}
3144
3145#[cfg(feature = "cuda-runtime")]
3146fn bit_depth_addend(bit_depth: u8) -> f32 {
3147 let shift = bit_depth.saturating_sub(1).min(15);
3148 f32::from(1_u16 << shift)
3149}
3150
3151#[cfg(feature = "cuda-runtime")]
3152fn checked_area(width: u32, height: u32) -> Result<usize, Error> {
3153 width
3154 .try_into()
3155 .ok()
3156 .and_then(|width: usize| width.checked_mul(height as usize))
3157 .ok_or(Error::UnsupportedCudaRequest {
3158 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
3159 })
3160}
3161
3162#[cfg(feature = "cuda-runtime")]
3163fn find_cuda_band(
3164 bands: &[CudaCoefficientBand],
3165 band_id: CudaHtj2kBandId,
3166) -> Result<&CudaCoefficientBand, Error> {
3167 bands
3168 .iter()
3169 .find(|band| band.band_id == band_id)
3170 .ok_or(Error::UnsupportedCudaRequest {
3171 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
3172 })
3173}
3174
3175#[cfg(feature = "cuda-runtime")]
3176fn pooled_cuda_buffer(buffer: &CudaPooledDeviceBuffer) -> Result<&CudaDeviceBuffer, Error> {
3177 buffer
3178 .as_device_buffer()
3179 .ok_or(Error::UnsupportedCudaRequest {
3180 reason: CUDA_HTJ2K_KERNELS_NOT_READY,
3181 })
3182}
3183
3184#[cfg(feature = "cuda-runtime")]
3185#[allow(clippy::needless_pass_by_value)]
3186fn cuda_invalid_decode_plan(error: Error) -> CudaError {
3187 CudaError::InvalidArgument {
3188 message: error.to_string(),
3189 }
3190}
3191
3192#[cfg(feature = "cuda-runtime")]
3193fn cuda_runtime_rect(rect: crate::CudaHtj2kRect) -> CudaJ2kRect {
3194 CudaJ2kRect {
3195 x0: rect.x0,
3196 y0: rect.y0,
3197 x1: rect.x1,
3198 y1: rect.y1,
3199 }
3200}
3201
3202#[cfg(feature = "cuda-runtime")]
3203fn cuda_idwt_job_from_step(step: &CudaHtj2kIdwtStep) -> CudaJ2kIdwtJob {
3204 CudaJ2kIdwtJob {
3205 rect: cuda_runtime_rect(step.rect),
3206 ll_rect: cuda_runtime_rect(step.ll_rect),
3207 hl_rect: cuda_runtime_rect(step.hl_rect),
3208 lh_rect: cuda_runtime_rect(step.lh_rect),
3209 hh_rect: cuda_runtime_rect(step.hh_rect),
3210 irreversible97: u32::from(step.transform == CudaHtj2kTransform::Irreversible97),
3211 }
3212}
3213
3214#[cfg(all(test, feature = "cuda-runtime"))]
3215mod tests {
3216 use super::{
3217 build_cuda_htj2k_color_plans_from_bytes_with_profile, can_batch_color_idwt,
3218 cuda_code_block_job_from_plan_block, htj2k_batched_cleanup_dequant_dispatches,
3219 htj2k_batched_cleanup_dispatches, htj2k_batched_dequant_dispatches, CudaDecodeStageTimings,
3220 };
3221 use j2k_core::PixelFormat;
3222 use j2k_native::{encode_htj2k, DecoderContext as NativeDecoderContext, EncodeOptions};
3223
3224 use crate::CudaHtj2kCodeBlock;
3225
3226 #[test]
3227 fn cuda_runtime_code_block_job_preserves_plan_output_stride() {
3228 let block = CudaHtj2kCodeBlock {
3229 subband_index: 0,
3230 payload_offset: 13,
3231 payload_len: 5,
3232 cleanup_length: 5,
3233 refinement_length: 0,
3234 output_x: 3,
3235 output_y: 2,
3236 width: 4,
3237 height: 5,
3238 output_stride: 99,
3239 missing_bit_planes: 1,
3240 number_of_coding_passes: 1,
3241 num_bitplanes: 8,
3242 stripe_causal: 0,
3243 dequantization_step: 1.0,
3244 };
3245
3246 let job = cuda_code_block_job_from_plan_block(&block, 64)
3247 .expect("valid CUDA code-block runtime job");
3248
3249 assert_eq!(job.output_offset, 131);
3250 assert_eq!(job.output_stride, 99);
3251 }
3252
3253 #[test]
3254 fn batched_cleanup_and_dequant_dispatch_helpers_count_one_shared_dispatch() {
3255 assert_eq!(htj2k_batched_cleanup_dispatches(0), 0);
3256 assert_eq!(htj2k_batched_cleanup_dispatches(1), 1);
3257 assert_eq!(htj2k_batched_cleanup_dispatches(3), 1);
3258 assert_eq!(htj2k_batched_dequant_dispatches(0), 0);
3259 assert_eq!(htj2k_batched_dequant_dispatches(1), 1);
3260 assert_eq!(htj2k_batched_dequant_dispatches(3), 1);
3261 assert_eq!(htj2k_batched_cleanup_dequant_dispatches(0, true), (0, 0));
3262 assert_eq!(htj2k_batched_cleanup_dequant_dispatches(1, true), (1, 0));
3263 assert_eq!(htj2k_batched_cleanup_dequant_dispatches(3, true), (1, 0));
3264 assert_eq!(htj2k_batched_cleanup_dequant_dispatches(1, false), (1, 1));
3265 assert_eq!(htj2k_batched_cleanup_dequant_dispatches(3, false), (1, 1));
3266 }
3267
3268 #[test]
3269 fn profiled_cuda_batch_decode_api_accepts_empty_batch() {
3270 let mut session = crate::CudaSession::default();
3271 let inputs: [&[u8]; 0] = [];
3272
3273 let (surfaces, report) =
3274 crate::J2kDecoder::decode_batch_to_device_with_session_and_profile(
3275 &inputs,
3276 PixelFormat::Rgb8,
3277 &mut session,
3278 )
3279 .expect("empty CUDA batch decode");
3280
3281 assert!(surfaces.is_empty());
3282 assert_eq!(report.block_count, 0);
3283 assert_eq!(report.payload_bytes, 0);
3284 }
3285
3286 #[test]
3287 fn cuda_batch_decode_two_color_images_matches_single_when_runtime_required() {
3288 let pixels_a: Vec<u8> = (0u16..16 * 16 * 3)
3289 .map(|idx| u8::try_from((idx * 7 + idx / 5) & 0xff).expect("masked byte"))
3290 .collect();
3291 let pixels_b: Vec<u8> = (0u16..16 * 16 * 3)
3292 .map(|idx| u8::try_from((idx * 11 + 23) & 0xff).expect("masked byte"))
3293 .collect();
3294 let options = EncodeOptions {
3295 reversible: true,
3296 num_decomposition_levels: 1,
3297 ..EncodeOptions::default()
3298 };
3299 let codestream_a =
3300 encode_htj2k(&pixels_a, 16, 16, 3, 8, false, &options).expect("encode fixture A");
3301 let codestream_b =
3302 encode_htj2k(&pixels_b, 16, 16, 3, 8, false, &options).expect("encode fixture B");
3303 let inputs = [codestream_a.as_slice(), codestream_b.as_slice()];
3304 let mut batch_session = crate::CudaSession::default();
3305
3306 let batch = crate::J2kDecoder::decode_batch_to_device_with_session_and_profile(
3307 &inputs,
3308 PixelFormat::Rgb8,
3309 &mut batch_session,
3310 );
3311 let (surfaces, report) = match batch {
3312 Ok(result) => result,
3313 Err(crate::Error::CudaUnavailable | crate::Error::CudaRuntime { .. })
3314 if !cuda_runtime_required() =>
3315 {
3316 return;
3317 }
3318 Err(error) => panic!("batch CUDA decode failed: {error}"),
3319 };
3320
3321 assert_eq!(surfaces.len(), 2);
3322 assert_eq!(report.detail.ht_dispatch_count, 1);
3323 assert_eq!(report.detail.dequant_dispatch_count, 0);
3324 assert_eq!(report.detail.store_dispatch_count, 1);
3325 let batch_pixels_tight =
3326 crate::Surface::download_batch_tight(&surfaces).expect("download tight CUDA batch");
3327 assert_eq!(batch_pixels_tight.len(), surfaces.len() * 16 * 16 * 3);
3328 for (index, codestream) in inputs.iter().enumerate() {
3329 let mut single_session = crate::CudaSession::default();
3330 let mut decoder = crate::J2kDecoder::new(codestream).expect("single decoder");
3331 let single = decoder
3332 .decode_to_device_with_session(PixelFormat::Rgb8, &mut single_session)
3333 .expect("single CUDA decode");
3334 let mut single_pixels = vec![0u8; 16 * 16 * 3];
3335 let mut batch_pixels = vec![0u8; 16 * 16 * 3];
3336 single
3337 .download_into(&mut single_pixels, 16 * 3)
3338 .expect("download single decode");
3339 surfaces[index]
3340 .download_into(&mut batch_pixels, 16 * 3)
3341 .expect("download batch decode");
3342 assert_eq!(batch_pixels, single_pixels);
3343 assert_eq!(
3344 &batch_pixels_tight[index * 16 * 16 * 3..(index + 1) * 16 * 16 * 3],
3345 single_pixels.as_slice()
3346 );
3347 }
3348 }
3349
3350 #[test]
3351 fn cuda_batch_decode_mixed_idwt_shapes_avoids_fused_batch_store_without_idwt_batch() {
3352 let codestream_a = rgb8_htj2k_fixture(32, 32, 1, 7);
3353 let codestream_b = rgb8_htj2k_fixture(32, 32, 2, 19);
3354 let inputs = [codestream_a.as_slice(), codestream_b.as_slice()];
3355 let mut batch_session = crate::CudaSession::default();
3356
3357 let result = crate::J2kDecoder::decode_batch_to_device_with_session(
3358 &inputs,
3359 PixelFormat::Rgb8,
3360 &mut batch_session,
3361 );
3362 let surfaces = match result {
3363 Ok(surfaces) => surfaces,
3364 Err(crate::Error::CudaUnavailable | crate::Error::CudaRuntime { .. })
3365 if !cuda_runtime_required() =>
3366 {
3367 return;
3368 }
3369 Err(crate::Error::UnsupportedCudaRequest { .. }) => return,
3370 Err(error) => panic!("mixed-shape batch CUDA decode failed: {error}"),
3371 };
3372
3373 assert_eq!(surfaces.len(), inputs.len());
3374 for (index, codestream) in inputs.iter().enumerate() {
3375 let mut single_session = crate::CudaSession::default();
3376 let mut decoder = crate::J2kDecoder::new(codestream).expect("single decoder");
3377 let single = decoder
3378 .decode_to_device_with_session(PixelFormat::Rgb8, &mut single_session)
3379 .expect("single CUDA decode");
3380 let mut single_pixels = vec![0u8; 32 * 32 * 3];
3381 let mut batch_pixels = vec![0u8; 32 * 32 * 3];
3382 single
3383 .download_into(&mut single_pixels, 32 * 3)
3384 .expect("download single decode");
3385 surfaces[index]
3386 .download_into(&mut batch_pixels, 32 * 3)
3387 .expect("download mixed-shape batch decode");
3388 assert_eq!(batch_pixels, single_pixels);
3389 }
3390 }
3391
3392 #[test]
3393 fn decode_stage_timings_report_status_download_detail() {
3394 let mut report = crate::CudaHtj2kProfileReport::default();
3395 let timings = CudaDecodeStageTimings {
3396 h2d: 17,
3397 status_d2h: 5,
3398 ..CudaDecodeStageTimings::default()
3399 };
3400
3401 timings.add_to_report(&mut report);
3402
3403 assert_eq!(report.h2d_us, 17);
3404 assert_eq!(report.detail.status_d2h_us, 5);
3405 }
3406
3407 fn cuda_runtime_required() -> bool {
3408 std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some()
3409 }
3410
3411 fn rgb8_htj2k_fixture(width: u32, height: u32, levels: u8, seed: u16) -> Vec<u8> {
3412 let mut pixels = Vec::with_capacity(width as usize * height as usize * 3);
3413 for idx in 0..width * height {
3414 let seed = u32::from(seed);
3415 pixels.push(u8::try_from((idx * seed + idx / 3) & 0xff).expect("red"));
3416 pixels.push(u8::try_from((idx * (seed + 11) + 7) & 0xff).expect("green"));
3417 pixels.push(u8::try_from((idx * (seed + 23) + 19) & 0xff).expect("blue"));
3418 }
3419 let options = EncodeOptions {
3420 reversible: true,
3421 num_decomposition_levels: levels,
3422 ..EncodeOptions::default()
3423 };
3424 encode_htj2k(&pixels, width, height, 3, 8, false, &options)
3425 .expect("encode RGB HTJ2K fixture")
3426 }
3427
3428 #[test]
3429 fn color_plan_flattens_one_shared_payload_for_component_decode() {
3430 let pixels: Vec<u8> = (0u16..4 * 4 * 3)
3431 .map(|idx| u8::try_from((idx * 13 + idx / 3) & 0xff).expect("masked byte"))
3432 .collect();
3433 let options = EncodeOptions {
3434 reversible: true,
3435 num_decomposition_levels: 1,
3436 ..EncodeOptions::default()
3437 };
3438 let codestream =
3439 encode_htj2k(&pixels, 4, 4, 3, 8, false, &options).expect("encode HTJ2K RGB fixture");
3440 let mut decoder = crate::J2kDecoder::new(&codestream).expect("decoder");
3441
3442 let color = decoder
3443 .build_cuda_htj2k_color_plans_with_profile(PixelFormat::Rgb8)
3444 .expect("CUDA color plans");
3445
3446 assert_eq!(color.components.len(), 3);
3447 assert!(!color.payload.is_empty());
3448 assert_eq!(color.report.payload_bytes, color.payload.len());
3449 for component in &color.components {
3450 assert!(component.payload().is_empty());
3451 for block in component.code_blocks() {
3452 let start = usize::try_from(block.payload_offset).expect("payload offset");
3453 let end = start + block.payload_len as usize;
3454 assert!(end <= color.payload.len());
3455 }
3456 }
3457 }
3458
3459 #[test]
3460 fn byte_color_plan_builder_matches_decoder_color_plan() {
3461 let pixels: Vec<u8> = (0u16..8 * 8 * 3)
3462 .map(|idx| u8::try_from((idx * 19 + idx / 5) & 0xff).expect("masked byte"))
3463 .collect();
3464 let options = EncodeOptions {
3465 reversible: true,
3466 num_decomposition_levels: 1,
3467 ..EncodeOptions::default()
3468 };
3469 let codestream =
3470 encode_htj2k(&pixels, 8, 8, 3, 8, false, &options).expect("encode HTJ2K RGB fixture");
3471 let mut decoder = crate::J2kDecoder::new(&codestream).expect("decoder");
3472 let decoder_plan = decoder
3473 .build_cuda_htj2k_color_plans_with_profile(PixelFormat::Rgb8)
3474 .expect("decoder CUDA color plans");
3475 let mut native_context = NativeDecoderContext::default();
3476 let byte_plan = build_cuda_htj2k_color_plans_from_bytes_with_profile(
3477 &codestream,
3478 PixelFormat::Rgb8,
3479 &mut native_context,
3480 )
3481 .expect("byte CUDA color plans");
3482
3483 assert_eq!(byte_plan.dimensions, decoder_plan.dimensions);
3484 assert_eq!(byte_plan.mct_dimensions, decoder_plan.mct_dimensions);
3485 assert_eq!(byte_plan.bit_depths, decoder_plan.bit_depths);
3486 assert_eq!(byte_plan.mct, decoder_plan.mct);
3487 assert_eq!(byte_plan.components.len(), decoder_plan.components.len());
3488 assert_eq!(byte_plan.payload.len(), decoder_plan.payload.len());
3489 assert_eq!(
3490 byte_plan
3491 .components
3492 .iter()
3493 .map(|component| component.code_blocks().len())
3494 .collect::<Vec<_>>(),
3495 decoder_plan
3496 .components
3497 .iter()
3498 .map(|component| component.code_blocks().len())
3499 .collect::<Vec<_>>()
3500 );
3501 }
3502
3503 #[test]
3504 fn multi_image_color_components_can_share_one_idwt_batch() {
3505 let pixels: Vec<u8> = (0u16..16 * 16 * 3)
3506 .map(|idx| u8::try_from((idx * 17 + idx / 7) & 0xff).expect("masked byte"))
3507 .collect();
3508 let options = EncodeOptions {
3509 reversible: true,
3510 num_decomposition_levels: 1,
3511 ..EncodeOptions::default()
3512 };
3513 let codestream =
3514 encode_htj2k(&pixels, 16, 16, 3, 8, false, &options).expect("encode HTJ2K RGB fixture");
3515 let mut first = crate::J2kDecoder::new(&codestream).expect("first decoder");
3516 let mut second = crate::J2kDecoder::new(&codestream).expect("second decoder");
3517 let first = first
3518 .build_cuda_htj2k_color_plans_with_profile(PixelFormat::Rgb8)
3519 .expect("first CUDA color plans");
3520 let second = second
3521 .build_cuda_htj2k_color_plans_with_profile(PixelFormat::Rgb8)
3522 .expect("second CUDA color plans");
3523 let components = first
3524 .components
3525 .iter()
3526 .chain(second.components.iter())
3527 .collect::<Vec<_>>();
3528
3529 assert_eq!(components.len(), 6);
3530 assert!(can_batch_color_idwt(&components));
3531 }
3532
3533 #[test]
3534 fn batched_color_idwt_defers_completion_to_store_sync() {
3535 let source = include_str!("decoder.rs");
3536
3537 assert!(
3538 !source.contains(
3539 "if !collect_stage_timings {\n context.synchronize().map_err(cuda_error)?;\n }"
3540 ),
3541 "batched color IDWT should keep queued resources live and let the following store synchronize"
3542 );
3543 }
3544}
3545
3546impl ImageCodec for J2kDecoder<'_> {
3547 type Error = Error;
3548 type Warning = Infallible;
3549 type Pool = CpuJ2kScratchPool;
3550}
3551
3552impl<'a> ImageDecode<'a> for J2kDecoder<'a> {
3553 type View = J2kView<'a>;
3554
3555 fn inspect(input: &'a [u8]) -> Result<j2k_core::Info, Self::Error> {
3556 Ok(CpuDecoder::inspect(input)?)
3557 }
3558
3559 fn parse(input: &'a [u8]) -> Result<Self::View, Self::Error> {
3560 Ok(J2kView::parse(input)?)
3561 }
3562
3563 fn from_view(view: Self::View) -> Result<Self, Self::Error> {
3564 Ok(Self {
3565 inner: CpuDecoder::from_view(view)?,
3566 pool: CpuJ2kScratchPool::new(),
3567 })
3568 }
3569
3570 fn decode_into(
3571 &mut self,
3572 out: &mut [u8],
3573 stride: usize,
3574 fmt: PixelFormat,
3575 ) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
3576 Ok(self.inner.decode_into(out, stride, fmt)?)
3577 }
3578
3579 fn decode_into_with_scratch(
3580 &mut self,
3581 pool: &mut Self::Pool,
3582 out: &mut [u8],
3583 stride: usize,
3584 fmt: PixelFormat,
3585 ) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
3586 Ok(self
3587 .inner
3588 .decode_into_with_scratch(pool, out, stride, fmt)?)
3589 }
3590
3591 fn decode_region_into(
3592 &mut self,
3593 pool: &mut Self::Pool,
3594 out: &mut [u8],
3595 stride: usize,
3596 fmt: PixelFormat,
3597 roi: Rect,
3598 ) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
3599 Ok(self.inner.decode_region_into(pool, out, stride, fmt, roi)?)
3600 }
3601
3602 fn decode_scaled_into(
3603 &mut self,
3604 pool: &mut Self::Pool,
3605 out: &mut [u8],
3606 stride: usize,
3607 fmt: PixelFormat,
3608 scale: Downscale,
3609 ) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
3610 Ok(self
3611 .inner
3612 .decode_scaled_into(pool, out, stride, fmt, scale)?)
3613 }
3614
3615 fn decode_region_scaled_into(
3616 &mut self,
3617 pool: &mut Self::Pool,
3618 out: &mut [u8],
3619 stride: usize,
3620 fmt: PixelFormat,
3621 roi: Rect,
3622 scale: Downscale,
3623 ) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
3624 Ok(self
3625 .inner
3626 .decode_region_scaled_into(pool, out, stride, fmt, roi, scale)?)
3627 }
3628}
3629
3630impl<'a> ImageDecodeDevice<'a> for J2kDecoder<'a> {
3631 type DeviceSurface = Surface;
3632}
3633
3634impl<'a> ImageDecodeSubmit<'a> for J2kDecoder<'a> {
3635 type Session = CudaSession;
3636 type DeviceSurface = Surface;
3637 type SubmittedSurface = ReadySubmission<Surface, Error>;
3638
3639 fn submit_to_device(
3640 &mut self,
3641 session: &mut Self::Session,
3642 fmt: PixelFormat,
3643 backend: BackendRequest,
3644 ) -> Result<Self::SubmittedSurface, Self::Error> {
3645 validate_surface_request(backend)?;
3646 Ok(submit_ready_device(session, |session| {
3647 self.decode_to_surface_impl(session, fmt, backend)
3648 }))
3649 }
3650
3651 fn submit_region_to_device(
3652 &mut self,
3653 session: &mut Self::Session,
3654 fmt: PixelFormat,
3655 roi: Rect,
3656 backend: BackendRequest,
3657 ) -> Result<Self::SubmittedSurface, Self::Error> {
3658 validate_surface_request(backend)?;
3659 Ok(submit_ready_device(session, |session| {
3660 self.decode_region_to_surface_impl(session, fmt, roi, backend)
3661 }))
3662 }
3663
3664 fn submit_scaled_to_device(
3665 &mut self,
3666 session: &mut Self::Session,
3667 fmt: PixelFormat,
3668 scale: Downscale,
3669 backend: BackendRequest,
3670 ) -> Result<Self::SubmittedSurface, Self::Error> {
3671 validate_surface_request(backend)?;
3672 Ok(submit_ready_device(session, |session| {
3673 self.decode_scaled_to_surface_impl(session, fmt, scale, backend)
3674 }))
3675 }
3676
3677 fn submit_region_scaled_to_device(
3678 &mut self,
3679 session: &mut Self::Session,
3680 fmt: PixelFormat,
3681 roi: Rect,
3682 scale: Downscale,
3683 backend: BackendRequest,
3684 ) -> Result<Self::SubmittedSurface, Self::Error> {
3685 validate_surface_request(backend)?;
3686 Ok(submit_ready_device(session, |session| {
3687 self.decode_region_scaled_to_surface_impl(session, fmt, roi, scale, backend)
3688 }))
3689 }
3690}
3691
3692#[cfg(test)]
3693mod dispatch_tests {
3694 use super::{
3695 format_cuda_idwt_batch_host_trace_row, htj2k_batched_dequant_dispatches,
3696 split_htj2k_subband_decode_dispatches, CudaIdwtBatchHostTraceRow,
3697 };
3698
3699 #[test]
3700 fn htj2k_decode_dispatch_split_separates_ht_and_dequant_counts() {
3701 assert_eq!(split_htj2k_subband_decode_dispatches(0), (0, 0));
3702 assert_eq!(split_htj2k_subband_decode_dispatches(1), (1, 0));
3703 assert_eq!(split_htj2k_subband_decode_dispatches(2), (1, 1));
3704 assert_eq!(split_htj2k_subband_decode_dispatches(3), (2, 1));
3705 }
3706
3707 #[test]
3708 fn htj2k_batched_dequant_dispatch_count_is_one_for_any_non_empty_batch() {
3709 assert_eq!(htj2k_batched_dequant_dispatches(0), 0);
3710 assert_eq!(htj2k_batched_dequant_dispatches(1), 1);
3711 assert_eq!(htj2k_batched_dequant_dispatches(48), 1);
3712 }
3713
3714 #[test]
3715 fn cuda_idwt_batch_host_trace_row_reports_host_split() {
3716 let row = CudaIdwtBatchHostTraceRow {
3717 component_count: 327,
3718 step_count: 5,
3719 output_alloc_us: 11,
3720 target_build_us: 22,
3721 enqueue_us: 33,
3722 output_take_count: 1635,
3723 output_pool_reuse_count: 1600,
3724 output_pool_alloc_count: 35,
3725 output_pool_scanned_count: 2400,
3726 output_pool_max_free_count: 1700,
3727 output_requested_bytes: 28,
3728 };
3729
3730 assert_eq!(
3731 format_cuda_idwt_batch_host_trace_row(row),
3732 "j2k_profile codec=j2k op=cuda_idwt_batch_host path=decode component_count=327 step_count=5 output_alloc_us=11 target_build_us=22 enqueue_us=33 output_take_count=1635 output_pool_reuse_count=1600 output_pool_alloc_count=35 output_pool_scanned_count=2400 output_pool_max_free_count=1700 output_requested_bytes=28"
3733 );
3734 }
3735}