j2k_metal/encode/stats.rs
1// SPDX-License-Identifier: Apache-2.0
2
3use std::time::Duration;
4
5#[cfg(target_os = "macos")]
6use crate::compute;
7
8use super::MetalLosslessBufferEncodeOutcome;
9
/// Optional per-stage timings and counters for the resident Metal encode path.
///
/// API note: values of this diagnostic report are built by this crate. The
/// struct is not marked `#[non_exhaustive]`, yet adapter releases may still
/// add diagnostic fields as the resident encode path gains more profiling
/// detail.
///
/// Timing fields are host-side `Instant` buckets meant for RCA unless the
/// field's own documentation says otherwise; do not read them as exact GPU
/// execution elapsed time.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MetalLosslessEncodeStageStats {
    /// Time taken to plan the resident encode batch.
    pub plan_duration: Duration,
    /// Time taken to prepare Metal work and submit it.
    pub prepare_submit_duration: Duration,
    /// Host-side wall time for resident encode coefficient preparation.
    pub coefficient_prep_duration: Duration,
    /// Placeholder for a future, finer-grained deinterleave plus RCT bucket.
    ///
    /// Resident prep timing is currently reported through
    /// `coefficient_prep_duration`.
    pub deinterleave_rct_duration: Duration,
    /// Placeholder for a future, finer-grained forward 5/3 DWT bucket.
    ///
    /// Resident prep timing is currently reported through
    /// `coefficient_prep_duration`.
    pub dwt53_duration: Duration,
    /// Placeholder for a future, finer-grained coefficient extraction bucket.
    ///
    /// Resident prep timing is currently reported through
    /// `coefficient_prep_duration`.
    pub coefficient_extract_duration: Duration,
    /// Time taken to build HT lookup tables.
    pub ht_table_build_duration: Duration,
    /// Time taken to allocate HT output buffers.
    pub ht_buffer_allocation_duration: Duration,
    /// Host-side Metal command encoding time for HT resident command buffers.
    ///
    /// Equals the sum of the split command-encode buckets that follow; it is
    /// not GPU kernel execution elapsed time.
    pub ht_command_encode_duration: Duration,
    /// Host-side Metal command encoding time for setting up HT code-block dispatch.
    pub ht_block_encode_duration: Duration,
    /// CPU-side time for setting up classic Tier-1 batch jobs and buffers.
    pub classic_tier1_setup_duration: Duration,
    /// Host-side Metal command encoding time for setting up classic code-block dispatch.
    pub classic_block_encode_duration: Duration,
    /// Host-side CPU time for packing compact classic Tier-1 tokens.
    ///
    /// Only populated when `J2K_METAL_PROFILE_CLASSIC_TIER1_TOKEN_PACK=1` is
    /// enabled.
    pub classic_tier1_token_pack_duration: Duration,
    /// CPU-side time for planning packet metadata of classic resident batches.
    pub classic_packet_plan_duration: Duration,
    /// CPU-side time for setting up packet/codestream buffers of classic resident batches.
    pub classic_packet_buffer_setup_duration: Duration,
    /// Host-side time for committing split classic resident command buffers.
    pub classic_command_buffer_commit_duration: Duration,
    /// Host-side wall time for harvesting completed resident batch results.
    pub result_harvest_duration: Duration,
    /// Host-side time for copying shared status buffers into CPU-owned status arrays.
    pub result_status_copy_duration: Duration,
    /// Host-side time for handing private buffers back to the resident buffer pool.
    pub result_private_recycle_duration: Duration,
    /// Host-side time for handing shared buffers back to the resident buffer pool.
    pub result_shared_recycle_duration: Duration,
    /// Host-side time for validating per-tile status and building codestream handles.
    pub result_codestream_collect_duration: Duration,
    /// Host-side Metal command encoding time for setting up packet block metadata dispatch.
    pub packet_block_prep_duration: Duration,
    /// Host-side Metal command encoding time for setting up packet body dispatch.
    pub packetization_duration: Duration,
    /// Host-side Metal command encoding time for setting up codestream assembly dispatch.
    pub codestream_assembly_duration: Duration,
    /// GPU time for preparing resident coefficient buffers.
    ///
    /// With stage profiling enabled this covers the command buffer holding
    /// resident input deinterleave/RCT, DWT, and coefficient extraction.
    pub coefficient_prep_gpu_duration: Duration,
    /// GPU time for deinterleaving resident input planes and applying RCT.
    ///
    /// Only populated when resident coefficient-prep split profiling is enabled.
    pub coefficient_deinterleave_rct_gpu_duration: Duration,
    /// GPU time for the resident forward DWT 5/3 coefficient prep.
    ///
    /// Only populated when resident coefficient-prep split profiling is enabled.
    pub coefficient_dwt53_gpu_duration: Duration,
    /// GPU time for the vertical passes of the resident forward DWT 5/3.
    ///
    /// Only populated when resident coefficient-prep split profiling is enabled.
    pub coefficient_dwt53_vertical_gpu_duration: Duration,
    /// GPU time for the horizontal passes of the resident forward DWT 5/3.
    ///
    /// Only populated when resident coefficient-prep split profiling is enabled.
    pub coefficient_dwt53_horizontal_gpu_duration: Duration,
    /// GPU time for extracting resident code-block coefficients.
    ///
    /// Only populated when resident coefficient-prep split profiling is enabled.
    pub coefficient_extract_gpu_duration: Duration,
    /// GPU time for copying per-tile coefficient buffers into a batch buffer.
    ///
    /// Only populated when resident split-command profiling is enabled.
    pub coefficient_copy_gpu_duration: Duration,
    /// GPU timestamp window elapsed across the resident encode command buffers.
    ///
    /// Computed as `max(GPUEndTime) - min(GPUStartTime)` over the command
    /// buffers retained by the batch. It accompanies the summed GPU busy rows
    /// as a wall window and must not be added to per-stage GPU durations.
    pub gpu_elapsed_wall_duration: Duration,
    /// GPU time for the classic Tier-1 code-block encode command.
    ///
    /// Only populated when classic split-command profiling is enabled.
    pub classic_block_gpu_duration: Duration,
    /// GPU time for the profile-only classic Tier-1 density probe.
    ///
    /// Only populated when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_DENSITY=1` are both enabled.
    pub classic_tier1_density_gpu_duration: Duration,
    /// GPU time for the profile-only classic Tier-1 raw bypass packing probe.
    ///
    /// Only populated when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_RAW_PACK=1` are both enabled.
    pub classic_tier1_raw_pack_gpu_duration: Duration,
    /// GPU time for the profile-only classic Tier-1 MQ arithmetic packing probe.
    ///
    /// Only populated when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_ARITHMETIC_PACK=1` are both enabled.
    pub classic_tier1_arithmetic_pack_gpu_duration: Duration,
    /// GPU time for the profile-only classic Tier-1 ordered symbol-plan probe.
    ///
    /// Only populated when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_SYMBOL_PLAN=1` are both enabled.
    pub classic_tier1_symbol_plan_gpu_duration: Duration,
    /// GPU time for the profile-only classic Tier-1 pass-plan probe.
    ///
    /// Only populated when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_PASS_PLAN=1` are both enabled.
    pub classic_tier1_pass_plan_gpu_duration: Duration,
    /// GPU time for the profile-only classic Tier-1 compact token-emitter probe.
    ///
    /// Only populated when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_TOKEN_EMIT=1` are both enabled.
    pub classic_tier1_token_emit_gpu_duration: Duration,
    /// GPU time for the profile-only classic Tier-1 split MQ/raw token-emitter probe.
    ///
    /// Only populated when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_SPLIT_TOKEN_EMIT=1` are both enabled.
    pub classic_tier1_split_token_emit_gpu_duration: Duration,
    /// GPU time for packing compact classic Tier-1 tokens into resident payloads.
    ///
    /// Populated when the gated classic GPU token-pack route is enabled.
    pub classic_tier1_token_pack_gpu_duration: Duration,
    /// GPU time for the HT Tier-1 code-block encode command.
    ///
    /// Only populated when HT split-command profiling is enabled.
    pub ht_block_gpu_duration: Duration,
    /// GPU time for deriving packet-block metadata from HT Tier-1 status.
    ///
    /// Only populated when HT split-command profiling is enabled.
    pub packet_block_prep_gpu_duration: Duration,
    /// GPU time for HTJ2K packetization.
    ///
    /// Only populated when HT split-command profiling is enabled.
    pub packetization_gpu_duration: Duration,
    /// GPU time for copying packet payload bytes after header packetization.
    ///
    /// Only populated when HT split-command profiling is enabled.
    pub packet_payload_copy_gpu_duration: Duration,
    /// GPU time for assembling the HTJ2K codestream buffer.
    ///
    /// Only populated when HT split-command profiling is enabled.
    pub codestream_assembly_gpu_duration: Duration,
    /// GPU time for copying packet payload bytes into final codestream buffers.
    ///
    /// Only populated when HT split-command profiling is enabled.
    pub codestream_payload_copy_gpu_duration: Duration,
    /// Sum of Tier-1 output capacity, in bytes, over resident code blocks.
    pub tier1_output_capacity_total: usize,
    /// Largest Tier-1 output capacity, in bytes, of any resident code block.
    pub max_tier1_output_capacity: usize,
    /// Tier-1 output bytes actually written, summed over resident code blocks.
    pub tier1_output_used_bytes_total: usize,
    /// Largest number of Tier-1 output bytes actually written by any resident code block.
    pub max_tier1_output_used_bytes: usize,
    /// Sum of Tier-1 segment metadata capacity over resident code blocks.
    pub tier1_segment_capacity_total: usize,
    /// Largest Tier-1 segment metadata capacity of any resident code block.
    pub max_tier1_segment_capacity_per_block: usize,
    /// Tier-1 coding passes actually emitted, summed over resident code blocks.
    pub tier1_coding_pass_count_total: usize,
    /// Largest number of Tier-1 coding passes actually emitted by any resident code block.
    pub max_tier1_coding_passes_per_block: usize,
    /// Estimated number of classic MQ/arithmetic coding passes over resident code blocks.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_arithmetic_pass_count_total: usize,
    /// Estimated number of classic raw bypass coding passes over resident code blocks.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_raw_pass_count_total: usize,
    /// Estimated number of classic cleanup passes over resident code blocks.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_cleanup_pass_count_total: usize,
    /// Estimated number of classic significance propagation passes over resident code blocks.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_sigprop_pass_count_total: usize,
    /// Estimated number of classic magnitude refinement passes over resident code blocks.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_magref_pass_count_total: usize,
    /// Estimated number of classic MQ/arithmetic cleanup passes over resident code blocks.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_arithmetic_cleanup_pass_count_total: usize,
    /// Estimated number of classic MQ/arithmetic significance propagation passes.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_arithmetic_sigprop_pass_count_total: usize,
    /// Estimated number of classic MQ/arithmetic magnitude refinement passes.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_arithmetic_magref_pass_count_total: usize,
    /// Estimated number of classic raw bypass significance propagation passes.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_raw_sigprop_pass_count_total: usize,
    /// Estimated number of classic raw bypass magnitude refinement passes.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_raw_magref_pass_count_total: usize,
    /// Estimated number of full coefficient visits made by classic Tier-1 pass scans.
    ///
    /// Derived from the pass counts actually emitted and the code-block
    /// areas. Stays zero for HTJ2K Tier-1.
    pub tier1_full_scan_coeff_visit_count_total: usize,
    /// Estimated number of full coefficient visits made by MQ/arithmetic pass scans.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_arithmetic_scan_coeff_visit_count_total: usize,
    /// Estimated number of full coefficient visits made by raw bypass pass scans.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_raw_scan_coeff_visit_count_total: usize,
    /// Estimated number of full coefficient visits made by cleanup pass scans.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_cleanup_scan_coeff_visit_count_total: usize,
    /// Estimated number of full coefficient visits made by significance propagation scans.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_sigprop_scan_coeff_visit_count_total: usize,
    /// Estimated number of full coefficient visits made by magnitude refinement scans.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub tier1_magref_scan_coeff_visit_count_total: usize,
    /// Largest estimated number of full coefficient scan visits for any classic block.
    ///
    /// Stays zero for HTJ2K Tier-1.
    pub max_tier1_full_scan_coeff_visits_per_block: usize,
    /// Profile-only number of classic significance propagation candidates.
    ///
    /// Only populated when classic Tier-1 density profiling is enabled.
    pub tier1_sigprop_active_candidate_count_total: usize,
    /// Profile-only number of coefficients turning significant during sigprop.
    ///
    /// Only populated when classic Tier-1 density profiling is enabled.
    pub tier1_sigprop_new_significant_count_total: usize,
    /// Profile-only number of classic magnitude refinement candidates.
    ///
    /// Only populated when classic Tier-1 density profiling is enabled.
    pub tier1_magref_active_candidate_count_total: usize,
    /// Profile-only number of arithmetic-coded significance propagation candidates.
    pub tier1_arithmetic_sigprop_active_candidate_count_total: usize,
    /// Profile-only number of coefficients turning significant during arithmetic sigprop.
    pub tier1_arithmetic_sigprop_new_significant_count_total: usize,
    /// Profile-only number of raw bypass significance propagation candidates.
    pub tier1_raw_sigprop_active_candidate_count_total: usize,
    /// Profile-only number of coefficients turning significant during raw sigprop.
    pub tier1_raw_sigprop_new_significant_count_total: usize,
    /// Profile-only number of arithmetic-coded magnitude refinement candidates.
    pub tier1_arithmetic_magref_active_candidate_count_total: usize,
    /// Profile-only number of raw bypass magnitude refinement candidates.
    pub tier1_raw_magref_active_candidate_count_total: usize,
    /// Profile-only number of cleanup-pass coefficient candidates.
    ///
    /// Coefficients represented only by cleanup RLC stripes are excluded.
    pub tier1_cleanup_active_candidate_count_total: usize,
    /// Profile-only number of coefficients turning significant during cleanup.
    ///
    /// Significance discovered through cleanup RLC is included.
    pub tier1_cleanup_new_significant_count_total: usize,
    /// Profile-only number of cleanup stripes encoded through the RLC path.
    pub tier1_cleanup_rlc_stripe_count_total: usize,
    /// Profile-only number of cleanup RLC stripes without any significant coefficient.
    pub tier1_cleanup_rlc_zero_stripe_count_total: usize,
    /// Profile-only exact number of MQ symbols from the ordered symbol-plan probe.
    pub tier1_symbol_plan_mq_symbol_count_total: usize,
    /// Profile-only exact number of raw bypass bits from the ordered symbol-plan probe.
    pub tier1_symbol_plan_raw_bit_count_total: usize,
    /// Largest number of MQ symbols emitted by any block in the ordered symbol-plan probe.
    pub max_tier1_symbol_plan_mq_symbols_per_block: usize,
    /// Largest number of raw bypass bits emitted by any block in the ordered symbol-plan probe.
    pub max_tier1_symbol_plan_raw_bits_per_block: usize,
    /// Estimated compact token bytes required for all blocks in the symbol-plan probe.
    pub tier1_symbol_plan_packed_token_bytes_total: usize,
    /// Largest estimated compact token byte requirement of any single block.
    pub max_tier1_symbol_plan_packed_token_bytes_per_block: usize,
    /// Profile-only exact number of cleanup MQ symbols from the ordered symbol-plan probe.
    pub tier1_symbol_plan_cleanup_mq_symbol_count_total: usize,
    /// Profile-only exact number of sigprop MQ symbols from the ordered symbol-plan probe.
    pub tier1_symbol_plan_sigprop_mq_symbol_count_total: usize,
    /// Profile-only exact number of magref MQ symbols from the ordered symbol-plan probe.
    pub tier1_symbol_plan_magref_mq_symbol_count_total: usize,
    /// Profile-only exact number of raw sigprop bits from the ordered symbol-plan probe.
    pub tier1_symbol_plan_raw_sigprop_bit_count_total: usize,
    /// Profile-only exact number of raw magref bits from the ordered symbol-plan probe.
    pub tier1_symbol_plan_raw_magref_bit_count_total: usize,
    /// Profile-only number of cleanup sign symbols from the ordered symbol-plan probe.
    pub tier1_symbol_plan_cleanup_sign_symbol_count_total: usize,
    /// Profile-only number of sigprop sign symbols from the ordered symbol-plan probe.
    pub tier1_symbol_plan_sigprop_sign_symbol_count_total: usize,
    /// XOR over the per-block order-sensitive MQ symbol hashes of the symbol-plan probe.
    pub tier1_symbol_plan_mq_symbol_hash_xor: usize,
    /// XOR over the per-block order-sensitive raw bit hashes of the symbol-plan probe.
    pub tier1_symbol_plan_raw_bit_hash_xor: usize,
    /// Profile-only number of MQ symbols, counted by coding-pass index.
    pub tier1_pass_plan_mq_symbol_count_total: usize,
    /// Profile-only number of raw bypass bits, counted by coding-pass index.
    pub tier1_pass_plan_raw_bit_count_total: usize,
    /// Number of block-local coding passes emitting one or more MQ symbols.
    pub tier1_pass_plan_nonempty_mq_pass_count_total: usize,
    /// Number of block-local coding passes emitting one or more raw bypass bits.
    pub tier1_pass_plan_nonempty_raw_pass_count_total: usize,
    /// Largest number of MQ symbols emitted by a single block-local coding pass.
    pub max_tier1_pass_plan_mq_symbols_per_pass: usize,
    /// Largest number of raw bypass bits emitted by a single block-local coding pass.
    pub max_tier1_pass_plan_raw_bits_per_pass: usize,
    /// Exact number of MQ symbols from the compact token-emitter probe or gated GPU token-pack route.
    pub tier1_token_emit_mq_symbol_count_total: usize,
    /// Exact number of raw bypass bits from the compact token-emitter probe or gated GPU token-pack route.
    pub tier1_token_emit_raw_bit_count_total: usize,
    /// Compact token bytes produced by the token-emitter probe or gated GPU token-pack route.
    pub tier1_token_emit_token_bytes_total: usize,
    /// Largest number of compact token bytes produced by any single block.
    pub max_tier1_token_emit_token_bytes_per_block: usize,
    /// Segment records produced by the token-emitter probe or gated GPU token-pack route.
    pub tier1_token_emit_segment_count_total: usize,
    /// Largest number of token-emitter segment records of any single block.
    pub max_tier1_token_emit_segments_per_block: usize,
    /// XOR over the per-block order-sensitive MQ symbol hashes of token emission.
    pub tier1_token_emit_mq_symbol_hash_xor: usize,
    /// XOR over the per-block order-sensitive raw bit hashes of token emission.
    pub tier1_token_emit_raw_bit_hash_xor: usize,
    /// Sum of the bytes produced by packing emitted Tier-1 tokens.
    pub tier1_token_pack_output_bytes_total: usize,
    /// Largest number of token-pack output bytes of any single block.
    pub max_tier1_token_pack_output_bytes_per_block: usize,
    /// Number of resident Tier-1 code blocks that emitted one or more coding passes.
    pub tier1_nonzero_block_count_total: usize,
    /// Number of resident Tier-1 code blocks that emitted no coding pass.
    pub tier1_zero_block_count_total: usize,
    /// Missing most-significant bitplanes, summed over resident Tier-1 code blocks.
    pub tier1_missing_bitplane_count_total: usize,
    /// Largest number of missing most-significant bitplanes of any resident code block.
    pub max_tier1_missing_bitplanes_per_block: usize,
    /// Classic Tier-1 segment records emitted, summed over resident code blocks.
    ///
    /// Stays zero for HTJ2K Tier-1, which does not use classic segment
    /// records.
    pub tier1_segment_count_total: usize,
    /// Largest number of classic Tier-1 segment records emitted by any resident code block.
    pub max_tier1_segments_per_block: usize,
    /// Sum of host-planned packet payload-copy job slots over resident chunks.
    pub packet_payload_copy_job_capacity_total: usize,
    /// Largest number of packet payload-copy job slots needed by any tile in the batch.
    pub max_packet_payload_copy_jobs_per_tile: usize,
    /// Packet payload-copy jobs actually emitted by packetization, summed over resident chunks.
    pub packet_payload_copy_job_count_total: usize,
    /// Largest number of packet payload-copy jobs actually emitted by any tile in the batch.
    pub max_packet_payload_copy_jobs_used_per_tile: usize,
    /// Packet payload-copy bytes actually emitted by packetization, summed over resident chunks.
    pub packet_payload_copy_bytes_total: usize,
    /// Largest number of packet payload-copy bytes actually emitted by any tile in the batch.
    pub max_packet_payload_copy_bytes_per_tile: usize,
    /// Number of packet payload-copy jobs no larger than one copy-kernel stripe.
    pub packet_payload_copy_small_job_count_total: usize,
    /// Number of packet payload-copy jobs larger than one stripe but no larger than 512 bytes.
    pub packet_payload_copy_medium_job_count_total: usize,
    /// Number of packet payload-copy jobs larger than 512 bytes.
    pub packet_payload_copy_large_job_count_total: usize,
    /// Number of packet payload-copy stripes launched by the copy kernel.
    pub packet_payload_copy_launched_stripe_count_total: usize,
    /// Number of packet payload-copy stripes that correspond to emitted copy jobs.
    pub packet_payload_copy_active_stripe_count_total: usize,
    /// Sum of packet output capacity, in bytes, over resident chunks.
    pub packet_output_capacity_total: usize,
    /// Largest packet output capacity, in bytes, of any tile in the batch.
    pub max_packet_output_capacity: usize,
    /// Packet output bytes actually written by packetization, summed over resident chunks.
    pub packet_output_used_bytes_total: usize,
    /// Largest number of packet output bytes actually written by any tile in the batch.
    pub max_packet_output_used_bytes: usize,
    /// Codestream payload-copy volume, in bytes, summed over resident chunks.
    pub codestream_payload_copy_bytes_total: usize,
    /// Number of codestream payload-copy threads launched by the copy kernel.
    pub codestream_payload_copy_launched_thread_count_total: usize,
    /// Estimated number of codestream payload-copy threads with in-range bytes to copy.
    pub codestream_payload_copy_active_thread_count_total: usize,
    /// Time spent waiting on codestream buffers.
    pub codestream_wait_duration: Duration,
    /// RCA-named alias of `codestream_wait_duration`.
    ///
    /// It is not an independent bucket: never sum it together with
    /// `codestream_wait_duration`.
    pub sync_wait_duration: Duration,
    /// Time spent turning buffer-backed codestream bytes into host bytes.
    ///
    /// Current batch stats paths may leave this at zero. Where applicable,
    /// host byte materialization timing is surfaced on
    /// `MetalLosslessEncodeOutcome`; this stage-stats bucket is reserved for
    /// stats-bearing host-output paths.
    pub host_readback_duration: Duration,
    /// Count of resident encode chunks.
    pub chunk_count: usize,
    /// Count of encoded tiles.
    pub tile_count: usize,
    /// Count of encoded code blocks.
    pub code_block_count: usize,
}
437
/// Per-field merge rule used by
/// [`MetalLosslessEncodeStageStats::add_assign`]. The leading class token
/// selects the rule: `dur` and `count` accumulate with a saturating add,
/// `max` retains the larger of the two values, and `xor` folds hash values
/// together.
macro_rules! stage_stat_combine {
    // Shared implementation of the two saturating-add classes.
    (@saturating $acc:ident, $rhs:ident, $name:ident) => {
        $acc.$name = $acc.$name.saturating_add($rhs.$name);
    };
    (dur, $acc:ident, $rhs:ident, $name:ident) => {
        stage_stat_combine!(@saturating $acc, $rhs, $name);
    };
    (count, $acc:ident, $rhs:ident, $name:ident) => {
        stage_stat_combine!(@saturating $acc, $rhs, $name);
    };
    (max, $acc:ident, $rhs:ident, $name:ident) => {
        $acc.$name = Ord::max($acc.$name, $rhs.$name);
    };
    (xor, $acc:ident, $rhs:ident, $name:ident) => {
        $acc.$name = $acc.$name ^ $rhs.$name;
    };
}
455
/// Per-field contribution to
/// [`MetalLosslessEncodeStageStats::has_timings`]. A `dur` field raises the
/// flag when it holds a non-zero duration; every other class expands to
/// nothing and leaves the flag untouched.
macro_rules! stage_stat_timing_flag {
    (dur, $seen:ident, $stats:ident, $name:ident) => {
        if $stats.$name > Duration::ZERO {
            $seen = true;
        }
    };
    ($ignored:ident, $seen:ident, $stats:ident, $name:ident) => {};
}
464
/// Per-field rule for the `From<compute::J2kResidentEncodeStageStats>`
/// conversion. A `local` field exists only on the facade side and is left at
/// its default; a `resident` field is copied over from the same-named field
/// of the compute-layer stats.
#[cfg(target_os = "macos")]
macro_rules! stage_stat_from_resident {
    (local, $dest:ident, $src:ident, $name:ident) => {};
    (resident, $dest:ident, $src:ident, $name:ident) => {
        $dest.$name = $src.$name;
    };
}
475
/// Expand the field table into the per-field
/// `MetalLosslessEncodeStageStats` impls. Each table entry has the form
/// `(field, combine class, conversion source)`. The closing destructuring
/// pattern lists every table entry without a `..` rest pattern, which keeps
/// the table exhaustive: a struct field with no table entry fails to compile.
macro_rules! j2k_metal_stage_stats_impls {
    ($(($name:ident, $kind:ident, $origin:ident)),* $(,)?) => {
        impl MetalLosslessEncodeStageStats {
            /// Report whether at least one timing field holds a non-zero duration.
            pub fn has_timings(&self) -> bool {
                let mut timing_seen = false;
                $(stage_stat_timing_flag!($kind, timing_seen, self, $name);)*
                timing_seen
            }

            /// Fold `other` into `self` field by field, applying each
            /// field's combine class: saturating addition for durations and
            /// counters, maximum for `max` fields, XOR for hash fields.
            pub fn add_assign(&mut self, other: Self) {
                $(stage_stat_combine!($kind, self, other, $name);)*
            }
        }

        #[cfg(target_os = "macos")]
        impl From<compute::J2kResidentEncodeStageStats> for MetalLosslessEncodeStageStats {
            fn from(stats: compute::J2kResidentEncodeStageStats) -> Self {
                let mut converted = Self::default();
                $(stage_stat_from_resident!($origin, converted, stats, $name);)*
                converted
            }
        }

        // Compile-time exhaustiveness check; the closure is never called.
        const _: fn(MetalLosslessEncodeStageStats) = |every_field| {
            let MetalLosslessEncodeStageStats { $($name: _),* } = every_field;
        };
    };
}
509
510j2k_metal_stage_stats_impls! {
511 (plan_duration, dur, local),
512 (prepare_submit_duration, dur, local),
513 (coefficient_prep_duration, dur, resident),
514 (deinterleave_rct_duration, dur, resident),
515 (dwt53_duration, dur, resident),
516 (coefficient_extract_duration, dur, resident),
517 (ht_table_build_duration, dur, resident),
518 (ht_buffer_allocation_duration, dur, resident),
519 (ht_command_encode_duration, dur, resident),
520 (ht_block_encode_duration, dur, resident),
521 (classic_tier1_setup_duration, dur, resident),
522 (classic_block_encode_duration, dur, resident),
523 (classic_tier1_token_pack_duration, dur, resident),
524 (classic_packet_plan_duration, dur, resident),
525 (classic_packet_buffer_setup_duration, dur, resident),
526 (classic_command_buffer_commit_duration, dur, resident),
527 (result_harvest_duration, dur, resident),
528 (result_status_copy_duration, dur, resident),
529 (result_private_recycle_duration, dur, resident),
530 (result_shared_recycle_duration, dur, resident),
531 (result_codestream_collect_duration, dur, resident),
532 (packet_block_prep_duration, dur, resident),
533 (packetization_duration, dur, resident),
534 (codestream_assembly_duration, dur, resident),
535 (coefficient_prep_gpu_duration, dur, resident),
536 (coefficient_deinterleave_rct_gpu_duration, dur, resident),
537 (coefficient_dwt53_gpu_duration, dur, resident),
538 (coefficient_dwt53_vertical_gpu_duration, dur, resident),
539 (coefficient_dwt53_horizontal_gpu_duration, dur, resident),
540 (coefficient_extract_gpu_duration, dur, resident),
541 (coefficient_copy_gpu_duration, dur, resident),
542 (gpu_elapsed_wall_duration, dur, resident),
543 (classic_block_gpu_duration, dur, resident),
544 (classic_tier1_density_gpu_duration, dur, resident),
545 (classic_tier1_raw_pack_gpu_duration, dur, resident),
546 (classic_tier1_arithmetic_pack_gpu_duration, dur, resident),
547 (classic_tier1_symbol_plan_gpu_duration, dur, resident),
548 (classic_tier1_pass_plan_gpu_duration, dur, resident),
549 (classic_tier1_token_emit_gpu_duration, dur, resident),
550 (classic_tier1_split_token_emit_gpu_duration, dur, resident),
551 (classic_tier1_token_pack_gpu_duration, dur, resident),
552 (ht_block_gpu_duration, dur, resident),
553 (packet_block_prep_gpu_duration, dur, resident),
554 (packetization_gpu_duration, dur, resident),
555 (packet_payload_copy_gpu_duration, dur, resident),
556 (codestream_assembly_gpu_duration, dur, resident),
557 (codestream_payload_copy_gpu_duration, dur, resident),
558 (tier1_output_capacity_total, count, resident),
559 (max_tier1_output_capacity, max, resident),
560 (tier1_output_used_bytes_total, count, resident),
561 (max_tier1_output_used_bytes, max, resident),
562 (tier1_segment_capacity_total, count, resident),
563 (max_tier1_segment_capacity_per_block, max, resident),
564 (tier1_coding_pass_count_total, count, resident),
565 (max_tier1_coding_passes_per_block, max, resident),
566 (tier1_arithmetic_pass_count_total, count, resident),
567 (tier1_raw_pass_count_total, count, resident),
568 (tier1_cleanup_pass_count_total, count, resident),
569 (tier1_sigprop_pass_count_total, count, resident),
570 (tier1_magref_pass_count_total, count, resident),
571 (tier1_arithmetic_cleanup_pass_count_total, count, resident),
572 (tier1_arithmetic_sigprop_pass_count_total, count, resident),
573 (tier1_arithmetic_magref_pass_count_total, count, resident),
574 (tier1_raw_sigprop_pass_count_total, count, resident),
575 (tier1_raw_magref_pass_count_total, count, resident),
576 (tier1_full_scan_coeff_visit_count_total, count, resident),
577 (tier1_arithmetic_scan_coeff_visit_count_total, count, resident),
578 (tier1_raw_scan_coeff_visit_count_total, count, resident),
579 (tier1_cleanup_scan_coeff_visit_count_total, count, resident),
580 (tier1_sigprop_scan_coeff_visit_count_total, count, resident),
581 (tier1_magref_scan_coeff_visit_count_total, count, resident),
582 (max_tier1_full_scan_coeff_visits_per_block, max, resident),
583 (tier1_sigprop_active_candidate_count_total, count, resident),
584 (tier1_sigprop_new_significant_count_total, count, resident),
585 (tier1_magref_active_candidate_count_total, count, resident),
586 (tier1_arithmetic_sigprop_active_candidate_count_total, count, resident),
587 (tier1_arithmetic_sigprop_new_significant_count_total, count, resident),
588 (tier1_raw_sigprop_active_candidate_count_total, count, resident),
589 (tier1_raw_sigprop_new_significant_count_total, count, resident),
590 (tier1_arithmetic_magref_active_candidate_count_total, count, resident),
591 (tier1_raw_magref_active_candidate_count_total, count, resident),
592 (tier1_cleanup_active_candidate_count_total, count, resident),
593 (tier1_cleanup_new_significant_count_total, count, resident),
594 (tier1_cleanup_rlc_stripe_count_total, count, resident),
595 (tier1_cleanup_rlc_zero_stripe_count_total, count, resident),
596 (tier1_symbol_plan_mq_symbol_count_total, count, resident),
597 (tier1_symbol_plan_raw_bit_count_total, count, resident),
598 (max_tier1_symbol_plan_mq_symbols_per_block, max, resident),
599 (max_tier1_symbol_plan_raw_bits_per_block, max, resident),
600 (tier1_symbol_plan_packed_token_bytes_total, count, resident),
601 (max_tier1_symbol_plan_packed_token_bytes_per_block, max, resident),
602 (tier1_symbol_plan_cleanup_mq_symbol_count_total, count, resident),
603 (tier1_symbol_plan_sigprop_mq_symbol_count_total, count, resident),
604 (tier1_symbol_plan_magref_mq_symbol_count_total, count, resident),
605 (tier1_symbol_plan_raw_sigprop_bit_count_total, count, resident),
606 (tier1_symbol_plan_raw_magref_bit_count_total, count, resident),
607 (tier1_symbol_plan_cleanup_sign_symbol_count_total, count, resident),
608 (tier1_symbol_plan_sigprop_sign_symbol_count_total, count, resident),
609 (tier1_symbol_plan_mq_symbol_hash_xor, xor, resident),
610 (tier1_symbol_plan_raw_bit_hash_xor, xor, resident),
611 (tier1_pass_plan_mq_symbol_count_total, count, resident),
612 (tier1_pass_plan_raw_bit_count_total, count, resident),
613 (tier1_pass_plan_nonempty_mq_pass_count_total, count, resident),
614 (tier1_pass_plan_nonempty_raw_pass_count_total, count, resident),
615 (max_tier1_pass_plan_mq_symbols_per_pass, max, resident),
616 (max_tier1_pass_plan_raw_bits_per_pass, max, resident),
617 (tier1_token_emit_mq_symbol_count_total, count, resident),
618 (tier1_token_emit_raw_bit_count_total, count, resident),
619 (tier1_token_emit_token_bytes_total, count, resident),
620 (max_tier1_token_emit_token_bytes_per_block, max, resident),
621 (tier1_token_emit_segment_count_total, count, resident),
622 (max_tier1_token_emit_segments_per_block, max, resident),
623 (tier1_token_emit_mq_symbol_hash_xor, xor, resident),
624 (tier1_token_emit_raw_bit_hash_xor, xor, resident),
625 (tier1_token_pack_output_bytes_total, count, resident),
626 (max_tier1_token_pack_output_bytes_per_block, max, resident),
627 (tier1_nonzero_block_count_total, count, resident),
628 (tier1_zero_block_count_total, count, resident),
629 (tier1_missing_bitplane_count_total, count, resident),
630 (max_tier1_missing_bitplanes_per_block, max, resident),
631 (tier1_segment_count_total, count, resident),
632 (max_tier1_segments_per_block, max, resident),
633 (packet_payload_copy_job_capacity_total, count, resident),
634 (max_packet_payload_copy_jobs_per_tile, max, resident),
635 (packet_payload_copy_job_count_total, count, resident),
636 (max_packet_payload_copy_jobs_used_per_tile, max, resident),
637 (packet_payload_copy_bytes_total, count, resident),
638 (max_packet_payload_copy_bytes_per_tile, max, resident),
639 (packet_payload_copy_small_job_count_total, count, resident),
640 (packet_payload_copy_medium_job_count_total, count, resident),
641 (packet_payload_copy_large_job_count_total, count, resident),
642 (packet_payload_copy_launched_stripe_count_total, count, resident),
643 (packet_payload_copy_active_stripe_count_total, count, resident),
644 (packet_output_capacity_total, count, resident),
645 (max_packet_output_capacity, max, resident),
646 (packet_output_used_bytes_total, count, resident),
647 (max_packet_output_used_bytes, max, resident),
648 (codestream_payload_copy_bytes_total, count, resident),
649 (codestream_payload_copy_launched_thread_count_total, count, resident),
650 (codestream_payload_copy_active_thread_count_total, count, resident),
651 (codestream_wait_duration, dur, local),
652 (sync_wait_duration, dur, local),
653 (host_readback_duration, dur, local),
654 (chunk_count, count, local),
655 (tile_count, count, local),
656 (code_block_count, count, resident),
657}
658
/// Accumulates resident coefficient-prep time into the batch stage stats.
///
/// When `profile_stages` is `false` the call is a no-op and `stats` is left
/// untouched. Otherwise `duration` is added to
/// `stats.stage_stats.coefficient_prep_duration` using saturating arithmetic,
/// so the bucket clamps at `Duration::MAX` instead of panicking on overflow.
#[cfg(any(target_os = "macos", test))]
pub(super) fn add_resident_prep_duration(
    stats: &mut MetalLosslessEncodeBatchStats,
    duration: Duration,
    profile_stages: bool,
) {
    if profile_stages {
        // Borrow the bucket once so the read-modify-write stays on one line.
        let prep_bucket = &mut stats.stage_stats.coefficient_prep_duration;
        *prep_bucket = prep_bucket.saturating_add(duration);
    }
}
673
/// Records a resident-prep wall-clock measurement in the batch stage stats.
///
/// This forwards `wall_duration` and `profile_stages` unchanged to
/// `add_resident_prep_duration`, so both entry points feed the same
/// accumulator and share the same `profile_stages` gating.
#[cfg(any(target_os = "macos", test))]
pub(super) fn add_resident_prep_wall_duration(
    stats: &mut MetalLosslessEncodeBatchStats,
    wall_duration: Duration,
    profile_stages: bool,
) {
    // Pure delegation: no wall-time-specific handling happens here.
    add_resident_prep_duration(stats, wall_duration, profile_stages);
}
682
/// Resolved resident Metal lossless J2K/HTJ2K tile batch encode metrics.
///
/// Pairs the configured and effective batch limits (in-flight tiles and
/// memory budget) with what was observed during the encode, plus the
/// per-stage timing summary in `stage_stats`.
///
/// The type is `Copy` and derives `Default`; a default value has zero counts,
/// zero durations, and `None` for both caller-requested settings.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MetalLosslessEncodeBatchStats {
    /// Caller-requested maximum number of in-flight tiles.
    ///
    /// NOTE(review): presumably `None` when the caller supplied no explicit
    /// limit; confirm against the code that fills this report.
    pub configured_inflight_tiles: Option<usize>,
    /// Effective maximum number of in-flight tiles after clamping.
    pub effective_inflight_tiles: usize,
    /// Caller-requested resident encode memory budget in bytes.
    ///
    /// NOTE(review): presumably `None` when the caller supplied no explicit
    /// budget; confirm against the code that fills this report.
    pub configured_memory_budget_bytes: Option<usize>,
    /// Effective resident encode memory budget in bytes.
    pub effective_memory_budget_bytes: usize,
    /// Estimated peak resident memory required per tile, in bytes.
    pub estimated_peak_bytes_per_tile: usize,
    /// Maximum observed in-flight tiles during the batch.
    pub max_observed_inflight_tiles: usize,
    /// End-to-end wall time for the batch encode.
    pub encode_wall_duration: Duration,
    /// Resident encode stage timing summary.
    pub stage_stats: MetalLosslessEncodeStageStats,
}
703
/// Resident Metal lossless J2K/HTJ2K tile batch output and batch-level metrics.
///
/// Bundles the per-tile encode outcomes with the single
/// [`MetalLosslessEncodeBatchStats`] report that describes the whole batch.
///
/// NOTE(review): unlike the stats types in this module, this struct derives
/// no traits (not even `Debug`). Whether `MetalLosslessBufferEncodeOutcome`
/// could support such derives is not visible from this file.
pub struct MetalLosslessBufferEncodeBatchOutcome {
    /// Per-tile buffer-backed encode outcomes.
    pub outcomes: Vec<MetalLosslessBufferEncodeOutcome>,
    /// Batch-level resident encode metrics.
    pub stats: MetalLosslessEncodeBatchStats,
}
710}