Skip to main content

j2k_metal/encode/
stats.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use std::time::Duration;
4
5#[cfg(target_os = "macos")]
6use crate::compute;
7
8use super::MetalLosslessBufferEncodeOutcome;
9
/// Optional resident Metal encode stage timings and counters.
///
/// API note: this diagnostic report is constructed by this crate. It is not
/// `#[non_exhaustive]`, but adapter releases may add diagnostic fields as the
/// resident encode path gains more profiling detail.
///
/// Unless a field explicitly says otherwise, timing fields are host-side
/// `Instant` buckets for RCA and should not be read as exact GPU execution
/// elapsed time.
///
/// Maintenance note: every field declared here must also appear in the field
/// table passed to `j2k_metal_stage_stats_impls!` in this file. That table
/// generates `has_timings`, `add_assign`, and the macOS-only `From`
/// conversion, and a field missing from it fails to compile.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MetalLosslessEncodeStageStats {
    /// Time spent planning the resident encode batch.
    pub plan_duration: Duration,
    /// Time spent preparing and submitting Metal work.
    pub prepare_submit_duration: Duration,
    /// Host-side wall time spent preparing resident encode coefficients.
    pub coefficient_prep_duration: Duration,
    /// Reserved for future finer-grained deinterleave plus RCT profiling.
    ///
    /// Current resident prep timing is reported in `coefficient_prep_duration`.
    pub deinterleave_rct_duration: Duration,
    /// Reserved for future finer-grained forward 5/3 DWT profiling.
    ///
    /// Current resident prep timing is reported in `coefficient_prep_duration`.
    pub dwt53_duration: Duration,
    /// Reserved for future finer-grained coefficient extraction profiling.
    ///
    /// Current resident prep timing is reported in `coefficient_prep_duration`.
    pub coefficient_extract_duration: Duration,
    /// Time spent building HT lookup tables.
    pub ht_table_build_duration: Duration,
    /// Time spent allocating HT output buffers.
    pub ht_buffer_allocation_duration: Duration,
    /// Host-side Metal command encoding time for HT resident command buffers.
    ///
    /// This is the sum of the split command-encode buckets below and is not GPU
    /// kernel execution elapsed time.
    pub ht_command_encode_duration: Duration,
    /// Host-side Metal command encoding time for HT code-block dispatch setup.
    pub ht_block_encode_duration: Duration,
    /// CPU-side setup time for classic Tier-1 batch jobs and buffers.
    pub classic_tier1_setup_duration: Duration,
    /// Host-side Metal command encoding time for classic code-block dispatch setup.
    pub classic_block_encode_duration: Duration,
    /// Host-side CPU time spent packing compact classic Tier-1 tokens.
    ///
    /// This is populated only when
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_TOKEN_PACK=1` is enabled.
    pub classic_tier1_token_pack_duration: Duration,
    /// CPU-side packet metadata planning time for classic resident batches.
    pub classic_packet_plan_duration: Duration,
    /// CPU-side packet/codestream buffer setup time for classic resident batches.
    pub classic_packet_buffer_setup_duration: Duration,
    /// Host-side time spent committing split classic resident command buffers.
    pub classic_command_buffer_commit_duration: Duration,
    /// Host-side wall time spent harvesting completed resident batch results.
    pub result_harvest_duration: Duration,
    /// Host-side time spent copying shared status buffers into CPU-owned status arrays.
    pub result_status_copy_duration: Duration,
    /// Host-side time spent returning private buffers to the resident buffer pool.
    pub result_private_recycle_duration: Duration,
    /// Host-side time spent returning shared buffers to the resident buffer pool.
    pub result_shared_recycle_duration: Duration,
    /// Host-side time spent validating per-tile status and building codestream handles.
    pub result_codestream_collect_duration: Duration,
    /// Host-side Metal command encoding time for packet block metadata dispatch setup.
    pub packet_block_prep_duration: Duration,
    /// Host-side Metal command encoding time for packet body dispatch setup.
    pub packetization_duration: Duration,
    /// Host-side Metal command encoding time for codestream assembly dispatch setup.
    pub codestream_assembly_duration: Duration,
    /// GPU time spent preparing resident coefficient buffers.
    ///
    /// This includes the resident input deinterleave/RCT, DWT, and coefficient
    /// extraction command buffer when stage profiling is enabled.
    pub coefficient_prep_gpu_duration: Duration,
    /// GPU time spent deinterleaving resident input planes and applying RCT.
    ///
    /// This is populated only when resident coefficient-prep split profiling is enabled.
    pub coefficient_deinterleave_rct_gpu_duration: Duration,
    /// GPU time spent running resident forward DWT 5/3 coefficient prep.
    ///
    /// This is populated only when resident coefficient-prep split profiling is enabled.
    pub coefficient_dwt53_gpu_duration: Duration,
    /// GPU time spent in resident forward DWT 5/3 vertical passes.
    ///
    /// This is populated only when resident coefficient-prep split profiling is enabled.
    pub coefficient_dwt53_vertical_gpu_duration: Duration,
    /// GPU time spent in resident forward DWT 5/3 horizontal passes.
    ///
    /// This is populated only when resident coefficient-prep split profiling is enabled.
    pub coefficient_dwt53_horizontal_gpu_duration: Duration,
    /// GPU time spent extracting resident code-block coefficients.
    ///
    /// This is populated only when resident coefficient-prep split profiling is enabled.
    pub coefficient_extract_gpu_duration: Duration,
    /// GPU time spent copying per-tile coefficient buffers into a batch buffer.
    ///
    /// This is populated only when resident split-command profiling is enabled.
    pub coefficient_copy_gpu_duration: Duration,
    /// Elapsed GPU timestamp window across the resident encode command buffers.
    ///
    /// This is `max(GPUEndTime) - min(GPUStartTime)` for the command buffers
    /// retained by the batch. It is a wall-window companion to summed GPU busy
    /// rows and should not be added to per-stage GPU durations.
    ///
    /// `add_assign` combines this field by saturating addition like every other
    /// duration, so an accumulated value is a sum of per-batch windows rather
    /// than a single window spanning all batches.
    pub gpu_elapsed_wall_duration: Duration,
    /// GPU time spent in the classic Tier-1 code-block encode command.
    ///
    /// This is populated only when classic split-command profiling is enabled.
    pub classic_block_gpu_duration: Duration,
    /// GPU time spent in the profile-only classic Tier-1 density probe.
    ///
    /// This is populated only when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_DENSITY=1` are enabled.
    pub classic_tier1_density_gpu_duration: Duration,
    /// GPU time spent in the profile-only classic Tier-1 raw bypass packing probe.
    ///
    /// This is populated only when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_RAW_PACK=1` are enabled.
    pub classic_tier1_raw_pack_gpu_duration: Duration,
    /// GPU time spent in the profile-only classic Tier-1 MQ arithmetic packing probe.
    ///
    /// This is populated only when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_ARITHMETIC_PACK=1` are enabled.
    pub classic_tier1_arithmetic_pack_gpu_duration: Duration,
    /// GPU time spent in the profile-only classic Tier-1 ordered symbol-plan probe.
    ///
    /// This is populated only when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_SYMBOL_PLAN=1` are enabled.
    pub classic_tier1_symbol_plan_gpu_duration: Duration,
    /// GPU time spent in the profile-only classic Tier-1 pass-plan probe.
    ///
    /// This is populated only when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_PASS_PLAN=1` are enabled.
    pub classic_tier1_pass_plan_gpu_duration: Duration,
    /// GPU time spent in the profile-only classic Tier-1 compact token-emitter probe.
    ///
    /// This is populated only when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_TOKEN_EMIT=1` are enabled.
    pub classic_tier1_token_emit_gpu_duration: Duration,
    /// GPU time spent in the profile-only classic Tier-1 split MQ/raw token-emitter probe.
    ///
    /// This is populated only when classic split-command profiling and
    /// `J2K_METAL_PROFILE_CLASSIC_TIER1_SPLIT_TOKEN_EMIT=1` are enabled.
    pub classic_tier1_split_token_emit_gpu_duration: Duration,
    /// GPU time spent packing compact classic Tier-1 tokens into resident payloads.
    ///
    /// This is populated when the gated classic GPU token-pack route is enabled.
    pub classic_tier1_token_pack_gpu_duration: Duration,
    /// GPU time spent in the HT Tier-1 code-block encode command.
    ///
    /// This is populated only when HT split-command profiling is enabled.
    pub ht_block_gpu_duration: Duration,
    /// GPU time spent preparing packet-block metadata from HT Tier-1 status.
    ///
    /// This is populated only when HT split-command profiling is enabled.
    pub packet_block_prep_gpu_duration: Duration,
    /// GPU time spent in HTJ2K packetization.
    ///
    /// This is populated only when HT split-command profiling is enabled.
    pub packetization_gpu_duration: Duration,
    /// GPU time spent copying packet payload bytes after header packetization.
    ///
    /// This is populated only when HT split-command profiling is enabled.
    pub packet_payload_copy_gpu_duration: Duration,
    /// GPU time spent assembling the HTJ2K codestream buffer.
    ///
    /// This is populated only when HT split-command profiling is enabled.
    pub codestream_assembly_gpu_duration: Duration,
    /// GPU time spent copying packet payload bytes into final codestream buffers.
    ///
    /// This is populated only when HT split-command profiling is enabled.
    pub codestream_payload_copy_gpu_duration: Duration,
    /// Total Tier-1 output capacity, in bytes, across resident code blocks.
    pub tier1_output_capacity_total: usize,
    /// Maximum Tier-1 output capacity, in bytes, for any resident code block.
    pub max_tier1_output_capacity: usize,
    /// Actual Tier-1 output bytes written across resident code blocks.
    pub tier1_output_used_bytes_total: usize,
    /// Maximum actual Tier-1 output bytes written by any resident code block.
    pub max_tier1_output_used_bytes: usize,
    /// Total Tier-1 segment metadata capacity across resident code blocks.
    pub tier1_segment_capacity_total: usize,
    /// Maximum Tier-1 segment metadata capacity for any resident code block.
    pub max_tier1_segment_capacity_per_block: usize,
    /// Actual Tier-1 coding passes emitted across resident code blocks.
    pub tier1_coding_pass_count_total: usize,
    /// Maximum actual Tier-1 coding passes emitted by any resident code block.
    pub max_tier1_coding_passes_per_block: usize,
    /// Estimated classic MQ/arithmetic coding passes across resident code blocks.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_arithmetic_pass_count_total: usize,
    /// Estimated classic raw bypass coding passes across resident code blocks.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_raw_pass_count_total: usize,
    /// Estimated classic cleanup passes across resident code blocks.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_cleanup_pass_count_total: usize,
    /// Estimated classic significance propagation passes across resident code blocks.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_sigprop_pass_count_total: usize,
    /// Estimated classic magnitude refinement passes across resident code blocks.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_magref_pass_count_total: usize,
    /// Estimated classic MQ/arithmetic cleanup passes across resident code blocks.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_arithmetic_cleanup_pass_count_total: usize,
    /// Estimated classic MQ/arithmetic significance propagation passes.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_arithmetic_sigprop_pass_count_total: usize,
    /// Estimated classic MQ/arithmetic magnitude refinement passes.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_arithmetic_magref_pass_count_total: usize,
    /// Estimated classic raw bypass significance propagation passes.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_raw_sigprop_pass_count_total: usize,
    /// Estimated classic raw bypass magnitude refinement passes.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_raw_magref_pass_count_total: usize,
    /// Estimated full coefficient visits made by classic Tier-1 pass scans.
    ///
    /// This is derived from actual emitted pass counts and code-block areas.
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_full_scan_coeff_visit_count_total: usize,
    /// Estimated full coefficient visits made by MQ/arithmetic pass scans.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_arithmetic_scan_coeff_visit_count_total: usize,
    /// Estimated full coefficient visits made by raw bypass pass scans.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_raw_scan_coeff_visit_count_total: usize,
    /// Estimated full coefficient visits made by cleanup pass scans.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_cleanup_scan_coeff_visit_count_total: usize,
    /// Estimated full coefficient visits made by significance propagation scans.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_sigprop_scan_coeff_visit_count_total: usize,
    /// Estimated full coefficient visits made by magnitude refinement scans.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub tier1_magref_scan_coeff_visit_count_total: usize,
    /// Maximum estimated full coefficient scan visits for any classic block.
    ///
    /// For HTJ2K Tier-1 this remains zero.
    pub max_tier1_full_scan_coeff_visits_per_block: usize,
    /// Profile-only count of classic significance propagation candidates.
    ///
    /// This is populated only when classic Tier-1 density profiling is enabled.
    pub tier1_sigprop_active_candidate_count_total: usize,
    /// Profile-only count of coefficients that become significant in sigprop.
    ///
    /// This is populated only when classic Tier-1 density profiling is enabled.
    pub tier1_sigprop_new_significant_count_total: usize,
    /// Profile-only count of classic magnitude refinement candidates.
    ///
    /// This is populated only when classic Tier-1 density profiling is enabled.
    pub tier1_magref_active_candidate_count_total: usize,
    /// Profile-only count of arithmetic-coded significance propagation candidates.
    pub tier1_arithmetic_sigprop_active_candidate_count_total: usize,
    /// Profile-only count of coefficients that become significant in arithmetic sigprop.
    pub tier1_arithmetic_sigprop_new_significant_count_total: usize,
    /// Profile-only count of raw bypass significance propagation candidates.
    pub tier1_raw_sigprop_active_candidate_count_total: usize,
    /// Profile-only count of coefficients that become significant in raw sigprop.
    pub tier1_raw_sigprop_new_significant_count_total: usize,
    /// Profile-only count of arithmetic-coded magnitude refinement candidates.
    pub tier1_arithmetic_magref_active_candidate_count_total: usize,
    /// Profile-only count of raw bypass magnitude refinement candidates.
    pub tier1_raw_magref_active_candidate_count_total: usize,
    /// Profile-only count of cleanup-pass coefficient candidates.
    ///
    /// This excludes coefficients represented only by cleanup RLC stripes.
    pub tier1_cleanup_active_candidate_count_total: usize,
    /// Profile-only count of coefficients that become significant in cleanup.
    ///
    /// This includes significance discovered through cleanup RLC.
    pub tier1_cleanup_new_significant_count_total: usize,
    /// Profile-only count of cleanup stripes encoded by the RLC path.
    pub tier1_cleanup_rlc_stripe_count_total: usize,
    /// Profile-only count of cleanup RLC stripes with no significant coefficient.
    pub tier1_cleanup_rlc_zero_stripe_count_total: usize,
    /// Profile-only exact MQ symbol count from the ordered symbol-plan probe.
    pub tier1_symbol_plan_mq_symbol_count_total: usize,
    /// Profile-only exact raw bypass bit count from the ordered symbol-plan probe.
    pub tier1_symbol_plan_raw_bit_count_total: usize,
    /// Maximum MQ symbols emitted by any block in the ordered symbol-plan probe.
    pub max_tier1_symbol_plan_mq_symbols_per_block: usize,
    /// Maximum raw bypass bits emitted by any block in the ordered symbol-plan probe.
    pub max_tier1_symbol_plan_raw_bits_per_block: usize,
    /// Estimated compact token bytes needed for all blocks in the symbol-plan probe.
    pub tier1_symbol_plan_packed_token_bytes_total: usize,
    /// Maximum estimated compact token bytes needed by any one block in the symbol-plan probe.
    pub max_tier1_symbol_plan_packed_token_bytes_per_block: usize,
    /// Profile-only exact cleanup MQ symbol count from the ordered symbol-plan probe.
    pub tier1_symbol_plan_cleanup_mq_symbol_count_total: usize,
    /// Profile-only exact sigprop MQ symbol count from the ordered symbol-plan probe.
    pub tier1_symbol_plan_sigprop_mq_symbol_count_total: usize,
    /// Profile-only exact magref MQ symbol count from the ordered symbol-plan probe.
    pub tier1_symbol_plan_magref_mq_symbol_count_total: usize,
    /// Profile-only exact raw sigprop bit count from the ordered symbol-plan probe.
    pub tier1_symbol_plan_raw_sigprop_bit_count_total: usize,
    /// Profile-only exact raw magref bit count from the ordered symbol-plan probe.
    pub tier1_symbol_plan_raw_magref_bit_count_total: usize,
    /// Profile-only cleanup sign-symbol count from the ordered symbol-plan probe.
    pub tier1_symbol_plan_cleanup_sign_symbol_count_total: usize,
    /// Profile-only sigprop sign-symbol count from the ordered symbol-plan probe.
    pub tier1_symbol_plan_sigprop_sign_symbol_count_total: usize,
    /// XOR of per-block order-sensitive MQ symbol hashes from the symbol-plan probe.
    pub tier1_symbol_plan_mq_symbol_hash_xor: usize,
    /// XOR of per-block order-sensitive raw bit hashes from the symbol-plan probe.
    pub tier1_symbol_plan_raw_bit_hash_xor: usize,
    /// Profile-only MQ symbols counted by coding-pass index.
    pub tier1_pass_plan_mq_symbol_count_total: usize,
    /// Profile-only raw bypass bits counted by coding-pass index.
    pub tier1_pass_plan_raw_bit_count_total: usize,
    /// Count of block-local coding passes that emit at least one MQ symbol.
    pub tier1_pass_plan_nonempty_mq_pass_count_total: usize,
    /// Count of block-local coding passes that emit at least one raw bypass bit.
    pub tier1_pass_plan_nonempty_raw_pass_count_total: usize,
    /// Maximum MQ symbols emitted by any single block-local coding pass.
    pub max_tier1_pass_plan_mq_symbols_per_pass: usize,
    /// Maximum raw bypass bits emitted by any single block-local coding pass.
    pub max_tier1_pass_plan_raw_bits_per_pass: usize,
    /// Exact MQ symbol count from the compact token-emitter probe or gated GPU token-pack route.
    pub tier1_token_emit_mq_symbol_count_total: usize,
    /// Exact raw bypass bit count from the compact token-emitter probe or gated GPU token-pack route.
    pub tier1_token_emit_raw_bit_count_total: usize,
    /// Compact token bytes emitted by the token-emitter probe or gated GPU token-pack route.
    pub tier1_token_emit_token_bytes_total: usize,
    /// Maximum compact token bytes emitted by any one block.
    pub max_tier1_token_emit_token_bytes_per_block: usize,
    /// Segment records emitted by the token-emitter probe or gated GPU token-pack route.
    pub tier1_token_emit_segment_count_total: usize,
    /// Maximum token-emitter segment records for any one block.
    pub max_tier1_token_emit_segments_per_block: usize,
    /// XOR of per-block order-sensitive MQ symbol hashes from token emission.
    pub tier1_token_emit_mq_symbol_hash_xor: usize,
    /// XOR of per-block order-sensitive raw bit hashes from token emission.
    pub tier1_token_emit_raw_bit_hash_xor: usize,
    /// Total bytes produced by packing emitted Tier-1 tokens.
    pub tier1_token_pack_output_bytes_total: usize,
    /// Maximum token-pack output bytes for any one block.
    pub max_tier1_token_pack_output_bytes_per_block: usize,
    /// Resident Tier-1 code blocks that emitted at least one coding pass.
    pub tier1_nonzero_block_count_total: usize,
    /// Resident Tier-1 code blocks that emitted no coding passes.
    pub tier1_zero_block_count_total: usize,
    /// Missing most-significant bitplanes across resident Tier-1 code blocks.
    pub tier1_missing_bitplane_count_total: usize,
    /// Maximum missing most-significant bitplanes for any resident code block.
    pub max_tier1_missing_bitplanes_per_block: usize,
    /// Classic Tier-1 segment records emitted across resident code blocks.
    ///
    /// This remains zero for HTJ2K Tier-1, which does not use classic segment
    /// records.
    pub tier1_segment_count_total: usize,
    /// Maximum classic Tier-1 segment records emitted by any resident code block.
    pub max_tier1_segments_per_block: usize,
    /// Total host-planned packet payload-copy job slots across resident chunks.
    pub packet_payload_copy_job_capacity_total: usize,
    /// Maximum packet payload-copy job slots needed by any tile in the batch.
    pub max_packet_payload_copy_jobs_per_tile: usize,
    /// Actual packet payload-copy jobs emitted by packetization across resident chunks.
    pub packet_payload_copy_job_count_total: usize,
    /// Maximum actual packet payload-copy jobs emitted by any tile in the batch.
    pub max_packet_payload_copy_jobs_used_per_tile: usize,
    /// Actual packet payload-copy bytes emitted by packetization across resident chunks.
    pub packet_payload_copy_bytes_total: usize,
    /// Maximum actual packet payload-copy bytes emitted by any tile in the batch.
    pub max_packet_payload_copy_bytes_per_tile: usize,
    /// Packet payload-copy jobs at or below one copy-kernel stripe.
    pub packet_payload_copy_small_job_count_total: usize,
    /// Packet payload-copy jobs above one stripe and at or below 512 bytes.
    pub packet_payload_copy_medium_job_count_total: usize,
    /// Packet payload-copy jobs above 512 bytes.
    pub packet_payload_copy_large_job_count_total: usize,
    /// Packet payload-copy stripes launched by the copy kernel.
    pub packet_payload_copy_launched_stripe_count_total: usize,
    /// Packet payload-copy stripes that correspond to emitted copy jobs.
    pub packet_payload_copy_active_stripe_count_total: usize,
    /// Total packet output capacity, in bytes, across resident chunks.
    pub packet_output_capacity_total: usize,
    /// Maximum packet output capacity, in bytes, for any tile in the batch.
    pub max_packet_output_capacity: usize,
    /// Actual packet output bytes written by packetization across resident chunks.
    pub packet_output_used_bytes_total: usize,
    /// Maximum actual packet output bytes written by any tile in the batch.
    pub max_packet_output_used_bytes: usize,
    /// Total codestream payload-copy bytes across resident chunks.
    pub codestream_payload_copy_bytes_total: usize,
    /// Codestream payload-copy threads launched by the copy kernel.
    pub codestream_payload_copy_launched_thread_count_total: usize,
    /// Estimated codestream payload-copy threads with in-range bytes to copy.
    pub codestream_payload_copy_active_thread_count_total: usize,
    /// Time spent waiting for codestream buffers.
    pub codestream_wait_duration: Duration,
    /// Alias of `codestream_wait_duration` using RCA naming.
    ///
    /// Do not sum this with `codestream_wait_duration` as an independent bucket.
    pub sync_wait_duration: Duration,
    /// Time spent materializing buffer-backed codestream bytes into host bytes.
    ///
    /// Current batch stats paths may leave this at zero. Host byte
    /// materialization timing is surfaced on `MetalLosslessEncodeOutcome` where
    /// applicable; this stage-stats bucket is reserved for stats-bearing
    /// host-output paths.
    pub host_readback_duration: Duration,
    /// Number of resident encode chunks.
    pub chunk_count: usize,
    /// Number of encoded tiles.
    pub tile_count: usize,
    /// Number of encoded code blocks.
    pub code_block_count: usize,
}
437
/// Merge rule applied to a single stage-stat field by
/// [`MetalLosslessEncodeStageStats::add_assign`], selected by the field's
/// class keyword:
///
/// - `dur` and `count`: saturating addition, so an overflowing sum clamps at
///   the type's maximum.
/// - `max`: the larger of the accumulated and incoming values is kept.
/// - `xor`: the incoming value is XOR-folded into the accumulated one.
macro_rules! stage_stat_combine {
    (dur, $acc:ident, $rhs:ident, $name:ident) => {
        $acc.$name = $acc.$name.saturating_add($rhs.$name);
    };
    (count, $acc:ident, $rhs:ident, $name:ident) => {
        $acc.$name = $acc.$name.saturating_add($rhs.$name);
    };
    (max, $acc:ident, $rhs:ident, $name:ident) => {
        $acc.$name = Ord::max($acc.$name, $rhs.$name);
    };
    (xor, $acc:ident, $rhs:ident, $name:ident) => {
        $acc.$name ^= $rhs.$name;
    };
}
455
/// Per-field contribution to
/// [`MetalLosslessEncodeStageStats::has_timings`]: a `dur` field raises the
/// flag when it holds a non-zero duration; a field of any other class expands
/// to nothing and leaves the flag untouched.
macro_rules! stage_stat_timing_flag {
    (dur, $flag:ident, $stats:ident, $name:ident) => {
        $flag |= !$stats.$name.is_zero();
    };
    ($other_class:ident, $flag:ident, $stats:ident, $name:ident) => {};
}
464
/// Per-field rule for the `From<compute::J2kResidentEncodeStageStats>`
/// conversion: a `resident` field is copied from the same-named field of the
/// compute-layer stats, while a `local` field exists only on the facade side
/// and is left at its default value.
#[cfg(target_os = "macos")]
macro_rules! stage_stat_from_resident {
    (resident, $dst:ident, $src:ident, $name:ident) => {
        $dst.$name = $src.$name;
    };
    (local, $dst:ident, $src:ident, $name:ident) => {};
}
475
/// Expand the stage-stat field table into the per-field impls of
/// `MetalLosslessEncodeStageStats`.
///
/// Each table entry is `(field, class, source)`. `class` selects the
/// `stage_stat_combine!` / `stage_stat_timing_flag!` rule for the field and
/// `source` selects its `stage_stat_from_resident!` rule. The trailing
/// constant destructures the struct with exactly the listed fields, so a
/// struct field that has no table entry is a compile error.
macro_rules! j2k_metal_stage_stats_impls {
    ($(($name:ident, $kind:ident, $origin:ident)),* $(,)?) => {
        impl MetalLosslessEncodeStageStats {
            /// Report whether at least one timing field holds a non-zero duration.
            pub fn has_timings(&self) -> bool {
                let mut recorded = false;
                $(stage_stat_timing_flag!($kind, recorded, self, $name);)*
                recorded
            }

            /// Fold `other` into `self` field by field: durations and counters add
            /// with saturation, `max` fields keep the larger value, and hash
            /// fields are XOR-ed together.
            pub fn add_assign(&mut self, other: Self) {
                $(stage_stat_combine!($kind, self, other, $name);)*
            }
        }

        #[cfg(target_os = "macos")]
        impl From<compute::J2kResidentEncodeStageStats> for MetalLosslessEncodeStageStats {
            fn from(stats: compute::J2kResidentEncodeStageStats) -> Self {
                // Start from all-default so facade-only (`local`) fields stay zeroed.
                let mut facade = Self::default();
                $(stage_stat_from_resident!($origin, facade, stats, $name);)*
                facade
            }
        }

        // Compile-time exhaustiveness check: the parameter pattern names every
        // table field and has no `..`, so it is rejected if the struct declares
        // a field the table does not list.
        const _: fn(MetalLosslessEncodeStageStats) =
            |MetalLosslessEncodeStageStats { $($name: _),* }| {};
    };
}
509
510j2k_metal_stage_stats_impls! {
511    (plan_duration, dur, local),
512    (prepare_submit_duration, dur, local),
513    (coefficient_prep_duration, dur, resident),
514    (deinterleave_rct_duration, dur, resident),
515    (dwt53_duration, dur, resident),
516    (coefficient_extract_duration, dur, resident),
517    (ht_table_build_duration, dur, resident),
518    (ht_buffer_allocation_duration, dur, resident),
519    (ht_command_encode_duration, dur, resident),
520    (ht_block_encode_duration, dur, resident),
521    (classic_tier1_setup_duration, dur, resident),
522    (classic_block_encode_duration, dur, resident),
523    (classic_tier1_token_pack_duration, dur, resident),
524    (classic_packet_plan_duration, dur, resident),
525    (classic_packet_buffer_setup_duration, dur, resident),
526    (classic_command_buffer_commit_duration, dur, resident),
527    (result_harvest_duration, dur, resident),
528    (result_status_copy_duration, dur, resident),
529    (result_private_recycle_duration, dur, resident),
530    (result_shared_recycle_duration, dur, resident),
531    (result_codestream_collect_duration, dur, resident),
532    (packet_block_prep_duration, dur, resident),
533    (packetization_duration, dur, resident),
534    (codestream_assembly_duration, dur, resident),
535    (coefficient_prep_gpu_duration, dur, resident),
536    (coefficient_deinterleave_rct_gpu_duration, dur, resident),
537    (coefficient_dwt53_gpu_duration, dur, resident),
538    (coefficient_dwt53_vertical_gpu_duration, dur, resident),
539    (coefficient_dwt53_horizontal_gpu_duration, dur, resident),
540    (coefficient_extract_gpu_duration, dur, resident),
541    (coefficient_copy_gpu_duration, dur, resident),
542    (gpu_elapsed_wall_duration, dur, resident),
543    (classic_block_gpu_duration, dur, resident),
544    (classic_tier1_density_gpu_duration, dur, resident),
545    (classic_tier1_raw_pack_gpu_duration, dur, resident),
546    (classic_tier1_arithmetic_pack_gpu_duration, dur, resident),
547    (classic_tier1_symbol_plan_gpu_duration, dur, resident),
548    (classic_tier1_pass_plan_gpu_duration, dur, resident),
549    (classic_tier1_token_emit_gpu_duration, dur, resident),
550    (classic_tier1_split_token_emit_gpu_duration, dur, resident),
551    (classic_tier1_token_pack_gpu_duration, dur, resident),
552    (ht_block_gpu_duration, dur, resident),
553    (packet_block_prep_gpu_duration, dur, resident),
554    (packetization_gpu_duration, dur, resident),
555    (packet_payload_copy_gpu_duration, dur, resident),
556    (codestream_assembly_gpu_duration, dur, resident),
557    (codestream_payload_copy_gpu_duration, dur, resident),
558    (tier1_output_capacity_total, count, resident),
559    (max_tier1_output_capacity, max, resident),
560    (tier1_output_used_bytes_total, count, resident),
561    (max_tier1_output_used_bytes, max, resident),
562    (tier1_segment_capacity_total, count, resident),
563    (max_tier1_segment_capacity_per_block, max, resident),
564    (tier1_coding_pass_count_total, count, resident),
565    (max_tier1_coding_passes_per_block, max, resident),
566    (tier1_arithmetic_pass_count_total, count, resident),
567    (tier1_raw_pass_count_total, count, resident),
568    (tier1_cleanup_pass_count_total, count, resident),
569    (tier1_sigprop_pass_count_total, count, resident),
570    (tier1_magref_pass_count_total, count, resident),
571    (tier1_arithmetic_cleanup_pass_count_total, count, resident),
572    (tier1_arithmetic_sigprop_pass_count_total, count, resident),
573    (tier1_arithmetic_magref_pass_count_total, count, resident),
574    (tier1_raw_sigprop_pass_count_total, count, resident),
575    (tier1_raw_magref_pass_count_total, count, resident),
576    (tier1_full_scan_coeff_visit_count_total, count, resident),
577    (tier1_arithmetic_scan_coeff_visit_count_total, count, resident),
578    (tier1_raw_scan_coeff_visit_count_total, count, resident),
579    (tier1_cleanup_scan_coeff_visit_count_total, count, resident),
580    (tier1_sigprop_scan_coeff_visit_count_total, count, resident),
581    (tier1_magref_scan_coeff_visit_count_total, count, resident),
582    (max_tier1_full_scan_coeff_visits_per_block, max, resident),
583    (tier1_sigprop_active_candidate_count_total, count, resident),
584    (tier1_sigprop_new_significant_count_total, count, resident),
585    (tier1_magref_active_candidate_count_total, count, resident),
586    (tier1_arithmetic_sigprop_active_candidate_count_total, count, resident),
587    (tier1_arithmetic_sigprop_new_significant_count_total, count, resident),
588    (tier1_raw_sigprop_active_candidate_count_total, count, resident),
589    (tier1_raw_sigprop_new_significant_count_total, count, resident),
590    (tier1_arithmetic_magref_active_candidate_count_total, count, resident),
591    (tier1_raw_magref_active_candidate_count_total, count, resident),
592    (tier1_cleanup_active_candidate_count_total, count, resident),
593    (tier1_cleanup_new_significant_count_total, count, resident),
594    (tier1_cleanup_rlc_stripe_count_total, count, resident),
595    (tier1_cleanup_rlc_zero_stripe_count_total, count, resident),
596    (tier1_symbol_plan_mq_symbol_count_total, count, resident),
597    (tier1_symbol_plan_raw_bit_count_total, count, resident),
598    (max_tier1_symbol_plan_mq_symbols_per_block, max, resident),
599    (max_tier1_symbol_plan_raw_bits_per_block, max, resident),
600    (tier1_symbol_plan_packed_token_bytes_total, count, resident),
601    (max_tier1_symbol_plan_packed_token_bytes_per_block, max, resident),
602    (tier1_symbol_plan_cleanup_mq_symbol_count_total, count, resident),
603    (tier1_symbol_plan_sigprop_mq_symbol_count_total, count, resident),
604    (tier1_symbol_plan_magref_mq_symbol_count_total, count, resident),
605    (tier1_symbol_plan_raw_sigprop_bit_count_total, count, resident),
606    (tier1_symbol_plan_raw_magref_bit_count_total, count, resident),
607    (tier1_symbol_plan_cleanup_sign_symbol_count_total, count, resident),
608    (tier1_symbol_plan_sigprop_sign_symbol_count_total, count, resident),
609    (tier1_symbol_plan_mq_symbol_hash_xor, xor, resident),
610    (tier1_symbol_plan_raw_bit_hash_xor, xor, resident),
611    (tier1_pass_plan_mq_symbol_count_total, count, resident),
612    (tier1_pass_plan_raw_bit_count_total, count, resident),
613    (tier1_pass_plan_nonempty_mq_pass_count_total, count, resident),
614    (tier1_pass_plan_nonempty_raw_pass_count_total, count, resident),
615    (max_tier1_pass_plan_mq_symbols_per_pass, max, resident),
616    (max_tier1_pass_plan_raw_bits_per_pass, max, resident),
617    (tier1_token_emit_mq_symbol_count_total, count, resident),
618    (tier1_token_emit_raw_bit_count_total, count, resident),
619    (tier1_token_emit_token_bytes_total, count, resident),
620    (max_tier1_token_emit_token_bytes_per_block, max, resident),
621    (tier1_token_emit_segment_count_total, count, resident),
622    (max_tier1_token_emit_segments_per_block, max, resident),
623    (tier1_token_emit_mq_symbol_hash_xor, xor, resident),
624    (tier1_token_emit_raw_bit_hash_xor, xor, resident),
625    (tier1_token_pack_output_bytes_total, count, resident),
626    (max_tier1_token_pack_output_bytes_per_block, max, resident),
627    (tier1_nonzero_block_count_total, count, resident),
628    (tier1_zero_block_count_total, count, resident),
629    (tier1_missing_bitplane_count_total, count, resident),
630    (max_tier1_missing_bitplanes_per_block, max, resident),
631    (tier1_segment_count_total, count, resident),
632    (max_tier1_segments_per_block, max, resident),
633    (packet_payload_copy_job_capacity_total, count, resident),
634    (max_packet_payload_copy_jobs_per_tile, max, resident),
635    (packet_payload_copy_job_count_total, count, resident),
636    (max_packet_payload_copy_jobs_used_per_tile, max, resident),
637    (packet_payload_copy_bytes_total, count, resident),
638    (max_packet_payload_copy_bytes_per_tile, max, resident),
639    (packet_payload_copy_small_job_count_total, count, resident),
640    (packet_payload_copy_medium_job_count_total, count, resident),
641    (packet_payload_copy_large_job_count_total, count, resident),
642    (packet_payload_copy_launched_stripe_count_total, count, resident),
643    (packet_payload_copy_active_stripe_count_total, count, resident),
644    (packet_output_capacity_total, count, resident),
645    (max_packet_output_capacity, max, resident),
646    (packet_output_used_bytes_total, count, resident),
647    (max_packet_output_used_bytes, max, resident),
648    (codestream_payload_copy_bytes_total, count, resident),
649    (codestream_payload_copy_launched_thread_count_total, count, resident),
650    (codestream_payload_copy_active_thread_count_total, count, resident),
651    (codestream_wait_duration, dur, local),
652    (sync_wait_duration, dur, local),
653    (host_readback_duration, dur, local),
654    (chunk_count, count, local),
655    (tile_count, count, local),
656    (code_block_count, count, resident),
657}
658
/// Accumulates `duration` into the batch's `coefficient_prep_duration` bucket.
///
/// Nothing is recorded when `profile_stages` is `false`. The running total is
/// updated with a saturating add, so it clamps at `Duration::MAX` instead of
/// overflowing.
#[cfg(any(target_os = "macos", test))]
pub(super) fn add_resident_prep_duration(
    stats: &mut MetalLosslessEncodeBatchStats,
    duration: Duration,
    profile_stages: bool,
) {
    if profile_stages {
        let prep_total = &mut stats.stage_stats.coefficient_prep_duration;
        *prep_total = (*prep_total).saturating_add(duration);
    }
}
673
/// Records resident prep wall time for the batch.
///
/// Delegates to [`add_resident_prep_duration`] when `profile_stages` is
/// `true`; when it is `false` the stats are left untouched.
#[cfg(any(target_os = "macos", test))]
pub(super) fn add_resident_prep_wall_duration(
    stats: &mut MetalLosslessEncodeBatchStats,
    wall_duration: Duration,
    profile_stages: bool,
) {
    if profile_stages {
        add_resident_prep_duration(stats, wall_duration, profile_stages);
    }
}
682
683/// Resolved resident Metal lossless J2K/HTJ2K tile batch encode metrics.
684#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
685pub struct MetalLosslessEncodeBatchStats {
686    /// Caller-requested maximum number of in-flight tiles.
687    pub configured_inflight_tiles: Option<usize>,
688    /// Effective maximum number of in-flight tiles after clamping.
689    pub effective_inflight_tiles: usize,
690    /// Caller-requested resident encode memory budget in bytes.
691    pub configured_memory_budget_bytes: Option<usize>,
692    /// Effective resident encode memory budget in bytes.
693    pub effective_memory_budget_bytes: usize,
694    /// Estimated peak resident memory required per tile.
695    pub estimated_peak_bytes_per_tile: usize,
696    /// Maximum observed in-flight tiles during the batch.
697    pub max_observed_inflight_tiles: usize,
698    /// End-to-end wall time for the batch encode.
699    pub encode_wall_duration: Duration,
700    /// Resident encode stage timing summary.
701    pub stage_stats: MetalLosslessEncodeStageStats,
702}
703
704/// Resident Metal lossless J2K/HTJ2K tile batch output and batch-level metrics.
705pub struct MetalLosslessBufferEncodeBatchOutcome {
706    /// Per-tile buffer-backed encode outcomes.
707    pub outcomes: Vec<MetalLosslessBufferEncodeOutcome>,
708    /// Batch-level resident encode metrics.
709    pub stats: MetalLosslessEncodeBatchStats,
710}