Skip to main content

dsfb_computer_graphics/
gpu_execution.rs

1use std::fmt::Write as _;
2use std::fs;
3use std::path::Path;
4
5use serde::Serialize;
6
7use crate::config::DemoConfig;
8use crate::error::{Error, Result};
9use crate::external::build_owned_inputs_from_sequence;
10use crate::gpu::try_execute_host_minimum_kernel;
11use crate::host::{default_host_realistic_profile, supervise_temporal_reuse};
12use crate::report::EXPERIMENT_SENTENCE;
13use crate::scene::{generate_sequence_for_definition, scenario_by_id, ScenarioId};
14use crate::taa::run_fixed_alpha_baseline;
15
16#[derive(Clone, Debug, Serialize)]
17pub struct GpuExecutionEntry {
18    pub label: String,
19    pub scenario_id: String,
20    pub width: usize,
21    pub height: usize,
22    pub frame_index: usize,
23    pub gpu_path_available: bool,
24    pub actual_gpu_timing_measured: bool,
25    pub adapter_name: Option<String>,
26    pub backend: Option<String>,
27    pub total_ms: Option<f64>,
28    pub dispatch_ms: Option<f64>,
29    pub readback_ms: Option<f64>,
30    pub mean_abs_trust_delta_vs_cpu: Option<f32>,
31    pub mean_abs_alpha_delta_vs_cpu: Option<f32>,
32    pub mean_abs_intervention_delta_vs_cpu: Option<f32>,
33    pub workgroup_size: [u32; 3],
34    pub resolution_tier: Option<String>,
35    pub notes: Vec<String>,
36}
37
38#[derive(Clone, Debug, Serialize)]
39pub struct GpuExecutionMetrics {
40    pub measurement_kind: String,
41    pub actual_gpu_timing_measured: bool,
42    pub shader_language: String,
43    pub kernel_name: String,
44    pub entries: Vec<GpuExecutionEntry>,
45    pub notes: Vec<String>,
46}
47
48pub fn run_gpu_execution_study(config: &DemoConfig) -> Result<GpuExecutionMetrics> {
49    let scenarios = [ScenarioId::RevealBand, ScenarioId::MotionBiasBand];
50    let mut entries = Vec::new();
51    let mut any_measured = false;
52
53    for scenario_id in scenarios {
54        let definition = scenario_by_id(&config.scene, scenario_id).ok_or_else(|| {
55            Error::Message(format!(
56                "GPU execution scenario {} was unavailable",
57                scenario_id.as_str()
58            ))
59        })?;
60        let sequence = generate_sequence_for_definition(&definition);
61        let frame_index = definition
62            .onset_frame
63            .min(sequence.frames.len().saturating_sub(1))
64            .max(1);
65        let fixed_alpha = run_fixed_alpha_baseline(&sequence, config.baseline.fixed_alpha);
66        let previous_history = fixed_alpha.taa.resolved_frames.get(frame_index - 1);
67        let inputs = build_owned_inputs_from_sequence(&sequence, frame_index, previous_history)?;
68        let profile = default_host_realistic_profile(
69            config.dsfb_alpha_range.min,
70            config.dsfb_alpha_range.max,
71        );
72        let cpu_outputs = supervise_temporal_reuse(&inputs.borrow(), &profile);
73
74        let maybe_gpu = try_execute_host_minimum_kernel(&inputs, profile.parameters)?;
75        match maybe_gpu {
76            Some(gpu) => {
77                any_measured = true;
78                entries.push(GpuExecutionEntry {
79                    label: format!("gpu_host_minimum_{}", scenario_id.as_str()),
80                    scenario_id: scenario_id.as_str().to_string(),
81                    width: inputs.width(),
82                    height: inputs.height(),
83                    frame_index,
84                    gpu_path_available: true,
85                    actual_gpu_timing_measured: true,
86                    adapter_name: Some(gpu.adapter_name),
87                    backend: Some(gpu.backend),
88                    total_ms: Some(gpu.total_ms),
89                    dispatch_ms: Some(gpu.dispatch_ms),
90                    readback_ms: Some(gpu.readback_ms),
91                    mean_abs_trust_delta_vs_cpu: Some(mean_abs_delta(
92                        cpu_outputs.trust.values(),
93                        &gpu.trust,
94                    )),
95                    mean_abs_alpha_delta_vs_cpu: Some(mean_abs_delta(
96                        cpu_outputs.alpha.values(),
97                        &gpu.alpha,
98                    )),
99                    mean_abs_intervention_delta_vs_cpu: Some(mean_abs_delta(
100                        cpu_outputs.intervention.values(),
101                        &gpu.intervention,
102                    )),
103                    workgroup_size: [gpu.workgroup_size.0, gpu.workgroup_size.1, gpu.workgroup_size.2],
104                    resolution_tier: Some("native".to_string()),
105                    notes: vec![
106                        "Measured on the current environment because a usable wgpu adapter was available.".to_string(),
107                        "The kernel implements the current minimum host-realistic path, which excludes motion disagreement by design.".to_string(),
108                    ],
109                });
110            }
111            None => {
112                entries.push(GpuExecutionEntry {
113                    label: format!("gpu_host_minimum_{}", scenario_id.as_str()),
114                    scenario_id: scenario_id.as_str().to_string(),
115                    width: inputs.width(),
116                    height: inputs.height(),
117                    frame_index,
118                    gpu_path_available: true,
119                    actual_gpu_timing_measured: false,
120                    adapter_name: None,
121                    backend: None,
122                    total_ms: None,
123                    dispatch_ms: None,
124                    readback_ms: None,
125                    mean_abs_trust_delta_vs_cpu: None,
126                    mean_abs_alpha_delta_vs_cpu: None,
127                    mean_abs_intervention_delta_vs_cpu: None,
128                    workgroup_size: [8, 8, 1],
129                    resolution_tier: Some("native".to_string()),
130                    notes: vec![
131                        "The wgpu compute path is compiled into the crate, but no usable GPU adapter was available in the current environment.".to_string(),
132                        "Run `cargo run --release -- run-gpu-path --output <dir>` on a GPU host to measure this kernel without changing code.".to_string(),
133                    ],
134                });
135            }
136        }
137    }
138
139    // 4K synthetic probe - zero-filled buffers, tests dispatch feasibility at 3840x2160
140    let probe_4k_result = std::panic::catch_unwind(|| {
141        let w4k = 3840usize;
142        let h4k = 2160usize;
143        let n4k = w4k * h4k;
144        let inputs_4k = crate::external::OwnedHostTemporalInputs {
145            current_color: crate::frame::ImageFrame::new(w4k, h4k),
146            reprojected_history: crate::frame::ImageFrame::new(w4k, h4k),
147            motion_vectors: vec![crate::scene::MotionVector { to_prev_x: 0.0, to_prev_y: 0.0 }; n4k],
148            current_depth: vec![1.0f32; n4k],
149            reprojected_depth: vec![1.0f32; n4k],
150            current_normals: vec![crate::scene::Normal3 { x: 0.0, y: 0.0, z: -1.0 }; n4k],
151            reprojected_normals: vec![crate::scene::Normal3 { x: 0.0, y: 0.0, z: -1.0 }; n4k],
152            visibility_hint: None,
153            thin_hint: None,
154        };
155        let profile = default_host_realistic_profile(
156            config.dsfb_alpha_range.min,
157            config.dsfb_alpha_range.max,
158        );
159        try_execute_host_minimum_kernel(&inputs_4k, profile.parameters)
160    });
161
162    match probe_4k_result {
163        Ok(Ok(Some(gpu))) => {
164            any_measured = true;
165            entries.push(GpuExecutionEntry {
166                label: "gpu_4k_synthetic_probe".to_string(),
167                scenario_id: "synthetic_4k".to_string(),
168                width: 3840,
169                height: 2160,
170                frame_index: 0,
171                gpu_path_available: true,
172                actual_gpu_timing_measured: true,
173                adapter_name: Some(gpu.adapter_name),
174                backend: Some(gpu.backend),
175                total_ms: Some(gpu.total_ms),
176                dispatch_ms: Some(gpu.dispatch_ms),
177                readback_ms: Some(gpu.readback_ms),
178                mean_abs_trust_delta_vs_cpu: None,
179                mean_abs_alpha_delta_vs_cpu: None,
180                mean_abs_intervention_delta_vs_cpu: None,
181                workgroup_size: [8, 8, 1],
182                resolution_tier: Some("4k_probe".to_string()),
183                notes: vec![
184                    "Synthetic zero-filled 4K dispatch. Tests wgpu binding limit raise.".to_string(),
185                    "No CPU parity check performed (would require 4K CPU reference run).".to_string(),
186                ],
187            });
188        }
189        Ok(Ok(None)) => {
190            entries.push(GpuExecutionEntry {
191                label: "gpu_4k_synthetic_probe".to_string(),
192                scenario_id: "synthetic_4k".to_string(),
193                width: 3840,
194                height: 2160,
195                frame_index: 0,
196                gpu_path_available: false,
197                actual_gpu_timing_measured: false,
198                adapter_name: None,
199                backend: None,
200                total_ms: None,
201                dispatch_ms: None,
202                readback_ms: None,
203                mean_abs_trust_delta_vs_cpu: None,
204                mean_abs_alpha_delta_vs_cpu: None,
205                mean_abs_intervention_delta_vs_cpu: None,
206                workgroup_size: [8, 8, 1],
207                resolution_tier: Some("4k_probe".to_string()),
208                notes: vec![
209                    "4K synthetic probe: no wgpu adapter available.".to_string(),
210                ],
211            });
212        }
213        Ok(Err(e)) => {
214            entries.push(GpuExecutionEntry {
215                label: "gpu_4k_synthetic_probe".to_string(),
216                scenario_id: "synthetic_4k".to_string(),
217                width: 3840,
218                height: 2160,
219                frame_index: 0,
220                gpu_path_available: true,
221                actual_gpu_timing_measured: false,
222                adapter_name: None,
223                backend: None,
224                total_ms: None,
225                dispatch_ms: None,
226                readback_ms: None,
227                mean_abs_trust_delta_vs_cpu: None,
228                mean_abs_alpha_delta_vs_cpu: None,
229                mean_abs_intervention_delta_vs_cpu: None,
230                workgroup_size: [8, 8, 1],
231                resolution_tier: Some("4k_probe".to_string()),
232                notes: vec![
233                    format!("4K synthetic probe failed with error: {e}"),
234                    "Binding limit raise may be insufficient for this adapter.".to_string(),
235                ],
236            });
237        }
238        Err(panic_val) => {
239            let msg = panic_val
240                .downcast_ref::<String>()
241                .map(|s| s.as_str())
242                .or_else(|| panic_val.downcast_ref::<&str>().copied())
243                .unwrap_or("unknown panic");
244            entries.push(GpuExecutionEntry {
245                label: "gpu_4k_synthetic_probe".to_string(),
246                scenario_id: "synthetic_4k".to_string(),
247                width: 3840,
248                height: 2160,
249                frame_index: 0,
250                gpu_path_available: true,
251                actual_gpu_timing_measured: false,
252                adapter_name: None,
253                backend: None,
254                total_ms: None,
255                dispatch_ms: None,
256                readback_ms: None,
257                mean_abs_trust_delta_vs_cpu: None,
258                mean_abs_alpha_delta_vs_cpu: None,
259                mean_abs_intervention_delta_vs_cpu: None,
260                workgroup_size: [8, 8, 1],
261                resolution_tier: Some("4k_probe".to_string()),
262                notes: vec![
263                    format!("4K synthetic probe panicked: {msg}"),
264                    "OOM or driver limit exceeded despite binding limit raise.".to_string(),
265                ],
266            });
267        }
268    }
269
270    Ok(GpuExecutionMetrics {
271        measurement_kind: if any_measured {
272            "actual_gpu_timing_measured".to_string()
273        } else {
274            "gpu_path_implemented_but_not_measured_in_current_environment".to_string()
275        },
276        actual_gpu_timing_measured: any_measured,
277        shader_language: "wgsl".to_string(),
278        kernel_name: "dsfb_host_minimum".to_string(),
279        entries,
280        notes: vec![
281            "This path is intended to remove the 'CPU-only timing proxy' blocker by providing a real GPU-executable kernel and an honest measured-vs-unmeasured disclosure.".to_string(),
282            "The current kernel covers the minimum host-realistic supervisory path. Motion disagreement remains an optional extension and is not part of the minimum kernel.".to_string(),
283        ],
284    })
285}
286
287pub fn write_gpu_execution_report(path: &Path, metrics: &GpuExecutionMetrics) -> Result<()> {
288    if let Some(parent) = path.parent() {
289        fs::create_dir_all(parent)?;
290    }
291
292    let mut markdown = String::new();
293    let _ = writeln!(markdown, "# GPU Execution Report");
294    let _ = writeln!(markdown);
295    let _ = writeln!(markdown, "{EXPERIMENT_SENTENCE}");
296    let _ = writeln!(markdown);
297    let _ = writeln!(
298        markdown,
299        "Measurement classification: `{}`.",
300        metrics.measurement_kind
301    );
302    let _ = writeln!(markdown);
303    let _ = writeln!(
304        markdown,
305        "Actual GPU timing measured: `{}`.",
306        metrics.actual_gpu_timing_measured
307    );
308    let _ = writeln!(markdown);
309    let _ = writeln!(
310        markdown,
311        "Kernel: `{}` in `{}`.",
312        metrics.kernel_name, metrics.shader_language
313    );
314    let _ = writeln!(markdown);
315    let _ = writeln!(
316        markdown,
317        "| Label | Scenario | Resolution | Tier | Measured | Adapter | Total ms | Dispatch ms | Readback ms | Trust delta vs CPU |"
318    );
319    let _ = writeln!(
320        markdown,
321        "| --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
322    );
323    for entry in &metrics.entries {
324        let _ = writeln!(
325            markdown,
326            "| {} | {} | {}x{} | {} | {} | {} | {} | {} | {} | {} |",
327            entry.label,
328            entry.scenario_id,
329            entry.width,
330            entry.height,
331            entry.resolution_tier.as_deref().unwrap_or("native"),
332            entry.actual_gpu_timing_measured,
333            entry.adapter_name.as_deref().unwrap_or("unavailable"),
334            format_f64(entry.total_ms),
335            format_f64(entry.dispatch_ms),
336            format_f64(entry.readback_ms),
337            format_f32(entry.mean_abs_trust_delta_vs_cpu),
338        );
339    }
340    let _ = writeln!(markdown);
341    let _ = writeln!(markdown, "## GPU Path Status");
342    let _ = writeln!(markdown);
343    for note in &metrics.notes {
344        let _ = writeln!(markdown, "- {note}");
345    }
346    let _ = writeln!(markdown);
347    let _ = writeln!(markdown, "## How To Run On A GPU Host");
348    let _ = writeln!(markdown);
349    let _ = writeln!(
350        markdown,
351        "```bash\ncargo run --release -- run-gpu-path --output generated/gpu_path\n```"
352    );
353    let _ = writeln!(markdown);
354    let _ = writeln!(markdown, "## What Is Not Proven");
355    let _ = writeln!(markdown);
356    let _ = writeln!(
357        markdown,
358        "- This report does not imply measured GPU performance when `Actual GPU timing measured` is `false`."
359    );
360    let _ = writeln!(
361        markdown,
362        "- It does not replace real engine-side GPU profiling or cache/bandwidth measurement."
363    );
364    let _ = writeln!(markdown);
365    let _ = writeln!(markdown, "## Remaining Blockers");
366    let _ = writeln!(markdown);
367    if metrics.actual_gpu_timing_measured {
368        let _ = writeln!(
369            markdown,
370            "- The kernel is measured, but broader engine-integrated GPU profiling still remains."
371        );
372    } else {
373        let _ = writeln!(
374            markdown,
375            "- A GPU-executable path now exists, but the current environment still lacks measured GPU execution."
376        );
377    }
378    let _ = writeln!(
379        markdown,
380        "- Real engine captures and imported external buffers still need GPU-side evaluation."
381    );
382
383    fs::write(path, markdown)?;
384    Ok(())
385}
386
/// Mean absolute element-wise difference between `a` and `b`.
///
/// Only the overlapping prefix of the two slices is compared. The divisor is
/// the overlap length, floored at 1 so that empty input yields zero instead
/// of a division by zero.
fn mean_abs_delta(a: &[f32], b: &[f32]) -> f32 {
    let overlap = a.len().min(b.len());
    let divisor = overlap.max(1) as f32;
    let total: f32 = a
        .iter()
        .zip(b)
        .map(|(&lhs, &rhs)| (lhs - rhs).abs())
        .sum();
    total / divisor
}
395
/// Formats an optional `f64` with three decimal places, or `n/a` when absent.
fn format_f64(value: Option<f64>) -> String {
    match value {
        Some(number) => format!("{number:.3}"),
        None => String::from("n/a"),
    }
}
401
/// Formats an optional `f32` with six decimal places, or `n/a` when absent.
fn format_f32(value: Option<f32>) -> String {
    value.map_or_else(|| String::from("n/a"), |number| format!("{number:.6}"))
}