1use std::fmt::Write as _;
2use std::fs;
3use std::path::Path;
4
5use serde::Serialize;
6
7use crate::config::DemoConfig;
8use crate::error::{Error, Result};
9use crate::external::build_owned_inputs_from_sequence;
10use crate::gpu::try_execute_host_minimum_kernel;
11use crate::host::{default_host_realistic_profile, supervise_temporal_reuse};
12use crate::report::EXPERIMENT_SENTENCE;
13use crate::scene::{generate_sequence_for_definition, scenario_by_id, ScenarioId};
14use crate::taa::run_fixed_alpha_baseline;
15
16#[derive(Clone, Debug, Serialize)]
17pub struct GpuExecutionEntry {
18 pub label: String,
19 pub scenario_id: String,
20 pub width: usize,
21 pub height: usize,
22 pub frame_index: usize,
23 pub gpu_path_available: bool,
24 pub actual_gpu_timing_measured: bool,
25 pub adapter_name: Option<String>,
26 pub backend: Option<String>,
27 pub total_ms: Option<f64>,
28 pub dispatch_ms: Option<f64>,
29 pub readback_ms: Option<f64>,
30 pub mean_abs_trust_delta_vs_cpu: Option<f32>,
31 pub mean_abs_alpha_delta_vs_cpu: Option<f32>,
32 pub mean_abs_intervention_delta_vs_cpu: Option<f32>,
33 pub workgroup_size: [u32; 3],
34 pub resolution_tier: Option<String>,
35 pub notes: Vec<String>,
36}
37
38#[derive(Clone, Debug, Serialize)]
39pub struct GpuExecutionMetrics {
40 pub measurement_kind: String,
41 pub actual_gpu_timing_measured: bool,
42 pub shader_language: String,
43 pub kernel_name: String,
44 pub entries: Vec<GpuExecutionEntry>,
45 pub notes: Vec<String>,
46}
47
48pub fn run_gpu_execution_study(config: &DemoConfig) -> Result<GpuExecutionMetrics> {
49 let scenarios = [ScenarioId::RevealBand, ScenarioId::MotionBiasBand];
50 let mut entries = Vec::new();
51 let mut any_measured = false;
52
53 for scenario_id in scenarios {
54 let definition = scenario_by_id(&config.scene, scenario_id).ok_or_else(|| {
55 Error::Message(format!(
56 "GPU execution scenario {} was unavailable",
57 scenario_id.as_str()
58 ))
59 })?;
60 let sequence = generate_sequence_for_definition(&definition);
61 let frame_index = definition
62 .onset_frame
63 .min(sequence.frames.len().saturating_sub(1))
64 .max(1);
65 let fixed_alpha = run_fixed_alpha_baseline(&sequence, config.baseline.fixed_alpha);
66 let previous_history = fixed_alpha.taa.resolved_frames.get(frame_index - 1);
67 let inputs = build_owned_inputs_from_sequence(&sequence, frame_index, previous_history)?;
68 let profile = default_host_realistic_profile(
69 config.dsfb_alpha_range.min,
70 config.dsfb_alpha_range.max,
71 );
72 let cpu_outputs = supervise_temporal_reuse(&inputs.borrow(), &profile);
73
74 let maybe_gpu = try_execute_host_minimum_kernel(&inputs, profile.parameters)?;
75 match maybe_gpu {
76 Some(gpu) => {
77 any_measured = true;
78 entries.push(GpuExecutionEntry {
79 label: format!("gpu_host_minimum_{}", scenario_id.as_str()),
80 scenario_id: scenario_id.as_str().to_string(),
81 width: inputs.width(),
82 height: inputs.height(),
83 frame_index,
84 gpu_path_available: true,
85 actual_gpu_timing_measured: true,
86 adapter_name: Some(gpu.adapter_name),
87 backend: Some(gpu.backend),
88 total_ms: Some(gpu.total_ms),
89 dispatch_ms: Some(gpu.dispatch_ms),
90 readback_ms: Some(gpu.readback_ms),
91 mean_abs_trust_delta_vs_cpu: Some(mean_abs_delta(
92 cpu_outputs.trust.values(),
93 &gpu.trust,
94 )),
95 mean_abs_alpha_delta_vs_cpu: Some(mean_abs_delta(
96 cpu_outputs.alpha.values(),
97 &gpu.alpha,
98 )),
99 mean_abs_intervention_delta_vs_cpu: Some(mean_abs_delta(
100 cpu_outputs.intervention.values(),
101 &gpu.intervention,
102 )),
103 workgroup_size: [gpu.workgroup_size.0, gpu.workgroup_size.1, gpu.workgroup_size.2],
104 resolution_tier: Some("native".to_string()),
105 notes: vec![
106 "Measured on the current environment because a usable wgpu adapter was available.".to_string(),
107 "The kernel implements the current minimum host-realistic path, which excludes motion disagreement by design.".to_string(),
108 ],
109 });
110 }
111 None => {
112 entries.push(GpuExecutionEntry {
113 label: format!("gpu_host_minimum_{}", scenario_id.as_str()),
114 scenario_id: scenario_id.as_str().to_string(),
115 width: inputs.width(),
116 height: inputs.height(),
117 frame_index,
118 gpu_path_available: true,
119 actual_gpu_timing_measured: false,
120 adapter_name: None,
121 backend: None,
122 total_ms: None,
123 dispatch_ms: None,
124 readback_ms: None,
125 mean_abs_trust_delta_vs_cpu: None,
126 mean_abs_alpha_delta_vs_cpu: None,
127 mean_abs_intervention_delta_vs_cpu: None,
128 workgroup_size: [8, 8, 1],
129 resolution_tier: Some("native".to_string()),
130 notes: vec![
131 "The wgpu compute path is compiled into the crate, but no usable GPU adapter was available in the current environment.".to_string(),
132 "Run `cargo run --release -- run-gpu-path --output <dir>` on a GPU host to measure this kernel without changing code.".to_string(),
133 ],
134 });
135 }
136 }
137 }
138
139 let probe_4k_result = std::panic::catch_unwind(|| {
141 let w4k = 3840usize;
142 let h4k = 2160usize;
143 let n4k = w4k * h4k;
144 let inputs_4k = crate::external::OwnedHostTemporalInputs {
145 current_color: crate::frame::ImageFrame::new(w4k, h4k),
146 reprojected_history: crate::frame::ImageFrame::new(w4k, h4k),
147 motion_vectors: vec![crate::scene::MotionVector { to_prev_x: 0.0, to_prev_y: 0.0 }; n4k],
148 current_depth: vec![1.0f32; n4k],
149 reprojected_depth: vec![1.0f32; n4k],
150 current_normals: vec![crate::scene::Normal3 { x: 0.0, y: 0.0, z: -1.0 }; n4k],
151 reprojected_normals: vec![crate::scene::Normal3 { x: 0.0, y: 0.0, z: -1.0 }; n4k],
152 visibility_hint: None,
153 thin_hint: None,
154 };
155 let profile = default_host_realistic_profile(
156 config.dsfb_alpha_range.min,
157 config.dsfb_alpha_range.max,
158 );
159 try_execute_host_minimum_kernel(&inputs_4k, profile.parameters)
160 });
161
162 match probe_4k_result {
163 Ok(Ok(Some(gpu))) => {
164 any_measured = true;
165 entries.push(GpuExecutionEntry {
166 label: "gpu_4k_synthetic_probe".to_string(),
167 scenario_id: "synthetic_4k".to_string(),
168 width: 3840,
169 height: 2160,
170 frame_index: 0,
171 gpu_path_available: true,
172 actual_gpu_timing_measured: true,
173 adapter_name: Some(gpu.adapter_name),
174 backend: Some(gpu.backend),
175 total_ms: Some(gpu.total_ms),
176 dispatch_ms: Some(gpu.dispatch_ms),
177 readback_ms: Some(gpu.readback_ms),
178 mean_abs_trust_delta_vs_cpu: None,
179 mean_abs_alpha_delta_vs_cpu: None,
180 mean_abs_intervention_delta_vs_cpu: None,
181 workgroup_size: [8, 8, 1],
182 resolution_tier: Some("4k_probe".to_string()),
183 notes: vec![
184 "Synthetic zero-filled 4K dispatch. Tests wgpu binding limit raise.".to_string(),
185 "No CPU parity check performed (would require 4K CPU reference run).".to_string(),
186 ],
187 });
188 }
189 Ok(Ok(None)) => {
190 entries.push(GpuExecutionEntry {
191 label: "gpu_4k_synthetic_probe".to_string(),
192 scenario_id: "synthetic_4k".to_string(),
193 width: 3840,
194 height: 2160,
195 frame_index: 0,
196 gpu_path_available: false,
197 actual_gpu_timing_measured: false,
198 adapter_name: None,
199 backend: None,
200 total_ms: None,
201 dispatch_ms: None,
202 readback_ms: None,
203 mean_abs_trust_delta_vs_cpu: None,
204 mean_abs_alpha_delta_vs_cpu: None,
205 mean_abs_intervention_delta_vs_cpu: None,
206 workgroup_size: [8, 8, 1],
207 resolution_tier: Some("4k_probe".to_string()),
208 notes: vec![
209 "4K synthetic probe: no wgpu adapter available.".to_string(),
210 ],
211 });
212 }
213 Ok(Err(e)) => {
214 entries.push(GpuExecutionEntry {
215 label: "gpu_4k_synthetic_probe".to_string(),
216 scenario_id: "synthetic_4k".to_string(),
217 width: 3840,
218 height: 2160,
219 frame_index: 0,
220 gpu_path_available: true,
221 actual_gpu_timing_measured: false,
222 adapter_name: None,
223 backend: None,
224 total_ms: None,
225 dispatch_ms: None,
226 readback_ms: None,
227 mean_abs_trust_delta_vs_cpu: None,
228 mean_abs_alpha_delta_vs_cpu: None,
229 mean_abs_intervention_delta_vs_cpu: None,
230 workgroup_size: [8, 8, 1],
231 resolution_tier: Some("4k_probe".to_string()),
232 notes: vec![
233 format!("4K synthetic probe failed with error: {e}"),
234 "Binding limit raise may be insufficient for this adapter.".to_string(),
235 ],
236 });
237 }
238 Err(panic_val) => {
239 let msg = panic_val
240 .downcast_ref::<String>()
241 .map(|s| s.as_str())
242 .or_else(|| panic_val.downcast_ref::<&str>().copied())
243 .unwrap_or("unknown panic");
244 entries.push(GpuExecutionEntry {
245 label: "gpu_4k_synthetic_probe".to_string(),
246 scenario_id: "synthetic_4k".to_string(),
247 width: 3840,
248 height: 2160,
249 frame_index: 0,
250 gpu_path_available: true,
251 actual_gpu_timing_measured: false,
252 adapter_name: None,
253 backend: None,
254 total_ms: None,
255 dispatch_ms: None,
256 readback_ms: None,
257 mean_abs_trust_delta_vs_cpu: None,
258 mean_abs_alpha_delta_vs_cpu: None,
259 mean_abs_intervention_delta_vs_cpu: None,
260 workgroup_size: [8, 8, 1],
261 resolution_tier: Some("4k_probe".to_string()),
262 notes: vec![
263 format!("4K synthetic probe panicked: {msg}"),
264 "OOM or driver limit exceeded despite binding limit raise.".to_string(),
265 ],
266 });
267 }
268 }
269
270 Ok(GpuExecutionMetrics {
271 measurement_kind: if any_measured {
272 "actual_gpu_timing_measured".to_string()
273 } else {
274 "gpu_path_implemented_but_not_measured_in_current_environment".to_string()
275 },
276 actual_gpu_timing_measured: any_measured,
277 shader_language: "wgsl".to_string(),
278 kernel_name: "dsfb_host_minimum".to_string(),
279 entries,
280 notes: vec![
281 "This path is intended to remove the 'CPU-only timing proxy' blocker by providing a real GPU-executable kernel and an honest measured-vs-unmeasured disclosure.".to_string(),
282 "The current kernel covers the minimum host-realistic supervisory path. Motion disagreement remains an optional extension and is not part of the minimum kernel.".to_string(),
283 ],
284 })
285}
286
287pub fn write_gpu_execution_report(path: &Path, metrics: &GpuExecutionMetrics) -> Result<()> {
288 if let Some(parent) = path.parent() {
289 fs::create_dir_all(parent)?;
290 }
291
292 let mut markdown = String::new();
293 let _ = writeln!(markdown, "# GPU Execution Report");
294 let _ = writeln!(markdown);
295 let _ = writeln!(markdown, "{EXPERIMENT_SENTENCE}");
296 let _ = writeln!(markdown);
297 let _ = writeln!(
298 markdown,
299 "Measurement classification: `{}`.",
300 metrics.measurement_kind
301 );
302 let _ = writeln!(markdown);
303 let _ = writeln!(
304 markdown,
305 "Actual GPU timing measured: `{}`.",
306 metrics.actual_gpu_timing_measured
307 );
308 let _ = writeln!(markdown);
309 let _ = writeln!(
310 markdown,
311 "Kernel: `{}` in `{}`.",
312 metrics.kernel_name, metrics.shader_language
313 );
314 let _ = writeln!(markdown);
315 let _ = writeln!(
316 markdown,
317 "| Label | Scenario | Resolution | Tier | Measured | Adapter | Total ms | Dispatch ms | Readback ms | Trust delta vs CPU |"
318 );
319 let _ = writeln!(
320 markdown,
321 "| --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
322 );
323 for entry in &metrics.entries {
324 let _ = writeln!(
325 markdown,
326 "| {} | {} | {}x{} | {} | {} | {} | {} | {} | {} | {} |",
327 entry.label,
328 entry.scenario_id,
329 entry.width,
330 entry.height,
331 entry.resolution_tier.as_deref().unwrap_or("native"),
332 entry.actual_gpu_timing_measured,
333 entry.adapter_name.as_deref().unwrap_or("unavailable"),
334 format_f64(entry.total_ms),
335 format_f64(entry.dispatch_ms),
336 format_f64(entry.readback_ms),
337 format_f32(entry.mean_abs_trust_delta_vs_cpu),
338 );
339 }
340 let _ = writeln!(markdown);
341 let _ = writeln!(markdown, "## GPU Path Status");
342 let _ = writeln!(markdown);
343 for note in &metrics.notes {
344 let _ = writeln!(markdown, "- {note}");
345 }
346 let _ = writeln!(markdown);
347 let _ = writeln!(markdown, "## How To Run On A GPU Host");
348 let _ = writeln!(markdown);
349 let _ = writeln!(
350 markdown,
351 "```bash\ncargo run --release -- run-gpu-path --output generated/gpu_path\n```"
352 );
353 let _ = writeln!(markdown);
354 let _ = writeln!(markdown, "## What Is Not Proven");
355 let _ = writeln!(markdown);
356 let _ = writeln!(
357 markdown,
358 "- This report does not imply measured GPU performance when `Actual GPU timing measured` is `false`."
359 );
360 let _ = writeln!(
361 markdown,
362 "- It does not replace real engine-side GPU profiling or cache/bandwidth measurement."
363 );
364 let _ = writeln!(markdown);
365 let _ = writeln!(markdown, "## Remaining Blockers");
366 let _ = writeln!(markdown);
367 if metrics.actual_gpu_timing_measured {
368 let _ = writeln!(
369 markdown,
370 "- The kernel is measured, but broader engine-integrated GPU profiling still remains."
371 );
372 } else {
373 let _ = writeln!(
374 markdown,
375 "- A GPU-executable path now exists, but the current environment still lacks measured GPU execution."
376 );
377 }
378 let _ = writeln!(
379 markdown,
380 "- Real engine captures and imported external buffers still need GPU-side evaluation."
381 );
382
383 fs::write(path, markdown)?;
384 Ok(())
385}
386
/// Returns the mean of the element-wise absolute differences between `a` and `b`.
///
/// Only the common prefix of the two slices is compared; surplus elements of
/// the longer slice are ignored.
fn mean_abs_delta(a: &[f32], b: &[f32]) -> f32 {
    // Clamp the divisor to at least one so empty input divides by 1, not 0.
    let divisor = a.len().min(b.len()).max(1) as f32;
    let total: f32 = a
        .iter()
        .zip(b)
        .map(|(&lhs, &rhs)| (lhs - rhs).abs())
        .sum();
    total / divisor
}
395
/// Formats an optional `f64` with three decimal places, or `n/a` when absent.
fn format_f64(value: Option<f64>) -> String {
    match value {
        Some(number) => format!("{number:.3}"),
        None => "n/a".to_string(),
    }
}
401
/// Formats an optional `f32` with six decimal places, or `n/a` when absent.
fn format_f32(value: Option<f32>) -> String {
    match value {
        Some(number) => format!("{number:.6}"),
        None => "n/a".to_string(),
    }
}