// dsfb-computer-graphics 0.1.1

use std::fmt::Write as _;
use std::fs;
use std::path::Path;

use serde::Serialize;

use crate::config::DemoConfig;
use crate::error::{Error, Result};
use crate::external::build_owned_inputs_from_sequence;
use crate::gpu::try_execute_host_minimum_kernel;
use crate::host::{default_host_realistic_profile, supervise_temporal_reuse};
use crate::report::EXPERIMENT_SENTENCE;
use crate::scene::{generate_sequence_for_definition, scenario_by_id, ScenarioId};
use crate::taa::run_fixed_alpha_baseline;

/// One row of the GPU execution study: a single attempt to run the kernel on
/// one input set, together with whatever timing and CPU-parity data was obtained.
#[derive(Clone, Debug, Serialize)]
pub struct GpuExecutionEntry {
    /// Human-readable identifier for the attempt (for example
    /// `gpu_host_minimum_<scenario>` or `gpu_4k_synthetic_probe`).
    pub label: String,
    /// Scenario identifier string, or `synthetic_4k` for the synthetic probe.
    pub scenario_id: String,
    /// Width of the dispatched input, in pixels.
    pub width: usize,
    /// Height of the dispatched input, in pixels.
    pub height: usize,
    /// Index of the sequence frame that was used as input (0 for the synthetic probe).
    pub frame_index: usize,
    /// Whether the GPU path is considered available for this entry.
    // NOTE(review): scenario entries set this to `true` even when no adapter was
    // found, whereas the 4K probe sets it to `false` in that case — confirm the
    // intended meaning ("compiled in" vs. "adapter present").
    pub gpu_path_available: bool,
    /// `true` only when the kernel actually ran and returned timings.
    pub actual_gpu_timing_measured: bool,
    /// Adapter name reported by the GPU run, if one happened.
    pub adapter_name: Option<String>,
    /// Backend name reported by the GPU run, if one happened.
    pub backend: Option<String>,
    /// Total time reported by the GPU run (milliseconds, per the field name).
    pub total_ms: Option<f64>,
    /// Dispatch time reported by the GPU run (milliseconds, per the field name).
    pub dispatch_ms: Option<f64>,
    /// Readback time reported by the GPU run (milliseconds, per the field name).
    pub readback_ms: Option<f64>,
    /// Mean absolute difference between the CPU and GPU trust outputs;
    /// `None` when no CPU reference comparison was made.
    pub mean_abs_trust_delta_vs_cpu: Option<f32>,
    /// Mean absolute difference between the CPU and GPU alpha outputs;
    /// `None` when no CPU reference comparison was made.
    pub mean_abs_alpha_delta_vs_cpu: Option<f32>,
    /// Mean absolute difference between the CPU and GPU intervention outputs;
    /// `None` when no CPU reference comparison was made.
    pub mean_abs_intervention_delta_vs_cpu: Option<f32>,
    /// Workgroup dimensions `[x, y, z]` recorded for this entry.
    pub workgroup_size: [u32; 3],
    /// Resolution tier tag (`native` for scenario runs, `4k_probe` for the synthetic probe).
    pub resolution_tier: Option<String>,
    /// Free-form disclosure notes explaining how the entry was obtained or why it is unmeasured.
    pub notes: Vec<String>,
}

/// Aggregate result of the GPU execution study: the per-attempt entries plus
/// a study-wide measured/unmeasured classification.
#[derive(Clone, Debug, Serialize)]
pub struct GpuExecutionMetrics {
    /// Classification string describing whether any real GPU timing was obtained.
    pub measurement_kind: String,
    /// `true` when at least one entry carries measured GPU timings.
    pub actual_gpu_timing_measured: bool,
    /// Shader language of the kernel (the study sets this to `wgsl`).
    pub shader_language: String,
    /// Name of the kernel that was exercised.
    pub kernel_name: String,
    /// One entry per attempted run, in the order the runs were attempted.
    pub entries: Vec<GpuExecutionEntry>,
    /// Study-level disclosure notes; rendered as a bullet list in the Markdown report.
    pub notes: Vec<String>,
}

pub fn run_gpu_execution_study(config: &DemoConfig) -> Result<GpuExecutionMetrics> {
    let scenarios = [ScenarioId::RevealBand, ScenarioId::MotionBiasBand];
    let mut entries = Vec::new();
    let mut any_measured = false;

    for scenario_id in scenarios {
        let definition = scenario_by_id(&config.scene, scenario_id).ok_or_else(|| {
            Error::Message(format!(
                "GPU execution scenario {} was unavailable",
                scenario_id.as_str()
            ))
        })?;
        let sequence = generate_sequence_for_definition(&definition);
        let frame_index = definition
            .onset_frame
            .min(sequence.frames.len().saturating_sub(1))
            .max(1);
        let fixed_alpha = run_fixed_alpha_baseline(&sequence, config.baseline.fixed_alpha);
        let previous_history = fixed_alpha.taa.resolved_frames.get(frame_index - 1);
        let inputs = build_owned_inputs_from_sequence(&sequence, frame_index, previous_history)?;
        let profile = default_host_realistic_profile(
            config.dsfb_alpha_range.min,
            config.dsfb_alpha_range.max,
        );
        let cpu_outputs = supervise_temporal_reuse(&inputs.borrow(), &profile);

        let maybe_gpu = try_execute_host_minimum_kernel(&inputs, profile.parameters)?;
        match maybe_gpu {
            Some(gpu) => {
                any_measured = true;
                entries.push(GpuExecutionEntry {
                    label: format!("gpu_host_minimum_{}", scenario_id.as_str()),
                    scenario_id: scenario_id.as_str().to_string(),
                    width: inputs.width(),
                    height: inputs.height(),
                    frame_index,
                    gpu_path_available: true,
                    actual_gpu_timing_measured: true,
                    adapter_name: Some(gpu.adapter_name),
                    backend: Some(gpu.backend),
                    total_ms: Some(gpu.total_ms),
                    dispatch_ms: Some(gpu.dispatch_ms),
                    readback_ms: Some(gpu.readback_ms),
                    mean_abs_trust_delta_vs_cpu: Some(mean_abs_delta(
                        cpu_outputs.trust.values(),
                        &gpu.trust,
                    )),
                    mean_abs_alpha_delta_vs_cpu: Some(mean_abs_delta(
                        cpu_outputs.alpha.values(),
                        &gpu.alpha,
                    )),
                    mean_abs_intervention_delta_vs_cpu: Some(mean_abs_delta(
                        cpu_outputs.intervention.values(),
                        &gpu.intervention,
                    )),
                    workgroup_size: [gpu.workgroup_size.0, gpu.workgroup_size.1, gpu.workgroup_size.2],
                    resolution_tier: Some("native".to_string()),
                    notes: vec![
                        "Measured on the current environment because a usable wgpu adapter was available.".to_string(),
                        "The kernel implements the current minimum host-realistic path, which excludes motion disagreement by design.".to_string(),
                    ],
                });
            }
            None => {
                entries.push(GpuExecutionEntry {
                    label: format!("gpu_host_minimum_{}", scenario_id.as_str()),
                    scenario_id: scenario_id.as_str().to_string(),
                    width: inputs.width(),
                    height: inputs.height(),
                    frame_index,
                    gpu_path_available: true,
                    actual_gpu_timing_measured: false,
                    adapter_name: None,
                    backend: None,
                    total_ms: None,
                    dispatch_ms: None,
                    readback_ms: None,
                    mean_abs_trust_delta_vs_cpu: None,
                    mean_abs_alpha_delta_vs_cpu: None,
                    mean_abs_intervention_delta_vs_cpu: None,
                    workgroup_size: [8, 8, 1],
                    resolution_tier: Some("native".to_string()),
                    notes: vec![
                        "The wgpu compute path is compiled into the crate, but no usable GPU adapter was available in the current environment.".to_string(),
                        "Run `cargo run --release -- run-gpu-path --output <dir>` on a GPU host to measure this kernel without changing code.".to_string(),
                    ],
                });
            }
        }
    }

    // 4K synthetic probe - zero-filled buffers, tests dispatch feasibility at 3840x2160
    let probe_4k_result = std::panic::catch_unwind(|| {
        let w4k = 3840usize;
        let h4k = 2160usize;
        let n4k = w4k * h4k;
        let inputs_4k = crate::external::OwnedHostTemporalInputs {
            current_color: crate::frame::ImageFrame::new(w4k, h4k),
            reprojected_history: crate::frame::ImageFrame::new(w4k, h4k),
            motion_vectors: vec![crate::scene::MotionVector { to_prev_x: 0.0, to_prev_y: 0.0 }; n4k],
            current_depth: vec![1.0f32; n4k],
            reprojected_depth: vec![1.0f32; n4k],
            current_normals: vec![crate::scene::Normal3 { x: 0.0, y: 0.0, z: -1.0 }; n4k],
            reprojected_normals: vec![crate::scene::Normal3 { x: 0.0, y: 0.0, z: -1.0 }; n4k],
            visibility_hint: None,
            thin_hint: None,
        };
        let profile = default_host_realistic_profile(
            config.dsfb_alpha_range.min,
            config.dsfb_alpha_range.max,
        );
        try_execute_host_minimum_kernel(&inputs_4k, profile.parameters)
    });

    match probe_4k_result {
        Ok(Ok(Some(gpu))) => {
            any_measured = true;
            entries.push(GpuExecutionEntry {
                label: "gpu_4k_synthetic_probe".to_string(),
                scenario_id: "synthetic_4k".to_string(),
                width: 3840,
                height: 2160,
                frame_index: 0,
                gpu_path_available: true,
                actual_gpu_timing_measured: true,
                adapter_name: Some(gpu.adapter_name),
                backend: Some(gpu.backend),
                total_ms: Some(gpu.total_ms),
                dispatch_ms: Some(gpu.dispatch_ms),
                readback_ms: Some(gpu.readback_ms),
                mean_abs_trust_delta_vs_cpu: None,
                mean_abs_alpha_delta_vs_cpu: None,
                mean_abs_intervention_delta_vs_cpu: None,
                workgroup_size: [8, 8, 1],
                resolution_tier: Some("4k_probe".to_string()),
                notes: vec![
                    "Synthetic zero-filled 4K dispatch. Tests wgpu binding limit raise.".to_string(),
                    "No CPU parity check performed (would require 4K CPU reference run).".to_string(),
                ],
            });
        }
        Ok(Ok(None)) => {
            entries.push(GpuExecutionEntry {
                label: "gpu_4k_synthetic_probe".to_string(),
                scenario_id: "synthetic_4k".to_string(),
                width: 3840,
                height: 2160,
                frame_index: 0,
                gpu_path_available: false,
                actual_gpu_timing_measured: false,
                adapter_name: None,
                backend: None,
                total_ms: None,
                dispatch_ms: None,
                readback_ms: None,
                mean_abs_trust_delta_vs_cpu: None,
                mean_abs_alpha_delta_vs_cpu: None,
                mean_abs_intervention_delta_vs_cpu: None,
                workgroup_size: [8, 8, 1],
                resolution_tier: Some("4k_probe".to_string()),
                notes: vec![
                    "4K synthetic probe: no wgpu adapter available.".to_string(),
                ],
            });
        }
        Ok(Err(e)) => {
            entries.push(GpuExecutionEntry {
                label: "gpu_4k_synthetic_probe".to_string(),
                scenario_id: "synthetic_4k".to_string(),
                width: 3840,
                height: 2160,
                frame_index: 0,
                gpu_path_available: true,
                actual_gpu_timing_measured: false,
                adapter_name: None,
                backend: None,
                total_ms: None,
                dispatch_ms: None,
                readback_ms: None,
                mean_abs_trust_delta_vs_cpu: None,
                mean_abs_alpha_delta_vs_cpu: None,
                mean_abs_intervention_delta_vs_cpu: None,
                workgroup_size: [8, 8, 1],
                resolution_tier: Some("4k_probe".to_string()),
                notes: vec![
                    format!("4K synthetic probe failed with error: {e}"),
                    "Binding limit raise may be insufficient for this adapter.".to_string(),
                ],
            });
        }
        Err(panic_val) => {
            let msg = panic_val
                .downcast_ref::<String>()
                .map(|s| s.as_str())
                .or_else(|| panic_val.downcast_ref::<&str>().copied())
                .unwrap_or("unknown panic");
            entries.push(GpuExecutionEntry {
                label: "gpu_4k_synthetic_probe".to_string(),
                scenario_id: "synthetic_4k".to_string(),
                width: 3840,
                height: 2160,
                frame_index: 0,
                gpu_path_available: true,
                actual_gpu_timing_measured: false,
                adapter_name: None,
                backend: None,
                total_ms: None,
                dispatch_ms: None,
                readback_ms: None,
                mean_abs_trust_delta_vs_cpu: None,
                mean_abs_alpha_delta_vs_cpu: None,
                mean_abs_intervention_delta_vs_cpu: None,
                workgroup_size: [8, 8, 1],
                resolution_tier: Some("4k_probe".to_string()),
                notes: vec![
                    format!("4K synthetic probe panicked: {msg}"),
                    "OOM or driver limit exceeded despite binding limit raise.".to_string(),
                ],
            });
        }
    }

    Ok(GpuExecutionMetrics {
        measurement_kind: if any_measured {
            "actual_gpu_timing_measured".to_string()
        } else {
            "gpu_path_implemented_but_not_measured_in_current_environment".to_string()
        },
        actual_gpu_timing_measured: any_measured,
        shader_language: "wgsl".to_string(),
        kernel_name: "dsfb_host_minimum".to_string(),
        entries,
        notes: vec![
            "This path is intended to remove the 'CPU-only timing proxy' blocker by providing a real GPU-executable kernel and an honest measured-vs-unmeasured disclosure.".to_string(),
            "The current kernel covers the minimum host-realistic supervisory path. Motion disagreement remains an optional extension and is not part of the minimum kernel.".to_string(),
        ],
    })
}

/// Writes the GPU execution study as a Markdown report to `path`.
///
/// Missing parent directories are created first. The report contains the
/// study classification, one table row per entry, the study-level notes, and
/// fixed "how to run", "not proven" and "remaining blockers" sections.
///
/// # Errors
///
/// Returns an error when the parent directory cannot be created or the file
/// cannot be written.
pub fn write_gpu_execution_report(path: &Path, metrics: &GpuExecutionMetrics) -> Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }
    fs::write(path, render_gpu_execution_markdown(metrics))?;
    Ok(())
}

/// Renders the full Markdown text of the GPU execution report.
fn render_gpu_execution_markdown(metrics: &GpuExecutionMetrics) -> String {
    let mut out = String::new();

    // Formatting into a `String` cannot fail, so the `fmt::Result`s are discarded.
    out.push_str("# GPU Execution Report\n\n");
    let _ = writeln!(out, "{EXPERIMENT_SENTENCE}");
    out.push('\n');
    let _ = writeln!(
        out,
        "Measurement classification: `{}`.",
        metrics.measurement_kind
    );
    out.push('\n');
    let _ = writeln!(
        out,
        "Actual GPU timing measured: `{}`.",
        metrics.actual_gpu_timing_measured
    );
    out.push('\n');
    let _ = writeln!(
        out,
        "Kernel: `{}` in `{}`.",
        metrics.kernel_name, metrics.shader_language
    );
    out.push('\n');

    // Per-entry results table.
    out.push_str(
        "| Label | Scenario | Resolution | Tier | Measured | Adapter | Total ms | Dispatch ms | Readback ms | Trust delta vs CPU |\n",
    );
    out.push_str("| --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: |\n");
    for entry in &metrics.entries {
        let tier = entry.resolution_tier.as_deref().unwrap_or("native");
        let adapter = entry.adapter_name.as_deref().unwrap_or("unavailable");
        let _ = writeln!(
            out,
            "| {} | {} | {}x{} | {} | {} | {} | {} | {} | {} | {} |",
            entry.label,
            entry.scenario_id,
            entry.width,
            entry.height,
            tier,
            entry.actual_gpu_timing_measured,
            adapter,
            format_f64(entry.total_ms),
            format_f64(entry.dispatch_ms),
            format_f64(entry.readback_ms),
            format_f32(entry.mean_abs_trust_delta_vs_cpu),
        );
    }

    out.push_str("\n## GPU Path Status\n\n");
    for note in &metrics.notes {
        let _ = writeln!(out, "- {note}");
    }

    out.push_str("\n## How To Run On A GPU Host\n\n");
    out.push_str("```bash\ncargo run --release -- run-gpu-path --output generated/gpu_path\n```\n");

    out.push_str("\n## What Is Not Proven\n\n");
    out.push_str(
        "- This report does not imply measured GPU performance when `Actual GPU timing measured` is `false`.\n",
    );
    out.push_str(
        "- It does not replace real engine-side GPU profiling or cache/bandwidth measurement.\n",
    );

    out.push_str("\n## Remaining Blockers\n\n");
    let measurement_blocker = if metrics.actual_gpu_timing_measured {
        "- The kernel is measured, but broader engine-integrated GPU profiling still remains.\n"
    } else {
        "- A GPU-executable path now exists, but the current environment still lacks measured GPU execution.\n"
    };
    out.push_str(measurement_blocker);
    out.push_str(
        "- Real engine captures and imported external buffers still need GPU-side evaluation.\n",
    );

    out
}

/// Returns the mean absolute element-wise difference between `a` and `b`.
///
/// Only the overlapping prefix is compared when the slices differ in length,
/// and two slices with no overlap yield `0.0`.
///
/// The sum is accumulated in `f64` and narrowed once at the end: an `f32`
/// accumulator silently drops small deltas once the running total grows large,
/// which skews the parity metric on big frames.
fn mean_abs_delta(a: &[f32], b: &[f32]) -> f32 {
    let compared = a.len().min(b.len());
    if compared == 0 {
        return 0.0;
    }
    let total: f64 = a
        .iter()
        .zip(b)
        .map(|(left, right)| (f64::from(*left) - f64::from(*right)).abs())
        .sum();
    // Narrowing back to f32 is intentional: callers store the metric as f32.
    (total / compared as f64) as f32
}

/// Formats an optional `f64` with three decimal places, or `n/a` when absent.
fn format_f64(value: Option<f64>) -> String {
    match value {
        Some(number) => format!("{number:.3}"),
        None => String::from("n/a"),
    }
}

/// Formats an optional `f32` with six decimal places, or `n/a` when absent.
fn format_f32(value: Option<f32>) -> String {
    match value {
        Some(number) => format!("{number:.6}"),
        None => String::from("n/a"),
    }
}