Skip to main content

dsfb_computer_graphics/
fast_path.rs

1//! Minimal Inline Deployment (Fast-Path Proxy)
2//!
3//! This module implements a reduced per-pixel proxy derived from residual evolution.
4//! It is NOT the full DSFB supervisory system. It is a constrained proxy
5//! intended to assess deployment feasibility under strict real-time budgets.
6//!
7//! The proxy computes per pixel:
8//!   r_t = L1(C_t - H_t) / 3          residual magnitude
9//!   d_t = r_t - r_{t-1}               drift
10//!   s_t = d_t - d_{t-1}               slew
11//!   u_t = |d_t| + lambda * |s_t|      scalar proxy
12//!   T_t = saturate(1 - k * u_t)       trust
13//!
14//! Optionally, u_t is averaged over a 3×3 neighborhood before trust computation.
15//!
16//! State: two scalar per-pixel history buffers (residual, drift).
17//! Output: one scalar per-pixel trust buffer.
18//!
19//! No heap allocation occurs in the inner loop of the CPU reference path.
20
21use std::sync::mpsc;
22use std::time::Instant;
23
24use bytemuck::{Pod, Zeroable};
25use serde::Serialize;
26use wgpu::util::DeviceExt;
27
28use crate::error::{Error, Result};
29
30// ─── Constants ────────────────────────────────────────────────────────────────
31
32/// Default slew weight λ.  Applied to |s_t| in the proxy formula.
33pub const FAST_PATH_LAMBDA: f32 = 0.5;
34
35/// Default trust slope k.  Scales the proxy before saturation.
36pub const FAST_PATH_K: f32 = 2.0;
37
38// ─── CPU reference ────────────────────────────────────────────────────────────
39
40/// Output from one frame of the CPU reference fast-path computation.
41pub struct FastPathCpuOutput {
42    /// Trust values, one f32 per pixel, in \[0, 1\].
43    pub trust: Vec<f32>,
44    /// Updated residual history (r_t), to be fed as `residual_history_in` next frame.
45    pub residual_history_out: Vec<f32>,
46    /// Updated drift history (d_t), to be fed as `drift_history_in` next frame.
47    pub drift_history_out: Vec<f32>,
48}
49
50/// CPU reference implementation of the minimal inline fast-path proxy.
51///
52/// Inputs are flat slices of per-pixel \[R, G, B\] triples.
53/// All output `Vec`s are pre-allocated before the inner loop; no allocations
54/// occur inside the per-pixel computation.
55///
56/// This function is deterministic: identical inputs produce identical outputs.
57pub fn run_fast_path_cpu(
58    current: &[[f32; 3]],
59    history: &[[f32; 3]],
60    residual_history_in: &[f32],
61    drift_history_in: &[f32],
62    width: usize,
63    height: usize,
64    lambda: f32,
65    k: f32,
66    local_aggregation: bool,
67) -> FastPathCpuOutput {
68    let pixel_count = width * height;
69    assert_eq!(current.len(), pixel_count);
70    assert_eq!(history.len(), pixel_count);
71    assert_eq!(residual_history_in.len(), pixel_count);
72    assert_eq!(drift_history_in.len(), pixel_count);
73
74    // Pre-allocate output buffers before the inner computation loop.
75    // No allocation occurs inside the per-pixel loops below.
76    let mut trust = vec![0.0f32; pixel_count];
77    let mut residual_history_out = vec![0.0f32; pixel_count];
78    let mut drift_history_out = vec![0.0f32; pixel_count];
79    let mut u_field = vec![0.0f32; pixel_count];
80
81    // First pass: compute r_t, d_t, s_t, u_t for all pixels.
82    // History outputs (r_t, d_t) are always written regardless of aggregation mode.
83    for i in 0..pixel_count {
84        let c = current[i];
85        let h = history[i];
86        let r_t = ((c[0] - h[0]).abs() + (c[1] - h[1]).abs() + (c[2] - h[2]).abs()) / 3.0;
87        let d_t = r_t - residual_history_in[i];
88        let s_t = d_t - drift_history_in[i];
89        let u_t = d_t.abs() + lambda * s_t.abs();
90        u_field[i] = u_t;
91        residual_history_out[i] = r_t;
92        drift_history_out[i] = d_t;
93    }
94
95    if local_aggregation {
96        // Optional 3×3 mean of u_t before trust computation.
97        // Uses only stack-local accumulators; no additional heap allocation inside the loop.
98        for y in 0..height {
99            for x in 0..width {
100                let mut u_sum = 0.0f32;
101                for oy in -1i32..=1 {
102                    for ox in -1i32..=1 {
103                        let nx = (x as i32 + ox).clamp(0, width as i32 - 1) as usize;
104                        let ny = (y as i32 + oy).clamp(0, height as i32 - 1) as usize;
105                        u_sum += u_field[ny * width + nx];
106                    }
107                }
108                let u_prime = u_sum / 9.0;
109                trust[y * width + x] = (1.0 - k * u_prime).clamp(0.0, 1.0);
110            }
111        }
112    } else {
113        for i in 0..pixel_count {
114            trust[i] = (1.0 - k * u_field[i]).clamp(0.0, 1.0);
115        }
116    }
117
118    FastPathCpuOutput {
119        trust,
120        residual_history_out,
121        drift_history_out,
122    }
123}
124
125// ─── GPU shader ───────────────────────────────────────────────────────────────
126
127const FAST_PATH_SHADER: &str = r#"
128struct Params {
129    // [width, height, has_local_agg (0=off, 1=on), _padding]
130    size: vec4<u32>,
131    // [lambda, k, _, _]
132    coefficients: vec4<f32>,
133}
134
135@group(0) @binding(0) var<storage, read>       current_color:        array<vec4<f32>>;
136@group(0) @binding(1) var<storage, read>       reprojected_history:  array<vec4<f32>>;
137@group(0) @binding(2) var<storage, read>       residual_history_in:  array<f32>;
138@group(0) @binding(3) var<storage, read>       drift_history_in:     array<f32>;
139@group(0) @binding(4) var<uniform>             params:               Params;
140@group(0) @binding(5) var<storage, read_write> trust_out:            array<f32>;
141@group(0) @binding(6) var<storage, read_write> residual_history_out: array<f32>;
142@group(0) @binding(7) var<storage, read_write> drift_history_out:    array<f32>;
143
144fn l1_residual(a: vec3<f32>, b: vec3<f32>) -> f32 {
145    return (abs(a.x - b.x) + abs(a.y - b.y) + abs(a.z - b.z)) / 3.0;
146}
147
148/// Compute the proxy scalar u for an arbitrary (clamped) pixel coordinate.
149/// Used by the optional 3×3 local aggregation path.
150fn proxy_u_at(xi: i32, yi: i32) -> f32 {
151    let w = i32(params.size.x);
152    let h = i32(params.size.y);
153    let xc = clamp(xi, 0, w - 1);
154    let yc = clamp(yi, 0, h - 1);
155    let j = u32(yc) * params.size.x + u32(xc);
156    let cur  = current_color[j].xyz;
157    let hist = reprojected_history[j].xyz;
158    let r_prev = residual_history_in[j];
159    let d_prev = drift_history_in[j];
160    let r_j = l1_residual(cur, hist);
161    let d_j = r_j - r_prev;
162    let s_j = d_j - d_prev;
163    return abs(d_j) + params.coefficients.x * abs(s_j);
164}
165
166/// Single-pass fast-path proxy kernel.
167///
168/// Computes r_t, d_t, s_t, u_t per pixel.
169/// Writes trust (with optional 3×3 local mean of u), r_t, and d_t.
170@compute @workgroup_size(1, 1, 1)
171fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
172    let w = params.size.x;
173    let h = params.size.y;
174    if (gid.x >= w || gid.y >= h) { return; }
175
176    let i = gid.y * w + gid.x;
177    let lambda = params.coefficients.x;
178    let k      = params.coefficients.y;
179
180    let cur  = current_color[i].xyz;
181    let hist = reprojected_history[i].xyz;
182    let r_prev = residual_history_in[i];
183    let d_prev = drift_history_in[i];
184
185    let r_t = l1_residual(cur, hist);
186    let d_t = r_t - r_prev;
187    let s_t = d_t - d_prev;
188
189    var trust: f32;
190    if (params.size.z == 1u) {
191        // Optional 3×3 local mean of u before trust.
192        var u_sum = 0.0;
193        for (var oy: i32 = -1; oy <= 1; oy = oy + 1) {
194            for (var ox: i32 = -1; ox <= 1; ox = ox + 1) {
195                u_sum = u_sum + proxy_u_at(i32(gid.x) + ox, i32(gid.y) + oy);
196            }
197        }
198        trust = clamp(1.0 - k * (u_sum / 9.0), 0.0, 1.0);
199    } else {
200        let u_t = abs(d_t) + lambda * abs(s_t);
201        trust = clamp(1.0 - k * u_t, 0.0, 1.0);
202    }
203
204    trust_out[i]            = trust;
205    residual_history_out[i] = r_t;
206    drift_history_out[i]    = d_t;
207}
208"#;
209
210// ─── GPU types ────────────────────────────────────────────────────────────────
211
212#[repr(C)]
213#[derive(Clone, Copy, Pod, Zeroable)]
214struct FpGpuParams {
215    size: [u32; 4],
216    coefficients: [f32; 4],
217}
218
219#[repr(C)]
220#[derive(Clone, Copy, Pod, Zeroable)]
221struct FpColor4 {
222    value: [f32; 4],
223}
224
225// ─── GPU result ───────────────────────────────────────────────────────────────
226
227/// Result from a single-frame GPU fast-path kernel execution.
228#[derive(Clone, Debug, Serialize)]
229pub struct FastPathGpuOutput {
230    pub trust: Vec<f32>,
231    pub residual_history_out: Vec<f32>,
232    pub drift_history_out: Vec<f32>,
233    /// Wall-clock time including dispatch + readback (ms).
234    pub total_ms: f64,
235    /// Wall-clock time for dispatch + poll only (ms).
236    pub dispatch_ms: f64,
237    pub adapter_name: String,
238    pub backend: String,
239}
240
241/// Execute one frame of the GPU fast-path proxy.
242///
243/// Returns `Ok(None)` if no wgpu adapter is available.
244pub fn try_run_fast_path_gpu(
245    current: &[[f32; 3]],
246    history: &[[f32; 3]],
247    residual_history_in: &[f32],
248    drift_history_in: &[f32],
249    width: usize,
250    height: usize,
251    lambda: f32,
252    k: f32,
253    local_aggregation: bool,
254) -> Result<Option<FastPathGpuOutput>> {
255    pollster::block_on(try_run_fast_path_gpu_async(
256        current,
257        history,
258        residual_history_in,
259        drift_history_in,
260        width,
261        height,
262        lambda,
263        k,
264        local_aggregation,
265    ))
266}
267
268async fn try_run_fast_path_gpu_async(
269    current: &[[f32; 3]],
270    history: &[[f32; 3]],
271    residual_history_in: &[f32],
272    drift_history_in: &[f32],
273    width: usize,
274    height: usize,
275    lambda: f32,
276    k: f32,
277    local_aggregation: bool,
278) -> Result<Option<FastPathGpuOutput>> {
279    let instance = wgpu::Instance::default();
280    let adapter = match instance
281        .request_adapter(&wgpu::RequestAdapterOptions {
282            power_preference: wgpu::PowerPreference::HighPerformance,
283            compatible_surface: None,
284            force_fallback_adapter: false,
285        })
286        .await
287    {
288        Some(a) => a,
289        None => return Ok(None),
290    };
291    let adapter_info = adapter.get_info();
292    let adapter_limits = adapter.limits();
293    let (device, queue) = adapter
294        .request_device(
295            &wgpu::DeviceDescriptor {
296                label: Some("dsfb-fast-path"),
297                required_features: wgpu::Features::empty(),
298                required_limits: wgpu::Limits {
299                    max_storage_buffer_binding_size: adapter_limits
300                        .max_storage_buffer_binding_size,
301                    max_buffer_size: adapter_limits.max_buffer_size,
302                    ..wgpu::Limits::default()
303                },
304            },
305            None,
306        )
307        .await
308        .map_err(|e| Error::Message(format!("wgpu device request failed: {e}")))?;
309
310    let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
311        label: Some("dsfb-fast-path-wgsl"),
312        source: wgpu::ShaderSource::Wgsl(FAST_PATH_SHADER.into()),
313    });
314    let bind_group_layout = make_bind_group_layout(&device);
315    let pipeline = make_pipeline(&device, &shader, &bind_group_layout);
316
317    let pixel_count = width * height;
318    let current_packed = pack_colors(current);
319    let history_packed = pack_colors(history);
320    let params_val = FpGpuParams {
321        size: [width as u32, height as u32, u32::from(local_aggregation), 0],
322        coefficients: [lambda, k, 0.0, 0.0],
323    };
324
325    let out_bytes = (pixel_count * std::mem::size_of::<f32>()) as u64;
326    let (
327        cur_buf,
328        hist_buf,
329        res_in_buf,
330        drift_in_buf,
331        params_buf,
332        trust_buf,
333        res_out_buf,
334        drift_out_buf,
335    ) = upload_gpu_buffers(
336        &device,
337        &current_packed,
338        &history_packed,
339        residual_history_in,
340        drift_history_in,
341        &params_val,
342        pixel_count,
343    );
344    let trust_staging = make_staging(&device, out_bytes, "fp-trust-stg");
345    let res_staging = make_staging(&device, out_bytes, "fp-res-stg");
346    let drift_staging = make_staging(&device, out_bytes, "fp-drift-stg");
347
348    let bind_group = make_bind_group(
349        &device,
350        &bind_group_layout,
351        &cur_buf,
352        &hist_buf,
353        &res_in_buf,
354        &drift_in_buf,
355        &params_buf,
356        &trust_buf,
357        &res_out_buf,
358        &drift_out_buf,
359    );
360
361    let total_start = Instant::now();
362    let dispatch_start = Instant::now();
363    let mut encoder =
364        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: Some("fp-enc") });
365    {
366        let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
367            label: Some("fp-pass"),
368            timestamp_writes: None,
369        });
370        pass.set_pipeline(&pipeline);
371        pass.set_bind_group(0, &bind_group, &[]);
372        pass.dispatch_workgroups(width as u32, height as u32, 1);
373    }
374    encoder.copy_buffer_to_buffer(&trust_buf, 0, &trust_staging, 0, out_bytes);
375    encoder.copy_buffer_to_buffer(&res_out_buf, 0, &res_staging, 0, out_bytes);
376    encoder.copy_buffer_to_buffer(&drift_out_buf, 0, &drift_staging, 0, out_bytes);
377    queue.submit(Some(encoder.finish()));
378    device.poll(wgpu::Maintain::Wait);
379    let dispatch_ms = dispatch_start.elapsed().as_secs_f64() * 1000.0;
380
381    let trust = read_f32_buf(&device, &trust_staging, pixel_count)?;
382    let residual_out = read_f32_buf(&device, &res_staging, pixel_count)?;
383    let drift_out = read_f32_buf(&device, &drift_staging, pixel_count)?;
384    let total_ms = total_start.elapsed().as_secs_f64() * 1000.0;
385
386    Ok(Some(FastPathGpuOutput {
387        trust,
388        residual_history_out: residual_out,
389        drift_history_out: drift_out,
390        total_ms,
391        dispatch_ms,
392        adapter_name: adapter_info.name,
393        backend: format!("{:?}", adapter_info.backend),
394    }))
395}
396
397// ─── Timing study ─────────────────────────────────────────────────────────────
398
399/// One row in the fast-path timing study output.
400#[derive(Clone, Debug, Serialize)]
401pub struct FastPathTimingEntry {
402    pub resolution_label: String,
403    pub width: usize,
404    pub height: usize,
405    pub pixel_count: usize,
406    pub warmup_runs: usize,
407    pub measured_runs: usize,
408    pub local_aggregation: bool,
409    /// Mean wall-clock dispatch+poll time across measured runs (ms).
410    pub mean_dispatch_ms: f64,
411    /// Mean wall-clock total (dispatch+readback) time across measured runs (ms).
412    pub mean_total_ms: f64,
413    pub min_dispatch_ms: f64,
414    pub max_dispatch_ms: f64,
415    pub adapter_name: Option<String>,
416    pub backend: Option<String>,
417    pub actual_gpu_timing: bool,
418    pub notes: Vec<String>,
419}
420
421/// Full result from `run_fast_path_timing_study`.
422#[derive(Clone, Debug, Serialize)]
423pub struct FastPathTimingStudy {
424    pub measurement_kind: String,
425    pub actual_gpu_timing: bool,
426    pub lambda: f32,
427    pub k: f32,
428    pub entries: Vec<FastPathTimingEntry>,
429    pub notes: Vec<String>,
430}
431
432/// Run the GPU timing study for the fast-path proxy at 1080p and 4K.
433///
434/// Uses synthetic uniform-colour inputs at each resolution.
435/// Runs `warmup_runs` iterations before measuring, then averages `measured_runs`.
436///
437/// Returns measured timings if a wgpu adapter is available; otherwise returns a
438/// study with `actual_gpu_timing = false` and no entries.
439pub fn run_fast_path_timing_study() -> Result<FastPathTimingStudy> {
440    let resolutions: &[(&str, usize, usize)] = if cfg!(debug_assertions) {
441        // Reduced sizes for debug/test builds.
442        &[("854x480_debug", 854, 480), ("1280x720_debug", 1280, 720)]
443    } else {
444        &[("1920x1080", 1920, 1080), ("3840x2160", 3840, 2160)]
445    };
446
447    let warmup_runs = 3usize;
448    let measured_runs = 10usize;
449
450    // Acquire GPU device once; reuse across all resolution tests.
451    let gpu = pollster::block_on(acquire_device());
452    let (device, queue, adapter_name, backend) = match gpu {
453        Some(g) => g,
454        None => {
455            return Ok(FastPathTimingStudy {
456                measurement_kind: "gpu_unavailable_no_measurement".to_string(),
457                actual_gpu_timing: false,
458                lambda: FAST_PATH_LAMBDA,
459                k: FAST_PATH_K,
460                entries: vec![],
461                notes: vec![
462                    "No wgpu adapter was available. GPU timing could not be measured.".to_string(),
463                ],
464            });
465        }
466    };
467
468    // Build shader + pipeline once; shared across all resolutions.
469    let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
470        label: Some("fp-timing-shader"),
471        source: wgpu::ShaderSource::Wgsl(FAST_PATH_SHADER.into()),
472    });
473    let bgl = make_bind_group_layout(&device);
474    let pipeline = make_pipeline(&device, &shader, &bgl);
475
476    let mut entries = Vec::new();
477
478    for &(label, width, height) in resolutions {
479        let pixel_count = width * height;
480        // Synthetic inputs: uniform mid-grey current, slightly darker history, zero histories.
481        let current_rgb: Vec<[f32; 3]> = vec![[0.5f32, 0.5, 0.5]; pixel_count];
482        let history_rgb: Vec<[f32; 3]> = vec![[0.48f32, 0.48, 0.48]; pixel_count];
483        let res_in = vec![0.0f32; pixel_count];
484        let drift_in = vec![0.0f32; pixel_count];
485
486        let current_packed = pack_colors(&current_rgb);
487        let history_packed = pack_colors(&history_rgb);
488        let params_val = FpGpuParams {
489            size: [width as u32, height as u32, 0, 0],
490            coefficients: [FAST_PATH_LAMBDA, FAST_PATH_K, 0.0, 0.0],
491        };
492        let out_bytes = (pixel_count * std::mem::size_of::<f32>()) as u64;
493
494        let (
495            cur_buf,
496            hist_buf,
497            res_in_buf,
498            drift_in_buf,
499            params_buf,
500            trust_buf,
501            res_out_buf,
502            drift_out_buf,
503        ) = upload_gpu_buffers(
504            &device,
505            &current_packed,
506            &history_packed,
507            &res_in,
508            &drift_in,
509            &params_val,
510            pixel_count,
511        );
512        let trust_staging = make_staging(&device, out_bytes, "fp-timing-stg");
513        let bind_group = make_bind_group(
514            &device,
515            &bgl,
516            &cur_buf,
517            &hist_buf,
518            &res_in_buf,
519            &drift_in_buf,
520            &params_buf,
521            &trust_buf,
522            &res_out_buf,
523            &drift_out_buf,
524        );
525
526        // Warmup.
527        for _ in 0..warmup_runs {
528            let mut enc = device
529                .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
530            {
531                let mut pass = enc.begin_compute_pass(&wgpu::ComputePassDescriptor {
532                    label: None,
533                    timestamp_writes: None,
534                });
535                pass.set_pipeline(&pipeline);
536                pass.set_bind_group(0, &bind_group, &[]);
537                pass.dispatch_workgroups(width as u32, height as u32, 1);
538            }
539            queue.submit(Some(enc.finish()));
540            device.poll(wgpu::Maintain::Wait);
541        }
542
543        // Measured runs.
544        let mut dispatch_times = Vec::with_capacity(measured_runs);
545        let mut total_times = Vec::with_capacity(measured_runs);
546        for _ in 0..measured_runs {
547            let t0 = Instant::now();
548            let d0 = Instant::now();
549            let mut enc = device
550                .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
551            {
552                let mut pass = enc.begin_compute_pass(&wgpu::ComputePassDescriptor {
553                    label: None,
554                    timestamp_writes: None,
555                });
556                pass.set_pipeline(&pipeline);
557                pass.set_bind_group(0, &bind_group, &[]);
558                pass.dispatch_workgroups(width as u32, height as u32, 1);
559            }
560            // Include one buffer copy (trust only) to bound actual transfer cost.
561            enc.copy_buffer_to_buffer(&trust_buf, 0, &trust_staging, 0, out_bytes);
562            queue.submit(Some(enc.finish()));
563            device.poll(wgpu::Maintain::Wait);
564            dispatch_times.push(d0.elapsed().as_secs_f64() * 1000.0);
565            total_times.push(t0.elapsed().as_secs_f64() * 1000.0);
566        }
567
568        let n = measured_runs as f64;
569        let mean_dispatch = dispatch_times.iter().sum::<f64>() / n;
570        let mean_total = total_times.iter().sum::<f64>() / n;
571        let min_d = dispatch_times.iter().cloned().fold(f64::INFINITY, f64::min);
572        let max_d = dispatch_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
573
574        entries.push(FastPathTimingEntry {
575            resolution_label: label.to_string(),
576            width,
577            height,
578            pixel_count,
579            warmup_runs,
580            measured_runs,
581            local_aggregation: false,
582            mean_dispatch_ms: mean_dispatch,
583            mean_total_ms: mean_total,
584            min_dispatch_ms: min_d,
585            max_dispatch_ms: max_d,
586            adapter_name: Some(adapter_name.clone()),
587            backend: Some(backend.clone()),
588            actual_gpu_timing: true,
589            notes: vec![
590                "Timing uses wgpu dispatch + Maintain::Wait (CPU-side wall clock).".to_string(),
591                "Inputs are synthetic uniform-colour buffers; no real capture data required.".to_string(),
592                "This reflects the reduced proxy only, not the full DSFB supervisory system.".to_string(),
593            ],
594        });
595    }
596
597    Ok(FastPathTimingStudy {
598        measurement_kind: "gpu_fast_path_proxy_cpu_wall_clock".to_string(),
599        actual_gpu_timing: true,
600        lambda: FAST_PATH_LAMBDA,
601        k: FAST_PATH_K,
602        entries,
603        notes: vec![
604            "These timings reflect the minimal inline deployment proxy.".to_string(),
605            "They must not be interpreted as the cost of the full DSFB supervisory system.".to_string(),
606        ],
607    })
608}
609
610// ─── GPU helpers ──────────────────────────────────────────────────────────────
611
612async fn acquire_device() -> Option<(wgpu::Device, wgpu::Queue, String, String)> {
613    let instance = wgpu::Instance::default();
614    let adapter = instance
615        .request_adapter(&wgpu::RequestAdapterOptions {
616            power_preference: wgpu::PowerPreference::HighPerformance,
617            compatible_surface: None,
618            force_fallback_adapter: false,
619        })
620        .await?;
621    let info = adapter.get_info();
622    let adapter_limits = adapter.limits();
623    let (device, queue) = adapter
624        .request_device(
625            &wgpu::DeviceDescriptor {
626                label: Some("dsfb-fast-path-timing"),
627                required_features: wgpu::Features::empty(),
628                required_limits: wgpu::Limits {
629                    max_storage_buffer_binding_size: adapter_limits
630                        .max_storage_buffer_binding_size,
631                    max_buffer_size: adapter_limits.max_buffer_size,
632                    ..wgpu::Limits::default()
633                },
634            },
635            None,
636        )
637        .await
638        .ok()?;
639    Some((device, queue, info.name, format!("{:?}", info.backend)))
640}
641
642fn make_bind_group_layout(device: &wgpu::Device) -> wgpu::BindGroupLayout {
643    device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
644        label: Some("fp-bgl"),
645        entries: &[
646            storage_entry(0, true),
647            storage_entry(1, true),
648            storage_entry(2, true),
649            storage_entry(3, true),
650            uniform_entry(4),
651            storage_entry(5, false),
652            storage_entry(6, false),
653            storage_entry(7, false),
654        ],
655    })
656}
657
658fn make_pipeline(
659    device: &wgpu::Device,
660    shader: &wgpu::ShaderModule,
661    bgl: &wgpu::BindGroupLayout,
662) -> wgpu::ComputePipeline {
663    let layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
664        label: None,
665        bind_group_layouts: &[bgl],
666        push_constant_ranges: &[],
667    });
668    device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
669        label: Some("dsfb-fast-path-pipeline"),
670        layout: Some(&layout),
671        module: shader,
672        entry_point: "main",
673    })
674}
675
676#[allow(clippy::too_many_arguments)]
677fn upload_gpu_buffers(
678    device: &wgpu::Device,
679    current_packed: &[FpColor4],
680    history_packed: &[FpColor4],
681    residual_in: &[f32],
682    drift_in: &[f32],
683    params: &FpGpuParams,
684    pixel_count: usize,
685) -> (
686    wgpu::Buffer,
687    wgpu::Buffer,
688    wgpu::Buffer,
689    wgpu::Buffer,
690    wgpu::Buffer,
691    wgpu::Buffer,
692    wgpu::Buffer,
693    wgpu::Buffer,
694) {
695    let out_size = (pixel_count * std::mem::size_of::<f32>()) as u64;
696    let cur = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
697        label: Some("fp-cur"),
698        contents: bytemuck::cast_slice(current_packed),
699        usage: wgpu::BufferUsages::STORAGE,
700    });
701    let hist = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
702        label: Some("fp-hist"),
703        contents: bytemuck::cast_slice(history_packed),
704        usage: wgpu::BufferUsages::STORAGE,
705    });
706    let res_in = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
707        label: Some("fp-res-in"),
708        contents: bytemuck::cast_slice(residual_in),
709        usage: wgpu::BufferUsages::STORAGE,
710    });
711    let drift_in_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
712        label: Some("fp-drift-in"),
713        contents: bytemuck::cast_slice(drift_in),
714        usage: wgpu::BufferUsages::STORAGE,
715    });
716    let params_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
717        label: Some("fp-params"),
718        contents: bytemuck::bytes_of(params),
719        usage: wgpu::BufferUsages::UNIFORM,
720    });
721    let trust_out = device.create_buffer(&wgpu::BufferDescriptor {
722        label: Some("fp-trust"),
723        size: out_size,
724        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
725        mapped_at_creation: false,
726    });
727    let res_out = device.create_buffer(&wgpu::BufferDescriptor {
728        label: Some("fp-res-out"),
729        size: out_size,
730        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
731        mapped_at_creation: false,
732    });
733    let drift_out = device.create_buffer(&wgpu::BufferDescriptor {
734        label: Some("fp-drift-out"),
735        size: out_size,
736        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
737        mapped_at_creation: false,
738    });
739    (cur, hist, res_in, drift_in_buf, params_buf, trust_out, res_out, drift_out)
740}
741
742#[allow(clippy::too_many_arguments)]
743fn make_bind_group(
744    device: &wgpu::Device,
745    bgl: &wgpu::BindGroupLayout,
746    cur: &wgpu::Buffer,
747    hist: &wgpu::Buffer,
748    res_in: &wgpu::Buffer,
749    drift_in: &wgpu::Buffer,
750    params: &wgpu::Buffer,
751    trust_out: &wgpu::Buffer,
752    res_out: &wgpu::Buffer,
753    drift_out: &wgpu::Buffer,
754) -> wgpu::BindGroup {
755    device.create_bind_group(&wgpu::BindGroupDescriptor {
756        label: None,
757        layout: bgl,
758        entries: &[
759            buf_entry(0, cur),
760            buf_entry(1, hist),
761            buf_entry(2, res_in),
762            buf_entry(3, drift_in),
763            buf_entry(4, params),
764            buf_entry(5, trust_out),
765            buf_entry(6, res_out),
766            buf_entry(7, drift_out),
767        ],
768    })
769}
770
771fn make_staging(device: &wgpu::Device, size: u64, label: &str) -> wgpu::Buffer {
772    device.create_buffer(&wgpu::BufferDescriptor {
773        label: Some(label),
774        size,
775        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
776        mapped_at_creation: false,
777    })
778}
779
780fn read_f32_buf(device: &wgpu::Device, buf: &wgpu::Buffer, count: usize) -> Result<Vec<f32>> {
781    let slice = buf.slice(..);
782    let (tx, rx) = mpsc::channel();
783    slice.map_async(wgpu::MapMode::Read, move |r| {
784        let _ = tx.send(r);
785    });
786    device.poll(wgpu::Maintain::Wait);
787    rx.recv()
788        .map_err(|_| Error::Message("GPU map_async channel closed".to_string()))?
789        .map_err(|e| Error::Message(format!("GPU map_async failed: {e}")))?;
790    let mapped = slice.get_mapped_range();
791    let values: Vec<f32> = bytemuck::cast_slice(&mapped).to_vec();
792    drop(mapped);
793    buf.unmap();
794    if values.len() != count {
795        return Err(Error::Message(format!(
796            "GPU readback: expected {count} f32 values, got {}",
797            values.len()
798        )));
799    }
800    Ok(values)
801}
802
803fn pack_colors(pixels: &[[f32; 3]]) -> Vec<FpColor4> {
804    pixels
805        .iter()
806        .map(|&[r, g, b]| FpColor4 { value: [r, g, b, 1.0] })
807        .collect()
808}
809
810fn storage_entry(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry {
811    wgpu::BindGroupLayoutEntry {
812        binding,
813        visibility: wgpu::ShaderStages::COMPUTE,
814        ty: wgpu::BindingType::Buffer {
815            ty: wgpu::BufferBindingType::Storage { read_only },
816            has_dynamic_offset: false,
817            min_binding_size: None,
818        },
819        count: None,
820    }
821}
822
823fn uniform_entry(binding: u32) -> wgpu::BindGroupLayoutEntry {
824    wgpu::BindGroupLayoutEntry {
825        binding,
826        visibility: wgpu::ShaderStages::COMPUTE,
827        ty: wgpu::BindingType::Buffer {
828            ty: wgpu::BufferBindingType::Uniform,
829            has_dynamic_offset: false,
830            min_binding_size: None,
831        },
832        count: None,
833    }
834}
835
836fn buf_entry(binding: u32, buffer: &wgpu::Buffer) -> wgpu::BindGroupEntry<'_> {
837    wgpu::BindGroupEntry {
838        binding,
839        resource: buffer.as_entire_binding(),
840    }
841}
842
843// ─── SVG trust visualisation ──────────────────────────────────────────────────
844
845/// Render a trust field as a simple SVG heat-map strip (one rect per pixel column).
846///
847/// Used only for artifact visualisation; not part of the core computation.
848pub fn render_trust_strip_svg(trust: &[f32], width: usize, height: usize) -> String {
849    let sample_width = width.min(256);
850    let step = width / sample_width.max(1);
851    let bar_h = 40usize;
852    let total_w = sample_width * 2;
853    let total_h = bar_h + 20;
854    let mut svg = format!(
855        r#"<svg xmlns="http://www.w3.org/2000/svg" width="{total_w}" height="{total_h}">"#
856    );
857    // Sample the middle row.
858    let mid_y = height / 2;
859    for col in 0..sample_width {
860        let src_x = (col * step).min(width.saturating_sub(1));
861        let i = mid_y * width + src_x;
862        let t = trust.get(i).copied().unwrap_or(1.0).clamp(0.0, 1.0);
863        let g = (t * 220.0) as u8;
864        let r = ((1.0 - t) * 220.0) as u8;
865        svg.push_str(&format!(
866            r#"<rect x="{}" y="0" width="2" height="{bar_h}" fill="rgb({r},{g},80)"/>"#,
867            col * 2
868        ));
869    }
870    svg.push_str(&format!(
871        "<text x=\"4\" y=\"{}\" font-size=\"10\" fill=\"#444\">trust (mid-row, {}x{})</text>",
872        bar_h + 14,
873        width,
874        height
875    ));
876    svg.push_str("</svg>");
877    svg
878}