facett-core 0.1.10

//! **GPU compute particles** (feature `wgpu`) — a storage-buffer boids system
//! stepped in a compute shader and drawn as soft **additive** points into an HDR
//! target. Net-new (the only prior compute shader in the tree is the 2D map cull);
//! it composes with the shared HDR offscreen colour by drawing additively over it.
//!
//! Reynolds' three flocking rules (separation / alignment / cohesion) run O(n²) in
//! `particles.wgsl`'s `cs_step` over a ping-pong pair of storage buffers (read
//! `src`, write `dst` — no read/write hazard). The sim is a **pure function** of the
//! initial state + dt + step count, so it is deterministic (FC-7) and the CPU
//! mirror ([`step_cpu`]) is the parity reference the GPU readback proof checks
//! against.
//!
//! The render pass ([`GpuParticles::render`]) expands a unit quad per particle with
//! a soft radial falloff and premultiplied output, so an additive blend accumulates
//! glow — overlapping particles get brighter, the classic light-field look.

use wgpu::util::DeviceExt;
use wgpu::TextureFormat;

/// One particle: 2D position (normalised `[0,1)` toroidal space) + velocity.
/// 16 bytes, matches `Particle` in `particles.wgsl`.
#[repr(C)]
#[derive(Clone, Copy, Debug, Default, PartialEq, bytemuck::Pod, bytemuck::Zeroable)]
pub struct Particle {
    pub pos: [f32; 2],
    pub vel: [f32; 2],
}

/// The boids tuning. [`Default`] is a calm, cohesive flock.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct SimParams {
    pub max_speed: f32,
    pub sep_radius: f32,
    pub align_radius: f32,
    pub cohesion_radius: f32,
    pub sep_weight: f32,
    pub align_weight: f32,
    pub cohesion_weight: f32,
}

impl Default for SimParams {
    fn default() -> Self {
        Self {
            max_speed: 0.4,
            sep_radius: 0.04,
            align_radius: 0.10,
            cohesion_radius: 0.10,
            sep_weight: 1.5,
            align_weight: 1.0,
            cohesion_weight: 0.8,
        }
    }
}

/// How the additive points are drawn.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DrawParams {
    /// Point sprite size in physical pixels.
    pub point_size: f32,
    /// Glow intensity multiplier (can exceed 1.0 into the HDR target).
    pub intensity: f32,
    /// Base (cool/slow) tint, linear RGB.
    pub tint: [f32; 3],
    /// How strongly speed warms the colour (0 = constant tint).
    pub speed_warm: f32,
}

impl Default for DrawParams {
    fn default() -> Self {
        Self { point_size: 6.0, intensity: 1.0, tint: [0.25, 0.55, 1.0], speed_warm: 2.0 }
    }
}

/// The CPU reference step — advance `particles` by `dt` under `p`, **mirroring**
/// `cs_step` in `particles.wgsl` (same loop order, same math). Pure + deterministic
/// (FC-7): the GPU readback proof asserts the device produces the same result.
pub fn step_cpu(particles: &[Particle], dt: f32, p: &SimParams) -> Vec<Particle> {
    let n = particles.len();
    let mut out = Vec::with_capacity(n);
    for i in 0..n {
        let me = particles[i];
        let mut sep = [0.0f32; 2];
        let mut align_sum = [0.0f32; 2];
        let mut coh_sum = [0.0f32; 2];
        let mut align_n = 0.0f32;
        let mut coh_n = 0.0f32;
        for (j, o) in particles.iter().enumerate() {
            if j == i {
                continue;
            }
            let d = [o.pos[0] - me.pos[0], o.pos[1] - me.pos[1]];
            let dist = (d[0] * d[0] + d[1] * d[1]).sqrt();
            if dist > 0.0 && dist < p.sep_radius {
                let inv = 1.0 / (dist * dist);
                sep[0] -= d[0] * inv;
                sep[1] -= d[1] * inv;
            }
            if dist < p.align_radius {
                align_sum[0] += o.vel[0];
                align_sum[1] += o.vel[1];
                align_n += 1.0;
            }
            if dist < p.cohesion_radius {
                coh_sum[0] += o.pos[0];
                coh_sum[1] += o.pos[1];
                coh_n += 1.0;
            }
        }
        let mut acc = [sep[0] * p.sep_weight, sep[1] * p.sep_weight];
        if align_n > 0.0 {
            acc[0] += (align_sum[0] / align_n - me.vel[0]) * p.align_weight;
            acc[1] += (align_sum[1] / align_n - me.vel[1]) * p.align_weight;
        }
        if coh_n > 0.0 {
            acc[0] += (coh_sum[0] / coh_n - me.pos[0]) * p.cohesion_weight;
            acc[1] += (coh_sum[1] / coh_n - me.pos[1]) * p.cohesion_weight;
        }
        let mut vel = [me.vel[0] + acc[0] * dt, me.vel[1] + acc[1] * dt];
        let sp = (vel[0] * vel[0] + vel[1] * vel[1]).sqrt();
        if sp > p.max_speed {
            let k = p.max_speed / sp;
            vel[0] *= k;
            vel[1] *= k;
        }
        let mut pos = [me.pos[0] + vel[0] * dt, me.pos[1] + vel[1] * dt];
        pos[0] -= pos[0].floor();
        pos[1] -= pos[1].floor();
        out.push(Particle { pos, vel });
    }
    out
}

#[repr(C)]
#[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
struct SimUniforms {
    a: [f32; 4],
    b: [f32; 4],
    c: [f32; 4],
}

#[repr(C)]
#[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
struct DrawUniforms {
    a: [f32; 4],
    tint: [f32; 4],
}

/// The compute shader (`cs_step`) + the additive point render (`pt_vs`/`pt_fs`).
pub const PARTICLES_WGSL: &str = include_str!("particles.wgsl");

/// A ping-pong pair of particle storage buffers + the live count. After each
/// [`GpuParticles::step`] the up-to-date particles are in `a`.
pub struct ParticleBuffers {
    a: wgpu::Buffer,
    b: wgpu::Buffer,
    count: u32,
}

impl ParticleBuffers {
    /// The live particle count.
    #[must_use]
    pub fn count(&self) -> u32 {
        self.count
    }
}

/// The compute-particles renderer: one compute pipeline (the boids step) + one
/// additive render pipeline (the point sprites), built for an HDR `target_format`.
pub struct GpuParticles {
    step_pipeline: wgpu::ComputePipeline,
    step_bgl: wgpu::BindGroupLayout,
    draw_pipeline: wgpu::RenderPipeline,
    draw_bgl: wgpu::BindGroupLayout,
    sim_u: wgpu::Buffer,
    draw_u: wgpu::Buffer,
    target_format: TextureFormat,
}

impl GpuParticles {
    /// Build the compute + additive-render pipelines for `target_format` (e.g. the
    /// HDR `Rgba16Float` offscreen). Validates `particles.wgsl` on build. Requires a
    /// device created with compute + storage support.
    pub fn new(device: &wgpu::Device, target_format: TextureFormat) -> Self {
        let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("l0_particles"),
            source: wgpu::ShaderSource::Wgsl(PARTICLES_WGSL.into()),
        });

        // ── compute step bind group: uniform + src(read) + dst(read_write) ──
        let step_bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
            label: Some("l0_particles_step_bgl"),
            entries: &[
                wgpu::BindGroupLayoutEntry {
                    binding: 0,
                    visibility: wgpu::ShaderStages::COMPUTE,
                    ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Uniform, has_dynamic_offset: false, min_binding_size: None },
                    count: None,
                },
                wgpu::BindGroupLayoutEntry {
                    binding: 1,
                    visibility: wgpu::ShaderStages::COMPUTE,
                    ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Storage { read_only: true }, has_dynamic_offset: false, min_binding_size: None },
                    count: None,
                },
                wgpu::BindGroupLayoutEntry {
                    binding: 2,
                    visibility: wgpu::ShaderStages::COMPUTE,
                    ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Storage { read_only: false }, has_dynamic_offset: false, min_binding_size: None },
                    count: None,
                },
            ],
        });
        let step_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
            label: Some("l0_particles_step"),
            layout: Some(&device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
                label: Some("l0_particles_step_pll"),
                bind_group_layouts: &[Some(&step_bgl)],
                immediate_size: 0,
            })),
            module: &shader,
            entry_point: Some("cs_step"),
            compilation_options: Default::default(),
            cache: None,
        });

        // ── render bind group: uniform + points(read storage, vertex stage) ──
        let draw_bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
            label: Some("l0_particles_draw_bgl"),
            entries: &[
                wgpu::BindGroupLayoutEntry {
                    binding: 0,
                    visibility: wgpu::ShaderStages::VERTEX_FRAGMENT,
                    ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Uniform, has_dynamic_offset: false, min_binding_size: None },
                    count: None,
                },
                wgpu::BindGroupLayoutEntry {
                    binding: 1,
                    visibility: wgpu::ShaderStages::VERTEX,
                    ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Storage { read_only: true }, has_dynamic_offset: false, min_binding_size: None },
                    count: None,
                },
            ],
        });
        // Additive: premultiplied source added onto the target (glow accumulates).
        let blend = Some(wgpu::BlendState {
            color: wgpu::BlendComponent { src_factor: wgpu::BlendFactor::One, dst_factor: wgpu::BlendFactor::One, operation: wgpu::BlendOperation::Add },
            alpha: wgpu::BlendComponent { src_factor: wgpu::BlendFactor::One, dst_factor: wgpu::BlendFactor::One, operation: wgpu::BlendOperation::Add },
        });
        let draw_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
            label: Some("l0_particles_draw"),
            layout: Some(&device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
                label: Some("l0_particles_draw_pll"),
                bind_group_layouts: &[Some(&draw_bgl)],
                immediate_size: 0,
            })),
            vertex: wgpu::VertexState { module: &shader, entry_point: Some("pt_vs"), compilation_options: Default::default(), buffers: &[] },
            primitive: wgpu::PrimitiveState { topology: wgpu::PrimitiveTopology::TriangleList, ..Default::default() },
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: Some("pt_fs"),
                compilation_options: Default::default(),
                targets: &[Some(wgpu::ColorTargetState { format: target_format, blend, write_mask: wgpu::ColorWrites::ALL })],
            }),
            multiview_mask: None,
            cache: None,
        });

        let mkbuf = |label: &str| device.create_buffer(&wgpu::BufferDescriptor {
            label: Some(label),
            size: std::mem::size_of::<SimUniforms>().max(std::mem::size_of::<DrawUniforms>()) as u64,
            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        Self {
            step_pipeline,
            step_bgl,
            draw_pipeline,
            draw_bgl,
            sim_u: mkbuf("l0_particles_sim_u"),
            draw_u: mkbuf("l0_particles_draw_u"),
            target_format,
        }
    }

    /// The colour format the render pipeline targets.
    #[must_use]
    pub fn target_format(&self) -> TextureFormat {
        self.target_format
    }

    /// Allocate the ping-pong storage buffers seeded with `init`.
    pub fn make_buffers(&self, device: &wgpu::Device, init: &[Particle]) -> ParticleBuffers {
        let usage = wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::COPY_DST;
        let a = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
            label: Some("l0_particles_a"),
            contents: bytemuck::cast_slice(init),
            usage,
        });
        let b = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("l0_particles_b"),
            size: (std::mem::size_of::<Particle>() * init.len().max(1)) as u64,
            usage,
            mapped_at_creation: false,
        });
        ParticleBuffers { a, b, count: init.len() as u32 }
    }

    /// Record one boids step: dispatch `cs_step` reading `bufs.a`, writing `bufs.b`,
    /// then swap so the result lands back in `bufs.a`.
    pub fn step(
        &self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        encoder: &mut wgpu::CommandEncoder,
        bufs: &mut ParticleBuffers,
        dt: f32,
        p: &SimParams,
    ) {
        let u = SimUniforms {
            a: [dt, bufs.count as f32, p.max_speed, p.sep_radius],
            b: [p.align_radius, p.cohesion_radius, p.sep_weight, p.align_weight],
            c: [p.cohesion_weight, 0.0, 0.0, 0.0],
        };
        queue.write_buffer(&self.sim_u, 0, bytemuck::bytes_of(&u));
        let bind = device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("l0_particles_step_bind"),
            layout: &self.step_bgl,
            entries: &[
                wgpu::BindGroupEntry { binding: 0, resource: self.sim_u.as_entire_binding() },
                wgpu::BindGroupEntry { binding: 1, resource: bufs.a.as_entire_binding() },
                wgpu::BindGroupEntry { binding: 2, resource: bufs.b.as_entire_binding() },
            ],
        });
        {
            let mut cp = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { label: Some("l0_particles_step_pass"), timestamp_writes: None });
            cp.set_pipeline(&self.step_pipeline);
            cp.set_bind_group(0, &bind, &[]);
            cp.dispatch_workgroups(bufs.count.div_ceil(64), 1, 1);
        }
        std::mem::swap(&mut bufs.a, &mut bufs.b);
    }

    /// Record the additive point render of `bufs.a` into `target` (must be
    /// `target_format`). `load` keeps the existing target contents (compose over the
    /// HDR scene); `false` clears to transparent first.
    #[allow(clippy::too_many_arguments)]
    pub fn render(
        &self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        encoder: &mut wgpu::CommandEncoder,
        bufs: &ParticleBuffers,
        target: &wgpu::TextureView,
        d: &DrawParams,
        load: bool,
        w: u32,
        h: u32,
    ) {
        let u = DrawUniforms {
            a: [d.point_size, d.intensity, w.max(1) as f32, h.max(1) as f32],
            tint: [d.tint[0], d.tint[1], d.tint[2], d.speed_warm],
        };
        queue.write_buffer(&self.draw_u, 0, bytemuck::bytes_of(&u));
        let bind = device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("l0_particles_draw_bind"),
            layout: &self.draw_bgl,
            entries: &[
                wgpu::BindGroupEntry { binding: 0, resource: self.draw_u.as_entire_binding() },
                wgpu::BindGroupEntry { binding: 1, resource: bufs.a.as_entire_binding() },
            ],
        });
        let load_op = if load { wgpu::LoadOp::Load } else { wgpu::LoadOp::Clear(wgpu::Color::TRANSPARENT) };
        let mut rp = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
            label: Some("l0_particles_draw_pass"),
            color_attachments: &[Some(wgpu::RenderPassColorAttachment {
                view: target,
                resolve_target: None,
                depth_slice: None,
                ops: wgpu::Operations { load: load_op, store: wgpu::StoreOp::Store },
            })],
            depth_stencil_attachment: None,
            timestamp_writes: None,
            occlusion_query_set: None,
            multiview_mask: None,
        });
        rp.set_pipeline(&self.draw_pipeline);
        rp.set_bind_group(0, &bind, &[]);
        rp.draw(0..6, 0..bufs.count);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn ring(n: usize) -> Vec<Particle> {
        // A deterministic seeded flock: positions on a small circle, tangential vel.
        (0..n)
            .map(|i| {
                let t = i as f32 / n as f32 * std::f32::consts::TAU;
                Particle { pos: [0.5 + 0.08 * t.cos(), 0.5 + 0.08 * t.sin()], vel: [-0.1 * t.sin(), 0.1 * t.cos()] }
            })
            .collect()
    }

    /// INJECT-ASSERT (FC-7): the CPU step is a pure, deterministic function —
    /// stepping the same state twice gives **bit-identical** results.
    #[test]
    fn cpu_step_is_deterministic() {
        let p = SimParams::default();
        let init = ring(32);
        let a = step_cpu(&init, 0.016, &p);
        let b = step_cpu(&init, 0.016, &p);
        assert_eq!(a.len(), init.len());
        for (x, y) in a.iter().zip(&b) {
            assert_eq!(x.pos[0].to_bits(), y.pos[0].to_bits(), "deterministic pos.x");
            assert_eq!(x.vel[1].to_bits(), y.vel[1].to_bits(), "deterministic vel.y");
        }
    }

    /// INJECT-ASSERT: positions stay inside the `[0,1)` toroidal box (the wrap
    /// works), and the speed clamp holds — no particle exceeds `max_speed`.
    #[test]
    fn step_keeps_bounds_and_clamps_speed() {
        let p = SimParams::default();
        let mut cur = ring(48);
        for _ in 0..60 {
            cur = step_cpu(&cur, 0.02, &p);
        }
        for q in &cur {
            assert!((0.0..1.0).contains(&q.pos[0]) && (0.0..1.0).contains(&q.pos[1]), "in box: {:?}", q.pos);
            let sp = (q.vel[0] * q.vel[0] + q.vel[1] * q.vel[1]).sqrt();
            assert!(sp <= p.max_speed + 1e-4, "speed clamped: {sp}");
        }
    }

    /// INJECT-ASSERT: separation actually pushes two near-coincident particles
    /// apart — the distance grows after a step (the rule does something real).
    #[test]
    fn separation_pushes_neighbours_apart() {
        let p = SimParams { align_weight: 0.0, cohesion_weight: 0.0, ..SimParams::default() };
        let init = vec![
            Particle { pos: [0.50, 0.5], vel: [0.0, 0.0] },
            Particle { pos: [0.51, 0.5], vel: [0.0, 0.0] },
        ];
        let d0 = (init[1].pos[0] - init[0].pos[0]).abs();
        let next = step_cpu(&init, 0.05, &p);
        let d1 = (next[1].pos[0] - next[0].pos[0]).abs();
        assert!(d1 > d0, "separation increased the gap ({d0} → {d1})");
    }

    /// Headless device with the adapter's real limits (so compute + storage are
    /// available), guarded on compute support. `None` → self-skip.
    fn compute_device() -> Option<(wgpu::Device, wgpu::Queue)> {
        let instance = wgpu::Instance::default();
        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
            power_preference: wgpu::PowerPreference::default(),
            force_fallback_adapter: false,
            compatible_surface: None,
        }))
        .ok()?;
        if !adapter.get_downlevel_capabilities().flags.contains(wgpu::DownlevelFlags::COMPUTE_SHADERS) {
            return None;
        }
        if adapter.limits().max_storage_buffers_per_shader_stage < 2 {
            return None;
        }
        pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
            label: Some("l0-particles-proof"),
            required_features: wgpu::Features::empty(),
            required_limits: adapter.limits(),
            memory_hints: wgpu::MemoryHints::default(),
            experimental_features: wgpu::ExperimentalFeatures::disabled(),
            trace: wgpu::Trace::Off,
        }))
        .ok()
    }

    fn read_particles(device: &wgpu::Device, queue: &wgpu::Queue, buf: &wgpu::Buffer, n: u32) -> Vec<Particle> {
        let size = (std::mem::size_of::<Particle>() as u32 * n) as u64;
        let readback = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("l0-particles-readback"),
            size,
            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
            mapped_at_creation: false,
        });
        let mut enc = device.create_command_encoder(&Default::default());
        enc.copy_buffer_to_buffer(buf, 0, &readback, 0, size);
        queue.submit(Some(enc.finish()));
        let slice = readback.slice(..);
        let (tx, rx) = std::sync::mpsc::channel();
        slice.map_async(wgpu::MapMode::Read, move |r| { let _ = tx.send(r); });
        device.poll(wgpu::PollType::wait_indefinitely()).ok();
        rx.recv().unwrap().unwrap();
        let data = slice.get_mapped_range();
        let out: Vec<Particle> = bytemuck::cast_slice(&data).to_vec();
        drop(data);
        readback.unmap();
        out
    }

    /// RENDER/COMPUTE PARITY PROOF (self-skips without a compute device): one GPU
    /// boids step matches the CPU reference within float tolerance — proving the
    /// WGSL `cs_step` and the CPU `step_cpu` are the same function (FC-7 parity).
    #[test]
    fn gpu_step_matches_cpu_reference() {
        let Some((device, queue)) = compute_device() else {
            eprintln!("[particles] no compute device — skipping GPU parity proof");
            return;
        };
        let p = SimParams::default();
        let init = ring(40);
        let dt = 0.016f32;

        let gp = GpuParticles::new(&device, TextureFormat::Rgba16Float);
        let mut bufs = gp.make_buffers(&device, &init);
        let mut enc = device.create_command_encoder(&Default::default());
        gp.step(&device, &queue, &mut enc, &mut bufs, dt, &p);
        queue.submit(Some(enc.finish()));
        device.poll(wgpu::PollType::wait_indefinitely()).ok();

        let gpu = read_particles(&device, &queue, &bufs.a, bufs.count);
        let cpu = step_cpu(&init, dt, &p);
        assert_eq!(gpu.len(), cpu.len());
        let mut moved = false;
        for (g, c) in gpu.iter().zip(&cpu) {
            assert!((g.pos[0] - c.pos[0]).abs() < 1e-3, "pos.x parity: gpu {} vs cpu {}", g.pos[0], c.pos[0]);
            assert!((g.pos[1] - c.pos[1]).abs() < 1e-3, "pos.y parity: gpu {} vs cpu {}", g.pos[1], c.pos[1]);
            assert!((g.vel[0] - c.vel[0]).abs() < 1e-3, "vel.x parity");
            if (g.pos[0] - 0.5).abs() > 1e-4 {
                moved = true;
            }
        }
        assert!(moved, "the step actually advanced particles");
    }

    /// RENDER PROOF (self-skips without a compute device): additive points
    /// accumulate — a pixel under TWO stacked particles is brighter than one under a
    /// single particle, and the frame is non-blank. Renders into `Rgba8Unorm` for an
    /// easy readback; production uses the same pipeline against `Rgba16Float`.
    #[test]
    fn additive_render_accumulates_overlap() {
        let Some((device, queue)) = compute_device() else {
            eprintln!("[particles] no compute device — skipping additive render proof");
            return;
        };
        let (w, h) = (64u32, 64u32);
        // Left: a single particle. Right: two particles at the same spot (overlap).
        let parts = vec![
            Particle { pos: [0.25, 0.5], vel: [0.0, 0.0] },
            Particle { pos: [0.75, 0.5], vel: [0.0, 0.0] },
            Particle { pos: [0.75, 0.5], vel: [0.0, 0.0] },
        ];
        let gp = GpuParticles::new(&device, TextureFormat::Rgba8Unorm);
        let bufs = gp.make_buffers(&device, &parts);

        let target = device.create_texture(&wgpu::TextureDescriptor {
            label: Some("l0-particles-target"),
            size: wgpu::Extent3d { width: w, height: h, depth_or_array_layers: 1 },
            mip_level_count: 1,
            sample_count: 1,
            dimension: wgpu::TextureDimension::D2,
            format: TextureFormat::Rgba8Unorm,
            usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC,
            view_formats: &[],
        });
        let view = target.create_view(&Default::default());
        let draw = DrawParams { point_size: 10.0, intensity: 0.45, tint: [0.4, 0.6, 1.0], speed_warm: 0.0 };
        let mut enc = device.create_command_encoder(&Default::default());
        gp.render(&device, &queue, &mut enc, &bufs, &view, &draw, false, w, h);

        // read back
        let bpp = 4u32;
        let unpadded = w * bpp;
        let align = wgpu::COPY_BYTES_PER_ROW_ALIGNMENT;
        let padded = unpadded.div_ceil(align) * align;
        let readback = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("l0-particles-px-readback"),
            size: (padded * h) as u64,
            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
            mapped_at_creation: false,
        });
        enc.copy_texture_to_buffer(
            wgpu::TexelCopyTextureInfo { texture: &target, mip_level: 0, origin: wgpu::Origin3d::ZERO, aspect: wgpu::TextureAspect::All },
            wgpu::TexelCopyBufferInfo { buffer: &readback, layout: wgpu::TexelCopyBufferLayout { offset: 0, bytes_per_row: Some(padded), rows_per_image: Some(h) } },
            wgpu::Extent3d { width: w, height: h, depth_or_array_layers: 1 },
        );
        queue.submit(Some(enc.finish()));
        let slice = readback.slice(..);
        let (tx, rx) = std::sync::mpsc::channel();
        slice.map_async(wgpu::MapMode::Read, move |r| { let _ = tx.send(r); });
        device.poll(wgpu::PollType::wait_indefinitely()).ok();
        rx.recv().unwrap().unwrap();
        let data = slice.get_mapped_range();
        let mut rgba = Vec::with_capacity((w * h * 4) as usize);
        for row in 0..h {
            let s = (row * padded) as usize;
            rgba.extend_from_slice(&data[s..s + unpadded as usize]);
        }
        drop(data);
        readback.unmap();

        let at = |x: u32, y: u32| -> u32 {
            let i = ((y * w + x) * 4) as usize;
            rgba[i] as u32 + rgba[i + 1] as u32 + rgba[i + 2] as u32
        };
        let lit = rgba.chunks_exact(4).filter(|p| p[0] as u32 + p[1] as u32 + p[2] as u32 > 0).count();
        assert!(lit > 0, "particles drew something ({lit} lit px)");
        let single = at(w / 4, h / 2); // under one particle
        let overlap = at(3 * w / 4, h / 2); // under two stacked particles
        assert!(single > 0, "single particle lit its centre ({single})");
        assert!(overlap > single + 10, "two stacked particles add brighter ({overlap}) than one ({single})");
    }

    /// INJECT-ASSERT: the WGSL exposes the entry points the pipelines name + the
    /// uniform blocks are the size the Rust structs declare.
    #[test]
    fn shader_entry_points_and_uniform_sizes() {
        assert!(PARTICLES_WGSL.contains("fn cs_step"));
        assert!(PARTICLES_WGSL.contains("fn pt_vs"));
        assert!(PARTICLES_WGSL.contains("fn pt_fs"));
        assert!(PARTICLES_WGSL.contains("@workgroup_size(64)"));
        assert_eq!(std::mem::size_of::<Particle>(), 16);
        assert_eq!(std::mem::size_of::<SimUniforms>(), 48);
        assert_eq!(std::mem::size_of::<DrawUniforms>(), 32);
    }
}