facett-core 0.1.7

//! **`GpuSdfRenderer`** (feature `wgpu`) — the L0 GPU SDF kernel: instanced
//! screen-aligned quads whose fragment shader evaluates a signed-distance function
//! (smooth-AA circles / rings / markers), plus thick-AA instanced lines. The
//! coverage math in `sdf.wgsl` / `line.wgsl` is byte-for-byte the CPU
//! [`super::super::cpu::sdf`] math, so a GPU frame matches a CPU frame.
//!
//! Domain-agnostic: it draws [`QuadInstance`]/[`LineInstance`] batches and knows
//! nothing of maps or graphs. It reuses the Phase-A install lifecycle
//! ([`super::install_renderer`]) + bytemuck plumbing ([`super::types`]).
//!
//! Two entry points:
//! - [`GpuSdfRenderer::new`] builds the pipelines from a `&wgpu::Device` + a target
//!   [`wgpu::TextureFormat`] — drive it inside an `egui_wgpu` callback (host owns
//!   the pass) via [`GpuSdfRenderer::record`].
//! - [`offscreen_render`] is the self-contained headless path: it spins up its own
//!   adapter/device, draws to an off-screen RGBA8 texture, reads it back to a
//!   straight-RGBA8 [`Frame`] — the GPU↔CPU parity proof. Returns `None` when no
//!   wgpu adapter is available (a CPU-only / sandboxed CI box), so the caller can
//!   degrade gracefully.

use wgpu::util::DeviceExt;

use super::super::camera::Camera;
use super::super::prim::{LineInstance, QuadInstance};
use super::super::{Backend, Canvas, Frame, Renderer};

/// The viewport uniform both SDF shaders read (size in pixels). 16 bytes.
///
/// `#[repr(C)]` together with the `bytemuck` derives lets a value be uploaded
/// verbatim as raw bytes (see `bytemuck::bytes_of`).
#[repr(C)]
#[derive(Clone, Copy, Debug, bytemuck::Pod, bytemuck::Zeroable)]
pub struct SdfUniforms {
    /// Render-target size in pixels, as `[width, height]`.
    pub viewport: [f32; 2],
    /// Padding that rounds the struct up to 16 bytes; carries no data.
    pub _pad: [f32; 2],
}

/// WGSL source of the SDF quad shader, embedded at compile time from `sdf.wgsl`
/// (a path relative to this source file). Meant to mirror the CPU coverage math.
pub const SDF_WGSL: &str = include_str!("sdf.wgsl");
/// WGSL source of the thick-AA line shader, embedded at compile time from
/// `line.wgsl` (a path relative to this source file).
pub const LINE_WGSL: &str = include_str!("line.wgsl");

/// The L0 GPU SDF renderer: a quad pipeline (SDF circles/rings/markers) + a line
/// pipeline (thick AA capsules), the layout of the uniform bind group both
/// pipelines share, and the install lifecycle the skins re-point at in Phase C.
pub struct GpuSdfRenderer {
    /// Pipeline that draws instanced SDF quads.
    quad_pipeline: wgpu::RenderPipeline,
    /// Pipeline that draws instanced thick anti-aliased lines.
    line_pipeline: wgpu::RenderPipeline,
    /// Layout of bind group 0 (a single uniform buffer at binding 0); kept so bind
    /// groups compatible with both pipelines can be created later.
    bind_group_layout: wgpu::BindGroupLayout,
    /// Colour format of the render target the pipelines were built for.
    format: wgpu::TextureFormat,
}

impl GpuSdfRenderer {
    /// Build the two SDF pipelines for `format` (the host's target colour format).
    pub fn new(device: &wgpu::Device, format: wgpu::TextureFormat) -> Self {
        let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
            label: Some("l0-sdf-uniforms"),
            entries: &[wgpu::BindGroupLayoutEntry {
                binding: 0,
                visibility: wgpu::ShaderStages::VERTEX_FRAGMENT,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Uniform,
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            }],
        });
        let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
            label: Some("l0-sdf-layout"),
            bind_group_layouts: &[Some(&bind_group_layout)],
            immediate_size: 0,
        });

        let blend = Some(wgpu::BlendState {
            // Premultiplied source-over (the shaders output premultiplied colour).
            color: wgpu::BlendComponent {
                src_factor: wgpu::BlendFactor::One,
                dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
                operation: wgpu::BlendOperation::Add,
            },
            alpha: wgpu::BlendComponent {
                src_factor: wgpu::BlendFactor::One,
                dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
                operation: wgpu::BlendOperation::Add,
            },
        });
        let color_target = wgpu::ColorTargetState {
            format,
            blend,
            write_mask: wgpu::ColorWrites::ALL,
        };

        // ── Quad (SDF) pipeline ──
        let quad_sh = device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("l0-sdf-quad"),
            source: wgpu::ShaderSource::Wgsl(SDF_WGSL.into()),
        });
        // QuadInstance: center vec2(0), radius f32(8), inner f32(12), color vec4(16),
        // aa f32(32), shape u32(36), _pad vec2(40) → stride 48.
        let quad_attrs = [
            wgpu::VertexAttribute { offset: 0, shader_location: 0, format: wgpu::VertexFormat::Float32x2 },
            wgpu::VertexAttribute { offset: 8, shader_location: 1, format: wgpu::VertexFormat::Float32 },
            wgpu::VertexAttribute { offset: 12, shader_location: 2, format: wgpu::VertexFormat::Float32 },
            wgpu::VertexAttribute { offset: 16, shader_location: 3, format: wgpu::VertexFormat::Float32x4 },
            wgpu::VertexAttribute { offset: 32, shader_location: 4, format: wgpu::VertexFormat::Float32 },
            wgpu::VertexAttribute { offset: 36, shader_location: 5, format: wgpu::VertexFormat::Uint32 },
        ];
        let quad_layout = wgpu::VertexBufferLayout {
            array_stride: std::mem::size_of::<QuadInstance>() as wgpu::BufferAddress,
            step_mode: wgpu::VertexStepMode::Instance,
            attributes: &quad_attrs,
        };
        let quad_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
            label: Some("l0-sdf-quad-pipeline"),
            layout: Some(&pipeline_layout),
            vertex: wgpu::VertexState {
                module: &quad_sh,
                entry_point: Some("vs_main"),
                buffers: &[quad_layout],
                compilation_options: Default::default(),
            },
            fragment: Some(wgpu::FragmentState {
                module: &quad_sh,
                entry_point: Some("fs_main"),
                targets: &[Some(color_target.clone())],
                compilation_options: Default::default(),
            }),
            primitive: wgpu::PrimitiveState { topology: wgpu::PrimitiveTopology::TriangleList, ..Default::default() },
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview_mask: None,
            cache: None,
        });

        // ── Line pipeline ──
        let line_sh = device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("l0-sdf-line"),
            source: wgpu::ShaderSource::Wgsl(LINE_WGSL.into()),
        });
        // LineInstance: a vec2(0), b vec2(8), half_width f32(16), aa f32(20),
        // cap u32(24), _pad0 u32(28), color vec4(32) → stride 48.
        let line_attrs = [
            wgpu::VertexAttribute { offset: 0, shader_location: 0, format: wgpu::VertexFormat::Float32x2 },
            wgpu::VertexAttribute { offset: 8, shader_location: 1, format: wgpu::VertexFormat::Float32x2 },
            wgpu::VertexAttribute { offset: 16, shader_location: 2, format: wgpu::VertexFormat::Float32 },
            wgpu::VertexAttribute { offset: 20, shader_location: 3, format: wgpu::VertexFormat::Float32 },
            wgpu::VertexAttribute { offset: 24, shader_location: 4, format: wgpu::VertexFormat::Uint32 },
            wgpu::VertexAttribute { offset: 32, shader_location: 5, format: wgpu::VertexFormat::Float32x4 },
        ];
        let line_layout = wgpu::VertexBufferLayout {
            array_stride: std::mem::size_of::<LineInstance>() as wgpu::BufferAddress,
            step_mode: wgpu::VertexStepMode::Instance,
            attributes: &line_attrs,
        };
        let line_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
            label: Some("l0-sdf-line-pipeline"),
            layout: Some(&pipeline_layout),
            vertex: wgpu::VertexState {
                module: &line_sh,
                entry_point: Some("vs_main"),
                buffers: &[line_layout],
                compilation_options: Default::default(),
            },
            fragment: Some(wgpu::FragmentState {
                module: &line_sh,
                entry_point: Some("fs_main"),
                targets: &[Some(color_target)],
                compilation_options: Default::default(),
            }),
            primitive: wgpu::PrimitiveState { topology: wgpu::PrimitiveTopology::TriangleList, ..Default::default() },
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview_mask: None,
            cache: None,
        });

        Self { quad_pipeline, line_pipeline, bind_group_layout, format }
    }

    /// The colour format the pipelines were built for.
    pub fn format(&self) -> wgpu::TextureFormat {
        self.format
    }

    /// Build the uniform buffer + bind group for a `width × height` viewport.
    fn make_bind_group(&self, device: &wgpu::Device, width: u32, height: u32) -> (wgpu::Buffer, wgpu::BindGroup) {
        let u = SdfUniforms { viewport: [width as f32, height as f32], _pad: [0.0, 0.0] };
        let ubuf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
            label: Some("l0-sdf-uniform"),
            contents: bytemuck::bytes_of(&u),
            usage: wgpu::BufferUsages::UNIFORM,
        });
        let bg = device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("l0-sdf-bg"),
            layout: &self.bind_group_layout,
            entries: &[wgpu::BindGroupEntry { binding: 0, resource: ubuf.as_entire_binding() }],
        });
        (ubuf, bg)
    }

    /// Record the draw calls into an existing render pass (the host-owned-pass path,
    /// e.g. inside an `egui_wgpu` callback). `viewport` is the target size in pixels
    /// the bind group was made for. The caller supplies the quad + line instance
    /// buffers (and counts).
    #[allow(clippy::too_many_arguments)]
    pub fn record<'a>(
        &'a self,
        pass: &mut wgpu::RenderPass<'a>,
        bind_group: &'a wgpu::BindGroup,
        quad_buf: Option<(&'a wgpu::Buffer, u32)>,
        line_buf: Option<(&'a wgpu::Buffer, u32)>,
    ) {
        pass.set_bind_group(0, bind_group, &[]);
        if let Some((buf, n)) = line_buf.filter(|(_, n)| *n > 0) {
            pass.set_pipeline(&self.line_pipeline);
            pass.set_vertex_buffer(0, buf.slice(..));
            pass.draw(0..6, 0..n);
        }
        if let Some((buf, n)) = quad_buf.filter(|(_, n)| *n > 0) {
            pass.set_pipeline(&self.quad_pipeline);
            pass.set_vertex_buffer(0, buf.slice(..));
            pass.draw(0..6, 0..n);
        }
    }
}

/// Headless, self-contained render (no egui, no window): bring up a wgpu device,
/// draw `quads` + `lines` over `background` into a `width × height` off-screen
/// RGBA8 texture, and read the pixels back as a straight-RGBA8 [`Frame`]. This is
/// the GPU↔CPU parity proof.
///
/// Blocks the calling thread until the GPU work and the readback have finished.
///
/// Returns `None` when the GPU path cannot deliver a frame — typically because no
/// usable wgpu adapter exists (a CPU-only / sandboxed box) — so the caller can
/// degrade to the CPU lane instead of failing.
pub fn offscreen_render(
    width: u32,
    height: u32,
    background: [f32; 4],
    quads: &[QuadInstance],
    lines: &[LineInstance],
) -> Option<Frame> {
    let render = offscreen_render_async(width, height, background, quads, lines);
    pollster::block_on(render)
}

async fn offscreen_render_async(
    width: u32,
    height: u32,
    background: [f32; 4],
    quads: &[QuadInstance],
    lines: &[LineInstance],
) -> Option<Frame> {
    let instance = wgpu::Instance::default();
    let adapter = instance
        .request_adapter(&wgpu::RequestAdapterOptions {
            power_preference: wgpu::PowerPreference::default(),
            force_fallback_adapter: false,
            compatible_surface: None,
        })
        .await
        .ok()?;
    let (device, queue) = adapter
        .request_device(&wgpu::DeviceDescriptor {
            label: Some("l0-sdf-offscreen"),
            required_features: wgpu::Features::empty(),
            required_limits: wgpu::Limits::downlevel_defaults(),
            memory_hints: wgpu::MemoryHints::default(),
            experimental_features: wgpu::ExperimentalFeatures::disabled(),
            trace: wgpu::Trace::Off,
        })
        .await
        .ok()?;

    let format = wgpu::TextureFormat::Rgba8Unorm;
    let renderer = GpuSdfRenderer::new(&device, format);
    let (_ubuf, bind_group) = renderer.make_bind_group(&device, width, height);

    let target = device.create_texture(&wgpu::TextureDescriptor {
        label: Some("l0-sdf-target"),
        size: wgpu::Extent3d { width, height, depth_or_array_layers: 1 },
        mip_level_count: 1,
        sample_count: 1,
        dimension: wgpu::TextureDimension::D2,
        format,
        usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC,
        view_formats: &[],
    });
    let view = target.create_view(&wgpu::TextureViewDescriptor::default());

    let quad_buf = (!quads.is_empty()).then(|| {
        device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
            label: Some("l0-sdf-quads"),
            contents: bytemuck::cast_slice(quads),
            usage: wgpu::BufferUsages::VERTEX,
        })
    });
    let line_buf = (!lines.is_empty()).then(|| {
        device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
            label: Some("l0-sdf-lines"),
            contents: bytemuck::cast_slice(lines),
            usage: wgpu::BufferUsages::VERTEX,
        })
    });

    let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: Some("l0-sdf-enc") });
    {
        let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
            label: Some("l0-sdf-pass"),
            color_attachments: &[Some(wgpu::RenderPassColorAttachment {
                view: &view,
                resolve_target: None,
                depth_slice: None,
                ops: wgpu::Operations {
                    load: wgpu::LoadOp::Clear(wgpu::Color {
                        r: background[0] as f64,
                        g: background[1] as f64,
                        b: background[2] as f64,
                        a: background[3] as f64,
                    }),
                    store: wgpu::StoreOp::Store,
                },
            })],
            depth_stencil_attachment: None,
            timestamp_writes: None,
            occlusion_query_set: None,
            multiview_mask: None,
        });
        renderer.record(
            &mut pass,
            &bind_group,
            quad_buf.as_ref().map(|b| (b, quads.len() as u32)),
            line_buf.as_ref().map(|b| (b, lines.len() as u32)),
        );
    }

    // Read back: copy the texture into a padded buffer, then map + unpad.
    let bytes_per_pixel = 4u32;
    let unpadded = width * bytes_per_pixel;
    let align = wgpu::COPY_BYTES_PER_ROW_ALIGNMENT;
    let padded = unpadded.div_ceil(align) * align;
    let readback = device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("l0-sdf-readback"),
        size: (padded * height) as u64,
        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
        mapped_at_creation: false,
    });
    encoder.copy_texture_to_buffer(
        wgpu::TexelCopyTextureInfo {
            texture: &target,
            mip_level: 0,
            origin: wgpu::Origin3d::ZERO,
            aspect: wgpu::TextureAspect::All,
        },
        wgpu::TexelCopyBufferInfo {
            buffer: &readback,
            layout: wgpu::TexelCopyBufferLayout {
                offset: 0,
                bytes_per_row: Some(padded),
                rows_per_image: Some(height),
            },
        },
        wgpu::Extent3d { width, height, depth_or_array_layers: 1 },
    );
    queue.submit(Some(encoder.finish()));

    let slice = readback.slice(..);
    let (tx, rx) = std::sync::mpsc::channel();
    slice.map_async(wgpu::MapMode::Read, move |r| { let _ = tx.send(r); });
    device.poll(wgpu::PollType::wait_indefinitely()).ok()?;
    rx.recv().ok()?.ok()?;

    let data = slice.get_mapped_range();
    let mut rgba = Vec::with_capacity((width * height * 4) as usize);
    for row in 0..height {
        let start = (row * padded) as usize;
        rgba.extend_from_slice(&data[start..start + unpadded as usize]);
    }
    drop(data);
    readback.unmap();

    Some(Frame { width, height, rgba })
}

/// A standalone GPU [`Renderer`] over [`offscreen_render`] — batches pushed
/// instances, then `present` renders them headlessly and reads back a [`Frame`].
/// Always honest about availability: if no adapter exists, `present` falls back to
/// the CPU lane so the seam never panics.
pub struct GpuSdfHeadless {
    /// Target width in pixels for the current frame; 0 until the first `begin`.
    width: u32,
    /// Target height in pixels for the current frame; 0 until the first `begin`.
    height: u32,
    /// Camera of the current frame, handed out through `Canvas::camera`.
    camera: Camera,
    /// Clear colour as four `f32` components.
    // NOTE(review): presumably RGBA in 0..=1 — confirm against callers.
    background: [f32; 4],
    /// Quad instances queued for the current frame.
    quads: Vec<QuadInstance>,
    /// Line instances queued for the current frame.
    lines: Vec<LineInstance>,
}

impl GpuSdfHeadless {
    /// Create an idle headless renderer that clears to `background`.
    ///
    /// The target starts at 0 × 0 with the default camera and no queued instances.
    pub fn new(background: [f32; 4]) -> Self {
        Self {
            width: 0,
            height: 0,
            camera: Camera::default(),
            background,
            quads: Vec::new(),
            lines: Vec::new(),
        }
    }
}

impl Canvas for GpuSdfHeadless {
    /// Append `quads` to the batch queued for the current frame.
    fn push_quads(&mut self, quads: &[QuadInstance]) {
        self.quads.extend(quads.iter().copied());
    }

    /// Append `lines` to the batch queued for the current frame.
    fn push_lines(&mut self, lines: &[LineInstance]) {
        self.lines.extend(lines.iter().copied());
    }

    /// The camera of the current frame.
    fn camera(&self) -> &Camera {
        &self.camera
    }
}

impl Renderer for GpuSdfHeadless {
    /// Start a new frame: record the target size and camera, discard any instances
    /// still queued, and hand back `self` as the [`Canvas`] to push into.
    fn begin(&mut self, width: u32, height: u32, camera: Camera) -> &mut dyn Canvas {
        self.width = width;
        self.height = height;
        self.camera = camera;
        self.quads.clear();
        self.lines.clear();
        self
    }

    /// Render the queued instances and return the resulting [`Frame`].
    ///
    /// Tries the GPU path first; when [`offscreen_render`] yields `None` the same
    /// instances are rasterized on the CPU instead. Either way the queued instances
    /// are consumed: both batches are empty afterwards, with their allocations kept
    /// for the next frame.
    fn present(&mut self) -> Frame {
        let frame = match offscreen_render(self.width, self.height, self.background, &self.quads, &self.lines) {
            Some(frame) => frame,
            None => {
                // No GPU frame → honest CPU fallback through the same SDF math.
                // Round to nearest, as a float clear of an `Rgba8Unorm` target
                // normally does; plain truncation could leave the fallback
                // background one step darker than the GPU one. (`as u8` saturates,
                // so out-of-range components clamp to 0 / 255.)
                let bg = self.background.map(|c| (c * 255.0).round() as u8);
                let mut canvas = super::super::cpu::CpuCanvas::new(self.width, self.height, self.camera, bg);
                canvas.push_quads(&self.quads);
                canvas.push_lines(&self.lines);
                canvas.rasterize()
            }
        };
        self.quads.clear();
        self.lines.clear();
        frame
    }

    /// The backend tag this renderer reports.
    // NOTE(review): `GpuVello` is reported even though this is the SDF renderer and
    // even when `present` fell back to the CPU — confirm this is the intended tag.
    fn backend(&self) -> Backend {
        Backend::GpuVello
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::mem::size_of;

    /// Layout guard: the SDF uniform is a tight 16 bytes (matches `sdf.wgsl`'s
    /// `Uniforms` block) and both instance types are 48 bytes, the stride the
    /// hard-coded vertex-attribute offsets in `GpuSdfRenderer::new` assume. Also
    /// checks that the embedded shaders still contain the shape/cap constants the
    /// CPU side mirrors.
    #[test]
    fn sdf_uniform_and_instance_layouts_match_the_shaders() {
        const UNIFORM_BYTES: usize = 16;
        const INSTANCE_STRIDE: usize = 48;

        assert_eq!(size_of::<SdfUniforms>(), UNIFORM_BYTES);
        assert_eq!(size_of::<QuadInstance>(), INSTANCE_STRIDE, "quad stride matches the wgsl attrs");
        assert_eq!(size_of::<LineInstance>(), INSTANCE_STRIDE, "line stride matches the wgsl attrs");

        // The shaders carry the shape/cap constants the CPU mirrors.
        assert!(SDF_WGSL.contains("SHAPE_RING"));
        assert!(SDF_WGSL.contains("fn coverage_from_sd"));
        assert!(LINE_WGSL.contains("CAP_BUTT"));
    }
}