roxlap-render 0.11.0

Unified CPU/GPU renderer facade for the roxlap scene-graph engine — one SceneRenderer over roxlap-core opticast (softbuffer) and roxlap-gpu (wgpu), with automatic CPU fallback.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
//! roxlap-render — unified CPU/GPU renderer facade.
//!
//! One [`SceneRenderer`] hides the choice between the CPU opticast
//! path (`roxlap-core` / `roxlap-scene`, presented via `softbuffer`)
//! and the GPU compute-shader path (`roxlap-gpu`, presented via its
//! own wgpu surface). Construction picks the GPU backend when asked
//! and able, and **falls back to CPU automatically** when WGPU init
//! fails — so a host never has to branch on GPU availability or carry
//! the `Scene`→GPU upload/refresh/transform glue itself.
//!
//! Hosts stay thin: build a `Scene`, advance it from input, then call
//! [`SceneRenderer::render`] each frame. The facade owns the window
//! surface, the framebuffer/z-buffer (CPU) or the resident scene +
//! dirty-chunk tracking (GPU), and presentation.
//!
//! The per-frame flow is `render` → *(optional overlays)* → finish.
//! Between [`SceneRenderer::render`] and the finishing
//! [`SceneRenderer::present`] / [`SceneRenderer::paint_egui`] call, a
//! host may overlay depth-tested world-space lines with
//! [`SceneRenderer::draw_lines`] (editor gizmos, debug geometry — see
//! [`Line3`]); they land in the framebuffer, occluded by the rendered
//! scene, with egui still painting panels on top.
//!
//! Milestone history: RF.0 was the skeleton (backend selection +
//! fallback + a clear-to-sky frame); RF.1/RF.2 filled in the real
//! CPU/GPU scene render; RF.3 added sprites
//! ([`SceneRenderer::set_sprites`]); RF.4 added framebuffer capture
//! ([`SceneRenderer::request_capture`]).

#![forbid(unsafe_code)]

mod cpu;
/// WebGL2 framebuffer presenter for the CPU backend on wasm (the
/// browser has no `softbuffer`).
#[cfg(target_arch = "wasm32")]
mod cpu_blit;
#[cfg(feature = "hud")]
mod cpu_egui;
mod gpu;

#[cfg(not(target_arch = "wasm32"))]
use std::sync::Arc;

use roxlap_core::opticast::OpticastSettings;
use roxlap_core::sky::Sky;
use roxlap_core::sprite::SpriteLighting;
use roxlap_core::Camera;
use roxlap_scene::Scene;

pub use roxlap_formats::kfa::KfaSprite;
pub use roxlap_formats::kv6::Kv6;
pub use roxlap_formats::sprite::Sprite;
pub use roxlap_gpu::{GpuInitError, GpuRendererSettings, PowerPreference};
// Re-exported so hosts can name the [`SceneRenderer::new`] bounds
// without adding a direct `raw-window-handle` dependency of their own.
pub use raw_window_handle::{HasDisplayHandle, HasWindowHandle};
// Re-exported so hosts feed [`SceneRenderer::paint_egui`] from the exact
// egui version the renderer was built against (`hud` feature).
#[cfg(feature = "hud")]
pub use egui;

use crate::cpu::CpuBackend;
use crate::gpu::GpuBackend;

/// Type-erased display handle stored by the CPU backend's softbuffer
/// surface. `raw-window-handle` implements `HasDisplayHandle` for
/// `Arc<H>` (`H: ?Sized`), and the bare trait object implements its
/// own object-safe trait — so `Arc<W>` coerces to `Arc<DynDisplay>`
/// for any provider `W: HasDisplayHandle + Send + Sync + 'static`.
///
/// Native targets only: the alias is compiled out on `wasm32`.
#[cfg(not(target_arch = "wasm32"))]
pub(crate) type DynDisplay = dyn HasDisplayHandle + Send + Sync + 'static;
/// Type-erased window handle counterpart to [`DynDisplay`]: any
/// `W: HasWindowHandle + Send + Sync + 'static` behind an `Arc` can be
/// held as `Arc<DynWindow>`.
///
/// Native targets only: the alias is compiled out on `wasm32`.
#[cfg(not(target_arch = "wasm32"))]
pub(crate) type DynWindow = dyn HasWindowHandle + Send + Sync + 'static;

/// One placed sprite instance: which [`SpriteSet::models`] entry and
/// where in the world.
///
/// Plain data (an index plus a position), so it derives the same
/// value-type traits as the crate's other small public structs
/// (`PickHit`, `Ray`, `Line3`): hosts can copy, compare and debug-print
/// instance descriptions without writing their own impls.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct SpriteInstanceDesc {
    /// Index into [`SpriteSet::models`] of the model this instance
    /// draws.
    pub model: usize,
    /// World-space position of the instance. Per the
    /// [`SpriteSet::models`] docs this overrides the model's own
    /// position.
    pub pos: [f32; 3],
}

/// Stable handle to a registered sprite model, returned (one per
/// [`SpriteSet::models`] entry, in order) by
/// [`SceneRenderer::set_sprites`]. Pass it to
/// [`refresh_sprite_model`](SceneRenderer::refresh_sprite_model) to
/// re-register that model's geometry after a content edit — so callers
/// never track the positional `usize` index themselves. Opaque on
/// purpose: there is no arithmetic to do on it.
///
/// The wrapped value is the model's position in
/// [`SpriteSet::models`] (`set_sprites` builds the handles from
/// `0..models.len()`). The field is crate-private, so code outside
/// this crate cannot construct or offset a handle.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct SpriteModelId(pub(crate) usize);

/// Backend-agnostic sprite description. The facade builds the CPU
/// per-instance draw list and the GPU instanced registry from the
/// same data, so both backends show identical sprites. The host owns
/// content (which models, where, recolouring) — building a recoloured
/// variant is just a second [`Sprite`] model with edited `kv6.voxels`.
///
/// Passed by reference to [`SceneRenderer::set_sprites`].
pub struct SpriteSet {
    /// Distinct voxel models (KV6 + base orientation). Instances index
    /// into this; their position overrides the model's.
    pub models: Vec<Sprite>,
    /// Placed instances. Each one names a [`Self::models`] entry by
    /// index ([`SpriteInstanceDesc::model`]) and gives its world
    /// position.
    pub instances: Vec<SpriteInstanceDesc>,
    /// Model the [`SceneRenderer::carve_active_sprite`] hotkey edits
    /// (GPU only, mirroring the demo's `G`-carve). `None` disables it.
    pub carve_model: Option<usize>,
}

/// Per-frame inputs both backends consume. The host builds the
/// [`OpticastSettings`] (it owns scan distance etc.); the facade does
/// everything else (pool config, sky fill, render, present).
///
/// Passed by reference to [`SceneRenderer::render`]. Fields marked
/// "CPU" or "GPU" are only read by that backend; the host fills in all
/// of them so it never has to branch on which backend is active.
pub struct FrameParams<'a> {
    /// CPU opticast settings (scan distance, mip ladder, framebuffer
    /// geometry). Ignored by the GPU backend.
    pub settings: &'a OpticastSettings,
    /// Packed engine sky colour: the CPU sky-miss fill + skycast, and
    /// the clear colour if no scene renders.
    pub sky_color: u32,
    /// Optional sky panorama for the CPU rasterizer's sky sampling.
    pub sky: Option<&'a Sky>,
    /// CPU fog colour (packed). Used together with
    /// [`fog_max_scan_dist`](Self::fog_max_scan_dist).
    pub fog_color: u32,
    /// CPU fog max scan distance (voxels). `0` disables CPU fog.
    pub fog_max_scan_dist: i32,
    /// CPU: treat z=255 as air (avoids the S1.X bedrock path for
    /// out-of-bounds cameras).
    pub treat_z_max_as_air: bool,
    /// GPU scene-grid LOD scan distance (world units); see GPU.11.1.
    /// Ignored by the CPU backend.
    pub gpu_mip_scan_dist: f32,
    /// GPU outer-DDA step budget (chunks). Ignored by the CPU backend.
    pub gpu_max_outer_steps: u32,
    /// GPU vertical field of view (radians). Ignored by the CPU
    /// backend (it derives projection from [`OpticastSettings`]).
    pub gpu_fov_y_rad: f32,
    /// CPU sprite shading (built by the host from its engine). Required
    /// for the CPU backend to draw sprites; ignored by the GPU backend
    /// (its sprite pass shades from the uploaded model colours). `None`
    /// skips CPU sprite drawing.
    pub sprite_lighting: Option<&'a SpriteLighting<'a>>,
    /// Per-face directional shading for the voxel grids — voxlap's
    /// `setsideshades(top, bot, left, right, up, down)`, the grid-scan
    /// analogue of [`sprite_lighting`](Self::sprite_lighting). Each
    /// entry darkens the faces pointing that way; the host typically
    /// passes its engine's `side_shades()`. The default `[0; 6]` keeps
    /// `sideshademode` off (no per-side shading), so existing hosts and
    /// the oracle goldens are unaffected. Applied each frame by **both**
    /// backends: the CPU rasteriser via `gcsub`, and the GPU scene-DDA
    /// pass by darkening a hit voxel's brightness by the hit face's
    /// shade (the face taken from the DDA's last-stepped axis).
    pub side_shades: [i8; 6],
}

/// Result of [`SceneRenderer::pick`] — a resolved screen→world voxel
/// hit. `world` is the surface point (`cam.pos + t · normalize(ray)`);
/// `grid` + `voxel` are the owning grid and its **grid-local** voxel
/// (transform-correct for rotated / translated grids).
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct PickHit {
    /// World-space surface point. `pick` computes it in `f64` and
    /// narrows it to `f32` for this field.
    pub world: [f32; 3],
    /// The grid that owns the hit voxel.
    pub grid: roxlap_scene::GridId,
    /// The hit voxel in `grid`'s local coordinates.
    pub voxel: glam::IVec3,
}

/// A world-space view ray: the canonical unproject output of
/// [`SceneRenderer::view_ray`]. `dir` is unit-length. Feed it straight
/// to [`roxlap_scene::Scene::raycast`] for depth-free, backend-agnostic
/// voxel picking (`scene.raycast(ray.origin, ray.dir, max_dist)`), or
/// intersect it with a plane for tile selection.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Ray {
    /// Ray start point; `view_ray` sets it to the `pos` of the camera
    /// it was given.
    pub origin: glam::DVec3,
    /// Unit-length direction (the per-pixel ray divided by its length).
    pub dir: glam::DVec3,
}

/// A world-space line segment to draw over a rendered frame via
/// [`SceneRenderer::draw_lines`] — editor gizmos (bounding boxes, floor
/// grids, axes, hover wireframes), debug paths, etc.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Line3 {
    /// First world-space endpoint (voxel units), in the same frame the
    /// rendered scene + `camera` use.
    pub a: [f64; 3],
    /// Second world-space endpoint; same units and frame as `a`.
    pub b: [f64; 3],
    /// `0xAARRGGBB` — the high byte is an alpha blend factor (`0xFF`
    /// opaque, `0x00` invisible), the low 24 bits the RGB colour.
    pub color: u32,
    /// Screen-space thickness in pixels (`<= 1.0` draws a 1px line).
    pub width_px: f32,
    /// `true`: the segment is occluded by nearer rendered geometry
    /// (depth-tested against the frame's z-buffer). `false`: always on
    /// top (e.g. a hover highlight that should show through the model).
    pub depth_test: bool,
}

/// Which renderer a [`SceneRenderer`] resolved to at construction.
/// Reported by [`SceneRenderer::backend`].
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum Backend {
    /// `roxlap-core` opticast, presented via `softbuffer`.
    Cpu,
    /// `roxlap-gpu` compute marcher, presented via wgpu.
    Gpu,
}

/// Construction-time options for [`SceneRenderer::new`].
///
/// The `Default` impl is CPU-intent (`want_gpu: false`); set `want_gpu`
/// to opt into the GPU attempt.
pub struct RenderOptions {
    /// Try the GPU backend first. When `false`, or when GPU init
    /// fails, the renderer uses the CPU backend.
    pub want_gpu: bool,
    /// Settings forwarded to [`roxlap_gpu::GpuRenderer`] when the GPU
    /// backend is selected.
    pub gpu: GpuRendererSettings,
    /// Packed `0x00RRGGBB` (alpha ignored) the empty/clear frame fills
    /// with until a scene render lands. Also the CPU sky-miss colour
    /// default if a frame supplies none.
    pub clear_sky: u32,
    /// CPU [`ScratchPool`](roxlap_core::rasterizer::ScratchPool) `lastx`
    /// sizing — the largest combined grid `vsid` the CPU rasterizer
    /// will see. Pre-sizing keeps later frames allocation-free.
    pub cpu_max_grid_vsid: u32,
    /// CPU strip-parallel render thread count (capped to the rayon
    /// pool). One [`ScratchPool`](roxlap_core::rasterizer::ScratchPool)
    /// slot per thread.
    pub cpu_render_threads: usize,
}

impl Default for RenderOptions {
    fn default() -> Self {
        Self {
            want_gpu: false,
            gpu: GpuRendererSettings::default(),
            clear_sky: 0x0099_b3d9,
            // 32 chunks × CHUNK_SIZE_XY — the scene-demo's widest
            // combined ground grid.
            cpu_max_grid_vsid: 32 * roxlap_scene::CHUNK_SIZE_XY,
            cpu_render_threads: 4,
        }
    }
}

/// Renderer-internal backend; never exposes wgpu or softbuffer types.
/// Each variant holds its backend behind a `Box`, so the enum stays
/// small whichever backend's state is larger (the GPU variant, for
/// one, owns the whole wgpu device/queue/pipelines).
enum BackendImpl {
    // Both variants boxed so the enum stays small regardless of which
    // backend's state is larger (clippy::large_enum_variant).
    Cpu(Box<CpuBackend>),
    Gpu(Box<GpuBackend>),
}

/// Unified renderer over the CPU and GPU paths. See the crate docs.
///
/// Built with `SceneRenderer::new` on native targets or
/// `SceneRenderer::new_from_canvas_async` on `wasm32`. Every public
/// method dispatches on the backend chosen there.
pub struct SceneRenderer {
    /// The backend selected at construction (CPU or GPU).
    inner: BackendImpl,
}

impl SceneRenderer {
    /// Create a renderer bound to `window`, an `Arc`-wrapped
    /// [`raw-window-handle`] provider (winit, SDL, GLFW, …).
    ///
    /// `size` is the window's initial physical framebuffer size in
    /// pixels. It is passed in rather than queried so the facade stays
    /// independent of any windowing library's size API; report later
    /// changes through [`Self::resize`].
    ///
    /// Backend choice: the GPU backend is attempted only when
    /// `opts.want_gpu` is set. If that attempt errors, the message is
    /// printed to stderr and the CPU backend is built instead, so this
    /// constructor **never fails**.
    ///
    /// [`raw-window-handle`]: raw_window_handle
    #[cfg(not(target_arch = "wasm32"))]
    #[must_use]
    pub fn new<W>(window: Arc<W>, size: (u32, u32), opts: &RenderOptions) -> Self
    where
        W: HasWindowHandle + HasDisplayHandle + Send + Sync + 'static,
    {
        // `Some` only when the GPU was requested *and* initialised.
        let gpu_backend = if opts.want_gpu {
            match GpuBackend::new(Arc::clone(&window), size, opts) {
                Ok(backend) => Some(backend),
                Err(e) => {
                    eprintln!(
                        "roxlap-render: GPU init failed ({e}); falling back to the CPU renderer",
                    );
                    None
                }
            }
        } else {
            None
        };

        let inner = match gpu_backend {
            Some(backend) => BackendImpl::Gpu(Box::new(backend)),
            None => BackendImpl::Cpu(Box::new(CpuBackend::new(window, size, opts))),
        };
        Self { inner }
    }

    /// wasm entry point: create a renderer that draws into an HTML
    /// `canvas`. `size` is the canvas's initial framebuffer size in
    /// pixels; report later changes through [`Self::resize`].
    ///
    /// The function is `async` because the browser drives wgpu's
    /// adapter/device requests through its event loop — `await` it
    /// inside a `wasm_bindgen_futures::spawn_local` task.
    ///
    /// Backend choice: the GPU (WebGPU) backend is attempted only when
    /// `opts.want_gpu` is set. If WebGPU is unavailable or its init
    /// errors, a warning goes to the browser console and the CPU
    /// opticast path is used instead, presented through a WebGL2 blit
    /// on the same canvas. **Never fails.**
    #[cfg(target_arch = "wasm32")]
    pub async fn new_from_canvas_async(
        canvas: web_sys::HtmlCanvasElement,
        size: (u32, u32),
        opts: &RenderOptions,
    ) -> Self {
        // `Some` only when the GPU was requested *and* initialised.
        let gpu_backend = if opts.want_gpu {
            // `SurfaceTarget::Canvas` moves the canvas into wgpu, so the
            // GPU attempt works on a clone and the original stays
            // available for the CPU fallback.
            match GpuBackend::new_async(canvas.clone(), size, opts).await {
                Ok(backend) => Some(backend),
                Err(e) => {
                    let warning =
                        format!("roxlap-render: WebGPU init failed ({e}); using the CPU renderer");
                    web_sys::console::warn_1(&warning.into());
                    None
                }
            }
        } else {
            None
        };

        let inner = match gpu_backend {
            Some(backend) => BackendImpl::Gpu(Box::new(backend)),
            None => BackendImpl::Cpu(Box::new(CpuBackend::new_from_canvas(canvas, size, opts))),
        };
        Self { inner }
    }

    /// Report which backend this renderer resolved to at construction.
    #[must_use]
    pub fn backend(&self) -> Backend {
        match &self.inner {
            BackendImpl::Gpu(_) => Backend::Gpu,
            BackendImpl::Cpu(_) => Backend::Cpu,
        }
    }

    /// Description of the GPU adapter in use. `None` on the CPU
    /// backend, which has no adapter.
    #[must_use]
    pub fn adapter_info(&self) -> Option<&str> {
        if let BackendImpl::Gpu(gpu) = &self.inner {
            Some(gpu.adapter_info())
        } else {
            None
        }
    }

    /// Hand the GPU marcher an equirectangular sky panorama (`rgba` is
    /// RGBA8, `w×h`) to sample. Does nothing on the CPU backend, which
    /// samples the [`Sky`] carried by each [`FrameParams`] instead.
    pub fn set_sky_panorama(&mut self, rgba: &[u8], w: u32, h: u32) {
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => {
                gpu.set_sky_panorama(rgba, w, h);
            }
            // CPU sky comes from `FrameParams::sky` per frame.
            BackendImpl::Cpu(_) => {}
        }
    }

    /// Tell the renderer the window is now `width × height`. The CPU
    /// backend resizes its framebuffer lazily, so the call mainly
    /// matters to the GPU swapchain — but it is forwarded to whichever
    /// backend is active and is safe on both.
    pub fn resize(&mut self, width: u32, height: u32) {
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => gpu.resize(width, height),
            BackendImpl::Cpu(cpu) => cpu.resize(width, height),
        }
    }

    /// Render `scene` as seen from `camera`, using `frame`'s
    /// parameters, into the backend's frame buffer — **without
    /// presenting** it.
    ///
    /// * CPU: fills the sky and runs the opticast compositor into an
    ///   owned buffer.
    /// * GPU: uploads/refreshes the scene, runs the compute marcher +
    ///   sprite pass, and acquires (but does not present) the swapchain
    ///   frame.
    ///
    /// Finish the frame with exactly one of [`present`](Self::present)
    /// (no overlay) or [`paint_egui`](Self::paint_egui) (UI overlay).
    /// A second `render` before finishing drops the pending frame.
    pub fn render(&mut self, scene: &mut Scene, camera: &Camera, frame: &FrameParams) {
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => gpu.render(scene, camera, frame),
            BackendImpl::Cpu(cpu) => cpu.render(scene, camera, frame),
        }
    }

    /// Overlay world-space [`Line3`] segments on the frame the last
    /// [`render`](Self::render) composited, reusing that frame's
    /// camera, projection and depth buffer.
    ///
    /// Ordering: call **after** [`render`](Self::render) and **before**
    /// [`present`](Self::present) / [`paint_egui`](Self::paint_egui).
    /// The lines are written into the framebuffer, so a later
    /// `paint_egui` still paints its panels over them.
    ///
    /// `camera` must be the camera the last frame was rendered with
    /// (the projection comes from that frame). Segments with
    /// `Line3::depth_test` set are hidden behind nearer rendered
    /// geometry; the rest are always on top. [`Line3`] documents the
    /// colour / width / blend semantics.
    pub fn draw_lines(&mut self, camera: &Camera, lines: &[Line3]) {
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => gpu.draw_lines(camera, lines),
            BackendImpl::Cpu(cpu) => cpu.draw_lines(camera, lines),
        }
    }

    /// Finish the frame: present what [`render`](Self::render)
    /// composited, without any UI overlay. To draw an egui UI over the
    /// frame first, finish with [`paint_egui`](Self::paint_egui)
    /// instead.
    pub fn present(&mut self) {
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => gpu.present(),
            BackendImpl::Cpu(cpu) => cpu.present(),
        }
    }

    /// Finish the frame with a UI: paint an egui overlay on what
    /// [`render`](Self::render) composited, then present (`hud`
    /// feature).
    ///
    /// The host runs egui itself (e.g. `egui` + `egui-winit`) and
    /// supplies:
    /// * `jobs` — the tessellated output of
    ///   [`egui::Context::tessellate`];
    /// * `textures` — the per-frame texture delta from
    ///   [`egui::FullOutput`];
    /// * `pixels_per_point` — the UI scale (`ctx.pixels_per_point()`).
    ///
    /// The GPU backend paints through `egui-wgpu`; the CPU backend
    /// software-rasterises the tessellation into its framebuffer. Call
    /// this **instead of** [`present`](Self::present) — either one
    /// finishes the frame.
    #[cfg(feature = "hud")]
    pub fn paint_egui(
        &mut self,
        jobs: &[egui::ClippedPrimitive],
        textures: &egui::TexturesDelta,
        pixels_per_point: f32,
    ) {
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => gpu.paint_egui(jobs, textures, pixels_per_point),
            BackendImpl::Cpu(cpu) => cpu.paint_egui(jobs, textures, pixels_per_point),
        }
    }

    /// Register the sprite models + instances in `set`. The CPU backend
    /// turns them into a per-instance draw list, the GPU backend into
    /// an instanced model registry. Call once at setup, or again to
    /// replace the whole set.
    ///
    /// Returns one [`SpriteModelId`] per `set.models` entry, in order.
    pub fn set_sprites(&mut self, set: &SpriteSet) -> Vec<SpriteModelId> {
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => gpu.set_sprites(set),
            BackendImpl::Cpu(cpu) => cpu.set_sprites(set),
        }
        // Handles are positional by construction (model index = chain id
        // on both backends), so the facade mints them itself and callers
        // hold on to the handle rather than re-deriving the index.
        let model_count = set.models.len();
        let mut handles = Vec::with_capacity(model_count);
        handles.extend((0..model_count).map(SpriteModelId));
        handles
    }

    /// Bring one registered sprite model up to date after its content
    /// changed (a carve or recolour of its `kv6`).
    ///
    /// * `model` — the [`SpriteModelId`] that
    ///   [`set_sprites`](Self::set_sprites) returned for it.
    /// * `kv6` — the model's **new** geometry. The caller owns the
    ///   source of truth (e.g. a dense carve grid the surface-only
    ///   `kv6` can't represent) and passes the refreshed mesh in.
    ///
    /// This is a **backend-agnostic content refresh**, not a GPU
    /// upload: each backend updates its stored model in its own way.
    /// The instance set is not touched (an edit never moves or adds an
    /// instance). On the GPU backend only this model's voxel data is
    /// re-uploaded — through a slack-backed suballocator, one model's
    /// bytes rather than the whole registry. The CPU backend swaps the
    /// cached `kv6` into each instance of the model. To add/remove
    /// models or change instances, call
    /// [`set_sprites`](Self::set_sprites) again.
    pub fn refresh_sprite_model(&mut self, model: SpriteModelId, kv6: &Kv6) {
        let model_index = model.0;
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => gpu.update_sprite_model(model_index, kv6),
            BackendImpl::Cpu(cpu) => cpu.update_sprite_model(model_index, kv6),
        }
    }

    /// Register animated KFA sprites (one or more bone hierarchies).
    ///
    /// On the GPU backend each limb's kv6 is uploaded **once** as an
    /// instanced model (appended to the sprite registry) and the limb
    /// instances are seeded at their current pose. The CPU backend
    /// caches the posed limbs for drawing. Call once at setup, after
    /// [`set_sprites`](Self::set_sprites); per-frame motion then goes
    /// through [`update_kfa_poses`](Self::update_kfa_poses).
    ///
    /// Posing reads each sprite's current
    /// [`kfaval`](roxlap_formats::kfa::KfaSprite::kfaval) — advance
    /// [`animsprite`](roxlap_formats::kfa::KfaSprite::animsprite) first
    /// when using a baked curve — which is why `kfas` is `&mut`.
    pub fn set_kfa_sprites(&mut self, kfas: &mut [KfaSprite]) {
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => gpu.set_kfa_sprites(kfas),
            BackendImpl::Cpu(cpu) => cpu.set_kfa_sprites(kfas),
        }
    }

    /// Update the registered KFA sprites' poses from their current
    /// `kfaval[]`. Call once per frame, after advancing the animation
    /// (`kfa.animsprite(dt_ms)` or writing `kfaval[]` directly).
    ///
    /// The GPU backend does the cheap transform-only update (no
    /// model-volume re-upload); the CPU backend re-solves the limb
    /// transforms for the next [`render`](Self::render). Only valid
    /// after [`set_kfa_sprites`](Self::set_kfa_sprites) was called with
    /// the same sprites.
    pub fn update_kfa_poses(&mut self, kfas: &mut [KfaSprite]) {
        match &mut self.inner {
            BackendImpl::Gpu(gpu) => gpu.update_kfa_poses(kfas),
            BackendImpl::Cpu(cpu) => cpu.update_kfa_poses(kfas),
        }
    }

    /// Remove the next z-layer from the [`SpriteSet::carve_model`] and
    /// re-upload it (the demo's `G` hotkey + GPU.12 copy-on-modify).
    /// GPU only. Returns the number of voxels removed; on the CPU
    /// backend nothing happens and the result is `0`.
    pub fn carve_active_sprite(&mut self) -> u32 {
        if let BackendImpl::Gpu(gpu) = &mut self.inner {
            gpu.carve_active_sprite()
        } else {
            0
        }
    }

    /// Ask the next [`render`](Self::render) to capture its framebuffer
    /// so [`take_capture`](Self::take_capture) can return it. CPU only:
    /// the GPU swapchain isn't read back, so this does nothing on GPU.
    pub fn request_capture(&mut self) {
        match &mut self.inner {
            BackendImpl::Cpu(cpu) => {
                cpu.request_capture();
            }
            // No GPU readback path; the request is ignored.
            BackendImpl::Gpu(_) => {}
        }
    }

    /// Hand over the most recently captured frame as
    /// `(pixels, width, height)`, with pixels packed `0x00RRGGBB`.
    /// `None` when no capture is ready, and always `None` on the GPU
    /// backend.
    pub fn take_capture(&mut self) -> Option<(Vec<u32>, u32, u32)> {
        if let BackendImpl::Cpu(cpu) = &mut self.inner {
            cpu.take_capture()
        } else {
            None
        }
    }

    /// Low-level picking input: the world-space hit distance `t` under
    /// window pixel `(x, y)` in the **last rendered frame**. `None` for
    /// out-of-bounds pixels and for sky / no-hit.
    ///
    /// The world hit point is `cam.pos + t * normalize(ray_dir)`, where
    /// `ray_dir` is the per-pixel ray that frame was rendered with (see
    /// the backend's projection).
    ///
    /// `t` measures the nearest **scene-grid** surface (terrain +
    /// grids). Sprites do not occlude it — the sprite pass only reads
    /// depth — so a cursor sprite under the pointer is transparent to
    /// the pick.
    ///
    /// Cost: free on the CPU backend (it reads its in-memory z-buffer).
    /// The GPU backend stages the depth buffer and blocks on a device
    /// poll — fine at click time, not something to call every frame.
    /// The GPU path only has depth when the last frame drew sprites
    /// (`write_depth`).
    #[must_use]
    pub fn pick_depth(&self, x: u32, y: u32) -> Option<f32> {
        match &self.inner {
            BackendImpl::Gpu(gpu) => gpu.pick_depth(x, y),
            BackendImpl::Cpu(cpu) => cpu.pick_depth(x, y),
        }
    }

    /// The world-space view-ray direction through window pixel
    /// `(x, y)`, **not normalised**, under the projection the **last
    /// frame** was rendered with. `None` before the first frame.
    ///
    /// The two backends project differently (CPU `setcamera` vs GPU
    /// vertical-FOV pinhole); this call hides which one is active.
    /// Intersect the ray with a plane for tile picking, or use
    /// [`Self::pick`] to resolve a voxel.
    #[must_use]
    pub fn pixel_ray(&self, camera: &Camera, x: f64, y: f64) -> Option<[f64; 3]> {
        match &self.inner {
            BackendImpl::Gpu(gpu) => gpu.pixel_ray(camera, x, y),
            BackendImpl::Cpu(cpu) => cpu.pixel_ray(camera, x, y),
        }
    }

    /// Canonical screen→world unproject: the full view [`Ray`]
    /// (`camera.pos` origin + unit direction) for window pixel
    /// `(x, y)`, under whichever projection the last frame used. The
    /// one entry point both backends honour — hosts never reconstruct
    /// the projection.
    ///
    /// Returns `None` before the first frame (when
    /// [`pixel_ray`](Self::pixel_ray) has nothing to offer) or for a
    /// degenerate ray: one whose length is below `1e-12`, or is NaN or
    /// infinite. A returned ray therefore always has a finite,
    /// unit-length `dir`.
    ///
    /// Compose with [`roxlap_scene::Scene::raycast`] for depth-free
    /// picking that's identical on CPU and GPU:
    /// `renderer.view_ray(cam, x, y).and_then(|r| scene.raycast(r.origin, r.dir, max))`.
    #[must_use]
    pub fn view_ray(&self, camera: &Camera, x: f64, y: f64) -> Option<Ray> {
        let d = self.pixel_ray(camera, x, y)?;
        let len = (d[0] * d[0] + d[1] * d[1] + d[2] * d[2]).sqrt();
        // `len < eps` on its own lets NaN through (every comparison with
        // NaN is false) and accepts an infinite length, either of which
        // would produce a `dir` that is not unit-length. Treat both as
        // degenerate.
        if !len.is_finite() || len < 1e-12 {
            return None;
        }
        Some(Ray {
            origin: glam::DVec3::from_array([camera.pos[0], camera.pos[1], camera.pos[2]]),
            dir: glam::DVec3::new(d[0] / len, d[1] / len, d[2] / len),
        })
    }

    /// One-call screen→world voxel pick: unproject pixel `(x, y)` with
    /// the active backend's projection, read the last frame's depth
    /// there, reconstruct the world hit, and resolve it to the owning
    /// grid + grid-local voxel via [`Scene::resolve_voxel`].
    ///
    /// Returns `None` before the first frame, for a degenerate pixel
    /// ray (length below `1e-9`, NaN or infinite), on sky / no-hit, or
    /// when no grid claims the surface.
    ///
    /// `scene` and `camera` must be the ones the last frame rendered;
    /// the projection (size + FOV / `hx,hy,hz`) is taken from that
    /// frame. Cheap on CPU (in-memory z-buffer); on GPU it stages the
    /// depth buffer (a click-time device poll — not per frame).
    #[must_use]
    pub fn pick(&self, scene: &Scene, camera: &Camera, x: u32, y: u32) -> Option<PickHit> {
        let dir = self.pixel_ray(camera, f64::from(x), f64::from(y))?;
        let len = (dir[0] * dir[0] + dir[1] * dir[1] + dir[2] * dir[2]).sqrt();
        // Reject an unusable ray *before* reading depth: on the GPU
        // backend `pick_depth` blocks on a device poll, so there is no
        // point paying for it when the answer is already `None`. The
        // finiteness test also catches NaN, which `len < eps` alone
        // would let through.
        if !len.is_finite() || len < 1e-9 {
            return None;
        }
        let t = f64::from(self.pick_depth(x, y)?);
        let s = t / len; // world = cam.pos + t · (dir / |dir|)
        let world = glam::DVec3::new(
            camera.pos[0] + dir[0] * s,
            camera.pos[1] + dir[1] * s,
            camera.pos[2] + dir[2] * s,
        );
        let (grid, voxel) = scene.resolve_voxel(world, glam::DVec3::from_array(dir))?;
        // `PickHit::world` is `f32`; narrowing the `f64` hit is intended.
        #[allow(clippy::cast_possible_truncation)]
        let world_f32 = [world.x as f32, world.y as f32, world.z as f32];
        Some(PickHit {
            world: world_f32,
            grid,
            voxel,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Default options must not request the GPU, and the default clear
    /// colour must leave the top (alpha) byte zero.
    #[test]
    fn options_default_is_cpu_intent() {
        let defaults = RenderOptions::default();
        let alpha_bits = defaults.clear_sky & 0xFF00_0000;
        assert!(!defaults.want_gpu);
        assert_eq!(alpha_bits, 0, "clear_sky is 0x00RRGGBB");
    }
}