awsm-renderer 0.4.1

awsm-renderer
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
//! Runtime cluster-LOD (Phase B): the loaded cluster DAG + the LOD-cut
//! selection. The CPU [`select_cut`] here is the **reference spec** for the GPU
//! compute pass (B.2) — the same per-cluster rule runs on-device against the
//! uploaded cluster pages. Inert unless the `virtual_geometry` feature loads a
//! cluster mesh.
//!
//! **The cut.** Each cluster carries `[lod_error, parent_error)`. The DAG build
//! sets a child's `parent_error` equal to the `lod_error` of the coarser
//! clusters its group simplifies into, so these half-open intervals **tile**
//! `[0, ∞)` along every path through the DAG. Selecting
//! `{ c : lod_error <= t < parent_error }` therefore picks **exactly one**
//! cluster per surface region — a watertight cover — and the locked group
//! boundaries make the seam between adjacent detail levels crack-free.

use glam::{Mat4, Vec3};

/// One cluster's runtime page: bounds, LOD errors, and its index slice.
///
/// A page is plain `Copy` data. [`select_cut`] reads only the two error
/// fields; [`select_cut_per_cluster`] additionally reads the two `*_bounds_center`
/// fields; [`write_cluster_page_gpu`] serialises every field. Positions are in
/// object space (the per-cluster cut transforms the bounds centres by the
/// instance's world matrix before measuring distance).
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ClusterPage {
    /// Bounding-sphere centre (object space).
    pub center: [f32; 3],
    /// Bounding-sphere radius (object space).
    pub radius: f32,
    /// Error introduced creating this cluster (`0` at the finest level).
    /// Inclusive lower bound of the cluster's selection interval.
    pub lod_error: f32,
    /// Error of the group that simplifies this cluster away (root sentinel for
    /// roots). Exclusive upper bound of the cluster's selection interval.
    pub parent_error: f32,
    /// Centre of the group sphere to project `lod_error` against. Group-shared,
    /// so all clusters of a group flip at the same camera threshold ⇒ crack-free.
    pub lod_bounds_center: [f32; 3],
    /// Radius of the group sphere paired with `lod_bounds_center`. Serialised
    /// for the GPU; the CPU cut functions in this file do not read it.
    pub lod_bounds_radius: f32,
    /// Centre of the group sphere to project `parent_error` against.
    pub parent_bounds_center: [f32; 3],
    /// Radius of the group sphere paired with `parent_bounds_center`. Serialised
    /// for the GPU; the CPU cut functions in this file do not read it.
    pub parent_bounds_radius: f32,
    /// First index of this cluster's triangles in the shared index buffer.
    pub first_index: u32,
    /// Index count (triangle count × 3).
    pub index_count: u32,
}

/// Collect the LOD cut for a single **uniform** object-space error `threshold`.
///
/// A cluster is part of the cut exactly when `threshold` falls inside its
/// half-open interval `[lod_error, parent_error)`. Because those intervals tile
/// along every DAG path (see the module docs), the result is a watertight cover.
///
/// `out` is cleared and then filled with the selected cluster ids (indices into
/// `pages`, ascending), so callers can keep one buffer alive across frames and
/// avoid per-frame allocation. A NaN `threshold` selects nothing.
pub fn select_cut(pages: &[ClusterPage], threshold: f32, out: &mut Vec<u32>) {
    out.clear();
    let selected = pages
        .iter()
        .enumerate()
        // `Range::contains` is `start <= x && x < end`: the half-open interval test.
        .filter(|(_, page)| (page.lod_error..page.parent_error).contains(&threshold))
        .map(|(index, _)| index as u32);
    out.extend(selected);
}

/// **Per-cluster** LOD cut — the GPU cut's CPU reference.
///
/// A cluster is selected when its own projected error fits `pixel_budget` while
/// its parent's projected error exceeds it:
/// `proj(lod_error) <= pixel_budget < proj(parent_error)`.
/// `lod_error` is projected at the world-space position of `lod_bounds_center`
/// and `parent_error` at that of `parent_bounds_center` (both group-shared, so
/// adjacent clusters of a group flip together ⇒ crack-free). Because each
/// cluster uses *its own* distance, detail varies WITHIN one mesh: near clusters
/// stay fine while far clusters coarsen.
///
/// **Roots.** A `parent_error` of [`f32::INFINITY`] is the root sentinel
/// ("never simplified") and always satisfies the upper bound. It is tested
/// explicitly rather than projected: projecting it yields NaN when the instance
/// scale or `viewport_h` is zero (`inf * 0`), and `inf < inf` is false for an
/// infinite budget — either of which would otherwise drop the root and leave the
/// cut empty.
///
/// # Parameters
/// * `pages` – cluster pages; selected ids are indices into this slice.
/// * `instance_world` – object→world transform of the instance.
/// * `camera_pos` – camera position in world space.
/// * `tan_half_fov_y` – tangent of half the vertical field of view.
/// * `viewport_h` – viewport height in pixels.
/// * `pixel_budget` – maximum tolerated projected error, in pixels.
/// * `out` – cleared, then filled with the selected ids in ascending order;
///   reuse it across frames to avoid per-frame allocation.
pub fn select_cut_per_cluster(
    pages: &[ClusterPage],
    instance_world: &Mat4,
    camera_pos: Vec3,
    tan_half_fov_y: f32,
    viewport_h: f32,
    pixel_budget: f32,
    out: &mut Vec<u32>,
) {
    out.clear();
    // One conservative scale for the whole instance (loop-invariant).
    let scale = max_axis_scale(instance_world);
    // Project an object-space error at the world position of a group sphere centre.
    let project = |error: f32, bounds_center: [f32; 3]| {
        cluster_projected_error(
            error,
            instance_world.transform_point3(Vec3::from(bounds_center)),
            camera_pos,
            tan_half_fov_y,
            viewport_h,
            scale,
        )
    };
    for (i, p) in pages.iter().enumerate() {
        let fits_budget = project(p.lod_error, p.lod_bounds_center) <= pixel_budget;
        // The root sentinel bypasses projection so it can never turn into NaN
        // or lose an `inf < inf` comparison.
        let parent_exceeds_budget = p.parent_error == f32::INFINITY
            || pixel_budget < project(p.parent_error, p.parent_bounds_center);
        if fits_budget && parent_exceeds_budget {
            out.push(i as u32);
        }
    }
}

/// Object-space error a whole instance may show while staying within
/// `pixel_budget` on screen, assuming uniform detail across the instance.
///
/// The budget is back-projected at the distance between `camera_pos` and the
/// instance's world-space origin. Feeding the result to [`select_cut`] gives a
/// watertight per-instance LOD that coarsens with distance (the simple cut; the
/// per-cluster variant refines this using group-consistent bounds).
///
/// Returns `0.0` (finest detail) when the scale/viewport term or
/// `tan_half_fov_y` is not above `1e-9`.
pub fn instance_error_threshold(
    instance_world: &Mat4,
    camera_pos: Vec3,
    tan_half_fov_y: f32,
    viewport_h: f32,
    pixel_budget: f32,
) -> f32 {
    let origin_world = instance_world.transform_point3(Vec3::ZERO);
    let distance = (origin_world - camera_pos).length();
    // Forward projection: pixels = error * scale * (viewport_h / 2) / (distance * tan).
    // Solving for the error at a given pixel budget:
    //   error = budget * distance * tan / (scale * viewport_h / 2)
    let pixels_per_unit = max_axis_scale(instance_world) * (viewport_h * 0.5);
    if pixels_per_unit <= 1e-9 || tan_half_fov_y <= 1e-9 {
        // Degenerate scale, viewport or FOV: fall back to the finest level.
        0.0
    } else {
        pixel_budget * distance * tan_half_fov_y / pixels_per_unit
    }
}

/// Screen-space size, in pixels, of an object-space `error` seen at
/// `world_center` from `camera_pos`.
///
/// This is the per-cluster projection the GPU pass uses, kept here as the
/// reference: `error * world_scale * (viewport_h / 2) / (distance * tan_half_fov_y)`.
/// Returns `+∞` when the distance or `tan_half_fov_y` is not above `1e-6`.
pub fn cluster_projected_error(
    error: f32,
    world_center: Vec3,
    camera_pos: Vec3,
    tan_half_fov_y: f32,
    viewport_h: f32,
    world_scale: f32,
) -> f32 {
    let to_cluster = world_center - camera_pos;
    let distance = to_cluster.length();
    let degenerate = distance <= 1e-6 || tan_half_fov_y <= 1e-6;
    if degenerate {
        f32::INFINITY
    } else {
        error * world_scale * (viewport_h * 0.5) / (distance * tan_half_fov_y)
    }
}

/// Largest per-axis scale factor of an object→world transform: the maximum
/// length of the xyz parts of the matrix's X, Y and Z basis columns.
pub fn max_axis_scale(m: &Mat4) -> f32 {
    let scale_x = m.x_axis.truncate().length();
    let scale_y = m.y_axis.truncate().length();
    let scale_z = m.z_axis.truncate().length();
    scale_x.max(scale_y).max(scale_z)
}

/// Size in bytes of one [`ClusterPage`] inside the storage buffer read by the
/// B.2 compute pass: 64 B, i.e. four 16-byte rows.
///
/// The layout mirrors the WGSL std430 struct below. Every `vec3<f32>` is
/// 16-byte aligned and the `f32` that follows it occupies the remaining four
/// bytes of that slot, so there is no padding anywhere:
/// ```wgsl
/// struct ClusterPage {            //  byte offset
///     center: vec3<f32>,          //   0  (radius fills 12..16)
///     radius: f32,                //  12
///     lod_bounds_center: vec3<f32>,    // 16  (lod_bounds_radius fills 28..32)
///     lod_bounds_radius: f32,     //  28
///     parent_bounds_center: vec3<f32>, // 32  (parent_bounds_radius fills 44..48)
///     parent_bounds_radius: f32,  //  44
///     lod_error: f32,             //  48
///     parent_error: f32,          //  52
///     first_index: u32,           //  56
///     index_count: u32,           //  60
/// }                               //  size 64, align 16
/// ```
/// For root clusters `parent_error` holds the runtime sentinel
/// [`f32::INFINITY`]; the shader reads `+inf` as "never simplified", so a root
/// always satisfies the upper bound. The per-page predicate evaluated by the
/// shader is exactly [`select_cut_per_cluster`].
pub const CLUSTER_PAGE_GPU_STRIDE: usize = 4 * 16;

/// The B.2 cluster-cut compute shader source (one `@workgroup_size(64)` thread
/// per page; evaluates [`select_cut_per_cluster`] on-device). Embedded so the
/// path is checked at build time; the pipeline loads this string.
///
/// `include_str!` resolves the path relative to this source file and fails the
/// build if the file is missing. The unit tests in this module additionally
/// check that the text contains `@compute`, an `fn cs_main` entry point, and
/// the page field names.
pub const CLUSTER_CUT_WGSL: &str =
    include_str!("render_passes/cluster_lod/shader/cluster_lod_wgsl/cluster_cut.wgsl");

/// Size in bytes of the cluster-cut params uniform (`ClusterCutParams` in the
/// shader): 96 B.
///
/// Layout: a 64-byte column-major `mat4x4`, followed by two 16-byte rows —
/// `camera_pos` (vec3) with `tan_half_fov_y` completing its slot, then
/// `viewport_h`, `pixel_budget`, `world_scale` and `cluster_count`.
pub const CLUSTER_CUT_PARAMS_SIZE: usize = 64 + 2 * 16;

/// Append the cluster-cut params uniform to `out` as little-endian bytes.
///
/// Field order: the 16 floats of `instance_world` in column-major order (WGSL
/// `mat4x4` storage order = glam `to_cols_array`), then `camera_pos.{x,y,z}`,
/// `tan_half_fov_y`, `viewport_h`, `pixel_budget`, `world_scale` and finally
/// `cluster_count` — 24 four-byte values, 96 bytes in total.
///
/// `out` is **not** cleared; the bytes are appended after any existing content.
#[allow(clippy::too_many_arguments)]
pub fn write_cluster_cut_params(
    instance_world: &Mat4,
    camera_pos: Vec3,
    tan_half_fov_y: f32,
    viewport_h: f32,
    pixel_budget: f32,
    world_scale: f32,
    cluster_count: u32,
    out: &mut Vec<u8>,
) {
    let matrix = instance_world.to_cols_array();
    // Everything after the matrix that is stored as `f32`, in layout order.
    let scalars = [
        camera_pos.x,
        camera_pos.y,
        camera_pos.z,
        tan_half_fov_y,
        viewport_h,
        pixel_budget,
        world_scale,
    ];
    for value in matrix.iter().chain(scalars.iter()) {
        out.extend_from_slice(&value.to_le_bytes());
    }
    out.extend_from_slice(&cluster_count.to_le_bytes());
}

/// Append one cluster page to `out` in the 64-byte little-endian GPU layout.
///
/// The field order mirrors the WGSL struct documented on
/// [`CLUSTER_PAGE_GPU_STRIDE`]: the three centre/radius pairs, then the two
/// errors, then the index range. `out` is not cleared, so the per-frame
/// uploader can pack many pages into one reused `Vec<u8>` (no per-frame
/// allocation).
pub fn write_cluster_page_gpu(p: &ClusterPage, out: &mut Vec<u8>) {
    let [cx, cy, cz] = p.center;
    let [lx, ly, lz] = p.lod_bounds_center;
    let [px, py, pz] = p.parent_bounds_center;
    // Float fields in GPU order: byte offsets 0..56.
    let floats = [
        cx,
        cy,
        cz,
        p.radius,
        lx,
        ly,
        lz,
        p.lod_bounds_radius,
        px,
        py,
        pz,
        p.parent_bounds_radius,
        p.lod_error,
        p.parent_error,
    ];
    for value in floats {
        out.extend_from_slice(&value.to_le_bytes());
    }
    // Unsigned fields: byte offsets 56..64.
    for value in [p.first_index, p.index_count] {
        out.extend_from_slice(&value.to_le_bytes());
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Unit-sphere page at `x = cx` with the given error interval and triangle
    /// count; all three sphere centres coincide.
    fn page_at(cx: f32, lod_error: f32, parent_error: f32, tris: u32) -> ClusterPage {
        ClusterPage {
            center: [cx, 0.0, 0.0],
            radius: 1.0,
            lod_error,
            parent_error,
            lod_bounds_center: [cx, 0.0, 0.0],
            lod_bounds_radius: 1.0,
            parent_bounds_center: [cx, 0.0, 0.0],
            parent_bounds_radius: 1.0,
            first_index: 0,
            index_count: tris * 3,
        }
    }

    /// A 3-level synthetic DAG: 4 finest clusters → 2 mid → 1 root. Each
    /// child's `parent_error` equals its parent's `lod_error`, so the intervals
    /// tile.
    fn synthetic() -> Vec<ClusterPage> {
        let mut pages = vec![page_at(0.0, 0.0, 1.0, 10); 4]; // level 0 ×4
        pages.extend([page_at(0.0, 1.0, 2.0, 12); 2]); // level 1 ×2
        pages.push(page_at(0.0, 2.0, f32::INFINITY, 8)); // root
        pages
    }

    /// Total triangle count of the clusters listed in `ids`.
    fn triangles_in(pages: &[ClusterPage], ids: &[u32]) -> u32 {
        ids.iter().map(|&id| pages[id as usize].index_count / 3).sum()
    }

    /// Triangle count of the uniform cut at threshold `t`.
    fn cut_tris(pages: &[ClusterPage], t: f32) -> u32 {
        let mut ids = Vec::new();
        select_cut(pages, t, &mut ids);
        triangles_in(pages, &ids)
    }

    /// Little-endian `f32` stored at byte offset `off`.
    fn f32_at(bytes: &[u8], off: usize) -> f32 {
        f32::from_le_bytes(bytes[off..off + 4].try_into().unwrap())
    }

    /// Little-endian `u32` stored at byte offset `off`.
    fn u32_at(bytes: &[u8], off: usize) -> u32 {
        u32::from_le_bytes(bytes[off..off + 4].try_into().unwrap())
    }

    #[test]
    fn finest_cut_at_zero() {
        let pages = synthetic();
        let mut cut = Vec::new();
        select_cut(&pages, 0.0, &mut cut);
        assert_eq!(cut, vec![0, 1, 2, 3], "threshold 0 picks the finest level");
        assert_eq!(cut_tris(&pages, 0.0), 40);
    }

    #[test]
    fn mid_and_root_cuts() {
        let pages = synthetic();
        let mut cut = Vec::new();
        select_cut(&pages, 1.5, &mut cut);
        assert_eq!(cut, vec![4, 5], "1<=1.5<2 picks the mid level");
        select_cut(&pages, 5.0, &mut cut);
        assert_eq!(cut, vec![6], "above all finite errors picks the root");
    }

    #[test]
    fn triangle_count_is_monotone_non_increasing() {
        let pages = synthetic();
        let thresholds = [0.0f32, 0.5, 1.0, 1.5, 2.0, 3.0, 100.0];
        let mut previous = u32::MAX;
        for count in thresholds.iter().map(|&t| cut_tris(&pages, t)) {
            assert!(count > 0, "the cut always covers the surface");
            assert!(
                count <= previous,
                "coarser threshold must not increase triangles"
            );
            previous = count;
        }
    }

    #[test]
    fn every_cluster_is_selected_at_its_lower_bound() {
        // Each cluster appears in the cut taken at exactly its own lod_error.
        let pages = synthetic();
        let mut cut = Vec::new();
        for (i, page) in pages.iter().enumerate() {
            select_cut(&pages, page.lod_error, &mut cut);
            assert!(
                cut.contains(&(i as u32)),
                "cluster {i} missing at its lod_error"
            );
        }
    }

    #[test]
    fn instance_threshold_coarsens_with_distance() {
        let threshold_at = |camera_z: f32| {
            instance_error_threshold(
                &Mat4::IDENTITY,
                Vec3::new(0.0, 0.0, camera_z),
                0.5,
                1080.0,
                1.0,
            )
        };
        let near = threshold_at(2.0);
        let far = threshold_at(50.0);
        assert!(
            far > near,
            "a farther instance tolerates a larger object error"
        );
        // One buffer is reused for both cuts (no fresh vec per call).
        let pages = synthetic();
        let mut cut = Vec::new();
        select_cut(&pages, near, &mut cut);
        let near_tris = triangles_in(&pages, &cut);
        select_cut(&pages, far, &mut cut);
        let far_tris = triangles_in(&pages, &cut);
        assert!(far_tris <= near_tris);
    }

    #[test]
    fn cut_shader_embeds_and_has_entry_point() {
        assert!(CLUSTER_CUT_WGSL.contains("@compute"));
        assert!(CLUSTER_CUT_WGSL.contains("fn cs_main"));
        // The shader struct must declare the same fields the page layout writes.
        const REQUIRED_FIELDS: [&str; 6] = [
            "lod_bounds_center",
            "parent_bounds_center",
            "lod_error",
            "parent_error",
            "first_index",
            "index_count",
        ];
        for field in REQUIRED_FIELDS {
            assert!(CLUSTER_CUT_WGSL.contains(field), "shader missing `{field}`");
        }
    }

    #[test]
    fn cut_params_layout() {
        let mut bytes = Vec::new();
        write_cluster_cut_params(
            &Mat4::from_scale(Vec3::splat(2.0)),
            Vec3::new(1.0, 2.0, 3.0),
            0.5,
            1080.0,
            1.5,
            2.0,
            7,
            &mut bytes,
        );
        assert_eq!(bytes.len(), CLUSTER_CUT_PARAMS_SIZE, "params are 96 B");
        // mat4 occupies 0..64 (column-major; scale 2 ⇒ diagonal entries 2.0).
        assert_eq!(f32_at(&bytes, 0), 2.0);
        assert_eq!(f32_at(&bytes, 20), 2.0);
        assert_eq!(f32_at(&bytes, 40), 2.0);
        // camera_pos at 64, tan at 76, then the scalars.
        assert_eq!(
            [f32_at(&bytes, 64), f32_at(&bytes, 68), f32_at(&bytes, 72)],
            [1.0, 2.0, 3.0]
        );
        assert_eq!(f32_at(&bytes, 76), 0.5); // tan_half_fov_y
        assert_eq!(f32_at(&bytes, 80), 1080.0); // viewport_h
        assert_eq!(f32_at(&bytes, 84), 1.5); // pixel_budget
        assert_eq!(f32_at(&bytes, 88), 2.0); // world_scale
        assert_eq!(u32_at(&bytes, 92), 7); // cluster_count
    }

    #[test]
    fn gpu_page_layout_offsets_match_std430() {
        let page = ClusterPage {
            center: [1.0, 2.0, 3.0],
            radius: 4.0,
            lod_error: 13.0,
            parent_error: 14.0,
            lod_bounds_center: [5.0, 6.0, 7.0],
            lod_bounds_radius: 8.0,
            parent_bounds_center: [9.0, 10.0, 11.0],
            parent_bounds_radius: 12.0,
            first_index: 15,
            index_count: 16,
        };
        let mut bytes = Vec::new();
        write_cluster_page_gpu(&page, &mut bytes);
        assert_eq!(bytes.len(), CLUSTER_PAGE_GPU_STRIDE, "page is 64 B");
        let vec3_at = |off: usize| {
            [
                f32_at(&bytes, off),
                f32_at(&bytes, off + 4),
                f32_at(&bytes, off + 8),
            ]
        };
        // Field at each documented std430 offset.
        assert_eq!(vec3_at(0), [1.0, 2.0, 3.0]); // center
        assert_eq!(f32_at(&bytes, 12), 4.0); // radius
        assert_eq!(vec3_at(16), [5.0, 6.0, 7.0]); // lod_bounds_center
        assert_eq!(f32_at(&bytes, 28), 8.0); // lod_bounds_radius
        assert_eq!(vec3_at(32), [9.0, 10.0, 11.0]); // parent_bounds_center
        assert_eq!(f32_at(&bytes, 44), 12.0); // parent_bounds_radius
        assert_eq!(f32_at(&bytes, 48), 13.0); // lod_error
        assert_eq!(f32_at(&bytes, 52), 14.0); // parent_error
        assert_eq!(u32_at(&bytes, 56), 15); // first_index
        assert_eq!(u32_at(&bytes, 60), 16); // index_count
    }

    #[test]
    fn per_cluster_cut_varies_detail_by_distance() {
        // The headline property of the GPU cut: detail varies WITHIN a mesh.
        // Region A sits at the origin (near the camera), region B far down +X.
        // Each has a fine cluster (lod 0, small parent error) and a coarse
        // cluster (lod 0.1, root). Under a single budget, A must keep its FINE
        // cluster while B falls back to its COARSE one.
        let pages = vec![
            page_at(0.0, 0.0, 0.1, 100),            // 0: A fine
            page_at(0.0, 0.1, f32::INFINITY, 30),   // 1: A coarse
            page_at(100.0, 0.0, 0.1, 100),          // 2: B fine
            page_at(100.0, 0.1, f32::INFINITY, 30), // 3: B coarse
        ];
        let camera = Vec3::new(0.0, 0.0, 3.0); // near A, far from B
        let pixel_budget = 2.0;
        let mut cut = Vec::new();
        select_cut_per_cluster(
            &pages,
            &Mat4::IDENTITY,
            camera,
            0.5,
            1080.0,
            pixel_budget,
            &mut cut,
        );
        cut.sort_unstable();
        assert!(cut.contains(&0), "near region keeps its FINE cluster");
        assert!(cut.contains(&3), "far region drops to its COARSE cluster");
        assert!(
            !cut.contains(&1),
            "near region must not pick its coarse cluster"
        );
        assert!(
            !cut.contains(&2),
            "far region must not pick its fine cluster"
        );
    }
}