1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
//! Compute-pipeline assembly for the path tracer.
//!
//! The shader module is assembled at runtime from the shared preamble, a
//! backend-specific intersection module, and the shared kernel, composed via
//! WESL imports (wgpu has no `#include`). Bind group 0 holds the frame uniforms
//! and accumulation buffer; bind group 1 holds the scene buffers plus, at
//! binding 4, the BVH (compute backend) or the TLAS (hardware backend).
use bytemuck::{Pod, Zeroable};
use crate::context::Context;
use super::accumulation::Accumulation;
use super::environment::Environment;
use super::gpu_scene::GpuScene;
use super::RayBackend;
/// Per-frame uniforms, mirroring the WGSL `FrameUniforms`.
///
/// Layout: the `mat4x4` and the `env_rotation` vec4 lead (80 bytes). `cam_eye`
/// (vec3) + `width` (u32) together fill one 16-byte row, and the remaining
/// scalars are grouped four to a row so that every trailing `[_; 4]` field
/// starts on a 16-byte boundary. With `#[repr(C)]` the fields sum to 224 bytes
/// with no implicit padding (which `Pod` requires).
///
/// NOTE(review): the field order must match the WGSL-side struct declaration,
/// which is not visible from this file — confirm when adding or moving fields.
#[repr(C)]
#[derive(Copy, Clone, Debug, Pod, Zeroable)]
pub struct FrameUniforms {
/// Inverse view-projection matrix (column-major).
pub inv_view_proj: [[f32; 4]; 4],
/// Environment rotation about Y: (cos, sin, luminance_scale, unused).
pub env_rotation: [f32; 4],
/// Camera position in world space.
pub cam_eye: [f32; 3],
/// Render target width in pixels.
pub width: u32,
/// Render target height in pixels.
pub height: u32,
/// Index of the sample being accumulated (0 = first / reset).
pub sample_index: u32,
/// Number of triangles in the scene.
pub num_triangles: u32,
/// Number of lights in the scene.
pub num_lights: u32,
/// Ambient intensity. Acts as a uniform white environment light in the kernel:
/// added on scattered (diffuse/glossy) ray misses so it fills shadowed surfaces
/// like the rasterizer's ambient term, without tinting the visible background.
pub ambient: f32,
/// Maximum path length.
pub max_bounces: u32,
/// RNG seed (varies per frame).
pub seed: u32,
/// Number of samples to trace this dispatch.
pub samples_per_frame: u32,
/// Number of emissive triangles (area-light NEE).
pub num_emitters: u32,
/// Thin-lens aperture radius (0 = pinhole camera).
pub lens_radius: f32,
/// Thin-lens focus distance (world units).
pub focus_distance: f32,
/// 1 if an environment map is bound, else 0 (use the background color).
pub has_env: u32,
/// Background color (RGB + unused A) shown on ray miss when no environment map
/// is bound. Lands at byte offset 144 (the preceding fields total 144 bytes),
/// i.e. on a 16-byte boundary as a vec4 requires.
pub background: [f32; 4],
/// Ambient light color (RGB; A unused), multiplying the scalar `ambient` fill.
pub ambient_color: [f32; 4],
/// Distance-fog color (RGB) + overall strength (A).
pub fog_color: [f32; 4],
/// Fog falloff encoding `(mode, param_a, param_b, height_falloff)`; `mode == 0`
/// disables fog. See [`crate::light::Fog`].
pub fog_params: [f32; 4],
/// Misc flags. `x = 1` when some object opts out of casting shadows (the kernel
/// then walks shadow-ray occluders to skip non-casters). The rest are reserved.
/// This is the final field; the struct ends at byte 224.
pub flags: [u32; 4],
}
// Shader sources embedded at compile time. `PathTracePipeline::new` registers
// them as WESL modules and compiles them together with `rt_kernel` as the root.

/// Shared preamble, registered as `package::rt_preamble`.
const PREAMBLE: &str = include_str!("../../builtin/raytrace/rt_preamble.wgsl");
/// Intersection module for the software (BVH traversal) backend; registered as
/// `package::rt_intersect` when the backend is not hardware.
const INTERSECT_BVH: &str = include_str!("../../builtin/raytrace/rt_intersect_bvh.wgsl");
/// Intersection module for the hardware ray-query backend; registered as
/// `package::rt_intersect` when the backend is hardware.
const INTERSECT_RAYQUERY: &str = include_str!("../../builtin/raytrace/rt_intersect_rayquery.wgsl");
/// Shared path-tracing kernel, registered as `package::rt_kernel` (the root module).
const KERNEL: &str = include_str!("../../builtin/raytrace/rt_kernel.wgsl");
/// The compute pipeline plus its persistent frame-uniform buffer and layouts.
///
/// The bind group layouts are kept so that bind groups can be rebuilt on every
/// dispatch against whatever scene/accumulation buffers are current.
pub struct PathTracePipeline {
/// Compiled compute pipeline (entry point `main`).
pipeline: wgpu::ComputePipeline,
/// Layout of bind group 0: frame uniforms (binding 0) and the read-write
/// accumulation storage buffer (binding 1).
group0_layout: wgpu::BindGroupLayout,
/// Layout of bind group 1: scene resources; its entries depend on the backend
/// the pipeline was created for.
group1_layout: wgpu::BindGroupLayout,
/// Uniform buffer sized for one `FrameUniforms`, refreshed by `write_uniforms`.
frame_uniform: wgpu::Buffer,
}
impl PathTracePipeline {
/// Builds the path-tracing compute pipeline for the given backend.
///
/// Compiles the shader for `backend`, creates both bind group layouts and the
/// compute pipeline (entry point `main`), and allocates the uniform buffer
/// that `write_uniforms` later fills.
pub fn new(backend: RayBackend) -> PathTracePipeline {
    let ctxt = Context::get();

    // The backend decides which intersection module is mounted as
    // `package::rt_intersect`, and whether the `hardware` feature is set (it
    // gates the `enable wgpu_ray_query;` directive in the kernel).
    let (hardware, intersect) = match backend {
        RayBackend::Hardware => (true, INTERSECT_RAYQUERY),
        RayBackend::Software => (false, INTERSECT_BVH),
    };

    // Modules are resolved through WESL imports rather than by pasting the
    // sources together; `rt_kernel` is the root module.
    let modules = [
        ("package::rt_preamble", PREAMBLE),
        ("package::rt_intersect", intersect),
        ("package::rt_kernel", KERNEL),
        ("package::pbr_env", crate::builtin::PBR_ENV_WESL),
        ("package::common", crate::builtin::COMMON_WESL),
    ];
    let features = [("hardware", hardware)];
    let source = crate::builtin::compile_wesl(&modules, "package::rt_kernel", &features);
    let shader = ctxt.create_shader_module(Some("rt_path_tracer"), &source);

    // Group 0: frame uniforms + read-write radiance/guide storage.
    let group0_entries = [uniform_entry(0), storage_entry(1, false)];
    let group0_layout = ctxt.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
        label: Some("rt_group0_layout"),
        entries: &group0_entries,
    });

    // Group 1: scene resources, backend-dependent.
    let group1_entries = Self::group1_entries(backend);
    let group1_layout = ctxt.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
        label: Some("rt_group1_layout"),
        entries: &group1_entries,
    });

    let pipeline_layout = ctxt.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
        label: Some("rt_pipeline_layout"),
        bind_group_layouts: &[Some(&group0_layout), Some(&group1_layout)],
        immediate_size: 0,
    });

    let pipeline_desc = wgpu::ComputePipelineDescriptor {
        label: Some("rt_path_tracer_pipeline"),
        layout: Some(&pipeline_layout),
        module: &shader,
        entry_point: Some("main"),
        compilation_options: Default::default(),
        cache: None,
    };
    let pipeline = ctxt.device.create_compute_pipeline(&pipeline_desc);

    // One `FrameUniforms` worth of bytes, writable from the CPU via copies.
    let uniform_size = std::mem::size_of::<FrameUniforms>() as u64;
    let uniform_usage = wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST;
    let frame_uniform =
        ctxt.create_buffer_simple(Some("rt_frame_uniform"), uniform_size, uniform_usage);

    PathTracePipeline {
        pipeline,
        group0_layout,
        group1_layout,
        frame_uniform,
    }
}
/// Appends the scene bindings common to both backends (5..=9): emitters, the
/// PBR texture array with its sampler, and the environment map with its
/// sampler. Binding 4 is backend-specific and is not added here.
fn group1_shared_tail(entries: &mut Vec<wgpu::BindGroupLayoutEntry>) {
    entries.extend([
        // emitters
        storage_entry(5, true),
        // texture array + its sampler
        texture_entry(6, wgpu::TextureViewDimension::D2Array),
        sampler_entry(7),
        // environment map + its sampler
        texture_entry(8, wgpu::TextureViewDimension::D2),
        sampler_entry(9),
    ]);
}
/// Appends the compute backend's two-level binding: the instances buffer
/// (binding 12).
///
/// Binding 4 already carries the *merged* BVH (TLAS nodes followed by every
/// mesh's BLAS nodes) and each instance inlines its mesh's node/triangle base
/// offsets, so no separate BLAS-node (10) or mesh-descriptor (11) buffer is
/// bound. This keeps the compute stage within WebGPU's limit of 8 storage
/// buffers per stage.
fn group1_two_level_tail(entries: &mut Vec<wgpu::BindGroupLayoutEntry>) {
    let instances = storage_entry(12, true);
    entries.push(instances);
}
/// Assembles the layout entries of bind group 1 for `backend`.
///
/// Both backends share bindings 0..=3 and 5..=9. Binding 4 is a read-only
/// storage buffer (BVH nodes) for the software backend and an acceleration
/// structure for the hardware backend. The software backend then adds the
/// instances buffer (12); the hardware backend adds the mesh table (11) and
/// the instances buffer (12).
fn group1_entries(backend: RayBackend) -> Vec<wgpu::BindGroupLayoutEntry> {
    // 0: (mesh) vertices, 1: (mesh) triangles, 2: materials, 3: lights.
    let mut entries: Vec<wgpu::BindGroupLayoutEntry> = (0..4)
        .map(|binding| storage_entry(binding, true))
        .collect();

    // Binding 4: what the kernel traverses.
    let traversal = match backend {
        // Merged BVH node buffer.
        RayBackend::Software => storage_entry(4, true),
        // TLAS object used by ray queries.
        RayBackend::Hardware => wgpu::BindGroupLayoutEntry {
            binding: 4,
            visibility: wgpu::ShaderStages::COMPUTE,
            ty: wgpu::BindingType::AccelerationStructure {
                vertex_return: false,
            },
            count: None,
        },
    };
    entries.push(traversal);

    Self::group1_shared_tail(&mut entries);

    match backend {
        // The software path walks the merged BVH, so it only needs the
        // instances buffer (12) on top of binding 4.
        RayBackend::Software => Self::group1_two_level_tail(&mut entries),
        // The hardware path traverses the TLAS object; it needs the mesh table
        // and instances (11/12) to resolve a hit's triangle + material.
        RayBackend::Hardware => entries.extend([
            storage_entry(11, true), // meshes
            storage_entry(12, true), // instances
        ]),
    }

    entries
}
/// Uploads `uniforms` to the start of the persistent frame-uniform buffer.
pub fn write_uniforms(&self, uniforms: &FrameUniforms) {
    let bytes = bytemuck::bytes_of(uniforms);
    let ctxt = Context::get();
    ctxt.write_buffer(&self.frame_uniform, 0, bytes);
}
/// Creates bind group 0: the frame-uniform buffer at binding 0 and the
/// accumulation buffer of `accum` at binding 1.
fn make_group0(&self, accum: &Accumulation) -> wgpu::BindGroup {
    let entries = [
        wgpu::BindGroupEntry {
            binding: 0,
            resource: self.frame_uniform.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 1,
            resource: accum.buffer.as_entire_binding(),
        },
    ];
    let descriptor = wgpu::BindGroupDescriptor {
        label: Some("rt_group0"),
        layout: &self.group0_layout,
        entries: &entries,
    };
    Context::get().create_bind_group(&descriptor)
}
/// Records the path-tracing dispatch into `encoder` for the compute backend.
///
/// Bind groups 0 and 1 are rebuilt on every call. Group 1 binds 0..=9 and 12,
/// with the merged BVH buffer at binding 4, which is the set of entries the
/// software layout declares. One workgroup is dispatched per 8x8 pixel tile of
/// the `width` x `height` target, rounding up.
pub fn dispatch_compute(
    &self,
    encoder: &mut wgpu::CommandEncoder,
    scene: &GpuScene,
    accum: &Accumulation,
    env: &Environment,
    width: u32,
    height: u32,
    gpu: &mut crate::renderer::timings::GpuTimer,
) {
    let ctxt = Context::get();
    let frame_group = self.make_group0(accum);

    let scene_entries = [
        wgpu::BindGroupEntry {
            binding: 0,
            resource: scene.vertices.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 1,
            resource: scene.triangles.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 2,
            resource: scene.materials.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 3,
            resource: scene.lights.as_entire_binding(),
        },
        // Merged TLAS+BLAS node buffer.
        wgpu::BindGroupEntry {
            binding: 4,
            resource: scene.bvh.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 5,
            resource: scene.emitters.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 6,
            resource: wgpu::BindingResource::TextureView(&scene.tex_array.view),
        },
        wgpu::BindGroupEntry {
            binding: 7,
            resource: wgpu::BindingResource::Sampler(&scene.tex_array.sampler),
        },
        wgpu::BindGroupEntry {
            binding: 8,
            resource: wgpu::BindingResource::TextureView(&env.view),
        },
        wgpu::BindGroupEntry {
            binding: 9,
            resource: wgpu::BindingResource::Sampler(&env.sampler),
        },
        // Instances of the two-level structure; each one inlines its node/tri
        // base into the merged buffer at binding 4.
        wgpu::BindGroupEntry {
            binding: 12,
            resource: scene.instances.as_entire_binding(),
        },
    ];
    let scene_group = ctxt.create_bind_group(&wgpu::BindGroupDescriptor {
        label: Some("rt_group1"),
        layout: &self.group1_layout,
        entries: &scene_entries,
    });

    // The "trace" timing scope is handed to the pass as its timestamp writes.
    let timestamp_writes = gpu.compute_scope("trace");
    let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
        label: Some("rt_path_trace_pass"),
        timestamp_writes,
    });
    pass.set_pipeline(&self.pipeline);
    pass.set_bind_group(0, &frame_group, &[]);
    pass.set_bind_group(1, &scene_group, &[]);
    pass.dispatch_workgroups(width.div_ceil(8), height.div_ceil(8), 1);
}
/// Records the path-tracing dispatch for the hardware ray-query backend.
///
/// Bind group 1 carries the TLAS at binding 4 (instead of the BVH buffer) and
/// additionally binds the mesh table (11) and instances (12); group 0 and the
/// dispatch itself are the same as on the compute path. One workgroup is
/// dispatched per 8x8 pixel tile of the `width` x `height` target, rounding up.
///
/// # Panics
///
/// Panics if `scene.tlas` is `None`.
pub fn dispatch_hardware(
    &self,
    encoder: &mut wgpu::CommandEncoder,
    scene: &GpuScene,
    accum: &Accumulation,
    env: &Environment,
    width: u32,
    height: u32,
    gpu: &mut crate::renderer::timings::GpuTimer,
) {
    let ctxt = Context::get();
    let tlas = scene
        .tlas
        .as_ref()
        .expect("hardware backend requires a built TLAS");
    let frame_group = self.make_group0(accum);

    let scene_entries = [
        wgpu::BindGroupEntry {
            binding: 0,
            resource: scene.vertices.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 1,
            resource: scene.triangles.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 2,
            resource: scene.materials.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 3,
            resource: scene.lights.as_entire_binding(),
        },
        // The TLAS itself, traversed by ray queries.
        wgpu::BindGroupEntry {
            binding: 4,
            resource: wgpu::BindingResource::AccelerationStructure(tlas),
        },
        wgpu::BindGroupEntry {
            binding: 5,
            resource: scene.emitters.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 6,
            resource: wgpu::BindingResource::TextureView(&scene.tex_array.view),
        },
        wgpu::BindGroupEntry {
            binding: 7,
            resource: wgpu::BindingResource::Sampler(&scene.tex_array.sampler),
        },
        wgpu::BindGroupEntry {
            binding: 8,
            resource: wgpu::BindingResource::TextureView(&env.view),
        },
        wgpu::BindGroupEntry {
            binding: 9,
            resource: wgpu::BindingResource::Sampler(&env.sampler),
        },
        // Mesh table + instances: used to resolve a ray-query hit's triangle
        // and material.
        wgpu::BindGroupEntry {
            binding: 11,
            resource: scene.meshes.as_entire_binding(),
        },
        wgpu::BindGroupEntry {
            binding: 12,
            resource: scene.instances.as_entire_binding(),
        },
    ];
    let scene_group = ctxt.create_bind_group(&wgpu::BindGroupDescriptor {
        label: Some("rt_group1"),
        layout: &self.group1_layout,
        entries: &scene_entries,
    });

    // The "trace" timing scope is handed to the pass as its timestamp writes.
    let timestamp_writes = gpu.compute_scope("trace");
    let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
        label: Some("rt_path_trace_pass_hw"),
        timestamp_writes,
    });
    pass.set_pipeline(&self.pipeline);
    pass.set_bind_group(0, &frame_group, &[]);
    pass.set_bind_group(1, &scene_group, &[]);
    let groups_x = width.div_ceil(8);
    let groups_y = height.div_ceil(8);
    pass.dispatch_workgroups(groups_x, groups_y, 1);
}
}
/// Layout entry for a compute-visible uniform buffer at `binding`, with no
/// dynamic offset and no minimum binding size.
fn uniform_entry(binding: u32) -> wgpu::BindGroupLayoutEntry {
    let ty = wgpu::BindingType::Buffer {
        ty: wgpu::BufferBindingType::Uniform,
        has_dynamic_offset: false,
        min_binding_size: None,
    };
    wgpu::BindGroupLayoutEntry {
        binding,
        visibility: wgpu::ShaderStages::COMPUTE,
        ty,
        count: None,
    }
}
/// Layout entry for a compute-visible storage buffer at `binding`.
///
/// `read_only` selects read-only (`true`) or read-write (`false`) access. The
/// entry has no dynamic offset and no minimum binding size.
fn storage_entry(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry {
    let access = wgpu::BufferBindingType::Storage { read_only };
    let ty = wgpu::BindingType::Buffer {
        ty: access,
        has_dynamic_offset: false,
        min_binding_size: None,
    };
    wgpu::BindGroupLayoutEntry {
        binding,
        visibility: wgpu::ShaderStages::COMPUTE,
        ty,
        count: None,
    }
}
/// Layout entry for a compute-visible, non-multisampled, filterable float
/// texture at `binding`, viewed with dimension `dim`.
fn texture_entry(binding: u32, dim: wgpu::TextureViewDimension) -> wgpu::BindGroupLayoutEntry {
    let sample_type = wgpu::TextureSampleType::Float { filterable: true };
    let ty = wgpu::BindingType::Texture {
        sample_type,
        view_dimension: dim,
        multisampled: false,
    };
    wgpu::BindGroupLayoutEntry {
        binding,
        visibility: wgpu::ShaderStages::COMPUTE,
        ty,
        count: None,
    }
}
/// Layout entry for a compute-visible filtering sampler at `binding`.
fn sampler_entry(binding: u32) -> wgpu::BindGroupLayoutEntry {
    let ty = wgpu::BindingType::Sampler(wgpu::SamplerBindingType::Filtering);
    wgpu::BindGroupLayoutEntry {
        binding,
        visibility: wgpu::ShaderStages::COMPUTE,
        ty,
        count: None,
    }
}