// bevy_render/render_resource/batched_uniform_buffer.rs

use super::{GpuArrayBufferIndex, GpuArrayBufferable};
use crate::{
    render_resource::DynamicUniformBuffer,
    renderer::{RenderDevice, RenderQueue},
};
use core::{marker::PhantomData, num::NonZero};
use encase::{
    private::{ArrayMetadata, BufferMut, Metadata, RuntimeSizedArray, WriteInto, Writer},
    ShaderType,
};
use nonmax::NonMaxU32;
use wgpu::{BindingResource, Limits};

// Cap batches at 1 MiB even when the device reports a much larger
// `max_uniform_buffer_binding_size` (macOS in particular reports very large
// values). This cap ends up being both the minimum size of the uniform buffer
// and the size of each chunk of data at a dynamic offset, so leaving it
// uncapped would create really large arrays.
#[cfg(any(
    not(feature = "webgl"),
    not(target_arch = "wasm32"),
    feature = "webgpu"
))]
const MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE: u32 = 1 << 20;

// WebGL2 quirk: using uniform buffers larger than 4KB will cause extremely
// long shader compilation times, so the limit needs to be lower on WebGL2.
// This is due to older shader compilers/GPUs that don't support dynamically
// indexing uniform buffers, and instead emulate it with large switch statements
// over buffer indices that take a long time to compile.
// NOTE: this cfg is the exact complement of the cfg on the non-WebGL2 constant
// above, so exactly one definition is ever active.
#[cfg(all(feature = "webgl", target_arch = "wasm32", not(feature = "webgpu")))]
const MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE: u32 = 1 << 12;

/// Similar to [`DynamicUniformBuffer`], except every N elements (depending on size)
/// are grouped into a batch as an `array<T, N>` in WGSL.
///
/// This reduces the number of rebindings required due to having to pass dynamic
/// offsets to bind group commands, and if indices into the array can be passed
/// in via other means, it enables batching of draw commands.
pub struct BatchedUniformBuffer<T: GpuArrayBufferable> {
    // Batches of fixed-size arrays of T are written to this buffer so that
    // each batch in a fixed-size array can be bound at a dynamic offset.
    uniforms: DynamicUniformBuffer<MaxCapacityArray<Vec<T>>>,
    // A batch of T are gathered into this `MaxCapacityArray` until it is full,
    // then it is written into the `DynamicUniformBuffer`, cleared, and new T
    // are gathered here, and so on for each batch.
    temp: MaxCapacityArray<Vec<T>>,
    // Dynamic offset (in bytes) at which the *next* batch will be written;
    // advanced by the aligned batch size on every flush.
    current_offset: u32,
    // `min_uniform_buffer_offset_alignment` from the device limits; each
    // batch's byte size is rounded up to a multiple of this.
    dynamic_offset_alignment: u32,
}

51impl<T: GpuArrayBufferable> BatchedUniformBuffer<T> {
52    pub fn batch_size(limits: &Limits) -> usize {
53        (limits
54            .max_uniform_buffer_binding_size
55            .min(MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE) as u64
56            / T::min_size().get()) as usize
57    }
58
59    pub fn new(limits: &Limits) -> Self {
60        let capacity = Self::batch_size(limits);
61        let alignment = limits.min_uniform_buffer_offset_alignment;
62
63        Self {
64            uniforms: DynamicUniformBuffer::new_with_alignment(alignment as u64),
65            temp: MaxCapacityArray(Vec::with_capacity(capacity), capacity),
66            current_offset: 0,
67            dynamic_offset_alignment: alignment,
68        }
69    }
70
71    #[inline]
72    pub fn size(&self) -> NonZero<u64> {
73        self.temp.size()
74    }
75
76    pub fn clear(&mut self) {
77        self.uniforms.clear();
78        self.current_offset = 0;
79        self.temp.0.clear();
80    }
81
82    pub fn push(&mut self, component: T) -> GpuArrayBufferIndex<T> {
83        let result = GpuArrayBufferIndex {
84            index: self.temp.0.len() as u32,
85            dynamic_offset: NonMaxU32::new(self.current_offset),
86            element_type: PhantomData,
87        };
88        self.temp.0.push(component);
89        if self.temp.0.len() == self.temp.1 {
90            self.flush();
91        }
92        result
93    }
94
95    pub fn flush(&mut self) {
96        self.uniforms.push(&self.temp);
97
98        self.current_offset +=
99            align_to_next(self.temp.size().get(), self.dynamic_offset_alignment as u64) as u32;
100
101        self.temp.0.clear();
102    }
103
104    pub fn write_buffer(&mut self, device: &RenderDevice, queue: &RenderQueue) {
105        if !self.temp.0.is_empty() {
106            self.flush();
107        }
108        self.uniforms.write_buffer(device, queue);
109    }
110
111    #[inline]
112    pub fn binding(&self) -> Option<BindingResource<'_>> {
113        let mut binding = self.uniforms.binding();
114        if let Some(BindingResource::Buffer(binding)) = &mut binding {
115            // MaxCapacityArray is runtime-sized so can't use T::min_size()
116            binding.size = Some(self.size());
117        }
118        binding
119    }
120}
121
/// Rounds `value` up to the next multiple of `alignment`.
///
/// `alignment` must be a non-zero power of two (offset alignments from
/// `wgpu::Limits` always are). Returns `value` unchanged when it is already
/// aligned, and 0 for `value == 0` — the previous bit-twiddling form
/// (`((value - 1) | (alignment - 1)) + 1`) underflowed on 0.
#[inline]
fn align_to_next(value: u64, alignment: u64) -> u64 {
    debug_assert!(alignment.is_power_of_two());
    value.next_multiple_of(alignment)
}

// ----------------------------------------------------------------------------
// MaxCapacityArray was implemented by Teodor Tanasoaia for encase. It was
// copied here as it was not yet included in an encase release and it is
// unclear if it is the correct long-term solution for encase.

/// A runtime-sized array (`.0`, e.g. a `Vec<T>`) paired with a fixed maximum
/// capacity (`.1`). Its shader-side size is computed from the capacity rather
/// than the current length, so every batch occupies the same number of bytes.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
struct MaxCapacityArray<T>(T, usize);

// Reports the layout of the array at its maximum capacity rather than its
// current length, so all batches have an identical stride.
impl<T> ShaderType for MaxCapacityArray<T>
where
    T: ShaderType<ExtraMetadata = ArrayMetadata>,
{
    type ExtraMetadata = ArrayMetadata;

    const METADATA: Metadata<Self::ExtraMetadata> = T::METADATA;

    fn size(&self) -> NonZero<u64> {
        // Capacity-based size: stride * capacity. `.max(1)` keeps the result
        // non-zero (the return type is `NonZero<u64>`) even when capacity is 0.
        Self::METADATA.stride().mul(self.1.max(1) as u64).0
    }
}

impl<T> WriteInto for MaxCapacityArray<T>
where
    T: WriteInto + RuntimeSizedArray,
{
    fn write_into<B: BufferMut>(&self, writer: &mut Writer<B>) {
        // The stored length must never exceed the declared capacity (`self.1`),
        // since `size()` reserves space based on the capacity alone.
        debug_assert!(self.0.len() <= self.1);
        self.0.write_into(writer);
    }
}