use std::ops::Range;
use std::sync::mpsc;

use crate::texture_info::Texture2DBufferInfo;
use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool};

/// Identifier used to identify a buffer upon retrieval of the data.
///
/// Does not need to be unique!
pub type GpuReadbackIdentifier = u64;
/// Type used for storing user data on the gpu readback belt.
pub type GpuReadbackUserDataStorage = Box<dyn std::any::Any + 'static + Send>;
struct PendingReadbackRange {
identifier: GpuReadbackIdentifier,
buffer_range: Range<wgpu::BufferAddress>,
user_data: GpuReadbackUserDataStorage,
/// The frame index when the readback was scheduled.
scheduled_frame_index: u64,
}
#[derive(thiserror::Error, Debug)]
pub enum GpuReadbackError {
#[error("Texture format {0:?} is not supported for readback.")]
UnsupportedTextureFormatForReadback(wgpu::TextureFormat),
#[error("Texture or buffer does not have the required copy-source usage flag.")]
MissingSrcCopyUsage,
}
/// A reserved slice for GPU readback.
///
/// Readback needs to happen from a buffer/texture with copy-source usage,
/// as we need to copy the data from the GPU to this CPU accessible buffer.
pub struct GpuReadbackBuffer {
chunk_buffer: GpuBuffer,
range_in_chunk: Range<wgpu::BufferAddress>,
}
impl GpuReadbackBuffer {
/// Populates the buffer with data from a single layer of a 2D texture.
///
/// Implementation note:
/// This handles only 2D textures purely for convenience, as it greatly simplifies the input parameters.
/// Additionally, we assume the data is packed as tightly as possible, as this is by far the most common use case.
pub fn read_texture2d(
&mut self,
encoder: &mut wgpu::CommandEncoder,
source: wgpu::TexelCopyTextureInfo<'_>,
copy_extents: wgpu::Extent3d,
) -> Result<(), GpuReadbackError> {
self.read_multiple_texture2d(encoder, &[(source, copy_extents)])
}
/// Reads multiple textures into the same buffer.
///
/// This is primarily useful if you need to make sure that data from all textures is available at the same time.
///
/// Special care has to be taken to ensure that the buffer is large enough.
///
/// ATTENTION: Keep in mind that internal offsets need to be a multiple of the texture block size!
/// While readback buffer starts are guaranteed to be aligned correctly, extra padding may be needed between texture copies.
/// This method will add the required padding between the texture copies if necessary.
/// Panics if the buffer is too small.
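///
/// # Example
///
/// A rough sizing sketch (not taken from real call sites): `belt`, `device`, `buffer_pool`,
/// `encoder`, the two source textures, their copy infos and extents, as well as
/// `identifier`/`user_data` are placeholders assumed to exist in the surrounding renderer code.
///
/// ```ignore
/// let info_a = Texture2DBufferInfo::new(texture_a.format(), extents_a);
/// let info_b = Texture2DBufferInfo::new(texture_b.format(), extents_b);
///
/// // Conservative upper bound: padded sizes plus one block of alignment slack for the second copy.
/// let block_size = texture_b.format().block_copy_size(None).unwrap() as u64;
/// let buffer_size = info_a.buffer_size_padded + block_size + info_b.buffer_size_padded;
///
/// let mut readback_buffer = belt.allocate(&device, &buffer_pool, buffer_size, identifier, user_data);
/// readback_buffer.read_multiple_texture2d(
///     &mut encoder,
///     &[(source_a, extents_a), (source_b, extents_b)],
/// )?;
/// ```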
pub fn read_multiple_texture2d(
&mut self,
encoder: &mut wgpu::CommandEncoder,
sources_and_extents: &[(wgpu::TexelCopyTextureInfo<'_>, wgpu::Extent3d)],
) -> Result<(), GpuReadbackError> {
for (source, copy_extents) in sources_and_extents {
let src_texture = source.texture;
if !src_texture.usage().contains(wgpu::TextureUsages::COPY_SRC) {
return Err(GpuReadbackError::MissingSrcCopyUsage);
}
let start_offset = wgpu::util::align_to(
self.range_in_chunk.start,
src_texture
.format()
.block_copy_size(Some(source.aspect))
.ok_or_else(|| {
GpuReadbackError::UnsupportedTextureFormatForReadback(
source.texture.format(),
)
})? as u64,
);
let buffer_info = Texture2DBufferInfo::new(src_texture.format(), *copy_extents);
// Validate that we stay within the slice (wgpu can't fully know our intention here, so we have to check).
debug_assert!(
buffer_info.buffer_size_padded <= self.range_in_chunk.end - start_offset,
"Texture data is too large to fit into the readback buffer!"
);
encoder.copy_texture_to_buffer(
*source,
wgpu::TexelCopyBufferInfo {
buffer: &self.chunk_buffer,
layout: wgpu::TexelCopyBufferLayout {
offset: start_offset,
bytes_per_row: Some(buffer_info.bytes_per_row_padded),
rows_per_image: None,
},
},
*copy_extents,
);
self.range_in_chunk =
(start_offset + buffer_info.buffer_size_padded)..self.range_in_chunk.end;
}
Ok(())
}
// TODO(andreas): Unused & untested so far!
//
// Populates the buffer with data from a buffer.
//
// Panics if the readback buffer is too small to fit the data.
// pub fn read_buffer(
// self,
// encoder: &mut wgpu::CommandEncoder,
// source: &GpuBuffer,
// source_offset: wgpu::BufferAddress,
// ) {
// let copy_size = self.range_in_chunk.end - self.range_in_chunk.start;
// // Wgpu does validation as well, but in debug mode we want to panic if the buffer doesn't fit.
// debug_assert!(copy_size <= source_offset + source.size(),
// "Source buffer has a size of {}, can't write {copy_size} bytes with an offset of {source_offset}!",
// source.size());
// encoder.copy_buffer_to_buffer(
// source,
// source_offset,
// &self.chunk_buffer,
// self.range_in_chunk.start,
// copy_size,
// );
// }
}
/// Internal chunk of the staging belt.
struct Chunk {
buffer: GpuBuffer,
/// Offset from which on the buffer is unused.
unused_offset: wgpu::BufferAddress,
/// All ranges that are currently in use, i.e. there is a GPU write to it scheduled.
ranges_in_use: Vec<PendingReadbackRange>,
/// Last frame this chunk was received, i.e. the last time a `map_async` operation finished with it.
last_received_frame_index: u64,
}
impl Chunk {
fn remaining_capacity(&self) -> wgpu::BufferAddress {
self.buffer.size() - self.unused_offset
}
/// Caller needs to make sure that there is enough space.
fn allocate(
&mut self,
size_in_bytes: wgpu::BufferAddress,
identifier: GpuReadbackIdentifier,
user_data: GpuReadbackUserDataStorage,
scheduled_frame_index: u64,
) -> GpuReadbackBuffer {
debug_assert!(size_in_bytes <= self.remaining_capacity());
let buffer_range = self.unused_offset..self.unused_offset + size_in_bytes;
self.ranges_in_use.push(PendingReadbackRange {
identifier,
buffer_range: buffer_range.clone(),
user_data,
scheduled_frame_index,
});
let buffer = GpuReadbackBuffer {
chunk_buffer: self.buffer.clone(),
range_in_chunk: buffer_range,
};
self.unused_offset += size_in_bytes;
buffer
}
}
/// Efficiently performs many buffer reads by sharing and reusing temporary buffers.
///
/// Internally it uses a ring-buffer of staging buffers that are sub-allocated.
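///
/// # Example
///
/// A rough sketch of the intended call pattern over a frame (not a runnable test):
/// `device`, `queue`, `buffer_pool`, `encoder`, `texture`, `extents` and `frame_index`
/// are assumed to come from the surrounding renderer setup.
///
/// ```ignore
/// const READBACK_ID: GpuReadbackIdentifier = 42;
///
/// let mut belt = GpuReadbackBelt::new(wgpu::BufferSize::new(1024 * 1024).unwrap());
///
/// // Schedule a GPU -> CPU copy for this frame.
/// let buffer_info = Texture2DBufferInfo::new(texture.format(), extents);
/// let mut readback_buffer = belt.allocate(
///     &device,
///     &buffer_pool,
///     buffer_info.buffer_size_padded,
///     READBACK_ID,
///     Box::new(()), // Any `Send` user data, e.g. picking metadata.
/// );
/// readback_buffer.read_texture2d(
///     &mut encoder,
///     wgpu::TexelCopyTextureInfo {
///         texture: &texture,
///         mip_level: 0,
///         origin: wgpu::Origin3d::ZERO,
///         aspect: wgpu::TextureAspect::All,
///     },
///     extents,
/// )?;
///
/// queue.submit(Some(encoder.finish()));
/// belt.after_queue_submit();
///
/// // …typically one or more frames later…
/// belt.begin_frame(frame_index);
/// belt.readback_next_available::<(), _>(READBACK_ID, |data, _user_data| {
///     // `data` is the raw, row-padded texture content.
///     re_log::trace!("received {} bytes", data.len());
/// });
/// ```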
pub struct GpuReadbackBelt {
/// Minimum size for new buffers.
chunk_size: u64,
/// Chunks for which the GPU writes are scheduled, but we haven't mapped them yet.
active_chunks: Vec<Chunk>,
/// Chunks that have been unmapped and are ready for writing by the GPU.
free_chunks: Vec<Chunk>,
/// Chunks that are currently mapped and ready for reading by the CPU.
received_chunks: Vec<Chunk>,
/// When a chunk's buffer mapping completes successfully, the chunk is sent through this sender.
sender: mpsc::Sender<Chunk>,
/// Chunks received here are mapped and ready to be read by the CPU.
receiver: mpsc::Receiver<Chunk>,
/// Current frame index, used for keeping track of how old chunks are.
frame_index: u64,
}
impl GpuReadbackBelt {
/// All allocations of this allocator will be aligned to at least this size.
///
/// Buffer mappings however are currently NOT guaranteed to be aligned to this size!
/// See this issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508).
const MIN_ALIGNMENT: u64 = wgpu::MAP_ALIGNMENT; //wgpu::COPY_BUFFER_ALIGNMENT.max(wgpu::MAP_ALIGNMENT);
/// Create a ring buffer for efficient & easy gpu memory readback.
///
/// The `chunk_size` is the unit of internal buffer allocation. Reads will be
/// sub-allocated within each chunk. Therefore, for optimal use of memory, the
/// chunk size should be:
///
/// * larger than the largest single [`GpuReadbackBelt::allocate`] operation;
/// * 1-4 times smaller than the total amount of data read back per submission;
/// * bigger is better, within these bounds.
///
/// TODO(andreas): Adaptive chunk sizes
/// TODO(andreas): Shrinking after usage spikes (e.g. screenshots of different sizes!)
pub fn new(chunk_size: wgpu::BufferSize) -> Self {
// We must use an unbounded channel to avoid blocking on web.
#[expect(clippy::disallowed_methods)]
let (sender, receiver) = mpsc::channel();
Self {
chunk_size: wgpu::util::align_to(chunk_size.get(), Self::MIN_ALIGNMENT),
active_chunks: Vec::new(),
free_chunks: Vec::new(),
received_chunks: Vec::new(),
sender,
receiver,
frame_index: 0,
}
}
/// Allocates a GPU-writable & CPU-readable buffer of the given size.
pub fn allocate(
&mut self,
device: &wgpu::Device,
buffer_pool: &GpuBufferPool,
size_in_bytes: wgpu::BufferAddress,
identifier: GpuReadbackIdentifier,
user_data: GpuReadbackUserDataStorage,
) -> GpuReadbackBuffer {
re_tracing::profile_function!();
debug_assert!(size_in_bytes > 0, "Cannot allocate zero-sized buffer");
let size_in_bytes = wgpu::util::align_to(size_in_bytes, Self::MIN_ALIGNMENT);
// Try to find space in any of the active chunks first.
let mut chunk = if let Some(index) = self
.active_chunks
.iter_mut()
.position(|chunk| chunk.remaining_capacity() >= size_in_bytes)
{
self.active_chunks.swap_remove(index)
} else {
// Use a free chunk if possible, fall back to creating a new one if necessary.
if let Some(index) = self
.free_chunks
.iter()
.position(|chunk| chunk.remaining_capacity() >= size_in_bytes)
{
self.free_chunks.swap_remove(index)
} else {
// Allocation might be bigger than a chunk!
let buffer_size = self.chunk_size.max(size_in_bytes);
re_log::trace!(
"Allocating new GpuReadbackBelt chunk of size {:.1} MiB",
buffer_size as f32 / (1024.0 * 1024.0)
);
let buffer = buffer_pool.alloc(
device,
&BufferDesc {
label: "GpuReadbackBelt chunk buffer".into(),
size: buffer_size,
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
},
);
Chunk {
buffer,
unused_offset: 0,
ranges_in_use: Vec::new(),
last_received_frame_index: u64::MAX,
}
}
};
let buffer_slice = chunk.allocate(size_in_bytes, identifier, user_data, self.frame_index);
self.active_chunks.push(chunk);
buffer_slice
}
/// Prepare used buffers for CPU read.
///
/// This should be called after the command encoder(s) used for [`GpuReadbackBuffer`] copy operations have been submitted.
///
/// Implementation note:
/// We can't use [`wgpu::CommandEncoder::map_buffer_on_submit`] here because for that we'd need to know which
/// command encoder is the last one scheduling any gpu->cpu copy operations.
/// Note that if chunks were fully tied to a single encoder, we could call [`wgpu::CommandEncoder::map_buffer_on_submit`]
/// once we know a chunk has all its gpu->cpu copy operations scheduled on that very encoder.
pub fn after_queue_submit(&mut self) {
re_tracing::profile_function!();
// TODO(andreas): Use `map_buffer_on_submit` https://github.com/gfx-rs/wgpu/pull/8125 once available.
for chunk in self.active_chunks.drain(..) {
let sender = self.sender.clone();
chunk.buffer.clone().slice(..chunk.unused_offset).map_async(
wgpu::MapMode::Read,
move |result| {
if result.is_err() {
// This should never happen. Drop the chunk and report.
re_log::error_once!("Failed to map staging buffer for reading");
} else {
sender.send(chunk).ok();
}
},
);
}
}
/// Should be called at the beginning of a new frame.
///
/// Discards stale data that hasn't been retrieved via
/// [`GpuReadbackBelt::readback_next_available`]/[`GpuReadbackBelt::readback_newest_available`] for more than a frame.
pub fn begin_frame(&mut self, frame_index: u64) {
// Make sure each frame has at least one `receive_chunks` call before it ends (from the pov of the readback belt).
// It's important to do this before bumping the frame index, because we want to mark all these chunks as "old"
// chunks that were available for the previous frame.
// (A readback call may already have triggered this just before the frame began, or it may not have happened at all.)
self.receive_chunks();
self.frame_index = frame_index;
// Kill off all stale chunks.
// Note that this is unfortunate but not _really_ a user bug, as it can happen very easily:
// For example, if a picking operation is scheduled on a view that is immediately closed right after!
// TODO(andreas): just use `Vec::drain_filter` once it goes stable.
let (discarded, retained) = self.received_chunks.drain(..).partition(|chunk| {
// If the chunk was received last frame it is too early to discard it; we need to wait one more frame.
// Imagine it was received just at the end of that frame - the user then had no chance to retrieve it
// in code that runs at the beginning of a frame.
chunk.last_received_frame_index + 1 < self.frame_index
});
self.received_chunks = retained;
for chunk in discarded {
re_log::trace!(
"Unread data from a GpuReadbackBelt was discarded. {} ranges remained unread.",
chunk.ranges_in_use.len()
);
self.reuse_chunk(chunk);
}
}
/// Try to receive a pending data readback with the given identifier and user data type,
/// calling a callback on the first result that is available.
///
/// *Most likely* subsequent calls will return data from newer submissions. But there is no *strict* guarantee for this.
/// It could in theory happen that a readback is scheduled after a previous one, but finishes before it!
/// While there's no documented case of this reordering ever happening, your data handling code should handle this gracefully.
///
/// ATTENTION: Do NOT assume any alignment on the slice passed to `on_data_received`.
/// See this issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508).
///
/// Note that [`Self::begin_frame`] will discard stale data automatically, preventing leaks for any
/// scheduled reads that are never queried.
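///
/// # Example
///
/// A minimal sketch; `PickingData` stands in for whatever user data type was passed to
/// [`GpuReadbackBelt::allocate`] for this identifier, and `read_depth_at` is a placeholder
/// for the caller's own decoding:
///
/// ```ignore
/// if let Some(depth) = belt.readback_next_available::<PickingData, _>(
///     picking_identifier,
///     |data, picking_data| {
///         // `data` points into the mapped GPU memory - copy out what you need before returning.
///         read_depth_at(data, picking_data.pixel_position)
///     },
/// ) {
///     re_log::trace!("picked depth: {depth}");
/// }
/// ```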
// TODO(andreas): Can above mentioned reordering _actually_ happen? How contrived does a setup have to be for this?
pub fn readback_next_available<UserDataType: 'static, Ret>(
&mut self,
identifier: GpuReadbackIdentifier,
callback: impl FnOnce(&[u8], Box<UserDataType>) -> Ret,
) -> Option<Ret> {
re_tracing::profile_function!();
self.receive_chunks();
self.readback_next_available_internal(identifier, callback)
.map(|(_, result)| result)
}
/// Try to receive a pending data readback with the given identifier and user data type,
/// calling a callback on all results that are available, but returning only the callback result that is associated with
/// the newest scheduled read.
///
/// *Most likely* subsequent calls will return data from newer submissions. But there is no *strict* guarantee for this.
/// It could in theory happen that a readback is scheduled after a previous one, but finishes before it!
/// While there's no documented case of this reordering ever happening, your data handling code should handle this gracefully.
///
/// Note that [`Self::begin_frame`] will discard stale data automatically, preventing leaks for any
/// scheduled reads that are never queried.
///
/// # Implementation notes
///
/// The callback being called for all results and not just the newest is an artifact of our zero-copy implementation:
/// Whenever inspecting a result, we hold a pointer directly to the mapped GPU data.
/// Moving on to the next result without discarding the previous means we'd potentially have two pointers
/// to the same mapped GPU buffer. This is possible, but inevitably leads to more `unsafe` & more complex code
/// for something that is expected to be a very rare occurrence.
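///
/// # Example
///
/// A sketch for the "only the latest result matters" case (e.g. a continuously updated
/// screenshot preview); `ScreenshotInfo`, `decode_image` and `show_preview` are placeholders:
///
/// ```ignore
/// if let Some(image) = belt.readback_newest_available::<ScreenshotInfo, _>(
///     screenshot_identifier,
///     |data, info| {
///         // Called for every pending result; only the newest return value is kept.
///         decode_image(data, info.extents)
///     },
/// ) {
///     show_preview(image);
/// }
/// ```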
//
// TODO(andreas): Can above mentioned reordering _actually_ happen? How contrived does a setup have to be for this?
pub fn readback_newest_available<UserDataType: 'static, Ret>(
&mut self,
identifier: GpuReadbackIdentifier,
callback: impl Fn(&[u8], Box<UserDataType>) -> Ret,
) -> Option<Ret> {
re_tracing::profile_function!();
self.receive_chunks();
let mut last_result = None;
let mut last_scheduled_frame_index = 0;
while let Some((scheduled_frame_index, result)) =
self.readback_next_available_internal::<UserDataType, Ret>(identifier, &callback)
{
// There could be several results from the same frame. Use the most recently received one in that case.
if scheduled_frame_index >= last_scheduled_frame_index {
last_result = Some(result);
last_scheduled_frame_index = scheduled_frame_index;
}
}
last_result
}
/// Returns result + frame index for when the request was scheduled.
fn readback_next_available_internal<UserDataType: 'static, Ret>(
&mut self,
identifier: GpuReadbackIdentifier,
callback: impl FnOnce(&[u8], Box<UserDataType>) -> Ret,
) -> Option<(u64, Ret)> {
// Search for the user data in the readback chunks.
// A linear search is best suited here since we expect both the number of pending chunks (typically just one or two!)
// as well as the number of readbacks per chunk to be small!
// Also note that identifiers may not be unique!
for (chunk_index, chunk) in self.received_chunks.iter_mut().enumerate() {
for (range_index, range) in chunk.ranges_in_use.iter().enumerate() {
if range.identifier != identifier || !range.user_data.is::<UserDataType>() {
continue;
}
let (result, scheduled_frame_index) = {
let range = chunk.ranges_in_use.swap_remove(range_index);
let slice = chunk.buffer.slice(range.buffer_range.clone());
let data = slice.get_mapped_range();
(
callback(&data, range.user_data.downcast::<UserDataType>().unwrap()),
range.scheduled_frame_index,
)
};
// If this was the last range from this chunk, the chunk is ready for re-use!
if chunk.ranges_in_use.is_empty() {
let chunk = self.received_chunks.swap_remove(chunk_index);
self.reuse_chunk(chunk);
}
return Some((scheduled_frame_index, result));
}
}
None
}
/// Check if any new chunks are ready to be read.
fn receive_chunks(&mut self) {
while let Ok(mut chunk) = self.receiver.try_recv() {
chunk.last_received_frame_index = self.frame_index;
self.received_chunks.push(chunk);
}
}
fn reuse_chunk(&mut self, mut chunk: Chunk) {
chunk.buffer.unmap();
chunk.ranges_in_use.clear();
chunk.unused_offset = 0;
self.free_chunks.push(chunk);
}
}
impl std::fmt::Debug for GpuReadbackBelt {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("GpuReadbackBelt")
.field("chunk_size", &self.chunk_size)
.field("active_chunks", &self.active_chunks.len())
.field("free_chunks", &self.free_chunks.len())
.finish_non_exhaustive()
}
}