re_renderer 0.31.2

A wgpu based renderer for all your visualization needs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
use std::sync::mpsc;

use re_log::debug_assert;

use crate::texture_info::Texture2DBufferInfo;
use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool, GpuTexture};

#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
pub enum CpuWriteGpuReadError {
    #[error("Attempting to allocate an empty buffer.")]
    ZeroSizeBufferAllocation,

    #[error(
        "Buffer is full, can't append more data! Buffer has a capacity for {buffer_capacity_elements} elements.
 Tried to add {num_elements_attempted_to_add} elements, but only added {num_elements_actually_added}."
    )]
    BufferFull {
        buffer_capacity_elements: usize,
        num_elements_attempted_to_add: usize,
        num_elements_actually_added: usize,
    },

    #[error(
        "Target buffer has a size of {target_buffer_size}, can't write {copy_size} bytes with an offset of {destination_offset}!"
    )]
    TargetBufferTooSmall {
        target_buffer_size: u64,
        copy_size: u64,
        destination_offset: u64,
    },

    #[error(
        "Target texture doesn't fit the size of the written data to this buffer! Texture target buffer should be at most {max_copy_size} bytes, but the to be written data was {written_data_size} bytes."
    )]
    TargetTextureBufferSizeMismatch {
        max_copy_size: u64,
        written_data_size: usize,
    },
}

/// A sub-allocated staging buffer that can be written to.
///
/// Behaves a bit like a fixed sized `Vec` in that far it keeps track of how many elements were written to it.
///
/// We do *not* allow reading from this buffer as it is typically write-combined memory.
/// Reading would work, but it can be *very* slow.
/// For details on why, see
/// [Write combining is not your friend, by Fabian Giesen](https://fgiesen.wordpress.com/2013/01/29/write-combining-is-not-your-friend/)
/// Note that the "vec like behavior" further encourages
/// * not leaving holes
/// * keeping writes sequential
///
/// Must be dropped before calling [`CpuWriteGpuReadBelt::before_queue_submit`] (typically the end of a frame).
/// If this buffer is not dropped before calling [`CpuWriteGpuReadBelt::before_queue_submit`], a validation error will occur.
pub struct CpuWriteGpuReadBuffer<T: bytemuck::Pod + Send + Sync> {
    /// Write view into the relevant buffer portion.
    write_view: wgpu::BufferViewMut,

    /// Range in T elements in `write_view` that haven't been written yet.
    unwritten_element_range: std::ops::Range<usize>,

    chunk_buffer: GpuBuffer,
    byte_offset_in_chunk_buffer: wgpu::BufferAddress,

    /// Marker for the type whose alignment and size requirements are honored by `write_view`.
    _type: std::marker::PhantomData<T>,
}

impl<T> CpuWriteGpuReadBuffer<T>
where
    T: bytemuck::Pod + Send + Sync,
{
    /// Write-only view into the unwritten portion of the buffer.
    ///
    /// Note that we can't rely on any alignment guarantees here!
    /// We could offset the mapped CPU-sided memory, but then the GPU offset won't be aligned anymore.
    /// There's no way we can meet conflicting alignment requirements, so we need to work with unaligned bytes instead.
    /// See [this comment on this wgpu issue](https://github.com/gfx-rs/wgpu/issues/3508#issuecomment-1485044324) about what we tried before.
    ///
    /// Once wgpu has some alignment guarantees, we might be able to use this here to allow faster copies!
    /// (copies of larger blocks are likely less affected as `memcpy` typically does dynamic check/dispatching for SIMD based copies)
    #[inline(always)]
    fn write_only_slice(
        &mut self,
        range: impl std::ops::RangeBounds<usize>,
    ) -> wgpu::WriteOnly<'_, [u8]> {
        let start = self.unwritten_element_range.start * std::mem::size_of::<T>();
        let end = self.unwritten_element_range.end * std::mem::size_of::<T>();
        self.write_view.slice(start..end).into_slice(range)
    }

    /// Pushes a slice of elements into the buffer.
    ///
    /// If the buffer is not big enough, only the first `self.remaining_capacity()` elements are pushed before returning an error.
    #[inline]
    pub fn extend_from_slice(&mut self, elements: &[T]) -> Result<(), CpuWriteGpuReadError> {
        if elements.is_empty() {
            return Ok(());
        }

        re_tracing::profile_function_if!(10_000 < elements.len());

        let remaining_capacity = self.remaining_capacity();
        let (result, elements) = if elements.len() > remaining_capacity {
            (
                Err(CpuWriteGpuReadError::BufferFull {
                    buffer_capacity_elements: self.capacity(),
                    num_elements_attempted_to_add: elements.len(),
                    num_elements_actually_added: remaining_capacity,
                }),
                &elements[..remaining_capacity],
            )
        } else {
            (Ok(()), elements)
        };

        let bytes = bytemuck::cast_slice(elements);
        self.write_only_slice(..bytes.len()).copy_from_slice(bytes);
        self.unwritten_element_range.start += elements.len();

        result
    }

    /// Pushes several elements into the buffer.
    ///
    /// If the buffer is not big enough, only the first [`CpuWriteGpuReadBuffer::remaining_capacity`] elements are pushed before returning an error.
    /// Otherwise, returns the number of elements pushed for convenience.
    #[inline]
    pub fn extend(
        &mut self,
        mut elements: impl ExactSizeIterator<Item = T>,
    ) -> Result<usize, CpuWriteGpuReadError> {
        re_tracing::profile_function!();

        // TODO(emilk): optimize the extend function.
        // Right now it is 3-4x faster to collect to a vec first, which is crazy.
        if true {
            let vec: Vec<T> = elements.collect();

            re_log::debug_assert_eq!(
                vec.as_ptr() as usize % std::mem::align_of::<T>(),
                0,
                "Vec::collect collects into unaligned memory! Is this a bug in the allocator?"
            );

            self.extend_from_slice(vec.as_slice())?;
            Ok(vec.len())
        } else {
            let num_written_before = self.num_written();

            while let Some(element) = elements.next() {
                if self.unwritten_element_range.start >= self.unwritten_element_range.end {
                    let num_elements_actually_added = self.num_written() - num_written_before;

                    return Err(CpuWriteGpuReadError::BufferFull {
                        buffer_capacity_elements: self.capacity(),
                        num_elements_attempted_to_add: num_elements_actually_added
                            + elements.count()
                            + 1,
                        num_elements_actually_added,
                    });
                }

                self.write_only_slice(..std::mem::size_of::<T>())
                    .copy_from_slice(bytemuck::bytes_of(&element));
                self.unwritten_element_range.start += 1;
            }

            Ok(self.num_written() - num_written_before)
        }
    }

    /// Fills the buffer with n instances of an element.
    ///
    /// If the buffer is not big enough, only the first `self.remaining_capacity()` elements are pushed before returning an error.
    pub fn add_n(&mut self, element: T, num_elements: usize) -> Result<(), CpuWriteGpuReadError> {
        if num_elements == 0 {
            return Ok(());
        }

        re_tracing::profile_function_if!(10_000 < num_elements);

        let remaining_capacity = self.remaining_capacity();
        let (result, num_elements) = if num_elements > remaining_capacity {
            (
                Err(CpuWriteGpuReadError::BufferFull {
                    buffer_capacity_elements: self.capacity(),
                    num_elements_attempted_to_add: num_elements,
                    num_elements_actually_added: remaining_capacity,
                }),
                remaining_capacity,
            )
        } else {
            (Ok(()), num_elements)
        };

        let mut offset = 0;
        let element_bytes = bytemuck::bytes_of(&element);
        let mut write_slice = self.write_only_slice(..num_elements * std::mem::size_of::<T>());

        for _ in 0..num_elements {
            let end = offset + std::mem::size_of::<T>();
            write_slice
                .slice(offset..end)
                .copy_from_slice(element_bytes);
            offset = end;
        }
        self.unwritten_element_range.start += num_elements;

        result
    }

    /// Pushes a single element into the buffer and advances the write pointer.
    ///
    /// Returns an error if the data no longer fits into the buffer.
    #[inline]
    pub fn push(&mut self, element: T) -> Result<(), CpuWriteGpuReadError> {
        if self.remaining_capacity() == 0 {
            return Err(CpuWriteGpuReadError::BufferFull {
                buffer_capacity_elements: self.capacity(),
                num_elements_attempted_to_add: 1,
                num_elements_actually_added: 0,
            });
        }

        self.write_only_slice(..std::mem::size_of::<T>())
            .copy_from_slice(bytemuck::bytes_of(&element));
        self.unwritten_element_range.start += 1;

        Ok(())
    }

    /// True if no elements have been pushed into the buffer so far.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.unwritten_element_range.start == 0
    }

    /// The number of elements pushed into the buffer so far.
    #[inline]
    pub fn num_written(&self) -> usize {
        self.unwritten_element_range.start
    }

    /// The number of elements that can still be pushed into the buffer.
    #[inline]
    pub fn remaining_capacity(&self) -> usize {
        self.unwritten_element_range.end - self.unwritten_element_range.start
    }

    /// Total number of elements that the buffer can hold.
    pub fn capacity(&self) -> usize {
        self.unwritten_element_range.end
    }

    /// Copies all so far written data to the first layer of a 2D texture.
    ///
    /// Assumes that the buffer consists of as-tightly-packed-as-possible rows of data.
    /// (taking into account required padding as specified by [`wgpu::COPY_BYTES_PER_ROW_ALIGNMENT`])
    ///
    /// Fails if the buffer size is not sufficient to fill the entire texture.
    pub fn copy_to_texture2d_entire_first_layer(
        self,
        encoder: &mut wgpu::CommandEncoder,
        destination: &GpuTexture,
    ) -> Result<(), CpuWriteGpuReadError> {
        self.copy_to_texture2d(
            encoder,
            wgpu::TexelCopyTextureInfo {
                texture: &destination.texture,
                mip_level: 0,
                origin: wgpu::Origin3d::ZERO,
                aspect: wgpu::TextureAspect::All,
            },
            destination.texture.size(),
        )
    }

    /// Copies all so far written data to a rectangle on a single 2D texture layer.
    ///
    /// Assumes that the buffer consists of as-tightly-packed-as-possible rows of data.
    /// (taking into account required padding as specified by [`wgpu::COPY_BYTES_PER_ROW_ALIGNMENT`])
    ///
    /// Implementation note:
    /// Does 2D-only entirely for convenience as it greatly simplifies the input parameters.
    pub fn copy_to_texture2d(
        self,
        encoder: &mut wgpu::CommandEncoder,
        destination: wgpu::TexelCopyTextureInfo<'_>,
        copy_size: wgpu::Extent3d,
    ) -> Result<(), CpuWriteGpuReadError> {
        let buffer_info = Texture2DBufferInfo::new(destination.texture.format(), copy_size);

        // Validate that we stay within the written part of the slice (wgpu can't fully know our intention here, so we have to check).
        // This is a bit of a leaky check since we haven't looked at copy_size which may limit the amount of memory we need.
        if (buffer_info.buffer_size_padded as usize) < self.num_written() * std::mem::size_of::<T>()
        {
            return Err(CpuWriteGpuReadError::TargetTextureBufferSizeMismatch {
                max_copy_size: buffer_info.buffer_size_padded,
                written_data_size: self.num_written() * std::mem::size_of::<T>(),
            });
        }

        encoder.copy_buffer_to_texture(
            wgpu::TexelCopyBufferInfo {
                buffer: &self.chunk_buffer,
                layout: wgpu::TexelCopyBufferLayout {
                    offset: self.byte_offset_in_chunk_buffer,
                    bytes_per_row: Some(buffer_info.bytes_per_row_padded),
                    rows_per_image: None,
                },
            },
            destination,
            copy_size,
        );

        Ok(())
    }

    /// Copies the entire buffer to another buffer and drops it.
    pub fn copy_to_buffer(
        self,
        encoder: &mut wgpu::CommandEncoder,
        destination: &GpuBuffer,
        destination_offset: wgpu::BufferAddress,
    ) -> Result<(), CpuWriteGpuReadError> {
        let copy_size = (std::mem::size_of::<T>() * self.unwritten_element_range.start) as u64;

        // Wgpu does validation as well, but we want to be able to track this error right away.
        if copy_size > destination_offset + destination.size() {
            return Err(CpuWriteGpuReadError::TargetBufferTooSmall {
                target_buffer_size: destination.size(),
                copy_size,
                destination_offset,
            });
        }

        encoder.copy_buffer_to_buffer(
            &self.chunk_buffer,
            self.byte_offset_in_chunk_buffer,
            destination,
            destination_offset,
            copy_size,
        );

        Ok(())
    }
}

/// Internal chunk of the staging belt.
struct Chunk {
    buffer: GpuBuffer,

    /// Starting at this offset the buffer is unused.
    unused_offset: wgpu::BufferAddress,
}

impl Chunk {
    fn remaining_capacity(&self) -> u64 {
        self.buffer.size() - self.unused_offset
    }

    /// Caller needs to make sure that there is enough space.
    fn allocate<T: bytemuck::Pod + Send + Sync>(
        &mut self,
        num_elements: usize,
        size_in_bytes: u64,
    ) -> CpuWriteGpuReadBuffer<T> {
        debug_assert!(num_elements * std::mem::size_of::<T>() <= size_in_bytes as usize);

        let byte_offset_in_chunk_buffer = self.unused_offset;
        let end_offset = byte_offset_in_chunk_buffer + size_in_bytes;

        debug_assert!(
            byte_offset_in_chunk_buffer.is_multiple_of(CpuWriteGpuReadBelt::MIN_OFFSET_ALIGNMENT)
        );
        debug_assert!(end_offset <= self.buffer.size());

        let buffer_slice = self.buffer.slice(byte_offset_in_chunk_buffer..end_offset);
        let write_view = buffer_slice.get_mapped_range_mut();
        self.unused_offset = end_offset;

        CpuWriteGpuReadBuffer {
            chunk_buffer: self.buffer.clone(),
            byte_offset_in_chunk_buffer,
            write_view,
            unwritten_element_range: 0..num_elements,
            _type: std::marker::PhantomData,
        }
    }
}

/// Efficiently performs many buffer writes by sharing and reusing temporary buffers.
///
/// Internally it uses a ring-buffer of staging buffers that are sub-allocated.
///
/// Based on to [`wgpu::util::StagingBelt`](https://github.com/gfx-rs/wgpu/blob/a420e453c3d9c93dfb1a8526bf11c000d895c916/wgpu/src/util/belt.rs)
/// However, there are some important differences:
/// * can create buffers without yet knowing the target copy location
/// * lifetime of returned buffers is independent of the [`CpuWriteGpuReadBelt`] (allows working with several in parallel!)
/// * use of `re_renderer`'s resource pool
/// * handles alignment in a defined manner
///   (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508))
pub struct CpuWriteGpuReadBelt {
    /// Minimum size for new buffers.
    chunk_size: u64,

    /// Chunks which are CPU write at the moment.
    active_chunks: Vec<Chunk>,

    /// Chunks which are GPU read at the moment.
    ///
    /// I.e. they have scheduled transfers already; they are unmapped and one or more
    /// command encoder has one or more `copy_buffer_to_buffer` commands with them
    /// as source.
    closed_chunks: Vec<Chunk>,

    /// Chunks that are back from the GPU and ready to be mapped for write and put
    /// into `active_chunks`.
    free_chunks: Vec<Chunk>,

    /// When closed chunks are mapped again, the map callback sends them here.
    ///
    /// Note that we shouldn't use `SyncSender` since this can block the `Sender` if a buffer is full,
    /// which means that in a single threaded situation (Web!) we might deadlock.
    sender: mpsc::Sender<Chunk>,

    /// Free chunks are received here to be put on `self.free_chunks`.
    receiver: mpsc::Receiver<Chunk>,

    /// Total bytes allocated during the current frame, reset at each frame boundary.
    current_frame_allocated_bytes: u64,

    /// Exponentially smoothed estimate of peak allocation per frame.
    ///
    /// Updated each frame via `max(smoothed * PEAK_DECAY, last_frame_usage)`.
    /// This tracks the "typical" high-water mark while allowing it to decay
    /// after usage spikes so that excess buffers can be released.
    smoothed_peak_bytes: f64,
}

impl CpuWriteGpuReadBelt {
    /// All allocations of this allocator will be aligned to at least this size.
    ///
    /// Requiring a minimum alignment means we need to pad less often.
    /// Also, it has the potential of making memcpy operations faster.
    ///
    /// Needs to be larger or equal than [`wgpu::MAP_ALIGNMENT`], [`wgpu::COPY_BUFFER_ALIGNMENT`]
    /// and the largest possible texel block footprint (since offsets for texture copies require this)
    ///
    /// For alignment requirements in `WebGPU` in general, refer to
    /// [the specification on alignment-class limitations](https://www.w3.org/TR/webgpu/#limit-class-alignment)
    ///
    /// Note that this does NOT mean that the CPU memory has *any* alignment.
    /// See this issue about [lack of CPU memory alignment](https://github.com/gfx-rs/wgpu/issues/3508) in wgpu/WebGPU.
    pub const MIN_OFFSET_ALIGNMENT: u64 = 16;

    /// Per-frame decay factor for the smoothed peak allocation estimate.
    ///
    /// At 60 fps, a spike's influence halves in ~14 frames (~0.23 s).
    /// Slower decay (closer to 1.0) means the belt holds onto memory longer after spikes;
    /// faster decay (closer to 0.0) reclaims sooner but risks re-allocating if usage fluctuates.
    const PEAK_DECAY: f64 = 0.95;

    /// How much free capacity to retain relative to the smoothed peak usage.
    ///
    /// A factor of 1.5 provides headroom for frame-to-frame variance without
    /// immediately re-allocating after a moderate spike.
    const HEADROOM_FACTOR: f64 = 1.5;

    /// Create a cpu-write & gpu-read staging belt.
    ///
    /// The `chunk_size` is the unit of internal buffer allocation; writes will be
    /// sub-allocated within each chunk. Therefore, for optimal use of memory, the
    /// chunk size should be:
    ///
    /// * larger than the largest single [`CpuWriteGpuReadBelt::allocate`] operation;
    /// * 1-4 times less than the total amount of data uploaded per submission
    ///   (per [`CpuWriteGpuReadBelt::before_queue_submit()`]); and
    /// * bigger is better, within these bounds.
    ///
    /// TODO(andreas): Adaptive chunk sizes
    pub fn new(chunk_size: wgpu::BufferSize) -> Self {
        static_assertions::const_assert!(
            wgpu::MAP_ALIGNMENT <= CpuWriteGpuReadBelt::MIN_OFFSET_ALIGNMENT
        );
        static_assertions::const_assert!(
            wgpu::COPY_BUFFER_ALIGNMENT <= CpuWriteGpuReadBelt::MIN_OFFSET_ALIGNMENT
        );
        // Largest uncompressed texture format (btw. many compressed texture format have the same block size!)
        debug_assert!(
            wgpu::TextureFormat::Rgba32Uint
                .block_copy_size(None)
                .unwrap() as u64
                <= Self::MIN_OFFSET_ALIGNMENT
        );

        // we must use an unbounded channel to avoid blocking on web
        #[expect(clippy::disallowed_methods)]
        let (sender, receiver) = mpsc::channel();
        Self {
            chunk_size: wgpu::util::align_to(chunk_size.get(), Self::MIN_OFFSET_ALIGNMENT),
            active_chunks: Vec::new(),
            closed_chunks: Vec::new(),
            free_chunks: Vec::new(),
            sender,
            receiver,
            current_frame_allocated_bytes: 0,
            smoothed_peak_bytes: 0.0,
        }
    }

    /// Allocates a cpu writable buffer for `num_elements` instances of type `T`.
    ///
    /// The buffer will be aligned to T's alignment, but no less than [`Self::MIN_OFFSET_ALIGNMENT`].
    pub fn allocate<T: bytemuck::Pod + Send + Sync>(
        &mut self,
        device: &wgpu::Device,
        buffer_pool: &GpuBufferPool,
        num_elements: usize,
    ) -> Result<CpuWriteGpuReadBuffer<T>, CpuWriteGpuReadError> {
        if num_elements == 0 {
            return Err(CpuWriteGpuReadError::ZeroSizeBufferAllocation);
        }

        re_tracing::profile_function!();

        debug_assert!(num_elements > 0, "Cannot allocate zero-sized buffer");

        // Potentially overestimate size with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string.
        let size = wgpu::util::align_to(
            (std::mem::size_of::<T>() * num_elements) as wgpu::BufferAddress,
            Self::MIN_OFFSET_ALIGNMENT,
        );

        // Try to find space in any of the active chunks first.
        let mut chunk = if let Some(index) = self
            .active_chunks
            .iter_mut()
            .position(|chunk| chunk.remaining_capacity() >= size)
        {
            self.active_chunks.swap_remove(index)
        } else {
            self.receive_chunks(); // ensure self.free_chunks is up to date

            // Use a free chunk if possible, fall back to creating a new one if necessary.
            if let Some(index) = self
                .free_chunks
                .iter()
                .position(|chunk| chunk.remaining_capacity() >= size)
            {
                self.free_chunks.swap_remove(index)
            } else {
                // Allocation might be bigger than a chunk!
                let buffer_size =
                    wgpu::util::align_to(self.chunk_size.max(size), Self::MIN_OFFSET_ALIGNMENT);
                re_log::trace!(
                    "Allocating new CpuWriteGpuReadBelt chunk of size {:.1} MiB",
                    buffer_size as f32 / (1024.0 * 1024.0)
                );
                let buffer = buffer_pool.alloc(
                    device,
                    &BufferDesc {
                        label: "CpuWriteGpuReadBelt chunk buffer".into(),
                        size: buffer_size,
                        usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC,
                        mapped_at_creation: true,
                    },
                );

                Chunk {
                    buffer,
                    unused_offset: 0,
                }
            }
        };

        self.current_frame_allocated_bytes += size;

        let cpu_buffer_view = chunk.allocate(num_elements, size);
        self.active_chunks.push(chunk);
        Ok(cpu_buffer_view)
    }

    /// Prepare currently mapped buffers for use in a submission.
    ///
    /// All existing [`CpuWriteGpuReadBuffer`] MUST be dropped before calling this function.
    /// Any not dropped [`CpuWriteGpuReadBuffer`] will cause a validation error.
    ///
    /// This must be called BEFORE the command encoder(s) used in any [`CpuWriteGpuReadBuffer`] copy operations are submitted.
    ///
    /// At this point, all the partially used staging buffers are closed (cannot be used for
    /// further writes) until after [`CpuWriteGpuReadBelt::after_queue_submit`] is called *and* the GPU is done
    /// copying the data from them.
    pub fn before_queue_submit(&mut self) {
        re_tracing::profile_function!();

        // This would be a great usecase for persistent memory mapping, i.e. mapping without the need to unmap
        // https://github.com/gfx-rs/wgpu/issues/1468
        // However, WebGPU does not support this!

        // We're done with writing to this chunk and are ready to have the GPU read it!
        //
        // This part has to happen before submit, otherwise we get a validation error that
        // the buffers are still mapped and can't be read by the gpu.
        for chunk in self.active_chunks.drain(..) {
            chunk.buffer.unmap();
            self.closed_chunks.push(chunk);
        }
    }

    /// Recall all of the closed buffers back to be reused.
    ///
    /// This must only be called after the command encoder(s) used in [`CpuWriteGpuReadBuffer`]
    /// copy operations are submitted. Additional calls are harmless.
    /// Not calling this as soon as possible may result in increased buffer memory usage.
    ///
    /// Implementation note:
    /// We can't use [`wgpu::CommandEncoder::map_buffer_on_submit`] here because for that we'd need to know which
    /// command encoder is the last one scheduling any cpu->gpu copy operations.
    /// Note that if chunks were fully tied to a single encoder, we could call [`wgpu::CommandEncoder::map_buffer_on_submit`]
    /// once we know a chunk has all its cpu->gpu copy operations scheduled on that very encoder.
    pub fn after_queue_submit(&mut self) {
        re_tracing::profile_function!();

        // Update the smoothed peak estimate from last frame's usage, then reset the counter.
        self.smoothed_peak_bytes = f64::max(
            self.smoothed_peak_bytes * Self::PEAK_DECAY,
            self.current_frame_allocated_bytes as f64,
        );
        self.current_frame_allocated_bytes = 0;

        self.receive_chunks();
        self.shrink_free_chunks();

        let sender = &self.sender;
        for chunk in self.closed_chunks.drain(..) {
            let sender = sender.clone();
            chunk
                .buffer
                .clone()
                .slice(..)
                .map_async(wgpu::MapMode::Write, move |_| {
                    sender.send(chunk).ok();
                });
        }
    }

    /// Move all chunks that the GPU is done with (and are now mapped again)
    /// from `self.receiver` to `self.free_chunks`.
    fn receive_chunks(&mut self) {
        while let Ok(mut chunk) = self.receiver.try_recv() {
            chunk.unused_offset = 0;
            self.free_chunks.push(chunk);
        }
    }

    /// Drop free chunks that exceed our estimated needs.
    ///
    /// If total free capacity exceeds `HEADROOM_FACTOR × smoothed_peak`, drops the
    /// smallest chunks first (keeping the larger, more flexible ones).
    /// Always retains at least one `chunk_size` worth of free capacity.
    fn shrink_free_chunks(&mut self) {
        let budget =
            ((self.smoothed_peak_bytes * Self::HEADROOM_FACTOR) as u64).max(self.chunk_size);

        let total_bytes_in_free_chunks: u64 =
            self.free_chunks.iter().map(|c| c.buffer.size()).sum();
        if total_bytes_in_free_chunks <= budget {
            return;
        }

        re_tracing::profile_function!();

        // Sort ascending by size so we drop the smallest first (keep the more flexible large ones).
        self.free_chunks.sort_by_key(|c| c.buffer.size());

        // Find how many chunks from the front (smallest) we need to drop
        // to bring total free capacity within budget.
        let excess = total_bytes_in_free_chunks - budget;
        let num_to_drop = self
            .free_chunks
            .iter()
            .scan(0u64, |cumulative, chunk| {
                *cumulative += chunk.buffer.size();
                Some(*cumulative)
            })
            .take_while(|cumulative| *cumulative <= excess)
            .count()
            + 1; // +1: we also need the chunk that crosses the threshold
        let num_to_drop = num_to_drop.min(self.free_chunks.len());

        if num_to_drop != 0 {
            re_log::trace!(
                "CpuWriteGpuReadBelt: dropping {num_to_drop} free chunk(s), \
             keeping {}, budget {:.1} MiB",
                self.free_chunks.len() - num_to_drop,
                budget as f64 / (1024.0 * 1024.0),
            );
            self.free_chunks.drain(..num_to_drop);
        }
    }

    /// Total capacity (in bytes) currently held across all chunk categories.
    pub fn total_buffer_size_in_bytes(&self) -> u64 {
        let sum = |chunks: &[Chunk]| -> u64 { chunks.iter().map(|c| c.buffer.size()).sum() };
        sum(&self.active_chunks) + sum(&self.closed_chunks) + sum(&self.free_chunks)
    }
}

impl std::fmt::Debug for CpuWriteGpuReadBelt {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("CpuWriteGpuReadBelt")
            .field("chunk_size", &self.chunk_size)
            .field("active_chunks", &self.active_chunks.len())
            .field("closed_chunks", &self.closed_chunks.len())
            .field("free_chunks", &self.free_chunks.len())
            .finish_non_exhaustive()
    }
}

#[cfg(test)]
mod tests {
    use crate::RenderContext;

    const CHUNK_SIZE: u64 = RenderContext::CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE.get();

    #[test]
    fn belt_shrinks_after_usage_spike() {
        let mut ctx = RenderContext::new_test();

        // Spike: allocate several chunk-sized buffers to force multiple chunks.
        ctx.execute_test_frame(|ctx| {
            let mut belt = ctx.cpu_write_gpu_read_belt.lock();
            for _ in 0..4 {
                let _buf = belt
                    .allocate::<u8>(&ctx.device, &ctx.gpu_resources.buffers, CHUNK_SIZE as usize)
                    .unwrap();
            }
            []
        });

        // One settling frame so the spike chunks complete map_async and land in free_chunks.
        ctx.execute_test_frame(|_| []);

        let size_after_spike = ctx
            .cpu_write_gpu_read_belt
            .lock()
            .total_buffer_size_in_bytes();
        assert!(
            size_after_spike >= 4 * CHUNK_SIZE,
            "Expected at least {} bytes after spike, got {size_after_spike}",
            4 * CHUNK_SIZE,
        );

        // Run many frames with small allocations so the smoothed peak decays.
        for _ in 0..100 {
            ctx.execute_test_frame(|ctx| {
                let _buf = ctx
                    .cpu_write_gpu_read_belt
                    .lock()
                    .allocate::<u8>(&ctx.device, &ctx.gpu_resources.buffers, 256)
                    .unwrap();
                []
            });
        }

        let size_after_decay = ctx
            .cpu_write_gpu_read_belt
            .lock()
            .total_buffer_size_in_bytes();
        assert!(
            size_after_decay < size_after_spike,
            "Expected belt to shrink after many small frames: \
             before={size_after_spike}, after={size_after_decay}"
        );
    }
}