Skip to main content

oxiui_compute_wgpu/
buffer.rs

1//! Storage, uniform, and staging buffer helpers, typed buffer wrappers,
2//! buffer pool, sub-allocator, and async readback utilities.
3//!
4//! These are free functions and types that take a `&wgpu::Device` (and
5//! optionally a `&wgpu::Queue`) to construct or read back GPU buffers.
6//! They keep the API surface small while covering the patterns used by
7//! every compute workload in the COOLJAPAN ecosystem:
8//!
9//! | Pattern | Helper | Usage |
10//! |---------|--------|-------|
11//! | Upload once, read/write by shader | [`storage_buffer_init`] | SSBO inputs/outputs |
12//! | Small read-only constants for shaders | [`uniform_buffer`] | push-constants, CB0 |
13//! | Readback from GPU to CPU | [`staging_buffer`] + [`read_back`] | result extraction |
14//! | Zero-copy upload via mapped creation | [`mapped_storage_init`] | integrated-GPU fast path |
15//! | Typed wrapper with element count | [`TypedBuffer`] | avoids byte-size arithmetic |
16//! | Buffer recycling across dispatches | [`BufferPool`] | avoids per-frame reallocation |
17//! | One large buffer sliced into regions | [`SubAllocator`] | aligned sub-region tracking |
//! | Async-callable readback (blocks the thread while polling the device) | [`read_back_async`] | async-runtime integration |
19//! | Partial readback by byte offset | [`read_back_range`] | read a sub-range of a buffer |
20
21use bytemuck::Pod;
22use std::collections::HashMap;
23use std::marker::PhantomData;
24
25// ── Buffer creation helpers ───────────────────────────────────────────────────
26
27/// Create a GPU storage buffer initialised with `data`.
28///
29/// The returned buffer has usages `STORAGE | COPY_DST | COPY_SRC`:
30/// - `STORAGE`  — bindable as a shader storage buffer.
31/// - `COPY_DST` — allows `queue.write_buffer` updates.
32/// - `COPY_SRC` — allows copying to a [`staging_buffer`] for CPU readback.
33///
34/// # Parameters
35/// - `device` — the logical wgpu device.
36/// - `label`  — debug label visible in GPU capture tools (pass `""` to omit).
37/// - `data`   — raw bytes to upload; length determines the buffer size.
38///
39/// # Panics
40/// Panics if `data` is empty (zero-size buffers are forbidden by the WebGPU
41/// spec and wgpu validation).
42pub fn storage_buffer_init(device: &wgpu::Device, label: &str, data: &[u8]) -> wgpu::Buffer {
43    assert!(
44        !data.is_empty(),
45        "storage_buffer_init: data must be non-empty"
46    );
47    use wgpu::util::DeviceExt as _;
48    device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
49        label: non_empty_label(label),
50        contents: data,
51        usage: wgpu::BufferUsages::STORAGE
52            | wgpu::BufferUsages::COPY_DST
53            | wgpu::BufferUsages::COPY_SRC,
54    })
55}
56
57/// Create a GPU uniform buffer initialised with `data`.
58///
59/// The returned buffer has usages `UNIFORM | COPY_DST`:
60/// - `UNIFORM`  — bindable as a uniform / constant buffer.
61/// - `COPY_DST` — allows `queue.write_buffer` updates between dispatches.
62///
63/// # Panics
64/// Panics if `data` is empty.
65pub fn uniform_buffer(device: &wgpu::Device, label: &str, data: &[u8]) -> wgpu::Buffer {
66    assert!(!data.is_empty(), "uniform_buffer: data must be non-empty");
67    use wgpu::util::DeviceExt as _;
68    device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
69        label: non_empty_label(label),
70        contents: data,
71        usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
72    })
73}
74
75/// Create an empty CPU-mappable staging buffer of `size` bytes.
76///
77/// The returned buffer has usage `MAP_READ | COPY_DST`:
78/// - `COPY_DST` — accept a `copy_buffer_to_buffer` from a storage/output buffer.
79/// - `MAP_READ` — allows `buffer.slice(..).map_async(MapMode::Read, …)`.
80///
81/// # Panics
82/// Panics if `size` is zero.
83pub fn staging_buffer(device: &wgpu::Device, label: &str, size: u64) -> wgpu::Buffer {
84    assert!(size > 0, "staging_buffer: size must be > 0");
85    device.create_buffer(&wgpu::BufferDescriptor {
86        label: non_empty_label(label),
87        size,
88        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
89        mapped_at_creation: false,
90    })
91}
92
93// ── Readback helper ───────────────────────────────────────────────────────────
94
95/// Copy `len` elements of type `T` from `buf` (a GPU storage buffer) to a
96/// `Vec<T>` on the CPU.
97///
98/// Internally this:
99/// 1. Allocates a temporary [`staging_buffer`].
100/// 2. Records a `copy_buffer_to_buffer` command and submits it.
101/// 3. Maps the staging buffer synchronously via `pollster::block_on`.
102/// 4. Copies the mapped bytes into a `Vec<T>` via `bytemuck::cast_slice`.
103/// 5. Unmaps the staging buffer.
104///
105/// # Type parameter
106/// `T` must implement [`bytemuck::Pod`] so the raw GPU bytes can be
107/// reinterpreted safely.
108///
109/// # Panics
110/// Panics if the GPU mapping fails (device lost, buffer too small, …).
111#[cfg_attr(
112    feature = "tracing",
113    tracing::instrument(level = "debug", skip(device, queue, buf))
114)]
115pub fn read_back<T: Pod>(
116    device: &wgpu::Device,
117    queue: &wgpu::Queue,
118    buf: &wgpu::Buffer,
119    len: usize,
120) -> Vec<T> {
121    let byte_size = (std::mem::size_of::<T>() * len) as u64;
122    assert!(byte_size > 0, "read_back: requested size must be > 0");
123
124    // ── 1. Create a staging buffer ─────────────────────────────────────────
125    let staging = staging_buffer(device, "oxiui-compute-wgpu readback staging", byte_size);
126
127    // ── 2. Encode + submit copy ────────────────────────────────────────────
128    let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
129        label: Some("oxiui-compute-wgpu readback encoder"),
130    });
131    encoder.copy_buffer_to_buffer(buf, 0, &staging, 0, byte_size);
132    queue.submit(std::iter::once(encoder.finish()));
133
134    // ── 3. Map synchronously ───────────────────────────────────────────────
135    let slice = staging.slice(..);
136    let (tx, rx) = std::sync::mpsc::channel();
137    slice.map_async(wgpu::MapMode::Read, move |result| {
138        let _ = tx.send(result);
139    });
140
141    // Pump the device until the map callback fires.
142    // `PollType::wait_indefinitely()` blocks until the most recent submission
143    // completes — the correct behaviour for a synchronous CPU readback.
144    device
145        .poll(wgpu::PollType::wait_indefinitely())
146        .expect("read_back: device poll failed");
147
148    rx.recv()
149        .expect("read_back: channel closed before map callback")
150        .expect("read_back: GPU mapping failed");
151
152    // ── 4. Copy bytes to Vec<T> ────────────────────────────────────────────
153    let mapped = slice.get_mapped_range();
154    let result: Vec<T> = bytemuck::cast_slice::<u8, T>(&mapped).to_vec();
155
156    // ── 5. Unmap ───────────────────────────────────────────────────────────
157    drop(mapped);
158    staging.unmap();
159
160    result
161}
162
163// ── Zero-copy upload ──────────────────────────────────────────────────────────
164
165/// Create a `STORAGE | COPY_SRC` buffer via `mapped_at_creation`, writing
166/// `data` without an intermediate staging copy.
167///
168/// This is the fastest upload path on integrated (unified-memory) GPUs where
169/// the buffer lives in CPU-visible memory from the start.  On discrete GPUs
170/// wgpu may still arrange an internal transfer, but the CPU side avoids an
171/// extra copy.
172///
173/// # Panics
174/// Panics if `data` is empty (zero-size buffers are rejected by wgpu).
175pub fn mapped_storage_init(device: &wgpu::Device, label: &str, data: &[u8]) -> wgpu::Buffer {
176    assert!(
177        !data.is_empty(),
178        "mapped_storage_init: data must be non-empty"
179    );
180    let buffer = device.create_buffer(&wgpu::BufferDescriptor {
181        label: non_empty_label(label),
182        size: data.len() as u64,
183        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
184        mapped_at_creation: true,
185    });
186    buffer
187        .slice(..)
188        .get_mapped_range_mut()
189        .copy_from_slice(data);
190    buffer.unmap();
191    buffer
192}
193
194// ── Partial readback ──────────────────────────────────────────────────────────
195
196/// Copy `len` elements of type `T` from `src`, starting at `byte_offset`, to a
197/// `Vec<T>` on the CPU.
198///
199/// Unlike [`read_back`] (which always starts at offset 0), this function lets
200/// callers extract a sub-range of a buffer — useful when multiple logical
201/// arrays share one large allocation.
202///
203/// # Panics
204/// Panics if the GPU mapping fails.
205pub fn read_back_range<T: bytemuck::Pod>(
206    device: &wgpu::Device,
207    queue: &wgpu::Queue,
208    src: &wgpu::Buffer,
209    byte_offset: u64,
210    len: usize,
211) -> Vec<T> {
212    let byte_size = (len * std::mem::size_of::<T>()) as u64;
213    assert!(byte_size > 0, "read_back_range: requested size must be > 0");
214    let staging = staging_buffer(device, "", byte_size);
215
216    let mut encoder =
217        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
218    encoder.copy_buffer_to_buffer(src, byte_offset, &staging, 0, byte_size);
219    queue.submit(std::iter::once(encoder.finish()));
220
221    let slice = staging.slice(..);
222    let (tx, rx) = std::sync::mpsc::channel();
223    slice.map_async(wgpu::MapMode::Read, move |result| {
224        let _ = tx.send(result);
225    });
226    device
227        .poll(wgpu::PollType::wait_indefinitely())
228        .expect("read_back_range: device poll failed");
229    rx.recv()
230        .expect("read_back_range: channel closed before map callback")
231        .expect("read_back_range: GPU mapping failed");
232
233    let mapped = slice.get_mapped_range();
234    let result = bytemuck::cast_slice::<u8, T>(&mapped).to_vec();
235    drop(mapped);
236    staging.unmap();
237    result
238}
239
240// ── Async readback ────────────────────────────────────────────────────────────
241
242/// Async version of [`read_back`] — bridges `map_async`'s callback into an
243/// `async fn` via an `mpsc` channel.
244///
245/// **Note on blocking:** this function calls
246/// `device.poll(PollType::wait_indefinitely())` after yielding once to the
247/// executor, which *blocks the OS thread* until the GPU copy finishes.  This
248/// is acceptable for a compute crate where the caller controls the executor
249/// (e.g. `pollster::block_on`).  A fully cooperative non-blocking variant that
250/// drives `Poll` from the executor waker is a planned follow-up.
251///
252/// Compatible runtimes: `pollster`, `tokio::task::spawn_blocking`, or any
253/// single-threaded executor.
254pub async fn read_back_async<T: bytemuck::Pod>(
255    device: &wgpu::Device,
256    queue: &wgpu::Queue,
257    src: &wgpu::Buffer,
258    len: usize,
259) -> Result<Vec<T>, crate::ComputeError> {
260    let byte_size = (len * std::mem::size_of::<T>()) as u64;
261    assert!(byte_size > 0, "read_back_async: requested size must be > 0");
262    let staging = staging_buffer(device, "read-back-async", byte_size);
263
264    let mut encoder = device.create_command_encoder(&Default::default());
265    encoder.copy_buffer_to_buffer(src, 0, &staging, 0, byte_size);
266    queue.submit(std::iter::once(encoder.finish()));
267
268    // Bridge map_async callback into async via channel.
269    let (tx, rx) = std::sync::mpsc::channel::<Result<(), wgpu::BufferAsyncError>>();
270    staging.slice(..).map_async(wgpu::MapMode::Read, move |r| {
271        let _ = tx.send(r);
272    });
273
274    // Yield once to allow the executor to schedule other work, then poll the
275    // device.  This cooperative yield works with pollster and tokio.
276    std::future::ready(()).await;
277    device
278        .poll(wgpu::PollType::wait_indefinitely())
279        .map_err(|e| crate::ComputeError::Operation {
280            op: "read_back_async",
281            detail: e.to_string(),
282        })?;
283
284    rx.recv()
285        .map_err(|_| crate::ComputeError::Operation {
286            op: "read_back_async",
287            detail: "channel closed before map callback fired".into(),
288        })?
289        .map_err(|e| crate::ComputeError::Operation {
290            op: "read_back_async",
291            detail: e.to_string(),
292        })?;
293
294    let mapped = staging.slice(..).get_mapped_range();
295    let data = bytemuck::cast_slice::<u8, T>(&mapped).to_vec();
296    drop(mapped);
297    staging.unmap();
298    Ok(data)
299}
300
301// ── TypedBuffer<T> ────────────────────────────────────────────────────────────
302
303/// A typed wrapper around [`wgpu::Buffer`] that tracks the element count so
304/// callers never have to compute byte sizes manually.
305///
306/// `T` must implement [`bytemuck::Pod`] — the same bound used by
307/// [`storage_buffer_init`] and [`read_back`].
308///
309/// # Example
310/// ```rust,no_run
311/// use oxiui_compute_wgpu::buffer::TypedBuffer;
312///
313/// // Construction, upload, and download are all in element counts.
314/// ```
315pub struct TypedBuffer<T: bytemuck::Pod> {
316    buffer: wgpu::Buffer,
317    len: usize,
318    _phantom: PhantomData<T>,
319}
320
321impl<T: bytemuck::Pod> TypedBuffer<T> {
322    /// Allocate an uninitialised GPU buffer for `len` elements with the given
323    /// `usage` flags.
324    pub fn new(device: &wgpu::Device, label: &str, usage: wgpu::BufferUsages, len: usize) -> Self {
325        let size = (len * std::mem::size_of::<T>()) as u64;
326        let buffer = device.create_buffer(&wgpu::BufferDescriptor {
327            label: non_empty_label(label),
328            size,
329            usage,
330            mapped_at_creation: false,
331        });
332        TypedBuffer {
333            buffer,
334            len,
335            _phantom: PhantomData,
336        }
337    }
338
339    /// Create a `STORAGE | COPY_DST | COPY_SRC` buffer pre-filled with `data`.
340    pub fn from_data(device: &wgpu::Device, label: &str, data: &[T]) -> Self {
341        let bytes = bytemuck::cast_slice(data);
342        let buffer = storage_buffer_init(device, label, bytes);
343        TypedBuffer {
344            buffer,
345            len: data.len(),
346            _phantom: PhantomData,
347        }
348    }
349
350    /// Number of `T` elements the buffer holds.
351    pub fn len(&self) -> usize {
352        self.len
353    }
354
355    /// `true` if the buffer holds zero elements.
356    pub fn is_empty(&self) -> bool {
357        self.len == 0
358    }
359
360    /// Size of the buffer in bytes.
361    pub fn byte_len(&self) -> u64 {
362        (self.len * std::mem::size_of::<T>()) as u64
363    }
364
365    /// Return a [`wgpu::BindingResource`] covering the entire buffer, suitable
366    /// for passing to `BindGroupEntry::resource`.
367    pub fn as_entire_binding(&self) -> wgpu::BindingResource<'_> {
368        self.buffer.as_entire_binding()
369    }
370
371    /// Access the underlying [`wgpu::Buffer`].
372    pub fn inner(&self) -> &wgpu::Buffer {
373        &self.buffer
374    }
375
376    /// Write `data` into the buffer via `queue.write_buffer`.
377    ///
378    /// # Panics
379    /// Panics if `data.len() != self.len`.
380    pub fn upload(&self, queue: &wgpu::Queue, data: &[T]) {
381        assert_eq!(data.len(), self.len, "upload length mismatch");
382        queue.write_buffer(&self.buffer, 0, bytemuck::cast_slice(data));
383    }
384
385    /// Read the buffer contents back to the CPU as a `Vec<T>`.
386    pub fn download(&self, device: &wgpu::Device, queue: &wgpu::Queue) -> Vec<T> {
387        read_back(device, queue, &self.buffer, self.len)
388    }
389}
390
391// ── BufferPool ────────────────────────────────────────────────────────────────
392
393/// A simple free-list pool that recycles [`wgpu::Buffer`]s across dispatches
394/// to avoid per-frame reallocation.
395///
396/// Buffers are bucketed by `(rounded_size, BufferUsages)`.  The size is
397/// rounded up to the next power of two (minimum 256) on both `acquire` and
398/// `release` so that similarly-sized buffers can be reused interchangeably.
399///
400/// # Limitations
401/// The pool does **not** destroy idle buffers; callers that need memory-bounded
402/// recycling should call [`BufferPool::available_count`] and drop excess
403/// buffers manually.
404pub struct BufferPool {
405    buckets: HashMap<(u64, wgpu::BufferUsages), Vec<wgpu::Buffer>>,
406}
407
408impl BufferPool {
409    /// Create an empty pool.
410    pub fn new() -> Self {
411        BufferPool {
412            buckets: HashMap::new(),
413        }
414    }
415
416    /// Acquire a buffer of at least `size` bytes with the given `usage`.
417    ///
418    /// Returns a recycled buffer from the pool when one is available, or
419    /// allocates a new one.  The actual buffer size may be larger than `size`
420    /// due to power-of-two rounding.
421    pub fn acquire(
422        &mut self,
423        device: &wgpu::Device,
424        size: u64,
425        usage: wgpu::BufferUsages,
426    ) -> wgpu::Buffer {
427        let rounded = size.next_power_of_two().max(256);
428        let bucket = self.buckets.entry((rounded, usage)).or_default();
429        if let Some(buf) = bucket.pop() {
430            return buf;
431        }
432        device.create_buffer(&wgpu::BufferDescriptor {
433            label: Some("pool-buffer"),
434            size: rounded,
435            usage,
436            mapped_at_creation: false,
437        })
438    }
439
440    /// Return a buffer to the pool so it can be reused by future `acquire`
441    /// calls.
442    ///
443    /// `size` should be the *logical* size the caller used for `acquire`; the
444    /// pool applies the same rounding so the buffer lands in the correct
445    /// bucket.
446    pub fn release(&mut self, size: u64, usage: wgpu::BufferUsages, buffer: wgpu::Buffer) {
447        let rounded = size.next_power_of_two().max(256);
448        self.buckets
449            .entry((rounded, usage))
450            .or_default()
451            .push(buffer);
452    }
453
454    /// Number of idle buffers in the `(size, usage)` bucket.
455    pub fn available_count(&self, size: u64, usage: wgpu::BufferUsages) -> usize {
456        let rounded = size.next_power_of_two().max(256);
457        self.buckets.get(&(rounded, usage)).map_or(0, |v| v.len())
458    }
459}
460
461impl Default for BufferPool {
462    fn default() -> Self {
463        Self::new()
464    }
465}
466
467// ── SubAllocator ──────────────────────────────────────────────────────────────
468
/// Location of one allocation handed out by a [`SubAllocator`].
///
/// This is a plain value type: it only records where the region lives inside
/// the backing buffer and does not borrow that buffer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct SubRegion {
    /// Start of the region, in bytes from the beginning of the backing buffer.
    pub offset: u64,
    /// Length of the region in bytes.
    pub size: u64,
}
477
478/// Bump-allocates aligned sub-regions from a single large [`wgpu::Buffer`].
479///
480/// Useful when many small uniform or storage allocations would each require a
481/// separate `wgpu::Buffer` — instead, one large buffer is created once and
482/// sliced into named regions, reducing `BindGroup` churn and allocator
483/// overhead.
484///
485/// # Limitations
486/// `SubAllocator` is a *bump* allocator — individual regions cannot be freed.
487/// Call [`reset`](SubAllocator::reset) to reclaim the entire capacity at once.
488pub struct SubAllocator {
489    buffer: wgpu::Buffer,
490    capacity: u64,
491    cursor: u64,
492    alignment: u64,
493}
494
495impl SubAllocator {
496    /// Create a `SubAllocator` wrapping `buffer` with the given `capacity` and
497    /// minimum `alignment` (must be a power of two; clamped to 1 if zero).
498    pub fn new(buffer: wgpu::Buffer, capacity: u64, alignment: u64) -> Self {
499        SubAllocator {
500            buffer,
501            capacity,
502            cursor: 0,
503            alignment: alignment.max(1),
504        }
505    }
506
507    /// Allocate a contiguous region of `size` bytes, aligned to
508    /// `self.alignment`.
509    ///
510    /// Returns `None` when the remaining capacity is insufficient.
511    pub fn alloc(&mut self, size: u64) -> Option<SubRegion> {
512        let aligned_cursor = align_up(self.cursor, self.alignment);
513        let end = aligned_cursor.checked_add(size)?;
514        if end > self.capacity {
515            return None;
516        }
517        self.cursor = end;
518        Some(SubRegion {
519            offset: aligned_cursor,
520            size,
521        })
522    }
523
524    /// Reset the cursor to zero, making all previously allocated regions
525    /// available again.  The backing buffer is **not** cleared.
526    pub fn reset(&mut self) {
527        self.cursor = 0;
528    }
529
530    /// Access the underlying [`wgpu::Buffer`].
531    pub fn inner(&self) -> &wgpu::Buffer {
532        &self.buffer
533    }
534
535    /// Number of bytes currently allocated (cursor position, before alignment
536    /// of the *next* alloc).
537    pub fn used(&self) -> u64 {
538        self.cursor
539    }
540
541    /// Number of bytes remaining after the current cursor.
542    pub fn remaining(&self) -> u64 {
543        self.capacity.saturating_sub(self.cursor)
544    }
545}
546
547// ── Internal helpers ──────────────────────────────────────────────────────────
548
/// Turn a label string into the `Option<&str>` form used by wgpu descriptors:
/// `""` becomes `None`, anything else is passed through as `Some`.
#[inline]
fn non_empty_label(label: &str) -> Option<&str> {
    (!label.is_empty()).then_some(label)
}
558
/// Return the smallest multiple of `alignment` that is `>= value`.
///
/// An `alignment` of 0 is treated as "no alignment": `value` is returned
/// unchanged.
#[inline]
fn align_up(value: u64, alignment: u64) -> u64 {
    match alignment {
        0 => value,
        step => value.next_multiple_of(step),
    }
}
569
570// ── Tests ─────────────────────────────────────────────────────────────────────
571
572#[cfg(test)]
573mod tests {
574    use super::*;
575    use crate::context::ComputeContext;
576
577    /// Helper — skip the test gracefully when no GPU is available (CI).
578    macro_rules! require_gpu {
579        ($ctx:ident) => {
580            let Some($ctx) = ComputeContext::try_new() else {
581                return; // no GPU on this host — graceful skip
582            };
583        };
584    }
585
586    // ── Existing tests ────────────────────────────────────────────────────────
587
588    #[test]
589    fn storage_buffer_init_roundtrip() {
590        require_gpu!(ctx);
591        let data: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
592        let bytes = bytemuck::cast_slice::<f32, u8>(&data);
593        let buf = storage_buffer_init(&ctx.device, "test-storage", bytes);
594        let back: Vec<f32> = read_back(&ctx.device, &ctx.queue, &buf, data.len());
595        assert_eq!(back, data);
596    }
597
598    #[test]
599    fn uniform_buffer_created() {
600        require_gpu!(ctx);
601        let data: [f32; 4] = [0.1, 0.2, 0.3, 0.4];
602        let bytes = bytemuck::cast_slice::<f32, u8>(&data);
603        // Just verify it constructs without panic.
604        let _buf = uniform_buffer(&ctx.device, "test-uniform", bytes);
605    }
606
607    #[test]
608    fn staging_buffer_created() {
609        require_gpu!(ctx);
610        let _buf = staging_buffer(&ctx.device, "test-staging", 256);
611    }
612
613    #[test]
614    fn non_empty_label_behaviour() {
615        assert_eq!(non_empty_label("foo"), Some("foo"));
616        assert_eq!(non_empty_label(""), None);
617    }
618
619    // ── Non-GPU unit tests (slice S2) ─────────────────────────────────────────
620
621    /// Verify that the byte_len formula for TypedBuffer is correct.
622    #[test]
623    fn typed_buffer_len_math() {
624        assert_eq!(std::mem::size_of::<f32>(), 4);
625        let len: usize = 8;
626        assert_eq!(len * std::mem::size_of::<f32>(), 32);
627        // u64 cast (what byte_len uses internally)
628        assert_eq!((len * std::mem::size_of::<f32>()) as u64, 32u64);
629    }
630
631    /// Two allocations of 100 bytes with alignment 256 must land at offsets
632    /// 0 and 256 respectively.
633    #[test]
634    fn suballocator_offsets_aligned() {
635        // We need a real wgpu::Buffer to construct SubAllocator; on headless
636        // CI we can verify only the align_up math without a GPU.
637        // Test the alignment arithmetic directly.
638        let first_aligned = align_up(0, 256);
639        assert_eq!(first_aligned, 0);
640        let after_first = first_aligned + 100; // cursor after first alloc
641        let second_aligned = align_up(after_first, 256);
642        assert!(
643            second_aligned >= 256,
644            "second offset {second_aligned} should be >= 256"
645        );
646        assert_eq!(second_aligned % 256, 0);
647    }
648
649    /// After reset the cursor returns to 0, so a fresh alloc gets offset 0.
650    #[test]
651    fn suballocator_reset_rewinds() {
652        // Test SubAllocator logic with a dummy buffer via GPU if available,
653        // otherwise verify the cursor arithmetic directly.
654        require_gpu!(ctx);
655
656        // 1 KiB backing buffer, 256-byte alignment.
657        let backing = ctx.device.create_buffer(&wgpu::BufferDescriptor {
658            label: Some("sub-alloc-test"),
659            size: 1024,
660            usage: wgpu::BufferUsages::STORAGE,
661            mapped_at_creation: false,
662        });
663        let mut sa = SubAllocator::new(backing, 1024, 256);
664
665        let r1 = sa.alloc(100).expect("first alloc should succeed");
666        assert_eq!(r1.offset, 0);
667        sa.reset();
668        let r2 = sa.alloc(100).expect("post-reset alloc should succeed");
669        assert_eq!(r2.offset, 0, "after reset, offset must restart at 0");
670    }
671
672    /// Power-of-two rounding used by BufferPool must work correctly.
673    #[test]
674    fn buffer_pool_size_rounds_up() {
675        assert_eq!(256u64.next_power_of_two(), 256);
676        assert_eq!(300u64.next_power_of_two(), 512);
677        assert_eq!(1u64.next_power_of_two().max(256), 256);
678        assert_eq!(255u64.next_power_of_two().max(256), 256);
679    }
680
681    /// Cast a &[f32] to bytes and back — verifies bytemuck Pod semantics.
682    #[test]
683    fn bytemuck_pod_roundtrip() {
684        let original: [f32; 3] = [1.0, 2.0, 3.0];
685        let bytes: &[u8] = bytemuck::cast_slice(&original);
686        assert_eq!(bytes.len(), 12);
687        let back: &[f32] = bytemuck::cast_slice(bytes);
688        assert_eq!(back, &original);
689    }
690
691    // ── GPU-gated tests (slice S2) ────────────────────────────────────────────
692
693    /// Acquire a buffer, release it, then acquire again — the pool must hand
694    /// back a recycled buffer (available_count drops from 1 to 0).
695    #[test]
696    fn pool_acquire_reuses_buffer() {
697        require_gpu!(ctx);
698        let mut pool = BufferPool::new();
699        let usage = wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC;
700        let size: u64 = 256;
701
702        // Initially nothing in pool.
703        assert_eq!(pool.available_count(size, usage), 0);
704
705        // Acquire allocates fresh.
706        let buf = pool.acquire(&ctx.device, size, usage);
707
708        // Release puts it back.
709        pool.release(size, usage, buf);
710        assert_eq!(pool.available_count(size, usage), 1);
711
712        // Second acquire pulls from pool.
713        let _buf2 = pool.acquire(&ctx.device, size, usage);
714        assert_eq!(pool.available_count(size, usage), 0);
715    }
716
717    /// `mapped_storage_init` must produce a buffer whose contents match the
718    /// input data when read back via a copy + staging buffer.
719    #[test]
720    fn mapped_init_roundtrip() {
721        require_gpu!(ctx);
722        let data: Vec<f32> = vec![10.0, 20.0, 30.0, 40.0];
723        let bytes = bytemuck::cast_slice::<f32, u8>(&data);
724
725        let src = mapped_storage_init(&ctx.device, "mapped-init-test", bytes);
726
727        // mapped_storage_init gives STORAGE | COPY_SRC, so we need a staging
728        // buffer to read back.
729        let staging = staging_buffer(&ctx.device, "mapped-init-staging", bytes.len() as u64);
730        let mut encoder = ctx
731            .device
732            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
733                label: Some("mapped-init-readback"),
734            });
735        encoder.copy_buffer_to_buffer(&src, 0, &staging, 0, bytes.len() as u64);
736        ctx.queue.submit(std::iter::once(encoder.finish()));
737
738        let slice = staging.slice(..);
739        let (tx, rx) = std::sync::mpsc::channel();
740        slice.map_async(wgpu::MapMode::Read, move |r| {
741            let _ = tx.send(r);
742        });
743        ctx.device
744            .poll(wgpu::PollType::wait_indefinitely())
745            .expect("poll failed");
746        rx.recv()
747            .expect("channel closed")
748            .expect("map_async failed");
749
750        let mapped = slice.get_mapped_range();
751        let back: Vec<f32> = bytemuck::cast_slice::<u8, f32>(&mapped).to_vec();
752        drop(mapped);
753        staging.unmap();
754
755        assert_eq!(back, data);
756    }
757
758    /// Write [10, 20, 30, 40] f32 values, then read_back_range at byte offset 4
759    /// (skip first element) with len=2 — must return [20.0, 30.0].
760    #[test]
761    fn read_back_range_returns_subslice() {
762        require_gpu!(ctx);
763        let data: Vec<f32> = vec![10.0, 20.0, 30.0, 40.0];
764        let bytes = bytemuck::cast_slice::<f32, u8>(&data);
765        let buf = storage_buffer_init(&ctx.device, "range-test", bytes);
766
767        // Skip first f32 (4 bytes), read next 2 f32s.
768        let sub: Vec<f32> = read_back_range(&ctx.device, &ctx.queue, &buf, 4, 2);
769        assert_eq!(sub, vec![20.0f32, 30.0]);
770    }
771
772    /// `pollster::block_on(read_back_async(...))` must return the same values
773    /// as the synchronous `read_back(...)`.
774    #[test]
775    fn async_readback_matches_sync() {
776        require_gpu!(ctx);
777        let data: Vec<f32> = vec![5.0, 6.0, 7.0, 8.0];
778        let bytes = bytemuck::cast_slice::<f32, u8>(&data);
779        let buf = storage_buffer_init(&ctx.device, "async-readback-test", bytes);
780
781        let sync_result: Vec<f32> = read_back(&ctx.device, &ctx.queue, &buf, data.len());
782        let async_result: Vec<f32> =
783            pollster::block_on(read_back_async(&ctx.device, &ctx.queue, &buf, data.len()))
784                .expect("async readback failed");
785
786        assert_eq!(sync_result, async_result);
787        assert_eq!(async_result, data);
788    }
789}