oxiui-compute-wgpu 0.1.2

Pure-Rust wgpu GPU-compute abstraction for the COOLJAPAN ecosystem
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
//! Headless GPU compute context: `Instance` → `Adapter` → `Device` + `Queue`.
//!
//! [`ComputeContext`] performs the full no-window, no-surface initialisation
//! chain required for pure GPU compute workloads (sparse solvers, LBM, MC/DC,
//! …).  Four constructors plus a fluent [`ContextBuilder`] are provided:
//!
//! * [`ComputeContext::try_new`] — returns `Option<Self>`; `None` means the
//!   context could not be created (no GPU adapter is available, or the device
//!   request failed), allowing a graceful CI skip without panicking.
//! * [`ComputeContext::new`] — returns `Result<Self, ComputeError>`; exposes the
//!   underlying failure reason through [`ComputeError`].
//! * [`ComputeContext::new_async`] — async variant; awaits adapter and device
//!   requests directly without a `pollster::block_on` wrapper.
//! * [`ComputeContext::builder`] — returns a [`ContextBuilder`] for fluent
//!   configuration of limits, features, and power preference.
//! * [`ComputeContext::from_device`] — wraps an externally owned
//!   `wgpu::Device` + `wgpu::Queue` (e.g. from `oxiui-render-wgpu`) so the
//!   compute layer can share the render backend's device.
//!
//! ## Multi-queue support
//!
//! [`ContextBuilder::with_multi_queue`] records the intent to use a dedicated
//! transfer queue on adapters that expose separate transfer and compute queue
//! families.  wgpu currently hands out a single queue per device, so for now
//! the context always falls back to one shared queue (`transfer_queue()`
//! returns `None`).
//!
//! Both sync constructors use `PowerPreference::HighPerformance` and
//! `wgpu::Limits::default()` (not `downlevel_defaults()`, which caps the
//! compute feature set).

use crate::error::ComputeError;

// ── ComputeContext ─────────────────────────────────────────────────────────────

/// An initialised headless GPU compute context.
///
/// Owns the logical [`wgpu::Device`], the primary compute [`wgpu::Queue`],
/// an optional dedicated transfer queue (when the adapter exposes more than one
/// queue family), and the [`wgpu::AdapterInfo`] snapshot captured at
/// construction time.  No window handle, surface, or swap-chain is involved.
///
/// `device` and `queue` are public so callers can record and submit their own
/// work; the remaining fields are private and reachable through the
/// `transfer_queue()` and `adapter_info()` accessors.
pub struct ComputeContext {
    /// The logical GPU device.
    pub device: wgpu::Device,
    /// The primary command submission queue (compute and, when no separate
    /// transfer queue is available, also used for DMA transfers).
    pub queue: wgpu::Queue,
    /// A dedicated transfer queue, present only when the adapter exposes
    /// separate queue families **and** `ContextBuilder::with_multi_queue` was
    /// called.  `None` when the adapter provides a single shared queue family.
    ///
    /// Every constructor in this file (`ContextBuilder::build_async` and
    /// `ComputeContext::from_device`) currently stores `None` here.
    transfer_queue: Option<wgpu::Queue>,
    /// Adapter metadata snapshot (vendor, backend, driver, …).
    ///
    /// For contexts created via `from_device` without explicit metadata this
    /// is a placeholder rather than data queried from a real adapter.
    adapter_info: wgpu::AdapterInfo,
}

impl ComputeContext {
    /// Return a reference to the adapter metadata captured at construction time.
    ///
    /// The returned [`wgpu::AdapterInfo`] contains fields such as `name`,
    /// `vendor`, `device`, `backend`, `driver`, and `driver_info`.
    ///
    /// ```rust,no_run
    /// use oxiui_compute_wgpu::ComputeContext;
    ///
    /// if let Some(ctx) = ComputeContext::try_new() {
    ///     let info = ctx.adapter_info();
    ///     println!("GPU backend: {:?}", info.backend);
    /// }
    /// ```
    pub fn adapter_info(&self) -> &wgpu::AdapterInfo {
        &self.adapter_info
    }

    /// Return the dedicated transfer queue when one was obtained.
    ///
    /// Returns `Some` only when [`ContextBuilder::with_multi_queue`] was
    /// called **and** the underlying adapter exposed a separate transfer queue
    /// family.  Callers should fall back to `self.queue` when this is `None`.
    ///
    /// ```rust,no_run
    /// use oxiui_compute_wgpu::ComputeContext;
    ///
    /// if let Some(ctx) = ComputeContext::try_new() {
    ///     if let Some(tq) = ctx.transfer_queue() {
    ///         // Use the dedicated DMA queue for uploads.
    ///         let _ = tq;
    ///     }
    /// }
    /// ```
    pub fn transfer_queue(&self) -> Option<&wgpu::Queue> {
        self.transfer_queue.as_ref()
    }

    /// Return a [`ContextBuilder`] for fluent configuration of limits,
    /// features, and power preference.
    ///
    /// ```rust,no_run
    /// use oxiui_compute_wgpu::ComputeContext;
    ///
    /// let ctx = ComputeContext::builder()
    ///     .with_power_preference(wgpu::PowerPreference::LowPower)
    ///     .build();
    /// ```
    pub fn builder() -> ContextBuilder {
        ContextBuilder::default()
    }

    /// Create a context with high-performance power preference and default limits.
    ///
    /// # Errors
    ///
    /// * [`ComputeError::NoAdapter`] — no suitable GPU adapter was found.
    /// * [`ComputeError::DeviceRequest`] — the device/queue request failed.
    ///
    /// ```rust,no_run
    /// use oxiui_compute_wgpu::{ComputeContext, ComputeError};
    ///
    /// match ComputeContext::new() {
    ///     Ok(ctx)                      => { let _ = ctx; }
    ///     Err(ComputeError::NoAdapter) => { /* skip */ }
    ///     Err(e)                       => panic!("unexpected: {e}"),
    /// }
    /// ```
    #[cfg_attr(feature = "tracing", tracing::instrument(level = "debug"))]
    pub fn new() -> Result<Self, ComputeError> {
        ContextBuilder::default().build()
    }

    /// Try to create a `ComputeContext`, returning `None` when no suitable GPU
    /// adapter is available on this host.
    ///
    /// This constructor never panics.  Call sites that want a graceful skip on
    /// headless CI environments (VMs, containers without GPU pass-through) should
    /// use this variant:
    ///
    /// ```rust,no_run
    /// use oxiui_compute_wgpu::ComputeContext;
    ///
    /// if let Some(ctx) = ComputeContext::try_new() {
    ///     // GPU is available — run the compute workload
    ///     let _ = ctx;
    /// } else {
    ///     // No GPU — skip gracefully
    /// }
    /// ```
    pub fn try_new() -> Option<Self> {
        Self::new().ok()
    }

    /// Async variant of [`new`][Self::new] — awaits adapter and device requests
    /// directly without a `pollster::block_on` wrapper.
    ///
    /// Suitable for use inside an async runtime (Tokio, async-std, etc.).
    ///
    /// # Errors
    ///
    /// Same as [`new`][Self::new].
    ///
    /// ```rust,no_run
    /// use oxiui_compute_wgpu::ComputeContext;
    ///
    /// # async fn run() -> Result<(), oxiui_compute_wgpu::ComputeError> {
    /// let ctx = ComputeContext::new_async().await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn new_async() -> Result<Self, ComputeError> {
        ContextBuilder::default().build_async().await
    }

    /// Wrap externally owned `wgpu::Device` and `wgpu::Queue` in a
    /// `ComputeContext` so that the compute layer can share a device/queue pair
    /// that was already created by another backend (e.g. `oxiui-render-wgpu`).
    ///
    /// A synthetic [`wgpu::AdapterInfo`] is constructed from the optional
    /// `adapter_info` argument; pass `None` to use a placeholder.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use oxiui_compute_wgpu::ComputeContext;
    ///
    /// // Suppose `device` and `queue` come from an external renderer.
    /// # fn external() -> (wgpu::Device, wgpu::Queue) { unimplemented!() }
    /// let (device, queue) = external();
    /// let ctx = ComputeContext::from_device(device, queue, None);
    /// ```
    pub fn from_device(
        device: wgpu::Device,
        queue: wgpu::Queue,
        adapter_info: Option<wgpu::AdapterInfo>,
    ) -> Self {
        let adapter_info = adapter_info.unwrap_or_else(|| wgpu::AdapterInfo {
            name: "external".into(),
            vendor: 0,
            device: 0,
            device_type: wgpu::DeviceType::Other,
            device_pci_bus_id: String::new(),
            driver: String::new(),
            driver_info: String::new(),
            backend: wgpu::Backend::Noop,
            subgroup_min_size: 0,
            subgroup_max_size: 0,
            transient_saves_memory: false,
        });
        ComputeContext {
            device,
            queue,
            transfer_queue: None,
            adapter_info,
        }
    }

    // ── Convenience delegates to ContextBuilder ─────────────────────────────

    /// Create a [`crate::dispatch::Dispatcher`] that borrows this context.
    ///
    /// The `Dispatcher` provides high-level, zero-boilerplate GPU compute
    /// operations (`map_f32`, `zip_map_f32`, `reduce_sum_f32`, `sph_density`,
    /// `sort_f32`, …).
    ///
    /// ```rust,no_run
    /// use oxiui_compute_wgpu::ComputeContext;
    ///
    /// if let Some(ctx) = ComputeContext::try_new() {
    ///     let d = ctx.dispatcher();
    ///     let out = d.map_f32(&[1.0, 2.0, 3.0], "x * 2.0");
    ///     assert_eq!(out, vec![2.0, 4.0, 6.0]);
    /// }
    /// ```
    pub fn dispatcher(&self) -> crate::dispatch::Dispatcher<'_> {
        crate::dispatch::Dispatcher::new(self)
    }

    /// Start building a context with custom memory limits.
    ///
    /// Equivalent to `ComputeContext::builder().with_limits(limits)`.
    pub fn with_limits(limits: wgpu::Limits) -> ContextBuilder {
        ContextBuilder::default().with_limits(limits)
    }

    /// Start building a context with specific GPU features enabled.
    ///
    /// Equivalent to `ComputeContext::builder().with_features(features)`.
    pub fn with_features(features: wgpu::Features) -> ContextBuilder {
        ContextBuilder::default().with_features(features)
    }

    /// Start building a context with a specific power preference.
    ///
    /// Equivalent to `ComputeContext::builder().with_power_preference(pref)`.
    pub fn with_power_preference(pref: wgpu::PowerPreference) -> ContextBuilder {
        ContextBuilder::default().with_power_preference(pref)
    }

    /// Return a [`crate::hot_reload::ShaderWatcher`] that watches WGSL source
    /// files on disk and signals when recompilation is needed.
    ///
    /// Only available when the `hot-reload` Cargo feature is enabled.
    ///
    /// ```rust,no_run
    /// # #[cfg(feature = "hot-reload")]
    /// # {
    /// use oxiui_compute_wgpu::ComputeContext;
    ///
    /// if let Some(ctx) = ComputeContext::try_new() {
    ///     let _watcher = ctx.watcher();
    ///     // Add paths to watch, then call watcher.drain_changed() each frame.
    /// }
    /// # }
    /// ```
    #[cfg(feature = "hot-reload")]
    pub fn watcher(&self) -> crate::hot_reload::ShaderWatcher {
        crate::hot_reload::ShaderWatcher::new()
    }
}

// ── ContextBuilder ─────────────────────────────────────────────────────────────

/// Fluent builder for [`ComputeContext`].
///
/// Compose limits, features, and power preference in one chain, then call
/// [`build`][ContextBuilder::build] (sync) or [`build_async`][ContextBuilder::build_async]
/// (async) to finalise.
///
/// # Defaults
///
/// [`ContextBuilder::default`] selects
/// [`wgpu::PowerPreference::HighPerformance`], requests no optional features,
/// leaves the limits unset (so `wgpu::Limits::default()` is used at build
/// time), and does not request multi-queue.
///
/// ```rust,no_run
/// use oxiui_compute_wgpu::{ComputeContext, ComputeError};
///
/// let result = ComputeContext::builder()
///     .with_power_preference(wgpu::PowerPreference::HighPerformance)
///     .with_limits(wgpu::Limits::default())
///     .build();
/// ```
#[derive(Debug)]
pub struct ContextBuilder {
    /// Adapter selection hint passed to `request_adapter`.
    power_preference: wgpu::PowerPreference,
    /// Features that the adapter must support and the device must enable.
    required_features: wgpu::Features,
    /// Explicit device limits; `None` means "use `wgpu::Limits::default()`".
    required_limits: Option<wgpu::Limits>,
    /// When `true`, request a dedicated transfer queue family (if supported).
    multi_queue: bool,
}

impl Default for ContextBuilder {
    /// Build the documented default configuration.
    ///
    /// This is written by hand rather than derived: the derived impl would
    /// pick `wgpu::PowerPreference::default()`, which is `None` (no
    /// preference), whereas this crate documents `HighPerformance` as the
    /// default for compute workloads.
    fn default() -> Self {
        Self {
            power_preference: wgpu::PowerPreference::HighPerformance,
            required_features: wgpu::Features::empty(),
            required_limits: None,
            multi_queue: false,
        }
    }
}

impl ContextBuilder {
    /// Choose the GPU power preference used when selecting an adapter.
    ///
    /// When this is never called, the builder keeps whatever preference
    /// [`ContextBuilder::default`] supplied.
    pub fn with_power_preference(self, pref: wgpu::PowerPreference) -> Self {
        Self {
            power_preference: pref,
            ..self
        }
    }

    /// Request optional GPU features (e.g. `TIMESTAMP_QUERY`, `SHADER_F16`).
    ///
    /// Replaces any feature set given earlier in the chain.  At build time the
    /// adapter is checked first: if it lacks any requested feature,
    /// [`ContextBuilder::build`] returns [`ComputeError::DeviceRequest`] with a
    /// descriptive message and `request_device` is never attempted.
    pub fn with_features(self, features: wgpu::Features) -> Self {
        Self {
            required_features: features,
            ..self
        }
    }

    /// Override the device limits requested at build time.
    ///
    /// Without this call `wgpu::Limits::default()` is requested.  Use
    /// [`wgpu::Limits::downlevel_defaults()`] for maximum compatibility or
    /// supply custom limits for high-throughput compute workloads.
    pub fn with_limits(self, limits: wgpu::Limits) -> Self {
        Self {
            required_limits: Some(limits),
            ..self
        }
    }

    /// Ask for separate transfer and compute queues on adapters that
    /// advertise more than one queue family.
    ///
    /// If only a single queue family is exposed, the built context degrades
    /// gracefully: `transfer_queue()` returns `None` and `queue` serves every
    /// operation.
    ///
    /// Note: wgpu currently exposes at most one queue per device to the Rust
    /// API, so this option merely records the intent; `transfer_queue()` stays
    /// `None` until wgpu gains explicit multi-queue support.  The flag exists
    /// for forward compatibility.
    pub fn with_multi_queue(self) -> Self {
        Self {
            multi_queue: true,
            ..self
        }
    }

    /// Synchronous build: drives [`build_async`][ContextBuilder::build_async]
    /// to completion on the current thread via `pollster::block_on`.
    ///
    /// # Errors
    ///
    /// * [`ComputeError::NoAdapter`] — no GPU adapter matched the options.
    /// * [`ComputeError::DeviceRequest`] — features or device request failed.
    pub fn build(self) -> Result<ComputeContext, ComputeError> {
        let initialisation = self.build_async();
        pollster::block_on(initialisation)
    }

    /// Asynchronous build: the adapter and device requests are awaited inside
    /// the caller's runtime.
    ///
    /// Steps, in order: create a default `wgpu::Instance`, request a
    /// surface-less adapter, verify the requested features, snapshot the
    /// adapter info, then request the device and queue.
    ///
    /// # Errors
    ///
    /// * [`ComputeError::NoAdapter`] — no GPU adapter matched the options.
    /// * [`ComputeError::DeviceRequest`] — features or device request failed.
    pub async fn build_async(self) -> Result<ComputeContext, ComputeError> {
        let Self {
            power_preference,
            required_features,
            required_limits,
            multi_queue: multi_queue_requested,
        } = self;

        let instance = wgpu::Instance::default();

        // Any adapter-request failure is reported uniformly as `NoAdapter`.
        let adapter = match instance
            .request_adapter(&wgpu::RequestAdapterOptions {
                power_preference,
                force_fallback_adapter: false,
                // Headless: pure compute needs neither surface nor swap-chain.
                compatible_surface: None,
            })
            .await
        {
            Ok(found) => found,
            Err(_) => return Err(ComputeError::NoAdapter),
        };

        // Validate features up front so the caller receives a readable
        // message rather than an opaque device-request failure.
        let features_supported =
            required_features.is_empty() || adapter.features().contains(required_features);
        if !features_supported {
            return Err(ComputeError::DeviceRequest(format!(
                "adapter does not support requested features: {:?}",
                required_features
            )));
        }

        // Snapshot the adapter metadata before asking for the device.
        let adapter_info = adapter.get_info();

        let descriptor = wgpu::DeviceDescriptor {
            label: Some("oxiui-compute-wgpu"),
            required_features,
            required_limits: required_limits.unwrap_or_default(),
            ..Default::default()
        };
        let (device, queue) = match adapter.request_device(&descriptor).await {
            Ok(pair) => pair,
            Err(err) => return Err(ComputeError::DeviceRequest(err.to_string())),
        };

        // Multi-queue: wgpu hands out exactly one queue per device, so there
        // is no second queue to acquire whether or not the caller asked for
        // one.  The request flag is only acknowledged here; once wgpu can
        // return several queues from `request_device`, this is where the
        // transfer queue should be obtained.  Until then callers see `None`
        // from `transfer_queue()` and fall back to `queue`.
        let _ = multi_queue_requested;
        let transfer_queue: Option<wgpu::Queue> = None;

        Ok(ComputeContext {
            device,
            queue,
            transfer_queue,
            adapter_info,
        })
    }
}

// ── Tests ─────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    // ── Constructor smoke tests ──────────────────────────────────────────────

    /// `try_new` may yield `Some` or `None` depending on the host, but it must
    /// never panic.
    #[test]
    fn try_new_does_not_panic() {
        drop(ComputeContext::try_new());
    }

    /// `new` either succeeds or reports `NoAdapter`; every other error is a
    /// test failure.
    #[test]
    fn new_returns_result() {
        match ComputeContext::new() {
            // Success (GPU present) and NoAdapter (CI, headless VM) are both
            // acceptable outcomes.
            Ok(_) | Err(ComputeError::NoAdapter) => {}
            Err(ComputeError::DeviceRequest(ref msg)) => {
                panic!("unexpected DeviceRequest error: {msg}")
            }
            Err(e) => {
                panic!("unexpected error: {e}")
            }
        }
    }

    /// `new` and `try_new` must agree: both succeed or both fail.
    #[test]
    fn try_new_consistent_with_new() {
        let strict = ComputeContext::new();
        let lenient = ComputeContext::try_new();
        match (strict, lenient) {
            (Ok(_), None) => panic!("new() succeeded but try_new() returned None"),
            (Err(e), Some(_)) => panic!("new() failed but try_new() returned Some: {e}"),
            (Ok(_), Some(_)) | (Err(_), None) => { /* consistent */ }
        }
    }

    // ── Builder tests ────────────────────────────────────────────────────────

    /// Runs on any host: a fully chained builder can be constructed and built
    /// without panicking, whatever the build result is.
    #[test]
    fn builder_chain_defaults() {
        // Constructing the chain must not depend on GPU availability.
        let builder = ContextBuilder::default()
            .with_power_preference(wgpu::PowerPreference::HighPerformance)
            .with_limits(wgpu::Limits::default())
            .with_features(wgpu::Features::empty());
        // The build result is host-dependent, so it is intentionally not
        // asserted on: success and NoAdapter are both valid.
        let _outcome = builder.build();
    }

    /// Runs on any host: building with `with_multi_queue()` must not panic.
    #[test]
    fn builder_with_multi_queue_does_not_panic() {
        // Ok and NoAdapter are both fine; only a panic would fail the test.
        let _outcome = ContextBuilder::default().with_multi_queue().build();
    }

    /// GPU-gated: the adapter info exposes a backend with a non-empty debug
    /// representation.
    #[test]
    fn context_has_adapter_info() {
        oxiui_core::require_gpu!(ctx, ComputeContext::try_new());
        let backend_str = format!("{:?}", ctx.adapter_info().backend);
        assert!(!backend_str.is_empty(), "backend string must not be empty");
    }

    /// GPU-gated: a builder configured for `LowPower` yields a context.
    #[test]
    fn builder_with_low_power() {
        oxiui_core::require_gpu!(
            ctx,
            ComputeContext::with_power_preference(wgpu::PowerPreference::LowPower)
                .build()
                .ok()
        );
        let _ = ctx;
    }

    /// GPU-gated: `new_async()` driven by `pollster::block_on` yields a
    /// context.
    #[test]
    fn new_async_via_pollster() {
        oxiui_core::require_gpu!(ctx, pollster::block_on(ComputeContext::new_async()).ok());
        let _ = ctx;
    }

    /// Runs on any host: requesting every feature must end in a clean result
    /// (never a panic), even when the adapter cannot satisfy the request.
    #[test]
    fn with_unsupported_features_returns_error() {
        // Hardly any single adapter supports `wgpu::Features::all()`, so a
        // DeviceRequest error is the expected outcome.  A successful build
        // (hardware that really supports everything) and NoAdapter (no GPU on
        // this host) are tolerated as well.
        let outcome = ComputeContext::with_features(wgpu::Features::all()).build();
        match outcome {
            Ok(_) | Err(ComputeError::NoAdapter | ComputeError::DeviceRequest(_)) => {}
            Err(e) => panic!("unexpected error variant: {e}"),
        }
    }

    // ── Transfer-queue tests ─────────────────────────────────────────────────

    /// GPU-gated: a context built without requesting multi-queue has no
    /// transfer queue.
    #[test]
    fn transfer_queue_none_without_multi_queue() {
        oxiui_core::require_gpu!(ctx, ComputeContext::try_new());
        let dedicated = ctx.transfer_queue();
        assert!(
            dedicated.is_none(),
            "transfer_queue must be None when multi-queue was not requested"
        );
    }

    /// GPU-gated: a multi-queue request still builds; the transfer queue is
    /// `None` (graceful fallback) because wgpu exposes one queue per device.
    #[test]
    fn multi_queue_context_builds() {
        oxiui_core::require_gpu!(
            ctx,
            ComputeContext::builder().with_multi_queue().build().ok()
        );
        // Only one queue is available from wgpu today.
        assert!(ctx.transfer_queue().is_none());
    }

    // ── from_device ──────────────────────────────────────────────────────────

    /// GPU-gated: `from_device` re-wraps a real device/queue pair and keeps
    /// the adapter info it was given.
    ///
    /// No GPU work is submitted; only the wrapping itself is checked.
    #[test]
    fn from_device_via_real_gpu() {
        oxiui_core::require_gpu!(ctx, ComputeContext::try_new());
        // Remember the metadata of the freshly built context.
        let original_info = ctx.adapter_info().clone();
        // Move the device and queue out of `ctx` into a second context.
        let rewrapped =
            ComputeContext::from_device(ctx.device, ctx.queue, Some(original_info.clone()));
        assert_eq!(rewrapped.adapter_info().name, original_info.name);
        assert!(rewrapped.transfer_queue().is_none());
    }
}