Skip to main content

oximedia_gpu/
lib.rs

1//! Cross-platform GPU compute pipeline for OxiMedia using WGPU.
2//!
3//! This crate provides GPU-accelerated media processing via the
4//! [wgpu](https://wgpu.rs/) portability layer, which selects the best
5//! available native backend **at runtime** — no compile-time feature flags
6//! are required:
7//!
8//! | Platform | Backend selected by wgpu |
9//! |----------|--------------------------|
10//! | Linux    | Vulkan (preferred), then OpenGL ES |
11//! | macOS    | Metal |
12//! | Windows  | DirectX 12, then Vulkan |
13//! | Web      | WebGPU |
14//! | All      | CPU software fallback when no GPU adapter is found |
15//!
16//! # Compute kernels
17//!
18//! **Color space:**
19//! - RGB ↔ YUV (BT.601, BT.709, BT.2020) — [`ops::ColorSpaceConversion`]
20//! - Chroma subsampling (4:2:0, 4:2:2, 4:4:4) — [`ops::ChromaOps`]
21//! - Tone mapping (Reinhard, Hable, ACES, Drago) — `ops::tonemap`
22//!
23//! **Geometry and scale:**
24//! - Image scaling: Bilinear, Bicubic, Lanczos-3 — [`ops::ScaleOperation`]
25//! - Convolution filters: blur, sharpen, edge-detect — [`ops::FilterOperation`]
26//! - Perspective transform, mipmap generation
27//!
28//! **Signal processing:**
29//! - DCT and FFT transforms — [`ops::TransformOperation`]
30//! - Histogram equalization (CLAHE) — [`HistogramEqualizer`]
31//! - Optical flow estimation — `optical_flow`
32//! - Motion detection — [`MotionDetector`]
33//! - Film grain synthesis — `film_grain`
34//! - Bilateral / NLM denoising — `ops::denoise`
35//!
36//! **Quality metrics:**
37//! - [`compute_psnr`], [`compute_ssim`], [`compute_ms_ssim`]
38//!
39//! # TexturePool — LRU eviction
40//!
41//! [`TexturePool`] maintains a byte-budget and slot-count capacity.  When both
42//! limits are exhausted, [`TexturePool::allocate_with_lru_eviction`] evicts the
43//! least-recently-used texture in a loop until enough space is reclaimed.  LRU
44//! order is tracked with a monotonic `access_clock` counter; the slot with the
45//! smallest timestamp is selected by [`TexturePool::lru_handle`].  Call
46//! [`TexturePool::touch`] after each use to update the timestamp.
47//!
48//! Supported [`TextureFormat`]s: `Rgba8`, `Rgba16f`, `Rgb10A2`, `R8`, `Rg8`,
49//! `Yuv420`, `Nv12`.
50//!
51//! # Shader cache
52//!
53//! [`shader_cache::GpuShaderCache`] maintains two levels of caching:
54//!
55//! - **In-memory**: LRU, LFU, or OldestFirst eviction (configurable via
56//!   [`shader_cache::EvictionPolicy`]).  Hit/miss counters are tracked.
57//! - **Disk-persistent**: Cache entries are stored as
58//!   `<hex_hash>_<backend>_<flags>.shd` (compiled bytecode) plus a
59//!   `<hex_hash>_<backend>_<flags>.meta` sidecar.  The cache key is a
60//!   [`shader_cache::ShaderVersion`] containing `source_hash: u64`,
61//!   `backend: String`, and `feature_flags: u32`.
62//!
63//! # Pipeline system
64//!
65//! [`GpuPipeline`] is a DAG-based processing pipeline with built-in barrier
66//! management.  Stages: `Decode → Colorspace → Filter → Encode → Display`.
67//! [`BarrierBatcher`] supports three strategies — `Eager`, `Batched`, and
68//! `Deferred` — to minimise synchronisation overhead.
69//!
70//! [`BatchedComputePass`] and [`ComputeShaderSimulator`] provide structured
71//! compute dispatch with recorded [`DispatchCommand`] queues.
72//!
73//! # GPU buffer management
74//!
75//! [`SubAllocator`] implements a bump-pointer sub-allocator with defragmentation
76//! for the GPU buffer pool.  [`memory_pool::DefragResult`] reports how many bytes
77//! were compacted per defrag pass.
78//!
79//! # Example
80//!
81//! ```no_run
82//! use oximedia_gpu::GpuContext;
83//!
84//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
85//! let ctx = GpuContext::new()?;
86//!
87//! let input = vec![0u8; 1920 * 1080 * 4];
88//! let mut output = vec![0u8; 1920 * 1080 * 4];
89//!
90//! ctx.rgb_to_yuv(&input, &mut output)?;
91//! # Ok(())
92//! # }
93//! ```
94
95#![allow(clippy::cast_possible_truncation)]
96#![allow(clippy::cast_sign_loss)]
97#![allow(clippy::cast_precision_loss)]
98#![allow(clippy::cast_possible_wrap)]
99#![allow(clippy::missing_errors_doc)]
100#![allow(clippy::missing_panics_doc)]
101
102// Core modules
103pub mod buffer;
104pub mod device;
105pub mod ops;
106pub mod shader;
107
108// New comprehensive modules
109pub mod accelerator;
110pub mod backend;
111pub mod cache;
112pub mod compiler;
113pub mod compute;
114pub mod kernels;
115pub mod memory;
116pub mod queue;
117pub mod sync;
118
119// GPU compute operation modules
120pub mod histogram;
121pub mod motion_detect;
122pub mod pipeline;
123pub mod texture;
124pub mod video_process;
125
126// New kernel / pass / shader-param modules
127pub mod compute_pass;
128pub mod kernel;
129pub mod shader_params;
130
131// Wave-8 new modules
132pub mod compute_dispatch;
133pub mod memory_pool;
134pub mod shader_cache;
135
136// Wave-9 new modules
137pub mod gpu_buffer;
138pub mod gpu_fence;
139pub mod render_pass;
140
141// Wave-10 new modules
142pub mod command_buffer;
143pub mod resource_manager;
144pub mod sync_primitive;
145
146// Wave-11 new modules
147pub mod descriptor_set;
148pub mod gpu_stats;
149pub mod viewport;
150
151// Wave-12 new modules
152pub mod gpu_profiler;
153pub mod sampler;
154pub mod vertex_buffer;
155
156// Wave-13 new modules
157pub mod fence_pool;
158pub mod gpu_timer;
159pub mod upload_queue;
160
161// Wave-14 new modules
162pub mod buffer_copy;
163pub mod occupancy;
164pub mod workgroup;
165
166// Wave-15 new modules
167pub mod buffer_pool;
168pub mod compute_kernels;
169pub mod pipeline_stages;
170
171// Wave-16 new modules (0.1.2 enhancements)
172pub mod motion_estimation;
173pub mod multi_gpu;
174
175// Wave-17 new modules
176pub mod compute_shader;
177pub mod histogram_equalization;
178
179// Previously undeclared modules (discovered in src/ inventory)
180pub mod async_compute;
181pub mod barrier_manager;
182pub mod blend_kernel;
183pub mod color_convert_kernel;
184pub mod compute_graph;
185pub mod double_buffer;
186pub mod film_grain;
187pub mod gpu_cpu_verify;
188pub mod indirect_dispatch;
189pub mod kernel_scheduler;
190pub mod mipmap_gen;
191pub mod optical_flow;
192pub mod perspective_transform;
193pub mod pipeline_cache;
194pub mod readback;
195pub mod scale_kernel;
196pub mod texture_atlas;
197pub mod texture_cache;
198pub mod tone_curve;
199
200use std::sync::Arc;
201use thiserror::Error;
202
203// Accelerator exports
204pub use accelerator::{AcceleratorBuilder, CpuAccelerator, GpuAccelerator, WgpuAccelerator};
205
206// Core exports
207pub use buffer::{BufferType, GpuBuffer};
208pub use device::{GpuDevice, GpuDeviceInfo};
209pub use ops::quality_metrics::{
210    compute_ms_ssim, compute_psnr, compute_ssim, MsSsimResult, PsnrResult, SsimResult,
211};
212pub use ops::{
213    ChromaOps, ChromaSubsampling, ColorSpaceConversion, FilterOperation, ScaleOperation,
214    TransformOperation, YcbcrCoefficients,
215};
216
217// Backend exports
218pub use backend::{Backend, BackendCapabilities, BackendType, CpuBackend, VulkanBackend};
219
220// Cache exports
221pub use cache::{CacheStats, PipelineCache, ShaderCache};
222
223// Compiler exports
224pub use compiler::{
225    CompilationError, CompilationOptions, OptimizationLevel, ShaderCompiler, ShaderPreprocessor,
226};
227
228// Compute exports
229pub use compute::{
230    ComputeExecutor, ComputePassBuilder, ComputePipelineHandle, ComputePipelineManager,
231    DispatchHelper,
232};
233
234// Kernels exports
235pub use kernels::{
236    ColorConversionKernel, ConvolutionKernel, FilterKernel, ReduceKernel, ReduceOp, ResizeFilter,
237    ResizeKernel, TransformKernel, TransformType,
238};
239
240// Memory exports
241pub use memory::{ManagedBuffer, MemoryAllocator, MemoryPool, MemoryStats};
242
243// Queue exports
244pub use queue::{
245    AsyncSubmission, BatchSubmitter, CommandBufferBuilder, CommandQueue, QueueManager, QueueType,
246};
247
248// Sync exports
249pub use sync::{Barrier, Event, Fence, Semaphore};
250
251// Workgroup auto-tuner exports
252pub use workgroup::{DeviceLimits, WorkgroupAutoTuner};
253
254// Memory pool defragmentation exports
255pub use memory_pool::{CompactionPlan, DefragResult, MigrationEntry};
256
257// Video processing exports
258pub use buffer_pool::SubAllocator;
259pub use compute_pass::{BatchedComputePass, DispatchCommand};
260pub use histogram::{ChannelHistogram, ImageHistogram};
261pub use motion_detect::{MotionAnalysis, MotionDetector, MotionRegion, Sensitivity};
262pub use pipeline::{
263    BarrierBatcher, BarrierKind, BarrierStrategy, BufferBarrier, FlushRecord, GpuPipeline,
264    PipelineMetrics, PipelineNode, PipelineStage,
265};
266pub use texture::{TextureDescriptor, TextureFormat, TexturePool};
267pub use video_process::{FrameProcessConfig, FrameProcessResult, VideoFrameProcessor};
268
269// Wave-17 exports
270pub use compute_shader::{ComputeShaderSimulator, ShaderKernel, ThreadGroupContext};
271pub use histogram_equalization::{ClaheConfig, EqualizationStats, HistogramEqualizer};
272
/// Error type returned by the fallible operations of this crate.
///
/// Each variant's `Display` text is defined by its `#[error(...)]` attribute;
/// variants carrying a `String` append that string to the message as extra
/// detail.
#[derive(Debug, Error)]
pub enum GpuError {
    /// Device initialization failed; the payload is appended to the message.
    #[error("Failed to initialize GPU device: {0}")]
    DeviceInit(String),

    /// Adapter selection failed: no suitable GPU adapter was found.
    #[error("No suitable GPU adapter found")]
    NoAdapter,

    /// Device request failed; the payload is appended to the message.
    #[error("Failed to request GPU device: {0}")]
    DeviceRequest(String),

    /// Buffer creation failed; the payload is appended to the message.
    #[error("Failed to create GPU buffer: {0}")]
    BufferCreation(String),

    /// Shader compilation failed; the payload is appended to the message.
    #[error("Failed to compile shader: {0}")]
    ShaderCompilation(String),

    /// Compute pipeline creation failed; the payload is appended to the message.
    #[error("Failed to create compute pipeline: {0}")]
    PipelineCreation(String),

    /// Command submission failed; the payload is appended to the message.
    #[error("Failed to submit GPU commands: {0}")]
    CommandSubmission(String),

    /// Buffer mapping failed; the payload is appended to the message.
    #[error("Failed to map GPU buffer: {0}")]
    BufferMapping(String),

    /// Invalid image dimensions.
    ///
    /// `width` and `height` are the rejected values; they are rendered as
    /// `{width}x{height}` in the message.
    #[error("Invalid image dimensions: {width}x{height}")]
    InvalidDimensions { width: u32, height: u32 },

    /// Invalid buffer size.
    ///
    /// `expected` is the size the operation required and `actual` is the size
    /// it was given. The constructors visible in this file fill both from
    /// `u8` slice lengths.
    #[error("Invalid buffer size: expected {expected}, got {actual}")]
    InvalidBufferSize { expected: usize, actual: usize },

    /// The requested operation is not supported; the payload says which.
    #[error("Operation not supported: {0}")]
    NotSupported(String),

    /// Internal error; the payload is appended to the message.
    #[error("Internal GPU error: {0}")]
    Internal(String),
}
324
/// Shorthand for [`std::result::Result`] with [`GpuError`] as the error type.
pub type Result<T> = std::result::Result<T, GpuError>;
326
/// GPU context for compute operations
///
/// This is the main entry point for GPU-accelerated operations.
/// It manages device selection, resource allocation, and command submission.
///
/// A context is obtained from [`GpuContext::new`] or
/// [`GpuContext::with_device`]; both are fallible.
pub struct GpuContext {
    /// Reference-counted handle to the device that every method of this
    /// context passes to the underlying `ops` routines.
    device: Arc<GpuDevice>,
}
334
335impl GpuContext {
336    /// Create a new GPU context with automatic device selection
337    ///
338    /// This will select the most suitable GPU device available on the system.
339    /// If no GPU is available, an error is returned.
340    ///
341    /// # Errors
342    ///
343    /// Returns an error if no suitable GPU device is found or if device
344    /// initialization fails.
345    pub fn new() -> Result<Self> {
346        let device = GpuDevice::new(None)?;
347        Ok(Self {
348            device: Arc::new(device),
349        })
350    }
351
352    /// Create a new GPU context with a specific device
353    ///
354    /// # Arguments
355    ///
356    /// * `device_index` - Index of the device to use (from `list_devices`)
357    ///
358    /// # Errors
359    ///
360    /// Returns an error if the device index is invalid or if device
361    /// initialization fails.
362    pub fn with_device(device_index: usize) -> Result<Self> {
363        let device = GpuDevice::new(Some(device_index))?;
364        Ok(Self {
365            device: Arc::new(device),
366        })
367    }
368
369    /// List available GPU devices
370    ///
371    /// Returns information about all GPU devices available on the system.
372    pub fn list_devices() -> Result<Vec<GpuDeviceInfo>> {
373        GpuDevice::list_devices()
374    }
375
376    /// Get information about the current device
377    #[must_use]
378    pub fn device_info(&self) -> &GpuDeviceInfo {
379        self.device.info()
380    }
381
382    /// Convert RGB to YUV (BT.601)
383    ///
384    /// # Arguments
385    ///
386    /// * `input` - Input RGB buffer (packed RGBA format)
387    /// * `output` - Output YUV buffer (packed YUVA format)
388    ///
389    /// # Errors
390    ///
391    /// Returns an error if buffer sizes are invalid or if the GPU operation fails.
392    pub fn rgb_to_yuv(&self, input: &[u8], output: &mut [u8]) -> Result<()> {
393        if input.len() != output.len() {
394            return Err(GpuError::InvalidBufferSize {
395                expected: input.len(),
396                actual: output.len(),
397            });
398        }
399
400        if input.len() % 4 != 0 {
401            return Err(GpuError::InvalidBufferSize {
402                expected: (input.len() / 4) * 4,
403                actual: input.len(),
404            });
405        }
406
407        let width = ((input.len() / 4) as f32).sqrt() as u32;
408        let height = width;
409
410        ops::ColorSpaceConversion::rgb_to_yuv(
411            &self.device,
412            input,
413            output,
414            width,
415            height,
416            ops::ColorSpace::BT601,
417        )
418    }
419
420    /// Convert YUV to RGB (BT.601)
421    ///
422    /// # Arguments
423    ///
424    /// * `input` - Input YUV buffer (packed YUVA format)
425    /// * `output` - Output RGB buffer (packed RGBA format)
426    ///
427    /// # Errors
428    ///
429    /// Returns an error if buffer sizes are invalid or if the GPU operation fails.
430    pub fn yuv_to_rgb(&self, input: &[u8], output: &mut [u8]) -> Result<()> {
431        if input.len() != output.len() {
432            return Err(GpuError::InvalidBufferSize {
433                expected: input.len(),
434                actual: output.len(),
435            });
436        }
437
438        if input.len() % 4 != 0 {
439            return Err(GpuError::InvalidBufferSize {
440                expected: (input.len() / 4) * 4,
441                actual: input.len(),
442            });
443        }
444
445        let width = ((input.len() / 4) as f32).sqrt() as u32;
446        let height = width;
447
448        ops::ColorSpaceConversion::yuv_to_rgb(
449            &self.device,
450            input,
451            output,
452            width,
453            height,
454            ops::ColorSpace::BT601,
455        )
456    }
457
458    /// Scale an image using bilinear interpolation
459    ///
460    /// # Arguments
461    ///
462    /// * `input` - Input image buffer (packed RGBA format)
463    /// * `src_width` - Source image width
464    /// * `src_height` - Source image height
465    /// * `output` - Output image buffer (packed RGBA format)
466    /// * `dst_width` - Destination image width
467    /// * `dst_height` - Destination image height
468    ///
469    /// # Errors
470    ///
471    /// Returns an error if buffer sizes are invalid or if the GPU operation fails.
472    pub fn scale_bilinear(
473        &self,
474        input: &[u8],
475        src_width: u32,
476        src_height: u32,
477        output: &mut [u8],
478        dst_width: u32,
479        dst_height: u32,
480    ) -> Result<()> {
481        ops::ScaleOperation::scale(
482            &self.device,
483            input,
484            src_width,
485            src_height,
486            output,
487            dst_width,
488            dst_height,
489            ops::ScaleFilter::Bilinear,
490        )
491    }
492
493    /// Scale an image using bicubic interpolation
494    ///
495    /// # Arguments
496    ///
497    /// * `input` - Input image buffer (packed RGBA format)
498    /// * `src_width` - Source image width
499    /// * `src_height` - Source image height
500    /// * `output` - Output image buffer (packed RGBA format)
501    /// * `dst_width` - Destination image width
502    /// * `dst_height` - Destination image height
503    ///
504    /// # Errors
505    ///
506    /// Returns an error if buffer sizes are invalid or if the GPU operation fails.
507    pub fn scale_bicubic(
508        &self,
509        input: &[u8],
510        src_width: u32,
511        src_height: u32,
512        output: &mut [u8],
513        dst_width: u32,
514        dst_height: u32,
515    ) -> Result<()> {
516        ops::ScaleOperation::scale(
517            &self.device,
518            input,
519            src_width,
520            src_height,
521            output,
522            dst_width,
523            dst_height,
524            ops::ScaleFilter::Bicubic,
525        )
526    }
527
528    /// Scale an image using Lanczos-3 interpolation (highest quality)
529    ///
530    /// # Arguments
531    ///
532    /// * `input` - Input image buffer (packed RGBA format)
533    /// * `src_width` - Source image width
534    /// * `src_height` - Source image height
535    /// * `output` - Output image buffer (packed RGBA format)
536    /// * `dst_width` - Destination image width
537    /// * `dst_height` - Destination image height
538    ///
539    /// # Errors
540    ///
541    /// Returns an error if buffer sizes are invalid or if the GPU operation fails.
542    pub fn scale_lanczos(
543        &self,
544        input: &[u8],
545        src_width: u32,
546        src_height: u32,
547        output: &mut [u8],
548        dst_width: u32,
549        dst_height: u32,
550    ) -> Result<()> {
551        ops::ScaleOperation::scale(
552            &self.device,
553            input,
554            src_width,
555            src_height,
556            output,
557            dst_width,
558            dst_height,
559            ops::ScaleFilter::Lanczos3,
560        )
561    }
562
563    /// Apply Gaussian blur
564    ///
565    /// # Arguments
566    ///
567    /// * `input` - Input image buffer (packed RGBA format)
568    /// * `output` - Output image buffer (packed RGBA format)
569    /// * `width` - Image width
570    /// * `height` - Image height
571    /// * `sigma` - Blur radius (standard deviation)
572    ///
573    /// # Errors
574    ///
575    /// Returns an error if buffer sizes are invalid or if the GPU operation fails.
576    #[allow(clippy::too_many_arguments)]
577    pub fn gaussian_blur(
578        &self,
579        input: &[u8],
580        output: &mut [u8],
581        width: u32,
582        height: u32,
583        sigma: f32,
584    ) -> Result<()> {
585        ops::FilterOperation::gaussian_blur(&self.device, input, output, width, height, sigma)
586    }
587
588    /// Apply sharpening filter
589    ///
590    /// # Arguments
591    ///
592    /// * `input` - Input image buffer (packed RGBA format)
593    /// * `output` - Output image buffer (packed RGBA format)
594    /// * `width` - Image width
595    /// * `height` - Image height
596    /// * `amount` - Sharpening strength
597    ///
598    /// # Errors
599    ///
600    /// Returns an error if buffer sizes are invalid or if the GPU operation fails.
601    #[allow(clippy::too_many_arguments)]
602    pub fn sharpen(
603        &self,
604        input: &[u8],
605        output: &mut [u8],
606        width: u32,
607        height: u32,
608        amount: f32,
609    ) -> Result<()> {
610        ops::FilterOperation::sharpen(&self.device, input, output, width, height, amount)
611    }
612
613    /// Detect edges using Sobel operator
614    ///
615    /// # Arguments
616    ///
617    /// * `input` - Input image buffer (packed RGBA format)
618    /// * `output` - Output image buffer (packed RGBA format)
619    /// * `width` - Image width
620    /// * `height` - Image height
621    ///
622    /// # Errors
623    ///
624    /// Returns an error if buffer sizes are invalid or if the GPU operation fails.
625    pub fn edge_detect(
626        &self,
627        input: &[u8],
628        output: &mut [u8],
629        width: u32,
630        height: u32,
631    ) -> Result<()> {
632        ops::FilterOperation::edge_detect(&self.device, input, output, width, height)
633    }
634
635    /// Compute 2D DCT (Discrete Cosine Transform)
636    ///
637    /// # Arguments
638    ///
639    /// * `input` - Input data (f32 values)
640    /// * `output` - Output DCT coefficients
641    /// * `width` - Data width (must be multiple of 8)
642    /// * `height` - Data height (must be multiple of 8)
643    ///
644    /// # Errors
645    ///
646    /// Returns an error if dimensions are invalid or if the GPU operation fails.
647    pub fn dct_2d(&self, input: &[f32], output: &mut [f32], width: u32, height: u32) -> Result<()> {
648        ops::TransformOperation::dct_2d(&self.device, input, output, width, height)
649    }
650
651    /// Compute 2D IDCT (Inverse Discrete Cosine Transform)
652    ///
653    /// # Arguments
654    ///
655    /// * `input` - Input DCT coefficients
656    /// * `output` - Output reconstructed data
657    /// * `width` - Data width (must be multiple of 8)
658    /// * `height` - Data height (must be multiple of 8)
659    ///
660    /// # Errors
661    ///
662    /// Returns an error if dimensions are invalid or if the GPU operation fails.
663    pub fn idct_2d(
664        &self,
665        input: &[f32],
666        output: &mut [f32],
667        width: u32,
668        height: u32,
669    ) -> Result<()> {
670        ops::TransformOperation::idct_2d(&self.device, input, output, width, height)
671    }
672
673    /// Wait for all GPU operations to complete
674    ///
675    /// This is useful for synchronization and benchmarking.
676    pub fn wait(&self) {
677        self.device.wait();
678    }
679}
680
681// GpuContext intentionally does not implement Default.
682//
683// GPU context creation is inherently fallible (no adapter, driver error, etc.).
684// Callers must use GpuContext::new() or GpuContext::with_device() and handle
685// the returned Result explicitly.  A silent Default impl that can either panic
686// or silently return a non-functional context would be misleading.
687//
688// If a best-effort fallback context is needed, use:
689//   GpuContext::new().or_else(|_| GpuContext::with_device(0))