baracuda_npp/
lib.rs

1//! Safe Rust wrappers for NVIDIA NPP (Performance Primitives).
2//!
3//! NPP is organized into ~10 separate shared libraries:
4//!
5//! - `nppc` — core (version info).
6//! - `npps` — 1-D signal processing (covered by [`signal`]).
7//! - `nppial` / `nppi*` — 2-D image processing (covered by [`image`]).
8//!
9//! NPP has thousands of functions for every permutation of element type
10//! (8u/16u/16s/32s/32f) × channel count (C1/C3/C4) × region-of-interest
11//! (R). This crate wraps a curated subset: the 32f and 8u C1R variants
12//! for arithmetic, geometric, color-conversion, filter, and statistics
13//! ops. Further variants can be added trivially by following the
14//! existing pattern.
15
16#![warn(missing_debug_implementations)]
17
18use core::ffi::c_void;
19
20use baracuda_driver::DeviceBuffer;
21use baracuda_npp_sys::{nppc, npps, NppLibraryVersion, NppStatus};
22
23pub use baracuda_npp_sys::{NppiInterpolationMode, NppiPoint, NppiRect, NppiSize};
24
25/// Error type for NPP operations.
26pub type Error = baracuda_core::Error<NppStatus>;
27/// Result alias.
28pub type Result<T, E = Error> = core::result::Result<T, E>;
29
30#[inline]
31fn check(status: NppStatus) -> Result<()> {
32    Error::check(status)
33}
34
35/// NPP library version (queries `nppc`).
36pub fn version() -> Result<NppLibraryVersion> {
37    let n = nppc()?;
38    let cu = n.npp_get_lib_version()?;
39    let ptr = unsafe { cu() };
40    if ptr.is_null() {
41        return Err(Error::Status {
42            status: NppStatus(-4),
43        });
44    }
45    Ok(unsafe { *ptr })
46}
47
48/// 1-D signal-processing ops (from `npps`).
49pub mod signal {
50    use super::*;
51
52    /// `src_dst[i] += src[i]` (32-bit float).
53    pub fn add_32f_in_place(
54        src: &DeviceBuffer<f32>,
55        src_dst: &mut DeviceBuffer<f32>,
56        n: i32,
57    ) -> Result<()> {
58        assert!(src.len() >= n as usize);
59        assert!(src_dst.len() >= n as usize);
60        let s = npps()?;
61        let cu = s.npps_add_32f_i()?;
62        check(unsafe {
63            cu(
64                src.as_raw().0 as *const f32,
65                src_dst.as_raw().0 as *mut f32,
66                n,
67            )
68        })
69    }
70
71    /// `src_dst[i] -= src[i]` (32-bit float).
72    pub fn sub_32f_in_place(
73        src: &DeviceBuffer<f32>,
74        src_dst: &mut DeviceBuffer<f32>,
75        n: i32,
76    ) -> Result<()> {
77        assert!(src.len() >= n as usize);
78        assert!(src_dst.len() >= n as usize);
79        let s = npps()?;
80        let cu = s.npps_sub_32f_i()?;
81        check(unsafe {
82            cu(
83                src.as_raw().0 as *const f32,
84                src_dst.as_raw().0 as *mut f32,
85                n,
86            )
87        })
88    }
89
90    /// `src_dst[i] *= src[i]` (32-bit float).
91    pub fn mul_32f_in_place(
92        src: &DeviceBuffer<f32>,
93        src_dst: &mut DeviceBuffer<f32>,
94        n: i32,
95    ) -> Result<()> {
96        assert!(src.len() >= n as usize);
97        assert!(src_dst.len() >= n as usize);
98        let s = npps()?;
99        let cu = s.npps_mul_32f_i()?;
100        check(unsafe {
101            cu(
102                src.as_raw().0 as *const f32,
103                src_dst.as_raw().0 as *mut f32,
104                n,
105            )
106        })
107    }
108
109    /// Return the device scratch-buffer size (in bytes) that
110    /// [`sum_32f`] needs for an `n`-element reduction.
111    pub fn sum_buffer_size_32f(n: i32) -> Result<usize> {
112        let s = npps()?;
113        let cu = s.npps_sum_get_buffer_size_32f()?;
114        let mut bytes: i32 = 0;
115        check(unsafe { cu(n, &mut bytes) })?;
116        Ok(bytes as usize)
117    }
118
119    /// Compute the sum of an `f32` signal, writing the scalar into
120    /// `sum_out[0]`. Caller provides the scratch buffer (use
121    /// [`sum_buffer_size_32f`] to size it).
122    pub fn sum_32f(
123        src: &DeviceBuffer<f32>,
124        n: i32,
125        sum_out: &mut DeviceBuffer<f32>,
126        scratch: &mut DeviceBuffer<u8>,
127    ) -> Result<()> {
128        assert!(src.len() >= n as usize);
129        assert!(!sum_out.is_empty());
130        let s = npps()?;
131        let cu = s.npps_sum_32f()?;
132        check(unsafe {
133            cu(
134                src.as_raw().0 as *const f32,
135                n,
136                sum_out.as_raw().0 as *mut f32,
137                scratch.as_raw().0 as *mut u8,
138            )
139        })
140    }
141
142    /// Scratch size for [`min_max_32f`].
143    pub fn min_max_buffer_size_32f(n: i32) -> Result<usize> {
144        let s = npps()?;
145        let cu = s.npps_min_max_get_buffer_size_32f()?;
146        let mut bytes: i32 = 0;
147        check(unsafe { cu(n, &mut bytes) })?;
148        Ok(bytes as usize)
149    }
150
151    /// Compute (min, max) of an `f32` signal; caller provides scratch.
152    pub fn min_max_32f(
153        src: &DeviceBuffer<f32>,
154        n: i32,
155        min_out: &mut DeviceBuffer<f32>,
156        max_out: &mut DeviceBuffer<f32>,
157        scratch: &mut DeviceBuffer<u8>,
158    ) -> Result<()> {
159        let s = npps()?;
160        let cu = s.npps_min_max_32f()?;
161        check(unsafe {
162            cu(
163                src.as_raw().0 as *const f32,
164                n,
165                min_out.as_raw().0 as *mut f32,
166                max_out.as_raw().0 as *mut f32,
167                scratch.as_raw().0 as *mut u8,
168            )
169        })
170    }
171}
172
173/// 2-D image-processing ops (from `nppial`/`nppig`/`nppicc`/`nppif`/`nppist`).
174pub mod image {
175    use super::*;
176    use baracuda_npp_sys::{nppial, nppicc, nppif, nppig, nppist};
177
178    /// `dst = src1 + src2` for 32f single-channel images (ROI-based).
179    ///
180    /// Steps are in bytes between rows. Pass the pitch your allocator
181    /// returned (typically `width * 4` for packed data, or the
182    /// pitched-alloc pitch from [`baracuda_runtime::memcpy2d::PitchedBuffer`]).
183    ///
184    /// # Safety
185    ///
186    /// All pointers must be device-addressable and cover `size.height`
187    /// rows of `size.width * 4` bytes.
188    #[allow(clippy::too_many_arguments)]
189    pub unsafe fn add_32f_c1r(
190        src1: *const f32,
191        src1_step: i32,
192        src2: *const f32,
193        src2_step: i32,
194        dst: *mut f32,
195        dst_step: i32,
196        size: NppiSize,
197    ) -> Result<()> { unsafe {
198        let l = nppial()?;
199        let cu = l.nppi_add_32f_c1r()?;
200        check(cu(src1, src1_step, src2, src2_step, dst, dst_step, size))
201    }}
202
203    /// `dst = src1 * src2` for 32f single-channel images.
204    ///
205    /// # Safety
206    ///
207    /// Same as [`add_32f_c1r`].
208    #[allow(clippy::too_many_arguments)]
209    pub unsafe fn mul_32f_c1r(
210        src1: *const f32,
211        src1_step: i32,
212        src2: *const f32,
213        src2_step: i32,
214        dst: *mut f32,
215        dst_step: i32,
216        size: NppiSize,
217    ) -> Result<()> { unsafe {
218        let l = nppial()?;
219        let cu = l.nppi_mul_32f_c1r()?;
220        check(cu(src1, src1_step, src2, src2_step, dst, dst_step, size))
221    }}
222
223    /// `dst = (src1 + src2) >> scale_factor` for 8u single-channel.
224    ///
225    /// # Safety
226    ///
227    /// Same as [`add_32f_c1r`], with steps in bytes for 8u data.
228    #[allow(clippy::too_many_arguments)]
229    pub unsafe fn add_8u_c1r_sfs(
230        src1: *const u8,
231        src1_step: i32,
232        src2: *const u8,
233        src2_step: i32,
234        dst: *mut u8,
235        dst_step: i32,
236        size: NppiSize,
237        scale_factor: i32,
238    ) -> Result<()> { unsafe {
239        let l = nppial()?;
240        let cu = l.nppi_add_8u_c1r_sfs()?;
241        check(cu(
242            src1,
243            src1_step,
244            src2,
245            src2_step,
246            dst,
247            dst_step,
248            size,
249            scale_factor,
250        ))
251    }}
252
253    /// 8u single-channel image resize with the given interpolation mode.
254    ///
255    /// # Safety
256    ///
257    /// `src` must cover `src_size` at `src_step` pitch; `dst` must cover
258    /// `dst_size` at `dst_step`.
259    #[allow(clippy::too_many_arguments)]
260    pub unsafe fn resize_8u_c1r(
261        src: *const u8,
262        src_step: i32,
263        src_size: NppiSize,
264        src_rect: NppiRect,
265        dst: *mut u8,
266        dst_step: i32,
267        dst_size: NppiSize,
268        dst_rect: NppiRect,
269        interpolation: i32,
270    ) -> Result<()> { unsafe {
271        let l = nppig()?;
272        let cu = l.nppi_resize_8u_c1r()?;
273        check(cu(
274            src,
275            src_step,
276            src_size,
277            src_rect,
278            dst,
279            dst_step,
280            dst_size,
281            dst_rect,
282            interpolation,
283        ))
284    }}
285
286    /// 32f single-channel image resize.
287    ///
288    /// # Safety
289    ///
290    /// Same as [`resize_8u_c1r`] with 4-byte pixels.
291    #[allow(clippy::too_many_arguments)]
292    pub unsafe fn resize_32f_c1r(
293        src: *const f32,
294        src_step: i32,
295        src_size: NppiSize,
296        src_rect: NppiRect,
297        dst: *mut f32,
298        dst_step: i32,
299        dst_size: NppiSize,
300        dst_rect: NppiRect,
301        interpolation: i32,
302    ) -> Result<()> { unsafe {
303        let l = nppig()?;
304        let cu = l.nppi_resize_32f_c1r()?;
305        check(cu(
306            src,
307            src_step,
308            src_size,
309            src_rect,
310            dst,
311            dst_step,
312            dst_size,
313            dst_rect,
314            interpolation,
315        ))
316    }}
317
318    /// Convert a packed RGB-8u image to single-channel grayscale.
319    ///
320    /// # Safety
321    ///
322    /// `src` must cover `size.height × src_step` bytes (RGB pixels are
323    /// 3 bytes each); `dst` must cover `size.height × dst_step` bytes.
324    #[allow(clippy::too_many_arguments)]
325    pub unsafe fn rgb_to_gray_8u(
326        src: *const u8,
327        src_step: i32,
328        dst: *mut u8,
329        dst_step: i32,
330        size: NppiSize,
331    ) -> Result<()> { unsafe {
332        let l = nppicc()?;
333        let cu = l.nppi_rgb_to_gray_8u_c3c1r()?;
334        check(cu(src, src_step, dst, dst_step, size))
335    }}
336
337    /// Same as [`rgb_to_gray_8u`] but BGR order (OpenCV convention).
338    ///
339    /// # Safety
340    ///
341    /// Same as [`rgb_to_gray_8u`].
342    pub unsafe fn bgr_to_gray_8u(
343        src: *const u8,
344        src_step: i32,
345        dst: *mut u8,
346        dst_step: i32,
347        size: NppiSize,
348    ) -> Result<()> { unsafe {
349        let l = nppicc()?;
350        let cu = l.nppi_bgr_to_gray_8u_c3c1r()?;
351        check(cu(src, src_step, dst, dst_step, size))
352    }}
353
354    /// Apply an averaging (box) filter of size `mask_size`.
355    ///
356    /// # Safety
357    ///
358    /// Same as [`add_32f_c1r`]. NPP requires an apron of ceil(mask/2)
359    /// pixels on the source side.
360    #[allow(clippy::too_many_arguments)]
361    pub unsafe fn filter_box_8u_c1r(
362        src: *const u8,
363        src_step: i32,
364        dst: *mut u8,
365        dst_step: i32,
366        dst_roi: NppiSize,
367        mask_size: NppiSize,
368        anchor: NppiPoint,
369    ) -> Result<()> { unsafe {
370        let l = nppif()?;
371        let cu = l.nppi_filter_box_8u_c1r()?;
372        check(cu(src, src_step, dst, dst_step, dst_roi, mask_size, anchor))
373    }}
374
375    /// Report the device-scratch buffer size required by
376    /// [`sum_32f_c1r`] for an image of size `roi`.
377    pub fn sum_buffer_size_32f_c1r(roi: NppiSize) -> Result<usize> {
378        let l = nppist()?;
379        let cu = l.nppi_sum_get_buffer_host_size_32f_c1r()?;
380        let mut bytes: i32 = 0;
381        check(unsafe { cu(roi, &mut bytes) })?;
382        Ok(bytes as usize)
383    }
384
385    /// Sum of a 32f single-channel image over `roi`, writing the
386    /// scalar result into `sum_out` (device buffer of len ≥ 1).
387    /// Caller provides the scratch buffer (use
388    /// [`sum_buffer_size_32f_c1r`] to size it).
389    ///
390    /// # Safety
391    ///
392    /// `src` must cover `roi.height × src_step` bytes.
393    pub unsafe fn sum_32f_c1r(
394        src: *const f32,
395        src_step: i32,
396        roi: NppiSize,
397        sum_out: *mut f64,
398        scratch: *mut u8,
399    ) -> Result<()> { unsafe {
400        let l = nppist()?;
401        let cu = l.nppi_sum_32f_c1r()?;
402        check(cu(src, src_step, roi, scratch, sum_out))
403    }}
404}
405
406/// Deprecated top-level alias for [`signal::add_32f_in_place`].
407#[deprecated(since = "0.2.0", note = "use baracuda_npp::signal::add_32f_in_place")]
408pub fn adds_32f_in_place(
409    src: &DeviceBuffer<f32>,
410    src_dst: &mut DeviceBuffer<f32>,
411    n: i32,
412) -> Result<()> {
413    signal::add_32f_in_place(src, src_dst, n)
414}
415
// Keeps the `c_void` import referenced so it does not trigger an
// "unused import" warning while nothing else in this module uses it.
#[allow(dead_code)]
fn _touch() -> *mut c_void {
    core::ptr::null_mut::<c_void>()
}
baracuda_npp/lib.rs

baracuda_npp/
lib.rs