nvjpeg_sys/lib.rs

//! Raw FFI Rust bindings to nvJPEG.

#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(non_upper_case_globals)]

mod bindings;

pub use bindings::*;

#[macro_export]
macro_rules! check {
    ($status:ident, $err:literal) => {
        if $status != 0 {
            Err(format!("{}. Error occurred with code: {}", $err, $status))?
        }
    };
}
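
// A minimal sketch (not part of the original API) showing how `check!` is meant to be
// used: wrap a raw nvJPEG status code inside a function that returns a `Result`.
// The function name and the plain `i32` status below are illustrative assumptions;
// in real code the status would come from an unsafe nvJPEG FFI call.
#[allow(dead_code)]
fn check_usage_sketch(status: i32) -> Result<(), String> {
    check!(status, "nvJPEG call failed");
    Ok(())
}
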
#[cfg(test)]
mod tests {

    use custos::{
        buf,
        cuda::launch_kernel,
        prelude::{CUBuffer, Float, Number},
        static_api::static_cuda,
        Buffer, CDatatype,
    };

    pub fn correlate_cu2<T: Number + CDatatype>(
        input: &CUBuffer<T>,
        filter: &CUBuffer<T>,
        out: &mut CUBuffer<T>,
        inp_rows: usize,
        inp_cols: usize,
        filter_rows: usize,
        filter_cols: usize,
    ) {
        let (out_rows, out_cols) = (inp_rows - filter_rows + 1, inp_cols - filter_cols + 1);

        const THREADS: u32 = 8;

        // launch one thread per output element; round the grid up so it covers the
        // whole output, the kernel bounds-checks the excess threads
        let grid_x = (out_rows as f32 / THREADS as f32).ceil() as u32;
        let grid_y = (out_cols as f32 / THREADS as f32).ceil() as u32;

        let src = format!(
            r#"
        extern "C" __global__ void correlate2({dtype}* input, {dtype}* filter, {dtype}* out, int inp_rows, int inp_cols, int filter_rows, int filter_cols) {{
            int moveDown = blockDim.x * blockIdx.x + threadIdx.x;
            int moveRight = blockDim.y * blockIdx.y + threadIdx.y;

            int outRows = inp_rows - filter_rows + 1;
            int outCols = inp_cols - filter_cols + 1;

            if (moveDown >= outRows) {{
                return;
            }}
            if (moveRight >= outCols) {{
                return;
            }}
            {dtype} sum = 0;
            for (int filterRow = 0; filterRow < filter_rows; filterRow++) {{
                int inputIdx = moveDown * inp_cols + moveRight + filterRow * inp_cols;
                for (int filterCol = 0; filterCol < filter_cols; filterCol++) {{
                    sum += input[inputIdx + filterCol] * filter[filterRow * filter_cols + filterCol];
                }}
            }}
            out[moveDown * outCols + moveRight] = sum;
        }}
    "#,
            dtype = T::as_c_type_str()
        );

        launch_kernel(
            input.device(),
            [grid_x, grid_y, 1],
            [THREADS, THREADS, 1],
            0,
            &src,
            "correlate2",
            &[
                input,
                filter,
                out,
                &inp_rows,
                &inp_cols,
                &filter_rows,
                &filter_cols,
            ],
        )
        .unwrap();
    }

    pub fn correlate_cu_use_z<T: Number + CDatatype>(
        input: &CUBuffer<T>,
        filter: &CUBuffer<T>,
        out: &mut CUBuffer<T>,
        inp_rows: usize,
        inp_cols: usize,
        filter_rows: usize,
        filter_cols: usize,
    ) {
        let (out_rows, out_cols) = (inp_rows - filter_rows + 1, inp_cols - filter_cols + 1);

        const THREADS: u32 = 8;

        // one thread per output element; the z dimension is currently unused
        let grid_x = (out_rows as f32 / THREADS as f32).ceil() as u32;
        let grid_y = (out_cols as f32 / THREADS as f32).ceil() as u32;
        //let grid_z = ( as f32 / THREADS as f32).ceil() as u32;

        let src = format!(
            r#"
        extern "C" __global__ void correlateWithZ({dtype}* input, {dtype}* filter, {dtype}* out, int inp_rows, int inp_cols, int filter_rows, int filter_cols) {{

            /*extern __shared__ {dtype} filterData[];

            for (int filterRow = 0; filterRow < filter_rows; filterRow++) {{
                for (int filterCol = 0; filterCol < filter_cols; filterCol++) {{
                    filterData[filterRow * filter_cols + filterCol] = filter[filterRow * filter_cols + filterCol];
                }}
            }}

            __syncthreads();*/

            int moveDown = blockDim.x * blockIdx.x + threadIdx.x;
            int moveRight = blockDim.y * blockIdx.y + threadIdx.y;
            //int filterRow = threadIdx.z;

            int outRows = inp_rows - filter_rows + 1;
            int outCols = inp_cols - filter_cols + 1;

            if (moveDown >= outRows) {{
                return;
            }}
            if (moveRight >= outCols) {{
                return;
            }}
            {dtype} sum = 0;

            for (int filterRow = 0; filterRow < filter_rows; filterRow++) {{
                int inputIdx = moveDown * inp_cols + moveRight + filterRow * inp_cols;
                for (int filterCol = 0; filterCol < filter_cols; filterCol++) {{
                    sum += input[inputIdx + filterCol] * filter[filterRow * filter_cols + filterCol];
                }}
            }}
            out[moveDown * outCols + moveRight] = sum;
        }}
    "#,
            dtype = T::as_c_type_str()
        );

        launch_kernel(
            input.device(),
            [grid_x, grid_y, 1],
            [THREADS, THREADS, 1],
            (filter_rows * filter_cols * std::mem::size_of::<T>()) as u32,
            &src,
            "correlateWithZ",
            &[
                input,
                filter,
                out,
                &inp_rows,
                &inp_cols,
                &filter_rows,
                &filter_cols,
            ],
        )
        .unwrap();
    }

    #[test]
    fn test_correlate_cu2_larger() {
        let height = 1080;
        let width = 1920;

        let data = (0..height * width)
            .map(|x| x as f32)
            .collect::<Vec<f32>>();
        let data = Buffer::from((static_cuda(), data));

        let filter_rows = 10;
        let filter_cols = 10;

        let filter = buf![1./3.; filter_rows * filter_cols].to_gpu();
        let mut out = buf![0.; (height-filter_rows+1) * (width-filter_cols+1)].to_gpu();

        correlate_cu2(
            &data,
            &filter,
            &mut out,
            height,
            width,
            filter_rows,
            filter_cols,
        );

        //println!("out: {out:?}");

        let mut cpu_out = buf![0.; out.len()];

        correlate_valid_mut(
            &data.to_cpu(),
            (height, width),
            &filter.to_cpu(),
            (filter_rows, filter_cols),
            &mut cpu_out,
        );

        assert_eq_with_tolerance(&cpu_out.read(), &out.read(), 100.0);
    }

    #[test]
    fn test_correlate_cu_larger_assert() {
        let height = 1080;
        let width = 1920;

        for height in 1080..=1080 {
            println!("height: {}", height);
            for width in 1920..=1920 {
                let data = (0..height * width)
                    .map(|x| x as f32)
                    .collect::<Vec<f32>>();
                let data = Buffer::from((static_cuda(), data));

                let filter_rows = 10;
                let filter_cols = 10;

                let filter = buf![1./3.; filter_rows * filter_cols].to_gpu();
                let mut out = buf![0.; (height-filter_rows+1) * (width-filter_cols+1)].to_gpu();

                correlate_cu2(
                    &data,
                    &filter,
                    &mut out,
                    height,
                    width,
                    filter_rows,
                    filter_cols,
                );

                //println!("out: {out:?}");

                let mut cpu_out = buf![0.; out.len()];

                correlate_valid_mut(
                    &data.to_cpu(),
                    (height, width),
                    &filter.to_cpu(),
                    (filter_rows, filter_cols),
                    &mut cpu_out,
                );

                assert_eq_with_tolerance(&cpu_out.read(), &out.read(), 100.0);
            }
        }

        let data = (0..height * width)
            .map(|x| x as f32)
            .collect::<Vec<f32>>();
        let data = Buffer::from((static_cuda(), data));

        let filter = buf![1./3.; 9].to_gpu();
        let mut out = buf![0.; (height-3+1) * (width-3+1)].to_gpu();

        correlate_cu2(&data, &filter, &mut out, height, width, 3, 3);

        //println!("out: {out:?}");

        let mut cpu_out = buf![0.; out.len()];

        correlate_valid_mut(
            &data.to_cpu(),
            (height, width),
            &filter.to_cpu(),
            (3, 3),
            &mut cpu_out,
        );

        assert_eq_with_tolerance(&cpu_out.read(), &out.read(), 0.1);
    }

    pub fn assert_eq_with_tolerance<T: Float>(a: &[T], b: &[T], tolerance: T) {
        assert_eq!(a.len(), b.len());
        for i in 0..a.len() {
            if (a[i] - b[i]).abs() >= tolerance {
                panic!(
                    "LHS {:?} does not match RHS {:?}: values {} and {} differ at index {}",
                    a, b, a[i], b[i], i
                );
            }
        }
    }

    /// CPU reference implementation of a "valid" cross-correlation, used to check
    /// the CUDA kernels above.
    pub fn correlate_valid_mut<T: Number>(
        lhs_slice: &[T],
        lhs_dims: (usize, usize),
        kernel_slice: &[T],
        kernel_dims: (usize, usize),
        out: &mut [T],
    ) {
        let (lhs_rows, lhs_cols) = lhs_dims;
        let (kernel_rows, kernel_cols) = kernel_dims;

        let (out_rows, out_cols) = (lhs_rows - kernel_rows + 1, lhs_cols - kernel_cols + 1);

        // row axis (y): moves the kernel window one row down per iteration
        for y in 0..out_rows {
            // col axis (x): moves the kernel window one column to the right per iteration
            for x in 0..out_cols {
                let mut sum = T::default();
                // accumulate the element-wise products, one kernel row at a time
                for idx in 0..kernel_rows {
                    let index = idx * lhs_cols + x + y * lhs_cols;
                    let lhs_kernel_row = &lhs_slice[index..index + kernel_cols];

                    let index = idx * kernel_cols;
                    let kernel_row = &kernel_slice[index..index + kernel_cols];

                    for (i, value) in lhs_kernel_row.iter().enumerate() {
                        sum += *value * kernel_row[i];
                    }
                }
                out[y * out_cols + x] = sum;
            }
        }
    }

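    // Added illustrative test (not in the original suite): a hand-checked 3x3 input
    // correlated with a 2x2 kernel of ones, so each output element is the sum of a
    // 2x2 window of the input.
    #[test]
    fn test_correlate_valid_mut_small() {
        let input = [1., 2., 3., 4., 5., 6., 7., 8., 9.];
        let kernel = [1., 1., 1., 1.];
        let mut out = [0.; 4];

        correlate_valid_mut(&input, (3, 3), &kernel, (2, 2), &mut out);

        // windows: (1+2+4+5), (2+3+5+6), (4+5+7+8), (5+6+8+9)
        assert_eq!(out, [12., 16., 24., 28.]);
    }
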
    /// Copies `input` into the interior of `out`, which must already be allocated with
    /// (inp_rows + 2 * y_padding) x (inp_cols + 2 * x_padding) elements; the padding
    /// border itself is left untouched (zero-initialized by the caller).
    pub fn cu_padding<T: CDatatype>(
        input: &CUBuffer<T>,
        out: &mut CUBuffer<T>,
        inp_rows: usize,
        inp_cols: usize,
        x_padding: usize,
        y_padding: usize,
    ) {
        let grid_x = ((inp_cols + x_padding * 2) as f32 / 16.).ceil() as u32;
        let grid_y = ((inp_rows + y_padding * 2) as f32 / 16.).ceil() as u32;

        let src = format!(
            r#"
        extern "C" __global__ void addPadding({dtype}* input, {dtype}* out, int inpRows, int inpCols, int xPadding, int yPadding) {{
            int col = blockDim.x * blockIdx.x + threadIdx.x;
            int row = blockDim.y * blockIdx.y + threadIdx.y;

            if (row >= inpRows || col >= inpCols) {{
                return;
            }}

            out[yPadding * (inpCols + 2*xPadding) + row * (inpCols + 2 * xPadding) + col + xPadding] = input[row * inpCols + col];
        }}
    "#,
            dtype = T::as_c_type_str()
        );
        launch_kernel(
            input.device(),
            [grid_x, grid_y, 1],
            [16, 16, 1],
            0,
            &src,
            "addPadding",
            &[input, out, &inp_rows, &inp_cols, &x_padding, &y_padding],
        )
        .unwrap();
    }

    /// CPU reference for `cu_padding`: embeds `inputs` in a zero-initialized matrix
    /// with `x_padding` extra columns on each side and `y_padding` extra rows on top
    /// and bottom.
    pub fn add_padding<T: Number>(
        inputs: &[T],
        inp_rows: usize,
        inp_cols: usize,
        x_padding: usize,
        y_padding: usize,
    ) -> Vec<T> {
        let mut padded_inputs =
            vec![T::zero(); (inp_rows + y_padding * 2) * (inp_cols + x_padding * 2)];

        for inp_row in 0..inp_rows {
            for inp_col in 0..inp_cols {
                padded_inputs[y_padding * (inp_cols + 2 * x_padding)
                    + x_padding
                    + inp_row * (inp_cols + 2 * x_padding)
                    + inp_col] = inputs[inp_row * inp_cols + inp_col];
            }
        }
        padded_inputs
    }

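    // Added illustrative test (not in the original suite): padding a 2x2 input with a
    // one-element border of zeros on every side yields a 4x4 matrix with the input
    // centered.
    #[test]
    fn test_add_padding_small() {
        let padded = add_padding(&[1., 2., 3., 4.], 2, 2, 1, 1);
        assert_eq!(
            padded,
            vec![
                0., 0., 0., 0.,
                0., 1., 2., 0.,
                0., 3., 4., 0.,
                0., 0., 0., 0.,
            ]
        );
    }
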
    #[test]
    fn test_cu_padding_to_cpu_padding() {
        let inp_rows = 1080;
        let inp_cols = 1920;
        let x_padding = 4;
        let y_padding = 4;

        let inputs = vec![1.; inp_rows * inp_cols];

        let padded_inputs = add_padding(&inputs, inp_rows, inp_cols, x_padding, y_padding);

        let mut gpu_inputs = buf![0.; inputs.len()].to_gpu();
        let mut gpu_padded_inputs = buf![0.; padded_inputs.len()].to_gpu();

        gpu_inputs.write(&inputs);
        cu_padding(
            &gpu_inputs,
            &mut gpu_padded_inputs,
            inp_rows,
            inp_cols,
            x_padding,
            y_padding,
        );

        /*for (idx, padded_val) in gpu_padded_inputs.read().iter().enumerate() {
            print!("{padded_val}, ");
            if (idx + 1) % (inp_cols + 2*x_padding) == 0 {
                println!()
            }
        }*/

        assert_eq_with_tolerance(&gpu_padded_inputs.read(), &padded_inputs, 0.1);
    }
}