1#![allow(non_camel_case_types)]
4#![allow(non_snake_case)]
5#![allow(non_upper_case_globals)]
6
7mod bindings;
8
9pub use bindings::*;
10
/// Turns a non-zero status code into an `Err(String)` and propagates it with
/// `?`, so the enclosing function must return `Result<_, E>` where
/// `E: From<String>`.
///
/// `$status`: any integer expression compared against `0` (generalized from
/// `ident` so call sites may pass e.g. `res.status` or a function call).
/// `$err`: message describing the failed operation (generalized from
/// `literal` so formatted/owned messages work too).
#[macro_export]
macro_rules! check {
    ($status:expr, $err:expr) => {
        if $status != 0 {
            Err(format!("{}. Error occurred with code: {}", $err, $status))?
        }
    };
}
19#[cfg(test)]
20mod tests {
21
22 use custos::{
23 buf,
24 cuda::launch_kernel,
25 prelude::{CUBuffer, Float, Number},
26 static_api::static_cuda,
27 Buffer, CDatatype,
28 };
29
30 pub fn correlate_cu2<T: Number + CDatatype>(
31 input: &CUBuffer<T>,
32 filter: &CUBuffer<T>,
33 out: &mut CUBuffer<T>,
34 inp_rows: usize,
35 inp_cols: usize,
36 filter_rows: usize,
37 filter_cols: usize,
38 ) {
39 let (out_rows, out_cols) = (inp_rows - filter_rows + 1, inp_cols - filter_cols + 1);
40
41 const THREADS: u32 = 8;
42
43 let grid_x = (inp_rows as f32 / THREADS as f32).ceil() as u32;
45 let grid_y = (inp_cols as f32 / THREADS as f32).ceil() as u32;
46 let src = format!(
49 r#"
50 extern "C" __global__ void correlate2({dtype}* input, {dtype}* filter, {dtype}* out, int inp_rows, int inp_cols, int filter_rows, int filter_cols) {{
51 int moveDown = blockDim.x * blockIdx.x + threadIdx.x;
52 int moveRight = blockDim.y * blockIdx.y + threadIdx.y;
53
54 int outRows = inp_rows - filter_rows + 1;
55 int outCols = inp_cols - filter_cols + 1;
56
57 if (moveDown >= outRows) {{
58 return;
59 }}
60 if (moveRight >= outCols) {{
61 return;
62 }}
63 {dtype} sum = 0;
64 for (int filterRow = 0; filterRow < filter_rows; filterRow++) {{
65 int inputIdx = moveDown * inp_cols + moveRight + filterRow * inp_cols;
66 for (int filterCol = 0; filterCol < filter_cols; filterCol++) {{
67 sum += input[inputIdx + filterCol] * filter[filterRow * filter_cols + filterCol];
68 }}
69 }}
70 out[moveDown * outCols + moveRight] = sum;
71 }}
72 "#,
73 dtype = T::as_c_type_str()
74 );
75
76 launch_kernel(
77 input.device(),
78 [grid_x, grid_y, 1],
79 [THREADS, THREADS, 1],
80 0,
81 &src,
82 "correlate2",
83 &[
84 input,
85 filter,
86 out,
87 &inp_rows,
88 &inp_cols,
89 &filter_rows,
90 &filter_cols,
91 ],
92 )
93 .unwrap();
94 }
95
96 pub fn correlate_cu_use_z<T: Number + CDatatype>(
97 input: &CUBuffer<T>,
98 filter: &CUBuffer<T>,
99 out: &mut CUBuffer<T>,
100 inp_rows: usize,
101 inp_cols: usize,
102 filter_rows: usize,
103 filter_cols: usize,
104 ) {
105 let (out_rows, out_cols) = (inp_rows - filter_rows + 1, inp_cols - filter_cols + 1);
106
107 const THREADS: u32 = 8;
108
109 let grid_x = (inp_rows as f32 / THREADS as f32).ceil() as u32;
111 let grid_y = (inp_cols as f32 / THREADS as f32).ceil() as u32;
112 let src = format!(
115 r#"
116 extern "C" __global__ void correlateWithZ({dtype}* input, {dtype}* filter, {dtype}* out, int inp_rows, int inp_cols, int filter_rows, int filter_cols) {{
117
118 /*extern __shared__ {dtype} filterData[];
119
120 for (int filterRow = 0; filterRow < filter_rows; filterRow++) {{
121 for (int filterCol = 0; filterCol < filter_cols; filterCol++) {{
122 filterData[filterRow * filter_cols + filterCol] = filter[filterRow * filter_cols + filterCol];
123 }}
124 }}
125
126 __syncthreads();*/
127
128
129
130 int moveDown = blockDim.x * blockIdx.x + threadIdx.x;
131 int moveRight = blockDim.y * blockIdx.y + threadIdx.y;
132 //int filterRow = threadIdx.z;
133
134 int outRows = inp_rows - filter_rows + 1;
135 int outCols = inp_cols - filter_cols + 1;
136
137 if (moveDown >= outRows) {{
138 return;
139 }}
140 if (moveRight >= outCols) {{
141 return;
142 }}
143 {dtype} sum = 0;
144
145 for (int filterRow = 0; filterRow < filter_rows; filterRow++) {{
146 int inputIdx = moveDown * inp_cols + moveRight + filterRow * inp_cols;
147 for (int filterCol = 0; filterCol < filter_cols; filterCol++) {{
148 sum += input[inputIdx + filterCol] * filter[filterRow * filter_cols + filterCol];
149 }}
150 }}
151 out[moveDown * outCols + moveRight] = sum;
152 }}
153 "#,
154 dtype = T::as_c_type_str()
155 );
156
157 launch_kernel(
158 input.device(),
159 [grid_x, grid_y, 1],
160 [THREADS, THREADS, 1],
161 (filter_rows * filter_cols * std::mem::size_of::<T>()) as u32,
162 &src,
163 "correlateWithZ",
164 &[
165 input,
166 filter,
167 out,
168 &inp_rows,
169 &inp_cols,
170 &filter_rows,
171 &filter_cols,
172 ],
173 )
174 .unwrap();
175 }
176
177 #[test]
178 fn test_correleate_cu2_larger() {
179 let height = 1080;
180 let width = 1920;
181
182 let data = (0..height * width)
183 .into_iter()
184 .map(|x| x as f32)
185 .collect::<Vec<f32>>();
186 let data = Buffer::from((static_cuda(), data));
187
188 let filter_rows = 10;
189 let filter_cols = 10;
190
191 let filter = buf![1./3.; filter_rows * filter_cols].to_gpu();
192 let mut out = buf![0.; (height-filter_rows+1) * (width-filter_cols+1)].to_gpu();
193
194 correlate_cu2(
195 &data,
196 &filter,
197 &mut out,
198 height,
199 width,
200 filter_rows,
201 filter_cols,
202 );
203
204 let mut cpu_out = buf![0.; out.len()];
207
208 correlate_valid_mut(
209 &data.to_cpu(),
210 (height, width),
211 &filter.to_cpu(),
212 (filter_rows, filter_cols),
213 &mut cpu_out,
214 );
215
216 assert_eq_with_tolerance(&cpu_out.read(), &out.read(), 100.0);
217 }
218
    /// Compares GPU and CPU correlation over a (currently degenerate) sweep
    /// of input sizes, then re-checks one 3x3 case with a tight tolerance.
    ///
    /// NOTE(review): both loop ranges are single-valued (`1080..=1080`,
    /// `1920..=1920`) — presumably narrowed from a wider sweep while
    /// debugging; widen them to actually sweep sizes. TODO confirm intent.
    #[test]
    fn test_correlate_cu_larger_assert() {
        #[rustfmt::skip]
        let height = 1080;
        let width = 1920;

        // The loop variables below shadow the outer `height`/`width`; the
        // outer bindings are used again only by the 3x3 pass after the loops.
        for height in 1080..=1080 {
            println!("height: {}", height);
            for width in 1920..=1920 {
                // Row-major ramp 0, 1, 2, ... as f32, uploaded to the GPU.
                let data = (0..height * width)
                    .into_iter()
                    .map(|x| x as f32)
                    .collect::<Vec<f32>>();
                let data = Buffer::from((static_cuda(), data));

                let filter_rows = 10;
                let filter_cols = 10;

                // 10x10 filter with constant weight 1/3 per tap.
                let filter = buf![1./3.; filter_rows * filter_cols].to_gpu();
                let mut out = buf![0.; (height-filter_rows+1) * (width-filter_cols+1)].to_gpu();

                correlate_cu2(
                    &data,
                    &filter,
                    &mut out,
                    height,
                    width,
                    filter_rows,
                    filter_cols,
                );

                // CPU reference result for comparison.
                let mut cpu_out = buf![0.; out.len()];

                correlate_valid_mut(
                    &data.to_cpu(),
                    (height, width),
                    &filter.to_cpu(),
                    (filter_rows, filter_cols),
                    &mut cpu_out,
                );

                // Loose tolerance — presumably to absorb f32 accumulation-order
                // drift between GPU and CPU over the 100-tap sums.
                assert_eq_with_tolerance(&cpu_out.read(), &out.read(), 100.0);
            }
        }

        // Second pass: same ramp input, 3x3 filter, much tighter tolerance.
        let data = (0..height * width)
            .into_iter()
            .map(|x| x as f32)
            .collect::<Vec<f32>>();
        let data = Buffer::from((static_cuda(), data));

        let filter = buf![1./3.; 9].to_gpu();
        let mut out = buf![0.; (height-3+1) * (width-3+1)].to_gpu();

        correlate_cu2(&data, &filter, &mut out, height, width, 3, 3);

        let mut cpu_out = buf![0.; out.len()];

        correlate_valid_mut(
            &data.to_cpu(),
            (height, width),
            &filter.to_cpu(),
            (3, 3),
            &mut cpu_out,
        );

        assert_eq_with_tolerance(&cpu_out.read(), &out.read(), 0.1);
    }
291
292 pub fn assert_eq_with_tolerance<T: Float>(a: &[T], b: &[T], tolerance: T) {
293 assert_eq!(a.len(), b.len());
294 for i in 0..a.len() {
295 if (a[i] - b[i]).abs() >= tolerance {
296 panic!(
297 "
298LHS SIDE: {:?},
299 does not match with
300RHS SIDE: {:?} which value?: {}, {}",
301 a, b, a[i], b[i]
302 );
303 }
304 }
305 }
306
307 pub fn correlate_valid_mut<T: Number>(
308 lhs_slice: &[T],
309 lhs_dims: (usize, usize),
310 kernel_slice: &[T],
311 kernel_dims: (usize, usize),
312 out: &mut [T],
313 ) {
314 let (lhs_rows, lhs_cols) = lhs_dims;
315 let (kernel_rows, kernel_cols) = kernel_dims;
316
317 let (out_rows, out_cols) = (lhs_rows - kernel_rows + 1, lhs_cols - kernel_cols + 1);
318
319 for y in 0..out_rows {
322 for x in 0..out_cols {
325 let mut sum = T::default();
326 for idx in 0..kernel_rows {
328 let index = idx * lhs_cols + x + y * lhs_cols;
329 let lhs_kernel_row = &lhs_slice[index..index + kernel_cols];
330
331 let index = idx * kernel_cols;
332 let kernel_row = &kernel_slice[index..index + kernel_cols];
333
334 for (i, value) in lhs_kernel_row.iter().enumerate() {
335 sum += *value * kernel_row[i];
336 }
337 }
338 out[y * out_cols + x] = sum;
340 }
341 }
342 }
343
344 pub fn cu_padding<T: CDatatype>(
345 input: &CUBuffer<T>,
346 out: &mut CUBuffer<T>,
347 inp_rows: usize,
348 inp_cols: usize,
349 x_padding: usize,
350 y_padding: usize,
351 ) {
352 let grid_x = ((inp_cols + x_padding * 2) as f32 / 16.).ceil() as u32;
353 let grid_y = ((inp_rows + y_padding * 2) as f32 / 16.).ceil() as u32;
354
355 let src = format!(
356 r#"
357 extern "C" __global__ void addPadding({dtype}* input, {dtype}* out, int inpRows, int inpCols, int xPadding, int yPadding) {{
358 int col = blockDim.x * blockIdx.x + threadIdx.x;
359 int row = blockDim.y * blockIdx.y + threadIdx.y;
360
361 if (row >= inpRows || col >= inpCols) {{
362 return;
363 }}
364
365 out[yPadding * (inpCols + 2*xPadding) + row * (inpCols + 2 * xPadding) + col + xPadding] = input[row * inpCols + col];
366 }}
367 "#,
368 dtype = T::as_c_type_str()
369 );
370 launch_kernel(
371 input.device(),
372 [grid_x, grid_y, 1],
373 [16, 16, 1],
374 0,
375 &src,
376 "addPadding",
377 &[input, out, &inp_rows, &inp_cols, &x_padding, &y_padding],
378 )
379 .unwrap();
380 }
381
382 pub fn add_padding<T: Number>(
383 inputs: &[T],
384 inp_rows: usize,
385 inp_cols: usize,
386 x_padding: usize,
387 y_padding: usize,
388 ) -> Vec<T> {
389 let mut padded_inputs =
390 vec![T::zero(); (inp_rows + y_padding * 2) * (inp_cols + x_padding * 2)];
391
392 for inp_row in 0..inp_rows {
393 for inp_col in 0..inp_cols {
394 padded_inputs[y_padding * (inp_cols + 2 * x_padding)
395 + x_padding
396 + inp_row * (inp_cols + 2 * x_padding)
397 + inp_col] = inputs[inp_row * inp_cols + inp_col];
398 }
399 }
400 padded_inputs
401 }
402
403 #[test]
404 fn test_cu_padding_to_cpu_padding() {
405 let inp_rows = 1080;
406 let inp_cols = 1920;
407 let x_padding = 4;
408 let y_padding = 4;
409
410 let inputs = vec![1.; inp_rows * inp_cols];
411
412 let padded_inputs = add_padding(&inputs, inp_rows, inp_cols, x_padding, y_padding);
413
414 let mut gpu_inputs = buf![0.; inputs.len()].to_gpu();
415 let mut gpu_padded_inputs = buf![0.; padded_inputs.len()].to_gpu();
416
417 gpu_inputs.write(&inputs);
418 cu_padding(
419 &gpu_inputs,
420 &mut gpu_padded_inputs,
421 inp_rows,
422 inp_cols,
423 x_padding,
424 y_padding,
425 );
426
427 assert_eq_with_tolerance(&gpu_padded_inputs.read(), &padded_inputs, 0.1);
435 }
436}