use std::{io::Write, mem::size_of, ops::Mul, ptr::null_mut, time::Instant};
use custos::{
buf,
cuda::{api::culaunch_kernel, fn_cache, launch_kernel},
prelude::{launch_kernel1d, CUBuffer, Float, Number},
static_api::static_cuda,
Buffer, CDatatype, Device, CUDA,
};
use nvjpeg_sys::{
check, nvjpegChromaSubsampling_t, nvjpegCreateSimple, nvjpegDecode, nvjpegDestroy,
nvjpegGetImageInfo, nvjpegHandle_t, nvjpegImage_t, nvjpegJpegStateCreate,
nvjpegJpegStateDestroy, nvjpegJpegState_t, nvjpegOutputFormat_t_NVJPEG_OUTPUT_RGB,
};
fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let raw_data = std::fs::read("cat_798x532.jpg")?;
let device = CUDA::new(0)?;
let image = unsafe { decode_raw_jpeg(&raw_data, &device)? };
Ok(())
}
#[derive(Debug, Default)]
pub struct Image {}
unsafe fn decode_raw_jpeg(
raw_data: &[u8],
device: &CUDA,
) -> Result<Image, Box<dyn std::error::Error + Send + Sync>> {
let mut handle: nvjpegHandle_t = null_mut();
let status = nvjpegCreateSimple(&mut handle);
check!(status, "Could not create simple handle. ");
let mut jpeg_state: nvjpegJpegState_t = null_mut();
let status = nvjpegJpegStateCreate(handle, &mut jpeg_state);
check!(status, "Could not create jpeg state. ");
let mut n_components = 0;
let mut subsampling: nvjpegChromaSubsampling_t = 0;
let mut widths = [0, 0, 0];
let mut heights = [0, 0, 0];
let status = nvjpegGetImageInfo(
handle,
raw_data.as_ptr(),
raw_data.len(),
&mut n_components,
&mut subsampling,
widths.as_mut_ptr(),
heights.as_mut_ptr(),
);
check!(status, "Could not get image info. ");
heights[0] = heights[1] * 2;
println!("n_components: {n_components}, subsampling: {subsampling}, widths: {widths:?}, heights: {heights:?}");
let mut image: nvjpegImage_t = nvjpegImage_t::new();
image.pitch[0] = widths[0] as usize;
image.pitch[1] = widths[0] as usize;
image.pitch[2] = widths[0] as usize;
let channel0 = Buffer::<u8, _>::new(device, image.pitch[0] * heights[0] as usize);
let channel1 = Buffer::<u8, _>::new(device, image.pitch[0] * heights[0] as usize);
let channel2 = Buffer::<u8, _>::new(device, image.pitch[0] * heights[0] as usize);
image.channel[0] = channel0.cu_ptr() as *mut _;
image.channel[1] = channel1.cu_ptr() as *mut _;
image.channel[2] = channel2.cu_ptr() as *mut _;
let status = nvjpegDecode(
handle,
jpeg_state,
raw_data.as_ptr(),
raw_data.len(),
nvjpegOutputFormat_t_NVJPEG_OUTPUT_RGB,
&mut image,
device.stream().0 as *mut _,
);
check!(status, "Could not decode image. ");
let channel0 = channel0.read();
let channel1 = channel1.read();
let channel2 = channel2.read();
let filter_rows = 300;
let filter_cols = 300;
let filter = vec![1. / (filter_rows * filter_cols) as f32; filter_rows * filter_cols];
let mut channel0_out = vec![0.; channel0.len()];
correlate_fully(
&channel0.iter().map(|x| *x as f32).collect::<Vec<_>>(),
&filter,
&mut channel0_out,
heights[0] as usize,
widths[0] as usize,
filter_rows,
filter_cols,
);
let mut channel1_out = vec![0.; channel1.len()];
correlate_fully(
&channel1.iter().map(|x| *x as f32).collect::<Vec<_>>(),
&filter,
&mut channel1_out,
heights[0] as usize,
widths[0] as usize,
filter_rows,
filter_cols,
);
let mut channel2_out = vec![0.; channel2.len()];
correlate_fully(
&channel2.iter().map(|x| *x as f32).collect::<Vec<_>>(),
&filter,
&mut channel2_out,
heights[0] as usize,
widths[0] as usize,
filter_rows,
filter_cols,
);
let channel0_out = channel0_out
.into_iter()
.map(|x| x as u8)
.collect::<Vec<_>>();
let channel1_out = channel1_out
.into_iter()
.map(|x| x as u8)
.collect::<Vec<_>>();
let channel2_out = channel2_out
.into_iter()
.map(|x| x as u8)
.collect::<Vec<_>>();
let file = std::fs::File::create("cat_798x532.ppm")?;
let mut writer = std::io::BufWriter::new(file);
writer.write(format!("P6\n{} {}\n255\n", widths[0], heights[0]).as_bytes())?;
for row in 0..heights[0] {
let row = row as usize;
for col in 0..widths[0] {
let col = col as usize;
writer.write(&[
channel0_out[row * widths[0] as usize + col],
channel1_out[row * widths[0] as usize + col],
channel2_out[row * widths[0] as usize + col],
])?;
}
}
writer.flush()?;
Ok(Image::default())
}
pub fn correlate_cu_padded<T: Number + CDatatype>(
input: &CUBuffer<T>,
filter: &CUBuffer<T>,
out: &mut CUBuffer<T>,
inp_rows: usize,
inp_cols: usize,
filter_rows: usize,
filter_cols: usize,
) {
let x_padding = filter_cols - 1;
let y_padding = filter_rows - 1;
let inp_rows2 = inp_rows + y_padding * 2;
let inp_cols2 = inp_cols + x_padding * 2;
let src = format!(
r#"
extern "C" __global__ void correlate({dtype}* input, {dtype}* filter, {dtype}* out, int inp_rows, int inp_cols, int unpad_rows, int unpad_cols, int filter_rows, int filter_cols) {{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int xPadding = filter_cols - 1;
int yPadding = filter_rows - 1;
int moveDown = blockIdx.x / (inp_cols-filter_cols + 1);
int moveRight = blockIdx.x % (inp_cols-filter_cols + 1);
int startOfBlock = moveDown * inp_cols + moveRight;
int next = startOfBlock + (threadIdx.x / filter_cols) * inp_cols + (threadIdx.x % filter_cols);
(yPadding + unpad_rows) * inp_cols
if (next < 0) {{
return;
}}
//(yPadding + ) * inp_cols
if (next >= inp_rows * inp_cols || next < 0) {{
// return;
}}
//__shared__ {dtype} res[filter_rows * filter_cols];
extern __shared__ {dtype} res[];
res[threadIdx.x] = input[next] * filter[threadIdx.x];
__syncthreads();
// sum res and write to out
if (threadIdx.x == 0) {{
{dtype} sum = 0;
for (int i = 0; i < filter_rows * filter_cols; i++) {{
sum += res[i];
}}
out[blockIdx.x] = sum;
}}
//printf("block: %d, idx: %d, val: %f \n", blockIdx.x, next, input[next]);
//printf("shared val: %f \n", res[0]);
}}
"#,
dtype = T::as_c_type_str()
);
launch_kernel(
input.device(),
[
((inp_rows2 - filter_rows + 1) * (inp_cols2 - filter_cols + 1)) as u32,
1,
1,
],
[(filter_cols * filter_rows) as u32, 1, 1],
(filter_cols * filter_rows * size_of::<T>()) as u32,
&src,
"correlate",
&[
input,
filter,
out,
&inp_rows2,
&inp_cols2,
&inp_rows,
&inp_cols,
&filter_rows,
&filter_cols,
],
)
.unwrap();
}
#[test]
fn test_correlate_cu_padded() {
#[rustfmt::skip]
let data = buf![
1., 2., 3., 4.,
5., 6., 7., 8.,
9., 10., 11., 12.,
13., 14., 15., 16.,
17., 18., 19., 20.
]
.to_gpu();
let filter = buf![1.; 9].to_gpu();
let mut out = buf![0.; data.len()].to_gpu();
correlate_cu_padded(&data, &filter, &mut out, 5, 4, 3, 3);
println!("out: {out:?}");
}
pub fn cu_padding<T: CDatatype>(
input: &CUBuffer<T>,
out: &mut CUBuffer<T>,
inp_rows: usize,
inp_cols: usize,
x_padding: usize,
y_padding: usize,
) {
let grid_x = ((inp_cols + x_padding * 2) as f32 / 16.).ceil() as u32;
let grid_y = ((inp_rows + y_padding * 2) as f32 / 16.).ceil() as u32;
let src = format!(
r#"
extern "C" __global__ void addPadding({dtype}* input, {dtype}* out, int inpRows, int inpCols, int xPadding, int yPadding) {{
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (row >= inpRows || col >= inpCols) {{
return;
}}
out[yPadding * (inpCols + 2*xPadding) + row * (inpCols + 2 * xPadding) + col + xPadding] = input[row * inpCols + col];
}}
"#,
dtype = T::as_c_type_str()
);
launch_kernel(
input.device(),
[grid_x, grid_y, 1],
[16, 16, 1],
0,
&src,
"addPadding",
&[input, out, &inp_rows, &inp_cols, &x_padding, &y_padding],
)
.unwrap();
}
pub fn correlate_cu<T: Number + CDatatype>(
input: &CUBuffer<T>,
filter: &CUBuffer<T>,
out: &mut CUBuffer<T>,
inp_rows: usize,
inp_cols: usize,
filter_rows: usize,
filter_cols: usize,
) {
let src = format!(
r#"
extern "C" __global__ void correlate({dtype}* input, {dtype}* filter, {dtype}* out, int inp_rows, int inp_cols, int filter_rows, int filter_cols) {{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int moveDown = blockIdx.x / (inp_cols-filter_cols + 1);
//int moveRight = blockIdx.x % (inp_cols - filter_cols + 1);
int moveRight = blockIdx.x % (inp_cols-filter_cols + 1);
int startOfBlock = moveDown * inp_cols + moveRight;
int next = startOfBlock + (threadIdx.x / filter_cols) * inp_cols + (threadIdx.x % filter_cols);
//__shared__ {dtype} res[filter_rows * filter_cols];
extern __shared__ {dtype} res[];
res[threadIdx.x] = input[next] * filter[threadIdx.x];
__syncthreads();
// sum res and write to out
if (threadIdx.x == 0) {{
{dtype} sum = 0;
for (int i = 0; i < filter_rows * filter_cols; i++) {{
sum += res[i];
}}
out[blockIdx.x] = sum;
}}
//printf("block: %d, idx: %d, val: %f \n", blockIdx.x, next, input[next]);
//printf("shared val: %f \n", res[0]);
}}
"#,
dtype = T::as_c_type_str()
);
launch_kernel(
input.device(),
[
((inp_rows - filter_rows + 1) * (inp_cols - filter_cols + 1)) as u32,
1,
1,
],
[(filter_cols * filter_rows) as u32, 1, 1],
(filter_cols * filter_rows * size_of::<T>()) as u32,
&src,
"correlate",
&[
input,
filter,
out,
&inp_rows,
&inp_cols,
&filter_rows,
&filter_cols,
],
)
.unwrap();
}
#[test]
fn test_correlate_cu() {
let data = buf![
1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20.
]
.to_gpu();
let filter = buf![1.; 9].to_gpu();
let mut out = buf![0.; data.len()].to_gpu();
correlate_cu(&data, &filter, &mut out, 5, 4, 3, 3);
println!("out: {out:?}");
}
#[test]
fn test_correlate_cu_larger() {
let rows = 1920;
let cols = 1080;
let data = buf![
1.4; rows * cols
]
.to_gpu();
let filter_rows = 32;
let filter_cols = filter_rows;
let filter = buf![1.; filter_rows * filter_cols].to_gpu();
let mut out = buf![0.; (rows - filter_rows + 1) * (cols - filter_cols +1)].to_gpu();
correlate_cu2(
&data,
&filter,
&mut out,
rows,
cols,
filter_rows,
filter_cols,
);
out.device().stream().sync().unwrap();
let start = std::time::Instant::now();
correlate_cu2(
&data,
&filter,
&mut out,
rows,
cols,
filter_rows,
filter_cols,
);
out.device().stream().sync().unwrap();
println!("elapsed: {:?}", start.elapsed());
}
#[test]
fn test_cu_padding_la() {
let data = buf![1; 10000*14000].to_cuda();
let mut out = buf![0; (10000 + 2*2) * (14000+ 2*2)].to_cuda();
cu_padding(&data, &mut out, 10000, 14000, 2, 2);
data.device().stream().sync().unwrap();
let start = Instant::now();
cu_padding(&data, &mut out, 10000, 14000, 2, 2);
data.device().stream().sync().unwrap();
println!("elapsed: {:?}", start.elapsed());
}
#[test]
fn test_cu_padding() {
#[rustfmt::skip]
let data = buf![
1, 2, 3, 5,
4, 3, 2, 1,
8, 7, 4, 2,
7, 3, 2, 1,
8, 5, 3, 8
].to_cuda();
let mut out = buf![0; (5 + 2*2) * (4+ 2*2)].to_cuda();
cu_padding(&data, &mut out, 5, 4, 2, 2);
println!("out: {out:?}");
for (idx, padded_val) in out.to_cpu().iter().enumerate() {
print!("{padded_val}, ");
if (idx + 1) % (2 + 2 + 4) == 0 {
println!()
}
}
}
#[test]
fn test_add_padding() {
#[rustfmt::skip]
let data = [
1, 2, 3, 5,
4, 3, 2, 1,
8, 7, 4, 2,
7, 3, 2, 1,
8, 5, 3, 8
];
let padded = add_padding(&data, 5, 4, 2, 2);
for (idx, padded_val) in padded.iter().enumerate() {
print!("{padded_val}, ");
if (idx + 1) % (2 + 2 + 4) == 0 {
println!()
}
}
}
pub fn add_padding<T: Number>(
inputs: &[T],
inp_rows: usize,
inp_cols: usize,
x_padding: usize,
y_padding: usize,
) -> Vec<T> {
let mut padded_inputs =
vec![T::zero(); (inp_rows + y_padding * 2) * (inp_cols + x_padding * 2)];
for inp_row in 0..inp_rows {
for inp_col in 0..inp_cols {
padded_inputs[y_padding * (inp_cols + 2 * x_padding)
+ x_padding
+ inp_row * (inp_cols + 2 * x_padding)
+ inp_col] = inputs[inp_row * inp_cols + inp_col];
}
}
padded_inputs
}
pub fn correlate_cu2<T: Number + CDatatype>(
input: &CUBuffer<T>,
filter: &CUBuffer<T>,
out: &mut CUBuffer<T>,
inp_rows: usize,
inp_cols: usize,
filter_rows: usize,
filter_cols: usize,
) {
let (out_rows, out_cols) = (inp_rows - filter_rows + 1, inp_cols - filter_cols + 1);
const THREADS: u32 = 32;
let grid_x = (inp_rows as f32 / THREADS as f32).ceil() as u32;
let grid_y = (inp_cols as f32 / THREADS as f32).ceil() as u32;
let src = format!(
r#"
extern "C" __global__ void correlate2({dtype}* input, {dtype}* filter, {dtype}* out, int inp_rows, int inp_cols, int filter_rows, int filter_cols) {{
int moveDown = blockDim.x * blockIdx.x + threadIdx.x;
int moveRight = blockDim.y * blockIdx.y + threadIdx.y;
int outRows = inp_rows - filter_rows + 1;
int outCols = inp_cols - filter_cols + 1;
if (moveDown >= outRows) {{
return;
}}
if (moveRight >= outCols) {{
return;
}}
{dtype} sum = 0;
for (int filterRow = 0; filterRow < filter_rows; filterRow++) {{
int inputIdx = moveDown * inp_cols + moveRight + filterRow * inp_cols;
for (int filterCol = 0; filterCol < filter_cols; filterCol++) {{
sum += input[inputIdx + filterCol] * filter[filterRow * filter_cols + filterCol];
}}
}}
out[moveDown * outCols + moveRight] = sum;
}}
"#,
dtype = T::as_c_type_str()
);
launch_kernel(
input.device(),
[grid_x, grid_y, 1],
[THREADS, THREADS, 1],
0,
&src,
"correlate2",
&[
input,
filter,
out,
&inp_rows,
&inp_cols,
&filter_rows,
&filter_cols,
],
)
.unwrap();
}
#[test]
fn test_correlate_cu_2() {
#[rustfmt::skip]
let data = buf![
1., 2., 3., 4.,
5., 6., 7., 8.,
9., 10., 11., 12.,
13., 14., 15., 16.,
17., 18., 19., 20.
]
.to_gpu();
let filter = buf![1./3.; 9].to_gpu();
let mut out = buf![0.; (5-3+1) * (4-3+1)].to_gpu();
correlate_cu2(&data, &filter, &mut out, 5, 4, 3, 3);
println!("out: {out:?}");
let mut cpu_out = buf![0.; out.len()];
correlate_valid_mut(
&data.to_cpu(),
(5, 4),
&filter.to_cpu(),
(3, 3),
&mut cpu_out,
);
assert_eq!(cpu_out.read(), out.read());
}
#[test]
fn test_correlate_cu_larger_assert() {
#[rustfmt::skip]
let height = 1080;
let width = 1920;
for height in 950..1100 {
for width in 1500..1980 {
let data = (0..height * width)
.into_iter()
.map(|x| x as f32)
.collect::<Vec<f32>>();
let data = Buffer::from((static_cuda(), data));
let filter_rows = 10;
let filter_cols = 10;
let filter = buf![1./3.; filter_rows * filter_cols].to_gpu();
let mut out = buf![0.; (height-filter_rows+1) * (width-filter_cols+1)].to_gpu();
correlate_cu2(
&data,
&filter,
&mut out,
height,
width,
filter_rows,
filter_cols,
);
let mut cpu_out = buf![0.; out.len()];
correlate_valid_mut(
&data.to_cpu(),
(height, width),
&filter.to_cpu(),
(filter_rows, filter_cols),
&mut cpu_out,
);
assert_eq_with_tolerance(&cpu_out.read(), &out.read(), 100.0);
}
}
let data = (0..height * width)
.into_iter()
.map(|x| x as f32)
.collect::<Vec<f32>>();
let data = Buffer::from((static_cuda(), data));
let filter = buf![1./3.; 9].to_gpu();
let mut out = buf![0.; (height-3+1) * (width-3+1)].to_gpu();
correlate_cu2(&data, &filter, &mut out, height, width, 3, 3);
let mut cpu_out = buf![0.; out.len()];
correlate_valid_mut(
&data.to_cpu(),
(height, width),
&filter.to_cpu(),
(3, 3),
&mut cpu_out,
);
assert_eq_with_tolerance(&cpu_out.read(), &out.read(), 0.1);
}
pub fn assert_eq_with_tolerance<T: Float>(a: &[T], b: &[T], tolerance: T) {
assert_eq!(a.len(), b.len());
for i in 0..a.len() {
if (a[i] - b[i]).abs() >= tolerance {
panic!(
"
LHS SIDE: {:?},
does not match with
RHS SIDE: {:?} which value?: {}, {}",
a, b, a[i], b[i]
);
}
}
}
pub fn correlate_cu2_pad<T: Number + CDatatype>(
input: &CUBuffer<T>,
filter: &CUBuffer<T>,
out: &mut CUBuffer<T>,
inp_rows: usize,
inp_cols: usize,
filter_rows: usize,
filter_cols: usize,
) {
const THREADS: u32 = 8;
let x_padding = filter_cols - 1;
let y_padding = filter_rows - 1;
let inp_rows2 = inp_rows + y_padding * 2;
let inp_cols2 = inp_cols + x_padding * 2;
let (out_rows, out_cols) = (inp_rows2 - filter_rows + 1, inp_cols2 - filter_cols + 1);
let grid_x = (out_rows as f32 / THREADS as f32).ceil() as u32;
let grid_y = (out_cols as f32 / THREADS as f32).ceil() as u32;
let src = format!(
r#"
extern "C" __global__ void correlate2({dtype}* input, {dtype}* filter, {dtype}* out, int inp_rows, int inp_cols, int unpad_rows, int unpad_cols, int filter_rows, int filter_cols) {{
int moveDown = blockDim.x * blockIdx.x + threadIdx.x;
int moveRight = blockDim.y * blockIdx.y + threadIdx.y;
int xPadding = filter_cols - 1;
int yPadding = filter_rows - 1;
int outRows = inp_rows - filter_rows + 1;
int outCols = inp_cols - filter_cols + 1;
if (moveDown >= outRows) {{
return;
}}
if (moveRight >= outCols) {{
return;
}}
{dtype} sum = 0;
for (int filterRow = 0; filterRow < filter_rows; filterRow++) {{
int inputIdx = moveDown * inp_cols
+ moveRight + filterRow * inp_cols
- yPadding * inp_cols - xPadding;
// - (outCols % moveRight == 0) * xPadding;
// + inp_row * (inp_cols + 2 * x_padding)
// + inp_col
for (int filterCol = 0; filterCol < filter_cols; filterCol++) {{
if (inputIdx +filterCol >= unpad_rows * unpad_cols) {{
continue;
}}
if (inputIdx+filterCol < 0) {{
continue;
}}
// print the input index + filterCol and the corresponding input value:
printf("%d, %f\n", inputIdx+filterCol, input[inputIdx+filterCol]);
sum += input[inputIdx + filterCol] * filter[filterRow * filter_cols + filterCol];
}}
}}
out[moveDown * outCols + moveRight] = sum;
}}
"#,
dtype = T::as_c_type_str()
);
launch_kernel(
input.device(),
[grid_x, grid_y, 1],
[THREADS, THREADS, 1],
0,
&src,
"correlate2",
&[
input,
filter,
out,
&inp_rows2,
&inp_cols2,
&inp_rows,
&inp_cols,
&filter_rows,
&filter_cols,
],
)
.unwrap();
}
#[test]
fn test_correlate_cu_2_pad() {
#[rustfmt::skip]
let data = buf![
1., 2., 9.,
4., 5., 6.,
7., 8., 9.,
]
.to_gpu();
let filter = buf![1.; 4].to_gpu();
let mut out = buf![0.; data.len()].to_gpu();
correlate_cu2_pad(&data, &filter, &mut out, 3, 3, 2, 2);
println!("out: {out:?}");
let mut cpu_out = buf![0.; out.len()];
correlate_fully(&data.to_cpu(), &filter.to_cpu(), &mut cpu_out, 3, 3, 2, 2);
println!("cpu out: {cpu_out:?}");
}
pub fn correlate_valid_mut<T: Number>(
lhs_slice: &[T],
lhs_dims: (usize, usize),
kernel_slice: &[T],
kernel_dims: (usize, usize),
out: &mut [T],
) {
let (lhs_rows, lhs_cols) = lhs_dims;
let (kernel_rows, kernel_cols) = kernel_dims;
let (out_rows, out_cols) = (lhs_rows - kernel_rows + 1, lhs_cols - kernel_cols + 1);
for y in 0..out_rows {
for x in 0..out_cols {
let mut sum = T::default();
for idx in 0..kernel_rows {
let index = idx * lhs_cols + x + y * lhs_cols;
let lhs_kernel_row = &lhs_slice[index..index + kernel_cols];
let index = idx * kernel_cols;
let kernel_row = &kernel_slice[index..index + kernel_cols];
for (i, value) in lhs_kernel_row.iter().enumerate() {
sum += *value * kernel_row[i];
}
}
out[y * out_cols + x] = sum;
}
}
}
pub fn correlate_fully<T: Number + Mul<U, Output = T>, U: Number>(
inputs: &[T],
filter: &[U],
out: &mut [T],
inp_rows: usize,
inp_cols: usize,
filter_rows: usize,
filter_cols: usize,
) {
let x_padding = filter_cols - 1;
let y_padding = filter_rows - 1;
let padded_inputs = add_padding(inputs, inp_rows, inp_cols, x_padding, y_padding);
let padded_rows = inp_rows + y_padding * 2;
let padded_cols = inp_cols + x_padding * 2;
for move_down in 0..=padded_rows - filter_rows - y_padding {
for move_right in 0..=padded_cols - filter_cols - x_padding {
let mut sum = T::default();
for idx in 0..filter_rows {
let filter_idx = idx * filter_cols;
let filter_row = &filter[filter_idx..filter_idx + filter_cols];
let input_idx = move_down * padded_cols + move_right + idx * padded_cols;
let input_row = &padded_inputs[input_idx..input_idx + filter_cols];
for (filter_row, input_row) in input_row.iter().zip(filter_row) {
sum += *filter_row * *input_row;
}
}
out[move_down * inp_cols + move_right] = sum;
}
}
}
#[test]
fn test_correlate() {
let data = [
1., 2., 3., 5., 4., 3., 2., 1., 8., 7., 4., 2., 7., 3., 2., 1., 8., 5., 3., 8.,
];
let filter = [1. / 4., 1. / 4., 1. / 4., 1. / 4.];
let mut out = [0.; 5 * 4];
correlate_fully(&data, &filter, &mut out, 5, 4, 2, 2);
println!("out: {out:?}");
let filter = [1. / 4.; 9];
let mut out = [0.; 5 * 4];
correlate_fully(&data, &filter, &mut out, 5, 4, 3, 3);
println!("out: {out:?}");
}
#[test]
fn test_correlate_larger() {
let rows = 5080;
let cols = 520;
let data = buf![
1.4; rows * cols
];
let filter_rows = 32;
let filter_cols = filter_rows;
let filter = buf![1.; filter_rows * filter_cols];
let mut out = buf![0.; rows * cols];
correlate_fully(
&data,
&filter,
&mut out,
rows,
cols,
filter_rows,
filter_cols,
);
println!("out: {out:?}");
}