trueno/matrix/ops/ml_ops/
convolution.rs1use crate::TruenoError;
4
5use super::super::super::Matrix;
6
7impl Matrix<f32> {
8 pub fn convolve2d(&self, kernel: &Matrix<f32>) -> Result<Matrix<f32>, TruenoError> {
66 if kernel.rows > self.rows || kernel.cols > self.cols {
68 return Err(TruenoError::InvalidInput(format!(
69 "Kernel size ({}x{}) larger than input ({}x{})",
70 kernel.rows, kernel.cols, self.rows, self.cols
71 )));
72 }
73
74 let output_rows = self.rows - kernel.rows + 1;
76 let output_cols = self.cols - kernel.cols + 1;
77
78 let mut result = Matrix::zeros_with_backend(output_rows, output_cols, self.backend);
80
81 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
83 const GPU_THRESHOLD: usize = 10_000;
84
85 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
86 {
87 if output_rows * output_cols >= GPU_THRESHOLD {
88 use crate::backends::gpu::GpuBackend;
89
90 if GpuBackend::is_available() {
91 if let Ok(gpu_result) =
92 self.convolve2d_gpu(kernel, &mut result, output_rows, output_cols)
93 {
94 return Ok(gpu_result);
95 }
96 }
97 }
98 }
99
100 let input_data = self.as_slice();
102 let kernel_data = kernel.as_slice();
103 let result_data = result.data.as_mut_slice();
104 let input_cols = self.cols;
105 let kernel_cols = kernel.cols;
106 let result_cols = output_cols;
107
108 for out_row in 0..output_rows {
109 for out_col in 0..output_cols {
110 let mut sum = 0.0;
111
112 for k_row in 0..kernel.rows {
113 let in_row = out_row + k_row;
114 let input_row_offset = in_row * input_cols;
115 let kernel_row_offset = k_row * kernel_cols;
116
117 for k_col in 0..kernel.cols {
118 let in_col = out_col + k_col;
119 sum += input_data[input_row_offset + in_col]
120 * kernel_data[kernel_row_offset + k_col];
121 }
122 }
123
124 result_data[out_row * result_cols + out_col] = sum;
125 }
126 }
127
128 Ok(result)
129 }
130
131 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
133 fn convolve2d_gpu(
134 &self,
135 kernel: &Matrix<f32>,
136 result: &mut Matrix<f32>,
137 _output_rows: usize,
138 _output_cols: usize,
139 ) -> Result<Matrix<f32>, TruenoError> {
140 use crate::backends::gpu::GpuDevice;
141
142 let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
143
144 gpu.convolve2d(
145 self.as_slice(),
146 kernel.as_slice(),
147 result.data.as_mut_slice(),
148 self.rows,
149 self.cols,
150 kernel.rows,
151 kernel.cols,
152 )
153 .map_err(TruenoError::InvalidInput)?;
154
155 Ok(result.clone())
156 }
157}
158
159#[cfg(test)]
160#[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
161mod gpu_tests {
162 use crate::Matrix;
163
164 fn reference_convolve2d(
166 input: &[f32],
167 kernel: &[f32],
168 input_rows: usize,
169 input_cols: usize,
170 kernel_rows: usize,
171 kernel_cols: usize,
172 ) -> Vec<f32> {
173 let output_rows = input_rows - kernel_rows + 1;
174 let output_cols = input_cols - kernel_cols + 1;
175 let mut result = vec![0.0f32; output_rows * output_cols];
176
177 for out_row in 0..output_rows {
178 for out_col in 0..output_cols {
179 let mut sum = 0.0;
180 for k_row in 0..kernel_rows {
181 for k_col in 0..kernel_cols {
182 sum += input[(out_row + k_row) * input_cols + (out_col + k_col)]
183 * kernel[k_row * kernel_cols + k_col];
184 }
185 }
186 result[out_row * output_cols + out_col] = sum;
187 }
188 }
189 result
190 }
191
192 #[test]
196 fn test_convolve2d_gpu_identity_kernel() {
197 use crate::backends::gpu::GpuBackend;
198
199 if !GpuBackend::is_available() {
200 eprintln!("GPU not available, skipping test_convolve2d_gpu_identity_kernel");
201 return;
202 }
203
204 let input_rows = 102;
206 let input_cols = 102;
207 let kernel_rows = 3;
208 let kernel_cols = 3;
209
210 let input_data: Vec<f32> =
212 (0..input_rows * input_cols).map(|i| (i % 100) as f32 * 0.1).collect();
213
214 let mut kernel_data = vec![0.0f32; kernel_rows * kernel_cols];
216 kernel_data[4] = 1.0; let input = Matrix::from_vec(input_rows, input_cols, input_data.clone())
219 .expect("valid input matrix");
220 let kernel = Matrix::from_vec(kernel_rows, kernel_cols, kernel_data.clone())
221 .expect("valid kernel matrix");
222
223 let result = input.convolve2d(&kernel).expect("convolve2d should succeed");
224
225 assert_eq!(result.rows(), 100);
226 assert_eq!(result.cols(), 100);
227
228 let expected = reference_convolve2d(
230 &input_data,
231 &kernel_data,
232 input_rows,
233 input_cols,
234 kernel_rows,
235 kernel_cols,
236 );
237
238 for i in 0..expected.len() {
239 assert!(
240 (result.as_slice()[i] - expected[i]).abs() < 1e-3,
241 "Mismatch at index {}: gpu={}, expected={}",
242 i,
243 result.as_slice()[i],
244 expected[i]
245 );
246 }
247 }
248
249 #[test]
251 fn test_convolve2d_gpu_averaging_kernel() {
252 use crate::backends::gpu::GpuBackend;
253
254 if !GpuBackend::is_available() {
255 eprintln!("GPU not available, skipping test_convolve2d_gpu_averaging_kernel");
256 return;
257 }
258
259 let input_rows = 105;
261 let input_cols = 105;
262 let kernel_rows = 5;
263 let kernel_cols = 5;
264
265 let input_data = vec![1.0f32; input_rows * input_cols];
267
268 let kernel_data = vec![1.0f32 / 25.0; kernel_rows * kernel_cols];
270
271 let input =
272 Matrix::from_vec(input_rows, input_cols, input_data).expect("valid input matrix");
273 let kernel =
274 Matrix::from_vec(kernel_rows, kernel_cols, kernel_data).expect("valid kernel matrix");
275
276 let result = input.convolve2d(&kernel).expect("convolve2d should succeed");
277
278 assert_eq!(result.rows(), 101);
279 assert_eq!(result.cols(), 101);
280
281 for i in 0..result.as_slice().len() {
283 assert!(
284 (result.as_slice()[i] - 1.0).abs() < 1e-3,
285 "Mismatch at index {}: gpu={}, expected=1.0",
286 i,
287 result.as_slice()[i]
288 );
289 }
290 }
291
292 #[test]
294 fn test_convolve2d_gpu_edge_detection() {
295 use crate::backends::gpu::GpuBackend;
296
297 if !GpuBackend::is_available() {
298 eprintln!("GPU not available, skipping test_convolve2d_gpu_edge_detection");
299 return;
300 }
301
302 let input_rows = 103;
304 let input_cols = 103;
305 let kernel_rows = 3;
306 let kernel_cols = 3;
307
308 let input_data: Vec<f32> =
310 (0..input_rows * input_cols).map(|i| (i / input_cols) as f32).collect();
311
312 let kernel_data = vec![-1.0, 0.0, 1.0, -2.0, 0.0, 2.0, -1.0, 0.0, 1.0];
314
315 let input = Matrix::from_vec(input_rows, input_cols, input_data.clone())
316 .expect("valid input matrix");
317 let kernel = Matrix::from_vec(kernel_rows, kernel_cols, kernel_data.clone())
318 .expect("valid kernel matrix");
319
320 let result = input.convolve2d(&kernel).expect("convolve2d should succeed");
321
322 let expected = reference_convolve2d(
324 &input_data,
325 &kernel_data,
326 input_rows,
327 input_cols,
328 kernel_rows,
329 kernel_cols,
330 );
331
332 assert_eq!(result.as_slice().len(), expected.len());
333
334 for i in 0..100.min(expected.len()) {
336 assert!(
337 (result.as_slice()[i] - expected[i]).abs() < 1e-2,
338 "Mismatch at index {}: gpu={}, expected={}",
339 i,
340 result.as_slice()[i],
341 expected[i]
342 );
343 }
344 let len = expected.len();
345 for i in (len.saturating_sub(100))..len {
346 assert!(
347 (result.as_slice()[i] - expected[i]).abs() < 1e-2,
348 "Mismatch at index {}: gpu={}, expected={}",
349 i,
350 result.as_slice()[i],
351 expected[i]
352 );
353 }
354 }
355
356 #[test]
358 fn test_convolve2d_gpu_direct() {
359 use crate::backends::gpu::GpuDevice;
360
361 if !GpuDevice::is_available() {
362 eprintln!("GPU not available, skipping test_convolve2d_gpu_direct");
363 return;
364 }
365
366 let input_rows = 4;
368 let input_cols = 4;
369 let kernel_rows = 2;
370 let kernel_cols = 2;
371 let output_rows = 3;
372 let output_cols = 3;
373
374 let input_data = vec![
375 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
376 ];
377 let kernel_data = vec![1.0, 0.0, 0.0, 1.0]; let input =
380 Matrix::from_vec(input_rows, input_cols, input_data).expect("valid input matrix");
381 let kernel =
382 Matrix::from_vec(kernel_rows, kernel_cols, kernel_data).expect("valid kernel matrix");
383
384 let mut result = Matrix::zeros(output_rows, output_cols);
385
386 let gpu_result = input
387 .convolve2d_gpu(&kernel, &mut result, output_rows, output_cols)
388 .expect("convolve2d_gpu should succeed");
389
390 assert!(
392 (gpu_result.as_slice()[0] - 7.0).abs() < 1e-3,
393 "Expected 7.0, got {}",
394 gpu_result.as_slice()[0]
395 );
396 assert!(
398 (gpu_result.as_slice()[1] - 9.0).abs() < 1e-3,
399 "Expected 9.0, got {}",
400 gpu_result.as_slice()[1]
401 );
402 }
403}