Skip to main content

trueno/matrix/ops/ml_ops/
convolution.rs

1//! 2D convolution operations for Matrix
2
3use crate::TruenoError;
4
5use super::super::super::Matrix;
6
7impl Matrix<f32> {
8    /// Perform 2D convolution with a kernel
9    ///
10    /// Applies a 2D convolution operation using "valid" padding (no padding),
11    /// resulting in an output smaller than the input.
12    ///
13    /// # Arguments
14    ///
15    /// * `kernel` - Convolution kernel (filter) to apply
16    ///
17    /// # Returns
18    ///
19    /// Convolved matrix with dimensions:
20    /// - rows: `input.rows - kernel.rows + 1`
21    /// - cols: `input.cols - kernel.cols + 1`
22    ///
23    /// # Errors
24    ///
25    /// Returns `InvalidInput` if:
26    /// - Kernel is larger than input in any dimension
27    ///
28    /// # Example
29    ///
30    /// ```
31    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
32    /// use trueno::Matrix;
33    ///
34    /// // 5x5 input image
35    /// let input = Matrix::from_vec(
36    ///     5, 5,
37    ///     vec![
38    ///         0.0, 0.0, 0.0, 0.0, 0.0,
39    ///         0.0, 0.0, 0.0, 0.0, 0.0,
40    ///         0.0, 0.0, 9.0, 0.0, 0.0,
41    ///         0.0, 0.0, 0.0, 0.0, 0.0,
42    ///         0.0, 0.0, 0.0, 0.0, 0.0,
43    ///     ]
44    /// )?;
45    ///
46    /// // 3x3 averaging kernel
47    /// let kernel_val = 1.0 / 9.0;
48    /// let kernel = Matrix::from_vec(
49    ///     3, 3,
50    ///     vec![kernel_val; 9]
51    /// )?;
52    ///
53    /// let result = input.convolve2d(&kernel)?;
54    /// assert_eq!(result.rows(), 3); // 5 - 3 + 1
55    /// assert_eq!(result.cols(), 3);
56    /// # Ok(())
57    /// # }
58    /// ```
59    // =========================================================================
60    // HOT PATH - PERFORMANCE CRITICAL
61    // =========================================================================
62    // This function processes millions of elements for typical image sizes.
63    // Any changes to the inner loop REQUIRE benchmark verification.
64    // =========================================================================
65    pub fn convolve2d(&self, kernel: &Matrix<f32>) -> Result<Matrix<f32>, TruenoError> {
66        // Validate kernel size
67        if kernel.rows > self.rows || kernel.cols > self.cols {
68            return Err(TruenoError::InvalidInput(format!(
69                "Kernel size ({}x{}) larger than input ({}x{})",
70                kernel.rows, kernel.cols, self.rows, self.cols
71            )));
72        }
73
74        // Calculate output dimensions (valid padding)
75        let output_rows = self.rows - kernel.rows + 1;
76        let output_cols = self.cols - kernel.cols + 1;
77
78        // Initialize output matrix (reuse parent's backend)
79        let mut result = Matrix::zeros_with_backend(output_rows, output_cols, self.backend);
80
81        // GPU acceleration for large convolutions
82        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
83        const GPU_THRESHOLD: usize = 10_000;
84
85        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
86        {
87            if output_rows * output_cols >= GPU_THRESHOLD {
88                use crate::backends::gpu::GpuBackend;
89
90                if GpuBackend::is_available() {
91                    if let Ok(gpu_result) =
92                        self.convolve2d_gpu(kernel, &mut result, output_rows, output_cols)
93                    {
94                        return Ok(gpu_result);
95                    }
96                }
97            }
98        }
99
100        // Scalar baseline implementation - optimized with direct indexing
101        let input_data = self.as_slice();
102        let kernel_data = kernel.as_slice();
103        let result_data = result.data.as_mut_slice();
104        let input_cols = self.cols;
105        let kernel_cols = kernel.cols;
106        let result_cols = output_cols;
107
108        for out_row in 0..output_rows {
109            for out_col in 0..output_cols {
110                let mut sum = 0.0;
111
112                for k_row in 0..kernel.rows {
113                    let in_row = out_row + k_row;
114                    let input_row_offset = in_row * input_cols;
115                    let kernel_row_offset = k_row * kernel_cols;
116
117                    for k_col in 0..kernel.cols {
118                        let in_col = out_col + k_col;
119                        sum += input_data[input_row_offset + in_col]
120                            * kernel_data[kernel_row_offset + k_col];
121                    }
122                }
123
124                result_data[out_row * result_cols + out_col] = sum;
125            }
126        }
127
128        Ok(result)
129    }
130
131    /// GPU-accelerated 2D convolution helper
132    #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
133    fn convolve2d_gpu(
134        &self,
135        kernel: &Matrix<f32>,
136        result: &mut Matrix<f32>,
137        _output_rows: usize,
138        _output_cols: usize,
139    ) -> Result<Matrix<f32>, TruenoError> {
140        use crate::backends::gpu::GpuDevice;
141
142        let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
143
144        gpu.convolve2d(
145            self.as_slice(),
146            kernel.as_slice(),
147            result.data.as_mut_slice(),
148            self.rows,
149            self.cols,
150            kernel.rows,
151            kernel.cols,
152        )
153        .map_err(TruenoError::InvalidInput)?;
154
155        Ok(result.clone())
156    }
157}
158
159#[cfg(test)]
160#[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
161mod gpu_tests {
162    use crate::Matrix;
163
164    /// Helper: compute expected 2D convolution (scalar reference) for verification
165    fn reference_convolve2d(
166        input: &[f32],
167        kernel: &[f32],
168        input_rows: usize,
169        input_cols: usize,
170        kernel_rows: usize,
171        kernel_cols: usize,
172    ) -> Vec<f32> {
173        let output_rows = input_rows - kernel_rows + 1;
174        let output_cols = input_cols - kernel_cols + 1;
175        let mut result = vec![0.0f32; output_rows * output_cols];
176
177        for out_row in 0..output_rows {
178            for out_col in 0..output_cols {
179                let mut sum = 0.0;
180                for k_row in 0..kernel_rows {
181                    for k_col in 0..kernel_cols {
182                        sum += input[(out_row + k_row) * input_cols + (out_col + k_col)]
183                            * kernel[k_row * kernel_cols + k_col];
184                    }
185                }
186                result[out_row * output_cols + out_col] = sum;
187            }
188        }
189        result
190    }
191
192    /// Test convolve2d_gpu via public API with input large enough to exceed
193    /// GPU_THRESHOLD (output_rows * output_cols >= 10_000).
194    /// Uses a 102x102 input with a 3x3 kernel => 100x100 = 10,000 output elements.
195    #[test]
196    fn test_convolve2d_gpu_identity_kernel() {
197        use crate::backends::gpu::GpuBackend;
198
199        if !GpuBackend::is_available() {
200            eprintln!("GPU not available, skipping test_convolve2d_gpu_identity_kernel");
201            return;
202        }
203
204        // 102x102 input, 3x3 kernel => 100x100 = 10,000 output (meets GPU_THRESHOLD)
205        let input_rows = 102;
206        let input_cols = 102;
207        let kernel_rows = 3;
208        let kernel_cols = 3;
209
210        // Fill input with sequential values for easy verification
211        let input_data: Vec<f32> =
212            (0..input_rows * input_cols).map(|i| (i % 100) as f32 * 0.1).collect();
213
214        // Identity-like kernel: center element = 1.0, rest = 0.0
215        let mut kernel_data = vec![0.0f32; kernel_rows * kernel_cols];
216        kernel_data[4] = 1.0; // center of 3x3
217
218        let input = Matrix::from_vec(input_rows, input_cols, input_data.clone())
219            .expect("valid input matrix");
220        let kernel = Matrix::from_vec(kernel_rows, kernel_cols, kernel_data.clone())
221            .expect("valid kernel matrix");
222
223        let result = input.convolve2d(&kernel).expect("convolve2d should succeed");
224
225        assert_eq!(result.rows(), 100);
226        assert_eq!(result.cols(), 100);
227
228        // With identity kernel, output[r][c] = input[r+1][c+1]
229        let expected = reference_convolve2d(
230            &input_data,
231            &kernel_data,
232            input_rows,
233            input_cols,
234            kernel_rows,
235            kernel_cols,
236        );
237
238        for i in 0..expected.len() {
239            assert!(
240                (result.as_slice()[i] - expected[i]).abs() < 1e-3,
241                "Mismatch at index {}: gpu={}, expected={}",
242                i,
243                result.as_slice()[i],
244                expected[i]
245            );
246        }
247    }
248
249    /// Test convolve2d_gpu with a uniform averaging kernel on a large input.
250    #[test]
251    fn test_convolve2d_gpu_averaging_kernel() {
252        use crate::backends::gpu::GpuBackend;
253
254        if !GpuBackend::is_available() {
255            eprintln!("GPU not available, skipping test_convolve2d_gpu_averaging_kernel");
256            return;
257        }
258
259        // 105x105 input, 5x5 kernel => 101x101 = 10,201 output (exceeds GPU_THRESHOLD)
260        let input_rows = 105;
261        let input_cols = 105;
262        let kernel_rows = 5;
263        let kernel_cols = 5;
264
265        // All-ones input
266        let input_data = vec![1.0f32; input_rows * input_cols];
267
268        // Averaging kernel: each element = 1/25
269        let kernel_data = vec![1.0f32 / 25.0; kernel_rows * kernel_cols];
270
271        let input =
272            Matrix::from_vec(input_rows, input_cols, input_data).expect("valid input matrix");
273        let kernel =
274            Matrix::from_vec(kernel_rows, kernel_cols, kernel_data).expect("valid kernel matrix");
275
276        let result = input.convolve2d(&kernel).expect("convolve2d should succeed");
277
278        assert_eq!(result.rows(), 101);
279        assert_eq!(result.cols(), 101);
280
281        // With all-ones input and averaging kernel, every output should be ~1.0
282        for i in 0..result.as_slice().len() {
283            assert!(
284                (result.as_slice()[i] - 1.0).abs() < 1e-3,
285                "Mismatch at index {}: gpu={}, expected=1.0",
286                i,
287                result.as_slice()[i]
288            );
289        }
290    }
291
292    /// Test convolve2d_gpu with a gradient input and edge-detection kernel.
293    #[test]
294    fn test_convolve2d_gpu_edge_detection() {
295        use crate::backends::gpu::GpuBackend;
296
297        if !GpuBackend::is_available() {
298            eprintln!("GPU not available, skipping test_convolve2d_gpu_edge_detection");
299            return;
300        }
301
302        // 103x103 input, 3x3 kernel => 101x101 = 10,201 output
303        let input_rows = 103;
304        let input_cols = 103;
305        let kernel_rows = 3;
306        let kernel_cols = 3;
307
308        // Horizontal gradient: each row is constant, increases per row
309        let input_data: Vec<f32> =
310            (0..input_rows * input_cols).map(|i| (i / input_cols) as f32).collect();
311
312        // Vertical edge detection kernel (Sobel-like simplified)
313        let kernel_data = vec![-1.0, 0.0, 1.0, -2.0, 0.0, 2.0, -1.0, 0.0, 1.0];
314
315        let input = Matrix::from_vec(input_rows, input_cols, input_data.clone())
316            .expect("valid input matrix");
317        let kernel = Matrix::from_vec(kernel_rows, kernel_cols, kernel_data.clone())
318            .expect("valid kernel matrix");
319
320        let result = input.convolve2d(&kernel).expect("convolve2d should succeed");
321
322        // Verify against scalar reference
323        let expected = reference_convolve2d(
324            &input_data,
325            &kernel_data,
326            input_rows,
327            input_cols,
328            kernel_rows,
329            kernel_cols,
330        );
331
332        assert_eq!(result.as_slice().len(), expected.len());
333
334        // Sample verification (check first 100 elements and last 100)
335        for i in 0..100.min(expected.len()) {
336            assert!(
337                (result.as_slice()[i] - expected[i]).abs() < 1e-2,
338                "Mismatch at index {}: gpu={}, expected={}",
339                i,
340                result.as_slice()[i],
341                expected[i]
342            );
343        }
344        let len = expected.len();
345        for i in (len.saturating_sub(100))..len {
346            assert!(
347                (result.as_slice()[i] - expected[i]).abs() < 1e-2,
348                "Mismatch at index {}: gpu={}, expected={}",
349                i,
350                result.as_slice()[i],
351                expected[i]
352            );
353        }
354    }
355
356    /// Test convolve2d_gpu directly via the private helper method.
357    #[test]
358    fn test_convolve2d_gpu_direct() {
359        use crate::backends::gpu::GpuDevice;
360
361        if !GpuDevice::is_available() {
362            eprintln!("GPU not available, skipping test_convolve2d_gpu_direct");
363            return;
364        }
365
366        // 4x4 input, 2x2 kernel -> 3x3 output (test the private method directly)
367        let input_rows = 4;
368        let input_cols = 4;
369        let kernel_rows = 2;
370        let kernel_cols = 2;
371        let output_rows = 3;
372        let output_cols = 3;
373
374        let input_data = vec![
375            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
376        ];
377        let kernel_data = vec![1.0, 0.0, 0.0, 1.0]; // diagonal sum kernel
378
379        let input =
380            Matrix::from_vec(input_rows, input_cols, input_data).expect("valid input matrix");
381        let kernel =
382            Matrix::from_vec(kernel_rows, kernel_cols, kernel_data).expect("valid kernel matrix");
383
384        let mut result = Matrix::zeros(output_rows, output_cols);
385
386        let gpu_result = input
387            .convolve2d_gpu(&kernel, &mut result, output_rows, output_cols)
388            .expect("convolve2d_gpu should succeed");
389
390        // output[0,0] = input[0,0]*1 + input[0,1]*0 + input[1,0]*0 + input[1,1]*1 = 1+6 = 7
391        assert!(
392            (gpu_result.as_slice()[0] - 7.0).abs() < 1e-3,
393            "Expected 7.0, got {}",
394            gpu_result.as_slice()[0]
395        );
396        // output[0,1] = input[0,1]*1 + input[1,2]*1 = 2+7 = 9
397        assert!(
398            (gpu_result.as_slice()[1] - 9.0).abs() < 1e-3,
399            "Expected 9.0, got {}",
400            gpu_result.as_slice()[1]
401        );
402    }
403}