ocl_convolution/lib.rs
//! OpenCL-accelerated 2D convolutions.
//!
//! [Convolution] is a fundamental building block in signal processing. This crate is focused
//! on 2D convolutions (i.e., the signal is a still image) in the context of [deep learning]
//! (more precisely, [convolutional neural networks][cnn]).
//! The latter means that the convolution may involve many (on the order of hundreds) filters,
//! and the input may contain many channels (on the order of hundreds or thousands), rather
//! than the traditional 3 or 4. Computing such convolutions is computationally heavy and can be
//! effectively accelerated with the help of [OpenCL].
//!
//! # Features
//!
//! The crate implements convolutions for two numerical formats:
//!
//! - Single-precision floats (`f32`)
//! - Signed 8-bit integers with a 32-bit multiply-add accumulator (this format is frequently
//!   denoted `int8/32` in deep learning literature). Quantization parameters are applied
//!   uniformly to the entire layer.
//!
//! In both cases, dilated and grouped convolutions are supported.
//!
//! # Implementation details
//!
//! The implementation uses an output-stationary workflow (see, e.g., [this paper] for
//! the definition); that is, each element of the output tensor is computed in a single run
//! of the OpenCL kernel. This minimizes memory overhead, but may not be the fastest algorithm.
//!
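//! As a rough illustration, the sketch below (plain Rust rather than the actual OpenCL kernel;
//! the shapes and names are made up for the example) shows how a single output element
//! is accumulated in this workflow:
//!
//! ```
//! // One output element is accumulated by iterating over the filter window
//! // and all input channels; nothing is written besides this single element.
//! fn output_element(
//!     signal: &[[[f32; 2]; 8]; 8], // H x W x C
//!     filter: &[[[f32; 2]; 3]; 3], // K_H x K_W x C
//!     (y, x): (usize, usize),
//! ) -> f32 {
//!     let mut acc = 0.0;
//!     for k_y in 0..3 {
//!         for k_x in 0..3 {
//!             for channel in 0..2 {
//!                 acc += signal[y + k_y][x + k_x][channel] * filter[k_y][k_x][channel];
//!             }
//!         }
//!     }
//!     acc
//! }
//!
//! let signal = [[[1.0; 2]; 8]; 8];
//! let filter = [[[0.5; 2]; 3]; 3];
//! // With constant inputs, the output element is just the sum of the filter weights.
//! assert_eq!(output_element(&signal, &filter, (0, 0)), 9.0);
//! ```
//!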
//! [Convolution]: https://en.wikipedia.org/wiki/Convolution
//! [deep learning]: https://en.wikipedia.org/wiki/Deep_learning
//! [cnn]: https://en.wikipedia.org/wiki/Convolutional_neural_network
//! [OpenCL]: https://www.khronos.org/opencl/
//! [this paper]: https://dl.acm.org/citation.cfm?id=3001177
//!
//! # Examples
//!
//! ## Floating-point convolution
//!
//! ```
//! use ndarray::Array4;
//! use rand::{Rng, thread_rng};
//! use ocl_convolution::{Convolution, FeatureMap, Params};
//!
//! # fn main() -> Result<(), ocl::Error> {
//! let convolution = Convolution::f32(3)?.build(Params {
//!     strides: [1, 1],
//!     pads: [0; 4],
//!     dilation: [1, 1],
//!     groups: 1,
//! })?;
//!
//! // Generate a random signal with 6x6 spatial dims and 3 channels.
//! let mut rng = thread_rng();
//! let signal = Array4::from_shape_fn([1, 6, 6, 3], |_| rng.gen_range(-1.0..=1.0));
//! // Construct two 3x3 spatial filters.
//! let filters = Array4::from_shape_fn([2, 3, 3, 3], |_| rng.gen_range(-1.0..=1.0));
//! // Perform the convolution. The output must have 4x4 spatial dims
//! // and contain 2 channels (one per filter). The output layout will
//! // be the same as in the signal.
//! let output = convolution.compute(
//!     // `FeatureMap` wraps `ArrayView4` with information about
//!     // memory layout (which is "channels-last" / NHWC in this case).
//!     FeatureMap::nhwc(&signal),
//!     &filters,
//! )?;
//! assert_eq!(output.shape(), [1, 4, 4, 2]);
//!
//! // For increased efficiency, we may pin the filter memory.
//! // This is especially useful when the same filters are convolved
//! // with multiple signals.
//! let convolution = convolution.with_filters(&filters)?;
//! let new_output = convolution.compute(FeatureMap::nhwc(&signal))?;
//! assert_eq!(output, new_output);
//! # Ok(())
//! # }
//! ```
//!
//! ## Quantized convolution
//!
//! ```
//! use ndarray::Array4;
//! use rand::{Rng, thread_rng};
//! use ocl_convolution::{Convolution, I8Params, FeatureMap, Params};
//!
//! # fn main() -> Result<(), ocl::Error> {
//! const BIT_SHIFT: u8 = 16;
//! let params = I8Params {
//!     common: Params::default(),
//!     // These params are found by profiling; here, they are
//!     // chosen arbitrarily.
//!     bit_shift: BIT_SHIFT,
//!     scale: I8Params::convert_scale(BIT_SHIFT, 0.1),
//!     output_bias: -10,
//!     signal_bias: 20,
//!     filter_bias: -5,
//! };
//! let convolution = Convolution::i8(3)?.build(params)?;
//!
//! // Generate a random signal with 6x6 spatial dims and 3 channels.
//! let mut rng = thread_rng();
//! let signal = Array4::from_shape_fn([1, 6, 6, 3], |_| rng.gen_range(-127..=127));
//! // Construct two 3x3 spatial filters.
//! let filters = Array4::from_shape_fn([2, 3, 3, 3], |_| rng.gen_range(-127..=127));
//! // Perform the convolution. The output must have 4x4 spatial dims
//! // and contain 2 channels (one per filter).
//! let output = convolution.compute(
//!     FeatureMap::nhwc(&signal),
//!     &filters,
//! )?;
//! assert_eq!(output.shape(), [1, 4, 4, 2]);
//! # Ok(())
//! # }
//! ```

#![doc(html_root_url = "https://docs.rs/ocl-convolution/0.3.0")]
#![warn(missing_debug_implementations, missing_docs, bare_trait_objects)]
#![warn(clippy::all, clippy::pedantic)]
#![allow(
    clippy::missing_errors_doc,
    clippy::must_use_candidate,
    clippy::module_name_repetitions,
    clippy::doc_markdown
)]

use ndarray::{Array4, ArrayView4};
use ocl::OclPrm;

use std::{fmt, marker::PhantomData};

mod base;
mod buffers;
mod params;

use crate::{
    base::Base,
    buffers::{Filters, Pinned},
};
pub use crate::{
    base::ConvolutionBuilder,
    buffers::{FeatureMap, FeatureMapShape, Layout},
    params::{I8Params, Params},
};

const SOURCE: &str = include_str!(concat!(env!("OUT_DIR"), "/conv.cl"));

/// Supported element types for convolutions.
pub trait ConvElement: OclPrm + Copy + 'static {
    /// Type of the multiply-add accumulator.
    type Acc: OclPrm + Copy + 'static;
    /// Parameters of the convolution.
    type Params: Copy + Into<Params> + Into<Self::ClParams>;
    /// OpenCL-friendly version of parameters. This is considered an implementation detail.
    type ClParams: OclPrm;
}

impl ConvElement for f32 {
    type Acc = f32;
    type Params = Params;
    type ClParams = params::ClParams;
}

impl ConvElement for i8 {
    type Acc = i32;
    type Params = I8Params;
    type ClParams = params::ClI8Params;
}

impl ConvolutionBuilder<f32> {
    /// Creates a new floating-point convolution.
    pub fn build(&self, params: Params) -> ocl::Result<Convolution<f32>> {
        Base::new(self, params).map(Convolution)
    }
}

impl ConvolutionBuilder<i8> {
    /// Creates a new quantized convolution.
    pub fn build(&self, params: I8Params) -> ocl::Result<Convolution<i8>> {
        Base::new(self, params).map(Convolution)
    }
}

/// Convolution without pinned memory.
pub struct Convolution<T: ConvElement>(Base<PhantomData<T>>);

impl<T> fmt::Debug for Convolution<T>
where
    T: ConvElement,
    T::Params: fmt::Debug,
{
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.debug_tuple("Convolution").field(&self.0).finish()
    }
}

impl Convolution<f32> {
    /// Creates a new floating-point convolution builder. `size` determines the filter size
    /// and must be odd (1, 3, 5, ...).
    ///
    /// # Panics
    ///
    /// Panics if the filter `size` is even.
    pub fn f32(size: u32) -> ocl::Result<ConvolutionBuilder<f32>> {
        ConvolutionBuilder::new(size, &[("KERNEL_TYPE", 32)], SOURCE)
    }
}

/// Quantized convolution over signed 8-bit integers.
///
/// Due to the use of `i8` inputs, computations are performed much faster than with `f32` inputs
/// (the difference is most pronounced on specialized hardware, but it is noticeable in this
/// OpenCL-powered implementation as well).
///
/// ## Connection to real-valued convolution
///
/// Quantized convolution mirrors a real-valued convolution in which the `i8` elements
/// of the signal, filter and output tensors represent real-valued numbers with the
/// following mapping:
///
/// ```
/// let scale: f32 = // ...
/// # 1.0;
/// let bias: i32 = // ...
/// # 0; drop(
/// |x: i8| -> f32 { scale * (i32::from(x) - bias) as f32 }
/// # )
/// ```
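///
/// For example, with `scale == 0.1` and `bias == 20`, the `i8` value `50` represents
/// the real-valued number `0.1 * (50 - 20) == 3.0`.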
///
/// `scale` and `bias` may differ for different tensors; these params are usually determined
/// by *profiling* the corresponding convolutional neural network (see, e.g., [this paper]).
///
/// Denote these quantization params for a tensor `T` as `T.scale` and `T.bias`, and let `S`
/// be the signal, `F` the filter, and `O` the output. Convolution parameters must be set
/// as follows:
///
/// | `I8Params` field | Value     |
/// |------------------|-----------|
/// | `signal_bias`    | `-S.bias` |
/// | `filter_bias`    | `-F.bias` |
/// | `output_bias`    | `O.bias`  |
/// | `scale`          | `S.scale * F.scale / O.scale` |
///
/// `scale` is represented as a fixed-point number with [`bit_shift`] binary digits after
/// the point; for example, with `bit_shift == 16`, a real-valued scale of 0.5 corresponds
/// to the fixed-point value `0.5 * 2**16 == 32768`. Note that filter biases `B` are not
/// transformed during the computation.
///
/// # Computing convolution
///
/// Suppose `S` is the signal and `F` is the filter tensor; both contain `i8` values.
/// The computation is performed as follows (see the sketch after this list):
///
/// 1. Unbias the signal: `S := S + params.signal_bias`.
/// 2. Unbias the filters: `F := F + params.filter_bias`.
/// 3. Compute the "standard" convolution output `O := S (*) F` using `i32` precision.
/// 4. Upscale each number in the output: `O := O * params.scale`.
/// 5. If filter biases `B` are provided, apply them to the output per output channel:
///    `O[f, ..] := O[f, ..] + B[f]`.
/// 6. Downscale the output: `O := round(O / 2**params.bit_shift)`, where `round()` works
///    as floating-point rounding with the default mode (round to nearest, ties to even).
/// 7. Apply the output bias: `O := O + params.output_bias`.
/// 8. Saturate the output to the `i8` range.
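///
/// The sketch below (plain Rust, not the crate's actual OpenCL kernel) mirrors these steps
/// for a single output element; the per-filter bias of step 5 is omitted, the rounding of
/// step 6 is simplified, and all names are illustrative only:
///
/// ```
/// fn quantized_output_element(
///     signal: &[i8],
///     filter: &[i8],
///     scale: i32, // fixed-point, as in `I8Params::scale`
///     bit_shift: u8,
///     signal_bias: i32,
///     filter_bias: i32,
///     output_bias: i32,
/// ) -> i8 {
///     // Steps 1-3: unbias the inputs and accumulate the products in `i32` precision.
///     let acc: i32 = signal
///         .iter()
///         .zip(filter)
///         .map(|(&s, &f)| (i32::from(s) + signal_bias) * (i32::from(f) + filter_bias))
///         .sum();
///     // Step 4: upscale by the fixed-point `scale`.
///     let scaled = i64::from(acc) * i64::from(scale);
///     // Step 6: downscale by `2**bit_shift`, rounding to the nearest integer
///     // (ties are broken upward here for simplicity rather than to even).
///     let rounded = (scaled + (1_i64 << (bit_shift - 1))) >> bit_shift;
///     // Steps 7-8: apply the output bias and saturate to the `i8` range.
///     (rounded as i32 + output_bias).clamp(-128, 127) as i8
/// }
///
/// // 10 * 3 + (-20) * 5 == -70; with a unit scale and zero biases, the output is -70.
/// assert_eq!(quantized_output_element(&[10, -20], &[3, 5], 1 << 16, 16, 0, 0, 0), -70);
/// ```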
///
/// [`bit_shift`]: I8Params::bit_shift
/// [this paper]: https://arxiv.org/abs/1805.00907
impl Convolution<i8> {
    /// Creates a new `i8` convolution builder. `size` determines the filter size
    /// and must be odd (1, 3, 5, ...).
    ///
    /// # Panics
    ///
    /// Panics if the filter `size` is even.
    pub fn i8(size: u32) -> ocl::Result<ConvolutionBuilder<i8>> {
        ConvolutionBuilder::new(size, &[("KERNEL_TYPE", 8)], SOURCE)
    }
}

impl<T: ConvElement> Convolution<T> {
    /// Spatial size of the convolution.
    pub fn size(&self) -> u32 {
        self.0.size()
    }

    /// Returns general parameters of the convolution.
    pub fn params(&self) -> T::Params {
        self.0.params()
    }

    /// Sets convolution parameters.
    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
        self.0.set_params(params)
    }

    /// Returns the convolution with pinned filter memory.
    ///
    /// # Parameters
    ///
    /// - `filters` must have `MxK_HxK_WxC` layout, where `M` is the number of filters,
    ///   `K_H` and `K_W` are spatial dimensions of a filter, `C` is the number of input channels.
    pub fn with_filters<'a>(
        self,
        filters: impl Into<ArrayView4<'a, T>>,
    ) -> ocl::Result<FiltersConvolution<T>> {
        self.0
            .with_filters(filters.into(), None)
            .map(FiltersConvolution)
    }

    /// Returns the convolution with pinned filter / filter bias memory.
    pub fn with_biased_filters<'a>(
        self,
        filters: impl Into<ArrayView4<'a, T>>,
        filter_biases: &[T::Acc],
    ) -> ocl::Result<FiltersConvolution<T>> {
        self.0
            .with_filters(filters.into(), Some(filter_biases))
            .map(FiltersConvolution)
    }

    /// Performs convolution on the provided `signal` and `filters`.
    ///
    /// # Parameters
    ///
    /// - `filters` must have `MxK_HxK_WxC` layout, where `M` is the number of filters,
    ///   `K_H` and `K_W` are spatial dimensions of a filter, `C` is the number of input channels.
    ///
    /// # Return value
    ///
    /// The output will have the same layout as `signal`. An error is returned
    /// if something goes wrong during the OpenCL computation.
    ///
    /// # Panics
    ///
    /// - Panics if `filters` do not have the expected spatial dimensions, i.e.,
    ///   `self.size() x self.size()`.
    /// - Panics if the number of input channels differs from the number of channels in `filters`.
    pub fn compute<'a>(
        &self,
        signal: FeatureMap<'_, T>,
        filters: impl Into<ArrayView4<'a, T>>,
    ) -> ocl::Result<Array4<T>> {
        self.0.compute(signal, filters.into(), None)
    }

    /// Performs convolution on the provided `signal` and `filters`, with the output offset
    /// by the provided per-filter biases.
    ///
    /// Parameters, return value and panics are the same as for [`Self::compute()`].
    pub fn compute_with_biases<'a>(
        &self,
        signal: FeatureMap<'_, T>,
        filters: impl Into<ArrayView4<'a, T>>,
        filter_biases: &[T::Acc],
    ) -> ocl::Result<Array4<T>> {
        self.0.compute(signal, filters.into(), Some(filter_biases))
    }
}

/// Convolution with pinned filter memory. Pinning memory increases efficiency at the cost
/// of making the convolution less flexible.
///
/// `FiltersConvolution` can be created by calling the [`with_filters()`](Convolution::with_filters())
/// or [`with_biased_filters()`](Convolution::with_biased_filters()) methods on `Convolution`.
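///
/// # Examples
///
/// A minimal sketch of the intended workflow (marked `no_run` since it needs an OpenCL
/// device; the shapes and filter values are illustrative):
///
/// ```no_run
/// use ndarray::Array4;
/// use ocl_convolution::{Convolution, FeatureMap, Params};
///
/// # fn main() -> Result<(), ocl::Error> {
/// let convolution = Convolution::f32(3)?.build(Params::default())?;
/// // Pin two 3x3 filters over 3 input channels once...
/// let filters = Array4::from_elem([2, 3, 3, 3], 0.5_f32);
/// let convolution = convolution.with_filters(&filters)?;
/// // ...and reuse them for multiple signals.
/// for _ in 0..5 {
///     let signal = Array4::from_elem([1, 6, 6, 3], 1.0_f32);
///     let output = convolution.compute(FeatureMap::nhwc(&signal))?;
///     assert_eq!(output.shape(), [1, 4, 4, 2]);
/// }
/// # Ok(())
/// # }
/// ```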
pub struct FiltersConvolution<T: ConvElement>(Base<Filters<T>>);

impl<T> fmt::Debug for FiltersConvolution<T>
where
    T: ConvElement,
    T::Params: fmt::Debug,
{
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter
            .debug_tuple("FiltersConvolution")
            .field(&self.0)
            .finish()
    }
}

impl<T: ConvElement> FiltersConvolution<T> {
    /// Spatial size of the convolution.
    pub fn size(&self) -> u32 {
        self.0.size()
    }

    /// Returns general parameters of the convolution.
    pub fn params(&self) -> T::Params {
        self.0.params()
    }

    /// Sets convolution parameters.
    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
        self.0.set_params(params)
    }

    /// Pins signal and output memory for this convolution.
    pub fn pin(self, signal_shape: FeatureMapShape) -> ocl::Result<PinnedConvolution<T>> {
        self.0.pinned(signal_shape).map(PinnedConvolution)
    }

    /// Computes the convolution on the provided signal.
    pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
        self.0.compute(signal)
    }
}

/// Convolution with pinned memory for filters, signal and output. Pinning memory increases
/// efficiency at the cost of making the convolution less flexible.
///
/// `PinnedConvolution` can be created from a [`FiltersConvolution`] by calling
/// [`pin()`](FiltersConvolution::pin()).
pub struct PinnedConvolution<T: ConvElement>(Base<Pinned<T>>);

impl<T> fmt::Debug for PinnedConvolution<T>
where
    T: ConvElement,
    T::Params: fmt::Debug,
{
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter
            .debug_tuple("PinnedConvolution")
            .field(&self.0)
            .finish()
    }
}

impl<T: ConvElement> PinnedConvolution<T> {
    /// Spatial size of the convolution.
    pub fn size(&self) -> u32 {
        self.0.size()
    }

    /// Returns general parameters of the convolution.
    pub fn params(&self) -> T::Params {
        self.0.params()
    }

    /// Sets convolution parameters.
    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
        self.0.set_params(params)
    }

    /// Computes the convolution on the provided signal.
    ///
    /// # Panics
    ///
    /// - Panics if signal dimensions do not agree with the ones provided
    ///   to the [`pin()` method](FiltersConvolution::pin()).
    pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
        self.0.compute(signal)
    }
}

#[cfg(doctest)]
doc_comment::doctest!("../README.md");