ocl_convolution/lib.rs
//! OpenCL-accelerated 2D convolutions.
//!
//! [Convolution] is a fundamental building block in signal processing. This crate is focused
//! on 2D convolutions (i.e., the signal is a still image) in the context of [deep learning]
//! (more precisely, [convolutional neural networks][cnn]).
//! The latter means that the convolution may involve many (on the order of hundreds) filters,
//! and the input may contain many channels (on the order of hundreds or thousands), rather
//! than the traditional 3 or 4. Computing such convolutions is computationally heavy and can be
//! effectively accelerated with the help of [OpenCL].
//!
//! # Features
//!
//! The crate implements convolutions for two numerical formats:
//!
//! - Single-precision floats (`f32`)
//! - Signed 8-bit integers with a 32-bit multiply-add accumulator (this format is frequently
//!   denoted `int8/32` in deep learning literature). Quantization parameters are applied
//!   uniformly to the entire layer.
//!
//! In both cases, dilated and grouped convolutions are supported.
//!
//! # Implementation details
//!
//! The implementation uses an output-stationary workflow (see, e.g., [this paper] for
//! the definition); that is, each element of the output tensor is computed in a single run
//! of the OpenCL kernel. This minimizes memory overhead, but may not be the fastest algorithm.
//!
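//! As a rough illustration, the sketch below (plain Rust rather than the actual OpenCL kernel;
//! the shapes and names are made up for the example) shows how a single output element
//! is accumulated in this workflow:
//!
//! ```
//! // One output element is accumulated by iterating over the filter window
//! // and all input channels; nothing is written besides this single element.
//! fn output_element(
//!     signal: &[[[f32; 2]; 8]; 8], // H x W x C
//!     filter: &[[[f32; 2]; 3]; 3], // K_H x K_W x C
//!     (y, x): (usize, usize),
//! ) -> f32 {
//!     let mut acc = 0.0;
//!     for k_y in 0..3 {
//!         for k_x in 0..3 {
//!             for channel in 0..2 {
//!                 acc += signal[y + k_y][x + k_x][channel] * filter[k_y][k_x][channel];
//!             }
//!         }
//!     }
//!     acc
//! }
//!
//! let signal = [[[1.0; 2]; 8]; 8];
//! let filter = [[[0.5; 2]; 3]; 3];
//! // With constant inputs, the output element is just the sum of the filter weights.
//! assert_eq!(output_element(&signal, &filter, (0, 0)), 9.0);
//! ```
//!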
//! [Convolution]: https://en.wikipedia.org/wiki/Convolution
//! [deep learning]: https://en.wikipedia.org/wiki/Deep_learning
//! [cnn]: https://en.wikipedia.org/wiki/Convolutional_neural_network
//! [OpenCL]: https://www.khronos.org/opencl/
//! [this paper]: https://dl.acm.org/citation.cfm?id=3001177
//!
//! # Examples
//!
//! ## Floating-point convolution
//!
//! ```
//! use ndarray::Array4;
//! use rand::{Rng, thread_rng};
//! use ocl_convolution::{Convolution, FeatureMap, Params};
//!
//! # fn main() -> Result<(), ocl::Error> {
//! let convolution = Convolution::f32(3)?.build(Params {
//!     strides: [1, 1],
//!     pads: [0; 4],
//!     dilation: [1, 1],
//!     groups: 1,
//! })?;
//!
//! // Generate a random signal with 6x6 spatial dims and 3 channels.
//! let mut rng = thread_rng();
//! let signal = Array4::from_shape_fn([1, 6, 6, 3], |_| rng.gen_range(-1.0..=1.0));
//! // Construct two 3x3 spatial filters.
//! let filters = Array4::from_shape_fn([2, 3, 3, 3], |_| rng.gen_range(-1.0..=1.0));
//! // Perform the convolution. The output must have 4x4 spatial dims
//! // and contain 2 channels (one per filter). The output layout will
//! // be the same as in the signal.
//! let output = convolution.compute(
//!     // `FeatureMap` wraps `ArrayView4` with information about
//!     // memory layout (which is "channels-last" / NHWC in this case).
//!     FeatureMap::nhwc(&signal),
//!     &filters,
//! )?;
//! assert_eq!(output.shape(), [1, 4, 4, 2]);
//!
//! // For increased efficiency, we may pin the filter memory.
//! // This is especially useful when the same filters are convolved
//! // with multiple signals.
//! let convolution = convolution.with_filters(&filters)?;
//! let new_output = convolution.compute(FeatureMap::nhwc(&signal))?;
//! assert_eq!(output, new_output);
//! # Ok(())
//! # }
//! ```
//!
//! ## Quantized convolution
//!
//! ```
//! use ndarray::Array4;
//! use rand::{Rng, thread_rng};
//! use ocl_convolution::{Convolution, I8Params, FeatureMap, Params};
//!
//! # fn main() -> Result<(), ocl::Error> {
//! const BIT_SHIFT: u8 = 16;
//! let params = I8Params {
//!     common: Params::default(),
//!     // These params are found by profiling; here, they are
//!     // chosen arbitrarily.
//!     bit_shift: BIT_SHIFT,
//!     scale: I8Params::convert_scale(BIT_SHIFT, 0.1),
//!     output_bias: -10,
//!     signal_bias: 20,
//!     filter_bias: -5,
//! };
//! let convolution = Convolution::i8(3)?.build(params)?;
//!
//! // Generate a random signal with 6x6 spatial dims and 3 channels.
//! let mut rng = thread_rng();
//! let signal = Array4::from_shape_fn([1, 6, 6, 3], |_| rng.gen_range(-127..=127));
//! // Construct two 3x3 spatial filters.
//! let filters = Array4::from_shape_fn([2, 3, 3, 3], |_| rng.gen_range(-127..=127));
//! // Perform the convolution. The output must have 4x4 spatial dims
//! // and contain 2 channels (one per filter).
//! let output = convolution.compute(
//!     FeatureMap::nhwc(&signal),
//!     &filters,
//! )?;
//! assert_eq!(output.shape(), [1, 4, 4, 2]);
//! # Ok(())
//! # }
//! ```

#![doc(html_root_url = "https://docs.rs/ocl-convolution/0.3.0")]
#![warn(missing_debug_implementations, missing_docs, bare_trait_objects)]
#![warn(clippy::all, clippy::pedantic)]
#![allow(
    clippy::missing_errors_doc,
    clippy::must_use_candidate,
    clippy::module_name_repetitions,
    clippy::doc_markdown
)]

use ndarray::{Array4, ArrayView4};
use ocl::OclPrm;

use std::{fmt, marker::PhantomData};

mod base;
mod buffers;
mod params;

use crate::{
    base::Base,
    buffers::{Filters, Pinned},
};
pub use crate::{
    base::ConvolutionBuilder,
    buffers::{FeatureMap, FeatureMapShape, Layout},
    params::{I8Params, Params},
};

const SOURCE: &str = include_str!(concat!(env!("OUT_DIR"), "/conv.cl"));

/// Supported element types for convolutions.
pub trait ConvElement: OclPrm + Copy + 'static {
    /// Type of the multiply-add accumulator.
    type Acc: OclPrm + Copy + 'static;
    /// Parameters of the convolution.
    type Params: Copy + Into<Params> + Into<Self::ClParams>;
    /// OpenCL-friendly version of parameters. This is considered an implementation detail.
    type ClParams: OclPrm;
}

impl ConvElement for f32 {
    type Acc = f32;
    type Params = Params;
    type ClParams = params::ClParams;
}

impl ConvElement for i8 {
    type Acc = i32;
    type Params = I8Params;
    type ClParams = params::ClI8Params;
}

impl ConvolutionBuilder<f32> {
    /// Creates a new floating-point convolution.
    pub fn build(&self, params: Params) -> ocl::Result<Convolution<f32>> {
        Base::new(self, params).map(Convolution)
    }
}

impl ConvolutionBuilder<i8> {
    /// Creates a new quantized convolution.
    pub fn build(&self, params: I8Params) -> ocl::Result<Convolution<i8>> {
        Base::new(self, params).map(Convolution)
    }
}

/// Convolution without pinned memory.
pub struct Convolution<T: ConvElement>(Base<PhantomData<T>>);

impl<T> fmt::Debug for Convolution<T>
where
    T: ConvElement,
    T::Params: fmt::Debug,
{
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.debug_tuple("Convolution").field(&self.0).finish()
    }
}

impl Convolution<f32> {
    /// Creates a new floating-point convolution builder. `size` determines the filter size
    /// and must be odd (1, 3, 5, ...).
    ///
    /// # Panics
    ///
    /// Panics if the filter `size` is even.
    pub fn f32(size: u32) -> ocl::Result<ConvolutionBuilder<f32>> {
        ConvolutionBuilder::new(size, &[("KERNEL_TYPE", 32)], SOURCE)
    }
}

/// Quantized convolution over signed 8-bit integers.
///
/// Due to the use of `i8` inputs, computations are performed much faster than with `f32` inputs
/// (the difference is most pronounced on specialized hardware, but it is noticeable in this
/// OpenCL-powered implementation as well).
///
/// ## Connection to real-valued convolution
///
/// Quantized convolution mirrors a real-valued convolution in which the `i8` elements
/// of the signal, filter and output tensors represent real-valued numbers with the
/// following mapping:
///
/// ```
/// let scale: f32 = // ...
/// # 1.0;
/// let bias: i32 = // ...
/// # 0; drop(
/// |x: i8| -> f32 { scale * (i32::from(x) - bias) as f32 }
/// # )
/// ```
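///
/// For example, with `scale == 0.1` and `bias == 20`, the `i8` value `50` represents
/// the real-valued number `0.1 * (50 - 20) == 3.0`.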
///
/// `scale` and `bias` may differ for different tensors; these params are usually determined
/// by *profiling* the corresponding convolutional neural network (see, e.g., [this paper]).
///
/// Denote these quantization params for a tensor `T` as `T.scale` and `T.bias`, and let `S`
/// be the signal, `F` the filter, and `O` the output. Convolution parameters must be set
/// as follows:
///
/// | `I8Params` field | Value     |
/// |------------------|-----------|
/// | `signal_bias`    | `-S.bias` |
/// | `filter_bias`    | `-F.bias` |
/// | `output_bias`    | `O.bias`  |
/// | `scale`          | `S.scale * F.scale / O.scale` |
///
/// `scale` is represented as a fixed-point number with [`bit_shift`] binary digits after
/// the point; for example, with `bit_shift == 16`, a real-valued scale of 0.5 corresponds
/// to the fixed-point value `0.5 * 2**16 == 32768`. Note that filter biases `B` are not
/// transformed during the computation.
///
/// # Computing convolution
///
/// Suppose `S` is the signal and `F` is the filter tensor; both contain `i8` values.
/// The computation is performed as follows (see the sketch after this list):
///
/// 1. Unbias the signal: `S := S + params.signal_bias`.
/// 2. Unbias the filters: `F := F + params.filter_bias`.
/// 3. Compute the "standard" convolution output `O := S (*) F` using `i32` precision.
/// 4. Upscale each number in the output: `O := O * params.scale`.
/// 5. If filter biases `B` are provided, apply them to the output per output channel:
///    `O[f, ..] := O[f, ..] + B[f]`.
/// 6. Downscale the output: `O := round(O / 2**params.bit_shift)`, where `round()` works
///    as floating-point rounding with the default mode (round to nearest, ties to even).
/// 7. Apply the output bias: `O := O + params.output_bias`.
/// 8. Saturate the output to the `i8` range.
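///
/// The sketch below (plain Rust, not the crate's actual OpenCL kernel) mirrors these steps
/// for a single output element; the per-filter bias of step 5 is omitted, the rounding of
/// step 6 is simplified, and all names are illustrative only:
///
/// ```
/// fn quantized_output_element(
///     signal: &[i8],
///     filter: &[i8],
///     scale: i32, // fixed-point, as in `I8Params::scale`
///     bit_shift: u8,
///     signal_bias: i32,
///     filter_bias: i32,
///     output_bias: i32,
/// ) -> i8 {
///     // Steps 1-3: unbias the inputs and accumulate the products in `i32` precision.
///     let acc: i32 = signal
///         .iter()
///         .zip(filter)
///         .map(|(&s, &f)| (i32::from(s) + signal_bias) * (i32::from(f) + filter_bias))
///         .sum();
///     // Step 4: upscale by the fixed-point `scale`.
///     let scaled = i64::from(acc) * i64::from(scale);
///     // Step 6: downscale by `2**bit_shift`, rounding to the nearest integer
///     // (ties are broken upward here for simplicity rather than to even).
///     let rounded = (scaled + (1_i64 << (bit_shift - 1))) >> bit_shift;
///     // Steps 7-8: apply the output bias and saturate to the `i8` range.
///     (rounded as i32 + output_bias).clamp(-128, 127) as i8
/// }
///
/// // 10 * 3 + (-20) * 5 == -70; with a unit scale and zero biases, the output is -70.
/// assert_eq!(quantized_output_element(&[10, -20], &[3, 5], 1 << 16, 16, 0, 0, 0), -70);
/// ```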
///
/// [`bit_shift`]: I8Params::bit_shift
/// [this paper]: https://arxiv.org/abs/1805.00907
impl Convolution<i8> {
    /// Creates a new `i8` convolution builder. `size` determines the filter size
    /// and must be odd (1, 3, 5, ...).
    ///
    /// # Panics
    ///
    /// Panics if the filter `size` is even.
    pub fn i8(size: u32) -> ocl::Result<ConvolutionBuilder<i8>> {
        ConvolutionBuilder::new(size, &[("KERNEL_TYPE", 8)], SOURCE)
    }
}

impl<T: ConvElement> Convolution<T> {
    /// Spatial size of the convolution.
    pub fn size(&self) -> u32 {
        self.0.size()
    }

    /// Returns general parameters of the convolution.
    pub fn params(&self) -> T::Params {
        self.0.params()
    }

    /// Sets convolution parameters.
    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
        self.0.set_params(params)
    }

    /// Returns the convolution with pinned filter memory.
    ///
    /// # Parameters
    ///
    /// - `filters` must have `MxK_HxK_WxC` layout, where `M` is the number of filters,
    ///   `K_H` and `K_W` are spatial dimensions of a filter, `C` is the number of input channels.
    pub fn with_filters<'a>(
        self,
        filters: impl Into<ArrayView4<'a, T>>,
    ) -> ocl::Result<FiltersConvolution<T>> {
        self.0
            .with_filters(filters.into(), None)
            .map(FiltersConvolution)
    }

    /// Returns the convolution with pinned filter / filter bias memory.
    pub fn with_biased_filters<'a>(
        self,
        filters: impl Into<ArrayView4<'a, T>>,
        filter_biases: &[T::Acc],
    ) -> ocl::Result<FiltersConvolution<T>> {
        self.0
            .with_filters(filters.into(), Some(filter_biases))
            .map(FiltersConvolution)
    }

    /// Performs convolution on the provided `signal` and `filters`.
    ///
    /// # Parameters
    ///
    /// - `filters` must have `MxK_HxK_WxC` layout, where `M` is the number of filters,
    ///   `K_H` and `K_W` are spatial dimensions of a filter, `C` is the number of input channels.
    ///
    /// # Return value
    ///
    /// The output will have the same layout as `signal`. An error is returned
    /// if something goes wrong during the OpenCL computation.
    ///
    /// # Panics
    ///
    /// - Panics if `filters` do not have the expected spatial dimensions, i.e.,
    ///   `self.size() x self.size()`.
    /// - Panics if the number of input channels differs from the number of channels in `filters`.
    pub fn compute<'a>(
        &self,
        signal: FeatureMap<'_, T>,
        filters: impl Into<ArrayView4<'a, T>>,
    ) -> ocl::Result<Array4<T>> {
        self.0.compute(signal, filters.into(), None)
    }

    /// Performs convolution on the provided `signal` and `filters`, with the output offset
    /// by the provided per-filter biases.
    ///
    /// Parameters, return value and panics are the same as for [`Self::compute()`].
    pub fn compute_with_biases<'a>(
        &self,
        signal: FeatureMap<'_, T>,
        filters: impl Into<ArrayView4<'a, T>>,
        filter_biases: &[T::Acc],
    ) -> ocl::Result<Array4<T>> {
        self.0.compute(signal, filters.into(), Some(filter_biases))
    }
}

/// Convolution with pinned filter memory. Pinning memory increases efficiency at the cost
/// of making the convolution less flexible.
///
/// `FiltersConvolution` can be created by calling the [`with_filters()`](Convolution::with_filters())
/// or [`with_biased_filters()`](Convolution::with_biased_filters()) methods on `Convolution`.
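///
/// # Examples
///
/// A minimal sketch of the intended workflow (marked `no_run` since it needs an OpenCL
/// device; the shapes and filter values are illustrative):
///
/// ```no_run
/// use ndarray::Array4;
/// use ocl_convolution::{Convolution, FeatureMap, Params};
///
/// # fn main() -> Result<(), ocl::Error> {
/// let convolution = Convolution::f32(3)?.build(Params::default())?;
/// // Pin two 3x3 filters over 3 input channels once...
/// let filters = Array4::from_elem([2, 3, 3, 3], 0.5_f32);
/// let convolution = convolution.with_filters(&filters)?;
/// // ...and reuse them for multiple signals.
/// for _ in 0..5 {
///     let signal = Array4::from_elem([1, 6, 6, 3], 1.0_f32);
///     let output = convolution.compute(FeatureMap::nhwc(&signal))?;
///     assert_eq!(output.shape(), [1, 4, 4, 2]);
/// }
/// # Ok(())
/// # }
/// ```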
pub struct FiltersConvolution<T: ConvElement>(Base<Filters<T>>);

impl<T> fmt::Debug for FiltersConvolution<T>
where
    T: ConvElement,
    T::Params: fmt::Debug,
{
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter
            .debug_tuple("FiltersConvolution")
            .field(&self.0)
            .finish()
    }
}

impl<T: ConvElement> FiltersConvolution<T> {
    /// Spatial size of the convolution.
    pub fn size(&self) -> u32 {
        self.0.size()
    }

    /// Returns general parameters of the convolution.
    pub fn params(&self) -> T::Params {
        self.0.params()
    }

    /// Sets convolution parameters.
    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
        self.0.set_params(params)
    }

    /// Pins signal and output memory for this convolution.
    pub fn pin(self, signal_shape: FeatureMapShape) -> ocl::Result<PinnedConvolution<T>> {
        self.0.pinned(signal_shape).map(PinnedConvolution)
    }

    /// Computes the convolution on the provided signal.
    pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
        self.0.compute(signal)
    }
}

/// Convolution with pinned memory for filters, signal and output. Pinning memory increases
/// efficiency at the cost of making the convolution less flexible.
///
/// `PinnedConvolution` can be created from a [`FiltersConvolution`] by calling
/// [`pin()`](FiltersConvolution::pin()).
pub struct PinnedConvolution<T: ConvElement>(Base<Pinned<T>>);

impl<T> fmt::Debug for PinnedConvolution<T>
where
    T: ConvElement,
    T::Params: fmt::Debug,
{
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter
            .debug_tuple("PinnedConvolution")
            .field(&self.0)
            .finish()
    }
}

impl<T: ConvElement> PinnedConvolution<T> {
    /// Spatial size of the convolution.
    pub fn size(&self) -> u32 {
        self.0.size()
    }

    /// Returns general parameters of the convolution.
    pub fn params(&self) -> T::Params {
        self.0.params()
    }

    /// Sets convolution parameters.
    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
        self.0.set_params(params)
    }

    /// Computes the convolution on the provided signal.
    ///
    /// # Panics
    ///
    /// - Panics if signal dimensions do not agree with the ones provided
    ///   to the [`pin()` method](FiltersConvolution::pin()).
    pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
        self.0.compute(signal)
    }
}

#[cfg(doctest)]
doc_comment::doctest!("../README.md");