pic_scale/
cbcr8.rs

1/*
2 * Copyright (c) Radzivon Bartoshyk. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification,
5 * are permitted provided that the following conditions are met:
6 *
7 * 1.  Redistributions of source code must retain the above copyright notice, this
8 * list of conditions and the following disclaimer.
9 *
10 * 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * this list of conditions and the following disclaimer in the documentation
12 * and/or other materials provided with the distribution.
13 *
14 * 3.  Neither the name of the copyright holder nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29#[cfg(all(target_arch = "x86_64", feature = "avx"))]
30use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp};
31use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass};
32use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8};
33use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights};
34use crate::handler_provider::{
35    handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8,
36};
37#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
38use crate::sse::{convolve_vertical_sse_row, convolve_vertical_sse_row_lp};
39#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
40use crate::wasm32::wasm_vertical_neon_row;
41use crate::{ImageStore, ImageStoreMut};
42
43impl HorizontalConvolutionPass<u8, f32, 2> for ImageStore<'_, u8, 2> {
44    #[allow(clippy::type_complexity)]
45    fn convolve_horizontal(
46        &self,
47        filter_weights: FilterWeights<f32>,
48        destination: &mut ImageStoreMut<u8, 2>,
49        pool: &novtb::ThreadPool,
50        _options: ConvolutionOptions,
51    ) {
52        let _scale_factor = self.width as f32 / destination.width as f32;
53        let mut _dispatcher_4_rows: Option<
54            fn(&[u8], usize, &mut [u8], usize, &FilterWeights<i16>),
55        > = Some(handle_fixed_rows_4_u8::<2>);
56        let mut _dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights<i16>) =
57            handle_fixed_row_u8::<2>;
58        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
59        {
60            #[cfg(feature = "rdm")]
61            if _scale_factor < 8.
62                && std::arch::is_aarch64_feature_detected!("rdm")
63                && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
64            {
65                use crate::neon::{
66                    convolve_horizontal_cbcr_neon_rdm_row,
67                    convolve_horizontal_cbcr_neon_rows_rdm_4_u8,
68                };
69                _dispatcher_4_rows = Some(convolve_horizontal_cbcr_neon_rows_rdm_4_u8);
70                _dispatcher_1_row = convolve_horizontal_cbcr_neon_rdm_row;
71            }
72            #[cfg(feature = "nightly_i8mm")]
73            if _scale_factor < 5.5 && std::arch::is_aarch64_feature_detected!("i8mm") {
74                use crate::neon::{
75                    convolve_horizontal_cbcr_neon_dot_row,
76                    convolve_horizontal_cbcr_neon_rows_dot_4_u8,
77                };
78                use crate::rgba_u8::DefaultWeightsConverterQ7;
79                let dispatcher_4_rows: Option<
80                    fn(&[u8], usize, &mut [u8], usize, &FilterWeights<i8>),
81                > = Some(convolve_horizontal_cbcr_neon_rows_dot_4_u8);
82                let dispatcher_1_row = convolve_horizontal_cbcr_neon_dot_row;
83                return convolve_horizontal_dispatch_u8(
84                    self,
85                    filter_weights,
86                    destination,
87                    pool,
88                    dispatcher_4_rows,
89                    dispatcher_1_row,
90                    DefaultWeightsConverterQ7::default(),
91                );
92            }
93        }
94        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
95        {
96            if std::arch::is_x86_feature_detected!("sse4.1")
97                && _scale_factor < 8.
98                && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
99            {
100                use crate::sse::{
101                    convolve_horizontal_cbcr_sse_hrs_row_one,
102                    convolve_horizontal_cbcr_sse_hrs_rows_4,
103                };
104                _dispatcher_4_rows = Some(convolve_horizontal_cbcr_sse_hrs_rows_4);
105                _dispatcher_1_row = convolve_horizontal_cbcr_sse_hrs_row_one;
106            }
107        }
108        convolve_horizontal_dispatch_u8(
109            self,
110            filter_weights,
111            destination,
112            pool,
113            _dispatcher_4_rows,
114            _dispatcher_1_row,
115            DefaultWeightsConverter::default(),
116        );
117    }
118}
119
120impl VerticalConvolutionPass<u8, f32, 2> for ImageStore<'_, u8, 2> {
121    fn convolve_vertical(
122        &self,
123        filter_weights: FilterWeights<f32>,
124        destination: &mut ImageStoreMut<u8, 2>,
125        pool: &novtb::ThreadPool,
126        _options: ConvolutionOptions,
127    ) {
128        let _scale_factor = self.height as f32 / destination.height as f32;
129        #[allow(clippy::type_complexity)]
130        let mut _dispatcher: fn(usize, &FilterBounds, &[u8], &mut [u8], usize, &[i16]) =
131            handle_fixed_column_u8;
132        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
133        {
134            // For more downscaling better to use more precise version
135            match _options.workload_strategy {
136                crate::WorkloadStrategy::PreferQuality => {
137                    use crate::neon::convolve_vertical_neon_i32_precision_d;
138                    _dispatcher = convolve_vertical_neon_i32_precision_d;
139                }
140                crate::WorkloadStrategy::PreferSpeed => {
141                    // For more downscaling better to use more precise version
142                    #[cfg(feature = "rdm")]
143                    if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") {
144                        use crate::neon::convolve_vertical_neon_i16_precision;
145                        _dispatcher = convolve_vertical_neon_i16_precision;
146                    } else {
147                        use crate::neon::convolve_vertical_neon_i32_precision;
148                        _dispatcher = convolve_vertical_neon_i32_precision;
149                    }
150                    #[cfg(not(feature = "rdm"))]
151                    {
152                        use crate::neon::convolve_vertical_neon_i32_precision;
153                        _dispatcher = convolve_vertical_neon_i32_precision;
154                    }
155                }
156            }
157        }
158        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
159        {
160            if std::arch::is_x86_feature_detected!("sse4.1") {
161                if _scale_factor < 8.
162                    && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
163                {
164                    _dispatcher = convolve_vertical_sse_row_lp;
165                } else {
166                    _dispatcher = convolve_vertical_sse_row;
167                }
168            }
169        }
170        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
171        {
172            if std::arch::is_x86_feature_detected!("avx2") {
173                if _scale_factor < 8.
174                    && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
175                {
176                    _dispatcher = convolve_vertical_avx_row_lp;
177                } else {
178                    _dispatcher = convolve_vertical_avx_row;
179                }
180            }
181        }
182        #[cfg(all(feature = "nightly_avx512", target_arch = "x86_64"))]
183        if std::arch::is_x86_feature_detected!("avx512bw")
184            && _scale_factor < 8.
185            && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
186        {
187            use crate::avx512::convolve_vertical_avx512_row_lp;
188            _dispatcher = convolve_vertical_avx512_row_lp;
189        }
190        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
191        {
192            _dispatcher = wasm_vertical_neon_row;
193        }
194        convolve_vertical_dispatch_u8(
195            self,
196            filter_weights,
197            destination,
198            pool,
199            _dispatcher,
200            DefaultWeightsConverter::default(),
201        );
202    }
203}