1#[cfg(all(target_arch = "x86_64", feature = "avx"))]
30use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp};
31use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass};
32use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8};
33use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights};
34use crate::handler_provider::{
35 handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8,
36};
37#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
38use crate::sse::{convolve_vertical_sse_row, convolve_vertical_sse_row_lp};
39#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
40use crate::wasm32::wasm_vertical_neon_row;
41use crate::{ImageStore, ImageStoreMut};
42
43impl HorizontalConvolutionPass<u8, f32, 2> for ImageStore<'_, u8, 2> {
44 #[allow(clippy::type_complexity)]
45 fn convolve_horizontal(
46 &self,
47 filter_weights: FilterWeights<f32>,
48 destination: &mut ImageStoreMut<u8, 2>,
49 pool: &novtb::ThreadPool,
50 _options: ConvolutionOptions,
51 ) {
52 let _scale_factor = self.width as f32 / destination.width as f32;
53 let mut _dispatcher_4_rows: Option<
54 fn(&[u8], usize, &mut [u8], usize, &FilterWeights<i16>),
55 > = Some(handle_fixed_rows_4_u8::<2>);
56 let mut _dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights<i16>) =
57 handle_fixed_row_u8::<2>;
58 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
59 {
60 #[cfg(feature = "rdm")]
61 if _scale_factor < 8.
62 && std::arch::is_aarch64_feature_detected!("rdm")
63 && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
64 {
65 use crate::neon::{
66 convolve_horizontal_cbcr_neon_rdm_row,
67 convolve_horizontal_cbcr_neon_rows_rdm_4_u8,
68 };
69 _dispatcher_4_rows = Some(convolve_horizontal_cbcr_neon_rows_rdm_4_u8);
70 _dispatcher_1_row = convolve_horizontal_cbcr_neon_rdm_row;
71 }
72 #[cfg(feature = "nightly_i8mm")]
73 if _scale_factor < 5.5 && std::arch::is_aarch64_feature_detected!("i8mm") {
74 use crate::neon::{
75 convolve_horizontal_cbcr_neon_dot_row,
76 convolve_horizontal_cbcr_neon_rows_dot_4_u8,
77 };
78 use crate::rgba_u8::DefaultWeightsConverterQ7;
79 let dispatcher_4_rows: Option<
80 fn(&[u8], usize, &mut [u8], usize, &FilterWeights<i8>),
81 > = Some(convolve_horizontal_cbcr_neon_rows_dot_4_u8);
82 let dispatcher_1_row = convolve_horizontal_cbcr_neon_dot_row;
83 return convolve_horizontal_dispatch_u8(
84 self,
85 filter_weights,
86 destination,
87 pool,
88 dispatcher_4_rows,
89 dispatcher_1_row,
90 DefaultWeightsConverterQ7::default(),
91 );
92 }
93 }
94 #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
95 {
96 if std::arch::is_x86_feature_detected!("sse4.1")
97 && _scale_factor < 8.
98 && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
99 {
100 use crate::sse::{
101 convolve_horizontal_cbcr_sse_hrs_row_one,
102 convolve_horizontal_cbcr_sse_hrs_rows_4,
103 };
104 _dispatcher_4_rows = Some(convolve_horizontal_cbcr_sse_hrs_rows_4);
105 _dispatcher_1_row = convolve_horizontal_cbcr_sse_hrs_row_one;
106 }
107 }
108 convolve_horizontal_dispatch_u8(
109 self,
110 filter_weights,
111 destination,
112 pool,
113 _dispatcher_4_rows,
114 _dispatcher_1_row,
115 DefaultWeightsConverter::default(),
116 );
117 }
118}
119
120impl VerticalConvolutionPass<u8, f32, 2> for ImageStore<'_, u8, 2> {
121 fn convolve_vertical(
122 &self,
123 filter_weights: FilterWeights<f32>,
124 destination: &mut ImageStoreMut<u8, 2>,
125 pool: &novtb::ThreadPool,
126 _options: ConvolutionOptions,
127 ) {
128 let _scale_factor = self.height as f32 / destination.height as f32;
129 #[allow(clippy::type_complexity)]
130 let mut _dispatcher: fn(usize, &FilterBounds, &[u8], &mut [u8], usize, &[i16]) =
131 handle_fixed_column_u8;
132 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
133 {
134 match _options.workload_strategy {
136 crate::WorkloadStrategy::PreferQuality => {
137 use crate::neon::convolve_vertical_neon_i32_precision_d;
138 _dispatcher = convolve_vertical_neon_i32_precision_d;
139 }
140 crate::WorkloadStrategy::PreferSpeed => {
141 #[cfg(feature = "rdm")]
143 if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") {
144 use crate::neon::convolve_vertical_neon_i16_precision;
145 _dispatcher = convolve_vertical_neon_i16_precision;
146 } else {
147 use crate::neon::convolve_vertical_neon_i32_precision;
148 _dispatcher = convolve_vertical_neon_i32_precision;
149 }
150 #[cfg(not(feature = "rdm"))]
151 {
152 use crate::neon::convolve_vertical_neon_i32_precision;
153 _dispatcher = convolve_vertical_neon_i32_precision;
154 }
155 }
156 }
157 }
158 #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
159 {
160 if std::arch::is_x86_feature_detected!("sse4.1") {
161 if _scale_factor < 8.
162 && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
163 {
164 _dispatcher = convolve_vertical_sse_row_lp;
165 } else {
166 _dispatcher = convolve_vertical_sse_row;
167 }
168 }
169 }
170 #[cfg(all(target_arch = "x86_64", feature = "avx"))]
171 {
172 if std::arch::is_x86_feature_detected!("avx2") {
173 if _scale_factor < 8.
174 && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
175 {
176 _dispatcher = convolve_vertical_avx_row_lp;
177 } else {
178 _dispatcher = convolve_vertical_avx_row;
179 }
180 }
181 }
182 #[cfg(all(feature = "nightly_avx512", target_arch = "x86_64"))]
183 if std::arch::is_x86_feature_detected!("avx512bw")
184 && _scale_factor < 8.
185 && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed
186 {
187 use crate::avx512::convolve_vertical_avx512_row_lp;
188 _dispatcher = convolve_vertical_avx512_row_lp;
189 }
190 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
191 {
192 _dispatcher = wasm_vertical_neon_row;
193 }
194 convolve_vertical_dispatch_u8(
195 self,
196 filter_weights,
197 destination,
198 pool,
199 _dispatcher,
200 DefaultWeightsConverter::default(),
201 );
202 }
203}