#[cfg(all(target_arch = "x86_64", feature = "avx"))]
use crate::avx2::{
convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16,
convolve_vertical_avx_row_f16,
};
use crate::convolution::{
ColumnFilter, ConvolutionOptions, HorizontalFilterPass, RowFilter, VerticalConvolutionPass,
};
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
use crate::filter_weights::WeightsConverter;
use crate::filter_weights::{FilterBounds, FilterWeights};
use crate::floating_point_horizontal::{
convolve_row_handler_floating_point, convolve_row_handler_floating_point_4,
};
use crate::floating_point_vertical::column_handler_floating_point;
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
use crate::neon::{
convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16,
convolve_horizontal_rgba_neon_row_one_f16, convolve_horizontal_rgba_neon_rows_4_f16,
convolve_vertical_rgb_neon_row_f16,
};
#[cfg(all(target_arch = "aarch64", feature = "neon",))]
use crate::neon::{
xconvolve_horizontal_rgb_neon_row_one_f16, xconvolve_horizontal_rgb_neon_rows_4_f16,
xconvolve_horizontal_rgba_neon_row_one_f16, xconvolve_horizontal_rgba_neon_rows_4_f16,
xconvolve_vertical_rgb_neon_row_f16,
};
use crate::plan::{HorizontalFiltering, VerticalFiltering};
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
use crate::sse::{
convolve_horizontal_rgb_sse_row_one_f16, convolve_horizontal_rgb_sse_rows_4_f16,
convolve_horizontal_rgba_sse_row_one_f16, convolve_horizontal_rgba_sse_rows_4_f16,
convolve_vertical_sse_row_f16,
};
use crate::{ImageStore, ThreadingPolicy};
use core::{f16, f32};
use std::sync::Arc;
/// Scalar fallback for the strided (`_4`) horizontal pass over `f16` data with
/// `CN` channels.
///
/// The whole `src` slice is widened to `f32`, handed to
/// `convolve_row_handler_floating_point_4`, and the `f32` result is narrowed
/// back into `dst`. Two temporary `Vec<f32>` buffers are allocated per call,
/// one sized like `src` and one sized like `dst`.
///
/// The trailing `u32` argument is ignored; the constant `8` is forwarded to
/// the handler in its place.
fn convolve_horizontal_rgba_4_row_f16<const CN: usize>(
    src: &[f16],
    src_stride: usize,
    dst: &mut [f16],
    dst_stride: usize,
    filter_weights: &FilterWeights<f32>,
    _: u32,
) {
    // Widen the input so the generic floating point handler can work in f32.
    let widened: Vec<f32> = src.iter().map(|&sample| sample as f32).collect();
    let mut accumulated = vec![0f32; dst.len()];

    convolve_row_handler_floating_point_4::<f32, f32, f32, CN>(
        &widened,
        src_stride,
        &mut accumulated,
        dst_stride,
        filter_weights,
        8,
    );

    // Narrow the f32 results back into the caller's f16 buffer.
    dst.iter_mut()
        .zip(accumulated.iter())
        .for_each(|(out, &value)| *out = value as f16);
}
/// Scalar fallback for the single-row horizontal pass over `f16` data with
/// `CN` channels.
///
/// `src` is widened to `f32`, processed by
/// `convolve_row_handler_floating_point`, and the `f32` output is narrowed
/// back into `dst`. Two temporary `Vec<f32>` buffers are allocated per call.
///
/// The trailing `u32` argument is ignored; the constant `8` is forwarded to
/// the handler in its place.
fn convolve_horizontal_rgb_native_row_f16<const CN: usize>(
    src: &[f16],
    dst: &mut [f16],
    filter_weights: &FilterWeights<f32>,
    _: u32,
) {
    // Widen the input so the generic floating point handler can work in f32.
    let widened: Vec<f32> = src.iter().map(|&sample| sample as f32).collect();
    let mut accumulated = vec![0f32; dst.len()];

    convolve_row_handler_floating_point::<f32, f32, f32, CN>(
        &widened,
        &mut accumulated,
        filter_weights,
        8,
    );

    // Narrow the f32 results back into the caller's f16 buffer.
    dst.iter_mut()
        .zip(accumulated.iter())
        .for_each(|(out, &value)| *out = value as f16);
}
impl HorizontalFilterPass<f16, f32, 4> for ImageStore<'_, f16, 4> {
    /// Selects the horizontal kernels for 4-channel `f16` images.
    ///
    /// The scalar fallbacks are the starting point. Each `cfg`-gated section
    /// below may replace them after a runtime CPU feature check; sections are
    /// applied in source order, so when several x86 sections are compiled in
    /// and their checks pass, the last one (AVX2) wins.
    ///
    /// On aarch64 with `PreferQuality` and the `fhm` CPU feature, the weights
    /// are converted with `WeightFloat16Converter` and the `_fhm` kernels are
    /// returned directly.
    fn horizontal_plan(
        filter_weights: FilterWeights<f32>,
        threading_policy: ThreadingPolicy,
        _options: ConvolutionOptions,
    ) -> Arc<dyn RowFilter<f16, 4> + Send + Sync> {
        type StridedRowsKernel =
            fn(&[f16], usize, &mut [f16], usize, &FilterWeights<f32>, u32);
        type SingleRowKernel = fn(&[f16], &mut [f16], &FilterWeights<f32>, u32);

        // Underscore-prefixed: on targets with no SIMD section compiled in,
        // these are never reassigned.
        let mut _rows_kernel: Option<StridedRowsKernel> =
            Some(convolve_horizontal_rgba_4_row_f16::<4>);
        let mut _row_kernel: SingleRowKernel = convolve_horizontal_rgb_native_row_f16::<4>;

        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            // Baseline NEON kernels, used unless a strategy-specific variant applies.
            _rows_kernel = Some(convolve_horizontal_rgba_neon_rows_4_f16);
            _row_kernel = convolve_horizontal_rgba_neon_row_one_f16;

            match _options.workload_strategy {
                crate::WorkloadStrategy::PreferQuality => {
                    use crate::filter_weights::WeightFloat16Converter;
                    use crate::neon::{
                        convolve_horizontal_rgba_neon_row_one_f16_fhm,
                        convolve_horizontal_rgba_neon_rows_4_f16_fhm,
                    };

                    if std::arch::is_aarch64_feature_detected!("fhm") {
                        // The `_fhm` kernels take converted weights, so the
                        // plan is built and returned here.
                        let converted_weights =
                            WeightFloat16Converter::default().prepare_weights(&filter_weights);
                        return Arc::new(HorizontalFiltering {
                            filter_weights: converted_weights,
                            filter_4_rows: Some(convolve_horizontal_rgba_neon_rows_4_f16_fhm),
                            filter_row: convolve_horizontal_rgba_neon_row_one_f16_fhm,
                            threading_policy,
                        });
                    }
                }
                crate::WorkloadStrategy::PreferSpeed => {
                    if std::arch::is_aarch64_feature_detected!("fp16") {
                        _rows_kernel = Some(xconvolve_horizontal_rgba_neon_rows_4_f16);
                        _row_kernel = xconvolve_horizontal_rgba_neon_row_one_f16;
                    }
                }
            }
        }

        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            if std::arch::is_x86_feature_detected!("sse4.1") {
                _rows_kernel = Some(convolve_horizontal_rgba_sse_rows_4_f16);
                _row_kernel = convolve_horizontal_rgba_sse_row_one_f16;
            }
        }

        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
        {
            let has_avx2_with_f16c = std::arch::is_x86_feature_detected!("avx2")
                && std::arch::is_x86_feature_detected!("f16c");
            if has_avx2_with_f16c {
                // The const generic flag selects the FMA variant of the kernel.
                if std::arch::is_x86_feature_detected!("fma") {
                    _rows_kernel = Some(convolve_horizontal_rgba_avx_rows_4_f16::<true>);
                    _row_kernel = convolve_horizontal_rgba_avx_row_one_f16::<true>;
                } else {
                    _rows_kernel = Some(convolve_horizontal_rgba_avx_rows_4_f16::<false>);
                    _row_kernel = convolve_horizontal_rgba_avx_row_one_f16::<false>;
                }
            }
        }

        Arc::new(HorizontalFiltering {
            filter_weights,
            filter_4_rows: _rows_kernel,
            filter_row: _row_kernel,
            threading_policy,
        })
    }
}
/// Scalar fallback for the vertical pass over `f16` data.
///
/// `src` is widened to `f32`, processed by `column_handler_floating_point`,
/// and the `f32` output is narrowed back into `dst`. Two temporary `Vec<f32>`
/// buffers are allocated per call.
///
/// Both anonymous arguments are ignored: the handler always receives `0` in
/// place of the leading `usize` and `8` in place of the trailing `u32`.
fn convolve_vertical_rgb_native_row_f16(
    _: usize,
    bounds: &FilterBounds,
    src: &[f16],
    dst: &mut [f16],
    src_stride: usize,
    weight: &[f32],
    _: u32,
) {
    // Widen the input so the generic floating point handler can work in f32.
    let widened: Vec<f32> = src.iter().map(|&sample| sample as f32).collect();
    let mut accumulated = vec![0f32; dst.len()];

    column_handler_floating_point::<f32, f32, f32>(
        0,
        bounds,
        &widened,
        &mut accumulated,
        src_stride,
        weight,
        8,
    );

    // Narrow the f32 results back into the caller's f16 buffer.
    dst.iter_mut()
        .zip(accumulated.iter())
        .for_each(|(out, &value)| *out = value as f16);
}
impl VerticalConvolutionPass<f16, f32, 4> for ImageStore<'_, f16, 4> {
    /// Selects the vertical kernel for 4-channel `f16` images.
    ///
    /// The scalar fallback is the starting point. Each `cfg`-gated section
    /// below may replace it after a runtime CPU feature check; sections are
    /// applied in source order, so when both x86 sections are compiled in and
    /// their checks pass, the AVX2 kernel wins over the SSE one.
    ///
    /// On aarch64 with `PreferQuality` and the `fhm` CPU feature, the weights
    /// are converted with `WeightFloat16Converter` and the `_fhm` kernel is
    /// returned directly.
    fn vertical_plan(
        filter_weights: FilterWeights<f32>,
        threading_policy: ThreadingPolicy,
        _options: ConvolutionOptions,
    ) -> Arc<dyn ColumnFilter<f16, 4> + Send + Sync> {
        type ColumnKernel = fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32], u32);

        // Underscore-prefixed: on targets with no SIMD section compiled in,
        // this is never reassigned.
        let mut _column_kernel: ColumnKernel = convolve_vertical_rgb_native_row_f16;

        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            // Baseline NEON kernel, used unless a strategy-specific variant applies.
            _column_kernel = convolve_vertical_rgb_neon_row_f16;

            match _options.workload_strategy {
                crate::WorkloadStrategy::PreferSpeed => {
                    if std::arch::is_aarch64_feature_detected!("fp16") {
                        _column_kernel = xconvolve_vertical_rgb_neon_row_f16;
                    }
                }
                crate::WorkloadStrategy::PreferQuality => {
                    if std::arch::is_aarch64_feature_detected!("fhm") {
                        use crate::filter_weights::WeightFloat16Converter;
                        use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm;

                        // The `_fhm` kernel takes converted weights, so the
                        // plan is built and returned here.
                        let converted_weights =
                            WeightFloat16Converter::default().prepare_weights(&filter_weights);
                        return Arc::new(VerticalFiltering {
                            filter_weights: converted_weights,
                            filter_row: convolve_vertical_rgb_neon_row_f16_fhm,
                            threading_policy,
                        });
                    }
                }
            }
        }

        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            if std::arch::is_x86_feature_detected!("sse4.1") {
                _column_kernel = convolve_vertical_sse_row_f16;
            }
        }

        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
        {
            let has_avx2_with_f16c = std::arch::is_x86_feature_detected!("avx2")
                && std::arch::is_x86_feature_detected!("f16c");
            if has_avx2_with_f16c {
                // The const generic flag selects the FMA variant of the kernel.
                if std::arch::is_x86_feature_detected!("fma") {
                    _column_kernel = convolve_vertical_avx_row_f16::<true>;
                } else {
                    _column_kernel = convolve_vertical_avx_row_f16::<false>;
                }
            }
        }

        Arc::new(VerticalFiltering {
            filter_weights,
            filter_row: _column_kernel,
            threading_policy,
        })
    }
}
impl HorizontalFilterPass<f16, f32, 3> for ImageStore<'_, f16, 3> {
    /// Selects the horizontal kernels for 3-channel `f16` images.
    ///
    /// The scalar fallbacks are the starting point. On aarch64 with the
    /// `neon` feature they are replaced by NEON kernels, with a
    /// strategy-specific variant when the matching CPU feature is detected at
    /// runtime. On x86/x86_64 with the `sse` feature they are replaced by the
    /// SSE kernels when `sse4.1` is detected. This impl wires up no AVX
    /// kernels.
    ///
    /// On aarch64 with `PreferQuality` and the `fhm` CPU feature, the weights
    /// are converted with `WeightFloat16Converter` and the `_fhm` kernels are
    /// returned directly.
    fn horizontal_plan(
        filter_weights: FilterWeights<f32>,
        threading_policy: ThreadingPolicy,
        _options: ConvolutionOptions,
    ) -> Arc<dyn RowFilter<f16, 3> + Send + Sync> {
        #[allow(clippy::type_complexity)]
        let mut _dispatcher_4_rows: Option<
            fn(&[f16], usize, &mut [f16], usize, &FilterWeights<f32>, u32),
        > = Some(convolve_horizontal_rgba_4_row_f16::<3>);
        #[allow(clippy::type_complexity)]
        let mut _dispatcher_row: fn(&[f16], &mut [f16], &FilterWeights<f32>, u32) =
            convolve_horizontal_rgb_native_row_f16::<3>;

        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            // Baseline NEON kernels, used unless a strategy-specific variant applies.
            _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4_f16);
            _dispatcher_row = convolve_horizontal_rgb_neon_row_one_f16;
            match _options.workload_strategy {
                crate::WorkloadStrategy::PreferQuality => {
                    if std::arch::is_aarch64_feature_detected!("fhm") {
                        use crate::filter_weights::WeightFloat16Converter;
                        use crate::neon::{
                            convolve_horizontal_rgb_neon_row_one_f16_fhm,
                            convolve_horizontal_rgb_neon_rows_4_f16_fhm,
                        };
                        // The `_fhm` kernels take converted weights, so the
                        // plan is built and returned here.
                        let weights =
                            WeightFloat16Converter::default().prepare_weights(&filter_weights);
                        return Arc::new(HorizontalFiltering {
                            filter_weights: weights,
                            filter_4_rows: Some(convolve_horizontal_rgb_neon_rows_4_f16_fhm),
                            filter_row: convolve_horizontal_rgb_neon_row_one_f16_fhm,
                            threading_policy,
                        });
                    }
                }
                crate::WorkloadStrategy::PreferSpeed => {
                    // The match arm already establishes the strategy, so only
                    // the CPU feature needs checking here.
                    if std::arch::is_aarch64_feature_detected!("fp16") {
                        _dispatcher_4_rows = Some(xconvolve_horizontal_rgb_neon_rows_4_f16);
                        _dispatcher_row = xconvolve_horizontal_rgb_neon_row_one_f16;
                    }
                }
            }
        }

        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            if std::arch::is_x86_feature_detected!("sse4.1") {
                _dispatcher_4_rows = Some(convolve_horizontal_rgb_sse_rows_4_f16);
                _dispatcher_row = convolve_horizontal_rgb_sse_row_one_f16;
            }
        }

        Arc::new(HorizontalFiltering {
            filter_weights,
            filter_4_rows: _dispatcher_4_rows,
            filter_row: _dispatcher_row,
            threading_policy,
        })
    }
}
impl VerticalConvolutionPass<f16, f32, 3> for ImageStore<'_, f16, 3> {
    /// Selects the vertical kernel for 3-channel `f16` images.
    ///
    /// The scalar fallback is the starting point. Each `cfg`-gated section
    /// below may replace it after a runtime CPU feature check; sections are
    /// applied in source order, so when both x86 sections are compiled in and
    /// their checks pass, the AVX2 kernel wins over the SSE one.
    ///
    /// On aarch64 with `PreferQuality` and the `fhm` CPU feature, the weights
    /// are converted with `WeightFloat16Converter` and the `_fhm` kernel is
    /// returned directly.
    fn vertical_plan(
        filter_weights: FilterWeights<f32>,
        threading_policy: ThreadingPolicy,
        _options: ConvolutionOptions,
    ) -> Arc<dyn ColumnFilter<f16, 3> + Send + Sync> {
        type ColumnKernel = fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32], u32);

        // Underscore-prefixed: on targets with no SIMD section compiled in,
        // this is never reassigned.
        let mut _column_kernel: ColumnKernel = convolve_vertical_rgb_native_row_f16;

        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            // Baseline NEON kernel, used unless a strategy-specific variant applies.
            _column_kernel = convolve_vertical_rgb_neon_row_f16;

            match _options.workload_strategy {
                crate::WorkloadStrategy::PreferSpeed => {
                    if std::arch::is_aarch64_feature_detected!("fp16") {
                        _column_kernel = xconvolve_vertical_rgb_neon_row_f16;
                    }
                }
                crate::WorkloadStrategy::PreferQuality => {
                    if std::arch::is_aarch64_feature_detected!("fhm") {
                        use crate::filter_weights::WeightFloat16Converter;
                        use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm;

                        // The `_fhm` kernel takes converted weights, so the
                        // plan is built and returned here.
                        let converted_weights =
                            WeightFloat16Converter::default().prepare_weights(&filter_weights);
                        return Arc::new(VerticalFiltering {
                            filter_weights: converted_weights,
                            filter_row: convolve_vertical_rgb_neon_row_f16_fhm,
                            threading_policy,
                        });
                    }
                }
            }
        }

        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            if std::arch::is_x86_feature_detected!("sse4.1") {
                _column_kernel = convolve_vertical_sse_row_f16;
            }
        }

        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
        {
            let has_avx2_with_f16c = std::arch::is_x86_feature_detected!("avx2")
                && std::arch::is_x86_feature_detected!("f16c");
            if has_avx2_with_f16c {
                // The const generic flag selects the FMA variant of the kernel.
                if std::arch::is_x86_feature_detected!("fma") {
                    _column_kernel = convolve_vertical_avx_row_f16::<true>;
                } else {
                    _column_kernel = convolve_vertical_avx_row_f16::<false>;
                }
            }
        }

        Arc::new(VerticalFiltering {
            filter_weights,
            filter_row: _column_kernel,
            threading_policy,
        })
    }
}
impl HorizontalFilterPass<f16, f32, 1> for ImageStore<'_, f16, 1> {
    /// Builds the horizontal plan for single-channel `f16` images.
    ///
    /// Only the scalar fallbacks are used here: this impl performs no CPU
    /// feature detection and ignores the convolution options.
    fn horizontal_plan(
        filter_weights: FilterWeights<f32>,
        threading_policy: ThreadingPolicy,
        _: ConvolutionOptions,
    ) -> Arc<dyn RowFilter<f16, 1> + Send + Sync> {
        type StridedRowsKernel =
            fn(&[f16], usize, &mut [f16], usize, &FilterWeights<f32>, u32);
        type SingleRowKernel = fn(&[f16], &mut [f16], &FilterWeights<f32>, u32);

        let rows_kernel: Option<StridedRowsKernel> =
            Some(convolve_horizontal_rgba_4_row_f16::<1>);
        let row_kernel: SingleRowKernel = convolve_horizontal_rgb_native_row_f16::<1>;

        Arc::new(HorizontalFiltering {
            filter_weights,
            filter_4_rows: rows_kernel,
            filter_row: row_kernel,
            threading_policy,
        })
    }
}
impl VerticalConvolutionPass<f16, f32, 1> for ImageStore<'_, f16, 1> {
    /// Selects the vertical kernel for single-channel `f16` images.
    ///
    /// The scalar fallback is the starting point. Each `cfg`-gated section
    /// below may replace it after a runtime CPU feature check; sections are
    /// applied in source order, so when both x86 sections are compiled in and
    /// their checks pass, the AVX2 kernel wins over the SSE one.
    ///
    /// On aarch64 with `PreferQuality` and the `fhm` CPU feature, the weights
    /// are converted with `WeightFloat16Converter` and the `_fhm` kernel is
    /// returned directly.
    fn vertical_plan(
        filter_weights: FilterWeights<f32>,
        threading_policy: ThreadingPolicy,
        _options: ConvolutionOptions,
    ) -> Arc<dyn ColumnFilter<f16, 1> + Send + Sync> {
        type ColumnKernel = fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32], u32);

        // Underscore-prefixed: on targets with no SIMD section compiled in,
        // this is never reassigned.
        let mut _column_kernel: ColumnKernel = convolve_vertical_rgb_native_row_f16;

        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            // Baseline NEON kernel, used unless a strategy-specific variant applies.
            _column_kernel = convolve_vertical_rgb_neon_row_f16;

            match _options.workload_strategy {
                crate::WorkloadStrategy::PreferSpeed => {
                    if std::arch::is_aarch64_feature_detected!("fp16") {
                        _column_kernel = xconvolve_vertical_rgb_neon_row_f16;
                    }
                }
                crate::WorkloadStrategy::PreferQuality => {
                    if std::arch::is_aarch64_feature_detected!("fhm") {
                        use crate::filter_weights::WeightFloat16Converter;
                        use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm;

                        // The `_fhm` kernel takes converted weights, so the
                        // plan is built and returned here.
                        let converted_weights =
                            WeightFloat16Converter::default().prepare_weights(&filter_weights);
                        return Arc::new(VerticalFiltering {
                            filter_weights: converted_weights,
                            filter_row: convolve_vertical_rgb_neon_row_f16_fhm,
                            threading_policy,
                        });
                    }
                }
            }
        }

        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            if std::arch::is_x86_feature_detected!("sse4.1") {
                _column_kernel = convolve_vertical_sse_row_f16;
            }
        }

        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
        {
            let has_avx2_with_f16c = std::arch::is_x86_feature_detected!("avx2")
                && std::arch::is_x86_feature_detected!("f16c");
            if has_avx2_with_f16c {
                // The const generic flag selects the FMA variant of the kernel.
                if std::arch::is_x86_feature_detected!("fma") {
                    _column_kernel = convolve_vertical_avx_row_f16::<true>;
                } else {
                    _column_kernel = convolve_vertical_avx_row_f16::<false>;
                }
            }
        }

        Arc::new(VerticalFiltering {
            filter_weights,
            filter_row: _column_kernel,
            threading_policy,
        })
    }
}
impl HorizontalFilterPass<f16, f32, 2> for ImageStore<'_, f16, 2> {
    /// Builds the horizontal plan for 2-channel `f16` images.
    ///
    /// Only the scalar fallbacks are used here: this impl performs no CPU
    /// feature detection and ignores the convolution options.
    fn horizontal_plan(
        filter_weights: FilterWeights<f32>,
        threading_policy: ThreadingPolicy,
        _: ConvolutionOptions,
    ) -> Arc<dyn RowFilter<f16, 2> + Send + Sync> {
        type StridedRowsKernel =
            fn(&[f16], usize, &mut [f16], usize, &FilterWeights<f32>, u32);
        type SingleRowKernel = fn(&[f16], &mut [f16], &FilterWeights<f32>, u32);

        let rows_kernel: Option<StridedRowsKernel> =
            Some(convolve_horizontal_rgba_4_row_f16::<2>);
        let row_kernel: SingleRowKernel = convolve_horizontal_rgb_native_row_f16::<2>;

        Arc::new(HorizontalFiltering {
            filter_weights,
            filter_4_rows: rows_kernel,
            filter_row: row_kernel,
            threading_policy,
        })
    }
}
impl VerticalConvolutionPass<f16, f32, 2> for ImageStore<'_, f16, 2> {
    /// Selects the vertical kernel for 2-channel `f16` images.
    ///
    /// The scalar fallback is the starting point. Each `cfg`-gated section
    /// below may replace it after a runtime CPU feature check; sections are
    /// applied in source order, so when both x86 sections are compiled in and
    /// their checks pass, the AVX2 kernel wins over the SSE one.
    ///
    /// On aarch64 with `PreferQuality` and the `fhm` CPU feature, the weights
    /// are converted with `WeightFloat16Converter` and the `_fhm` kernel is
    /// returned directly. Without `fhm`, the plain NEON kernel is used.
    fn vertical_plan(
        filter_weights: FilterWeights<f32>,
        threading_policy: ThreadingPolicy,
        _options: ConvolutionOptions,
    ) -> Arc<dyn ColumnFilter<f16, 2> + Send + Sync> {
        #[allow(clippy::type_complexity)]
        let mut _dispatcher: fn(
            usize,
            &FilterBounds,
            &[f16],
            &mut [f16],
            usize,
            &[f32],
            u32,
        ) = convolve_vertical_rgb_native_row_f16;

        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            // Baseline NEON kernel, used unless a strategy-specific variant applies.
            _dispatcher = convolve_vertical_rgb_neon_row_f16;
            match _options.workload_strategy {
                crate::WorkloadStrategy::PreferQuality => {
                    use crate::filter_weights::WeightFloat16Converter;
                    use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm;
                    // Only hand out the `_fhm` kernel when the CPU reports
                    // `fhm` at runtime, as the other channel counts in this
                    // file do; otherwise keep the plain NEON kernel selected
                    // above.
                    if std::arch::is_aarch64_feature_detected!("fhm") {
                        let weights =
                            WeightFloat16Converter::default().prepare_weights(&filter_weights);
                        return Arc::new(VerticalFiltering {
                            filter_weights: weights,
                            filter_row: convolve_vertical_rgb_neon_row_f16_fhm,
                            threading_policy,
                        });
                    }
                }
                crate::WorkloadStrategy::PreferSpeed => {
                    if std::arch::is_aarch64_feature_detected!("fp16") {
                        _dispatcher = xconvolve_vertical_rgb_neon_row_f16;
                    }
                }
            }
        }

        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            if std::arch::is_x86_feature_detected!("sse4.1") {
                _dispatcher = convolve_vertical_sse_row_f16;
            }
        }

        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
        {
            let is_f16c_available = std::arch::is_x86_feature_detected!("f16c");
            let is_fma_available = std::arch::is_x86_feature_detected!("fma");
            if std::arch::is_x86_feature_detected!("avx2") && is_f16c_available {
                // The const generic flag selects the FMA variant of the kernel.
                _dispatcher = convolve_vertical_avx_row_f16::<false>;
                if is_fma_available {
                    _dispatcher = convolve_vertical_avx_row_f16::<true>;
                }
            }
        }

        Arc::new(VerticalFiltering {
            filter_weights,
            filter_row: _dispatcher,
            threading_policy,
        })
    }
}