#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
mod avx2;
mod rust;
#[cfg(test)]
mod tests;
use std::{
f32::consts::PI,
mem::{MaybeUninit, size_of},
num::{NonZeroU8, NonZeroUsize},
};
use crate::util::{Pixel, assume_init_vec, uninit_vec};
use safefma::Fma;
use semisafe::slice::get as semisafe_get;
use semisafe::slice::get_mut as semisafe_get_mut;
/// Precomputed fixed-point blending windows for overlapped blocks.
///
/// Nine window tables of `nx * ny` `u16` weights each are stored back to back
/// in a single vector. The tables are the products of three vertical profiles
/// (`first`, regular, `last`) with three horizontal profiles (`first`, regular,
/// `last`); see [`OverlapWindows::new`] for the exact ordering.
#[derive(Debug, Clone)]
pub struct OverlapWindows {
/// Number of weights along the horizontal axis of one window table.
nx: NonZeroUsize,
/// Number of weights along the vertical axis of one window table.
ny: NonZeroUsize,
/// The nine window tables, concatenated; total length is `nx * ny * 9`.
overlap_9_windows: Vec<u16>,
}
impl OverlapWindows {
    /// Factor every floating-point weight is multiplied by before it is stored as `u16`.
    const WINDOW_SCALE: f32 = 2048.0;
    /// Number of window tables kept in `overlap_9_windows` (3 vertical x 3 horizontal profiles).
    const WINDOW_VARIANTS: usize = 9;

    /// Builds the nine window tables for a block of `nx` x `ny` weights.
    ///
    /// `ox` and `oy` are the horizontal and vertical overlap lengths; each is
    /// clamped to the corresponding block dimension by `build_axis_windows`.
    ///
    /// Table `3 * v + h` holds `vertical[v] * horizontal[h]` for every cell, where
    /// both profile lists are ordered `[first, regular, last]`. Inside a table the
    /// weights are stored row by row (`row * nx + column`).
    #[must_use]
    #[inline]
    pub fn new(nx: NonZeroUsize, ny: NonZeroUsize, ox: usize, oy: usize) -> Self {
        let cols = nx.get();
        let rows = ny.get();
        let table_len = cols * rows;

        let (x_both, x_first, x_last) = Self::build_axis_windows(cols, ox);
        let (y_both, y_first, y_last) = Self::build_axis_windows(rows, oy);

        let mut tables: Vec<MaybeUninit<u16>> = uninit_vec(table_len * Self::WINDOW_VARIANTS);

        let y_profiles = y_first.iter().zip(&y_both).zip(&y_last);
        for (row, ((&wy_first, &wy_both), &wy_last)) in y_profiles.enumerate() {
            let vertical = [wy_first, wy_both, wy_last];
            let row_start = row * cols;

            let x_profiles = x_first.iter().zip(&x_both).zip(&x_last);
            for (col, ((&wx_first, &wx_both), &wx_last)) in x_profiles.enumerate() {
                let horizontal = [wx_first, wx_both, wx_last];
                let cell = row_start + col;

                // Fill the same cell in each of the nine tables.
                for (v, &wy) in vertical.iter().enumerate() {
                    for (h, &wx) in horizontal.iter().enumerate() {
                        let table = v * horizontal.len() + h;
                        semisafe_get_mut(&mut tables, table * table_len + cell)
                            .write(Self::quantize_window(wy * wx));
                    }
                }
            }
        }

        // SAFETY: the axis profiles have exactly `rows` and `cols` entries, so the
        // loops above visit every cell of every one of the `WINDOW_VARIANTS` tables
        // and write each element of `tables` exactly once.
        let overlap_9_windows = unsafe { assume_init_vec(tables) };

        Self {
            nx,
            ny,
            overlap_9_windows,
        }
    }

    /// Builds the three 1-D weight profiles for one axis.
    ///
    /// Returns `(regular, first, last)`, each `length` entries long and initially
    /// all `1.0`:
    /// * `regular` gets a squared-cosine ramp over its first and its last
    ///   `overlap` entries,
    /// * `first` only gets the ramp at its end,
    /// * `last` only gets the ramp at its start.
    ///
    /// `overlap` is clamped to `length`; with an overlap of zero all three
    /// profiles stay flat.
    #[must_use]
    fn build_axis_windows(length: usize, overlap: usize) -> (Vec<f32>, Vec<f32>, Vec<f32>) {
        let ramp = overlap.min(length);
        let mut regular: Vec<f32> = vec![1.0; length];
        let mut first: Vec<f32> = vec![1.0; length];
        let mut last: Vec<f32> = vec![1.0; length];

        if ramp != 0 {
            let ramp_f = ramp as f32;
            let len_f = length as f32;
            let period = ramp_f * 2.0;

            // Leading ramp: shared by the regular and the `last` profile.
            for pos in 0..ramp {
                let weight = Self::cosine_squared((pos as f32 - ramp_f + 0.5) / period);
                *semisafe_get_mut(&mut regular, pos) = weight;
                *semisafe_get_mut(&mut last, pos) = weight;
            }

            // Trailing ramp: shared by the regular and the `first` profile. It runs
            // second so that it wins in `regular` wherever the two ramps overlap.
            let tail_start = length - ramp;
            for pos in tail_start..length {
                let weight = Self::cosine_squared((pos as f32 - len_f + ramp_f + 0.5) / period);
                *semisafe_get_mut(&mut regular, pos) = weight;
                *semisafe_get_mut(&mut first, pos) = weight;
            }
        }

        (regular, first, last)
    }

    /// Returns `cos(PI * value)` squared.
    #[must_use]
    fn cosine_squared(value: f32) -> f32 {
        let angle = PI * value;
        let cosine = angle.cos();
        cosine * cosine
    }

    /// Converts a floating-point weight to its stored `u16` form.
    ///
    /// The weight is combined with `WINDOW_SCALE` and `0.5` through `Fma::fma`
    /// (presumably `value * WINDOW_SCALE + 0.5` -- confirm against `safefma`) and
    /// the result is cast to `u16`.
    #[must_use]
    fn quantize_window(value: f32) -> u16 {
        let scaled = value.fma(Self::WINDOW_SCALE, 0.5);
        scaled as u16
    }

    /// Number of weights in a single window table (`nx * ny`).
    #[must_use]
    const fn size(&self) -> usize {
        let cols = self.nx.get();
        let rows = self.ny.get();
        cols * rows
    }

    /// Returns the window storage starting at table `i`.
    ///
    /// The slice begins at the first weight of table `i` and is NOT truncated to
    /// one table: it runs to the end of the storage and therefore also contains
    /// every later table.
    #[must_use]
    #[inline]
    pub fn get_window(&self, i: usize) -> &[u16] {
        let start = self.size() * i;
        semisafe_get(&self.overlap_9_windows, start..)
    }
}
/// Signature shared by every overlap kernel returned by [`select_overlaps`].
///
/// The block width and height are not parameters: they are fixed per kernel
/// through const generics when the kernel is selected.
///
/// Parameter roles below are taken from the parameter names only; the kernels
/// themselves live in the `rust` / `avx2` submodules -- confirm there.
/// * `dest` / `dest_stride_bytes`: destination buffer and its row stride in bytes.
/// * `src` / `src_stride_bytes`: source buffer and its row stride in bytes.
/// * `window` / `window_stride`: `u16` window weights and their row stride
///   (presumably in elements, not bytes -- TODO confirm).
///
/// # Safety
///
/// The pointers are raw, so callers are responsible for whatever validity and
/// size requirements the selected kernel has.
pub type OverlapsFn = unsafe fn(
dest: *mut u8,
dest_stride_bytes: NonZeroUsize,
src: *const u8,
src_stride_bytes: NonZeroUsize,
window: *const u16,
window_stride: NonZeroUsize,
);
/// Picks the overlap kernel for pixel type `T` and a `width` x `height` block.
///
/// Dispatch is keyed on `(size_of::<T>(), width, height)`. When the crate is
/// built for `x86_64` with the `avx2` feature and `crate::util::has_avx2()`
/// returns true, the AVX2 table is consulted first; any combination it does not
/// list (including the arms that are compiled out without the `experimental`
/// feature) falls through to the portable `rust` table below.
///
/// # Panics
///
/// Panics if the combination is not listed in the portable table either.
#[must_use]
#[inline]
pub fn select_overlaps<T: Pixel>(width: NonZeroUsize, height: NonZeroUsize) -> OverlapsFn {
#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
if crate::util::has_avx2() {
match (size_of::<T>(), width.get(), height.get()) {
// 1-byte pixels.
(1, 2, 2) => return avx2::overlaps_u8::<2, 2>,
(1, 2, 4) => return avx2::overlaps_u8::<2, 4>,
(1, 4, 2) => return avx2::overlaps_u8::<4, 2>,
(1, 4, 4) => return avx2::overlaps_u8::<4, 4>,
(1, 4, 8) => return avx2::overlaps_u8::<4, 8>,
#[cfg(feature = "experimental")]
(1, 8, 1) => return avx2::overlaps_u8::<8, 1>,
#[cfg(feature = "experimental")]
(1, 8, 2) => return avx2::overlaps_u8::<8, 2>,
#[cfg(feature = "experimental")]
(1, 8, 4) => return avx2::overlaps_u8::<8, 4>,
#[cfg(feature = "experimental")]
(1, 8, 8) => return avx2::overlaps_u8::<8, 8>,
#[cfg(feature = "experimental")]
(1, 8, 16) => return avx2::overlaps_u8::<8, 16>,
(1, 16, 1) => return avx2::overlaps_u8::<16, 1>,
(1, 16, 2) => return avx2::overlaps_u8::<16, 2>,
(1, 16, 4) => return avx2::overlaps_u8::<16, 4>,
(1, 16, 8) => return avx2::overlaps_u8::<16, 8>,
(1, 16, 16) => return avx2::overlaps_u8::<16, 16>,
(1, 16, 32) => return avx2::overlaps_u8::<16, 32>,
(1, 32, 8) => return avx2::overlaps_u8::<32, 8>,
(1, 32, 16) => return avx2::overlaps_u8::<32, 16>,
(1, 32, 32) => return avx2::overlaps_u8::<32, 32>,
(1, 32, 64) => return avx2::overlaps_u8::<32, 64>,
(1, 64, 16) => return avx2::overlaps_u8::<64, 16>,
(1, 64, 32) => return avx2::overlaps_u8::<64, 32>,
(1, 64, 64) => return avx2::overlaps_u8::<64, 64>,
(1, 64, 128) => return avx2::overlaps_u8::<64, 128>,
(1, 128, 32) => return avx2::overlaps_u8::<128, 32>,
(1, 128, 64) => return avx2::overlaps_u8::<128, 64>,
(1, 128, 128) => return avx2::overlaps_u8::<128, 128>,
// 2-byte pixels.
(2, 2, 2) => return avx2::overlaps_u16::<2, 2>,
(2, 2, 4) => return avx2::overlaps_u16::<2, 4>,
(2, 4, 2) => return avx2::overlaps_u16::<4, 2>,
(2, 4, 4) => return avx2::overlaps_u16::<4, 4>,
(2, 4, 8) => return avx2::overlaps_u16::<4, 8>,
#[cfg(feature = "experimental")]
(2, 8, 1) => return avx2::overlaps_u16::<8, 1>,
#[cfg(feature = "experimental")]
(2, 8, 2) => return avx2::overlaps_u16::<8, 2>,
#[cfg(feature = "experimental")]
(2, 8, 4) => return avx2::overlaps_u16::<8, 4>,
#[cfg(feature = "experimental")]
(2, 8, 8) => return avx2::overlaps_u16::<8, 8>,
#[cfg(feature = "experimental")]
(2, 8, 16) => return avx2::overlaps_u16::<8, 16>,
(2, 16, 1) => return avx2::overlaps_u16::<16, 1>,
(2, 16, 2) => return avx2::overlaps_u16::<16, 2>,
(2, 16, 4) => return avx2::overlaps_u16::<16, 4>,
(2, 16, 8) => return avx2::overlaps_u16::<16, 8>,
(2, 16, 16) => return avx2::overlaps_u16::<16, 16>,
(2, 16, 32) => return avx2::overlaps_u16::<16, 32>,
(2, 32, 8) => return avx2::overlaps_u16::<32, 8>,
(2, 32, 16) => return avx2::overlaps_u16::<32, 16>,
(2, 32, 32) => return avx2::overlaps_u16::<32, 32>,
(2, 32, 64) => return avx2::overlaps_u16::<32, 64>,
#[cfg(feature = "experimental")]
(2, 64, 16) => return avx2::overlaps_u16::<64, 16>,
#[cfg(feature = "experimental")]
(2, 64, 32) => return avx2::overlaps_u16::<64, 32>,
#[cfg(feature = "experimental")]
(2, 64, 64) => return avx2::overlaps_u16::<64, 64>,
#[cfg(feature = "experimental")]
(2, 64, 128) => return avx2::overlaps_u16::<64, 128>,
#[cfg(feature = "experimental")]
(2, 128, 32) => return avx2::overlaps_u16::<128, 32>,
#[cfg(feature = "experimental")]
(2, 128, 64) => return avx2::overlaps_u16::<128, 64>,
#[cfg(feature = "experimental")]
(2, 128, 128) => return avx2::overlaps_u16::<128, 128>,
_ => {
// No AVX2 kernel for this combination: fall through to the portable table.
}
};
}
// Portable kernels; this table is also the fallback for the AVX2 path above.
match (size_of::<T>(), width.get(), height.get()) {
// 1-byte pixels.
(1, 2, 2) => rust::overlaps_u8::<2, 2>,
(1, 2, 4) => rust::overlaps_u8::<2, 4>,
(1, 4, 2) => rust::overlaps_u8::<4, 2>,
(1, 4, 4) => rust::overlaps_u8::<4, 4>,
(1, 4, 8) => rust::overlaps_u8::<4, 8>,
(1, 8, 1) => rust::overlaps_u8::<8, 1>,
(1, 8, 2) => rust::overlaps_u8::<8, 2>,
(1, 8, 4) => rust::overlaps_u8::<8, 4>,
(1, 8, 8) => rust::overlaps_u8::<8, 8>,
(1, 8, 16) => rust::overlaps_u8::<8, 16>,
(1, 16, 1) => rust::overlaps_u8::<16, 1>,
(1, 16, 2) => rust::overlaps_u8::<16, 2>,
(1, 16, 4) => rust::overlaps_u8::<16, 4>,
(1, 16, 8) => rust::overlaps_u8::<16, 8>,
(1, 16, 16) => rust::overlaps_u8::<16, 16>,
(1, 16, 32) => rust::overlaps_u8::<16, 32>,
(1, 32, 8) => rust::overlaps_u8::<32, 8>,
(1, 32, 16) => rust::overlaps_u8::<32, 16>,
(1, 32, 32) => rust::overlaps_u8::<32, 32>,
(1, 32, 64) => rust::overlaps_u8::<32, 64>,
(1, 64, 16) => rust::overlaps_u8::<64, 16>,
(1, 64, 32) => rust::overlaps_u8::<64, 32>,
(1, 64, 64) => rust::overlaps_u8::<64, 64>,
(1, 64, 128) => rust::overlaps_u8::<64, 128>,
(1, 128, 32) => rust::overlaps_u8::<128, 32>,
(1, 128, 64) => rust::overlaps_u8::<128, 64>,
(1, 128, 128) => rust::overlaps_u8::<128, 128>,
// 2-byte pixels.
(2, 2, 2) => rust::overlaps_u16::<2, 2>,
(2, 2, 4) => rust::overlaps_u16::<2, 4>,
(2, 4, 2) => rust::overlaps_u16::<4, 2>,
(2, 4, 4) => rust::overlaps_u16::<4, 4>,
(2, 4, 8) => rust::overlaps_u16::<4, 8>,
(2, 8, 1) => rust::overlaps_u16::<8, 1>,
(2, 8, 2) => rust::overlaps_u16::<8, 2>,
(2, 8, 4) => rust::overlaps_u16::<8, 4>,
(2, 8, 8) => rust::overlaps_u16::<8, 8>,
(2, 8, 16) => rust::overlaps_u16::<8, 16>,
(2, 16, 1) => rust::overlaps_u16::<16, 1>,
(2, 16, 2) => rust::overlaps_u16::<16, 2>,
(2, 16, 4) => rust::overlaps_u16::<16, 4>,
(2, 16, 8) => rust::overlaps_u16::<16, 8>,
(2, 16, 16) => rust::overlaps_u16::<16, 16>,
(2, 16, 32) => rust::overlaps_u16::<16, 32>,
(2, 32, 8) => rust::overlaps_u16::<32, 8>,
(2, 32, 16) => rust::overlaps_u16::<32, 16>,
(2, 32, 32) => rust::overlaps_u16::<32, 32>,
(2, 32, 64) => rust::overlaps_u16::<32, 64>,
(2, 64, 16) => rust::overlaps_u16::<64, 16>,
(2, 64, 32) => rust::overlaps_u16::<64, 32>,
(2, 64, 64) => rust::overlaps_u16::<64, 64>,
(2, 64, 128) => rust::overlaps_u16::<64, 128>,
(2, 128, 32) => rust::overlaps_u16::<128, 32>,
(2, 128, 64) => rust::overlaps_u16::<128, 64>,
(2, 128, 128) => rust::overlaps_u16::<128, 128>,
// Also reached when `size_of::<T>()` is neither 1 nor 2.
_ => panic!("unsupported block size for overlaps: {}x{}", width, height),
}
}
/// Signature shared by the conversion kernels returned by [`select_to_pixels`].
///
/// Parameter roles below are taken from the parameter names only; the kernels
/// themselves live in the `rust` submodule -- confirm there.
/// * `dest` / `dest_stride_bytes`: destination buffer and its row stride in bytes.
/// * `src` / `src_stride_bytes`: source buffer and its row stride in bytes.
/// * `width` / `height`: size of the region to convert (presumably in pixels --
///   TODO confirm).
/// * `bits_per_sample`: bit depth of the output samples (presumably used for
///   clamping -- TODO confirm).
///
/// # Safety
///
/// The pointers are raw, so callers are responsible for whatever validity and
/// size requirements the selected kernel has.
pub type ToPixelsFn = unsafe fn(
dest: *mut u8,
dest_stride_bytes: NonZeroUsize,
src: *const u8,
src_stride_bytes: NonZeroUsize,
width: NonZeroUsize,
height: NonZeroUsize,
bits_per_sample: NonZeroU8,
);
/// Picks the conversion kernel matching the byte width of pixel type `T`.
///
/// A one-byte `T` selects `rust::to_pixels_u16_to_u8`; a two-byte `T` selects
/// `rust::to_pixels_u32_to_u16`.
///
/// # Panics
///
/// Hits `unreachable!()` if `size_of::<T>()` is neither 1 nor 2.
#[must_use]
#[inline]
pub fn select_to_pixels<T: Pixel>() -> ToPixelsFn {
    let sample_bytes = size_of::<T>();
    if sample_bytes == 1 {
        return rust::to_pixels_u16_to_u8;
    }
    if sample_bytes == 2 {
        return rust::to_pixels_u32_to_u16;
    }
    unreachable!()
}