use fovea::border::Clamp;
use fovea::image::{Image, Kernel3x3, Mask3x3, RasterImage};
use fovea::transform::{
FoldItem, FoldOp, MapItem, MapOp, convolve, dilate, erode, fold_neighborhood,
};
use std::hint::black_box;
/// Adds `src[i] as f32 * weight` to `acc[i]` for every index both slices share.
///
/// Only the first `min(acc.len(), src.len())` elements are touched; any tail of
/// `acc` beyond that is left unchanged. The multiply and the add are separate
/// operations (no `mul_add`). `no_mangle` and `inline(never)` keep this an
/// out-of-line function under its own symbol name.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_convolve_u8_hot(acc: &mut [f32], src: &[u8], weight: f32) {
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        let sample = src[idx] as f32;
        acc[idx] += sample * weight;
    }
}
/// Adds `src[i] * weight` to `acc[i]` for every index both slices share.
///
/// Processes `min(acc.len(), src.len())` elements with a separate multiply and
/// add (no `mul_add`). Elements of `acc` past that length are not modified.
/// `no_mangle` and `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_convolve_f32_hot(acc: &mut [f32], src: &[f32], weight: f32) {
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        let scaled = src[idx] * weight;
        acc[idx] += scaled;
    }
}
/// Fused-multiply-add variant of the `u8` accumulate loop.
///
/// For each index shared by `acc` and `src`, stores
/// `(src[i] as f32).mul_add(weight, acc[i])` back into `acc[i]`. Elements of
/// `acc` beyond `src.len()` are left unchanged. `no_mangle` and
/// `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_convolve_u8_fma(acc: &mut [f32], src: &[u8], weight: f32) {
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        let sample = src[idx] as f32;
        acc[idx] = sample.mul_add(weight, acc[idx]);
    }
}
/// Fused-multiply-add variant of the `f32` accumulate loop.
///
/// For each index shared by `acc` and `src`, stores
/// `src[i].mul_add(weight, acc[i])` back into `acc[i]`. Elements of `acc`
/// beyond `src.len()` are left unchanged. `no_mangle` and `inline(never)` keep
/// this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_convolve_f32_fma(acc: &mut [f32], src: &[f32], weight: f32) {
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        let previous = acc[idx];
        acc[idx] = src[idx].mul_add(weight, previous);
    }
}
/// Accumulates several weighted `u8` rows into `acc` using `mul_add`.
///
/// Each entry of `rows` is a `(row, weight)` pair. Rows are applied in order;
/// for every row, the first `min(acc.len(), row.len())` elements of `acc` are
/// updated to `(row[i] as f32).mul_add(weight, acc[i])`. `no_mangle` and
/// `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_multi_offset_u8_fma(acc: &mut [f32], rows: &[(&[u8], f32)]) {
    let acc_len = acc.len();
    for entry in rows {
        let (row, weight) = *entry;
        let count = acc_len.min(row.len());
        for idx in 0..count {
            let sample = row[idx] as f32;
            acc[idx] = sample.mul_add(weight, acc[idx]);
        }
    }
}
/// Accumulates several weighted `f32` rows into `acc` using `mul_add`.
///
/// Each entry of `rows` is a `(row, weight)` pair. Rows are applied in order;
/// for every row, the first `min(acc.len(), row.len())` elements of `acc` are
/// updated to `row[i].mul_add(weight, acc[i])`. `no_mangle` and
/// `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_multi_offset_f32_fma(acc: &mut [f32], rows: &[(&[f32], f32)]) {
    let acc_len = acc.len();
    for entry in rows {
        let (row, weight) = *entry;
        let count = acc_len.min(row.len());
        for idx in 0..count {
            let previous = acc[idx];
            acc[idx] = row[idx].mul_add(weight, previous);
        }
    }
}
/// Element-wise minimum: `acc[i] = min(acc[i], src[i])`.
///
/// Covers the first `min(acc.len(), src.len())` elements; the rest of `acc`
/// is left unchanged. `no_mangle` and `inline(never)` keep this an
/// out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn map_erode_u8_hot(acc: &mut [u8], src: &[u8]) {
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        acc[idx] = u8::min(acc[idx], src[idx]);
    }
}
/// Element-wise maximum: `acc[i] = max(acc[i], src[i])`.
///
/// Covers the first `min(acc.len(), src.len())` elements; the rest of `acc`
/// is left unchanged. `no_mangle` and `inline(never)` keep this an
/// out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn map_dilate_u8_hot(acc: &mut [u8], src: &[u8]) {
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        acc[idx] = u8::max(acc[idx], src[idx]);
    }
}
struct DirectSumFold;
impl FoldOp<fovea::pixel::Mono8, f32> for DirectSumFold {
type Accumulator = f32;
type Output = fovea::pixel::MonoF32;
#[inline(always)]
fn init(&self) -> f32 {
0.0
}
#[inline(always)]
fn accumulate(&self, acc: &mut f32, item: FoldItem<fovea::pixel::Mono8, f32>) {
*acc += item.pixel.value() as f32 * item.weight;
}
#[inline(always)]
fn finalize(&mut self, acc: f32) -> fovea::pixel::MonoF32 {
fovea::pixel::MonoF32::new(acc)
}
}
impl FoldOp<f32, f32> for DirectSumFold {
type Accumulator = f32;
type Output = f32;
#[inline(always)]
fn init(&self) -> f32 {
0.0
}
#[inline(always)]
fn accumulate(&self, acc: &mut f32, item: FoldItem<f32, f32>) {
*acc += item.pixel * item.weight;
}
#[inline(always)]
fn finalize(&mut self, acc: f32) -> f32 {
acc
}
}
/// Same accumulate loop as the plain `u8` version, but routed through
/// `DirectSumFold::accumulate` and a `FoldItem` wrapping a `Mono8` pixel.
///
/// Processes the first `min(acc.len(), src.len())` elements. `no_mangle` and
/// `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_trait_convolve_u8_hot(acc: &mut [f32], src: &[u8], weight: f32) {
    let op = DirectSumFold;
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        let item = FoldItem {
            pixel: fovea::pixel::Mono8::new(src[idx]),
            weight,
        };
        op.accumulate(&mut acc[idx], item);
    }
}
/// Same accumulate loop as the plain `f32` version, but routed through
/// `DirectSumFold::accumulate` and a `FoldItem` carrying the raw `f32` pixel.
///
/// Processes the first `min(acc.len(), src.len())` elements. `no_mangle` and
/// `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_trait_convolve_f32_hot(acc: &mut [f32], src: &[f32], weight: f32) {
    let op = DirectSumFold;
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        let item = FoldItem {
            pixel: src[idx],
            weight,
        };
        op.accumulate(&mut acc[idx], item);
    }
}
struct ErodeInspect;
impl MapOp<u8> for ErodeInspect {
type Accumulator = u8;
type Output = u8;
#[inline(always)]
fn init(&self, center: u8) -> u8 {
center
}
#[inline(always)]
fn accumulate(&self, acc: &mut u8, item: MapItem<u8>) {
*acc = (*acc).min(item.pixel);
}
#[inline(always)]
fn finalize(&mut self, acc: u8) -> u8 {
acc
}
}
struct DilateInspect;
impl MapOp<u8> for DilateInspect {
type Accumulator = u8;
type Output = u8;
#[inline(always)]
fn init(&self, center: u8) -> u8 {
center
}
#[inline(always)]
fn accumulate(&self, acc: &mut u8, item: MapItem<u8>) {
*acc = (*acc).max(item.pixel);
}
#[inline(always)]
fn finalize(&mut self, acc: u8) -> u8 {
acc
}
}
/// Element-wise minimum routed through `ErodeInspect::accumulate`.
///
/// Each source byte is wrapped in a `MapItem` with zero `dx`/`dy` offsets and
/// folded into the matching `acc` slot. Processes the first
/// `min(acc.len(), src.len())` elements. `no_mangle` and `inline(never)` keep
/// this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn map_trait_erode_u8_hot(acc: &mut [u8], src: &[u8]) {
    let op = ErodeInspect;
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        let item = MapItem {
            pixel: src[idx],
            dx: 0,
            dy: 0,
        };
        op.accumulate(&mut acc[idx], item);
    }
}
/// Element-wise maximum routed through `DilateInspect::accumulate`.
///
/// Each source byte is wrapped in a `MapItem` with zero `dx`/`dy` offsets and
/// folded into the matching `acc` slot. Processes the first
/// `min(acc.len(), src.len())` elements. `no_mangle` and `inline(never)` keep
/// this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn map_trait_dilate_u8_hot(acc: &mut [u8], src: &[u8]) {
    let op = DilateInspect;
    let count = usize::min(acc.len(), src.len());
    for idx in 0..count {
        let item = MapItem {
            pixel: src[idx],
            dx: 0,
            dy: 0,
        };
        op.accumulate(&mut acc[idx], item);
    }
}
/// Accumulates several weighted `u8` rows into `acc` with a plain multiply-add.
///
/// Each entry of `rows` is a `(row, weight)` pair. Rows are applied in order;
/// for every row, `row[i] as f32 * weight` is added to `acc[i]` for the first
/// `min(acc.len(), row.len())` indices. `no_mangle` and `inline(never)` keep
/// this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_multi_offset_u8(acc: &mut [f32], rows: &[(&[u8], f32)]) {
    let acc_len = acc.len();
    for entry in rows {
        let (row, weight) = *entry;
        let count = acc_len.min(row.len());
        for idx in 0..count {
            let sample = row[idx] as f32;
            acc[idx] += sample * weight;
        }
    }
}
/// Accumulates several weighted `f32` rows into `acc` with a plain multiply-add.
///
/// Each entry of `rows` is a `(row, weight)` pair. Rows are applied in order;
/// for every row, `row[i] * weight` is added to `acc[i]` for the first
/// `min(acc.len(), row.len())` indices. `no_mangle` and `inline(never)` keep
/// this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_multi_offset_f32(acc: &mut [f32], rows: &[(&[f32], f32)]) {
    let acc_len = acc.len();
    for entry in rows {
        let (row, weight) = *entry;
        let count = acc_len.min(row.len());
        for idx in 0..count {
            let scaled = row[idx] * weight;
            acc[idx] += scaled;
        }
    }
}
/// Folds several `u8` rows into `acc` by element-wise minimum.
///
/// Rows are applied in order; each one lowers `acc[i]` to `min(acc[i], row[i])`
/// for the first `min(acc.len(), row.len())` indices. `no_mangle` and
/// `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn map_multi_offset_erode_u8(acc: &mut [u8], rows: &[&[u8]]) {
    let acc_len = acc.len();
    for row in rows.iter().copied() {
        let count = acc_len.min(row.len());
        for idx in 0..count {
            acc[idx] = u8::min(acc[idx], row[idx]);
        }
    }
}
/// Folds several `u8` rows into `acc` by element-wise maximum.
///
/// Rows are applied in order; each one raises `acc[i]` to `max(acc[i], row[i])`
/// for the first `min(acc.len(), row.len())` indices. `no_mangle` and
/// `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn map_multi_offset_dilate_u8(acc: &mut [u8], rows: &[&[u8]]) {
    let acc_len = acc.len();
    for row in rows.iter().copied() {
        let count = acc_len.min(row.len());
        for idx in 0..count {
            acc[idx] = u8::max(acc[idx], row[idx]);
        }
    }
}
/// Out-of-line entry point around the library's `convolve` for a `Mono8`
/// image and a 3x3 kernel, using the `Clamp` border policy and producing a
/// `MonoF32` image.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn real_convolve_u8(
    img: &Image<fovea::pixel::Mono8>,
    kernel: &Kernel3x3,
) -> Image<fovea::pixel::MonoF32> {
    let filtered = convolve::<_, _, _, _, fovea::pixel::MonoF32>(img, kernel, &Clamp);
    filtered
}
/// Out-of-line entry point around the library's `erode` for a `u8` image and a
/// 3x3 structuring element, using the `Clamp` border policy.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn real_erode_u8(img: &Image<u8>, se: &Mask3x3) -> Image<u8> {
    let eroded = erode(img, se, &Clamp);
    eroded
}
/// Out-of-line entry point around the library's `dilate` for a `u8` image and
/// a 3x3 structuring element, using the `Clamp` border policy.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn real_dilate_u8(img: &Image<u8>, se: &Mask3x3) -> Image<u8> {
    let dilated = dilate(img, se, &Clamp);
    dilated
}
/// Out-of-line entry point that runs the library's `fold_neighborhood` with
/// the kernel's weights and anchor, the `Clamp` border policy, and the local
/// `DirectSumFold` operation.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn real_fold_neighborhood_u8(
    img: &Image<fovea::pixel::Mono8>,
    kernel: &Kernel3x3,
) -> Image<fovea::pixel::MonoF32> {
    let weights = kernel.weights();
    let anchor = kernel.anchor();
    fold_neighborhood(img, weights, anchor, &Clamp, DirectSumFold)
}
/// Index-loop accumulate that asserts equal slice lengths up front instead of
/// clamping to the shorter one.
///
/// Adds `src[i] as f32 * weight` to every `acc[i]`. `no_mangle` and
/// `inline(never)` keep this an out-of-line, unmangled symbol.
///
/// # Panics
///
/// Panics if `acc.len() != src.len()`.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_convolve_u8_asserted(acc: &mut [f32], src: &[u8], weight: f32) {
    assert!(acc.len() == src.len());
    let count = acc.len();
    for idx in 0..count {
        let sample = src[idx] as f32;
        acc[idx] += sample * weight;
    }
}
/// Index-loop element-wise minimum that asserts equal slice lengths up front
/// instead of clamping to the shorter one.
///
/// Sets every `acc[i]` to `min(acc[i], src[i])`. `no_mangle` and
/// `inline(never)` keep this an out-of-line, unmangled symbol.
///
/// # Panics
///
/// Panics if `acc.len() != src.len()`.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn map_erode_u8_asserted(acc: &mut [u8], src: &[u8]) {
    assert!(acc.len() == src.len());
    let count = acc.len();
    for idx in 0..count {
        acc[idx] = u8::min(acc[idx], src[idx]);
    }
}
/// Iterator-based accumulate: adds `src[i] as f32 * weight` to `acc[i]`.
///
/// `zip` stops at the shorter slice, so only the shared prefix is processed.
/// `no_mangle` and `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_convolve_u8_zip(acc: &mut [f32], src: &[u8], weight: f32) {
    for (slot, sample) in acc.iter_mut().zip(src.iter().copied()) {
        *slot += sample as f32 * weight;
    }
}
/// Iterator-based element-wise minimum: `acc[i] = min(acc[i], src[i])`.
///
/// `zip` stops at the shorter slice, so only the shared prefix is processed.
/// `no_mangle` and `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn map_erode_u8_zip(acc: &mut [u8], src: &[u8]) {
    for (slot, sample) in acc.iter_mut().zip(src.iter().copied()) {
        *slot = u8::min(*slot, sample);
    }
}
/// Iterator-based accumulate: adds `src[i] * weight` to `acc[i]`.
///
/// `zip` stops at the shorter slice, so only the shared prefix is processed.
/// `no_mangle` and `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_convolve_f32_zip(acc: &mut [f32], src: &[f32], weight: f32) {
    for (slot, sample) in acc.iter_mut().zip(src.iter().copied()) {
        *slot += sample * weight;
    }
}
/// Iterator-based fused multiply-add over `u8` input:
/// `acc[i] = (src[i] as f32).mul_add(weight, acc[i])`.
///
/// `zip` stops at the shorter slice, so only the shared prefix is processed.
/// `no_mangle` and `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_convolve_u8_zip_fma(acc: &mut [f32], src: &[u8], weight: f32) {
    for (slot, sample) in acc.iter_mut().zip(src.iter().copied()) {
        let value = sample as f32;
        *slot = value.mul_add(weight, *slot);
    }
}
/// Iterator-based fused multiply-add over `f32` input:
/// `acc[i] = src[i].mul_add(weight, acc[i])`.
///
/// `zip` stops at the shorter slice, so only the shared prefix is processed.
/// `no_mangle` and `inline(never)` keep this an out-of-line, unmangled symbol.
#[unsafe(no_mangle)]
#[inline(never)]
pub fn fold_convolve_f32_zip_fma(acc: &mut [f32], src: &[f32], weight: f32) {
    for (slot, sample) in acc.iter_mut().zip(src.iter().copied()) {
        let previous = *slot;
        *slot = sample.mul_add(weight, previous);
    }
}
/// Calls every inspection function once so none of them is unused.
///
/// Builds 256x256 test images from a fixed arithmetic pattern, then calls each
/// function with `black_box`-wrapped arguments and finally passes all outputs
/// through `black_box` before printing a confirmation line.
fn main() {
    // Deterministic inputs: the same pattern as raw u8, Mono8 and f32 pixels.
    let width = 256;
    let height = 256;
    let gray_u8 = Image::generate(width, height, |x, y| ((x * 17 + y * 31) % 256) as u8);
    let gray_mono8: Image<fovea::pixel::Mono8> = Image::generate(width, height, |x, y| {
        fovea::pixel::Mono8::new(((x * 17 + y * 31) % 256) as u8)
    });
    let gray_f32 = Image::generate(width, height, |x, y| (x * 17 + y * 31) as f32 / 256.0);
    let gauss = Kernel3x3::gaussian_3x3();
    let rect = Mask3x3::full_rect_3x3();

    // Row 10 is the source row; row 5 seeds the u8 accumulator.
    let src_u8 = gray_u8.row(10);
    let src_f32 = gray_f32.row(10);
    let mut sums = vec![0.0f32; width];
    let mut extrema: Vec<u8> = gray_u8.row(5).to_vec();

    // Plain index-loop kernels (with and without mul_add).
    fold_convolve_u8_hot(black_box(&mut sums), black_box(src_u8), black_box(0.5));
    fold_convolve_f32_hot(black_box(&mut sums), black_box(src_f32), black_box(0.5));
    fold_convolve_u8_fma(black_box(&mut sums), black_box(src_u8), black_box(0.5));
    fold_convolve_f32_fma(black_box(&mut sums), black_box(src_f32), black_box(0.5));
    map_erode_u8_hot(black_box(&mut extrema), black_box(src_u8));
    map_dilate_u8_hot(black_box(&mut extrema), black_box(src_u8));

    // Same kernels routed through the FoldOp / MapOp trait implementations.
    sums.fill(0.0);
    fold_trait_convolve_u8_hot(black_box(&mut sums), black_box(src_u8), black_box(0.5));
    fold_trait_convolve_f32_hot(black_box(&mut sums), black_box(src_f32), black_box(0.5));
    extrema = gray_u8.row(5).to_vec();
    map_trait_erode_u8_hot(black_box(&mut extrema), black_box(src_u8));
    map_trait_dilate_u8_hot(black_box(&mut extrema), black_box(src_u8));

    // Multi-row variants: nine rows (1..=9), each weighted 1/9 where weighted.
    let weighted_u8: Vec<(&[u8], f32)> = (0..9).map(|k| (gray_u8.row(k + 1), 1.0 / 9.0)).collect();
    let weighted_f32: Vec<(&[f32], f32)> =
        (0..9).map(|k| (gray_f32.row(k + 1), 1.0 / 9.0)).collect();
    let plain_u8: Vec<&[u8]> = (0..9).map(|k| gray_u8.row(k + 1)).collect();
    sums.fill(0.0);
    fold_multi_offset_u8(black_box(&mut sums), black_box(&weighted_u8));
    fold_multi_offset_f32(black_box(&mut sums), black_box(&weighted_f32));
    fold_multi_offset_u8_fma(black_box(&mut sums), black_box(&weighted_u8));
    fold_multi_offset_f32_fma(black_box(&mut sums), black_box(&weighted_f32));
    extrema = gray_u8.row(5).to_vec();
    map_multi_offset_erode_u8(black_box(&mut extrema), black_box(&plain_u8));
    map_multi_offset_dilate_u8(black_box(&mut extrema), black_box(&plain_u8));

    // Variants that assert equal lengths before the loop.
    sums.fill(0.0);
    fold_convolve_u8_asserted(black_box(&mut sums), black_box(src_u8), black_box(0.5));
    extrema = gray_u8.row(5).to_vec();
    map_erode_u8_asserted(black_box(&mut extrema), black_box(src_u8));

    // Iterator (zip) variants.
    sums.fill(0.0);
    fold_convolve_u8_zip(black_box(&mut sums), black_box(src_u8), black_box(0.5));
    fold_convolve_f32_zip(black_box(&mut sums), black_box(src_f32), black_box(0.5));
    fold_convolve_u8_zip_fma(black_box(&mut sums), black_box(src_u8), black_box(0.5));
    fold_convolve_f32_zip_fma(black_box(&mut sums), black_box(src_f32), black_box(0.5));
    extrema = gray_u8.row(5).to_vec();
    map_erode_u8_zip(black_box(&mut extrema), black_box(src_u8));

    // Full library entry points on whole images.
    let conv_out = real_convolve_u8(black_box(&gray_mono8), black_box(&gauss));
    let erode_out = real_erode_u8(black_box(&gray_u8), black_box(&rect));
    let dilate_out = real_dilate_u8(black_box(&gray_u8), black_box(&rect));
    let fold_out = real_fold_neighborhood_u8(black_box(&gray_mono8), black_box(&gauss));

    // Mark every result as observed so the work above cannot be discarded.
    black_box(&sums);
    black_box(&extrema);
    black_box(&conv_out);
    black_box(&erode_out);
    black_box(&dilate_out);
    black_box(&fold_out);
    println!("asm_inspect: all functions exercised");
}