// zoomvtools 2.0.0

use std::{
    cmp::min,
    num::{NonZeroU8, NonZeroUsize},
    slice,
};

/// Accumulates window-weighted 8-bit source pixels into a 16-bit destination.
///
/// For each of the `HEIGHT` rows and each of the `WIDTH` columns this performs
/// `dest[x] += (src[x] * window[x]) >> 6`, where `dest` is interpreted as `u16`
/// samples and `src` as `u8` samples.
///
/// `dest_stride_bytes` and `src_stride_bytes` are row pitches in bytes;
/// `window_stride` is a row pitch in `u16` elements.
///
/// # Safety
///
/// For every row `j` in `0..HEIGHT`:
/// - `src + j * src_stride_bytes` must be valid for reading `WIDTH` `u8` values,
/// - `dest + j * dest_stride_bytes` must be aligned for `u16` and valid for
///   reading and writing `WIDTH` `u16` values,
/// - `window + j * window_stride` must be valid for reading `WIDTH` `u16` values,
///
/// and the destination rows must not overlap the source or window rows.
pub unsafe fn overlaps_u8<const WIDTH: usize, const HEIGHT: usize>(
    dest: *mut u8,
    dest_stride_bytes: NonZeroUsize,
    src: *const u8,
    src_stride_bytes: NonZeroUsize,
    window: *const u16,
    window_stride: NonZeroUsize,
) {
    let dest_pitch = dest_stride_bytes.get();
    let src_pitch = src_stride_bytes.get();
    let win_pitch = window_stride.get();

    // Window weights are noted to range from 0 to 2048.
    for row in 0..HEIGHT {
        let pixels = slice::from_raw_parts(src.add(row * src_pitch), WIDTH);
        let weights = slice::from_raw_parts(window.add(row * win_pitch), WIDTH);
        let sums = slice::from_raw_parts_mut(dest.add(row * dest_pitch).cast::<u16>(), WIDTH);

        sums.iter_mut()
            .zip(pixels.iter().zip(weights))
            .for_each(|(sum, (&pixel, &weight))| {
                // The product is formed in 32 bits so it cannot overflow before the shift.
                let weighted = (u32::from(pixel) * u32::from(weight)) >> 6;
                *sum += weighted as u16;
            });
    }
}

/// Accumulates window-weighted 16-bit source pixels into a 32-bit destination.
///
/// For each of the `HEIGHT` rows and each of the `WIDTH` columns this performs
/// `dest[x] += (src[x] * window[x]) >> 6`, where `dest` is interpreted as `u32`
/// samples and `src` as `u16` samples.
///
/// `dest_stride_bytes` and `src_stride_bytes` are row pitches in bytes;
/// `window_stride` is a row pitch in `u16` elements.
///
/// # Safety
///
/// For every row `j` in `0..HEIGHT`:
/// - `src + j * src_stride_bytes` must be aligned for `u16` and valid for
///   reading `WIDTH` `u16` values,
/// - `dest + j * dest_stride_bytes` must be aligned for `u32` and valid for
///   reading and writing `WIDTH` `u32` values,
/// - `window + j * window_stride` must be valid for reading `WIDTH` `u16` values,
///
/// and the destination rows must not overlap the source or window rows.
pub unsafe fn overlaps_u16<const WIDTH: usize, const HEIGHT: usize>(
    dest: *mut u8,
    dest_stride_bytes: NonZeroUsize,
    src: *const u8,
    src_stride_bytes: NonZeroUsize,
    window: *const u16,
    window_stride: NonZeroUsize,
) {
    let dest_pitch = dest_stride_bytes.get();
    let src_pitch = src_stride_bytes.get();
    let win_pitch = window_stride.get();

    // Window weights are noted to range from 0 to 2048.
    for row in 0..HEIGHT {
        let samples = slice::from_raw_parts(src.add(row * src_pitch).cast::<u16>(), WIDTH);
        let weights = slice::from_raw_parts(window.add(row * win_pitch), WIDTH);
        let sums = slice::from_raw_parts_mut(dest.add(row * dest_pitch).cast::<u32>(), WIDTH);

        for (sum, (&sample, &weight)) in sums.iter_mut().zip(samples.iter().zip(weights)) {
            *sum += (u32::from(sample) * u32::from(weight)) >> 6;
        }
    }
}

/// Converts rows of 16-bit accumulated sums into 8-bit pixels.
///
/// Each output is `(src + 16) >> 5` (a rounded division by 32), saturated at 255.
/// The bit-depth argument is unused by this variant.
///
/// `dest_stride_bytes` and `src_stride_bytes` are row pitches in bytes.
///
/// PERF: Benchmarked to be faster than custom SIMD implementation.
///
/// # Safety
///
/// For every row `h` in `0..height`:
/// - `src + h * src_stride_bytes` must be aligned for `u16` and valid for
///   reading `width` `u16` values,
/// - `dest + h * dest_stride_bytes` must be valid for writing `width` bytes,
///
/// and the destination rows must not overlap the source rows.
pub unsafe fn to_pixels_u16_to_u8(
    dest: *mut u8,
    dest_stride_bytes: NonZeroUsize,
    src: *const u8,
    src_stride_bytes: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    _bits_per_sample: NonZeroU8,
) {
    let (cols, rows) = (width.get(), height.get());
    let src_pitch = src_stride_bytes.get();
    let dest_pitch = dest_stride_bytes.get();

    for row in 0..rows {
        let sums = slice::from_raw_parts(src.add(row * src_pitch).cast::<u16>(), cols);
        let pixels = slice::from_raw_parts_mut(dest.add(row * dest_pitch), cols);

        for (pixel, &sum) in pixels.iter_mut().zip(sums) {
            let rounded = (i32::from(sum) + 16) >> 5;
            // Branchless saturation: the arithmetic shift of a negative `255 - rounded`
            // yields all ones, so OR-ing it in forces the low byte to 0xFF.
            let saturate = (255 - rounded) >> (i32::BITS - 1);
            *pixel = (rounded | saturate) as u8;
        }
    }
}

/// Converts rows of 32-bit accumulated sums into 16-bit pixels.
///
/// Each output is `(src + 16) >> 5` (a rounded division by 32), clamped to
/// `2^bits_per_sample - 1`. Because the output type is `u16`, the clamp ceiling
/// is itself capped at `u16::MAX` for any `bits_per_sample` above 16.
///
/// `dest_stride_bytes` and `src_stride_bytes` are row pitches in bytes.
///
/// PERF: Benchmarked to be faster than custom SIMD implementation.
/// NOTE(review): that benchmark predates the overflow-free rounding below; the
/// loop is still plain shift/add/min, but re-run the benchmark to confirm.
///
/// # Safety
///
/// For every row `h` in `0..height`:
/// - `src + h * src_stride_bytes` must be aligned for `u32` and valid for
///   reading `width` `u32` values,
/// - `dest + h * dest_stride_bytes` must be aligned for `u16` and valid for
///   writing `width` `u16` values,
///
/// and the destination rows must not overlap the source rows.
pub unsafe fn to_pixels_u32_to_u16(
    dest: *mut u8,
    dest_stride_bytes: NonZeroUsize,
    src: *const u8,
    src_stride_bytes: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    // The ceiling must fit the u16 output; capping the depth at 16 also keeps the
    // shift amount in range for any requested bit depth.
    let depth = u32::from(bits_per_sample.get()).min(u16::BITS);
    let pixel_max = u32::from(u16::MAX >> (u16::BITS - depth));

    for h in 0..height.get() {
        let src = slice::from_raw_parts(
            src.add(h * src_stride_bytes.get()).cast::<u32>(),
            width.get(),
        );
        let dest = slice::from_raw_parts_mut(
            dest.add(h * dest_stride_bytes.get()).cast::<u16>(),
            width.get(),
        );

        for (src, dest) in src.iter().zip(dest.iter_mut()) {
            // Equivalent to `(src + 16) >> 5`, but computed without the addition that
            // could overflow and without reinterpreting large sums as negative:
            // bit 4 of `src` says whether the remainder mod 32 is at least 16.
            let a = (*src >> 5) + ((*src >> 4) & 1);
            // `a <= pixel_max <= u16::MAX` after the clamp, so the cast is lossless.
            *dest = min(pixel_max, a) as u16;
        }
    }
}