// libblur 0.24.0 - Docs.rs

// Copyright (c) Radzivon Bartoshyk. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// 1.  Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2.  Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3.  Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use crate::EdgeMode;
use crate::edge_mode::clamp_edge;
use crate::sse::fast_gaussian::SseI32x4;
use crate::sse::store_u16_u32;
use crate::sse::utils::load_u16_s32_fast;
use crate::unsafe_slice::UnsafeSlice;
use crate::util::ScratchBuffer;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Crate-internal entry point for the horizontal fast-gaussian pass over `u16`
/// data with `CN` interleaved channels, SSE4.1 flavour.
///
/// All arguments are forwarded unchanged to `fg_horizontal_pass_sse_u16_def`,
/// which is compiled with the `sse4.1` target feature enabled. See
/// `fg_horizontal_pass_sse_u16_impl` for what each argument is used for.
///
/// NOTE(review): no runtime CPU-feature check is performed in this function;
/// presumably the dispatcher only selects it after detecting SSE4.1 — confirm
/// at the call site.
pub(crate) fn fg_horizontal_pass_sse_u16<const CN: usize>(
    bytes: &UnsafeSlice<u16>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    start: u32,
    end: u32,
    edge_mode: EdgeMode,
) {
    // SAFETY (assumed, TODO confirm): the callee is a `#[target_feature(enable = "sse4.1")]`
    // function, so the call is only sound when the running CPU supports SSE4.1.
    unsafe {
        fg_horizontal_pass_sse_u16_def::<CN>(
            bytes, stride, width, height, radius, start, end, edge_mode,
        );
    }
}

/// SSE4.1-enabled instantiation of the horizontal pass.
///
/// The shared body `fg_horizontal_pass_sse_u16_impl` is `#[inline(always)]`, so it
/// is inlined into this function and compiled with `sse4.1` enabled. It is
/// instantiated here with `FMA = false`.
#[target_feature(enable = "sse4.1")]
fn fg_horizontal_pass_sse_u16_def<const CN: usize>(
    bytes: &UnsafeSlice<u16>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    start: u32,
    end: u32,
    edge_mode: EdgeMode,
) {
    fg_horizontal_pass_sse_u16_impl::<CN, false>(
        bytes, stride, width, height, radius, start, end, edge_mode,
    );
}

/// Horizontal fast-gaussian pass over `u16` pixels with `CN` interleaved channels.
///
/// The pass works in place: every row `y` in `start..min(height, end)` is read from
/// and written back to `bytes`. Rows are processed four at a time; leftover rows
/// are handled one at a time by the tail loop.
///
/// For each row the loop walks `x` from `-2 * radius` up to `width` and maintains,
/// per channel lane:
///
/// * `summs` — the running value that is scaled by `1 / radius²` and stored as the
///   output pixel at `x` (only once `x >= 0`);
/// * `diffs` — the running increment added to `summs` on every step. It is updated as
///   `diffs += history[x - radius] - 2 * history[x] + pixel[x + radius]`, where the
///   `history` terms are only applied once the corresponding slot has been filled.
///
/// `history` is the 1024-slot scratch buffer, indexed modulo 1024 (`& 1023`). Every
/// slot read in a row was written earlier in the same row, so stale data from a
/// previous row is never consumed.
///
/// # Parameters
/// * `bytes` — image storage, accessed through raw pointers from `get_ptr`.
/// * `stride` — distance between consecutive rows, added to element offsets
///   (appears to be in `u16` elements rather than bytes — TODO confirm).
/// * `width`, `height` — image size in pixels.
/// * `radius` — kernel radius. NOTE(review): `radius == 0` makes the weight
///   `1 / 0`, and a large radius would make the modulo-1024 history alias itself
///   (roughly `radius > 512`); presumably callers restrict the radius — confirm.
/// * `start`, `end` — half-open row range handled by this call, clipped to `height`.
/// * `edge_mode` — passed to `clamp_edge!` to map `x + radius` into `0..width`.
///
/// The `FMA` const parameter is not used by this body.
#[inline(always)]
fn fg_horizontal_pass_sse_u16_impl<const CN: usize, const FMA: bool>(
    bytes: &UnsafeSlice<u16>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    start: u32,
    end: u32,
    edge_mode: EdgeMode,
) {
    // SAFETY (assumptions to confirm against callers/helpers):
    // * scratch indices are masked with `& 1023`, so they stay below the 1024 slots
    //   requested from `ScratchBuffer::new(1024)`;
    // * `_mm_load_si128` / `_mm_store_si128` require 16-byte alignment, so
    //   `SseI32x4` is presumed to be 16-byte aligned;
    // * every offset handed to `bytes.get_ptr` is `row * stride + column * CN` with
    //   `row < height` and `column < width`, which the caller must guarantee lies
    //   inside the allocation (including whatever `load_u16_s32_fast` /
    //   `store_u16_u32` touch for one pixel);
    // * the intrinsics need SSE4.1-capable hardware, which the `_def` wrapper's
    //   `target_feature` contract is expected to cover.
    unsafe {
        let mut buffer = ScratchBuffer::<[SseI32x4; 4], 1024>::new(1024);
        let buffer = buffer.as_mut_slice();

        // Starting value of every running sum: half of the normalisation divisor
        // (presumably a rounding bias — confirm).
        let initial_sum = ((radius * radius) >> 1) as i32;

        let radius_64 = radius as i64;
        let width_wide = width as i64;

        // Normalisation factor applied to the running sum before storing.
        let v_weight = _mm_set1_ps(1f32 / (radius as f32 * radius as f32));

        // Loop invariants, computed once: exclusive row bound and the (negative)
        // warm-up start column.
        let row_limit = height.min(end);
        let start_x = 0 - 2 * radius_64;

        let mut yy = start;

        // Main loop: four rows per iteration, one vector of state per row.
        while yy + 4 <= row_limit {
            let mut diffs0 = _mm_setzero_si128();
            let mut diffs1 = _mm_setzero_si128();
            let mut diffs2 = _mm_setzero_si128();
            let mut diffs3 = _mm_setzero_si128();

            let mut summs0 = _mm_set1_epi32(initial_sum);
            let mut summs1 = _mm_set1_epi32(initial_sum);
            let mut summs2 = _mm_set1_epi32(initial_sum);
            let mut summs3 = _mm_set1_epi32(initial_sum);

            // Element offsets of the four rows handled in this iteration.
            let current_y0 = ((yy as i64) * (stride as i64)) as usize;
            let current_y1 = ((yy as i64 + 1) * (stride as i64)) as usize;
            let current_y2 = ((yy as i64 + 2) * (stride as i64)) as usize;
            let current_y3 = ((yy as i64 + 3) * (stride as i64)) as usize;

            for x in start_x..width_wide {
                if x >= 0 {
                    let current_px = (x as u32 * CN as u32) as usize;

                    // Emit the output pixel for column `x`: sum * 1/radius², converted
                    // back to integers.
                    let ss0 = _mm_cvtepi32_ps(summs0);
                    let ss1 = _mm_cvtepi32_ps(summs1);
                    let ss2 = _mm_cvtepi32_ps(summs2);
                    let ss3 = _mm_cvtepi32_ps(summs3);

                    let r0 = _mm_mul_ps(ss0, v_weight);
                    let r1 = _mm_mul_ps(ss1, v_weight);
                    let r2 = _mm_mul_ps(ss2, v_weight);
                    let r3 = _mm_mul_ps(ss3, v_weight);

                    let prepared_px0 = _mm_cvtps_epi32(r0);
                    let prepared_px1 = _mm_cvtps_epi32(r1);
                    let prepared_px2 = _mm_cvtps_epi32(r2);
                    let prepared_px3 = _mm_cvtps_epi32(r3);

                    store_u16_u32::<CN>(bytes.get_ptr(current_y0 + current_px), prepared_px0);
                    store_u16_u32::<CN>(bytes.get_ptr(current_y1 + current_px), prepared_px1);
                    store_u16_u32::<CN>(bytes.get_ptr(current_y2 + current_px), prepared_px2);
                    store_u16_u32::<CN>(bytes.get_ptr(current_y3 + current_px), prepared_px3);

                    // diffs += history[x - radius] - 2 * history[x]
                    let arr_index = ((x - radius_64) & 1023) as usize;
                    let d_arr_index = (x & 1023) as usize;
                    let da_b = buffer.get_unchecked(d_arr_index);
                    let da = buffer.get_unchecked(arr_index);

                    let mut d_stored0 = _mm_load_si128(da_b.as_ptr().cast());
                    let mut d_stored1 = _mm_load_si128(da_b[1..].as_ptr().cast());
                    let mut d_stored2 = _mm_load_si128(da_b[2..].as_ptr().cast());
                    let mut d_stored3 = _mm_load_si128(da_b[3..].as_ptr().cast());

                    // Shift left by one == multiply the centre sample by two.
                    d_stored0 = _mm_slli_epi32::<1>(d_stored0);
                    d_stored1 = _mm_slli_epi32::<1>(d_stored1);
                    d_stored2 = _mm_slli_epi32::<1>(d_stored2);
                    d_stored3 = _mm_slli_epi32::<1>(d_stored3);

                    let a_stored0 = _mm_load_si128(da.as_ptr().cast());
                    let a_stored1 = _mm_load_si128(da[1..].as_ptr().cast());
                    let a_stored2 = _mm_load_si128(da[2..].as_ptr().cast());
                    let a_stored3 = _mm_load_si128(da[3..].as_ptr().cast());

                    diffs0 = _mm_add_epi32(diffs0, _mm_sub_epi32(a_stored0, d_stored0));
                    diffs1 = _mm_add_epi32(diffs1, _mm_sub_epi32(a_stored1, d_stored1));
                    diffs2 = _mm_add_epi32(diffs2, _mm_sub_epi32(a_stored2, d_stored2));
                    diffs3 = _mm_add_epi32(diffs3, _mm_sub_epi32(a_stored3, d_stored3));
                } else if x + radius_64 >= 0 {
                    // Warm-up phase where only the centre term exists yet:
                    // diffs -= 2 * history[x]
                    let arr_index = (x & 1023) as usize;
                    let da = buffer.get_unchecked(arr_index);
                    let mut stored0 = _mm_load_si128(da.as_ptr().cast());
                    let mut stored1 = _mm_load_si128(da[1..].as_ptr().cast());
                    let mut stored2 = _mm_load_si128(da[2..].as_ptr().cast());
                    let mut stored3 = _mm_load_si128(da[3..].as_ptr().cast());

                    stored0 = _mm_slli_epi32::<1>(stored0);
                    stored1 = _mm_slli_epi32::<1>(stored1);
                    stored2 = _mm_slli_epi32::<1>(stored2);
                    stored3 = _mm_slli_epi32::<1>(stored3);

                    diffs0 = _mm_sub_epi32(diffs0, stored0);
                    diffs1 = _mm_sub_epi32(diffs1, stored1);
                    diffs2 = _mm_sub_epi32(diffs2, stored2);
                    diffs3 = _mm_sub_epi32(diffs3, stored3);
                }

                // Fetch the incoming sample at `x + radius`, mapped into the row by the
                // edge mode.
                let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide);
                let next_row_px = next_row_x * CN;

                let s_ptr0 = bytes.get_ptr(current_y0 + next_row_px);
                let s_ptr1 = bytes.get_ptr(current_y1 + next_row_px);
                let s_ptr2 = bytes.get_ptr(current_y2 + next_row_px);
                let s_ptr3 = bytes.get_ptr(current_y3 + next_row_px);

                let pixel_color0 = load_u16_s32_fast::<CN>(s_ptr0);
                let pixel_color1 = load_u16_s32_fast::<CN>(s_ptr1);
                let pixel_color2 = load_u16_s32_fast::<CN>(s_ptr2);
                let pixel_color3 = load_u16_s32_fast::<CN>(s_ptr3);

                // Remember the sample so later steps can use it as history[x] and
                // history[x - radius].
                let arr_index = ((x + radius_64) & 1023) as usize;
                let da = buffer.get_unchecked_mut(arr_index);

                _mm_store_si128(da.as_mut_ptr().cast(), pixel_color0);
                _mm_store_si128(da[1..].as_mut_ptr().cast(), pixel_color1);
                _mm_store_si128(da[2..].as_mut_ptr().cast(), pixel_color2);
                _mm_store_si128(da[3..].as_mut_ptr().cast(), pixel_color3);

                diffs0 = _mm_add_epi32(diffs0, pixel_color0);
                diffs1 = _mm_add_epi32(diffs1, pixel_color1);
                diffs2 = _mm_add_epi32(diffs2, pixel_color2);
                diffs3 = _mm_add_epi32(diffs3, pixel_color3);

                summs0 = _mm_add_epi32(summs0, diffs0);
                summs1 = _mm_add_epi32(summs1, diffs1);
                summs2 = _mm_add_epi32(summs2, diffs2);
                summs3 = _mm_add_epi32(summs3, diffs3);
            }

            yy += 4;
        }

        // Tail loop: remaining rows one at a time. Only the first vector of each
        // scratch slot is used here.
        for y in yy..row_limit {
            let mut diffs = _mm_setzero_si128();
            let mut summs = _mm_set1_epi32(initial_sum);

            // Element offset of this row; used for both the store and the load below.
            let current_y = ((y as i64) * (stride as i64)) as usize;

            for x in start_x..width_wide {
                if x >= 0 {
                    let current_px = (x as u32 * CN as u32) as usize;

                    let pixel_f32 = _mm_mul_ps(_mm_cvtepi32_ps(summs), v_weight);
                    let pixel_u32 = _mm_cvtps_epi32(pixel_f32);

                    let bytes_offset = current_y + current_px;

                    store_u16_u32::<CN>(bytes.get_ptr(bytes_offset), pixel_u32);

                    let arr_index = ((x - radius_64) & 1023) as usize;
                    let d_arr_index = (x & 1023) as usize;

                    let d_buf_ptr = buffer.get_unchecked(d_arr_index);
                    let mut d_stored = _mm_load_si128(d_buf_ptr.as_ptr().cast());
                    d_stored = _mm_slli_epi32::<1>(d_stored);

                    let buf_ptr = buffer.get_unchecked(arr_index);
                    let a_stored = _mm_load_si128(buf_ptr.as_ptr().cast());

                    diffs = _mm_add_epi32(diffs, _mm_sub_epi32(a_stored, d_stored));
                } else if x + radius_64 >= 0 {
                    let arr_index = (x & 1023) as usize;
                    let buf_ptr = buffer.get_unchecked(arr_index);
                    let mut stored = _mm_load_si128(buf_ptr.as_ptr().cast());
                    stored = _mm_slli_epi32::<1>(stored);
                    diffs = _mm_sub_epi32(diffs, stored);
                }

                let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide);
                let next_row_px = next_row_x * CN;

                let s_ptr = bytes.get_ptr(current_y + next_row_px);
                let pixel_color = load_u16_s32_fast::<CN>(s_ptr);

                let arr_index = ((x + radius_64) & 1023) as usize;
                let buf_ptr = buffer.get_unchecked_mut(arr_index);

                diffs = _mm_add_epi32(diffs, pixel_color);
                summs = _mm_add_epi32(summs, diffs);

                _mm_store_si128(buf_ptr.as_mut_ptr().cast(), pixel_color);
            }
        }
    }
}

/// Crate-internal entry point for the vertical fast-gaussian pass over `u16`
/// data with `CN` interleaved channels, SSE4.1 flavour.
///
/// All arguments are forwarded unchanged to `fg_vertical_pass_sse_u16_def`,
/// which is compiled with the `sse4.1` target feature enabled. See
/// `fg_vertical_pass_sse_u16_impl` for what each argument is used for.
///
/// NOTE(review): no runtime CPU-feature check is performed in this function;
/// presumably the dispatcher only selects it after detecting SSE4.1 — confirm
/// at the call site.
pub(crate) fn fg_vertical_pass_sse_u16<const CN: usize>(
    bytes: &UnsafeSlice<u16>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    start: u32,
    end: u32,
    edge_mode: EdgeMode,
) {
    // SAFETY (assumed, TODO confirm): the callee is a `#[target_feature(enable = "sse4.1")]`
    // function, so the call is only sound when the running CPU supports SSE4.1.
    unsafe {
        fg_vertical_pass_sse_u16_def::<CN>(
            bytes, stride, width, height, radius, start, end, edge_mode,
        );
    }
}

/// SSE4.1-enabled instantiation of the vertical pass.
///
/// The shared body `fg_vertical_pass_sse_u16_impl` is `#[inline(always)]`, so it
/// is inlined into this function and compiled with `sse4.1` enabled.
#[target_feature(enable = "sse4.1")]
fn fg_vertical_pass_sse_u16_def<const CN: usize>(
    bytes: &UnsafeSlice<u16>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    start: u32,
    end: u32,
    edge_mode: EdgeMode,
) {
    fg_vertical_pass_sse_u16_impl::<CN>(
        bytes, stride, width, height, radius, start, end, edge_mode,
    );
}

/// Vertical fast-gaussian pass over `u16` pixels with `CN` interleaved channels.
///
/// The pass works in place: every column `x` in `start..min(width, end)` is read
/// from and written back to `bytes`. Columns are processed four at a time;
/// leftover columns are handled one at a time by the tail loop.
///
/// For each column the loop walks `y` from `-2 * radius` up to `height` and
/// maintains, per channel lane:
///
/// * `summs` — the running value that is scaled by `1 / radius²` and stored as the
///   output pixel at `y` (only once `y >= 0`);
/// * `diffs` — the running increment added to `summs` on every step. It is updated as
///   `diffs += history[y - radius] - 2 * history[y] + pixel[y + radius]`, where the
///   `history` terms are only applied once the corresponding slot has been filled.
///
/// `history` is the 1024-slot scratch buffer, indexed modulo 1024 (`& 1023`). Every
/// slot read in a column was written earlier in the same column, so stale data from
/// a previous column is never consumed.
///
/// # Parameters
/// * `bytes` — image storage, accessed through raw pointers from `get_ptr`.
/// * `stride` — distance between consecutive rows, added to element offsets
///   (appears to be in `u16` elements rather than bytes — TODO confirm).
/// * `width`, `height` — image size in pixels.
/// * `radius` — kernel radius. NOTE(review): `radius == 0` makes the weight
///   `1 / 0`, and a large radius would make the modulo-1024 history alias itself
///   (roughly `radius > 512`); presumably callers restrict the radius — confirm.
/// * `start`, `end` — half-open column range handled by this call, clipped to `width`.
/// * `edge_mode` — passed to `clamp_edge!` to map `y + radius` into `0..height`.
#[inline(always)]
fn fg_vertical_pass_sse_u16_impl<const CN: usize>(
    bytes: &UnsafeSlice<u16>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    start: u32,
    end: u32,
    edge_mode: EdgeMode,
) {
    // SAFETY (assumptions to confirm against callers/helpers):
    // * scratch indices are masked with `& 1023`, so they stay below the 1024 slots
    //   requested from `ScratchBuffer::new(1024)`;
    // * `_mm_load_si128` / `_mm_store_si128` require 16-byte alignment, so
    //   `SseI32x4` is presumed to be 16-byte aligned;
    // * every offset handed to `bytes.get_ptr` is `row * stride + column * CN` with
    //   `row < height` and `column < width`, which the caller must guarantee lies
    //   inside the allocation (including whatever `load_u16_s32_fast` /
    //   `store_u16_u32` touch for one pixel);
    // * the intrinsics need SSE4.1-capable hardware, which the `_def` wrapper's
    //   `target_feature` contract is expected to cover.
    unsafe {
        let mut buffer = ScratchBuffer::<[SseI32x4; 4], 1024>::new(1024);
        let buffer = buffer.as_mut_slice();

        // Starting value of every running sum: half of the normalisation divisor
        // (presumably a rounding bias — confirm).
        let initial_sum = ((radius * radius) >> 1) as i32;

        let height_wide = height as i64;

        let radius_64 = radius as i64;

        // Normalisation factor applied to the running sum before storing.
        let v_weight = _mm_set1_ps(1f32 / (radius as f32 * radius as f32));

        // Loop invariants, computed once: exclusive column bound and the (negative)
        // warm-up start row.
        let column_limit = width.min(end);
        let start_y = 0 - 2 * radius_64;

        let mut xx = start;

        // Main loop: four columns per iteration, one vector of state per column.
        while xx + 4 <= column_limit {
            let mut diffs0 = _mm_setzero_si128();
            let mut diffs1 = _mm_setzero_si128();
            let mut diffs2 = _mm_setzero_si128();
            let mut diffs3 = _mm_setzero_si128();

            let mut summs0 = _mm_set1_epi32(initial_sum);
            let mut summs1 = _mm_set1_epi32(initial_sum);
            let mut summs2 = _mm_set1_epi32(initial_sum);
            let mut summs3 = _mm_set1_epi32(initial_sum);

            // Element offsets (within a row) of the four columns handled here.
            let current_px0 = (xx * CN as u32) as usize;
            let current_px1 = ((xx + 1) * CN as u32) as usize;
            let current_px2 = ((xx + 2) * CN as u32) as usize;
            let current_px3 = ((xx + 3) * CN as u32) as usize;

            for y in start_y..height_wide {
                if y >= 0 {
                    // Emit the output pixel for row `y`: sum * 1/radius², converted
                    // back to integers.
                    let ss0 = _mm_cvtepi32_ps(summs0);
                    let ss1 = _mm_cvtepi32_ps(summs1);
                    let ss2 = _mm_cvtepi32_ps(summs2);
                    let ss3 = _mm_cvtepi32_ps(summs3);

                    let r0 = _mm_mul_ps(ss0, v_weight);
                    let r1 = _mm_mul_ps(ss1, v_weight);
                    let r2 = _mm_mul_ps(ss2, v_weight);
                    let r3 = _mm_mul_ps(ss3, v_weight);

                    let prepared_px0 = _mm_cvtps_epi32(r0);
                    let prepared_px1 = _mm_cvtps_epi32(r1);
                    let prepared_px2 = _mm_cvtps_epi32(r2);
                    let prepared_px3 = _mm_cvtps_epi32(r3);

                    let current_y = (y * (stride as i64)) as usize;

                    store_u16_u32::<CN>(bytes.get_ptr(current_y + current_px0), prepared_px0);
                    store_u16_u32::<CN>(bytes.get_ptr(current_y + current_px1), prepared_px1);
                    store_u16_u32::<CN>(bytes.get_ptr(current_y + current_px2), prepared_px2);
                    store_u16_u32::<CN>(bytes.get_ptr(current_y + current_px3), prepared_px3);

                    // diffs += history[y - radius] - 2 * history[y]
                    let arr_index = ((y - radius_64) & 1023) as usize;
                    let d_arr_index = (y & 1023) as usize;

                    let da_b = buffer.get_unchecked(d_arr_index);
                    let da = buffer.get_unchecked(arr_index);

                    let mut d_stored0 = _mm_load_si128(da_b.as_ptr().cast());
                    let mut d_stored1 = _mm_load_si128(da_b[1..].as_ptr().cast());
                    let mut d_stored2 = _mm_load_si128(da_b[2..].as_ptr().cast());
                    let mut d_stored3 = _mm_load_si128(da_b[3..].as_ptr().cast());

                    // Shift left by one == multiply the centre sample by two.
                    d_stored0 = _mm_slli_epi32::<1>(d_stored0);
                    d_stored1 = _mm_slli_epi32::<1>(d_stored1);
                    d_stored2 = _mm_slli_epi32::<1>(d_stored2);
                    d_stored3 = _mm_slli_epi32::<1>(d_stored3);

                    let a_stored0 = _mm_load_si128(da.as_ptr().cast());
                    let a_stored1 = _mm_load_si128(da[1..].as_ptr().cast());
                    let a_stored2 = _mm_load_si128(da[2..].as_ptr().cast());
                    let a_stored3 = _mm_load_si128(da[3..].as_ptr().cast());

                    diffs0 = _mm_add_epi32(diffs0, _mm_sub_epi32(a_stored0, d_stored0));
                    diffs1 = _mm_add_epi32(diffs1, _mm_sub_epi32(a_stored1, d_stored1));
                    diffs2 = _mm_add_epi32(diffs2, _mm_sub_epi32(a_stored2, d_stored2));
                    diffs3 = _mm_add_epi32(diffs3, _mm_sub_epi32(a_stored3, d_stored3));
                } else if y + radius_64 >= 0 {
                    // Warm-up phase where only the centre term exists yet:
                    // diffs -= 2 * history[y]
                    let arr_index = (y & 1023) as usize;
                    let da = buffer.get_unchecked(arr_index);
                    let mut stored0 = _mm_load_si128(da.as_ptr().cast());
                    let mut stored1 = _mm_load_si128(da[1..].as_ptr().cast());
                    let mut stored2 = _mm_load_si128(da[2..].as_ptr().cast());
                    let mut stored3 = _mm_load_si128(da[3..].as_ptr().cast());

                    stored0 = _mm_slli_epi32::<1>(stored0);
                    stored1 = _mm_slli_epi32::<1>(stored1);
                    stored2 = _mm_slli_epi32::<1>(stored2);
                    stored3 = _mm_slli_epi32::<1>(stored3);

                    diffs0 = _mm_sub_epi32(diffs0, stored0);
                    diffs1 = _mm_sub_epi32(diffs1, stored1);
                    diffs2 = _mm_sub_epi32(diffs2, stored2);
                    diffs3 = _mm_sub_epi32(diffs3, stored3);
                }

                // Row offset of the incoming sample at `y + radius`, mapped into the
                // image by the edge mode.
                let next_row_y =
                    clamp_edge!(edge_mode, y + radius_64, 0, height_wide) * (stride as usize);

                let pixel_color0 = load_u16_s32_fast::<CN>(bytes.get_ptr(next_row_y + current_px0));
                let pixel_color1 = load_u16_s32_fast::<CN>(bytes.get_ptr(next_row_y + current_px1));
                let pixel_color2 = load_u16_s32_fast::<CN>(bytes.get_ptr(next_row_y + current_px2));
                let pixel_color3 = load_u16_s32_fast::<CN>(bytes.get_ptr(next_row_y + current_px3));

                let arr_index = ((y + radius_64) & 1023) as usize;

                diffs0 = _mm_add_epi32(diffs0, pixel_color0);
                diffs1 = _mm_add_epi32(diffs1, pixel_color1);
                diffs2 = _mm_add_epi32(diffs2, pixel_color2);
                diffs3 = _mm_add_epi32(diffs3, pixel_color3);

                // Remember the sample so later steps can use it as history[y] and
                // history[y - radius].
                let da = buffer.get_unchecked_mut(arr_index);

                _mm_store_si128(da.as_mut_ptr().cast(), pixel_color0);
                _mm_store_si128(da[1..].as_mut_ptr().cast(), pixel_color1);
                _mm_store_si128(da[2..].as_mut_ptr().cast(), pixel_color2);
                _mm_store_si128(da[3..].as_mut_ptr().cast(), pixel_color3);

                summs0 = _mm_add_epi32(summs0, diffs0);
                summs1 = _mm_add_epi32(summs1, diffs1);
                summs2 = _mm_add_epi32(summs2, diffs2);
                summs3 = _mm_add_epi32(summs3, diffs3);
            }

            xx += 4;
        }

        // Tail loop: remaining columns one at a time. Only the first vector of each
        // scratch slot is used here.
        for x in xx..column_limit {
            let mut diffs = _mm_setzero_si128();
            let mut summs = _mm_set1_epi32(initial_sum);

            // Element offset of this column within a row; used for both the store
            // and the load below.
            let current_px = (x * CN as u32) as usize;

            for y in start_y..height_wide {
                if y >= 0 {
                    let pixel_f32 = _mm_mul_ps(_mm_cvtepi32_ps(summs), v_weight);

                    let current_y = (y * (stride as i64)) as usize;

                    let pixel_u32 = _mm_cvtps_epi32(pixel_f32);

                    let bytes_offset = current_y + current_px;

                    store_u16_u32::<CN>(bytes.get_ptr(bytes_offset), pixel_u32);

                    let arr_index = ((y - radius_64) & 1023) as usize;
                    let d_arr_index = (y & 1023) as usize;

                    let d_buf_ptr = buffer.get_unchecked(d_arr_index);
                    let mut d_stored = _mm_load_si128(d_buf_ptr.as_ptr().cast());
                    d_stored = _mm_slli_epi32::<1>(d_stored);

                    let buf_ptr = buffer.get_unchecked(arr_index);
                    let a_stored = _mm_load_si128(buf_ptr.as_ptr().cast());

                    diffs = _mm_add_epi32(diffs, _mm_sub_epi32(a_stored, d_stored));
                } else if y + radius_64 >= 0 {
                    let arr_index = (y & 1023) as usize;
                    let buf_ptr = buffer.get_unchecked(arr_index);
                    let mut stored = _mm_load_si128(buf_ptr.as_ptr().cast());
                    stored = _mm_slli_epi32::<1>(stored);
                    diffs = _mm_sub_epi32(diffs, stored);
                }

                let next_row_y =
                    clamp_edge!(edge_mode, y + radius_64, 0, height_wide) * (stride as usize);

                let pixel_color = load_u16_s32_fast::<CN>(bytes.get_ptr(next_row_y + current_px));

                let arr_index = ((y + radius_64) & 1023) as usize;
                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();

                diffs = _mm_add_epi32(diffs, pixel_color);

                _mm_store_si128(buf_ptr.cast(), pixel_color);

                summs = _mm_add_epi32(summs, diffs);
            }
        }
    }
}