jpeg-encoder 0.6.0

JPEG encoder
Documentation
#[cfg(target_arch = "x86")]
use core::arch::x86::{
    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
    _mm256_srli_epi32, _mm256_sub_epi32,
};

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
    _mm256_srli_epi32, _mm256_sub_epi32,
};

use alloc::vec::Vec;

use crate::{rgb_to_ycbcr, ImageBuffer, JpegColorType};

/// Defines an AVX2-accelerated RGB-to-YCbCr [`ImageBuffer`] for one packed pixel layout.
///
/// Arguments:
/// * `$name` - name of the generated tuple struct.
/// * `$num_colors` - number of bytes per pixel.
/// * `$o1`, `$o2`, `$o3` - byte offsets of the red, green and blue channel inside a pixel.
macro_rules! ycbcr_image_avx2 {
    ($name:ident, $num_colors:expr, $o1:expr, $o2:expr, $o3:expr) => {
        /// Borrowed, tightly packed pixel rows that are converted to YCbCr row by row.
        ///
        /// Fields: `.0` pixel bytes (rows follow each other without padding),
        /// `.1` width in pixels, `.2` height in pixels.
        pub(crate) struct $name<'a>(pub &'a [u8], pub u16, pub u16);

        impl<'a> $name<'a> {
            /// Converts row `y` to YCbCr and appends `width` bytes to each of
            /// `buffers[0]` (Y), `buffers[1]` (Cb) and `buffers[2]` (Cr).
            /// `buffers[3]` is left untouched.
            ///
            /// Eight pixels are converted per AVX2 iteration; the `width % 8`
            /// pixels left over at the end of the row go through the scalar
            /// `rgb_to_ycbcr`.
            ///
            /// # Panics
            ///
            /// Panics if the pixel slice does not contain the whole of row `y`.
            ///
            /// # Safety
            ///
            /// The CPU executing this function must support AVX2.
            #[target_feature(enable = "avx2")]
            unsafe fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
                // Gathers one channel of eight consecutive pixels into 32-bit lanes.
                // `_mm256_set_epi32` takes the highest lane first, so the first
                // pixel ends up in lane 7 and the eighth pixel in lane 0.
                //
                // The caller guarantees that `data + 7 * bytes_per_pixel` is readable.
                #[inline]
                #[target_feature(enable = "avx2")]
                unsafe fn load3(data: *const u8) -> __m256i {
                    _mm256_set_epi32(
                        *data as i32,
                        *data.add($num_colors) as i32,
                        *data.add(2 * $num_colors) as i32,
                        *data.add(3 * $num_colors) as i32,
                        *data.add(4 * $num_colors) as i32,
                        *data.add(5 * $num_colors) as i32,
                        *data.add(6 * $num_colors) as i32,
                        *data.add(7 * $num_colors) as i32,
                    )
                }

                // Writes the low byte of every lane to `out[0..8]`. Lanes are
                // visited from 7 down to 0 to undo the ordering produced by `load3`.
                //
                // The caller guarantees that `out[0..8]` is writable.
                #[inline]
                #[target_feature(enable = "avx2")]
                unsafe fn store8(lanes: __m256i, out: *mut u8) {
                    let lanes: [i32; 8] = core::mem::transmute(lanes);
                    for (i, lane) in lanes.iter().rev().enumerate() {
                        *out.add(i) = *lane as u8;
                    }
                }

                let width = usize::from(self.width());
                let row_len = width * $num_colors;

                // Validate the source row once so that every raw read below is
                // known to stay inside the borrowed pixel slice.
                let row = usize::from(y)
                    .checked_mul(row_len)
                    .and_then(|start| self.0.get(start..))
                    .and_then(|tail| tail.get(..row_len))
                    .expect("image data does not contain the requested row");

                // Make room for the new samples before taking raw write pointers;
                // `reserve` may reallocate, so the pointers are fetched afterwards.
                for plane in buffers.iter_mut().take(3) {
                    plane.reserve(width);
                }

                let mut y_buffer = buffers[0].as_mut_ptr().add(buffers[0].len());
                let mut cb_buffer = buffers[1].as_mut_ptr().add(buffers[1].len());
                let mut cr_buffer = buffers[2].as_mut_ptr().add(buffers[2].len());

                // Conversion weights scaled by 2^16 (the Y weights sum to 65536).
                let ymulr = _mm256_set1_epi32(19595);
                let ymulg = _mm256_set1_epi32(38470);
                let ymulb = _mm256_set1_epi32(7471);

                let cbmulr = _mm256_set1_epi32(-11059);
                let cbmulg = _mm256_set1_epi32(21709);
                let cbmulb = _mm256_set1_epi32(32768);

                let crmulr = _mm256_set1_epi32(32768);
                let crmulg = _mm256_set1_epi32(27439);
                let crmulb = _mm256_set1_epi32(5329);

                // Rounding term added before the `>> 16`, and the same term
                // combined with the +128 chroma offset (also scaled by 2^16).
                let rounding = _mm256_set1_epi32(0x7FFF);
                let chroma_bias = _mm256_set1_epi32((128 << 16) + 0x7FFF);

                let mut data = row.as_ptr();

                for _ in 0..width / 8 {
                    let r = load3(data.add($o1));
                    let g = load3(data.add($o2));
                    let b = load3(data.add($o3));

                    data = data.add($num_colors * 8);

                    let yr = _mm256_mullo_epi32(ymulr, r);
                    let yg = _mm256_mullo_epi32(ymulg, g);
                    let yb = _mm256_mullo_epi32(ymulb, b);

                    let luma = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb);
                    let luma = _mm256_add_epi32(luma, rounding);
                    store8(_mm256_srli_epi32(luma, 16), y_buffer);
                    y_buffer = y_buffer.add(8);

                    let cbr = _mm256_mullo_epi32(cbmulr, r);
                    let cbg = _mm256_mullo_epi32(cbmulg, g);
                    let cbb = _mm256_mullo_epi32(cbmulb, b);

                    // `cbmulr` is already negative, so only the green term is subtracted.
                    let cb = _mm256_add_epi32(_mm256_sub_epi32(cbr, cbg), cbb);
                    let cb = _mm256_add_epi32(cb, chroma_bias);
                    store8(_mm256_srli_epi32(cb, 16), cb_buffer);
                    cb_buffer = cb_buffer.add(8);

                    let crr = _mm256_mullo_epi32(crmulr, r);
                    let crg = _mm256_mullo_epi32(crmulg, g);
                    let crb = _mm256_mullo_epi32(crmulb, b);

                    let cr = _mm256_sub_epi32(_mm256_sub_epi32(crr, crg), crb);
                    let cr = _mm256_add_epi32(cr, chroma_bias);
                    store8(_mm256_srli_epi32(cr, 16), cr_buffer);
                    cr_buffer = cr_buffer.add(8);
                }

                // Scalar tail for the pixels that do not fill a whole group of eight.
                for _ in 0..width % 8 {
                    let (luma, cb, cr) =
                        rgb_to_ycbcr(*data.add($o1), *data.add($o2), *data.add($o3));

                    data = data.add($num_colors);

                    *y_buffer = luma;
                    y_buffer = y_buffer.add(1);

                    *cb_buffer = cb;
                    cb_buffer = cb_buffer.add(1);

                    *cr_buffer = cr;
                    cr_buffer = cr_buffer.add(1);
                }

                // Publish the new samples only after all of them have been written.
                for plane in buffers.iter_mut().take(3) {
                    let new_len = plane.len() + width;
                    plane.set_len(new_len);
                }
            }
        }

        impl<'a> ImageBuffer for $name<'a> {
            /// The rows handed to the encoder are already converted to YCbCr.
            fn get_jpeg_color_type(&self) -> JpegColorType {
                JpegColorType::Ycbcr
            }

            /// Image width in pixels.
            fn width(&self) -> u16 {
                self.1
            }

            /// Image height in pixels.
            fn height(&self) -> u16 {
                self.2
            }

            /// Appends the YCbCr samples of row `y` to `buffers[0..3]`.
            ///
            /// Panics if the pixel slice does not contain the whole of row `y`.
            #[inline(always)]
            fn fill_buffers(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
                // SAFETY: all memory accesses are bounds-checked or reserved inside
                // `fill_buffers_avx2`; the remaining requirement is AVX2 support.
                // NOTE(review): nothing in this file checks for AVX2 - presumably
                // the code constructing this type performs runtime feature
                // detection first; confirm against the caller.
                unsafe {
                    self.fill_buffers_avx2(y, buffers);
                }
            }
        }
    };
}

// Concrete AVX2 row converters, one per packed pixel layout.
// Macro arguments: type name, bytes per pixel, then the byte offsets of the
// red, green and blue channel inside each pixel.
//
// 3 bytes per pixel; red, green, blue at offsets 0, 1, 2.
ycbcr_image_avx2!(RgbImageAVX2, 3, 0, 1, 2);
// 4 bytes per pixel; red, green, blue at offsets 0, 1, 2 (the byte at offset 3 is never read).
ycbcr_image_avx2!(RgbaImageAVX2, 4, 0, 1, 2);
// 3 bytes per pixel; red at offset 2, green at 1, blue at 0.
ycbcr_image_avx2!(BgrImageAVX2, 3, 2, 1, 0);
// 4 bytes per pixel; red at offset 2, green at 1, blue at 0 (the byte at offset 3 is never read).
ycbcr_image_avx2!(BgraImageAVX2, 4, 2, 1, 0);