mod compose;
mod gradient;
mod image;
use crate::filter::filter_lowp;
use crate::fine::lowp::image::{BilinearImagePainter, PlainBilinearImagePainter};
use crate::fine::{COLOR_COMPONENTS, Painter, SCRATCH_BUF_SIZE, Splat4thExt};
use crate::fine::{FineKernel, highp, u8_to_f32};
use crate::layer_manager::LayerManager;
use crate::peniko::BlendMode;
use crate::region::Region;
use crate::util::NormalizedMulExt;
use crate::util::scalar::div_255;
use bytemuck::cast_slice;
use core::iter;
use vello_common::coarse::WideTile;
use vello_common::encode::{EncodedGradient, EncodedImage};
use vello_common::fearless_simd::*;
use vello_common::filter_effects::Filter;
use vello_common::kurbo::Affine;
use vello_common::mask::Mask;
use vello_common::paint::{PremulColor, Tint, TintMode};
use vello_common::pixmap::Pixmap;
use vello_common::tile::Tile;
use vello_common::util::{Div255Ext, f32_to_u8};
/// Marker type selecting the fine-rasterization kernel whose numeric type is `u8`.
#[derive(Debug, Clone, Copy)]
pub struct U8Kernel;
impl<S: Simd> FineKernel<S> for U8Kernel {
type Numeric = u8;
type Composite = u8x32<S>;
type NumericVec = u8x16<S>;
#[inline]
fn extract_color(color: PremulColor) -> [Self::Numeric; 4] {
color.as_premul_rgba8().to_u8_array()
}
#[inline(always)]
fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]) {
if region.width != WideTile::WIDTH || region.height != Tile::HEIGHT {
pack(region, blend_buf);
} else {
simd.vectorize(
#[inline(always)]
|| {
pack_block(simd, region, blend_buf);
},
);
}
}
#[inline(always)]
fn unpack(simd: S, region: &mut Region<'_>, blend_buf: &mut [Self::Numeric]) {
if region.width != WideTile::WIDTH || region.height != Tile::HEIGHT {
unpack(region, blend_buf);
} else {
simd.vectorize(
#[inline(always)]
|| {
unpack_block(simd, region, blend_buf);
},
);
}
}
fn filter_layer(
pixmap: &mut Pixmap,
filter: &Filter,
layer_manager: &mut LayerManager,
transform: Affine,
) {
filter_lowp(filter, pixmap, layer_manager, transform);
}
fn copy_solid(simd: S, dest: &mut [Self::Numeric], src: [Self::Numeric; 4]) {
simd.vectorize(
#[inline(always)]
|| {
let target: &mut [u32] = bytemuck::cast_slice_mut(dest);
target.fill(u32::from_ne_bytes(src));
},
);
}
fn gradient_painter<'a>(
simd: S,
gradient: &'a EncodedGradient,
t_vals: &'a [f32],
) -> impl Painter + 'a {
simd.vectorize(
#[inline(always)]
|| gradient::GradientPainter::new(simd, gradient, t_vals),
)
}
fn medium_quality_image_painter<'a>(
simd: S,
image: &'a EncodedImage,
pixmap: &'a Pixmap,
start_x: f64,
start_y: f64,
) -> impl Painter + 'a {
simd.vectorize(
#[inline(always)]
|| BilinearImagePainter::new(simd, image, pixmap, start_x, start_y),
)
}
fn plain_medium_quality_image_painter<'a>(
simd: S,
image: &'a EncodedImage,
pixmap: &'a Pixmap,
start_x: f64,
start_y: f64,
) -> impl Painter + 'a {
simd.vectorize(
#[inline(always)]
|| PlainBilinearImagePainter::new(simd, image, pixmap, start_x, start_y),
)
}
fn apply_mask(
simd: S,
dest: &mut [Self::Numeric],
mut src: impl Iterator<Item = Self::NumericVec>,
) {
simd.vectorize(
#[inline(always)]
|| {
for el in dest.chunks_exact_mut(16) {
let loaded = u8x16::from_slice(simd, el);
let mulled = simd.narrow_u16x16(
(simd.widen_u8x16(loaded) * simd.widen_u8x16(src.next().unwrap()))
.div_255(),
);
el.copy_from_slice(mulled.as_slice());
}
},
);
}
#[inline(always)]
fn apply_painter<'a>(_: S, dest: &mut [Self::Numeric], mut painter: impl Painter + 'a) {
painter.paint_u8(dest);
}
#[inline(always)]
fn apply_tint(simd: S, dest: &mut [Self::Numeric], tint: &Tint) {
let premul = tint.color.premultiply();
let [r, g, b, a] = premul.components;
let to_u8 = |v: f32| (v * 255.0 + 0.5) as u8;
let color = u32::from_ne_bytes([to_u8(r), to_u8(g), to_u8(b), to_u8(a)]);
let tint_v = u32x8::block_splat(u32x4::splat(simd, color)).to_bytes();
simd.vectorize(
#[inline(always)]
|| match tint.mode {
TintMode::AlphaMask => {
for chunk in dest.chunks_exact_mut(32) {
let pixel = u8x32::from_slice(simd, chunk);
let alphas = pixel.splat_4th();
let tinted = tint_v.normalized_mul(alphas);
chunk.copy_from_slice(tinted.as_slice());
}
}
TintMode::Multiply => {
for chunk in dest.chunks_exact_mut(32) {
let pixel = u8x32::from_slice(simd, chunk);
let tinted = pixel.normalized_mul(tint_v);
chunk.copy_from_slice(tinted.as_slice());
}
}
},
);
}
#[inline(always)]
fn alpha_composite_solid(
simd: S,
dest: &mut [Self::Numeric],
src: [Self::Numeric; 4],
alphas: Option<&[u8]>,
) {
if let Some(alphas) = alphas {
alpha_fill::alpha_composite_solid(
simd,
dest,
src,
cast_slice::<u8, [u8; 8]>(alphas).iter().copied(),
);
} else {
fill::alpha_composite_solid(simd, dest, src);
}
}
fn alpha_composite_buffer(
simd: S,
dest: &mut [Self::Numeric],
src: &[Self::Numeric],
alphas: Option<&[u8]>,
) {
let src_iter = src.chunks_exact(32).map(|el| u8x32::from_slice(simd, el));
if let Some(alphas) = alphas {
alpha_fill::alpha_composite(
simd,
dest,
src_iter,
cast_slice::<u8, [u8; 8]>(alphas).iter().copied(),
);
} else {
fill::alpha_composite(simd, dest, src_iter);
}
}
fn blend(
simd: S,
dest: &mut [Self::Numeric],
mut start_x: u16,
start_y: u16,
src: impl Iterator<Item = Self::Composite>,
blend_mode: BlendMode,
alphas: Option<&[u8]>,
mask: Option<&Mask>,
) {
let alpha_iter = alphas.map(|a| cast_slice::<u8, [u8; 8]>(a).iter().copied());
let mask_iter = mask.map(|m| {
iter::from_fn(|| {
let sample = |x: u16, y: u16| {
if x < m.width() && y < m.height() {
m.sample(x, y)
} else {
255
}
};
let samples = [
sample(start_x, start_y),
sample(start_x, start_y + 1),
sample(start_x, start_y + 2),
sample(start_x, start_y + 3),
sample(start_x + 1, start_y),
sample(start_x + 1, start_y + 1),
sample(start_x + 1, start_y + 2),
sample(start_x + 1, start_y + 3),
];
start_x += 2;
Some(samples)
})
});
match (alpha_iter, mask_iter) {
(Some(alpha_iter), Some(mut mask_iter)) => {
let iter = alpha_iter.map(|a1| {
let a2 = mask_iter.next().unwrap();
[
div_255(a1[0] as u16 * a2[0] as u16) as u8,
div_255(a1[1] as u16 * a2[1] as u16) as u8,
div_255(a1[2] as u16 * a2[2] as u16) as u8,
div_255(a1[3] as u16 * a2[3] as u16) as u8,
div_255(a1[4] as u16 * a2[4] as u16) as u8,
div_255(a1[5] as u16 * a2[5] as u16) as u8,
div_255(a1[6] as u16 * a2[6] as u16) as u8,
div_255(a1[7] as u16 * a2[7] as u16) as u8,
]
});
alpha_fill::blend(simd, dest, src, blend_mode, iter);
}
(None, Some(mask_iter)) => alpha_fill::blend(simd, dest, src, blend_mode, mask_iter),
(Some(alpha_iter), None) => alpha_fill::blend(simd, dest, src, blend_mode, alpha_iter),
(None, None) => {
fill::blend(simd, dest, src, blend_mode);
}
}
}
}
/// Blending and compositing routines that take no per-pixel coverage values.
mod fill {
    use crate::fine::Splat4thExt;
    use crate::fine::lowp::compose::ComposeExt;
    use crate::fine::lowp::mix;
    use crate::peniko::{BlendMode, Mix};
    use vello_common::fearless_simd::*;
    use vello_common::util::normalized_mul_u8x32;

    /// Blends the vectors yielded by `src` into `dest`, 32 bytes at a time.
    ///
    /// The source goes through `mix` first unless `blend_mode.mix` is `Mix::Normal`; the result
    /// is then composed with the backdrop via `blend_mode.compose` without a mask. Processing
    /// stops as soon as either `dest` has no complete chunk left or `src` is exhausted.
    pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        blend_mode: BlendMode,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let needs_mix = !matches!(blend_mode.mix, Mix::Normal);
                for (dest_chunk, src_px) in dest.chunks_exact_mut(32).zip(src) {
                    let backdrop = u8x32::from_slice(simd, dest_chunk);
                    let mixed = if needs_mix {
                        mix(src_px, backdrop, blend_mode)
                    } else {
                        src_px
                    };
                    let composed = blend_mode.compose(simd, mixed, backdrop, None);
                    dest_chunk.copy_from_slice(composed.as_slice());
                }
            },
        );
    }

    /// Composites the solid color `src` over every complete 64-byte chunk of `dest`.
    ///
    /// Each 64-byte load is split into two 32-byte halves that are composited separately and
    /// recombined before being stored.
    pub(super) fn alpha_composite_solid<S: Simd>(s: S, dest: &mut [u8], src: [u8; 4]) {
        s.vectorize(
            #[inline(always)]
            || {
                let src_c = u32x8::splat(s, u32::from_ne_bytes(src)).to_bytes();
                let one_minus_alpha = 255 - u8x32::splat(s, src[3]);
                for chunk in dest.chunks_exact_mut(64) {
                    let (lower, upper) = s.split_u8x64(u8x64::from_slice(s, chunk));
                    let lower = alpha_composite_inner(s, lower, src_c, one_minus_alpha);
                    let upper = alpha_composite_inner(s, upper, src_c, one_minus_alpha);
                    let merged = s.combine_u8x32(lower, upper);
                    chunk.copy_from_slice(merged.as_slice());
                }
            },
        );
    }

    /// Composites the vectors yielded by `src` over `dest`, 32 bytes at a time.
    ///
    /// The inverse alpha of each source vector is `255 - splat_4th()` of that vector.
    pub(super) fn alpha_composite<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                for (dest_chunk, src_px) in dest.chunks_exact_mut(32).zip(src) {
                    let inv_alpha = 255 - src_px.splat_4th();
                    let backdrop = u8x32::from_slice(simd, dest_chunk);
                    let composited = alpha_composite_inner(simd, backdrop, src_px, inv_alpha);
                    dest_chunk.copy_from_slice(composited.as_slice());
                }
            },
        );
    }

    /// Computes `normalized_mul(bg, one_minus_alpha) + src` for one 32-byte vector.
    #[inline(always)]
    fn alpha_composite_inner<S: Simd>(
        s: S,
        bg: u8x32<S>,
        src: u8x32<S>,
        one_minus_alpha: u8x32<S>,
    ) -> u8x32<S> {
        let attenuated_bg = s.narrow_u16x32(normalized_mul_u8x32(bg, one_minus_alpha));
        attenuated_bg + src
    }
}
/// Blending and compositing routines driven by per-pixel coverage values.
///
/// Coverage arrives as `[u8; 8]` items, one item per 32-byte chunk of destination pixels.
mod alpha_fill {
    use crate::fine::Splat4thExt;
    use crate::fine::lowp::compose::ComposeExt;
    use crate::fine::lowp::{extract_masks, mix};
    use crate::peniko::{BlendMode, Mix};
    use vello_common::fearless_simd::*;
    use vello_common::util::{Div255Ext, normalized_mul_u8x32};

    /// Blends the vectors yielded by `src` into `dest`, passing the expanded coverage to
    /// `blend_mode.compose`.
    ///
    /// The source goes through `mix` first unless `blend_mode.mix` is `Mix::Normal`. Processing
    /// stops as soon as `dest`, `alphas` or `src` runs out.
    pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        blend_mode: BlendMode,
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let needs_mix = !matches!(blend_mode.mix, Mix::Normal);
                let work = dest.chunks_exact_mut(32).zip(alphas).zip(src);
                for ((dest_chunk, coverage), src_px) in work {
                    let backdrop = u8x32::from_slice(simd, dest_chunk);
                    let mixed = if needs_mix {
                        mix(src_px, backdrop, blend_mode)
                    } else {
                        src_px
                    };
                    let coverage_v = extract_masks(simd, &coverage);
                    let composed = blend_mode.compose(simd, mixed, backdrop, Some(coverage_v));
                    dest_chunk.copy_from_slice(composed.as_slice());
                }
            },
        );
    }

    /// Composites the solid color `src` over `dest`, weighted by `alphas`.
    #[inline(always)]
    pub(super) fn alpha_composite_solid<S: Simd>(
        s: S,
        dest: &mut [u8],
        src: [u8; 4],
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        s.vectorize(
            #[inline(always)]
            || {
                let full = u8x32::splat(s, 255);
                let color_v = u32x8::splat(s, u32::from_ne_bytes(src)).to_bytes();
                let alpha_v = u8x32::splat(s, src[3]);
                for (dest_chunk, coverage) in dest.chunks_exact_mut(32).zip(alphas) {
                    alpha_composite_inner(s, dest_chunk, &coverage, color_v, alpha_v, full);
                }
            },
        );
    }

    /// Composites the vectors yielded by `src` over `dest`, weighted by `alphas`.
    ///
    /// The source alpha of each vector is taken from its `splat_4th()`.
    #[inline(always)]
    pub(super) fn alpha_composite<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let full = u8x32::splat(simd, 255);
                let work = dest.chunks_exact_mut(32).zip(alphas).zip(src);
                for ((dest_chunk, coverage), src_px) in work {
                    let alpha_v = src_px.splat_4th();
                    alpha_composite_inner(simd, dest_chunk, &coverage, src_px, alpha_v, full);
                }
            },
        );
    }

    /// Composites one 32-byte vector in place.
    ///
    /// Computes `div_255(bg * (one - normalized_mul(src_a, mask)) + src_c * mask)` in 16-bit
    /// lanes and stores the narrowed result back into `dest`.
    #[inline(always)]
    fn alpha_composite_inner<S: Simd>(
        s: S,
        dest: &mut [u8],
        masks: &[u8; 8],
        src_c: u8x32<S>,
        src_a: u8x32<S>,
        one: u8x32<S>,
    ) {
        s.vectorize(
            #[inline(always)]
            || {
                let backdrop = u8x32::from_slice(s, dest);
                let coverage = extract_masks(s, masks);
                let effective_alpha = s.narrow_u16x32(normalized_mul_u8x32(src_a, coverage));
                let backdrop_weight = one - effective_alpha;
                let backdrop_term = s.widen_u8x32(backdrop) * s.widen_u8x32(backdrop_weight);
                let source_term = s.widen_u8x32(src_c) * s.widen_u8x32(coverage);
                let composited = s.narrow_u16x32((backdrop_term + source_term).div_255());
                dest.copy_from_slice(composited.as_slice());
            },
        );
    }
}
/// Applies the mix function of `blend_mode` to `src_c` against the backdrop `bg_c`.
///
/// Both inputs are split into two halves and converted to `f32` scaled by `1 / 255`. Each
/// half is passed to `highp::blend::mix`, and the results are converted back to bytes with
/// `value * 255 + 0.5`.
fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
    let simd = src_c.simd;
    let to_unit_range = |bytes: u8x32<S>| {
        let (first, second) = simd.split_u8x32(bytes);
        let mut first = u8_to_f32(first);
        let mut second = u8_to_f32(second);
        first *= f32x16::splat(simd, 1.0 / 255.0);
        second *= f32x16::splat(simd, 1.0 / 255.0);
        (first, second)
    };
    let to_bytes = |unit: f32x16<S>| {
        let scale = f32x16::splat(simd, 255.0);
        let rounding = f32x16::splat(simd, 0.5);
        f32_to_u8(scale.mul_add(unit, rounding))
    };
    let (src_first, src_second) = to_unit_range(src_c);
    let (bg_first, bg_second) = to_unit_range(bg_c);
    let mixed_first = highp::blend::mix(src_first, bg_first, blend_mode);
    let mixed_second = highp::blend::mix(src_second, bg_second, blend_mode);
    let bytes_first = to_bytes(mixed_first);
    let bytes_second = to_bytes(mixed_second);
    simd.combine_u8x16(bytes_first, bytes_second)
}
/// Expands eight coverage bytes into a 32-lane vector.
///
/// Each group of four bytes is broadcast across a 16-byte vector and then zipped with itself
/// twice. Assuming `zip_low` interleaves the low halves of its operands, every input byte ends
/// up repeated four times in a row, i.e. once per color component of a pixel.
#[inline(always)]
fn extract_masks<S: Simd>(simd: S, masks: &[u8; 8]) -> u8x32<S> {
    let expand = |quad: &[u8]| {
        let word = u32::from_ne_bytes(quad.try_into().unwrap());
        let broadcast = u32x4::splat(simd, word).to_bytes();
        let doubled = broadcast.zip_low(broadcast);
        doubled.zip_low(doubled)
    };
    let first_half = expand(&masks[0..4]);
    let second_half = expand(&masks[4..8]);
    simd.combine_u8x16(first_half, second_half)
}
/// Scalar copy of `blend_buf` into the rows of `region`.
///
/// `blend_buf` is read column by column: the pixel at `(x, y)` starts at
/// `COLOR_COMPONENTS * (Tile::HEIGHT * x + y)`.
///
/// # Panics
///
/// Panics if `blend_buf` is too short for a pixel that `region` exposes.
#[inline(always)]
fn pack(region: &mut Region<'_>, blend_buf: &[u8]) {
    let column_stride = usize::from(Tile::HEIGHT);
    for y in 0..Tile::HEIGHT {
        let row = region.row_mut(y);
        for (x, dest_pixel) in row.chunks_exact_mut(COLOR_COMPONENTS).enumerate() {
            let offset = COLOR_COMPONENTS * (column_stride * x + usize::from(y));
            let src_pixel = &blend_buf[offset..][..COLOR_COMPONENTS];
            dest_pixel.copy_from_slice(src_pixel);
        }
    }
}
/// Scalar copy of the rows of `region` into `blend_buf`.
///
/// `blend_buf` is written column by column: the pixel at `(x, y)` is stored starting at
/// `COLOR_COMPONENTS * (Tile::HEIGHT * x + y)`.
///
/// # Panics
///
/// Panics if `blend_buf` is too short for a pixel that `region` exposes.
#[inline(always)]
fn unpack(region: &mut Region<'_>, blend_buf: &mut [u8]) {
    let column_stride = usize::from(Tile::HEIGHT);
    for y in 0..Tile::HEIGHT {
        let row = region.row_mut(y);
        for (x, src_pixel) in row.chunks_exact(COLOR_COMPONENTS).enumerate() {
            let offset = COLOR_COMPONENTS * (column_stride * x + usize::from(y));
            let dest_pixel = &mut blend_buf[offset..][..COLOR_COMPONENTS];
            dest_pixel.copy_from_slice(src_pixel);
        }
    }
}
/// Vectorized copy of the first `SCRATCH_BUF_SIZE` bytes of `buf` into the four rows of
/// `region`.
///
/// `buf` is consumed in 64-byte chunks. Each chunk is loaded with
/// `load_interleaved_128_u32x16` (presumably a 4x4 transpose of 32-bit pixels; confirm against
/// fearless_simd), and consecutive 16-byte pieces of the result go to rows 0 through 3 at
/// byte offset `chunk_index * 16`.
///
/// # Panics
///
/// Panics if `buf` is shorter than `SCRATCH_BUF_SIZE`, or if a row of `region` is not exactly
/// `WideTile::WIDTH * COLOR_COMPONENTS` bytes long.
#[inline(always)]
fn pack_block<S: Simd>(simd: S, region: &mut Region<'_>, buf: &[u8]) {
    const CHUNK_LENGTH: usize = 64;
    const SLICE_WIDTH: usize = WideTile::WIDTH as usize * COLOR_COMPONENTS;
    let buf = &buf[..SCRATCH_BUF_SIZE];
    let region_areas = region.areas();
    let [s1, s2, s3, s4] = region_areas;
    // Fixed-size rows let the per-chunk writes below be bounds-checked against a constant.
    let rows: &mut [&mut [u8; SLICE_WIDTH]; 4] = &mut [
        (*s1).try_into().unwrap(),
        (*s2).try_into().unwrap(),
        (*s3).try_into().unwrap(),
        (*s4).try_into().unwrap(),
    ];
    for (chunk_idx, chunk) in buf.chunks_exact(CHUNK_LENGTH).enumerate() {
        let row_offset = chunk_idx * CHUNK_LENGTH / 4;
        let words: &[u32; 16] = cast_slice::<u8, u32>(chunk).try_into().unwrap();
        let rearranged = simd.load_interleaved_128_u32x16(words).to_bytes();
        let bytes = rearranged.as_slice();
        for (row, piece) in rows.iter_mut().zip(bytes.chunks_exact(16)) {
            row[row_offset..][..16].copy_from_slice(piece);
        }
    }
}
#[inline(always)]
fn unpack_block<S: Simd>(simd: S, region: &mut Region<'_>, buf: &mut [u8]) {
let buf: &mut [f32] = bytemuck::cast_slice_mut(&mut buf[..SCRATCH_BUF_SIZE]);
const CHUNK_LENGTH: usize = 16;
let region_areas = region.areas();
let [s1, s2, s3, s4] = region_areas;
for (idx, col) in buf.as_chunks_mut::<CHUNK_LENGTH>().0.iter_mut().enumerate() {
let src_idx = idx * CHUNK_LENGTH;
let r0 = f32x4::from_bytes(u8x16::from_slice(simd, &s1[src_idx..][..16]));
let r1 = f32x4::from_bytes(u8x16::from_slice(simd, &s2[src_idx..][..16]));
let r2 = f32x4::from_bytes(u8x16::from_slice(simd, &s3[src_idx..][..16]));
let r3 = f32x4::from_bytes(u8x16::from_slice(simd, &s4[src_idx..][..16]));
let combined = simd.combine_f32x8(simd.combine_f32x4(r0, r1), simd.combine_f32x4(r2, r3));
simd.store_interleaved_128_f32x16(combined, col);
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;
    use alloc::vec::Vec;
    use vello_common::fearless_simd::dispatch;

    /// Packs a deterministic buffer into a full-size region, unpacks it again and checks that
    /// the bytes survive unchanged.
    fn test_pack_unpack_roundtrip(
        pack_fn: impl FnOnce(&mut Region<'_>, &[u8]),
        unpack_fn: impl FnOnce(&mut Region<'_>, &mut [u8]),
    ) {
        let width = WideTile::WIDTH;
        let height = Tile::HEIGHT;
        let row_len = width as usize * COLOR_COMPONENTS;

        // Non-trivial byte pattern so that misplaced pixels are detected.
        let original: Vec<u8> = (0..SCRATCH_BUF_SIZE)
            .map(|n| ((n * 7 + 13) % 256) as u8)
            .collect();

        let mut backing = vec![0_u8; width as usize * height as usize * COLOR_COMPONENTS];
        let (row0, remainder) = backing.split_at_mut(row_len);
        let (row1, remainder) = remainder.split_at_mut(row_len);
        let (row2, row3) = remainder.split_at_mut(row_len);
        let mut region = Region::new([row0, row1, row2, row3], 0, 0, width, height);

        pack_fn(&mut region, &original);

        let mut roundtripped = vec![0_u8; SCRATCH_BUF_SIZE];
        unpack_fn(&mut region, &mut roundtripped);

        assert_eq!(&original, &roundtripped);
    }

    /// The scalar `pack` and `unpack` must invert each other.
    #[test]
    fn pack_unpack_roundtrip() {
        test_pack_unpack_roundtrip(pack, unpack);
    }

    /// The vectorized `pack_block` and `unpack_block` must invert each other.
    #[test]
    fn pack_block_unpack_block_roundtrip() {
        dispatch!(Level::try_detect().unwrap_or(Level::baseline()), simd => {
            test_pack_unpack_roundtrip(
                |region, buf| simd.vectorize(|| pack_block(simd, region, buf)),
                |region, buf| simd.vectorize(|| unpack_block(simd, region, buf)),
            );
        });
    }
}