vello_common 0.0.8

// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! A number of SIMD extension traits.

use fearless_simd::*;

/// Extension trait for broadcasting every 4th lane of a vector across its group of
/// four lanes.
///
/// The vector is treated as consecutive groups of four lanes; within each group, the
/// last lane is copied into all four lanes of that group. This is used for splatting
/// the alpha value of a color to all lanes of that color.
///
/// The type parameter `S` is not referenced by the trait's method; the implementations
/// in this module instantiate it with the `Simd` type of the implementing vector.
pub trait Splat4thExt<S> {
    /// Returns a vector in which each group of four lanes is filled with the 4th lane
    /// of that group.
    fn splat_4th(self) -> Self;
}

impl<S: Simd> Splat4thExt<S> for f32x4<S> {
    /// Fills all four lanes with the value of the last lane.
    #[inline(always)]
    fn splat_4th(self) -> Self {
        // TODO: Explore whether it's just faster to manually access the 4th element and splat it.
        //
        // Zip the vector with itself twice. Assuming `zip_high` interleaves the upper
        // halves of its operands, this goes `[a, b, c, d]` -> `[c, c, d, d]` ->
        // `[d, d, d, d]`.
        let upper_pairs = self.zip_high(self);
        upper_pairs.zip_high(upper_pairs)
    }
}

impl<S: Simd> Splat4thExt<S> for f32x8<S> {
    /// Applies the 4-lane splat independently to each half of the vector.
    #[inline(always)]
    fn splat_4th(self) -> Self {
        let simd = self.simd;
        // Break the vector into two `f32x4` halves, splat each one, and glue the
        // results back together in the same order.
        let (first_half, second_half) = simd.split_f32x8(self);

        simd.combine_f32x4(first_half.splat_4th(), second_half.splat_4th())
    }
}

impl<S: Simd> Splat4thExt<S> for f32x16<S> {
    /// Applies the 8-lane splat independently to each half of the vector.
    #[inline(always)]
    fn splat_4th(self) -> Self {
        let simd = self.simd;
        // Delegate to the `f32x8` implementation for both halves, then recombine them
        // in their original order.
        let (first_half, second_half) = simd.split_f32x16(self);

        simd.combine_f32x8(first_half.splat_4th(), second_half.splat_4th())
    }
}

impl<S: Simd> Splat4thExt<S> for u8x16<S> {
    /// Fills each group of four bytes with the last byte of that group
    /// (lanes 3, 7, 11 and 15).
    #[inline(always)]
    fn splat_4th(self) -> Self {
        // TODO: SIMDify
        //
        // Scalar path: read the 4th lane of every group, then build the result lane by
        // lane from those four values.
        let (g0, g1, g2, g3) = (self[3], self[7], self[11], self[15]);
        let lanes = [g0, g0, g0, g0, g1, g1, g1, g1, g2, g2, g2, g2, g3, g3, g3, g3];

        lanes.simd_into(self.simd)
    }
}

impl<S: Simd> Splat4thExt<S> for u8x32<S> {
    /// Applies the 16-lane splat independently to each half of the vector.
    #[inline(always)]
    fn splat_4th(self) -> Self {
        let simd = self.simd;
        // Delegate to the `u8x16` implementation for both halves, then recombine them
        // in their original order.
        let (first_half, second_half) = simd.split_u8x32(self);

        simd.combine_u8x16(first_half.splat_4th(), second_half.splat_4th())
    }
}

/// Expand a 4-lane vector into a 16-lane vector by splatting each input lane to 4 lanes.
///
/// Each of the four input lanes is broadcast into its own `f32x4`; the broadcasts of
/// lanes 0 and 1 are combined into one `f32x8`, those of lanes 2 and 3 into another,
/// and the two are combined (in that order) into the result.
#[inline(always)]
pub fn element_wise_splat<S: Simd>(simd: S, input: f32x4<S>) -> f32x16<S> {
    let lane0 = f32x4::splat(simd, input[0]);
    let lane1 = f32x4::splat(simd, input[1]);
    let lane2 = f32x4::splat(simd, input[2]);
    let lane3 = f32x4::splat(simd, input[3]);

    let first_half = simd.combine_f32x4(lane0, lane1);
    let second_half = simd.combine_f32x4(lane2, lane3);

    simd.combine_f32x8(first_half, second_half)
}