use crate::GpuValue;
pub trait SimdVector<T: GpuValue>: Copy {
const WIDTH: usize;
fn splat(value: T) -> Self;
fn extract(self, lane: usize) -> T;
fn insert(self, lane: usize, value: T) -> Self;
}
pub trait Platform: Copy + 'static {
const WIDTH: usize;
const NAME: &'static str;
type Vector<T: GpuValue>: SimdVector<T>;
type Mask: Copy;
fn broadcast<T: GpuValue>(value: T) -> Self::Vector<T> {
Self::Vector::splat(value)
}
fn shuffle<T: GpuValue>(source: Self::Vector<T>, indices: Self::Vector<u32>)
-> Self::Vector<T>;
fn shuffle_down<T: GpuValue>(source: Self::Vector<T>, delta: usize) -> Self::Vector<T>;
fn shuffle_xor<T: GpuValue>(source: Self::Vector<T>, mask: usize) -> Self::Vector<T>;
fn reduce_sum<T: GpuValue + core::ops::Add<Output = T>>(values: Self::Vector<T>) -> T;
fn reduce_max<T: GpuValue + Ord>(values: Self::Vector<T>) -> T;
fn reduce_min<T: GpuValue + Ord>(values: Self::Vector<T>) -> T;
fn ballot(predicates: Self::Vector<bool>) -> Self::Mask;
fn all(predicates: Self::Vector<bool>) -> bool;
fn any(predicates: Self::Vector<bool>) -> bool;
fn mask_popcount(mask: Self::Mask) -> u32;
}
#[derive(Copy, Clone, Debug)]
pub struct CpuSimd<const WIDTH: usize>;
#[derive(Copy, Clone, Debug)]
pub struct PortableVector<T: GpuValue, const WIDTH: usize> {
data: [T; WIDTH],
}
impl<T: GpuValue, const WIDTH: usize> SimdVector<T> for PortableVector<T, WIDTH> {
const WIDTH: usize = WIDTH;
fn splat(value: T) -> Self {
PortableVector {
data: [value; WIDTH],
}
}
fn extract(self, lane: usize) -> T {
assert!(lane < WIDTH, "extract: lane {lane} >= WIDTH {WIDTH}");
self.data[lane]
}
fn insert(self, lane: usize, value: T) -> Self {
assert!(lane < WIDTH, "insert: lane {lane} >= WIDTH {WIDTH}");
let mut result = self;
result.data[lane] = value;
result
}
}
impl<T: GpuValue, const WIDTH: usize> Default for PortableVector<T, WIDTH> {
fn default() -> Self {
PortableVector {
data: [T::default(); WIDTH],
}
}
}
impl<const WIDTH: usize> Platform for CpuSimd<WIDTH>
where
[(); WIDTH]: Sized,
{
const WIDTH: usize = WIDTH;
const NAME: &'static str = "CpuSimd";
type Vector<T: GpuValue> = PortableVector<T, WIDTH>;
type Mask = u64;
fn shuffle<T: GpuValue>(
source: Self::Vector<T>,
indices: Self::Vector<u32>,
) -> Self::Vector<T> {
let mut result = PortableVector::default();
for i in 0..WIDTH {
let src_idx = indices.data[i] as usize % WIDTH;
result.data[i] = source.data[src_idx];
}
result
}
fn shuffle_down<T: GpuValue>(source: Self::Vector<T>, delta: usize) -> Self::Vector<T> {
let mut result = PortableVector::default();
for i in 0..WIDTH {
let src_idx = if i + delta < WIDTH { i + delta } else { i };
result.data[i] = source.data[src_idx];
}
result
}
fn shuffle_xor<T: GpuValue>(source: Self::Vector<T>, mask: usize) -> Self::Vector<T> {
let mut result = PortableVector::default();
for i in 0..WIDTH {
let src_idx = (i ^ mask) % WIDTH;
result.data[i] = source.data[src_idx];
}
result
}
fn reduce_sum<T: GpuValue + core::ops::Add<Output = T>>(values: Self::Vector<T>) -> T {
const { assert!(WIDTH > 0, "CpuSimd<WIDTH>: reduce requires WIDTH > 0") };
values.data.into_iter().reduce(|a, b| a + b).unwrap()
}
fn reduce_max<T: GpuValue + Ord>(values: Self::Vector<T>) -> T {
const { assert!(WIDTH > 0, "CpuSimd<WIDTH>: reduce requires WIDTH > 0") };
values.data.into_iter().max().unwrap()
}
fn reduce_min<T: GpuValue + Ord>(values: Self::Vector<T>) -> T {
const { assert!(WIDTH > 0, "CpuSimd<WIDTH>: reduce requires WIDTH > 0") };
values.data.into_iter().min().unwrap()
}
fn ballot(predicates: Self::Vector<bool>) -> Self::Mask {
const {
assert!(
WIDTH <= 64,
"CpuSimd<WIDTH>: ballot requires WIDTH <= 64 (u64 mask)"
)
};
let mut mask = 0u64;
for i in 0..WIDTH {
if predicates.data[i] {
mask |= 1u64 << i;
}
}
mask
}
fn all(predicates: Self::Vector<bool>) -> bool {
predicates.data.iter().all(|&b| b)
}
fn any(predicates: Self::Vector<bool>) -> bool {
predicates.data.iter().any(|&b| b)
}
fn mask_popcount(mask: Self::Mask) -> u32 {
mask.count_ones()
}
}
#[derive(Copy, Clone, Debug)]
pub struct GpuWarp32;
#[derive(Copy, Clone, Debug)]
pub struct GpuWarp64;
impl Platform for GpuWarp32 {
const WIDTH: usize = 32;
const NAME: &'static str = "GpuWarp32";
type Vector<T: GpuValue> = PortableVector<T, 32>;
type Mask = u32;
fn shuffle<T: GpuValue>(
source: Self::Vector<T>,
indices: Self::Vector<u32>,
) -> Self::Vector<T> {
let mut result = PortableVector::default();
for i in 0..32 {
let src_idx = indices.data[i] as usize % 32;
result.data[i] = source.data[src_idx];
}
result
}
fn shuffle_down<T: GpuValue>(source: Self::Vector<T>, delta: usize) -> Self::Vector<T> {
let mut result = PortableVector::default();
for i in 0..32 {
let src_idx = i + delta;
result.data[i] = if src_idx < 32 {
source.data[src_idx]
} else {
source.data[i]
};
}
result
}
fn shuffle_xor<T: GpuValue>(source: Self::Vector<T>, mask: usize) -> Self::Vector<T> {
CpuSimd::<32>::shuffle_xor(source, mask)
}
fn reduce_sum<T: GpuValue + core::ops::Add<Output = T>>(values: Self::Vector<T>) -> T {
CpuSimd::<32>::reduce_sum(values)
}
fn reduce_max<T: GpuValue + Ord>(values: Self::Vector<T>) -> T {
CpuSimd::<32>::reduce_max(values)
}
fn reduce_min<T: GpuValue + Ord>(values: Self::Vector<T>) -> T {
CpuSimd::<32>::reduce_min(values)
}
fn ballot(predicates: Self::Vector<bool>) -> Self::Mask {
CpuSimd::<32>::ballot(predicates) as u32
}
fn all(predicates: Self::Vector<bool>) -> bool {
CpuSimd::<32>::all(predicates)
}
fn any(predicates: Self::Vector<bool>) -> bool {
CpuSimd::<32>::any(predicates)
}
fn mask_popcount(mask: Self::Mask) -> u32 {
mask.count_ones()
}
}
pub fn butterfly_reduce_sum<const WIDTH: usize, T>(values: PortableVector<T, WIDTH>) -> T
where
T: GpuValue + core::ops::Add<Output = T>,
{
const {
assert!(
WIDTH.is_power_of_two(),
"butterfly_reduce_sum requires power-of-2 WIDTH"
)
};
let mut v = values;
let mut stride = 1;
while stride < WIDTH {
let mut shuffled: PortableVector<T, WIDTH> = PortableVector::default();
for i in 0..WIDTH {
shuffled.data[i] = v.data[i ^ stride];
}
for i in 0..WIDTH {
v.data[i] = v.data[i] + shuffled.data[i];
}
stride *= 2;
}
v.data[0]
}
pub fn prefix_sum<const WIDTH: usize, T>(
values: PortableVector<T, WIDTH>,
) -> PortableVector<T, WIDTH>
where
T: GpuValue + core::ops::Add<Output = T>,
{
let mut v = values;
let mut stride = 1;
while stride < WIDTH {
let mut result = v;
for i in stride..WIDTH {
result.data[i] = v.data[i] + v.data[i - stride];
}
v = result;
stride *= 2;
}
v
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cpu_simd_broadcast() {
let v = CpuSimd::<8>::broadcast(42i32);
for i in 0..8 {
assert_eq!(v.extract(i), 42);
}
}
#[test]
fn test_cpu_simd_shuffle_xor() {
let mut v = PortableVector::<i32, 8>::default();
for i in 0..8 {
v = v.insert(i, i as i32);
}
let shuffled = CpuSimd::<8>::shuffle_xor(v, 1);
assert_eq!(shuffled.extract(0), 1);
assert_eq!(shuffled.extract(1), 0);
assert_eq!(shuffled.extract(2), 3);
assert_eq!(shuffled.extract(3), 2);
}
#[test]
fn test_cpu_simd_reduce_sum() {
let mut v = PortableVector::<i32, 8>::default();
for i in 0..8 {
v = v.insert(i, (i + 1) as i32);
}
assert_eq!(CpuSimd::<8>::reduce_sum(v), 36);
}
#[test]
fn test_butterfly_reduce() {
let mut v = PortableVector::<i32, 8>::default();
for i in 0..8 {
v = v.insert(i, (i + 1) as i32);
}
let sum = butterfly_reduce_sum::<8, i32>(v);
assert_eq!(sum, 36);
}
#[test]
fn test_ballot() {
let mut predicates = PortableVector::<bool, 8>::default();
for i in 0..8 {
predicates = predicates.insert(i, i % 2 == 1);
}
let mask = CpuSimd::<8>::ballot(predicates);
assert_eq!(mask, 0b10101010);
assert_eq!(CpuSimd::<8>::mask_popcount(mask), 4);
}
#[test]
fn test_gpu_warp32_emulation() {
let v = GpuWarp32::broadcast(7i32);
assert_eq!(v.extract(0), 7);
assert_eq!(v.extract(31), 7);
let mut values = PortableVector::<i32, 32>::default();
for i in 0..32 {
values = values.insert(i, 1);
}
assert_eq!(GpuWarp32::reduce_sum(values), 32);
}
#[test]
fn test_prefix_sum() {
let mut v = PortableVector::<i32, 8>::default();
for i in 0..8 {
v = v.insert(i, 1); }
let result = prefix_sum::<8, i32>(v);
for i in 0..8 {
assert_eq!(result.extract(i), (i + 1) as i32);
}
}
}