#![cfg(all(feature = "simd-portable", feature = "opt-simd-bucket-aggregation"))]
use core::simd::cmp::SimdPartialOrd;
use core::simd::{LaneCount, Simd, SupportedLaneCount};
/// Marker trait naming a valid combination of the three sizes used by
/// `sub_aggregation`: output bytes (`N_BYTES`), bitmask bytes (`N_HALF_BYTES`)
/// and input elements (`N_ELEMS`).
///
/// The trait has no items; it exists only so that it can be implemented for
/// the size combinations that are meant to be used and then required as a
/// bound.
trait DualElementsToBytes<const N_BYTES: usize, const N_HALF_BYTES: usize, const N_ELEMS: usize>
where
// `N_ELEMS` is used as a SIMD lane count, so it must be a supported one.
LaneCount<N_ELEMS>: SupportedLaneCount,
{
}
/// Field-less carrier type for a `(N_BYTES, N_HALF_BYTES, N_ELEMS)` triple.
///
/// `DualElementsToBytes` is implemented on specific instantiations of this
/// type, which lets a `where` clause check that a triple is an accepted one.
struct DualElementsAndBytes<const N_BYTES: usize, const N_HALF_BYTES: usize, const N_ELEMS: usize>;
// The accepted size combinations. In every one of them
// `N_HALF_BYTES == N_BYTES / 2` and `N_ELEMS == 4 * N_BYTES`
// (four 2-bit codes fit in one output byte).
impl DualElementsToBytes<2, 1, 8> for DualElementsAndBytes<2, 1, 8> {}
impl DualElementsToBytes<4, 2, 16> for DualElementsAndBytes<4, 2, 16> {}
impl DualElementsToBytes<8, 4, 32> for DualElementsAndBytes<8, 4, 32> {}
impl DualElementsToBytes<16, 8, 64> for DualElementsAndBytes<16, 8, 64> {}
/// Lookup table that interleaves the bits of two bytes into 16 bits.
///
/// The table is indexed as `[b1][b0]`. For every bit position `k` in `0..8`,
/// bit `k` of `b0` is placed at bit `2 * k` of a `u16` and bit `k` of `b1` at
/// bit `2 * k + 1`, so each 2-bit group ("dibit") holds `b1`'s bit as its
/// upper bit and `b0`'s bit as its lower bit. The resulting `u16` is stored
/// as two bytes in big-endian order.
///
/// This is a `static` rather than a `const`: the table is 128 KiB, and a
/// `const` item is instantiated at each place it is used, whereas a `static`
/// has a single location in the binary.
static INTERLEAVE_AS_DIBITS_TABLE: [[[u8; 2]; 256]; 256] = {
    let mut table = [[[0u8; 2]; 256]; 256];
    let mut b1 = 0usize;
    while b1 < 256 {
        let mut b0 = 0usize;
        while b0 < 256 {
            let mut dibits = 0u16;
            let mut bit = 0u32;
            // Handle one source bit position (one dibit) per iteration.
            while bit < 8 {
                let lower = ((b0 as u16) >> bit) & 1;
                let upper = ((b1 as u16) >> bit) & 1;
                dibits |= (lower << (2 * bit)) | (upper << (2 * bit + 1));
                bit += 1;
            }
            table[b1][b0] = dibits.to_be_bytes();
            b0 += 1;
        }
        b1 += 1;
    }
    table
};
/// Encodes `N_ELEMS` bucket values as 2-bit codes packed into `N_BYTES` bytes.
///
/// Every bucket value is compared against the three thresholds with a SIMD
/// "greater than". The 2-bit code of a value is then:
///
/// * upper bit: `value > q2`
/// * lower bit: `(value > q2) ^ (value > q1) ^ (value > q3)`
///
/// When `q1 <= q2 <= q3` this code equals the number of thresholds the value
/// exceeds (0 to 3).
///
/// The two comparison bitmasks are read byte by byte (little-endian) and each
/// byte pair is expanded through `INTERLEAVE_AS_DIBITS_TABLE` into two output
/// bytes. Output byte pairs are written from the end of the array towards the
/// start, so the first mask byte ends up in the last two output bytes.
#[inline(always)]
fn sub_aggregation<const N_BYTES: usize, const N_HALF_BYTES: usize, const N_ELEMS: usize>(
    buckets: &[u32; N_ELEMS],
    q1: u32,
    q2: u32,
    q3: u32,
) -> [u8; N_BYTES]
where
    DualElementsAndBytes<N_BYTES, N_HALF_BYTES, N_ELEMS>:
        DualElementsToBytes<N_BYTES, N_HALF_BYTES, N_ELEMS>,
    LaneCount<N_ELEMS>: SupportedLaneCount,
{
    let values = Simd::<u32, N_ELEMS>::from_array(*buckets);

    // One lane-wise comparison per threshold.
    let above_q1 = values.simd_gt(Simd::<u32, N_ELEMS>::splat(q1));
    let above_q2 = values.simd_gt(Simd::<u32, N_ELEMS>::splat(q2));
    let above_q3 = values.simd_gt(Simd::<u32, N_ELEMS>::splat(q3));

    // Upper bit of every code; only the first `N_HALF_BYTES` mask bytes are used.
    let upper_mask = above_q2.to_bitmask().to_le_bytes();
    let upper_bytes = &upper_mask[..N_HALF_BYTES];

    // Lower bit of every code.
    let lower_mask = (above_q2 ^ above_q1 ^ above_q3)
        .to_bitmask()
        .to_le_bytes();
    let lower_bytes = &lower_mask[..N_HALF_BYTES];

    let mut encoded = [0u8; N_BYTES];
    let mask_pairs = lower_bytes.iter().zip(upper_bytes.iter());
    // Fill the output back to front, two bytes per mask byte pair.
    for (dst, (&lower, &upper)) in encoded.chunks_exact_mut(2).rev().zip(mask_pairs) {
        let interleaved = &INTERLEAVE_AS_DIBITS_TABLE[upper as usize][lower as usize];
        dst.copy_from_slice(interleaved);
    }
    encoded
}
/// Encodes 48 bucket values into 12 bytes of 2-bit codes.
///
/// The buckets are processed in 3 groups of 16 values; each group is encoded
/// by `sub_aggregation` into 4 bytes using the thresholds `q1`, `q2` and `q3`.
/// Groups are written to `out` in reverse order: the first 16 buckets fill
/// the last 4 bytes of `out`.
#[inline]
pub(super) fn aggregate_48(out: &mut [u8; 12], buckets: &[u32; 48], q1: u32, q2: u32, q3: u32) {
    const OUT_CHUNK: usize = 4;
    const IN_CHUNK: usize = OUT_CHUNK * 4;

    let groups = buckets.as_slice().chunks_exact(IN_CHUNK);
    for (dst, group) in out.chunks_exact_mut(OUT_CHUNK).rev().zip(groups) {
        // `chunks_exact` only yields slices of exactly `IN_CHUNK` elements,
        // so this conversion cannot fail.
        let group: &[u32; IN_CHUNK] = group.try_into().unwrap();
        let encoded = sub_aggregation::<OUT_CHUNK, { OUT_CHUNK / 2 }, IN_CHUNK>(group, q1, q2, q3);
        dst.copy_from_slice(&encoded);
    }
}
/// Encodes 128 bucket values into 32 bytes of 2-bit codes.
///
/// The buckets are processed in 2 groups of 64 values; each group is encoded
/// by `sub_aggregation` into 16 bytes using the thresholds `q1`, `q2` and
/// `q3`. Groups are written to `out` in reverse order: the first 64 buckets
/// fill the last 16 bytes of `out`.
#[inline]
pub(super) fn aggregate_128(out: &mut [u8; 32], buckets: &[u32; 128], q1: u32, q2: u32, q3: u32) {
    const OUT_CHUNK: usize = 16;
    const IN_CHUNK: usize = OUT_CHUNK * 4;

    let groups = buckets.as_slice().chunks_exact(IN_CHUNK);
    for (dst, group) in out.chunks_exact_mut(OUT_CHUNK).rev().zip(groups) {
        // `chunks_exact` only yields slices of exactly `IN_CHUNK` elements,
        // so this conversion cannot fail.
        let group: &[u32; IN_CHUNK] = group.try_into().unwrap();
        let encoded = sub_aggregation::<OUT_CHUNK, { OUT_CHUNK / 2 }, IN_CHUNK>(group, q1, q2, q3);
        dst.copy_from_slice(&encoded);
    }
}
/// Encodes 256 bucket values into 64 bytes of 2-bit codes.
///
/// The buckets are processed in 4 groups of 64 values; each group is encoded
/// by `sub_aggregation` into 16 bytes using the thresholds `q1`, `q2` and
/// `q3`. Groups are written to `out` in reverse order: the first 64 buckets
/// fill the last 16 bytes of `out`.
#[inline]
pub(super) fn aggregate_256(out: &mut [u8; 64], buckets: &[u32; 256], q1: u32, q2: u32, q3: u32) {
    const OUT_CHUNK: usize = 16;
    const IN_CHUNK: usize = OUT_CHUNK * 4;

    let groups = buckets.as_slice().chunks_exact(IN_CHUNK);
    for (dst, group) in out.chunks_exact_mut(OUT_CHUNK).rev().zip(groups) {
        // `chunks_exact` only yields slices of exactly `IN_CHUNK` elements,
        // so this conversion cannot fail.
        let group: &[u32; IN_CHUNK] = group.try_into().unwrap();
        let encoded = sub_aggregation::<OUT_CHUNK, { OUT_CHUNK / 2 }, IN_CHUNK>(group, q1, q2, q3);
        dst.copy_from_slice(&encoded);
    }
}
mod tests;