use crate::{FastLanes, FL_ORDER};
use seq_macro::seq;
pub trait Transpose: FastLanes {
fn transpose(input: &[Self; 1024], output: &mut [Self; 1024]);
fn untranspose(input: &[Self; 1024], output: &mut [Self; 1024]);
}
impl<T: FastLanes> Transpose for T {
#[inline(never)]
fn transpose(input: &[Self; 1024], output: &mut [Self; 1024]) {
seq!(i in 0..1024 {
output[i] = input[mask(i)];
});
}
#[inline(never)]
fn untranspose(input: &[Self; 1024], output: &mut [Self; 1024]) {
seq!(i in 0..1024 {
output[mask(i)] = input[i];
});
}
}
#[inline(always)]
const fn mask(idx: usize) -> usize {
let lane = idx % 16;
let order = (idx / 16) % 8;
let row = idx / 128;
(lane * 64) + (FL_ORDER[order] * 8) + row
}