macro_rules! swap {
( $x:expr, $y:expr, $j:expr, $m:expr ) => {
let t = ($y ^ ($x >> $j)) & $m;
$y ^= t;
$x ^= t << $j;
};
}
#[allow(dead_code)]
pub(crate) fn transpose_unroll_64x64(a: &mut [u64; 64]) {
let mut a0 = a[0];
let mut a1 = a[1];
let mut a2 = a[2];
let mut a3 = a[3];
let mut a4 = a[4];
let mut a5 = a[5];
let mut a6 = a[6];
let mut a7 = a[7];
let mut a8 = a[8];
let mut a9 = a[9];
let mut a10 = a[10];
let mut a11 = a[11];
let mut a12 = a[12];
let mut a13 = a[13];
let mut a14 = a[14];
let mut a15 = a[15];
let mut a16 = a[16];
let mut a17 = a[17];
let mut a18 = a[18];
let mut a19 = a[19];
let mut a20 = a[20];
let mut a21 = a[21];
let mut a22 = a[22];
let mut a23 = a[23];
let mut a24 = a[24];
let mut a25 = a[25];
let mut a26 = a[26];
let mut a27 = a[27];
let mut a28 = a[28];
let mut a29 = a[29];
let mut a30 = a[30];
let mut a31 = a[31];
let mut a32 = a[32];
let mut a33 = a[33];
let mut a34 = a[34];
let mut a35 = a[35];
let mut a36 = a[36];
let mut a37 = a[37];
let mut a38 = a[38];
let mut a39 = a[39];
let mut a40 = a[40];
let mut a41 = a[41];
let mut a42 = a[42];
let mut a43 = a[43];
let mut a44 = a[44];
let mut a45 = a[45];
let mut a46 = a[46];
let mut a47 = a[47];
let mut a48 = a[48];
let mut a49 = a[49];
let mut a50 = a[50];
let mut a51 = a[51];
let mut a52 = a[52];
let mut a53 = a[53];
let mut a54 = a[54];
let mut a55 = a[55];
let mut a56 = a[56];
let mut a57 = a[57];
let mut a58 = a[58];
let mut a59 = a[59];
let mut a60 = a[60];
let mut a61 = a[61];
let mut a62 = a[62];
let mut a63 = a[63];
let m = 0xffffffffu64;
swap!(a0, a32, 32, m);
swap!(a1, a33, 32, m);
swap!(a2, a34, 32, m);
swap!(a3, a35, 32, m);
swap!(a4, a36, 32, m);
swap!(a5, a37, 32, m);
swap!(a6, a38, 32, m);
swap!(a7, a39, 32, m);
swap!(a8, a40, 32, m);
swap!(a9, a41, 32, m);
swap!(a10, a42, 32, m);
swap!(a11, a43, 32, m);
swap!(a12, a44, 32, m);
swap!(a13, a45, 32, m);
swap!(a14, a46, 32, m);
swap!(a15, a47, 32, m);
swap!(a16, a48, 32, m);
swap!(a17, a49, 32, m);
swap!(a18, a50, 32, m);
swap!(a19, a51, 32, m);
swap!(a20, a52, 32, m);
swap!(a21, a53, 32, m);
swap!(a22, a54, 32, m);
swap!(a23, a55, 32, m);
swap!(a24, a56, 32, m);
swap!(a25, a57, 32, m);
swap!(a26, a58, 32, m);
swap!(a27, a59, 32, m);
swap!(a28, a60, 32, m);
swap!(a29, a61, 32, m);
swap!(a30, a62, 32, m);
swap!(a31, a63, 32, m);
let m = 0xffff0000ffffu64;
swap!(a0, a16, 16, m);
swap!(a1, a17, 16, m);
swap!(a2, a18, 16, m);
swap!(a3, a19, 16, m);
swap!(a4, a20, 16, m);
swap!(a5, a21, 16, m);
swap!(a6, a22, 16, m);
swap!(a7, a23, 16, m);
swap!(a8, a24, 16, m);
swap!(a9, a25, 16, m);
swap!(a10, a26, 16, m);
swap!(a11, a27, 16, m);
swap!(a12, a28, 16, m);
swap!(a13, a29, 16, m);
swap!(a14, a30, 16, m);
swap!(a15, a31, 16, m);
swap!(a32, a48, 16, m);
swap!(a33, a49, 16, m);
swap!(a34, a50, 16, m);
swap!(a35, a51, 16, m);
swap!(a36, a52, 16, m);
swap!(a37, a53, 16, m);
swap!(a38, a54, 16, m);
swap!(a39, a55, 16, m);
swap!(a40, a56, 16, m);
swap!(a41, a57, 16, m);
swap!(a42, a58, 16, m);
swap!(a43, a59, 16, m);
swap!(a44, a60, 16, m);
swap!(a45, a61, 16, m);
swap!(a46, a62, 16, m);
swap!(a47, a63, 16, m);
let m = 0xff00ff00ff00ffu64;
swap!(a0, a8, 8, m);
swap!(a1, a9, 8, m);
swap!(a2, a10, 8, m);
swap!(a3, a11, 8, m);
swap!(a4, a12, 8, m);
swap!(a5, a13, 8, m);
swap!(a6, a14, 8, m);
swap!(a7, a15, 8, m);
swap!(a16, a24, 8, m);
swap!(a17, a25, 8, m);
swap!(a18, a26, 8, m);
swap!(a19, a27, 8, m);
swap!(a20, a28, 8, m);
swap!(a21, a29, 8, m);
swap!(a22, a30, 8, m);
swap!(a23, a31, 8, m);
swap!(a32, a40, 8, m);
swap!(a33, a41, 8, m);
swap!(a34, a42, 8, m);
swap!(a35, a43, 8, m);
swap!(a36, a44, 8, m);
swap!(a37, a45, 8, m);
swap!(a38, a46, 8, m);
swap!(a39, a47, 8, m);
swap!(a48, a56, 8, m);
swap!(a49, a57, 8, m);
swap!(a50, a58, 8, m);
swap!(a51, a59, 8, m);
swap!(a52, a60, 8, m);
swap!(a53, a61, 8, m);
swap!(a54, a62, 8, m);
swap!(a55, a63, 8, m);
let m = 0xf0f0f0f0f0f0f0fu64;
swap!(a0, a4, 4, m);
swap!(a1, a5, 4, m);
swap!(a2, a6, 4, m);
swap!(a3, a7, 4, m);
swap!(a8, a12, 4, m);
swap!(a9, a13, 4, m);
swap!(a10, a14, 4, m);
swap!(a11, a15, 4, m);
swap!(a16, a20, 4, m);
swap!(a17, a21, 4, m);
swap!(a18, a22, 4, m);
swap!(a19, a23, 4, m);
swap!(a24, a28, 4, m);
swap!(a25, a29, 4, m);
swap!(a26, a30, 4, m);
swap!(a27, a31, 4, m);
swap!(a32, a36, 4, m);
swap!(a33, a37, 4, m);
swap!(a34, a38, 4, m);
swap!(a35, a39, 4, m);
swap!(a40, a44, 4, m);
swap!(a41, a45, 4, m);
swap!(a42, a46, 4, m);
swap!(a43, a47, 4, m);
swap!(a48, a52, 4, m);
swap!(a49, a53, 4, m);
swap!(a50, a54, 4, m);
swap!(a51, a55, 4, m);
swap!(a56, a60, 4, m);
swap!(a57, a61, 4, m);
swap!(a58, a62, 4, m);
swap!(a59, a63, 4, m);
let m = 0x3333333333333333;
swap!(a0, a2, 2, m);
swap!(a1, a3, 2, m);
swap!(a4, a6, 2, m);
swap!(a5, a7, 2, m);
swap!(a8, a10, 2, m);
swap!(a9, a11, 2, m);
swap!(a12, a14, 2, m);
swap!(a13, a15, 2, m);
swap!(a16, a18, 2, m);
swap!(a17, a19, 2, m);
swap!(a20, a22, 2, m);
swap!(a21, a23, 2, m);
swap!(a24, a26, 2, m);
swap!(a25, a27, 2, m);
swap!(a28, a30, 2, m);
swap!(a29, a31, 2, m);
swap!(a32, a34, 2, m);
swap!(a33, a35, 2, m);
swap!(a36, a38, 2, m);
swap!(a37, a39, 2, m);
swap!(a40, a42, 2, m);
swap!(a41, a43, 2, m);
swap!(a44, a46, 2, m);
swap!(a45, a47, 2, m);
swap!(a48, a50, 2, m);
swap!(a49, a51, 2, m);
swap!(a52, a54, 2, m);
swap!(a53, a55, 2, m);
swap!(a56, a58, 2, m);
swap!(a57, a59, 2, m);
swap!(a60, a62, 2, m);
swap!(a61, a63, 2, m);
let m = 0x5555555555555555;
swap!(a0, a1, 1, m);
swap!(a2, a3, 1, m);
swap!(a4, a5, 1, m);
swap!(a6, a7, 1, m);
swap!(a8, a9, 1, m);
swap!(a10, a11, 1, m);
swap!(a12, a13, 1, m);
swap!(a14, a15, 1, m);
swap!(a16, a17, 1, m);
swap!(a18, a19, 1, m);
swap!(a20, a21, 1, m);
swap!(a22, a23, 1, m);
swap!(a24, a25, 1, m);
swap!(a26, a27, 1, m);
swap!(a28, a29, 1, m);
swap!(a30, a31, 1, m);
swap!(a32, a33, 1, m);
swap!(a34, a35, 1, m);
swap!(a36, a37, 1, m);
swap!(a38, a39, 1, m);
swap!(a40, a41, 1, m);
swap!(a42, a43, 1, m);
swap!(a44, a45, 1, m);
swap!(a46, a47, 1, m);
swap!(a48, a49, 1, m);
swap!(a50, a51, 1, m);
swap!(a52, a53, 1, m);
swap!(a54, a55, 1, m);
swap!(a56, a57, 1, m);
swap!(a58, a59, 1, m);
swap!(a60, a61, 1, m);
swap!(a62, a63, 1, m);
a[0] = a0;
a[1] = a1;
a[2] = a2;
a[3] = a3;
a[4] = a4;
a[5] = a5;
a[6] = a6;
a[7] = a7;
a[8] = a8;
a[9] = a9;
a[10] = a10;
a[11] = a11;
a[12] = a12;
a[13] = a13;
a[14] = a14;
a[15] = a15;
a[16] = a16;
a[17] = a17;
a[18] = a18;
a[19] = a19;
a[20] = a20;
a[21] = a21;
a[22] = a22;
a[23] = a23;
a[24] = a24;
a[25] = a25;
a[26] = a26;
a[27] = a27;
a[28] = a28;
a[29] = a29;
a[30] = a30;
a[31] = a31;
a[32] = a32;
a[33] = a33;
a[34] = a34;
a[35] = a35;
a[36] = a36;
a[37] = a37;
a[38] = a38;
a[39] = a39;
a[40] = a40;
a[41] = a41;
a[42] = a42;
a[43] = a43;
a[44] = a44;
a[45] = a45;
a[46] = a46;
a[47] = a47;
a[48] = a48;
a[49] = a49;
a[50] = a50;
a[51] = a51;
a[52] = a52;
a[53] = a53;
a[54] = a54;
a[55] = a55;
a[56] = a56;
a[57] = a57;
a[58] = a58;
a[59] = a59;
a[60] = a60;
a[61] = a61;
a[62] = a62;
a[63] = a63;
}
#[cfg(test)]
#[cfg(feature = "rand")]
mod tests {
use crate::base_matrix::slow_transpose;
use crate::transpose64x64_unroll::transpose_unroll_64x64;
use crate::BinaryMatrix64;
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;
#[test]
fn test_transpose_64x64_unroll() {
let mut rng = ChaCha8Rng::seed_from_u64(1234);
let mat = BinaryMatrix64::random(64, 64, &mut rng);
let mut newmat = BinaryMatrix64::new();
let mut a = mat.submatrix64(0, 0).cols;
transpose_unroll_64x64(&mut a);
slow_transpose(mat.as_ref(), &mut newmat);
let b = newmat.submatrix64(0, 0).cols;
assert_eq!(a, b);
}
}
#[cfg(test)]
mod bench {
extern crate test;
use crate::transpose64x64_unroll::transpose_unroll_64x64;
use test::bench::Bencher;
#[bench]
fn bench_transpose64x64_unroll(b: &mut Bencher) {
let mut mat = [0u64; 64];
mat[0] = 0xffffffffffffffff;
b.iter(|| {
test::black_box(transpose_unroll_64x64(&mut mat));
});
}
}