use std::arch::asm;
/// In-place transpose of a 64×64 *bit* matrix stored as 64 consecutive `u64`
/// words pointed to by `cols` (one word per row of bits; the companion test
/// checks this routine against the portable `slow_transpose` reference).
///
/// The assembly is organised as a main path plus two internal subroutines
/// reached with `bl`/`ret` (the main path jumps over their bodies with
/// `b 3f`):
/// * label `1:` — called once per 32-word half; each call's post-incremented
///   `ld4` loads advance `{arr}` by 4×64 = 256 bytes. It performs the
///   butterfly bit exchanges at distance 1 (`0x55` mask), distance 2
///   (`0x33` mask), distance 4 (`sli`/`sri #4`), distance 8
///   (`trn1`/`trn2 .16b`, byte lanes) and distance 16 (`trn1`/`trn2 .8h`,
///   halfword lanes); the `zip1`/`zip2 .2d` step re-pairs the 64-bit lanes
///   that the deinterleaving `ld4` split apart.
/// * label `2:` — called twice; exchanges 32-bit lanes between the two
///   halves (`trn1`/`trn2 .4s`), i.e. the final distance-32 exchange.
/// * label `3:` — exit label.
///
/// # Safety
/// `cols` must be valid for reads and writes of 64 `u64`s (512 bytes).
///
/// NOTE(review): `lr` is saved/restored manually around the `bl` calls so the
/// compiler sees it unchanged; this relies on the register allocator never
/// assigning `x30`/`lr` to one of the named operands — confirm against the
/// Rust inline-asm register-class rules for AArch64.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
pub(crate) unsafe fn transpose_asm_aarch64(mut cols: *mut u64) {
    asm! {
        // ---- main path: save lr, run subroutine 1 on each half, then
        // subroutine 2 twice, restore lr, and skip over the subroutine bodies.
        "mov {save_lr}, lr",
        "mov {t2}, {arr}",  // t2 = start of first half (read cursor for step 2)
        "bl 1f",            // transpose-within first 32 words; arr advances 256B
        "mov {t3}, {arr}",  // t3 = start of second half
        "bl 1f",            // transpose-within second 32 words
        "mov {t5}, {t2}",   // t5/t6 = write cursors for subroutine 2
        "mov {t6}, {t3}",
        "bl 2f",            // distance-32 exchange, first 128B of each half
        "bl 2f",            // distance-32 exchange, second 128B of each half
        "mov lr, {save_lr}",// hand lr back to the compiler unchanged
        "b 3f",
        // ---- subroutine 1: all exchanges at bit distance < 32, on one
        // 32-word (256-byte) half. Loads deinterleave by 4 words (ld4).
        "1:",
        "mov {save_arr}, {arr}", // remember half start so results store back in place
        "ld4 {{ v16.2d, v17.2d, v18.2d, v19.2d }}, [{arr}], #64",
        "ld4 {{ v20.2d, v21.2d, v22.2d, v23.2d }}, [{arr}], #64",
        "ld4 {{ v24.2d, v25.2d, v26.2d, v27.2d }}, [{arr}], #64",
        "ld4 {{ v28.2d, v29.2d, v30.2d, v31.2d }}, [{arr}], #64",
        // distance-1 exchange: swap odd bits of v(2k) with even bits of
        // v(2k+1) using the 0x55 mask and bit-insert (bit/bif).
        "movi v0.16b, #0x55",
        "ushr v6.2d, v16.2d, #1",
        "shl v7.2d, v17.2d, #1",
        "bif v16.16b, v7.16b, v0.16b",
        "bit v17.16b, v6.16b, v0.16b",
        "ushr v6.2d, v18.2d, #1",
        "shl v7.2d, v19.2d, #1",
        "bif v18.16b, v7.16b, v0.16b",
        "bit v19.16b, v6.16b, v0.16b",
        "ushr v6.2d, v20.2d, #1",
        "shl v7.2d, v21.2d, #1",
        "bif v20.16b, v7.16b, v0.16b",
        "bit v21.16b, v6.16b, v0.16b",
        "ushr v6.2d, v22.2d, #1",
        "shl v7.2d, v23.2d, #1",
        "bif v22.16b, v7.16b, v0.16b",
        "bit v23.16b, v6.16b, v0.16b",
        "ushr v6.2d, v24.2d, #1",
        "shl v7.2d, v25.2d, #1",
        "bif v24.16b, v7.16b, v0.16b",
        "bit v25.16b, v6.16b, v0.16b",
        "ushr v6.2d, v26.2d, #1",
        "shl v7.2d, v27.2d, #1",
        "bif v26.16b, v7.16b, v0.16b",
        "bit v27.16b, v6.16b, v0.16b",
        "ushr v6.2d, v28.2d, #1",
        "shl v7.2d, v29.2d, #1",
        "bif v28.16b, v7.16b, v0.16b",
        "bit v29.16b, v6.16b, v0.16b",
        "ushr v6.2d, v30.2d, #1",
        "shl v7.2d, v31.2d, #1",
        "bif v30.16b, v7.16b, v0.16b",
        "bit v31.16b, v6.16b, v0.16b",
        // distance-2 exchange: same butterfly with the 0x33 mask and shift #2,
        // now pairing registers two apart (v16↔v18, v17↔v19, ...).
        "movi v0.16b, #0x33",
        "ushr v6.2d, v16.2d, #2",
        "shl v7.2d, v18.2d, #2",
        "bif v16.16b, v7.16b, v0.16b",
        "bit v18.16b, v6.16b, v0.16b",
        "ushr v6.2d, v17.2d, #2",
        "shl v7.2d, v19.2d, #2",
        "bif v17.16b, v7.16b, v0.16b",
        "bit v19.16b, v6.16b, v0.16b",
        "ushr v6.2d, v20.2d, #2",
        "shl v7.2d, v22.2d, #2",
        "bif v20.16b, v7.16b, v0.16b",
        "bit v22.16b, v6.16b, v0.16b",
        "ushr v6.2d, v21.2d, #2",
        "shl v7.2d, v23.2d, #2",
        "bif v21.16b, v7.16b, v0.16b",
        "bit v23.16b, v6.16b, v0.16b",
        "ushr v6.2d, v24.2d, #2",
        "shl v7.2d, v26.2d, #2",
        "bif v24.16b, v7.16b, v0.16b",
        "bit v26.16b, v6.16b, v0.16b",
        "ushr v6.2d, v25.2d, #2",
        "shl v7.2d, v27.2d, #2",
        "bif v25.16b, v7.16b, v0.16b",
        "bit v27.16b, v6.16b, v0.16b",
        "ushr v6.2d, v28.2d, #2",
        "shl v7.2d, v30.2d, #2",
        "bif v28.16b, v7.16b, v0.16b",
        "bit v30.16b, v6.16b, v0.16b",
        "ushr v6.2d, v29.2d, #2",
        "shl v7.2d, v31.2d, #2",
        "bif v29.16b, v7.16b, v0.16b",
        "bit v31.16b, v6.16b, v0.16b",
        // re-pair the 64-bit lanes that ld4's 4-way deinterleave split up
        // (zip1 = low lanes, zip2 = high lanes of each register pair).
        "zip1 v0.2d, v16.2d, v17.2d",
        "zip2 v17.2d, v16.2d, v17.2d",
        "zip1 v1.2d, v18.2d, v19.2d",
        "zip2 v19.2d, v18.2d, v19.2d",
        "zip1 v2.2d, v20.2d, v21.2d",
        "zip2 v21.2d, v20.2d, v21.2d",
        "zip1 v3.2d, v22.2d, v23.2d",
        "zip2 v23.2d, v22.2d, v23.2d",
        "zip1 v4.2d, v24.2d, v25.2d",
        "zip2 v25.2d, v24.2d, v25.2d",
        "zip1 v5.2d, v26.2d, v27.2d",
        "zip2 v27.2d, v26.2d, v27.2d",
        "zip1 v6.2d, v28.2d, v29.2d",
        "zip2 v29.2d, v28.2d, v29.2d",
        "zip1 v7.2d, v30.2d, v31.2d",
        "zip2 v31.2d, v30.2d, v31.2d",
        // distance-4 exchange: shift-left-insert / shift-right-insert by 4
        // swaps the nibbles between each zipped pair.
        "mov v16.2d, v0.2d",
        "sli v16.16b, v17.16b, #4",
        "sri v17.16b, v0.16b, #4",
        "mov v18.2d, v1.2d",
        "sli v18.16b, v19.16b, #4",
        "sri v19.16b, v1.16b, #4",
        "mov v20.2d, v2.2d",
        "sli v20.16b, v21.16b, #4",
        "sri v21.16b, v2.16b, #4",
        "mov v22.2d, v3.2d",
        "sli v22.16b, v23.16b, #4",
        "sri v23.16b, v3.16b, #4",
        "mov v24.2d, v4.2d",
        "sli v24.16b, v25.16b, #4",
        "sri v25.16b, v4.16b, #4",
        "mov v26.2d, v5.2d",
        "sli v26.16b, v27.16b, #4",
        "sri v27.16b, v5.16b, #4",
        "mov v28.2d, v6.2d",
        "sli v28.16b, v29.16b, #4",
        "sri v29.16b, v6.16b, #4",
        "mov v30.2d, v7.2d",
        "sli v30.16b, v31.16b, #4",
        "sri v31.16b, v7.16b, #4",
        // distance-8 exchange: transpose byte lanes between registers four
        // apart (v16↔v20, ..., v24↔v28, ...).
        "trn1 v0.16b, v16.16b, v20.16b",
        "trn2 v20.16b, v16.16b, v20.16b",
        "trn1 v1.16b, v17.16b, v21.16b",
        "trn2 v21.16b, v17.16b, v21.16b",
        "trn1 v2.16b, v18.16b, v22.16b",
        "trn2 v22.16b, v18.16b, v22.16b",
        "trn1 v3.16b, v19.16b, v23.16b",
        "trn2 v23.16b, v19.16b, v23.16b",
        "trn1 v4.16b, v24.16b, v28.16b",
        "trn2 v28.16b, v24.16b, v28.16b",
        "trn1 v5.16b, v25.16b, v29.16b",
        "trn2 v29.16b, v25.16b, v29.16b",
        "trn1 v6.16b, v26.16b, v30.16b",
        "trn2 v30.16b, v26.16b, v30.16b",
        "trn1 v7.16b, v27.16b, v31.16b",
        "trn2 v31.16b, v27.16b, v31.16b",
        // distance-16 exchange: transpose halfword lanes between registers
        // eight apart, writing results into the register order expected by
        // the st1 stores below.
        "trn1 v16.8h, v0.8h, v4.8h",
        "trn2 v24.8h, v0.8h, v4.8h",
        "trn1 v18.8h, v1.8h, v5.8h",
        "trn2 v26.8h, v1.8h, v5.8h",
        "trn1 v17.8h, v2.8h, v6.8h",
        "trn2 v25.8h, v2.8h, v6.8h",
        "trn1 v19.8h, v3.8h, v7.8h",
        "trn2 v27.8h, v3.8h, v7.8h",
        "trn1 v0.8h, v20.8h, v28.8h",
        "trn2 v4.8h, v20.8h, v28.8h",
        "trn1 v2.8h, v21.8h, v29.8h",
        "trn2 v6.8h, v21.8h, v29.8h",
        "trn1 v1.8h, v22.8h, v30.8h",
        "trn2 v5.8h, v22.8h, v30.8h",
        "trn1 v3.8h, v23.8h, v31.8h",
        "trn2 v7.8h, v23.8h, v31.8h",
        // store the half back over its original 256 bytes (non-interleaving st1).
        "st1 {{ v16.2d, v17.2d, v18.2d, v19.2d }}, [{save_arr}], #64",
        "st1 {{ v0.2d, v1.2d, v2.2d, v3.2d }}, [{save_arr}], #64",
        "st1 {{ v24.2d, v25.2d, v26.2d, v27.2d }}, [{save_arr}], #64",
        "st1 {{ v4.2d, v5.2d, v6.2d, v7.2d }}, [{save_arr}], #64",
        "ret",
        // ---- subroutine 2: distance-32 exchange across the two halves.
        // t2/t3 read 128 bytes from each half per call; trn .4s swaps the
        // 32-bit lanes between them, and t5/t6 store the results back.
        "2:",
        "ld1 {{ v16.2d, v17.2d, v18.2d, v19.2d }}, [{t2}], #64",
        "ld1 {{ v24.2d, v25.2d, v26.2d, v27.2d }}, [{t3}], #64",
        "ld1 {{ v20.2d, v21.2d, v22.2d, v23.2d }}, [{t2}], #64",
        "ld1 {{ v28.2d, v29.2d, v30.2d, v31.2d }}, [{t3}], #64",
        "trn1 v0.4s, v16.4s, v24.4s",
        "trn2 v4.4s, v16.4s, v24.4s",
        "trn1 v1.4s, v17.4s, v25.4s",
        "trn2 v5.4s, v17.4s, v25.4s",
        "trn1 v2.4s, v18.4s, v26.4s",
        "trn2 v6.4s, v18.4s, v26.4s",
        "trn1 v3.4s, v19.4s, v27.4s",
        "trn2 v7.4s, v19.4s, v27.4s",
        "trn1 v16.4s, v20.4s, v28.4s",
        "trn2 v24.4s, v20.4s, v28.4s",
        "trn1 v17.4s, v21.4s, v29.4s",
        "trn2 v25.4s, v21.4s, v29.4s",
        "trn1 v18.4s, v22.4s, v30.4s",
        "trn2 v26.4s, v22.4s, v30.4s",
        "trn1 v19.4s, v23.4s, v31.4s",
        "trn2 v27.4s, v23.4s, v31.4s",
        "st1 {{ v0.2d, v1.2d, v2.2d, v3.2d }}, [{t5}], #64",
        "st1 {{ v4.2d, v5.2d, v6.2d, v7.2d }}, [{t6}], #64",
        "st1 {{ v16.2d, v17.2d, v18.2d, v19.2d }}, [{t5}], #64",
        "st1 {{ v24.2d, v25.2d, v26.2d, v27.2d }}, [{t6}], #64",
        "ret",
        "3:",
        // operands: arr is consumed (advanced past both halves); the scratch
        // pointer/temp registers and all touched vector registers are
        // declared clobbered so the compiler spills around them.
        arr = inout(reg) cols,
        save_lr = out(reg) _,
        save_arr = out(reg) _,
        t2 = out(reg) _,
        t3 = out(reg) _,
        t5 = out(reg) _,
        t6 = out(reg) _,
        out("v0") _,
        out("v1") _,
        out("v2") _,
        out("v3") _,
        out("v4") _,
        out("v5") _,
        out("v6") _,
        out("v7") _,
        out("v16") _,
        out("v17") _,
        out("v18") _,
        out("v19") _,
        out("v20") _,
        out("v21") _,
        out("v22") _,
        out("v23") _,
        out("v24") _,
        out("v25") _,
        out("v26") _,
        out("v27") _,
        out("v28") _,
        out("v29") _,
        out("v30") _,
        out("v31") _,
    }
    // silence the "value assigned is never read" lint on the advanced pointer
    _ = cols;
}
/// Correctness test: the NEON assembly transpose must agree with the portable
/// `slow_transpose` reference on a seeded random 64×64 matrix.
///
/// Fix: this module imports and calls `transpose_asm_aarch64`, which is only
/// compiled under `#[cfg(all(target_arch = "aarch64", target_feature =
/// "neon"))]` — without the same gate here, `cargo test --features rand` on a
/// non-aarch64 host fails to compile. The `bench` module below already
/// carries this gate; add it here for consistency.
#[cfg(test)]
#[cfg(feature = "rand")]
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
mod tests {
    use crate::base_matrix::slow_transpose;
    use crate::transpose64x64_asm_aarch::transpose_asm_aarch64;
    use crate::BinaryMatrix64;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    #[test]
    fn test_transpose_64x64_aarch64_asm() {
        // Deterministic input so failures are reproducible.
        let mut rng = ChaCha8Rng::seed_from_u64(1234);
        let mat = BinaryMatrix64::random(64, 64, &mut rng);
        let mut newmat = BinaryMatrix64::new();
        // Transpose a copy of the raw column words in place with the asm...
        let mut a = mat.submatrix64(0, 0).cols;
        unsafe { transpose_asm_aarch64(a.as_mut_ptr()) };
        // ...and compare against the reference transpose.
        slow_transpose(mat.as_ref(), &mut newmat);
        let b = newmat.submatrix64(0, 0).cols;
        assert_eq!(a, b);
    }
}
/// Benchmark for the NEON assembly transpose (nightly-only `test` harness).
#[cfg(test)]
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
mod bench {
    extern crate test;
    use crate::transpose64x64_asm_aarch::transpose_asm_aarch64;
    use test::bench::Bencher;

    /// Times one in-place 64×64 bit transpose per iteration. The input is a
    /// single all-ones word followed by 63 zero words; `black_box` keeps the
    /// call from being optimised away.
    #[bench]
    fn bench_transpose64x64_aarch64_asm(bencher: &mut Bencher) {
        let mut cols = [0u64; 64];
        cols[0] = u64::MAX;
        bencher.iter(|| {
            test::black_box(unsafe {
                transpose_asm_aarch64(cols.as_mut_ptr());
            });
        });
    }
}