#include <wolfssl/wolfcrypt/libwolfssl_sources_asm.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_CHACHA
#include <wolfssl/wolfcrypt/chacha.h>
/* Per-lane counter offsets {0,1,2,3}: added to the block counter lane so the
 * four NEON "lanes" of interleaved state process four consecutive blocks. */
static const word32 L_chacha20_arm64_ctr[] = {
0x00000000, 0x00000001, 0x00000002, 0x00000003,
};
/* TBL byte-shuffle mask implementing a 32-bit rotate-left by 8 on each of the
 * four lanes of a 128-bit vector (faster than shl+sri for the 8-bit rotate). */
static const word32 L_chacha20_arm64_rol8[] = {
0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f,
};
#ifndef WOLFSSL_ARMASM_NO_NEON
/* Encrypt/decrypt len bytes of m into c with the ChaCha20 state in ctx.
 *
 * ctx  ChaCha context: 16-word state at offset 0 (counter is word 12),
 *      leftover-keystream buffer at byte offset 0x44, leftover count at
 *      byte offset 64 -- offsets assumed from the asm; confirm against
 *      the ChaCha struct layout in chacha.h.
 * c    output (ciphertext/plaintext), m input, len byte count.
 *
 * Strategy: four paths by remaining length.
 *   >= 320: 4 blocks in NEON (state transposed across v0-v15 lanes) plus a
 *           5th block in scalar GPRs, interleaved for dual-issue.
 *   >= 256: 4 NEON blocks only.
 *   >= 128: 2 whole-state blocks (v0-v3, v4-v7).
 *   else:   single 64-byte blocks; a final partial block stores the unused
 *           keystream into ctx for wc_chacha_use_over().
 * v30 = rol8 TBL mask, v28 = {0,1,2,3} lane counter offsets,
 * v29/v31 = per-iteration counter increments (5/4 resp. 1 in lane 0). */
void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
{
const word32* rol8 = L_chacha20_arm64_rol8;
const word32* ctr = L_chacha20_arm64_ctr;
__asm__ __volatile__ (
    /* v29 = {5,0,0,0}: counter advance for the 320-byte (5-block) path. */
"eor v29.16b, v29.16b, v29.16b\n\t"
"mov x26, #5\n\t"
    /* v31 = {1,0,0,0}: counter advance for a single block. */
"eor v31.16b, v31.16b, v31.16b\n\t"
"mov w7, #1\n\t"
"ld1 {v30.16b}, [%[rol8]]\n\t"
"ld1 {v28.4s}, [%[ctr]]\n\t"
    /* x4 -> ctx leftover-keystream buffer (used on the partial-block path). */
"add x4, %x[ctx], #0x44\n\t"
"mov v29.s[0], w26\n\t"
"mov v31.s[0], w7\n\t"
    /* v16-v19 = the 16-word ChaCha input state. */
"ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%x[ctx]]\n\t"
"cmp %w[len], #0x140\n\t"
"b.lt L_chacha_crypt_bytes_arm64_lt_320_%=\n\t"
    /* w25 = 4: counter offset of the scalar (5th) block. */
"mov w25, #4\n\t"
"\n"
    /* 320 bytes per iteration: 4 NEON blocks + 1 scalar block in parallel. */
"L_chacha_crypt_bytes_arm64_loop_320_%=: \n\t"
    /* Copy state into GPR pairs x8..x24 for the scalar block. */
"mov x8, v16.d[0]\n\t"
"mov x10, v16.d[1]\n\t"
"mov x12, v17.d[0]\n\t"
"mov x14, v17.d[1]\n\t"
"mov x16, v18.d[0]\n\t"
"mov x19, v18.d[1]\n\t"
"mov x21, v19.d[0]\n\t"
"mov x23, v19.d[1]\n\t"
"sub %w[len], %w[len], #0x140\n\t"
    /* Transpose: each state word broadcast across one vector (4 blocks). */
"dup v0.4s, v16.s[0]\n\t"
"dup v1.4s, v16.s[1]\n\t"
"lsr x9, x8, #32\n\t"
"dup v2.4s, v16.s[2]\n\t"
"dup v3.4s, v16.s[3]\n\t"
"lsr x11, x10, #32\n\t"
"dup v4.4s, v17.s[0]\n\t"
"dup v5.4s, v17.s[1]\n\t"
"lsr x13, x12, #32\n\t"
"dup v6.4s, v17.s[2]\n\t"
"dup v7.4s, v17.s[3]\n\t"
"lsr x15, x14, #32\n\t"
"dup v8.4s, v18.s[0]\n\t"
"dup v9.4s, v18.s[1]\n\t"
"lsr x17, x16, #32\n\t"
"dup v10.4s, v18.s[2]\n\t"
"dup v11.4s, v18.s[3]\n\t"
"lsr x20, x19, #32\n\t"
"dup v12.4s, v19.s[0]\n\t"
"dup v13.4s, v19.s[1]\n\t"
"lsr x22, x21, #32\n\t"
"dup v14.4s, v19.s[2]\n\t"
"dup v15.4s, v19.s[3]\n\t"
"lsr x24, x23, #32\n\t"
    /* NEON blocks get counters +0..3; scalar block gets counter +4. */
"add v12.4s, v12.4s, v28.4s\n\t"
"add w21, w21, w25\n\t"
"mov x26, #10\n\t"
"\n"
    /* 10 double-rounds; NEON and scalar quarter-rounds interleaved.
     * Rotations: 16 = rev32.8h, 12/7 = shl+sri, 8 = tbl with v30. */
"L_chacha_crypt_bytes_arm64_round_start_320_%=: \n\t"
"subs x26, x26, #1\n\t"
"add v0.4s, v0.4s, v4.4s\n\t"
"add w8, w8, w12\n\t"
"add v1.4s, v1.4s, v5.4s\n\t"
"add w9, w9, w13\n\t"
"add v2.4s, v2.4s, v6.4s\n\t"
"add w10, w10, w14\n\t"
"add v3.4s, v3.4s, v7.4s\n\t"
"add w11, w11, w15\n\t"
"eor v12.16b, v12.16b, v0.16b\n\t"
"eor w21, w21, w8\n\t"
"eor v13.16b, v13.16b, v1.16b\n\t"
"eor w22, w22, w9\n\t"
"eor v14.16b, v14.16b, v2.16b\n\t"
"eor w23, w23, w10\n\t"
"eor v15.16b, v15.16b, v3.16b\n\t"
"eor w24, w24, w11\n\t"
"rev32 v12.8h, v12.8h\n\t"
"ror w21, w21, #16\n\t"
"rev32 v13.8h, v13.8h\n\t"
"ror w22, w22, #16\n\t"
"rev32 v14.8h, v14.8h\n\t"
"ror w23, w23, #16\n\t"
"rev32 v15.8h, v15.8h\n\t"
"ror w24, w24, #16\n\t"
"add v8.4s, v8.4s, v12.4s\n\t"
"add w16, w16, w21\n\t"
"add v9.4s, v9.4s, v13.4s\n\t"
"add w17, w17, w22\n\t"
"add v10.4s, v10.4s, v14.4s\n\t"
"add w19, w19, w23\n\t"
"add v11.4s, v11.4s, v15.4s\n\t"
"add w20, w20, w24\n\t"
"eor v20.16b, v4.16b, v8.16b\n\t"
"eor w12, w12, w16\n\t"
"eor v21.16b, v5.16b, v9.16b\n\t"
"eor w13, w13, w17\n\t"
"eor v22.16b, v6.16b, v10.16b\n\t"
"eor w14, w14, w19\n\t"
"eor v23.16b, v7.16b, v11.16b\n\t"
"eor w15, w15, w20\n\t"
"shl v4.4s, v20.4s, #12\n\t"
"ror w12, w12, #20\n\t"
"shl v5.4s, v21.4s, #12\n\t"
"ror w13, w13, #20\n\t"
"shl v6.4s, v22.4s, #12\n\t"
"ror w14, w14, #20\n\t"
"shl v7.4s, v23.4s, #12\n\t"
"ror w15, w15, #20\n\t"
"sri v4.4s, v20.4s, #20\n\t"
"sri v5.4s, v21.4s, #20\n\t"
"sri v6.4s, v22.4s, #20\n\t"
"sri v7.4s, v23.4s, #20\n\t"
"add v0.4s, v0.4s, v4.4s\n\t"
"add w8, w8, w12\n\t"
"add v1.4s, v1.4s, v5.4s\n\t"
"add w9, w9, w13\n\t"
"add v2.4s, v2.4s, v6.4s\n\t"
"add w10, w10, w14\n\t"
"add v3.4s, v3.4s, v7.4s\n\t"
"add w11, w11, w15\n\t"
"eor v12.16b, v12.16b, v0.16b\n\t"
"eor w21, w21, w8\n\t"
"eor v13.16b, v13.16b, v1.16b\n\t"
"eor w22, w22, w9\n\t"
"eor v14.16b, v14.16b, v2.16b\n\t"
"eor w23, w23, w10\n\t"
"eor v15.16b, v15.16b, v3.16b\n\t"
"eor w24, w24, w11\n\t"
"tbl v12.16b, {v12.16b}, v30.16b\n\t"
"ror w21, w21, #24\n\t"
"tbl v13.16b, {v13.16b}, v30.16b\n\t"
"ror w22, w22, #24\n\t"
"tbl v14.16b, {v14.16b}, v30.16b\n\t"
"ror w23, w23, #24\n\t"
"tbl v15.16b, {v15.16b}, v30.16b\n\t"
"ror w24, w24, #24\n\t"
"add v8.4s, v8.4s, v12.4s\n\t"
"add w16, w16, w21\n\t"
"add v9.4s, v9.4s, v13.4s\n\t"
"add w17, w17, w22\n\t"
"add v10.4s, v10.4s, v14.4s\n\t"
"add w19, w19, w23\n\t"
"add v11.4s, v11.4s, v15.4s\n\t"
"add w20, w20, w24\n\t"
"eor v20.16b, v4.16b, v8.16b\n\t"
"eor w12, w12, w16\n\t"
"eor v21.16b, v5.16b, v9.16b\n\t"
"eor w13, w13, w17\n\t"
"eor v22.16b, v6.16b, v10.16b\n\t"
"eor w14, w14, w19\n\t"
"eor v23.16b, v7.16b, v11.16b\n\t"
"eor w15, w15, w20\n\t"
"shl v4.4s, v20.4s, #7\n\t"
"ror w12, w12, #25\n\t"
"shl v5.4s, v21.4s, #7\n\t"
"ror w13, w13, #25\n\t"
"shl v6.4s, v22.4s, #7\n\t"
"ror w14, w14, #25\n\t"
"shl v7.4s, v23.4s, #7\n\t"
"ror w15, w15, #25\n\t"
"sri v4.4s, v20.4s, #25\n\t"
"sri v5.4s, v21.4s, #25\n\t"
"sri v6.4s, v22.4s, #25\n\t"
"sri v7.4s, v23.4s, #25\n\t"
    /* Second (diagonal) half of the double-round. */
"add v0.4s, v0.4s, v5.4s\n\t"
"add w8, w8, w13\n\t"
"add v1.4s, v1.4s, v6.4s\n\t"
"add w9, w9, w14\n\t"
"add v2.4s, v2.4s, v7.4s\n\t"
"add w10, w10, w15\n\t"
"add v3.4s, v3.4s, v4.4s\n\t"
"add w11, w11, w12\n\t"
"eor v15.16b, v15.16b, v0.16b\n\t"
"eor w24, w24, w8\n\t"
"eor v12.16b, v12.16b, v1.16b\n\t"
"eor w21, w21, w9\n\t"
"eor v13.16b, v13.16b, v2.16b\n\t"
"eor w22, w22, w10\n\t"
"eor v14.16b, v14.16b, v3.16b\n\t"
"eor w23, w23, w11\n\t"
"rev32 v15.8h, v15.8h\n\t"
"ror w24, w24, #16\n\t"
"rev32 v12.8h, v12.8h\n\t"
"ror w21, w21, #16\n\t"
"rev32 v13.8h, v13.8h\n\t"
"ror w22, w22, #16\n\t"
"rev32 v14.8h, v14.8h\n\t"
"ror w23, w23, #16\n\t"
"add v10.4s, v10.4s, v15.4s\n\t"
"add w19, w19, w24\n\t"
"add v11.4s, v11.4s, v12.4s\n\t"
"add w20, w20, w21\n\t"
"add v8.4s, v8.4s, v13.4s\n\t"
"add w16, w16, w22\n\t"
"add v9.4s, v9.4s, v14.4s\n\t"
"add w17, w17, w23\n\t"
"eor v20.16b, v5.16b, v10.16b\n\t"
"eor w13, w13, w19\n\t"
"eor v21.16b, v6.16b, v11.16b\n\t"
"eor w14, w14, w20\n\t"
"eor v22.16b, v7.16b, v8.16b\n\t"
"eor w15, w15, w16\n\t"
"eor v23.16b, v4.16b, v9.16b\n\t"
"eor w12, w12, w17\n\t"
"shl v5.4s, v20.4s, #12\n\t"
"ror w13, w13, #20\n\t"
"shl v6.4s, v21.4s, #12\n\t"
"ror w14, w14, #20\n\t"
"shl v7.4s, v22.4s, #12\n\t"
"ror w15, w15, #20\n\t"
"shl v4.4s, v23.4s, #12\n\t"
"ror w12, w12, #20\n\t"
"sri v5.4s, v20.4s, #20\n\t"
"sri v6.4s, v21.4s, #20\n\t"
"sri v7.4s, v22.4s, #20\n\t"
"sri v4.4s, v23.4s, #20\n\t"
"add v0.4s, v0.4s, v5.4s\n\t"
"add w8, w8, w13\n\t"
"add v1.4s, v1.4s, v6.4s\n\t"
"add w9, w9, w14\n\t"
"add v2.4s, v2.4s, v7.4s\n\t"
"add w10, w10, w15\n\t"
"add v3.4s, v3.4s, v4.4s\n\t"
"add w11, w11, w12\n\t"
"eor v15.16b, v15.16b, v0.16b\n\t"
"eor w24, w24, w8\n\t"
"eor v12.16b, v12.16b, v1.16b\n\t"
"eor w21, w21, w9\n\t"
"eor v13.16b, v13.16b, v2.16b\n\t"
"eor w22, w22, w10\n\t"
"eor v14.16b, v14.16b, v3.16b\n\t"
"eor w23, w23, w11\n\t"
"tbl v15.16b, {v15.16b}, v30.16b\n\t"
"ror w24, w24, #24\n\t"
"tbl v12.16b, {v12.16b}, v30.16b\n\t"
"ror w21, w21, #24\n\t"
"tbl v13.16b, {v13.16b}, v30.16b\n\t"
"ror w22, w22, #24\n\t"
"tbl v14.16b, {v14.16b}, v30.16b\n\t"
"ror w23, w23, #24\n\t"
"add v10.4s, v10.4s, v15.4s\n\t"
"add w19, w19, w24\n\t"
"add v11.4s, v11.4s, v12.4s\n\t"
"add w20, w20, w21\n\t"
"add v8.4s, v8.4s, v13.4s\n\t"
"add w16, w16, w22\n\t"
"add v9.4s, v9.4s, v14.4s\n\t"
"add w17, w17, w23\n\t"
"eor v20.16b, v5.16b, v10.16b\n\t"
"eor w13, w13, w19\n\t"
"eor v21.16b, v6.16b, v11.16b\n\t"
"eor w14, w14, w20\n\t"
"eor v22.16b, v7.16b, v8.16b\n\t"
"eor w15, w15, w16\n\t"
"eor v23.16b, v4.16b, v9.16b\n\t"
"eor w12, w12, w17\n\t"
"shl v5.4s, v20.4s, #7\n\t"
"ror w13, w13, #25\n\t"
"shl v6.4s, v21.4s, #7\n\t"
"ror w14, w14, #25\n\t"
"shl v7.4s, v22.4s, #7\n\t"
"ror w15, w15, #25\n\t"
"shl v4.4s, v23.4s, #7\n\t"
"ror w12, w12, #25\n\t"
"sri v5.4s, v20.4s, #25\n\t"
"sri v6.4s, v21.4s, #25\n\t"
"sri v7.4s, v22.4s, #25\n\t"
"sri v4.4s, v23.4s, #25\n\t"
"b.ne L_chacha_crypt_bytes_arm64_round_start_320_%=\n\t"
    /* Re-apply counter offsets (they are not part of the stored state). */
"add v12.4s, v12.4s, v28.4s\n\t"
"add w21, w21, w25\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
    /* Un-transpose lanes back into per-block order; repack scalar halves. */
"trn1 v20.4s, v0.4s, v1.4s\n\t"
"trn1 v22.4s, v2.4s, v3.4s\n\t"
"orr x8, x8, x9, lsl 32\n\t"
"trn2 v21.4s, v0.4s, v1.4s\n\t"
"trn2 v23.4s, v2.4s, v3.4s\n\t"
"trn1 v0.2d, v20.2d, v22.2d\n\t"
"trn1 v1.2d, v21.2d, v23.2d\n\t"
"orr x10, x10, x11, lsl 32\n\t"
"trn2 v2.2d, v20.2d, v22.2d\n\t"
"trn2 v3.2d, v21.2d, v23.2d\n\t"
"trn1 v20.4s, v4.4s, v5.4s\n\t"
"trn1 v22.4s, v6.4s, v7.4s\n\t"
"orr x12, x12, x13, lsl 32\n\t"
"trn2 v21.4s, v4.4s, v5.4s\n\t"
"trn2 v23.4s, v6.4s, v7.4s\n\t"
"trn1 v4.2d, v20.2d, v22.2d\n\t"
"trn1 v5.2d, v21.2d, v23.2d\n\t"
"orr x14, x14, x15, lsl 32\n\t"
"trn2 v6.2d, v20.2d, v22.2d\n\t"
"trn2 v7.2d, v21.2d, v23.2d\n\t"
"trn1 v20.4s, v8.4s, v9.4s\n\t"
"trn1 v22.4s, v10.4s, v11.4s\n\t"
"orr x16, x16, x17, lsl 32\n\t"
"trn2 v21.4s, v8.4s, v9.4s\n\t"
"trn2 v23.4s, v10.4s, v11.4s\n\t"
"trn1 v8.2d, v20.2d, v22.2d\n\t"
"trn1 v9.2d, v21.2d, v23.2d\n\t"
"orr x19, x19, x20, lsl 32\n\t"
"trn2 v10.2d, v20.2d, v22.2d\n\t"
"trn2 v11.2d, v21.2d, v23.2d\n\t"
"trn1 v20.4s, v12.4s, v13.4s\n\t"
"trn1 v22.4s, v14.4s, v15.4s\n\t"
"orr x21, x21, x22, lsl 32\n\t"
"trn2 v21.4s, v12.4s, v13.4s\n\t"
"trn2 v23.4s, v14.4s, v15.4s\n\t"
"trn1 v12.2d, v20.2d, v22.2d\n\t"
"trn1 v13.2d, v21.2d, v23.2d\n\t"
"orr x23, x23, x24, lsl 32\n\t"
"trn2 v14.2d, v20.2d, v22.2d\n\t"
"trn2 v15.2d, v21.2d, v23.2d\n\t"
    /* Block 0: add input state, XOR with message, store. */
"add v20.4s, v0.4s, v16.4s\n\t"
"add v21.4s, v4.4s, v17.4s\n\t"
"add v22.4s, v8.4s, v18.4s\n\t"
"add v23.4s, v12.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
    /* Block 1. */
"add v20.4s, v1.4s, v16.4s\n\t"
"add v21.4s, v5.4s, v17.4s\n\t"
"add v22.4s, v9.4s, v18.4s\n\t"
"add v23.4s, v13.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
    /* Block 2. */
"add v20.4s, v2.4s, v16.4s\n\t"
"add v21.4s, v6.4s, v17.4s\n\t"
"add v22.4s, v10.4s, v18.4s\n\t"
"add v23.4s, v14.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
    /* Block 3. */
"add v20.4s, v3.4s, v16.4s\n\t"
"add v21.4s, v7.4s, v17.4s\n\t"
"add v22.4s, v11.4s, v18.4s\n\t"
"add v23.4s, v15.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
    /* Block 4 (scalar): move GPR state into vectors, finish as above. */
"mov v0.d[0], x8\n\t"
"mov v0.d[1], x10\n\t"
"mov v1.d[0], x12\n\t"
"mov v1.d[1], x14\n\t"
"mov v2.d[0], x16\n\t"
"mov v2.d[1], x19\n\t"
"mov v3.d[0], x21\n\t"
"mov v3.d[1], x23\n\t"
"add v0.4s, v0.4s, v16.4s\n\t"
"add v1.4s, v1.4s, v17.4s\n\t"
"add v2.4s, v2.4s, v18.4s\n\t"
"add v3.4s, v3.4s, v19.4s\n\t"
"eor v0.16b, v0.16b, v24.16b\n\t"
"eor v1.16b, v1.16b, v25.16b\n\t"
"eor v2.16b, v2.16b, v26.16b\n\t"
"eor v3.16b, v3.16b, v27.16b\n\t"
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[c]], #0x40\n\t"
"cmp %w[len], #0x140\n\t"
    /* Advance stored block counter by 5 (v29). */
"add v19.4s, v19.4s, v29.4s\n\t"
"b.ge L_chacha_crypt_bytes_arm64_loop_320_%=\n\t"
"\n"
    /* Fewer than 320 bytes remain; try the 256-byte (4 NEON block) path. */
"L_chacha_crypt_bytes_arm64_lt_320_%=: \n\t"
"cmp %w[len], #0x100\n\t"
"b.lt L_chacha_crypt_bytes_arm64_lt_256_%=\n\t"
"dup v0.4s, v16.s[0]\n\t"
"dup v1.4s, v16.s[1]\n\t"
"dup v2.4s, v16.s[2]\n\t"
"dup v3.4s, v16.s[3]\n\t"
"dup v4.4s, v17.s[0]\n\t"
"dup v5.4s, v17.s[1]\n\t"
"dup v6.4s, v17.s[2]\n\t"
"dup v7.4s, v17.s[3]\n\t"
"dup v8.4s, v18.s[0]\n\t"
"dup v9.4s, v18.s[1]\n\t"
"dup v10.4s, v18.s[2]\n\t"
"dup v11.4s, v18.s[3]\n\t"
"dup v12.4s, v19.s[0]\n\t"
"dup v13.4s, v19.s[1]\n\t"
"dup v14.4s, v19.s[2]\n\t"
"dup v15.4s, v19.s[3]\n\t"
"add v12.4s, v12.4s, v28.4s\n\t"
"mov x26, #10\n\t"
"\n"
    /* 10 double-rounds, NEON only (same rotate idioms as the 320 path). */
"L_chacha_crypt_bytes_arm64_round_start_256_%=: \n\t"
"subs x26, x26, #1\n\t"
"add v0.4s, v0.4s, v4.4s\n\t"
"add v1.4s, v1.4s, v5.4s\n\t"
"add v2.4s, v2.4s, v6.4s\n\t"
"add v3.4s, v3.4s, v7.4s\n\t"
"eor v12.16b, v12.16b, v0.16b\n\t"
"eor v13.16b, v13.16b, v1.16b\n\t"
"eor v14.16b, v14.16b, v2.16b\n\t"
"eor v15.16b, v15.16b, v3.16b\n\t"
"rev32 v12.8h, v12.8h\n\t"
"rev32 v13.8h, v13.8h\n\t"
"rev32 v14.8h, v14.8h\n\t"
"rev32 v15.8h, v15.8h\n\t"
"add v8.4s, v8.4s, v12.4s\n\t"
"add v9.4s, v9.4s, v13.4s\n\t"
"add v10.4s, v10.4s, v14.4s\n\t"
"add v11.4s, v11.4s, v15.4s\n\t"
"eor v20.16b, v4.16b, v8.16b\n\t"
"eor v21.16b, v5.16b, v9.16b\n\t"
"eor v22.16b, v6.16b, v10.16b\n\t"
"eor v23.16b, v7.16b, v11.16b\n\t"
"shl v4.4s, v20.4s, #12\n\t"
"shl v5.4s, v21.4s, #12\n\t"
"shl v6.4s, v22.4s, #12\n\t"
"shl v7.4s, v23.4s, #12\n\t"
"sri v4.4s, v20.4s, #20\n\t"
"sri v5.4s, v21.4s, #20\n\t"
"sri v6.4s, v22.4s, #20\n\t"
"sri v7.4s, v23.4s, #20\n\t"
"add v0.4s, v0.4s, v4.4s\n\t"
"add v1.4s, v1.4s, v5.4s\n\t"
"add v2.4s, v2.4s, v6.4s\n\t"
"add v3.4s, v3.4s, v7.4s\n\t"
"eor v12.16b, v12.16b, v0.16b\n\t"
"eor v13.16b, v13.16b, v1.16b\n\t"
"eor v14.16b, v14.16b, v2.16b\n\t"
"eor v15.16b, v15.16b, v3.16b\n\t"
"tbl v12.16b, {v12.16b}, v30.16b\n\t"
"tbl v13.16b, {v13.16b}, v30.16b\n\t"
"tbl v14.16b, {v14.16b}, v30.16b\n\t"
"tbl v15.16b, {v15.16b}, v30.16b\n\t"
"add v8.4s, v8.4s, v12.4s\n\t"
"add v9.4s, v9.4s, v13.4s\n\t"
"add v10.4s, v10.4s, v14.4s\n\t"
"add v11.4s, v11.4s, v15.4s\n\t"
"eor v20.16b, v4.16b, v8.16b\n\t"
"eor v21.16b, v5.16b, v9.16b\n\t"
"eor v22.16b, v6.16b, v10.16b\n\t"
"eor v23.16b, v7.16b, v11.16b\n\t"
"shl v4.4s, v20.4s, #7\n\t"
"shl v5.4s, v21.4s, #7\n\t"
"shl v6.4s, v22.4s, #7\n\t"
"shl v7.4s, v23.4s, #7\n\t"
"sri v4.4s, v20.4s, #25\n\t"
"sri v5.4s, v21.4s, #25\n\t"
"sri v6.4s, v22.4s, #25\n\t"
"sri v7.4s, v23.4s, #25\n\t"
"add v0.4s, v0.4s, v5.4s\n\t"
"add v1.4s, v1.4s, v6.4s\n\t"
"add v2.4s, v2.4s, v7.4s\n\t"
"add v3.4s, v3.4s, v4.4s\n\t"
"eor v15.16b, v15.16b, v0.16b\n\t"
"eor v12.16b, v12.16b, v1.16b\n\t"
"eor v13.16b, v13.16b, v2.16b\n\t"
"eor v14.16b, v14.16b, v3.16b\n\t"
"rev32 v15.8h, v15.8h\n\t"
"rev32 v12.8h, v12.8h\n\t"
"rev32 v13.8h, v13.8h\n\t"
"rev32 v14.8h, v14.8h\n\t"
"add v10.4s, v10.4s, v15.4s\n\t"
"add v11.4s, v11.4s, v12.4s\n\t"
"add v8.4s, v8.4s, v13.4s\n\t"
"add v9.4s, v9.4s, v14.4s\n\t"
"eor v20.16b, v5.16b, v10.16b\n\t"
"eor v21.16b, v6.16b, v11.16b\n\t"
"eor v22.16b, v7.16b, v8.16b\n\t"
"eor v23.16b, v4.16b, v9.16b\n\t"
"shl v5.4s, v20.4s, #12\n\t"
"shl v6.4s, v21.4s, #12\n\t"
"shl v7.4s, v22.4s, #12\n\t"
"shl v4.4s, v23.4s, #12\n\t"
"sri v5.4s, v20.4s, #20\n\t"
"sri v6.4s, v21.4s, #20\n\t"
"sri v7.4s, v22.4s, #20\n\t"
"sri v4.4s, v23.4s, #20\n\t"
"add v0.4s, v0.4s, v5.4s\n\t"
"add v1.4s, v1.4s, v6.4s\n\t"
"add v2.4s, v2.4s, v7.4s\n\t"
"add v3.4s, v3.4s, v4.4s\n\t"
"eor v15.16b, v15.16b, v0.16b\n\t"
"eor v12.16b, v12.16b, v1.16b\n\t"
"eor v13.16b, v13.16b, v2.16b\n\t"
"eor v14.16b, v14.16b, v3.16b\n\t"
"tbl v15.16b, {v15.16b}, v30.16b\n\t"
"tbl v12.16b, {v12.16b}, v30.16b\n\t"
"tbl v13.16b, {v13.16b}, v30.16b\n\t"
"tbl v14.16b, {v14.16b}, v30.16b\n\t"
"add v10.4s, v10.4s, v15.4s\n\t"
"add v11.4s, v11.4s, v12.4s\n\t"
"add v8.4s, v8.4s, v13.4s\n\t"
"add v9.4s, v9.4s, v14.4s\n\t"
"eor v20.16b, v5.16b, v10.16b\n\t"
"eor v21.16b, v6.16b, v11.16b\n\t"
"eor v22.16b, v7.16b, v8.16b\n\t"
"eor v23.16b, v4.16b, v9.16b\n\t"
"shl v5.4s, v20.4s, #7\n\t"
"shl v6.4s, v21.4s, #7\n\t"
"shl v7.4s, v22.4s, #7\n\t"
"shl v4.4s, v23.4s, #7\n\t"
"sri v5.4s, v20.4s, #25\n\t"
"sri v6.4s, v21.4s, #25\n\t"
"sri v7.4s, v22.4s, #25\n\t"
"sri v4.4s, v23.4s, #25\n\t"
"b.ne L_chacha_crypt_bytes_arm64_round_start_256_%=\n\t"
    /* x26 = 4: counter advance for this 4-block path (loaded into v29 below). */
"mov x26, #4\n\t"
"add v12.4s, v12.4s, v28.4s\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
    /* Un-transpose lanes back into per-block order. */
"trn1 v20.4s, v0.4s, v1.4s\n\t"
"trn1 v22.4s, v2.4s, v3.4s\n\t"
"trn2 v21.4s, v0.4s, v1.4s\n\t"
"trn2 v23.4s, v2.4s, v3.4s\n\t"
"trn1 v0.2d, v20.2d, v22.2d\n\t"
"trn1 v1.2d, v21.2d, v23.2d\n\t"
"trn2 v2.2d, v20.2d, v22.2d\n\t"
"trn2 v3.2d, v21.2d, v23.2d\n\t"
"trn1 v20.4s, v4.4s, v5.4s\n\t"
"trn1 v22.4s, v6.4s, v7.4s\n\t"
"trn2 v21.4s, v4.4s, v5.4s\n\t"
"trn2 v23.4s, v6.4s, v7.4s\n\t"
"trn1 v4.2d, v20.2d, v22.2d\n\t"
"trn1 v5.2d, v21.2d, v23.2d\n\t"
"trn2 v6.2d, v20.2d, v22.2d\n\t"
"trn2 v7.2d, v21.2d, v23.2d\n\t"
"trn1 v20.4s, v8.4s, v9.4s\n\t"
"trn1 v22.4s, v10.4s, v11.4s\n\t"
"trn2 v21.4s, v8.4s, v9.4s\n\t"
"trn2 v23.4s, v10.4s, v11.4s\n\t"
"trn1 v8.2d, v20.2d, v22.2d\n\t"
"trn1 v9.2d, v21.2d, v23.2d\n\t"
"trn2 v10.2d, v20.2d, v22.2d\n\t"
"trn2 v11.2d, v21.2d, v23.2d\n\t"
"trn1 v20.4s, v12.4s, v13.4s\n\t"
"trn1 v22.4s, v14.4s, v15.4s\n\t"
"trn2 v21.4s, v12.4s, v13.4s\n\t"
"trn2 v23.4s, v14.4s, v15.4s\n\t"
"trn1 v12.2d, v20.2d, v22.2d\n\t"
"trn1 v13.2d, v21.2d, v23.2d\n\t"
"trn2 v14.2d, v20.2d, v22.2d\n\t"
"trn2 v15.2d, v21.2d, v23.2d\n\t"
    /* Blocks 0-3: add input state, XOR with message, store. */
"add v20.4s, v0.4s, v16.4s\n\t"
"add v21.4s, v4.4s, v17.4s\n\t"
"add v22.4s, v8.4s, v18.4s\n\t"
"add v23.4s, v12.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
"add v20.4s, v1.4s, v16.4s\n\t"
"add v21.4s, v5.4s, v17.4s\n\t"
"add v22.4s, v9.4s, v18.4s\n\t"
"add v23.4s, v13.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
"add v20.4s, v2.4s, v16.4s\n\t"
"add v21.4s, v6.4s, v17.4s\n\t"
"add v22.4s, v10.4s, v18.4s\n\t"
"add v23.4s, v14.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
"add v20.4s, v3.4s, v16.4s\n\t"
"add v21.4s, v7.4s, v17.4s\n\t"
"add v22.4s, v11.4s, v18.4s\n\t"
"add v23.4s, v15.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
    /* Advance stored block counter by 4. */
"mov v29.s[0], w26\n\t"
"sub %w[len], %w[len], #0x100\n\t"
"add v19.4s, v19.4s, v29.4s\n\t"
"\n"
    /* 128-byte path: two full-state blocks (v0-v3 and v4-v7). */
"L_chacha_crypt_bytes_arm64_lt_256_%=: \n\t"
"cmp %w[len], #0x80\n\t"
"b.lt L_chacha_crypt_bytes_arm64_lt_128_%=\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"mov v4.16b, v16.16b\n\t"
"mov v5.16b, v17.16b\n\t"
"mov v6.16b, v18.16b\n\t"
"mov v7.16b, v19.16b\n\t"
"mov v0.16b, v16.16b\n\t"
"mov v1.16b, v17.16b\n\t"
"mov v2.16b, v18.16b\n\t"
"mov v3.16b, v19.16b\n\t"
    /* Second block's counter = first block's + 1. */
"add v7.4s, v7.4s, v31.4s\n\t"
"mov x26, #10\n\t"
"\n"
    /* Whole-state double-rounds: diagonalization done with EXT rotates. */
"L_chacha_crypt_bytes_arm64_round_start_128_%=: \n\t"
"subs x26, x26, #1\n\t"
"add v0.4s, v0.4s, v1.4s\n\t"
"add v4.4s, v4.4s, v5.4s\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"eor v7.16b, v7.16b, v4.16b\n\t"
"rev32 v3.8h, v3.8h\n\t"
"rev32 v7.8h, v7.8h\n\t"
"add v2.4s, v2.4s, v3.4s\n\t"
"add v6.4s, v6.4s, v7.4s\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"eor v21.16b, v5.16b, v6.16b\n\t"
"shl v1.4s, v20.4s, #12\n\t"
"shl v5.4s, v21.4s, #12\n\t"
"sri v1.4s, v20.4s, #20\n\t"
"sri v5.4s, v21.4s, #20\n\t"
"add v0.4s, v0.4s, v1.4s\n\t"
"add v4.4s, v4.4s, v5.4s\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"eor v7.16b, v7.16b, v4.16b\n\t"
"tbl v3.16b, {v3.16b}, v30.16b\n\t"
"tbl v7.16b, {v7.16b}, v30.16b\n\t"
"add v2.4s, v2.4s, v3.4s\n\t"
"add v6.4s, v6.4s, v7.4s\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"eor v21.16b, v5.16b, v6.16b\n\t"
"shl v1.4s, v20.4s, #7\n\t"
"shl v5.4s, v21.4s, #7\n\t"
"sri v1.4s, v20.4s, #25\n\t"
"sri v5.4s, v21.4s, #25\n\t"
    /* Rotate rows into diagonal position. */
"ext v3.16b, v3.16b, v3.16b, #12\n\t"
"ext v7.16b, v7.16b, v7.16b, #12\n\t"
"ext v1.16b, v1.16b, v1.16b, #4\n\t"
"ext v5.16b, v5.16b, v5.16b, #4\n\t"
"ext v2.16b, v2.16b, v2.16b, #8\n\t"
"ext v6.16b, v6.16b, v6.16b, #8\n\t"
"add v0.4s, v0.4s, v1.4s\n\t"
"add v4.4s, v4.4s, v5.4s\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"eor v7.16b, v7.16b, v4.16b\n\t"
"rev32 v3.8h, v3.8h\n\t"
"rev32 v7.8h, v7.8h\n\t"
"add v2.4s, v2.4s, v3.4s\n\t"
"add v6.4s, v6.4s, v7.4s\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"eor v21.16b, v5.16b, v6.16b\n\t"
"shl v1.4s, v20.4s, #12\n\t"
"shl v5.4s, v21.4s, #12\n\t"
"sri v1.4s, v20.4s, #20\n\t"
"sri v5.4s, v21.4s, #20\n\t"
"add v0.4s, v0.4s, v1.4s\n\t"
"add v4.4s, v4.4s, v5.4s\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"eor v7.16b, v7.16b, v4.16b\n\t"
"tbl v3.16b, {v3.16b}, v30.16b\n\t"
"tbl v7.16b, {v7.16b}, v30.16b\n\t"
"add v2.4s, v2.4s, v3.4s\n\t"
"add v6.4s, v6.4s, v7.4s\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"eor v21.16b, v5.16b, v6.16b\n\t"
"shl v1.4s, v20.4s, #7\n\t"
"shl v5.4s, v21.4s, #7\n\t"
"sri v1.4s, v20.4s, #25\n\t"
"sri v5.4s, v21.4s, #25\n\t"
    /* Rotate rows back into column position. */
"ext v3.16b, v3.16b, v3.16b, #4\n\t"
"ext v7.16b, v7.16b, v7.16b, #4\n\t"
"ext v1.16b, v1.16b, v1.16b, #12\n\t"
"ext v5.16b, v5.16b, v5.16b, #12\n\t"
"ext v2.16b, v2.16b, v2.16b, #8\n\t"
"ext v6.16b, v6.16b, v6.16b, #8\n\t"
"b.ne L_chacha_crypt_bytes_arm64_round_start_128_%=\n\t"
    /* Finalize block 0. */
"add v0.4s, v0.4s, v16.4s\n\t"
"add v1.4s, v1.4s, v17.4s\n\t"
"add v2.4s, v2.4s, v18.4s\n\t"
"add v3.4s, v3.4s, v19.4s\n\t"
"eor v24.16b, v24.16b, v0.16b\n\t"
"eor v25.16b, v25.16b, v1.16b\n\t"
"eor v26.16b, v26.16b, v2.16b\n\t"
"eor v27.16b, v27.16b, v3.16b\n\t"
"ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
"st1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[c]], #0x40\n\t"
    /* Finalize block 1 (counter+1 applied via v31 before the add). */
"add v19.4s, v19.4s, v31.4s\n\t"
"add v4.4s, v4.4s, v16.4s\n\t"
"add v5.4s, v5.4s, v17.4s\n\t"
"add v6.4s, v6.4s, v18.4s\n\t"
"add v7.4s, v7.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v4.16b\n\t"
"eor v21.16b, v21.16b, v5.16b\n\t"
"eor v22.16b, v22.16b, v6.16b\n\t"
"eor v23.16b, v23.16b, v7.16b\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
"add v19.4s, v19.4s, v31.4s\n\t"
"sub %w[len], %w[len], #0x80\n\t"
"\n"
    /* Single-block path; also handles the trailing partial block. */
"L_chacha_crypt_bytes_arm64_lt_128_%=: \n\t"
"cmp %w[len], #0\n\t"
"b.eq L_chacha_crypt_bytes_arm64_done_all_%=\n\t"
    /* Repurpose the rol8 operand register as the constant 64. */
"mov %w[rol8], #0x40\n\t"
"\n"
"L_chacha_crypt_bytes_arm64_loop_64_%=: \n\t"
"mov v0.16b, v16.16b\n\t"
"mov v1.16b, v17.16b\n\t"
"mov v2.16b, v18.16b\n\t"
"mov v3.16b, v19.16b\n\t"
"mov x26, #10\n\t"
"\n"
"L_chacha_crypt_bytes_arm64_round_64_%=: \n\t"
"subs x26, x26, #1\n\t"
"add v0.4s, v0.4s, v1.4s\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"rev32 v3.8h, v3.8h\n\t"
"add v2.4s, v2.4s, v3.4s\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"shl v1.4s, v20.4s, #12\n\t"
"sri v1.4s, v20.4s, #20\n\t"
"add v0.4s, v0.4s, v1.4s\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"tbl v3.16b, {v3.16b}, v30.16b\n\t"
"add v2.4s, v2.4s, v3.4s\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"shl v1.4s, v20.4s, #7\n\t"
"sri v1.4s, v20.4s, #25\n\t"
"ext v3.16b, v3.16b, v3.16b, #12\n\t"
"ext v1.16b, v1.16b, v1.16b, #4\n\t"
"ext v2.16b, v2.16b, v2.16b, #8\n\t"
"add v0.4s, v0.4s, v1.4s\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"rev32 v3.8h, v3.8h\n\t"
"add v2.4s, v2.4s, v3.4s\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"shl v1.4s, v20.4s, #12\n\t"
"sri v1.4s, v20.4s, #20\n\t"
"add v0.4s, v0.4s, v1.4s\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"tbl v3.16b, {v3.16b}, v30.16b\n\t"
"add v2.4s, v2.4s, v3.4s\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"shl v1.4s, v20.4s, #7\n\t"
"sri v1.4s, v20.4s, #25\n\t"
"ext v3.16b, v3.16b, v3.16b, #4\n\t"
"ext v1.16b, v1.16b, v1.16b, #12\n\t"
"ext v2.16b, v2.16b, v2.16b, #8\n\t"
"b.ne L_chacha_crypt_bytes_arm64_round_64_%=\n\t"
"add v0.4s, v0.4s, v16.4s\n\t"
"add v1.4s, v1.4s, v17.4s\n\t"
"add v2.4s, v2.4s, v18.4s\n\t"
"add v3.4s, v3.4s, v19.4s\n\t"
"cmp %w[len], #0x40\n\t"
"add v19.4s, v19.4s, v31.4s\n\t"
"b.lt L_chacha_crypt_bytes_arm64_lt_64_%=\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"eor v24.16b, v24.16b, v0.16b\n\t"
"eor v25.16b, v25.16b, v1.16b\n\t"
"eor v26.16b, v26.16b, v2.16b\n\t"
"eor v27.16b, v27.16b, v3.16b\n\t"
"st1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[c]], #0x40\n\t"
"subs %w[len], %w[len], #0x40\n\t"
"b.ne L_chacha_crypt_bytes_arm64_loop_64_%=\n\t"
"b L_chacha_crypt_bytes_arm64_done_%=\n\t"
"\n"
    /* Partial block: save whole keystream block into ctx (at x4) and the
     * unused byte count (64 - len) at ctx+64 for wc_chacha_use_over(). */
"L_chacha_crypt_bytes_arm64_lt_64_%=: \n\t"
"sub %w[rol8], %w[rol8], %w[len]\n\t"
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x4]\n\t"
"str %w[rol8], [%x[ctx], #64]\n\t"
    /* Consume the used keystream in 32/16/8/1-byte steps, shifting the
     * remaining keystream down through v0/v1 as we go. */
"cmp %w[len], #32\n\t"
"b.lt L_chacha_crypt_bytes_arm64_lt_32_%=\n\t"
"ld1 {v24.16b, v25.16b}, [%x[m]], #32\n\t"
"eor v24.16b, v24.16b, v0.16b\n\t"
"eor v25.16b, v25.16b, v1.16b\n\t"
"st1 {v24.16b, v25.16b}, [%x[c]], #32\n\t"
"subs %w[len], %w[len], #32\n\t"
"mov v0.16b, v2.16b\n\t"
"mov v1.16b, v3.16b\n\t"
"b.eq L_chacha_crypt_bytes_arm64_done_%=\n\t"
"\n"
"L_chacha_crypt_bytes_arm64_lt_32_%=: \n\t"
"cmp %w[len], #16\n\t"
"b.lt L_chacha_crypt_bytes_arm64_lt_16_%=\n\t"
"ld1 {v24.16b}, [%x[m]], #16\n\t"
"eor v24.16b, v24.16b, v0.16b\n\t"
"st1 {v24.16b}, [%x[c]], #16\n\t"
"subs %w[len], %w[len], #16\n\t"
"mov v0.16b, v1.16b\n\t"
"b.eq L_chacha_crypt_bytes_arm64_done_%=\n\t"
"\n"
"L_chacha_crypt_bytes_arm64_lt_16_%=: \n\t"
"cmp %w[len], #8\n\t"
"b.lt L_chacha_crypt_bytes_arm64_lt_8_%=\n\t"
"ld1 {v24.8b}, [%x[m]], #8\n\t"
"eor v24.8b, v24.8b, v0.8b\n\t"
"st1 {v24.8b}, [%x[c]], #8\n\t"
"subs %w[len], %w[len], #8\n\t"
"mov v0.d[0], v0.d[1]\n\t"
"b.eq L_chacha_crypt_bytes_arm64_done_%=\n\t"
"\n"
    /* Byte-at-a-time tail: keystream in a GPR, shifted right per byte. */
"L_chacha_crypt_bytes_arm64_lt_8_%=: \n\t"
"mov %[rol8], v0.d[0]\n\t"
"\n"
"L_chacha_crypt_bytes_arm64_loop_lt_8_%=: \n\t"
"ldrb %w[ctr], [%x[m]], #1\n\t"
"eor %w[ctr], %w[ctr], %w[rol8]\n\t"
"strb %w[ctr], [%x[c]], #1\n\t"
"subs %w[len], %w[len], #1\n\t"
"lsr %[rol8], %[rol8], #8\n\t"
"b.gt L_chacha_crypt_bytes_arm64_loop_lt_8_%=\n\t"
"\n"
"L_chacha_crypt_bytes_arm64_done_%=: \n\t"
"\n"
    /* Write the (counter-advanced) state back to the context. */
"L_chacha_crypt_bytes_arm64_done_all_%=: \n\t"
"st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%x[ctx]]\n\t"
    : [ctx] "+r" (ctx), [c] "+r" (c), [len] "+r" (len)
    : [m] "r" (m), [rol8] "r" (rol8), [ctr] "r" (ctr)
    : "memory", "cc", "x4", "x7", "x8", "x9", "x10", "x11", "x12", "x13",
        "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23",
        "x24", "x25", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
        "v27", "v28", "v29", "v30", "v31"
);
}
/* Set the block counter and 12-byte IV in the ChaCha state array x.
 *
 * x        16-word state; counter is word 12 (byte 48), IV is words 13-15
 *          (bytes 52-63).
 * iv       12 bytes of nonce.
 * counter  initial 32-bit block counter.
 *
 * NOTE: the 64-bit store of counter at offset 48 writes bytes 48-55, but
 * its upper half is immediately overwritten by the IV store at offset 52,
 * so only the low 32 bits of the counter land in the state (intentional
 * overlapping stores from the generator). */
void wc_chacha_setiv(word32* x, const byte* iv, word32 counter)
{
__asm__ __volatile__ (
"ldr x3, [%x[iv]]\n\t"
"ldr w4, [%x[iv], #8]\n\t"
"str %x[counter], [%x[x], #48]\n\t"
"str x3, [%x[x], #52]\n\t"
"str w4, [%x[x], #60]\n\t"
    : [x] "+r" (x), [counter] "+r" (counter)
    : [iv] "r" (iv)
    : "memory", "cc", "x3", "x4"
);
}
/* ChaCha sigma/tau constants: row 0 = "expand 16-byte k" (128-bit key),
 * row 1 = "expand 32-byte k" (256-bit key). wc_chacha_setkey indexes this
 * table by (keySz - 16) bytes to pick the right row. */
static const word32 L_chacha_setkey_arm64_constant[] = {
0x61707865, 0x3120646e, 0x79622d36, 0x6b206574,
0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
};
/* Load key material and constants into the ChaCha state array x.
 *
 * x      16-word state: words 0-3 constants, words 4-11 key.
 * key    16 or 32 bytes of key.
 * keySz  16 or 32 (only these values are valid; the asm relies on it).
 *
 * keySz - 16 (0 or 16 bytes) selects the constant row; for a 16-byte key
 * the subtraction sets Z, so the same 16 key bytes are stored twice
 * (words 4-7 and 8-11), matching the ChaCha 128-bit-key schedule. */
void wc_chacha_setkey(word32* x, const byte* key, word32 keySz)
{
const word32* constant = L_chacha_setkey_arm64_constant;
__asm__ __volatile__ (
"subs %x[keySz], %x[keySz], #16\n\t"
"add %[constant], %[constant], %x[keySz]\n\t"
"ld1 {v0.4s}, [%[constant]]\n\t"
"ld1 {v1.16b}, [%x[key]], #16\n\t"
#ifdef BIG_ENDIAN_ORDER
"rev32 v1.8h, v1.8h\n\t"
#endif
"st1 {v0.4s}, [%x[x]], #16\n\t"
"st1 {v1.4s}, [%x[x]], #16\n\t"
"b.eq L_chacha_setkey_arm64_done_%=\n\t"
    /* 32-byte key: load the second 16 key bytes. */
"ld1 {v1.16b}, [%x[key]]\n\t"
#ifdef BIG_ENDIAN_ORDER
"rev32 v1.8h, v1.8h\n\t"
#endif
"\n"
"L_chacha_setkey_arm64_done_%=: \n\t"
"st1 {v1.4s}, [%x[x]]\n\t"
    : [x] "+r" (x), [keySz] "+r" (keySz)
    : [key] "r" (key), [constant] "r" (constant)
    : "memory", "cc", "v0", "v1"
);
}
/* XOR len bytes of input with leftover keystream bytes in 'over', writing
 * the result to output (consumes keystream saved by wc_chacha_crypt_bytes).
 *
 * over    leftover keystream bytes (advanced as consumed).
 * output  destination buffer.
 * input   source buffer.
 * len     byte count; processed in 16-byte, then 4-byte, then 1-byte steps. */
void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len)
{
__asm__ __volatile__ (
"\n"
"L_chacha_use_over_arm64_16byte_loop_%=: \n\t"
"cmp %w[len], #16\n\t"
"b.lt L_chacha_use_over_arm64_word_loop_%=\n\t"
"ld1 {v0.16b}, [%x[over]], #16\n\t"
"ld1 {v1.16b}, [%x[input]], #16\n\t"
"eor v1.16b, v1.16b, v0.16b\n\t"
"subs %w[len], %w[len], #16\n\t"
"st1 {v1.16b}, [%x[output]], #16\n\t"
"b.eq L_chacha_use_over_arm64_done_%=\n\t"
"b L_chacha_use_over_arm64_16byte_loop_%=\n\t"
"\n"
"L_chacha_use_over_arm64_word_loop_%=: \n\t"
"cmp %w[len], #4\n\t"
"b.lt L_chacha_use_over_arm64_byte_loop_%=\n\t"
"ldr w4, [%x[over]], #4\n\t"
"ldr w5, [%x[input]], #4\n\t"
"eor w5, w5, w4\n\t"
"subs %w[len], %w[len], #4\n\t"
"str w5, [%x[output]], #4\n\t"
"b.eq L_chacha_use_over_arm64_done_%=\n\t"
"b L_chacha_use_over_arm64_word_loop_%=\n\t"
"\n"
"L_chacha_use_over_arm64_byte_loop_%=: \n\t"
"ldrb w4, [%x[over]], #1\n\t"
"ldrb w5, [%x[input]], #1\n\t"
"eor w5, w5, w4\n\t"
"subs %w[len], %w[len], #1\n\t"
"strb w5, [%x[output]], #1\n\t"
"b.eq L_chacha_use_over_arm64_done_%=\n\t"
"b L_chacha_use_over_arm64_byte_loop_%=\n\t"
"\n"
"L_chacha_use_over_arm64_done_%=: \n\t"
    : [over] "+r" (over), [output] "+r" (output), [len] "+r" (len)
    : [input] "r" (input)
    : "memory", "cc", "x4", "x5", "v0", "v1"
);
}
#endif
#endif
#endif
#endif
#endif