/* armv8-mlkem-asm
*
* Copyright (C) 2006-2026 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#include <wolfssl/wolfcrypt/libwolfssl_sources_asm.h>
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./kyber/kyber.rb arm64 \
* ../wolfssl/wolfcrypt/src/port/arm/armv8-mlkem-asm.S
*/
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_INLINE
/* L_mlkem_aarch64_consts: one 16-byte vector of eight 16-bit constants.
 *
 * mlkem_ntt loads the whole vector into v4 with a single 128-bit ldr and
 * then selects constants by lane index:
 *   h[0] = 0x0d01 (3329)  by-element multiplier in the sqrdmulh and mls
 *                         reduction steps of mlkem_ntt.
 *   h[1] = 0xf301         3329 * 0xf301 == 1 (mod 2^16).
 *   h[2] = 0x4ebf (20159) multiplier in the sqdmulh that mlkem_ntt follows
 *                         with an arithmetic shift right by 11 and an mls
 *                         by h[0].
 *   h[3] = 0x0549 (1353)  equals 2^32 mod 3329.
 *   h[4] = 0x5049         equals 1353 * 0xf301 (mod 2^16).
 *   h[5..7] = 0           padding up to 16 bytes.
 * Lanes 1, 3 and 4 are presumably consumed by other routines in this file;
 * TODO confirm against their users.
 *
 * ELF builds place the object in .rodata and describe it with .type/.size;
 * Apple builds place it in __DATA,__data.
 * NOTE(review): __DATA,__data is a writable section although the values are
 * never written by mlkem_ntt - confirm this is intentional.
 */
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_consts, %object
.section .rodata
/* 8 halfwords * 2 bytes. */
.size L_mlkem_aarch64_consts, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
/* Both branches request 4-byte alignment (2^2).
 * NOTE(review): the table is read with a 16-byte load; confirm that a
 * 4-byte-aligned 128-bit access is acceptable on every supported target.
 */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_aarch64_consts:
/*     h[0]   h[1]   h[2]   h[3]   h[4]   h[5]   h[6]   h[7] */
.short 0x0d01,0xf301,0x4ebf,0x0549,0x5049,0x0000,0x0000,0x0000
#ifdef WOLFSSL_WC_MLKEM
/* L_mlkem_aarch64_zetas: twiddle factors for the forward NTT, 288 halfwords
 * (576 bytes), every value below 3329.
 *
 * mlkem_ntt addresses the table through x2 with fixed byte offsets and uses
 * each value as the second operand of sqrdmulh. The table has three regions
 * that differ in how often each value is repeated:
 *   bytes   0..63   32 values stored once. A 16-byte row is loaded into v0
 *                   and individual values are picked by lane (v0.h[n]).
 *   bytes  64..319  each value stored 4 times in a row (two values per
 *                   16-byte row). Rows are loaded whole and multiplied
 *                   element-wise after the .2d transposes.
 *   bytes 320..575  each value stored 2 times in a row (four values per
 *                   16-byte row). Rows are loaded whole and multiplied
 *                   element-wise after the .4s transposes.
 * The entry at a given position pairs with the entry at the same position
 * in L_mlkem_aarch64_zetas_qinv; the two tables must stay in lock-step.
 *
 * NOTE(review): the Apple branch uses the writable __DATA,__data section
 * while ELF uses .rodata - confirm this is intentional.
 */
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_zetas, %object
.section .rodata
/* 36 rows * 8 halfwords * 2 bytes. */
.size L_mlkem_aarch64_zetas, 576
#else
.section __DATA,__data
#endif /* __APPLE__ */
/* Both branches request 4-byte alignment (2^2).
 * NOTE(review): rows are read with 16-byte loads; confirm that a
 * 4-byte-aligned 128-bit access is acceptable on every supported target.
 */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_aarch64_zetas:
/* Bytes 0..63: one copy per value, selected by lane index. */
.short 0x08ed,0x0a0b,0x0b9a,0x0714,0x05d5,0x058e,0x011f,0x00ca
.short 0x0c56,0x026e,0x0629,0x00b6,0x03c2,0x084f,0x073f,0x05bc
.short 0x023d,0x07d4,0x0108,0x017f,0x09c4,0x05b2,0x06bf,0x0c7f
.short 0x0a58,0x03f9,0x02dc,0x0260,0x06fb,0x019b,0x0c34,0x06de
/* Bytes 64..319: four copies per value, used as whole vectors. */
.short 0x04c7,0x04c7,0x04c7,0x04c7,0x028c,0x028c,0x028c,0x028c
.short 0x0ad9,0x0ad9,0x0ad9,0x0ad9,0x03f7,0x03f7,0x03f7,0x03f7
.short 0x07f4,0x07f4,0x07f4,0x07f4,0x05d3,0x05d3,0x05d3,0x05d3
.short 0x0be7,0x0be7,0x0be7,0x0be7,0x06f9,0x06f9,0x06f9,0x06f9
.short 0x0204,0x0204,0x0204,0x0204,0x0cf9,0x0cf9,0x0cf9,0x0cf9
.short 0x0bc1,0x0bc1,0x0bc1,0x0bc1,0x0a67,0x0a67,0x0a67,0x0a67
.short 0x06af,0x06af,0x06af,0x06af,0x0877,0x0877,0x0877,0x0877
.short 0x007e,0x007e,0x007e,0x007e,0x05bd,0x05bd,0x05bd,0x05bd
.short 0x09ac,0x09ac,0x09ac,0x09ac,0x0ca7,0x0ca7,0x0ca7,0x0ca7
.short 0x0bf2,0x0bf2,0x0bf2,0x0bf2,0x033e,0x033e,0x033e,0x033e
.short 0x006b,0x006b,0x006b,0x006b,0x0774,0x0774,0x0774,0x0774
.short 0x0c0a,0x0c0a,0x0c0a,0x0c0a,0x094a,0x094a,0x094a,0x094a
.short 0x0b73,0x0b73,0x0b73,0x0b73,0x03c1,0x03c1,0x03c1,0x03c1
.short 0x071d,0x071d,0x071d,0x071d,0x0a2c,0x0a2c,0x0a2c,0x0a2c
.short 0x01c0,0x01c0,0x01c0,0x01c0,0x08d8,0x08d8,0x08d8,0x08d8
.short 0x02a5,0x02a5,0x02a5,0x02a5,0x0806,0x0806,0x0806,0x0806
/* Bytes 320..575: two copies per value, used as whole vectors. */
.short 0x08b2,0x08b2,0x01ae,0x01ae,0x022b,0x022b,0x034b,0x034b
.short 0x081e,0x081e,0x0367,0x0367,0x060e,0x060e,0x0069,0x0069
.short 0x01a6,0x01a6,0x024b,0x024b,0x00b1,0x00b1,0x0c16,0x0c16
.short 0x0bde,0x0bde,0x0b35,0x0b35,0x0626,0x0626,0x0675,0x0675
.short 0x0c0b,0x0c0b,0x030a,0x030a,0x0487,0x0487,0x0c6e,0x0c6e
.short 0x09f8,0x09f8,0x05cb,0x05cb,0x0aa7,0x0aa7,0x045f,0x045f
.short 0x06cb,0x06cb,0x0284,0x0284,0x0999,0x0999,0x015d,0x015d
.short 0x01a2,0x01a2,0x0149,0x0149,0x0c65,0x0c65,0x0cb6,0x0cb6
.short 0x0331,0x0331,0x0449,0x0449,0x025b,0x025b,0x0262,0x0262
.short 0x052a,0x052a,0x07fc,0x07fc,0x0748,0x0748,0x0180,0x0180
.short 0x0842,0x0842,0x0c79,0x0c79,0x04c2,0x04c2,0x07ca,0x07ca
.short 0x0997,0x0997,0x00dc,0x00dc,0x085e,0x085e,0x0686,0x0686
.short 0x0860,0x0860,0x0707,0x0707,0x0803,0x0803,0x031a,0x031a
.short 0x071b,0x071b,0x09ab,0x09ab,0x099b,0x099b,0x01de,0x01de
.short 0x0c95,0x0c95,0x0bcd,0x0bcd,0x03e4,0x03e4,0x03df,0x03df
.short 0x03be,0x03be,0x074d,0x074d,0x05f2,0x05f2,0x065c,0x065c
/* L_mlkem_aarch64_zetas_qinv: companion table to L_mlkem_aarch64_zetas,
 * 288 halfwords (576 bytes) in the same order and with the same repetition
 * pattern (one copy for bytes 0..63, four copies for bytes 64..319, two
 * copies for bytes 320..575).
 *
 * mlkem_ntt addresses it through x3 with the same byte offsets it uses for
 * the zetas table: the value from this table feeds the low-half mul, the
 * matching zetas value feeds the sqrdmulh, and the two products are then
 * combined by the sub / shift-right-by-1 sequence.
 *
 * Each entry appears to be the matching zetas entry multiplied by 0xf301
 * modulo 2^16 (for example 0x08ed * 0xf301 = 0xffed and
 * 0x0a0b * 0xf301 = 0x7b0b); verified for the leading entries only -
 * TODO confirm for the full table against the generator script.
 *
 * NOTE(review): the Apple branch uses the writable __DATA,__data section
 * while ELF uses .rodata - confirm this is intentional.
 */
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_zetas_qinv, %object
.section .rodata
/* 36 rows * 8 halfwords * 2 bytes. */
.size L_mlkem_aarch64_zetas_qinv, 576
#else
.section __DATA,__data
#endif /* __APPLE__ */
/* Both branches request 4-byte alignment (2^2).
 * NOTE(review): rows are read with 16-byte loads; confirm that a
 * 4-byte-aligned 128-bit access is acceptable on every supported target.
 */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_aarch64_zetas_qinv:
/* Bytes 0..63: one copy per value, selected by lane index. */
.short 0xffed,0x7b0b,0x399a,0x0314,0x34d5,0xcf8e,0x6e1f,0xbeca
.short 0xae56,0x6c6e,0xf129,0xc2b6,0x29c2,0x054f,0xd43f,0x79bc
.short 0xe93d,0x43d4,0x9908,0x8e7f,0x15c4,0xfbb2,0x53bf,0x997f
.short 0x9258,0x5ef9,0xd6dc,0x2260,0x47fb,0x229b,0x6834,0xc0de
/* Bytes 64..319: four copies per value, used as whole vectors. */
.short 0xe9c7,0xe9c7,0xe9c7,0xe9c7,0xe68c,0xe68c,0xe68c,0xe68c
.short 0x05d9,0x05d9,0x05d9,0x05d9,0x78f7,0x78f7,0x78f7,0x78f7
.short 0xa3f4,0xa3f4,0xa3f4,0xa3f4,0x4ed3,0x4ed3,0x4ed3,0x4ed3
.short 0x50e7,0x50e7,0x50e7,0x50e7,0x61f9,0x61f9,0x61f9,0x61f9
.short 0xce04,0xce04,0xce04,0xce04,0x67f9,0x67f9,0x67f9,0x67f9
.short 0x3ec1,0x3ec1,0x3ec1,0x3ec1,0xcf67,0xcf67,0xcf67,0xcf67
.short 0x23af,0x23af,0x23af,0x23af,0xfd77,0xfd77,0xfd77,0xfd77
.short 0x9a7e,0x9a7e,0x9a7e,0x9a7e,0x6cbd,0x6cbd,0x6cbd,0x6cbd
.short 0x4dac,0x4dac,0x4dac,0x4dac,0x91a7,0x91a7,0x91a7,0x91a7
.short 0xc1f2,0xc1f2,0xc1f2,0xc1f2,0xdd3e,0xdd3e,0xdd3e,0xdd3e
.short 0x916b,0x916b,0x916b,0x916b,0x2374,0x2374,0x2374,0x2374
.short 0x8a0a,0x8a0a,0x8a0a,0x8a0a,0x474a,0x474a,0x474a,0x474a
.short 0x3473,0x3473,0x3473,0x3473,0x36c1,0x36c1,0x36c1,0x36c1
.short 0x8e1d,0x8e1d,0x8e1d,0x8e1d,0xce2c,0xce2c,0xce2c,0xce2c
.short 0x41c0,0x41c0,0x41c0,0x41c0,0x10d8,0x10d8,0x10d8,0x10d8
.short 0xa1a5,0xa1a5,0xa1a5,0xa1a5,0xba06,0xba06,0xba06,0xba06
/* Bytes 320..575: two copies per value, used as whole vectors. */
.short 0xfeb2,0xfeb2,0x2bae,0x2bae,0xd32b,0xd32b,0x344b,0x344b
.short 0x821e,0x821e,0xc867,0xc867,0x500e,0x500e,0xab69,0xab69
.short 0x93a6,0x93a6,0x334b,0x334b,0x03b1,0x03b1,0xee16,0xee16
.short 0xc5de,0xc5de,0x5a35,0x5a35,0x1826,0x1826,0x1575,0x1575
.short 0x7d0b,0x7d0b,0x810a,0x810a,0x2987,0x2987,0x766e,0x766e
.short 0x71f8,0x71f8,0xb6cb,0xb6cb,0x8fa7,0x8fa7,0x315f,0x315f
.short 0xb7cb,0xb7cb,0x4e84,0x4e84,0x4499,0x4499,0x485d,0x485d
.short 0xc7a2,0xc7a2,0x4c49,0x4c49,0xeb65,0xeb65,0xceb6,0xceb6
.short 0x8631,0x8631,0x4f49,0x4f49,0x635b,0x635b,0x0862,0x0862
.short 0xe32a,0xe32a,0x3bfc,0x3bfc,0x5f48,0x5f48,0x8180,0x8180
.short 0xae42,0xae42,0xe779,0xe779,0x2ac2,0x2ac2,0xc5ca,0xc5ca
.short 0x5e97,0x5e97,0xd4dc,0xd4dc,0x425e,0x425e,0x3886,0x3886
.short 0x2860,0x2860,0xac07,0xac07,0xe103,0xe103,0xb11a,0xb11a
.short 0xa81b,0xa81b,0x5aab,0x5aab,0x2a9b,0x2a9b,0xbbde,0xbbde
.short 0x7b95,0x7b95,0xa2cd,0xa2cd,0x6fe4,0x6fe4,0xb0df,0xb0df
.short 0x5dbe,0x5dbe,0x1e4d,0x1e4d,0xbbf2,0xbbf2,0x5a5c,0x5a5c
#ifndef __APPLE__
.text
.globl mlkem_ntt
.type mlkem_ntt,@function
.align 2
mlkem_ntt:
#else
.section __TEXT,__text
.globl _mlkem_ntt
.p2align 2
_mlkem_ntt:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
#ifndef __APPLE__
adrp x2, L_mlkem_aarch64_zetas
add x2, x2, :lo12:L_mlkem_aarch64_zetas
#else
adrp x2, L_mlkem_aarch64_zetas@PAGE
add x2, x2, L_mlkem_aarch64_zetas@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x3, L_mlkem_aarch64_zetas_qinv
add x3, x3, :lo12:L_mlkem_aarch64_zetas_qinv
#else
adrp x3, L_mlkem_aarch64_zetas_qinv@PAGE
add x3, x3, L_mlkem_aarch64_zetas_qinv@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x4, L_mlkem_aarch64_consts
add x4, x4, :lo12:L_mlkem_aarch64_consts
#else
adrp x4, L_mlkem_aarch64_consts@PAGE
add x4, x4, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
add x1, x0, #0x100
ldr q4, [x4]
ldr q5, [x0]
ldr q6, [x0, #32]
ldr q7, [x0, #64]
ldr q8, [x0, #96]
ldr q9, [x0, #128]
ldr q10, [x0, #160]
ldr q11, [x0, #192]
ldr q12, [x0, #224]
ldr q13, [x1]
ldr q14, [x1, #32]
ldr q15, [x1, #64]
ldr q16, [x1, #96]
ldr q17, [x1, #128]
ldr q18, [x1, #160]
ldr q19, [x1, #192]
ldr q20, [x1, #224]
ldr q0, [x2]
ldr q1, [x3]
mul v29.8h, v13.8h, v1.h[1]
mul v30.8h, v14.8h, v1.h[1]
sqrdmulh v21.8h, v13.8h, v0.h[1]
sqrdmulh v22.8h, v14.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v15.8h, v1.h[1]
mul v30.8h, v16.8h, v1.h[1]
sqrdmulh v23.8h, v15.8h, v0.h[1]
sqrdmulh v24.8h, v16.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v17.8h, v1.h[1]
mul v30.8h, v18.8h, v1.h[1]
sqrdmulh v25.8h, v17.8h, v0.h[1]
sqrdmulh v26.8h, v18.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[1]
mul v30.8h, v20.8h, v1.h[1]
sqrdmulh v27.8h, v19.8h, v0.h[1]
sqrdmulh v28.8h, v20.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v13.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v14.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v15.8h, v7.8h, v23.8h
add v7.8h, v7.8h, v23.8h
sub v16.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
sub v17.8h, v9.8h, v25.8h
add v9.8h, v9.8h, v25.8h
sub v18.8h, v10.8h, v26.8h
add v10.8h, v10.8h, v26.8h
sub v19.8h, v11.8h, v27.8h
add v11.8h, v11.8h, v27.8h
sub v20.8h, v12.8h, v28.8h
add v12.8h, v12.8h, v28.8h
mul v29.8h, v9.8h, v1.h[2]
mul v30.8h, v10.8h, v1.h[2]
sqrdmulh v21.8h, v9.8h, v0.h[2]
sqrdmulh v22.8h, v10.8h, v0.h[2]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v11.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[2]
sqrdmulh v23.8h, v11.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[2]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v17.8h, v1.h[3]
mul v30.8h, v18.8h, v1.h[3]
sqrdmulh v25.8h, v17.8h, v0.h[3]
sqrdmulh v26.8h, v18.8h, v0.h[3]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[3]
mul v30.8h, v20.8h, v1.h[3]
sqrdmulh v27.8h, v19.8h, v0.h[3]
sqrdmulh v28.8h, v20.8h, v0.h[3]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v9.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v10.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v11.8h, v7.8h, v23.8h
add v7.8h, v7.8h, v23.8h
sub v12.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
sub v17.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v18.8h, v14.8h, v26.8h
add v14.8h, v14.8h, v26.8h
sub v19.8h, v15.8h, v27.8h
add v15.8h, v15.8h, v27.8h
sub v20.8h, v16.8h, v28.8h
add v16.8h, v16.8h, v28.8h
mul v29.8h, v7.8h, v1.h[4]
mul v30.8h, v8.8h, v1.h[4]
sqrdmulh v21.8h, v7.8h, v0.h[4]
sqrdmulh v22.8h, v8.8h, v0.h[4]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v11.8h, v1.h[5]
mul v30.8h, v12.8h, v1.h[5]
sqrdmulh v23.8h, v11.8h, v0.h[5]
sqrdmulh v24.8h, v12.8h, v0.h[5]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v15.8h, v1.h[6]
mul v30.8h, v16.8h, v1.h[6]
sqrdmulh v25.8h, v15.8h, v0.h[6]
sqrdmulh v26.8h, v16.8h, v0.h[6]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[7]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v19.8h, v0.h[7]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v7.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v11.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v10.8h, v24.8h
add v10.8h, v10.8h, v24.8h
sub v15.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v14.8h, v26.8h
add v14.8h, v14.8h, v26.8h
sub v19.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v18.8h, v28.8h
add v18.8h, v18.8h, v28.8h
ldr q0, [x2, #16]
ldr q1, [x3, #16]
mul v29.8h, v6.8h, v1.h[0]
mul v30.8h, v8.8h, v1.h[1]
sqrdmulh v21.8h, v6.8h, v0.h[0]
sqrdmulh v22.8h, v8.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v10.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[3]
sqrdmulh v23.8h, v10.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[3]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v14.8h, v1.h[4]
mul v30.8h, v16.8h, v1.h[5]
sqrdmulh v25.8h, v14.8h, v0.h[4]
sqrdmulh v26.8h, v16.8h, v0.h[5]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v18.8h, v1.h[6]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v18.8h, v0.h[6]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
str q5, [x0]
str q6, [x0, #32]
str q7, [x0, #64]
str q8, [x0, #96]
str q9, [x0, #128]
str q10, [x0, #160]
str q11, [x0, #192]
str q12, [x0, #224]
str q13, [x1]
str q14, [x1, #32]
str q15, [x1, #64]
str q16, [x1, #96]
str q17, [x1, #128]
str q18, [x1, #160]
str q19, [x1, #192]
str q20, [x1, #224]
ldr q5, [x0, #16]
ldr q6, [x0, #48]
ldr q7, [x0, #80]
ldr q8, [x0, #112]
ldr q9, [x0, #144]
ldr q10, [x0, #176]
ldr q11, [x0, #208]
ldr q12, [x0, #240]
ldr q13, [x1, #16]
ldr q14, [x1, #48]
ldr q15, [x1, #80]
ldr q16, [x1, #112]
ldr q17, [x1, #144]
ldr q18, [x1, #176]
ldr q19, [x1, #208]
ldr q20, [x1, #240]
ldr q0, [x2]
ldr q1, [x3]
mul v29.8h, v13.8h, v1.h[1]
mul v30.8h, v14.8h, v1.h[1]
sqrdmulh v21.8h, v13.8h, v0.h[1]
sqrdmulh v22.8h, v14.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v15.8h, v1.h[1]
mul v30.8h, v16.8h, v1.h[1]
sqrdmulh v23.8h, v15.8h, v0.h[1]
sqrdmulh v24.8h, v16.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v17.8h, v1.h[1]
mul v30.8h, v18.8h, v1.h[1]
sqrdmulh v25.8h, v17.8h, v0.h[1]
sqrdmulh v26.8h, v18.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[1]
mul v30.8h, v20.8h, v1.h[1]
sqrdmulh v27.8h, v19.8h, v0.h[1]
sqrdmulh v28.8h, v20.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v13.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v14.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v15.8h, v7.8h, v23.8h
add v7.8h, v7.8h, v23.8h
sub v16.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
sub v17.8h, v9.8h, v25.8h
add v9.8h, v9.8h, v25.8h
sub v18.8h, v10.8h, v26.8h
add v10.8h, v10.8h, v26.8h
sub v19.8h, v11.8h, v27.8h
add v11.8h, v11.8h, v27.8h
sub v20.8h, v12.8h, v28.8h
add v12.8h, v12.8h, v28.8h
mul v29.8h, v9.8h, v1.h[2]
mul v30.8h, v10.8h, v1.h[2]
sqrdmulh v21.8h, v9.8h, v0.h[2]
sqrdmulh v22.8h, v10.8h, v0.h[2]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v11.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[2]
sqrdmulh v23.8h, v11.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[2]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v17.8h, v1.h[3]
mul v30.8h, v18.8h, v1.h[3]
sqrdmulh v25.8h, v17.8h, v0.h[3]
sqrdmulh v26.8h, v18.8h, v0.h[3]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[3]
mul v30.8h, v20.8h, v1.h[3]
sqrdmulh v27.8h, v19.8h, v0.h[3]
sqrdmulh v28.8h, v20.8h, v0.h[3]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v9.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v10.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v11.8h, v7.8h, v23.8h
add v7.8h, v7.8h, v23.8h
sub v12.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
sub v17.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v18.8h, v14.8h, v26.8h
add v14.8h, v14.8h, v26.8h
sub v19.8h, v15.8h, v27.8h
add v15.8h, v15.8h, v27.8h
sub v20.8h, v16.8h, v28.8h
add v16.8h, v16.8h, v28.8h
mul v29.8h, v7.8h, v1.h[4]
mul v30.8h, v8.8h, v1.h[4]
sqrdmulh v21.8h, v7.8h, v0.h[4]
sqrdmulh v22.8h, v8.8h, v0.h[4]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v11.8h, v1.h[5]
mul v30.8h, v12.8h, v1.h[5]
sqrdmulh v23.8h, v11.8h, v0.h[5]
sqrdmulh v24.8h, v12.8h, v0.h[5]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v15.8h, v1.h[6]
mul v30.8h, v16.8h, v1.h[6]
sqrdmulh v25.8h, v15.8h, v0.h[6]
sqrdmulh v26.8h, v16.8h, v0.h[6]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[7]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v19.8h, v0.h[7]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v7.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v11.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v10.8h, v24.8h
add v10.8h, v10.8h, v24.8h
sub v15.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v14.8h, v26.8h
add v14.8h, v14.8h, v26.8h
sub v19.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v18.8h, v28.8h
add v18.8h, v18.8h, v28.8h
ldr q0, [x2, #16]
ldr q1, [x3, #16]
mul v29.8h, v6.8h, v1.h[0]
mul v30.8h, v8.8h, v1.h[1]
sqrdmulh v21.8h, v6.8h, v0.h[0]
sqrdmulh v22.8h, v8.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v10.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[3]
sqrdmulh v23.8h, v10.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[3]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v14.8h, v1.h[4]
mul v30.8h, v16.8h, v1.h[5]
sqrdmulh v25.8h, v14.8h, v0.h[4]
sqrdmulh v26.8h, v16.8h, v0.h[5]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v18.8h, v1.h[6]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v18.8h, v0.h[6]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
str q5, [x0, #16]
str q6, [x0, #48]
str q7, [x0, #80]
str q8, [x0, #112]
str q9, [x0, #144]
str q10, [x0, #176]
str q11, [x0, #208]
str q12, [x0, #240]
str q13, [x1, #16]
str q14, [x1, #48]
str q15, [x1, #80]
str q16, [x1, #112]
str q17, [x1, #144]
str q18, [x1, #176]
str q19, [x1, #208]
str q20, [x1, #240]
ldp q5, q6, [x0]
ldp q7, q8, [x0, #32]
ldp q9, q10, [x0, #64]
ldp q11, q12, [x0, #96]
ldp q13, q14, [x0, #128]
ldp q15, q16, [x0, #160]
ldp q17, q18, [x0, #192]
ldp q19, q20, [x0, #224]
ldr q0, [x2, #32]
ldr q1, [x3, #32]
mul v29.8h, v6.8h, v1.h[0]
mul v30.8h, v8.8h, v1.h[1]
sqrdmulh v21.8h, v6.8h, v0.h[0]
sqrdmulh v22.8h, v8.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v10.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[3]
sqrdmulh v23.8h, v10.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[3]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v14.8h, v1.h[4]
mul v30.8h, v16.8h, v1.h[5]
sqrdmulh v25.8h, v14.8h, v0.h[4]
sqrdmulh v26.8h, v16.8h, v0.h[5]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v18.8h, v1.h[6]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v18.8h, v0.h[6]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
ldr q0, [x2, #64]
ldr q2, [x2, #80]
ldr q1, [x3, #64]
ldr q3, [x3, #80]
mov v29.16b, v5.16b
mov v30.16b, v7.16b
trn1 v5.2d, v5.2d, v6.2d
trn1 v7.2d, v7.2d, v8.2d
trn2 v6.2d, v29.2d, v6.2d
trn2 v8.2d, v30.2d, v8.2d
mul v29.8h, v6.8h, v1.8h
mul v30.8h, v8.8h, v3.8h
sqrdmulh v21.8h, v6.8h, v0.8h
sqrdmulh v22.8h, v8.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
ldr q0, [x2, #96]
ldr q2, [x2, #112]
ldr q1, [x3, #96]
ldr q3, [x3, #112]
mov v29.16b, v9.16b
mov v30.16b, v11.16b
trn1 v9.2d, v9.2d, v10.2d
trn1 v11.2d, v11.2d, v12.2d
trn2 v10.2d, v29.2d, v10.2d
trn2 v12.2d, v30.2d, v12.2d
mul v29.8h, v10.8h, v1.8h
mul v30.8h, v12.8h, v3.8h
sqrdmulh v23.8h, v10.8h, v0.8h
sqrdmulh v24.8h, v12.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #128]
ldr q2, [x2, #144]
ldr q1, [x3, #128]
ldr q3, [x3, #144]
mov v29.16b, v13.16b
mov v30.16b, v15.16b
trn1 v13.2d, v13.2d, v14.2d
trn1 v15.2d, v15.2d, v16.2d
trn2 v14.2d, v29.2d, v14.2d
trn2 v16.2d, v30.2d, v16.2d
mul v29.8h, v14.8h, v1.8h
mul v30.8h, v16.8h, v3.8h
sqrdmulh v25.8h, v14.8h, v0.8h
sqrdmulh v26.8h, v16.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
ldr q0, [x2, #160]
ldr q2, [x2, #176]
ldr q1, [x3, #160]
ldr q3, [x3, #176]
mov v29.16b, v17.16b
mov v30.16b, v19.16b
trn1 v17.2d, v17.2d, v18.2d
trn1 v19.2d, v19.2d, v20.2d
trn2 v18.2d, v29.2d, v18.2d
trn2 v20.2d, v30.2d, v20.2d
mul v29.8h, v18.8h, v1.8h
mul v30.8h, v20.8h, v3.8h
sqrdmulh v27.8h, v18.8h, v0.8h
sqrdmulh v28.8h, v20.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
ldr q0, [x2, #320]
ldr q2, [x2, #336]
ldr q1, [x3, #320]
ldr q3, [x3, #336]
mov v29.16b, v5.16b
mov v30.16b, v7.16b
trn1 v5.4s, v5.4s, v6.4s
trn1 v7.4s, v7.4s, v8.4s
trn2 v6.4s, v29.4s, v6.4s
trn2 v8.4s, v30.4s, v8.4s
mul v29.8h, v6.8h, v1.8h
mul v30.8h, v8.8h, v3.8h
sqrdmulh v21.8h, v6.8h, v0.8h
sqrdmulh v22.8h, v8.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
ldr q0, [x2, #352]
ldr q2, [x2, #368]
ldr q1, [x3, #352]
ldr q3, [x3, #368]
mov v29.16b, v9.16b
mov v30.16b, v11.16b
trn1 v9.4s, v9.4s, v10.4s
trn1 v11.4s, v11.4s, v12.4s
trn2 v10.4s, v29.4s, v10.4s
trn2 v12.4s, v30.4s, v12.4s
mul v29.8h, v10.8h, v1.8h
mul v30.8h, v12.8h, v3.8h
sqrdmulh v23.8h, v10.8h, v0.8h
sqrdmulh v24.8h, v12.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #384]
ldr q2, [x2, #400]
ldr q1, [x3, #384]
ldr q3, [x3, #400]
mov v29.16b, v13.16b
mov v30.16b, v15.16b
trn1 v13.4s, v13.4s, v14.4s
trn1 v15.4s, v15.4s, v16.4s
trn2 v14.4s, v29.4s, v14.4s
trn2 v16.4s, v30.4s, v16.4s
mul v29.8h, v14.8h, v1.8h
mul v30.8h, v16.8h, v3.8h
sqrdmulh v25.8h, v14.8h, v0.8h
sqrdmulh v26.8h, v16.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
ldr q0, [x2, #416]
ldr q2, [x2, #432]
ldr q1, [x3, #416]
ldr q3, [x3, #432]
mov v29.16b, v17.16b
mov v30.16b, v19.16b
trn1 v17.4s, v17.4s, v18.4s
trn1 v19.4s, v19.4s, v20.4s
trn2 v18.4s, v29.4s, v18.4s
trn2 v20.4s, v30.4s, v20.4s
mul v29.8h, v18.8h, v1.8h
mul v30.8h, v20.8h, v3.8h
sqrdmulh v27.8h, v18.8h, v0.8h
sqrdmulh v28.8h, v20.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
sqdmulh v21.8h, v5.8h, v4.h[2]
sqdmulh v22.8h, v6.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v5.8h, v21.8h, v4.h[0]
mls v6.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v7.8h, v4.h[2]
sqdmulh v22.8h, v8.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v7.8h, v21.8h, v4.h[0]
mls v8.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v9.8h, v4.h[2]
sqdmulh v22.8h, v10.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v9.8h, v21.8h, v4.h[0]
mls v10.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v11.8h, v4.h[2]
sqdmulh v22.8h, v12.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v11.8h, v21.8h, v4.h[0]
mls v12.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v13.8h, v4.h[2]
sqdmulh v22.8h, v14.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v13.8h, v21.8h, v4.h[0]
mls v14.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v15.8h, v4.h[2]
sqdmulh v22.8h, v16.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v15.8h, v21.8h, v4.h[0]
mls v16.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v17.8h, v4.h[2]
sqdmulh v22.8h, v18.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v17.8h, v21.8h, v4.h[0]
mls v18.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v19.8h, v4.h[2]
sqdmulh v22.8h, v20.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v19.8h, v21.8h, v4.h[0]
mls v20.8h, v22.8h, v4.h[0]
mov v29.16b, v5.16b
trn1 v5.4s, v5.4s, v6.4s
trn2 v6.4s, v29.4s, v6.4s
mov v29.16b, v5.16b
trn1 v5.2d, v5.2d, v6.2d
trn2 v6.2d, v29.2d, v6.2d
mov v29.16b, v7.16b
trn1 v7.4s, v7.4s, v8.4s
trn2 v8.4s, v29.4s, v8.4s
mov v29.16b, v7.16b
trn1 v7.2d, v7.2d, v8.2d
trn2 v8.2d, v29.2d, v8.2d
mov v29.16b, v9.16b
trn1 v9.4s, v9.4s, v10.4s
trn2 v10.4s, v29.4s, v10.4s
mov v29.16b, v9.16b
trn1 v9.2d, v9.2d, v10.2d
trn2 v10.2d, v29.2d, v10.2d
mov v29.16b, v11.16b
trn1 v11.4s, v11.4s, v12.4s
trn2 v12.4s, v29.4s, v12.4s
mov v29.16b, v11.16b
trn1 v11.2d, v11.2d, v12.2d
trn2 v12.2d, v29.2d, v12.2d
mov v29.16b, v13.16b
trn1 v13.4s, v13.4s, v14.4s
trn2 v14.4s, v29.4s, v14.4s
mov v29.16b, v13.16b
trn1 v13.2d, v13.2d, v14.2d
trn2 v14.2d, v29.2d, v14.2d
mov v29.16b, v15.16b
trn1 v15.4s, v15.4s, v16.4s
trn2 v16.4s, v29.4s, v16.4s
mov v29.16b, v15.16b
trn1 v15.2d, v15.2d, v16.2d
trn2 v16.2d, v29.2d, v16.2d
mov v29.16b, v17.16b
trn1 v17.4s, v17.4s, v18.4s
trn2 v18.4s, v29.4s, v18.4s
mov v29.16b, v17.16b
trn1 v17.2d, v17.2d, v18.2d
trn2 v18.2d, v29.2d, v18.2d
mov v29.16b, v19.16b
trn1 v19.4s, v19.4s, v20.4s
trn2 v20.4s, v29.4s, v20.4s
mov v29.16b, v19.16b
trn1 v19.2d, v19.2d, v20.2d
trn2 v20.2d, v29.2d, v20.2d
stp q5, q6, [x0]
stp q7, q8, [x0, #32]
stp q9, q10, [x0, #64]
stp q11, q12, [x0, #96]
stp q13, q14, [x0, #128]
stp q15, q16, [x0, #160]
stp q17, q18, [x0, #192]
stp q19, q20, [x0, #224]
ldp q5, q6, [x1]
ldp q7, q8, [x1, #32]
ldp q9, q10, [x1, #64]
ldp q11, q12, [x1, #96]
ldp q13, q14, [x1, #128]
ldp q15, q16, [x1, #160]
ldp q17, q18, [x1, #192]
ldp q19, q20, [x1, #224]
ldr q0, [x2, #48]
ldr q1, [x3, #48]
mul v29.8h, v6.8h, v1.h[0]
mul v30.8h, v8.8h, v1.h[1]
sqrdmulh v21.8h, v6.8h, v0.h[0]
sqrdmulh v22.8h, v8.8h, v0.h[1]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v10.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[3]
sqrdmulh v23.8h, v10.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[3]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v14.8h, v1.h[4]
mul v30.8h, v16.8h, v1.h[5]
sqrdmulh v25.8h, v14.8h, v0.h[4]
sqrdmulh v26.8h, v16.8h, v0.h[5]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v18.8h, v1.h[6]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v18.8h, v0.h[6]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
ldr q0, [x2, #192]
ldr q2, [x2, #208]
ldr q1, [x3, #192]
ldr q3, [x3, #208]
mov v29.16b, v5.16b
mov v30.16b, v7.16b
trn1 v5.2d, v5.2d, v6.2d
trn1 v7.2d, v7.2d, v8.2d
trn2 v6.2d, v29.2d, v6.2d
trn2 v8.2d, v30.2d, v8.2d
mul v29.8h, v6.8h, v1.8h
mul v30.8h, v8.8h, v3.8h
sqrdmulh v21.8h, v6.8h, v0.8h
sqrdmulh v22.8h, v8.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
ldr q0, [x2, #224]
ldr q2, [x2, #240]
ldr q1, [x3, #224]
ldr q3, [x3, #240]
mov v29.16b, v9.16b
mov v30.16b, v11.16b
trn1 v9.2d, v9.2d, v10.2d
trn1 v11.2d, v11.2d, v12.2d
trn2 v10.2d, v29.2d, v10.2d
trn2 v12.2d, v30.2d, v12.2d
mul v29.8h, v10.8h, v1.8h
mul v30.8h, v12.8h, v3.8h
sqrdmulh v23.8h, v10.8h, v0.8h
sqrdmulh v24.8h, v12.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #256]
ldr q2, [x2, #272]
ldr q1, [x3, #256]
ldr q3, [x3, #272]
mov v29.16b, v13.16b
mov v30.16b, v15.16b
trn1 v13.2d, v13.2d, v14.2d
trn1 v15.2d, v15.2d, v16.2d
trn2 v14.2d, v29.2d, v14.2d
trn2 v16.2d, v30.2d, v16.2d
mul v29.8h, v14.8h, v1.8h
mul v30.8h, v16.8h, v3.8h
sqrdmulh v25.8h, v14.8h, v0.8h
sqrdmulh v26.8h, v16.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
ldr q0, [x2, #288]
ldr q2, [x2, #304]
ldr q1, [x3, #288]
ldr q3, [x3, #304]
mov v29.16b, v17.16b
mov v30.16b, v19.16b
trn1 v17.2d, v17.2d, v18.2d
trn1 v19.2d, v19.2d, v20.2d
trn2 v18.2d, v29.2d, v18.2d
trn2 v20.2d, v30.2d, v20.2d
mul v29.8h, v18.8h, v1.8h
mul v30.8h, v20.8h, v3.8h
sqrdmulh v27.8h, v18.8h, v0.8h
sqrdmulh v28.8h, v20.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
ldr q0, [x2, #448]
ldr q2, [x2, #464]
ldr q1, [x3, #448]
ldr q3, [x3, #464]
mov v29.16b, v5.16b
mov v30.16b, v7.16b
trn1 v5.4s, v5.4s, v6.4s
trn1 v7.4s, v7.4s, v8.4s
trn2 v6.4s, v29.4s, v6.4s
trn2 v8.4s, v30.4s, v8.4s
mul v29.8h, v6.8h, v1.8h
mul v30.8h, v8.8h, v3.8h
sqrdmulh v21.8h, v6.8h, v0.8h
sqrdmulh v22.8h, v8.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v21.8h, v21.8h, v29.8h
sub v22.8h, v22.8h, v30.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
ldr q0, [x2, #480]
ldr q2, [x2, #496]
ldr q1, [x3, #480]
ldr q3, [x3, #496]
mov v29.16b, v9.16b
mov v30.16b, v11.16b
trn1 v9.4s, v9.4s, v10.4s
trn1 v11.4s, v11.4s, v12.4s
trn2 v10.4s, v29.4s, v10.4s
trn2 v12.4s, v30.4s, v12.4s
mul v29.8h, v10.8h, v1.8h
mul v30.8h, v12.8h, v3.8h
sqrdmulh v23.8h, v10.8h, v0.8h
sqrdmulh v24.8h, v12.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v23.8h, v23.8h, v29.8h
sub v24.8h, v24.8h, v30.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #512]
ldr q2, [x2, #528]
ldr q1, [x3, #512]
ldr q3, [x3, #528]
mov v29.16b, v13.16b
mov v30.16b, v15.16b
trn1 v13.4s, v13.4s, v14.4s
trn1 v15.4s, v15.4s, v16.4s
trn2 v14.4s, v29.4s, v14.4s
trn2 v16.4s, v30.4s, v16.4s
mul v29.8h, v14.8h, v1.8h
mul v30.8h, v16.8h, v3.8h
sqrdmulh v25.8h, v14.8h, v0.8h
sqrdmulh v26.8h, v16.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v25.8h, v25.8h, v29.8h
sub v26.8h, v26.8h, v30.8h
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
ldr q0, [x2, #544]
ldr q2, [x2, #560]
ldr q1, [x3, #544]
ldr q3, [x3, #560]
mov v29.16b, v17.16b
mov v30.16b, v19.16b
trn1 v17.4s, v17.4s, v18.4s
trn1 v19.4s, v19.4s, v20.4s
trn2 v18.4s, v29.4s, v18.4s
trn2 v20.4s, v30.4s, v20.4s
mul v29.8h, v18.8h, v1.8h
mul v30.8h, v20.8h, v3.8h
sqrdmulh v27.8h, v18.8h, v0.8h
sqrdmulh v28.8h, v20.8h, v2.8h
sqrdmulh v29.8h, v29.8h, v4.h[0]
sqrdmulh v30.8h, v30.8h, v4.h[0]
sub v27.8h, v27.8h, v29.8h
sub v28.8h, v28.8h, v30.8h
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
sqdmulh v21.8h, v5.8h, v4.h[2]
sqdmulh v22.8h, v6.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v5.8h, v21.8h, v4.h[0]
mls v6.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v7.8h, v4.h[2]
sqdmulh v22.8h, v8.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v7.8h, v21.8h, v4.h[0]
mls v8.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v9.8h, v4.h[2]
sqdmulh v22.8h, v10.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v9.8h, v21.8h, v4.h[0]
mls v10.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v11.8h, v4.h[2]
sqdmulh v22.8h, v12.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v11.8h, v21.8h, v4.h[0]
mls v12.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v13.8h, v4.h[2]
sqdmulh v22.8h, v14.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v13.8h, v21.8h, v4.h[0]
mls v14.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v15.8h, v4.h[2]
sqdmulh v22.8h, v16.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v15.8h, v21.8h, v4.h[0]
mls v16.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v17.8h, v4.h[2]
sqdmulh v22.8h, v18.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v17.8h, v21.8h, v4.h[0]
mls v18.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v19.8h, v4.h[2]
sqdmulh v22.8h, v20.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v19.8h, v21.8h, v4.h[0]
mls v20.8h, v22.8h, v4.h[0]
mov v29.16b, v5.16b
trn1 v5.4s, v5.4s, v6.4s
trn2 v6.4s, v29.4s, v6.4s
mov v29.16b, v5.16b
trn1 v5.2d, v5.2d, v6.2d
trn2 v6.2d, v29.2d, v6.2d
mov v29.16b, v7.16b
trn1 v7.4s, v7.4s, v8.4s
trn2 v8.4s, v29.4s, v8.4s
mov v29.16b, v7.16b
trn1 v7.2d, v7.2d, v8.2d
trn2 v8.2d, v29.2d, v8.2d
mov v29.16b, v9.16b
trn1 v9.4s, v9.4s, v10.4s
trn2 v10.4s, v29.4s, v10.4s
mov v29.16b, v9.16b
trn1 v9.2d, v9.2d, v10.2d
trn2 v10.2d, v29.2d, v10.2d
mov v29.16b, v11.16b
trn1 v11.4s, v11.4s, v12.4s
trn2 v12.4s, v29.4s, v12.4s
mov v29.16b, v11.16b
trn1 v11.2d, v11.2d, v12.2d
trn2 v12.2d, v29.2d, v12.2d
mov v29.16b, v13.16b
trn1 v13.4s, v13.4s, v14.4s
trn2 v14.4s, v29.4s, v14.4s
mov v29.16b, v13.16b
trn1 v13.2d, v13.2d, v14.2d
trn2 v14.2d, v29.2d, v14.2d
mov v29.16b, v15.16b
trn1 v15.4s, v15.4s, v16.4s
trn2 v16.4s, v29.4s, v16.4s
mov v29.16b, v15.16b
trn1 v15.2d, v15.2d, v16.2d
trn2 v16.2d, v29.2d, v16.2d
mov v29.16b, v17.16b
trn1 v17.4s, v17.4s, v18.4s
trn2 v18.4s, v29.4s, v18.4s
mov v29.16b, v17.16b
trn1 v17.2d, v17.2d, v18.2d
trn2 v18.2d, v29.2d, v18.2d
mov v29.16b, v19.16b
trn1 v19.4s, v19.4s, v20.4s
trn2 v20.4s, v29.4s, v20.4s
mov v29.16b, v19.16b
trn1 v19.2d, v19.2d, v20.2d
trn2 v20.2d, v29.2d, v20.2d
stp q5, q6, [x1]
stp q7, q8, [x1, #32]
stp q9, q10, [x1, #64]
stp q11, q12, [x1, #96]
stp q13, q14, [x1, #128]
stp q15, q16, [x1, #160]
stp q17, q18, [x1, #192]
stp q19, q20, [x1, #224]
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_ntt,.-mlkem_ntt
#endif /* __APPLE__ */
/* L_mlkem_aarch64_zetas_inv
 *
 * Table of 288 16-bit constants (36 rows x 8 halfwords = 576 bytes, which
 * matches the .size directive below). mlkem_invntt takes the address of this
 * table in x2 and reads it with 128-bit "ldr q" loads; the loaded values are
 * used as the multiplier operand of sqrdmulh in each butterfly.
 *
 * Placement: .rodata on non-Apple targets, __DATA,__data on Apple targets.
 * NOTE(review): the requested alignment is 2^2 = 4 bytes (.p2align 2 on
 * Apple; .align 2 is presumably the same power-of-two form for the GNU
 * AArch64 assembler), while the consumers use 16-byte loads - confirm that
 * unaligned vector loads are acceptable on every supported target.
 */
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_zetas_inv, %object
.section .rodata
.size L_mlkem_aarch64_zetas_inv, 576
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_aarch64_zetas_inv:
/* Bytes 0..255: every value is stored twice in adjacent halfwords.
 * mlkem_invntt loads offsets 0..112 for the first 256 bytes of its input
 * and offsets 128..240 for the second 256 bytes, as whole-vector operands. */
.short 0x06a5,0x06a5,0x070f,0x070f,0x05b4,0x05b4,0x0943,0x0943
.short 0x0922,0x0922,0x091d,0x091d,0x0134,0x0134,0x006c,0x006c
.short 0x0b23,0x0b23,0x0366,0x0366,0x0356,0x0356,0x05e6,0x05e6
.short 0x09e7,0x09e7,0x04fe,0x04fe,0x05fa,0x05fa,0x04a1,0x04a1
.short 0x067b,0x067b,0x04a3,0x04a3,0x0c25,0x0c25,0x036a,0x036a
.short 0x0537,0x0537,0x083f,0x083f,0x0088,0x0088,0x04bf,0x04bf
.short 0x0b81,0x0b81,0x05b9,0x05b9,0x0505,0x0505,0x07d7,0x07d7
.short 0x0a9f,0x0a9f,0x0aa6,0x0aa6,0x08b8,0x08b8,0x09d0,0x09d0
.short 0x004b,0x004b,0x009c,0x009c,0x0bb8,0x0bb8,0x0b5f,0x0b5f
.short 0x0ba4,0x0ba4,0x0368,0x0368,0x0a7d,0x0a7d,0x0636,0x0636
.short 0x08a2,0x08a2,0x025a,0x025a,0x0736,0x0736,0x0309,0x0309
.short 0x0093,0x0093,0x087a,0x087a,0x09f7,0x09f7,0x00f6,0x00f6
.short 0x068c,0x068c,0x06db,0x06db,0x01cc,0x01cc,0x0123,0x0123
.short 0x00eb,0x00eb,0x0c50,0x0c50,0x0ab6,0x0ab6,0x0b5b,0x0b5b
.short 0x0c98,0x0c98,0x06f3,0x06f3,0x099a,0x099a,0x04e3,0x04e3
.short 0x09b6,0x09b6,0x0ad6,0x0ad6,0x0b53,0x0b53,0x044f,0x044f
/* Bytes 256..511: every value is stored four times in adjacent halfwords.
 * mlkem_invntt loads offsets 256..368 for the first 256 bytes of its input
 * and offsets 384..496 for the second 256 bytes, as whole-vector operands. */
.short 0x04fb,0x04fb,0x04fb,0x04fb,0x0a5c,0x0a5c,0x0a5c,0x0a5c
.short 0x0429,0x0429,0x0429,0x0429,0x0b41,0x0b41,0x0b41,0x0b41
.short 0x02d5,0x02d5,0x02d5,0x02d5,0x05e4,0x05e4,0x05e4,0x05e4
.short 0x0940,0x0940,0x0940,0x0940,0x018e,0x018e,0x018e,0x018e
.short 0x03b7,0x03b7,0x03b7,0x03b7,0x00f7,0x00f7,0x00f7,0x00f7
.short 0x058d,0x058d,0x058d,0x058d,0x0c96,0x0c96,0x0c96,0x0c96
.short 0x09c3,0x09c3,0x09c3,0x09c3,0x010f,0x010f,0x010f,0x010f
.short 0x005a,0x005a,0x005a,0x005a,0x0355,0x0355,0x0355,0x0355
.short 0x0744,0x0744,0x0744,0x0744,0x0c83,0x0c83,0x0c83,0x0c83
.short 0x048a,0x048a,0x048a,0x048a,0x0652,0x0652,0x0652,0x0652
.short 0x029a,0x029a,0x029a,0x029a,0x0140,0x0140,0x0140,0x0140
.short 0x0008,0x0008,0x0008,0x0008,0x0afd,0x0afd,0x0afd,0x0afd
.short 0x0608,0x0608,0x0608,0x0608,0x011a,0x011a,0x011a,0x011a
.short 0x072e,0x072e,0x072e,0x072e,0x050d,0x050d,0x050d,0x050d
.short 0x090a,0x090a,0x090a,0x090a,0x0228,0x0228,0x0228,0x0228
.short 0x0a75,0x0a75,0x0a75,0x0a75,0x083a,0x083a,0x083a,0x083a
/* Bytes 512..575: values stored once each. mlkem_invntt loads offset 512
 * (first 256 input bytes) and offset 528 (second 256 input bytes) into v0
 * and selects one halfword per butterfly by lane index (v0.h[n]); offsets
 * 544 and 560 are loaded into v4 and v5 for the later by-lane stages. */
.short 0x0623,0x00cd,0x0b66,0x0606,0x0aa1,0x0a25,0x0908,0x02a9
.short 0x0082,0x0642,0x074f,0x033d,0x0b82,0x0bf9,0x052d,0x0ac4
.short 0x0745,0x05c2,0x04b2,0x093f,0x0c4b,0x06d8,0x0a93,0x00ab
.short 0x0c37,0x0be2,0x0773,0x072c,0x05ed,0x0167,0x02f6,0x05a1
/* L_mlkem_aarch64_zetas_inv_qinv
 *
 * Companion table to L_mlkem_aarch64_zetas_inv with the same shape: 288
 * 16-bit constants (36 rows x 8 halfwords = 576 bytes, matching .size) and
 * the same three regions (values doubled, values quadrupled, values single).
 * mlkem_invntt takes its address in x3 and always loads it at the same byte
 * offset as the x2 table.
 *
 * In each butterfly the value from this table is the operand of the plain
 * "mul"; that product is then passed through sqrdmulh by v8.h[0] (first
 * halfword of L_mlkem_aarch64_consts, 0x0d01), subtracted from the sqrdmulh
 * result obtained with the zetas_inv entry, and the difference is shifted
 * right by one.
 *
 * Spot-checked entries equal the zetas_inv entry at the same index times
 * 0xf301 (second halfword of L_mlkem_aarch64_consts) modulo 2^16, e.g.
 * 0x06a5 * 0xf301 = 0xa5a5 and 0x0623 * 0xf301 = 0x3f23 (mod 2^16).
 * NOTE(review): presumably this holds for every entry - confirm against the
 * generator script before relying on it.
 *
 * Placement: .rodata on non-Apple targets, __DATA,__data on Apple targets,
 * with 2^2 = 4 byte alignment requested (.p2align 2 on Apple).
 */
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_zetas_inv_qinv, %object
.section .rodata
.size L_mlkem_aarch64_zetas_inv_qinv, 576
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_aarch64_zetas_inv_qinv:
/* Bytes 0..255: every value is stored twice in adjacent halfwords. */
.short 0xa5a5,0xa5a5,0x440f,0x440f,0xe1b4,0xe1b4,0xa243,0xa243
.short 0x4f22,0x4f22,0x901d,0x901d,0x5d34,0x5d34,0x846c,0x846c
.short 0x4423,0x4423,0xd566,0xd566,0xa556,0xa556,0x57e6,0x57e6
.short 0x4ee7,0x4ee7,0x1efe,0x1efe,0x53fa,0x53fa,0xd7a1,0xd7a1
.short 0xc77b,0xc77b,0xbda3,0xbda3,0x2b25,0x2b25,0xa16a,0xa16a
.short 0x3a37,0x3a37,0xd53f,0xd53f,0x1888,0x1888,0x51bf,0x51bf
.short 0x7e81,0x7e81,0xa0b9,0xa0b9,0xc405,0xc405,0x1cd7,0x1cd7
.short 0xf79f,0xf79f,0x9ca6,0x9ca6,0xb0b8,0xb0b8,0x79d0,0x79d0
.short 0x314b,0x314b,0x149c,0x149c,0xb3b8,0xb3b8,0x385f,0x385f
.short 0xb7a4,0xb7a4,0xbb68,0xbb68,0xb17d,0xb17d,0x4836,0x4836
.short 0xcea2,0xcea2,0x705a,0x705a,0x4936,0x4936,0x8e09,0x8e09
.short 0x8993,0x8993,0xd67a,0xd67a,0x7ef7,0x7ef7,0x82f6,0x82f6
.short 0xea8c,0xea8c,0xe7db,0xe7db,0xa5cc,0xa5cc,0x3a23,0x3a23
.short 0x11eb,0x11eb,0xfc50,0xfc50,0xccb6,0xccb6,0x6c5b,0x6c5b
.short 0x5498,0x5498,0xaff3,0xaff3,0x379a,0x379a,0x7de3,0x7de3
.short 0xcbb6,0xcbb6,0x2cd6,0x2cd6,0xd453,0xd453,0x014f,0x014f
/* Bytes 256..511: every value is stored four times in adjacent halfwords. */
.short 0x45fb,0x45fb,0x45fb,0x45fb,0x5e5c,0x5e5c,0x5e5c,0x5e5c
.short 0xef29,0xef29,0xef29,0xef29,0xbe41,0xbe41,0xbe41,0xbe41
.short 0x31d5,0x31d5,0x31d5,0x31d5,0x71e4,0x71e4,0x71e4,0x71e4
.short 0xc940,0xc940,0xc940,0xc940,0xcb8e,0xcb8e,0xcb8e,0xcb8e
.short 0xb8b7,0xb8b7,0xb8b7,0xb8b7,0x75f7,0x75f7,0x75f7,0x75f7
.short 0xdc8d,0xdc8d,0xdc8d,0xdc8d,0x6e96,0x6e96,0x6e96,0x6e96
.short 0x22c3,0x22c3,0x22c3,0x22c3,0x3e0f,0x3e0f,0x3e0f,0x3e0f
.short 0x6e5a,0x6e5a,0x6e5a,0x6e5a,0xb255,0xb255,0xb255,0xb255
.short 0x9344,0x9344,0x9344,0x9344,0x6583,0x6583,0x6583,0x6583
.short 0x028a,0x028a,0x028a,0x028a,0xdc52,0xdc52,0xdc52,0xdc52
.short 0x309a,0x309a,0x309a,0x309a,0xc140,0xc140,0xc140,0xc140
.short 0x9808,0x9808,0x9808,0x9808,0x31fd,0x31fd,0x31fd,0x31fd
.short 0x9e08,0x9e08,0x9e08,0x9e08,0xaf1a,0xaf1a,0xaf1a,0xaf1a
.short 0xb12e,0xb12e,0xb12e,0xb12e,0x5c0d,0x5c0d,0x5c0d,0x5c0d
.short 0x870a,0x870a,0x870a,0x870a,0xfa28,0xfa28,0xfa28,0xfa28
.short 0x1975,0x1975,0x1975,0x1975,0x163a,0x163a,0x163a,0x163a
/* Bytes 512..575: values stored once each. mlkem_invntt loads offsets 512
 * and 528 into v2 and offsets 544 and 560 into v6 and v7, then selects one
 * halfword per butterfly by lane index (v2.h[n], v6.h[n]). */
.short 0x3f23,0x97cd,0xdd66,0xb806,0xdda1,0x2925,0xa108,0x6da9
.short 0x6682,0xac42,0x044f,0xea3d,0x7182,0x66f9,0xbc2d,0x16c4
.short 0x8645,0x2bc2,0xfab2,0xd63f,0x3d4b,0x0ed8,0x9393,0x51ab
.short 0x4137,0x91e2,0x3073,0xcb2c,0xfced,0xc667,0x84f6,0xd8a1
#ifndef __APPLE__
.text
.globl mlkem_invntt
.type mlkem_invntt,@function
.align 2
mlkem_invntt:
#else
.section __TEXT,__text
.globl _mlkem_invntt
.p2align 2
_mlkem_invntt:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
#ifndef __APPLE__
adrp x2, L_mlkem_aarch64_zetas_inv
add x2, x2, :lo12:L_mlkem_aarch64_zetas_inv
#else
adrp x2, L_mlkem_aarch64_zetas_inv@PAGE
add x2, x2, L_mlkem_aarch64_zetas_inv@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x3, L_mlkem_aarch64_zetas_inv_qinv
add x3, x3, :lo12:L_mlkem_aarch64_zetas_inv_qinv
#else
adrp x3, L_mlkem_aarch64_zetas_inv_qinv@PAGE
add x3, x3, L_mlkem_aarch64_zetas_inv_qinv@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x4, L_mlkem_aarch64_consts
add x4, x4, :lo12:L_mlkem_aarch64_consts
#else
adrp x4, L_mlkem_aarch64_consts@PAGE
add x4, x4, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
add x1, x0, #0x100
ldr q8, [x4]
ldp q9, q10, [x0]
ldp q11, q12, [x0, #32]
ldp q13, q14, [x0, #64]
ldp q15, q16, [x0, #96]
ldp q17, q18, [x0, #128]
ldp q19, q20, [x0, #160]
ldp q21, q22, [x0, #192]
ldp q23, q24, [x0, #224]
mov v25.16b, v9.16b
trn1 v9.2d, v9.2d, v10.2d
trn2 v10.2d, v25.2d, v10.2d
mov v25.16b, v9.16b
trn1 v9.4s, v9.4s, v10.4s
trn2 v10.4s, v25.4s, v10.4s
mov v25.16b, v11.16b
trn1 v11.2d, v11.2d, v12.2d
trn2 v12.2d, v25.2d, v12.2d
mov v25.16b, v11.16b
trn1 v11.4s, v11.4s, v12.4s
trn2 v12.4s, v25.4s, v12.4s
mov v25.16b, v13.16b
trn1 v13.2d, v13.2d, v14.2d
trn2 v14.2d, v25.2d, v14.2d
mov v25.16b, v13.16b
trn1 v13.4s, v13.4s, v14.4s
trn2 v14.4s, v25.4s, v14.4s
mov v25.16b, v15.16b
trn1 v15.2d, v15.2d, v16.2d
trn2 v16.2d, v25.2d, v16.2d
mov v25.16b, v15.16b
trn1 v15.4s, v15.4s, v16.4s
trn2 v16.4s, v25.4s, v16.4s
mov v25.16b, v17.16b
trn1 v17.2d, v17.2d, v18.2d
trn2 v18.2d, v25.2d, v18.2d
mov v25.16b, v17.16b
trn1 v17.4s, v17.4s, v18.4s
trn2 v18.4s, v25.4s, v18.4s
mov v25.16b, v19.16b
trn1 v19.2d, v19.2d, v20.2d
trn2 v20.2d, v25.2d, v20.2d
mov v25.16b, v19.16b
trn1 v19.4s, v19.4s, v20.4s
trn2 v20.4s, v25.4s, v20.4s
mov v25.16b, v21.16b
trn1 v21.2d, v21.2d, v22.2d
trn2 v22.2d, v25.2d, v22.2d
mov v25.16b, v21.16b
trn1 v21.4s, v21.4s, v22.4s
trn2 v22.4s, v25.4s, v22.4s
mov v25.16b, v23.16b
trn1 v23.2d, v23.2d, v24.2d
trn2 v24.2d, v25.2d, v24.2d
mov v25.16b, v23.16b
trn1 v23.4s, v23.4s, v24.4s
trn2 v24.4s, v25.4s, v24.4s
ldr q0, [x2]
ldr q1, [x2, #16]
ldr q2, [x3]
ldr q3, [x3, #16]
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v10.8h, v26.8h, v0.8h
sqrdmulh v12.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v10.8h, v10.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
ldr q0, [x2, #32]
ldr q1, [x2, #48]
ldr q2, [x3, #32]
ldr q3, [x3, #48]
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v14.8h, v26.8h, v0.8h
sqrdmulh v16.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v14.8h, v14.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
ldr q0, [x2, #64]
ldr q1, [x2, #80]
ldr q2, [x3, #64]
ldr q3, [x3, #80]
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v18.8h, v26.8h, v0.8h
sqrdmulh v20.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v18.8h, v18.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
ldr q0, [x2, #96]
ldr q1, [x2, #112]
ldr q2, [x3, #96]
ldr q3, [x3, #112]
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v22.8h, v26.8h, v0.8h
sqrdmulh v24.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v22.8h, v22.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #256]
ldr q1, [x2, #272]
ldr q2, [x3, #256]
ldr q3, [x3, #272]
mov v25.16b, v9.16b
mov v26.16b, v11.16b
trn1 v9.4s, v9.4s, v10.4s
trn1 v11.4s, v11.4s, v12.4s
trn2 v10.4s, v25.4s, v10.4s
trn2 v12.4s, v26.4s, v12.4s
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v10.8h, v26.8h, v0.8h
sqrdmulh v12.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v10.8h, v10.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
ldr q0, [x2, #288]
ldr q1, [x2, #304]
ldr q2, [x3, #288]
ldr q3, [x3, #304]
mov v25.16b, v13.16b
mov v26.16b, v15.16b
trn1 v13.4s, v13.4s, v14.4s
trn1 v15.4s, v15.4s, v16.4s
trn2 v14.4s, v25.4s, v14.4s
trn2 v16.4s, v26.4s, v16.4s
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v14.8h, v26.8h, v0.8h
sqrdmulh v16.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v14.8h, v14.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
ldr q0, [x2, #320]
ldr q1, [x2, #336]
ldr q2, [x3, #320]
ldr q3, [x3, #336]
mov v25.16b, v17.16b
mov v26.16b, v19.16b
trn1 v17.4s, v17.4s, v18.4s
trn1 v19.4s, v19.4s, v20.4s
trn2 v18.4s, v25.4s, v18.4s
trn2 v20.4s, v26.4s, v20.4s
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v18.8h, v26.8h, v0.8h
sqrdmulh v20.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v18.8h, v18.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
ldr q0, [x2, #352]
ldr q1, [x2, #368]
ldr q2, [x3, #352]
ldr q3, [x3, #368]
mov v25.16b, v21.16b
mov v26.16b, v23.16b
trn1 v21.4s, v21.4s, v22.4s
trn1 v23.4s, v23.4s, v24.4s
trn2 v22.4s, v25.4s, v22.4s
trn2 v24.4s, v26.4s, v24.4s
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v22.8h, v26.8h, v0.8h
sqrdmulh v24.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v22.8h, v22.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #512]
ldr q2, [x3, #512]
mov v25.16b, v9.16b
mov v26.16b, v11.16b
trn1 v9.2d, v9.2d, v10.2d
trn1 v11.2d, v11.2d, v12.2d
trn2 v10.2d, v25.2d, v10.2d
trn2 v12.2d, v26.2d, v12.2d
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.h[0]
mul v27.8h, v28.8h, v2.h[1]
sqrdmulh v10.8h, v26.8h, v0.h[0]
sqrdmulh v12.8h, v28.8h, v0.h[1]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v10.8h, v10.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
mov v25.16b, v13.16b
mov v26.16b, v15.16b
trn1 v13.2d, v13.2d, v14.2d
trn1 v15.2d, v15.2d, v16.2d
trn2 v14.2d, v25.2d, v14.2d
trn2 v16.2d, v26.2d, v16.2d
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.h[2]
mul v27.8h, v28.8h, v2.h[3]
sqrdmulh v14.8h, v26.8h, v0.h[2]
sqrdmulh v16.8h, v28.8h, v0.h[3]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v14.8h, v14.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
mov v25.16b, v17.16b
mov v26.16b, v19.16b
trn1 v17.2d, v17.2d, v18.2d
trn1 v19.2d, v19.2d, v20.2d
trn2 v18.2d, v25.2d, v18.2d
trn2 v20.2d, v26.2d, v20.2d
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.h[4]
mul v27.8h, v28.8h, v2.h[5]
sqrdmulh v18.8h, v26.8h, v0.h[4]
sqrdmulh v20.8h, v28.8h, v0.h[5]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v18.8h, v18.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
mov v25.16b, v21.16b
mov v26.16b, v23.16b
trn1 v21.2d, v21.2d, v22.2d
trn1 v23.2d, v23.2d, v24.2d
trn2 v22.2d, v25.2d, v22.2d
trn2 v24.2d, v26.2d, v24.2d
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.h[6]
mul v27.8h, v28.8h, v2.h[7]
sqrdmulh v22.8h, v26.8h, v0.h[6]
sqrdmulh v24.8h, v28.8h, v0.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v22.8h, v22.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
sqdmulh v25.8h, v9.8h, v8.h[2]
sqdmulh v26.8h, v11.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v9.8h, v25.8h, v8.h[0]
mls v11.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v13.8h, v8.h[2]
sqdmulh v26.8h, v15.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v13.8h, v25.8h, v8.h[0]
mls v15.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v17.8h, v8.h[2]
sqdmulh v26.8h, v19.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v17.8h, v25.8h, v8.h[0]
mls v19.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v21.8h, v8.h[2]
sqdmulh v26.8h, v23.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v21.8h, v25.8h, v8.h[0]
mls v23.8h, v26.8h, v8.h[0]
stp q9, q10, [x0]
stp q11, q12, [x0, #32]
stp q13, q14, [x0, #64]
stp q15, q16, [x0, #96]
stp q17, q18, [x0, #128]
stp q19, q20, [x0, #160]
stp q21, q22, [x0, #192]
stp q23, q24, [x0, #224]
ldp q9, q10, [x1]
ldp q11, q12, [x1, #32]
ldp q13, q14, [x1, #64]
ldp q15, q16, [x1, #96]
ldp q17, q18, [x1, #128]
ldp q19, q20, [x1, #160]
ldp q21, q22, [x1, #192]
ldp q23, q24, [x1, #224]
mov v25.16b, v9.16b
trn1 v9.2d, v9.2d, v10.2d
trn2 v10.2d, v25.2d, v10.2d
mov v25.16b, v9.16b
trn1 v9.4s, v9.4s, v10.4s
trn2 v10.4s, v25.4s, v10.4s
mov v25.16b, v11.16b
trn1 v11.2d, v11.2d, v12.2d
trn2 v12.2d, v25.2d, v12.2d
mov v25.16b, v11.16b
trn1 v11.4s, v11.4s, v12.4s
trn2 v12.4s, v25.4s, v12.4s
mov v25.16b, v13.16b
trn1 v13.2d, v13.2d, v14.2d
trn2 v14.2d, v25.2d, v14.2d
mov v25.16b, v13.16b
trn1 v13.4s, v13.4s, v14.4s
trn2 v14.4s, v25.4s, v14.4s
mov v25.16b, v15.16b
trn1 v15.2d, v15.2d, v16.2d
trn2 v16.2d, v25.2d, v16.2d
mov v25.16b, v15.16b
trn1 v15.4s, v15.4s, v16.4s
trn2 v16.4s, v25.4s, v16.4s
mov v25.16b, v17.16b
trn1 v17.2d, v17.2d, v18.2d
trn2 v18.2d, v25.2d, v18.2d
mov v25.16b, v17.16b
trn1 v17.4s, v17.4s, v18.4s
trn2 v18.4s, v25.4s, v18.4s
mov v25.16b, v19.16b
trn1 v19.2d, v19.2d, v20.2d
trn2 v20.2d, v25.2d, v20.2d
mov v25.16b, v19.16b
trn1 v19.4s, v19.4s, v20.4s
trn2 v20.4s, v25.4s, v20.4s
mov v25.16b, v21.16b
trn1 v21.2d, v21.2d, v22.2d
trn2 v22.2d, v25.2d, v22.2d
mov v25.16b, v21.16b
trn1 v21.4s, v21.4s, v22.4s
trn2 v22.4s, v25.4s, v22.4s
mov v25.16b, v23.16b
trn1 v23.2d, v23.2d, v24.2d
trn2 v24.2d, v25.2d, v24.2d
mov v25.16b, v23.16b
trn1 v23.4s, v23.4s, v24.4s
trn2 v24.4s, v25.4s, v24.4s
ldr q0, [x2, #128]
ldr q1, [x2, #144]
ldr q2, [x3, #128]
ldr q3, [x3, #144]
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v10.8h, v26.8h, v0.8h
sqrdmulh v12.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v10.8h, v10.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
ldr q0, [x2, #160]
ldr q1, [x2, #176]
ldr q2, [x3, #160]
ldr q3, [x3, #176]
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v14.8h, v26.8h, v0.8h
sqrdmulh v16.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v14.8h, v14.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
ldr q0, [x2, #192]
ldr q1, [x2, #208]
ldr q2, [x3, #192]
ldr q3, [x3, #208]
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v18.8h, v26.8h, v0.8h
sqrdmulh v20.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v18.8h, v18.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
ldr q0, [x2, #224]
ldr q1, [x2, #240]
ldr q2, [x3, #224]
ldr q3, [x3, #240]
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v22.8h, v26.8h, v0.8h
sqrdmulh v24.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v22.8h, v22.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #384]
ldr q1, [x2, #400]
ldr q2, [x3, #384]
ldr q3, [x3, #400]
mov v25.16b, v9.16b
mov v26.16b, v11.16b
trn1 v9.4s, v9.4s, v10.4s
trn1 v11.4s, v11.4s, v12.4s
trn2 v10.4s, v25.4s, v10.4s
trn2 v12.4s, v26.4s, v12.4s
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v10.8h, v26.8h, v0.8h
sqrdmulh v12.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v10.8h, v10.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
ldr q0, [x2, #416]
ldr q1, [x2, #432]
ldr q2, [x3, #416]
ldr q3, [x3, #432]
mov v25.16b, v13.16b
mov v26.16b, v15.16b
trn1 v13.4s, v13.4s, v14.4s
trn1 v15.4s, v15.4s, v16.4s
trn2 v14.4s, v25.4s, v14.4s
trn2 v16.4s, v26.4s, v16.4s
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v14.8h, v26.8h, v0.8h
sqrdmulh v16.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v14.8h, v14.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
ldr q0, [x2, #448]
ldr q1, [x2, #464]
ldr q2, [x3, #448]
ldr q3, [x3, #464]
mov v25.16b, v17.16b
mov v26.16b, v19.16b
trn1 v17.4s, v17.4s, v18.4s
trn1 v19.4s, v19.4s, v20.4s
trn2 v18.4s, v25.4s, v18.4s
trn2 v20.4s, v26.4s, v20.4s
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v18.8h, v26.8h, v0.8h
sqrdmulh v20.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v18.8h, v18.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
ldr q0, [x2, #480]
ldr q1, [x2, #496]
ldr q2, [x3, #480]
ldr q3, [x3, #496]
mov v25.16b, v21.16b
mov v26.16b, v23.16b
trn1 v21.4s, v21.4s, v22.4s
trn1 v23.4s, v23.4s, v24.4s
trn2 v22.4s, v25.4s, v22.4s
trn2 v24.4s, v26.4s, v24.4s
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v22.8h, v26.8h, v0.8h
sqrdmulh v24.8h, v28.8h, v1.8h
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v22.8h, v22.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #528]
ldr q2, [x3, #528]
mov v25.16b, v9.16b
mov v26.16b, v11.16b
trn1 v9.2d, v9.2d, v10.2d
trn1 v11.2d, v11.2d, v12.2d
trn2 v10.2d, v25.2d, v10.2d
trn2 v12.2d, v26.2d, v12.2d
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.h[0]
mul v27.8h, v28.8h, v2.h[1]
sqrdmulh v10.8h, v26.8h, v0.h[0]
sqrdmulh v12.8h, v28.8h, v0.h[1]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v10.8h, v10.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
mov v25.16b, v13.16b
mov v26.16b, v15.16b
trn1 v13.2d, v13.2d, v14.2d
trn1 v15.2d, v15.2d, v16.2d
trn2 v14.2d, v25.2d, v14.2d
trn2 v16.2d, v26.2d, v16.2d
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.h[2]
mul v27.8h, v28.8h, v2.h[3]
sqrdmulh v14.8h, v26.8h, v0.h[2]
sqrdmulh v16.8h, v28.8h, v0.h[3]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v14.8h, v14.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
mov v25.16b, v17.16b
mov v26.16b, v19.16b
trn1 v17.2d, v17.2d, v18.2d
trn1 v19.2d, v19.2d, v20.2d
trn2 v18.2d, v25.2d, v18.2d
trn2 v20.2d, v26.2d, v20.2d
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.h[4]
mul v27.8h, v28.8h, v2.h[5]
sqrdmulh v18.8h, v26.8h, v0.h[4]
sqrdmulh v20.8h, v28.8h, v0.h[5]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v18.8h, v18.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
mov v25.16b, v21.16b
mov v26.16b, v23.16b
trn1 v21.2d, v21.2d, v22.2d
trn1 v23.2d, v23.2d, v24.2d
trn2 v22.2d, v25.2d, v22.2d
trn2 v24.2d, v26.2d, v24.2d
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.h[6]
mul v27.8h, v28.8h, v2.h[7]
sqrdmulh v22.8h, v26.8h, v0.h[6]
sqrdmulh v24.8h, v28.8h, v0.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v22.8h, v22.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
sqdmulh v25.8h, v9.8h, v8.h[2]
sqdmulh v26.8h, v11.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v9.8h, v25.8h, v8.h[0]
mls v11.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v13.8h, v8.h[2]
sqdmulh v26.8h, v15.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v13.8h, v25.8h, v8.h[0]
mls v15.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v17.8h, v8.h[2]
sqdmulh v26.8h, v19.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v17.8h, v25.8h, v8.h[0]
mls v19.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v21.8h, v8.h[2]
sqdmulh v26.8h, v23.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v21.8h, v25.8h, v8.h[0]
mls v23.8h, v26.8h, v8.h[0]
stp q9, q10, [x1]
stp q11, q12, [x1, #32]
stp q13, q14, [x1, #64]
stp q15, q16, [x1, #96]
stp q17, q18, [x1, #128]
stp q19, q20, [x1, #160]
stp q21, q22, [x1, #192]
stp q23, q24, [x1, #224]
ldr q4, [x2, #544]
ldr q5, [x2, #560]
ldr q6, [x3, #544]
ldr q7, [x3, #560]
ldr q9, [x0]
ldr q10, [x0, #32]
ldr q11, [x0, #64]
ldr q12, [x0, #96]
ldr q13, [x0, #128]
ldr q14, [x0, #160]
ldr q15, [x0, #192]
ldr q16, [x0, #224]
ldr q17, [x1]
ldr q18, [x1, #32]
ldr q19, [x1, #64]
ldr q20, [x1, #96]
ldr q21, [x1, #128]
ldr q22, [x1, #160]
ldr q23, [x1, #192]
ldr q24, [x1, #224]
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v6.h[0]
mul v27.8h, v28.8h, v6.h[1]
sqrdmulh v10.8h, v26.8h, v4.h[0]
sqrdmulh v12.8h, v28.8h, v4.h[1]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v10.8h, v10.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v6.h[2]
mul v27.8h, v28.8h, v6.h[3]
sqrdmulh v14.8h, v26.8h, v4.h[2]
sqrdmulh v16.8h, v28.8h, v4.h[3]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v14.8h, v14.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v6.h[4]
mul v27.8h, v28.8h, v6.h[5]
sqrdmulh v18.8h, v26.8h, v4.h[4]
sqrdmulh v20.8h, v28.8h, v4.h[5]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v18.8h, v18.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v6.h[6]
mul v27.8h, v28.8h, v6.h[7]
sqrdmulh v22.8h, v26.8h, v4.h[6]
sqrdmulh v24.8h, v28.8h, v4.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v22.8h, v22.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
sub v26.8h, v9.8h, v11.8h
sub v28.8h, v10.8h, v12.8h
add v9.8h, v9.8h, v11.8h
add v10.8h, v10.8h, v12.8h
mul v25.8h, v26.8h, v7.h[0]
mul v27.8h, v28.8h, v7.h[0]
sqrdmulh v11.8h, v26.8h, v5.h[0]
sqrdmulh v12.8h, v28.8h, v5.h[0]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v11.8h, v11.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
sub v26.8h, v13.8h, v15.8h
sub v28.8h, v14.8h, v16.8h
add v13.8h, v13.8h, v15.8h
add v14.8h, v14.8h, v16.8h
mul v25.8h, v26.8h, v7.h[1]
mul v27.8h, v28.8h, v7.h[1]
sqrdmulh v15.8h, v26.8h, v5.h[1]
sqrdmulh v16.8h, v28.8h, v5.h[1]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v15.8h, v15.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v19.8h
sub v28.8h, v18.8h, v20.8h
add v17.8h, v17.8h, v19.8h
add v18.8h, v18.8h, v20.8h
mul v25.8h, v26.8h, v7.h[2]
mul v27.8h, v28.8h, v7.h[2]
sqrdmulh v19.8h, v26.8h, v5.h[2]
sqrdmulh v20.8h, v28.8h, v5.h[2]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v19.8h, v19.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v21.8h, v23.8h
sub v28.8h, v22.8h, v24.8h
add v21.8h, v21.8h, v23.8h
add v22.8h, v22.8h, v24.8h
mul v25.8h, v26.8h, v7.h[3]
mul v27.8h, v28.8h, v7.h[3]
sqrdmulh v23.8h, v26.8h, v5.h[3]
sqrdmulh v24.8h, v28.8h, v5.h[3]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v23.8h, v23.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
sub v26.8h, v9.8h, v13.8h
sub v28.8h, v10.8h, v14.8h
add v9.8h, v9.8h, v13.8h
add v10.8h, v10.8h, v14.8h
mul v25.8h, v26.8h, v7.h[4]
mul v27.8h, v28.8h, v7.h[4]
sqrdmulh v13.8h, v26.8h, v5.h[4]
sqrdmulh v14.8h, v28.8h, v5.h[4]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v13.8h, v13.8h, v25.8h
sub v14.8h, v14.8h, v27.8h
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
sub v26.8h, v11.8h, v15.8h
sub v28.8h, v12.8h, v16.8h
add v11.8h, v11.8h, v15.8h
add v12.8h, v12.8h, v16.8h
mul v25.8h, v26.8h, v7.h[4]
mul v27.8h, v28.8h, v7.h[4]
sqrdmulh v15.8h, v26.8h, v5.h[4]
sqrdmulh v16.8h, v28.8h, v5.h[4]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v15.8h, v15.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v21.8h
sub v28.8h, v18.8h, v22.8h
add v17.8h, v17.8h, v21.8h
add v18.8h, v18.8h, v22.8h
mul v25.8h, v26.8h, v7.h[5]
mul v27.8h, v28.8h, v7.h[5]
sqrdmulh v21.8h, v26.8h, v5.h[5]
sqrdmulh v22.8h, v28.8h, v5.h[5]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v21.8h, v21.8h, v25.8h
sub v22.8h, v22.8h, v27.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
sub v26.8h, v19.8h, v23.8h
sub v28.8h, v20.8h, v24.8h
add v19.8h, v19.8h, v23.8h
add v20.8h, v20.8h, v24.8h
mul v25.8h, v26.8h, v7.h[5]
mul v27.8h, v28.8h, v7.h[5]
sqrdmulh v23.8h, v26.8h, v5.h[5]
sqrdmulh v24.8h, v28.8h, v5.h[5]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v23.8h, v23.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
sqdmulh v25.8h, v9.8h, v8.h[2]
sqdmulh v26.8h, v10.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v9.8h, v25.8h, v8.h[0]
mls v10.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v11.8h, v8.h[2]
sqdmulh v26.8h, v12.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v11.8h, v25.8h, v8.h[0]
mls v12.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v17.8h, v8.h[2]
sqdmulh v26.8h, v18.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v17.8h, v25.8h, v8.h[0]
mls v18.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v19.8h, v8.h[2]
sqdmulh v26.8h, v20.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v19.8h, v25.8h, v8.h[0]
mls v20.8h, v26.8h, v8.h[0]
sub v26.8h, v9.8h, v17.8h
sub v28.8h, v10.8h, v18.8h
add v9.8h, v9.8h, v17.8h
add v10.8h, v10.8h, v18.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v17.8h, v26.8h, v5.h[6]
sqrdmulh v18.8h, v28.8h, v5.h[6]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v17.8h, v17.8h, v25.8h
sub v18.8h, v18.8h, v27.8h
sshr v17.8h, v17.8h, #1
sshr v18.8h, v18.8h, #1
sub v26.8h, v11.8h, v19.8h
sub v28.8h, v12.8h, v20.8h
add v11.8h, v11.8h, v19.8h
add v12.8h, v12.8h, v20.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v19.8h, v26.8h, v5.h[6]
sqrdmulh v20.8h, v28.8h, v5.h[6]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v19.8h, v19.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v13.8h, v21.8h
sub v28.8h, v14.8h, v22.8h
add v13.8h, v13.8h, v21.8h
add v14.8h, v14.8h, v22.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v21.8h, v26.8h, v5.h[6]
sqrdmulh v22.8h, v28.8h, v5.h[6]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v21.8h, v21.8h, v25.8h
sub v22.8h, v22.8h, v27.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
sub v26.8h, v15.8h, v23.8h
sub v28.8h, v16.8h, v24.8h
add v15.8h, v15.8h, v23.8h
add v16.8h, v16.8h, v24.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v23.8h, v26.8h, v5.h[6]
sqrdmulh v24.8h, v28.8h, v5.h[6]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v23.8h, v23.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v25.8h, v9.8h, v7.h[7]
mul v26.8h, v10.8h, v7.h[7]
sqrdmulh v9.8h, v9.8h, v5.h[7]
sqrdmulh v10.8h, v10.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v9.8h, v9.8h, v25.8h
sub v10.8h, v10.8h, v26.8h
sshr v9.8h, v9.8h, #1
sshr v10.8h, v10.8h, #1
mul v25.8h, v11.8h, v7.h[7]
mul v26.8h, v12.8h, v7.h[7]
sqrdmulh v11.8h, v11.8h, v5.h[7]
sqrdmulh v12.8h, v12.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v11.8h, v11.8h, v25.8h
sub v12.8h, v12.8h, v26.8h
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
mul v25.8h, v13.8h, v7.h[7]
mul v26.8h, v14.8h, v7.h[7]
sqrdmulh v13.8h, v13.8h, v5.h[7]
sqrdmulh v14.8h, v14.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v13.8h, v13.8h, v25.8h
sub v14.8h, v14.8h, v26.8h
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
mul v25.8h, v15.8h, v7.h[7]
mul v26.8h, v16.8h, v7.h[7]
sqrdmulh v15.8h, v15.8h, v5.h[7]
sqrdmulh v16.8h, v16.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v15.8h, v15.8h, v25.8h
sub v16.8h, v16.8h, v26.8h
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
mul v25.8h, v17.8h, v7.h[7]
mul v26.8h, v18.8h, v7.h[7]
sqrdmulh v17.8h, v17.8h, v5.h[7]
sqrdmulh v18.8h, v18.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v17.8h, v17.8h, v25.8h
sub v18.8h, v18.8h, v26.8h
sshr v17.8h, v17.8h, #1
sshr v18.8h, v18.8h, #1
mul v25.8h, v19.8h, v7.h[7]
mul v26.8h, v20.8h, v7.h[7]
sqrdmulh v19.8h, v19.8h, v5.h[7]
sqrdmulh v20.8h, v20.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v19.8h, v19.8h, v25.8h
sub v20.8h, v20.8h, v26.8h
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
mul v25.8h, v21.8h, v7.h[7]
mul v26.8h, v22.8h, v7.h[7]
sqrdmulh v21.8h, v21.8h, v5.h[7]
sqrdmulh v22.8h, v22.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v21.8h, v21.8h, v25.8h
sub v22.8h, v22.8h, v26.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v25.8h, v23.8h, v7.h[7]
mul v26.8h, v24.8h, v7.h[7]
sqrdmulh v23.8h, v23.8h, v5.h[7]
sqrdmulh v24.8h, v24.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v23.8h, v23.8h, v25.8h
sub v24.8h, v24.8h, v26.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
str q9, [x0]
str q10, [x0, #32]
str q11, [x0, #64]
str q12, [x0, #96]
str q13, [x0, #128]
str q14, [x0, #160]
str q15, [x0, #192]
str q16, [x0, #224]
str q17, [x1]
str q18, [x1, #32]
str q19, [x1, #64]
str q20, [x1, #96]
str q21, [x1, #128]
str q22, [x1, #160]
str q23, [x1, #192]
str q24, [x1, #224]
ldr q9, [x0, #16]
ldr q10, [x0, #48]
ldr q11, [x0, #80]
ldr q12, [x0, #112]
ldr q13, [x0, #144]
ldr q14, [x0, #176]
ldr q15, [x0, #208]
ldr q16, [x0, #240]
ldr q17, [x1, #16]
ldr q18, [x1, #48]
ldr q19, [x1, #80]
ldr q20, [x1, #112]
ldr q21, [x1, #144]
ldr q22, [x1, #176]
ldr q23, [x1, #208]
ldr q24, [x1, #240]
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v6.h[0]
mul v27.8h, v28.8h, v6.h[1]
sqrdmulh v10.8h, v26.8h, v4.h[0]
sqrdmulh v12.8h, v28.8h, v4.h[1]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v10.8h, v10.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v6.h[2]
mul v27.8h, v28.8h, v6.h[3]
sqrdmulh v14.8h, v26.8h, v4.h[2]
sqrdmulh v16.8h, v28.8h, v4.h[3]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v14.8h, v14.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v6.h[4]
mul v27.8h, v28.8h, v6.h[5]
sqrdmulh v18.8h, v26.8h, v4.h[4]
sqrdmulh v20.8h, v28.8h, v4.h[5]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v18.8h, v18.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v6.h[6]
mul v27.8h, v28.8h, v6.h[7]
sqrdmulh v22.8h, v26.8h, v4.h[6]
sqrdmulh v24.8h, v28.8h, v4.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v22.8h, v22.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
sub v26.8h, v9.8h, v11.8h
sub v28.8h, v10.8h, v12.8h
add v9.8h, v9.8h, v11.8h
add v10.8h, v10.8h, v12.8h
mul v25.8h, v26.8h, v7.h[0]
mul v27.8h, v28.8h, v7.h[0]
sqrdmulh v11.8h, v26.8h, v5.h[0]
sqrdmulh v12.8h, v28.8h, v5.h[0]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v11.8h, v11.8h, v25.8h
sub v12.8h, v12.8h, v27.8h
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
sub v26.8h, v13.8h, v15.8h
sub v28.8h, v14.8h, v16.8h
add v13.8h, v13.8h, v15.8h
add v14.8h, v14.8h, v16.8h
mul v25.8h, v26.8h, v7.h[1]
mul v27.8h, v28.8h, v7.h[1]
sqrdmulh v15.8h, v26.8h, v5.h[1]
sqrdmulh v16.8h, v28.8h, v5.h[1]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v15.8h, v15.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v19.8h
sub v28.8h, v18.8h, v20.8h
add v17.8h, v17.8h, v19.8h
add v18.8h, v18.8h, v20.8h
mul v25.8h, v26.8h, v7.h[2]
mul v27.8h, v28.8h, v7.h[2]
sqrdmulh v19.8h, v26.8h, v5.h[2]
sqrdmulh v20.8h, v28.8h, v5.h[2]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v19.8h, v19.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v21.8h, v23.8h
sub v28.8h, v22.8h, v24.8h
add v21.8h, v21.8h, v23.8h
add v22.8h, v22.8h, v24.8h
mul v25.8h, v26.8h, v7.h[3]
mul v27.8h, v28.8h, v7.h[3]
sqrdmulh v23.8h, v26.8h, v5.h[3]
sqrdmulh v24.8h, v28.8h, v5.h[3]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v23.8h, v23.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
sub v26.8h, v9.8h, v13.8h
sub v28.8h, v10.8h, v14.8h
add v9.8h, v9.8h, v13.8h
add v10.8h, v10.8h, v14.8h
mul v25.8h, v26.8h, v7.h[4]
mul v27.8h, v28.8h, v7.h[4]
sqrdmulh v13.8h, v26.8h, v5.h[4]
sqrdmulh v14.8h, v28.8h, v5.h[4]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v13.8h, v13.8h, v25.8h
sub v14.8h, v14.8h, v27.8h
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
sub v26.8h, v11.8h, v15.8h
sub v28.8h, v12.8h, v16.8h
add v11.8h, v11.8h, v15.8h
add v12.8h, v12.8h, v16.8h
mul v25.8h, v26.8h, v7.h[4]
mul v27.8h, v28.8h, v7.h[4]
sqrdmulh v15.8h, v26.8h, v5.h[4]
sqrdmulh v16.8h, v28.8h, v5.h[4]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v15.8h, v15.8h, v25.8h
sub v16.8h, v16.8h, v27.8h
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v21.8h
sub v28.8h, v18.8h, v22.8h
add v17.8h, v17.8h, v21.8h
add v18.8h, v18.8h, v22.8h
mul v25.8h, v26.8h, v7.h[5]
mul v27.8h, v28.8h, v7.h[5]
sqrdmulh v21.8h, v26.8h, v5.h[5]
sqrdmulh v22.8h, v28.8h, v5.h[5]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v21.8h, v21.8h, v25.8h
sub v22.8h, v22.8h, v27.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
sub v26.8h, v19.8h, v23.8h
sub v28.8h, v20.8h, v24.8h
add v19.8h, v19.8h, v23.8h
add v20.8h, v20.8h, v24.8h
mul v25.8h, v26.8h, v7.h[5]
mul v27.8h, v28.8h, v7.h[5]
sqrdmulh v23.8h, v26.8h, v5.h[5]
sqrdmulh v24.8h, v28.8h, v5.h[5]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v23.8h, v23.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
sqdmulh v25.8h, v9.8h, v8.h[2]
sqdmulh v26.8h, v10.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v9.8h, v25.8h, v8.h[0]
mls v10.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v11.8h, v8.h[2]
sqdmulh v26.8h, v12.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v11.8h, v25.8h, v8.h[0]
mls v12.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v17.8h, v8.h[2]
sqdmulh v26.8h, v18.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v17.8h, v25.8h, v8.h[0]
mls v18.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v19.8h, v8.h[2]
sqdmulh v26.8h, v20.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v19.8h, v25.8h, v8.h[0]
mls v20.8h, v26.8h, v8.h[0]
sub v26.8h, v9.8h, v17.8h
sub v28.8h, v10.8h, v18.8h
add v9.8h, v9.8h, v17.8h
add v10.8h, v10.8h, v18.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v17.8h, v26.8h, v5.h[6]
sqrdmulh v18.8h, v28.8h, v5.h[6]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v17.8h, v17.8h, v25.8h
sub v18.8h, v18.8h, v27.8h
sshr v17.8h, v17.8h, #1
sshr v18.8h, v18.8h, #1
sub v26.8h, v11.8h, v19.8h
sub v28.8h, v12.8h, v20.8h
add v11.8h, v11.8h, v19.8h
add v12.8h, v12.8h, v20.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v19.8h, v26.8h, v5.h[6]
sqrdmulh v20.8h, v28.8h, v5.h[6]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v19.8h, v19.8h, v25.8h
sub v20.8h, v20.8h, v27.8h
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v13.8h, v21.8h
sub v28.8h, v14.8h, v22.8h
add v13.8h, v13.8h, v21.8h
add v14.8h, v14.8h, v22.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v21.8h, v26.8h, v5.h[6]
sqrdmulh v22.8h, v28.8h, v5.h[6]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v21.8h, v21.8h, v25.8h
sub v22.8h, v22.8h, v27.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
sub v26.8h, v15.8h, v23.8h
sub v28.8h, v16.8h, v24.8h
add v15.8h, v15.8h, v23.8h
add v16.8h, v16.8h, v24.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v23.8h, v26.8h, v5.h[6]
sqrdmulh v24.8h, v28.8h, v5.h[6]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v27.8h, v27.8h, v8.h[0]
sub v23.8h, v23.8h, v25.8h
sub v24.8h, v24.8h, v27.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v25.8h, v9.8h, v7.h[7]
mul v26.8h, v10.8h, v7.h[7]
sqrdmulh v9.8h, v9.8h, v5.h[7]
sqrdmulh v10.8h, v10.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v9.8h, v9.8h, v25.8h
sub v10.8h, v10.8h, v26.8h
sshr v9.8h, v9.8h, #1
sshr v10.8h, v10.8h, #1
mul v25.8h, v11.8h, v7.h[7]
mul v26.8h, v12.8h, v7.h[7]
sqrdmulh v11.8h, v11.8h, v5.h[7]
sqrdmulh v12.8h, v12.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v11.8h, v11.8h, v25.8h
sub v12.8h, v12.8h, v26.8h
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
mul v25.8h, v13.8h, v7.h[7]
mul v26.8h, v14.8h, v7.h[7]
sqrdmulh v13.8h, v13.8h, v5.h[7]
sqrdmulh v14.8h, v14.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v13.8h, v13.8h, v25.8h
sub v14.8h, v14.8h, v26.8h
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
mul v25.8h, v15.8h, v7.h[7]
mul v26.8h, v16.8h, v7.h[7]
sqrdmulh v15.8h, v15.8h, v5.h[7]
sqrdmulh v16.8h, v16.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v15.8h, v15.8h, v25.8h
sub v16.8h, v16.8h, v26.8h
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
mul v25.8h, v17.8h, v7.h[7]
mul v26.8h, v18.8h, v7.h[7]
sqrdmulh v17.8h, v17.8h, v5.h[7]
sqrdmulh v18.8h, v18.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v17.8h, v17.8h, v25.8h
sub v18.8h, v18.8h, v26.8h
sshr v17.8h, v17.8h, #1
sshr v18.8h, v18.8h, #1
mul v25.8h, v19.8h, v7.h[7]
mul v26.8h, v20.8h, v7.h[7]
sqrdmulh v19.8h, v19.8h, v5.h[7]
sqrdmulh v20.8h, v20.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v19.8h, v19.8h, v25.8h
sub v20.8h, v20.8h, v26.8h
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
mul v25.8h, v21.8h, v7.h[7]
mul v26.8h, v22.8h, v7.h[7]
sqrdmulh v21.8h, v21.8h, v5.h[7]
sqrdmulh v22.8h, v22.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v21.8h, v21.8h, v25.8h
sub v22.8h, v22.8h, v26.8h
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v25.8h, v23.8h, v7.h[7]
mul v26.8h, v24.8h, v7.h[7]
sqrdmulh v23.8h, v23.8h, v5.h[7]
sqrdmulh v24.8h, v24.8h, v5.h[7]
sqrdmulh v25.8h, v25.8h, v8.h[0]
sqrdmulh v26.8h, v26.8h, v8.h[0]
sub v23.8h, v23.8h, v25.8h
sub v24.8h, v24.8h, v26.8h
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
str q9, [x0, #16]
str q10, [x0, #48]
str q11, [x0, #80]
str q12, [x0, #112]
str q13, [x0, #144]
str q14, [x0, #176]
str q15, [x0, #208]
str q16, [x0, #240]
str q17, [x1, #16]
str q18, [x1, #48]
str q19, [x1, #80]
str q20, [x1, #112]
str q21, [x1, #144]
str q22, [x1, #176]
str q23, [x1, #208]
str q24, [x1, #240]
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_invntt,.-mlkem_invntt
#endif /* __APPLE__ */
#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH
#ifndef __APPLE__
.text
.globl mlkem_ntt_sqrdmlsh
.type mlkem_ntt_sqrdmlsh,@function
.align 2
mlkem_ntt_sqrdmlsh:
#else
.section __TEXT,__text
.globl _mlkem_ntt_sqrdmlsh
.p2align 2
_mlkem_ntt_sqrdmlsh:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
#ifndef __APPLE__
adrp x2, L_mlkem_aarch64_zetas
add x2, x2, :lo12:L_mlkem_aarch64_zetas
#else
adrp x2, L_mlkem_aarch64_zetas@PAGE
add x2, x2, L_mlkem_aarch64_zetas@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x3, L_mlkem_aarch64_zetas_qinv
add x3, x3, :lo12:L_mlkem_aarch64_zetas_qinv
#else
adrp x3, L_mlkem_aarch64_zetas_qinv@PAGE
add x3, x3, L_mlkem_aarch64_zetas_qinv@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x4, L_mlkem_aarch64_consts
add x4, x4, :lo12:L_mlkem_aarch64_consts
#else
adrp x4, L_mlkem_aarch64_consts@PAGE
add x4, x4, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
add x1, x0, #0x100
ldr q4, [x4]
ldr q5, [x0]
ldr q6, [x0, #32]
ldr q7, [x0, #64]
ldr q8, [x0, #96]
ldr q9, [x0, #128]
ldr q10, [x0, #160]
ldr q11, [x0, #192]
ldr q12, [x0, #224]
ldr q13, [x1]
ldr q14, [x1, #32]
ldr q15, [x1, #64]
ldr q16, [x1, #96]
ldr q17, [x1, #128]
ldr q18, [x1, #160]
ldr q19, [x1, #192]
ldr q20, [x1, #224]
ldr q0, [x2]
ldr q1, [x3]
mul v29.8h, v13.8h, v1.h[1]
mul v30.8h, v14.8h, v1.h[1]
sqrdmulh v21.8h, v13.8h, v0.h[1]
sqrdmulh v22.8h, v14.8h, v0.h[1]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v15.8h, v1.h[1]
mul v30.8h, v16.8h, v1.h[1]
sqrdmulh v23.8h, v15.8h, v0.h[1]
sqrdmulh v24.8h, v16.8h, v0.h[1]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v17.8h, v1.h[1]
mul v30.8h, v18.8h, v1.h[1]
sqrdmulh v25.8h, v17.8h, v0.h[1]
sqrdmulh v26.8h, v18.8h, v0.h[1]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[1]
mul v30.8h, v20.8h, v1.h[1]
sqrdmulh v27.8h, v19.8h, v0.h[1]
sqrdmulh v28.8h, v20.8h, v0.h[1]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v13.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v14.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v15.8h, v7.8h, v23.8h
add v7.8h, v7.8h, v23.8h
sub v16.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
sub v17.8h, v9.8h, v25.8h
add v9.8h, v9.8h, v25.8h
sub v18.8h, v10.8h, v26.8h
add v10.8h, v10.8h, v26.8h
sub v19.8h, v11.8h, v27.8h
add v11.8h, v11.8h, v27.8h
sub v20.8h, v12.8h, v28.8h
add v12.8h, v12.8h, v28.8h
mul v29.8h, v9.8h, v1.h[2]
mul v30.8h, v10.8h, v1.h[2]
sqrdmulh v21.8h, v9.8h, v0.h[2]
sqrdmulh v22.8h, v10.8h, v0.h[2]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v11.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[2]
sqrdmulh v23.8h, v11.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[2]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v17.8h, v1.h[3]
mul v30.8h, v18.8h, v1.h[3]
sqrdmulh v25.8h, v17.8h, v0.h[3]
sqrdmulh v26.8h, v18.8h, v0.h[3]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[3]
mul v30.8h, v20.8h, v1.h[3]
sqrdmulh v27.8h, v19.8h, v0.h[3]
sqrdmulh v28.8h, v20.8h, v0.h[3]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v9.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v10.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v11.8h, v7.8h, v23.8h
add v7.8h, v7.8h, v23.8h
sub v12.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
sub v17.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v18.8h, v14.8h, v26.8h
add v14.8h, v14.8h, v26.8h
sub v19.8h, v15.8h, v27.8h
add v15.8h, v15.8h, v27.8h
sub v20.8h, v16.8h, v28.8h
add v16.8h, v16.8h, v28.8h
mul v29.8h, v7.8h, v1.h[4]
mul v30.8h, v8.8h, v1.h[4]
sqrdmulh v21.8h, v7.8h, v0.h[4]
sqrdmulh v22.8h, v8.8h, v0.h[4]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v11.8h, v1.h[5]
mul v30.8h, v12.8h, v1.h[5]
sqrdmulh v23.8h, v11.8h, v0.h[5]
sqrdmulh v24.8h, v12.8h, v0.h[5]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v15.8h, v1.h[6]
mul v30.8h, v16.8h, v1.h[6]
sqrdmulh v25.8h, v15.8h, v0.h[6]
sqrdmulh v26.8h, v16.8h, v0.h[6]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[7]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v19.8h, v0.h[7]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v7.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v11.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v10.8h, v24.8h
add v10.8h, v10.8h, v24.8h
sub v15.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v14.8h, v26.8h
add v14.8h, v14.8h, v26.8h
sub v19.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v18.8h, v28.8h
add v18.8h, v18.8h, v28.8h
ldr q0, [x2, #16]
ldr q1, [x3, #16]
mul v29.8h, v6.8h, v1.h[0]
mul v30.8h, v8.8h, v1.h[1]
sqrdmulh v21.8h, v6.8h, v0.h[0]
sqrdmulh v22.8h, v8.8h, v0.h[1]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v10.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[3]
sqrdmulh v23.8h, v10.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[3]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v14.8h, v1.h[4]
mul v30.8h, v16.8h, v1.h[5]
sqrdmulh v25.8h, v14.8h, v0.h[4]
sqrdmulh v26.8h, v16.8h, v0.h[5]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v18.8h, v1.h[6]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v18.8h, v0.h[6]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
str q5, [x0]
str q6, [x0, #32]
str q7, [x0, #64]
str q8, [x0, #96]
str q9, [x0, #128]
str q10, [x0, #160]
str q11, [x0, #192]
str q12, [x0, #224]
str q13, [x1]
str q14, [x1, #32]
str q15, [x1, #64]
str q16, [x1, #96]
str q17, [x1, #128]
str q18, [x1, #160]
str q19, [x1, #192]
str q20, [x1, #224]
ldr q5, [x0, #16]
ldr q6, [x0, #48]
ldr q7, [x0, #80]
ldr q8, [x0, #112]
ldr q9, [x0, #144]
ldr q10, [x0, #176]
ldr q11, [x0, #208]
ldr q12, [x0, #240]
ldr q13, [x1, #16]
ldr q14, [x1, #48]
ldr q15, [x1, #80]
ldr q16, [x1, #112]
ldr q17, [x1, #144]
ldr q18, [x1, #176]
ldr q19, [x1, #208]
ldr q20, [x1, #240]
ldr q0, [x2]
ldr q1, [x3]
mul v29.8h, v13.8h, v1.h[1]
mul v30.8h, v14.8h, v1.h[1]
sqrdmulh v21.8h, v13.8h, v0.h[1]
sqrdmulh v22.8h, v14.8h, v0.h[1]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v15.8h, v1.h[1]
mul v30.8h, v16.8h, v1.h[1]
sqrdmulh v23.8h, v15.8h, v0.h[1]
sqrdmulh v24.8h, v16.8h, v0.h[1]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v17.8h, v1.h[1]
mul v30.8h, v18.8h, v1.h[1]
sqrdmulh v25.8h, v17.8h, v0.h[1]
sqrdmulh v26.8h, v18.8h, v0.h[1]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[1]
mul v30.8h, v20.8h, v1.h[1]
sqrdmulh v27.8h, v19.8h, v0.h[1]
sqrdmulh v28.8h, v20.8h, v0.h[1]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v13.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v14.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v15.8h, v7.8h, v23.8h
add v7.8h, v7.8h, v23.8h
sub v16.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
sub v17.8h, v9.8h, v25.8h
add v9.8h, v9.8h, v25.8h
sub v18.8h, v10.8h, v26.8h
add v10.8h, v10.8h, v26.8h
sub v19.8h, v11.8h, v27.8h
add v11.8h, v11.8h, v27.8h
sub v20.8h, v12.8h, v28.8h
add v12.8h, v12.8h, v28.8h
mul v29.8h, v9.8h, v1.h[2]
mul v30.8h, v10.8h, v1.h[2]
sqrdmulh v21.8h, v9.8h, v0.h[2]
sqrdmulh v22.8h, v10.8h, v0.h[2]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v11.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[2]
sqrdmulh v23.8h, v11.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[2]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v17.8h, v1.h[3]
mul v30.8h, v18.8h, v1.h[3]
sqrdmulh v25.8h, v17.8h, v0.h[3]
sqrdmulh v26.8h, v18.8h, v0.h[3]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[3]
mul v30.8h, v20.8h, v1.h[3]
sqrdmulh v27.8h, v19.8h, v0.h[3]
sqrdmulh v28.8h, v20.8h, v0.h[3]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v9.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v10.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v11.8h, v7.8h, v23.8h
add v7.8h, v7.8h, v23.8h
sub v12.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
sub v17.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v18.8h, v14.8h, v26.8h
add v14.8h, v14.8h, v26.8h
sub v19.8h, v15.8h, v27.8h
add v15.8h, v15.8h, v27.8h
sub v20.8h, v16.8h, v28.8h
add v16.8h, v16.8h, v28.8h
mul v29.8h, v7.8h, v1.h[4]
mul v30.8h, v8.8h, v1.h[4]
sqrdmulh v21.8h, v7.8h, v0.h[4]
sqrdmulh v22.8h, v8.8h, v0.h[4]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v11.8h, v1.h[5]
mul v30.8h, v12.8h, v1.h[5]
sqrdmulh v23.8h, v11.8h, v0.h[5]
sqrdmulh v24.8h, v12.8h, v0.h[5]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v15.8h, v1.h[6]
mul v30.8h, v16.8h, v1.h[6]
sqrdmulh v25.8h, v15.8h, v0.h[6]
sqrdmulh v26.8h, v16.8h, v0.h[6]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v19.8h, v1.h[7]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v19.8h, v0.h[7]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v7.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v6.8h, v22.8h
add v6.8h, v6.8h, v22.8h
sub v11.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v10.8h, v24.8h
add v10.8h, v10.8h, v24.8h
sub v15.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v14.8h, v26.8h
add v14.8h, v14.8h, v26.8h
sub v19.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v18.8h, v28.8h
add v18.8h, v18.8h, v28.8h
ldr q0, [x2, #16]
ldr q1, [x3, #16]
mul v29.8h, v6.8h, v1.h[0]
mul v30.8h, v8.8h, v1.h[1]
sqrdmulh v21.8h, v6.8h, v0.h[0]
sqrdmulh v22.8h, v8.8h, v0.h[1]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v10.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[3]
sqrdmulh v23.8h, v10.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[3]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v14.8h, v1.h[4]
mul v30.8h, v16.8h, v1.h[5]
sqrdmulh v25.8h, v14.8h, v0.h[4]
sqrdmulh v26.8h, v16.8h, v0.h[5]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v18.8h, v1.h[6]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v18.8h, v0.h[6]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
str q5, [x0, #16]
str q6, [x0, #48]
str q7, [x0, #80]
str q8, [x0, #112]
str q9, [x0, #144]
str q10, [x0, #176]
str q11, [x0, #208]
str q12, [x0, #240]
str q13, [x1, #16]
str q14, [x1, #48]
str q15, [x1, #80]
str q16, [x1, #112]
str q17, [x1, #144]
str q18, [x1, #176]
str q19, [x1, #208]
str q20, [x1, #240]
ldp q5, q6, [x0]
ldp q7, q8, [x0, #32]
ldp q9, q10, [x0, #64]
ldp q11, q12, [x0, #96]
ldp q13, q14, [x0, #128]
ldp q15, q16, [x0, #160]
ldp q17, q18, [x0, #192]
ldp q19, q20, [x0, #224]
ldr q0, [x2, #32]
ldr q1, [x3, #32]
mul v29.8h, v6.8h, v1.h[0]
mul v30.8h, v8.8h, v1.h[1]
sqrdmulh v21.8h, v6.8h, v0.h[0]
sqrdmulh v22.8h, v8.8h, v0.h[1]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v10.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[3]
sqrdmulh v23.8h, v10.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[3]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v14.8h, v1.h[4]
mul v30.8h, v16.8h, v1.h[5]
sqrdmulh v25.8h, v14.8h, v0.h[4]
sqrdmulh v26.8h, v16.8h, v0.h[5]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v18.8h, v1.h[6]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v18.8h, v0.h[6]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
ldr q0, [x2, #64]
ldr q2, [x2, #80]
ldr q1, [x3, #64]
ldr q3, [x3, #80]
mov v29.16b, v5.16b
mov v30.16b, v7.16b
trn1 v5.2d, v5.2d, v6.2d
trn1 v7.2d, v7.2d, v8.2d
trn2 v6.2d, v29.2d, v6.2d
trn2 v8.2d, v30.2d, v8.2d
mul v29.8h, v6.8h, v1.8h
mul v30.8h, v8.8h, v3.8h
sqrdmulh v21.8h, v6.8h, v0.8h
sqrdmulh v22.8h, v8.8h, v2.8h
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
ldr q0, [x2, #96]
ldr q2, [x2, #112]
ldr q1, [x3, #96]
ldr q3, [x3, #112]
mov v29.16b, v9.16b
mov v30.16b, v11.16b
trn1 v9.2d, v9.2d, v10.2d
trn1 v11.2d, v11.2d, v12.2d
trn2 v10.2d, v29.2d, v10.2d
trn2 v12.2d, v30.2d, v12.2d
mul v29.8h, v10.8h, v1.8h
mul v30.8h, v12.8h, v3.8h
sqrdmulh v23.8h, v10.8h, v0.8h
sqrdmulh v24.8h, v12.8h, v2.8h
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #128]
ldr q2, [x2, #144]
ldr q1, [x3, #128]
ldr q3, [x3, #144]
mov v29.16b, v13.16b
mov v30.16b, v15.16b
trn1 v13.2d, v13.2d, v14.2d
trn1 v15.2d, v15.2d, v16.2d
trn2 v14.2d, v29.2d, v14.2d
trn2 v16.2d, v30.2d, v16.2d
mul v29.8h, v14.8h, v1.8h
mul v30.8h, v16.8h, v3.8h
sqrdmulh v25.8h, v14.8h, v0.8h
sqrdmulh v26.8h, v16.8h, v2.8h
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
ldr q0, [x2, #160]
ldr q2, [x2, #176]
ldr q1, [x3, #160]
ldr q3, [x3, #176]
mov v29.16b, v17.16b
mov v30.16b, v19.16b
trn1 v17.2d, v17.2d, v18.2d
trn1 v19.2d, v19.2d, v20.2d
trn2 v18.2d, v29.2d, v18.2d
trn2 v20.2d, v30.2d, v20.2d
mul v29.8h, v18.8h, v1.8h
mul v30.8h, v20.8h, v3.8h
sqrdmulh v27.8h, v18.8h, v0.8h
sqrdmulh v28.8h, v20.8h, v2.8h
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
ldr q0, [x2, #320]
ldr q2, [x2, #336]
ldr q1, [x3, #320]
ldr q3, [x3, #336]
mov v29.16b, v5.16b
mov v30.16b, v7.16b
trn1 v5.4s, v5.4s, v6.4s
trn1 v7.4s, v7.4s, v8.4s
trn2 v6.4s, v29.4s, v6.4s
trn2 v8.4s, v30.4s, v8.4s
mul v29.8h, v6.8h, v1.8h
mul v30.8h, v8.8h, v3.8h
sqrdmulh v21.8h, v6.8h, v0.8h
sqrdmulh v22.8h, v8.8h, v2.8h
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
ldr q0, [x2, #352]
ldr q2, [x2, #368]
ldr q1, [x3, #352]
ldr q3, [x3, #368]
mov v29.16b, v9.16b
mov v30.16b, v11.16b
trn1 v9.4s, v9.4s, v10.4s
trn1 v11.4s, v11.4s, v12.4s
trn2 v10.4s, v29.4s, v10.4s
trn2 v12.4s, v30.4s, v12.4s
mul v29.8h, v10.8h, v1.8h
mul v30.8h, v12.8h, v3.8h
sqrdmulh v23.8h, v10.8h, v0.8h
sqrdmulh v24.8h, v12.8h, v2.8h
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #384]
ldr q2, [x2, #400]
ldr q1, [x3, #384]
ldr q3, [x3, #400]
mov v29.16b, v13.16b
mov v30.16b, v15.16b
trn1 v13.4s, v13.4s, v14.4s
trn1 v15.4s, v15.4s, v16.4s
trn2 v14.4s, v29.4s, v14.4s
trn2 v16.4s, v30.4s, v16.4s
mul v29.8h, v14.8h, v1.8h
mul v30.8h, v16.8h, v3.8h
sqrdmulh v25.8h, v14.8h, v0.8h
sqrdmulh v26.8h, v16.8h, v2.8h
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
ldr q0, [x2, #416]
ldr q2, [x2, #432]
ldr q1, [x3, #416]
ldr q3, [x3, #432]
mov v29.16b, v17.16b
mov v30.16b, v19.16b
trn1 v17.4s, v17.4s, v18.4s
trn1 v19.4s, v19.4s, v20.4s
trn2 v18.4s, v29.4s, v18.4s
trn2 v20.4s, v30.4s, v20.4s
mul v29.8h, v18.8h, v1.8h
mul v30.8h, v20.8h, v3.8h
sqrdmulh v27.8h, v18.8h, v0.8h
sqrdmulh v28.8h, v20.8h, v2.8h
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
sqdmulh v21.8h, v5.8h, v4.h[2]
sqdmulh v22.8h, v6.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v5.8h, v21.8h, v4.h[0]
mls v6.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v7.8h, v4.h[2]
sqdmulh v22.8h, v8.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v7.8h, v21.8h, v4.h[0]
mls v8.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v9.8h, v4.h[2]
sqdmulh v22.8h, v10.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v9.8h, v21.8h, v4.h[0]
mls v10.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v11.8h, v4.h[2]
sqdmulh v22.8h, v12.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v11.8h, v21.8h, v4.h[0]
mls v12.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v13.8h, v4.h[2]
sqdmulh v22.8h, v14.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v13.8h, v21.8h, v4.h[0]
mls v14.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v15.8h, v4.h[2]
sqdmulh v22.8h, v16.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v15.8h, v21.8h, v4.h[0]
mls v16.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v17.8h, v4.h[2]
sqdmulh v22.8h, v18.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v17.8h, v21.8h, v4.h[0]
mls v18.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v19.8h, v4.h[2]
sqdmulh v22.8h, v20.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v19.8h, v21.8h, v4.h[0]
mls v20.8h, v22.8h, v4.h[0]
mov v29.16b, v5.16b
trn1 v5.4s, v5.4s, v6.4s
trn2 v6.4s, v29.4s, v6.4s
mov v29.16b, v5.16b
trn1 v5.2d, v5.2d, v6.2d
trn2 v6.2d, v29.2d, v6.2d
mov v29.16b, v7.16b
trn1 v7.4s, v7.4s, v8.4s
trn2 v8.4s, v29.4s, v8.4s
mov v29.16b, v7.16b
trn1 v7.2d, v7.2d, v8.2d
trn2 v8.2d, v29.2d, v8.2d
mov v29.16b, v9.16b
trn1 v9.4s, v9.4s, v10.4s
trn2 v10.4s, v29.4s, v10.4s
mov v29.16b, v9.16b
trn1 v9.2d, v9.2d, v10.2d
trn2 v10.2d, v29.2d, v10.2d
mov v29.16b, v11.16b
trn1 v11.4s, v11.4s, v12.4s
trn2 v12.4s, v29.4s, v12.4s
mov v29.16b, v11.16b
trn1 v11.2d, v11.2d, v12.2d
trn2 v12.2d, v29.2d, v12.2d
mov v29.16b, v13.16b
trn1 v13.4s, v13.4s, v14.4s
trn2 v14.4s, v29.4s, v14.4s
mov v29.16b, v13.16b
trn1 v13.2d, v13.2d, v14.2d
trn2 v14.2d, v29.2d, v14.2d
mov v29.16b, v15.16b
trn1 v15.4s, v15.4s, v16.4s
trn2 v16.4s, v29.4s, v16.4s
mov v29.16b, v15.16b
trn1 v15.2d, v15.2d, v16.2d
trn2 v16.2d, v29.2d, v16.2d
mov v29.16b, v17.16b
trn1 v17.4s, v17.4s, v18.4s
trn2 v18.4s, v29.4s, v18.4s
mov v29.16b, v17.16b
trn1 v17.2d, v17.2d, v18.2d
trn2 v18.2d, v29.2d, v18.2d
mov v29.16b, v19.16b
trn1 v19.4s, v19.4s, v20.4s
trn2 v20.4s, v29.4s, v20.4s
mov v29.16b, v19.16b
trn1 v19.2d, v19.2d, v20.2d
trn2 v20.2d, v29.2d, v20.2d
stp q5, q6, [x0]
stp q7, q8, [x0, #32]
stp q9, q10, [x0, #64]
stp q11, q12, [x0, #96]
stp q13, q14, [x0, #128]
stp q15, q16, [x0, #160]
stp q17, q18, [x0, #192]
stp q19, q20, [x0, #224]
ldp q5, q6, [x1]
ldp q7, q8, [x1, #32]
ldp q9, q10, [x1, #64]
ldp q11, q12, [x1, #96]
ldp q13, q14, [x1, #128]
ldp q15, q16, [x1, #160]
ldp q17, q18, [x1, #192]
ldp q19, q20, [x1, #224]
ldr q0, [x2, #48]
ldr q1, [x3, #48]
mul v29.8h, v6.8h, v1.h[0]
mul v30.8h, v8.8h, v1.h[1]
sqrdmulh v21.8h, v6.8h, v0.h[0]
sqrdmulh v22.8h, v8.8h, v0.h[1]
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v29.8h, v10.8h, v1.h[2]
mul v30.8h, v12.8h, v1.h[3]
sqrdmulh v23.8h, v10.8h, v0.h[2]
sqrdmulh v24.8h, v12.8h, v0.h[3]
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v29.8h, v14.8h, v1.h[4]
mul v30.8h, v16.8h, v1.h[5]
sqrdmulh v25.8h, v14.8h, v0.h[4]
sqrdmulh v26.8h, v16.8h, v0.h[5]
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
mul v29.8h, v18.8h, v1.h[6]
mul v30.8h, v20.8h, v1.h[7]
sqrdmulh v27.8h, v18.8h, v0.h[6]
sqrdmulh v28.8h, v20.8h, v0.h[7]
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
ldr q0, [x2, #192]
ldr q2, [x2, #208]
ldr q1, [x3, #192]
ldr q3, [x3, #208]
mov v29.16b, v5.16b
mov v30.16b, v7.16b
trn1 v5.2d, v5.2d, v6.2d
trn1 v7.2d, v7.2d, v8.2d
trn2 v6.2d, v29.2d, v6.2d
trn2 v8.2d, v30.2d, v8.2d
mul v29.8h, v6.8h, v1.8h
mul v30.8h, v8.8h, v3.8h
sqrdmulh v21.8h, v6.8h, v0.8h
sqrdmulh v22.8h, v8.8h, v2.8h
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
ldr q0, [x2, #224]
ldr q2, [x2, #240]
ldr q1, [x3, #224]
ldr q3, [x3, #240]
mov v29.16b, v9.16b
mov v30.16b, v11.16b
trn1 v9.2d, v9.2d, v10.2d
trn1 v11.2d, v11.2d, v12.2d
trn2 v10.2d, v29.2d, v10.2d
trn2 v12.2d, v30.2d, v12.2d
mul v29.8h, v10.8h, v1.8h
mul v30.8h, v12.8h, v3.8h
sqrdmulh v23.8h, v10.8h, v0.8h
sqrdmulh v24.8h, v12.8h, v2.8h
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #256]
ldr q2, [x2, #272]
ldr q1, [x3, #256]
ldr q3, [x3, #272]
mov v29.16b, v13.16b
mov v30.16b, v15.16b
trn1 v13.2d, v13.2d, v14.2d
trn1 v15.2d, v15.2d, v16.2d
trn2 v14.2d, v29.2d, v14.2d
trn2 v16.2d, v30.2d, v16.2d
mul v29.8h, v14.8h, v1.8h
mul v30.8h, v16.8h, v3.8h
sqrdmulh v25.8h, v14.8h, v0.8h
sqrdmulh v26.8h, v16.8h, v2.8h
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
ldr q0, [x2, #288]
ldr q2, [x2, #304]
ldr q1, [x3, #288]
ldr q3, [x3, #304]
mov v29.16b, v17.16b
mov v30.16b, v19.16b
trn1 v17.2d, v17.2d, v18.2d
trn1 v19.2d, v19.2d, v20.2d
trn2 v18.2d, v29.2d, v18.2d
trn2 v20.2d, v30.2d, v20.2d
mul v29.8h, v18.8h, v1.8h
mul v30.8h, v20.8h, v3.8h
sqrdmulh v27.8h, v18.8h, v0.8h
sqrdmulh v28.8h, v20.8h, v2.8h
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
ldr q0, [x2, #448]
ldr q2, [x2, #464]
ldr q1, [x3, #448]
ldr q3, [x3, #464]
mov v29.16b, v5.16b
mov v30.16b, v7.16b
trn1 v5.4s, v5.4s, v6.4s
trn1 v7.4s, v7.4s, v8.4s
trn2 v6.4s, v29.4s, v6.4s
trn2 v8.4s, v30.4s, v8.4s
mul v29.8h, v6.8h, v1.8h
mul v30.8h, v8.8h, v3.8h
sqrdmulh v21.8h, v6.8h, v0.8h
sqrdmulh v22.8h, v8.8h, v2.8h
sqrdmlsh v21.8h, v29.8h, v4.h[0]
sqrdmlsh v22.8h, v30.8h, v4.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
ldr q0, [x2, #480]
ldr q2, [x2, #496]
ldr q1, [x3, #480]
ldr q3, [x3, #496]
mov v29.16b, v9.16b
mov v30.16b, v11.16b
trn1 v9.4s, v9.4s, v10.4s
trn1 v11.4s, v11.4s, v12.4s
trn2 v10.4s, v29.4s, v10.4s
trn2 v12.4s, v30.4s, v12.4s
mul v29.8h, v10.8h, v1.8h
mul v30.8h, v12.8h, v3.8h
sqrdmulh v23.8h, v10.8h, v0.8h
sqrdmulh v24.8h, v12.8h, v2.8h
sqrdmlsh v23.8h, v29.8h, v4.h[0]
sqrdmlsh v24.8h, v30.8h, v4.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #512]
ldr q2, [x2, #528]
ldr q1, [x3, #512]
ldr q3, [x3, #528]
mov v29.16b, v13.16b
mov v30.16b, v15.16b
trn1 v13.4s, v13.4s, v14.4s
trn1 v15.4s, v15.4s, v16.4s
trn2 v14.4s, v29.4s, v14.4s
trn2 v16.4s, v30.4s, v16.4s
mul v29.8h, v14.8h, v1.8h
mul v30.8h, v16.8h, v3.8h
sqrdmulh v25.8h, v14.8h, v0.8h
sqrdmulh v26.8h, v16.8h, v2.8h
sqrdmlsh v25.8h, v29.8h, v4.h[0]
sqrdmlsh v26.8h, v30.8h, v4.h[0]
sshr v25.8h, v25.8h, #1
sshr v26.8h, v26.8h, #1
ldr q0, [x2, #544]
ldr q2, [x2, #560]
ldr q1, [x3, #544]
ldr q3, [x3, #560]
mov v29.16b, v17.16b
mov v30.16b, v19.16b
trn1 v17.4s, v17.4s, v18.4s
trn1 v19.4s, v19.4s, v20.4s
trn2 v18.4s, v29.4s, v18.4s
trn2 v20.4s, v30.4s, v20.4s
mul v29.8h, v18.8h, v1.8h
mul v30.8h, v20.8h, v3.8h
sqrdmulh v27.8h, v18.8h, v0.8h
sqrdmulh v28.8h, v20.8h, v2.8h
sqrdmlsh v27.8h, v29.8h, v4.h[0]
sqrdmlsh v28.8h, v30.8h, v4.h[0]
sshr v27.8h, v27.8h, #1
sshr v28.8h, v28.8h, #1
sub v6.8h, v5.8h, v21.8h
add v5.8h, v5.8h, v21.8h
sub v8.8h, v7.8h, v22.8h
add v7.8h, v7.8h, v22.8h
sub v10.8h, v9.8h, v23.8h
add v9.8h, v9.8h, v23.8h
sub v12.8h, v11.8h, v24.8h
add v11.8h, v11.8h, v24.8h
sub v14.8h, v13.8h, v25.8h
add v13.8h, v13.8h, v25.8h
sub v16.8h, v15.8h, v26.8h
add v15.8h, v15.8h, v26.8h
sub v18.8h, v17.8h, v27.8h
add v17.8h, v17.8h, v27.8h
sub v20.8h, v19.8h, v28.8h
add v19.8h, v19.8h, v28.8h
sqdmulh v21.8h, v5.8h, v4.h[2]
sqdmulh v22.8h, v6.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v5.8h, v21.8h, v4.h[0]
mls v6.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v7.8h, v4.h[2]
sqdmulh v22.8h, v8.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v7.8h, v21.8h, v4.h[0]
mls v8.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v9.8h, v4.h[2]
sqdmulh v22.8h, v10.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v9.8h, v21.8h, v4.h[0]
mls v10.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v11.8h, v4.h[2]
sqdmulh v22.8h, v12.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v11.8h, v21.8h, v4.h[0]
mls v12.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v13.8h, v4.h[2]
sqdmulh v22.8h, v14.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v13.8h, v21.8h, v4.h[0]
mls v14.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v15.8h, v4.h[2]
sqdmulh v22.8h, v16.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v15.8h, v21.8h, v4.h[0]
mls v16.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v17.8h, v4.h[2]
sqdmulh v22.8h, v18.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v17.8h, v21.8h, v4.h[0]
mls v18.8h, v22.8h, v4.h[0]
sqdmulh v21.8h, v19.8h, v4.h[2]
sqdmulh v22.8h, v20.8h, v4.h[2]
sshr v21.8h, v21.8h, #11
sshr v22.8h, v22.8h, #11
mls v19.8h, v21.8h, v4.h[0]
mls v20.8h, v22.8h, v4.h[0]
mov v29.16b, v5.16b
trn1 v5.4s, v5.4s, v6.4s
trn2 v6.4s, v29.4s, v6.4s
mov v29.16b, v5.16b
trn1 v5.2d, v5.2d, v6.2d
trn2 v6.2d, v29.2d, v6.2d
mov v29.16b, v7.16b
trn1 v7.4s, v7.4s, v8.4s
trn2 v8.4s, v29.4s, v8.4s
mov v29.16b, v7.16b
trn1 v7.2d, v7.2d, v8.2d
trn2 v8.2d, v29.2d, v8.2d
mov v29.16b, v9.16b
trn1 v9.4s, v9.4s, v10.4s
trn2 v10.4s, v29.4s, v10.4s
mov v29.16b, v9.16b
trn1 v9.2d, v9.2d, v10.2d
trn2 v10.2d, v29.2d, v10.2d
mov v29.16b, v11.16b
trn1 v11.4s, v11.4s, v12.4s
trn2 v12.4s, v29.4s, v12.4s
mov v29.16b, v11.16b
trn1 v11.2d, v11.2d, v12.2d
trn2 v12.2d, v29.2d, v12.2d
mov v29.16b, v13.16b
trn1 v13.4s, v13.4s, v14.4s
trn2 v14.4s, v29.4s, v14.4s
mov v29.16b, v13.16b
trn1 v13.2d, v13.2d, v14.2d
trn2 v14.2d, v29.2d, v14.2d
mov v29.16b, v15.16b
trn1 v15.4s, v15.4s, v16.4s
trn2 v16.4s, v29.4s, v16.4s
mov v29.16b, v15.16b
trn1 v15.2d, v15.2d, v16.2d
trn2 v16.2d, v29.2d, v16.2d
mov v29.16b, v17.16b
trn1 v17.4s, v17.4s, v18.4s
trn2 v18.4s, v29.4s, v18.4s
mov v29.16b, v17.16b
trn1 v17.2d, v17.2d, v18.2d
trn2 v18.2d, v29.2d, v18.2d
mov v29.16b, v19.16b
trn1 v19.4s, v19.4s, v20.4s
trn2 v20.4s, v29.4s, v20.4s
mov v29.16b, v19.16b
trn1 v19.2d, v19.2d, v20.2d
trn2 v20.2d, v29.2d, v20.2d
stp q5, q6, [x1]
stp q7, q8, [x1, #32]
stp q9, q10, [x1, #64]
stp q11, q12, [x1, #96]
stp q13, q14, [x1, #128]
stp q15, q16, [x1, #160]
stp q17, q18, [x1, #192]
stp q19, q20, [x1, #224]
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_ntt_sqrdmlsh,.-mlkem_ntt_sqrdmlsh
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl mlkem_invntt_sqrdmlsh
.type mlkem_invntt_sqrdmlsh,@function
.align 2
mlkem_invntt_sqrdmlsh:
#else
.section __TEXT,__text
.globl _mlkem_invntt_sqrdmlsh
.p2align 2
_mlkem_invntt_sqrdmlsh:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
#ifndef __APPLE__
adrp x2, L_mlkem_aarch64_zetas_inv
add x2, x2, :lo12:L_mlkem_aarch64_zetas_inv
#else
adrp x2, L_mlkem_aarch64_zetas_inv@PAGE
add x2, x2, L_mlkem_aarch64_zetas_inv@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x3, L_mlkem_aarch64_zetas_inv_qinv
add x3, x3, :lo12:L_mlkem_aarch64_zetas_inv_qinv
#else
adrp x3, L_mlkem_aarch64_zetas_inv_qinv@PAGE
add x3, x3, L_mlkem_aarch64_zetas_inv_qinv@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x4, L_mlkem_aarch64_consts
add x4, x4, :lo12:L_mlkem_aarch64_consts
#else
adrp x4, L_mlkem_aarch64_consts@PAGE
add x4, x4, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
add x1, x0, #0x100
ldr q8, [x4]
ldp q9, q10, [x0]
ldp q11, q12, [x0, #32]
ldp q13, q14, [x0, #64]
ldp q15, q16, [x0, #96]
ldp q17, q18, [x0, #128]
ldp q19, q20, [x0, #160]
ldp q21, q22, [x0, #192]
ldp q23, q24, [x0, #224]
mov v25.16b, v9.16b
trn1 v9.2d, v9.2d, v10.2d
trn2 v10.2d, v25.2d, v10.2d
mov v25.16b, v9.16b
trn1 v9.4s, v9.4s, v10.4s
trn2 v10.4s, v25.4s, v10.4s
mov v25.16b, v11.16b
trn1 v11.2d, v11.2d, v12.2d
trn2 v12.2d, v25.2d, v12.2d
mov v25.16b, v11.16b
trn1 v11.4s, v11.4s, v12.4s
trn2 v12.4s, v25.4s, v12.4s
mov v25.16b, v13.16b
trn1 v13.2d, v13.2d, v14.2d
trn2 v14.2d, v25.2d, v14.2d
mov v25.16b, v13.16b
trn1 v13.4s, v13.4s, v14.4s
trn2 v14.4s, v25.4s, v14.4s
mov v25.16b, v15.16b
trn1 v15.2d, v15.2d, v16.2d
trn2 v16.2d, v25.2d, v16.2d
mov v25.16b, v15.16b
trn1 v15.4s, v15.4s, v16.4s
trn2 v16.4s, v25.4s, v16.4s
mov v25.16b, v17.16b
trn1 v17.2d, v17.2d, v18.2d
trn2 v18.2d, v25.2d, v18.2d
mov v25.16b, v17.16b
trn1 v17.4s, v17.4s, v18.4s
trn2 v18.4s, v25.4s, v18.4s
mov v25.16b, v19.16b
trn1 v19.2d, v19.2d, v20.2d
trn2 v20.2d, v25.2d, v20.2d
mov v25.16b, v19.16b
trn1 v19.4s, v19.4s, v20.4s
trn2 v20.4s, v25.4s, v20.4s
mov v25.16b, v21.16b
trn1 v21.2d, v21.2d, v22.2d
trn2 v22.2d, v25.2d, v22.2d
mov v25.16b, v21.16b
trn1 v21.4s, v21.4s, v22.4s
trn2 v22.4s, v25.4s, v22.4s
mov v25.16b, v23.16b
trn1 v23.2d, v23.2d, v24.2d
trn2 v24.2d, v25.2d, v24.2d
mov v25.16b, v23.16b
trn1 v23.4s, v23.4s, v24.4s
trn2 v24.4s, v25.4s, v24.4s
ldr q0, [x2]
ldr q1, [x2, #16]
ldr q2, [x3]
ldr q3, [x3, #16]
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v10.8h, v26.8h, v0.8h
sqrdmulh v12.8h, v28.8h, v1.8h
sqrdmlsh v10.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
ldr q0, [x2, #32]
ldr q1, [x2, #48]
ldr q2, [x3, #32]
ldr q3, [x3, #48]
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v14.8h, v26.8h, v0.8h
sqrdmulh v16.8h, v28.8h, v1.8h
sqrdmlsh v14.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
ldr q0, [x2, #64]
ldr q1, [x2, #80]
ldr q2, [x3, #64]
ldr q3, [x3, #80]
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v18.8h, v26.8h, v0.8h
sqrdmulh v20.8h, v28.8h, v1.8h
sqrdmlsh v18.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
ldr q0, [x2, #96]
ldr q1, [x2, #112]
ldr q2, [x3, #96]
ldr q3, [x3, #112]
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v22.8h, v26.8h, v0.8h
sqrdmulh v24.8h, v28.8h, v1.8h
sqrdmlsh v22.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #256]
ldr q1, [x2, #272]
ldr q2, [x3, #256]
ldr q3, [x3, #272]
mov v25.16b, v9.16b
mov v26.16b, v11.16b
trn1 v9.4s, v9.4s, v10.4s
trn1 v11.4s, v11.4s, v12.4s
trn2 v10.4s, v25.4s, v10.4s
trn2 v12.4s, v26.4s, v12.4s
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v10.8h, v26.8h, v0.8h
sqrdmulh v12.8h, v28.8h, v1.8h
sqrdmlsh v10.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
ldr q0, [x2, #288]
ldr q1, [x2, #304]
ldr q2, [x3, #288]
ldr q3, [x3, #304]
mov v25.16b, v13.16b
mov v26.16b, v15.16b
trn1 v13.4s, v13.4s, v14.4s
trn1 v15.4s, v15.4s, v16.4s
trn2 v14.4s, v25.4s, v14.4s
trn2 v16.4s, v26.4s, v16.4s
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v14.8h, v26.8h, v0.8h
sqrdmulh v16.8h, v28.8h, v1.8h
sqrdmlsh v14.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
ldr q0, [x2, #320]
ldr q1, [x2, #336]
ldr q2, [x3, #320]
ldr q3, [x3, #336]
mov v25.16b, v17.16b
mov v26.16b, v19.16b
trn1 v17.4s, v17.4s, v18.4s
trn1 v19.4s, v19.4s, v20.4s
trn2 v18.4s, v25.4s, v18.4s
trn2 v20.4s, v26.4s, v20.4s
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v18.8h, v26.8h, v0.8h
sqrdmulh v20.8h, v28.8h, v1.8h
sqrdmlsh v18.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
ldr q0, [x2, #352]
ldr q1, [x2, #368]
ldr q2, [x3, #352]
ldr q3, [x3, #368]
mov v25.16b, v21.16b
mov v26.16b, v23.16b
trn1 v21.4s, v21.4s, v22.4s
trn1 v23.4s, v23.4s, v24.4s
trn2 v22.4s, v25.4s, v22.4s
trn2 v24.4s, v26.4s, v24.4s
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v22.8h, v26.8h, v0.8h
sqrdmulh v24.8h, v28.8h, v1.8h
sqrdmlsh v22.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #512]
ldr q2, [x3, #512]
mov v25.16b, v9.16b
mov v26.16b, v11.16b
trn1 v9.2d, v9.2d, v10.2d
trn1 v11.2d, v11.2d, v12.2d
trn2 v10.2d, v25.2d, v10.2d
trn2 v12.2d, v26.2d, v12.2d
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.h[0]
mul v27.8h, v28.8h, v2.h[1]
sqrdmulh v10.8h, v26.8h, v0.h[0]
sqrdmulh v12.8h, v28.8h, v0.h[1]
sqrdmlsh v10.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
mov v25.16b, v13.16b
mov v26.16b, v15.16b
trn1 v13.2d, v13.2d, v14.2d
trn1 v15.2d, v15.2d, v16.2d
trn2 v14.2d, v25.2d, v14.2d
trn2 v16.2d, v26.2d, v16.2d
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.h[2]
mul v27.8h, v28.8h, v2.h[3]
sqrdmulh v14.8h, v26.8h, v0.h[2]
sqrdmulh v16.8h, v28.8h, v0.h[3]
sqrdmlsh v14.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
mov v25.16b, v17.16b
mov v26.16b, v19.16b
trn1 v17.2d, v17.2d, v18.2d
trn1 v19.2d, v19.2d, v20.2d
trn2 v18.2d, v25.2d, v18.2d
trn2 v20.2d, v26.2d, v20.2d
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.h[4]
mul v27.8h, v28.8h, v2.h[5]
sqrdmulh v18.8h, v26.8h, v0.h[4]
sqrdmulh v20.8h, v28.8h, v0.h[5]
sqrdmlsh v18.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
mov v25.16b, v21.16b
mov v26.16b, v23.16b
trn1 v21.2d, v21.2d, v22.2d
trn1 v23.2d, v23.2d, v24.2d
trn2 v22.2d, v25.2d, v22.2d
trn2 v24.2d, v26.2d, v24.2d
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.h[6]
mul v27.8h, v28.8h, v2.h[7]
sqrdmulh v22.8h, v26.8h, v0.h[6]
sqrdmulh v24.8h, v28.8h, v0.h[7]
sqrdmlsh v22.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
sqdmulh v25.8h, v9.8h, v8.h[2]
sqdmulh v26.8h, v11.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v9.8h, v25.8h, v8.h[0]
mls v11.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v13.8h, v8.h[2]
sqdmulh v26.8h, v15.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v13.8h, v25.8h, v8.h[0]
mls v15.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v17.8h, v8.h[2]
sqdmulh v26.8h, v19.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v17.8h, v25.8h, v8.h[0]
mls v19.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v21.8h, v8.h[2]
sqdmulh v26.8h, v23.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v21.8h, v25.8h, v8.h[0]
mls v23.8h, v26.8h, v8.h[0]
stp q9, q10, [x0]
stp q11, q12, [x0, #32]
stp q13, q14, [x0, #64]
stp q15, q16, [x0, #96]
stp q17, q18, [x0, #128]
stp q19, q20, [x0, #160]
stp q21, q22, [x0, #192]
stp q23, q24, [x0, #224]
ldp q9, q10, [x1]
ldp q11, q12, [x1, #32]
ldp q13, q14, [x1, #64]
ldp q15, q16, [x1, #96]
ldp q17, q18, [x1, #128]
ldp q19, q20, [x1, #160]
ldp q21, q22, [x1, #192]
ldp q23, q24, [x1, #224]
mov v25.16b, v9.16b
trn1 v9.2d, v9.2d, v10.2d
trn2 v10.2d, v25.2d, v10.2d
mov v25.16b, v9.16b
trn1 v9.4s, v9.4s, v10.4s
trn2 v10.4s, v25.4s, v10.4s
mov v25.16b, v11.16b
trn1 v11.2d, v11.2d, v12.2d
trn2 v12.2d, v25.2d, v12.2d
mov v25.16b, v11.16b
trn1 v11.4s, v11.4s, v12.4s
trn2 v12.4s, v25.4s, v12.4s
mov v25.16b, v13.16b
trn1 v13.2d, v13.2d, v14.2d
trn2 v14.2d, v25.2d, v14.2d
mov v25.16b, v13.16b
trn1 v13.4s, v13.4s, v14.4s
trn2 v14.4s, v25.4s, v14.4s
mov v25.16b, v15.16b
trn1 v15.2d, v15.2d, v16.2d
trn2 v16.2d, v25.2d, v16.2d
mov v25.16b, v15.16b
trn1 v15.4s, v15.4s, v16.4s
trn2 v16.4s, v25.4s, v16.4s
mov v25.16b, v17.16b
trn1 v17.2d, v17.2d, v18.2d
trn2 v18.2d, v25.2d, v18.2d
mov v25.16b, v17.16b
trn1 v17.4s, v17.4s, v18.4s
trn2 v18.4s, v25.4s, v18.4s
mov v25.16b, v19.16b
trn1 v19.2d, v19.2d, v20.2d
trn2 v20.2d, v25.2d, v20.2d
mov v25.16b, v19.16b
trn1 v19.4s, v19.4s, v20.4s
trn2 v20.4s, v25.4s, v20.4s
mov v25.16b, v21.16b
trn1 v21.2d, v21.2d, v22.2d
trn2 v22.2d, v25.2d, v22.2d
mov v25.16b, v21.16b
trn1 v21.4s, v21.4s, v22.4s
trn2 v22.4s, v25.4s, v22.4s
mov v25.16b, v23.16b
trn1 v23.2d, v23.2d, v24.2d
trn2 v24.2d, v25.2d, v24.2d
mov v25.16b, v23.16b
trn1 v23.4s, v23.4s, v24.4s
trn2 v24.4s, v25.4s, v24.4s
ldr q0, [x2, #128]
ldr q1, [x2, #144]
ldr q2, [x3, #128]
ldr q3, [x3, #144]
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v10.8h, v26.8h, v0.8h
sqrdmulh v12.8h, v28.8h, v1.8h
sqrdmlsh v10.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
ldr q0, [x2, #160]
ldr q1, [x2, #176]
ldr q2, [x3, #160]
ldr q3, [x3, #176]
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v14.8h, v26.8h, v0.8h
sqrdmulh v16.8h, v28.8h, v1.8h
sqrdmlsh v14.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
ldr q0, [x2, #192]
ldr q1, [x2, #208]
ldr q2, [x3, #192]
ldr q3, [x3, #208]
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v18.8h, v26.8h, v0.8h
sqrdmulh v20.8h, v28.8h, v1.8h
sqrdmlsh v18.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
ldr q0, [x2, #224]
ldr q1, [x2, #240]
ldr q2, [x3, #224]
ldr q3, [x3, #240]
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v22.8h, v26.8h, v0.8h
sqrdmulh v24.8h, v28.8h, v1.8h
sqrdmlsh v22.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #384]
ldr q1, [x2, #400]
ldr q2, [x3, #384]
ldr q3, [x3, #400]
mov v25.16b, v9.16b
mov v26.16b, v11.16b
trn1 v9.4s, v9.4s, v10.4s
trn1 v11.4s, v11.4s, v12.4s
trn2 v10.4s, v25.4s, v10.4s
trn2 v12.4s, v26.4s, v12.4s
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v10.8h, v26.8h, v0.8h
sqrdmulh v12.8h, v28.8h, v1.8h
sqrdmlsh v10.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
ldr q0, [x2, #416]
ldr q1, [x2, #432]
ldr q2, [x3, #416]
ldr q3, [x3, #432]
mov v25.16b, v13.16b
mov v26.16b, v15.16b
trn1 v13.4s, v13.4s, v14.4s
trn1 v15.4s, v15.4s, v16.4s
trn2 v14.4s, v25.4s, v14.4s
trn2 v16.4s, v26.4s, v16.4s
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v14.8h, v26.8h, v0.8h
sqrdmulh v16.8h, v28.8h, v1.8h
sqrdmlsh v14.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
ldr q0, [x2, #448]
ldr q1, [x2, #464]
ldr q2, [x3, #448]
ldr q3, [x3, #464]
mov v25.16b, v17.16b
mov v26.16b, v19.16b
trn1 v17.4s, v17.4s, v18.4s
trn1 v19.4s, v19.4s, v20.4s
trn2 v18.4s, v25.4s, v18.4s
trn2 v20.4s, v26.4s, v20.4s
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v18.8h, v26.8h, v0.8h
sqrdmulh v20.8h, v28.8h, v1.8h
sqrdmlsh v18.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
ldr q0, [x2, #480]
ldr q1, [x2, #496]
ldr q2, [x3, #480]
ldr q3, [x3, #496]
mov v25.16b, v21.16b
mov v26.16b, v23.16b
trn1 v21.4s, v21.4s, v22.4s
trn1 v23.4s, v23.4s, v24.4s
trn2 v22.4s, v25.4s, v22.4s
trn2 v24.4s, v26.4s, v24.4s
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.8h
mul v27.8h, v28.8h, v3.8h
sqrdmulh v22.8h, v26.8h, v0.8h
sqrdmulh v24.8h, v28.8h, v1.8h
sqrdmlsh v22.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
ldr q0, [x2, #528]
ldr q2, [x3, #528]
mov v25.16b, v9.16b
mov v26.16b, v11.16b
trn1 v9.2d, v9.2d, v10.2d
trn1 v11.2d, v11.2d, v12.2d
trn2 v10.2d, v25.2d, v10.2d
trn2 v12.2d, v26.2d, v12.2d
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v2.h[0]
mul v27.8h, v28.8h, v2.h[1]
sqrdmulh v10.8h, v26.8h, v0.h[0]
sqrdmulh v12.8h, v28.8h, v0.h[1]
sqrdmlsh v10.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
mov v25.16b, v13.16b
mov v26.16b, v15.16b
trn1 v13.2d, v13.2d, v14.2d
trn1 v15.2d, v15.2d, v16.2d
trn2 v14.2d, v25.2d, v14.2d
trn2 v16.2d, v26.2d, v16.2d
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v2.h[2]
mul v27.8h, v28.8h, v2.h[3]
sqrdmulh v14.8h, v26.8h, v0.h[2]
sqrdmulh v16.8h, v28.8h, v0.h[3]
sqrdmlsh v14.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
mov v25.16b, v17.16b
mov v26.16b, v19.16b
trn1 v17.2d, v17.2d, v18.2d
trn1 v19.2d, v19.2d, v20.2d
trn2 v18.2d, v25.2d, v18.2d
trn2 v20.2d, v26.2d, v20.2d
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v2.h[4]
mul v27.8h, v28.8h, v2.h[5]
sqrdmulh v18.8h, v26.8h, v0.h[4]
sqrdmulh v20.8h, v28.8h, v0.h[5]
sqrdmlsh v18.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
mov v25.16b, v21.16b
mov v26.16b, v23.16b
trn1 v21.2d, v21.2d, v22.2d
trn1 v23.2d, v23.2d, v24.2d
trn2 v22.2d, v25.2d, v22.2d
trn2 v24.2d, v26.2d, v24.2d
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v2.h[6]
mul v27.8h, v28.8h, v2.h[7]
sqrdmulh v22.8h, v26.8h, v0.h[6]
sqrdmulh v24.8h, v28.8h, v0.h[7]
sqrdmlsh v22.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
sqdmulh v25.8h, v9.8h, v8.h[2]
sqdmulh v26.8h, v11.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v9.8h, v25.8h, v8.h[0]
mls v11.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v13.8h, v8.h[2]
sqdmulh v26.8h, v15.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v13.8h, v25.8h, v8.h[0]
mls v15.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v17.8h, v8.h[2]
sqdmulh v26.8h, v19.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v17.8h, v25.8h, v8.h[0]
mls v19.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v21.8h, v8.h[2]
sqdmulh v26.8h, v23.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v21.8h, v25.8h, v8.h[0]
mls v23.8h, v26.8h, v8.h[0]
stp q9, q10, [x1]
stp q11, q12, [x1, #32]
stp q13, q14, [x1, #64]
stp q15, q16, [x1, #96]
stp q17, q18, [x1, #128]
stp q19, q20, [x1, #160]
stp q21, q22, [x1, #192]
stp q23, q24, [x1, #224]
ldr q4, [x2, #544]
ldr q5, [x2, #560]
ldr q6, [x3, #544]
ldr q7, [x3, #560]
ldr q9, [x0]
ldr q10, [x0, #32]
ldr q11, [x0, #64]
ldr q12, [x0, #96]
ldr q13, [x0, #128]
ldr q14, [x0, #160]
ldr q15, [x0, #192]
ldr q16, [x0, #224]
ldr q17, [x1]
ldr q18, [x1, #32]
ldr q19, [x1, #64]
ldr q20, [x1, #96]
ldr q21, [x1, #128]
ldr q22, [x1, #160]
ldr q23, [x1, #192]
ldr q24, [x1, #224]
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v6.h[0]
mul v27.8h, v28.8h, v6.h[1]
sqrdmulh v10.8h, v26.8h, v4.h[0]
sqrdmulh v12.8h, v28.8h, v4.h[1]
sqrdmlsh v10.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v6.h[2]
mul v27.8h, v28.8h, v6.h[3]
sqrdmulh v14.8h, v26.8h, v4.h[2]
sqrdmulh v16.8h, v28.8h, v4.h[3]
sqrdmlsh v14.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v6.h[4]
mul v27.8h, v28.8h, v6.h[5]
sqrdmulh v18.8h, v26.8h, v4.h[4]
sqrdmulh v20.8h, v28.8h, v4.h[5]
sqrdmlsh v18.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v6.h[6]
mul v27.8h, v28.8h, v6.h[7]
sqrdmulh v22.8h, v26.8h, v4.h[6]
sqrdmulh v24.8h, v28.8h, v4.h[7]
sqrdmlsh v22.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
sub v26.8h, v9.8h, v11.8h
sub v28.8h, v10.8h, v12.8h
add v9.8h, v9.8h, v11.8h
add v10.8h, v10.8h, v12.8h
mul v25.8h, v26.8h, v7.h[0]
mul v27.8h, v28.8h, v7.h[0]
sqrdmulh v11.8h, v26.8h, v5.h[0]
sqrdmulh v12.8h, v28.8h, v5.h[0]
sqrdmlsh v11.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
sub v26.8h, v13.8h, v15.8h
sub v28.8h, v14.8h, v16.8h
add v13.8h, v13.8h, v15.8h
add v14.8h, v14.8h, v16.8h
mul v25.8h, v26.8h, v7.h[1]
mul v27.8h, v28.8h, v7.h[1]
sqrdmulh v15.8h, v26.8h, v5.h[1]
sqrdmulh v16.8h, v28.8h, v5.h[1]
sqrdmlsh v15.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v19.8h
sub v28.8h, v18.8h, v20.8h
add v17.8h, v17.8h, v19.8h
add v18.8h, v18.8h, v20.8h
mul v25.8h, v26.8h, v7.h[2]
mul v27.8h, v28.8h, v7.h[2]
sqrdmulh v19.8h, v26.8h, v5.h[2]
sqrdmulh v20.8h, v28.8h, v5.h[2]
sqrdmlsh v19.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v21.8h, v23.8h
sub v28.8h, v22.8h, v24.8h
add v21.8h, v21.8h, v23.8h
add v22.8h, v22.8h, v24.8h
mul v25.8h, v26.8h, v7.h[3]
mul v27.8h, v28.8h, v7.h[3]
sqrdmulh v23.8h, v26.8h, v5.h[3]
sqrdmulh v24.8h, v28.8h, v5.h[3]
sqrdmlsh v23.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
sub v26.8h, v9.8h, v13.8h
sub v28.8h, v10.8h, v14.8h
add v9.8h, v9.8h, v13.8h
add v10.8h, v10.8h, v14.8h
mul v25.8h, v26.8h, v7.h[4]
mul v27.8h, v28.8h, v7.h[4]
sqrdmulh v13.8h, v26.8h, v5.h[4]
sqrdmulh v14.8h, v28.8h, v5.h[4]
sqrdmlsh v13.8h, v25.8h, v8.h[0]
sqrdmlsh v14.8h, v27.8h, v8.h[0]
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
sub v26.8h, v11.8h, v15.8h
sub v28.8h, v12.8h, v16.8h
add v11.8h, v11.8h, v15.8h
add v12.8h, v12.8h, v16.8h
mul v25.8h, v26.8h, v7.h[4]
mul v27.8h, v28.8h, v7.h[4]
sqrdmulh v15.8h, v26.8h, v5.h[4]
sqrdmulh v16.8h, v28.8h, v5.h[4]
sqrdmlsh v15.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v21.8h
sub v28.8h, v18.8h, v22.8h
add v17.8h, v17.8h, v21.8h
add v18.8h, v18.8h, v22.8h
mul v25.8h, v26.8h, v7.h[5]
mul v27.8h, v28.8h, v7.h[5]
sqrdmulh v21.8h, v26.8h, v5.h[5]
sqrdmulh v22.8h, v28.8h, v5.h[5]
sqrdmlsh v21.8h, v25.8h, v8.h[0]
sqrdmlsh v22.8h, v27.8h, v8.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
sub v26.8h, v19.8h, v23.8h
sub v28.8h, v20.8h, v24.8h
add v19.8h, v19.8h, v23.8h
add v20.8h, v20.8h, v24.8h
mul v25.8h, v26.8h, v7.h[5]
mul v27.8h, v28.8h, v7.h[5]
sqrdmulh v23.8h, v26.8h, v5.h[5]
sqrdmulh v24.8h, v28.8h, v5.h[5]
sqrdmlsh v23.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
sqdmulh v25.8h, v9.8h, v8.h[2]
sqdmulh v26.8h, v10.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v9.8h, v25.8h, v8.h[0]
mls v10.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v11.8h, v8.h[2]
sqdmulh v26.8h, v12.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v11.8h, v25.8h, v8.h[0]
mls v12.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v17.8h, v8.h[2]
sqdmulh v26.8h, v18.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v17.8h, v25.8h, v8.h[0]
mls v18.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v19.8h, v8.h[2]
sqdmulh v26.8h, v20.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v19.8h, v25.8h, v8.h[0]
mls v20.8h, v26.8h, v8.h[0]
sub v26.8h, v9.8h, v17.8h
sub v28.8h, v10.8h, v18.8h
add v9.8h, v9.8h, v17.8h
add v10.8h, v10.8h, v18.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v17.8h, v26.8h, v5.h[6]
sqrdmulh v18.8h, v28.8h, v5.h[6]
sqrdmlsh v17.8h, v25.8h, v8.h[0]
sqrdmlsh v18.8h, v27.8h, v8.h[0]
sshr v17.8h, v17.8h, #1
sshr v18.8h, v18.8h, #1
sub v26.8h, v11.8h, v19.8h
sub v28.8h, v12.8h, v20.8h
add v11.8h, v11.8h, v19.8h
add v12.8h, v12.8h, v20.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v19.8h, v26.8h, v5.h[6]
sqrdmulh v20.8h, v28.8h, v5.h[6]
sqrdmlsh v19.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v13.8h, v21.8h
sub v28.8h, v14.8h, v22.8h
add v13.8h, v13.8h, v21.8h
add v14.8h, v14.8h, v22.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v21.8h, v26.8h, v5.h[6]
sqrdmulh v22.8h, v28.8h, v5.h[6]
sqrdmlsh v21.8h, v25.8h, v8.h[0]
sqrdmlsh v22.8h, v27.8h, v8.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
sub v26.8h, v15.8h, v23.8h
sub v28.8h, v16.8h, v24.8h
add v15.8h, v15.8h, v23.8h
add v16.8h, v16.8h, v24.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v23.8h, v26.8h, v5.h[6]
sqrdmulh v24.8h, v28.8h, v5.h[6]
sqrdmlsh v23.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v25.8h, v9.8h, v7.h[7]
mul v26.8h, v10.8h, v7.h[7]
sqrdmulh v9.8h, v9.8h, v5.h[7]
sqrdmulh v10.8h, v10.8h, v5.h[7]
sqrdmlsh v9.8h, v25.8h, v8.h[0]
sqrdmlsh v10.8h, v26.8h, v8.h[0]
sshr v9.8h, v9.8h, #1
sshr v10.8h, v10.8h, #1
mul v25.8h, v11.8h, v7.h[7]
mul v26.8h, v12.8h, v7.h[7]
sqrdmulh v11.8h, v11.8h, v5.h[7]
sqrdmulh v12.8h, v12.8h, v5.h[7]
sqrdmlsh v11.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v26.8h, v8.h[0]
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
mul v25.8h, v13.8h, v7.h[7]
mul v26.8h, v14.8h, v7.h[7]
sqrdmulh v13.8h, v13.8h, v5.h[7]
sqrdmulh v14.8h, v14.8h, v5.h[7]
sqrdmlsh v13.8h, v25.8h, v8.h[0]
sqrdmlsh v14.8h, v26.8h, v8.h[0]
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
mul v25.8h, v15.8h, v7.h[7]
mul v26.8h, v16.8h, v7.h[7]
sqrdmulh v15.8h, v15.8h, v5.h[7]
sqrdmulh v16.8h, v16.8h, v5.h[7]
sqrdmlsh v15.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v26.8h, v8.h[0]
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
mul v25.8h, v17.8h, v7.h[7]
mul v26.8h, v18.8h, v7.h[7]
sqrdmulh v17.8h, v17.8h, v5.h[7]
sqrdmulh v18.8h, v18.8h, v5.h[7]
sqrdmlsh v17.8h, v25.8h, v8.h[0]
sqrdmlsh v18.8h, v26.8h, v8.h[0]
sshr v17.8h, v17.8h, #1
sshr v18.8h, v18.8h, #1
mul v25.8h, v19.8h, v7.h[7]
mul v26.8h, v20.8h, v7.h[7]
sqrdmulh v19.8h, v19.8h, v5.h[7]
sqrdmulh v20.8h, v20.8h, v5.h[7]
sqrdmlsh v19.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v26.8h, v8.h[0]
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
mul v25.8h, v21.8h, v7.h[7]
mul v26.8h, v22.8h, v7.h[7]
sqrdmulh v21.8h, v21.8h, v5.h[7]
sqrdmulh v22.8h, v22.8h, v5.h[7]
sqrdmlsh v21.8h, v25.8h, v8.h[0]
sqrdmlsh v22.8h, v26.8h, v8.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v25.8h, v23.8h, v7.h[7]
mul v26.8h, v24.8h, v7.h[7]
sqrdmulh v23.8h, v23.8h, v5.h[7]
sqrdmulh v24.8h, v24.8h, v5.h[7]
sqrdmlsh v23.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v26.8h, v8.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
str q9, [x0]
str q10, [x0, #32]
str q11, [x0, #64]
str q12, [x0, #96]
str q13, [x0, #128]
str q14, [x0, #160]
str q15, [x0, #192]
str q16, [x0, #224]
str q17, [x1]
str q18, [x1, #32]
str q19, [x1, #64]
str q20, [x1, #96]
str q21, [x1, #128]
str q22, [x1, #160]
str q23, [x1, #192]
str q24, [x1, #224]
ldr q9, [x0, #16]
ldr q10, [x0, #48]
ldr q11, [x0, #80]
ldr q12, [x0, #112]
ldr q13, [x0, #144]
ldr q14, [x0, #176]
ldr q15, [x0, #208]
ldr q16, [x0, #240]
ldr q17, [x1, #16]
ldr q18, [x1, #48]
ldr q19, [x1, #80]
ldr q20, [x1, #112]
ldr q21, [x1, #144]
ldr q22, [x1, #176]
ldr q23, [x1, #208]
ldr q24, [x1, #240]
sub v26.8h, v9.8h, v10.8h
sub v28.8h, v11.8h, v12.8h
add v9.8h, v9.8h, v10.8h
add v11.8h, v11.8h, v12.8h
mul v25.8h, v26.8h, v6.h[0]
mul v27.8h, v28.8h, v6.h[1]
sqrdmulh v10.8h, v26.8h, v4.h[0]
sqrdmulh v12.8h, v28.8h, v4.h[1]
sqrdmlsh v10.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v10.8h, v10.8h, #1
sshr v12.8h, v12.8h, #1
sub v26.8h, v13.8h, v14.8h
sub v28.8h, v15.8h, v16.8h
add v13.8h, v13.8h, v14.8h
add v15.8h, v15.8h, v16.8h
mul v25.8h, v26.8h, v6.h[2]
mul v27.8h, v28.8h, v6.h[3]
sqrdmulh v14.8h, v26.8h, v4.h[2]
sqrdmulh v16.8h, v28.8h, v4.h[3]
sqrdmlsh v14.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v14.8h, v14.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v18.8h
sub v28.8h, v19.8h, v20.8h
add v17.8h, v17.8h, v18.8h
add v19.8h, v19.8h, v20.8h
mul v25.8h, v26.8h, v6.h[4]
mul v27.8h, v28.8h, v6.h[5]
sqrdmulh v18.8h, v26.8h, v4.h[4]
sqrdmulh v20.8h, v28.8h, v4.h[5]
sqrdmlsh v18.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v18.8h, v18.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v21.8h, v22.8h
sub v28.8h, v23.8h, v24.8h
add v21.8h, v21.8h, v22.8h
add v23.8h, v23.8h, v24.8h
mul v25.8h, v26.8h, v6.h[6]
mul v27.8h, v28.8h, v6.h[7]
sqrdmulh v22.8h, v26.8h, v4.h[6]
sqrdmulh v24.8h, v28.8h, v4.h[7]
sqrdmlsh v22.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v22.8h, v22.8h, #1
sshr v24.8h, v24.8h, #1
sub v26.8h, v9.8h, v11.8h
sub v28.8h, v10.8h, v12.8h
add v9.8h, v9.8h, v11.8h
add v10.8h, v10.8h, v12.8h
mul v25.8h, v26.8h, v7.h[0]
mul v27.8h, v28.8h, v7.h[0]
sqrdmulh v11.8h, v26.8h, v5.h[0]
sqrdmulh v12.8h, v28.8h, v5.h[0]
sqrdmlsh v11.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v27.8h, v8.h[0]
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
sub v26.8h, v13.8h, v15.8h
sub v28.8h, v14.8h, v16.8h
add v13.8h, v13.8h, v15.8h
add v14.8h, v14.8h, v16.8h
mul v25.8h, v26.8h, v7.h[1]
mul v27.8h, v28.8h, v7.h[1]
sqrdmulh v15.8h, v26.8h, v5.h[1]
sqrdmulh v16.8h, v28.8h, v5.h[1]
sqrdmlsh v15.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v19.8h
sub v28.8h, v18.8h, v20.8h
add v17.8h, v17.8h, v19.8h
add v18.8h, v18.8h, v20.8h
mul v25.8h, v26.8h, v7.h[2]
mul v27.8h, v28.8h, v7.h[2]
sqrdmulh v19.8h, v26.8h, v5.h[2]
sqrdmulh v20.8h, v28.8h, v5.h[2]
sqrdmlsh v19.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v21.8h, v23.8h
sub v28.8h, v22.8h, v24.8h
add v21.8h, v21.8h, v23.8h
add v22.8h, v22.8h, v24.8h
mul v25.8h, v26.8h, v7.h[3]
mul v27.8h, v28.8h, v7.h[3]
sqrdmulh v23.8h, v26.8h, v5.h[3]
sqrdmulh v24.8h, v28.8h, v5.h[3]
sqrdmlsh v23.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
sub v26.8h, v9.8h, v13.8h
sub v28.8h, v10.8h, v14.8h
add v9.8h, v9.8h, v13.8h
add v10.8h, v10.8h, v14.8h
mul v25.8h, v26.8h, v7.h[4]
mul v27.8h, v28.8h, v7.h[4]
sqrdmulh v13.8h, v26.8h, v5.h[4]
sqrdmulh v14.8h, v28.8h, v5.h[4]
sqrdmlsh v13.8h, v25.8h, v8.h[0]
sqrdmlsh v14.8h, v27.8h, v8.h[0]
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
sub v26.8h, v11.8h, v15.8h
sub v28.8h, v12.8h, v16.8h
add v11.8h, v11.8h, v15.8h
add v12.8h, v12.8h, v16.8h
mul v25.8h, v26.8h, v7.h[4]
mul v27.8h, v28.8h, v7.h[4]
sqrdmulh v15.8h, v26.8h, v5.h[4]
sqrdmulh v16.8h, v28.8h, v5.h[4]
sqrdmlsh v15.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v27.8h, v8.h[0]
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
sub v26.8h, v17.8h, v21.8h
sub v28.8h, v18.8h, v22.8h
add v17.8h, v17.8h, v21.8h
add v18.8h, v18.8h, v22.8h
mul v25.8h, v26.8h, v7.h[5]
mul v27.8h, v28.8h, v7.h[5]
sqrdmulh v21.8h, v26.8h, v5.h[5]
sqrdmulh v22.8h, v28.8h, v5.h[5]
sqrdmlsh v21.8h, v25.8h, v8.h[0]
sqrdmlsh v22.8h, v27.8h, v8.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
sub v26.8h, v19.8h, v23.8h
sub v28.8h, v20.8h, v24.8h
add v19.8h, v19.8h, v23.8h
add v20.8h, v20.8h, v24.8h
mul v25.8h, v26.8h, v7.h[5]
mul v27.8h, v28.8h, v7.h[5]
sqrdmulh v23.8h, v26.8h, v5.h[5]
sqrdmulh v24.8h, v28.8h, v5.h[5]
sqrdmlsh v23.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
sqdmulh v25.8h, v9.8h, v8.h[2]
sqdmulh v26.8h, v10.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v9.8h, v25.8h, v8.h[0]
mls v10.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v11.8h, v8.h[2]
sqdmulh v26.8h, v12.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v11.8h, v25.8h, v8.h[0]
mls v12.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v17.8h, v8.h[2]
sqdmulh v26.8h, v18.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v17.8h, v25.8h, v8.h[0]
mls v18.8h, v26.8h, v8.h[0]
sqdmulh v25.8h, v19.8h, v8.h[2]
sqdmulh v26.8h, v20.8h, v8.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v19.8h, v25.8h, v8.h[0]
mls v20.8h, v26.8h, v8.h[0]
sub v26.8h, v9.8h, v17.8h
sub v28.8h, v10.8h, v18.8h
add v9.8h, v9.8h, v17.8h
add v10.8h, v10.8h, v18.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v17.8h, v26.8h, v5.h[6]
sqrdmulh v18.8h, v28.8h, v5.h[6]
sqrdmlsh v17.8h, v25.8h, v8.h[0]
sqrdmlsh v18.8h, v27.8h, v8.h[0]
sshr v17.8h, v17.8h, #1
sshr v18.8h, v18.8h, #1
sub v26.8h, v11.8h, v19.8h
sub v28.8h, v12.8h, v20.8h
add v11.8h, v11.8h, v19.8h
add v12.8h, v12.8h, v20.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v19.8h, v26.8h, v5.h[6]
sqrdmulh v20.8h, v28.8h, v5.h[6]
sqrdmlsh v19.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v27.8h, v8.h[0]
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
sub v26.8h, v13.8h, v21.8h
sub v28.8h, v14.8h, v22.8h
add v13.8h, v13.8h, v21.8h
add v14.8h, v14.8h, v22.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v21.8h, v26.8h, v5.h[6]
sqrdmulh v22.8h, v28.8h, v5.h[6]
sqrdmlsh v21.8h, v25.8h, v8.h[0]
sqrdmlsh v22.8h, v27.8h, v8.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
sub v26.8h, v15.8h, v23.8h
sub v28.8h, v16.8h, v24.8h
add v15.8h, v15.8h, v23.8h
add v16.8h, v16.8h, v24.8h
mul v25.8h, v26.8h, v7.h[6]
mul v27.8h, v28.8h, v7.h[6]
sqrdmulh v23.8h, v26.8h, v5.h[6]
sqrdmulh v24.8h, v28.8h, v5.h[6]
sqrdmlsh v23.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v27.8h, v8.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
mul v25.8h, v9.8h, v7.h[7]
mul v26.8h, v10.8h, v7.h[7]
sqrdmulh v9.8h, v9.8h, v5.h[7]
sqrdmulh v10.8h, v10.8h, v5.h[7]
sqrdmlsh v9.8h, v25.8h, v8.h[0]
sqrdmlsh v10.8h, v26.8h, v8.h[0]
sshr v9.8h, v9.8h, #1
sshr v10.8h, v10.8h, #1
mul v25.8h, v11.8h, v7.h[7]
mul v26.8h, v12.8h, v7.h[7]
sqrdmulh v11.8h, v11.8h, v5.h[7]
sqrdmulh v12.8h, v12.8h, v5.h[7]
sqrdmlsh v11.8h, v25.8h, v8.h[0]
sqrdmlsh v12.8h, v26.8h, v8.h[0]
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
mul v25.8h, v13.8h, v7.h[7]
mul v26.8h, v14.8h, v7.h[7]
sqrdmulh v13.8h, v13.8h, v5.h[7]
sqrdmulh v14.8h, v14.8h, v5.h[7]
sqrdmlsh v13.8h, v25.8h, v8.h[0]
sqrdmlsh v14.8h, v26.8h, v8.h[0]
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
mul v25.8h, v15.8h, v7.h[7]
mul v26.8h, v16.8h, v7.h[7]
sqrdmulh v15.8h, v15.8h, v5.h[7]
sqrdmulh v16.8h, v16.8h, v5.h[7]
sqrdmlsh v15.8h, v25.8h, v8.h[0]
sqrdmlsh v16.8h, v26.8h, v8.h[0]
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
mul v25.8h, v17.8h, v7.h[7]
mul v26.8h, v18.8h, v7.h[7]
sqrdmulh v17.8h, v17.8h, v5.h[7]
sqrdmulh v18.8h, v18.8h, v5.h[7]
sqrdmlsh v17.8h, v25.8h, v8.h[0]
sqrdmlsh v18.8h, v26.8h, v8.h[0]
sshr v17.8h, v17.8h, #1
sshr v18.8h, v18.8h, #1
mul v25.8h, v19.8h, v7.h[7]
mul v26.8h, v20.8h, v7.h[7]
sqrdmulh v19.8h, v19.8h, v5.h[7]
sqrdmulh v20.8h, v20.8h, v5.h[7]
sqrdmlsh v19.8h, v25.8h, v8.h[0]
sqrdmlsh v20.8h, v26.8h, v8.h[0]
sshr v19.8h, v19.8h, #1
sshr v20.8h, v20.8h, #1
mul v25.8h, v21.8h, v7.h[7]
mul v26.8h, v22.8h, v7.h[7]
sqrdmulh v21.8h, v21.8h, v5.h[7]
sqrdmulh v22.8h, v22.8h, v5.h[7]
sqrdmlsh v21.8h, v25.8h, v8.h[0]
sqrdmlsh v22.8h, v26.8h, v8.h[0]
sshr v21.8h, v21.8h, #1
sshr v22.8h, v22.8h, #1
mul v25.8h, v23.8h, v7.h[7]
mul v26.8h, v24.8h, v7.h[7]
sqrdmulh v23.8h, v23.8h, v5.h[7]
sqrdmulh v24.8h, v24.8h, v5.h[7]
sqrdmlsh v23.8h, v25.8h, v8.h[0]
sqrdmlsh v24.8h, v26.8h, v8.h[0]
sshr v23.8h, v23.8h, #1
sshr v24.8h, v24.8h, #1
str q9, [x0, #16]
str q10, [x0, #48]
str q11, [x0, #80]
str q12, [x0, #112]
str q13, [x0, #144]
str q14, [x0, #176]
str q15, [x0, #208]
str q16, [x0, #240]
str q17, [x1, #16]
str q18, [x1, #48]
str q19, [x1, #80]
str q20, [x1, #112]
str q21, [x1, #144]
str q22, [x1, #176]
str q23, [x1, #208]
str q24, [x1, #240]
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_invntt_sqrdmlsh,.-mlkem_invntt_sqrdmlsh
#endif /* __APPLE__ */
#endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */
/* L_mlkem_aarch64_zetas_mul: multiplier table read by mlkem_basemul_mont.
 *
 * Layout: 128 16-bit entries (16 rows of 8 values, 256 bytes in total, which
 * is the value given to the .size directive below). Entries are stored as
 * adjacent pairs (z, -z): the second value of every pair is the 16-bit
 * two's complement negation of the first, e.g. 0x08b2 followed by 0xf74e.
 *
 * Use: mlkem_basemul_mont loads the table 16 bytes (8 entries) at a time
 * into v0. Each lane is the factor applied to the reduced product of the
 * odd-indexed coefficients before that term is accumulated onto the product
 * of the even-indexed coefficients.
 *
 * NOTE(review): the values look like the ML-KEM base-multiplication zetas in
 * Montgomery form - confirm against the generator script named in the file
 * header before editing any entry by hand.
 */
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_zetas_mul, %object
/* ELF targets: the table lives in read-only data. */
.section .rodata
.size L_mlkem_aarch64_zetas_mul, 256
#else
/* Apple targets: the table lives in the __DATA,__data section. */
.section __DATA,__data
#endif /* __APPLE__ */
/* Request 2^2 = 4-byte alignment with the directive each assembler expects.
 * NOTE(review): the consumer reads this table with 16-byte "ldr q" loads
 * while only 4-byte alignment is requested here; presumably this relies on
 * unaligned vector loads being permitted - confirm for the target platforms.
 */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_aarch64_zetas_mul:
/* Bytes 0-127: read by mlkem_basemul_mont at offsets 0, 16, ..., 112. */
.short 0x08b2,0xf74e,0x01ae,0xfe52,0x022b,0xfdd5,0x034b,0xfcb5
.short 0x081e,0xf7e2,0x0367,0xfc99,0x060e,0xf9f2,0x0069,0xff97
.short 0x01a6,0xfe5a,0x024b,0xfdb5,0x00b1,0xff4f,0x0c16,0xf3ea
.short 0x0bde,0xf422,0x0b35,0xf4cb,0x0626,0xf9da,0x0675,0xf98b
.short 0x0c0b,0xf3f5,0x030a,0xfcf6,0x0487,0xfb79,0x0c6e,0xf392
.short 0x09f8,0xf608,0x05cb,0xfa35,0x0aa7,0xf559,0x045f,0xfba1
.short 0x06cb,0xf935,0x0284,0xfd7c,0x0999,0xf667,0x015d,0xfea3
.short 0x01a2,0xfe5e,0x0149,0xfeb7,0x0c65,0xf39b,0x0cb6,0xf34a
/* Bytes 128-255: second half of the table (offsets 128, 144, ..., 240). */
.short 0x0331,0xfccf,0x0449,0xfbb7,0x025b,0xfda5,0x0262,0xfd9e
.short 0x052a,0xfad6,0x07fc,0xf804,0x0748,0xf8b8,0x0180,0xfe80
.short 0x0842,0xf7be,0x0c79,0xf387,0x04c2,0xfb3e,0x07ca,0xf836
.short 0x0997,0xf669,0x00dc,0xff24,0x085e,0xf7a2,0x0686,0xf97a
.short 0x0860,0xf7a0,0x0707,0xf8f9,0x0803,0xf7fd,0x031a,0xfce6
.short 0x071b,0xf8e5,0x09ab,0xf655,0x099b,0xf665,0x01de,0xfe22
.short 0x0c95,0xf36b,0x0bcd,0xf433,0x03e4,0xfc1c,0x03df,0xfc21
.short 0x03be,0xfc42,0x074d,0xf8b3,0x05f2,0xfa0e,0x065c,0xf9a4
#ifndef __APPLE__
.text
.globl mlkem_basemul_mont
.type mlkem_basemul_mont,@function
.align 2
mlkem_basemul_mont:
#else
.section __TEXT,__text
.globl _mlkem_basemul_mont
.p2align 2
_mlkem_basemul_mont:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
#ifndef __APPLE__
adrp x3, L_mlkem_aarch64_zetas_mul
add x3, x3, :lo12:L_mlkem_aarch64_zetas_mul
#else
adrp x3, L_mlkem_aarch64_zetas_mul@PAGE
add x3, x3, L_mlkem_aarch64_zetas_mul@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x4, L_mlkem_aarch64_consts
add x4, x4, :lo12:L_mlkem_aarch64_consts
#else
adrp x4, L_mlkem_aarch64_consts@PAGE
add x4, x4, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
ldr q1, [x4]
ldp q2, q3, [x1]
ldp q4, q5, [x1, #32]
ldp q6, q7, [x1, #64]
ldp q8, q9, [x1, #96]
ldp q10, q11, [x2]
ldp q12, q13, [x2, #32]
ldp q14, q15, [x2, #64]
ldp q16, q17, [x2, #96]
ldr q0, [x3]
uzp1 v18.8h, v2.8h, v3.8h
uzp2 v19.8h, v2.8h, v3.8h
uzp1 v20.8h, v10.8h, v11.8h
uzp2 v21.8h, v10.8h, v11.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0]
ldr q0, [x3, #16]
uzp1 v18.8h, v4.8h, v5.8h
uzp2 v19.8h, v4.8h, v5.8h
uzp1 v20.8h, v12.8h, v13.8h
uzp2 v21.8h, v12.8h, v13.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #32]
ldr q0, [x3, #32]
uzp1 v18.8h, v6.8h, v7.8h
uzp2 v19.8h, v6.8h, v7.8h
uzp1 v20.8h, v14.8h, v15.8h
uzp2 v21.8h, v14.8h, v15.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #64]
ldr q0, [x3, #48]
uzp1 v18.8h, v8.8h, v9.8h
uzp2 v19.8h, v8.8h, v9.8h
uzp1 v20.8h, v16.8h, v17.8h
uzp2 v21.8h, v16.8h, v17.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #96]
ldp q2, q3, [x1, #128]
ldp q4, q5, [x1, #160]
ldp q6, q7, [x1, #192]
ldp q8, q9, [x1, #224]
ldp q10, q11, [x2, #128]
ldp q12, q13, [x2, #160]
ldp q14, q15, [x2, #192]
ldp q16, q17, [x2, #224]
ldr q0, [x3, #64]
uzp1 v18.8h, v2.8h, v3.8h
uzp2 v19.8h, v2.8h, v3.8h
uzp1 v20.8h, v10.8h, v11.8h
uzp2 v21.8h, v10.8h, v11.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #128]
ldr q0, [x3, #80]
uzp1 v18.8h, v4.8h, v5.8h
uzp2 v19.8h, v4.8h, v5.8h
uzp1 v20.8h, v12.8h, v13.8h
uzp2 v21.8h, v12.8h, v13.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #160]
ldr q0, [x3, #96]
uzp1 v18.8h, v6.8h, v7.8h
uzp2 v19.8h, v6.8h, v7.8h
uzp1 v20.8h, v14.8h, v15.8h
uzp2 v21.8h, v14.8h, v15.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #192]
ldr q0, [x3, #112]
uzp1 v18.8h, v8.8h, v9.8h
uzp2 v19.8h, v8.8h, v9.8h
uzp1 v20.8h, v16.8h, v17.8h
uzp2 v21.8h, v16.8h, v17.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #224]
ldp q2, q3, [x1, #256]
ldp q4, q5, [x1, #288]
ldp q6, q7, [x1, #320]
ldp q8, q9, [x1, #352]
ldp q10, q11, [x2, #256]
ldp q12, q13, [x2, #288]
ldp q14, q15, [x2, #320]
ldp q16, q17, [x2, #352]
ldr q0, [x3, #128]
uzp1 v18.8h, v2.8h, v3.8h
uzp2 v19.8h, v2.8h, v3.8h
uzp1 v20.8h, v10.8h, v11.8h
uzp2 v21.8h, v10.8h, v11.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #256]
ldr q0, [x3, #144]
uzp1 v18.8h, v4.8h, v5.8h
uzp2 v19.8h, v4.8h, v5.8h
uzp1 v20.8h, v12.8h, v13.8h
uzp2 v21.8h, v12.8h, v13.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #288]
ldr q0, [x3, #160]
uzp1 v18.8h, v6.8h, v7.8h
uzp2 v19.8h, v6.8h, v7.8h
uzp1 v20.8h, v14.8h, v15.8h
uzp2 v21.8h, v14.8h, v15.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #320]
ldr q0, [x3, #176]
uzp1 v18.8h, v8.8h, v9.8h
uzp2 v19.8h, v8.8h, v9.8h
uzp1 v20.8h, v16.8h, v17.8h
uzp2 v21.8h, v16.8h, v17.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #352]
ldp q2, q3, [x1, #384]
ldp q4, q5, [x1, #416]
ldp q6, q7, [x1, #448]
ldp q8, q9, [x1, #480]
ldp q10, q11, [x2, #384]
ldp q12, q13, [x2, #416]
ldp q14, q15, [x2, #448]
ldp q16, q17, [x2, #480]
ldr q0, [x3, #192]
uzp1 v18.8h, v2.8h, v3.8h
uzp2 v19.8h, v2.8h, v3.8h
uzp1 v20.8h, v10.8h, v11.8h
uzp2 v21.8h, v10.8h, v11.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #384]
ldr q0, [x3, #208]
uzp1 v18.8h, v4.8h, v5.8h
uzp2 v19.8h, v4.8h, v5.8h
uzp1 v20.8h, v12.8h, v13.8h
uzp2 v21.8h, v12.8h, v13.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #416]
ldr q0, [x3, #224]
uzp1 v18.8h, v6.8h, v7.8h
uzp2 v19.8h, v6.8h, v7.8h
uzp1 v20.8h, v14.8h, v15.8h
uzp2 v21.8h, v14.8h, v15.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #448]
ldr q0, [x3, #240]
uzp1 v18.8h, v8.8h, v9.8h
uzp2 v19.8h, v8.8h, v9.8h
uzp1 v20.8h, v16.8h, v17.8h
uzp2 v21.8h, v16.8h, v17.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
stp q24, q25, [x0, #480]
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_basemul_mont,.-mlkem_basemul_mont
#endif /* __APPLE__ */
/* mlkem_basemul_mont_add
 *
 * Pair-wise multiply two arrays of 256 signed 16-bit values and add the
 * Montgomery-reduced products into a third array.
 *
 * In/out:
 *   x0  accumulator, 512 bytes: each 32-byte slice is loaded, added to and
 *       stored back
 *   x1  first operand, 512 bytes, read only
 *   x2  second operand, 512 bytes, read only
 * NOTE(review): the C prototype is not visible in this part of the file;
 * presumably (int16_t* r, const int16_t* a, const int16_t* b) - confirm
 * against the declaring header.
 *
 * For every adjacent pair (a0, a1) read from x1, (b0, b1) read from x2 and
 * one 16-bit multiplier z read from L_mlkem_aarch64_zetas_mul:
 *   r0 += mont(a0 * b0 + mont(a1 * b1) * z)
 *   r1 += mont(a0 * b1 + a1 * b0)
 * with mont(t) = (t - ((t * c1) mod 2^16) * c0) >> 16, where c0 = 0x0d01 and
 * c1 = 0xf301 are halfwords 0 and 1 of L_mlkem_aarch64_consts.
 * The final additions are plain 16-bit lane adds; no reduction follows them.
 *
 * Layout of the unrolled body: 4 batches, each loading 128 bytes from x1 and
 * from x2, and each batch split into 4 groups of 16 halfwords (8 pairs).
 * Group k (0..15) uses bytes 32*k of x0/x1/x2 and bytes 16*k of the
 * multiplier table.
 *
 * Register roles:
 *   x3        address of L_mlkem_aarch64_zetas_mul
 *   x4        address of L_mlkem_aarch64_consts
 *   v0        8 multipliers for the current group
 *   v1        constants (h[0] = c0, h[1] = c1)
 *   v2-v9     current batch of the x1 operand
 *   v10-v17   current batch of the x2 operand
 *   v18/v19   even/odd lanes of the x1 operand for the current group
 *   v20/v21   even/odd lanes of the x2 operand for the current group
 *   v22-v27   products and reduction temporaries
 *   v28/v29   accumulator slice from x0
 * d8-d15 are saved and restored because v8-v15 are overwritten. The function
 * makes no calls. Clobbers x3, x4, v0-v7 and v16-v29.
 */
#ifndef __APPLE__
.text
.globl mlkem_basemul_mont_add
.type mlkem_basemul_mont_add,@function
.align 2
mlkem_basemul_mont_add:
#else
.section __TEXT,__text
.globl _mlkem_basemul_mont_add
.p2align 2
_mlkem_basemul_mont_add:
#endif /* __APPLE__ */
/* 80-byte frame: x29/x30 at offset 0, d8-d15 at offsets 16..79. */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
/* x3 = multiplier table (defined outside this part of the file). */
#ifndef __APPLE__
adrp x3, L_mlkem_aarch64_zetas_mul
add x3, x3, :lo12:L_mlkem_aarch64_zetas_mul
#else
adrp x3, L_mlkem_aarch64_zetas_mul@PAGE
add x3, x3, L_mlkem_aarch64_zetas_mul@PAGEOFF
#endif /* __APPLE__ */
/* x4 = reduction constants; v1 keeps them for the whole function. */
#ifndef __APPLE__
adrp x4, L_mlkem_aarch64_consts
add x4, x4, :lo12:L_mlkem_aarch64_consts
#else
adrp x4, L_mlkem_aarch64_consts@PAGE
add x4, x4, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
ldr q1, [x4]
/* Batch 0: halfwords 0..63 of both operands. */
ldp q2, q3, [x1]
ldp q4, q5, [x1, #32]
ldp q6, q7, [x1, #64]
ldp q8, q9, [x1, #96]
ldp q10, q11, [x2]
ldp q12, q13, [x2, #32]
ldp q14, q15, [x2, #64]
ldp q16, q17, [x2, #96]
/* Group 0: halfwords 0..15. Accumulator slice and its 8 multipliers. */
ldp q28, q29, [x0]
ldr q0, [x3]
/* De-interleave: v18 = a0 lanes, v19 = a1 lanes, v20 = b0, v21 = b1. */
uzp1 v18.8h, v2.8h, v3.8h
uzp2 v19.8h, v2.8h, v3.8h
uzp1 v20.8h, v10.8h, v11.8h
uzp2 v21.8h, v10.8h, v11.8h
/* v26:v27 = a0 * b0 and v23:v24 = a1 * b1 as 32-bit products. */
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
/* v22 = mont(a1 * b1): low half * c1, subtract that * c0, keep high half. */
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
/* v26:v27 += mont(a1 * b1) * z, then v22 = mont of that sum (r0 term). */
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
/* v26:v27 = a0 * b1 + a1 * b0, then v23 = mont of that sum (r1 term). */
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
/* Re-interleave the r0/r1 terms, add to the accumulator slice, store. */
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0]
/* Group 1: halfwords 16..31, same sequence on v4:v5 and v12:v13. */
ldp q28, q29, [x0, #32]
ldr q0, [x3, #16]
uzp1 v18.8h, v4.8h, v5.8h
uzp2 v19.8h, v4.8h, v5.8h
uzp1 v20.8h, v12.8h, v13.8h
uzp2 v21.8h, v12.8h, v13.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #32]
/* Group 2: halfwords 32..47, same sequence on v6:v7 and v14:v15. */
ldp q28, q29, [x0, #64]
ldr q0, [x3, #32]
uzp1 v18.8h, v6.8h, v7.8h
uzp2 v19.8h, v6.8h, v7.8h
uzp1 v20.8h, v14.8h, v15.8h
uzp2 v21.8h, v14.8h, v15.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #64]
/* Group 3: halfwords 48..63, same sequence on v8:v9 and v16:v17. */
ldp q28, q29, [x0, #96]
ldr q0, [x3, #48]
uzp1 v18.8h, v8.8h, v9.8h
uzp2 v19.8h, v8.8h, v9.8h
uzp1 v20.8h, v16.8h, v17.8h
uzp2 v21.8h, v16.8h, v17.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #96]
/* Batch 1: halfwords 64..127 of both operands. */
ldp q2, q3, [x1, #128]
ldp q4, q5, [x1, #160]
ldp q6, q7, [x1, #192]
ldp q8, q9, [x1, #224]
ldp q10, q11, [x2, #128]
ldp q12, q13, [x2, #160]
ldp q14, q15, [x2, #192]
ldp q16, q17, [x2, #224]
/* Group 4: halfwords 64..79, same sequence on v2:v3 and v10:v11. */
ldp q28, q29, [x0, #128]
ldr q0, [x3, #64]
uzp1 v18.8h, v2.8h, v3.8h
uzp2 v19.8h, v2.8h, v3.8h
uzp1 v20.8h, v10.8h, v11.8h
uzp2 v21.8h, v10.8h, v11.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #128]
/* Group 5: halfwords 80..95, same sequence on v4:v5 and v12:v13. */
ldp q28, q29, [x0, #160]
ldr q0, [x3, #80]
uzp1 v18.8h, v4.8h, v5.8h
uzp2 v19.8h, v4.8h, v5.8h
uzp1 v20.8h, v12.8h, v13.8h
uzp2 v21.8h, v12.8h, v13.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #160]
/* Group 6: halfwords 96..111, same sequence on v6:v7 and v14:v15. */
ldp q28, q29, [x0, #192]
ldr q0, [x3, #96]
uzp1 v18.8h, v6.8h, v7.8h
uzp2 v19.8h, v6.8h, v7.8h
uzp1 v20.8h, v14.8h, v15.8h
uzp2 v21.8h, v14.8h, v15.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #192]
/* Group 7: halfwords 112..127, same sequence on v8:v9 and v16:v17. */
ldp q28, q29, [x0, #224]
ldr q0, [x3, #112]
uzp1 v18.8h, v8.8h, v9.8h
uzp2 v19.8h, v8.8h, v9.8h
uzp1 v20.8h, v16.8h, v17.8h
uzp2 v21.8h, v16.8h, v17.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #224]
/* Batch 2: halfwords 128..191 of both operands. */
ldp q2, q3, [x1, #256]
ldp q4, q5, [x1, #288]
ldp q6, q7, [x1, #320]
ldp q8, q9, [x1, #352]
ldp q10, q11, [x2, #256]
ldp q12, q13, [x2, #288]
ldp q14, q15, [x2, #320]
ldp q16, q17, [x2, #352]
/* Group 8: halfwords 128..143, same sequence on v2:v3 and v10:v11. */
ldp q28, q29, [x0, #256]
ldr q0, [x3, #128]
uzp1 v18.8h, v2.8h, v3.8h
uzp2 v19.8h, v2.8h, v3.8h
uzp1 v20.8h, v10.8h, v11.8h
uzp2 v21.8h, v10.8h, v11.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #256]
/* Group 9: halfwords 144..159, same sequence on v4:v5 and v12:v13. */
ldp q28, q29, [x0, #288]
ldr q0, [x3, #144]
uzp1 v18.8h, v4.8h, v5.8h
uzp2 v19.8h, v4.8h, v5.8h
uzp1 v20.8h, v12.8h, v13.8h
uzp2 v21.8h, v12.8h, v13.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #288]
/* Group 10: halfwords 160..175, same sequence on v6:v7 and v14:v15. */
ldp q28, q29, [x0, #320]
ldr q0, [x3, #160]
uzp1 v18.8h, v6.8h, v7.8h
uzp2 v19.8h, v6.8h, v7.8h
uzp1 v20.8h, v14.8h, v15.8h
uzp2 v21.8h, v14.8h, v15.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #320]
/* Group 11: halfwords 176..191, same sequence on v8:v9 and v16:v17. */
ldp q28, q29, [x0, #352]
ldr q0, [x3, #176]
uzp1 v18.8h, v8.8h, v9.8h
uzp2 v19.8h, v8.8h, v9.8h
uzp1 v20.8h, v16.8h, v17.8h
uzp2 v21.8h, v16.8h, v17.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #352]
/* Batch 3: halfwords 192..255 of both operands. */
ldp q2, q3, [x1, #384]
ldp q4, q5, [x1, #416]
ldp q6, q7, [x1, #448]
ldp q8, q9, [x1, #480]
ldp q10, q11, [x2, #384]
ldp q12, q13, [x2, #416]
ldp q14, q15, [x2, #448]
ldp q16, q17, [x2, #480]
/* Group 12: halfwords 192..207, same sequence on v2:v3 and v10:v11. */
ldp q28, q29, [x0, #384]
ldr q0, [x3, #192]
uzp1 v18.8h, v2.8h, v3.8h
uzp2 v19.8h, v2.8h, v3.8h
uzp1 v20.8h, v10.8h, v11.8h
uzp2 v21.8h, v10.8h, v11.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #384]
/* Group 13: halfwords 208..223, same sequence on v4:v5 and v12:v13. */
ldp q28, q29, [x0, #416]
ldr q0, [x3, #208]
uzp1 v18.8h, v4.8h, v5.8h
uzp2 v19.8h, v4.8h, v5.8h
uzp1 v20.8h, v12.8h, v13.8h
uzp2 v21.8h, v12.8h, v13.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #416]
/* Group 14: halfwords 224..239, same sequence on v6:v7 and v14:v15. */
ldp q28, q29, [x0, #448]
ldr q0, [x3, #224]
uzp1 v18.8h, v6.8h, v7.8h
uzp2 v19.8h, v6.8h, v7.8h
uzp1 v20.8h, v14.8h, v15.8h
uzp2 v21.8h, v14.8h, v15.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #448]
/* Group 15: halfwords 240..255, same sequence on v8:v9 and v16:v17. */
ldp q28, q29, [x0, #480]
ldr q0, [x3, #240]
uzp1 v18.8h, v8.8h, v9.8h
uzp2 v19.8h, v8.8h, v9.8h
uzp1 v20.8h, v16.8h, v17.8h
uzp2 v21.8h, v16.8h, v17.8h
smull v26.4s, v18.4h, v20.4h
smull2 v27.4s, v18.8h, v20.8h
smull v23.4s, v19.4h, v21.4h
smull2 v24.4s, v19.8h, v21.8h
xtn v25.4h, v23.4s
xtn2 v25.8h, v24.4s
mul v25.8h, v25.8h, v1.h[1]
smlsl v23.4s, v25.4h, v1.h[0]
smlsl2 v24.4s, v25.8h, v1.h[0]
shrn v22.4h, v23.4s, #16
shrn2 v22.8h, v24.4s, #16
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v22.8h, v0.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v22.4h, v26.4s, #16
shrn2 v22.8h, v27.4s, #16
smull v26.4s, v18.4h, v21.4h
smull2 v27.4s, v18.8h, v21.8h
smlal v26.4s, v19.4h, v20.4h
smlal2 v27.4s, v19.8h, v20.8h
xtn v24.4h, v26.4s
xtn2 v24.8h, v27.4s
mul v24.8h, v24.8h, v1.h[1]
smlsl v26.4s, v24.4h, v1.h[0]
smlsl2 v27.4s, v24.8h, v1.h[0]
shrn v23.4h, v26.4s, #16
shrn2 v23.8h, v27.4s, #16
zip1 v24.8h, v22.8h, v23.8h
zip2 v25.8h, v22.8h, v23.8h
add v28.8h, v28.8h, v24.8h
add v29.8h, v29.8h, v25.8h
stp q28, q29, [x0, #480]
/* Restore d8-d15, pop the 80-byte (0x50) frame and return. */
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_basemul_mont_add,.-mlkem_basemul_mont_add
#endif /* __APPLE__ */
/* L_mlkem_aarch64_q: one 16-byte vector of eight 16-bit lanes, each holding
 * 0x0d01 (3329 decimal). mlkem_csubq_neon loads the whole vector into v20.
 * Placed in .rodata on non-Apple targets and in __DATA,__data on Apple.
 */
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_q, %object
.section .rodata
.size L_mlkem_aarch64_q, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_aarch64_q:
.short 0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
/* mlkem_csubq_neon: conditionally subtract q from every 16-bit lane.
 *
 * For each signed 16-bit value v in the buffer: v = v - q, and when that
 * result is negative q is added back. q is the 0x0d01 held in every lane of
 * L_mlkem_aarch64_q.
 *
 * In:    x0 = buffer of 256 16-bit values (0x200 bytes), updated in place
 * Clobb: x0 (left 0x200 bytes past its entry value), x1, v0-v20
 * Saved: x29, x30 and d8-d15 (the callee-saved halves of v8-v15) in an
 *        80-byte stack frame
 *
 * The buffer is handled in two passes of 0x100 bytes (128 values) each.
 */
#ifndef __APPLE__
.text
.globl mlkem_csubq_neon
.type mlkem_csubq_neon,@function
.align 2
mlkem_csubq_neon:
#else
.section __TEXT,__text
.globl _mlkem_csubq_neon
.p2align 2
_mlkem_csubq_neon:
#endif /* __APPLE__ */
/* Prologue: build the frame and save d8-d15, which are overwritten below. */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
/* x1 = address of the q vector; v20 = q in all eight lanes. */
#ifndef __APPLE__
adrp x1, L_mlkem_aarch64_q
add x1, x1, :lo12:L_mlkem_aarch64_q
#else
adrp x1, L_mlkem_aarch64_q@PAGE
add x1, x1, L_mlkem_aarch64_q@PAGEOFF
#endif /* __APPLE__ */
ldr q20, [x1]
/* Pass 1: load 0x100 bytes into v0-v15. ld4 de-interleaves by four; the
 * st4 stores below use the same register groups and re-interleave, so the
 * order of values in memory is unchanged by this lane-wise operation. */
ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40
ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
ld4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40
ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40
/* Rewind x0 to the start of this pass so the stores overwrite the input. */
sub x0, x0, #0x100
/* v = v - q for all sixteen registers. */
sub v0.8h, v0.8h, v20.8h
sub v1.8h, v1.8h, v20.8h
sub v2.8h, v2.8h, v20.8h
sub v3.8h, v3.8h, v20.8h
sub v4.8h, v4.8h, v20.8h
sub v5.8h, v5.8h, v20.8h
sub v6.8h, v6.8h, v20.8h
sub v7.8h, v7.8h, v20.8h
sub v8.8h, v8.8h, v20.8h
sub v9.8h, v9.8h, v20.8h
sub v10.8h, v10.8h, v20.8h
sub v11.8h, v11.8h, v20.8h
sub v12.8h, v12.8h, v20.8h
sub v13.8h, v13.8h, v20.8h
sub v14.8h, v14.8h, v20.8h
sub v15.8h, v15.8h, v20.8h
/* Branch-free add-back, four registers at a time:
 *   mask = v >> 15 (arithmetic): all ones when v is negative, else zero
 *   v   += mask & q */
sshr v16.8h, v0.8h, #15
sshr v17.8h, v1.8h, #15
sshr v18.8h, v2.8h, #15
sshr v19.8h, v3.8h, #15
and v16.16b, v16.16b, v20.16b
and v17.16b, v17.16b, v20.16b
and v18.16b, v18.16b, v20.16b
and v19.16b, v19.16b, v20.16b
add v0.8h, v0.8h, v16.8h
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v3.8h, v3.8h, v19.8h
sshr v16.8h, v4.8h, #15
sshr v17.8h, v5.8h, #15
sshr v18.8h, v6.8h, #15
sshr v19.8h, v7.8h, #15
and v16.16b, v16.16b, v20.16b
and v17.16b, v17.16b, v20.16b
and v18.16b, v18.16b, v20.16b
and v19.16b, v19.16b, v20.16b
add v4.8h, v4.8h, v16.8h
add v5.8h, v5.8h, v17.8h
add v6.8h, v6.8h, v18.8h
add v7.8h, v7.8h, v19.8h
sshr v16.8h, v8.8h, #15
sshr v17.8h, v9.8h, #15
sshr v18.8h, v10.8h, #15
sshr v19.8h, v11.8h, #15
and v16.16b, v16.16b, v20.16b
and v17.16b, v17.16b, v20.16b
and v18.16b, v18.16b, v20.16b
and v19.16b, v19.16b, v20.16b
add v8.8h, v8.8h, v16.8h
add v9.8h, v9.8h, v17.8h
add v10.8h, v10.8h, v18.8h
add v11.8h, v11.8h, v19.8h
sshr v16.8h, v12.8h, #15
sshr v17.8h, v13.8h, #15
sshr v18.8h, v14.8h, #15
sshr v19.8h, v15.8h, #15
and v16.16b, v16.16b, v20.16b
and v17.16b, v17.16b, v20.16b
and v18.16b, v18.16b, v20.16b
and v19.16b, v19.16b, v20.16b
add v12.8h, v12.8h, v16.8h
add v13.8h, v13.8h, v17.8h
add v14.8h, v14.8h, v18.8h
add v15.8h, v15.8h, v19.8h
/* Write pass 1 back; x0 ends at the start of the second 0x100 bytes. */
st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40
st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40
st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40
/* Pass 2: same sequence on the second 0x100 bytes. */
ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40
ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
ld4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40
ld4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40
sub x0, x0, #0x100
/* v = v - q */
sub v0.8h, v0.8h, v20.8h
sub v1.8h, v1.8h, v20.8h
sub v2.8h, v2.8h, v20.8h
sub v3.8h, v3.8h, v20.8h
sub v4.8h, v4.8h, v20.8h
sub v5.8h, v5.8h, v20.8h
sub v6.8h, v6.8h, v20.8h
sub v7.8h, v7.8h, v20.8h
sub v8.8h, v8.8h, v20.8h
sub v9.8h, v9.8h, v20.8h
sub v10.8h, v10.8h, v20.8h
sub v11.8h, v11.8h, v20.8h
sub v12.8h, v12.8h, v20.8h
sub v13.8h, v13.8h, v20.8h
sub v14.8h, v14.8h, v20.8h
sub v15.8h, v15.8h, v20.8h
/* v += (v >> 15) & q */
sshr v16.8h, v0.8h, #15
sshr v17.8h, v1.8h, #15
sshr v18.8h, v2.8h, #15
sshr v19.8h, v3.8h, #15
and v16.16b, v16.16b, v20.16b
and v17.16b, v17.16b, v20.16b
and v18.16b, v18.16b, v20.16b
and v19.16b, v19.16b, v20.16b
add v0.8h, v0.8h, v16.8h
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v3.8h, v3.8h, v19.8h
sshr v16.8h, v4.8h, #15
sshr v17.8h, v5.8h, #15
sshr v18.8h, v6.8h, #15
sshr v19.8h, v7.8h, #15
and v16.16b, v16.16b, v20.16b
and v17.16b, v17.16b, v20.16b
and v18.16b, v18.16b, v20.16b
and v19.16b, v19.16b, v20.16b
add v4.8h, v4.8h, v16.8h
add v5.8h, v5.8h, v17.8h
add v6.8h, v6.8h, v18.8h
add v7.8h, v7.8h, v19.8h
sshr v16.8h, v8.8h, #15
sshr v17.8h, v9.8h, #15
sshr v18.8h, v10.8h, #15
sshr v19.8h, v11.8h, #15
and v16.16b, v16.16b, v20.16b
and v17.16b, v17.16b, v20.16b
and v18.16b, v18.16b, v20.16b
and v19.16b, v19.16b, v20.16b
add v8.8h, v8.8h, v16.8h
add v9.8h, v9.8h, v17.8h
add v10.8h, v10.8h, v18.8h
add v11.8h, v11.8h, v19.8h
sshr v16.8h, v12.8h, #15
sshr v17.8h, v13.8h, #15
sshr v18.8h, v14.8h, #15
sshr v19.8h, v15.8h, #15
and v16.16b, v16.16b, v20.16b
and v17.16b, v17.16b, v20.16b
and v18.16b, v18.16b, v20.16b
and v19.16b, v19.16b, v20.16b
add v12.8h, v12.8h, v16.8h
add v13.8h, v13.8h, v17.8h
add v14.8h, v14.8h, v18.8h
add v15.8h, v15.8h, v19.8h
/* Write pass 2 back. */
st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40
st4 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40
st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40
/* Epilogue: restore d8-d15, then x29/x30, and release the 80-byte frame. */
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_csubq_neon,.-mlkem_csubq_neon
#endif /* __APPLE__ */
/* mlkem_add_reduce: r[i] = reduce(r[i] + a[i]) over 256 16-bit lanes.
 *
 * In:    x0 = r, 0x200-byte buffer, read and overwritten with the result
 *        x1 = a, 0x200-byte buffer, read only
 * Clobb: x0 and x1 (both left 0x200 bytes past their entry values), x2,
 *        v0-v18
 * Saved: x29, x30 and d8-d15 in an 80-byte stack frame
 *
 * v0 holds L_mlkem_aarch64_consts: h[0] = 0x0d01 (q), h[2] = 0x4ebf.
 * Barrett-style reduction applied to each lane x:
 *   t = sqdmulh(x, 0x4ebf)     high half of the doubled product
 *   t = t >> 11                arithmetic shift
 *   x = x - t * q              mls
 *
 * The buffers are handled in four passes of 0x80 bytes (64 values) each.
 */
#ifndef __APPLE__
.text
.globl mlkem_add_reduce
.type mlkem_add_reduce,@function
.align 2
mlkem_add_reduce:
#else
.section __TEXT,__text
.globl _mlkem_add_reduce
.p2align 2
_mlkem_add_reduce:
#endif /* __APPLE__ */
/* Prologue: build the frame and save d8-d15, which are overwritten below. */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
/* x2 = address of the constants table; v0 = its eight 16-bit entries. */
#ifndef __APPLE__
adrp x2, L_mlkem_aarch64_consts
add x2, x2, :lo12:L_mlkem_aarch64_consts
#else
adrp x2, L_mlkem_aarch64_consts@PAGE
add x2, x2, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
ldr q0, [x2]
/* Pass 1 of 4: r into v1-v8, a into v9-v16. Both sides use ld4, so lanes
 * of r and a line up, and the st4 stores restore the memory order. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
/* Rewind x0 so the result overwrites the r values just read. */
sub x0, x0, #0x80
/* r += a (16-bit lane addition) */
add v1.8h, v1.8h, v9.8h
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.8h, v4.8h, v12.8h
add v5.8h, v5.8h, v13.8h
add v6.8h, v6.8h, v14.8h
add v7.8h, v7.8h, v15.8h
add v8.8h, v8.8h, v16.8h
/* Reduce two registers at a time, using v17/v18 as temporaries. */
sqdmulh v17.8h, v1.8h, v0.h[2]
sqdmulh v18.8h, v2.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v1.8h, v17.8h, v0.h[0]
mls v2.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v3.8h, v0.h[2]
sqdmulh v18.8h, v4.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v3.8h, v17.8h, v0.h[0]
mls v4.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v5.8h, v0.h[2]
sqdmulh v18.8h, v6.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v5.8h, v17.8h, v0.h[0]
mls v6.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v7.8h, v0.h[2]
sqdmulh v18.8h, v8.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v7.8h, v17.8h, v0.h[0]
mls v8.8h, v18.8h, v0.h[0]
/* Store pass 1; x0 now points at the next 0x80 bytes of r. */
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Pass 2 of 4: same sequence on the next 0x80 bytes. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
sub x0, x0, #0x80
/* r += a */
add v1.8h, v1.8h, v9.8h
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.8h, v4.8h, v12.8h
add v5.8h, v5.8h, v13.8h
add v6.8h, v6.8h, v14.8h
add v7.8h, v7.8h, v15.8h
add v8.8h, v8.8h, v16.8h
/* reduce */
sqdmulh v17.8h, v1.8h, v0.h[2]
sqdmulh v18.8h, v2.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v1.8h, v17.8h, v0.h[0]
mls v2.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v3.8h, v0.h[2]
sqdmulh v18.8h, v4.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v3.8h, v17.8h, v0.h[0]
mls v4.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v5.8h, v0.h[2]
sqdmulh v18.8h, v6.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v5.8h, v17.8h, v0.h[0]
mls v6.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v7.8h, v0.h[2]
sqdmulh v18.8h, v8.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v7.8h, v17.8h, v0.h[0]
mls v8.8h, v18.8h, v0.h[0]
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Pass 3 of 4. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
sub x0, x0, #0x80
/* r += a */
add v1.8h, v1.8h, v9.8h
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.8h, v4.8h, v12.8h
add v5.8h, v5.8h, v13.8h
add v6.8h, v6.8h, v14.8h
add v7.8h, v7.8h, v15.8h
add v8.8h, v8.8h, v16.8h
/* reduce */
sqdmulh v17.8h, v1.8h, v0.h[2]
sqdmulh v18.8h, v2.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v1.8h, v17.8h, v0.h[0]
mls v2.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v3.8h, v0.h[2]
sqdmulh v18.8h, v4.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v3.8h, v17.8h, v0.h[0]
mls v4.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v5.8h, v0.h[2]
sqdmulh v18.8h, v6.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v5.8h, v17.8h, v0.h[0]
mls v6.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v7.8h, v0.h[2]
sqdmulh v18.8h, v8.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v7.8h, v17.8h, v0.h[0]
mls v8.8h, v18.8h, v0.h[0]
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Pass 4 of 4. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
sub x0, x0, #0x80
/* r += a */
add v1.8h, v1.8h, v9.8h
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.8h, v4.8h, v12.8h
add v5.8h, v5.8h, v13.8h
add v6.8h, v6.8h, v14.8h
add v7.8h, v7.8h, v15.8h
add v8.8h, v8.8h, v16.8h
/* reduce */
sqdmulh v17.8h, v1.8h, v0.h[2]
sqdmulh v18.8h, v2.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v1.8h, v17.8h, v0.h[0]
mls v2.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v3.8h, v0.h[2]
sqdmulh v18.8h, v4.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v3.8h, v17.8h, v0.h[0]
mls v4.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v5.8h, v0.h[2]
sqdmulh v18.8h, v6.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v5.8h, v17.8h, v0.h[0]
mls v6.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v7.8h, v0.h[2]
sqdmulh v18.8h, v8.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v7.8h, v17.8h, v0.h[0]
mls v8.8h, v18.8h, v0.h[0]
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Epilogue: restore d8-d15, then x29/x30, and release the 80-byte frame. */
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_add_reduce,.-mlkem_add_reduce
#endif /* __APPLE__ */
/* mlkem_add3_reduce: r[i] = reduce(r[i] + a[i] + b[i]) over 256 16-bit lanes.
 *
 * In:    x0 = r, 0x200-byte buffer, read and overwritten with the result
 *        x1 = a, 0x200-byte buffer, read only
 *        x2 = b, 0x200-byte buffer, read only
 * Clobb: x0, x1, x2 (each left 0x200 bytes past its entry value), x3,
 *        v0-v26
 * Saved: x29, x30 and d8-d15 in an 80-byte stack frame
 *
 * v0 holds L_mlkem_aarch64_consts: h[0] = 0x0d01 (q), h[2] = 0x4ebf.
 * Barrett-style reduction applied to each lane x:
 *   t = sqdmulh(x, 0x4ebf)     high half of the doubled product
 *   t = t >> 11                arithmetic shift
 *   x = x - t * q              mls
 *
 * The buffers are handled in four passes of 0x80 bytes (64 values) each.
 */
#ifndef __APPLE__
.text
.globl mlkem_add3_reduce
.type mlkem_add3_reduce,@function
.align 2
mlkem_add3_reduce:
#else
.section __TEXT,__text
.globl _mlkem_add3_reduce
.p2align 2
_mlkem_add3_reduce:
#endif /* __APPLE__ */
/* Prologue: build the frame and save d8-d15, which are overwritten below. */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
/* x3 = address of the constants table; v0 = its eight 16-bit entries. */
#ifndef __APPLE__
adrp x3, L_mlkem_aarch64_consts
add x3, x3, :lo12:L_mlkem_aarch64_consts
#else
adrp x3, L_mlkem_aarch64_consts@PAGE
add x3, x3, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
ldr q0, [x3]
/* Pass 1 of 4: r into v1-v8, a into v9-v16, b into v17-v24. All three use
 * ld4, so lanes line up, and the st4 stores restore the memory order. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40
ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40
/* Rewind x0 so the result overwrites the r values just read. */
sub x0, x0, #0x80
/* r += a */
add v1.8h, v1.8h, v9.8h
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.8h, v4.8h, v12.8h
add v5.8h, v5.8h, v13.8h
add v6.8h, v6.8h, v14.8h
add v7.8h, v7.8h, v15.8h
add v8.8h, v8.8h, v16.8h
/* r += b */
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v3.8h, v3.8h, v19.8h
add v4.8h, v4.8h, v20.8h
add v5.8h, v5.8h, v21.8h
add v6.8h, v6.8h, v22.8h
add v7.8h, v7.8h, v23.8h
add v8.8h, v8.8h, v24.8h
/* Reduce two registers at a time, using v25/v26 as temporaries. */
sqdmulh v25.8h, v1.8h, v0.h[2]
sqdmulh v26.8h, v2.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v1.8h, v25.8h, v0.h[0]
mls v2.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v3.8h, v0.h[2]
sqdmulh v26.8h, v4.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v3.8h, v25.8h, v0.h[0]
mls v4.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v5.8h, v0.h[2]
sqdmulh v26.8h, v6.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v5.8h, v25.8h, v0.h[0]
mls v6.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v7.8h, v0.h[2]
sqdmulh v26.8h, v8.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v7.8h, v25.8h, v0.h[0]
mls v8.8h, v26.8h, v0.h[0]
/* Store pass 1; x0 now points at the next 0x80 bytes of r. */
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Pass 2 of 4: same sequence on the next 0x80 bytes. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40
ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40
sub x0, x0, #0x80
/* r += a */
add v1.8h, v1.8h, v9.8h
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.8h, v4.8h, v12.8h
add v5.8h, v5.8h, v13.8h
add v6.8h, v6.8h, v14.8h
add v7.8h, v7.8h, v15.8h
add v8.8h, v8.8h, v16.8h
/* r += b */
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v3.8h, v3.8h, v19.8h
add v4.8h, v4.8h, v20.8h
add v5.8h, v5.8h, v21.8h
add v6.8h, v6.8h, v22.8h
add v7.8h, v7.8h, v23.8h
add v8.8h, v8.8h, v24.8h
/* reduce */
sqdmulh v25.8h, v1.8h, v0.h[2]
sqdmulh v26.8h, v2.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v1.8h, v25.8h, v0.h[0]
mls v2.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v3.8h, v0.h[2]
sqdmulh v26.8h, v4.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v3.8h, v25.8h, v0.h[0]
mls v4.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v5.8h, v0.h[2]
sqdmulh v26.8h, v6.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v5.8h, v25.8h, v0.h[0]
mls v6.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v7.8h, v0.h[2]
sqdmulh v26.8h, v8.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v7.8h, v25.8h, v0.h[0]
mls v8.8h, v26.8h, v0.h[0]
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Pass 3 of 4. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40
ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40
sub x0, x0, #0x80
/* r += a */
add v1.8h, v1.8h, v9.8h
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.8h, v4.8h, v12.8h
add v5.8h, v5.8h, v13.8h
add v6.8h, v6.8h, v14.8h
add v7.8h, v7.8h, v15.8h
add v8.8h, v8.8h, v16.8h
/* r += b */
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v3.8h, v3.8h, v19.8h
add v4.8h, v4.8h, v20.8h
add v5.8h, v5.8h, v21.8h
add v6.8h, v6.8h, v22.8h
add v7.8h, v7.8h, v23.8h
add v8.8h, v8.8h, v24.8h
/* reduce */
sqdmulh v25.8h, v1.8h, v0.h[2]
sqdmulh v26.8h, v2.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v1.8h, v25.8h, v0.h[0]
mls v2.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v3.8h, v0.h[2]
sqdmulh v26.8h, v4.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v3.8h, v25.8h, v0.h[0]
mls v4.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v5.8h, v0.h[2]
sqdmulh v26.8h, v6.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v5.8h, v25.8h, v0.h[0]
mls v6.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v7.8h, v0.h[2]
sqdmulh v26.8h, v8.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v7.8h, v25.8h, v0.h[0]
mls v8.8h, v26.8h, v0.h[0]
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Pass 4 of 4. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40
ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40
sub x0, x0, #0x80
/* r += a */
add v1.8h, v1.8h, v9.8h
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.8h, v4.8h, v12.8h
add v5.8h, v5.8h, v13.8h
add v6.8h, v6.8h, v14.8h
add v7.8h, v7.8h, v15.8h
add v8.8h, v8.8h, v16.8h
/* r += b */
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v3.8h, v3.8h, v19.8h
add v4.8h, v4.8h, v20.8h
add v5.8h, v5.8h, v21.8h
add v6.8h, v6.8h, v22.8h
add v7.8h, v7.8h, v23.8h
add v8.8h, v8.8h, v24.8h
/* reduce */
sqdmulh v25.8h, v1.8h, v0.h[2]
sqdmulh v26.8h, v2.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v1.8h, v25.8h, v0.h[0]
mls v2.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v3.8h, v0.h[2]
sqdmulh v26.8h, v4.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v3.8h, v25.8h, v0.h[0]
mls v4.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v5.8h, v0.h[2]
sqdmulh v26.8h, v6.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v5.8h, v25.8h, v0.h[0]
mls v6.8h, v26.8h, v0.h[0]
sqdmulh v25.8h, v7.8h, v0.h[2]
sqdmulh v26.8h, v8.8h, v0.h[2]
sshr v25.8h, v25.8h, #11
sshr v26.8h, v26.8h, #11
mls v7.8h, v25.8h, v0.h[0]
mls v8.8h, v26.8h, v0.h[0]
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Epilogue: restore d8-d15, then x29/x30, and release the 80-byte frame. */
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_add3_reduce,.-mlkem_add3_reduce
#endif /* __APPLE__ */
/* mlkem_rsub_reduce: r[i] = reduce(a[i] - r[i]) over 256 16-bit lanes.
 *
 * The subtraction is reversed relative to the destination: the value already
 * in r is subtracted from a, and the result replaces r.
 *
 * In:    x0 = r, 0x200-byte buffer, read and overwritten with the result
 *        x1 = a, 0x200-byte buffer, read only
 * Clobb: x0 and x1 (both left 0x200 bytes past their entry values), x2,
 *        v0-v18
 * Saved: x29, x30 and d8-d15 in an 80-byte stack frame
 *
 * v0 holds L_mlkem_aarch64_consts: h[0] = 0x0d01 (q), h[2] = 0x4ebf.
 * Barrett-style reduction applied to each lane x:
 *   t = sqdmulh(x, 0x4ebf)     high half of the doubled product
 *   t = t >> 11                arithmetic shift
 *   x = x - t * q              mls
 *
 * The buffers are handled in four passes of 0x80 bytes (64 values) each.
 */
#ifndef __APPLE__
.text
.globl mlkem_rsub_reduce
.type mlkem_rsub_reduce,@function
.align 2
mlkem_rsub_reduce:
#else
.section __TEXT,__text
.globl _mlkem_rsub_reduce
.p2align 2
_mlkem_rsub_reduce:
#endif /* __APPLE__ */
/* Prologue: build the frame and save d8-d15, which are overwritten below. */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
/* x2 = address of the constants table; v0 = its eight 16-bit entries. */
#ifndef __APPLE__
adrp x2, L_mlkem_aarch64_consts
add x2, x2, :lo12:L_mlkem_aarch64_consts
#else
adrp x2, L_mlkem_aarch64_consts@PAGE
add x2, x2, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
ldr q0, [x2]
/* Pass 1 of 4: r into v1-v8, a into v9-v16. Both sides use ld4, so lanes
 * of r and a line up, and the st4 stores restore the memory order. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
/* Rewind x0 so the result overwrites the r values just read. */
sub x0, x0, #0x80
/* r = a - r (16-bit lane subtraction) */
sub v1.8h, v9.8h, v1.8h
sub v2.8h, v10.8h, v2.8h
sub v3.8h, v11.8h, v3.8h
sub v4.8h, v12.8h, v4.8h
sub v5.8h, v13.8h, v5.8h
sub v6.8h, v14.8h, v6.8h
sub v7.8h, v15.8h, v7.8h
sub v8.8h, v16.8h, v8.8h
/* Reduce two registers at a time, using v17/v18 as temporaries. */
sqdmulh v17.8h, v1.8h, v0.h[2]
sqdmulh v18.8h, v2.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v1.8h, v17.8h, v0.h[0]
mls v2.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v3.8h, v0.h[2]
sqdmulh v18.8h, v4.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v3.8h, v17.8h, v0.h[0]
mls v4.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v5.8h, v0.h[2]
sqdmulh v18.8h, v6.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v5.8h, v17.8h, v0.h[0]
mls v6.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v7.8h, v0.h[2]
sqdmulh v18.8h, v8.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v7.8h, v17.8h, v0.h[0]
mls v8.8h, v18.8h, v0.h[0]
/* Store pass 1; x0 now points at the next 0x80 bytes of r. */
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Pass 2 of 4: same sequence on the next 0x80 bytes. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
sub x0, x0, #0x80
/* r = a - r */
sub v1.8h, v9.8h, v1.8h
sub v2.8h, v10.8h, v2.8h
sub v3.8h, v11.8h, v3.8h
sub v4.8h, v12.8h, v4.8h
sub v5.8h, v13.8h, v5.8h
sub v6.8h, v14.8h, v6.8h
sub v7.8h, v15.8h, v7.8h
sub v8.8h, v16.8h, v8.8h
/* reduce */
sqdmulh v17.8h, v1.8h, v0.h[2]
sqdmulh v18.8h, v2.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v1.8h, v17.8h, v0.h[0]
mls v2.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v3.8h, v0.h[2]
sqdmulh v18.8h, v4.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v3.8h, v17.8h, v0.h[0]
mls v4.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v5.8h, v0.h[2]
sqdmulh v18.8h, v6.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v5.8h, v17.8h, v0.h[0]
mls v6.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v7.8h, v0.h[2]
sqdmulh v18.8h, v8.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v7.8h, v17.8h, v0.h[0]
mls v8.8h, v18.8h, v0.h[0]
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Pass 3 of 4. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
sub x0, x0, #0x80
/* r = a - r */
sub v1.8h, v9.8h, v1.8h
sub v2.8h, v10.8h, v2.8h
sub v3.8h, v11.8h, v3.8h
sub v4.8h, v12.8h, v4.8h
sub v5.8h, v13.8h, v5.8h
sub v6.8h, v14.8h, v6.8h
sub v7.8h, v15.8h, v7.8h
sub v8.8h, v16.8h, v8.8h
/* reduce */
sqdmulh v17.8h, v1.8h, v0.h[2]
sqdmulh v18.8h, v2.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v1.8h, v17.8h, v0.h[0]
mls v2.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v3.8h, v0.h[2]
sqdmulh v18.8h, v4.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v3.8h, v17.8h, v0.h[0]
mls v4.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v5.8h, v0.h[2]
sqdmulh v18.8h, v6.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v5.8h, v17.8h, v0.h[0]
mls v6.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v7.8h, v0.h[2]
sqdmulh v18.8h, v8.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v7.8h, v17.8h, v0.h[0]
mls v8.8h, v18.8h, v0.h[0]
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Pass 4 of 4. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
sub x0, x0, #0x80
/* r = a - r */
sub v1.8h, v9.8h, v1.8h
sub v2.8h, v10.8h, v2.8h
sub v3.8h, v11.8h, v3.8h
sub v4.8h, v12.8h, v4.8h
sub v5.8h, v13.8h, v5.8h
sub v6.8h, v14.8h, v6.8h
sub v7.8h, v15.8h, v7.8h
sub v8.8h, v16.8h, v8.8h
/* reduce */
sqdmulh v17.8h, v1.8h, v0.h[2]
sqdmulh v18.8h, v2.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v1.8h, v17.8h, v0.h[0]
mls v2.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v3.8h, v0.h[2]
sqdmulh v18.8h, v4.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v3.8h, v17.8h, v0.h[0]
mls v4.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v5.8h, v0.h[2]
sqdmulh v18.8h, v6.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v5.8h, v17.8h, v0.h[0]
mls v6.8h, v18.8h, v0.h[0]
sqdmulh v17.8h, v7.8h, v0.h[2]
sqdmulh v18.8h, v8.8h, v0.h[2]
sshr v17.8h, v17.8h, #11
sshr v18.8h, v18.8h, #11
mls v7.8h, v17.8h, v0.h[0]
mls v8.8h, v18.8h, v0.h[0]
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
/* Epilogue: restore d8-d15, then x29/x30, and release the 80-byte frame. */
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_rsub_reduce,.-mlkem_rsub_reduce
#endif /* __APPLE__ */
/* mlkem_to_mont: transform 256 16-bit lanes in place by a fixed
 * multiply-and-reduce sequence.
 *
 * In:    x0 = 0x200-byte buffer of 16-bit values, updated in place
 * Clobb: x0 (left 0x200 bytes past its entry value), x1, v0-v18
 * Saved: x29, x30 and d8-d15 in an 80-byte stack frame
 *
 * v0 holds L_mlkem_aarch64_consts: h[0] = 0x0d01 (q), h[3] = 0x0549,
 * h[4] = 0x5049. Note 0x5049 is the low 16 bits of 0x0549 * 0xf301, and
 * 0xf301 is h[1] of the same table.
 * Sequence applied to each lane x:
 *   t = low 16 bits of x * 0x5049     mul
 *   x = sqrdmulh(x, 0x0549)           rounded high half of doubled product
 *   t = sqrdmulh(t, q)
 *   x = (x - t) >> 1                  arithmetic shift
 * NOTE(review): this has the shape of a Montgomery multiplication of each
 * value by the constant 0x0549 - confirm against the generator script.
 *
 * The buffer is handled in two passes of 0x100 bytes (128 values) each.
 */
#ifndef __APPLE__
.text
.globl mlkem_to_mont
.type mlkem_to_mont,@function
.align 2
mlkem_to_mont:
#else
.section __TEXT,__text
.globl _mlkem_to_mont
.p2align 2
_mlkem_to_mont:
#endif /* __APPLE__ */
/* Prologue: build the frame and save d8-d15, which are overwritten below. */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
/* x1 = address of the constants table; v0 = its eight 16-bit entries. */
#ifndef __APPLE__
adrp x1, L_mlkem_aarch64_consts
add x1, x1, :lo12:L_mlkem_aarch64_consts
#else
adrp x1, L_mlkem_aarch64_consts@PAGE
add x1, x1, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
ldr q0, [x1]
/* Pass 1 of 2: load 0x100 bytes into v1-v16. The st4 stores below use the
 * same register groups as these ld4 loads, so memory order is preserved. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
/* Rewind x0 so the result overwrites the values just read. */
sub x0, x0, #0x100
/* Two registers per group, with v17/v18 holding t. v1, v2: */
mul v17.8h, v1.8h, v0.h[4]
mul v18.8h, v2.8h, v0.h[4]
sqrdmulh v1.8h, v1.8h, v0.h[3]
sqrdmulh v2.8h, v2.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v1.8h, v1.8h, v17.8h
sub v2.8h, v2.8h, v18.8h
sshr v1.8h, v1.8h, #1
sshr v2.8h, v2.8h, #1
/* v3, v4 */
mul v17.8h, v3.8h, v0.h[4]
mul v18.8h, v4.8h, v0.h[4]
sqrdmulh v3.8h, v3.8h, v0.h[3]
sqrdmulh v4.8h, v4.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v3.8h, v3.8h, v17.8h
sub v4.8h, v4.8h, v18.8h
sshr v3.8h, v3.8h, #1
sshr v4.8h, v4.8h, #1
/* v5, v6 */
mul v17.8h, v5.8h, v0.h[4]
mul v18.8h, v6.8h, v0.h[4]
sqrdmulh v5.8h, v5.8h, v0.h[3]
sqrdmulh v6.8h, v6.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v5.8h, v5.8h, v17.8h
sub v6.8h, v6.8h, v18.8h
sshr v5.8h, v5.8h, #1
sshr v6.8h, v6.8h, #1
/* v7, v8 */
mul v17.8h, v7.8h, v0.h[4]
mul v18.8h, v8.8h, v0.h[4]
sqrdmulh v7.8h, v7.8h, v0.h[3]
sqrdmulh v8.8h, v8.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v7.8h, v7.8h, v17.8h
sub v8.8h, v8.8h, v18.8h
sshr v7.8h, v7.8h, #1
sshr v8.8h, v8.8h, #1
/* v9, v10 */
mul v17.8h, v9.8h, v0.h[4]
mul v18.8h, v10.8h, v0.h[4]
sqrdmulh v9.8h, v9.8h, v0.h[3]
sqrdmulh v10.8h, v10.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v9.8h, v9.8h, v17.8h
sub v10.8h, v10.8h, v18.8h
sshr v9.8h, v9.8h, #1
sshr v10.8h, v10.8h, #1
/* v11, v12 */
mul v17.8h, v11.8h, v0.h[4]
mul v18.8h, v12.8h, v0.h[4]
sqrdmulh v11.8h, v11.8h, v0.h[3]
sqrdmulh v12.8h, v12.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v11.8h, v11.8h, v17.8h
sub v12.8h, v12.8h, v18.8h
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
/* v13, v14 */
mul v17.8h, v13.8h, v0.h[4]
mul v18.8h, v14.8h, v0.h[4]
sqrdmulh v13.8h, v13.8h, v0.h[3]
sqrdmulh v14.8h, v14.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v13.8h, v13.8h, v17.8h
sub v14.8h, v14.8h, v18.8h
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
/* v15, v16 */
mul v17.8h, v15.8h, v0.h[4]
mul v18.8h, v16.8h, v0.h[4]
sqrdmulh v15.8h, v15.8h, v0.h[3]
sqrdmulh v16.8h, v16.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v15.8h, v15.8h, v17.8h
sub v16.8h, v16.8h, v18.8h
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
/* Store pass 1; x0 ends at the start of the second 0x100 bytes. */
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
/* Pass 2 of 2: same sequence on the second 0x100 bytes. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
sub x0, x0, #0x100
/* v1, v2 */
mul v17.8h, v1.8h, v0.h[4]
mul v18.8h, v2.8h, v0.h[4]
sqrdmulh v1.8h, v1.8h, v0.h[3]
sqrdmulh v2.8h, v2.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v1.8h, v1.8h, v17.8h
sub v2.8h, v2.8h, v18.8h
sshr v1.8h, v1.8h, #1
sshr v2.8h, v2.8h, #1
/* v3, v4 */
mul v17.8h, v3.8h, v0.h[4]
mul v18.8h, v4.8h, v0.h[4]
sqrdmulh v3.8h, v3.8h, v0.h[3]
sqrdmulh v4.8h, v4.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v3.8h, v3.8h, v17.8h
sub v4.8h, v4.8h, v18.8h
sshr v3.8h, v3.8h, #1
sshr v4.8h, v4.8h, #1
/* v5, v6 */
mul v17.8h, v5.8h, v0.h[4]
mul v18.8h, v6.8h, v0.h[4]
sqrdmulh v5.8h, v5.8h, v0.h[3]
sqrdmulh v6.8h, v6.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v5.8h, v5.8h, v17.8h
sub v6.8h, v6.8h, v18.8h
sshr v5.8h, v5.8h, #1
sshr v6.8h, v6.8h, #1
/* v7, v8 */
mul v17.8h, v7.8h, v0.h[4]
mul v18.8h, v8.8h, v0.h[4]
sqrdmulh v7.8h, v7.8h, v0.h[3]
sqrdmulh v8.8h, v8.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v7.8h, v7.8h, v17.8h
sub v8.8h, v8.8h, v18.8h
sshr v7.8h, v7.8h, #1
sshr v8.8h, v8.8h, #1
/* v9, v10 */
mul v17.8h, v9.8h, v0.h[4]
mul v18.8h, v10.8h, v0.h[4]
sqrdmulh v9.8h, v9.8h, v0.h[3]
sqrdmulh v10.8h, v10.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v9.8h, v9.8h, v17.8h
sub v10.8h, v10.8h, v18.8h
sshr v9.8h, v9.8h, #1
sshr v10.8h, v10.8h, #1
/* v11, v12 */
mul v17.8h, v11.8h, v0.h[4]
mul v18.8h, v12.8h, v0.h[4]
sqrdmulh v11.8h, v11.8h, v0.h[3]
sqrdmulh v12.8h, v12.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v11.8h, v11.8h, v17.8h
sub v12.8h, v12.8h, v18.8h
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
/* v13, v14 */
mul v17.8h, v13.8h, v0.h[4]
mul v18.8h, v14.8h, v0.h[4]
sqrdmulh v13.8h, v13.8h, v0.h[3]
sqrdmulh v14.8h, v14.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v13.8h, v13.8h, v17.8h
sub v14.8h, v14.8h, v18.8h
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
/* v15, v16 */
mul v17.8h, v15.8h, v0.h[4]
mul v18.8h, v16.8h, v0.h[4]
sqrdmulh v15.8h, v15.8h, v0.h[3]
sqrdmulh v16.8h, v16.8h, v0.h[3]
sqrdmulh v17.8h, v17.8h, v0.h[0]
sqrdmulh v18.8h, v18.8h, v0.h[0]
sub v15.8h, v15.8h, v17.8h
sub v16.8h, v16.8h, v18.8h
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
/* Store pass 2. */
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
/* Epilogue: restore d8-d15, then x29/x30, and release the 80-byte frame. */
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_to_mont,.-mlkem_to_mont
#endif /* __APPLE__ */
/* mlkem_to_mont_sqrdmlsh: variant of mlkem_to_mont built on SQRDMLSH.
 *
 * Only assembled when WOLFSSL_AARCH64_NO_SQRDMLSH is not defined. Where
 * mlkem_to_mont issues a sqrdmulh followed by a sub, this routine issues a
 * single sqrdmlsh (multiply-high and subtract from the accumulator).
 *
 * In:    x0 = 0x200-byte buffer of 16-bit values, updated in place
 * Clobb: x0 (left 0x200 bytes past its entry value), x1, v0-v18
 * Saved: x29, x30 and d8-d15 in an 80-byte stack frame
 *
 * v0 holds L_mlkem_aarch64_consts: h[0] = 0x0d01 (q), h[3] = 0x0549,
 * h[4] = 0x5049.
 * Sequence applied to each lane x:
 *   t = low 16 bits of x * 0x5049     mul
 *   x = sqrdmulh(x, 0x0549)
 *   x = sqrdmlsh(x, t, q)             x minus the rounded high half of the
 *                                     doubled product t * q
 *   x = x >> 1                        arithmetic shift
 *
 * The buffer is handled in two passes of 0x100 bytes (128 values) each.
 */
#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH
#ifndef __APPLE__
.text
.globl mlkem_to_mont_sqrdmlsh
.type mlkem_to_mont_sqrdmlsh,@function
.align 2
mlkem_to_mont_sqrdmlsh:
#else
.section __TEXT,__text
.globl _mlkem_to_mont_sqrdmlsh
.p2align 2
_mlkem_to_mont_sqrdmlsh:
#endif /* __APPLE__ */
/* Prologue: build the frame and save d8-d15, which are overwritten below. */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
/* x1 = address of the constants table; v0 = its eight 16-bit entries. */
#ifndef __APPLE__
adrp x1, L_mlkem_aarch64_consts
add x1, x1, :lo12:L_mlkem_aarch64_consts
#else
adrp x1, L_mlkem_aarch64_consts@PAGE
add x1, x1, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
ldr q0, [x1]
/* Pass 1 of 2: load 0x100 bytes into v1-v16. The st4 stores below use the
 * same register groups as these ld4 loads, so memory order is preserved. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
/* Rewind x0 so the result overwrites the values just read. */
sub x0, x0, #0x100
/* Two registers per group, with v17/v18 holding t. v1, v2: */
mul v17.8h, v1.8h, v0.h[4]
mul v18.8h, v2.8h, v0.h[4]
sqrdmulh v1.8h, v1.8h, v0.h[3]
sqrdmulh v2.8h, v2.8h, v0.h[3]
sqrdmlsh v1.8h, v17.8h, v0.h[0]
sqrdmlsh v2.8h, v18.8h, v0.h[0]
sshr v1.8h, v1.8h, #1
sshr v2.8h, v2.8h, #1
/* v3, v4 */
mul v17.8h, v3.8h, v0.h[4]
mul v18.8h, v4.8h, v0.h[4]
sqrdmulh v3.8h, v3.8h, v0.h[3]
sqrdmulh v4.8h, v4.8h, v0.h[3]
sqrdmlsh v3.8h, v17.8h, v0.h[0]
sqrdmlsh v4.8h, v18.8h, v0.h[0]
sshr v3.8h, v3.8h, #1
sshr v4.8h, v4.8h, #1
/* v5, v6 */
mul v17.8h, v5.8h, v0.h[4]
mul v18.8h, v6.8h, v0.h[4]
sqrdmulh v5.8h, v5.8h, v0.h[3]
sqrdmulh v6.8h, v6.8h, v0.h[3]
sqrdmlsh v5.8h, v17.8h, v0.h[0]
sqrdmlsh v6.8h, v18.8h, v0.h[0]
sshr v5.8h, v5.8h, #1
sshr v6.8h, v6.8h, #1
/* v7, v8 */
mul v17.8h, v7.8h, v0.h[4]
mul v18.8h, v8.8h, v0.h[4]
sqrdmulh v7.8h, v7.8h, v0.h[3]
sqrdmulh v8.8h, v8.8h, v0.h[3]
sqrdmlsh v7.8h, v17.8h, v0.h[0]
sqrdmlsh v8.8h, v18.8h, v0.h[0]
sshr v7.8h, v7.8h, #1
sshr v8.8h, v8.8h, #1
/* v9, v10 */
mul v17.8h, v9.8h, v0.h[4]
mul v18.8h, v10.8h, v0.h[4]
sqrdmulh v9.8h, v9.8h, v0.h[3]
sqrdmulh v10.8h, v10.8h, v0.h[3]
sqrdmlsh v9.8h, v17.8h, v0.h[0]
sqrdmlsh v10.8h, v18.8h, v0.h[0]
sshr v9.8h, v9.8h, #1
sshr v10.8h, v10.8h, #1
/* v11, v12 */
mul v17.8h, v11.8h, v0.h[4]
mul v18.8h, v12.8h, v0.h[4]
sqrdmulh v11.8h, v11.8h, v0.h[3]
sqrdmulh v12.8h, v12.8h, v0.h[3]
sqrdmlsh v11.8h, v17.8h, v0.h[0]
sqrdmlsh v12.8h, v18.8h, v0.h[0]
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
/* v13, v14 */
mul v17.8h, v13.8h, v0.h[4]
mul v18.8h, v14.8h, v0.h[4]
sqrdmulh v13.8h, v13.8h, v0.h[3]
sqrdmulh v14.8h, v14.8h, v0.h[3]
sqrdmlsh v13.8h, v17.8h, v0.h[0]
sqrdmlsh v14.8h, v18.8h, v0.h[0]
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
/* v15, v16 */
mul v17.8h, v15.8h, v0.h[4]
mul v18.8h, v16.8h, v0.h[4]
sqrdmulh v15.8h, v15.8h, v0.h[3]
sqrdmulh v16.8h, v16.8h, v0.h[3]
sqrdmlsh v15.8h, v17.8h, v0.h[0]
sqrdmlsh v16.8h, v18.8h, v0.h[0]
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
/* Store pass 1; x0 ends at the start of the second 0x100 bytes. */
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
/* Pass 2 of 2: same sequence on the second 0x100 bytes. */
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
sub x0, x0, #0x100
/* v1, v2 */
mul v17.8h, v1.8h, v0.h[4]
mul v18.8h, v2.8h, v0.h[4]
sqrdmulh v1.8h, v1.8h, v0.h[3]
sqrdmulh v2.8h, v2.8h, v0.h[3]
sqrdmlsh v1.8h, v17.8h, v0.h[0]
sqrdmlsh v2.8h, v18.8h, v0.h[0]
sshr v1.8h, v1.8h, #1
sshr v2.8h, v2.8h, #1
/* v3, v4 */
mul v17.8h, v3.8h, v0.h[4]
mul v18.8h, v4.8h, v0.h[4]
sqrdmulh v3.8h, v3.8h, v0.h[3]
sqrdmulh v4.8h, v4.8h, v0.h[3]
sqrdmlsh v3.8h, v17.8h, v0.h[0]
sqrdmlsh v4.8h, v18.8h, v0.h[0]
sshr v3.8h, v3.8h, #1
sshr v4.8h, v4.8h, #1
/* v5, v6 */
mul v17.8h, v5.8h, v0.h[4]
mul v18.8h, v6.8h, v0.h[4]
sqrdmulh v5.8h, v5.8h, v0.h[3]
sqrdmulh v6.8h, v6.8h, v0.h[3]
sqrdmlsh v5.8h, v17.8h, v0.h[0]
sqrdmlsh v6.8h, v18.8h, v0.h[0]
sshr v5.8h, v5.8h, #1
sshr v6.8h, v6.8h, #1
/* v7, v8 */
mul v17.8h, v7.8h, v0.h[4]
mul v18.8h, v8.8h, v0.h[4]
sqrdmulh v7.8h, v7.8h, v0.h[3]
sqrdmulh v8.8h, v8.8h, v0.h[3]
sqrdmlsh v7.8h, v17.8h, v0.h[0]
sqrdmlsh v8.8h, v18.8h, v0.h[0]
sshr v7.8h, v7.8h, #1
sshr v8.8h, v8.8h, #1
/* v9, v10 */
mul v17.8h, v9.8h, v0.h[4]
mul v18.8h, v10.8h, v0.h[4]
sqrdmulh v9.8h, v9.8h, v0.h[3]
sqrdmulh v10.8h, v10.8h, v0.h[3]
sqrdmlsh v9.8h, v17.8h, v0.h[0]
sqrdmlsh v10.8h, v18.8h, v0.h[0]
sshr v9.8h, v9.8h, #1
sshr v10.8h, v10.8h, #1
/* v11, v12 */
mul v17.8h, v11.8h, v0.h[4]
mul v18.8h, v12.8h, v0.h[4]
sqrdmulh v11.8h, v11.8h, v0.h[3]
sqrdmulh v12.8h, v12.8h, v0.h[3]
sqrdmlsh v11.8h, v17.8h, v0.h[0]
sqrdmlsh v12.8h, v18.8h, v0.h[0]
sshr v11.8h, v11.8h, #1
sshr v12.8h, v12.8h, #1
/* v13, v14 */
mul v17.8h, v13.8h, v0.h[4]
mul v18.8h, v14.8h, v0.h[4]
sqrdmulh v13.8h, v13.8h, v0.h[3]
sqrdmulh v14.8h, v14.8h, v0.h[3]
sqrdmlsh v13.8h, v17.8h, v0.h[0]
sqrdmlsh v14.8h, v18.8h, v0.h[0]
sshr v13.8h, v13.8h, #1
sshr v14.8h, v14.8h, #1
/* v15, v16 */
mul v17.8h, v15.8h, v0.h[4]
mul v18.8h, v16.8h, v0.h[4]
sqrdmulh v15.8h, v15.8h, v0.h[3]
sqrdmulh v16.8h, v16.8h, v0.h[3]
sqrdmlsh v15.8h, v17.8h, v0.h[0]
sqrdmlsh v16.8h, v18.8h, v0.h[0]
sshr v15.8h, v15.8h, #1
sshr v16.8h, v16.8h, #1
/* Store pass 2. */
st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
/* Epilogue: restore d8-d15, then x29/x30, and release the 80-byte frame. */
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size mlkem_to_mont_sqrdmlsh,.-mlkem_to_mont_sqrdmlsh
#endif /* __APPLE__ */
#endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */
/* L_mlkem_to_msg_low: eight 16-bit lanes holding the lower bound of the
 * coefficient window that mlkem_to_msg_neon decodes as a 1 bit.
 * The routine loads this table into v0 and tests (coefficient >= v0) with a
 * signed cmge.
 *
 * Value: 0x0341 = 833 = ceil(q / 4) for q = 3329, the smallest coefficient
 * that rounds to 1 under the 1-bit compression round(2 * x / q) mod 2.
 * Together with the upper bound 0x09c0 = 2496 = floor(3 * q / 4) this gives
 * a window that is symmetric about q / 2. The previous value, 0x0373 (883),
 * made coefficients 833..882 decode as 0.
 */
#ifndef __APPLE__
.text
.type L_mlkem_to_msg_low, %object
.section .rodata
.size L_mlkem_to_msg_low, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_to_msg_low:
.short 0x0341,0x0341,0x0341,0x0341,0x0341,0x0341,0x0341,0x0341
/* L_mlkem_to_msg_high: eight 16-bit lanes of 0x09c0 (2496).
 * mlkem_to_msg_neon loads this table into v1 and uses it as the upper bound
 * of its range test: a signed cmge computes (v1 >= coefficient) per lane.
 * Placed in .rodata on ELF targets and in __DATA,__data on Apple targets.
 */
#ifndef __APPLE__
.text
.type L_mlkem_to_msg_high, %object
.section .rodata
.size L_mlkem_to_msg_high, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_to_msg_high:
.short 0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0
/* L_mlkem_to_msg_bits: per-lane bit weights 1 << j for 16-bit lanes j = 0..7.
 * mlkem_to_msg_neon loads this table into v26 and ANDs it with the all-ones /
 * all-zeros range-test mask of each lane, so lane j keeps only bit j; an addv
 * across the eight lanes then yields one packed output byte.
 */
#ifndef __APPLE__
.text
.type L_mlkem_to_msg_bits, %object
.section .rodata
.size L_mlkem_to_msg_bits, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_to_msg_bits:
.short 0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080
/* mlkem_to_msg_neon
 *
 * Packs 256 signed 16-bit coefficients into 32 bytes, one bit per
 * coefficient.
 *
 * In:   x0 = output buffer; 32 bytes are written (post-incremented)
 *       x1 = input coefficients; 512 bytes are read (post-incremented)
 * Out:  nothing is returned in a register
 * Uses: x2-x4, v0-v26; d8-d15 are saved and restored around the body
 *
 * A coefficient c yields a 1 bit when (c >= low) and (high >= c), both
 * compared as signed values, where low/high are the lanes of
 * L_mlkem_to_msg_low / L_mlkem_to_msg_high. Bit j of output byte i comes
 * from coefficient 8 * i + j.
 */
#ifndef __APPLE__
.text
.globl mlkem_to_msg_neon
.type mlkem_to_msg_neon,@function
.align 2
mlkem_to_msg_neon:
#else
.section __TEXT,__text
.globl _mlkem_to_msg_neon
.p2align 2
_mlkem_to_msg_neon:
#endif /* __APPLE__ */
        /* 80-byte frame: frame record plus the callee-saved d8-d15. */
        stp     x29, x30, [sp, #-80]!
        mov     x29, sp
        stp     d8, d9, [x29, #16]
        stp     d10, d11, [x29, #32]
        stp     d12, d13, [x29, #48]
        stp     d14, d15, [x29, #64]
        /* x2 -> lower bound, x3 -> upper bound, x4 -> per-lane bit weights. */
#ifndef __APPLE__
        adrp    x2, L_mlkem_to_msg_low
        add     x2, x2, :lo12:L_mlkem_to_msg_low
#else
        adrp    x2, L_mlkem_to_msg_low@PAGE
        add     x2, x2, L_mlkem_to_msg_low@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp    x3, L_mlkem_to_msg_high
        add     x3, x3, :lo12:L_mlkem_to_msg_high
#else
        adrp    x3, L_mlkem_to_msg_high@PAGE
        add     x3, x3, L_mlkem_to_msg_high@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp    x4, L_mlkem_to_msg_bits
        add     x4, x4, :lo12:L_mlkem_to_msg_bits
#else
        adrp    x4, L_mlkem_to_msg_bits@PAGE
        add     x4, x4, L_mlkem_to_msg_bits@PAGEOFF
#endif /* __APPLE__ */
        ldr     q0, [x2]                /* v0  = lower bound in every lane */
        ldr     q1, [x3]                /* v1  = upper bound in every lane */
        ldr     q26, [x4]               /* v26 = 1 << lane                 */

        /* Four passes; each turns 64 coefficients into 8 output bytes. */
.rept 4
        /* v2..v9: eight vectors of eight consecutive coefficients. */
        ld1     {v2.8h-v5.8h}, [x1], #64
        ld1     {v6.8h-v9.8h}, [x1], #64
        /* v10..v17 = (c >= low), v18..v25 = (high >= c). */
        cmge    v10.8h, v2.8h, v0.8h
        cmge    v18.8h, v1.8h, v2.8h
        cmge    v11.8h, v3.8h, v0.8h
        cmge    v19.8h, v1.8h, v3.8h
        cmge    v12.8h, v4.8h, v0.8h
        cmge    v20.8h, v1.8h, v4.8h
        cmge    v13.8h, v5.8h, v0.8h
        cmge    v21.8h, v1.8h, v5.8h
        cmge    v14.8h, v6.8h, v0.8h
        cmge    v22.8h, v1.8h, v6.8h
        cmge    v15.8h, v7.8h, v0.8h
        cmge    v23.8h, v1.8h, v7.8h
        cmge    v16.8h, v8.8h, v0.8h
        cmge    v24.8h, v1.8h, v8.8h
        cmge    v17.8h, v9.8h, v0.8h
        cmge    v25.8h, v1.8h, v9.8h
        /* In-range mask = both comparisons true. */
        and     v18.16b, v18.16b, v10.16b
        and     v19.16b, v19.16b, v11.16b
        and     v20.16b, v20.16b, v12.16b
        and     v21.16b, v21.16b, v13.16b
        and     v22.16b, v22.16b, v14.16b
        and     v23.16b, v23.16b, v15.16b
        and     v24.16b, v24.16b, v16.16b
        and     v25.16b, v25.16b, v17.16b
        /* Lane j keeps only bit j of its mask. */
        and     v18.16b, v18.16b, v26.16b
        and     v19.16b, v19.16b, v26.16b
        and     v20.16b, v20.16b, v26.16b
        and     v21.16b, v21.16b, v26.16b
        and     v22.16b, v22.16b, v26.16b
        and     v23.16b, v23.16b, v26.16b
        and     v24.16b, v24.16b, v26.16b
        and     v25.16b, v25.16b, v26.16b
        /* Sum of the eight distinct bit weights = the packed byte. */
        addv    h18, v18.8h
        addv    h19, v19.8h
        addv    h20, v20.8h
        addv    h21, v21.8h
        addv    h22, v22.8h
        addv    h23, v23.8h
        addv    h24, v24.8h
        addv    h25, v25.8h
        /* Gather the eight bytes into v18 and store them. */
        mov     v18.b[1], v19.b[0]
        mov     v18.b[2], v20.b[0]
        mov     v18.b[3], v21.b[0]
        mov     v18.b[4], v22.b[0]
        mov     v18.b[5], v23.b[0]
        mov     v18.b[6], v24.b[0]
        mov     v18.b[7], v25.b[0]
        st1     {v18.8b}, [x0], #8
.endr

        ldp     d8, d9, [x29, #16]
        ldp     d10, d11, [x29, #32]
        ldp     d12, d13, [x29, #48]
        ldp     d14, d15, [x29, #64]
        ldp     x29, x30, [sp], #80
        ret
#ifndef __APPLE__
.size mlkem_to_msg_neon,.-mlkem_to_msg_neon
#endif /* __APPLE__ */
/* L_mlkem_from_msg_q1half: eight 16-bit lanes of 0x0681 (1665).
 * mlkem_from_msg_neon loads this table into v1 and ANDs it with an
 * all-ones / all-zeros mask per lane, so every coefficient it writes is
 * either 0x0681 or 0.
 * NOTE(review): the name suggests (q + 1) / 2 with q = 3329 (0x0d01) -
 * confirm against the generator script.
 */
#ifndef __APPLE__
.text
.type L_mlkem_from_msg_q1half, %object
.section .rodata
.size L_mlkem_from_msg_q1half, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_from_msg_q1half:
.short 0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681
/* L_mlkem_from_msg_bits: byte lane j holds 1 << j for j = 0..7, and the
 * same eight bytes are repeated to fill 16 bytes.
 * mlkem_from_msg_neon loads all 16 bytes into v0 but only tests against the
 * low eight (cmtst on the .8b arrangement) to expand one message byte into
 * eight per-bit masks.
 * The table is only 2-byte aligned; it is read with a plain ldr q.
 */
#ifndef __APPLE__
.text
.type L_mlkem_from_msg_bits, %object
.section .rodata
.size L_mlkem_from_msg_bits, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 1
#else
.p2align 1
#endif /* __APPLE__ */
L_mlkem_from_msg_bits:
.byte 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
.byte 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
/* mlkem_from_msg_neon
 *
 * Expands a 32-byte message into 256 16-bit coefficients, one per message
 * bit.
 *
 * In:   x0 = output coefficients; 512 bytes are written (post-incremented)
 *       x1 = 32-byte message (read once, pointer not advanced)
 * Out:  nothing is returned in a register
 * Uses: x2, x3, v0-v7
 *
 * Coefficient 8 * i + j is 0x0681 when bit j of message byte i is set and 0
 * otherwise.
 *
 * Only caller-saved vector registers (v0-v7) are written, so no callee-saved
 * d registers need to be preserved; the frame holds just the x29/x30 frame
 * record.
 */
#ifndef __APPLE__
.text
.globl mlkem_from_msg_neon
.type mlkem_from_msg_neon,@function
.align 2
mlkem_from_msg_neon:
#else
.section __TEXT,__text
.globl _mlkem_from_msg_neon
.p2align 2
_mlkem_from_msg_neon:
#endif /* __APPLE__ */
        stp     x29, x30, [sp, #-16]!
        add     x29, sp, #0
        /* x2 -> 0x0681 lanes, x3 -> per-byte bit weights 0x01..0x80. */
#ifndef __APPLE__
        adrp    x2, L_mlkem_from_msg_q1half
        add     x2, x2, :lo12:L_mlkem_from_msg_q1half
#else
        adrp    x2, L_mlkem_from_msg_q1half@PAGE
        add     x2, x2, L_mlkem_from_msg_q1half@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp    x3, L_mlkem_from_msg_bits
        add     x3, x3, :lo12:L_mlkem_from_msg_bits
#else
        adrp    x3, L_mlkem_from_msg_bits@PAGE
        add     x3, x3, L_mlkem_from_msg_bits@PAGEOFF
#endif /* __APPLE__ */
        ld1     {v2.16b, v3.16b}, [x1]  /* v2 = msg[0..15], v3 = msg[16..31] */
        ldr     q1, [x2]                /* v1 = 0x0681 in every 16-bit lane  */
        ldr     q0, [x3]                /* v0 = bit weights, one per byte    */

        /* Each group below handles four message bytes (32 coefficients):
         *   dup   - broadcast the message byte to eight byte lanes
         *   cmtst - lane j becomes 0xff when bit j of the byte is set
         *   zip1  - double every byte, widening the mask to 16-bit lanes
         *   and   - select 0x0681 or 0
         */

        /* Message bytes 0..3. */
        dup     v4.8b, v2.b[0]
        dup     v5.8b, v2.b[1]
        dup     v6.8b, v2.b[2]
        dup     v7.8b, v2.b[3]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40

        /* Message bytes 4..7. */
        dup     v4.8b, v2.b[4]
        dup     v5.8b, v2.b[5]
        dup     v6.8b, v2.b[6]
        dup     v7.8b, v2.b[7]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40

        /* Message bytes 8..11. */
        dup     v4.8b, v2.b[8]
        dup     v5.8b, v2.b[9]
        dup     v6.8b, v2.b[10]
        dup     v7.8b, v2.b[11]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40

        /* Message bytes 12..15. */
        dup     v4.8b, v2.b[12]
        dup     v5.8b, v2.b[13]
        dup     v6.8b, v2.b[14]
        dup     v7.8b, v2.b[15]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40

        /* Message bytes 16..19. */
        dup     v4.8b, v3.b[0]
        dup     v5.8b, v3.b[1]
        dup     v6.8b, v3.b[2]
        dup     v7.8b, v3.b[3]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40

        /* Message bytes 20..23. */
        dup     v4.8b, v3.b[4]
        dup     v5.8b, v3.b[5]
        dup     v6.8b, v3.b[6]
        dup     v7.8b, v3.b[7]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40

        /* Message bytes 24..27. */
        dup     v4.8b, v3.b[8]
        dup     v5.8b, v3.b[9]
        dup     v6.8b, v3.b[10]
        dup     v7.8b, v3.b[11]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40

        /* Message bytes 28..31. */
        dup     v4.8b, v3.b[12]
        dup     v5.8b, v3.b[13]
        dup     v6.8b, v3.b[14]
        dup     v7.8b, v3.b[15]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40

        ldp     x29, x30, [sp], #16
        ret
#ifndef __APPLE__
.size mlkem_from_msg_neon,.-mlkem_from_msg_neon
#endif /* __APPLE__ */
/* mlkem_cmp_neon
 *
 * Compares two byte buffers without any data-dependent branch.
 *
 * In:   x0 = first buffer  (post-incremented as it is read)
 *       x1 = second buffer (post-incremented as it is read)
 *       w2 = length in bytes
 * Out:  w0 = 0 when every compared byte is equal, 0xffffffff otherwise
 * Uses: v0-v11, flags; d8-d11 are saved and restored around the body
 *
 * Exactly three lengths terminate early: 768 (0x300), 1088 (0x300 + 0x140)
 * and, by falling through, 1568. Any other length also falls through and
 * compares 1568 bytes. The only branches test w2, never buffer contents.
 *
 * v8..v11 accumulate the OR of (a XOR b) over everything read so far. ld4
 * de-interleaves each 64-byte chunk, so all sixteen byte lanes of each
 * accumulator carry difference bits and every lane must reach the final
 * test.
 */
#ifndef __APPLE__
.text
.globl mlkem_cmp_neon
.type mlkem_cmp_neon,@function
.align 2
mlkem_cmp_neon:
#else
.section __TEXT,__text
.globl _mlkem_cmp_neon
.p2align 2
_mlkem_cmp_neon:
#endif /* __APPLE__ */
        stp     x29, x30, [sp, #-48]!
        add     x29, sp, #0
        stp     d8, d9, [x29, #16]
        stp     d10, d11, [x29, #32]

        /* Bytes 0..63: initialise the accumulators with the first XOR. */
        ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40
        ld4     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40
        eor     v8.16b, v0.16b, v4.16b
        eor     v9.16b, v1.16b, v5.16b
        eor     v10.16b, v2.16b, v6.16b
        eor     v11.16b, v3.16b, v7.16b

        /* Bytes 64..767: eleven more 64-byte chunks. */
.rept 11
        ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40
        ld4     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v7.16b
        orr     v8.16b, v8.16b, v0.16b
        orr     v9.16b, v9.16b, v1.16b
        orr     v10.16b, v10.16b, v2.16b
        orr     v11.16b, v11.16b, v3.16b
.endr
        subs    w2, w2, #0x300          /* length == 768: finished */
        beq     L_mlkem_aarch64_cmp_neon_done

        /* Bytes 768..1087: five 64-byte chunks. */
.rept 5
        ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40
        ld4     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v7.16b
        orr     v8.16b, v8.16b, v0.16b
        orr     v9.16b, v9.16b, v1.16b
        orr     v10.16b, v10.16b, v2.16b
        orr     v11.16b, v11.16b, v3.16b
.endr
        subs    w2, w2, #0x140          /* length == 1088: finished */
        beq     L_mlkem_aarch64_cmp_neon_done

        /* Bytes 1088..1535: seven 64-byte chunks. */
.rept 7
        ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40
        ld4     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v7.16b
        orr     v8.16b, v8.16b, v0.16b
        orr     v9.16b, v9.16b, v1.16b
        orr     v10.16b, v10.16b, v2.16b
        orr     v11.16b, v11.16b, v3.16b
.endr
        /* Bytes 1536..1567: final 32-byte tail. */
        ld2     {v0.16b, v1.16b}, [x0]
        ld2     {v4.16b, v5.16b}, [x1]
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        orr     v8.16b, v8.16b, v0.16b
        orr     v9.16b, v9.16b, v1.16b

L_mlkem_aarch64_cmp_neon_done:
        /* Merge the four accumulators into v8. */
        orr     v8.16b, v8.16b, v9.16b
        orr     v10.16b, v10.16b, v11.16b
        orr     v8.16b, v8.16b, v10.16b
        /* Fold the upper 64 bits of v8 onto the lower 64 bits. The whole
         * upper doubleword must be moved: copying a single byte would leave
         * byte lanes 8..15 out of the result and report buffers that differ
         * only in those lanes as equal.
         */
        ins     v9.d[0], v8.d[1]
        orr     v8.16b, v8.16b, v9.16b
        mov     x0, v8.d[0]
        subs    x0, x0, xzr             /* set Z iff no difference bit is set */
        csetm   w0, ne                  /* w0 = all ones when different, else 0 */

        ldp     d8, d9, [x29, #16]
        ldp     d10, d11, [x29, #32]
        ldp     x29, x30, [sp], #48
        ret
#ifndef __APPLE__
.size mlkem_cmp_neon,.-mlkem_cmp_neon
#endif /* __APPLE__ */
/* L_mlkem_rej_uniform_mask: eight 16-bit lanes of 0x0fff (low 12 bits set).
 * The code that reads this table is outside this part of the file.
 * NOTE(review): presumably used to keep 12-bit candidate values during
 * rejection sampling - confirm against the consumer.
 */
#ifndef __APPLE__
.text
.type L_mlkem_rej_uniform_mask, %object
.section .rodata
.size L_mlkem_rej_uniform_mask, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_rej_uniform_mask:
.short 0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff
/* L_mlkem_rej_uniform_bits: 16-bit lane j holds 1 << j for j = 0..7.
 * The code that reads this table is outside this part of the file.
 * NOTE(review): presumably combined with per-lane accept masks to build an
 * 8-bit selector for L_mlkem_rej_uniform_indices - confirm against the
 * consumer.
 */
#ifndef __APPLE__
.text
.type L_mlkem_rej_uniform_bits, %object
.section .rodata
.size L_mlkem_rej_uniform_bits, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_rej_uniform_bits:
.short 0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080
/* L_mlkem_rej_uniform_indices: 256 entries of 16 bytes each (4096 bytes).
 *
 * Each entry is a TBL index vector that compacts the accepted 16-bit lanes
 * of an 8-lane vector to the front. Entry m corresponds to the 8-bit
 * bitmap m of accepted lanes: for every set bit i of m, in ascending
 * order, the entry lists the byte pair (2*i, 2*i+1). Unused trailing bytes
 * are 0xff, which is out of range for a single-register TBL and therefore
 * yields a zero byte in the result.
 *
 * mlkem_rej_uniform_neon indexes the table with (bitmap << 4) and loads the
 * entry with a 16-byte `ldr q`.
 *
 * Placed in .rodata for ELF targets and in __DATA,__data for Apple targets.
 * The requested alignment is 2^1 = 2 bytes.
 * NOTE(review): entries are therefore not guaranteed to be 16-byte aligned;
 * the vector loads rely on unaligned access being permitted - confirm for
 * strict-alignment environments.
 */
#ifndef __APPLE__
.text
.type L_mlkem_rej_uniform_indices, %object
.section .rodata
.size L_mlkem_rej_uniform_indices, 4096
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 1
#else
.p2align 1
#endif /* __APPLE__ */
L_mlkem_rej_uniform_indices:
/* Entries 0x00-0x0f: lanes 4-7 rejected */
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
/* Entries 0x10-0x1f: lane 4 accepted */
.byte 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x08,0x09,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x08,0x09,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x08,0x09,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x08,0x09,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x08,0x09,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x08,0x09,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x08,0x09,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x08,0x09,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff
/* Entries 0x20-0x2f: lane 5 accepted */
.byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x0a,0x0b,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x0a,0x0b,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x0a,0x0b,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x0a,0x0b,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff
/* Entries 0x30-0x3f: lanes 4 and 5 accepted */
.byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09
.byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff
/* Entries 0x40-0x4f: lane 6 accepted */
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
/* Entries 0x50-0x5f: lanes 4 and 6 accepted */
.byte 0x08,0x09,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x08,0x09,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x08,0x09,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x08,0x09,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x08,0x09,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x08,0x09,0x0c,0x0d,0xff,0xff,0xff,0xff
/* Entries 0x60-0x6f: lanes 5 and 6 accepted */
.byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x0a,0x0b,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x0a,0x0b,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x0a,0x0b,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x0a,0x0b,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff
/* Entries 0x70-0x7f: lanes 4, 5 and 6 accepted */
.byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09
.byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff
/* Entries 0x80-0x8f: lane 7 accepted */
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
/* Entries 0x90-0x9f: lanes 4 and 7 accepted */
.byte 0x08,0x09,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x08,0x09,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x08,0x09,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x08,0x09,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x08,0x09,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x08,0x09,0x0e,0x0f,0xff,0xff,0xff,0xff
/* Entries 0xa0-0xaf: lanes 5 and 7 accepted */
.byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x0a,0x0b,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x0a,0x0b,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x0a,0x0b,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x0a,0x0b,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff
/* Entries 0xb0-0xbf: lanes 4, 5 and 7 accepted */
.byte 0x08,0x09,0x0a,0x0b,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09
.byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x08,0x09,0x0a,0x0b,0x0e,0x0f,0xff,0xff
/* Entries 0xc0-0xcf: lanes 6 and 7 accepted */
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
/* Entries 0xd0-0xdf: lanes 4, 6 and 7 accepted */
.byte 0x08,0x09,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x08,0x09,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x08,0x09,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x08,0x09,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x08,0x09,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x08,0x09,0x0c,0x0d,0x0e,0x0f,0xff,0xff
/* Entries 0xe0-0xef: lanes 5, 6 and 7 accepted */
.byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x06,0x07,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
/* Entries 0xf0-0xff: lanes 4-7 accepted */
.byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09
.byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d
.byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b
.byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
.byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09
.byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
.byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
/* mlkem_rej_uniform_neon
 *
 * Rejection-samples 12-bit values from a byte stream. Every 3 input bytes
 * b0,b1,b2 are unpacked into two candidates:
 *     d1 = b0 | ((b1 & 0x0f) << 8)
 *     d2 = (b1 >> 4) | (b2 << 4)
 * and each candidate that is below the modulus is stored, in stream order,
 * as a 16-bit value to the output. The scalar paths compare against
 * 0xd01 (3329); the vector path compares against L_mlkem_aarch64_q, which
 * is defined outside this section (presumably eight lanes of 0x0d01 -
 * TODO confirm).
 *
 * Register use on entry, as read by the code below:
 *   x0  output pointer; accepted values are stored here as halfwords
 *   w1  number of values wanted
 *   x2  input byte pointer
 *   w3  number of input bytes available
 * Returns in x0 the number of values stored (the running count in x12).
 *
 * NOTE(review): presumably declared in C as
 *     unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len,
 *                                         const byte* r, unsigned int rLen);
 * confirm against the header.
 *
 * Caller requirements visible in the code:
 *   - w3 is only tested for reaching exactly zero after subtracting 24
 *     (vector loop) or 6 (scalar loops); the input length must make the
 *     running count hit zero, otherwise the loops keep reading.
 *     NOTE(review): confirm the caller always passes such a length.
 *   - w3 is not checked before the first read, so it must be non-zero when
 *     w1 is non-zero.
 *   - The scalar paths load 8 bytes with `ldr x4` but consume only 6, so up
 *     to 2 bytes beyond the consumed input are read (their value is unused).
 *
 * Saves and restores d8-d13 (the callee-saved low halves of v8-v13).
 */
#ifndef __APPLE__
.text
.globl mlkem_rej_uniform_neon
.type mlkem_rej_uniform_neon,@function
.align 2
mlkem_rej_uniform_neon:
#else
.section __TEXT,__text
.globl _mlkem_rej_uniform_neon
.p2align 2
_mlkem_rej_uniform_neon:
#endif /* __APPLE__ */
/* 64-byte frame: x29/x30 at +0, d8-d13 at +16 .. +63. */
stp x29, x30, [sp, #-64]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
/* x4 = address of the 0x0fff lane mask. */
#ifndef __APPLE__
adrp x4, L_mlkem_rej_uniform_mask
add x4, x4, :lo12:L_mlkem_rej_uniform_mask
#else
adrp x4, L_mlkem_rej_uniform_mask@PAGE
add x4, x4, L_mlkem_rej_uniform_mask@PAGEOFF
#endif /* __APPLE__ */
/* x5 = address of the modulus vector (defined elsewhere in the file). */
#ifndef __APPLE__
adrp x5, L_mlkem_aarch64_q
add x5, x5, :lo12:L_mlkem_aarch64_q
#else
adrp x5, L_mlkem_aarch64_q@PAGE
add x5, x5, L_mlkem_aarch64_q@PAGEOFF
#endif /* __APPLE__ */
/* x6 = address of the per-lane bit values (1 << lane). */
#ifndef __APPLE__
adrp x6, L_mlkem_rej_uniform_bits
add x6, x6, :lo12:L_mlkem_rej_uniform_bits
#else
adrp x6, L_mlkem_rej_uniform_bits@PAGE
add x6, x6, L_mlkem_rej_uniform_bits@PAGEOFF
#endif /* __APPLE__ */
/* x7 = base of the 256-entry TBL compaction table. */
#ifndef __APPLE__
adrp x7, L_mlkem_rej_uniform_indices
add x7, x7, :lo12:L_mlkem_rej_uniform_indices
#else
adrp x7, L_mlkem_rej_uniform_indices@PAGE
add x7, x7, L_mlkem_rej_uniform_indices@PAGEOFF
#endif /* __APPLE__ */
/* v1 = 0: zero source used to widen bytes to halfwords with zip1.
 * x12 = 0: number of values stored so far.
 * v10-v13 are cleared as well; each is written again in the vector loop
 * before it is read. */
eor v1.16b, v1.16b, v1.16b
eor v12.16b, v12.16b, v12.16b
eor v13.16b, v13.16b, v13.16b
eor x12, x12, x12
eor v10.16b, v10.16b, v10.16b
eor v11.16b, v11.16b, v11.16b
/* x13 = 0xd01, the bound used by the scalar paths.
 * v0 = 12-bit mask, v3 = modulus lanes, v2 = lane bit values. */
mov x13, #0xd01
ldr q0, [x4]
ldr q3, [x5]
ldr q2, [x6]
/* Nothing wanted: return 0. */
subs wzr, w1, #0
beq L_mlkem_rej_uniform_done
/* Fewer than 16 wanted (signed compare): skip the vector loop. */
subs wzr, w1, #16
blt L_mlkem_rej_uniform_loop_4
/* Vector loop: runs while at least 16 values are still wanted. Each pass
 * consumes 24 input bytes and produces 16 candidates. */
L_mlkem_rej_uniform_loop_16:
/* De-interleave by 3: v4 = bytes 0,3,6,.., v5 = bytes 1,4,7,..,
 * v6 = bytes 2,5,8,.. */
ld3 {v4.8b, v5.8b, v6.8b}, [x2], #24
/* Zero-extend each byte to a 16-bit lane. */
zip1 v4.16b, v4.16b, v1.16b
zip1 v5.16b, v5.16b, v1.16b
zip1 v6.16b, v6.16b, v1.16b
/* v4 = b0 | (b1 << 8), v5 = (b1 >> 4) | (b2 << 4). */
shl v7.8h, v5.8h, #8
ushr v8.8h, v5.8h, #4
shl v6.8h, v6.8h, #4
orr v4.16b, v4.16b, v7.16b
orr v5.16b, v8.16b, v6.16b
/* Keep 12 bits: v7 = first candidate of each triple, v8 = second. */
and v7.16b, v4.16b, v0.16b
and v8.16b, v5.16b, v0.16b
/* Re-interleave into stream order: v4 = candidates 0-7, v5 = 8-15. */
zip1 v4.8h, v7.8h, v8.8h
zip2 v5.8h, v7.8h, v8.8h
/* Lane = 0xffff where modulus > candidate (accepted), else 0. */
cmgt v7.8h, v3.8h, v4.8h
cmgt v8.8h, v3.8h, v5.8h
/* Reduce each accept mask to 0/1 per lane and sum: x10 / x11 = number of
 * accepted candidates in the low / high group of eight. */
ushr v12.8h, v7.8h, #15
ushr v13.8h, v8.8h, #15
addv h12, v12.8h
addv h13, v13.8h
mov x10, v12.d[0]
mov x11, v13.d[0]
/* Build the 8-bit accept bitmaps (bit i set when lane i accepted). */
and v10.16b, v7.16b, v2.16b
and v11.16b, v8.16b, v2.16b
addv h10, v10.8h
addv h11, v11.8h
mov w8, v10.s[0]
mov w9, v11.s[0]
/* bitmap * 16 = byte offset of the matching 16-byte index entry. */
lsl w8, w8, #4
lsl w9, w9, #4
ldr q10, [x7, x8]
ldr q11, [x7, x9]
/* Compact accepted lanes to the front; 0xff indices give zero lanes. */
tbl v7.16b, {v4.16b}, v10.16b
tbl v8.16b, {v5.16b}, v11.16b
/* Always store a full 16 bytes, then advance only past the accepted
 * values; the zero tail is overwritten by the next store. */
str q7, [x0]
add x0, x0, x10, lsl 1
add x12, x12, x10
str q8, [x0]
add x0, x0, x11, lsl 1
add x12, x12, x11
/* 24 input bytes consumed; stop when the input count reaches zero. */
subs w3, w3, #24
beq L_mlkem_rej_uniform_done
/* Stay in the vector loop only while at least 16 are still wanted. */
sub w10, w1, w12
subs x10, x10, #16
blt L_mlkem_rej_uniform_loop_4
b L_mlkem_rej_uniform_loop_16
/* Scalar loop: runs while at least 4 values are still wanted. Each pass
 * consumes 6 input bytes and produces 4 candidates. */
L_mlkem_rej_uniform_loop_4:
subs w10, w1, w12
beq L_mlkem_rej_uniform_done
subs x10, x10, #4
blt L_mlkem_rej_uniform_loop_lt_4
/* 8-byte load, pointer advanced by 6: only the low 48 bits are used. */
ldr x4, [x2], #6
/* Extract four 12-bit fields at bit offsets 0, 12, 24 and 36. */
lsr x5, x4, #12
lsr x6, x4, #24
lsr x7, x4, #36
and x4, x4, #0xfff
and x5, x5, #0xfff
and x6, x6, #0xfff
and x7, x7, #0xfff
/* For each candidate: store it unconditionally, then, if it is below
 * 0xd01, advance the output pointer by 2 (two cinc by 1) and the count by
 * 1. cinc does not change the flags, so all three use the same compare.
 * A rejected candidate is overwritten by the next store. */
strh w4, [x0]
subs xzr, x4, x13
cinc x0, x0, lt
cinc x0, x0, lt
cinc x12, x12, lt
strh w5, [x0]
subs xzr, x5, x13
cinc x0, x0, lt
cinc x0, x0, lt
cinc x12, x12, lt
strh w6, [x0]
subs xzr, x6, x13
cinc x0, x0, lt
cinc x0, x0, lt
cinc x12, x12, lt
strh w7, [x0]
subs xzr, x7, x13
cinc x0, x0, lt
cinc x0, x0, lt
cinc x12, x12, lt
/* 6 input bytes consumed; stop when the input count reaches zero. */
subs w3, w3, #6
beq L_mlkem_rej_uniform_done
b L_mlkem_rej_uniform_loop_4
/* Tail loop: fewer than 4 values still wanted. Same unpacking as above,
 * but the count is compared with w1 after every candidate so that no
 * store happens once the wanted number has been reached. */
L_mlkem_rej_uniform_loop_lt_4:
/* 8-byte load, pointer advanced by 6: only the low 48 bits are used. */
ldr x4, [x2], #6
lsr x5, x4, #12
lsr x6, x4, #24
lsr x7, x4, #36
and x4, x4, #0xfff
and x5, x5, #0xfff
and x6, x6, #0xfff
and x7, x7, #0xfff
strh w4, [x0]
subs xzr, x4, x13
cinc x0, x0, lt
cinc x0, x0, lt
cinc x12, x12, lt
subs wzr, w1, w12
beq L_mlkem_rej_uniform_done
strh w5, [x0]
subs xzr, x5, x13
cinc x0, x0, lt
cinc x0, x0, lt
cinc x12, x12, lt
subs wzr, w1, w12
beq L_mlkem_rej_uniform_done
strh w6, [x0]
subs xzr, x6, x13
cinc x0, x0, lt
cinc x0, x0, lt
cinc x12, x12, lt
subs wzr, w1, w12
beq L_mlkem_rej_uniform_done
strh w7, [x0]
subs xzr, x7, x13
cinc x0, x0, lt
cinc x0, x0, lt
cinc x12, x12, lt
subs wzr, w1, w12
beq L_mlkem_rej_uniform_done
/* 6 input bytes consumed; stop when the input count reaches zero. */
subs w3, w3, #6
beq L_mlkem_rej_uniform_done
b L_mlkem_rej_uniform_loop_lt_4
/* Return the number of values stored and unwind the frame. */
L_mlkem_rej_uniform_done:
mov x0, x12
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp x29, x30, [sp], #0x40
ret
#ifndef __APPLE__
.size mlkem_rej_uniform_neon,.-mlkem_rej_uniform_neon
#endif /* __APPLE__ */
/* L_sha3_aarch64_r: 24 64-bit constants (192 bytes), 8-byte aligned.
 *
 * The values are the 24 round constants RC[0..23] of the Keccak-f[1600]
 * permutation (FIPS 202), one per round, in round order.
 * mlkem_sha3_blocksx3_neon takes the address of this table into x27.
 *
 * Placed in .rodata for ELF targets and in __DATA,__data for Apple targets.
 */
#ifndef __APPLE__
.text
.type L_sha3_aarch64_r, %object
.section .rodata
.size L_sha3_aarch64_r, 192
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 3
#else
.p2align 3
#endif /* __APPLE__ */
L_sha3_aarch64_r:
.xword 0x0000000000000001
.xword 0x0000000000008082
.xword 0x800000000000808a
.xword 0x8000000080008000
.xword 0x000000000000808b
.xword 0x0000000080000001
.xword 0x8000000080008081
.xword 0x8000000000008009
.xword 0x000000000000008a
.xword 0x0000000000000088
.xword 0x0000000080008009
.xword 0x000000008000000a
.xword 0x000000008000808b
.xword 0x800000000000008b
.xword 0x8000000000008089
.xword 0x8000000000008003
.xword 0x8000000000008002
.xword 0x8000000000000080
.xword 0x000000000000800a
.xword 0x800000008000000a
.xword 0x8000000080008081
.xword 0x8000000000008080
.xword 0x0000000080000001
.xword 0x8000000080008008
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
/* mlkem_sha3_blocksx3_neon (variant assembled when
 * WOLFSSL_ARMASM_CRYPTO_SHA3 is defined)
 *
 * Runs the 24-round Keccak-f[1600] permutation in place on three
 * independent states at once.
 *
 * In:  x0 = pointer to 600 bytes holding three consecutive states of
 *           25 64-bit words each. The same memory receives the result.
 *
 * Register use inside the loop:
 *   v0-v24   lane 0 = state 0, lane 1 = state 1 (word i in v<i>)
 *   x1-x17, x19-x26   state 2, words 0-24 in that order (x18 unused)
 *   v25-v31, x0, x27, x28, x30   temporaries
 * The NEON stream uses the SHA3 extension instructions eor3, rax1, xar
 * and bcax. NEON and scalar instructions are interleaved, so the
 * section comments mark phases only approximately.
 *
 * Frame (224 bytes, x29 = frame base):
 *   [24] [32]   spilled column parities of state 2
 *   [40]        state pointer
 *   [48] [56]   round constant pointer, rounds remaining
 *   [72..159]   saved x17, x19-x28
 *   [160..223]  saved d8-d15
 * x30 is overwritten as scratch and reloaded from the frame before ret.
 */
#ifndef __APPLE__
.text
.globl mlkem_sha3_blocksx3_neon
.type mlkem_sha3_blocksx3_neon,@function
.align 2
mlkem_sha3_blocksx3_neon:
#else
.section __TEXT,__text
.globl _mlkem_sha3_blocksx3_neon
.p2align 2
_mlkem_sha3_blocksx3_neon:
#endif /* __APPLE__ */
# Reserve the frame and save the registers listed in the header
stp x29, x30, [sp, #-224]!
add x29, sp, #0
stp x17, x19, [x29, #72]
stp x20, x21, [x29, #88]
stp x22, x23, [x29, #104]
stp x24, x25, [x29, #120]
stp x26, x27, [x29, #136]
str x28, [x29, #152]
stp d8, d9, [x29, #160]
stp d10, d11, [x29, #176]
stp d12, d13, [x29, #192]
stp d14, d15, [x29, #208]
# Point x27 at the round constant table
#ifndef __APPLE__
adrp x27, L_sha3_aarch64_r
add x27, x27, :lo12:L_sha3_aarch64_r
#else
adrp x27, L_sha3_aarch64_r@PAGE
add x27, x27, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
# Keep the state pointer for the final store
str x0, [x29, #40]
# State 0 goes to lane 0 of v0-v24
ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
ld1 {v24.d}[0], [x0]
add x0, x0, #8
# State 1 goes to lane 1 of v0-v24
ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
ld4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
ld4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
ld4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
ld4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
ld4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
ld1 {v24.d}[1], [x0]
add x0, x0, #8
# State 2 goes to x1-x17 and x19-x26
ldp x1, x2, [x0]
ldp x3, x4, [x0, #16]
ldp x5, x6, [x0, #32]
ldp x7, x8, [x0, #48]
ldp x9, x10, [x0, #64]
ldp x11, x12, [x0, #80]
ldp x13, x14, [x0, #96]
ldp x15, x16, [x0, #112]
ldp x17, x19, [x0, #128]
ldp x20, x21, [x0, #144]
ldp x22, x23, [x0, #160]
ldp x24, x25, [x0, #176]
ldr x26, [x0, #192]
# Round counter
mov x28, #24
# Start of 24 rounds
L_SHA3_transform_blocksx3_neon_begin:
# Spill constant pointer and counter so x27 and x28 can serve as scratch
stp x27, x28, [x29, #48]
# Col Mix
# NEON side: v31, v27, v28, v29, v30 collect the XOR of columns 0-4
# Scalar side: x0, x30, x28 collect the XOR of columns 4, 0, 2
eor3 v31.16b, v0.16b, v5.16b, v10.16b
eor x0, x5, x10
eor3 v27.16b, v1.16b, v6.16b, v11.16b
eor x30, x1, x6
eor3 v28.16b, v2.16b, v7.16b, v12.16b
eor x28, x3, x8
eor3 v29.16b, v3.16b, v8.16b, v13.16b
eor x0, x0, x15
eor3 v30.16b, v4.16b, v9.16b, v14.16b
eor x30, x30, x11
eor3 v31.16b, v31.16b, v15.16b, v20.16b
eor x28, x28, x13
eor3 v27.16b, v27.16b, v16.16b, v21.16b
eor x0, x0, x21
eor3 v28.16b, v28.16b, v17.16b, v22.16b
eor x30, x30, x16
eor3 v29.16b, v29.16b, v18.16b, v23.16b
eor x28, x28, x19
eor3 v30.16b, v30.16b, v19.16b, v24.16b
eor x0, x0, x26
rax1 v25.2d, v30.2d, v27.2d
eor x30, x30, x22
rax1 v26.2d, v31.2d, v28.2d
eor x28, x28, x24
rax1 v27.2d, v27.2d, v29.2d
str x0, [x29, #32]
rax1 v28.2d, v28.2d, v30.2d
str x28, [x29, #24]
rax1 v29.2d, v29.2d, v31.2d
# Scalar side: column 4 and 2 parities are in the frame, x27 and x28
# now collect the XOR of columns 1 and 3
# NEON side: xar XORs a word with its column mask (v25-v29 for columns
# 0-4) and rotates it into its new register; v30 holds the future v10
eor x27, x2, x7
eor v0.16b, v0.16b, v25.16b
xar v30.2d, v1.2d, v26.2d, #63
eor x28, x4, x9
xar v1.2d, v6.2d, v26.2d, #20
eor x27, x27, x12
xar v6.2d, v9.2d, v29.2d, #44
eor x28, x28, x14
xar v9.2d, v22.2d, v27.2d, #3
eor x27, x27, x17
xar v22.2d, v14.2d, v29.2d, #25
eor x28, x28, x20
xar v14.2d, v20.2d, v25.2d, #46
eor x27, x27, x23
xar v20.2d, v2.2d, v27.2d, #2
eor x28, x28, x25
xar v2.2d, v12.2d, v27.2d, #21
# Scalar side: x0 becomes the mask for column 0 (parity 4 ^ rol(parity 1, 1))
# and x27 the mask for column 2 (parity 1 ^ rol(parity 3, 1))
eor x0, x0, x27, ror 63
xar v12.2d, v13.2d, v28.2d, #39
eor x27, x27, x28, ror 63
xar v13.2d, v19.2d, v29.2d, #56
eor x1, x1, x0
xar v19.2d, v23.2d, v28.2d, #8
eor x6, x6, x0
xar v23.2d, v15.2d, v25.2d, #23
eor x11, x11, x0
xar v15.2d, v4.2d, v29.2d, #37
eor x16, x16, x0
xar v4.2d, v24.2d, v29.2d, #50
eor x22, x22, x0
xar v24.2d, v21.2d, v26.2d, #62
eor x3, x3, x27
xar v21.2d, v8.2d, v28.2d, #9
eor x8, x8, x27
xar v8.2d, v16.2d, v26.2d, #19
eor x13, x13, x27
xar v16.2d, v5.2d, v25.2d, #28
eor x19, x19, x27
xar v5.2d, v3.2d, v28.2d, #36
eor x24, x24, x27
xar v3.2d, v18.2d, v28.2d, #43
# Scalar side: reload the spilled parities to build the masks for
# columns 4 (x28), 1 (x30) and 3 (x27)
ldr x0, [x29, #32]
xar v18.2d, v17.2d, v27.2d, #49
ldr x27, [x29, #24]
xar v17.2d, v11.2d, v26.2d, #54
eor x28, x28, x30, ror 63
xar v11.2d, v7.2d, v27.2d, #58
eor x30, x30, x27, ror 63
xar v7.2d, v10.2d, v25.2d, #61
eor x27, x27, x0, ror 63
# Row Mix
# NEON side: within each row of five words, bcax computes
# word[i] ^ (word[i+2] & ~word[i+1]); v25 and v26 keep copies of the
# first two words of a row until the last two outputs are formed
mov v25.16b, v0.16b
eor x5, x5, x28
mov v26.16b, v1.16b
eor x10, x10, x28
bcax v0.16b, v25.16b, v2.16b, v26.16b
eor x15, x15, x28
bcax v1.16b, v26.16b, v3.16b, v2.16b
eor x21, x21, x28
bcax v2.16b, v2.16b, v4.16b, v3.16b
eor x26, x26, x28
bcax v3.16b, v3.16b, v25.16b, v4.16b
eor x2, x2, x30
bcax v4.16b, v4.16b, v26.16b, v25.16b
eor x7, x7, x30
mov v25.16b, v5.16b
eor x12, x12, x30
mov v26.16b, v6.16b
eor x17, x17, x30
bcax v5.16b, v25.16b, v7.16b, v26.16b
eor x23, x23, x30
bcax v6.16b, v26.16b, v8.16b, v7.16b
eor x4, x4, x27
bcax v7.16b, v7.16b, v9.16b, v8.16b
eor x9, x9, x27
bcax v8.16b, v8.16b, v25.16b, v9.16b
eor x14, x14, x27
bcax v9.16b, v9.16b, v26.16b, v25.16b
eor x20, x20, x27
mov v26.16b, v11.16b
eor x25, x25, x27
# Swap Rotate Base
# Scalar side: each ror rotates a word and moves it to its new register;
# x0 carries the word that ends up in x11
bcax v10.16b, v30.16b, v12.16b, v26.16b
ror x0, x2, #63
bcax v11.16b, v26.16b, v13.16b, v12.16b
ror x2, x7, #20
bcax v12.16b, v12.16b, v14.16b, v13.16b
ror x7, x10, #44
bcax v13.16b, v13.16b, v30.16b, v14.16b
ror x10, x24, #3
bcax v14.16b, v14.16b, v26.16b, v30.16b
ror x24, x15, #25
mov v25.16b, v15.16b
ror x15, x22, #46
mov v26.16b, v16.16b
ror x22, x3, #2
bcax v15.16b, v25.16b, v17.16b, v26.16b
ror x3, x13, #21
bcax v16.16b, v26.16b, v18.16b, v17.16b
ror x13, x14, #39
bcax v17.16b, v17.16b, v19.16b, v18.16b
ror x14, x21, #56
bcax v18.16b, v18.16b, v25.16b, v19.16b
ror x21, x25, #8
bcax v19.16b, v19.16b, v26.16b, v25.16b
ror x25, x16, #23
mov v25.16b, v20.16b
ror x16, x5, #37
mov v26.16b, v21.16b
ror x5, x26, #50
bcax v20.16b, v25.16b, v22.16b, v26.16b
ror x26, x23, #62
bcax v21.16b, v26.16b, v23.16b, v22.16b
ror x23, x9, #9
bcax v22.16b, v22.16b, v24.16b, v23.16b
ror x9, x17, #19
bcax v23.16b, v23.16b, v25.16b, v24.16b
ror x17, x6, #28
bcax v24.16b, v24.16b, v26.16b, v25.16b
ror x6, x4, #36
ror x4, x20, #43
ror x20, x19, #49
ror x19, x12, #54
ror x12, x8, #58
ror x8, x11, #61
# Row Mix Base
# Scalar side: bic/eor pairs compute word[i] ^ (word[i+2] & ~word[i+1])
# for each row of state 2; x11, x27, x28, x30 and later x0 are temporaries
bic x11, x3, x2
bic x27, x4, x3
bic x28, x1, x5
bic x30, x2, x1
eor x1, x1, x11
eor x2, x2, x27
bic x11, x5, x4
eor x4, x4, x28
eor x3, x3, x11
eor x5, x5, x30
bic x11, x8, x7
bic x27, x9, x8
bic x28, x6, x10
bic x30, x7, x6
eor x6, x6, x11
eor x7, x7, x27
bic x11, x10, x9
eor x9, x9, x28
eor x8, x8, x11
eor x10, x10, x30
bic x11, x13, x12
bic x27, x14, x13
bic x28, x0, x15
bic x30, x12, x0
eor x11, x0, x11
eor x12, x12, x27
bic x0, x15, x14
eor x14, x14, x28
eor x13, x13, x0
eor x15, x15, x30
bic x0, x19, x17
bic x27, x20, x19
bic x28, x16, x21
bic x30, x17, x16
eor x16, x16, x0
eor x17, x17, x27
bic x0, x21, x20
eor x20, x20, x28
eor x19, x19, x0
eor x21, x21, x30
bic x0, x24, x23
bic x27, x25, x24
bic x28, x22, x26
bic x30, x23, x22
eor x22, x22, x0
eor x23, x23, x27
bic x0, x26, x25
eor x25, x25, x28
eor x24, x24, x0
eor x26, x26, x30
# Done transforming
# Reload constant pointer and counter, then XOR the next round constant
# into word 0 of all three states
ldp x27, x28, [x29, #48]
ldr x0, [x27], #8
subs x28, x28, #1
mov v30.d[0], x0
mov v30.d[1], x0
eor x1, x1, x0
eor v0.16b, v0.16b, v30.16b
bne L_SHA3_transform_blocksx3_neon_begin
# Write the three states back in the layout they were loaded from
ldr x0, [x29, #40]
st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
st1 {v24.d}[0], [x0]
add x0, x0, #8
st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
st1 {v24.d}[1], [x0]
add x0, x0, #8
stp x1, x2, [x0]
stp x3, x4, [x0, #16]
stp x5, x6, [x0, #32]
stp x7, x8, [x0, #48]
stp x9, x10, [x0, #64]
stp x11, x12, [x0, #80]
stp x13, x14, [x0, #96]
stp x15, x16, [x0, #112]
stp x17, x19, [x0, #128]
stp x20, x21, [x0, #144]
stp x22, x23, [x0, #160]
stp x24, x25, [x0, #176]
str x26, [x0, #192]
# Restore the saved registers and release the frame
ldp x17, x19, [x29, #72]
ldp x20, x21, [x29, #88]
ldp x22, x23, [x29, #104]
ldp x24, x25, [x29, #120]
ldp x26, x27, [x29, #136]
ldr x28, [x29, #152]
ldp d8, d9, [x29, #160]
ldp d10, d11, [x29, #176]
ldp d12, d13, [x29, #192]
ldp d14, d15, [x29, #208]
ldp x29, x30, [sp], #0xe0
ret
#ifndef __APPLE__
.size mlkem_sha3_blocksx3_neon,.-mlkem_sha3_blocksx3_neon
#endif /* __APPLE__ */
/* mlkem_shake128_blocksx3_seed_neon (variant assembled when
 * WOLFSSL_ARMASM_CRYPTO_SHA3 is defined)
 *
 * Builds three Keccak states that share one 32-byte seed, runs the
 * 24-round Keccak-f[1600] permutation on them and stores the results.
 *
 * In:  x0 = pointer to three consecutive states of 25 64-bit words
 *           (600 bytes). On entry only word 4 of each state is read
 *           (byte offsets 32, 232 and 432); all 600 bytes are written.
 *      x1 = pointer to 32 bytes that become words 0-3 of every state.
 *
 * Initial state: words 5-24 are zero except word 20, which is
 * 0x8000000000000000. That sets the top bit of byte 167, the last byte
 * of a 168-byte SHAKE128 rate block.
 * NOTE(review): word 4 presumably already holds the per-state trailing
 * input bytes and the SHAKE pad byte; confirm against the C caller.
 *
 * Register use inside the loop:
 *   v0-v24   lane 0 = state 0, lane 1 = state 1 (word i in v<i>)
 *   x2-x17, x19-x27   state 2, words 0-24 in that order (x18 unused)
 *   v25-v31, x0, x1, x28, x30   temporaries
 * NEON and scalar instructions are interleaved, so the section
 * comments mark phases only approximately.
 *
 * Frame (224 bytes, x29 = frame base):
 *   [24] [32]   spilled column parities of state 2
 *   [40]        state pointer
 *   [48] [56]   round constant pointer, rounds remaining
 *   [72..159]   saved x17, x19-x28
 *   [160..223]  saved d8-d15
 * x30 is overwritten as scratch and reloaded from the frame before ret.
 */
#ifndef __APPLE__
.text
.globl mlkem_shake128_blocksx3_seed_neon
.type mlkem_shake128_blocksx3_seed_neon,@function
.align 2
mlkem_shake128_blocksx3_seed_neon:
#else
.section __TEXT,__text
.globl _mlkem_shake128_blocksx3_seed_neon
.p2align 2
_mlkem_shake128_blocksx3_seed_neon:
#endif /* __APPLE__ */
# Reserve the frame and save the registers listed in the header
stp x29, x30, [sp, #-224]!
add x29, sp, #0
stp x17, x19, [x29, #72]
stp x20, x21, [x29, #88]
stp x22, x23, [x29, #104]
stp x24, x25, [x29, #120]
stp x26, x27, [x29, #136]
str x28, [x29, #152]
stp d8, d9, [x29, #160]
stp d10, d11, [x29, #176]
stp d12, d13, [x29, #192]
stp d14, d15, [x29, #208]
# Point x28 at the round constant table
#ifndef __APPLE__
adrp x28, L_sha3_aarch64_r
add x28, x28, :lo12:L_sha3_aarch64_r
#else
adrp x28, L_sha3_aarch64_r@PAGE
add x28, x28, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
# Keep the state pointer for the final store
str x0, [x29, #40]
# Read word 4 of states 0 and 1 into the lanes of v4, the seed into
# x2-x5 and word 4 of state 2 into x6
add x0, x0, #32
ld1 {v4.d}[0], [x0]
ldp x2, x3, [x1], #16
add x0, x0, #0xc8
ld1 {v4.d}[1], [x0]
ldp x4, x5, [x1], #16
ldr x6, [x0, #200]
# Clear words 5-24 of all three states; word 20 (x23, later v20) gets
# only its top bit set
eor v5.16b, v5.16b, v5.16b
eor x7, x7, x7
eor v6.16b, v6.16b, v6.16b
eor x8, x8, x8
eor v7.16b, v7.16b, v7.16b
eor x9, x9, x9
eor v8.16b, v8.16b, v8.16b
eor x10, x10, x10
eor v9.16b, v9.16b, v9.16b
eor x11, x11, x11
eor v10.16b, v10.16b, v10.16b
eor x12, x12, x12
eor v11.16b, v11.16b, v11.16b
eor x13, x13, x13
eor v12.16b, v12.16b, v12.16b
eor x14, x14, x14
eor v13.16b, v13.16b, v13.16b
eor x15, x15, x15
eor v14.16b, v14.16b, v14.16b
eor x16, x16, x16
eor v15.16b, v15.16b, v15.16b
eor x17, x17, x17
eor v16.16b, v16.16b, v16.16b
eor x19, x19, x19
eor v17.16b, v17.16b, v17.16b
eor x20, x20, x20
eor v18.16b, v18.16b, v18.16b
eor x21, x21, x21
eor v19.16b, v19.16b, v19.16b
eor x22, x22, x22
movz x23, #0x8000, lsl 48
eor v21.16b, v21.16b, v21.16b
eor x24, x24, x24
eor v22.16b, v22.16b, v22.16b
eor x25, x25, x25
eor v23.16b, v23.16b, v23.16b
eor x26, x26, x26
eor v24.16b, v24.16b, v24.16b
eor x27, x27, x27
# Broadcast the seed words and the top-bit word to both NEON lanes
dup v0.2d, x2
dup v1.2d, x3
dup v2.2d, x4
dup v3.2d, x5
dup v20.2d, x23
# Round counter; x1 is free now that the seed has been read
mov x1, #24
# Start of 24 rounds
L_SHA3_shake128_blocksx3_seed_neon_begin:
# Spill constant pointer and counter so x28 and x1 can serve as scratch
stp x28, x1, [x29, #48]
# Col Mix
# NEON side: v31, v27, v28, v29, v30 collect the XOR of columns 0-4
# Scalar side: x0, x30, x28 collect the XOR of columns 4, 0, 2
eor3 v31.16b, v0.16b, v5.16b, v10.16b
eor x0, x6, x11
eor3 v27.16b, v1.16b, v6.16b, v11.16b
eor x30, x2, x7
eor3 v28.16b, v2.16b, v7.16b, v12.16b
eor x28, x4, x9
eor3 v29.16b, v3.16b, v8.16b, v13.16b
eor x0, x0, x16
eor3 v30.16b, v4.16b, v9.16b, v14.16b
eor x30, x30, x12
eor3 v31.16b, v31.16b, v15.16b, v20.16b
eor x28, x28, x14
eor3 v27.16b, v27.16b, v16.16b, v21.16b
eor x0, x0, x22
eor3 v28.16b, v28.16b, v17.16b, v22.16b
eor x30, x30, x17
eor3 v29.16b, v29.16b, v18.16b, v23.16b
eor x28, x28, x20
eor3 v30.16b, v30.16b, v19.16b, v24.16b
eor x0, x0, x27
rax1 v25.2d, v30.2d, v27.2d
eor x30, x30, x23
rax1 v26.2d, v31.2d, v28.2d
eor x28, x28, x25
rax1 v27.2d, v27.2d, v29.2d
str x0, [x29, #32]
rax1 v28.2d, v28.2d, v30.2d
str x28, [x29, #24]
rax1 v29.2d, v29.2d, v31.2d
# Scalar side: column 4 and 2 parities are in the frame, x1 and x28
# now collect the XOR of columns 1 and 3
# NEON side: xar XORs a word with its column mask (v25-v29 for columns
# 0-4) and rotates it into its new register; v30 holds the future v10
eor x1, x3, x8
eor v0.16b, v0.16b, v25.16b
xar v30.2d, v1.2d, v26.2d, #63
eor x28, x5, x10
xar v1.2d, v6.2d, v26.2d, #20
eor x1, x1, x13
xar v6.2d, v9.2d, v29.2d, #44
eor x28, x28, x15
xar v9.2d, v22.2d, v27.2d, #3
eor x1, x1, x19
xar v22.2d, v14.2d, v29.2d, #25
eor x28, x28, x21
xar v14.2d, v20.2d, v25.2d, #46
eor x1, x1, x24
xar v20.2d, v2.2d, v27.2d, #2
eor x28, x28, x26
xar v2.2d, v12.2d, v27.2d, #21
# Scalar side: x0 becomes the mask for column 0 (parity 4 ^ rol(parity 1, 1))
# and x1 the mask for column 2 (parity 1 ^ rol(parity 3, 1))
eor x0, x0, x1, ror 63
xar v12.2d, v13.2d, v28.2d, #39
eor x1, x1, x28, ror 63
xar v13.2d, v19.2d, v29.2d, #56
eor x2, x2, x0
xar v19.2d, v23.2d, v28.2d, #8
eor x7, x7, x0
xar v23.2d, v15.2d, v25.2d, #23
eor x12, x12, x0
xar v15.2d, v4.2d, v29.2d, #37
eor x17, x17, x0
xar v4.2d, v24.2d, v29.2d, #50
eor x23, x23, x0
xar v24.2d, v21.2d, v26.2d, #62
eor x4, x4, x1
xar v21.2d, v8.2d, v28.2d, #9
eor x9, x9, x1
xar v8.2d, v16.2d, v26.2d, #19
eor x14, x14, x1
xar v16.2d, v5.2d, v25.2d, #28
eor x20, x20, x1
xar v5.2d, v3.2d, v28.2d, #36
eor x25, x25, x1
xar v3.2d, v18.2d, v28.2d, #43
# Scalar side: reload the spilled parities to build the masks for
# columns 4 (x28), 1 (x30) and 3 (x1)
ldr x0, [x29, #32]
xar v18.2d, v17.2d, v27.2d, #49
ldr x1, [x29, #24]
xar v17.2d, v11.2d, v26.2d, #54
eor x28, x28, x30, ror 63
xar v11.2d, v7.2d, v27.2d, #58
eor x30, x30, x1, ror 63
xar v7.2d, v10.2d, v25.2d, #61
eor x1, x1, x0, ror 63
# Row Mix
# NEON side: within each row of five words, bcax computes
# word[i] ^ (word[i+2] & ~word[i+1]); v25 and v26 keep copies of the
# first two words of a row until the last two outputs are formed
mov v25.16b, v0.16b
eor x6, x6, x28
mov v26.16b, v1.16b
eor x11, x11, x28
bcax v0.16b, v25.16b, v2.16b, v26.16b
eor x16, x16, x28
bcax v1.16b, v26.16b, v3.16b, v2.16b
eor x22, x22, x28
bcax v2.16b, v2.16b, v4.16b, v3.16b
eor x27, x27, x28
bcax v3.16b, v3.16b, v25.16b, v4.16b
eor x3, x3, x30
bcax v4.16b, v4.16b, v26.16b, v25.16b
eor x8, x8, x30
mov v25.16b, v5.16b
eor x13, x13, x30
mov v26.16b, v6.16b
eor x19, x19, x30
bcax v5.16b, v25.16b, v7.16b, v26.16b
eor x24, x24, x30
bcax v6.16b, v26.16b, v8.16b, v7.16b
eor x5, x5, x1
bcax v7.16b, v7.16b, v9.16b, v8.16b
eor x10, x10, x1
bcax v8.16b, v8.16b, v25.16b, v9.16b
eor x15, x15, x1
bcax v9.16b, v9.16b, v26.16b, v25.16b
eor x21, x21, x1
mov v26.16b, v11.16b
eor x26, x26, x1
# Swap Rotate Base
# Scalar side: each ror rotates a word and moves it to its new register;
# x0 carries the word that ends up in x12
bcax v10.16b, v30.16b, v12.16b, v26.16b
ror x0, x3, #63
bcax v11.16b, v26.16b, v13.16b, v12.16b
ror x3, x8, #20
bcax v12.16b, v12.16b, v14.16b, v13.16b
ror x8, x11, #44
bcax v13.16b, v13.16b, v30.16b, v14.16b
ror x11, x25, #3
bcax v14.16b, v14.16b, v26.16b, v30.16b
ror x25, x16, #25
mov v25.16b, v15.16b
ror x16, x23, #46
mov v26.16b, v16.16b
ror x23, x4, #2
bcax v15.16b, v25.16b, v17.16b, v26.16b
ror x4, x14, #21
bcax v16.16b, v26.16b, v18.16b, v17.16b
ror x14, x15, #39
bcax v17.16b, v17.16b, v19.16b, v18.16b
ror x15, x22, #56
bcax v18.16b, v18.16b, v25.16b, v19.16b
ror x22, x26, #8
bcax v19.16b, v19.16b, v26.16b, v25.16b
ror x26, x17, #23
mov v25.16b, v20.16b
ror x17, x6, #37
mov v26.16b, v21.16b
ror x6, x27, #50
bcax v20.16b, v25.16b, v22.16b, v26.16b
ror x27, x24, #62
bcax v21.16b, v26.16b, v23.16b, v22.16b
ror x24, x10, #9
bcax v22.16b, v22.16b, v24.16b, v23.16b
ror x10, x19, #19
bcax v23.16b, v23.16b, v25.16b, v24.16b
ror x19, x7, #28
bcax v24.16b, v24.16b, v26.16b, v25.16b
ror x7, x5, #36
ror x5, x21, #43
ror x21, x20, #49
ror x20, x13, #54
ror x13, x9, #58
ror x9, x12, #61
# Row Mix Base
# Scalar side: bic/eor pairs compute word[i] ^ (word[i+2] & ~word[i+1])
# for each row of state 2; x12, x1, x28, x30 and later x0 are temporaries
bic x12, x4, x3
bic x1, x5, x4
bic x28, x2, x6
bic x30, x3, x2
eor x2, x2, x12
eor x3, x3, x1
bic x12, x6, x5
eor x5, x5, x28
eor x4, x4, x12
eor x6, x6, x30
bic x12, x9, x8
bic x1, x10, x9
bic x28, x7, x11
bic x30, x8, x7
eor x7, x7, x12
eor x8, x8, x1
bic x12, x11, x10
eor x10, x10, x28
eor x9, x9, x12
eor x11, x11, x30
bic x12, x14, x13
bic x1, x15, x14
bic x28, x0, x16
bic x30, x13, x0
eor x12, x0, x12
eor x13, x13, x1
bic x0, x16, x15
eor x15, x15, x28
eor x14, x14, x0
eor x16, x16, x30
bic x0, x20, x19
bic x1, x21, x20
bic x28, x17, x22
bic x30, x19, x17
eor x17, x17, x0
eor x19, x19, x1
bic x0, x22, x21
eor x21, x21, x28
eor x20, x20, x0
eor x22, x22, x30
bic x0, x25, x24
bic x1, x26, x25
bic x28, x23, x27
bic x30, x24, x23
eor x23, x23, x0
eor x24, x24, x1
bic x0, x27, x26
eor x26, x26, x28
eor x25, x25, x0
eor x27, x27, x30
# Done transforming
# Reload constant pointer and counter, then XOR the next round constant
# into word 0 of all three states
ldp x28, x1, [x29, #48]
ldr x0, [x28], #8
subs x1, x1, #1
mov v30.d[0], x0
mov v30.d[1], x0
eor x2, x2, x0
eor v0.16b, v0.16b, v30.16b
bne L_SHA3_shake128_blocksx3_seed_neon_begin
# Store state 0 (lane 0), state 1 (lane 1) and state 2 (scalar) in turn
ldr x0, [x29, #40]
st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
st1 {v24.d}[0], [x0]
add x0, x0, #8
st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
st1 {v24.d}[1], [x0]
add x0, x0, #8
stp x2, x3, [x0]
stp x4, x5, [x0, #16]
stp x6, x7, [x0, #32]
stp x8, x9, [x0, #48]
stp x10, x11, [x0, #64]
stp x12, x13, [x0, #80]
stp x14, x15, [x0, #96]
stp x16, x17, [x0, #112]
stp x19, x20, [x0, #128]
stp x21, x22, [x0, #144]
stp x23, x24, [x0, #160]
stp x25, x26, [x0, #176]
str x27, [x0, #192]
# Restore the saved registers and release the frame
ldp x17, x19, [x29, #72]
ldp x20, x21, [x29, #88]
ldp x22, x23, [x29, #104]
ldp x24, x25, [x29, #120]
ldp x26, x27, [x29, #136]
ldr x28, [x29, #152]
ldp d8, d9, [x29, #160]
ldp d10, d11, [x29, #176]
ldp d12, d13, [x29, #192]
ldp d14, d15, [x29, #208]
ldp x29, x30, [sp], #0xe0
ret
#ifndef __APPLE__
.size mlkem_shake128_blocksx3_seed_neon,.-mlkem_shake128_blocksx3_seed_neon
#endif /* __APPLE__ */
/* mlkem_shake256_blocksx3_seed_neon (variant assembled when
 * WOLFSSL_ARMASM_CRYPTO_SHA3 is defined)
 *
 * Builds three Keccak states that share one 32-byte seed, runs the
 * 24-round Keccak-f[1600] permutation on them and stores the results.
 *
 * In:  x0 = pointer to three consecutive states of 25 64-bit words
 *           (600 bytes). On entry only word 4 of each state is read
 *           (byte offsets 32, 232 and 432); all 600 bytes are written.
 *      x1 = pointer to 32 bytes that become words 0-3 of every state.
 *
 * Initial state: words 5-24 are zero except word 16, which is
 * 0x8000000000000000. That sets the top bit of byte 135, the last byte
 * of a 136-byte SHAKE256 rate block.
 * NOTE(review): word 4 presumably already holds the per-state trailing
 * input bytes and the SHAKE pad byte; confirm against the C caller.
 *
 * Register use inside the loop:
 *   v0-v24   lane 0 = state 0, lane 1 = state 1 (word i in v<i>)
 *   x2-x17, x19-x27   state 2, words 0-24 in that order (x18 unused)
 *   v25-v31, x0, x1, x28, x30   temporaries
 * NEON and scalar instructions are interleaved, so the section
 * comments mark phases only approximately.
 *
 * Frame (224 bytes, x29 = frame base):
 *   [24] [32]   spilled column parities of state 2
 *   [40]        state pointer
 *   [48] [56]   round constant pointer, rounds remaining
 *   [72..159]   saved x17, x19-x28
 *   [160..223]  saved d8-d15
 * x30 is overwritten as scratch and reloaded from the frame before ret.
 */
#ifndef __APPLE__
.text
.globl mlkem_shake256_blocksx3_seed_neon
.type mlkem_shake256_blocksx3_seed_neon,@function
.align 2
mlkem_shake256_blocksx3_seed_neon:
#else
.section __TEXT,__text
.globl _mlkem_shake256_blocksx3_seed_neon
.p2align 2
_mlkem_shake256_blocksx3_seed_neon:
#endif /* __APPLE__ */
# Reserve the frame and save the registers listed in the header
stp x29, x30, [sp, #-224]!
add x29, sp, #0
stp x17, x19, [x29, #72]
stp x20, x21, [x29, #88]
stp x22, x23, [x29, #104]
stp x24, x25, [x29, #120]
stp x26, x27, [x29, #136]
str x28, [x29, #152]
stp d8, d9, [x29, #160]
stp d10, d11, [x29, #176]
stp d12, d13, [x29, #192]
stp d14, d15, [x29, #208]
# Point x28 at the round constant table
#ifndef __APPLE__
adrp x28, L_sha3_aarch64_r
add x28, x28, :lo12:L_sha3_aarch64_r
#else
adrp x28, L_sha3_aarch64_r@PAGE
add x28, x28, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
# Keep the state pointer for the final store
str x0, [x29, #40]
# Read word 4 of states 0 and 1 into the lanes of v4, the seed into
# x2-x5 and word 4 of state 2 into x6
add x0, x0, #32
ld1 {v4.d}[0], [x0]
ldp x2, x3, [x1], #16
add x0, x0, #0xc8
ld1 {v4.d}[1], [x0]
ldp x4, x5, [x1], #16
ldr x6, [x0, #200]
# Clear words 5-24 of all three states; word 16 (x19, later v16) gets
# only its top bit set
eor v5.16b, v5.16b, v5.16b
eor x7, x7, x7
eor v6.16b, v6.16b, v6.16b
eor x8, x8, x8
eor v7.16b, v7.16b, v7.16b
eor x9, x9, x9
eor v8.16b, v8.16b, v8.16b
eor x10, x10, x10
eor v9.16b, v9.16b, v9.16b
eor x11, x11, x11
eor v10.16b, v10.16b, v10.16b
eor x12, x12, x12
eor v11.16b, v11.16b, v11.16b
eor x13, x13, x13
eor v12.16b, v12.16b, v12.16b
eor x14, x14, x14
eor v13.16b, v13.16b, v13.16b
eor x15, x15, x15
eor v14.16b, v14.16b, v14.16b
eor x16, x16, x16
eor v15.16b, v15.16b, v15.16b
eor x17, x17, x17
movz x19, #0x8000, lsl 48
eor v17.16b, v17.16b, v17.16b
eor x20, x20, x20
eor v18.16b, v18.16b, v18.16b
eor x21, x21, x21
eor v19.16b, v19.16b, v19.16b
eor x22, x22, x22
eor v20.16b, v20.16b, v20.16b
eor x23, x23, x23
eor v21.16b, v21.16b, v21.16b
eor x24, x24, x24
eor v22.16b, v22.16b, v22.16b
eor x25, x25, x25
eor v23.16b, v23.16b, v23.16b
eor x26, x26, x26
eor v24.16b, v24.16b, v24.16b
eor x27, x27, x27
# Broadcast the seed words and the top-bit word to both NEON lanes
dup v0.2d, x2
dup v1.2d, x3
dup v2.2d, x4
dup v3.2d, x5
dup v16.2d, x19
# Round counter; x1 is free now that the seed has been read
mov x1, #24
# Start of 24 rounds
L_SHA3_shake256_blocksx3_seed_neon_begin:
# Spill constant pointer and counter so x28 and x1 can serve as scratch
stp x28, x1, [x29, #48]
# Col Mix
# NEON side: v31, v27, v28, v29, v30 collect the XOR of columns 0-4
# Scalar side: x0, x30, x28 collect the XOR of columns 4, 0, 2
eor3 v31.16b, v0.16b, v5.16b, v10.16b
eor x0, x6, x11
eor3 v27.16b, v1.16b, v6.16b, v11.16b
eor x30, x2, x7
eor3 v28.16b, v2.16b, v7.16b, v12.16b
eor x28, x4, x9
eor3 v29.16b, v3.16b, v8.16b, v13.16b
eor x0, x0, x16
eor3 v30.16b, v4.16b, v9.16b, v14.16b
eor x30, x30, x12
eor3 v31.16b, v31.16b, v15.16b, v20.16b
eor x28, x28, x14
eor3 v27.16b, v27.16b, v16.16b, v21.16b
eor x0, x0, x22
eor3 v28.16b, v28.16b, v17.16b, v22.16b
eor x30, x30, x17
eor3 v29.16b, v29.16b, v18.16b, v23.16b
eor x28, x28, x20
eor3 v30.16b, v30.16b, v19.16b, v24.16b
eor x0, x0, x27
rax1 v25.2d, v30.2d, v27.2d
eor x30, x30, x23
rax1 v26.2d, v31.2d, v28.2d
eor x28, x28, x25
rax1 v27.2d, v27.2d, v29.2d
str x0, [x29, #32]
rax1 v28.2d, v28.2d, v30.2d
str x28, [x29, #24]
rax1 v29.2d, v29.2d, v31.2d
# Scalar side: column 4 and 2 parities are in the frame, x1 and x28
# now collect the XOR of columns 1 and 3
# NEON side: xar XORs a word with its column mask (v25-v29 for columns
# 0-4) and rotates it into its new register; v30 holds the future v10
eor x1, x3, x8
eor v0.16b, v0.16b, v25.16b
xar v30.2d, v1.2d, v26.2d, #63
eor x28, x5, x10
xar v1.2d, v6.2d, v26.2d, #20
eor x1, x1, x13
xar v6.2d, v9.2d, v29.2d, #44
eor x28, x28, x15
xar v9.2d, v22.2d, v27.2d, #3
eor x1, x1, x19
xar v22.2d, v14.2d, v29.2d, #25
eor x28, x28, x21
xar v14.2d, v20.2d, v25.2d, #46
eor x1, x1, x24
xar v20.2d, v2.2d, v27.2d, #2
eor x28, x28, x26
xar v2.2d, v12.2d, v27.2d, #21
# Scalar side: x0 becomes the mask for column 0 (parity 4 ^ rol(parity 1, 1))
# and x1 the mask for column 2 (parity 1 ^ rol(parity 3, 1))
eor x0, x0, x1, ror 63
xar v12.2d, v13.2d, v28.2d, #39
eor x1, x1, x28, ror 63
xar v13.2d, v19.2d, v29.2d, #56
eor x2, x2, x0
xar v19.2d, v23.2d, v28.2d, #8
eor x7, x7, x0
xar v23.2d, v15.2d, v25.2d, #23
eor x12, x12, x0
xar v15.2d, v4.2d, v29.2d, #37
eor x17, x17, x0
xar v4.2d, v24.2d, v29.2d, #50
eor x23, x23, x0
xar v24.2d, v21.2d, v26.2d, #62
eor x4, x4, x1
xar v21.2d, v8.2d, v28.2d, #9
eor x9, x9, x1
xar v8.2d, v16.2d, v26.2d, #19
eor x14, x14, x1
xar v16.2d, v5.2d, v25.2d, #28
eor x20, x20, x1
xar v5.2d, v3.2d, v28.2d, #36
eor x25, x25, x1
xar v3.2d, v18.2d, v28.2d, #43
# Scalar side: reload the spilled parities to build the masks for
# columns 4 (x28), 1 (x30) and 3 (x1)
ldr x0, [x29, #32]
xar v18.2d, v17.2d, v27.2d, #49
ldr x1, [x29, #24]
xar v17.2d, v11.2d, v26.2d, #54
eor x28, x28, x30, ror 63
xar v11.2d, v7.2d, v27.2d, #58
eor x30, x30, x1, ror 63
xar v7.2d, v10.2d, v25.2d, #61
eor x1, x1, x0, ror 63
# Row Mix
# NEON side: within each row of five words, bcax computes
# word[i] ^ (word[i+2] & ~word[i+1]); v25 and v26 keep copies of the
# first two words of a row until the last two outputs are formed
mov v25.16b, v0.16b
eor x6, x6, x28
mov v26.16b, v1.16b
eor x11, x11, x28
bcax v0.16b, v25.16b, v2.16b, v26.16b
eor x16, x16, x28
bcax v1.16b, v26.16b, v3.16b, v2.16b
eor x22, x22, x28
bcax v2.16b, v2.16b, v4.16b, v3.16b
eor x27, x27, x28
bcax v3.16b, v3.16b, v25.16b, v4.16b
eor x3, x3, x30
bcax v4.16b, v4.16b, v26.16b, v25.16b
eor x8, x8, x30
mov v25.16b, v5.16b
eor x13, x13, x30
mov v26.16b, v6.16b
eor x19, x19, x30
bcax v5.16b, v25.16b, v7.16b, v26.16b
eor x24, x24, x30
bcax v6.16b, v26.16b, v8.16b, v7.16b
eor x5, x5, x1
bcax v7.16b, v7.16b, v9.16b, v8.16b
eor x10, x10, x1
bcax v8.16b, v8.16b, v25.16b, v9.16b
eor x15, x15, x1
bcax v9.16b, v9.16b, v26.16b, v25.16b
eor x21, x21, x1
mov v26.16b, v11.16b
eor x26, x26, x1
# Swap Rotate Base
# Scalar side: each ror rotates a word and moves it to its new register;
# x0 carries the word that ends up in x12
bcax v10.16b, v30.16b, v12.16b, v26.16b
ror x0, x3, #63
bcax v11.16b, v26.16b, v13.16b, v12.16b
ror x3, x8, #20
bcax v12.16b, v12.16b, v14.16b, v13.16b
ror x8, x11, #44
bcax v13.16b, v13.16b, v30.16b, v14.16b
ror x11, x25, #3
bcax v14.16b, v14.16b, v26.16b, v30.16b
ror x25, x16, #25
mov v25.16b, v15.16b
ror x16, x23, #46
mov v26.16b, v16.16b
ror x23, x4, #2
bcax v15.16b, v25.16b, v17.16b, v26.16b
ror x4, x14, #21
bcax v16.16b, v26.16b, v18.16b, v17.16b
ror x14, x15, #39
bcax v17.16b, v17.16b, v19.16b, v18.16b
ror x15, x22, #56
bcax v18.16b, v18.16b, v25.16b, v19.16b
ror x22, x26, #8
bcax v19.16b, v19.16b, v26.16b, v25.16b
ror x26, x17, #23
mov v25.16b, v20.16b
ror x17, x6, #37
mov v26.16b, v21.16b
ror x6, x27, #50
bcax v20.16b, v25.16b, v22.16b, v26.16b
ror x27, x24, #62
bcax v21.16b, v26.16b, v23.16b, v22.16b
ror x24, x10, #9
bcax v22.16b, v22.16b, v24.16b, v23.16b
ror x10, x19, #19
bcax v23.16b, v23.16b, v25.16b, v24.16b
ror x19, x7, #28
bcax v24.16b, v24.16b, v26.16b, v25.16b
ror x7, x5, #36
ror x5, x21, #43
ror x21, x20, #49
ror x20, x13, #54
ror x13, x9, #58
ror x9, x12, #61
# Row Mix Base
# Scalar side: bic/eor pairs compute word[i] ^ (word[i+2] & ~word[i+1])
# for each row of state 2; x12, x1, x28, x30 and later x0 are temporaries
bic x12, x4, x3
bic x1, x5, x4
bic x28, x2, x6
bic x30, x3, x2
eor x2, x2, x12
eor x3, x3, x1
bic x12, x6, x5
eor x5, x5, x28
eor x4, x4, x12
eor x6, x6, x30
bic x12, x9, x8
bic x1, x10, x9
bic x28, x7, x11
bic x30, x8, x7
eor x7, x7, x12
eor x8, x8, x1
bic x12, x11, x10
eor x10, x10, x28
eor x9, x9, x12
eor x11, x11, x30
bic x12, x14, x13
bic x1, x15, x14
bic x28, x0, x16
bic x30, x13, x0
eor x12, x0, x12
eor x13, x13, x1
bic x0, x16, x15
eor x15, x15, x28
eor x14, x14, x0
eor x16, x16, x30
bic x0, x20, x19
bic x1, x21, x20
bic x28, x17, x22
bic x30, x19, x17
eor x17, x17, x0
eor x19, x19, x1
bic x0, x22, x21
eor x21, x21, x28
eor x20, x20, x0
eor x22, x22, x30
bic x0, x25, x24
bic x1, x26, x25
bic x28, x23, x27
bic x30, x24, x23
eor x23, x23, x0
eor x24, x24, x1
bic x0, x27, x26
eor x26, x26, x28
eor x25, x25, x0
eor x27, x27, x30
# Done transforming
# Reload constant pointer and counter, then XOR the next round constant
# into word 0 of all three states
ldp x28, x1, [x29, #48]
ldr x0, [x28], #8
subs x1, x1, #1
mov v30.d[0], x0
mov v30.d[1], x0
eor x2, x2, x0
eor v0.16b, v0.16b, v30.16b
bne L_SHA3_shake256_blocksx3_seed_neon_begin
# Store state 0 (lane 0), state 1 (lane 1) and state 2 (scalar) in turn
ldr x0, [x29, #40]
st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
st1 {v24.d}[0], [x0]
add x0, x0, #8
st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
st1 {v24.d}[1], [x0]
add x0, x0, #8
stp x2, x3, [x0]
stp x4, x5, [x0, #16]
stp x6, x7, [x0, #32]
stp x8, x9, [x0, #48]
stp x10, x11, [x0, #64]
stp x12, x13, [x0, #80]
stp x14, x15, [x0, #96]
stp x16, x17, [x0, #112]
stp x19, x20, [x0, #128]
stp x21, x22, [x0, #144]
stp x23, x24, [x0, #160]
stp x25, x26, [x0, #176]
str x27, [x0, #192]
# Restore the saved registers and release the frame
ldp x17, x19, [x29, #72]
ldp x20, x21, [x29, #88]
ldp x22, x23, [x29, #104]
ldp x24, x25, [x29, #120]
ldp x26, x27, [x29, #136]
ldr x28, [x29, #152]
ldp d8, d9, [x29, #160]
ldp d10, d11, [x29, #176]
ldp d12, d13, [x29, #192]
ldp d14, d15, [x29, #208]
ldp x29, x30, [sp], #0xe0
ret
#ifndef __APPLE__
.size mlkem_shake256_blocksx3_seed_neon,.-mlkem_shake256_blocksx3_seed_neon
#endif /* __APPLE__ */
#else
/* mlkem_sha3_blocksx3_neon (variant assembled when
 * WOLFSSL_ARMASM_CRYPTO_SHA3 is not defined)
 *
 * Runs the 24-round Keccak-f[1600] permutation in place on three
 * independent states at once, using only base NEON instructions
 * (eor, bic, ushr, sli) for the vector part.
 *
 * In:  x0 = pointer to 600 bytes holding three consecutive states of
 *           25 64-bit words each. The same memory receives the result.
 *
 * Register use inside the loop:
 *   v0-v24   lane 0 = state 0, lane 1 = state 1 (word i in v<i>)
 *   x1-x17, x19-x26   state 2, words 0-24 in that order (x18 unused)
 *   v25-v31, x0, x27, x28, x30   temporaries
 * NEON and scalar instructions are interleaved, so the section
 * comments mark phases only approximately.
 *
 * Frame (224 bytes, x29 = frame base):
 *   [24] [32]   spilled column parities of state 2
 *   [40]        state pointer
 *   [48] [56]   round constant pointer, rounds remaining
 *   [72..159]   saved x17, x19-x28
 *   [160..223]  saved d8-d15
 * x30 is overwritten as scratch and reloaded from the frame before ret.
 */
#ifndef __APPLE__
.text
.globl mlkem_sha3_blocksx3_neon
.type mlkem_sha3_blocksx3_neon,@function
.align 2
mlkem_sha3_blocksx3_neon:
#else
.section __TEXT,__text
.globl _mlkem_sha3_blocksx3_neon
.p2align 2
_mlkem_sha3_blocksx3_neon:
#endif /* __APPLE__ */
# Reserve the frame and save the registers listed in the header
stp x29, x30, [sp, #-224]!
add x29, sp, #0
stp x17, x19, [x29, #72]
stp x20, x21, [x29, #88]
stp x22, x23, [x29, #104]
stp x24, x25, [x29, #120]
stp x26, x27, [x29, #136]
str x28, [x29, #152]
stp d8, d9, [x29, #160]
stp d10, d11, [x29, #176]
stp d12, d13, [x29, #192]
stp d14, d15, [x29, #208]
# Point x27 at the round constant table
#ifndef __APPLE__
adrp x27, L_sha3_aarch64_r
add x27, x27, :lo12:L_sha3_aarch64_r
#else
adrp x27, L_sha3_aarch64_r@PAGE
add x27, x27, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
# Keep the state pointer for the final store
str x0, [x29, #40]
# State 0 goes to lane 0 of v0-v24
ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
ld1 {v24.d}[0], [x0]
add x0, x0, #8
# State 1 goes to lane 1 of v0-v24
ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
ld4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
ld4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
ld4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
ld4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
ld4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
ld1 {v24.d}[1], [x0]
add x0, x0, #8
# State 2 goes to x1-x17 and x19-x26
ldp x1, x2, [x0]
ldp x3, x4, [x0, #16]
ldp x5, x6, [x0, #32]
ldp x7, x8, [x0, #48]
ldp x9, x10, [x0, #64]
ldp x11, x12, [x0, #80]
ldp x13, x14, [x0, #96]
ldp x15, x16, [x0, #112]
ldp x17, x19, [x0, #128]
ldp x20, x21, [x0, #144]
ldp x22, x23, [x0, #160]
ldp x24, x25, [x0, #176]
ldr x26, [x0, #192]
# Round counter
mov x28, #24
# Start of 24 rounds
L_SHA3_transform_blocksx3_neon_begin:
# Spill constant pointer and counter so x27 and x28 can serve as scratch
stp x27, x28, [x29, #48]
# Col Mix NEON
# NEON side: XOR each column of five words into a parity, then combine
# parities into the column masks v25-v29 (columns 0-4); every
# ushr #(64-n) / sli #n pair is a 64-bit rotate left by n
# Scalar side: x0, x30, x28 collect the XOR of columns 4, 0, 2
eor v30.16b, v4.16b, v9.16b
eor x0, x5, x10
eor v27.16b, v1.16b, v6.16b
eor x30, x1, x6
eor v30.16b, v30.16b, v14.16b
eor x28, x3, x8
eor v27.16b, v27.16b, v11.16b
eor x0, x0, x15
eor v30.16b, v30.16b, v19.16b
eor x30, x30, x11
eor v27.16b, v27.16b, v16.16b
eor x28, x28, x13
eor v30.16b, v30.16b, v24.16b
eor x0, x0, x21
eor v27.16b, v27.16b, v21.16b
eor x30, x30, x16
ushr v25.2d, v27.2d, #63
eor x28, x28, x19
sli v25.2d, v27.2d, #1
eor x0, x0, x26
eor v25.16b, v25.16b, v30.16b
eor x30, x30, x22
eor v31.16b, v0.16b, v5.16b
eor x28, x28, x24
eor v28.16b, v2.16b, v7.16b
str x0, [x29, #32]
eor v31.16b, v31.16b, v10.16b
str x28, [x29, #24]
eor v28.16b, v28.16b, v12.16b
# Scalar side: column 4 and 2 parities are in the frame, x27 and x28
# now collect the XOR of columns 1 and 3
eor x27, x2, x7
eor v31.16b, v31.16b, v15.16b
eor x28, x4, x9
eor v28.16b, v28.16b, v17.16b
eor x27, x27, x12
eor v31.16b, v31.16b, v20.16b
eor x28, x28, x14
eor v28.16b, v28.16b, v22.16b
eor x27, x27, x17
ushr v29.2d, v30.2d, #63
eor x28, x28, x20
ushr v26.2d, v28.2d, #63
eor x27, x27, x23
sli v29.2d, v30.2d, #1
eor x28, x28, x25
sli v26.2d, v28.2d, #1
# Scalar side: x0 becomes the mask for column 0 (parity 4 ^ rol(parity 1, 1))
# and x27 the mask for column 2 (parity 1 ^ rol(parity 3, 1))
eor x0, x0, x27, ror 63
eor v28.16b, v28.16b, v29.16b
eor x27, x27, x28, ror 63
eor v29.16b, v3.16b, v8.16b
eor x1, x1, x0
eor v26.16b, v26.16b, v31.16b
eor x6, x6, x0
eor v29.16b, v29.16b, v13.16b
eor x11, x11, x0
eor v29.16b, v29.16b, v18.16b
eor x16, x16, x0
eor v29.16b, v29.16b, v23.16b
eor x22, x22, x0
ushr v30.2d, v29.2d, #63
eor x3, x3, x27
sli v30.2d, v29.2d, #1
eor x8, x8, x27
eor v27.16b, v27.16b, v30.16b
eor x13, x13, x27
ushr v30.2d, v31.2d, #63
eor x19, x19, x27
sli v30.2d, v31.2d, #1
eor x24, x24, x27
eor v29.16b, v29.16b, v30.16b
# Scalar side: reload the spilled parities to build the masks for
# columns 4 (x28), 1 (x30) and 3 (x27)
ldr x0, [x29, #32]
# Swap Rotate NEON
# NEON side: XOR each word with its column mask, then rotate it with a
# ushr/sli pair into its new register; v31 is the staging register and
# v30 holds the future v10
eor v0.16b, v0.16b, v25.16b
eor v31.16b, v1.16b, v26.16b
ldr x27, [x29, #24]
eor v6.16b, v6.16b, v26.16b
eor x28, x28, x30, ror 63
ushr v30.2d, v31.2d, #63
eor x30, x30, x27, ror 63
ushr v1.2d, v6.2d, #20
eor x27, x27, x0, ror 63
sli v30.2d, v31.2d, #1
eor x5, x5, x28
sli v1.2d, v6.2d, #44
eor x10, x10, x28
eor v31.16b, v9.16b, v29.16b
eor x15, x15, x28
eor v22.16b, v22.16b, v27.16b
eor x21, x21, x28
ushr v6.2d, v31.2d, #44
eor x26, x26, x28
ushr v9.2d, v22.2d, #3
eor x2, x2, x30
sli v6.2d, v31.2d, #20
eor x7, x7, x30
sli v9.2d, v22.2d, #61
eor x12, x12, x30
eor v31.16b, v14.16b, v29.16b
eor x17, x17, x30
eor v20.16b, v20.16b, v25.16b
eor x23, x23, x30
ushr v22.2d, v31.2d, #25
eor x4, x4, x27
ushr v14.2d, v20.2d, #46
eor x9, x9, x27
sli v22.2d, v31.2d, #39
eor x14, x14, x27
sli v14.2d, v20.2d, #18
eor x20, x20, x27
eor v31.16b, v2.16b, v27.16b
eor x25, x25, x27
# Swap Rotate Base
# Scalar side: each ror rotates a word and moves it to its new register;
# x0 carries the word that ends up in x11
eor v12.16b, v12.16b, v27.16b
ror x0, x2, #63
ushr v20.2d, v31.2d, #2
ror x2, x7, #20
ushr v2.2d, v12.2d, #21
ror x7, x10, #44
sli v20.2d, v31.2d, #62
ror x10, x24, #3
sli v2.2d, v12.2d, #43
ror x24, x15, #25
eor v31.16b, v13.16b, v28.16b
ror x15, x22, #46
eor v19.16b, v19.16b, v29.16b
ror x22, x3, #2
ushr v12.2d, v31.2d, #39
ror x3, x13, #21
ushr v13.2d, v19.2d, #56
ror x13, x14, #39
sli v12.2d, v31.2d, #25
ror x14, x21, #56
sli v13.2d, v19.2d, #8
ror x21, x25, #8
eor v31.16b, v23.16b, v28.16b
ror x25, x16, #23
eor v15.16b, v15.16b, v25.16b
ror x16, x5, #37
ushr v19.2d, v31.2d, #8
ror x5, x26, #50
ushr v23.2d, v15.2d, #23
ror x26, x23, #62
sli v19.2d, v31.2d, #56
ror x23, x9, #9
sli v23.2d, v15.2d, #41
ror x9, x17, #19
eor v31.16b, v4.16b, v29.16b
ror x17, x6, #28
eor v24.16b, v24.16b, v29.16b
ror x6, x4, #36
ushr v15.2d, v31.2d, #37
ror x4, x20, #43
ushr v4.2d, v24.2d, #50
ror x20, x19, #49
sli v15.2d, v31.2d, #27
ror x19, x12, #54
sli v4.2d, v24.2d, #14
ror x12, x8, #58
eor v31.16b, v21.16b, v26.16b
ror x8, x11, #61
# Row Mix Base
# Scalar side: bic/eor pairs compute word[i] ^ (word[i+2] & ~word[i+1])
# for each row of state 2; x11, x27, x28, x30 and later x0 are temporaries
eor v8.16b, v8.16b, v28.16b
bic x11, x3, x2
ushr v24.2d, v31.2d, #62
bic x27, x4, x3
ushr v21.2d, v8.2d, #9
bic x28, x1, x5
sli v24.2d, v31.2d, #2
bic x30, x2, x1
sli v21.2d, v8.2d, #55
eor x1, x1, x11
eor v31.16b, v16.16b, v26.16b
eor x2, x2, x27
eor v5.16b, v5.16b, v25.16b
bic x11, x5, x4
ushr v8.2d, v31.2d, #19
eor x4, x4, x28
ushr v16.2d, v5.2d, #28
eor x3, x3, x11
sli v8.2d, v31.2d, #45
eor x5, x5, x30
sli v16.2d, v5.2d, #36
bic x11, x8, x7
eor v31.16b, v3.16b, v28.16b
bic x27, x9, x8
eor v18.16b, v18.16b, v28.16b
bic x28, x6, x10
ushr v5.2d, v31.2d, #36
bic x30, x7, x6
ushr v3.2d, v18.2d, #43
eor x6, x6, x11
sli v5.2d, v31.2d, #28
eor x7, x7, x27
sli v3.2d, v18.2d, #21
bic x11, x10, x9
eor v31.16b, v17.16b, v27.16b
eor x9, x9, x28
eor v11.16b, v11.16b, v26.16b
eor x8, x8, x11
ushr v18.2d, v31.2d, #49
eor x10, x10, x30
ushr v17.2d, v11.2d, #54
bic x11, x13, x12
sli v18.2d, v31.2d, #15
bic x27, x14, x13
sli v17.2d, v11.2d, #10
bic x28, x0, x15
eor v31.16b, v7.16b, v27.16b
bic x30, x12, x0
eor v10.16b, v10.16b, v25.16b
eor x11, x0, x11
ushr v11.2d, v31.2d, #58
eor x12, x12, x27
ushr v7.2d, v10.2d, #61
bic x0, x15, x14
sli v11.2d, v31.2d, #6
eor x14, x14, x28
sli v7.2d, v10.2d, #3
eor x13, x13, x0
# Row Mix NEON
# NEON side: for each row of five words, v25-v29 receive
# word[i+2] & ~word[i+1] and are then XORed into word[i]
bic v25.16b, v2.16b, v1.16b
eor x15, x15, x30
bic v26.16b, v3.16b, v2.16b
bic x0, x19, x17
bic v27.16b, v4.16b, v3.16b
bic x27, x20, x19
bic v28.16b, v0.16b, v4.16b
bic x28, x16, x21
bic v29.16b, v1.16b, v0.16b
bic x30, x17, x16
eor v0.16b, v0.16b, v25.16b
eor x16, x16, x0
eor v1.16b, v1.16b, v26.16b
eor x17, x17, x27
eor v2.16b, v2.16b, v27.16b
bic x0, x21, x20
eor v3.16b, v3.16b, v28.16b
eor x20, x20, x28
eor v4.16b, v4.16b, v29.16b
eor x19, x19, x0
bic v25.16b, v7.16b, v6.16b
eor x21, x21, x30
bic v26.16b, v8.16b, v7.16b
bic x0, x24, x23
bic v27.16b, v9.16b, v8.16b
bic x27, x25, x24
bic v28.16b, v5.16b, v9.16b
bic x28, x22, x26
bic v29.16b, v6.16b, v5.16b
bic x30, x23, x22
eor v5.16b, v5.16b, v25.16b
eor x22, x22, x0
eor v6.16b, v6.16b, v26.16b
eor x23, x23, x27
eor v7.16b, v7.16b, v27.16b
bic x0, x26, x25
eor v8.16b, v8.16b, v28.16b
eor x25, x25, x28
eor v9.16b, v9.16b, v29.16b
eor x24, x24, x0
bic v25.16b, v12.16b, v11.16b
eor x26, x26, x30
bic v26.16b, v13.16b, v12.16b
bic v27.16b, v14.16b, v13.16b
bic v28.16b, v30.16b, v14.16b
bic v29.16b, v11.16b, v30.16b
eor v10.16b, v30.16b, v25.16b
eor v11.16b, v11.16b, v26.16b
eor v12.16b, v12.16b, v27.16b
eor v13.16b, v13.16b, v28.16b
eor v14.16b, v14.16b, v29.16b
bic v25.16b, v17.16b, v16.16b
bic v26.16b, v18.16b, v17.16b
bic v27.16b, v19.16b, v18.16b
bic v28.16b, v15.16b, v19.16b
bic v29.16b, v16.16b, v15.16b
eor v15.16b, v15.16b, v25.16b
eor v16.16b, v16.16b, v26.16b
eor v17.16b, v17.16b, v27.16b
eor v18.16b, v18.16b, v28.16b
eor v19.16b, v19.16b, v29.16b
bic v25.16b, v22.16b, v21.16b
bic v26.16b, v23.16b, v22.16b
bic v27.16b, v24.16b, v23.16b
bic v28.16b, v20.16b, v24.16b
bic v29.16b, v21.16b, v20.16b
eor v20.16b, v20.16b, v25.16b
eor v21.16b, v21.16b, v26.16b
eor v22.16b, v22.16b, v27.16b
eor v23.16b, v23.16b, v28.16b
eor v24.16b, v24.16b, v29.16b
# Done transforming
# Reload constant pointer and counter, then XOR the next round constant
# into word 0 of all three states
ldp x27, x28, [x29, #48]
ldr x0, [x27], #8
subs x28, x28, #1
mov v30.d[0], x0
mov v30.d[1], x0
eor x1, x1, x0
eor v0.16b, v0.16b, v30.16b
bne L_SHA3_transform_blocksx3_neon_begin
# Write the three states back in the layout they were loaded from
ldr x0, [x29, #40]
st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
st1 {v24.d}[0], [x0]
add x0, x0, #8
st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
st1 {v24.d}[1], [x0]
add x0, x0, #8
stp x1, x2, [x0]
stp x3, x4, [x0, #16]
stp x5, x6, [x0, #32]
stp x7, x8, [x0, #48]
stp x9, x10, [x0, #64]
stp x11, x12, [x0, #80]
stp x13, x14, [x0, #96]
stp x15, x16, [x0, #112]
stp x17, x19, [x0, #128]
stp x20, x21, [x0, #144]
stp x22, x23, [x0, #160]
stp x24, x25, [x0, #176]
str x26, [x0, #192]
# Restore the saved registers and release the frame
ldp x17, x19, [x29, #72]
ldp x20, x21, [x29, #88]
ldp x22, x23, [x29, #104]
ldp x24, x25, [x29, #120]
ldp x26, x27, [x29, #136]
ldr x28, [x29, #152]
ldp d8, d9, [x29, #160]
ldp d10, d11, [x29, #176]
ldp d12, d13, [x29, #192]
ldp d14, d15, [x29, #208]
ldp x29, x30, [sp], #0xe0
ret
#ifndef __APPLE__
.size mlkem_sha3_blocksx3_neon,.-mlkem_sha3_blocksx3_neon
#endif /* __APPLE__ */
/* mlkem_shake128_blocksx3_seed_neon
 *
 * Runs the 24-round permutation below on three 25-word (200-byte) states at
 * once. Two states share the two 64-bit lanes of v0-v24; the third state is
 * held in x2-x17 and x19-x27 (x18 is never touched).
 *
 * In:  x0 = base of three consecutive 200-byte states. Only word 4 of each
 *           state (byte offsets 32, 232 and 432) is read on entry.
 *      x1 = 32 bytes that become words 0-3 of all three states.
 * Out: all 3 * 200 bytes at x0 are overwritten with the permuted states.
 *
 * Words 5-24 of every state start at zero, except word 20 which is set to
 * 0x8000000000000000.
 * NOTE(review): word 20 is the last word of a 168-byte block, so this looks
 * like the final padding bit of a SHAKE128 rate block - confirm with caller.
 *
 * Frame (224 bytes, addressed through x29):
 *   #0    saved x29, x30 (x30 is reused as scratch inside the loop)
 *   #24   spilled column 2 parity of the scalar state
 *   #32   spilled column 4 parity of the scalar state
 *   #40   state pointer (x0 on entry)
 *   #48   pointer to the next 64-bit entry of L_sha3_aarch64_r
 *   #56   rounds remaining
 *   #72   saved x17, x19-x28
 *   #160  saved d8-d15
 */
#ifndef __APPLE__
.text
.globl mlkem_shake128_blocksx3_seed_neon
.type mlkem_shake128_blocksx3_seed_neon,@function
.align 2
mlkem_shake128_blocksx3_seed_neon:
#else
.section __TEXT,__text
.globl _mlkem_shake128_blocksx3_seed_neon
.p2align 2
_mlkem_shake128_blocksx3_seed_neon:
#endif /* __APPLE__ */
/* Prologue: allocate the frame and save every register restored at exit. */
stp x29, x30, [sp, #-224]!
add x29, sp, #0
stp x17, x19, [x29, #72]
stp x20, x21, [x29, #88]
stp x22, x23, [x29, #104]
stp x24, x25, [x29, #120]
stp x26, x27, [x29, #136]
str x28, [x29, #152]
stp d8, d9, [x29, #160]
stp d10, d11, [x29, #176]
stp d12, d13, [x29, #192]
stp d14, d15, [x29, #208]
/* x28 = address of L_sha3_aarch64_r (table defined outside this function;
 * one 64-bit entry is consumed per round). */
#ifndef __APPLE__
adrp x28, L_sha3_aarch64_r
add x28, x28, :lo12:L_sha3_aarch64_r
#else
adrp x28, L_sha3_aarch64_r@PAGE
add x28, x28, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
/* Keep the state pointer for the final store, then gather the inputs:
 *   v4 lane 0 = state0 word 4, v4 lane 1 = state1 word 4, x6 = state2 word 4
 *   x2-x5     = the 32 bytes at x1 */
str x0, [x29, #40]
add x0, x0, #32
ld1 {v4.d}[0], [x0]
ldp x2, x3, [x1], #16
add x0, x0, #0xc8
ld1 {v4.d}[1], [x0]
ldp x4, x5, [x1], #16
ldr x6, [x0, #200]
/* Clear words 5-24 of all three states (register XORed with itself gives
 * zero). Word 20 (x23, later copied to v20) gets only its top bit set. */
eor v5.16b, v5.16b, v5.16b
eor x7, x7, x7
eor v6.16b, v6.16b, v6.16b
eor x8, x8, x8
eor v7.16b, v7.16b, v7.16b
eor x9, x9, x9
eor v8.16b, v8.16b, v8.16b
eor x10, x10, x10
eor v9.16b, v9.16b, v9.16b
eor x11, x11, x11
eor v10.16b, v10.16b, v10.16b
eor x12, x12, x12
eor v11.16b, v11.16b, v11.16b
eor x13, x13, x13
eor v12.16b, v12.16b, v12.16b
eor x14, x14, x14
eor v13.16b, v13.16b, v13.16b
eor x15, x15, x15
eor v14.16b, v14.16b, v14.16b
eor x16, x16, x16
eor v15.16b, v15.16b, v15.16b
eor x17, x17, x17
eor v16.16b, v16.16b, v16.16b
eor x19, x19, x19
eor v17.16b, v17.16b, v17.16b
eor x20, x20, x20
eor v18.16b, v18.16b, v18.16b
eor x21, x21, x21
eor v19.16b, v19.16b, v19.16b
eor x22, x22, x22
movz x23, #0x8000, lsl 48
eor v21.16b, v21.16b, v21.16b
eor x24, x24, x24
eor v22.16b, v22.16b, v22.16b
eor x25, x25, x25
eor v23.16b, v23.16b, v23.16b
eor x26, x26, x26
eor v24.16b, v24.16b, v24.16b
eor x27, x27, x27
/* Broadcast the scalar copies of words 0-3 and word 20 into both NEON lanes
 * so all three states start with the same values in those words. */
dup v0.2d, x2
dup v1.2d, x3
dup v2.2d, x4
dup v3.2d, x5
dup v20.2d, x23
/* x1 = round counter. */
mov x1, #24
# Start of 24 rounds
L_SHA3_shake128_blocksx3_seed_neon_begin:
/* x28 and x1 are reused as scratch below, so park the table pointer and the
 * round counter in the frame for the duration of the round. */
stp x28, x1, [x29, #48]
/* NEON and scalar instructions are interleaved from here on; the two
 * streams work on separate registers and do not feed each other.
 *
 * Column parities (XOR of the five words of a column):
 *   NEON:   v30 = col 4, v27 = col 1, v31 = col 0, v28 = col 2, v29 = col 3
 *   scalar: x0 = col 4, x30 = col 0, x28 = col 2 (spilled), x1 = col 1,
 *           then x28 = col 3
 * Each column is then XORed with (parity of previous column) ^
 * rotl(parity of next column, 1). "ror 63" and the ushr #63 / sli #1 pair
 * are both a left rotate by one bit. */
# Col Mix NEON
eor v30.16b, v4.16b, v9.16b
eor x0, x6, x11
eor v27.16b, v1.16b, v6.16b
eor x30, x2, x7
eor v30.16b, v30.16b, v14.16b
eor x28, x4, x9
eor v27.16b, v27.16b, v11.16b
eor x0, x0, x16
eor v30.16b, v30.16b, v19.16b
eor x30, x30, x12
eor v27.16b, v27.16b, v16.16b
eor x28, x28, x14
eor v30.16b, v30.16b, v24.16b
eor x0, x0, x22
eor v27.16b, v27.16b, v21.16b
eor x30, x30, x17
ushr v25.2d, v27.2d, #63
eor x28, x28, x20
sli v25.2d, v27.2d, #1
eor x0, x0, x27
eor v25.16b, v25.16b, v30.16b
eor x30, x30, x23
eor v31.16b, v0.16b, v5.16b
eor x28, x28, x25
eor v28.16b, v2.16b, v7.16b
str x0, [x29, #32]
eor v31.16b, v31.16b, v10.16b
str x28, [x29, #24]
eor v28.16b, v28.16b, v12.16b
eor x1, x3, x8
eor v31.16b, v31.16b, v15.16b
eor x28, x5, x10
eor v28.16b, v28.16b, v17.16b
eor x1, x1, x13
eor v31.16b, v31.16b, v20.16b
eor x28, x28, x15
eor v28.16b, v28.16b, v22.16b
eor x1, x1, x19
ushr v29.2d, v30.2d, #63
eor x28, x28, x21
ushr v26.2d, v28.2d, #63
eor x1, x1, x24
sli v29.2d, v30.2d, #1
eor x28, x28, x26
sli v26.2d, v28.2d, #1
eor x0, x0, x1, ror 63
eor v28.16b, v28.16b, v29.16b
eor x1, x1, x28, ror 63
eor v29.16b, v3.16b, v8.16b
eor x2, x2, x0
eor v26.16b, v26.16b, v31.16b
eor x7, x7, x0
eor v29.16b, v29.16b, v13.16b
eor x12, x12, x0
eor v29.16b, v29.16b, v18.16b
eor x17, x17, x0
eor v29.16b, v29.16b, v23.16b
eor x23, x23, x0
ushr v30.2d, v29.2d, #63
eor x4, x4, x1
sli v30.2d, v29.2d, #1
eor x9, x9, x1
eor v27.16b, v27.16b, v30.16b
eor x14, x14, x1
ushr v30.2d, v31.2d, #63
eor x20, x20, x1
sli v30.2d, v31.2d, #1
eor x25, x25, x1
eor v29.16b, v29.16b, v30.16b
/* Reload the spilled scalar parities: x0 = col 4 here, x1 = col 2 below. */
ldr x0, [x29, #32]
/* NEON: v25-v29 now hold the per-column XOR values for columns 0-4. Each
 * remaining word is XORed with its column value, rotated left with a
 * ushr #(64-n) / sli #n pair and written to its new register. v31 is the
 * temporary; v30 receives the rotated word 1, which becomes word 10 in the
 * row mix. The scalar stream finishes its own column mix in between. */
# Swap Rotate NEON
eor v0.16b, v0.16b, v25.16b
eor v31.16b, v1.16b, v26.16b
ldr x1, [x29, #24]
eor v6.16b, v6.16b, v26.16b
eor x28, x28, x30, ror 63
ushr v30.2d, v31.2d, #63
eor x30, x30, x1, ror 63
ushr v1.2d, v6.2d, #20
eor x1, x1, x0, ror 63
sli v30.2d, v31.2d, #1
eor x6, x6, x28
sli v1.2d, v6.2d, #44
eor x11, x11, x28
eor v31.16b, v9.16b, v29.16b
eor x16, x16, x28
eor v22.16b, v22.16b, v27.16b
eor x22, x22, x28
ushr v6.2d, v31.2d, #44
eor x27, x27, x28
ushr v9.2d, v22.2d, #3
eor x3, x3, x30
sli v6.2d, v31.2d, #20
eor x8, x8, x30
sli v9.2d, v22.2d, #61
eor x13, x13, x30
eor v31.16b, v14.16b, v29.16b
eor x19, x19, x30
eor v20.16b, v20.16b, v25.16b
eor x24, x24, x30
ushr v22.2d, v31.2d, #25
eor x5, x5, x1
ushr v14.2d, v20.2d, #46
eor x10, x10, x1
sli v22.2d, v31.2d, #39
eor x15, x15, x1
sli v14.2d, v20.2d, #18
eor x21, x21, x1
eor v31.16b, v2.16b, v27.16b
eor x26, x26, x1
/* Scalar: each ror both rotates a word and moves it to its new register.
 * The chain starts by parking the rotated word 1 in x0 and ends by writing
 * x9 from x12; x12 is refilled in the row mix (from x0). */
# Swap Rotate Base
eor v12.16b, v12.16b, v27.16b
ror x0, x3, #63
ushr v20.2d, v31.2d, #2
ror x3, x8, #20
ushr v2.2d, v12.2d, #21
ror x8, x11, #44
sli v20.2d, v31.2d, #62
ror x11, x25, #3
sli v2.2d, v12.2d, #43
ror x25, x16, #25
eor v31.16b, v13.16b, v28.16b
ror x16, x23, #46
eor v19.16b, v19.16b, v29.16b
ror x23, x4, #2
ushr v12.2d, v31.2d, #39
ror x4, x14, #21
ushr v13.2d, v19.2d, #56
ror x14, x15, #39
sli v12.2d, v31.2d, #25
ror x15, x22, #56
sli v13.2d, v19.2d, #8
ror x22, x26, #8
eor v31.16b, v23.16b, v28.16b
ror x26, x17, #23
eor v15.16b, v15.16b, v25.16b
ror x17, x6, #37
ushr v19.2d, v31.2d, #8
ror x6, x27, #50
ushr v23.2d, v15.2d, #23
ror x27, x24, #62
sli v19.2d, v31.2d, #56
ror x24, x10, #9
sli v23.2d, v15.2d, #41
ror x10, x19, #19
eor v31.16b, v4.16b, v29.16b
ror x19, x7, #28
eor v24.16b, v24.16b, v29.16b
ror x7, x5, #36
ushr v15.2d, v31.2d, #37
ror x5, x21, #43
ushr v4.2d, v24.2d, #50
ror x21, x20, #49
sli v15.2d, v31.2d, #27
ror x20, x13, #54
sli v4.2d, v24.2d, #14
ror x13, x9, #58
eor v31.16b, v21.16b, v26.16b
ror x9, x12, #61
/* Scalar row mix: within each row of five words,
 *   word[i] ^= word[i+2] & ~word[i+1]   (bic computes the and-not term).
 * The and-not terms that read a word are formed before that word is
 * updated. Scratch: x12 (free until row 2 writes it), x1, x28, x30, and x0
 * once row 2 has consumed it. Row 2 reads its first word from x0 and writes
 * the result to x12. */
# Row Mix Base
eor v8.16b, v8.16b, v28.16b
bic x12, x4, x3
ushr v24.2d, v31.2d, #62
bic x1, x5, x4
ushr v21.2d, v8.2d, #9
bic x28, x2, x6
sli v24.2d, v31.2d, #2
bic x30, x3, x2
sli v21.2d, v8.2d, #55
eor x2, x2, x12
eor v31.16b, v16.16b, v26.16b
eor x3, x3, x1
eor v5.16b, v5.16b, v25.16b
bic x12, x6, x5
ushr v8.2d, v31.2d, #19
eor x5, x5, x28
ushr v16.2d, v5.2d, #28
eor x4, x4, x12
sli v8.2d, v31.2d, #45
eor x6, x6, x30
sli v16.2d, v5.2d, #36
bic x12, x9, x8
eor v31.16b, v3.16b, v28.16b
bic x1, x10, x9
eor v18.16b, v18.16b, v28.16b
bic x28, x7, x11
ushr v5.2d, v31.2d, #36
bic x30, x8, x7
ushr v3.2d, v18.2d, #43
eor x7, x7, x12
sli v5.2d, v31.2d, #28
eor x8, x8, x1
sli v3.2d, v18.2d, #21
bic x12, x11, x10
eor v31.16b, v17.16b, v27.16b
eor x10, x10, x28
eor v11.16b, v11.16b, v26.16b
eor x9, x9, x12
ushr v18.2d, v31.2d, #49
eor x11, x11, x30
ushr v17.2d, v11.2d, #54
bic x12, x14, x13
sli v18.2d, v31.2d, #15
bic x1, x15, x14
sli v17.2d, v11.2d, #10
bic x28, x0, x16
eor v31.16b, v7.16b, v27.16b
bic x30, x13, x0
eor v10.16b, v10.16b, v25.16b
eor x12, x0, x12
ushr v11.2d, v31.2d, #58
eor x13, x13, x1
ushr v7.2d, v10.2d, #61
bic x0, x16, x15
sli v11.2d, v31.2d, #6
eor x15, x15, x28
sli v7.2d, v10.2d, #3
eor x14, x14, x0
/* NEON row mix: same and-not/XOR pattern, using v25-v29 for the five
 * and-not terms of a row. Row 2 takes its first word from v30 and writes
 * the result to v10. The scalar stream completes rows 3 and 4 alongside. */
# Row Mix NEON
bic v25.16b, v2.16b, v1.16b
eor x16, x16, x30
bic v26.16b, v3.16b, v2.16b
bic x0, x20, x19
bic v27.16b, v4.16b, v3.16b
bic x1, x21, x20
bic v28.16b, v0.16b, v4.16b
bic x28, x17, x22
bic v29.16b, v1.16b, v0.16b
bic x30, x19, x17
eor v0.16b, v0.16b, v25.16b
eor x17, x17, x0
eor v1.16b, v1.16b, v26.16b
eor x19, x19, x1
eor v2.16b, v2.16b, v27.16b
bic x0, x22, x21
eor v3.16b, v3.16b, v28.16b
eor x21, x21, x28
eor v4.16b, v4.16b, v29.16b
eor x20, x20, x0
bic v25.16b, v7.16b, v6.16b
eor x22, x22, x30
bic v26.16b, v8.16b, v7.16b
bic x0, x25, x24
bic v27.16b, v9.16b, v8.16b
bic x1, x26, x25
bic v28.16b, v5.16b, v9.16b
bic x28, x23, x27
bic v29.16b, v6.16b, v5.16b
bic x30, x24, x23
eor v5.16b, v5.16b, v25.16b
eor x23, x23, x0
eor v6.16b, v6.16b, v26.16b
eor x24, x24, x1
eor v7.16b, v7.16b, v27.16b
bic x0, x27, x26
eor v8.16b, v8.16b, v28.16b
eor x26, x26, x28
eor v9.16b, v9.16b, v29.16b
eor x25, x25, x0
bic v25.16b, v12.16b, v11.16b
eor x27, x27, x30
bic v26.16b, v13.16b, v12.16b
bic v27.16b, v14.16b, v13.16b
bic v28.16b, v30.16b, v14.16b
bic v29.16b, v11.16b, v30.16b
eor v10.16b, v30.16b, v25.16b
eor v11.16b, v11.16b, v26.16b
eor v12.16b, v12.16b, v27.16b
eor v13.16b, v13.16b, v28.16b
eor v14.16b, v14.16b, v29.16b
bic v25.16b, v17.16b, v16.16b
bic v26.16b, v18.16b, v17.16b
bic v27.16b, v19.16b, v18.16b
bic v28.16b, v15.16b, v19.16b
bic v29.16b, v16.16b, v15.16b
eor v15.16b, v15.16b, v25.16b
eor v16.16b, v16.16b, v26.16b
eor v17.16b, v17.16b, v27.16b
eor v18.16b, v18.16b, v28.16b
eor v19.16b, v19.16b, v29.16b
bic v25.16b, v22.16b, v21.16b
bic v26.16b, v23.16b, v22.16b
bic v27.16b, v24.16b, v23.16b
bic v28.16b, v20.16b, v24.16b
bic v29.16b, v21.16b, v20.16b
eor v20.16b, v20.16b, v25.16b
eor v21.16b, v21.16b, v26.16b
eor v22.16b, v22.16b, v27.16b
eor v23.16b, v23.16b, v28.16b
eor v24.16b, v24.16b, v29.16b
/* End of round: restore the table pointer and counter, fetch the next
 * 64-bit table entry (post-incrementing x28) and XOR it into word 0 of all
 * three states. subs sets the flags used by bne; the mov/eor instructions
 * in between do not modify them. */
# Done transforming
ldp x28, x1, [x29, #48]
ldr x0, [x28], #8
subs x1, x1, #1
mov v30.d[0], x0
mov v30.d[1], x0
eor x2, x2, x0
eor v0.16b, v0.16b, v30.16b
bne L_SHA3_shake128_blocksx3_seed_neon_begin
/* Write back. Each st4 stores one lane of four registers as four
 * consecutive 64-bit words, so six st4 plus one st1 emit 25 words.
 * Lane 0 goes to bytes 0-199 and lane 1 to bytes 200-399. */
ldr x0, [x29, #40]
st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
st1 {v24.d}[0], [x0]
add x0, x0, #8
st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
st1 {v24.d}[1], [x0]
add x0, x0, #8
/* x0 now points 400 bytes past the base: store the scalar state there. */
stp x2, x3, [x0]
stp x4, x5, [x0, #16]
stp x6, x7, [x0, #32]
stp x8, x9, [x0, #48]
stp x10, x11, [x0, #64]
stp x12, x13, [x0, #80]
stp x14, x15, [x0, #96]
stp x16, x17, [x0, #112]
stp x19, x20, [x0, #128]
stp x21, x22, [x0, #144]
stp x23, x24, [x0, #160]
stp x25, x26, [x0, #176]
str x27, [x0, #192]
/* Epilogue: restore saved registers and release the 224-byte (0xe0) frame. */
ldp x17, x19, [x29, #72]
ldp x20, x21, [x29, #88]
ldp x22, x23, [x29, #104]
ldp x24, x25, [x29, #120]
ldp x26, x27, [x29, #136]
ldr x28, [x29, #152]
ldp d8, d9, [x29, #160]
ldp d10, d11, [x29, #176]
ldp d12, d13, [x29, #192]
ldp d14, d15, [x29, #208]
ldp x29, x30, [sp], #0xe0
ret
#ifndef __APPLE__
.size mlkem_shake128_blocksx3_seed_neon,.-mlkem_shake128_blocksx3_seed_neon
#endif /* __APPLE__ */
/* mlkem_shake256_blocksx3_seed_neon
 *
 * Runs the 24-round permutation below on three 25-word (200-byte) states at
 * once. Two states share the two 64-bit lanes of v0-v24; the third state is
 * held in x2-x17 and x19-x27 (x18 is never touched).
 *
 * In:  x0 = base of three consecutive 200-byte states. Only word 4 of each
 *           state (byte offsets 32, 232 and 432) is read on entry.
 *      x1 = 32 bytes that become words 0-3 of all three states.
 * Out: all 3 * 200 bytes at x0 are overwritten with the permuted states.
 *
 * Words 5-24 of every state start at zero, except word 16 which is set to
 * 0x8000000000000000.
 * NOTE(review): word 16 is the last word of a 136-byte block, so this looks
 * like the final padding bit of a SHAKE256 rate block - confirm with caller.
 *
 * Frame (224 bytes, addressed through x29):
 *   #0    saved x29, x30 (x30 is reused as scratch inside the loop)
 *   #24   spilled column 2 parity of the scalar state
 *   #32   spilled column 4 parity of the scalar state
 *   #40   state pointer (x0 on entry)
 *   #48   pointer to the next 64-bit entry of L_sha3_aarch64_r
 *   #56   rounds remaining
 *   #72   saved x17, x19-x28
 *   #160  saved d8-d15
 */
#ifndef __APPLE__
.text
.globl mlkem_shake256_blocksx3_seed_neon
.type mlkem_shake256_blocksx3_seed_neon,@function
.align 2
mlkem_shake256_blocksx3_seed_neon:
#else
.section __TEXT,__text
.globl _mlkem_shake256_blocksx3_seed_neon
.p2align 2
_mlkem_shake256_blocksx3_seed_neon:
#endif /* __APPLE__ */
/* Prologue: allocate the frame and save every register restored at exit. */
stp x29, x30, [sp, #-224]!
add x29, sp, #0
stp x17, x19, [x29, #72]
stp x20, x21, [x29, #88]
stp x22, x23, [x29, #104]
stp x24, x25, [x29, #120]
stp x26, x27, [x29, #136]
str x28, [x29, #152]
stp d8, d9, [x29, #160]
stp d10, d11, [x29, #176]
stp d12, d13, [x29, #192]
stp d14, d15, [x29, #208]
/* x28 = address of L_sha3_aarch64_r (table defined outside this function;
 * one 64-bit entry is consumed per round). */
#ifndef __APPLE__
adrp x28, L_sha3_aarch64_r
add x28, x28, :lo12:L_sha3_aarch64_r
#else
adrp x28, L_sha3_aarch64_r@PAGE
add x28, x28, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
/* Keep the state pointer for the final store, then gather the inputs:
 *   v4 lane 0 = state0 word 4, v4 lane 1 = state1 word 4, x6 = state2 word 4
 *   x2-x5     = the 32 bytes at x1 */
str x0, [x29, #40]
add x0, x0, #32
ld1 {v4.d}[0], [x0]
ldp x2, x3, [x1], #16
add x0, x0, #0xc8
ld1 {v4.d}[1], [x0]
ldp x4, x5, [x1], #16
ldr x6, [x0, #200]
/* Clear words 5-24 of all three states (register XORed with itself gives
 * zero). Word 16 (x19, later copied to v16) gets only its top bit set. */
eor v5.16b, v5.16b, v5.16b
eor x7, x7, x7
eor v6.16b, v6.16b, v6.16b
eor x8, x8, x8
eor v7.16b, v7.16b, v7.16b
eor x9, x9, x9
eor v8.16b, v8.16b, v8.16b
eor x10, x10, x10
eor v9.16b, v9.16b, v9.16b
eor x11, x11, x11
eor v10.16b, v10.16b, v10.16b
eor x12, x12, x12
eor v11.16b, v11.16b, v11.16b
eor x13, x13, x13
eor v12.16b, v12.16b, v12.16b
eor x14, x14, x14
eor v13.16b, v13.16b, v13.16b
eor x15, x15, x15
eor v14.16b, v14.16b, v14.16b
eor x16, x16, x16
eor v15.16b, v15.16b, v15.16b
eor x17, x17, x17
movz x19, #0x8000, lsl 48
eor v17.16b, v17.16b, v17.16b
eor x20, x20, x20
eor v18.16b, v18.16b, v18.16b
eor x21, x21, x21
eor v19.16b, v19.16b, v19.16b
eor x22, x22, x22
eor v20.16b, v20.16b, v20.16b
eor x23, x23, x23
eor v21.16b, v21.16b, v21.16b
eor x24, x24, x24
eor v22.16b, v22.16b, v22.16b
eor x25, x25, x25
eor v23.16b, v23.16b, v23.16b
eor x26, x26, x26
eor v24.16b, v24.16b, v24.16b
eor x27, x27, x27
/* Broadcast the scalar copies of words 0-3 and word 16 into both NEON lanes
 * so all three states start with the same values in those words. */
dup v0.2d, x2
dup v1.2d, x3
dup v2.2d, x4
dup v3.2d, x5
dup v16.2d, x19
/* x1 = round counter. */
mov x1, #24
# Start of 24 rounds
L_SHA3_shake256_blocksx3_seed_neon_begin:
/* x28 and x1 are reused as scratch below, so park the table pointer and the
 * round counter in the frame for the duration of the round. */
stp x28, x1, [x29, #48]
/* NEON and scalar instructions are interleaved from here on; the two
 * streams work on separate registers and do not feed each other.
 *
 * Column parities (XOR of the five words of a column):
 *   NEON:   v30 = col 4, v27 = col 1, v31 = col 0, v28 = col 2, v29 = col 3
 *   scalar: x0 = col 4, x30 = col 0, x28 = col 2 (spilled), x1 = col 1,
 *           then x28 = col 3
 * Each column is then XORed with (parity of previous column) ^
 * rotl(parity of next column, 1). "ror 63" and the ushr #63 / sli #1 pair
 * are both a left rotate by one bit. */
# Col Mix NEON
eor v30.16b, v4.16b, v9.16b
eor x0, x6, x11
eor v27.16b, v1.16b, v6.16b
eor x30, x2, x7
eor v30.16b, v30.16b, v14.16b
eor x28, x4, x9
eor v27.16b, v27.16b, v11.16b
eor x0, x0, x16
eor v30.16b, v30.16b, v19.16b
eor x30, x30, x12
eor v27.16b, v27.16b, v16.16b
eor x28, x28, x14
eor v30.16b, v30.16b, v24.16b
eor x0, x0, x22
eor v27.16b, v27.16b, v21.16b
eor x30, x30, x17
ushr v25.2d, v27.2d, #63
eor x28, x28, x20
sli v25.2d, v27.2d, #1
eor x0, x0, x27
eor v25.16b, v25.16b, v30.16b
eor x30, x30, x23
eor v31.16b, v0.16b, v5.16b
eor x28, x28, x25
eor v28.16b, v2.16b, v7.16b
str x0, [x29, #32]
eor v31.16b, v31.16b, v10.16b
str x28, [x29, #24]
eor v28.16b, v28.16b, v12.16b
eor x1, x3, x8
eor v31.16b, v31.16b, v15.16b
eor x28, x5, x10
eor v28.16b, v28.16b, v17.16b
eor x1, x1, x13
eor v31.16b, v31.16b, v20.16b
eor x28, x28, x15
eor v28.16b, v28.16b, v22.16b
eor x1, x1, x19
ushr v29.2d, v30.2d, #63
eor x28, x28, x21
ushr v26.2d, v28.2d, #63
eor x1, x1, x24
sli v29.2d, v30.2d, #1
eor x28, x28, x26
sli v26.2d, v28.2d, #1
eor x0, x0, x1, ror 63
eor v28.16b, v28.16b, v29.16b
eor x1, x1, x28, ror 63
eor v29.16b, v3.16b, v8.16b
eor x2, x2, x0
eor v26.16b, v26.16b, v31.16b
eor x7, x7, x0
eor v29.16b, v29.16b, v13.16b
eor x12, x12, x0
eor v29.16b, v29.16b, v18.16b
eor x17, x17, x0
eor v29.16b, v29.16b, v23.16b
eor x23, x23, x0
ushr v30.2d, v29.2d, #63
eor x4, x4, x1
sli v30.2d, v29.2d, #1
eor x9, x9, x1
eor v27.16b, v27.16b, v30.16b
eor x14, x14, x1
ushr v30.2d, v31.2d, #63
eor x20, x20, x1
sli v30.2d, v31.2d, #1
eor x25, x25, x1
eor v29.16b, v29.16b, v30.16b
/* Reload the spilled scalar parities: x0 = col 4 here, x1 = col 2 below. */
ldr x0, [x29, #32]
/* NEON: v25-v29 now hold the per-column XOR values for columns 0-4. Each
 * remaining word is XORed with its column value, rotated left with a
 * ushr #(64-n) / sli #n pair and written to its new register. v31 is the
 * temporary; v30 receives the rotated word 1, which becomes word 10 in the
 * row mix. The scalar stream finishes its own column mix in between. */
# Swap Rotate NEON
eor v0.16b, v0.16b, v25.16b
eor v31.16b, v1.16b, v26.16b
ldr x1, [x29, #24]
eor v6.16b, v6.16b, v26.16b
eor x28, x28, x30, ror 63
ushr v30.2d, v31.2d, #63
eor x30, x30, x1, ror 63
ushr v1.2d, v6.2d, #20
eor x1, x1, x0, ror 63
sli v30.2d, v31.2d, #1
eor x6, x6, x28
sli v1.2d, v6.2d, #44
eor x11, x11, x28
eor v31.16b, v9.16b, v29.16b
eor x16, x16, x28
eor v22.16b, v22.16b, v27.16b
eor x22, x22, x28
ushr v6.2d, v31.2d, #44
eor x27, x27, x28
ushr v9.2d, v22.2d, #3
eor x3, x3, x30
sli v6.2d, v31.2d, #20
eor x8, x8, x30
sli v9.2d, v22.2d, #61
eor x13, x13, x30
eor v31.16b, v14.16b, v29.16b
eor x19, x19, x30
eor v20.16b, v20.16b, v25.16b
eor x24, x24, x30
ushr v22.2d, v31.2d, #25
eor x5, x5, x1
ushr v14.2d, v20.2d, #46
eor x10, x10, x1
sli v22.2d, v31.2d, #39
eor x15, x15, x1
sli v14.2d, v20.2d, #18
eor x21, x21, x1
eor v31.16b, v2.16b, v27.16b
eor x26, x26, x1
/* Scalar: each ror both rotates a word and moves it to its new register.
 * The chain starts by parking the rotated word 1 in x0 and ends by writing
 * x9 from x12; x12 is refilled in the row mix (from x0). */
# Swap Rotate Base
eor v12.16b, v12.16b, v27.16b
ror x0, x3, #63
ushr v20.2d, v31.2d, #2
ror x3, x8, #20
ushr v2.2d, v12.2d, #21
ror x8, x11, #44
sli v20.2d, v31.2d, #62
ror x11, x25, #3
sli v2.2d, v12.2d, #43
ror x25, x16, #25
eor v31.16b, v13.16b, v28.16b
ror x16, x23, #46
eor v19.16b, v19.16b, v29.16b
ror x23, x4, #2
ushr v12.2d, v31.2d, #39
ror x4, x14, #21
ushr v13.2d, v19.2d, #56
ror x14, x15, #39
sli v12.2d, v31.2d, #25
ror x15, x22, #56
sli v13.2d, v19.2d, #8
ror x22, x26, #8
eor v31.16b, v23.16b, v28.16b
ror x26, x17, #23
eor v15.16b, v15.16b, v25.16b
ror x17, x6, #37
ushr v19.2d, v31.2d, #8
ror x6, x27, #50
ushr v23.2d, v15.2d, #23
ror x27, x24, #62
sli v19.2d, v31.2d, #56
ror x24, x10, #9
sli v23.2d, v15.2d, #41
ror x10, x19, #19
eor v31.16b, v4.16b, v29.16b
ror x19, x7, #28
eor v24.16b, v24.16b, v29.16b
ror x7, x5, #36
ushr v15.2d, v31.2d, #37
ror x5, x21, #43
ushr v4.2d, v24.2d, #50
ror x21, x20, #49
sli v15.2d, v31.2d, #27
ror x20, x13, #54
sli v4.2d, v24.2d, #14
ror x13, x9, #58
eor v31.16b, v21.16b, v26.16b
ror x9, x12, #61
/* Scalar row mix: within each row of five words,
 *   word[i] ^= word[i+2] & ~word[i+1]   (bic computes the and-not term).
 * The and-not terms that read a word are formed before that word is
 * updated. Scratch: x12 (free until row 2 writes it), x1, x28, x30, and x0
 * once row 2 has consumed it. Row 2 reads its first word from x0 and writes
 * the result to x12. */
# Row Mix Base
eor v8.16b, v8.16b, v28.16b
bic x12, x4, x3
ushr v24.2d, v31.2d, #62
bic x1, x5, x4
ushr v21.2d, v8.2d, #9
bic x28, x2, x6
sli v24.2d, v31.2d, #2
bic x30, x3, x2
sli v21.2d, v8.2d, #55
eor x2, x2, x12
eor v31.16b, v16.16b, v26.16b
eor x3, x3, x1
eor v5.16b, v5.16b, v25.16b
bic x12, x6, x5
ushr v8.2d, v31.2d, #19
eor x5, x5, x28
ushr v16.2d, v5.2d, #28
eor x4, x4, x12
sli v8.2d, v31.2d, #45
eor x6, x6, x30
sli v16.2d, v5.2d, #36
bic x12, x9, x8
eor v31.16b, v3.16b, v28.16b
bic x1, x10, x9
eor v18.16b, v18.16b, v28.16b
bic x28, x7, x11
ushr v5.2d, v31.2d, #36
bic x30, x8, x7
ushr v3.2d, v18.2d, #43
eor x7, x7, x12
sli v5.2d, v31.2d, #28
eor x8, x8, x1
sli v3.2d, v18.2d, #21
bic x12, x11, x10
eor v31.16b, v17.16b, v27.16b
eor x10, x10, x28
eor v11.16b, v11.16b, v26.16b
eor x9, x9, x12
ushr v18.2d, v31.2d, #49
eor x11, x11, x30
ushr v17.2d, v11.2d, #54
bic x12, x14, x13
sli v18.2d, v31.2d, #15
bic x1, x15, x14
sli v17.2d, v11.2d, #10
bic x28, x0, x16
eor v31.16b, v7.16b, v27.16b
bic x30, x13, x0
eor v10.16b, v10.16b, v25.16b
eor x12, x0, x12
ushr v11.2d, v31.2d, #58
eor x13, x13, x1
ushr v7.2d, v10.2d, #61
bic x0, x16, x15
sli v11.2d, v31.2d, #6
eor x15, x15, x28
sli v7.2d, v10.2d, #3
eor x14, x14, x0
/* NEON row mix: same and-not/XOR pattern, using v25-v29 for the five
 * and-not terms of a row. Row 2 takes its first word from v30 and writes
 * the result to v10. The scalar stream completes rows 3 and 4 alongside. */
# Row Mix NEON
bic v25.16b, v2.16b, v1.16b
eor x16, x16, x30
bic v26.16b, v3.16b, v2.16b
bic x0, x20, x19
bic v27.16b, v4.16b, v3.16b
bic x1, x21, x20
bic v28.16b, v0.16b, v4.16b
bic x28, x17, x22
bic v29.16b, v1.16b, v0.16b
bic x30, x19, x17
eor v0.16b, v0.16b, v25.16b
eor x17, x17, x0
eor v1.16b, v1.16b, v26.16b
eor x19, x19, x1
eor v2.16b, v2.16b, v27.16b
bic x0, x22, x21
eor v3.16b, v3.16b, v28.16b
eor x21, x21, x28
eor v4.16b, v4.16b, v29.16b
eor x20, x20, x0
bic v25.16b, v7.16b, v6.16b
eor x22, x22, x30
bic v26.16b, v8.16b, v7.16b
bic x0, x25, x24
bic v27.16b, v9.16b, v8.16b
bic x1, x26, x25
bic v28.16b, v5.16b, v9.16b
bic x28, x23, x27
bic v29.16b, v6.16b, v5.16b
bic x30, x24, x23
eor v5.16b, v5.16b, v25.16b
eor x23, x23, x0
eor v6.16b, v6.16b, v26.16b
eor x24, x24, x1
eor v7.16b, v7.16b, v27.16b
bic x0, x27, x26
eor v8.16b, v8.16b, v28.16b
eor x26, x26, x28
eor v9.16b, v9.16b, v29.16b
eor x25, x25, x0
bic v25.16b, v12.16b, v11.16b
eor x27, x27, x30
bic v26.16b, v13.16b, v12.16b
bic v27.16b, v14.16b, v13.16b
bic v28.16b, v30.16b, v14.16b
bic v29.16b, v11.16b, v30.16b
eor v10.16b, v30.16b, v25.16b
eor v11.16b, v11.16b, v26.16b
eor v12.16b, v12.16b, v27.16b
eor v13.16b, v13.16b, v28.16b
eor v14.16b, v14.16b, v29.16b
bic v25.16b, v17.16b, v16.16b
bic v26.16b, v18.16b, v17.16b
bic v27.16b, v19.16b, v18.16b
bic v28.16b, v15.16b, v19.16b
bic v29.16b, v16.16b, v15.16b
eor v15.16b, v15.16b, v25.16b
eor v16.16b, v16.16b, v26.16b
eor v17.16b, v17.16b, v27.16b
eor v18.16b, v18.16b, v28.16b
eor v19.16b, v19.16b, v29.16b
bic v25.16b, v22.16b, v21.16b
bic v26.16b, v23.16b, v22.16b
bic v27.16b, v24.16b, v23.16b
bic v28.16b, v20.16b, v24.16b
bic v29.16b, v21.16b, v20.16b
eor v20.16b, v20.16b, v25.16b
eor v21.16b, v21.16b, v26.16b
eor v22.16b, v22.16b, v27.16b
eor v23.16b, v23.16b, v28.16b
eor v24.16b, v24.16b, v29.16b
/* End of round: restore the table pointer and counter, fetch the next
 * 64-bit table entry (post-incrementing x28) and XOR it into word 0 of all
 * three states. subs sets the flags used by bne; the mov/eor instructions
 * in between do not modify them. */
# Done transforming
ldp x28, x1, [x29, #48]
ldr x0, [x28], #8
subs x1, x1, #1
mov v30.d[0], x0
mov v30.d[1], x0
eor x2, x2, x0
eor v0.16b, v0.16b, v30.16b
bne L_SHA3_shake256_blocksx3_seed_neon_begin
/* Write back. Each st4 stores one lane of four registers as four
 * consecutive 64-bit words, so six st4 plus one st1 emit 25 words.
 * Lane 0 goes to bytes 0-199 and lane 1 to bytes 200-399. */
ldr x0, [x29, #40]
st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
st1 {v24.d}[0], [x0]
add x0, x0, #8
st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
st1 {v24.d}[1], [x0]
add x0, x0, #8
/* x0 now points 400 bytes past the base: store the scalar state there. */
stp x2, x3, [x0]
stp x4, x5, [x0, #16]
stp x6, x7, [x0, #32]
stp x8, x9, [x0, #48]
stp x10, x11, [x0, #64]
stp x12, x13, [x0, #80]
stp x14, x15, [x0, #96]
stp x16, x17, [x0, #112]
stp x19, x20, [x0, #128]
stp x21, x22, [x0, #144]
stp x23, x24, [x0, #160]
stp x25, x26, [x0, #176]
str x27, [x0, #192]
/* Epilogue: restore saved registers and release the 224-byte (0xe0) frame. */
ldp x17, x19, [x29, #72]
ldp x20, x21, [x29, #88]
ldp x22, x23, [x29, #104]
ldp x24, x25, [x29, #120]
ldp x26, x27, [x29, #136]
ldr x28, [x29, #152]
ldp d8, d9, [x29, #160]
ldp d10, d11, [x29, #176]
ldp d12, d13, [x29, #192]
ldp d14, d15, [x29, #208]
ldp x29, x30, [sp], #0xe0
ret
#ifndef __APPLE__
.size mlkem_shake256_blocksx3_seed_neon,.-mlkem_shake256_blocksx3_seed_neon
#endif /* __APPLE__ */
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#endif /* WOLFSSL_WC_MLKEM */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#endif /* !WOLFSSL_ARMASM_INLINE */