/*
 * pqc_kyber 0.2.1
 *
 * A Rust implementation of the post-quantum Kyber KEM algorithm.
 * Documentation
 */
/*
 * shuffle8 r0,r1,r2,r3 -- regroup two registers at 128-bit granularity.
 *
 * Arguments are ymm register numbers.
 *   ymm r2 = { low  128 bits of ymm r0, low  128 bits of ymm r1 }
 *   ymm r3 = { high 128 bits of ymm r0, high 128 bits of ymm r1 }
 *
 * r3 may name the same register as r1 (every invocation in this file does
 * so): r1 is read for the last time by the instruction that writes r3.
 * r2 must differ from r0 and r1, and the two instructions must stay in
 * this order for the aliasing to remain safe.
 */
.macro shuffle8 r0,r1,r2,r3
vperm2i128	$0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128	$0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm

/*
 * shuffle4 r0,r1,r2,r3 -- regroup two registers at 64-bit granularity.
 *
 * Arguments are ymm register numbers. Within each 128-bit lane:
 *   ymm r2 = { low  quadword of ymm r0, low  quadword of ymm r1 }
 *   ymm r3 = { high quadword of ymm r0, high quadword of ymm r1 }
 *
 * r3 may name the same register as r1 (as every invocation in this file
 * does); r2 must differ from r0 and r1. Do not swap the two instructions.
 */
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq	%ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq	%ymm\r1,%ymm\r0,%ymm\r3
.endm

/*
 * shuffle2 r0,r1,r2,r3 -- regroup two registers at 32-bit granularity.
 *
 * Arguments are ymm register numbers. Numbering the dwords of a register
 * d0..d7:
 *   ymm r2 = { r0.d0, r1.d0, r0.d2, r1.d2, r0.d4, r1.d4, r0.d6, r1.d6 }
 *   ymm r3 = { r0.d1, r1.d1, r0.d3, r1.d3, r0.d5, r1.d5, r0.d7, r1.d7 }
 *
 * Side effect: ymm r0 is overwritten (its odd dwords are shifted down into
 * the even positions). r3 may name the same register as r1, as every
 * invocation in this file does; r2 must differ from r0 and r1.
 *
 * The two commented-out instructions are equivalent alternatives for the
 * instruction that follows each of them.
 */
.macro shuffle2 r0,r1,r2,r3
#vpsllq		$32,%ymm\r1,%ymm\r2
/* r2 = even dwords of r1, each duplicated into the odd slot above it */
vmovsldup	%ymm\r1,%ymm\r2
/* keep even dwords from r0, take odd dwords (mask 0xAA) from the copy */
vpblendd	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2
/* move the odd dwords of r0 down into the even slots (clobbers r0) */
vpsrlq		$32,%ymm\r0,%ymm\r0
#vmovshdup	%ymm\r0,%ymm\r0
/* even slots from shifted r0, odd slots straight from r1 */
vpblendd	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

/*
 * shuffle1 r0,r1,r2,r3 -- regroup two registers at 16-bit granularity.
 *
 * Arguments are ymm register numbers. Numbering the words of a register
 * w0..w15:
 *   ymm r2 = { r0.w0, r1.w0, r0.w2, r1.w2, ... , r0.w14, r1.w14 }
 *   ymm r3 = { r0.w1, r1.w1, r0.w3, r1.w3, ... , r0.w15, r1.w15 }
 *
 * Side effect: ymm r0 is overwritten (its odd words are shifted down into
 * the even positions). r3 may name the same register as r1, as every
 * invocation in this file does; r2 must differ from r0 and r1.
 */
.macro shuffle1 r0,r1,r2,r3
/* r2 = even words of r1 moved up into the odd word slots */
vpslld		$16,%ymm\r1,%ymm\r2
/* even words from r0, odd words (mask 0xAA) from the shifted copy */
vpblendw	$0xAA,%ymm\r2,%ymm\r0,%ymm\r2
/* move the odd words of r0 down into the even slots (clobbers r0) */
vpsrld		$16,%ymm\r0,%ymm\r0
/* even slots from shifted r0, odd slots straight from r1 */
vpblendw	$0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

/*
 * red16 r, rs=0, x=12 -- reduce the sixteen signed words of ymm r in place.
 *
 *   r   ymm register number holding the values to reduce (updated in place)
 *   rs  optional ymm register number; when non-zero the quotient estimate
 *       is scaled with vpmulhrsw by that register instead of being
 *       shifted right arithmetically by 10
 *   x   scratch ymm register number (default 12), overwritten
 *
 * Computation:
 *   x = high16(r * ymm1)
 *   x = (rs given) ? mulhrs(x, ymm rs) : x >> 10
 *   r = r - low16(x * ymm0)
 *
 * Reads ymm0 and ymm1 as implicit inputs.
 * NOTE(review): this has the shape of a Barrett reduction with the modulus
 * in ymm0 and its precomputed multiplier in ymm1 -- confirm against the
 * constants table.
 */
.macro red16 r,rs=0,x=12
	vpmulhw		%ymm1,%ymm\r,%ymm\x
.if \rs == 0
	/* default path: plain arithmetic shift of the quotient estimate */
	vpsraw		$10,%ymm\x,%ymm\x
.else
	/* caller supplied a rounding multiplier register */
	vpmulhrsw	%ymm\rs,%ymm\x,%ymm\x
.endif
	vpmullw		%ymm0,%ymm\x,%ymm\x
	vpsubw		%ymm\x,%ymm\r,%ymm\r
.endm

/*
 * csubq r, x=12 -- conditional subtraction of ymm0 from ymm r, lane by lane.
 *
 *   r   ymm register number, updated in place
 *   x   scratch ymm register number (default 12), overwritten
 *
 * ymm0 is subtracted from every 16-bit lane unconditionally; the caddq
 * macro defined in this file then adds ymm0 back to each lane whose
 * result came out negative. The expansion is the same four instructions
 * as writing the sequence out by hand.
 */
.macro csubq r,x=12
	vpsubw		%ymm0,%ymm\r,%ymm\r
	caddq		\r,\x
.endm

/*
 * caddq r, x=12 -- add ymm0 to every negative 16-bit lane of ymm r.
 *
 *   r   ymm register number, updated in place
 *   x   scratch ymm register number (default 12), overwritten
 *
 * Lanes that are already non-negative are left unchanged.
 */
.macro caddq r,x=12
/* x = all-ones in lanes where r is negative, zero elsewhere */
vpsraw		$15,%ymm\r,%ymm\x
/* x = ymm0 in the negative lanes, zero elsewhere */
vpand		%ymm0,%ymm\x,%ymm\x
vpaddw		%ymm\x,%ymm\r,%ymm\r
.endm

/*
 * fqmulprecomp al,ah,b,x=12 -- multiply ymm b in place by a constant that
 * is supplied as a pair of registers, reducing with ymm0.
 *
 *   al, ah  ymm register numbers holding the two halves of the constant
 *   b       ymm register number: input and result
 *   x       scratch ymm register number (default 12), overwritten
 *
 * Computation per 16-bit lane:
 *   x = low16 (b * al)
 *   b = high16(b * ah)
 *   x = high16(x * ymm0)
 *   b = b - x
 *
 * NOTE(review): this has the shape of a signed Montgomery multiplication
 * with the modulus in ymm0 and al holding the constant pre-multiplied by
 * the inverse of the modulus -- confirm against the constants table.
 */
.macro fqmulprecomp al,ah,b,x=12
/* must read b before the next instruction overwrites it */
vpmullw		%ymm\al,%ymm\b,%ymm\x
vpmulhw		%ymm\ah,%ymm\b,%ymm\b
vpmulhw		%ymm0,%ymm\x,%ymm\x
vpsubw		%ymm\x,%ymm\b,%ymm\b
.endm

/*
 * butterfly rl0..rl3, rh0..rh3, zl0=2, zl1=2, zh0=3, zh1=3
 *
 * Four interleaved butterflies on the register pairs (rl_i, rh_i).
 * All arguments are ymm register numbers.
 *
 *   rl0..rl3  first operands; each receives the sum rl_i + rh_i
 *   rh0..rh3  second operands; each receives the reduced product of the
 *             difference (rh_i - rl_i) with a twiddle factor
 *   zl0, zh0  twiddle register pair used for pairs 0 and 1
 *   zl1, zh1  twiddle register pair used for pairs 2 and 3
 *
 * Per pair, per 16-bit lane:
 *   t    = rh - rl
 *   rl   = rl + rh
 *   rh   = high16(t * zh) - high16(low16(t * zl) * ymm0)
 *
 * Implicit input: ymm0. Scratch: ymm12..ymm15 are overwritten with the
 * four differences.
 *
 * The instruction order is significant. The macro is invoked in this file
 * with zl0 = 15; that only works because both reads of zl0 happen before
 * the fourth difference is written to ymm15. None of the other twiddle
 * arguments may name ymm12..ymm15.
 *
 * NOTE(review): the product step matches the fqmulprecomp sequence, i.e.
 * presumably a Montgomery multiplication by the twiddle -- confirm.
 */
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
/* differences go to ymm12..15, sums replace rl; low products start early */
vpsubw		%ymm\rl0,%ymm\rh0,%ymm12
vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rl0
vpsubw		%ymm\rl1,%ymm\rh1,%ymm13

vpmullw		%ymm\zl0,%ymm12,%ymm\rh0
vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw		%ymm\rl2,%ymm\rh2,%ymm14

/* last read of zl0: ymm15 is overwritten two instructions further down */
vpmullw		%ymm\zl0,%ymm13,%ymm\rh1
vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl2
vpsubw		%ymm\rl3,%ymm\rh3,%ymm15

vpmullw		%ymm\zl1,%ymm14,%ymm\rh2
vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw		%ymm\zl1,%ymm15,%ymm\rh3

/* high halves of difference * twiddle, kept in the scratch registers */
vpmulhw		%ymm\zh0,%ymm12,%ymm12
vpmulhw		%ymm\zh0,%ymm13,%ymm13

vpmulhw		%ymm\zh1,%ymm14,%ymm14
vpmulhw		%ymm\zh1,%ymm15,%ymm15

/* high halves of (low product * ymm0) */
vpmulhw		%ymm0,%ymm\rh0,%ymm\rh0

vpmulhw		%ymm0,%ymm\rh1,%ymm\rh1

vpmulhw		%ymm0,%ymm\rh2,%ymm\rh2
vpmulhw		%ymm0,%ymm\rh3,%ymm\rh3

#

#

/* final subtraction leaves the reduced products in rh0..rh3 */
vpsubw		%ymm\rh0,%ymm12,%ymm\rh0

vpsubw		%ymm\rh1,%ymm13,%ymm\rh1

vpsubw		%ymm\rh2,%ymm14,%ymm\rh2
vpsubw		%ymm\rh3,%ymm15,%ymm\rh3
.endm

/*
 * intt_levels0t5 off -- run the six stages labelled level 0 .. level 5 on
 * one half of the coefficient array, in place.
 *
 *   off   0 or 1: selects the 128 words starting at word 128*off of the
 *         array at rdi
 *
 * In:   rdi  = array of 16-bit words, accessed with aligned 32-byte moves
 *       rsi  = table of 16-bit constants (offsets below are word indices,
 *              scaled by 2 in the addressing expressions)
 *       ymm0 = implicit operand of fqmulprecomp, butterfly and red16;
 *              must be set up by the caller
 * Out:  the eight vectors of the selected half are rewritten
 * Uses: ymm1..ymm15 are all overwritten. On exit ymm1 still holds the
 *       vector loaded from word 32 of the table.
 *
 * Twiddle vectors come from the table region starting at word 160:
 * off=0 reads the 224-word group at 160+224, off=1 the group at 160.
 * Within a group the levels walk downwards through the offsets
 * (level 0 uses +176..+224, level 5 uses +16 and +32).
 *
 * Between levels the shuffleN macros regroup the data, so the register
 * numbers passed to each butterfly change from level to level.
 */
.macro intt_levels0t5 off
/* level 0 */
/* constant pair (low/high halves) for the scaling multiply below */
vmovdqa		48*2(%rsi),%ymm2
vmovdqa		64*2(%rsi),%ymm3

vmovdqa         (128*\off+  0)*2(%rdi),%ymm4
vmovdqa         (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa         (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa         (128*\off+ 48)*2(%rdi),%ymm7

/* multiply every input vector by the constant in ymm2/ymm3 */
fqmulprecomp	2,3,4
fqmulprecomp	2,3,6
fqmulprecomp	2,3,5
fqmulprecomp	2,3,7

vmovdqa         (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa         (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa         (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa         (128*\off+112)*2(%rdi),%ymm11

fqmulprecomp	2,3,8
fqmulprecomp	2,3,10
fqmulprecomp	2,3,9
fqmulprecomp	2,3,11

/*
 * Four twiddle vectors: vpermq 0x4E swaps the two 128-bit halves, then
 * vpshufb applies the byte permutation stored at word 128 of the table.
 * NOTE(review): together these presumably reverse the element order so a
 * table laid out for the forward transform can be reused -- confirm.
 */
vpermq		$0x4E,(160+(1-\off)*224+208)*2(%rsi),%ymm15
vpermq		$0x4E,(160+(1-\off)*224+176)*2(%rsi),%ymm1
vpermq		$0x4E,(160+(1-\off)*224+224)*2(%rsi),%ymm2
vpermq		$0x4E,(160+(1-\off)*224+192)*2(%rsi),%ymm3
vmovdqa		128*2(%rsi),%ymm12
vpshufb		%ymm12,%ymm15,%ymm15
vpshufb		%ymm12,%ymm1,%ymm1
vpshufb		%ymm12,%ymm2,%ymm2
vpshufb		%ymm12,%ymm3,%ymm3

/* pairs (4,6) (5,7) use twiddles ymm15/ymm2, pairs (8,10) (9,11) use ymm1/ymm3 */
butterfly	4,5,8,9,6,7,10,11,15,1,2,3

/* level 1 */
vpermq		$0x4E,(160+(1-\off)*224+144)*2(%rsi),%ymm2
vpermq		$0x4E,(160+(1-\off)*224+160)*2(%rsi),%ymm3
/* the byte permutation is reloaded: the butterfly above overwrote ymm12 */
vmovdqa		128*2(%rsi),%ymm1
vpshufb		%ymm1,%ymm2,%ymm2
vpshufb		%ymm1,%ymm3,%ymm3

/* pairs (4,8) (5,9) (6,10) (7,11), default twiddle registers ymm2/ymm3 */
butterfly	4,5,6,7,8,9,10,11,2,2,3,3

/* interleave at word granularity; data now lives in 3,5,4,7,6,9,8,11 */
shuffle1	4,5,3,5
shuffle1	6,7,4,7
shuffle1	8,9,6,9
shuffle1	10,11,8,11

/* level 2 */
/* ymm12 = dword index vector; vpermd reorders the twiddles read from memory */
vmovdqa		144*2(%rsi),%ymm12
vpermd		(160+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
vpermd		(160+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10

butterfly	3,4,6,8,5,7,9,11,2,2,10,10

/* ymm1 = multiplier read by red16; only ymm3 is reduced at this level */
vmovdqa		32*2(%rsi),%ymm1
red16		3

/* interleave at dword granularity; data now lives in 10,4,3,8,6,7,5,11 */
shuffle2	3,4,10,4
shuffle2	6,8,3,8
shuffle2	5,7,6,7
shuffle2	9,11,5,11

/* level 3 */
/* vpermq 0x1B reverses the order of the four quadwords */
vpermq		$0x1B,(160+(1-\off)*224+80)*2(%rsi),%ymm2
vpermq		$0x1B,(160+(1-\off)*224+96)*2(%rsi),%ymm9

butterfly	10,3,6,5,4,8,7,11,2,2,9,9

/* interleave at quadword granularity; data now lives in 9,3,10,5,6,8,4,11 */
shuffle4	10,3,9,3
shuffle4	6,5,10,5
shuffle4	4,8,6,8
shuffle4	7,11,4,11

/* level 4 */
/* vpermq 0x4E swaps the two 128-bit halves */
vpermq		$0x4E,(160+(1-\off)*224+48)*2(%rsi),%ymm2
vpermq		$0x4E,(160+(1-\off)*224+64)*2(%rsi),%ymm7

butterfly	9,10,6,4,3,5,8,11,2,2,7,7

/* ymm1 still holds the multiplier loaded at level 2; only ymm9 is reduced */
red16		9

/* interleave at 128-bit granularity; data now lives in 7,10,9,4,6,5,3,11 */
shuffle8	9,10,7,10
shuffle8	6,4,9,4
shuffle8	3,5,6,5
shuffle8	8,11,3,11

/* level5 */
/* twiddles are used as stored, no permutation at this level */
vmovdqa		(160+(1-\off)*224+16)*2(%rsi),%ymm2
vmovdqa		(160+(1-\off)*224+32)*2(%rsi),%ymm8

butterfly	7,9,6,3,10,4,5,11,2,2,8,8

/* write the eight result vectors back over the input half */
vmovdqa         %ymm7,(128*\off+  0)*2(%rdi)
vmovdqa         %ymm9,(128*\off+ 16)*2(%rdi)
vmovdqa         %ymm6,(128*\off+ 32)*2(%rdi)
vmovdqa         %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa         %ymm10,(128*\off+ 64)*2(%rdi)
vmovdqa         %ymm4,(128*\off+ 80)*2(%rdi)
vmovdqa         %ymm5,(128*\off+ 96)*2(%rdi)
vmovdqa         %ymm11,(128*\off+112)*2(%rdi)
.endm

/*
 * intt_level6 off -- final stage: butterflies between the lower and upper
 * 128-word halves of the array at rdi, 64 words per invocation.
 *
 *   off   0 or 1: selects words 64*off .. 64*off+63 of the lower half and
 *         the words 128 positions above them in the upper half
 *
 * In:   rdi  = array of 16-bit words (aligned 32-byte accesses)
 *       rsi  = constants table; the twiddle pair is broadcast from the
 *              four words at 160 and the four words at 164
 *       ymm0 = implicit operand of butterfly and red16
 *       ymm1 = read by red16 when off == 0; within this file it is left
 *              holding word 32 of the table by intt_levels0t5
 * Out:  the eight vectors are rewritten in place
 * Uses: ymm2..ymm15 overwritten
 *
 * Only the first sum vector (ymm4) gets an extra red16, and only for
 * off == 0.
 */
.macro intt_level6 off
/* level 6 */
	/* twiddle pair for the default zl/zh registers of butterfly */
	vpbroadcastq	(160+0)*2(%rsi),%ymm2
	vpbroadcastq	(160+4)*2(%rsi),%ymm3

	/* four vectors from the lower half */
	vmovdqa		(64*\off+  0)*2(%rdi),%ymm4
	vmovdqa		(64*\off+ 16)*2(%rdi),%ymm5
	vmovdqa		(64*\off+ 32)*2(%rdi),%ymm6
	vmovdqa		(64*\off+ 48)*2(%rdi),%ymm7

	/* their partners, 128 words further on */
	vmovdqa		(64*\off+128)*2(%rdi),%ymm8
	vmovdqa		(64*\off+144)*2(%rdi),%ymm9
	vmovdqa		(64*\off+160)*2(%rdi),%ymm10
	vmovdqa		(64*\off+176)*2(%rdi),%ymm11

	butterfly	4,5,6,7,8,9,10,11

.if \off == 0
	red16		4
.endif

	/* sums back to the lower half */
	vmovdqa		%ymm4,(64*\off+  0)*2(%rdi)
	vmovdqa		%ymm5,(64*\off+ 16)*2(%rdi)
	vmovdqa		%ymm6,(64*\off+ 32)*2(%rdi)
	vmovdqa		%ymm7,(64*\off+ 48)*2(%rdi)

	/* twiddled differences back to the upper half */
	vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
	vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
	vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
	vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
.endm

/*
 * invntt_avx -- exported entry point; transforms 256 16-bit words in place.
 *
 * In:    rdi = array of 256 16-bit words, read and written with aligned
 *              32-byte moves (words 0..255 are touched)
 *        rsi = table of 16-bit constants, read with aligned 32-byte moves;
 *              the highest word read is index 623
 * Out:   array at rdi overwritten; no value is returned in a register
 * Clobb: ymm0..ymm15 (ymm0 holds word 0 of the table throughout).
 *        No vzeroupper is issued before ret.
 *
 * NOTE(review): argument registers match the first two integer arguments
 * of the System V AMD64 convention -- presumably called from C/Rust as
 * invntt_avx(coeffs, table); confirm against the declaring prototype.
 *
 * Flow: each 128-word half goes through levels 0..5, then level 6 joins
 * the two halves, 64 words per pass.
 */
	.text
	.globl	invntt_avx
invntt_avx:
	vmovdqa		0*2(%rsi),%ymm0

	.irp	half,0,1
	intt_levels0t5	\half
	.endr

	.irp	blk,0,1
	intt_level6	\blk
	.endr

	ret