pqc_kyber 0.7.0

A rust implementation of the post-quantum Kyber KEM algorithm
Documentation
%include "shuffle.inc"
%include "consts.inc"

%macro mul 4-8 15,15,2,2
vpmullw		 ymm12,ymm%1,ymm%5
vpmullw		 ymm13,ymm%2,ymm%5

vpmullw		 ymm14,ymm%3,ymm%6
vpmullw		 ymm15,ymm%4,ymm%6

vpmulhw		 ymm%1,ymm%1,ymm%7
vpmulhw		 ymm%2,ymm%2,ymm%7

vpmulhw		 ymm%3,ymm%3,ymm%8
vpmulhw		 ymm%4,ymm%4,ymm%8
%endmacro

%macro reduce 0
vpmulhw		ymm12,ymm12,ymm0
vpmulhw		ymm13,ymm13,ymm0

vpmulhw		ymm14,ymm14,ymm0
vpmulhw		ymm15,ymm15,ymm0
%endmacro

%macro update 9
vpaddw		 ymm%1,ymm%2,ymm%6
vpsubw		 ymm%6,ymm%2,ymm%6
vpaddw		 ymm%2,ymm%3,ymm%7

vpsubw		 ymm%7,ymm%3,ymm%7
vpaddw		 ymm%3,ymm%4,ymm%8
vpsubw		 ymm%8,ymm%4,ymm%8

vpaddw		 ymm%4,ymm%5,ymm%9
vpsubw		 ymm%9,ymm%5,ymm%9

vpsubw		 ymm%1,ymm%1,ymm12
vpaddw		 ymm%6,ymm%6,ymm12
vpsubw		 ymm%2,ymm%2,ymm13

vpaddw		 ymm%7,ymm%7,ymm13
vpsubw		 ymm%3,ymm%3,ymm14
vpaddw		 ymm%8,ymm%8,ymm14

vpsubw		 ymm%4,ymm%4,ymm15
vpaddw		 ymm%9,ymm%9,ymm15
%endmacro

%macro level0 1
vpbroadcastq		ymm15,[rsi+ (_ZETAS_EXP+0)*2]
vmovdqa		 ymm8,[rdi + (64*%1+  128)*2]
vmovdqa		 ymm9,[rdi + (64*%1+  144)*2]
vmovdqa		 ymm10,[rdi + (64*%1+  160)*2]
vmovdqa		 ymm11,[rdi + (64*%1+  176)*2]
vpbroadcastq		ymm2,[rsi+ (_ZETAS_EXP+4)*2]

mul		8,9,10,11

vmovdqa		 ymm4,[rdi + (64*%1+    0)*2]
vmovdqa		 ymm5,[rdi + (64*%1+   16)*2]
vmovdqa		 ymm6,[rdi + (64*%1+   32)*2]
vmovdqa		 ymm7,[rdi + (64*%1+   48)*2]

reduce
update		3,4,5,6,7,8,9,10,11

vmovdqa		[rdi + (64*%1+   0)*2],ymm3
vmovdqa		[rdi + (64*%1+  16)*2],ymm4
vmovdqa		[rdi + (64*%1+  32)*2],ymm5
vmovdqa		[rdi + (64*%1+  48)*2],ymm6
vmovdqa		[rdi + (64*%1+ 128)*2],ymm8
vmovdqa		[rdi + (64*%1+ 144)*2],ymm9
vmovdqa		[rdi + (64*%1+ 160)*2],ymm10
vmovdqa		[rdi + (64*%1+ 176)*2],ymm11
%endmacro

%macro levels1t6 1
;  level 1 
vmovdqa		ymm15,[rsi+ (_ZETAS_EXP+224*%1+16)*2]
vmovdqa		ymm8,[rdi + (128*%1+ 64)*2]
vmovdqa		ymm9,[rdi + (128*%1+ 80)*2]
vmovdqa		ymm10,[rdi + (128*%1+ 96)*2]
vmovdqa		ymm11,[rdi + (128*%1+ 112)*2]
vmovdqa		ymm2,[rsi+ (_ZETAS_EXP+224*%1+32)*2]

mul		8,9,10,11

vmovdqa		ymm4,[rdi + (128*%1+ 0)*2]
vmovdqa		ymm5,[rdi + (128*%1+ 16)*2]
vmovdqa		ymm6,[rdi + (128*%1+ 32)*2]
vmovdqa		ymm7,[rdi + (128*%1+ 48)*2]

reduce
update		3,4,5,6,7,8,9,10,11

;  level 2 
shuffle8	5,10,7,10
shuffle8	6,11,5,11

vmovdqa		ymm15,[rsi+ (_ZETAS_EXP+224*%1+48)*2]
vmovdqa		ymm2,[rsi+ (_ZETAS_EXP+224*%1+64)*2]

mul		7,10,5,11

shuffle8	3,8,6,8
shuffle8	4,9,3,9

reduce
update		4,6,8,3,9,7,10,5,11

;  level 3 
shuffle4	8,5,9,5
shuffle4	3,11,8,11

vmovdqa		ymm15,[rsi+ (_ZETAS_EXP+224*%1+80)*2]
vmovdqa		ymm2,[rsi+ (_ZETAS_EXP+224*%1+96)*2]

mul		9,5,8,11

shuffle4	4,7,3,7
shuffle4	6,10,4,10

reduce
update		6,3,7,4,10,9,5,8,11

;  level 4 
shuffle2	7,8,10,8
shuffle2	4,11,7,11

vmovdqa		ymm15,[rsi+ (_ZETAS_EXP+224*%1+112)*2]
vmovdqa		ymm2,[rsi+ (_ZETAS_EXP+224*%1+128)*2]

mul		10,8,7,11

shuffle2	6,9,4,9
shuffle2	3,5,6,5

reduce
update		3,4,9,6,5,10,8,7,11

;  level 5 
shuffle1	9,7,5,7
shuffle1	6,11,9,11

vmovdqa		ymm15,[rsi+ (_ZETAS_EXP+224*%1+144)*2]
vmovdqa		ymm2,[rsi+ (_ZETAS_EXP+224*%1+160)*2]

mul		5,7,9,11

shuffle1	3,10,6,10
shuffle1	4,8,3,8

reduce
update		4,6,10,3,8,5,7,9,11

;  level 6 
vmovdqa		ymm14,[rsi+ (_ZETAS_EXP+224*%1+176)*2]
vmovdqa		ymm15,[rsi+ (_ZETAS_EXP+224*%1+208)*2]
vmovdqa		ymm8,[rsi+ (_ZETAS_EXP+224*%1+192)*2]
vmovdqa		ymm2,[rsi+ (_ZETAS_EXP+224*%1+224)*2]

mul		10,3,9,11,14,15,8,2

reduce
update		8,4,6,5,7,10,3,9,11

vmovdqa		[rdi + (128*%1+ 0)*2],ymm8
vmovdqa		[rdi + (128*%1+ 16)*2],ymm4
vmovdqa		[rdi + (128*%1+ 32)*2],ymm10
vmovdqa		[rdi + (128*%1+ 48)*2],ymm3
vmovdqa		[rdi + (128*%1+ 64)*2],ymm6
vmovdqa		[rdi + (128*%1+ 80)*2],ymm5
vmovdqa		[rdi + (128*%1+ 96)*2],ymm9
vmovdqa		[rdi + (128*%1+ 112)*2],ymm11
%endmacro

SECTION .text
global ntt_avx
global _ntt_avx
ntt_avx:
_ntt_avx:
vmovdqa		ymm0,[rsi + _16XQ*2]

level0		0
level0		1

levels1t6	0
levels1t6	1

ret