; pqc_kyber 0.7.0
;
; A Rust implementation of the post-quantum Kyber KEM algorithm.
; Documentation
%include "shuffle.inc"
%include "fq.inc"
%include "consts.inc"

;-----------------------------------------------------------------------
; butterfly  rl0,rl1,rl2,rl3, rh0,rh1,rh2,rh3 [, zl0,zl1, zh0,zh1]
;
; All arguments are ymm register NUMBERS (they are pasted after "ymm").
;   %1..%4   rl0..rl3  "low"  operands, updated in place
;   %5..%8   rh0..rh3  "high" operands, updated in place
;   %9,%10   zl0,zl1   multipliers used with vpmullw (default 2,2)
;   %11,%12  zh0,zh1   multipliers used with vpmulhw (default 3,3)
; Pairs 0 and 1 use zl0/zh0; pairs 2 and 3 use zl1/zh1.
;
; For each pair (rl, rh), on 16 packed 16-bit words:
;   t  = rh - rl
;   rl = rl + rh
;   rh = mulhi(t, zh) - mulhi(mullo(t, zl), ymm0)
; where mullo/mulhi are the low / signed-high 16 bits of the product.
; NOTE(review): this has the shape of a Montgomery multiplication by a
; twiddle factor, assuming ymm0 holds the modulus and zl is zh times the
; inverse of the modulus mod 2^16 -- confirm against the constant table.
;
; Reads:   ymm0 (must be set up by the caller).
; Scratch: ymm12, ymm13, ymm14, ymm15 are overwritten.
;
; The instruction order is significant. ymm15 is first written by the
; fourth subtraction, which comes after both reads of %9, so a caller
; may pass 15 as %9 (the level-0 invocation in intt_levels0t5 does
; exactly that). Do not regroup the instructions without re-checking
; every call site.
;-----------------------------------------------------------------------
%macro butterfly 8-12 2,2,3,3
; pair 0: difference into scratch, sum in place; pair 1 difference.
vpsubw   ymm12,ymm%5,ymm%1
vpaddw   ymm%1,ymm%1,ymm%5
vpsubw   ymm13,ymm%6,ymm%2

; low product for pair 0 (first read of arg 9), pair 1 sum, pair 2 difference.
vpmullw   ymm%5,ymm12,ymm%9
vpaddw    ymm%2,ymm%2,ymm%6
vpsubw    ymm14,ymm%7,ymm%3

; low product for pair 1 (last read of arg 9), pair 2 sum, then the
; pair 3 difference, which is the first write to ymm15.
vpmullw   ymm%6,ymm13,ymm%9
vpaddw    ymm%3,ymm%3,ymm%7
vpsubw    ymm15,ymm%8,ymm%4

; low products for pairs 2 and 3 (arg 10), pair 3 sum.
vpmullw   ymm%7,ymm14,ymm%10
vpaddw    ymm%4,ymm%4,ymm%8
vpmullw   ymm%8,ymm15,ymm%10

; high products of the differences: pairs 0,1 with arg 11 ...
vpmulhw   ymm12,ymm12,ymm%11
vpmulhw   ymm13,ymm13,ymm%11

; ... and pairs 2,3 with arg 12.
vpmulhw   ymm14,ymm14,ymm%12
vpmulhw   ymm15,ymm15,ymm%12

; high half of (low product * ymm0) for each pair.
vpmulhw   ymm%5,ymm%5,ymm0

vpmulhw   ymm%6,ymm%6,ymm0

vpmulhw   ymm%7,ymm%7,ymm0
vpmulhw   ymm%8,ymm%8,ymm0

; final high operand = high product - correction term.
vpsubw    ymm%5,ymm12,ymm%5

vpsubw    ymm%6,ymm13,ymm%6

vpsubw    ymm%7,ymm14,ymm%7
vpsubw    ymm%8,ymm15,ymm%8
%endmacro

;-----------------------------------------------------------------------
; intt_levels0t5  blk
;
; Runs butterfly levels 0 through 5, in place, over one 128-word block
; of the 16-bit buffer at rdi. The argument (0 or 1 in this file)
; selects words [128*blk, 128*blk + 128).
;
; Reads:  rdi (data), rsi (constant table), ymm0 (loaded by the entry
;         point before the first expansion; consumed inside butterfly).
; Writes: the 128 selected words, and ymm1-ymm15.
;
; Zeta vectors are fetched from rsi at word offset
; _ZETAS_EXP + (1-blk)*224 + k, so block 0 uses the group starting at
; +224 and block 1 the group starting at +0.
;
; All memory accesses use vmovdqa or memory-operand permutes on 32-byte
; vectors; vmovdqa requires the addresses to be 32-byte aligned.
;
; fqmulprecomp, red16 and shuffle1/2/4/8 are not defined in this file
; (presumably they come from fq.inc / shuffle.inc). Notes about them
; below are assumptions to confirm against those files.
;-----------------------------------------------------------------------
%macro intt_levels0t5 1
;  level 0
; ymm2/ymm3 = the constant vectors stored at _16XFLO / _16XFHI.
vmovdqa   ymm2,[rsi + _16XFLO*2]
vmovdqa   ymm3,[rsi + _16XFHI*2]

; First half of the block: vectors at word offsets 0, 32, 16, 48.
vmovdqa   ymm4,[rdi + (128*%1+  0)*2]
vmovdqa   ymm6,[rdi + (128*%1+  32)*2]
vmovdqa   ymm5,[rdi + (128*%1+  16)*2]
vmovdqa   ymm7,[rdi + (128*%1+  48)*2]

; NOTE(review): fqmulprecomp presumably multiplies the third register by
; the constant pair held in ymm2/ymm3 -- confirm in fq.inc.
fqmulprecomp  2,3,4
fqmulprecomp  2,3,6
fqmulprecomp  2,3,5
fqmulprecomp  2,3,7

; Second half of the block: vectors at word offsets 64, 96, 80, 112.
vmovdqa   ymm8,[rdi + (128*%1+  64)*2]
vmovdqa   ymm10,[rdi + (128*%1+  96)*2]
vmovdqa   ymm9,[rdi + (128*%1+  80)*2]
vmovdqa   ymm11,[rdi + (128*%1+  112)*2]

fqmulprecomp  2,3,8
fqmulprecomp  2,3,10
fqmulprecomp  2,3,9
fqmulprecomp  2,3,11

; Four zeta vectors for this level. vpermq 0x4E swaps the two 128-bit
; halves; vpshufb with the _REVIDXB mask then permutes bytes within each
; half (presumably completing a reversal of the 16 words -- confirm
; against the mask contents).
vpermq ymm15,[rsi + (_ZETAS_EXP+(1-%1)*224+208)*2],04Eh
vpermq ymm1,[rsi + (_ZETAS_EXP+(1-%1)*224+176)*2],04Eh
vpermq ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+224)*2],04Eh
vpermq ymm3,[rsi + (_ZETAS_EXP+(1-%1)*224+192)*2],04Eh
vmovdqa   ymm12,[rsi + _REVIDXB*2]
vpshufb   ymm15,ymm15,ymm12
vpshufb   ymm1,ymm1,ymm12
vpshufb   ymm2,ymm2,ymm12
vpshufb   ymm3,ymm3,ymm12

; Pairs (4,6) (5,7) (8,10) (9,11). ymm15/ymm1 are the vpmullw zetas and
; ymm2/ymm3 the vpmulhw zetas. ymm15 is also butterfly scratch; this is
; safe because butterfly writes ymm15 only after its last read of the
; ninth argument.
butterfly  4,5,8,9,6,7,10,11,15,1,2,3

;  level 1
; Two zeta vectors, permuted the same way as at level 0; ymm1 holds the
; byte-shuffle mask here.
vpermq ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+144)*2],04Eh
vpermq ymm3,[rsi + (_ZETAS_EXP+(1-%1)*224+160)*2],04Eh
vmovdqa   ymm1,[rsi + _REVIDXB*2]
vpshufb   ymm2,ymm2,ymm1
vpshufb   ymm3,ymm3,ymm1

; Pairs (4,8) (5,9) (6,10) (7,11); all four share ymm2 / ymm3.
butterfly  4,5,6,7,8,9,10,11,2,2,3,3

; NOTE(review): shuffleN a,b,c,d presumably recombines registers a and b
; into c and d -- confirm in shuffle.inc. The next butterfly consumes
; registers 3,5,4,7,6,9,8,11, i.e. the last two arguments of each call.
shuffle1  4,5,3,5
shuffle1  6,7,4,7
shuffle1  8,9,6,9
shuffle1  10,11,8,11

;  level 2
; Zetas are dword-permuted with the index vector stored at _REVIDXD.
vmovdqa   ymm12,[rsi + _REVIDXD*2]
vpermd    ymm2,ymm12,[rsi + (_ZETAS_EXP+(1-%1)*224+112)*2]
vpermd    ymm10,ymm12,[rsi + (_ZETAS_EXP+(1-%1)*224+128)*2]

; Pairs (3,5) (4,7) (6,9) (8,11); vpmullw zeta in ymm2, vpmulhw zeta in ymm10.
butterfly  3,4,6,8,5,7,9,11,2,2,10,10

; ymm1 = constant vector at _16XV. NOTE(review): red16 presumably uses
; ymm1 to reduce the named register -- confirm in fq.inc. No instruction
; visible later in this macro writes ymm1 again.
vmovdqa    ymm1,[rsi + _16XV*2]
red16    3

shuffle2  3,4,10,4
shuffle2  6,8,3,8
shuffle2  5,7,6,7
shuffle2  9,11,5,11

;  level 3
; vpermq 0x1B reverses the order of the four qwords of each zeta vector.
vpermq ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+80)*2],01Bh
vpermq ymm9,[rsi + (_ZETAS_EXP+(1-%1)*224+96)*2],01Bh

; Pairs (10,4) (3,8) (6,7) (5,11); zetas in ymm2 (vpmullw) and ymm9 (vpmulhw).
butterfly  10,3,6,5,4,8,7,11,2,2,9,9

shuffle4  10,3,9,3
shuffle4  6,5,10,5
shuffle4  4,8,6,8
shuffle4  7,11,4,11

;  level 4
; vpermq 0x4E swaps the two 128-bit halves of each zeta vector.
vpermq ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+48)*2],04Eh
vpermq ymm7,[rsi + (_ZETAS_EXP+(1-%1)*224+64)*2],04Eh

; Pairs (9,3) (10,5) (6,8) (4,11); zetas in ymm2 (vpmullw) and ymm7 (vpmulhw).
butterfly 9,10,6,4,3,5,8,11,2,2,7,7

red16    9

shuffle8  9,10,7,10
shuffle8  6,4,9,4
shuffle8  3,5,6,5
shuffle8  8,11,3,11

;  level 5
; Zeta vectors are used exactly as stored (no permutation).
vmovdqa    ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+16)*2]
vmovdqa    ymm8,[rsi + (_ZETAS_EXP+(1-%1)*224+32)*2]

; Pairs (7,10) (9,4) (6,5) (3,11); zetas in ymm2 (vpmullw) and ymm8 (vpmulhw).
butterfly  7,9,6,3,10,4,5,11,2,2,8,8

; Write the block back: the four low operands go to word offsets 0..48,
; the four high operands to word offsets 64..112.
vmovdqa    [rdi + (128*%1 + 0)*2],ymm7
vmovdqa    [rdi + (128*%1 + 16)*2],ymm9
vmovdqa    [rdi + (128*%1 + 32)*2],ymm6
vmovdqa    [rdi + (128*%1 + 48)*2],ymm3
vmovdqa    [rdi + (128*%1 + 64)*2],ymm10
vmovdqa    [rdi + (128*%1 + 80)*2],ymm4
vmovdqa    [rdi + (128*%1 + 96)*2],ymm5
vmovdqa    [rdi + (128*%1 + 112)*2],ymm11
%endmacro

;-----------------------------------------------------------------------
; intt_level6  part
;
; Level-6 butterflies, in place, between the 64 words starting at word
; offset 64*part of the buffer at rdi and the 64 words lying 128 words
; further on. The argument is 0 or 1 in this file.
;
; Zetas: the qword at _ZETAS_EXP+0 is broadcast into ymm2 and the qword
; at _ZETAS_EXP+4 into ymm3, which are butterfly's default multiplier
; registers. ymm0 must already hold the value butterfly expects.
;
; For argument 0 only, red16 is applied to ymm4 before the store.
; NOTE(review): red16 is defined outside this file; it presumably relies
; on ymm1 still holding the _16XV vector loaded earlier -- confirm.
;
; Writes: the 128 words touched, ymm2-ymm11, and butterfly's scratch
;         registers ymm12-ymm15.
;-----------------------------------------------------------------------
%macro intt_level6 1
        ; low operands: four consecutive vectors
        vmovdqa         ymm4,  [rdi + (64*%1 +   0)*2]
        vmovdqa         ymm5,  [rdi + (64*%1 +  16)*2]
        vmovdqa         ymm6,  [rdi + (64*%1 +  32)*2]
        vmovdqa         ymm7,  [rdi + (64*%1 +  48)*2]

        ; high operands: the matching vectors 128 words later
        vmovdqa         ymm8,  [rdi + (64*%1 + 128)*2]
        vmovdqa         ymm9,  [rdi + (64*%1 + 144)*2]
        vmovdqa         ymm10, [rdi + (64*%1 + 160)*2]
        vmovdqa         ymm11, [rdi + (64*%1 + 176)*2]

        ; one zeta pair, replicated across every qword
        vpbroadcastq    ymm2,  [rsi + (_ZETAS_EXP + 0)*2]
        vpbroadcastq    ymm3,  [rsi + (_ZETAS_EXP + 4)*2]

        ; pairs (4,8) (5,9) (6,10) (7,11) with the default zeta registers
        butterfly       4,5,6,7,8,9,10,11

%if %1 == 0
        red16           4
%endif

        ; store the low operands, then the high operands
        vmovdqa         [rdi + (64*%1 +   0)*2], ymm4
        vmovdqa         [rdi + (64*%1 +  16)*2], ymm5
        vmovdqa         [rdi + (64*%1 +  32)*2], ymm6
        vmovdqa         [rdi + (64*%1 +  48)*2], ymm7
        vmovdqa         [rdi + (64*%1 + 128)*2], ymm8
        vmovdqa         [rdi + (64*%1 + 144)*2], ymm9
        vmovdqa         [rdi + (64*%1 + 160)*2], ymm10
        vmovdqa         [rdi + (64*%1 + 176)*2], ymm11
%endmacro

;-----------------------------------------------------------------------
; invntt_avx / _invntt_avx
;
; Syntax: NASM (Intel operand order), x86-64 with AVX2.
; The two labels name the same entry point (with and without a leading
; underscore).
;
; In:    rdi = buffer of 256 16-bit words, transformed in place
;        rsi = constant table, read at word offsets such as _16XQ and
;              _ZETAS_EXP (symbols not defined in this file)
;        Using rdi/rsi for the first two arguments is consistent with
;        the System V AMD64 calling convention.
; Out:   the buffer at rdi is overwritten; no return value is set here.
; Clobb: ymm0-ymm15. No vzeroupper is issued before returning.
; Align: memory is accessed with vmovdqa, so the buffer and the table
;        vectors must be 32-byte aligned.
;
; Sequence: levels 0-5 on each 128-word block, then level 6 on each of
; the two interleaved parts.
;-----------------------------------------------------------------------
SECTION .text

global  invntt_avx
global  _invntt_avx

invntt_avx:
_invntt_avx:
        ; ymm0 stays live for every butterfly expansion below
        vmovdqa         ymm0, [rsi + _16XQ*2]

        intt_levels0t5  0
        intt_levels0t5  1

        intt_level6     0
        intt_level6     1

        ret