; pqc_kyber 0.7.0
;
; A Rust implementation of the post-quantum Kyber KEM algorithm
; Documentation
%include "consts.inc"

;-----------------------------------------------------------------------
; schoolbook <blk>
;
; Straight-line AVX2 kernel for one block of 64 sixteen-bit words
; (four 32-byte vectors) per array.  The macro argument selects the
; block: block <blk> starts 64*<blk> words into each array.
;
; In:    rsi = first operand array   (vectors named a0, b0, a1, b1)
;        rdx = second operand array  (vectors named c0, d0, c1, d1)
;        rdi = result array
;        rcx = constant table; the vectors at word offsets _16XQINV
;              and _16XQ are read
;        r9  = two consecutive 32-byte multiplier vectors
;              ([r9] and [r9 + 32]), called "r" below
;        rsp = 32-byte aligned, 32 writable bytes at [rsp] (spill slot)
; Out:   result vector 0 = a0c0 + r*b0d0
;        result vector 1 = a0d0 + b0c0
;        result vector 2 = a1c1 - r*b1d1
;        result vector 3 = a1d1 + b1c1
; Clobb: ymm0-ymm15, the 32 bytes at [rsp]
;
; Every product x*y is formed as
;        mulhi(x, y) - mulhi(mullo(mullo(x, QINV), y), Q)
; with signed 16-bit lanes.  This looks like a Montgomery reduction,
; assuming _16XQ / _16XQINV broadcast q and q^-1 mod 2^16 -- those
; constants live in consts.inc; confirm there.
; All vmovdqa accesses need 32-byte aligned addresses.
;-----------------------------------------------------------------------
%macro schoolbook 1
        ; --- load first operand and pre-multiply by QINV (low halves) ---
        vmovdqa     ymm0, [rcx + _16XQINV*2]
        vmovdqa     ymm1, [rsi + (64*%1 +  0)*2]    ; a0
        vmovdqa     ymm2, [rsi + (64*%1 + 16)*2]    ; b0
        vmovdqa     ymm3, [rsi + (64*%1 + 32)*2]    ; a1
        vmovdqa     ymm4, [rsi + (64*%1 + 48)*2]    ; b1

        vpmullw     ymm9,  ymm1, ymm0               ; lo(a0 * QINV)
        vpmullw     ymm10, ymm2, ymm0               ; lo(b0 * QINV)
        vpmullw     ymm11, ymm3, ymm0               ; lo(a1 * QINV)
        vpmullw     ymm12, ymm4, ymm0               ; lo(b1 * QINV)

        ; --- high halves of the eight raw products ---
        vmovdqa     ymm5, [rdx + (64*%1 +  0)*2]    ; c0
        vmovdqa     ymm6, [rdx + (64*%1 + 16)*2]    ; d0

        vpmulhw     ymm13, ymm1, ymm5               ; hi(a0 * c0)
        vpmulhw     ymm1,  ymm1, ymm6               ; hi(a0 * d0)
        vpmulhw     ymm14, ymm2, ymm5               ; hi(b0 * c0)
        vpmulhw     ymm2,  ymm2, ymm6               ; hi(b0 * d0)

        vmovdqa     ymm7, [rdx + (64*%1 + 32)*2]    ; c1
        vmovdqa     ymm8, [rdx + (64*%1 + 48)*2]    ; d1

        vpmulhw     ymm15, ymm3, ymm7               ; hi(a1 * c1)
        vpmulhw     ymm3,  ymm3, ymm8               ; hi(a1 * d1)
        vpmulhw     ymm0,  ymm4, ymm7               ; hi(b1 * c1); QINV no longer needed
        vpmulhw     ymm4,  ymm4, ymm8               ; hi(b1 * d1)

        ; All sixteen ymm registers are live: park hi(a0 * c0) on the stack
        ; so ymm13 can take the matching low product.
        vmovdqa     [rsp], ymm13

        ; --- low halves: lo((x * QINV) * y) ---
        vpmullw     ymm13, ymm9,  ymm5              ; a0c0 low term
        vpmullw     ymm9,  ymm9,  ymm6              ; a0d0 low term
        vpmullw     ymm5,  ymm10, ymm5              ; b0c0 low term
        vpmullw     ymm10, ymm10, ymm6              ; b0d0 low term

        vpmullw     ymm6,  ymm11, ymm7              ; a1c1 low term
        vpmullw     ymm11, ymm11, ymm8              ; a1d1 low term
        vpmullw     ymm7,  ymm12, ymm7              ; b1c1 low term
        vpmullw     ymm12, ymm12, ymm8              ; b1d1 low term

        ; --- correction terms: hi(low term * Q) ---
        vmovdqa     ymm8, [rcx + _16XQ*2]           ; Q stays in ymm8 to the end
        vpmulhw     ymm13, ymm13, ymm8
        vpmulhw     ymm9,  ymm9,  ymm8
        vpmulhw     ymm5,  ymm5,  ymm8
        vpmulhw     ymm10, ymm10, ymm8
        vpmulhw     ymm6,  ymm6,  ymm8
        vpmulhw     ymm11, ymm11, ymm8
        vpmulhw     ymm7,  ymm7,  ymm8
        vpmulhw     ymm12, ymm12, ymm8

        ; --- reduced products: high half minus correction ---
        ; The spilled term is the subtrahend here, so ymm13 ends up negated;
        ; the final combine below compensates for that.
        vpsubw      ymm13, ymm13, [rsp]             ; -a0c0
        vpsubw      ymm9,  ymm1,  ymm9              ;  a0d0
        vpsubw      ymm5,  ymm14, ymm5              ;  b0c0
        vpsubw      ymm10, ymm2,  ymm10             ;  b0d0

        vpsubw      ymm6,  ymm15, ymm6              ;  a1c1
        vpsubw      ymm11, ymm3,  ymm11             ;  a1d1
        vpsubw      ymm7,  ymm0,  ymm7              ;  b1c1
        vpsubw      ymm12, ymm4,  ymm12             ;  b1d1

        ; --- scale b0d0 and b1d1 by the multiplier r, with reduction ---
        ; [r9] feeds the low product and [r9 + 32] the high product;
        ; presumably r*QINV and r respectively -- confirm against the table.
        vmovdqa     ymm0, [r9]
        vmovdqa     ymm1, [r9 + 32]
        vpmullw     ymm2,  ymm10, ymm0
        vpmullw     ymm3,  ymm12, ymm0
        vpmulhw     ymm10, ymm10, ymm1
        vpmulhw     ymm12, ymm12, ymm1
        vpmulhw     ymm2,  ymm2,  ymm8
        vpmulhw     ymm3,  ymm3,  ymm8
        vpsubw      ymm10, ymm10, ymm2              ; r*b0d0
        vpsubw      ymm12, ymm12, ymm3              ; r*b1d1

        ; --- combine into the four result vectors ---
        vpaddw      ymm9,  ymm9,  ymm5              ; a0d0 + b0c0
        vpaddw      ymm11, ymm11, ymm7              ; a1d1 + b1c1
        vpsubw      ymm13, ymm10, ymm13             ; r*b0d0 - (-a0c0)
        vpsubw      ymm6,  ymm6,  ymm12             ; a1c1 - r*b1d1

        vmovdqa     [rdi + (64*%1 +  0)*2], ymm13
        vmovdqa     [rdi + (64*%1 + 16)*2], ymm9
        vmovdqa     [rdi + (64*%1 + 32)*2], ymm6
        vmovdqa     [rdi + (64*%1 + 48)*2], ymm11
%endmacro

SECTION .text

;-----------------------------------------------------------------------
; basemul_avx / _basemul_avx
;
; Runs the schoolbook kernel over four consecutive 64-word blocks
; (256 sixteen-bit words per array in total).
;
; ABI:   System V AMD64 register usage (rdi, rsi, rdx, rcx).
;        Presumably void basemul_avx(r, a, b, qdata) on the C side --
;        inferred from register use; confirm against the declaration.
; In:    rdi = result array           (written)
;        rsi = first operand array    (read)
;        rdx = second operand array   (read)
;        rcx = constant table         (read; also the base for r9)
;        All four are accessed with vmovdqa and therefore must be
;        32-byte aligned.
; Out:   256 words stored at [rdi]
; Clobb: r8, r9, ymm0-ymm15, flags
; Stack: one 32-byte, 32-byte-aligned scratch slot below the incoming
;        rsp; no calls are made.
;
; The same code is exported under both the plain and the
; underscore-prefixed symbol name.
;-----------------------------------------------------------------------
global basemul_avx
global _basemul_avx
basemul_avx:
_basemul_avx:
        ; Carve out the aligned spill slot used by schoolbook.  The
        ; caller's rsp is kept in r8, which nothing below modifies.
        mov     r8, rsp
        and     rsp, -32
        sub     rsp, 32

        ; r9 walks the multiplier table that starts _ZETAS_EXP + 176
        ; words into the constant table.  The strides between blocks
        ; are 32, 192 and 32 words.
        lea     r9, [rcx + (_ZETAS_EXP+176)*2]
        schoolbook  0

        add     r9, 32*2
        schoolbook  1

        add     r9, 192*2
        schoolbook  2

        add     r9, 32*2
        schoolbook  3

        ; Clear the upper halves of the ymm registers so that legacy-SSE
        ; code in the caller does not pay AVX/SSE transition penalties.
        ; Every vector register is caller-saved under SysV, so callers
        ; cannot observe this.
        vzeroupper

        mov     rsp, r8
        ret