kyberlib 0.0.7

A robust, audit-friendly Rust implementation of FIPS 203 ML-KEM (the standardised CRYSTALS-Kyber post-quantum KEM).
Documentation
use crate::{consts::*, params::*, poly::*};
use core::arch::x86_64::*;

#[derive(Clone)]
pub struct Polyvec {
    pub vec: [Poly; KYBER_SECURITY_PARAMETER],
}

impl Copy for Polyvec {}

impl Polyvec {
    pub fn new() -> Self {
        Polyvec {
            vec: [Poly::new(); KYBER_SECURITY_PARAMETER],
        }
    }
}

pub unsafe fn poly_compress10(r: &mut [u8], a: &Poly) {
    let (mut f0, mut f1, mut f2);
    let (mut t0, mut t1);

    let v = _mm256_load_si256(QDATA.vec[_16XV / 16..].as_ptr());
    let v8 = _mm256_slli_epi16(v, 3);
    let off = _mm256_set1_epi16(15);
    let shift1 = _mm256_set1_epi16(1 << 12);
    let mask = _mm256_set1_epi16(1023);
    let shift2 = _mm256_set1_epi64x(
        ((1024u64 << 48) + (1u64 << 32) + (1024 << 16) + 1) as i64,
    );
    let sllvdidx = _mm256_set1_epi64x(12);
    let shufbidx = _mm256_set_epi8(
        8, 4, 3, 2, 1, 0, -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, -1,
        -1, -1, -1, -1, -1, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0,
    );

    for i in 0..(KYBER_N / 16) {
        f0 = _mm256_load_si256(&a.vec[i]);
        f1 = _mm256_mullo_epi16(f0, v8);
        f2 = _mm256_add_epi16(f0, off);
        f0 = _mm256_slli_epi16(f0, 3);
        f0 = _mm256_mulhi_epi16(f0, v);
        f2 = _mm256_sub_epi16(f1, f2);
        f1 = _mm256_andnot_si256(f1, f2);
        f1 = _mm256_srli_epi16(f1, 15);
        f0 = _mm256_sub_epi16(f0, f1);
        f0 = _mm256_mulhrs_epi16(f0, shift1);
        f0 = _mm256_and_si256(f0, mask);
        f0 = _mm256_madd_epi16(f0, shift2);
        f0 = _mm256_sllv_epi32(f0, sllvdidx);
        f0 = _mm256_srli_epi64(f0, 12);
        f0 = _mm256_shuffle_epi8(f0, shufbidx);
        t0 = _mm256_castsi256_si128(f0);
        t1 = _mm256_extracti128_si256(f0, 1);
        t0 = _mm_blend_epi16(t0, t1, 0xE0);
        _mm_storeu_si128(r[20 * i..].as_mut_ptr() as *mut __m128i, t0);
        _mm_storeu_si128(
            r[20 * i + 16..].as_mut_ptr() as *mut __m128i,
            t1,
        );
    }
}

pub unsafe fn poly_decompress10(r: &mut Poly, a: &[u8]) {
    let mut f;
    let q = _mm256_set1_epi32(
        ((KYBER_Q as i32) << 16) + 4 * KYBER_Q as i32,
    );
    let shufbidx = _mm256_set_epi8(
        11, 10, 10, 9, 9, 8, 8, 7, 6, 5, 5, 4, 4, 3, 3, 2, 9, 8, 8, 7,
        7, 6, 6, 5, 4, 3, 3, 2, 2, 1, 1, 0,
    );
    let sllvdidx = _mm256_set1_epi64x(4);
    let mask = _mm256_set1_epi32((32736 << 16) + 8184);
    for i in 0..KYBER_N / 16 {
        f = _mm256_loadu_si256(a[20 * i..].as_ptr() as *const __m256i);
        f = _mm256_permute4x64_epi64(f, 0x94);
        f = _mm256_shuffle_epi8(f, shufbidx);
        f = _mm256_sllv_epi32(f, sllvdidx);
        f = _mm256_srli_epi16(f, 1);
        f = _mm256_and_si256(f, mask);
        f = _mm256_mulhrs_epi16(f, q);
        _mm256_store_si256(&mut r.vec[i], f);
    }
}

pub unsafe fn poly_compress11(r: &mut [u8], a: &Poly) {
    let (mut f0, mut f1, mut f2);
    let (mut t0, mut t1);
    let v = _mm256_load_si256(QDATA.vec[_16XV / 16..].as_ptr());
    let v8 = _mm256_slli_epi16(v, 3);
    let off = _mm256_set1_epi16(36);
    let shift1 = _mm256_set1_epi16(1 << 13);
    let mask = _mm256_set1_epi16(2047);
    let shift2 = _mm256_set1_epi64x(
        ((2048u64 << 48) + (1u64 << 32) + (2048 << 16) + 1) as i64,
    );
    let sllvdidx = _mm256_set1_epi64x(10);
    let srlvqidx = _mm256_set_epi64x(30, 10, 30, 10);
    let shufbidx = _mm256_set_epi8(
        4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, -1, -1,
        -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    );

    for i in 0..KYBER_N / 16 {
        f0 = _mm256_load_si256(&a.vec[i]);
        f1 = _mm256_mullo_epi16(f0, v8);
        f2 = _mm256_add_epi16(f0, off);
        f0 = _mm256_slli_epi16(f0, 3);
        f0 = _mm256_mulhi_epi16(f0, v);
        f2 = _mm256_sub_epi16(f1, f2);
        f1 = _mm256_andnot_si256(f1, f2);
        f1 = _mm256_srli_epi16(f1, 15);
        f0 = _mm256_sub_epi16(f0, f1);
        f0 = _mm256_mulhrs_epi16(f0, shift1);
        f0 = _mm256_and_si256(f0, mask);
        f0 = _mm256_madd_epi16(f0, shift2);
        f0 = _mm256_sllv_epi32(f0, sllvdidx);
        f1 = _mm256_bsrli_epi128(f0, 8);
        f0 = _mm256_srlv_epi64(f0, srlvqidx);
        f1 = _mm256_slli_epi64(f1, 34);
        f0 = _mm256_add_epi64(f0, f1);
        f0 = _mm256_shuffle_epi8(f0, shufbidx);
        t0 = _mm256_castsi256_si128(f0);
        t1 = _mm256_extracti128_si256(f0, 1);
        t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
        _mm_storeu_si128(
            r[22 * i + 0..].as_mut_ptr() as *mut __m128i,
            t0,
        );
        _mm_storel_epi64(
            r[22 * i + 16..].as_mut_ptr() as *mut __m128i,
            t1,
        );
    }
}

pub unsafe fn poly_decompress11(r: &mut Poly, a: &[u8]) {
    let mut f;

    let q = _mm256_load_si256(QDATA.vec[_16XQ / 16..].as_ptr());
    let shufbidx = _mm256_set_epi8(
        13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 5, 5, 4, 4, 3, 10, 9, 9,
        8, 7, 6, 6, 5, 5, 4, 3, 2, 2, 1, 1, 0,
    );
    let srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0);
    let srlvqidx = _mm256_set_epi64x(2, 0, 2, 0);
    let shift = _mm256_set_epi16(
        4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32,
    );
    let mask = _mm256_set1_epi16(32752);

    for i in 0..(KYBER_N / 16) {
        f = _mm256_loadu_si256(a[22 * i..].as_ptr() as *const __m256i);
        f = _mm256_permute4x64_epi64(f, 0x94);
        f = _mm256_shuffle_epi8(f, shufbidx);
        f = _mm256_srlv_epi32(f, srlvdidx);
        f = _mm256_srlv_epi64(f, srlvqidx);
        f = _mm256_mullo_epi16(f, shift);
        f = _mm256_srli_epi16(f, 1);
        f = _mm256_and_si256(f, mask);
        f = _mm256_mulhrs_epi16(f, q);
        _mm256_store_si256(&mut r.vec[i], f);
    }
}

pub unsafe fn polyvec_compress(r: &mut [u8], a: &Polyvec) {
    if cfg!(feature = "kyber1024") {
        for i in 0..KYBER_SECURITY_PARAMETER {
            poly_compress11(&mut r[352 * i..], &a.vec[i]);
        }
    } else {
        for i in 0..KYBER_SECURITY_PARAMETER {
            poly_compress10(&mut r[320 * i..], &a.vec[i]);
        }
    }
}

pub unsafe fn polyvec_decompress(r: &mut Polyvec, a: &[u8]) {
    if cfg!(feature = "kyber1024") {
        for i in 0..KYBER_SECURITY_PARAMETER {
            poly_decompress11(&mut r.vec[i], &a[352 * i..]);
        }
    } else {
        for i in 0..KYBER_SECURITY_PARAMETER {
            poly_decompress10(&mut r.vec[i], &a[320 * i..]);
        }
    }
}

pub fn polyvec_tobytes(r: &mut [u8], a: &Polyvec) {
    for i in 0..KYBER_SECURITY_PARAMETER {
        poly_tobytes(&mut r[i * KYBER_POLY_BYTES..], a.vec[i]);
    }
}

pub unsafe fn polyvec_frombytes(r: &mut Polyvec, a: &[u8]) {
    for i in 0..KYBER_SECURITY_PARAMETER {
        poly_frombytes(&mut r.vec[i], &a[i * KYBER_POLY_BYTES..]);
    }
}

/// Name:  polyvec_ntt
///
/// Description: Apply forward NTT to all elements of a vector of polynomials
///
/// Arguments:   - Polyvec r: in/output vector of polynomials
pub fn polyvec_ntt(r: &mut Polyvec) {
    for i in 0..KYBER_SECURITY_PARAMETER {
        poly_ntt(&mut r.vec[i]);
    }
}

/// Name:  polyvec_invntt
///
/// Description: Apply inverse NTT to all elements of a vector of polynomials
///
/// Arguments:   - Polyvec r: in/output vector of polynomials
pub fn polyvec_invntt_tomont(r: &mut Polyvec) {
    for i in 0..KYBER_SECURITY_PARAMETER {
        poly_invntt_tomont(&mut r.vec[i]);
    }
}

/// Name:  polyvec_basemul_acc_montgomery
///
/// Description: Pointwise multiply elements of a and b and accumulate into r
///
/// Arguments: - poly *r:  output polynomial
///  - const Polyvec a: first input vector of polynomials
///  - const Polyvec b: second input vector of polynomials
pub fn polyvec_basemul_acc_montgomery(
    r: &mut Poly,
    a: &Polyvec,
    b: &Polyvec,
) {
    let mut t = Poly::new();
    poly_basemul(r, &a.vec[0], &b.vec[0]);
    for i in 1..KYBER_SECURITY_PARAMETER {
        poly_basemul(&mut t, &a.vec[i], &b.vec[i]);
        poly_add(r, &t);
    }
}

/// Name:  polyvec_reduce
///
/// Description: Applies Barrett reduction to each coefficient
///  of each element of a vector of polynomials
///  for details of the Barrett reduction see comments in reduce.c
///
/// Arguments:   - poly *r:   input/output polynomial
pub fn polyvec_reduce(r: &mut Polyvec) {
    for i in 0..KYBER_SECURITY_PARAMETER {
        poly_reduce(&mut r.vec[i]);
    }
}

/// Name:  polyvec_add
///
/// Description: Add vectors of polynomials
///
/// Arguments: - Polyvec r:   output vector of polynomials
///  - const Polyvec a: first input vector of polynomials
///  - const Polyvec b: second input vector of polynomials
pub fn polyvec_add(r: &mut Polyvec, b: &Polyvec) {
    for i in 0..KYBER_SECURITY_PARAMETER {
        poly_add(&mut r.vec[i], &b.vec[i]);
    }
}