p3-goldilocks 0.5.3

//! Implementation of Poseidon2, see: https://eprint.iacr.org/2023/323

use alloc::vec::Vec;

use p3_field::{Algebra, InjectiveMonomial, PrimeCharacteristicRing};
#[cfg(not(target_arch = "aarch64"))]
use p3_poseidon2::Poseidon2;
use p3_poseidon2::{
    ExternalLayer, ExternalLayerConstants, ExternalLayerConstructor, GenericPoseidon2LinearLayers,
    InternalLayer, InternalLayerConstructor, MDSMat4, add_rc_and_sbox_generic,
    external_initial_permute_state, external_terminal_permute_state, internal_permute_state,
    matmul_internal,
};

use crate::Goldilocks;
use crate::poseidon1::GOLDILOCKS_S_BOX_DEGREE;

/// Number of full rounds per half for Goldilocks Poseidon2 (`RF / 2`).
///
/// The total number of full rounds is `RF = 8`: 4 at the beginning and 4 at the end.
/// Follows the Poseidon2 paper's security analysis with a +2 RF margin.
///
/// This equals the number of rows in each of the `*_EXTERNAL_INITIAL` and
/// `*_EXTERNAL_FINAL` round-constant tables defined in this file.
pub const GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS: usize = 4;

/// Number of partial rounds for Goldilocks Poseidon2 (width 8).
///
/// Derived from the interpolation bound in the Poseidon paper (Eq. 3),
/// evaluated here with min{κ,n} = 64, α = 7 and t = 8:
///
///   R_interp ≥ ⌈min{κ,n}/log_2(α)⌉ + ⌈log_α(t)⌉ − 5
///            = ⌈64/log_2(7)⌉ + ⌈log_7(8)⌉ − 5 = 23 + 2 − 5 = 20
///
/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22.
///
/// This equals the length of `GOLDILOCKS_POSEIDON2_RC_8_INTERNAL`.
pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8: usize = 22;

/// Number of partial rounds for Goldilocks Poseidon2 (width 12).
///
/// Same interpolation bound as for width 8, evaluated with t = 12:
///
///   R_interp ≥ ⌈64/log_2(7)⌉ + ⌈log_7(12)⌉ − 5 = 23 + 2 − 5 = 20
///
/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22.
///
/// This equals the length of `GOLDILOCKS_POSEIDON2_RC_12_INTERNAL`.
pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_12: usize = 22;

/// An implementation of the Poseidon2 hash function for the Goldilocks field.
///
/// It acts on arrays of the form `[Goldilocks; WIDTH]`.
///
/// This definition is only compiled on `aarch64` targets, where the alias resolves to
/// `crate::Poseidon2GoldilocksFused<WIDTH>`. Other targets use the alias of the same
/// name that is built on the generic `Poseidon2` type.
#[cfg(target_arch = "aarch64")]
pub type Poseidon2Goldilocks<const WIDTH: usize> = crate::Poseidon2GoldilocksFused<WIDTH>;

/// An implementation of the Poseidon2 hash function for the Goldilocks field.
///
/// It acts on arrays of the form `[Goldilocks; WIDTH]`.
///
/// This definition is compiled on every target except `aarch64`. It instantiates the
/// generic `Poseidon2` type with the external and internal layer types defined in this
/// file and with the S-box degree `GOLDILOCKS_S_BOX_DEGREE`.
#[cfg(not(target_arch = "aarch64"))]
pub type Poseidon2Goldilocks<const WIDTH: usize> = Poseidon2<
    Goldilocks,
    Poseidon2ExternalLayerGoldilocks<WIDTH>,
    Poseidon2InternalLayerGoldilocks,
    WIDTH,
    GOLDILOCKS_S_BOX_DEGREE,
>;

/// Round constants for the initial external rounds of width-8 Poseidon2 on Goldilocks.
///
/// Produced by the Grain LFSR with parameters:
///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
///
/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`.
///
/// Layout: external_initial (4 rounds × 8 elements), one inner array per round.
///
/// `default_goldilocks_poseidon2_8` passes this table as the first argument of
/// `ExternalLayerConstants::new`.
pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL: [[Goldilocks; 8]; 4] = [
    Goldilocks::new_array([
        0xdd5743e7f2a5a5d9,
        0xcb3a864e58ada44b,
        0xffa2449ed32f8cdc,
        0x42025f65d6bd13ee,
        0x7889175e25506323,
        0x34b98bb03d24b737,
        0xbdcc535ecc4faa2a,
        0x5b20ad869fc0d033,
    ]),
    Goldilocks::new_array([
        0xf1dda5b9259dfcb4,
        0x27515210be112d59,
        0x4227d1718c766c3f,
        0x26d333161a5bd794,
        0x49b938957bf4b026,
        0x4a56b5938b213669,
        0x1120426b48c8353d,
        0x6b323c3f10a56cad,
    ]),
    Goldilocks::new_array([
        0xce57d6245ddca6b2,
        0xb1fc8d402bba1eb1,
        0xb5c5096ca959bd04,
        0x6db55cd306d31f7f,
        0xc49d293a81cb9641,
        0x1ce55a4fe979719f,
        0xa92e60a9d178a4d1,
        0x002cc64973bcfd8c,
    ]),
    Goldilocks::new_array([
        0xcea721cce82fb11b,
        0xe5b55eb8098ece81,
        0x4e30525c6f1ddd66,
        0x43c6702827070987,
        0xaca68430a7b5762a,
        0x3674238634df9c93,
        0x88cee1c825e33433,
        0xde99ae8d74b57176,
    ]),
];

/// Round constants for the final external rounds of width-8 Poseidon2 on Goldilocks.
///
/// Produced by the Grain LFSR with parameters:
///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
///
/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`.
///
/// Layout: external_final (4 rounds × 8 elements), one inner array per round.
///
/// `default_goldilocks_poseidon2_8` passes this table as the second argument of
/// `ExternalLayerConstants::new`.
pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL: [[Goldilocks; 8]; 4] = [
    Goldilocks::new_array([
        0x014ef1197d341346,
        0x9725e20825d07394,
        0xfdb25aef2c5bae3b,
        0xbe5402dc598c971e,
        0x93a5711f04cdca3d,
        0xc45a9a5b2f8fb97b,
        0xfe8946a924933545,
        0x2af997a27369091c,
    ]),
    Goldilocks::new_array([
        0xaa62c88e0b294011,
        0x058eb9d810ce9f74,
        0xb3cb23eced349ae4,
        0xa3648177a77b4a84,
        0x43153d905992d95d,
        0xf4e2a97cda44aa4b,
        0x5baa2702b908682f,
        0x082923bdf4f750d1,
    ]),
    Goldilocks::new_array([
        0x98ae09a325893803,
        0xf8a6475077968838,
        0xceb0735bf00b2c5f,
        0x0a1a5d953888e072,
        0x2fcb190489f94475,
        0xb5be06270dec69fc,
        0x739cb934b09acf8b,
        0x537750b75ec7f25b,
    ]),
    Goldilocks::new_array([
        0xe9dd318bae1f3961,
        0xf7462137299efe1a,
        0xb1f6b8eee9adb940,
        0xbdebcc8a809dfe6b,
        0x40fc1f791b178113,
        0x3ac1c3362d014864,
        0x9a016184bdb8aeba,
        0x95f2394459fbc25e,
    ]),
];

/// Round constants for the internal (partial) rounds of width-8 Poseidon2 on Goldilocks.
///
/// Produced by the Grain LFSR with parameters:
///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
///
/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`.
///
/// Layout: internal (22 scalar constants).
///
/// `default_goldilocks_poseidon2_8` passes this array as the internal constants of the
/// permutation it builds.
pub const GOLDILOCKS_POSEIDON2_RC_8_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([
    0x488897d85ff51f56,
    0x1140737ccb162218,
    0xa7eeb9215866ed35,
    0x9bd2976fee49fcc9,
    0xc0c8f0de580a3fcc,
    0x4fb2dae6ee8fc793,
    0x343a89f35f37395b,
    0x223b525a77ca72c8,
    0x56ccb62574aaa918,
    0xc4d507d8027af9ed,
    0xa080673cf0b7e95c,
    0xf0184884eb70dcf8,
    0x044f10b0cb3d5c69,
    0xe9e3f7993938f186,
    0x1b761c80e772f459,
    0x606cec607a1b5fac,
    0x14a0c2e1d45f03cd,
    0x4eace8855398574f,
    0xf905ca7103eff3e6,
    0xf8c8f8d20862c059,
    0xb524fe8bdd678e5a,
    0xfbb7865901a1ec41,
]);

/// Round constants for the initial external rounds of width-12 Poseidon2 on Goldilocks.
///
/// Produced by the Grain LFSR with parameters:
///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
///
/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`.
///
/// Layout: external_initial (4 rounds × 12 elements), one inner array per round.
///
/// `default_goldilocks_poseidon2_12` passes this table as the first argument of
/// `ExternalLayerConstants::new`.
pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL: [[Goldilocks; 12]; 4] = [
    Goldilocks::new_array([
        0x13dcf33aba214f46,
        0x30b3b654a1da6d83,
        0x1fc634ada6159b56,
        0x937459964dc03466,
        0xedd2ef2ca7949924,
        0xede9affde0e22f68,
        0x8515b9d6bac9282d,
        0x6b5c07b4e9e900d8,
        0x1ec66368838c8a08,
        0x9042367d80d1fbab,
        0x400283564a3c3799,
        0x4a00be0466bca75e,
    ]),
    Goldilocks::new_array([
        0x7913beee58e3817f,
        0xf545e88532237d90,
        0x22f8cb8736042005,
        0x6f04990e247a2623,
        0xfe22e87ba37c38cd,
        0xd20e32c85ffe2815,
        0x117227674048fe73,
        0x4e9fb7ea98a6b145,
        0xe0866c232b8af08b,
        0x00bbc77916884964,
        0x7031c0fb990d7116,
        0x240a9e87cf35108f,
    ]),
    Goldilocks::new_array([
        0x2e6363a5a12244b3,
        0x5e1c3787d1b5011c,
        0x4132660e2a196e8b,
        0x3a013b648d3d4327,
        0xf79839f49888ea43,
        0xfe85658ebafe1439,
        0xb6889825a14240bd,
        0x578453605541382b,
        0x4508cda8f6b63ce9,
        0x9c3ef35848684c91,
        0x0812bde23c87178c,
        0xfe49638f7f722c14,
    ]),
    Goldilocks::new_array([
        0x8e3f688ce885cbf5,
        0xb8e110acf746a87d,
        0xb4b2e8973a6dabef,
        0x9e714c5da3d462ec,
        0x6438f9033d3d0c15,
        0x24312f7cf1a27199,
        0x23f843bb47acbf71,
        0x9183f11a34be9f01,
        0x839062fbb9d45dbf,
        0x24b56e7e6c2e43fa,
        0xe1683da61c962a72,
        0xa95c63971a19bfa7,
    ]),
];

/// Round constants for the final external rounds of width-12 Poseidon2 on Goldilocks.
///
/// Produced by the Grain LFSR with parameters:
///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
///
/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`.
///
/// Layout: external_final (4 rounds × 12 elements), one inner array per round.
///
/// `default_goldilocks_poseidon2_12` passes this table as the second argument of
/// `ExternalLayerConstants::new`.
pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL: [[Goldilocks; 12]; 4] = [
    Goldilocks::new_array([
        0xc68be7c94882a24d,
        0xaf996d5d5cdaedd9,
        0x9717f025e7daf6a5,
        0x6436679e6e7216f4,
        0x8a223d99047af267,
        0xbb512e35a133ba9a,
        0xfbbf44097671aa03,
        0xf04058ebf6811e61,
        0x5cca84703fac7ffb,
        0x9b55c7945de6469f,
        0x8e05bf09808e934f,
        0x2ea900de876307d7,
    ]),
    Goldilocks::new_array([
        0x7748fff2b38dfb89,
        0x6b99a676dd3b5d81,
        0xac4bb7c627cf7c13,
        0xadb6ebe5e9e2f5ba,
        0x2d33378cafa24ae3,
        0x1e5b73807543f8c2,
        0x09208814bfebb10f,
        0x782e64b6bb5b93dd,
        0xadd5a48eac90b50f,
        0xadd4c54c736ea4b1,
        0xd58dbb86ed817fd8,
        0x6d5ed1a533f34ddd,
    ]),
    Goldilocks::new_array([
        0x28686aa3e36b7cb9,
        0x591abd3476689f36,
        0x047d766678f13875,
        0xa2a11112625f5b49,
        0x21fd10a3f8304958,
        0xf9b40711443b0280,
        0xd2697eb8b2bde88e,
        0x3493790b51731b3f,
        0x11caf9dd73764023,
        0x7acfb8f72878164e,
        0x744ec4db23cefc26,
        0x1e00e58f422c6340,
    ]),
    Goldilocks::new_array([
        0x21dd28d906a62dda,
        0xf32a46ab5f465b5f,
        0xbfce13201f3f7e6b,
        0xf30d2e7adb5304e2,
        0xecdf4ee4abad48e9,
        0xf94e82182d395019,
        0x4ee52e3744d887c5,
        0xa1341c7cac0083b2,
        0x2302fb26c30c834a,
        0xaea3c587273bf7d3,
        0xf798e24961823ec7,
        0x962deba3e9a2cd94,
    ]),
];

/// Round constants for the internal (partial) rounds of width-12 Poseidon2 on Goldilocks.
///
/// Produced by the Grain LFSR with parameters:
///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
///
/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`.
///
/// Layout: internal (22 scalar constants).
///
/// `default_goldilocks_poseidon2_12` passes this array as the internal constants of the
/// permutation it builds.
pub const GOLDILOCKS_POSEIDON2_RC_12_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([
    0x4adf842aa75d4316,
    0xf8fbb871aa4ab4eb,
    0x68e85b6eb2dd6aeb,
    0x07a0b06b2d270380,
    0xd94e0228bd282de4,
    0x8bdd91d3250c5278,
    0x209c68b88bba778f,
    0xb5e18cdab77f3877,
    0xb296a3e808da93fa,
    0x8370ecbda11a327e,
    0x3f9075283775dad8,
    0xb78095bb23c6aa84,
    0x3f36b9fe72ad4e5f,
    0x69bc96780b10b553,
    0x3f1d341f2eb7b881,
    0x4e939e9815838818,
    0xda366b3ae2a31604,
    0xbc89db1e7287d509,
    0x6102f411f9ef5659,
    0x58725c5e7ac1f0ab,
    0x0df5856c798883e7,
    0xf7bb62a8da4c961b,
]);

/// Build the width-8 Poseidon2 permutation over Goldilocks from the round constants
/// defined in this file.
///
/// Compiled on every target except `aarch64`.
#[cfg(not(target_arch = "aarch64"))]
pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> {
    let initial_rounds = GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec();
    let final_rounds = GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec();
    let external = ExternalLayerConstants::new(initial_rounds, final_rounds);
    let internal = GOLDILOCKS_POSEIDON2_RC_8_INTERNAL.to_vec();

    Poseidon2::new(external, internal)
}

/// Build the width-8 Poseidon2 permutation over Goldilocks from the round constants
/// defined in this file.
///
/// Compiled only on `aarch64`, where the permutation is a `Poseidon2GoldilocksFused`.
#[cfg(target_arch = "aarch64")]
pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> {
    let initial_rounds = GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec();
    let final_rounds = GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec();
    let external = ExternalLayerConstants::new(initial_rounds, final_rounds);

    crate::Poseidon2GoldilocksFused::new(&external, &GOLDILOCKS_POSEIDON2_RC_8_INTERNAL)
}

/// Build the width-12 Poseidon2 permutation over Goldilocks from the round constants
/// defined in this file.
///
/// Compiled on every target except `aarch64`.
#[cfg(not(target_arch = "aarch64"))]
pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> {
    let initial_rounds = GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec();
    let final_rounds = GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec();
    let external = ExternalLayerConstants::new(initial_rounds, final_rounds);
    let internal = GOLDILOCKS_POSEIDON2_RC_12_INTERNAL.to_vec();

    Poseidon2::new(external, internal)
}

/// Build the width-12 Poseidon2 permutation over Goldilocks from the round constants
/// defined in this file.
///
/// Compiled only on `aarch64`, where the permutation is a `Poseidon2GoldilocksFused`.
#[cfg(target_arch = "aarch64")]
pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> {
    let initial_rounds = GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec();
    let final_rounds = GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec();
    let external = ExternalLayerConstants::new(initial_rounds, final_rounds);

    crate::Poseidon2GoldilocksFused::new(&external, &GOLDILOCKS_POSEIDON2_RC_12_INTERNAL)
}

/// Diagonal vector `V` used by the width-8 internal linear layer.
///
/// The internal layer in this file maps `state[i]` to `sum(state) + V[i] * state[i]`.
/// Each entry is the Goldilocks encoding of the small value named in its trailing
/// comment; `internal_layer_mat_mul_goldilocks_8` hard-codes the same values as
/// additions and halvings instead of multiplications.
pub const MATRIX_DIAG_8_GOLDILOCKS: [Goldilocks; 8] = Goldilocks::new_array([
    0xfffffffeffffffff, // -2
    0x0000000000000001, // 1
    0x0000000000000002, // 2
    0x7fffffff80000001, // 1/2
    0x0000000000000003, // 3
    0x7fffffff80000000, // -1/2
    0xfffffffefffffffe, // -3
    0xfffffffefffffffd, // -4
]);

/// Diagonal vector `V` used by the width-12 internal linear layer.
///
/// The internal layer in this file maps `state[i]` to `sum(state) + V[i] * state[i]`.
/// Each entry is the Goldilocks encoding of the small value named in its trailing
/// comment; `internal_layer_mat_mul_goldilocks_12` hard-codes the same values as
/// additions and halvings instead of multiplications.
pub const MATRIX_DIAG_12_GOLDILOCKS: [Goldilocks; 12] = Goldilocks::new_array([
    0xfffffffeffffffff, // -2
    0x0000000000000001, // 1
    0x0000000000000002, // 2
    0x7fffffff80000001, // 1/2
    0x0000000000000003, // 3
    0x0000000000000004, // 4
    0x7fffffff80000000, // -1/2
    0xfffffffefffffffe, // -3
    0xfffffffefffffffd, // -4
    0xbfffffff40000001, // 1/2^2
    0x3fffffffc0000000, // -1/2^2
    0xdfffffff20000001, // 1/2^3
]);

/// Diagonal vector `V` used by the width-16 internal linear layer.
///
/// The internal layer in this file maps `state[i]` to `sum(state) + V[i] * state[i]`.
/// Each entry is the Goldilocks encoding of the value named in its trailing comment.
/// `internal_layer_mat_mul_goldilocks_16` hard-codes entries 0..=14 as additions and
/// halvings and reads entry 15 (`1/2^32`) from this table for a field multiplication.
pub const MATRIX_DIAG_16_GOLDILOCKS: [Goldilocks; 16] = Goldilocks::new_array([
    0xfffffffeffffffff, // -2
    0x0000000000000001, // 1
    0x0000000000000002, // 2
    0x7fffffff80000001, // 1/2
    0x0000000000000003, // 3
    0x0000000000000004, // 4
    0x7fffffff80000000, // -1/2
    0xfffffffefffffffe, // -3
    0xfffffffefffffffd, // -4
    0xdfffffff20000001, // 1/2^3
    0xefffffff10000001, // 1/2^4
    0xf7ffffff08000001, // 1/2^5
    0x1fffffffe0000000, // -1/2^3
    0x0ffffffff0000000, // -1/2^4
    0x07fffffff8000000, // -1/2^5
    0xfffffffe00000002, // 1/2^32
]);

/// Diagonal vector `V` used by the width-20 internal linear layer.
///
/// Unlike the width 8/12/16 diagonals, this file has no hand-unrolled routine for it:
/// the width-20 `InternalLayer` impl passes this array to `matmul_internal`, and the
/// generic linear layer multiplies by each entry directly.
// NOTE(review): the provenance of these 20 values is not shown in this file; confirm
// against the constant generator before changing them.
pub const MATRIX_DIAG_20_GOLDILOCKS: [Goldilocks; 20] = Goldilocks::new_array([
    0x95c381fda3b1fa57,
    0xf36fe9eb1288f42c,
    0x89f5dcdfef277944,
    0x106f22eadeb3e2d2,
    0x684e31a2530e5111,
    0x27435c5d89fd148e,
    0x3ebed31c414dbf17,
    0xfd45b0b2d294e3cc,
    0x48c904473a7f6dbf,
    0xe0d1b67809295b4d,
    0xddd1941e9d199dcb,
    0x8cfe534eeb742219,
    0xa6e5261d9e3b8524,
    0x6897ee5ed0f82c1b,
    0x0e7dcd0739ee5f78,
    0x493253f3d0d32363,
    0xbb2737f5845f05c0,
    0xa187e810b06ad903,
    0xb635b995936c4918,
    0x0b3694a940bd2394,
]);

/// Apply the width-8 internal linear layer in place.
///
/// Each output is `sum(state) + V[i] * state[i]`, where `V` holds the values listed in
/// `MATRIX_DIAG_8_GOLDILOCKS`. The products are built from additions and `halve`
/// calls only, so no field multiplication is performed here.
fn internal_layer_mat_mul_goldilocks_8<A: Algebra<Goldilocks>>(state: &mut [A; 8]) {
    // Small multiples expressed purely through additions.
    let double = |v: A| v.dup() + v;
    let triple = |v: A| (v.dup() + v.dup()) + v;
    let quadruple = |v: A| {
        let twice = v.dup() + v;
        twice.dup() + twice
    };

    // Snapshot the inputs so that every output is computed from the original state.
    let inputs: [A; 8] = core::array::from_fn(|i| state[i].dup());
    let total: A = inputs.iter().map(|x| x.dup()).sum();
    let [x0, x1, x2, x3, x4, x5, x6, x7] = inputs;

    state[0] = total.dup() - double(x0); // V[0] = -2
    state[1] = total.dup() + x1; // V[1] = 1
    state[2] = total.dup() + double(x2); // V[2] = 2
    state[3] = total.dup() + x3.halve(); // V[3] = 1/2
    state[4] = total.dup() + triple(x4); // V[4] = 3
    state[5] = total.dup() - x5.halve(); // V[5] = -1/2
    state[6] = total.dup() - triple(x6); // V[6] = -3
    state[7] = total - quadruple(x7); // V[7] = -4
}

/// Apply the width-12 internal linear layer in place.
///
/// Each output is `sum(state) + V[i] * state[i]`, where `V` holds the values listed in
/// `MATRIX_DIAG_12_GOLDILOCKS`. The products are built from additions and `halve`
/// calls only, so no field multiplication is performed here.
fn internal_layer_mat_mul_goldilocks_12<A: Algebra<Goldilocks>>(state: &mut [A; 12]) {
    // Small multiples expressed purely through additions.
    let double = |v: A| v.dup() + v;
    let triple = |v: A| (v.dup() + v.dup()) + v;
    let quadruple = |v: A| {
        let twice = v.dup() + v;
        twice.dup() + twice
    };

    // Snapshot the inputs so that every output is computed from the original state.
    let inputs: [A; 12] = core::array::from_fn(|i| state[i].dup());
    let total: A = inputs.iter().map(|x| x.dup()).sum();
    let [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11] = inputs;

    state[0] = total.dup() - double(x0); // V[0] = -2
    state[1] = total.dup() + x1; // V[1] = 1
    state[2] = total.dup() + double(x2); // V[2] = 2
    state[3] = total.dup() + x3.halve(); // V[3] = 1/2
    state[4] = total.dup() + triple(x4); // V[4] = 3
    state[5] = total.dup() + quadruple(x5); // V[5] = 4
    state[6] = total.dup() - x6.halve(); // V[6] = -1/2
    state[7] = total.dup() - triple(x7); // V[7] = -3
    state[8] = total.dup() - quadruple(x8); // V[8] = -4
    state[9] = total.dup() + x9.halve().halve(); // V[9] = 1/2^2
    state[10] = total.dup() - x10.halve().halve(); // V[10] = -1/2^2
    state[11] = total + x11.halve().halve().halve(); // V[11] = 1/2^3
}

/// Apply the width-16 internal linear layer in place.
///
/// Each output is `sum(state) + V[i] * state[i]`, where `V` holds the values listed in
/// `MATRIX_DIAG_16_GOLDILOCKS`. Entries 0..=14 are built from additions and `halve`
/// calls; the last entry (`1/2^32`) uses a multiplication by the table value.
fn internal_layer_mat_mul_goldilocks_16<A: Algebra<Goldilocks>>(state: &mut [A; 16]) {
    // Small multiples expressed purely through additions.
    let double = |v: A| v.dup() + v;
    let triple = |v: A| (v.dup() + v.dup()) + v;
    let quadruple = |v: A| {
        let twice = v.dup() + v;
        twice.dup() + twice
    };

    // Snapshot the inputs so that every output is computed from the original state.
    let inputs: [A; 16] = core::array::from_fn(|i| state[i].dup());
    let total: A = inputs.iter().map(|x| x.dup()).sum();
    let [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15] = inputs;

    state[0] = total.dup() - double(x0); // V[0] = -2
    state[1] = total.dup() + x1; // V[1] = 1
    state[2] = total.dup() + double(x2); // V[2] = 2
    state[3] = total.dup() + x3.halve(); // V[3] = 1/2
    state[4] = total.dup() + triple(x4); // V[4] = 3
    state[5] = total.dup() + quadruple(x5); // V[5] = 4
    state[6] = total.dup() - x6.halve(); // V[6] = -1/2
    state[7] = total.dup() - triple(x7); // V[7] = -3
    state[8] = total.dup() - quadruple(x8); // V[8] = -4

    // Positive inverse powers of two.
    state[9] = total.dup() + x9.halve().halve().halve(); // V[9] = 1/2^3
    state[10] = total.dup() + x10.halve().halve().halve().halve(); // V[10] = 1/2^4
    state[11] = total.dup() + x11.halve().halve().halve().halve().halve(); // V[11] = 1/2^5

    // Negative inverse powers of two.
    state[12] = total.dup() - x12.halve().halve().halve(); // V[12] = -1/2^3
    state[13] = total.dup() - x13.halve().halve().halve().halve(); // V[13] = -1/2^4
    state[14] = total.dup() - x14.halve().halve().halve().halve().halve(); // V[14] = -1/2^5

    // V[15] = 1/2^32: multiply by the precomputed table entry.
    state[15] = total + x15 * MATRIX_DIAG_16_GOLDILOCKS[15];
}

/// The internal layers of the Poseidon2 permutation.
///
/// Stores the round constants for the internal rounds. The same type implements
/// `InternalLayer` for widths 8, 12, 16 and 20 in this file.
#[derive(Debug, Clone, Default)]
pub struct Poseidon2InternalLayerGoldilocks {
    /// Internal-round constants, forwarded to `internal_permute_state` on every call
    /// to `permute_state`.
    internal_constants: Vec<Goldilocks>,
}

impl InternalLayerConstructor<Goldilocks> for Poseidon2InternalLayerGoldilocks {
    /// Build the internal layer by storing the given round constants unchanged.
    fn new_from_constants(internal_constants: Vec<Goldilocks>) -> Self {
        Self { internal_constants }
    }
}

impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
    InternalLayer<A, 8, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
{
    /// Run the internal rounds of width-8 Poseidon2 on `state`, using the unrolled
    /// width-8 linear layer and the stored round constants.
    fn permute_state(&self, state: &mut [A; 8]) {
        let round_constants = &self.internal_constants;
        internal_permute_state(state, internal_layer_mat_mul_goldilocks_8, round_constants);
    }
}

impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
    InternalLayer<A, 12, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
{
    /// Run the internal rounds of width-12 Poseidon2 on `state`, using the unrolled
    /// width-12 linear layer and the stored round constants.
    fn permute_state(&self, state: &mut [A; 12]) {
        let round_constants = &self.internal_constants;
        internal_permute_state(state, internal_layer_mat_mul_goldilocks_12, round_constants);
    }
}

impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
    InternalLayer<A, 16, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
{
    /// Run the internal rounds of width-16 Poseidon2 on `state`, using the unrolled
    /// width-16 linear layer and the stored round constants.
    fn permute_state(&self, state: &mut [A; 16]) {
        let round_constants = &self.internal_constants;
        internal_permute_state(state, internal_layer_mat_mul_goldilocks_16, round_constants);
    }
}

impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
    InternalLayer<A, 20, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
{
    /// Run the internal rounds of width-20 Poseidon2 on `state`.
    ///
    /// Width 20 has no unrolled linear layer in this file, so the diffusion step goes
    /// through `matmul_internal` with `MATRIX_DIAG_20_GOLDILOCKS`.
    fn permute_state(&self, state: &mut [A; 20]) {
        let round_constants = &self.internal_constants;
        internal_permute_state(
            state,
            |lanes| matmul_internal(lanes, MATRIX_DIAG_20_GOLDILOCKS),
            round_constants,
        );
    }
}

/// The external layers of the Poseidon2 permutation.
///
/// Stores the constants for the initial and terminal external rounds of a
/// `WIDTH`-element state.
#[derive(Clone)]
pub struct Poseidon2ExternalLayerGoldilocks<const WIDTH: usize> {
    /// External-round constants; the initial and terminal sets are read through
    /// `get_initial_constants` and `get_terminal_constants` respectively.
    pub(crate) external_constants: ExternalLayerConstants<Goldilocks, WIDTH>,
}

impl<const WIDTH: usize> ExternalLayerConstructor<Goldilocks, WIDTH>
    for Poseidon2ExternalLayerGoldilocks<WIDTH>
{
    /// Build the external layer by storing the given round constants unchanged.
    fn new_from_constants(external_constants: ExternalLayerConstants<Goldilocks, WIDTH>) -> Self {
        Self { external_constants }
    }
}

impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>, const WIDTH: usize>
    ExternalLayer<A, WIDTH, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2ExternalLayerGoldilocks<WIDTH>
{
    /// Run the initial external rounds of Poseidon2 on `state`.
    ///
    /// Uses the stored initial constants, the generic add-round-constant-and-S-box
    /// step, and `MDSMat4`.
    fn permute_state_initial(&self, state: &mut [A; WIDTH]) {
        let constants = self.external_constants.get_initial_constants();
        external_initial_permute_state(state, constants, add_rc_and_sbox_generic, &MDSMat4);
    }

    /// Run the terminal external rounds of Poseidon2 on `state`.
    ///
    /// Uses the stored terminal constants, the generic add-round-constant-and-S-box
    /// step, and `MDSMat4`.
    fn permute_state_terminal(&self, state: &mut [A; WIDTH]) {
        let constants = self.external_constants.get_terminal_constants();
        external_terminal_permute_state(state, constants, add_rc_and_sbox_generic, &MDSMat4);
    }
}

/// An implementation of the matrix multiplications in the internal and external layers of Poseidon2.
///
/// This can act on `[A; WIDTH]` for any ring implementing `Algebra<Goldilocks>`.
/// If you have either `[Goldilocks::Packing; WIDTH]` or `[Goldilocks; WIDTH]` it will be much faster
/// to use `Poseidon2Goldilocks<WIDTH>` instead of building a Poseidon2 permutation using this.
///
/// This file provides `internal_linear_layer` for widths 8, 12, 16 and 20, each driven
/// by the matching `MATRIX_DIAG_*_GOLDILOCKS` table.
#[derive(Clone, Debug, Default)]
pub struct GenericPoseidon2LinearLayersGoldilocks;

impl GenericPoseidon2LinearLayers<8> for GenericPoseidon2LinearLayersGoldilocks {
    /// Replace every `state[i]` with `state[i] * V[i] + sum(state)`, where `V` is
    /// `MATRIX_DIAG_8_GOLDILOCKS` and the sum is taken over the input state.
    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 8]) {
        let total: R = state.iter().map(|x| x.dup()).sum();
        for (lane, diag) in state.iter_mut().zip(MATRIX_DIAG_8_GOLDILOCKS.iter()) {
            // Lift the stored `u64` of the diagonal entry into the target ring.
            *lane *= R::from_u64(diag.value);
            *lane += total.dup();
        }
    }
}

impl GenericPoseidon2LinearLayers<12> for GenericPoseidon2LinearLayersGoldilocks {
    /// Replace every `state[i]` with `state[i] * V[i] + sum(state)`, where `V` is
    /// `MATRIX_DIAG_12_GOLDILOCKS` and the sum is taken over the input state.
    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 12]) {
        let total: R = state.iter().map(|x| x.dup()).sum();
        for (lane, diag) in state.iter_mut().zip(MATRIX_DIAG_12_GOLDILOCKS.iter()) {
            // Lift the stored `u64` of the diagonal entry into the target ring.
            *lane *= R::from_u64(diag.value);
            *lane += total.dup();
        }
    }
}

impl GenericPoseidon2LinearLayers<16> for GenericPoseidon2LinearLayersGoldilocks {
    /// Replace every `state[i]` with `state[i] * V[i] + sum(state)`, where `V` is
    /// `MATRIX_DIAG_16_GOLDILOCKS` and the sum is taken over the input state.
    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 16]) {
        let total: R = state.iter().map(|x| x.dup()).sum();
        for (lane, diag) in state.iter_mut().zip(MATRIX_DIAG_16_GOLDILOCKS.iter()) {
            // Lift the stored `u64` of the diagonal entry into the target ring.
            *lane *= R::from_u64(diag.value);
            *lane += total.dup();
        }
    }
}

impl GenericPoseidon2LinearLayers<20> for GenericPoseidon2LinearLayersGoldilocks {
    /// Replace every `state[i]` with `state[i] * V[i] + sum(state)`, where `V` is
    /// `MATRIX_DIAG_20_GOLDILOCKS` and the sum is taken over the input state.
    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 20]) {
        let total: R = state.iter().map(|x| x.dup()).sum();
        for (lane, diag) in state.iter_mut().zip(MATRIX_DIAG_20_GOLDILOCKS.iter()) {
            // Lift the stored `u64` of the diagonal entry into the target ring.
            *lane *= R::from_u64(diag.value);
            *lane += total.dup();
        }
    }
}

#[cfg(test)]
mod tests {
    use p3_field::PrimeCharacteristicRing;
    use p3_symmetric::Permutation;

    use super::*;

    type F = Goldilocks;
    type Layers = GenericPoseidon2LinearLayersGoldilocks;

    /// Feed the state `[1, 2, ..., WIDTH]` through both the generic internal linear
    /// layer and `matmul_internal` with `diagonal`, and require identical outputs.
    fn check_generic_layer_against_matmul<const WIDTH: usize>(diagonal: [F; WIDTH])
    where
        Layers: GenericPoseidon2LinearLayers<WIDTH>,
    {
        let input: [F; WIDTH] = core::array::from_fn(|i| F::from_u64(i as u64 + 1));

        let mut via_generic = input;
        <Layers as GenericPoseidon2LinearLayers<WIDTH>>::internal_linear_layer(&mut via_generic);

        let mut via_matmul = input;
        matmul_internal(&mut via_matmul, diagonal);

        assert_eq!(via_generic, via_matmul);
    }

    #[test]
    fn test_generic_internal_linear_layer_8_matches_matmul_internal() {
        check_generic_layer_against_matmul(MATRIX_DIAG_8_GOLDILOCKS);
    }

    #[test]
    fn test_generic_internal_linear_layer_12_matches_matmul_internal() {
        check_generic_layer_against_matmul(MATRIX_DIAG_12_GOLDILOCKS);
    }

    #[test]
    fn test_generic_internal_linear_layer_16_matches_matmul_internal() {
        check_generic_layer_against_matmul(MATRIX_DIAG_16_GOLDILOCKS);
    }

    #[test]
    fn test_generic_internal_linear_layer_20_matches_matmul_internal() {
        check_generic_layer_against_matmul(MATRIX_DIAG_20_GOLDILOCKS);
    }

    /// Known-answer test: the default width-8 permutation applied to `[0, 1, ..., 7]`.
    #[test]
    fn test_default_goldilocks_poseidon2_width_8() {
        let mut state: [F; 8] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7]);

        let expected: [F; 8] = Goldilocks::new_array([
            0x020cf04a1b214d14,
            0x84e14aaaeacaed25,
            0x1ae0f640e81c7457,
            0xa4d204cbaeb0d8a5,
            0x0cf637b627b3a7ff,
            0x788d304d948b486b,
            0x7327133ea1949af4,
            0xf415abb924da395b,
        ]);

        default_goldilocks_poseidon2_8().permute_mut(&mut state);

        assert_eq!(state, expected);
    }

    /// Known-answer test: the default width-12 permutation applied to `[0, 1, ..., 11]`.
    #[test]
    fn test_default_goldilocks_poseidon2_width_12() {
        let mut state: [F; 12] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);

        let expected: [F; 12] = Goldilocks::new_array([
            0xf292ab67c0f14b03,
            0x0a32f1b37656544c,
            0x053c61ab895498de,
            0x02ff92e55b196ffb,
            0x58176e8f6f58cab2,
            0xb0aa1206e7aec0f8,
            0xe90c13f3dce83ca4,
            0xf4da15333edf39c2,
            0x23b701c053c2ca6c,
            0xd233d593dcdfbf58,
            0x4effa5f9516fb52e,
            0x0aaf4489f1f40166,
        ]);

        default_goldilocks_poseidon2_12().permute_mut(&mut state);

        assert_eq!(state, expected);
    }
}