shivini 0.155.9

use super::*;
use boojum_cuda::device_structures::{DeviceMatrixChunk, DeviceMatrixChunkMut};
pub use boojum_cuda::gates::GateEvaluationParams;
use boojum_cuda::{
    device_structures::{DeviceMatrix, DeviceMatrixMut},
    extension_field::VectorizedExtensionField,
};
use era_cudart::stream::CudaStreamWaitEventFlags;

#[allow(dead_code)]
pub fn assign_gate_selectors(
    _indexes: &[F],
    _gate_constants: &[F],
    _result: &mut [F],
    _stream: &CudaStream,
) -> CudaResult<()> {
    todo!()
}

#[allow(clippy::too_many_arguments)]
#[allow(clippy::assertions_on_constants)]
pub fn constraint_evaluation(
    gates: &[GateEvaluationParams],
    variable_columns: &[F],
    witness_columns: &[F],
    constant_columns: &[F],
    challenge: &DExt,
    challenge_power_offset: usize,
    quotient: &mut [F],
    domain_size: usize,
    is_specialized: bool,
) -> CudaResult<()> {
    assert_eq!(quotient.len(), 2 * domain_size);
    assert!(!gates.is_empty());

    let mut d_challenge = svec!(2);
    mem::d2d(&challenge.c0.inner[..], &mut d_challenge[..1])?;
    mem::d2d(&challenge.c1.inner[..], &mut d_challenge[1..])?;
    let challenge = unsafe { DeviceSlice::from_slice(&d_challenge[..]) };
    let challenge = unsafe { challenge.transmute::<VectorizedExtensionField>() };

    let variables_slice = unsafe { DeviceSlice::from_slice(variable_columns) };
    let witnesses_slice = unsafe { DeviceSlice::from_slice(witness_columns) };
    let constants_slice = unsafe { DeviceSlice::from_slice(constant_columns) };
    let quotient_slice = unsafe {
        DeviceSlice::from_mut_slice(quotient).transmute_mut::<VectorizedExtensionField>()
    };
    const STREAMS_COUNT: usize = 4;
    assert!(STREAMS_COUNT <= NUM_AUX_STREAMS_AND_EVENTS);
    const BLOCK_SIZE: usize = 128;
    let l2_size = _l2_cache_size();
    let (cc_major, cc_minor) = _compute_capability();
    let cols_count =
        (variables_slice.len() + witnesses_slice.len() + constants_slice.len()) / domain_size + 2;
    let chunk_rows =
        l2_size / (STREAMS_COUNT * size_of::<F>() * cols_count) / BLOCK_SIZE * BLOCK_SIZE;
    let split = if chunk_rows == 0 {
        1
    } else {
        (domain_size + chunk_rows - 1) / chunk_rows
    };
    if is_specialized || split == 1 || cc_major < 8 || (cc_major == 8 && cc_minor == 6) {
        let variable_columns_matrix = DeviceMatrix::new(variables_slice, domain_size);
        let witness_columns_matrix = DeviceMatrix::new(witnesses_slice, domain_size);
        let constant_columns_matrix = DeviceMatrix::new(constants_slice, domain_size);
        let mut quotient_matrix = DeviceMatrixMut::new(quotient_slice, domain_size);
        if_not_dry_run! {
            boojum_cuda::gates::evaluate_gates(
                gates,
                &variable_columns_matrix,
                &witness_columns_matrix,
                &constant_columns_matrix,
                challenge,
                &mut quotient_matrix,
                challenge_power_offset as u32,
                get_stream(),
            ).map(|_| ())
        }
    } else {
        if !is_dry_run()? {
            let events = &_aux_events()[0..STREAMS_COUNT];
            let streams = &_aux_streams()[0..STREAMS_COUNT];
            let main_stream = get_stream();
            events[0].record(main_stream)?;
            for stream in streams.iter() {
                stream.wait_event(&events[0], CudaStreamWaitEventFlags::DEFAULT)?;
            }
            for i in 0..split {
                let offset = i * chunk_rows;
                let rows = if i == split - 1 {
                    domain_size - offset
                } else {
                    chunk_rows
                };
                let variable_columns_matrix =
                    DeviceMatrixChunk::new(variables_slice, domain_size, offset, rows);
                let witness_columns_matrix =
                    DeviceMatrixChunk::new(witnesses_slice, domain_size, offset, rows);
                let constant_columns_matrix =
                    DeviceMatrixChunk::new(constants_slice, domain_size, offset, rows);
                let mut quotient_matrix =
                    DeviceMatrixChunkMut::new(quotient_slice, domain_size, offset, rows);
                let stream = &streams[i % STREAMS_COUNT];
                boojum_cuda::gates::evaluate_gates(
                    gates,
                    &variable_columns_matrix,
                    &witness_columns_matrix,
                    &constant_columns_matrix,
                    challenge,
                    &mut quotient_matrix,
                    challenge_power_offset as u32,
                    stream,
                )
                .map(|_| ())?;
            }
            for (event, stream) in events.iter().zip(streams.iter()) {
                event.record(stream)?;
                main_stream.wait_event(event, CudaStreamWaitEventFlags::DEFAULT)?;
            }
        }
        Ok(())
    }
}

#[allow(dead_code)]
pub fn constraint_evaluation_over_lde(
    gates: Vec<GateEvaluationParams>,
    variable_columns: &DVec<F>,
    witness_columns: &DVec<F>,
    constant_columns: &DVec<F>,
    challenge: &DExt,
    quotient: &mut DVec<F>,
    lde_size: usize,
) -> CudaResult<()> {
    assert_eq!(quotient.len(), 2 * lde_size);
    assert!(!gates.is_empty());

    let variable_columns_matrix = DeviceMatrix::new(
        unsafe { DeviceSlice::from_slice(variable_columns.as_ref()) },
        lde_size,
    );
    let witness_columns_matrix = DeviceMatrix::new(
        unsafe { DeviceSlice::from_slice(witness_columns.as_ref()) },
        lde_size,
    );
    let constant_columns_matrix = DeviceMatrix::new(
        unsafe { DeviceSlice::from_slice(constant_columns.as_ref()) },
        lde_size,
    );

    let mut d_challenge = svec!(2);
    mem::d2d(&challenge.c0.inner[..], &mut d_challenge[..1])?;
    mem::d2d(&challenge.c1.inner[..], &mut d_challenge[1..])?;
    let challenge = unsafe { DeviceSlice::from_slice(&d_challenge[..]) };
    let challenge = unsafe { challenge.transmute::<VectorizedExtensionField>() };

    let quotient = unsafe { DeviceSlice::from_mut_slice(quotient.as_mut()) };
    let mut quotient_matrix = DeviceMatrixMut::new(
        unsafe { quotient.transmute_mut::<VectorizedExtensionField>() },
        lde_size,
    );

    if_not_dry_run! {
        boojum_cuda::gates::evaluate_gates(
            &gates,
            &variable_columns_matrix,
            &witness_columns_matrix,
            &constant_columns_matrix,
            challenge,
            &mut quotient_matrix,
            0,
            get_stream(),
        ).map(|_| ())
    }
}