boojum-cuda 0.156.2

#include "common.cuh"
#include "context.cuh"
#include "goldilocks.cuh"
#include "goldilocks_extension.cuh"
#include "memory.cuh"

namespace goldilocks {

using namespace memory;

// Inverts up to INV_BATCH elements with a single call to T::inv (batched inversion):
// a forward pass accumulates prefix products, the total product is inverted once, and a
// backward pass peels each individual inverse off using multiplications only.
//
// Template parameters:
//   T             - element type; must provide static one(), mul(a, b) and inv(a).
//   INV_BATCH     - compile-time capacity of both arrays and trip count of the unrolled loops.
//   batch_is_full - when true, all INV_BATCH slots are processed and runtime_batch_size is ignored.
// Parameters:
//   inputs               - elements to invert; only read.
//   fwd_scan_and_outputs - during the forward pass slot i temporarily holds the product of the
//                          inputs before it; on return slot i holds inv(inputs[i]).
//                          Must not overlap `inputs`: inputs[i] is read after slot i is written.
//   runtime_batch_size   - number of leading slots to process when batch_is_full is false.
//
// In a partial batch, slots at index >= runtime_batch_size are neither read nor written.
// NOTE(review): if any processed input is zero, the running product is zero and every output is
// derived from T::inv(zero); callers presumably guarantee non-zero inputs - confirm.
template <typename T, int INV_BATCH, bool batch_is_full>
DEVICE_FORCEINLINE void batch_inv_registers(const T *inputs, T *fwd_scan_and_outputs, int runtime_batch_size) {
  // A thread with no assigned elements has nothing to write, so bail out before paying for T::inv.
  // For a full batch this condition is a compile-time false and disappears.
  if (!batch_is_full && runtime_batch_size <= 0)
    return;

  // Forward pass: slot i receives the product of all processed inputs that precede it.
  T running_prod = T::one();
#pragma unroll
  for (int i = 0; i < INV_BATCH; i++)
    if (batch_is_full || i < runtime_batch_size) {
      fwd_scan_and_outputs[i] = running_prod;
      running_prod = T::mul(running_prod, inputs[i]);
    }

  // The only inversion: inverse of the product of every processed input.
  T inv = T::inv(running_prod);

  // Backward pass: walk from the last slot down so `inv` always equals the inverse of the
  // product of inputs[0..i] when slot i is visited.
#pragma unroll
  for (int i = INV_BATCH - 1; i >= 0; i--) {
    if (batch_is_full || i < runtime_batch_size) {
      const auto input = inputs[i];
      // Isolates and stores this input's inv: (prod of inputs[0..i-1]) * inv(prod of inputs[0..i])
      fwd_scan_and_outputs[i] = T::mul(fwd_scan_and_outputs[i], inv);
      // Removes this input's inv contribution; skipped for slot 0 because `inv` is not used again
      if (i > 0)
        inv = T::mul(inv, input);
    }
  }
}

} // namespace goldilocks