sp1-gpu-sys 6.3.0

FFI bindings and CUDA build system for SP1-GPU
Documentation
// Sequential lowering for the DAG-native zerocheck.
//
// Interprets a chunk's bytecode per row, computing one ext_t partial per
// (block, eval point). Mirrors the host-side `ChunkBytecode` layout from
// `sp1-gpu-air/src/ir/bytecode.rs`.

#pragma once

#include "config.cuh"
#include <cstdint>

// Must match `DagInstr` in sp1-gpu-air/src/ir/bytecode.rs.
struct DagInstr {
    uint8_t opcode;
    uint8_t _pad;
    uint16_t out;
    uint16_t a;
    uint16_t b;
};

// Source tag for `LeafRef.source`. Must match
// `LEAF_SOURCE_{PREPROCESSED,MAIN}_LOCAL` in
// `sp1-gpu-air/src/ir/bytecode.rs`. The (`PreprocessedNext`,
// `MainNext`) variants from the jagged-mle column tags would be 3 and 5
// but are never emitted here — constraint lowering only references local
// rows.
constexpr uint8_t LEAF_SOURCE_PREPROCESSED_LOCAL = 2;
constexpr uint8_t LEAF_SOURCE_MAIN_LOCAL         = 4;

// Must match `LeafRef` in sp1-gpu-air/src/ir/bytecode.rs.
struct LeafRef {
    uint8_t source;   // LEAF_SOURCE_PREPROCESSED_LOCAL / LEAF_SOURCE_MAIN_LOCAL
    uint8_t _pad;
    uint32_t col;
};

// Opcodes — must match `BcOp` in sp1-gpu-air/src/ir/bytecode.rs.
// Asserts aren't an opcode: they live in a separate per-chunk
// `(reg, alpha_idx)` array (`stc.asserts`) so the interpreter can iterate
// them after the bytecode body, summed against `powers_of_alpha`.
enum BcOp : uint8_t {
    BC_LOAD_LEAF   = 0,
    BC_LOAD_CONST  = 1,
    BC_LOAD_PUBLIC = 2,
    BC_ADD_F       = 3,
    BC_SUB_F       = 4,
    BC_MUL_F       = 5,
    BC_NEG_F       = 6,
};

// Shard-static per-chunk descriptor. Must match `ChunkStaticC` in
// zerocheck/src/prover.rs (layout-compat). Uploaded once per shard, reused
// across all rounds — none of its fields depend on the per-round trace fold.
struct ChunkStatic {
    const DagInstr* instrs;            // 8
    const LeafRef* leaves;             // 8
    const void* consts;                // 8 — cast to felt_t* in kernel
    const uint32_t* publics;           // 8
    const uint16_t* assert_regs;       // 8
    const uint32_t* assert_alphas;     // 8
    uint32_t n_instrs;                 // 4
    uint32_t n_asserts;                // 4
    uint32_t chip_idx;                 // 4
    /// Carrier-chunk inline-GKR widths. Set non-zero ONLY for narrow chips
    /// (total width ≤ WIDE_GKR_THRESHOLD) where keeping the column sweep
    /// inline preserves L1 cache locality with constraint leaf reads. Wide
    /// chips get GKR via the dedicated `zerocheck_gkr_sweep` kernel and
    /// have these zeroed at shard init.
    uint32_t gkr_main_width;           // 4
    uint32_t gkr_prep_width;           // 4
    uint32_t chip_alpha_offset;        // 4 — added to chip-relative alpha idx
};

// Per-round per-chip trace pointers + height. Must match `ChipLayoutC` in
// zerocheck/src/prover.rs (layout-compat). Indexed by chip_idx — the kernel
// reads `chip_layouts[chunk_static.chip_idx]` after reading the chunk.
struct ChipLayout {
    uint64_t main_ptr;                 // 8
    uint64_t preprocessed_ptr;         // 8
    uint32_t height;                   // 4
    uint32_t _pad;                     // 4
};

// Per-block dispatch entry. One per launched block; the kernel reads
// `dispatch[blockIdx.x]` once at block init and handles `n_rows` rows
// starting at `row_offset` of chunk `chunk_id`. Must match `BlockDispatchC`
// in zerocheck/src/prover.rs.
struct BlockDispatch {
    uint32_t chunk_id;                 // 4
    uint32_t row_offset;               // 4
    uint32_t n_rows;                   // 4
};

// Fused dispatch kernel. One launch handles every Sequential chunk tile
// produced by the host-side dispatch builder. Each block reads its
// `BlockDispatch` once at block init — no per-row binary search.
//
// Tiered variants by MAX_REGS — the launcher partitions chunks into tiers
// and launches one kernel per non-empty tier so each kernel's per-thread
// register array is sized to its tier's worst case (not the entire
// workload's worst case).
extern "C" void* zerocheck_fused_sequential_kb_32_kernel();
extern "C" void* zerocheck_fused_sequential_kb_64_kernel();
extern "C" void* zerocheck_fused_sequential_kb_128_kernel();
extern "C" void* zerocheck_fused_sequential_kb_256_kernel();
extern "C" void* zerocheck_fused_sequential_kb_512_kernel();
extern "C" void* zerocheck_fused_sequential_kb_1024_kernel();
extern "C" void* zerocheck_fused_sequential_ext_32_kernel();
extern "C" void* zerocheck_fused_sequential_ext_64_kernel();
extern "C" void* zerocheck_fused_sequential_ext_128_kernel();
extern "C" void* zerocheck_fused_sequential_ext_256_kernel();
extern "C" void* zerocheck_fused_sequential_ext_512_kernel();
extern "C" void* zerocheck_fused_sequential_ext_1024_kernel();