sp1-gpu-sys 6.2.0

FFI bindings and CUDA build system for SP1-GPU
cmake_minimum_required(VERSION 3.24)
project(sp1-gpu-cuda LANGUAGES CXX CUDA)

# A CUDA toolkit of version 12.0 or newer is mandatory; configuration stops
# here if none is found.
find_package(CUDAToolkit 12.0 REQUIRED)

# Mirror the toolkit's major/minor version under shorter names; the
# architecture selection further down branches on them.
set(CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION_MAJOR}")
set(CUDA_VERSION_MINOR "${CUDAToolkit_VERSION_MINOR}")

# Host (CXX) and device (CUDA) sources are both compiled as C++20, and the
# standard is a hard requirement rather than a preference.
foreach(sp1_gpu_lang IN ITEMS CXX CUDA)
    set(CMAKE_${sp1_gpu_lang}_STANDARD 20)
    set(CMAKE_${sp1_gpu_lang}_STANDARD_REQUIRED ON)
endforeach()

# Keep CUDA include paths directly on the compile command line instead of in
# a response file: clangd cannot parse the --options-file flag, so response
# files break IDE support.
set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES OFF)

# Architecture flags
#
# CUDA_ARCHS (optional, e.g. -DCUDA_ARCHS=80,90) is a comma-separated list of
# architectures that replaces the defaults below.
#
# An empty CUDA_ARCHS is treated the same as "not provided". Expanding an
# empty list into set() passes zero values, which unsets the variable instead
# of assigning it, so the defaults below would be skipped without any
# diagnostic.
#
# sm_100 / sm_120 (Blackwell) require CUDA 12.8+; older toolchains can't compile them.
if(DEFINED CUDA_ARCHS AND NOT "${CUDA_ARCHS}" STREQUAL "")
    # Convert comma-separated string to CMake list (semicolon-separated)
    string(REPLACE "," ";" CUDA_ARCHS_LIST "${CUDA_ARCHS}")
    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHS_LIST})
elseif((CUDA_VERSION_MAJOR EQUAL 12 AND CUDA_VERSION_MINOR GREATER_EQUAL 8)
        OR CUDA_VERSION_MAJOR GREATER_EQUAL 13)
    # Toolkit 12.8 or newer: include the Blackwell targets.
    set(CMAKE_CUDA_ARCHITECTURES 80 86 89 90 100 120)
else()
    # Older 12.x toolkits: stop at sm_90.
    set(CMAKE_CUDA_ARCHITECTURES 80 86 89 90)
endif()

# Build type defaults to Release.
#
# CMAKE_BUILD_TYPE only selects the configuration for single-config generators
# (Makefiles, Ninja). Multi-config generators choose the configuration at
# build time, so the default is applied only when the generator is
# single-config and the caller did not pick a build type.
get_property(sp1_gpu_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT sp1_gpu_is_multi_config AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()

# Common CUDA flags, appended so anything already present in CMAKE_CUDA_FLAGS
# (e.g. passed on the command line) is kept.
#   -default-stream=per-thread  use the per-thread default stream
#   --expt-relaxed-constexpr    relax constexpr host/device call restrictions
#   -lineinfo                   emit line-number information for device code
#   -Xcompiler -fopenmp         forward -fopenmp to the host compiler
string(APPEND CMAKE_CUDA_FLAGS " -default-stream=per-thread")
string(APPEND CMAKE_CUDA_FLAGS " --expt-relaxed-constexpr")
string(APPEND CMAKE_CUDA_FLAGS " -lineinfo")
string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler -fopenmp")
# -fPIC (not -fPIE): the static archive may be linked into a cdylib downstream.
string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler -fPIC")

# CMAKE_CUDA_FLAGS only reaches sources compiled as CUDA. The archive also
# contains objects built from plain C++ sources (the sppark .cpp files), so
# request position-independent code through CMake as well; this initializes
# POSITION_INDEPENDENT_CODE on every target created after this point, for all
# languages. The explicit -Xcompiler -fPIC above is kept so the flag is also
# present wherever CMAKE_CUDA_FLAGS alone is used.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Enable relocatable device code (-rdc=true) so device functions can be
# called across translation units. This variable seeds the
# CUDA_SEPARABLE_COMPILATION property of every target created after this
# point, including the per-module OBJECT libraries: each .cu file is compiled
# with -dc and the final STATIC target performs the device link.
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

# Per-configuration CUDA flags. These assignments replace, rather than extend,
# whatever value each variable held before.
#   Release: always -O3.
#   Debug:   -O3 as well, unless PROFILE_DEBUG_DATA is truthy, in which case
#            device debug info is generated with "-O0 -G" (-G implies -O0
#            device-side).
set(CMAKE_CUDA_FLAGS_RELEASE "-O3")
set(CMAKE_CUDA_FLAGS_DEBUG "-O3")
if(PROFILE_DEBUG_DATA)
    set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -G")
endif()

# Directory that holds this CMakeLists.txt; all project-relative paths below
# are built from it.
set(CSL_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")

# sp1_gpu_common is an INTERFACE library: it builds nothing itself and only
# carries the shared compile definitions and include paths. Targets pick them
# up by linking to it.
add_library(sp1_gpu_common INTERFACE)
target_compile_definitions(sp1_gpu_common INTERFACE
    SPPARK
    FEATURE_KOALA_BEAR
)
target_include_directories(sp1_gpu_common INTERFACE
    "${CSL_SOURCE_DIR}/include"
    "${CSL_SOURCE_DIR}/sppark"
    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
)

# Optional: expose the cbindgen include directory when the caller supplies
# one via CBINDGEN_INCLUDE_DIR.
if(DEFINED CBINDGEN_INCLUDE_DIR)
    target_include_directories(sp1_gpu_common INTERFACE ${CBINDGEN_INCLUDE_DIR})
endif()

# Static archives and libraries are written to target/cuda-build/lib under
# the project root.
set(sp1_gpu_lib_output_dir "${CSL_SOURCE_DIR}/target/cuda-build/lib")
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${sp1_gpu_lib_output_dir}")
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${sp1_gpu_lib_output_dir}")

# One entry per module. Module <name> lives in lib/<name> and its object
# library is referenced below as <name>_objs.
set(sp1_gpu_modules
    algebra
    basefold
    challenger
    experimental
    jagged_assist
    jagged_sumcheck
    logup_gkr
    merkle_tree
    mle
    ntt
    runtime
    scan
    sum_and_reduce
    tracegen
    transpose
    zerocheck
)

# Pull in every module's own CMakeLists.txt, in list order.
foreach(sp1_gpu_module IN LISTS sp1_gpu_modules)
    add_subdirectory("lib/${sp1_gpu_module}")
endforeach()

# Sppark is an external library and is kept apart from the modules above.
# Its sources are plain C++ files that still need the CUDA headers, which
# they get through sp1_gpu_common.
add_library(sppark_objs OBJECT
    "${CSL_SOURCE_DIR}/sppark/lib.cpp"
    "${CSL_SOURCE_DIR}/sppark/util/all_gpus.cpp"
)
target_link_libraries(sppark_objs PRIVATE sp1_gpu_common)

# Gather the object files of every module's object library, followed by
# sppark's, into a single list.
set(ALL_CUDA_OBJECTS "")
foreach(sp1_gpu_objs_owner IN LISTS sp1_gpu_modules ITEMS sppark)
    list(APPEND ALL_CUDA_OBJECTS "$<TARGET_OBJECTS:${sp1_gpu_objs_owner}_objs>")
endforeach()

# Static library assembled from all collected object files.
# CUDA_SEPARABLE_COMPILATION marks the target as using relocatable device
# code, and CUDA_RESOLVE_DEVICE_SYMBOLS makes this STATIC target run the
# device-link step itself.
add_library(sys-cuda STATIC ${ALL_CUDA_OBJECTS})
set_target_properties(sys-cuda PROPERTIES
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_RESOLVE_DEVICE_SYMBOLS ON
)

# With CUDA compilers older than 13.0, attach the CUDA runtime and NVTX
# libraries; nothing is attached for 13.0 and newer.
# NOTE(review): presumably 13.0+ needs no nvToolsExt because it is no longer
# shipped there - confirm.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0)
    target_link_libraries(sys-cuda PRIVATE CUDA::cudart)
    # FindCUDAToolkit only defines CUDA::nvToolsExt when the library is
    # actually present; linking a missing imported target is a hard
    # generate-time error, so only link it when it exists.
    if(TARGET CUDA::nvToolsExt)
        target_link_libraries(sys-cuda PRIVATE CUDA::nvToolsExt)
    endif()
endif()

# Install rule: the archive goes to <prefix>/lib.
install(TARGETS sys-cuda
    ARCHIVE DESTINATION lib
)