# ringkernel-cuda 0.4.0
#
# CUDA backend for RingKernel - NVIDIA GPU support via cudarc
# Documentation
# Crate metadata. Keys written as `<key>.workspace = true` take their value
# from the workspace root manifest instead of being set here.
[package]
name = "ringkernel-cuda"
version.workspace = true
authors.workspace = true
build = "build.rs"
categories = ["hardware-support"]
edition.workspace = true
homepage.workspace = true
keywords = ["gpu", "actor", "cuda", "nvidia"]
license.workspace = true
readme = "README.md"
repository.workspace = true
rust-version.workspace = true
description = "CUDA backend for RingKernel - NVIDIA GPU support via cudarc"

[dependencies]
# Sibling core crate; its specification comes from the workspace manifest.
ringkernel-core.workspace = true

# CUDA bindings. Optional: only compiled when the `cuda` feature is enabled.
# `cuda-version-from-build-system` auto-detects the CUDA version from the
# build system; `nvtx` enables NVTX profiling integration.
cudarc = { version = "0.18.2", features = ["cuda-version-from-build-system", "nvtx"], optional = true }

# Async runtime
tokio.workspace = true
async-trait.workspace = true
futures.workspace = true

# Error handling
thiserror.workspace = true

# Logging
tracing.workspace = true

# Synchronization
parking_lot.workspace = true

# Serialization for the profiling Chrome trace export.
# Optional: enabled by the `profiling` feature.
serde = { workspace = true, optional = true }
serde_json = { workspace = true, optional = true }

# Cryptographic hashing for the PTX cache.
# Optional: enabled by the `ptx-cache` feature.
sha2 = { version = "0.10", optional = true }

# Temp directory (for cache tests). Optional: enabled by the `ptx-cache` feature.
# NOTE(review): described as test-only, yet declared as a regular optional
# dependency - confirm whether it is used outside tests or belongs in
# [dev-dependencies].
tempfile = { workspace = true, optional = true }

[dev-dependencies]
# Extra tokio features needed only by dev builds (tests, examples, benches).
tokio = { workspace = true, features = ["macros", "rt-multi-thread", "test-util"] }

# GPU execution tests require the `cuda` feature to be enabled:
#   cargo test -p ringkernel-cuda --features cuda

[features]
# Nothing is enabled by default; every feature below is opt-in.
default = []
# Enables the optional `cudarc` dependency (CUDA bindings).
cuda = ["cudarc"]
# Cooperative groups support - requires nvcc at build time for PTX compilation
# Enables grid-wide synchronization via cuLaunchCooperativeKernel
# Implies `cuda`.
cooperative = ["cuda"]
# GPU profiling support - NVTX integration, CUDA events, Chrome trace export
# Implies `cuda` and enables the optional `serde` / `serde_json` dependencies.
profiling = ["cuda", "serde", "serde_json"]
# PTX compilation cache - eliminates first-tick kernel compilation overhead
# Implies `cuda` and enables the optional `sha2` / `tempfile` dependencies.
ptx-cache = ["cuda", "sha2", "tempfile"]