#[cfg(feature = "cuda")]
mod cuda;
#[cfg(feature = "cuda")]
pub use cuda::*;
/// CUDA source for level-synchronous BFS (compiled at runtime; kernels are
/// resolved by the `extern "C"` names `bfs_advance`, `bfs_init`, `bfs_copy`).
pub const BFS_KERNEL_SOURCE: &str = r#"
// Level-synchronous BFS kernels.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per node
// (ceil(num_nodes / blockDim.x) blocks).
// Graph layout: CSR with 64-bit row pointers and 32-bit column indices.
// Distances are int; -1 means unvisited.

// Advance frontier: every node whose distance equals current_level relaxes
// its neighbors into new_distances. Distances are double-buffered so reads
// of `distances` are race-free within one level iteration; the host must
// zero *frontier_size before each launch and copy new_distances back
// (see bfs_copy) between levels.
extern "C" __global__ void bfs_advance(
    const unsigned long long* row_ptr,   // CSR row pointers (num_nodes + 1 entries)
    const unsigned int* col_idx,         // CSR column indices
    const int* distances,                // current distances (read-only this level)
    int* new_distances,                  // next distances (must start as a copy of distances)
    int* frontier_size,                  // out: number of newly discovered nodes
    unsigned int num_nodes,
    int current_level
) {
    unsigned int node = blockIdx.x * blockDim.x + threadIdx.x;
    if (node >= num_nodes) return;
    // Only frontier nodes do work this iteration.
    if (distances[node] != current_level) return;
    unsigned long long row_start = row_ptr[node];
    unsigned long long row_end = row_ptr[node + 1];
    for (unsigned long long i = row_start; i < row_end; i++) {
        unsigned int neighbor = col_idx[i];
        // Skip neighbors already settled in an earlier level.
        if (distances[neighbor] == -1) {
            // CAS so exactly one discovering thread claims the neighbor
            // and bumps the frontier counter.
            int old = atomicCAS(&new_distances[neighbor], -1, current_level + 1);
            if (old == -1) {
                atomicAdd(frontier_size, 1);
            }
        }
    }
}

// Initialize distances: source nodes get 0, all others get -1.
//
// NOTE: the previous implementation wrote -1 from thread idx and 0 from a
// *different* thread (idx < num_sources) in the same launch, with no
// ordering between blocks -- a source node could end up -1 depending on
// scheduling. Each thread now decides its own node's value, so every
// element has exactly one writer. The scan over `sources` costs
// O(num_sources) per thread, fine for BFS's typically tiny source sets.
extern "C" __global__ void bfs_init(
    int* distances,
    const unsigned int* sources,   // node ids of the sources (each < num_nodes)
    unsigned int num_nodes,
    unsigned int num_sources
) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_nodes) return;
    int d = -1;
    for (unsigned int s = 0; s < num_sources; s++) {
        if (sources[s] == idx) {
            d = 0;
            break;
        }
    }
    distances[idx] = d;
}

// Copy distances into the double buffer for the next iteration.
extern "C" __global__ void bfs_copy(
    const int* src,
    int* dst,
    unsigned int n
) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        dst[idx] = src[idx];
    }
}
"#;
/// CUDA source for CSR SpMV and vector helpers (compiled at runtime; kernels
/// are resolved by the `extern "C"` names `spmv_csr`, `spmv_csr_axpby`,
/// `dot_product`, `vector_scale`, `vector_copy`).
pub const SPMV_KERNEL_SOURCE: &str = r#"
// Double-precision CSR sparse-matrix / dense-vector kernels.
// Launch layout: 1-D grid of 1-D blocks, one thread per row/element.

// CSR SpMV: y = alpha * A * x. One thread per row.
extern "C" __global__ void spmv_csr(
    const unsigned long long* row_ptr,   // CSR row pointers (num_rows + 1 entries)
    const unsigned int* col_idx,         // CSR column indices
    const double* values,                // CSR values (ignored when has_values == 0)
    const double* x,                     // input vector
    double* y,                           // output vector
    unsigned int num_rows,
    double alpha,
    int has_values                       // 1 if `values` is valid, 0 for an unweighted matrix
) {
    unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= num_rows) return;
    unsigned long long row_start = row_ptr[row];
    unsigned long long row_end = row_ptr[row + 1];
    double sum = 0.0;
    if (has_values) {
        for (unsigned long long i = row_start; i < row_end; i++) {
            sum += values[i] * x[col_idx[i]];
        }
    } else {
        // Unweighted: every stored entry contributes weight 1.0.
        for (unsigned long long i = row_start; i < row_end; i++) {
            sum += x[col_idx[i]];
        }
    }
    y[row] = alpha * sum;
}

// CSR SpMV with accumulation: y = alpha * A * x + beta * y.
extern "C" __global__ void spmv_csr_axpby(
    const unsigned long long* row_ptr,
    const unsigned int* col_idx,
    const double* values,
    const double* x,
    double* y,
    unsigned int num_rows,
    double alpha,
    double beta,
    int has_values
) {
    unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= num_rows) return;
    unsigned long long row_start = row_ptr[row];
    unsigned long long row_end = row_ptr[row + 1];
    double sum = 0.0;
    if (has_values) {
        for (unsigned long long i = row_start; i < row_end; i++) {
            sum += values[i] * x[col_idx[i]];
        }
    } else {
        for (unsigned long long i = row_start; i < row_end; i++) {
            sum += x[col_idx[i]];
        }
    }
    y[row] = alpha * sum + beta * y[row];
}

// Block-wise dot product: partial_sums[b] = sum over block b of x[i] * y[i].
// The host finishes the reduction over the gridDim.x partial sums.
// Valid for ANY blockDim.x up to 1024, including non-powers of two.
extern "C" __global__ void dot_product(
    const double* x,
    const double* y,
    double* partial_sums,   // one slot per block (gridDim.x entries)
    unsigned int n
) {
    // 1024 slots == CUDA's maximum threads per block, so every legal
    // launch fits. (The previous version declared 256 slots but indexed
    // with threadIdx.x, overflowing shared memory for blockDim.x > 256,
    // and its power-of-two halving loop dropped elements for other
    // block sizes.)
    __shared__ double sdata[1024];
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + tid;
    sdata[tid] = (idx < n) ? x[idx] * y[idx] : 0.0;
    __syncthreads();
    // Tree reduction correct for any block size: fold the upper
    // ceil(active/2) half onto the lower half until one value remains.
    // __syncthreads() stays outside the divergent branch so all threads
    // reach it.
    for (unsigned int active = blockDim.x; active > 1; ) {
        unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            sdata[tid] += sdata[tid + half];
        }
        __syncthreads();
        active = half;
    }
    if (tid == 0) {
        partial_sums[blockIdx.x] = sdata[0];
    }
}

// In-place vector scaling: x = x * scale.
extern "C" __global__ void vector_scale(
    double* x,
    unsigned int n,
    double scale
) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        x[idx] *= scale;
    }
}

// Element-wise vector copy: dst = src.
extern "C" __global__ void vector_copy(
    const double* src,
    double* dst,
    unsigned int n
) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        dst[idx] = src[idx];
    }
}
"#;