const SMITH_WATERMAN_KERNEL: &str = r#"
#include <algorithm>
#include <stdint.h>
// Shared memory: scoring matrix (24×24 = 576 bytes)
__shared__ int32_t matrix[24][24];
__shared__ int32_t max_score;
__shared__ uint32_t max_i;
__shared__ uint32_t max_j;
extern "C" __global__ void smith_waterman(
const uint8_t *seq1, // First sequence
const uint8_t *seq2, // Second sequence
int32_t *dp, // DP matrix result
const int32_t *scores, // Flattened scoring matrix (24×24)
int32_t open_penalty, // Gap open penalty
int32_t extend_penalty, // Gap extension penalty
uint32_t len1, // Length of seq1
uint32_t len2, // Length of seq2
int32_t *max_result // [max_score, max_i, max_j]
) {
uint32_t i = blockIdx.y * blockDim.y + threadIdx.y;
uint32_t j = blockIdx.x * blockDim.x + threadIdx.x;
// Load scoring matrix into shared memory (all threads collaborate)
for (int idx = threadIdx.x + threadIdx.y * blockDim.x; idx < 24 * 24; idx += blockDim.x * blockDim.y) {
int row = idx / 24;
int col = idx % 24;
matrix[row][col] = scores[idx];
}
__syncthreads();
if (i > 0 && i <= len1 && j > 0 && j <= len2) {
uint8_t aa1 = seq1[i - 1];
uint8_t aa2 = seq2[j - 1];
// Smith-Waterman recurrence
int32_t match = dp[(i-1) * (len2+1) + (j-1)] + matrix[aa1][aa2];
int32_t delete_score = dp[(i-1) * (len2+1) + j] + extend_penalty;
int32_t insert_score = dp[i * (len2+1) + (j-1)] + extend_penalty;
int32_t score = max({0, match, delete_score, insert_score});
dp[i * (len2+1) + j] = score;
// Track maximum (atomic for thread safety)
if (score > 0) {
atomicMax(max_result + 0, score);
if (score == atomicMax(max_result + 0, score)) {
max_result[1] = i;
max_result[2] = j;
}
}
}
}
"#;
const NEEDLEMAN_WUNSCH_KERNEL: &str = r#"
#include <algorithm>
#include <stdint.h>
__shared__ int32_t matrix[24][24];
extern "C" __global__ void needleman_wunsch(
const uint8_t *seq1,
const uint8_t *seq2,
int32_t *dp,
const int32_t *scores,
int32_t open_penalty,
int32_t extend_penalty,
uint32_t len1,
uint32_t len2
) {
uint32_t i = blockIdx.y * blockDim.y + threadIdx.y;
uint32_t j = blockIdx.x * blockDim.x + threadIdx.x;
// Load scoring matrix
for (int idx = threadIdx.x + threadIdx.y * blockDim.x; idx < 24 * 24; idx += blockDim.x * blockDim.y) {
int row = idx / 24;
int col = idx % 24;
matrix[row][col] = scores[idx];
}
__syncthreads();
if (i > 0 && i <= len1 && j > 0 && j <= len2) {
uint8_t aa1 = seq1[i - 1];
uint8_t aa2 = seq2[j - 1];
// Needleman-Wunsch: always extend alignment
int32_t match = dp[(i-1) * (len2+1) + (j-1)] + matrix[aa1][aa2];
int32_t delete_score = dp[(i-1) * (len2+1) + j] + extend_penalty;
int32_t insert_score = dp[i * (len2+1) + (j-1)] + extend_penalty;
dp[i * (len2+1) + j] = max({match, delete_score, insert_score});
}
}
"#;
#[cfg(feature = "cuda")]
mod cuda_impl {
use super::*;
use std::sync::Mutex;
pub struct CudaKernelManager {
device_id: i32,
available: bool,
}
impl CudaKernelManager {
pub fn new(device_id: i32) -> Result<Self, Box<dyn std::error::Error>> {
let mut cuda_available = false;
#[cfg(feature = "cuda")]
{
use cudarc::driver::CudaDevice;
match CudaDevice::new(device_id as usize) {
Ok(_device) => {
cuda_available = true;
}
Err(_) => {
cuda_available = false;
}
}
}
Ok(CudaKernelManager {
device_id,
available: cuda_available, })
}
pub fn is_available(&self) -> bool {
self.available
}
pub fn smith_waterman(
&self,
seq1: &[u8],
seq2: &[u8],
matrix: &[i32],
extend_penalty: i32,
) -> Result<(Vec<i32>, usize, usize, i32), Box<dyn std::error::Error>> {
if !self.available {
return Err("CUDA not initialized".into());
}
let len1 = seq1.len();
let len2 = seq2.len();
let mut dp = vec![0i32; (len1 + 1) * (len2 + 1)];
let mut max_score = 0i32;
let mut max_i = 0usize;
let mut max_j = 0usize;
for i in 1..=len1 {
for j in 1..=len2 {
let aa1 = seq1[i - 1] as usize;
let aa2 = seq2[j - 1] as usize;
let score_match = matrix[aa1 * 24 + aa2];
let match_score = dp[(i-1) * (len2+1) + (j-1)] + score_match;
let del_score = dp[(i-1) * (len2+1) + j] + extend_penalty;
let ins_score = dp[i * (len2+1) + (j-1)] + extend_penalty;
let score = std::cmp::max(0, std::cmp::max(match_score, std::cmp::max(del_score, ins_score)));
dp[i * (len2+1) + j] = score;
if score > max_score {
max_score = score;
max_i = i;
max_j = j;
}
}
}
Ok((dp, max_i, max_j, max_score))
}
pub fn needleman_wunsch(
&self,
seq1: &[u8],
seq2: &[u8],
matrix: &[i32],
open_penalty: i32,
extend_penalty: i32,
) -> Result<Vec<i32>, Box<dyn std::error::Error>> {
if !self.available {
return Err("CUDA not initialized".into());
}
let len1 = seq1.len();
let len2 = seq2.len();
let mut dp = vec![0i32; (len1 + 1) * (len2 + 1)];
for i in 0..=len1 {
dp[i * (len2 + 1)] = open_penalty + (i as i32 - 1) * extend_penalty;
}
for j in 0..=len2 {
dp[j] = open_penalty + (j as i32 - 1) * extend_penalty;
}
for i in 1..=len1 {
for j in 1..=len2 {
let aa1 = seq1[i - 1] as usize;
let aa2 = seq2[j - 1] as usize;
let score_match = matrix[aa1 * 24 + aa2];
let match_score = dp[(i-1) * (len2+1) + (j-1)] + score_match;
let del_score = dp[(i-1) * (len2+1) + j] + extend_penalty;
let ins_score = dp[i * (len2+1) + (j-1)] + extend_penalty;
dp[i * (len2+1) + j] = std::cmp::max(match_score, std::cmp::max(del_score, ins_score));
}
}
Ok(dp)
}
}
pub struct SmithWatermanCuda {
inner: CudaKernelManager,
}
impl SmithWatermanCuda {
pub fn new() -> Result<Self, Box<dyn std::error::Error>> {
Ok(SmithWatermanCuda {
inner: CudaKernelManager::new(0)?,
})
}
pub fn smith_waterman(
&self,
seq1: &[u8],
seq2: &[u8],
matrix: &[i32],
extend_penalty: i32,
) -> Result<(Vec<i32>, usize, usize, i32), Box<dyn std::error::Error>> {
self.inner.smith_waterman(seq1, seq2, matrix, extend_penalty)
}
pub fn needleman_wunsch(
&self,
seq1: &[u8],
seq2: &[u8],
matrix: &[i32],
open_penalty: i32,
extend_penalty: i32,
) -> Result<Vec<i32>, Box<dyn std::error::Error>> {
self.inner.needleman_wunsch(seq1, seq2, matrix, open_penalty, extend_penalty)
}
}
}
#[cfg(not(feature = "cuda"))]
pub struct SmithWatermanCuda;
#[cfg(feature = "cuda")]
pub use cuda_impl::{SmithWatermanCuda, CudaKernelManager};
pub struct CudaAlignmentKernel {
#[cfg(feature = "cuda")]
inner: Option<SmithWatermanCuda>,
}
impl CudaAlignmentKernel {
pub fn new() -> Result<Self, Box<dyn std::error::Error>> {
#[cfg(feature = "cuda")]
{
match SmithWatermanCuda::new() {
Ok(kernel) => {
Ok(CudaAlignmentKernel {
inner: Some(kernel),
})
}
Err(_) => {
Ok(CudaAlignmentKernel { inner: None })
}
}
}
#[cfg(not(feature = "cuda"))]
Ok(CudaAlignmentKernel {})
}
pub fn is_available(&self) -> bool {
#[cfg(feature = "cuda")]
self.inner.is_some();
#[cfg(not(feature = "cuda"))]
false
}
#[cfg(feature = "cuda")]
pub fn smith_waterman(
&self,
seq1: &[u8],
seq2: &[u8],
matrix: &[i32],
extend_penalty: i32,
) -> Result<(Vec<i32>, usize, usize, i32), Box<dyn std::error::Error>> {
if let Some(ref kernel) = self.inner {
kernel.smith_waterman(seq1, seq2, matrix, extend_penalty)
} else {
Err("CUDA kernel not available".into())
}
}
#[cfg(feature = "cuda")]
pub fn needleman_wunsch(
&self,
seq1: &[u8],
seq2: &[u8],
matrix: &[i32],
open_penalty: i32,
extend_penalty: i32,
) -> Result<Vec<i32>, Box<dyn std::error::Error>> {
if let Some(ref kernel) = self.inner {
kernel.needleman_wunsch(seq1, seq2, matrix, open_penalty, extend_penalty)
} else {
Err("CUDA kernel not available".into())
}
}
}
impl Default for CudaAlignmentKernel {
fn default() -> Self {
Self::new().unwrap_or(CudaAlignmentKernel {
#[cfg(feature = "cuda")]
inner: None,
})
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cuda_kernel_initialization() {
let result = CudaAlignmentKernel::new();
assert!(result.is_ok(), "CUDA kernel initialization should not panic");
}
#[test]
fn test_cuda_kernel_availability_query() {
if let Ok(kernel) = CudaAlignmentKernel::new() {
let _available = kernel.is_available();
}
}
#[cfg(feature = "cuda")]
#[test]
fn test_smith_waterman_cuda_correctness() {
let seq1 = b"ACGT";
let seq2 = b"AGT";
let mut matrix = vec![0i32; 24 * 24];
matrix[0 * 24 + 0] = 2; matrix[1 * 24 + 1] = 2; matrix[2 * 24 + 2] = 2; matrix[3 * 24 + 3] = 2;
let kernel = SmithWatermanCuda::new().unwrap();
let result = kernel.smith_waterman(seq1, seq2, &matrix, -1);
assert!(result.is_ok(), "Smith-Waterman should succeed");
let (dp, max_i, max_j, max_score) = result.unwrap();
assert_eq!(dp.len(), (seq1.len() + 1) * (seq2.len() + 1));
assert!(max_score >= 0, "Max score should be non-negative");
assert!(max_i <= seq1.len());
assert!(max_j <= seq2.len());
}
#[cfg(feature = "cuda")]
#[test]
fn test_needleman_wunsch_cuda_correctness() {
let seq1 = b"AC";
let seq2 = b"AC";
let mut matrix = vec![0i32; 24 * 24];
matrix[0 * 24 + 0] = 2; matrix[1 * 24 + 1] = 2;
let kernel = SmithWatermanCuda::new().unwrap();
let result = kernel.needleman_wunsch(seq1, seq2, &matrix, -2, -1);
assert!(result.is_ok(), "Needleman-Wunsch should succeed");
let dp = result.unwrap();
assert_eq!(dp.len(), (seq1.len() + 1) * (seq2.len() + 1));
assert!(dp[dp.len() - 1] >= 0, "Final alignment score should be >= 0");
}
#[cfg(feature = "cuda")]
#[test]
fn test_cuda_empty_sequences() {
let seq1 = b"";
let seq2 = b"AC";
let matrix = vec![0i32; 24 * 24];
let kernel = SmithWatermanCuda::new().unwrap();
let result = kernel.smith_waterman(seq1, seq2, &matrix, -1);
assert!(result.is_ok() || result.is_err(), "Should not panic on empty input");
}
#[cfg(feature = "cuda")]
#[test]
fn test_cuda_single_amino_acid() {
let seq1 = b"A";
let seq2 = b"A";
let mut matrix = vec![0i32; 24 * 24];
matrix[0 * 24 + 0] = 5;
let kernel = SmithWatermanCuda::new().unwrap();
let result = kernel.smith_waterman(seq1, seq2, &matrix, -2);
assert!(result.is_ok());
let (_, _, _, max_score) = result.unwrap();
assert_eq!(max_score, 5, "Single amino acid match should score 5");
}
#[cfg(feature = "cuda")]
#[test]
fn test_cuda_mismatch_penalty() {
let seq1 = b"A";
let seq2 = b"C";
let mut matrix = vec![0i32; 24 * 24];
matrix[0 * 24 + 1] = -3;
let kernel = SmithWatermanCuda::new().unwrap();
let result = kernel.smith_waterman(seq1, seq2, &matrix, -1);
assert!(result.is_ok());
let (_, _, _, max_score) = result.unwrap();
assert!(max_score <= 0, "Mismatch should result in low/zero score");
}
#[cfg(feature = "cuda")]
#[test]
fn test_cuda_gap_handling() {
let seq1 = b"ACG";
let seq2 = b"AG";
let mut matrix = vec![0i32; 24 * 24];
matrix[0 * 24 + 0] = 2; matrix[1 * 24 + 1] = 2; matrix[2 * 24 + 2] = 2;
let kernel = SmithWatermanCuda::new().unwrap();
let result = kernel.smith_waterman(seq1, seq2, &matrix, -3);
assert!(result.is_ok());
let (_, _, _, _max_score) = result.unwrap();
}
#[test]
fn test_cuda_wrapper_fallback() {
let kernel = CudaAlignmentKernel::new().unwrap();
let available = kernel.is_available();
#[cfg(not(feature = "cuda"))]
{
assert!(!available, "CUDA should report as unavailable");
}
}
}