pub const SMITH_WATERMAN_KERNEL: &str = r#"
extern "C" __global__ void smith_waterman_kernel(
const unsigned char* seq1, // Query sequence
const unsigned char* seq2, // Subject sequence
const int* scoring_matrix, // 24x24 scoring matrix (row-major)
int* dp_matrix, // Output DP table
int* traceback, // Output traceback
int len1, // Query length
int len2, // Subject length
int gap_open, // Gap open penalty
int gap_extend, // Gap extend penalty
int* max_score, // Output: max score found
int* max_i, // Output: row of max
int* max_j // Output: col of max
) {
// Grid-stride loop pattern for 2D DP
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= len1 || j >= len2) return;
// Shared memory for scoring matrix (optimized access)
__shared__ int smatrix[24 * 24];
if (threadIdx.x < 24 && threadIdx.y < 24) {
smatrix[threadIdx.y * 24 + threadIdx.x] =
scoring_matrix[threadIdx.y * 24 + threadIdx.x];
}
__syncthreads();
int idx = i * len2 + j;
int score = 0;
int trace = 0; // 0=stop, 1=diag, 2=up, 3=left
if (i == 0 && j == 0) {
dp_matrix[idx] = 0;
traceback[idx] = 0;
return;
}
// Get amino acids (0-19 standard IUPAC)
int aa1 = seq1[i - 1];
int aa2 = seq2[j - 1];
// Match/mismatch score
int match_score = (i > 0 && j > 0)
? dp_matrix[(i-1) * len2 + (j-1)] + smatrix[aa1 * 24 + aa2]
: smatrix[aa1 * 24 + aa2];
// Deletion score (gap in seq2)
int del_score = (i > 0)
? dp_matrix[(i-1) * len2 + j] + gap_extend
: 0;
// Insertion score (gap in seq1)
int ins_score = (j > 0)
? dp_matrix[i * len2 + (j-1)] + gap_extend
: 0;
// Local alignment: can start fresh from 0
score = max({0, match_score, del_score, ins_score});
if (score == 0) trace = 0;
else if (score == match_score) trace = 1;
else if (score == del_score) trace = 2;
else trace = 3;
dp_matrix[idx] = score;
traceback[idx] = trace;
// Update global maximum with atomic operation
if (score > 0) {
atomicMax(max_score, score);
if (score == *max_score) {
atomicExch(max_i, i);
atomicExch(max_j, j);
}
}
}
"#;
pub const NEEDLEMAN_WUNSCH_KERNEL: &str = r#"
extern "C" __global__ void needleman_wunsch_kernel(
const unsigned char* seq1,
const unsigned char* seq2,
const int* scoring_matrix,
int* dp_matrix,
int* traceback,
int len1,
int len2,
int gap_open,
int gap_extend
) {
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
if (i > len1 || j > len2) return;
// Shared memory for scoring matrix
__shared__ int smatrix[24 * 24];
if (threadIdx.x < 24 && threadIdx.y < 24) {
smatrix[threadIdx.y * 24 + threadIdx.x] =
scoring_matrix[threadIdx.y * 24 + threadIdx.x];
}
__syncthreads();
int idx = i * (len2 + 1) + j;
int trace = 0;
// Initialize boundaries
if (i == 0) {
dp_matrix[idx] = j * gap_extend;
traceback[idx] = 3; // left
return;
}
if (j == 0) {
dp_matrix[idx] = i * gap_extend;
traceback[idx] = 2; // up
return;
}
// Get amino acids
int aa1 = seq1[i - 1];
int aa2 = seq2[j - 1];
// Compute DP recurrence
int match = dp_matrix[(i-1) * (len2 + 1) + (j-1)] +
smatrix[aa1 * 24 + aa2];
int del = dp_matrix[(i-1) * (len2 + 1) + j] + gap_extend;
int ins = dp_matrix[i * (len2 + 1) + (j-1)] + gap_extend;
int score = max({match, del, ins});
if (score == match) trace = 1;
else if (score == del) trace = 2;
else trace = 3;
dp_matrix[idx] = score;
traceback[idx] = trace;
}
"#;
pub const VITERBI_HMM_KERNEL: &str = r#"
// Dynamic shared memory macro for flexible allocation
extern "C" __global__ void viterbi_forward_kernel(
const unsigned char* sequence, // Encoded amino acids
float* dp_m, // Match state DP table (output)
float* dp_i, // Insert state DP table
float* dp_d, // Delete state DP table
const float* transitions, // m×3 transition matrix
const float* emissions, // 20×m emission matrix
int n, // Sequence length
int m // HMM length
) {
int pos = blockIdx.x * blockDim.x + threadIdx.x;
int state = blockIdx.y * blockDim.y + threadIdx.y;
if (pos >= n || state >= m) return;
int aa = sequence[pos];
// Dynamic shared memory: trans[m*3] + emis[20*m] floats
// Launch parameter: sharedMemSize = (m*3 + 20*m) * sizeof(float)
extern __shared__ float s_mem[];
float* trans = s_mem; // First m*3 elements
float* emis = &s_mem[m * 3]; // Next 20*m elements
// Load transition matrix into shared memory (all warps participate)
for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < m * 3; i += blockDim.x * blockDim.y) {
trans[i] = transitions[i];
}
// Load emission matrix into shared memory
for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < 20 * m; i += blockDim.x * blockDim.y) {
emis[i] = emissions[i];
}
__syncthreads();
float emit_score = emis[aa * m + state];
if (pos == 0) {
dp_m[state] = emit_score;
dp_i[state] = -1e6f;
dp_d[state] = -1e6f;
return;
}
// Previous states
float prev_m = dp_m[(pos-1) * m + state];
float prev_i = dp_i[(pos-1) * m + state];
float prev_d = dp_d[(pos-1) * m + state];
// Viterbi recurrence
float best_m = max({prev_m, prev_i, prev_d}) +
trans[state * 3 + 0] + emit_score;
float best_i = max({prev_m, prev_i, prev_d}) +
trans[state * 3 + 1] + emit_score;
float best_d = max({prev_m, prev_i, prev_d}) +
trans[state * 3 + 2];
dp_m[pos * m + state] = best_m;
dp_i[pos * m + state] = best_i;
dp_d[pos * m + state] = best_d;
}
"#;