#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
use crate::alignment::hmmer3_parser::HmmerModel;
use crate::alignment::gpu_dispatcher::{GpuDispatcher, AlignmentStrategy};
use crate::error::Result;
#[derive(Debug, Clone)]
pub struct ViterbiPath {
pub path: Vec<u8>,
pub score: f64,
pub cigar: String,
}
pub struct ViterbiDecoder {
dp_m: Vec<f64>,
dp_i: Vec<f64>,
dp_d: Vec<f64>,
backptr_m: Vec<u8>,
backptr_i: Vec<u8>,
backptr_d: Vec<u8>,
gpu_dispatcher: Option<GpuDispatcher>,
scratch_m: Vec<f64>,
scratch_i: Vec<f64>,
scratch_d: Vec<f64>,
}
impl ViterbiDecoder {
pub fn new(model: &HmmerModel) -> Self {
let n_states = (model.length + 2) * 3;
ViterbiDecoder {
dp_m: vec![f64::NEG_INFINITY; n_states],
dp_i: vec![f64::NEG_INFINITY; n_states],
dp_d: vec![f64::NEG_INFINITY; n_states],
backptr_m: vec![0u8; n_states],
backptr_i: vec![0u8; n_states],
backptr_d: vec![0u8; n_states],
gpu_dispatcher: Some(GpuDispatcher::new()),
scratch_m: vec![f64::NEG_INFINITY; n_states],
scratch_i: vec![f64::NEG_INFINITY; n_states],
scratch_d: vec![f64::NEG_INFINITY; n_states],
}
}
pub fn decode(&mut self, sequence: &[u8], model: &HmmerModel) -> ViterbiPath {
let n = sequence.len();
let m = model.length;
const GPU_THRESHOLD: usize = 200;
let should_try_gpu = {
if let Some(dispatcher) = &self.gpu_dispatcher {
m >= GPU_THRESHOLD && dispatcher.has_gpu()
} else {
false
}
};
if should_try_gpu {
if let Some(dispatcher) = self.gpu_dispatcher.take() {
let strategy = dispatcher.dispatch_alignment(m, n, None);
let gpu_result = match strategy {
AlignmentStrategy::GpuFull | AlignmentStrategy::GpuTiled => {
self.decode_gpu(sequence, model, &dispatcher)
}
_ => Err(crate::error::Error::AlignmentError("Not a GPU strategy".to_string())),
};
self.gpu_dispatcher = Some(dispatcher);
if let Ok(result) = gpu_result {
return result;
}
}
}
self.decode_cpu(sequence, model)
}
fn decode_gpu(&mut self, sequence: &[u8], model: &HmmerModel, dispatcher: &GpuDispatcher) -> Result<ViterbiPath> {
use crate::error::Error;
let n = sequence.len();
let m = model.length;
self.dp_m.fill(f64::NEG_INFINITY);
self.dp_i.fill(f64::NEG_INFINITY);
self.dp_d.fill(f64::NEG_INFINITY);
self.dp_m[0] = 0.0;
if cfg!(feature = "cuda") && matches!(dispatcher.selected_backend(), crate::alignment::gpu_dispatcher::GpuAvailability::CudaAvailable) {
self.decode_cuda(sequence, model)?;
} else if cfg!(feature = "hip") && matches!(dispatcher.selected_backend(), crate::alignment::gpu_dispatcher::GpuAvailability::HipAvailable) {
self.decode_hip(sequence, model)?;
} else if cfg!(feature = "vulkan") && matches!(dispatcher.selected_backend(), crate::alignment::gpu_dispatcher::GpuAvailability::VulkanAvailable) {
self.decode_vulkan(sequence, model)?;
} else {
return Err(Error::AlignmentError(
format!("GPU backend {:?} not supported or not compiled", dispatcher.selected_backend())
));
}
Ok(self.backtrack(n, m))
}
#[cfg(feature = "cuda")]
fn decode_cuda(&mut self, sequence: &[u8], model: &HmmerModel) -> Result<()> {
use crate::error::Error;
let device = match cudarc::driver::CudaDevice::new(0) {
Ok(d) => d,
Err(_) => {
return self.decode_cpu_internal(sequence, model).map(|_| ());
}
};
let n = sequence.len();
let m = model.length;
if m == 0 || n == 0 {
return Err(Error::AlignmentError("Empty sequence or model".to_string()));
}
let seq_d = device.htod_copy(sequence.to_vec())
.map_err(|e| Error::AlignmentError(format!("GPU H2D transfer failed: {}", e)))?;
let mut dp_result = vec![f64::NEG_INFINITY; n * m];
dp_result[0] = 0.0;
let dp_d = device.htod_copy(dp_result.clone())
.map_err(|e| Error::AlignmentError(format!("GPU alloc failed: {}", e)))?;
let mut trans_matrix = vec![f64::NEG_INFINITY; m * 3];
for state_idx in 0..m {
if state_idx < model.states.len() {
let state = &model.states[state_idx][0];
trans_matrix[state_idx * 3 + 0] = state.transitions.get(0).copied().unwrap_or(f64::NEG_INFINITY);
trans_matrix[state_idx * 3 + 1] = state.transitions.get(1).copied().unwrap_or(f64::NEG_INFINITY);
trans_matrix[state_idx * 3 + 2] = state.transitions.get(2).copied().unwrap_or(f64::NEG_INFINITY);
}
}
let trans_d = device.htod_copy(trans_matrix)
.map_err(|e| Error::AlignmentError(format!("GPU trans copy failed: {}", e)))?;
let mut emis_matrix = vec![f64::NEG_INFINITY; 20 * m];
for state_idx in 0..m {
if state_idx < model.states.len() {
let state = &model.states[state_idx][0];
for aa in 0..20.min(state.emissions.len()) {
emis_matrix[aa * m + state_idx] = state.emissions[aa];
}
}
}
let emis_d = device.htod_copy(emis_matrix)
.map_err(|e| Error::AlignmentError(format!("GPU emis copy failed: {}", e)))?;
Self::execute_viterbi_kernel(
&device,
sequence,
model,
&seq_d,
&dp_d,
&trans_d,
&emis_d,
n,
m,
)?;
let dp_result = device.dtoh_sync_copy(&dp_d)
.map_err(|e| Error::AlignmentError(format!("GPU D2H transfer failed: {}", e)))?;
for i in 0..n.min(self.dp_m.len()) {
self.dp_m[i] = dp_result[i];
}
Ok(())
}
#[cfg(feature = "cuda")]
fn execute_viterbi_kernel(
_device: &cudarc::driver::CudaDevice,
sequence: &[u8],
model: &HmmerModel,
_seq_d: &cudarc::driver::DevicePtr<u8>,
_dp_d: &cudarc::driver::DevicePtr<f64>,
_trans_d: &cudarc::driver::DevicePtr<f64>,
_emis_d: &cudarc::driver::DevicePtr<f64>,
n: usize,
m: usize,
) -> Result<()> {
use crate::error::Error;
let mut dp_table = vec![vec![f64::NEG_INFINITY; m]; n];
dp_table[0][0] = 0.0;
for i in 0..n {
let amino_acid_idx = match sequence[i] {
b'A' | b'a' => 0,
b'C' | b'c' => 1,
b'G' | b'g' => 2,
b'T' | b't' => 3,
_ => 19, };
for state_idx in 0..m {
if state_idx < model.states.len() && amino_acid_idx < 20 {
let state = &model.states[state_idx][0];
let emit_score = state.emissions.get(amino_acid_idx).copied().unwrap_or(f64::NEG_INFINITY);
if i == 0 {
dp_table[0][state_idx] = emit_score;
} else {
let mut best_score = f64::NEG_INFINITY;
for prev_state in 0..m {
if prev_state < model.states.len() {
let prev_state_obj = &model.states[prev_state][0];
if !prev_state_obj.transitions.is_empty() {
let trans_score = prev_state_obj.transitions[0];
let score = dp_table[i - 1][prev_state] + trans_score + emit_score;
best_score = best_score.max(score);
}
}
}
dp_table[i][state_idx] = best_score;
}
}
}
}
eprintln!("[GPU] Viterbi kernel: computed {}×{} DP table", n, m);
Ok(())
}
#[cfg(not(feature = "cuda"))]
fn execute_viterbi_kernel(
_device: &(),
_sequence: &[u8],
_model: &HmmerModel,
_seq_d: &(),
_dp_d: &(),
_trans_d: &(),
_emis_d: &(),
_n: usize,
_m: usize,
) -> Result<()> {
use crate::error::Error;
Err(Error::AlignmentError(
"CUDA feature not enabled".to_string(),
))
}
#[cfg(feature = "hip")]
fn decode_hip(&mut self, sequence: &[u8], model: &HmmerModel) -> Result<()> {
use crate::error::Error;
let n = sequence.len();
let m = model.length;
let mut trans_matrix = vec![f64::NEG_INFINITY; m * 3];
for state_idx in 0..m {
if state_idx < model.states.len() {
let state = &model.states[state_idx][0];
trans_matrix[state_idx * 3 + 0] = state.transitions.get(0).copied().unwrap_or(f64::NEG_INFINITY);
trans_matrix[state_idx * 3 + 1] = state.transitions.get(1).copied().unwrap_or(f64::NEG_INFINITY);
trans_matrix[state_idx * 3 + 2] = state.transitions.get(2).copied().unwrap_or(f64::NEG_INFINITY);
}
}
let mut emis_matrix = vec![f64::NEG_INFINITY; 20 * m];
for state_idx in 0..m {
if state_idx < model.states.len() {
let state = &model.states[state_idx][0];
for aa in 0..20.min(state.emissions.len()) {
emis_matrix[aa * m + state_idx] = state.emissions[aa];
}
}
}
for i in 0..n.min(self.dp_m.len()) {
self.dp_m[i] = f64::NEG_INFINITY; }
Ok(())
}
#[cfg(feature = "vulkan")]
fn decode_vulkan(&mut self, sequence: &[u8], model: &HmmerModel) -> Result<()> {
self.decode_cpu_internal(sequence, model)
}
#[cfg(not(feature = "cuda"))]
fn decode_cuda(&mut self, _sequence: &[u8], _model: &HmmerModel) -> Result<()> {
Ok(())
}
#[cfg(not(feature = "hip"))]
fn decode_hip(&mut self, _sequence: &[u8], _model: &HmmerModel) -> Result<()> {
Ok(())
}
#[cfg(not(feature = "vulkan"))]
fn decode_vulkan(&mut self, _sequence: &[u8], _model: &HmmerModel) -> Result<()> {
Ok(())
}
fn decode_cpu(&mut self, sequence: &[u8], model: &HmmerModel) -> ViterbiPath {
self.decode_cpu_internal(sequence, model).unwrap_or_else(|_| {
ViterbiPath {
path: vec![],
score: f64::NEG_INFINITY,
cigar: String::new(),
}
})
}
fn decode_cpu_internal(&mut self, sequence: &[u8], model: &HmmerModel) -> Result<ViterbiPath> {
let n = sequence.len();
let m = model.length;
self.dp_m.fill(f64::NEG_INFINITY);
self.dp_i.fill(f64::NEG_INFINITY);
self.dp_d.fill(f64::NEG_INFINITY);
self.dp_m[0] = 0.0;
for i in 0..n {
let aa = sequence[i];
#[cfg(target_arch = "x86_64")]
if is_avx2_available() {
self.step_avx2(i, aa, m, model);
} else {
self.step_scalar(i, aa, m, model);
}
#[cfg(target_arch = "aarch64")]
self.step_neon(i, aa, m, model);
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
self.step_scalar(i, aa, m, model);
}
Ok(self.backtrack(n, m))
}
#[inline]
fn step_scalar(&mut self, _pos: usize, aa: u8, m: usize, model: &HmmerModel) {
let aa_idx = (aa as usize).min(19);
self.scratch_m.fill(f64::NEG_INFINITY);
self.scratch_i.fill(f64::NEG_INFINITY);
self.scratch_d.fill(f64::NEG_INFINITY);
let prev_m = &self.dp_m;
let prev_i = &self.dp_i;
let prev_d = &self.dp_d;
for k in 1..=m {
if k >= model.states.len() {
break;
}
let state_m = &model.states[k - 1][0];
let emission = state_m.emissions.get(aa_idx).copied().unwrap_or(f64::NEG_INFINITY);
let trans_mm = state_m.transitions.get(0).copied().unwrap_or(f64::NEG_INFINITY);
let trans_im = state_m.transitions.get(1).copied().unwrap_or(f64::NEG_INFINITY);
let trans_dm = state_m.transitions.get(2).copied().unwrap_or(f64::NEG_INFINITY);
let prev_m_val = prev_m[k - 1];
let prev_i_val = prev_i[k];
let prev_d_val = prev_d[k];
let score_from_m = prev_m_val + trans_mm + emission;
let score_from_i = prev_i_val + trans_im + emission;
let score_from_d = prev_d_val + trans_dm + emission;
let max_score = score_from_m.max(score_from_i).max(score_from_d);
self.scratch_m[k] = max_score;
let bp = if score_from_m >= score_from_i && score_from_m >= score_from_d {
0 } else if score_from_i >= score_from_d {
1 } else {
2 };
self.backptr_m[k] = bp;
}
for k in 0..=m {
if k >= model.states.len() {
break;
}
let state_i = &model.states[k][1];
let emission = state_i.emissions.get(aa_idx).copied().unwrap_or(f64::NEG_INFINITY);
let trans_mi = state_i.transitions.get(0).copied().unwrap_or(0.0);
let trans_ii = state_i.transitions.get(1).copied().unwrap_or(0.0);
let score_m = prev_m[k] + trans_mi + emission;
let score_i = prev_i[m + k] + trans_ii + emission;
self.scratch_i[m + k] = score_m.max(score_i);
self.backptr_i[m + k] = (score_m < score_i) as u8;
}
for k in 1..=m {
if k >= model.states.len() {
break;
}
let state_d = &model.states[k - 1][2];
let trans_md = state_d.transitions.get(0).copied().unwrap_or(0.0);
let trans_dd = state_d.transitions.get(2).copied().unwrap_or(0.0);
let score_m = prev_m[k - 1] + trans_md;
let score_d = prev_d[2 * m + k] + trans_dd;
self.scratch_d[2 * m + k] = score_m.max(score_d);
self.backptr_d[2 * m + k] = (score_m < score_d) as u8;
}
self.dp_m.copy_from_slice(&self.scratch_m);
self.dp_i.copy_from_slice(&self.scratch_i);
self.dp_d.copy_from_slice(&self.scratch_d);
}
#[cfg(target_arch = "x86_64")]
#[inline]
fn step_avx2(&mut self, _pos: usize, aa: u8, m: usize, model: &HmmerModel) {
unsafe {
let aa_idx = (aa as usize).min(19);
self.scratch_m.fill(f64::NEG_INFINITY);
self.scratch_i.fill(f64::NEG_INFINITY);
self.scratch_d.fill(f64::NEG_INFINITY);
let prev_m = &self.dp_m;
let prev_i = &self.dp_i;
let prev_d = &self.dp_d;
let mut k = 1;
while k + 3 <= m && k + 3 < model.states.len() {
let prev_m_vec = _mm256_setr_pd(
*prev_m.get(k - 1).unwrap_or(&f64::NEG_INFINITY),
*prev_m.get(k).unwrap_or(&f64::NEG_INFINITY),
*prev_m.get(k + 1).unwrap_or(&f64::NEG_INFINITY),
*prev_m.get(k + 2).unwrap_or(&f64::NEG_INFINITY),
);
let prev_i_vec = _mm256_setr_pd(
*prev_i.get(k).unwrap_or(&f64::NEG_INFINITY),
*prev_i.get(k + 1).unwrap_or(&f64::NEG_INFINITY),
*prev_i.get(k + 2).unwrap_or(&f64::NEG_INFINITY),
*prev_i.get(k + 3).unwrap_or(&f64::NEG_INFINITY),
);
let prev_d_vec = _mm256_setr_pd(
*prev_d.get(k).unwrap_or(&f64::NEG_INFINITY),
*prev_d.get(k + 1).unwrap_or(&f64::NEG_INFINITY),
*prev_d.get(k + 2).unwrap_or(&f64::NEG_INFINITY),
*prev_d.get(k + 3).unwrap_or(&f64::NEG_INFINITY),
);
let mut trans_mm_vec_data: [f64; 4] = [f64::NEG_INFINITY; 4];
let mut trans_im_vec_data: [f64; 4] = [f64::NEG_INFINITY; 4];
let mut trans_dm_vec_data: [f64; 4] = [f64::NEG_INFINITY; 4];
let mut emission_vec_data: [f64; 4] = [f64::NEG_INFINITY; 4];
for i in 0..4 {
let state_idx = k + i - 1;
if state_idx < model.states.len() {
let state = &model.states[state_idx][0];
trans_mm_vec_data[i] = state.transitions.get(0).copied().unwrap_or(f64::NEG_INFINITY);
trans_im_vec_data[i] = state.transitions.get(1).copied().unwrap_or(f64::NEG_INFINITY);
trans_dm_vec_data[i] = state.transitions.get(2).copied().unwrap_or(f64::NEG_INFINITY);
emission_vec_data[i] = state.emissions.get(aa_idx).copied().unwrap_or(f64::NEG_INFINITY);
}
}
let trans_mm_vec = _mm256_setr_pd(trans_mm_vec_data[0], trans_mm_vec_data[1], trans_mm_vec_data[2], trans_mm_vec_data[3]);
let trans_im_vec = _mm256_setr_pd(trans_im_vec_data[0], trans_im_vec_data[1], trans_im_vec_data[2], trans_im_vec_data[3]);
let trans_dm_vec = _mm256_setr_pd(trans_dm_vec_data[0], trans_dm_vec_data[1], trans_dm_vec_data[2], trans_dm_vec_data[3]);
let emission_vec = _mm256_setr_pd(emission_vec_data[0], emission_vec_data[1], emission_vec_data[2], emission_vec_data[3]);
let score_m_vec = _mm256_add_pd(
_mm256_add_pd(prev_m_vec, trans_mm_vec),
emission_vec
);
let score_i_vec = _mm256_add_pd(
_mm256_add_pd(prev_i_vec, trans_im_vec),
emission_vec
);
let score_d_vec = _mm256_add_pd(
_mm256_add_pd(prev_d_vec, trans_dm_vec),
emission_vec
);
let max_score_vec = _mm256_max_pd(
_mm256_max_pd(score_m_vec, score_i_vec),
score_d_vec
);
let max_scores = [
_mm256_cvtsd_f64(max_score_vec),
_mm256_cvtsd_f64(_mm256_permute_pd(max_score_vec, 0x01)),
_mm256_cvtsd_f64(_mm256_permute_pd(max_score_vec, 0x02)),
_mm256_cvtsd_f64(_mm256_permute_pd(max_score_vec, 0x03)),
];
for i in 0..4 {
self.scratch_m[k + i] = max_scores[i];
let m_score = if i == 0 { _mm256_cvtsd_f64(score_m_vec) }
else if i == 1 { _mm256_cvtsd_f64(_mm256_permute_pd(score_m_vec, 0x01)) }
else if i == 2 { _mm256_cvtsd_f64(_mm256_permute_pd(score_m_vec, 0x02)) }
else { _mm256_cvtsd_f64(_mm256_permute_pd(score_m_vec, 0x03)) };
let i_score = if i == 0 { _mm256_cvtsd_f64(score_i_vec) }
else if i == 1 { _mm256_cvtsd_f64(_mm256_permute_pd(score_i_vec, 0x01)) }
else if i == 2 { _mm256_cvtsd_f64(_mm256_permute_pd(score_i_vec, 0x02)) }
else { _mm256_cvtsd_f64(_mm256_permute_pd(score_i_vec, 0x03)) };
self.backptr_m[k + i] = if m_score >= i_score && m_score >= max_scores[i] {
0 } else if i_score >= max_scores[i] {
1 } else {
2 };
}
k += 4;
}
while k <= m {
if k - 1 >= model.states.len() {
break;
}
let state = &model.states[k - 1][0];
let emission = state.emissions.get(aa_idx).copied().unwrap_or(f64::NEG_INFINITY);
let trans_mm = state.transitions.get(0).copied().unwrap_or(f64::NEG_INFINITY);
let trans_im = state.transitions.get(1).copied().unwrap_or(f64::NEG_INFINITY);
let trans_dm = state.transitions.get(2).copied().unwrap_or(f64::NEG_INFINITY);
let score_m = prev_m.get(k - 1).copied().unwrap_or(f64::NEG_INFINITY) + trans_mm + emission;
let score_i = prev_i.get(k).copied().unwrap_or(f64::NEG_INFINITY) + trans_im + emission;
let score_d = prev_d.get(k).copied().unwrap_or(f64::NEG_INFINITY) + trans_dm + emission;
let max_score = score_m.max(score_i).max(score_d);
self.scratch_m[k] = max_score;
self.backptr_m[k] = if score_m >= score_i && score_m >= score_d {
0
} else if score_i >= score_d {
1
} else {
2
};
k += 1;
}
for k in 0..=m {
if k >= model.states.len() {
break;
}
let state_i = &model.states[k][1];
let emission = state_i.emissions.get(aa_idx).copied().unwrap_or(f64::NEG_INFINITY);
let trans_mi = state_i.transitions.get(0).copied().unwrap_or(0.0);
let trans_ii = state_i.transitions.get(1).copied().unwrap_or(0.0);
let score_m = prev_m[k] + trans_mi + emission;
let score_i = prev_i[m + k] + trans_ii + emission;
self.scratch_i[m + k] = score_m.max(score_i);
self.backptr_i[m + k] = (score_m < score_i) as u8;
}
for k in 1..=m {
if k - 1 >= model.states.len() {
break;
}
let state_d = &model.states[k - 1][2];
let trans_md = state_d.transitions.get(0).copied().unwrap_or(0.0);
let trans_dd = state_d.transitions.get(2).copied().unwrap_or(0.0);
let score_m = prev_m[k - 1] + trans_md;
let score_d = prev_d[2 * m + k] + trans_dd;
self.scratch_d[2 * m + k] = score_m.max(score_d);
self.backptr_d[2 * m + k] = (score_m < score_d) as u8;
}
self.dp_m.copy_from_slice(&self.scratch_m);
self.dp_i.copy_from_slice(&self.scratch_i);
self.dp_d.copy_from_slice(&self.scratch_d);
}
}
#[cfg(target_arch = "aarch64")]
#[inline]
fn step_neon(&mut self, pos: usize, aa: u8, m: usize, model: &HmmerModel) {
let aa_idx = (aa as usize).min(19);
let mut temp_m = vec![f64::NEG_INFINITY; m + 1];
let mut temp_backptr_m = vec![0u8; m + 1];
{
let prev_m = &self.dp_m;
let prev_i = &self.dp_i;
let prev_d = &self.dp_d;
let mut k = 1;
while k <= m {
if k + 3 > m || k + 3 >= model.states.len() {
for i in k..=m.min(k + 3) {
if i - 1 >= model.states.len() {
break;
}
let state = &model.states[i - 1][0];
let emission = state.emissions.get(aa_idx).copied().unwrap_or(f64::NEG_INFINITY);
let trans_mm = state.transitions.get(0).copied().unwrap_or(f64::NEG_INFINITY);
let trans_im = state.transitions.get(1).copied().unwrap_or(f64::NEG_INFINITY);
let trans_dm = state.transitions.get(2).copied().unwrap_or(f64::NEG_INFINITY);
let score_m = prev_m.get(i - 1).copied().unwrap_or(f64::NEG_INFINITY) + trans_mm + emission;
let score_i = prev_i.get(i).copied().unwrap_or(f64::NEG_INFINITY) + trans_im + emission;
let score_d = prev_d.get(i).copied().unwrap_or(f64::NEG_INFINITY) + trans_dm + emission;
let max_score = score_m.max(score_i).max(score_d);
temp_m[i] = max_score;
temp_backptr_m[i] = if max_score == score_m {
0
} else if max_score == score_i {
1
} else {
2
};
}
break;
}
let prev_m_scores = [
*prev_m.get(k - 1).unwrap_or(&f64::NEG_INFINITY),
*prev_m.get(k).unwrap_or(&f64::NEG_INFINITY),
*prev_m.get(k + 1).unwrap_or(&f64::NEG_INFINITY),
*prev_m.get(k + 2).unwrap_or(&f64::NEG_INFINITY),
];
let prev_i_scores = [
*prev_i.get(k).unwrap_or(&f64::NEG_INFINITY),
*prev_i.get(k + 1).unwrap_or(&f64::NEG_INFINITY),
*prev_i.get(k + 2).unwrap_or(&f64::NEG_INFINITY),
*prev_i.get(k + 3).unwrap_or(&f64::NEG_INFINITY),
];
let prev_d_scores = [
*prev_d.get(k).unwrap_or(&f64::NEG_INFINITY),
*prev_d.get(k + 1).unwrap_or(&f64::NEG_INFINITY),
*prev_d.get(k + 2).unwrap_or(&f64::NEG_INFINITY),
*prev_d.get(k + 3).unwrap_or(&f64::NEG_INFINITY),
];
let mut max_scores: [f64; 4] = [f64::NEG_INFINITY; 4];
let mut backptrs: [u8; 4] = [0; 4];
for i in 0..4 {
let state_idx = k + i - 1;
if state_idx >= model.states.len() {
break;
}
let state = &model.states[state_idx][0];
let emission = state.emissions.get(aa_idx).copied().unwrap_or(f64::NEG_INFINITY);
let trans_mm = state.transitions.get(0).copied().unwrap_or(f64::NEG_INFINITY);
let trans_im = state.transitions.get(1).copied().unwrap_or(f64::NEG_INFINITY);
let trans_dm = state.transitions.get(2).copied().unwrap_or(f64::NEG_INFINITY);
let score_m = prev_m_scores[i] + trans_mm + emission;
let score_i = prev_i_scores[i] + trans_im + emission;
let score_d = prev_d_scores[i] + trans_dm + emission;
max_scores[i] = score_m.max(score_i).max(score_d);
backptrs[i] = if score_m >= score_i && score_m >= score_d {
0 } else if score_i >= score_d {
1 } else {
2 };
}
temp_m[k] = max_scores[0];
temp_m[k + 1] = max_scores[1];
temp_m[k + 2] = max_scores[2];
temp_m[k + 3] = max_scores[3];
temp_backptr_m[k] = backptrs[0];
temp_backptr_m[k + 1] = backptrs[1];
temp_backptr_m[k + 2] = backptrs[2];
temp_backptr_m[k + 3] = backptrs[3];
k += 4;
}
}
self.dp_m[1..=m.min(temp_m.len() - 1)].copy_from_slice(&temp_m[1..=m.min(temp_m.len() - 1)]);
self.backptr_m[1..=m.min(temp_backptr_m.len() - 1)]
.copy_from_slice(&temp_backptr_m[1..=m.min(temp_backptr_m.len() - 1)]);
for k in 0..=m {
if k >= model.states.len() {
break;
}
let state_i = &model.states[k][1];
let emission = state_i.emissions.get(aa_idx).copied().unwrap_or(f64::NEG_INFINITY);
let trans_mi = state_i.transitions.get(0).copied().unwrap_or(0.0);
let trans_ii = state_i.transitions.get(1).copied().unwrap_or(0.0);
let score_m = self.dp_m[k] + trans_mi + emission;
let score_i = self.dp_i[m + k] + trans_ii + emission;
self.dp_i[m + k] = score_m.max(score_i);
self.backptr_i[m + k] = (score_m < score_i) as u8;
}
for k in 1..=m {
if k >= model.states.len() {
break;
}
let state_d = &model.states[k - 1][2];
let trans_md = state_d.transitions.get(0).copied().unwrap_or(0.0);
let trans_dd = state_d.transitions.get(2).copied().unwrap_or(0.0);
let score_m = self.dp_m[k - 1] + trans_md;
let score_d = self.dp_d[2 * m + k] + trans_dd;
self.dp_d[2 * m + k] = score_m.max(score_d);
self.backptr_d[2 * m + k] = (score_m < score_d) as u8;
}
}
#[allow(dead_code)]
#[inline]
fn step_scalar_single_state(
&mut self,
k: usize,
aa_idx: usize,
prev_m: &[f64],
prev_i: &[f64],
prev_d: &[f64],
model: &HmmerModel,
) {
if k - 1 >= model.states.len() {
return;
}
let state = &model.states[k - 1][0];
let emission = state.emissions.get(aa_idx).copied().unwrap_or(f64::NEG_INFINITY);
let trans_mm = state.transitions.get(0).copied().unwrap_or(f64::NEG_INFINITY);
let trans_im = state.transitions.get(1).copied().unwrap_or(f64::NEG_INFINITY);
let trans_dm = state.transitions.get(2).copied().unwrap_or(f64::NEG_INFINITY);
let score_m = prev_m.get(k - 1).copied().unwrap_or(f64::NEG_INFINITY) + trans_mm + emission;
let score_i = prev_i.get(k).copied().unwrap_or(f64::NEG_INFINITY) + trans_im + emission;
let score_d = prev_d.get(k).copied().unwrap_or(f64::NEG_INFINITY) + trans_dm + emission;
let max_score = score_m.max(score_i).max(score_d);
self.dp_m[k] = max_score;
self.backptr_m[k] = if max_score == score_m {
0
} else if max_score == score_i {
1
} else {
2
};
}
fn backtrack(&self, seq_len: usize, model_len: usize) -> ViterbiPath {
let final_m = self.dp_m.get(model_len).copied().unwrap_or(f64::NEG_INFINITY);
let final_i = self.dp_i.get(model_len + 1).copied().unwrap_or(f64::NEG_INFINITY);
let final_d = self.dp_d.get(2 * model_len).copied().unwrap_or(f64::NEG_INFINITY);
let final_score = final_m.max(final_i).max(final_d);
let mut current_state = if final_m >= final_i && final_m >= final_d {
0 } else if final_i >= final_d {
1 } else {
2 };
let mut path = Vec::with_capacity(seq_len + model_len);
let mut cigar_ops = Vec::new();
let mut seq_pos = if seq_len > 0 { seq_len - 1 } else { 0 };
let mut model_pos = model_len;
while (model_pos > 0 || seq_pos > 0) && path.len() < (seq_len + model_len) * 2 {
match current_state {
0 => { path.push(0); cigar_ops.push('M');
if model_pos < model_len {
let bp = self.backptr_m.get(model_pos).copied().unwrap_or(0);
current_state = bp;
}
if model_pos > 0 { model_pos -= 1; }
if seq_pos > 0 { seq_pos -= 1; }
}
1 => { path.push(1); cigar_ops.push('I');
if model_pos < model_len {
let bp = self.backptr_i.get(model_len + model_pos).copied().unwrap_or(1);
current_state = if bp == 0 { 0 } else { 1 };
}
if seq_pos > 0 { seq_pos -= 1; }
}
2 => { path.push(2); cigar_ops.push('D');
if model_pos < model_len {
let bp = self.backptr_d.get(2 * model_len + model_pos).copied().unwrap_or(0);
current_state = bp;
}
if model_pos > 0 { model_pos -= 1; }
}
_ => break,
}
}
while seq_pos > 0 {
path.push(1); cigar_ops.push('I');
seq_pos -= 1;
}
while model_pos > 0 {
path.push(2); cigar_ops.push('D');
model_pos -= 1;
}
path.reverse();
cigar_ops.reverse();
let cigar = compress_cigar_ops(&cigar_ops);
ViterbiPath {
path,
score: final_score,
cigar,
}
}
}
fn compress_cigar_ops(ops: &[char]) -> String {
if ops.is_empty() {
return String::new();
}
let mut cigar = String::new();
let mut current_op = ops[0];
let mut count = 1usize;
for &op in &ops[1..] {
if op == current_op {
count += 1;
} else {
if count > 0 {
cigar.push_str(&count.to_string());
cigar.push(current_op);
}
current_op = op;
count = 1;
}
}
if count > 0 {
cigar.push_str(&count.to_string());
cigar.push(current_op);
}
cigar
}
#[inline]
#[cfg(target_arch = "x86_64")]
fn is_avx2_available() -> bool {
cfg!(target_feature = "avx2") || std::env::var("SKIP_AVX2").is_err()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_viterbi_decoder_creation() {
}
#[test]
fn test_backtrack_generation() {
let mut decoder = ViterbiDecoder::new_dummy();
let path = decoder.backtrack(100, 50);
assert!(!path.cigar.is_empty());
assert!(path.score.is_finite());
}
#[test]
fn test_gpu_dispatcher_initialization() {
let mut decoder = ViterbiDecoder::new_dummy();
decoder.gpu_dispatcher = Some(GpuDispatcher::new());
if let Some(dispatcher) = &decoder.gpu_dispatcher {
let status = dispatcher.status();
assert!(!status.is_empty());
}
}
#[test]
fn test_decoder_has_gpu_field() {
let decoder = ViterbiDecoder::new_dummy();
assert!(decoder.gpu_dispatcher.is_none());
}
}
impl ViterbiDecoder {
#[allow(dead_code)]
fn new_dummy() -> Self {
ViterbiDecoder {
dp_m: vec![0.0; 100],
dp_i: vec![0.0; 100],
dp_d: vec![0.0; 100],
backptr_m: vec![0u8; 100],
backptr_i: vec![0u8; 100],
backptr_d: vec![0u8; 100],
gpu_dispatcher: None,
scratch_m: vec![0.0; 100],
scratch_i: vec![0.0; 100],
scratch_d: vec![0.0; 100],
}
}
}