use std::sync::Arc;
use crate::genomics::alignment::{banded_affine_align, AffineScores};
use crate::genomics::{BlockedFMIndex, CigarOp, FMInterval};
use thiserror::Error;
#[derive(Debug, Clone)]
pub struct AlignmentResult {
pub interval: FMInterval,
pub primary_position: Option<u32>,
pub is_reverse: bool,
pub cigar: Vec<CigarOp>,
pub nm: u32,
pub as_score: i32,
pub md: String,
pub mapq: u8,
pub processed_bases: usize,
pub mismatches: usize,
pub span: (usize, usize),
pub score: f32,
}
impl AlignmentResult {
pub fn has_candidates(&self) -> bool {
self.primary_position.is_some() || !self.interval.is_empty()
}
}
#[derive(Debug, Clone, Copy)]
struct SeedConfig {
k: usize,
stride: usize,
max_hits_per_seed: usize,
diagonal_tolerance: i32,
}
#[derive(Debug, Error)]
pub enum AlignerError {
#[error("fm-index error: {0}")]
FMIndex(#[from] crate::genomics::FMIndexError),
}
#[derive(Debug)]
pub struct BWTAligner {
fm_index: Arc<BlockedFMIndex>,
reference: Arc<[u8]>,
seed_cfg: SeedConfig,
}
impl BWTAligner {
pub fn new(reference: &[u8]) -> Result<Self, AlignerError> {
let suggested_block_size = ((reference.len() as f64).sqrt().ceil() as usize).max(64);
let fm_index = Arc::new(BlockedFMIndex::build(reference, suggested_block_size)?);
let reference: Arc<[u8]> = Arc::from(
reference
.iter()
.map(|b| b.to_ascii_uppercase())
.collect::<Vec<u8>>()
.into_boxed_slice(),
);
Ok(Self {
fm_index,
reference,
seed_cfg: SeedConfig {
k: 19,
stride: 5,
max_hits_per_seed: 64,
diagonal_tolerance: 3,
},
})
}
pub fn align_read(&mut self, read: &[u8]) -> Result<AlignmentResult, AlignerError> {
let forward = self.align_read_one_strand(read, false);
let reverse = {
let rc = reverse_complement(read);
self.align_read_one_strand(&rc, true)
};
Ok(pick_best(forward, reverse))
}
pub fn align_batch<I>(&mut self, reads: I) -> Result<Vec<AlignmentResult>, AlignerError>
where
I: IntoIterator,
I::Item: AsRef<[u8]>,
{
let mut results = Vec::new();
for read in reads {
results.push(self.align_read(read.as_ref())?);
}
Ok(results)
}
pub fn fm_index(&self) -> &BlockedFMIndex {
&self.fm_index
}
fn align_read_one_strand(&self, read: &[u8], is_reverse: bool) -> AlignmentResult {
let read_upper: Vec<u8> = read.iter().map(|b| b.to_ascii_uppercase()).collect();
let interval = self.fm_index.backward_search(&read_upper);
let mut hits = Vec::new();
if read_upper.len() >= self.seed_cfg.k {
for read_pos in (0..=read_upper.len() - self.seed_cfg.k).step_by(self.seed_cfg.stride) {
let kmer = &read_upper[read_pos..read_pos + self.seed_cfg.k];
if kmer.contains(&b'N') {
continue;
}
let k_interval = self.fm_index.backward_search(kmer);
if k_interval.is_empty() {
continue;
}
let locs = self
.fm_index
.locate_interval(k_interval, self.seed_cfg.max_hits_per_seed);
for ref_pos in locs {
hits.push(SeedHit {
ref_pos: ref_pos as i64,
read_pos: read_pos as i64,
len: self.seed_cfg.k as i32,
});
}
}
}
let candidate_start = if hits.is_empty() {
if interval.is_empty() {
None
} else {
Some(self.fm_index.sa_at(interval.lower as usize))
}
} else {
chain_hits(&hits, self.seed_cfg.diagonal_tolerance).map(|c| c as usize)
};
let mut primary_position: Option<u32> = None;
let mut cigar: Vec<CigarOp> = Vec::new();
let mut nm: u32 = 0;
let mut as_score: i32 = 0;
let mut md = String::new();
if let Some(start) = candidate_start {
let band = 32usize;
let window_start = start.saturating_sub(band);
let window_end = (start + read_upper.len() + band).min(self.reference.len());
if window_start < window_end {
let window = &self.reference[window_start..window_end];
if let Some(refined) =
banded_affine_align(window, &read_upper, band, AffineScores::default())
{
primary_position = Some((window_start + refined.ref_start) as u32);
cigar = refined.cigar;
nm = refined.nm;
as_score = refined.score;
md = refined.md;
}
}
}
let score = if primary_position.is_some() {
as_score as f32
} else if interval.is_empty() {
0.0
} else {
1.0
};
AlignmentResult {
interval,
primary_position,
is_reverse,
cigar,
nm,
as_score,
md,
mapq: if primary_position.is_some() { 60 } else { 0 },
processed_bases: read_upper.len(),
mismatches: nm as usize,
span: (0, 0),
score,
}
}
}
#[derive(Debug, Clone, Copy)]
struct SeedHit {
ref_pos: i64,
read_pos: i64,
len: i32,
}
fn chain_hits(hits: &[SeedHit], diag_tol: i32) -> Option<i64> {
let mut sorted = hits.to_vec();
sorted.sort_by(|a, b| {
(a.ref_pos - a.read_pos)
.cmp(&(b.ref_pos - b.read_pos))
.then_with(|| a.read_pos.cmp(&b.read_pos))
.then_with(|| a.ref_pos.cmp(&b.ref_pos))
});
let mut best_score: i64 = -1;
let mut best_diag: i64 = 0;
let mut i = 0usize;
while i < sorted.len() {
let d0 = sorted[i].ref_pos - sorted[i].read_pos;
let mut j = i + 1;
let mut score = sorted[i].len as i64;
while j < sorted.len() {
let dj = sorted[j].ref_pos - sorted[j].read_pos;
if (dj - d0).abs() > diag_tol as i64 {
break;
}
score += sorted[j].len as i64;
j += 1;
}
if score > best_score || (score == best_score && d0 < best_diag) {
best_score = score;
best_diag = d0;
}
i = j;
}
if best_score <= 0 {
return None;
}
if best_diag < 0 {
return None;
}
Some(best_diag)
}
fn pick_best(a: AlignmentResult, b: AlignmentResult) -> AlignmentResult {
match (a.primary_position, b.primary_position) {
(Some(pa), Some(pb)) => {
if a.score > b.score {
a
} else if b.score > a.score {
b
} else if !a.is_reverse && b.is_reverse {
a
} else if a.is_reverse && !b.is_reverse {
b
} else if pa <= pb {
a
} else {
b
}
}
(Some(_), None) => a,
(None, Some(_)) => b,
(None, None) => {
if !a.interval.is_empty() && b.interval.is_empty() {
a
} else if a.interval.is_empty() && !b.interval.is_empty() {
b
} else {
a
}
}
}
}
fn reverse_complement(read: &[u8]) -> Vec<u8> {
read.iter()
.rev()
.map(|&b| match b.to_ascii_uppercase() {
b'A' => b'T',
b'C' => b'G',
b'G' => b'C',
b'T' => b'A',
_ => b'N',
})
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn aligner_returns_interval_for_read() {
let reference = b"ACGTCGTAACGT";
let mut aligner = BWTAligner::new(reference).expect("builder should succeed");
let result = aligner
.align_read(b"CGTA")
.expect("alignment should succeed");
assert!(result.has_candidates());
assert!(result.processed_bases > 0);
}
}