use super::{CorrectionConfig, CorrectionStats};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OverlapRegion {
pub r1_start: usize,
pub r1_end: usize,
pub r2_start: usize,
pub r2_end: usize,
pub mismatches: usize,
}
impl OverlapRegion {
#[inline]
pub fn len(&self) -> usize {
self.r1_end.saturating_sub(self.r1_start)
}
#[inline]
pub fn is_empty(&self) -> bool {
self.r1_end <= self.r1_start
}
}
const COMPLEMENT: [u8; 256] = {
let mut table = [0u8; 256];
let mut i = 0;
while i < 256 {
table[i] = i as u8;
i += 1;
}
table[b'A' as usize] = b'T';
table[b'T' as usize] = b'A';
table[b'G' as usize] = b'C';
table[b'C' as usize] = b'G';
table[b'a' as usize] = b't';
table[b't' as usize] = b'a';
table[b'g' as usize] = b'c';
table[b'c' as usize] = b'g';
table[b'N' as usize] = b'N';
table[b'n' as usize] = b'n';
table
};
#[inline]
pub(crate) fn complement_base(base: u8) -> u8 {
COMPLEMENT[base as usize]
}
#[derive(Debug, Clone)]
pub struct OverlapCorrector {
config: CorrectionConfig,
}
pub struct CorrectionBuffers {
pub r1_seq: Vec<u8>,
pub r1_qual: Vec<u8>,
pub r2_seq: Vec<u8>,
pub r2_qual: Vec<u8>,
pub r2_seq_rc: Vec<u8>,
pub r2_qual_rc: Vec<u8>,
}
impl Default for CorrectionBuffers {
fn default() -> Self {
Self::new()
}
}
impl CorrectionBuffers {
pub fn new() -> Self {
Self {
r1_seq: Vec::with_capacity(300),
r1_qual: Vec::with_capacity(300),
r2_seq: Vec::with_capacity(300),
r2_qual: Vec::with_capacity(300),
r2_seq_rc: Vec::with_capacity(300),
r2_qual_rc: Vec::with_capacity(300),
}
}
#[inline]
pub fn clear(&mut self) {
self.r1_seq.clear();
self.r1_qual.clear();
self.r2_seq.clear();
self.r2_qual.clear();
self.r2_seq_rc.clear();
self.r2_qual_rc.clear();
}
}
impl OverlapCorrector {
pub fn new(config: CorrectionConfig) -> Self {
Self { config }
}
#[inline]
pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
seq.iter()
.rev()
.map(|&b| complement_base(b))
.collect()
}
#[inline]
pub fn reverse_quality(qual: &[u8]) -> Vec<u8> {
qual.iter().rev().copied().collect()
}
pub fn find_overlap(&self, r1_seq: &[u8], r2_seq_rc: &[u8]) -> Option<OverlapRegion> {
let r1_len = r1_seq.len();
let r2_len = r2_seq_rc.len();
if r1_len < self.config.min_overlap || r2_len < self.config.min_overlap {
return None;
}
let mut best_overlap: Option<OverlapRegion> = None;
let mut best_score: i64 = i64::MIN;
let max_overlap = r1_len.min(r2_len);
let min_overlap = self.config.min_overlap;
for overlap_len in min_overlap..=max_overlap {
let r1_start = r1_len - overlap_len;
let r2_start = 0;
let r1_region = &r1_seq[r1_start..];
let r2_region = &r2_seq_rc[r2_start..overlap_len];
let mismatches = count_mismatches(r1_region, r2_region);
let mismatch_percent = (mismatches as f64 / overlap_len as f64) * 100.0;
if mismatches <= self.config.diff_limit && mismatch_percent <= self.config.diff_percent_limit {
let score = overlap_len as i64 - 5 * mismatches as i64;
if score > best_score {
best_score = score;
best_overlap = Some(OverlapRegion {
r1_start,
r1_end: r1_len,
r2_start,
r2_end: overlap_len,
mismatches,
});
}
}
}
best_overlap
}
#[deprecated(since = "0.4.0", note = "Use correct_pair_into() with CorrectionBuffers for better performance")]
#[allow(clippy::type_complexity)]
pub fn correct_pair(
&self,
r1_seq: &[u8],
r1_qual: &[u8],
r2_seq: &[u8],
r2_qual: &[u8],
) -> Option<(Vec<u8>, Vec<u8>, Vec<u8>, Vec<u8>, CorrectionStats)> {
if !self.config.enabled {
return None;
}
let r1_len = r1_seq.len();
let r2_len = r2_seq.len();
if r1_len < self.config.min_overlap || r2_len < self.config.min_overlap {
return None;
}
let r2_seq_rc = Self::reverse_complement(r2_seq);
let r2_qual_rc = Self::reverse_quality(r2_qual);
let overlap = self.find_overlap(r1_seq, &r2_seq_rc)?;
let mut stats = CorrectionStats::new();
stats.pairs_processed = 1;
stats.pairs_with_overlap = 1;
let mut corrected_r1_seq = r1_seq.to_vec();
let mut corrected_r1_qual = r1_qual.to_vec();
let mut corrected_r2_seq = r2_seq.to_vec();
let mut corrected_r2_qual = r2_qual.to_vec();
let overlap_len = overlap.len();
for i in 0..overlap_len {
let r1_pos = overlap.r1_start + i;
let r2_pos_rc = overlap.r2_start + i;
let r2_pos = r2_seq.len() - 1 - r2_pos_rc;
let r1_base = r1_seq[r1_pos];
let r2_base_rc = r2_seq_rc[r2_pos_rc];
let r1_q = r1_qual[r1_pos];
let r2_q_rc = r2_qual_rc[r2_pos_rc];
if r1_base != r2_base_rc {
if r1_q >= r2_q_rc {
corrected_r2_seq[r2_pos] = complement_base(r1_base);
let new_qual = calculate_corrected_quality(r1_q, r2_q_rc);
corrected_r2_qual[r2_pos] = new_qual;
stats.bases_corrected_r2 += 1;
} else {
corrected_r1_seq[r1_pos] = r2_base_rc;
let new_qual = calculate_corrected_quality(r2_q_rc, r1_q);
corrected_r1_qual[r1_pos] = new_qual;
stats.bases_corrected_r1 += 1;
}
stats.bases_corrected += 1;
}
}
if stats.bases_corrected > 0 {
stats.pairs_corrected = 1;
}
Some((
corrected_r1_seq,
corrected_r1_qual,
corrected_r2_seq,
corrected_r2_qual,
stats,
))
}
fn apply_corrections_in_place(
&self,
r1_seq_out: &mut [u8],
r1_qual_out: &mut [u8],
r2_seq_out: &mut [u8],
r2_qual_out: &mut [u8],
overlap: &OverlapRegion,
r2_seq_rc: &[u8],
r2_qual_rc: &[u8],
) -> CorrectionStats {
let mut stats = CorrectionStats::new();
stats.pairs_processed = 1;
stats.pairs_with_overlap = 1;
let r2_seq_len = r2_seq_out.len();
let overlap_len = overlap.len();
for i in 0..overlap_len {
let r1_pos = overlap.r1_start + i;
let r2_pos_rc = overlap.r2_start + i;
let r2_pos = r2_seq_len - 1 - r2_pos_rc;
let r1_base = r1_seq_out[r1_pos];
let r2_base_rc = r2_seq_rc[r2_pos_rc];
let r1_q = r1_qual_out[r1_pos];
let r2_q_rc = r2_qual_rc[r2_pos_rc];
if r1_base != r2_base_rc {
if r1_q >= r2_q_rc {
r2_seq_out[r2_pos] = complement_base(r1_base);
let new_qual = calculate_corrected_quality(r1_q, r2_q_rc);
r2_qual_out[r2_pos] = new_qual;
stats.bases_corrected_r2 += 1;
} else {
r1_seq_out[r1_pos] = r2_base_rc;
let new_qual = calculate_corrected_quality(r2_q_rc, r1_q);
r1_qual_out[r1_pos] = new_qual;
stats.bases_corrected_r1 += 1;
}
stats.bases_corrected += 1;
}
}
if stats.bases_corrected > 0 {
stats.pairs_corrected = 1;
}
stats
}
pub fn correct_pair_into(
&self,
r1_seq: &[u8],
r1_qual: &[u8],
r2_seq: &[u8],
r2_qual: &[u8],
buffers: &mut CorrectionBuffers,
) -> Option<CorrectionStats> {
if !self.config.enabled {
return None;
}
if r1_seq.len() < self.config.min_overlap || r2_seq.len() < self.config.min_overlap {
return None;
}
buffers.clear();
buffers.r2_seq_rc.extend(r2_seq.iter().rev().map(|&b| complement_base(b)));
buffers.r2_qual_rc.extend(r2_qual.iter().rev().copied());
let overlap = self.find_overlap(r1_seq, &buffers.r2_seq_rc)?;
buffers.r1_seq.extend_from_slice(r1_seq);
buffers.r1_qual.extend_from_slice(r1_qual);
buffers.r2_seq.extend_from_slice(r2_seq);
buffers.r2_qual.extend_from_slice(r2_qual);
let stats = self.apply_corrections_in_place(
&mut buffers.r1_seq,
&mut buffers.r1_qual,
&mut buffers.r2_seq,
&mut buffers.r2_qual,
&overlap,
&buffers.r2_seq_rc,
&buffers.r2_qual_rc,
);
Some(stats)
}
}
#[inline]
fn count_mismatches(seq1: &[u8], seq2: &[u8]) -> usize {
seq1.iter()
.zip(seq2.iter())
.filter(|(&a, &b)| a != b)
.count()
}
#[inline]
fn calculate_corrected_quality(winner_q: u8, loser_q: u8) -> u8 {
let winner_phred = winner_q.saturating_sub(33);
let loser_phred = loser_q.saturating_sub(33);
let diff = winner_phred.saturating_sub(loser_phred);
let new_phred = if diff < 3 {
((winner_phred as u16 + loser_phred as u16) / 2) as u8
} else {
winner_phred.saturating_sub(2)
};
new_phred.saturating_add(33).max(33)
}
#[cfg(test)]
mod tests {
use super::*;
fn make_qual(scores: &[u8]) -> Vec<u8> {
scores.iter().map(|&s| s + 33).collect()
}
#[test]
fn test_complement_base() {
assert_eq!(complement_base(b'A'), b'T');
assert_eq!(complement_base(b'T'), b'A');
assert_eq!(complement_base(b'G'), b'C');
assert_eq!(complement_base(b'C'), b'G');
assert_eq!(complement_base(b'N'), b'N');
}
#[test]
fn test_reverse_complement() {
let seq = b"ACGT";
let rc = OverlapCorrector::reverse_complement(seq);
assert_eq!(rc, b"ACGT");
let seq2 = b"AAAAGGGG";
let rc2 = OverlapCorrector::reverse_complement(seq2);
assert_eq!(rc2, b"CCCCTTTT");
}
#[test]
fn test_reverse_quality() {
let qual = b"IIIHHH";
let reversed = OverlapCorrector::reverse_quality(qual);
assert_eq!(reversed, b"HHHIII");
}
#[test]
fn test_count_mismatches() {
assert_eq!(count_mismatches(b"ACGT", b"ACGT"), 0);
assert_eq!(count_mismatches(b"ACGT", b"TCGT"), 1);
assert_eq!(count_mismatches(b"ACGT", b"TGCA"), 4);
}
#[test]
fn test_find_overlap_no_overlap() {
let config = CorrectionConfig::new()
.enabled()
.with_min_overlap(10);
let corrector = OverlapCorrector::new(config);
let r1 = b"ACGT";
let r2_rc = b"TGCA";
assert!(corrector.find_overlap(r1, r2_rc).is_none());
}
#[test]
fn test_find_overlap_exact_match() {
let config = CorrectionConfig::new()
.enabled()
.with_min_overlap(10)
.with_diff_limit(5);
let corrector = OverlapCorrector::new(config);
let r1 = b"AAAAAAAAAACCCCCCCCCC";
let r2_rc = b"CCCCCCCCCCGGGGGGGGGG";
let overlap = corrector.find_overlap(r1, r2_rc);
assert!(overlap.is_some());
let o = overlap.unwrap();
assert_eq!(o.len(), 10);
assert_eq!(o.mismatches, 0);
}
#[test]
fn test_find_overlap_with_mismatches() {
let config = CorrectionConfig::new()
.enabled()
.with_min_overlap(10)
.with_diff_limit(2)
.with_diff_percent_limit(25.0); let corrector = OverlapCorrector::new(config);
let r1 = b"AAAAAAAAAACCCCCCCCCC";
let r2_rc = b"CCTCCCCTCCGGGGGGGGGG";
let overlap = corrector.find_overlap(r1, r2_rc);
assert!(overlap.is_some());
let o = overlap.unwrap();
assert_eq!(o.mismatches, 2);
}
#[test]
fn test_find_overlap_too_many_mismatches() {
let config = CorrectionConfig::new()
.enabled()
.with_min_overlap(10)
.with_diff_limit(1);
let corrector = OverlapCorrector::new(config);
let r1 = b"AAAAAAAAAACCCCCCCCCC";
let r2_rc = b"CCTCCCCTCCGGGGGGGGGG";
let overlap = corrector.find_overlap(r1, r2_rc);
assert!(overlap.is_none());
}
#[test]
fn test_correct_pair_disabled() {
let config = CorrectionConfig::new().disabled();
let corrector = OverlapCorrector::new(config);
let r1_seq = b"ACGT";
let r1_qual = make_qual(&[30, 30, 30, 30]);
let r2_seq = b"ACGT";
let r2_qual = make_qual(&[30, 30, 30, 30]);
let result = corrector.correct_pair(r1_seq, &r1_qual, r2_seq, &r2_qual);
assert!(result.is_none());
}
#[test]
fn test_correct_pair_with_correction() {
let config = CorrectionConfig::new()
.enabled()
.with_min_overlap(4)
.with_diff_limit(2)
.with_diff_percent_limit(20.0); let corrector = OverlapCorrector::new(config);
let _r1_seq = b"AAAAAAAAAACCCCCCCCCC";
let _r1_qual = make_qual(&[30; 20]);
let mut r1_seq_with_error = b"AAAAAAAAAACCCCCCCCCC".to_vec();
r1_seq_with_error[15] = b'T'; let mut r1_qual_with_error = vec![30 + 33; 20]; r1_qual_with_error[15] = 10 + 33;
let r2_seq = b"TTTTTTTTTTGGGGGGGGGG";
let r2_qual = make_qual(&[30; 20]);
let result = corrector.correct_pair(
&r1_seq_with_error,
&r1_qual_with_error,
r2_seq,
&r2_qual,
);
assert!(result.is_some());
let (cr1_seq, _cr1_qual, _cr2_seq, _cr2_qual, stats) = result.unwrap();
assert_eq!(stats.pairs_with_overlap, 1);
assert_eq!(stats.bases_corrected, 1);
assert_eq!(stats.bases_corrected_r1, 1);
assert_eq!(cr1_seq[15], b'C'); }
#[test]
fn test_overlap_region() {
let region = OverlapRegion {
r1_start: 10,
r1_end: 20,
r2_start: 0,
r2_end: 10,
mismatches: 2,
};
assert_eq!(region.len(), 10);
assert!(!region.is_empty());
}
#[test]
fn test_overlap_region_empty() {
let region = OverlapRegion {
r1_start: 10,
r1_end: 10,
r2_start: 0,
r2_end: 0,
mismatches: 0,
};
assert!(region.is_empty());
assert_eq!(region.len(), 0);
}
#[test]
fn test_calculate_corrected_quality() {
let q1 = calculate_corrected_quality(40 + 33, 10 + 33);
assert!(q1 > 33);
let q2 = calculate_corrected_quality(30 + 33, 28 + 33);
assert!(q2 > 33);
}
}