use serde::{Deserialize, Serialize};
// Number of read pairs to examine before sampling is switched off.
const DEFAULT_SAMPLE_SIZE: u64 = 10_000;
// Minimum overlap length (bases) considered between R1 and reverse-complemented R2.
const MIN_OVERLAP_LENGTH: usize = 10;
// Maximum tolerated fraction of mismatching bases within a candidate overlap.
const MAX_MISMATCH_RATE: f64 = 0.1;
// Largest histogram bin; larger insert sizes are clamped into this bin.
const MAX_INSERT_SIZE: usize = 1000;
/// Summary statistics for insert sizes estimated from overlapping read pairs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InsertSizeStats {
    // Frequency of each insert size, indexed by size (0..=MAX_INSERT_SIZE).
    histogram: Vec<u64>,
    // Insert size with the highest frequency (mode of the histogram).
    peak: usize,
    // Arithmetic mean insert size.
    mean: f64,
    // Population standard deviation of insert sizes.
    std_dev: f64,
    // Number of pairs for which an insert size was successfully estimated.
    count: u64,
    // Total number of pairs examined, including those with no detected overlap.
    pairs_sampled: u64,
}
impl Default for InsertSizeStats {
    /// Equivalent to [`InsertSizeStats::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl InsertSizeStats {
pub fn new() -> Self {
Self {
histogram: vec![0u64; MAX_INSERT_SIZE + 1],
peak: 0,
mean: 0.0,
std_dev: 0.0,
count: 0,
pairs_sampled: 0,
}
}
pub fn histogram(&self) -> &[u64] {
&self.histogram
}
pub fn peak(&self) -> usize {
self.peak
}
pub fn mean(&self) -> f64 {
self.mean
}
pub fn std_dev(&self) -> f64 {
self.std_dev
}
pub fn count(&self) -> u64 {
self.count
}
pub fn pairs_sampled(&self) -> u64 {
self.pairs_sampled
}
pub fn detection_rate(&self) -> f64 {
if self.pairs_sampled == 0 {
0.0
} else {
self.count as f64 / self.pairs_sampled as f64
}
}
pub fn has_data(&self) -> bool {
self.count > 0
}
pub fn median(&self) -> usize {
if self.count == 0 {
return 0;
}
let half = self.count / 2;
let mut cumulative = 0u64;
for (size, &freq) in self.histogram.iter().enumerate() {
cumulative += freq;
if cumulative > half {
return size;
}
}
0
}
pub fn merge(&mut self, other: &InsertSizeStats) {
for (i, &count) in other.histogram.iter().enumerate() {
if i < self.histogram.len() {
self.histogram[i] += count;
}
}
self.pairs_sampled += other.pairs_sampled;
self.count += other.count;
self.recalculate_stats();
}
fn recalculate_stats(&mut self) {
if self.count == 0 {
self.peak = 0;
self.mean = 0.0;
self.std_dev = 0.0;
return;
}
let mut max_count = 0u64;
let mut peak = 0usize;
let mut sum = 0u64;
for (size, &count) in self.histogram.iter().enumerate() {
if count > max_count {
max_count = count;
peak = size;
}
sum += size as u64 * count;
}
self.peak = peak;
self.mean = sum as f64 / self.count as f64;
let mut variance_sum = 0.0f64;
for (size, &count) in self.histogram.iter().enumerate() {
if count > 0 {
let diff = size as f64 - self.mean;
variance_sum += diff * diff * count as f64;
}
}
self.std_dev = (variance_sum / self.count as f64).sqrt();
}
}
/// Streaming estimator that infers insert sizes from overlapping
/// paired-end reads while sampling is active.
#[derive(Debug, Clone)]
pub struct InsertSizeEstimator {
    // Frequency of each observed insert size (bin clamped to MAX_INSERT_SIZE).
    histogram: Vec<u64>,
    // Running sum of insert sizes (for the mean).
    sum: u64,
    // Running sum of squared insert sizes (for the variance).
    sum_sq: u64,
    // Number of insert sizes recorded.
    count: u64,
    // Number of pairs examined so far.
    pairs_sampled: u64,
    // Number of pairs to examine before sampling stops.
    sample_size: u64,
    // Set to false once `sample_size` pairs have been examined.
    sampling_active: bool,
    // Scratch buffer reused across pairs for reverse-complementing R2.
    rc_buffer: Vec<u8>,
}
impl Default for InsertSizeEstimator {
    /// Equivalent to [`InsertSizeEstimator::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl InsertSizeEstimator {
    /// Creates an estimator with the default sample size.
    pub fn new() -> Self {
        Self::with_sample_size(DEFAULT_SAMPLE_SIZE)
    }

    /// Creates an estimator that stops sampling after `sample_size` pairs.
    pub fn with_sample_size(sample_size: u64) -> Self {
        Self {
            histogram: vec![0u64; MAX_INSERT_SIZE + 1],
            sum: 0,
            sum_sq: 0,
            count: 0,
            pairs_sampled: 0,
            sample_size,
            sampling_active: true,
            rc_buffer: Vec::with_capacity(512),
        }
    }

    /// True while the estimator is still accepting pairs.
    #[inline]
    pub fn is_sampling(&self) -> bool {
        self.sampling_active
    }

    /// Examines one read pair and records an insert size if R1 overlaps the
    /// reverse complement of R2. No-op once sampling has been deactivated;
    /// deactivation happens after `sample_size` pairs have been examined.
    #[inline]
    pub fn estimate_from_pair(&mut self, r1_seq: &[u8], r2_seq: &[u8]) {
        if !self.sampling_active {
            return;
        }
        self.pairs_sampled += 1;
        if let Some(insert_size) = self.estimate_from_overlap(r1_seq, r2_seq) {
            self.record_insert_size(insert_size);
        }
        if self.pairs_sampled >= self.sample_size {
            self.sampling_active = false;
        }
    }

    /// Records one insert size into the histogram and the running moments.
    ///
    /// FIX: the size is clamped to `MAX_INSERT_SIZE` for BOTH the histogram
    /// bin and the running sums. Previously the sums used the unclamped
    /// value, so `finalize()`'s mean/std_dev could disagree with statistics
    /// recomputed from the (clamped) histogram, e.g. after
    /// `InsertSizeStats::merge` triggers a recalculation.
    #[inline]
    fn record_insert_size(&mut self, insert_size: usize) {
        let clamped = insert_size.min(MAX_INSERT_SIZE);
        self.histogram[clamped] += 1;
        self.sum += clamped as u64;
        self.sum_sq += (clamped as u64) * (clamped as u64);
        self.count += 1;
    }

    /// Estimates the insert size of one pair by reverse-complementing R2
    /// into the scratch buffer and searching for a suffix(R1)/prefix(RC-R2)
    /// overlap. Returns `None` when either read is shorter than
    /// `MIN_OVERLAP_LENGTH` or no acceptable overlap is found.
    fn estimate_from_overlap(&mut self, r1_seq: &[u8], r2_seq: &[u8]) -> Option<usize> {
        if r1_seq.len() < MIN_OVERLAP_LENGTH || r2_seq.len() < MIN_OVERLAP_LENGTH {
            return None;
        }
        reverse_complement_into(r2_seq, &mut self.rc_buffer);
        find_overlap(r1_seq, &self.rc_buffer)
            .map(|overlap_len| r1_seq.len() + r2_seq.len() - overlap_len)
    }

    /// Produces summary statistics from the data sampled so far. Mean and
    /// std_dev come from the running moments; peak is the histogram mode.
    pub fn finalize(&self) -> InsertSizeStats {
        let mut stats = InsertSizeStats {
            histogram: self.histogram.clone(),
            peak: 0,
            mean: 0.0,
            std_dev: 0.0,
            count: self.count,
            pairs_sampled: self.pairs_sampled,
        };
        if self.count > 0 {
            stats.mean = self.sum as f64 / self.count as f64;
            // Population variance via E[X^2] - E[X]^2; guard against tiny
            // negative values from floating-point rounding.
            let variance = (self.sum_sq as f64 / self.count as f64) - stats.mean * stats.mean;
            stats.std_dev = if variance > 0.0 { variance.sqrt() } else { 0.0 };
            let mut max_count = 0u64;
            for (size, &count) in self.histogram.iter().enumerate() {
                if count > max_count {
                    max_count = count;
                    stats.peak = size;
                }
            }
        }
        stats
    }

    /// Number of insert sizes recorded.
    pub fn count(&self) -> u64 {
        self.count
    }

    /// Number of pairs examined.
    pub fn pairs_sampled(&self) -> u64 {
        self.pairs_sampled
    }

    /// Folds another estimator's observations into this one.
    /// `sample_size` and `sampling_active` are intentionally left unchanged.
    pub fn merge(&mut self, other: &InsertSizeEstimator) {
        // zip truncates to the shorter histogram, matching the original
        // bounds-checked loop (both are MAX_INSERT_SIZE + 1 in practice).
        for (mine, &theirs) in self.histogram.iter_mut().zip(other.histogram.iter()) {
            *mine += theirs;
        }
        self.sum += other.sum;
        self.sum_sq += other.sum_sq;
        self.count += other.count;
        self.pairs_sampled += other.pairs_sampled;
    }
}
/// Writes the reverse complement of `seq` into `buffer`, clearing it first.
/// Output is always uppercase; any base other than A/C/G/T (either case)
/// becomes 'N'.
#[inline]
fn reverse_complement_into(seq: &[u8], buffer: &mut Vec<u8>) {
    buffer.clear();
    buffer.reserve(seq.len());
    for &base in seq.iter().rev() {
        let complement = match base {
            b'A' | b'a' => b'T',
            b'T' | b't' => b'A',
            b'G' | b'g' => b'C',
            b'C' | b'c' => b'G',
            // 'N'/'n' and every unrecognized byte map to 'N', as before.
            _ => b'N',
        };
        buffer.push(complement);
    }
}
/// Test-only helper: returns the reverse complement of `seq` as a new Vec.
/// Uppercase output; unrecognized bases become 'N'.
#[cfg(test)]
#[inline]
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(seq.len());
    for &base in seq.iter().rev() {
        out.push(match base {
            b'A' | b'a' => b'T',
            b'T' | b't' => b'A',
            b'G' | b'g' => b'C',
            b'C' | b'c' => b'G',
            // 'N'/'n' and every other byte map to 'N', as before.
            _ => b'N',
        });
    }
    out
}
/// Finds the longest suffix of `seq1` that matches a prefix of `seq2`,
/// scanning candidate overlaps from the longest possible down to
/// `MIN_OVERLAP_LENGTH`. An overlap is accepted as soon as its mismatch
/// count (Ns ignored, see `count_mismatches`) is within `MAX_MISMATCH_RATE`.
/// Returns the accepted overlap length, or `None` if no candidate passes.
fn find_overlap(seq1: &[u8], seq2: &[u8]) -> Option<usize> {
    let max_overlap = seq1.len().min(seq2.len());
    if max_overlap < MIN_OVERLAP_LENGTH {
        return None;
    }
    for overlap_len in (MIN_OVERLAP_LENGTH..=max_overlap).rev() {
        let seq1_end = &seq1[seq1.len() - overlap_len..];
        let seq2_start = &seq2[..overlap_len];
        let mismatches = count_mismatches(seq1_end, seq2_start);
        // `as usize` floors the allowance, matching the original behavior
        // (e.g. 10% of 19 bases allows 1 mismatch).
        let max_allowed_mismatches = (overlap_len as f64 * MAX_MISMATCH_RATE) as usize;
        // NOTE: the original had three branches here (an "early exit" rate
        // threshold and an `overlap_len >= 30` check) that all returned the
        // same `Some(overlap_len)`; they were dead code and are collapsed
        // into this single return.
        if mismatches <= max_allowed_mismatches {
            return Some(overlap_len);
        }
    }
    None
}
/// Counts mismatching positions between two sequences, compared
/// case-insensitively over the shorter of the two lengths. Positions where
/// either base is 'N' (any case) are never counted as mismatches.
#[inline]
fn count_mismatches(seq1: &[u8], seq2: &[u8]) -> usize {
    let mut mismatches = 0usize;
    for (&a, &b) in seq1.iter().zip(seq2.iter()) {
        let a = a.to_ascii_uppercase();
        let b = b.to_ascii_uppercase();
        if a != b && a != b'N' && b != b'N' {
            mismatches += 1;
        }
    }
    mismatches
}
#[cfg(test)]
mod tests {
    use super::*;

    // Basic complement mapping: uppercase output, case-insensitive input,
    // 'N' preserved as 'N'.
    #[test]
    fn test_reverse_complement() {
        assert_eq!(reverse_complement(b"ATGC"), b"GCAT");
        assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
        assert_eq!(reverse_complement(b"GCGC"), b"GCGC");
        assert_eq!(reverse_complement(b"atgc"), b"GCAT");
        assert_eq!(reverse_complement(b"N"), b"N");
    }

    // Mismatch counting is case-insensitive and never counts 'N' positions.
    #[test]
    fn test_count_mismatches() {
        assert_eq!(count_mismatches(b"ATGC", b"ATGC"), 0);
        assert_eq!(count_mismatches(b"ATGC", b"ATGG"), 1);
        assert_eq!(count_mismatches(b"ATGC", b"TTTT"), 3);
        assert_eq!(count_mismatches(b"ATGC", b"atgc"), 0);
        assert_eq!(count_mismatches(b"ANGC", b"ATGC"), 0);
    }

    // seq1's 12-base suffix exactly matches seq2's 12-base prefix.
    #[test]
    fn test_find_overlap_exact() {
        let seq1 = b"GGGGGGGGGGGGATGCATGCATGC";
        let seq2 = b"ATGCATGCATGCGGGGGGGGGGGG";
        let overlap = find_overlap(seq1, seq2);
        assert!(overlap.is_some(), "Expected overlap to be found");
        assert!(overlap.unwrap() >= 10);
    }

    // Longer sequences with a 20-base shared region; overlap still detected.
    #[test]
    fn test_find_overlap_with_mismatch() {
        let seq1 = b"GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGATGCATGCATGCATGCATGC";
        let seq2 = b"ATGCATGCATGCATGCATGCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG";
        let overlap = find_overlap(seq1, seq2);
        assert!(overlap.is_some());
    }

    // All-A vs all-C: every candidate overlap is 100% mismatched.
    #[test]
    fn test_find_overlap_no_match() {
        let seq1 = b"AAAAAAAAAAAAAAAAAAAA";
        let seq2 = b"CCCCCCCCCCCCCCCCCCCC";
        let overlap = find_overlap(seq1, seq2);
        assert!(overlap.is_none());
    }

    // Sequences shorter than MIN_OVERLAP_LENGTH can never overlap.
    #[test]
    fn test_find_overlap_too_short() {
        let seq1 = b"ATGC";
        let seq2 = b"ATGC";
        let overlap = find_overlap(seq1, seq2);
        assert!(overlap.is_none());
    }

    // A fresh estimator samples and has recorded nothing.
    #[test]
    fn test_insert_size_estimator_new() {
        let est = InsertSizeEstimator::new();
        assert!(est.is_sampling());
        assert_eq!(est.count(), 0);
        assert_eq!(est.pairs_sampled(), 0);
    }

    // Feeding one pair increments pairs_sampled regardless of detection.
    #[test]
    fn test_insert_size_estimator_overlapping_pair() {
        let mut est = InsertSizeEstimator::new();
        let r1 = b"AAAAAAAAAAATGCATGCATGC";
        let r2 = b"GCATGCATGCATTTTTTTTTTT";
        est.estimate_from_pair(r1, r2);
        assert_eq!(est.pairs_sampled(), 1);
    }

    // Sampling deactivates after sample_size pairs; extra pairs are ignored.
    #[test]
    fn test_insert_size_estimator_sampling_limit() {
        let mut est = InsertSizeEstimator::with_sample_size(5);
        for _ in 0..10 {
            est.estimate_from_pair(b"AAAAAAAAAAAAAAAAAAAAAA", b"TTTTTTTTTTTTTTTTTTTTTT");
        }
        assert_eq!(est.pairs_sampled(), 5);
        assert!(!est.is_sampling());
    }

    // finalize() on an empty estimator yields all-zero statistics.
    #[test]
    fn test_insert_size_estimator_finalize() {
        let est = InsertSizeEstimator::new();
        let stats = est.finalize();
        assert_eq!(stats.count(), 0);
        assert_eq!(stats.peak(), 0);
        assert_eq!(stats.mean(), 0.0);
    }

    // A fresh stats object is empty and reports has_data() == false.
    #[test]
    fn test_insert_size_stats_new() {
        let stats = InsertSizeStats::new();
        assert_eq!(stats.count(), 0);
        assert_eq!(stats.peak(), 0);
        assert!(!stats.has_data());
    }

    // merge() adds histograms bin-wise and sums both counters.
    #[test]
    fn test_insert_size_stats_merge() {
        let mut stats1 = InsertSizeStats::new();
        stats1.histogram[100] = 5;
        stats1.histogram[150] = 10;
        stats1.count = 15;
        stats1.pairs_sampled = 20;
        let mut stats2 = InsertSizeStats::new();
        stats2.histogram[100] = 3;
        stats2.histogram[200] = 7;
        stats2.count = 10;
        stats2.pairs_sampled = 15;
        stats1.merge(&stats2);
        assert_eq!(stats1.histogram[100], 8);
        assert_eq!(stats1.histogram[150], 10);
        assert_eq!(stats1.histogram[200], 7);
        assert_eq!(stats1.count, 25);
        assert_eq!(stats1.pairs_sampled, 35);
    }

    // detection_rate() = count / pairs_sampled.
    #[test]
    fn test_insert_size_stats_detection_rate() {
        let mut stats = InsertSizeStats::new();
        stats.count = 80;
        stats.pairs_sampled = 100;
        assert!((stats.detection_rate() - 0.8).abs() < 0.001);
    }

    // Bimodal histogram: the median lands on one of the two modes
    // (implementation returns the upper one for even counts).
    #[test]
    fn test_insert_size_stats_median() {
        let mut stats = InsertSizeStats::new();
        stats.histogram[100] = 10;
        stats.histogram[200] = 10;
        stats.count = 20;
        let median = stats.median();
        assert!(median == 100 || median == 200);
    }

    // Estimator merge() combines histograms and all running moments.
    #[test]
    fn test_insert_size_estimator_merge() {
        let mut est1 = InsertSizeEstimator::new();
        est1.histogram[100] = 5;
        est1.sum = 500;
        est1.sum_sq = 50000;
        est1.count = 5;
        est1.pairs_sampled = 10;
        let mut est2 = InsertSizeEstimator::new();
        est2.histogram[100] = 3;
        est2.histogram[150] = 2;
        est2.sum = 600;
        est2.sum_sq = 72000;
        est2.count = 5;
        est2.pairs_sampled = 8;
        est1.merge(&est2);
        assert_eq!(est1.histogram[100], 8);
        assert_eq!(est1.histogram[150], 2);
        assert_eq!(est1.sum, 1100);
        assert_eq!(est1.count, 10);
        assert_eq!(est1.pairs_sampled, 18);
    }

    // Round-trip through serde JSON preserves the statistics.
    #[test]
    fn test_insert_size_stats_serialize() {
        let mut stats = InsertSizeStats::new();
        stats.histogram[150] = 100;
        stats.count = 100;
        stats.pairs_sampled = 150;
        stats.peak = 150;
        stats.mean = 150.0;
        stats.std_dev = 10.0;
        let json = serde_json::to_string(&stats).unwrap();
        let stats2: InsertSizeStats = serde_json::from_str(&json).unwrap();
        assert_eq!(stats.count(), stats2.count());
        assert_eq!(stats.peak(), stats2.peak());
        assert!((stats.mean() - stats2.mean()).abs() < 0.001);
    }

    // End-to-end: R1 and R2 drawn from the same 258-base fragment with a
    // 50-base overlap region; R2 is reverse-complemented as in real PE data.
    #[test]
    fn test_realistic_pe_overlap() {
        let mut est = InsertSizeEstimator::new();
        let insert = b"ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\
ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\
ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\
ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\
ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCAT";
        let r1 = &insert[..150];
        let r2_orig = &insert[100..250];
        let r2 = reverse_complement(r2_orig);
        est.estimate_from_pair(r1, &r2);
        assert_eq!(est.pairs_sampled(), 1);
    }
}